60#ifndef INCLUDED_volk_32f_x2_pow_32f_a_H
61#define INCLUDED_volk_32f_x2_pow_32f_a_H
68#define POW_POLY_DEGREE 3
#if LV_HAVE_AVX2 && LV_HAVE_FMA
#include <immintrin.h>

/* Horner-scheme polynomial evaluators built on FMA.
 * POLYn(x, c0..cn) computes c0 + c1*x + ... + cn*x^n. */
#define POLY0_AVX2_FMA(x, c0) _mm256_set1_ps(c0)
#define POLY1_AVX2_FMA(x, c0, c1) \
    _mm256_fmadd_ps(POLY0_AVX2_FMA(x, c1), x, _mm256_set1_ps(c0))
#define POLY2_AVX2_FMA(x, c0, c1, c2) \
    _mm256_fmadd_ps(POLY1_AVX2_FMA(x, c1, c2), x, _mm256_set1_ps(c0))
#define POLY3_AVX2_FMA(x, c0, c1, c2, c3) \
    _mm256_fmadd_ps(POLY2_AVX2_FMA(x, c1, c2, c3), x, _mm256_set1_ps(c0))
#define POLY4_AVX2_FMA(x, c0, c1, c2, c3, c4) \
    _mm256_fmadd_ps(POLY3_AVX2_FMA(x, c1, c2, c3, c4), x, _mm256_set1_ps(c0))
#define POLY5_AVX2_FMA(x, c0, c1, c2, c3, c4, c5) \
    _mm256_fmadd_ps(POLY4_AVX2_FMA(x, c1, c2, c3, c4, c5), x, _mm256_set1_ps(c0))

/*!
 * Computes cVector[i] = powf(aVector[i], bVector[i]) as exp(b * log(a)),
 * approximating log via a POW_POLY_DEGREE polynomial on the mantissa and
 * exp via a degree-5 polynomial (Cephes-style coefficients).
 * Aligned AVX2+FMA variant: all pointers must be 32-byte aligned.
 *
 * \param cVector    output buffer, num_points floats
 * \param bVector    exponents
 * \param aVector    bases (assumed positive finite; log of the raw bits)
 * \param num_points number of elements
 */
static inline void volk_32f_x2_pow_32f_a_avx2_fma(float* cVector,
                                                  const float* bVector,
                                                  const float* aVector,
                                                  unsigned int num_points)
{
    float* cPtr = cVector;
    const float* bPtr = bVector;
    const float* aPtr = aVector;

    unsigned int number = 0;
    const unsigned int eighthPoints = num_points / 8;

    __m256 aVal, bVal, cVal, logarithm, mantissa, frac, leadingOne;
    __m256 tmp, fx, mask, pow2n, z, y;
    __m256 one, exp_hi, exp_lo, ln2, log2EF, half, exp_C1, exp_C2;
    __m256 exp_p0, exp_p1, exp_p2, exp_p3, exp_p4, exp_p5;
    __m256i bias, exp, emm0, pi32_0x7f;

    one = _mm256_set1_ps(1.0);
    exp_hi = _mm256_set1_ps(88.3762626647949);  /* expf overflow clamp */
    exp_lo = _mm256_set1_ps(-88.3762626647949); /* expf underflow clamp */
    ln2 = _mm256_set1_ps(0.6931471805);
    log2EF = _mm256_set1_ps(1.44269504088896341); /* 1/ln(2) */
    half = _mm256_set1_ps(0.5);
    exp_C1 = _mm256_set1_ps(0.693359375);     /* ln(2) split, high part */
    exp_C2 = _mm256_set1_ps(-2.12194440e-4);  /* ln(2) split, low part */
    pi32_0x7f = _mm256_set1_epi32(0x7f);

    exp_p0 = _mm256_set1_ps(1.9875691500e-4);
    exp_p1 = _mm256_set1_ps(1.3981999507e-3);
    exp_p2 = _mm256_set1_ps(8.3334519073e-3);
    exp_p3 = _mm256_set1_ps(4.1665795894e-2);
    exp_p4 = _mm256_set1_ps(1.6666665459e-1);
    exp_p5 = _mm256_set1_ps(5.0000001201e-1);

    for (; number < eighthPoints; number++) {
        /* ---- log(a): exponent field gives the integer part ---- */
        aVal = _mm256_load_ps(aPtr);
        bias = _mm256_set1_epi32(127);
        leadingOne = _mm256_set1_ps(1.0f);
        exp = _mm256_sub_epi32(
            _mm256_srli_epi32(_mm256_and_si256(_mm256_castps_si256(aVal),
                                               _mm256_set1_epi32(0x7f800000)),
                              23),
            bias);
        logarithm = _mm256_cvtepi32_ps(exp);

        /* mantissa in [1,2): OR the stored fraction bits with 1.0f */
        frac = _mm256_or_ps(
            leadingOne,
            _mm256_and_ps(aVal, _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffff))));

#if POW_POLY_DEGREE == 6
        mantissa = POLY5_AVX2_FMA(frac,
                                  3.1157899f,
                                  -3.3241990f,
                                  2.5988452f,
                                  -1.2315303f,
                                  3.1821337e-1f,
                                  -3.4436006e-2f);
#elif POW_POLY_DEGREE == 5
        mantissa = POLY4_AVX2_FMA(frac,
                                  2.8882704548164776201f,
                                  -2.52074962577807006663f,
                                  1.48116647521213171641f,
                                  -0.465725644288844778798f,
                                  0.0596515482674574969533f);
#elif POW_POLY_DEGREE == 4
        mantissa = POLY3_AVX2_FMA(frac,
                                  2.61761038894603480148f,
                                  -1.75647175389045657003f,
                                  0.688243882994381274313f,
                                  -0.107254423828329604454f);
#elif POW_POLY_DEGREE == 3
        mantissa = POLY2_AVX2_FMA(frac,
                                  2.28330284476918490682f,
                                  -1.04913055217340124191f,
                                  0.204446009836232697516f);
#else
#error
#endif

        /* log2(a) = exponent + poly(frac) * (frac - 1); scale by ln(2) */
        logarithm = _mm256_fmadd_ps(mantissa, _mm256_sub_ps(frac, leadingOne), logarithm);
        logarithm = _mm256_mul_ps(logarithm, ln2);

        /* ---- exp(b * log(a)) ---- */
        bVal = _mm256_load_ps(bPtr);
        bVal = _mm256_mul_ps(bVal, logarithm);

        bVal = _mm256_max_ps(_mm256_min_ps(bVal, exp_hi), exp_lo);

        /* fx = round-to-nearest(bVal / ln2) via truncate(x + 0.5) fixup */
        fx = _mm256_fmadd_ps(bVal, log2EF, half);

        emm0 = _mm256_cvttps_epi32(fx);
        tmp = _mm256_cvtepi32_ps(emm0);

        mask = _mm256_and_ps(_mm256_cmp_ps(tmp, fx, _CMP_GT_OS), one);
        fx = _mm256_sub_ps(tmp, mask);

        /* reduce: bVal -= fx * ln(2), using the two-part split for accuracy */
        tmp = _mm256_fnmadd_ps(fx, exp_C1, bVal);
        bVal = _mm256_fnmadd_ps(fx, exp_C2, tmp);
        z = _mm256_mul_ps(bVal, bVal);

        y = _mm256_fmadd_ps(exp_p0, bVal, exp_p1);
        y = _mm256_fmadd_ps(y, bVal, exp_p2);
        y = _mm256_fmadd_ps(y, bVal, exp_p3);
        y = _mm256_fmadd_ps(y, bVal, exp_p4);
        y = _mm256_fmadd_ps(y, bVal, exp_p5);
        y = _mm256_fmadd_ps(y, z, bVal);
        y = _mm256_add_ps(y, one);

        /* scale by 2^fx: build the float exponent field directly */
        emm0 =
            _mm256_slli_epi32(_mm256_add_epi32(_mm256_cvttps_epi32(fx), pi32_0x7f), 23);

        pow2n = _mm256_castsi256_ps(emm0);
        cVal = _mm256_mul_ps(y, pow2n);

        _mm256_store_ps(cPtr, cVal);

        aPtr += 8;
        bPtr += 8;
        cPtr += 8;
    }

    number = eighthPoints * 8;
    for (; number < num_points; number++) {
        /* powf (not pow): keeps the scalar tail in single precision,
         * consistent with the generic and SSE implementations. */
        *cPtr++ = powf(*aPtr++, *bPtr++);
    }
}

#endif /* LV_HAVE_AVX2 && LV_HAVE_FMA */
#ifdef LV_HAVE_AVX2
#include <immintrin.h>

/* Horner-scheme polynomial evaluators (mul/add form, no FMA).
 * POLYn(x, c0..cn) computes c0 + c1*x + ... + cn*x^n. */
#define POLY0_AVX2(x, c0) _mm256_set1_ps(c0)
#define POLY1_AVX2(x, c0, c1) \
    _mm256_add_ps(_mm256_mul_ps(POLY0_AVX2(x, c1), x), _mm256_set1_ps(c0))
#define POLY2_AVX2(x, c0, c1, c2) \
    _mm256_add_ps(_mm256_mul_ps(POLY1_AVX2(x, c1, c2), x), _mm256_set1_ps(c0))
#define POLY3_AVX2(x, c0, c1, c2, c3) \
    _mm256_add_ps(_mm256_mul_ps(POLY2_AVX2(x, c1, c2, c3), x), _mm256_set1_ps(c0))
#define POLY4_AVX2(x, c0, c1, c2, c3, c4) \
    _mm256_add_ps(_mm256_mul_ps(POLY3_AVX2(x, c1, c2, c3, c4), x), _mm256_set1_ps(c0))
#define POLY5_AVX2(x, c0, c1, c2, c3, c4, c5) \
    _mm256_add_ps(_mm256_mul_ps(POLY4_AVX2(x, c1, c2, c3, c4, c5), x), _mm256_set1_ps(c0))

/*!
 * Computes cVector[i] = powf(aVector[i], bVector[i]) as exp(b * log(a)).
 * Aligned AVX2 variant without FMA; otherwise identical in structure to
 * volk_32f_x2_pow_32f_a_avx2_fma.
 *
 * \param cVector    output buffer, num_points floats (32-byte aligned)
 * \param bVector    exponents (32-byte aligned)
 * \param aVector    bases (32-byte aligned)
 * \param num_points number of elements
 */
static inline void volk_32f_x2_pow_32f_a_avx2(float* cVector,
                                              const float* bVector,
                                              const float* aVector,
                                              unsigned int num_points)
{
    float* cPtr = cVector;
    const float* bPtr = bVector;
    const float* aPtr = aVector;

    unsigned int number = 0;
    const unsigned int eighthPoints = num_points / 8;

    __m256 aVal, bVal, cVal, logarithm, mantissa, frac, leadingOne;
    __m256 tmp, fx, mask, pow2n, z, y;
    __m256 one, exp_hi, exp_lo, ln2, log2EF, half, exp_C1, exp_C2;
    __m256 exp_p0, exp_p1, exp_p2, exp_p3, exp_p4, exp_p5;
    __m256i bias, exp, emm0, pi32_0x7f;

    one = _mm256_set1_ps(1.0);
    exp_hi = _mm256_set1_ps(88.3762626647949);  /* expf overflow clamp */
    exp_lo = _mm256_set1_ps(-88.3762626647949); /* expf underflow clamp */
    ln2 = _mm256_set1_ps(0.6931471805);
    log2EF = _mm256_set1_ps(1.44269504088896341); /* 1/ln(2) */
    half = _mm256_set1_ps(0.5);
    exp_C1 = _mm256_set1_ps(0.693359375);    /* ln(2) split, high part */
    exp_C2 = _mm256_set1_ps(-2.12194440e-4); /* ln(2) split, low part */
    pi32_0x7f = _mm256_set1_epi32(0x7f);

    exp_p0 = _mm256_set1_ps(1.9875691500e-4);
    exp_p1 = _mm256_set1_ps(1.3981999507e-3);
    exp_p2 = _mm256_set1_ps(8.3334519073e-3);
    exp_p3 = _mm256_set1_ps(4.1665795894e-2);
    exp_p4 = _mm256_set1_ps(1.6666665459e-1);
    exp_p5 = _mm256_set1_ps(5.0000001201e-1);

    for (; number < eighthPoints; number++) {
        /* ---- log(a): exponent field gives the integer part ---- */
        aVal = _mm256_load_ps(aPtr);
        bias = _mm256_set1_epi32(127);
        leadingOne = _mm256_set1_ps(1.0f);
        exp = _mm256_sub_epi32(
            _mm256_srli_epi32(_mm256_and_si256(_mm256_castps_si256(aVal),
                                               _mm256_set1_epi32(0x7f800000)),
                              23),
            bias);
        logarithm = _mm256_cvtepi32_ps(exp);

        /* mantissa in [1,2): OR the stored fraction bits with 1.0f */
        frac = _mm256_or_ps(
            leadingOne,
            _mm256_and_ps(aVal, _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffff))));

#if POW_POLY_DEGREE == 6
        mantissa = POLY5_AVX2(frac,
                              3.1157899f,
                              -3.3241990f,
                              2.5988452f,
                              -1.2315303f,
                              3.1821337e-1f,
                              -3.4436006e-2f);
#elif POW_POLY_DEGREE == 5
        mantissa = POLY4_AVX2(frac,
                              2.8882704548164776201f,
                              -2.52074962577807006663f,
                              1.48116647521213171641f,
                              -0.465725644288844778798f,
                              0.0596515482674574969533f);
#elif POW_POLY_DEGREE == 4
        mantissa = POLY3_AVX2(frac,
                              2.61761038894603480148f,
                              -1.75647175389045657003f,
                              0.688243882994381274313f,
                              -0.107254423828329604454f);
#elif POW_POLY_DEGREE == 3
        mantissa = POLY2_AVX2(frac,
                              2.28330284476918490682f,
                              -1.04913055217340124191f,
                              0.204446009836232697516f);
#else
#error
#endif

        /* log2(a) = exponent + poly(frac) * (frac - 1); scale by ln(2) */
        logarithm = _mm256_add_ps(
            _mm256_mul_ps(mantissa, _mm256_sub_ps(frac, leadingOne)), logarithm);
        logarithm = _mm256_mul_ps(logarithm, ln2);

        /* ---- exp(b * log(a)) ---- */
        bVal = _mm256_load_ps(bPtr);
        bVal = _mm256_mul_ps(bVal, logarithm);

        bVal = _mm256_max_ps(_mm256_min_ps(bVal, exp_hi), exp_lo);

        /* fx = round-to-nearest(bVal / ln2) via truncate(x + 0.5) fixup */
        fx = _mm256_add_ps(_mm256_mul_ps(bVal, log2EF), half);

        emm0 = _mm256_cvttps_epi32(fx);
        tmp = _mm256_cvtepi32_ps(emm0);

        mask = _mm256_and_ps(_mm256_cmp_ps(tmp, fx, _CMP_GT_OS), one);
        fx = _mm256_sub_ps(tmp, mask);

        /* reduce: bVal -= fx * ln(2), using the two-part split for accuracy */
        tmp = _mm256_sub_ps(bVal, _mm256_mul_ps(fx, exp_C1));
        bVal = _mm256_sub_ps(tmp, _mm256_mul_ps(fx, exp_C2));
        z = _mm256_mul_ps(bVal, bVal);

        y = _mm256_add_ps(_mm256_mul_ps(exp_p0, bVal), exp_p1);
        y = _mm256_add_ps(_mm256_mul_ps(y, bVal), exp_p2);
        y = _mm256_add_ps(_mm256_mul_ps(y, bVal), exp_p3);
        y = _mm256_add_ps(_mm256_mul_ps(y, bVal), exp_p4);
        y = _mm256_add_ps(_mm256_mul_ps(y, bVal), exp_p5);
        y = _mm256_add_ps(_mm256_mul_ps(y, z), bVal);
        y = _mm256_add_ps(y, one);

        /* scale by 2^fx: build the float exponent field directly */
        emm0 =
            _mm256_slli_epi32(_mm256_add_epi32(_mm256_cvttps_epi32(fx), pi32_0x7f), 23);

        pow2n = _mm256_castsi256_ps(emm0);
        cVal = _mm256_mul_ps(y, pow2n);

        _mm256_store_ps(cPtr, cVal);

        aPtr += 8;
        bPtr += 8;
        cPtr += 8;
    }

    number = eighthPoints * 8;
    for (; number < num_points; number++) {
        /* powf (not pow): keeps the scalar tail in single precision,
         * consistent with the generic and SSE implementations. */
        *cPtr++ = powf(*aPtr++, *bPtr++);
    }
}

#endif /* LV_HAVE_AVX2 */
#ifdef LV_HAVE_SSE4_1
#include <smmintrin.h>

/* Horner-scheme polynomial evaluators (SSE, mul/add form).
 * POLYn(x, c0..cn) computes c0 + c1*x + ... + cn*x^n. */
#define POLY0(x, c0) _mm_set1_ps(c0)
#define POLY1(x, c0, c1) _mm_add_ps(_mm_mul_ps(POLY0(x, c1), x), _mm_set1_ps(c0))
#define POLY2(x, c0, c1, c2) _mm_add_ps(_mm_mul_ps(POLY1(x, c1, c2), x), _mm_set1_ps(c0))
#define POLY3(x, c0, c1, c2, c3) \
    _mm_add_ps(_mm_mul_ps(POLY2(x, c1, c2, c3), x), _mm_set1_ps(c0))
#define POLY4(x, c0, c1, c2, c3, c4) \
    _mm_add_ps(_mm_mul_ps(POLY3(x, c1, c2, c3, c4), x), _mm_set1_ps(c0))
#define POLY5(x, c0, c1, c2, c3, c4, c5) \
    _mm_add_ps(_mm_mul_ps(POLY4(x, c1, c2, c3, c4, c5), x), _mm_set1_ps(c0))

/*!
 * Computes cVector[i] = powf(aVector[i], bVector[i]) as exp(b * log(a)).
 * Aligned SSE4.1 variant, 4 floats per iteration.
 *
 * \param cVector    output buffer, num_points floats (16-byte aligned)
 * \param bVector    exponents (16-byte aligned)
 * \param aVector    bases (16-byte aligned)
 * \param num_points number of elements
 */
static inline void volk_32f_x2_pow_32f_a_sse4_1(float* cVector,
                                                const float* bVector,
                                                const float* aVector,
                                                unsigned int num_points)
{
    float* cPtr = cVector;
    const float* bPtr = bVector;
    const float* aPtr = aVector;

    unsigned int number = 0;
    const unsigned int quarterPoints = num_points / 4;

    __m128 aVal, bVal, cVal, logarithm, mantissa, frac, leadingOne;
    __m128 tmp, fx, mask, pow2n, z, y;
    __m128 one, exp_hi, exp_lo, ln2, log2EF, half, exp_C1, exp_C2;
    __m128 exp_p0, exp_p1, exp_p2, exp_p3, exp_p4, exp_p5;
    __m128i bias, exp, emm0, pi32_0x7f;

    one = _mm_set1_ps(1.0);
    exp_hi = _mm_set1_ps(88.3762626647949);  /* expf overflow clamp */
    exp_lo = _mm_set1_ps(-88.3762626647949); /* expf underflow clamp */
    ln2 = _mm_set1_ps(0.6931471805);
    log2EF = _mm_set1_ps(1.44269504088896341); /* 1/ln(2) */
    half = _mm_set1_ps(0.5);
    exp_C1 = _mm_set1_ps(0.693359375);    /* ln(2) split, high part */
    exp_C2 = _mm_set1_ps(-2.12194440e-4); /* ln(2) split, low part */
    pi32_0x7f = _mm_set1_epi32(0x7f);

    exp_p0 = _mm_set1_ps(1.9875691500e-4);
    exp_p1 = _mm_set1_ps(1.3981999507e-3);
    exp_p2 = _mm_set1_ps(8.3334519073e-3);
    exp_p3 = _mm_set1_ps(4.1665795894e-2);
    exp_p4 = _mm_set1_ps(1.6666665459e-1);
    exp_p5 = _mm_set1_ps(5.0000001201e-1);

    for (; number < quarterPoints; number++) {
        /* ---- log(a): exponent field gives the integer part ---- */
        aVal = _mm_load_ps(aPtr);
        bias = _mm_set1_epi32(127);
        leadingOne = _mm_set1_ps(1.0f);
        exp = _mm_sub_epi32(
            _mm_srli_epi32(
                _mm_and_si128(_mm_castps_si128(aVal), _mm_set1_epi32(0x7f800000)), 23),
            bias);
        logarithm = _mm_cvtepi32_ps(exp);

        /* mantissa in [1,2): OR the stored fraction bits with 1.0f */
        frac = _mm_or_ps(leadingOne,
                         _mm_and_ps(aVal, _mm_castsi128_ps(_mm_set1_epi32(0x7fffff))));

#if POW_POLY_DEGREE == 6
        mantissa = POLY5(frac,
                         3.1157899f,
                         -3.3241990f,
                         2.5988452f,
                         -1.2315303f,
                         3.1821337e-1f,
                         -3.4436006e-2f);
#elif POW_POLY_DEGREE == 5
        mantissa = POLY4(frac,
                         2.8882704548164776201f,
                         -2.52074962577807006663f,
                         1.48116647521213171641f,
                         -0.465725644288844778798f,
                         0.0596515482674574969533f);
#elif POW_POLY_DEGREE == 4
        mantissa = POLY3(frac,
                         2.61761038894603480148f,
                         -1.75647175389045657003f,
                         0.688243882994381274313f,
                         -0.107254423828329604454f);
#elif POW_POLY_DEGREE == 3
        mantissa = POLY2(frac,
                         2.28330284476918490682f,
                         -1.04913055217340124191f,
                         0.204446009836232697516f);
#else
#error
#endif

        /* log2(a) = exponent + poly(frac) * (frac - 1); scale by ln(2) */
        logarithm =
            _mm_add_ps(logarithm, _mm_mul_ps(mantissa, _mm_sub_ps(frac, leadingOne)));
        logarithm = _mm_mul_ps(logarithm, ln2);

        /* ---- exp(b * log(a)) ---- */
        bVal = _mm_load_ps(bPtr);
        bVal = _mm_mul_ps(bVal, logarithm);

        bVal = _mm_max_ps(_mm_min_ps(bVal, exp_hi), exp_lo);

        /* fx = round-to-nearest(bVal / ln2) via truncate(x + 0.5) fixup */
        fx = _mm_add_ps(_mm_mul_ps(bVal, log2EF), half);

        emm0 = _mm_cvttps_epi32(fx);
        tmp = _mm_cvtepi32_ps(emm0);

        mask = _mm_and_ps(_mm_cmpgt_ps(tmp, fx), one);
        fx = _mm_sub_ps(tmp, mask);

        /* reduce: bVal -= fx * ln(2), using the two-part split for accuracy */
        tmp = _mm_mul_ps(fx, exp_C1);
        z = _mm_mul_ps(fx, exp_C2);
        bVal = _mm_sub_ps(_mm_sub_ps(bVal, tmp), z);
        z = _mm_mul_ps(bVal, bVal);

        y = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(exp_p0, bVal), exp_p1), bVal);
        y = _mm_add_ps(_mm_mul_ps(_mm_add_ps(y, exp_p2), bVal), exp_p3);
        y = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(y, bVal), exp_p4), bVal);
        y = _mm_add_ps(_mm_mul_ps(_mm_add_ps(y, exp_p5), z), bVal);
        y = _mm_add_ps(y, one);

        /* scale by 2^fx: build the float exponent field directly */
        emm0 = _mm_slli_epi32(_mm_add_epi32(_mm_cvttps_epi32(fx), pi32_0x7f), 23);

        pow2n = _mm_castsi128_ps(emm0);
        cVal = _mm_mul_ps(y, pow2n);

        _mm_store_ps(cPtr, cVal);

        aPtr += 4;
        bPtr += 4;
        cPtr += 4;
    }

    number = quarterPoints * 4;
    for (; number < num_points; number++) {
        *cPtr++ = powf(*aPtr++, *bPtr++);
    }
}

#endif /* LV_HAVE_SSE4_1 */
511#ifndef INCLUDED_volk_32f_x2_pow_32f_u_H
512#define INCLUDED_volk_32f_x2_pow_32f_u_H
519#define POW_POLY_DEGREE 3
521#ifdef LV_HAVE_GENERIC
524 const float* bVector,
525 const float* aVector,
526 unsigned int num_points)
528 float* cPtr = cVector;
529 const float* bPtr = bVector;
530 const float* aPtr = aVector;
531 unsigned int number = 0;
533 for (number = 0; number < num_points; number++) {
534 *cPtr++ = powf(*aPtr++, *bPtr++);
#ifdef LV_HAVE_NEON
#include <arm_neon.h>

/*!
 * Computes cVector[i] = powf(aVector[i], bVector[i]) as exp(b * log(a))
 * using NEON multiply-accumulate (vmlaq_f32). The log2 mantissa polynomial
 * is fixed at degree 3 here (coefficients match the POW_POLY_DEGREE == 3
 * branch of the x86 kernels).
 *
 * NOTE(review): the section guard and function signature were missing from
 * this chunk; reconstructed as LV_HAVE_NEON / volk_32f_x2_pow_32f_neon —
 * confirm against the full file.
 *
 * \param cVector    output buffer, num_points floats
 * \param bVector    exponents
 * \param aVector    bases
 * \param num_points number of elements
 */
static inline void volk_32f_x2_pow_32f_neon(float* cVector,
                                            const float* bVector,
                                            const float* aVector,
                                            unsigned int num_points)
{
    float* cPtr = cVector;
    const float* bPtr = bVector;
    const float* aPtr = aVector;

    unsigned int number = 0;
    const unsigned int quarterPoints = num_points / 4;

    float32x4_t one = vdupq_n_f32(1.0f);
    float32x4_t exp_hi = vdupq_n_f32(88.3762626647949f);  /* expf overflow clamp */
    float32x4_t exp_lo = vdupq_n_f32(-88.3762626647949f); /* expf underflow clamp */
    float32x4_t ln2 = vdupq_n_f32(0.6931471805f);
    float32x4_t log2EF = vdupq_n_f32(1.44269504088896341f); /* 1/ln(2) */
    float32x4_t half = vdupq_n_f32(0.5f);
    float32x4_t exp_C1 = vdupq_n_f32(0.693359375f);    /* ln(2) split, high part */
    float32x4_t exp_C2 = vdupq_n_f32(-2.12194440e-4f); /* ln(2) split, low part */
    int32x4_t pi32_0x7f = vdupq_n_s32(0x7f);
    int32x4_t bias = vdupq_n_s32(127);
    int32x4_t mantMask = vdupq_n_s32(0x7fffff);
    int32x4_t expMask = vdupq_n_s32(0x7f800000);

    /* degree-3 log2 mantissa polynomial coefficients */
    float32x4_t log_c0 = vdupq_n_f32(2.28330284476918490682f);
    float32x4_t log_c1 = vdupq_n_f32(-1.04913055217340124191f);
    float32x4_t log_c2 = vdupq_n_f32(0.204446009836232697516f);

    /* exp() polynomial coefficients (Cephes-style) */
    float32x4_t exp_p0 = vdupq_n_f32(1.9875691500e-4f);
    float32x4_t exp_p1 = vdupq_n_f32(1.3981999507e-3f);
    float32x4_t exp_p2 = vdupq_n_f32(8.3334519073e-3f);
    float32x4_t exp_p3 = vdupq_n_f32(4.1665795894e-2f);
    float32x4_t exp_p4 = vdupq_n_f32(1.6666665459e-1f);
    float32x4_t exp_p5 = vdupq_n_f32(5.0000001201e-1f);

    for (; number < quarterPoints; number++) {
        float32x4_t aVal = vld1q_f32(aPtr);

        /* ---- log(a): exponent field gives the integer part ---- */
        int32x4_t aInt = vreinterpretq_s32_f32(aVal);
        int32x4_t expPart = vsubq_s32(vshrq_n_s32(vandq_s32(aInt, expMask), 23), bias);
        float32x4_t logarithm = vcvtq_f32_s32(expPart);

        /* mantissa in [1,2): OR the fraction bits with 1.0f's bit pattern */
        int32x4_t mantPart =
            vorrq_s32(vandq_s32(aInt, mantMask), vreinterpretq_s32_f32(one));
        float32x4_t frac = vreinterpretq_f32_s32(mantPart);

        /* Horner evaluation of the degree-3 log2 polynomial */
        float32x4_t mantissa = vmlaq_f32(log_c1, log_c2, frac);
        mantissa = vmlaq_f32(log_c0, mantissa, frac);

        float32x4_t fracMinusOne = vsubq_f32(frac, one);
        logarithm = vmlaq_f32(logarithm, mantissa, fracMinusOne);
        logarithm = vmulq_f32(logarithm, ln2);

        /* ---- exp(b * log(a)) ---- */
        float32x4_t bVal = vld1q_f32(bPtr);
        bVal = vmulq_f32(bVal, logarithm);

        bVal = vmaxq_f32(vminq_f32(bVal, exp_hi), exp_lo);

        /* fx = round-to-nearest(bVal / ln2) via truncate(x + 0.5) fixup */
        float32x4_t fx = vmlaq_f32(half, bVal, log2EF);

        int32x4_t emm0 = vcvtq_s32_f32(fx);
        float32x4_t tmp = vcvtq_f32_s32(emm0);

        uint32x4_t mask = vcgtq_f32(tmp, fx);
        float32x4_t mask_one = vbslq_f32(mask, one, vdupq_n_f32(0.0f));
        fx = vsubq_f32(tmp, mask_one);

        /* reduce: bVal -= fx * ln(2), using the two-part split for accuracy */
        tmp = vmulq_f32(fx, exp_C1);
        float32x4_t z = vmulq_f32(fx, exp_C2);
        bVal = vsubq_f32(vsubq_f32(bVal, tmp), z);
        z = vmulq_f32(bVal, bVal);

        float32x4_t y = vmlaq_f32(exp_p1, exp_p0, bVal);
        y = vmulq_f32(y, bVal);
        y = vaddq_f32(y, exp_p2);
        y = vmulq_f32(y, bVal);
        y = vaddq_f32(y, exp_p3);
        y = vmlaq_f32(exp_p4, y, bVal);
        y = vmulq_f32(y, bVal);
        y = vaddq_f32(y, exp_p5);
        y = vmlaq_f32(bVal, y, z);
        y = vaddq_f32(y, one);

        /* scale by 2^fx: build the float exponent field directly */
        emm0 = vcvtq_s32_f32(fx);
        emm0 = vaddq_s32(emm0, pi32_0x7f);
        emm0 = vshlq_n_s32(emm0, 23);
        float32x4_t pow2n = vreinterpretq_f32_s32(emm0);

        float32x4_t cVal = vmulq_f32(y, pow2n);
        vst1q_f32(cPtr, cVal);

        aPtr += 4;
        bPtr += 4;
        cPtr += 4;
    }

    number = quarterPoints * 4;
    for (; number < num_points; number++) {
        *cPtr++ = powf(*aPtr++, *bPtr++);
    }
}

#endif /* LV_HAVE_NEON */
#ifdef LV_HAVE_NEONV8
#include <arm_neon.h>

/*!
 * Computes cVector[i] = powf(aVector[i], bVector[i]) as exp(b * log(a))
 * using ARMv8 fused multiply-add (vfmaq_f32); otherwise identical in
 * structure to the vmlaq-based NEON kernel.
 *
 * NOTE(review): the section guard was missing from this chunk;
 * reconstructed as LV_HAVE_NEONV8 — confirm against the full file.
 *
 * \param cVector    output buffer, num_points floats
 * \param bVector    exponents
 * \param aVector    bases
 * \param num_points number of elements
 */
static inline void volk_32f_x2_pow_32f_neonv8(float* cVector,
                                              const float* bVector,
                                              const float* aVector,
                                              unsigned int num_points)
{
    float* cPtr = cVector;
    const float* bPtr = bVector;
    const float* aPtr = aVector;

    unsigned int number = 0;
    const unsigned int quarterPoints = num_points / 4;

    float32x4_t one = vdupq_n_f32(1.0f);
    float32x4_t exp_hi = vdupq_n_f32(88.3762626647949f);  /* expf overflow clamp */
    float32x4_t exp_lo = vdupq_n_f32(-88.3762626647949f); /* expf underflow clamp */
    float32x4_t ln2 = vdupq_n_f32(0.6931471805f);
    float32x4_t log2EF = vdupq_n_f32(1.44269504088896341f); /* 1/ln(2) */
    float32x4_t half = vdupq_n_f32(0.5f);
    float32x4_t exp_C1 = vdupq_n_f32(0.693359375f);    /* ln(2) split, high part */
    float32x4_t exp_C2 = vdupq_n_f32(-2.12194440e-4f); /* ln(2) split, low part */
    int32x4_t pi32_0x7f = vdupq_n_s32(0x7f);
    int32x4_t bias = vdupq_n_s32(127);
    int32x4_t mantMask = vdupq_n_s32(0x7fffff);
    int32x4_t expMask = vdupq_n_s32(0x7f800000);

    /* degree-3 log2 mantissa polynomial coefficients */
    float32x4_t log_c0 = vdupq_n_f32(2.28330284476918490682f);
    float32x4_t log_c1 = vdupq_n_f32(-1.04913055217340124191f);
    float32x4_t log_c2 = vdupq_n_f32(0.204446009836232697516f);

    /* exp() polynomial coefficients (Cephes-style) */
    float32x4_t exp_p0 = vdupq_n_f32(1.9875691500e-4f);
    float32x4_t exp_p1 = vdupq_n_f32(1.3981999507e-3f);
    float32x4_t exp_p2 = vdupq_n_f32(8.3334519073e-3f);
    float32x4_t exp_p3 = vdupq_n_f32(4.1665795894e-2f);
    float32x4_t exp_p4 = vdupq_n_f32(1.6666665459e-1f);
    float32x4_t exp_p5 = vdupq_n_f32(5.0000001201e-1f);

    for (; number < quarterPoints; number++) {
        float32x4_t aVal = vld1q_f32(aPtr);

        /* ---- log(a): exponent field gives the integer part ---- */
        int32x4_t aInt = vreinterpretq_s32_f32(aVal);
        int32x4_t expPart = vsubq_s32(vshrq_n_s32(vandq_s32(aInt, expMask), 23), bias);
        float32x4_t logarithm = vcvtq_f32_s32(expPart);

        /* mantissa in [1,2): OR the fraction bits with 1.0f's bit pattern */
        int32x4_t mantPart =
            vorrq_s32(vandq_s32(aInt, mantMask), vreinterpretq_s32_f32(one));
        float32x4_t frac = vreinterpretq_f32_s32(mantPart);

        /* Horner evaluation of the degree-3 log2 polynomial (fused) */
        float32x4_t mantissa = vfmaq_f32(log_c1, log_c2, frac);
        mantissa = vfmaq_f32(log_c0, mantissa, frac);

        float32x4_t fracMinusOne = vsubq_f32(frac, one);
        logarithm = vfmaq_f32(logarithm, mantissa, fracMinusOne);
        logarithm = vmulq_f32(logarithm, ln2);

        /* ---- exp(b * log(a)) ---- */
        float32x4_t bVal = vld1q_f32(bPtr);
        bVal = vmulq_f32(bVal, logarithm);

        bVal = vmaxq_f32(vminq_f32(bVal, exp_hi), exp_lo);

        /* fx = round-to-nearest(bVal / ln2) via truncate(x + 0.5) fixup */
        float32x4_t fx = vfmaq_f32(half, bVal, log2EF);

        int32x4_t emm0 = vcvtq_s32_f32(fx);
        float32x4_t tmp = vcvtq_f32_s32(emm0);

        uint32x4_t mask = vcgtq_f32(tmp, fx);
        float32x4_t mask_one = vbslq_f32(mask, one, vdupq_n_f32(0.0f));
        fx = vsubq_f32(tmp, mask_one);

        /* reduce: bVal -= fx * ln(2), using the two-part split for accuracy */
        tmp = vmulq_f32(fx, exp_C1);
        float32x4_t z = vmulq_f32(fx, exp_C2);
        bVal = vsubq_f32(vsubq_f32(bVal, tmp), z);
        z = vmulq_f32(bVal, bVal);

        float32x4_t y = vfmaq_f32(exp_p1, exp_p0, bVal);
        y = vmulq_f32(y, bVal);
        y = vaddq_f32(y, exp_p2);
        y = vmulq_f32(y, bVal);
        y = vaddq_f32(y, exp_p3);
        y = vfmaq_f32(exp_p4, y, bVal);
        y = vmulq_f32(y, bVal);
        y = vaddq_f32(y, exp_p5);
        y = vfmaq_f32(bVal, y, z);
        y = vaddq_f32(y, one);

        /* scale by 2^fx: build the float exponent field directly */
        emm0 = vcvtq_s32_f32(fx);
        emm0 = vaddq_s32(emm0, pi32_0x7f);
        emm0 = vshlq_n_s32(emm0, 23);
        float32x4_t pow2n = vreinterpretq_f32_s32(emm0);

        float32x4_t cVal = vmulq_f32(y, pow2n);
        vst1q_f32(cPtr, cVal);

        aPtr += 4;
        bPtr += 4;
        cPtr += 4;
    }

    number = quarterPoints * 4;
    for (; number < num_points; number++) {
        *cPtr++ = powf(*aPtr++, *bPtr++);
    }
}

#endif /* LV_HAVE_NEONV8 */
#ifdef LV_HAVE_SSE4_1
#include <smmintrin.h>

/* Horner-scheme polynomial evaluators (SSE, mul/add form).
 * POLYn(x, c0..cn) computes c0 + c1*x + ... + cn*x^n. */
#define POLY0(x, c0) _mm_set1_ps(c0)
#define POLY1(x, c0, c1) _mm_add_ps(_mm_mul_ps(POLY0(x, c1), x), _mm_set1_ps(c0))
#define POLY2(x, c0, c1, c2) _mm_add_ps(_mm_mul_ps(POLY1(x, c1, c2), x), _mm_set1_ps(c0))
#define POLY3(x, c0, c1, c2, c3) \
    _mm_add_ps(_mm_mul_ps(POLY2(x, c1, c2, c3), x), _mm_set1_ps(c0))
#define POLY4(x, c0, c1, c2, c3, c4) \
    _mm_add_ps(_mm_mul_ps(POLY3(x, c1, c2, c3, c4), x), _mm_set1_ps(c0))
#define POLY5(x, c0, c1, c2, c3, c4, c5) \
    _mm_add_ps(_mm_mul_ps(POLY4(x, c1, c2, c3, c4, c5), x), _mm_set1_ps(c0))

/*!
 * Computes cVector[i] = powf(aVector[i], bVector[i]) as exp(b * log(a)).
 * Unaligned SSE4.1 variant (loadu/storeu); logic is identical to
 * volk_32f_x2_pow_32f_a_sse4_1.
 *
 * \param cVector    output buffer, num_points floats
 * \param bVector    exponents
 * \param aVector    bases
 * \param num_points number of elements
 */
static inline void volk_32f_x2_pow_32f_u_sse4_1(float* cVector,
                                                const float* bVector,
                                                const float* aVector,
                                                unsigned int num_points)
{
    float* cPtr = cVector;
    const float* bPtr = bVector;
    const float* aPtr = aVector;

    unsigned int number = 0;
    const unsigned int quarterPoints = num_points / 4;

    __m128 aVal, bVal, cVal, logarithm, mantissa, frac, leadingOne;
    __m128 tmp, fx, mask, pow2n, z, y;
    __m128 one, exp_hi, exp_lo, ln2, log2EF, half, exp_C1, exp_C2;
    __m128 exp_p0, exp_p1, exp_p2, exp_p3, exp_p4, exp_p5;
    __m128i bias, exp, emm0, pi32_0x7f;

    one = _mm_set1_ps(1.0);
    exp_hi = _mm_set1_ps(88.3762626647949);  /* expf overflow clamp */
    exp_lo = _mm_set1_ps(-88.3762626647949); /* expf underflow clamp */
    ln2 = _mm_set1_ps(0.6931471805);
    log2EF = _mm_set1_ps(1.44269504088896341); /* 1/ln(2) */
    half = _mm_set1_ps(0.5);
    exp_C1 = _mm_set1_ps(0.693359375);    /* ln(2) split, high part */
    exp_C2 = _mm_set1_ps(-2.12194440e-4); /* ln(2) split, low part */
    pi32_0x7f = _mm_set1_epi32(0x7f);

    exp_p0 = _mm_set1_ps(1.9875691500e-4);
    exp_p1 = _mm_set1_ps(1.3981999507e-3);
    exp_p2 = _mm_set1_ps(8.3334519073e-3);
    exp_p3 = _mm_set1_ps(4.1665795894e-2);
    exp_p4 = _mm_set1_ps(1.6666665459e-1);
    exp_p5 = _mm_set1_ps(5.0000001201e-1);

    for (; number < quarterPoints; number++) {
        /* ---- log(a): exponent field gives the integer part ---- */
        aVal = _mm_loadu_ps(aPtr);
        bias = _mm_set1_epi32(127);
        leadingOne = _mm_set1_ps(1.0f);
        exp = _mm_sub_epi32(
            _mm_srli_epi32(
                _mm_and_si128(_mm_castps_si128(aVal), _mm_set1_epi32(0x7f800000)), 23),
            bias);
        logarithm = _mm_cvtepi32_ps(exp);

        /* mantissa in [1,2): OR the stored fraction bits with 1.0f */
        frac = _mm_or_ps(leadingOne,
                         _mm_and_ps(aVal, _mm_castsi128_ps(_mm_set1_epi32(0x7fffff))));

#if POW_POLY_DEGREE == 6
        mantissa = POLY5(frac,
                         3.1157899f,
                         -3.3241990f,
                         2.5988452f,
                         -1.2315303f,
                         3.1821337e-1f,
                         -3.4436006e-2f);
#elif POW_POLY_DEGREE == 5
        mantissa = POLY4(frac,
                         2.8882704548164776201f,
                         -2.52074962577807006663f,
                         1.48116647521213171641f,
                         -0.465725644288844778798f,
                         0.0596515482674574969533f);
#elif POW_POLY_DEGREE == 4
        mantissa = POLY3(frac,
                         2.61761038894603480148f,
                         -1.75647175389045657003f,
                         0.688243882994381274313f,
                         -0.107254423828329604454f);
#elif POW_POLY_DEGREE == 3
        mantissa = POLY2(frac,
                         2.28330284476918490682f,
                         -1.04913055217340124191f,
                         0.204446009836232697516f);
#else
#error
#endif

        /* log2(a) = exponent + poly(frac) * (frac - 1); scale by ln(2) */
        logarithm =
            _mm_add_ps(logarithm, _mm_mul_ps(mantissa, _mm_sub_ps(frac, leadingOne)));
        logarithm = _mm_mul_ps(logarithm, ln2);

        /* ---- exp(b * log(a)) ---- */
        bVal = _mm_loadu_ps(bPtr);
        bVal = _mm_mul_ps(bVal, logarithm);

        bVal = _mm_max_ps(_mm_min_ps(bVal, exp_hi), exp_lo);

        /* fx = round-to-nearest(bVal / ln2) via truncate(x + 0.5) fixup */
        fx = _mm_add_ps(_mm_mul_ps(bVal, log2EF), half);

        emm0 = _mm_cvttps_epi32(fx);
        tmp = _mm_cvtepi32_ps(emm0);

        mask = _mm_and_ps(_mm_cmpgt_ps(tmp, fx), one);
        fx = _mm_sub_ps(tmp, mask);

        /* reduce: bVal -= fx * ln(2), using the two-part split for accuracy */
        tmp = _mm_mul_ps(fx, exp_C1);
        z = _mm_mul_ps(fx, exp_C2);
        bVal = _mm_sub_ps(_mm_sub_ps(bVal, tmp), z);
        z = _mm_mul_ps(bVal, bVal);

        y = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(exp_p0, bVal), exp_p1), bVal);
        y = _mm_add_ps(_mm_mul_ps(_mm_add_ps(y, exp_p2), bVal), exp_p3);
        y = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(y, bVal), exp_p4), bVal);
        y = _mm_add_ps(_mm_mul_ps(_mm_add_ps(y, exp_p5), z), bVal);
        y = _mm_add_ps(y, one);

        /* scale by 2^fx: build the float exponent field directly */
        emm0 = _mm_slli_epi32(_mm_add_epi32(_mm_cvttps_epi32(fx), pi32_0x7f), 23);

        pow2n = _mm_castsi128_ps(emm0);
        cVal = _mm_mul_ps(y, pow2n);

        _mm_storeu_ps(cPtr, cVal);

        aPtr += 4;
        bPtr += 4;
        cPtr += 4;
    }

    number = quarterPoints * 4;
    for (; number < num_points; number++) {
        *cPtr++ = powf(*aPtr++, *bPtr++);
    }
}

#endif /* LV_HAVE_SSE4_1 */
#if LV_HAVE_AVX2 && LV_HAVE_FMA
#include <immintrin.h>

/* Horner-scheme polynomial evaluators built on FMA.
 * POLYn(x, c0..cn) computes c0 + c1*x + ... + cn*x^n. */
#define POLY0_AVX2_FMA(x, c0) _mm256_set1_ps(c0)
#define POLY1_AVX2_FMA(x, c0, c1) \
    _mm256_fmadd_ps(POLY0_AVX2_FMA(x, c1), x, _mm256_set1_ps(c0))
#define POLY2_AVX2_FMA(x, c0, c1, c2) \
    _mm256_fmadd_ps(POLY1_AVX2_FMA(x, c1, c2), x, _mm256_set1_ps(c0))
#define POLY3_AVX2_FMA(x, c0, c1, c2, c3) \
    _mm256_fmadd_ps(POLY2_AVX2_FMA(x, c1, c2, c3), x, _mm256_set1_ps(c0))
#define POLY4_AVX2_FMA(x, c0, c1, c2, c3, c4) \
    _mm256_fmadd_ps(POLY3_AVX2_FMA(x, c1, c2, c3, c4), x, _mm256_set1_ps(c0))
#define POLY5_AVX2_FMA(x, c0, c1, c2, c3, c4, c5) \
    _mm256_fmadd_ps(POLY4_AVX2_FMA(x, c1, c2, c3, c4, c5), x, _mm256_set1_ps(c0))

/*!
 * Computes cVector[i] = powf(aVector[i], bVector[i]) as exp(b * log(a)).
 * Unaligned AVX2+FMA variant (loadu/storeu); logic is identical to
 * volk_32f_x2_pow_32f_a_avx2_fma.
 *
 * \param cVector    output buffer, num_points floats
 * \param bVector    exponents
 * \param aVector    bases
 * \param num_points number of elements
 */
static inline void volk_32f_x2_pow_32f_u_avx2_fma(float* cVector,
                                                  const float* bVector,
                                                  const float* aVector,
                                                  unsigned int num_points)
{
    float* cPtr = cVector;
    const float* bPtr = bVector;
    const float* aPtr = aVector;

    unsigned int number = 0;
    const unsigned int eighthPoints = num_points / 8;

    __m256 aVal, bVal, cVal, logarithm, mantissa, frac, leadingOne;
    __m256 tmp, fx, mask, pow2n, z, y;
    __m256 one, exp_hi, exp_lo, ln2, log2EF, half, exp_C1, exp_C2;
    __m256 exp_p0, exp_p1, exp_p2, exp_p3, exp_p4, exp_p5;
    __m256i bias, exp, emm0, pi32_0x7f;

    one = _mm256_set1_ps(1.0);
    exp_hi = _mm256_set1_ps(88.3762626647949);  /* expf overflow clamp */
    exp_lo = _mm256_set1_ps(-88.3762626647949); /* expf underflow clamp */
    ln2 = _mm256_set1_ps(0.6931471805);
    log2EF = _mm256_set1_ps(1.44269504088896341); /* 1/ln(2) */
    half = _mm256_set1_ps(0.5);
    exp_C1 = _mm256_set1_ps(0.693359375);    /* ln(2) split, high part */
    exp_C2 = _mm256_set1_ps(-2.12194440e-4); /* ln(2) split, low part */
    pi32_0x7f = _mm256_set1_epi32(0x7f);

    exp_p0 = _mm256_set1_ps(1.9875691500e-4);
    exp_p1 = _mm256_set1_ps(1.3981999507e-3);
    exp_p2 = _mm256_set1_ps(8.3334519073e-3);
    exp_p3 = _mm256_set1_ps(4.1665795894e-2);
    exp_p4 = _mm256_set1_ps(1.6666665459e-1);
    exp_p5 = _mm256_set1_ps(5.0000001201e-1);

    for (; number < eighthPoints; number++) {
        /* ---- log(a): exponent field gives the integer part ---- */
        aVal = _mm256_loadu_ps(aPtr);
        bias = _mm256_set1_epi32(127);
        leadingOne = _mm256_set1_ps(1.0f);
        exp = _mm256_sub_epi32(
            _mm256_srli_epi32(_mm256_and_si256(_mm256_castps_si256(aVal),
                                               _mm256_set1_epi32(0x7f800000)),
                              23),
            bias);
        logarithm = _mm256_cvtepi32_ps(exp);

        /* mantissa in [1,2): OR the stored fraction bits with 1.0f */
        frac = _mm256_or_ps(
            leadingOne,
            _mm256_and_ps(aVal, _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffff))));

#if POW_POLY_DEGREE == 6
        mantissa = POLY5_AVX2_FMA(frac,
                                  3.1157899f,
                                  -3.3241990f,
                                  2.5988452f,
                                  -1.2315303f,
                                  3.1821337e-1f,
                                  -3.4436006e-2f);
#elif POW_POLY_DEGREE == 5
        mantissa = POLY4_AVX2_FMA(frac,
                                  2.8882704548164776201f,
                                  -2.52074962577807006663f,
                                  1.48116647521213171641f,
                                  -0.465725644288844778798f,
                                  0.0596515482674574969533f);
#elif POW_POLY_DEGREE == 4
        mantissa = POLY3_AVX2_FMA(frac,
                                  2.61761038894603480148f,
                                  -1.75647175389045657003f,
                                  0.688243882994381274313f,
                                  -0.107254423828329604454f);
#elif POW_POLY_DEGREE == 3
        mantissa = POLY2_AVX2_FMA(frac,
                                  2.28330284476918490682f,
                                  -1.04913055217340124191f,
                                  0.204446009836232697516f);
#else
#error
#endif

        /* log2(a) = exponent + poly(frac) * (frac - 1); scale by ln(2) */
        logarithm = _mm256_fmadd_ps(mantissa, _mm256_sub_ps(frac, leadingOne), logarithm);
        logarithm = _mm256_mul_ps(logarithm, ln2);

        /* ---- exp(b * log(a)) ---- */
        bVal = _mm256_loadu_ps(bPtr);
        bVal = _mm256_mul_ps(bVal, logarithm);

        bVal = _mm256_max_ps(_mm256_min_ps(bVal, exp_hi), exp_lo);

        /* fx = round-to-nearest(bVal / ln2) via truncate(x + 0.5) fixup */
        fx = _mm256_fmadd_ps(bVal, log2EF, half);

        emm0 = _mm256_cvttps_epi32(fx);
        tmp = _mm256_cvtepi32_ps(emm0);

        mask = _mm256_and_ps(_mm256_cmp_ps(tmp, fx, _CMP_GT_OS), one);
        fx = _mm256_sub_ps(tmp, mask);

        /* reduce: bVal -= fx * ln(2), using the two-part split for accuracy */
        tmp = _mm256_fnmadd_ps(fx, exp_C1, bVal);
        bVal = _mm256_fnmadd_ps(fx, exp_C2, tmp);
        z = _mm256_mul_ps(bVal, bVal);

        y = _mm256_fmadd_ps(exp_p0, bVal, exp_p1);
        y = _mm256_fmadd_ps(y, bVal, exp_p2);
        y = _mm256_fmadd_ps(y, bVal, exp_p3);
        y = _mm256_fmadd_ps(y, bVal, exp_p4);
        y = _mm256_fmadd_ps(y, bVal, exp_p5);
        y = _mm256_fmadd_ps(y, z, bVal);
        y = _mm256_add_ps(y, one);

        /* scale by 2^fx: build the float exponent field directly */
        emm0 =
            _mm256_slli_epi32(_mm256_add_epi32(_mm256_cvttps_epi32(fx), pi32_0x7f), 23);

        pow2n = _mm256_castsi256_ps(emm0);
        cVal = _mm256_mul_ps(y, pow2n);

        _mm256_storeu_ps(cPtr, cVal);

        aPtr += 8;
        bPtr += 8;
        cPtr += 8;
    }

    number = eighthPoints * 8;
    for (; number < num_points; number++) {
        /* powf (not pow): keeps the scalar tail in single precision,
         * consistent with the generic and SSE implementations. */
        *cPtr++ = powf(*aPtr++, *bPtr++);
    }
}

#endif /* LV_HAVE_AVX2 && LV_HAVE_FMA */
#ifdef LV_HAVE_AVX2
#include <immintrin.h>

/* Horner-scheme polynomial evaluators (mul/add form, no FMA).
 * POLYn(x, c0..cn) computes c0 + c1*x + ... + cn*x^n. */
#define POLY0_AVX2(x, c0) _mm256_set1_ps(c0)
#define POLY1_AVX2(x, c0, c1) \
    _mm256_add_ps(_mm256_mul_ps(POLY0_AVX2(x, c1), x), _mm256_set1_ps(c0))
#define POLY2_AVX2(x, c0, c1, c2) \
    _mm256_add_ps(_mm256_mul_ps(POLY1_AVX2(x, c1, c2), x), _mm256_set1_ps(c0))
#define POLY3_AVX2(x, c0, c1, c2, c3) \
    _mm256_add_ps(_mm256_mul_ps(POLY2_AVX2(x, c1, c2, c3), x), _mm256_set1_ps(c0))
#define POLY4_AVX2(x, c0, c1, c2, c3, c4) \
    _mm256_add_ps(_mm256_mul_ps(POLY3_AVX2(x, c1, c2, c3, c4), x), _mm256_set1_ps(c0))
#define POLY5_AVX2(x, c0, c1, c2, c3, c4, c5) \
    _mm256_add_ps(_mm256_mul_ps(POLY4_AVX2(x, c1, c2, c3, c4, c5), x), _mm256_set1_ps(c0))

/*!
 * Computes cVector[i] = powf(aVector[i], bVector[i]) as exp(b * log(a)).
 * Unaligned AVX2 variant without FMA; logic is identical to
 * volk_32f_x2_pow_32f_a_avx2.
 *
 * \param cVector    output buffer, num_points floats
 * \param bVector    exponents
 * \param aVector    bases
 * \param num_points number of elements
 */
static inline void volk_32f_x2_pow_32f_u_avx2(float* cVector,
                                              const float* bVector,
                                              const float* aVector,
                                              unsigned int num_points)
{
    float* cPtr = cVector;
    const float* bPtr = bVector;
    const float* aPtr = aVector;

    unsigned int number = 0;
    const unsigned int eighthPoints = num_points / 8;

    __m256 aVal, bVal, cVal, logarithm, mantissa, frac, leadingOne;
    __m256 tmp, fx, mask, pow2n, z, y;
    __m256 one, exp_hi, exp_lo, ln2, log2EF, half, exp_C1, exp_C2;
    __m256 exp_p0, exp_p1, exp_p2, exp_p3, exp_p4, exp_p5;
    __m256i bias, exp, emm0, pi32_0x7f;

    one = _mm256_set1_ps(1.0);
    exp_hi = _mm256_set1_ps(88.3762626647949);  /* expf overflow clamp */
    exp_lo = _mm256_set1_ps(-88.3762626647949); /* expf underflow clamp */
    ln2 = _mm256_set1_ps(0.6931471805);
    log2EF = _mm256_set1_ps(1.44269504088896341); /* 1/ln(2) */
    half = _mm256_set1_ps(0.5);
    exp_C1 = _mm256_set1_ps(0.693359375);    /* ln(2) split, high part */
    exp_C2 = _mm256_set1_ps(-2.12194440e-4); /* ln(2) split, low part */
    pi32_0x7f = _mm256_set1_epi32(0x7f);

    exp_p0 = _mm256_set1_ps(1.9875691500e-4);
    exp_p1 = _mm256_set1_ps(1.3981999507e-3);
    exp_p2 = _mm256_set1_ps(8.3334519073e-3);
    exp_p3 = _mm256_set1_ps(4.1665795894e-2);
    exp_p4 = _mm256_set1_ps(1.6666665459e-1);
    exp_p5 = _mm256_set1_ps(5.0000001201e-1);

    for (; number < eighthPoints; number++) {
        /* ---- log(a): exponent field gives the integer part ---- */
        aVal = _mm256_loadu_ps(aPtr);
        bias = _mm256_set1_epi32(127);
        leadingOne = _mm256_set1_ps(1.0f);
        exp = _mm256_sub_epi32(
            _mm256_srli_epi32(_mm256_and_si256(_mm256_castps_si256(aVal),
                                               _mm256_set1_epi32(0x7f800000)),
                              23),
            bias);
        logarithm = _mm256_cvtepi32_ps(exp);

        /* mantissa in [1,2): OR the stored fraction bits with 1.0f */
        frac = _mm256_or_ps(
            leadingOne,
            _mm256_and_ps(aVal, _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffff))));

#if POW_POLY_DEGREE == 6
        mantissa = POLY5_AVX2(frac,
                              3.1157899f,
                              -3.3241990f,
                              2.5988452f,
                              -1.2315303f,
                              3.1821337e-1f,
                              -3.4436006e-2f);
#elif POW_POLY_DEGREE == 5
        mantissa = POLY4_AVX2(frac,
                              2.8882704548164776201f,
                              -2.52074962577807006663f,
                              1.48116647521213171641f,
                              -0.465725644288844778798f,
                              0.0596515482674574969533f);
#elif POW_POLY_DEGREE == 4
        mantissa = POLY3_AVX2(frac,
                              2.61761038894603480148f,
                              -1.75647175389045657003f,
                              0.688243882994381274313f,
                              -0.107254423828329604454f);
#elif POW_POLY_DEGREE == 3
        mantissa = POLY2_AVX2(frac,
                              2.28330284476918490682f,
                              -1.04913055217340124191f,
                              0.204446009836232697516f);
#else
#error
#endif

        /* log2(a) = exponent + poly(frac) * (frac - 1); scale by ln(2) */
        logarithm = _mm256_add_ps(
            _mm256_mul_ps(mantissa, _mm256_sub_ps(frac, leadingOne)), logarithm);
        logarithm = _mm256_mul_ps(logarithm, ln2);

        /* ---- exp(b * log(a)) ---- */
        bVal = _mm256_loadu_ps(bPtr);
        bVal = _mm256_mul_ps(bVal, logarithm);

        bVal = _mm256_max_ps(_mm256_min_ps(bVal, exp_hi), exp_lo);

        /* fx = round-to-nearest(bVal / ln2) via truncate(x + 0.5) fixup */
        fx = _mm256_add_ps(_mm256_mul_ps(bVal, log2EF), half);

        emm0 = _mm256_cvttps_epi32(fx);
        tmp = _mm256_cvtepi32_ps(emm0);

        mask = _mm256_and_ps(_mm256_cmp_ps(tmp, fx, _CMP_GT_OS), one);
        fx = _mm256_sub_ps(tmp, mask);

        /* reduce: bVal -= fx * ln(2), using the two-part split for accuracy */
        tmp = _mm256_sub_ps(bVal, _mm256_mul_ps(fx, exp_C1));
        bVal = _mm256_sub_ps(tmp, _mm256_mul_ps(fx, exp_C2));
        z = _mm256_mul_ps(bVal, bVal);

        y = _mm256_add_ps(_mm256_mul_ps(exp_p0, bVal), exp_p1);
        y = _mm256_add_ps(_mm256_mul_ps(y, bVal), exp_p2);
        y = _mm256_add_ps(_mm256_mul_ps(y, bVal), exp_p3);
        y = _mm256_add_ps(_mm256_mul_ps(y, bVal), exp_p4);
        y = _mm256_add_ps(_mm256_mul_ps(y, bVal), exp_p5);
        y = _mm256_add_ps(_mm256_mul_ps(y, z), bVal);
        y = _mm256_add_ps(y, one);

        /* scale by 2^fx: build the float exponent field directly */
        emm0 =
            _mm256_slli_epi32(_mm256_add_epi32(_mm256_cvttps_epi32(fx), pi32_0x7f), 23);

        pow2n = _mm256_castsi256_ps(emm0);
        cVal = _mm256_mul_ps(y, pow2n);

        _mm256_storeu_ps(cPtr, cVal);

        aPtr += 8;
        bPtr += 8;
        cPtr += 8;
    }

    number = eighthPoints * 8;
    for (; number < num_points; number++) {
        /* powf (not pow): keeps the scalar tail in single precision,
         * consistent with the generic and SSE implementations. */
        *cPtr++ = powf(*aPtr++, *bPtr++);
    }
}

#endif /* LV_HAVE_AVX2 */
1212#include <riscv_vector.h>
1214static inline void volk_32f_x2_pow_32f_rvv(
float* cVector,
1215 const float* bVector,
1216 const float* aVector,
1217 unsigned int num_points)
1219 size_t vlmax = __riscv_vsetvlmax_e32m1();
1221#if POW_POLY_DEGREE == 6
1222 const vfloat32m1_t cl5 = __riscv_vfmv_v_f_f32m1(3.1157899f, vlmax);
1223 const vfloat32m1_t cl4 = __riscv_vfmv_v_f_f32m1(-3.3241990f, vlmax);
1224 const vfloat32m1_t cl3 = __riscv_vfmv_v_f_f32m1(2.5988452f, vlmax);
1225 const vfloat32m1_t cl2 = __riscv_vfmv_v_f_f32m1(-1.2315303f, vlmax);
1226 const vfloat32m1_t cl1 = __riscv_vfmv_v_f_f32m1(3.1821337e-1f, vlmax);
1227 const vfloat32m1_t cl0 = __riscv_vfmv_v_f_f32m1(-3.4436006e-2f, vlmax);
1228#elif POW_POLY_DEGREE == 5
1229 const vfloat32m1_t cl4 = __riscv_vfmv_v_f_f32m1(2.8882704548164776201f, vlmax);
1230 const vfloat32m1_t cl3 = __riscv_vfmv_v_f_f32m1(-2.52074962577807006663f, vlmax);
1231 const vfloat32m1_t cl2 = __riscv_vfmv_v_f_f32m1(1.48116647521213171641f, vlmax);
1232 const vfloat32m1_t cl1 = __riscv_vfmv_v_f_f32m1(-0.465725644288844778798f, vlmax);
1233 const vfloat32m1_t cl0 = __riscv_vfmv_v_f_f32m1(0.0596515482674574969533f, vlmax);
1234#elif POW_POLY_DEGREE == 4
1235 const vfloat32m1_t cl3 = __riscv_vfmv_v_f_f32m1(2.61761038894603480148f, vlmax);
1236 const vfloat32m1_t cl2 = __riscv_vfmv_v_f_f32m1(-1.75647175389045657003f, vlmax);
1237 const vfloat32m1_t cl1 = __riscv_vfmv_v_f_f32m1(0.688243882994381274313f, vlmax);
1238 const vfloat32m1_t cl0 = __riscv_vfmv_v_f_f32m1(-0.107254423828329604454f, vlmax);
1239#elif POW_POLY_DEGREE == 3
1240 const vfloat32m1_t cl2 = __riscv_vfmv_v_f_f32m1(2.28330284476918490682f, vlmax);
1241 const vfloat32m1_t cl1 = __riscv_vfmv_v_f_f32m1(-1.04913055217340124191f, vlmax);
1242 const vfloat32m1_t cl0 = __riscv_vfmv_v_f_f32m1(0.204446009836232697516f, vlmax);
1247 const vfloat32m1_t exp_hi = __riscv_vfmv_v_f_f32m1(88.376259f, vlmax);
1248 const vfloat32m1_t exp_lo = __riscv_vfmv_v_f_f32m1(-88.376259f, vlmax);
1249 const vfloat32m1_t log2EF = __riscv_vfmv_v_f_f32m1(1.442695f, vlmax);
1250 const vfloat32m1_t exp_C1 = __riscv_vfmv_v_f_f32m1(-0.6933594f, vlmax);
1251 const vfloat32m1_t exp_C2 = __riscv_vfmv_v_f_f32m1(0.000212194f, vlmax);
1252 const vfloat32m1_t cf1 = __riscv_vfmv_v_f_f32m1(1.0f, vlmax);
1253 const vfloat32m1_t cf1o2 = __riscv_vfmv_v_f_f32m1(0.5f, vlmax);
1254 const vfloat32m1_t ln2 = __riscv_vfmv_v_f_f32m1(0.6931471805f, vlmax);
1256 const vfloat32m1_t ce0 = __riscv_vfmv_v_f_f32m1(1.9875691500e-4, vlmax);
1257 const vfloat32m1_t ce1 = __riscv_vfmv_v_f_f32m1(1.3981999507e-3, vlmax);
1258 const vfloat32m1_t ce2 = __riscv_vfmv_v_f_f32m1(8.3334519073e-3, vlmax);
1259 const vfloat32m1_t ce3 = __riscv_vfmv_v_f_f32m1(4.1665795894e-2, vlmax);
1260 const vfloat32m1_t ce4 = __riscv_vfmv_v_f_f32m1(1.6666665459e-1, vlmax);
1261 const vfloat32m1_t ce5 = __riscv_vfmv_v_f_f32m1(5.0000001201e-1, vlmax);
1263 const vint32m1_t m1 = __riscv_vreinterpret_i32m1(cf1);
1264 const vint32m1_t m2 = __riscv_vmv_v_x_i32m1(0x7FFFFF, vlmax);
1265 const vint32m1_t c127 = __riscv_vmv_v_x_i32m1(127, vlmax);
1267 size_t n = num_points;
1268 for (
size_t vl; n > 0; n -= vl, aVector += vl, bVector += vl, cVector += vl) {
1269 vl = __riscv_vsetvl_e32m1(n);
1270 vfloat32m1_t va = __riscv_vle32_v_f32m1(aVector, vl);
1274 vfloat32m1_t a = __riscv_vfabs(va, vl);
1275 vfloat32m1_t exp = __riscv_vfcvt_f(
1277 __riscv_vsra(__riscv_vreinterpret_i32m1(a), 23, vl), c127, vl),
1279 vfloat32m1_t frac = __riscv_vreinterpret_f32m1(__riscv_vor(
1280 __riscv_vand(__riscv_vreinterpret_i32m1(va), m2, vl), m1, vl));
1282 vfloat32m1_t mant = cl0;
1283 mant = __riscv_vfmadd(mant, frac, cl1, vl);
1284 mant = __riscv_vfmadd(mant, frac, cl2, vl);
1285#if POW_POLY_DEGREE >= 4
1286 mant = __riscv_vfmadd(mant, frac, cl3, vl);
1287#if POW_POLY_DEGREE >= 5
1288 mant = __riscv_vfmadd(mant, frac, cl4, vl);
1289#if POW_POLY_DEGREE >= 6
1290 mant = __riscv_vfmadd(mant, frac, cl5, vl);
1294 log = __riscv_vfmacc(exp, mant, __riscv_vfsub(frac, cf1, vl), vl);
1295 log = __riscv_vfmul(log, ln2, vl);
1298 vfloat32m1_t vb = __riscv_vle32_v_f32m1(bVector, vl);
1299 vb = __riscv_vfmul(vb, log, vl);
1303 vb = __riscv_vfmin(vb, exp_hi, vl);
1304 vb = __riscv_vfmax(vb, exp_lo, vl);
1305 vfloat32m1_t fx = __riscv_vfmadd(vb, log2EF, cf1o2, vl);
1307 vfloat32m1_t rtz = __riscv_vfcvt_f(__riscv_vfcvt_rtz_x(fx, vl), vl);
1308 fx = __riscv_vfsub_mu(__riscv_vmfgt(rtz, fx, vl), rtz, rtz, cf1, vl);
1309 vb = __riscv_vfmacc(vb, exp_C1, fx, vl);
1310 vb = __riscv_vfmacc(vb, exp_C2, fx, vl);
1311 vfloat32m1_t vv = __riscv_vfmul(vb, vb, vl);
1313 vfloat32m1_t y = ce0;
1314 y = __riscv_vfmadd(y, vb, ce1, vl);
1315 y = __riscv_vfmadd(y, vb, ce2, vl);
1316 y = __riscv_vfmadd(y, vb, ce3, vl);
1317 y = __riscv_vfmadd(y, vb, ce4, vl);
1318 y = __riscv_vfmadd(y, vb, ce5, vl);
1319 y = __riscv_vfmadd(y, vv, vb, vl);
1320 y = __riscv_vfadd(y, cf1, vl);
1322 vfloat32m1_t pow2n = __riscv_vreinterpret_f32m1(__riscv_vsll(
1323 __riscv_vadd(__riscv_vfcvt_rtz_x(fx, vl), c127, vl), 23, vl));
1325 exp = __riscv_vfmul(y, pow2n, vl);
1328 __riscv_vse32(cVector, exp, vl);