86#ifndef INCLUDED_volk_32f_exp_32f_a_H
87#define INCLUDED_volk_32f_exp_32f_a_H
// Body of volk_32f_exp_32f_a_sse2 (NOTE(review): the function signature on
// the preceding lines is not visible in this fragment). Computes
// bVector[i] = e^(aVector[i]) for num_points floats, 4 at a time with SSE2,
// using the classic Cephes-style exp approximation; a scalar expf() loop
// finishes the 0-3 leftover elements.
95 float* bPtr = bVector;
96 const float* aPtr = aVector;
98 unsigned int number = 0;
99 unsigned int quarterPoints = num_points / 4;
102 __m128 aVal, bVal, tmp, fx, mask, pow2n, z, y;
103 __m128 one, exp_hi, exp_lo, log2EF, half, exp_C1, exp_C2;
104 __m128 exp_p0, exp_p1, exp_p2, exp_p3, exp_p4, exp_p5;
// Clamp limits +/-88.376... keep e^x inside finite single-precision range.
// log2EF = log2(e). exp_C1 + exp_C2 reproduce ln(2) as a coarse value plus a
// small correction so the range reduction below loses fewer bits in float.
105 __m128i emm0, pi32_0x7f;
107 one = _mm_set1_ps(1.0);
108 exp_hi = _mm_set1_ps(88.3762626647949);
109 exp_lo = _mm_set1_ps(-88.3762626647949);
110 log2EF = _mm_set1_ps(1.44269504088896341);
111 half = _mm_set1_ps(0.5);
112 exp_C1 = _mm_set1_ps(0.693359375);
113 exp_C2 = _mm_set1_ps(-2.12194440e-4);
114 pi32_0x7f = _mm_set1_epi32(0x7f); // IEEE-754 single exponent bias (127)
// Degree-5 polynomial coefficients approximating e^x on the reduced interval.
116 exp_p0 = _mm_set1_ps(1.9875691500e-4);
117 exp_p1 = _mm_set1_ps(1.3981999507e-3);
118 exp_p2 = _mm_set1_ps(8.3334519073e-3);
119 exp_p3 = _mm_set1_ps(4.1665795894e-2);
120 exp_p4 = _mm_set1_ps(1.6666665459e-1);
121 exp_p5 = _mm_set1_ps(5.0000001201e-1);
123 for (; number < quarterPoints; number++) {
124 aVal = _mm_load_ps(aPtr); // aligned load -- this is the _a variant
125 tmp = _mm_setzero_ps();
127 aVal = _mm_max_ps(_mm_min_ps(aVal, exp_hi), exp_lo); // clamp input range
// fx = x*log2(e) + 0.5: candidate power-of-two exponent, rounded below.
130 fx = _mm_add_ps(_mm_mul_ps(aVal, log2EF), half);
// Emulate floor(fx): truncate to int, then subtract 1 wherever the
// truncated value exceeds fx (i.e. fx was negative and rounded up).
132 emm0 = _mm_cvttps_epi32(fx);
133 tmp = _mm_cvtepi32_ps(emm0);
135 mask = _mm_and_ps(_mm_cmpgt_ps(tmp, fx), one);
136 fx = _mm_sub_ps(tmp, mask);
// Range reduction: x -= fx*ln(2), applied as two subtractions (C1, C2).
138 tmp = _mm_mul_ps(fx, exp_C1);
139 z = _mm_mul_ps(fx, exp_C2);
140 aVal = _mm_sub_ps(_mm_sub_ps(aVal, tmp), z);
141 z = _mm_mul_ps(aVal, aVal); // z = x^2
// Polynomial evaluation; equivalent to
// (p0*x^5 + p1*x^4 + p2*x^3 + p3*x^2 + p4*x + p5)*x^2 + x + 1 ~= e^x.
143 y = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(exp_p0, aVal), exp_p1), aVal);
144 y = _mm_add_ps(_mm_mul_ps(_mm_add_ps(y, exp_p2), aVal), exp_p3);
145 y = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(y, aVal), exp_p4), aVal);
146 y = _mm_add_ps(_mm_mul_ps(_mm_add_ps(y, exp_p5), z), aVal);
147 y = _mm_add_ps(y, one);
// Build 2^fx by placing (fx + 127) into the float exponent bit field.
149 emm0 = _mm_slli_epi32(_mm_add_epi32(_mm_cvttps_epi32(fx), pi32_0x7f), 23);
151 pow2n = _mm_castsi128_ps(emm0);
152 bVal = _mm_mul_ps(y, pow2n); // e^x = polynomial * 2^fx
154 _mm_store_ps(bPtr, bVal);
// NOTE(review): the aPtr/bPtr += 4 advances and the loop's closing brace
// are not visible in this fragment.
// Scalar tail: handle the remaining num_points % 4 elements with libm expf.
159 number = quarterPoints * 4;
160 for (; number < num_points; number++) {
161 *bPtr++ = expf(*aPtr++);
170#ifndef INCLUDED_volk_32f_exp_32f_u_H
171#define INCLUDED_volk_32f_exp_32f_u_H
174#include <emmintrin.h>
// Body of volk_32f_exp_32f_u_sse2 (NOTE(review): signature not visible in
// this fragment). Identical algorithm to the aligned variant above, but uses
// unaligned loads/stores (_mm_loadu_ps / _mm_storeu_ps) so the buffers need
// no 16-byte alignment.
179 float* bPtr = bVector;
180 const float* aPtr = aVector;
182 unsigned int number = 0;
183 unsigned int quarterPoints = num_points / 4;
186 __m128 aVal, bVal, tmp, fx, mask, pow2n, z, y;
187 __m128 one, exp_hi, exp_lo, log2EF, half, exp_C1, exp_C2;
188 __m128 exp_p0, exp_p1, exp_p2, exp_p3, exp_p4, exp_p5;
189 __m128i emm0, pi32_0x7f;
// Constants: clamp bounds for finite float e^x, log2(e), and ln(2) split
// into exp_C1 + exp_C2 for a more accurate range reduction.
191 one = _mm_set1_ps(1.0);
192 exp_hi = _mm_set1_ps(88.3762626647949);
193 exp_lo = _mm_set1_ps(-88.3762626647949);
194 log2EF = _mm_set1_ps(1.44269504088896341);
195 half = _mm_set1_ps(0.5);
196 exp_C1 = _mm_set1_ps(0.693359375);
197 exp_C2 = _mm_set1_ps(-2.12194440e-4);
198 pi32_0x7f = _mm_set1_epi32(0x7f); // float exponent bias (127)
// Degree-5 polynomial coefficients for e^x on the reduced interval.
200 exp_p0 = _mm_set1_ps(1.9875691500e-4);
201 exp_p1 = _mm_set1_ps(1.3981999507e-3);
202 exp_p2 = _mm_set1_ps(8.3334519073e-3);
203 exp_p3 = _mm_set1_ps(4.1665795894e-2);
204 exp_p4 = _mm_set1_ps(1.6666665459e-1);
205 exp_p5 = _mm_set1_ps(5.0000001201e-1);
208 for (; number < quarterPoints; number++) {
209 aVal = _mm_loadu_ps(aPtr); // unaligned load -- this is the _u variant
210 tmp = _mm_setzero_ps();
212 aVal = _mm_max_ps(_mm_min_ps(aVal, exp_hi), exp_lo); // clamp input range
// fx = x*log2(e) + 0.5, then floored below to get the integer exponent.
215 fx = _mm_add_ps(_mm_mul_ps(aVal, log2EF), half);
// floor() emulation: truncate, subtract 1 where truncation rounded up.
217 emm0 = _mm_cvttps_epi32(fx);
218 tmp = _mm_cvtepi32_ps(emm0);
220 mask = _mm_and_ps(_mm_cmpgt_ps(tmp, fx), one);
221 fx = _mm_sub_ps(tmp, mask);
// Range reduction: x -= fx*ln(2) via the two-part constant.
223 tmp = _mm_mul_ps(fx, exp_C1);
224 z = _mm_mul_ps(fx, exp_C2);
225 aVal = _mm_sub_ps(_mm_sub_ps(aVal, tmp), z);
226 z = _mm_mul_ps(aVal, aVal); // z = x^2
// Polynomial approximation of e^x on the reduced argument.
228 y = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(exp_p0, aVal), exp_p1), aVal);
229 y = _mm_add_ps(_mm_mul_ps(_mm_add_ps(y, exp_p2), aVal), exp_p3);
230 y = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(y, aVal), exp_p4), aVal);
231 y = _mm_add_ps(_mm_mul_ps(_mm_add_ps(y, exp_p5), z), aVal);
232 y = _mm_add_ps(y, one);
// Reconstruct 2^fx through the float exponent bits, then scale.
234 emm0 = _mm_slli_epi32(_mm_add_epi32(_mm_cvttps_epi32(fx), pi32_0x7f), 23);
236 pow2n = _mm_castsi128_ps(emm0);
237 bVal = _mm_mul_ps(y, pow2n);
239 _mm_storeu_ps(bPtr, bVal); // unaligned store
// NOTE(review): pointer advances and loop close are outside this fragment.
// Scalar tail for num_points % 4 leftover elements.
244 number = quarterPoints * 4;
245 for (; number < num_points; number++) {
246 *bPtr++ = expf(*aPtr++);
253#ifdef LV_HAVE_GENERIC
// Generic scalar fallback (NOTE(review): signature not visible in this
// fragment): bVector[i] = expf(aVector[i]) for every point, no SIMD.
258 float* bPtr = bVector;
259 const float* aPtr = aVector;
260 unsigned int number = 0;
262 for (number = 0; number < num_points; number++) {
263 *bPtr++ = expf(*aPtr++);
// NEON body (NOTE(review): signature not visible in this fragment). Same
// Cephes-style exp algorithm as the SSE2 variants, 4 floats per iteration.
// vmlaq_f32(a, b, c) computes a + b*c (multiply-accumulate).
275 float* bPtr = bVector;
276 const float* aPtr = aVector;
278 unsigned int number = 0;
279 unsigned int quarterPoints = num_points / 4;
// Constants: clamp bounds, log2(e), ln(2) split into exp_C1 + exp_C2, the
// float exponent bias 0x7f, and the degree-5 polynomial coefficients.
282 float32x4_t one = vdupq_n_f32(1.0f);
283 float32x4_t exp_hi = vdupq_n_f32(88.3762626647949f);
284 float32x4_t exp_lo = vdupq_n_f32(-88.3762626647949f);
285 float32x4_t log2EF = vdupq_n_f32(1.44269504088896341f);
286 float32x4_t half = vdupq_n_f32(0.5f);
287 float32x4_t exp_C1 = vdupq_n_f32(0.693359375f);
288 float32x4_t exp_C2 = vdupq_n_f32(-2.12194440e-4f);
289 int32x4_t pi32_0x7f = vdupq_n_s32(0x7f);
291 float32x4_t exp_p0 = vdupq_n_f32(1.9875691500e-4f);
292 float32x4_t exp_p1 = vdupq_n_f32(1.3981999507e-3f);
293 float32x4_t exp_p2 = vdupq_n_f32(8.3334519073e-3f);
294 float32x4_t exp_p3 = vdupq_n_f32(4.1665795894e-2f);
295 float32x4_t exp_p4 = vdupq_n_f32(1.6666665459e-1f);
296 float32x4_t exp_p5 = vdupq_n_f32(5.0000001201e-1f);
298 for (; number < quarterPoints; number++) {
299 float32x4_t aVal = vld1q_f32(aPtr);
302 aVal = vmaxq_f32(vminq_f32(aVal, exp_hi), exp_lo); // clamp input range
// fx = 0.5 + x*log2(e); floored below to get the integer exponent.
305 float32x4_t fx = vmlaq_f32(half, aVal, log2EF);
// floor() emulation: truncate to int, then subtract 1.0 in lanes where the
// truncated value exceeds fx (selected via compare + bitwise-select).
308 int32x4_t emm0 = vcvtq_s32_f32(fx);
309 float32x4_t tmp = vcvtq_f32_s32(emm0);
312 uint32x4_t mask = vcgtq_f32(tmp, fx);
313 float32x4_t mask_one = vbslq_f32(mask, one, vdupq_n_f32(0.0f));
314 fx = vsubq_f32(tmp, mask_one);
// Range reduction: x -= fx*ln(2) using the two-part constant.
317 tmp = vmulq_f32(fx, exp_C1);
318 float32x4_t z = vmulq_f32(fx, exp_C2);
319 aVal = vsubq_f32(vsubq_f32(aVal, tmp), z);
320 z = vmulq_f32(aVal, aVal); // z = x^2
// Polynomial chain; same polynomial as the SSE2 variants:
// (p0*x^5 + ... + p5)*x^2 + x + 1 ~= e^x on the reduced interval.
323 float32x4_t y = vmlaq_f32(exp_p1, exp_p0, aVal);
324 y = vmulq_f32(y, aVal);
325 y = vaddq_f32(y, exp_p2);
326 y = vmulq_f32(y, aVal);
327 y = vaddq_f32(y, exp_p3);
328 y = vmlaq_f32(exp_p4, y, aVal);
329 y = vmulq_f32(y, aVal);
330 y = vaddq_f32(y, exp_p5);
331 y = vmlaq_f32(aVal, y, z);
332 y = vaddq_f32(y, one);
// Rebuild 2^fx by placing (fx + 127) into the float exponent bits.
335 emm0 = vcvtq_s32_f32(fx);
336 emm0 = vaddq_s32(emm0, pi32_0x7f);
337 emm0 = vshlq_n_s32(emm0, 23);
338 float32x4_t pow2n = vreinterpretq_f32_s32(emm0);
340 float32x4_t bVal = vmulq_f32(y, pow2n); // e^x = polynomial * 2^fx
341 vst1q_f32(bPtr, bVal);
// NOTE(review): pointer advances and loop close are outside this fragment.
// Scalar tail for num_points % 4 leftover elements.
347 number = quarterPoints * 4;
348 for (; number < num_points; number++) {
349 *bPtr++ = expf(*aPtr++);
// AArch64 (NEONv8) variant: same algorithm, but processes 8 floats per
// iteration as two interleaved 4-lane streams (for instruction-level
// parallelism) and uses fused multiply-add vfmaq_f32(a, b, c) = a + b*c.
// NOTE(review): the return type on the preceding line and the function's
// opening brace are not visible in this fragment.
359volk_32f_exp_32f_neonv8(
float* bVector,
const float* aVector,
unsigned int num_points)
361 float* bPtr = bVector;
362 const float* aPtr = aVector;
364 unsigned int number = 0;
365 unsigned int eighthPoints = num_points / 8;
// Constants: clamp bounds, log2(e), ln(2) split as exp_C1 + exp_C2, the
// float exponent bias 0x7f, and the degree-5 polynomial coefficients.
368 float32x4_t one = vdupq_n_f32(1.0f);
369 float32x4_t exp_hi = vdupq_n_f32(88.3762626647949f);
370 float32x4_t exp_lo = vdupq_n_f32(-88.3762626647949f);
371 float32x4_t log2EF = vdupq_n_f32(1.44269504088896341f);
372 float32x4_t half = vdupq_n_f32(0.5f);
373 float32x4_t exp_C1 = vdupq_n_f32(0.693359375f);
374 float32x4_t exp_C2 = vdupq_n_f32(-2.12194440e-4f);
375 int32x4_t pi32_0x7f = vdupq_n_s32(0x7f);
377 float32x4_t exp_p0 = vdupq_n_f32(1.9875691500e-4f);
378 float32x4_t exp_p1 = vdupq_n_f32(1.3981999507e-3f);
379 float32x4_t exp_p2 = vdupq_n_f32(8.3334519073e-3f);
380 float32x4_t exp_p3 = vdupq_n_f32(4.1665795894e-2f);
381 float32x4_t exp_p4 = vdupq_n_f32(1.6666665459e-1f);
382 float32x4_t exp_p5 = vdupq_n_f32(5.0000001201e-1f);
384 for (; number < eighthPoints; number++) {
// Two independent 4-float vectors per iteration.
387 float32x4_t aVal0 = vld1q_f32(aPtr);
388 float32x4_t aVal1 = vld1q_f32(aPtr + 4);
// Clamp both streams to the finite-result range.
391 aVal0 = vmaxq_f32(vminq_f32(aVal0, exp_hi), exp_lo);
392 aVal1 = vmaxq_f32(vminq_f32(aVal1, exp_hi), exp_lo);
// fx = 0.5 + x*log2(e) (fused), floored below.
395 float32x4_t fx0 = vfmaq_f32(half, aVal0, log2EF);
396 float32x4_t fx1 = vfmaq_f32(half, aVal1, log2EF);
// floor() emulation: truncate, subtract 1.0 where truncation rounded up.
399 int32x4_t emm0_0 = vcvtq_s32_f32(fx0);
400 int32x4_t emm0_1 = vcvtq_s32_f32(fx1);
401 float32x4_t tmp0 = vcvtq_f32_s32(emm0_0);
402 float32x4_t tmp1 = vcvtq_f32_s32(emm0_1);
405 uint32x4_t mask0 = vcgtq_f32(tmp0, fx0);
406 uint32x4_t mask1 = vcgtq_f32(tmp1, fx1);
407 float32x4_t mask_one0 = vbslq_f32(mask0, one, vdupq_n_f32(0.0f));
408 float32x4_t mask_one1 = vbslq_f32(mask1, one, vdupq_n_f32(0.0f));
409 fx0 = vsubq_f32(tmp0, mask_one0);
410 fx1 = vsubq_f32(tmp1, mask_one1);
// Range reduction: x -= fx*ln(2) via the two-part constant.
413 tmp0 = vmulq_f32(fx0, exp_C1);
414 tmp1 = vmulq_f32(fx1, exp_C1);
415 float32x4_t z0 = vmulq_f32(fx0, exp_C2);
416 float32x4_t z1 = vmulq_f32(fx1, exp_C2);
417 aVal0 = vsubq_f32(vsubq_f32(aVal0, tmp0), z0);
418 aVal1 = vsubq_f32(vsubq_f32(aVal1, tmp1), z1);
419 z0 = vmulq_f32(aVal0, aVal0);
420 z1 = vmulq_f32(aVal1, aVal1);
// Degree-5 polynomial for e^x on the reduced interval, both streams.
423 float32x4_t y0 = vfmaq_f32(exp_p1, exp_p0, aVal0);
424 float32x4_t y1 = vfmaq_f32(exp_p1, exp_p0, aVal1);
425 y0 = vmulq_f32(y0, aVal0);
426 y1 = vmulq_f32(y1, aVal1);
427 y0 = vaddq_f32(y0, exp_p2);
428 y1 = vaddq_f32(y1, exp_p2);
429 y0 = vmulq_f32(y0, aVal0);
430 y1 = vmulq_f32(y1, aVal1);
431 y0 = vaddq_f32(y0, exp_p3);
432 y1 = vaddq_f32(y1, exp_p3);
433 y0 = vfmaq_f32(exp_p4, y0, aVal0);
434 y1 = vfmaq_f32(exp_p4, y1, aVal1);
435 y0 = vmulq_f32(y0, aVal0);
436 y1 = vmulq_f32(y1, aVal1);
437 y0 = vaddq_f32(y0, exp_p5);
438 y1 = vaddq_f32(y1, exp_p5);
439 y0 = vfmaq_f32(aVal0, y0, z0);
440 y1 = vfmaq_f32(aVal1, y1, z1);
441 y0 = vaddq_f32(y0, one);
442 y1 = vaddq_f32(y1, one);
// Rebuild 2^fx through the float exponent bits and scale the results.
445 emm0_0 = vcvtq_s32_f32(fx0);
446 emm0_1 = vcvtq_s32_f32(fx1);
447 emm0_0 = vaddq_s32(emm0_0, pi32_0x7f);
448 emm0_1 = vaddq_s32(emm0_1, pi32_0x7f);
449 emm0_0 = vshlq_n_s32(emm0_0, 23);
450 emm0_1 = vshlq_n_s32(emm0_1, 23);
451 float32x4_t pow2n0 = vreinterpretq_f32_s32(emm0_0);
452 float32x4_t pow2n1 = vreinterpretq_f32_s32(emm0_1);
454 float32x4_t bVal0 = vmulq_f32(y0, pow2n0);
455 float32x4_t bVal1 = vmulq_f32(y1, pow2n1);
456 vst1q_f32(bPtr, bVal0);
457 vst1q_f32(bPtr + 4, bVal1);
// NOTE(review): pointer advances and loop close are outside this fragment.
// Scalar tail for num_points % 8 leftover elements.
463 number = eighthPoints * 8;
464 for (; number < num_points; number++) {
465 *bPtr++ = expf(*aPtr++);
472#include <riscv_vector.h>
// RISC-V Vector (RVV) variant: same algorithm, strip-mined with vsetvl so
// each pass handles `vl` elements (LMUL=2, e32). NOTE(review): the return
// type and opening brace before/around the signature, and the loop/function
// closing braces after the final store, are not visible in this fragment.
475volk_32f_exp_32f_rvv(
float* bVector,
const float* aVector,
unsigned int num_points)
477 size_t vlmax = __riscv_vsetvlmax_e32m2();
// Constants. Note exp_C1/exp_C2 carry the OPPOSITE signs of the SSE/NEON
// variants because they are applied with vfmacc (v += fx*C) instead of a
// subtraction; the net effect is still v -= fx*ln(2).
479 const vfloat32m2_t exp_hi = __riscv_vfmv_v_f_f32m2(88.376259f, vlmax);
480 const vfloat32m2_t exp_lo = __riscv_vfmv_v_f_f32m2(-88.376259f, vlmax);
481 const vfloat32m2_t log2EF = __riscv_vfmv_v_f_f32m2(1.442695f, vlmax);
482 const vfloat32m2_t exp_C1 = __riscv_vfmv_v_f_f32m2(-0.6933594f, vlmax);
483 const vfloat32m2_t exp_C2 = __riscv_vfmv_v_f_f32m2(0.000212194f, vlmax);
484 const vfloat32m2_t cf1 = __riscv_vfmv_v_f_f32m2(1.0f, vlmax);
485 const vfloat32m2_t cf1o2 = __riscv_vfmv_v_f_f32m2(0.5f, vlmax);
// Degree-5 polynomial coefficients (same values as the other variants).
487 const vfloat32m2_t c0 = __riscv_vfmv_v_f_f32m2(1.9875691500e-4, vlmax);
488 const vfloat32m2_t c1 = __riscv_vfmv_v_f_f32m2(1.3981999507e-3, vlmax);
489 const vfloat32m2_t c2 = __riscv_vfmv_v_f_f32m2(8.3334519073e-3, vlmax);
490 const vfloat32m2_t c3 = __riscv_vfmv_v_f_f32m2(4.1665795894e-2, vlmax);
491 const vfloat32m2_t c4 = __riscv_vfmv_v_f_f32m2(1.6666665459e-1, vlmax);
492 const vfloat32m2_t c5 = __riscv_vfmv_v_f_f32m2(5.0000001201e-1, vlmax);
// Strip-mined main loop: vl elements per pass until n reaches 0.
494 size_t n = num_points;
495 for (
size_t vl; n > 0; n -= vl, aVector += vl, bVector += vl) {
496 vl = __riscv_vsetvl_e32m2(n);
497 vfloat32m2_t v = __riscv_vle32_v_f32m2(aVector, vl);
498 v = __riscv_vfmin(v, exp_hi, vl); // clamp to finite-result range
499 v = __riscv_vfmax(v, exp_lo, vl);
// fx = x*log2(e) + 0.5, floored below.
500 vfloat32m2_t fx = __riscv_vfmadd(v, log2EF, cf1o2, vl);
// floor(): round-toward-zero conversion, then a masked subtract of 1.0
// only in lanes where the truncated value exceeds fx (_mu = masked update).
502 vfloat32m2_t rtz = __riscv_vfcvt_f(__riscv_vfcvt_rtz_x(fx, vl), vl);
503 fx = __riscv_vfsub_mu(__riscv_vmfgt(rtz, fx, vl), rtz, rtz, cf1, vl);
// Range reduction: v += fx*exp_C1 + fx*exp_C2 == v - fx*ln(2).
504 v = __riscv_vfmacc(v, fx, exp_C1, vl);
505 v = __riscv_vfmacc(v, fx, exp_C2, vl);
506 vfloat32m2_t vv = __riscv_vfmul(v, v, vl); // vv = x^2
// Horner evaluation of the degree-5 polynomial, then *x^2 + x + 1.
// NOTE(review): the declaration/initialization of `y` (presumably
// `vfloat32m2_t y = c0;` given the chain starting at c1) is not visible
// in this fragment -- confirm against the full file.
509 y = __riscv_vfmadd(y, v, c1, vl);
510 y = __riscv_vfmadd(y, v, c2, vl);
511 y = __riscv_vfmadd(y, v, c3, vl);
512 y = __riscv_vfmadd(y, v, c4, vl);
513 y = __riscv_vfmadd(y, v, c5, vl);
514 y = __riscv_vfmadd(y, vv, v, vl);
515 y = __riscv_vfadd(y, cf1, vl);
// Rebuild 2^fx by placing (trunc(fx) + 127) into the float exponent bits.
517 vfloat32m2_t pow2n = __riscv_vreinterpret_f32m2(
518 __riscv_vsll(__riscv_vadd(__riscv_vfcvt_rtz_x(fx, vl), 0x7f, vl), 23, vl));
520 __riscv_vse32(bVector, __riscv_vfmul(y, pow2n, vl), vl);