Vector Optimized Library of Kernels 3.3.0
Architecture-tuned implementations of math kernels
Loading...
Searching...
No Matches
volk_32f_exp_32f.h
Go to the documentation of this file.
1/* -*- c++ -*- */
2/*
3 * Copyright 2015-2020 Free Software Foundation, Inc.
4 *
5 * This file is part of VOLK
6 *
7 * SPDX-License-Identifier: LGPL-3.0-or-later
8 */
9
10/* SIMD (SSE4) implementation of exp
11 Inspired by Intel Approximate Math library, and based on the
12 corresponding algorithms of the cephes math library
13*/
14
15/* Copyright (C) 2007 Julien Pommier
16
17 This software is provided 'as-is', without any express or implied
18 warranty. In no event will the authors be held liable for any damages
19 arising from the use of this software.
20
21 Permission is granted to anyone to use this software for any purpose,
22 including commercial applications, and to alter it and redistribute it
23 freely, subject to the following restrictions:
24
25 1. The origin of this software must not be misrepresented; you must not
26 claim that you wrote the original software. If you use this software
27 in a product, an acknowledgment in the product documentation would be
28 appreciated but is not required.
29 2. Altered source versions must be plainly marked as such, and must not be
30 misrepresented as being the original software.
31 3. This notice may not be removed or altered from any source distribution.
32
33 (this is the zlib license)
34*/
35
82#include <inttypes.h>
83#include <math.h>
84#include <stdio.h>
85
86#ifndef INCLUDED_volk_32f_exp_32f_a_H
87#define INCLUDED_volk_32f_exp_32f_a_H
88
89#ifdef LV_HAVE_SSE2
90#include <emmintrin.h>
91
92static inline void
93volk_32f_exp_32f_a_sse2(float* bVector, const float* aVector, unsigned int num_points)
94{
95 float* bPtr = bVector;
96 const float* aPtr = aVector;
97
98 unsigned int number = 0;
99 unsigned int quarterPoints = num_points / 4;
100
101 // Declare variables and constants
102 __m128 aVal, bVal, tmp, fx, mask, pow2n, z, y;
103 __m128 one, exp_hi, exp_lo, log2EF, half, exp_C1, exp_C2;
104 __m128 exp_p0, exp_p1, exp_p2, exp_p3, exp_p4, exp_p5;
105 __m128i emm0, pi32_0x7f;
106
107 one = _mm_set1_ps(1.0);
108 exp_hi = _mm_set1_ps(88.3762626647949);
109 exp_lo = _mm_set1_ps(-88.3762626647949);
110 log2EF = _mm_set1_ps(1.44269504088896341);
111 half = _mm_set1_ps(0.5);
112 exp_C1 = _mm_set1_ps(0.693359375);
113 exp_C2 = _mm_set1_ps(-2.12194440e-4);
114 pi32_0x7f = _mm_set1_epi32(0x7f);
115
116 exp_p0 = _mm_set1_ps(1.9875691500e-4);
117 exp_p1 = _mm_set1_ps(1.3981999507e-3);
118 exp_p2 = _mm_set1_ps(8.3334519073e-3);
119 exp_p3 = _mm_set1_ps(4.1665795894e-2);
120 exp_p4 = _mm_set1_ps(1.6666665459e-1);
121 exp_p5 = _mm_set1_ps(5.0000001201e-1);
122
123 for (; number < quarterPoints; number++) {
124 aVal = _mm_load_ps(aPtr);
125 tmp = _mm_setzero_ps();
126
127 aVal = _mm_max_ps(_mm_min_ps(aVal, exp_hi), exp_lo);
128
129 /* express exp(x) as exp(g + n*log(2)) */
130 fx = _mm_add_ps(_mm_mul_ps(aVal, log2EF), half);
131
132 emm0 = _mm_cvttps_epi32(fx);
133 tmp = _mm_cvtepi32_ps(emm0);
134
135 mask = _mm_and_ps(_mm_cmpgt_ps(tmp, fx), one);
136 fx = _mm_sub_ps(tmp, mask);
137
138 tmp = _mm_mul_ps(fx, exp_C1);
139 z = _mm_mul_ps(fx, exp_C2);
140 aVal = _mm_sub_ps(_mm_sub_ps(aVal, tmp), z);
141 z = _mm_mul_ps(aVal, aVal);
142
143 y = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(exp_p0, aVal), exp_p1), aVal);
144 y = _mm_add_ps(_mm_mul_ps(_mm_add_ps(y, exp_p2), aVal), exp_p3);
145 y = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(y, aVal), exp_p4), aVal);
146 y = _mm_add_ps(_mm_mul_ps(_mm_add_ps(y, exp_p5), z), aVal);
147 y = _mm_add_ps(y, one);
148
149 emm0 = _mm_slli_epi32(_mm_add_epi32(_mm_cvttps_epi32(fx), pi32_0x7f), 23);
150
151 pow2n = _mm_castsi128_ps(emm0);
152 bVal = _mm_mul_ps(y, pow2n);
153
154 _mm_store_ps(bPtr, bVal);
155 aPtr += 4;
156 bPtr += 4;
157 }
158
159 number = quarterPoints * 4;
160 for (; number < num_points; number++) {
161 *bPtr++ = expf(*aPtr++);
162 }
163}
164
165#endif /* LV_HAVE_SSE2 for aligned */
166
167
168#endif /* INCLUDED_volk_32f_exp_32f_a_H */
169
170#ifndef INCLUDED_volk_32f_exp_32f_u_H
171#define INCLUDED_volk_32f_exp_32f_u_H
172
173#ifdef LV_HAVE_SSE2
174#include <emmintrin.h>
175
176static inline void
177volk_32f_exp_32f_u_sse2(float* bVector, const float* aVector, unsigned int num_points)
178{
179 float* bPtr = bVector;
180 const float* aPtr = aVector;
181
182 unsigned int number = 0;
183 unsigned int quarterPoints = num_points / 4;
184
185 // Declare variables and constants
186 __m128 aVal, bVal, tmp, fx, mask, pow2n, z, y;
187 __m128 one, exp_hi, exp_lo, log2EF, half, exp_C1, exp_C2;
188 __m128 exp_p0, exp_p1, exp_p2, exp_p3, exp_p4, exp_p5;
189 __m128i emm0, pi32_0x7f;
190
191 one = _mm_set1_ps(1.0);
192 exp_hi = _mm_set1_ps(88.3762626647949);
193 exp_lo = _mm_set1_ps(-88.3762626647949);
194 log2EF = _mm_set1_ps(1.44269504088896341);
195 half = _mm_set1_ps(0.5);
196 exp_C1 = _mm_set1_ps(0.693359375);
197 exp_C2 = _mm_set1_ps(-2.12194440e-4);
198 pi32_0x7f = _mm_set1_epi32(0x7f);
199
200 exp_p0 = _mm_set1_ps(1.9875691500e-4);
201 exp_p1 = _mm_set1_ps(1.3981999507e-3);
202 exp_p2 = _mm_set1_ps(8.3334519073e-3);
203 exp_p3 = _mm_set1_ps(4.1665795894e-2);
204 exp_p4 = _mm_set1_ps(1.6666665459e-1);
205 exp_p5 = _mm_set1_ps(5.0000001201e-1);
206
207
208 for (; number < quarterPoints; number++) {
209 aVal = _mm_loadu_ps(aPtr);
210 tmp = _mm_setzero_ps();
211
212 aVal = _mm_max_ps(_mm_min_ps(aVal, exp_hi), exp_lo);
213
214 /* express exp(x) as exp(g + n*log(2)) */
215 fx = _mm_add_ps(_mm_mul_ps(aVal, log2EF), half);
216
217 emm0 = _mm_cvttps_epi32(fx);
218 tmp = _mm_cvtepi32_ps(emm0);
219
220 mask = _mm_and_ps(_mm_cmpgt_ps(tmp, fx), one);
221 fx = _mm_sub_ps(tmp, mask);
222
223 tmp = _mm_mul_ps(fx, exp_C1);
224 z = _mm_mul_ps(fx, exp_C2);
225 aVal = _mm_sub_ps(_mm_sub_ps(aVal, tmp), z);
226 z = _mm_mul_ps(aVal, aVal);
227
228 y = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(exp_p0, aVal), exp_p1), aVal);
229 y = _mm_add_ps(_mm_mul_ps(_mm_add_ps(y, exp_p2), aVal), exp_p3);
230 y = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(y, aVal), exp_p4), aVal);
231 y = _mm_add_ps(_mm_mul_ps(_mm_add_ps(y, exp_p5), z), aVal);
232 y = _mm_add_ps(y, one);
233
234 emm0 = _mm_slli_epi32(_mm_add_epi32(_mm_cvttps_epi32(fx), pi32_0x7f), 23);
235
236 pow2n = _mm_castsi128_ps(emm0);
237 bVal = _mm_mul_ps(y, pow2n);
238
239 _mm_storeu_ps(bPtr, bVal);
240 aPtr += 4;
241 bPtr += 4;
242 }
243
244 number = quarterPoints * 4;
245 for (; number < num_points; number++) {
246 *bPtr++ = expf(*aPtr++);
247 }
248}
249
250#endif /* LV_HAVE_SSE2 for unaligned */
251
252
253#ifdef LV_HAVE_GENERIC
254
255static inline void
256volk_32f_exp_32f_generic(float* bVector, const float* aVector, unsigned int num_points)
257{
258 float* bPtr = bVector;
259 const float* aPtr = aVector;
260 unsigned int number = 0;
261
262 for (number = 0; number < num_points; number++) {
263 *bPtr++ = expf(*aPtr++);
264 }
265}
266
267#endif /* LV_HAVE_GENERIC */
268
269#ifdef LV_HAVE_NEON
270#include <arm_neon.h>
271
/*!
 * \brief ARMv7 NEON kernel: bVector[i] = exp(aVector[i]).
 *
 * Same cephes-derived algorithm as the SSE2 variants: clamp the input to the
 * finite range of expf, express exp(x) = 2^n * exp(g) with
 * n = floor(x*log2(e) + 0.5), approximate exp(g) with a degree-5 polynomial,
 * and construct 2^n directly in the float exponent bits. Processes four
 * floats per iteration with a scalar expf tail.
 *
 * \param bVector    output buffer; receives exp of each input
 * \param aVector    input buffer
 * \param num_points number of float elements to process
 */
static inline void
volk_32f_exp_32f_neon(float* bVector, const float* aVector, unsigned int num_points)
{
    float* bPtr = bVector;
    const float* aPtr = aVector;

    unsigned int number = 0;
    unsigned int quarterPoints = num_points / 4;

    // Constants
    float32x4_t one = vdupq_n_f32(1.0f);
    float32x4_t exp_hi = vdupq_n_f32(88.3762626647949f);  // largest finite-exp input
    float32x4_t exp_lo = vdupq_n_f32(-88.3762626647949f); // smallest non-zero-exp input
    float32x4_t log2EF = vdupq_n_f32(1.44269504088896341f); // log2(e)
    float32x4_t half = vdupq_n_f32(0.5f);
    // ln(2) split into coarse part + small correction for accurate reduction
    float32x4_t exp_C1 = vdupq_n_f32(0.693359375f);
    float32x4_t exp_C2 = vdupq_n_f32(-2.12194440e-4f);
    int32x4_t pi32_0x7f = vdupq_n_s32(0x7f); // IEEE-754 float exponent bias

    // Degree-5 polynomial coefficients for exp(g) on the reduced range
    float32x4_t exp_p0 = vdupq_n_f32(1.9875691500e-4f);
    float32x4_t exp_p1 = vdupq_n_f32(1.3981999507e-3f);
    float32x4_t exp_p2 = vdupq_n_f32(8.3334519073e-3f);
    float32x4_t exp_p3 = vdupq_n_f32(4.1665795894e-2f);
    float32x4_t exp_p4 = vdupq_n_f32(1.6666665459e-1f);
    float32x4_t exp_p5 = vdupq_n_f32(5.0000001201e-1f);

    for (; number < quarterPoints; number++) {
        float32x4_t aVal = vld1q_f32(aPtr);

        // Clamp to valid range
        aVal = vmaxq_f32(vminq_f32(aVal, exp_hi), exp_lo);

        // express exp(x) as exp(g + n*log(2)); fx = x*log2(e) + 0.5
        float32x4_t fx = vmlaq_f32(half, aVal, log2EF);

        // Floor function: truncate toward zero first
        int32x4_t emm0 = vcvtq_s32_f32(fx);
        float32x4_t tmp = vcvtq_f32_s32(emm0);

        // If tmp > fx, subtract 1 (floor correction for negative non-integers)
        uint32x4_t mask = vcgtq_f32(tmp, fx);
        float32x4_t mask_one = vbslq_f32(mask, one, vdupq_n_f32(0.0f));
        fx = vsubq_f32(tmp, mask_one);

        // Reduce x: aVal <- x - n*ln(2) using the split constants
        tmp = vmulq_f32(fx, exp_C1);
        float32x4_t z = vmulq_f32(fx, exp_C2);
        aVal = vsubq_f32(vsubq_f32(aVal, tmp), z);
        z = vmulq_f32(aVal, aVal);

        // Polynomial approximation: exp(g) ~= 1 + g + g^2 * P(g)
        float32x4_t y = vmlaq_f32(exp_p1, exp_p0, aVal);
        y = vmulq_f32(y, aVal);
        y = vaddq_f32(y, exp_p2);
        y = vmulq_f32(y, aVal);
        y = vaddq_f32(y, exp_p3);
        y = vmlaq_f32(exp_p4, y, aVal);
        y = vmulq_f32(y, aVal);
        y = vaddq_f32(y, exp_p5);
        y = vmlaq_f32(aVal, y, z);
        y = vaddq_f32(y, one);

        // Build 2^n: place (n + bias) into the float exponent field
        emm0 = vcvtq_s32_f32(fx);
        emm0 = vaddq_s32(emm0, pi32_0x7f);
        emm0 = vshlq_n_s32(emm0, 23);
        float32x4_t pow2n = vreinterpretq_f32_s32(emm0);

        float32x4_t bVal = vmulq_f32(y, pow2n);
        vst1q_f32(bPtr, bVal);

        aPtr += 4;
        bPtr += 4;
    }

    // Scalar tail for the remaining (num_points % 4) elements
    number = quarterPoints * 4;
    for (; number < num_points; number++) {
        *bPtr++ = expf(*aPtr++);
    }
}
352
353#endif /* LV_HAVE_NEON */
354
355#ifdef LV_HAVE_NEONV8
356#include <arm_neon.h>
357
/*!
 * \brief AArch64 (ARMv8) NEON kernel: bVector[i] = exp(aVector[i]).
 *
 * Same cephes-derived algorithm as the other variants, but processes two
 * independent 4-float streams (8 floats) per iteration using fused
 * multiply-add (vfmaq_f32) to exploit the wider ARMv8 pipelines, with a
 * scalar expf tail for the remainder.
 *
 * NOTE(review): __VOLK_PREFETCH is a macro supplied elsewhere in VOLK
 * (presumably volk_common.h) — not visible in this file.
 *
 * \param bVector    output buffer; receives exp of each input
 * \param aVector    input buffer
 * \param num_points number of float elements to process
 */
static inline void
volk_32f_exp_32f_neonv8(float* bVector, const float* aVector, unsigned int num_points)
{
    float* bPtr = bVector;
    const float* aPtr = aVector;

    unsigned int number = 0;
    unsigned int eighthPoints = num_points / 8;

    // Constants
    float32x4_t one = vdupq_n_f32(1.0f);
    float32x4_t exp_hi = vdupq_n_f32(88.3762626647949f);  // largest finite-exp input
    float32x4_t exp_lo = vdupq_n_f32(-88.3762626647949f); // smallest non-zero-exp input
    float32x4_t log2EF = vdupq_n_f32(1.44269504088896341f); // log2(e)
    float32x4_t half = vdupq_n_f32(0.5f);
    // ln(2) split into coarse part + small correction for accurate reduction
    float32x4_t exp_C1 = vdupq_n_f32(0.693359375f);
    float32x4_t exp_C2 = vdupq_n_f32(-2.12194440e-4f);
    int32x4_t pi32_0x7f = vdupq_n_s32(0x7f); // IEEE-754 float exponent bias

    // Degree-5 polynomial coefficients for exp(g) on the reduced range
    float32x4_t exp_p0 = vdupq_n_f32(1.9875691500e-4f);
    float32x4_t exp_p1 = vdupq_n_f32(1.3981999507e-3f);
    float32x4_t exp_p2 = vdupq_n_f32(8.3334519073e-3f);
    float32x4_t exp_p3 = vdupq_n_f32(4.1665795894e-2f);
    float32x4_t exp_p4 = vdupq_n_f32(1.6666665459e-1f);
    float32x4_t exp_p5 = vdupq_n_f32(5.0000001201e-1f);

    for (; number < eighthPoints; number++) {
        __VOLK_PREFETCH(aPtr + 16);

        float32x4_t aVal0 = vld1q_f32(aPtr);
        float32x4_t aVal1 = vld1q_f32(aPtr + 4);

        // Clamp to valid range
        aVal0 = vmaxq_f32(vminq_f32(aVal0, exp_hi), exp_lo);
        aVal1 = vmaxq_f32(vminq_f32(aVal1, exp_hi), exp_lo);

        // express exp(x) as exp(g + n*log(2)); fx = x*log2(e) + 0.5
        float32x4_t fx0 = vfmaq_f32(half, aVal0, log2EF);
        float32x4_t fx1 = vfmaq_f32(half, aVal1, log2EF);

        // Floor function: truncate toward zero first
        int32x4_t emm0_0 = vcvtq_s32_f32(fx0);
        int32x4_t emm0_1 = vcvtq_s32_f32(fx1);
        float32x4_t tmp0 = vcvtq_f32_s32(emm0_0);
        float32x4_t tmp1 = vcvtq_f32_s32(emm0_1);

        // If tmp > fx, subtract 1 (floor correction for negative non-integers)
        uint32x4_t mask0 = vcgtq_f32(tmp0, fx0);
        uint32x4_t mask1 = vcgtq_f32(tmp1, fx1);
        float32x4_t mask_one0 = vbslq_f32(mask0, one, vdupq_n_f32(0.0f));
        float32x4_t mask_one1 = vbslq_f32(mask1, one, vdupq_n_f32(0.0f));
        fx0 = vsubq_f32(tmp0, mask_one0);
        fx1 = vsubq_f32(tmp1, mask_one1);

        // Reduce x: aVal <- x - n*ln(2) using the split constants
        tmp0 = vmulq_f32(fx0, exp_C1);
        tmp1 = vmulq_f32(fx1, exp_C1);
        float32x4_t z0 = vmulq_f32(fx0, exp_C2);
        float32x4_t z1 = vmulq_f32(fx1, exp_C2);
        aVal0 = vsubq_f32(vsubq_f32(aVal0, tmp0), z0);
        aVal1 = vsubq_f32(vsubq_f32(aVal1, tmp1), z1);
        z0 = vmulq_f32(aVal0, aVal0);
        z1 = vmulq_f32(aVal1, aVal1);

        // Polynomial approximation using FMA: exp(g) ~= 1 + g + g^2 * P(g)
        float32x4_t y0 = vfmaq_f32(exp_p1, exp_p0, aVal0);
        float32x4_t y1 = vfmaq_f32(exp_p1, exp_p0, aVal1);
        y0 = vmulq_f32(y0, aVal0);
        y1 = vmulq_f32(y1, aVal1);
        y0 = vaddq_f32(y0, exp_p2);
        y1 = vaddq_f32(y1, exp_p2);
        y0 = vmulq_f32(y0, aVal0);
        y1 = vmulq_f32(y1, aVal1);
        y0 = vaddq_f32(y0, exp_p3);
        y1 = vaddq_f32(y1, exp_p3);
        y0 = vfmaq_f32(exp_p4, y0, aVal0);
        y1 = vfmaq_f32(exp_p4, y1, aVal1);
        y0 = vmulq_f32(y0, aVal0);
        y1 = vmulq_f32(y1, aVal1);
        y0 = vaddq_f32(y0, exp_p5);
        y1 = vaddq_f32(y1, exp_p5);
        y0 = vfmaq_f32(aVal0, y0, z0);
        y1 = vfmaq_f32(aVal1, y1, z1);
        y0 = vaddq_f32(y0, one);
        y1 = vaddq_f32(y1, one);

        // Build 2^n: place (n + bias) into the float exponent field
        emm0_0 = vcvtq_s32_f32(fx0);
        emm0_1 = vcvtq_s32_f32(fx1);
        emm0_0 = vaddq_s32(emm0_0, pi32_0x7f);
        emm0_1 = vaddq_s32(emm0_1, pi32_0x7f);
        emm0_0 = vshlq_n_s32(emm0_0, 23);
        emm0_1 = vshlq_n_s32(emm0_1, 23);
        float32x4_t pow2n0 = vreinterpretq_f32_s32(emm0_0);
        float32x4_t pow2n1 = vreinterpretq_f32_s32(emm0_1);

        float32x4_t bVal0 = vmulq_f32(y0, pow2n0);
        float32x4_t bVal1 = vmulq_f32(y1, pow2n1);
        vst1q_f32(bPtr, bVal0);
        vst1q_f32(bPtr + 4, bVal1);

        aPtr += 8;
        bPtr += 8;
    }

    // Scalar tail for the remaining (num_points % 8) elements
    number = eighthPoints * 8;
    for (; number < num_points; number++) {
        *bPtr++ = expf(*aPtr++);
    }
}
468
469#endif /* LV_HAVE_NEONV8 */
470
471#ifdef LV_HAVE_RVV
472#include <riscv_vector.h>
473
/*!
 * \brief RISC-V Vector (RVV) kernel: bVector[i] = exp(aVector[i]).
 *
 * Same cephes-derived algorithm as the other variants, using LMUL=2
 * stripmining: each pass sets vl from the remaining count, so no scalar
 * tail loop is needed.
 *
 * NOTE: exp_C1/exp_C2 carry the OPPOSITE signs of the SSE2/NEON variants
 * (-0.6933594 / +0.000212194) because the reduction here uses vfmacc
 * (v += fx * C) instead of an explicit subtraction — the net arithmetic,
 * v - fx*ln2_hi - fx*ln2_lo, is the same.
 *
 * \param bVector    output buffer; receives exp of each input
 * \param aVector    input buffer
 * \param num_points number of float elements to process
 */
static inline void
volk_32f_exp_32f_rvv(float* bVector, const float* aVector, unsigned int num_points)
{
    size_t vlmax = __riscv_vsetvlmax_e32m2();

    // Clamp limits (finite/non-zero expf range), log2(e), split ln(2), 1, 0.5
    const vfloat32m2_t exp_hi = __riscv_vfmv_v_f_f32m2(88.376259f, vlmax);
    const vfloat32m2_t exp_lo = __riscv_vfmv_v_f_f32m2(-88.376259f, vlmax);
    const vfloat32m2_t log2EF = __riscv_vfmv_v_f_f32m2(1.442695f, vlmax);
    const vfloat32m2_t exp_C1 = __riscv_vfmv_v_f_f32m2(-0.6933594f, vlmax);
    const vfloat32m2_t exp_C2 = __riscv_vfmv_v_f_f32m2(0.000212194f, vlmax);
    const vfloat32m2_t cf1 = __riscv_vfmv_v_f_f32m2(1.0f, vlmax);
    const vfloat32m2_t cf1o2 = __riscv_vfmv_v_f_f32m2(0.5f, vlmax);

    // Degree-5 polynomial coefficients for exp(g) on the reduced range
    const vfloat32m2_t c0 = __riscv_vfmv_v_f_f32m2(1.9875691500e-4, vlmax);
    const vfloat32m2_t c1 = __riscv_vfmv_v_f_f32m2(1.3981999507e-3, vlmax);
    const vfloat32m2_t c2 = __riscv_vfmv_v_f_f32m2(8.3334519073e-3, vlmax);
    const vfloat32m2_t c3 = __riscv_vfmv_v_f_f32m2(4.1665795894e-2, vlmax);
    const vfloat32m2_t c4 = __riscv_vfmv_v_f_f32m2(1.6666665459e-1, vlmax);
    const vfloat32m2_t c5 = __riscv_vfmv_v_f_f32m2(5.0000001201e-1, vlmax);

    size_t n = num_points;
    for (size_t vl; n > 0; n -= vl, aVector += vl, bVector += vl) {
        vl = __riscv_vsetvl_e32m2(n);
        vfloat32m2_t v = __riscv_vle32_v_f32m2(aVector, vl);
        // Clamp to valid range
        v = __riscv_vfmin(v, exp_hi, vl);
        v = __riscv_vfmax(v, exp_lo, vl);
        // fx = x*log2(e) + 0.5, then floored below to get n
        vfloat32m2_t fx = __riscv_vfmadd(v, log2EF, cf1o2, vl);

        // floor(fx): round toward zero, then masked subtract of 1 where
        // the truncated value ended up above fx (negative non-integers)
        vfloat32m2_t rtz = __riscv_vfcvt_f(__riscv_vfcvt_rtz_x(fx, vl), vl);
        fx = __riscv_vfsub_mu(__riscv_vmfgt(rtz, fx, vl), rtz, rtz, cf1, vl);
        // Argument reduction: v <- x - n*ln(2) (via v += fx*C1 + fx*C2)
        v = __riscv_vfmacc(v, fx, exp_C1, vl);
        v = __riscv_vfmacc(v, fx, exp_C2, vl);
        vfloat32m2_t vv = __riscv_vfmul(v, v, vl);

        // Horner evaluation of P(g); exp(g) ~= 1 + g + g^2 * P(g)
        vfloat32m2_t y = c0;
        y = __riscv_vfmadd(y, v, c1, vl);
        y = __riscv_vfmadd(y, v, c2, vl);
        y = __riscv_vfmadd(y, v, c3, vl);
        y = __riscv_vfmadd(y, v, c4, vl);
        y = __riscv_vfmadd(y, v, c5, vl);
        y = __riscv_vfmadd(y, vv, v, vl);
        y = __riscv_vfadd(y, cf1, vl);

        // Build 2^n: place (n + bias 0x7f) into the float exponent field
        vfloat32m2_t pow2n = __riscv_vreinterpret_f32m2(
            __riscv_vsll(__riscv_vadd(__riscv_vfcvt_rtz_x(fx, vl), 0x7f, vl), 23, vl));

        __riscv_vse32(bVector, __riscv_vfmul(y, pow2n, vl), vl);
    }
}
523#endif /*LV_HAVE_RVV*/
524
525#endif /* INCLUDED_volk_32f_exp_32f_u_H */