Vector Optimized Library of Kernels 3.3.0
Architecture-tuned implementations of math kernels
Loading...
Searching...
No Matches
volk_32f_expfast_32f.h
Go to the documentation of this file.
1/* -*- c++ -*- */
2/*
3 * Copyright 2014 Free Software Foundation, Inc.
4 *
5 * This file is part of VOLK
6 *
7 * SPDX-License-Identifier: LGPL-3.0-or-later
8 */
9
51
52#include <inttypes.h>
53#include <math.h>
54#include <stdio.h>
55
56#define Mln2 0.6931471805f
57#define A 8388608.0f
58#define B 1065353216.0f
59#define C 60801.0f
60
61
62#ifndef INCLUDED_volk_32f_expfast_32f_a_H
63#define INCLUDED_volk_32f_expfast_32f_a_H
64
65#if LV_HAVE_AVX && LV_HAVE_FMA
66
67#include <immintrin.h>
68
/*!
 * Approximate e^x for eight floats per iteration (aligned buffers, AVX + FMA).
 *
 * Uses the IEEE-754 bit trick: (A/ln2)*x + (B - C) is computed, rounded to a
 * 32-bit integer, and that integer's bit pattern is reinterpreted as a float.
 * A = 2^23 and B = 127*2^23 place the result directly into the float's
 * exponent/mantissa fields; C is a correction constant (see the Mln2/A/B/C
 * macros defined at the top of this file).
 *
 * \param bVector    output buffer; NOTE(review): aligned variant — presumably
 *                   must be 32-byte aligned (uses _mm256_store_ps)
 * \param aVector    input buffer; same alignment requirement (_mm256_load_ps)
 * \param num_points number of floats to process
 */
static inline void volk_32f_expfast_32f_a_avx_fma(float* bVector,
                                                  const float* aVector,
                                                  unsigned int num_points)
{
    float* bPtr = bVector;
    const float* aPtr = aVector;

    unsigned int number = 0;
    const unsigned int eighthPoints = num_points / 8;

    __m256 aVal, bVal, a, b;
    __m256i exp;
    a = _mm256_set1_ps(A / Mln2); /* scale: 2^23 / ln(2) */
    b = _mm256_set1_ps(B - C);    /* exponent bias minus correction term */

    for (; number < eighthPoints; number++) {
        aVal = _mm256_load_ps(aPtr);
        /* a*x + b in one fused multiply-add, then round to int32 */
        exp = _mm256_cvtps_epi32(_mm256_fmadd_ps(a, aVal, b));
        /* reinterpret the integer bits as a float — this IS the approximation */
        bVal = _mm256_castsi256_ps(exp);

        _mm256_store_ps(bPtr, bVal);
        aPtr += 8;
        bPtr += 8;
    }

    /* Tail: remaining (num_points % 8) elements use the exact libm expf(). */
    number = eighthPoints * 8;
    for (; number < num_points; number++) {
        *bPtr++ = expf(*aPtr++);
    }
}
99
100#endif /* LV_HAVE_AVX && LV_HAVE_FMA for aligned */
101
102#ifdef LV_HAVE_AVX
103
104#include <immintrin.h>
105
106static inline void
107volk_32f_expfast_32f_a_avx(float* bVector, const float* aVector, unsigned int num_points)
108{
109 float* bPtr = bVector;
110 const float* aPtr = aVector;
111
112 unsigned int number = 0;
113 const unsigned int eighthPoints = num_points / 8;
114
115 __m256 aVal, bVal, a, b;
116 __m256i exp;
117 a = _mm256_set1_ps(A / Mln2);
118 b = _mm256_set1_ps(B - C);
119
120 for (; number < eighthPoints; number++) {
121 aVal = _mm256_load_ps(aPtr);
122 exp = _mm256_cvtps_epi32(_mm256_add_ps(_mm256_mul_ps(a, aVal), b));
123 bVal = _mm256_castsi256_ps(exp);
124
125 _mm256_store_ps(bPtr, bVal);
126 aPtr += 8;
127 bPtr += 8;
128 }
129
130 number = eighthPoints * 8;
131 for (; number < num_points; number++) {
132 *bPtr++ = expf(*aPtr++);
133 }
134}
135
136#endif /* LV_HAVE_AVX for aligned */
137
138#ifdef LV_HAVE_SSE4_1
139#include <smmintrin.h>
140
141static inline void volk_32f_expfast_32f_a_sse4_1(float* bVector,
142 const float* aVector,
143 unsigned int num_points)
144{
145 float* bPtr = bVector;
146 const float* aPtr = aVector;
147
148 unsigned int number = 0;
149 const unsigned int quarterPoints = num_points / 4;
150
151 __m128 aVal, bVal, a, b;
152 __m128i exp;
153 a = _mm_set1_ps(A / Mln2);
154 b = _mm_set1_ps(B - C);
155
156 for (; number < quarterPoints; number++) {
157 aVal = _mm_load_ps(aPtr);
158 exp = _mm_cvtps_epi32(_mm_add_ps(_mm_mul_ps(a, aVal), b));
159 bVal = _mm_castsi128_ps(exp);
160
161 _mm_store_ps(bPtr, bVal);
162 aPtr += 4;
163 bPtr += 4;
164 }
165
166 number = quarterPoints * 4;
167 for (; number < num_points; number++) {
168 *bPtr++ = expf(*aPtr++);
169 }
170}
171
172#endif /* LV_HAVE_SSE4_1 for aligned */
173
174#endif /* INCLUDED_volk_32f_expfast_32f_a_H */
175
176#ifndef INCLUDED_volk_32f_expfast_32f_u_H
177#define INCLUDED_volk_32f_expfast_32f_u_H
178
179#if LV_HAVE_AVX && LV_HAVE_FMA
180#include <immintrin.h>
181
182static inline void volk_32f_expfast_32f_u_avx_fma(float* bVector,
183 const float* aVector,
184 unsigned int num_points)
185{
186 float* bPtr = bVector;
187 const float* aPtr = aVector;
188
189 unsigned int number = 0;
190 const unsigned int eighthPoints = num_points / 8;
191
192 __m256 aVal, bVal, a, b;
193 __m256i exp;
194 a = _mm256_set1_ps(A / Mln2);
195 b = _mm256_set1_ps(B - C);
196
197 for (; number < eighthPoints; number++) {
198 aVal = _mm256_loadu_ps(aPtr);
199 exp = _mm256_cvtps_epi32(_mm256_fmadd_ps(a, aVal, b));
200 bVal = _mm256_castsi256_ps(exp);
201
202 _mm256_storeu_ps(bPtr, bVal);
203 aPtr += 8;
204 bPtr += 8;
205 }
206
207 number = eighthPoints * 8;
208 for (; number < num_points; number++) {
209 *bPtr++ = expf(*aPtr++);
210 }
211}
212
213#endif /* LV_HAVE_AVX && LV_HAVE_FMA for unaligned */
214
215#ifdef LV_HAVE_AVX
216#include <immintrin.h>
217
/*!
 * Approximate e^x for eight floats per iteration (unaligned buffers, AVX).
 *
 * IEEE-754 bit trick: (A/ln2)*x + (B - C) is computed, rounded to a 32-bit
 * integer, and the integer's bit pattern is reinterpreted as a float
 * (Mln2/A/B/C macros are defined at the top of this file).
 *
 * \param bVector    output buffer (no alignment requirement: _mm256_storeu_ps)
 * \param aVector    input buffer (no alignment requirement: _mm256_loadu_ps)
 * \param num_points number of floats to process
 */
static inline void
volk_32f_expfast_32f_u_avx(float* bVector, const float* aVector, unsigned int num_points)
{
    float* bPtr = bVector;
    const float* aPtr = aVector;

    unsigned int number = 0;
    const unsigned int eighthPoints = num_points / 8;

    __m256 aVal, bVal, a, b;
    __m256i exp;
    a = _mm256_set1_ps(A / Mln2); /* scale: 2^23 / ln(2) */
    b = _mm256_set1_ps(B - C);    /* exponent bias minus correction term */

    for (; number < eighthPoints; number++) {
        aVal = _mm256_loadu_ps(aPtr);
        /* a*x + b, rounded to int32 (separate mul + add; no FMA here) */
        exp = _mm256_cvtps_epi32(_mm256_add_ps(_mm256_mul_ps(a, aVal), b));
        /* reinterpret the integer bits as a float — the approximation itself */
        bVal = _mm256_castsi256_ps(exp);

        _mm256_storeu_ps(bPtr, bVal);
        aPtr += 8;
        bPtr += 8;
    }

    /* Tail: remaining (num_points % 8) elements use the exact libm expf(). */
    number = eighthPoints * 8;
    for (; number < num_points; number++) {
        *bPtr++ = expf(*aPtr++);
    }
}
247
248#endif /* LV_HAVE_AVX for unaligned */
249
250
251#ifdef LV_HAVE_SSE4_1
252#include <smmintrin.h>
253
254static inline void volk_32f_expfast_32f_u_sse4_1(float* bVector,
255 const float* aVector,
256 unsigned int num_points)
257{
258 float* bPtr = bVector;
259 const float* aPtr = aVector;
260
261 unsigned int number = 0;
262 const unsigned int quarterPoints = num_points / 4;
263
264 __m128 aVal, bVal, a, b;
265 __m128i exp;
266 a = _mm_set1_ps(A / Mln2);
267 b = _mm_set1_ps(B - C);
268
269 for (; number < quarterPoints; number++) {
270 aVal = _mm_loadu_ps(aPtr);
271 exp = _mm_cvtps_epi32(_mm_add_ps(_mm_mul_ps(a, aVal), b));
272 bVal = _mm_castsi128_ps(exp);
273
274 _mm_storeu_ps(bPtr, bVal);
275 aPtr += 4;
276 bPtr += 4;
277 }
278
279 number = quarterPoints * 4;
280 for (; number < num_points; number++) {
281 *bPtr++ = expf(*aPtr++);
282 }
283}
284
285#endif /* LV_HAVE_SSE4_1 for unaligned */
286
287
288#ifdef LV_HAVE_GENERIC
289
290static inline void volk_32f_expfast_32f_generic(float* bVector,
291 const float* aVector,
292 unsigned int num_points)
293{
294 float* bPtr = bVector;
295 const float* aPtr = aVector;
296 unsigned int number = 0;
297
298 for (number = 0; number < num_points; number++) {
299 *bPtr++ = expf(*aPtr++);
300 }
301}
302#endif /* LV_HAVE_GENERIC */
303
304#ifdef LV_HAVE_NEON
305#include <arm_neon.h>
306
/*!
 * Approximate e^x for four floats per iteration (ARM NEON).
 *
 * IEEE-754 bit trick: (A/ln2)*x + (B - C) is computed via a vector
 * multiply-accumulate, rounded to int32, and the integer bit pattern is
 * reinterpreted as a float (Mln2/A/B/C macros are defined file-top).
 *
 * \param bVector    output buffer
 * \param aVector    input buffer
 * \param num_points number of floats to process
 */
static inline void
volk_32f_expfast_32f_neon(float* bVector, const float* aVector, unsigned int num_points)
{
    float* bPtr = bVector;
    const float* aPtr = aVector;

    unsigned int number = 0;
    const unsigned int quarterPoints = num_points / 4;

    float32x4_t a = vdupq_n_f32(A / Mln2); /* scale: 2^23 / ln(2) */
    float32x4_t b = vdupq_n_f32(B - C);    /* exponent bias minus correction */

    for (; number < quarterPoints; number++) {
        float32x4_t aVal = vld1q_f32(aPtr);
        /* vmlaq_f32(b, a, aVal) = b + a * aVal, then truncate-convert to int32
           NOTE(review): vcvtq_s32_f32 truncates toward zero, unlike the x86
           paths' round-to-nearest _mm_cvtps_epi32 — results may differ by 1 ulp
           of the trick's integer step; confirm this is intended. */
        int32x4_t exp = vcvtq_s32_f32(vmlaq_f32(b, a, aVal));
        /* reinterpret the integer bits as a float — the approximation itself */
        float32x4_t bVal = vreinterpretq_f32_s32(exp);
        vst1q_f32(bPtr, bVal);

        aPtr += 4;
        bPtr += 4;
    }

    /* Tail: remaining (num_points % 4) elements use the exact libm expf(). */
    number = quarterPoints * 4;
    for (; number < num_points; number++) {
        *bPtr++ = expf(*aPtr++);
    }
}
334
335#endif /* LV_HAVE_NEON */
336
337#ifdef LV_HAVE_NEONV8
338#include <arm_neon.h>
339
/*!
 * Approximate e^x for eight floats per iteration (ARMv8 NEON, FMA).
 *
 * Same IEEE-754 bit trick as the other kernels, but unrolled two 4-lane
 * vectors per loop pass and using the fused multiply-add vfmaq_f32
 * (Mln2/A/B/C macros are defined at the top of this file).
 *
 * \param bVector    output buffer
 * \param aVector    input buffer
 * \param num_points number of floats to process
 */
static inline void
volk_32f_expfast_32f_neonv8(float* bVector, const float* aVector, unsigned int num_points)
{
    float* bPtr = bVector;
    const float* aPtr = aVector;

    unsigned int number = 0;
    const unsigned int eighthPoints = num_points / 8;

    float32x4_t a = vdupq_n_f32(A / Mln2); /* scale: 2^23 / ln(2) */
    float32x4_t b = vdupq_n_f32(B - C);    /* exponent bias minus correction */

    for (; number < eighthPoints; number++) {
        /* hint the upcoming inputs into cache (VOLK project macro) */
        __VOLK_PREFETCH(aPtr + 16);

        float32x4_t aVal0 = vld1q_f32(aPtr);
        float32x4_t aVal1 = vld1q_f32(aPtr + 4);

        /* vfmaq_f32(b, a, x) = b + a*x fused; then convert to int32 */
        int32x4_t exp0 = vcvtq_s32_f32(vfmaq_f32(b, a, aVal0));
        int32x4_t exp1 = vcvtq_s32_f32(vfmaq_f32(b, a, aVal1));

        /* reinterpret the integer bits as floats — the approximation itself */
        float32x4_t bVal0 = vreinterpretq_f32_s32(exp0);
        float32x4_t bVal1 = vreinterpretq_f32_s32(exp1);

        vst1q_f32(bPtr, bVal0);
        vst1q_f32(bPtr + 4, bVal1);

        aPtr += 8;
        bPtr += 8;
    }

    /* Tail: remaining (num_points % 8) elements use the exact libm expf(). */
    number = eighthPoints * 8;
    for (; number < num_points; number++) {
        *bPtr++ = expf(*aPtr++);
    }
}
376
377#endif /* LV_HAVE_NEONV8 */
378
379#ifdef LV_HAVE_RVV
380#include <riscv_vector.h>
381
/*!
 * Approximate e^x using RISC-V Vector stripmining.
 *
 * Same IEEE-754 bit trick as the other kernels: (A/ln2)*x + (B - C),
 * convert to int32, reinterpret the bits as float (macros defined file-top).
 * Unlike the x86/NEON paths there is no scalar tail loop: vsetvl shrinks
 * the active vector length to cover the final partial chunk.
 *
 * \param bVector    output buffer
 * \param aVector    input buffer
 * \param num_points number of floats to process
 */
static inline void
volk_32f_expfast_32f_rvv(float* bVector, const float* aVector, unsigned int num_points)
{
    /* broadcast the two constants once, at the maximum vector length */
    size_t vlmax = __riscv_vsetvlmax_e32m8();
    const vfloat32m8_t ca = __riscv_vfmv_v_f_f32m8(A / Mln2, vlmax);
    const vfloat32m8_t cb = __riscv_vfmv_v_f_f32m8(B - C, vlmax);

    size_t n = num_points;
    for (size_t vl; n > 0; n -= vl, aVector += vl, bVector += vl) {
        vl = __riscv_vsetvl_e32m8(n); /* elements handled this pass */
        vfloat32m8_t v = __riscv_vle32_v_f32m8(aVector, vl);
        /* v = v*ca + cb (fused multiply-add) */
        v = __riscv_vfmadd(v, ca, cb, vl);
        /* convert to int32, then reinterpret the bits as float */
        v = __riscv_vreinterpret_f32m8(__riscv_vfcvt_x(v, vl));
        __riscv_vse32(bVector, v, vl);
    }
}
398#endif /*LV_HAVE_RVV*/
399
400#endif /* INCLUDED_volk_32f_expfast_32f_u_H */