Vector Optimized Library of Kernels 3.3.0
Architecture-tuned implementations of math kernels
volk_32f_x2_pow_32f.h
1/* -*- c++ -*- */
2/*
3 * Copyright 2014 Free Software Foundation, Inc.
4 *
5 * This file is part of VOLK
6 *
7 * SPDX-License-Identifier: LGPL-3.0-or-later
8 */
9
59
60#ifndef INCLUDED_volk_32f_x2_pow_32f_a_H
61#define INCLUDED_volk_32f_x2_pow_32f_a_H
62
63#include <inttypes.h>
64#include <math.h>
65#include <stdio.h>
66#include <stdlib.h>
67
68#define POW_POLY_DEGREE 3
69
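/*
 * Implementation overview (descriptive note on the kernels below): every SIMD
 * variant in this file evaluates c = a^b as exp(b * ln(a)).  ln(a) is obtained
 * by pulling the unbiased IEEE-754 exponent out of the float's bit pattern and
 * refining it with a small polynomial in the mantissa; POW_POLY_DEGREE selects
 * the number of coefficients in that polynomial (3 coefficients = a degree-2
 * fit).  The product b*ln(a) is then exponentiated with a Cephes-style expf:
 * range-reduce by powers of two, evaluate a short Taylor polynomial on the
 * reduced argument, and rebuild the power of two directly in the exponent
 * bits.  Because the SIMD paths go through the logarithm, they assume positive
 * bases; only the scalar tail loops defer to powf()/pow().
 */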
70#if LV_HAVE_AVX2 && LV_HAVE_FMA
71#include <immintrin.h>
72
73#define POLY0_AVX2_FMA(x, c0) _mm256_set1_ps(c0)
74#define POLY1_AVX2_FMA(x, c0, c1) \
75 _mm256_fmadd_ps(POLY0_AVX2_FMA(x, c1), x, _mm256_set1_ps(c0))
76#define POLY2_AVX2_FMA(x, c0, c1, c2) \
77 _mm256_fmadd_ps(POLY1_AVX2_FMA(x, c1, c2), x, _mm256_set1_ps(c0))
78#define POLY3_AVX2_FMA(x, c0, c1, c2, c3) \
79 _mm256_fmadd_ps(POLY2_AVX2_FMA(x, c1, c2, c3), x, _mm256_set1_ps(c0))
80#define POLY4_AVX2_FMA(x, c0, c1, c2, c3, c4) \
81 _mm256_fmadd_ps(POLY3_AVX2_FMA(x, c1, c2, c3, c4), x, _mm256_set1_ps(c0))
82#define POLY5_AVX2_FMA(x, c0, c1, c2, c3, c4, c5) \
83 _mm256_fmadd_ps(POLY4_AVX2_FMA(x, c1, c2, c3, c4, c5), x, _mm256_set1_ps(c0))
84
85static inline void volk_32f_x2_pow_32f_a_avx2_fma(float* cVector,
86 const float* bVector,
87 const float* aVector,
88 unsigned int num_points)
89{
90 float* cPtr = cVector;
91 const float* bPtr = bVector;
92 const float* aPtr = aVector;
93
94 unsigned int number = 0;
95 const unsigned int eighthPoints = num_points / 8;
96
97 __m256 aVal, bVal, cVal, logarithm, mantissa, frac, leadingOne;
98 __m256 tmp, fx, mask, pow2n, z, y;
99 __m256 one, exp_hi, exp_lo, ln2, log2EF, half, exp_C1, exp_C2;
100 __m256 exp_p0, exp_p1, exp_p2, exp_p3, exp_p4, exp_p5;
101 __m256i bias, exp, emm0, pi32_0x7f;
102
103 one = _mm256_set1_ps(1.0);
104 exp_hi = _mm256_set1_ps(88.3762626647949);
105 exp_lo = _mm256_set1_ps(-88.3762626647949);
106 ln2 = _mm256_set1_ps(0.6931471805);
107 log2EF = _mm256_set1_ps(1.44269504088896341);
108 half = _mm256_set1_ps(0.5);
109 exp_C1 = _mm256_set1_ps(0.693359375);
110 exp_C2 = _mm256_set1_ps(-2.12194440e-4);
111 pi32_0x7f = _mm256_set1_epi32(0x7f);
112
113 exp_p0 = _mm256_set1_ps(1.9875691500e-4);
114 exp_p1 = _mm256_set1_ps(1.3981999507e-3);
115 exp_p2 = _mm256_set1_ps(8.3334519073e-3);
116 exp_p3 = _mm256_set1_ps(4.1665795894e-2);
117 exp_p4 = _mm256_set1_ps(1.6666665459e-1);
118 exp_p5 = _mm256_set1_ps(5.0000001201e-1);
119
120 for (; number < eighthPoints; number++) {
121 // First compute the logarithm
122 aVal = _mm256_load_ps(aPtr);
123 bias = _mm256_set1_epi32(127);
124 leadingOne = _mm256_set1_ps(1.0f);
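        // Integer part of log2(a): isolate the biased exponent field and subtract the bias (127).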
125 exp = _mm256_sub_epi32(
126 _mm256_srli_epi32(_mm256_and_si256(_mm256_castps_si256(aVal),
127 _mm256_set1_epi32(0x7f800000)),
128 23),
129 bias);
130 logarithm = _mm256_cvtepi32_ps(exp);
131
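        // Mantissa forced into [1.0, 2.0): keep the stored fraction bits and OR in the bit pattern of 1.0f.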
132 frac = _mm256_or_ps(
133 leadingOne,
134 _mm256_and_ps(aVal, _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffff))));
135
136#if POW_POLY_DEGREE == 6
137 mantissa = POLY5_AVX2_FMA(frac,
138 3.1157899f,
139 -3.3241990f,
140 2.5988452f,
141 -1.2315303f,
142 3.1821337e-1f,
143 -3.4436006e-2f);
144#elif POW_POLY_DEGREE == 5
145 mantissa = POLY4_AVX2_FMA(frac,
146 2.8882704548164776201f,
147 -2.52074962577807006663f,
148 1.48116647521213171641f,
149 -0.465725644288844778798f,
150 0.0596515482674574969533f);
151#elif POW_POLY_DEGREE == 4
152 mantissa = POLY3_AVX2_FMA(frac,
153 2.61761038894603480148f,
154 -1.75647175389045657003f,
155 0.688243882994381274313f,
156 -0.107254423828329604454f);
157#elif POW_POLY_DEGREE == 3
158 mantissa = POLY2_AVX2_FMA(frac,
159 2.28330284476918490682f,
160 -1.04913055217340124191f,
161 0.204446009836232697516f);
162#else
163#error "POW_POLY_DEGREE must be 3, 4, 5, or 6"
164#endif
165
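        // log2(a) = exponent + P(frac) * (frac - 1); multiplying by ln(2) converts it to the natural log.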
166 logarithm = _mm256_fmadd_ps(mantissa, _mm256_sub_ps(frac, leadingOne), logarithm);
167 logarithm = _mm256_mul_ps(logarithm, ln2);
168
169 // Now calculate b*lna
170 bVal = _mm256_load_ps(bPtr);
171 bVal = _mm256_mul_ps(bVal, logarithm);
172
173 // Now compute exp(b*lna)
174 bVal = _mm256_max_ps(_mm256_min_ps(bVal, exp_hi), exp_lo);
175
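        // fx = round(b*ln(a) * log2(e)); the +0.5 plus the truncation/compare below emulates round-to-nearest.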
176 fx = _mm256_fmadd_ps(bVal, log2EF, half);
177
178 emm0 = _mm256_cvttps_epi32(fx);
179 tmp = _mm256_cvtepi32_ps(emm0);
180
181 mask = _mm256_and_ps(_mm256_cmp_ps(tmp, fx, _CMP_GT_OS), one);
182 fx = _mm256_sub_ps(tmp, mask);
183
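        // Reduced argument r = x - fx*ln(2), with ln(2) split into exp_C1 + exp_C2 for extra precision.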
184 tmp = _mm256_fnmadd_ps(fx, exp_C1, bVal);
185 bVal = _mm256_fnmadd_ps(fx, exp_C2, tmp);
186 z = _mm256_mul_ps(bVal, bVal);
187
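        // exp(r) ~= 1 + r + r^2 * (1/2 + r/6 + r^2/24 + ...), evaluated by Horner's rule.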
188 y = _mm256_fmadd_ps(exp_p0, bVal, exp_p1);
189 y = _mm256_fmadd_ps(y, bVal, exp_p2);
190 y = _mm256_fmadd_ps(y, bVal, exp_p3);
191 y = _mm256_fmadd_ps(y, bVal, exp_p4);
192 y = _mm256_fmadd_ps(y, bVal, exp_p5);
193 y = _mm256_fmadd_ps(y, z, bVal);
194 y = _mm256_add_ps(y, one);
195
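        // Scale by 2^fx: add fx to the exponent bias (0x7f) and shift it into the float's exponent field.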
196 emm0 =
197 _mm256_slli_epi32(_mm256_add_epi32(_mm256_cvttps_epi32(fx), pi32_0x7f), 23);
198
199 pow2n = _mm256_castsi256_ps(emm0);
200 cVal = _mm256_mul_ps(y, pow2n);
201
202 _mm256_store_ps(cPtr, cVal);
203
204 aPtr += 8;
205 bPtr += 8;
206 cPtr += 8;
207 }
208
209 number = eighthPoints * 8;
210 for (; number < num_points; number++) {
211 *cPtr++ = powf(*aPtr++, *bPtr++);
212 }
213}
214
215#endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for aligned */
216
217#ifdef LV_HAVE_AVX2
218#include <immintrin.h>
219
220#define POLY0_AVX2(x, c0) _mm256_set1_ps(c0)
221#define POLY1_AVX2(x, c0, c1) \
222 _mm256_add_ps(_mm256_mul_ps(POLY0_AVX2(x, c1), x), _mm256_set1_ps(c0))
223#define POLY2_AVX2(x, c0, c1, c2) \
224 _mm256_add_ps(_mm256_mul_ps(POLY1_AVX2(x, c1, c2), x), _mm256_set1_ps(c0))
225#define POLY3_AVX2(x, c0, c1, c2, c3) \
226 _mm256_add_ps(_mm256_mul_ps(POLY2_AVX2(x, c1, c2, c3), x), _mm256_set1_ps(c0))
227#define POLY4_AVX2(x, c0, c1, c2, c3, c4) \
228 _mm256_add_ps(_mm256_mul_ps(POLY3_AVX2(x, c1, c2, c3, c4), x), _mm256_set1_ps(c0))
229#define POLY5_AVX2(x, c0, c1, c2, c3, c4, c5) \
230 _mm256_add_ps(_mm256_mul_ps(POLY4_AVX2(x, c1, c2, c3, c4, c5), x), _mm256_set1_ps(c0))
231
232static inline void volk_32f_x2_pow_32f_a_avx2(float* cVector,
233 const float* bVector,
234 const float* aVector,
235 unsigned int num_points)
236{
237 float* cPtr = cVector;
238 const float* bPtr = bVector;
239 const float* aPtr = aVector;
240
241 unsigned int number = 0;
242 const unsigned int eighthPoints = num_points / 8;
243
244 __m256 aVal, bVal, cVal, logarithm, mantissa, frac, leadingOne;
245 __m256 tmp, fx, mask, pow2n, z, y;
246 __m256 one, exp_hi, exp_lo, ln2, log2EF, half, exp_C1, exp_C2;
247 __m256 exp_p0, exp_p1, exp_p2, exp_p3, exp_p4, exp_p5;
248 __m256i bias, exp, emm0, pi32_0x7f;
249
250 one = _mm256_set1_ps(1.0);
251 exp_hi = _mm256_set1_ps(88.3762626647949);
252 exp_lo = _mm256_set1_ps(-88.3762626647949);
253 ln2 = _mm256_set1_ps(0.6931471805);
254 log2EF = _mm256_set1_ps(1.44269504088896341);
255 half = _mm256_set1_ps(0.5);
256 exp_C1 = _mm256_set1_ps(0.693359375);
257 exp_C2 = _mm256_set1_ps(-2.12194440e-4);
258 pi32_0x7f = _mm256_set1_epi32(0x7f);
259
260 exp_p0 = _mm256_set1_ps(1.9875691500e-4);
261 exp_p1 = _mm256_set1_ps(1.3981999507e-3);
262 exp_p2 = _mm256_set1_ps(8.3334519073e-3);
263 exp_p3 = _mm256_set1_ps(4.1665795894e-2);
264 exp_p4 = _mm256_set1_ps(1.6666665459e-1);
265 exp_p5 = _mm256_set1_ps(5.0000001201e-1);
266
267 for (; number < eighthPoints; number++) {
268 // First compute the logarithm
269 aVal = _mm256_load_ps(aPtr);
270 bias = _mm256_set1_epi32(127);
271 leadingOne = _mm256_set1_ps(1.0f);
272 exp = _mm256_sub_epi32(
273 _mm256_srli_epi32(_mm256_and_si256(_mm256_castps_si256(aVal),
274 _mm256_set1_epi32(0x7f800000)),
275 23),
276 bias);
277 logarithm = _mm256_cvtepi32_ps(exp);
278
279 frac = _mm256_or_ps(
280 leadingOne,
281 _mm256_and_ps(aVal, _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffff))));
282
283#if POW_POLY_DEGREE == 6
284 mantissa = POLY5_AVX2(frac,
285 3.1157899f,
286 -3.3241990f,
287 2.5988452f,
288 -1.2315303f,
289 3.1821337e-1f,
290 -3.4436006e-2f);
291#elif POW_POLY_DEGREE == 5
292 mantissa = POLY4_AVX2(frac,
293 2.8882704548164776201f,
294 -2.52074962577807006663f,
295 1.48116647521213171641f,
296 -0.465725644288844778798f,
297 0.0596515482674574969533f);
298#elif POW_POLY_DEGREE == 4
299 mantissa = POLY3_AVX2(frac,
300 2.61761038894603480148f,
301 -1.75647175389045657003f,
302 0.688243882994381274313f,
303 -0.107254423828329604454f);
304#elif POW_POLY_DEGREE == 3
305 mantissa = POLY2_AVX2(frac,
306 2.28330284476918490682f,
307 -1.04913055217340124191f,
308 0.204446009836232697516f);
309#else
310#error "POW_POLY_DEGREE must be 3, 4, 5, or 6"
311#endif
312
313 logarithm = _mm256_add_ps(
314 _mm256_mul_ps(mantissa, _mm256_sub_ps(frac, leadingOne)), logarithm);
315 logarithm = _mm256_mul_ps(logarithm, ln2);
316
317 // Now calculate b*lna
318 bVal = _mm256_load_ps(bPtr);
319 bVal = _mm256_mul_ps(bVal, logarithm);
320
321 // Now compute exp(b*lna)
322 bVal = _mm256_max_ps(_mm256_min_ps(bVal, exp_hi), exp_lo);
323
324 fx = _mm256_add_ps(_mm256_mul_ps(bVal, log2EF), half);
325
326 emm0 = _mm256_cvttps_epi32(fx);
327 tmp = _mm256_cvtepi32_ps(emm0);
328
329 mask = _mm256_and_ps(_mm256_cmp_ps(tmp, fx, _CMP_GT_OS), one);
330 fx = _mm256_sub_ps(tmp, mask);
331
332 tmp = _mm256_sub_ps(bVal, _mm256_mul_ps(fx, exp_C1));
333 bVal = _mm256_sub_ps(tmp, _mm256_mul_ps(fx, exp_C2));
334 z = _mm256_mul_ps(bVal, bVal);
335
336 y = _mm256_add_ps(_mm256_mul_ps(exp_p0, bVal), exp_p1);
337 y = _mm256_add_ps(_mm256_mul_ps(y, bVal), exp_p2);
338 y = _mm256_add_ps(_mm256_mul_ps(y, bVal), exp_p3);
339 y = _mm256_add_ps(_mm256_mul_ps(y, bVal), exp_p4);
340 y = _mm256_add_ps(_mm256_mul_ps(y, bVal), exp_p5);
341 y = _mm256_add_ps(_mm256_mul_ps(y, z), bVal);
342 y = _mm256_add_ps(y, one);
343
344 emm0 =
345 _mm256_slli_epi32(_mm256_add_epi32(_mm256_cvttps_epi32(fx), pi32_0x7f), 23);
346
347 pow2n = _mm256_castsi256_ps(emm0);
348 cVal = _mm256_mul_ps(y, pow2n);
349
350 _mm256_store_ps(cPtr, cVal);
351
352 aPtr += 8;
353 bPtr += 8;
354 cPtr += 8;
355 }
356
357 number = eighthPoints * 8;
358 for (; number < num_points; number++) {
359 *cPtr++ = powf(*aPtr++, *bPtr++);
360 }
361}
362
363#endif /* LV_HAVE_AVX2 for aligned */
364
365
366#ifdef LV_HAVE_SSE4_1
367#include <smmintrin.h>
368
369#define POLY0(x, c0) _mm_set1_ps(c0)
370#define POLY1(x, c0, c1) _mm_add_ps(_mm_mul_ps(POLY0(x, c1), x), _mm_set1_ps(c0))
371#define POLY2(x, c0, c1, c2) _mm_add_ps(_mm_mul_ps(POLY1(x, c1, c2), x), _mm_set1_ps(c0))
372#define POLY3(x, c0, c1, c2, c3) \
373 _mm_add_ps(_mm_mul_ps(POLY2(x, c1, c2, c3), x), _mm_set1_ps(c0))
374#define POLY4(x, c0, c1, c2, c3, c4) \
375 _mm_add_ps(_mm_mul_ps(POLY3(x, c1, c2, c3, c4), x), _mm_set1_ps(c0))
376#define POLY5(x, c0, c1, c2, c3, c4, c5) \
377 _mm_add_ps(_mm_mul_ps(POLY4(x, c1, c2, c3, c4, c5), x), _mm_set1_ps(c0))
378
379static inline void volk_32f_x2_pow_32f_a_sse4_1(float* cVector,
380 const float* bVector,
381 const float* aVector,
382 unsigned int num_points)
383{
384 float* cPtr = cVector;
385 const float* bPtr = bVector;
386 const float* aPtr = aVector;
387
388 unsigned int number = 0;
389 const unsigned int quarterPoints = num_points / 4;
390
391 __m128 aVal, bVal, cVal, logarithm, mantissa, frac, leadingOne;
392 __m128 tmp, fx, mask, pow2n, z, y;
393 __m128 one, exp_hi, exp_lo, ln2, log2EF, half, exp_C1, exp_C2;
394 __m128 exp_p0, exp_p1, exp_p2, exp_p3, exp_p4, exp_p5;
395 __m128i bias, exp, emm0, pi32_0x7f;
396
397 one = _mm_set1_ps(1.0);
398 exp_hi = _mm_set1_ps(88.3762626647949);
399 exp_lo = _mm_set1_ps(-88.3762626647949);
400 ln2 = _mm_set1_ps(0.6931471805);
401 log2EF = _mm_set1_ps(1.44269504088896341);
402 half = _mm_set1_ps(0.5);
403 exp_C1 = _mm_set1_ps(0.693359375);
404 exp_C2 = _mm_set1_ps(-2.12194440e-4);
405 pi32_0x7f = _mm_set1_epi32(0x7f);
406
407 exp_p0 = _mm_set1_ps(1.9875691500e-4);
408 exp_p1 = _mm_set1_ps(1.3981999507e-3);
409 exp_p2 = _mm_set1_ps(8.3334519073e-3);
410 exp_p3 = _mm_set1_ps(4.1665795894e-2);
411 exp_p4 = _mm_set1_ps(1.6666665459e-1);
412 exp_p5 = _mm_set1_ps(5.0000001201e-1);
413
414 for (; number < quarterPoints; number++) {
415 // First compute the logarithm
416 aVal = _mm_load_ps(aPtr);
417 bias = _mm_set1_epi32(127);
418 leadingOne = _mm_set1_ps(1.0f);
419 exp = _mm_sub_epi32(
420 _mm_srli_epi32(
421 _mm_and_si128(_mm_castps_si128(aVal), _mm_set1_epi32(0x7f800000)), 23),
422 bias);
423 logarithm = _mm_cvtepi32_ps(exp);
424
425 frac = _mm_or_ps(leadingOne,
426 _mm_and_ps(aVal, _mm_castsi128_ps(_mm_set1_epi32(0x7fffff))));
427
428#if POW_POLY_DEGREE == 6
429 mantissa = POLY5(frac,
430 3.1157899f,
431 -3.3241990f,
432 2.5988452f,
433 -1.2315303f,
434 3.1821337e-1f,
435 -3.4436006e-2f);
436#elif POW_POLY_DEGREE == 5
437 mantissa = POLY4(frac,
438 2.8882704548164776201f,
439 -2.52074962577807006663f,
440 1.48116647521213171641f,
441 -0.465725644288844778798f,
442 0.0596515482674574969533f);
443#elif POW_POLY_DEGREE == 4
444 mantissa = POLY3(frac,
445 2.61761038894603480148f,
446 -1.75647175389045657003f,
447 0.688243882994381274313f,
448 -0.107254423828329604454f);
449#elif POW_POLY_DEGREE == 3
450 mantissa = POLY2(frac,
451 2.28330284476918490682f,
452 -1.04913055217340124191f,
453 0.204446009836232697516f);
454#else
455#error "POW_POLY_DEGREE must be 3, 4, 5, or 6"
456#endif
457
458 logarithm =
459 _mm_add_ps(logarithm, _mm_mul_ps(mantissa, _mm_sub_ps(frac, leadingOne)));
460 logarithm = _mm_mul_ps(logarithm, ln2);
461
462
463 // Now calculate b*lna
464 bVal = _mm_load_ps(bPtr);
465 bVal = _mm_mul_ps(bVal, logarithm);
466
467 // Now compute exp(b*lna)
468 bVal = _mm_max_ps(_mm_min_ps(bVal, exp_hi), exp_lo);
469
470 fx = _mm_add_ps(_mm_mul_ps(bVal, log2EF), half);
471
472 emm0 = _mm_cvttps_epi32(fx);
473 tmp = _mm_cvtepi32_ps(emm0);
474
475 mask = _mm_and_ps(_mm_cmpgt_ps(tmp, fx), one);
476 fx = _mm_sub_ps(tmp, mask);
477
478 tmp = _mm_mul_ps(fx, exp_C1);
479 z = _mm_mul_ps(fx, exp_C2);
480 bVal = _mm_sub_ps(_mm_sub_ps(bVal, tmp), z);
481 z = _mm_mul_ps(bVal, bVal);
482
483 y = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(exp_p0, bVal), exp_p1), bVal);
484 y = _mm_add_ps(_mm_mul_ps(_mm_add_ps(y, exp_p2), bVal), exp_p3);
485 y = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(y, bVal), exp_p4), bVal);
486 y = _mm_add_ps(_mm_mul_ps(_mm_add_ps(y, exp_p5), z), bVal);
487 y = _mm_add_ps(y, one);
488
489 emm0 = _mm_slli_epi32(_mm_add_epi32(_mm_cvttps_epi32(fx), pi32_0x7f), 23);
490
491 pow2n = _mm_castsi128_ps(emm0);
492 cVal = _mm_mul_ps(y, pow2n);
493
494 _mm_store_ps(cPtr, cVal);
495
496 aPtr += 4;
497 bPtr += 4;
498 cPtr += 4;
499 }
500
501 number = quarterPoints * 4;
502 for (; number < num_points; number++) {
503 *cPtr++ = powf(*aPtr++, *bPtr++);
504 }
505}
506
507#endif /* LV_HAVE_SSE4_1 for aligned */
508
509#endif /* INCLUDED_volk_32f_x2_pow_32f_a_H */
510
511#ifndef INCLUDED_volk_32f_x2_pow_32f_u_H
512#define INCLUDED_volk_32f_x2_pow_32f_u_H
513
514#include <inttypes.h>
515#include <math.h>
516#include <stdio.h>
517#include <stdlib.h>
518
519#define POW_POLY_DEGREE 3
520
521#ifdef LV_HAVE_GENERIC
522
523static inline void volk_32f_x2_pow_32f_generic(float* cVector,
524 const float* bVector,
525 const float* aVector,
526 unsigned int num_points)
527{
528 float* cPtr = cVector;
529 const float* bPtr = bVector;
530 const float* aPtr = aVector;
531 unsigned int number = 0;
532
533 for (number = 0; number < num_points; number++) {
534 *cPtr++ = powf(*aPtr++, *bPtr++);
535 }
536}
537#endif /* LV_HAVE_GENERIC */
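/*
 * Usage sketch (assumes the standard VOLK public API; the buffer names are
 * illustrative and not part of this header): the generic kernel above is the
 * portable fallback behind the volk_32f_x2_pow_32f dispatcher, which picks the
 * best available variant at runtime.
 *
 *   #include <volk/volk.h>
 *
 *   unsigned int N = 1024;
 *   size_t align = volk_get_alignment();
 *   float* bases     = (float*)volk_malloc(sizeof(float) * N, align);
 *   float* exponents = (float*)volk_malloc(sizeof(float) * N, align);
 *   float* out       = (float*)volk_malloc(sizeof(float) * N, align);
 *
 *   for (unsigned int i = 0; i < N; i++) {
 *       bases[i] = 1.0f + (float)i / N;  // keep bases positive for the log-based SIMD paths
 *       exponents[i] = 2.5f;
 *   }
 *
 *   // argument order matches the kernels in this file: output, exponents, bases, length
 *   volk_32f_x2_pow_32f(out, exponents, bases, N);
 *
 *   volk_free(bases);
 *   volk_free(exponents);
 *   volk_free(out);
 */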
538
539#ifdef LV_HAVE_NEON
540#include <arm_neon.h>
541
542static inline void volk_32f_x2_pow_32f_neon(float* cVector,
543 const float* bVector,
544 const float* aVector,
545 unsigned int num_points)
546{
547 float* cPtr = cVector;
548 const float* bPtr = bVector;
549 const float* aPtr = aVector;
550
551 unsigned int number = 0;
552 const unsigned int quarterPoints = num_points / 4;
553
554 // Constants
555 float32x4_t one = vdupq_n_f32(1.0f);
556 float32x4_t exp_hi = vdupq_n_f32(88.3762626647949f);
557 float32x4_t exp_lo = vdupq_n_f32(-88.3762626647949f);
558 float32x4_t ln2 = vdupq_n_f32(0.6931471805f);
559 float32x4_t log2EF = vdupq_n_f32(1.44269504088896341f);
560 float32x4_t half = vdupq_n_f32(0.5f);
561 float32x4_t exp_C1 = vdupq_n_f32(0.693359375f);
562 float32x4_t exp_C2 = vdupq_n_f32(-2.12194440e-4f);
563 int32x4_t pi32_0x7f = vdupq_n_s32(0x7f);
564 int32x4_t bias = vdupq_n_s32(127);
565 int32x4_t mantMask = vdupq_n_s32(0x7fffff);
566 int32x4_t expMask = vdupq_n_s32(0x7f800000);
567
568 // Polynomial coefficients for log
569 float32x4_t log_c0 = vdupq_n_f32(2.28330284476918490682f);
570 float32x4_t log_c1 = vdupq_n_f32(-1.04913055217340124191f);
571 float32x4_t log_c2 = vdupq_n_f32(0.204446009836232697516f);
572
573 // Polynomial coefficients for exp
574 float32x4_t exp_p0 = vdupq_n_f32(1.9875691500e-4f);
575 float32x4_t exp_p1 = vdupq_n_f32(1.3981999507e-3f);
576 float32x4_t exp_p2 = vdupq_n_f32(8.3334519073e-3f);
577 float32x4_t exp_p3 = vdupq_n_f32(4.1665795894e-2f);
578 float32x4_t exp_p4 = vdupq_n_f32(1.6666665459e-1f);
579 float32x4_t exp_p5 = vdupq_n_f32(5.0000001201e-1f);
580
581 for (; number < quarterPoints; number++) {
582 float32x4_t aVal = vld1q_f32(aPtr);
583
584 // First compute log(a)
585 int32x4_t aInt = vreinterpretq_s32_f32(aVal);
586 int32x4_t expPart = vsubq_s32(vshrq_n_s32(vandq_s32(aInt, expMask), 23), bias);
587 float32x4_t logarithm = vcvtq_f32_s32(expPart);
588
589 int32x4_t mantPart =
590 vorrq_s32(vandq_s32(aInt, mantMask), vreinterpretq_s32_f32(one));
591 float32x4_t frac = vreinterpretq_f32_s32(mantPart);
592
593 // Polynomial for log mantissa (degree 3)
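        // vmlaq_f32(acc, x, y) = acc + x*y, so this is Horner's rule on the mantissa polynomial.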
594 float32x4_t mantissa = vmlaq_f32(log_c1, log_c2, frac);
595 mantissa = vmlaq_f32(log_c0, mantissa, frac);
596
597 float32x4_t fracMinusOne = vsubq_f32(frac, one);
598 logarithm = vmlaq_f32(logarithm, mantissa, fracMinusOne);
599 logarithm = vmulq_f32(logarithm, ln2);
600
601 // Now calculate b*log(a)
602 float32x4_t bVal = vld1q_f32(bPtr);
603 bVal = vmulq_f32(bVal, logarithm);
604
605 // Now compute exp(b*log(a))
606 bVal = vmaxq_f32(vminq_f32(bVal, exp_hi), exp_lo);
607
608 float32x4_t fx = vmlaq_f32(half, bVal, log2EF);
609
610 int32x4_t emm0 = vcvtq_s32_f32(fx);
611 float32x4_t tmp = vcvtq_f32_s32(emm0);
612
613 uint32x4_t mask = vcgtq_f32(tmp, fx);
614 float32x4_t mask_one = vbslq_f32(mask, one, vdupq_n_f32(0.0f));
615 fx = vsubq_f32(tmp, mask_one);
616
617 tmp = vmulq_f32(fx, exp_C1);
618 float32x4_t z = vmulq_f32(fx, exp_C2);
619 bVal = vsubq_f32(vsubq_f32(bVal, tmp), z);
620 z = vmulq_f32(bVal, bVal);
621
622 float32x4_t y = vmlaq_f32(exp_p1, exp_p0, bVal);
623 y = vmulq_f32(y, bVal);
624 y = vaddq_f32(y, exp_p2);
625 y = vmulq_f32(y, bVal);
626 y = vaddq_f32(y, exp_p3);
627 y = vmlaq_f32(exp_p4, y, bVal);
628 y = vmulq_f32(y, bVal);
629 y = vaddq_f32(y, exp_p5);
630 y = vmlaq_f32(bVal, y, z);
631 y = vaddq_f32(y, one);
632
633 emm0 = vcvtq_s32_f32(fx);
634 emm0 = vaddq_s32(emm0, pi32_0x7f);
635 emm0 = vshlq_n_s32(emm0, 23);
636 float32x4_t pow2n = vreinterpretq_f32_s32(emm0);
637
638 float32x4_t cVal = vmulq_f32(y, pow2n);
639 vst1q_f32(cPtr, cVal);
640
641 aPtr += 4;
642 bPtr += 4;
643 cPtr += 4;
644 }
645
646 number = quarterPoints * 4;
647 for (; number < num_points; number++) {
648 *cPtr++ = powf(*aPtr++, *bPtr++);
649 }
650}
651
652#endif /* LV_HAVE_NEON */
653
654#ifdef LV_HAVE_NEONV8
655#include <arm_neon.h>
656
657static inline void volk_32f_x2_pow_32f_neonv8(float* cVector,
658 const float* bVector,
659 const float* aVector,
660 unsigned int num_points)
661{
662 float* cPtr = cVector;
663 const float* bPtr = bVector;
664 const float* aPtr = aVector;
665
666 unsigned int number = 0;
667 const unsigned int quarterPoints = num_points / 4;
668
669 // Constants
670 float32x4_t one = vdupq_n_f32(1.0f);
671 float32x4_t exp_hi = vdupq_n_f32(88.3762626647949f);
672 float32x4_t exp_lo = vdupq_n_f32(-88.3762626647949f);
673 float32x4_t ln2 = vdupq_n_f32(0.6931471805f);
674 float32x4_t log2EF = vdupq_n_f32(1.44269504088896341f);
675 float32x4_t half = vdupq_n_f32(0.5f);
676 float32x4_t exp_C1 = vdupq_n_f32(0.693359375f);
677 float32x4_t exp_C2 = vdupq_n_f32(-2.12194440e-4f);
678 int32x4_t pi32_0x7f = vdupq_n_s32(0x7f);
679 int32x4_t bias = vdupq_n_s32(127);
680 int32x4_t mantMask = vdupq_n_s32(0x7fffff);
681 int32x4_t expMask = vdupq_n_s32(0x7f800000);
682
683 // Polynomial coefficients for log
684 float32x4_t log_c0 = vdupq_n_f32(2.28330284476918490682f);
685 float32x4_t log_c1 = vdupq_n_f32(-1.04913055217340124191f);
686 float32x4_t log_c2 = vdupq_n_f32(0.204446009836232697516f);
687
688 // Polynomial coefficients for exp
689 float32x4_t exp_p0 = vdupq_n_f32(1.9875691500e-4f);
690 float32x4_t exp_p1 = vdupq_n_f32(1.3981999507e-3f);
691 float32x4_t exp_p2 = vdupq_n_f32(8.3334519073e-3f);
692 float32x4_t exp_p3 = vdupq_n_f32(4.1665795894e-2f);
693 float32x4_t exp_p4 = vdupq_n_f32(1.6666665459e-1f);
694 float32x4_t exp_p5 = vdupq_n_f32(5.0000001201e-1f);
695
696 for (; number < quarterPoints; number++) {
697 __VOLK_PREFETCH(aPtr + 8);
698 __VOLK_PREFETCH(bPtr + 8);
699
700 float32x4_t aVal = vld1q_f32(aPtr);
701
702 // First compute log(a)
703 int32x4_t aInt = vreinterpretq_s32_f32(aVal);
704 int32x4_t expPart = vsubq_s32(vshrq_n_s32(vandq_s32(aInt, expMask), 23), bias);
705 float32x4_t logarithm = vcvtq_f32_s32(expPart);
706
707 int32x4_t mantPart =
708 vorrq_s32(vandq_s32(aInt, mantMask), vreinterpretq_s32_f32(one));
709 float32x4_t frac = vreinterpretq_f32_s32(mantPart);
710
711 // Polynomial for log mantissa (degree 3)
712 float32x4_t mantissa = vfmaq_f32(log_c1, log_c2, frac);
713 mantissa = vfmaq_f32(log_c0, mantissa, frac);
714
715 float32x4_t fracMinusOne = vsubq_f32(frac, one);
716 logarithm = vfmaq_f32(logarithm, mantissa, fracMinusOne);
717 logarithm = vmulq_f32(logarithm, ln2);
718
719 // Now calculate b*log(a)
720 float32x4_t bVal = vld1q_f32(bPtr);
721 bVal = vmulq_f32(bVal, logarithm);
722
723 // Now compute exp(b*log(a))
724 bVal = vmaxq_f32(vminq_f32(bVal, exp_hi), exp_lo);
725
726 float32x4_t fx = vfmaq_f32(half, bVal, log2EF);
727
728 int32x4_t emm0 = vcvtq_s32_f32(fx);
729 float32x4_t tmp = vcvtq_f32_s32(emm0);
730
731 uint32x4_t mask = vcgtq_f32(tmp, fx);
732 float32x4_t mask_one = vbslq_f32(mask, one, vdupq_n_f32(0.0f));
733 fx = vsubq_f32(tmp, mask_one);
734
735 tmp = vmulq_f32(fx, exp_C1);
736 float32x4_t z = vmulq_f32(fx, exp_C2);
737 bVal = vsubq_f32(vsubq_f32(bVal, tmp), z);
738 z = vmulq_f32(bVal, bVal);
739
740 float32x4_t y = vfmaq_f32(exp_p1, exp_p0, bVal);
741 y = vmulq_f32(y, bVal);
742 y = vaddq_f32(y, exp_p2);
743 y = vmulq_f32(y, bVal);
744 y = vaddq_f32(y, exp_p3);
745 y = vfmaq_f32(exp_p4, y, bVal);
746 y = vmulq_f32(y, bVal);
747 y = vaddq_f32(y, exp_p5);
748 y = vfmaq_f32(bVal, y, z);
749 y = vaddq_f32(y, one);
750
751 emm0 = vcvtq_s32_f32(fx);
752 emm0 = vaddq_s32(emm0, pi32_0x7f);
753 emm0 = vshlq_n_s32(emm0, 23);
754 float32x4_t pow2n = vreinterpretq_f32_s32(emm0);
755
756 float32x4_t cVal = vmulq_f32(y, pow2n);
757 vst1q_f32(cPtr, cVal);
758
759 aPtr += 4;
760 bPtr += 4;
761 cPtr += 4;
762 }
763
764 number = quarterPoints * 4;
765 for (; number < num_points; number++) {
766 *cPtr++ = powf(*aPtr++, *bPtr++);
767 }
768}
769
770#endif /* LV_HAVE_NEONV8 */
771
772#ifdef LV_HAVE_SSE4_1
773#include <smmintrin.h>
774
775#define POLY0(x, c0) _mm_set1_ps(c0)
776#define POLY1(x, c0, c1) _mm_add_ps(_mm_mul_ps(POLY0(x, c1), x), _mm_set1_ps(c0))
777#define POLY2(x, c0, c1, c2) _mm_add_ps(_mm_mul_ps(POLY1(x, c1, c2), x), _mm_set1_ps(c0))
778#define POLY3(x, c0, c1, c2, c3) \
779 _mm_add_ps(_mm_mul_ps(POLY2(x, c1, c2, c3), x), _mm_set1_ps(c0))
780#define POLY4(x, c0, c1, c2, c3, c4) \
781 _mm_add_ps(_mm_mul_ps(POLY3(x, c1, c2, c3, c4), x), _mm_set1_ps(c0))
782#define POLY5(x, c0, c1, c2, c3, c4, c5) \
783 _mm_add_ps(_mm_mul_ps(POLY4(x, c1, c2, c3, c4, c5), x), _mm_set1_ps(c0))
784
785static inline void volk_32f_x2_pow_32f_u_sse4_1(float* cVector,
786 const float* bVector,
787 const float* aVector,
788 unsigned int num_points)
789{
790 float* cPtr = cVector;
791 const float* bPtr = bVector;
792 const float* aPtr = aVector;
793
794 unsigned int number = 0;
795 const unsigned int quarterPoints = num_points / 4;
796
797 __m128 aVal, bVal, cVal, logarithm, mantissa, frac, leadingOne;
798 __m128 tmp, fx, mask, pow2n, z, y;
799 __m128 one, exp_hi, exp_lo, ln2, log2EF, half, exp_C1, exp_C2;
800 __m128 exp_p0, exp_p1, exp_p2, exp_p3, exp_p4, exp_p5;
801 __m128i bias, exp, emm0, pi32_0x7f;
802
803 one = _mm_set1_ps(1.0);
804 exp_hi = _mm_set1_ps(88.3762626647949);
805 exp_lo = _mm_set1_ps(-88.3762626647949);
806 ln2 = _mm_set1_ps(0.6931471805);
807 log2EF = _mm_set1_ps(1.44269504088896341);
808 half = _mm_set1_ps(0.5);
809 exp_C1 = _mm_set1_ps(0.693359375);
810 exp_C2 = _mm_set1_ps(-2.12194440e-4);
811 pi32_0x7f = _mm_set1_epi32(0x7f);
812
813 exp_p0 = _mm_set1_ps(1.9875691500e-4);
814 exp_p1 = _mm_set1_ps(1.3981999507e-3);
815 exp_p2 = _mm_set1_ps(8.3334519073e-3);
816 exp_p3 = _mm_set1_ps(4.1665795894e-2);
817 exp_p4 = _mm_set1_ps(1.6666665459e-1);
818 exp_p5 = _mm_set1_ps(5.0000001201e-1);
819
820 for (; number < quarterPoints; number++) {
821 // First compute the logarithm
822 aVal = _mm_loadu_ps(aPtr);
823 bias = _mm_set1_epi32(127);
824 leadingOne = _mm_set1_ps(1.0f);
825 exp = _mm_sub_epi32(
826 _mm_srli_epi32(
827 _mm_and_si128(_mm_castps_si128(aVal), _mm_set1_epi32(0x7f800000)), 23),
828 bias);
829 logarithm = _mm_cvtepi32_ps(exp);
830
831 frac = _mm_or_ps(leadingOne,
832 _mm_and_ps(aVal, _mm_castsi128_ps(_mm_set1_epi32(0x7fffff))));
833
834#if POW_POLY_DEGREE == 6
835 mantissa = POLY5(frac,
836 3.1157899f,
837 -3.3241990f,
838 2.5988452f,
839 -1.2315303f,
840 3.1821337e-1f,
841 -3.4436006e-2f);
842#elif POW_POLY_DEGREE == 5
843 mantissa = POLY4(frac,
844 2.8882704548164776201f,
845 -2.52074962577807006663f,
846 1.48116647521213171641f,
847 -0.465725644288844778798f,
848 0.0596515482674574969533f);
849#elif POW_POLY_DEGREE == 4
850 mantissa = POLY3(frac,
851 2.61761038894603480148f,
852 -1.75647175389045657003f,
853 0.688243882994381274313f,
854 -0.107254423828329604454f);
855#elif POW_POLY_DEGREE == 3
856 mantissa = POLY2(frac,
857 2.28330284476918490682f,
858 -1.04913055217340124191f,
859 0.204446009836232697516f);
860#else
861#error "POW_POLY_DEGREE must be 3, 4, 5, or 6"
862#endif
863
864 logarithm =
865 _mm_add_ps(logarithm, _mm_mul_ps(mantissa, _mm_sub_ps(frac, leadingOne)));
866 logarithm = _mm_mul_ps(logarithm, ln2);
867
868
869 // Now calculate b*lna
870 bVal = _mm_loadu_ps(bPtr);
871 bVal = _mm_mul_ps(bVal, logarithm);
872
873 // Now compute exp(b*lna)
874 bVal = _mm_max_ps(_mm_min_ps(bVal, exp_hi), exp_lo);
875
876 fx = _mm_add_ps(_mm_mul_ps(bVal, log2EF), half);
877
878 emm0 = _mm_cvttps_epi32(fx);
879 tmp = _mm_cvtepi32_ps(emm0);
880
881 mask = _mm_and_ps(_mm_cmpgt_ps(tmp, fx), one);
882 fx = _mm_sub_ps(tmp, mask);
883
884 tmp = _mm_mul_ps(fx, exp_C1);
885 z = _mm_mul_ps(fx, exp_C2);
886 bVal = _mm_sub_ps(_mm_sub_ps(bVal, tmp), z);
887 z = _mm_mul_ps(bVal, bVal);
888
889 y = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(exp_p0, bVal), exp_p1), bVal);
890 y = _mm_add_ps(_mm_mul_ps(_mm_add_ps(y, exp_p2), bVal), exp_p3);
891 y = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(y, bVal), exp_p4), bVal);
892 y = _mm_add_ps(_mm_mul_ps(_mm_add_ps(y, exp_p5), z), bVal);
893 y = _mm_add_ps(y, one);
894
895 emm0 = _mm_slli_epi32(_mm_add_epi32(_mm_cvttps_epi32(fx), pi32_0x7f), 23);
896
897 pow2n = _mm_castsi128_ps(emm0);
898 cVal = _mm_mul_ps(y, pow2n);
899
900 _mm_storeu_ps(cPtr, cVal);
901
902 aPtr += 4;
903 bPtr += 4;
904 cPtr += 4;
905 }
906
907 number = quarterPoints * 4;
908 for (; number < num_points; number++) {
909 *cPtr++ = powf(*aPtr++, *bPtr++);
910 }
911}
912
913#endif /* LV_HAVE_SSE4_1 for unaligned */
914
915#if LV_HAVE_AVX2 && LV_HAVE_FMA
916#include <immintrin.h>
917
918#define POLY0_AVX2_FMA(x, c0) _mm256_set1_ps(c0)
919#define POLY1_AVX2_FMA(x, c0, c1) \
920 _mm256_fmadd_ps(POLY0_AVX2_FMA(x, c1), x, _mm256_set1_ps(c0))
921#define POLY2_AVX2_FMA(x, c0, c1, c2) \
922 _mm256_fmadd_ps(POLY1_AVX2_FMA(x, c1, c2), x, _mm256_set1_ps(c0))
923#define POLY3_AVX2_FMA(x, c0, c1, c2, c3) \
924 _mm256_fmadd_ps(POLY2_AVX2_FMA(x, c1, c2, c3), x, _mm256_set1_ps(c0))
925#define POLY4_AVX2_FMA(x, c0, c1, c2, c3, c4) \
926 _mm256_fmadd_ps(POLY3_AVX2_FMA(x, c1, c2, c3, c4), x, _mm256_set1_ps(c0))
927#define POLY5_AVX2_FMA(x, c0, c1, c2, c3, c4, c5) \
928 _mm256_fmadd_ps(POLY4_AVX2_FMA(x, c1, c2, c3, c4, c5), x, _mm256_set1_ps(c0))
929
930static inline void volk_32f_x2_pow_32f_u_avx2_fma(float* cVector,
931 const float* bVector,
932 const float* aVector,
933 unsigned int num_points)
934{
935 float* cPtr = cVector;
936 const float* bPtr = bVector;
937 const float* aPtr = aVector;
938
939 unsigned int number = 0;
940 const unsigned int eighthPoints = num_points / 8;
941
942 __m256 aVal, bVal, cVal, logarithm, mantissa, frac, leadingOne;
943 __m256 tmp, fx, mask, pow2n, z, y;
944 __m256 one, exp_hi, exp_lo, ln2, log2EF, half, exp_C1, exp_C2;
945 __m256 exp_p0, exp_p1, exp_p2, exp_p3, exp_p4, exp_p5;
946 __m256i bias, exp, emm0, pi32_0x7f;
947
948 one = _mm256_set1_ps(1.0);
949 exp_hi = _mm256_set1_ps(88.3762626647949);
950 exp_lo = _mm256_set1_ps(-88.3762626647949);
951 ln2 = _mm256_set1_ps(0.6931471805);
952 log2EF = _mm256_set1_ps(1.44269504088896341);
953 half = _mm256_set1_ps(0.5);
954 exp_C1 = _mm256_set1_ps(0.693359375);
955 exp_C2 = _mm256_set1_ps(-2.12194440e-4);
956 pi32_0x7f = _mm256_set1_epi32(0x7f);
957
958 exp_p0 = _mm256_set1_ps(1.9875691500e-4);
959 exp_p1 = _mm256_set1_ps(1.3981999507e-3);
960 exp_p2 = _mm256_set1_ps(8.3334519073e-3);
961 exp_p3 = _mm256_set1_ps(4.1665795894e-2);
962 exp_p4 = _mm256_set1_ps(1.6666665459e-1);
963 exp_p5 = _mm256_set1_ps(5.0000001201e-1);
964
965 for (; number < eighthPoints; number++) {
966 // First compute the logarithm
967 aVal = _mm256_loadu_ps(aPtr);
968 bias = _mm256_set1_epi32(127);
969 leadingOne = _mm256_set1_ps(1.0f);
970 exp = _mm256_sub_epi32(
971 _mm256_srli_epi32(_mm256_and_si256(_mm256_castps_si256(aVal),
972 _mm256_set1_epi32(0x7f800000)),
973 23),
974 bias);
975 logarithm = _mm256_cvtepi32_ps(exp);
976
977 frac = _mm256_or_ps(
978 leadingOne,
979 _mm256_and_ps(aVal, _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffff))));
980
981#if POW_POLY_DEGREE == 6
982 mantissa = POLY5_AVX2_FMA(frac,
983 3.1157899f,
984 -3.3241990f,
985 2.5988452f,
986 -1.2315303f,
987 3.1821337e-1f,
988 -3.4436006e-2f);
989#elif POW_POLY_DEGREE == 5
990 mantissa = POLY4_AVX2_FMA(frac,
991 2.8882704548164776201f,
992 -2.52074962577807006663f,
993 1.48116647521213171641f,
994 -0.465725644288844778798f,
995 0.0596515482674574969533f);
996#elif POW_POLY_DEGREE == 4
997 mantissa = POLY3_AVX2_FMA(frac,
998 2.61761038894603480148f,
999 -1.75647175389045657003f,
1000 0.688243882994381274313f,
1001 -0.107254423828329604454f);
1002#elif POW_POLY_DEGREE == 3
1003 mantissa = POLY2_AVX2_FMA(frac,
1004 2.28330284476918490682f,
1005 -1.04913055217340124191f,
1006 0.204446009836232697516f);
1007#else
1008#error "POW_POLY_DEGREE must be 3, 4, 5, or 6"
1009#endif
1010
1011 logarithm = _mm256_fmadd_ps(mantissa, _mm256_sub_ps(frac, leadingOne), logarithm);
1012 logarithm = _mm256_mul_ps(logarithm, ln2);
1013
1014
1015 // Now calculate b*lna
1016 bVal = _mm256_loadu_ps(bPtr);
1017 bVal = _mm256_mul_ps(bVal, logarithm);
1018
1019 // Now compute exp(b*lna)
1020 bVal = _mm256_max_ps(_mm256_min_ps(bVal, exp_hi), exp_lo);
1021
1022 fx = _mm256_fmadd_ps(bVal, log2EF, half);
1023
1024 emm0 = _mm256_cvttps_epi32(fx);
1025 tmp = _mm256_cvtepi32_ps(emm0);
1026
1027 mask = _mm256_and_ps(_mm256_cmp_ps(tmp, fx, _CMP_GT_OS), one);
1028 fx = _mm256_sub_ps(tmp, mask);
1029
1030 tmp = _mm256_fnmadd_ps(fx, exp_C1, bVal);
1031 bVal = _mm256_fnmadd_ps(fx, exp_C2, tmp);
1032 z = _mm256_mul_ps(bVal, bVal);
1033
1034 y = _mm256_fmadd_ps(exp_p0, bVal, exp_p1);
1035 y = _mm256_fmadd_ps(y, bVal, exp_p2);
1036 y = _mm256_fmadd_ps(y, bVal, exp_p3);
1037 y = _mm256_fmadd_ps(y, bVal, exp_p4);
1038 y = _mm256_fmadd_ps(y, bVal, exp_p5);
1039 y = _mm256_fmadd_ps(y, z, bVal);
1040 y = _mm256_add_ps(y, one);
1041
1042 emm0 =
1043 _mm256_slli_epi32(_mm256_add_epi32(_mm256_cvttps_epi32(fx), pi32_0x7f), 23);
1044
1045 pow2n = _mm256_castsi256_ps(emm0);
1046 cVal = _mm256_mul_ps(y, pow2n);
1047
1048 _mm256_storeu_ps(cPtr, cVal);
1049
1050 aPtr += 8;
1051 bPtr += 8;
1052 cPtr += 8;
1053 }
1054
1055 number = eighthPoints * 8;
1056 for (; number < num_points; number++) {
1057 *cPtr++ = powf(*aPtr++, *bPtr++);
1058 }
1059}
1060
1061#endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for unaligned */
1062
1063#ifdef LV_HAVE_AVX2
1064#include <immintrin.h>
1065
1066#define POLY0_AVX2(x, c0) _mm256_set1_ps(c0)
1067#define POLY1_AVX2(x, c0, c1) \
1068 _mm256_add_ps(_mm256_mul_ps(POLY0_AVX2(x, c1), x), _mm256_set1_ps(c0))
1069#define POLY2_AVX2(x, c0, c1, c2) \
1070 _mm256_add_ps(_mm256_mul_ps(POLY1_AVX2(x, c1, c2), x), _mm256_set1_ps(c0))
1071#define POLY3_AVX2(x, c0, c1, c2, c3) \
1072 _mm256_add_ps(_mm256_mul_ps(POLY2_AVX2(x, c1, c2, c3), x), _mm256_set1_ps(c0))
1073#define POLY4_AVX2(x, c0, c1, c2, c3, c4) \
1074 _mm256_add_ps(_mm256_mul_ps(POLY3_AVX2(x, c1, c2, c3, c4), x), _mm256_set1_ps(c0))
1075#define POLY5_AVX2(x, c0, c1, c2, c3, c4, c5) \
1076 _mm256_add_ps(_mm256_mul_ps(POLY4_AVX2(x, c1, c2, c3, c4, c5), x), _mm256_set1_ps(c0))
1077
1078static inline void volk_32f_x2_pow_32f_u_avx2(float* cVector,
1079 const float* bVector,
1080 const float* aVector,
1081 unsigned int num_points)
1082{
1083 float* cPtr = cVector;
1084 const float* bPtr = bVector;
1085 const float* aPtr = aVector;
1086
1087 unsigned int number = 0;
1088 const unsigned int eighthPoints = num_points / 8;
1089
1090 __m256 aVal, bVal, cVal, logarithm, mantissa, frac, leadingOne;
1091 __m256 tmp, fx, mask, pow2n, z, y;
1092 __m256 one, exp_hi, exp_lo, ln2, log2EF, half, exp_C1, exp_C2;
1093 __m256 exp_p0, exp_p1, exp_p2, exp_p3, exp_p4, exp_p5;
1094 __m256i bias, exp, emm0, pi32_0x7f;
1095
1096 one = _mm256_set1_ps(1.0);
1097 exp_hi = _mm256_set1_ps(88.3762626647949);
1098 exp_lo = _mm256_set1_ps(-88.3762626647949);
1099 ln2 = _mm256_set1_ps(0.6931471805);
1100 log2EF = _mm256_set1_ps(1.44269504088896341);
1101 half = _mm256_set1_ps(0.5);
1102 exp_C1 = _mm256_set1_ps(0.693359375);
1103 exp_C2 = _mm256_set1_ps(-2.12194440e-4);
1104 pi32_0x7f = _mm256_set1_epi32(0x7f);
1105
1106 exp_p0 = _mm256_set1_ps(1.9875691500e-4);
1107 exp_p1 = _mm256_set1_ps(1.3981999507e-3);
1108 exp_p2 = _mm256_set1_ps(8.3334519073e-3);
1109 exp_p3 = _mm256_set1_ps(4.1665795894e-2);
1110 exp_p4 = _mm256_set1_ps(1.6666665459e-1);
1111 exp_p5 = _mm256_set1_ps(5.0000001201e-1);
1112
1113 for (; number < eighthPoints; number++) {
1114 // First compute the logarithm
1115 aVal = _mm256_loadu_ps(aPtr);
1116 bias = _mm256_set1_epi32(127);
1117 leadingOne = _mm256_set1_ps(1.0f);
1118 exp = _mm256_sub_epi32(
1119 _mm256_srli_epi32(_mm256_and_si256(_mm256_castps_si256(aVal),
1120 _mm256_set1_epi32(0x7f800000)),
1121 23),
1122 bias);
1123 logarithm = _mm256_cvtepi32_ps(exp);
1124
1125 frac = _mm256_or_ps(
1126 leadingOne,
1127 _mm256_and_ps(aVal, _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffff))));
1128
1129#if POW_POLY_DEGREE == 6
1130 mantissa = POLY5_AVX2(frac,
1131 3.1157899f,
1132 -3.3241990f,
1133 2.5988452f,
1134 -1.2315303f,
1135 3.1821337e-1f,
1136 -3.4436006e-2f);
1137#elif POW_POLY_DEGREE == 5
1138 mantissa = POLY4_AVX2(frac,
1139 2.8882704548164776201f,
1140 -2.52074962577807006663f,
1141 1.48116647521213171641f,
1142 -0.465725644288844778798f,
1143 0.0596515482674574969533f);
1144#elif POW_POLY_DEGREE == 4
1145 mantissa = POLY3_AVX2(frac,
1146 2.61761038894603480148f,
1147 -1.75647175389045657003f,
1148 0.688243882994381274313f,
1149 -0.107254423828329604454f);
1150#elif POW_POLY_DEGREE == 3
1151 mantissa = POLY2_AVX2(frac,
1152 2.28330284476918490682f,
1153 -1.04913055217340124191f,
1154 0.204446009836232697516f);
1155#else
1156#error "POW_POLY_DEGREE must be 3, 4, 5, or 6"
1157#endif
1158
1159 logarithm = _mm256_add_ps(
1160 _mm256_mul_ps(mantissa, _mm256_sub_ps(frac, leadingOne)), logarithm);
1161 logarithm = _mm256_mul_ps(logarithm, ln2);
1162
1163 // Now calculate b*lna
1164 bVal = _mm256_loadu_ps(bPtr);
1165 bVal = _mm256_mul_ps(bVal, logarithm);
1166
1167 // Now compute exp(b*lna)
1168 bVal = _mm256_max_ps(_mm256_min_ps(bVal, exp_hi), exp_lo);
1169
1170 fx = _mm256_add_ps(_mm256_mul_ps(bVal, log2EF), half);
1171
1172 emm0 = _mm256_cvttps_epi32(fx);
1173 tmp = _mm256_cvtepi32_ps(emm0);
1174
1175 mask = _mm256_and_ps(_mm256_cmp_ps(tmp, fx, _CMP_GT_OS), one);
1176 fx = _mm256_sub_ps(tmp, mask);
1177
1178 tmp = _mm256_sub_ps(bVal, _mm256_mul_ps(fx, exp_C1));
1179 bVal = _mm256_sub_ps(tmp, _mm256_mul_ps(fx, exp_C2));
1180 z = _mm256_mul_ps(bVal, bVal);
1181
1182 y = _mm256_add_ps(_mm256_mul_ps(exp_p0, bVal), exp_p1);
1183 y = _mm256_add_ps(_mm256_mul_ps(y, bVal), exp_p2);
1184 y = _mm256_add_ps(_mm256_mul_ps(y, bVal), exp_p3);
1185 y = _mm256_add_ps(_mm256_mul_ps(y, bVal), exp_p4);
1186 y = _mm256_add_ps(_mm256_mul_ps(y, bVal), exp_p5);
1187 y = _mm256_add_ps(_mm256_mul_ps(y, z), bVal);
1188 y = _mm256_add_ps(y, one);
1189
1190 emm0 =
1191 _mm256_slli_epi32(_mm256_add_epi32(_mm256_cvttps_epi32(fx), pi32_0x7f), 23);
1192
1193 pow2n = _mm256_castsi256_ps(emm0);
1194 cVal = _mm256_mul_ps(y, pow2n);
1195
1196 _mm256_storeu_ps(cPtr, cVal);
1197
1198 aPtr += 8;
1199 bPtr += 8;
1200 cPtr += 8;
1201 }
1202
1203 number = eighthPoints * 8;
1204 for (; number < num_points; number++) {
1205 *cPtr++ = powf(*aPtr++, *bPtr++);
1206 }
1207}
1208
1209#endif /* LV_HAVE_AVX2 for unaligned */
1210
1211#ifdef LV_HAVE_RVV
1212#include <riscv_vector.h>
1213
1214static inline void volk_32f_x2_pow_32f_rvv(float* cVector,
1215 const float* bVector,
1216 const float* aVector,
1217 unsigned int num_points)
1218{
1219 size_t vlmax = __riscv_vsetvlmax_e32m1();
1220
1221#if POW_POLY_DEGREE == 6
1222 const vfloat32m1_t cl5 = __riscv_vfmv_v_f_f32m1(3.1157899f, vlmax);
1223 const vfloat32m1_t cl4 = __riscv_vfmv_v_f_f32m1(-3.3241990f, vlmax);
1224 const vfloat32m1_t cl3 = __riscv_vfmv_v_f_f32m1(2.5988452f, vlmax);
1225 const vfloat32m1_t cl2 = __riscv_vfmv_v_f_f32m1(-1.2315303f, vlmax);
1226 const vfloat32m1_t cl1 = __riscv_vfmv_v_f_f32m1(3.1821337e-1f, vlmax);
1227 const vfloat32m1_t cl0 = __riscv_vfmv_v_f_f32m1(-3.4436006e-2f, vlmax);
1228#elif POW_POLY_DEGREE == 5
1229 const vfloat32m1_t cl4 = __riscv_vfmv_v_f_f32m1(2.8882704548164776201f, vlmax);
1230 const vfloat32m1_t cl3 = __riscv_vfmv_v_f_f32m1(-2.52074962577807006663f, vlmax);
1231 const vfloat32m1_t cl2 = __riscv_vfmv_v_f_f32m1(1.48116647521213171641f, vlmax);
1232 const vfloat32m1_t cl1 = __riscv_vfmv_v_f_f32m1(-0.465725644288844778798f, vlmax);
1233 const vfloat32m1_t cl0 = __riscv_vfmv_v_f_f32m1(0.0596515482674574969533f, vlmax);
1234#elif POW_POLY_DEGREE == 4
1235 const vfloat32m1_t cl3 = __riscv_vfmv_v_f_f32m1(2.61761038894603480148f, vlmax);
1236 const vfloat32m1_t cl2 = __riscv_vfmv_v_f_f32m1(-1.75647175389045657003f, vlmax);
1237 const vfloat32m1_t cl1 = __riscv_vfmv_v_f_f32m1(0.688243882994381274313f, vlmax);
1238 const vfloat32m1_t cl0 = __riscv_vfmv_v_f_f32m1(-0.107254423828329604454f, vlmax);
1239#elif POW_POLY_DEGREE == 3
1240 const vfloat32m1_t cl2 = __riscv_vfmv_v_f_f32m1(2.28330284476918490682f, vlmax);
1241 const vfloat32m1_t cl1 = __riscv_vfmv_v_f_f32m1(-1.04913055217340124191f, vlmax);
1242 const vfloat32m1_t cl0 = __riscv_vfmv_v_f_f32m1(0.204446009836232697516f, vlmax);
1243#else
1244#error "POW_POLY_DEGREE must be 3, 4, 5, or 6"
1245#endif
1246
1247 const vfloat32m1_t exp_hi = __riscv_vfmv_v_f_f32m1(88.376259f, vlmax);
1248 const vfloat32m1_t exp_lo = __riscv_vfmv_v_f_f32m1(-88.376259f, vlmax);
1249 const vfloat32m1_t log2EF = __riscv_vfmv_v_f_f32m1(1.442695f, vlmax);
1250 const vfloat32m1_t exp_C1 = __riscv_vfmv_v_f_f32m1(-0.6933594f, vlmax);
1251 const vfloat32m1_t exp_C2 = __riscv_vfmv_v_f_f32m1(0.000212194f, vlmax);
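    // exp_C1/exp_C2 carry the opposite sign of the x86 kernels because the range
    // reduction below accumulates with vfmacc (vb += C*fx) instead of subtracting.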
1252 const vfloat32m1_t cf1 = __riscv_vfmv_v_f_f32m1(1.0f, vlmax);
1253 const vfloat32m1_t cf1o2 = __riscv_vfmv_v_f_f32m1(0.5f, vlmax);
1254 const vfloat32m1_t ln2 = __riscv_vfmv_v_f_f32m1(0.6931471805f, vlmax);
1255
1256 const vfloat32m1_t ce0 = __riscv_vfmv_v_f_f32m1(1.9875691500e-4, vlmax);
1257 const vfloat32m1_t ce1 = __riscv_vfmv_v_f_f32m1(1.3981999507e-3, vlmax);
1258 const vfloat32m1_t ce2 = __riscv_vfmv_v_f_f32m1(8.3334519073e-3, vlmax);
1259 const vfloat32m1_t ce3 = __riscv_vfmv_v_f_f32m1(4.1665795894e-2, vlmax);
1260 const vfloat32m1_t ce4 = __riscv_vfmv_v_f_f32m1(1.6666665459e-1, vlmax);
1261 const vfloat32m1_t ce5 = __riscv_vfmv_v_f_f32m1(5.0000001201e-1, vlmax);
1262
1263 const vint32m1_t m1 = __riscv_vreinterpret_i32m1(cf1);
1264 const vint32m1_t m2 = __riscv_vmv_v_x_i32m1(0x7FFFFF, vlmax);
1265 const vint32m1_t c127 = __riscv_vmv_v_x_i32m1(127, vlmax);
1266
1267 size_t n = num_points;
1268 for (size_t vl; n > 0; n -= vl, aVector += vl, bVector += vl, cVector += vl) {
1269 vl = __riscv_vsetvl_e32m1(n);
1270 vfloat32m1_t va = __riscv_vle32_v_f32m1(aVector, vl);
1271 vfloat32m1_t log;
1272
1273 { /* log(a) */
1274 vfloat32m1_t a = __riscv_vfabs(va, vl);
1275 vfloat32m1_t exp = __riscv_vfcvt_f(
1276 __riscv_vsub(
1277 __riscv_vsra(__riscv_vreinterpret_i32m1(a), 23, vl), c127, vl),
1278 vl);
1279 vfloat32m1_t frac = __riscv_vreinterpret_f32m1(__riscv_vor(
1280 __riscv_vand(__riscv_vreinterpret_i32m1(va), m2, vl), m1, vl));
1281
1282 vfloat32m1_t mant = cl0;
1283 mant = __riscv_vfmadd(mant, frac, cl1, vl);
1284 mant = __riscv_vfmadd(mant, frac, cl2, vl);
1285#if POW_POLY_DEGREE >= 4
1286 mant = __riscv_vfmadd(mant, frac, cl3, vl);
1287#if POW_POLY_DEGREE >= 5
1288 mant = __riscv_vfmadd(mant, frac, cl4, vl);
1289#if POW_POLY_DEGREE >= 6
1290 mant = __riscv_vfmadd(mant, frac, cl5, vl);
1291#endif
1292#endif
1293#endif
1294 log = __riscv_vfmacc(exp, mant, __riscv_vfsub(frac, cf1, vl), vl);
1295 log = __riscv_vfmul(log, ln2, vl);
1296 }
1297
1298 vfloat32m1_t vb = __riscv_vle32_v_f32m1(bVector, vl);
1299 vb = __riscv_vfmul(vb, log, vl); /* b*log(a) */
1300 vfloat32m1_t exp;
1301
1302 { /* exp(b*log(a)) */
1303 vb = __riscv_vfmin(vb, exp_hi, vl);
1304 vb = __riscv_vfmax(vb, exp_lo, vl);
1305 vfloat32m1_t fx = __riscv_vfmadd(vb, log2EF, cf1o2, vl);
1306
1307 vfloat32m1_t rtz = __riscv_vfcvt_f(__riscv_vfcvt_rtz_x(fx, vl), vl);
1308 fx = __riscv_vfsub_mu(__riscv_vmfgt(rtz, fx, vl), rtz, rtz, cf1, vl);
1309 vb = __riscv_vfmacc(vb, exp_C1, fx, vl);
1310 vb = __riscv_vfmacc(vb, exp_C2, fx, vl);
1311 vfloat32m1_t vv = __riscv_vfmul(vb, vb, vl);
1312
1313 vfloat32m1_t y = ce0;
1314 y = __riscv_vfmadd(y, vb, ce1, vl);
1315 y = __riscv_vfmadd(y, vb, ce2, vl);
1316 y = __riscv_vfmadd(y, vb, ce3, vl);
1317 y = __riscv_vfmadd(y, vb, ce4, vl);
1318 y = __riscv_vfmadd(y, vb, ce5, vl);
1319 y = __riscv_vfmadd(y, vv, vb, vl);
1320 y = __riscv_vfadd(y, cf1, vl);
1321
1322 vfloat32m1_t pow2n = __riscv_vreinterpret_f32m1(__riscv_vsll(
1323 __riscv_vadd(__riscv_vfcvt_rtz_x(fx, vl), c127, vl), 23, vl));
1324
1325 exp = __riscv_vfmul(y, pow2n, vl);
1326 }
1327
1328 __riscv_vse32(cVector, exp, vl);
1329 }
1330}
1331
1332#endif /*LV_HAVE_RVV*/
1333
1334#endif /* INCLUDED_volk_32f_x2_pow_32f_u_H */