Vector Optimized Library of Kernels (VOLK) 3.3.0
Architecture-tuned implementations of math kernels
volk_32f_tanh_32f.h — tanh of a float vector (generic, SSE, AVX, AVX+FMA, NEON, NEONv8, RVV kernels)
1/* -*- c++ -*- */
2/*
3 * Copyright 2014 Free Software Foundation, Inc.
4 *
5 * This file is part of VOLK
6 *
7 * SPDX-License-Identifier: LGPL-3.0-or-later
8 */
9
54
55#ifndef INCLUDED_volk_32f_tanh_32f_a_H
56#define INCLUDED_volk_32f_tanh_32f_a_H
57
58#include <inttypes.h>
59#include <math.h>
60#include <stdio.h>
61#include <string.h>
62
63
64#ifdef LV_HAVE_GENERIC
65
66static inline void
67volk_32f_tanh_32f_generic(float* cVector, const float* aVector, unsigned int num_points)
68{
69 unsigned int number = 0;
70 float* cPtr = cVector;
71 const float* aPtr = aVector;
72 for (; number < num_points; number++) {
73 *cPtr++ = tanhf(*aPtr++);
74 }
75}
76
77#endif /* LV_HAVE_GENERIC */
78
79
80#ifdef LV_HAVE_GENERIC
81
/*!
 * Computes tanh via a 7th/6th-order rational (continued-fraction style)
 * approximation, saturating to +/-1 for |x| > ~4.97 where the rational
 * form would drift away from tanh.
 *
 * Bug fix: the input pointer is now advanced on every iteration.
 * Previously it was only advanced in the polynomial branch, so a
 * saturated input (|x| > 4.97) stalled the read cursor and made every
 * subsequent output wrong.
 *
 * \param cVector    Output buffer of num_points floats.
 * \param aVector    Input buffer of num_points floats.
 * \param num_points Number of elements to process.
 */
static inline void
volk_32f_tanh_32f_series(float* cVector, const float* aVector, unsigned int num_points)
{
    float* cPtr = cVector;
    const float* aPtr = aVector;
    for (unsigned int number = 0; number < num_points; number++) {
        const float x = *aPtr++; /* always consume one input per output */
        if (x > 4.97f) {
            *cPtr++ = 1.0f;
        } else if (x <= -4.97f) {
            *cPtr++ = -1.0f;
        } else {
            const float x2 = x * x;
            /* Numerator and denominator in Horner form. */
            const float a = x * (135135.0f + x2 * (17325.0f + x2 * (378.0f + x2)));
            const float b = 135135.0f + x2 * (62370.0f + x2 * (3150.0f + x2 * 28.0f));
            *cPtr++ = a / b;
        }
    }
}
101
102#endif /* LV_HAVE_GENERIC */
103
104
105#ifdef LV_HAVE_SSE
106#include <xmmintrin.h>
107
/*!
 * SSE implementation (aligned loads/stores) of the rational tanh
 * approximation: 4 floats per iteration, scalar tail via the series
 * kernel.
 *
 * \param cVector    Output buffer (16-byte aligned) of num_points floats.
 * \param aVector    Input buffer (16-byte aligned) of num_points floats.
 * \param num_points Number of elements to process.
 */
static inline void
volk_32f_tanh_32f_a_sse(float* cVector, const float* aVector, unsigned int num_points)
{
    const unsigned int quarterPoints = num_points / 4;

    float* outPtr = cVector;
    const float* inPtr = aVector;

    /* Rational-approximation coefficients (numerator and denominator). */
    const __m128 c135135 = _mm_set_ps1(135135.0f);
    const __m128 c17325 = _mm_set_ps1(17325.0f);
    const __m128 c378 = _mm_set_ps1(378.0f);
    const __m128 c62370 = _mm_set_ps1(62370.0f);
    const __m128 c3150 = _mm_set_ps1(3150.0f);
    const __m128 c28 = _mm_set_ps1(28.0f);

    for (unsigned int i = 0; i < quarterPoints; i++) {
        const __m128 x = _mm_load_ps(inPtr);
        const __m128 x2 = _mm_mul_ps(x, x);

        /* num = x * (135135 + x2*(17325 + x2*(378 + x2))), Horner form */
        __m128 num = _mm_add_ps(c378, x2);
        num = _mm_add_ps(c17325, _mm_mul_ps(x2, num));
        num = _mm_add_ps(c135135, _mm_mul_ps(x2, num));
        num = _mm_mul_ps(x, num);

        /* den = 135135 + x2*(62370 + x2*(3150 + x2*28)) */
        __m128 den = _mm_add_ps(c3150, _mm_mul_ps(x2, c28));
        den = _mm_add_ps(c62370, _mm_mul_ps(x2, den));
        den = _mm_add_ps(c135135, _mm_mul_ps(x2, den));

        _mm_store_ps(outPtr, _mm_div_ps(num, den)); /* tanh(x) ~ num/den */

        inPtr += 4;
        outPtr += 4;
    }

    /* Scalar tail: final num_points % 4 elements. */
    volk_32f_tanh_32f_series(outPtr, inPtr, num_points - quarterPoints * 4);
}
153#endif /* LV_HAVE_SSE */
154
155
156#ifdef LV_HAVE_AVX
157#include <immintrin.h>
158
/*!
 * AVX implementation (aligned loads/stores) of the rational tanh
 * approximation: 8 floats per iteration, scalar tail via the series
 * kernel.
 *
 * \param cVector    Output buffer (32-byte aligned) of num_points floats.
 * \param aVector    Input buffer (32-byte aligned) of num_points floats.
 * \param num_points Number of elements to process.
 */
static inline void
volk_32f_tanh_32f_a_avx(float* cVector, const float* aVector, unsigned int num_points)
{
    const unsigned int eighthPoints = num_points / 8;

    float* outPtr = cVector;
    const float* inPtr = aVector;

    /* Rational-approximation coefficients (numerator and denominator). */
    const __m256 c135135 = _mm256_set1_ps(135135.0f);
    const __m256 c17325 = _mm256_set1_ps(17325.0f);
    const __m256 c378 = _mm256_set1_ps(378.0f);
    const __m256 c62370 = _mm256_set1_ps(62370.0f);
    const __m256 c3150 = _mm256_set1_ps(3150.0f);
    const __m256 c28 = _mm256_set1_ps(28.0f);

    for (unsigned int i = 0; i < eighthPoints; i++) {
        const __m256 x = _mm256_load_ps(inPtr);
        const __m256 x2 = _mm256_mul_ps(x, x);

        /* num = x * (135135 + x2*(17325 + x2*(378 + x2))), Horner form */
        __m256 num = _mm256_add_ps(c378, x2);
        num = _mm256_add_ps(c17325, _mm256_mul_ps(x2, num));
        num = _mm256_add_ps(c135135, _mm256_mul_ps(x2, num));
        num = _mm256_mul_ps(x, num);

        /* den = 135135 + x2*(62370 + x2*(3150 + x2*28)) */
        __m256 den = _mm256_add_ps(c3150, _mm256_mul_ps(x2, c28));
        den = _mm256_add_ps(c62370, _mm256_mul_ps(x2, den));
        den = _mm256_add_ps(c135135, _mm256_mul_ps(x2, den));

        _mm256_store_ps(outPtr, _mm256_div_ps(num, den)); /* tanh(x) ~ num/den */

        inPtr += 8;
        outPtr += 8;
    }

    /* Scalar tail: final num_points % 8 elements. */
    volk_32f_tanh_32f_series(outPtr, inPtr, num_points - eighthPoints * 8);
}
208#endif /* LV_HAVE_AVX */
209
210#if LV_HAVE_AVX && LV_HAVE_FMA
211#include <immintrin.h>
212
/*!
 * AVX+FMA implementation (aligned loads/stores) of the rational tanh
 * approximation: 8 floats per iteration using fused multiply-adds,
 * scalar tail via the series kernel.
 *
 * \param cVector    Output buffer (32-byte aligned) of num_points floats.
 * \param aVector    Input buffer (32-byte aligned) of num_points floats.
 * \param num_points Number of elements to process.
 */
static inline void
volk_32f_tanh_32f_a_avx_fma(float* cVector, const float* aVector, unsigned int num_points)
{
    const unsigned int eighthPoints = num_points / 8;

    float* outPtr = cVector;
    const float* inPtr = aVector;

    /* Rational-approximation coefficients (numerator and denominator). */
    const __m256 c135135 = _mm256_set1_ps(135135.0f);
    const __m256 c17325 = _mm256_set1_ps(17325.0f);
    const __m256 c378 = _mm256_set1_ps(378.0f);
    const __m256 c62370 = _mm256_set1_ps(62370.0f);
    const __m256 c3150 = _mm256_set1_ps(3150.0f);
    const __m256 c28 = _mm256_set1_ps(28.0f);

    for (unsigned int i = 0; i < eighthPoints; i++) {
        const __m256 x = _mm256_load_ps(inPtr);
        const __m256 x2 = _mm256_mul_ps(x, x);

        /* num = x * (135135 + x2*(17325 + x2*(378 + x2))), FMA Horner form */
        __m256 num = _mm256_add_ps(c378, x2);
        num = _mm256_fmadd_ps(x2, num, c17325);
        num = _mm256_fmadd_ps(x2, num, c135135);
        num = _mm256_mul_ps(x, num);

        /* den = 135135 + x2*(62370 + x2*(3150 + x2*28)), FMA Horner form */
        __m256 den = _mm256_fmadd_ps(x2, c28, c3150);
        den = _mm256_fmadd_ps(x2, den, c62370);
        den = _mm256_fmadd_ps(x2, den, c135135);

        _mm256_store_ps(outPtr, _mm256_div_ps(num, den)); /* tanh(x) ~ num/den */

        inPtr += 8;
        outPtr += 8;
    }

    /* Scalar tail: final num_points % 8 elements. */
    volk_32f_tanh_32f_series(outPtr, inPtr, num_points - eighthPoints * 8);
}
252#endif /* LV_HAVE_AVX && LV_HAVE_FMA */
253
254#ifdef LV_HAVE_NEON
255#include <arm_neon.h>
256
/*!
 * NEON implementation of the rational tanh approximation: 4 floats per
 * iteration.  The division is done with a reciprocal estimate refined by
 * two Newton-Raphson steps; scalar tail via the series kernel.
 *
 * \param cVector    Output buffer of num_points floats.
 * \param aVector    Input buffer of num_points floats.
 * \param num_points Number of elements to process.
 */
static inline void
volk_32f_tanh_32f_neon(float* cVector, const float* aVector, unsigned int num_points)
{
    const unsigned int quarterPoints = num_points / 4;

    float* outPtr = cVector;
    const float* inPtr = aVector;

    /* Rational-approximation coefficients (numerator and denominator). */
    const float32x4_t c135135 = vdupq_n_f32(135135.0f);
    const float32x4_t c17325 = vdupq_n_f32(17325.0f);
    const float32x4_t c378 = vdupq_n_f32(378.0f);
    const float32x4_t c62370 = vdupq_n_f32(62370.0f);
    const float32x4_t c3150 = vdupq_n_f32(3150.0f);
    const float32x4_t c28 = vdupq_n_f32(28.0f);

    for (unsigned int i = 0; i < quarterPoints; i++) {
        const float32x4_t x = vld1q_f32(inPtr);
        const float32x4_t x2 = vmulq_f32(x, x);

        /* num = x * (135135 + x2*(17325 + x2*(378 + x2))) */
        float32x4_t num = vaddq_f32(c378, x2);
        num = vmlaq_f32(c17325, x2, num);
        num = vmlaq_f32(c135135, x2, num);
        num = vmulq_f32(x, num);

        /* den = 135135 + x2*(62370 + x2*(3150 + x2*28)) */
        float32x4_t den = vmlaq_f32(c3150, x2, c28);
        den = vmlaq_f32(c62370, x2, den);
        den = vmlaq_f32(c135135, x2, den);

        /* num/den via reciprocal estimate + two Newton-Raphson refinements */
        float32x4_t rcp = vrecpeq_f32(den);
        rcp = vmulq_f32(rcp, vrecpsq_f32(den, rcp));
        rcp = vmulq_f32(rcp, vrecpsq_f32(den, rcp));

        vst1q_f32(outPtr, vmulq_f32(num, rcp));
        inPtr += 4;
        outPtr += 4;
    }

    /* Scalar tail: final num_points % 4 elements. */
    volk_32f_tanh_32f_series(outPtr, inPtr, num_points - quarterPoints * 4);
}
303
304#endif /* LV_HAVE_NEON */
305
306#ifdef LV_HAVE_NEONV8
307#include <arm_neon.h>
308
/*!
 * NEONv8 (AArch64) implementation of the rational tanh approximation:
 * 8 floats per iteration (two unrolled 4-lane streams) using FMA and
 * native vector division; scalar tail via the series kernel.
 *
 * \param cVector    Output buffer of num_points floats.
 * \param aVector    Input buffer of num_points floats.
 * \param num_points Number of elements to process.
 */
static inline void
volk_32f_tanh_32f_neonv8(float* cVector, const float* aVector, unsigned int num_points)
{
    const unsigned int eighthPoints = num_points / 8;

    float* outPtr = cVector;
    const float* inPtr = aVector;

    /* Rational-approximation coefficients (numerator and denominator). */
    const float32x4_t c135135 = vdupq_n_f32(135135.0f);
    const float32x4_t c17325 = vdupq_n_f32(17325.0f);
    const float32x4_t c378 = vdupq_n_f32(378.0f);
    const float32x4_t c62370 = vdupq_n_f32(62370.0f);
    const float32x4_t c3150 = vdupq_n_f32(3150.0f);
    const float32x4_t c28 = vdupq_n_f32(28.0f);

    for (unsigned int i = 0; i < eighthPoints; i++) {
        __VOLK_PREFETCH(inPtr + 16);

        const float32x4_t xLo = vld1q_f32(inPtr);
        const float32x4_t xHi = vld1q_f32(inPtr + 4);
        const float32x4_t x2Lo = vmulq_f32(xLo, xLo);
        const float32x4_t x2Hi = vmulq_f32(xHi, xHi);

        /* num = x * (135135 + x2*(17325 + x2*(378 + x2))), FMA Horner form */
        float32x4_t numLo = vaddq_f32(c378, x2Lo);
        float32x4_t numHi = vaddq_f32(c378, x2Hi);
        numLo = vfmaq_f32(c17325, x2Lo, numLo);
        numHi = vfmaq_f32(c17325, x2Hi, numHi);
        numLo = vfmaq_f32(c135135, x2Lo, numLo);
        numHi = vfmaq_f32(c135135, x2Hi, numHi);
        numLo = vmulq_f32(xLo, numLo);
        numHi = vmulq_f32(xHi, numHi);

        /* den = 135135 + x2*(62370 + x2*(3150 + x2*28)), FMA Horner form */
        float32x4_t denLo = vfmaq_f32(c3150, x2Lo, c28);
        float32x4_t denHi = vfmaq_f32(c3150, x2Hi, c28);
        denLo = vfmaq_f32(c62370, x2Lo, denLo);
        denHi = vfmaq_f32(c62370, x2Hi, denHi);
        denLo = vfmaq_f32(c135135, x2Lo, denLo);
        denHi = vfmaq_f32(c135135, x2Hi, denHi);

        /* AArch64 provides exact vector division. */
        vst1q_f32(outPtr, vdivq_f32(numLo, denLo));
        vst1q_f32(outPtr + 4, vdivq_f32(numHi, denHi));
        inPtr += 8;
        outPtr += 8;
    }

    /* Scalar tail: final num_points % 8 elements. */
    volk_32f_tanh_32f_series(outPtr, inPtr, num_points - eighthPoints * 8);
}
365
366#endif /* LV_HAVE_NEONV8 */
367
368#endif /* INCLUDED_volk_32f_tanh_32f_a_H */
369
370
371#ifndef INCLUDED_volk_32f_tanh_32f_u_H
372#define INCLUDED_volk_32f_tanh_32f_u_H
373
374#include <inttypes.h>
375#include <math.h>
376#include <stdio.h>
377#include <string.h>
378
379
380#ifdef LV_HAVE_SSE
381#include <xmmintrin.h>
382
/*!
 * SSE implementation (unaligned loads/stores) of the rational tanh
 * approximation: 4 floats per iteration, scalar tail via the series
 * kernel.
 *
 * \param cVector    Output buffer of num_points floats (no alignment required).
 * \param aVector    Input buffer of num_points floats (no alignment required).
 * \param num_points Number of elements to process.
 */
static inline void
volk_32f_tanh_32f_u_sse(float* cVector, const float* aVector, unsigned int num_points)
{
    const unsigned int quarterPoints = num_points / 4;

    float* outPtr = cVector;
    const float* inPtr = aVector;

    /* Rational-approximation coefficients (numerator and denominator). */
    const __m128 c135135 = _mm_set_ps1(135135.0f);
    const __m128 c17325 = _mm_set_ps1(17325.0f);
    const __m128 c378 = _mm_set_ps1(378.0f);
    const __m128 c62370 = _mm_set_ps1(62370.0f);
    const __m128 c3150 = _mm_set_ps1(3150.0f);
    const __m128 c28 = _mm_set_ps1(28.0f);

    for (unsigned int i = 0; i < quarterPoints; i++) {
        const __m128 x = _mm_loadu_ps(inPtr);
        const __m128 x2 = _mm_mul_ps(x, x);

        /* num = x * (135135 + x2*(17325 + x2*(378 + x2))), Horner form */
        __m128 num = _mm_add_ps(c378, x2);
        num = _mm_add_ps(c17325, _mm_mul_ps(x2, num));
        num = _mm_add_ps(c135135, _mm_mul_ps(x2, num));
        num = _mm_mul_ps(x, num);

        /* den = 135135 + x2*(62370 + x2*(3150 + x2*28)) */
        __m128 den = _mm_add_ps(c3150, _mm_mul_ps(x2, c28));
        den = _mm_add_ps(c62370, _mm_mul_ps(x2, den));
        den = _mm_add_ps(c135135, _mm_mul_ps(x2, den));

        _mm_storeu_ps(outPtr, _mm_div_ps(num, den)); /* tanh(x) ~ num/den */

        inPtr += 4;
        outPtr += 4;
    }

    /* Scalar tail: final num_points % 4 elements. */
    volk_32f_tanh_32f_series(outPtr, inPtr, num_points - quarterPoints * 4);
}
428#endif /* LV_HAVE_SSE */
429
430
431#ifdef LV_HAVE_AVX
432#include <immintrin.h>
433
/*!
 * AVX implementation (unaligned loads/stores) of the rational tanh
 * approximation: 8 floats per iteration, scalar tail via the series
 * kernel.
 *
 * \param cVector    Output buffer of num_points floats (no alignment required).
 * \param aVector    Input buffer of num_points floats (no alignment required).
 * \param num_points Number of elements to process.
 */
static inline void
volk_32f_tanh_32f_u_avx(float* cVector, const float* aVector, unsigned int num_points)
{
    const unsigned int eighthPoints = num_points / 8;

    float* outPtr = cVector;
    const float* inPtr = aVector;

    /* Rational-approximation coefficients (numerator and denominator). */
    const __m256 c135135 = _mm256_set1_ps(135135.0f);
    const __m256 c17325 = _mm256_set1_ps(17325.0f);
    const __m256 c378 = _mm256_set1_ps(378.0f);
    const __m256 c62370 = _mm256_set1_ps(62370.0f);
    const __m256 c3150 = _mm256_set1_ps(3150.0f);
    const __m256 c28 = _mm256_set1_ps(28.0f);

    for (unsigned int i = 0; i < eighthPoints; i++) {
        const __m256 x = _mm256_loadu_ps(inPtr);
        const __m256 x2 = _mm256_mul_ps(x, x);

        /* num = x * (135135 + x2*(17325 + x2*(378 + x2))), Horner form */
        __m256 num = _mm256_add_ps(c378, x2);
        num = _mm256_add_ps(c17325, _mm256_mul_ps(x2, num));
        num = _mm256_add_ps(c135135, _mm256_mul_ps(x2, num));
        num = _mm256_mul_ps(x, num);

        /* den = 135135 + x2*(62370 + x2*(3150 + x2*28)) */
        __m256 den = _mm256_add_ps(c3150, _mm256_mul_ps(x2, c28));
        den = _mm256_add_ps(c62370, _mm256_mul_ps(x2, den));
        den = _mm256_add_ps(c135135, _mm256_mul_ps(x2, den));

        _mm256_storeu_ps(outPtr, _mm256_div_ps(num, den)); /* tanh(x) ~ num/den */

        inPtr += 8;
        outPtr += 8;
    }

    /* Scalar tail: final num_points % 8 elements. */
    volk_32f_tanh_32f_series(outPtr, inPtr, num_points - eighthPoints * 8);
}
483#endif /* LV_HAVE_AVX */
484
485#if LV_HAVE_AVX && LV_HAVE_FMA
486#include <immintrin.h>
487
/*!
 * AVX+FMA implementation (unaligned loads/stores) of the rational tanh
 * approximation: 8 floats per iteration using fused multiply-adds,
 * scalar tail via the series kernel.
 *
 * \param cVector    Output buffer of num_points floats (no alignment required).
 * \param aVector    Input buffer of num_points floats (no alignment required).
 * \param num_points Number of elements to process.
 */
static inline void
volk_32f_tanh_32f_u_avx_fma(float* cVector, const float* aVector, unsigned int num_points)
{
    const unsigned int eighthPoints = num_points / 8;

    float* outPtr = cVector;
    const float* inPtr = aVector;

    /* Rational-approximation coefficients (numerator and denominator). */
    const __m256 c135135 = _mm256_set1_ps(135135.0f);
    const __m256 c17325 = _mm256_set1_ps(17325.0f);
    const __m256 c378 = _mm256_set1_ps(378.0f);
    const __m256 c62370 = _mm256_set1_ps(62370.0f);
    const __m256 c3150 = _mm256_set1_ps(3150.0f);
    const __m256 c28 = _mm256_set1_ps(28.0f);

    for (unsigned int i = 0; i < eighthPoints; i++) {
        const __m256 x = _mm256_loadu_ps(inPtr);
        const __m256 x2 = _mm256_mul_ps(x, x);

        /* num = x * (135135 + x2*(17325 + x2*(378 + x2))), FMA Horner form */
        __m256 num = _mm256_add_ps(c378, x2);
        num = _mm256_fmadd_ps(x2, num, c17325);
        num = _mm256_fmadd_ps(x2, num, c135135);
        num = _mm256_mul_ps(x, num);

        /* den = 135135 + x2*(62370 + x2*(3150 + x2*28)), FMA Horner form */
        __m256 den = _mm256_fmadd_ps(x2, c28, c3150);
        den = _mm256_fmadd_ps(x2, den, c62370);
        den = _mm256_fmadd_ps(x2, den, c135135);

        _mm256_storeu_ps(outPtr, _mm256_div_ps(num, den)); /* tanh(x) ~ num/den */

        inPtr += 8;
        outPtr += 8;
    }

    /* Scalar tail: final num_points % 8 elements. */
    volk_32f_tanh_32f_series(outPtr, inPtr, num_points - eighthPoints * 8);
}
527#endif /* LV_HAVE_AVX && LV_HAVE_FMA */
528
529#ifdef LV_HAVE_RVV
530#include <riscv_vector.h>
531
/*!
 * RISC-V Vector implementation of the rational tanh approximation.
 * Strip-mines the whole input with vsetvl, so no scalar tail loop is
 * needed.
 *
 * \param bVector    Output buffer of num_points floats.
 * \param aVector    Input buffer of num_points floats.
 * \param num_points Number of elements to process.
 */
static inline void
volk_32f_tanh_32f_rvv(float* bVector, const float* aVector, unsigned int num_points)
{
    const size_t vlmax = __riscv_vsetvlmax_e32m2();

    /* Rational-approximation coefficients (numerator and denominator). */
    const vfloat32m2_t k135135 = __riscv_vfmv_v_f_f32m2(135135.0f, vlmax);
    const vfloat32m2_t k17325 = __riscv_vfmv_v_f_f32m2(17325.0f, vlmax);
    const vfloat32m2_t k378 = __riscv_vfmv_v_f_f32m2(378.0f, vlmax);
    const vfloat32m2_t k62370 = __riscv_vfmv_v_f_f32m2(62370.0f, vlmax);
    const vfloat32m2_t k3150 = __riscv_vfmv_v_f_f32m2(3150.0f, vlmax);
    const vfloat32m2_t k28 = __riscv_vfmv_v_f_f32m2(28.0f, vlmax);

    size_t remaining = num_points;
    while (remaining > 0) {
        const size_t vl = __riscv_vsetvl_e32m2(remaining);
        const vfloat32m2_t x = __riscv_vle32_v_f32m2(aVector, vl);
        const vfloat32m2_t x2 = __riscv_vfmul(x, x, vl);

        /* num = x * (135135 + x2*(17325 + x2*(378 + x2))), FMA Horner form */
        vfloat32m2_t num = __riscv_vfadd(x2, k378, vl);
        num = __riscv_vfmadd(num, x2, k17325, vl);
        num = __riscv_vfmadd(num, x2, k135135, vl);
        num = __riscv_vfmul(num, x, vl);

        /* den = 135135 + x2*(62370 + x2*(3150 + x2*28)), FMA Horner form */
        vfloat32m2_t den = __riscv_vfmadd(k28, x2, k3150, vl);
        den = __riscv_vfmadd(den, x2, k62370, vl);
        den = __riscv_vfmadd(den, x2, k135135, vl);

        __riscv_vse32(bVector, __riscv_vfdiv(num, den, vl), vl);

        remaining -= vl;
        aVector += vl;
        bVector += vl;
    }
}
561#endif /*LV_HAVE_RVV*/
562
563#endif /* INCLUDED_volk_32f_tanh_32f_u_H */