68#ifndef INCLUDE_VOLK_VOLK_NEON_INTRINSICS_H_
69#define INCLUDE_VOLK_VOLK_NEON_INTRINSICS_H_
/* Magnitude squared |z|^2 = I^2 + Q^2 for four complex floats at once.
 * cmplxValue.val[0] holds the four real parts, val[1] the four imaginary
 * parts (deinterleaved vld2 layout).
 * NOTE(review): signature, braces and return were reconstructed from the
 * upstream VOLK header; the pasted excerpt had them dropped and carried
 * fused line-number prefixes. */
static inline float32x4_t _vmagnitudesquaredq_f32(float32x4x2_t cmplxValue)
{
    float32x4_t iValue, qValue, result;
    iValue = vmulq_f32(cmplxValue.val[0], cmplxValue.val[0]); // square real parts
    qValue = vmulq_f32(cmplxValue.val[1], cmplxValue.val[1]); // square imag parts
    result = vaddq_f32(iValue, qValue);                       // I^2 + Q^2
    return result;
}
/* NOTE(review): fragment of a vectorized reciprocal-square-root helper. The
 * enclosing signature and closing brace are missing from this mangled
 * excerpt, and each line carries a fused line-number prefix from a bad
 * paste — verify the function name against upstream before relying on it. */
/* Initial 1/sqrt(x) estimate from the hardware table lookup. */
87 float32x4_t x0 = vrsqrteq_f32(x);
/* Two Newton-Raphson refinement steps: x_{n+1} = x_n * vrsqrts(x * x_n, x_n). */
90 float32x4_t x1 = vmulq_f32(vrsqrtsq_f32(vmulq_f32(x, x0), x0), x0);
91 x1 = vmulq_f32(vrsqrtsq_f32(vmulq_f32(x, x1), x1), x1);
/* Bit-exact detection of +0.0 and +inf inputs via the raw IEEE-754 pattern. */
95 uint32x4_t x_bits = vreinterpretq_u32_f32(x);
96 uint32x4_t zero_mask = vceqq_u32(x_bits, vdupq_n_u32(0x00000000));
97 uint32x4_t inf_mask = vceqq_u32(x_bits, vdupq_n_u32(0x7F800000));
98 uint32x4_t special_mask = vorrq_u32(zero_mask, inf_mask);
/* For 0/inf keep the raw estimate x0 — refining would multiply 0*inf into
 * NaN; all other lanes use the refined x1. */
99 return vbslq_f32(special_mask, x0, x1);
/* NOTE(review): tail fragment of a sqrt-style helper. It forces lanes where
 * x == 0 to exactly 0.0f (sqrt computed via reciprocal estimates would
 * otherwise yield NaN at zero). The producer of `result` and the enclosing
 * signature are not visible in this mangled excerpt. */
107 const float32x4_t zero = vdupq_n_f32(0.0f);
108 uint32x4_t zero_mask = vceqq_f32(x, zero);
110 return vbslq_f32(zero_mask, zero, result);
/* Per-lane reciprocal 1/x: hardware estimate (vrecpeq) refined with two
 * Newton-Raphson steps (vrecpsq). The name is grounded by the visible call
 * site `_vinvq_f32(sincos.val[1])` later in this file.
 * NOTE(review): signature, braces and the trailing return were restored; the
 * pasted excerpt had dropped them and fused line numbers into the code. */
static inline float32x4_t _vinvq_f32(float32x4_t x)
{
    float32x4_t recip = vrecpeq_f32(x);
    recip = vmulq_f32(vrecpsq_f32(x, recip), recip); // 1st refinement
    recip = vmulq_f32(vrecpsq_f32(x, recip), recip); // 2nd refinement
    return recip;
}
/* NOTE(review): NEONv7 (vmlaq-based) arcsin-style odd-polynomial fragment.
 * The signature and the declaration of the accumulator `p` (presumably
 * `float32x4_t p = c3;`, by analogy with the neonv8 variant using the same
 * coefficients) are missing from this mangled excerpt. */
132 const float32x4_t c0 = vdupq_n_f32(0x1.ffffcep-1f);
133 const float32x4_t c1 = vdupq_n_f32(0x1.55b648p-3f);
134 const float32x4_t c2 = vdupq_n_f32(0x1.24d192p-4f);
135 const float32x4_t c3 = vdupq_n_f32(0x1.0a788p-4f);
/* u = x^2; Horner evaluation in u: p = c0 + u*(c1 + u*(c2 + u*p)). */
137 const float32x4_t u = vmulq_f32(x, x);
139 p = vmlaq_f32(c2, u, p);
140 p = vmlaq_f32(c1, u, p);
141 p = vmlaq_f32(c0, u, p);
/* Restore the odd symmetry: result = x * p(x^2). */
143 return vmulq_f32(x, p);
/* NEONv8 (FMA) arcsin approximation via a degree-7 odd polynomial:
 *   arcsin(x) ~ x * (c0 + c1*u + c2*u^2 + c3*u^3), with u = x^2.
 * Intended for |x| <= 1 range-reduced inputs (TODO confirm against callers).
 * NOTE(review): braces, the `p = c3` initialization (forced by the first
 * visible use `p = vfmaq_f32(c2, u, p)`) and formatting were restored from
 * the mangled paste. */
static inline float32x4_t _varcsinq_f32_neonv8(float32x4_t x)
{
    const float32x4_t c0 = vdupq_n_f32(0x1.ffffcep-1f);
    const float32x4_t c1 = vdupq_n_f32(0x1.55b648p-3f);
    const float32x4_t c2 = vdupq_n_f32(0x1.24d192p-4f);
    const float32x4_t c3 = vdupq_n_f32(0x1.0a788p-4f);

    const float32x4_t u = vmulq_f32(x, x);

    // Horner in u with fused multiply-add.
    float32x4_t p = c3;
    p = vfmaq_f32(c2, u, p);
    p = vfmaq_f32(c1, u, p);
    p = vfmaq_f32(c0, u, p);

    return vmulq_f32(x, p); // restore odd symmetry
}
/* Complex multiply of four complex float pairs:
 *   c = a * b, with val[0] = real lanes, val[1] = imaginary lanes.
 *   real(c) = ar*br - ai*bi,  imag(c) = ar*bi + ai*br.
 * NOTE(review): signature, the `c_val` declaration and the return were
 * reconstructed from the upstream VOLK header; the pasted excerpt had
 * dropped them and fused line numbers into the code. */
static inline float32x4x2_t _vmultiply_complexq_f32(float32x4x2_t a_val,
                                                    float32x4x2_t b_val)
{
    float32x4x2_t tmp_real;
    float32x4x2_t tmp_imag;
    float32x4x2_t c_val;

    // real*real and imag*imag products for the real part
    tmp_real.val[0] = vmulq_f32(a_val.val[0], b_val.val[0]);
    tmp_real.val[1] = vmulq_f32(a_val.val[1], b_val.val[1]);

    // cross terms for the imaginary part
    tmp_imag.val[0] = vmulq_f32(a_val.val[0], b_val.val[1]);
    tmp_imag.val[1] = vmulq_f32(a_val.val[1], b_val.val[0]);

    // combine the products
    c_val.val[0] = vsubq_f32(tmp_real.val[0], tmp_real.val[1]);
    c_val.val[1] = vaddq_f32(tmp_imag.val[0], tmp_imag.val[1]);
    return c_val;
}
/* Estrin-style evaluation of a degree-7 polynomial with interleaved
 * coefficient order (ARM Compute Library convention):
 *   p(x) = A0 + A4*x + A2*x^2 + A6*x^3 + A1*x^4 + A5*x^5 + A3*x^6 + A7*x^7
 * where Ai = coeffs[i]. Four sub-polynomials are combined with x^2 and x^4,
 * shortening the dependency chain versus plain Horner.
 * NOTE(review): signature, braces and the return were reconstructed from the
 * upstream VOLK header; the pasted excerpt had dropped them. */
static inline float32x4_t _vtaylor_polyq_f32(float32x4_t x,
                                             const float32x4_t coeffs[8])
{
    float32x4_t cA = vmlaq_f32(coeffs[0], coeffs[4], x);
    float32x4_t cB = vmlaq_f32(coeffs[2], coeffs[6], x);
    float32x4_t cC = vmlaq_f32(coeffs[1], coeffs[5], x);
    float32x4_t cD = vmlaq_f32(coeffs[3], coeffs[7], x);
    float32x4_t x2 = vmulq_f32(x, x);
    float32x4_t x4 = vmulq_f32(x2, x2);
    float32x4_t res = vmlaq_f32(vmlaq_f32(cA, cB, x2), vmlaq_f32(cC, cD, x2), x4);
    return res;
}
/* Natural logarithm, four lanes at once. Splits x = val * 2^m by bit
 * manipulation of the IEEE-754 exponent field, approximates log(val) on
 * [1, 2) with a degree-7 polynomial, then reconstructs log(x) as
 * log(val) + m*ln(2). Assumes normal, positive inputs — TODO confirm
 * callers never pass 0/negatives/denormals.
 * NOTE(review): the excerpt had dropped the signature, the `val` assignment
 * head and the polynomial-evaluation line; the polynomial is evaluated
 * inline here (Estrin scheme, interleaved coefficient order) so this block
 * is self-contained. Coefficients are the visible log_tab values. */
static inline float32x4_t _vlogq_f32(float32x4_t x)
{
    const float32x4_t log_tab[8] = {
        vdupq_n_f32(-2.29561495781f), vdupq_n_f32(-2.47071170807f),
        vdupq_n_f32(-5.68692588806f), vdupq_n_f32(-0.165253549814f),
        vdupq_n_f32(5.17591238022f),  vdupq_n_f32(0.844007015228f),
        vdupq_n_f32(4.58445882797f),  vdupq_n_f32(0.0141278216615f),
    };

    const int32x4_t CONST_127 = vdupq_n_s32(127);
    const float32x4_t CONST_LN2 = vdupq_n_f32(0.6931471805f); // ln(2)

    // Unbiased exponent m; subtracting m<<23 from the bits leaves val in [1, 2).
    int32x4_t m = vsubq_s32(
        vreinterpretq_s32_u32(vshrq_n_u32(vreinterpretq_u32_f32(x), 23)), CONST_127);
    float32x4_t val =
        vreinterpretq_f32_s32(vsubq_s32(vreinterpretq_s32_f32(x), vshlq_n_s32(m, 23)));

    // Degree-7 polynomial approximation of log(val), Estrin evaluation.
    float32x4_t cA = vmlaq_f32(log_tab[0], log_tab[4], val);
    float32x4_t cB = vmlaq_f32(log_tab[2], log_tab[6], val);
    float32x4_t cC = vmlaq_f32(log_tab[1], log_tab[5], val);
    float32x4_t cD = vmlaq_f32(log_tab[3], log_tab[7], val);
    float32x4_t v2 = vmulq_f32(val, val);
    float32x4_t v4 = vmulq_f32(v2, v2);
    float32x4_t poly = vmlaq_f32(vmlaq_f32(cA, cB, v2), vmlaq_f32(cC, cD, v2), v4);

    // Reconstruct: log(x) = log(val) + m * ln(2).
    poly = vmlaq_f32(poly, vcvtq_f32_s32(m), CONST_LN2);
    return poly;
}
/* NOTE(review): core of a cephes-derived simultaneous sin/cos evaluation
 * (same algorithm as Julien Pommier's neon_mathfun sincos_ps). The signature
 * and the declarations of `emm2`, `y1`, `y2` are missing from this mangled
 * excerpt, and every line carries a fused line-number prefix. */
/* Cody-Waite range-reduction constants: -pi/4 split into three parts. */
242 const float32x4_t c_minus_cephes_DP1 = vdupq_n_f32(-0.78515625);
243 const float32x4_t c_minus_cephes_DP2 = vdupq_n_f32(-2.4187564849853515625e-4);
244 const float32x4_t c_minus_cephes_DP3 = vdupq_n_f32(-3.77489497744594108e-8);
/* Minimax polynomial coefficients for sin and cos on the reduced interval. */
245 const float32x4_t c_sincof_p0 = vdupq_n_f32(-1.9515295891e-4);
246 const float32x4_t c_sincof_p1 = vdupq_n_f32(8.3321608736e-3);
247 const float32x4_t c_sincof_p2 = vdupq_n_f32(-1.6666654611e-1);
248 const float32x4_t c_coscof_p0 = vdupq_n_f32(2.443315711809948e-005);
249 const float32x4_t c_coscof_p1 = vdupq_n_f32(-1.388731625493765e-003);
250 const float32x4_t c_coscof_p2 = vdupq_n_f32(4.166664568298827e-002);
251 const float32x4_t c_cephes_FOPI = vdupq_n_f32(1.27323954473516); // 4/pi
253 const float32x4_t CONST_1 = vdupq_n_f32(1.f);
254 const float32x4_t CONST_1_2 = vdupq_n_f32(0.5f);
255 const float32x4_t CONST_0 = vdupq_n_f32(0.f);
256 const uint32x4_t CONST_2 = vdupq_n_u32(2);
257 const uint32x4_t CONST_4 = vdupq_n_u32(4);
/* Track per-lane result signs; sin is odd, so negative x flips its sign. */
261 uint32x4_t sign_mask_sin, sign_mask_cos;
262 sign_mask_sin = vcltq_f32(x, CONST_0);
/* Quadrant index: y = |x| * 4/pi, rounded to an even integer in emm2. */
265 float32x4_t y = vmulq_f32(x, c_cephes_FOPI);
268 emm2 = vcvtq_u32_f32(y);
270 emm2 = vaddq_u32(emm2, vdupq_n_u32(1));
271 emm2 = vandq_u32(emm2, vdupq_n_u32(~1));
272 y = vcvtq_f32_u32(emm2);
/* Lanes where bit 1 of the quadrant is set swap the sin/cos polynomials. */
278 const uint32x4_t poly_mask = vtstq_u32(emm2, CONST_2);
/* Extended-precision reduction: x -= y*pi/4 in three partial products. */
281 x = vmlaq_f32(x, y, c_minus_cephes_DP1);
282 x = vmlaq_f32(x, y, c_minus_cephes_DP2);
283 x = vmlaq_f32(x, y, c_minus_cephes_DP3);
/* Quadrant bit 2 determines the final sign of each result. */
285 sign_mask_sin = veorq_u32(sign_mask_sin, vtstq_u32(emm2, CONST_4));
286 sign_mask_cos = vtstq_u32(vsubq_u32(emm2, CONST_2), CONST_4);
291 float32x4_t z = vmulq_f32(x, x);
/* Cosine polynomial: y1 = 1 - z/2 + z^2*(p2 + z*(p1 + z*p0)). */
293 y1 = vmlaq_f32(c_coscof_p1, z, c_coscof_p0);
294 y1 = vmlaq_f32(c_coscof_p2, z, y1);
295 y1 = vmulq_f32(y1, z);
296 y1 = vmulq_f32(y1, z);
297 y1 = vmlsq_f32(y1, z, CONST_1_2);
298 y1 = vaddq_f32(y1, CONST_1);
/* Sine polynomial: y2 = x + x*z*(p2 + z*(p1 + z*p0)). */
300 y2 = vmlaq_f32(c_sincof_p1, z, c_sincof_p0);
301 y2 = vmlaq_f32(c_sincof_p2, z, y2);
302 y2 = vmulq_f32(y2, z);
303 y2 = vmlaq_f32(x, x, y2);
/* Per-lane selection of which polynomial feeds sin vs. cos. */
306 const float32x4_t ys = vbslq_f32(poly_mask, y1, y2);
307 const float32x4_t yc = vbslq_f32(poly_mask, y2, y1);
/* Apply the quadrant signs; val[0] = sin, val[1] = cos. */
309 float32x4x2_t sincos;
310 sincos.val[0] = vbslq_f32(sign_mask_sin, vnegq_f32(ys), ys);
311 sincos.val[1] = vbslq_f32(sign_mask_cos, yc, vnegq_f32(yc));
/* NOTE(review): tail of a tangent helper — tan = sin * (1/cos) using the
 * _vinvq_f32 reciprocal. The line-number gap before this return suggests it
 * belongs to a separate (tan) function from the sincos body above, with the
 * intervening lines dropped by the paste — verify against upstream. */
319 return vmulq_f32(sincos.val[0],
                 _vinvq_f32(sincos.val[1]));
/* NOTE(review): NEONv7 (vmlaq) arctan odd-polynomial fragment:
 * atan(x) ~ x*(a1 + a3*x^2 + ... + a13*x^12), Horner in x^2. The signature,
 * the `result = a13` initialization (implied by the first visible use) and
 * the trailing return are missing from this mangled excerpt. Same
 * coefficients as _varctan_poly_neonv8 below. */
331 const float32x4_t a1 = vdupq_n_f32(+0x1.ffffeap-1f);
332 const float32x4_t a3 = vdupq_n_f32(-0x1.55437p-2f);
333 const float32x4_t a5 = vdupq_n_f32(+0x1.972be6p-3f);
334 const float32x4_t a7 = vdupq_n_f32(-0x1.1436ap-3f);
335 const float32x4_t a9 = vdupq_n_f32(+0x1.5785aap-4f);
336 const float32x4_t a11 = vdupq_n_f32(-0x1.2f3004p-5f);
337 const float32x4_t a13 = vdupq_n_f32(+0x1.01a37cp-7f);
339 const float32x4_t x_sq = vmulq_f32(x, x);
/* Horner chain in x^2 from the highest coefficient down. */
342 result = vmlaq_f32(a11, x_sq, result);
343 result = vmlaq_f32(a9, x_sq, result);
344 result = vmlaq_f32(a7, x_sq, result);
345 result = vmlaq_f32(a5, x_sq, result);
346 result = vmlaq_f32(a3, x_sq, result);
347 result = vmlaq_f32(a1, x_sq, result);
/* Restore odd symmetry: result = x * p(x^2). */
348 result = vmulq_f32(x, result);
/* NOTE(review): fragment of a squared-deviation accumulator helper: computes
 * aux = (aux*val - acc)^2 and folds aux*rec into sq_acc. The two return
 * paths below were almost certainly separated by preprocessor lines (FMA
 * NEONv8 path vs. mul+add fallback) that were dropped from this excerpt —
 * as pasted, the second path is unreachable. */
359 aux = vmulq_f32(aux, val);
360 aux = vsubq_f32(aux, acc);
361 aux = vmulq_f32(aux, aux);
/* FMA path: sq_acc + aux*rec in one fused operation. */
363 return vfmaq_f32(sq_acc, aux, rec);
/* Non-FMA fallback path. */
365 aux = vmulq_f32(aux, rec);
366 return vaddq_f32(sq_acc, aux);
/* NOTE(review): NEONv7 (vmlaq) sine polynomial fragment for range-reduced x:
 * sin(x) ~ x + x^3*(s1 + s2*x^2 + s3*x^4). The enclosing signature and
 * braces are missing from this mangled excerpt; coefficients match
 * _vsin_poly_neonv8 below. */
378 const float32x4_t s1 = vdupq_n_f32(-0x1.555552p-3f);
379 const float32x4_t s2 = vdupq_n_f32(+0x1.110be2p-7f);
380 const float32x4_t s3 = vdupq_n_f32(-0x1.9ab22ap-13f);
382 const float32x4_t x2 = vmulq_f32(x, x);
383 const float32x4_t x3 = vmulq_f32(x2, x);
/* Horner in x^2, then the odd-term reconstruction x + x^3*poly. */
385 float32x4_t poly = vmlaq_f32(s2, x2, s3);
386 poly = vmlaq_f32(s1, x2, poly);
387 return vmlaq_f32(x, x3, poly);
/* NOTE(review): NEONv7 (vmlaq) cosine polynomial fragment for range-reduced
 * x: cos(x) ~ 1 + x^2*(c1 + c2*x^2 + c3*x^4). The enclosing signature and
 * braces are missing from this mangled excerpt; coefficients match
 * _vcos_poly_neonv8 below. */
398 const float32x4_t c1 = vdupq_n_f32(-0x1.fffff4p-2f);
399 const float32x4_t c2 = vdupq_n_f32(+0x1.554a46p-5f);
400 const float32x4_t c3 = vdupq_n_f32(-0x1.661be2p-10f);
401 const float32x4_t one = vdupq_n_f32(1.0f);
403 const float32x4_t x2 = vmulq_f32(x, x);
/* Horner in x^2, then the even-term reconstruction 1 + x^2*poly. */
405 float32x4_t poly = vmlaq_f32(c2, x2, c3);
406 poly = vmlaq_f32(c1, x2, poly);
407 return vmlaq_f32(one, x2, poly);
/* NOTE(review): NEONv7 (vmlaq) degree-6 polynomial fragment, evaluated by
 * Horner directly in x — same coefficients as _vlog2_poly_neonv8 below, so
 * presumably the mantissa polynomial of a log2 approximation. The enclosing
 * signature and trailing return are missing from this mangled excerpt. */
420 const float32x4_t c0 = vdupq_n_f32(+0x1.a8a726p+1f);
421 const float32x4_t c1 = vdupq_n_f32(-0x1.0b7f7ep+2f);
422 const float32x4_t c2 = vdupq_n_f32(+0x1.05d9ccp+2f);
423 const float32x4_t c3 = vdupq_n_f32(-0x1.4d476cp+1f);
424 const float32x4_t c4 = vdupq_n_f32(+0x1.04fc3ap+0f);
425 const float32x4_t c5 = vdupq_n_f32(-0x1.c97982p-3f);
426 const float32x4_t c6 = vdupq_n_f32(+0x1.57aa42p-6f);
/* Horner chain: poly = c0 + x*(c1 + x*(... + x*c6)). */
429 float32x4_t poly = c6;
430 poly = vmlaq_f32(c5, poly, x);
431 poly = vmlaq_f32(c4, poly, x);
432 poly = vmlaq_f32(c3, poly, x);
433 poly = vmlaq_f32(c2, poly, x);
434 poly = vmlaq_f32(c1, poly, x);
435 poly = vmlaq_f32(c0, poly, x);
/* NEONv8 (FMA) arctan polynomial: atan(x) ~ x*(a1 + a3*x^2 + ... + a13*x^12),
 * a degree-13 odd polynomial evaluated by Horner in x^2. Intended for the
 * |x| <= 1 primary interval — TODO confirm range reduction at the callers.
 * NOTE(review): braces and the trailing `return result;` (forced by the
 * float32x4_t return type and the final assignment) were restored from the
 * mangled paste. */
static inline float32x4_t _varctan_poly_neonv8(float32x4_t x)
{
    const float32x4_t a1 = vdupq_n_f32(+0x1.ffffeap-1f);
    const float32x4_t a3 = vdupq_n_f32(-0x1.55437p-2f);
    const float32x4_t a5 = vdupq_n_f32(+0x1.972be6p-3f);
    const float32x4_t a7 = vdupq_n_f32(-0x1.1436ap-3f);
    const float32x4_t a9 = vdupq_n_f32(+0x1.5785aap-4f);
    const float32x4_t a11 = vdupq_n_f32(-0x1.2f3004p-5f);
    const float32x4_t a13 = vdupq_n_f32(+0x1.01a37cp-7f);

    const float32x4_t x_sq = vmulq_f32(x, x);

    // Horner chain in x^2 from the highest coefficient down.
    float32x4_t result = a13;
    result = vfmaq_f32(a11, x_sq, result);
    result = vfmaq_f32(a9, x_sq, result);
    result = vfmaq_f32(a7, x_sq, result);
    result = vfmaq_f32(a5, x_sq, result);
    result = vfmaq_f32(a3, x_sq, result);
    result = vfmaq_f32(a1, x_sq, result);
    result = vmulq_f32(x, result); // restore odd symmetry
    return result;
}
/* NEONv8 (FMA) sine polynomial for range-reduced inputs:
 *   sin(x) ~ x + x^3*(s1 + s2*x^2 + s3*x^4).
 * NOTE(review): only the braces were restored from the mangled paste; all
 * statements match the visible excerpt. */
static inline float32x4_t _vsin_poly_neonv8(float32x4_t x)
{
    const float32x4_t s1 = vdupq_n_f32(-0x1.555552p-3f);
    const float32x4_t s2 = vdupq_n_f32(+0x1.110be2p-7f);
    const float32x4_t s3 = vdupq_n_f32(-0x1.9ab22ap-13f);

    const float32x4_t x2 = vmulq_f32(x, x);
    const float32x4_t x3 = vmulq_f32(x2, x);

    // Horner in x^2, then the odd-term reconstruction x + x^3*poly.
    float32x4_t poly = vfmaq_f32(s2, x2, s3);
    poly = vfmaq_f32(s1, x2, poly);
    return vfmaq_f32(x, x3, poly);
}
/* NEONv8 (FMA) cosine polynomial for range-reduced inputs:
 *   cos(x) ~ 1 + x^2*(c1 + c2*x^2 + c3*x^4).
 * NOTE(review): only the braces were restored from the mangled paste; all
 * statements match the visible excerpt. */
static inline float32x4_t _vcos_poly_neonv8(float32x4_t x)
{
    const float32x4_t c1 = vdupq_n_f32(-0x1.fffff4p-2f);
    const float32x4_t c2 = vdupq_n_f32(+0x1.554a46p-5f);
    const float32x4_t c3 = vdupq_n_f32(-0x1.661be2p-10f);
    const float32x4_t one = vdupq_n_f32(1.0f);

    const float32x4_t x2 = vmulq_f32(x, x);

    // Horner in x^2, then the even-term reconstruction 1 + x^2*poly.
    float32x4_t poly = vfmaq_f32(c2, x2, c3);
    poly = vfmaq_f32(c1, x2, poly);
    return vfmaq_f32(one, x2, poly);
}
/* NOTE(review): NEONv8 (FMA) degree-6 Horner polynomial — same coefficients
 * as the vmlaq fragment above, presumably the mantissa polynomial of a log2
 * approximation. The function runs past the end of this excerpt: the
 * trailing return (presumably `return poly;`) and closing brace are not
 * visible, and each body line carries a fused line-number prefix. */
499static inline float32x4_t _vlog2_poly_neonv8(float32x4_t x)
501 const float32x4_t c0 = vdupq_n_f32(+0x1.a8a726p+1f);
502 const float32x4_t c1 = vdupq_n_f32(-0x1.0b7f7ep+2f);
503 const float32x4_t c2 = vdupq_n_f32(+0x1.05d9ccp+2f);
504 const float32x4_t c3 = vdupq_n_f32(-0x1.4d476cp+1f);
505 const float32x4_t c4 = vdupq_n_f32(+0x1.04fc3ap+0f);
506 const float32x4_t c5 = vdupq_n_f32(-0x1.c97982p-3f);
507 const float32x4_t c6 = vdupq_n_f32(+0x1.57aa42p-6f);
/* Horner chain: poly = c0 + x*(c1 + x*(... + x*c6)), fused multiply-add. */
510 float32x4_t poly = c6;
511 poly = vfmaq_f32(c5, poly, x);
512 poly = vfmaq_f32(c4, poly, x);
513 poly = vfmaq_f32(c3, poly, x);
514 poly = vfmaq_f32(c2, poly, x);
515 poly = vfmaq_f32(c1, poly, x);
516 poly = vfmaq_f32(c0, poly, x);