Vector Optimized Library of Kernels 3.3.0
Architecture-tuned implementations of math kernels
Loading...
Searching...
No Matches
volk_neon_intrinsics.h
Go to the documentation of this file.
1/* -*- c++ -*- */
2/*
3 * Copyright 2015 Free Software Foundation, Inc.
4 * Copyright 2025, 2026 Magnus Lundmark <magnuslundmark@gmail.com>
5 *
6 * This file is part of VOLK
7 *
8 * SPDX-License-Identifier: LGPL-3.0-or-later
9 */
10
11/*
12 * Copyright (c) 2016-2019 ARM Limited.
13 *
14 * SPDX-License-Identifier: MIT
15 *
16 * Permission is hereby granted, free of charge, to any person obtaining a copy
17 * of this software and associated documentation files (the "Software"), to
18 * deal in the Software without restriction, including without limitation the
19 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
20 * sell copies of the Software, and to permit persons to whom the Software is
21 * furnished to do so, subject to the following conditions:
22 *
23 * The above copyright notice and this permission notice shall be included in all
24 * copies or substantial portions of the Software.
25 *
26 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
27 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
28 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
29 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
30 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
31 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
32 * SOFTWARE.
33 *
34 * _vtaylor_polyq_f32
35 * _vlogq_f32
36 *
37 */
38
39/* Copyright (C) 2011 Julien Pommier
40
41 This software is provided 'as-is', without any express or implied
42 warranty. In no event will the authors be held liable for any damages
43 arising from the use of this software.
44
45 Permission is granted to anyone to use this software for any purpose,
46 including commercial applications, and to alter it and redistribute it
47 freely, subject to the following restrictions:
48
49 1. The origin of this software must not be misrepresented; you must not
50 claim that you wrote the original software. If you use this software
51 in a product, an acknowledgment in the product documentation would be
52 appreciated but is not required.
53 2. Altered source versions must be plainly marked as such, and must not be
54 misrepresented as being the original software.
55 3. This notice may not be removed or altered from any source distribution.
56
57 (this is the zlib license)
58
59 _vsincosq_f32
60
61*/
62
/*
 * This file holds compound NEON intrinsics built from the basic NEON
 * intrinsics. They should be used in VOLK kernels to avoid copy-pasted
 * implementations.
 */
67
68#ifndef INCLUDE_VOLK_VOLK_NEON_INTRINSICS_H_
69#define INCLUDE_VOLK_VOLK_NEON_INTRINSICS_H_
70#include <arm_neon.h>
71
72
/* Magnitude squared for float32x4x2_t.
 * Input is four complex values in split (planar) layout:
 * val[0] holds the real parts, val[1] the imaginary parts.
 * Returns re^2 + im^2 per lane. */
static inline float32x4_t _vmagnitudesquaredq_f32(float32x4x2_t cmplxValue)
{
    const float32x4_t re_sq = vmulq_f32(cmplxValue.val[0], cmplxValue.val[0]);
    const float32x4_t im_sq = vmulq_f32(cmplxValue.val[1], cmplxValue.val[1]);
    return vaddq_f32(re_sq, im_sq);
}
82
/* Inverse square root (1/sqrt(x)) for float32x4_t.
 * Two Newton-Raphson refinement steps on top of the hardware estimate.
 * Edge cases are preserved: +0 -> +Inf, +Inf -> +0. */
static inline float32x4_t _vinvsqrtq_f32(float32x4_t x)
{
    // The raw estimate is already exact for the special inputs:
    // vrsqrteq_f32 returns +Inf for +0 and 0 for +Inf.
    const float32x4_t est = vrsqrteq_f32(x);

    // Two Newton-Raphson iterations via vrsqrtsq_f32 to refine precision.
    float32x4_t refined = vmulq_f32(vrsqrtsq_f32(vmulq_f32(x, est), est), est);
    refined = vmulq_f32(vrsqrtsq_f32(vmulq_f32(x, refined), refined), refined);

    // The refinement forms Inf * 0 (NaN) on the +0 / +Inf lanes, so those
    // lanes fall back to the raw estimate instead.
    const uint32x4_t bits = vreinterpretq_u32_f32(x);
    const uint32x4_t is_pos_zero = vceqq_u32(bits, vdupq_n_u32(0x00000000));
    const uint32x4_t is_pos_inf = vceqq_u32(bits, vdupq_n_u32(0x7F800000));
    return vbslq_f32(vorrq_u32(is_pos_zero, is_pos_inf), est, refined);
}
101
/* Square root for ARMv7 NEON, which has no vsqrtq_f32 instruction.
 * Computed as x * (1/sqrt(x)); lanes where x == 0 are forced to zero
 * because 0 * Inf from the reciprocal root would otherwise yield NaN. */
static inline float32x4_t _vsqrtq_f32(float32x4_t x)
{
    const uint32x4_t is_zero = vceqq_f32(x, vdupq_n_f32(0.0f));
    const float32x4_t sqrt_val = vmulq_f32(x, _vinvsqrtq_f32(x));
    return vbslq_f32(is_zero, vdupq_n_f32(0.0f), sqrt_val);
}
112
/* Fast reciprocal (1/x): hardware estimate refined by two Newton-Raphson
 * steps (vrecpsq_f32 computes 2 - x*recip).
 * NOTE(review): for x == 0 the estimate is Inf and the refinement forms
 * 0 * Inf = NaN, so zero inputs produce NaN rather than Inf. */
static inline float32x4_t _vinvq_f32(float32x4_t x)
{
    float32x4_t reciprocal = vrecpeq_f32(x);
    for (int step = 0; step < 2; step++) {
        reciprocal = vmulq_f32(vrecpsq_f32(x, reciprocal), reciprocal);
    }
    return reciprocal;
}
122
/*
 * Approximate arcsin(x) on |x| <= 0.5 as x * P(x^2), where P is a
 * degree-3 polynomial in u = x^2 evaluated with Horner's method.
 *
 * Maximum relative error ~1.5e-6
 */
static inline float32x4_t _varcsinq_f32(float32x4_t x)
{
    const float32x4_t k0 = vdupq_n_f32(0x1.ffffcep-1f);
    const float32x4_t k1 = vdupq_n_f32(0x1.55b648p-3f);
    const float32x4_t k2 = vdupq_n_f32(0x1.24d192p-4f);
    const float32x4_t k3 = vdupq_n_f32(0x1.0a788p-4f);

    const float32x4_t u = vmulq_f32(x, x);
    // Horner: P(u) = k0 + u*(k1 + u*(k2 + u*k3))
    float32x4_t poly = vmlaq_f32(k2, u, k3);
    poly = vmlaq_f32(k1, u, poly);
    poly = vmlaq_f32(k0, u, poly);

    return vmulq_f32(x, poly);
}
145
146#ifdef LV_HAVE_NEONV8
/*
 * Approximate arcsin(x) on |x| <= 0.5 (NEONv8, fused multiply-add).
 * Same polynomial as _varcsinq_f32: x * P(x^2), degree 3 in u = x^2,
 * evaluated with Horner's method.
 *
 * Maximum relative error ~1.5e-6
 */
static inline float32x4_t _varcsinq_f32_neonv8(float32x4_t x)
{
    const float32x4_t k0 = vdupq_n_f32(0x1.ffffcep-1f);
    const float32x4_t k1 = vdupq_n_f32(0x1.55b648p-3f);
    const float32x4_t k2 = vdupq_n_f32(0x1.24d192p-4f);
    const float32x4_t k3 = vdupq_n_f32(0x1.0a788p-4f);

    const float32x4_t u = vmulq_f32(x, x);
    // Horner with FMA: P(u) = k0 + u*(k1 + u*(k2 + u*k3))
    float32x4_t poly = vfmaq_f32(k2, u, k3);
    poly = vfmaq_f32(k1, u, poly);
    poly = vfmaq_f32(k0, u, poly);

    return vmulq_f32(x, poly);
}
169#endif /* LV_HAVE_NEONV8 */
170
/* Complex multiplication for float32x4x2_t (split/planar layout):
 * (ar + j*ai) * (br + j*bi) = (ar*br - ai*bi) + j*(ar*bi + ai*br),
 * computed lane-wise for four complex pairs at once. */
static inline float32x4x2_t _vmultiply_complexq_f32(float32x4x2_t a_val,
                                                    float32x4x2_t b_val)
{
    // Four partial products feeding the real and imaginary results
    const float32x4_t ar_br = vmulq_f32(a_val.val[0], b_val.val[0]);
    const float32x4_t ai_bi = vmulq_f32(a_val.val[1], b_val.val[1]);
    const float32x4_t ar_bi = vmulq_f32(a_val.val[0], b_val.val[1]);
    const float32x4_t ai_br = vmulq_f32(a_val.val[1], b_val.val[0]);

    float32x4x2_t product;
    product.val[0] = vsubq_f32(ar_br, ai_bi); // real part
    product.val[1] = vaddq_f32(ar_bi, ai_br); // imaginary part
    return product;
}
194
/* From ARM Compute Library, MIT license.
 * Degree-7 polynomial evaluation with an Estrin-style split.
 * Note the coefficient layout implied by the pairing below:
 *   result = coeffs[0] + coeffs[4]*x + coeffs[2]*x^2 + coeffs[6]*x^3
 *          + coeffs[1]*x^4 + coeffs[5]*x^5 + coeffs[3]*x^6 + coeffs[7]*x^7
 * i.e. coeffs[k] is NOT simply the coefficient of x^k (see _vlogq_f32's
 * table, which is laid out to match). */
static inline float32x4_t _vtaylor_polyq_f32(float32x4_t x, const float32x4_t coeffs[8])
{
    const float32x4_t pair_a = vmlaq_f32(coeffs[0], coeffs[4], x);
    const float32x4_t pair_b = vmlaq_f32(coeffs[2], coeffs[6], x);
    const float32x4_t pair_c = vmlaq_f32(coeffs[1], coeffs[5], x);
    const float32x4_t pair_d = vmlaq_f32(coeffs[3], coeffs[7], x);
    const float32x4_t x_sq = vmulq_f32(x, x);
    const float32x4_t x_quad = vmulq_f32(x_sq, x_sq);
    // (pair_a + pair_b*x^2) + (pair_c + pair_d*x^2) * x^4
    return vmlaq_f32(vmlaq_f32(pair_a, pair_b, x_sq), vmlaq_f32(pair_c, pair_d, x_sq), x_quad);
}
207
/* Natural logarithm of each lane.
 * From ARM Compute Library, MIT license.
 * Decomposes x = 2^m * mantissa with mantissa in [1, 2), approximates
 * ln(mantissa) with a degree-7 polynomial, then adds m * ln(2).
 * NOTE(review): no special handling for x <= 0, Inf, NaN or denormals. */
static inline float32x4_t _vlogq_f32(float32x4_t x)
{
    // Polynomial table in the interleaved layout _vtaylor_polyq_f32 expects
    const float32x4_t log_tab[8] = {
        vdupq_n_f32(-2.29561495781f), vdupq_n_f32(-2.47071170807f),
        vdupq_n_f32(-5.68692588806f), vdupq_n_f32(-0.165253549814f),
        vdupq_n_f32(5.17591238022f),  vdupq_n_f32(0.844007015228f),
        vdupq_n_f32(4.58445882797f),  vdupq_n_f32(0.0141278216615f),
    };

    const int32x4_t exponent_bias = vdupq_n_s32(127); // IEEE-754 single bias
    const float32x4_t ln2 = vdupq_n_f32(0.6931471805f);

    // Unbiased exponent m, read straight from the float bit pattern
    const int32x4_t m = vsubq_s32(
        vreinterpretq_s32_u32(vshrq_n_u32(vreinterpretq_u32_f32(x), 23)),
        exponent_bias);
    // Subtract m from the exponent field so the mantissa lands in [1, 2)
    const float32x4_t mantissa = vreinterpretq_f32_s32(
        vsubq_s32(vreinterpretq_s32_f32(x), vshlq_n_s32(m, 23)));

    // ln(mantissa) via the polynomial approximation
    float32x4_t result = _vtaylor_polyq_f32(mantissa, log_tab);

    // Reconstruct: ln(x) = ln(mantissa) + m * ln(2)
    result = vmlaq_f32(result, vcvtq_f32_s32(m), ln2);

    return result;
}
236
/* Evaluation of 4 sines & cosines at once.
 * Returns val[0] = sin(x) and val[1] = cos(x) per lane.
 * Optimized from here (zlib license)
 * http://gruntthepeon.free.fr/ssemath/ */
static inline float32x4x2_t _vsincosq_f32(float32x4_t x)
{
    // -pi/4 split into three parts (Cephes style) so the range reduction
    // x - y*(pi/4) retains precision beyond a single float constant
    const float32x4_t c_minus_cephes_DP1 = vdupq_n_f32(-0.78515625);
    const float32x4_t c_minus_cephes_DP2 = vdupq_n_f32(-2.4187564849853515625e-4);
    const float32x4_t c_minus_cephes_DP3 = vdupq_n_f32(-3.77489497744594108e-8);
    // Minimax coefficients for the sine polynomial on the reduced interval
    const float32x4_t c_sincof_p0 = vdupq_n_f32(-1.9515295891e-4);
    const float32x4_t c_sincof_p1 = vdupq_n_f32(8.3321608736e-3);
    const float32x4_t c_sincof_p2 = vdupq_n_f32(-1.6666654611e-1);
    // Minimax coefficients for the cosine polynomial on the reduced interval
    const float32x4_t c_coscof_p0 = vdupq_n_f32(2.443315711809948e-005);
    const float32x4_t c_coscof_p1 = vdupq_n_f32(-1.388731625493765e-003);
    const float32x4_t c_coscof_p2 = vdupq_n_f32(4.166664568298827e-002);
    const float32x4_t c_cephes_FOPI = vdupq_n_f32(1.27323954473516); // 4 / M_PI

    const float32x4_t CONST_1 = vdupq_n_f32(1.f);
    const float32x4_t CONST_1_2 = vdupq_n_f32(0.5f);
    const float32x4_t CONST_0 = vdupq_n_f32(0.f);
    const uint32x4_t CONST_2 = vdupq_n_u32(2);
    const uint32x4_t CONST_4 = vdupq_n_u32(4);

    uint32x4_t emm2;

    // sin is odd: remember the sign, then work with |x|
    uint32x4_t sign_mask_sin, sign_mask_cos;
    sign_mask_sin = vcltq_f32(x, CONST_0);
    x = vabsq_f32(x);
    // scale by 4/pi
    float32x4_t y = vmulq_f32(x, c_cephes_FOPI);

    // store the integer part of y in mm0
    emm2 = vcvtq_u32_f32(y);
    /* j=(j+1) & (~1) (see the cephes sources) */
    emm2 = vaddq_u32(emm2, vdupq_n_u32(1));
    emm2 = vandq_u32(emm2, vdupq_n_u32(~1));
    y = vcvtq_f32_u32(emm2);

    /* get the polynom selection mask
       there is one polynom for 0 <= x <= Pi/4
       and another one for Pi/4 < x <= Pi/2
       Both branches will be computed. */
    const uint32x4_t poly_mask = vtstq_u32(emm2, CONST_2);

    // The magic pass: "Extended precision modular arithmetic"
    // x = x - y * pi/4, applied in three pieces for accuracy
    x = vmlaq_f32(x, y, c_minus_cephes_DP1);
    x = vmlaq_f32(x, y, c_minus_cephes_DP2);
    x = vmlaq_f32(x, y, c_minus_cephes_DP3);

    // Per-lane sign flips determined by which octant each input fell into
    sign_mask_sin = veorq_u32(sign_mask_sin, vtstq_u32(emm2, CONST_4));
    sign_mask_cos = vtstq_u32(vsubq_u32(emm2, CONST_2), CONST_4);

    /* Evaluate the first polynom (cosine-shaped, 0 <= x <= Pi/4) in y1,
       and the second polynom (sine-shaped) in y2 */
    float32x4_t y1, y2;
    float32x4_t z = vmulq_f32(x, x);

    // y1 = cos poly: 1 - z/2 + z^2*(p2 + z*(p1 + z*p0))
    y1 = vmlaq_f32(c_coscof_p1, z, c_coscof_p0);
    y1 = vmlaq_f32(c_coscof_p2, z, y1);
    y1 = vmulq_f32(y1, z);
    y1 = vmulq_f32(y1, z);
    y1 = vmlsq_f32(y1, z, CONST_1_2);
    y1 = vaddq_f32(y1, CONST_1);

    // y2 = sin poly: x + x*z*(p2 + z*(p1 + z*p0))
    y2 = vmlaq_f32(c_sincof_p1, z, c_sincof_p0);
    y2 = vmlaq_f32(c_sincof_p2, z, y2);
    y2 = vmulq_f32(y2, z);
    y2 = vmlaq_f32(x, x, y2);

    /* select the correct result from the two polynoms */
    const float32x4_t ys = vbslq_f32(poly_mask, y1, y2);
    const float32x4_t yc = vbslq_f32(poly_mask, y2, y1);

    // Apply the per-lane signs worked out above
    float32x4x2_t sincos;
    sincos.val[0] = vbslq_f32(sign_mask_sin, vnegq_f32(ys), ys);
    sincos.val[1] = vbslq_f32(sign_mask_cos, yc, vnegq_f32(yc));

    return sincos;
}
315
/* Lane-wise tangent: tan(x) = sin(x) * (1 / cos(x)), built from the
 * shared sincos evaluation and the fast reciprocal. */
static inline float32x4_t _vtanq_f32(float32x4_t x)
{
    const float32x4x2_t sc = _vsincosq_f32(x);
    return vmulq_f32(sc.val[0], _vinvq_f32(sc.val[1]));
}
321
/*
 * Approximate arctan(x) on [-1, 1] as x * Q(x^2), where Q is an
 * odd-power minimax polynomial evaluated with Horner's method.
 *
 * Maximum relative error ~6.5e-7
 */
static inline float32x4_t _varctan_poly_f32(float32x4_t x)
{
    // Coefficients of x^1, x^3, ..., x^13
    const float32x4_t a1 = vdupq_n_f32(+0x1.ffffeap-1f);
    const float32x4_t a3 = vdupq_n_f32(-0x1.55437p-2f);
    const float32x4_t a5 = vdupq_n_f32(+0x1.972be6p-3f);
    const float32x4_t a7 = vdupq_n_f32(-0x1.1436ap-3f);
    const float32x4_t a9 = vdupq_n_f32(+0x1.5785aap-4f);
    const float32x4_t a11 = vdupq_n_f32(-0x1.2f3004p-5f);
    const float32x4_t a13 = vdupq_n_f32(+0x1.01a37cp-7f);

    const float32x4_t x_sq = vmulq_f32(x, x);
    // Horner in u = x^2, highest coefficient first
    float32x4_t q = vmlaq_f32(a11, x_sq, a13);
    q = vmlaq_f32(a9, x_sq, q);
    q = vmlaq_f32(a7, x_sq, q);
    q = vmlaq_f32(a5, x_sq, q);
    q = vmlaq_f32(a3, x_sq, q);
    q = vmlaq_f32(a1, x_sq, q);
    return vmulq_f32(x, q);
}
352
/* Accumulates rec * (aux*val - acc)^2 onto sq_acc, lane-wise.
 * Uses a fused multiply-add for the final step on ARMv8. */
static inline float32x4_t _neon_accumulate_square_sum_f32(float32x4_t sq_acc,
                                                          float32x4_t acc,
                                                          float32x4_t val,
                                                          float32x4_t rec,
                                                          float32x4_t aux)
{
    const float32x4_t diff = vsubq_f32(vmulq_f32(aux, val), acc);
    const float32x4_t diff_sq = vmulq_f32(diff, diff);
#ifdef LV_HAVE_NEONV8
    return vfmaq_f32(sq_acc, diff_sq, rec);
#else
    return vaddq_f32(sq_acc, vmulq_f32(diff_sq, rec));
#endif
}
369
/*
 * Minimax polynomial for sin(x) on [-pi/4, pi/4]
 * Coefficients via Remez algorithm (Sollya)
 * Max |error| < 7.3e-9
 * sin(x) = x + x^3 * (s1 + x^2 * (s2 + x^2 * s3))
 */
static inline float32x4_t _vsin_poly_f32(float32x4_t x)
{
    const float32x4_t k1 = vdupq_n_f32(-0x1.555552p-3f);
    const float32x4_t k2 = vdupq_n_f32(+0x1.110be2p-7f);
    const float32x4_t k3 = vdupq_n_f32(-0x1.9ab22ap-13f);

    const float32x4_t x_sq = vmulq_f32(x, x);
    const float32x4_t x_cube = vmulq_f32(x_sq, x);

    // Inner Horner chain in x^2, then fold in the leading x term
    float32x4_t inner = vmlaq_f32(k2, x_sq, k3);
    inner = vmlaq_f32(k1, x_sq, inner);
    return vmlaq_f32(x, x_cube, inner);
}
389
/*
 * Minimax polynomial for cos(x) on [-pi/4, pi/4]
 * Coefficients via Remez algorithm (Sollya)
 * Max |error| < 1.1e-7
 * cos(x) = 1 + x^2 * (c1 + x^2 * (c2 + x^2 * c3))
 */
static inline float32x4_t _vcos_poly_f32(float32x4_t x)
{
    const float32x4_t k1 = vdupq_n_f32(-0x1.fffff4p-2f);
    const float32x4_t k2 = vdupq_n_f32(+0x1.554a46p-5f);
    const float32x4_t k3 = vdupq_n_f32(-0x1.661be2p-10f);
    const float32x4_t one = vdupq_n_f32(1.0f);

    const float32x4_t x_sq = vmulq_f32(x, x);

    // Horner chain in x^2 anchored at 1
    float32x4_t inner = vmlaq_f32(k2, x_sq, k3);
    inner = vmlaq_f32(k1, x_sq, inner);
    return vmlaq_f32(one, x_sq, inner);
}
409
/*
 * Polynomial coefficients for log2(x)/(x-1) on [1, 2]
 * Generated with Sollya: remez(log2(x)/(x-1), 6, [1+1b-20, 2])
 * Max error: ~1.55e-6
 *
 * Usage: log2(x) ≈ poly(x) * (x - 1) for x ∈ [1, 2]
 * Polynomial evaluated via Horner's method
 */
static inline float32x4_t _vlog2_poly_f32(float32x4_t x)
{
    // Coefficients c0..c6, low order first
    const float32x4_t k[7] = {
        vdupq_n_f32(+0x1.a8a726p+1f), vdupq_n_f32(-0x1.0b7f7ep+2f),
        vdupq_n_f32(+0x1.05d9ccp+2f), vdupq_n_f32(-0x1.4d476cp+1f),
        vdupq_n_f32(+0x1.04fc3ap+0f), vdupq_n_f32(-0x1.c97982p-3f),
        vdupq_n_f32(+0x1.57aa42p-6f),
    };

    // Horner's method: k0 + x*(k1 + x*(k2 + ...))
    float32x4_t poly = k[6];
    for (int i = 5; i >= 0; i--) {
        poly = vmlaq_f32(k[i], poly, x);
    }
    return poly;
}
438
439#ifdef LV_HAVE_NEONV8
/* ARMv8 NEON arctan polynomial on [-1, 1]: x * Q(x^2) with odd-power
 * minimax coefficients, Horner evaluation using fused multiply-adds. */
static inline float32x4_t _varctan_poly_neonv8(float32x4_t x)
{
    // Coefficients of x^1, x^3, ..., x^13
    const float32x4_t a1 = vdupq_n_f32(+0x1.ffffeap-1f);
    const float32x4_t a3 = vdupq_n_f32(-0x1.55437p-2f);
    const float32x4_t a5 = vdupq_n_f32(+0x1.972be6p-3f);
    const float32x4_t a7 = vdupq_n_f32(-0x1.1436ap-3f);
    const float32x4_t a9 = vdupq_n_f32(+0x1.5785aap-4f);
    const float32x4_t a11 = vdupq_n_f32(-0x1.2f3004p-5f);
    const float32x4_t a13 = vdupq_n_f32(+0x1.01a37cp-7f);

    const float32x4_t x_sq = vmulq_f32(x, x);
    // Horner in u = x^2, highest coefficient first
    float32x4_t q = vfmaq_f32(a11, x_sq, a13);
    q = vfmaq_f32(a9, x_sq, q);
    q = vfmaq_f32(a7, x_sq, q);
    q = vfmaq_f32(a5, x_sq, q);
    q = vfmaq_f32(a3, x_sq, q);
    q = vfmaq_f32(a1, x_sq, q);
    return vmulq_f32(x, q);
}
463
/* NEONv8 sin polynomial on [-pi/4, pi/4], coeffs via Remez (Sollya):
 * sin(x) = x + x^3 * (s1 + x^2 * (s2 + x^2 * s3)), fused multiply-adds. */
static inline float32x4_t _vsin_poly_neonv8(float32x4_t x)
{
    const float32x4_t k1 = vdupq_n_f32(-0x1.555552p-3f);
    const float32x4_t k2 = vdupq_n_f32(+0x1.110be2p-7f);
    const float32x4_t k3 = vdupq_n_f32(-0x1.9ab22ap-13f);

    const float32x4_t x_sq = vmulq_f32(x, x);
    const float32x4_t x_cube = vmulq_f32(x_sq, x);

    // Inner Horner chain in x^2, then fold in the leading x term
    float32x4_t inner = vfmaq_f32(k2, x_sq, k3);
    inner = vfmaq_f32(k1, x_sq, inner);
    return vfmaq_f32(x, x_cube, inner);
}
478
/* NEONv8 cos polynomial on [-pi/4, pi/4], coeffs via Remez (Sollya):
 * cos(x) = 1 + x^2 * (c1 + x^2 * (c2 + x^2 * c3)), fused multiply-adds. */
static inline float32x4_t _vcos_poly_neonv8(float32x4_t x)
{
    const float32x4_t k1 = vdupq_n_f32(-0x1.fffff4p-2f);
    const float32x4_t k2 = vdupq_n_f32(+0x1.554a46p-5f);
    const float32x4_t k3 = vdupq_n_f32(-0x1.661be2p-10f);
    const float32x4_t one = vdupq_n_f32(1.0f);

    const float32x4_t x_sq = vmulq_f32(x, x);

    // Horner chain in x^2 anchored at 1
    float32x4_t inner = vfmaq_f32(k2, x_sq, k3);
    inner = vfmaq_f32(k1, x_sq, inner);
    return vfmaq_f32(one, x_sq, inner);
}
493
/*
 * NEONv8 log2 polynomial on [1, 2] using fused multiply-adds.
 * log2(x) ≈ poly(x) * (x - 1)
 * Max error: ~1.55e-6
 */
static inline float32x4_t _vlog2_poly_neonv8(float32x4_t x)
{
    // Coefficients c0..c6, low order first
    const float32x4_t k[7] = {
        vdupq_n_f32(+0x1.a8a726p+1f), vdupq_n_f32(-0x1.0b7f7ep+2f),
        vdupq_n_f32(+0x1.05d9ccp+2f), vdupq_n_f32(-0x1.4d476cp+1f),
        vdupq_n_f32(+0x1.04fc3ap+0f), vdupq_n_f32(-0x1.c97982p-3f),
        vdupq_n_f32(+0x1.57aa42p-6f),
    };

    // Horner's method with FMA: k0 + x*(k1 + x*(k2 + ...))
    float32x4_t poly = k[6];
    for (int i = 5; i >= 0; i--) {
        poly = vfmaq_f32(k[i], poly, x);
    }
    return poly;
}
519#endif /* LV_HAVE_NEONV8 */
520
521#endif /* INCLUDE_VOLK_VOLK_NEON_INTRINSICS_H_ */