Vector Optimized Library of Kernels 3.3.0
Architecture-tuned implementations of math kernels
Loading...
Searching...
No Matches
volk_common.h
Go to the documentation of this file.
1/* -*- c++ -*- */
2/*
3 * Copyright 2010, 2011, 2015-2017, 2019, 2020 Free Software Foundation, Inc.
4 * Copyright 2023 - 2025 Magnus Lundmark <magnuslundmark@gmail.com>
5 *
6 * This file is part of VOLK
7 *
8 * SPDX-License-Identifier: LGPL-3.0-or-later
9 */
10
11#ifndef INCLUDED_LIBVOLK_COMMON_H
12#define INCLUDED_LIBVOLK_COMMON_H
13
15// Cross-platform attribute macros
17#if _MSC_VER
18#define __VOLK_ATTR_ALIGNED(x) __declspec(align(x))
19#define __VOLK_ATTR_UNUSED
20#define __VOLK_ATTR_INLINE __forceinline
21#define __VOLK_ATTR_DEPRECATED __declspec(deprecated)
22#define __VOLK_ATTR_EXPORT __declspec(dllexport)
23#define __VOLK_ATTR_IMPORT __declspec(dllimport)
24#define __VOLK_PREFETCH(addr)
25#define __VOLK_ASM __asm
26#elif defined(__clang__)
27// AppleClang also defines __GNUC__, so do this check first. These
28// will probably be the same as for __GNUC__, but let's keep them
29// separate just to be safe.
30#define __VOLK_ATTR_ALIGNED(x) __attribute__((aligned(x)))
31#define __VOLK_ATTR_UNUSED __attribute__((unused))
32#define __VOLK_ATTR_INLINE __attribute__((always_inline))
33#define __VOLK_ATTR_DEPRECATED __attribute__((deprecated))
34#define __VOLK_ASM __asm__
35#define __VOLK_ATTR_EXPORT __attribute__((visibility("default")))
36#define __VOLK_ATTR_IMPORT __attribute__((visibility("default")))
37#define __VOLK_PREFETCH(addr) __builtin_prefetch(addr)
38#elif defined __GNUC__
39#define __VOLK_ATTR_ALIGNED(x) __attribute__((aligned(x)))
40#define __VOLK_ATTR_UNUSED __attribute__((unused))
41#define __VOLK_ATTR_INLINE __attribute__((always_inline))
42#define __VOLK_ATTR_DEPRECATED __attribute__((deprecated))
43#define __VOLK_ASM __asm__
44#if __GNUC__ >= 4
45#define __VOLK_ATTR_EXPORT __attribute__((visibility("default")))
46#define __VOLK_ATTR_IMPORT __attribute__((visibility("default")))
47#else
48#define __VOLK_ATTR_EXPORT
49#define __VOLK_ATTR_IMPORT
50#endif
51#define __VOLK_PREFETCH(addr) __builtin_prefetch(addr)
52#elif _MSC_VER
53#define __VOLK_ATTR_ALIGNED(x) __declspec(align(x))
54#define __VOLK_ATTR_UNUSED
55#define __VOLK_ATTR_INLINE __forceinline
56#define __VOLK_ATTR_DEPRECATED __declspec(deprecated)
57#define __VOLK_ATTR_EXPORT __declspec(dllexport)
58#define __VOLK_ATTR_IMPORT __declspec(dllimport)
59#define __VOLK_PREFETCH(addr)
60#define __VOLK_ASM __asm
61#else
62#define __VOLK_ATTR_ALIGNED(x)
63#define __VOLK_ATTR_UNUSED
64#define __VOLK_ATTR_INLINE
65#define __VOLK_ATTR_DEPRECATED
66#define __VOLK_ATTR_EXPORT
67#define __VOLK_ATTR_IMPORT
68#define __VOLK_PREFETCH(addr)
69#define __VOLK_ASM __asm__
70#endif
71
73// Ignore annoying warnings in MSVC
75#if defined(_MSC_VER)
76#pragma warning(disable : 4244) //'conversion' conversion from 'type1' to 'type2',
77 // possible loss of data
78#pragma warning(disable : 4305) //'identifier' : truncation from 'type1' to 'type2'
79#endif
80
82// C-linkage declaration macros
83// FIXME: due to the usage of complex.h, require gcc for c-linkage
85#if defined(__cplusplus) && (__GNUC__)
86#define __VOLK_DECL_BEGIN extern "C" {
87#define __VOLK_DECL_END }
88#else
89#define __VOLK_DECL_BEGIN
90#define __VOLK_DECL_END
91#endif
92
94// Define VOLK_API for library symbols
95// https://gcc.gnu.org/wiki/Visibility
97#ifdef volk_EXPORTS
98#define VOLK_API __VOLK_ATTR_EXPORT
99#else
100#define VOLK_API __VOLK_ATTR_IMPORT
101#endif
102
104// The bit128 union used by some
106#include <stdint.h>
107
108#ifdef LV_HAVE_SSE
109#ifdef _WIN32
110#include <intrin.h>
111#else
112#include <x86intrin.h>
113#endif
114#endif
115
116union bit128 {
117 uint8_t i8[16];
118 uint16_t i16[8];
119 uint32_t i[4];
120 float f[4];
121 double d[2];
122
123#ifdef LV_HAVE_SSE
124 __m128 float_vec;
125#endif
126
127#ifdef LV_HAVE_SSE2
128 __m128i int_vec;
129 __m128d double_vec;
130#endif
131};
132
133union bit256 {
134 uint8_t i8[32];
135 uint16_t i16[16];
136 uint32_t i[8];
137 float f[8];
138 double d[4];
139
140#ifdef LV_HAVE_AVX
141 __m256 float_vec;
142 __m256i int_vec;
143 __m256d double_vec;
144#endif
145};
146
147#define bit128_p(x) ((union bit128*)(x))
148#define bit256_p(x) ((union bit256*)(x))
149
151// log2f
153#include <math.h>
154// +-Inf -> +-127.0f in order to match the behaviour of the SIMD kernels
155// NaN -> NaN (preserved for consistency)
156static inline float log2f_non_ieee(float f)
157{
158 float const result = log2f(f);
159 // Return NaN for NaN inputs or negative values (preserves IEEE behavior for invalid
160 // inputs)
161 if (isnan(result))
162 return result;
163 // Map ±Inf to ±127.0f to match SIMD kernel behavior
164 return isinf(result) ? copysignf(127.0f, result) : result;
165}
166
168// Constant used to do log10 calculations as faster log2
170// precalculated 10.0 / log2f_non_ieee(10.0) to allow for constexpr
171#define volk_log2to10factor (0x1.815182p1) // 3.01029995663981209120
172
174// arctan(x) polynomial expansion
176static inline float volk_arctan_poly(const float x)
177{
178 /*
179 * arctan(x) polynomial expansion on the interval [-1, 1]
180 * Maximum relative error < 6.6e-7
181 */
182 const float a1 = +0x1.ffffeap-1f;
183 const float a3 = -0x1.55437p-2f;
184 const float a5 = +0x1.972be6p-3f;
185 const float a7 = -0x1.1436ap-3f;
186 const float a9 = +0x1.5785aap-4f;
187 const float a11 = -0x1.2f3004p-5f;
188 const float a13 = +0x1.01a37cp-7f;
189
190 const float x_times_x = x * x;
191 float arctan = a13;
192 arctan = fmaf(x_times_x, arctan, a11);
193 arctan = fmaf(x_times_x, arctan, a9);
194 arctan = fmaf(x_times_x, arctan, a7);
195 arctan = fmaf(x_times_x, arctan, a5);
196 arctan = fmaf(x_times_x, arctan, a3);
197 arctan = fmaf(x_times_x, arctan, a1);
198 arctan *= x;
199
200 return arctan;
201}
202
203// sin(x) polynomial expansion
205static inline float volk_sin_poly(const float x)
206{
207 /*
208 * Minimax polynomial for sin(x) on [-pi/4, pi/4]
209 * Coefficients via Remez algorithm (Sollya)
210 * Max |error| < 7.3e-9
211 * sin(x) = x + x^3 * (s1 + x^2 * (s2 + x^2 * s3))
212 */
213 const float s1 = -0x1.555552p-3f;
214 const float s2 = +0x1.110be2p-7f;
215 const float s3 = -0x1.9ab22ap-13f;
216
217 const float x2 = x * x;
218 const float x3 = x2 * x;
219
220 float poly = fmaf(x2, s3, s2);
221 poly = fmaf(x2, poly, s1);
222 return fmaf(x3, poly, x);
223}
224
225// cos(x) polynomial expansion
227static inline float volk_cos_poly(const float x)
228{
229 /*
230 * Minimax polynomial for cos(x) on [-pi/4, pi/4]
231 * Coefficients via Remez algorithm (Sollya)
232 * Max |error| < 1.1e-7
233 * cos(x) = 1 + x^2 * (c1 + x^2 * (c2 + x^2 * c3))
234 */
235 const float c1 = -0x1.fffff4p-2f;
236 const float c2 = +0x1.554a46p-5f;
237 const float c3 = -0x1.661be2p-10f;
238
239 const float x2 = x * x;
240
241 float poly = fmaf(x2, c3, c2);
242 poly = fmaf(x2, poly, c1);
243 return fmaf(x2, poly, 1.0f);
244}
245
246// sin(x) with Cody-Waite argument reduction
248static inline float volk_sin(const float x)
249{
250 /*
251 * Cody-Waite argument reduction: n = round(x * 2/pi), r = x - n * pi/2
252 * Then use sin/cos polynomials based on quadrant
253 */
254 const float two_over_pi = 0x1.45f306p-1f;
255 const float pi_over_2_hi = 0x1.921fb6p+0f;
256 const float pi_over_2_lo = -0x1.777a5cp-25f;
257
258 float n_f = rintf(x * two_over_pi);
259 int n = (int)n_f;
260
261 float r = fmaf(-n_f, pi_over_2_hi, x);
262 r = fmaf(-n_f, pi_over_2_lo, r);
263
264 float sin_r = volk_sin_poly(r);
265 float cos_r = volk_cos_poly(r);
266
267 // Quadrant selection: n&1 swaps sin/cos, n&2 negates
268 float result = (n & 1) ? cos_r : sin_r;
269 return (n & 2) ? -result : result;
270}
271
272// cos(x) with Cody-Waite argument reduction
274static inline float volk_cos(const float x)
275{
276 /*
277 * Cody-Waite argument reduction: n = round(x * 2/pi), r = x - n * pi/2
278 * Then use sin/cos polynomials based on quadrant
279 */
280 const float two_over_pi = 0x1.45f306p-1f;
281 const float pi_over_2_hi = 0x1.921fb6p+0f;
282 const float pi_over_2_lo = -0x1.777a5cp-25f;
283
284 float n_f = rintf(x * two_over_pi);
285 int n = (int)n_f;
286
287 float r = fmaf(-n_f, pi_over_2_hi, x);
288 r = fmaf(-n_f, pi_over_2_lo, r);
289
290 float sin_r = volk_sin_poly(r);
291 float cos_r = volk_cos_poly(r);
292
293 // Quadrant selection: n&1 swaps sin/cos, (n+1)&2 negates
294 float result = (n & 1) ? sin_r : cos_r;
295 return ((n + 1) & 2) ? -result : result;
296}
297
298// arctan(x)
300static inline float volk_arctan(const float x)
301{
302 /*
303 * arctan(x) + arctan(1 / x) == sign(x) * pi / 2
304 */
305 const float pi_2 = 0x1.921fb6p0f;
306
307 // Propagate NaN
308 if (isnan(x)) {
309 return x;
310 }
311
312 // arctan(±∞) = ±π/2
313 if (isinf(x)) {
314 return copysignf(pi_2, x);
315 }
316
317 if (fabs(x) < 1.f) {
318 return volk_arctan_poly(x);
319 } else {
320 return copysignf(pi_2, x) - volk_arctan_poly(1.f / x);
321 }
322}
323
324// arctan2(y, x)
326static inline float volk_atan2(const float y, const float x)
327{
328 /*
329 * / arctan(y / x) if x > 0
330 * | arctan(y / x) + PI if x < 0 and y >= 0
331 * atan2(y, x) = | arctan(y / x) - PI if x < 0 and y < 0
332 * | sign(y) * PI / 2 if x = 0
333 * \ undefined if x = 0 and y = 0
334 * atan2f(0.f, 0.f) shall return 0.f
335 * atan2f(0.f, -0.f) shall return -0.f
336 */
337 const float pi = 0x1.921fb6p1f;
338 const float pi_2 = 0x1.921fb6p0f;
339
340 // Propagate NaN from inputs
341 if (isnan(x) || isnan(y)) {
342 return x + y;
343 }
344
345 // Handle infinity cases per IEEE 754
346 if (isinf(y)) {
347 if (isinf(x)) {
348 // Both infinite: atan2(±∞, ±∞) = ±π/4 or ±3π/4
349 const float angle = (x > 0.f) ? (pi_2 / 2.f) : (3.f * pi_2 / 2.f);
350 return copysignf(angle, y);
351 } else {
352 // y infinite, x finite: atan2(±∞, x) = ±π/2
353 return copysignf(pi_2, y);
354 }
355 }
356 if (isinf(x)) {
357 // x infinite, y finite: atan2(y, +∞) = ±0, atan2(y, -∞) = ±π
358 return (x > 0.f) ? copysignf(0.f, y) : copysignf(pi, y);
359 }
360
361 if (fabs(x) == 0.f) {
362 return (fabs(y) == 0.f) ? copysignf(0.f, y) : copysignf(pi_2, y);
363 }
364 const int swap = fabs(x) < fabs(y);
365 const float numerator = swap ? x : y;
366 const float denominator = swap ? y : x;
367 float input = numerator / denominator;
368
369 if (isnan(input)) {
370 input = numerator;
371 }
372
373 float result = volk_arctan_poly(input);
374 result = swap ? (input >= 0.f ? pi_2 : -pi_2) - result : result;
375 if (x < 0.f) {
376 result += copysignf(pi, y);
377 }
378 return result;
379}
380
382// arcsin(x) polynomial expansion
383// P(u) such that asin(x) = x * P(x^2) on |x| <= 0.5
384// Maximum relative error ~1.5e-6
386static inline float volk_arcsin_poly(const float x)
387{
388 const float c0 = 0x1.ffffcep-1f;
389 const float c1 = 0x1.55b648p-3f;
390 const float c2 = 0x1.24d192p-4f;
391 const float c3 = 0x1.0a788p-4f;
392
393 const float u = x * x;
394 float p = c3;
395 p = fmaf(u, p, c2);
396 p = fmaf(u, p, c1);
397 p = fmaf(u, p, c0);
398
399 return x * p;
400}
401
402// arcsin(x) using two-range algorithm
404static inline float volk_arcsin(const float x)
405{
406 const float pi_2 = 0x1.921fb6p0f;
407
408 const float ax = fabsf(x);
409 if (ax <= 0.5f) {
410 // Small argument: direct polynomial
411 return volk_arcsin_poly(x);
412 } else {
413 // Large argument: use identity asin(x) = pi/2 - 2*asin(sqrt((1-|x|)/2))
414 const float t = (1.0f - ax) * 0.5f;
415 const float s = sqrtf(t);
416 const float inner = volk_arcsin_poly(s);
417 const float result = pi_2 - 2.0f * inner;
418 return copysignf(result, x);
419 }
420}
421
422// arccos(x) = pi/2 - arcsin(x)
424static inline float volk_arccos(const float x)
425{
426 const float pi_2 = 0x1.921fb6p0f;
427 return pi_2 - volk_arcsin(x);
428}
429
430#endif /*INCLUDED_LIBVOLK_COMMON_H*/