Vector Optimized Library of Kernels 3.3.0
Architecture-tuned implementations of math kernels
Loading...
Searching...
No Matches
volk_sse_intrinsics.h
Go to the documentation of this file.
1/* -*- c++ -*- */
2/*
3 * Copyright 2015 Free Software Foundation, Inc.
4 * Copyright 2023-2026 Magnus Lundmark <magnuslundmark@gmail.com>
5 *
6 * This file is part of VOLK
7 *
8 * SPDX-License-Identifier: LGPL-3.0-or-later
9 */
10
11/*
12 * This file is intended to hold SSE intrinsics of intrinsics.
13 * They should be used in VOLK kernels to avoid copy-pasta.
14 */
15
16#ifndef INCLUDE_VOLK_VOLK_SSE_INTRINSICS_H_
17#define INCLUDE_VOLK_VOLK_SSE_INTRINSICS_H_
18#include <emmintrin.h>
19#include <xmmintrin.h>
20
21/*
22 * Newton-Raphson refined reciprocal square root: 1/sqrt(a)
23 * One iteration doubles precision from ~12-bit to ~24-bit
24 * x1 = x0 * (1.5 - 0.5 * a * x0^2)
25 * Handles edge cases: +0 → +Inf, +Inf → 0
26 */
27static inline __m128 _mm_rsqrt_nr_ps(const __m128 a)
28{
29 const __m128 HALF = _mm_set1_ps(0.5f);
30 const __m128 THREE_HALFS = _mm_set1_ps(1.5f);
31
32 const __m128 x0 = _mm_rsqrt_ps(a); // +Inf for +0, 0 for +Inf
33
34 // Newton-Raphson: x1 = x0 * (1.5 - 0.5 * a * x0^2)
35 __m128 x1 = _mm_mul_ps(
36 x0, _mm_sub_ps(THREE_HALFS, _mm_mul_ps(HALF, _mm_mul_ps(_mm_mul_ps(x0, x0), a))));
37
38 // For +0 and +Inf inputs, x0 is correct but NR produces NaN due to Inf*0
39 // Blend: use x0 where a == +0 or a == +Inf, else use x1
40 __m128i a_si = _mm_castps_si128(a);
41 __m128i zero_mask = _mm_cmpeq_epi32(a_si, _mm_setzero_si128());
42 __m128i inf_mask = _mm_cmpeq_epi32(a_si, _mm_set1_epi32(0x7F800000));
43 __m128 special_mask = _mm_castsi128_ps(_mm_or_si128(zero_mask, inf_mask));
44 // SSE2-compatible blend: (x0 & mask) | (x1 & ~mask)
45 return _mm_or_ps(_mm_and_ps(special_mask, x0), _mm_andnot_ps(special_mask, x1));
46}
47
48/*
49 * Approximate arctan(x) via polynomial expansion
50 * on the interval [-1, 1]
51 *
52 * Maximum relative error ~6.5e-7
53 * Polynomial evaluated via Horner's method
54 */
55static inline __m128 _mm_arctan_poly_sse(const __m128 x)
56{
57 const __m128 a1 = _mm_set1_ps(+0x1.ffffeap-1f);
58 const __m128 a3 = _mm_set1_ps(-0x1.55437p-2f);
59 const __m128 a5 = _mm_set1_ps(+0x1.972be6p-3f);
60 const __m128 a7 = _mm_set1_ps(-0x1.1436ap-3f);
61 const __m128 a9 = _mm_set1_ps(+0x1.5785aap-4f);
62 const __m128 a11 = _mm_set1_ps(-0x1.2f3004p-5f);
63 const __m128 a13 = _mm_set1_ps(+0x1.01a37cp-7f);
64
65 const __m128 x_times_x = _mm_mul_ps(x, x);
66 __m128 arctan;
67 arctan = a13;
68 arctan = _mm_mul_ps(x_times_x, arctan);
69 arctan = _mm_add_ps(arctan, a11);
70 arctan = _mm_mul_ps(x_times_x, arctan);
71 arctan = _mm_add_ps(arctan, a9);
72 arctan = _mm_mul_ps(x_times_x, arctan);
73 arctan = _mm_add_ps(arctan, a7);
74 arctan = _mm_mul_ps(x_times_x, arctan);
75 arctan = _mm_add_ps(arctan, a5);
76 arctan = _mm_mul_ps(x_times_x, arctan);
77 arctan = _mm_add_ps(arctan, a3);
78 arctan = _mm_mul_ps(x_times_x, arctan);
79 arctan = _mm_add_ps(arctan, a1);
80 arctan = _mm_mul_ps(x, arctan);
81
82 return arctan;
83}
84
85/*
86 * Approximate arcsin(x) via polynomial expansion
87 * P(u) such that asin(x) = x * P(x^2) on |x| <= 0.5
88 *
89 * Maximum relative error ~1.5e-6
90 * Polynomial evaluated via Horner's method
91 */
92static inline __m128 _mm_arcsin_poly_sse(const __m128 x)
93{
94 const __m128 c0 = _mm_set1_ps(0x1.ffffcep-1f);
95 const __m128 c1 = _mm_set1_ps(0x1.55b648p-3f);
96 const __m128 c2 = _mm_set1_ps(0x1.24d192p-4f);
97 const __m128 c3 = _mm_set1_ps(0x1.0a788p-4f);
98
99 const __m128 u = _mm_mul_ps(x, x);
100 __m128 p = c3;
101 p = _mm_mul_ps(u, p);
102 p = _mm_add_ps(p, c2);
103 p = _mm_mul_ps(u, p);
104 p = _mm_add_ps(p, c1);
105 p = _mm_mul_ps(u, p);
106 p = _mm_add_ps(p, c0);
107
108 return _mm_mul_ps(x, p);
109}
110
111static inline __m128 _mm_magnitudesquared_ps(__m128 cplxValue1, __m128 cplxValue2)
112{
113 __m128 iValue, qValue;
114 // Arrange in i1i2i3i4 format
115 iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2, 0, 2, 0));
116 // Arrange in q1q2q3q4 format
117 qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3, 1, 3, 1));
118 iValue = _mm_mul_ps(iValue, iValue); // Square the I values
119 qValue = _mm_mul_ps(qValue, qValue); // Square the Q Values
120 return _mm_add_ps(iValue, qValue); // Add the I2 and Q2 values
121}
122
123static inline __m128 _mm_magnitude_ps(__m128 cplxValue1, __m128 cplxValue2)
124{
125 return _mm_sqrt_ps(_mm_magnitudesquared_ps(cplxValue1, cplxValue2));
126}
127
128static inline __m128 _mm_scaled_norm_dist_ps_sse(const __m128 symbols0,
129 const __m128 symbols1,
130 const __m128 points0,
131 const __m128 points1,
132 const __m128 scalar)
133{
134 // calculate scalar * |x - y|^2
135 const __m128 diff0 = _mm_sub_ps(symbols0, points0);
136 const __m128 diff1 = _mm_sub_ps(symbols1, points1);
137 const __m128 norms = _mm_magnitudesquared_ps(diff0, diff1);
138 return _mm_mul_ps(norms, scalar);
139}
140
141static inline __m128 _mm_accumulate_square_sum_ps(
142 __m128 sq_acc, __m128 acc, __m128 val, __m128 rec, __m128 aux)
143{
144 aux = _mm_mul_ps(aux, val);
145 aux = _mm_sub_ps(aux, acc);
146 aux = _mm_mul_ps(aux, aux);
147 aux = _mm_mul_ps(aux, rec);
148 return _mm_add_ps(sq_acc, aux);
149}
150
151/*
152 * Minimax polynomial for sin(x) on [-pi/4, pi/4]
153 * Coefficients via Remez algorithm (Sollya)
154 * Max |error| < 7.3e-9
155 * sin(x) = x + x^3 * (s1 + x^2 * (s2 + x^2 * s3))
156 */
157static inline __m128 _mm_sin_poly_sse(const __m128 x)
158{
159 const __m128 s1 = _mm_set1_ps(-0x1.555552p-3f);
160 const __m128 s2 = _mm_set1_ps(+0x1.110be2p-7f);
161 const __m128 s3 = _mm_set1_ps(-0x1.9ab22ap-13f);
162
163 const __m128 x2 = _mm_mul_ps(x, x);
164 const __m128 x3 = _mm_mul_ps(x2, x);
165
166 __m128 poly = _mm_add_ps(_mm_mul_ps(x2, s3), s2);
167 poly = _mm_add_ps(_mm_mul_ps(x2, poly), s1);
168 return _mm_add_ps(_mm_mul_ps(x3, poly), x);
169}
170
171/*
172 * Minimax polynomial for cos(x) on [-pi/4, pi/4]
173 * Coefficients via Remez algorithm (Sollya)
174 * Max |error| < 1.1e-7
175 * cos(x) = 1 + x^2 * (c1 + x^2 * (c2 + x^2 * c3))
176 */
177static inline __m128 _mm_cos_poly_sse(const __m128 x)
178{
179 const __m128 c1 = _mm_set1_ps(-0x1.fffff4p-2f);
180 const __m128 c2 = _mm_set1_ps(+0x1.554a46p-5f);
181 const __m128 c3 = _mm_set1_ps(-0x1.661be2p-10f);
182 const __m128 one = _mm_set1_ps(1.0f);
183
184 const __m128 x2 = _mm_mul_ps(x, x);
185
186 __m128 poly = _mm_add_ps(_mm_mul_ps(x2, c3), c2);
187 poly = _mm_add_ps(_mm_mul_ps(x2, poly), c1);
188 return _mm_add_ps(_mm_mul_ps(x2, poly), one);
189}
190
191/*
192 * Polynomial coefficients for log2(x)/(x-1) on [1, 2]
193 * Generated with Sollya: remez(log2(x)/(x-1), 6, [1+1b-20, 2])
194 * Max error: ~1.55e-6
195 *
196 * Usage: log2(x) ≈ poly(x) * (x - 1) for x ∈ [1, 2]
197 * Polynomial evaluated via Horner's method
198 */
199static inline __m128 _mm_log2_poly_sse(const __m128 x)
200{
201 const __m128 c0 = _mm_set1_ps(+0x1.a8a726p+1f);
202 const __m128 c1 = _mm_set1_ps(-0x1.0b7f7ep+2f);
203 const __m128 c2 = _mm_set1_ps(+0x1.05d9ccp+2f);
204 const __m128 c3 = _mm_set1_ps(-0x1.4d476cp+1f);
205 const __m128 c4 = _mm_set1_ps(+0x1.04fc3ap+0f);
206 const __m128 c5 = _mm_set1_ps(-0x1.c97982p-3f);
207 const __m128 c6 = _mm_set1_ps(+0x1.57aa42p-6f);
208
209 // Horner's method: c0 + x*(c1 + x*(c2 + ...))
210 __m128 poly = c6;
211 poly = _mm_add_ps(_mm_mul_ps(poly, x), c5);
212 poly = _mm_add_ps(_mm_mul_ps(poly, x), c4);
213 poly = _mm_add_ps(_mm_mul_ps(poly, x), c3);
214 poly = _mm_add_ps(_mm_mul_ps(poly, x), c2);
215 poly = _mm_add_ps(_mm_mul_ps(poly, x), c1);
216 poly = _mm_add_ps(_mm_mul_ps(poly, x), c0);
217 return poly;
218}
219
220#endif /* INCLUDE_VOLK_VOLK_SSE_INTRINSICS_H_ */