Vector Optimized Library of Kernels 3.3.0
Architecture-tuned implementations of math kernels
volk_avx2_intrinsics.h
/* -*- c++ -*- */
/*
 * Copyright 2015 Free Software Foundation, Inc.
 * Copyright 2023 Magnus Lundmark <magnuslundmark@gmail.com>
 *
 * This file is part of VOLK
 *
 * SPDX-License-Identifier: LGPL-3.0-or-later
 */

/*
 * This file is intended to hold composite AVX2 intrinsics, i.e. helpers
 * built from raw intrinsics. They should be used in VOLK kernels to avoid
 * copy-paste.
 */

#ifndef INCLUDE_VOLK_VOLK_AVX2_INTRINSICS_H_
#define INCLUDE_VOLK_VOLK_AVX2_INTRINSICS_H_
#include "volk_avx_intrinsics.h" // for _mm256_polar_deinterleave()
#include <immintrin.h>

/*
 * Newton-Raphson refined reciprocal square root: 1/sqrt(a)
 * AVX2 version with native 256-bit integer operations for edge case handling
 * Handles edge cases: +0 → +Inf, +Inf → 0
 */
static inline __m256 _mm256_rsqrt_nr_avx2(const __m256 a)
{
    const __m256 HALF = _mm256_set1_ps(0.5f);
    const __m256 THREE_HALFS = _mm256_set1_ps(1.5f);

    const __m256 x0 = _mm256_rsqrt_ps(a); // +Inf for +0, 0 for +Inf

    // Newton-Raphson: x1 = x0 * (1.5 - 0.5 * a * x0^2)
    __m256 x1 = _mm256_mul_ps(
        x0,
        _mm256_sub_ps(THREE_HALFS,
                      _mm256_mul_ps(HALF, _mm256_mul_ps(_mm256_mul_ps(x0, x0), a))));

    // For +0 and +Inf inputs, x0 is correct but NR produces NaN due to Inf*0
    // AVX2: native 256-bit integer compare
    __m256i a_si = _mm256_castps_si256(a);
    __m256i zero_mask = _mm256_cmpeq_epi32(a_si, _mm256_setzero_si256());
    __m256i inf_mask = _mm256_cmpeq_epi32(a_si, _mm256_set1_epi32(0x7F800000));
    __m256 special_mask = _mm256_castsi256_ps(_mm256_or_si256(zero_mask, inf_mask));
    return _mm256_blendv_ps(x1, x0, special_mask);
}
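
/*
 * Illustrative usage sketch (not part of VOLK): a minimal kernel-style loop
 * that applies _mm256_rsqrt_nr_avx2() to an array. The function name and the
 * unaligned-load strategy are assumptions for illustration only; real VOLK
 * kernels also provide aligned and scalar code paths.
 */
static inline void
example_32f_invsqrt_32f_u_avx2(float* out, const float* in, unsigned int num_points)
{
    const unsigned int eighth_points = num_points / 8;
    for (unsigned int number = 0; number < eighth_points; number++) {
        __m256 a = _mm256_loadu_ps(in);
        _mm256_storeu_ps(out, _mm256_rsqrt_nr_avx2(a));
        in += 8;
        out += 8;
    }
    // scalar tail; assumes <math.h> is available for sqrtf()
    for (unsigned int number = eighth_points * 8; number < num_points; number++) {
        *out++ = 1.0f / sqrtf(*in++);
    }
}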

// Returns the real parts of 8 complex floats in natural order; z1 holds
// complex numbers 0..3 (interleaved re, im) and z2 holds complex numbers 4..7.
static inline __m256 _mm256_real(const __m256 z1, const __m256 z2)
{
    const __m256i permute_mask = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
    __m256 r = _mm256_shuffle_ps(z1, z2, _MM_SHUFFLE(2, 0, 2, 0));
    return _mm256_permutevar8x32_ps(r, permute_mask);
}

// Returns the imaginary parts of 8 complex floats in natural order; same
// input layout as _mm256_real().
static inline __m256 _mm256_imag(const __m256 z1, const __m256 z2)
{
    const __m256i permute_mask = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
    __m256 i = _mm256_shuffle_ps(z1, z2, _MM_SHUFFLE(3, 1, 3, 1));
    return _mm256_permutevar8x32_ps(i, permute_mask);
}
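
/*
 * Illustrative sketch (not part of VOLK): deinterleaving 8 complex floats
 * with the two helpers above. 'in' points at interleaved (re, im) pairs; the
 * function name and the unaligned loads are assumptions for illustration.
 */
static inline void example_deinterleave_8_cf(const float* in, __m256* re, __m256* im)
{
    const __m256 z1 = _mm256_loadu_ps(in);     // re0 im0 re1 im1 re2 im2 re3 im3
    const __m256 z2 = _mm256_loadu_ps(in + 8); // re4 im4 re5 im5 re6 im6 re7 im7
    *re = _mm256_real(z1, z2);                 // re0 re1 ... re7
    *im = _mm256_imag(z1, z2);                 // im0 im1 ... im7
}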

// Broadcasts the 8 frozen-bit bytes in fbits into the sign-bit position of
// the corresponding 32-bit lanes: dword i of the result is 0x80000000 when
// byte i of fbits is positive, 0 otherwise.
static inline __m256 _mm256_polar_sign_mask_avx2(__m128i fbits)
{
    const __m128i zeros = _mm_set1_epi8(0x00);
    const __m128i sign_extract = _mm_set1_epi8(0x80);
    // Moves the frozen-bit byte for dword i into that dword's top byte
    // (byte 4*i+3 within each 128-bit lane); 0xff clears all other bytes.
    const __m256i shuffle_mask = _mm256_setr_epi8(
        0xff, 0xff, 0xff, 0x00, 0xff, 0xff, 0xff, 0x01,
        0xff, 0xff, 0xff, 0x02, 0xff, 0xff, 0xff, 0x03,
        0xff, 0xff, 0xff, 0x04, 0xff, 0xff, 0xff, 0x05,
        0xff, 0xff, 0xff, 0x06, 0xff, 0xff, 0xff, 0x07);
    __m256i sign_bits = _mm256_setzero_si256();

    fbits = _mm_cmpgt_epi8(fbits, zeros);
    fbits = _mm_and_si128(fbits, sign_extract);
    sign_bits = _mm256_insertf128_si256(sign_bits, fbits, 0);
    sign_bits = _mm256_insertf128_si256(sign_bits, fbits, 1);
    sign_bits = _mm256_shuffle_epi8(sign_bits, shuffle_mask);

    return _mm256_castsi256_ps(sign_bits);
}

static inline __m256
_mm256_polar_fsign_add_llrs_avx2(__m256 src0, __m256 src1, __m128i fbits)
{
    // prepare sign mask for correct +-
    __m256 sign_mask = _mm256_polar_sign_mask_avx2(fbits);

    __m256 llr0, llr1;
    _mm256_polar_deinterleave(&llr0, &llr1, src0, src1);

    // calculate result
    llr0 = _mm256_xor_ps(llr0, sign_mask);
    __m256 dst = _mm256_add_ps(llr0, llr1);
    return dst;
}
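
/*
 * Illustrative scalar reference (not part of VOLK) for the helper above,
 * assuming llr0/llr1 are one deinterleaved LLR pair and fbit the matching
 * byte of fbits: the sign of llr0 is flipped when fbit is positive, then the
 * pair is summed. The function name is hypothetical.
 */
static inline float example_polar_fsign_add_llr(float llr0, float llr1, signed char fbit)
{
    return (fbit > 0 ? -llr0 : llr0) + llr1;
}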

static inline __m256 _mm256_magnitudesquared_ps_avx2(const __m256 cplxValue0,
                                                     const __m256 cplxValue1)
{
    const __m256i idx = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
    const __m256 squared0 = _mm256_mul_ps(cplxValue0, cplxValue0); // Square the values
    const __m256 squared1 = _mm256_mul_ps(cplxValue1, cplxValue1); // Square the values
    const __m256 complex_result = _mm256_hadd_ps(squared0, squared1);
    // hadd sums re^2 + im^2 within 128-bit lanes; the permute restores natural order
    return _mm256_permutevar8x32_ps(complex_result, idx);
}

static inline __m256 _mm256_scaled_norm_dist_ps_avx2(const __m256 symbols0,
                                                     const __m256 symbols1,
                                                     const __m256 points0,
                                                     const __m256 points1,
                                                     const __m256 scalar)
{
    /*
     * Calculate: |y - x|^2 * SNR_lin
     * Consider 'symbolsX' and 'pointsX' to be complex float
     * 'symbolsX' are 'y' and 'pointsX' are 'x'
     */
    const __m256 diff0 = _mm256_sub_ps(symbols0, points0);
    const __m256 diff1 = _mm256_sub_ps(symbols1, points1);
    const __m256 norms = _mm256_magnitudesquared_ps_avx2(diff0, diff1);
    return _mm256_mul_ps(norms, scalar);
}
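
/*
 * Illustrative usage sketch (not part of VOLK): computing |y - x|^2 * scalar
 * for 8 complex numbers per iteration. 'symbols' and 'points' point at
 * interleaved (re, im) floats; the function name and the unaligned loads are
 * assumptions for illustration.
 */
static inline void example_scaled_norm_dist_u_avx2(float* out,
                                                   const float* symbols,
                                                   const float* points,
                                                   float snr_lin,
                                                   unsigned int num_complex)
{
    const __m256 scalar = _mm256_set1_ps(snr_lin);
    for (unsigned int i = 0; i < num_complex / 8; i++) {
        const __m256 s0 = _mm256_loadu_ps(symbols);     // symbols 0..3
        const __m256 s1 = _mm256_loadu_ps(symbols + 8); // symbols 4..7
        const __m256 p0 = _mm256_loadu_ps(points);
        const __m256 p1 = _mm256_loadu_ps(points + 8);
        _mm256_storeu_ps(out, _mm256_scaled_norm_dist_ps_avx2(s0, s1, p0, p1, scalar));
        symbols += 16;
        points += 16;
        out += 8;
    }
    /* scalar tail for num_complex % 8 omitted for brevity */
}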

/*
 * The function below vectorizes the inner loop of the following code:
 *
 * float max_values[8] = {0.f};
 * unsigned max_indices[8] = {0};
 * unsigned current_indices[8] = {0, 1, 2, 3, 4, 5, 6, 7};
 * for (unsigned i = 0; i < num_points / 8; ++i) {
 *     for (unsigned j = 0; j < 8; ++j) {
 *         float abs_squared = real(src0) * real(src0) + imag(src0) * imag(src0);
 *         bool compare = abs_squared > max_values[j];
 *         max_values[j] = compare ? abs_squared : max_values[j];
 *         max_indices[j] = compare ? current_indices[j] : max_indices[j];
 *         current_indices[j] += 8; // update for next outer loop iteration
 *         ++src0;
 *     }
 * }
 */
static inline void vector_32fc_index_max_variant0(__m256 in0,
                                                  __m256 in1,
                                                  __m256* max_values,
                                                  __m256i* max_indices,
                                                  __m256i* current_indices,
                                                  __m256i indices_increment)
{
    in0 = _mm256_mul_ps(in0, in0);
    in1 = _mm256_mul_ps(in1, in1);

    /*
     * Given the vectors a = (a_7, a_6, …, a_1, a_0) and b = (b_7, b_6, …, b_1, b_0)
     * hadd_ps(a, b) computes
     * (b_7 + b_6,
     *  b_5 + b_4,
     *  ---------
     *  a_7 + a_6,
     *  a_5 + a_4,
     *  ---------
     *  b_3 + b_2,
     *  b_1 + b_0,
     *  ---------
     *  a_3 + a_2,
     *  a_1 + a_0).
     * The result is the squared absolute value of complex numbers at index
     * offsets (7, 6, 3, 2, 5, 4, 1, 0). This must be the initial value of
     * current_indices!
     */
    __m256 abs_squared = _mm256_hadd_ps(in0, in1);

    /*
     * Compare the recently computed squared absolute values with the
     * previously determined maximum values. cmp_ps(a, b) determines
     * a > b ? 0xFFFFFFFF for each element in the vectors =>
     * compare_mask = abs_squared > max_values ? 0xFFFFFFFF : 0
     *
     * If either operand is NaN, 0 is returned as an “ordered” comparison is
     * used => the blend operation will select the value from *max_values.
     */
    __m256 compare_mask = _mm256_cmp_ps(abs_squared, *max_values, _CMP_GT_OS);

    /* Select maximum by blending. This is the only line which differs from variant1 */
    *max_values = _mm256_blendv_ps(*max_values, abs_squared, compare_mask);

    /*
     * Updates indices: blendv_ps(a, b, mask) determines mask ? b : a for
     * each element in the vectors =>
     * max_indices = compare_mask ? current_indices : max_indices
     *
     * Note: The casting of data types is required to make the compiler happy
     * and does not change values.
     */
    *max_indices =
        _mm256_castps_si256(_mm256_blendv_ps(_mm256_castsi256_ps(*max_indices),
                                             _mm256_castsi256_ps(*current_indices),
                                             compare_mask));

    /* compute indices of complex numbers which will be loaded in the next iteration */
    *current_indices = _mm256_add_epi32(*current_indices, indices_increment);
}
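
/*
 * Illustrative outer-loop sketch (not part of VOLK) for the variants above.
 * Note the initial current_indices order, which must match the hadd output
 * order documented in variant0, and the final horizontal reduction over the
 * 8 lanes. The function name and the unaligned loads are assumptions for
 * illustration; a 32-bit 'unsigned int' is assumed.
 */
static inline void example_32fc_index_max_u_avx2(unsigned int* target,
                                                 const float* src, /* interleaved re,im */
                                                 unsigned int num_points)
{
    __m256 max_values = _mm256_setzero_ps();
    __m256i max_indices = _mm256_setzero_si256();
    __m256i current_indices = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
    const __m256i indices_increment = _mm256_set1_epi32(8);

    for (unsigned int i = 0; i < num_points / 8; i++) {
        __m256 in0 = _mm256_loadu_ps(src);     // complex numbers 0..3
        __m256 in1 = _mm256_loadu_ps(src + 8); // complex numbers 4..7
        vector_32fc_index_max_variant0(
            in0, in1, &max_values, &max_indices, &current_indices, indices_increment);
        src += 16;
    }

    /* horizontal reduction: find the winning lane */
    float values[8];
    unsigned int indices[8];
    _mm256_storeu_ps(values, max_values);
    _mm256_storeu_si256((__m256i*)indices, max_indices);
    float max = values[0];
    *target = indices[0];
    for (unsigned int i = 1; i < 8; i++) {
        if (values[i] > max) {
            max = values[i];
            *target = indices[i];
        }
    }
    /* remaining num_points % 8 elements handled by a scalar tail (omitted) */
}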

/* See _variant0 for details */
static inline void vector_32fc_index_max_variant1(__m256 in0,
                                                  __m256 in1,
                                                  __m256* max_values,
                                                  __m256i* max_indices,
                                                  __m256i* current_indices,
                                                  __m256i indices_increment)
{
    in0 = _mm256_mul_ps(in0, in0);
    in1 = _mm256_mul_ps(in1, in1);

    __m256 abs_squared = _mm256_hadd_ps(in0, in1);
    __m256 compare_mask = _mm256_cmp_ps(abs_squared, *max_values, _CMP_GT_OS);

    /*
     * This is the only line which differs from variant0. Using maxps instead of
     * blendvps is faster on the Intel CPUs this was tested with.
     *
     * Note: The order of arguments matters if a NaN is encountered, in which
     * case the value of the second argument is selected. This is consistent
     * with the “ordered” comparison and the blend operation: The comparison
     * returns false if a NaN is encountered and the blend operation
     * consequently selects the value from max_indices.
     */
    *max_values = _mm256_max_ps(abs_squared, *max_values);

    *max_indices =
        _mm256_castps_si256(_mm256_blendv_ps(_mm256_castsi256_ps(*max_indices),
                                             _mm256_castsi256_ps(*current_indices),
                                             compare_mask));

    *current_indices = _mm256_add_epi32(*current_indices, indices_increment);
}

/*
 * The function below vectorizes the inner loop of the following code:
 *
 * float min_values[8] = {FLT_MAX, FLT_MAX, FLT_MAX, FLT_MAX,
 *                        FLT_MAX, FLT_MAX, FLT_MAX, FLT_MAX};
 * unsigned min_indices[8] = {0};
 * unsigned current_indices[8] = {0, 1, 2, 3, 4, 5, 6, 7};
 * for (unsigned i = 0; i < num_points / 8; ++i) {
 *     for (unsigned j = 0; j < 8; ++j) {
 *         float abs_squared = real(src0) * real(src0) + imag(src0) * imag(src0);
 *         bool compare = abs_squared < min_values[j];
 *         min_values[j] = compare ? abs_squared : min_values[j];
 *         min_indices[j] = compare ? current_indices[j] : min_indices[j];
 *         current_indices[j] += 8; // update for next outer loop iteration
 *         ++src0;
 *     }
 * }
 */
static inline void vector_32fc_index_min_variant0(__m256 in0,
                                                  __m256 in1,
                                                  __m256* min_values,
                                                  __m256i* min_indices,
                                                  __m256i* current_indices,
                                                  __m256i indices_increment)
{
    in0 = _mm256_mul_ps(in0, in0);
    in1 = _mm256_mul_ps(in1, in1);

    /*
     * Given the vectors a = (a_7, a_6, …, a_1, a_0) and b = (b_7, b_6, …, b_1, b_0)
     * hadd_ps(a, b) computes
     * (b_7 + b_6,
     *  b_5 + b_4,
     *  ---------
     *  a_7 + a_6,
     *  a_5 + a_4,
     *  ---------
     *  b_3 + b_2,
     *  b_1 + b_0,
     *  ---------
     *  a_3 + a_2,
     *  a_1 + a_0).
     * The result is the squared absolute value of complex numbers at index
     * offsets (7, 6, 3, 2, 5, 4, 1, 0). This must be the initial value of
     * current_indices!
     */
    __m256 abs_squared = _mm256_hadd_ps(in0, in1);

    /*
     * Compare the recently computed squared absolute values with the
     * previously determined minimum values. cmp_ps(a, b) determines
     * a < b ? 0xFFFFFFFF for each element in the vectors =>
     * compare_mask = abs_squared < min_values ? 0xFFFFFFFF : 0
     *
     * If either operand is NaN, 0 is returned as an “ordered” comparison is
     * used => the blend operation will select the value from *min_values.
     */
    __m256 compare_mask = _mm256_cmp_ps(abs_squared, *min_values, _CMP_LT_OS);

    /* Select minimum by blending. This is the only line which differs from variant1 */
    *min_values = _mm256_blendv_ps(*min_values, abs_squared, compare_mask);

    /*
     * Updates indices: blendv_ps(a, b, mask) determines mask ? b : a for
     * each element in the vectors =>
     * min_indices = compare_mask ? current_indices : min_indices
     *
     * Note: The casting of data types is required to make the compiler happy
     * and does not change values.
     */
    *min_indices =
        _mm256_castps_si256(_mm256_blendv_ps(_mm256_castsi256_ps(*min_indices),
                                             _mm256_castsi256_ps(*current_indices),
                                             compare_mask));

    /* compute indices of complex numbers which will be loaded in the next iteration */
    *current_indices = _mm256_add_epi32(*current_indices, indices_increment);
}

/* See _variant0 for details */
static inline void vector_32fc_index_min_variant1(__m256 in0,
                                                  __m256 in1,
                                                  __m256* min_values,
                                                  __m256i* min_indices,
                                                  __m256i* current_indices,
                                                  __m256i indices_increment)
{
    in0 = _mm256_mul_ps(in0, in0);
    in1 = _mm256_mul_ps(in1, in1);

    __m256 abs_squared = _mm256_hadd_ps(in0, in1);
    __m256 compare_mask = _mm256_cmp_ps(abs_squared, *min_values, _CMP_LT_OS);

    /*
     * This is the only line which differs from variant0. Using minps instead of
     * blendvps is faster on the Intel CPUs this was tested with.
     *
     * Note: The order of arguments matters if a NaN is encountered, in which
     * case the value of the second argument is selected. This is consistent
     * with the “ordered” comparison and the blend operation: The comparison
     * returns false if a NaN is encountered and the blend operation
     * consequently selects the value from min_indices.
     */
    *min_values = _mm256_min_ps(abs_squared, *min_values);

    *min_indices =
        _mm256_castps_si256(_mm256_blendv_ps(_mm256_castsi256_ps(*min_indices),
                                             _mm256_castsi256_ps(*current_indices),
                                             compare_mask));

    *current_indices = _mm256_add_epi32(*current_indices, indices_increment);
}

/*
 * Approximate sin(x) via polynomial expansion
 * on the interval [-pi/4, pi/4]
 *
 * Maximum absolute error ~7.3e-9
 * sin(x) = x + x^3 * (s1 + x^2 * (s2 + x^2 * s3))
 */
static inline __m256 _mm256_sin_poly_avx2(const __m256 x)
{
    const __m256 s1 = _mm256_set1_ps(-0x1.555552p-3f);
    const __m256 s2 = _mm256_set1_ps(+0x1.110be2p-7f);
    const __m256 s3 = _mm256_set1_ps(-0x1.9ab22ap-13f);

    const __m256 x2 = _mm256_mul_ps(x, x);
    const __m256 x3 = _mm256_mul_ps(x2, x);

    __m256 poly = _mm256_add_ps(_mm256_mul_ps(x2, s3), s2);
    poly = _mm256_add_ps(_mm256_mul_ps(x2, poly), s1);
    return _mm256_add_ps(_mm256_mul_ps(x3, poly), x);
}

/*
 * Approximate cos(x) via polynomial expansion
 * on the interval [-pi/4, pi/4]
 *
 * Maximum absolute error ~1.1e-7
 * cos(x) = 1 + x^2 * (c1 + x^2 * (c2 + x^2 * c3))
 */
static inline __m256 _mm256_cos_poly_avx2(const __m256 x)
{
    const __m256 c1 = _mm256_set1_ps(-0x1.fffff4p-2f);
    const __m256 c2 = _mm256_set1_ps(+0x1.554a46p-5f);
    const __m256 c3 = _mm256_set1_ps(-0x1.661be2p-10f);
    const __m256 one = _mm256_set1_ps(1.0f);

    const __m256 x2 = _mm256_mul_ps(x, x);

    __m256 poly = _mm256_add_ps(_mm256_mul_ps(x2, c3), c2);
    poly = _mm256_add_ps(_mm256_mul_ps(x2, poly), c1);
    return _mm256_add_ps(_mm256_mul_ps(x2, poly), one);
}
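
/*
 * Illustrative sketch (not part of VOLK): the two polynomials above are only
 * valid on [-pi/4, pi/4], so callers must range-reduce first. Below is a
 * minimal sin(x) built on top of them using classic quadrant reduction with
 * a two-term Cody-Waite pi/2 split. The function name, constants and
 * structure are assumptions for illustration; accuracy degrades for very
 * large |x|, where a stronger reduction scheme is needed.
 */
static inline __m256 example_sin_avx2(const __m256 x)
{
    const __m256 two_over_pi = _mm256_set1_ps(0x1.45f306p-1f);  // 2/pi
    const __m256 pio2_hi = _mm256_set1_ps(0x1.921fb6p+0f);      // pi/2, rounded
    const __m256 pio2_lo = _mm256_set1_ps(-0x1.777a5cp-25f);    // pi/2 - pio2_hi

    // k = round(x * 2/pi); cvtps_epi32 rounds to nearest by default
    const __m256i k = _mm256_cvtps_epi32(_mm256_mul_ps(x, two_over_pi));
    const __m256 kf = _mm256_cvtepi32_ps(k);

    // r = x - k * pi/2, reduced to [-pi/4, pi/4]
    __m256 r = _mm256_sub_ps(x, _mm256_mul_ps(kf, pio2_hi));
    r = _mm256_sub_ps(r, _mm256_mul_ps(kf, pio2_lo));

    // sin(x) = sin(r), cos(r), -sin(r), -cos(r) for k mod 4 = 0, 1, 2, 3
    const __m256 s = _mm256_sin_poly_avx2(r);
    const __m256 c = _mm256_cos_poly_avx2(r);
    const __m256 use_cos = _mm256_castsi256_ps(
        _mm256_cmpeq_epi32(_mm256_and_si256(k, _mm256_set1_epi32(1)),
                           _mm256_set1_epi32(1)));
    const __m256 result = _mm256_blendv_ps(s, c, use_cos);
    // bit 1 of k selects negation: shift it up to the float sign-bit position
    const __m256i sign =
        _mm256_slli_epi32(_mm256_and_si256(k, _mm256_set1_epi32(2)), 30);
    return _mm256_xor_ps(result, _mm256_castsi256_ps(sign));
}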

/*
 * Polynomial coefficients for log2(x)/(x-1) on [1, 2]
 * Generated with Sollya: remez(log2(x)/(x-1), 6, [1+1b-20, 2])
 * Max error: ~1.55e-6
 *
 * Usage: log2(x) ≈ poly(x) * (x - 1) for x ∈ [1, 2]
 * Polynomial evaluated via Horner's method
 */
static inline __m256 _mm256_log2_poly_avx2(const __m256 x)
{
    const __m256 c0 = _mm256_set1_ps(+0x1.a8a726p+1f);
    const __m256 c1 = _mm256_set1_ps(-0x1.0b7f7ep+2f);
    const __m256 c2 = _mm256_set1_ps(+0x1.05d9ccp+2f);
    const __m256 c3 = _mm256_set1_ps(-0x1.4d476cp+1f);
    const __m256 c4 = _mm256_set1_ps(+0x1.04fc3ap+0f);
    const __m256 c5 = _mm256_set1_ps(-0x1.c97982p-3f);
    const __m256 c6 = _mm256_set1_ps(+0x1.57aa42p-6f);

    // Horner's method: c0 + x*(c1 + x*(c2 + ...))
    __m256 poly = c6;
    poly = _mm256_add_ps(_mm256_mul_ps(poly, x), c5);
    poly = _mm256_add_ps(_mm256_mul_ps(poly, x), c4);
    poly = _mm256_add_ps(_mm256_mul_ps(poly, x), c3);
    poly = _mm256_add_ps(_mm256_mul_ps(poly, x), c2);
    poly = _mm256_add_ps(_mm256_mul_ps(poly, x), c1);
    poly = _mm256_add_ps(_mm256_mul_ps(poly, x), c0);
    return poly;
}
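
/*
 * Illustrative sketch (not part of VOLK): assembling a full log2(x) from the
 * helper above by splitting x into exponent and mantissa. Assumes positive,
 * finite, normal IEEE-754 binary32 inputs; the function name is hypothetical.
 */
static inline __m256 example_log2_avx2(const __m256 x)
{
    const __m256i bits = _mm256_castps_si256(x);
    // unbiased exponent e = ((bits >> 23) & 0xFF) - 127, converted to float
    const __m256i exp = _mm256_sub_epi32(
        _mm256_srli_epi32(_mm256_and_si256(bits, _mm256_set1_epi32(0x7F800000)), 23),
        _mm256_set1_epi32(127));
    const __m256 e = _mm256_cvtepi32_ps(exp);
    // mantissa m in [1, 2): keep the fraction bits, force the exponent to 0
    const __m256 m = _mm256_castsi256_ps(
        _mm256_or_si256(_mm256_and_si256(bits, _mm256_set1_epi32(0x007FFFFF)),
                        _mm256_set1_epi32(0x3F800000)));
    // log2(x) = e + log2(m), with log2(m) ~= poly(m) * (m - 1)
    const __m256 log2_m =
        _mm256_mul_ps(_mm256_log2_poly_avx2(m), _mm256_sub_ps(m, _mm256_set1_ps(1.0f)));
    return _mm256_add_ps(e, log2_m);
}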

#endif /* INCLUDE_VOLK_VOLK_AVX2_INTRINSICS_H_ */