Vector Optimized Library of Kernels 3.3.0
Architecture-tuned implementations of math kernels

volk_32fc_x2_square_dist_32f.h
/* -*- c++ -*- */
/*
 * Copyright 2012, 2014 Free Software Foundation, Inc.
 *
 * This file is part of VOLK
 *
 * SPDX-License-Identifier: LGPL-3.0-or-later
 */
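
/*
 * Overview: for each of the num_points complex samples in points, compute
 * the squared Euclidean distance to the single complex point src0[0]
 * (the generic kernel below is the reference arithmetic):
 *
 *   target[i] = |src0[0] - points[i]|^2
 */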

#ifndef INCLUDED_volk_32fc_x2_square_dist_32f_a_H
#define INCLUDED_volk_32fc_x2_square_dist_32f_a_H

#include <inttypes.h>
#include <stdio.h>
#include <volk/volk_complex.h>

#ifdef LV_HAVE_AVX2
#include <immintrin.h>

static inline void volk_32fc_x2_square_dist_32f_a_avx2(float* target,
                                                       const lv_32fc_t* src0,
                                                       const lv_32fc_t* points,
                                                       unsigned int num_points)
{
    const unsigned int num_bytes = num_points * 8;
    __m128 xmm0, xmm9, xmm10;
    __m256 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;

    lv_32fc_t diff;
    float sq_dist;
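    /* The main loop consumes 8 complex points (64 bytes) per iteration;
     * the three flag bits below handle any remaining 4, 2, and 1 points. */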
    int bound = num_bytes >> 6;
    int leftovers0 = (num_bytes >> 5) & 1;
    int leftovers1 = (num_bytes >> 4) & 1;
    int leftovers2 = (num_bytes >> 3) & 1;
    int i = 0;

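    /* Broadcast src0[0] = (re, im) into all four complex lanes of xmm1;
     * idx is the shuffle pattern that puts hadd results back in order. */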
    __m256i idx = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
    xmm1 = _mm256_setzero_ps();
    xmm0 = _mm_load_ps((float*)src0);
    xmm0 = _mm_permute_ps(xmm0, 0b01000100);
    xmm1 = _mm256_insertf128_ps(xmm1, xmm0, 0);
    xmm1 = _mm256_insertf128_ps(xmm1, xmm0, 1);

    for (; i < bound; ++i) {
        xmm2 = _mm256_load_ps((float*)&points[0]);
        xmm3 = _mm256_load_ps((float*)&points[4]);
        points += 8;

        xmm4 = _mm256_sub_ps(xmm1, xmm2);
        xmm5 = _mm256_sub_ps(xmm1, xmm3);
        xmm6 = _mm256_mul_ps(xmm4, xmm4);
        xmm7 = _mm256_mul_ps(xmm5, xmm5);

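        /* hadd sums re^2 + im^2 pairwise, but only within each 128-bit
         * lane, so the eight distances come out lane-interleaved; the
         * permute restores them to memory order. */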
        xmm4 = _mm256_hadd_ps(xmm6, xmm7);
        xmm4 = _mm256_permutevar8x32_ps(xmm4, idx);

        _mm256_store_ps(target, xmm4);

        target += 8;
    }

    for (i = 0; i < leftovers0; ++i) {

        xmm2 = _mm256_load_ps((float*)&points[0]);

        xmm4 = _mm256_sub_ps(xmm1, xmm2);

        points += 4;

        xmm6 = _mm256_mul_ps(xmm4, xmm4);

        xmm4 = _mm256_hadd_ps(xmm6, xmm6);
        xmm4 = _mm256_permutevar8x32_ps(xmm4, idx);

        xmm9 = _mm256_extractf128_ps(xmm4, 1);
        _mm_store_ps(target, xmm9);

        target += 4;
    }

    for (i = 0; i < leftovers1; ++i) {
        xmm9 = _mm_load_ps((float*)&points[0]);

        xmm10 = _mm_sub_ps(xmm0, xmm9);

        points += 2;

        xmm9 = _mm_mul_ps(xmm10, xmm10);

        xmm10 = _mm_hadd_ps(xmm9, xmm9);

        _mm_storeh_pi((__m64*)target, xmm10);

        target += 2;
    }

    for (i = 0; i < leftovers2; ++i) {

        diff = src0[0] - points[0];

        sq_dist = lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff);

        target[0] = sq_dist;
    }
}

#endif /*LV_HAVE_AVX2*/

#ifdef LV_HAVE_SSE3
#include <pmmintrin.h>
#include <xmmintrin.h>

static inline void volk_32fc_x2_square_dist_32f_a_sse3(float* target,
                                                       const lv_32fc_t* src0,
                                                       const lv_32fc_t* points,
                                                       unsigned int num_points)
{
    const unsigned int num_bytes = num_points * 8;

    __m128 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;

    lv_32fc_t diff;
    float sq_dist;
    int bound = num_bytes >> 5;
    int i = 0;

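    /* Load src0[0] into the low half of xmm1 and mirror it into the high
     * half, so xmm1 holds two copies of (re, im). */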
    xmm1 = _mm_setzero_ps();
    xmm1 = _mm_loadl_pi(xmm1, (__m64*)src0);
    xmm1 = _mm_movelh_ps(xmm1, xmm1);

    for (; i < bound; ++i) {
        xmm2 = _mm_load_ps((float*)&points[0]);
        xmm4 = _mm_sub_ps(xmm1, xmm2);
        xmm3 = _mm_load_ps((float*)&points[2]);
        xmm5 = _mm_sub_ps(xmm1, xmm3);

        xmm6 = _mm_mul_ps(xmm4, xmm4);
        xmm7 = _mm_mul_ps(xmm5, xmm5);

        xmm4 = _mm_hadd_ps(xmm6, xmm7);

        _mm_store_ps(target, xmm4);

        points += 4;
        target += 4;
    }

    if (num_bytes >> 4 & 1) {

        xmm2 = _mm_load_ps((float*)&points[0]);

        xmm4 = _mm_sub_ps(xmm1, xmm2);

        points += 2;

        xmm6 = _mm_mul_ps(xmm4, xmm4);

        xmm4 = _mm_hadd_ps(xmm6, xmm6);

        _mm_storeh_pi((__m64*)target, xmm4);

        target += 2;
    }

    if (num_bytes >> 3 & 1) {

        diff = src0[0] - points[0];

        sq_dist = lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff);

        target[0] = sq_dist;
    }
}

#endif /*LV_HAVE_SSE3*/


#ifdef LV_HAVE_NEON
#include <arm_neon.h>
static inline void volk_32fc_x2_square_dist_32f_neon(float* target,
                                                     const lv_32fc_t* src0,
                                                     const lv_32fc_t* points,
                                                     unsigned int num_points)
{
    const unsigned int quarter_points = num_points / 4;
    unsigned int number;

    float32x4x2_t a_vec, b_vec;
    float32x4x2_t diff_vec;
    float32x4_t tmp, tmp1, dist_sq;
    a_vec.val[0] = vdupq_n_f32(lv_creal(src0[0]));
    a_vec.val[1] = vdupq_n_f32(lv_cimag(src0[0]));
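    /* vld2q_f32 de-interleaves: val[0] receives four real parts, val[1]
     * the four matching imaginary parts. */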
    for (number = 0; number < quarter_points; ++number) {
        b_vec = vld2q_f32((float*)points);
        diff_vec.val[0] = vsubq_f32(a_vec.val[0], b_vec.val[0]);
        diff_vec.val[1] = vsubq_f32(a_vec.val[1], b_vec.val[1]);
        tmp = vmulq_f32(diff_vec.val[0], diff_vec.val[0]);
        tmp1 = vmulq_f32(diff_vec.val[1], diff_vec.val[1]);

        dist_sq = vaddq_f32(tmp, tmp1);
        vst1q_f32(target, dist_sq);
        points += 4;
        target += 4;
    }
    for (number = quarter_points * 4; number < num_points; ++number) {
        lv_32fc_t diff = src0[0] - *points++;
        *target++ = lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff);
    }
}
#endif /* LV_HAVE_NEON */

#ifdef LV_HAVE_NEONV8
#include <arm_neon.h>

static inline void volk_32fc_x2_square_dist_32f_neonv8(float* target,
                                                       const lv_32fc_t* src0,
                                                       const lv_32fc_t* points,
                                                       unsigned int num_points)
{
    const unsigned int quarter_points = num_points / 4;
    unsigned int number;

    float32x4x2_t b_vec;
    float32x4_t diff_real, diff_imag, dist_sq;
    float32x4_t a_real = vdupq_n_f32(lv_creal(src0[0]));
    float32x4_t a_imag = vdupq_n_f32(lv_cimag(src0[0]));

    for (number = 0; number < quarter_points; ++number) {
        b_vec = vld2q_f32((float*)points);
        __VOLK_PREFETCH(points + 8);

        diff_real = vsubq_f32(a_real, b_vec.val[0]);
        diff_imag = vsubq_f32(a_imag, b_vec.val[1]);

        /* dist_sq = diff_real^2 + diff_imag^2 using FMA */
        dist_sq = vfmaq_f32(vmulq_f32(diff_real, diff_real), diff_imag, diff_imag);

        vst1q_f32(target, dist_sq);
        points += 4;
        target += 4;
    }

    for (number = quarter_points * 4; number < num_points; ++number) {
        lv_32fc_t diff = src0[0] - *points++;
        *target++ = lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff);
    }
}
#endif /* LV_HAVE_NEONV8 */


#ifdef LV_HAVE_GENERIC
static inline void volk_32fc_x2_square_dist_32f_generic(float* target,
                                                        const lv_32fc_t* src0,
                                                        const lv_32fc_t* points,
                                                        unsigned int num_points)
{
    const unsigned int num_bytes = num_points * 8;

    lv_32fc_t diff;
    float sq_dist;
    unsigned int i = 0;

    for (; i < (num_bytes >> 3); ++i) {
        diff = src0[0] - points[i];

        sq_dist = lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff);

        target[i] = sq_dist;
    }
}

#endif /*LV_HAVE_GENERIC*/


#endif /*INCLUDED_volk_32fc_x2_square_dist_32f_a_H*/

#ifndef INCLUDED_volk_32fc_x2_square_dist_32f_u_H
#define INCLUDED_volk_32fc_x2_square_dist_32f_u_H

#include <inttypes.h>
#include <stdio.h>
#include <volk/volk_complex.h>

#ifdef LV_HAVE_AVX2
#include <immintrin.h>

static inline void volk_32fc_x2_square_dist_32f_u_avx2(float* target,
                                                       const lv_32fc_t* src0,
                                                       const lv_32fc_t* points,
                                                       unsigned int num_points)
{
    const unsigned int num_bytes = num_points * 8;
    __m128 xmm0, xmm9;
    __m256 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;

    lv_32fc_t diff;
    float sq_dist;
    int bound = num_bytes >> 6;
    int leftovers1 = (num_bytes >> 3) & 0b11;
    int i = 0;
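    /* Same flow as the aligned kernel above, but with unaligned loads and
     * stores and a scalar tail that handles up to three leftover points. */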

    __m256i idx = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
    xmm1 = _mm256_setzero_ps();
    xmm0 = _mm_loadu_ps((float*)src0);
    xmm0 = _mm_permute_ps(xmm0, 0b01000100);
    xmm1 = _mm256_insertf128_ps(xmm1, xmm0, 0);
    xmm1 = _mm256_insertf128_ps(xmm1, xmm0, 1);

    for (; i < bound; ++i) {
        xmm2 = _mm256_loadu_ps((float*)&points[0]);
        xmm3 = _mm256_loadu_ps((float*)&points[4]);
        points += 8;

        xmm4 = _mm256_sub_ps(xmm1, xmm2);
        xmm5 = _mm256_sub_ps(xmm1, xmm3);
        xmm6 = _mm256_mul_ps(xmm4, xmm4);
        xmm7 = _mm256_mul_ps(xmm5, xmm5);

        xmm4 = _mm256_hadd_ps(xmm6, xmm7);
        xmm4 = _mm256_permutevar8x32_ps(xmm4, idx);

        _mm256_storeu_ps(target, xmm4);

        target += 8;
    }

    if (num_bytes >> 5 & 1) {

        xmm2 = _mm256_loadu_ps((float*)&points[0]);

        xmm4 = _mm256_sub_ps(xmm1, xmm2);

        points += 4;

        xmm6 = _mm256_mul_ps(xmm4, xmm4);

        xmm4 = _mm256_hadd_ps(xmm6, xmm6);
        xmm4 = _mm256_permutevar8x32_ps(xmm4, idx);

        xmm9 = _mm256_extractf128_ps(xmm4, 1);
        _mm_storeu_ps(target, xmm9);

        target += 4;
    }

    for (i = 0; i < leftovers1; ++i) {

        diff = src0[0] - points[0];
        points += 1;

        sq_dist = lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff);

        target[0] = sq_dist;
        target += 1;
    }
}

#endif /*LV_HAVE_AVX2*/

#ifdef LV_HAVE_RVV
#include <riscv_vector.h>

static inline void volk_32fc_x2_square_dist_32f_rvv(float* target,
                                                    const lv_32fc_t* src0,
                                                    const lv_32fc_t* points,
                                                    unsigned int num_points)
{
    size_t vlmax = __riscv_vsetvlmax_e32m4();
    vfloat32m4_t var = __riscv_vfmv_v_f_f32m4(lv_creal(*src0), vlmax);
    vfloat32m4_t vai = __riscv_vfmv_v_f_f32m4(lv_cimag(*src0), vlmax);

    size_t n = num_points;
    for (size_t vl; n > 0; n -= vl, target += vl, points += vl) {
        vl = __riscv_vsetvl_e32m4(n);
        vuint64m8_t vb = __riscv_vle64_v_u64m8((const uint64_t*)points, vl);
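        /* Narrowing shifts by 0 and 32 split each 64-bit complex sample
         * into its real and imaginary 32-bit halves. */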
        vfloat32m4_t vbr = __riscv_vreinterpret_f32m4(__riscv_vnsrl(vb, 0, vl));
        vfloat32m4_t vbi = __riscv_vreinterpret_f32m4(__riscv_vnsrl(vb, 32, vl));
        vfloat32m4_t vr = __riscv_vfsub(var, vbr, vl);
        vfloat32m4_t vi = __riscv_vfsub(vai, vbi, vl);
        vfloat32m4_t v = __riscv_vfmacc(__riscv_vfmul(vi, vi, vl), vr, vr, vl);
        __riscv_vse32(target, v, vl);
    }
}
#endif /*LV_HAVE_RVV*/

#ifdef LV_HAVE_RVVSEG
#include <riscv_vector.h>

static inline void volk_32fc_x2_square_dist_32f_rvvseg(float* target,
                                                       const lv_32fc_t* src0,
                                                       const lv_32fc_t* points,
                                                       unsigned int num_points)
{
    size_t vlmax = __riscv_vsetvlmax_e32m4();
    vfloat32m4_t var = __riscv_vfmv_v_f_f32m4(lv_creal(*src0), vlmax);
    vfloat32m4_t vai = __riscv_vfmv_v_f_f32m4(lv_cimag(*src0), vlmax);

    size_t n = num_points;
    for (size_t vl; n > 0; n -= vl, target += vl, points += vl) {
        vl = __riscv_vsetvl_e32m4(n);
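        /* The segment load de-interleaves in one instruction: field 0 is
         * the real parts, field 1 the imaginary parts. */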
        vfloat32m4x2_t vb = __riscv_vlseg2e32_v_f32m4x2((const float*)points, vl);
        vfloat32m4_t vbr = __riscv_vget_f32m4(vb, 0);
        vfloat32m4_t vbi = __riscv_vget_f32m4(vb, 1);
        vfloat32m4_t vr = __riscv_vfsub(var, vbr, vl);
        vfloat32m4_t vi = __riscv_vfsub(vai, vbi, vl);
        vfloat32m4_t v = __riscv_vfmacc(__riscv_vfmul(vi, vi, vl), vr, vr, vl);
        __riscv_vse32(target, v, vl);
    }
}
#endif /*LV_HAVE_RVVSEG*/

#endif /*INCLUDED_volk_32fc_x2_square_dist_32f_u_H*/
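
A minimal usage sketch (illustrative, not part of the header itself): it calls
the public dispatcher volk_32fc_x2_square_dist_32f, which VOLK generates from
the kernels above and resolves to the fastest implementation for the host CPU.
Allocating with volk_malloc at volk_get_alignment() lets an aligned (_a)
kernel be selected.

#include <stdio.h>
#include <volk/volk.h>

int main(void)
{
    const unsigned int N = 8;
    /* Aligned buffers let the dispatcher choose an aligned (_a) kernel. */
    lv_32fc_t* points =
        (lv_32fc_t*)volk_malloc(N * sizeof(lv_32fc_t), volk_get_alignment());
    float* dists = (float*)volk_malloc(N * sizeof(float), volk_get_alignment());
    lv_32fc_t src = lv_cmake(0.5f, -0.5f);

    for (unsigned int i = 0; i < N; ++i)
        points[i] = lv_cmake((float)i, -(float)i);

    /* dists[i] = |src - points[i]|^2 */
    volk_32fc_x2_square_dist_32f(dists, &src, points, N);

    for (unsigned int i = 0; i < N; ++i)
        printf("dist[%u] = %f\n", i, dists[i]);

    volk_free(points);
    volk_free(dists);
    return 0;
}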