Vector Optimized Library of Kernels 3.3.0
Architecture-tuned implementations of math kernels
Loading...
Searching...
No Matches
volk_32fc_conjugate_32fc.h
Go to the documentation of this file.
1/* -*- c++ -*- */
2/*
3 * Copyright 2012, 2014 Free Software Foundation, Inc.
4 *
5 * This file is part of VOLK
6 *
7 * SPDX-License-Identifier: LGPL-3.0-or-later
8 */
9
54
55#ifndef INCLUDED_volk_32fc_conjugate_32fc_u_H
56#define INCLUDED_volk_32fc_conjugate_32fc_u_H
57
58#include <float.h>
59#include <inttypes.h>
60#include <stdio.h>
61#include <volk/volk_complex.h>
62
63#ifdef LV_HAVE_AVX
64#include <immintrin.h>
65
66static inline void volk_32fc_conjugate_32fc_u_avx(lv_32fc_t* cVector,
67 const lv_32fc_t* aVector,
68 unsigned int num_points)
69{
70 unsigned int number = 0;
71 const unsigned int quarterPoints = num_points / 4;
72
73 __m256 x;
74 lv_32fc_t* c = cVector;
75 const lv_32fc_t* a = aVector;
76
77 __m256 conjugator = _mm256_setr_ps(0, -0.f, 0, -0.f, 0, -0.f, 0, -0.f);
78
79 for (; number < quarterPoints; number++) {
80
81 x = _mm256_loadu_ps((float*)a); // Load the complex data as ar,ai,br,bi
82
83 x = _mm256_xor_ps(x, conjugator); // conjugate register
84
85 _mm256_storeu_ps((float*)c, x); // Store the results back into the C container
86
87 a += 4;
88 c += 4;
89 }
90
91 number = quarterPoints * 4;
92
93 for (; number < num_points; number++) {
94 *c++ = lv_conj(*a++);
95 }
96}
97#endif /* LV_HAVE_AVX */
98
99#ifdef LV_HAVE_SSE3
100#include <pmmintrin.h>
101
102static inline void volk_32fc_conjugate_32fc_u_sse3(lv_32fc_t* cVector,
103 const lv_32fc_t* aVector,
104 unsigned int num_points)
105{
106 unsigned int number = 0;
107 const unsigned int halfPoints = num_points / 2;
108
109 __m128 x;
110 lv_32fc_t* c = cVector;
111 const lv_32fc_t* a = aVector;
112
113 __m128 conjugator = _mm_setr_ps(0, -0.f, 0, -0.f);
114
115 for (; number < halfPoints; number++) {
116
117 x = _mm_loadu_ps((float*)a); // Load the complex data as ar,ai,br,bi
118
119 x = _mm_xor_ps(x, conjugator); // conjugate register
120
121 _mm_storeu_ps((float*)c, x); // Store the results back into the C container
122
123 a += 2;
124 c += 2;
125 }
126
127 if ((num_points % 2) != 0) {
128 *c = lv_conj(*a);
129 }
130}
131#endif /* LV_HAVE_SSE3 */
132
133#ifdef LV_HAVE_GENERIC
134
135static inline void volk_32fc_conjugate_32fc_generic(lv_32fc_t* cVector,
136 const lv_32fc_t* aVector,
137 unsigned int num_points)
138{
139 lv_32fc_t* cPtr = cVector;
140 const lv_32fc_t* aPtr = aVector;
141 unsigned int number = 0;
142
143 for (number = 0; number < num_points; number++) {
144 *cPtr++ = lv_conj(*aPtr++);
145 }
146}
147#endif /* LV_HAVE_GENERIC */
148
149
150#endif /* INCLUDED_volk_32fc_conjugate_32fc_u_H */
151#ifndef INCLUDED_volk_32fc_conjugate_32fc_a_H
152#define INCLUDED_volk_32fc_conjugate_32fc_a_H
153
154#include <float.h>
155#include <inttypes.h>
156#include <stdio.h>
157#include <volk/volk_complex.h>
158
159#ifdef LV_HAVE_AVX
160#include <immintrin.h>
161
162static inline void volk_32fc_conjugate_32fc_a_avx(lv_32fc_t* cVector,
163 const lv_32fc_t* aVector,
164 unsigned int num_points)
165{
166 unsigned int number = 0;
167 const unsigned int quarterPoints = num_points / 4;
168
169 __m256 x;
170 lv_32fc_t* c = cVector;
171 const lv_32fc_t* a = aVector;
172
173 __m256 conjugator = _mm256_setr_ps(0, -0.f, 0, -0.f, 0, -0.f, 0, -0.f);
174
175 for (; number < quarterPoints; number++) {
176
177 x = _mm256_load_ps((float*)a); // Load the complex data as ar,ai,br,bi
178
179 x = _mm256_xor_ps(x, conjugator); // conjugate register
180
181 _mm256_store_ps((float*)c, x); // Store the results back into the C container
182
183 a += 4;
184 c += 4;
185 }
186
187 number = quarterPoints * 4;
188
189 for (; number < num_points; number++) {
190 *c++ = lv_conj(*a++);
191 }
192}
193#endif /* LV_HAVE_AVX */
194
195#ifdef LV_HAVE_SSE3
196#include <pmmintrin.h>
197
198static inline void volk_32fc_conjugate_32fc_a_sse3(lv_32fc_t* cVector,
199 const lv_32fc_t* aVector,
200 unsigned int num_points)
201{
202 unsigned int number = 0;
203 const unsigned int halfPoints = num_points / 2;
204
205 __m128 x;
206 lv_32fc_t* c = cVector;
207 const lv_32fc_t* a = aVector;
208
209 __m128 conjugator = _mm_setr_ps(0, -0.f, 0, -0.f);
210
211 for (; number < halfPoints; number++) {
212
213 x = _mm_load_ps((float*)a); // Load the complex data as ar,ai,br,bi
214
215 x = _mm_xor_ps(x, conjugator); // conjugate register
216
217 _mm_store_ps((float*)c, x); // Store the results back into the C container
218
219 a += 2;
220 c += 2;
221 }
222
223 if ((num_points % 2) != 0) {
224 *c = lv_conj(*a);
225 }
226}
227#endif /* LV_HAVE_SSE3 */
228
229#ifdef LV_HAVE_NEON
230#include <arm_neon.h>
231
232static inline void volk_32fc_conjugate_32fc_a_neon(lv_32fc_t* cVector,
233 const lv_32fc_t* aVector,
234 unsigned int num_points)
235{
236 unsigned int number;
237 const unsigned int quarterPoints = num_points / 4;
238
239 float32x4x2_t x;
240 lv_32fc_t* c = cVector;
241 const lv_32fc_t* a = aVector;
242
243 for (number = 0; number < quarterPoints; number++) {
244 __VOLK_PREFETCH(a + 4);
245 x = vld2q_f32((float*)a); // Load the complex data as ar,br,cr,dr; ai,bi,ci,di
246
247 // xor the imaginary lane
248 x.val[1] = vnegq_f32(x.val[1]);
249
250 vst2q_f32((float*)c, x); // Store the results back into the C container
251
252 a += 4;
253 c += 4;
254 }
255
256 for (number = quarterPoints * 4; number < num_points; number++) {
257 *c++ = lv_conj(*a++);
258 }
259}
260#endif /* LV_HAVE_NEON */
261
262
263#ifdef LV_HAVE_NEONV8
264#include <arm_neon.h>
265
266static inline void volk_32fc_conjugate_32fc_neonv8(lv_32fc_t* cVector,
267 const lv_32fc_t* aVector,
268 unsigned int num_points)
269{
270 unsigned int n = num_points;
271 lv_32fc_t* c = cVector;
272 const lv_32fc_t* a = aVector;
273
274 /* Sign mask to flip imaginary parts: [0, -0, 0, -0] */
275 const uint32x4_t sign_mask =
276 vreinterpretq_u32_f32((float32x4_t){ 0.0f, -0.0f, 0.0f, -0.0f });
277
278 /* Process 4 complex numbers per iteration (2x unroll) */
279 while (n >= 4) {
280 float32x4_t v0 = vld1q_f32((const float*)a);
281 float32x4_t v1 = vld1q_f32((const float*)(a + 2));
282 __VOLK_PREFETCH(a + 8);
283
284 /* XOR to flip sign of imaginary parts */
285 v0 = vreinterpretq_f32_u32(veorq_u32(vreinterpretq_u32_f32(v0), sign_mask));
286 v1 = vreinterpretq_f32_u32(veorq_u32(vreinterpretq_u32_f32(v1), sign_mask));
287
288 vst1q_f32((float*)c, v0);
289 vst1q_f32((float*)(c + 2), v1);
290
291 a += 4;
292 c += 4;
293 n -= 4;
294 }
295
296 /* Scalar tail */
297 while (n > 0) {
298 *c++ = lv_conj(*a++);
299 n--;
300 }
301}
302
303#endif /* LV_HAVE_NEONV8 */
304
305
306#ifdef LV_HAVE_RVV
307#include <riscv_vector.h>
308
309static inline void volk_32fc_conjugate_32fc_rvv(lv_32fc_t* cVector,
310 const lv_32fc_t* aVector,
311 unsigned int num_points)
312{
313 size_t n = num_points;
314 vuint64m8_t m = __riscv_vmv_v_x_u64m8(1ull << 63, __riscv_vsetvlmax_e64m8());
315 for (size_t vl; n > 0; n -= vl, aVector += vl, cVector += vl) {
316 vl = __riscv_vsetvl_e64m8(n);
317 vuint64m8_t v = __riscv_vle64_v_u64m8((const uint64_t*)aVector, vl);
318 __riscv_vse64((uint64_t*)cVector, __riscv_vxor(v, m, vl), vl);
319 }
320}
321#endif /*LV_HAVE_RVV*/
322
323#endif /* INCLUDED_volk_32fc_conjugate_32fc_a_H */