Vector Optimized Library of Kernels 3.0.0
Architecture-tuned implementations of math kernels
 
Loading...
Searching...
No Matches
volk_8ic_x2_multiply_conjugate_16ic.h
Go to the documentation of this file.
1/* -*- c++ -*- */
2/*
3 * Copyright 2012, 2014 Free Software Foundation, Inc.
4 *
5 * This file is part of VOLK
6 *
7 * SPDX-License-Identifier: LGPL-3.0-or-later
8 */
9
10#ifndef INCLUDED_volk_8ic_x2_multiply_conjugate_16ic_a_H
11#define INCLUDED_volk_8ic_x2_multiply_conjugate_16ic_a_H
12
13#include <inttypes.h>
14#include <stdio.h>
15#include <volk/volk_complex.h>
16
17#ifdef LV_HAVE_AVX2
18#include <immintrin.h>
27static inline void volk_8ic_x2_multiply_conjugate_16ic_a_avx2(lv_16sc_t* cVector,
28 const lv_8sc_t* aVector,
29 const lv_8sc_t* bVector,
30 unsigned int num_points)
31{
32 unsigned int number = 0;
33 const unsigned int quarterPoints = num_points / 8;
34
35 __m256i x, y, realz, imagz;
36 lv_16sc_t* c = cVector;
37 const lv_8sc_t* a = aVector;
38 const lv_8sc_t* b = bVector;
39 __m256i conjugateSign =
40 _mm256_set_epi16(-1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1);
41
42 for (; number < quarterPoints; number++) {
43 // Convert 8 bit values into 16 bit values
44 x = _mm256_cvtepi8_epi16(_mm_load_si128((__m128i*)a));
45 y = _mm256_cvtepi8_epi16(_mm_load_si128((__m128i*)b));
46
47 // Calculate the ar*cr - ai*(-ci) portions
48 realz = _mm256_madd_epi16(x, y);
49
50 // Calculate the complex conjugate of the cr + ci j values
51 y = _mm256_sign_epi16(y, conjugateSign);
52
53 // Shift the order of the cr and ci values
54 y = _mm256_shufflehi_epi16(_mm256_shufflelo_epi16(y, _MM_SHUFFLE(2, 3, 0, 1)),
55 _MM_SHUFFLE(2, 3, 0, 1));
56
57 // Calculate the ar*(-ci) + cr*(ai)
58 imagz = _mm256_madd_epi16(x, y);
59
60 // Perform the addition of products
61
62 _mm256_store_si256((__m256i*)c,
63 _mm256_packs_epi32(_mm256_unpacklo_epi32(realz, imagz),
64 _mm256_unpackhi_epi32(realz, imagz)));
65
66 a += 8;
67 b += 8;
68 c += 8;
69 }
70
71 number = quarterPoints * 8;
72 int16_t* c16Ptr = (int16_t*)&cVector[number];
73 int8_t* a8Ptr = (int8_t*)&aVector[number];
74 int8_t* b8Ptr = (int8_t*)&bVector[number];
75 for (; number < num_points; number++) {
76 float aReal = (float)*a8Ptr++;
77 float aImag = (float)*a8Ptr++;
78 lv_32fc_t aVal = lv_cmake(aReal, aImag);
79 float bReal = (float)*b8Ptr++;
80 float bImag = (float)*b8Ptr++;
81 lv_32fc_t bVal = lv_cmake(bReal, -bImag);
82 lv_32fc_t temp = aVal * bVal;
83
84 *c16Ptr++ = (int16_t)lv_creal(temp);
85 *c16Ptr++ = (int16_t)lv_cimag(temp);
86 }
87}
88#endif /* LV_HAVE_AVX2 */
89
90
91#ifdef LV_HAVE_SSE4_1
92#include <smmintrin.h>
101static inline void volk_8ic_x2_multiply_conjugate_16ic_a_sse4_1(lv_16sc_t* cVector,
102 const lv_8sc_t* aVector,
103 const lv_8sc_t* bVector,
104 unsigned int num_points)
105{
106 unsigned int number = 0;
107 const unsigned int quarterPoints = num_points / 4;
108
109 __m128i x, y, realz, imagz;
110 lv_16sc_t* c = cVector;
111 const lv_8sc_t* a = aVector;
112 const lv_8sc_t* b = bVector;
113 __m128i conjugateSign = _mm_set_epi16(-1, 1, -1, 1, -1, 1, -1, 1);
114
115 for (; number < quarterPoints; number++) {
116 // Convert into 8 bit values into 16 bit values
119
120 // Calculate the ar*cr - ai*(-ci) portions
121 realz = _mm_madd_epi16(x, y);
122
123 // Calculate the complex conjugate of the cr + ci j values
124 y = _mm_sign_epi16(y, conjugateSign);
125
126 // Shift the order of the cr and ci values
128 _MM_SHUFFLE(2, 3, 0, 1));
129
130 // Calculate the ar*(-ci) + cr*(ai)
131 imagz = _mm_madd_epi16(x, y);
132
135 _mm_unpackhi_epi32(realz, imagz)));
136
137 a += 4;
138 b += 4;
139 c += 4;
140 }
141
142 number = quarterPoints * 4;
143 int16_t* c16Ptr = (int16_t*)&cVector[number];
144 int8_t* a8Ptr = (int8_t*)&aVector[number];
145 int8_t* b8Ptr = (int8_t*)&bVector[number];
146 for (; number < num_points; number++) {
147 float aReal = (float)*a8Ptr++;
148 float aImag = (float)*a8Ptr++;
149 lv_32fc_t aVal = lv_cmake(aReal, aImag);
150 float bReal = (float)*b8Ptr++;
151 float bImag = (float)*b8Ptr++;
152 lv_32fc_t bVal = lv_cmake(bReal, -bImag);
153 lv_32fc_t temp = aVal * bVal;
154
155 *c16Ptr++ = (int16_t)lv_creal(temp);
156 *c16Ptr++ = (int16_t)lv_cimag(temp);
157 }
158}
159#endif /* LV_HAVE_SSE4_1 */
160
161#ifdef LV_HAVE_GENERIC
171 const lv_8sc_t* aVector,
172 const lv_8sc_t* bVector,
173 unsigned int num_points)
174{
175 unsigned int number = 0;
176 int16_t* c16Ptr = (int16_t*)cVector;
177 int8_t* a8Ptr = (int8_t*)aVector;
178 int8_t* b8Ptr = (int8_t*)bVector;
179 for (number = 0; number < num_points; number++) {
180 float aReal = (float)*a8Ptr++;
181 float aImag = (float)*a8Ptr++;
182 lv_32fc_t aVal = lv_cmake(aReal, aImag);
183 float bReal = (float)*b8Ptr++;
184 float bImag = (float)*b8Ptr++;
185 lv_32fc_t bVal = lv_cmake(bReal, -bImag);
186 lv_32fc_t temp = aVal * bVal;
187
188 *c16Ptr++ = (int16_t)lv_creal(temp);
189 *c16Ptr++ = (int16_t)lv_cimag(temp);
190 }
191}
192#endif /* LV_HAVE_GENERIC */
193
194#endif /* INCLUDED_volk_8ic_x2_multiply_conjugate_16ic_a_H */
195
196#ifndef INCLUDED_volk_8ic_x2_multiply_conjugate_16ic_u_H
197#define INCLUDED_volk_8ic_x2_multiply_conjugate_16ic_u_H
198
199#include <inttypes.h>
200#include <stdio.h>
201#include <volk/volk_complex.h>
202
203#ifdef LV_HAVE_AVX2
204#include <immintrin.h>
213static inline void volk_8ic_x2_multiply_conjugate_16ic_u_avx2(lv_16sc_t* cVector,
214 const lv_8sc_t* aVector,
215 const lv_8sc_t* bVector,
216 unsigned int num_points)
217{
218 unsigned int number = 0;
219 const unsigned int oneEigthPoints = num_points / 8;
220
221 __m256i x, y, realz, imagz;
222 lv_16sc_t* c = cVector;
223 const lv_8sc_t* a = aVector;
224 const lv_8sc_t* b = bVector;
225 __m256i conjugateSign =
226 _mm256_set_epi16(-1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1);
227
228 for (; number < oneEigthPoints; number++) {
229 // Convert 8 bit values into 16 bit values
230 x = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*)a));
231 y = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*)b));
232
233 // Calculate the ar*cr - ai*(-ci) portions
234 realz = _mm256_madd_epi16(x, y);
235
236 // Calculate the complex conjugate of the cr + ci j values
237 y = _mm256_sign_epi16(y, conjugateSign);
238
239 // Shift the order of the cr and ci values
240 y = _mm256_shufflehi_epi16(_mm256_shufflelo_epi16(y, _MM_SHUFFLE(2, 3, 0, 1)),
241 _MM_SHUFFLE(2, 3, 0, 1));
242
243 // Calculate the ar*(-ci) + cr*(ai)
244 imagz = _mm256_madd_epi16(x, y);
245
246 // Perform the addition of products
247
248 _mm256_storeu_si256((__m256i*)c,
249 _mm256_packs_epi32(_mm256_unpacklo_epi32(realz, imagz),
250 _mm256_unpackhi_epi32(realz, imagz)));
251
252 a += 8;
253 b += 8;
254 c += 8;
255 }
256
257 number = oneEigthPoints * 8;
258 int16_t* c16Ptr = (int16_t*)&cVector[number];
259 int8_t* a8Ptr = (int8_t*)&aVector[number];
260 int8_t* b8Ptr = (int8_t*)&bVector[number];
261 for (; number < num_points; number++) {
262 float aReal = (float)*a8Ptr++;
263 float aImag = (float)*a8Ptr++;
264 lv_32fc_t aVal = lv_cmake(aReal, aImag);
265 float bReal = (float)*b8Ptr++;
266 float bImag = (float)*b8Ptr++;
267 lv_32fc_t bVal = lv_cmake(bReal, -bImag);
268 lv_32fc_t temp = aVal * bVal;
269
270 *c16Ptr++ = (int16_t)lv_creal(temp);
271 *c16Ptr++ = (int16_t)lv_cimag(temp);
272 }
273}
274#endif /* LV_HAVE_AVX2 */
275
276#endif /* INCLUDED_volk_8ic_x2_multiply_conjugate_16ic_u_H */
FORCE_INLINE __m128i _mm_packs_epi32(__m128i a, __m128i b)
Definition: sse2neon.h:5050
FORCE_INLINE void _mm_store_si128(__m128i *p, __m128i a)
Definition: sse2neon.h:5937
FORCE_INLINE __m128i _mm_unpacklo_epi32(__m128i a, __m128i b)
Definition: sse2neon.h:6373
FORCE_INLINE __m128i _mm_unpackhi_epi32(__m128i a, __m128i b)
Definition: sse2neon.h:6263
FORCE_INLINE __m128i _mm_loadu_si128(const __m128i *p)
Definition: sse2neon.h:4570
FORCE_INLINE __m128i _mm_madd_epi16(__m128i a, __m128i b)
Definition: sse2neon.h:4595
FORCE_INLINE __m128i _mm_load_si128(const __m128i *p)
Definition: sse2neon.h:4471
#define _mm_shufflelo_epi16(a, imm)
Definition: sse2neon.h:5459
FORCE_INLINE __m128i _mm_loadl_epi64(__m128i const *p)
Definition: sse2neon.h:4513
FORCE_INLINE __m128i _mm_cvtepi8_epi16(__m128i a)
Definition: sse2neon.h:7565
FORCE_INLINE __m128i _mm_set_epi16(short i7, short i6, short i5, short i4, short i3, short i2, short i1, short i0)
Definition: sse2neon.h:5100
#define _mm_shufflehi_epi16(a, imm)
Definition: sse2neon.h:5444
FORCE_INLINE __m128i _mm_sign_epi16(__m128i _a, __m128i _b)
Definition: sse2neon.h:7132
#define _MM_SHUFFLE(fp3, fp2, fp1, fp0)
Definition: sse2neon.h:195
int64x2_t __m128i
Definition: sse2neon.h:244
static void volk_8ic_x2_multiply_conjugate_16ic_generic(lv_16sc_t *cVector, const lv_8sc_t *aVector, const lv_8sc_t *bVector, unsigned int num_points)
Multiplies one complex vector by the complex conjugate of the second complex vector and stores t...
Definition: volk_8ic_x2_multiply_conjugate_16ic.h:170
#define lv_cimag(x)
Definition: volk_complex.h:98
#define lv_cmake(r, i)
Definition: volk_complex.h:77
char complex lv_8sc_t
Provide typedefs and operators for all complex types in C and C++.
Definition: volk_complex.h:70
#define lv_creal(x)
Definition: volk_complex.h:96
float complex lv_32fc_t
Definition: volk_complex.h:74
short complex lv_16sc_t
Definition: volk_complex.h:71