1 #ifndef INCLUDED_volk_32fc_x2_dot_prod_32fc_u_H
2 #define INCLUDED_volk_32fc_x2_dot_prod_32fc_u_H
10 #ifdef LV_HAVE_GENERIC
/*
 * Generic (non-SIMD) dot-product body: multiplies input by taps element by
 * element and writes the accumulated sum through `result`.
 * NOTE(review): the enclosing function signature, the declaration of `i`,
 * any advancement of `in`/`tp` inside the main loop, and the closing braces
 * are not visible in this excerpt -- confirm against the full file.
 */
/* Access the result, input and taps buffers through plain float pointers
 * (presumably interleaved real/imaginary pairs -- TODO confirm). */
15 float * res = (
float*) result;
16 float * in = (
float*) input;
17 float * tp = (
float*) taps;
/* The main loop consumes two points per iteration; isodd is 1 when a single
 * trailing point is left over, 0 otherwise. */
18 unsigned int n_2_ccomplex_blocks = num_points/2;
19 unsigned int isodd = num_points &1;
/* Two independent two-float accumulators, one per point of each block. */
23 float sum0[2] = {0,0};
24 float sum1[2] = {0,0};
28 for(i = 0; i < n_2_ccomplex_blocks; ++i) {
/* First point of the block (floats 0..1): index 0 accumulates the difference
 * of the same-index products, index 1 the sum of the cross products. */
31 sum0[0] += in[0] * tp[0] - in[1] * tp[1];
32 sum0[1] += in[0] * tp[1] + in[1] * tp[0];
/* Second point of the block (floats 2..3), same arithmetic into sum1. */
33 sum1[0] += in[2] * tp[2] - in[3] * tp[3];
34 sum1[1] += in[2] * tp[3] + in[3] * tp[2];
/* Combine both accumulators and store them through the float view of result. */
43 res[0] = sum0[0] + sum1[0];
44 res[1] = sum0[1] + sum1[1];
/* Runs at most once: when num_points is odd, add the product of the last
 * input/taps elements directly to *result. */
48 for(i = 0; i < isodd; ++i) {
51 *result += input[num_points - 1] * taps[num_points - 1];
61 #include <pmmintrin.h>
63 static inline void volk_32fc_x2_dot_prod_32fc_u_sse3(
lv_32fc_t* result,
const lv_32fc_t* input,
const lv_32fc_t* taps,
unsigned int num_points) {
67 memset(&dotProduct, 0x0, 2*
sizeof(
float));
69 unsigned int number = 0;
70 const unsigned int halfPoints = num_points/2;
72 __m128 x, y, yl, yh, z, tmp1, tmp2, dotProdVal;
77 dotProdVal = _mm_setzero_ps();
79 for(;number < halfPoints; number++){
81 x = _mm_loadu_ps((
float*)a);
82 y = _mm_loadu_ps((
float*)b);
84 yl = _mm_moveldup_ps(y);
85 yh = _mm_movehdup_ps(y);
87 tmp1 = _mm_mul_ps(x,yl);
89 x = _mm_shuffle_ps(x,x,0xB1);
91 tmp2 = _mm_mul_ps(x,yh);
93 z = _mm_addsub_ps(tmp1,tmp2);
95 dotProdVal = _mm_add_ps(dotProdVal, z);
103 _mm_storeu_ps((
float*)dotProductVector,dotProdVal);
105 dotProduct += ( dotProductVector[0] + dotProductVector[1] );
107 if(num_points % 1 != 0) {
108 dotProduct += (*a) * (*b);
111 *result = dotProduct;