GNU Radio 3.4.0 C++ API
volk_32fc_x2_conjugate_dot_prod_32fc_u.h
Go to the documentation of this file.
00001 #ifndef INCLUDED_volk_32fc_x2_conjugate_dot_prod_32fc_u_H
00002 #define INCLUDED_volk_32fc_x2_conjugate_dot_prod_32fc_u_H
00003 
00004 
00005 #include<volk/volk_complex.h>
00006 
00007 
00008 #if LV_HAVE_GENERIC
00009 
00010 
00011 static inline void volk_32fc_x2_conjugate_dot_prod_32fc_u_generic(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) {
00012   
00013   float * res = (float*) result;
00014   float * in = (float*) input;
00015   float * tp = (float*) taps;
00016   unsigned int n_2_ccomplex_blocks = num_bytes >> 4;
00017   unsigned int isodd = (num_bytes >> 3) &1;
00018   
00019   
00020   
00021   float sum0[2] = {0,0};
00022   float sum1[2] = {0,0};
00023   int i = 0;
00024 
00025   
00026   for(i = 0; i < n_2_ccomplex_blocks; ++i) {
00027     
00028     sum0[0] += in[0] * tp[0] + in[1] * tp[1];
00029     sum0[1] += (-in[0] * tp[1]) + in[1] * tp[0];
00030     sum1[0] += in[2] * tp[2] + in[3] * tp[3];
00031     sum1[1] += (-in[2] * tp[3]) + in[3] * tp[2];
00032     
00033     
00034     in += 4;
00035     tp += 4;
00036 
00037   }
00038  
00039   
00040   res[0] = sum0[0] + sum1[0];
00041   res[1] = sum0[1] + sum1[1];
00042   
00043   
00044   
00045   for(i = 0; i < isodd; ++i) {
00046 
00047 
00048     *result += input[(num_bytes >> 3) - 1] * lv_conj(taps[(num_bytes >> 3) - 1]);
00049 
00050   }
00051   /*
00052   for(i = 0; i < num_bytes >> 3; ++i) {
00053     *result += input[i] * conjf(taps[i]);
00054   }
00055   */
00056 }
00057 
00058 #endif /*LV_HAVE_GENERIC*/
00059 
00060 #if LV_HAVE_SSE3
00061 
00062 #include <xmmintrin.h>
00063 #include <pmmintrin.h>
00064 #include <mmintrin.h>
00065 
00066 
00067 static inline void volk_32fc_x2_conjugate_dot_prod_32fc_u_sse3(lv_32fc_t* result, const lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_bytes) {
00068 
00069   static const uint32_t conjugator[4] __attribute__((aligned(16)))= {0x00000000, 0x80000000, 0x00000000, 0x80000000};
00070 
00071   union HalfMask {
00072     uint32_t intRep[4];
00073     __m128 vec;
00074     } halfMask;
00075  
00076   union NegMask {
00077     int intRep[4];
00078     __m128 vec;
00079   } negMask;
00080 
00081   unsigned int offset = 0;
00082   float Rsum=0, Isum=0;
00083   float Im,Re;
00084 
00085   __m128 in1, in2, Rv, fehg, Iv, Rs, Ivm, Is;
00086   __m128 zv = {0,0,0,0};
00087   
00088   halfMask.intRep[0] = halfMask.intRep[1] = 0xFFFFFFFF;
00089   halfMask.intRep[2] = halfMask.intRep[3] = 0x00000000;
00090 
00091   negMask.intRep[0] = negMask.intRep[2] = 0x80000000;
00092   negMask.intRep[1] = negMask.intRep[3] = 0;
00093   
00094   // main loop
00095   while(num_bytes >= 4*sizeof(float)){
00096 
00097     in1 = _mm_loadu_ps( (float*) (input+offset) );
00098     in2 = _mm_loadu_ps( (float*) (taps+offset) );
00099     Rv = in1*in2;
00100     fehg = _mm_shuffle_ps(in2, in2, _MM_SHUFFLE(2,3,0,1));
00101     Iv = in1*fehg;
00102     Rs = _mm_hadd_ps( _mm_hadd_ps(Rv, zv) ,zv);
00103     Ivm = _mm_xor_ps( negMask.vec, Iv );
00104     Is = _mm_hadd_ps( _mm_hadd_ps(Ivm, zv) ,zv);
00105     _mm_store_ss( &Im, Is );
00106     _mm_store_ss( &Re, Rs );
00107     num_bytes -= 4*sizeof(float);
00108     offset += 2;
00109     Rsum += Re;
00110     Isum += Im;
00111   }
00112 
00113   // handle the last complex case ...
00114   if(num_bytes > 0){
00115 
00116     if(num_bytes != 4){
00117       // bad things are happening
00118     }
00119 
00120     in1 = _mm_loadu_ps( (float*) (input+offset) );
00121     in2 = _mm_loadu_ps( (float*) (taps+offset) );
00122     Rv = _mm_and_ps(in1*in2, halfMask.vec);
00123     fehg = _mm_shuffle_ps(in2, in2, _MM_SHUFFLE(2,3,0,1));
00124     Iv = _mm_and_ps(in1*fehg, halfMask.vec);
00125     Rs = _mm_hadd_ps(_mm_hadd_ps(Rv, zv),zv);
00126     Ivm = _mm_xor_ps( negMask.vec, Iv );
00127     Is = _mm_hadd_ps(_mm_hadd_ps(Ivm, zv),zv);
00128     _mm_store_ss( &Im, Is );
00129     _mm_store_ss( &Re, Rs );
00130     Rsum += Re;
00131     Isum += Im;
00132   }
00133 
00134   result[0] = lv_32fc_init(Rsum,Isum);
00135   return;
00136 }
00137 
00138 #endif /*LV_HAVE_SSE3*/
00139 
00140 
00141 #endif /*INCLUDED_volk_32fc_x2_conjugate_dot_prod_32fc_u_H*/
00142 
00143 
00144