M4RI 1.0.1
xor.h
#ifndef M4RI_XOR_H
#define M4RI_XOR_H

/*******************************************************************
 *
 *                 M4RI:  Linear Algebra over GF(2)
 *
 *    Copyright (C) 2008-2010  Martin Albrecht <martinralbrecht@googlemail.com>
 *
 *  Distributed under the terms of the GNU General Public License (GPL)
 *  version 2 or higher.
 *
 *    This code is distributed in the hope that it will be useful,
 *    but WITHOUT ANY WARRANTY; without even the implied warranty of
 *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 *    General Public License for more details.
 *
 *  The full text of the GPL is available at:
 *
 *                  http://www.gnu.org/licenses/
 *
 ********************************************************************/

#include "m4ri_config.h"

#if __M4RI_HAVE_SSE2
#include <emmintrin.h>
#endif

#include "misc.h"
/*
 * Compute c[i] ^= t1[i] ^ t2[i] ^ ... ^ t8[i] for 0 <= i < wide_in.
 */
static inline void _mzd_combine8(word *c, word const *t1, word const *t2, word const *t3, word const *t4,
                                 word const *t5, word const *t6, word const *t7, word const *t8, wi_t wide_in) {
  wi_t wide = wide_in;
#if __M4RI_HAVE_SSE2
  /* assuming t1 ... t8 are aligned, but c might not be */
  if (__M4RI_ALIGNMENT(c,16)==0) {
    __m128i *__c = (__m128i*)c;
    __m128i *__t1 = (__m128i*)t1;
    __m128i *__t2 = (__m128i*)t2;
    __m128i *__t3 = (__m128i*)t3;
    __m128i *__t4 = (__m128i*)t4;
    __m128i *__t5 = (__m128i*)t5;
    __m128i *__t6 = (__m128i*)t6;
    __m128i *__t7 = (__m128i*)t7;
    __m128i *__t8 = (__m128i*)t8;
    /* round the end of the row down to a 16-byte boundary */
    const __m128i *eof = (__m128i*)((unsigned long)(c + wide) & ~0xFUL);
    __m128i xmm1;

    while(__c < eof) {
      xmm1 = _mm_xor_si128(*__c, *__t1++);
      xmm1 = _mm_xor_si128(xmm1, *__t2++);
      xmm1 = _mm_xor_si128(xmm1, *__t3++);
      xmm1 = _mm_xor_si128(xmm1, *__t4++);
      xmm1 = _mm_xor_si128(xmm1, *__t5++);
      xmm1 = _mm_xor_si128(xmm1, *__t6++);
      xmm1 = _mm_xor_si128(xmm1, *__t7++);
      xmm1 = _mm_xor_si128(xmm1, *__t8++);
      *__c++ = xmm1;
    }
    c  = (word*)__c;
    t1 = (word*)__t1;
    t2 = (word*)__t2;
    t3 = (word*)__t3;
    t4 = (word*)__t4;
    t5 = (word*)__t5;
    t6 = (word*)__t6;
    t7 = (word*)__t7;
    t8 = (word*)__t8;
    /* number of words left over after the 128-bit loop */
    wide = ((sizeof(word) * wide) % 16) / sizeof(word);
  }
#endif
  for(wi_t i = 0; i < wide; ++i) {
    c[i] ^= t1[i] ^ t2[i] ^ t3[i] ^ t4[i] ^ t5[i] ^ t6[i] ^ t7[i] ^ t8[i];
  }

  __M4RI_DD_RAWROW(c, wide_in);
}
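
/*
 * Example (illustrative sketch, not part of the original header): adding
 * eight rows of an mzd_t matrix into a ninth row in one pass.  The field
 * names `rows` and `width` are assumed to match the mzd_t layout of this
 * release, and rows allocated by mzd_init() are assumed to be 16-byte
 * aligned, as the SSE2 path expects for t1 ... t8.
 *
 *   mzd_t *A = mzd_init(16, 1024);
 *   mzd_randomize(A);
 *   _mzd_combine8(A->rows[8],
 *                 A->rows[0], A->rows[1], A->rows[2], A->rows[3],
 *                 A->rows[4], A->rows[5], A->rows[6], A->rows[7],
 *                 A->width);
 *   mzd_free(A);
 */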

/*
 * Compute c[i] ^= t1[i] ^ t2[i] ^ t3[i] ^ t4[i] for 0 <= i < wide_in.
 */
static inline void _mzd_combine4(word *c, word const *t1, word const *t2, word const *t3, word const *t4, wi_t wide_in) {
  wi_t wide = wide_in;
#if __M4RI_HAVE_SSE2
  /* assuming t1 ... t4 are aligned, but c might not be */
  if (__M4RI_ALIGNMENT(c,16)==0) {
    __m128i *__c = (__m128i*)c;
    __m128i *__t1 = (__m128i*)t1;
    __m128i *__t2 = (__m128i*)t2;
    __m128i *__t3 = (__m128i*)t3;
    __m128i *__t4 = (__m128i*)t4;
    const __m128i *eof = (__m128i*)((unsigned long)(c + wide) & ~0xFUL);
    __m128i xmm1;

    while(__c < eof) {
      xmm1 = _mm_xor_si128(*__c, *__t1++);
      xmm1 = _mm_xor_si128(xmm1, *__t2++);
      xmm1 = _mm_xor_si128(xmm1, *__t3++);
      xmm1 = _mm_xor_si128(xmm1, *__t4++);
      *__c++ = xmm1;
    }
    c  = (word*)__c;
    t1 = (word*)__t1;
    t2 = (word*)__t2;
    t3 = (word*)__t3;
    t4 = (word*)__t4;
    wide = ((sizeof(word) * wide) % 16) / sizeof(word);
  }
  if(!wide) {
    __M4RI_DD_RAWROW(c, wide_in);
    return;
  }
#endif // __M4RI_HAVE_SSE2
  /* Duff's device: the remaining word loop is unrolled by eight */
  wi_t n = (wide + 7) / 8;
  switch (wide % 8) {
  case 0: do { *c++ ^= *t1++ ^ *t2++ ^ *t3++ ^ *t4++;
    case 7:    *c++ ^= *t1++ ^ *t2++ ^ *t3++ ^ *t4++;
    case 6:    *c++ ^= *t1++ ^ *t2++ ^ *t3++ ^ *t4++;
    case 5:    *c++ ^= *t1++ ^ *t2++ ^ *t3++ ^ *t4++;
    case 4:    *c++ ^= *t1++ ^ *t2++ ^ *t3++ ^ *t4++;
    case 3:    *c++ ^= *t1++ ^ *t2++ ^ *t3++ ^ *t4++;
    case 2:    *c++ ^= *t1++ ^ *t2++ ^ *t3++ ^ *t4++;
    case 1:    *c++ ^= *t1++ ^ *t2++ ^ *t3++ ^ *t4++;
    } while (--n > 0);
  }
  __M4RI_DD_RAWROW(c, wide_in);
}
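
/*
 * Note (added for clarity, not in the original header): the switch/do-while
 * construct above is Duff's device.  It unrolls the trailing word loop by
 * eight while still handling a remainder that is not a multiple of eight.
 * Functionally it is equivalent to the plain loop
 *
 *   for (wi_t i = 0; i < wide; ++i)
 *     c[i] ^= t1[i] ^ t2[i] ^ t3[i] ^ t4[i];
 *
 * applied to the words left over after the SSE2 pass.  The same construct is
 * used in the combine routines below.
 */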

/*
 * Compute c[i] ^= t1[i] ^ t2[i] ^ t3[i] for 0 <= i < wide_in.
 */
static inline void _mzd_combine3(word *c, word const *t1, word const *t2, word const *t3, wi_t wide_in) {
  wi_t wide = wide_in;
#if __M4RI_HAVE_SSE2
  /* assuming t1 ... t3 are aligned, but c might not be */
  if (__M4RI_ALIGNMENT(c,16)==0) {
    __m128i *__c = (__m128i*)c;
    __m128i *__t1 = (__m128i*)t1;
    __m128i *__t2 = (__m128i*)t2;
    __m128i *__t3 = (__m128i*)t3;
    const __m128i *eof = (__m128i*)((unsigned long)(c + wide) & ~0xFUL);
    __m128i xmm1;

    while(__c < eof) {
      xmm1 = _mm_xor_si128(*__c, *__t1++);
      xmm1 = _mm_xor_si128(xmm1, *__t2++);
      xmm1 = _mm_xor_si128(xmm1, *__t3++);
      *__c++ = xmm1;
    }
    c  = (word*)__c;
    t1 = (word*)__t1;
    t2 = (word*)__t2;
    t3 = (word*)__t3;
    wide = ((sizeof(word) * wide) % 16) / sizeof(word);
  }
  if(!wide) {
    __M4RI_DD_RAWROW(c, wide_in);
    return;
  }
#endif // __M4RI_HAVE_SSE2
  wi_t n = (wide + 7) / 8;
  switch (wide % 8) {
  case 0: do { *c++ ^= *t1++ ^ *t2++ ^ *t3++;
    case 7:    *c++ ^= *t1++ ^ *t2++ ^ *t3++;
    case 6:    *c++ ^= *t1++ ^ *t2++ ^ *t3++;
    case 5:    *c++ ^= *t1++ ^ *t2++ ^ *t3++;
    case 4:    *c++ ^= *t1++ ^ *t2++ ^ *t3++;
    case 3:    *c++ ^= *t1++ ^ *t2++ ^ *t3++;
    case 2:    *c++ ^= *t1++ ^ *t2++ ^ *t3++;
    case 1:    *c++ ^= *t1++ ^ *t2++ ^ *t3++;
    } while (--n > 0);
  }
  __M4RI_DD_RAWROW(c, wide_in);
}

/*
 * Compute c[i] ^= t1[i] ^ t2[i] for 0 <= i < wide_in.
 */
static inline void _mzd_combine2(word *c, word const *t1, word const *t2, wi_t wide_in) {
  wi_t wide = wide_in;
#if __M4RI_HAVE_SSE2
  /* assuming t1 ... t2 are aligned, but c might not be */
  if (__M4RI_ALIGNMENT(c,16)==0) {
    __m128i *__c = (__m128i*)c;
    __m128i *__t1 = (__m128i*)t1;
    __m128i *__t2 = (__m128i*)t2;
    const __m128i *eof = (__m128i*)((unsigned long)(c + wide) & ~0xFUL);
    __m128i xmm1;

    while(__c < eof) {
      xmm1 = _mm_xor_si128(*__c, *__t1++);
      xmm1 = _mm_xor_si128(xmm1, *__t2++);
      *__c++ = xmm1;
    }
    c  = (word*)__c;
    t1 = (word*)__t1;
    t2 = (word*)__t2;
    wide = ((sizeof(word) * wide) % 16) / sizeof(word);
  }
  if(!wide) {
    __M4RI_DD_RAWROW(c, wide_in);
    return;
  }
#endif // __M4RI_HAVE_SSE2
  wi_t n = (wide + 7) / 8;
  switch (wide % 8) {
  case 0: do { *c++ ^= *t1++ ^ *t2++;
    case 7:    *c++ ^= *t1++ ^ *t2++;
    case 6:    *c++ ^= *t1++ ^ *t2++;
    case 5:    *c++ ^= *t1++ ^ *t2++;
    case 4:    *c++ ^= *t1++ ^ *t2++;
    case 3:    *c++ ^= *t1++ ^ *t2++;
    case 2:    *c++ ^= *t1++ ^ *t2++;
    case 1:    *c++ ^= *t1++ ^ *t2++;
    } while (--n > 0);
  }
  __M4RI_DD_RAWROW(c, wide_in);
}
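
/*
 * Example (sketch, not from the original header): in table-based routines
 * these helpers are typically called with one destination row and rows
 * picked from precomputed tables.  The matrix and table names below (C, T0,
 * T1) and the row indices are placeholders, and the mzd_t field names
 * `rows` and `width` are assumed to match this release.
 *
 *   word *c        = C->rows[i];
 *   word const *t1 = T0->rows[x0];
 *   word const *t2 = T1->rows[x1];
 *   _mzd_combine2(c, t1, t2, C->width);
 */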

/*
 * Compute c[i] ^= t1[i] for 0 <= i < wide_in.
 */
static inline void _mzd_combine(word *c, word const *t1, wi_t wide_in) {
  wi_t wide = wide_in;
#if __M4RI_HAVE_SSE2
  /* assuming c and t1 are aligned the same way */

  if (__M4RI_ALIGNMENT(c,16)==8 && wide) {
    *c++ ^= *t1++;
    wide--;
  }

  __m128i *__c = (__m128i*)c;
  __m128i *__t1 = (__m128i*)t1;
  const __m128i *eof = (__m128i*)((unsigned long)(c + wide) & ~0xFUL);
  __m128i xmm1;

  while(__c < eof-1) {
    xmm1 = _mm_xor_si128(*__c, *__t1++);
    *__c++ = xmm1;
    xmm1 = _mm_xor_si128(*__c, *__t1++);
    *__c++ = xmm1;
  }

  if(__c < eof) {
    xmm1 = _mm_xor_si128(*__c, *__t1++);
    *__c++ = xmm1;
  }

  c  = (word*)__c;
  t1 = (word*)__t1;
  wide = ((sizeof(word) * wide) % 16) / sizeof(word);

  if(!wide) {
    __M4RI_DD_RAWROW(c, wide_in);
    return;
  }
#endif // __M4RI_HAVE_SSE2

  wi_t n = (wide + 7) / 8;
  switch (wide % 8) {
  case 0: do { *c++ ^= *t1++;
    case 7:    *c++ ^= *t1++;
    case 6:    *c++ ^= *t1++;
    case 5:    *c++ ^= *t1++;
    case 4:    *c++ ^= *t1++;
    case 3:    *c++ ^= *t1++;
    case 2:    *c++ ^= *t1++;
    case 1:    *c++ ^= *t1++;
    } while (--n > 0);
  }
  __M4RI_DD_RAWROW(c, wide_in);
}
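
/*
 * Note (added for clarity): with a single source row, _mzd_combine(c, t1, wide)
 * simply XORs t1 into c word by word, i.e. it adds one row to another over
 * GF(2).  Unlike the variants above it assumes c and t1 share the same
 * alignment and peels off one leading word when c is only 8-byte aligned,
 * so the SSE2 loop always runs on 16-byte aligned data.  A usage sketch,
 * assuming the mzd_t field names `rows` and `width`:
 *
 *   _mzd_combine(A->rows[i], A->rows[j], A->width);   // row i += row j
 */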
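/*
 * _MZD_COMBINE expands to the eight-source or the four-source combine,
 * depending on whether __M4RI_M4RM_GRAY8 is defined (presumably the
 * eight-table Gray code variant of the M4RM routines).  It assumes that
 * variables named c, t1, ..., t8 (or t1, ..., t4) and wide are in scope at
 * the point of expansion.
 */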
#ifdef __M4RI_M4RM_GRAY8
#define _MZD_COMBINE _mzd_combine8(c, t1, t2, t3, t4, t5, t6, t7, t8, wide)
#else // __M4RI_M4RM_GRAY8
#define _MZD_COMBINE _mzd_combine4(c, t1, t2, t3, t4, wide)
#endif // __M4RI_M4RM_GRAY8

#endif // M4RI_XOR_H