M4RI 1.0.1
packedmatrix.h
Go to the documentation of this file.
00001 
00010 #ifndef M4RI_PACKEDMATRIX_H
00011 #define M4RI_PACKEDMATRIX_H
00012 
00013 /*******************************************************************
00014 *
00015 *                M4RI: Linear Algebra over GF(2)
00016 *
00017 *    Copyright (C) 2007, 2008 Gregory Bard <bard@fordham.edu>
00018 *    Copyright (C) 2008-2010 Martin Albrecht <M.R.Albrecht@rhul.ac.uk>
00019 *    Copyright (C) 2011 Carlo Wood <carlo@alinoe.com>
00020 *
00021 *  Distributed under the terms of the GNU General Public License (GPL)
00022 *  version 2 or higher.
00023 *
00024 *    This code is distributed in the hope that it will be useful,
00025 *    but WITHOUT ANY WARRANTY; without even the implied warranty of
00026 *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
00027 *    General Public License for more details.
00028 *
00029 *  The full text of the GPL is available at:
00030 *
00031 *                  http://www.gnu.org/licenses/
00032 *
00033 ********************************************************************/
00034 
00035 #include "m4ri_config.h"
00036 
00037 #include <math.h>
00038 #include <assert.h>
00039 #include <stdio.h>
00040 
00041 #if __M4RI_HAVE_SSE2
00042 #include <emmintrin.h>
00043 #endif
00044 
00045 #include "misc.h"
00046 #include "debug_dump.h"
00047 
00048 #if __M4RI_HAVE_SSE2
00049 
00056 #define __M4RI_SSE2_CUTOFF 10
00057 #endif
00058 
00065 #define __M4RI_MAX_MZD_BLOCKSIZE (((size_t)1) << 27)
00066 
00075 #define __M4RI_MUL_BLOCKSIZE MIN(((int)sqrt((double)(4 * __M4RI_CPU_L2_CACHE))) / 2, 2048)
00076 
00077 typedef struct {
00078   size_t size;
00079   word* begin;
00080   word* end;
00081 } mzd_block_t;
00082 
00089 typedef struct mzd_t {
00094   rci_t nrows;
00095 
00100   rci_t ncols;
00101 
00108   wi_t width; 
00109 
00117   wi_t rowstride;
00118 
00126   wi_t offset_vector;
00127 
00132   wi_t row_offset;
00133 
00138   uint16_t offset;
00139 
00153   uint8_t flags;
00154 
00160   uint8_t blockrows_log;
00161 
00162 #if 0   // Commented out in order to keep the size of mzd_t 64 bytes (one cache line). This could be added back if rows was ever removed.
00163 
00168   int blockrows_mask;
00169 #endif
00170 
00175   word high_bitmask;
00176 
00181   word low_bitmask;
00182 
00188   mzd_block_t *blocks;
00189 
00195   word **rows;
00196 
00197 } mzd_t;
00198 
00202 static wi_t const mzd_paddingwidth = 3;
00203 
00204 static uint8_t const mzd_flag_nonzero_offset = 0x1;
00205 static uint8_t const mzd_flag_nonzero_excess = 0x2;
00206 static uint8_t const mzd_flag_windowed_zerooffset = 0x4;
00207 static uint8_t const mzd_flag_windowed_zeroexcess = 0x8;
00208 static uint8_t const mzd_flag_windowed_ownsblocks = 0x10;
00209 static uint8_t const mzd_flag_multiple_blocks = 0x20;
00210 
00218 static inline int mzd_is_windowed(mzd_t const *M) {
00219   return M->flags & (mzd_flag_nonzero_offset | mzd_flag_windowed_zerooffset);
00220 }
00221 
00229 static inline int mzd_owns_blocks(mzd_t const *M) {
00230   return M->blocks && (!mzd_is_windowed(M) || ((M->flags & mzd_flag_windowed_ownsblocks)));
00231 }
00232 
00241 static inline word* mzd_first_row(mzd_t const *M) {
00242   word* result = M->blocks[0].begin + M->offset_vector;
00243   assert(M->nrows == 0 || result == M->rows[0]);
00244   return result;
00245 }
00246 
00257 static inline word* mzd_first_row_next_block(mzd_t const* M, int n) {
00258   assert(n > 0);
00259   return M->blocks[n].begin + M->offset_vector - M->row_offset * M->rowstride;
00260 }
00261 
00271 static inline int mzd_row_to_block(mzd_t const* M, rci_t row) {
00272   return (M->row_offset + row) >> M->blockrows_log;
00273 }
00274 
00288 static inline wi_t mzd_rows_in_block(mzd_t const* M, int n) {
00289   if (__M4RI_UNLIKELY(M->flags & mzd_flag_multiple_blocks)) {
00290     if (__M4RI_UNLIKELY(n == 0)) {
00291       return (1 << M->blockrows_log) - M->row_offset;
00292     } else {
00293       int const last_block = mzd_row_to_block(M, M->nrows - 1); 
00294       if (n < last_block)
00295         return (1 << M->blockrows_log);
00296       return M->nrows + M->row_offset - (n << M->blockrows_log);
00297     }
00298   }
00299   return n ? 0 : M->nrows;
00300 }
00301 
00311 static inline word* mzd_row(mzd_t const* M, rci_t row) {
00312   wi_t big_vector = M->offset_vector + row * M->rowstride;
00313   word* result = M->blocks[0].begin + big_vector;
00314   if (__M4RI_UNLIKELY(M->flags & mzd_flag_multiple_blocks)) {
00315     int const n = (M->row_offset + row) >> M->blockrows_log;
00316     result = M->blocks[n].begin + big_vector - n * (M->blocks[0].size / sizeof(word));
00317   }
00318   assert(result == M->rows[row]);
00319   return result;
00320 }
00321 
00332 mzd_t *mzd_init(rci_t const r, rci_t const c);
00333 
00340 void mzd_free(mzd_t *A);
00341 
00342 
00364 mzd_t *mzd_init_window(mzd_t *M, rci_t const lowr, rci_t const lowc, rci_t const highr, rci_t const highc);
00365 
00372 static inline mzd_t const *mzd_init_window_const(mzd_t const *M, rci_t const lowr, rci_t const lowc, rci_t const highr, rci_t const highc)
00373 {
00374   return mzd_init_window((mzd_t*)M, lowr, lowc, highr, highc);
00375 }
00376 
00383 #define mzd_free_window mzd_free
00384 
00394 static inline void _mzd_row_swap(mzd_t *M, rci_t const rowa, rci_t const rowb, wi_t const startblock) {
00395   if ((rowa == rowb) || (startblock >= M->width))
00396     return;
00397 
00398   /* This is the case since we're only called from _mzd_pls_mmpf,
00399    * which makes the same assumption. Therefore we don't need
00400    * to take a mask_begin into account. */
00401   assert(M->offset == 0);
00402 
00403   wi_t width = M->width - startblock - 1;
00404   word *a = M->rows[rowa] + startblock;
00405   word *b = M->rows[rowb] + startblock;
00406   word tmp; 
00407   word const mask_end = __M4RI_LEFT_BITMASK((M->ncols + M->offset) % m4ri_radix);
00408 
00409   if (width != 0) {
00410     for(wi_t i = 0; i < width; ++i) {
00411       tmp = a[i];
00412       a[i] = b[i];
00413       b[i] = tmp;
00414     }
00415   }
00416   tmp = (a[width] ^ b[width]) & mask_end;
00417   a[width] ^= tmp;
00418   b[width] ^= tmp;
00419 
00420   __M4RI_DD_ROW(M, rowa);
00421   __M4RI_DD_ROW(M, rowb);
00422 }
00423 
00432 static inline void mzd_row_swap(mzd_t *M, rci_t const rowa, rci_t const rowb) {
00433   if(rowa == rowb)
00434     return;
00435 
00436   wi_t width = M->width - 1;
00437   word *a = M->rows[rowa];
00438   word *b = M->rows[rowb];
00439   word const mask_begin = __M4RI_RIGHT_BITMASK(m4ri_radix - M->offset);
00440   word const mask_end = __M4RI_LEFT_BITMASK((M->ncols + M->offset) % m4ri_radix);
00441 
00442   word tmp = (a[0] ^ b[0]) & mask_begin;
00443   if (width != 0) {
00444     a[0] ^= tmp;
00445     b[0] ^= tmp;
00446     
00447     for(wi_t i = 1; i < width; ++i) {
00448       tmp = a[i];
00449       a[i] = b[i];
00450       b[i] = tmp;
00451     }
00452     tmp = (a[width] ^ b[width]) & mask_end;
00453     a[width] ^= tmp;
00454     b[width] ^= tmp;
00455     
00456   } else {
00457     tmp &= mask_end;
00458     a[0] ^= tmp;
00459     b[0] ^= tmp;
00460   }
00461 
00462   __M4RI_DD_ROW(M, rowa);
00463   __M4RI_DD_ROW(M, rowb);
00464 }
00465 
00478 void mzd_copy_row(mzd_t *B, rci_t i, mzd_t const *A, rci_t j);
00479 
00488 void mzd_col_swap(mzd_t *M, rci_t const cola, rci_t const colb);
00489 
00500 static inline void mzd_col_swap_in_rows(mzd_t *M, rci_t const cola, rci_t const colb, rci_t const start_row, rci_t const stop_row) {
00501   if (cola == colb)
00502     return;
00503 
00504   rci_t const _cola = cola + M->offset;
00505   rci_t const _colb = colb + M->offset;
00506 
00507   wi_t const a_word = _cola / m4ri_radix;
00508   wi_t const b_word = _colb / m4ri_radix;
00509 
00510   int const a_bit = _cola % m4ri_radix;
00511   int const b_bit = _colb % m4ri_radix;
00512 
00513   word* RESTRICT ptr = mzd_row(M, start_row);
00514   int max_bit = MAX(a_bit, b_bit);
00515   int count_remaining = stop_row - start_row;
00516   int min_bit = a_bit + b_bit - max_bit;
00517   int block = mzd_row_to_block(M, start_row);
00518   int offset = max_bit - min_bit;
00519   word mask = m4ri_one << min_bit;
00520   int count = MIN(mzd_rows_in_block(M, 0), count_remaining);
00521   // Apparently we're calling with start_row == stop_row sometimes (seems a bug to me).
00522   if (count <= 0)
00523     return;
00524 
00525   if (a_word == b_word) {
00526     while(1) {
00527       count_remaining -= count;
00528       ptr += a_word;
00529       int fast_count = count / 4;
00530       int rest_count = count - 4 * fast_count;
00531       word xor_v[4];
00532       wi_t const rowstride = M->rowstride;
00533       while (fast_count--) {
00534         xor_v[0] = ptr[0];
00535         xor_v[1] = ptr[rowstride];
00536         xor_v[2] = ptr[2 * rowstride];
00537         xor_v[3] = ptr[3 * rowstride];
00538         xor_v[0] ^= xor_v[0] >> offset;
00539         xor_v[1] ^= xor_v[1] >> offset;
00540         xor_v[2] ^= xor_v[2] >> offset;
00541         xor_v[3] ^= xor_v[3] >> offset;
00542         xor_v[0] &= mask;
00543         xor_v[1] &= mask;
00544         xor_v[2] &= mask;
00545         xor_v[3] &= mask;
00546         xor_v[0] |= xor_v[0] << offset;
00547         xor_v[1] |= xor_v[1] << offset;
00548         xor_v[2] |= xor_v[2] << offset;
00549         xor_v[3] |= xor_v[3] << offset;
00550         ptr[0] ^= xor_v[0];
00551         ptr[rowstride] ^= xor_v[1];
00552         ptr[2 * rowstride] ^= xor_v[2];
00553         ptr[3 * rowstride] ^= xor_v[3];
00554         ptr += 4 * rowstride;
00555       }
00556       while (rest_count--) {
00557         word xor_v = *ptr;
00558         xor_v ^= xor_v >> offset;
00559         xor_v &= mask;
00560         *ptr ^= xor_v | (xor_v << offset);
00561         ptr += rowstride;
00562       }
00563       if ((count = MIN(mzd_rows_in_block(M, ++block), count_remaining)) <= 0)
00564         break;
00565       ptr = mzd_first_row_next_block(M, block);
00566     }
00567   } else {
00568     word* RESTRICT min_ptr;
00569     wi_t max_offset;
00570     if (min_bit == a_bit) {
00571       min_ptr = ptr + a_word;
00572       max_offset = b_word - a_word;
00573     } else {
00574       min_ptr = ptr + b_word;
00575       max_offset = a_word - b_word;
00576     }
00577     while(1) {
00578       count_remaining -= count;
00579       wi_t const rowstride = M->rowstride;
00580       while(count--) {
00581         word xor_v = (min_ptr[0] ^ (min_ptr[max_offset] >> offset)) & mask;
00582         min_ptr[0] ^= xor_v;
00583         min_ptr[max_offset] ^= xor_v << offset;
00584         min_ptr += rowstride;
00585       }
00586       if ((count = MIN(mzd_rows_in_block(M, ++block), count_remaining)) <= 0)
00587         break;
00588       ptr = mzd_first_row_next_block(M, block);
00589       if (min_bit == a_bit)
00590         min_ptr = ptr + a_word;
00591       else
00592         min_ptr = ptr + b_word;
00593     }
00594   }
00595 
00596   __M4RI_DD_MZD(M);
00597 }
00598 
00610 static inline BIT mzd_read_bit(mzd_t const *M, rci_t const row, rci_t const col ) {
00611   return __M4RI_GET_BIT(M->rows[row][(col+M->offset)/m4ri_radix], (col+M->offset) % m4ri_radix);
00612 }
00613 
00626 static inline void mzd_write_bit(mzd_t *M, rci_t const row, rci_t const col, BIT const value) {
00627   __M4RI_WRITE_BIT(M->rows[row][(col + M->offset) / m4ri_radix], (col + M->offset) % m4ri_radix, value);
00628 }
00629 
00630 
00641 static inline void mzd_xor_bits(mzd_t const *M, rci_t const x, rci_t const y, int const n, word values) {
00642   int const spot = (y + M->offset) % m4ri_radix;
00643   wi_t const block = (y + M->offset) / m4ri_radix;
00644   M->rows[x][block] ^= values << spot;
00645   int const space = m4ri_radix - spot;
00646   if (n > space)
00647     M->rows[x][block + 1] ^= values >> space;
00648 }
00649 
00660 static inline void mzd_and_bits(mzd_t const *M, rci_t const x, rci_t const y, int const n, word values) {
00661   /* This is the best way, since this will drop out once we inverse the bits in values: */
00662   values >>= (m4ri_radix - n);  /* Move the bits to the lowest columns */
00663 
00664   int const spot = (y + M->offset) % m4ri_radix;
00665   wi_t const block = (y + M->offset) / m4ri_radix;
00666   M->rows[x][block] &= values << spot;
00667   int const space = m4ri_radix - spot;
00668   if (n > space)
00669     M->rows[x][block + 1] &= values >> space;
00670 }
00671 
00681 static inline void mzd_clear_bits(mzd_t const *M, rci_t const x, rci_t const y, int const n) {
00682   word values = m4ri_ffff >> (m4ri_radix - n);
00683   int const spot = (y + M->offset) % m4ri_radix;
00684   wi_t const block = (y + M->offset) / m4ri_radix;
00685   M->rows[x][block] &= ~(values << spot);
00686   int const space = m4ri_radix - spot;
00687   if (n > space)
00688     M->rows[x][block + 1] &= ~(values >> space);
00689 }
00690 
00699 void mzd_print(mzd_t const *M);
00700 
00707 void mzd_print_tight(mzd_t const *M);
00708 
00719 static inline void mzd_row_add_offset(mzd_t *M, rci_t dstrow, rci_t srcrow, rci_t coloffset) {
00720   assert(dstrow < M->nrows && srcrow < M->nrows && coloffset < M->ncols);
00721   coloffset += M->offset;
00722   wi_t const startblock= coloffset/m4ri_radix;
00723   wi_t wide = M->width - startblock;
00724   word *src = M->rows[srcrow] + startblock;
00725   word *dst = M->rows[dstrow] + startblock;
00726   word const mask_begin = __M4RI_RIGHT_BITMASK(m4ri_radix - coloffset % m4ri_radix);
00727   word const mask_end = __M4RI_LEFT_BITMASK((M->ncols + M->offset) % m4ri_radix);
00728 
00729   *dst++ ^= *src++ & mask_begin;
00730   --wide;
00731 
00732 #if __M4RI_HAVE_SSE2 
00733   int not_aligned = __M4RI_ALIGNMENT(src,16) != 0;      /* 0: Aligned, 1: Not aligned */
00734   if (wide > not_aligned + 1)                           /* Speed up for small matrices */
00735   {
00736     if (not_aligned) {
00737       *dst++ ^= *src++;
00738       --wide;
00739     }
00740     /* Now wide > 1 */
00741     __m128i* __src = (__m128i*)src;
00742     __m128i* __dst = (__m128i*)dst;
00743     __m128i* const eof = (__m128i*)((unsigned long)(src + wide) & ~0xFUL);
00744     do
00745     {
00746       __m128i xmm1 = _mm_xor_si128(*__dst, *__src);
00747       *__dst++ = xmm1;
00748     }
00749     while(++__src < eof);
00750     src  = (word*)__src;
00751     dst = (word*)__dst;
00752     wide = ((sizeof(word)*wide)%16)/sizeof(word);
00753   }
00754 #endif
00755   wi_t i = -1;
00756   while(++i < wide)
00757     dst[i] ^= src[i];
00758   /* 
00759    * Revert possibly non-zero excess bits.
00760    * Note that i == wide here, and wide can be 0.
00761    * But really, src[wide - 1] is M->rows[srcrow][M->width - 1] ;)
00762    * We use i - 1 here to let the compiler know these are the same addresses
00763    * that we last accessed, in the previous loop.
00764    */
00765   dst[i - 1] ^= src[i - 1] & ~mask_end;
00766 
00767   __M4RI_DD_ROW(M, dstrow);
00768 }
00769 
00781 void mzd_row_add(mzd_t *M, rci_t const sourcerow, rci_t const destrow);
00782 
00797 mzd_t *mzd_transpose(mzd_t *DST, mzd_t const *A);
00798 
00813 mzd_t *mzd_mul_naive(mzd_t *C, mzd_t const *A, mzd_t const *B);
00814 
00829 mzd_t *mzd_addmul_naive(mzd_t *C, mzd_t const *A, mzd_t const *B);
00830 
00842 mzd_t *_mzd_mul_naive(mzd_t *C, mzd_t const *A, mzd_t const *B, int const clear);
00843 
00853 mzd_t *_mzd_mul_va(mzd_t *C, mzd_t const *v, mzd_t const *A, int const clear);
00854 
00865 void mzd_randomize(mzd_t *M);
00866 
00881 void mzd_set_ui(mzd_t *M, unsigned int const value);
00882 
00898 rci_t mzd_gauss_delayed(mzd_t *M, rci_t const startcol, int const full);
00899 
00915 rci_t mzd_echelonize_naive(mzd_t *M, int const full);
00916 
00926 int mzd_equal(mzd_t const *A, mzd_t const *B);
00927 
00941 int mzd_cmp(mzd_t const *A, mzd_t const *B);
00942 
00950 mzd_t *mzd_copy(mzd_t *DST, mzd_t const *A);
00951 
00972 mzd_t *mzd_concat(mzd_t *C, mzd_t const *A, mzd_t const *B);
00973 
00993 mzd_t *mzd_stack(mzd_t *C, mzd_t const *A, mzd_t const *B);
00994 
01007 mzd_t *mzd_submatrix(mzd_t *S, mzd_t const *M, rci_t const lowr, rci_t const lowc, rci_t const highr, rci_t const highc);
01008 
01022 mzd_t *mzd_invert_naive(mzd_t *INV, mzd_t const *A, mzd_t const *I);
01023 
01035 mzd_t *mzd_add(mzd_t *C, mzd_t const *A, mzd_t const *B);
01036 
01047 mzd_t *_mzd_add(mzd_t *C, mzd_t const *A, mzd_t const *B);
01048 
01059 #define mzd_sub mzd_add
01060 
01071 #define _mzd_sub _mzd_add
01072 
01073 
01074 
01084 static inline word mzd_read_bits(mzd_t const *M, rci_t const x, rci_t const y, int const n) {
01085   int const spot = (y + M->offset) % m4ri_radix;
01086   wi_t const block = (y + M->offset) / m4ri_radix;
01087   int const spill = spot + n - m4ri_radix;
01088   word temp = (spill <= 0) ? M->rows[x][block] << -spill : (M->rows[x][block + 1] << (m4ri_radix - spill)) | (M->rows[x][block] >> spill);
01089   return temp >> (m4ri_radix - n);
01090 }
01091 
01092 
01112 void mzd_combine(mzd_t *DST, rci_t const row3, wi_t const startblock3,
01113                  mzd_t const *SC1, rci_t const row1, wi_t const startblock1, 
01114                  mzd_t const *SC2, rci_t const row2, wi_t const startblock2);
01115 
01116 
01136 static inline void mzd_combine_weird(mzd_t *C,       rci_t const c_row, wi_t const c_startblock,
01137                                      mzd_t const *A, rci_t const a_row, wi_t const a_startblock, 
01138                                      mzd_t const *B, rci_t const b_row, wi_t const b_startblock) {
01139   word tmp;
01140   rci_t i = 0;
01141 
01142 
01143   for(; i + m4ri_radix <= A->ncols; i += m4ri_radix) {
01144     tmp = mzd_read_bits(A, a_row, i, m4ri_radix) ^ mzd_read_bits(B, b_row, i, m4ri_radix);
01145     mzd_clear_bits(C, c_row, i, m4ri_radix);
01146     mzd_xor_bits(C, c_row, i, m4ri_radix, tmp);
01147   }
01148   if(A->ncols - i) {
01149     tmp = mzd_read_bits(A, a_row, i, (A->ncols - i)) ^ mzd_read_bits(B, b_row, i, (B->ncols - i));
01150     mzd_clear_bits(C, c_row, i, (C->ncols - i));
01151     mzd_xor_bits(C, c_row, i, (C->ncols - i), tmp);
01152   }
01153 
01154   __M4RI_DD_MZD(C);
01155 }
01156 
01173 static inline void mzd_combine_even_in_place(mzd_t *A,       rci_t const a_row, wi_t const a_startblock,
01174                                              mzd_t const *B, rci_t const b_row, wi_t const b_startblock) {
01175 
01176   wi_t wide = A->width - a_startblock - 1;
01177 
01178   word *a = A->rows[a_row] + a_startblock;
01179   word *b = B->rows[b_row] + b_startblock;
01180   
01181 #if __M4RI_HAVE_SSE2
01182   if(wide > __M4RI_SSE2_CUTOFF) {
01184     if (__M4RI_ALIGNMENT(a,16)) {
01185       *a++ ^= *b++;
01186       wide--;
01187     }
01188     
01189     if (__M4RI_ALIGNMENT(a, 16) == 0 && __M4RI_ALIGNMENT(b, 16) == 0) {
01190       __m128i *a128 = (__m128i*)a;
01191       __m128i *b128 = (__m128i*)b;
01192       const __m128i *eof = (__m128i*)((unsigned long)(a + wide) & ~0xFUL);
01193       
01194       do {
01195         *a128 = _mm_xor_si128(*a128, *b128);
01196         ++b128;
01197         ++a128;
01198       } while(a128 < eof);
01199       
01200       a = (word*)a128;
01201       b = (word*)b128;
01202       wide = ((sizeof(word) * wide) % 16) / sizeof(word);
01203     }
01204   }
01205 #endif // __M4RI_HAVE_SSE2
01206 
01207   if (wide > 0) {
01208     wi_t n = (wide + 7) / 8;
01209     switch (wide % 8) {
01210     case 0: do { *(a++) ^= *(b++);
01211     case 7:      *(a++) ^= *(b++);
01212     case 6:      *(a++) ^= *(b++);
01213     case 5:      *(a++) ^= *(b++);
01214     case 4:      *(a++) ^= *(b++);
01215     case 3:      *(a++) ^= *(b++);
01216     case 2:      *(a++) ^= *(b++);
01217     case 1:      *(a++) ^= *(b++);
01218     } while (--n > 0);
01219     }
01220   }
01221 
01222   *a ^= *b & __M4RI_LEFT_BITMASK(A->ncols%m4ri_radix);
01223 
01224   __M4RI_DD_MZD(A);
01225 }
01226 
01227 
01247 static inline void mzd_combine_even(mzd_t *C,       rci_t const c_row, wi_t const c_startblock,
01248                                     mzd_t const *A, rci_t const a_row, wi_t const a_startblock, 
01249                                     mzd_t const *B, rci_t const b_row, wi_t const b_startblock) {
01250 
01251   wi_t wide = A->width - a_startblock - 1;
01252   word *a = A->rows[a_row] + a_startblock;
01253   word *b = B->rows[b_row] + b_startblock;
01254   word *c = C->rows[c_row] + c_startblock;
01255   
01256   /* /\* this is a corner case triggered by Strassen multiplication */
01257   /*  * which assumes certain (virtual) matrix sizes  */
01258   /*  * 2011/03/07: I don't think this was ever correct *\/ */
01259   /* if (a_row >= A->nrows) { */
01260   /*   assert(a_row < A->nrows); */
01261   /*   for(wi_t i = 0; i < wide; ++i) { */
01262   /*     c[i] = b[i]; */
01263   /*   } */
01264   /* } else { */
01265 #if __M4RI_HAVE_SSE2
01266   if(wide > __M4RI_SSE2_CUTOFF) {
01268     if (__M4RI_ALIGNMENT(a,16)) {
01269       *c++ = *b++ ^ *a++;
01270       wide--;
01271     }
01272       
01273     if ( (__M4RI_ALIGNMENT(b, 16) | __M4RI_ALIGNMENT(c, 16)) == 0) {
01274       __m128i *a128 = (__m128i*)a;
01275       __m128i *b128 = (__m128i*)b;
01276       __m128i *c128 = (__m128i*)c;
01277       const __m128i *eof = (__m128i*)((unsigned long)(a + wide) & ~0xFUL);
01278       
01279       do {
01280         *c128 = _mm_xor_si128(*a128, *b128);
01281         ++c128;
01282         ++b128;
01283         ++a128;
01284       } while(a128 < eof);
01285       
01286       a = (word*)a128;
01287       b = (word*)b128;
01288       c = (word*)c128;
01289       wide = ((sizeof(word) * wide) % 16) / sizeof(word);
01290     }
01291   }
01292 #endif // __M4RI_HAVE_SSE2
01293 
01294   if (wide > 0) {
01295     wi_t n = (wide + 7) / 8;
01296     switch (wide % 8) {
01297     case 0: do { *(c++) = *(a++) ^ *(b++);
01298     case 7:      *(c++) = *(a++) ^ *(b++);
01299     case 6:      *(c++) = *(a++) ^ *(b++);
01300     case 5:      *(c++) = *(a++) ^ *(b++);
01301     case 4:      *(c++) = *(a++) ^ *(b++);
01302     case 3:      *(c++) = *(a++) ^ *(b++);
01303     case 2:      *(c++) = *(a++) ^ *(b++);
01304     case 1:      *(c++) = *(a++) ^ *(b++);
01305     } while (--n > 0);
01306     }
01307   }
01308   *c ^= ((*a ^ *b ^ *c) & __M4RI_LEFT_BITMASK(C->ncols%m4ri_radix));
01309 
01310   __M4RI_DD_MZD(C);
01311 }
01312 
01313 
01322 static inline int mzd_read_bits_int(mzd_t const *M, rci_t const x, rci_t const y, int const n)
01323 {
01324   return __M4RI_CONVERT_TO_INT(mzd_read_bits(M, x, y, n));
01325 }
01326 
01327 
01334 int mzd_is_zero(mzd_t const *A);
01335 
01344 void mzd_row_clear_offset(mzd_t *M, rci_t const row, rci_t const coloffset);
01345 
01362 int mzd_find_pivot(mzd_t const *M, rci_t start_row, rci_t start_col, rci_t *r, rci_t *c);
01363 
01364 
01377 double mzd_density(mzd_t const *A, wi_t res);
01378 
01393 double _mzd_density(mzd_t const *A, wi_t res, rci_t r, rci_t c);
01394 
01395 
01404 rci_t mzd_first_zero_row(mzd_t const *A);
01405 
01406 #endif // M4RI_PACKEDMATRIX_H