ergo
mm_kernel_inner_sse2_A.h
Go to the documentation of this file.
1 /* Ergo, version 3.4, a program for linear scaling electronic structure
2  * calculations.
3  * Copyright (C) 2014 Elias Rudberg, Emanuel H. Rubensson, and Pawel Salek.
4  *
5  * This program is free software: you can redistribute it and/or modify
6  * it under the terms of the GNU General Public License as published by
7  * the Free Software Foundation, either version 3 of the License, or
8  * (at your option) any later version.
9  *
10  * This program is distributed in the hope that it will be useful,
11  * but WITHOUT ANY WARRANTY; without even the implied warranty of
12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13  * GNU General Public License for more details.
14  *
15  * You should have received a copy of the GNU General Public License
16  * along with this program. If not, see <http://www.gnu.org/licenses/>.
17  *
18  * Primary academic reference:
19  * Kohn−Sham Density Functional Theory Electronic Structure Calculations
20  * with Linearly Scaling Computational Time and Memory Usage,
21  * Elias Rudberg, Emanuel H. Rubensson, and Pawel Salek,
22  * J. Chem. Theory Comput. 7, 340 (2011),
23  * <http://dx.doi.org/10.1021/ct100611z>
24  *
25  * For further information about Ergo, see <http://www.ergoscf.org>.
26  */
27 /*
28  * Templates for efficient gemm kernels.
29  * For architectures with SSE2 or higher.
30  *
31  * Copyright Emanuel Rubensson, 2009
32  *
33  *
34  *
35  *
36  */
37 #ifndef MM_KERNEL_INNER_SSE2_A_H
38 #define MM_KERNEL_INNER_SSE2_A_H
39 #include "common.h"
40 #include "vector_intrin.h"
41 
60 template<typename T_real, typename T_reg, int T_M, int T_N, int T_K>
62  public:
63  typedef T_real real;
64  static int const M = T_M;
65  static int const N = T_N;
66  static int const K = T_K;
67  protected:
68  static int const floats_per_register = ( sizeof(T_reg) / sizeof(real) );
71  private:
73  template<int T_ROWS_kernel, int T_COLS_kernel, typename T_ordering_kernel, int T_repetitions>
74  class Pack;
75  public:
82  // Consider passing derived class from std::vector as arguments
83  // that have compile time constant length that can be checked at
84  // compile time using static asserts
88  static void exec( real const * const * const A,
89  real const * const * const B,
90  real * const C,
91  int const i = 1,
92  int const offset_A = 0,
93  int const offset_B = 0,
94  int const offset_C = 0 );
95 
96  template<int T_offset_A, int T_offset_B, int T_offset_C>
97  static void exec( real const * const * const A,
98  real const * const * const B,
99  real * const C,
100  int const i = 1 );
101 
102 
103 
104  protected:
105  template<int T_loop_index, int T_end>
106  struct Loop {
107  static inline void ALWAYS_INLINE set_to_zero( Vector_intrin<real, T_reg> * X_reg ) {
108  X_reg[T_loop_index].set_to_zero();
110  }
111  static inline void ALWAYS_INLINE inner( int const row_A_reg, // == row_C_reg
112  int const row_B,
113  Vector_intrin<real, T_reg> const & A_reg,
115  real const * B_packed ) {
117  B_reg.load_p( &B_packed[row_B * T_N * floats_per_register +
118  T_loop_index * floats_per_register] );
119  B_reg *= A_reg;
120  C_reg[row_A_reg + T_loop_index * T_M / floats_per_register] += B_reg;
121  Loop<T_loop_index+1, T_end>::inner( row_A_reg, row_B,
122  A_reg, C_reg,
123  B_packed );
124  }
125  static inline void ALWAYS_INLINE middle( int const col_A, // == row_B
127  real const * A,
128  real const * B_packed ) {
130  A_reg.load_p( &A[col_A * T_M + T_loop_index * floats_per_register] );
131  // Loop over cols of B
132  Loop<0, T_N>::inner( T_loop_index, // == row_A_reg == row_C_reg
133  col_A, // == row_B
134  A_reg,
135  C_reg,
136  B_packed );
137  Loop<T_loop_index+1, T_end>::middle( col_A, C_reg, A, B_packed );
138  }
139  static inline void ALWAYS_INLINE outer( int const start_i,
141  real const * A,
142  real const * B_packed ) {
143  // Loop over (register) rows of A and C
144  Loop<0, T_M/floats_per_register>::middle( start_i + T_loop_index,
145  C_reg,
146  A,
147  B_packed );
148  Loop<T_loop_index+1, T_end>::outer( start_i, C_reg, A, B_packed );
149  }
150  static inline void ALWAYS_INLINE add( Vector_intrin<real, T_reg> * X_reg,
151  real const * X ) {
152  X_reg[T_loop_index] += &X[T_loop_index * floats_per_register];
154  }
155  static inline void ALWAYS_INLINE store( Vector_intrin<real, T_reg> const * X_reg,
156  real * X ) {
157  X_reg[T_loop_index].store_p( &X[T_loop_index * floats_per_register] );
159  }
160 
162  real const * const * const A,
163  real const * const * const B ) {
164  // Loop over columns of A and rows of B^T
165  Loop<0, T_K>::outer( 0, C_reg, A[T_loop_index], B[T_loop_index] );
167  }
168  };
169 
170  template<int T_end>
171  struct Loop<T_end, T_end> {
172  static inline void ALWAYS_INLINE set_to_zero( Vector_intrin<real, T_reg> * X_reg ) {}
173  static inline void ALWAYS_INLINE inner( int const row_A_reg, // == row_C_reg
174  int const row_B,
175  Vector_intrin<real, T_reg> const & A_reg,
177  real const * B_packed ) {}
178  static inline void ALWAYS_INLINE middle( int const col_A, // == row_B
180  real const * A,
181  real const * B_packed ) {}
182  static inline void ALWAYS_INLINE outer( int const start_i,
184  real const * A,
185  real const * B_packed ) {}
186  static inline void ALWAYS_INLINE add( Vector_intrin<real, T_reg> * X_reg,
187  real const * X ) {}
188  static inline void ALWAYS_INLINE store( Vector_intrin<real, T_reg> const * X_reg,
189  real * X ) {}
191  real const * const * const A,
192  real const * const * const B ) {}
193  };
194 };
195 
196 // Doesn't matter if inlined or not... same performance for 4, 4, 5
197 // IT DOES MATTER WHEN OUTER CONTROL STRUCTURE IS ADDED ON TOP!!!
198 // THEN, INLINING IS BAD
199 template<typename real, typename T_reg, int T_M, int T_N, int T_K>
201  real const * const * const B,
202  real * const C,
203  int const i,
204  int const offset_A,
205  int const offset_B,
206  int const offset_C) {
207  STATIC_ASSERT_DEBUG(!(T_M%floats_per_register), TEMPLATE_ARGUMENT_T_M_MUST_BE_MULTIPLE_OF_floats_per_register);
209  MM_kernel_inner_sse2_A<real, T_reg, T_M, T_N, T_K>::template Loop<0, T_M * T_N / floats_per_register>::set_to_zero( C_reg );
210 #if 1 // I loose a bit performance because of the offsets
211  for (int ind = 0; ind < i; ++ind)
212  MM_kernel_inner_sse2_A<real, T_reg, T_M, T_N, T_K>::template Loop<0, T_K>::outer( 0, C_reg, A[ind] + offset_A, B[ind] + offset_B );
214  // MM_kernel_inner_sse2_A<real, T_reg, T_M, T_N, T_K>::template Loop<0, T_K>::outer( 0, C_reg, A, B );
215  MM_kernel_inner_sse2_A<real, T_reg, T_M, T_N, T_K>::template Loop<0, T_M * T_N / floats_per_register>::add( C_reg, C + offset_C);
216  MM_kernel_inner_sse2_A<real, T_reg, T_M, T_N, T_K>::template Loop<0, T_M * T_N / floats_per_register>::store( C_reg, C + offset_C);
217 #else
218  for (int ind = 0; ind < i; ++ind)
221  // MM_kernel_inner_sse2_A<real, T_reg, T_M, T_N, T_K>::template Loop<0, T_K>::outer( 0, C_reg, A, B );
222  MM_kernel_inner_sse2_A<real, T_reg, T_M, T_N, T_K>::template Loop<0, T_M * T_N / floats_per_register>::add( C_reg, C);
223  MM_kernel_inner_sse2_A<real, T_reg, T_M, T_N, T_K>::template Loop<0, T_M * T_N / floats_per_register>::store( C_reg, C);
224 #endif
225 } // end exec
226 
227 template<typename real, typename T_reg, int T_M, int T_N, int T_K>
228  template<int T_offset_A, int T_offset_B, int T_offset_C>
230  real const * const * const B,
231  real * const C,
232  int const i ) {
233  STATIC_ASSERT_DEBUG(!(T_M%floats_per_register), TEMPLATE_ARGUMENT_T_M_MUST_BE_MULTIPLE_OF_floats_per_register);
235  MM_kernel_inner_sse2_A<real, T_reg, T_M, T_N, T_K>::template Loop<0, T_M * T_N / floats_per_register>::set_to_zero( C_reg );
236  for (int ind = 0; ind < i; ++ind)
237  MM_kernel_inner_sse2_A<real, T_reg, T_M, T_N, T_K>::template Loop<0, T_K>::outer( 0, C_reg, A[ind] + T_offset_A, B[ind] + T_offset_B );
239  // MM_kernel_inner_sse2_A<real, T_reg, T_M, T_N, T_K>::template Loop<0, T_K>::outer( 0, C_reg, A, B );
240  MM_kernel_inner_sse2_A<real, T_reg, T_M, T_N, T_K>::template Loop<0, T_M * T_N / floats_per_register>::add( C_reg, C + T_offset_C);
241  MM_kernel_inner_sse2_A<real, T_reg, T_M, T_N, T_K>::template Loop<0, T_M * T_N / floats_per_register>::store( C_reg, C + T_offset_C);
242 } // end exec template
243 
244 
245 
246 
247 
248 
250 template<typename real, typename T_reg, int T_M, int T_N, int T_K>
251  template<int T_rows, int T_cols, typename T_ordering_kernel, int T_repetitions>
252  class MM_kernel_inner_sse2_A<real, T_reg, T_M, T_N, T_K>::Pack {
253  public:
254  static int const size_packed = T_rows * T_cols * T_repetitions;
255  static int const rows = T_rows;
256  static int const cols = T_cols;
257 
258  template<typename T_ordering_matrix>
260  typedef real * PtrTypePacked;
261  typedef real const * const PtrType;
262  inline static void exec( PtrType X, PtrTypePacked X_packed,
263  int const row_k,
264  int const col_k,
265  int const rows_total_matrix,
266  int const cols_total_matrix ) {
267  for ( int ir = 0; ir < T_repetitions; ++ir)
268  X_packed[ T_ordering_kernel::get( row_k, col_k, T_rows, T_cols ) * T_repetitions + ir ]
269  = X[ T_ordering_matrix::get(row_k, col_k, rows_total_matrix, cols_total_matrix) ];
270  }
271  };
272 
273  template<typename T_ordering_matrix>
275  typedef real const * const PtrTypePacked;
276  typedef real * PtrType;
277  inline static void exec( PtrType X, PtrTypePacked X_packed,
278  int const row_k,
279  int const col_k,
280  int const rows_total_matrix,
281  int const cols_total_matrix ) {
282  for ( int ir = 0; ir < T_repetitions; ++ir)
283  X[ T_ordering_matrix::get(row_k, col_k, rows_total_matrix, cols_total_matrix) ] =
284  X_packed[ T_ordering_kernel::get( row_k, col_k, T_rows, T_cols ) * T_repetitions + ir ];
285  }
286  };
287 
288  template<template<typename T_ordering> class T_assign, typename T_ordering_matrix>
289  static void exec(typename T_assign<T_ordering_matrix>::PtrType X,
290  typename T_assign<T_ordering_matrix>::PtrTypePacked X_packed,
291  int const rows_total_matrix, int const cols_total_matrix) {
292  // Loop over columns of kernel
293  for ( int col_k = 0; col_k < T_cols; ++col_k ) {
294  // Loop over rows of kernel
295  for ( int row_k = 0; row_k < T_rows; ++row_k ) {
296  T_assign<T_ordering_matrix>::exec( X, X_packed,
297  row_k, col_k,
298  rows_total_matrix, cols_total_matrix );
299  }
300  }
301  } // end exec()
302 
307  template<typename T_ordering_matrix>
308  inline static void pack(real const * const X, real * X_packed,
309  int const rows_total_matrix, int const cols_total_matrix) {
310  exec< Assign_to_packed, T_ordering_matrix >(X, X_packed, rows_total_matrix, cols_total_matrix);
311  }
316  template<typename T_ordering_matrix>
317  inline static void unpack(real * X, real const * const X_packed,
318  int const rows_total_matrix, int const cols_total_matrix) {
319  exec< Extract_from_packed, T_ordering_matrix >(X, X_packed, rows_total_matrix, cols_total_matrix);
320  }
321 };
322 #endif
#define A
static int const floats_per_register
Number of real numbers that fit in one register.
Definition: mm_kernel_inner_sse2_A.h:68
Definition: mm_kernel_inner_sse2_A.h:259
static void ALWAYS_INLINE set_to_zero(Vector_intrin< real, T_reg > *X_reg)
Definition: mm_kernel_inner_sse2_A.h:172
Pack< M, N, Ordering_col_wise, 1 > Pack_type_C
Type that can (should) be used to pack C.
Definition: mm_kernel_inner_sse2_A.h:78
static void ALWAYS_INLINE multiple_loop(Vector_intrin< real, T_reg > *C_reg, real const *const *const A, real const *const *const B)
Definition: mm_kernel_inner_sse2_A.h:161
static void exec(real const *const *const A, real const *const *const B, real *const C, int const i=1, int const offset_A=0, int const offset_B=0, int const offset_C=0)
Executes the matrix-matrix multiply C += A B with the three matrices A, B, and C stored according to ...
Definition: mm_kernel_inner_sse2_A.h:200
static int const M
Number of rows of A and C.
Definition: mm_kernel_inner_sse2_A.h:64
static void ALWAYS_INLINE multiple_loop(Vector_intrin< real, T_reg > *C_reg, real const *const *const A, real const *const *const B)
Definition: mm_kernel_inner_sse2_A.h:190
#define STATIC_ASSERT_DEBUG(expr, msg)
Definition: common.h:58
Matrix multiplication template for architectures with SSE2 or higher and compilers that support C++ i...
Definition: mm_kernel_inner_sse2_A.h:61
Definition: mm_kernel_inner_sse2_A.h:106
static void ALWAYS_INLINE middle(int const col_A, Vector_intrin< real, T_reg > *C_reg, real const *A, real const *B_packed)
Definition: mm_kernel_inner_sse2_A.h:125
static void ALWAYS_INLINE outer(int const start_i, Vector_intrin< real, T_reg > *C_reg, real const *A, real const *B_packed)
Definition: mm_kernel_inner_sse2_A.h:182
static int const N
Number of columns of B and C.
Definition: mm_kernel_inner_sse2_A.h:65
void ALWAYS_INLINE set_to_zero()
Definition: vector_intrin.h:79
static void pack(real const *const X, real *X_packed, int const rows_total_matrix, int const cols_total_matrix)
Convenience function for assignments to packed matrix.
Definition: mm_kernel_inner_sse2_A.h:308
static void exec(PtrType X, PtrTypePacked X_packed, int const row_k, int const col_k, int const rows_total_matrix, int const cols_total_matrix)
Definition: mm_kernel_inner_sse2_A.h:262
real * PtrType
Type of matrix pointer - note the absence of const qualifiers.
Definition: mm_kernel_inner_sse2_A.h:276
static void ALWAYS_INLINE inner(int const row_A_reg, int const row_B, Vector_intrin< real, T_reg > const &A_reg, Vector_intrin< real, T_reg > *C_reg, real const *B_packed)
Definition: mm_kernel_inner_sse2_A.h:173
Pack< K, N, Ordering_row_wise, floats_per_register > Pack_type_B
Type that can (should) be used to pack B.
Definition: mm_kernel_inner_sse2_A.h:77
#define ALWAYS_INLINE
Definition: common.h:31
static void ALWAYS_INLINE outer(int const start_i, Vector_intrin< real, T_reg > *C_reg, real const *A, real const *B_packed)
Definition: mm_kernel_inner_sse2_A.h:139
static void ALWAYS_INLINE middle(int const col_A, Vector_intrin< real, T_reg > *C_reg, real const *A, real const *B_packed)
Definition: mm_kernel_inner_sse2_A.h:178
static void ALWAYS_INLINE store(Vector_intrin< real, T_reg > const *X_reg, real *X)
Definition: mm_kernel_inner_sse2_A.h:188
static void ALWAYS_INLINE add(Vector_intrin< real, T_reg > *X_reg, real const *X)
Definition: mm_kernel_inner_sse2_A.h:150
static void ALWAYS_INLINE inner(int const row_A_reg, int const row_B, Vector_intrin< real, T_reg > const &A_reg, Vector_intrin< real, T_reg > *C_reg, real const *B_packed)
Definition: mm_kernel_inner_sse2_A.h:111
Template for packing of matrix elements.
Definition: mm_kernel_inner_sse2_A.h:74
void ALWAYS_INLINE store_p(Treal *ptr) const
Definition: vector_intrin.h:57
Pack< M, K, Ordering_col_wise, 1 > Pack_type_A
Type that can (should) be used to pack A.
Definition: mm_kernel_inner_sse2_A.h:74
T_real real
Real number type (usually float or double)
Definition: mm_kernel_inner_sse2_A.h:63
static int const K
Number of columns of A and rows of B.
Definition: mm_kernel_inner_sse2_A.h:66
Definition: mm_kernel_inner_sse2_A.h:274
static void ALWAYS_INLINE add(Vector_intrin< real, T_reg > *X_reg, real const *X)
Definition: mm_kernel_inner_sse2_A.h:186
static void ALWAYS_INLINE set_to_zero(Vector_intrin< real, T_reg > *X_reg)
Definition: mm_kernel_inner_sse2_A.h:107
static void exec(PtrType X, PtrTypePacked X_packed, int const row_k, int const col_k, int const rows_total_matrix, int const cols_total_matrix)
Definition: mm_kernel_inner_sse2_A.h:277
#define B
static void unpack(real *X, real const *const X_packed, int const rows_total_matrix, int const cols_total_matrix)
Convenience function for extracting matrix from packed matrix.
Definition: mm_kernel_inner_sse2_A.h:317
static void exec(typename T_assign< T_ordering_matrix >::PtrType X, typename T_assign< T_ordering_matrix >::PtrTypePacked X_packed, int const rows_total_matrix, int const cols_total_matrix)
Definition: mm_kernel_inner_sse2_A.h:289
real * PtrTypePacked
Type of packed pointer - note the absence of const qualifiers.
Definition: mm_kernel_inner_sse2_A.h:260
static void ALWAYS_INLINE store(Vector_intrin< real, T_reg > const *X_reg, real *X)
Definition: mm_kernel_inner_sse2_A.h:155
real const *const PtrType
Type of matrix pointer - note the presence of const qualifiers.
Definition: mm_kernel_inner_sse2_A.h:261
real const *const PtrTypePacked
Type of packed pointer - note the presence of const qualifiers.
Definition: mm_kernel_inner_sse2_A.h:275
void ALWAYS_INLINE load_p(Treal const *ptr)
Definition: vector_intrin.h:51
Vector class template for access to SIMD operations.
Definition: vector_intrin.h:49