39 #ifndef MM_KERNEL_INNER_SSE2_A_H 40 #define MM_KERNEL_INNER_SSE2_A_H 62 template<
typename T_real,
typename T_reg,
int T_M,
int T_N,
int T_K>
66 static int const M = T_M;
67 static int const N = T_N;
68 static int const K = T_K;
75 template<
int T_ROWS_kernel,
int T_COLS_kernel,
typename T_ordering_kernel,
int T_repetitions>
90 static void exec(
real const *
const *
const A,
91 real const *
const *
const B,
94 int const offset_A = 0,
95 int const offset_B = 0,
96 int const offset_C = 0 );
98 template<
int T_offset_A,
int T_offset_B,
int T_offset_C>
99 static void exec(
real const *
const *
const A,
100 real const *
const *
const B,
107 template<
int T_loop_index,
int T_end>
117 real const * B_packed ) {
130 real const * B_packed ) {
144 real const * B_packed ) {
164 real const *
const *
const A,
165 real const *
const *
const B ) {
179 real const * B_packed ) {}
183 real const * B_packed ) {}
187 real const * B_packed ) {}
193 real const *
const *
const A,
194 real const *
const *
const B ) {}
201 template<
typename real,
typename T_reg,
int T_M,
int T_N,
int T_K>
203 real const *
const *
const B,
208 int const offset_C) {
209 STATIC_ASSERT_DEBUG(!(T_M%floats_per_register), TEMPLATE_ARGUMENT_T_M_MUST_BE_MULTIPLE_OF_floats_per_register);
212 #if 1 // I lose a bit of performance because of the offsets 213 for (
int ind = 0; ind < i; ++ind)
220 for (
int ind = 0; ind < i; ++ind)
229 template<
typename real,
typename T_reg,
int T_M,
int T_N,
int T_K>
230 template<
int T_offset_A,
int T_offset_B,
int T_offset_C>
232 real const *
const *
const B,
235 STATIC_ASSERT_DEBUG(!(T_M%floats_per_register), TEMPLATE_ARGUMENT_T_M_MUST_BE_MULTIPLE_OF_floats_per_register);
238 for (
int ind = 0; ind < i; ++ind)
252 template<
typename real,
typename T_reg,
int T_M,
int T_N,
int T_K>
253 template<
int T_rows,
int T_cols,
typename T_ordering_kernel,
int T_repetitions>
256 static int const size_packed = T_rows * T_cols * T_repetitions;
257 static int const rows = T_rows;
258 static int const cols = T_cols;
260 template<
typename T_ordering_matrix>
267 int const rows_total_matrix,
268 int const cols_total_matrix ) {
269 for (
int ir = 0; ir < T_repetitions; ++ir)
270 X_packed[ T_ordering_kernel::get( row_k, col_k, T_rows, T_cols ) * T_repetitions + ir ]
271 = X[ T_ordering_matrix::get(row_k, col_k, rows_total_matrix, cols_total_matrix) ];
275 template<
typename T_ordering_matrix>
282 int const rows_total_matrix,
283 int const cols_total_matrix ) {
284 for (
int ir = 0; ir < T_repetitions; ++ir)
285 X[ T_ordering_matrix::get(row_k, col_k, rows_total_matrix, cols_total_matrix) ] =
286 X_packed[ T_ordering_kernel::get( row_k, col_k, T_rows, T_cols ) * T_repetitions + ir ];
290 template<
template<
typename T_ordering>
class T_assign,
typename T_ordering_matrix>
291 static void exec(
typename T_assign<T_ordering_matrix>::PtrType X,
292 typename T_assign<T_ordering_matrix>::PtrTypePacked X_packed,
293 int const rows_total_matrix,
int const cols_total_matrix) {
295 for (
int col_k = 0; col_k < T_cols; ++col_k ) {
297 for (
int row_k = 0; row_k < T_rows; ++row_k ) {
298 T_assign<T_ordering_matrix>::exec( X, X_packed,
300 rows_total_matrix, cols_total_matrix );
309 template<
typename T_ordering_matrix>
311 int const rows_total_matrix,
int const cols_total_matrix) {
312 exec< Assign_to_packed, T_ordering_matrix >(X, X_packed, rows_total_matrix, cols_total_matrix);
318 template<
typename T_ordering_matrix>
320 int const rows_total_matrix,
int const cols_total_matrix) {
321 exec< Extract_from_packed, T_ordering_matrix >(X, X_packed, rows_total_matrix, cols_total_matrix);
static int const floats_per_register
Number of real numbers that fit in one register.
Definition: mm_kernel_inner_sse2_A.h:70
Definition: mm_kernel_inner_sse2_A.h:261
mat::SizesAndBlocks rows
Definition: test.cc:51
static void ALWAYS_INLINE set_to_zero(Vector_intrin< real, T_reg > *X_reg)
Definition: mm_kernel_inner_sse2_A.h:174
Pack< M, N, Ordering_col_wise, 1 > Pack_type_C
Type that can (should) be used to pack C.
Definition: mm_kernel_inner_sse2_A.h:80
mat::SizesAndBlocks cols
Definition: test.cc:52
static void ALWAYS_INLINE multiple_loop(Vector_intrin< real, T_reg > *C_reg, real const *const *const A, real const *const *const B)
Definition: mm_kernel_inner_sse2_A.h:163
static void exec(real const *const *const A, real const *const *const B, real *const C, int const i=1, int const offset_A=0, int const offset_B=0, int const offset_C=0)
Executes the matrix-matrix multiply C += A B with the three matrices A, B, and C stored according to ...
Definition: mm_kernel_inner_sse2_A.h:202
static int const M
Number of rows of A and C.
Definition: mm_kernel_inner_sse2_A.h:66
static void ALWAYS_INLINE multiple_loop(Vector_intrin< real, T_reg > *C_reg, real const *const *const A, real const *const *const B)
Definition: mm_kernel_inner_sse2_A.h:192
#define STATIC_ASSERT_DEBUG(expr, msg)
Definition: common.h:72
Matrix multiplication template for architectures with SSE2 or higher and compilers that support C++ i...
Definition: mm_kernel_inner_sse2_A.h:63
Definition: mm_kernel_inner_sse2_A.h:108
static void ALWAYS_INLINE middle(int const col_A, Vector_intrin< real, T_reg > *C_reg, real const *A, real const *B_packed)
Definition: mm_kernel_inner_sse2_A.h:127
static void ALWAYS_INLINE outer(int const start_i, Vector_intrin< real, T_reg > *C_reg, real const *A, real const *B_packed)
Definition: mm_kernel_inner_sse2_A.h:184
static int const N
Number of columns of B and C.
Definition: mm_kernel_inner_sse2_A.h:67
ergo_real real
Definition: test.cc:46
void ALWAYS_INLINE set_to_zero()
Definition: vector_intrin.h:90
static void pack(real const *const X, real *X_packed, int const rows_total_matrix, int const cols_total_matrix)
Convenience function for assignments to packed matrix.
Definition: mm_kernel_inner_sse2_A.h:310
static void exec(PtrType X, PtrTypePacked X_packed, int const row_k, int const col_k, int const rows_total_matrix, int const cols_total_matrix)
Definition: mm_kernel_inner_sse2_A.h:264
static void ALWAYS_INLINE inner(int const row_A_reg, int const row_B, Vector_intrin< real, T_reg > const &A_reg, Vector_intrin< real, T_reg > *C_reg, real const *B_packed)
Definition: mm_kernel_inner_sse2_A.h:175
Pack< K, N, Ordering_row_wise, floats_per_register > Pack_type_B
Type that can (should) be used to pack B.
Definition: mm_kernel_inner_sse2_A.h:79
#define ALWAYS_INLINE
Definition: common.h:45
static void ALWAYS_INLINE outer(int const start_i, Vector_intrin< real, T_reg > *C_reg, real const *A, real const *B_packed)
Definition: mm_kernel_inner_sse2_A.h:141
static void ALWAYS_INLINE middle(int const col_A, Vector_intrin< real, T_reg > *C_reg, real const *A, real const *B_packed)
Definition: mm_kernel_inner_sse2_A.h:180
static void ALWAYS_INLINE store(Vector_intrin< real, T_reg > const *X_reg, real *X)
Definition: mm_kernel_inner_sse2_A.h:190
static void ALWAYS_INLINE add(Vector_intrin< real, T_reg > *X_reg, real const *X)
Definition: mm_kernel_inner_sse2_A.h:152
static void ALWAYS_INLINE inner(int const row_A_reg, int const row_B, Vector_intrin< real, T_reg > const &A_reg, Vector_intrin< real, T_reg > *C_reg, real const *B_packed)
Definition: mm_kernel_inner_sse2_A.h:113
Template for packing of matrix elements.
Definition: mm_kernel_inner_sse2_A.h:76
void ALWAYS_INLINE store_p(Treal *ptr) const
Definition: vector_intrin.h:68
Pack< M, K, Ordering_col_wise, 1 > Pack_type_A
Type that can (should) be used to pack A.
Definition: mm_kernel_inner_sse2_A.h:76
T_real real
Real number type (usually float or double)
Definition: mm_kernel_inner_sse2_A.h:65
static int const K
Number of columns of A and rows of B.
Definition: mm_kernel_inner_sse2_A.h:68
Vector template for convenient access to SIMD operations.
Macros for inlining and static assertions and structs for access to matrix elements specifying the la...
static void ALWAYS_INLINE add(Vector_intrin< real, T_reg > *X_reg, real const *X)
Definition: mm_kernel_inner_sse2_A.h:188
static void ALWAYS_INLINE set_to_zero(Vector_intrin< real, T_reg > *X_reg)
Definition: mm_kernel_inner_sse2_A.h:109
static void unpack(real *X, real const *const X_packed, int const rows_total_matrix, int const cols_total_matrix)
Convenience function for extracting matrix from packed matrix.
Definition: mm_kernel_inner_sse2_A.h:319
static void exec(typename T_assign< T_ordering_matrix >::PtrType X, typename T_assign< T_ordering_matrix >::PtrTypePacked X_packed, int const rows_total_matrix, int const cols_total_matrix)
Definition: mm_kernel_inner_sse2_A.h:291
real * PtrTypePacked
Type of packed pointer - note the absence of const qualifiers.
Definition: mm_kernel_inner_sse2_A.h:262
static void ALWAYS_INLINE store(Vector_intrin< real, T_reg > const *X_reg, real *X)
Definition: mm_kernel_inner_sse2_A.h:157
real const *const PtrType
Type of matrix pointer - note the presence of const qualifiers.
Definition: mm_kernel_inner_sse2_A.h:263
void ALWAYS_INLINE load_p(Treal const *ptr)
Definition: vector_intrin.h:62
Vector class template for access to SIMD operations.
Definition: vector_intrin.h:60