37 #ifndef MM_KERNEL_INNER_SSE2_A_H
38 #define MM_KERNEL_INNER_SSE2_A_H
60 template<
typename T_real,
typename T_reg,
int T_M,
int T_N,
int T_K>
64 static int const M = T_M;
65 static int const N = T_N;
66 static int const K = T_K;
73 template<
int T_ROWS_kernel,
int T_COLS_kernel,
typename T_ordering_kernel,
int T_repetitions>
88 static void exec( real
const *
const *
const A,
89 real
const *
const *
const B,
92 int const offset_A = 0,
93 int const offset_B = 0,
94 int const offset_C = 0 );
96 template<
int T_offset_A,
int T_offset_B,
int T_offset_C>
97 static void exec( real
const *
const *
const A,
98 real
const *
const *
const B,
105 template<
int T_loop_index,
int T_end>
115 real
const * B_packed ) {
117 B_reg.
load_p( &B_packed[row_B * T_N * floats_per_register +
118 T_loop_index * floats_per_register] );
128 real
const * B_packed ) {
130 A_reg.
load_p( &A[col_A * T_M + T_loop_index * floats_per_register] );
142 real
const * B_packed ) {
144 Loop<0, T_M/floats_per_register>
::middle( start_i + T_loop_index,
157 X_reg[T_loop_index].
store_p( &X[T_loop_index * floats_per_register] );
162 real
const *
const *
const A,
163 real
const *
const *
const B ) {
177 real
const * B_packed ) {}
181 real
const * B_packed ) {}
185 real
const * B_packed ) {}
191 real
const *
const *
const A,
192 real
const *
const *
const B ) {}
199 template<
typename real,
typename T_reg,
int T_M,
int T_N,
int T_K>
201 real
const *
const *
const B,
206 int const offset_C) {
207 STATIC_ASSERT_DEBUG(!(T_M%floats_per_register), TEMPLATE_ARGUMENT_T_M_MUST_BE_MULTIPLE_OF_floats_per_register);
210 #if 1 // I lose a bit of performance because of the offsets
211 for (
int ind = 0; ind < i; ++ind)
218 for (
int ind = 0; ind < i; ++ind)
227 template<
typename real,
typename T_reg,
int T_M,
int T_N,
int T_K>
228 template<
int T_offset_A,
int T_offset_B,
int T_offset_C>
230 real
const *
const *
const B,
233 STATIC_ASSERT_DEBUG(!(T_M%floats_per_register), TEMPLATE_ARGUMENT_T_M_MUST_BE_MULTIPLE_OF_floats_per_register);
236 for (
int ind = 0; ind < i; ++ind)
250 template<
typename real,
typename T_reg,
int T_M,
int T_N,
int T_K>
251 template<
int T_rows,
int T_cols,
typename T_ordering_kernel,
int T_repetitions>
254 static int const size_packed = T_rows * T_cols * T_repetitions;
255 static int const rows = T_rows;
256 static int const cols = T_cols;
258 template<
typename T_ordering_matrix>
262 inline static void exec( PtrType X, PtrTypePacked X_packed,
265 int const rows_total_matrix,
266 int const cols_total_matrix ) {
267 for (
int ir = 0; ir < T_repetitions; ++ir)
268 X_packed[ T_ordering_kernel::get( row_k, col_k, T_rows, T_cols ) * T_repetitions + ir ]
269 = X[ T_ordering_matrix::get(row_k, col_k, rows_total_matrix, cols_total_matrix) ];
273 template<
typename T_ordering_matrix>
277 inline static void exec( PtrType X, PtrTypePacked X_packed,
280 int const rows_total_matrix,
281 int const cols_total_matrix ) {
282 for (
int ir = 0; ir < T_repetitions; ++ir)
283 X[ T_ordering_matrix::get(row_k, col_k, rows_total_matrix, cols_total_matrix) ] =
284 X_packed[ T_ordering_kernel::get( row_k, col_k, T_rows, T_cols ) * T_repetitions + ir ];
288 template<
template<
typename T_ordering>
class T_assign,
typename T_ordering_matrix>
289 static void exec(
typename T_assign<T_ordering_matrix>::PtrType X,
290 typename T_assign<T_ordering_matrix>::PtrTypePacked X_packed,
291 int const rows_total_matrix,
int const cols_total_matrix) {
293 for (
int col_k = 0; col_k < T_cols; ++col_k ) {
295 for (
int row_k = 0; row_k < T_rows; ++row_k ) {
296 T_assign<T_ordering_matrix>::exec( X, X_packed,
298 rows_total_matrix, cols_total_matrix );
307 template<
typename T_ordering_matrix>
308 inline static void pack(real
const *
const X, real * X_packed,
309 int const rows_total_matrix,
int const cols_total_matrix) {
310 exec< Assign_to_packed, T_ordering_matrix >(X, X_packed, rows_total_matrix, cols_total_matrix);
316 template<
typename T_ordering_matrix>
317 inline static void unpack(real * X, real
const *
const X_packed,
318 int const rows_total_matrix,
int const cols_total_matrix) {
319 exec< Extract_from_packed, T_ordering_matrix >(X, X_packed, rows_total_matrix, cols_total_matrix);
static int const floats_per_register
Number of real numbers that fit in one register.
Definition: mm_kernel_inner_sse2_A.h:68
Definition: mm_kernel_inner_sse2_A.h:259
static void ALWAYS_INLINE set_to_zero(Vector_intrin< real, T_reg > *X_reg)
Definition: mm_kernel_inner_sse2_A.h:172
Pack< M, N, Ordering_col_wise, 1 > Pack_type_C
Type that can (should) be used to pack C.
Definition: mm_kernel_inner_sse2_A.h:78
static void ALWAYS_INLINE multiple_loop(Vector_intrin< real, T_reg > *C_reg, real const *const *const A, real const *const *const B)
Definition: mm_kernel_inner_sse2_A.h:161
static void exec(real const *const *const A, real const *const *const B, real *const C, int const i=1, int const offset_A=0, int const offset_B=0, int const offset_C=0)
Executes the matrix-matrix multiply C += A B with the three matrices A, B, and C stored according to ...
Definition: mm_kernel_inner_sse2_A.h:200
static int const M
Number of rows of A and C.
Definition: mm_kernel_inner_sse2_A.h:64
static void ALWAYS_INLINE multiple_loop(Vector_intrin< real, T_reg > *C_reg, real const *const *const A, real const *const *const B)
Definition: mm_kernel_inner_sse2_A.h:190
#define STATIC_ASSERT_DEBUG(expr, msg)
Definition: common.h:58
Matrix multiplication template for architectures with SSE2 or higher and compilers that support C++ i...
Definition: mm_kernel_inner_sse2_A.h:61
Definition: mm_kernel_inner_sse2_A.h:106
static void ALWAYS_INLINE middle(int const col_A, Vector_intrin< real, T_reg > *C_reg, real const *A, real const *B_packed)
Definition: mm_kernel_inner_sse2_A.h:125
static void ALWAYS_INLINE outer(int const start_i, Vector_intrin< real, T_reg > *C_reg, real const *A, real const *B_packed)
Definition: mm_kernel_inner_sse2_A.h:182
static int const N
Number of columns of B and C.
Definition: mm_kernel_inner_sse2_A.h:65
void ALWAYS_INLINE set_to_zero()
Definition: vector_intrin.h:79
static void pack(real const *const X, real *X_packed, int const rows_total_matrix, int const cols_total_matrix)
Convenience function for assignments to packed matrix.
Definition: mm_kernel_inner_sse2_A.h:308
static void exec(PtrType X, PtrTypePacked X_packed, int const row_k, int const col_k, int const rows_total_matrix, int const cols_total_matrix)
Definition: mm_kernel_inner_sse2_A.h:262
static void ALWAYS_INLINE inner(int const row_A_reg, int const row_B, Vector_intrin< real, T_reg > const &A_reg, Vector_intrin< real, T_reg > *C_reg, real const *B_packed)
Definition: mm_kernel_inner_sse2_A.h:173
Pack< K, N, Ordering_row_wise, floats_per_register > Pack_type_B
Type that can (should) be used to pack B.
Definition: mm_kernel_inner_sse2_A.h:77
#define ALWAYS_INLINE
Definition: common.h:31
static void ALWAYS_INLINE outer(int const start_i, Vector_intrin< real, T_reg > *C_reg, real const *A, real const *B_packed)
Definition: mm_kernel_inner_sse2_A.h:139
void ALWAYS_INLINE store_p(Treal *ptr) const
Definition: vector_intrin.h:57
static void ALWAYS_INLINE middle(int const col_A, Vector_intrin< real, T_reg > *C_reg, real const *A, real const *B_packed)
Definition: mm_kernel_inner_sse2_A.h:178
static void ALWAYS_INLINE store(Vector_intrin< real, T_reg > const *X_reg, real *X)
Definition: mm_kernel_inner_sse2_A.h:188
static void ALWAYS_INLINE add(Vector_intrin< real, T_reg > *X_reg, real const *X)
Definition: mm_kernel_inner_sse2_A.h:150
static void ALWAYS_INLINE inner(int const row_A_reg, int const row_B, Vector_intrin< real, T_reg > const &A_reg, Vector_intrin< real, T_reg > *C_reg, real const *B_packed)
Definition: mm_kernel_inner_sse2_A.h:111
Template for packing of matrix elements.
Definition: mm_kernel_inner_sse2_A.h:74
Pack< M, K, Ordering_col_wise, 1 > Pack_type_A
Type that can (should) be used to pack A.
Definition: mm_kernel_inner_sse2_A.h:74
T_real real
Real number type (usually float or double)
Definition: mm_kernel_inner_sse2_A.h:63
static int const K
Number of columns of A and rows of B.
Definition: mm_kernel_inner_sse2_A.h:66
static void ALWAYS_INLINE add(Vector_intrin< real, T_reg > *X_reg, real const *X)
Definition: mm_kernel_inner_sse2_A.h:186
static void ALWAYS_INLINE set_to_zero(Vector_intrin< real, T_reg > *X_reg)
Definition: mm_kernel_inner_sse2_A.h:107
static void unpack(real *X, real const *const X_packed, int const rows_total_matrix, int const cols_total_matrix)
Convenience function for extracting matrix from packed matrix.
Definition: mm_kernel_inner_sse2_A.h:317
static void exec(typename T_assign< T_ordering_matrix >::PtrType X, typename T_assign< T_ordering_matrix >::PtrTypePacked X_packed, int const rows_total_matrix, int const cols_total_matrix)
Definition: mm_kernel_inner_sse2_A.h:289
real * PtrTypePacked
Type of packed pointer - note the absence of const qualifiers.
Definition: mm_kernel_inner_sse2_A.h:260
static void ALWAYS_INLINE store(Vector_intrin< real, T_reg > const *X_reg, real *X)
Definition: mm_kernel_inner_sse2_A.h:155
real const *const PtrType
Type of matrix pointer - note the presence of const qualifiers.
Definition: mm_kernel_inner_sse2_A.h:261
void ALWAYS_INLINE load_p(Treal const *ptr)
Definition: vector_intrin.h:51
Vector class template for access to SIMD operations.
Definition: vector_intrin.h:49