27 #ifndef MM_KERNEL_OUTER_A_H
28 #define MM_KERNEL_OUTER_A_H
44 template<
typename T_gemm_kernel,
int T_M_block,
int T_N_block>
46 template<
int T_rows_block,
int T_cols_block,
typename T_ordering_block,
typename T_pack_type_kernel>
70 static void exec( real
const *
const *
const A,
71 real
const *
const *
const B,
77 template<
typename T_gemm_kernel,
int T_M_block,
int T_N_block>
79 real const *
const *
const B,
83 for (
int n = 0; n < N_block; ++n )
84 for (
int m = 0; m < M_block; ++m ) {
85 T_gemm_kernel::exec( A, B, C, n_mul,
86 Ordering_block_A::get( m, 0, M_block, K_block ) * T_gemm_kernel::Pack_type_A::size_packed,
87 Ordering_block_B::get( 0, n, K_block, N_block ) * T_gemm_kernel::Pack_type_B::size_packed,
88 Ordering_block_C::get( m, n, M_block, N_block ) * T_gemm_kernel::Pack_type_C::size_packed );
94 T_gemm_kernel::template exec<Ordering_block_A::template Get<0, 0, M_block, K_block>::index * T_gemm_kernel::Pack_type_A::size_packed,
95 Ordering_block_B::template Get<0, 0, K_block, N_block>::index * T_gemm_kernel::Pack_type_B::size_packed,
96 Ordering_block_C::template Get<0, 0, M_block, N_block>::index * T_gemm_kernel::Pack_type_C::size_packed>(
A,
B, C, n_mul );
97 T_gemm_kernel::template exec<Ordering_block_A::template Get<1, 0, M_block, K_block>::index * T_gemm_kernel::Pack_type_A::size_packed,
98 Ordering_block_B::template Get<0, 0, K_block, N_block>::index * T_gemm_kernel::Pack_type_B::size_packed,
99 Ordering_block_C::template Get<1, 0, M_block, N_block>::index * T_gemm_kernel::Pack_type_C::size_packed>(
A,
B, C, n_mul );
101 T_gemm_kernel::exec( A, B, C, n_mul,
102 Ordering_block_A::get( 0, 0, M_block, K_block ) * T_gemm_kernel::Pack_type_A::size_packed,
103 Ordering_block_B::get( 0, 0, K_block, N_block ) * T_gemm_kernel::Pack_type_B::size_packed,
104 Ordering_block_C::get( 0, 0, M_block, N_block ) * T_gemm_kernel::Pack_type_C::size_packed );
119 template<
typename T_gemm_kernel,
int T_M_block,
int T_N_block>
120 template<
int T_rows_block,
int T_cols_block,
typename T_ordering_block,
typename T_pack_type_kernel>
// Dimensions of one kernel-level packed unit, taken from the kernel pack type.
122 static int const rows_kernel = T_pack_type_kernel::rows;
123 static int const cols_kernel = T_pack_type_kernel::cols;
// Dimensions covered by this pack: kernel dimensions scaled by the number of
// kernel blocks in each direction (T_rows_block x T_cols_block).
125 static int const rows = rows_kernel * T_rows_block;
126 static int const cols = cols_kernel * T_cols_block;
// Packed size of the whole pack: one kernel-level packed size per block,
// for all T_rows_block * T_cols_block blocks.
131 static unsigned int const size_packed = T_rows_block * T_cols_block * T_pack_type_kernel::size_packed;
134 template<
typename T_ordering_matrix>
138 template<
typename T_ordering_matrix>
148 template<
template<
typename T_ordering>
class T_assign,
typename T_ordering_matrix>
149 static void exec(
typename T_assign<T_ordering_matrix>::PtrType X,
typename T_assign<T_ordering_matrix>::PtrTypePacked X_packed,
150 int const rows_total_matrix,
int const cols_total_matrix) {
152 for (
int col_b = 0; col_b < T_cols_block; ++col_b ) {
154 for (
int row_b = 0; row_b < T_rows_block; ++row_b ) {
155 T_pack_type_kernel::template exec< T_assign, T_ordering_matrix >
156 ( &X[ T_assign<T_ordering_matrix>::Ordering_matrix::get( row_b * rows_kernel, col_b * cols_kernel,
157 rows_total_matrix, cols_total_matrix ) ],
158 &X_packed[ T_ordering_block::get( row_b, col_b, T_rows_block, T_cols_block ) *
159 T_pack_type_kernel::size_packed ],
160 rows_total_matrix, cols_total_matrix );
172 template<
typename T_ordering_matrix>
174 int const rows_total_matrix,
int const cols_total_matrix) {
175 exec< Assign_to_packed, T_ordering_matrix >(X, X_packed, rows_total_matrix, cols_total_matrix);
181 template<
typename T_ordering_matrix>
183 int const rows_total_matrix,
int const cols_total_matrix) {
184 exec< Extract_from_packed, T_ordering_matrix >(X, X_packed, rows_total_matrix, cols_total_matrix);
static int const M_block
Number of rows of A and C (blocks).
Definition: mm_kernel_outer_A.h:52
static int const M
Number of rows of A and C.
Definition: mm_kernel_outer_A.h:55
Pack< M_block, N_block, Ordering_block_C, typename T_gemm_kernel::Pack_type_C > Pack_type_C
Definition: mm_kernel_outer_A.h:66
static void exec(real const *const *const A, real const *const *const B, real *const C, int const i=1)
Executes the matrix-matrix multiply C += A B with the three matrices A, B, and C stored using the pac...
Definition: mm_kernel_outer_A.h:78
static int const K_kernel
Number of columns of A kernels and rows of B kernels.
Definition: mm_kernel_outer_A.h:51
Ordering_col_wise Ordering_block_C
Definition: mm_kernel_outer_A.h:62
static void unpack(real *X, real const *const X_packed, int const rows_total_matrix, int const cols_total_matrix)
Convenience function for extracting matrix from packed matrix.
Definition: mm_kernel_outer_A.h:182
T_ordering_matrix Ordering_matrix
Definition: mm_kernel_outer_A.h:136
static int const M_kernel
Number of rows of A and C kernels.
Definition: mm_kernel_outer_A.h:49
Template for translations between unpacked and packed matrix storage.
Definition: mm_kernel_outer_A.h:47
T_gemm_kernel::real real
Real number type (usually float or double)
Definition: mm_kernel_outer_A.h:58
static void exec(typename T_assign< T_ordering_matrix >::PtrType X, typename T_assign< T_ordering_matrix >::PtrTypePacked X_packed, int const rows_total_matrix, int const cols_total_matrix)
Elaborate function that can be called either to assign to or extract from packed format.
Definition: mm_kernel_outer_A.h:149
static int const N_block
Number of columns of B and C (blocks).
Definition: mm_kernel_outer_A.h:53
static int const K_block
Number of columns of A and rows of B (blocks).
Definition: mm_kernel_outer_A.h:54
Pack< K_block, N_block, Ordering_block_B, typename T_gemm_kernel::Pack_type_B > Pack_type_B
Definition: mm_kernel_outer_A.h:65
ergo_real real
Definition: cubature_rules.h:33
Struct for access to matrix elements stored in column-wise order.
Definition: common.h:90
static int const K
Number of columns of A and rows of B.
Definition: mm_kernel_outer_A.h:57
Definition: mm_kernel_outer_A.h:135
Ordering_col_wise Ordering_block_B
Definition: mm_kernel_outer_A.h:61
Pack< M_block, K_block, Ordering_block_A, typename T_gemm_kernel::Pack_type_A > Pack_type_A
Definition: mm_kernel_outer_A.h:64
static int const N_kernel
Number of columns of B and C kernels.
Definition: mm_kernel_outer_A.h:50
static int const N
Number of columns of B and C.
Definition: mm_kernel_outer_A.h:56
Template for matrix-matrix multiplication that wraps around a kernel given as template argument...
Definition: mm_kernel_outer_A.h:45
static void pack(real const *const X, real *X_packed, int const rows_total_matrix, int const cols_total_matrix)
Convenience function for assignments to packed matrix.
Definition: mm_kernel_outer_A.h:173
Ordering_col_wise Ordering_block_A
Definition: mm_kernel_outer_A.h:60