// Primary gemm_sse template: validates the problem and buffer sizes, packs
// the operands, runs the outer kernel and unpacks the result into C.
// NOTE(review): this is a listing fragment. The leading numbers are listing
// line numbers fused into the text, and the lines carrying the function name,
// most of the parameter list, the MM_outer declaration and the closing brace
// are not visible here -- confirm against the original header.
//
// Template parameters:
//   real               scalar element type
//   regType            register type handed to the kernel (e.g. __m128d / __m128
//                      in the instantiations further down)
//   m_kernel, n_kernel, k_kernel   compile-time kernel dimensions
//   m_block, n_block               compile-time block counts in m and n
47 template<
typename real,
typename regType,
48 int m_kernel,
int n_kernel,
int k_kernel,
49 int m_block,
int n_block>
61 size_t const cp_size) {
// The row count must be exactly m_kernel*m_block; anything else is rejected
// with std::runtime_error.
66 if (m != m_kernel*m_block)
67 throw std::runtime_error(
"Error in gemm_sse(...): m != m_kernel*m_block");
// Likewise the column count must be exactly n_kernel*n_block.
68 if (n != n_kernel*n_block)
69 throw std::runtime_error(
"Error in gemm_sse(...): n != n_kernel*n_block");
// NOTE(review): the condition guarding this throw is missing from the listing;
// judging by the message it is presumably `if (k != k_kernel)` -- confirm.
71 throw std::runtime_error(
"Error in gemm_sse(...): k != k_kernel");
// Each caller-supplied packed buffer must hold at least size_packed elements
// of the corresponding pack type; otherwise std::runtime_error is thrown.
72 if (ap_size < MM_outer::Pack_type_A::size_packed)
73 throw std::runtime_error(
"Error in gemm_sse(...): " 74 "ap_size < MM_outer::Pack_type_A::size_packed");
75 if (bp_size < MM_outer::Pack_type_B::size_packed)
76 throw std::runtime_error(
"Error in gemm_sse(...): " 77 "bp_size < MM_outer::Pack_type_B::size_packed");
78 if (cp_size < MM_outer::Pack_type_C::size_packed)
79 throw std::runtime_error(
"Error in gemm_sse(...): " 80 "cp_size < MM_outer::Pack_type_C::size_packed");
// Pack C (m x n), A (m x k) and B (k x n) into the packed buffers using
// Ordering_col_wise. C is packed as well as unpacked, presumably because the
// kernel accumulates into the existing C values -- TODO confirm.
81 MM_outer::Pack_type_C::template pack<Ordering_col_wise>( C, C_packed, m, n);
82 MM_outer::Pack_type_A::template pack<Ordering_col_wise>(
A, A_packed, m, k);
83 MM_outer::Pack_type_B::template pack<Ordering_col_wise>(
B, B_packed, k, n);
// Run the outer kernel: A_packed and B_packed are passed by address,
// C_packed is passed directly.
84 MM_outer::exec(&A_packed, &B_packed, C_packed);
// Copy the packed result back into the caller's C using Ordering_col_wise.
85 MM_outer::Pack_type_C::template unpack<Ordering_col_wise>(C, C_packed, m, n);
// gemm_sse variant templated on the scalar type only. The visible body does
// nothing but throw std::runtime_error, i.e. it reports that no SSE
// implementation exists for the chosen `real`.
// NOTE(review): the function name, the rest of the parameter list and the
// closing brace are not visible in this listing fragment.
88 template<
typename real>
100 size_t const cp_size) {
101 throw std::runtime_error(
"gemm_sse not implemented for chosen real type.");
// Double-precision entry (fragment): forwards to the fully parameterised
// gemm_sse template with real = double, regType = __m128d, m_kernel = 4,
// n_kernel = 4, k_kernel = 32, m_block = 8, n_block = 8. With these values the
// m and n checks in that template require m == 32 and n == 32.
// NOTE(review): the lines holding the function header, the remaining
// parameters and the first part of the call's argument list are missing from
// this listing.
106 double const *
const B,
114 size_t const ap_size,
115 size_t const bp_size,
116 size_t const cp_size) {
117 gemm_sse<double, __m128d, 4, 4, 32, 8, 8>
119 A_packed, B_packed, C_packed, ap_size, bp_size, cp_size);
// Single-precision entry (fragment): forwards to the fully parameterised
// gemm_sse template with real = float, regType = __m128, m_kernel = 8,
// n_kernel = 4, k_kernel = 32, m_block = 4, n_block = 8. With these values the
// m and n checks in that template require m == 32 and n == 32.
// NOTE(review): the lines holding the function header, the remaining
// parameters and the first part of the call's argument list are missing from
// this listing.
124 float const *
const B,
132 size_t const ap_size,
133 size_t const bp_size,
134 size_t const cp_size) {
135 gemm_sse<float, __m128, 8, 4, 32, 4, 8>
137 A_packed, B_packed, C_packed, ap_size, bp_size, cp_size);
Matrix multiplication template for architectures with SSE2 or higher and compilers that support C++ intrinsics for access to SSE instructions.
Definition: mm_kernel_inner_sse2_A.h:63
Templates for efficient gemm kernels.
ergo_real real
Definition: test.cc:46
Template for matrix-matrix multiplication that wraps around a kernel given as template argument.
Definition: mm_kernel_outer_A.h:53
Templates for efficient gemm kernels.
static void gemm_sse(real const *const A, real const *const B, real *C, size_t const m, size_t const n, size_t const k, real *A_packed, real *B_packed, real *C_packed, size_t const ap_size, size_t const bp_size, size_t const cp_size)
Definition: gemm_sse.h:50