#include <kernel_gemm.h>
|
| static void | gemm (const T *A, my_size_t M, my_size_t K_len, my_size_t strideA, const T *B, my_size_t N, my_size_t strideB, T *C, my_size_t strideC) noexcept |
| | Register-blocked GEMM: C[M,N] = A[M,K] × B[K,N].
|
| |
|
| static constexpr my_size_t | simdWidth = K::simdWidth |
| |
| static constexpr my_size_t | MR = K::MR |
| | Tile height: rows of C computed per micro-kernel invocation.
|
| |
| static constexpr my_size_t | NR_VECS = K::NR_VECS |
| | Number of SIMD vectors spanning one tile row. The tile width is NR = NR_VECS × simdWidth.
|
| |
| static constexpr my_size_t | NR = K::NR |
| | Tile width: columns of C computed per wide micro-kernel invocation.
|
| |
◆ Helpers
template<typename T ,
my_size_t Bits, typename Arch >
template<typename T ,
my_size_t Bits, typename Arch >
◆ gemm()
template<typename T ,
my_size_t Bits, typename Arch >
| static void detail::KernelGemm< T, Bits, Arch >::gemm |
( |
const T * |
A, |
|
|
my_size_t |
M, |
|
|
my_size_t |
K_len, |
|
|
my_size_t |
strideA, |
|
|
const T * |
B, |
|
|
my_size_t |
N, |
|
|
my_size_t |
strideB, |
|
|
T * |
C, |
|
|
my_size_t |
strideC |
|
) |
| |
|
inline static noexcept |
Register-blocked GEMM: C[M,N] = A[M,K] × B[K,N].
Top-level dispatcher that tiles the output matrix and routes each tile to the appropriate micro-kernel based on its position.
All pointers address raw physical memory with padded row strides. The caller must ensure the favorable layout (see Memory Layout Requirements).
- Parameters
-
| A | Pointer to first element of A |
| M | Number of rows of A (and C) |
| K_len | Contraction length (columns of A, rows of B) |
| strideA | Physical row stride of A (≥ K_len, includes padding) |
| B | Pointer to first element of B |
| N | Number of columns of B (and C) |
| strideB | Physical row stride of B (≥ N, includes padding) |
| C | Pointer to first element of C (output; does not need to be zero-initialized) |
| strideC | Physical row stride of C (≥ N, includes padding) |
◆ MR
template<typename T ,
my_size_t Bits, typename Arch >
Tile height: rows of C computed per micro-kernel invocation.
◆ NR
template<typename T ,
my_size_t Bits, typename Arch >
Tile width: columns of C computed per wide micro-kernel invocation.
◆ NR_VECS
template<typename T ,
my_size_t Bits, typename Arch >
Number of SIMD vectors spanning one tile row. The tile width is NR = NR_VECS × simdWidth.
◆ simdWidth
template<typename T ,
my_size_t Bits, typename Arch >
The documentation for this struct was generated from the following file: