112 template <typename T, my_size_t Bits, typename Arch>
163 for (; i +
MR <= M; i +=
MR)
167 for (; j < wide_N; j +=
NR)
170 A + i * strideA, strideA,
172 C + i * strideC + j, strideC,
179 A + i * strideA, strideA,
181 C + i * strideC + j, strideC,
188 A + i * strideA, strideA,
190 C + i * strideC + j, strideC,
203 for (; j < wide_N; j +=
NR)
225 sum += A[i * strideA + k] * B[k * strideB + j];
226 C[i * strideC + j] =
sum;
280 auto a_bcast =
K::set1(A[r * strideA + k]);
321 auto b_vec =
K::load(B + k * strideB);
325 auto a_bcast =
K::set1(A[r * strideA + k]);
358 T b_val = B[k * strideB];
360 acc[r] += A[r * strideA + k] * b_val;
364 C[r * strideC] = acc[r];
422 auto b_vec =
K::load(B + k * strideB);
Global configuration for the tesseract tensor library.
#define my_size_t
Size/index type used throughout the library.
Definition config.h:126
#define FORCE_INLINE
Hint the compiler to always inline a function.
Definition config.h:26
Shared SIMD helper utilities for kernel operations.
Expr::value_type sum(const BaseExpr< Expr > &expr)
Definition reductions.h:30
Definition microkernel_base.h:16
T VecType
Definition microkernel_base.h:18
static FORCE_INLINE void store(T *ptr, VecType val) noexcept
static constexpr my_size_t simdWidth
Definition microkernel_base.h:17
static FORCE_INLINE VecType load(const T *ptr) noexcept
static FORCE_INLINE VecType set1(T scalar) noexcept
Definition kernel_gemm.h:114
static void gemm(const T *A, my_size_t M, my_size_t K_len, my_size_t strideA, const T *B, my_size_t N, my_size_t strideB, T *C, my_size_t strideC) noexcept
Register-blocked GEMM: C[M,N] = A[M,K] × B[K,N].
Definition kernel_gemm.h:147
static constexpr my_size_t NR_VECS
Number of SIMD vectors per tile column. The tile width is NR = NR_VECS × simdWidth.
Definition kernel_gemm.h:123
static constexpr my_size_t MR
Tile height: rows of C computed per micro-kernel invocation.
Definition kernel_gemm.h:120
static constexpr my_size_t simdWidth
Definition kernel_gemm.h:117
static constexpr my_size_t NR
Tile width: columns of C computed per wide micro-kernel invocation.
Definition kernel_gemm.h:126
Definition kernel_helpers.h:19
static FORCE_INLINE K::VecType fmadd_safe(typename K::VecType a, typename K::VecType b, typename K::VecType c) noexcept
Fused multiply-add with fallback for architectures without native FMA.
Definition kernel_helpers.h:27