tesseract++ 0.0.1
N-dimensional tensor library for embedded systems
Loading...
Searching...
No Matches
kernel_gemm.h
Go to the documentation of this file.
1
102#ifndef KERNEL_GEMM_H
103#define KERNEL_GEMM_H
104
105#include "config.h"
108
109namespace detail
110{
111
112 template <typename T, my_size_t Bits, typename Arch>
// Register-blocked GEMM driver parameterized by element type T, register
// width Bits, and target architecture Arch. NOTE(review): the class name and
// the `K` micro-kernel alias live on lines elided by the doc extraction
// (orig. 113, 115-116) — K presumably selects the per-arch SIMD micro-kernel.
114 {
// Lanes per SIMD vector, taken from the micro-kernel K.
117 static constexpr my_size_t simdWidth = K::simdWidth;
118
// Tile height: rows of C computed per micro-kernel invocation.
120 static constexpr my_size_t MR = K::MR;
121
// Number of SIMD vectors per tile row; the tile width is NR = NR_VECS * simdWidth.
123 static constexpr my_size_t NR_VECS = K::NR_VECS;
124
// Tile width: columns of C computed per wide micro-kernel invocation.
126 static constexpr my_size_t NR = K::NR;
127
147 static void gemm(
148 const T *A, my_size_t M, my_size_t K_len, my_size_t strideA,
149 const T *B, my_size_t N, my_size_t strideB,
150 T *C, my_size_t strideC) noexcept
151 {
152 // Column boundaries for the three-pass tiling:
153 // [0, wide_N) → wide micro-kernel (steps of NR)
154 // [wide_N, narrow_N) → narrow micro-kernel (steps of simdWidth)
155 // [narrow_N, N) → scalar column loop (steps of 1)
156 const my_size_t wide_N = (N / NR) * NR;
157 const my_size_t narrow_N = (N / simdWidth) * simdWidth;
158
159 // ==============================================================
160 // Main body: MR rows at a time
161 // ==============================================================
162 my_size_t i = 0;
163 for (; i + MR <= M; i += MR)
164 {
165 my_size_t j = 0;
166
167 for (; j < wide_N; j += NR)
168 {
169 micro_kernel_wide(
170 A + i * strideA, strideA,
171 B + j, strideB,
172 C + i * strideC + j, strideC,
173 K_len);
174 }
175
176 for (; j < narrow_N; j += simdWidth)
177 {
178 micro_kernel_narrow(
179 A + i * strideA, strideA,
180 B + j, strideB,
181 C + i * strideC + j, strideC,
182 K_len);
183 }
184
185 for (; j < N; ++j)
186 {
187 scalar_column_MR(
188 A + i * strideA, strideA,
189 B + j, strideB,
190 C + i * strideC + j, strideC,
191 K_len);
192 }
193 }
194
195 // ==============================================================
196 // Remainder rows (< MR): same three-pass column strategy,
197 // but processing one row at a time.
198 // ==============================================================
199 for (; i < M; ++i)
200 {
201 my_size_t j = 0;
202
203 for (; j < wide_N; j += NR)
204 {
205 single_row_wide(
206 A + i * strideA,
207 B + j, strideB,
208 C + i * strideC + j,
209 K_len);
210 }
211
212 for (; j < narrow_N; j += simdWidth)
213 {
214 single_row_narrow(
215 A + i * strideA,
216 B + j, strideB,
217 C + i * strideC + j,
218 K_len);
219 }
220
221 for (; j < N; ++j)
222 {
223 T sum = T{0};
224 for (my_size_t k = 0; k < K_len; ++k)
225 sum += A[i * strideA + k] * B[k * strideB + j];
226 C[i * strideC + j] = sum;
227 }
228 }
229 }
230
231 private:
257 FORCE_INLINE static void micro_kernel_wide(
258 const T *A, my_size_t strideA,
259 const T *B, my_size_t strideB,
260 T *C, my_size_t strideC,
261 my_size_t K_len) noexcept
262 {
263 // Step 1: zero accumulators
264 typename K::VecType acc[MR][NR_VECS];
265 for (my_size_t r = 0; r < MR; ++r)
266 for (my_size_t v = 0; v < NR_VECS; ++v)
267 acc[r][v] = K::set1(T{0});
268
269 // Step 2: outer-product accumulation over k
270 for (my_size_t k = 0; k < K_len; ++k)
271 {
272 // 2a: load NR_VECS contiguous vectors from B[k, j..j+NR-1]
273 typename K::VecType b_vec[NR_VECS];
274 for (my_size_t v = 0; v < NR_VECS; ++v)
275 b_vec[v] = K::load(B + k * strideB + v * simdWidth);
276
277 // 2b: broadcast each A element and FMA into accumulators
278 for (my_size_t r = 0; r < MR; ++r)
279 {
280 auto a_bcast = K::set1(A[r * strideA + k]);
281 for (my_size_t v = 0; v < NR_VECS; ++v)
282 acc[r][v] = Helpers::fmadd_safe(a_bcast, b_vec[v], acc[r][v]);
283 }
284 }
285
286 // Step 3: store completed tile to C
287 for (my_size_t r = 0; r < MR; ++r)
288 for (my_size_t v = 0; v < NR_VECS; ++v)
289 K::store(C + r * strideC + v * simdWidth, acc[r][v]);
290 }
291
309 FORCE_INLINE static void micro_kernel_narrow(
310 const T *A, my_size_t strideA,
311 const T *B, my_size_t strideB,
312 T *C, my_size_t strideC,
313 my_size_t K_len) noexcept
314 {
315 typename K::VecType acc[MR];
316 for (my_size_t r = 0; r < MR; ++r)
317 acc[r] = K::set1(T{0});
318
319 for (my_size_t k = 0; k < K_len; ++k)
320 {
321 auto b_vec = K::load(B + k * strideB);
322
323 for (my_size_t r = 0; r < MR; ++r)
324 {
325 auto a_bcast = K::set1(A[r * strideA + k]);
326 acc[r] = Helpers::fmadd_safe(a_bcast, b_vec, acc[r]);
327 }
328 }
329
330 for (my_size_t r = 0; r < MR; ++r)
331 K::store(C + r * strideC, acc[r]);
332 }
333
348 FORCE_INLINE static void scalar_column_MR(
349 const T *A, my_size_t strideA,
350 const T *B, my_size_t strideB,
351 T *C, my_size_t strideC,
352 my_size_t K_len) noexcept
353 {
354 T acc[MR] = {};
355
356 for (my_size_t k = 0; k < K_len; ++k)
357 {
358 T b_val = B[k * strideB];
359 for (my_size_t r = 0; r < MR; ++r)
360 acc[r] += A[r * strideA + k] * b_val;
361 }
362
363 for (my_size_t r = 0; r < MR; ++r)
364 C[r * strideC] = acc[r];
365 }
366
379 FORCE_INLINE static void single_row_wide(
380 const T *A,
381 const T *B, my_size_t strideB,
382 T *C,
383 my_size_t K_len) noexcept
384 {
385 typename K::VecType acc[NR_VECS];
386 for (my_size_t v = 0; v < NR_VECS; ++v)
387 acc[v] = K::set1(T{0});
388
389 for (my_size_t k = 0; k < K_len; ++k)
390 {
391 auto a_bcast = K::set1(A[k]);
392 for (my_size_t v = 0; v < NR_VECS; ++v)
393 acc[v] = Helpers::fmadd_safe(a_bcast, K::load(B + k * strideB + v * simdWidth), acc[v]);
394 }
395
396 for (my_size_t v = 0; v < NR_VECS; ++v)
397 K::store(C + v * simdWidth, acc[v]);
398 }
399
412 FORCE_INLINE static void single_row_narrow(
413 const T *A,
414 const T *B, my_size_t strideB,
415 T *C,
416 my_size_t K_len) noexcept
417 {
418 typename K::VecType acc = K::set1(T{0});
419
420 for (my_size_t k = 0; k < K_len; ++k)
421 {
422 auto b_vec = K::load(B + k * strideB);
423 auto a_bcast = K::set1(A[k]);
424 acc = Helpers::fmadd_safe(a_bcast, b_vec, acc);
425 }
426
427 K::store(C, acc);
428 }
429 };
430
431} // namespace detail
432
433#endif // KERNEL_GEMM_H
Global configuration for the tesseract tensor library.
#define my_size_t
Size/index type used throughout the library.
Definition config.h:126
#define FORCE_INLINE
Hint the compiler to always inline a function.
Definition config.h:26
Shared SIMD helper utilities for kernel operations.
Definition BaseExpr.h:4
Expr::value_type sum(const BaseExpr< Expr > &expr)
Definition reductions.h:30
Definition microkernel_base.h:16
T VecType
Definition microkernel_base.h:18
static FORCE_INLINE void store(T *ptr, VecType val) noexcept
static constexpr my_size_t simdWidth
Definition microkernel_base.h:17
static FORCE_INLINE VecType load(const T *ptr) noexcept
static FORCE_INLINE VecType set1(T scalar) noexcept
Definition kernel_gemm.h:114
static void gemm(const T *A, my_size_t M, my_size_t K_len, my_size_t strideA, const T *B, my_size_t N, my_size_t strideB, T *C, my_size_t strideC) noexcept
Register-blocked GEMM: C[M,N] = A[M,K] × B[K,N].
Definition kernel_gemm.h:147
static constexpr my_size_t NR_VECS
Number of SIMD vectors per tile column. The tile width is NR = NR_VECS × simdWidth.
Definition kernel_gemm.h:123
static constexpr my_size_t MR
Tile height: rows of C computed per micro-kernel invocation.
Definition kernel_gemm.h:120
static constexpr my_size_t simdWidth
Definition kernel_gemm.h:117
static constexpr my_size_t NR
Tile width: columns of C computed per wide micro-kernel invocation.
Definition kernel_gemm.h:126
Definition kernel_helpers.h:19
static FORCE_INLINE K::VecType fmadd_safe(typename K::VecType a, typename K::VecType b, typename K::VecType c) noexcept
Fused multiply-add with fallback for architectures without native FMA.
Definition kernel_helpers.h:27