34 template <
typename T,
my_size_t Bits,
typename Arch>
54 template <
typename Expr1,
typename Expr2>
60 if (stride1 == 1 && stride2 == 1)
63 return dot_contiguous_impl(expr1, expr2, base1, base2, len);
68 return dot_strided_impl(expr1, expr2, base1, base2, stride1, stride2, len);
78 template <
typename Expr1,
typename Expr2>
86 sum += expr1.data()[base1 + i * stride1] *
87 expr2.data()[base2 + i * stride2];
107 template <
typename Expr1,
typename Expr2>
116 const T *ptr1 = expr1.data() + base1;
117 const T *ptr2 = expr2.data() + base2;
128 for (
my_size_t i = 0; i < simdSteps; ++i)
142 for (
my_size_t i = scalarStart; i < len; ++i)
143 result += ptr1[i] * ptr2[i];
163 template <
typename Expr1,
typename Expr2>
183 for (
my_size_t i = 0; i < simdSteps; ++i)
190 idxList1[j] = idx1 + j * stride1;
191 idxList2[j] = idx2 + j * stride2;
194 auto v1 = K::gather(expr1.data(), idxList1);
195 auto v2 = K::gather(expr2.data(), idxList2);
210 for (
my_size_t i = scalarStart; i < len; ++i)
212 result += expr1.data()[idx1] * expr2.data()[idx2];
Global configuration for the tesseract tensor library.
#define my_size_t
Size/index type used throughout the library.
Definition config.h:126
#define FORCE_INLINE
Hint the compiler to always inline a function.
Definition config.h:26
Shared SIMD helper utilities for kernel operations.
constexpr my_size_t DATA_ALIGNAS
Definition microkernel_base.h:145
Expr::value_type sum(const BaseExpr< Expr > &expr)
Definition reductions.h:30
Definition microkernel_base.h:16
T VecType
Definition microkernel_base.h:18
static FORCE_INLINE void store(T *ptr, VecType val) noexcept
static constexpr my_size_t simdWidth
Definition microkernel_base.h:17
static FORCE_INLINE VecType load(const T *ptr) noexcept
static FORCE_INLINE VecType set1(T scalar) noexcept
Definition kernel_dot.h:36
static constexpr my_size_t simdWidth
Definition kernel_dot.h:39
static FORCE_INLINE T naive_dot_physical(const Expr1 &expr1, my_size_t base1, my_size_t stride1, const Expr2 &expr2, my_size_t base2, my_size_t stride2, my_size_t len) noexcept
Naive scalar dot product for testing/validation.
Definition kernel_dot.h:79
static FORCE_INLINE T dot(const Expr1 &expr1, my_size_t base1, my_size_t stride1, const Expr2 &expr2, my_size_t base2, my_size_t stride2, my_size_t len) noexcept
Dispatch dot product based on stride values.
Definition kernel_dot.h:55
Definition kernel_helpers.h:19
static FORCE_INLINE K::VecType fmadd_safe(typename K::VecType a, typename K::VecType b, typename K::VecType c) noexcept
Fused multiply-add with fallback for architectures without native FMA.
Definition kernel_helpers.h:27