tesseract++ 0.0.1
N-dimensional tensor library for embedded systems
Loading...
Searching...
No Matches
kernel_eval.h
Go to the documentation of this file.
1
10#ifndef KERNEL_EVAL_H
11#define KERNEL_EVAL_H
12
13#include "config.h"
15#include "helper_traits.h"
18
19namespace detail
20{
21
22 template <typename T, my_size_t Bits, typename Arch>
24 {
26 static constexpr my_size_t simdWidth = K::simdWidth;
27
28 // ========================================================================
29 // Public dispatch
30 // ========================================================================
31
// Public dispatch entry point: evaluates `expr` into `output` through one of
// two private paths, eval_vectorized_contiguous() or eval_vectorized_permuted().
//
// NOTE(review): the statement that selects the branch (original source
// line 38, presumably an `if constexpr (...)` on the expression's layout) is
// missing from this listing; only the two branch bodies and the `else` are
// visible. Confirm the condition against the real kernel_eval.h.
//
// output: destination buffer handed unchanged to the selected path.
// expr:   expression to evaluate; passed by const reference.
35 template <typename Expr>
36 FORCE_INLINE static void eval(T *output, const Expr &expr) noexcept
37 {
39 {
40 // std::cout << "eval_contiguous" << std::endl;
41 eval_vectorized_contiguous(output, expr);
42 }
43 else
44 {
45 // std::cout << "eval_permuted" << std::endl;
46 eval_vectorized_permuted(output, expr);
47 }
48 }
49
50 private:
51 // ========================================================================
52 // OutputPadPolicy — derive output padding from permuted expression dims
53 // ========================================================================
54
55 template <typename Expr, typename Seq>
56 struct OutputPadImpl
57 {
58 };
59
60 template <typename Expr, my_size_t... Is>
61 struct OutputPadImpl<Expr, index_seq<Is...>>
62 {
63 using type = SimdPaddingPolicy<typename Expr::value_type, Expr::Dim[Is]...>;
64 };
65
66 template <typename Expr>
67 struct OutputPadPolicy
68 {
69 using type = typename OutputPadImpl<Expr, typename make_index_seq<Expr::NumDims>::type>::type;
70 };
71
72 // ========================================================================
73 // Contiguous path
74 // ========================================================================
75
91 template <typename Expr>
92 FORCE_INLINE static void eval_vectorized_contiguous(
93 T *output,
94 const Expr &expr) noexcept
95 {
96 using Layout = typename Expr::Layout;
97 static constexpr my_size_t physicalSize = Layout::PhysicalSize;
98 static constexpr my_size_t simdSteps = physicalSize / simdWidth;
99 static constexpr bool hasRemainder = (physicalSize % simdWidth) != 0;
100
101 // Paranoia check: ensure physical size is a multiple of SIMD width,
102 // so we never read out of bounds
103 static_assert(physicalSize % simdWidth == 0,
104 "PhysicalSize must be a multiple of SimdWidth");
105
106 // SIMD loop
107 for (my_size_t i = 0; i < simdSteps; ++i)
108 {
109 auto val = expr.template evalu<T, Bits, Arch>(i * simdWidth);
110 K::store(output + i * simdWidth, val);
111 }
112
113 // Scalar remainder TODO: The whole point of padding is that PhysicalSize is already
114 // a multiple of SimdWidth — so there's no scalar remainder
115 // Delete this code if confirmed unnecessary
116 if constexpr (hasRemainder)
117 {
118 std::cout << "Warning: Scalar evaluation for remainder elements." << std::endl;
119 // for (my_size_t i = simdSteps * simdWidth; i < physicalSize; ++i)
120 // {
121 // output[i] = expr.template evalu<T, 1, GENERICARCH>(i);
122 // }
123 }
124 }
125
126 // ========================================================================
127 // Permuted path
128 // ========================================================================
129
150 template <typename Expr>
151 FORCE_INLINE static void eval_vectorized_permuted(
152 T *output,
153 const Expr &expr) noexcept
154 {
155 using OutputPad = typename OutputPadPolicy<Expr>::type;
156
157 static constexpr my_size_t lastDim = OutputPad::LastDim;
158 static constexpr my_size_t paddedLastDim = OutputPad::PaddedLastDim;
159 static constexpr my_size_t numSlices = OutputPad::PhysicalSize / paddedLastDim;
160
161 static constexpr my_size_t simdSteps = lastDim / simdWidth;
162 static constexpr my_size_t scalarStart = simdSteps * simdWidth;
163
164 my_size_t logical_flat = 0;
165
166 for (my_size_t slice = 0; slice < numSlices; ++slice)
167 {
168 const my_size_t out_base = slice * paddedLastDim;
169
170 for (my_size_t i = 0; i < simdSteps; ++i)
171 {
172 auto val = expr.template logical_evalu<T, Bits, Arch>(logical_flat);
173 K::store(output + out_base + i * simdWidth, val);
174 logical_flat += simdWidth;
175 }
176
177 if constexpr (scalarStart < lastDim)
178 {
179 for (my_size_t i = scalarStart; i < lastDim; ++i)
180 {
181 output[out_base + i] = expr.template logical_evalu<T, 1, GENERICARCH>(logical_flat);
182 ++logical_flat;
183 }
184 }
185 }
186 }
187 };
188
189} // namespace detail
190
191#endif // KERNEL_EVAL_H
Global configuration for the tesseract tensor library.
#define my_size_t
Size/index type used throughout the library.
Definition config.h:126
#define FORCE_INLINE
Hint the compiler to always inline a function.
Definition config.h:26
Definition BaseExpr.h:4
Definition microkernel_base.h:16
static FORCE_INLINE void store(T *ptr, VecType val) noexcept
static constexpr my_size_t simdWidth
Definition microkernel_base.h:17
Padding policy that pads the last dimension for SIMD alignment.
Definition simd_padding_policy.h:186
Definition kernel_eval.h:24
static constexpr my_size_t simdWidth
Definition kernel_eval.h:26
static FORCE_INLINE void eval(T *output, const Expr &expr) noexcept
Dispatch: pick contiguous or permuted eval based on expression layout.
Definition kernel_eval.h:36
Definition basic_expr_traits.h:6
Compile-time index sequence (lightweight std::index_sequence alternative).
Definition helper_traits.h:172