tesseract++ 0.0.1
N-dimensional tensor library for embedded systems
Loading...
Searching...
No Matches
kernel_reduce.h
Go to the documentation of this file.
1
35#ifndef KERNEL_REDUCE_H
36#define KERNEL_REDUCE_H
37
38#include "config.h"
40#include "numeric_limits.h"
42
43namespace detail
44{
45
46 // ============================================================================
47 // ReduceOp enum — shared by all reduction machinery
48 // ============================================================================
49
    // Compile-time tag selecting which reduction a kernel performs.
    // Dispatched via `if constexpr` in the reduce_* machinery below;
    // each op pairs with an identity element (see reduce_identity).
    enum class ReduceOp
    {
        Min, // identity: NumericLimits<T>::max()
        Max, // identity: NumericLimits<T>::lowest()
        Sum  // identity: T{0}
    };
56
57 template <typename T, my_size_t Bits, typename Arch>
59 {
61 static constexpr my_size_t simdWidth = K::simdWidth;
62
63 // ========================================================================
64 // Public API
65 // ========================================================================
66
67 template <typename Expr>
68 FORCE_INLINE static T reduce_min(const Expr &expr) noexcept
69 {
70 return reduce<ReduceOp::Min>(expr);
71 }
72
73 template <typename Expr>
74 FORCE_INLINE static T reduce_max(const Expr &expr) noexcept
75 {
76 return reduce<ReduceOp::Max>(expr);
77 }
78
79 template <typename Expr>
80 FORCE_INLINE static T reduce_sum(const Expr &expr) noexcept
81 {
82 return reduce<ReduceOp::Sum>(expr);
83 }
84
85 // ========================================================================
86 // Private implementation
87 // ========================================================================
88
89 private: // TODO: make private once KernelOps facade is the only caller
90 // --- ReduceOp traits ---
91
92 template <ReduceOp Op>
93 FORCE_INLINE static T reduce_identity() noexcept
94 {
95 if constexpr (Op == ReduceOp::Min)
96 return NumericLimits<T>::max();
97 if constexpr (Op == ReduceOp::Max)
99 if constexpr (Op == ReduceOp::Sum)
100 return T{0};
101 }
102
103 template <ReduceOp Op>
104 FORCE_INLINE static typename K::VecType reduce_simd_combine(
105 typename K::VecType a, typename K::VecType b) noexcept
106 {
107 if constexpr (Op == ReduceOp::Min)
108 return K::min(a, b);
109 if constexpr (Op == ReduceOp::Max)
110 return K::max(a, b);
111 if constexpr (Op == ReduceOp::Sum)
112 return K::add(a, b);
113 }
114
115 template <ReduceOp Op>
116 FORCE_INLINE static T reduce_scalar_combine(T a, T b) noexcept
117 {
118 using ScalarK = Microkernel<T, 1, GENERICARCH>;
119 if constexpr (Op == ReduceOp::Min)
120 return ScalarK::min(a, b);
121 if constexpr (Op == ReduceOp::Max)
122 return ScalarK::max(a, b);
123 if constexpr (Op == ReduceOp::Sum)
124 return ScalarK::add(a, b);
125 }
126
127 // --- Dispatch ---
128
        // Dispatcher: routes a reduction to the contiguous (SIMD-capable)
        // path or the logical (scalar, remapping) path.
        //
        // NOTE(review): the `if constexpr (...)` condition guarding the first
        // branch is missing here (the extracted listing dropped original line
        // 132) — as written, `else` has no matching `if` and this cannot
        // compile. Presumably the condition tests whether Expr's layout is
        // contiguous (non-permuted); recover it from version control and
        // confirm. Code left untouched pending that.
        template <ReduceOp Op, typename Expr>
        FORCE_INLINE static T reduce(const Expr &expr) noexcept
        {
            {
                // std::cout << "reduce_contiguous" << std::endl;
                return reduce_contiguous<Op>(expr);
            }
            else
            {
                // std::cout << "reduce_logical" << std::endl;
                return reduce_logical<Op>(expr);
            }
        }
143
144 // --- Contiguous path — iterate physical slices, skip padding ---
145
        // Contiguous path: iterate the physical buffer slice-by-slice, where a
        // "slice" is one contiguous run along the last dimension. Only the
        // first lastDim of each paddedLastDim physical slots hold logical
        // data, so bounding the per-slice loops by lastDim skips padding
        // without identity-element tricks.
        //
        // Per slice: SIMD handles the simdWidth-aligned prefix, a scalar tail
        // handles elements [scalarStart, lastDim). Padding is never read.
        template <ReduceOp Op, typename Expr>
        FORCE_INLINE static T reduce_contiguous(const Expr &expr) noexcept
        {
            using ExprPadPolicy = typename Expr::Layout::PadPolicyType;

            // Slice geometry, all known at compile time from the pad policy.
            static constexpr my_size_t lastDim = ExprPadPolicy::LastDim;
            static constexpr my_size_t paddedLastDim = ExprPadPolicy::PaddedLastDim;
            static constexpr my_size_t numSlices = ExprPadPolicy::PhysicalSize / paddedLastDim;
            static constexpr my_size_t simdSteps = lastDim / simdWidth;
            static constexpr my_size_t scalarStart = simdSteps * simdWidth;

            T result = reduce_identity<Op>();

            // --- SIMD path: simdWidth elements at a time within each slice ---
            if constexpr (simdSteps > 0)
            {
                // Seed every lane with the identity so untouched lanes are
                // harmless in the horizontal reduction below.
                typename K::VecType acc = K::set1(reduce_identity<Op>());

                for (my_size_t slice = 0; slice < numSlices; ++slice)
                {
                    const my_size_t base = slice * paddedLastDim;
                    for (my_size_t i = 0; i < simdSteps; ++i)
                        acc = reduce_simd_combine<Op>(
                            acc, expr.template evalu<T, Bits, Arch>(base + i * simdWidth));
                }

                // Horizontal reduction: spill the register and fold each lane
                // into the scalar result.
                alignas(DATA_ALIGNAS) T tmp[simdWidth];
                K::store(tmp, acc);

                for (my_size_t i = 0; i < simdWidth; ++i)
                    result = reduce_scalar_combine<Op>(result, tmp[i]);
            }

            // --- Scalar tail: per-slice elements SIMD couldn't cover ---
            // (also the whole slice when lastDim < simdWidth, simdSteps == 0)
            if constexpr (scalarStart < lastDim)
            {
                for (my_size_t slice = 0; slice < numSlices; ++slice)
                {
                    const my_size_t base = slice * paddedLastDim;
                    for (my_size_t i = scalarStart; i < lastDim; ++i)
                        result = reduce_scalar_combine<Op>(
                            result, expr.template evalu<T, 1, GENERICARCH>(base + i));
                }
            }

            return result;
        }
191
192 // --- Logical path — iterate logical flat indices, scalar only ---
193 //
194 // For permuted views. Consecutive logical flats are non-contiguous
195 // in physical memory, so SIMD load would read wrong data.
196 // Scalar evalu handles the remapping internally.
197
198 template <ReduceOp Op, typename Expr>
199 FORCE_INLINE static T reduce_logical(const Expr &expr) noexcept
200 {
201 static constexpr my_size_t totalSize = Expr::TotalSize;
202
203 T result = reduce_identity<Op>();
204
205 for (my_size_t i = 0; i < totalSize; ++i)
206 result = reduce_scalar_combine<Op>(
207 result, expr.template logical_evalu<T, 1, GENERICARCH>(i));
208
209 return result;
210 }
211 };
212
213} // namespace detail
214
215// =============================================================================
216// OLD IMPLEMENTATIONS — kept for reference during future refactors
217// =============================================================================
218
219// /**
220// * @brief Find the minimum element across all logical elements of an expression.
221// *
222// * Iterates slice-by-slice over the physical buffer, where "slice" means a contiguous
223// * slice along the last dimension. This naturally skips padding without needing
224// * identity-element tricks.
225// *
226// * @tparam Expr Expression type (FusedTensorND, PermutedViewConstExpr, etc.)
227// * @param expr The expression to reduce
228// * @return T The minimum logical element
229// *
230// * ============================================================================
231// * STRATEGY
232// * ============================================================================
233// *
234// * Physical memory is organized as numSlices × paddedLastDim, where only the
235// * first lastDim elements per slice are logical data:
236// *
237// * slice 0: [d d d d d P P P]
238// * slice 1: [d d d d d P P P] d = data, P = padding
239// * slice 2: [d d d d d P P P]
240// * |← lastDim→|
241// * |← paddedLastDim →|
242// *
243// * For a 3D tensor [2, 3, 5], numSlices = 2*3 = 6:
244// * slice 0 = [0,0,:] slice 3 = [1,0,:]
245// * slice 1 = [0,1,:] slice 4 = [1,1,:]
246// * slice 2 = [0,2,:] slice 5 = [1,2,:]
247// *
248// * Per slice, SIMD processes simdWidth-aligned chunks, then a scalar tail
249// * handles the remainder. Padding is never read.
250// *
251// * ============================================================================
252// * EXAMPLE: FusedTensorND<double, 2, 3, 5>, AVX (SimdWidth=4)
253// * ============================================================================
254// *
255// * lastDim=5, paddedLastDim=8, numSlices=6, simdSteps=1, scalarStart=4
256// *
257// * Per slice (8 physical slots):
258// * [d0 d1 d2 d3 | d4 P P P ]
259// * ^^^^^^^^^^^ ^^
260// * SIMD (i=0) scalar tail
261// *
262// * ============================================================================
263// * EXAMPLE: FusedTensorND<double, 2, 2>, AVX (SimdWidth=4)
264// * ============================================================================
265// *
266// * lastDim=2, paddedLastDim=4, numSlices=2, simdSteps=0, scalarStart=0
267// *
268// * Per slice (4 physical slots):
269// * [d0 d1 P P ]
270// * ^^^^^
271// * scalar only (SIMD block skipped entirely)
272// *
273// * ============================================================================
274// * GENERICARCH (SimdWidth=1): no padding, simdSteps=lastDim, no scalar tail.
275// * Microkernel ops inline to plain scalar — same codegen as a manual loop.
276// * ============================================================================
277// */
278// template <typename Expr>
279// FORCE_INLINE static T reduce_min(const Expr &expr) noexcept
280// {
281// using ExprLayout = typename Expr::Layout;
282// using ExprPadPolicy = typename ExprLayout::PadPolicyType;
283//
284// // Slice geometry from padding policy
285// static constexpr my_size_t lastDim = ExprPadPolicy::LastDim;
286// static constexpr my_size_t paddedLastDim = ExprPadPolicy::PaddedLastDim;
287// static constexpr my_size_t numSlices = ExprPadPolicy::PhysicalSize / paddedLastDim;
288// static constexpr my_size_t simdSteps = lastDim / simdWidth;
289// static constexpr my_size_t scalarStart = simdSteps * simdWidth;
290//
291// T result = NumericLimits<T>::max();
292//
293// // --- SIMD path: process simdWidth elements at a time per slice ---
294// if constexpr (simdSteps > 0)
295// {
296// typename K::VecType acc = K::set1(NumericLimits<T>::max());
297//
298// for (my_size_t slice = 0; slice < numSlices; ++slice)
299// {
300// const my_size_t base = slice * paddedLastDim;
301// for (my_size_t i = 0; i < simdSteps; ++i)
302// acc = K::min(acc, expr.template evalu<T, Bits, Arch>(base + i * simdWidth));
303// }
304//
305// // Horizontal reduction: collapse SIMD register to scalar
306// alignas(DATA_ALIGNAS) T tmp[simdWidth];
307// K::store(tmp, acc);
308//
309// for (my_size_t i = 0; i < simdWidth; ++i)
310// {
311// if (tmp[i] < result)
312// result = tmp[i];
313// }
314// }
315//
316// // --- Scalar tail: elements at [scalarStart, lastDim) per slice ---
317// if constexpr (scalarStart < lastDim)
318// {
319// for (my_size_t slice = 0; slice < numSlices; ++slice)
320// {
321// const my_size_t base = slice * paddedLastDim;
322// for (my_size_t i = scalarStart; i < lastDim; ++i)
323// {
324// T val = expr.template evalu<T, 1, GENERICARCH>(base + i);
325// if (val < result)
326// result = val;
327// }
328// }
329// }
330//
331// return result;
332// }
333
334// /**
335// * @brief Find the maximum element across all logical elements of an expression.
336// * (Same strategy as reduce_min — see above for full documentation)
337// */
338// template <typename Expr>
339// FORCE_INLINE static T reduce_max(const Expr &expr) noexcept
340// {
341// using ExprLayout = typename Expr::Layout;
342// using ExprPadPolicy = typename ExprLayout::PadPolicyType;
343//
344// static constexpr my_size_t lastDim = ExprPadPolicy::LastDim;
345// static constexpr my_size_t paddedLastDim = ExprPadPolicy::PaddedLastDim;
346// static constexpr my_size_t numSlices = ExprPadPolicy::PhysicalSize / paddedLastDim;
347// static constexpr my_size_t simdSteps = lastDim / simdWidth;
348// static constexpr my_size_t scalarStart = simdSteps * simdWidth;
349//
350// T result = NumericLimits<T>::lowest();
351//
352// if constexpr (simdSteps > 0)
353// {
354// typename K::VecType acc = K::set1(NumericLimits<T>::lowest());
355//
356// for (my_size_t slice = 0; slice < numSlices; ++slice)
357// {
358// const my_size_t base = slice * paddedLastDim;
359// for (my_size_t i = 0; i < simdSteps; ++i)
360// acc = K::max(acc, expr.template evalu<T, Bits, Arch>(base + i * simdWidth));
361// }
362//
363// alignas(DATA_ALIGNAS) T tmp[simdWidth];
364// K::store(tmp, acc);
365//
366// for (my_size_t i = 0; i < simdWidth; ++i)
367// {
368// if (tmp[i] > result)
369// result = tmp[i];
370// }
371// }
372//
373// if constexpr (scalarStart < lastDim)
374// {
375// for (my_size_t slice = 0; slice < numSlices; ++slice)
376// {
377// const my_size_t base = slice * paddedLastDim;
378// for (my_size_t i = scalarStart; i < lastDim; ++i)
379// {
380// T val = expr.template evalu<T, 1, GENERICARCH>(base + i);
381// if (val > result)
382// result = val;
383// }
384// }
385// }
386//
387// return result;
388// }
389
390// /**
391// * @brief Sum all logical elements of an expression.
392// * (Same strategy as reduce_min — see above for full documentation)
393// *
394// * NOTE: Padding is zero, so iterating the full physical buffer would give
395// * a correct sum. But slice-by-slice is used for consistency and because
396// * linear iteration over TotalSize crosses padding boundaries mid-SIMD-load.
397// */
398// template <typename Expr>
399// FORCE_INLINE static T reduce_sum(const Expr &expr) noexcept
400// {
401// using ExprLayout = typename Expr::Layout;
402// using ExprPadPolicy = typename ExprLayout::PadPolicyType;
403//
404// static constexpr my_size_t lastDim = ExprPadPolicy::LastDim;
405// static constexpr my_size_t paddedLastDim = ExprPadPolicy::PaddedLastDim;
406// static constexpr my_size_t numSlices = ExprPadPolicy::PhysicalSize / paddedLastDim;
407// static constexpr my_size_t simdSteps = lastDim / simdWidth;
408// static constexpr my_size_t scalarStart = simdSteps * simdWidth;
409//
410// T result = T{0};
411//
412// if constexpr (simdSteps > 0)
413// {
414// typename K::VecType acc = K::set1(T{0});
415//
416// for (my_size_t slice = 0; slice < numSlices; ++slice)
417// {
418// const my_size_t base = slice * paddedLastDim;
419// for (my_size_t i = 0; i < simdSteps; ++i)
420// acc = K::add(acc, expr.template evalu<T, Bits, Arch>(base + i * simdWidth));
421// }
422//
423// alignas(DATA_ALIGNAS) T tmp[simdWidth];
424// K::store(tmp, acc);
425//
426// for (my_size_t i = 0; i < simdWidth; ++i)
427// result += tmp[i];
428// }
429//
430// if constexpr (scalarStart < lastDim)
431// {
432// for (my_size_t slice = 0; slice < numSlices; ++slice)
433// {
434// const my_size_t base = slice * paddedLastDim;
435// for (my_size_t i = scalarStart; i < lastDim; ++i)
436// result += expr.template evalu<T, 1, GENERICARCH>(base + i);
437// }
438// }
439//
440// return result;
441// }
442
443#endif // KERNEL_REDUCE_H
Global configuration for the tesseract tensor library.
#define my_size_t
Size/index type used throughout the library.
Definition config.h:126
#define FORCE_INLINE
Hint the compiler to always inline a function.
Definition config.h:26
constexpr my_size_t DATA_ALIGNAS
Definition microkernel_base.h:145
Definition BaseExpr.h:4
ReduceOp
Definition kernel_reduce.h:51
Definition microkernel_base.h:16
T VecType
Definition microkernel_base.h:18
static FORCE_INLINE void store(T *ptr, VecType val) noexcept
static FORCE_INLINE VecType min(VecType a, VecType b) noexcept
static constexpr my_size_t simdWidth
Definition microkernel_base.h:17
static FORCE_INLINE VecType set1(T scalar) noexcept
static FORCE_INLINE VecType add(VecType a, VecType b) noexcept
static FORCE_INLINE VecType max(VecType a, VecType b) noexcept
Compile-time numeric limits for a given type.
Definition numeric_limits.h:17
Definition kernel_reduce.h:59
static FORCE_INLINE T reduce_sum(const Expr &expr) noexcept
Definition kernel_reduce.h:80
static FORCE_INLINE T reduce_min(const Expr &expr) noexcept
Definition kernel_reduce.h:68
static FORCE_INLINE T reduce_max(const Expr &expr) noexcept
Definition kernel_reduce.h:74
static constexpr my_size_t simdWidth
Definition kernel_reduce.h:61
Definition basic_expr_traits.h:6