tesseract++ 0.0.1
N-dimensional tensor library for embedded systems
Loading...
Searching...
No Matches
kernel_ops.h
Go to the documentation of this file.
1// // Higher-level kernel operations built on top of microkernels
2// #ifndef KERNEL_OPS_H
3// #define KERNEL_OPS_H
4
5// #include "config.h"
6// #include "fused/microkernels/microkernel_base.h"
7// #include "numeric_limits.h"
8// #include "helper_traits.h"
9// #include "fused/padding_policies/simd_padding_policy.h"
10// #include "expression_traits/expression_traits.h"
11
12// template <typename T, my_size_t Bits, typename Arch>
13// struct KernelOps
14// {
15// using K = Microkernel<T, Bits, Arch>;
16// static constexpr my_size_t simdWidth = K::simdWidth;
17
18// // ========================================================================
19// // Evaluation
20// // ========================================================================
21
22// /**
23// * @brief Dispatch: pick contiguous or permuted eval based on expression layout.
24// *
25// * @tparam Expr
26// * @param output
27// * @param expr
28// */
29// template <typename Expr>
30// FORCE_INLINE static void eval(T *output, const Expr &expr) noexcept
31// {
32// if constexpr (!expression::traits<Expr>::IsPermuted)
33// {
34// // std::cout << "eval_contiguous" << std::endl;
35// eval_vectorized_contiguous(output, expr);
36// }
37// else
38// {
39// // std::cout << "eval_permuted" << std::endl;
40// eval_vectorized_permuted(output, expr);
41// }
42// }
43
44// // // ========================================================================
45// // // Reductions
46// // // ========================================================================
47
48// // /**
49// // * @brief Find the minimum element across all logical elements of an expression.
50// // *
51// // * Iterates slice-by-slice over the physical buffer, where "slice" means a contiguous
52// // * slice along the last dimension. This naturally skips padding without needing
53// // * identity-element tricks.
54// // *
55// // * @tparam Expr Expression type (FusedTensorND, PermutedViewConstExpr, etc.)
56// // * @param expr The expression to reduce
57// // * @return T The minimum logical element
58// // *
59// // * ============================================================================
60// // * STRATEGY
61// // * ============================================================================
62// // *
63// // * Physical memory is organized as numSlices × paddedLastDim, where only the
64// // * first lastDim elements per slice are logical data:
65// // *
66// // * slice 0: [d d d d d P P P]
67// // * slice 1: [d d d d d P P P] d = data, P = padding
68// // * slice 2: [d d d d d P P P]
69// // * |← lastDim→|
70// // * |← paddedLastDim →|
71// // *
72// // * For a 3D tensor [2, 3, 5], numSlices = 2*3 = 6:
73// // * slice 0 = [0,0,:] slice 3 = [1,0,:]
74// // * slice 1 = [0,1,:] slice 4 = [1,1,:]
75// // * slice 2 = [0,2,:] slice 5 = [1,2,:]
76// // *
77// // * Per slice, SIMD processes simdWidth-aligned chunks, then a scalar tail
78// // * handles the remainder. Padding is never read.
79// // *
80// // * ============================================================================
81// // * EXAMPLE: FusedTensorND<double, 2, 3, 5>, AVX (SimdWidth=4)
82// // * ============================================================================
83// // *
84// // * lastDim=5, paddedLastDim=8, numSlices=6, simdSteps=1, scalarStart=4
85// // *
86// // * Per slice (8 physical slots):
87// // * [d0 d1 d2 d3 | d4 P P P ]
88// // * ^^^^^^^^^^^ ^^
89// // * SIMD (i=0) scalar tail
90// // *
91// // * ============================================================================
92// // * EXAMPLE: FusedTensorND<double, 2, 2>, AVX (SimdWidth=4)
93// // * ============================================================================
94// // *
95// // * lastDim=2, paddedLastDim=4, numSlices=2, simdSteps=0, scalarStart=0
96// // *
97// // * Per slice (4 physical slots):
98// // * [d0 d1 P P ]
99// // * ^^^^^
100// // * scalar only (SIMD block skipped entirely)
101// // *
102// // * ============================================================================
103// // * GENERICARCH (SimdWidth=1): no padding, simdSteps=lastDim, no scalar tail.
104// // * Microkernel ops inline to plain scalar — same codegen as a manual loop.
105// // * ============================================================================
106// // */
107// // template <typename Expr>
108// // FORCE_INLINE static T reduce_min(const Expr &expr) noexcept
109// // {
110// // using ExprLayout = typename Expr::Layout;
111// // using ExprPadPolicy = typename ExprLayout::PadPolicyType;
112
113// // // Slice geometry from padding policy
114// // static constexpr my_size_t lastDim = ExprPadPolicy::LastDim; // logical elements per slice
115// // static constexpr my_size_t paddedLastDim = ExprPadPolicy::PaddedLastDim; // physical stride per slice
116// // static constexpr my_size_t numSlices = ExprPadPolicy::PhysicalSize / paddedLastDim; // total slices across all dims
117// // static constexpr my_size_t simdSteps = lastDim / simdWidth; // full SIMD chunks per slice
118// // static constexpr my_size_t scalarStart = simdSteps * simdWidth; // where scalar tail begins
119
120// // T result = NumericLimits<T>::max();
121
122// // // --- SIMD path: process simdWidth elements at a time per slice ---
123// // if constexpr (simdSteps > 0)
124// // {
125// // typename K::VecType acc = K::set1(NumericLimits<T>::max());
126
127// // for (my_size_t slice = 0; slice < numSlices; ++slice)
128// // {
129// // const my_size_t base = slice * paddedLastDim; // physical offset of slice start
130// // for (my_size_t i = 0; i < simdSteps; ++i)
131// // acc = K::min(acc, expr.template evalu<T, Bits, Arch>(base + i * simdWidth));
132// // }
133
134// // // Horizontal reduction: collapse SIMD register to scalar
135// // alignas(DATA_ALIGNAS) T tmp[simdWidth];
136// // K::store(tmp, acc);
137
138// // for (my_size_t i = 0; i < simdWidth; ++i)
139// // {
140// // if (tmp[i] < result)
141// // result = tmp[i];
142// // }
143// // }
144
145// // // --- Scalar tail: elements at [scalarStart, lastDim) per slice ---
146// // if constexpr (scalarStart < lastDim)
147// // {
148// // for (my_size_t slice = 0; slice < numSlices; ++slice)
149// // {
150// // const my_size_t base = slice * paddedLastDim;
151// // for (my_size_t i = scalarStart; i < lastDim; ++i)
152// // {
153// // T val = expr.template evalu<T, 1, GENERICARCH>(base + i);
154// // if (val < result)
155// // result = val;
156// // }
157// // }
158// // }
159
160// // return result;
161// // }
162
163// // /**
164// // * @brief Find the maximum element across all logical elements of an expression.
165// // *
166// // * Iterates slice-by-slice over the physical buffer, where "slice" means a contiguous
167// // * slice along the last dimension. This naturally skips padding without needing
168// // * identity-element tricks.
169// // *
170// // * @tparam Expr Expression type (FusedTensorND, PermutedViewConstExpr, etc.)
171// // * @param expr The expression to reduce
172// // * @return T The maximum logical element
173// // *
174// // * ============================================================================
175// // * STRATEGY
176// // * ============================================================================
177// // *
178// // * Physical memory is organized as numSlices × paddedLastDim, where only the
179// // * first lastDim elements per slice are logical data:
180// // *
181// // * slice 0: [d d d d d P P P]
182// // * slice 1: [d d d d d P P P] d = data, P = padding
183// // * slice 2: [d d d d d P P P]
184// // * |← lastDim→|
185// // * |← paddedLastDim →|
186// // *
187// // * For a 3D tensor [2, 3, 5], numSlices = 2*3 = 6:
188// // * slice 0 = [0,0,:] slice 3 = [1,0,:]
189// // * slice 1 = [0,1,:] slice 4 = [1,1,:]
190// // * slice 2 = [0,2,:] slice 5 = [1,2,:]
191// // *
192// // * Per slice, SIMD processes simdWidth-aligned chunks, then a scalar tail
193// // * handles the remainder. Padding is never read.
194// // *
195// // * ============================================================================
196// // * GENERICARCH (SimdWidth=1): no padding, simdSteps=lastDim, no scalar tail.
197// // * Microkernel ops inline to plain scalar — same codegen as a manual loop.
198// // * ============================================================================
199// // */
200// // template <typename Expr>
201// // FORCE_INLINE static T reduce_max(const Expr &expr) noexcept
202// // {
203// // using ExprLayout = typename Expr::Layout;
204// // using ExprPadPolicy = typename ExprLayout::PadPolicyType;
205
206// // // Slice geometry from padding policy
207// // static constexpr my_size_t lastDim = ExprPadPolicy::LastDim; // logical elements per slice
208// // static constexpr my_size_t paddedLastDim = ExprPadPolicy::PaddedLastDim; // physical stride per slice
209// // static constexpr my_size_t numSlices = ExprPadPolicy::PhysicalSize / paddedLastDim; // total slices across all dims
210// // static constexpr my_size_t simdSteps = lastDim / simdWidth; // full SIMD chunks per slice
211// // static constexpr my_size_t scalarStart = simdSteps * simdWidth; // where scalar tail begins
212
213// // T result = NumericLimits<T>::lowest();
214
215// // // --- SIMD path: process simdWidth elements at a time per slice ---
216// // if constexpr (simdSteps > 0)
217// // {
218// // typename K::VecType acc = K::set1(NumericLimits<T>::lowest());
219
220// // for (my_size_t slice = 0; slice < numSlices; ++slice)
221// // {
222// // const my_size_t base = slice * paddedLastDim; // physical offset of slice start
223// // for (my_size_t i = 0; i < simdSteps; ++i)
224// // acc = K::max(acc, expr.template evalu<T, Bits, Arch>(base + i * simdWidth));
225// // }
226
227// // // Horizontal reduction: collapse SIMD register to scalar
228// // alignas(DATA_ALIGNAS) T tmp[simdWidth];
229// // K::store(tmp, acc);
230
231// // for (my_size_t i = 0; i < simdWidth; ++i)
232// // {
233// // if (tmp[i] > result)
234// // result = tmp[i];
235// // }
236// // }
237
238// // // --- Scalar tail: elements at [scalarStart, lastDim) per slice ---
239// // if constexpr (scalarStart < lastDim)
240// // {
241// // for (my_size_t slice = 0; slice < numSlices; ++slice)
242// // {
243// // const my_size_t base = slice * paddedLastDim;
244// // for (my_size_t i = scalarStart; i < lastDim; ++i)
245// // {
246// // T val = expr.template evalu<T, 1, GENERICARCH>(base + i);
247// // if (val > result)
248// // result = val;
249// // }
250// // }
251// // }
252
253// // return result;
254// // }
255
256// // /**
257// // * @brief Sum all logical elements of an expression.
258// // *
259// // * Iterates slice-by-slice over the physical buffer, skipping padding.
260// // * Same strategy as reduce_min/reduce_max.
261// // *
262// // * ============================================================================
263// // * NOTE: Padding is zero, so iterating the full physical buffer would give
264// // * a correct sum. But slice-by-slice is used for consistency and because
265// // * linear iteration over TotalSize crosses padding boundaries mid-SIMD-load.
266// // * ============================================================================
267// // *
268// // * @tparam Expr Expression type (FusedTensorND, PermutedViewConstExpr, etc.)
269// // * @param expr The expression to reduce
270// // * @return T The sum of all logical elements
271// // */
272// // template <typename Expr>
273// // FORCE_INLINE static T reduce_sum(const Expr &expr) noexcept
274// // {
275// // using ExprLayout = typename Expr::Layout;
276// // using ExprPadPolicy = typename ExprLayout::PadPolicyType;
277
278// // static constexpr my_size_t lastDim = ExprPadPolicy::LastDim;
279// // static constexpr my_size_t paddedLastDim = ExprPadPolicy::PaddedLastDim;
280// // static constexpr my_size_t numSlices = ExprPadPolicy::PhysicalSize / paddedLastDim;
281// // static constexpr my_size_t simdSteps = lastDim / simdWidth;
282// // static constexpr my_size_t scalarStart = simdSteps * simdWidth;
283
284// // T result = T{0};
285
286// // if constexpr (simdSteps > 0)
287// // {
288// // typename K::VecType acc = K::set1(T{0});
289
290// // for (my_size_t slice = 0; slice < numSlices; ++slice)
291// // {
292// // const my_size_t base = slice * paddedLastDim;
293// // for (my_size_t i = 0; i < simdSteps; ++i)
294// // acc = K::add(acc, expr.template evalu<T, Bits, Arch>(base + i * simdWidth));
295// // }
296
297// // alignas(DATA_ALIGNAS) T tmp[simdWidth];
298// // K::store(tmp, acc);
299
300// // for (my_size_t i = 0; i < simdWidth; ++i)
301// // result += tmp[i];
302// // }
303
304// // if constexpr (scalarStart < lastDim)
305// // {
306// // for (my_size_t slice = 0; slice < numSlices; ++slice)
307// // {
308// // const my_size_t base = slice * paddedLastDim;
309// // for (my_size_t i = scalarStart; i < lastDim; ++i)
310// // result += expr.template evalu<T, 1, GENERICARCH>(base + i);
311// // }
312// // }
313
314// // return result;
315// // }
316
317// template <typename Expr>
318// FORCE_INLINE static T reduce_min(const Expr &expr) noexcept
319// {
320// return reduce<ReduceOp::Min>(expr);
321// }
322
323// template <typename Expr>
324// FORCE_INLINE static T reduce_max(const Expr &expr) noexcept
325// {
326// return reduce<ReduceOp::Max>(expr);
327// }
328
329// template <typename Expr>
330// FORCE_INLINE static T reduce_sum(const Expr &expr) noexcept
331// {
332// return reduce<ReduceOp::Sum>(expr);
333// }
334
335// /**
336// * @brief Check if all logical elements of two expressions are approximately equal.
337// *
338// * Dispatches based on layout compatibility:
// 339// * - Both sides contiguous (expression::traits<Expr>::IsContiguous) → fast
// // * physical slice iteration. NOTE(review): this check only tests each side's
// // * IsContiguous flag, then approx_equal_contiguous uses Expr1's padding
// // * policy for BOTH sides — it silently assumes identical padded layouts;
// // * confirm or enforce that at compile time.
// 340// * - Otherwise → logical flat iteration (handles any layout combination)
341// *
342// * Both expressions must have the same logical dimensions.
// 343// * TODO(review): enforce at compile time — a static_assert that Expr1 and Expr2 have identical logical dims is cheap and removes any reliance on callers getting it right.
344// *
345// * @tparam Expr1 First expression type
346// * @tparam Expr2 Second expression type
347// * @param lhs Left-hand side expression
348// * @param rhs Right-hand side expression
349// * @param tolerance Approximation tolerance for floating-point comparisons
350// * @return true if all corresponding logical elements are approximately equal within the given tolerance, false otherwise
351// */
352// template <typename Expr1, typename Expr2>
353// FORCE_INLINE static bool reduce_all_approx_equal(
354// const Expr1 &lhs,
355// const Expr2 &rhs,
356// T tolerance) noexcept
357// {
358// if constexpr (expression::traits<Expr1>::IsContiguous &&
359// expression::traits<Expr2>::IsContiguous)
360// return approx_equal_contiguous(lhs, rhs, tolerance);
361// else
362// return approx_equal_logical(lhs, rhs, tolerance);
363// }
364
365// // ========================================================================
366// // DOT PRODUCTS (used by einsum for contraction along a shared axis)
367// // ========================================================================
368// //
369// // All functions take PHYSICAL offsets and strides from the layout.
370// // The caller (einsum) computes these via Layout::stride() and
371// // Layout::logical_coords_to_physical_flat().
372// //
373// // For C[i,j] = sum_k A[i,k] * B[k,j] with A[M,K] and B[K,N]:
374// //
375// // A's fiber along k (last dim): base=A.stride(0)*i, stride=1 → contiguous
376// // B's fiber along k (first dim): base=j, stride=B.stride(0) → strided
377// //
378// // Physical memory for A[2,3] padded to [2,4]:
379// // [a00 a01 a02 P | a10 a11 a12 P]
380// // ^^^^^^^^^^^ ^^^^^^^^^^^
381// // fiber i=0 fiber i=1 → contiguous, len=3
382// //
383// // Physical memory for B[3,2] padded to [3,4]:
384// // [b00 b01 P P | b10 b11 P P | b20 b21 P P]
385// // ^ ^ ^
386// // fiber j=0, stride=4 → strided, len=3
387
388// /**
389// * @brief Dispatch dot product based on stride values.
390// *
391// * @param base1 Physical offset of first fiber's start
392// * @param stride1 Physical stride along contraction axis (1 = contiguous)
393// * @param base2 Physical offset of second fiber's start
394// * @param stride2 Physical stride along contraction axis (1 = contiguous)
395// * @param len Number of elements along contraction axis (logical dim)
396// */
397// template <typename Expr1, typename Expr2>
398// FORCE_INLINE static T dot(
399// const Expr1 &expr1, my_size_t base1, my_size_t stride1,
400// const Expr2 &expr2, my_size_t base2, my_size_t stride2,
401// my_size_t len) noexcept
402// {
403// if (stride1 == 1 && stride2 == 1)
404// return dot_contiguous_impl(expr1, expr2, base1, base2, len);
405// else
406// return dot_strided_impl(expr1, expr2, base1, base2, stride1, stride2, len);
407// }
408
409// /**
410// * @brief Contiguous dot product — both fibers have stride 1.
411// *
// 412// * Uses the vectorized evalu path (K::load underneath for plain tensors) for aligned SIMD access. len may not be a multiple
413// * of simdWidth (e.g., logical last dim = 5, simdWidth = 4), so
414// * a scalar remainder handles the tail.
415// *
416// * fiber1: [v0 v1 v2 v3 | v4] fiber2: [w0 w1 w2 w3 | w4]
417// * ^^^^^^^^^^^ ^^ ^^^^^^^^^^^ ^^
418// * SIMD scalar SIMD scalar
419// */
420// template <typename Expr1, typename Expr2>
421// FORCE_INLINE static T dot_contiguous_impl(
422// const Expr1 &expr1,
423// const Expr2 &expr2,
424// my_size_t base1,
425// my_size_t base2,
426// my_size_t len) noexcept
427// {
428// // std::cout << "dot_contiguous_impl" << std::endl;
429// const my_size_t simdSteps = len / simdWidth;
430// const my_size_t scalarStart = simdSteps * simdWidth;
431
432// T result = T{0};
433
434// if (simdSteps > 0)
435// {
436// typename K::VecType acc = K::set1(T{0});
437
438// for (my_size_t i = 0; i < simdSteps; ++i)
439// {
440// auto v1 = expr1.template evalu<T, Bits, Arch>(base1 + i * simdWidth);
441// auto v2 = expr2.template evalu<T, Bits, Arch>(base2 + i * simdWidth);
442// acc = fmadd_safe(v1, v2, acc);
443// }
444
445// alignas(DATA_ALIGNAS) T tmp[simdWidth];
446// K::store(tmp, acc);
447
448// for (my_size_t i = 0; i < simdWidth; ++i)
449// result += tmp[i];
450// }
451
452// for (my_size_t i = scalarStart; i < len; ++i)
453// {
454// T v1 = expr1.template evalu<T, 1, GENERICARCH>(base1 + i);
455// T v2 = expr2.template evalu<T, 1, GENERICARCH>(base2 + i);
456// result += v1 * v2;
457// }
458
459// return result;
460// }
461
462// /**
463// * @brief Strided dot product — one or both fibers have stride > 1.
464// *
465// * Builds explicit index lists and uses K::gather for SIMD access.
466// * Falls back to scalar for remainder.
467// *
468// * fiber along dim 0 of B[3,2] padded to [3,4]:
469// * [b00 _ _ _ | b10 _ _ _ | b20 _ _ _]
470// * ^ ^ ^
471// * idx=0 idx=4 idx=8 stride=4, len=3
472// */
473// template <typename Expr1, typename Expr2>
474// FORCE_INLINE static T dot_strided_impl(
475// const Expr1 &expr1,
476// const Expr2 &expr2,
477// my_size_t idx1,
478// my_size_t idx2,
479// my_size_t stride1,
480// my_size_t stride2,
481// my_size_t len) noexcept
482// {
483// // std::cout << "dot_strided_impl" << std::endl;
484// const my_size_t simdSteps = len / simdWidth;
485// const my_size_t scalarStart = simdSteps * simdWidth;
486
487// T result = T{0};
488
489// if (simdSteps > 0)
490// {
491// typename K::VecType acc = K::set1(T{0});
492
493// for (my_size_t i = 0; i < simdSteps; ++i)
494// {
495// // Build gather indices for this chunk
496// my_size_t idxList1[simdWidth];
497// my_size_t idxList2[simdWidth];
498// for (my_size_t j = 0; j < simdWidth; ++j)
499// {
500// idxList1[j] = idx1 + j * stride1;
501// idxList2[j] = idx2 + j * stride2;
502// }
503
504// auto v1 = K::gather(expr1.data(), idxList1);
505// auto v2 = K::gather(expr2.data(), idxList2);
506// acc = fmadd_safe(v1, v2, acc);
507
508// idx1 += simdWidth * stride1;
509// idx2 += simdWidth * stride2;
510// }
511
512// alignas(DATA_ALIGNAS) T tmp[simdWidth];
513// K::store(tmp, acc);
514
515// for (my_size_t i = 0; i < simdWidth; ++i)
516// result += tmp[i];
517// }
518
519// // Scalar tail
520// for (my_size_t i = scalarStart; i < len; ++i)
521// {
522// T v1 = expr1.template evalu<T, 1, GENERICARCH>(idx1);
523// T v2 = expr2.template evalu<T, 1, GENERICARCH>(idx2);
524// result += v1 * v2;
525// idx1 += stride1;
526// idx2 += stride2;
527// }
528
529// return result;
530// }
531
532// /**
533// * @brief Naive scalar dot product for testing/validation.
534// *
535// * Accesses physical memory directly via data_.data().
536// * Only used in tests to verify SIMD dot results.
537// */
538// template <typename Expr1, typename Expr2>
539// FORCE_INLINE static T naive_dot_physical(
540// const Expr1 &expr1, my_size_t base1, my_size_t stride1,
541// const Expr2 &expr2, my_size_t base2, my_size_t stride2,
542// my_size_t len) noexcept
543// {
544// T sum = T{0};
545// for (my_size_t i = 0; i < len; ++i)
546// sum += expr1.data_.data()[base1 + i * stride1] *
547// expr2.data_.data()[base2 + i * stride2];
548// return sum;
549// }
550
551// private:
552// enum class ReduceOp
553// {
554// Min,
555// Max,
556// Sum
557// };
558
559// template <ReduceOp Op>
560// FORCE_INLINE static T reduce_identity() noexcept
561// {
562// if constexpr (Op == ReduceOp::Min)
563// return NumericLimits<T>::max();
564// if constexpr (Op == ReduceOp::Max)
565// return NumericLimits<T>::lowest();
566// if constexpr (Op == ReduceOp::Sum)
567// return T{0};
568// }
569
570// template <ReduceOp Op>
571// FORCE_INLINE static typename K::VecType reduce_simd_combine(
572// typename K::VecType a, typename K::VecType b) noexcept
573// {
574// if constexpr (Op == ReduceOp::Min)
575// return K::min(a, b);
576// if constexpr (Op == ReduceOp::Max)
577// return K::max(a, b);
578// if constexpr (Op == ReduceOp::Sum)
579// return K::add(a, b);
580// }
581
582// template <ReduceOp Op>
583// FORCE_INLINE static T reduce_scalar_combine(T a, T b) noexcept
584// {
585// using ScalarK = Microkernel<T, 1, GENERICARCH>;
586// if constexpr (Op == ReduceOp::Min)
587// return ScalarK::min(a, b);
588// if constexpr (Op == ReduceOp::Max)
589// return ScalarK::max(a, b);
590// if constexpr (Op == ReduceOp::Sum)
591// return ScalarK::add(a, b);
592// }
593// /**
594// * @brief CONTIGUOUS PATH (identity layout) or fast path — no permutation, no remapping.
595// *
596// * Iterates entire physical buffer linearly. PhysicalSize is guaranteed
597// * to be a multiple of simdWidth by the padding policy. Padding
598// * slots contain zeros — harmless for element-wise ops.
599// *
600// * evalu receives physical flat offsets.
601// *
602// * Note: FusedTensorND::evalu works for both paths because with identity layout,
603// * physical flat and logical flat are the same within each slice
604// * (the contiguous path passes physical offsets,
605// * the permuted path passes logical flats,
606// * same numbers when unpermuted).
607// *
608// * @tparam Expr Expression type (FusedTensorND, PermutedViewConstExpr, etc.)
609// * @param output Pointer to output buffer (already allocated, size = PhysicalSize)
610// * @param expr The expression to evaluate and store in output
611// */
612// template <typename Expr>
613// FORCE_INLINE static void eval_vectorized_contiguous(
614// T *output,
615// const Expr &expr) noexcept
616// {
617// using Layout = typename Expr::Layout;
618// static constexpr my_size_t physicalSize = Layout::PhysicalSize;
619// static constexpr my_size_t simdSteps = physicalSize / simdWidth;
620// static constexpr bool hasRemainder = (physicalSize % simdWidth) != 0;
621
622// // Paranoia check: ensure physical size is a multiple of SIMD width,
623// // so we never read out of bounds
624// static_assert(physicalSize % simdWidth == 0,
625// "PhysicalSize must be a multiple of SimdWidth");
626
627// // SIMD loop
628// for (my_size_t i = 0; i < simdSteps; ++i)
629// {
630// auto val = expr.template evalu<T, Bits, Arch>(i * simdWidth);
631// K::store(output + i * simdWidth, val);
632// }
633
// 634// // Scalar remainder: confirmed dead. The static_assert above guarantees
// 635// // physicalSize % simdWidth == 0, so hasRemainder is always false and the
// 636// // branch below is unreachable — delete it (its std::cout would also require <iostream>, which this header does not include).
637// if constexpr (hasRemainder)
638// {
639// std::cout << "Warning: Scalar evaluation for remainder elements." << std::endl;
640// // for (my_size_t i = simdSteps * simdWidth; i < physicalSize; ++i)
641// // {
642// // output[i] = expr.template evalu<T, 1, GENERICARCH>(i);
643// // }
644// }
645// }
646
647// /**
648// * @brief PERMUTED PATH (any layout) or general path — works with any layout, including permuted.
649// * TODO: untested
650// * Iterates output physical slices for aligned stores. Tracks a running
651// * logical_flat for the expression. Permuted evalu uses K::gather,
652// * unpermuted uses K::load — dispatch happens inside evalu, not here.
653// *
654// * ============================================================================
655// * EXAMPLE: output [3,2] padded to [3,4], source [2,3] transposed
656// * ============================================================================
657// *
658// * lastDim=2, paddedLastDim=4, numSlices=3, simdSteps=0, scalarStart=0
659// *
660// * slice 0: out_base=0, logical_flat 0,1 → store at output[0], output[1]
661// * slice 1: out_base=4, logical_flat 2,3 → store at output[4], output[5]
662// * slice 2: out_base=8, logical_flat 4,5 → store at output[8], output[9]
663// *
664// * Padding at output[2,3,6,7,10,11] untouched.
665// * ============================================================================
666// */
667
668// template <typename Expr, typename Seq>
669// struct OutputPadImpl
670// {
671// };
672
673// template <typename Expr, my_size_t... Is>
674// struct OutputPadImpl<Expr, index_seq<Is...>>
675// {
676// using type = SimdPaddingPolicy<typename Expr::value_type, Expr::Dim[Is]...>;
677// };
678
679// template <typename Expr>
680// struct OutputPadPolicy
681// {
682// using type = typename OutputPadImpl<Expr, typename make_index_seq<Expr::NumDims>::type>::type;
683// };
684
685// template <typename Expr>
686// FORCE_INLINE static void eval_vectorized_permuted(
687// T *output,
688// const Expr &expr) noexcept
689// {
690// // using ExprPadPolicy = typename Expr::Layout::PadPolicyType;
691
692// // using OutputPad = SimdPaddingPolicy<T, Expr::Layout::PadPolicyType::LogicalDims...>;
693// // using OutputPad = OutputPadPolicy<typename Expr::Layout>;
694
695// using OutputPad = typename OutputPadPolicy<Expr>::type;
696
697// // static constexpr my_size_t lastDim = ExprPadPolicy::LastDim;
698// // static constexpr my_size_t paddedLastDim = ExprPadPolicy::PaddedLastDim;
699// // static constexpr my_size_t numSlices = ExprPadPolicy::PhysicalSize / paddedLastDim;
700
701// static constexpr my_size_t lastDim = OutputPad::LastDim;
702// static constexpr my_size_t paddedLastDim = OutputPad::PaddedLastDim;
703// static constexpr my_size_t numSlices = OutputPad::PhysicalSize / paddedLastDim;
704
705// static constexpr my_size_t simdSteps = lastDim / simdWidth;
706// static constexpr my_size_t scalarStart = simdSteps * simdWidth;
707
708// my_size_t logical_flat = 0;
709
710// for (my_size_t slice = 0; slice < numSlices; ++slice)
711// {
712// const my_size_t out_base = slice * paddedLastDim;
713
714// for (my_size_t i = 0; i < simdSteps; ++i)
715// {
716// auto val = expr.template evalu<T, Bits, Arch>(logical_flat);
717// K::store(output + out_base + i * simdWidth, val);
718// logical_flat += simdWidth;
719// }
720
721// if constexpr (scalarStart < lastDim)
722// {
723// for (my_size_t i = scalarStart; i < lastDim; ++i)
724// {
725// output[out_base + i] = expr.template evalu<T, 1, GENERICARCH>(logical_flat);
726// ++logical_flat;
727// }
728// }
729// }
730// }
731
732// /**
733// * @brief Fast path — both expressions share the same physical layout.
734// *
735// * Iterates physical slices. evalu receives physical offsets.
736// * Same slice geometry for both sides.
737// *
738// * @tparam Expr1 First expression type
739// * @tparam Expr2 Second expression type
740// * @param lhs Left-hand side expression
741// * @param rhs Right-hand side expression
742// * @param tolerance Approximation tolerance for floating-point comparisons
743// * @return true if all corresponding logical elements are approximately equal within the given tolerance, false otherwise
744// */
745// template <typename Expr1, typename Expr2>
746// FORCE_INLINE static bool approx_equal_contiguous(
747// const Expr1 &lhs,
748// const Expr2 &rhs,
749// T tolerance) noexcept
750// {
751// using ExprPadPolicy = typename Expr1::Layout::PadPolicyType;
752
753// static constexpr my_size_t lastDim = ExprPadPolicy::LastDim;
754// static constexpr my_size_t paddedLastDim = ExprPadPolicy::PaddedLastDim;
755// static constexpr my_size_t numSlices = ExprPadPolicy::PhysicalSize / paddedLastDim;
756// static constexpr my_size_t simdSteps = lastDim / simdWidth;
757// static constexpr my_size_t scalarStart = simdSteps * simdWidth;
758
759// if constexpr (simdSteps > 0)
760// {
761// for (my_size_t slice = 0; slice < numSlices; ++slice)
762// {
763// const my_size_t base = slice * paddedLastDim;
764// for (my_size_t i = 0; i < simdSteps; ++i)
765// {
766// auto lhs_vec = lhs.template evalu<T, Bits, Arch>(base + i * simdWidth);
767// auto rhs_vec = rhs.template evalu<T, Bits, Arch>(base + i * simdWidth);
768// if (!K::all_within_tolerance(lhs_vec, rhs_vec, tolerance))
769// return false;
770// }
771// }
772// }
773
774// if constexpr (scalarStart < lastDim)
775// {
776// using ScalarK = Microkernel<T, 1, GENERICARCH>;
777// for (my_size_t slice = 0; slice < numSlices; ++slice)
778// {
779// const my_size_t base = slice * paddedLastDim;
780// for (my_size_t i = scalarStart; i < lastDim; ++i)
781// {
782// T lhs_val = lhs.template evalu<T, 1, GENERICARCH>(base + i);
783// T rhs_val = rhs.template evalu<T, 1, GENERICARCH>(base + i);
784// if (ScalarK::abs(lhs_val - rhs_val) > tolerance)
785// return false;
786// }
787// }
788// }
789
790// return true;
791// }
792
793// /**
794// * @brief General path — one or both expressions are permuted.
795// *
796// * Iterates logical flat indices. Each expression's evalu handles
797// * its own remapping (gather for permuted, load for unpermuted).
798// *
799// * Uses the OUTPUT-side slice geometry to track logical_flat while
800// * skipping output padding gaps — same pattern as eval_vectorized_permuted.
801// * But since we're comparing (not storing), we use Expr1's padding policy
802// * as reference.
803// *
804// * TODO: Untested
805// *
806// * @tparam Expr1 First expression type
807// * @tparam Expr2 Second expression type
808// * @param lhs Left-hand side expression
809// * @param rhs Right-hand side expression
810// * @param tolerance Approximation tolerance for floating-point comparisons
811// * @return true if all corresponding logical elements are approximately equal within the given tolerance, false otherwise
812// */
813// template <typename Expr1, typename Expr2>
814// FORCE_INLINE static bool approx_equal_logical(
815// const Expr1 &lhs,
816// const Expr2 &rhs,
817// T tolerance) noexcept
818// {
819// // Use Expr1's logical dims to drive iteration
820// // (both must have same logical dims)
821// // TODO: enforce this at compile time? Or is it guaranteed anyways? think...!
822// static constexpr my_size_t logicalSize = Expr1::TotalSize;
823
824// using ScalarK = Microkernel<T, 1, GENERICARCH>;
825
826// // Pure scalar — each evalu handles its own layout remapping
827// for (my_size_t i = 0; i < logicalSize; ++i)
828// {
829// T lhs_val = lhs.template evalu<T, 1, GENERICARCH>(i);
830// T rhs_val = rhs.template evalu<T, 1, GENERICARCH>(i);
831// if (ScalarK::abs(lhs_val - rhs_val) > tolerance)
832// return false;
833// }
834
835// return true;
836// }
837
838// // ========================================================================
839// // REDUCTION DISPATCH
840// // ========================================================================
841
842// template <ReduceOp Op, typename Expr>
843// FORCE_INLINE static T reduce(const Expr &expr) noexcept
844// {
845// if constexpr (!expression::traits<Expr>::IsPermuted)
846// {
// 847// std::cout << "reduce_contiguous" << std::endl; // TODO(review): remove debug print before enabling — unconditional I/O in a FORCE_INLINE hot path, and <iostream> is not included
848// return reduce_contiguous<Op>(expr);
849// }
850// else
851// {
// 852// std::cout << "reduce_logical" << std::endl; // TODO(review): remove debug print before enabling — unconditional I/O in a FORCE_INLINE hot path, and <iostream> is not included
853// return reduce_logical<Op>(expr);
854// }
855// }
856
857// // ========================================================================
858// // CONTIGUOUS PATH — iterate physical slices, skip padding
859// // ========================================================================
860
861// template <ReduceOp Op, typename Expr>
862// FORCE_INLINE static T reduce_contiguous(const Expr &expr) noexcept
863// {
864// using ExprPadPolicy = typename Expr::Layout::PadPolicyType;
865
866// static constexpr my_size_t lastDim = ExprPadPolicy::LastDim;
867// static constexpr my_size_t paddedLastDim = ExprPadPolicy::PaddedLastDim;
868// static constexpr my_size_t numSlices = ExprPadPolicy::PhysicalSize / paddedLastDim;
869// static constexpr my_size_t simdSteps = lastDim / simdWidth;
870// static constexpr my_size_t scalarStart = simdSteps * simdWidth;
871
872// T result = reduce_identity<Op>();
873
874// if constexpr (simdSteps > 0)
875// {
876// typename K::VecType acc = K::set1(reduce_identity<Op>());
877
878// for (my_size_t slice = 0; slice < numSlices; ++slice)
879// {
880// const my_size_t base = slice * paddedLastDim;
881// for (my_size_t i = 0; i < simdSteps; ++i)
882// acc = reduce_simd_combine<Op>(
883// acc, expr.template evalu<T, Bits, Arch>(base + i * simdWidth));
884// }
885
886// alignas(DATA_ALIGNAS) T tmp[simdWidth];
887// K::store(tmp, acc);
888
889// for (my_size_t i = 0; i < simdWidth; ++i)
890// result = reduce_scalar_combine<Op>(result, tmp[i]);
891// }
892
893// if constexpr (scalarStart < lastDim)
894// {
895// for (my_size_t slice = 0; slice < numSlices; ++slice)
896// {
897// const my_size_t base = slice * paddedLastDim;
898// for (my_size_t i = scalarStart; i < lastDim; ++i)
899// result = reduce_scalar_combine<Op>(
900// result, expr.template evalu<T, 1, GENERICARCH>(base + i));
901// }
902// }
903
904// return result;
905// }
906
907// // ========================================================================
908// // LOGICAL PATH — iterate logical flat indices, scalar only
909// // ========================================================================
910// //
911// // For permuted views. Consecutive logical flats are non-contiguous
912// // in physical memory, so SIMD load would read wrong data.
913// // Scalar evalu handles the remapping internally.
914
915// template <ReduceOp Op, typename Expr>
916// FORCE_INLINE static T reduce_logical(const Expr &expr) noexcept
917// {
918// static constexpr my_size_t totalSize = Expr::TotalSize;
919
920// T result = reduce_identity<Op>();
921
922// for (my_size_t i = 0; i < totalSize; ++i)
923// result = reduce_scalar_combine<Op>(
924// result, expr.template evalu<T, 1, GENERICARCH>(i));
925
926// return result;
927// }
928
929// // ========================================================================
930// // Helper
931// // ========================================================================
932
933// FORCE_INLINE static typename K::VecType fmadd_safe(
934// typename K::VecType a,
935// typename K::VecType b,
936// typename K::VecType c) noexcept
937// {
938// if constexpr (requires { K::fmadd(a, b, c); })
939// {
940// return K::fmadd(a, b, c);
941// }
942// else
943// {
944// return K::add(K::mul(a, b), c);
945// }
946// }
947// };
948
949// #endif // KERNEL_OPS_H