tesseract++
0.0.1
N-dimensional tensor library for embedded systems
Loading...
Searching...
No Matches
core
include
fused
kernel_ops
deprecated
kernel_ops.h
Go to the documentation of this file.
1
// // Higher-level kernel operations built on top of microkernels
2
// #ifndef KERNEL_OPS_H
3
// #define KERNEL_OPS_H
4
5
// #include "config.h"
6
// #include "fused/microkernels/microkernel_base.h"
7
// #include "numeric_limits.h"
8
// #include "helper_traits.h"
9
// #include "fused/padding_policies/simd_padding_policy.h"
10
// #include "expression_traits/expression_traits.h"
11
12
// template <typename T, my_size_t Bits, typename Arch>
13
// struct KernelOps
14
// {
15
// using K = Microkernel<T, Bits, Arch>;
16
// static constexpr my_size_t simdWidth = K::simdWidth;
17
18
// // ========================================================================
19
// // Evaluation
20
// // ========================================================================
21
22
// /**
23
// * @brief Dispatch: pick contiguous or permuted eval based on expression layout.
24
// *
25
// * @tparam Expr
26
// * @param output
27
// * @param expr
28
// */
29
// template <typename Expr>
30
// FORCE_INLINE static void eval(T *output, const Expr &expr) noexcept
31
// {
32
// if constexpr (!expression::traits<Expr>::IsPermuted)
33
// {
34
// // std::cout << "eval_contiguous" << std::endl;
35
// eval_vectorized_contiguous(output, expr);
36
// }
37
// else
38
// {
39
// // std::cout << "eval_permuted" << std::endl;
40
// eval_vectorized_permuted(output, expr);
41
// }
42
// }
43
44
// // // ========================================================================
45
// // // Reductions
46
// // // ========================================================================
47
48
// // /**
49
// // * @brief Find the minimum element across all logical elements of an expression.
50
// // *
51
// // * Iterates slice-by-slice over the physical buffer, where "slice" means a contiguous
52
// // * slice along the last dimension. This naturally skips padding without needing
53
// // * identity-element tricks.
54
// // *
55
// // * @tparam Expr Expression type (FusedTensorND, PermutedViewConstExpr, etc.)
56
// // * @param expr The expression to reduce
57
// // * @return T The minimum logical element
58
// // *
59
// // * ============================================================================
60
// // * STRATEGY
61
// // * ============================================================================
62
// // *
63
// // * Physical memory is organized as numSlices × paddedLastDim, where only the
64
// // * first lastDim elements per slice are logical data:
65
// // *
66
// // * slice 0: [d d d d d P P P]
67
// // * slice 1: [d d d d d P P P] d = data, P = padding
68
// // * slice 2: [d d d d d P P P]
69
// // * |← lastDim→|
70
// // * |← paddedLastDim →|
71
// // *
72
// // * For a 3D tensor [2, 3, 5], numSlices = 2*3 = 6:
73
// // * slice 0 = [0,0,:] slice 3 = [1,0,:]
74
// // * slice 1 = [0,1,:] slice 4 = [1,1,:]
75
// // * slice 2 = [0,2,:] slice 5 = [1,2,:]
76
// // *
77
// // * Per slice, SIMD processes simdWidth-aligned chunks, then a scalar tail
78
// // * handles the remainder. Padding is never read.
79
// // *
80
// // * ============================================================================
81
// // * EXAMPLE: FusedTensorND<double, 2, 3, 5>, AVX (SimdWidth=4)
82
// // * ============================================================================
83
// // *
84
// // * lastDim=5, paddedLastDim=8, numSlices=6, simdSteps=1, scalarStart=4
85
// // *
86
// // * Per slice (8 physical slots):
87
// // * [d0 d1 d2 d3 | d4 P P P ]
88
// // * ^^^^^^^^^^^ ^^
89
// // * SIMD (i=0) scalar tail
90
// // *
91
// // * ============================================================================
92
// // * EXAMPLE: FusedTensorND<double, 2, 2>, AVX (SimdWidth=4)
93
// // * ============================================================================
94
// // *
95
// // * lastDim=2, paddedLastDim=4, numSlices=2, simdSteps=0, scalarStart=0
96
// // *
97
// // * Per slice (4 physical slots):
98
// // * [d0 d1 P P ]
99
// // * ^^^^^
100
// // * scalar only (SIMD block skipped entirely)
101
// // *
102
// // * ============================================================================
103
// // * GENERICARCH (SimdWidth=1): no padding, simdSteps=lastDim, no scalar tail.
104
// // * Microkernel ops inline to plain scalar — same codegen as a manual loop.
105
// // * ============================================================================
106
// // */
107
// // template <typename Expr>
108
// // FORCE_INLINE static T reduce_min(const Expr &expr) noexcept
109
// // {
110
// // using ExprLayout = typename Expr::Layout;
111
// // using ExprPadPolicy = typename ExprLayout::PadPolicyType;
112
113
// // // Slice geometry from padding policy
114
// // static constexpr my_size_t lastDim = ExprPadPolicy::LastDim; // logical elements per slice
115
// // static constexpr my_size_t paddedLastDim = ExprPadPolicy::PaddedLastDim; // physical stride per slice
116
// // static constexpr my_size_t numSlices = ExprPadPolicy::PhysicalSize / paddedLastDim; // total slices across all dims
117
// // static constexpr my_size_t simdSteps = lastDim / simdWidth; // full SIMD chunks per slice
118
// // static constexpr my_size_t scalarStart = simdSteps * simdWidth; // where scalar tail begins
119
120
// // T result = NumericLimits<T>::max();
121
122
// // // --- SIMD path: process simdWidth elements at a time per slice ---
123
// // if constexpr (simdSteps > 0)
124
// // {
125
// // typename K::VecType acc = K::set1(NumericLimits<T>::max());
126
127
// // for (my_size_t slice = 0; slice < numSlices; ++slice)
128
// // {
129
// // const my_size_t base = slice * paddedLastDim; // physical offset of slice start
130
// // for (my_size_t i = 0; i < simdSteps; ++i)
131
// // acc = K::min(acc, expr.template evalu<T, Bits, Arch>(base + i * simdWidth));
132
// // }
133
134
// // // Horizontal reduction: collapse SIMD register to scalar
135
// // alignas(DATA_ALIGNAS) T tmp[simdWidth];
136
// // K::store(tmp, acc);
137
138
// // for (my_size_t i = 0; i < simdWidth; ++i)
139
// // {
140
// // if (tmp[i] < result)
141
// // result = tmp[i];
142
// // }
143
// // }
144
145
// // // --- Scalar tail: elements at [scalarStart, lastDim) per slice ---
146
// // if constexpr (scalarStart < lastDim)
147
// // {
148
// // for (my_size_t slice = 0; slice < numSlices; ++slice)
149
// // {
150
// // const my_size_t base = slice * paddedLastDim;
151
// // for (my_size_t i = scalarStart; i < lastDim; ++i)
152
// // {
153
// // T val = expr.template evalu<T, 1, GENERICARCH>(base + i);
154
// // if (val < result)
155
// // result = val;
156
// // }
157
// // }
158
// // }
159
160
// // return result;
161
// // }
162
163
// // /**
164
// // * @brief Find the maximum element across all logical elements of an expression.
165
// // *
166
// // * Iterates slice-by-slice over the physical buffer, where "slice" means a contiguous
167
// // * slice along the last dimension. This naturally skips padding without needing
168
// // * identity-element tricks.
169
// // *
170
// // * @tparam Expr Expression type (FusedTensorND, PermutedViewConstExpr, etc.)
171
// // * @param expr The expression to reduce
172
// // * @return T The maximum logical element
173
// // *
174
// // * ============================================================================
175
// // * STRATEGY
176
// // * ============================================================================
177
// // *
178
// // * Physical memory is organized as numSlices × paddedLastDim, where only the
179
// // * first lastDim elements per slice are logical data:
180
// // *
181
// // * slice 0: [d d d d d P P P]
182
// // * slice 1: [d d d d d P P P] d = data, P = padding
183
// // * slice 2: [d d d d d P P P]
184
// // * |← lastDim→|
185
// // * |← paddedLastDim →|
186
// // *
187
// // * For a 3D tensor [2, 3, 5], numSlices = 2*3 = 6:
188
// // * slice 0 = [0,0,:] slice 3 = [1,0,:]
189
// // * slice 1 = [0,1,:] slice 4 = [1,1,:]
190
// // * slice 2 = [0,2,:] slice 5 = [1,2,:]
191
// // *
192
// // * Per slice, SIMD processes simdWidth-aligned chunks, then a scalar tail
193
// // * handles the remainder. Padding is never read.
194
// // *
195
// // * ============================================================================
196
// // * GENERICARCH (SimdWidth=1): no padding, simdSteps=lastDim, no scalar tail.
197
// // * Microkernel ops inline to plain scalar — same codegen as a manual loop.
198
// // * ============================================================================
199
// // */
200
// // template <typename Expr>
201
// // FORCE_INLINE static T reduce_max(const Expr &expr) noexcept
202
// // {
203
// // using ExprLayout = typename Expr::Layout;
204
// // using ExprPadPolicy = typename ExprLayout::PadPolicyType;
205
206
// // // Slice geometry from padding policy
207
// // static constexpr my_size_t lastDim = ExprPadPolicy::LastDim; // logical elements per slice
208
// // static constexpr my_size_t paddedLastDim = ExprPadPolicy::PaddedLastDim; // physical stride per slice
209
// // static constexpr my_size_t numSlices = ExprPadPolicy::PhysicalSize / paddedLastDim; // total slices across all dims
210
// // static constexpr my_size_t simdSteps = lastDim / simdWidth; // full SIMD chunks per slice
211
// // static constexpr my_size_t scalarStart = simdSteps * simdWidth; // where scalar tail begins
212
213
// // T result = NumericLimits<T>::lowest();
214
215
// // // --- SIMD path: process simdWidth elements at a time per slice ---
216
// // if constexpr (simdSteps > 0)
217
// // {
218
// // typename K::VecType acc = K::set1(NumericLimits<T>::lowest());
219
220
// // for (my_size_t slice = 0; slice < numSlices; ++slice)
221
// // {
222
// // const my_size_t base = slice * paddedLastDim; // physical offset of slice start
223
// // for (my_size_t i = 0; i < simdSteps; ++i)
224
// // acc = K::max(acc, expr.template evalu<T, Bits, Arch>(base + i * simdWidth));
225
// // }
226
227
// // // Horizontal reduction: collapse SIMD register to scalar
228
// // alignas(DATA_ALIGNAS) T tmp[simdWidth];
229
// // K::store(tmp, acc);
230
231
// // for (my_size_t i = 0; i < simdWidth; ++i)
232
// // {
233
// // if (tmp[i] > result)
234
// // result = tmp[i];
235
// // }
236
// // }
237
238
// // // --- Scalar tail: elements at [scalarStart, lastDim) per slice ---
239
// // if constexpr (scalarStart < lastDim)
240
// // {
241
// // for (my_size_t slice = 0; slice < numSlices; ++slice)
242
// // {
243
// // const my_size_t base = slice * paddedLastDim;
244
// // for (my_size_t i = scalarStart; i < lastDim; ++i)
245
// // {
246
// // T val = expr.template evalu<T, 1, GENERICARCH>(base + i);
247
// // if (val > result)
248
// // result = val;
249
// // }
250
// // }
251
// // }
252
253
// // return result;
254
// // }
255
256
// // /**
257
// // * @brief Sum all logical elements of an expression.
258
// // *
259
// // * Iterates slice-by-slice over the physical buffer, skipping padding.
260
// // * Same strategy as reduce_min/reduce_max.
261
// // *
262
// // * ============================================================================
263
// // * NOTE: Padding is zero, so iterating the full physical buffer would give
264
// // * a correct sum. But slice-by-slice is used for consistency and because
265
// // * linear iteration over TotalSize crosses padding boundaries mid-SIMD-load.
266
// // * ============================================================================
267
// // *
268
// // * @tparam Expr Expression type (FusedTensorND, PermutedViewConstExpr, etc.)
269
// // * @param expr The expression to reduce
270
// // * @return T The sum of all logical elements
271
// // */
272
// // template <typename Expr>
273
// // FORCE_INLINE static T reduce_sum(const Expr &expr) noexcept
274
// // {
275
// // using ExprLayout = typename Expr::Layout;
276
// // using ExprPadPolicy = typename ExprLayout::PadPolicyType;
277
278
// // static constexpr my_size_t lastDim = ExprPadPolicy::LastDim;
279
// // static constexpr my_size_t paddedLastDim = ExprPadPolicy::PaddedLastDim;
280
// // static constexpr my_size_t numSlices = ExprPadPolicy::PhysicalSize / paddedLastDim;
281
// // static constexpr my_size_t simdSteps = lastDim / simdWidth;
282
// // static constexpr my_size_t scalarStart = simdSteps * simdWidth;
283
284
// // T result = T{0};
285
286
// // if constexpr (simdSteps > 0)
287
// // {
288
// // typename K::VecType acc = K::set1(T{0});
289
290
// // for (my_size_t slice = 0; slice < numSlices; ++slice)
291
// // {
292
// // const my_size_t base = slice * paddedLastDim;
293
// // for (my_size_t i = 0; i < simdSteps; ++i)
294
// // acc = K::add(acc, expr.template evalu<T, Bits, Arch>(base + i * simdWidth));
295
// // }
296
297
// // alignas(DATA_ALIGNAS) T tmp[simdWidth];
298
// // K::store(tmp, acc);
299
300
// // for (my_size_t i = 0; i < simdWidth; ++i)
301
// // result += tmp[i];
302
// // }
303
304
// // if constexpr (scalarStart < lastDim)
305
// // {
306
// // for (my_size_t slice = 0; slice < numSlices; ++slice)
307
// // {
308
// // const my_size_t base = slice * paddedLastDim;
309
// // for (my_size_t i = scalarStart; i < lastDim; ++i)
310
// // result += expr.template evalu<T, 1, GENERICARCH>(base + i);
311
// // }
312
// // }
313
314
// // return result;
315
// // }
316
317
// template <typename Expr>
318
// FORCE_INLINE static T reduce_min(const Expr &expr) noexcept
319
// {
320
// return reduce<ReduceOp::Min>(expr);
321
// }
322
323
// template <typename Expr>
324
// FORCE_INLINE static T reduce_max(const Expr &expr) noexcept
325
// {
326
// return reduce<ReduceOp::Max>(expr);
327
// }
328
329
// template <typename Expr>
330
// FORCE_INLINE static T reduce_sum(const Expr &expr) noexcept
331
// {
332
// return reduce<ReduceOp::Sum>(expr);
333
// }
334
335
// /**
336
// * @brief Check if all logical elements of two expressions are approximately equal.
337
// *
338
// * Dispatches based on layout compatibility:
339
// * - Both unpermuted, same padding → fast physical slice iteration
340
// * - Otherwise → logical flat iteration (handles any layout combination)
341
// *
342
// * Both expressions must have the same logical dimensions.
343
// * TODO: decide whether matching logical dimensions must be enforced at compile time here, or whether the caller already guarantees it.
344
// *
345
// * @tparam Expr1 First expression type
346
// * @tparam Expr2 Second expression type
347
// * @param lhs Left-hand side expression
348
// * @param rhs Right-hand side expression
349
// * @param tolerance Approximation tolerance for floating-point comparisons
350
// * @return true if all corresponding logical elements are approximately equal within the given tolerance, false otherwise
351
// */
352
// template <typename Expr1, typename Expr2>
353
// FORCE_INLINE static bool reduce_all_approx_equal(
354
// const Expr1 &lhs,
355
// const Expr2 &rhs,
356
// T tolerance) noexcept
357
// {
358
// if constexpr (expression::traits<Expr1>::IsContiguous &&
359
// expression::traits<Expr2>::IsContiguous)
360
// return approx_equal_contiguous(lhs, rhs, tolerance);
361
// else
362
// return approx_equal_logical(lhs, rhs, tolerance);
363
// }
364
365
// // ========================================================================
366
// // DOT PRODUCTS (used by einsum for contraction along a shared axis)
367
// // ========================================================================
368
// //
369
// // All functions take PHYSICAL offsets and strides from the layout.
370
// // The caller (einsum) computes these via Layout::stride() and
371
// // Layout::logical_coords_to_physical_flat().
372
// //
373
// // For C[i,j] = sum_k A[i,k] * B[k,j] with A[M,K] and B[K,N]:
374
// //
375
// // A's fiber along k (last dim): base=A.stride(0)*i, stride=1 → contiguous
376
// // B's fiber along k (first dim): base=j, stride=B.stride(0) → strided
377
// //
378
// // Physical memory for A[2,3] padded to [2,4]:
379
// // [a00 a01 a02 P | a10 a11 a12 P]
380
// // ^^^^^^^^^^^ ^^^^^^^^^^^
381
// // fiber i=0 fiber i=1 → contiguous, len=3
382
// //
383
// // Physical memory for B[3,2] padded to [3,4]:
384
// // [b00 b01 P P | b10 b11 P P | b20 b21 P P]
385
// // ^ ^ ^
386
// // fiber j=0, stride=4 → strided, len=3
387
388
// /**
389
// * @brief Dispatch dot product based on stride values.
390
// *
391
// * @param base1 Physical offset of first fiber's start
392
// * @param stride1 Physical stride along contraction axis (1 = contiguous)
393
// * @param base2 Physical offset of second fiber's start
394
// * @param stride2 Physical stride along contraction axis (1 = contiguous)
395
// * @param len Number of elements along contraction axis (logical dim)
396
// */
397
// template <typename Expr1, typename Expr2>
398
// FORCE_INLINE static T dot(
399
// const Expr1 &expr1, my_size_t base1, my_size_t stride1,
400
// const Expr2 &expr2, my_size_t base2, my_size_t stride2,
401
// my_size_t len) noexcept
402
// {
403
// if (stride1 == 1 && stride2 == 1)
404
// return dot_contiguous_impl(expr1, expr2, base1, base2, len);
405
// else
406
// return dot_strided_impl(expr1, expr2, base1, base2, stride1, stride2, len);
407
// }
408
409
// /**
410
// * @brief Contiguous dot product — both fibers have stride 1.
411
// *
412
// * Uses K::load for aligned SIMD access. len may not be a multiple
413
// * of simdWidth (e.g., logical last dim = 5, simdWidth = 4), so
414
// * a scalar remainder handles the tail.
415
// *
416
// * fiber1: [v0 v1 v2 v3 | v4] fiber2: [w0 w1 w2 w3 | w4]
417
// * ^^^^^^^^^^^ ^^ ^^^^^^^^^^^ ^^
418
// * SIMD scalar SIMD scalar
419
// */
420
// template <typename Expr1, typename Expr2>
421
// FORCE_INLINE static T dot_contiguous_impl(
422
// const Expr1 &expr1,
423
// const Expr2 &expr2,
424
// my_size_t base1,
425
// my_size_t base2,
426
// my_size_t len) noexcept
427
// {
428
// // std::cout << "dot_contiguous_impl" << std::endl;
429
// const my_size_t simdSteps = len / simdWidth;
430
// const my_size_t scalarStart = simdSteps * simdWidth;
431
432
// T result = T{0};
433
434
// if (simdSteps > 0)
435
// {
436
// typename K::VecType acc = K::set1(T{0});
437
438
// for (my_size_t i = 0; i < simdSteps; ++i)
439
// {
440
// auto v1 = expr1.template evalu<T, Bits, Arch>(base1 + i * simdWidth);
441
// auto v2 = expr2.template evalu<T, Bits, Arch>(base2 + i * simdWidth);
442
// acc = fmadd_safe(v1, v2, acc);
443
// }
444
445
// alignas(DATA_ALIGNAS) T tmp[simdWidth];
446
// K::store(tmp, acc);
447
448
// for (my_size_t i = 0; i < simdWidth; ++i)
449
// result += tmp[i];
450
// }
451
452
// for (my_size_t i = scalarStart; i < len; ++i)
453
// {
454
// T v1 = expr1.template evalu<T, 1, GENERICARCH>(base1 + i);
455
// T v2 = expr2.template evalu<T, 1, GENERICARCH>(base2 + i);
456
// result += v1 * v2;
457
// }
458
459
// return result;
460
// }
461
462
// /**
463
// * @brief Strided dot product — one or both fibers have stride > 1.
464
// *
465
// * Builds explicit index lists and uses K::gather for SIMD access.
466
// * Falls back to scalar for remainder.
467
// *
468
// * fiber along dim 0 of B[3,2] padded to [3,4]:
469
// * [b00 _ _ _ | b10 _ _ _ | b20 _ _ _]
470
// * ^ ^ ^
471
// * idx=0 idx=4 idx=8 stride=4, len=3
472
// */
473
// template <typename Expr1, typename Expr2>
474
// FORCE_INLINE static T dot_strided_impl(
475
// const Expr1 &expr1,
476
// const Expr2 &expr2,
477
// my_size_t idx1,
478
// my_size_t idx2,
479
// my_size_t stride1,
480
// my_size_t stride2,
481
// my_size_t len) noexcept
482
// {
483
// // std::cout << "dot_strided_impl" << std::endl;
484
// const my_size_t simdSteps = len / simdWidth;
485
// const my_size_t scalarStart = simdSteps * simdWidth;
486
487
// T result = T{0};
488
489
// if (simdSteps > 0)
490
// {
491
// typename K::VecType acc = K::set1(T{0});
492
493
// for (my_size_t i = 0; i < simdSteps; ++i)
494
// {
495
// // Build gather indices for this chunk
496
// my_size_t idxList1[simdWidth];
497
// my_size_t idxList2[simdWidth];
498
// for (my_size_t j = 0; j < simdWidth; ++j)
499
// {
500
// idxList1[j] = idx1 + j * stride1;
501
// idxList2[j] = idx2 + j * stride2;
502
// }
503
504
// auto v1 = K::gather(expr1.data(), idxList1);
505
// auto v2 = K::gather(expr2.data(), idxList2);
506
// acc = fmadd_safe(v1, v2, acc);
507
508
// idx1 += simdWidth * stride1;
509
// idx2 += simdWidth * stride2;
510
// }
511
512
// alignas(DATA_ALIGNAS) T tmp[simdWidth];
513
// K::store(tmp, acc);
514
515
// for (my_size_t i = 0; i < simdWidth; ++i)
516
// result += tmp[i];
517
// }
518
519
// // Scalar tail
520
// for (my_size_t i = scalarStart; i < len; ++i)
521
// {
522
// T v1 = expr1.template evalu<T, 1, GENERICARCH>(idx1);
523
// T v2 = expr2.template evalu<T, 1, GENERICARCH>(idx2);
524
// result += v1 * v2;
525
// idx1 += stride1;
526
// idx2 += stride2;
527
// }
528
529
// return result;
530
// }
531
532
// /**
533
// * @brief Naive scalar dot product for testing/validation.
534
// *
535
// * Accesses physical memory directly via data_.data().
536
// * Only used in tests to verify SIMD dot results.
537
// */
538
// template <typename Expr1, typename Expr2>
539
// FORCE_INLINE static T naive_dot_physical(
540
// const Expr1 &expr1, my_size_t base1, my_size_t stride1,
541
// const Expr2 &expr2, my_size_t base2, my_size_t stride2,
542
// my_size_t len) noexcept
543
// {
544
// T sum = T{0};
545
// for (my_size_t i = 0; i < len; ++i)
546
// sum += expr1.data_.data()[base1 + i * stride1] *
547
// expr2.data_.data()[base2 + i * stride2];
548
// return sum;
549
// }
550
551
// private:
552
// enum class ReduceOp
553
// {
554
// Min,
555
// Max,
556
// Sum
557
// };
558
559
// template <ReduceOp Op>
560
// FORCE_INLINE static T reduce_identity() noexcept
561
// {
562
// if constexpr (Op == ReduceOp::Min)
563
// return NumericLimits<T>::max();
564
// if constexpr (Op == ReduceOp::Max)
565
// return NumericLimits<T>::lowest();
566
// if constexpr (Op == ReduceOp::Sum)
567
// return T{0};
568
// }
569
570
// template <ReduceOp Op>
571
// FORCE_INLINE static typename K::VecType reduce_simd_combine(
572
// typename K::VecType a, typename K::VecType b) noexcept
573
// {
574
// if constexpr (Op == ReduceOp::Min)
575
// return K::min(a, b);
576
// if constexpr (Op == ReduceOp::Max)
577
// return K::max(a, b);
578
// if constexpr (Op == ReduceOp::Sum)
579
// return K::add(a, b);
580
// }
581
582
// template <ReduceOp Op>
583
// FORCE_INLINE static T reduce_scalar_combine(T a, T b) noexcept
584
// {
585
// using ScalarK = Microkernel<T, 1, GENERICARCH>;
586
// if constexpr (Op == ReduceOp::Min)
587
// return ScalarK::min(a, b);
588
// if constexpr (Op == ReduceOp::Max)
589
// return ScalarK::max(a, b);
590
// if constexpr (Op == ReduceOp::Sum)
591
// return ScalarK::add(a, b);
592
// }
593
// /**
594
// * @brief CONTIGUOUS PATH (identity layout) — the fast path: no permutation, no remapping.
595
// *
596
// * Iterates entire physical buffer linearly. PhysicalSize is guaranteed
597
// * to be a multiple of simdWidth by the padding policy. Padding
598
// * slots contain zeros — harmless for element-wise ops.
599
// *
600
// * evalu receives physical flat offsets.
601
// *
602
// * Note: FusedTensorND::evalu works for both paths because with identity layout,
603
// * physical flat and logical flat are the same within each slice
604
// * (the contiguous path passes physical offsets,
605
// * the permuted path passes logical flats,
606
// * same numbers when unpermuted).
607
// *
608
// * @tparam Expr Expression type (FusedTensorND, PermutedViewConstExpr, etc.)
609
// * @param output Pointer to output buffer (already allocated, size = PhysicalSize)
610
// * @param expr The expression to evaluate and store in output
611
// */
612
// template <typename Expr>
613
// FORCE_INLINE static void eval_vectorized_contiguous(
614
// T *output,
615
// const Expr &expr) noexcept
616
// {
617
// using Layout = typename Expr::Layout;
618
// static constexpr my_size_t physicalSize = Layout::PhysicalSize;
619
// static constexpr my_size_t simdSteps = physicalSize / simdWidth;
620
// static constexpr bool hasRemainder = (physicalSize % simdWidth) != 0;
621
622
// // Paranoia check: ensure physical size is a multiple of SIMD width,
623
// // so we never read out of bounds
624
// static_assert(physicalSize % simdWidth == 0,
625
// "PhysicalSize must be a multiple of SimdWidth");
626
627
// // SIMD loop
628
// for (my_size_t i = 0; i < simdSteps; ++i)
629
// {
630
// auto val = expr.template evalu<T, Bits, Arch>(i * simdWidth);
631
// K::store(output + i * simdWidth, val);
632
// }
633
634
// // Scalar remainder TODO: The whole point of padding is that PhysicalSize is already
635
// // a multiple of SimdWidth — so there's no scalar remainder
636
// // Delete this code if confirmed unnecessary
637
// if constexpr (hasRemainder)
638
// {
639
// std::cout << "Warning: Scalar evaluation for remainder elements." << std::endl;
640
// // for (my_size_t i = simdSteps * simdWidth; i < physicalSize; ++i)
641
// // {
642
// // output[i] = expr.template evalu<T, 1, GENERICARCH>(i);
643
// // }
644
// }
645
// }
646
647
// /**
648
// * @brief PERMUTED PATH — the general path: works with any layout, including permuted.
649
// * TODO: untested
650
// * Iterates output physical slices for aligned stores. Tracks a running
651
// * logical_flat for the expression. Permuted evalu uses K::gather,
652
// * unpermuted uses K::load — dispatch happens inside evalu, not here.
653
// *
654
// * ============================================================================
655
// * EXAMPLE: output [3,2] padded to [3,4], source [2,3] transposed
656
// * ============================================================================
657
// *
658
// * lastDim=2, paddedLastDim=4, numSlices=3, simdSteps=0, scalarStart=0
659
// *
660
// * slice 0: out_base=0, logical_flat 0,1 → store at output[0], output[1]
661
// * slice 1: out_base=4, logical_flat 2,3 → store at output[4], output[5]
662
// * slice 2: out_base=8, logical_flat 4,5 → store at output[8], output[9]
663
// *
664
// * Padding at output[2,3,6,7,10,11] untouched.
665
// * ============================================================================
666
// */
667
668
// template <typename Expr, typename Seq>
669
// struct OutputPadImpl
670
// {
671
// };
672
673
// template <typename Expr, my_size_t... Is>
674
// struct OutputPadImpl<Expr, index_seq<Is...>>
675
// {
676
// using type = SimdPaddingPolicy<typename Expr::value_type, Expr::Dim[Is]...>;
677
// };
678
679
// template <typename Expr>
680
// struct OutputPadPolicy
681
// {
682
// using type = typename OutputPadImpl<Expr, typename make_index_seq<Expr::NumDims>::type>::type;
683
// };
684
685
// template <typename Expr>
686
// FORCE_INLINE static void eval_vectorized_permuted(
687
// T *output,
688
// const Expr &expr) noexcept
689
// {
690
// // using ExprPadPolicy = typename Expr::Layout::PadPolicyType;
691
692
// // using OutputPad = SimdPaddingPolicy<T, Expr::Layout::PadPolicyType::LogicalDims...>;
693
// // using OutputPad = OutputPadPolicy<typename Expr::Layout>;
694
695
// using OutputPad = typename OutputPadPolicy<Expr>::type;
696
697
// // static constexpr my_size_t lastDim = ExprPadPolicy::LastDim;
698
// // static constexpr my_size_t paddedLastDim = ExprPadPolicy::PaddedLastDim;
699
// // static constexpr my_size_t numSlices = ExprPadPolicy::PhysicalSize / paddedLastDim;
700
701
// static constexpr my_size_t lastDim = OutputPad::LastDim;
702
// static constexpr my_size_t paddedLastDim = OutputPad::PaddedLastDim;
703
// static constexpr my_size_t numSlices = OutputPad::PhysicalSize / paddedLastDim;
704
705
// static constexpr my_size_t simdSteps = lastDim / simdWidth;
706
// static constexpr my_size_t scalarStart = simdSteps * simdWidth;
707
708
// my_size_t logical_flat = 0;
709
710
// for (my_size_t slice = 0; slice < numSlices; ++slice)
711
// {
712
// const my_size_t out_base = slice * paddedLastDim;
713
714
// for (my_size_t i = 0; i < simdSteps; ++i)
715
// {
716
// auto val = expr.template evalu<T, Bits, Arch>(logical_flat);
717
// K::store(output + out_base + i * simdWidth, val);
718
// logical_flat += simdWidth;
719
// }
720
721
// if constexpr (scalarStart < lastDim)
722
// {
723
// for (my_size_t i = scalarStart; i < lastDim; ++i)
724
// {
725
// output[out_base + i] = expr.template evalu<T, 1, GENERICARCH>(logical_flat);
726
// ++logical_flat;
727
// }
728
// }
729
// }
730
// }
731
732
// /**
733
// * @brief Fast path — both expressions share the same physical layout.
734
// *
735
// * Iterates physical slices. evalu receives physical offsets.
736
// * Same slice geometry for both sides.
737
// *
738
// * @tparam Expr1 First expression type
739
// * @tparam Expr2 Second expression type
740
// * @param lhs Left-hand side expression
741
// * @param rhs Right-hand side expression
742
// * @param tolerance Approximation tolerance for floating-point comparisons
743
// * @return true if all corresponding logical elements are approximately equal within the given tolerance, false otherwise
744
// */
745
// template <typename Expr1, typename Expr2>
746
// FORCE_INLINE static bool approx_equal_contiguous(
747
// const Expr1 &lhs,
748
// const Expr2 &rhs,
749
// T tolerance) noexcept
750
// {
751
// using ExprPadPolicy = typename Expr1::Layout::PadPolicyType;
752
753
// static constexpr my_size_t lastDim = ExprPadPolicy::LastDim;
754
// static constexpr my_size_t paddedLastDim = ExprPadPolicy::PaddedLastDim;
755
// static constexpr my_size_t numSlices = ExprPadPolicy::PhysicalSize / paddedLastDim;
756
// static constexpr my_size_t simdSteps = lastDim / simdWidth;
757
// static constexpr my_size_t scalarStart = simdSteps * simdWidth;
758
759
// if constexpr (simdSteps > 0)
760
// {
761
// for (my_size_t slice = 0; slice < numSlices; ++slice)
762
// {
763
// const my_size_t base = slice * paddedLastDim;
764
// for (my_size_t i = 0; i < simdSteps; ++i)
765
// {
766
// auto lhs_vec = lhs.template evalu<T, Bits, Arch>(base + i * simdWidth);
767
// auto rhs_vec = rhs.template evalu<T, Bits, Arch>(base + i * simdWidth);
768
// if (!K::all_within_tolerance(lhs_vec, rhs_vec, tolerance))
769
// return false;
770
// }
771
// }
772
// }
773
774
// if constexpr (scalarStart < lastDim)
775
// {
776
// using ScalarK = Microkernel<T, 1, GENERICARCH>;
777
// for (my_size_t slice = 0; slice < numSlices; ++slice)
778
// {
779
// const my_size_t base = slice * paddedLastDim;
780
// for (my_size_t i = scalarStart; i < lastDim; ++i)
781
// {
782
// T lhs_val = lhs.template evalu<T, 1, GENERICARCH>(base + i);
783
// T rhs_val = rhs.template evalu<T, 1, GENERICARCH>(base + i);
784
// if (ScalarK::abs(lhs_val - rhs_val) > tolerance)
785
// return false;
786
// }
787
// }
788
// }
789
790
// return true;
791
// }
792
793
// /**
// * @brief General path — one or both expressions are permuted.
// *
// * Iterates logical flat indices. Each expression's evalu handles
// * its own remapping (gather for permuted, load for unpermuted).
// *
// * Uses the OUTPUT-side slice geometry to track logical_flat while
// * skipping output padding gaps — same pattern as eval_vectorized_permuted.
// * But since we're comparing (not storing), we use Expr1's padding policy
// * as reference.
// *
// * TODO: Untested
// *
// * @tparam Expr1 First expression type
// * @tparam Expr2 Second expression type
// * @param lhs Left-hand side expression
// * @param rhs Right-hand side expression
// * @param tolerance Approximation tolerance for floating-point comparisons
// * @return true if all corresponding logical elements are approximately equal within the given tolerance, false otherwise
// */
// template <typename Expr1, typename Expr2>
// FORCE_INLINE static bool approx_equal_logical(
// const Expr1 &lhs,
// const Expr2 &rhs,
// T tolerance) noexcept
// {
// // Use Expr1's logical dims to drive iteration
// // (both must have same logical dims)
// // TODO: enforce this at compile time? Or is it guaranteed anyways? think...!
// static constexpr my_size_t logicalSize = Expr1::TotalSize;

// using ScalarK = Microkernel<T, 1, GENERICARCH>;

// // Pure scalar — each evalu handles its own layout remapping
// for (my_size_t i = 0; i < logicalSize; ++i)
// {
// T lhs_val = lhs.template evalu<T, 1, GENERICARCH>(i);
// T rhs_val = rhs.template evalu<T, 1, GENERICARCH>(i);
// if (ScalarK::abs(lhs_val - rhs_val) > tolerance)
// return false;
// }

// return true;
// }

// // ========================================================================
// // REDUCTION DISPATCH
// // ========================================================================

// /**
// * @brief Dispatch: pick contiguous or logical reduction based on expression layout.
// *
// * Contiguous (non-permuted) expressions take the SIMD slice-wise path;
// * permuted ones fall back to scalar logical-index iteration.
// *
// * @tparam Op Reduction operation
// * @tparam Expr Expression type
// * @param expr Expression to reduce
// * @return Reduced scalar value
// */
// template <ReduceOp Op, typename Expr>
// FORCE_INLINE static T reduce(const Expr &expr) noexcept
// {
// if constexpr (!expression::traits<Expr>::IsPermuted)
// {
// // std::cout << "reduce_contiguous" << std::endl;
// return reduce_contiguous<Op>(expr);
// }
// else
// {
// // std::cout << "reduce_logical" << std::endl;
// return reduce_logical<Op>(expr);
// }
// }

// // ========================================================================
// // CONTIGUOUS PATH — iterate physical slices, skip padding
// // ========================================================================

// /**
// * @brief Reduce all logical elements of a contiguous (non-permuted) expression.
// *
// * Iterates slice-by-slice, where a slice is one padded run of the last
// * dimension: a SIMD pass over the vectorizable prefix of each slice, then
// * a scalar pass over the remainder. Elements in the padding region
// * [lastDim, paddedLastDim) are never read, so no masking is needed.
// *
// * @tparam Op Reduction operation
// * @tparam Expr Expression type
// * @param expr Expression to reduce
// * @return Reduced scalar value
// */
// template <ReduceOp Op, typename Expr>
// FORCE_INLINE static T reduce_contiguous(const Expr &expr) noexcept
// {
// using ExprPadPolicy = typename Expr::Layout::PadPolicyType;

// static constexpr my_size_t lastDim = ExprPadPolicy::LastDim;
// static constexpr my_size_t paddedLastDim = ExprPadPolicy::PaddedLastDim;
// static constexpr my_size_t numSlices = ExprPadPolicy::PhysicalSize / paddedLastDim;
// static constexpr my_size_t simdSteps = lastDim / simdWidth;
// static constexpr my_size_t scalarStart = simdSteps * simdWidth;

// T result = reduce_identity<Op>();

// if constexpr (simdSteps > 0)
// {
// // Vector accumulator seeded with the op's identity in every lane.
// typename K::VecType acc = K::set1(reduce_identity<Op>());

// for (my_size_t slice = 0; slice < numSlices; ++slice)
// {
// const my_size_t base = slice * paddedLastDim;
// for (my_size_t i = 0; i < simdSteps; ++i)
// acc = reduce_simd_combine<Op>(
// acc, expr.template evalu<T, Bits, Arch>(base + i * simdWidth));
// }

// // Horizontal reduction: spill the vector accumulator and fold lanes.
// alignas(DATA_ALIGNAS) T tmp[simdWidth];
// K::store(tmp, acc);

// for (my_size_t i = 0; i < simdWidth; ++i)
// result = reduce_scalar_combine<Op>(result, tmp[i]);
// }

// if constexpr (scalarStart < lastDim)
// {
// // Scalar tail of each slice (elements not covered by full SIMD steps).
// for (my_size_t slice = 0; slice < numSlices; ++slice)
// {
// const my_size_t base = slice * paddedLastDim;
// for (my_size_t i = scalarStart; i < lastDim; ++i)
// result = reduce_scalar_combine<Op>(
// result, expr.template evalu<T, 1, GENERICARCH>(base + i));
// }
// }

// return result;
// }

// // ========================================================================
// // LOGICAL PATH — iterate logical flat indices, scalar only
// // ========================================================================
// //
// // For permuted views. Consecutive logical flats are non-contiguous
// // in physical memory, so SIMD load would read wrong data.
// // Scalar evalu handles the remapping internally.

// /**
// * @brief Reduce a permuted expression by scalar iteration over logical flat indices.
// *
// * @tparam Op Reduction operation
// * @tparam Expr Expression type
// * @param expr Expression to reduce
// * @return Reduced scalar value
// */
// template <ReduceOp Op, typename Expr>
// FORCE_INLINE static T reduce_logical(const Expr &expr) noexcept
// {
// static constexpr my_size_t totalSize = Expr::TotalSize;

// T result = reduce_identity<Op>();

// for (my_size_t i = 0; i < totalSize; ++i)
// result = reduce_scalar_combine<Op>(
// result, expr.template evalu<T, 1, GENERICARCH>(i));

// return result;
// }

// // ========================================================================
// // Helper
// // ========================================================================

// /**
// * @brief Fused multiply-add with graceful fallback: computes a * b + c.
// *
// * Uses the microkernel's native fmadd when the architecture provides one;
// * otherwise falls back to separate multiply and add. NOTE: the fallback is
// * not a true fused operation — it rounds twice, so results may differ in
// * the last ulp from a hardware FMA.
// *
// * @param a Multiplicand vector
// * @param b Multiplier vector
// * @param c Addend vector
// * @return Element-wise a * b + c
// */
// FORCE_INLINE static typename K::VecType fmadd_safe(
// typename K::VecType a,
// typename K::VecType b,
// typename K::VecType c) noexcept
// {
// // The requires-expression probe needs C++20.
// if constexpr (requires { K::fmadd(a, b, c); })
// {
// return K::fmadd(a, b, c);
// }
// else
// {
// return K::add(K::mul(a, b), c);
// }
// }
// };

// #endif // KERNEL_OPS_H

// (Text recovered from a Doxygen-generated page, Doxygen 1.9.8)