tesseract++ 0.0.1
N-dimensional tensor library for embedded systems
Loading...
Searching...
No Matches
kernel_ops2.h
Go to the documentation of this file.
1// // Higher-level kernel operations built on top of microkernels
2// #ifndef KERNEL_OPS_H
3// #define KERNEL_OPS_H
// NOTE(review): the guard is KERNEL_OPS_H but this file is kernel_ops2.h — if this
// copy is ever re-enabled while the original kernel_ops.h still exists, the guards
// will collide and one header will be silently skipped. Rename the guard to
// KERNEL_OPS2_H (or switch to #pragma once) before uncommenting.
4
5// #include "config.h"
6// #include "fused/microkernels/microkernel_base.h"
7// #include "numeric_limits.h"
8
9// template <typename T, my_size_t Bits, typename Arch>
10// struct KernelOps
11// {
12// using K = Microkernel<T, Bits, Arch>;
13// static constexpr my_size_t simdWidth = K::simdWidth;
14
15// // ========================================================================
16// // Helper
17// // ========================================================================
18
19// FORCE_INLINE static typename K::VecType fmadd_safe(
20// typename K::VecType a,
21// typename K::VecType b,
22// typename K::VecType c) noexcept
23// {
24// if constexpr (requires { K::fmadd(a, b, c); })
25// {
26// return K::fmadd(a, b, c);
27// }
28// else
29// {
30// return K::add(K::mul(a, b), c);
31// }
32// }
33
34// // ========================================================================
35// // Evaluation
36// // ========================================================================
37
38// template <typename Expr>
39// FORCE_INLINE static void eval_vectorized_contiguous(
40// T *output,
41// const Expr &expr) noexcept
42// {
43// using Layout = typename Expr::Layout;
44// static constexpr my_size_t physicalSize = Layout::PhysicalSize;
45// // static constexpr my_size_t totalSize = Expr::TotalSize;
46// static constexpr my_size_t simdSteps = physicalSize / simdWidth;
47// static constexpr bool hasRemainder = (physicalSize % simdWidth) != 0;
48
49// // SIMD loop
50// for (my_size_t i = 0; i < simdSteps; ++i)
51// {
52// auto val = expr.template evalu<T, Bits, Arch>(i * simdWidth);
53// K::store(output + i * simdWidth, val);
54// }
55
56// // Scalar remainder. TODO: padding guarantees PhysicalSize is already a
// // multiple of simdWidth, so hasRemainder should always be false and this
// // branch should be dead code.
57// // Confirm the invariant mechanically (e.g. static_assert(!hasRemainder,
// // "padded layout must be a multiple of simdWidth")) and then delete this block.
58// if constexpr (hasRemainder)
59// {
60// std::cout << "Warning: Scalar evaluation for remainder elements." << std::endl;
// NOTE(review): no <iostream> include is visible in this header, and iostream is
// heavyweight for an embedded target; if this path survives, prefer a
// compile-time diagnostic (static_assert / #warning) over a runtime print —
// confirm against the project's logging conventions.
61// // for (my_size_t i = simdSteps * simdWidth; i < physicalSize; ++i)
62// // {
63// // output[i] = expr.template evalu<T, 1, GENERICARCH>(i);
64// // }
65// }
66// }
67
68// // ========================================================================
69// // Reductions
70// // ========================================================================
71
72// template <typename Expr>
73// FORCE_INLINE static T reduce_min(const Expr &expr) noexcept
74// {
75// static constexpr my_size_t totalSize = Expr::TotalSize;
76// static constexpr my_size_t simdSteps = totalSize / simdWidth;
77// static constexpr bool hasRemainder = (totalSize % simdWidth) != 0;
78
79// typename K::VecType acc = K::set1(NumericLimits<T>::max());
80
81// for (my_size_t i = 0; i < simdSteps; ++i)
82// {
83// acc = K::min(acc, expr.template evalu<T, Bits, Arch>(i * simdWidth));
84// }
85
86// // Horizontal reduction
87// alignas(DATA_ALIGNAS) T tmp[simdWidth];
88// K::store(tmp, acc);
89
90// T result = tmp[0];
91// for (my_size_t i = 1; i < simdWidth; ++i)
92// {
93// if (tmp[i] < result)
94// result = tmp[i];
95// }
96
97// if constexpr (hasRemainder)
98// {
99// for (my_size_t i = simdSteps * simdWidth; i < totalSize; ++i)
100// {
101// T val = expr.template evalu<T, 1, GENERICARCH>(i);
102// if (val < result)
103// result = val;
104// }
105// }
106
107// return result;
108// }
109
110// template <typename Expr>
111// FORCE_INLINE static T reduce_max(const Expr &expr) noexcept
112// {
113// static constexpr my_size_t totalSize = Expr::TotalSize;
114// static constexpr my_size_t simdSteps = totalSize / simdWidth;
115// static constexpr bool hasRemainder = (totalSize % simdWidth) != 0;
116
117// typename K::VecType acc = K::set1(NumericLimits<T>::lowest());
118
119// for (my_size_t i = 0; i < simdSteps; ++i)
120// {
121// acc = K::max(acc, expr.template evalu<T, Bits, Arch>(i * simdWidth));
122// }
123
124// // Horizontal reduction
125// alignas(DATA_ALIGNAS) T tmp[simdWidth];
126// K::store(tmp, acc);
127
128// T result = tmp[0];
129// for (my_size_t i = 1; i < simdWidth; ++i)
130// {
131// if (tmp[i] > result)
132// result = tmp[i];
133// }
134
135// if constexpr (hasRemainder)
136// {
137// for (my_size_t i = simdSteps * simdWidth; i < totalSize; ++i)
138// {
139// T val = expr.template evalu<T, 1, GENERICARCH>(i);
140// if (val > result)
141// result = val;
142// }
143// }
144
145// return result;
146// }
147
148// template <typename Expr>
149// FORCE_INLINE static T reduce_sum(const Expr &expr) noexcept
150// {
151// static constexpr my_size_t totalSize = Expr::TotalSize;
152// static constexpr my_size_t simdSteps = totalSize / simdWidth;
153// static constexpr bool hasRemainder = (totalSize % simdWidth) != 0;
154
155// typename K::VecType acc = K::set1(T{0});
156
157// for (my_size_t i = 0; i < simdSteps; ++i)
158// {
159// acc = K::add(acc, expr.template evalu<T, Bits, Arch>(i * simdWidth));
160// }
161
162// // Horizontal reduction
163// alignas(DATA_ALIGNAS) T tmp[simdWidth];
164// K::store(tmp, acc);
165
166// T result = tmp[0];
167// for (my_size_t i = 1; i < simdWidth; ++i)
168// {
169// result += tmp[i];
170// }
171
172// if constexpr (hasRemainder)
173// {
174// for (my_size_t i = simdSteps * simdWidth; i < totalSize; ++i)
175// {
176// result += expr.template evalu<T, 1, GENERICARCH>(i);
177// }
178// }
179
180// return result;
181// }
182
183// template <typename Expr1, typename Expr2>
184// FORCE_INLINE static bool reduce_all_approx_equal(
185// const Expr1 &lhs,
186// const Expr2 &rhs,
187// T tolerance) noexcept
188// {
189// static constexpr my_size_t totalSize = Expr1::TotalSize;
190// static constexpr my_size_t simdSteps = totalSize / simdWidth;
191// static constexpr bool hasRemainder = (totalSize % simdWidth) != 0;
192
193// for (my_size_t i = 0; i < simdSteps; ++i)
194// {
195// auto lhs_vec = lhs.template evalu<T, Bits, Arch>(i * simdWidth);
196// auto rhs_vec = rhs.template evalu<T, Bits, Arch>(i * simdWidth);
197// if (!K::all_within_tolerance(lhs_vec, rhs_vec, tolerance))
198// {
199// return false;
200// }
201// }
202
203// if constexpr (hasRemainder)
204// {
205// using ScalarK = Microkernel<T, 1, GENERICARCH>;
206// for (my_size_t i = simdSteps * simdWidth; i < totalSize; ++i)
207// {
208// T lhs_val = lhs.template evalu<T, 1, GENERICARCH>(i);
209// T rhs_val = rhs.template evalu<T, 1, GENERICARCH>(i);
210// T abs_diff = ScalarK::abs(lhs_val - rhs_val);
211// if (abs_diff > tolerance)
212// {
213// return false;
214// }
215// }
216// }
217
218// return true;
219// }
220
221// // ========================================================================
222// // Dot products (for einsum)
223// // ========================================================================
224
225// // Contiguous dot product - both strides along k are 1
226// template <typename Expr1, typename Expr2>
227// FORCE_INLINE static T dot_contiguous(
228// const Expr1 &expr1,
229// const Expr2 &expr2,
230// my_size_t base1,
231// my_size_t base2,
232// const my_size_t len) noexcept
233// {
234// const my_size_t steps = len / simdWidth;
235
236// typename K::VecType acc = K::set1(T{0});
237
238// for (my_size_t i = 0; i < steps; ++i)
239// {
240// auto v1 = expr1.template evalu<T, Bits, Arch>(base1 + i * simdWidth);
241// auto v2 = expr2.template evalu<T, Bits, Arch>(base2 + i * simdWidth);
242// acc = fmadd_safe(v1, v2, acc);
243// }
244
245// // Horizontal reduction
246// alignas(DATA_ALIGNAS) T tmp[simdWidth];
247// K::store(tmp, acc);
248
249// T result = tmp[0];
250// for (my_size_t i = 1; i < simdWidth; ++i)
251// {
252// result += tmp[i];
253// }
254
255// // Remainder
256// for (my_size_t i = steps * simdWidth; i < len; ++i)
257// {
258// T v1 = expr1.template evalu<T, 1, GENERICARCH>(base1 + i);
259// T v2 = expr2.template evalu<T, 1, GENERICARCH>(base2 + i);
260// result += v1 * v2;
261// }
262
263// return result;
264// }
265
266// // Strided dot product - scalar fallback
267// template <typename Expr1, typename Expr2>
268// FORCE_INLINE static T dot_strided_scalar(
269// const Expr1 &expr1,
270// const Expr2 &expr2,
271// my_size_t idx1,
272// my_size_t idx2,
273// const my_size_t stride1,
274// const my_size_t stride2,
275// const my_size_t len) noexcept
276// {
277// T sum = T{0};
278// for (my_size_t k = 0; k < len; ++k)
279// {
280// T v1 = expr1.template evalu<T, 1, GENERICARCH>(idx1);
281// T v2 = expr2.template evalu<T, 1, GENERICARCH>(idx2);
282// sum += v1 * v2;
283// idx1 += stride1;
284// idx2 += stride2;
285// }
286// return sum;
287// }
288
289// // Strided dot product - SIMD with gather
290// template <typename Expr1, typename Expr2>
291// FORCE_INLINE static T dot_strided_gather(
292// const Expr1 &expr1,
293// const Expr2 &expr2,
294// my_size_t idx1,
295// my_size_t idx2,
296// const my_size_t stride1,
297// const my_size_t stride2,
298// const my_size_t len) noexcept
299// {
300// const my_size_t steps = len / simdWidth;
301
302// typename K::VecType acc = K::set1(T{0});
303
304// for (my_size_t i = 0; i < steps; ++i)
305// {
306// auto v1 = expr1.template evalu_strided<T, Bits, Arch>(idx1, stride1);
307// auto v2 = expr2.template evalu_strided<T, Bits, Arch>(idx2, stride2);
308// acc = fmadd_safe(v1, v2, acc);
309
310// idx1 += simdWidth * stride1;
311// idx2 += simdWidth * stride2;
312// }
313
314// // Horizontal reduction
315// alignas(DATA_ALIGNAS) T tmp[simdWidth];
316// K::store(tmp, acc);
317
318// T result = tmp[0];
319// for (my_size_t i = 1; i < simdWidth; ++i)
320// {
321// result += tmp[i];
322// }
323
324// // Scalar remainder
325// for (my_size_t i = steps * simdWidth; i < len; ++i)
326// {
327// T v1 = expr1.template evalu<T, 1, GENERICARCH>(idx1);
328// T v2 = expr2.template evalu<T, 1, GENERICARCH>(idx2);
329// result += v1 * v2;
330// idx1 += stride1;
331// idx2 += stride2;
332// }
333
334// return result;
335// }
336// };
337
338// #endif // KERNEL_OPS_H