tesseract++ 0.0.1
N-dimensional tensor library for embedded systems
Loading...
Searching...
No Matches
kernel_ops1.h
Go to the documentation of this file.
1// // Higher-level kernel operations built on top of microkernels
2// #ifndef KERNEL_OPS_H
3// #define KERNEL_OPS_H
4
5// #include "config.h"
6// #include "fused/microkernels/microkernel_base.h"
7// #include "numeric_limits.h"
8
9// // template <typename T, my_size_t Bits, typename Arch, my_size_t... Dims>
10// // struct TensorKernels
11// // {
12// // using K = Microkernel<T, Bits, Arch>;
13// // static constexpr my_size_t simdWidth = K::simdWidth;
14// // static constexpr my_size_t dimCount = sizeof...(Dims);
15// // static constexpr my_size_t totalSize = (Dims * ...); // product of all dims
16
17// // // ========================================================================
18// // // ELEMENT-WISE OPERATIONS
19// // // ========================================================================
20
21// // // Vectorized evaluation with contiguous storage assumption (aka the result tensor is not transposed)
22// // template <typename Expr>
23// // FORCE_INLINE static void eval_vectorized_contiguous(
24// // T *output,
25// // const Expr &expr,
26// // auto &&unravelIndexfn) noexcept
27// // {
28
29// // const my_size_t simdSteps = totalSize / simdWidth;
30
31// // // SIMD loop
32// // for (my_size_t i = 0; i < simdSteps; ++i)
33// // {
34// // auto val = expr.template evalu<T, Bits, Arch>(i * simdWidth);
35// // // Contiguous case — fast path
36// // K::store(output + i * simdWidth, val);
37// // }
38
39// // if constexpr ((totalSize % simdWidth) != 0) // TODO: verify if constexpr works here
40// // // in theory the compiler should be able to optimize out this branch if totalSize is known at compile time to be multiple of simdWidth
41// // // but better be safe than sorry, with constexpr we ensure no runtime overhead if not needed
42// // {
43// // // Scalar remainder
44// // my_size_t indices[dimCount];
45// // for (my_size_t i = simdSteps * simdWidth; i < totalSize; ++i)
46// // {
47// // std::forward<decltype(unravelIndexfn)>(unravelIndexfn)(i, indices); // TODO: get rid of std
48// // // evaluate the remainder
49// // output[i] = expr(indices);
50// // }
51// // }
52// // }
53
54// // // Vectorized evaluation with non-contiguous storage (aka the result tensor is transposed)
55// // // In other words this algorithm uses scatter, which means the result is saved in non-contiguous memory
56// // // TODO: not tested yet
57// // // TODO: not sure if this is needed at all
58// // // template <typename Expr>
59// // // FORCE_INLINE static void eval_vectorized_non_contiguous(
60// // // T *output,
61// // // const Expr &expr,
62// // // my_size_t (&transposeOrder)[dimCount]) noexcept
63// // // {
64
65// // // const my_size_t simdSteps = totalSize / simdWidth;
66
67// // // // SIMD loop
68// // // for (my_size_t i = 0; i < simdSteps; ++i)
69// // // {
70// // // my_size_t baseIdx = i * simdWidth;
71
72// // // auto val = expr.evalu(i * simdWidth);
73
74// // // // Non-contiguous (result tensor is transposed) case
75// // // my_size_t idxList[simdWidth];
76// // // for (int j = 0; j < simdWidth; ++j)
77// // // idxList[j] = remapFlatIndex(baseIdx + j, transposeOrder);
78// // // K::scatter(output, idxList, val);
79// // // }
80
81// // // // Scalar remainder TODO: this is wrong? — need to remap indices here as well?
82// // // for (my_size_t i = simdSteps * simdWidth; i < totalSize; ++i)
83// // // {
84// // // // std::cout << "Scalar remainder loop" << std::endl;
85// // // my_size_t indices[dimCount];
86// // // unravelIndex(i, indices, transposeOrder);
87// // // output[i] = expr(indices);
88// // // }
89// // // }
90
91// // template <typename Expr>
92// // FORCE_INLINE static void eval_scalar(
93// // T *output,
94// // const Expr &expr,
95// // auto &&unravelIndexfn) noexcept
96// // {
97// // // Pure scalar fallback
98// // for (my_size_t i = 0; i < totalSize; ++i)
99// // {
100// // my_size_t indices[dimCount];
101// // std::forward<decltype(unravelIndexfn)>(unravelIndexfn)(i, indices); // TODO: get rid of std
102// // output[i] = expr(indices);
103// // }
104// // }
105// // };
106
107// template <typename Expr, my_size_t Bits, typename Arch>
108// struct KernelOps
109// {
110// using T = typename Expr::value_type;
111// using K = Microkernel<T, Bits, Arch>;
112
113// static constexpr my_size_t simdWidth = K::simdWidth;
114// static constexpr my_size_t numDims = Expr::NumDims;
115// static constexpr my_size_t totalSize = Expr::TotalSize;
116// static constexpr my_size_t simdSteps = totalSize / simdWidth;
117// static constexpr bool hasRemainder = (totalSize % simdWidth) != 0;
118
119// FORCE_INLINE static void eval_vectorized_contiguous(
120// T *output,
121// const Expr &expr) noexcept
122// {
123// // SIMD loop
124// for (my_size_t i = 0; i < simdSteps; ++i)
125// {
126// // for GENERICARCH, Bits does not matter, simdWidth=1,
127// // so this works for both scalar and vectorized cases
128// auto val = expr.template evalu<T, Bits, Arch>(i * simdWidth);
129// K::store(output + i * simdWidth, val);
130// }
131
132// // Scalar remainder
133// if constexpr (hasRemainder)
134// {
135// for (my_size_t i = simdSteps * simdWidth; i < totalSize; ++i)
136// {
137// // fallback to scalar evaluation using GENERICARCH microkernel
138// // the 1 here is skipped since in scalar mode simdWidth=1
139// output[i] = expr.template evalu<T, 1, GENERICARCH>(i);
140// }
141// }
142// }
143
144// FORCE_INLINE static T reduce_min(
145// const Expr &expr) noexcept
146// {
147// typename K::VecType acc = K::set1(NumericLimits<T>::max());
148
149// // SIMD loop
150// for (my_size_t i = 0; i < simdSteps; ++i)
151// {
152// acc = K::min(acc, expr.template evalu<T, Bits, Arch>(i * simdWidth));
153// }
154
155// // Horizontal reduction
156// alignas(DATA_ALIGNAS) T tmp[simdWidth];
157// K::store(tmp, acc);
158
159// T result = tmp[0];
160// for (my_size_t i = 1; i < simdWidth; ++i)
161// {
162// if (tmp[i] < result)
163// result = tmp[i];
164// }
165
166// if constexpr (hasRemainder)
167// {
168// for (my_size_t i = simdSteps * simdWidth; i < totalSize; ++i)
169// {
170// T val = expr.template evalu<T, 1, GENERICARCH>(i);
171// if (val < result)
172// result = val;
173// }
174// }
175
176// return result;
177// }
178
179// FORCE_INLINE static T reduce_max(
180// const Expr &expr) noexcept
181// {
182// typename K::VecType acc = K::set1(NumericLimits<T>::lowest());
183
184// // SIMD loop
185// for (my_size_t i = 0; i < simdSteps; ++i)
186// {
187// acc = K::max(acc, expr.template evalu<T, Bits, Arch>(i * simdWidth));
188// }
189
190// // Horizontal reduction
191// alignas(DATA_ALIGNAS) T tmp[simdWidth];
192// K::store(tmp, acc);
193
194// T result = tmp[0];
195// for (my_size_t i = 1; i < simdWidth; ++i)
196// {
197// if (tmp[i] > result)
198// result = tmp[i];
199// }
200
201// // Scalar remainder
202// if constexpr (hasRemainder)
203// {
204// for (my_size_t i = simdSteps * simdWidth; i < totalSize; ++i)
205// {
206// T val = expr.template evalu<T, 1, GENERICARCH>(i);
207// if (val > result)
208// result = val;
209// }
210// }
211
212// return result;
213// }
214
215// FORCE_INLINE static T reduce_sum(
216// const Expr &expr) noexcept
217// {
218// typename K::VecType acc = K::set1(T{0});
219
220// // SIMD loop
221// for (my_size_t i = 0; i < simdSteps; ++i)
222// {
223// acc = K::add(acc, expr.template evalu<T, Bits, Arch>(i * simdWidth));
224// }
225
226// // Horizontal reduction
227// alignas(DATA_ALIGNAS) T tmp[simdWidth];
228// K::store(tmp, acc);
229
230// T result = tmp[0];
231// for (my_size_t i = 1; i < simdWidth; ++i)
232// {
233// result += tmp[i];
234// }
235
236// // Scalar remainder
237// if constexpr (hasRemainder)
238// {
239// for (my_size_t i = simdSteps * simdWidth; i < totalSize; ++i)
240// {
241// result += expr.template evalu<T, 1, GENERICARCH>(i);
242// }
243// }
244
245// return result;
246// }
247
248// template <typename ExprLHS, typename ExprRHS>
249// FORCE_INLINE static bool reduce_all_approx_equal(
250// const ExprLHS &lhs,
251// const ExprRHS &rhs,
252// T tolerance) noexcept
253// {
254// // SIMD loop
255// for (my_size_t i = 0; i < simdSteps; ++i)
256// {
257// auto lhs_vec = lhs.template evalu<T, Bits, Arch>(i * simdWidth);
258// auto rhs_vec = rhs.template evalu<T, Bits, Arch>(i * simdWidth);
259// if (!K::all_within_tolerance(lhs_vec, rhs_vec, tolerance))
260// {
261// return false;
262// }
263// }
264
265// // Scalar remainder
266// if constexpr (hasRemainder)
267// {
268// using ScalarK = Microkernel<T, 1, GENERICARCH>;
269// for (my_size_t i = simdSteps * simdWidth; i < totalSize; ++i)
270// {
271// T lhs_val = lhs.template evalu<T, 1, GENERICARCH>(i);
272// T rhs_val = rhs.template evalu<T, 1, GENERICARCH>(i);
273// T abs_diff = ScalarK::abs(lhs_val - rhs_val);
274// if (abs_diff > tolerance)
275// {
276// return false;
277// }
278// }
279// }
280
281// return true;
282// }
283// };
284
285// template <typename T, my_size_t Bits, typename Arch>
286// struct EinsumKernel
287// {
288// using K = Microkernel<T, Bits, Arch>;
289// static constexpr my_size_t simdWidth = K::simdWidth;
290
291// FORCE_INLINE static typename K::VecType fmadd_safe(
292// typename K::VecType a,
293// typename K::VecType b,
294// typename K::VecType c) noexcept
295// {
296// if constexpr (requires { K::fmadd(a, b, c); })
297// {
298// return K::fmadd(a, b, c);
299// }
300// else
301// {
302// return K::add(K::mul(a, b), c);
303// }
304// }
305
306// // Contiguous dot product - both strides along k are 1
307// template <typename Expr1, typename Expr2>
308// FORCE_INLINE static T dot_contiguous(
309// const Expr1 &expr1,
310// const Expr2 &expr2,
311// my_size_t base1,
312// my_size_t base2,
313// const my_size_t len) noexcept
314// {
315// const my_size_t simdSteps = len / simdWidth;
316
317// typename K::VecType acc = K::set1(T{0});
318
319// for (my_size_t i = 0; i < simdSteps; ++i)
320// {
321// auto v1 = expr1.template evalu<T, Bits, Arch>(base1 + i * simdWidth);
322// auto v2 = expr2.template evalu<T, Bits, Arch>(base2 + i * simdWidth);
323// acc = fmadd_safe(v1, v2, acc);
324// }
325
326// // Horizontal reduction
327// alignas(DATA_ALIGNAS) T tmp[simdWidth];
328// K::store(tmp, acc);
329
330// T result = tmp[0];
331// for (my_size_t i = 1; i < simdWidth; ++i)
332// {
333// result += tmp[i];
334// }
335
336// // Remainder
337// for (my_size_t i = simdSteps * simdWidth; i < len; ++i)
338// {
339// T v1 = expr1.template evalu<T, 1, GENERICARCH>(base1 + i);
340// T v2 = expr2.template evalu<T, 1, GENERICARCH>(base2 + i);
341// result += v1 * v2;
342// }
343
344// return result;
345// }
346
347// // Strided dot product - scalar fallback
348// template <typename Expr1, typename Expr2>
349// FORCE_INLINE static T dot_strided_scalar(
350// const Expr1 &expr1,
351// const Expr2 &expr2,
352// my_size_t idx1,
353// my_size_t idx2,
354// const my_size_t stride1,
355// const my_size_t stride2,
356// const my_size_t len) noexcept
357// {
358// T sum = T{0};
359// for (my_size_t k = 0; k < len; ++k)
360// {
361// T v1 = expr1.template evalu<T, 1, GENERICARCH>(idx1);
362// T v2 = expr2.template evalu<T, 1, GENERICARCH>(idx2);
363// sum += v1 * v2;
364// idx1 += stride1;
365// idx2 += stride2;
366// }
367// return sum;
368// }
369
370// // Strided dot product - SIMD with gather
371// template <typename Expr1, typename Expr2>
372// FORCE_INLINE static T dot_strided_gather(
373// const Expr1 &expr1,
374// const Expr2 &expr2,
375// my_size_t idx1,
376// my_size_t idx2,
377// const my_size_t stride1,
378// const my_size_t stride2,
379// const my_size_t len) noexcept
380// {
381// const my_size_t simdSteps = len / simdWidth;
382
383// typename K::VecType acc = K::set1(T{0});
384
385// for (my_size_t i = 0; i < simdSteps; ++i)
386// {
387// auto v1 = expr1.template evalu_strided<T, Bits, Arch>(idx1, stride1);
388// auto v2 = expr2.template evalu_strided<T, Bits, Arch>(idx2, stride2);
389// acc = fmadd_safe(v1, v2, acc);
390
391// idx1 += simdWidth * stride1;
392// idx2 += simdWidth * stride2;
393// }
394
395// // Horizontal reduction
396// alignas(DATA_ALIGNAS) T tmp[simdWidth];
397// K::store(tmp, acc);
398
399// T result = tmp[0];
400// for (my_size_t i = 1; i < simdWidth; ++i)
401// {
402// result += tmp[i];
403// }
404
405// // Scalar remainder
406// for (my_size_t i = simdSteps * simdWidth; i < len; ++i)
407// {
408// T v1 = expr1.template evalu<T, 1, GENERICARCH>(idx1);
409// T v2 = expr2.template evalu<T, 1, GENERICARCH>(idx2);
410// result += v1 * v2;
411// idx1 += stride1;
412// idx2 += stride2;
413// }
414
415// return result;
416// }
417// };
418
419// #endif // KERNEL_OPS_H