tesseract++
0.0.1
N-dimensional tensor library for embedded systems
Loading...
Searching...
No Matches
core
include
fused
kernel_ops
deprecated
kernel_ops2.h
Go to the documentation of this file.
1
// // Higher-level kernel operations built on top of microkernels
// // NOTE(review): this entire file is commented out and lives under deprecated/ — presumably superseded by a newer kernel_ops header; confirm before reviving any of it.
2
// #ifndef KERNEL_OPS_H // NOTE(review): guard is KERNEL_OPS_H but this file is kernel_ops2.h — if ever uncommented it would collide with the non-deprecated kernel_ops.h guard; rename to KERNEL_OPS2_H.
3
// #define KERNEL_OPS_H
4
5
// #include "config.h"
6
// #include "fused/microkernels/microkernel_base.h"
7
// #include "numeric_limits.h"
8
9
// template <typename T, my_size_t Bits, typename Arch>
10
// struct KernelOps
11
// {
12
// using K = Microkernel<T, Bits, Arch>;
13
// static constexpr my_size_t simdWidth = K::simdWidth;
14
15
// // ========================================================================
16
// // Helper
17
// // ========================================================================
18
19
// FORCE_INLINE static typename K::VecType fmadd_safe(
20
// typename K::VecType a,
21
// typename K::VecType b,
22
// typename K::VecType c) noexcept
23
// {
24
// if constexpr (requires { K::fmadd(a, b, c); })
25
// {
26
// return K::fmadd(a, b, c);
27
// }
28
// else
29
// {
30
// return K::add(K::mul(a, b), c);
31
// }
32
// }
33
34
// // ========================================================================
35
// // Evaluation
36
// // ========================================================================
37
38
// template <typename Expr>
39
// FORCE_INLINE static void eval_vectorized_contiguous(
40
// T *output,
41
// const Expr &expr) noexcept
42
// {
43
// using Layout = typename Expr::Layout;
44
// static constexpr my_size_t physicalSize = Layout::PhysicalSize;
45
// // static constexpr my_size_t totalSize = Expr::TotalSize;
46
// static constexpr my_size_t simdSteps = physicalSize / simdWidth;
47
// static constexpr bool hasRemainder = (physicalSize % simdWidth) != 0;
48
49
// // SIMD loop
50
// for (my_size_t i = 0; i < simdSteps; ++i)
51
// {
52
// auto val = expr.template evalu<T, Bits, Arch>(i * simdWidth);
53
// K::store(output + i * simdWidth, val);
54
// }
55
56
// // Scalar remainder — TODO(review): padding guarantees Layout::PhysicalSize is a multiple of simdWidth, so hasRemainder should be statically false here and this branch unreachable.
57
// // Delete this code if confirmed unnecessary
58
// if constexpr (hasRemainder)
59
// {
60
// std::cout << "Warning: Scalar evaluation for remainder elements." << std::endl; // NOTE(review): no <iostream> include is visible in this header, and iostream logging in an embedded-targets kernel header is questionable — confirm before reviving.
61
// // for (my_size_t i = simdSteps * simdWidth; i < physicalSize; ++i)
62
// // {
63
// // output[i] = expr.template evalu<T, 1, GENERICARCH>(i);
64
// // }
65
// }
66
// }
67
68
// // ========================================================================
69
// // Reductions
70
// // ========================================================================
71
72
// template <typename Expr>
73
// FORCE_INLINE static T reduce_min(const Expr &expr) noexcept
74
// {
75
// static constexpr my_size_t totalSize = Expr::TotalSize;
76
// static constexpr my_size_t simdSteps = totalSize / simdWidth;
77
// static constexpr bool hasRemainder = (totalSize % simdWidth) != 0;
78
79
// typename K::VecType acc = K::set1(NumericLimits<T>::max());
80
81
// for (my_size_t i = 0; i < simdSteps; ++i)
82
// {
83
// acc = K::min(acc, expr.template evalu<T, Bits, Arch>(i * simdWidth));
84
// }
85
86
// // Horizontal reduction
87
// alignas(DATA_ALIGNAS) T tmp[simdWidth];
88
// K::store(tmp, acc);
89
90
// T result = tmp[0];
91
// for (my_size_t i = 1; i < simdWidth; ++i)
92
// {
93
// if (tmp[i] < result)
94
// result = tmp[i];
95
// }
96
97
// if constexpr (hasRemainder)
98
// {
99
// for (my_size_t i = simdSteps * simdWidth; i < totalSize; ++i)
100
// {
101
// T val = expr.template evalu<T, 1, GENERICARCH>(i);
102
// if (val < result)
103
// result = val;
104
// }
105
// }
106
107
// return result;
108
// }
109
110
// template <typename Expr>
111
// FORCE_INLINE static T reduce_max(const Expr &expr) noexcept
112
// {
113
// static constexpr my_size_t totalSize = Expr::TotalSize;
114
// static constexpr my_size_t simdSteps = totalSize / simdWidth;
115
// static constexpr bool hasRemainder = (totalSize % simdWidth) != 0;
116
117
// typename K::VecType acc = K::set1(NumericLimits<T>::lowest());
118
119
// for (my_size_t i = 0; i < simdSteps; ++i)
120
// {
121
// acc = K::max(acc, expr.template evalu<T, Bits, Arch>(i * simdWidth));
122
// }
123
124
// // Horizontal reduction
125
// alignas(DATA_ALIGNAS) T tmp[simdWidth];
126
// K::store(tmp, acc);
127
128
// T result = tmp[0];
129
// for (my_size_t i = 1; i < simdWidth; ++i)
130
// {
131
// if (tmp[i] > result)
132
// result = tmp[i];
133
// }
134
135
// if constexpr (hasRemainder)
136
// {
137
// for (my_size_t i = simdSteps * simdWidth; i < totalSize; ++i)
138
// {
139
// T val = expr.template evalu<T, 1, GENERICARCH>(i);
140
// if (val > result)
141
// result = val;
142
// }
143
// }
144
145
// return result;
146
// }
147
148
// template <typename Expr>
149
// FORCE_INLINE static T reduce_sum(const Expr &expr) noexcept
150
// {
151
// static constexpr my_size_t totalSize = Expr::TotalSize;
152
// static constexpr my_size_t simdSteps = totalSize / simdWidth;
153
// static constexpr bool hasRemainder = (totalSize % simdWidth) != 0;
154
155
// typename K::VecType acc = K::set1(T{0});
156
157
// for (my_size_t i = 0; i < simdSteps; ++i)
158
// {
159
// acc = K::add(acc, expr.template evalu<T, Bits, Arch>(i * simdWidth));
160
// }
161
162
// // Horizontal reduction
163
// alignas(DATA_ALIGNAS) T tmp[simdWidth];
164
// K::store(tmp, acc);
165
166
// T result = tmp[0];
167
// for (my_size_t i = 1; i < simdWidth; ++i)
168
// {
169
// result += tmp[i];
170
// }
171
172
// if constexpr (hasRemainder)
173
// {
174
// for (my_size_t i = simdSteps * simdWidth; i < totalSize; ++i)
175
// {
176
// result += expr.template evalu<T, 1, GENERICARCH>(i);
177
// }
178
// }
179
180
// return result;
181
// }
182
183
// template <typename Expr1, typename Expr2>
184
// FORCE_INLINE static bool reduce_all_approx_equal(
185
// const Expr1 &lhs,
186
// const Expr2 &rhs,
187
// T tolerance) noexcept
188
// {
189
// static constexpr my_size_t totalSize = Expr1::TotalSize;
190
// static constexpr my_size_t simdSteps = totalSize / simdWidth;
191
// static constexpr bool hasRemainder = (totalSize % simdWidth) != 0;
192
193
// for (my_size_t i = 0; i < simdSteps; ++i)
194
// {
195
// auto lhs_vec = lhs.template evalu<T, Bits, Arch>(i * simdWidth);
196
// auto rhs_vec = rhs.template evalu<T, Bits, Arch>(i * simdWidth);
197
// if (!K::all_within_tolerance(lhs_vec, rhs_vec, tolerance))
198
// {
199
// return false;
200
// }
201
// }
202
203
// if constexpr (hasRemainder)
204
// {
205
// using ScalarK = Microkernel<T, 1, GENERICARCH>;
206
// for (my_size_t i = simdSteps * simdWidth; i < totalSize; ++i)
207
// {
208
// T lhs_val = lhs.template evalu<T, 1, GENERICARCH>(i);
209
// T rhs_val = rhs.template evalu<T, 1, GENERICARCH>(i);
210
// T abs_diff = ScalarK::abs(lhs_val - rhs_val);
211
// if (abs_diff > tolerance)
212
// {
213
// return false;
214
// }
215
// }
216
// }
217
218
// return true;
219
// }
220
221
// // ========================================================================
222
// // Dot products (for einsum)
223
// // ========================================================================
224
225
// // Contiguous dot product - both strides along k are 1
226
// template <typename Expr1, typename Expr2>
227
// FORCE_INLINE static T dot_contiguous(
228
// const Expr1 &expr1,
229
// const Expr2 &expr2,
230
// my_size_t base1,
231
// my_size_t base2,
232
// const my_size_t len) noexcept
233
// {
234
// const my_size_t steps = len / simdWidth;
235
236
// typename K::VecType acc = K::set1(T{0});
237
238
// for (my_size_t i = 0; i < steps; ++i)
239
// {
240
// auto v1 = expr1.template evalu<T, Bits, Arch>(base1 + i * simdWidth);
241
// auto v2 = expr2.template evalu<T, Bits, Arch>(base2 + i * simdWidth);
242
// acc = fmadd_safe(v1, v2, acc);
243
// }
244
245
// // Horizontal reduction
246
// alignas(DATA_ALIGNAS) T tmp[simdWidth];
247
// K::store(tmp, acc);
248
249
// T result = tmp[0];
250
// for (my_size_t i = 1; i < simdWidth; ++i)
251
// {
252
// result += tmp[i];
253
// }
254
255
// // Remainder
256
// for (my_size_t i = steps * simdWidth; i < len; ++i)
257
// {
258
// T v1 = expr1.template evalu<T, 1, GENERICARCH>(base1 + i);
259
// T v2 = expr2.template evalu<T, 1, GENERICARCH>(base2 + i);
260
// result += v1 * v2;
261
// }
262
263
// return result;
264
// }
265
266
// // Strided dot product - scalar fallback
267
// template <typename Expr1, typename Expr2>
268
// FORCE_INLINE static T dot_strided_scalar(
269
// const Expr1 &expr1,
270
// const Expr2 &expr2,
271
// my_size_t idx1,
272
// my_size_t idx2,
273
// const my_size_t stride1,
274
// const my_size_t stride2,
275
// const my_size_t len) noexcept
276
// {
277
// T sum = T{0};
278
// for (my_size_t k = 0; k < len; ++k)
279
// {
280
// T v1 = expr1.template evalu<T, 1, GENERICARCH>(idx1);
281
// T v2 = expr2.template evalu<T, 1, GENERICARCH>(idx2);
282
// sum += v1 * v2;
283
// idx1 += stride1;
284
// idx2 += stride2;
285
// }
286
// return sum;
287
// }
288
289
// // Strided dot product - SIMD with gather
290
// template <typename Expr1, typename Expr2>
291
// FORCE_INLINE static T dot_strided_gather(
292
// const Expr1 &expr1,
293
// const Expr2 &expr2,
294
// my_size_t idx1,
295
// my_size_t idx2,
296
// const my_size_t stride1,
297
// const my_size_t stride2,
298
// const my_size_t len) noexcept
299
// {
300
// const my_size_t steps = len / simdWidth;
301
302
// typename K::VecType acc = K::set1(T{0});
303
304
// for (my_size_t i = 0; i < steps; ++i)
305
// {
306
// auto v1 = expr1.template evalu_strided<T, Bits, Arch>(idx1, stride1);
307
// auto v2 = expr2.template evalu_strided<T, Bits, Arch>(idx2, stride2);
308
// acc = fmadd_safe(v1, v2, acc);
309
310
// idx1 += simdWidth * stride1;
311
// idx2 += simdWidth * stride2;
312
// }
313
314
// // Horizontal reduction
315
// alignas(DATA_ALIGNAS) T tmp[simdWidth];
316
// K::store(tmp, acc);
317
318
// T result = tmp[0];
319
// for (my_size_t i = 1; i < simdWidth; ++i)
320
// {
321
// result += tmp[i];
322
// }
323
324
// // Scalar remainder
325
// for (my_size_t i = steps * simdWidth; i < len; ++i)
326
// {
327
// T v1 = expr1.template evalu<T, 1, GENERICARCH>(idx1);
328
// T v2 = expr2.template evalu<T, 1, GENERICARCH>(idx2);
329
// result += v1 * v2;
330
// idx1 += stride1;
331
// idx2 += stride2;
332
// }
333
334
// return result;
335
// }
336
// };
337
338
// #endif // KERNEL_OPS_H
Generated by
1.9.8