tesseract++ 0.0.1
N-dimensional tensor library for embedded systems
Loading...
Searching...
No Matches
microkernel_base.h
Go to the documentation of this file.
1// Base interface and architecture tags
2#ifndef MICROKERNEL_BASE_H
3#define MICROKERNEL_BASE_H
4
5#include "config.h"
7
8// Base microkernel interface - all architecture-specific kernels implement this
9// Template parameters:
10// T = scalar type (float, double, int, Complex<float>, etc.)
11// Bits = SIMD register width in bits (128, 256, 512, etc.)
12// Arch = architecture tag (X86_AVX, ARM_NEON_A76, etc.)
13
14template <typename T, my_size_t Bits, typename Arch>
16{
17 static constexpr my_size_t simdWidth = 1; // Override in specializations
18 using VecType = T; // Override with architecture-specific vector type
19
20 // Core SIMD operations
21 // Memory operations
22 FORCE_INLINE static VecType load(const T *ptr) noexcept;
23 FORCE_INLINE static void store(T *ptr, VecType val) noexcept;
24
25 // Broadcast scalar to vector (CRITICAL for tensor-scalar ops)
26 FORCE_INLINE static VecType set1(T scalar) noexcept;
27
28 // Vector-vector operations
29 FORCE_INLINE static VecType add(VecType a, VecType b) noexcept;
30 FORCE_INLINE static VecType mul(VecType a, VecType b) noexcept;
31 FORCE_INLINE static VecType sub(VecType a, VecType b) noexcept;
32 FORCE_INLINE static VecType div(VecType a, VecType b) noexcept;
33 FORCE_INLINE static VecType min(VecType a, VecType b) noexcept;
34 FORCE_INLINE static VecType max(VecType a, VecType b) noexcept;
35
36 // // Vector-scalar operations (using set1 internally)
37 // template <typename Vec = VecType, typename Scalar = T>
38 // requires(!is_same_v<Vec, Scalar>)
39 // FORCE_INLINE static Vec add(Vec a, T scalar) noexcept
40 // {
41 // return add(a, set1(scalar));
42 // }
43 // template <typename Vec = VecType, typename Scalar = T>
44 // requires(!is_same_v<Vec, Scalar>)
45 // FORCE_INLINE static Vec mul(Vec a, T scalar) noexcept
46 // {
47 // return mul(a, set1(scalar));
48 // }
49 // template <typename Vec = VecType, typename Scalar = T>
50 // requires(!is_same_v<Vec, Scalar>)
51 // FORCE_INLINE static Vec sub(Vec a, T scalar) noexcept
52 // {
53 // return sub(a, set1(scalar));
54 // }
55 // template <typename Vec = VecType, typename Scalar = T>
56 // requires(!is_same_v<Vec, Scalar>)
57 // FORCE_INLINE static Vec div(Vec a, T scalar) noexcept
58 // {
59 // return div(a, set1(scalar));
60 // }
61
62 // // Scalar-vector operations (order matters for sub/div!)
63 // template <typename Vec = VecType, typename Scalar = T>
64 // requires(!is_same_v<Vec, Scalar>)
65 // FORCE_INLINE static Vec sub(T scalar, Vec a) noexcept
66 // {
67 // return sub(set1(scalar), a);
68 // }
69
70 // template <typename Vec = VecType, typename Scalar = T>
71 // requires(!is_same_v<Vec, Scalar>)
72 // FORCE_INLINE static Vec div(T scalar, Vec a) noexcept
73 // {
74 // return div(set1(scalar), a);
75 // }
76};
77
78// ============================================================================
79// Architecture Tags
80// ============================================================================
81// struct X86_SSE {}; // 128-bit SSE/SSE2
82// struct X86_AVX {}; // 256-bit AVX/AVX2
83// struct X86_AVX512 {}; // 512-bit AVX-512
84// struct ARM_NEON_A76 {}; // 128-bit NEON on Cortex-A76 and newer (RPi5, Graviton)
85// struct ARM_NEON_A72 {}; // 128-bit NEON on Cortex-A72 and newer (RPi4)
86// struct ARM_NEON_A55 {}; // 128-bit NEON on Cortex-A55 and older (many ARMv8 phones)
87
88// Include all architecture implementations
90// #include "fused/microkernels/generic/generic_complex_microkernel.h"
91
92#if __AVX512F__
93#include "fused/microkernels/avx2/avx512_microkernel.h"
94// #include "fused/microkernels/avx2/avx512_complex_microkernel.h"
95#pragma message "[COMPILE-TIME] Using X86_AVX512F arch"
96constexpr my_size_t BITS = 512;
97using DefaultArch = X86_AVX512;
98
99#elif __AVX2__
101// #include "fused/microkernels/avx2/avx2_complex_microkernel.h"
102#pragma message "[COMPILE-TIME] Using X86_AVX arch"
103constexpr my_size_t BITS = 256;
104using DefaultArch = X86_AVX;
105
106#elif __SSE2__
108// #include "fused/microkernels/avx2/sse2_complex_microkernel.h"
109#pragma message "[COMPILE-TIME] Using X86_SSE2 arch"
110constexpr my_size_t BITS = 128;
111using DefaultArch = X86_SSE;
112
113#elif defined(__ARM_NEON) || defined(__ARM_NEON__)
115constexpr my_size_t BITS = 128;
116
117 // User override takes priority
118 #if defined(TESSERACT_ARM_UARCH_A76)
119 #pragma message "[COMPILE-TIME] Using ARM_NEON_A76 arch (user override)"
121 #elif defined(TESSERACT_ARM_UARCH_A72)
122 #pragma message "[COMPILE-TIME] Using ARM_NEON_A72 arch (user override)"
124 #elif defined(TESSERACT_ARM_UARCH_A55)
125 #pragma message "[COMPILE-TIME] Using ARM_NEON_A55 arch (user override)"
127 // Auto-detection fallback
128 #elif defined(__ARM_FEATURE_DOTPROD)
129 #pragma message "[COMPILE-TIME] Using ARM_NEON_A76 arch (auto-detected)"
131 #elif defined(__ARM_ARCH) && (__ARM_ARCH >= 8)
132 #pragma message "[COMPILE-TIME] Using ARM_NEON_A72 arch (auto-detected)"
134 #else
135 #pragma message "[COMPILE-TIME] Using ARM_NEON_A55 arch (auto-detected)"
137 #endif
138
139#else
140#pragma message "[COMPILE-TIME] Using GENERICARCH arch"
141constexpr my_size_t BITS = 0;
143#endif
144
146
147#endif // MICROKERNEL_BASE_H
Global configuration for the tesseract tensor library.
#define my_size_t
Size/index type used throughout the library.
Definition config.h:126
#define FORCE_INLINE
Hint the compiler to always inline a function.
Definition config.h:26
constexpr my_size_t DATA_ALIGNAS
Definition microkernel_base.h:145
constexpr my_size_t BITS
Definition microkernel_base.h:141
Definition neon_microkernel.h:10
Definition neon_microkernel.h:13
Definition neon_microkernel.h:16
Definition generic_microkernel.h:9
Definition microkernel_base.h:16
T VecType
Definition microkernel_base.h:18
static FORCE_INLINE void store(T *ptr, VecType val) noexcept
static FORCE_INLINE VecType mul(VecType a, VecType b) noexcept
static FORCE_INLINE VecType min(VecType a, VecType b) noexcept
static constexpr my_size_t simdWidth
Definition microkernel_base.h:17
static FORCE_INLINE VecType load(const T *ptr) noexcept
static FORCE_INLINE VecType set1(T scalar) noexcept
static FORCE_INLINE VecType add(VecType a, VecType b) noexcept
static FORCE_INLINE VecType sub(VecType a, VecType b) noexcept
static FORCE_INLINE VecType div(VecType a, VecType b) noexcept
static FORCE_INLINE VecType max(VecType a, VecType b) noexcept
Definition avx2_microkernel.h:9
Definition sse2_microkernel.h:9