tesseract++ 0.0.1
N-dimensional tensor library for embedded systems
Loading...
Searching...
No Matches
sse2_microkernel.h
Go to the documentation of this file.
1#ifndef __SSE2_MICROKERNEL_H__
2#define __SSE2_MICROKERNEL_H__
3
4#include <immintrin.h>
5#include "config.h"
6
7// Architecture tag
// Empty tag type naming the 128-bit SSE/SSE2 target; it is used as the
// architecture argument of the Microkernel specializations in this header.
struct X86_SSE {};
11
12// ============================================================================
13// SSE (128-bit) specializations
14// ============================================================================
15
16template <>
17struct Microkernel<float, 128, X86_SSE>
18{
19 static constexpr my_size_t simdWidth = 4; // 128 bits / 32 bits per float = 4
20 using VecType = __m128;
21 using ScalarType = float;
22
23 FORCE_INLINE static VecType load(const ScalarType *ptr) noexcept { return _mm_load_ps(ptr); }
24 FORCE_INLINE static VecType loadu(const ScalarType *ptr) noexcept { return _mm_loadu_ps(ptr); }
25 FORCE_INLINE static void store(ScalarType *ptr, VecType val) noexcept { _mm_store_ps(ptr, val); }
26 FORCE_INLINE static void storeu(ScalarType *ptr, VecType val) noexcept { _mm_storeu_ps(ptr, val); }
27 FORCE_INLINE static VecType set1(ScalarType scalar) noexcept { return _mm_set1_ps(scalar); }
28
29 FORCE_INLINE static VecType add(VecType a, VecType b) noexcept { return _mm_add_ps(a, b); }
30 FORCE_INLINE static VecType add(VecType a, ScalarType b) noexcept { return _mm_add_ps(a, set1(b)); }
31
32 FORCE_INLINE static VecType mul(VecType a, VecType b) noexcept { return _mm_mul_ps(a, b); }
33 FORCE_INLINE static VecType mul(VecType a, ScalarType b) noexcept { return _mm_mul_ps(a, set1(b)); }
34
35 FORCE_INLINE static VecType sub(VecType a, VecType b) noexcept { return _mm_sub_ps(a, b); }
36 FORCE_INLINE static VecType sub(VecType a, ScalarType b) noexcept { return _mm_sub_ps(a, set1(b)); }
37 FORCE_INLINE static VecType sub(ScalarType a, VecType b) noexcept { return _mm_sub_ps(set1(a), b); }
38
39 FORCE_INLINE static VecType div(VecType a, VecType b) noexcept { return _mm_div_ps(a, b); }
40 FORCE_INLINE static VecType div(VecType a, ScalarType b) noexcept { return _mm_div_ps(a, set1(b)); }
41 FORCE_INLINE static VecType div(ScalarType a, VecType b) noexcept { return _mm_div_ps(set1(a), b); }
42
43 FORCE_INLINE static VecType fmadd(VecType a, VecType b, VecType c) noexcept { return _mm_fmadd_ps(a, b, c); }
44
45 FORCE_INLINE static VecType min(VecType a, VecType b) noexcept { return _mm_min_ps(a, b); }
46 FORCE_INLINE static VecType min(VecType a, ScalarType b) noexcept { return _mm_min_ps(a, set1(b)); }
47
48 FORCE_INLINE static VecType max(VecType a, VecType b) noexcept { return _mm_max_ps(a, b); }
49 FORCE_INLINE static VecType max(VecType a, ScalarType b) noexcept { return _mm_max_ps(a, set1(b)); }
50
51 FORCE_INLINE static VecType gather(const ScalarType *base, const my_size_t *indices) noexcept
52 {
53 // _mm_i32gather_ps requires 4 × 32-bit indices.
54 // so we convert size_t → int32_t.
55 alignas(16) int32_t idx32[simdWidth];
56 for (my_size_t i = 0; i < simdWidth; ++i)
57 {
58 idx32[i] = static_cast<int32_t>(indices[i]);
59 }
60
61 // loadu (“unaligned load”) is recommended for temporary stack buffers, even when aligned, because:
62 // it's just as fast as load on aligned addresses
63 // never invokes undefined behavior
64 // does not depend on type alignment rules
65 __m128i vindex = _mm_loadu_si128(reinterpret_cast<const __m128i *>(idx32));
66 return _mm_i32gather_ps(base, vindex, sizeof(ScalarType));
67 }
68
69 FORCE_INLINE static void scatter(ScalarType *base, const my_size_t *indices, VecType val) noexcept
70 {
71 alignas(16) ScalarType tmp[simdWidth];
72 _mm_storeu_ps(tmp, val);
73 for (my_size_t i = 0; i < simdWidth; ++i)
74 base[indices[i]] = tmp[i];
75 }
76
77 FORCE_INLINE static VecType abs(VecType v) noexcept
78 {
79 __m128 sign_mask = _mm_set1_ps(-0.0f);
80 return _mm_andnot_ps(sign_mask, v);
81 }
82
84 {
85 __m128 diff = _mm_sub_ps(a, b);
86 __m128 abs_diff = abs(diff);
87 __m128 tol_vec = _mm_set1_ps(tol);
88 __m128 cmp = _mm_cmple_ps(abs_diff, tol_vec);
89 int mask = _mm_movemask_ps(cmp);
90 return mask == 0xF; // all 4 lanes passed
91 }
92};
93
94template <>
95struct Microkernel<double, 128, X86_SSE>
96{
97 static constexpr my_size_t simdWidth = 2; // 128 bits / 64 bits per double = 2
98 using VecType = __m128d;
99 using ScalarType = double;
100
101 FORCE_INLINE static VecType load(const ScalarType *ptr) noexcept { return _mm_load_pd(ptr); }
102 FORCE_INLINE static VecType loadu(const ScalarType *ptr) noexcept { return _mm_loadu_pd(ptr); }
103 FORCE_INLINE static void store(ScalarType *ptr, VecType val) noexcept { _mm_store_pd(ptr, val); }
104 FORCE_INLINE static void storeu(ScalarType *ptr, VecType val) noexcept { _mm_storeu_pd(ptr, val); }
105 FORCE_INLINE static VecType set1(ScalarType scalar) noexcept { return _mm_set1_pd(scalar); }
106
107 FORCE_INLINE static VecType add(VecType a, VecType b) noexcept { return _mm_add_pd(a, b); }
108 FORCE_INLINE static VecType add(VecType a, ScalarType b) noexcept { return _mm_add_pd(a, set1(b)); }
109
110 FORCE_INLINE static VecType mul(VecType a, VecType b) noexcept { return _mm_mul_pd(a, b); }
111 FORCE_INLINE static VecType mul(VecType a, ScalarType b) noexcept { return _mm_mul_pd(a, set1(b)); }
112
113 FORCE_INLINE static VecType sub(VecType a, VecType b) noexcept { return _mm_sub_pd(a, b); }
114 FORCE_INLINE static VecType sub(VecType a, ScalarType b) noexcept { return _mm_sub_pd(a, set1(b)); }
115 FORCE_INLINE static VecType sub(ScalarType a, VecType b) noexcept { return _mm_sub_pd(set1(a), b); }
116
117 FORCE_INLINE static VecType div(VecType a, VecType b) noexcept { return _mm_div_pd(a, b); }
118 FORCE_INLINE static VecType div(VecType a, ScalarType b) noexcept { return _mm_div_pd(a, set1(b)); }
119 FORCE_INLINE static VecType div(ScalarType a, VecType b) noexcept { return _mm_div_pd(set1(a), b); }
120
121 FORCE_INLINE static VecType fmadd(VecType a, VecType b, VecType c) noexcept { return _mm_fmadd_pd(a, b, c); }
122
123 FORCE_INLINE static VecType min(VecType a, VecType b) noexcept { return _mm_min_pd(a, b); }
124 FORCE_INLINE static VecType min(VecType a, ScalarType b) noexcept { return _mm_min_pd(a, set1(b)); }
125
126 FORCE_INLINE static VecType max(VecType a, VecType b) noexcept { return _mm_max_pd(a, b); }
127 FORCE_INLINE static VecType max(VecType a, ScalarType b) noexcept { return _mm_max_pd(a, set1(b)); }
128
129 FORCE_INLINE static VecType gather(const ScalarType *base, const my_size_t *indices) noexcept
130 {
131 __m128i vindex = _mm_loadu_si128(reinterpret_cast<const __m128i *>(indices));
132 return _mm_i64gather_pd(base, vindex, sizeof(ScalarType));
133 }
134
135 FORCE_INLINE static void scatter(ScalarType *base, const my_size_t *indices, VecType val) noexcept
136 {
137 alignas(16) ScalarType tmp[simdWidth];
138 _mm_storeu_pd(tmp, val);
139 for (my_size_t i = 0; i < simdWidth; ++i)
140 base[indices[i]] = tmp[i];
141 }
142
143 FORCE_INLINE static VecType abs(VecType v) noexcept
144 {
145 __m128d sign_mask = _mm_set1_pd(-0.0);
146 return _mm_andnot_pd(sign_mask, v);
147 }
148
150 {
151 __m128d diff = _mm_sub_pd(a, b);
152 __m128d abs_diff = abs(diff);
153 __m128d tol_vec = _mm_set1_pd(tol);
154 __m128d cmp = _mm_cmple_pd(abs_diff, tol_vec);
155 int mask = _mm_movemask_pd(cmp);
156 return mask == 0x3; // all 2 lanes passed
157 }
158};
159
160#endif // __SSE2_MICROKERNEL_H__
Global configuration for the tesseract tensor library.
#define my_size_t
Size/index type used throughout the library.
Definition config.h:126
#define FORCE_INLINE
Hint the compiler to always inline a function.
Definition config.h:26
static FORCE_INLINE VecType mul(VecType a, VecType b) noexcept
Definition sse2_microkernel.h:110
static FORCE_INLINE void scatter(ScalarType *base, const my_size_t *indices, VecType val) noexcept
Definition sse2_microkernel.h:135
static FORCE_INLINE void store(ScalarType *ptr, VecType val) noexcept
Definition sse2_microkernel.h:103
static FORCE_INLINE VecType add(VecType a, VecType b) noexcept
Definition sse2_microkernel.h:107
static FORCE_INLINE VecType mul(VecType a, ScalarType b) noexcept
Definition sse2_microkernel.h:111
static FORCE_INLINE VecType min(VecType a, VecType b) noexcept
Definition sse2_microkernel.h:123
static FORCE_INLINE void storeu(ScalarType *ptr, VecType val) noexcept
Definition sse2_microkernel.h:104
static FORCE_INLINE VecType fmadd(VecType a, VecType b, VecType c) noexcept
Definition sse2_microkernel.h:121
__m128d VecType
Definition sse2_microkernel.h:98
static FORCE_INLINE VecType max(VecType a, VecType b) noexcept
Definition sse2_microkernel.h:126
static FORCE_INLINE VecType div(ScalarType a, VecType b) noexcept
Definition sse2_microkernel.h:119
static FORCE_INLINE VecType div(VecType a, VecType b) noexcept
Definition sse2_microkernel.h:117
static FORCE_INLINE VecType add(VecType a, ScalarType b) noexcept
Definition sse2_microkernel.h:108
static FORCE_INLINE VecType min(VecType a, ScalarType b) noexcept
Definition sse2_microkernel.h:124
static FORCE_INLINE VecType set1(ScalarType scalar) noexcept
Definition sse2_microkernel.h:105
static FORCE_INLINE VecType sub(VecType a, VecType b) noexcept
Definition sse2_microkernel.h:113
static FORCE_INLINE VecType load(const ScalarType *ptr) noexcept
Definition sse2_microkernel.h:101
static FORCE_INLINE bool all_within_tolerance(VecType a, VecType b, ScalarType tol) noexcept
Definition sse2_microkernel.h:149
static FORCE_INLINE VecType div(VecType a, ScalarType b) noexcept
Definition sse2_microkernel.h:118
static FORCE_INLINE VecType abs(VecType v) noexcept
Definition sse2_microkernel.h:143
static FORCE_INLINE VecType loadu(const ScalarType *ptr) noexcept
Definition sse2_microkernel.h:102
static FORCE_INLINE VecType max(VecType a, ScalarType b) noexcept
Definition sse2_microkernel.h:127
static FORCE_INLINE VecType sub(VecType a, ScalarType b) noexcept
Definition sse2_microkernel.h:114
static FORCE_INLINE VecType sub(ScalarType a, VecType b) noexcept
Definition sse2_microkernel.h:115
static FORCE_INLINE VecType gather(const ScalarType *base, const my_size_t *indices) noexcept
Definition sse2_microkernel.h:129
double ScalarType
Definition sse2_microkernel.h:99
static FORCE_INLINE VecType load(const ScalarType *ptr) noexcept
Definition sse2_microkernel.h:23
static FORCE_INLINE VecType loadu(const ScalarType *ptr) noexcept
Definition sse2_microkernel.h:24
static FORCE_INLINE VecType min(VecType a, ScalarType b) noexcept
Definition sse2_microkernel.h:46
static FORCE_INLINE VecType set1(ScalarType scalar) noexcept
Definition sse2_microkernel.h:27
static FORCE_INLINE VecType sub(VecType a, VecType b) noexcept
Definition sse2_microkernel.h:35
static FORCE_INLINE VecType add(VecType a, VecType b) noexcept
Definition sse2_microkernel.h:29
static FORCE_INLINE VecType max(VecType a, VecType b) noexcept
Definition sse2_microkernel.h:48
static FORCE_INLINE VecType div(VecType a, ScalarType b) noexcept
Definition sse2_microkernel.h:40
static FORCE_INLINE VecType div(VecType a, VecType b) noexcept
Definition sse2_microkernel.h:39
static FORCE_INLINE VecType mul(VecType a, VecType b) noexcept
Definition sse2_microkernel.h:32
static FORCE_INLINE VecType mul(VecType a, ScalarType b) noexcept
Definition sse2_microkernel.h:33
float ScalarType
Definition sse2_microkernel.h:21
static FORCE_INLINE VecType fmadd(VecType a, VecType b, VecType c) noexcept
Definition sse2_microkernel.h:43
static FORCE_INLINE bool all_within_tolerance(VecType a, VecType b, ScalarType tol) noexcept
Definition sse2_microkernel.h:83
static FORCE_INLINE void store(ScalarType *ptr, VecType val) noexcept
Definition sse2_microkernel.h:25
static FORCE_INLINE void scatter(ScalarType *base, const my_size_t *indices, VecType val) noexcept
Definition sse2_microkernel.h:69
__m128 VecType
Definition sse2_microkernel.h:20
static FORCE_INLINE VecType min(VecType a, VecType b) noexcept
Definition sse2_microkernel.h:45
static FORCE_INLINE VecType add(VecType a, ScalarType b) noexcept
Definition sse2_microkernel.h:30
static FORCE_INLINE VecType gather(const ScalarType *base, const my_size_t *indices) noexcept
Definition sse2_microkernel.h:51
static FORCE_INLINE void storeu(ScalarType *ptr, VecType val) noexcept
Definition sse2_microkernel.h:26
static FORCE_INLINE VecType sub(ScalarType a, VecType b) noexcept
Definition sse2_microkernel.h:37
static FORCE_INLINE VecType max(VecType a, ScalarType b) noexcept
Definition sse2_microkernel.h:49
static FORCE_INLINE VecType div(ScalarType a, VecType b) noexcept
Definition sse2_microkernel.h:41
static FORCE_INLINE VecType abs(VecType v) noexcept
Definition sse2_microkernel.h:77
static FORCE_INLINE VecType sub(VecType a, ScalarType b) noexcept
Definition sse2_microkernel.h:36
Definition microkernel_base.h:16
static constexpr my_size_t simdWidth
Definition microkernel_base.h:17
static FORCE_INLINE VecType set1(T scalar) noexcept
Definition sse2_microkernel.h:9