tesseract++ 0.0.1
N-dimensional tensor library for embedded systems
Loading...
Searching...
No Matches
neon_intrinsics.h
Go to the documentation of this file.
1#pragma once
2
3#include <arm_neon.h>
4#include "config.h"
5
6// ============================================================================
7// NEON (128-bit) float intrinsics
8// ============================================================================
9
11{
// Element and vector types of the 128-bit single-precision NEON backend.
using ScalarType = float;
using VecType = float32x4_t;

// Lanes per vector: 128 bits / 32 bits per float.
static constexpr my_size_t simdWidth = 4;
// NOTE(review): presumably the number of 128-bit vector registers (AArch64 exposes 32) — confirm how callers use it.
static constexpr my_size_t num_registers = 32;
16
// Reads four consecutive floats starting at ptr into a vector.
FORCE_INLINE static VecType load(const ScalarType *ptr) noexcept
{
    const VecType lanes = vld1q_f32(ptr);
    return lanes;
}

// Same operation as load(): NEON has no alignment requirement, so there is
// no separate unaligned instruction to select.
FORCE_INLINE static VecType loadu(const ScalarType *ptr) noexcept
{
    return load(ptr);
}
// Writes the four lanes of val to consecutive floats starting at ptr.
FORCE_INLINE static void store(ScalarType *ptr, VecType val) noexcept
{
    vst1q_f32(ptr, val);
}

// Same operation as store(); both entry points issue the same vst1q_f32.
FORCE_INLINE static void storeu(ScalarType *ptr, VecType val) noexcept
{
    store(ptr, val);
}
// Broadcasts scalar into every lane of a vector.
FORCE_INLINE static VecType set1(ScalarType scalar) noexcept
{
    const VecType splat = vdupq_n_f32(scalar);
    return splat;
}
22
// Lane-wise a + b.
FORCE_INLINE static VecType add(VecType a, VecType b) noexcept
{
    return vaddq_f32(a, b);
}

// Adds the scalar b to every lane of a by broadcasting it first.
FORCE_INLINE static VecType add(VecType a, ScalarType b) noexcept
{
    const VecType rhs = set1(b);
    return add(a, rhs);
}
25
// Lane-wise a * b.
FORCE_INLINE static VecType mul(VecType a, VecType b) noexcept
{
    return vmulq_f32(a, b);
}

// Scales every lane of a by the scalar b (broadcast first).
FORCE_INLINE static VecType mul(VecType a, ScalarType b) noexcept
{
    const VecType rhs = set1(b);
    return mul(a, rhs);
}
28
// Lane-wise a - b.
FORCE_INLINE static VecType sub(VecType a, VecType b) noexcept
{
    return vsubq_f32(a, b);
}

// Subtracts the scalar b from every lane of a.
FORCE_INLINE static VecType sub(VecType a, ScalarType b) noexcept
{
    const VecType rhs = set1(b);
    return sub(a, rhs);
}

// Subtracts every lane of b from the scalar a.
FORCE_INLINE static VecType sub(ScalarType a, VecType b) noexcept
{
    const VecType lhs = set1(a);
    return sub(lhs, b);
}
32
// Lane-wise a / b.
FORCE_INLINE static VecType div(VecType a, VecType b) noexcept
{
#ifdef __aarch64__
    // AArch64 has a vector divide intrinsic.
    return vdivq_f32(a, b);
#else
    // AArch32 path: start from the reciprocal estimate of b, refine it with
    // two Newton-Raphson steps, then multiply by a.
    float32x4_t inv_b = vrecpeq_f32(b);
    for (int step = 0; step < 2; ++step)
    {
        inv_b = vmulq_f32(vrecpsq_f32(b, inv_b), inv_b);
    }
    return vmulq_f32(a, inv_b);
#endif
}

// Divides every lane of a by the scalar b.
FORCE_INLINE static VecType div(VecType a, ScalarType b) noexcept
{
    const VecType denom = set1(b);
    return div(a, denom);
}

// Divides the scalar a by every lane of b.
FORCE_INLINE static VecType div(ScalarType a, VecType b) noexcept
{
    const VecType numer = set1(a);
    return div(numer, b);
}
48
// fmadd: a*b + c. vfmaq_f32 takes the addend (accumulator) as its first
// operand, hence the reordered arguments.
FORCE_INLINE static VecType fmadd(VecType a, VecType b, VecType c) noexcept
{
    return vfmaq_f32(c, a, b);
}

// Scalar-multiplier form: b is broadcast to all lanes first.
FORCE_INLINE static VecType fmadd(VecType a, ScalarType b, VecType c) noexcept
{
    const VecType multiplier = set1(b);
    return fmadd(a, multiplier, c);
}
52
// fmsub: a*b - c.
// Computed as one fused multiply-add onto -c, so the product a*b is not
// rounded on its own before c is subtracted. (Accumulating onto zero and
// subtracting afterwards would round twice.)
FORCE_INLINE static VecType fmsub(VecType a, VecType b, VecType c) noexcept
{
    return vfmaq_f32(vnegq_f32(c), a, b);
}

// Scalar-multiplier form: b is broadcast to all lanes first.
FORCE_INLINE static VecType fmsub(VecType a, ScalarType b, VecType c) noexcept
{
    return fmsub(a, set1(b), c);
}
56
// fnmadd: -(a*b) + c. vfmsq_f32(acc, x, y) yields acc - x*y, which is the
// same expression with c as the accumulator.
FORCE_INLINE static VecType fnmadd(VecType a, VecType b, VecType c) noexcept
{
    return vfmsq_f32(c, a, b);
}

// Scalar-multiplier form: b is broadcast to all lanes first.
FORCE_INLINE static VecType fnmadd(VecType a, ScalarType b, VecType c) noexcept
{
    const VecType multiplier = set1(b);
    return fnmadd(a, multiplier, c);
}
60
// fnmsub: -(a*b) - c, obtained by negating the multiply-add result c + a*b.
FORCE_INLINE static VecType fnmsub(VecType a, VecType b, VecType c) noexcept
{
    const VecType product_plus_c = vfmaq_f32(c, a, b);
    return vnegq_f32(product_plus_c);
}

// Scalar-multiplier form: b is broadcast to all lanes first.
FORCE_INLINE static VecType fnmsub(VecType a, ScalarType b, VecType c) noexcept
{
    const VecType multiplier = set1(b);
    return fnmsub(a, multiplier, c);
}
64
// Lane-wise minimum of a and b.
FORCE_INLINE static VecType min(VecType a, VecType b) noexcept
{
    return vminq_f32(a, b);
}

// Lane-wise minimum of a and the broadcast scalar b.
FORCE_INLINE static VecType min(VecType a, ScalarType b) noexcept
{
    const VecType bound = set1(b);
    return min(a, bound);
}
67
// Lane-wise maximum of a and b.
FORCE_INLINE static VecType max(VecType a, VecType b) noexcept
{
    return vmaxq_f32(a, b);
}

// Lane-wise maximum of a and the broadcast scalar b.
FORCE_INLINE static VecType max(VecType a, ScalarType b) noexcept
{
    const VecType bound = set1(b);
    return max(a, bound);
}
70
// ----------------------------------------------------------------------------
// Gather: NEON has no hardware gather, so this is a scalar fallback.
// Lane i of the result is base[indices[i]], for i in [0, simdWidth).
// ----------------------------------------------------------------------------
FORCE_INLINE static VecType gather(const ScalarType *base, const my_size_t *indices) noexcept
{
    // Collect the elements in a small stack buffer, then load it as one vector.
    alignas(16) ScalarType staged[simdWidth];
    my_size_t lane = 0;
    while (lane < simdWidth)
    {
        staged[lane] = base[indices[lane]];
        ++lane;
    }
    return vld1q_f32(staged);
}
81
// Scalar scatter: writes lane i of val to base[indices[i]], for i in
// [0, simdWidth). Lanes are written in increasing order, so if two indices
// coincide the higher lane's value is the one left in memory.
FORCE_INLINE static void scatter(ScalarType *base, const my_size_t *indices, VecType val) noexcept
{
    // Spill the vector to a stack buffer so individual lanes can be addressed.
    alignas(16) ScalarType spilled[simdWidth];
    vst1q_f32(spilled, val);

    my_size_t lane = 0;
    while (lane < simdWidth)
    {
        base[indices[lane]] = spilled[lane];
        ++lane;
    }
}
89
// Lane-wise absolute value of v.
FORCE_INLINE static VecType abs(VecType v) noexcept
{
    const VecType magnitude = vabsq_f32(v);
    return magnitude;
}
94
{
    // Per-lane mask: all ones where |a - b| <= tol, zero elsewhere.
    const float32x4_t delta = vabsq_f32(vsubq_f32(a, b));
    const uint32x4_t within = vcleq_f32(delta, vdupq_n_f32(tol));

    // A passing lane is 0xFFFFFFFF, so the minimum across lanes is non-zero
    // only when every lane passed.
    const bool every_lane_ok = (vminvq_u32(within) != 0);
    return every_lane_ok;
}
104};
105
106// ============================================================================
107// NEON (128-bit) double intrinsics
108// ============================================================================
109
111{
// Element and vector types of the 128-bit double-precision NEON backend.
using ScalarType = double;
using VecType = float64x2_t;

// Lanes per vector: 128 bits / 64 bits per double.
static constexpr my_size_t simdWidth = 2;
// NOTE(review): presumably the number of 128-bit vector registers (AArch64 exposes 32) — confirm how callers use it.
static constexpr my_size_t num_registers = 32;
116
// Reads two consecutive doubles starting at ptr into a vector.
FORCE_INLINE static VecType load(const ScalarType *ptr) noexcept
{
    const VecType lanes = vld1q_f64(ptr);
    return lanes;
}

// Same operation as load(); both entry points issue the same vld1q_f64.
FORCE_INLINE static VecType loadu(const ScalarType *ptr) noexcept
{
    return load(ptr);
}
// Writes the two lanes of val to consecutive doubles starting at ptr.
FORCE_INLINE static void store(ScalarType *ptr, VecType val) noexcept
{
    vst1q_f64(ptr, val);
}

// Same operation as store(); both entry points issue the same vst1q_f64.
FORCE_INLINE static void storeu(ScalarType *ptr, VecType val) noexcept
{
    store(ptr, val);
}
// Broadcasts scalar into both lanes of a vector.
FORCE_INLINE static VecType set1(ScalarType scalar) noexcept
{
    const VecType splat = vdupq_n_f64(scalar);
    return splat;
}
122
// Lane-wise a + b.
FORCE_INLINE static VecType add(VecType a, VecType b) noexcept
{
    return vaddq_f64(a, b);
}

// Adds the scalar b to every lane of a by broadcasting it first.
FORCE_INLINE static VecType add(VecType a, ScalarType b) noexcept
{
    const VecType rhs = set1(b);
    return add(a, rhs);
}
125
// Lane-wise a * b.
FORCE_INLINE static VecType mul(VecType a, VecType b) noexcept
{
    return vmulq_f64(a, b);
}

// Scales every lane of a by the scalar b (broadcast first).
FORCE_INLINE static VecType mul(VecType a, ScalarType b) noexcept
{
    const VecType rhs = set1(b);
    return mul(a, rhs);
}
128
// Lane-wise a - b.
FORCE_INLINE static VecType sub(VecType a, VecType b) noexcept
{
    return vsubq_f64(a, b);
}

// Subtracts the scalar b from every lane of a.
FORCE_INLINE static VecType sub(VecType a, ScalarType b) noexcept
{
    const VecType rhs = set1(b);
    return sub(a, rhs);
}

// Subtracts every lane of b from the scalar a.
FORCE_INLINE static VecType sub(ScalarType a, VecType b) noexcept
{
    const VecType lhs = set1(a);
    return sub(lhs, b);
}
132
// Lane-wise a / b using the vector divide intrinsic.
FORCE_INLINE static VecType div(VecType a, VecType b) noexcept
{
    return vdivq_f64(a, b);
}

// Divides every lane of a by the scalar b.
FORCE_INLINE static VecType div(VecType a, ScalarType b) noexcept
{
    const VecType denom = set1(b);
    return div(a, denom);
}

// Divides the scalar a by every lane of b.
FORCE_INLINE static VecType div(ScalarType a, VecType b) noexcept
{
    const VecType numer = set1(a);
    return div(numer, b);
}
136
// fmadd: a*b + c. vfmaq_f64 takes the addend (accumulator) as its first
// operand, hence the reordered arguments.
FORCE_INLINE static VecType fmadd(VecType a, VecType b, VecType c) noexcept
{
    return vfmaq_f64(c, a, b);
}

// Scalar-multiplier form: b is broadcast to both lanes first.
FORCE_INLINE static VecType fmadd(VecType a, ScalarType b, VecType c) noexcept
{
    const VecType multiplier = set1(b);
    return fmadd(a, multiplier, c);
}
140
// fmsub: a*b - c.
// Computed as one fused multiply-add onto -c, so the product a*b is not
// rounded on its own before c is subtracted. (Accumulating onto zero and
// subtracting afterwards would round twice.)
FORCE_INLINE static VecType fmsub(VecType a, VecType b, VecType c) noexcept
{
    return vfmaq_f64(vnegq_f64(c), a, b);
}

// Scalar-multiplier form: b is broadcast to both lanes first.
FORCE_INLINE static VecType fmsub(VecType a, ScalarType b, VecType c) noexcept
{
    return fmsub(a, set1(b), c);
}
144
// fnmadd: -(a*b) + c. vfmsq_f64(acc, x, y) yields acc - x*y, which is the
// same expression with c as the accumulator.
FORCE_INLINE static VecType fnmadd(VecType a, VecType b, VecType c) noexcept
{
    return vfmsq_f64(c, a, b);
}

// Scalar-multiplier form: b is broadcast to both lanes first.
FORCE_INLINE static VecType fnmadd(VecType a, ScalarType b, VecType c) noexcept
{
    const VecType multiplier = set1(b);
    return fnmadd(a, multiplier, c);
}
148
// fnmsub: -(a*b) - c, obtained by negating the multiply-add result c + a*b.
FORCE_INLINE static VecType fnmsub(VecType a, VecType b, VecType c) noexcept
{
    const VecType product_plus_c = vfmaq_f64(c, a, b);
    return vnegq_f64(product_plus_c);
}

// Scalar-multiplier form: b is broadcast to both lanes first.
FORCE_INLINE static VecType fnmsub(VecType a, ScalarType b, VecType c) noexcept
{
    const VecType multiplier = set1(b);
    return fnmsub(a, multiplier, c);
}
152
// Lane-wise minimum of a and b.
FORCE_INLINE static VecType min(VecType a, VecType b) noexcept
{
    return vminq_f64(a, b);
}

// Lane-wise minimum of a and the broadcast scalar b.
FORCE_INLINE static VecType min(VecType a, ScalarType b) noexcept
{
    const VecType bound = set1(b);
    return min(a, bound);
}
155
// Lane-wise maximum of a and b.
FORCE_INLINE static VecType max(VecType a, VecType b) noexcept
{
    return vmaxq_f64(a, b);
}

// Lane-wise maximum of a and the broadcast scalar b.
FORCE_INLINE static VecType max(VecType a, ScalarType b) noexcept
{
    const VecType bound = set1(b);
    return max(a, bound);
}
158
// Scalar gather: lane i of the result is base[indices[i]], for i in
// [0, simdWidth).
FORCE_INLINE static VecType gather(const ScalarType *base, const my_size_t *indices) noexcept
{
    // Collect the elements in a small stack buffer, then load it as one vector.
    alignas(16) ScalarType staged[simdWidth];
    my_size_t lane = 0;
    while (lane < simdWidth)
    {
        staged[lane] = base[indices[lane]];
        ++lane;
    }
    return vld1q_f64(staged);
}
166
// Scalar scatter: writes lane i of val to base[indices[i]], for i in
// [0, simdWidth). Lanes are written in increasing order, so if both indices
// coincide the second lane's value is the one left in memory.
FORCE_INLINE static void scatter(ScalarType *base, const my_size_t *indices, VecType val) noexcept
{
    // Spill the vector to a stack buffer so individual lanes can be addressed.
    alignas(16) ScalarType spilled[simdWidth];
    vst1q_f64(spilled, val);

    my_size_t lane = 0;
    while (lane < simdWidth)
    {
        base[indices[lane]] = spilled[lane];
        ++lane;
    }
}
174
// Lane-wise absolute value of v.
FORCE_INLINE static VecType abs(VecType v) noexcept
{
    const VecType magnitude = vabsq_f64(v);
    return magnitude;
}
179
{
    // Per-lane mask: all ones where |a - b| <= tol, zero elsewhere.
    const float64x2_t delta = vabsq_f64(vsubq_f64(a, b));
    const uint64x2_t within = vcleq_f64(delta, vdupq_n_f64(tol));

    // Each mask lane is either all ones or zero, so ANDing the two lanes is
    // non-zero only when both lanes passed.
    const bool both_lanes_ok = (vgetq_lane_u64(within, 0) & vgetq_lane_u64(within, 1)) != 0;
    return both_lanes_ok;
}
189};
Global configuration for the tesseract tensor library.
#define my_size_t
Size/index type used throughout the library.
Definition config.h:126
#define FORCE_INLINE
Hint the compiler to always inline a function.
Definition config.h:26
Definition neon_intrinsics.h:111
static FORCE_INLINE VecType load(const ScalarType *ptr) noexcept
Definition neon_intrinsics.h:117
static FORCE_INLINE VecType fmsub(VecType a, ScalarType b, VecType c) noexcept
Definition neon_intrinsics.h:143
static FORCE_INLINE VecType sub(VecType a, ScalarType b) noexcept
Definition neon_intrinsics.h:130
static FORCE_INLINE VecType fmsub(VecType a, VecType b, VecType c) noexcept
Definition neon_intrinsics.h:142
static FORCE_INLINE VecType fnmadd(VecType a, ScalarType b, VecType c) noexcept
Definition neon_intrinsics.h:147
static constexpr my_size_t simdWidth
Definition neon_intrinsics.h:112
static FORCE_INLINE VecType max(VecType a, VecType b) noexcept
Definition neon_intrinsics.h:156
static FORCE_INLINE void store(ScalarType *ptr, VecType val) noexcept
Definition neon_intrinsics.h:119
static FORCE_INLINE VecType fnmadd(VecType a, VecType b, VecType c) noexcept
Definition neon_intrinsics.h:146
static FORCE_INLINE VecType max(VecType a, ScalarType b) noexcept
Definition neon_intrinsics.h:157
double ScalarType
Definition neon_intrinsics.h:115
static FORCE_INLINE VecType mul(VecType a, VecType b) noexcept
Definition neon_intrinsics.h:126
static FORCE_INLINE VecType min(VecType a, VecType b) noexcept
Definition neon_intrinsics.h:153
static FORCE_INLINE VecType gather(const ScalarType *base, const my_size_t *indices) noexcept
Definition neon_intrinsics.h:159
float64x2_t VecType
Definition neon_intrinsics.h:114
static FORCE_INLINE VecType add(VecType a, ScalarType b) noexcept
Definition neon_intrinsics.h:124
static FORCE_INLINE VecType loadu(const ScalarType *ptr) noexcept
Definition neon_intrinsics.h:118
static FORCE_INLINE VecType abs(VecType v) noexcept
Definition neon_intrinsics.h:175
static FORCE_INLINE VecType fnmsub(VecType a, ScalarType b, VecType c) noexcept
Definition neon_intrinsics.h:151
static FORCE_INLINE VecType mul(VecType a, ScalarType b) noexcept
Definition neon_intrinsics.h:127
static FORCE_INLINE VecType sub(ScalarType a, VecType b) noexcept
Definition neon_intrinsics.h:131
static FORCE_INLINE VecType fmadd(VecType a, VecType b, VecType c) noexcept
Definition neon_intrinsics.h:138
static FORCE_INLINE VecType fmadd(VecType a, ScalarType b, VecType c) noexcept
Definition neon_intrinsics.h:139
static FORCE_INLINE VecType sub(VecType a, VecType b) noexcept
Definition neon_intrinsics.h:129
static FORCE_INLINE VecType div(ScalarType a, VecType b) noexcept
Definition neon_intrinsics.h:135
static constexpr my_size_t num_registers
Definition neon_intrinsics.h:113
static FORCE_INLINE void storeu(ScalarType *ptr, VecType val) noexcept
Definition neon_intrinsics.h:120
static FORCE_INLINE VecType set1(ScalarType scalar) noexcept
Definition neon_intrinsics.h:121
static FORCE_INLINE bool all_within_tolerance(VecType a, VecType b, ScalarType tol) noexcept
Definition neon_intrinsics.h:180
static FORCE_INLINE VecType div(VecType a, ScalarType b) noexcept
Definition neon_intrinsics.h:134
static FORCE_INLINE VecType min(VecType a, ScalarType b) noexcept
Definition neon_intrinsics.h:154
static FORCE_INLINE VecType add(VecType a, VecType b) noexcept
Definition neon_intrinsics.h:123
static FORCE_INLINE VecType div(VecType a, VecType b) noexcept
Definition neon_intrinsics.h:133
static FORCE_INLINE void scatter(ScalarType *base, const my_size_t *indices, VecType val) noexcept
Definition neon_intrinsics.h:167
static FORCE_INLINE VecType fnmsub(VecType a, VecType b, VecType c) noexcept
Definition neon_intrinsics.h:150
Definition neon_intrinsics.h:11
static FORCE_INLINE VecType sub(ScalarType a, VecType b) noexcept
Definition neon_intrinsics.h:31
static FORCE_INLINE VecType min(VecType a, VecType b) noexcept
Definition neon_intrinsics.h:65
static FORCE_INLINE VecType fnmsub(VecType a, ScalarType b, VecType c) noexcept
Definition neon_intrinsics.h:63
static FORCE_INLINE VecType max(VecType a, ScalarType b) noexcept
Definition neon_intrinsics.h:69
static FORCE_INLINE VecType fnmadd(VecType a, VecType b, VecType c) noexcept
Definition neon_intrinsics.h:58
float32x4_t VecType
Definition neon_intrinsics.h:14
static FORCE_INLINE bool all_within_tolerance(VecType a, VecType b, ScalarType tol) noexcept
Definition neon_intrinsics.h:95
static constexpr my_size_t simdWidth
Definition neon_intrinsics.h:12
static FORCE_INLINE VecType max(VecType a, VecType b) noexcept
Definition neon_intrinsics.h:68
static FORCE_INLINE VecType load(const ScalarType *ptr) noexcept
Definition neon_intrinsics.h:17
static FORCE_INLINE VecType fmsub(VecType a, VecType b, VecType c) noexcept
Definition neon_intrinsics.h:54
static FORCE_INLINE VecType abs(VecType v) noexcept
Definition neon_intrinsics.h:90
static FORCE_INLINE VecType div(VecType a, ScalarType b) noexcept
Definition neon_intrinsics.h:46
static FORCE_INLINE VecType fnmadd(VecType a, ScalarType b, VecType c) noexcept
Definition neon_intrinsics.h:59
static FORCE_INLINE VecType sub(VecType a, VecType b) noexcept
Definition neon_intrinsics.h:29
static FORCE_INLINE VecType fmsub(VecType a, ScalarType b, VecType c) noexcept
Definition neon_intrinsics.h:55
static FORCE_INLINE VecType add(VecType a, ScalarType b) noexcept
Definition neon_intrinsics.h:24
static FORCE_INLINE VecType mul(VecType a, ScalarType b) noexcept
Definition neon_intrinsics.h:27
static FORCE_INLINE VecType gather(const ScalarType *base, const my_size_t *indices) noexcept
Definition neon_intrinsics.h:74
static FORCE_INLINE VecType min(VecType a, ScalarType b) noexcept
Definition neon_intrinsics.h:66
float ScalarType
Definition neon_intrinsics.h:15
static FORCE_INLINE VecType loadu(const ScalarType *ptr) noexcept
Definition neon_intrinsics.h:18
static FORCE_INLINE VecType mul(VecType a, VecType b) noexcept
Definition neon_intrinsics.h:26
static FORCE_INLINE VecType sub(VecType a, ScalarType b) noexcept
Definition neon_intrinsics.h:30
static FORCE_INLINE void storeu(ScalarType *ptr, VecType val) noexcept
Definition neon_intrinsics.h:20
static FORCE_INLINE VecType fmadd(VecType a, VecType b, VecType c) noexcept
Definition neon_intrinsics.h:50
static FORCE_INLINE void store(ScalarType *ptr, VecType val) noexcept
Definition neon_intrinsics.h:19
static FORCE_INLINE VecType add(VecType a, VecType b) noexcept
Definition neon_intrinsics.h:23
static FORCE_INLINE VecType fmadd(VecType a, ScalarType b, VecType c) noexcept
Definition neon_intrinsics.h:51
static FORCE_INLINE VecType fnmsub(VecType a, VecType b, VecType c) noexcept
Definition neon_intrinsics.h:62
static constexpr my_size_t num_registers
Definition neon_intrinsics.h:13
static FORCE_INLINE VecType div(ScalarType a, VecType b) noexcept
Definition neon_intrinsics.h:47
static FORCE_INLINE VecType set1(ScalarType scalar) noexcept
Definition neon_intrinsics.h:21
static FORCE_INLINE VecType div(VecType a, VecType b) noexcept
Definition neon_intrinsics.h:33
static FORCE_INLINE void scatter(ScalarType *base, const my_size_t *indices, VecType val) noexcept
Definition neon_intrinsics.h:82