1#ifndef __AVX2_MICROKERNEL_H__
2#define __AVX2_MICROKERNEL_H__
80 idx32[i] =
static_cast<int32_t
>(indices[i]);
88 __m256i vindex = _mm256_loadu_si256(
reinterpret_cast<const __m256i *
>(idx32));
89 return _mm256_i32gather_ps(base, vindex,
sizeof(
ScalarType));
95 _mm256_storeu_ps(tmp, val);
97 base[indices[i]] = tmp[i];
103 __m256 sign_mask = _mm256_set1_ps(-0.0f);
104 return _mm256_andnot_ps(sign_mask, v);
109 __m256 diff = _mm256_sub_ps(a, b);
110 __m256 abs_diff = abs(diff);
111 __m256 tol_vec = _mm256_set1_ps(tol);
112 __m256 cmp = _mm256_cmp_ps(abs_diff, tol_vec, _CMP_LE_OQ);
113 int mask = _mm256_movemask_ps(cmp);
174 __m256i vindex = _mm256_loadu_si256(
reinterpret_cast<const __m256i *
>(indices));
175 return _mm256_i64gather_pd(base, vindex,
sizeof(
ScalarType));
181 _mm256_storeu_pd(tmp, val);
183 base[indices[i]] = tmp[i];
188 __m256d sign_mask = _mm256_set1_pd(-0.0);
189 return _mm256_andnot_pd(sign_mask, v);
194 __m256d diff = _mm256_sub_pd(a, b);
195 __m256d abs_diff = abs(diff);
196 __m256d tol_vec = _mm256_set1_pd(tol);
197 __m256d cmp = _mm256_cmp_pd(abs_diff, tol_vec, _CMP_LE_OQ);
198 int mask = _mm256_movemask_pd(cmp);
238 _mm256_storeu_si256(
reinterpret_cast<__m256i *
>(va), a);
239 _mm256_storeu_si256(
reinterpret_cast<__m256i *
>(vb), b);
242 return _mm256_loadu_si256(
reinterpret_cast<const __m256i *
>(va));
247 _mm256_storeu_si256(
reinterpret_cast<__m256i *
>(va), a);
250 return _mm256_loadu_si256(
reinterpret_cast<const __m256i *
>(va));
255 _mm256_storeu_si256(
reinterpret_cast<__m256i *
>(vb), b);
259 return _mm256_loadu_si256(
reinterpret_cast<const __m256i *
>(vr));
279 __m256i neg_ab = _mm256_sub_epi32(_mm256_setzero_si256(), _mm256_mullo_epi32(a, b));
280 return _mm256_sub_epi32(neg_ab, c);
284 __m256i neg_ab = _mm256_sub_epi32(_mm256_setzero_si256(), _mm256_mullo_epi32(a,
set1(b)));
285 return _mm256_sub_epi32(neg_ab, c);
302 idx32[i] =
static_cast<int32_t
>(indices[i]);
304 __m256i vindex = _mm256_loadu_si256(
reinterpret_cast<const __m256i *
>(idx32));
305 return _mm256_i32gather_epi32(base, vindex,
sizeof(
ScalarType));
311 _mm256_storeu_si256(
reinterpret_cast<__m256i *
>(tmp), val);
313 base[indices[i]] = tmp[i];
318 return _mm256_abs_epi32(v);
323 __m256i diff = _mm256_sub_epi32(a, b);
324 __m256i abs_diff = _mm256_abs_epi32(diff);
325 __m256i tol_vec = _mm256_set1_epi32(tol);
327 __m256i cmp = _mm256_cmpgt_epi32(abs_diff, tol_vec);
328 return _mm256_testz_si256(cmp, cmp);
363 __m256i a_hi = _mm256_srli_epi64(a, 32);
364 __m256i b_hi = _mm256_srli_epi64(b, 32);
366 __m256i lo_lo = _mm256_mul_epu32(a, b);
367 __m256i lo_hi = _mm256_mul_epu32(a, b_hi);
368 __m256i hi_lo = _mm256_mul_epu32(a_hi, b);
370 __m256i cross = _mm256_add_epi64(lo_hi, hi_lo);
371 __m256i cross_shifted = _mm256_slli_epi64(cross, 32);
373 return _mm256_add_epi64(lo_lo, cross_shifted);
385 _mm256_storeu_si256(
reinterpret_cast<__m256i *
>(va), a);
386 _mm256_storeu_si256(
reinterpret_cast<__m256i *
>(vb), b);
389 return _mm256_loadu_si256(
reinterpret_cast<const __m256i *
>(va));
394 _mm256_storeu_si256(
reinterpret_cast<__m256i *
>(va), a);
397 return _mm256_loadu_si256(
reinterpret_cast<const __m256i *
>(va));
402 _mm256_storeu_si256(
reinterpret_cast<__m256i *
>(vb), b);
406 return _mm256_loadu_si256(
reinterpret_cast<const __m256i *
>(vr));
421 return sub(_mm256_setzero_si256(),
add(
mul(a, b), c));
425 return sub(_mm256_setzero_si256(),
add(
mul(a, b), c));
432 __m256i gt = _mm256_cmpgt_epi64(a, b);
433 return _mm256_blendv_epi8(a, b, gt);
439 __m256i gt = _mm256_cmpgt_epi64(a, b);
440 return _mm256_blendv_epi8(b, a, gt);
450 __m256i vindex = _mm256_loadu_si256(
reinterpret_cast<const __m256i *
>(indices));
451 return _mm256_i64gather_epi64(
reinterpret_cast<const long long *
>(base), vindex,
sizeof(
ScalarType));
457 _mm256_storeu_si256(
reinterpret_cast<__m256i *
>(tmp), val);
459 base[indices[i]] = tmp[i];
465 __m256i sign = _mm256_cmpgt_epi64(_mm256_setzero_si256(), v);
466 __m256i neg_v = _mm256_sub_epi64(_mm256_setzero_si256(), v);
467 return _mm256_blendv_epi8(v, neg_v, sign);
472 __m256i diff = _mm256_sub_epi64(a, b);
473 __m256i abs_diff = abs(diff);
474 __m256i tol_vec = _mm256_set1_epi64x(tol);
476 __m256i gt = _mm256_cmpgt_epi64(abs_diff, tol_vec);
477 return _mm256_testz_si256(gt, gt);
Global configuration for the tesseract tensor library.
#define my_size_t
Size/index type used throughout the library.
Definition config.h:126
#define FORCE_INLINE
Hint the compiler to always inline a function.
Definition config.h:26
static FORCE_INLINE void storeu(ScalarType *ptr, VecType val) noexcept
Definition avx2_microkernel.h:133
static FORCE_INLINE void store(ScalarType *ptr, VecType val) noexcept
Definition avx2_microkernel.h:132
static FORCE_INLINE VecType max(VecType a, ScalarType b) noexcept
Definition avx2_microkernel.h:170
static FORCE_INLINE VecType fnmadd(VecType a, VecType b, VecType c) noexcept
Definition avx2_microkernel.h:159
static FORCE_INLINE VecType sub(ScalarType a, VecType b) noexcept
Definition avx2_microkernel.h:144
static FORCE_INLINE VecType fmsub(VecType a, ScalarType b, VecType c) noexcept
Definition avx2_microkernel.h:156
static FORCE_INLINE VecType abs(VecType v) noexcept
Definition avx2_microkernel.h:186
static FORCE_INLINE VecType add(VecType a, VecType b) noexcept
Definition avx2_microkernel.h:136
__m256d VecType
Definition avx2_microkernel.h:127
static FORCE_INLINE VecType loadu(const ScalarType *ptr) noexcept
Definition avx2_microkernel.h:131
static FORCE_INLINE VecType min(VecType a, VecType b) noexcept
Definition avx2_microkernel.h:166
static FORCE_INLINE VecType set1(ScalarType scalar) noexcept
Definition avx2_microkernel.h:134
static FORCE_INLINE VecType div(ScalarType a, VecType b) noexcept
Definition avx2_microkernel.h:148
static FORCE_INLINE VecType div(VecType a, ScalarType b) noexcept
Definition avx2_microkernel.h:147
static FORCE_INLINE bool all_within_tolerance(VecType a, VecType b, ScalarType tol) noexcept
Definition avx2_microkernel.h:192
static FORCE_INLINE VecType max(VecType a, VecType b) noexcept
Definition avx2_microkernel.h:169
static FORCE_INLINE VecType fnmsub(VecType a, ScalarType b, VecType c) noexcept
Definition avx2_microkernel.h:164
static FORCE_INLINE VecType add(VecType a, ScalarType b) noexcept
Definition avx2_microkernel.h:137
static FORCE_INLINE VecType mul(VecType a, ScalarType b) noexcept
Definition avx2_microkernel.h:140
static FORCE_INLINE VecType mul(VecType a, VecType b) noexcept
Definition avx2_microkernel.h:139
static FORCE_INLINE VecType fmsub(VecType a, VecType b, VecType c) noexcept
Definition avx2_microkernel.h:155
static FORCE_INLINE VecType fmadd(VecType a, ScalarType b, VecType c) noexcept
Definition avx2_microkernel.h:152
static FORCE_INLINE VecType load(const ScalarType *ptr) noexcept
Definition avx2_microkernel.h:130
static FORCE_INLINE VecType gather(const ScalarType *base, const my_size_t *indices) noexcept
Definition avx2_microkernel.h:172
static FORCE_INLINE VecType sub(VecType a, ScalarType b) noexcept
Definition avx2_microkernel.h:143
double ScalarType
Definition avx2_microkernel.h:128
static FORCE_INLINE void scatter(ScalarType *base, const my_size_t *indices, VecType val) noexcept
Definition avx2_microkernel.h:178
static FORCE_INLINE VecType div(VecType a, VecType b) noexcept
Definition avx2_microkernel.h:146
static FORCE_INLINE VecType min(VecType a, ScalarType b) noexcept
Definition avx2_microkernel.h:167
static FORCE_INLINE VecType fnmsub(VecType a, VecType b, VecType c) noexcept
Definition avx2_microkernel.h:163
static FORCE_INLINE VecType sub(VecType a, VecType b) noexcept
Definition avx2_microkernel.h:142
static FORCE_INLINE VecType fmadd(VecType a, VecType b, VecType c) noexcept
Definition avx2_microkernel.h:151
static FORCE_INLINE VecType fnmadd(VecType a, ScalarType b, VecType c) noexcept
Definition avx2_microkernel.h:160
static FORCE_INLINE VecType add(VecType a, VecType b) noexcept
Definition avx2_microkernel.h:34
static FORCE_INLINE void scatter(ScalarType *base, const my_size_t *indices, VecType val) noexcept
Definition avx2_microkernel.h:92
static FORCE_INLINE VecType sub(VecType a, VecType b) noexcept
Definition avx2_microkernel.h:40
static FORCE_INLINE VecType fnmadd(VecType a, VecType b, VecType c) noexcept
Definition avx2_microkernel.h:57
static FORCE_INLINE VecType gather(const ScalarType *base, const my_size_t *indices) noexcept
Definition avx2_microkernel.h:73
static FORCE_INLINE VecType add(VecType a, ScalarType b) noexcept
Definition avx2_microkernel.h:35
float ScalarType
Definition avx2_microkernel.h:26
static FORCE_INLINE VecType set1(ScalarType scalar) noexcept
Definition avx2_microkernel.h:32
static FORCE_INLINE VecType sub(ScalarType a, VecType b) noexcept
Definition avx2_microkernel.h:42
static FORCE_INLINE VecType max(VecType a, VecType b) noexcept
Definition avx2_microkernel.h:67
static FORCE_INLINE VecType sub(VecType a, ScalarType b) noexcept
Definition avx2_microkernel.h:41
static FORCE_INLINE VecType load(const ScalarType *ptr) noexcept
Definition avx2_microkernel.h:28
static FORCE_INLINE VecType mul(VecType a, ScalarType b) noexcept
Definition avx2_microkernel.h:38
__m256 VecType
Definition avx2_microkernel.h:25
static FORCE_INLINE VecType loadu(const ScalarType *ptr) noexcept
Definition avx2_microkernel.h:29
static FORCE_INLINE VecType max(VecType a, ScalarType b) noexcept
Definition avx2_microkernel.h:68
static FORCE_INLINE VecType fmsub(VecType a, VecType b, VecType c) noexcept
Definition avx2_microkernel.h:53
static FORCE_INLINE VecType min(VecType a, ScalarType b) noexcept
Definition avx2_microkernel.h:65
static FORCE_INLINE void storeu(ScalarType *ptr, VecType val) noexcept
Definition avx2_microkernel.h:31
static FORCE_INLINE VecType div(VecType a, VecType b) noexcept
Definition avx2_microkernel.h:44
static FORCE_INLINE VecType min(VecType a, VecType b) noexcept
Definition avx2_microkernel.h:64
static FORCE_INLINE VecType fmsub(VecType a, ScalarType b, VecType c) noexcept
Definition avx2_microkernel.h:54
static FORCE_INLINE VecType div(ScalarType a, VecType b) noexcept
Definition avx2_microkernel.h:46
static FORCE_INLINE VecType fmadd(VecType a, ScalarType b, VecType c) noexcept
Definition avx2_microkernel.h:50
static FORCE_INLINE VecType mul(VecType a, VecType b) noexcept
Definition avx2_microkernel.h:37
static FORCE_INLINE VecType fmadd(VecType a, VecType b, VecType c) noexcept
Definition avx2_microkernel.h:49
static FORCE_INLINE bool all_within_tolerance(VecType a, VecType b, ScalarType tol) noexcept
Definition avx2_microkernel.h:107
static FORCE_INLINE VecType fnmadd(VecType a, ScalarType b, VecType c) noexcept
Definition avx2_microkernel.h:58
static FORCE_INLINE VecType abs(VecType v) noexcept
Definition avx2_microkernel.h:100
static FORCE_INLINE VecType fnmsub(VecType a, VecType b, VecType c) noexcept
Definition avx2_microkernel.h:61
static FORCE_INLINE void store(ScalarType *ptr, VecType val) noexcept
Definition avx2_microkernel.h:30
static FORCE_INLINE VecType div(VecType a, ScalarType b) noexcept
Definition avx2_microkernel.h:45
static FORCE_INLINE VecType fnmsub(VecType a, ScalarType b, VecType c) noexcept
Definition avx2_microkernel.h:62
static FORCE_INLINE VecType div(VecType a, ScalarType b) noexcept
Definition avx2_microkernel.h:244
static FORCE_INLINE VecType load(const ScalarType *ptr) noexcept
Definition avx2_microkernel.h:218
__m256i VecType
Definition avx2_microkernel.h:215
static FORCE_INLINE VecType fnmadd(VecType a, ScalarType b, VecType c) noexcept
Definition avx2_microkernel.h:273
static FORCE_INLINE VecType mul(VecType a, VecType b) noexcept
Definition avx2_microkernel.h:227
static FORCE_INLINE VecType gather(const ScalarType *base, const my_size_t *indices) noexcept
Definition avx2_microkernel.h:297
static FORCE_INLINE VecType add(VecType a, ScalarType b) noexcept
Definition avx2_microkernel.h:225
static FORCE_INLINE bool all_within_tolerance(VecType a, VecType b, ScalarType tol) noexcept
Definition avx2_microkernel.h:321
static FORCE_INLINE VecType fnmsub(VecType a, ScalarType b, VecType c) noexcept
Definition avx2_microkernel.h:282
static FORCE_INLINE VecType sub(VecType a, VecType b) noexcept
Definition avx2_microkernel.h:230
static FORCE_INLINE VecType fnmadd(VecType a, VecType b, VecType c) noexcept
Definition avx2_microkernel.h:272
static FORCE_INLINE void scatter(ScalarType *base, const my_size_t *indices, VecType val) noexcept
Definition avx2_microkernel.h:308
int32_t ScalarType
Definition avx2_microkernel.h:216
static FORCE_INLINE VecType fmsub(VecType a, VecType b, VecType c) noexcept
Definition avx2_microkernel.h:268
static FORCE_INLINE VecType abs(VecType v) noexcept
Definition avx2_microkernel.h:316
static FORCE_INLINE VecType sub(VecType a, ScalarType b) noexcept
Definition avx2_microkernel.h:231
static FORCE_INLINE void storeu(ScalarType *ptr, VecType val) noexcept
Definition avx2_microkernel.h:221
static FORCE_INLINE VecType loadu(const ScalarType *ptr) noexcept
Definition avx2_microkernel.h:219
static FORCE_INLINE VecType mul(VecType a, ScalarType b) noexcept
Definition avx2_microkernel.h:228
static FORCE_INLINE VecType fmadd(VecType a, VecType b, VecType c) noexcept
Definition avx2_microkernel.h:264
static FORCE_INLINE VecType add(VecType a, VecType b) noexcept
Definition avx2_microkernel.h:224
static FORCE_INLINE VecType fnmsub(VecType a, VecType b, VecType c) noexcept
Definition avx2_microkernel.h:276
static FORCE_INLINE VecType max(VecType a, VecType b) noexcept
Definition avx2_microkernel.h:291
static FORCE_INLINE VecType fmsub(VecType a, ScalarType b, VecType c) noexcept
Definition avx2_microkernel.h:269
static FORCE_INLINE VecType div(ScalarType a, VecType b) noexcept
Definition avx2_microkernel.h:252
static FORCE_INLINE VecType set1(ScalarType scalar) noexcept
Definition avx2_microkernel.h:222
static FORCE_INLINE VecType fmadd(VecType a, ScalarType b, VecType c) noexcept
Definition avx2_microkernel.h:265
static FORCE_INLINE VecType sub(ScalarType a, VecType b) noexcept
Definition avx2_microkernel.h:232
static FORCE_INLINE VecType max(VecType a, ScalarType b) noexcept
Definition avx2_microkernel.h:292
static FORCE_INLINE VecType div(VecType a, VecType b) noexcept
Definition avx2_microkernel.h:235
static FORCE_INLINE void store(ScalarType *ptr, VecType val) noexcept
Definition avx2_microkernel.h:220
static FORCE_INLINE VecType min(VecType a, ScalarType b) noexcept
Definition avx2_microkernel.h:289
static FORCE_INLINE VecType min(VecType a, VecType b) noexcept
Definition avx2_microkernel.h:288
__m256i VecType
Definition avx2_microkernel.h:344
static FORCE_INLINE VecType sub(VecType a, VecType b) noexcept
Definition avx2_microkernel.h:377
static FORCE_INLINE VecType mul(VecType a, VecType b) noexcept
Definition avx2_microkernel.h:358
static FORCE_INLINE VecType min(VecType a, ScalarType b) noexcept
Definition avx2_microkernel.h:435
static FORCE_INLINE VecType div(VecType a, VecType b) noexcept
Definition avx2_microkernel.h:382
static FORCE_INLINE VecType fmadd(VecType a, VecType b, VecType c) noexcept
Definition avx2_microkernel.h:410
static FORCE_INLINE VecType fnmsub(VecType a, ScalarType b, VecType c) noexcept
Definition avx2_microkernel.h:423
static FORCE_INLINE VecType fnmadd(VecType a, ScalarType b, VecType c) noexcept
Definition avx2_microkernel.h:417
static FORCE_INLINE VecType max(VecType a, VecType b) noexcept
Definition avx2_microkernel.h:437
static FORCE_INLINE VecType loadu(const ScalarType *ptr) noexcept
Definition avx2_microkernel.h:348
static FORCE_INLINE VecType max(VecType a, ScalarType b) noexcept
Definition avx2_microkernel.h:442
static FORCE_INLINE VecType set1(ScalarType scalar) noexcept
Definition avx2_microkernel.h:351
static FORCE_INLINE VecType fnmadd(VecType a, VecType b, VecType c) noexcept
Definition avx2_microkernel.h:416
int64_t ScalarType
Definition avx2_microkernel.h:345
static FORCE_INLINE bool all_within_tolerance(VecType a, VecType b, ScalarType tol) noexcept
Definition avx2_microkernel.h:470
static FORCE_INLINE VecType sub(VecType a, ScalarType b) noexcept
Definition avx2_microkernel.h:378
static FORCE_INLINE VecType abs(VecType v) noexcept
Definition avx2_microkernel.h:462
static FORCE_INLINE void store(ScalarType *ptr, VecType val) noexcept
Definition avx2_microkernel.h:349
static FORCE_INLINE VecType sub(ScalarType a, VecType b) noexcept
Definition avx2_microkernel.h:379
static FORCE_INLINE void scatter(ScalarType *base, const my_size_t *indices, VecType val) noexcept
Definition avx2_microkernel.h:454
static FORCE_INLINE void storeu(ScalarType *ptr, VecType val) noexcept
Definition avx2_microkernel.h:350
static FORCE_INLINE VecType add(VecType a, ScalarType b) noexcept
Definition avx2_microkernel.h:354
static FORCE_INLINE VecType fnmsub(VecType a, VecType b, VecType c) noexcept
Definition avx2_microkernel.h:419
static FORCE_INLINE VecType load(const ScalarType *ptr) noexcept
Definition avx2_microkernel.h:347
static FORCE_INLINE VecType fmsub(VecType a, ScalarType b, VecType c) noexcept
Definition avx2_microkernel.h:414
static FORCE_INLINE VecType min(VecType a, VecType b) noexcept
Definition avx2_microkernel.h:429
static FORCE_INLINE VecType mul(VecType a, ScalarType b) noexcept
Definition avx2_microkernel.h:375
static FORCE_INLINE VecType div(ScalarType a, VecType b) noexcept
Definition avx2_microkernel.h:399
static FORCE_INLINE VecType add(VecType a, VecType b) noexcept
Definition avx2_microkernel.h:353
static FORCE_INLINE VecType div(VecType a, ScalarType b) noexcept
Definition avx2_microkernel.h:391
static FORCE_INLINE VecType fmadd(VecType a, ScalarType b, VecType c) noexcept
Definition avx2_microkernel.h:411
static FORCE_INLINE VecType gather(const ScalarType *base, const my_size_t *indices) noexcept
Definition avx2_microkernel.h:447
static FORCE_INLINE VecType fmsub(VecType a, VecType b, VecType c) noexcept
Definition avx2_microkernel.h:413
Definition microkernel_base.h:16
static FORCE_INLINE VecType mul(VecType a, VecType b) noexcept
static constexpr my_size_t simdWidth
Definition microkernel_base.h:17
static FORCE_INLINE VecType set1(T scalar) noexcept
static FORCE_INLINE VecType add(VecType a, VecType b) noexcept
static FORCE_INLINE VecType sub(VecType a, VecType b) noexcept
Definition avx2_microkernel.h:9