1 #ifndef __SSE2_MICROKERNEL_H__
2 #define __SSE2_MICROKERNEL_H__
58 idx32[i] = static_cast<int32_t>(indices[i]);
65 __m128i vindex = _mm_loadu_si128(reinterpret_cast<const __m128i *>(idx32));
66 return _mm_i32gather_ps(base, vindex, sizeof(ScalarType));
72 _mm_storeu_ps(tmp, val);
74 base[indices[i]] = tmp[i];
79 __m128 sign_mask = _mm_set1_ps(-0.0f);
80 return _mm_andnot_ps(sign_mask, v);
85 __m128 diff = _mm_sub_ps(a, b);
86 __m128 abs_diff = abs(diff);
87 __m128 tol_vec = _mm_set1_ps(tol);
88 __m128 cmp = _mm_cmple_ps(abs_diff, tol_vec);
89 int mask = _mm_movemask_ps(cmp);
131 __m128i vindex = _mm_loadu_si128(reinterpret_cast<const __m128i *>(indices));
132 return _mm_i64gather_pd(base, vindex, sizeof(ScalarType));
138 _mm_storeu_pd(tmp, val);
140 base[indices[i]] = tmp[i];
145 __m128d sign_mask = _mm_set1_pd(-0.0);
146 return _mm_andnot_pd(sign_mask, v);
151 __m128d diff = _mm_sub_pd(a, b);
152 __m128d abs_diff = abs(diff);
153 __m128d tol_vec = _mm_set1_pd(tol);
154 __m128d cmp = _mm_cmple_pd(abs_diff, tol_vec);
155 int mask = _mm_movemask_pd(cmp);
Global configuration for the tesseract tensor library.
#define my_size_t
Size/index type used throughout the library.
Definition config.h:126
#define FORCE_INLINE
Hint the compiler to always inline a function.
Definition config.h:26
static FORCE_INLINE VecType mul(VecType a, VecType b) noexcept
Definition sse2_microkernel.h:110
static FORCE_INLINE void scatter(ScalarType *base, const my_size_t *indices, VecType val) noexcept
Definition sse2_microkernel.h:135
static FORCE_INLINE void store(ScalarType *ptr, VecType val) noexcept
Definition sse2_microkernel.h:103
static FORCE_INLINE VecType add(VecType a, VecType b) noexcept
Definition sse2_microkernel.h:107
static FORCE_INLINE VecType mul(VecType a, ScalarType b) noexcept
Definition sse2_microkernel.h:111
static FORCE_INLINE VecType min(VecType a, VecType b) noexcept
Definition sse2_microkernel.h:123
static FORCE_INLINE void storeu(ScalarType *ptr, VecType val) noexcept
Definition sse2_microkernel.h:104
static FORCE_INLINE VecType fmadd(VecType a, VecType b, VecType c) noexcept
Definition sse2_microkernel.h:121
__m128d VecType
Definition sse2_microkernel.h:98
static FORCE_INLINE VecType max(VecType a, VecType b) noexcept
Definition sse2_microkernel.h:126
static FORCE_INLINE VecType div(ScalarType a, VecType b) noexcept
Definition sse2_microkernel.h:119
static FORCE_INLINE VecType div(VecType a, VecType b) noexcept
Definition sse2_microkernel.h:117
static FORCE_INLINE VecType add(VecType a, ScalarType b) noexcept
Definition sse2_microkernel.h:108
static FORCE_INLINE VecType min(VecType a, ScalarType b) noexcept
Definition sse2_microkernel.h:124
static FORCE_INLINE VecType set1(ScalarType scalar) noexcept
Definition sse2_microkernel.h:105
static FORCE_INLINE VecType sub(VecType a, VecType b) noexcept
Definition sse2_microkernel.h:113
static FORCE_INLINE VecType load(const ScalarType *ptr) noexcept
Definition sse2_microkernel.h:101
static FORCE_INLINE bool all_within_tolerance(VecType a, VecType b, ScalarType tol) noexcept
Definition sse2_microkernel.h:149
static FORCE_INLINE VecType div(VecType a, ScalarType b) noexcept
Definition sse2_microkernel.h:118
static FORCE_INLINE VecType abs(VecType v) noexcept
Definition sse2_microkernel.h:143
static FORCE_INLINE VecType loadu(const ScalarType *ptr) noexcept
Definition sse2_microkernel.h:102
static FORCE_INLINE VecType max(VecType a, ScalarType b) noexcept
Definition sse2_microkernel.h:127
static FORCE_INLINE VecType sub(VecType a, ScalarType b) noexcept
Definition sse2_microkernel.h:114
static FORCE_INLINE VecType sub(ScalarType a, VecType b) noexcept
Definition sse2_microkernel.h:115
static FORCE_INLINE VecType gather(const ScalarType *base, const my_size_t *indices) noexcept
Definition sse2_microkernel.h:129
double ScalarType
Definition sse2_microkernel.h:99
static FORCE_INLINE VecType load(const ScalarType *ptr) noexcept
Definition sse2_microkernel.h:23
static FORCE_INLINE VecType loadu(const ScalarType *ptr) noexcept
Definition sse2_microkernel.h:24
static FORCE_INLINE VecType min(VecType a, ScalarType b) noexcept
Definition sse2_microkernel.h:46
static FORCE_INLINE VecType set1(ScalarType scalar) noexcept
Definition sse2_microkernel.h:27
static FORCE_INLINE VecType sub(VecType a, VecType b) noexcept
Definition sse2_microkernel.h:35
static FORCE_INLINE VecType add(VecType a, VecType b) noexcept
Definition sse2_microkernel.h:29
static FORCE_INLINE VecType max(VecType a, VecType b) noexcept
Definition sse2_microkernel.h:48
static FORCE_INLINE VecType div(VecType a, ScalarType b) noexcept
Definition sse2_microkernel.h:40
static FORCE_INLINE VecType div(VecType a, VecType b) noexcept
Definition sse2_microkernel.h:39
static FORCE_INLINE VecType mul(VecType a, VecType b) noexcept
Definition sse2_microkernel.h:32
static FORCE_INLINE VecType mul(VecType a, ScalarType b) noexcept
Definition sse2_microkernel.h:33
float ScalarType
Definition sse2_microkernel.h:21
static FORCE_INLINE VecType fmadd(VecType a, VecType b, VecType c) noexcept
Definition sse2_microkernel.h:43
static FORCE_INLINE bool all_within_tolerance(VecType a, VecType b, ScalarType tol) noexcept
Definition sse2_microkernel.h:83
static FORCE_INLINE void store(ScalarType *ptr, VecType val) noexcept
Definition sse2_microkernel.h:25
static FORCE_INLINE void scatter(ScalarType *base, const my_size_t *indices, VecType val) noexcept
Definition sse2_microkernel.h:69
__m128 VecType
Definition sse2_microkernel.h:20
static FORCE_INLINE VecType min(VecType a, VecType b) noexcept
Definition sse2_microkernel.h:45
static FORCE_INLINE VecType add(VecType a, ScalarType b) noexcept
Definition sse2_microkernel.h:30
static FORCE_INLINE VecType gather(const ScalarType *base, const my_size_t *indices) noexcept
Definition sse2_microkernel.h:51
static FORCE_INLINE void storeu(ScalarType *ptr, VecType val) noexcept
Definition sse2_microkernel.h:26
static FORCE_INLINE VecType sub(ScalarType a, VecType b) noexcept
Definition sse2_microkernel.h:37
static FORCE_INLINE VecType max(VecType a, ScalarType b) noexcept
Definition sse2_microkernel.h:49
static FORCE_INLINE VecType div(ScalarType a, VecType b) noexcept
Definition sse2_microkernel.h:41
static FORCE_INLINE VecType abs(VecType v) noexcept
Definition sse2_microkernel.h:77
static FORCE_INLINE VecType sub(VecType a, ScalarType b) noexcept
Definition sse2_microkernel.h:36
Definition microkernel_base.h:16
static constexpr my_size_t simdWidth
Definition microkernel_base.h:17
static FORCE_INLINE VecType set1(T scalar) noexcept
Definition sse2_microkernel.h:9