tesseract/neon__intrinsics_8h_source.html

#pragma once


#include <arm_neon.h>

#include "config.h"


// ============================================================================

// NEON (128-bit) float intrinsics

// ============================================================================


/**
 * @brief Single-precision SIMD primitives backed by 128-bit NEON registers.
 *
 * Every member is a static, stateless wrapper around one (or a few) NEON
 * intrinsics operating on four packed floats. Overloads that take a
 * ScalarType broadcast it to all lanes with set1() before the operation.
 *
 * The struct is written to build for both AArch64 and AArch32: intrinsics
 * that only exist on A64 (vdivq_f32, vminvq_u32) have 32-bit fallbacks.
 */
struct NeonFloatIntrinsics
{
    static constexpr my_size_t simdWidth = 4; // 128 bits / 32 bits per float = 4

    // AArch64 exposes 32 128-bit vector registers (V0-V31);
    // AArch32 NEON only has 16 of them (Q0-Q15).
#ifdef __aarch64__
    static constexpr my_size_t num_registers = 32;
#else
    static constexpr my_size_t num_registers = 16;
#endif

    using VecType = float32x4_t;
    using ScalarType = float;

    // ------------------------------------------------------------------
    // Load / store / broadcast
    // load/loadu and store/storeu are intentionally identical: both map to
    // vld1q_f32 / vst1q_f32, so no separate unaligned variant is needed.
    // ------------------------------------------------------------------
    FORCE_INLINE static VecType load(const ScalarType *ptr) noexcept { return vld1q_f32(ptr); }
    FORCE_INLINE static VecType loadu(const ScalarType *ptr) noexcept { return vld1q_f32(ptr); }
    FORCE_INLINE static void store(ScalarType *ptr, VecType val) noexcept { vst1q_f32(ptr, val); }
    FORCE_INLINE static void storeu(ScalarType *ptr, VecType val) noexcept { vst1q_f32(ptr, val); }

    // Broadcast one scalar into all four lanes.
    FORCE_INLINE static VecType set1(ScalarType scalar) noexcept { return vdupq_n_f32(scalar); }

    // ------------------------------------------------------------------
    // Lane-wise arithmetic
    // ------------------------------------------------------------------
    FORCE_INLINE static VecType add(VecType a, VecType b) noexcept { return vaddq_f32(a, b); }
    FORCE_INLINE static VecType add(VecType a, ScalarType b) noexcept { return vaddq_f32(a, set1(b)); }

    FORCE_INLINE static VecType mul(VecType a, VecType b) noexcept { return vmulq_f32(a, b); }
    FORCE_INLINE static VecType mul(VecType a, ScalarType b) noexcept { return vmulq_f32(a, set1(b)); }

    FORCE_INLINE static VecType sub(VecType a, VecType b) noexcept { return vsubq_f32(a, b); }
    FORCE_INLINE static VecType sub(VecType a, ScalarType b) noexcept { return vsubq_f32(a, set1(b)); }
    FORCE_INLINE static VecType sub(ScalarType a, VecType b) noexcept { return vsubq_f32(set1(a), b); }

    // Lane-wise a / b.
    // AArch64 has a true vector divide. AArch32 has none, so the quotient is
    // built from a reciprocal estimate refined by two Newton-Raphson steps;
    // that path is an approximation rather than a correctly rounded divide.
    FORCE_INLINE static VecType div(VecType a, VecType b) noexcept
    {
#ifdef __aarch64__
        return vdivq_f32(a, b);
#else
        float32x4_t recip = vrecpeq_f32(b);
        // vrecpsq_f32(b, r) yields (2 - b*r), i.e. the Newton-Raphson correction factor.
        recip = vmulq_f32(vrecpsq_f32(b, recip), recip);
        recip = vmulq_f32(vrecpsq_f32(b, recip), recip);
        return vmulq_f32(a, recip);
#endif
    }

    FORCE_INLINE static VecType div(VecType a, ScalarType b) noexcept { return div(a, set1(b)); }
    FORCE_INLINE static VecType div(ScalarType a, VecType b) noexcept { return div(set1(a), b); }

    // ------------------------------------------------------------------
    // Fused multiply-add family.
    // NEON only provides acc + x*y (vfmaq) and acc - x*y (vfmsq); the two
    // "-c" variants are obtained by negating the accumulator first, which
    // keeps the multiply-add fused (a single rounding of the final result).
    // ------------------------------------------------------------------

    // fmadd: a*b + c
    FORCE_INLINE static VecType fmadd(VecType a, VecType b, VecType c) noexcept { return vfmaq_f32(c, a, b); }
    FORCE_INLINE static VecType fmadd(VecType a, ScalarType b, VecType c) noexcept { return vfmaq_f32(c, a, set1(b)); }

    // fmsub: a*b - c, computed as (-c) + a*b.
    // Negation is exact, so the only rounding is the one inside the FMA.
    FORCE_INLINE static VecType fmsub(VecType a, VecType b, VecType c) noexcept { return vfmaq_f32(vnegq_f32(c), a, b); }
    FORCE_INLINE static VecType fmsub(VecType a, ScalarType b, VecType c) noexcept { return fmsub(a, set1(b), c); }

    // fnmadd: -(a*b) + c, i.e. c - a*b
    FORCE_INLINE static VecType fnmadd(VecType a, VecType b, VecType c) noexcept { return vfmsq_f32(c, a, b); }
    FORCE_INLINE static VecType fnmadd(VecType a, ScalarType b, VecType c) noexcept { return vfmsq_f32(c, a, set1(b)); }

    // fnmsub: -(a*b) - c, computed as (-c) - a*b.
    // Negating the accumulator (instead of the finished sum) keeps the sign
    // of an exactly-zero result the same as evaluating -(a*b) - c directly.
    FORCE_INLINE static VecType fnmsub(VecType a, VecType b, VecType c) noexcept { return vfmsq_f32(vnegq_f32(c), a, b); }
    FORCE_INLINE static VecType fnmsub(VecType a, ScalarType b, VecType c) noexcept { return fnmsub(a, set1(b), c); }

    // ------------------------------------------------------------------
    // Lane-wise min / max / abs
    // ------------------------------------------------------------------
    FORCE_INLINE static VecType min(VecType a, VecType b) noexcept { return vminq_f32(a, b); }
    FORCE_INLINE static VecType min(VecType a, ScalarType b) noexcept { return vminq_f32(a, set1(b)); }

    FORCE_INLINE static VecType max(VecType a, VecType b) noexcept { return vmaxq_f32(a, b); }
    FORCE_INLINE static VecType max(VecType a, ScalarType b) noexcept { return vmaxq_f32(a, set1(b)); }

    FORCE_INLINE static VecType abs(VecType v) noexcept { return vabsq_f32(v); }

    // ------------------------------------------------------------------
    // Gather / scatter: NEON has no hardware gather, so both go through a
    // small stack buffer and a scalar loop.
    // `indices` must reference at least simdWidth entries.
    // ------------------------------------------------------------------

    // Returns { base[indices[0]], ..., base[indices[simdWidth - 1]] }.
    FORCE_INLINE static VecType gather(const ScalarType *base, const my_size_t *indices) noexcept
    {
        alignas(16) ScalarType lanes[simdWidth];
        for (my_size_t lane = 0; lane < simdWidth; ++lane)
        {
            lanes[lane] = base[indices[lane]];
        }
        return vld1q_f32(lanes);
    }

    // Writes lane i of `val` to base[indices[i]], in increasing lane order
    // (so with duplicate indices the highest lane wins).
    FORCE_INLINE static void scatter(ScalarType *base, const my_size_t *indices, VecType val) noexcept
    {
        alignas(16) ScalarType lanes[simdWidth];
        vst1q_f32(lanes, val);
        for (my_size_t lane = 0; lane < simdWidth; ++lane)
        {
            base[indices[lane]] = lanes[lane];
        }
    }

    // True when |a[i] - b[i]| <= tol holds in every lane.
    FORCE_INLINE static bool all_within_tolerance(VecType a, VecType b, ScalarType tol) noexcept
    {
        const float32x4_t abs_diff = vabsq_f32(vsubq_f32(a, b));
        // Each lane of the mask is 0xFFFFFFFF on pass, 0 on fail.
        const uint32x4_t pass_mask = vcleq_f32(abs_diff, vdupq_n_f32(tol));
#ifdef __aarch64__
        // Horizontal minimum is non-zero only if no lane failed.
        return vminvq_u32(pass_mask) != 0;
#else
        // vminvq_u32 is A64-only: AND the two 64-bit halves together, then
        // AND the two remaining lanes.
        const uint32x2_t folded = vand_u32(vget_low_u32(pass_mask), vget_high_u32(pass_mask));
        return (vget_lane_u32(folded, 0) & vget_lane_u32(folded, 1)) != 0;
#endif
    }
};


// ============================================================================

// NEON (128-bit) double intrinsics

// ============================================================================


// float64x2_t and all *_f64 vector intrinsics used below exist only on
// AArch64, so the double-precision backend is compiled for that target only.
// Without this guard the whole header fails to build on AArch32, even though
// the float backend is written to support it.
#if defined(__aarch64__)

/**
 * @brief Double-precision SIMD primitives backed by 128-bit NEON registers.
 *
 * Every member is a static, stateless wrapper around one (or a few) NEON
 * intrinsics operating on two packed doubles. Overloads that take a
 * ScalarType broadcast it to both lanes with set1() before the operation.
 */
struct NeonDoubleIntrinsics
{
    static constexpr my_size_t simdWidth = 2; // 128 bits / 64 bits per double = 2
    static constexpr my_size_t num_registers = 32;

    using VecType = float64x2_t;
    using ScalarType = double;

    // ------------------------------------------------------------------
    // Load / store / broadcast
    // load/loadu and store/storeu are intentionally identical: both map to
    // vld1q_f64 / vst1q_f64.
    // ------------------------------------------------------------------
    FORCE_INLINE static VecType load(const ScalarType *ptr) noexcept { return vld1q_f64(ptr); }
    FORCE_INLINE static VecType loadu(const ScalarType *ptr) noexcept { return vld1q_f64(ptr); }
    FORCE_INLINE static void store(ScalarType *ptr, VecType val) noexcept { vst1q_f64(ptr, val); }
    FORCE_INLINE static void storeu(ScalarType *ptr, VecType val) noexcept { vst1q_f64(ptr, val); }

    // Broadcast one scalar into both lanes.
    FORCE_INLINE static VecType set1(ScalarType scalar) noexcept { return vdupq_n_f64(scalar); }

    // ------------------------------------------------------------------
    // Lane-wise arithmetic
    // ------------------------------------------------------------------
    FORCE_INLINE static VecType add(VecType a, VecType b) noexcept { return vaddq_f64(a, b); }
    FORCE_INLINE static VecType add(VecType a, ScalarType b) noexcept { return vaddq_f64(a, set1(b)); }

    FORCE_INLINE static VecType mul(VecType a, VecType b) noexcept { return vmulq_f64(a, b); }
    FORCE_INLINE static VecType mul(VecType a, ScalarType b) noexcept { return vmulq_f64(a, set1(b)); }

    FORCE_INLINE static VecType sub(VecType a, VecType b) noexcept { return vsubq_f64(a, b); }
    FORCE_INLINE static VecType sub(VecType a, ScalarType b) noexcept { return vsubq_f64(a, set1(b)); }
    FORCE_INLINE static VecType sub(ScalarType a, VecType b) noexcept { return vsubq_f64(set1(a), b); }

    FORCE_INLINE static VecType div(VecType a, VecType b) noexcept { return vdivq_f64(a, b); }
    FORCE_INLINE static VecType div(VecType a, ScalarType b) noexcept { return vdivq_f64(a, set1(b)); }
    FORCE_INLINE static VecType div(ScalarType a, VecType b) noexcept { return vdivq_f64(set1(a), b); }

    // ------------------------------------------------------------------
    // Fused multiply-add family.
    // NEON only provides acc + x*y (vfmaq) and acc - x*y (vfmsq); the two
    // "-c" variants are obtained by negating the accumulator first, which
    // keeps the multiply-add fused (a single rounding of the final result).
    // ------------------------------------------------------------------

    // fmadd: a*b + c
    FORCE_INLINE static VecType fmadd(VecType a, VecType b, VecType c) noexcept { return vfmaq_f64(c, a, b); }
    FORCE_INLINE static VecType fmadd(VecType a, ScalarType b, VecType c) noexcept { return vfmaq_f64(c, a, set1(b)); }

    // fmsub: a*b - c, computed as (-c) + a*b.
    // Negation is exact, so the only rounding is the one inside the FMA.
    FORCE_INLINE static VecType fmsub(VecType a, VecType b, VecType c) noexcept { return vfmaq_f64(vnegq_f64(c), a, b); }
    FORCE_INLINE static VecType fmsub(VecType a, ScalarType b, VecType c) noexcept { return fmsub(a, set1(b), c); }

    // fnmadd: -(a*b) + c, i.e. c - a*b
    FORCE_INLINE static VecType fnmadd(VecType a, VecType b, VecType c) noexcept { return vfmsq_f64(c, a, b); }
    FORCE_INLINE static VecType fnmadd(VecType a, ScalarType b, VecType c) noexcept { return vfmsq_f64(c, a, set1(b)); }

    // fnmsub: -(a*b) - c, computed as (-c) - a*b.
    // Negating the accumulator (instead of the finished sum) keeps the sign
    // of an exactly-zero result the same as evaluating -(a*b) - c directly.
    FORCE_INLINE static VecType fnmsub(VecType a, VecType b, VecType c) noexcept { return vfmsq_f64(vnegq_f64(c), a, b); }
    FORCE_INLINE static VecType fnmsub(VecType a, ScalarType b, VecType c) noexcept { return fnmsub(a, set1(b), c); }

    // ------------------------------------------------------------------
    // Lane-wise min / max / abs
    // ------------------------------------------------------------------
    FORCE_INLINE static VecType min(VecType a, VecType b) noexcept { return vminq_f64(a, b); }
    FORCE_INLINE static VecType min(VecType a, ScalarType b) noexcept { return vminq_f64(a, set1(b)); }

    FORCE_INLINE static VecType max(VecType a, VecType b) noexcept { return vmaxq_f64(a, b); }
    FORCE_INLINE static VecType max(VecType a, ScalarType b) noexcept { return vmaxq_f64(a, set1(b)); }

    FORCE_INLINE static VecType abs(VecType v) noexcept { return vabsq_f64(v); }

    // ------------------------------------------------------------------
    // Gather / scatter: done through a small stack buffer and a scalar loop.
    // `indices` must reference at least simdWidth entries.
    // ------------------------------------------------------------------

    // Returns { base[indices[0]], base[indices[1]] }.
    FORCE_INLINE static VecType gather(const ScalarType *base, const my_size_t *indices) noexcept
    {
        alignas(16) ScalarType lanes[simdWidth];
        for (my_size_t lane = 0; lane < simdWidth; ++lane)
        {
            lanes[lane] = base[indices[lane]];
        }
        return vld1q_f64(lanes);
    }

    // Writes lane i of `val` to base[indices[i]], in increasing lane order
    // (so with duplicate indices the highest lane wins).
    FORCE_INLINE static void scatter(ScalarType *base, const my_size_t *indices, VecType val) noexcept
    {
        alignas(16) ScalarType lanes[simdWidth];
        vst1q_f64(lanes, val);
        for (my_size_t lane = 0; lane < simdWidth; ++lane)
        {
            base[indices[lane]] = lanes[lane];
        }
    }

    // True when |a[i] - b[i]| <= tol holds in both lanes.
    FORCE_INLINE static bool all_within_tolerance(VecType a, VecType b, ScalarType tol) noexcept
    {
        const float64x2_t abs_diff = vabsq_f64(vsubq_f64(a, b));
        // Each lane of the mask is all-ones on pass, 0 on fail.
        const uint64x2_t pass_mask = vcleq_f64(abs_diff, vdupq_n_f64(tol));
        // AND of the two lanes is non-zero only if both passed.
        return (vgetq_lane_u64(pass_mask, 0) & vgetq_lane_u64(pass_mask, 1)) != 0;
    }
};

#endif // defined(__aarch64__)


config.h
Global configuration for the tesseract tensor library.

my_size_t
#define my_size_t
Size/index type used throughout the library.
Definition config.h:126

FORCE_INLINE
#define FORCE_INLINE
Hint the compiler to always inline a function.
Definition config.h:26

NeonDoubleIntrinsics
Definition neon_intrinsics.h:111

NeonDoubleIntrinsics::load
static FORCE_INLINE VecType load(const ScalarType *ptr) noexcept
Definition neon_intrinsics.h:117

NeonDoubleIntrinsics::fmsub
static FORCE_INLINE VecType fmsub(VecType a, ScalarType b, VecType c) noexcept
Definition neon_intrinsics.h:143

NeonDoubleIntrinsics::sub
static FORCE_INLINE VecType sub(VecType a, ScalarType b) noexcept
Definition neon_intrinsics.h:130

NeonDoubleIntrinsics::fmsub
static FORCE_INLINE VecType fmsub(VecType a, VecType b, VecType c) noexcept
Definition neon_intrinsics.h:142

NeonDoubleIntrinsics::fnmadd
static FORCE_INLINE VecType fnmadd(VecType a, ScalarType b, VecType c) noexcept
Definition neon_intrinsics.h:147

NeonDoubleIntrinsics::simdWidth
static constexpr my_size_t simdWidth
Definition neon_intrinsics.h:112

NeonDoubleIntrinsics::max
static FORCE_INLINE VecType max(VecType a, VecType b) noexcept
Definition neon_intrinsics.h:156

NeonDoubleIntrinsics::store
static FORCE_INLINE void store(ScalarType *ptr, VecType val) noexcept
Definition neon_intrinsics.h:119

NeonDoubleIntrinsics::fnmadd
static FORCE_INLINE VecType fnmadd(VecType a, VecType b, VecType c) noexcept
Definition neon_intrinsics.h:146

NeonDoubleIntrinsics::max
static FORCE_INLINE VecType max(VecType a, ScalarType b) noexcept
Definition neon_intrinsics.h:157

NeonDoubleIntrinsics::ScalarType
double ScalarType
Definition neon_intrinsics.h:115

NeonDoubleIntrinsics::mul
static FORCE_INLINE VecType mul(VecType a, VecType b) noexcept
Definition neon_intrinsics.h:126

NeonDoubleIntrinsics::min
static FORCE_INLINE VecType min(VecType a, VecType b) noexcept
Definition neon_intrinsics.h:153

NeonDoubleIntrinsics::gather
static FORCE_INLINE VecType gather(const ScalarType *base, const my_size_t *indices) noexcept
Definition neon_intrinsics.h:159

NeonDoubleIntrinsics::VecType
float64x2_t VecType
Definition neon_intrinsics.h:114

NeonDoubleIntrinsics::add
static FORCE_INLINE VecType add(VecType a, ScalarType b) noexcept
Definition neon_intrinsics.h:124

NeonDoubleIntrinsics::loadu
static FORCE_INLINE VecType loadu(const ScalarType *ptr) noexcept
Definition neon_intrinsics.h:118

NeonDoubleIntrinsics::abs
static FORCE_INLINE VecType abs(VecType v) noexcept
Definition neon_intrinsics.h:175

NeonDoubleIntrinsics::fnmsub
static FORCE_INLINE VecType fnmsub(VecType a, ScalarType b, VecType c) noexcept
Definition neon_intrinsics.h:151

NeonDoubleIntrinsics::mul
static FORCE_INLINE VecType mul(VecType a, ScalarType b) noexcept
Definition neon_intrinsics.h:127

NeonDoubleIntrinsics::sub
static FORCE_INLINE VecType sub(ScalarType a, VecType b) noexcept
Definition neon_intrinsics.h:131

NeonDoubleIntrinsics::fmadd
static FORCE_INLINE VecType fmadd(VecType a, VecType b, VecType c) noexcept
Definition neon_intrinsics.h:138

NeonDoubleIntrinsics::fmadd
static FORCE_INLINE VecType fmadd(VecType a, ScalarType b, VecType c) noexcept
Definition neon_intrinsics.h:139

NeonDoubleIntrinsics::sub
static FORCE_INLINE VecType sub(VecType a, VecType b) noexcept
Definition neon_intrinsics.h:129

NeonDoubleIntrinsics::div
static FORCE_INLINE VecType div(ScalarType a, VecType b) noexcept
Definition neon_intrinsics.h:135

NeonDoubleIntrinsics::num_registers
static constexpr my_size_t num_registers
Definition neon_intrinsics.h:113

NeonDoubleIntrinsics::storeu
static FORCE_INLINE void storeu(ScalarType *ptr, VecType val) noexcept
Definition neon_intrinsics.h:120

NeonDoubleIntrinsics::set1
static FORCE_INLINE VecType set1(ScalarType scalar) noexcept
Definition neon_intrinsics.h:121

NeonDoubleIntrinsics::all_within_tolerance
static FORCE_INLINE bool all_within_tolerance(VecType a, VecType b, ScalarType tol) noexcept
Definition neon_intrinsics.h:180

NeonDoubleIntrinsics::div
static FORCE_INLINE VecType div(VecType a, ScalarType b) noexcept
Definition neon_intrinsics.h:134

NeonDoubleIntrinsics::min
static FORCE_INLINE VecType min(VecType a, ScalarType b) noexcept
Definition neon_intrinsics.h:154

NeonDoubleIntrinsics::add
static FORCE_INLINE VecType add(VecType a, VecType b) noexcept
Definition neon_intrinsics.h:123

NeonDoubleIntrinsics::div
static FORCE_INLINE VecType div(VecType a, VecType b) noexcept
Definition neon_intrinsics.h:133

NeonDoubleIntrinsics::scatter
static FORCE_INLINE void scatter(ScalarType *base, const my_size_t *indices, VecType val) noexcept
Definition neon_intrinsics.h:167

NeonDoubleIntrinsics::fnmsub
static FORCE_INLINE VecType fnmsub(VecType a, VecType b, VecType c) noexcept
Definition neon_intrinsics.h:150

NeonFloatIntrinsics
Definition neon_intrinsics.h:11

NeonFloatIntrinsics::sub
static FORCE_INLINE VecType sub(ScalarType a, VecType b) noexcept
Definition neon_intrinsics.h:31

NeonFloatIntrinsics::min
static FORCE_INLINE VecType min(VecType a, VecType b) noexcept
Definition neon_intrinsics.h:65

NeonFloatIntrinsics::fnmsub
static FORCE_INLINE VecType fnmsub(VecType a, ScalarType b, VecType c) noexcept
Definition neon_intrinsics.h:63

NeonFloatIntrinsics::max
static FORCE_INLINE VecType max(VecType a, ScalarType b) noexcept
Definition neon_intrinsics.h:69

NeonFloatIntrinsics::fnmadd
static FORCE_INLINE VecType fnmadd(VecType a, VecType b, VecType c) noexcept
Definition neon_intrinsics.h:58

NeonFloatIntrinsics::VecType
float32x4_t VecType
Definition neon_intrinsics.h:14

NeonFloatIntrinsics::all_within_tolerance
static FORCE_INLINE bool all_within_tolerance(VecType a, VecType b, ScalarType tol) noexcept
Definition neon_intrinsics.h:95

NeonFloatIntrinsics::simdWidth
static constexpr my_size_t simdWidth
Definition neon_intrinsics.h:12

NeonFloatIntrinsics::max
static FORCE_INLINE VecType max(VecType a, VecType b) noexcept
Definition neon_intrinsics.h:68

NeonFloatIntrinsics::load
static FORCE_INLINE VecType load(const ScalarType *ptr) noexcept
Definition neon_intrinsics.h:17

NeonFloatIntrinsics::fmsub
static FORCE_INLINE VecType fmsub(VecType a, VecType b, VecType c) noexcept
Definition neon_intrinsics.h:54

NeonFloatIntrinsics::abs
static FORCE_INLINE VecType abs(VecType v) noexcept
Definition neon_intrinsics.h:90

NeonFloatIntrinsics::div
static FORCE_INLINE VecType div(VecType a, ScalarType b) noexcept
Definition neon_intrinsics.h:46

NeonFloatIntrinsics::fnmadd
static FORCE_INLINE VecType fnmadd(VecType a, ScalarType b, VecType c) noexcept
Definition neon_intrinsics.h:59

NeonFloatIntrinsics::sub
static FORCE_INLINE VecType sub(VecType a, VecType b) noexcept
Definition neon_intrinsics.h:29

NeonFloatIntrinsics::fmsub
static FORCE_INLINE VecType fmsub(VecType a, ScalarType b, VecType c) noexcept
Definition neon_intrinsics.h:55

NeonFloatIntrinsics::add
static FORCE_INLINE VecType add(VecType a, ScalarType b) noexcept
Definition neon_intrinsics.h:24

NeonFloatIntrinsics::mul
static FORCE_INLINE VecType mul(VecType a, ScalarType b) noexcept
Definition neon_intrinsics.h:27

NeonFloatIntrinsics::gather
static FORCE_INLINE VecType gather(const ScalarType *base, const my_size_t *indices) noexcept
Definition neon_intrinsics.h:74

NeonFloatIntrinsics::min
static FORCE_INLINE VecType min(VecType a, ScalarType b) noexcept
Definition neon_intrinsics.h:66

NeonFloatIntrinsics::ScalarType
float ScalarType
Definition neon_intrinsics.h:15

NeonFloatIntrinsics::loadu
static FORCE_INLINE VecType loadu(const ScalarType *ptr) noexcept
Definition neon_intrinsics.h:18

NeonFloatIntrinsics::mul
static FORCE_INLINE VecType mul(VecType a, VecType b) noexcept
Definition neon_intrinsics.h:26

NeonFloatIntrinsics::sub
static FORCE_INLINE VecType sub(VecType a, ScalarType b) noexcept
Definition neon_intrinsics.h:30

NeonFloatIntrinsics::storeu
static FORCE_INLINE void storeu(ScalarType *ptr, VecType val) noexcept
Definition neon_intrinsics.h:20

NeonFloatIntrinsics::fmadd
static FORCE_INLINE VecType fmadd(VecType a, VecType b, VecType c) noexcept
Definition neon_intrinsics.h:50

NeonFloatIntrinsics::store
static FORCE_INLINE void store(ScalarType *ptr, VecType val) noexcept
Definition neon_intrinsics.h:19

NeonFloatIntrinsics::add
static FORCE_INLINE VecType add(VecType a, VecType b) noexcept
Definition neon_intrinsics.h:23

NeonFloatIntrinsics::fmadd
static FORCE_INLINE VecType fmadd(VecType a, ScalarType b, VecType c) noexcept
Definition neon_intrinsics.h:51

NeonFloatIntrinsics::fnmsub
static FORCE_INLINE VecType fnmsub(VecType a, VecType b, VecType c) noexcept
Definition neon_intrinsics.h:62

NeonFloatIntrinsics::num_registers
static constexpr my_size_t num_registers
Definition neon_intrinsics.h:13

NeonFloatIntrinsics::div
static FORCE_INLINE VecType div(ScalarType a, VecType b) noexcept
Definition neon_intrinsics.h:47

NeonFloatIntrinsics::set1
static FORCE_INLINE VecType set1(ScalarType scalar) noexcept
Definition neon_intrinsics.h:21

NeonFloatIntrinsics::div
static FORCE_INLINE VecType div(VecType a, VecType b) noexcept
Definition neon_intrinsics.h:33

NeonFloatIntrinsics::scatter
static FORCE_INLINE void scatter(ScalarType *base, const my_size_t *indices, VecType val) noexcept
Definition neon_intrinsics.h:82