Mercurial > hgrepos > Python2 > PyMuPDF
diff mupdf-source/thirdparty/tesseract/src/arch/simddetect.cpp @ 2:b50eed0cc0ef upstream
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
The directory name has changed: no version number in the expanded directory now.
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Mon, 15 Sep 2025 11:43:07 +0200 |
| parents | |
| children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/mupdf-source/thirdparty/tesseract/src/arch/simddetect.cpp Mon Sep 15 11:43:07 2025 +0200 @@ -0,0 +1,371 @@ +/////////////////////////////////////////////////////////////////////// +// File: simddetect.cpp +// Description: Architecture detector. +// Author: Stefan Weil (based on code from Ray Smith) +// +// (C) Copyright 2014, Google Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +/////////////////////////////////////////////////////////////////////// + +#ifdef HAVE_CONFIG_H +# include "config_auto.h" // for HAVE_AVX, ... +#endif +#include <numeric> // for std::inner_product +#include "dotproduct.h" +#include "intsimdmatrix.h" // for IntSimdMatrix +#include "params.h" // for STRING_VAR +#include "simddetect.h" +#include "tprintf.h" // for tprintf + +#if !defined(__clang__) && defined(__GNUC__) && (__GNUC__ < 12) +// The GNU compiler g++ fails to compile with the Accelerate framework +// (tested with versions 10 and 11), so unconditionally disable it. +#undef HAVE_FRAMEWORK_ACCELERATE +#endif + +#if defined(HAVE_FRAMEWORK_ACCELERATE) + +// Use Apple Accelerate framework. +// https://developer.apple.com/documentation/accelerate/simd + +#include <Accelerate/Accelerate.h> + +#endif + +#if defined(HAVE_AVX) || defined(HAVE_AVX2) || defined(HAVE_FMA) || defined(HAVE_SSE4_1) +// See https://en.wikipedia.org/wiki/CPUID. +# define HAS_CPUID +#endif + +#if defined(HAS_CPUID) +# if defined(__GNUC__) +# include <cpuid.h> +# elif defined(_WIN32) +# include <intrin.h> +# endif +#endif + +#if defined(HAVE_NEON) && !defined(__aarch64__) +# if defined(HAVE_ANDROID_GETCPUFAMILY) +# include <cpu-features.h> +# elif defined(HAVE_GETAUXVAL) +# include <asm/hwcap.h> +# include <sys/auxv.h> +# elif defined(HAVE_ELF_AUX_INFO) +# include <sys/auxv.h> +# include <sys/elf.h> +# endif +#endif + +#if defined(HAVE_RVV) +# if defined(HAVE_GETAUXVAL) +# include <sys/auxv.h> +# define HWCAP_RV(letter) (1ul << ((letter) - 'A')) +# endif +#endif + +namespace tesseract { + +// Computes and returns the dot product of the two n-vectors u and v. +// Note: because the order of addition is different among the different dot +// product functions, the results can (and do) vary slightly (although they +// agree to within about 4e-15). This produces different results when running +// training, despite all random inputs being precisely equal. +// To get consistent results, use just one of these dot product functions. +// On a test multi-layer network, serial is 57% slower than SSE, and AVX +// is about 8% faster than SSE. This suggests that the time is memory +// bandwidth constrained and could benefit from holding the reused vector +// in AVX registers. +DotProductFunction DotProduct; + +static STRING_VAR(dotproduct, "auto", "Function used for calculation of dot product"); + +SIMDDetect SIMDDetect::detector; + +#if defined(__aarch64__) +// ARMv8 always has NEON. +bool SIMDDetect::neon_available_ = true; +#elif defined(HAVE_NEON) +// If true, then Neon has been detected. +bool SIMDDetect::neon_available_; +#elif defined(HAVE_RVV) +bool SIMDDetect::rvv_available_; +#else +// If true, then AVX has been detected. +bool SIMDDetect::avx_available_; +bool SIMDDetect::avx2_available_; +bool SIMDDetect::avx512F_available_; +bool SIMDDetect::avx512BW_available_; +bool SIMDDetect::avx512VNNI_available_; +// If true, then FMA has been detected. +bool SIMDDetect::fma_available_; +// If true, then SSe4.1 has been detected. +bool SIMDDetect::sse_available_; +#endif + +#if defined(HAVE_FRAMEWORK_ACCELERATE) +static TFloat DotProductAccelerate(const TFloat* u, const TFloat* v, int n) { + TFloat total = 0; + const int stride = 1; +#if defined(FAST_FLOAT) + vDSP_dotpr(u, stride, v, stride, &total, n); +#else + vDSP_dotprD(u, stride, v, stride, &total, n); +#endif + return total; +} +#endif + +// Computes and returns the dot product of the two n-vectors u and v. +static TFloat DotProductGeneric(const TFloat *u, const TFloat *v, int n) { + TFloat total = 0; + for (int k = 0; k < n; ++k) { + total += u[k] * v[k]; + } + return total; +} + +// Compute dot product using std::inner_product. +static TFloat DotProductStdInnerProduct(const TFloat *u, const TFloat *v, int n) { + return std::inner_product(u, u + n, v, static_cast<TFloat>(0)); +} + +static void SetDotProduct(DotProductFunction f, const IntSimdMatrix *m = nullptr) { + DotProduct = f; + IntSimdMatrix::intSimdMatrix = m; +} + +// Constructor. +// Tests the architecture in a system-dependent way to detect AVX, SSE and +// any other available SIMD equipment. +// __GNUC__ is also defined by compilers that include GNU extensions such as +// clang. +SIMDDetect::SIMDDetect() { + // The fallback is a generic dot product calculation. + SetDotProduct(DotProductGeneric); + +#if defined(HAS_CPUID) +# if defined(__GNUC__) + unsigned int eax, ebx, ecx, edx; + if (__get_cpuid(1, &eax, &ebx, &ecx, &edx) != 0) { + // Note that these tests all use hex because the older compilers don't have + // the newer flags. +# if defined(HAVE_SSE4_1) + sse_available_ = (ecx & 0x00080000) != 0; +# endif +# if defined(HAVE_AVX) || defined(HAVE_AVX2) || defined(HAVE_FMA) + auto xgetbv = []() { + uint32_t xcr0; + __asm__("xgetbv" : "=a"(xcr0) : "c"(0) : "%edx"); + return xcr0; + }; + if ((ecx & 0x08000000) && ((xgetbv() & 6) == 6)) { + // OSXSAVE bit is set, XMM state and YMM state are fine. +# if defined(HAVE_FMA) + fma_available_ = (ecx & 0x00001000) != 0; +# endif +# if defined(HAVE_AVX) + avx_available_ = (ecx & 0x10000000) != 0; + if (avx_available_) { + // There is supposed to be a __get_cpuid_count function, but this is all + // there is in my cpuid.h. It is a macro for an asm statement and cannot + // be used inside an if. + __cpuid_count(7, 0, eax, ebx, ecx, edx); + avx2_available_ = (ebx & 0x00000020) != 0; + avx512F_available_ = (ebx & 0x00010000) != 0; + avx512BW_available_ = (ebx & 0x40000000) != 0; + avx512VNNI_available_ = (ecx & 0x00000800) != 0; + } +# endif + } +# endif + } +# elif defined(_WIN32) + int cpuInfo[4]; + int max_function_id; + __cpuid(cpuInfo, 0); + max_function_id = cpuInfo[0]; + if (max_function_id >= 1) { + __cpuid(cpuInfo, 1); +# if defined(HAVE_SSE4_1) + sse_available_ = (cpuInfo[2] & 0x00080000) != 0; +# endif +# if defined(HAVE_AVX) || defined(HAVE_AVX2) || defined(HAVE_FMA) + if ((cpuInfo[2] & 0x08000000) && ((_xgetbv(0) & 6) == 6)) { + // OSXSAVE bit is set, XMM state and YMM state are fine. +# if defined(HAVE_FMA) + fma_available_ = (cpuInfo[2] & 0x00001000) != 0; +# endif +# if defined(HAVE_AVX) + avx_available_ = (cpuInfo[2] & 0x10000000) != 0; +# endif +# if defined(HAVE_AVX2) + if (max_function_id >= 7) { + __cpuid(cpuInfo, 7); + avx2_available_ = (cpuInfo[1] & 0x00000020) != 0; + avx512F_available_ = (cpuInfo[1] & 0x00010000) != 0; + avx512BW_available_ = (cpuInfo[1] & 0x40000000) != 0; + avx512VNNI_available_ = (cpuInfo[2] & 0x00000800) != 0; + } +# endif + } +# endif + } +# else +# error "I don't know how to test for SIMD with this compiler" +# endif +#endif + +#if defined(HAVE_NEON) && !defined(__aarch64__) +# if defined(HAVE_ANDROID_GETCPUFAMILY) + { + AndroidCpuFamily family = android_getCpuFamily(); + if (family == ANDROID_CPU_FAMILY_ARM) + neon_available_ = (android_getCpuFeatures() & ANDROID_CPU_ARM_FEATURE_NEON); + } +# elif defined(HAVE_GETAUXVAL) + neon_available_ = getauxval(AT_HWCAP) & HWCAP_NEON; +# elif defined(HAVE_ELF_AUX_INFO) + unsigned long hwcap = 0; + elf_aux_info(AT_HWCAP, &hwcap, sizeof hwcap); + neon_available_ = hwcap & HWCAP_NEON; +# endif +#endif + +#if defined(HAVE_RVV) +# if defined(HAVE_GETAUXVAL) + const unsigned long hwcap = getauxval(AT_HWCAP); + rvv_available_ = hwcap & HWCAP_RV('V'); +# endif +#endif + + // Select code for calculation of dot product based on autodetection. + if (false) { + // This is a dummy to support conditional compilation. +#if defined(HAVE_AVX512F) + } else if (avx512F_available_) { + // AVX512F detected. + SetDotProduct(DotProductAVX512F, &IntSimdMatrix::intSimdMatrixAVX2); +#endif +#if defined(HAVE_AVX2) + } else if (avx2_available_) { + // AVX2 detected. + SetDotProduct(DotProductAVX, &IntSimdMatrix::intSimdMatrixAVX2); +#endif +#if defined(HAVE_AVX) + } else if (avx_available_) { + // AVX detected. + SetDotProduct(DotProductAVX, &IntSimdMatrix::intSimdMatrixSSE); +#endif +#if defined(HAVE_SSE4_1) + } else if (sse_available_) { + // SSE detected. + SetDotProduct(DotProductSSE, &IntSimdMatrix::intSimdMatrixSSE); +#endif +#if defined(HAVE_NEON) || defined(__aarch64__) + } else if (neon_available_) { + // NEON detected. + SetDotProduct(DotProductNEON, &IntSimdMatrix::intSimdMatrixNEON); +#endif +#if defined(HAVE_RVV) + } else if (rvv_available_) { + SetDotProduct(DotProductGeneric, &IntSimdMatrix::intSimdMatrixRVV); +#endif + } + + const char *dotproduct_env = getenv("DOTPRODUCT"); + if (dotproduct_env != nullptr) { + // Override automatic settings by value from environment variable. + dotproduct = dotproduct_env; + Update(); + } +} + +void SIMDDetect::Update() { + // Select code for calculation of dot product based on the + // value of the config variable if that value is not empty. + const char *dotproduct_method = "generic"; + if (dotproduct == "auto") { + // Automatic detection. Nothing to be done. + } else if (dotproduct == "generic") { + // Generic code selected by config variable. + SetDotProduct(DotProductGeneric); + dotproduct_method = "generic"; + } else if (dotproduct == "native") { + // Native optimized code selected by config variable. + SetDotProduct(DotProductNative, IntSimdMatrix::intSimdMatrix); + dotproduct_method = "native"; +#if defined(HAVE_AVX2) + } else if (dotproduct == "avx2") { + // AVX2 selected by config variable. + SetDotProduct(DotProductAVX, &IntSimdMatrix::intSimdMatrixAVX2); + dotproduct_method = "avx2"; +#endif +#if defined(HAVE_AVX) + } else if (dotproduct == "avx") { + // AVX selected by config variable. + SetDotProduct(DotProductAVX, &IntSimdMatrix::intSimdMatrixSSE); + dotproduct_method = "avx"; +#endif +#if defined(HAVE_FMA) + } else if (dotproduct == "fma") { + // FMA selected by config variable. + SetDotProduct(DotProductFMA, IntSimdMatrix::intSimdMatrix); + dotproduct_method = "fma"; +#endif +#if defined(HAVE_SSE4_1) + } else if (dotproduct == "sse") { + // SSE selected by config variable. + SetDotProduct(DotProductSSE, &IntSimdMatrix::intSimdMatrixSSE); + dotproduct_method = "sse"; +#endif +#if defined(HAVE_FRAMEWORK_ACCELERATE) + } else if (dotproduct == "accelerate") { + SetDotProduct(DotProductAccelerate, IntSimdMatrix::intSimdMatrix); +#endif +#if defined(HAVE_NEON) || defined(__aarch64__) + } else if (dotproduct == "neon" && neon_available_) { + // NEON selected by config variable. + SetDotProduct(DotProductNEON, &IntSimdMatrix::intSimdMatrixNEON); + dotproduct_method = "neon"; +#endif + } else if (dotproduct == "std::inner_product") { + // std::inner_product selected by config variable. + SetDotProduct(DotProductStdInnerProduct, IntSimdMatrix::intSimdMatrix); + dotproduct_method = "std::inner_product"; + } else { + // Unsupported value of config variable. + tprintf("Warning, ignoring unsupported config variable value: dotproduct=%s\n", + dotproduct.c_str()); + tprintf( + "Supported values for dotproduct: auto generic native" +#if defined(HAVE_AVX2) + " avx2" +#endif +#if defined(HAVE_AVX) + " avx" +#endif +#if defined(HAVE_FMA) + " fma" +#endif +#if defined(HAVE_SSE4_1) + " sse" +#endif +#if defined(HAVE_FRAMEWORK_ACCELERATE) + " accelerate" +#endif + " std::inner_product.\n"); + } + + dotproduct.set_value(dotproduct_method); +} + +} // namespace tesseract
