Mercurial > hgrepos > Python2 > PyMuPDF
comparison mupdf-source/thirdparty/tesseract/src/arch/simddetect.cpp @ 2:b50eed0cc0ef upstream
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
The directory name has changed: no version number in the expanded directory now.
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Mon, 15 Sep 2025 11:43:07 +0200 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| 1:1d09e1dec1d9 | 2:b50eed0cc0ef |
|---|---|
| 1 /////////////////////////////////////////////////////////////////////// | |
| 2 // File: simddetect.cpp | |
| 3 // Description: Architecture detector. | |
| 4 // Author: Stefan Weil (based on code from Ray Smith) | |
| 5 // | |
| 6 // (C) Copyright 2014, Google Inc. | |
| 7 // Licensed under the Apache License, Version 2.0 (the "License"); | |
| 8 // you may not use this file except in compliance with the License. | |
| 9 // You may obtain a copy of the License at | |
| 10 // http://www.apache.org/licenses/LICENSE-2.0 | |
| 11 // Unless required by applicable law or agreed to in writing, software | |
| 12 // distributed under the License is distributed on an "AS IS" BASIS, | |
| 13 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
| 14 // See the License for the specific language governing permissions and | |
| 15 // limitations under the License. | |
| 16 /////////////////////////////////////////////////////////////////////// | |
| 17 | |
| 18 #ifdef HAVE_CONFIG_H | |
| 19 # include "config_auto.h" // for HAVE_AVX, ... | |
| 20 #endif | |
| 21 #include <numeric> // for std::inner_product | |
| 22 #include "dotproduct.h" | |
| 23 #include "intsimdmatrix.h" // for IntSimdMatrix | |
| 24 #include "params.h" // for STRING_VAR | |
| 25 #include "simddetect.h" | |
| 26 #include "tprintf.h" // for tprintf | |
| 27 | |
| 28 #if !defined(__clang__) && defined(__GNUC__) && (__GNUC__ < 12) | |
| 29 // The GNU compiler g++ fails to compile with the Accelerate framework | |
| 30 // (tested with versions 10 and 11), so unconditionally disable it. | |
| 31 #undef HAVE_FRAMEWORK_ACCELERATE | |
| 32 #endif | |
| 33 | |
| 34 #if defined(HAVE_FRAMEWORK_ACCELERATE) | |
| 35 | |
| 36 // Use Apple Accelerate framework. | |
| 37 // https://developer.apple.com/documentation/accelerate/simd | |
| 38 | |
| 39 #include <Accelerate/Accelerate.h> | |
| 40 | |
| 41 #endif | |
| 42 | |
| 43 #if defined(HAVE_AVX) || defined(HAVE_AVX2) || defined(HAVE_FMA) || defined(HAVE_SSE4_1) | |
| 44 // See https://en.wikipedia.org/wiki/CPUID. | |
| 45 # define HAS_CPUID | |
| 46 #endif | |
| 47 | |
| 48 #if defined(HAS_CPUID) | |
| 49 # if defined(__GNUC__) | |
| 50 # include <cpuid.h> | |
| 51 # elif defined(_WIN32) | |
| 52 # include <intrin.h> | |
| 53 # endif | |
| 54 #endif | |
| 55 | |
| 56 #if defined(HAVE_NEON) && !defined(__aarch64__) | |
| 57 # if defined(HAVE_ANDROID_GETCPUFAMILY) | |
| 58 # include <cpu-features.h> | |
| 59 # elif defined(HAVE_GETAUXVAL) | |
| 60 # include <asm/hwcap.h> | |
| 61 # include <sys/auxv.h> | |
| 62 # elif defined(HAVE_ELF_AUX_INFO) | |
| 63 # include <sys/auxv.h> | |
| 64 # include <sys/elf.h> | |
| 65 # endif | |
| 66 #endif | |
| 67 | |
| 68 #if defined(HAVE_RVV) | |
| 69 # if defined(HAVE_GETAUXVAL) | |
| 70 # include <sys/auxv.h> | |
| 71 # define HWCAP_RV(letter) (1ul << ((letter) - 'A')) | |
| 72 # endif | |
| 73 #endif | |
| 74 | |
| 75 namespace tesseract { | |
| 76 | |
| 77 // Currently selected function that computes and returns the dot product of the two n-vectors u and v. | |
| 78 // Note: because the order of addition is different among the different dot | |
| 79 // product functions, the results can (and do) vary slightly (although they | |
| 80 // agree to within about 4e-15). This produces different results when running | |
| 81 // training, despite all random inputs being precisely equal. | |
| 82 // To get consistent results, use just one of these dot product functions. | |
| 83 // On a test multi-layer network, serial is 57% slower than SSE, and AVX | |
| 84 // is about 8% faster than SSE. This suggests that the time is memory | |
| 85 // bandwidth constrained and could benefit from holding the reused vector | |
| 86 // in AVX registers. | |
| 87 DotProductFunction DotProduct; | |
| 88 | |
| 89 static STRING_VAR(dotproduct, "auto", "Function used for calculation of dot product"); | |
| 90 | |
| 91 SIMDDetect SIMDDetect::detector; | |
| 92 | |
| 93 #if defined(__aarch64__) | |
| 94 // ARMv8 always has NEON. | |
| 95 bool SIMDDetect::neon_available_ = true; | |
| 96 #elif defined(HAVE_NEON) | |
| 97 // If true, then NEON has been detected. | |
| 98 bool SIMDDetect::neon_available_; | |
| 99 #elif defined(HAVE_RVV) | |
| 100 bool SIMDDetect::rvv_available_; | |
| 101 #else | |
| 102 // If true, then the respective AVX variant has been detected. | |
| 103 bool SIMDDetect::avx_available_; | |
| 104 bool SIMDDetect::avx2_available_; | |
| 105 bool SIMDDetect::avx512F_available_; | |
| 106 bool SIMDDetect::avx512BW_available_; | |
| 107 bool SIMDDetect::avx512VNNI_available_; | |
| 108 // If true, then FMA has been detected. | |
| 109 bool SIMDDetect::fma_available_; | |
| 110 // If true, then SSE4.1 has been detected. | |
| 111 bool SIMDDetect::sse_available_; | |
| 112 #endif | |
| 113 | |
| 114 #if defined(HAVE_FRAMEWORK_ACCELERATE) | |
| 115 static TFloat DotProductAccelerate(const TFloat* u, const TFloat* v, int n) { | |
| 116 TFloat total = 0; | |
| 117 const int stride = 1; | |
| 118 #if defined(FAST_FLOAT) | |
| 119 vDSP_dotpr(u, stride, v, stride, &total, n); | |
| 120 #else | |
| 121 vDSP_dotprD(u, stride, v, stride, &total, n); | |
| 122 #endif | |
| 123 return total; | |
| 124 } | |
| 125 #endif | |
| 126 | |
| 127 // Computes and returns the dot product of the two n-vectors u and v. | |
| 128 static TFloat DotProductGeneric(const TFloat *u, const TFloat *v, int n) { | |
| 129 TFloat total = 0; | |
| 130 for (int k = 0; k < n; ++k) { | |
| 131 total += u[k] * v[k]; | |
| 132 } | |
| 133 return total; | |
| 134 } | |
| 135 | |
| 136 // Compute dot product using std::inner_product. | |
| 137 static TFloat DotProductStdInnerProduct(const TFloat *u, const TFloat *v, int n) { | |
| 138 return std::inner_product(u, u + n, v, static_cast<TFloat>(0)); | |
| 139 } | |
| 140 | |
| 141 static void SetDotProduct(DotProductFunction f, const IntSimdMatrix *m = nullptr) { | |
| 142 DotProduct = f; | |
| 143 IntSimdMatrix::intSimdMatrix = m; | |
| 144 } | |
| 145 | |
| 146 // Constructor. | |
| 147 // Tests the architecture in a system-dependent way to detect AVX, SSE and | |
| 148 // any other available SIMD equipment. | |
| 149 // __GNUC__ is also defined by compilers that include GNU extensions such as | |
| 150 // clang. | |
| 151 SIMDDetect::SIMDDetect() { | |
| 152 // The fallback is a generic dot product calculation. | |
| 153 SetDotProduct(DotProductGeneric); | |
| 154 | |
| 155 #if defined(HAS_CPUID) | |
| 156 # if defined(__GNUC__) | |
| 157 unsigned int eax, ebx, ecx, edx; | |
| 158 if (__get_cpuid(1, &eax, &ebx, &ecx, &edx) != 0) { | |
| 159 // Note that these tests all use hex because the older compilers don't have | |
| 160 // the newer flags. | |
| 161 # if defined(HAVE_SSE4_1) | |
| 162 sse_available_ = (ecx & 0x00080000) != 0; | |
| 163 # endif | |
| 164 # if defined(HAVE_AVX) || defined(HAVE_AVX2) || defined(HAVE_FMA) | |
| 165 auto xgetbv = []() { | |
| 166 uint32_t xcr0; | |
| 167 __asm__("xgetbv" : "=a"(xcr0) : "c"(0) : "%edx"); | |
| 168 return xcr0; | |
| 169 }; | |
| 170 if ((ecx & 0x08000000) && ((xgetbv() & 6) == 6)) { | |
| 171 // OSXSAVE bit is set, XMM state and YMM state are fine. | |
| 172 # if defined(HAVE_FMA) | |
| 173 fma_available_ = (ecx & 0x00001000) != 0; | |
| 174 # endif | |
| 175 # if defined(HAVE_AVX) | |
| 176 avx_available_ = (ecx & 0x10000000) != 0; | |
| 177 if (avx_available_) { | |
| 178 // There is supposed to be a __get_cpuid_count function, but this is all | |
| 179 // there is in my cpuid.h. It is a macro for an asm statement and cannot | |
| 180 // be used inside an if. | |
| 181 __cpuid_count(7, 0, eax, ebx, ecx, edx); | |
| 182 avx2_available_ = (ebx & 0x00000020) != 0; | |
| 183 avx512F_available_ = (ebx & 0x00010000) != 0; | |
| 184 avx512BW_available_ = (ebx & 0x40000000) != 0; | |
| 185 avx512VNNI_available_ = (ecx & 0x00000800) != 0; | |
| 186 } | |
| 187 # endif | |
| 188 } | |
| 189 # endif | |
| 190 } | |
| 191 # elif defined(_WIN32) | |
| 192 int cpuInfo[4]; | |
| 193 int max_function_id; | |
| 194 __cpuid(cpuInfo, 0); | |
| 195 max_function_id = cpuInfo[0]; | |
| 196 if (max_function_id >= 1) { | |
| 197 __cpuid(cpuInfo, 1); | |
| 198 # if defined(HAVE_SSE4_1) | |
| 199 sse_available_ = (cpuInfo[2] & 0x00080000) != 0; | |
| 200 # endif | |
| 201 # if defined(HAVE_AVX) || defined(HAVE_AVX2) || defined(HAVE_FMA) | |
| 202 if ((cpuInfo[2] & 0x08000000) && ((_xgetbv(0) & 6) == 6)) { | |
| 203 // OSXSAVE bit is set, XMM state and YMM state are fine. | |
| 204 # if defined(HAVE_FMA) | |
| 205 fma_available_ = (cpuInfo[2] & 0x00001000) != 0; | |
| 206 # endif | |
| 207 # if defined(HAVE_AVX) | |
| 208 avx_available_ = (cpuInfo[2] & 0x10000000) != 0; | |
| 209 # endif | |
| 210 # if defined(HAVE_AVX2) | |
| 211 if (max_function_id >= 7) { | |
| 212 __cpuid(cpuInfo, 7); | |
| 213 avx2_available_ = (cpuInfo[1] & 0x00000020) != 0; | |
| 214 avx512F_available_ = (cpuInfo[1] & 0x00010000) != 0; | |
| 215 avx512BW_available_ = (cpuInfo[1] & 0x40000000) != 0; | |
| 216 avx512VNNI_available_ = (cpuInfo[2] & 0x00000800) != 0; | |
| 217 } | |
| 218 # endif | |
| 219 } | |
| 220 # endif | |
| 221 } | |
| 222 # else | |
| 223 # error "I don't know how to test for SIMD with this compiler" | |
| 224 # endif | |
| 225 #endif | |
| 226 | |
| 227 #if defined(HAVE_NEON) && !defined(__aarch64__) | |
| 228 # if defined(HAVE_ANDROID_GETCPUFAMILY) | |
| 229 { | |
| 230 AndroidCpuFamily family = android_getCpuFamily(); | |
| 231 if (family == ANDROID_CPU_FAMILY_ARM) | |
| 232 neon_available_ = (android_getCpuFeatures() & ANDROID_CPU_ARM_FEATURE_NEON); | |
| 233 } | |
| 234 # elif defined(HAVE_GETAUXVAL) | |
| 235 neon_available_ = getauxval(AT_HWCAP) & HWCAP_NEON; | |
| 236 # elif defined(HAVE_ELF_AUX_INFO) | |
| 237 unsigned long hwcap = 0; | |
| 238 elf_aux_info(AT_HWCAP, &hwcap, sizeof hwcap); | |
| 239 neon_available_ = hwcap & HWCAP_NEON; | |
| 240 # endif | |
| 241 #endif | |
| 242 | |
| 243 #if defined(HAVE_RVV) | |
| 244 # if defined(HAVE_GETAUXVAL) | |
| 245 const unsigned long hwcap = getauxval(AT_HWCAP); | |
| 246 rvv_available_ = hwcap & HWCAP_RV('V'); | |
| 247 # endif | |
| 248 #endif | |
| 249 | |
| 250 // Select code for calculation of dot product based on autodetection. | |
| 251 if (false) { | |
| 252 // This is a dummy to support conditional compilation. | |
| 253 #if defined(HAVE_AVX512F) | |
| 254 } else if (avx512F_available_) { | |
| 255 // AVX512F detected. | |
| 256 SetDotProduct(DotProductAVX512F, &IntSimdMatrix::intSimdMatrixAVX2); | |
| 257 #endif | |
| 258 #if defined(HAVE_AVX2) | |
| 259 } else if (avx2_available_) { | |
| 260 // AVX2 detected. | |
| 261 SetDotProduct(DotProductAVX, &IntSimdMatrix::intSimdMatrixAVX2); | |
| 262 #endif | |
| 263 #if defined(HAVE_AVX) | |
| 264 } else if (avx_available_) { | |
| 265 // AVX detected. | |
| 266 SetDotProduct(DotProductAVX, &IntSimdMatrix::intSimdMatrixSSE); | |
| 267 #endif | |
| 268 #if defined(HAVE_SSE4_1) | |
| 269 } else if (sse_available_) { | |
| 270 // SSE detected. | |
| 271 SetDotProduct(DotProductSSE, &IntSimdMatrix::intSimdMatrixSSE); | |
| 272 #endif | |
| 273 #if defined(HAVE_NEON) || defined(__aarch64__) | |
| 274 } else if (neon_available_) { | |
| 275 // NEON detected. | |
| 276 SetDotProduct(DotProductNEON, &IntSimdMatrix::intSimdMatrixNEON); | |
| 277 #endif | |
| 278 #if defined(HAVE_RVV) | |
| 279 } else if (rvv_available_) { | |
| 280 SetDotProduct(DotProductGeneric, &IntSimdMatrix::intSimdMatrixRVV); | |
| 281 #endif | |
| 282 } | |
| 283 | |
| 284 const char *dotproduct_env = getenv("DOTPRODUCT"); | |
| 285 if (dotproduct_env != nullptr) { | |
| 286 // Override automatic settings by value from environment variable. | |
| 287 dotproduct = dotproduct_env; | |
| 288 Update(); | |
| 289 } | |
| 290 } | |
| 291 | |
| 292 void SIMDDetect::Update() { | |
| 293 // Select code for calculation of dot product based on the | |
| 294 // value of the config variable if that value is not empty. | |
| 295 const char *dotproduct_method = "generic"; | |
| 296 if (dotproduct == "auto") { | |
| 297 // Automatic detection. Nothing to be done. | |
| 298 } else if (dotproduct == "generic") { | |
| 299 // Generic code selected by config variable. | |
| 300 SetDotProduct(DotProductGeneric); | |
| 301 dotproduct_method = "generic"; | |
| 302 } else if (dotproduct == "native") { | |
| 303 // Native optimized code selected by config variable. | |
| 304 SetDotProduct(DotProductNative, IntSimdMatrix::intSimdMatrix); | |
| 305 dotproduct_method = "native"; | |
| 306 #if defined(HAVE_AVX2) | |
| 307 } else if (dotproduct == "avx2") { | |
| 308 // AVX2 selected by config variable. | |
| 309 SetDotProduct(DotProductAVX, &IntSimdMatrix::intSimdMatrixAVX2); | |
| 310 dotproduct_method = "avx2"; | |
| 311 #endif | |
| 312 #if defined(HAVE_AVX) | |
| 313 } else if (dotproduct == "avx") { | |
| 314 // AVX selected by config variable. | |
| 315 SetDotProduct(DotProductAVX, &IntSimdMatrix::intSimdMatrixSSE); | |
| 316 dotproduct_method = "avx"; | |
| 317 #endif | |
| 318 #if defined(HAVE_FMA) | |
| 319 } else if (dotproduct == "fma") { | |
| 320 // FMA selected by config variable. | |
| 321 SetDotProduct(DotProductFMA, IntSimdMatrix::intSimdMatrix); | |
| 322 dotproduct_method = "fma"; | |
| 323 #endif | |
| 324 #if defined(HAVE_SSE4_1) | |
| 325 } else if (dotproduct == "sse") { | |
| 326 // SSE selected by config variable. | |
| 327 SetDotProduct(DotProductSSE, &IntSimdMatrix::intSimdMatrixSSE); | |
| 328 dotproduct_method = "sse"; | |
| 329 #endif | |
| 330 #if defined(HAVE_FRAMEWORK_ACCELERATE) | |
| 331 } else if (dotproduct == "accelerate") { | |
| 332 SetDotProduct(DotProductAccelerate, IntSimdMatrix::intSimdMatrix); | |
| 333 #endif | |
| 334 #if defined(HAVE_NEON) || defined(__aarch64__) | |
| 335 } else if (dotproduct == "neon" && neon_available_) { | |
| 336 // NEON selected by config variable. | |
| 337 SetDotProduct(DotProductNEON, &IntSimdMatrix::intSimdMatrixNEON); | |
| 338 dotproduct_method = "neon"; | |
| 339 #endif | |
| 340 } else if (dotproduct == "std::inner_product") { | |
| 341 // std::inner_product selected by config variable. | |
| 342 SetDotProduct(DotProductStdInnerProduct, IntSimdMatrix::intSimdMatrix); | |
| 343 dotproduct_method = "std::inner_product"; | |
| 344 } else { | |
| 345 // Unsupported value of config variable. | |
| 346 tprintf("Warning, ignoring unsupported config variable value: dotproduct=%s\n", | |
| 347 dotproduct.c_str()); | |
| 348 tprintf( | |
| 349 "Supported values for dotproduct: auto generic native" | |
| 350 #if defined(HAVE_AVX2) | |
| 351 " avx2" | |
| 352 #endif | |
| 353 #if defined(HAVE_AVX) | |
| 354 " avx" | |
| 355 #endif | |
| 356 #if defined(HAVE_FMA) | |
| 357 " fma" | |
| 358 #endif | |
| 359 #if defined(HAVE_SSE4_1) | |
| 360 " sse" | |
| 361 #endif | |
| 362 #if defined(HAVE_FRAMEWORK_ACCELERATE) | |
| 363 " accelerate" | |
| 364 #endif | |
| 365 " std::inner_product.\n"); | |
| 366 } | |
| 367 | |
| 368 dotproduct.set_value(dotproduct_method); | |
| 369 } | |
| 370 | |
| 371 } // namespace tesseract |
