comparison mupdf-source/thirdparty/tesseract/src/arch/simddetect.cpp @ 2:b50eed0cc0ef upstream

ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4. The directory name has changed: no version number in the expanded directory now.
author Franz Glasner <fzglas.hg@dom66.de>
date Mon, 15 Sep 2025 11:43:07 +0200
parents
children
comparison
equal deleted inserted replaced
1:1d09e1dec1d9 2:b50eed0cc0ef
1 ///////////////////////////////////////////////////////////////////////
2 // File: simddetect.cpp
3 // Description: Architecture detector.
4 // Author: Stefan Weil (based on code from Ray Smith)
5 //
6 // (C) Copyright 2014, Google Inc.
7 // Licensed under the Apache License, Version 2.0 (the "License");
8 // you may not use this file except in compliance with the License.
9 // You may obtain a copy of the License at
10 // http://www.apache.org/licenses/LICENSE-2.0
11 // Unless required by applicable law or agreed to in writing, software
12 // distributed under the License is distributed on an "AS IS" BASIS,
13 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 // See the License for the specific language governing permissions and
15 // limitations under the License.
16 ///////////////////////////////////////////////////////////////////////
17
18 #ifdef HAVE_CONFIG_H
19 # include "config_auto.h" // for HAVE_AVX, ...
20 #endif
21 #include <numeric> // for std::inner_product
22 #include "dotproduct.h"
23 #include "intsimdmatrix.h" // for IntSimdMatrix
24 #include "params.h" // for STRING_VAR
25 #include "simddetect.h"
26 #include "tprintf.h" // for tprintf
27
28 #if !defined(__clang__) && defined(__GNUC__) && (__GNUC__ < 12)
29 // The GNU compiler g++ fails to compile with the Accelerate framework
30 // (tested with versions 10 and 11), so unconditionally disable it.
31 #undef HAVE_FRAMEWORK_ACCELERATE
32 #endif
33
34 #if defined(HAVE_FRAMEWORK_ACCELERATE)
35
36 // Use Apple Accelerate framework.
37 // https://developer.apple.com/documentation/accelerate/simd
38
39 #include <Accelerate/Accelerate.h>
40
41 #endif
42
43 #if defined(HAVE_AVX) || defined(HAVE_AVX2) || defined(HAVE_FMA) || defined(HAVE_SSE4_1)
44 // See https://en.wikipedia.org/wiki/CPUID.
45 # define HAS_CPUID
46 #endif
47
48 #if defined(HAS_CPUID)
49 # if defined(__GNUC__)
50 # include <cpuid.h>
51 # elif defined(_WIN32)
52 # include <intrin.h>
53 # endif
54 #endif
55
56 #if defined(HAVE_NEON) && !defined(__aarch64__)
57 # if defined(HAVE_ANDROID_GETCPUFAMILY)
58 # include <cpu-features.h>
59 # elif defined(HAVE_GETAUXVAL)
60 # include <asm/hwcap.h>
61 # include <sys/auxv.h>
62 # elif defined(HAVE_ELF_AUX_INFO)
63 # include <sys/auxv.h>
64 # include <sys/elf.h>
65 # endif
66 #endif
67
68 #if defined(HAVE_RVV)
69 # if defined(HAVE_GETAUXVAL)
70 # include <sys/auxv.h>
71 # define HWCAP_RV(letter) (1ul << ((letter) - 'A'))
72 # endif
73 #endif
74
75 namespace tesseract {
76
77 // Computes and returns the dot product of the two n-vectors u and v.
78 // Note: because the order of addition is different among the different dot
79 // product functions, the results can (and do) vary slightly (although they
80 // agree to within about 4e-15). This produces different results when running
81 // training, despite all random inputs being precisely equal.
82 // To get consistent results, use just one of these dot product functions.
83 // On a test multi-layer network, serial is 57% slower than SSE, and AVX
84 // is about 8% faster than SSE. This suggests that the time is memory
85 // bandwidth constrained and could benefit from holding the reused vector
86 // in AVX registers.
87 DotProductFunction DotProduct;
88
89 static STRING_VAR(dotproduct, "auto", "Function used for calculation of dot product");
90
91 SIMDDetect SIMDDetect::detector;
92
93 #if defined(__aarch64__)
94 // ARMv8 always has NEON.
95 bool SIMDDetect::neon_available_ = true;
96 #elif defined(HAVE_NEON)
97 // If true, then Neon has been detected.
98 bool SIMDDetect::neon_available_;
99 #elif defined(HAVE_RVV)
100 bool SIMDDetect::rvv_available_;
101 #else
102 // If true, then AVX has been detected.
103 bool SIMDDetect::avx_available_;
104 bool SIMDDetect::avx2_available_;
105 bool SIMDDetect::avx512F_available_;
106 bool SIMDDetect::avx512BW_available_;
107 bool SIMDDetect::avx512VNNI_available_;
108 // If true, then FMA has been detected.
109 bool SIMDDetect::fma_available_;
110 // If true, then SSe4.1 has been detected.
111 bool SIMDDetect::sse_available_;
112 #endif
113
#if defined(HAVE_FRAMEWORK_ACCELERATE)
// Computes and returns the dot product of the two n-vectors u and v using
// the Apple Accelerate framework: vDSP_dotpr when FAST_FLOAT is defined,
// vDSP_dotprD otherwise. Both vectors are read with a stride of 1.
static TFloat DotProductAccelerate(const TFloat* u, const TFloat* v, int n) {
  const int unit_stride = 1;
  TFloat result = 0;
#  if defined(FAST_FLOAT)
  vDSP_dotpr(u, unit_stride, v, unit_stride, &result, n);
#  else
  vDSP_dotprD(u, unit_stride, v, unit_stride, &result, n);
#  endif
  return result;
}
#endif
126
127 // Computes and returns the dot product of the two n-vectors u and v.
128 static TFloat DotProductGeneric(const TFloat *u, const TFloat *v, int n) {
129 TFloat total = 0;
130 for (int k = 0; k < n; ++k) {
131 total += u[k] * v[k];
132 }
133 return total;
134 }
135
136 // Compute dot product using std::inner_product.
137 static TFloat DotProductStdInnerProduct(const TFloat *u, const TFloat *v, int n) {
138 return std::inner_product(u, u + n, v, static_cast<TFloat>(0));
139 }
140
141 static void SetDotProduct(DotProductFunction f, const IntSimdMatrix *m = nullptr) {
142 DotProduct = f;
143 IntSimdMatrix::intSimdMatrix = m;
144 }
145
146 // Constructor.
147 // Tests the architecture in a system-dependent way to detect AVX, SSE and
148 // any other available SIMD equipment.
149 // __GNUC__ is also defined by compilers that include GNU extensions such as
150 // clang.
151 SIMDDetect::SIMDDetect() {
152 // The fallback is a generic dot product calculation.
153 SetDotProduct(DotProductGeneric);
154
155 #if defined(HAS_CPUID)
156 # if defined(__GNUC__)
157 unsigned int eax, ebx, ecx, edx;
158 if (__get_cpuid(1, &eax, &ebx, &ecx, &edx) != 0) {
159 // Note that these tests all use hex because the older compilers don't have
160 // the newer flags.
161 # if defined(HAVE_SSE4_1)
162 sse_available_ = (ecx & 0x00080000) != 0;
163 # endif
164 # if defined(HAVE_AVX) || defined(HAVE_AVX2) || defined(HAVE_FMA)
165 auto xgetbv = []() {
166 uint32_t xcr0;
167 __asm__("xgetbv" : "=a"(xcr0) : "c"(0) : "%edx");
168 return xcr0;
169 };
170 if ((ecx & 0x08000000) && ((xgetbv() & 6) == 6)) {
171 // OSXSAVE bit is set, XMM state and YMM state are fine.
172 # if defined(HAVE_FMA)
173 fma_available_ = (ecx & 0x00001000) != 0;
174 # endif
175 # if defined(HAVE_AVX)
176 avx_available_ = (ecx & 0x10000000) != 0;
177 if (avx_available_) {
178 // There is supposed to be a __get_cpuid_count function, but this is all
179 // there is in my cpuid.h. It is a macro for an asm statement and cannot
180 // be used inside an if.
181 __cpuid_count(7, 0, eax, ebx, ecx, edx);
182 avx2_available_ = (ebx & 0x00000020) != 0;
183 avx512F_available_ = (ebx & 0x00010000) != 0;
184 avx512BW_available_ = (ebx & 0x40000000) != 0;
185 avx512VNNI_available_ = (ecx & 0x00000800) != 0;
186 }
187 # endif
188 }
189 # endif
190 }
191 # elif defined(_WIN32)
192 int cpuInfo[4];
193 int max_function_id;
194 __cpuid(cpuInfo, 0);
195 max_function_id = cpuInfo[0];
196 if (max_function_id >= 1) {
197 __cpuid(cpuInfo, 1);
198 # if defined(HAVE_SSE4_1)
199 sse_available_ = (cpuInfo[2] & 0x00080000) != 0;
200 # endif
201 # if defined(HAVE_AVX) || defined(HAVE_AVX2) || defined(HAVE_FMA)
202 if ((cpuInfo[2] & 0x08000000) && ((_xgetbv(0) & 6) == 6)) {
203 // OSXSAVE bit is set, XMM state and YMM state are fine.
204 # if defined(HAVE_FMA)
205 fma_available_ = (cpuInfo[2] & 0x00001000) != 0;
206 # endif
207 # if defined(HAVE_AVX)
208 avx_available_ = (cpuInfo[2] & 0x10000000) != 0;
209 # endif
210 # if defined(HAVE_AVX2)
211 if (max_function_id >= 7) {
212 __cpuid(cpuInfo, 7);
213 avx2_available_ = (cpuInfo[1] & 0x00000020) != 0;
214 avx512F_available_ = (cpuInfo[1] & 0x00010000) != 0;
215 avx512BW_available_ = (cpuInfo[1] & 0x40000000) != 0;
216 avx512VNNI_available_ = (cpuInfo[2] & 0x00000800) != 0;
217 }
218 # endif
219 }
220 # endif
221 }
222 # else
223 # error "I don't know how to test for SIMD with this compiler"
224 # endif
225 #endif
226
227 #if defined(HAVE_NEON) && !defined(__aarch64__)
228 # if defined(HAVE_ANDROID_GETCPUFAMILY)
229 {
230 AndroidCpuFamily family = android_getCpuFamily();
231 if (family == ANDROID_CPU_FAMILY_ARM)
232 neon_available_ = (android_getCpuFeatures() & ANDROID_CPU_ARM_FEATURE_NEON);
233 }
234 # elif defined(HAVE_GETAUXVAL)
235 neon_available_ = getauxval(AT_HWCAP) & HWCAP_NEON;
236 # elif defined(HAVE_ELF_AUX_INFO)
237 unsigned long hwcap = 0;
238 elf_aux_info(AT_HWCAP, &hwcap, sizeof hwcap);
239 neon_available_ = hwcap & HWCAP_NEON;
240 # endif
241 #endif
242
243 #if defined(HAVE_RVV)
244 # if defined(HAVE_GETAUXVAL)
245 const unsigned long hwcap = getauxval(AT_HWCAP);
246 rvv_available_ = hwcap & HWCAP_RV('V');
247 # endif
248 #endif
249
250 // Select code for calculation of dot product based on autodetection.
251 if (false) {
252 // This is a dummy to support conditional compilation.
253 #if defined(HAVE_AVX512F)
254 } else if (avx512F_available_) {
255 // AVX512F detected.
256 SetDotProduct(DotProductAVX512F, &IntSimdMatrix::intSimdMatrixAVX2);
257 #endif
258 #if defined(HAVE_AVX2)
259 } else if (avx2_available_) {
260 // AVX2 detected.
261 SetDotProduct(DotProductAVX, &IntSimdMatrix::intSimdMatrixAVX2);
262 #endif
263 #if defined(HAVE_AVX)
264 } else if (avx_available_) {
265 // AVX detected.
266 SetDotProduct(DotProductAVX, &IntSimdMatrix::intSimdMatrixSSE);
267 #endif
268 #if defined(HAVE_SSE4_1)
269 } else if (sse_available_) {
270 // SSE detected.
271 SetDotProduct(DotProductSSE, &IntSimdMatrix::intSimdMatrixSSE);
272 #endif
273 #if defined(HAVE_NEON) || defined(__aarch64__)
274 } else if (neon_available_) {
275 // NEON detected.
276 SetDotProduct(DotProductNEON, &IntSimdMatrix::intSimdMatrixNEON);
277 #endif
278 #if defined(HAVE_RVV)
279 } else if (rvv_available_) {
280 SetDotProduct(DotProductGeneric, &IntSimdMatrix::intSimdMatrixRVV);
281 #endif
282 }
283
284 const char *dotproduct_env = getenv("DOTPRODUCT");
285 if (dotproduct_env != nullptr) {
286 // Override automatic settings by value from environment variable.
287 dotproduct = dotproduct_env;
288 Update();
289 }
290 }
291
292 void SIMDDetect::Update() {
293 // Select code for calculation of dot product based on the
294 // value of the config variable if that value is not empty.
295 const char *dotproduct_method = "generic";
296 if (dotproduct == "auto") {
297 // Automatic detection. Nothing to be done.
298 } else if (dotproduct == "generic") {
299 // Generic code selected by config variable.
300 SetDotProduct(DotProductGeneric);
301 dotproduct_method = "generic";
302 } else if (dotproduct == "native") {
303 // Native optimized code selected by config variable.
304 SetDotProduct(DotProductNative, IntSimdMatrix::intSimdMatrix);
305 dotproduct_method = "native";
306 #if defined(HAVE_AVX2)
307 } else if (dotproduct == "avx2") {
308 // AVX2 selected by config variable.
309 SetDotProduct(DotProductAVX, &IntSimdMatrix::intSimdMatrixAVX2);
310 dotproduct_method = "avx2";
311 #endif
312 #if defined(HAVE_AVX)
313 } else if (dotproduct == "avx") {
314 // AVX selected by config variable.
315 SetDotProduct(DotProductAVX, &IntSimdMatrix::intSimdMatrixSSE);
316 dotproduct_method = "avx";
317 #endif
318 #if defined(HAVE_FMA)
319 } else if (dotproduct == "fma") {
320 // FMA selected by config variable.
321 SetDotProduct(DotProductFMA, IntSimdMatrix::intSimdMatrix);
322 dotproduct_method = "fma";
323 #endif
324 #if defined(HAVE_SSE4_1)
325 } else if (dotproduct == "sse") {
326 // SSE selected by config variable.
327 SetDotProduct(DotProductSSE, &IntSimdMatrix::intSimdMatrixSSE);
328 dotproduct_method = "sse";
329 #endif
330 #if defined(HAVE_FRAMEWORK_ACCELERATE)
331 } else if (dotproduct == "accelerate") {
332 SetDotProduct(DotProductAccelerate, IntSimdMatrix::intSimdMatrix);
333 #endif
334 #if defined(HAVE_NEON) || defined(__aarch64__)
335 } else if (dotproduct == "neon" && neon_available_) {
336 // NEON selected by config variable.
337 SetDotProduct(DotProductNEON, &IntSimdMatrix::intSimdMatrixNEON);
338 dotproduct_method = "neon";
339 #endif
340 } else if (dotproduct == "std::inner_product") {
341 // std::inner_product selected by config variable.
342 SetDotProduct(DotProductStdInnerProduct, IntSimdMatrix::intSimdMatrix);
343 dotproduct_method = "std::inner_product";
344 } else {
345 // Unsupported value of config variable.
346 tprintf("Warning, ignoring unsupported config variable value: dotproduct=%s\n",
347 dotproduct.c_str());
348 tprintf(
349 "Supported values for dotproduct: auto generic native"
350 #if defined(HAVE_AVX2)
351 " avx2"
352 #endif
353 #if defined(HAVE_AVX)
354 " avx"
355 #endif
356 #if defined(HAVE_FMA)
357 " fma"
358 #endif
359 #if defined(HAVE_SSE4_1)
360 " sse"
361 #endif
362 #if defined(HAVE_FRAMEWORK_ACCELERATE)
363 " accelerate"
364 #endif
365 " std::inner_product.\n");
366 }
367
368 dotproduct.set_value(dotproduct_method);
369 }
370
371 } // namespace tesseract