comparison mupdf-source/thirdparty/tesseract/sw.cpp @ 2:b50eed0cc0ef upstream

ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4. The directory name has changed: no version number in the expanded directory now.
author Franz Glasner <fzglas.hg@dom66.de>
date Mon, 15 Sep 2025 11:43:07 +0200
parents
children
comparison
equal deleted inserted replaced
1:1d09e1dec1d9 2:b50eed0cc0ef
// Build description entry point (sw build-system script, judging by the
// SW_* macro and the org.sw.* dependency names used below).
// Declares the "google.tesseract" project on the given Solution and adds:
//   - the libtesseract library,
//   - the tesseract and svpaint executables,
//   - the training libraries (common, unicharset, pango) and training tools,
//   - the unit-test targets, but only when the "with-tests" external
//     variable is set.
1 void build(Solution &s)
2 {
3 auto &tess = s.addProject("google.tesseract", "main");
// Upstream Git repository. Presumably "{v}" is a tag template expanded from
// the package version - TODO confirm against the sw Git() documentation.
4 tess += Git("https://github.com/tesseract-ocr/tesseract", "", "{v}");
5
// Language standard added to every target declared in this function.
6 auto cppstd = cpp17;
7
8 auto &libtesseract = tess.addTarget<LibraryTarget>("libtesseract");
9 {
// Ties this target to the "libtesseract" check set declared in check().
10 libtesseract.setChecks("libtesseract");
11
12 libtesseract.PackageDefinitions = true;
13
14 libtesseract += cppstd;
15
// Sources: everything under include/ and under src/<subdir>/, minus the
// src/lstm *.cc files and all of src/training (the training code is built
// by the separate training targets further down).
16 libtesseract += "TESS_API"_api;
17 libtesseract += "include/.*"_rr;
18 libtesseract += "src/.+/.*"_rr;
19 libtesseract -= "src/lstm/.*\\.cc"_rr;
20 libtesseract -= "src/training/.*"_rr;
21
// include/ is the public include directory; the src/* subdirectories are
// added with Protected visibility.
22 libtesseract.Public += "include"_idir;
23 libtesseract.Protected +=
24 "src/ccmain"_id,
25 "src/api"_id,
26 "src/dict"_id,
27 "src/viewer"_id,
28 "src/wordrec"_id,
29 "src/ccstruct"_id,
30 "src/cutil"_id,
31 "src/textord"_id,
32 "src/ccutil"_id,
33 "src/lstm"_id,
34 "src/classify"_id,
35 "src/arch"_id,
36 "src/training"_id;
37
// MSVC and clang-cl only: define __SSE4_1__ and pass -arch:AVX2 for the
// whole target.
38 if (libtesseract.getCompilerType() == CompilerType::MSVC ||
39 libtesseract.getCompilerType() == CompilerType::ClangCl)
40 {
41 libtesseract += "__SSE4_1__"_def;
42 libtesseract.CompileOptions.push_back("-arch:AVX2");
43
// The OpenMP block below is hard-disabled by `if (0)`; the option-based
// condition it replaced is left commented out.
44 // openmp
45 //if (libtesseract.getOptions()["openmp"] == "true")
46 if (0)
47 {
48 if (libtesseract.getCompilerType() == CompilerType::MSVC)
49 libtesseract.CompileOptions.push_back("-openmp");
50 else
51 libtesseract.CompileOptions.push_back("-fopenmp");
52 libtesseract += "_OPENMP=201107"_def;
53 if (libtesseract.getBuildSettings().Native.ConfigurationType == ConfigurationType::Debug)
54 libtesseract += "vcompd.lib"_slib;
55 else
56 libtesseract += "vcomp.lib"_slib;
57 }
58 }
59
// True when the target OS is Windows or MinGW; used below to pick
// platform-specific libraries and definitions.
60 auto win_or_mingw =
61 libtesseract.getBuildSettings().TargetOS.Type == OSType::Windows ||
62 libtesseract.getBuildSettings().TargetOS.Type == OSType::Mingw
63 ;
64
// Both files are removed here unconditionally; the NEON file is added back
// for aarch64 targets further down, the FMA file is never re-added.
65 // check fma flags
66 libtesseract -= "src/arch/dotproductfma.cpp";
67 // check arch (arm)
68 libtesseract -= "src/arch/dotproductneon.cpp";
69
// Per-file SIMD flags for targets that are neither Windows nor aarch64.
// Note the condition tests Windows only, so MinGW targets get these flags.
70 if (libtesseract.getBuildSettings().TargetOS.Type != OSType::Windows &&
71 libtesseract.getBuildSettings().TargetOS.Arch != ArchType::aarch64)
72 {
73 libtesseract["src/arch/dotproductavx.cpp"].args.push_back("-mavx");
74 libtesseract["src/arch/dotproductavx512.cpp"].args.push_back("-mavx512f");
75 libtesseract["src/arch/dotproductsse.cpp"].args.push_back("-msse4.1");
76 libtesseract["src/arch/intsimdmatrixsse.cpp"].args.push_back("-msse4.1");
77 libtesseract["src/arch/intsimdmatrixavx2.cpp"].args.push_back("-mavx2");
78 }
// Link pthread on non-Windows/MinGW targets. The Android exclusion is only
// compiled in when SW_MODULE_ABI_VERSION > 29; otherwise the `+=` below is
// the sole statement of the block and runs unconditionally.
79 if (!win_or_mingw)
80 {
81 #if SW_MODULE_ABI_VERSION > 29
82 if (!libtesseract.getBuildSettings().TargetOS.Android)
83 #endif
84 libtesseract += "pthread"_slib;
85 }
86 if (libtesseract.getBuildSettings().TargetOS.Arch == ArchType::aarch64)
87 {
88 libtesseract += "src/arch/dotproductneon.cpp";
89 }
90
// Public definitions: visibility is Public, so they are declared for
// dependants as well as for this target.
91 libtesseract.Public += "HAVE_CONFIG_H"_d;
92 libtesseract.Public += "_SILENCE_STDEXT_HASH_DEPRECATION_WARNINGS=1"_d;
93 libtesseract.Public += "HAVE_LIBARCHIVE"_d;
94
// Public package dependencies: leptonica and libarchive.
95 libtesseract.Public += "org.sw.demo.danbloomberg.leptonica"_dep;
96 libtesseract.Public += "org.sw.demo.libarchive.libarchive"_dep;
97
98 if (win_or_mingw)
99 {
100 libtesseract.Public += "ws2_32.lib"_slib;
101 libtesseract.Protected += "NOMINMAX"_def;
102 }
103
104 if (libtesseract.getCompilerType() == CompilerType::MSVC)
105 libtesseract.Protected.CompileOptions.push_back("-utf-8");
106
// Copy the PACKAGE_* version variables into TESSERACT_* ones, fix the version
// string to "master", then run configureFile on the version.h.in template
// (presumably producing tesseract/version.h - confirm against sw docs).
107 libtesseract.Variables["TESSERACT_MAJOR_VERSION"] = libtesseract.Variables["PACKAGE_MAJOR_VERSION"];
108 libtesseract.Variables["TESSERACT_MINOR_VERSION"] = libtesseract.Variables["PACKAGE_MINOR_VERSION"];
109 libtesseract.Variables["TESSERACT_MICRO_VERSION"] = libtesseract.Variables["PACKAGE_PATCH_VERSION"];
110 libtesseract.Variables["TESSERACT_VERSION_STR"] = "master";
111 libtesseract.configureFile("include/tesseract/version.h.in", "tesseract/version.h");
112 }
113
// Two executables, each built from a single source file plus libtesseract.
114 //
115 auto &tesseract = tess.addExecutable("tesseract");
116 {
117 tesseract += cppstd;
118 tesseract += "src/tesseract.cpp";
119 tesseract += libtesseract;
120 }
121
122 auto &svpaint = tess.addExecutable("svpaint");
123 {
124 svpaint += cppstd;
125 svpaint += "src/svpaint.cpp";
126 svpaint += libtesseract;
127 }
128
// All training libraries and tools are grouped under the "training" directory.
129 auto &training = tess.addDirectory("training");
130
// common_training: sources from src/training/common, publicly depends on
// libtesseract.
131 //
132 auto &common_training = training.addLibrary("common_training");
133 {
134 common_training += "TESS_COMMON_TRAINING_API"_api;
135 common_training += cppstd;
136 common_training += "src/training/common/.*"_rr;
137 common_training.Public += "src/training/common"_idir;
138 common_training.Public += libtesseract;
139 }
140
// unicharset_training: sources from src/training/unicharset, publicly depends
// on common_training and ICU i18n.
141 //
142 auto &unicharset_training = training.addLibrary("unicharset_training");
143 {
144 unicharset_training += "TESS_UNICHARSET_TRAINING_API"_api;
145 unicharset_training += cppstd;
146 unicharset_training += "src/training/unicharset/.*"_rr;
147 unicharset_training.Public += "src/training/unicharset"_idir;
148 unicharset_training.Public += common_training;
149 unicharset_training.Public += "org.sw.demo.unicode.icu.i18n"_dep;
150
// Local win_or_mingw shadows the one declared for libtesseract above.
// NOTE(review): unlike libtesseract, pthread is added here without the
// Android exclusion.
151 auto win_or_mingw =
152 unicharset_training.getBuildSettings().TargetOS.Type == OSType::Windows ||
153 unicharset_training.getBuildSettings().TargetOS.Type == OSType::Mingw
154 ;
155 if (!win_or_mingw)
156 unicharset_training += "pthread"_slib;
157 }
158
// ADD_EXE(name, deps...): declares a training executable called `name` (also
// the name of the local reference variable it introduces), adds the sources
// matching src/training/<name>.* and adds `deps` with Public visibility.
// The expansion ends with the bare identifier `name`, so a call can be
// followed by further operators, as done for mftraining below.
159 //
160 #define ADD_EXE(n, ...) \
161 auto &n = training.addExecutable(#n); \
162 n += cppstd; \
163 n += "src/training/" #n ".*"_rr; \
164 n.Public += __VA_ARGS__; \
165 n
166
167 ADD_EXE(ambiguous_words, common_training);
168 ADD_EXE(classifier_tester, common_training);
169 ADD_EXE(combine_lang_model, unicharset_training);
170 ADD_EXE(combine_tessdata, common_training);
171 ADD_EXE(cntraining, common_training);
172 ADD_EXE(dawg2wordlist, common_training);
173 ADD_EXE(mftraining, common_training) += "src/training/mergenf.*"_rr;
174 ADD_EXE(shapeclustering, common_training);
175 ADD_EXE(unicharset_extractor, unicharset_training);
176 ADD_EXE(wordlist2dawg, common_training);
177 ADD_EXE(lstmeval, unicharset_training);
178 ADD_EXE(lstmtraining, unicharset_training);
179 ADD_EXE(set_unicharset_properties, unicharset_training);
180 ADD_EXE(merge_unicharsets, common_training);
181
// pango_training: sources from src/training/pango, publicly depends on
// unicharset_training and pangocairo.
182 //
183 auto &pango_training = training.addLibrary("pango_training");
184 {
185 pango_training += "TESS_PANGO_TRAINING_API"_api;
186 pango_training += cppstd;
187 pango_training += "src/training/pango/.*"_rr;
188 pango_training.Public += "src/training/pango"_idir;
189 pango_training.Public += unicharset_training;
190 pango_training.Public += "org.sw.demo.gnome.pango.pangocairo"_dep;
191 }
192
// text2image: the braces after the ADD_EXE statement are a plain scope that
// adds cppstd again and lists the extra source files explicitly.
193 ADD_EXE(text2image, pango_training);
194 {
195 text2image += cppstd;
196 text2image +=
197 "src/training/degradeimage.cpp",
198 "src/training/degradeimage.h",
199 "src/training/text2image.cpp"
200 ;
201 }
202
// Everything below declares test targets; return early unless the
// "with-tests" external variable is set.
203 if (!s.getExternalVariables()["with-tests"])
204 return;
205
206 // tests
207 {
208 auto &test = tess.addDirectory("test");
209 test.Scope = TargetScope::Test;
210
// Optional "skip-tests" external variable: a comma-separated list whose
// entries are later used as regular expressions against test names.
211 String skipped_tests_str;
212 if (s.getExternalVariables()["skip-tests"])
213 skipped_tests_str = s.getExternalVariables()["skip-tests"].getValue();
214 auto skipped_tests = split_string(skipped_tests_str, ",");
215
// add_test(name): declares the test executable `name` from the files
// matching unittest/<name>_test.*, sets the directory definitions the tests
// read, adds the shared dependencies, registers it through
// libtesseract.addTest and returns the new target by reference.
216 auto add_test = [&test, &s, &cppstd, &libtesseract, &pango_training, &skipped_tests](const String &name) -> decltype(auto)
217 {
218 auto &t = test.addTarget<ExecutableTarget>(name);
219 t += cppstd;
220 t += FileRegex("unittest", name + "_test.*", false);
221 t += "unittest"_idir;
222
223 t += "SW_TESTING"_def;
224
// Test data root: <test source dir>/tessdata_unittest by default, or the
// "test-data-dir" external variable resolved against the current working
// directory. TESSBIN_DIR is defined as an empty string.
225 auto datadir = test.SourceDir / "tessdata_unittest";
226 if (s.getExternalVariables()["test-data-dir"])
227 datadir = fs::current_path() / s.getExternalVariables()["test-data-dir"].getValue();
228 t += Definition("TESSBIN_DIR=\"" + ""s + "\"");
229
230 t += Definition("TESTING_DIR=\"" + to_printable_string(normalize_path(test.SourceDir / "test/testing")) + "\"");
231 t += Definition("TESTDATA_DIR=\"" + to_printable_string(normalize_path(test.SourceDir / "test/testdata")) + "\"");
232
233 t += Definition("LANGDATA_DIR=\"" + to_printable_string(normalize_path(datadir / "langdata_lstm")) + "\"");
234 t += Definition("TESSDATA_DIR=\"" + to_printable_string(normalize_path(datadir / "tessdata")) + "\"");
235 t += Definition("TESSDATA_BEST_DIR=\"" + to_printable_string(normalize_path(datadir / "tessdata_best")) + "\"");
236
237 // we push all deps to all tests simplify things
238 t += pango_training;
239 t += "org.sw.demo.google.googletest.gmock.main"_dep;
240 t += "org.sw.demo.google.googletest.gtest.main"_dep;
241
242 if (t.getCompilerType() == CompilerType::MSVC)
243 t.CompileOptions.push_back("-utf-8");
244
245 auto win_or_mingw =
246 t.getBuildSettings().TargetOS.Type == OSType::Windows ||
247 t.getBuildSettings().TargetOS.Type == OSType::Mingw
248 ;
249 if (!win_or_mingw)
250 t += "pthread"_slib;
251
// Register the test, then mark it skipped if any skip pattern matches the
// entire test name (std::regex_match requires a full match).
252 auto tst = libtesseract.addTest(t, name);
253 for (auto &st : skipped_tests)
254 {
255 std::regex r(st);
256 if (std::regex_match(name, r))
257 {
258 tst.skip(true);
259 break;
260 }
261 }
262
263 return t;
264 };
265
// Tests that need nothing beyond what add_test sets up.
266 Strings tests
267 {
268 "apiexample",
269 "applybox",
270 "baseapi",
271 "baseapi_thread",
272 "bitvector",
273 "capiexample",
274 "capiexample_c",
275 "cleanapi",
276 "colpartition",
277 "commandlineflags",
278 "denorm",
279 "equationdetect",
280 "fileio",
281 "heap",
282 "imagedata",
283 "indexmapbidi",
284 "intfeaturemap",
285 "intsimdmatrix",
286 "lang_model",
287 "layout",
288 "ligature_table",
289 "linlsq",
290 "list",
291 "lstm_recode",
292 "lstm_squashed",
293 "lstm",
294 "lstmtrainer",
295 "loadlang",
296 "mastertrainer",
297 "matrix",
298 "networkio",
299 "normstrngs",
300 "nthitem",
301 "osd",
302 "pagesegmode",
303 "pango_font_info",
304 "paragraphs",
305 "params_model",
306 "progress",
307 "qrsequence",
308 "recodebeam",
309 "rect",
310 "resultiterator",
311 "scanutils",
312 "shapetable",
313 "stats",
314 "stringrenderer",
315 "stridemap",
316 "tablefind",
317 "tablerecog",
318 "tabvector",
319 "textlineprojection",
320 "tfile",
321 "unichar",
322 "unicharcompress",
323 "unicharset",
324 "validate_grapheme",
325 "validate_indic",
326 "validate_khmer",
327 "validate_myanmar",
328 "validator",
329 };
// NOTE(review): the loop variable is taken by value, so each name is copied;
// a const reference would avoid that.
330 for (auto t : tests)
331 add_test(t);
// The dawg test additionally receives the output-file paths of the
// wordlist2dawg and dawg2wordlist tools as definitions.
332 auto &dt = add_test("dawg");
333 dt += Definition("wordlist2dawg_prog=\"" + to_printable_string(normalize_path(wordlist2dawg.getOutputFile())) + "\"");
334 dt += Definition("dawg2wordlist_prog=\"" + to_printable_string(normalize_path(dawg2wordlist.getOutputFile())) + "\"");
335
// The tatweel test also pulls in unittest/util and unittest/third_party,
// excluding the googletest copy under third_party.
336 auto &tw = add_test("tatweel");
337 tw += "unittest/util/.*"_rr;
338 tw += "unittest/third_party/.*"_rr;
339 tw -= "unittest/third_party/googletest/.*"_rr;
340 }
341 }
342
343 void check(Checker &c)
344 {
345 auto &s = c.addSet("libtesseract");
346 s.checkFunctionExists("getline");
347 s.checkIncludeExists("dlfcn.h");
348 s.checkIncludeExists("inttypes.h");
349 s.checkIncludeExists("memory.h");
350 s.checkIncludeExists("stdint.h");
351 s.checkIncludeExists("stdlib.h");
352 s.checkIncludeExists("string.h");
353 s.checkIncludeExists("sys/stat.h");
354 s.checkIncludeExists("sys/types.h");
355 s.checkIncludeExists("tiffio.h");
356 s.checkIncludeExists("unistd.h");
357 s.checkTypeSize("long long int");
358 s.checkTypeSize("size_t");
359 s.checkTypeSize("void *");
360 s.checkTypeSize("wchar_t");
361 {
362 auto &c = s.checkSymbolExists("snprintf");
363 c.Parameters.Includes.push_back("stdio.h");
364 }
365 }
366