Mercurial > hgrepos > Python2 > PyMuPDF
diff mupdf-source/thirdparty/tesseract/sw.cpp @ 2:b50eed0cc0ef upstream
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
The directory name has changed: no version number in the expanded directory now.
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Mon, 15 Sep 2025 11:43:07 +0200 |
| parents | |
| children | |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/mupdf-source/thirdparty/tesseract/sw.cpp Mon Sep 15 11:43:07 2025 +0200 @@ -0,0 +1,366 @@ +void build(Solution &s) +{ + auto &tess = s.addProject("google.tesseract", "main"); + tess += Git("https://github.com/tesseract-ocr/tesseract", "", "{v}"); + + auto cppstd = cpp17; + + auto &libtesseract = tess.addTarget<LibraryTarget>("libtesseract"); + { + libtesseract.setChecks("libtesseract"); + + libtesseract.PackageDefinitions = true; + + libtesseract += cppstd; + + libtesseract += "TESS_API"_api; + libtesseract += "include/.*"_rr; + libtesseract += "src/.+/.*"_rr; + libtesseract -= "src/lstm/.*\\.cc"_rr; + libtesseract -= "src/training/.*"_rr; + + libtesseract.Public += "include"_idir; + libtesseract.Protected += + "src/ccmain"_id, + "src/api"_id, + "src/dict"_id, + "src/viewer"_id, + "src/wordrec"_id, + "src/ccstruct"_id, + "src/cutil"_id, + "src/textord"_id, + "src/ccutil"_id, + "src/lstm"_id, + "src/classify"_id, + "src/arch"_id, + "src/training"_id; + + if (libtesseract.getCompilerType() == CompilerType::MSVC || + libtesseract.getCompilerType() == CompilerType::ClangCl) + { + libtesseract += "__SSE4_1__"_def; + libtesseract.CompileOptions.push_back("-arch:AVX2"); + + // openmp + //if (libtesseract.getOptions()["openmp"] == "true") + if (0) + { + if (libtesseract.getCompilerType() == CompilerType::MSVC) + libtesseract.CompileOptions.push_back("-openmp"); + else + libtesseract.CompileOptions.push_back("-fopenmp"); + libtesseract += "_OPENMP=201107"_def; + if (libtesseract.getBuildSettings().Native.ConfigurationType == ConfigurationType::Debug) + libtesseract += "vcompd.lib"_slib; + else + libtesseract += "vcomp.lib"_slib; + } + } + + auto win_or_mingw = + libtesseract.getBuildSettings().TargetOS.Type == OSType::Windows || + libtesseract.getBuildSettings().TargetOS.Type == OSType::Mingw + ; + + // check fma flags + libtesseract -= "src/arch/dotproductfma.cpp"; + // check arch (arm) + libtesseract -= 
"src/arch/dotproductneon.cpp"; + + if (libtesseract.getBuildSettings().TargetOS.Type != OSType::Windows && + libtesseract.getBuildSettings().TargetOS.Arch != ArchType::aarch64) + { + libtesseract["src/arch/dotproductavx.cpp"].args.push_back("-mavx"); + libtesseract["src/arch/dotproductavx512.cpp"].args.push_back("-mavx512f"); + libtesseract["src/arch/dotproductsse.cpp"].args.push_back("-msse4.1"); + libtesseract["src/arch/intsimdmatrixsse.cpp"].args.push_back("-msse4.1"); + libtesseract["src/arch/intsimdmatrixavx2.cpp"].args.push_back("-mavx2"); + } + if (!win_or_mingw) + { +#if SW_MODULE_ABI_VERSION > 29 + if (!libtesseract.getBuildSettings().TargetOS.Android) +#endif + libtesseract += "pthread"_slib; + } + if (libtesseract.getBuildSettings().TargetOS.Arch == ArchType::aarch64) + { + libtesseract += "src/arch/dotproductneon.cpp"; + } + + libtesseract.Public += "HAVE_CONFIG_H"_d; + libtesseract.Public += "_SILENCE_STDEXT_HASH_DEPRECATION_WARNINGS=1"_d; + libtesseract.Public += "HAVE_LIBARCHIVE"_d; + + libtesseract.Public += "org.sw.demo.danbloomberg.leptonica"_dep; + libtesseract.Public += "org.sw.demo.libarchive.libarchive"_dep; + + if (win_or_mingw) + { + libtesseract.Public += "ws2_32.lib"_slib; + libtesseract.Protected += "NOMINMAX"_def; + } + + if (libtesseract.getCompilerType() == CompilerType::MSVC) + libtesseract.Protected.CompileOptions.push_back("-utf-8"); + + libtesseract.Variables["TESSERACT_MAJOR_VERSION"] = libtesseract.Variables["PACKAGE_MAJOR_VERSION"]; + libtesseract.Variables["TESSERACT_MINOR_VERSION"] = libtesseract.Variables["PACKAGE_MINOR_VERSION"]; + libtesseract.Variables["TESSERACT_MICRO_VERSION"] = libtesseract.Variables["PACKAGE_PATCH_VERSION"]; + libtesseract.Variables["TESSERACT_VERSION_STR"] = "master"; + libtesseract.configureFile("include/tesseract/version.h.in", "tesseract/version.h"); + } + + // + auto &tesseract = tess.addExecutable("tesseract"); + { + tesseract += cppstd; + tesseract += "src/tesseract.cpp"; + tesseract += 
libtesseract; + } + + auto &svpaint = tess.addExecutable("svpaint"); + { + svpaint += cppstd; + svpaint += "src/svpaint.cpp"; + svpaint += libtesseract; + } + + auto &training = tess.addDirectory("training"); + + // + auto &common_training = training.addLibrary("common_training"); + { + common_training += "TESS_COMMON_TRAINING_API"_api; + common_training += cppstd; + common_training += "src/training/common/.*"_rr; + common_training.Public += "src/training/common"_idir; + common_training.Public += libtesseract; + } + + // + auto &unicharset_training = training.addLibrary("unicharset_training"); + { + unicharset_training += "TESS_UNICHARSET_TRAINING_API"_api; + unicharset_training += cppstd; + unicharset_training += "src/training/unicharset/.*"_rr; + unicharset_training.Public += "src/training/unicharset"_idir; + unicharset_training.Public += common_training; + unicharset_training.Public += "org.sw.demo.unicode.icu.i18n"_dep; + + auto win_or_mingw = + unicharset_training.getBuildSettings().TargetOS.Type == OSType::Windows || + unicharset_training.getBuildSettings().TargetOS.Type == OSType::Mingw + ; + if (!win_or_mingw) + unicharset_training += "pthread"_slib; + } + + // +#define ADD_EXE(n, ...) 
\ + auto &n = training.addExecutable(#n); \ + n += cppstd; \ + n += "src/training/" #n ".*"_rr; \ + n.Public += __VA_ARGS__; \ + n + + ADD_EXE(ambiguous_words, common_training); + ADD_EXE(classifier_tester, common_training); + ADD_EXE(combine_lang_model, unicharset_training); + ADD_EXE(combine_tessdata, common_training); + ADD_EXE(cntraining, common_training); + ADD_EXE(dawg2wordlist, common_training); + ADD_EXE(mftraining, common_training) += "src/training/mergenf.*"_rr; + ADD_EXE(shapeclustering, common_training); + ADD_EXE(unicharset_extractor, unicharset_training); + ADD_EXE(wordlist2dawg, common_training); + ADD_EXE(lstmeval, unicharset_training); + ADD_EXE(lstmtraining, unicharset_training); + ADD_EXE(set_unicharset_properties, unicharset_training); + ADD_EXE(merge_unicharsets, common_training); + + // + auto &pango_training = training.addLibrary("pango_training"); + { + pango_training += "TESS_PANGO_TRAINING_API"_api; + pango_training += cppstd; + pango_training += "src/training/pango/.*"_rr; + pango_training.Public += "src/training/pango"_idir; + pango_training.Public += unicharset_training; + pango_training.Public += "org.sw.demo.gnome.pango.pangocairo"_dep; + } + + ADD_EXE(text2image, pango_training); + { + text2image += cppstd; + text2image += + "src/training/degradeimage.cpp", + "src/training/degradeimage.h", + "src/training/text2image.cpp" + ; + } + + if (!s.getExternalVariables()["with-tests"]) + return; + + // tests + { + auto &test = tess.addDirectory("test"); + test.Scope = TargetScope::Test; + + String skipped_tests_str; + if (s.getExternalVariables()["skip-tests"]) + skipped_tests_str = s.getExternalVariables()["skip-tests"].getValue(); + auto skipped_tests = split_string(skipped_tests_str, ","); + + auto add_test = [&test, &s, &cppstd, &libtesseract, &pango_training, &skipped_tests](const String &name) -> decltype(auto) + { + auto &t = test.addTarget<ExecutableTarget>(name); + t += cppstd; + t += FileRegex("unittest", name + "_test.*", false); + 
t += "unittest"_idir; + + t += "SW_TESTING"_def; + + auto datadir = test.SourceDir / "tessdata_unittest"; + if (s.getExternalVariables()["test-data-dir"]) + datadir = fs::current_path() / s.getExternalVariables()["test-data-dir"].getValue(); + t += Definition("TESSBIN_DIR=\"" + ""s + "\""); + + t += Definition("TESTING_DIR=\"" + to_printable_string(normalize_path(test.SourceDir / "test/testing")) + "\""); + t += Definition("TESTDATA_DIR=\"" + to_printable_string(normalize_path(test.SourceDir / "test/testdata")) + "\""); + + t += Definition("LANGDATA_DIR=\"" + to_printable_string(normalize_path(datadir / "langdata_lstm")) + "\""); + t += Definition("TESSDATA_DIR=\"" + to_printable_string(normalize_path(datadir / "tessdata")) + "\""); + t += Definition("TESSDATA_BEST_DIR=\"" + to_printable_string(normalize_path(datadir / "tessdata_best")) + "\""); + + // we push all deps to all tests simplify things + t += pango_training; + t += "org.sw.demo.google.googletest.gmock.main"_dep; + t += "org.sw.demo.google.googletest.gtest.main"_dep; + + if (t.getCompilerType() == CompilerType::MSVC) + t.CompileOptions.push_back("-utf-8"); + + auto win_or_mingw = + t.getBuildSettings().TargetOS.Type == OSType::Windows || + t.getBuildSettings().TargetOS.Type == OSType::Mingw + ; + if (!win_or_mingw) + t += "pthread"_slib; + + auto tst = libtesseract.addTest(t, name); + for (auto &st : skipped_tests) + { + std::regex r(st); + if (std::regex_match(name, r)) + { + tst.skip(true); + break; + } + } + + return t; + }; + + Strings tests + { + "apiexample", + "applybox", + "baseapi", + "baseapi_thread", + "bitvector", + "capiexample", + "capiexample_c", + "cleanapi", + "colpartition", + "commandlineflags", + "denorm", + "equationdetect", + "fileio", + "heap", + "imagedata", + "indexmapbidi", + "intfeaturemap", + "intsimdmatrix", + "lang_model", + "layout", + "ligature_table", + "linlsq", + "list", + "lstm_recode", + "lstm_squashed", + "lstm", + "lstmtrainer", + "loadlang", + "mastertrainer", + 
"matrix", + "networkio", + "normstrngs", + "nthitem", + "osd", + "pagesegmode", + "pango_font_info", + "paragraphs", + "params_model", + "progress", + "qrsequence", + "recodebeam", + "rect", + "resultiterator", + "scanutils", + "shapetable", + "stats", + "stringrenderer", + "stridemap", + "tablefind", + "tablerecog", + "tabvector", + "textlineprojection", + "tfile", + "unichar", + "unicharcompress", + "unicharset", + "validate_grapheme", + "validate_indic", + "validate_khmer", + "validate_myanmar", + "validator", + }; + for (auto t : tests) + add_test(t); + auto &dt = add_test("dawg"); + dt += Definition("wordlist2dawg_prog=\"" + to_printable_string(normalize_path(wordlist2dawg.getOutputFile())) + "\""); + dt += Definition("dawg2wordlist_prog=\"" + to_printable_string(normalize_path(dawg2wordlist.getOutputFile())) + "\""); + + auto &tw = add_test("tatweel"); + tw += "unittest/util/.*"_rr; + tw += "unittest/third_party/.*"_rr; + tw -= "unittest/third_party/googletest/.*"_rr; + } +} + +void check(Checker &c) +{ + auto &s = c.addSet("libtesseract"); + s.checkFunctionExists("getline"); + s.checkIncludeExists("dlfcn.h"); + s.checkIncludeExists("inttypes.h"); + s.checkIncludeExists("memory.h"); + s.checkIncludeExists("stdint.h"); + s.checkIncludeExists("stdlib.h"); + s.checkIncludeExists("string.h"); + s.checkIncludeExists("sys/stat.h"); + s.checkIncludeExists("sys/types.h"); + s.checkIncludeExists("tiffio.h"); + s.checkIncludeExists("unistd.h"); + s.checkTypeSize("long long int"); + s.checkTypeSize("size_t"); + s.checkTypeSize("void *"); + s.checkTypeSize("wchar_t"); + { + auto &c = s.checkSymbolExists("snprintf"); + c.Parameters.Includes.push_back("stdio.h"); + } +} +
