Mercurial > hgrepos > Python2 > PyMuPDF
comparison mupdf-source/thirdparty/tesseract/src/ccutil/ambigs.h @ 2:b50eed0cc0ef upstream
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
The directory name has changed: no version number in the expanded directory now.
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Mon, 15 Sep 2025 11:43:07 +0200 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| 1:1d09e1dec1d9 | 2:b50eed0cc0ef |
|---|---|
| 1 /////////////////////////////////////////////////////////////////////// | |
| 2 // File: ambigs.h | |
| 3 // Description: Constants, flags, functions for dealing with | |
| 4 // ambiguities (training and recognition). | |
| 5 // Author: Daria Antonova | |
| 6 // | |
| 7 // (C) Copyright 2008, Google Inc. | |
| 8 // Licensed under the Apache License, Version 2.0 (the "License"); | |
| 9 // you may not use this file except in compliance with the License. | |
| 10 // You may obtain a copy of the License at | |
| 11 // http://www.apache.org/licenses/LICENSE-2.0 | |
| 12 // Unless required by applicable law or agreed to in writing, software | |
| 13 // distributed under the License is distributed on an "AS IS" BASIS, | |
| 14 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
| 15 // See the License for the specific language governing permissions and | |
| 16 // limitations under the License. | |
| 17 // | |
| 18 /////////////////////////////////////////////////////////////////////// | |
| 19 | |
| 20 #ifndef TESSERACT_CCUTIL_AMBIGS_H_ | |
| 21 #define TESSERACT_CCUTIL_AMBIGS_H_ | |
| 22 | |
| 23 #ifdef HAVE_CONFIG_H | |
| 24 # include "config_auto.h" // DISABLED_LEGACY_ENGINE | |
| 25 #endif | |
| 26 | |
| 27 #if !defined(DISABLED_LEGACY_ENGINE) | |
| 28 | |
| 29 # include <tesseract/unichar.h> | |
| 30 # include "elst.h" | |
| 31 # include "tprintf.h" | |
| 32 # include "unicharset.h" | |
| 33 | |
| 34 # define MAX_AMBIG_SIZE 10 | |
| 35 | |
| 36 namespace tesseract { | |
| 37 | |
| 38 using UnicharIdVector = std::vector<UNICHAR_ID>; | |
| 39 | |
| 40 enum AmbigType { | |
| 41 NOT_AMBIG, // the ngram pair is not ambiguous | |
| 42 REPLACE_AMBIG, // ocred ngram should always be substituted with the correct ngram | |
| 43 DEFINITE_AMBIG, // add correct ngram to the classifier results (1-1) | |
| 44 SIMILAR_AMBIG, // use pairwise classifier for ocred/correct pair (1-1) | |
| 45 CASE_AMBIG, // this is a case ambiguity (1-1) | |
| 46 | |
| 47 AMBIG_TYPE_COUNT // number of ambiguity types listed above; keep this entry last | |
| 48 }; | |
| 49 | |
| 50 // A collection of static utility functions for arrays of UNICHAR_IDs that are | |
| 51 // terminated by INVALID_UNICHAR_ID. | |
| 52 class UnicharIdArrayUtils { | |
| 53 public: | |
| 54 // Compares two arrays of unichar ids element by element. Returns -1 if array1 | |
| 55 // ends before array2 does, or if array1[i] is less than array2[i] at the first position where they differ. | |
| 56 // Returns 0 if the arrays are equal, 1 otherwise. | |
| 57 // The function assumes that both arrays are terminated by INVALID_UNICHAR_ID. | |
| 58 static inline int compare(const UNICHAR_ID *ptr1, const UNICHAR_ID *ptr2) { | |
| 59 for (;;) { | |
| 60 const UNICHAR_ID val1 = *ptr1++; | |
| 61 const UNICHAR_ID val2 = *ptr2++; | |
| 62 if (val1 != val2) { | |
| 63 if (val1 == INVALID_UNICHAR_ID) { | |
| 64 return -1; | |
| 65 } | |
| 66 if (val2 == INVALID_UNICHAR_ID) { | |
| 67 return 1; | |
| 68 } | |
| 69 if (val1 < val2) { | |
| 70 return -1; | |
| 71 } | |
| 72 return 1; | |
| 73 } | |
| 74 if (val1 == INVALID_UNICHAR_ID) { | |
| 75 return 0; | |
| 76 } | |
| 77 } | |
| 78 } | |
| 79 | |
| 80 // Copies UNICHAR_IDs from src to dst, including the terminating INVALID_UNICHAR_ID. Returns the number of ids copied, not counting the terminator. | |
| 81 // The function assumes that src is terminated by INVALID_UNICHAR_ID | |
| 82 // and that dst has enough space for all the elements from src plus the terminator. | |
| 83 static inline int copy(const UNICHAR_ID src[], UNICHAR_ID dst[]) { | |
| 84 int i = 0; | |
| 85 do { | |
| 86 dst[i] = src[i]; | |
| 87 } while (dst[i++] != INVALID_UNICHAR_ID); | |
| 88 return i - 1; | |
| 89 } | |
| 90 | |
| 91 // Prints the unichars corresponding to the unichar_ids in the given array ("[Empty]" if the array has no ids), followed by the numeric ids in parentheses. | |
| 92 // The function assumes that array is terminated by INVALID_UNICHAR_ID. | |
| 93 static inline void print(const UNICHAR_ID array[], const UNICHARSET &unicharset) { | |
| 94 const UNICHAR_ID *ptr = array; | |
| 95 if (*ptr == INVALID_UNICHAR_ID) { | |
| 96 tprintf("[Empty]"); | |
| 97 } | |
| 98 while (*ptr != INVALID_UNICHAR_ID) { | |
| 99 tprintf("%s ", unicharset.id_to_unichar(*ptr++)); | |
| 100 } | |
| 101 tprintf("( "); | |
| 102 ptr = array; | |
| 103 while (*ptr != INVALID_UNICHAR_ID) { | |
| 104 tprintf("%d ", *ptr++); | |
| 105 } | |
| 106 tprintf(")\n"); | |
| 107 } | |
| 108 }; | |
| 109 | |
| 110 // An AmbigSpec describes one ambiguity; an AmbigSpec_LIST stores a list of dangerous ambigs that | |
| 111 // start with the same unichar (e.g. r->t rn->m rr1->m). | |
| 112 class AmbigSpec : public ELIST_LINK { | |
| 113 public: | |
| 114 AmbigSpec(); | |
| 115 ~AmbigSpec() = default; | |
| 116 | |
| 117 // Comparator function for sorting AmbigSpec_LISTs; spec1 and spec2 each point to an AmbigSpec pointer. The lists will | |
| 118 // be sorted by their wrong_ngram arrays, with ties broken by correct_fragments. Example of wrong_ngram vectors | |
| 119 // in a sorted AmbigSpec_LIST: [9 1 3], [9 3 4], [9 8], [9 8 1]. | |
| 120 static int compare_ambig_specs(const void *spec1, const void *spec2) { | |
| 121 const AmbigSpec *s1 = *static_cast<const AmbigSpec *const *>(spec1); | |
| 122 const AmbigSpec *s2 = *static_cast<const AmbigSpec *const *>(spec2); | |
| 123 int result = UnicharIdArrayUtils::compare(s1->wrong_ngram, s2->wrong_ngram); | |
| 124 if (result != 0) { | |
| 125 return result; | |
| 126 } | |
| 127 return UnicharIdArrayUtils::compare(s1->correct_fragments, s2->correct_fragments); | |
| 128 } | |
| 129 | |
| 130 UNICHAR_ID wrong_ngram[MAX_AMBIG_SIZE + 1]; | |
| 131 UNICHAR_ID correct_fragments[MAX_AMBIG_SIZE + 1]; | |
| 132 UNICHAR_ID correct_ngram_id; | |
| 133 AmbigType type; | |
| 134 int wrong_ngram_size; | |
| 135 }; | |
| 136 ELISTIZEH(AmbigSpec) | |
| 137 | |
| 138 // A UnicharAmbigsVector is an ambiguity table: entry i stores the list of ambiguities whose | |
| 139 // wrong ngram starts with unichar id i. | |
| 140 using UnicharAmbigsVector = std::vector<AmbigSpec_LIST *>; | |
| 141 | |
| 142 class UnicharAmbigs { | |
| 143 public: | |
| 144 UnicharAmbigs() = default; | |
| 145 ~UnicharAmbigs() { | |
| 146 for (auto data : replace_ambigs_) { | |
| 147 delete data; | |
| 148 } | |
| 149 for (auto data : dang_ambigs_) { | |
| 150 delete data; | |
| 151 } | |
| 152 for (auto data : one_to_one_definite_ambigs_) { | |
| 153 delete data; | |
| 154 } | |
| 155 } | |
| 156 | |
| 157 const UnicharAmbigsVector &dang_ambigs() const { | |
| 158 return dang_ambigs_; | |
| 159 } | |
| 160 const UnicharAmbigsVector &replace_ambigs() const { | |
| 161 return replace_ambigs_; | |
| 162 } | |
| 163 | |
| 164 // Initializes the ambigs by adding a nullptr entry to each table. | |
| 165 void InitUnicharAmbigs(const UNICHARSET &unicharset, bool use_ambigs_for_adaption); | |
| 166 | |
| 167 // Loads the universal ambigs that are useful for any language. | |
| 168 void LoadUniversal(const UNICHARSET &encoder_set, UNICHARSET *unicharset); | |
| 169 | |
| 170 // Fills in two ambiguity tables (replaceable and dangerous) with information | |
| 171 // read from the ambigs file. An ambiguity table is an array of lists. | |
| 172 // The array is indexed by a class id. Each entry in the table provides | |
| 173 // a list of potential ambiguities which can start with the corresponding | |
| 174 // character. For example the ambiguity "rn -> m", would be located in the | |
| 175 // table at index of unicharset.unichar_to_id('r'). | |
| 176 // 1-1 ambiguities (e.g. s -> S, 1 -> I) are recorded in | |
| 177 // one_to_one_definite_ambigs_. This vector is also indexed by the class id | |
| 178 // of the wrong part of the ambiguity and each entry contains a vector of | |
| 179 // unichar ids that are ambiguous to it. | |
| 180 // encoder_set is used to encode the ambiguity strings, undisturbed by new | |
| 181 // unichar_ids that may be created by adding the ambigs. | |
| 182 void LoadUnicharAmbigs(const UNICHARSET &encoder_set, TFile *ambigs_file, int debug_level, | |
| 183 bool use_ambigs_for_adaption, UNICHARSET *unicharset); | |
| 184 | |
| 185 // Returns definite 1-1 ambigs for the given unichar id, or nullptr if the table is empty. unichar_id is used as an index without a range check. | |
| 186 inline const UnicharIdVector *OneToOneDefiniteAmbigs(UNICHAR_ID unichar_id) const { | |
| 187 if (one_to_one_definite_ambigs_.empty()) { | |
| 188 return nullptr; | |
| 189 } | |
| 190 return one_to_one_definite_ambigs_[unichar_id]; | |
| 191 } | |
| 192 | |
| 193 // Returns a pointer to the vector with all unichar ids that appear in the | |
| 194 // 'correct' part of the ambiguity pair when the given unichar id appears | |
| 195 // in the 'wrong' part of the ambiguity. E.g. if the DangAmbigs file consists of | |
| 196 // m->rn,rn->m,m->iii, AmbigsForAdaption() called with unichar id of | |
| 197 // m will return a pointer to a vector with unichar ids of r,n,i. Returns nullptr if the table is empty; unichar_id is not range-checked. | |
| 198 inline const UnicharIdVector *AmbigsForAdaption(UNICHAR_ID unichar_id) const { | |
| 199 if (ambigs_for_adaption_.empty()) { | |
| 200 return nullptr; | |
| 201 } | |
| 202 return ambigs_for_adaption_[unichar_id]; | |
| 203 } | |
| 204 | |
| 205 // Similar to the above, but returns the vector of unichar ids for which | |
| 206 // the given unichar_id is an ambiguity (appears in the 'wrong' part of | |
| 207 // some ambiguity pair). Returns nullptr if the table is empty; unichar_id is not range-checked. | |
| 208 inline const UnicharIdVector *ReverseAmbigsForAdaption(UNICHAR_ID unichar_id) const { | |
| 209 if (reverse_ambigs_for_adaption_.empty()) { | |
| 210 return nullptr; | |
| 211 } | |
| 212 return reverse_ambigs_for_adaption_[unichar_id]; | |
| 213 } | |
| 214 | |
| 215 private: | |
| 216 bool ParseAmbiguityLine(int line_num, int version, int debug_level, const UNICHARSET &unicharset, | |
| 217 char *buffer, int *test_ambig_part_size, UNICHAR_ID *test_unichar_ids, | |
| 218 int *replacement_ambig_part_size, char *replacement_string, int *type); | |
| 219 bool InsertIntoTable(UnicharAmbigsVector &table, int test_ambig_part_size, | |
| 220 UNICHAR_ID *test_unichar_ids, int replacement_ambig_part_size, | |
| 221 const char *replacement_string, int type, AmbigSpec *ambig_spec, | |
| 222 UNICHARSET *unicharset); | |
| 223 | |
| 224 UnicharAmbigsVector dang_ambigs_; | |
| 225 UnicharAmbigsVector replace_ambigs_; | |
| 226 std::vector<UnicharIdVector *> one_to_one_definite_ambigs_; | |
| 227 std::vector<UnicharIdVector *> ambigs_for_adaption_; | |
| 228 std::vector<UnicharIdVector *> reverse_ambigs_for_adaption_; | |
| 229 }; | |
| 230 | |
| 231 } // namespace tesseract | |
| 232 | |
| 233 #endif // !defined(DISABLED_LEGACY_ENGINE) | |
| 234 | |
| 235 #endif // TESSERACT_CCUTIL_AMBIGS_H_ |
