Mercurial > hgrepos > Python2 > PyMuPDF
comparison mupdf-source/thirdparty/tesseract/src/ccmain/output.cpp @ 2:b50eed0cc0ef upstream
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
The directory name has changed: no version number in the expanded directory now.
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Mon, 15 Sep 2025 11:43:07 +0200 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| 1:1d09e1dec1d9 | 2:b50eed0cc0ef |
|---|---|
| 1 /****************************************************************** | |
| 2 * File: output.cpp (Formerly output.c) | |
| 3 * Description: Output pass | |
| 4 * Author: Phil Cheatle | |
| 5 * | |
| 6 * (C) Copyright 1994, Hewlett-Packard Ltd. | |
| 7 ** Licensed under the Apache License, Version 2.0 (the "License"); | |
| 8 ** you may not use this file except in compliance with the License. | |
| 9 ** You may obtain a copy of the License at | |
| 10 ** http://www.apache.org/licenses/LICENSE-2.0 | |
| 11 ** Unless required by applicable law or agreed to in writing, software | |
| 12 ** distributed under the License is distributed on an "AS IS" BASIS, | |
| 13 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
| 14 ** See the License for the specific language governing permissions and | |
| 15 ** limitations under the License. | |
| 16 * | |
| 17 **********************************************************************/ | |
| 18 | |
| 19 #include "output.h" | |
| 20 | |
| 21 #include "control.h" | |
| 22 #include "tesseractclass.h" | |
| 23 #include "tessvars.h" | |
| 24 #ifndef DISABLED_LEGACY_ENGINE | |
| 25 # include "docqual.h" | |
| 26 # include "reject.h" | |
| 27 #endif | |
| 28 | |
| 29 #include "helpers.h" | |
| 30 | |
| 31 #include <cctype> | |
| 32 #include <cerrno> | |
| 33 #include <cstring> | |
| 34 | |
| 35 #define CTRL_NEWLINE '\012' // newline | |
| 36 #define CTRL_HARDLINE '\015' // cr | |
| 37 | |
| 38 namespace tesseract { | |
| 39 void Tesseract::output_pass( // Tess output pass //send to api | |
| 40 PAGE_RES_IT &page_res_it, const TBOX *target_word_box) { | |
| 41 BLOCK_RES *block_of_last_word; | |
| 42 bool force_eol; // During output | |
| 43 BLOCK *nextblock; // block of next word | |
| 44 WERD *nextword; // next word | |
| 45 | |
| 46 page_res_it.restart_page(); | |
| 47 block_of_last_word = nullptr; | |
| 48 while (page_res_it.word() != nullptr) { | |
| 49 check_debug_pt(page_res_it.word(), 120); | |
| 50 | |
| 51 if (target_word_box) { | |
| 52 TBOX current_word_box = page_res_it.word()->word->bounding_box(); | |
| 53 FCOORD center_pt((current_word_box.right() + current_word_box.left()) / 2, | |
| 54 (current_word_box.bottom() + current_word_box.top()) / 2); | |
| 55 if (!target_word_box->contains(center_pt)) { | |
| 56 page_res_it.forward(); | |
| 57 continue; | |
| 58 } | |
| 59 } | |
| 60 if (tessedit_write_block_separators && block_of_last_word != page_res_it.block()) { | |
| 61 block_of_last_word = page_res_it.block(); | |
| 62 } | |
| 63 | |
| 64 force_eol = | |
| 65 (tessedit_write_block_separators && (page_res_it.block() != page_res_it.next_block())) || | |
| 66 (page_res_it.next_word() == nullptr); | |
| 67 | |
| 68 if (page_res_it.next_word() != nullptr) { | |
| 69 nextword = page_res_it.next_word()->word; | |
| 70 } else { | |
| 71 nextword = nullptr; | |
| 72 } | |
| 73 if (page_res_it.next_block() != nullptr) { | |
| 74 nextblock = page_res_it.next_block()->block; | |
| 75 } else { | |
| 76 nextblock = nullptr; | |
| 77 } | |
| 78 // regardless of tilde crunching | |
| 79 write_results(page_res_it, | |
| 80 determine_newline_type(page_res_it.word()->word, page_res_it.block()->block, | |
| 81 nextword, nextblock), | |
| 82 force_eol); | |
| 83 page_res_it.forward(); | |
| 84 } | |
| 85 } | |
| 86 | |
| 87 /************************************************************************* | |
| 88 * write_results() | |
| 89 * | |
| 90 * All recognition and rejection has now been done. Generate the following: | |
| 91 * .txt file - giving the final best choices with NO highlighting | |
| 92 * .raw file - giving the tesseract top choice output for each word | |
| 93 * .map file - showing how the .txt file has been rejected in the .ep file | |
| 94 * epchoice list - a list of one element per word, containing the text for the | |
| 95 * epaper. Reject strings are inserted. | |
| 96 * inset list - a list of bounding boxes of reject insets - indexed by the | |
| 97 * reject strings in the epchoice text. | |
| 98 *************************************************************************/ | |
| 99 void Tesseract::write_results(PAGE_RES_IT &page_res_it, | |
| 100 char newline_type, // type of newline | |
| 101 bool force_eol) { // override tilde crunch? | |
| 102 WERD_RES *word = page_res_it.word(); | |
| 103 const UNICHARSET &uchset = *word->uch_set; | |
| 104 UNICHAR_ID space = uchset.unichar_to_id(" "); | |
| 105 | |
| 106 if ((word->unlv_crunch_mode != CR_NONE || word->best_choice->empty()) && | |
| 107 !tessedit_zero_kelvin_rejection && !tessedit_word_for_word) { | |
| 108 bool need_reject = false; | |
| 109 if ((word->unlv_crunch_mode != CR_DELETE) && | |
| 110 (!stats_.tilde_crunch_written || | |
| 111 ((word->unlv_crunch_mode == CR_KEEP_SPACE) && (word->word->space() > 0) && | |
| 112 !word->word->flag(W_FUZZY_NON) && !word->word->flag(W_FUZZY_SP)))) { | |
| 113 if (!word->word->flag(W_BOL) && (word->word->space() > 0) && !word->word->flag(W_FUZZY_NON) && | |
| 114 !word->word->flag(W_FUZZY_SP)) { | |
| 115 stats_.last_char_was_tilde = false; | |
| 116 } | |
| 117 need_reject = true; | |
| 118 } | |
| 119 if ((need_reject && !stats_.last_char_was_tilde) || | |
| 120 (force_eol && stats_.write_results_empty_block)) { | |
| 121 /* Write a reject char - mark as rejected unless zero_rejection mode */ | |
| 122 stats_.last_char_was_tilde = true; | |
| 123 stats_.tilde_crunch_written = true; | |
| 124 stats_.last_char_was_newline = false; | |
| 125 stats_.write_results_empty_block = false; | |
| 126 } | |
| 127 | |
| 128 if ((word->word->flag(W_EOL) && !stats_.last_char_was_newline) || force_eol) { | |
| 129 stats_.tilde_crunch_written = false; | |
| 130 stats_.last_char_was_newline = true; | |
| 131 stats_.last_char_was_tilde = false; | |
| 132 } | |
| 133 | |
| 134 if (force_eol) { | |
| 135 stats_.write_results_empty_block = true; | |
| 136 } | |
| 137 return; | |
| 138 } | |
| 139 | |
| 140 /* NORMAL PROCESSING of non tilde crunched words */ | |
| 141 | |
| 142 stats_.tilde_crunch_written = false; | |
| 143 if (newline_type) { | |
| 144 stats_.last_char_was_newline = true; | |
| 145 } else { | |
| 146 stats_.last_char_was_newline = false; | |
| 147 } | |
| 148 stats_.write_results_empty_block = force_eol; // about to write a real word | |
| 149 | |
| 150 if (unlv_tilde_crunching && stats_.last_char_was_tilde && (word->word->space() == 0) && | |
| 151 !(word->word->flag(W_REP_CHAR) && tessedit_write_rep_codes) && | |
| 152 (word->best_choice->unichar_id(0) == space)) { | |
| 153 /* Prevent adjacent tilde across words - we know that adjacent tildes within | |
| 154 words have been removed */ | |
| 155 word->MergeAdjacentBlobs(0); | |
| 156 } | |
| 157 if (newline_type || (word->word->flag(W_REP_CHAR) && tessedit_write_rep_codes)) { | |
| 158 stats_.last_char_was_tilde = false; | |
| 159 } else { | |
| 160 if (word->reject_map.length() > 0) { | |
| 161 if (word->best_choice->unichar_id(word->reject_map.length() - 1) == space) { | |
| 162 stats_.last_char_was_tilde = true; | |
| 163 } else { | |
| 164 stats_.last_char_was_tilde = false; | |
| 165 } | |
| 166 } else if (word->word->space() > 0) { | |
| 167 stats_.last_char_was_tilde = false; | |
| 168 } | |
| 169 /* else it is unchanged as there are no output chars */ | |
| 170 } | |
| 171 | |
| 172 ASSERT_HOST(word->best_choice->length() == word->reject_map.length()); | |
| 173 | |
| 174 set_unlv_suspects(word); | |
| 175 check_debug_pt(word, 120); | |
| 176 if (tessedit_rejection_debug) { | |
| 177 tprintf("Dict word: \"%s\": %d\n", word->best_choice->debug_string().c_str(), | |
| 178 dict_word(*(word->best_choice))); | |
| 179 } | |
| 180 if (!word->word->flag(W_REP_CHAR) || !tessedit_write_rep_codes) { | |
| 181 if (tessedit_zero_rejection) { | |
| 182 /* OVERRIDE ALL REJECTION MECHANISMS - ONLY REJECT TESS FAILURES */ | |
| 183 for (unsigned i = 0; i < word->best_choice->length(); ++i) { | |
| 184 if (word->reject_map[i].rejected()) { | |
| 185 word->reject_map[i].setrej_minimal_rej_accept(); | |
| 186 } | |
| 187 } | |
| 188 } | |
| 189 if (tessedit_minimal_rejection) { | |
| 190 /* OVERRIDE ALL REJECTION MECHANISMS - ONLY REJECT TESS FAILURES */ | |
| 191 for (unsigned i = 0; i < word->best_choice->length(); ++i) { | |
| 192 if ((word->best_choice->unichar_id(i) != space) && word->reject_map[i].rejected()) { | |
| 193 word->reject_map[i].setrej_minimal_rej_accept(); | |
| 194 } | |
| 195 } | |
| 196 } | |
| 197 } | |
| 198 } | |
| 199 | |
| 200 /********************************************************************** | |
| 201 * determine_newline_type | |
| 202 * | |
| 203 * Find whether we have a wrapping or hard newline. | |
| 204 * Return false if not at end of line. | |
| 205 **********************************************************************/ | |
| 206 | |
| 207 char determine_newline_type( // test line ends | |
| 208 WERD *word, // word to do | |
| 209 BLOCK *block, // current block | |
| 210 WERD *next_word, // next word | |
| 211 BLOCK *next_block // block of next word | |
| 212 ) { | |
| 213 int16_t end_gap; // to right edge | |
| 214 int16_t width; // of next word | |
| 215 TBOX word_box; // bounding | |
| 216 TBOX next_box; // next word | |
| 217 TBOX block_box; // block bounding | |
| 218 | |
| 219 if (!word->flag(W_EOL)) { | |
| 220 return false; // not end of line | |
| 221 } | |
| 222 if (next_word == nullptr || next_block == nullptr || block != next_block) { | |
| 223 return CTRL_NEWLINE; | |
| 224 } | |
| 225 if (next_word->space() > 0) { | |
| 226 return CTRL_HARDLINE; // it is tabbed | |
| 227 } | |
| 228 word_box = word->bounding_box(); | |
| 229 next_box = next_word->bounding_box(); | |
| 230 block_box = block->pdblk.bounding_box(); | |
| 231 // gap to eol | |
| 232 end_gap = block_box.right() - word_box.right(); | |
| 233 end_gap -= static_cast<int32_t>(block->space()); | |
| 234 width = next_box.right() - next_box.left(); | |
| 235 // tprintf("end_gap=%d-%d=%d, width=%d-%d=%d, nl=%d\n", | |
| 236 // block_box.right(),word_box.right(),end_gap, | |
| 237 // next_box.right(),next_box.left(),width, | |
| 238 // end_gap>width ? CTRL_HARDLINE : CTRL_NEWLINE); | |
| 239 return end_gap > width ? CTRL_HARDLINE : CTRL_NEWLINE; | |
| 240 } | |
| 241 | |
| 242 /************************************************************************* | |
| 243 * get_rep_char() | |
| 244 * Return the first accepted character from the repetition string. This is the | |
| 245 * character which is repeated - as determined earlier by fix_rep_char() | |
| 246 *************************************************************************/ | |
| 247 UNICHAR_ID Tesseract::get_rep_char(WERD_RES *word) { // what char is repeated? | |
| 248 int i; | |
| 249 for (i = 0; ((i < word->reject_map.length()) && (word->reject_map[i].rejected())); ++i) { | |
| 250 ; | |
| 251 } | |
| 252 | |
| 253 if (i < word->reject_map.length()) { | |
| 254 return word->best_choice->unichar_id(i); | |
| 255 } else { | |
| 256 return word->uch_set->unichar_to_id(unrecognised_char.c_str()); | |
| 257 } | |
| 258 } | |
| 259 | |
| 260 /************************************************************************* | |
| 261 * SUSPECT LEVELS | |
| 262 * | |
| 263 * 0 - don't reject ANYTHING | |
| 264 * 1,2 - partial rejection | |
| 265 * 3 - BEST | |
| 266 * | |
| 267 * NOTE: to reject JUST tess failures in the .map file set suspect_level 3 and | |
| 268 * tessedit_minimal_rejection. | |
| 269 *************************************************************************/ | |
| 270 void Tesseract::set_unlv_suspects(WERD_RES *word_res) { | |
| 271 int len = word_res->reject_map.length(); | |
| 272 const WERD_CHOICE &word = *(word_res->best_choice); | |
| 273 const UNICHARSET &uchset = *word.unicharset(); | |
| 274 int i; | |
| 275 float rating_per_ch; | |
| 276 | |
| 277 if (suspect_level == 0) { | |
| 278 for (i = 0; i < len; i++) { | |
| 279 if (word_res->reject_map[i].rejected()) { | |
| 280 word_res->reject_map[i].setrej_minimal_rej_accept(); | |
| 281 } | |
| 282 } | |
| 283 return; | |
| 284 } | |
| 285 | |
| 286 if (suspect_level >= 3) { | |
| 287 return; // Use defaults | |
| 288 } | |
| 289 | |
| 290 /* NOW FOR LEVELS 1 and 2 Find some stuff to unreject*/ | |
| 291 | |
| 292 if (safe_dict_word(word_res) && (count_alphas(word) > suspect_short_words)) { | |
| 293 /* Unreject alphas in dictionary words */ | |
| 294 for (i = 0; i < len; ++i) { | |
| 295 if (word_res->reject_map[i].rejected() && uchset.get_isalpha(word.unichar_id(i))) { | |
| 296 word_res->reject_map[i].setrej_minimal_rej_accept(); | |
| 297 } | |
| 298 } | |
| 299 } | |
| 300 | |
| 301 rating_per_ch = word.rating() / word_res->reject_map.length(); | |
| 302 | |
| 303 if (rating_per_ch >= suspect_rating_per_ch) { | |
| 304 return; // Don't touch bad ratings | |
| 305 } | |
| 306 | |
| 307 if ((word_res->tess_accepted) || (rating_per_ch < suspect_accept_rating)) { | |
| 308 /* Unreject any Tess Acceptable word - but NOT tess reject chs*/ | |
| 309 for (i = 0; i < len; ++i) { | |
| 310 if (word_res->reject_map[i].rejected() && (!uchset.eq(word.unichar_id(i), " "))) { | |
| 311 word_res->reject_map[i].setrej_minimal_rej_accept(); | |
| 312 } | |
| 313 } | |
| 314 } | |
| 315 | |
| 316 for (i = 0; i < len; i++) { | |
| 317 if (word_res->reject_map[i].rejected()) { | |
| 318 if (word_res->reject_map[i].flag(R_DOC_REJ)) { | |
| 319 word_res->reject_map[i].setrej_minimal_rej_accept(); | |
| 320 } | |
| 321 if (word_res->reject_map[i].flag(R_BLOCK_REJ)) { | |
| 322 word_res->reject_map[i].setrej_minimal_rej_accept(); | |
| 323 } | |
| 324 if (word_res->reject_map[i].flag(R_ROW_REJ)) { | |
| 325 word_res->reject_map[i].setrej_minimal_rej_accept(); | |
| 326 } | |
| 327 } | |
| 328 } | |
| 329 | |
| 330 if (suspect_level == 2) { | |
| 331 return; | |
| 332 } | |
| 333 | |
| 334 if (!suspect_constrain_1Il || (word_res->reject_map.length() <= suspect_short_words)) { | |
| 335 for (i = 0; i < len; i++) { | |
| 336 if (word_res->reject_map[i].rejected()) { | |
| 337 if ((word_res->reject_map[i].flag(R_1IL_CONFLICT) || | |
| 338 word_res->reject_map[i].flag(R_POSTNN_1IL))) { | |
| 339 word_res->reject_map[i].setrej_minimal_rej_accept(); | |
| 340 } | |
| 341 | |
| 342 if (!suspect_constrain_1Il && word_res->reject_map[i].flag(R_MM_REJECT)) { | |
| 343 word_res->reject_map[i].setrej_minimal_rej_accept(); | |
| 344 } | |
| 345 } | |
| 346 } | |
| 347 } | |
| 348 | |
| 349 if (acceptable_word_string(*word_res->uch_set, word.unichar_string().c_str(), | |
| 350 word.unichar_lengths().c_str()) != AC_UNACCEPTABLE || | |
| 351 acceptable_number_string(word.unichar_string().c_str(), word.unichar_lengths().c_str())) { | |
| 352 if (word_res->reject_map.length() > suspect_short_words) { | |
| 353 for (i = 0; i < len; i++) { | |
| 354 if (word_res->reject_map[i].rejected() && (!word_res->reject_map[i].perm_rejected() || | |
| 355 word_res->reject_map[i].flag(R_1IL_CONFLICT) || | |
| 356 word_res->reject_map[i].flag(R_POSTNN_1IL) || | |
| 357 word_res->reject_map[i].flag(R_MM_REJECT))) { | |
| 358 word_res->reject_map[i].setrej_minimal_rej_accept(); | |
| 359 } | |
| 360 } | |
| 361 } | |
| 362 } | |
| 363 } | |
| 364 | |
| 365 int16_t Tesseract::count_alphas(const WERD_CHOICE &word) { | |
| 366 int count = 0; | |
| 367 for (unsigned i = 0; i < word.length(); ++i) { | |
| 368 if (word.unicharset()->get_isalpha(word.unichar_id(i))) { | |
| 369 count++; | |
| 370 } | |
| 371 } | |
| 372 return count; | |
| 373 } | |
| 374 | |
| 375 int16_t Tesseract::count_alphanums(const WERD_CHOICE &word) { | |
| 376 int count = 0; | |
| 377 for (unsigned i = 0; i < word.length(); ++i) { | |
| 378 if (word.unicharset()->get_isalpha(word.unichar_id(i)) || | |
| 379 word.unicharset()->get_isdigit(word.unichar_id(i))) { | |
| 380 count++; | |
| 381 } | |
| 382 } | |
| 383 return count; | |
| 384 } | |
| 385 | |
| 386 bool Tesseract::acceptable_number_string(const char *s, const char *lengths) { | |
| 387 bool prev_digit = false; | |
| 388 | |
| 389 if (*lengths == 1 && *s == '(') { | |
| 390 s++; | |
| 391 } | |
| 392 | |
| 393 if (*lengths == 1 && ((*s == '$') || (*s == '.') || (*s == '+') || (*s == '-'))) { | |
| 394 s++; | |
| 395 } | |
| 396 | |
| 397 for (; *s != '\0'; s += *(lengths++)) { | |
| 398 if (unicharset.get_isdigit(s, *lengths)) { | |
| 399 prev_digit = true; | |
| 400 } else if (prev_digit && (*lengths == 1 && ((*s == '.') || (*s == ',') || (*s == '-')))) { | |
| 401 prev_digit = false; | |
| 402 } else if (prev_digit && *lengths == 1 && (*(s + *lengths) == '\0') && | |
| 403 ((*s == '%') || (*s == ')'))) { | |
| 404 return true; | |
| 405 } else if (prev_digit && *lengths == 1 && (*s == '%') && | |
| 406 (*(lengths + 1) == 1 && *(s + *lengths) == ')') && | |
| 407 (*(s + *lengths + *(lengths + 1)) == '\0')) { | |
| 408 return true; | |
| 409 } else { | |
| 410 return false; | |
| 411 } | |
| 412 } | |
| 413 return true; | |
| 414 } | |
| 415 } // namespace tesseract |
