comparison mupdf-source/thirdparty/tesseract/src/ccmain/output.cpp @ 2:b50eed0cc0ef upstream

ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4. The directory name has changed: no version number in the expanded directory now.
author Franz Glasner <fzglas.hg@dom66.de>
date Mon, 15 Sep 2025 11:43:07 +0200
parents
children
comparison
equal deleted inserted replaced
1:1d09e1dec1d9 2:b50eed0cc0ef
1 /******************************************************************
2 * File: output.cpp (Formerly output.c)
3 * Description: Output pass
4 * Author: Phil Cheatle
5 *
6 * (C) Copyright 1994, Hewlett-Packard Ltd.
7 ** Licensed under the Apache License, Version 2.0 (the "License");
8 ** you may not use this file except in compliance with the License.
9 ** You may obtain a copy of the License at
10 ** http://www.apache.org/licenses/LICENSE-2.0
11 ** Unless required by applicable law or agreed to in writing, software
12 ** distributed under the License is distributed on an "AS IS" BASIS,
13 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 ** See the License for the specific language governing permissions and
15 ** limitations under the License.
16 *
17 **********************************************************************/
18
19 #include "output.h"
20
21 #include "control.h"
22 #include "tesseractclass.h"
23 #include "tessvars.h"
24 #ifndef DISABLED_LEGACY_ENGINE
25 # include "docqual.h"
26 # include "reject.h"
27 #endif
28
29 #include "helpers.h"
30
31 #include <cctype>
32 #include <cerrno>
33 #include <cstring>
34
35 #define CTRL_NEWLINE '\012' // newline
36 #define CTRL_HARDLINE '\015' // cr
37
38 namespace tesseract {
39 void Tesseract::output_pass( // Tess output pass //send to api
40 PAGE_RES_IT &page_res_it, const TBOX *target_word_box) {
41 BLOCK_RES *block_of_last_word;
42 bool force_eol; // During output
43 BLOCK *nextblock; // block of next word
44 WERD *nextword; // next word
45
46 page_res_it.restart_page();
47 block_of_last_word = nullptr;
48 while (page_res_it.word() != nullptr) {
49 check_debug_pt(page_res_it.word(), 120);
50
51 if (target_word_box) {
52 TBOX current_word_box = page_res_it.word()->word->bounding_box();
53 FCOORD center_pt((current_word_box.right() + current_word_box.left()) / 2,
54 (current_word_box.bottom() + current_word_box.top()) / 2);
55 if (!target_word_box->contains(center_pt)) {
56 page_res_it.forward();
57 continue;
58 }
59 }
60 if (tessedit_write_block_separators && block_of_last_word != page_res_it.block()) {
61 block_of_last_word = page_res_it.block();
62 }
63
64 force_eol =
65 (tessedit_write_block_separators && (page_res_it.block() != page_res_it.next_block())) ||
66 (page_res_it.next_word() == nullptr);
67
68 if (page_res_it.next_word() != nullptr) {
69 nextword = page_res_it.next_word()->word;
70 } else {
71 nextword = nullptr;
72 }
73 if (page_res_it.next_block() != nullptr) {
74 nextblock = page_res_it.next_block()->block;
75 } else {
76 nextblock = nullptr;
77 }
78 // regardless of tilde crunching
79 write_results(page_res_it,
80 determine_newline_type(page_res_it.word()->word, page_res_it.block()->block,
81 nextword, nextblock),
82 force_eol);
83 page_res_it.forward();
84 }
85 }
86
87 /*************************************************************************
88 * write_results()
89 *
90 * All recognition and rejection has now been done. Generate the following:
91 * .txt file - giving the final best choices with NO highlighting
92 * .raw file - giving the tesseract top choice output for each word
93 * .map file - showing how the .txt file has been rejected in the .ep file
94 * epchoice list - a list of one element per word, containing the text for the
95 * epaper. Reject strings are inserted.
96 * inset list - a list of bounding boxes of reject insets - indexed by the
97 * reject strings in the epchoice text.
98 *************************************************************************/
99 void Tesseract::write_results(PAGE_RES_IT &page_res_it,
100 char newline_type, // type of newline
101 bool force_eol) { // override tilde crunch?
102 WERD_RES *word = page_res_it.word();
103 const UNICHARSET &uchset = *word->uch_set;
104 UNICHAR_ID space = uchset.unichar_to_id(" ");
105
106 if ((word->unlv_crunch_mode != CR_NONE || word->best_choice->empty()) &&
107 !tessedit_zero_kelvin_rejection && !tessedit_word_for_word) {
108 bool need_reject = false;
109 if ((word->unlv_crunch_mode != CR_DELETE) &&
110 (!stats_.tilde_crunch_written ||
111 ((word->unlv_crunch_mode == CR_KEEP_SPACE) && (word->word->space() > 0) &&
112 !word->word->flag(W_FUZZY_NON) && !word->word->flag(W_FUZZY_SP)))) {
113 if (!word->word->flag(W_BOL) && (word->word->space() > 0) && !word->word->flag(W_FUZZY_NON) &&
114 !word->word->flag(W_FUZZY_SP)) {
115 stats_.last_char_was_tilde = false;
116 }
117 need_reject = true;
118 }
119 if ((need_reject && !stats_.last_char_was_tilde) ||
120 (force_eol && stats_.write_results_empty_block)) {
121 /* Write a reject char - mark as rejected unless zero_rejection mode */
122 stats_.last_char_was_tilde = true;
123 stats_.tilde_crunch_written = true;
124 stats_.last_char_was_newline = false;
125 stats_.write_results_empty_block = false;
126 }
127
128 if ((word->word->flag(W_EOL) && !stats_.last_char_was_newline) || force_eol) {
129 stats_.tilde_crunch_written = false;
130 stats_.last_char_was_newline = true;
131 stats_.last_char_was_tilde = false;
132 }
133
134 if (force_eol) {
135 stats_.write_results_empty_block = true;
136 }
137 return;
138 }
139
140 /* NORMAL PROCESSING of non tilde crunched words */
141
142 stats_.tilde_crunch_written = false;
143 if (newline_type) {
144 stats_.last_char_was_newline = true;
145 } else {
146 stats_.last_char_was_newline = false;
147 }
148 stats_.write_results_empty_block = force_eol; // about to write a real word
149
150 if (unlv_tilde_crunching && stats_.last_char_was_tilde && (word->word->space() == 0) &&
151 !(word->word->flag(W_REP_CHAR) && tessedit_write_rep_codes) &&
152 (word->best_choice->unichar_id(0) == space)) {
153 /* Prevent adjacent tilde across words - we know that adjacent tildes within
154 words have been removed */
155 word->MergeAdjacentBlobs(0);
156 }
157 if (newline_type || (word->word->flag(W_REP_CHAR) && tessedit_write_rep_codes)) {
158 stats_.last_char_was_tilde = false;
159 } else {
160 if (word->reject_map.length() > 0) {
161 if (word->best_choice->unichar_id(word->reject_map.length() - 1) == space) {
162 stats_.last_char_was_tilde = true;
163 } else {
164 stats_.last_char_was_tilde = false;
165 }
166 } else if (word->word->space() > 0) {
167 stats_.last_char_was_tilde = false;
168 }
169 /* else it is unchanged as there are no output chars */
170 }
171
172 ASSERT_HOST(word->best_choice->length() == word->reject_map.length());
173
174 set_unlv_suspects(word);
175 check_debug_pt(word, 120);
176 if (tessedit_rejection_debug) {
177 tprintf("Dict word: \"%s\": %d\n", word->best_choice->debug_string().c_str(),
178 dict_word(*(word->best_choice)));
179 }
180 if (!word->word->flag(W_REP_CHAR) || !tessedit_write_rep_codes) {
181 if (tessedit_zero_rejection) {
182 /* OVERRIDE ALL REJECTION MECHANISMS - ONLY REJECT TESS FAILURES */
183 for (unsigned i = 0; i < word->best_choice->length(); ++i) {
184 if (word->reject_map[i].rejected()) {
185 word->reject_map[i].setrej_minimal_rej_accept();
186 }
187 }
188 }
189 if (tessedit_minimal_rejection) {
190 /* OVERRIDE ALL REJECTION MECHANISMS - ONLY REJECT TESS FAILURES */
191 for (unsigned i = 0; i < word->best_choice->length(); ++i) {
192 if ((word->best_choice->unichar_id(i) != space) && word->reject_map[i].rejected()) {
193 word->reject_map[i].setrej_minimal_rej_accept();
194 }
195 }
196 }
197 }
198 }
199
200 /**********************************************************************
201 * determine_newline_type
202 *
203 * Find whether we have a wrapping or hard newline.
204 * Return false if not at end of line.
205 **********************************************************************/
206
207 char determine_newline_type( // test line ends
208 WERD *word, // word to do
209 BLOCK *block, // current block
210 WERD *next_word, // next word
211 BLOCK *next_block // block of next word
212 ) {
213 int16_t end_gap; // to right edge
214 int16_t width; // of next word
215 TBOX word_box; // bounding
216 TBOX next_box; // next word
217 TBOX block_box; // block bounding
218
219 if (!word->flag(W_EOL)) {
220 return false; // not end of line
221 }
222 if (next_word == nullptr || next_block == nullptr || block != next_block) {
223 return CTRL_NEWLINE;
224 }
225 if (next_word->space() > 0) {
226 return CTRL_HARDLINE; // it is tabbed
227 }
228 word_box = word->bounding_box();
229 next_box = next_word->bounding_box();
230 block_box = block->pdblk.bounding_box();
231 // gap to eol
232 end_gap = block_box.right() - word_box.right();
233 end_gap -= static_cast<int32_t>(block->space());
234 width = next_box.right() - next_box.left();
235 // tprintf("end_gap=%d-%d=%d, width=%d-%d=%d, nl=%d\n",
236 // block_box.right(),word_box.right(),end_gap,
237 // next_box.right(),next_box.left(),width,
238 // end_gap>width ? CTRL_HARDLINE : CTRL_NEWLINE);
239 return end_gap > width ? CTRL_HARDLINE : CTRL_NEWLINE;
240 }
241
242 /*************************************************************************
243 * get_rep_char()
244 * Return the first accepted character from the repetition string. This is the
245 * character which is repeated - as determined earlier by fix_rep_char()
246 *************************************************************************/
247 UNICHAR_ID Tesseract::get_rep_char(WERD_RES *word) { // what char is repeated?
248 int i;
249 for (i = 0; ((i < word->reject_map.length()) && (word->reject_map[i].rejected())); ++i) {
250 ;
251 }
252
253 if (i < word->reject_map.length()) {
254 return word->best_choice->unichar_id(i);
255 } else {
256 return word->uch_set->unichar_to_id(unrecognised_char.c_str());
257 }
258 }
259
260 /*************************************************************************
261 * SUSPECT LEVELS
262 *
263 * 0 - don't reject ANYTHING
264 * 1,2 - partial rejection
265 * 3 - BEST
266 *
267 * NOTE: to reject JUST tess failures in the .map file set suspect_level 3 and
268 * tessedit_minimal_rejection.
269 *************************************************************************/
270 void Tesseract::set_unlv_suspects(WERD_RES *word_res) {
271 int len = word_res->reject_map.length();
272 const WERD_CHOICE &word = *(word_res->best_choice);
273 const UNICHARSET &uchset = *word.unicharset();
274 int i;
275 float rating_per_ch;
276
277 if (suspect_level == 0) {
278 for (i = 0; i < len; i++) {
279 if (word_res->reject_map[i].rejected()) {
280 word_res->reject_map[i].setrej_minimal_rej_accept();
281 }
282 }
283 return;
284 }
285
286 if (suspect_level >= 3) {
287 return; // Use defaults
288 }
289
290 /* NOW FOR LEVELS 1 and 2 Find some stuff to unreject*/
291
292 if (safe_dict_word(word_res) && (count_alphas(word) > suspect_short_words)) {
293 /* Unreject alphas in dictionary words */
294 for (i = 0; i < len; ++i) {
295 if (word_res->reject_map[i].rejected() && uchset.get_isalpha(word.unichar_id(i))) {
296 word_res->reject_map[i].setrej_minimal_rej_accept();
297 }
298 }
299 }
300
301 rating_per_ch = word.rating() / word_res->reject_map.length();
302
303 if (rating_per_ch >= suspect_rating_per_ch) {
304 return; // Don't touch bad ratings
305 }
306
307 if ((word_res->tess_accepted) || (rating_per_ch < suspect_accept_rating)) {
308 /* Unreject any Tess Acceptable word - but NOT tess reject chs*/
309 for (i = 0; i < len; ++i) {
310 if (word_res->reject_map[i].rejected() && (!uchset.eq(word.unichar_id(i), " "))) {
311 word_res->reject_map[i].setrej_minimal_rej_accept();
312 }
313 }
314 }
315
316 for (i = 0; i < len; i++) {
317 if (word_res->reject_map[i].rejected()) {
318 if (word_res->reject_map[i].flag(R_DOC_REJ)) {
319 word_res->reject_map[i].setrej_minimal_rej_accept();
320 }
321 if (word_res->reject_map[i].flag(R_BLOCK_REJ)) {
322 word_res->reject_map[i].setrej_minimal_rej_accept();
323 }
324 if (word_res->reject_map[i].flag(R_ROW_REJ)) {
325 word_res->reject_map[i].setrej_minimal_rej_accept();
326 }
327 }
328 }
329
330 if (suspect_level == 2) {
331 return;
332 }
333
334 if (!suspect_constrain_1Il || (word_res->reject_map.length() <= suspect_short_words)) {
335 for (i = 0; i < len; i++) {
336 if (word_res->reject_map[i].rejected()) {
337 if ((word_res->reject_map[i].flag(R_1IL_CONFLICT) ||
338 word_res->reject_map[i].flag(R_POSTNN_1IL))) {
339 word_res->reject_map[i].setrej_minimal_rej_accept();
340 }
341
342 if (!suspect_constrain_1Il && word_res->reject_map[i].flag(R_MM_REJECT)) {
343 word_res->reject_map[i].setrej_minimal_rej_accept();
344 }
345 }
346 }
347 }
348
349 if (acceptable_word_string(*word_res->uch_set, word.unichar_string().c_str(),
350 word.unichar_lengths().c_str()) != AC_UNACCEPTABLE ||
351 acceptable_number_string(word.unichar_string().c_str(), word.unichar_lengths().c_str())) {
352 if (word_res->reject_map.length() > suspect_short_words) {
353 for (i = 0; i < len; i++) {
354 if (word_res->reject_map[i].rejected() && (!word_res->reject_map[i].perm_rejected() ||
355 word_res->reject_map[i].flag(R_1IL_CONFLICT) ||
356 word_res->reject_map[i].flag(R_POSTNN_1IL) ||
357 word_res->reject_map[i].flag(R_MM_REJECT))) {
358 word_res->reject_map[i].setrej_minimal_rej_accept();
359 }
360 }
361 }
362 }
363 }
364
365 int16_t Tesseract::count_alphas(const WERD_CHOICE &word) {
366 int count = 0;
367 for (unsigned i = 0; i < word.length(); ++i) {
368 if (word.unicharset()->get_isalpha(word.unichar_id(i))) {
369 count++;
370 }
371 }
372 return count;
373 }
374
375 int16_t Tesseract::count_alphanums(const WERD_CHOICE &word) {
376 int count = 0;
377 for (unsigned i = 0; i < word.length(); ++i) {
378 if (word.unicharset()->get_isalpha(word.unichar_id(i)) ||
379 word.unicharset()->get_isdigit(word.unichar_id(i))) {
380 count++;
381 }
382 }
383 return count;
384 }
385
386 bool Tesseract::acceptable_number_string(const char *s, const char *lengths) {
387 bool prev_digit = false;
388
389 if (*lengths == 1 && *s == '(') {
390 s++;
391 }
392
393 if (*lengths == 1 && ((*s == '$') || (*s == '.') || (*s == '+') || (*s == '-'))) {
394 s++;
395 }
396
397 for (; *s != '\0'; s += *(lengths++)) {
398 if (unicharset.get_isdigit(s, *lengths)) {
399 prev_digit = true;
400 } else if (prev_digit && (*lengths == 1 && ((*s == '.') || (*s == ',') || (*s == '-')))) {
401 prev_digit = false;
402 } else if (prev_digit && *lengths == 1 && (*(s + *lengths) == '\0') &&
403 ((*s == '%') || (*s == ')'))) {
404 return true;
405 } else if (prev_digit && *lengths == 1 && (*s == '%') &&
406 (*(lengths + 1) == 1 && *(s + *lengths) == ')') &&
407 (*(s + *lengths + *(lengths + 1)) == '\0')) {
408 return true;
409 } else {
410 return false;
411 }
412 }
413 return true;
414 }
415 } // namespace tesseract