comparison mupdf-source/thirdparty/tesseract/src/ccmain/ltrresultiterator.cpp @ 3:2c135c81b16c

MERGE: upstream PyMuPDF 1.26.4 with MuPDF 1.26.7
author Franz Glasner <fzglas.hg@dom66.de>
date Mon, 15 Sep 2025 11:44:09 +0200
parents b50eed0cc0ef
children
comparison
equal deleted inserted replaced
0:6015a75abc2d 3:2c135c81b16c
1 ///////////////////////////////////////////////////////////////////////
2 // File: ltrresultiterator.cpp
3 // Description: Iterator for tesseract results in strict left-to-right
4 // order that avoids using tesseract internal data structures.
5 // Author: Ray Smith
6 //
7 // (C) Copyright 2010, Google Inc.
8 // Licensed under the Apache License, Version 2.0 (the "License");
9 // you may not use this file except in compliance with the License.
10 // You may obtain a copy of the License at
11 // http://www.apache.org/licenses/LICENSE-2.0
12 // Unless required by applicable law or agreed to in writing, software
13 // distributed under the License is distributed on an "AS IS" BASIS,
14 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 // See the License for the specific language governing permissions and
16 // limitations under the License.
17 //
18 ///////////////////////////////////////////////////////////////////////
19
20 #include <tesseract/ltrresultiterator.h>
21
22 #include "helpers.h" // for copy_string
23 #include "pageres.h"
24 #include "tesseractclass.h"
25
26 #include <allheaders.h>
27
28 namespace tesseract {
29
30 LTRResultIterator::LTRResultIterator(PAGE_RES *page_res, Tesseract *tesseract, int scale,
31 int scaled_yres, int rect_left, int rect_top, int rect_width,
32 int rect_height)
33 : PageIterator(page_res, tesseract, scale, scaled_yres, rect_left, rect_top, rect_width,
34 rect_height)
35 , line_separator_("\n")
36 , paragraph_separator_("\n") {}
37
38 // Destructor.
39 // It is defined here, so the compiler can create a single vtable
40 // instead of weak vtables in every compilation unit.
41 LTRResultIterator::~LTRResultIterator() = default;
42
43 // Returns the null terminated UTF-8 encoded text string for the current
44 // object at the given level. Use delete [] to free after use.
45 char *LTRResultIterator::GetUTF8Text(PageIteratorLevel level) const {
46 if (it_->word() == nullptr) {
47 return nullptr; // Already at the end!
48 }
49 std::string text;
50 PAGE_RES_IT res_it(*it_);
51 WERD_CHOICE *best_choice = res_it.word()->best_choice;
52 ASSERT_HOST(best_choice != nullptr);
53 if (level == RIL_SYMBOL) {
54 text = res_it.word()->BestUTF8(blob_index_, false);
55 } else if (level == RIL_WORD) {
56 text = best_choice->unichar_string();
57 } else {
58 bool eol = false; // end of line?
59 bool eop = false; // end of paragraph?
60 do { // for each paragraph in a block
61 do { // for each text line in a paragraph
62 do { // for each word in a text line
63 best_choice = res_it.word()->best_choice;
64 ASSERT_HOST(best_choice != nullptr);
65 text += best_choice->unichar_string();
66 text += " ";
67 res_it.forward();
68 eol = res_it.row() != res_it.prev_row();
69 } while (!eol);
70 text.resize(text.length() - 1);
71 text += line_separator_;
72 eop = res_it.block() != res_it.prev_block() ||
73 res_it.row()->row->para() != res_it.prev_row()->row->para();
74 } while (level != RIL_TEXTLINE && !eop);
75 if (eop) {
76 text += paragraph_separator_;
77 }
78 } while (level == RIL_BLOCK && res_it.block() == res_it.prev_block());
79 }
80 return copy_string(text);
81 }
82
83 // Set the string inserted at the end of each text line. "\n" by default.
84 void LTRResultIterator::SetLineSeparator(const char *new_line) {
85 line_separator_ = new_line;
86 }
87
88 // Set the string inserted at the end of each paragraph. "\n" by default.
89 void LTRResultIterator::SetParagraphSeparator(const char *new_para) {
90 paragraph_separator_ = new_para;
91 }
92
93 // Returns the mean confidence of the current object at the given level.
94 // The number should be interpreted as a percent probability. (0.0f-100.0f)
95 float LTRResultIterator::Confidence(PageIteratorLevel level) const {
96 if (it_->word() == nullptr) {
97 return 0.0f; // Already at the end!
98 }
99 float mean_certainty = 0.0f;
100 int certainty_count = 0;
101 PAGE_RES_IT res_it(*it_);
102 WERD_CHOICE *best_choice;
103 switch (level) {
104 case RIL_BLOCK:
105 do {
106 best_choice = res_it.word()->best_choice;
107 mean_certainty += best_choice->certainty();
108 ++certainty_count;
109 res_it.forward();
110 } while (res_it.block() == res_it.prev_block());
111 break;
112 case RIL_PARA:
113 do {
114 best_choice = res_it.word()->best_choice;
115 mean_certainty += best_choice->certainty();
116 ++certainty_count;
117 res_it.forward();
118 } while (res_it.block() == res_it.prev_block() &&
119 res_it.row()->row->para() == res_it.prev_row()->row->para());
120 break;
121 case RIL_TEXTLINE:
122 do {
123 best_choice = res_it.word()->best_choice;
124 mean_certainty += best_choice->certainty();
125 ++certainty_count;
126 res_it.forward();
127 } while (res_it.row() == res_it.prev_row());
128 break;
129 case RIL_WORD:
130 best_choice = res_it.word()->best_choice;
131 mean_certainty = best_choice->certainty();
132 certainty_count = 1;
133 break;
134 case RIL_SYMBOL:
135 best_choice = res_it.word()->best_choice;
136 mean_certainty = best_choice->certainty(blob_index_);
137 certainty_count = 1;
138 }
139 if (certainty_count > 0) {
140 mean_certainty /= certainty_count;
141 return ClipToRange(100 + 5 * mean_certainty, 0.0f, 100.0f);
142 }
143 return 0.0f;
144 }
145
146 // Returns the font attributes of the current word. If iterating at a higher
147 // level object than words, eg textlines, then this will return the
148 // attributes of the first word in that textline.
149 // The actual return value is a string representing a font name. It points
150 // to an internal table and SHOULD NOT BE DELETED. Lifespan is the same as
151 // the iterator itself, ie rendered invalid by various members of
152 // TessBaseAPI, including Init, SetImage, End or deleting the TessBaseAPI.
153 // Pointsize is returned in printers points (1/72 inch.)
154 const char *LTRResultIterator::WordFontAttributes(bool *is_bold, bool *is_italic,
155 bool *is_underlined, bool *is_monospace,
156 bool *is_serif, bool *is_smallcaps,
157 int *pointsize, int *font_id) const {
158 const char *result = nullptr;
159
160 if (it_->word() == nullptr) {
161 // Already at the end!
162 *pointsize = 0;
163 } else {
164 float row_height =
165 it_->row()->row->x_height() + it_->row()->row->ascenders() - it_->row()->row->descenders();
166 // Convert from pixels to printers points.
167 *pointsize =
168 scaled_yres_ > 0 ? static_cast<int>(row_height * kPointsPerInch / scaled_yres_ + 0.5) : 0;
169
170 #ifndef DISABLED_LEGACY_ENGINE
171 const FontInfo *font_info = it_->word()->fontinfo;
172 if (font_info) {
173 // Font information available.
174 *font_id = font_info->universal_id;
175 *is_bold = font_info->is_bold();
176 *is_italic = font_info->is_italic();
177 *is_underlined = false; // TODO(rays) fix this!
178 *is_monospace = font_info->is_fixed_pitch();
179 *is_serif = font_info->is_serif();
180 result = font_info->name;
181 }
182 #endif // ndef DISABLED_LEGACY_ENGINE
183
184 *is_smallcaps = it_->word()->small_caps;
185 }
186
187 if (!result) {
188 *is_bold = false;
189 *is_italic = false;
190 *is_underlined = false;
191 *is_monospace = false;
192 *is_serif = false;
193 *is_smallcaps = false;
194 *font_id = -1;
195 }
196
197 return result;
198 }
199
200 // Returns the name of the language used to recognize this word.
201 const char *LTRResultIterator::WordRecognitionLanguage() const {
202 if (it_->word() == nullptr || it_->word()->tesseract == nullptr) {
203 return nullptr;
204 }
205 return it_->word()->tesseract->lang.c_str();
206 }
207
208 // Return the overall directionality of this word.
209 StrongScriptDirection LTRResultIterator::WordDirection() const {
210 if (it_->word() == nullptr) {
211 return DIR_NEUTRAL;
212 }
213 bool has_rtl = it_->word()->AnyRtlCharsInWord();
214 bool has_ltr = it_->word()->AnyLtrCharsInWord();
215 if (has_rtl && !has_ltr) {
216 return DIR_RIGHT_TO_LEFT;
217 }
218 if (has_ltr && !has_rtl) {
219 return DIR_LEFT_TO_RIGHT;
220 }
221 if (!has_ltr && !has_rtl) {
222 return DIR_NEUTRAL;
223 }
224 return DIR_MIX;
225 }
226
227 // Returns true if the current word was found in a dictionary.
228 bool LTRResultIterator::WordIsFromDictionary() const {
229 if (it_->word() == nullptr) {
230 return false; // Already at the end!
231 }
232 int permuter = it_->word()->best_choice->permuter();
233 return permuter == SYSTEM_DAWG_PERM || permuter == FREQ_DAWG_PERM || permuter == USER_DAWG_PERM;
234 }
235
236 // Returns the number of blanks before the current word.
237 int LTRResultIterator::BlanksBeforeWord() const {
238 if (it_->word() == nullptr) {
239 return 1;
240 }
241 return it_->word()->word->space();
242 }
243
244 // Returns true if the current word is numeric.
245 bool LTRResultIterator::WordIsNumeric() const {
246 if (it_->word() == nullptr) {
247 return false; // Already at the end!
248 }
249 int permuter = it_->word()->best_choice->permuter();
250 return permuter == NUMBER_PERM;
251 }
252
253 // Returns true if the word contains blamer information.
254 bool LTRResultIterator::HasBlamerInfo() const {
255 return it_->word() != nullptr && it_->word()->blamer_bundle != nullptr &&
256 it_->word()->blamer_bundle->HasDebugInfo();
257 }
258
259 #ifndef DISABLED_LEGACY_ENGINE
260 // Returns the pointer to ParamsTrainingBundle stored in the BlamerBundle
261 // of the current word.
262 const void *LTRResultIterator::GetParamsTrainingBundle() const {
263 return (it_->word() != nullptr && it_->word()->blamer_bundle != nullptr)
264 ? &(it_->word()->blamer_bundle->params_training_bundle())
265 : nullptr;
266 }
267 #endif // ndef DISABLED_LEGACY_ENGINE
268
269 // Returns the pointer to the string with blamer information for this word.
270 // Assumes that the word's blamer_bundle is not nullptr.
271 const char *LTRResultIterator::GetBlamerDebug() const {
272 return it_->word()->blamer_bundle->debug().c_str();
273 }
274
275 // Returns the pointer to the string with misadaption information for this word.
276 // Assumes that the word's blamer_bundle is not nullptr.
277 const char *LTRResultIterator::GetBlamerMisadaptionDebug() const {
278 return it_->word()->blamer_bundle->misadaption_debug().c_str();
279 }
280
281 // Returns true if a truth string was recorded for the current word.
282 bool LTRResultIterator::HasTruthString() const {
283 if (it_->word() == nullptr) {
284 return false; // Already at the end!
285 }
286 if (it_->word()->blamer_bundle == nullptr || it_->word()->blamer_bundle->NoTruth()) {
287 return false; // no truth information for this word
288 }
289 return true;
290 }
291
292 // Returns true if the given string is equivalent to the truth string for
293 // the current word.
294 bool LTRResultIterator::EquivalentToTruth(const char *str) const {
295 if (!HasTruthString()) {
296 return false;
297 }
298 ASSERT_HOST(it_->word()->uch_set != nullptr);
299 WERD_CHOICE str_wd(str, *(it_->word()->uch_set));
300 return it_->word()->blamer_bundle->ChoiceIsCorrect(&str_wd);
301 }
302
303 // Returns the null terminated UTF-8 encoded truth string for the current word.
304 // Use delete [] to free after use.
305 char *LTRResultIterator::WordTruthUTF8Text() const {
306 if (!HasTruthString()) {
307 return nullptr;
308 }
309 return copy_string(it_->word()->blamer_bundle->TruthString());
310 }
311
312 // Returns the null terminated UTF-8 encoded normalized OCR string for the
313 // current word. Use delete [] to free after use.
314 char *LTRResultIterator::WordNormedUTF8Text() const {
315 if (it_->word() == nullptr) {
316 return nullptr; // Already at the end!
317 }
318 std::string ocr_text;
319 WERD_CHOICE *best_choice = it_->word()->best_choice;
320 const UNICHARSET *unicharset = it_->word()->uch_set;
321 for (unsigned i = 0; i < best_choice->length(); ++i) {
322 ocr_text += unicharset->get_normed_unichar(best_choice->unichar_id(i));
323 }
324 return copy_string(ocr_text);
325 }
326
327 // Returns a pointer to serialized choice lattice.
328 // Fills lattice_size with the number of bytes in lattice data.
329 const char *LTRResultIterator::WordLattice(int *lattice_size) const {
330 if (it_->word() == nullptr) {
331 return nullptr; // Already at the end!
332 }
333 if (it_->word()->blamer_bundle == nullptr) {
334 return nullptr;
335 }
336 *lattice_size = it_->word()->blamer_bundle->lattice_size();
337 return it_->word()->blamer_bundle->lattice_data();
338 }
339
340 // Returns true if the current symbol is a superscript.
341 // If iterating at a higher level object than symbols, eg words, then
342 // this will return the attributes of the first symbol in that word.
343 bool LTRResultIterator::SymbolIsSuperscript() const {
344 if (cblob_it_ == nullptr && it_->word() != nullptr) {
345 return it_->word()->best_choice->BlobPosition(blob_index_) == SP_SUPERSCRIPT;
346 }
347 return false;
348 }
349
350 // Returns true if the current symbol is a subscript.
351 // If iterating at a higher level object than symbols, eg words, then
352 // this will return the attributes of the first symbol in that word.
353 bool LTRResultIterator::SymbolIsSubscript() const {
354 if (cblob_it_ == nullptr && it_->word() != nullptr) {
355 return it_->word()->best_choice->BlobPosition(blob_index_) == SP_SUBSCRIPT;
356 }
357 return false;
358 }
359
360 // Returns true if the current symbol is a dropcap.
361 // If iterating at a higher level object than symbols, eg words, then
362 // this will return the attributes of the first symbol in that word.
363 bool LTRResultIterator::SymbolIsDropcap() const {
364 if (cblob_it_ == nullptr && it_->word() != nullptr) {
365 return it_->word()->best_choice->BlobPosition(blob_index_) == SP_DROPCAP;
366 }
367 return false;
368 }
369
370 ChoiceIterator::ChoiceIterator(const LTRResultIterator &result_it) {
371 ASSERT_HOST(result_it.it_->word() != nullptr);
372 word_res_ = result_it.it_->word();
373 oemLSTM_ = word_res_->tesseract->AnyLSTMLang();
374 // Is there legacy engine related trained data?
375 bool oemLegacy = word_res_->tesseract->AnyTessLang();
376 // Is lstm_choice_mode activated?
377 bool lstm_choice_mode = word_res_->tesseract->lstm_choice_mode;
378 rating_coefficient_ = word_res_->tesseract->lstm_rating_coefficient;
379 blanks_before_word_ = result_it.BlanksBeforeWord();
380 BLOB_CHOICE_LIST *choices = nullptr;
381 tstep_index_ = &result_it.blob_index_;
382 if (oemLSTM_ && !word_res_->CTC_symbol_choices.empty()) {
383 if (!word_res_->CTC_symbol_choices[0].empty() &&
384 strcmp(word_res_->CTC_symbol_choices[0][0].first, " ")) {
385 blanks_before_word_ = 0;
386 }
387 unsigned index = *tstep_index_;
388 index += blanks_before_word_;
389 if (index < word_res_->CTC_symbol_choices.size()) {
390 LSTM_choices_ = &word_res_->CTC_symbol_choices[index];
391 filterSpaces();
392 }
393 }
394 if ((oemLegacy || !lstm_choice_mode) && word_res_->ratings != nullptr) {
395 choices = word_res_->GetBlobChoices(result_it.blob_index_);
396 }
397 if (choices != nullptr && !choices->empty()) {
398 choice_it_ = new BLOB_CHOICE_IT(choices);
399 choice_it_->mark_cycle_pt();
400 } else {
401 choice_it_ = nullptr;
402 }
403 if (LSTM_choices_ != nullptr && !LSTM_choices_->empty()) {
404 LSTM_choice_it_ = LSTM_choices_->begin();
405 }
406 }
407 ChoiceIterator::~ChoiceIterator() {
408 delete choice_it_;
409 }
410
411 // Moves to the next choice for the symbol and returns false if there
412 // are none left.
413 bool ChoiceIterator::Next() {
414 if (oemLSTM_ && LSTM_choices_ != nullptr && !LSTM_choices_->empty()) {
415 if (LSTM_choice_it_ == LSTM_choices_->end() ||
416 next(LSTM_choice_it_) == LSTM_choices_->end()) {
417 return false;
418 } else {
419 ++LSTM_choice_it_;
420 return true;
421 }
422 } else {
423 if (choice_it_ == nullptr) {
424 return false;
425 }
426 choice_it_->forward();
427 return !choice_it_->cycled_list();
428 }
429 }
430
431 // Returns the null terminated UTF-8 encoded text string for the current
432 // choice. Do NOT use delete [] to free after use.
433 const char *ChoiceIterator::GetUTF8Text() const {
434 if (oemLSTM_ && LSTM_choices_ != nullptr && !LSTM_choices_->empty()) {
435 std::pair<const char *, float> choice = *LSTM_choice_it_;
436 return choice.first;
437 } else {
438 if (choice_it_ == nullptr) {
439 return nullptr;
440 }
441 UNICHAR_ID id = choice_it_->data()->unichar_id();
442 return word_res_->uch_set->id_to_unichar_ext(id);
443 }
444 }
445
446 // Returns the confidence of the current choice depending on the used language
447 // data. If only LSTM traineddata is used the value range is 0.0f - 1.0f. All
448 // choices for one symbol should roughly add up to 1.0f.
449 // If only traineddata of the legacy engine is used, the number should be
450 // interpreted as a percent probability. (0.0f-100.0f) In this case
451 // probabilities won't add up to 100. Each one stands on its own.
452 float ChoiceIterator::Confidence() const {
453 float confidence;
454 if (oemLSTM_ && LSTM_choices_ != nullptr && !LSTM_choices_->empty()) {
455 std::pair<const char *, float> choice = *LSTM_choice_it_;
456 confidence = 100 - rating_coefficient_ * choice.second;
457 } else {
458 if (choice_it_ == nullptr) {
459 return 0.0f;
460 }
461 confidence = 100 + 5 * choice_it_->data()->certainty();
462 }
463 return ClipToRange(confidence, 0.0f, 100.0f);
464 }
465
466 // Returns the set of timesteps which belong to the current symbol
467 std::vector<std::vector<std::pair<const char *, float>>> *ChoiceIterator::Timesteps() const {
468 unsigned offset = *tstep_index_ + blanks_before_word_;
469 if (offset >= word_res_->segmented_timesteps.size() || !oemLSTM_) {
470 return nullptr;
471 }
472 return &word_res_->segmented_timesteps[offset];
473 }
474
475 void ChoiceIterator::filterSpaces() {
476 if (LSTM_choices_->empty()) {
477 return;
478 }
479 std::vector<std::pair<const char *, float>>::iterator it;
480 for (it = LSTM_choices_->begin(); it != LSTM_choices_->end();) {
481 if (!strcmp(it->first, " ")) {
482 it = LSTM_choices_->erase(it);
483 } else {
484 ++it;
485 }
486 }
487 }
488 } // namespace tesseract.