Mercurial > hgrepos > Python2 > PyMuPDF
comparison mupdf-source/thirdparty/tesseract/src/ccmain/linerec.cpp @ 2:b50eed0cc0ef upstream
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
The directory name has changed: no version number in the expanded directory now.
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Mon, 15 Sep 2025 11:43:07 +0200 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| 1:1d09e1dec1d9 | 2:b50eed0cc0ef |
|---|---|
| 1 /////////////////////////////////////////////////////////////////////// | |
| 2 // File: linerec.cpp | |
| 3 // Description: Top-level line-based recognition module for Tesseract. | |
| 4 // Author: Ray Smith | |
| 5 // | |
| 6 // (C) Copyright 2013, Google Inc. | |
| 7 // Licensed under the Apache License, Version 2.0 (the "License"); | |
| 8 // you may not use this file except in compliance with the License. | |
| 9 // You may obtain a copy of the License at | |
| 10 // http://www.apache.org/licenses/LICENSE-2.0 | |
| 11 // Unless required by applicable law or agreed to in writing, software | |
| 12 // distributed under the License is distributed on an "AS IS" BASIS, | |
| 13 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
| 14 // See the License for the specific language governing permissions and | |
| 15 // limitations under the License. | |
| 16 /////////////////////////////////////////////////////////////////////// | |
| 17 | |
| 18 #include "tesseractclass.h" | |
| 19 | |
| 20 #include <allheaders.h> | |
| 21 #include "boxread.h" | |
| 22 #include "imagedata.h" // for ImageData | |
| 23 #include "lstmrecognizer.h" | |
| 24 #include "pageres.h" | |
| 25 #include "recodebeam.h" | |
| 26 #include "tprintf.h" | |
| 27 | |
| 28 #include <algorithm> | |
| 29 | |
| 30 namespace tesseract { | |
| 31 | |
// Multiplier applied to LSTM certainties so that they are more comparable to
// the certainties produced by Tesseract.
constexpr float kCertaintyScale = 7.0f;
// Worst acceptable certainty for a dictionary word, expressed on the scaled
// (Tesseract-comparable) range; divide by kCertaintyScale to get the
// equivalent unscaled value.
constexpr float kWorstDictCertainty = -25.0f;
| 36 | |
| 37 // Generates training data for training a line recognizer, eg LSTM. | |
| 38 // Breaks the page into lines, according to the boxes, and writes them to a | |
| 39 // serialized DocumentData based on output_basename. | |
| 40 // Return true if successful, false if an error occurred. | |
| 41 bool Tesseract::TrainLineRecognizer(const char *input_imagename, const std::string &output_basename, | |
| 42 BLOCK_LIST *block_list) { | |
| 43 std::string lstmf_name = output_basename + ".lstmf"; | |
| 44 DocumentData images(lstmf_name); | |
| 45 if (applybox_page > 0) { | |
| 46 // Load existing document for the previous pages. | |
| 47 if (!images.LoadDocument(lstmf_name.c_str(), 0, 0, nullptr)) { | |
| 48 tprintf("Failed to read training data from %s!\n", lstmf_name.c_str()); | |
| 49 return false; | |
| 50 } | |
| 51 } | |
| 52 std::vector<TBOX> boxes; | |
| 53 std::vector<std::string> texts; | |
| 54 // Get the boxes for this page, if there are any. | |
| 55 if (!ReadAllBoxes(applybox_page, false, input_imagename, &boxes, &texts, nullptr, nullptr) || | |
| 56 boxes.empty()) { | |
| 57 tprintf("Failed to read boxes from %s\n", input_imagename); | |
| 58 return false; | |
| 59 } | |
| 60 TrainFromBoxes(boxes, texts, block_list, &images); | |
| 61 if (images.PagesSize() == 0) { | |
| 62 tprintf("Failed to read pages from %s\n", input_imagename); | |
| 63 return false; | |
| 64 } | |
| 65 images.Shuffle(); | |
| 66 if (!images.SaveDocument(lstmf_name.c_str(), nullptr)) { | |
| 67 tprintf("Failed to write training data to %s!\n", lstmf_name.c_str()); | |
| 68 return false; | |
| 69 } | |
| 70 return true; | |
| 71 } | |
| 72 | |
| 73 // Generates training data for training a line recognizer, eg LSTM. | |
| 74 // Breaks the boxes into lines, normalizes them, converts to ImageData and | |
| 75 // appends them to the given training_data. | |
| 76 void Tesseract::TrainFromBoxes(const std::vector<TBOX> &boxes, const std::vector<std::string> &texts, | |
| 77 BLOCK_LIST *block_list, DocumentData *training_data) { | |
| 78 auto box_count = boxes.size(); | |
| 79 // Process all the text lines in this page, as defined by the boxes. | |
| 80 unsigned end_box = 0; | |
| 81 // Don't let \t, which marks newlines in the box file, get into the line | |
| 82 // content, as that makes the line unusable in training. | |
| 83 while (end_box < texts.size() && texts[end_box] == "\t") { | |
| 84 ++end_box; | |
| 85 } | |
| 86 for (auto start_box = end_box; start_box < box_count; start_box = end_box) { | |
| 87 // Find the textline of boxes starting at start and their bounding box. | |
| 88 TBOX line_box = boxes[start_box]; | |
| 89 std::string line_str = texts[start_box]; | |
| 90 for (end_box = start_box + 1; end_box < box_count && texts[end_box] != "\t"; ++end_box) { | |
| 91 line_box += boxes[end_box]; | |
| 92 line_str += texts[end_box]; | |
| 93 } | |
| 94 // Find the most overlapping block. | |
| 95 BLOCK *best_block = nullptr; | |
| 96 int best_overlap = 0; | |
| 97 BLOCK_IT b_it(block_list); | |
| 98 for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) { | |
| 99 BLOCK *block = b_it.data(); | |
| 100 if (block->pdblk.poly_block() != nullptr && !block->pdblk.poly_block()->IsText()) { | |
| 101 continue; // Not a text block. | |
| 102 } | |
| 103 TBOX block_box = block->pdblk.bounding_box(); | |
| 104 block_box.rotate(block->re_rotation()); | |
| 105 if (block_box.major_overlap(line_box)) { | |
| 106 TBOX overlap_box = line_box.intersection(block_box); | |
| 107 if (overlap_box.area() > best_overlap) { | |
| 108 best_overlap = overlap_box.area(); | |
| 109 best_block = block; | |
| 110 } | |
| 111 } | |
| 112 } | |
| 113 ImageData *imagedata = nullptr; | |
| 114 if (best_block == nullptr) { | |
| 115 tprintf("No block overlapping textline: %s\n", line_str.c_str()); | |
| 116 } else { | |
| 117 imagedata = GetLineData(line_box, boxes, texts, start_box, end_box, *best_block); | |
| 118 } | |
| 119 if (imagedata != nullptr) { | |
| 120 training_data->AddPageToDocument(imagedata); | |
| 121 } | |
| 122 // Don't let \t, which marks newlines in the box file, get into the line | |
| 123 // content, as that makes the line unusable in training. | |
| 124 while (end_box < texts.size() && texts[end_box] == "\t") { | |
| 125 ++end_box; | |
| 126 } | |
| 127 } | |
| 128 } | |
| 129 | |
| 130 // Returns an Imagedata containing the image of the given box, | |
| 131 // and ground truth boxes/truth text if available in the input. | |
| 132 // The image is not normalized in any way. | |
| 133 ImageData *Tesseract::GetLineData(const TBOX &line_box, const std::vector<TBOX> &boxes, | |
| 134 const std::vector<std::string> &texts, int start_box, int end_box, | |
| 135 const BLOCK &block) { | |
| 136 TBOX revised_box; | |
| 137 ImageData *image_data = GetRectImage(line_box, block, kImagePadding, &revised_box); | |
| 138 if (image_data == nullptr) { | |
| 139 return nullptr; | |
| 140 } | |
| 141 image_data->set_page_number(applybox_page); | |
| 142 // Copy the boxes and shift them so they are relative to the image. | |
| 143 FCOORD block_rotation(block.re_rotation().x(), -block.re_rotation().y()); | |
| 144 ICOORD shift = -revised_box.botleft(); | |
| 145 std::vector<TBOX> line_boxes; | |
| 146 std::vector<std::string> line_texts; | |
| 147 for (int b = start_box; b < end_box; ++b) { | |
| 148 TBOX box = boxes[b]; | |
| 149 box.rotate(block_rotation); | |
| 150 box.move(shift); | |
| 151 line_boxes.push_back(box); | |
| 152 line_texts.push_back(texts[b]); | |
| 153 } | |
| 154 std::vector<int> page_numbers(line_boxes.size(), applybox_page); | |
| 155 image_data->AddBoxes(line_boxes, line_texts, page_numbers); | |
| 156 return image_data; | |
| 157 } | |
| 158 | |
| 159 // Helper gets the image of a rectangle, using the block.re_rotation() if | |
| 160 // needed to get to the image, and rotating the result back to horizontal | |
| 161 // layout. (CJK characters will be on their left sides) The vertical text flag | |
| 162 // is set in the returned ImageData if the text was originally vertical, which | |
| 163 // can be used to invoke a different CJK recognition engine. The revised_box | |
| 164 // is also returned to enable calculation of output bounding boxes. | |
| 165 ImageData *Tesseract::GetRectImage(const TBOX &box, const BLOCK &block, int padding, | |
| 166 TBOX *revised_box) const { | |
| 167 TBOX wbox = box; | |
| 168 wbox.pad(padding, padding); | |
| 169 *revised_box = wbox; | |
| 170 // Number of clockwise 90 degree rotations needed to get back to tesseract | |
| 171 // coords from the clipped image. | |
| 172 int num_rotations = 0; | |
| 173 if (block.re_rotation().y() > 0.0f) { | |
| 174 num_rotations = 1; | |
| 175 } else if (block.re_rotation().x() < 0.0f) { | |
| 176 num_rotations = 2; | |
| 177 } else if (block.re_rotation().y() < 0.0f) { | |
| 178 num_rotations = 3; | |
| 179 } | |
| 180 // Handle two cases automatically: 1 the box came from the block, 2 the box | |
| 181 // came from a box file, and refers to the image, which the block may not. | |
| 182 if (block.pdblk.bounding_box().major_overlap(*revised_box)) { | |
| 183 revised_box->rotate(block.re_rotation()); | |
| 184 } | |
| 185 // Now revised_box always refers to the image. | |
| 186 // BestPix is never colormapped, but may be of any depth. | |
| 187 Image pix = BestPix(); | |
| 188 int width = pixGetWidth(pix); | |
| 189 int height = pixGetHeight(pix); | |
| 190 TBOX image_box(0, 0, width, height); | |
| 191 // Clip to image bounds; | |
| 192 *revised_box &= image_box; | |
| 193 if (revised_box->null_box()) { | |
| 194 return nullptr; | |
| 195 } | |
| 196 Box *clip_box = boxCreate(revised_box->left(), height - revised_box->top(), revised_box->width(), | |
| 197 revised_box->height()); | |
| 198 Image box_pix = pixClipRectangle(pix, clip_box, nullptr); | |
| 199 boxDestroy(&clip_box); | |
| 200 if (box_pix == nullptr) { | |
| 201 return nullptr; | |
| 202 } | |
| 203 if (num_rotations > 0) { | |
| 204 Image rot_pix = pixRotateOrth(box_pix, num_rotations); | |
| 205 box_pix.destroy(); | |
| 206 box_pix = rot_pix; | |
| 207 } | |
| 208 // Convert sub-8-bit images to 8 bit. | |
| 209 int depth = pixGetDepth(box_pix); | |
| 210 if (depth < 8) { | |
| 211 Image grey; | |
| 212 grey = pixConvertTo8(box_pix, false); | |
| 213 box_pix.destroy(); | |
| 214 box_pix = grey; | |
| 215 } | |
| 216 bool vertical_text = false; | |
| 217 if (num_rotations > 0) { | |
| 218 // Rotated the clipped revised box back to internal coordinates. | |
| 219 FCOORD rotation(block.re_rotation().x(), -block.re_rotation().y()); | |
| 220 revised_box->rotate(rotation); | |
| 221 if (num_rotations != 2) { | |
| 222 vertical_text = true; | |
| 223 } | |
| 224 } | |
| 225 return new ImageData(vertical_text, box_pix); | |
| 226 } | |
| 227 | |
| 228 // Recognizes a word or group of words, converting to WERD_RES in *words. | |
| 229 // Analogous to classify_word_pass1, but can handle a group of words as well. | |
| 230 void Tesseract::LSTMRecognizeWord(const BLOCK &block, ROW *row, WERD_RES *word, | |
| 231 PointerVector<WERD_RES> *words) { | |
| 232 TBOX word_box = word->word->bounding_box(); | |
| 233 // Get the word image - no frills. | |
| 234 if (tessedit_pageseg_mode == PSM_SINGLE_WORD || tessedit_pageseg_mode == PSM_RAW_LINE) { | |
| 235 // In single word mode, use the whole image without any other row/word | |
| 236 // interpretation. | |
| 237 word_box = TBOX(0, 0, ImageWidth(), ImageHeight()); | |
| 238 } else { | |
| 239 float baseline = row->base_line((word_box.left() + word_box.right()) / 2); | |
| 240 if (baseline + row->descenders() < word_box.bottom()) { | |
| 241 word_box.set_bottom(baseline + row->descenders()); | |
| 242 } | |
| 243 if (baseline + row->x_height() + row->ascenders() > word_box.top()) { | |
| 244 word_box.set_top(baseline + row->x_height() + row->ascenders()); | |
| 245 } | |
| 246 } | |
| 247 ImageData *im_data = GetRectImage(word_box, block, kImagePadding, &word_box); | |
| 248 if (im_data == nullptr) { | |
| 249 return; | |
| 250 } | |
| 251 | |
| 252 bool do_invert = tessedit_do_invert; | |
| 253 float threshold = do_invert ? double(invert_threshold) : 0.0f; | |
| 254 lstm_recognizer_->RecognizeLine(*im_data, threshold, classify_debug_level > 0, | |
| 255 kWorstDictCertainty / kCertaintyScale, word_box, words, | |
| 256 lstm_choice_mode, lstm_choice_iterations); | |
| 257 delete im_data; | |
| 258 SearchWords(words); | |
| 259 } | |
| 260 | |
| 261 // Apply segmentation search to the given set of words, within the constraints | |
| 262 // of the existing ratings matrix. If there is already a best_choice on a word | |
| 263 // leaves it untouched and just sets the done/accepted etc flags. | |
| 264 void Tesseract::SearchWords(PointerVector<WERD_RES> *words) { | |
| 265 // Run the segmentation search on the network outputs and make a BoxWord | |
| 266 // for each of the output words. | |
| 267 // If we drop a word as junk, then there is always a space in front of the | |
| 268 // next. | |
| 269 const Dict *stopper_dict = lstm_recognizer_->GetDict(); | |
| 270 if (stopper_dict == nullptr) { | |
| 271 stopper_dict = &getDict(); | |
| 272 } | |
| 273 for (unsigned w = 0; w < words->size(); ++w) { | |
| 274 WERD_RES *word = (*words)[w]; | |
| 275 if (word->best_choice == nullptr) { | |
| 276 // It is a dud. | |
| 277 word->SetupFake(lstm_recognizer_->GetUnicharset()); | |
| 278 } else { | |
| 279 // Set the best state. | |
| 280 for (unsigned i = 0; i < word->best_choice->length(); ++i) { | |
| 281 int length = word->best_choice->state(i); | |
| 282 word->best_state.push_back(length); | |
| 283 } | |
| 284 word->reject_map.initialise(word->best_choice->length()); | |
| 285 word->tess_failed = false; | |
| 286 word->tess_accepted = true; | |
| 287 word->tess_would_adapt = false; | |
| 288 word->done = true; | |
| 289 word->tesseract = this; | |
| 290 float word_certainty = std::min(word->space_certainty, word->best_choice->certainty()); | |
| 291 word_certainty *= kCertaintyScale; | |
| 292 if (getDict().stopper_debug_level >= 1) { | |
| 293 tprintf("Best choice certainty=%g, space=%g, scaled=%g, final=%g\n", | |
| 294 word->best_choice->certainty(), word->space_certainty, | |
| 295 std::min(word->space_certainty, word->best_choice->certainty()) * kCertaintyScale, | |
| 296 word_certainty); | |
| 297 word->best_choice->print(); | |
| 298 } | |
| 299 word->best_choice->set_certainty(word_certainty); | |
| 300 | |
| 301 word->tess_accepted = stopper_dict->AcceptableResult(word); | |
| 302 } | |
| 303 } | |
| 304 } | |
| 305 | |
| 306 } // namespace tesseract. |
