Mercurial > hgrepos > Python2 > PyMuPDF
comparison mupdf-source/thirdparty/tesseract/src/ccmain/recogtraining.cpp @ 2:b50eed0cc0ef upstream
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
The directory name has changed: no version number in the expanded directory now.
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Mon, 15 Sep 2025 11:43:07 +0200 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| 1:1d09e1dec1d9 | 2:b50eed0cc0ef |
|---|---|
| 1 /////////////////////////////////////////////////////////////////////// | |
| 2 // File: recogtraining.cpp | |
| 3 // Description: Functions for ambiguity and parameter training. | |
| 4 // Author: Daria Antonova | |
| 5 // | |
| 6 // (C) Copyright 2009, Google Inc. | |
| 7 // Licensed under the Apache License, Version 2.0 (the "License"); | |
| 8 // you may not use this file except in compliance with the License. | |
| 9 // You may obtain a copy of the License at | |
| 10 // http://www.apache.org/licenses/LICENSE-2.0 | |
| 11 // Unless required by applicable law or agreed to in writing, software | |
| 12 // distributed under the License is distributed on an "AS IS" BASIS, | |
| 13 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
| 14 // See the License for the specific language governing permissions and | |
| 15 // limitations under the License. | |
| 16 // | |
| 17 /////////////////////////////////////////////////////////////////////// | |
| 18 | |
| 19 #include "tesseractclass.h" | |
| 20 | |
| 21 #include "boxread.h" | |
| 22 #include "control.h" | |
| 23 #include "host.h" // for NearlyEqual | |
| 24 #include "ratngs.h" | |
| 25 #ifndef DISABLED_LEGACY_ENGINE | |
| 26 # include "reject.h" | |
| 27 #endif | |
| 28 #include "stopper.h" | |
| 29 | |
| 30 namespace tesseract { | |
| 31 | |
// Tolerance handed to NearlyEqual() when an edge of a Tesseract word box is
// compared with the matching edge of a box read from the box file
// (presumably in image pixels -- TODO confirm).
constexpr int16_t kMaxBoxEdgeDiff = 2;
| 33 | |
| 34 // Sets flags necessary for recognition in the training mode. | |
| 35 // Opens and returns the pointer to the output file. | |
| 36 FILE *Tesseract::init_recog_training(const char *filename) { | |
| 37 if (tessedit_ambigs_training) { | |
| 38 tessedit_tess_adaption_mode.set_value(0); // turn off adaption | |
| 39 tessedit_enable_doc_dict.set_value(false); // turn off document dictionary | |
| 40 // Explore all segmentations. | |
| 41 getDict().stopper_no_acceptable_choices.set_value(true); | |
| 42 } | |
| 43 | |
| 44 std::string output_fname = filename; | |
| 45 const char *lastdot = strrchr(output_fname.c_str(), '.'); | |
| 46 if (lastdot != nullptr) { | |
| 47 output_fname[lastdot - output_fname.c_str()] = '\0'; | |
| 48 } | |
| 49 output_fname += ".txt"; | |
| 50 FILE *output_file = fopen(output_fname.c_str(), "a+"); | |
| 51 if (output_file == nullptr) { | |
| 52 tprintf("Error: Could not open file %s\n", output_fname.c_str()); | |
| 53 ASSERT_HOST(output_file); | |
| 54 } | |
| 55 return output_file; | |
| 56 } | |
| 57 | |
| 58 // Copies the bounding box from page_res_it->word() to the given TBOX. | |
| 59 static bool read_t(PAGE_RES_IT *page_res_it, TBOX *tbox) { | |
| 60 while (page_res_it->block() != nullptr && page_res_it->word() == nullptr) { | |
| 61 page_res_it->forward(); | |
| 62 } | |
| 63 | |
| 64 if (page_res_it->word() != nullptr) { | |
| 65 *tbox = page_res_it->word()->word->bounding_box(); | |
| 66 | |
| 67 // If tbox->left() is negative, the training image has vertical text and | |
| 68 // all the coordinates of bounding boxes of page_res are rotated by 90 | |
| 69 // degrees in a counterclockwise direction. We need to rotate the TBOX back | |
| 70 // in order to compare with the TBOXes of box files. | |
| 71 if (tbox->left() < 0) { | |
| 72 tbox->rotate(FCOORD(0.0, -1.0)); | |
| 73 } | |
| 74 | |
| 75 return true; | |
| 76 } else { | |
| 77 return false; | |
| 78 } | |
| 79 } | |
| 80 | |
| 81 // This function takes tif/box pair of files and runs recognition on the image, | |
| 82 // while making sure that the word bounds that tesseract identified roughly | |
| 83 // match to those specified by the input box file. For each word (ngram in a | |
| 84 // single bounding box from the input box file) it outputs the ocred result, | |
| 85 // the correct label, rating and certainty. | |
| 86 void Tesseract::recog_training_segmented(const char *filename, PAGE_RES *page_res, | |
| 87 volatile ETEXT_DESC *monitor, FILE *output_file) { | |
| 88 std::string box_fname = filename; | |
| 89 const char *lastdot = strrchr(box_fname.c_str(), '.'); | |
| 90 if (lastdot != nullptr) { | |
| 91 box_fname[lastdot - box_fname.c_str()] = '\0'; | |
| 92 } | |
| 93 box_fname += ".box"; | |
| 94 // ReadNextBox() will close box_file | |
| 95 FILE *box_file = fopen(box_fname.c_str(), "r"); | |
| 96 if (box_file == nullptr) { | |
| 97 tprintf("Error: Could not open file %s\n", box_fname.c_str()); | |
| 98 ASSERT_HOST(box_file); | |
| 99 } | |
| 100 | |
| 101 PAGE_RES_IT page_res_it; | |
| 102 page_res_it.page_res = page_res; | |
| 103 page_res_it.restart_page(); | |
| 104 std::string label; | |
| 105 | |
| 106 // Process all the words on this page. | |
| 107 TBOX tbox; // tesseract-identified box | |
| 108 TBOX bbox; // box from the box file | |
| 109 bool keep_going; | |
| 110 int line_number = 0; | |
| 111 int examined_words = 0; | |
| 112 do { | |
| 113 keep_going = read_t(&page_res_it, &tbox); | |
| 114 keep_going &= ReadNextBox(applybox_page, &line_number, box_file, label, &bbox); | |
| 115 // Align bottom left points of the TBOXes. | |
| 116 while (keep_going && !NearlyEqual<int>(tbox.bottom(), bbox.bottom(), kMaxBoxEdgeDiff)) { | |
| 117 if (bbox.bottom() < tbox.bottom()) { | |
| 118 page_res_it.forward(); | |
| 119 keep_going = read_t(&page_res_it, &tbox); | |
| 120 } else { | |
| 121 keep_going = ReadNextBox(applybox_page, &line_number, box_file, label, &bbox); | |
| 122 } | |
| 123 } | |
| 124 while (keep_going && !NearlyEqual<int>(tbox.left(), bbox.left(), kMaxBoxEdgeDiff)) { | |
| 125 if (bbox.left() > tbox.left()) { | |
| 126 page_res_it.forward(); | |
| 127 keep_going = read_t(&page_res_it, &tbox); | |
| 128 } else { | |
| 129 keep_going = ReadNextBox(applybox_page, &line_number, box_file, label, &bbox); | |
| 130 } | |
| 131 } | |
| 132 // OCR the word if top right points of the TBOXes are similar. | |
| 133 if (keep_going && NearlyEqual<int>(tbox.right(), bbox.right(), kMaxBoxEdgeDiff) && | |
| 134 NearlyEqual<int>(tbox.top(), bbox.top(), kMaxBoxEdgeDiff)) { | |
| 135 ambigs_classify_and_output(label.c_str(), &page_res_it, output_file); | |
| 136 examined_words++; | |
| 137 } | |
| 138 page_res_it.forward(); | |
| 139 } while (keep_going); | |
| 140 | |
| 141 // Set up scripts on all of the words that did not get sent to | |
| 142 // ambigs_classify_and_output. They all should have, but if all the | |
| 143 // werd_res's don't get uch_sets, tesseract will crash when you try | |
| 144 // to iterate over them. :-( | |
| 145 int total_words = 0; | |
| 146 for (page_res_it.restart_page(); page_res_it.block() != nullptr; page_res_it.forward()) { | |
| 147 if (page_res_it.word()) { | |
| 148 if (page_res_it.word()->uch_set == nullptr) { | |
| 149 page_res_it.word()->SetupFake(unicharset); | |
| 150 } | |
| 151 total_words++; | |
| 152 } | |
| 153 } | |
| 154 if (examined_words < 0.85 * total_words) { | |
| 155 tprintf( | |
| 156 "TODO(antonova): clean up recog_training_segmented; " | |
| 157 " It examined only a small fraction of the ambigs image.\n"); | |
| 158 } | |
| 159 tprintf("recog_training_segmented: examined %d / %d words.\n", examined_words, total_words); | |
| 160 } | |
| 161 | |
| 162 // Helper prints the given set of blob choices. | |
| 163 static void PrintPath(int length, const BLOB_CHOICE **blob_choices, const UNICHARSET &unicharset, | |
| 164 const char *label, FILE *output_file) { | |
| 165 float rating = 0.0f; | |
| 166 float certainty = 0.0f; | |
| 167 for (int i = 0; i < length; ++i) { | |
| 168 const BLOB_CHOICE *blob_choice = blob_choices[i]; | |
| 169 fprintf(output_file, "%s", unicharset.id_to_unichar(blob_choice->unichar_id())); | |
| 170 rating += blob_choice->rating(); | |
| 171 if (certainty > blob_choice->certainty()) { | |
| 172 certainty = blob_choice->certainty(); | |
| 173 } | |
| 174 } | |
| 175 fprintf(output_file, "\t%s\t%.4f\t%.4f\n", label, rating, certainty); | |
| 176 } | |
| 177 | |
| 178 // Helper recursively prints all paths through the ratings matrix, starting | |
| 179 // at column col. | |
| 180 static void PrintMatrixPaths(int col, int dim, const MATRIX &ratings, int length, | |
| 181 const BLOB_CHOICE **blob_choices, const UNICHARSET &unicharset, | |
| 182 const char *label, FILE *output_file) { | |
| 183 for (int row = col; row < dim && row - col < ratings.bandwidth(); ++row) { | |
| 184 if (ratings.get(col, row) != NOT_CLASSIFIED) { | |
| 185 BLOB_CHOICE_IT bc_it(ratings.get(col, row)); | |
| 186 for (bc_it.mark_cycle_pt(); !bc_it.cycled_list(); bc_it.forward()) { | |
| 187 blob_choices[length] = bc_it.data(); | |
| 188 if (row + 1 < dim) { | |
| 189 PrintMatrixPaths(row + 1, dim, ratings, length + 1, blob_choices, unicharset, label, | |
| 190 output_file); | |
| 191 } else { | |
| 192 PrintPath(length + 1, blob_choices, unicharset, label, output_file); | |
| 193 } | |
| 194 } | |
| 195 } | |
| 196 } | |
| 197 } | |
| 198 | |
| 199 // Runs classify_word_pass1() on the current word. Outputs Tesseract's | |
| 200 // raw choice as a result of the classification. For words labeled with a | |
| 201 // single unichar also outputs all alternatives from blob_choices of the | |
| 202 // best choice. | |
| 203 void Tesseract::ambigs_classify_and_output(const char *label, PAGE_RES_IT *pr_it, | |
| 204 FILE *output_file) { | |
| 205 // Classify word. | |
| 206 fflush(stdout); | |
| 207 WordData word_data(*pr_it); | |
| 208 SetupWordPassN(1, &word_data); | |
| 209 classify_word_and_language(1, pr_it, &word_data); | |
| 210 WERD_RES *werd_res = word_data.word; | |
| 211 WERD_CHOICE *best_choice = werd_res->best_choice; | |
| 212 ASSERT_HOST(best_choice != nullptr); | |
| 213 | |
| 214 // Compute the number of unichars in the label. | |
| 215 std::vector<UNICHAR_ID> encoding; | |
| 216 if (!unicharset.encode_string(label, true, &encoding, nullptr, nullptr)) { | |
| 217 tprintf("Not outputting illegal unichar %s\n", label); | |
| 218 return; | |
| 219 } | |
| 220 | |
| 221 // Dump all paths through the ratings matrix (which is normally small). | |
| 222 int dim = werd_res->ratings->dimension(); | |
| 223 const auto **blob_choices = new const BLOB_CHOICE *[dim]; | |
| 224 PrintMatrixPaths(0, dim, *werd_res->ratings, 0, blob_choices, unicharset, label, output_file); | |
| 225 delete[] blob_choices; | |
| 226 } | |
| 227 | |
| 228 } // namespace tesseract |
