Mercurial > hgrepos > Python2 > PyMuPDF
comparison mupdf-source/thirdparty/tesseract/src/ccstruct/boxread.cpp @ 2:b50eed0cc0ef upstream
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
The directory name has changed: no version number in the expanded directory now.
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Mon, 15 Sep 2025 11:43:07 +0200 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| 1:1d09e1dec1d9 | 2:b50eed0cc0ef |
|---|---|
| 1 /********************************************************************** | |
| 2 * File: boxread.cpp | |
| 3 * Description: Read data from a box file. | |
| 4 * Author: Ray Smith | |
| 5 * | |
| 6 * (C) Copyright 2007, Google Inc. | |
| 7 ** Licensed under the Apache License, Version 2.0 (the "License"); | |
| 8 ** you may not use this file except in compliance with the License. | |
| 9 ** You may obtain a copy of the License at | |
| 10 ** http://www.apache.org/licenses/LICENSE-2.0 | |
| 11 ** Unless required by applicable law or agreed to in writing, software | |
| 12 ** distributed under the License is distributed on an "AS IS" BASIS, | |
| 13 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
| 14 ** See the License for the specific language governing permissions and | |
| 15 ** limitations under the License. | |
| 16 * | |
| 17 **********************************************************************/ | |
| 18 | |
| 19 #include "boxread.h" | |
| 20 | |
| 21 #include "errcode.h" // for ERRCODE, TESSEXIT | |
| 22 #include "fileerr.h" // for CANTOPENFILE | |
| 23 #include "rect.h" // for TBOX | |
| 24 #include "tprintf.h" // for tprintf | |
| 25 | |
| 26 #include <tesseract/unichar.h> // for UNICHAR | |
| 27 #include "helpers.h" // for chomp_string | |
| 28 | |
| 29 #include <climits> // for INT_MAX | |
| 30 #include <cstring> // for strchr, strcmp | |
| 31 #include <fstream> // for std::ifstream | |
| 32 #include <locale> // for std::locale::classic | |
| 33 #include <sstream> // for std::stringstream | |
| 34 #include <string> // for std::string | |
| 35 | |
| 36 namespace tesseract { | |
| 37 | |
// Special char code used to identify multi-blob labels.
// Both the characters and the pointer itself are const, so the label cannot
// be re-pointed by accident elsewhere in this translation unit.
static const char *const kMultiBlobLabelCode = "WordStr";
| 40 | |
// Returns the box file name corresponding to the given image_filename.
// The special double suffixes ".bin.png", ".nrm.png" and ".raw.png" are
// removed as a whole; otherwise only the last extension is removed. In both
// cases ".box" is appended to the result.
// Only a dot inside the final path component counts as the start of an
// extension, so "./image" maps to "./image.box" and "dir.v1/image" maps to
// "dir.v1/image.box" instead of truncating the directory part.
static std::string BoxFileName(const char *image_filename) {
  std::string box_filename = image_filename;
  const size_t length = box_filename.length();
  const std::string last = (length > 8) ? box_filename.substr(length - 8) : "";
  if (last == ".bin.png" || last == ".nrm.png" || last == ".raw.png") {
    box_filename.resize(length - 8);
  } else {
    const size_t lastdot = box_filename.find_last_of('.');
    // Accept both separator styles so that Windows paths behave the same.
    const size_t lastsep = box_filename.find_last_of("/\\");
    const bool dot_in_basename =
        lastdot != std::string::npos && (lastsep == std::string::npos || lastdot > lastsep);
    if (dot_in_basename) {
      box_filename.resize(lastdot);
    }
  }
  box_filename += ".box";
  return box_filename;
}
| 57 | |
| 58 // Open the boxfile based on the given image filename. | |
| 59 FILE *OpenBoxFile(const char *fname) { | |
| 60 std::string filename = BoxFileName(fname); | |
| 61 FILE *box_file = nullptr; | |
| 62 if (!(box_file = fopen(filename.c_str(), "rb"))) { | |
| 63 CANTOPENFILE.error("read_next_box", TESSEXIT, "Can't open box file %s", filename.c_str()); | |
| 64 tprintf("Can't open box file %s", filename.c_str()); | |
| 65 } | |
| 66 return box_file; | |
| 67 } | |
| 68 | |
| 69 // Reads all boxes from the given filename. | |
| 70 // Reads a specific target_page number if >= 0, or all pages otherwise. | |
| 71 // Skips blanks if skip_blanks is true. | |
| 72 // The UTF-8 label of the box is put in texts, and the full box definition as | |
| 73 // a string is put in box_texts, with the corresponding page number in pages. | |
| 74 // Each of the output vectors is optional (may be nullptr). | |
| 75 // Returns false if no boxes are found. | |
| 76 bool ReadAllBoxes(int target_page, bool skip_blanks, const char *filename, std::vector<TBOX> *boxes, | |
| 77 std::vector<std::string> *texts, std::vector<std::string> *box_texts, | |
| 78 std::vector<int> *pages) { | |
| 79 std::ifstream input(BoxFileName(filename), std::ios::in | std::ios::binary); | |
| 80 if (input.fail()) { | |
| 81 tprintf("Cannot read box data from '%s'.\n", BoxFileName(filename).c_str()); | |
| 82 tprintf("Does it exists?\n"); | |
| 83 return false; | |
| 84 } | |
| 85 std::vector<char> box_data(std::istreambuf_iterator<char>(input), {}); | |
| 86 if (box_data.empty()) { | |
| 87 tprintf("No box data found in '%s'.\n", BoxFileName(filename).c_str()); | |
| 88 return false; | |
| 89 } | |
| 90 // Convert the array of bytes to a string, so it can be used by the parser. | |
| 91 box_data.push_back('\0'); | |
| 92 return ReadMemBoxes(target_page, skip_blanks, &box_data[0], | |
| 93 /*continue_on_failure*/ true, boxes, texts, box_texts, pages); | |
| 94 } | |
| 95 | |
| 96 // Reads all boxes from the string. Otherwise, as ReadAllBoxes. | |
| 97 bool ReadMemBoxes(int target_page, bool skip_blanks, const char *box_data, bool continue_on_failure, | |
| 98 std::vector<TBOX> *boxes, std::vector<std::string> *texts, | |
| 99 std::vector<std::string> *box_texts, std::vector<int> *pages) { | |
| 100 std::string box_str(box_data); | |
| 101 std::vector<std::string> lines = split(box_str, '\n'); | |
| 102 if (lines.empty()) { | |
| 103 return false; | |
| 104 } | |
| 105 int num_boxes = 0; | |
| 106 for (auto &line : lines) { | |
| 107 int page = 0; | |
| 108 std::string utf8_str; | |
| 109 TBOX box; | |
| 110 if (!ParseBoxFileStr(line.c_str(), &page, utf8_str, &box)) { | |
| 111 if (continue_on_failure) { | |
| 112 continue; | |
| 113 } else { | |
| 114 return false; | |
| 115 } | |
| 116 } | |
| 117 if (skip_blanks && (utf8_str == " " || utf8_str == "\t")) { | |
| 118 continue; | |
| 119 } | |
| 120 if (target_page >= 0 && page != target_page) { | |
| 121 continue; | |
| 122 } | |
| 123 if (boxes != nullptr) { | |
| 124 boxes->push_back(box); | |
| 125 } | |
| 126 if (texts != nullptr) { | |
| 127 texts->push_back(utf8_str); | |
| 128 } | |
| 129 if (box_texts != nullptr) { | |
| 130 std::string full_text; | |
| 131 MakeBoxFileStr(utf8_str.c_str(), box, target_page, full_text); | |
| 132 box_texts->push_back(full_text); | |
| 133 } | |
| 134 if (pages != nullptr) { | |
| 135 pages->push_back(page); | |
| 136 } | |
| 137 ++num_boxes; | |
| 138 } | |
| 139 return num_boxes > 0; | |
| 140 } | |
| 141 | |
| 142 // TODO(rays) convert all uses of ReadNextBox to use the new ReadAllBoxes. | |
| 143 // Box files are used ONLY DURING TRAINING, but by both processes of | |
| 144 // creating tr files with tesseract, and unicharset_extractor. | |
| 145 // ReadNextBox factors out the code to interpret a line of a box | |
| 146 // file so that applybox and unicharset_extractor interpret the same way. | |
| 147 // This function returns the next valid box file utf8 string and coords | |
| 148 // and returns true, or false on eof (and closes the file). | |
| 149 // It ignores the utf8 file signature ByteOrderMark (U+FEFF=EF BB BF), checks | |
| 150 // for valid utf-8 and allows space or tab between fields. | |
| 151 // utf8_str is set with the unichar string, and bounding box with the box. | |
| 152 // If there are page numbers in the file, it reads them all. | |
| 153 bool ReadNextBox(int *line_number, FILE *box_file, std::string &utf8_str, TBOX *bounding_box) { | |
| 154 return ReadNextBox(-1, line_number, box_file, utf8_str, bounding_box); | |
| 155 } | |
| 156 | |
| 157 // As ReadNextBox above, but get a specific page number. (0-based) | |
| 158 // Use -1 to read any page number. Files without page number all | |
| 159 // read as if they are page 0. | |
| 160 bool ReadNextBox(int target_page, int *line_number, FILE *box_file, std::string &utf8_str, | |
| 161 TBOX *bounding_box) { | |
| 162 int page = 0; | |
| 163 char buff[kBoxReadBufSize]; // boxfile read buffer | |
| 164 char *buffptr = buff; | |
| 165 | |
| 166 while (fgets(buff, sizeof(buff) - 1, box_file)) { | |
| 167 (*line_number)++; | |
| 168 | |
| 169 buffptr = buff; | |
| 170 const auto *ubuf = reinterpret_cast<const unsigned char *>(buffptr); | |
| 171 if (ubuf[0] == 0xef && ubuf[1] == 0xbb && ubuf[2] == 0xbf) { | |
| 172 buffptr += 3; // Skip unicode file designation. | |
| 173 } | |
| 174 // Check for blank lines in box file | |
| 175 if (*buffptr == '\n' || *buffptr == '\0') { | |
| 176 continue; | |
| 177 } | |
| 178 // Skip blank boxes. | |
| 179 if (*buffptr == ' ' || *buffptr == '\t') { | |
| 180 continue; | |
| 181 } | |
| 182 if (*buffptr != '\0') { | |
| 183 if (!ParseBoxFileStr(buffptr, &page, utf8_str, bounding_box)) { | |
| 184 tprintf("Box file format error on line %i; ignored\n", *line_number); | |
| 185 continue; | |
| 186 } | |
| 187 if (target_page >= 0 && target_page != page) { | |
| 188 continue; // Not on the appropriate page. | |
| 189 } | |
| 190 return true; // Successfully read a box. | |
| 191 } | |
| 192 } | |
| 193 fclose(box_file); | |
| 194 return false; // EOF | |
| 195 } | |
| 196 | |
| 197 // Parses the given box file string into a page_number, utf8_str, and | |
| 198 // bounding_box. Returns true on a successful parse. | |
| 199 // The box file is assumed to contain box definitions, one per line, of the | |
| 200 // following format for blob-level boxes: | |
| 201 // <UTF8 str> <left> <bottom> <right> <top> <page id> | |
| 202 // and for word/line-level boxes: | |
| 203 // WordStr <left> <bottom> <right> <top> <page id> #<space-delimited word str> | |
| 204 // See applyybox.cpp for more information. | |
| 205 bool ParseBoxFileStr(const char *boxfile_str, int *page_number, std::string &utf8_str, | |
| 206 TBOX *bounding_box) { | |
| 207 *bounding_box = TBOX(); // Initialize it to empty. | |
| 208 utf8_str = ""; | |
| 209 char uch[kBoxReadBufSize]; | |
| 210 const char *buffptr = boxfile_str; | |
| 211 // Read the unichar without messing up on Tibetan. | |
| 212 // According to issue 253 the utf-8 surrogates 85 and A0 are treated | |
| 213 // as whitespace by sscanf, so it is more reliable to just find | |
| 214 // ascii space and tab. | |
| 215 int uch_len = 0; | |
| 216 // Skip unicode file designation, if present. | |
| 217 const auto *ubuf = reinterpret_cast<const unsigned char *>(buffptr); | |
| 218 if (ubuf[0] == 0xef && ubuf[1] == 0xbb && ubuf[2] == 0xbf) { | |
| 219 buffptr += 3; | |
| 220 } | |
| 221 // Allow a single blank as the UTF-8 string. Check for empty string and | |
| 222 // then blindly eat the first character. | |
| 223 if (*buffptr == '\0') { | |
| 224 return false; | |
| 225 } | |
| 226 do { | |
| 227 uch[uch_len++] = *buffptr++; | |
| 228 } while (*buffptr != '\0' && *buffptr != ' ' && *buffptr != '\t' && | |
| 229 uch_len < kBoxReadBufSize - 1); | |
| 230 uch[uch_len] = '\0'; | |
| 231 if (*buffptr != '\0') { | |
| 232 ++buffptr; | |
| 233 } | |
| 234 int x_min = INT_MAX; | |
| 235 int y_min = INT_MAX; | |
| 236 int x_max = INT_MIN; | |
| 237 int y_max = INT_MIN; | |
| 238 *page_number = 0; | |
| 239 std::stringstream stream(buffptr); | |
| 240 stream.imbue(std::locale::classic()); | |
| 241 stream >> x_min; | |
| 242 stream >> y_min; | |
| 243 stream >> x_max; | |
| 244 stream >> y_max; | |
| 245 stream >> *page_number; | |
| 246 if (x_max < x_min || y_max < y_min) { | |
| 247 tprintf("Bad box coordinates in boxfile string! %s\n", ubuf); | |
| 248 return false; | |
| 249 } | |
| 250 // Test for long space-delimited string label. | |
| 251 if (strcmp(uch, kMultiBlobLabelCode) == 0 && (buffptr = strchr(buffptr, '#')) != nullptr) { | |
| 252 strncpy(uch, buffptr + 1, kBoxReadBufSize - 1); | |
| 253 uch[kBoxReadBufSize - 1] = '\0'; // Prevent buffer overrun. | |
| 254 chomp_string(uch); | |
| 255 uch_len = strlen(uch); | |
| 256 } | |
| 257 // Validate UTF8 by making unichars with it. | |
| 258 int used = 0; | |
| 259 while (used < uch_len) { | |
| 260 tesseract::UNICHAR ch(uch + used, uch_len - used); | |
| 261 int new_used = ch.utf8_len(); | |
| 262 if (new_used == 0) { | |
| 263 tprintf("Bad UTF-8 str %s starts with 0x%02x at col %d\n", uch + used, uch[used], used + 1); | |
| 264 return false; | |
| 265 } | |
| 266 used += new_used; | |
| 267 } | |
| 268 utf8_str = uch; | |
| 269 if (x_min > x_max) { | |
| 270 std::swap(x_min, x_max); | |
| 271 } | |
| 272 if (y_min > y_max) { | |
| 273 std::swap(y_min, y_max); | |
| 274 } | |
| 275 bounding_box->set_to_given_coords(x_min, y_min, x_max, y_max); | |
| 276 return true; // Successfully read a box. | |
| 277 } | |
| 278 | |
| 279 // Creates a box file string from a unichar string, TBOX and page number. | |
| 280 void MakeBoxFileStr(const char *unichar_str, const TBOX &box, int page_num, std::string &box_str) { | |
| 281 box_str = unichar_str; | |
| 282 box_str += " " + std::to_string(box.left()); | |
| 283 box_str += " " + std::to_string(box.bottom()); | |
| 284 box_str += " " + std::to_string(box.right()); | |
| 285 box_str += " " + std::to_string(box.top()); | |
| 286 box_str += " " + std::to_string(page_num); | |
| 287 } | |
| 288 | |
| 289 } // namespace tesseract |
