Mercurial > hgrepos > Python2 > PyMuPDF
comparison mupdf-source/thirdparty/tesseract/src/training/pango/boxchar.cpp @ 2:b50eed0cc0ef upstream
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
The directory name has changed: no version number in the expanded directory now.
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Mon, 15 Sep 2025 11:43:07 +0200 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| 1:1d09e1dec1d9 | 2:b50eed0cc0ef |
|---|---|
| 1 /********************************************************************** | |
| 2 * File: boxchar.cpp | |
| 3 * Description: Simple class to associate a Tesseract classification unit with | |
| 4 * its bounding box so that the boxes can be rotated as the image | |
| 5 * is rotated for degradation. Also includes routines to output | |
| 6 * the character-tagged boxes to a boxfile. | |
| 7 * Author: Ray Smith | |
| 8 * | |
| 9 * (C) Copyright 2013, Google Inc. | |
| 10 * Licensed under the Apache License, Version 2.0 (the "License"); | |
| 11 * you may not use this file except in compliance with the License. | |
| 12 * You may obtain a copy of the License at | |
| 13 * http://www.apache.org/licenses/LICENSE-2.0 | |
| 14 * Unless required by applicable law or agreed to in writing, software | |
| 15 * distributed under the License is distributed on an "AS IS" BASIS, | |
| 16 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
| 17 * See the License for the specific language governing permissions and | |
| 18 * limitations under the License. | |
| 19 * | |
| 20 **********************************************************************/ | |
| 21 | |
| 22 #include "boxchar.h" | |
| 23 | |
| 24 #include "fileio.h" | |
| 25 #include "normstrngs.h" | |
| 26 #include "tesserrstream.h" // for tesserr | |
| 27 #include "tprintf.h" | |
| 28 #include "unicharset.h" | |
| 29 #include "unicode/uchar.h" // from libicu | |
| 30 | |
| 31 #include <algorithm> | |
| 32 #include <cstddef> | |
| 33 #include <vector> | |
| 34 | |
// Minimum absolute ratio of dx:dy (or dy:dx) between two consecutive boxes
// for the step between them to count as clearly horizontal (or vertical).
constexpr int kMinNewlineRatio = 5;
| 37 | |
| 38 namespace tesseract { | |
| 39 | |
| 40 BoxChar::BoxChar(const char *utf8_str, int len) | |
| 41 : ch_(utf8_str, len), box_(nullptr), page_(0), rtl_index_(-1) {} | |
| 42 | |
| 43 BoxChar::~BoxChar() { | |
| 44 boxDestroy(&box_); | |
| 45 } | |
| 46 | |
| 47 void BoxChar::AddBox(int x, int y, int width, int height) { | |
| 48 box_ = boxCreate(x, y, width, height); | |
| 49 } | |
| 50 | |
| 51 // Increments *num_rtl and *num_ltr according to the directionality of | |
| 52 // characters in the box. | |
| 53 void BoxChar::GetDirection(int *num_rtl, int *num_ltr) const { | |
| 54 // Convert the unichar to UTF32 representation | |
| 55 std::vector<char32> uni_vector = UNICHAR::UTF8ToUTF32(ch_.c_str()); | |
| 56 if (uni_vector.empty()) { | |
| 57 tprintf("Illegal utf8 in boxchar string:%s = ", ch_.c_str()); | |
| 58 for (char c : ch_) { | |
| 59 tprintf(" 0x%x", c); | |
| 60 } | |
| 61 tprintf("\n"); | |
| 62 return; | |
| 63 } | |
| 64 for (char32 ch : uni_vector) { | |
| 65 UCharDirection dir = u_charDirection(ch); | |
| 66 if (dir == U_RIGHT_TO_LEFT || dir == U_RIGHT_TO_LEFT_ARABIC || dir == U_RIGHT_TO_LEFT_ISOLATE) { | |
| 67 ++*num_rtl; | |
| 68 } else if ((dir == U_ARABIC_NUMBER) || | |
| 69 (dir != U_DIR_NON_SPACING_MARK && dir != U_BOUNDARY_NEUTRAL)) { | |
| 70 ++*num_ltr; | |
| 71 } | |
| 72 } | |
| 73 } | |
| 74 | |
| 75 // Reverses the order of unicodes within the box. If Pango generates a | |
| 76 // ligature, these will get reversed on output, so reverse now. | |
| 77 void BoxChar::ReverseUnicodesInBox() { | |
| 78 std::vector<char32> unicodes = UNICHAR::UTF8ToUTF32(ch_.c_str()); | |
| 79 std::reverse(unicodes.begin(), unicodes.end()); | |
| 80 ch_ = UNICHAR::UTF32ToUTF8(unicodes); | |
| 81 } | |
| 82 | |
| 83 /* static */ | |
| 84 void BoxChar::TranslateBoxes(int xshift, int yshift, std::vector<BoxChar *> *boxes) { | |
| 85 for (auto &boxe : *boxes) { | |
| 86 Box *box = boxe->box_; | |
| 87 if (box != nullptr) { | |
| 88 box->x += xshift; | |
| 89 box->y += yshift; | |
| 90 } | |
| 91 } | |
| 92 } | |
| 93 | |
| 94 // Prepares for writing the boxes to a file by inserting newlines, spaces, | |
| 95 // and re-ordering so the boxes are strictly left-to-right. | |
| 96 /* static */ | |
| 97 void BoxChar::PrepareToWrite(std::vector<BoxChar *> *boxes) { | |
| 98 bool rtl_rules = ContainsMostlyRTL(*boxes); | |
| 99 bool vertical_rules = MostlyVertical(*boxes); | |
| 100 InsertNewlines(rtl_rules, vertical_rules, boxes); | |
| 101 InsertSpaces(rtl_rules, vertical_rules, boxes); | |
| 102 for (size_t i = 0; i < boxes->size(); ++i) { | |
| 103 if ((*boxes)[i]->box_ == nullptr) { | |
| 104 tesserr << "Null box at index " << i << '\n'; | |
| 105 } | |
| 106 } | |
| 107 if (rtl_rules) { | |
| 108 ReorderRTLText(boxes); | |
| 109 } | |
| 110 } | |
| 111 | |
| 112 // Inserts newline (tab) characters into the vector at newline positions. | |
| 113 /* static */ | |
| 114 void BoxChar::InsertNewlines(bool rtl_rules, bool vertical_rules, std::vector<BoxChar *> *boxes) { | |
| 115 size_t prev_i = SIZE_MAX; | |
| 116 int max_shift = 0; | |
| 117 for (size_t i = 0; i < boxes->size(); ++i) { | |
| 118 Box *box = (*boxes)[i]->box_; | |
| 119 if (box == nullptr) { | |
| 120 if (prev_i == SIZE_MAX || prev_i + 1 < i || i + 1 == boxes->size()) { | |
| 121 // Erase null boxes at the start of a line and after another null box. | |
| 122 do { | |
| 123 delete (*boxes)[i]; | |
| 124 boxes->erase(boxes->begin() + i); | |
| 125 if (i == 0) { | |
| 126 break; | |
| 127 } | |
| 128 } while (i-- == boxes->size() && (*boxes)[i]->box_ == nullptr); | |
| 129 } | |
| 130 continue; | |
| 131 } | |
| 132 if (prev_i != SIZE_MAX) { | |
| 133 Box *prev_box = (*boxes)[prev_i]->box_; | |
| 134 int shift = box->x - prev_box->x; | |
| 135 if (vertical_rules) { | |
| 136 shift = box->y - prev_box->y; | |
| 137 } else if (rtl_rules) { | |
| 138 shift = -shift; | |
| 139 } | |
| 140 if (-shift > max_shift) { | |
| 141 // This is a newline. Since nothing cares about the size of the box, | |
| 142 // except the out-of-bounds checker, minimize the chance of creating | |
| 143 // a box outside the image by making the width and height 1. | |
| 144 int width = 1; | |
| 145 int height = 1; | |
| 146 int x = prev_box->x + prev_box->w; | |
| 147 int y = prev_box->y; | |
| 148 if (vertical_rules) { | |
| 149 x = prev_box->x; | |
| 150 y = prev_box->y + prev_box->h; | |
| 151 } else if (rtl_rules) { | |
| 152 x = prev_box->x - width; | |
| 153 if (x < 0) { | |
| 154 tprintf("prev x = %d, width=%d\n", prev_box->x, width); | |
| 155 x = 0; | |
| 156 } | |
| 157 } | |
| 158 if (prev_i + 1 == i) { | |
| 159 // New character needed. | |
| 160 auto *new_box = new BoxChar("\t", 1); | |
| 161 new_box->AddBox(x, y, width, height); | |
| 162 new_box->page_ = (*boxes)[i]->page_; | |
| 163 boxes->insert(boxes->begin() + i, new_box); | |
| 164 ++i; | |
| 165 } else { | |
| 166 (*boxes)[i - 1]->AddBox(x, y, width, height); | |
| 167 (*boxes)[i - 1]->ch_ = "\t"; | |
| 168 } | |
| 169 max_shift = 0; | |
| 170 } else if (shift > max_shift) { | |
| 171 max_shift = shift; | |
| 172 } | |
| 173 } | |
| 174 prev_i = i; | |
| 175 } | |
| 176 } | |
| 177 | |
| 178 // Converts nullptr boxes to space characters, with appropriate bounding boxes. | |
| 179 /* static */ | |
| 180 void BoxChar::InsertSpaces(bool rtl_rules, bool vertical_rules, std::vector<BoxChar *> *boxes) { | |
| 181 // After InsertNewlines, any remaining null boxes are not newlines, and are | |
| 182 // singletons, so add a box to each remaining null box. | |
| 183 for (size_t i = 1; i + 1 < boxes->size(); ++i) { | |
| 184 Box *box = (*boxes)[i]->box_; | |
| 185 if (box == nullptr) { | |
| 186 Box *prev = (*boxes)[i - 1]->box_; | |
| 187 Box *next = (*boxes)[i + 1]->box_; | |
| 188 ASSERT_HOST(prev != nullptr && next != nullptr); | |
| 189 int top = std::min(prev->y, next->y); | |
| 190 int bottom = std::max(prev->y + prev->h, next->y + next->h); | |
| 191 int left = prev->x + prev->w; | |
| 192 int right = next->x; | |
| 193 if (vertical_rules) { | |
| 194 top = prev->y + prev->h; | |
| 195 bottom = next->y; | |
| 196 left = std::min(prev->x, next->x); | |
| 197 right = std::max(prev->x + prev->w, next->x + next->w); | |
| 198 } else if (rtl_rules) { | |
| 199 // With RTL we have to account for BiDi. | |
| 200 // Right becomes the min left of all prior boxes back to the first | |
| 201 // space or newline. | |
| 202 right = prev->x; | |
| 203 left = next->x + next->w; | |
| 204 for (int j = i - 2; j >= 0 && (*boxes)[j]->ch_ != " " && (*boxes)[j]->ch_ != "\t"; --j) { | |
| 205 prev = (*boxes)[j]->box_; | |
| 206 ASSERT_HOST(prev != nullptr); | |
| 207 if (prev->x < right) { | |
| 208 right = prev->x; | |
| 209 } | |
| 210 } | |
| 211 // Left becomes the max right of all next boxes forward to the first | |
| 212 // space or newline. | |
| 213 for (size_t j = i + 2; | |
| 214 j < boxes->size() && (*boxes)[j]->box_ != nullptr && (*boxes)[j]->ch_ != "\t"; ++j) { | |
| 215 next = (*boxes)[j]->box_; | |
| 216 if (next->x + next->w > left) { | |
| 217 left = next->x + next->w; | |
| 218 } | |
| 219 } | |
| 220 } | |
| 221 // Italic and stylized characters can produce negative spaces, which | |
| 222 // Leptonica doesn't like, so clip to a positive size. | |
| 223 if (right <= left) { | |
| 224 right = left + 1; | |
| 225 } | |
| 226 if (bottom <= top) { | |
| 227 bottom = top + 1; | |
| 228 } | |
| 229 (*boxes)[i]->AddBox(left, top, right - left, bottom - top); | |
| 230 (*boxes)[i]->ch_ = " "; | |
| 231 } | |
| 232 } | |
| 233 } | |
| 234 | |
| 235 // Reorders text in a right-to-left script in left-to-right order. | |
| 236 /* static */ | |
| 237 void BoxChar::ReorderRTLText(std::vector<BoxChar *> *boxes) { | |
| 238 // Ideally we need the inverse of the algorithm used by ResultIterator. | |
| 239 // For now, let's try a sort that reverses original positions for RTL | |
| 240 // characters, otherwise by x-position. This should be much closer to | |
| 241 // correct than just sorting by x-position. | |
| 242 size_t num_boxes = boxes->size(); | |
| 243 for (size_t i = 0; i < num_boxes; ++i) { | |
| 244 int num_rtl = 0, num_ltr = 0; | |
| 245 (*boxes)[i]->GetDirection(&num_rtl, &num_ltr); | |
| 246 if (num_rtl > num_ltr) { | |
| 247 (*boxes)[i]->set_rtl_index(i); | |
| 248 (*boxes)[i]->ReverseUnicodesInBox(); | |
| 249 } | |
| 250 } | |
| 251 BoxCharPtrSort sorter; | |
| 252 size_t end = 0; | |
| 253 for (size_t start = 0; start < boxes->size(); start = end + 1) { | |
| 254 end = start + 1; | |
| 255 while (end < boxes->size() && (*boxes)[end]->ch_ != "\t") { | |
| 256 ++end; | |
| 257 } | |
| 258 std::sort(boxes->begin() + start, boxes->begin() + end, sorter); | |
| 259 } | |
| 260 } | |
| 261 | |
| 262 // Returns true if the vector contains mostly RTL characters. | |
| 263 /* static */ | |
| 264 bool BoxChar::ContainsMostlyRTL(const std::vector<BoxChar *> &boxes) { | |
| 265 int num_rtl = 0, num_ltr = 0; | |
| 266 for (auto boxe : boxes) { | |
| 267 boxe->GetDirection(&num_rtl, &num_ltr); | |
| 268 } | |
| 269 return num_rtl > num_ltr; | |
| 270 } | |
| 271 | |
| 272 // Returns true if the text is mostly laid out vertically. | |
| 273 /* static */ | |
| 274 bool BoxChar::MostlyVertical(const std::vector<BoxChar *> &boxes) { | |
| 275 int64_t total_dx = 0, total_dy = 0; | |
| 276 for (size_t i = 1; i < boxes.size(); ++i) { | |
| 277 if (boxes[i - 1]->box_ != nullptr && boxes[i]->box_ != nullptr && | |
| 278 boxes[i - 1]->page_ == boxes[i]->page_) { | |
| 279 int dx = boxes[i]->box_->x - boxes[i - 1]->box_->x; | |
| 280 int dy = boxes[i]->box_->y - boxes[i - 1]->box_->y; | |
| 281 if (abs(dx) > abs(dy) * kMinNewlineRatio || abs(dy) > abs(dx) * kMinNewlineRatio) { | |
| 282 total_dx += static_cast<int64_t>(dx) * dx; | |
| 283 total_dy += static_cast<int64_t>(dy) * dy; | |
| 284 } | |
| 285 } | |
| 286 } | |
| 287 return total_dy > total_dx; | |
| 288 } | |
| 289 | |
| 290 // Returns the total length of all the strings in the boxes. | |
| 291 /* static */ | |
| 292 int BoxChar::TotalByteLength(const std::vector<BoxChar *> &boxes) { | |
| 293 int total_length = 0; | |
| 294 for (auto boxe : boxes) { | |
| 295 total_length += boxe->ch_.size(); | |
| 296 } | |
| 297 return total_length; | |
| 298 } | |
| 299 | |
| 300 // Rotate the boxes in [start_box, end_box) by the given rotation. | |
| 301 // The rotation is in radians clockwise about the given center. | |
| 302 /* static */ | |
| 303 void BoxChar::RotateBoxes(float rotation, int xcenter, int ycenter, int start_box, int end_box, | |
| 304 std::vector<BoxChar *> *boxes) { | |
| 305 Boxa *orig = boxaCreate(0); | |
| 306 for (int i = start_box; i < end_box; ++i) { | |
| 307 Box *box = (*boxes)[i]->box_; | |
| 308 if (box) { | |
| 309 boxaAddBox(orig, box, L_CLONE); | |
| 310 } | |
| 311 } | |
| 312 Boxa *rotated = boxaRotate(orig, xcenter, ycenter, rotation); | |
| 313 boxaDestroy(&orig); | |
| 314 for (int i = start_box, box_ind = 0; i < end_box; ++i) { | |
| 315 if ((*boxes)[i]->box_) { | |
| 316 boxDestroy(&((*boxes)[i]->box_)); | |
| 317 (*boxes)[i]->box_ = boxaGetBox(rotated, box_ind++, L_CLONE); | |
| 318 } | |
| 319 } | |
| 320 boxaDestroy(&rotated); | |
| 321 } | |
| 322 | |
// Size in bytes of the stack buffer used by GetTesseractBoxStr while
// formatting box-file lines.
constexpr int kMaxLineLength = 1024;
| 324 /* static */ | |
| 325 void BoxChar::WriteTesseractBoxFile(const std::string &filename, int height, | |
| 326 const std::vector<BoxChar *> &boxes) { | |
| 327 std::string output = GetTesseractBoxStr(height, boxes); | |
| 328 File::WriteStringToFileOrDie(output, filename); | |
| 329 } | |
| 330 | |
| 331 /* static */ | |
| 332 std::string BoxChar::GetTesseractBoxStr(int height, const std::vector<BoxChar *> &boxes) { | |
| 333 std::string output; | |
| 334 char buffer[kMaxLineLength]; | |
| 335 for (auto boxe : boxes) { | |
| 336 const Box *box = boxe->box_; | |
| 337 if (box == nullptr) { | |
| 338 tprintf("Error: Call PrepareToWrite before WriteTesseractBoxFile!!\n"); | |
| 339 return ""; | |
| 340 } | |
| 341 int nbytes = snprintf(buffer, kMaxLineLength, "%s %d %d %d %d %d\n", boxe->ch_.c_str(), box->x, | |
| 342 height - box->y - box->h, box->x + box->w, height - box->y, boxe->page_); | |
| 343 output.append(buffer, nbytes); | |
| 344 } | |
| 345 return output; | |
| 346 } | |
| 347 | |
| 348 } // namespace tesseract |
