comparison mupdf-source/thirdparty/tesseract/src/training/pango/boxchar.cpp @ 2:b50eed0cc0ef upstream

ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4. The directory name has changed: no version number in the expanded directory now.
author Franz Glasner <fzglas.hg@dom66.de>
date Mon, 15 Sep 2025 11:43:07 +0200
parents
children
comparison
equal deleted inserted replaced
1:1d09e1dec1d9 2:b50eed0cc0ef
1 /**********************************************************************
2 * File: boxchar.cpp
3 * Description: Simple class to associate a Tesseract classification unit with
4 * its bounding box so that the boxes can be rotated as the image
5 * is rotated for degradation. Also includes routines to output
6 * the character-tagged boxes to a boxfile.
7 * Author: Ray Smith
8 *
9 * (C) Copyright 2013, Google Inc.
10 * Licensed under the Apache License, Version 2.0 (the "License");
11 * you may not use this file except in compliance with the License.
12 * You may obtain a copy of the License at
13 * http://www.apache.org/licenses/LICENSE-2.0
14 * Unless required by applicable law or agreed to in writing, software
15 * distributed under the License is distributed on an "AS IS" BASIS,
16 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17 * See the License for the specific language governing permissions and
18 * limitations under the License.
19 *
20 **********************************************************************/
21
22 #include "boxchar.h"
23
24 #include "fileio.h"
25 #include "normstrngs.h"
26 #include "tesserrstream.h" // for tesserr
27 #include "tprintf.h"
28 #include "unicharset.h"
29 #include "unicode/uchar.h" // from libicu
30
31 #include <algorithm>
32 #include <cstddef>
33 #include <vector>
34
// Minimum ratio of |dx|:|dy| (or |dy|:|dx|) between two consecutive boxes for
// the step to be counted when deciding whether the text runs vertically.
constexpr int kMinNewlineRatio = 5;
37
38 namespace tesseract {
39
40 BoxChar::BoxChar(const char *utf8_str, int len)
41 : ch_(utf8_str, len), box_(nullptr), page_(0), rtl_index_(-1) {}
42
43 BoxChar::~BoxChar() {
44 boxDestroy(&box_);
45 }
46
47 void BoxChar::AddBox(int x, int y, int width, int height) {
48 box_ = boxCreate(x, y, width, height);
49 }
50
51 // Increments *num_rtl and *num_ltr according to the directionality of
52 // characters in the box.
53 void BoxChar::GetDirection(int *num_rtl, int *num_ltr) const {
54 // Convert the unichar to UTF32 representation
55 std::vector<char32> uni_vector = UNICHAR::UTF8ToUTF32(ch_.c_str());
56 if (uni_vector.empty()) {
57 tprintf("Illegal utf8 in boxchar string:%s = ", ch_.c_str());
58 for (char c : ch_) {
59 tprintf(" 0x%x", c);
60 }
61 tprintf("\n");
62 return;
63 }
64 for (char32 ch : uni_vector) {
65 UCharDirection dir = u_charDirection(ch);
66 if (dir == U_RIGHT_TO_LEFT || dir == U_RIGHT_TO_LEFT_ARABIC || dir == U_RIGHT_TO_LEFT_ISOLATE) {
67 ++*num_rtl;
68 } else if ((dir == U_ARABIC_NUMBER) ||
69 (dir != U_DIR_NON_SPACING_MARK && dir != U_BOUNDARY_NEUTRAL)) {
70 ++*num_ltr;
71 }
72 }
73 }
74
75 // Reverses the order of unicodes within the box. If Pango generates a
76 // ligature, these will get reversed on output, so reverse now.
77 void BoxChar::ReverseUnicodesInBox() {
78 std::vector<char32> unicodes = UNICHAR::UTF8ToUTF32(ch_.c_str());
79 std::reverse(unicodes.begin(), unicodes.end());
80 ch_ = UNICHAR::UTF32ToUTF8(unicodes);
81 }
82
83 /* static */
84 void BoxChar::TranslateBoxes(int xshift, int yshift, std::vector<BoxChar *> *boxes) {
85 for (auto &boxe : *boxes) {
86 Box *box = boxe->box_;
87 if (box != nullptr) {
88 box->x += xshift;
89 box->y += yshift;
90 }
91 }
92 }
93
94 // Prepares for writing the boxes to a file by inserting newlines, spaces,
95 // and re-ordering so the boxes are strictly left-to-right.
96 /* static */
97 void BoxChar::PrepareToWrite(std::vector<BoxChar *> *boxes) {
98 bool rtl_rules = ContainsMostlyRTL(*boxes);
99 bool vertical_rules = MostlyVertical(*boxes);
100 InsertNewlines(rtl_rules, vertical_rules, boxes);
101 InsertSpaces(rtl_rules, vertical_rules, boxes);
102 for (size_t i = 0; i < boxes->size(); ++i) {
103 if ((*boxes)[i]->box_ == nullptr) {
104 tesserr << "Null box at index " << i << '\n';
105 }
106 }
107 if (rtl_rules) {
108 ReorderRTLText(boxes);
109 }
110 }
111
112 // Inserts newline (tab) characters into the vector at newline positions.
113 /* static */
114 void BoxChar::InsertNewlines(bool rtl_rules, bool vertical_rules, std::vector<BoxChar *> *boxes) {
115 size_t prev_i = SIZE_MAX;
116 int max_shift = 0;
117 for (size_t i = 0; i < boxes->size(); ++i) {
118 Box *box = (*boxes)[i]->box_;
119 if (box == nullptr) {
120 if (prev_i == SIZE_MAX || prev_i + 1 < i || i + 1 == boxes->size()) {
121 // Erase null boxes at the start of a line and after another null box.
122 do {
123 delete (*boxes)[i];
124 boxes->erase(boxes->begin() + i);
125 if (i == 0) {
126 break;
127 }
128 } while (i-- == boxes->size() && (*boxes)[i]->box_ == nullptr);
129 }
130 continue;
131 }
132 if (prev_i != SIZE_MAX) {
133 Box *prev_box = (*boxes)[prev_i]->box_;
134 int shift = box->x - prev_box->x;
135 if (vertical_rules) {
136 shift = box->y - prev_box->y;
137 } else if (rtl_rules) {
138 shift = -shift;
139 }
140 if (-shift > max_shift) {
141 // This is a newline. Since nothing cares about the size of the box,
142 // except the out-of-bounds checker, minimize the chance of creating
143 // a box outside the image by making the width and height 1.
144 int width = 1;
145 int height = 1;
146 int x = prev_box->x + prev_box->w;
147 int y = prev_box->y;
148 if (vertical_rules) {
149 x = prev_box->x;
150 y = prev_box->y + prev_box->h;
151 } else if (rtl_rules) {
152 x = prev_box->x - width;
153 if (x < 0) {
154 tprintf("prev x = %d, width=%d\n", prev_box->x, width);
155 x = 0;
156 }
157 }
158 if (prev_i + 1 == i) {
159 // New character needed.
160 auto *new_box = new BoxChar("\t", 1);
161 new_box->AddBox(x, y, width, height);
162 new_box->page_ = (*boxes)[i]->page_;
163 boxes->insert(boxes->begin() + i, new_box);
164 ++i;
165 } else {
166 (*boxes)[i - 1]->AddBox(x, y, width, height);
167 (*boxes)[i - 1]->ch_ = "\t";
168 }
169 max_shift = 0;
170 } else if (shift > max_shift) {
171 max_shift = shift;
172 }
173 }
174 prev_i = i;
175 }
176 }
177
178 // Converts nullptr boxes to space characters, with appropriate bounding boxes.
179 /* static */
180 void BoxChar::InsertSpaces(bool rtl_rules, bool vertical_rules, std::vector<BoxChar *> *boxes) {
181 // After InsertNewlines, any remaining null boxes are not newlines, and are
182 // singletons, so add a box to each remaining null box.
183 for (size_t i = 1; i + 1 < boxes->size(); ++i) {
184 Box *box = (*boxes)[i]->box_;
185 if (box == nullptr) {
186 Box *prev = (*boxes)[i - 1]->box_;
187 Box *next = (*boxes)[i + 1]->box_;
188 ASSERT_HOST(prev != nullptr && next != nullptr);
189 int top = std::min(prev->y, next->y);
190 int bottom = std::max(prev->y + prev->h, next->y + next->h);
191 int left = prev->x + prev->w;
192 int right = next->x;
193 if (vertical_rules) {
194 top = prev->y + prev->h;
195 bottom = next->y;
196 left = std::min(prev->x, next->x);
197 right = std::max(prev->x + prev->w, next->x + next->w);
198 } else if (rtl_rules) {
199 // With RTL we have to account for BiDi.
200 // Right becomes the min left of all prior boxes back to the first
201 // space or newline.
202 right = prev->x;
203 left = next->x + next->w;
204 for (int j = i - 2; j >= 0 && (*boxes)[j]->ch_ != " " && (*boxes)[j]->ch_ != "\t"; --j) {
205 prev = (*boxes)[j]->box_;
206 ASSERT_HOST(prev != nullptr);
207 if (prev->x < right) {
208 right = prev->x;
209 }
210 }
211 // Left becomes the max right of all next boxes forward to the first
212 // space or newline.
213 for (size_t j = i + 2;
214 j < boxes->size() && (*boxes)[j]->box_ != nullptr && (*boxes)[j]->ch_ != "\t"; ++j) {
215 next = (*boxes)[j]->box_;
216 if (next->x + next->w > left) {
217 left = next->x + next->w;
218 }
219 }
220 }
221 // Italic and stylized characters can produce negative spaces, which
222 // Leptonica doesn't like, so clip to a positive size.
223 if (right <= left) {
224 right = left + 1;
225 }
226 if (bottom <= top) {
227 bottom = top + 1;
228 }
229 (*boxes)[i]->AddBox(left, top, right - left, bottom - top);
230 (*boxes)[i]->ch_ = " ";
231 }
232 }
233 }
234
235 // Reorders text in a right-to-left script in left-to-right order.
236 /* static */
237 void BoxChar::ReorderRTLText(std::vector<BoxChar *> *boxes) {
238 // Ideally we need the inverse of the algorithm used by ResultIterator.
239 // For now, let's try a sort that reverses original positions for RTL
240 // characters, otherwise by x-position. This should be much closer to
241 // correct than just sorting by x-position.
242 size_t num_boxes = boxes->size();
243 for (size_t i = 0; i < num_boxes; ++i) {
244 int num_rtl = 0, num_ltr = 0;
245 (*boxes)[i]->GetDirection(&num_rtl, &num_ltr);
246 if (num_rtl > num_ltr) {
247 (*boxes)[i]->set_rtl_index(i);
248 (*boxes)[i]->ReverseUnicodesInBox();
249 }
250 }
251 BoxCharPtrSort sorter;
252 size_t end = 0;
253 for (size_t start = 0; start < boxes->size(); start = end + 1) {
254 end = start + 1;
255 while (end < boxes->size() && (*boxes)[end]->ch_ != "\t") {
256 ++end;
257 }
258 std::sort(boxes->begin() + start, boxes->begin() + end, sorter);
259 }
260 }
261
262 // Returns true if the vector contains mostly RTL characters.
263 /* static */
264 bool BoxChar::ContainsMostlyRTL(const std::vector<BoxChar *> &boxes) {
265 int num_rtl = 0, num_ltr = 0;
266 for (auto boxe : boxes) {
267 boxe->GetDirection(&num_rtl, &num_ltr);
268 }
269 return num_rtl > num_ltr;
270 }
271
272 // Returns true if the text is mostly laid out vertically.
273 /* static */
274 bool BoxChar::MostlyVertical(const std::vector<BoxChar *> &boxes) {
275 int64_t total_dx = 0, total_dy = 0;
276 for (size_t i = 1; i < boxes.size(); ++i) {
277 if (boxes[i - 1]->box_ != nullptr && boxes[i]->box_ != nullptr &&
278 boxes[i - 1]->page_ == boxes[i]->page_) {
279 int dx = boxes[i]->box_->x - boxes[i - 1]->box_->x;
280 int dy = boxes[i]->box_->y - boxes[i - 1]->box_->y;
281 if (abs(dx) > abs(dy) * kMinNewlineRatio || abs(dy) > abs(dx) * kMinNewlineRatio) {
282 total_dx += static_cast<int64_t>(dx) * dx;
283 total_dy += static_cast<int64_t>(dy) * dy;
284 }
285 }
286 }
287 return total_dy > total_dx;
288 }
289
290 // Returns the total length of all the strings in the boxes.
291 /* static */
292 int BoxChar::TotalByteLength(const std::vector<BoxChar *> &boxes) {
293 int total_length = 0;
294 for (auto boxe : boxes) {
295 total_length += boxe->ch_.size();
296 }
297 return total_length;
298 }
299
300 // Rotate the boxes in [start_box, end_box) by the given rotation.
301 // The rotation is in radians clockwise about the given center.
302 /* static */
303 void BoxChar::RotateBoxes(float rotation, int xcenter, int ycenter, int start_box, int end_box,
304 std::vector<BoxChar *> *boxes) {
305 Boxa *orig = boxaCreate(0);
306 for (int i = start_box; i < end_box; ++i) {
307 Box *box = (*boxes)[i]->box_;
308 if (box) {
309 boxaAddBox(orig, box, L_CLONE);
310 }
311 }
312 Boxa *rotated = boxaRotate(orig, xcenter, ycenter, rotation);
313 boxaDestroy(&orig);
314 for (int i = start_box, box_ind = 0; i < end_box; ++i) {
315 if ((*boxes)[i]->box_) {
316 boxDestroy(&((*boxes)[i]->box_));
317 (*boxes)[i]->box_ = boxaGetBox(rotated, box_ind++, L_CLONE);
318 }
319 }
320 boxaDestroy(&rotated);
321 }
322
// Size in bytes of the buffer used to format one line of box-file output.
constexpr int kMaxLineLength = 1024;
324 /* static */
325 void BoxChar::WriteTesseractBoxFile(const std::string &filename, int height,
326 const std::vector<BoxChar *> &boxes) {
327 std::string output = GetTesseractBoxStr(height, boxes);
328 File::WriteStringToFileOrDie(output, filename);
329 }
330
331 /* static */
332 std::string BoxChar::GetTesseractBoxStr(int height, const std::vector<BoxChar *> &boxes) {
333 std::string output;
334 char buffer[kMaxLineLength];
335 for (auto boxe : boxes) {
336 const Box *box = boxe->box_;
337 if (box == nullptr) {
338 tprintf("Error: Call PrepareToWrite before WriteTesseractBoxFile!!\n");
339 return "";
340 }
341 int nbytes = snprintf(buffer, kMaxLineLength, "%s %d %d %d %d %d\n", boxe->ch_.c_str(), box->x,
342 height - box->y - box->h, box->x + box->w, height - box->y, boxe->page_);
343 output.append(buffer, nbytes);
344 }
345 return output;
346 }
347
348 } // namespace tesseract