comparison mupdf-source/thirdparty/tesseract/src/api/pagerenderer.cpp @ 2:b50eed0cc0ef upstream

ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4. The directory name has changed: no version number in the expanded directory now.
author Franz Glasner <fzglas.hg@dom66.de>
date Mon, 15 Sep 2025 11:43:07 +0200
parents
children
comparison
equal deleted inserted replaced
1:1d09e1dec1d9 2:b50eed0cc0ef
1 // File: pagerenderer.cpp
2 // Description: PAGE XML rendering interface
3 // Author: Jan Kamlah
4
5 // (C) Copyright 2024
6 // Licensed under the Apache License, Version 2.0 (the "License");
7 // you may not use this file except in compliance with the License.
8 // You may obtain a copy of the License at
9 // http://www.apache.org/licenses/LICENSE-2.0
10 // Unless required by applicable law or agreed to in writing, software
11 // distributed under the License is distributed on an "AS IS" BASIS,
12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 // See the License for the specific language governing permissions and
14 // limitations under the License.
15
16 #include "errcode.h" // for ASSERT_HOST
17 #include "helpers.h" // for copy_string
18 #include "tprintf.h" // for tprintf
19
20 #include <tesseract/baseapi.h>
21 #include <tesseract/renderer.h>
22
23 #include <ctime>
24 #include <iomanip>
25 #include <memory>
26 #include <regex>
27 #include <sstream> // for std::stringstream
28 #include <unordered_set>
29
30 #include <allheaders.h>
31 #if (LIBLEPT_MAJOR_VERSION == 1 && LIBLEPT_MINOR_VERSION >= 83) || \
32 LIBLEPT_MAJOR_VERSION > 1
33 # include <array_internal.h>
34 # include <pix_internal.h>
35 #endif
36
37 namespace tesseract {
38
///
/// Compute the line through two points in slope/intercept form.
///
/// Writes the slope of the segment (x0, y0)-(x1, y1) to *m and the
/// y-intercept of that line to *b.  A vertical segment gets no special
/// treatment: when x0 == x1 the division produces an infinite or NaN slope.
///
static void GetSlopeAndOffset(float x0, float y0, float x1, float y1, float *m,
                              float *b) {
  const float rise = y1 - y0;
  const float run = x1 - x0;
  const float gradient = rise / run;

  *m = gradient;
  *b = y0 - gradient * x0;
}
50
51 ///
52 /// Write coordinates in the form of a points to a stream
53 ///
54 static void AddPointsToPAGE(Pta *pts, std::stringstream &str) {
55 int num_pts;
56
57 str << "<Coords points=\"";
58 num_pts = ptaGetCount(pts);
59 for (int p = 0; p < num_pts; ++p) {
60 int x, y;
61 ptaGetIPt(pts, p, &x, &y);
62 if (p != 0) {
63 str << " ";
64 }
65 str << std::to_string(x) << "," << std::to_string(y);
66 }
67 str << "\"/>\n";
68 }
69
70 ///
71 /// Convert bbox information to top and bottom polygon
72 ///
73 static void AddPointToWordPolygon(
74 const ResultIterator *res_it, PageIteratorLevel level, Pta *word_top_pts,
75 Pta *word_bottom_pts, tesseract::WritingDirection writing_direction) {
76 int left, top, right, bottom;
77
78 res_it->BoundingBox(level, &left, &top, &right, &bottom);
79
80 if (writing_direction != WRITING_DIRECTION_TOP_TO_BOTTOM) {
81 ptaAddPt(word_top_pts, left, top);
82 ptaAddPt(word_top_pts, right, top);
83
84 ptaAddPt(word_bottom_pts, left, bottom);
85 ptaAddPt(word_bottom_pts, right, bottom);
86
87 } else {
88 // Transform from ttb to ltr
89 ptaAddPt(word_top_pts, top, right);
90 ptaAddPt(word_top_pts, bottom, right);
91
92 ptaAddPt(word_bottom_pts, top, left);
93 ptaAddPt(word_bottom_pts, bottom, left);
94 }
95 }
96
97 ///
98 /// Transpose polygonline, destroy old and return new pts
99 ///
100 Pta *TransposePolygonline(Pta *pts) {
101 Pta *pts_transposed;
102
103 pts_transposed = ptaTranspose(pts);
104 ptaDestroy(&pts);
105 return pts_transposed;
106 }
107
108 ///
109 /// Reverse polygonline, destroy old and return new pts
110 ///
111 Pta *ReversePolygonline(Pta *pts, int type) {
112 Pta *pts_reversed;
113
114 pts_reversed = ptaReverse(pts, type);
115 ptaDestroy(&pts);
116 return pts_reversed;
117 }
118
119 ///
120 /// Destroy old and create new pts
121 ///
122 Pta *DestroyAndCreatePta(Pta *pts) {
123 ptaDestroy(&pts);
124 return ptaCreate(0);
125 }
126
127 ///
128 /// Recalculate linepolygon
129 /// Create a hull for overlapping areas
130 ///
131 Pta *RecalcPolygonline(Pta *pts, bool upper) {
132 int num_pts, num_bin, index = 0;
133 int y, x0, y0, x1, y1;
134 float x_min, y_min, x_max, y_max;
135 NUMA *bin_line;
136 Pta *pts_recalc;
137
138 ptaGetMinMax(pts, &x_min, &y_min, &x_max, &y_max);
139 num_bin = x_max - x_min;
140 bin_line = numaCreate(num_bin + 1);
141
142 for (int p = 0; p <= num_bin; ++p) {
143 bin_line->array[p] = -1.;
144 }
145
146 num_pts = ptaGetCount(pts);
147
148 if (num_pts == 2) {
149 pts_recalc = ptaCopy(pts);
150 ptaDestroy(&pts);
151 return pts_recalc;
152 }
153
154 do {
155 ptaGetIPt(pts, index, &x0, &y0);
156 ptaGetIPt(pts, index + 1, &x1, &y1);
157 for (int p = x0 - x_min; p <= x1 - x_min; ++p) {
158 if (!upper) {
159 if (bin_line->array[p] == -1. || y0 > bin_line->array[p]) {
160 bin_line->array[p] = y0;
161 }
162 } else {
163 if (bin_line->array[p] == -1. || y0 < bin_line->array[p]) {
164 bin_line->array[p] = y0;
165 }
166 }
167 }
168 index += 2;
169 } while (index < num_pts - 1);
170
171 pts_recalc = ptaCreate(0);
172
173 for (int p = 0; p <= num_bin; ++p) {
174 if (p == 0) {
175 y = bin_line->array[p];
176 ptaAddPt(pts_recalc, x_min + p, y);
177 } else if (p == num_bin) {
178 ptaAddPt(pts_recalc, x_min + p, y);
179 break;
180 } else if (y != bin_line->array[p]) {
181 if (y != -1.) {
182 ptaAddPt(pts_recalc, x_min + p, y);
183 }
184 y = bin_line->array[p];
185 if (y != -1.) {
186 ptaAddPt(pts_recalc, x_min + p, y);
187 }
188 }
189 }
190
191 ptaDestroy(&pts);
192 return pts_recalc;
193 }
194
195 ///
196 /// Create a rectangle hull around a single line
197 ///
198 Pta *PolygonToBoxCoords(Pta *pts) {
199 Pta *pts_box;
200 float x_min, y_min, x_max, y_max;
201
202 pts_box = ptaCreate(0);
203 ptaGetMinMax(pts, &x_min, &y_min, &x_max, &y_max);
204 ptaAddPt(pts_box, x_min, y_min);
205 ptaAddPt(pts_box, x_max, y_min);
206 ptaAddPt(pts_box, x_max, y_max);
207 ptaAddPt(pts_box, x_min, y_max);
208 ptaDestroy(&pts);
209 return pts_box;
210 }
211
212 ///
213 /// Create a rectangle polygon round the existing multiple lines
214 ///
215 static void UpdateBlockPoints(Pta *block_top_pts, Pta *block_bottom_pts,
216 Pta *line_top_pts, Pta *line_bottom_pts, int lcnt,
217 int last_word_in_cblock) {
218 int num_pts;
219 int x, y;
220
221 // Create a hull around all lines
222 if (lcnt == 0 && last_word_in_cblock) {
223 ptaJoin(block_top_pts, line_top_pts, 0, -1);
224 ptaJoin(block_bottom_pts, line_bottom_pts, 0, -1);
225 } else if (lcnt == 0) {
226 ptaJoin(block_top_pts, line_top_pts, 0, -1);
227 num_pts = ptaGetCount(line_bottom_pts);
228 ptaGetIPt(line_bottom_pts, num_pts - 1, &x, &y);
229 ptaAddPt(block_top_pts, x, y);
230 ptaGetIPt(line_bottom_pts, 0, &x, &y);
231 ptaAddPt(block_bottom_pts, x, y);
232 } else if (last_word_in_cblock) {
233 ptaGetIPt(line_top_pts, 0, &x, &y);
234 ptaAddPt(block_bottom_pts, x, y);
235 ptaJoin(block_bottom_pts, line_bottom_pts, 0, -1);
236 num_pts = ptaGetCount(line_top_pts);
237 ptaGetIPt(line_top_pts, num_pts - 1, &x, &y);
238 ptaAddPt(block_top_pts, x, y);
239 } else {
240 ptaGetIPt(line_top_pts, 0, &x, &y);
241 ptaAddPt(block_bottom_pts, x, y);
242 ptaGetIPt(line_bottom_pts, 0, &x, &y);
243 ptaAddPt(block_bottom_pts, x, y);
244 num_pts = ptaGetCount(line_top_pts);
245 ptaGetIPt(line_top_pts, num_pts - 1, &x, &y);
246 ptaAddPt(block_top_pts, x, y);
247 num_pts = ptaGetCount(line_bottom_pts);
248 ptaGetIPt(line_bottom_pts, num_pts - 1, &x, &y);
249 ptaAddPt(block_top_pts, x, y);
250 };
251 }
252
253 ///
254 /// Simplify polygonlines (only expanding not shrinking) (Due to recalculation
255 /// currently not necessary)
256 ///
257 static void SimplifyLinePolygon(Pta *polyline, int tolerance, bool upper) {
258 int x0, y0, x1, y1, x2, y2, x3, y3, index = 1;
259 float m, b, y_min, y_max;
260
261 while (index <= polyline->n - 2) {
262 ptaGetIPt(polyline, index - 1, &x0, &y0);
263 ptaGetIPt(polyline, index, &x1, &y1);
264 ptaGetIPt(polyline, index + 1, &x2, &y2);
265 if (index + 2 < polyline->n) {
266 // Delete two point indentations
267 ptaGetIPt(polyline, index + 2, &x3, &y3);
268 if (abs(x3 - x0) <= tolerance * 2) {
269 GetSlopeAndOffset(x0, y0, x3, y3, &m, &b);
270
271 if (upper && (m * x1 + b) < y1 && (m * x2 + b) < y2) {
272 ptaRemovePt(polyline, index + 1);
273 ptaRemovePt(polyline, index);
274 continue;
275 } else if (!upper && (m * x1 + b) > y1 && (m * x2 + b) > y2) {
276 ptaRemovePt(polyline, index + 1);
277 ptaRemovePt(polyline, index);
278 continue;
279 }
280 }
281 }
282 // Delete one point indentations
283 if (abs(y0 - y1) <= tolerance && abs(y1 - y2) <= tolerance) {
284 GetSlopeAndOffset(x0, y0, x2, y2, &m, &b);
285 if (upper && (m * x1 + b) <= y1) {
286 ptaRemovePt(polyline, index);
287 continue;
288 } else if (!upper && (m * x1 + b) >= y1) {
289 ptaRemovePt(polyline, index);
290 continue;
291 }
292 }
293 // Delete near by points
294 if (x1 != x0 && abs(y1 - y0) < 4 && abs(x1 - x0) <= tolerance) {
295 if (upper) {
296 y_min = std::min(y0, y1);
297 GetSlopeAndOffset(x0, y_min, x2, y2, &m, &b);
298 if ((m * x1 + b) <= y1) {
299 polyline->y[index - 1] = std::min(y0, y1);
300 ptaRemovePt(polyline, index);
301 continue;
302 }
303 } else {
304 y_max = std::max(y0, y1);
305 GetSlopeAndOffset(x0, y_max, x2, y2, &m, &b);
306 if ((m * x1 + b) >= y1) {
307 polyline->y[index - 1] = y_max;
308 ptaRemovePt(polyline, index);
309 continue;
310 }
311 }
312 }
313 index++;
314 }
315 }
316
317 ///
318 /// Directly write bounding box information as coordinates a stream
319 ///
320 static void AddBoxToPAGE(const ResultIterator *it, PageIteratorLevel level,
321 std::stringstream &page_str) {
322 int left, top, right, bottom;
323
324 it->BoundingBox(level, &left, &top, &right, &bottom);
325 page_str << "<Coords points=\"" << left << "," << top << " " << right << ","
326 << top << " " << right << "," << bottom << " " << left << ","
327 << bottom << "\"/>\n";
328 }
329
330 ///
331 /// Join ltr and rtl polygon information
332 ///
333 static void AppendLinePolygon(Pta *pts_ltr, Pta *pts_rtl, Pta *ptss,
334 tesseract::WritingDirection writing_direction) {
335 // If writing direction is NOT right-to-left, handle the left-to-right case.
336 if (writing_direction != WRITING_DIRECTION_RIGHT_TO_LEFT) {
337 if (ptaGetCount(pts_rtl) != 0) {
338 ptaJoin(pts_ltr, pts_rtl, 0, -1);
339 DestroyAndCreatePta(pts_rtl);
340 }
341 ptaJoin(pts_ltr, ptss, 0, -1);
342 } else {
343 // For right-to-left, work with a copy of ptss initially.
344 PTA *ptsd = ptaCopy(ptss);
345 if (ptaGetCount(pts_rtl) != 0) {
346 ptaJoin(ptsd, pts_rtl, 0, -1);
347 }
348 ptaDestroy(&pts_rtl);
349 ptaCopy(ptsd);
350 }
351 }
352
353 ///
354 /// Convert baseline to points and add to polygon
355 ///
356 static void AddBaselineToPTA(const ResultIterator *it, PageIteratorLevel level,
357 Pta *baseline_pts) {
358 int x1, y1, x2, y2;
359
360 it->Baseline(level, &x1, &y1, &x2, &y2);
361 ptaAddPt(baseline_pts, x1, y1);
362 ptaAddPt(baseline_pts, x2, y2);
363 }
364
365 ///
366 /// Directly write baseline information as baseline points a stream
367 ///
368 static void AddBaselinePtsToPAGE(Pta *baseline_pts, std::stringstream &str) {
369 int x, y, num_pts = baseline_pts->n;
370
371 str << "<Baseline points=\"";
372 for (int p = 0; p < num_pts; ++p) {
373 ptaGetIPt(baseline_pts, p, &x, &y);
374 if (p != 0) {
375 str << " ";
376 }
377 str << std::to_string(x) << "," << std::to_string(y);
378 }
379 str << "\"/>\n";
380 }
381
382 ///
383 /// Sort baseline points ascending and deleting duplicates
384 ///
385 Pta *SortBaseline(Pta *baseline_pts,
386 tesseract::WritingDirection writing_direction) {
387 int num_pts, index = 0;
388 float x0, y0, x1, y1;
389 Pta *sorted_baseline_pts;
390
391 sorted_baseline_pts =
392 ptaSort(baseline_pts, L_SORT_BY_X, L_SORT_INCREASING, nullptr);
393
394 do {
395 ptaGetPt(sorted_baseline_pts, index, &x0, &y0);
396 ptaGetPt(sorted_baseline_pts, index + 1, &x1, &y1);
397 if (x0 >= x1) {
398 sorted_baseline_pts->y[index] = std::min(y0, y1);
399 ptaRemovePt(sorted_baseline_pts, index + 1);
400 } else {
401 index++;
402 }
403 num_pts = ptaGetCount(sorted_baseline_pts);
404 } while (index < num_pts - 1);
405
406 ptaDestroy(&baseline_pts);
407 return sorted_baseline_pts;
408 }
409
410 ///
411 /// Clip baseline to range of the exsitings polygon and simplifies the baseline
412 /// linepolygon
413 ///
414 Pta *ClipAndSimplifyBaseline(Pta *bottom_pts, Pta *baseline_pts,
415 tesseract::WritingDirection writing_direction) {
416 int num_pts;
417 float m, b, x0, y0, x1, y1;
418 float x_min, y_min, x_max, y_max;
419 Pta *baseline_clipped_pts;
420
421 ptaGetMinMax(bottom_pts, &x_min, &y_min, &x_max, &y_max);
422 num_pts = ptaGetCount(baseline_pts);
423 baseline_clipped_pts = ptaCreate(0);
424
425 // Clip Baseline
426 for (int p = 0; p < num_pts; ++p) {
427 ptaGetPt(baseline_pts, p, &x0, &y0);
428 if (x0 < x_min) {
429 if (p + 1 < num_pts) {
430 ptaGetPt(baseline_pts, p + 1, &x1, &y1);
431 if (x1 < x_min) {
432 continue;
433 } else {
434 GetSlopeAndOffset(x0, y0, x1, y1, &m, &b);
435 y0 = int(x_min * m + b);
436 x0 = x_min;
437 }
438 }
439 } else if (x0 > x_max) {
440 if (ptaGetCount(baseline_clipped_pts) > 0 && p > 0) {
441 ptaGetPt(baseline_pts, p - 1, &x1, &y1);
442 // See comment above
443 GetSlopeAndOffset(x1, y1, x0, y0, &m, &b);
444 y0 = int(x_max * m + b);
445 x0 = x_max;
446 ptaAddPt(baseline_clipped_pts, x0, y0);
447 break;
448 }
449 }
450 ptaAddPt(baseline_clipped_pts, x0, y0);
451 }
452 if (writing_direction == WRITING_DIRECTION_TOP_TO_BOTTOM) {
453 SimplifyLinePolygon(baseline_clipped_pts, 3, 0);
454 } else {
455 SimplifyLinePolygon(baseline_clipped_pts, 3, 1);
456 }
457 SimplifyLinePolygon(
458 baseline_clipped_pts, 3,
459 writing_direction == WRITING_DIRECTION_TOP_TO_BOTTOM ? 0 : 1);
460
461 // Check the number of points in baseline_clipped_pts after processing
462 int clipped_pts_count = ptaGetCount(baseline_clipped_pts);
463
464 if (clipped_pts_count < 2) {
465 // If there's only one point in baseline_clipped_pts, duplicate it
466 ptaDestroy(&baseline_clipped_pts); // Clean up the created but unused Pta
467 baseline_clipped_pts = ptaCreate(0);
468 ptaAddPt(baseline_clipped_pts, x_min, y_min);
469 ptaAddPt(baseline_clipped_pts, x_max, y_min);
470 }
471
472 return baseline_clipped_pts;
473 }
474
475 ///
476 /// Fit the baseline points into the existings polygon
477 ///
478 Pta *FitBaselineIntoLinePolygon(Pta *bottom_pts, Pta *baseline_pts,
479 tesseract::WritingDirection writing_direction) {
480 int num_pts, num_bin, x0, y0, x1, y1;
481 float m, b;
482 float x_min, y_min, x_max, y_max;
483 float delta_median, delta_median_Q1, delta_median_Q3;
484 NUMA *bin_line, *poly_bl_delta;
485 Pta *baseline_recalc_pts, *baseline_clipped_pts;
486
487 ptaGetMinMax(bottom_pts, &x_min, &y_min, &x_max, &y_max);
488 num_bin = x_max - x_min;
489 bin_line = numaCreate(num_bin + 1);
490
491 for (int p = 0; p < num_bin + 1; ++p) {
492 bin_line->array[p] = -1.;
493 }
494
495 num_pts = ptaGetCount(bottom_pts);
496 // Create an interpolated polygon with stepsize 1.
497 for (int index = 0; index < num_pts - 1; ++index) {
498 ptaGetIPt(bottom_pts, index, &x0, &y0);
499 ptaGetIPt(bottom_pts, index + 1, &x1, &y1);
500 if (x0 >= x1) {
501 continue;
502 }
503 if (y0 == y1) {
504 for (int p = x0 - x_min; p < x1 - x_min + 1; ++p) {
505 if (bin_line->array[p] == -1. || y0 > bin_line->array[p]) {
506 bin_line->array[p] = y0;
507 }
508 }
509 } else {
510 GetSlopeAndOffset(x0, y0, x1, y1, &m, &b);
511 for (int p = x0 - x_min; p < x1 - x_min + 1; ++p) {
512 if (bin_line->array[p] == -1. ||
513 ((p + x_min) * m + b) > bin_line->array[p]) {
514 bin_line->array[p] = ((p + x_min) * m + b);
515 }
516 }
517 }
518 }
519
520 num_pts = ptaGetCount(baseline_pts);
521 baseline_clipped_pts = ptaCreate(0);
522 poly_bl_delta = numaCreate(0);
523
524 // Clip Baseline and create a set of deltas between baseline and polygon
525 for (int p = 0; p < num_pts; ++p) {
526 ptaGetIPt(baseline_pts, p, &x0, &y0);
527
528 if (x0 < x_min) {
529 ptaGetIPt(baseline_pts, p + 1, &x1, &y1);
530 if (x1 < x_min) {
531 continue;
532 } else {
533 GetSlopeAndOffset(x0, y0, x1, y1, &m, &b);
534 y0 = int(x_min * m + b);
535 x0 = x_min;
536 }
537 } else if (x0 > x_max) {
538 if (ptaGetCount(baseline_clipped_pts) > 0) {
539 ptaGetIPt(baseline_pts, p - 1, &x1, &y1);
540 GetSlopeAndOffset(x1, y1, x0, y0, &m, &b);
541 y0 = int(x_max * m + b);
542 x0 = x_max;
543 int x_val = x0 - x_min;
544 numaAddNumber(poly_bl_delta, abs(bin_line->array[x_val] - y0));
545 ptaAddPt(baseline_clipped_pts, x0, y0);
546 break;
547 }
548 }
549 int x_val = x0 - x_min;
550 numaAddNumber(poly_bl_delta, abs(bin_line->array[x_val] - y0));
551 ptaAddPt(baseline_clipped_pts, x0, y0);
552 }
553
554 ptaDestroy(&baseline_pts);
555
556 // Calculate quartiles to find outliers
557 numaGetMedian(poly_bl_delta, &delta_median);
558 numaGetRankValue(poly_bl_delta, 0.25, nullptr, 0, &delta_median_Q1);
559 numaGetRankValue(poly_bl_delta, 0.75, nullptr, 0, &delta_median_Q3);
560
561 // Fit baseline into the polygon
562 // Todo: Needs maybe some adjustments to suppress fitting to superscript
563 // glyphs
564 baseline_recalc_pts = ptaCreate(0);
565 num_pts = ptaGetCount(baseline_clipped_pts);
566 for (int p = 0; p < num_pts; ++p) {
567 ptaGetIPt(baseline_clipped_pts, p, &x0, &y0);
568 int x_val = x0 - x_min;
569 // Delete outliers with IQR
570 if (abs(y0 - bin_line->array[x_val]) >
571 1.5 * delta_median_Q3 + delta_median &&
572 p != 0 && p != num_pts - 1) {
573 continue;
574 }
575 if (writing_direction == WRITING_DIRECTION_TOP_TO_BOTTOM) {
576 if (y0 < bin_line->array[x_val]) {
577 ptaAddPt(baseline_recalc_pts, x0, bin_line->array[x_val]);
578 } else {
579 ptaAddPt(baseline_recalc_pts, x0, y0);
580 }
581 } else {
582 if (y0 > bin_line->array[x_val]) {
583 ptaAddPt(baseline_recalc_pts, x0, bin_line->array[x_val]);
584 } else {
585 ptaAddPt(baseline_recalc_pts, x0, y0);
586 }
587 }
588 }
589 // Return recalculated baseline if this fails return the bottom line as
590 // baseline
591 ptaDestroy(&baseline_clipped_pts);
592 if (ptaGetCount(baseline_recalc_pts) < 2) {
593 ptaDestroy(&baseline_recalc_pts);
594 return ptaCopy(bottom_pts);
595 } else {
596 return baseline_recalc_pts;
597 }
598 }
599
/// Convert a writing direction value to the string used in the PAGE output.
///
/// 0, 1 and 2 map to "left-to-right", "right-to-left" and "top-to-bottom";
/// every other value yields "bottom-to-top".  (Presumably the values are
/// those of tesseract::WritingDirection - confirm against the enum.)
const char *WritingDirectionToStr(int wd) {
  static const char *const kKnownDirections[] = {
      "left-to-right",
      "right-to-left",
      "top-to-bottom",
  };
  if (wd >= 0 && wd <= 2) {
    return kKnownDirections[wd];
  }
  return "bottom-to-top";
}
613 ///
614 /// Append the PAGE XML for the beginning of the document
615 ///
616 bool TessPAGERenderer::BeginDocumentHandler() {
617 // Delay the XML output because we need the name of the image file.
618 begin_document = true;
619 return true;
620 }
621
622 ///
623 /// Append the PAGE XML for the layout of the image
624 ///
625 bool TessPAGERenderer::AddImageHandler(TessBaseAPI *api) {
626 if (begin_document) {
627 AppendString(
628 "<?xml version=\"1.0\" encoding=\"UTF-8\" standalone=\"yes\"?>\n"
629 "<PcGts "
630 "xmlns=\"http://schema.primaresearch.org/PAGE/gts/pagecontent/"
631 "2019-07-15\" "
632 "xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\" "
633 "xsi:schemaLocation=\"http://schema.primaresearch.org/PAGE/gts/"
634 "pagecontent/2019-07-15 "
635 "http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15/"
636 "pagecontent.xsd\">\n"
637 "\t<Metadata");
638
639 // If a URL is used to recognize an image add it as <Metadata
640 // externalRef="url">
641 if (std::regex_search(api->GetInputName(),
642 std::regex("^(https?|ftp|ssh):"))) {
643 AppendString(" externalRef=\"");
644 AppendString(api->GetInputName());
645 AppendString("\" ");
646 }
647
648 AppendString(
649 ">\n"
650 "\t\t<Creator>Tesseract - ");
651 AppendString(TESSERACT_VERSION_STR);
652 // If gmtime conversion is problematic maybe l_getFormattedDate can be used
653 // here
654 // char *datestr = l_getFormattedDate();
655 std::time_t now = std::time(nullptr);
656 std::tm *now_tm = std::gmtime(&now);
657 char mbstr[100];
658 std::strftime(mbstr, sizeof(mbstr), "%Y-%m-%dT%H:%M:%S", now_tm);
659 AppendString(
660 "</Creator>\n"
661 "\t\t<Created>");
662 AppendString(mbstr);
663 AppendString("</Created>\n");
664 AppendString("\t\t<LastChange>");
665 AppendString(mbstr);
666 AppendString(
667 "</LastChange>\n"
668 "\t</Metadata>\n");
669 begin_document = false;
670 }
671
672 const std::unique_ptr<const char[]> text(api->GetPAGEText(imagenum()));
673 if (text == nullptr) {
674 return false;
675 }
676
677 AppendString(text.get());
678
679 return true;
680 }
681
682 ///
683 /// Append the PAGE XML for the end of the document
684 ///
685 bool TessPAGERenderer::EndDocumentHandler() {
686 AppendString("\t\t</Page>\n</PcGts>\n");
687 return true;
688 }
689
690 TessPAGERenderer::TessPAGERenderer(const char *outputbase)
691 : TessResultRenderer(outputbase, "page.xml"), begin_document(false) {}
692
693 ///
694 /// Make an XML-formatted string with PAGE markup from the internal
695 /// data structures.
696 ///
697 char *TessBaseAPI::GetPAGEText(int page_number) {
698 return GetPAGEText(nullptr, page_number);
699 }
700
701 ///
702 /// Make an XML-formatted string with PAGE markup from the internal
703 /// data structures.
704 ///
705 char *TessBaseAPI::GetPAGEText(ETEXT_DESC *monitor, int page_number) {
706 if (tesseract_ == nullptr ||
707 (page_res_ == nullptr && Recognize(monitor) < 0)) {
708 return nullptr;
709 }
710
711 int rcnt = 0, lcnt = 0, wcnt = 0;
712
713 if (input_file_.empty()) {
714 SetInputName(nullptr);
715 }
716
717 // Used variables
718
719 std::stringstream reading_order_str;
720 std::stringstream region_content;
721 std::stringstream line_content;
722 std::stringstream word_content;
723 std::stringstream line_str;
724 std::stringstream line_inter_str;
725 std::stringstream word_str;
726 std::stringstream page_str;
727
728 float x1, y1, x2, y2;
729
730 tesseract::Orientation orientation_block = ORIENTATION_PAGE_UP;
731 tesseract::WritingDirection writing_direction_block =
732 WRITING_DIRECTION_LEFT_TO_RIGHT;
733 tesseract::TextlineOrder textline_order_block;
734
735 Pta *block_top_pts = ptaCreate(0);
736 Pta *block_bottom_pts = ptaCreate(0);
737 Pta *line_top_ltr_pts = ptaCreate(0);
738 Pta *line_bottom_ltr_pts = ptaCreate(0);
739 Pta *line_top_rtl_pts = ptaCreate(0);
740 Pta *line_bottom_rtl_pts = ptaCreate(0);
741 Pta *word_top_pts = ptaCreate(0);
742 Pta *word_bottom_pts = ptaCreate(0);
743 Pta *word_baseline_pts = ptaCreate(0);
744 Pta *line_baseline_rtl_pts = ptaCreate(0);
745 Pta *line_baseline_ltr_pts = ptaCreate(0);
746 Pta *line_baseline_pts = ptaCreate(0);
747
748 bool POLYGONFLAG;
749 GetBoolVariable("page_xml_polygon", &POLYGONFLAG);
750 int LEVELFLAG;
751 GetIntVariable("page_xml_level", &LEVELFLAG);
752
753 if (LEVELFLAG != 0 && LEVELFLAG != 1) {
754 tprintf(
755 "For now, only line level and word level are available, and the level "
756 "is reset to line level.\n");
757 LEVELFLAG = 0;
758 }
759
760 // Use "C" locale (needed for int values larger than 999).
761 page_str.imbue(std::locale::classic());
762 reading_order_str << "\t<Page " << "imageFilename=\"" << GetInputName();
763 // AppendString(api->GetInputName());
764 reading_order_str << "\" " << "imageWidth=\"" << rect_width_ << "\" "
765 << "imageHeight=\"" << rect_height_ << "\">\n";
766 std::size_t ro_id = std::hash<std::string>{}(GetInputName());
767 reading_order_str << "\t\t<ReadingOrder>\n"
768 << "\t\t\t<OrderedGroup id=\"ro" << ro_id
769 << "\" caption=\"Regions reading order\">\n";
770
771 std::unique_ptr<ResultIterator> res_it(GetIterator());
772
773 float block_conf = 0;
774 float line_conf = 0;
775
776 while (!res_it->Empty(RIL_BLOCK)) {
777 if (res_it->Empty(RIL_WORD)) {
778 res_it->Next(RIL_WORD);
779 continue;
780 }
781
782 auto block_type = res_it->BlockType();
783
784 switch (block_type) {
785 case PT_FLOWING_IMAGE:
786 case PT_HEADING_IMAGE:
787 case PT_PULLOUT_IMAGE: {
788 // Handle all kinds of images.
789 page_str << "\t\t<GraphicRegion id=\"r" << rcnt++ << "\">\n";
790 page_str << "\t\t\t";
791 AddBoxToPAGE(res_it.get(), RIL_BLOCK, page_str);
792 page_str << "\t\t</GraphicRegion>\n";
793 res_it->Next(RIL_BLOCK);
794 continue;
795 }
796 case PT_HORZ_LINE:
797 case PT_VERT_LINE:
798 // Handle horizontal and vertical lines.
799 page_str << "\t\t<SeparatorRegion id=\"r" << rcnt++ << "\">\n";
800 page_str << "\t\t\t";
801 AddBoxToPAGE(res_it.get(), RIL_BLOCK, page_str);
802 page_str << "\t\t</SeparatorRegion>\n";
803 res_it->Next(RIL_BLOCK);
804 continue;
805 case PT_NOISE:
806 tprintf("TODO: Please report image which triggers the noise case.\n");
807 ASSERT_HOST(false);
808 default:
809 break;
810 }
811
812 if (res_it->IsAtBeginningOf(RIL_BLOCK)) {
813 // Add Block to reading order
814 reading_order_str << "\t\t\t\t<RegionRefIndexed " << "index=\"" << rcnt
815 << "\" " << "regionRef=\"r" << rcnt << "\"/>\n";
816
817 float deskew_angle;
818 res_it->Orientation(&orientation_block, &writing_direction_block,
819 &textline_order_block, &deskew_angle);
820 block_conf = ((res_it->Confidence(RIL_BLOCK)) / 100.);
821 page_str << "\t\t<TextRegion id=\"r" << rcnt << "\" " << "custom=\""
822 << "readingOrder {index:" << rcnt << ";} ";
823 if (writing_direction_block != WRITING_DIRECTION_LEFT_TO_RIGHT) {
824 page_str << "readingDirection {"
825 << WritingDirectionToStr(writing_direction_block) << ";} ";
826 }
827 page_str << "orientation {" << orientation_block << ";}\">\n";
828 page_str << "\t\t\t";
829 if ((!POLYGONFLAG || (orientation_block != ORIENTATION_PAGE_UP &&
830 orientation_block != ORIENTATION_PAGE_DOWN)) &&
831 LEVELFLAG == 0) {
832 AddBoxToPAGE(res_it.get(), RIL_BLOCK, page_str);
833 }
834 }
835
836 // Writing direction changes at a per-word granularity
837 // tesseract::WritingDirection writing_direction_before;
838 auto writing_direction = writing_direction_block;
839 if (writing_direction_block != WRITING_DIRECTION_TOP_TO_BOTTOM) {
840 switch (res_it->WordDirection()) {
841 case DIR_LEFT_TO_RIGHT:
842 writing_direction = WRITING_DIRECTION_LEFT_TO_RIGHT;
843 break;
844 case DIR_RIGHT_TO_LEFT:
845 writing_direction = WRITING_DIRECTION_RIGHT_TO_LEFT;
846 break;
847 default:
848 break;
849 }
850 }
851
852 bool ttb_flag = (writing_direction == WRITING_DIRECTION_TOP_TO_BOTTOM);
853 // TODO: Rework polygon handling if line is skewed (90 or 180 degrees),
854 // for now using LinePts
855 bool skewed_flag = (orientation_block != ORIENTATION_PAGE_UP &&
856 orientation_block != ORIENTATION_PAGE_DOWN);
857
858 if (res_it->IsAtBeginningOf(RIL_TEXTLINE)) {
859 // writing_direction_before = writing_direction;
860 line_conf = ((res_it->Confidence(RIL_TEXTLINE)) / 100.);
861 std::string textline = res_it->GetUTF8Text(RIL_TEXTLINE);
862 if (textline.back() == '\n') {
863 textline.erase(textline.length() - 1);
864 }
865 line_content << HOcrEscape(textline.c_str());
866 line_str << "\t\t\t<TextLine id=\"r" << rcnt << "l" << lcnt << "\" ";
867 if (writing_direction != WRITING_DIRECTION_LEFT_TO_RIGHT &&
868 writing_direction != writing_direction_block) {
869 line_str << "readingDirection=\""
870 << WritingDirectionToStr(writing_direction) << "\" ";
871 }
872 line_str << "custom=\"" << "readingOrder {index:" << lcnt << ";}\">\n";
873 // If level is linebased, get the line polygon and baseline
874 if (LEVELFLAG == 0 && (!POLYGONFLAG || skewed_flag)) {
875 AddPointToWordPolygon(res_it.get(), RIL_TEXTLINE, line_top_ltr_pts,
876 line_bottom_ltr_pts, writing_direction);
877 AddBaselineToPTA(res_it.get(), RIL_TEXTLINE, line_baseline_pts);
878 if (ttb_flag) {
879 line_baseline_pts = TransposePolygonline(line_baseline_pts);
880 }
881 }
882 }
883
884 // Get information if word is last in line and if its last in the region
885 bool last_word_in_line = res_it->IsAtFinalElement(RIL_TEXTLINE, RIL_WORD);
886 bool last_word_in_cblock = res_it->IsAtFinalElement(RIL_BLOCK, RIL_WORD);
887
888 float word_conf = ((res_it->Confidence(RIL_WORD)) / 100.);
889
890 // Create word stream if word level output is active
891 if (LEVELFLAG > 0) {
892 word_str << "\t\t\t\t<Word id=\"r" << rcnt << "l" << lcnt << "w" << wcnt
893 << "\" readingDirection=\""
894 << WritingDirectionToStr(writing_direction) << "\" "
895 << "custom=\"" << "readingOrder {index:" << wcnt << ";}\">\n";
896 if ((!POLYGONFLAG || skewed_flag) || ttb_flag) {
897 AddPointToWordPolygon(res_it.get(), RIL_WORD, word_top_pts, word_bottom_pts,
898 writing_direction);
899 }
900 }
901
902 if (POLYGONFLAG && !skewed_flag && ttb_flag && LEVELFLAG == 0) {
903 AddPointToWordPolygon(res_it.get(), RIL_WORD, word_top_pts, word_bottom_pts,
904 writing_direction);
905 }
906
907 // Get the word baseline information
908 AddBaselineToPTA(res_it.get(), RIL_WORD, word_baseline_pts);
909
910 // Get the word text content and polygon
911 do {
912 const std::unique_ptr<const char[]> grapheme(
913 res_it->GetUTF8Text(RIL_SYMBOL));
914 if (grapheme && grapheme[0] != 0) {
915 word_content << HOcrEscape(grapheme.get()).c_str();
916 if (POLYGONFLAG && !skewed_flag && !ttb_flag) {
917 AddPointToWordPolygon(res_it.get(), RIL_SYMBOL, word_top_pts,
918 word_bottom_pts, writing_direction);
919 }
920 }
921 res_it->Next(RIL_SYMBOL);
922 } while (!res_it->Empty(RIL_BLOCK) && !res_it->IsAtBeginningOf(RIL_WORD));
923
924 if (LEVELFLAG > 0 || (POLYGONFLAG && !skewed_flag)) {
925 // Sort wordpolygons
926 word_top_pts = RecalcPolygonline(word_top_pts, 1 - ttb_flag);
927 word_bottom_pts = RecalcPolygonline(word_bottom_pts, 0 + ttb_flag);
928
929 // AppendLinePolygon
930 AppendLinePolygon(line_top_ltr_pts, line_top_rtl_pts, word_top_pts,
931 writing_direction);
932 AppendLinePolygon(line_bottom_ltr_pts, line_bottom_rtl_pts,
933 word_bottom_pts, writing_direction);
934
935 // Word level polygon
936 word_bottom_pts = ReversePolygonline(word_bottom_pts, 1);
937 ptaJoin(word_top_pts, word_bottom_pts, 0, -1);
938 }
939
940 // Reverse the word baseline direction for rtl
941 if (writing_direction == WRITING_DIRECTION_RIGHT_TO_LEFT) {
942 word_baseline_pts = ReversePolygonline(word_baseline_pts, 1);
943 }
944
945 // Write word information to the output
946 if (LEVELFLAG > 0) {
947 word_str << "\t\t\t\t\t";
948 if (ttb_flag) {
949 word_top_pts = TransposePolygonline(word_top_pts);
950 }
951 AddPointsToPAGE(word_top_pts, word_str);
952 word_str << "\t\t\t\t\t";
953 AddBaselinePtsToPAGE(word_baseline_pts, word_str);
954 word_str << "\t\t\t\t\t<TextEquiv index=\"1\" conf=\""
955 << std::setprecision(4) << word_conf << "\">\n"
956 << "\t\t\t\t\t\t<Unicode>" << word_content.str()
957 << "</Unicode>\n"
958 << "\t\t\t\t\t</TextEquiv>\n"
959 << "\t\t\t\t</Word>\n";
960 }
961 if (LEVELFLAG > 0 || (POLYGONFLAG && !skewed_flag)) {
962 // Add wordbaseline to linebaseline
963 if (ttb_flag) {
964 word_baseline_pts = TransposePolygonline(word_baseline_pts);
965 }
966 ptaJoin(line_baseline_pts, word_baseline_pts, 0, -1);
967 }
968 word_baseline_pts = DestroyAndCreatePta(word_baseline_pts);
969
970 // Reset word pts arrays
971 word_top_pts = DestroyAndCreatePta(word_top_pts);
972 word_bottom_pts = DestroyAndCreatePta(word_bottom_pts);
973
974 // Check why this combination of words is not working as expected!
975 // Write the word contents to the line
976 #if 0
977 if (!last_word_in_line && writing_direction_before != writing_direction &&
978 writing_direction < 2 && writing_direction_before < 2 &&
979 res_it->WordDirection()) {
980 if (writing_direction_before == WRITING_DIRECTION_LEFT_TO_RIGHT) {
981 // line_content << "‏" << word_content.str();
982 } else {
983 // line_content << "‎" << word_content.str();
984 }
985 } else {
986 // line_content << word_content.str();
987 }
988 // Check if WordIsNeutral
989 if (res_it->WordDirection()) {
990 writing_direction_before = writing_direction;
991 }
992 #endif
993 word_content.str("");
994 wcnt++;
995
996 // Write line information to the output
997 if (last_word_in_line) {
998 // Combine ltr and rtl lines
999 if (ptaGetCount(line_top_rtl_pts) != 0) {
1000 ptaJoin(line_top_ltr_pts, line_top_rtl_pts, 0, -1);
1001 line_top_rtl_pts = DestroyAndCreatePta(line_top_rtl_pts);
1002 }
1003 if (ptaGetCount(line_bottom_rtl_pts) != 0) {
1004 ptaJoin(line_bottom_ltr_pts, line_bottom_rtl_pts, 0, -1);
1005 line_bottom_rtl_pts = DestroyAndCreatePta(line_bottom_rtl_pts);
1006 }
1007 if ((POLYGONFLAG && !skewed_flag) || LEVELFLAG > 0) {
1008 // Recalc Polygonlines
1009 line_top_ltr_pts = RecalcPolygonline(line_top_ltr_pts, 1 - ttb_flag);
1010 line_bottom_ltr_pts =
1011 RecalcPolygonline(line_bottom_ltr_pts, 0 + ttb_flag);
1012
1013 // Smooth the polygonline
1014 SimplifyLinePolygon(line_top_ltr_pts, 5, 1 - ttb_flag);
1015 SimplifyLinePolygon(line_bottom_ltr_pts, 5, 0 + ttb_flag);
1016
1017 // Fit linepolygon matching the baselinepoints
1018 line_baseline_pts = SortBaseline(line_baseline_pts, writing_direction);
1019 // Fitting the baseline into the polygon is currently deactivated:
1020 // it tends to push the baseline directly under superscripts,
1021 // but the baseline is always inside the polygon, so maybe it will be
1022 // useful for something:
1023 //   line_baseline_pts = FitBaselineIntoLinePolygon(
1024 //       line_bottom_ltr_pts, line_baseline_pts, writing_direction);
1025 // Instead, it is only cut to length and the line polygon is simplified.
1026 line_baseline_pts = ClipAndSimplifyBaseline(
1027 line_bottom_ltr_pts, line_baseline_pts, writing_direction);
1028
1029 // Update polygon of the block
1030 UpdateBlockPoints(block_top_pts, block_bottom_pts, line_top_ltr_pts,
1031 line_bottom_ltr_pts, lcnt, last_word_in_cblock);
1032 }
1033 // Line level polygon
1034 line_bottom_ltr_pts = ReversePolygonline(line_bottom_ltr_pts, 1);
1035 ptaJoin(line_top_ltr_pts, line_bottom_ltr_pts, 0, -1);
1036 line_bottom_ltr_pts = DestroyAndCreatePta(line_bottom_ltr_pts);
1037
1038 if (LEVELFLAG > 0 && !(POLYGONFLAG && !skewed_flag)) {
1039 line_top_ltr_pts = PolygonToBoxCoords(line_top_ltr_pts);
1040 }
1041
1042 // Write level points
1043 line_str << "\t\t\t\t";
1044 if (ttb_flag) {
1045 line_top_ltr_pts = TransposePolygonline(line_top_ltr_pts);
1046 }
1047 AddPointsToPAGE(line_top_ltr_pts, line_str);
1048 line_top_ltr_pts = DestroyAndCreatePta(line_top_ltr_pts);
1049
1050 // Write Baseline
1051 line_str << "\t\t\t\t";
1052 if (ttb_flag) {
1053 line_baseline_pts = TransposePolygonline(line_baseline_pts);
1054 }
1055 AddBaselinePtsToPAGE(line_baseline_pts, line_str);
1056 line_baseline_pts = DestroyAndCreatePta(line_baseline_pts);
1057
1058 // Add word information if word level output is active
1059 line_str << word_str.str();
1060 word_str.str("");
1061 // Write Line TextEquiv
1062 line_str << "\t\t\t\t<TextEquiv index=\"1\" conf=\""
1063 << std::setprecision(4) << line_conf << "\">\n"
1064 << "\t\t\t\t\t<Unicode>" << line_content.str() << "</Unicode>\n"
1065 << "\t\t\t\t</TextEquiv>\n";
1066 line_str << "\t\t\t</TextLine>\n";
1067 region_content << line_content.str();
1068 line_content.str("");
1069 if (!last_word_in_cblock) {
1070 region_content << '\n';
1071 }
1072 lcnt++;
1073 wcnt = 0;
1074 }
1075
1076 // Write region information to the output
1077 if (last_word_in_cblock) {
1078 if ((POLYGONFLAG && !skewed_flag) || LEVELFLAG > 0) {
1079 page_str << "<Coords points=\"";
1080 block_bottom_pts = ReversePolygonline(block_bottom_pts, 1);
1081 ptaJoin(block_top_pts, block_bottom_pts, 0, -1);
1082 if (ttb_flag) {
1083 block_top_pts = TransposePolygonline(block_top_pts);
1084 }
1085 ptaGetMinMax(block_top_pts, &x1, &y1, &x2, &y2);
1086 page_str << (l_uint32)x1 << "," << (l_uint32)y1;
1087 page_str << " " << (l_uint32)x2 << "," << (l_uint32)y1;
1088 page_str << " " << (l_uint32)x2 << "," << (l_uint32)y2;
1089 page_str << " " << (l_uint32)x1 << "," << (l_uint32)y2;
1090 page_str << "\"/>\n";
1091 block_top_pts = DestroyAndCreatePta(block_top_pts);
1092 block_bottom_pts = DestroyAndCreatePta(block_bottom_pts);
1093 }
1094 page_str << line_str.str();
1095 line_str.str("");
1096 page_str << "\t\t\t<TextEquiv index=\"1\" conf=\"" << std::setprecision(4)
1097 << block_conf << "\">\n"
1098 << "\t\t\t\t<Unicode>" << region_content.str() << "</Unicode>\n"
1099 << "\t\t\t</TextEquiv>\n";
1100 page_str << "\t\t</TextRegion>\n";
1101 region_content.str("");
1102 rcnt++;
1103 lcnt = 0;
1104 }
1105 }
1106
1107 // Destroy all point information
1108 ptaDestroy(&block_top_pts);
1109 ptaDestroy(&block_bottom_pts);
1110 ptaDestroy(&line_top_ltr_pts);
1111 ptaDestroy(&line_bottom_ltr_pts);
1112 ptaDestroy(&line_top_rtl_pts);
1113 ptaDestroy(&line_bottom_rtl_pts);
1114 ptaDestroy(&word_top_pts);
1115 ptaDestroy(&word_bottom_pts);
1116 ptaDestroy(&word_baseline_pts);
1117 ptaDestroy(&line_baseline_rtl_pts);
1118 ptaDestroy(&line_baseline_ltr_pts);
1119 ptaDestroy(&line_baseline_pts);
1120
1121 reading_order_str << "\t\t\t</OrderedGroup>\n"
1122 << "\t\t</ReadingOrder>\n";
1123
1124 reading_order_str << page_str.str();
1125 page_str.str("");
1126 const std::string &text = reading_order_str.str();
1127 reading_order_str.str("");
1128
1129 return copy_string(text);
1130 }
1131
1132 } // namespace tesseract