Mercurial > hgrepos > Python2 > PyMuPDF
annotate mupdf-source/thirdparty/tesseract/src/api/hocrrenderer.cpp @ 21:2f43e400f144
Provide an "all" target to build both the sdist and the wheel
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Fri, 19 Sep 2025 10:28:53 +0200 |
| parents | b50eed0cc0ef |
| children |
| rev | line source |
|---|---|
|
2
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
1 /********************************************************************** |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
2 * File: hocrrenderer.cpp |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
3 * Description: Simple API for calling tesseract. |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
4 * Author: Ray Smith (original code from baseapi.cpp) |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
5 * Author: Stefan Weil (moved to separate file and cleaned code) |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
6 * |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
7 * (C) Copyright 2006, Google Inc. |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
8 ** Licensed under the Apache License, Version 2.0 (the "License"); |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
9 ** you may not use this file except in compliance with the License. |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
10 ** You may obtain a copy of the License at |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
11 ** http://www.apache.org/licenses/LICENSE-2.0 |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
12 ** Unless required by applicable law or agreed to in writing, software |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
13 ** distributed under the License is distributed on an "AS IS" BASIS, |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
14 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
15 ** See the License for the specific language governing permissions and |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
16 ** limitations under the License. |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
17 * |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
18 **********************************************************************/ |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
19 |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
20 #include <tesseract/baseapi.h> // for TessBaseAPI |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
21 #include <locale> // for std::locale::classic |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
22 #include <memory> // for std::unique_ptr |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
23 #include <sstream> // for std::stringstream |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
24 #include <tesseract/renderer.h> |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
25 #include "helpers.h" // for copy_string |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
26 #include "tesseractclass.h" // for Tesseract |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
27 |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
28 namespace tesseract { |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
29 |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
30 /** |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
31 * Gets the block orientation at the current iterator position. |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
32 */ |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
33 static tesseract::Orientation GetBlockTextOrientation(const PageIterator *it) { |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
34 tesseract::Orientation orientation; |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
35 tesseract::WritingDirection writing_direction; |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
36 tesseract::TextlineOrder textline_order; |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
37 float deskew_angle; |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
38 it->Orientation(&orientation, &writing_direction, &textline_order, |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
39 &deskew_angle); |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
40 return orientation; |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
41 } |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
42 |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
43 /** |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
44 * Fits a line to the baseline at the given level, and appends its coefficients |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
45 * to the hOCR string. |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
46 * NOTE: The hOCR spec is unclear on how to specify baseline coefficients for |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
47 * rotated textlines. For this reason, on textlines that are not upright, this |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
48 * method currently only inserts a 'textangle' property to indicate the rotation |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
49 * direction and does not add any baseline information to the hocr string. |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
50 */ |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
51 static void AddBaselineCoordsTohOCR(const PageIterator *it, |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
52 PageIteratorLevel level, |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
53 std::stringstream &hocr_str) { |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
54 tesseract::Orientation orientation = GetBlockTextOrientation(it); |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
55 if (orientation != ORIENTATION_PAGE_UP) { |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
56 hocr_str << "; textangle " << 360 - orientation * 90; |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
57 return; |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
58 } |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
59 |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
60 int left, top, right, bottom; |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
61 it->BoundingBox(level, &left, &top, &right, &bottom); |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
62 |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
63 // Try to get the baseline coordinates at this level. |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
64 int x1, y1, x2, y2; |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
65 if (!it->Baseline(level, &x1, &y1, &x2, &y2)) { |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
66 return; |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
67 } |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
68 // Following the description of this field of the hOCR spec, we convert the |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
69 // baseline coordinates so that "the bottom left of the bounding box is the |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
70 // origin". |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
71 x1 -= left; |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
72 x2 -= left; |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
73 y1 -= bottom; |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
74 y2 -= bottom; |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
75 |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
76 // Now fit a line through the points so we can extract coefficients for the |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
77 // equation: y = p1 x + p0 |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
78 if (x1 == x2) { |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
79 // Problem computing the polynomial coefficients. |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
80 return; |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
81 } |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
82 double p1 = (y2 - y1) / static_cast<double>(x2 - x1); |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
83 double p0 = y1 - p1 * x1; |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
84 |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
85 hocr_str << "; baseline " << round(p1 * 1000.0) / 1000.0 << " " |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
86 << round(p0 * 1000.0) / 1000.0; |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
87 } |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
88 |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
89 static void AddBoxTohOCR(const ResultIterator *it, PageIteratorLevel level, |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
90 std::stringstream &hocr_str) { |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
91 int left, top, right, bottom; |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
92 it->BoundingBox(level, &left, &top, &right, &bottom); |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
93 // This is the only place we use double quotes instead of single quotes, |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
94 // but it may too late to change for consistency |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
95 hocr_str << " title=\"bbox " << left << " " << top << " " << right << " " |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
96 << bottom; |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
97 // Add baseline coordinates & heights for textlines only. |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
98 if (level == RIL_TEXTLINE) { |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
99 AddBaselineCoordsTohOCR(it, level, hocr_str); |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
100 // add custom height measures |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
101 float row_height, descenders, ascenders; // row attributes |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
102 it->RowAttributes(&row_height, &descenders, &ascenders); |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
103 // TODO(rays): Do we want to limit these to a single decimal place? |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
104 hocr_str << "; x_size " << row_height << "; x_descenders " << -descenders |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
105 << "; x_ascenders " << ascenders; |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
106 } |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
107 hocr_str << "\">"; |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
108 } |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
109 |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
110 /** |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
111 * Make a HTML-formatted string with hOCR markup from the internal |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
112 * data structures. |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
113 * page_number is 0-based but will appear in the output as 1-based. |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
114 * Image name/input_file_ can be set by SetInputName before calling |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
115 * GetHOCRText |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
116 * STL removed from original patch submission and refactored by rays. |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
117 * Returned string must be freed with the delete [] operator. |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
118 */ |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
119 char *TessBaseAPI::GetHOCRText(int page_number) { |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
120 return GetHOCRText(nullptr, page_number); |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
121 } |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
122 |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
123 /** |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
124 * Make a HTML-formatted string with hOCR markup from the internal |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
125 * data structures. |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
126 * page_number is 0-based but will appear in the output as 1-based. |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
127 * Image name/input_file_ can be set by SetInputName before calling |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
128 * GetHOCRText |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
129 * STL removed from original patch submission and refactored by rays. |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
130 * Returned string must be freed with the delete [] operator. |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
131 */ |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
132 char *TessBaseAPI::GetHOCRText(ETEXT_DESC *monitor, int page_number) { |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
133 if (tesseract_ == nullptr || |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
134 (page_res_ == nullptr && Recognize(monitor) < 0)) { |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
135 return nullptr; |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
136 } |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
137 |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
138 int lcnt = 1, bcnt = 1, pcnt = 1, wcnt = 1, scnt = 1, tcnt = 1, ccnt = 1; |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
139 int page_id = page_number + 1; // hOCR uses 1-based page numbers. |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
140 bool para_is_ltr = true; // Default direction is LTR |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
141 const char *paragraph_lang = nullptr; |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
142 bool font_info = false; |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
143 bool hocr_boxes = false; |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
144 GetBoolVariable("hocr_font_info", &font_info); |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
145 GetBoolVariable("hocr_char_boxes", &hocr_boxes); |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
146 |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
147 if (input_file_.empty()) { |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
148 SetInputName(nullptr); |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
149 } |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
150 |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
151 std::stringstream hocr_str; |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
152 // Use "C" locale (needed for double values x_size and x_descenders). |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
153 hocr_str.imbue(std::locale::classic()); |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
154 // Use 8 digits for double values. |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
155 hocr_str.precision(8); |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
156 hocr_str << " <div class='ocr_page'" |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
157 << " id='" |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
158 << "page_" << page_id << "'" |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
159 << " title='image \""; |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
160 if (!input_file_.empty()) { |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
161 hocr_str << HOcrEscape(input_file_.c_str()); |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
162 } else { |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
163 hocr_str << "unknown"; |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
164 } |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
165 |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
166 hocr_str << "\"; bbox " << rect_left_ << " " << rect_top_ << " " |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
167 << rect_width_ << " " << rect_height_ << "; ppageno " << page_number |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
168 << "; scan_res " << GetSourceYResolution() << " " |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
169 << GetSourceYResolution() << "'>\n"; |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
170 |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
171 std::unique_ptr<ResultIterator> res_it(GetIterator()); |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
172 while (!res_it->Empty(RIL_BLOCK)) { |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
173 int left, top, right, bottom; |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
174 auto block_type = res_it->BlockType(); |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
175 switch (block_type) { |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
176 case PT_FLOWING_IMAGE: |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
177 case PT_HEADING_IMAGE: |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
178 case PT_PULLOUT_IMAGE: { |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
179 // Handle all kinds of images. |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
180 res_it.get()->BoundingBox(RIL_TEXTLINE, &left, &top, &right, &bottom); |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
181 hocr_str << " <div class='ocr_photo' id='block_" << page_id << '_' |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
182 << bcnt++ << "' title=\"bbox " << left << " " << top << " " |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
183 << right << " " << bottom << "\"></div>\n"; |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
184 res_it->Next(RIL_BLOCK); |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
185 continue; |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
186 } |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
187 case PT_HORZ_LINE: |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
188 case PT_VERT_LINE: |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
189 // Handle horizontal and vertical lines. |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
190 res_it.get()->BoundingBox(RIL_TEXTLINE, &left, &top, &right, &bottom); |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
191 hocr_str << " <div class='ocr_separator' id='block_" << page_id << '_' |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
192 << bcnt++ << "' title=\"bbox " << left << " " << top << " " |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
193 << right << " " << bottom << "\"></div>\n"; |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
194 res_it->Next(RIL_BLOCK); |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
195 continue; |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
196 case PT_NOISE: |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
197 tprintf("TODO: Please report image which triggers the noise case.\n"); |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
198 ASSERT_HOST(false); |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
199 default: |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
200 break; |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
201 } |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
202 |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
203 if (res_it->Empty(RIL_WORD)) { |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
204 res_it->Next(RIL_WORD); |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
205 continue; |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
206 } |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
207 |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
208 // Open any new block/paragraph/textline. |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
209 if (res_it->IsAtBeginningOf(RIL_BLOCK)) { |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
210 para_is_ltr = true; // reset to default direction |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
211 hocr_str << " <div class='ocr_carea'" |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
212 << " id='" |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
213 << "block_" << page_id << "_" << bcnt << "'"; |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
214 AddBoxTohOCR(res_it.get(), RIL_BLOCK, hocr_str); |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
215 } |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
216 if (res_it->IsAtBeginningOf(RIL_PARA)) { |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
217 hocr_str << "\n <p class='ocr_par'"; |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
218 para_is_ltr = res_it->ParagraphIsLtr(); |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
219 if (!para_is_ltr) { |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
220 hocr_str << " dir='rtl'"; |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
221 } |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
222 hocr_str << " id='" |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
223 << "par_" << page_id << "_" << pcnt << "'"; |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
224 paragraph_lang = res_it->WordRecognitionLanguage(); |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
225 if (paragraph_lang) { |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
226 hocr_str << " lang='" << paragraph_lang << "'"; |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
227 } |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
228 AddBoxTohOCR(res_it.get(), RIL_PARA, hocr_str); |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
229 } |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
230 if (res_it->IsAtBeginningOf(RIL_TEXTLINE)) { |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
231 hocr_str << "\n <span class='"; |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
232 switch (block_type) { |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
233 case PT_HEADING_TEXT: |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
234 hocr_str << "ocr_header"; |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
235 break; |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
236 case PT_PULLOUT_TEXT: |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
237 hocr_str << "ocr_textfloat"; |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
238 break; |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
239 case PT_CAPTION_TEXT: |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
240 hocr_str << "ocr_caption"; |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
241 break; |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
242 case PT_FLOWING_IMAGE: |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
243 case PT_HEADING_IMAGE: |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
244 case PT_PULLOUT_IMAGE: |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
245 ASSERT_HOST(false); |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
246 break; |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
247 default: |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
248 hocr_str << "ocr_line"; |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
249 } |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
250 hocr_str << "' id='" |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
251 << "line_" << page_id << "_" << lcnt << "'"; |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
252 AddBoxTohOCR(res_it.get(), RIL_TEXTLINE, hocr_str); |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
253 } |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
254 |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
255 // Now, process the word... |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
256 int32_t lstm_choice_mode = tesseract_->lstm_choice_mode; |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
257 std::vector<std::vector<std::vector<std::pair<const char *, float>>>> |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
258 *rawTimestepMap = nullptr; |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
259 std::vector<std::vector<std::pair<const char *, float>>> *CTCMap = nullptr; |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
260 if (lstm_choice_mode) { |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
261 CTCMap = res_it->GetBestLSTMSymbolChoices(); |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
262 rawTimestepMap = res_it->GetRawLSTMTimesteps(); |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
263 } |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
264 hocr_str << "\n <span class='ocrx_word'" |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
265 << " id='" |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
266 << "word_" << page_id << "_" << wcnt << "'"; |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
267 bool bold, italic, underlined, monospace, serif, smallcaps; |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
268 int pointsize, font_id; |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
269 res_it->BoundingBox(RIL_WORD, &left, &top, &right, &bottom); |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
270 const char *font_name = |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
271 res_it->WordFontAttributes(&bold, &italic, &underlined, &monospace, |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
272 &serif, &smallcaps, &pointsize, &font_id); |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
273 hocr_str << " title='bbox " << left << " " << top << " " << right << " " |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
274 << bottom << "; x_wconf " |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
275 << static_cast<int>(res_it->Confidence(RIL_WORD)); |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
276 if (font_info) { |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
277 if (font_name) { |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
278 hocr_str << "; x_font " << HOcrEscape(font_name).c_str(); |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
279 } |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
280 hocr_str << "; x_fsize " << pointsize; |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
281 } |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
282 hocr_str << "'"; |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
283 const char *lang = res_it->WordRecognitionLanguage(); |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
284 if (lang && (!paragraph_lang || strcmp(lang, paragraph_lang))) { |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
285 hocr_str << " lang='" << lang << "'"; |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
286 } |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
287 switch (res_it->WordDirection()) { |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
288 // Only emit direction if different from current paragraph direction |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
289 case DIR_LEFT_TO_RIGHT: |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
290 if (!para_is_ltr) { |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
291 hocr_str << " dir='ltr'"; |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
292 } |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
293 break; |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
294 case DIR_RIGHT_TO_LEFT: |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
295 if (para_is_ltr) { |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
296 hocr_str << " dir='rtl'"; |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
297 } |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
298 break; |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
299 case DIR_MIX: |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
300 case DIR_NEUTRAL: |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
301 default: // Do nothing. |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
302 break; |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
303 } |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
304 hocr_str << ">"; |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
305 bool last_word_in_line = res_it->IsAtFinalElement(RIL_TEXTLINE, RIL_WORD); |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
306 bool last_word_in_para = res_it->IsAtFinalElement(RIL_PARA, RIL_WORD); |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
307 bool last_word_in_block = res_it->IsAtFinalElement(RIL_BLOCK, RIL_WORD); |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
308 if (bold) { |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
309 hocr_str << "<strong>"; |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
310 } |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
311 if (italic) { |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
312 hocr_str << "<em>"; |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
313 } |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
314 do { |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
315 const std::unique_ptr<const char[]> grapheme( |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
316 res_it->GetUTF8Text(RIL_SYMBOL)); |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
317 if (grapheme && grapheme[0] != 0) { |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
318 if (hocr_boxes) { |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
319 res_it->BoundingBox(RIL_SYMBOL, &left, &top, &right, &bottom); |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
320 hocr_str << "\n <span class='ocrx_cinfo' title='x_bboxes " |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
321 << left << " " << top << " " << right << " " << bottom |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
322 << "; x_conf " << res_it->Confidence(RIL_SYMBOL) << "'>"; |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
323 } |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
324 hocr_str << HOcrEscape(grapheme.get()).c_str(); |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
325 if (hocr_boxes) { |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
326 hocr_str << "</span>"; |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
327 tesseract::ChoiceIterator ci(*res_it); |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
328 if (lstm_choice_mode == 1 && ci.Timesteps() != nullptr) { |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
329 std::vector<std::vector<std::pair<const char *, float>>> *symbol = |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
330 ci.Timesteps(); |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
331 hocr_str << "\n <span class='ocr_symbol'" |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
332 << " id='" |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
333 << "symbol_" << page_id << "_" << wcnt << "_" << scnt |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
334 << "'>"; |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
335 for (const auto ×tep : *symbol) { |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
336 hocr_str << "\n <span class='ocrx_cinfo'" |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
337 << " id='" |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
338 << "timestep" << page_id << "_" << wcnt << "_" << tcnt |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
339 << "'>"; |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
340 for (auto conf : timestep) { |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
341 hocr_str << "\n <span class='ocrx_cinfo'" |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
342 << " id='" |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
343 << "choice_" << page_id << "_" << wcnt << "_" << ccnt |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
344 << "'" |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
345 << " title='x_confs " << int(conf.second * 100) << "'>" |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
346 << HOcrEscape(conf.first).c_str() << "</span>"; |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
347 ++ccnt; |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
348 } |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
349 hocr_str << "</span>"; |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
350 ++tcnt; |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
351 } |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
352 hocr_str << "\n </span>"; |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
353 ++scnt; |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
354 } else if (lstm_choice_mode == 2) { |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
355 hocr_str << "\n <span class='ocrx_cinfo'" |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
356 << " id='" |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
357 << "lstm_choices_" << page_id << "_" << wcnt << "_" << tcnt |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
358 << "'>"; |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
359 do { |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
360 const char *choice = ci.GetUTF8Text(); |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
361 float choiceconf = ci.Confidence(); |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
362 if (choice != nullptr) { |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
363 hocr_str << "\n <span class='ocrx_cinfo'" |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
364 << " id='" |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
365 << "choice_" << page_id << "_" << wcnt << "_" << ccnt |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
366 << "'" |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
367 << " title='x_confs " << choiceconf << "'>" |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
368 << HOcrEscape(choice).c_str() << "</span>"; |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
369 ccnt++; |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
370 } |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
371 } while (ci.Next()); |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
372 hocr_str << "\n </span>"; |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
373 tcnt++; |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
374 } |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
375 } |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
376 } |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
377 res_it->Next(RIL_SYMBOL); |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
378 } while (!res_it->Empty(RIL_BLOCK) && !res_it->IsAtBeginningOf(RIL_WORD)); |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
379 if (italic) { |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
380 hocr_str << "</em>"; |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
381 } |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
382 if (bold) { |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
383 hocr_str << "</strong>"; |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
384 } |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
385 // If the lstm choice mode is required it is added here |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
386 if (lstm_choice_mode == 1 && !hocr_boxes && rawTimestepMap != nullptr) { |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
387 for (const auto &symbol : *rawTimestepMap) { |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
388 hocr_str << "\n <span class='ocr_symbol'" |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
389 << " id='" |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
390 << "symbol_" << page_id << "_" << wcnt << "_" << scnt << "'>"; |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
391 for (const auto ×tep : symbol) { |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
392 hocr_str << "\n <span class='ocrx_cinfo'" |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
393 << " id='" |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
394 << "timestep" << page_id << "_" << wcnt << "_" << tcnt |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
395 << "'>"; |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
396 for (auto &&conf : timestep) { |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
397 hocr_str << "\n <span class='ocrx_cinfo'" |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
398 << " id='" |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
399 << "choice_" << page_id << "_" << wcnt << "_" << ccnt |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
400 << "'" |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
401 << " title='x_confs " << int(conf.second * 100) << "'>" |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
402 << HOcrEscape(conf.first).c_str() << "</span>"; |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
403 ++ccnt; |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
404 } |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
405 hocr_str << "</span>"; |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
406 ++tcnt; |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
407 } |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
408 hocr_str << "</span>"; |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
409 ++scnt; |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
410 } |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
411 } else if (lstm_choice_mode == 2 && !hocr_boxes && CTCMap != nullptr) { |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
412 for (const auto ×tep : *CTCMap) { |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
413 if (timestep.size() > 0) { |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
414 hocr_str << "\n <span class='ocrx_cinfo'" |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
415 << " id='" |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
416 << "lstm_choices_" << page_id << "_" << wcnt << "_" << tcnt |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
417 << "'>"; |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
418 for (auto &j : timestep) { |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
419 float conf = 100 - tesseract_->lstm_rating_coefficient * j.second; |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
420 if (conf < 0.0f) { |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
421 conf = 0.0f; |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
422 } |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
423 if (conf > 100.0f) { |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
424 conf = 100.0f; |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
425 } |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
426 hocr_str << "\n <span class='ocrx_cinfo'" |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
427 << " id='" |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
428 << "choice_" << page_id << "_" << wcnt << "_" << ccnt |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
429 << "'" |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
430 << " title='x_confs " << conf << "'>" |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
431 << HOcrEscape(j.first).c_str() << "</span>"; |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
432 ccnt++; |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
433 } |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
434 hocr_str << "</span>"; |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
435 tcnt++; |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
436 } |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
437 } |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
438 } |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
439 // Close ocrx_word. |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
440 if (hocr_boxes || lstm_choice_mode > 0) { |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
441 hocr_str << "\n "; |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
442 } |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
443 hocr_str << "</span>"; |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
444 tcnt = 1; |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
445 ccnt = 1; |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
446 wcnt++; |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
447 // Close any ending block/paragraph/textline. |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
448 if (last_word_in_line) { |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
449 hocr_str << "\n </span>"; |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
450 lcnt++; |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
451 } |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
452 if (last_word_in_para) { |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
453 hocr_str << "\n </p>\n"; |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
454 pcnt++; |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
455 para_is_ltr = true; // back to default direction |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
456 } |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
457 if (last_word_in_block) { |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
458 hocr_str << " </div>\n"; |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
459 bcnt++; |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
460 } |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
461 } |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
462 hocr_str << " </div>\n"; |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
463 |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
464 return copy_string(hocr_str.str()); |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
465 } |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
466 |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
467 /********************************************************************** |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
468 * HOcr Text Renderer interface implementation |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
469 **********************************************************************/ |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
470 TessHOcrRenderer::TessHOcrRenderer(const char *outputbase) |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
471 : TessResultRenderer(outputbase, "hocr") { |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
472 font_info_ = false; |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
473 } |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
474 |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
475 TessHOcrRenderer::TessHOcrRenderer(const char *outputbase, bool font_info) |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
476 : TessResultRenderer(outputbase, "hocr") { |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
477 font_info_ = font_info; |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
478 } |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
479 |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
480 bool TessHOcrRenderer::BeginDocumentHandler() { |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
481 AppendString( |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
482 "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n" |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
483 "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\"\n" |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
484 " \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\">\n" |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
485 "<html xmlns=\"http://www.w3.org/1999/xhtml\" xml:lang=\"en\" " |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
486 "lang=\"en\">\n <head>\n <title>"); |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
487 AppendString(title()); |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
488 AppendString( |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
489 "</title>\n" |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
490 " <meta http-equiv=\"Content-Type\" content=\"text/html;" |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
491 "charset=utf-8\"/>\n" |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
492 " <meta name='ocr-system' content='tesseract " TESSERACT_VERSION_STR |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
493 "' />\n" |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
494 " <meta name='ocr-capabilities' content='ocr_page ocr_carea ocr_par" |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
495 " ocr_line ocrx_word ocrp_dir ocrp_lang ocrp_wconf"); |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
496 if (font_info_) { |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
497 AppendString(" ocrp_font ocrp_fsize"); |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
498 } |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
499 AppendString( |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
500 "'/>\n" |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
501 " </head>\n" |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
502 " <body>\n"); |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
503 |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
504 return true; |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
505 } |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
506 |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
507 bool TessHOcrRenderer::EndDocumentHandler() { |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
508 AppendString(" </body>\n</html>\n"); |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
509 |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
510 return true; |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
511 } |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
512 |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
513 bool TessHOcrRenderer::AddImageHandler(TessBaseAPI *api) { |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
514 const std::unique_ptr<const char[]> hocr(api->GetHOCRText(imagenum())); |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
515 if (hocr == nullptr) { |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
516 return false; |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
517 } |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
518 |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
519 AppendString(hocr.get()); |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
520 |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
521 return true; |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
522 } |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
523 |
|
b50eed0cc0ef
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
Franz Glasner <fzglas.hg@dom66.de>
parents:
diff
changeset
|
524 } // namespace tesseract |
