comparison mupdf-source/thirdparty/tesseract/src/api/altorenderer.cpp @ 2:b50eed0cc0ef upstream

ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4. The directory name has changed: no version number in the expanded directory now.
author Franz Glasner <fzglas.hg@dom66.de>
date Mon, 15 Sep 2025 11:43:07 +0200
parents
children
comparison
equal deleted inserted replaced
1:1d09e1dec1d9 2:b50eed0cc0ef
1 // File: altorenderer.cpp
2 // Description: ALTO rendering interface
3 // Author: Jake Sebright
4
5 // (C) Copyright 2018
6 // Licensed under the Apache License, Version 2.0 (the "License");
7 // you may not use this file except in compliance with the License.
8 // You may obtain a copy of the License at
9 // http://www.apache.org/licenses/LICENSE-2.0
10 // Unless required by applicable law or agreed to in writing, software
11 // distributed under the License is distributed on an "AS IS" BASIS,
12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 // See the License for the specific language governing permissions and
14 // limitations under the License.
15
16 #include "errcode.h" // for ASSERT_HOST
17 #include "helpers.h" // for copy_string
18 #include "tprintf.h" // for tprintf
19
20 #include <tesseract/baseapi.h>
21 #include <tesseract/renderer.h>
22
23 #include <memory>
24 #include <sstream> // for std::stringstream
25
26 namespace tesseract {
27
28 /// Add coordinates to specified TextBlock, TextLine or String bounding box.
29 /// Add word confidence if adding to a String bounding box.
30 ///
31 static void AddBoxToAlto(const ResultIterator *it, PageIteratorLevel level,
32 std::stringstream &alto_str) {
33 int left, top, right, bottom;
34 it->BoundingBox(level, &left, &top, &right, &bottom);
35
36 int hpos = left;
37 int vpos = top;
38 int height = bottom - top;
39 int width = right - left;
40
41 alto_str << " HPOS=\"" << hpos << "\"";
42 alto_str << " VPOS=\"" << vpos << "\"";
43 alto_str << " WIDTH=\"" << width << "\"";
44 alto_str << " HEIGHT=\"" << height << "\"";
45
46 if (level == RIL_WORD) {
47 int wc = it->Confidence(RIL_WORD);
48 alto_str << " WC=\"0." << wc << "\"";
49 } else {
50 alto_str << ">";
51 }
52 }
53
54 ///
55 /// Append the ALTO XML for the beginning of the document
56 ///
57 bool TessAltoRenderer::BeginDocumentHandler() {
58 // Delay the XML output because we need the name of the image file.
59 begin_document = true;
60 return true;
61 }
62
63 ///
64 /// Append the ALTO XML for the layout of the image
65 ///
66 bool TessAltoRenderer::AddImageHandler(TessBaseAPI *api) {
67 if (begin_document) {
68 AppendString(
69 "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n"
70 "<alto xmlns=\"http://www.loc.gov/standards/alto/ns-v3#\" "
71 "xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\" "
72 "xsi:schemaLocation=\"http://www.loc.gov/standards/alto/ns-v3# "
73 "http://www.loc.gov/alto/v3/alto-3-0.xsd\">\n"
74 "\t<Description>\n"
75 "\t\t<MeasurementUnit>pixel</MeasurementUnit>\n"
76 "\t\t<sourceImageInformation>\n"
77 "\t\t\t<fileName>");
78
79 AppendString(api->GetInputName());
80
81 AppendString(
82 "</fileName>\n"
83 "\t\t</sourceImageInformation>\n"
84 "\t\t<OCRProcessing ID=\"OCR_0\">\n"
85 "\t\t\t<ocrProcessingStep>\n"
86 "\t\t\t\t<processingSoftware>\n"
87 "\t\t\t\t\t<softwareName>tesseract ");
88 AppendString(TessBaseAPI::Version());
89 AppendString(
90 "</softwareName>\n"
91 "\t\t\t\t</processingSoftware>\n"
92 "\t\t\t</ocrProcessingStep>\n"
93 "\t\t</OCRProcessing>\n"
94 "\t</Description>\n"
95 "\t<Layout>\n");
96 begin_document = false;
97 }
98
99 const std::unique_ptr<const char[]> text(api->GetAltoText(imagenum()));
100 if (text == nullptr) {
101 return false;
102 }
103
104 AppendString(text.get());
105
106 return true;
107 }
108
109 ///
110 /// Append the ALTO XML for the end of the document
111 ///
112 bool TessAltoRenderer::EndDocumentHandler() {
113 AppendString("\t</Layout>\n</alto>\n");
114
115 return true;
116 }
117
118 TessAltoRenderer::TessAltoRenderer(const char *outputbase)
119 : TessResultRenderer(outputbase, "xml"),
120 begin_document(false) {}
121
122 ///
123 /// Make an XML-formatted string with ALTO markup from the internal
124 /// data structures.
125 ///
126 char *TessBaseAPI::GetAltoText(int page_number) {
127 return GetAltoText(nullptr, page_number);
128 }
129
130 ///
131 /// Make an XML-formatted string with ALTO markup from the internal
132 /// data structures.
133 ///
134 char *TessBaseAPI::GetAltoText(ETEXT_DESC *monitor, int page_number) {
135 if (tesseract_ == nullptr || (page_res_ == nullptr && Recognize(monitor) < 0)) {
136 return nullptr;
137 }
138
139 int lcnt = 0, tcnt = 0, bcnt = 0, wcnt = 0;
140
141 if (input_file_.empty()) {
142 SetInputName(nullptr);
143 }
144
145 std::stringstream alto_str;
146 // Use "C" locale (needed for int values larger than 999).
147 alto_str.imbue(std::locale::classic());
148 alto_str << "\t\t<Page WIDTH=\"" << rect_width_ << "\" HEIGHT=\"" << rect_height_
149 << "\" PHYSICAL_IMG_NR=\"" << page_number << "\""
150 << " ID=\"page_" << page_number << "\">\n"
151 << "\t\t\t<PrintSpace HPOS=\"0\" VPOS=\"0\""
152 << " WIDTH=\"" << rect_width_ << "\""
153 << " HEIGHT=\"" << rect_height_ << "\">\n";
154
155 std::unique_ptr<ResultIterator> res_it(GetIterator());
156 while (!res_it->Empty(RIL_BLOCK)) {
157 if (res_it->Empty(RIL_WORD)) {
158 res_it->Next(RIL_WORD);
159 continue;
160 }
161
162 int left, top, right, bottom;
163 auto block_type = res_it->BlockType();
164
165 switch (block_type) {
166 case PT_FLOWING_IMAGE:
167 case PT_HEADING_IMAGE:
168 case PT_PULLOUT_IMAGE: {
169 // Handle all kinds of images.
170 // TODO: optionally add TYPE, for example TYPE="photo".
171 alto_str << "\t\t\t\t<Illustration ID=\"cblock_" << bcnt++ << "\"";
172 AddBoxToAlto(res_it.get(), RIL_BLOCK, alto_str);
173 alto_str << "</Illustration>\n";
174 res_it->Next(RIL_BLOCK);
175 continue;
176 }
177 case PT_HORZ_LINE:
178 case PT_VERT_LINE:
179 // Handle horizontal and vertical lines.
180 alto_str << "\t\t\t\t<GraphicalElement ID=\"cblock_" << bcnt++ << "\"";
181 AddBoxToAlto(res_it.get(), RIL_BLOCK, alto_str);
182 alto_str << "</GraphicalElement >\n";
183 res_it->Next(RIL_BLOCK);
184 continue;
185 case PT_NOISE:
186 tprintf("TODO: Please report image which triggers the noise case.\n");
187 ASSERT_HOST(false);
188 default:
189 break;
190 }
191
192 if (res_it->IsAtBeginningOf(RIL_BLOCK)) {
193 alto_str << "\t\t\t\t<ComposedBlock ID=\"cblock_" << bcnt << "\"";
194 AddBoxToAlto(res_it.get(), RIL_BLOCK, alto_str);
195 alto_str << "\n";
196 }
197
198 if (res_it->IsAtBeginningOf(RIL_PARA)) {
199 alto_str << "\t\t\t\t\t<TextBlock ID=\"block_" << tcnt << "\"";
200 AddBoxToAlto(res_it.get(), RIL_PARA, alto_str);
201 alto_str << "\n";
202 }
203
204 if (res_it->IsAtBeginningOf(RIL_TEXTLINE)) {
205 alto_str << "\t\t\t\t\t\t<TextLine ID=\"line_" << lcnt << "\"";
206 AddBoxToAlto(res_it.get(), RIL_TEXTLINE, alto_str);
207 alto_str << "\n";
208 }
209
210 alto_str << "\t\t\t\t\t\t\t<String ID=\"string_" << wcnt << "\"";
211 AddBoxToAlto(res_it.get(), RIL_WORD, alto_str);
212 alto_str << " CONTENT=\"";
213
214 bool last_word_in_line = res_it->IsAtFinalElement(RIL_TEXTLINE, RIL_WORD);
215 bool last_word_in_tblock = res_it->IsAtFinalElement(RIL_PARA, RIL_WORD);
216 bool last_word_in_cblock = res_it->IsAtFinalElement(RIL_BLOCK, RIL_WORD);
217
218 res_it->BoundingBox(RIL_WORD, &left, &top, &right, &bottom);
219
220 do {
221 const std::unique_ptr<const char[]> grapheme(res_it->GetUTF8Text(RIL_SYMBOL));
222 if (grapheme && grapheme[0] != 0) {
223 alto_str << HOcrEscape(grapheme.get()).c_str();
224 }
225 res_it->Next(RIL_SYMBOL);
226 } while (!res_it->Empty(RIL_BLOCK) && !res_it->IsAtBeginningOf(RIL_WORD));
227
228 alto_str << "\"/>";
229
230 wcnt++;
231
232 if (last_word_in_line) {
233 alto_str << "\n\t\t\t\t\t\t</TextLine>\n";
234 lcnt++;
235 } else {
236 int hpos = right;
237 int vpos = top;
238 res_it->BoundingBox(RIL_WORD, &left, &top, &right, &bottom);
239 int width = left - hpos;
240 alto_str << "<SP WIDTH=\"" << width << "\" VPOS=\"" << vpos << "\" HPOS=\"" << hpos
241 << "\"/>\n";
242 }
243
244 if (last_word_in_tblock) {
245 alto_str << "\t\t\t\t\t</TextBlock>\n";
246 tcnt++;
247 }
248
249 if (last_word_in_cblock) {
250 alto_str << "\t\t\t\t</ComposedBlock>\n";
251 bcnt++;
252 }
253 }
254
255 alto_str << "\t\t\t</PrintSpace>\n"
256 << "\t\t</Page>\n";
257
258 return copy_string(alto_str.str());
259 }
260
261 } // namespace tesseract