comparison mupdf-source/thirdparty/tesseract/src/api/hocrrenderer.cpp @ 2:b50eed0cc0ef upstream

ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4. The directory name has changed: no version number in the expanded directory now.
author Franz Glasner <fzglas.hg@dom66.de>
date Mon, 15 Sep 2025 11:43:07 +0200
parents
children
comparison
equal deleted inserted replaced
1:1d09e1dec1d9 2:b50eed0cc0ef
1 /**********************************************************************
2 * File: hocrrenderer.cpp
3 * Description: Simple API for calling tesseract.
4 * Author: Ray Smith (original code from baseapi.cpp)
5 * Author: Stefan Weil (moved to separate file and cleaned code)
6 *
7 * (C) Copyright 2006, Google Inc.
8 ** Licensed under the Apache License, Version 2.0 (the "License");
9 ** you may not use this file except in compliance with the License.
10 ** You may obtain a copy of the License at
11 ** http://www.apache.org/licenses/LICENSE-2.0
12 ** Unless required by applicable law or agreed to in writing, software
13 ** distributed under the License is distributed on an "AS IS" BASIS,
14 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 ** See the License for the specific language governing permissions and
16 ** limitations under the License.
17 *
18 **********************************************************************/
19
20 #include <tesseract/baseapi.h> // for TessBaseAPI
21 #include <locale> // for std::locale::classic
22 #include <memory> // for std::unique_ptr
23 #include <sstream> // for std::stringstream
24 #include <tesseract/renderer.h>
25 #include "helpers.h" // for copy_string
26 #include "tesseractclass.h" // for Tesseract
27
28 namespace tesseract {
29
30 /**
31 * Gets the block orientation at the current iterator position.
32 */
33 static tesseract::Orientation GetBlockTextOrientation(const PageIterator *it) {
34 tesseract::Orientation orientation;
35 tesseract::WritingDirection writing_direction;
36 tesseract::TextlineOrder textline_order;
37 float deskew_angle;
38 it->Orientation(&orientation, &writing_direction, &textline_order,
39 &deskew_angle);
40 return orientation;
41 }
42
43 /**
44 * Fits a line to the baseline at the given level, and appends its coefficients
45 * to the hOCR string.
46 * NOTE: The hOCR spec is unclear on how to specify baseline coefficients for
47 * rotated textlines. For this reason, on textlines that are not upright, this
48 * method currently only inserts a 'textangle' property to indicate the rotation
49 * direction and does not add any baseline information to the hocr string.
50 */
51 static void AddBaselineCoordsTohOCR(const PageIterator *it,
52 PageIteratorLevel level,
53 std::stringstream &hocr_str) {
54 tesseract::Orientation orientation = GetBlockTextOrientation(it);
55 if (orientation != ORIENTATION_PAGE_UP) {
56 hocr_str << "; textangle " << 360 - orientation * 90;
57 return;
58 }
59
60 int left, top, right, bottom;
61 it->BoundingBox(level, &left, &top, &right, &bottom);
62
63 // Try to get the baseline coordinates at this level.
64 int x1, y1, x2, y2;
65 if (!it->Baseline(level, &x1, &y1, &x2, &y2)) {
66 return;
67 }
68 // Following the description of this field of the hOCR spec, we convert the
69 // baseline coordinates so that "the bottom left of the bounding box is the
70 // origin".
71 x1 -= left;
72 x2 -= left;
73 y1 -= bottom;
74 y2 -= bottom;
75
76 // Now fit a line through the points so we can extract coefficients for the
77 // equation: y = p1 x + p0
78 if (x1 == x2) {
79 // Problem computing the polynomial coefficients.
80 return;
81 }
82 double p1 = (y2 - y1) / static_cast<double>(x2 - x1);
83 double p0 = y1 - p1 * x1;
84
85 hocr_str << "; baseline " << round(p1 * 1000.0) / 1000.0 << " "
86 << round(p0 * 1000.0) / 1000.0;
87 }
88
89 static void AddBoxTohOCR(const ResultIterator *it, PageIteratorLevel level,
90 std::stringstream &hocr_str) {
91 int left, top, right, bottom;
92 it->BoundingBox(level, &left, &top, &right, &bottom);
93 // This is the only place we use double quotes instead of single quotes,
94 // but it may too late to change for consistency
95 hocr_str << " title=\"bbox " << left << " " << top << " " << right << " "
96 << bottom;
97 // Add baseline coordinates & heights for textlines only.
98 if (level == RIL_TEXTLINE) {
99 AddBaselineCoordsTohOCR(it, level, hocr_str);
100 // add custom height measures
101 float row_height, descenders, ascenders; // row attributes
102 it->RowAttributes(&row_height, &descenders, &ascenders);
103 // TODO(rays): Do we want to limit these to a single decimal place?
104 hocr_str << "; x_size " << row_height << "; x_descenders " << -descenders
105 << "; x_ascenders " << ascenders;
106 }
107 hocr_str << "\">";
108 }
109
110 /**
111 * Make a HTML-formatted string with hOCR markup from the internal
112 * data structures.
113 * page_number is 0-based but will appear in the output as 1-based.
114 * Image name/input_file_ can be set by SetInputName before calling
115 * GetHOCRText
116 * STL removed from original patch submission and refactored by rays.
117 * Returned string must be freed with the delete [] operator.
118 */
119 char *TessBaseAPI::GetHOCRText(int page_number) {
120 return GetHOCRText(nullptr, page_number);
121 }
122
123 /**
124 * Make a HTML-formatted string with hOCR markup from the internal
125 * data structures.
126 * page_number is 0-based but will appear in the output as 1-based.
127 * Image name/input_file_ can be set by SetInputName before calling
128 * GetHOCRText
129 * STL removed from original patch submission and refactored by rays.
130 * Returned string must be freed with the delete [] operator.
131 */
132 char *TessBaseAPI::GetHOCRText(ETEXT_DESC *monitor, int page_number) {
133 if (tesseract_ == nullptr ||
134 (page_res_ == nullptr && Recognize(monitor) < 0)) {
135 return nullptr;
136 }
137
138 int lcnt = 1, bcnt = 1, pcnt = 1, wcnt = 1, scnt = 1, tcnt = 1, ccnt = 1;
139 int page_id = page_number + 1; // hOCR uses 1-based page numbers.
140 bool para_is_ltr = true; // Default direction is LTR
141 const char *paragraph_lang = nullptr;
142 bool font_info = false;
143 bool hocr_boxes = false;
144 GetBoolVariable("hocr_font_info", &font_info);
145 GetBoolVariable("hocr_char_boxes", &hocr_boxes);
146
147 if (input_file_.empty()) {
148 SetInputName(nullptr);
149 }
150
151 std::stringstream hocr_str;
152 // Use "C" locale (needed for double values x_size and x_descenders).
153 hocr_str.imbue(std::locale::classic());
154 // Use 8 digits for double values.
155 hocr_str.precision(8);
156 hocr_str << " <div class='ocr_page'"
157 << " id='"
158 << "page_" << page_id << "'"
159 << " title='image \"";
160 if (!input_file_.empty()) {
161 hocr_str << HOcrEscape(input_file_.c_str());
162 } else {
163 hocr_str << "unknown";
164 }
165
166 hocr_str << "\"; bbox " << rect_left_ << " " << rect_top_ << " "
167 << rect_width_ << " " << rect_height_ << "; ppageno " << page_number
168 << "; scan_res " << GetSourceYResolution() << " "
169 << GetSourceYResolution() << "'>\n";
170
171 std::unique_ptr<ResultIterator> res_it(GetIterator());
172 while (!res_it->Empty(RIL_BLOCK)) {
173 int left, top, right, bottom;
174 auto block_type = res_it->BlockType();
175 switch (block_type) {
176 case PT_FLOWING_IMAGE:
177 case PT_HEADING_IMAGE:
178 case PT_PULLOUT_IMAGE: {
179 // Handle all kinds of images.
180 res_it.get()->BoundingBox(RIL_TEXTLINE, &left, &top, &right, &bottom);
181 hocr_str << " <div class='ocr_photo' id='block_" << page_id << '_'
182 << bcnt++ << "' title=\"bbox " << left << " " << top << " "
183 << right << " " << bottom << "\"></div>\n";
184 res_it->Next(RIL_BLOCK);
185 continue;
186 }
187 case PT_HORZ_LINE:
188 case PT_VERT_LINE:
189 // Handle horizontal and vertical lines.
190 res_it.get()->BoundingBox(RIL_TEXTLINE, &left, &top, &right, &bottom);
191 hocr_str << " <div class='ocr_separator' id='block_" << page_id << '_'
192 << bcnt++ << "' title=\"bbox " << left << " " << top << " "
193 << right << " " << bottom << "\"></div>\n";
194 res_it->Next(RIL_BLOCK);
195 continue;
196 case PT_NOISE:
197 tprintf("TODO: Please report image which triggers the noise case.\n");
198 ASSERT_HOST(false);
199 default:
200 break;
201 }
202
203 if (res_it->Empty(RIL_WORD)) {
204 res_it->Next(RIL_WORD);
205 continue;
206 }
207
208 // Open any new block/paragraph/textline.
209 if (res_it->IsAtBeginningOf(RIL_BLOCK)) {
210 para_is_ltr = true; // reset to default direction
211 hocr_str << " <div class='ocr_carea'"
212 << " id='"
213 << "block_" << page_id << "_" << bcnt << "'";
214 AddBoxTohOCR(res_it.get(), RIL_BLOCK, hocr_str);
215 }
216 if (res_it->IsAtBeginningOf(RIL_PARA)) {
217 hocr_str << "\n <p class='ocr_par'";
218 para_is_ltr = res_it->ParagraphIsLtr();
219 if (!para_is_ltr) {
220 hocr_str << " dir='rtl'";
221 }
222 hocr_str << " id='"
223 << "par_" << page_id << "_" << pcnt << "'";
224 paragraph_lang = res_it->WordRecognitionLanguage();
225 if (paragraph_lang) {
226 hocr_str << " lang='" << paragraph_lang << "'";
227 }
228 AddBoxTohOCR(res_it.get(), RIL_PARA, hocr_str);
229 }
230 if (res_it->IsAtBeginningOf(RIL_TEXTLINE)) {
231 hocr_str << "\n <span class='";
232 switch (block_type) {
233 case PT_HEADING_TEXT:
234 hocr_str << "ocr_header";
235 break;
236 case PT_PULLOUT_TEXT:
237 hocr_str << "ocr_textfloat";
238 break;
239 case PT_CAPTION_TEXT:
240 hocr_str << "ocr_caption";
241 break;
242 case PT_FLOWING_IMAGE:
243 case PT_HEADING_IMAGE:
244 case PT_PULLOUT_IMAGE:
245 ASSERT_HOST(false);
246 break;
247 default:
248 hocr_str << "ocr_line";
249 }
250 hocr_str << "' id='"
251 << "line_" << page_id << "_" << lcnt << "'";
252 AddBoxTohOCR(res_it.get(), RIL_TEXTLINE, hocr_str);
253 }
254
255 // Now, process the word...
256 int32_t lstm_choice_mode = tesseract_->lstm_choice_mode;
257 std::vector<std::vector<std::vector<std::pair<const char *, float>>>>
258 *rawTimestepMap = nullptr;
259 std::vector<std::vector<std::pair<const char *, float>>> *CTCMap = nullptr;
260 if (lstm_choice_mode) {
261 CTCMap = res_it->GetBestLSTMSymbolChoices();
262 rawTimestepMap = res_it->GetRawLSTMTimesteps();
263 }
264 hocr_str << "\n <span class='ocrx_word'"
265 << " id='"
266 << "word_" << page_id << "_" << wcnt << "'";
267 bool bold, italic, underlined, monospace, serif, smallcaps;
268 int pointsize, font_id;
269 res_it->BoundingBox(RIL_WORD, &left, &top, &right, &bottom);
270 const char *font_name =
271 res_it->WordFontAttributes(&bold, &italic, &underlined, &monospace,
272 &serif, &smallcaps, &pointsize, &font_id);
273 hocr_str << " title='bbox " << left << " " << top << " " << right << " "
274 << bottom << "; x_wconf "
275 << static_cast<int>(res_it->Confidence(RIL_WORD));
276 if (font_info) {
277 if (font_name) {
278 hocr_str << "; x_font " << HOcrEscape(font_name).c_str();
279 }
280 hocr_str << "; x_fsize " << pointsize;
281 }
282 hocr_str << "'";
283 const char *lang = res_it->WordRecognitionLanguage();
284 if (lang && (!paragraph_lang || strcmp(lang, paragraph_lang))) {
285 hocr_str << " lang='" << lang << "'";
286 }
287 switch (res_it->WordDirection()) {
288 // Only emit direction if different from current paragraph direction
289 case DIR_LEFT_TO_RIGHT:
290 if (!para_is_ltr) {
291 hocr_str << " dir='ltr'";
292 }
293 break;
294 case DIR_RIGHT_TO_LEFT:
295 if (para_is_ltr) {
296 hocr_str << " dir='rtl'";
297 }
298 break;
299 case DIR_MIX:
300 case DIR_NEUTRAL:
301 default: // Do nothing.
302 break;
303 }
304 hocr_str << ">";
305 bool last_word_in_line = res_it->IsAtFinalElement(RIL_TEXTLINE, RIL_WORD);
306 bool last_word_in_para = res_it->IsAtFinalElement(RIL_PARA, RIL_WORD);
307 bool last_word_in_block = res_it->IsAtFinalElement(RIL_BLOCK, RIL_WORD);
308 if (bold) {
309 hocr_str << "<strong>";
310 }
311 if (italic) {
312 hocr_str << "<em>";
313 }
314 do {
315 const std::unique_ptr<const char[]> grapheme(
316 res_it->GetUTF8Text(RIL_SYMBOL));
317 if (grapheme && grapheme[0] != 0) {
318 if (hocr_boxes) {
319 res_it->BoundingBox(RIL_SYMBOL, &left, &top, &right, &bottom);
320 hocr_str << "\n <span class='ocrx_cinfo' title='x_bboxes "
321 << left << " " << top << " " << right << " " << bottom
322 << "; x_conf " << res_it->Confidence(RIL_SYMBOL) << "'>";
323 }
324 hocr_str << HOcrEscape(grapheme.get()).c_str();
325 if (hocr_boxes) {
326 hocr_str << "</span>";
327 tesseract::ChoiceIterator ci(*res_it);
328 if (lstm_choice_mode == 1 && ci.Timesteps() != nullptr) {
329 std::vector<std::vector<std::pair<const char *, float>>> *symbol =
330 ci.Timesteps();
331 hocr_str << "\n <span class='ocr_symbol'"
332 << " id='"
333 << "symbol_" << page_id << "_" << wcnt << "_" << scnt
334 << "'>";
335 for (const auto &timestep : *symbol) {
336 hocr_str << "\n <span class='ocrx_cinfo'"
337 << " id='"
338 << "timestep" << page_id << "_" << wcnt << "_" << tcnt
339 << "'>";
340 for (auto conf : timestep) {
341 hocr_str << "\n <span class='ocrx_cinfo'"
342 << " id='"
343 << "choice_" << page_id << "_" << wcnt << "_" << ccnt
344 << "'"
345 << " title='x_confs " << int(conf.second * 100) << "'>"
346 << HOcrEscape(conf.first).c_str() << "</span>";
347 ++ccnt;
348 }
349 hocr_str << "</span>";
350 ++tcnt;
351 }
352 hocr_str << "\n </span>";
353 ++scnt;
354 } else if (lstm_choice_mode == 2) {
355 hocr_str << "\n <span class='ocrx_cinfo'"
356 << " id='"
357 << "lstm_choices_" << page_id << "_" << wcnt << "_" << tcnt
358 << "'>";
359 do {
360 const char *choice = ci.GetUTF8Text();
361 float choiceconf = ci.Confidence();
362 if (choice != nullptr) {
363 hocr_str << "\n <span class='ocrx_cinfo'"
364 << " id='"
365 << "choice_" << page_id << "_" << wcnt << "_" << ccnt
366 << "'"
367 << " title='x_confs " << choiceconf << "'>"
368 << HOcrEscape(choice).c_str() << "</span>";
369 ccnt++;
370 }
371 } while (ci.Next());
372 hocr_str << "\n </span>";
373 tcnt++;
374 }
375 }
376 }
377 res_it->Next(RIL_SYMBOL);
378 } while (!res_it->Empty(RIL_BLOCK) && !res_it->IsAtBeginningOf(RIL_WORD));
379 if (italic) {
380 hocr_str << "</em>";
381 }
382 if (bold) {
383 hocr_str << "</strong>";
384 }
385 // If the lstm choice mode is required it is added here
386 if (lstm_choice_mode == 1 && !hocr_boxes && rawTimestepMap != nullptr) {
387 for (const auto &symbol : *rawTimestepMap) {
388 hocr_str << "\n <span class='ocr_symbol'"
389 << " id='"
390 << "symbol_" << page_id << "_" << wcnt << "_" << scnt << "'>";
391 for (const auto &timestep : symbol) {
392 hocr_str << "\n <span class='ocrx_cinfo'"
393 << " id='"
394 << "timestep" << page_id << "_" << wcnt << "_" << tcnt
395 << "'>";
396 for (auto &&conf : timestep) {
397 hocr_str << "\n <span class='ocrx_cinfo'"
398 << " id='"
399 << "choice_" << page_id << "_" << wcnt << "_" << ccnt
400 << "'"
401 << " title='x_confs " << int(conf.second * 100) << "'>"
402 << HOcrEscape(conf.first).c_str() << "</span>";
403 ++ccnt;
404 }
405 hocr_str << "</span>";
406 ++tcnt;
407 }
408 hocr_str << "</span>";
409 ++scnt;
410 }
411 } else if (lstm_choice_mode == 2 && !hocr_boxes && CTCMap != nullptr) {
412 for (const auto &timestep : *CTCMap) {
413 if (timestep.size() > 0) {
414 hocr_str << "\n <span class='ocrx_cinfo'"
415 << " id='"
416 << "lstm_choices_" << page_id << "_" << wcnt << "_" << tcnt
417 << "'>";
418 for (auto &j : timestep) {
419 float conf = 100 - tesseract_->lstm_rating_coefficient * j.second;
420 if (conf < 0.0f) {
421 conf = 0.0f;
422 }
423 if (conf > 100.0f) {
424 conf = 100.0f;
425 }
426 hocr_str << "\n <span class='ocrx_cinfo'"
427 << " id='"
428 << "choice_" << page_id << "_" << wcnt << "_" << ccnt
429 << "'"
430 << " title='x_confs " << conf << "'>"
431 << HOcrEscape(j.first).c_str() << "</span>";
432 ccnt++;
433 }
434 hocr_str << "</span>";
435 tcnt++;
436 }
437 }
438 }
439 // Close ocrx_word.
440 if (hocr_boxes || lstm_choice_mode > 0) {
441 hocr_str << "\n ";
442 }
443 hocr_str << "</span>";
444 tcnt = 1;
445 ccnt = 1;
446 wcnt++;
447 // Close any ending block/paragraph/textline.
448 if (last_word_in_line) {
449 hocr_str << "\n </span>";
450 lcnt++;
451 }
452 if (last_word_in_para) {
453 hocr_str << "\n </p>\n";
454 pcnt++;
455 para_is_ltr = true; // back to default direction
456 }
457 if (last_word_in_block) {
458 hocr_str << " </div>\n";
459 bcnt++;
460 }
461 }
462 hocr_str << " </div>\n";
463
464 return copy_string(hocr_str.str());
465 }
466
467 /**********************************************************************
468 * HOcr Text Renderer interface implementation
469 **********************************************************************/
470 TessHOcrRenderer::TessHOcrRenderer(const char *outputbase)
471 : TessResultRenderer(outputbase, "hocr") {
472 font_info_ = false;
473 }
474
475 TessHOcrRenderer::TessHOcrRenderer(const char *outputbase, bool font_info)
476 : TessResultRenderer(outputbase, "hocr") {
477 font_info_ = font_info;
478 }
479
480 bool TessHOcrRenderer::BeginDocumentHandler() {
481 AppendString(
482 "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n"
483 "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\"\n"
484 " \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\">\n"
485 "<html xmlns=\"http://www.w3.org/1999/xhtml\" xml:lang=\"en\" "
486 "lang=\"en\">\n <head>\n <title>");
487 AppendString(title());
488 AppendString(
489 "</title>\n"
490 " <meta http-equiv=\"Content-Type\" content=\"text/html;"
491 "charset=utf-8\"/>\n"
492 " <meta name='ocr-system' content='tesseract " TESSERACT_VERSION_STR
493 "' />\n"
494 " <meta name='ocr-capabilities' content='ocr_page ocr_carea ocr_par"
495 " ocr_line ocrx_word ocrp_dir ocrp_lang ocrp_wconf");
496 if (font_info_) {
497 AppendString(" ocrp_font ocrp_fsize");
498 }
499 AppendString(
500 "'/>\n"
501 " </head>\n"
502 " <body>\n");
503
504 return true;
505 }
506
507 bool TessHOcrRenderer::EndDocumentHandler() {
508 AppendString(" </body>\n</html>\n");
509
510 return true;
511 }
512
513 bool TessHOcrRenderer::AddImageHandler(TessBaseAPI *api) {
514 const std::unique_ptr<const char[]> hocr(api->GetHOCRText(imagenum()));
515 if (hocr == nullptr) {
516 return false;
517 }
518
519 AppendString(hocr.get());
520
521 return true;
522 }
523
524 } // namespace tesseract