comparison mupdf-source/thirdparty/tesseract/src/api/wordstrboxrenderer.cpp @ 2:b50eed0cc0ef upstream

ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4. The directory name has changed: no version number in the expanded directory now.
author Franz Glasner <fzglas.hg@dom66.de>
date Mon, 15 Sep 2025 11:43:07 +0200
parents
children
comparison
equal deleted inserted replaced
1:1d09e1dec1d9 2:b50eed0cc0ef
1 /**********************************************************************
2 * File: wordstrboxrenderer.cpp
3 * Description: Renderer for creating box file with WordStr strings.
4 * based on the tsv renderer.
5 *
6 * (C) Copyright 2019, Google Inc.
7 ** Licensed under the Apache License, Version 2.0 (the "License");
8 ** you may not use this file except in compliance with the License.
9 ** You may obtain a copy of the License at
10 ** http://www.apache.org/licenses/LICENSE-2.0
11 ** Unless required by applicable law or agreed to in writing, software
12 ** distributed under the License is distributed on an "AS IS" BASIS,
13 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 ** See the License for the specific language governing permissions and
15 ** limitations under the License.
16 *
17 **********************************************************************/
18
19 #include <tesseract/baseapi.h> // for TessBaseAPI
20 #include <tesseract/renderer.h>
21 #include "helpers.h" // for copy_string
22 #include "tesseractclass.h" // for Tesseract
23
24 namespace tesseract {
25
26 /**
27 * Create a UTF8 box file with WordStr strings from the internal data
28 * structures. page_number is a 0-base page index that will appear in the box
29 * file. Returned string must be freed with the delete [] operator.
30 */
31
32 char *TessBaseAPI::GetWordStrBoxText(int page_number = 0) {
33 if (tesseract_ == nullptr || (page_res_ == nullptr && Recognize(nullptr) < 0)) {
34 return nullptr;
35 }
36
37 std::string wordstr_box_str;
38 int left = 0, top = 0, right = 0, bottom = 0;
39
40 bool first_line = true;
41
42 LTRResultIterator *res_it = GetLTRIterator();
43 while (!res_it->Empty(RIL_BLOCK)) {
44 if (res_it->Empty(RIL_WORD)) {
45 res_it->Next(RIL_WORD);
46 continue;
47 }
48
49 if (res_it->IsAtBeginningOf(RIL_TEXTLINE)) {
50 if (!first_line) {
51 wordstr_box_str += "\n\t " + std::to_string(right + 1);
52 wordstr_box_str += " " + std::to_string(image_height_ - bottom);
53 wordstr_box_str += " " + std::to_string(right + 5);
54 wordstr_box_str += " " + std::to_string(image_height_ - top);
55 wordstr_box_str += " " + std::to_string(page_number); // row for tab for EOL
56 wordstr_box_str += "\n";
57 } else {
58 first_line = false;
59 }
60 // Use bounding box for whole line for WordStr
61 res_it->BoundingBox(RIL_TEXTLINE, &left, &top, &right, &bottom);
62 wordstr_box_str += "WordStr " + std::to_string(left);
63 wordstr_box_str += " " + std::to_string(image_height_ - bottom);
64 wordstr_box_str += " " + std::to_string(right);
65 wordstr_box_str += " " + std::to_string(image_height_ - top);
66 wordstr_box_str += " " + std::to_string(page_number); // word
67 wordstr_box_str += " #";
68 }
69 do {
70 wordstr_box_str += std::unique_ptr<const char[]>(res_it->GetUTF8Text(RIL_WORD)).get();
71 wordstr_box_str += " ";
72 res_it->Next(RIL_WORD);
73 } while (!res_it->Empty(RIL_BLOCK) && !res_it->IsAtBeginningOf(RIL_WORD));
74 }
75
76 if (left != 0 && top != 0 && right != 0 && bottom != 0) {
77 wordstr_box_str += "\n\t " + std::to_string(right + 1);
78 wordstr_box_str += " " + std::to_string(image_height_ - bottom);
79 wordstr_box_str += " " + std::to_string(right + 5);
80 wordstr_box_str += " " + std::to_string(image_height_ - top);
81 wordstr_box_str += " " + std::to_string(page_number); // row for tab for EOL
82 wordstr_box_str += "\n";
83 }
84 delete res_it;
85 return copy_string(wordstr_box_str);
86 }
87
88 /**********************************************************************
89 * WordStrBox Renderer interface implementation
90 **********************************************************************/
91 TessWordStrBoxRenderer::TessWordStrBoxRenderer(const char *outputbase)
92 : TessResultRenderer(outputbase, "box") {}
93
94 bool TessWordStrBoxRenderer::AddImageHandler(TessBaseAPI *api) {
95 const std::unique_ptr<const char[]> wordstrbox(api->GetWordStrBoxText(imagenum()));
96 if (wordstrbox == nullptr) {
97 return false;
98 }
99
100 AppendString(wordstrbox.get());
101
102 return true;
103 }
104
105 } // namespace tesseract.