comparison mupdf-source/thirdparty/tesseract/src/api/lstmboxrenderer.cpp @ 2:b50eed0cc0ef upstream

ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4. The directory name has changed: no version number in the expanded directory now.
author Franz Glasner <fzglas.hg@dom66.de>
date Mon, 15 Sep 2025 11:43:07 +0200
parents
children
comparison
equal deleted inserted replaced
1:1d09e1dec1d9 2:b50eed0cc0ef
1 /**********************************************************************
2 * File: lstmboxrenderer.cpp
3 * Description: Renderer for creating box file for LSTM training.
4 * based on the tsv renderer.
5 *
6 * (C) Copyright 2019, Google Inc.
7 ** Licensed under the Apache License, Version 2.0 (the "License");
8 ** you may not use this file except in compliance with the License.
9 ** You may obtain a copy of the License at
10 ** http://www.apache.org/licenses/LICENSE-2.0
11 ** Unless required by applicable law or agreed to in writing, software
12 ** distributed under the License is distributed on an "AS IS" BASIS,
13 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 ** See the License for the specific language governing permissions and
15 ** limitations under the License.
16 *
17 **********************************************************************/
18
19 #include <tesseract/baseapi.h> // for TessBaseAPI
20 #include <tesseract/renderer.h>
21 #include "helpers.h" // for copy_string
22 #include "tesseractclass.h" // for Tesseract
23
24 namespace tesseract {
25
26 /**
27 * Create a UTF8 box file for LSTM training from the internal data structures.
28 * page_number is a 0-based page index that will appear in the box file.
29 * Returned string must be freed with the delete [] operator.
30 */
/**
 * Appends the trailing coordinate fields of one box-file row to `text`.
 *
 * Four fields are written, each preceded by a single space, in this order:
 *   image_height - bottom, right + 5, image_height - top, page_num.
 * The caller is expected to have written the leading field(s) of the row
 * already; no newline is appended here.
 *
 * NOTE(review): subtracting from image_height presumably flips the y axis to a
 * bottom-up origin, and the reason for the fixed +5 on `right` is not visible
 * in this file - confirm before changing either.
 */
static void AddBoxToLSTM(int right, int bottom, int top, int image_height, int page_num,
                         std::string &text) {
  const int fields[] = {image_height - bottom, right + 5, image_height - top, page_num};
  for (const int value : fields) {
    text += ' ';
    text += std::to_string(value);
  }
}
38
39 char *TessBaseAPI::GetLSTMBoxText(int page_number = 0) {
40 if (tesseract_ == nullptr || (page_res_ == nullptr && Recognize(nullptr) < 0)) {
41 return nullptr;
42 }
43
44 std::string lstm_box_str;
45 bool first_word = true;
46 int left = 0, top = 0, right = 0, bottom = 0;
47
48 LTRResultIterator *res_it = GetLTRIterator();
49 while (!res_it->Empty(RIL_BLOCK)) {
50 if (res_it->Empty(RIL_SYMBOL)) {
51 res_it->Next(RIL_SYMBOL);
52 continue;
53 }
54 if (!first_word) {
55 if (!(res_it->IsAtBeginningOf(RIL_TEXTLINE))) {
56 if (res_it->IsAtBeginningOf(RIL_WORD)) {
57 lstm_box_str += " " + std::to_string(left);
58 AddBoxToLSTM(right, bottom, top, image_height_, page_number, lstm_box_str);
59 lstm_box_str += "\n"; // end of row for word
60 } // word
61 } else {
62 if (res_it->IsAtBeginningOf(RIL_TEXTLINE)) {
63 lstm_box_str += "\t " + std::to_string(left);
64 AddBoxToLSTM(right, bottom, top, image_height_, page_number, lstm_box_str);
65 lstm_box_str += "\n"; // end of row for line
66 } // line
67 }
68 } // not first word
69 first_word = false;
70 // Use bounding box for whole line for everything
71 res_it->BoundingBox(RIL_TEXTLINE, &left, &top, &right, &bottom);
72 do {
73 lstm_box_str += std::unique_ptr<const char[]>(res_it->GetUTF8Text(RIL_SYMBOL)).get();
74 res_it->Next(RIL_SYMBOL);
75 } while (!res_it->Empty(RIL_BLOCK) && !res_it->IsAtBeginningOf(RIL_SYMBOL));
76 lstm_box_str += " " + std::to_string(left);
77 AddBoxToLSTM(right, bottom, top, image_height_, page_number, lstm_box_str);
78 lstm_box_str += "\n"; // end of row for symbol
79 }
80 if (!first_word) { // if first_word is true => empty page
81 lstm_box_str += "\t " + std::to_string(left);
82 AddBoxToLSTM(right, bottom, top, image_height_, page_number, lstm_box_str);
83 lstm_box_str += "\n"; // end of PAGE
84 }
85 delete res_it;
86 return copy_string(lstm_box_str);
87 }
88
89 /**********************************************************************
90 * LSTMBox Renderer interface implementation
91 **********************************************************************/
92 TessLSTMBoxRenderer::TessLSTMBoxRenderer(const char *outputbase)
93 : TessResultRenderer(outputbase, "box") {}
94
95 bool TessLSTMBoxRenderer::AddImageHandler(TessBaseAPI *api) {
96 const std::unique_ptr<const char[]> lstmbox(api->GetLSTMBoxText(imagenum()));
97 if (lstmbox == nullptr) {
98 return false;
99 }
100
101 AppendString(lstmbox.get());
102
103 return true;
104 }
105
106 } // namespace tesseract.