Mercurial > hgrepos > Python2 > PyMuPDF
diff mupdf-source/thirdparty/tesseract/src/api/lstmboxrenderer.cpp @ 2:b50eed0cc0ef upstream
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
The directory name has changed: no version number in the expanded directory now.
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Mon, 15 Sep 2025 11:43:07 +0200 |
| parents | |
| children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/mupdf-source/thirdparty/tesseract/src/api/lstmboxrenderer.cpp Mon Sep 15 11:43:07 2025 +0200 @@ -0,0 +1,106 @@ +/********************************************************************** + * File: lstmboxrenderer.cpp + * Description: Renderer for creating box file for LSTM training. + * based on the tsv renderer. + * + * (C) Copyright 2019, Google Inc. + ** Licensed under the Apache License, Version 2.0 (the "License"); + ** you may not use this file except in compliance with the License. + ** You may obtain a copy of the License at + ** http://www.apache.org/licenses/LICENSE-2.0 + ** Unless required by applicable law or agreed to in writing, software + ** distributed under the License is distributed on an "AS IS" BASIS, + ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + ** See the License for the specific language governing permissions and + ** limitations under the License. + * + **********************************************************************/ + +#include <tesseract/baseapi.h> // for TessBaseAPI +#include <tesseract/renderer.h> +#include "helpers.h" // for copy_string +#include "tesseractclass.h" // for Tesseract + +namespace tesseract { + +/** + * Create a UTF8 box file for LSTM training from the internal data structures. + * page_number is a 0-base page index that will appear in the box file. + * Returned string must be freed with the delete [] operator. + */ +static void AddBoxToLSTM(int right, int bottom, int top, int image_height, int page_num, + std::string &text) { + text += " " + std::to_string(image_height - bottom); + text += " " + std::to_string(right + 5); + text += " " + std::to_string(image_height - top); + text += " " + std::to_string(page_num); +} + +char *TessBaseAPI::GetLSTMBoxText(int page_number = 0) { + if (tesseract_ == nullptr || (page_res_ == nullptr && Recognize(nullptr) < 0)) { + return nullptr; + } + + std::string lstm_box_str; + bool first_word = true; + int left = 0, top = 0, right = 0, bottom = 0; + + LTRResultIterator *res_it = GetLTRIterator(); + while (!res_it->Empty(RIL_BLOCK)) { + if (res_it->Empty(RIL_SYMBOL)) { + res_it->Next(RIL_SYMBOL); + continue; + } + if (!first_word) { + if (!(res_it->IsAtBeginningOf(RIL_TEXTLINE))) { + if (res_it->IsAtBeginningOf(RIL_WORD)) { + lstm_box_str += " " + std::to_string(left); + AddBoxToLSTM(right, bottom, top, image_height_, page_number, lstm_box_str); + lstm_box_str += "\n"; // end of row for word + } // word + } else { + if (res_it->IsAtBeginningOf(RIL_TEXTLINE)) { + lstm_box_str += "\t " + std::to_string(left); + AddBoxToLSTM(right, bottom, top, image_height_, page_number, lstm_box_str); + lstm_box_str += "\n"; // end of row for line + } // line + } + } // not first word + first_word = false; + // Use bounding box for whole line for everything + res_it->BoundingBox(RIL_TEXTLINE, &left, &top, &right, &bottom); + do { + lstm_box_str += std::unique_ptr<const char[]>(res_it->GetUTF8Text(RIL_SYMBOL)).get(); + res_it->Next(RIL_SYMBOL); + } while (!res_it->Empty(RIL_BLOCK) && !res_it->IsAtBeginningOf(RIL_SYMBOL)); + lstm_box_str += " " + std::to_string(left); + AddBoxToLSTM(right, bottom, top, image_height_, page_number, lstm_box_str); + lstm_box_str += "\n"; // end of row for symbol + } + if (!first_word) { // if first_word is true => empty page + lstm_box_str += "\t " + std::to_string(left); + AddBoxToLSTM(right, bottom, top, image_height_, page_number, lstm_box_str); + lstm_box_str += "\n"; // end of PAGE + } + delete res_it; + return copy_string(lstm_box_str); +} + +/********************************************************************** + * LSTMBox Renderer interface implementation + **********************************************************************/ +TessLSTMBoxRenderer::TessLSTMBoxRenderer(const char *outputbase) + : TessResultRenderer(outputbase, "box") {} + +bool TessLSTMBoxRenderer::AddImageHandler(TessBaseAPI *api) { + const std::unique_ptr<const char[]> lstmbox(api->GetLSTMBoxText(imagenum())); + if (lstmbox == nullptr) { + return false; + } + + AppendString(lstmbox.get()); + + return true; +} + +} // namespace tesseract.
