diff mupdf-source/thirdparty/tesseract/src/api/lstmboxrenderer.cpp @ 2:b50eed0cc0ef upstream

ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4. The directory name has changed: no version number in the expanded directory now.
author Franz Glasner <fzglas.hg@dom66.de>
date Mon, 15 Sep 2025 11:43:07 +0200
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/mupdf-source/thirdparty/tesseract/src/api/lstmboxrenderer.cpp	Mon Sep 15 11:43:07 2025 +0200
@@ -0,0 +1,106 @@
+/**********************************************************************
+ * File:        lstmboxrenderer.cpp
+ * Description: Renderer for creating box file for LSTM training.
+ *              based on the tsv renderer.
+ *
+ * (C) Copyright 2019, Google Inc.
+ ** Licensed under the Apache License, Version 2.0 (the "License");
+ ** you may not use this file except in compliance with the License.
+ ** You may obtain a copy of the License at
+ ** http://www.apache.org/licenses/LICENSE-2.0
+ ** Unless required by applicable law or agreed to in writing, software
+ ** distributed under the License is distributed on an "AS IS" BASIS,
+ ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ ** See the License for the specific language governing permissions and
+ ** limitations under the License.
+ *
+ **********************************************************************/
+
+#include <tesseract/baseapi.h> // for TessBaseAPI
+#include <tesseract/renderer.h>
+#include "helpers.h"        // for copy_string
+#include "tesseractclass.h" // for Tesseract
+
+namespace tesseract {
+
+/**
+ * Documentation for TessBaseAPI::GetLSTMBoxText(), which is defined further
+ * down in this file:
+ * Create a UTF8 box file for LSTM training from the internal data structures.
+ * page_number is a 0-based page index that will appear in the box file.
+ * Returned string must be freed with the delete [] operator.
+ */
+// Appends the four trailing columns of a box row to `text`, each preceded by
+// one space, in this order: image_height - bottom, right + 5,
+// image_height - top, page_num. Nothing else is written: no symbol text, no
+// left coordinate and no trailing newline.
+static void AddBoxToLSTM(int right, int bottom, int top, int image_height, int page_num,
+                         std::string &text) {
+  // The array order is the column order of the emitted row.
+  const int fields[] = {image_height - bottom, right + 5, image_height - top, page_num};
+  for (const int field : fields) {
+    text += ' ';
+    text += std::to_string(field);
+  }
+}
+
+/**
+ * Build the LSTM-training box text for the current page.
+ *
+ * One row is emitted per symbol:
+ *   <symbol> <left> <image_height - bottom> <right + 5> <image_height - top> <page_number>
+ * The coordinates are those of the text line containing the symbol, not of
+ * the symbol itself. A row whose symbol field is a single space is emitted
+ * between words, and a row whose symbol field is a tab is emitted at the end
+ * of every text line, including the last line of the page.
+ *
+ * page_number is written unchanged into the last column of every row.
+ *
+ * Returns nullptr if tesseract_ is null, if there is no page result and
+ * Recognize(nullptr) reports failure, or if no result iterator can be
+ * obtained. Otherwise returns a copy of the text made with copy_string();
+ * the caller must free it with the delete [] operator.
+ */
+char *TessBaseAPI::GetLSTMBoxText(int page_number = 0) {
+  if (tesseract_ == nullptr || (page_res_ == nullptr && Recognize(nullptr) < 0)) {
+    return nullptr;
+  }
+
+  // Own the iterator so that it is released on every exit path, including an
+  // exception thrown while the output string grows.
+  const std::unique_ptr<LTRResultIterator> res_it(GetLTRIterator());
+  if (res_it == nullptr) {
+    // NOTE(review): not expected after the guard above, but dereferencing a
+    // null iterator would crash, so treat it as a failure.
+    return nullptr;
+  }
+
+  std::string lstm_box_str;
+  bool first_word = true;
+  int left = 0, top = 0, right = 0, bottom = 0;
+
+  // Completes one row. `prefix` is whatever must precede the left coordinate:
+  // the separator alone after symbol text, or a space/tab symbol plus the
+  // separator for word and line rows.
+  const auto finish_row = [&](const char *prefix) {
+    lstm_box_str += prefix;
+    lstm_box_str += std::to_string(left);
+    AddBoxToLSTM(right, bottom, top, image_height_, page_number, lstm_box_str);
+    lstm_box_str += '\n';
+  };
+
+  while (!res_it->Empty(RIL_BLOCK)) {
+    if (res_it->Empty(RIL_SYMBOL)) {
+      res_it->Next(RIL_SYMBOL);
+      continue;
+    }
+    if (!first_word) {
+      // The box still holds the values fetched for the previous symbol, so a
+      // separator row carries the coordinates of the line that just ended.
+      if (res_it->IsAtBeginningOf(RIL_TEXTLINE)) {
+        finish_row("\t "); // end of row for line
+      } else if (res_it->IsAtBeginningOf(RIL_WORD)) {
+        finish_row("  "); // end of row for word
+      }
+    } // not first word
+    first_word = false;
+    // Use bounding box for whole line for everything
+    res_it->BoundingBox(RIL_TEXTLINE, &left, &top, &right, &bottom);
+    do {
+      const std::unique_ptr<const char[]> symbol(res_it->GetUTF8Text(RIL_SYMBOL));
+      if (symbol != nullptr) { // appending a null char* to std::string is undefined
+        lstm_box_str += symbol.get();
+      }
+      res_it->Next(RIL_SYMBOL);
+    } while (!res_it->Empty(RIL_BLOCK) && !res_it->IsAtBeginningOf(RIL_SYMBOL));
+    finish_row(" "); // end of row for symbol
+  }
+  if (!first_word) { // if first_word is true  => empty page
+    finish_row("\t "); // end of PAGE
+  }
+  return copy_string(lstm_box_str);
+}
+
+/**********************************************************************
+ * LSTMBox Renderer interface implementation
+ **********************************************************************/
+// Constructs the renderer by forwarding outputbase and the literal "box" to
+// the TessResultRenderer base class (presumably the output base name and the
+// output file extension -- confirm against TessResultRenderer).
+TessLSTMBoxRenderer::TessLSTMBoxRenderer(const char *outputbase)
+    : TessResultRenderer(outputbase, "box") {}
+
+// Asks the API for the LSTM box text of page index imagenum() and appends it
+// to the renderer output. Returns false, appending nothing, when the API
+// returns nullptr; returns true otherwise.
+bool TessLSTMBoxRenderer::AddImageHandler(TessBaseAPI *api) {
+  // The smart pointer frees the returned buffer with delete[] on scope exit.
+  const std::unique_ptr<const char[]> box_text(api->GetLSTMBoxText(imagenum()));
+  const bool have_text = static_cast<bool>(box_text);
+  if (have_text) {
+    AppendString(box_text.get());
+  }
+  return have_text;
+}
+
+} // namespace tesseract.