diff mupdf-source/thirdparty/tesseract/src/ccmain/recogtraining.cpp @ 2:b50eed0cc0ef upstream

ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4. The directory name has changed: no version number in the expanded directory now.
author Franz Glasner <fzglas.hg@dom66.de>
date Mon, 15 Sep 2025 11:43:07 +0200
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/mupdf-source/thirdparty/tesseract/src/ccmain/recogtraining.cpp	Mon Sep 15 11:43:07 2025 +0200
@@ -0,0 +1,228 @@
+///////////////////////////////////////////////////////////////////////
+// File:        recogtraining.cpp
+// Description: Functions for ambiguity and parameter training.
+// Author:      Daria Antonova
+//
+// (C) Copyright 2009, Google Inc.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+///////////////////////////////////////////////////////////////////////
+
+#include "tesseractclass.h"
+
+#include "boxread.h"
+#include "control.h"
+#include "host.h" // for NearlyEqual
+#include "ratngs.h"
+#ifndef DISABLED_LEGACY_ENGINE
+#  include "reject.h"
+#endif
+#include "stopper.h"
+
+namespace tesseract {
+
+const int16_t kMaxBoxEdgeDiff = 2;
+
+// Sets flags necessary for recognition in the training mode.
+// Opens and returns the pointer to the output file.
+FILE *Tesseract::init_recog_training(const char *filename) {
+  if (tessedit_ambigs_training) {
+    tessedit_tess_adaption_mode.set_value(0); // turn off adaption
+    tessedit_enable_doc_dict.set_value(false); // turn off document dictionary
+    // Explore all segmentations.
+    getDict().stopper_no_acceptable_choices.set_value(true);
+  }
+
+  std::string output_fname = filename;
+  const char *lastdot = strrchr(output_fname.c_str(), '.');
+  if (lastdot != nullptr) {
+    output_fname[lastdot - output_fname.c_str()] = '\0';
+  }
+  output_fname += ".txt";
+  FILE *output_file = fopen(output_fname.c_str(), "a+");
+  if (output_file == nullptr) {
+    tprintf("Error: Could not open file %s\n", output_fname.c_str());
+    ASSERT_HOST(output_file);
+  }
+  return output_file;
+}
+
+// Copies the bounding box from page_res_it->word() to the given TBOX.
+static bool read_t(PAGE_RES_IT *page_res_it, TBOX *tbox) {
+  while (page_res_it->block() != nullptr && page_res_it->word() == nullptr) {
+    page_res_it->forward();
+  }
+
+  if (page_res_it->word() != nullptr) {
+    *tbox = page_res_it->word()->word->bounding_box();
+
+    // If tbox->left() is negative, the training image has vertical text and
+    // all the coordinates of bounding boxes of page_res are rotated by 90
+    // degrees in a counterclockwise direction. We need to rotate the TBOX back
+    // in order to compare with the TBOXes of box files.
+    if (tbox->left() < 0) {
+      tbox->rotate(FCOORD(0.0, -1.0));
+    }
+
+    return true;
+  } else {
+    return false;
+  }
+}
+
+// This function takes tif/box pair of files and runs recognition on the image,
+// while making sure that the word bounds that tesseract identified roughly
+// match to those specified by the input box file. For each word (ngram in a
+// single bounding box from the input box file) it outputs the ocred result,
+// the correct label, rating and certainty.
+void Tesseract::recog_training_segmented(const char *filename, PAGE_RES *page_res,
+                                         volatile ETEXT_DESC *monitor, FILE *output_file) {
+  std::string box_fname = filename;
+  const char *lastdot = strrchr(box_fname.c_str(), '.');
+  if (lastdot != nullptr) {
+    box_fname[lastdot - box_fname.c_str()] = '\0';
+  }
+  box_fname += ".box";
+  // ReadNextBox() will close box_file
+  FILE *box_file = fopen(box_fname.c_str(), "r");
+  if (box_file == nullptr) {
+    tprintf("Error: Could not open file %s\n", box_fname.c_str());
+    ASSERT_HOST(box_file);
+  }
+
+  PAGE_RES_IT page_res_it;
+  page_res_it.page_res = page_res;
+  page_res_it.restart_page();
+  std::string label;
+
+  // Process all the words on this page.
+  TBOX tbox; // tesseract-identified box
+  TBOX bbox; // box from the box file
+  bool keep_going;
+  int line_number = 0;
+  int examined_words = 0;
+  do {
+    keep_going = read_t(&page_res_it, &tbox);
+    keep_going &= ReadNextBox(applybox_page, &line_number, box_file, label, &bbox);
+    // Align bottom left points of the TBOXes.
+    while (keep_going && !NearlyEqual<int>(tbox.bottom(), bbox.bottom(), kMaxBoxEdgeDiff)) {
+      if (bbox.bottom() < tbox.bottom()) {
+        page_res_it.forward();
+        keep_going = read_t(&page_res_it, &tbox);
+      } else {
+        keep_going = ReadNextBox(applybox_page, &line_number, box_file, label, &bbox);
+      }
+    }
+    while (keep_going && !NearlyEqual<int>(tbox.left(), bbox.left(), kMaxBoxEdgeDiff)) {
+      if (bbox.left() > tbox.left()) {
+        page_res_it.forward();
+        keep_going = read_t(&page_res_it, &tbox);
+      } else {
+        keep_going = ReadNextBox(applybox_page, &line_number, box_file, label, &bbox);
+      }
+    }
+    // OCR the word if top right points of the TBOXes are similar.
+    if (keep_going && NearlyEqual<int>(tbox.right(), bbox.right(), kMaxBoxEdgeDiff) &&
+        NearlyEqual<int>(tbox.top(), bbox.top(), kMaxBoxEdgeDiff)) {
+      ambigs_classify_and_output(label.c_str(), &page_res_it, output_file);
+      examined_words++;
+    }
+    page_res_it.forward();
+  } while (keep_going);
+
+  // Set up scripts on all of the words that did not get sent to
+  // ambigs_classify_and_output.  They all should have, but if all the
+  // werd_res's don't get uch_sets, tesseract will crash when you try
+  // to iterate over them. :-(
+  int total_words = 0;
+  for (page_res_it.restart_page(); page_res_it.block() != nullptr; page_res_it.forward()) {
+    if (page_res_it.word()) {
+      if (page_res_it.word()->uch_set == nullptr) {
+        page_res_it.word()->SetupFake(unicharset);
+      }
+      total_words++;
+    }
+  }
+  if (examined_words < 0.85 * total_words) {
+    tprintf(
+        "TODO(antonova): clean up recog_training_segmented; "
+        " It examined only a small fraction of the ambigs image.\n");
+  }
+  tprintf("recog_training_segmented: examined %d / %d words.\n", examined_words, total_words);
+}
+
+// Helper prints the given set of blob choices.
+static void PrintPath(int length, const BLOB_CHOICE **blob_choices, const UNICHARSET &unicharset,
+                      const char *label, FILE *output_file) {
+  float rating = 0.0f;
+  float certainty = 0.0f;
+  for (int i = 0; i < length; ++i) {
+    const BLOB_CHOICE *blob_choice = blob_choices[i];
+    fprintf(output_file, "%s", unicharset.id_to_unichar(blob_choice->unichar_id()));
+    rating += blob_choice->rating();
+    if (certainty > blob_choice->certainty()) {
+      certainty = blob_choice->certainty();
+    }
+  }
+  fprintf(output_file, "\t%s\t%.4f\t%.4f\n", label, rating, certainty);
+}
+
+// Helper recursively prints all paths through the ratings matrix, starting
+// at column col.
+static void PrintMatrixPaths(int col, int dim, const MATRIX &ratings, int length,
+                             const BLOB_CHOICE **blob_choices, const UNICHARSET &unicharset,
+                             const char *label, FILE *output_file) {
+  for (int row = col; row < dim && row - col < ratings.bandwidth(); ++row) {
+    if (ratings.get(col, row) != NOT_CLASSIFIED) {
+      BLOB_CHOICE_IT bc_it(ratings.get(col, row));
+      for (bc_it.mark_cycle_pt(); !bc_it.cycled_list(); bc_it.forward()) {
+        blob_choices[length] = bc_it.data();
+        if (row + 1 < dim) {
+          PrintMatrixPaths(row + 1, dim, ratings, length + 1, blob_choices, unicharset, label,
+                           output_file);
+        } else {
+          PrintPath(length + 1, blob_choices, unicharset, label, output_file);
+        }
+      }
+    }
+  }
+}
+
+// Runs classify_word_pass1() on the current word. Outputs Tesseract's
+// raw choice as a result of the classification. For words labeled with a
+// single unichar also outputs all alternatives from blob_choices of the
+// best choice.
+void Tesseract::ambigs_classify_and_output(const char *label, PAGE_RES_IT *pr_it,
+                                           FILE *output_file) {
+  // Classify word.
+  fflush(stdout);
+  WordData word_data(*pr_it);
+  SetupWordPassN(1, &word_data);
+  classify_word_and_language(1, pr_it, &word_data);
+  WERD_RES *werd_res = word_data.word;
+  WERD_CHOICE *best_choice = werd_res->best_choice;
+  ASSERT_HOST(best_choice != nullptr);
+
+  // Compute the number of unichars in the label.
+  std::vector<UNICHAR_ID> encoding;
+  if (!unicharset.encode_string(label, true, &encoding, nullptr, nullptr)) {
+    tprintf("Not outputting illegal unichar %s\n", label);
+    return;
+  }
+
+  // Dump all paths through the ratings matrix (which is normally small).
+  int dim = werd_res->ratings->dimension();
+  const auto **blob_choices = new const BLOB_CHOICE *[dim];
+  PrintMatrixPaths(0, dim, *werd_res->ratings, 0, blob_choices, unicharset, label, output_file);
+  delete[] blob_choices;
+}
+
+} // namespace tesseract