diff mupdf-source/thirdparty/tesseract/src/ccmain/output.cpp @ 2:b50eed0cc0ef upstream

ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4. The directory name has changed: no version number in the expanded directory now.
author Franz Glasner <fzglas.hg@dom66.de>
date Mon, 15 Sep 2025 11:43:07 +0200
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/mupdf-source/thirdparty/tesseract/src/ccmain/output.cpp	Mon Sep 15 11:43:07 2025 +0200
@@ -0,0 +1,415 @@
+/******************************************************************
+ * File:        output.cpp  (Formerly output.c)
+ * Description: Output pass
+ * Author:      Phil Cheatle
+ *
+ * (C) Copyright 1994, Hewlett-Packard Ltd.
+ ** Licensed under the Apache License, Version 2.0 (the "License");
+ ** you may not use this file except in compliance with the License.
+ ** You may obtain a copy of the License at
+ ** http://www.apache.org/licenses/LICENSE-2.0
+ ** Unless required by applicable law or agreed to in writing, software
+ ** distributed under the License is distributed on an "AS IS" BASIS,
+ ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ ** See the License for the specific language governing permissions and
+ ** limitations under the License.
+ *
+ **********************************************************************/
+
+#include "output.h"
+
+#include "control.h"
+#include "tesseractclass.h"
+#include "tessvars.h"
+#ifndef DISABLED_LEGACY_ENGINE
+#  include "docqual.h"
+#  include "reject.h"
+#endif
+
+#include "helpers.h"
+
+#include <cctype>
+#include <cerrno>
+#include <cstring>
+
+#define CTRL_NEWLINE '\012'  // newline
+#define CTRL_HARDLINE '\015' // cr
+
+namespace tesseract {
+/**
+ * Output pass: visits every word of the page in order and passes it to
+ * write_results() together with its newline type.
+ *
+ * @param page_res_it      Page iterator. It is restarted here and advanced
+ *                         until no word is left.
+ * @param target_word_box  Optional filter. When non-null, words whose
+ *                         bounding-box centre is not contained in this box
+ *                         are skipped.
+ */
+void Tesseract::output_pass( // Tess output pass //send to api
+    PAGE_RES_IT &page_res_it, const TBOX *target_word_box) {
+  BLOCK_RES *block_of_last_word = nullptr;
+
+  for (page_res_it.restart_page(); page_res_it.word() != nullptr; page_res_it.forward()) {
+    check_debug_pt(page_res_it.word(), 120);
+
+    if (target_word_box) {
+      // Only words centred inside the target box are processed.
+      TBOX current_word_box = page_res_it.word()->word->bounding_box();
+      const auto centre_x = (current_word_box.right() + current_word_box.left()) / 2;
+      const auto centre_y = (current_word_box.bottom() + current_word_box.top()) / 2;
+      FCOORD center_pt(centre_x, centre_y);
+      if (!target_word_box->contains(center_pt)) {
+        continue; // the loop header advances the iterator
+      }
+    }
+
+    // Remember the block of the most recent word while block separators
+    // are enabled. The value is not read anywhere else in this function.
+    if (tessedit_write_block_separators && block_of_last_word != page_res_it.block()) {
+      block_of_last_word = page_res_it.block();
+    }
+
+    // An end of line is forced at a block change (when block separators are
+    // enabled) and after the very last word.
+    const bool force_eol =
+        (tessedit_write_block_separators && (page_res_it.block() != page_res_it.next_block())) ||
+        (page_res_it.next_word() == nullptr);
+
+    WERD *nextword =
+        (page_res_it.next_word() != nullptr) ? page_res_it.next_word()->word : nullptr;
+    BLOCK *nextblock =
+        (page_res_it.next_block() != nullptr) ? page_res_it.next_block()->block : nullptr;
+
+    // regardless of tilde crunching
+    const char newline_type = determine_newline_type(
+        page_res_it.word()->word, page_res_it.block()->block, nextword, nextblock);
+    write_results(page_res_it, newline_type, force_eol);
+  }
+}
+
+/*************************************************************************
+ * write_results()
+ *
+ * Called once per word after all recognition and rejection has been done.
+ * Earlier descriptions of this routine listed .txt/.raw/.map output files;
+ * the code below writes no files. What it does:
+ *   - keeps the stats_ flags (tilde_crunch_written, last_char_was_tilde,
+ *     last_char_was_newline, write_results_empty_block) up to date;
+ *   - for words that are not tilde crunched: calls MergeAdjacentBlobs(0)
+ *     when the word starts with the " " unichar directly after a previous
+ *     tilde, runs set_unlv_suspects(), and applies the
+ *     tessedit_zero_rejection / tessedit_minimal_rejection overrides to
+ *     the word's reject map.
+ *
+ * page_res_it  - iterator positioned on the word to process.
+ * newline_type - 0 when the word does not end a line, otherwise a newline
+ *                character code.
+ * force_eol    - true when an end of line is required after this word
+ *                regardless of tilde crunching.
+ *************************************************************************/
+void Tesseract::write_results(PAGE_RES_IT &page_res_it,
+                              char newline_type, // type of newline
+                              bool force_eol) {  // override tilde crunch?
+  WERD_RES *word = page_res_it.word();
+  const UNICHARSET &uchset = *word->uch_set;
+  UNICHAR_ID space = uchset.unichar_to_id(" ");
+
+  /* Crunched or empty words: only the bookkeeping flags are updated. This
+     branch is skipped when zero-kelvin rejection or word-for-word output is
+     requested, so empty words can reach the normal processing below. */
+  if ((word->unlv_crunch_mode != CR_NONE || word->best_choice->empty()) &&
+      !tessedit_zero_kelvin_rejection && !tessedit_word_for_word) {
+    bool need_reject = false;
+    if ((word->unlv_crunch_mode != CR_DELETE) &&
+        (!stats_.tilde_crunch_written ||
+         ((word->unlv_crunch_mode == CR_KEEP_SPACE) && (word->word->space() > 0) &&
+          !word->word->flag(W_FUZZY_NON) && !word->word->flag(W_FUZZY_SP)))) {
+      if (!word->word->flag(W_BOL) && (word->word->space() > 0) && !word->word->flag(W_FUZZY_NON) &&
+          !word->word->flag(W_FUZZY_SP)) {
+        stats_.last_char_was_tilde = false;
+      }
+      need_reject = true;
+    }
+    if ((need_reject && !stats_.last_char_was_tilde) ||
+        (force_eol && stats_.write_results_empty_block)) {
+      /* Write a reject char - mark as rejected unless zero_rejection mode */
+      stats_.last_char_was_tilde = true;
+      stats_.tilde_crunch_written = true;
+      stats_.last_char_was_newline = false;
+      stats_.write_results_empty_block = false;
+    }
+
+    if ((word->word->flag(W_EOL) && !stats_.last_char_was_newline) || force_eol) {
+      stats_.tilde_crunch_written = false;
+      stats_.last_char_was_newline = true;
+      stats_.last_char_was_tilde = false;
+    }
+
+    if (force_eol) {
+      stats_.write_results_empty_block = true;
+    }
+    return;
+  }
+
+  /* NORMAL PROCESSING of non tilde crunched words */
+
+  stats_.tilde_crunch_written = false;
+  stats_.last_char_was_newline = (newline_type != 0);
+  stats_.write_results_empty_block = force_eol; // about to write a real word
+
+  /* The empty() test protects unichar_id(0): an empty best choice gets here
+     when the early-return branch above was bypassed. */
+  if (unlv_tilde_crunching && stats_.last_char_was_tilde && (word->word->space() == 0) &&
+      !(word->word->flag(W_REP_CHAR) && tessedit_write_rep_codes) &&
+      !word->best_choice->empty() && (word->best_choice->unichar_id(0) == space)) {
+    /* Prevent adjacent tilde across words - we know that adjacent tildes within
+   words have been removed */
+    word->MergeAdjacentBlobs(0);
+  }
+  if (newline_type || (word->word->flag(W_REP_CHAR) && tessedit_write_rep_codes)) {
+    stats_.last_char_was_tilde = false;
+  } else {
+    if (word->reject_map.length() > 0) {
+      /* The flag follows the last character of the word. */
+      stats_.last_char_was_tilde =
+          (word->best_choice->unichar_id(word->reject_map.length() - 1) == space);
+    } else if (word->word->space() > 0) {
+      stats_.last_char_was_tilde = false;
+    }
+    /* else it is unchanged as there are no output chars */
+  }
+
+  ASSERT_HOST(word->best_choice->length() == word->reject_map.length());
+
+  set_unlv_suspects(word);
+  check_debug_pt(word, 120);
+  if (tessedit_rejection_debug) {
+    tprintf("Dict word: \"%s\": %d\n", word->best_choice->debug_string().c_str(),
+            dict_word(*(word->best_choice)));
+  }
+  if (!word->word->flag(W_REP_CHAR) || !tessedit_write_rep_codes) {
+    if (tessedit_zero_rejection) {
+      /* OVERRIDE ALL REJECTION MECHANISMS - accept every rejected char */
+      for (unsigned i = 0; i < word->best_choice->length(); ++i) {
+        if (word->reject_map[i].rejected()) {
+          word->reject_map[i].setrej_minimal_rej_accept();
+        }
+      }
+    }
+    if (tessedit_minimal_rejection) {
+      /* OVERRIDE ALL REJECTION MECHANISMS - ONLY REJECT TESS FAILURES
+         (chars whose unichar is " " stay rejected) */
+      for (unsigned i = 0; i < word->best_choice->length(); ++i) {
+        if ((word->best_choice->unichar_id(i) != space) && word->reject_map[i].rejected()) {
+          word->reject_map[i].setrej_minimal_rej_accept();
+        }
+      }
+    }
+  }
+}
+
+/**********************************************************************
+ * determine_newline_type
+ *
+ * Decide which kind of line end follows a word.
+ * Returns 0 when the word does not carry W_EOL, CTRL_NEWLINE when the next
+ * word is missing or lies in another block, and CTRL_HARDLINE when the next
+ * word has leading space or would have fitted in the gap left at the end
+ * of the current line.
+ **********************************************************************/
+
+char determine_newline_type( // test line ends
+    WERD *word,              // word to do
+    BLOCK *block,            // current block
+    WERD *next_word,         // next word
+    BLOCK *next_block        // block of next word
+) {
+  if (!word->flag(W_EOL)) {
+    return '\0'; // not end of line
+  }
+
+  const bool continues_in_block =
+      next_word != nullptr && next_block != nullptr && block == next_block;
+  if (!continues_in_block) {
+    return CTRL_NEWLINE;
+  }
+
+  if (next_word->space() > 0) {
+    return CTRL_HARDLINE; // it is tabbed
+  }
+
+  TBOX word_box = word->bounding_box();
+  TBOX next_box = next_word->bounding_box();
+  TBOX block_box = block->pdblk.bounding_box();
+
+  // Room left between the word and the right edge of the block, less one
+  // block space.
+  int16_t end_gap = block_box.right() - word_box.right();
+  end_gap -= static_cast<int32_t>(block->space());
+
+  // Horizontal extent of the next word.
+  const int16_t width = next_box.right() - next_box.left();
+
+  if (end_gap > width) {
+    return CTRL_HARDLINE;
+  }
+  return CTRL_NEWLINE;
+}
+
+/*************************************************************************
+ * get_rep_char()
+ * Return the unichar id at the first position of the word that is not
+ * rejected in its reject map. If every position is rejected (or the map is
+ * empty) the id of unrecognised_char is returned instead.
+ *************************************************************************/
+UNICHAR_ID Tesseract::get_rep_char(WERD_RES *word) { // what char is repeated?
+  const int map_len = word->reject_map.length();
+
+  // Skip the leading run of rejected characters.
+  int first_accepted = 0;
+  while (first_accepted < map_len && word->reject_map[first_accepted].rejected()) {
+    ++first_accepted;
+  }
+
+  if (first_accepted >= map_len) {
+    return word->uch_set->unichar_to_id(unrecognised_char.c_str());
+  }
+  return word->best_choice->unichar_id(first_accepted);
+}
+
+/*************************************************************************
+ * SUSPECT LEVELS
+ *
+ * 0 - don't reject ANYTHING
+ * 1,2 - partial rejection
+ * 3 - BEST
+ *
+ * NOTE: to reject JUST tess failures in the .map file set suspect_level 3 and
+ * tessedit_minimal_rejection.
+ *
+ * set_unlv_suspects() relaxes the reject map of word_res according to
+ * suspect_level: at level 0 every rejected character is accepted, at level 3
+ * and above the map is left untouched, and at levels 1 and 2 selected
+ * rejections are turned into "minimal reject accept" as described by the
+ * comments in the body. A word with an empty reject map is left unchanged.
+ *************************************************************************/
+void Tesseract::set_unlv_suspects(WERD_RES *word_res) {
+  const int len = word_res->reject_map.length();
+  const WERD_CHOICE &word = *(word_res->best_choice);
+  const UNICHARSET &uchset = *word.unicharset();
+
+  if (suspect_level == 0) {
+    for (int i = 0; i < len; i++) {
+      if (word_res->reject_map[i].rejected()) {
+        word_res->reject_map[i].setrej_minimal_rej_accept();
+      }
+    }
+    return;
+  }
+
+  if (suspect_level >= 3) {
+    return; // Use defaults
+  }
+
+  /* NOW FOR LEVELS 1 and 2 Find some stuff to unreject*/
+
+  if (safe_dict_word(word_res) && (count_alphas(word) > suspect_short_words)) {
+    /* Unreject alphas in dictionary words */
+    for (int i = 0; i < len; ++i) {
+      if (word_res->reject_map[i].rejected() && uchset.get_isalpha(word.unichar_id(i))) {
+        word_res->reject_map[i].setrej_minimal_rej_accept();
+      }
+    }
+  }
+
+  /* Nothing left to unreject in an empty map. Returning here also avoids
+     dividing the rating by zero below; every later step only loops over
+     the len entries of the reject map. */
+  if (len == 0) {
+    return;
+  }
+
+  const float rating_per_ch = word.rating() / len;
+
+  if (rating_per_ch >= suspect_rating_per_ch) {
+    return; // Don't touch bad ratings
+  }
+
+  if ((word_res->tess_accepted) || (rating_per_ch < suspect_accept_rating)) {
+    /* Unreject any Tess Acceptable word - but NOT tess reject chs*/
+    for (int i = 0; i < len; ++i) {
+      if (word_res->reject_map[i].rejected() && (!uchset.eq(word.unichar_id(i), " "))) {
+        word_res->reject_map[i].setrej_minimal_rej_accept();
+      }
+    }
+  }
+
+  /* Unreject characters that were rejected at document, block or row level */
+  for (int i = 0; i < len; i++) {
+    if (word_res->reject_map[i].rejected()) {
+      if (word_res->reject_map[i].flag(R_DOC_REJ)) {
+        word_res->reject_map[i].setrej_minimal_rej_accept();
+      }
+      if (word_res->reject_map[i].flag(R_BLOCK_REJ)) {
+        word_res->reject_map[i].setrej_minimal_rej_accept();
+      }
+      if (word_res->reject_map[i].flag(R_ROW_REJ)) {
+        word_res->reject_map[i].setrej_minimal_rej_accept();
+      }
+    }
+  }
+
+  if (suspect_level == 2) {
+    return;
+  }
+
+  /* Level 1 only from here on */
+
+  if (!suspect_constrain_1Il || (len <= suspect_short_words)) {
+    for (int i = 0; i < len; i++) {
+      if (word_res->reject_map[i].rejected()) {
+        if ((word_res->reject_map[i].flag(R_1IL_CONFLICT) ||
+             word_res->reject_map[i].flag(R_POSTNN_1IL))) {
+          word_res->reject_map[i].setrej_minimal_rej_accept();
+        }
+
+        if (!suspect_constrain_1Il && word_res->reject_map[i].flag(R_MM_REJECT)) {
+          word_res->reject_map[i].setrej_minimal_rej_accept();
+        }
+      }
+    }
+  }
+
+  /* Longer words that look like an acceptable word or number: unreject
+     everything that is not permanently rejected, plus the 1Il / MM rejects */
+  if (acceptable_word_string(*word_res->uch_set, word.unichar_string().c_str(),
+                             word.unichar_lengths().c_str()) != AC_UNACCEPTABLE ||
+      acceptable_number_string(word.unichar_string().c_str(), word.unichar_lengths().c_str())) {
+    if (len > suspect_short_words) {
+      for (int i = 0; i < len; i++) {
+        if (word_res->reject_map[i].rejected() && (!word_res->reject_map[i].perm_rejected() ||
+                                                   word_res->reject_map[i].flag(R_1IL_CONFLICT) ||
+                                                   word_res->reject_map[i].flag(R_POSTNN_1IL) ||
+                                                   word_res->reject_map[i].flag(R_MM_REJECT))) {
+          word_res->reject_map[i].setrej_minimal_rej_accept();
+        }
+      }
+    }
+  }
+}
+
+/// Number of positions in `word` whose unichar is alphabetic according to
+/// the word's own unicharset.
+int16_t Tesseract::count_alphas(const WERD_CHOICE &word) {
+  const UNICHARSET *charset = word.unicharset();
+  const auto word_len = word.length();
+  int alpha_total = 0;
+  for (unsigned pos = 0; pos < word_len; ++pos) {
+    alpha_total += charset->get_isalpha(word.unichar_id(pos)) ? 1 : 0;
+  }
+  return alpha_total;
+}
+
+/// Number of positions in `word` whose unichar is alphabetic or a digit
+/// according to the word's own unicharset.
+int16_t Tesseract::count_alphanums(const WERD_CHOICE &word) {
+  const UNICHARSET *charset = word.unicharset();
+  const auto word_len = word.length();
+  int alnum_total = 0;
+  for (unsigned pos = 0; pos < word_len; ++pos) {
+    const UNICHAR_ID id = word.unichar_id(pos);
+    if (charset->get_isalpha(id) || charset->get_isdigit(id)) {
+      ++alnum_total;
+    }
+  }
+  return alnum_total;
+}
+
+/**
+ * Tests whether a word looks like a number.
+ *
+ * @param s        Concatenated bytes of the word's unichars, '\0' terminated.
+ * @param lengths  One entry per unichar holding its byte length in `s`,
+ *                 '\0' terminated.
+ * @return true when the string consists of: an optional '(' , an optional
+ *         one of '$' '.' '+' '-', then digits where a single '.', ',' or '-'
+ *         may follow a digit, optionally finished by '%', ')' or "%)" after
+ *         a digit. An empty string is accepted.
+ *
+ * `s` and `lengths` are advanced together, including over the optional
+ * leading characters, so each byte length is always applied to the unichar
+ * it belongs to.
+ */
+bool Tesseract::acceptable_number_string(const char *s, const char *lengths) {
+  bool prev_digit = false;
+
+  // Optional opening parenthesis.
+  if (*lengths == 1 && *s == '(') {
+    s++;
+    lengths++;
+  }
+
+  // Optional currency / sign / leading decimal point.
+  if (*lengths == 1 && ((*s == '$') || (*s == '.') || (*s == '+') || (*s == '-'))) {
+    s++;
+    lengths++;
+  }
+
+  for (; *s != '\0'; s += *(lengths++)) {
+    if (unicharset.get_isdigit(s, *lengths)) {
+      prev_digit = true;
+    } else if (prev_digit && (*lengths == 1 && ((*s == '.') || (*s == ',') || (*s == '-')))) {
+      // A single separator is allowed only directly after a digit.
+      prev_digit = false;
+    } else if (prev_digit && *lengths == 1 && (*(s + *lengths) == '\0') &&
+               ((*s == '%') || (*s == ')'))) {
+      // Trailing '%' or ')' as the very last character.
+      return true;
+    } else if (prev_digit && *lengths == 1 && (*s == '%') &&
+               (*(lengths + 1) == 1 && *(s + *lengths) == ')') &&
+               (*(s + *lengths + *(lengths + 1)) == '\0')) {
+      // Trailing "%)" as the last two characters.
+      return true;
+    } else {
+      return false;
+    }
+  }
+  return true;
+}
+} // namespace tesseract