Mercurial > hgrepos > Python2 > PyMuPDF

diff mupdf-source/thirdparty/tesseract/src/ccstruct/pageres.cpp @ 2:b50eed0cc0ef upstream
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4. The directory name has changed: no version number in the expanded directory now.
author: Franz Glasner <fzglas.hg@dom66.de>
date: Mon, 15 Sep 2025 11:43:07 +0200
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/mupdf-source/thirdparty/tesseract/src/ccstruct/pageres.cpp	Mon Sep 15 11:43:07 2025 +0200
@@ -0,0 +1,1741 @@
+/**********************************************************************
+ * File:        pageres.cpp  (Formerly page_res.c)
+ * Description: Hierarchy of results classes from PAGE_RES to WERD_RES
+ *              and an iterator class to iterate over the words.
+ * Main purposes:
+ *              Easy way to iterate over the words without a 3-nested loop.
+ *              Holds data used during word recognition.
+ *              Holds information about alternative spacing paths.
+ * Author:      Phil Cheatle
+ *
+ * (C) Copyright 1992, Hewlett-Packard Ltd.
+ ** Licensed under the Apache License, Version 2.0 (the "License");
+ ** you may not use this file except in compliance with the License.
+ ** You may obtain a copy of the License at
+ ** http://www.apache.org/licenses/LICENSE-2.0
+ ** Unless required by applicable law or agreed to in writing, software
+ ** distributed under the License is distributed on an "AS IS" BASIS,
+ ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ ** See the License for the specific language governing permissions and
+ ** limitations under the License.
+ *
+ **********************************************************************/
+
+#include "pageres.h"
+
+#include "blamer.h"   // for BlamerBundle
+#include "blobs.h"    // for TWERD, TBLOB
+#include "boxword.h"  // for BoxWord
+#include "errcode.h"  // for ASSERT_HOST
+#include "ocrblock.h" // for BLOCK_IT, BLOCK, BLOCK_LIST (ptr only)
+#include "ocrrow.h"   // for ROW, ROW_IT
+#include "pdblock.h"  // for PDBLK
+#include "polyblk.h"  // for POLY_BLOCK
+#include "seam.h"     // for SEAM, start_seam_list
+#include "stepblob.h" // for C_BLOB_IT, C_BLOB, C_BLOB_LIST
+#include "tprintf.h"  // for tprintf
+
+#include <tesseract/publictypes.h> // for OcrEngineMode, OEM_LSTM_ONLY
+
+#include <cassert> // for assert
+#include <cstdint> // for INT32_MAX
+#include <cstring> // for strlen
+
+struct Pix;
+
+namespace tesseract {
+
+// Gain factor for computing thresholds that determine the ambiguity of a
+// word. Multiplies the adjust-factor difference in StopperAmbigThreshold().
+static const double kStopperAmbiguityThresholdGain = 8.0;
+// Constant offset for computing thresholds that determine the ambiguity of a
+// word. Subtracted from the scaled difference in StopperAmbigThreshold().
+static const double kStopperAmbiguityThresholdOffset = 1.5;
+// Max number of broken pieces to associate. Passed as the second argument
+// of the ratings MATRIX allocated in WERD_RES::SetupForRecognition().
+const int kWordrecMaxNumJoinChunks = 4;
+// Max ratio of word box height to line size to allow it to be processed as
+// a line with other words. Used by the ROW_RES constructor.
+const double kMaxWordSizeRatio = 1.25;
+// Max ratio of line box height to line size to allow a new word to be added.
+// Applied to the union box of the words merged so far (ROW_RES constructor).
+const double kMaxLineSizeRatio = 1.25;
+// Max ratio of word gap to line size to allow a new word to be added.
+// Used by the ROW_RES constructor.
+const double kMaxWordGapRatio = 2.0;
+
+// Returns the certainty-difference threshold used to decide which words to
+// keep. It is derived from the adjustment factors f1 and f2 of the two words:
+// the difference (f2 - f1) is scaled by kStopperAmbiguityThresholdGain and
+// then reduced by kStopperAmbiguityThresholdOffset.
+// TODO(rays) This is horrible. Replace with an enhanced params training model.
+static double StopperAmbigThreshold(double f1, double f2) {
+  const double factor_delta = f2 - f1;
+  const double scaled_delta = factor_delta * kStopperAmbiguityThresholdGain;
+  return scaled_delta - kStopperAmbiguityThresholdOffset;
+}
+
+/*************************************************************************
+ * PAGE_RES::PAGE_RES
+ *
+ * Builds the page results: after Init(), one BLOCK_RES is appended to
+ * block_res_list for every BLOCK in the_block_list, and the caller's
+ * prev_word_best_choice_ptr is stored in prev_word_best_choice.
+ *************************************************************************/
+PAGE_RES::PAGE_RES(bool merge_similar_words, BLOCK_LIST *the_block_list,
+                   WERD_CHOICE **prev_word_best_choice_ptr) {
+  Init();
+  // Stored after Init() so that Init() cannot overwrite it.
+  prev_word_best_choice = prev_word_best_choice_ptr;
+
+  BLOCK_RES_IT results_it(&block_res_list);
+  BLOCK_IT source_it(the_block_list);
+  source_it.mark_cycle_pt();
+  while (!source_it.cycled_list()) {
+    BLOCK *current_block = source_it.data();
+    results_it.add_to_end(new BLOCK_RES(merge_similar_words, current_block));
+    source_it.forward();
+  }
+}
+
+/*************************************************************************
+ * BLOCK_RES::BLOCK_RES
+ *
+ * Builds the results for one BLOCK: resets the counters and font fields,
+ * remembers the source block, and appends one ROW_RES to row_res_list for
+ * every ROW in the block's row list.
+ *************************************************************************/
+
+BLOCK_RES::BLOCK_RES(bool merge_similar_words, BLOCK *the_block) {
+  ROW_IT source_rows(the_block->row_list());
+  ROW_RES_IT results_it(&row_res_list);
+
+  block = the_block;
+
+  // Counters start at zero.
+  row_count = 0;
+  char_count = 0;
+  rej_count = 0;
+
+  // Font information is not assigned yet.
+  font_assigned = false;
+  font_class = -1;
+  x_height = -1.0;
+
+  source_rows.mark_cycle_pt();
+  while (!source_rows.cycled_list()) {
+    ROW *current_row = source_rows.data();
+    results_it.add_to_end(new ROW_RES(merge_similar_words, current_row));
+    source_rows.forward();
+  }
+}
+
+/*************************************************************************
+ * ROW_RES::ROW_RES
+ *
+ * Constructor for ROW results.
+ *
+ * Creates one WERD_RES per WERD in the_row's word list and appends it to
+ * word_res_list. Runs of words that belong together are additionally
+ * represented by a "combination" WERD_RES (combination == true) that is
+ * inserted in front of the run; every member of the run has part_of_combo
+ * set.
+ *
+ * If merge_similar_words is true, membership of a run is decided here from
+ * the box heights and horizontal gaps and written into the W_FUZZY_NON flag
+ * of the following word. Otherwise the existing W_FUZZY_NON flag of the
+ * following word is read and used as is.
+ *************************************************************************/
+
+ROW_RES::ROW_RES(bool merge_similar_words, ROW *the_row) {
+  WERD_IT word_it(the_row->word_list());
+  WERD_RES_IT word_res_it(&word_res_list);
+  WERD_RES *combo = nullptr; // current combination of fuzzies
+  WERD *copy_word;
+
+  char_count = 0;
+  rej_count = 0;
+  whole_word_rej_count = 0;
+
+  row = the_row;
+  // True when the decision taken on the previous iteration was that the
+  // current word joins the combination in progress.
+  bool add_next_word = false;
+  // Bounding box of the words accumulated in the current run.
+  TBOX union_box;
+  // Reference height against which the k*Ratio constants are applied.
+  float line_height =
+      the_row->x_height() + the_row->ascenders() - the_row->descenders();
+  for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) {
+    auto *word_res = new WERD_RES(word_it.data());
+    word_res->x_height = the_row->x_height();
+    if (add_next_word) {
+      ASSERT_HOST(combo != nullptr);
+      // We are adding this word to the combination.
+      word_res->part_of_combo = true;
+      combo->copy_on(word_res);
+    } else if (merge_similar_words) {
+      // This word may start a new run: it qualifies unless it is a repeated
+      // character word or its box is too tall for the line.
+      union_box = word_res->word->bounding_box();
+      add_next_word = !word_res->word->flag(W_REP_CHAR) &&
+                      union_box.height() <= line_height * kMaxWordSizeRatio;
+      word_res->odd_size = !add_next_word;
+    }
+    // NOTE(review): for the last word of the row data_relative(1) presumably
+    // wraps round to the first word of the list - confirm against the list
+    // implementation, since the flag of that word is written below.
+    WERD *next_word = word_it.data_relative(1);
+    if (merge_similar_words) {
+      if (add_next_word && !next_word->flag(W_REP_CHAR)) {
+        // Next word will be added on if all of the following are true:
+        // Not a rep char.
+        // Box height small enough.
+        // Union box height small enough.
+        // Horizontal gap small enough.
+        TBOX next_box = next_word->bounding_box();
+        // Right edge of the run before the next box is merged in.
+        int prev_right = union_box.right();
+        union_box += next_box;
+        if (next_box.height() > line_height * kMaxWordSizeRatio ||
+            union_box.height() > line_height * kMaxLineSizeRatio ||
+            next_box.left() > prev_right + line_height * kMaxWordGapRatio) {
+          add_next_word = false;
+        }
+      }
+      // Record the decision on the following word.
+      next_word->set_flag(W_FUZZY_NON, add_next_word);
+    } else {
+      // Not merging here: rely on the flag already stored on the next word.
+      add_next_word = next_word->flag(W_FUZZY_NON);
+    }
+    if (add_next_word) {
+      if (combo == nullptr) {
+        // First word of a run: create the combination entry from a copy of
+        // this word and put it in the list ahead of the word itself.
+        copy_word = new WERD;
+        *copy_word = *(word_it.data()); // deep copy
+        combo = new WERD_RES(copy_word);
+        combo->x_height = the_row->x_height();
+        combo->combination = true;
+        word_res_it.add_to_end(combo);
+      }
+      word_res->part_of_combo = true;
+    } else {
+      // The run (if any) ends with this word.
+      combo = nullptr;
+    }
+    word_res_it.add_to_end(word_res);
+  }
+}
+
+// Copy assignment. Discards the current contents with Clear() and then
+// copies the recognition state of source:
+//  - word is deep-copied only when source is a combination; otherwise both
+//    objects point at the same WERD.
+//  - bln_boxes, chopped_word, rebuild_word, box_word, raw_choice, ep_choice,
+//    blamer_bundle and every entry of best_choices are duplicated.
+//  - the seam_array and the ratings matrix are NOT copied.
+// Self-assignment is a no-op: without the guard, Clear() would run before
+// any field of source (== *this) is read.
+WERD_RES &WERD_RES::operator=(const WERD_RES &source) {
+  if (this == &source) {
+    return *this;
+  }
+  this->ELIST_LINK::operator=(source);
+  Clear();
+  if (source.combination) {
+    word = new WERD;
+    *word = *(source.word); // deep copy
+  } else {
+    word = source.word; // pt to same word
+  }
+  if (source.bln_boxes != nullptr) {
+    bln_boxes = new tesseract::BoxWord(*source.bln_boxes);
+  }
+  if (source.chopped_word != nullptr) {
+    chopped_word = new TWERD(*source.chopped_word);
+  }
+  if (source.rebuild_word != nullptr) {
+    rebuild_word = new TWERD(*source.rebuild_word);
+  }
+  // TODO(rays) Do we ever need to copy the seam_array?
+  blob_row = source.blob_row;
+  denorm = source.denorm;
+  if (source.box_word != nullptr) {
+    box_word = new tesseract::BoxWord(*source.box_word);
+  }
+  best_state = source.best_state;
+  correct_text = source.correct_text;
+  blob_widths = source.blob_widths;
+  blob_gaps = source.blob_gaps;
+  // None of the uses of operator= require the ratings matrix to be copied,
+  // so don't as it would be really slow.
+
+  // Copy the cooked choices, preserving their order.
+  WERD_CHOICE_IT wc_it(const_cast<WERD_CHOICE_LIST *>(&source.best_choices));
+  WERD_CHOICE_IT wc_dest_it(&best_choices);
+  for (wc_it.mark_cycle_pt(); !wc_it.cycled_list(); wc_it.forward()) {
+    const WERD_CHOICE *choice = wc_it.data();
+    wc_dest_it.add_after_then_move(new WERD_CHOICE(*choice));
+  }
+  // best_choice is the first entry of the copied list, if there is one.
+  if (!wc_dest_it.empty()) {
+    wc_dest_it.move_to_first();
+    best_choice = wc_dest_it.data();
+  } else {
+    best_choice = nullptr;
+  }
+
+  if (source.raw_choice != nullptr) {
+    raw_choice = new WERD_CHOICE(*source.raw_choice);
+  } else {
+    raw_choice = nullptr;
+  }
+  if (source.ep_choice != nullptr) {
+    ep_choice = new WERD_CHOICE(*source.ep_choice);
+  } else {
+    ep_choice = nullptr;
+  }
+  reject_map = source.reject_map;
+  combination = source.combination;
+  part_of_combo = source.part_of_combo;
+  CopySimpleFields(source);
+  if (source.blamer_bundle != nullptr) {
+    blamer_bundle = new BlamerBundle(*(source.blamer_bundle));
+  }
+  return *this;
+}
+
+// Copies, member by member, the plain fields of source that can simply be
+// assigned when making one WERD_RES from another. Nothing is allocated or
+// freed here; pointer-valued fields such as uch_set and tesseract are
+// copied as pointers.
+void WERD_RES::CopySimpleFields(const WERD_RES &source) {
+  // Recognition environment.
+  tesseract = source.tesseract;
+  uch_set = source.uch_set;
+
+  // Recognition status.
+  done = source.done;
+  tess_failed = source.tess_failed;
+  tess_accepted = source.tess_accepted;
+  tess_would_adapt = source.tess_would_adapt;
+  unlv_crunch_mode = source.unlv_crunch_mode;
+  reject_spaces = source.reject_spaces;
+
+  // Font information.
+  fontinfo = source.fontinfo;
+  fontinfo2 = source.fontinfo2;
+  fontinfo_id_count = source.fontinfo_id_count;
+  fontinfo_id2_count = source.fontinfo_id2_count;
+
+  // Size and position metrics.
+  x_height = source.x_height;
+  caps_height = source.caps_height;
+  baseline_shift = source.baseline_shift;
+  guessed_x_ht = source.guessed_x_ht;
+  guessed_caps_ht = source.guessed_caps_ht;
+  small_caps = source.small_caps;
+  odd_size = source.odd_size;
+}
+
+// Prepares a blank (default constructed) WERD_RES for another recognition
+// attempt on the word of an already recognized source: shares source.word,
+// copies the simple fields, and, when source has a blamer bundle, creates a
+// new bundle holding only a copy of its truth data.
+// Call one of the SetupFor*Recognition functions afterwards to finish the
+// setup.
+void WERD_RES::InitForRetryRecognition(const WERD_RES &source) {
+  word = source.word;
+  CopySimpleFields(source);
+  if (source.blamer_bundle == nullptr) {
+    return;
+  }
+  blamer_bundle = new BlamerBundle();
+  blamer_bundle->CopyTruth(*source.blamer_bundle);
+}
+
+// Sets up the members used in recognition: bln_boxes, chopped_word,
+// seam_array, denorm and the ratings matrix.
+// Returns false, after installing fake results via SetupFake(), when the
+// word has no blobs (checked only if norm_mode is not OEM_LSTM_ONLY) or when
+// the block has a poly block that is not text. Returns true otherwise.
+// If use_body_size is true and row->body_size is set, then body_size will be
+// used for blob normalization instead of xheight + ascrise. This flag is for
+// those languages that are using CJK pitch model and thus it has to be true
+// if and only if tesseract->textord_use_cjk_fp_model is true.
+// If allow_detailed_fx is true, the feature extractor will receive fine
+// precision outline information, allowing smoother features and better
+// features on low resolution images.
+// norm_mode (an OcrEngineMode value) sets the default mode for normalization
+// in absence of any of the above flags.
+// norm_box is used to override the word bounding box to determine the
+// normalization scale and offset.
+bool WERD_RES::SetupForRecognition(const UNICHARSET &unicharset_in,
+                                   tesseract::Tesseract *tess, Image pix,
+                                   int norm_mode, const TBOX *norm_box,
+                                   bool numeric_mode, bool use_body_size,
+                                   bool allow_detailed_fx, ROW *row,
+                                   const BLOCK *block) {
+  auto engine_mode = static_cast<tesseract::OcrEngineMode>(norm_mode);
+  tesseract = tess;
+
+  POLY_BLOCK *poly = nullptr;
+  if (block != nullptr) {
+    poly = block->pdblk.poly_block();
+  }
+
+  const bool lacks_blobs = engine_mode != tesseract::OEM_LSTM_ONLY &&
+                           word->cblob_list()->empty();
+  if (lacks_blobs || (poly != nullptr && !poly->IsText())) {
+    // Empty words occur when all the blobs have been moved to the rej_blobs
+    // list, which seems to occur frequently in junk.
+    SetupFake(unicharset_in);
+    word->set_flag(W_REP_CHAR, false);
+    return false;
+  }
+
+  ClearResults();
+  SetupWordScript(unicharset_in);
+  chopped_word = TWERD::PolygonalCopy(allow_detailed_fx, word);
+
+  // Height used for normalization: the row's body size when requested and
+  // available, otherwise this word's x_height.
+  float norm_xheight = x_height;
+  if (use_body_size && row != nullptr && row->body_size() > 0.0f) {
+    norm_xheight = row->body_size();
+  }
+  const bool inverse = word->flag(W_INVERSE);
+  chopped_word->BLNormalize(block, row, pix, inverse, norm_xheight,
+                            baseline_shift, numeric_mode, engine_mode,
+                            norm_box, &denorm);
+  blob_row = row;
+  SetupBasicsFromChoppedWord(unicharset_in);
+  SetupBlamerBundle();
+
+  // One row/column per chopped blob in the ratings matrix.
+  int blob_total = chopped_word->NumBlobs();
+  ratings = new MATRIX(blob_total, kWordrecMaxNumJoinChunks);
+  tess_failed = false;
+  return true;
+}
+
+// Set up the seam array, bln_boxes, best_choice, and raw_choice to empty
+// accumulators from a made chopped word.  We presume the fields are already
+// empty.
+// Also recomputes blob_widths and blob_gaps via SetupBlobWidthsAndGaps().
+// Requires chopped_word to be set. unicharset_in is not used by this body.
+void WERD_RES::SetupBasicsFromChoppedWord(const UNICHARSET &unicharset_in) {
+  // Boxes of the chopped word's blobs.
+  bln_boxes = tesseract::BoxWord::CopyFromNormalized(chopped_word);
+  // Initial seam list for the chopped word.
+  start_seam_list(chopped_word, &seam_array);
+  SetupBlobWidthsAndGaps();
+  ClearWordChoices();
+}
+
+// Installs the results of an "empty" recognition: clears any previous
+// results, sets the word script, and creates empty chopped_word,
+// rebuild_word, bln_boxes and box_word objects.
+// If the word has blobs, each blob's bounding box is appended to box_word
+// and the word is classified with one default BLOB_CHOICE per blob.
+// Otherwise a single bad WERD_CHOICE is logged as both raw and cooked choice.
+// In both cases the word ends up marked tess_failed and done.
+void WERD_RES::SetupFake(const UNICHARSET &unicharset_in) {
+  ClearResults();
+  SetupWordScript(unicharset_in);
+  chopped_word = new TWERD;
+  rebuild_word = new TWERD;
+  bln_boxes = new tesseract::BoxWord;
+  box_word = new tesseract::BoxWord;
+
+  int num_blobs = word->cblob_list()->length();
+  if (num_blobs <= 0) {
+    auto *bad_choice = new WERD_CHOICE(&unicharset_in);
+    bad_choice->make_bad();
+    LogNewRawChoice(bad_choice);
+    // Ownership of bad_choice is taken by *this WERD_RES in
+    // LogNewCookedChoice.
+    LogNewCookedChoice(1, false, bad_choice);
+  } else {
+    // For non-text blocks, just pass any blobs through to the box_word
+    // and call the word failed with a fake classification.
+    auto **fakes = new BLOB_CHOICE *[num_blobs];
+    int next_slot = 0;
+    C_BLOB_IT blob_it(word->cblob_list());
+    blob_it.mark_cycle_pt();
+    while (!blob_it.cycled_list()) {
+      TBOX blob_box = blob_it.data()->bounding_box();
+      box_word->InsertBox(box_word->length(), blob_box);
+      fakes[next_slot] = new BLOB_CHOICE;
+      ++next_slot;
+      blob_it.forward();
+    }
+    FakeClassifyWord(num_blobs, fakes);
+    // Only the pointer array is released here, not the BLOB_CHOICEs.
+    delete[] fakes;
+  }
+  tess_failed = true;
+  done = true;
+}
+
+// Records uch as this word's unicharset and stamps the word with the
+// unicharset's default script id, together with the W_SCRIPT_HAS_XHEIGHT
+// and W_SCRIPT_IS_LATIN flags derived from it.
+void WERD_RES::SetupWordScript(const UNICHARSET &uch) {
+  uch_set = &uch;
+  const int default_script = uch.default_sid();
+  word->set_script_id(default_script);
+  const bool has_xheight = uch.script_has_xheight();
+  word->set_flag(W_SCRIPT_HAS_XHEIGHT, has_xheight);
+  const bool is_latin = default_script == uch.latin_sid();
+  word->set_flag(W_SCRIPT_IS_LATIN, is_latin);
+}
+
+// If there is a blamer_bundle, sets up its normalized truth word from the
+// (already initialized) denorm. Does nothing when blamer_bundle is null.
+void WERD_RES::SetupBlamerBundle() {
+  if (blamer_bundle == nullptr) {
+    return;
+  }
+  blamer_bundle->SetupNormTruthWord(denorm);
+}
+
+// Rebuilds blob_widths and blob_gaps from the chopped_word: one width per
+// blob, and one gap (left edge of the next blob minus right edge of this
+// one) for every blob except the last.
+void WERD_RES::SetupBlobWidthsAndGaps() {
+  blob_widths.clear();
+  blob_gaps.clear();
+  int blob_total = chopped_word->NumBlobs();
+  int index = 0;
+  while (index < blob_total) {
+    TBOX current = chopped_word->blobs[index]->bounding_box();
+    blob_widths.push_back(current.width());
+    ++index;
+    if (index < blob_total) {
+      // index now refers to the following blob.
+      blob_gaps.push_back(chopped_word->blobs[index]->bounding_box().left() -
+                          current.right());
+    }
+  }
+}
+
+// Accounts for a new SEAM (chop) at blob_number. The seam is always inserted
+// into seam_array. If a ratings matrix exists, it is enlarged, the
+// segmentation states of raw_choice and of every entry in best_choices are
+// updated for the split, and the blob widths and gaps are recomputed.
+void WERD_RES::InsertSeam(int blob_number, SEAM *seam) {
+  // Insert the seam into the SEAMS array.
+  seam->PrepareToInsertSeam(seam_array, chopped_word->blobs, blob_number, true);
+  seam_array.insert(seam_array.begin() + blob_number, seam);
+  if (ratings == nullptr) {
+    return;
+  }
+
+  // Expand the ratings matrix.
+  ratings = ratings->ConsumeAndMakeBigger(blob_number);
+
+  // Fix all the segmentation states.
+  if (raw_choice != nullptr) {
+    raw_choice->UpdateStateForSplit(blob_number);
+  }
+  WERD_CHOICE_IT cooked_it(&best_choices);
+  cooked_it.mark_cycle_pt();
+  while (!cooked_it.cycled_list()) {
+    cooked_it.data()->UpdateStateForSplit(blob_number);
+    cooked_it.forward();
+  }
+
+  SetupBlobWidthsAndGaps();
+}
+
+// Returns true if every word choice after the first has an adjust_factor
+// strictly greater than threshold, i.e. returns false as soon as one
+// alternative has adjust_factor <= threshold.
+bool WERD_RES::AlternativeChoiceAdjustmentsWorseThan(float threshold) const {
+  // The const_cast is only needed to build the iterator; the list is not
+  // modified.
+  WERD_CHOICE_IT alt_it(const_cast<WERD_CHOICE_LIST *>(&best_choices));
+  // Step past the first (best) choice, then walk until we are back at it.
+  alt_it.forward();
+  while (!alt_it.at_first()) {
+    WERD_CHOICE *alternative = alt_it.data();
+    if (alternative->adjust_factor() <= threshold) {
+      return false;
+    }
+    alt_it.forward();
+  }
+  return true;
+}
+
+// Returns true if the current word is ambiguous: either best_choices does
+// not hold exactly one answer, or the best choice has a dangerous ambiguity.
+bool WERD_RES::IsAmbiguous() {
+  if (!best_choices.singleton()) {
+    return true;
+  }
+  return best_choice->dangerous_ambig_found();
+}
+
+// Checks that the sum of the segmentation states of raw_choice and of every
+// choice in best_choices equals the dimension of the ratings matrix.
+// Prints a message and returns false at the first mismatch; returns true if
+// all of them match.
+bool WERD_RES::StatesAllValid() {
+  const unsigned ratings_dim = ratings->dimension();
+
+  const auto raw_total = raw_choice->TotalOfStates();
+  if (raw_total != ratings_dim) {
+    tprintf("raw_choice has total of states = %u vs ratings dim of %u\n",
+            raw_total, ratings_dim);
+    return false;
+  }
+
+  unsigned position = 0;
+  WERD_CHOICE_IT cooked_it(&best_choices);
+  cooked_it.mark_cycle_pt();
+  while (!cooked_it.cycled_list()) {
+    const auto cooked_total = cooked_it.data()->TotalOfStates();
+    if (cooked_total != ratings_dim) {
+      tprintf("Cooked #%u has total of states = %u vs ratings dim of %u\n",
+              position, cooked_total, ratings_dim);
+      return false;
+    }
+    cooked_it.forward();
+    ++position;
+  }
+  return true;
+}
+
+// Prints the raw choice (if any) and every cooked choice when debug is true,
+// or when word_to_debug is a non-empty string equal to the text of the
+// current best choice. Prints nothing otherwise.
+void WERD_RES::DebugWordChoices(bool debug, const char *word_to_debug) {
+  bool wanted = debug;
+  if (!wanted && word_to_debug != nullptr && *word_to_debug != '\0' &&
+      best_choice != nullptr) {
+    wanted = best_choice->unichar_string() == std::string(word_to_debug);
+  }
+  if (!wanted) {
+    return;
+  }
+
+  if (raw_choice != nullptr) {
+    raw_choice->print("\nBest Raw Choice");
+  }
+
+  int position = 0;
+  WERD_CHOICE_IT cooked_it(&best_choices);
+  cooked_it.mark_cycle_pt();
+  while (!cooked_it.cycled_list()) {
+    std::string label = "\nCooked Choice #";
+    label += std::to_string(position);
+    cooked_it.data()->print(label.c_str());
+    cooked_it.forward();
+    ++position;
+  }
+}
+
+// Prints the accepted/adaptable/done flags followed by the best choice
+// (labelled with msg), or by "<Null choice>" if there is no best choice.
+void WERD_RES::DebugTopChoice(const char *msg) const {
+  tprintf("Best choice: accepted=%d, adaptable=%d, done=%d : ", tess_accepted,
+          tess_would_adapt, done);
+  if (best_choice != nullptr) {
+    best_choice->print(msg);
+    return;
+  }
+  tprintf("<Null choice>\n");
+}
+
+// Removes from best_choices all choices which are not within a reasonable
+// range of the best choice.
+// A choice is removed (and deleted) as soon as one of its blobs has a
+// different unichar_id from the overlapping best_choice blob and a certainty
+// that is lower than the best_choice blob's certainty by more than the
+// margin given by StopperAmbigThreshold (i.e. the difference is below the
+// threshold). Does nothing if there is no best_choice or only one choice.
+// debug_level >= 2 prints the best choice and each discarded choice.
+// TODO(rays) incorporate the information used here into the params training
+// re-ranker, in place of this heuristic that is based on the previous
+// adjustment factor.
+void WERD_RES::FilterWordChoices(int debug_level) {
+  if (best_choice == nullptr || best_choices.singleton()) {
+    return;
+  }
+
+  if (debug_level >= 2) {
+    best_choice->print("\nFiltering against best choice");
+  }
+  WERD_CHOICE_IT it(&best_choices);
+  // Counts the alternatives examined; it is 0 for the second list entry
+  // because the loop below starts by stepping past the first one. Only used
+  // in the debug message.
+  int index = 0;
+  for (it.forward(); !it.at_first(); it.forward(), ++index) {
+    WERD_CHOICE *choice = it.data();
+    float threshold = StopperAmbigThreshold(best_choice->adjust_factor(),
+                                            choice->adjust_factor());
+    // i, j index the blob choice in choice, best_choice.
+    // chunk is an index into the chopped_word blobs (AKA chunks).
+    // Since the two words may use different segmentations of the chunks, we
+    // iterate over the chunks to find out whether a comparable blob
+    // classification is much worse than the best result.
+    unsigned i = 0, j = 0, chunk = 0;
+    // Each iteration of the while deals with 1 chunk. On entry choice_chunk
+    // and best_chunk are the indices of the first chunk in the NEXT blob,
+    // i.e. we don't have to increment i, j while chunk < choice_chunk and
+    // best_chunk respectively.
+    auto choice_chunk = choice->state(0), best_chunk = best_choice->state(0);
+    while (i < choice->length() && j < best_choice->length()) {
+      if (choice->unichar_id(i) != best_choice->unichar_id(j) &&
+          choice->certainty(i) - best_choice->certainty(j) < threshold) {
+        if (debug_level >= 2) {
+          choice->print("WorstCertaintyDiffWorseThan");
+          tprintf(
+              "i %u j %u Choice->Blob[i].Certainty %.4g"
+              " WorstOtherChoiceCertainty %g Threshold %g\n",
+              i, j, choice->certainty(i), best_choice->certainty(j), threshold);
+          tprintf("Discarding bad choice #%d\n", index);
+        }
+        // Remove the choice from the list and free it; choice is dangling
+        // from here on, so leave the chunk loop immediately. The outer loop
+        // then continues from the iterator's position after the extraction.
+        delete it.extract();
+        break;
+      }
+      ++chunk;
+      // If needed, advance choice_chunk to keep up with chunk.
+      while (choice_chunk < chunk && ++i < choice->length()) {
+        choice_chunk += choice->state(i);
+      }
+      // If needed, advance best_chunk to keep up with chunk.
+      while (best_chunk < chunk && ++j < best_choice->length()) {
+        best_chunk += best_choice->state(j);
+      }
+    }
+  }
+}
+
+// Computes one adaption threshold per blob of best_choice and writes them to
+// thresholds[0 .. best_choice->length() - 1].
+// For blob i, every chunk covered by that blob is compared with the
+// raw_choice blob covering the same chunk. If the unichar ids differ on any
+// chunk, the threshold is the mean raw certainty over those chunks, divided
+// by -certainty_scale and scaled by (1 - rating_margin); otherwise it is
+// max_rating. The result is then limited to [min_rating, max_rating].
+// thresholds must have room for best_choice->length() floats. Both choices
+// are expected to segment the same total number of chunks.
+//
+// The end of the current best_choice blob (end_chunk) is advanced for every
+// blob. Previously it was fixed at best_choice->state(0), so the chunk loop
+// never ran for blobs after the first and they always received max_rating.
+void WERD_RES::ComputeAdaptionThresholds(float certainty_scale,
+                                         float min_rating, float max_rating,
+                                         float rating_margin,
+                                         float *thresholds) {
+  int chunk = 0;
+  // Exclusive chunk index at which the current best_choice blob ends.
+  int end_chunk = 0;
+  // Exclusive chunk index at which the current raw_choice blob ends.
+  int end_raw_chunk = raw_choice->state(0);
+  int raw_blob = 0;
+  for (unsigned i = 0; i < best_choice->length(); i++, thresholds++) {
+    float avg_rating = 0.0f;
+    int num_error_chunks = 0;
+
+    // Extend the chunk range to cover best choice blob i.
+    end_chunk += best_choice->state(i);
+
+    // For each chunk in best choice blob i, count non-matching raw results.
+    while (chunk < end_chunk) {
+      if (chunk >= end_raw_chunk) {
+        // Move on to the raw blob that contains this chunk.
+        ++raw_blob;
+        end_raw_chunk += raw_choice->state(raw_blob);
+      }
+      if (best_choice->unichar_id(i) != raw_choice->unichar_id(raw_blob)) {
+        avg_rating += raw_choice->certainty(raw_blob);
+        ++num_error_chunks;
+      }
+      ++chunk;
+    }
+
+    if (num_error_chunks > 0) {
+      avg_rating /= num_error_chunks;
+      *thresholds = (avg_rating / -certainty_scale) * (1.0 - rating_margin);
+    } else {
+      *thresholds = max_rating;
+    }
+
+    // Limit to the permitted range.
+    if (*thresholds > max_rating) {
+      *thresholds = max_rating;
+    }
+    if (*thresholds < min_rating) {
+      *thresholds = min_rating;
+    }
+  }
+}
+
+// Keeps a copy of word_choice as raw_choice if there is no raw_choice yet or
+// if word_choice has a strictly lower rating than the current one. The copy
+// gets the TOP_CHOICE_PERM permuter; word_choice itself is not consumed.
+// Returns true if the raw choice was replaced, false otherwise.
+bool WERD_RES::LogNewRawChoice(WERD_CHOICE *word_choice) {
+  const bool is_new_best =
+      raw_choice == nullptr || word_choice->rating() < raw_choice->rating();
+  if (!is_new_best) {
+    return false;
+  }
+  delete raw_choice;
+  raw_choice = new WERD_CHOICE(*word_choice);
+  raw_choice->set_permuter(TOP_CHOICE_PERM);
+  return true;
+}
+
+// Consumes word_choice by adding it to best_choices, (taking ownership) if
+// the certainty for word_choice is some distance of the best choice in
+// best_choices, or by deleting the word_choice and returning false.
+// The best_choices list is kept in sorted order by rating. Duplicates are
+// removed, and the list is kept no longer than max_num_choices in length.
+// If debug is true, the outcome is reported with tprintf.
+// Returns true if the word_choice is still a valid pointer.
+// On a false return word_choice has been deleted and must not be used.
+bool WERD_RES::LogNewCookedChoice(int max_num_choices, bool debug,
+                                  WERD_CHOICE *word_choice) {
+  if (best_choice != nullptr) {
+    // Throw out obviously bad choices to save some work.
+    // TODO(rays) Get rid of this! This piece of code produces different
+    // results according to the order in which words are found, which is an
+    // undesirable behavior. It would be better to keep all the choices and
+    // prune them later when more information is available.
+    float max_certainty_delta = StopperAmbigThreshold(
+        best_choice->adjust_factor(), word_choice->adjust_factor());
+    // Never allow a delta above -kStopperAmbiguityThresholdOffset.
+    if (max_certainty_delta > -kStopperAmbiguityThresholdOffset) {
+      max_certainty_delta = -kStopperAmbiguityThresholdOffset;
+    }
+    if (word_choice->certainty() - best_choice->certainty() <
+        max_certainty_delta) {
+      if (debug) {
+        std::string bad_string;
+        word_choice->string_and_lengths(&bad_string, nullptr);
+        tprintf(
+            "Discarding choice \"%s\" with an overly low certainty"
+            " %.3f vs best choice certainty %.3f (Threshold: %.3f)\n",
+            bad_string.c_str(), word_choice->certainty(),
+            best_choice->certainty(),
+            max_certainty_delta + best_choice->certainty());
+      }
+      delete word_choice;
+      return false;
+    }
+  }
+
+  // Insert in the list in order of increasing rating, but knock out worse
+  // string duplicates.
+  WERD_CHOICE_IT it(&best_choices);
+  // Refers into word_choice, so it must not be used once word_choice has
+  // been deleted.
+  const std::string &new_str = word_choice->unichar_string();
+  bool inserted = false;
+  // Number of entries kept in the list so far, including word_choice once
+  // it has been inserted.
+  int num_choices = 0;
+  if (!it.empty()) {
+    do {
+      WERD_CHOICE *choice = it.data();
+      if (choice->rating() > word_choice->rating() && !inserted) {
+        // Time to insert.
+        it.add_before_stay_put(word_choice);
+        inserted = true;
+        if (num_choices == 0) {
+          best_choice = word_choice; // This is the new best.
+        }
+        ++num_choices;
+      }
+      if (choice->unichar_string() == new_str) {
+        if (inserted) {
+          // New is better.
+          delete it.extract();
+        } else {
+          // Old is better.
+          if (debug) {
+            tprintf("Discarding duplicate choice \"%s\", rating %g vs %g\n",
+                    new_str.c_str(), word_choice->rating(), choice->rating());
+          }
+          delete word_choice;
+          return false;
+        }
+      } else {
+        ++num_choices;
+        // Drop existing entries that fall beyond the permitted length.
+        if (num_choices > max_num_choices) {
+          delete it.extract();
+        }
+      }
+      it.forward();
+    } while (!it.at_first());
+  }
+  // Not better than any existing entry: append if there is still room.
+  if (!inserted && num_choices < max_num_choices) {
+    it.add_to_end(word_choice);
+    inserted = true;
+    if (num_choices == 0) {
+      best_choice = word_choice; // This is the new best.
+    }
+  }
+  if (debug) {
+    if (inserted) {
+      tprintf("New %s", best_choice == word_choice ? "Best" : "Secondary");
+    } else {
+      tprintf("Poor");
+    }
+    word_choice->print(" Word Choice");
+  }
+  if (!inserted) {
+    delete word_choice;
+    return false;
+  }
+  return true;
+}
+
+// Transfers ownership of the object pointed to by *src to *dest: whatever
+// *dest owned before is deleted, *dest takes over the pointer held in *src,
+// and *src is left null.
+template <class T>
+static void MovePointerData(T **dest, T **src) {
+  T *&target = *dest;
+  T *&origin = *src;
+  delete target;
+  target = origin;
+  origin = nullptr;
+}
+
+// Prints a brief list of all the best choices, in list order, in the form
+//   Alternates for "<best>": {"<c1>", "<c2>", ...}
+// If there is no best choice, "<Null choice>" is printed in place of the
+// quoted best string (as DebugTopChoice does) instead of dereferencing a
+// null pointer.
+void WERD_RES::PrintBestChoices() const {
+  std::string alternates_str;
+  // The const_cast is only needed to build the iterator; the list is not
+  // modified.
+  WERD_CHOICE_IT it(const_cast<WERD_CHOICE_LIST *>(&best_choices));
+  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
+    if (!it.at_first()) {
+      alternates_str += "\", \"";
+    }
+    alternates_str += it.data()->unichar_string();
+  }
+  if (best_choice == nullptr) {
+    tprintf("Alternates for <Null choice>: {\"%s\"}\n",
+            alternates_str.c_str());
+    return;
+  }
+  tprintf("Alternates for \"%s\": {\"%s\"}\n",
+          best_choice->unichar_string().c_str(), alternates_str.c_str());
+}
+
+// Returns the total horizontal extent of the blobs start_blob..last_blob
+// inclusive: the sum of their widths plus the gaps between consecutive
+// blobs in that range. Returns 0 if start_blob > last_blob.
+int WERD_RES::GetBlobsWidth(int start_blob, int last_blob) const {
+  int total = 0;
+  int blob = start_blob;
+  while (blob <= last_blob) {
+    total += blob_widths[blob];
+    // The gap after the final blob of the range is not included.
+    if (blob != last_blob) {
+      total += blob_gaps[blob];
+    }
+    ++blob;
+  }
+  return total;
+}
+// Returns the width of the gap between the blob at blob_index and the next
+// one, or 0 if blob_index has no entry in blob_gaps.
+int WERD_RES::GetBlobsGap(unsigned blob_index) const {
+  const bool has_gap = blob_index < blob_gaps.size();
+  return has_gap ? blob_gaps[blob_index] : 0;
+}
+
+// Looks up, in the ratings MATRIX cell for position index of the best
+// choice word, the BLOB_CHOICE whose unichar_id matches the best choice at
+// that position.
+// Borrowed pointer, so do not delete. Returns nullptr if index is beyond the
+// end of the best choice, and may return nullptr if no BLOB_CHOICE matches.
+BLOB_CHOICE *WERD_RES::GetBlobChoice(unsigned index) const {
+  const bool out_of_range = index >= best_choice->length();
+  if (out_of_range) {
+    return nullptr;
+  }
+  BLOB_CHOICE_LIST *cell_choices = GetBlobChoices(index);
+  const auto wanted_id = best_choice->unichar_id(index);
+  return FindMatchingChoice(wanted_id, cell_choices);
+}
+
+// Returns the BLOB_CHOICE_LIST for position index of the best choice word,
+// as found by best_choice in the ratings MATRIX.
+// Borrowed pointer, so do not delete.
+BLOB_CHOICE_LIST *WERD_RES::GetBlobChoices(int index) const {
+  BLOB_CHOICE_LIST *cell_choices = best_choice->blob_choices(index, ratings);
+  return cell_choices;
+}
+
+// Moves the results fields from word to this. This takes ownership of all
+// the data, so src can be destructed.
+// Anything this object previously owned in the same fields (chopped_word,
+// rebuild_word, box_word, seams, ratings, raw_choice, best_choices) is
+// released first.
+// Note: the parameter is named word and therefore hides the member of the
+// same name inside this function; every "word->" below refers to the source
+// WERD_RES.
+void WERD_RES::ConsumeWordResults(WERD_RES *word) {
+  denorm = word->denorm;
+  blob_row = word->blob_row;
+  MovePointerData(&chopped_word, &word->chopped_word);
+  MovePointerData(&rebuild_word, &word->rebuild_word);
+  MovePointerData(&box_word, &word->box_word);
+  // Free our own seams before taking over the source's seam pointers.
+  for (auto data : seam_array) {
+    delete data;
+  }
+  seam_array = word->seam_array;
+  // The source must no longer refer to the seams it handed over.
+  word->seam_array.clear();
+  // TODO: optimize moves.
+  best_state = word->best_state;
+  word->best_state.clear();
+  correct_text = word->correct_text;
+  word->correct_text.clear();
+  blob_widths = word->blob_widths;
+  word->blob_widths.clear();
+  blob_gaps = word->blob_gaps;
+  word->blob_gaps.clear();
+  // Release the contents of our old ratings matrix before the matrix object
+  // itself is deleted by MovePointerData.
+  if (ratings != nullptr) {
+    ratings->delete_matrix_pointers();
+  }
+  MovePointerData(&ratings, &word->ratings);
+  // Only the pointer is copied here (word->best_choice is left unchanged);
+  // the choice objects themselves arrive with the list transfer below.
+  best_choice = word->best_choice;
+  MovePointerData(&raw_choice, &word->raw_choice);
+  best_choices.clear();
+  WERD_CHOICE_IT wc_it(&best_choices);
+  wc_it.add_list_after(&word->best_choices);
+  reject_map = word->reject_map;
+  if (word->blamer_bundle != nullptr) {
+    // assert is compiled out when NDEBUG is defined, in which case a null
+    // blamer_bundle here is dereferenced on the next line.
+    assert(blamer_bundle != nullptr);
+    blamer_bundle->CopyResults(*(word->blamer_bundle));
+  }
+  CopySimpleFields(*word);
+}
+
+// Replace the best choice and rebuild box word.
+// choice must be from the current best_choices list.
+// Afterwards best_state, rebuild_word and box_word correspond to choice, the
+// reject map has one entry per best_state element, and the word is marked
+// done, accepted and adaptable.
+void WERD_RES::ReplaceBestChoice(WERD_CHOICE *choice) {
+  best_choice = choice;
+  // Both rebuild steps read best_choice, so it must be set first.
+  RebuildBestState();
+  SetupBoxWord();
+  // Make up a fake reject map of the right length to keep the
+  // rejection pass happy.
+  reject_map.initialise(best_state.size());
+  done = tess_accepted = tess_would_adapt = true;
+  SetScriptPositions();
+}
+
+// Regenerates rebuild_word and best_state from chopped_word, using the
+// per-character piece counts stored in best_choice->state. The seam_array is
+// created first if it is still empty.
+void WERD_RES::RebuildBestState() {
+  ASSERT_HOST(best_choice != nullptr);
+  delete rebuild_word;
+  rebuild_word = new TWERD;
+  if (seam_array.empty()) {
+    start_seam_list(chopped_word, &seam_array);
+  }
+  best_state.clear();
+  int first_piece = 0;
+  const unsigned num_chars = best_choice->length();
+  for (unsigned c = 0; c < num_chars; ++c) {
+    const int num_pieces = best_choice->state(c);
+    const int last_piece = first_piece + num_pieces - 1;
+    const bool multi_piece = num_pieces > 1;
+    best_state.push_back(num_pieces);
+    if (multi_piece) {
+      // Join the pieces for the copy below; undone again by BreakPieces.
+      SEAM::JoinPieces(seam_array, chopped_word->blobs, first_piece,
+                       last_piece);
+    }
+    rebuild_word->blobs.push_back(
+        new TBLOB(*chopped_word->blobs[first_piece]));
+    if (multi_piece) {
+      SEAM::BreakPieces(seam_array, chopped_word->blobs, first_piece,
+                        last_piece);
+    }
+    first_piece += num_pieces;
+  }
+}
+
+// Makes rebuild_word a copy of chopped_word and sets up the output box_word.
+// A fake best_state (1 per box) and an empty correct_text entry per box are
+// appended as well.
+void WERD_RES::CloneChoppedToRebuild() {
+  delete rebuild_word;
+  rebuild_word = new TWERD(*chopped_word);
+  SetupBoxWord();
+  const auto num_boxes = box_word->length();
+  best_state.reserve(num_boxes);
+  correct_text.reserve(num_boxes);
+  for (auto remaining = num_boxes; remaining > 0; --remaining) {
+    best_state.push_back(1);
+    correct_text.emplace_back("");
+  }
+}
+
+// Discards the current box_word and installs a new one that is built from
+// the rebuild_word and clipped to the original word.
+void WERD_RES::SetupBoxWord() {
+  delete box_word;
+  rebuild_word->ComputeBoundingBoxes();
+  auto *fresh_box_word = tesseract::BoxWord::CopyFromNormalized(rebuild_word);
+  fresh_box_word->ClipToOriginalWord(denorm.block(), word);
+  box_word = fresh_box_word;
+}
+
+// Sets up the script positions in the output best_choice by delegating to its
+// SetScriptPositions with small_caps and chopped_word. best_choice is used
+void WERD_RES::SetScriptPositions() { // without a null check.
+  best_choice->SetScriptPositions(small_caps, chopped_word);
+}
+// Forces all the blobs of every stored choice (the raw choice and each entry
+// of best_choices) to the given position. When a sub/superscript is
+// recognized as a separate word, it falls victim to the rule that a whole
+// word cannot be sub or superscript; this function overrides that rule.
+void WERD_RES::SetAllScriptPositions(tesseract::ScriptPos position) {
+  raw_choice->SetAllScriptPositions(position);
+  WERD_CHOICE_IT choice_it(&best_choices);
+  choice_it.mark_cycle_pt();
+  while (!choice_it.cycled_list()) {
+    choice_it.data()->SetAllScriptPositions(position);
+    choice_it.forward();
+  }
+}
+
+// Fakes a classification of the word from already-calculated results.
+// choices is an array of blob_count pointers to BLOB_CHOICE, one classifier
+// result per blob. Each BLOB_CHOICE is put into a new list on the diagonal of
+// a fresh ratings matrix, so the BLOB_CHOICEs are consumed by the word.
+// box_word must exist and hold exactly blob_count blobs.
+void WERD_RES::FakeClassifyWord(unsigned blob_count, BLOB_CHOICE **choices) {
+  ASSERT_HOST(box_word != nullptr);
+  ASSERT_HOST(blob_count == box_word->length());
+  // Drop any earlier results before building the fake ones.
+  ClearWordChoices();
+  ClearRatings();
+  ratings = new MATRIX(blob_count, 1);
+  for (unsigned blob = 0; blob < blob_count; ++blob) {
+    auto *single_choice = new BLOB_CHOICE_LIST;
+    BLOB_CHOICE_IT it(single_choice);
+    it.add_after_then_move(choices[blob]);
+    ratings->put(blob, blob, single_choice);
+  }
+  FakeWordFromRatings(TOP_CHOICE_PERM);
+  reject_map.initialise(blob_count);
+  // One piece per blob.
+  best_state.assign(blob_count, 1);
+  done = true;
+}
+
+// Builds a WERD_CHOICE from the first choice in each cell of the leading
+// diagonal of the ratings matrix, then hands it to LogNewRawChoice and
+// LogNewCookedChoice. A null or empty cell contributes UNICHAR_SPACE with
+// the bad rating/certainty defaults.
+void WERD_RES::FakeWordFromRatings(PermuterType permuter) {
+  const int diagonal_len = ratings->dimension();
+  auto *fake_choice = new WERD_CHOICE(uch_set, diagonal_len);
+  fake_choice->set_permuter(permuter);
+  for (int cell = 0; cell < diagonal_len; ++cell) {
+    // Defaults follow WERD_CHOICE::make_bad().
+    UNICHAR_ID top_id = UNICHAR_SPACE;
+    float top_rating = WERD_CHOICE::kBadRating;
+    float top_certainty = -FLT_MAX;
+    BLOB_CHOICE_LIST *cell_choices = ratings->get(cell, cell);
+    const bool has_choice = cell_choices != nullptr && !cell_choices->empty();
+    if (has_choice) {
+      BLOB_CHOICE_IT top_it(cell_choices);
+      BLOB_CHOICE *top = top_it.data();
+      top_id = top->unichar_id();
+      top_rating = top->rating();
+      top_certainty = top->certainty();
+    }
+    fake_choice->append_unichar_id_space_allocated(top_id, 1, top_rating,
+                                                   top_certainty);
+  }
+  LogNewRawChoice(fake_choice);
+  // The word takes ownership of fake_choice here.
+  LogNewCookedChoice(1, false, fake_choice);
+}
+
+// Rebuilds correct_text (for adaption/training) with one entry per unichar
+// of best_choice, each holding that unichar's string from the unicharset.
+void WERD_RES::BestChoiceToCorrectText() {
+  correct_text.clear();
+  ASSERT_HOST(best_choice != nullptr);
+  const unsigned num_chars = best_choice->length();
+  for (unsigned pos = 0; pos < num_chars; ++pos) {
+    correct_text.emplace_back(
+        uch_set->id_to_unichar(best_choice->unichar_id(pos)));
+  }
+}
+
+// Scans best_choice from left to right and merges blob i with blob i + 1
+// when the callback class_cb returns other than INVALID_UNICHAR_ID for their
+// unichar ids, AND the callback box_cb is nullptr or returns true for their
+// boxes. The merged blob is given the class returned from class_cb.
+// Returns true if anything was merged.
+bool WERD_RES::ConditionalBlobMerge(
+    const std::function<UNICHAR_ID(UNICHAR_ID, UNICHAR_ID)> &class_cb,
+    const std::function<bool(const TBOX &, const TBOX &)> &box_cb) {
+  ASSERT_HOST(best_choice->empty() || ratings != nullptr);
+  bool modified = false;
+  for (unsigned i = 0; i + 1 < best_choice->length(); ++i) {
+    UNICHAR_ID new_id =
+        class_cb(best_choice->unichar_id(i), best_choice->unichar_id(i + 1));
+    if (new_id != INVALID_UNICHAR_ID &&
+        (box_cb == nullptr ||
+         box_cb(box_word->BlobBox(i), box_word->BlobBox(i + 1)))) {
+      // Only best_choice is edited here; the raw choice should not be fixed.
+      best_choice->set_unichar_id(new_id, i);
+      modified = true;
+      MergeAdjacentBlobs(i); // Shortens best_choice, so length() is re-read.
+      const MATRIX_COORD &coord = best_choice->MatrixCoord(i);
+      if (!coord.Valid(*ratings)) {
+        ratings->IncreaseBandSize(coord.row + 1 - coord.col);
+      }
+      BLOB_CHOICE_LIST *blob_choices = GetBlobChoices(i);
+      if (FindMatchingChoice(new_id, blob_choices) == nullptr) {
+        // Insert a fake result so that new_id appears among the choices.
+        auto *blob_choice = new BLOB_CHOICE;
+        blob_choice->set_unichar_id(new_id);
+        BLOB_CHOICE_IT bc_it(blob_choices);
+        bc_it.add_before_then_move(blob_choice);
+      }
+    }
+  }
+  return modified;
+}
+
+// Collapses blobs index and index+1 of the result into one blob at index and
+// updates reject_map, best_choice, rebuild_word, box_word and best_state to
+// account for the change.
+void WERD_RES::MergeAdjacentBlobs(unsigned index) {
+  const unsigned next = index + 1;
+  // The reject map is only shortened while its length still matches the
+  // best choice.
+  if (reject_map.length() == best_choice->length()) {
+    reject_map.remove_pos(index);
+  }
+  best_choice->remove_unichar_id(next);
+  rebuild_word->MergeBlobs(index, next + 1);
+  box_word->MergeBoxes(index, next + 1);
+  if (next < best_state.size()) {
+    auto merged = best_state.begin() + index;
+    *merged += *(merged + 1);
+    best_state.erase(merged + 1);
+  }
+}
+
+// TODO(tkielbus) Decide between keeping this behavior here or modifying the
+// training data.
+
+// Utility function for fix_quotes.
+// Returns true if the UTF-8 character at signed_str, whose encoded length in
+// bytes is given by length, is a simple quote: an ASCII apostrophe or
+// backtick (length 1), or one of the curved single quotes encoded as
+// E2 80 98 / E2 80 99 (length 3). Any other length yields false.
+static bool is_simple_quote(const char *signed_str, int length) {
+  // Compare as unsigned bytes: plain char may be signed, and the UTF-8 bytes
+  // tested below are all >= 0x80.
+  const auto *str = reinterpret_cast<const unsigned char *>(signed_str);
+  if (length == 1) {
+    // Standard 1 byte quotes.
+    return str[0] == '\'' || str[0] == '`';
+  }
+  if (length == 3) {
+    // UTF-8 3 bytes curved quotes; both share the E2 80 prefix.
+    return str[0] == 0xe2 && str[1] == 0x80 &&
+           (str[2] == 0x98 || str[2] == 0x99);
+  }
+  return false;
+}
+
+// Callback helper for fix_quotes: yields the unichar id of the double quote
+// when id1 and id2 are both simple quotes, INVALID_UNICHAR_ID otherwise.
+UNICHAR_ID WERD_RES::BothQuotes(UNICHAR_ID id1, UNICHAR_ID id2) {
+  const char *first = uch_set->id_to_unichar(id1);
+  const char *second = uch_set->id_to_unichar(id2);
+  if (!is_simple_quote(first, strlen(first))) {
+    return INVALID_UNICHAR_ID;
+  }
+  if (!is_simple_quote(second, strlen(second))) {
+    return INVALID_UNICHAR_ID;
+  }
+  return uch_set->unichar_to_id("\"");
+}
+
+// Replaces each adjacent pair of simple quotes with one double quote.
+// Nothing is done when the unicharset lacks an enabled double quote.
+void WERD_RES::fix_quotes() {
+  const bool double_quote_allowed =
+      uch_set->contains_unichar("\"") &&
+      uch_set->get_enabled(uch_set->unichar_to_id("\""));
+  if (!double_quote_allowed) {
+    return; // Don't create it if it is disallowed.
+  }
+
+  ConditionalBlobMerge(
+      [this](UNICHAR_ID id1, UNICHAR_ID id2) { return BothQuotes(id1, id2); },
+      nullptr);
+}
+
+// Callback helper for fix_hyphens: yields the unichar id of "-" when id1 and
+// id2 are each a single-byte '-' or '~', INVALID_UNICHAR_ID otherwise.
+UNICHAR_ID WERD_RES::BothHyphens(UNICHAR_ID id1, UNICHAR_ID id2) {
+  const char *first = uch_set->id_to_unichar(id1);
+  const char *second = uch_set->id_to_unichar(id2);
+  const bool both_single_byte = strlen(first) == 1 && strlen(second) == 1;
+  if (!both_single_byte) {
+    return INVALID_UNICHAR_ID;
+  }
+  const bool first_is_dash = first[0] == '-' || first[0] == '~';
+  const bool second_is_dash = second[0] == '-' || second[0] == '~';
+  if (first_is_dash && second_is_dash) {
+    return uch_set->unichar_to_id("-");
+  }
+  return INVALID_UNICHAR_ID;
+}
+
+// Callback helper for fix_hyphens: true when the right edge of box1 reaches
+// or passes the left edge of box2. The boxes are assumed (not checked) to be
+// on the same textline and in order, as for a chopped em dash.
+bool WERD_RES::HyphenBoxesOverlap(const TBOX &box1, const TBOX &box2) {
+  const auto first_right = box1.right();
+  const auto second_left = box2.left();
+  return second_left <= first_right;
+}
+
+// Replaces each adjacent pair of hyphens with a single hyphen when their
+// bounding boxes touch; typically a long dash that has been segmented.
+// Nothing is done when the unicharset lacks an enabled "-".
+void WERD_RES::fix_hyphens() {
+  const bool hyphen_allowed =
+      uch_set->contains_unichar("-") &&
+      uch_set->get_enabled(uch_set->unichar_to_id("-"));
+  if (!hyphen_allowed) {
+    return; // Don't create it if it is disallowed.
+  }
+
+  ConditionalBlobMerge(
+      [this](UNICHAR_ID id1, UNICHAR_ID id2) { return BothHyphens(id1, id2); },
+      [this](const TBOX &box1, const TBOX &box2) {
+        return HyphenBoxesOverlap(box1, box2);
+      });
+}
+
+// Callback helper for merge_tess_fails: yields the space's unichar id when
+// id1 and id2 are both the space, INVALID_UNICHAR_ID otherwise.
+UNICHAR_ID WERD_RES::BothSpaces(UNICHAR_ID id1, UNICHAR_ID id2) {
+  if (id1 != id2) {
+    return INVALID_UNICHAR_ID;
+  }
+  if (id1 != uch_set->unichar_to_id(" ")) {
+    return INVALID_UNICHAR_ID;
+  }
+  return id1;
+}
+
+// Merges each adjacent pair of tess failures (spaces) into a single one.
+// If anything was merged, the reject map and box word are asserted to have
+// the same length as the best choice.
+void WERD_RES::merge_tess_fails() {
+  const bool merged = ConditionalBlobMerge(
+      [this](UNICHAR_ID id1, UNICHAR_ID id2) { return BothSpaces(id1, id2); },
+      nullptr);
+  if (!merged) {
+    return;
+  }
+  const unsigned len = best_choice->length();
+  ASSERT_HOST(reject_map.length() == len);
+  ASSERT_HOST(box_word->length() == len);
+}
+
+// Reports whether the count pieces starting at start are all natural
+// connected components, ie none of the seams between them has any split.
+// Seam indices outside seam_array, and null seams, are ignored.
+bool WERD_RES::PiecesAllNatural(int start, int count) const {
+  // The seams between the pieces are those at [start, start + count - 1).
+  const int seam_end = start + count - 1;
+  for (int s = start; s < seam_end; ++s) {
+    if (s < 0 || static_cast<size_t>(s) >= seam_array.size()) {
+      continue;
+    }
+    SEAM *candidate = seam_array[s];
+    if (candidate == nullptr) {
+      continue;
+    }
+    if (candidate->HasAnySplits()) {
+      return false;
+    }
+  }
+  return true;
+}
+
+WERD_RES::~WERD_RES() { // Destructor: all cleanup is delegated to Clear().
+  Clear();
+}
+
+// Releases everything held by this WERD_RES: the word (deleted only when
+// combination is set), the blamer_bundle and all recognition results.
+void WERD_RES::Clear() {
+  WERD *owned_word = combination ? word : nullptr;
+  word = nullptr;
+  delete owned_word;
+  delete blamer_bundle;
+  blamer_bundle = nullptr;
+  ClearResults();
+}
+
+// Throws away every recognition result, leaving the result fields empty.
+// The word pointer is not touched, and a blamer_bundle (if any) is kept but
+// has its own results cleared.
+void WERD_RES::ClearResults() {
+  // Flags, counters and pointers that are only reset, not deleted.
+  done = false;
+  fontinfo = nullptr;
+  fontinfo2 = nullptr;
+  fontinfo_id_count = 0;
+  fontinfo_id2_count = 0;
+  blob_row = nullptr;
+  // Objects that are deleted here.
+  delete bln_boxes;
+  bln_boxes = nullptr;
+  delete chopped_word;
+  chopped_word = nullptr;
+  delete rebuild_word;
+  rebuild_word = nullptr;
+  delete box_word;
+  box_word = nullptr;
+  // Containers; the seams are deleted before their container is emptied.
+  for (SEAM *seam : seam_array) {
+    delete seam;
+  }
+  seam_array.clear();
+  best_state.clear();
+  correct_text.clear();
+  blob_widths.clear();
+  blob_gaps.clear();
+  ClearRatings();
+  ClearWordChoices();
+  if (blamer_bundle != nullptr) {
+    blamer_bundle->ClearResults();
+  }
+}
+// Drops every word choice: raw_choice and ep_choice are deleted,
+// best_choices is emptied, and best_choice is reset to null without being
+// deleted here.
+void WERD_RES::ClearWordChoices() {
+  best_choice = nullptr;
+  best_choices.clear();
+  delete raw_choice;
+  raw_choice = nullptr;
+  delete ep_choice;
+  ep_choice = nullptr;
+}
+// Deletes the ratings matrix, after deleting the pointers stored in it, and
+// resets ratings to null. Does nothing when there is no matrix.
+void WERD_RES::ClearRatings() {
+  if (ratings == nullptr) {
+    return;
+  }
+  ratings->delete_matrix_pointers();
+  delete ratings;
+  ratings = nullptr;
+}
+
+int PAGE_RES_IT::cmp(const PAGE_RES_IT &other) const { // -1/0/1: this is before/at/after other.
+  ASSERT_HOST(page_res == other.page_res);
+  if (other.block_res == nullptr) {
+    // other points to the end of the page, which sorts after everything.
+    if (block_res == nullptr) {
+      return 0;
+    }
+    return -1;
+  }
+  if (block_res == nullptr) {
+    return 1; // we point to the end of the page.
+  }
+  if (block_res == other.block_res) {
+    if (other.row_res == nullptr || row_res == nullptr) {
+      // this should only happen if we hit an image block.
+      return 0;
+    }
+    if (row_res == other.row_res) {
+      // we point to the same block and row, so compare the words.
+      ASSERT_HOST(other.word_res != nullptr && word_res != nullptr);
+      if (word_res == other.word_res) {
+        // we point to the same word!
+        return 0;
+      }
+
+      WERD_RES_IT word_res_it(&row_res->word_res_list);
+      for (word_res_it.mark_cycle_pt(); !word_res_it.cycled_list();
+           word_res_it.forward()) { // Whichever word is met first is earlier.
+        if (word_res_it.data() == word_res) {
+          return -1;
+        } else if (word_res_it.data() == other.word_res) {
+          return 1;
+        }
+      }
+      ASSERT_HOST("Error: Incomparable PAGE_RES_ITs" == nullptr); // Always fails: a literal is never null.
+    }
+
+    // we both point to the same block, but different rows: first row met wins.
+    ROW_RES_IT row_res_it(&block_res->row_res_list);
+    for (row_res_it.mark_cycle_pt(); !row_res_it.cycled_list();
+         row_res_it.forward()) {
+      if (row_res_it.data() == row_res) {
+        return -1;
+      } else if (row_res_it.data() == other.row_res) {
+        return 1;
+      }
+    }
+    ASSERT_HOST("Error: Incomparable PAGE_RES_ITs" == nullptr);
+  }
+
+  // We point to different blocks: the first block met in page order wins.
+  BLOCK_RES_IT block_res_it(&page_res->block_res_list);
+  for (block_res_it.mark_cycle_pt(); !block_res_it.cycled_list();
+       block_res_it.forward()) {
+    if (block_res_it.data() == block_res) {
+      return -1;
+    } else if (block_res_it.data() == other.block_res) {
+      return 1;
+    }
+  }
+  // Shouldn't happen: neither block was found on the page.
+  ASSERT_HOST("Error: Incomparable PAGE_RES_ITs" == nullptr);
+  return 0;
+}
+
+// Creates a WERD_RES for new_word, marks it as a combination (which owns
+// new_word), copies the simple fields from clone_res and inserts it in the
+// current row just before the current word. The new WERD_RES is returned
+// for further setup with best_choice etc.
+WERD_RES *PAGE_RES_IT::InsertSimpleCloneWord(const WERD_RES &clone_res,
+                                             WERD *new_word) {
+  auto *inserted = new WERD_RES(new_word);
+  inserted->CopySimpleFields(clone_res);
+  inserted->combination = true;
+  // Walk the ROW_RES list to the current word: that is the insertion point.
+  WERD_RES_IT pos_it(&row()->word_res_list);
+  pos_it.mark_cycle_pt();
+  while (!pos_it.cycled_list() && pos_it.data() != word_res) {
+    pos_it.forward();
+  }
+  ASSERT_HOST(!pos_it.cycled_list());
+  pos_it.add_before_then_move(inserted);
+  if (pos_it.at_first()) {
+    // The inserted word is now the first word, so reset the member iterator
+    // so that it detects the cycled_list state correctly.
+    ResetWordIterator();
+  }
+  return inserted;
+}
+
+// Helper computes the boundaries between blobs in the word, appending one per
+// best_state entry to blob_ends. The blob bounds are likely very poor if they
+// come from LSTM (one pixel per character), so midpoints between them are used.
+static void ComputeBlobEnds(const WERD_RES &word, const TBOX &clip_box,
+                            C_BLOB_LIST *next_word_blobs,
+                            std::vector<int> *blob_ends) {
+  C_BLOB_IT blob_it(word.word->cblob_list());
+  for (int length : word.best_state) {
+    // Get the bounding box of the fake blobs (length of them are combined).
+    TBOX blob_box = blob_it.data()->bounding_box();
+    blob_it.forward();
+    for (int b = 1; b < length; ++b) {
+      blob_box += blob_it.data()->bounding_box();
+      blob_it.forward();
+    }
+    // This blob_box is unreliable, so for now we are only looking for the
+    // boundaries between them.
+    int blob_end = INT32_MAX; // Kept when there is no following blob at all.
+    if (!blob_it.at_first() || next_word_blobs != nullptr) {
+      if (blob_it.at_first()) { // Wrapped around: continue in the next word.
+        blob_it.set_to_list(next_word_blobs);
+      }
+      blob_end = (blob_box.right() + blob_it.data()->bounding_box().left()) / 2;
+    }
+    blob_end = ClipToRange<int>(blob_end, clip_box.left(), clip_box.right());
+    blob_ends->push_back(blob_end);
+  }
+  blob_ends->back() = clip_box.right(); // The last end is always the clip edge.
+}
+
+// Helper computes the bounds of words[w_index] by restricting it to the
+// part_of_combo words following w_it that significantly overlap it.
+static TBOX ComputeWordBounds(const tesseract::PointerVector<WERD_RES> &words,
+                              int w_index, TBOX prev_box, WERD_RES_IT w_it) {
+  constexpr int kSignificantOverlapFraction = 4;
+  TBOX clipped_box;
+  TBOX current_box = words[w_index]->word->bounding_box();
+  TBOX next_box; // Stays default-constructed when there is no usable next word.
+  if (static_cast<size_t>(w_index + 1) < words.size() &&
+      words[w_index + 1] != nullptr && words[w_index + 1]->word != nullptr) {
+    next_box = words[w_index + 1]->word->bounding_box();
+  }
+  for (w_it.forward(); !w_it.at_first() && w_it.data()->part_of_combo; // NOTE(review): data() is dereferenced here
+       w_it.forward()) {                                               // before the null check just below.
+    if (w_it.data() == nullptr || w_it.data()->word == nullptr) {
+      continue;
+    }
+    TBOX w_box = w_it.data()->word->bounding_box();
+    int height_limit = std::min<int>(w_box.height(), w_box.width() / 2);
+    int width_limit = w_box.width() / kSignificantOverlapFraction;
+    int min_significant_overlap = std::max(height_limit, width_limit);
+    int overlap = w_box.intersection(current_box).width();
+    int prev_overlap = w_box.intersection(prev_box).width();
+    int next_overlap = w_box.intersection(next_box).width();
+    if (overlap > min_significant_overlap) {
+      if (prev_overlap > min_significant_overlap) {
+        // w_box also overlaps the previous word: use the LSTM word's left edge.
+        clipped_box.set_left(current_box.left());
+      } else if (next_overlap > min_significant_overlap) {
+        // w_box also overlaps the next word: use the LSTM word's right edge.
+        clipped_box.set_right(current_box.right());
+      } else {
+        clipped_box += w_box;
+      }
+    }
+  }
+  if (clipped_box.height() <= 0) { // No vertical extent yet: take current_box's.
+    clipped_box.set_top(current_box.top());
+    clipped_box.set_bottom(current_box.bottom());
+  }
+  if (clipped_box.width() <= 0) { // No horizontal extent: fall back to current_box.
+    clipped_box = current_box;
+  }
+  return clipped_box;
+}
+
+// Helper extracts the current blob of src_it and adds it after the current
+// position of dest_it. A blob that is not contained by clip_box is deleted
+// and replaced by a fake blob whose box has been clipped into clip_box.
+// Returns the box of the blob that was added.
+static TBOX MoveAndClipBlob(C_BLOB_IT *src_it, C_BLOB_IT *dest_it,
+                            const TBOX &clip_box) {
+  C_BLOB *blob = src_it->extract();
+  const TBOX blob_box = blob->bounding_box();
+  if (clip_box.contains(blob_box)) {
+    dest_it->add_after_then_move(blob);
+    return blob_box;
+  }
+  // Each edge is clipped on its own: left/bottom stop one short of the far
+  // edge of clip_box, right/top start one past its near edge.
+  const int left =
+      ClipToRange<int>(blob_box.left(), clip_box.left(), clip_box.right() - 1);
+  const int right =
+      ClipToRange<int>(blob_box.right(), clip_box.left() + 1, clip_box.right());
+  const int top =
+      ClipToRange<int>(blob_box.top(), clip_box.bottom() + 1, clip_box.top());
+  const int bottom =
+      ClipToRange<int>(blob_box.bottom(), clip_box.bottom(), clip_box.top() - 1);
+  const TBOX clipped(left, bottom, right, top);
+  delete blob;
+  dest_it->add_after_then_move(C_BLOB::FakeBlob(clipped));
+  return clipped;
+}
+
+// Replaces the current WERD/WERD_RES with the given words, taking ownership of
+// them and leaving *words empty; an empty *words just deletes the current word.
+// The given words' fake blobs are replaced with real blobs where possible.
+void PAGE_RES_IT::ReplaceCurrentWord(
+    tesseract::PointerVector<WERD_RES> *words) {
+  if (words->empty()) {
+    DeleteCurrentWord();
+    return;
+  }
+  WERD_RES *input_word = word();
+  // Carry the BOL/EOL flags (or leading blanks) over from the input word.
+  if (input_word->word->flag(W_BOL)) {
+    (*words)[0]->word->set_flag(W_BOL, true);
+  } else {
+    (*words)[0]->word->set_blanks(input_word->word->space());
+  }
+  words->back()->word->set_flag(W_EOL, input_word->word->flag(W_EOL));
+
+  // Move the blobs from the input word to the new set of words.
+  // If the input word_res is a combination, then the replacements will also be
+  // combinations, and will own their own words. If the input word_res is not a
+  // combination, then the final replacements will not be either, (although it
+  // is allowed for the input words to be combinations) and their words
+  // will get put on the row list. This maintains the ownership rules.
+  WERD_IT w_it(row()->row->word_list());
+  if (!input_word->combination) {
+    for (w_it.mark_cycle_pt(); !w_it.cycled_list(); w_it.forward()) {
+      WERD *word = w_it.data();
+      if (word == input_word->word) {
+        break;
+      }
+    }
+    // w_it is now set to the input_word's word.
+    ASSERT_HOST(!w_it.cycled_list());
+  }
+  // Find the input word in the ROW_RES; replacements are inserted before it.
+  WERD_RES_IT wr_it(&row()->word_res_list);
+  for (wr_it.mark_cycle_pt(); !wr_it.cycled_list(); wr_it.forward()) {
+    WERD_RES *word = wr_it.data();
+    if (word == input_word) {
+      break;
+    }
+  }
+  ASSERT_HOST(!wr_it.cycled_list());
+  // Since we only have an estimate of the bounds between blobs, use the blob
+  // x-middle as the determiner of where to put the blobs
+  C_BLOB_IT src_b_it(input_word->word->cblob_list());
+  src_b_it.sort(&C_BLOB::SortByXMiddle);
+  C_BLOB_IT rej_b_it(input_word->word->rej_cblob_list());
+  rej_b_it.sort(&C_BLOB::SortByXMiddle);
+  TBOX clip_box; // Holds the previous word's bounds when passed as prev_box below.
+  for (size_t w = 0; w < words->size(); ++w) {
+    WERD_RES *word_w = (*words)[w];
+    clip_box = ComputeWordBounds(*words, w, clip_box, wr_it_of_current_word);
+    // Compute blob boundaries, looking into the next word for the last one.
+    std::vector<int> blob_ends;
+    C_BLOB_LIST *next_word_blobs =
+        w + 1 < words->size() ? (*words)[w + 1]->word->cblob_list() : nullptr;
+    ComputeBlobEnds(*word_w, clip_box, next_word_blobs, &blob_ends);
+    // Remove the fake blobs on the current word, but keep safe for back-up if
+    // no blob can be found.
+    C_BLOB_LIST fake_blobs;
+    C_BLOB_IT fake_b_it(&fake_blobs);
+    fake_b_it.add_list_after(word_w->word->cblob_list());
+    fake_b_it.move_to_first();
+    word_w->word->cblob_list()->clear();
+    C_BLOB_IT dest_it(word_w->word->cblob_list());
+    // Build the box word as we move the blobs.
+    auto *box_word = new tesseract::BoxWord;
+    for (size_t i = 0; i < blob_ends.size(); ++i, fake_b_it.forward()) {
+      int end_x = blob_ends[i];
+      TBOX blob_box;
+      // Add the blobs (real, then rejected) whose x-middle is before end_x.
+      while (!src_b_it.empty() &&
+             src_b_it.data()->bounding_box().x_middle() < end_x) {
+        blob_box += MoveAndClipBlob(&src_b_it, &dest_it, clip_box);
+        src_b_it.forward();
+      }
+      while (!rej_b_it.empty() &&
+             rej_b_it.data()->bounding_box().x_middle() < end_x) {
+        blob_box += MoveAndClipBlob(&rej_b_it, &dest_it, clip_box);
+        rej_b_it.forward();
+      }
+      if (blob_box.null_box()) {
+        // Nothing was moved: use the original fake blob as a back-up.
+        blob_box = MoveAndClipBlob(&fake_b_it, &dest_it, clip_box);
+      }
+      box_word->InsertBox(i, blob_box);
+    }
+    delete word_w->box_word;
+    word_w->box_word = box_word;
+    if (!input_word->combination) {
+      // Insert word_w->word into the ROW. It doesn't own its word, so the
+      // ROW needs to own it.
+      w_it.add_before_stay_put(word_w->word);
+      word_w->combination = false;
+    }
+    (*words)[w] = nullptr; // We are taking ownership.
+    wr_it.add_before_stay_put(word_w);
+  }
+  // We have taken ownership of the words; only null entries remain.
+  words->clear();
+  // Delete the current word, which has been replaced. We could just call
+  // DeleteCurrentWord, but that would iterate both lists again, and we know
+  // we are already in the right place.
+  if (!input_word->combination) {
+    delete w_it.extract();
+  }
+  delete wr_it.extract();
+  ResetWordIterator();
+}
+
+// Removes the current word from the page: its WERD_RES is extracted from the
+// row's result list and deleted, and, unless the WERD_RES is a combination,
+// its WERD is extracted from the row's word list and deleted as well.
+void PAGE_RES_IT::DeleteCurrentWord() {
+  // part_of_combos are NEVER iterated by the normal iterator, so the current
+  // word should never be one.
+  ASSERT_HOST(!word_res->part_of_combo);
+  if (!word_res->combination) {
+    // Combinations own their own word, so it would not be found on the row's
+    // word_list. For anything else, find the WERD there and delete it.
+    WERD_IT word_it(row()->row->word_list());
+    word_it.mark_cycle_pt();
+    while (!word_it.cycled_list() && word_it.data() != word_res->word) {
+      word_it.forward();
+    }
+    ASSERT_HOST(!word_it.cycled_list());
+    delete word_it.extract();
+  }
+  // Now find the WERD_RES itself in the ROW_RES and remove it.
+  WERD_RES_IT res_it(&row()->word_res_list);
+  res_it.mark_cycle_pt();
+  while (!res_it.cycled_list() && res_it.data() != word_res) {
+    res_it.forward();
+  }
+  ASSERT_HOST(!res_it.cycled_list());
+  word_res = nullptr;
+  delete res_it.extract();
+  ResetWordIterator();
+}
+
+// Flags the current word as a fuzzy space (W_FUZZY_SP) unless it already has
+// one of the fuzzy flags. For a combination, the word that follows it in the
+// row's list (asserted to be a part_of_combo) gets the same flag.
+void PAGE_RES_IT::MakeCurrentWordFuzzy() {
+  WERD *target = word_res->word;
+  if (target->flag(W_FUZZY_SP) || target->flag(W_FUZZY_NON)) {
+    return; // Already fuzzy: nothing to do.
+  }
+  target->set_flag(W_FUZZY_SP, true);
+  if (!word_res->combination) {
+    return;
+  }
+  // The next word should be the corresponding part of combo, but we have
+  // already stepped past it, so find it by search.
+  WERD_RES_IT combo_it(&row()->word_res_list);
+  combo_it.mark_cycle_pt();
+  while (!combo_it.cycled_list() && combo_it.data() != word_res) {
+    combo_it.forward();
+  }
+  combo_it.forward();
+  ASSERT_HOST(combo_it.data()->part_of_combo);
+  target = combo_it.data()->word;
+  ASSERT_HOST(!target->flag(W_FUZZY_SP) && !target->flag(W_FUZZY_NON));
+  target->set_flag(W_FUZZY_SP, true);
+}
+
+/*************************************************************************
+ * PAGE_RES_IT::start_page
+ *
+ * Positions the iterator at the start of the page and returns the first
+ * word. All previous/current/next pointers are reset, then internal_forward
+ * is called twice: the first call (starting a new block) fills the next_*
+ * pointers and the second shifts them into the current position.
+ * empty_ok is passed through to internal_forward.
+ *************************************************************************/
+
+WERD_RES *PAGE_RES_IT::start_page(bool empty_ok) {
+  block_res_it.set_to_list(&page_res->block_res_list);
+  block_res_it.mark_cycle_pt();
+  prev_block_res = block_res = next_block_res = nullptr;
+  prev_row_res = row_res = next_row_res = nullptr;
+  prev_word_res = word_res = next_word_res = nullptr;
+  internal_forward(true, empty_ok);
+  WERD_RES *first_word = internal_forward(false, empty_ok);
+  return first_word;
+}
+
+// Recovers from operations on the current word, such as in
+// InsertSimpleCloneWord, ReplaceCurrentWord and DeleteCurrentWord.
+// Resets the word_res_it so that it is one past the next_word_res, as
+// it should be after internal_forward. If next_row_res != row_res,
+// then the next_word_res is in the next row, so there is no need to do
+// anything to word_res_it, but it is still a good idea to reset the pointers
+// word_res and prev_word_res, which are still in the current row.
+void PAGE_RES_IT::ResetWordIterator() {
+  if (row_res == next_row_res) {
+    // Reset the member iterator so it can move forward and detect the
+    // cycled_list state correctly.
+    word_res_it.move_to_first();
+    for (word_res_it.mark_cycle_pt();
+         !word_res_it.cycled_list() && word_res_it.data() != next_word_res;
+         word_res_it.forward()) { // Walk the row up to next_word_res.
+      if (!word_res_it.data()->part_of_combo) {
+        if (prev_row_res == row_res) {
+          prev_word_res = word_res;
+        }
+        word_res = word_res_it.data(); // Ends as the last such word before next_word_res.
+      }
+    }
+    ASSERT_HOST(!word_res_it.cycled_list());
+    wr_it_of_next_word = word_res_it;
+    word_res_it.forward(); // Leave the member iterator one past next_word_res.
+  } else {
+    // word_res_it is OK, but reset word_res and prev_word_res if needed.
+    WERD_RES_IT wr_it(&row_res->word_res_list);
+    for (wr_it.mark_cycle_pt(); !wr_it.cycled_list(); wr_it.forward()) {
+      if (!wr_it.data()->part_of_combo) {
+        if (prev_row_res == row_res) {
+          prev_word_res = word_res;
+        }
+        word_res = wr_it.data(); // Ends as the last non-combo-part word of the row.
+      }
+    }
+  }
+}
+
+/*************************************************************************
+ * PAGE_RES_IT::internal_forward
+ *
+ * Find the next word on the page. If empty_ok is true, then non-text blocks
+ * and text blocks with no text are visited as if they contain a single
+ * imaginary word in a single imaginary row. (word() and row() both return
+ * nullptr in such a block and the return value is nullptr.) If empty_ok is
+ * false, each real word is visited and empty and non-text blocks and rows are
+ * skipped. new_block is used to initialize the iterators for a new block. The
+ * iterator maintains pointers to block, row and word for the previous, current
+ * and next words. These are correct, regardless of block/row boundaries.
+ * nullptr values denote start and end of the page.
+ *************************************************************************/
+
+WERD_RES *PAGE_RES_IT::internal_forward(bool new_block, bool empty_ok) {
+  bool new_row = false;
+
+  prev_block_res = block_res; // Shift current -> previous and next -> current.
+  prev_row_res = row_res;
+  prev_word_res = word_res;
+  block_res = next_block_res;
+  row_res = next_row_res;
+  word_res = next_word_res;
+  wr_it_of_current_word = wr_it_of_next_word;
+  next_block_res = nullptr; // Stay null if the end of the page is reached.
+  next_row_res = nullptr;
+  next_word_res = nullptr;
+
+  while (!block_res_it.cycled_list()) {
+    if (new_block) {
+      new_block = false;
+      row_res_it.set_to_list(&block_res_it.data()->row_res_list);
+      row_res_it.mark_cycle_pt();
+      if (row_res_it.empty() && empty_ok) { // Visit the row-less block itself.
+        next_block_res = block_res_it.data();
+        break;
+      }
+      new_row = true;
+    }
+    while (!row_res_it.cycled_list()) {
+      if (new_row) {
+        new_row = false;
+        word_res_it.set_to_list(&row_res_it.data()->word_res_list);
+        word_res_it.mark_cycle_pt();
+      }
+      // Skip any part_of_combo words.
+      while (!word_res_it.cycled_list() && word_res_it.data()->part_of_combo) {
+        word_res_it.forward();
+      }
+      if (!word_res_it.cycled_list()) {
+        next_block_res = block_res_it.data();
+        next_row_res = row_res_it.data();
+        next_word_res = word_res_it.data();
+        wr_it_of_next_word = word_res_it;
+        word_res_it.forward(); // Leave word_res_it one past next_word_res.
+        goto foundword;
+      }
+      // end of row reached
+      row_res_it.forward();
+      new_row = true;
+    }
+    // end of block reached
+    block_res_it.forward();
+    new_block = true;
+  }
+foundword:
+  // Update prev_word_best_choice pointer (cleared if there is no previous word).
+  if (page_res != nullptr && page_res->prev_word_best_choice != nullptr) {
+    *page_res->prev_word_best_choice = (new_block || prev_word_res == nullptr)
+                                           ? nullptr
+                                           : prev_word_res->best_choice;
+  }
+  return word_res;
+}
+
+/*************************************************************************
+ * PAGE_RES_IT::restart_row()
+ *
+ * Move to the first word of the current row by restarting the page and
+ * stepping forward until the current row is reached again. Returns
+ * nullptr, without moving, when there is no current row.
+ *************************************************************************/
+WERD_RES *PAGE_RES_IT::restart_row() {
+  ROW_RES *target_row = this->row();
+  if (target_row == nullptr) {
+    return nullptr;
+  }
+  restart_page();
+  while (this->row() != target_row) {
+    forward();
+  }
+  return word();
+}
+
+/*************************************************************************
+ * PAGE_RES_IT::forward_paragraph
+ *
+ * Move to the beginning of the next paragraph, allowing empty blocks.
+ * Words are skipped for as long as the next word is in the same block and
+ * its row has the same para() as the current row.
+ *************************************************************************/
+
+WERD_RES *PAGE_RES_IT::forward_paragraph() {
+  for (;;) {
+    if (block_res != next_block_res) {
+      break;
+    }
+    if (next_row_res == nullptr || next_row_res->row == nullptr) {
+      break;
+    }
+    if (row_res->row->para() != next_row_res->row->para()) {
+      break;
+    }
+    internal_forward(false, true);
+  }
+  return internal_forward(false, true);
+}
+
+/*************************************************************************
+ * PAGE_RES_IT::forward_block
+ *
+ * Move to the beginning of the next block, allowing empty blocks.
+ * Returns the word at the new position, which is nullptr at the end of the
+ * page. Calling this when already at the end of the page is safe and
+ * returns nullptr.
+ *************************************************************************/
+
+WERD_RES *PAGE_RES_IT::forward_block() {
+  // Skip the remaining words of the current block. When block_res and
+  // next_block_res are both nullptr the iterator is at the end of the page
+  // (or the page is empty) and internal_forward cannot change either of
+  // them, so the block_res check is needed for the loop to terminate.
+  while (block_res != nullptr && block_res == next_block_res) {
+    internal_forward(false, true);
+  }
+  return internal_forward(false, true);
+}
+
+// Adds the current word's character count and reject count (both taken from
+// its reject_map) to the page, block and row statistics. When every
+// character of the word is rejected, the reject count is also added to the
+// row's whole_word_rej_count.
+void PAGE_RES_IT::rej_stat_word() {
+  const int16_t chars_in_word = word_res->reject_map.length();
+  const int16_t rejects_in_word = word_res->reject_map.reject_count();
+
+  page_res->char_count += chars_in_word;
+  page_res->rej_count += rejects_in_word;
+
+  block_res->char_count += chars_in_word;
+  block_res->rej_count += rejects_in_word;
+
+  row_res->char_count += chars_in_word;
+  row_res->rej_count += rejects_in_word;
+  if (chars_in_word == rejects_in_word) {
+    row_res->whole_word_rej_count += rejects_in_word;
+  }
+}
+
+} // namespace tesseract
author	Franz Glasner <fzglas.hg@dom66.de>
date	Mon, 15 Sep 2025 11:43:07 +0200
parents
children