diff mupdf-source/thirdparty/tesseract/src/wordrec/lm_pain_points.h @ 2:b50eed0cc0ef upstream

ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4. The directory name has changed: no version number in the expanded directory now.
author Franz Glasner <fzglas.hg@dom66.de>
date Mon, 15 Sep 2025 11:43:07 +0200
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/mupdf-source/thirdparty/tesseract/src/wordrec/lm_pain_points.h	Mon Sep 15 11:43:07 2025 +0200
@@ -0,0 +1,133 @@
+///////////////////////////////////////////////////////////////////////
+// File:        lm_pain_points.h
+// Description: Functions that utilize the knowledge about the properties
+//              of the paths explored by the segmentation search in order
+//              to generate "pain points" - the locations in the ratings
+//              matrix which should be classified next.
+// Author:      Rika Antonova
+//
+// (C) Copyright 2012, Google Inc.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+///////////////////////////////////////////////////////////////////////
+
+#ifndef TESSERACT_WORDREC_PAIN_POINTS_H_
+#define TESSERACT_WORDREC_PAIN_POINTS_H_
+
+#include "genericheap.h" // for GenericHeap
+#include "matrix.h"      // for MATRIX_COORD (ptr only), MatrixCoordPair
+#include "stopper.h"     // for DANGERR
+
+namespace tesseract {
+
+class Dict;               // forward declaration: this header only holds a `const Dict *`
+struct ViterbiStateEntry; // forward declaration: this header only passes it by pointer
+class WERD_RES;           // forward declaration: this header only passes it by pointer
+
+// Heap of pain points used to decide where to chop/join (LMPainPoints keeps one per type).
+using PainPointHeap = GenericHeap<MatrixCoordPair>;
+
+// Types of pain points, in decreasing order of importance; values are also used as array indices.
+enum LMPainPointsType {
+  LM_PPTYPE_BLAMER, // 0: first, i.e. most important under the ordering stated above
+  LM_PPTYPE_AMBIG,  // presumably what GenerateFromAmbigs() produces - TODO confirm in the .cpp
+  LM_PPTYPE_PATH,   // presumably what GenerateFromPath() produces - TODO confirm in the .cpp
+  LM_PPTYPE_SHAPE,  // last real type, i.e. least important under the ordering stated above
+
+  LM_PPTYPE_NUM // count of real types (4), not a type; Deque() returns it for an empty queue
+};
+
+static const char *const LMPainPointsTypeName[] = { // printable names, indexed by LMPainPointsType
+    "LM_PPTYPE_BLAMER",
+    "LM_PPTYPE_AMBIGS", // NOTE(review): spelled AMBIGS, but the enumerator is LM_PPTYPE_AMBIG
+    "LM_PPTYPE_PATH",
+    "LM_PPTYPE_SHAPE",
+}; // 4 entries, none for LM_PPTYPE_NUM; `static` gives every including file its own copy
+
+class LMPainPoints { // Generates pain points and keeps them in one heap per LMPainPointsType.
+public:
+  static const float kDefaultPainPointPriorityAdjustment; // value is defined outside this header
+  // If there is a significant drop in character ngram probability or a
+  // dangerous ambiguity, the thresholds on which blob combinations can be
+  // classified are made looser (value is defined outside this header).
+  static const float kLooseMaxCharWhRatio;
+  // Returns a description of a pain point type: the LMPainPointsTypeName entry at index type.
+  static const char *PainPointDescription(LMPainPointsType type) {
+    return LMPainPointsTypeName[type]; // NOTE(review): unchecked; LM_PPTYPE_NUM is past the table
+  }
+
+  LMPainPoints(int max, float rat, bool fp, const Dict *d, int deb) // only stores its arguments
+      : max_heap_size_(max)     // max: maximum number of points to keep in the heap
+      , max_char_wh_ratio_(rat) // rat: maximum character width/height ratio
+      , fixed_pitch_(fp)        // fp: true if fixed pitch should be assumed
+      , dict_(d)                // d: dictionary pointer, stored as given (nothing is copied)
+      , debug_level_(deb) {}    // deb: debug level for print statements
+  ~LMPainPoints() = default;
+
+  // Returns true if the heap for pp_type is not empty; pp_type is an unchecked array index.
+  inline bool HasPainPoints(LMPainPointsType pp_type) const {
+    return !pain_points_heaps_[pp_type].empty();
+  }
+
+  // Dequeues the next pain point from the pain points queue and copies
+  // its contents and priority to *pp and *priority.
+  // Returns LM_PPTYPE_NUM if the pain points queue is empty, otherwise the type.
+  LMPainPointsType Deque(MATRIX_COORD *pp, float *priority);
+
+  // Clears every per-type pain points heap.
+  void Clear() {
+    for (auto &pain_points_heap : pain_points_heaps_) {
+      pain_points_heap.clear();
+    }
+  }
+
+  // For each cell, generate a "pain point" if the cell is not classified
+  // and has a left or right neighbor that was classified.
+  void GenerateInitial(WERD_RES *word_res);
+
+  // Generate pain points from the given path.
+  void GenerateFromPath(float rating_cert_scale, ViterbiStateEntry *vse, WERD_RES *word_res);
+
+  // Generate pain points from dangerous ambiguities in best choice.
+  void GenerateFromAmbigs(const DANGERR &fixpt, ViterbiStateEntry *vse, WERD_RES *word_res);
+
+  // Adds a pain point to classify the ratings matrix cell (col, row); the matrix
+  // is presumably the one held by word_res (TODO confirm against the .cpp).
+  // Returns true if a new pain point was added to an appropriate heap.
+  // Pain point priority is set to special_priority for pain points of LM_PPTYPE_AMBIG
+  // or LM_PPTYPE_PATH; for other pain points AssociateStats::gap_sum is used.
+  bool GeneratePainPoint(int col, int row, LMPainPointsType pp_type, float special_priority,
+                         bool ok_to_extend, float max_char_wh_ratio, WERD_RES *word_res); // last flags: see .cpp
+
+  // Adjusts the pain point coordinates to cope with expansion of the ratings
+  // matrix due to a split of the blob with the given index.
+  void RemapForSplit(int index);
+
+private:
+  // Priority queues containing pain points generated by the language model,
+  // one per LMPainPointsType. The priority is set by the language model components;
+  // adjustments like seam cost and width priority are factored into the priority.
+  PainPointHeap pain_points_heaps_[LM_PPTYPE_NUM];
+  // Maximum number of points to keep in the heap.
+  int max_heap_size_;
+  // Maximum character width/height ratio.
+  float max_char_wh_ratio_;
+  // Set to true if fixed pitch should be assumed.
+  bool fixed_pitch_;
+  // Cached pointer to dictionary; the defaulted destructor does not free it.
+  const Dict *dict_;
+  // Debug level for print statements.
+  int debug_level_;
+};
+
+} // namespace tesseract
+
+#endif // TESSERACT_WORDREC_PAIN_POINTS_H_