diff mupdf-source/thirdparty/tesseract/src/classify/shapetable.h @ 2:b50eed0cc0ef upstream

ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4. The directory name has changed: no version number in the expanded directory now.
author Franz Glasner <fzglas.hg@dom66.de>
date Mon, 15 Sep 2025 11:43:07 +0200
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/mupdf-source/thirdparty/tesseract/src/classify/shapetable.h	Mon Sep 15 11:43:07 2025 +0200
@@ -0,0 +1,375 @@
+// Copyright 2010 Google Inc. All Rights Reserved.
+// Author: rays@google.com (Ray Smith)
+///////////////////////////////////////////////////////////////////////
+// File:        shapetable.h
+// Description: Class to map a classifier shape index to unicharset
+//              indices and font indices.
+// Author:      Ray Smith
+//
+// (C) Copyright 2010, Google Inc.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+///////////////////////////////////////////////////////////////////////
+
+#ifndef TESSERACT_CLASSIFY_SHAPETABLE_H_
+#define TESSERACT_CLASSIFY_SHAPETABLE_H_
+
+#include "bitvector.h"
+#include "fontinfo.h"
+#include "genericheap.h"
+#include "intmatcher.h"
+#include "tesserrstream.h"  // for tesserr
+
+namespace tesseract {
+
+class UNICHARSET;
+class ShapeTable;
+
+// Simple struct to hold a single classifier unichar selection, a corresponding
+// rating, and a list of appropriate fonts.
+struct UnicharRating {
+  UnicharRating() : unichar_id(0), rating(0.0f), adapted(false), config(0), feature_misses(0) {}
+  UnicharRating(int u, float r)
+      : unichar_id(u), rating(r), adapted(false), config(0), feature_misses(0) {}
+
+  // Print debug info.
+  void Print() const {
+    tesserr << "Unichar-id=" << unichar_id << ", rating=" << rating
+            << ", adapted=" << adapted << ", config=" << config
+            << ", misses=" << feature_misses << ", "
+            << fonts.size() << " fonts\n";
+  }
+
+  // Helper function to get the index of the first result with the required
+  // unichar_id. If the results are sorted by rating, this will also be the
+  // best result with the required unichar_id.
+  // Returns -1 if the unichar_id is not found
+  static int FirstResultWithUnichar(const std::vector<UnicharRating> &results,
+                                    UNICHAR_ID unichar_id);
+
+  // Index into some UNICHARSET table indicates the class of the answer.
+  UNICHAR_ID unichar_id;
+  // Rating from classifier with 1.0 perfect and 0.0 impossible.
+  // Call it a probability if you must.
+  float rating;
+  // True if this result is from the adaptive classifier.
+  bool adapted;
+  // Index of best matching font configuration of result.
+  uint8_t config;
+  // Number of features that were total misses - were liked by no classes.
+  uint16_t feature_misses;
+  // Unsorted collection of fontinfo ids and scores. Note that a raw result
+  // from the IntegerMatch will contain config ids, that require transforming
+  // to fontinfo ids via fontsets and (possibly) shapetable.
+  std::vector<ScoredFont> fonts;
+};
+
+// Classifier result from a low-level classification is an index into some
+// ShapeTable and a rating.
+struct ShapeRating {
+  ShapeRating() : shape_id(0), rating(0.0f), raw(0.0f), font(0.0f), joined(false), broken(false) {}
+  ShapeRating(int s, float r)
+      : shape_id(s), rating(r), raw(1.0f), font(0.0f), joined(false), broken(false) {}
+
+  // Helper function to get the index of the first result with the required
+  // unichar_id. If the results are sorted by rating, this will also be the
+  // best result with the required unichar_id.
+  // Returns -1 if the unichar_id is not found
+  static int FirstResultWithUnichar(const std::vector<ShapeRating> &results,
+                                    const ShapeTable &shape_table, UNICHAR_ID unichar_id);
+
+  // Index into some shape table indicates the class of the answer.
+  int shape_id;
+  // Rating from classifier with 1.0 perfect and 0.0 impossible.
+  // Call it a probability if you must.
+  float rating;
+  // Subsidiary rating that a classifier may use internally.
+  float raw;
+  // Subsidiary rating that a classifier may use internally.
+  float font;
+  // Flag indicating that the input may be joined.
+  bool joined;
+  // Flag indicating that the input may be broken (a fragment).
+  bool broken;
+};
+
+// Simple struct to hold an entry for a heap-based priority queue of
+// ShapeRating.
+struct ShapeQueueEntry {
+  ShapeQueueEntry() : result(ShapeRating(0, 0.0f)), level(0) {}
+  ShapeQueueEntry(const ShapeRating &rating, int level0) : result(rating), level(level0) {}
+
+  // Sort by decreasing rating and decreasing level for equal rating.
+  bool operator<(const ShapeQueueEntry &other) const {
+    if (result.rating > other.result.rating) {
+      return true;
+    }
+    if (result.rating == other.result.rating) {
+      return level > other.level;
+    }
+    return false;
+  }
+
+  // Output from classifier.
+  ShapeRating result;
+  // Which level in the tree did this come from?
+  int level;
+};
+using ShapeQueue = GenericHeap<ShapeQueueEntry>;
+
+// Simple struct to hold a set of fonts associated with a single unichar-id.
+// A vector of UnicharAndFonts makes a shape.
+struct UnicharAndFonts {
+  UnicharAndFonts() : unichar_id(0) {}
+  UnicharAndFonts(int uni_id, int font_id) : unichar_id(uni_id) {
+    font_ids.push_back(font_id);
+  }
+
+  // Writes to the given file. Returns false in case of error.
+  bool Serialize(FILE *fp) const;
+  // Reads from the given file. Returns false in case of error.
+  bool DeSerialize(TFile *fp);
+
+  // Sort function to sort a pair of UnicharAndFonts by unichar_id.
+  static int SortByUnicharId(const void *v1, const void *v2);
+  static bool StdSortByUnicharId(const UnicharAndFonts &v1, const UnicharAndFonts &v2);
+
+  std::vector<int32_t> font_ids;
+  int32_t unichar_id;
+};
+
+// A Shape is a collection of unichar-ids and a list of fonts associated with
+// each, organized as a vector of UnicharAndFonts. Conceptually a Shape is
+// a classifiable unit, and represents a group of characters or parts of
+// characters that have a similar or identical shape. Shapes/ShapeTables may
+// be organized hierarchically from identical shapes at the leaves to vaguely
+// similar shapes near the root.
+class TESS_API Shape {
+public:
+  Shape() : destination_index_(-1) {}
+
+  // Writes to the given file. Returns false in case of error.
+  bool Serialize(FILE *fp) const;
+  // Reads from the given file. Returns false in case of error.
+  bool DeSerialize(TFile *fp);
+
+  int destination_index() const {
+    return destination_index_;
+  }
+  void set_destination_index(int index) {
+    destination_index_ = index;
+  }
+  int size() const {
+    return unichars_.size();
+  }
+  // Returns a UnicharAndFonts entry for the given index, which must be
+  // in the range [0, size()).
+  const UnicharAndFonts &operator[](int index) const {
+    return unichars_[index];
+  }
+  // Sets the unichar_id of the given index to the new unichar_id.
+  void SetUnicharId(int index, int unichar_id) {
+    unichars_[index].unichar_id = unichar_id;
+  }
+  // Adds a font_id for the given unichar_id. If the unichar_id is not
+  // in the shape, it is added.
+  void AddToShape(int unichar_id, int font_id);
+  // Adds everything in other to this.
+  void AddShape(const Shape &other);
+  // Returns true if the shape contains the given unichar_id, font_id pair.
+  bool ContainsUnicharAndFont(int unichar_id, int font_id) const;
+  // Returns true if the shape contains the given unichar_id, ignoring font.
+  bool ContainsUnichar(int unichar_id) const;
+  // Returns true if the shape contains the given font, ignoring unichar_id.
+  bool ContainsFont(int font_id) const;
+  // Returns true if the shape contains the given font properties, ignoring
+  // unichar_id.
+  bool ContainsFontProperties(const FontInfoTable &font_table, uint32_t properties) const;
+  // Returns true if the shape contains multiple different font properties,
+  // ignoring unichar_id.
+  bool ContainsMultipleFontProperties(const FontInfoTable &font_table) const;
+  // Returns true if this shape is equal to other (ignoring order of unichars
+  // and fonts).
+  bool operator==(const Shape &other) const;
+  // Returns true if this is a subset (including equal) of other.
+  bool IsSubsetOf(const Shape &other) const;
+  // Returns true if the lists of unichar ids are the same in this and other,
+  // ignoring fonts.
+  // NOT const, as it will sort the unichars on demand.
+  bool IsEqualUnichars(Shape *other);
+
+private:
+  // Sorts the unichars_ vector by unichar.
+  void SortUnichars();
+
+  // Flag indicates that the unichars are sorted, allowing faster set
+  // operations with another shape.
+  bool unichars_sorted_ = false;
+  // If this Shape is part of a ShapeTable the destination_index_ is the index
+  // of some other shape in the ShapeTable with which this shape is merged.
+  int destination_index_ = 0;
+  // Array of unichars, each with a set of fonts. Each unichar has at most
+  // one entry in the vector.
+  std::vector<UnicharAndFonts> unichars_;
+};
+
+// ShapeTable is a class to encapsulate the triple indirection that is
+// used here.
+// ShapeTable is a vector of shapes.
+// Each shape is a vector of UnicharAndFonts representing the set of unichars
+// that the shape represents.
+// Each UnicharAndFonts also lists the fonts of the unichar_id that were
+// mapped to the shape during training.
+class TESS_API ShapeTable {
+public:
+  ShapeTable();
+  // The UNICHARSET reference supplied here, or in set_unicharset below must
+  // exist for the entire life of the ShapeTable. It is used only by DebugStr.
+  explicit ShapeTable(const UNICHARSET &unicharset);
+  ~ShapeTable() {
+    for (auto data : shape_table_) {
+      delete data;
+    }
+  }
+
+  // Writes to the given file. Returns false in case of error.
+  bool Serialize(FILE *fp) const;
+  // Reads from the given file. Returns false in case of error.
+  bool DeSerialize(TFile *fp);
+
+  // Accessors.
+  unsigned NumShapes() const {
+    return shape_table_.size();
+  }
+  const UNICHARSET &unicharset() const {
+    return *unicharset_;
+  }
+  // Returns the number of fonts used in this ShapeTable, computing it if
+  // necessary.
+  int NumFonts() const;
+  // Shapetable takes a pointer to the UNICHARSET, so it must persist for the
+  // entire life of the ShapeTable.
+  void set_unicharset(const UNICHARSET &unicharset) {
+    unicharset_ = &unicharset;
+  }
+  // Re-indexes the class_ids in the shapetable according to the given map.
+  // Useful in conjunction with set_unicharset.
+  void ReMapClassIds(const std::vector<int> &unicharset_map);
+  // Returns a string listing the classes/fonts in a shape.
+  std::string DebugStr(unsigned shape_id) const;
+  // Returns a debug string summarizing the table.
+  std::string SummaryStr() const;
+
+  // Adds a new shape starting with the given unichar_id and font_id.
+  // Returns the assigned index.
+  unsigned AddShape(int unichar_id, int font_id);
+  // Adds a copy of the given shape unless it is already present.
+  // Returns the assigned index or index of existing shape if already present.
+  unsigned AddShape(const Shape &other);
+  // Removes the shape given by the shape index. All indices above are changed!
+  void DeleteShape(unsigned shape_id);
+  // Adds a font_id to the given existing shape index for the given
+  // unichar_id. If the unichar_id is not in the shape, it is added.
+  void AddToShape(unsigned shape_id, int unichar_id, int font_id);
+  // Adds the given shape to the existing shape with the given index.
+  void AddShapeToShape(unsigned shape_id, const Shape &other);
+  // Returns the id of the shape that contains the given unichar and font.
+  // If not found, returns -1.
+  // If font_id < 0, the font_id is ignored and the first shape that matches
+  // the unichar_id is returned.
+  int FindShape(int unichar_id, int font_id) const;
+  // Returns the first unichar_id and font_id in the given shape.
+  void GetFirstUnicharAndFont(unsigned shape_id, int *unichar_id, int *font_id) const;
+
+  // Accessors for the Shape with the given shape_id.
+  const Shape &GetShape(unsigned shape_id) const {
+    return *shape_table_[shape_id];
+  }
+  Shape *MutableShape(unsigned shape_id) {
+    return shape_table_[shape_id];
+  }
+
+  // Expands all the classes/fonts in the shape individually to build
+  // a ShapeTable.
+  int BuildFromShape(const Shape &shape, const ShapeTable &master_shapes);
+
+  // Returns true if the shapes are already merged.
+  bool AlreadyMerged(unsigned shape_id1, unsigned shape_id2) const;
+  // Returns true if any shape contains multiple unichars.
+  bool AnyMultipleUnichars() const;
+  // Returns the maximum number of unichars over all shapes.
+  int MaxNumUnichars() const;
+  // Merges shapes with a common unichar over the [start, end) interval.
+  // Assumes single unichar per shape.
+  void ForceFontMerges(unsigned start, unsigned end);
+  // Returns the number of unichars in the master shape.
+  unsigned MasterUnicharCount(unsigned shape_id) const;
+  // Returns the sum of the font counts in the master shape.
+  int MasterFontCount(unsigned shape_id) const;
+  // Returns the number of unichars that would result from merging the shapes.
+  int MergedUnicharCount(unsigned shape_id1, unsigned shape_id2) const;
+  // Merges two shape_ids, leaving shape_id2 marked as merged.
+  void MergeShapes(unsigned shape_id1, unsigned shape_id2);
+  // Swaps two shape_ids.
+  void SwapShapes(unsigned shape_id1, unsigned shape_id2);
+  // Appends the master shapes from other to this.
+  // Used to create a clean ShapeTable from a merged one, or to create a
+  // copy of a ShapeTable.
+  // If not nullptr, shape_map is set to map other shape_ids to this's
+  // shape_ids.
+  void AppendMasterShapes(const ShapeTable &other, std::vector<int> *shape_map);
+  // Returns the number of master shapes remaining after merging.
+  int NumMasterShapes() const;
+  // Returns the destination of this shape, (if merged), taking into account
+  // the fact that the destination may itself have been merged.
+  // For a non-merged shape, returns the input shape_id.
+  unsigned MasterDestinationIndex(unsigned shape_id) const;
+
+  // Returns false if the unichars in neither shape is a subset of the other..
+  bool SubsetUnichar(unsigned shape_id1, unsigned shape_id2) const;
+  // Returns false if the unichars in neither shape is a subset of the other..
+  bool MergeSubsetUnichar(int merge_id1, int merge_id2, unsigned shape_id) const;
+  // Returns true if the unichar sets are equal between the shapes.
+  bool EqualUnichars(unsigned shape_id1, unsigned shape_id2) const;
+  bool MergeEqualUnichars(int merge_id1, int merge_id2, unsigned shape_id) const;
+  // Returns true if there is a common unichar between the shapes.
+  bool CommonUnichars(unsigned shape_id1, unsigned shape_id2) const;
+  // Returns true if there is a common font id between the shapes.
+  bool CommonFont(unsigned shape_id1, unsigned shape_id2) const;
+
+  // Adds the unichars of the given shape_id to the vector of results. Any
+  // unichar_id that is already present just has the fonts added to the
+  // font set for that result without adding a new entry in the vector.
+  // NOTE: it is assumed that the results are given to this function in order
+  // of decreasing rating.
+  // The unichar_map vector indicates the index of the results entry containing
+  // each unichar, or -1 if the unichar is not yet included in results.
+  void AddShapeToResults(const ShapeRating &shape_rating, std::vector<int> *unichar_map,
+                         std::vector<UnicharRating> *results) const;
+
+private:
+  // Adds the given unichar_id to the results if needed, updating unichar_map
+  // and returning the index of unichar in results.
+  int AddUnicharToResults(int unichar_id, float rating, std::vector<int> *unichar_map,
+                          std::vector<UnicharRating> *results) const;
+
+  // Pointer to a provided unicharset used only by the Debugstr member.
+  const UNICHARSET *unicharset_;
+  // Vector of pointers to the Shapes in this ShapeTable.
+  std::vector<Shape *> shape_table_;
+
+  // Cached data calculated on demand.
+  mutable int num_fonts_;
+};
+
+} // namespace tesseract.
+
+#endif // TESSERACT_CLASSIFY_SHAPETABLE_H_