diff mupdf-source/thirdparty/tesseract/src/ccstruct/werd.h @ 2:b50eed0cc0ef upstream

ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4. The directory name has changed: no version number in the expanded directory now.
author Franz Glasner <fzglas.hg@dom66.de>
date Mon, 15 Sep 2025 11:43:07 +0200
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/mupdf-source/thirdparty/tesseract/src/ccstruct/werd.h	Mon Sep 15 11:43:07 2025 +0200
@@ -0,0 +1,212 @@
+/**********************************************************************
+ * File:        werd.h
+ * Description: Code for the WERD class.
+ * Author:      Ray Smith
+ *
+ * (C) Copyright 1991, Hewlett-Packard Ltd.
+ ** Licensed under the Apache License, Version 2.0 (the "License");
+ ** you may not use this file except in compliance with the License.
+ ** You may obtain a copy of the License at
+ ** http://www.apache.org/licenses/LICENSE-2.0
+ ** Unless required by applicable law or agreed to in writing, software
+ ** distributed under the License is distributed on an "AS IS" BASIS,
+ ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ ** See the License for the specific language governing permissions and
+ ** limitations under the License.
+ *
+ **********************************************************************/
+
+#ifndef WERD_H
+#define WERD_H
+
+#include "elst2.h"
+#include "params.h"
+#include "stepblob.h"
+
+#include <bitset>
+
+namespace tesseract {
+
+enum WERD_FLAGS { // Bit indices into WERD's flags bitset (see WERD::flag / WERD::set_flag).
+  W_SEGMENTED,          ///< Word is correctly segmented.
+  W_ITALIC,             ///< Italic text.
+  W_BOLD,               ///< Bold text.
+  W_BOL,                ///< Word is at the start of a line.
+  W_EOL,                ///< Word is at the end of a line.
+  W_NORMALIZED,         ///< Originally documented only as "flags"; presumably "word is normalized" -- TODO confirm.
+  W_SCRIPT_HAS_XHEIGHT, ///< The x-height concept makes sense for this script.
+  W_SCRIPT_IS_LATIN,    ///< Special case for Latin script, for y. splitting.
+  W_DONT_CHOP,          ///< Fixed pitch chopped.
+  W_REP_CHAR,           ///< Repeated character.
+  W_FUZZY_SP,           ///< Fuzzy space.
+  W_FUZZY_NON,          ///< Fuzzy nonspace.
+  W_INVERSE             ///< White on black.
+};
+
+enum DISPLAY_FLAGS {
+  /* Bit-number allocations for display flags: each enumerator is a bit index, not a mask. */
+  DF_BOX,          ///< Bounding box.
+  DF_TEXT,         ///< Correct ASCII text.
+  DF_POLYGONAL,    ///< Polygonal approximation.
+  DF_EDGE_STEP,    ///< Edge steps.
+  DF_BN_POLYGONAL, ///< BL-normalised polygonal approximation (BL presumably = baseline -- TODO confirm).
+  DF_BLAMER        ///< Blamer information.
+};
+
+class ROW; // forward declaration
+
+class TESS_API WERD : public ELIST2_LINK { // A word: good blobs, rejected blobs, flag bits and its correct text.
+public:
+  WERD() = default; // empty word; members take their in-class defaults
+  // WERD constructed with:
+  //   blob_list - blobs of the word (we take this list's contents)
+  //   blanks - number of blanks before the word
+  //   text - correct text (outlives WERD)
+  WERD(C_BLOB_LIST *blob_list, uint8_t blanks, const char *text);
+
+  // WERD constructed from:
+  //   blob_list - blobs in the word
+  //   clone - werd whose flags, etc. are cloned into the new WERD.
+  WERD(C_BLOB_LIST *blob_list, WERD *clone);
+
+  // Constructs a WERD from a single blob and clones the flags from this.
+  // W_BOL and W_EOL flags are set according to the given bol/eol values.
+  WERD *ConstructFromSingleBlob(bool bol, bool eol, C_BLOB *blob);
+
+  ~WERD() = default;
+
+  // Copy assignment from source (declared here, defined out of line).
+  WERD &operator=(const WERD &source);
+
+  // This method returns a new werd constructed using the blobs in the input
+  // all_blobs list, which correspond to the blobs in this werd object. The
+  // blobs used to construct the new word are consumed and removed from the
+  // input all_blobs list.
+  // Returns nullptr if the word couldn't be constructed.
+  // Original blobs for which no matches were found are appended to the
+  // output list orphan_blobs.
+  WERD *ConstructWerdWithNewBlobs(C_BLOB_LIST *all_blobs, C_BLOB_LIST *orphan_blobs);
+
+  // Accessor for the reject / DUFF blob list; returns a pointer to the member.
+  C_BLOB_LIST *rej_cblob_list() { // compact format
+    return &rej_cblobs;
+  }
+
+  // Accessor for the good blob list; returns a pointer to the member.
+  C_BLOB_LIST *cblob_list() { // get compact blobs
+    return &cblobs;
+  }
+
+  uint8_t space() const { // number of blanks (the value written by set_blanks)
+    return blanks;
+  }
+  void set_blanks(uint8_t new_blanks) { // overwrite the blank count
+    blanks = new_blanks;
+  }
+  int script_id() const { // stored as int16_t, widened to int on return
+    return script_id_;
+  }
+  void set_script_id(int id) { // NOTE: id is narrowed to int16_t on assignment
+    script_id_ = id;
+  }
+
+  // Returns the (default) bounding box including all the dots.
+  TBOX bounding_box() const; // compute bounding box
+  // Returns the bounding box including the desired combination of upper and
+  // lower noise/diacritic elements.
+  TBOX restricted_bounding_box(bool upper_dots, bool lower_dots) const;
+  // Returns the bounding box of only the good blobs.
+  TBOX true_bounding_box() const;
+
+  const char *text() const { // points into the internal std::string; valid until the text changes
+    return correct.c_str();
+  }
+  void set_text(const char *new_text) { // copies new_text into a std::string; must not be nullptr
+    correct = new_text;
+  }
+
+  bool flag(WERD_FLAGS mask) const { // mask is a bit index into flags, not a bit mask
+    return flags[mask];
+  }
+  void set_flag(WERD_FLAGS mask, bool value) { // sets or clears bit number mask
+    flags.set(mask, value);
+  }
+
+  bool display_flag(uint8_t flag) const { // flag is a bit index; bitset operator[] is unchecked, so keep it < 16
+    return disp_flags[flag];
+  }
+  void set_display_flag(uint8_t flag, bool value) { // bitset::set throws std::out_of_range if flag >= 16
+    disp_flags.set(flag, value);
+  }
+
+  WERD *shallow_copy(); // shallow copy word
+
+  // Repositions the word by the vector vec.
+  void move(const ICOORD vec);
+
+  // Joins other's blobs onto this werd, emptying out other.
+  void join_on(WERD *other);
+
+  // Copies other's blobs onto this word, leaving other intact.
+  void copy_on(WERD *other);
+
+  // tprintf word metadata (but not blob innards)
+  void print() const;
+
+#ifndef GRAPHICS_DISABLED
+  // Plots the word on window in a uniform colour.
+  void plot(ScrollView *window, ScrollView::Color colour);
+
+  // Gets the next color in the (looping) rainbow.
+  static ScrollView::Color NextColor(ScrollView::Color colour);
+
+  // Plots the word on window in a rainbow of colours.
+  void plot(ScrollView *window);
+
+  // Plots the rejected blobs in a rainbow of colours.
+  void plot_rej_blobs(ScrollView *window);
+#endif // !GRAPHICS_DISABLED
+
+  // Removes noise from the word by moving small outlines to the rej_cblobs
+  // list, based on the size_threshold.
+  void CleanNoise(float size_threshold);
+
+  // Extracts all the noise outlines and stuffs the pointers into the given
+  // vector of outlines. Afterwards, the outlines vector owns the pointers.
+  void GetNoiseOutlines(std::vector<C_OUTLINE *> *outlines);
+  // Adds the selected outlines to the indicated real blobs, and puts the rest
+  // back in rej_cblobs where they came from. Where the target_blobs entry is
+  // nullptr, a run of wanted outlines is put into a single new blob.
+  // Ownership of the outlines is transferred back to the word. (Hence
+  // vector and not PointerVector.)
+  // Returns true if any new blob was added to the start of the word, which
+  // suggests that it might need joining to the word before it, and likewise
+  // sets make_next_word_fuzzy true if any new blob was added to the end.
+  bool AddSelectedOutlines(const std::vector<bool> &wanted,
+                           const std::vector<C_BLOB *> &target_blobs,
+                           const std::vector<C_OUTLINE *> &outlines, bool *make_next_word_fuzzy);
+
+private:
+  uint8_t blanks = 0;     // number of blanks (see space / set_blanks)
+  std::bitset<16> flags;  // word flags, indexed by WERD_FLAGS
+  std::bitset<16> disp_flags; // display flags, indexed by bit number (see display_flag)
+  int16_t script_id_ = 0; // From unicharset.
+  std::string correct;    // correct text (see text / set_text)
+  C_BLOB_LIST cblobs;     // compacted (good) blobs
+  C_BLOB_LIST rej_cblobs; // rejected / DUFF blobs
+};
+
+ELIST2IZEH(WERD) // NOTE(review): presumably expands to the list declarations for WERD (macro from elst2.h) -- verify
+
+} // namespace tesseract
+
+#include "ocrrow.h" // deliberately included here, after the WERD declaration (original comment is truncated; reason not stated)
+
+namespace tesseract {
+
+// qsort(3)-compatible comparator that orders words by increasing left edge; declared here, defined out of line.
+int word_comparator(const void *word1p, const void *word2p); // NOTE(review): arguments presumably point to WERD pointers -- confirm
+
+} // namespace tesseract
+
+#endif