Python2/PyMuPDF: mupdf-source/thirdparty/tesseract/src/ccstruct/pageres.h comparison

comparison mupdf-source/thirdparty/tesseract/src/ccstruct/pageres.h @ 2:b50eed0cc0ef upstream

ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4. The directory name has changed: no version number in the expanded directory now.

author	Franz Glasner <fzglas.hg@dom66.de>
date	Mon, 15 Sep 2025 11:43:07 +0200
parents
children

comparison

equal deleted inserted replaced

-:1d09e1dec1d9
+:b50eed0cc0ef
+/**********************************************************************
+* File:        pageres.h  (Formerly page_res.h)
+* Description: Results classes used by control.c
+* Author:      Phil Cheatle
+*
+* (C) Copyright 1992, Hewlett-Packard Ltd.
+** Licensed under the Apache License, Version 2.0 (the "License");
+** you may not use this file except in compliance with the License.
+** You may obtain a copy of the License at
+** http://www.apache.org/licenses/LICENSE-2.0
+** Unless required by applicable law or agreed to in writing, software
+** distributed under the License is distributed on an "AS IS" BASIS,
+** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+** See the License for the specific language governing permissions and
+** limitations under the License.
+*
+**********************************************************************/
+#ifndef PAGERES_H
+#define PAGERES_H
+#include "blamer.h"        // for BlamerBundle (ptr only), IRR_NUM_REASONS
+#include "clst.h"          // for CLIST_ITERATOR, CLISTIZEH
+#include "elst.h"          // for ELIST_ITERATOR, ELIST_LINK, ELISTIZEH
+#include "genericvector.h" // for PointerVector
+#include "matrix.h"        // for MATRIX
+#include "normalis.h"      // for DENORM
+#include "ratngs.h"        // for WERD_CHOICE, BLOB_CHOICE (ptr only)
+#include "rect.h"          // for TBOX
+#include "rejctmap.h"      // for REJMAP
+#include "unicharset.h"    // for UNICHARSET, UNICHARSET::Direction, UNI...
+#include "werd.h"          // for WERD, W_BOL, W_EOL
+#include <tesseract/unichar.h> // for UNICHAR_ID, INVALID_UNICHAR_ID
+#include <cstdint>    // for int32_t, int16_t
+#include <functional> // for std::function
+#include <set>        // for std::pair
+#include <vector>     // for std::vector
+#include <sys/types.h> // for int8_t
+struct Pix;
+namespace tesseract {
+// Types that this header only refers to by pointer or reference.
+class BLOCK;
+class BLOCK_LIST;
+class BoxWord;
+class ROW;
+class SEAM;
+class Tesseract;
+struct FontInfo;
+struct TWERD;
+// Result classes defined further down in this header. Each one is declared
+// once here, immediately followed by the list macro(s) applied to it, so the
+// macros always see a declared type.
+class BLOCK_RES;
+ELISTIZEH(BLOCK_RES)
+CLISTIZEH(BLOCK_RES)
+class ROW_RES;
+ELISTIZEH(ROW_RES)
+class WERD_RES;
+ELISTIZEH(WERD_RES)
+/*************************************************************************
+* PAGE_RES - Page results
+*************************************************************************/
+class PAGE_RES { // Results for one page.
+public:
+  int32_t char_count;
+  int32_t rej_count;
+  BLOCK_RES_LIST block_res_list;
+  bool rejected;
+  // Borrowed pointer (PAGE_RES never owns it). It is refreshed each time a
+  // PAGE_RES_IT walking this PAGE_RES steps on to the next word.
+  WERD_CHOICE **prev_word_best_choice;
+  // Per-reason totals of blame, as computed by the blamer.
+  std::vector<int> blame_reasons;
+  // Debug record of every misadaption seen on this page. A BlamerBundle keeps
+  // an index into this vector so that the words responsible can be marked;
+  // the log lives at PAGE_RES level because words may be deleted, split or
+  // merged.
+  std::vector<std::string> misadaption_log;
+  // Puts the counters, the rejected flag and the borrowed pointer back to
+  // their empty values and refills blame_reasons with IRR_NUM_REASONS zeros.
+  // block_res_list and misadaption_log are not touched.
+  void Init() {
+    blame_reasons.assign(IRR_NUM_REASONS, 0);
+    prev_word_best_choice = nullptr;
+    rejected = false;
+    rej_count = 0;
+    char_count = 0;
+  }
+  // Builds an empty page result.
+  PAGE_RES() {
+    Init();
+  }
+  // Defined out of line; block_list holds the real blocks.
+  PAGE_RES(bool merge_similar_words,
+           BLOCK_LIST *block_list, // real blocks
+           WERD_CHOICE **prev_word_best_choice_ptr);
+  ~PAGE_RES() = default;
+};
+/*************************************************************************
+* BLOCK_RES - Block results
+*************************************************************************/
+// Per-block results node; it is an ELIST_LINK so it can be held in a
+// BLOCK_RES_LIST.
+// Every scalar member carries a default member initializer so that a
+// default-constructed BLOCK_RES has a determinate state (previously
+// BLOCK_RES() = default left them uninitialized).
+// NOTE(review): the zero/false defaults only make the state determinate; the
+// two-argument constructor is defined elsewhere and may assign different
+// starting values - confirm they do not need to agree.
+class BLOCK_RES : public ELIST_LINK {
+public:
+  BLOCK *block = nullptr;     // real block
+  int32_t char_count = 0;     // chars in block
+  int32_t rej_count = 0;      // rejected chars
+  int16_t font_class = 0;     //
+  int16_t row_count = 0;
+  float x_height = 0.0f;
+  bool font_assigned = false; // block already
+  //      processed
+  ROW_RES_LIST row_res_list;
+  BLOCK_RES() = default;
+  BLOCK_RES(bool merge_similar_words, BLOCK *the_block); // real block
+  ~BLOCK_RES() = default;
+};
+/*************************************************************************
+* ROW_RES - Row results
+*************************************************************************/
+// Per-row results node; it is an ELIST_LINK so it can be held in a
+// ROW_RES_LIST.
+// Every scalar member carries a default member initializer so that a
+// default-constructed ROW_RES has a determinate state (previously
+// ROW_RES() = default left them uninitialized).
+class ROW_RES : public ELIST_LINK {
+public:
+  ROW *row = nullptr;               // real row
+  // NOTE(review): the original comment read "chars in block"; presumably this
+  // counts chars in the row - confirm against the code that updates it.
+  int32_t char_count = 0;           // chars in block
+  int32_t rej_count = 0;            // rejected chars
+  int32_t whole_word_rej_count = 0; // rejs in total rej wds
+  WERD_RES_LIST word_res_list;
+  ROW_RES() = default;
+  ROW_RES(bool merge_similar_words, ROW *the_row); // real row
+  ~ROW_RES() = default;
+};
+/*************************************************************************
+* WERD_RES - Word results
+*************************************************************************/
+// Type of WERD_RES::unlv_crunch_mode (defaults to CR_NONE).
+enum CRUNCH_MODE { CR_NONE, CR_KEEP_SPACE, CR_LOOSE_SPACE, CR_DELETE };
+// WERD_RES is a collection of publicly accessible members that gathers
+// information about a word result.
+class TESS_API WERD_RES : public ELIST_LINK {
+public:
+  // Which word is which?
+  // There are 3 coordinate spaces in use here: a possibly rotated pixel space,
+  // the original image coordinate space, and the BLN space in which the
+  // baseline of a word is at kBlnBaselineOffset, the xheight is kBlnXHeight,
+  // and the x-middle of the word is at 0.
+  // In the rotated pixel space, coordinates correspond to the input image,
+  // but may be rotated about the origin by a multiple of 90 degrees,
+  // and may therefore be negative.
+  // In any case a rotation by denorm.block()->re_rotation() will take them
+  // back to the original image.
+  // The other differences between words all represent different stages of
+  // processing during recognition.
+  // ---------------------------INPUT-------------------------------------
+  // The word is the input C_BLOBs in the rotated pixel space.
+  // word is NOT owned by the WERD_RES unless combination is true.
+  // All the other word pointers ARE owned by the WERD_RES.
+  WERD *word = nullptr; // Input C_BLOB word.
+  // -------------SETUP BY SetupFor*Recognition---READONLY-INPUT------------
+  // The bln_boxes contains the bounding boxes (only) of the input word, in the
+  // BLN space. The lengths of word and bln_boxes
+  // match as they are both before any chopping.
+  // TODO(rays) determine if docqual does anything useful and delete bln_boxes
+  // if it doesn't.
+  tesseract::BoxWord *bln_boxes = nullptr; // BLN input bounding boxes.
+  // The ROW that this word sits in. NOT owned by the WERD_RES.
+  ROW *blob_row = nullptr;
+  // The denorm provides the transformation to get back to the rotated image
+  // coords from the chopped_word/rebuild_word BLN coords, but each blob also
+  // has its own denorm.
+  DENORM denorm; // For use on chopped_word.
+  // Unicharset used by the classifier output in best_choice and raw_choice.
+  const UNICHARSET *uch_set = nullptr; // For converting back to utf8.
+  // ----Initialized by SetupFor*Recognition---BUT OUTPUT FROM RECOGNITION----
+  // ----Setup to a (different!) state expected by the various classifiers----
+  // TODO(rays) Tidy and make more consistent.
+  // The chopped_word is also in BLN space, and represents the fully chopped
+  // character fragments that make up the word.
+  // The length of chopped_word matches length of seam_array + 1 (if set).
+  TWERD *chopped_word = nullptr; // BLN chopped fragments output.
+  // Vector of SEAM* holding chopping points matching chopped_word.
+  std::vector<SEAM *> seam_array;
+  // Widths of blobs in chopped_word.
+  std::vector<int> blob_widths;
+  // Gaps between blobs in chopped_word. blob_gaps[i] is the gap between
+  // blob i and blob i+1.
+  std::vector<int> blob_gaps;
+  // Stores the lstm choices of every timestep
+  std::vector<std::vector<std::pair<const char *, float>>> timesteps;
+  // Stores the lstm choices of every timestep segmented by character
+  std::vector<std::vector<std::vector<std::pair<const char *, float>>>>
+      segmented_timesteps;
+  // Symbolchoices acquired during CTC
+  std::vector<std::vector<std::pair<const char *, float>>> CTC_symbol_choices;
+  // Stores if the timestep vector starts with a space
+  bool leading_space = false;
+  // Stores value when the word ends
+  int end = 0;
+  // Ratings matrix contains classifier choices for each classified combination
+  // of blobs. The dimension is the same as the number of blobs in chopped_word
+  // and the leading diagonal corresponds to classifier results of the blobs
+  // in chopped_word. The state_ members of best_choice, raw_choice and
+  // best_choices all correspond to this ratings matrix and allow extraction
+  // of the blob choices for any given WERD_CHOICE.
+  MATRIX *ratings = nullptr; // Owned pointer.
+  // Pointer to the first WERD_CHOICE in best_choices. This is the result that
+  // will be output from Tesseract. Note that this is now a borrowed pointer
+  // and should NOT be deleted.
+  WERD_CHOICE *best_choice = nullptr; // Borrowed pointer.
+  // The best raw_choice found during segmentation search. Differs from the
+  // best_choice by being the best result according to just the character
+  // classifier, not taking any language model information into account.
+  // Unlike best_choice, the pointer IS owned by this WERD_RES.
+  WERD_CHOICE *raw_choice = nullptr; // Owned pointer.
+  // Alternative results found during chopping/segmentation search stages.
+  // Note that being an ELIST, best_choices owns the WERD_CHOICEs.
+  WERD_CHOICE_LIST best_choices;
+  // Truth bounding boxes, text and incorrect choice reason.
+  BlamerBundle *blamer_bundle = nullptr;
+  // --------------OUTPUT FROM RECOGNITION-------------------------------
+  // --------------Not all fields are necessarily set.-------------------
+  // ---best_choice, raw_choice *must* end up set, with a box_word-------
+  // ---In complete output, the number of blobs in rebuild_word matches---
+  // ---the number of boxes in box_word, the number of unichar_ids in---
+  // ---best_choice, the number of ints in best_state, and the number---
+  // ---of strings in correct_text--------------------------------------
+  // ---SetupFake Sets everything to appropriate values if the word is---
+  // ---known to be bad before recognition.------------------------------
+  // The rebuild_word is also in BLN space, but represents the final best
+  // segmentation of the word. Its length is therefore the same as box_word.
+  TWERD *rebuild_word = nullptr; // BLN best segmented word.
+  // The box_word is in the original image coordinate space. It is the
+  // bounding boxes of the rebuild_word, after denormalization.
+  // The length of box_word matches rebuild_word, best_state (if set) and
+  // correct_text (if set), as well as best_choice and represents the
+  // number of classified units in the output.
+  tesseract::BoxWord *box_word = nullptr; // Denormalized output boxes.
+  // The Tesseract that was used to recognize this word. Just a borrowed
+  // pointer. Note: Tesseract's class definition is in a higher-level library.
+  // We avoid introducing a cyclic dependency by not using the Tesseract
+  // within WERD_RES. We are just storing it to provide access to it
+  // for the top-level multi-language controller, and maybe for output of
+  // the recognized language.
+  // tesseract points to data owned elsewhere.
+  tesseract::Tesseract *tesseract = nullptr;
+  // The best_state stores the relationship between chopped_word and
+  // rebuild_word. Each blob[i] in rebuild_word is composed of best_state[i]
+  // adjacent blobs in chopped_word. The seams in seam_array are hidden
+  // within a rebuild_word blob and revealed between them.
+  std::vector<int> best_state; // Number of blobs in each best blob.
+  // The correct_text is used during training and adaption to carry the
+  // text to the training system without the need for a unicharset. There
+  // is one entry in the vector for each blob in rebuild_word and box_word.
+  std::vector<std::string> correct_text;
+  // Less-well documented members.
+  // TODO(rays) Add more documentation here.
+  WERD_CHOICE *ep_choice = nullptr; // ep text TODO(rays) delete this.
+  REJMAP reject_map;                // best_choice rejects
+  bool tess_failed = false;
+  /*
+If tess_failed is true, one of the following tests failed when Tess
+returned:
+- The outword blob list was not the same length as the best_choice string;
+- The best_choice string contained ALL blanks;
+- The best_choice string was zero length
+*/
+  bool tess_accepted = false;    // Tess thinks its ok?
+  bool tess_would_adapt = false; // Tess would adapt?
+  bool done = false;             // ready for output?
+  bool small_caps = false;       // word appears to be small caps
+  bool odd_size = false;         // word is bigger than line or leader dots.
+  // The fontinfos are pointers to data owned by the classifier.
+  const FontInfo *fontinfo = nullptr;
+  const FontInfo *fontinfo2 = nullptr;
+  int8_t fontinfo_id_count = 0;  // number of votes
+  int8_t fontinfo_id2_count = 0; // number of votes
+  bool guessed_x_ht = true;
+  bool guessed_caps_ht = true;
+  CRUNCH_MODE unlv_crunch_mode = CR_NONE;
+  float x_height = 0.0f;       // post match estimate
+  float caps_height = 0.0f;    // post match estimate
+  float baseline_shift = 0.0f; // post match estimate.
+  // Certainty score for the spaces either side of this word (LSTM mode).
+  // MIN this value with the actual word certainty.
+  float space_certainty = 0.0f;
+  /*
+To deal with fuzzy spaces we need to be able to combine "words" to form
+combinations when we suspect that the gap is a non-space. The (new) text
+ord code generates separate words for EVERY fuzzy gap - flags in the word
+indicate whether the gap is below the threshold (fuzzy kern) and is thus
+NOT a real word break by default, or above the threshold (fuzzy space) and
+this is a real word break by default.
+The WERD_RES list contains all these words PLUS "combination" words built
+out of (copies of) the words split by fuzzy kerns. The separate parts have
+their "part_of_combo" flag set true and should be IGNORED on a default
+reading of the list.
+Combination words are FOLLOWED by the sequence of part_of_combo words
+which they combine.
+*/
+  bool combination = false;   // of two fuzzy gap wds
+  bool part_of_combo = false; // part of a combo
+  bool reject_spaces = false; // Reject spacing?
+  WERD_RES() = default;
+  // Wraps the_word as the input word; every other member keeps its default.
+  WERD_RES(WERD *the_word) {
+    word = the_word;
+  }
+  // Deep copies everything except the ratings MATRIX.
+  // To get that use deep_copy below.
+  WERD_RES(const WERD_RES &source) : ELIST_LINK(source) {
+    // combination is used in function Clear which is called from operator=.
+    combination = false;
+    *this = source; // see operator=
+  }
+  ~WERD_RES();
+  // Returns the UTF-8 string for the given blob index in the best_choice word,
+  // given that we know whether we are in a right-to-left reading context.
+  // This matters for mirrorable characters such as parentheses.  We recognize
+  // characters purely based on their shape on the page, and by default produce
+  // the corresponding unicode for a left-to-right context.
+  // Returns nullptr if best_choice or uch_set is not set, if blob_index is
+  // past the end of best_choice, or if the unichar id lies outside uch_set.
+  const char *BestUTF8(unsigned blob_index, bool in_rtl_context) const {
+    if (best_choice == nullptr || uch_set == nullptr ||
+        blob_index >= best_choice->length()) {
+      return nullptr;
+    }
+    UNICHAR_ID id = best_choice->unichar_id(blob_index);
+    if (static_cast<unsigned>(id) >= uch_set->size()) {
+      return nullptr;
+    }
+    UNICHAR_ID mirrored = uch_set->get_mirror(id);
+    if (in_rtl_context && mirrored > 0) {
+      id = mirrored;
+    }
+    return uch_set->id_to_unichar_ext(id);
+  }
+  // Returns the UTF-8 string for the given blob index in the raw_choice word.
+  // Returns nullptr if raw_choice or uch_set is not set, if blob_index is
+  // past the end of raw_choice, or if the unichar id lies outside uch_set.
+  const char *RawUTF8(unsigned blob_index) const {
+    // raw_choice and uch_set both default to nullptr, so guard them the same
+    // way BestUTF8 guards best_choice instead of dereferencing blindly.
+    if (raw_choice == nullptr || uch_set == nullptr ||
+        blob_index >= raw_choice->length()) {
+      return nullptr;
+    }
+    UNICHAR_ID id = raw_choice->unichar_id(blob_index);
+    if (static_cast<unsigned>(id) >= uch_set->size()) {
+      return nullptr;
+    }
+    return uch_set->id_to_unichar(id);
+  }
+  // Returns the direction that uch_set reports for the best_choice unichar at
+  // blob_index, or U_OTHER_NEUTRAL if best_choice or uch_set is not set or
+  // blob_index is past the end of best_choice.
+  UNICHARSET::Direction SymbolDirection(unsigned blob_index) const {
+    if (best_choice == nullptr || uch_set == nullptr ||
+        blob_index >= best_choice->length()) {
+      return UNICHARSET::U_OTHER_NEUTRAL;
+    }
+    return uch_set->get_direction(best_choice->unichar_id(blob_index));
+  }
+  // Returns true if any unichar of best_choice has direction
+  // U_RIGHT_TO_LEFT or U_RIGHT_TO_LEFT_ARABIC. Returns false if uch_set or
+  // best_choice is not set or best_choice is empty.
+  bool AnyRtlCharsInWord() const {
+    if (uch_set == nullptr || best_choice == nullptr ||
+        best_choice->length() < 1) {
+      return false;
+    }
+    for (unsigned id = 0; id < best_choice->length(); id++) {
+      unsigned unichar_id = best_choice->unichar_id(id);
+      if (unichar_id >= uch_set->size()) {
+        continue; // Ignore illegal chars.
+      }
+      UNICHARSET::Direction dir = uch_set->get_direction(unichar_id);
+      if (dir == UNICHARSET::U_RIGHT_TO_LEFT ||
+          dir == UNICHARSET::U_RIGHT_TO_LEFT_ARABIC) {
+        return true;
+      }
+    }
+    return false;
+  }
+  // Returns true if any unichar of best_choice has direction
+  // U_LEFT_TO_RIGHT or U_ARABIC_NUMBER. Returns false if uch_set or
+  // best_choice is not set or best_choice is empty.
+  bool AnyLtrCharsInWord() const {
+    if (uch_set == nullptr || best_choice == nullptr ||
+        best_choice->length() < 1) {
+      return false;
+    }
+    for (unsigned id = 0; id < best_choice->length(); id++) {
+      unsigned unichar_id = best_choice->unichar_id(id);
+      if (unichar_id >= uch_set->size()) {
+        continue; // Ignore illegal chars.
+      }
+      UNICHARSET::Direction dir = uch_set->get_direction(unichar_id);
+      if (dir == UNICHARSET::U_LEFT_TO_RIGHT ||
+          dir == UNICHARSET::U_ARABIC_NUMBER) {
+        return true;
+      }
+    }
+    return false;
+  }
+  // Return whether the blobs in this WERD_RES 0, 1,... come from an engine
+  // that gave us the unichars in reading order (as opposed to strict left
+  // to right).
+  // Precondition: best_choice must be set; it is dereferenced unchecked.
+  bool UnicharsInReadingOrder() const {
+    return best_choice->unichars_in_script_order();
+  }
+  void Clear();
+  void ClearResults();
+  void ClearWordChoices();
+  void ClearRatings();
+  // Deep copies everything except the ratings MATRIX.
+  // To get that use deep_copy below.
+  WERD_RES &operator=(const WERD_RES &source); // from this
+  void CopySimpleFields(const WERD_RES &source);
+  // Initializes a blank (default constructed) WERD_RES from one that has
+  // already been recognized.
+  // Use SetupFor*Recognition afterwards to complete the setup and make
+  // it ready for a retry recognition.
+  void InitForRetryRecognition(const WERD_RES &source);
+  // Sets up the members used in recognition: bln_boxes, chopped_word,
+  // seam_array, denorm.  Returns false if
+  // the word is empty and sets up fake results.  If use_body_size is
+  // true and row->body_size is set, then body_size will be used for
+  // blob normalization instead of xheight + ascrise. This flag is for
+  // those languages that are using CJK pitch model and thus it has to
+  // be true if and only if tesseract->textord_use_cjk_fp_model is
+  // true.
+  // If allow_detailed_fx is true, the feature extractor will receive fine
+  // precision outline information, allowing smoother features and better
+  // features on low resolution images.
+  // The norm_mode sets the default mode for normalization in absence
+  // of any of the above flags. It should really be a tesseract::OcrEngineMode
+  // but is declared as int for ease of use with tessedit_ocr_engine_mode.
+  // Returns false if the word is empty and sets up fake results.
+  bool SetupForRecognition(const UNICHARSET &unicharset_in,
+                           tesseract::Tesseract *tesseract, Image pix,
+                           int norm_mode, const TBOX *norm_box,
+                           bool numeric_mode, bool use_body_size,
+                           bool allow_detailed_fx, ROW *row,
+                           const BLOCK *block);
+  // Set up the seam array, bln_boxes, best_choice, and raw_choice to empty
+  // accumulators from a made chopped word.  We presume the fields are already
+  // empty.
+  void SetupBasicsFromChoppedWord(const UNICHARSET &unicharset_in);
+  // Sets up the members used in recognition for an empty recognition result:
+  // bln_boxes, chopped_word, seam_array, denorm, best_choice, raw_choice.
+  void SetupFake(const UNICHARSET &uch);
+  // Set the word as having the script of the input unicharset.
+  void SetupWordScript(const UNICHARSET &unicharset_in);
+  // Sets up the blamer_bundle if it is not null, using the initialized denorm.
+  void SetupBlamerBundle();
+  // Computes the blob_widths and blob_gaps from the chopped_word.
+  void SetupBlobWidthsAndGaps();
+  // Updates internal data to account for a new SEAM (chop) at the given
+  // blob_number. Fixes the ratings matrix and states in the choices, as well
+  // as the blob widths and gaps.
+  void InsertSeam(int blob_number, SEAM *seam);
+  // Returns true if all the word choices except the first have adjust_factors
+  // worse than the given threshold.
+  bool AlternativeChoiceAdjustmentsWorseThan(float threshold) const;
+  // Returns true if the current word is ambiguous (by number of answers or
+  // by dangerous ambigs.)
+  bool IsAmbiguous();
+  // Returns true if the ratings matrix size matches the sum of each of the
+  // segmentation states.
+  bool StatesAllValid();
+  // Prints a list of words found if debug is true or the word result matches
+  // the word_to_debug.
+  void DebugWordChoices(bool debug, const char *word_to_debug);
+  // Prints the top choice along with the accepted/done flags.
+  void DebugTopChoice(const char *msg) const;
+  // Removes from best_choices all choices which are not within a reasonable
+  // range of the best choice.
+  void FilterWordChoices(int debug_level);
+  // Computes a set of distance thresholds used to control adaption.
+  // Compares the best choice for the current word to the best raw choice
+  // to determine which characters were classified incorrectly by the
+  // classifier. Then places a separate threshold into thresholds for each
+  // character in the word. If the classifier was correct, max_rating is placed
+  // into thresholds. If the classifier was incorrect, the mean match rating
+  // (error percentage) of the classifier's incorrect choice minus some margin
+  // is placed into thresholds. This can then be used by the caller to try to
+  // create a new template for the desired class that will classify the
+  // character with a rating better than the threshold value. The match rating
+  // placed into thresholds is never allowed to be below min_rating in order to
+  // prevent trying to make overly tight templates.
+  // min_rating limits how tight to make a template.
+  // max_rating limits how loose to make a template.
+  // rating_margin denotes the amount of margin to put in template.
+  void ComputeAdaptionThresholds(float certainty_scale, float min_rating,
+                                 float max_rating, float rating_margin,
+                                 float *thresholds);
+  // Saves a copy of the word_choice if it has the best unadjusted rating.
+  // Returns true if the word_choice was the new best.
+  bool LogNewRawChoice(WERD_CHOICE *word_choice);
+  // Consumes word_choice by adding it to best_choices, (taking ownership) if
+  // the certainty for word_choice is some distance of the best choice in
+  // best_choices, or by deleting the word_choice and returning false.
+  // The best_choices list is kept in sorted order by rating. Duplicates are
+  // removed, and the list is kept no longer than max_num_choices in length.
+  // Returns true if the word_choice is still a valid pointer.
+  bool LogNewCookedChoice(int max_num_choices, bool debug,
+                          WERD_CHOICE *word_choice);
+  // Prints a brief list of all the best choices.
+  void PrintBestChoices() const;
+  // Returns the sum of the widths of the blob between start_blob and last_blob
+  // inclusive.
+  int GetBlobsWidth(int start_blob, int last_blob) const;
+  // Returns the width of a gap between the specified blob and the next one.
+  int GetBlobsGap(unsigned blob_index) const;
+  // Returns the BLOB_CHOICE corresponding to the given index in the
+  // best choice word taken from the appropriate cell in the ratings MATRIX.
+  // Borrowed pointer, so do not delete. May return nullptr if there is no
+  // BLOB_CHOICE matching the unichar_id at the given index.
+  BLOB_CHOICE *GetBlobChoice(unsigned index) const;
+  // Returns the BLOB_CHOICE_LIST corresponding to the given index in the
+  // best choice word taken from the appropriate cell in the ratings MATRIX.
+  // Borrowed pointer, so do not delete.
+  BLOB_CHOICE_LIST *GetBlobChoices(int index) const;
+  // Moves the results fields from word to this. This takes ownership of all
+  // the data, so src can be destructed.
+  // word1.ConsumeWordResults(word);
+  // delete word;
+  // is simpler and faster than:
+  // word1 = *word;
+  // delete word;
+  // as it doesn't need to copy and reallocate anything.
+  void ConsumeWordResults(WERD_RES *word);
+  // Replace the best choice and rebuild box word.
+  // choice must be from the current best_choices list.
+  void ReplaceBestChoice(WERD_CHOICE *choice);
+  // Builds the rebuild_word and sets the best_state from the chopped_word and
+  // the best_choice->state.
+  void RebuildBestState();
+  // Copies the chopped_word to the rebuild_word, faking a best_state as well.
+  // Also sets up the output box_word.
+  void CloneChoppedToRebuild();
+  // Sets/replaces the box_word with one made from the rebuild_word.
+  void SetupBoxWord();
+  // Sets up the script positions in the best_choice using the best_choice
+  // to get the unichars, and the unicharset to get the target positions.
+  void SetScriptPositions();
+  // Sets all the blobs in all the words (best choice and alternates) to be
+  // the given position. (When a sub/superscript is recognized as a separate
+  // word, it falls victim to the rule that a whole word cannot be sub or
+  // superscript, so this function overrides that problem.)
+  void SetAllScriptPositions(tesseract::ScriptPos position);
+  // Classifies the word with some already-calculated BLOB_CHOICEs.
+  // The choices are an array of blob_count pointers to BLOB_CHOICE,
+  // providing a single classifier result for each blob.
+  // The BLOB_CHOICEs are consumed and the word takes ownership.
+  // The number of blobs in the box_word must match blob_count.
+  void FakeClassifyWord(unsigned blob_count, BLOB_CHOICE **choices);
+  // Creates a WERD_CHOICE for the word using the top choices from the leading
+  // diagonal of the ratings matrix.
+  void FakeWordFromRatings(PermuterType permuter);
+  // Copies the best_choice strings to the correct_text for adaption/training.
+  void BestChoiceToCorrectText();
+  // Merges 2 adjacent blobs in the result if the permanent callback
+  // class_cb returns other than INVALID_UNICHAR_ID, AND the permanent
+  // callback box_cb is nullptr or returns true, setting the merged blob
+  // result to the class returned from class_cb.
+  // Returns true if anything was merged.
+  bool ConditionalBlobMerge(
+      const std::function<UNICHAR_ID(UNICHAR_ID, UNICHAR_ID)> &class_cb,
+      const std::function<bool(const TBOX &, const TBOX &)> &box_cb);
+  // Merges 2 adjacent blobs in the result (index and index+1) and corrects
+  // all the data to account for the change.
+  void MergeAdjacentBlobs(unsigned index);
+  // Callback helper for fix_quotes returns a double quote if both
+  // arguments are quote, otherwise INVALID_UNICHAR_ID.
+  UNICHAR_ID BothQuotes(UNICHAR_ID id1, UNICHAR_ID id2);
+  void fix_quotes();
+  // Callback helper for fix_hyphens returns UNICHAR_ID of - if both
+  // arguments are hyphen, otherwise INVALID_UNICHAR_ID.
+  UNICHAR_ID BothHyphens(UNICHAR_ID id1, UNICHAR_ID id2);
+  // Callback helper for fix_hyphens returns true if box1 and box2 overlap
+  // (assuming both on the same textline, are in order and a chopped em dash.)
+  bool HyphenBoxesOverlap(const TBOX &box1, const TBOX &box2);
+  void fix_hyphens();
+  // Callback helper for merge_tess_fails returns a space if both
+  // arguments are space, otherwise INVALID_UNICHAR_ID.
+  UNICHAR_ID BothSpaces(UNICHAR_ID id1, UNICHAR_ID id2);
+  void merge_tess_fails();
+  // Returns a really deep copy of *src, including the ratings MATRIX.
+  // The result is allocated with new; src must not be nullptr.
+  static WERD_RES *deep_copy(const WERD_RES *src) {
+    auto *result = new WERD_RES(*src);
+    // That didn't copy the ratings, but we want a copy if there is one to
+    // begin with.
+    if (src->ratings != nullptr) {
+      result->ratings = src->ratings->DeepCopy();
+    }
+    return result;
+  }
+  // Copy blobs from word_res onto this word (eliminating spaces between).
+  // Since this may be called bidirectionally OR both the BOL and EOL flags.
+  void copy_on(WERD_RES *word_res) { // from this word
+    word->set_flag(W_BOL, word->flag(W_BOL) || word_res->word->flag(W_BOL));
+    word->set_flag(W_EOL, word->flag(W_EOL) || word_res->word->flag(W_EOL));
+    word->copy_on(word_res->word);
+  }
+  // Returns true if the collection of count pieces, starting at start, are all
+  // natural connected components, ie there are no real chops involved.
+  bool PiecesAllNatural(int start, int count) const;
+};
+/*************************************************************************
+* PAGE_RES_IT - Page results iterator
+*************************************************************************/
+// Cursor over the words of a PAGE_RES. It exposes the previous, current and
+// next word together with the row and block each of them belongs to.
+// All raw pointer members carry a nullptr default member initializer, so a
+// default-constructed PAGE_RES_IT has a determinate state (previously
+// PAGE_RES_IT() = default left them uninitialized, so operator== and the
+// accessors read indeterminate values).
+class TESS_API PAGE_RES_IT {
+public:
+  PAGE_RES *page_res = nullptr; // page being iterated
+  PAGE_RES_IT() = default;
+  // Attaches to the_page_res and positions the iterator via restart_page().
+  PAGE_RES_IT(PAGE_RES *the_page_res) { // page result
+    page_res = the_page_res;
+    restart_page(); // ready to scan
+  }
+  // Do two PAGE_RES_ITs point at the same word?
+  // This is much cheaper than cmp().
+  bool operator==(const PAGE_RES_IT &other) const {
+    return word_res == other.word_res && row_res == other.row_res &&
+           block_res == other.block_res;
+  }
+  bool operator!=(const PAGE_RES_IT &other) const {
+    return !(*this == other);
+  }
+  // Given another PAGE_RES_IT to the same page,
+  //  this before other:     -1
+  //  this equal to other:    0
+  //  this later than other:  1
+  int cmp(const PAGE_RES_IT &other) const;
+  WERD_RES *restart_page() {
+    return start_page(false); // Skip empty blocks.
+  }
+  WERD_RES *restart_page_with_empties() {
+    return start_page(true); // Allow empty blocks.
+  }
+  WERD_RES *start_page(bool empty_ok);
+  WERD_RES *restart_row();
+  // ============ Methods that mutate the underlying structures ===========
+  // Note that these methods will potentially invalidate other PAGE_RES_ITs
+  // and are intended to be used only while a single PAGE_RES_IT is  active.
+  // This problem needs to be taken into account if these mutation operators
+  // are ever provided to PageIterator or its subclasses.
+  // Inserts the new_word and a corresponding WERD_RES before the current
+  // position. The simple fields of the WERD_RES are copied from clone_res and
+  // the resulting WERD_RES is returned for further setup with best_choice etc.
+  WERD_RES *InsertSimpleCloneWord(const WERD_RES &clone_res, WERD *new_word);
+  // Replaces the current WERD/WERD_RES with the given words. The given words
+  // contain fake blobs that indicate the position of the characters. These are
+  // replaced with real blobs from the current word as much as possible.
+  void ReplaceCurrentWord(PointerVector<WERD_RES> *words);
+  // Deletes the current WERD_RES and its underlying WERD.
+  void DeleteCurrentWord();
+  // Makes the current word a fuzzy space if not already fuzzy. Updates
+  // corresponding part of combo if required.
+  void MakeCurrentWordFuzzy();
+  WERD_RES *forward() { // Get next word.
+    return internal_forward(false, false);
+  }
+  // Move forward, but allow empty blocks to show as single nullptr words.
+  WERD_RES *forward_with_empties() {
+    return internal_forward(false, true);
+  }
+  WERD_RES *forward_paragraph(); // get first word in next non-empty paragraph
+  WERD_RES *forward_block();     // get first word in next non-empty block
+  WERD_RES *prev_word() const { // previous word
+    return prev_word_res;
+  }
+  ROW_RES *prev_row() const { // row of prev word
+    return prev_row_res;
+  }
+  BLOCK_RES *prev_block() const { // block of prev word
+    return prev_block_res;
+  }
+  WERD_RES *word() const { // current word
+    return word_res;
+  }
+  ROW_RES *row() const { // row of current word
+    return row_res;
+  }
+  BLOCK_RES *block() const { // block of cur. word
+    return block_res;
+  }
+  WERD_RES *next_word() const { // next word
+    return next_word_res;
+  }
+  ROW_RES *next_row() const { // row of next word
+    return next_row_res;
+  }
+  BLOCK_RES *next_block() const { // block of next word
+    return next_block_res;
+  }
+  void rej_stat_word(); // for page/block/row
+  void ResetWordIterator();
+private:
+  WERD_RES *internal_forward(bool new_block, bool empty_ok);
+  WERD_RES *prev_word_res = nullptr;   // previous word
+  ROW_RES *prev_row_res = nullptr;     // row of prev word
+  BLOCK_RES *prev_block_res = nullptr; // block of prev word
+  WERD_RES *word_res = nullptr;   // current word
+  ROW_RES *row_res = nullptr;     // row of current word
+  BLOCK_RES *block_res = nullptr; // block of cur. word
+  WERD_RES *next_word_res = nullptr;   // next word
+  ROW_RES *next_row_res = nullptr;     // row of next word
+  BLOCK_RES *next_block_res = nullptr; // block of next word
+  BLOCK_RES_IT block_res_it; // iterators
+  ROW_RES_IT row_res_it;
+  WERD_RES_IT word_res_it;
+  // Iterators used to get the state of word_res_it for the current word.
+  // Since word_res_it is 2 words further on, this is otherwise hard to do.
+  WERD_RES_IT wr_it_of_current_word;
+  WERD_RES_IT wr_it_of_next_word;
+};
+} // namespace tesseract
+#endif

Mercurial > hgrepos > Python2 > PyMuPDF

comparison mupdf-source/thirdparty/tesseract/src/ccstruct/pageres.h @ 2:b50eed0cc0ef upstream