Mercurial > hgrepos > Python2 > PyMuPDF
diff mupdf-source/thirdparty/tesseract/src/ccstruct/pageres.h @ 2:b50eed0cc0ef upstream
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
The directory name has changed: no version number in the expanded directory now.
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Mon, 15 Sep 2025 11:43:07 +0200 |
| parents | |
| children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/mupdf-source/thirdparty/tesseract/src/ccstruct/pageres.h Mon Sep 15 11:43:07 2025 +0200 @@ -0,0 +1,810 @@ +/********************************************************************** + * File: pageres.h (Formerly page_res.h) + * Description: Results classes used by control.c + * Author: Phil Cheatle + * + * (C) Copyright 1992, Hewlett-Packard Ltd. + ** Licensed under the Apache License, Version 2.0 (the "License"); + ** you may not use this file except in compliance with the License. + ** You may obtain a copy of the License at + ** http://www.apache.org/licenses/LICENSE-2.0 + ** Unless required by applicable law or agreed to in writing, software + ** distributed under the License is distributed on an "AS IS" BASIS, + ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + ** See the License for the specific language governing permissions and + ** limitations under the License. + * + **********************************************************************/ + +#ifndef PAGERES_H +#define PAGERES_H + +#include "blamer.h" // for BlamerBundle (ptr only), IRR_NUM_REASONS +#include "clst.h" // for CLIST_ITERATOR, CLISTIZEH +#include "elst.h" // for ELIST_ITERATOR, ELIST_LINK, ELISTIZEH +#include "genericvector.h" // for PointerVector +#include "matrix.h" // for MATRIX +#include "normalis.h" // for DENORM +#include "ratngs.h" // for WERD_CHOICE, BLOB_CHOICE (ptr only) +#include "rect.h" // for TBOX +#include "rejctmap.h" // for REJMAP +#include "unicharset.h" // for UNICHARSET, UNICHARSET::Direction, UNI... +#include "werd.h" // for WERD, W_BOL, W_EOL + +#include <tesseract/unichar.h> // for UNICHAR_ID, INVALID_UNICHAR_ID + +#include <cstdint> // for int32_t, int16_t +#include <functional> // for std::function +#include <set> // for std::pair +#include <vector> // for std::vector + +#include <sys/types.h> // for int8_t + +struct Pix; + +namespace tesseract { + +class BLOCK; +class BLOCK_LIST; +class BLOCK_RES; +class ROW; +class ROW_RES; +class SEAM; +class WERD_RES; + +struct TWERD; + +class BoxWord; +class Tesseract; +struct FontInfo; + +/* Forward declarations */ + +class BLOCK_RES; + +ELISTIZEH(BLOCK_RES) +CLISTIZEH(BLOCK_RES) +class ROW_RES; + +ELISTIZEH(ROW_RES) +class WERD_RES; + +ELISTIZEH(WERD_RES) + +/************************************************************************* + * PAGE_RES - Page results + *************************************************************************/ +class PAGE_RES { // page result +public: + int32_t char_count; + int32_t rej_count; + BLOCK_RES_LIST block_res_list; + bool rejected; + // Updated every time PAGE_RES_IT iterating on this PAGE_RES moves to + // the next word. This pointer is not owned by PAGE_RES class. + WERD_CHOICE **prev_word_best_choice; + // Sums of blame reasons computed by the blamer. + std::vector<int> blame_reasons; + // Debug information about all the misadaptions on this page. + // Each BlamerBundle contains an index into this vector, so that words that + // caused misadaption could be marked. However, since words could be + // deleted/split/merged, the log is stored on the PAGE_RES level. + std::vector<std::string> misadaption_log; + + inline void Init() { + char_count = 0; + rej_count = 0; + rejected = false; + prev_word_best_choice = nullptr; + blame_reasons.clear(); + blame_reasons.resize(IRR_NUM_REASONS); + } + + PAGE_RES() { + Init(); + } // empty constructor + + PAGE_RES(bool merge_similar_words, + BLOCK_LIST *block_list, // real blocks + WERD_CHOICE **prev_word_best_choice_ptr); + + ~PAGE_RES() = default; +}; + +/************************************************************************* + * BLOCK_RES - Block results + *************************************************************************/ + +class BLOCK_RES : public ELIST_LINK { +public: + BLOCK *block; // real block + int32_t char_count; // chars in block + int32_t rej_count; // rejected chars + int16_t font_class; // + int16_t row_count; + float x_height; + bool font_assigned; // block already + // processed + + ROW_RES_LIST row_res_list; + + BLOCK_RES() = default; + + BLOCK_RES(bool merge_similar_words, BLOCK *the_block); // real block + + ~BLOCK_RES() = default; +}; + +/************************************************************************* + * ROW_RES - Row results + *************************************************************************/ + +class ROW_RES : public ELIST_LINK { +public: + ROW *row; // real row + int32_t char_count; // chars in block + int32_t rej_count; // rejected chars + int32_t whole_word_rej_count; // rejs in total rej wds + WERD_RES_LIST word_res_list; + + ROW_RES() = default; + + ROW_RES(bool merge_similar_words, ROW *the_row); // real row + + ~ROW_RES() = default; +}; + +/************************************************************************* + * WERD_RES - Word results + *************************************************************************/ +enum CRUNCH_MODE { CR_NONE, CR_KEEP_SPACE, CR_LOOSE_SPACE, CR_DELETE }; + +// WERD_RES is a collection of publicly accessible members that gathers +// information about a word result. +class TESS_API WERD_RES : public ELIST_LINK { +public: + // Which word is which? + // There are 3 coordinate spaces in use here: a possibly rotated pixel space, + // the original image coordinate space, and the BLN space in which the + // baseline of a word is at kBlnBaselineOffset, the xheight is kBlnXHeight, + // and the x-middle of the word is at 0. + // In the rotated pixel space, coordinates correspond to the input image, + // but may be rotated about the origin by a multiple of 90 degrees, + // and may therefore be negative. + // In any case a rotation by denorm.block()->re_rotation() will take them + // back to the original image. + // The other differences between words all represent different stages of + // processing during recognition. + + // ---------------------------INPUT------------------------------------- + + // The word is the input C_BLOBs in the rotated pixel space. + // word is NOT owned by the WERD_RES unless combination is true. + // All the other word pointers ARE owned by the WERD_RES. + WERD *word = nullptr; // Input C_BLOB word. + + // -------------SETUP BY SetupFor*Recognition---READONLY-INPUT------------ + + // The bln_boxes contains the bounding boxes (only) of the input word, in the + // BLN space. The lengths of word and bln_boxes + // match as they are both before any chopping. + // TODO(rays) determine if docqual does anything useful and delete bln_boxes + // if it doesn't. + tesseract::BoxWord *bln_boxes = nullptr; // BLN input bounding boxes. + // The ROW that this word sits in. NOT owned by the WERD_RES. + ROW *blob_row = nullptr; + // The denorm provides the transformation to get back to the rotated image + // coords from the chopped_word/rebuild_word BLN coords, but each blob also + // has its own denorm. + DENORM denorm; // For use on chopped_word. + // Unicharset used by the classifier output in best_choice and raw_choice. + const UNICHARSET *uch_set = nullptr; // For converting back to utf8. + + // ----Initialized by SetupFor*Recognition---BUT OUTPUT FROM RECOGNITION---- + // ----Setup to a (different!) state expected by the various classifiers---- + // TODO(rays) Tidy and make more consistent. + + // The chopped_word is also in BLN space, and represents the fully chopped + // character fragments that make up the word. + // The length of chopped_word matches length of seam_array + 1 (if set). + TWERD *chopped_word = nullptr; // BLN chopped fragments output. + // Vector of SEAM* holding chopping points matching chopped_word. + std::vector<SEAM *> seam_array; + // Widths of blobs in chopped_word. + std::vector<int> blob_widths; + // Gaps between blobs in chopped_word. blob_gaps[i] is the gap between + // blob i and blob i+1. + std::vector<int> blob_gaps; + // Stores the lstm choices of every timestep + std::vector<std::vector<std::pair<const char *, float>>> timesteps; + // Stores the lstm choices of every timestep segmented by character + std::vector<std::vector<std::vector<std::pair<const char *, float>>>> + segmented_timesteps; + // Symbolchoices acquired during CTC + std::vector<std::vector<std::pair<const char *, float>>> CTC_symbol_choices; + // Stores if the timestep vector starts with a space + bool leading_space = false; + // Stores value when the word ends + int end = 0; + // Ratings matrix contains classifier choices for each classified combination + // of blobs. The dimension is the same as the number of blobs in chopped_word + // and the leading diagonal corresponds to classifier results of the blobs + // in chopped_word. The state_ members of best_choice, raw_choice and + // best_choices all correspond to this ratings matrix and allow extraction + // of the blob choices for any given WERD_CHOICE. + MATRIX *ratings = nullptr; // Owned pointer. + // Pointer to the first WERD_CHOICE in best_choices. This is the result that + // will be output from Tesseract. Note that this is now a borrowed pointer + // and should NOT be deleted. + WERD_CHOICE *best_choice = nullptr; // Borrowed pointer. + // The best raw_choice found during segmentation search. Differs from the + // best_choice by being the best result according to just the character + // classifier, not taking any language model information into account. + // Unlike best_choice, the pointer IS owned by this WERD_RES. + WERD_CHOICE *raw_choice = nullptr; // Owned pointer. + // Alternative results found during chopping/segmentation search stages. + // Note that being an ELIST, best_choices owns the WERD_CHOICEs. + WERD_CHOICE_LIST best_choices; + + // Truth bounding boxes, text and incorrect choice reason. + BlamerBundle *blamer_bundle = nullptr; + + // --------------OUTPUT FROM RECOGNITION------------------------------- + // --------------Not all fields are necessarily set.------------------- + // ---best_choice, raw_choice *must* end up set, with a box_word------- + // ---In complete output, the number of blobs in rebuild_word matches--- + // ---the number of boxes in box_word, the number of unichar_ids in--- + // ---best_choice, the number of ints in best_state, and the number--- + // ---of strings in correct_text-------------------------------------- + // ---SetupFake Sets everything to appropriate values if the word is--- + // ---known to be bad before recognition.------------------------------ + + // The rebuild_word is also in BLN space, but represents the final best + // segmentation of the word. Its length is therefore the same as box_word. + TWERD *rebuild_word = nullptr; // BLN best segmented word. + // The box_word is in the original image coordinate space. It is the + // bounding boxes of the rebuild_word, after denormalization. + // The length of box_word matches rebuild_word, best_state (if set) and + // correct_text (if set), as well as best_choice and represents the + // number of classified units in the output. + tesseract::BoxWord *box_word = nullptr; // Denormalized output boxes. + // The Tesseract that was used to recognize this word. Just a borrowed + // pointer. Note: Tesseract's class definition is in a higher-level library. + // We avoid introducing a cyclic dependency by not using the Tesseract + // within WERD_RES. We are just storing it to provide access to it + // for the top-level multi-language controller, and maybe for output of + // the recognized language. + // tesseract points to data owned elsewhere. + tesseract::Tesseract *tesseract = nullptr; + // The best_state stores the relationship between chopped_word and + // rebuild_word. Each blob[i] in rebuild_word is composed of best_state[i] + // adjacent blobs in chopped_word. The seams in seam_array are hidden + // within a rebuild_word blob and revealed between them. + std::vector<int> best_state; // Number of blobs in each best blob. + // The correct_text is used during training and adaption to carry the + // text to the training system without the need for a unicharset. There + // is one entry in the vector for each blob in rebuild_word and box_word. + std::vector<std::string> correct_text; + + // Less-well documented members. + // TODO(rays) Add more documentation here. + WERD_CHOICE *ep_choice = nullptr; // ep text TODO(rays) delete this. + REJMAP reject_map; // best_choice rejects + bool tess_failed = false; + /* + If tess_failed is true, one of the following tests failed when Tess + returned: + - The outword blob list was not the same length as the best_choice string; + - The best_choice string contained ALL blanks; + - The best_choice string was zero length +*/ + bool tess_accepted = false; // Tess thinks its ok? + bool tess_would_adapt = false; // Tess would adapt? + bool done = false; // ready for output? + bool small_caps = false; // word appears to be small caps + bool odd_size = false; // word is bigger than line or leader dots. + // The fontinfos are pointers to data owned by the classifier. + const FontInfo *fontinfo = nullptr; + const FontInfo *fontinfo2 = nullptr; + int8_t fontinfo_id_count = 0; // number of votes + int8_t fontinfo_id2_count = 0; // number of votes + bool guessed_x_ht = true; + bool guessed_caps_ht = true; + CRUNCH_MODE unlv_crunch_mode = CR_NONE; + float x_height = 0.0f; // post match estimate + float caps_height = 0.0f; // post match estimate + float baseline_shift = 0.0f; // post match estimate. + // Certainty score for the spaces either side of this word (LSTM mode). + // MIN this value with the actual word certainty. + float space_certainty = 0.0f; + + /* + To deal with fuzzy spaces we need to be able to combine "words" to form + combinations when we suspect that the gap is a non-space. The (new) text + ord code generates separate words for EVERY fuzzy gap - flags in the word + indicate whether the gap is below the threshold (fuzzy kern) and is thus + NOT a real word break by default, or above the threshold (fuzzy space) and + this is a real word break by default. + + The WERD_RES list contains all these words PLUS "combination" words built + out of (copies of) the words split by fuzzy kerns. The separate parts have + their "part_of_combo" flag set true and should be IGNORED on a default + reading of the list. + + Combination words are FOLLOWED by the sequence of part_of_combo words + which they combine. +*/ + bool combination = false; // of two fuzzy gap wds + bool part_of_combo = false; // part of a combo + bool reject_spaces = false; // Reject spacing? + + WERD_RES() = default; + + WERD_RES(WERD *the_word) { + word = the_word; + } + // Deep copies everything except the ratings MATRIX. + // To get that use deep_copy below. + WERD_RES(const WERD_RES &source) : ELIST_LINK(source) { + // combination is used in function Clear which is called from operator=. + combination = false; + *this = source; // see operator= + } + + ~WERD_RES(); + + // Returns the UTF-8 string for the given blob index in the best_choice word, + // given that we know whether we are in a right-to-left reading context. + // This matters for mirrorable characters such as parentheses. We recognize + // characters purely based on their shape on the page, and by default produce + // the corresponding unicode for a left-to-right context. + const char *BestUTF8(unsigned blob_index, bool in_rtl_context) const { + if (best_choice == nullptr || blob_index >= best_choice->length()) { + return nullptr; + } + UNICHAR_ID id = best_choice->unichar_id(blob_index); + if (static_cast<unsigned>(id) >= uch_set->size()) { + return nullptr; + } + UNICHAR_ID mirrored = uch_set->get_mirror(id); + if (in_rtl_context && mirrored > 0) { + id = mirrored; + } + return uch_set->id_to_unichar_ext(id); + } + // Returns the UTF-8 string for the given blob index in the raw_choice word. + const char *RawUTF8(unsigned blob_index) const { + if (blob_index >= raw_choice->length()) { + return nullptr; + } + UNICHAR_ID id = raw_choice->unichar_id(blob_index); + if (static_cast<unsigned>(id) >= uch_set->size()) { + return nullptr; + } + return uch_set->id_to_unichar(id); + } + + UNICHARSET::Direction SymbolDirection(unsigned blob_index) const { + if (best_choice == nullptr || blob_index >= best_choice->length()) { + return UNICHARSET::U_OTHER_NEUTRAL; + } + return uch_set->get_direction(best_choice->unichar_id(blob_index)); + } + + bool AnyRtlCharsInWord() const { + if (uch_set == nullptr || best_choice == nullptr || + best_choice->length() < 1) { + return false; + } + for (unsigned id = 0; id < best_choice->length(); id++) { + unsigned unichar_id = best_choice->unichar_id(id); + if (unichar_id >= uch_set->size()) { + continue; // Ignore illegal chars. + } + UNICHARSET::Direction dir = uch_set->get_direction(unichar_id); + if (dir == UNICHARSET::U_RIGHT_TO_LEFT || + dir == UNICHARSET::U_RIGHT_TO_LEFT_ARABIC) { + return true; + } + } + return false; + } + + bool AnyLtrCharsInWord() const { + if (uch_set == nullptr || best_choice == nullptr || + best_choice->length() < 1) { + return false; + } + for (unsigned id = 0; id < best_choice->length(); id++) { + unsigned unichar_id = best_choice->unichar_id(id); + if (unichar_id >= uch_set->size()) { + continue; // Ignore illegal chars. + } + UNICHARSET::Direction dir = uch_set->get_direction(unichar_id); + if (dir == UNICHARSET::U_LEFT_TO_RIGHT || + dir == UNICHARSET::U_ARABIC_NUMBER) { + return true; + } + } + return false; + } + + // Return whether the blobs in this WERD_RES 0, 1,... come from an engine + // that gave us the unichars in reading order (as opposed to strict left + // to right). + bool UnicharsInReadingOrder() const { + return best_choice->unichars_in_script_order(); + } + + void Clear(); + void ClearResults(); + void ClearWordChoices(); + void ClearRatings(); + + // Deep copies everything except the ratings MATRIX. + // To get that use deep_copy below. + WERD_RES &operator=(const WERD_RES &source); // from this + + void CopySimpleFields(const WERD_RES &source); + + // Initializes a blank (default constructed) WERD_RES from one that has + // already been recognized. + // Use SetupFor*Recognition afterwards to complete the setup and make + // it ready for a retry recognition. + void InitForRetryRecognition(const WERD_RES &source); + + // Sets up the members used in recognition: bln_boxes, chopped_word, + // seam_array, denorm. Returns false if + // the word is empty and sets up fake results. If use_body_size is + // true and row->body_size is set, then body_size will be used for + // blob normalization instead of xheight + ascrise. This flag is for + // those languages that are using CJK pitch model and thus it has to + // be true if and only if tesseract->textord_use_cjk_fp_model is + // true. + // If allow_detailed_fx is true, the feature extractor will receive fine + // precision outline information, allowing smoother features and better + // features on low resolution images. + // The norm_mode sets the default mode for normalization in absence + // of any of the above flags. It should really be a tesseract::OcrEngineMode + // but is declared as int for ease of use with tessedit_ocr_engine_mode. + // Returns false if the word is empty and sets up fake results. + bool SetupForRecognition(const UNICHARSET &unicharset_in, + tesseract::Tesseract *tesseract, Image pix, + int norm_mode, const TBOX *norm_box, + bool numeric_mode, bool use_body_size, + bool allow_detailed_fx, ROW *row, + const BLOCK *block); + + // Set up the seam array, bln_boxes, best_choice, and raw_choice to empty + // accumulators from a made chopped word. We presume the fields are already + // empty. + void SetupBasicsFromChoppedWord(const UNICHARSET &unicharset_in); + + // Sets up the members used in recognition for an empty recognition result: + // bln_boxes, chopped_word, seam_array, denorm, best_choice, raw_choice. + void SetupFake(const UNICHARSET &uch); + + // Set the word as having the script of the input unicharset. + void SetupWordScript(const UNICHARSET &unicharset_in); + + // Sets up the blamer_bundle if it is not null, using the initialized denorm. + void SetupBlamerBundle(); + + // Computes the blob_widths and blob_gaps from the chopped_word. + void SetupBlobWidthsAndGaps(); + + // Updates internal data to account for a new SEAM (chop) at the given + // blob_number. Fixes the ratings matrix and states in the choices, as well + // as the blob widths and gaps. + void InsertSeam(int blob_number, SEAM *seam); + + // Returns true if all the word choices except the first have adjust_factors + // worse than the given threshold. + bool AlternativeChoiceAdjustmentsWorseThan(float threshold) const; + + // Returns true if the current word is ambiguous (by number of answers or + // by dangerous ambigs.) + bool IsAmbiguous(); + + // Returns true if the ratings matrix size matches the sum of each of the + // segmentation states. + bool StatesAllValid(); + + // Prints a list of words found if debug is true or the word result matches + // the word_to_debug. + void DebugWordChoices(bool debug, const char *word_to_debug); + + // Prints the top choice along with the accepted/done flags. + void DebugTopChoice(const char *msg) const; + + // Removes from best_choices all choices which are not within a reasonable + // range of the best choice. + void FilterWordChoices(int debug_level); + + // Computes a set of distance thresholds used to control adaption. + // Compares the best choice for the current word to the best raw choice + // to determine which characters were classified incorrectly by the + // classifier. Then places a separate threshold into thresholds for each + // character in the word. If the classifier was correct, max_rating is placed + // into thresholds. If the classifier was incorrect, the mean match rating + // (error percentage) of the classifier's incorrect choice minus some margin + // is placed into thresholds. This can then be used by the caller to try to + // create a new template for the desired class that will classify the + // character with a rating better than the threshold value. The match rating + // placed into thresholds is never allowed to be below min_rating in order to + // prevent trying to make overly tight templates. + // min_rating limits how tight to make a template. + // max_rating limits how loose to make a template. + // rating_margin denotes the amount of margin to put in template. + void ComputeAdaptionThresholds(float certainty_scale, float min_rating, + float max_rating, float rating_margin, + float *thresholds); + + // Saves a copy of the word_choice if it has the best unadjusted rating. + // Returns true if the word_choice was the new best. + bool LogNewRawChoice(WERD_CHOICE *word_choice); + // Consumes word_choice by adding it to best_choices, (taking ownership) if + // the certainty for word_choice is some distance of the best choice in + // best_choices, or by deleting the word_choice and returning false. + // The best_choices list is kept in sorted order by rating. Duplicates are + // removed, and the list is kept no longer than max_num_choices in length. + // Returns true if the word_choice is still a valid pointer. + bool LogNewCookedChoice(int max_num_choices, bool debug, + WERD_CHOICE *word_choice); + + // Prints a brief list of all the best choices. + void PrintBestChoices() const; + + // Returns the sum of the widths of the blob between start_blob and last_blob + // inclusive. + int GetBlobsWidth(int start_blob, int last_blob) const; + // Returns the width of a gap between the specified blob and the next one. + int GetBlobsGap(unsigned blob_index) const; + + // Returns the BLOB_CHOICE corresponding to the given index in the + // best choice word taken from the appropriate cell in the ratings MATRIX. + // Borrowed pointer, so do not delete. May return nullptr if there is no + // BLOB_CHOICE matching the unichar_id at the given index. + BLOB_CHOICE *GetBlobChoice(unsigned index) const; + + // Returns the BLOB_CHOICE_LIST corresponding to the given index in the + // best choice word taken from the appropriate cell in the ratings MATRIX. + // Borrowed pointer, so do not delete. + BLOB_CHOICE_LIST *GetBlobChoices(int index) const; + + // Moves the results fields from word to this. This takes ownership of all + // the data, so src can be destructed. + // word1.ConsumeWordResult(word); + // delete word; + // is simpler and faster than: + // word1 = *word; + // delete word; + // as it doesn't need to copy and reallocate anything. + void ConsumeWordResults(WERD_RES *word); + + // Replace the best choice and rebuild box word. + // choice must be from the current best_choices list. + void ReplaceBestChoice(WERD_CHOICE *choice); + + // Builds the rebuild_word and sets the best_state from the chopped_word and + // the best_choice->state. + void RebuildBestState(); + + // Copies the chopped_word to the rebuild_word, faking a best_state as well. + // Also sets up the output box_word. + void CloneChoppedToRebuild(); + + // Sets/replaces the box_word with one made from the rebuild_word. + void SetupBoxWord(); + + // Sets up the script positions in the best_choice using the best_choice + // to get the unichars, and the unicharset to get the target positions. + void SetScriptPositions(); + // Sets all the blobs in all the words (best choice and alternates) to be + // the given position. (When a sub/superscript is recognized as a separate + // word, it falls victim to the rule that a whole word cannot be sub or + // superscript, so this function overrides that problem.) + void SetAllScriptPositions(tesseract::ScriptPos position); + + // Classifies the word with some already-calculated BLOB_CHOICEs. + // The choices are an array of blob_count pointers to BLOB_CHOICE, + // providing a single classifier result for each blob. + // The BLOB_CHOICEs are consumed and the word takes ownership. + // The number of blobs in the box_word must match blob_count. + void FakeClassifyWord(unsigned blob_count, BLOB_CHOICE **choices); + + // Creates a WERD_CHOICE for the word using the top choices from the leading + // diagonal of the ratings matrix. + void FakeWordFromRatings(PermuterType permuter); + + // Copies the best_choice strings to the correct_text for adaption/training. + void BestChoiceToCorrectText(); + + // Merges 2 adjacent blobs in the result if the permanent callback + // class_cb returns other than INVALID_UNICHAR_ID, AND the permanent + // callback box_cb is nullptr or returns true, setting the merged blob + // result to the class returned from class_cb. + // Returns true if anything was merged. + bool ConditionalBlobMerge( + const std::function<UNICHAR_ID(UNICHAR_ID, UNICHAR_ID)> &class_cb, + const std::function<bool(const TBOX &, const TBOX &)> &box_cb); + + // Merges 2 adjacent blobs in the result (index and index+1) and corrects + // all the data to account for the change. + void MergeAdjacentBlobs(unsigned index); + + // Callback helper for fix_quotes returns a double quote if both + // arguments are quote, otherwise INVALID_UNICHAR_ID. + UNICHAR_ID BothQuotes(UNICHAR_ID id1, UNICHAR_ID id2); + void fix_quotes(); + + // Callback helper for fix_hyphens returns UNICHAR_ID of - if both + // arguments are hyphen, otherwise INVALID_UNICHAR_ID. + UNICHAR_ID BothHyphens(UNICHAR_ID id1, UNICHAR_ID id2); + // Callback helper for fix_hyphens returns true if box1 and box2 overlap + // (assuming both on the same textline, are in order and a chopped em dash.) + bool HyphenBoxesOverlap(const TBOX &box1, const TBOX &box2); + void fix_hyphens(); + + // Callback helper for merge_tess_fails returns a space if both + // arguments are space, otherwise INVALID_UNICHAR_ID. + UNICHAR_ID BothSpaces(UNICHAR_ID id1, UNICHAR_ID id2); + void merge_tess_fails(); + + // Returns a really deep copy of *src, including the ratings MATRIX. + static WERD_RES *deep_copy(const WERD_RES *src) { + auto *result = new WERD_RES(*src); + // That didn't copy the ratings, but we want a copy if there is one to + // begin with. + if (src->ratings != nullptr) { + result->ratings = src->ratings->DeepCopy(); + } + return result; + } + + // Copy blobs from word_res onto this word (eliminating spaces between). + // Since this may be called bidirectionally OR both the BOL and EOL flags. + void copy_on(WERD_RES *word_res) { // from this word + word->set_flag(W_BOL, word->flag(W_BOL) || word_res->word->flag(W_BOL)); + word->set_flag(W_EOL, word->flag(W_EOL) || word_res->word->flag(W_EOL)); + word->copy_on(word_res->word); + } + + // Returns true if the collection of count pieces, starting at start, are all + // natural connected components, ie there are no real chops involved. + bool PiecesAllNatural(int start, int count) const; +}; + +/************************************************************************* + * PAGE_RES_IT - Page results iterator + *************************************************************************/ + +class TESS_API PAGE_RES_IT { +public: + PAGE_RES *page_res; // page being iterated + + PAGE_RES_IT() = default; + + PAGE_RES_IT(PAGE_RES *the_page_res) { // page result + page_res = the_page_res; + restart_page(); // ready to scan + } + + // Do two PAGE_RES_ITs point at the same word? + // This is much cheaper than cmp(). + bool operator==(const PAGE_RES_IT &other) const { + return word_res == other.word_res && row_res == other.row_res && + block_res == other.block_res; + } + + bool operator!=(const PAGE_RES_IT &other) const { + return !(*this == other); + } + + // Given another PAGE_RES_IT to the same page, + // this before other: -1 + // this equal to other: 0 + // this later than other: 1 + int cmp(const PAGE_RES_IT &other) const; + + WERD_RES *restart_page() { + return start_page(false); // Skip empty blocks. + } + WERD_RES *restart_page_with_empties() { + return start_page(true); // Allow empty blocks. + } + WERD_RES *start_page(bool empty_ok); + + WERD_RES *restart_row(); + + // ============ Methods that mutate the underling structures =========== + // Note that these methods will potentially invalidate other PAGE_RES_ITs + // and are intended to be used only while a single PAGE_RES_IT is active. + // This problem needs to be taken into account if these mutation operators + // are ever provided to PageIterator or its subclasses. + + // Inserts the new_word and a corresponding WERD_RES before the current + // position. The simple fields of the WERD_RES are copied from clone_res and + // the resulting WERD_RES is returned for further setup with best_choice etc. + WERD_RES *InsertSimpleCloneWord(const WERD_RES &clone_res, WERD *new_word); + + // Replaces the current WERD/WERD_RES with the given words. The given words + // contain fake blobs that indicate the position of the characters. These are + // replaced with real blobs from the current word as much as possible. + void ReplaceCurrentWord(PointerVector<WERD_RES> *words); + + // Deletes the current WERD_RES and its underlying WERD. + void DeleteCurrentWord(); + + // Makes the current word a fuzzy space if not already fuzzy. Updates + // corresponding part of combo if required. + void MakeCurrentWordFuzzy(); + + WERD_RES *forward() { // Get next word. + return internal_forward(false, false); + } + // Move forward, but allow empty blocks to show as single nullptr words. + WERD_RES *forward_with_empties() { + return internal_forward(false, true); + } + + WERD_RES *forward_paragraph(); // get first word in next non-empty paragraph + WERD_RES *forward_block(); // get first word in next non-empty block + + WERD_RES *prev_word() const { // previous word + return prev_word_res; + } + ROW_RES *prev_row() const { // row of prev word + return prev_row_res; + } + BLOCK_RES *prev_block() const { // block of prev word + return prev_block_res; + } + WERD_RES *word() const { // current word + return word_res; + } + ROW_RES *row() const { // row of current word + return row_res; + } + BLOCK_RES *block() const { // block of cur. word + return block_res; + } + WERD_RES *next_word() const { // next word + return next_word_res; + } + ROW_RES *next_row() const { // row of next word + return next_row_res; + } + BLOCK_RES *next_block() const { // block of next word + return next_block_res; + } + void rej_stat_word(); // for page/block/row + void ResetWordIterator(); + +private: + WERD_RES *internal_forward(bool new_block, bool empty_ok); + + WERD_RES *prev_word_res; // previous word + ROW_RES *prev_row_res; // row of prev word + BLOCK_RES *prev_block_res; // block of prev word + + WERD_RES *word_res; // current word + ROW_RES *row_res; // row of current word + BLOCK_RES *block_res; // block of cur. word + + WERD_RES *next_word_res; // next word + ROW_RES *next_row_res; // row of next word + BLOCK_RES *next_block_res; // block of next word + + BLOCK_RES_IT block_res_it; // iterators + ROW_RES_IT row_res_it; + WERD_RES_IT word_res_it; + // Iterators used to get the state of word_res_it for the current word. + // Since word_res_it is 2 words further on, this is otherwise hard to do. + WERD_RES_IT wr_it_of_current_word; + WERD_RES_IT wr_it_of_next_word; +}; + +} // namespace tesseract + +#endif
