Mercurial > hgrepos > Python2 > PyMuPDF
diff mupdf-source/thirdparty/tesseract/src/lstm/recodebeam.h @ 2:b50eed0cc0ef upstream
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
The directory name has changed: no version number in the expanded directory now.
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Mon, 15 Sep 2025 11:43:07 +0200 |
| parents | |
| children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/mupdf-source/thirdparty/tesseract/src/lstm/recodebeam.h Mon Sep 15 11:43:07 2025 +0200 @@ -0,0 +1,429 @@ +/////////////////////////////////////////////////////////////////////// +// File: recodebeam.h +// Description: Beam search to decode from the re-encoded CJK as a sequence of +// smaller numbers in place of a single large code. +// Author: Ray Smith +// +// (C) Copyright 2015, Google Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +/////////////////////////////////////////////////////////////////////// + +#ifndef THIRD_PARTY_TESSERACT_LSTM_RECODEBEAM_H_ +#define THIRD_PARTY_TESSERACT_LSTM_RECODEBEAM_H_ + +#include "dawg.h" +#include "dict.h" +#include "genericheap.h" +#include "genericvector.h" +#include "kdpair.h" +#include "networkio.h" +#include "ratngs.h" +#include "unicharcompress.h" + +#include <unordered_set> // for std::unordered_set +#include <vector> // for std::vector + +namespace tesseract { + +// Enum describing what can follow the current node. +// Consider the following softmax outputs: +// Timestep 0 1 2 3 4 5 6 7 8 +// X-score 0.01 0.55 0.98 0.42 0.01 0.01 0.40 0.95 0.01 +// Y-score 0.00 0.01 0.01 0.01 0.01 0.97 0.59 0.04 0.01 +// Null-score 0.99 0.44 0.01 0.57 0.98 0.02 0.01 0.01 0.98 +// Then the correct CTC decoding (in which adjacent equal classes are folded, +// and then all nulls are dropped) is clearly XYX, but simple decoding (taking +// the max at each timestep) leads to: +// Null@0.99 X@0.55 X@0.98 Null@0.57 Null@0.98 Y@0.97 Y@0.59 X@0.95 Null@0.98, +// which folds to the correct XYX. The conversion to Tesseract rating and +// certainty uses the sum of the log probs (log of the product of probabilities) +// for the Rating and the minimum log prob for the certainty, but that yields a +// minimum certainty of log(0.55), which is poor for such an obvious case. +// CTC says that the probability of the result is the SUM of the products of the +// probabilities over ALL PATHS that decode to the same result, which includes: +// NXXNNYYXN, NNXNNYYN, NXXXNYYXN, NNXXNYXXN, and others including XXXXXYYXX. +// That is intractable, so some compromise between simple and ideal is needed. +// Observing that evenly split timesteps rarely happen next to each other, we +// allow scores at a transition between classes to be added for decoding thus: +// N@0.99 (N+X)@0.99 X@0.98 (N+X)@0.99 N@0.98 Y@0.97 (X+Y+N)@1.00 X@0.95 N@0.98. +// This works because NNX and NXX both decode to X, so in the middle we can use +// N+X. Note that the classes either side of a sum must stand alone, i.e. use a +// single score, to force all paths to pass through them and decode to the same +// result. Also in the special case of a transition from X to Y, with only one +// timestep between, it is possible to add X+Y+N, since XXY, XYY, and XNY all +// decode to XY. +// An important condition is that we cannot combine X and Null between two +// stand-alone Xs, since that can decode as XNX->XX or XXX->X, so the scores for +// X and Null have to go in separate paths. Combining scores in this way +// provides a much better minimum certainty of log(0.95). +// In the implementation of the beam search, we have to place the possibilities +// X, X+N and X+Y+N in the beam under appropriate conditions of the previous +// node, and constrain what can follow, to enforce the rules explained above. +// We therefore have 3 different types of node determined by what can follow: +enum NodeContinuation { + NC_ANYTHING, // This node used just its own score, so anything can follow. + NC_ONLY_DUP, // The current node combined another score with the score for + // itself, without a stand-alone duplicate before, so must be + // followed by a stand-alone duplicate. + NC_NO_DUP, // The current node combined another score with the score for + // itself, after a stand-alone, so can only be followed by + // something other than a duplicate of the current node. + NC_COUNT +}; + +// Enum describing the top-n status of a code. +enum TopNState { + TN_TOP2, // Winner or 2nd. + TN_TOPN, // Runner up in top-n, but not 1st or 2nd. + TN_ALSO_RAN, // Not in the top-n. + TN_COUNT +}; + +// Lattice element for Re-encode beam search. +struct RecodeNode { + RecodeNode() + : code(-1) + , unichar_id(INVALID_UNICHAR_ID) + , permuter(TOP_CHOICE_PERM) + , start_of_dawg(false) + , start_of_word(false) + , end_of_word(false) + , duplicate(false) + , certainty(0.0f) + , score(0.0f) + , prev(nullptr) + , dawgs(nullptr) + , code_hash(0) {} + RecodeNode(int c, int uni_id, PermuterType perm, bool dawg_start, bool word_start, bool end, + bool dup, float cert, float s, const RecodeNode *p, DawgPositionVector *d, + uint64_t hash) + : code(c) + , unichar_id(uni_id) + , permuter(perm) + , start_of_dawg(dawg_start) + , start_of_word(word_start) + , end_of_word(end) + , duplicate(dup) + , certainty(cert) + , score(s) + , prev(p) + , dawgs(d) + , code_hash(hash) {} + // NOTE: If we could use C++11, then this would be a move constructor. + // Instead we have copy constructor that does a move!! This is because we + // don't want to copy the whole DawgPositionVector each time, and true + // copying isn't necessary for this struct. It does get moved around a lot + // though inside the heap and during heap push, hence the move semantics. + RecodeNode(const RecodeNode &src) : dawgs(nullptr) { + *this = src; + ASSERT_HOST(src.dawgs == nullptr); + } + RecodeNode &operator=(const RecodeNode &src) { + delete dawgs; + memcpy(this, &src, sizeof(src)); + ((RecodeNode &)src).dawgs = nullptr; + return *this; + } + ~RecodeNode() { + delete dawgs; + } + // Prints details of the node. + void Print(int null_char, const UNICHARSET &unicharset, int depth) const; + + // The re-encoded code here = index to network output. + int code; + // The decoded unichar_id is only valid for the final code of a sequence. + int unichar_id; + // The type of permuter active at this point. Intervals between start_of_word + // and end_of_word make valid words of type given by permuter where + // end_of_word is true. These aren't necessarily delimited by spaces. + PermuterType permuter; + // True if this is the initial dawg state. May be attached to a space or, + // in a non-space-delimited lang, the end of the previous word. + bool start_of_dawg; + // True if this is the first node in a dictionary word. + bool start_of_word; + // True if this represents a valid candidate end of word position. Does not + // necessarily mark the end of a word, since a word can be extended beyond a + // candidate end by a continuation, eg 'the' continues to 'these'. + bool end_of_word; + // True if this->code is a duplicate of prev->code. Some training modes + // allow the network to output duplicate characters and crush them with CTC, + // but that would mess up the dictionary search, so we just smash them + // together on the fly using the duplicate flag. + bool duplicate; + // Certainty (log prob) of (just) this position. + float certainty; + // Total certainty of the path to this position. + float score; + // The previous node in this chain. Borrowed pointer. + const RecodeNode *prev; + // The currently active dawgs at this position. Owned pointer. + DawgPositionVector *dawgs; + // A hash of all codes in the prefix and this->code as well. Used for + // duplicate path removal. + uint64_t code_hash; +}; + +using RecodePair = KDPairInc<double, RecodeNode>; +using RecodeHeap = GenericHeap<RecodePair>; + +// Class that holds the entire beam search for recognition of a text line. +class TESS_API RecodeBeamSearch { +public: + // Borrows the pointer, which is expected to survive until *this is deleted. + RecodeBeamSearch(const UnicharCompress &recoder, int null_char, bool simple_text, Dict *dict); + ~RecodeBeamSearch(); + + // Decodes the set of network outputs, storing the lattice internally. + // If charset is not null, it enables detailed debugging of the beam search. + void Decode(const NetworkIO &output, double dict_ratio, double cert_offset, + double worst_dict_cert, const UNICHARSET *charset, int lstm_choice_mode = 0); + void Decode(const GENERIC_2D_ARRAY<float> &output, double dict_ratio, double cert_offset, + double worst_dict_cert, const UNICHARSET *charset); + + void DecodeSecondaryBeams(const NetworkIO &output, double dict_ratio, double cert_offset, + double worst_dict_cert, const UNICHARSET *charset, + int lstm_choice_mode = 0); + + // Returns the best path as labels/scores/xcoords similar to simple CTC. + void ExtractBestPathAsLabels(std::vector<int> *labels, std::vector<int> *xcoords) const; + // Returns the best path as unichar-ids/certs/ratings/xcoords skipping + // duplicates, nulls and intermediate parts. + void ExtractBestPathAsUnicharIds(bool debug, const UNICHARSET *unicharset, + std::vector<int> *unichar_ids, std::vector<float> *certs, + std::vector<float> *ratings, std::vector<int> *xcoords) const; + + // Returns the best path as a set of WERD_RES. + void ExtractBestPathAsWords(const TBOX &line_box, float scale_factor, bool debug, + const UNICHARSET *unicharset, PointerVector<WERD_RES> *words, + int lstm_choice_mode = 0); + + // Generates debug output of the content of the beams after a Decode. + void DebugBeams(const UNICHARSET &unicharset) const; + + // Extract the best characters from the current decode iteration and block + // those symbols for the next iteration. In contrast to Tesseract's standard + // method to chose the best overall node chain, this methods looks at a short + // node chain segmented by the character boundaries and chooses the best + // option independent of the remaining node chain. + void extractSymbolChoices(const UNICHARSET *unicharset); + + // Generates debug output of the content of the beams after a Decode. + void PrintBeam2(bool uids, int num_outputs, const UNICHARSET *charset, bool secondary) const; + // Segments the timestep bundle by the character_boundaries. + void segmentTimestepsByCharacters(); + std::vector<std::vector<std::pair<const char *, float>>> + // Unions the segmented timestep character bundles to one big bundle. + combineSegmentedTimesteps( + std::vector<std::vector<std::vector<std::pair<const char *, float>>>> *segmentedTimesteps); + // Stores the alternative characters of every timestep together with their + // probability. + std::vector<std::vector<std::pair<const char *, float>>> timesteps; + std::vector<std::vector<std::vector<std::pair<const char *, float>>>> segmentedTimesteps; + // Stores the character choices found in the ctc algorithm + std::vector<std::vector<std::pair<const char *, float>>> ctc_choices; + // Stores all unicharids which are excluded for future iterations + std::vector<std::unordered_set<int>> excludedUnichars; + // Stores the character boundaries regarding timesteps. + std::vector<int> character_boundaries_; + // Clipping value for certainty inside Tesseract. Reflects the minimum value + // of certainty that will be returned by ExtractBestPathAsUnicharIds. + // Supposedly on a uniform scale that can be compared across languages and + // engines. + static constexpr float kMinCertainty = -20.0f; + // Number of different code lengths for which we have a separate beam. + static const int kNumLengths = RecodedCharID::kMaxCodeLen + 1; + // Total number of beams: dawg/nodawg * number of NodeContinuation * number + // of different lengths. + static const int kNumBeams = 2 * NC_COUNT * kNumLengths; + // Returns the relevant factor in the beams_ index. + static int LengthFromBeamsIndex(int index) { + return index % kNumLengths; + } + static NodeContinuation ContinuationFromBeamsIndex(int index) { + return static_cast<NodeContinuation>((index / kNumLengths) % NC_COUNT); + } + static bool IsDawgFromBeamsIndex(int index) { + return index / (kNumLengths * NC_COUNT) > 0; + } + // Computes a beams_ index from the given factors. + static int BeamIndex(bool is_dawg, NodeContinuation cont, int length) { + return (is_dawg * NC_COUNT + cont) * kNumLengths + length; + } + +private: + // Struct for the Re-encode beam search. This struct holds the data for + // a single time-step position of the output. Use a vector<RecodeBeam> + // to hold all the timesteps and prevent reallocation of the individual heaps. + struct RecodeBeam { + // Resets to the initial state without deleting all the memory. + void Clear() { + for (auto &beam : beams_) { + beam.clear(); + } + RecodeNode empty; + for (auto &best_initial_dawg : best_initial_dawgs_) { + best_initial_dawg = empty; + } + } + + // A separate beam for each combination of code length, + // NodeContinuation, and dictionary flag. Separating out all these types + // allows the beam to be quite narrow, and yet still have a low chance of + // losing the best path. + // We have to keep all these beams separate, since the highest scoring paths + // come from the paths that are most likely to dead-end at any time, like + // dawg paths, NC_ONLY_DUP etc. + // Each heap is stored with the WORST result at the top, so we can quickly + // get the top-n values. + RecodeHeap beams_[kNumBeams]; + // While the language model is only a single word dictionary, we can use + // word starts as a choke point in the beam, and keep only a single dict + // start node at each step (for each NodeContinuation type), so we find the + // best one here and push it on the heap, if it qualifies, after processing + // all of the step. + RecodeNode best_initial_dawgs_[NC_COUNT]; + }; + using TopPair = KDPairInc<float, int>; + + // Generates debug output of the content of a single beam position. + void DebugBeamPos(const UNICHARSET &unicharset, const RecodeHeap &heap) const; + + // Returns the given best_nodes as unichar-ids/certs/ratings/xcoords skipping + // duplicates, nulls and intermediate parts. + static void ExtractPathAsUnicharIds(const std::vector<const RecodeNode *> &best_nodes, + std::vector<int> *unichar_ids, std::vector<float> *certs, + std::vector<float> *ratings, std::vector<int> *xcoords, + std::vector<int> *character_boundaries = nullptr); + + // Sets up a word with the ratings matrix and fake blobs with boxes in the + // right places. + WERD_RES *InitializeWord(bool leading_space, const TBOX &line_box, int word_start, int word_end, + float space_certainty, const UNICHARSET *unicharset, + const std::vector<int> &xcoords, float scale_factor); + + // Fills top_n_flags_ with bools that are true iff the corresponding output + // is one of the top_n. + void ComputeTopN(const float *outputs, int num_outputs, int top_n); + + void ComputeSecTopN(std::unordered_set<int> *exList, const float *outputs, int num_outputs, + int top_n); + + // Adds the computation for the current time-step to the beam. Call at each + // time-step in sequence from left to right. outputs is the activation vector + // for the current timestep. + void DecodeStep(const float *outputs, int t, double dict_ratio, double cert_offset, + double worst_dict_cert, const UNICHARSET *charset, bool debug = false); + + void DecodeSecondaryStep(const float *outputs, int t, double dict_ratio, double cert_offset, + double worst_dict_cert, const UNICHARSET *charset, bool debug = false); + + // Saves the most certain choices for the current time-step. + void SaveMostCertainChoices(const float *outputs, int num_outputs, const UNICHARSET *charset, + int xCoord); + + // Calculates more accurate character boundaries which can be used to + // provide more accurate alternative symbol choices. + static void calculateCharBoundaries(std::vector<int> *starts, std::vector<int> *ends, + std::vector<int> *character_boundaries_, int maxWidth); + + // Adds to the appropriate beams the legal (according to recoder) + // continuations of context prev, which is from the given index to beams_, + // using the given network outputs to provide scores to the choices. Uses only + // those choices for which top_n_flags[code] == top_n_flag. + void ContinueContext(const RecodeNode *prev, int index, const float *outputs, + TopNState top_n_flag, const UNICHARSET *unicharset, double dict_ratio, + double cert_offset, double worst_dict_cert, RecodeBeam *step); + // Continues for a new unichar, using dawg or non-dawg as per flag. + void ContinueUnichar(int code, int unichar_id, float cert, float worst_dict_cert, + float dict_ratio, bool use_dawgs, NodeContinuation cont, + const RecodeNode *prev, RecodeBeam *step); + // Adds a RecodeNode composed of the args to the correct heap in step if + // unichar_id is a valid dictionary continuation of whatever is in prev. + void ContinueDawg(int code, int unichar_id, float cert, NodeContinuation cont, + const RecodeNode *prev, RecodeBeam *step); + // Sets the correct best_initial_dawgs_ with a RecodeNode composed of the args + // if better than what is already there. + void PushInitialDawgIfBetter(int code, int unichar_id, PermuterType permuter, bool start, + bool end, float cert, NodeContinuation cont, const RecodeNode *prev, + RecodeBeam *step); + // Adds a RecodeNode composed of the args to the correct heap in step for + // partial unichar or duplicate if there is room or if better than the + // current worst element if already full. + void PushDupOrNoDawgIfBetter(int length, bool dup, int code, int unichar_id, float cert, + float worst_dict_cert, float dict_ratio, bool use_dawgs, + NodeContinuation cont, const RecodeNode *prev, RecodeBeam *step); + // Adds a RecodeNode composed of the args to the correct heap in step if there + // is room or if better than the current worst element if already full. + void PushHeapIfBetter(int max_size, int code, int unichar_id, PermuterType permuter, + bool dawg_start, bool word_start, bool end, bool dup, float cert, + const RecodeNode *prev, DawgPositionVector *d, RecodeHeap *heap); + // Adds a RecodeNode to heap if there is room + // or if better than the current worst element if already full. + void PushHeapIfBetter(int max_size, RecodeNode *node, RecodeHeap *heap); + // Searches the heap for an entry matching new_node, and updates the entry + // with reshuffle if needed. Returns true if there was a match. + bool UpdateHeapIfMatched(RecodeNode *new_node, RecodeHeap *heap); + // Computes and returns the code-hash for the given code and prev. + uint64_t ComputeCodeHash(int code, bool dup, const RecodeNode *prev) const; + // Backtracks to extract the best path through the lattice that was built + // during Decode. On return the best_nodes vector essentially contains the set + // of code, score pairs that make the optimal path with the constraint that + // the recoder can decode the code sequence back to a sequence of unichar-ids. + void ExtractBestPaths(std::vector<const RecodeNode *> *best_nodes, + std::vector<const RecodeNode *> *second_nodes) const; + // Helper backtracks through the lattice from the given node, storing the + // path and reversing it. + void ExtractPath(const RecodeNode *node, std::vector<const RecodeNode *> *path) const; + void ExtractPath(const RecodeNode *node, std::vector<const RecodeNode *> *path, + int limiter) const; + // Helper prints debug information on the given lattice path. + void DebugPath(const UNICHARSET *unicharset, const std::vector<const RecodeNode *> &path) const; + // Helper prints debug information on the given unichar path. + void DebugUnicharPath(const UNICHARSET *unicharset, const std::vector<const RecodeNode *> &path, + const std::vector<int> &unichar_ids, const std::vector<float> &certs, + const std::vector<float> &ratings, const std::vector<int> &xcoords) const; + + static const int kBeamWidths[RecodedCharID::kMaxCodeLen + 1]; + + // The encoder/decoder that we will be using. + const UnicharCompress &recoder_; + // The beam for each timestep in the output. + std::vector<RecodeBeam *> beam_; + // Secondary Beam for Results with less Probability + std::vector<RecodeBeam *> secondary_beam_; + // The number of timesteps valid in beam_; + int beam_size_; + // A flag to indicate which outputs are the top-n choices. Current timestep + // only. + std::vector<TopNState> top_n_flags_; + // A record of the highest and second scoring codes. + int top_code_; + int second_code_; + // Heap used to compute the top_n_flags_. + GenericHeap<TopPair> top_heap_; + // Borrowed pointer to the dictionary to use in the search. + Dict *dict_; + // True if the language is space-delimited, which is true for most languages + // except chi*, jpn, tha. + bool space_delimited_; + // True if the input is simple text, ie adjacent equal chars are not to be + // eliminated. + bool is_simple_text_; + // The encoded (class label) of the null/reject character. + int null_char_; +}; + +} // namespace tesseract. + +#endif // THIRD_PARTY_TESSERACT_LSTM_RECODEBEAM_H_
