Mercurial > hgrepos > Python2 > PyMuPDF
view mupdf-source/thirdparty/tesseract/src/ccstruct/rejctmap.h @ 46:7ee69f120f19 default tip
>>>>> tag v1.26.5+1 for changeset b74429b0f5c4
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Sat, 11 Oct 2025 17:17:30 +0200 |
| parents | b50eed0cc0ef |
| children |
line wrap: on
line source
/**********************************************************************
 * File:        rejctmap.h  (Formerly rejmap.h)
 * Description: REJ and REJMAP class functions.
 * Author:      Phil Cheatle
 *
 * (C) Copyright 1994, Hewlett-Packard Ltd.
 ** Licensed under the Apache License, Version 2.0 (the "License");
 ** you may not use this file except in compliance with the License.
 ** You may obtain a copy of the License at
 ** http://www.apache.org/licenses/LICENSE-2.0
 ** Unless required by applicable law or agreed to in writing, software
 ** distributed under the License is distributed on an "AS IS" BASIS,
 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 ** See the License for the specific language governing permissions and
 ** limitations under the License.
 *

This module may look unnecessarily verbose, but here's the philosophy...

ALL processing of the reject map is done in this module. There are lots of
separate calls to set reject/accept flags. These have DELIBERATELY been kept
distinct so that this module can decide what to do.

Basically, there is a flag for each sort of rejection or acceptance. This
provides a history of what has happened to EACH character.

Determining whether a character is CURRENTLY rejected depends on implicit
understanding of the SEQUENCE of possible calls. The flags are defined and
grouped in the REJ_FLAGS enum. These groupings are used in determining a
character's CURRENT rejection status. Basically, a character is ACCEPTED if

    none of the permanent rej flags are set
  AND (    the character has never been rejected
        OR an accept flag is set which is LATER than the latest reject flag )

IT IS FUNDAMENTAL THAT ANYONE HACKING THIS CODE UNDERSTANDS THE SIGNIFICANCE
OF THIS IMPLIED TEMPORAL ORDERING OF THE FLAGS!!!!
**********************************************************************/ #ifndef REJCTMAP_H #define REJCTMAP_H #include "errcode.h" #include "params.h" #include <bitset> #include <memory> namespace tesseract { enum REJ_FLAGS { /* Reject modes which are NEVER overridden */ R_TESS_FAILURE, // PERM Tess didn't classify R_SMALL_XHT, // PERM Xht too small R_EDGE_CHAR, // PERM Too close to edge of image R_1IL_CONFLICT, // PERM 1Il confusion R_POSTNN_1IL, // PERM 1Il unrejected by NN R_REJ_CBLOB, // PERM Odd blob R_MM_REJECT, // PERM Matrix match rejection (m's) R_BAD_REPETITION, // TEMP Repeated char which doesn't match trend /* Initial reject modes (pre NN_ACCEPT) */ R_POOR_MATCH, // TEMP Ray's original heuristic (Not used) R_NOT_TESS_ACCEPTED, // TEMP Tess didn't accept WERD R_CONTAINS_BLANKS, // TEMP Tess failed on other chs in WERD R_BAD_PERMUTER, // POTENTIAL Bad permuter for WERD /* Reject modes generated after NN_ACCEPT but before MM_ACCEPT */ R_HYPHEN, // TEMP Post NN dodgy hyphen or full stop R_DUBIOUS, // TEMP Post NN dodgy chars R_NO_ALPHANUMS, // TEMP No alphanumerics in word after NN R_MOSTLY_REJ, // TEMP Most of word rejected so rej the rest R_XHT_FIXUP, // TEMP Xht tests unsure /* Reject modes generated after MM_ACCEPT but before QUALITY_ACCEPT */ R_BAD_QUALITY, // TEMP Quality metrics bad for WERD /* Reject modes generated after QUALITY_ACCEPT but before MINIMAL_REJ accep*/ R_DOC_REJ, // TEMP Document rejection R_BLOCK_REJ, // TEMP Block rejection R_ROW_REJ, // TEMP Row rejection R_UNLV_REJ, // TEMP ~ turned to - or ^ turned to space /* Accept modes which occur between the above rejection groups */ R_NN_ACCEPT, // NN acceptance R_HYPHEN_ACCEPT, // Hyphen acceptance R_MM_ACCEPT, // Matrix match acceptance R_QUALITY_ACCEPT, // Accept word in good quality doc R_MINIMAL_REJ_ACCEPT // Accept EVERYTHING except tess failures }; /* REJECT MAP VALUES */ #define MAP_ACCEPT '1' #define MAP_REJECT_PERM '0' #define MAP_REJECT_TEMP '2' #define MAP_REJECT_POTENTIAL '3' 
class REJ { std::bitset<32> flags; void set_flag(REJ_FLAGS rej_flag) { flags.set(rej_flag); } public: REJ() = default; REJ( // classwise copy const REJ &source) { flags = source.flags; } REJ &operator=( // assign REJ const REJ &source) = default; bool flag(REJ_FLAGS rej_flag) const { return flags[rej_flag]; } char display_char() const { if (perm_rejected()) { return MAP_REJECT_PERM; } else if (accept_if_good_quality()) { return MAP_REJECT_POTENTIAL; } else if (rejected()) { return MAP_REJECT_TEMP; } else { return MAP_ACCEPT; } } bool perm_rejected() const { // Is char perm reject? return (flag(R_TESS_FAILURE) || flag(R_SMALL_XHT) || flag(R_EDGE_CHAR) || flag(R_1IL_CONFLICT) || flag(R_POSTNN_1IL) || flag(R_REJ_CBLOB) || flag(R_BAD_REPETITION) || flag(R_MM_REJECT)); } private: bool rej_before_nn_accept() const { return flag(R_POOR_MATCH) || flag(R_NOT_TESS_ACCEPTED) || flag(R_CONTAINS_BLANKS) || flag(R_BAD_PERMUTER); } bool rej_between_nn_and_mm() const { return flag(R_HYPHEN) || flag(R_DUBIOUS) || flag(R_NO_ALPHANUMS) || flag(R_MOSTLY_REJ) || flag(R_XHT_FIXUP); } bool rej_between_mm_and_quality_accept() const { return flag(R_BAD_QUALITY); } bool rej_between_quality_and_minimal_rej_accept() const { return flag(R_DOC_REJ) || flag(R_BLOCK_REJ) || flag(R_ROW_REJ) || flag(R_UNLV_REJ); } bool rej_before_mm_accept() const { return rej_between_nn_and_mm() || (rej_before_nn_accept() && !flag(R_NN_ACCEPT) && !flag(R_HYPHEN_ACCEPT)); } bool rej_before_quality_accept() const { return rej_between_mm_and_quality_accept() || (!flag(R_MM_ACCEPT) && rej_before_mm_accept()); } public: bool rejected() const { // Is char rejected? if (flag(R_MINIMAL_REJ_ACCEPT)) { return false; } else { return (perm_rejected() || rej_between_quality_and_minimal_rej_accept() || (!flag(R_QUALITY_ACCEPT) && rej_before_quality_accept())); } } bool accept_if_good_quality() const { // potential rej? 
return (rejected() && !perm_rejected() && flag(R_BAD_PERMUTER) && !flag(R_POOR_MATCH) && !flag(R_NOT_TESS_ACCEPTED) && !flag(R_CONTAINS_BLANKS) && (!rej_between_nn_and_mm() && !rej_between_mm_and_quality_accept() && !rej_between_quality_and_minimal_rej_accept())); } void setrej_tess_failure() { // Tess generated blank set_flag(R_TESS_FAILURE); } void setrej_small_xht() { // Small xht char/wd set_flag(R_SMALL_XHT); } void setrej_edge_char() { // Close to image edge set_flag(R_EDGE_CHAR); } void setrej_1Il_conflict() { // Initial reject map set_flag(R_1IL_CONFLICT); } void setrej_postNN_1Il() { // 1Il after NN set_flag(R_POSTNN_1IL); } void setrej_rej_cblob() { // Insert duff blob set_flag(R_REJ_CBLOB); } void setrej_mm_reject() { // Matrix matcher set_flag(R_MM_REJECT); } void setrej_bad_repetition() { // Odd repeated char set_flag(R_BAD_REPETITION); } void setrej_poor_match() { // Failed Rays heuristic set_flag(R_POOR_MATCH); } void setrej_not_tess_accepted() { // TEMP reject_word set_flag(R_NOT_TESS_ACCEPTED); } void setrej_contains_blanks() { // TEMP reject_word set_flag(R_CONTAINS_BLANKS); } void setrej_bad_permuter() { // POTENTIAL reject_word set_flag(R_BAD_PERMUTER); } void setrej_hyphen() { // PostNN dubious hyphen or . 
set_flag(R_HYPHEN); } void setrej_dubious() { // PostNN dubious limit set_flag(R_DUBIOUS); } void setrej_no_alphanums() { // TEMP reject_word set_flag(R_NO_ALPHANUMS); } void setrej_mostly_rej() { // TEMP reject_word set_flag(R_MOSTLY_REJ); } void setrej_xht_fixup() { // xht fixup set_flag(R_XHT_FIXUP); } void setrej_bad_quality() { // TEMP reject_word set_flag(R_BAD_QUALITY); } void setrej_doc_rej() { // TEMP reject_word set_flag(R_DOC_REJ); } void setrej_block_rej() { // TEMP reject_word set_flag(R_BLOCK_REJ); } void setrej_row_rej() { // TEMP reject_word set_flag(R_ROW_REJ); } void setrej_unlv_rej() { // TEMP reject_word set_flag(R_UNLV_REJ); } void setrej_hyphen_accept() { // NN Flipped a char set_flag(R_HYPHEN_ACCEPT); } void setrej_nn_accept() { // NN Flipped a char set_flag(R_NN_ACCEPT); } void setrej_mm_accept() { // Matrix matcher set_flag(R_MM_ACCEPT); } void setrej_quality_accept() { // Quality flip a char set_flag(R_QUALITY_ACCEPT); } void setrej_minimal_rej_accept() { // Accept all except blank set_flag(R_MINIMAL_REJ_ACCEPT); } bool accepted() const { // Is char accepted? return !rejected(); } bool recoverable() const { return (rejected() && !perm_rejected()); } void full_print(FILE *fp) const; }; class REJMAP { std::unique_ptr<REJ[]> ptr; // ptr to the chars uint16_t len = 0; // Number of chars public: REJMAP() = default; REJMAP(const REJMAP &rejmap) { *this = rejmap; } REJMAP &operator=(const REJMAP &source); // Sets up the ptr array to length, whatever it was before. void initialise(uint16_t length); REJ &operator[]( // access function uint16_t index) const // map index { ASSERT_HOST(index < len); return ptr[index]; // no bounds checks } uint16_t length() const { // map length return len; } int16_t accept_count() const; // How many accepted? int16_t reject_count() const { // How many rejects? return len - accept_count(); } // Cut out an element. 
void remove_pos(uint16_t pos); void print(FILE *fp) const; void full_print(FILE *fp) const; bool recoverable_rejects() const; // Any non perm rejs? bool quality_recoverable_rejects() const; // Any potential rejs? void rej_word_small_xht(); // Reject whole word // Reject whole word void rej_word_tess_failure(); void rej_word_not_tess_accepted(); // Reject whole word // Reject whole word void rej_word_contains_blanks(); // Reject whole word void rej_word_bad_permuter(); void rej_word_xht_fixup(); // Reject whole word // Reject whole word void rej_word_no_alphanums(); void rej_word_mostly_rej(); // Reject whole word void rej_word_bad_quality(); // Reject whole word void rej_word_doc_rej(); // Reject whole word void rej_word_block_rej(); // Reject whole word void rej_word_row_rej(); // Reject whole word }; } // namespace tesseract #endif
