Mercurial > hgrepos > Python2 > PyMuPDF
diff mupdf-source/thirdparty/tesseract/src/wordrec/lm_consistency.h @ 2:b50eed0cc0ef upstream
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
The directory name has changed: no version number in the expanded directory now.
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Mon, 15 Sep 2025 11:43:07 +0200 |
| parents | |
| children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/mupdf-source/thirdparty/tesseract/src/wordrec/lm_consistency.h Mon Sep 15 11:43:07 2025 +0200 @@ -0,0 +1,144 @@ +/////////////////////////////////////////////////////////////////////// +// File: lm_consistency.h +// Description: Struct for recording consistency of the paths representing +// OCR hypotheses. +// Author: Rika Antonova +// +// (C) Copyright 2012, Google Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//////////////////////////////////////////////////////////////////////// + +#ifndef TESSERACT_WORDREC_LM_CONSISTENCY_H_ +#define TESSERACT_WORDREC_LM_CONSISTENCY_H_ + +#include <cstdint> // for INT16_MAX +#include "dawg.h" // for EDGE_REF, NO_EDGE +#include "dict.h" // for XH_GOOD, XH_INCONSISTENT, XHeightConsi... + +class BLOB_CHOICE; + +namespace tesseract { + +static const char *const XHeightConsistencyEnumName[] = { + "XH_GOOD", + "XH_SUBNORMAL", + "XH_INCONSISTENT", +}; + +// Struct for keeping track of the consistency of the path. +struct LMConsistencyInfo { + enum ChartypeEnum { CT_NONE, CT_ALPHA, CT_DIGIT, CT_OTHER }; + + // How much do characters have to be shifted away from normal parameters + // before we say they're not normal? + static const int kShiftThresh = 1; + + // How much shifting from subscript to superscript and back + // before we declare shenanigans? + static const int kMaxEntropy = 1; + + // Script positions - order important for entropy calculation. + static const int kSUB = 0, kNORM = 1, kSUP = 2; + static const int kNumPos = 3; + + explicit LMConsistencyInfo(const LMConsistencyInfo *parent_info) { + if (parent_info == nullptr) { + // Initialize from scratch. + num_alphas = 0; + num_digits = 0; + num_punc = 0; + num_other = 0; + chartype = CT_NONE; + punc_ref = NO_EDGE; + invalid_punc = false; + num_non_first_upper = 0; + num_lower = 0; + script_id = 0; + inconsistent_script = false; + num_inconsistent_spaces = 0; + inconsistent_font = false; + // Initialize XHeight stats. + for (int i = 0; i < kNumPos; i++) { + xht_count[i] = 0; + xht_count_punc[i] = 0; + xht_lo[i] = 0; + xht_hi[i] = 256; // kBlnCellHeight + } + xht_sp = -1; // This invalid value indicates that there was no parent. + xpos_entropy = 0; + xht_decision = XH_GOOD; + } else { + // Copy parent info + *this = *parent_info; + } + } + inline int NumInconsistentPunc() const { + return invalid_punc ? num_punc : 0; + } + inline int NumInconsistentCase() const { + return (num_non_first_upper > num_lower) ? num_lower : num_non_first_upper; + } + inline int NumInconsistentChartype() const { + return (NumInconsistentPunc() + num_other + + ((num_alphas > num_digits) ? num_digits : num_alphas)); + } + inline bool Consistent() const { + return (NumInconsistentPunc() == 0 && NumInconsistentCase() == 0 && + NumInconsistentChartype() == 0 && !inconsistent_script && !inconsistent_font && + !InconsistentXHeight()); + } + inline int NumInconsistentSpaces() const { + return num_inconsistent_spaces; + } + inline int InconsistentXHeight() const { + return xht_decision == XH_INCONSISTENT; + } + void ComputeXheightConsistency(const BLOB_CHOICE *b, bool is_punc); + float BodyMinXHeight() const { + if (InconsistentXHeight()) { + return 0.0f; + } + return xht_lo[kNORM]; + } + float BodyMaxXHeight() const { + if (InconsistentXHeight()) { + return static_cast<float>(INT16_MAX); + } + return xht_hi[kNORM]; + } + + EDGE_REF punc_ref; + int num_alphas; + int num_digits; + int num_punc; + int num_other; + ChartypeEnum chartype; + XHeightConsistencyEnum xht_decision; + int num_non_first_upper; + int num_lower; + int script_id; + int num_inconsistent_spaces; + // Metrics clumped by position. + float xht_lo[kNumPos]; + float xht_hi[kNumPos]; + int16_t xht_count[kNumPos]; + int16_t xht_count_punc[kNumPos]; + int16_t xht_sp; + int16_t xpos_entropy; + bool invalid_punc; + bool inconsistent_script; + bool inconsistent_font; +}; + +} // namespace tesseract + +#endif // TESSERACT_WORDREC_LM_CONSISTENCY_H_
