Mercurial > hgrepos > Python2 > PyMuPDF
diff mupdf-source/thirdparty/tesseract/src/dict/context.cpp @ 2:b50eed0cc0ef upstream
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
The directory name has changed: no version number in the expanded directory now.
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Mon, 15 Sep 2025 11:43:07 +0200 |
| parents | |
| children | |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/mupdf-source/thirdparty/tesseract/src/dict/context.cpp Mon Sep 15 11:43:07 2025 +0200 @@ -0,0 +1,79 @@ +/****************************************************************************** + * + * File: context.cpp (Formerly context.c) + * Description: Context checking functions + * Author: Mark Seaman, OCR Technology + * + * (c) Copyright 1990, Hewlett-Packard Company. + ** Licensed under the Apache License, Version 2.0 (the "License"); + ** you may not use this file except in compliance with the License. + ** You may obtain a copy of the License at + ** http://www.apache.org/licenses/LICENSE-2.0 + ** Unless required by applicable law or agreed to in writing, software + ** distributed under the License is distributed on an "AS IS" BASIS, + ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + ** See the License for the specific language governing permissions and + ** limitations under the License. + * + *****************************************************************************/ + +#include "dict.h" +#include "unicharset.h" + +namespace tesseract { + +static const int kMinAbsoluteGarbageWordLength = 10; +static const float kMinAbsoluteGarbageAlphanumFrac = 0.5f; + +const int case_state_table[6][4] = { + {/* 0. Beginning of word */ + /* P U L D */ + /* -1. Error on case */ + 0, 1, 5, 4}, + {/* 1. After initial capital */ + 0, 3, 2, 4}, + {/* 2. After lower case */ + 0, -1, 2, -1}, + {/* 3. After upper case */ + 0, 3, -1, 4}, + {/* 4. After a digit */ + 0, -1, -1, 4}, + {/* 5. 
After initial lower case */ + 5, -1, 2, -1}, +}; + +int Dict::case_ok(const WERD_CHOICE &word) const { + int state = 0; + const UNICHARSET *unicharset = word.unicharset(); + for (unsigned x = 0; x < word.length(); ++x) { + UNICHAR_ID ch_id = word.unichar_id(x); + if (unicharset->get_isupper(ch_id)) { + state = case_state_table[state][1]; + } else if (unicharset->get_islower(ch_id)) { + state = case_state_table[state][2]; + } else if (unicharset->get_isdigit(ch_id)) { + state = case_state_table[state][3]; + } else { + state = case_state_table[state][0]; + } + if (state == -1) { + return false; + } + } + return state != 5; // single lower is bad +} + +bool Dict::absolute_garbage(const WERD_CHOICE &word, const UNICHARSET &unicharset) { + if (word.length() < kMinAbsoluteGarbageWordLength) { + return false; + } + int num_alphanum = 0; + for (unsigned x = 0; x < word.length(); ++x) { + num_alphanum += + (unicharset.get_isalpha(word.unichar_id(x)) || unicharset.get_isdigit(word.unichar_id(x))); + } + return (static_cast<float>(num_alphanum) / static_cast<float>(word.length()) < + kMinAbsoluteGarbageAlphanumFrac); +} + +} // namespace tesseract
