Mercurial > hgrepos > Python2 > PyMuPDF
comparison mupdf-source/thirdparty/tesseract/src/dict/context.cpp @ 2:b50eed0cc0ef upstream
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
The directory name has changed: no version number in the expanded directory now.
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Mon, 15 Sep 2025 11:43:07 +0200 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| 1:1d09e1dec1d9 | 2:b50eed0cc0ef |
|---|---|
| 1 /****************************************************************************** | |
| 2 * | |
| 3 * File: context.cpp (Formerly context.c) | |
| 4 * Description: Context checking functions | |
| 5 * Author: Mark Seaman, OCR Technology | |
| 6 * | |
| 7 * (c) Copyright 1990, Hewlett-Packard Company. | |
| 8 ** Licensed under the Apache License, Version 2.0 (the "License"); | |
| 9 ** you may not use this file except in compliance with the License. | |
| 10 ** You may obtain a copy of the License at | |
| 11 ** http://www.apache.org/licenses/LICENSE-2.0 | |
| 12 ** Unless required by applicable law or agreed to in writing, software | |
| 13 ** distributed under the License is distributed on an "AS IS" BASIS, | |
| 14 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
| 15 ** See the License for the specific language governing permissions and | |
| 16 ** limitations under the License. | |
| 17 * | |
| 18 *****************************************************************************/ | |
| 19 | |
| 20 #include "dict.h" | |
| 21 #include "unicharset.h" | |
| 22 | |
| 23 namespace tesseract { | |
| 24 | |
// Minimum word length (in unichars) for Dict::absolute_garbage() to consider
// a word at all; shorter words are never reported as garbage.
static constexpr int kMinAbsoluteGarbageWordLength = 10;
// A sufficiently long word is reported as garbage by Dict::absolute_garbage()
// when its fraction of alphabetic-or-digit unichars is below this value.
static constexpr float kMinAbsoluteGarbageAlphanumFrac = 0.5f;
| 27 | |
// State-transition table used by Dict::case_ok() to check capitalization.
// Each row is the current state; each column is the class of the next
// character as selected by Dict::case_ok():
//   column 0 (P): not upper case, not lower case and not a digit
//   column 1 (U): upper case
//   column 2 (L): lower case
//   column 3 (D): digit
// The entry is the next state; -1 means "error on case".
const int case_state_table[6][4] = {
    /*  P   U   L   D */
    {0, 1, 5, 4},   /* 0. Beginning of word */
    {0, 3, 2, 4},   /* 1. After initial capital */
    {0, -1, 2, -1}, /* 2. After lower case */
    {0, 3, -1, 4},  /* 3. After upper case */
    {0, -1, -1, 4}, /* 4. After a digit */
    {5, -1, 2, -1}, /* 5. After initial lower case */
};
| 44 | |
| 45 int Dict::case_ok(const WERD_CHOICE &word) const { | |
| 46 int state = 0; | |
| 47 const UNICHARSET *unicharset = word.unicharset(); | |
| 48 for (unsigned x = 0; x < word.length(); ++x) { | |
| 49 UNICHAR_ID ch_id = word.unichar_id(x); | |
| 50 if (unicharset->get_isupper(ch_id)) { | |
| 51 state = case_state_table[state][1]; | |
| 52 } else if (unicharset->get_islower(ch_id)) { | |
| 53 state = case_state_table[state][2]; | |
| 54 } else if (unicharset->get_isdigit(ch_id)) { | |
| 55 state = case_state_table[state][3]; | |
| 56 } else { | |
| 57 state = case_state_table[state][0]; | |
| 58 } | |
| 59 if (state == -1) { | |
| 60 return false; | |
| 61 } | |
| 62 } | |
| 63 return state != 5; // single lower is bad | |
| 64 } | |
| 65 | |
| 66 bool Dict::absolute_garbage(const WERD_CHOICE &word, const UNICHARSET &unicharset) { | |
| 67 if (word.length() < kMinAbsoluteGarbageWordLength) { | |
| 68 return false; | |
| 69 } | |
| 70 int num_alphanum = 0; | |
| 71 for (unsigned x = 0; x < word.length(); ++x) { | |
| 72 num_alphanum += | |
| 73 (unicharset.get_isalpha(word.unichar_id(x)) || unicharset.get_isdigit(word.unichar_id(x))); | |
| 74 } | |
| 75 return (static_cast<float>(num_alphanum) / static_cast<float>(word.length()) < | |
| 76 kMinAbsoluteGarbageAlphanumFrac); | |
| 77 } | |
| 78 | |
| 79 } // namespace tesseract |
