Mercurial > hgrepos > Python2 > PyMuPDF
comparison mupdf-source/thirdparty/tesseract/src/ccutil/unicharmap.cpp @ 2:b50eed0cc0ef upstream
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
The directory name has changed: no version number in the expanded directory now.
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Mon, 15 Sep 2025 11:43:07 +0200 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| 1:1d09e1dec1d9 | 2:b50eed0cc0ef |
|---|---|
| 1 /////////////////////////////////////////////////////////////////////// | |
| 2 // File: unicharmap.cpp | |
| 3 // Description: Unicode character/ligature to integer id class. | |
| 4 // Author: Thomas Kielbus | |
| 5 // | |
| 6 // (C) Copyright 2006, Google Inc. | |
| 7 // Licensed under the Apache License, Version 2.0 (the "License"); | |
| 8 // you may not use this file except in compliance with the License. | |
| 9 // You may obtain a copy of the License at | |
| 10 // http://www.apache.org/licenses/LICENSE-2.0 | |
| 11 // Unless required by applicable law or agreed to in writing, software | |
| 12 // distributed under the License is distributed on an "AS IS" BASIS, | |
| 13 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
| 14 // See the License for the specific language governing permissions and | |
| 15 // limitations under the License. | |
| 16 // | |
| 17 /////////////////////////////////////////////////////////////////////// | |
| 18 | |
| 19 #include "unicharmap.h" | |
| 20 | |
| 21 #include <tesseract/unichar.h> | |
| 22 | |
| 23 #include <cassert> | |
| 24 | |
| 25 namespace tesseract { | |
| 26 | |
| 27 UNICHARMAP::UNICHARMAP() : nodes(nullptr) {} | |
| 28 | |
| 29 UNICHARMAP::~UNICHARMAP() { | |
| 30 delete[] nodes; | |
| 31 } | |
| 32 | |
| 33 // Search the given unichar representation in the tree, using length characters | |
| 34 // from it maximum. Each character in the string is interpreted as an index in | |
| 35 // an array of nodes. | |
| 36 UNICHAR_ID UNICHARMAP::unichar_to_id(const char *const unichar_repr, int length) const { | |
| 37 UNICHARMAP_NODE *current_nodes = nodes; | |
| 38 | |
| 39 assert(*unichar_repr != '\0'); | |
| 40 assert(length > 0 && length <= UNICHAR_LEN); | |
| 41 | |
| 42 int index = 0; | |
| 43 if (length <= 0 || unichar_repr[index] == '\0') { | |
| 44 return INVALID_UNICHAR_ID; | |
| 45 } | |
| 46 do { | |
| 47 if (index + 1 >= length || unichar_repr[index + 1] == '\0') { | |
| 48 return current_nodes[static_cast<unsigned char>(unichar_repr[index])].id; | |
| 49 } | |
| 50 current_nodes = current_nodes[static_cast<unsigned char>(unichar_repr[index])].children; | |
| 51 ++index; | |
| 52 } while (true); | |
| 53 } | |
| 54 | |
| 55 // Search the given unichar representation in the tree, creating the possibly | |
| 56 // missing nodes. Once the right place has been found, insert the given id and | |
| 57 // update the inserted flag to keep track of the insert. Each character in the | |
| 58 // string is interpreted as an index in an array of nodes. | |
| 59 void UNICHARMAP::insert(const char *const unichar_repr, UNICHAR_ID id) { | |
| 60 const char *current_char = unichar_repr; | |
| 61 if (*current_char == '\0') { | |
| 62 return; | |
| 63 } | |
| 64 UNICHARMAP_NODE **current_nodes_pointer = &nodes; | |
| 65 do { | |
| 66 if (*current_nodes_pointer == nullptr) { | |
| 67 *current_nodes_pointer = new UNICHARMAP_NODE[256]; | |
| 68 } | |
| 69 if (current_char[1] == '\0') { | |
| 70 (*current_nodes_pointer)[static_cast<unsigned char>(*current_char)].id = id; | |
| 71 return; | |
| 72 } | |
| 73 current_nodes_pointer = | |
| 74 &((*current_nodes_pointer)[static_cast<unsigned char>(*current_char)].children); | |
| 75 ++current_char; | |
| 76 } while (true); | |
| 77 } | |
| 78 | |
| 79 // Search the given unichar representation in the tree, using length characters | |
| 80 // from it maximum. Each character in the string is interpreted as an index in | |
| 81 // an array of nodes. Stop once the tree does not have anymore nodes or once we | |
| 82 // found the right unichar_repr. | |
| 83 bool UNICHARMAP::contains(const char *const unichar_repr, int length) const { | |
| 84 if (unichar_repr == nullptr || *unichar_repr == '\0') { | |
| 85 return false; | |
| 86 } | |
| 87 if (length <= 0 || length > UNICHAR_LEN) { | |
| 88 return false; | |
| 89 } | |
| 90 int index = 0; | |
| 91 if (unichar_repr[index] == '\0') { | |
| 92 return false; | |
| 93 } | |
| 94 UNICHARMAP_NODE *current_nodes = nodes; | |
| 95 | |
| 96 while (current_nodes != nullptr && index + 1 < length && unichar_repr[index + 1] != '\0') { | |
| 97 current_nodes = current_nodes[static_cast<unsigned char>(unichar_repr[index])].children; | |
| 98 ++index; | |
| 99 } | |
| 100 return current_nodes != nullptr && (index + 1 >= length || unichar_repr[index + 1] == '\0') && | |
| 101 current_nodes[static_cast<unsigned char>(unichar_repr[index])].id >= 0; | |
| 102 } | |
| 103 | |
| 104 // Return the minimum number of characters that must be used from this string | |
| 105 // to obtain a match in the UNICHARMAP. | |
| 106 int UNICHARMAP::minmatch(const char *const unichar_repr) const { | |
| 107 const char *current_char = unichar_repr; | |
| 108 if (*current_char == '\0') { | |
| 109 return 0; | |
| 110 } | |
| 111 UNICHARMAP_NODE *current_nodes = nodes; | |
| 112 | |
| 113 while (current_nodes != nullptr && *current_char != '\0') { | |
| 114 if (current_nodes[static_cast<unsigned char>(*current_char)].id >= 0) { | |
| 115 return current_char + 1 - unichar_repr; | |
| 116 } | |
| 117 current_nodes = current_nodes[static_cast<unsigned char>(*current_char)].children; | |
| 118 ++current_char; | |
| 119 } | |
| 120 return 0; | |
| 121 } | |
| 122 | |
| 123 void UNICHARMAP::clear() { | |
| 124 delete[] nodes; | |
| 125 nodes = nullptr; | |
| 126 } | |
| 127 | |
| 128 UNICHARMAP::UNICHARMAP_NODE::UNICHARMAP_NODE() : children(nullptr), id(-1) {} | |
| 129 | |
| 130 // Recursively delete the children | |
| 131 UNICHARMAP::UNICHARMAP_NODE::~UNICHARMAP_NODE() { | |
| 132 delete[] children; | |
| 133 } | |
| 134 | |
| 135 } // namespace tesseract |
