Mercurial > hgrepos > Python2 > PyMuPDF
view mupdf-source/thirdparty/tesseract/src/ccutil/unicharmap.cpp @ 46:7ee69f120f19 default tip
>>>>> tag v1.26.5+1 for changeset b74429b0f5c4
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Sat, 11 Oct 2025 17:17:30 +0200 |
| parents | b50eed0cc0ef |
| children |
line wrap: on
line source
/////////////////////////////////////////////////////////////////////// // File: unicharmap.cpp // Description: Unicode character/ligature to integer id class. // Author: Thomas Kielbus // // (C) Copyright 2006, Google Inc. // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // http://www.apache.org/licenses/LICENSE-2.0 // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // /////////////////////////////////////////////////////////////////////// #include "unicharmap.h" #include <tesseract/unichar.h> #include <cassert> namespace tesseract { UNICHARMAP::UNICHARMAP() : nodes(nullptr) {} UNICHARMAP::~UNICHARMAP() { delete[] nodes; } // Search the given unichar representation in the tree, using length characters // from it maximum. Each character in the string is interpreted as an index in // an array of nodes. UNICHAR_ID UNICHARMAP::unichar_to_id(const char *const unichar_repr, int length) const { UNICHARMAP_NODE *current_nodes = nodes; assert(*unichar_repr != '\0'); assert(length > 0 && length <= UNICHAR_LEN); int index = 0; if (length <= 0 || unichar_repr[index] == '\0') { return INVALID_UNICHAR_ID; } do { if (index + 1 >= length || unichar_repr[index + 1] == '\0') { return current_nodes[static_cast<unsigned char>(unichar_repr[index])].id; } current_nodes = current_nodes[static_cast<unsigned char>(unichar_repr[index])].children; ++index; } while (true); } // Search the given unichar representation in the tree, creating the possibly // missing nodes. Once the right place has been found, insert the given id and // update the inserted flag to keep track of the insert. Each character in the // string is interpreted as an index in an array of nodes. void UNICHARMAP::insert(const char *const unichar_repr, UNICHAR_ID id) { const char *current_char = unichar_repr; if (*current_char == '\0') { return; } UNICHARMAP_NODE **current_nodes_pointer = &nodes; do { if (*current_nodes_pointer == nullptr) { *current_nodes_pointer = new UNICHARMAP_NODE[256]; } if (current_char[1] == '\0') { (*current_nodes_pointer)[static_cast<unsigned char>(*current_char)].id = id; return; } current_nodes_pointer = &((*current_nodes_pointer)[static_cast<unsigned char>(*current_char)].children); ++current_char; } while (true); } // Search the given unichar representation in the tree, using length characters // from it maximum. Each character in the string is interpreted as an index in // an array of nodes. Stop once the tree does not have anymore nodes or once we // found the right unichar_repr. bool UNICHARMAP::contains(const char *const unichar_repr, int length) const { if (unichar_repr == nullptr || *unichar_repr == '\0') { return false; } if (length <= 0 || length > UNICHAR_LEN) { return false; } int index = 0; if (unichar_repr[index] == '\0') { return false; } UNICHARMAP_NODE *current_nodes = nodes; while (current_nodes != nullptr && index + 1 < length && unichar_repr[index + 1] != '\0') { current_nodes = current_nodes[static_cast<unsigned char>(unichar_repr[index])].children; ++index; } return current_nodes != nullptr && (index + 1 >= length || unichar_repr[index + 1] == '\0') && current_nodes[static_cast<unsigned char>(unichar_repr[index])].id >= 0; } // Return the minimum number of characters that must be used from this string // to obtain a match in the UNICHARMAP. int UNICHARMAP::minmatch(const char *const unichar_repr) const { const char *current_char = unichar_repr; if (*current_char == '\0') { return 0; } UNICHARMAP_NODE *current_nodes = nodes; while (current_nodes != nullptr && *current_char != '\0') { if (current_nodes[static_cast<unsigned char>(*current_char)].id >= 0) { return current_char + 1 - unichar_repr; } current_nodes = current_nodes[static_cast<unsigned char>(*current_char)].children; ++current_char; } return 0; } void UNICHARMAP::clear() { delete[] nodes; nodes = nullptr; } UNICHARMAP::UNICHARMAP_NODE::UNICHARMAP_NODE() : children(nullptr), id(-1) {} // Recursively delete the children UNICHARMAP::UNICHARMAP_NODE::~UNICHARMAP_NODE() { delete[] children; } } // namespace tesseract
