Mercurial > hgrepos > Python2 > PyMuPDF
diff mupdf-source/thirdparty/tesseract/src/ccutil/unicharmap.cpp @ 2:b50eed0cc0ef upstream
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
The directory name has changed: no version number in the expanded directory now.
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Mon, 15 Sep 2025 11:43:07 +0200 |
| parents | |
| children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/mupdf-source/thirdparty/tesseract/src/ccutil/unicharmap.cpp Mon Sep 15 11:43:07 2025 +0200 @@ -0,0 +1,135 @@ +/////////////////////////////////////////////////////////////////////// +// File: unicharmap.cpp +// Description: Unicode character/ligature to integer id class. +// Author: Thomas Kielbus +// +// (C) Copyright 2006, Google Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +/////////////////////////////////////////////////////////////////////// + +#include "unicharmap.h" + +#include <tesseract/unichar.h> + +#include <cassert> + +namespace tesseract { + +UNICHARMAP::UNICHARMAP() : nodes(nullptr) {} + +UNICHARMAP::~UNICHARMAP() { + delete[] nodes; +} + +// Search the given unichar representation in the tree, using length characters +// from it maximum. Each character in the string is interpreted as an index in +// an array of nodes. +UNICHAR_ID UNICHARMAP::unichar_to_id(const char *const unichar_repr, int length) const { + UNICHARMAP_NODE *current_nodes = nodes; + + assert(*unichar_repr != '\0'); + assert(length > 0 && length <= UNICHAR_LEN); + + int index = 0; + if (length <= 0 || unichar_repr[index] == '\0') { + return INVALID_UNICHAR_ID; + } + do { + if (index + 1 >= length || unichar_repr[index + 1] == '\0') { + return current_nodes[static_cast<unsigned char>(unichar_repr[index])].id; + } + current_nodes = current_nodes[static_cast<unsigned char>(unichar_repr[index])].children; + ++index; + } while (true); +} + +// Search the given unichar representation in the tree, creating the possibly +// missing nodes. Once the right place has been found, insert the given id and +// update the inserted flag to keep track of the insert. Each character in the +// string is interpreted as an index in an array of nodes. +void UNICHARMAP::insert(const char *const unichar_repr, UNICHAR_ID id) { + const char *current_char = unichar_repr; + if (*current_char == '\0') { + return; + } + UNICHARMAP_NODE **current_nodes_pointer = &nodes; + do { + if (*current_nodes_pointer == nullptr) { + *current_nodes_pointer = new UNICHARMAP_NODE[256]; + } + if (current_char[1] == '\0') { + (*current_nodes_pointer)[static_cast<unsigned char>(*current_char)].id = id; + return; + } + current_nodes_pointer = + &((*current_nodes_pointer)[static_cast<unsigned char>(*current_char)].children); + ++current_char; + } while (true); +} + +// Search the given unichar representation in the tree, using length characters +// from it maximum. Each character in the string is interpreted as an index in +// an array of nodes. Stop once the tree does not have anymore nodes or once we +// found the right unichar_repr. +bool UNICHARMAP::contains(const char *const unichar_repr, int length) const { + if (unichar_repr == nullptr || *unichar_repr == '\0') { + return false; + } + if (length <= 0 || length > UNICHAR_LEN) { + return false; + } + int index = 0; + if (unichar_repr[index] == '\0') { + return false; + } + UNICHARMAP_NODE *current_nodes = nodes; + + while (current_nodes != nullptr && index + 1 < length && unichar_repr[index + 1] != '\0') { + current_nodes = current_nodes[static_cast<unsigned char>(unichar_repr[index])].children; + ++index; + } + return current_nodes != nullptr && (index + 1 >= length || unichar_repr[index + 1] == '\0') && + current_nodes[static_cast<unsigned char>(unichar_repr[index])].id >= 0; +} + +// Return the minimum number of characters that must be used from this string +// to obtain a match in the UNICHARMAP. +int UNICHARMAP::minmatch(const char *const unichar_repr) const { + const char *current_char = unichar_repr; + if (*current_char == '\0') { + return 0; + } + UNICHARMAP_NODE *current_nodes = nodes; + + while (current_nodes != nullptr && *current_char != '\0') { + if (current_nodes[static_cast<unsigned char>(*current_char)].id >= 0) { + return current_char + 1 - unichar_repr; + } + current_nodes = current_nodes[static_cast<unsigned char>(*current_char)].children; + ++current_char; + } + return 0; +} + +void UNICHARMAP::clear() { + delete[] nodes; + nodes = nullptr; +} + +UNICHARMAP::UNICHARMAP_NODE::UNICHARMAP_NODE() : children(nullptr), id(-1) {} + +// Recursively delete the children +UNICHARMAP::UNICHARMAP_NODE::~UNICHARMAP_NODE() { + delete[] children; +} + +} // namespace tesseract
