Mercurial > hgrepos > Python2 > PyMuPDF
diff mupdf-source/thirdparty/tesseract/src/training/pango/boxchar.h @ 2:b50eed0cc0ef upstream
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
The directory name has changed: no version number in the expanded directory now.
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Mon, 15 Sep 2025 11:43:07 +0200 |
| parents | |
| children | |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/mupdf-source/thirdparty/tesseract/src/training/pango/boxchar.h Mon Sep 15 11:43:07 2025 +0200 @@ -0,0 +1,145 @@ +/********************************************************************** + * File: boxchar.h + * Description: Simple class to associate a Tesseract classification unit with + * its bounding box so that the boxes can be rotated as the image + * is rotated for degradation. Also includes routines to output + * the character-tagged boxes to a boxfile. + * Author: Ray Smith + * + * (C) Copyright 2013, Google Inc. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * http://www.apache.org/licenses/LICENSE-2.0 + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + **********************************************************************/ + +#ifndef TESSERACT_TRAINING_BOXCHAR_H_ +#define TESSERACT_TRAINING_BOXCHAR_H_ + +#include <string> +#include <vector> + +#include <allheaders.h> // for Leptonica API +#if (LIBLEPT_MAJOR_VERSION == 1 && LIBLEPT_MINOR_VERSION >= 83) || LIBLEPT_MAJOR_VERSION > 1 +#include <pix_internal.h> // for fast access to Box geometry +#endif +#include <tesseract/export.h> + +namespace tesseract { + +class BoxChar { +public: + BoxChar(const char *utf8_str, int len); + + ~BoxChar(); + + // Accessors. + const std::string &ch() const { + return ch_; + } + const Box *box() const { + return box_; + } + const int &page() const { + return page_; + } + void set_rtl_index(int index) { + rtl_index_ = index; + } + const int &rtl_index() const { + return rtl_index_; + } + + // Set the box_ member. 
+ void AddBox(int x, int y, int width, int height); + + void set_page(int page) { + page_ = page; + } + + std::string *mutable_ch() { + return &ch_; + } + Box *mutable_box() { + return box_; + } + + // Sort function for sorting by left edge of box. Note that this will not + // work properly until after InsertNewlines and InsertSpaces. + bool operator<(const BoxChar &other) const { + if (box_ == nullptr) { + return true; + } + if (other.box_ == nullptr) { + return false; + } + return box_->x < other.box_->x; + } + // Increments *num_rtl and *num_ltr according to the directionality of + // characters in the box. + void GetDirection(int *num_rtl, int *num_ltr) const; + // Reverses the order of unicodes within the box. If Pango generates a + // ligature, these will get reversed on output, so reverse now. + void ReverseUnicodesInBox(); + + static void TranslateBoxes(int xshift, int yshift, std::vector<BoxChar *> *boxes); + + // Prepares for writing the boxes to a file by inserting newlines, spaces, + // and re-ordering so the boxes are strictly left-to-right. + static void PrepareToWrite(std::vector<BoxChar *> *boxes); + // Inserts newline (tab) characters into the vector at newline positions. + static void InsertNewlines(bool rtl_rules, bool vertical_rules, std::vector<BoxChar *> *boxes); + // Converts nullptr boxes to space characters, with appropriate bounding + // boxes. + static void InsertSpaces(bool rtl_rules, bool vertical_rules, std::vector<BoxChar *> *boxes); + // Reorders text in a right-to-left script in left-to-right order. + static void ReorderRTLText(std::vector<BoxChar *> *boxes); + // Returns true if the vector contains mostly RTL characters. + static bool ContainsMostlyRTL(const std::vector<BoxChar *> &boxes); + // Returns true if the text is mostly laid out vertically. + static bool MostlyVertical(const std::vector<BoxChar *> &boxes); + + // Returns the total length of all the strings in the boxes. 
+ static int TotalByteLength(const std::vector<BoxChar *> &boxes); + + // Rotate the vector of boxes between start and end by the given rotation. + // The rotation is in radians clockwise about the given center. + static void RotateBoxes(float rotation, int xcenter, int ycenter, int start_box, int end_box, + std::vector<BoxChar *> *boxes); + + // Create a tesseract box file from the vector of boxes. The image height + // is needed to convert to tesseract coordinates. + static void WriteTesseractBoxFile(const std::string &name, int height, + const std::vector<BoxChar *> &boxes); + // Gets the tesseract box file as a string from the vector of boxes. + // The image height is needed to convert to tesseract coordinates. + static std::string GetTesseractBoxStr(int height, const std::vector<BoxChar *> &boxes); + +private: + std::string ch_; + Box *box_; + int page_; + // If the box is an RTL character, contains the original position in the + // array of boxes (before reversal), otherwise -1. + int rtl_index_; +}; + +// Sort predicate to sort a vector of BoxChar*. +struct BoxCharPtrSort { + bool operator()(const BoxChar *box1, const BoxChar *box2) const { + if (box1->rtl_index() >= 0 && box2->rtl_index() >= 0) { + return box2->rtl_index() < box1->rtl_index(); + } + return *box1 < *box2; + } +}; + +} // namespace tesseract + +#endif // TESSERACT_TRAINING_BOXCHAR_H_
