Mercurial > hgrepos > Python2 > PyMuPDF
comparison mupdf-source/thirdparty/tesseract/src/training/pango/boxchar.h @ 2:b50eed0cc0ef upstream
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
The directory name has changed: no version number in the expanded directory now.
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Mon, 15 Sep 2025 11:43:07 +0200 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| 1:1d09e1dec1d9 | 2:b50eed0cc0ef |
|---|---|
| 1 /********************************************************************** | |
| 2 * File: boxchar.h | |
| 3 * Description: Simple class to associate a Tesseract classification unit with | |
| 4 * its bounding box so that the boxes can be rotated as the image | |
| 5 * is rotated for degradation. Also includes routines to output | |
| 6 * the character-tagged boxes to a boxfile. | |
| 7 * Author: Ray Smith | |
| 8 * | |
| 9 * (C) Copyright 2013, Google Inc. | |
| 10 * Licensed under the Apache License, Version 2.0 (the "License"); | |
| 11 * you may not use this file except in compliance with the License. | |
| 12 * You may obtain a copy of the License at | |
| 13 * http://www.apache.org/licenses/LICENSE-2.0 | |
| 14 * Unless required by applicable law or agreed to in writing, software | |
| 15 * distributed under the License is distributed on an "AS IS" BASIS, | |
| 16 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
| 17 * See the License for the specific language governing permissions and | |
| 18 * limitations under the License. | |
| 19 * | |
| 20 **********************************************************************/ | |
| 21 | |
| 22 #ifndef TESSERACT_TRAINING_BOXCHAR_H_ | |
| 23 #define TESSERACT_TRAINING_BOXCHAR_H_ | |
| 24 | |
| 25 #include <string> | |
| 26 #include <vector> | |
| 27 | |
| 28 #include <allheaders.h> // for Leptonica API | |
| 29 #if (LIBLEPT_MAJOR_VERSION == 1 && LIBLEPT_MINOR_VERSION >= 83) || LIBLEPT_MAJOR_VERSION > 1 | |
| 30 #include <pix_internal.h> // for fast access to Box geometry | |
| 31 #endif | |
| 32 #include <tesseract/export.h> | |
| 33 | |
| 34 namespace tesseract { | |
| 35 | |
| 36 class BoxChar { | |
| 37 public: | |
| 38 BoxChar(const char *utf8_str, int len); | |
| 39 | |
| 40 ~BoxChar(); | |
| 41 | |
| 42 // Accessors. | |
| 43 const std::string &ch() const { | |
| 44 return ch_; | |
| 45 } | |
| 46 const Box *box() const { | |
| 47 return box_; | |
| 48 } | |
| 49 const int &page() const { | |
| 50 return page_; | |
| 51 } | |
| 52 void set_rtl_index(int index) { | |
| 53 rtl_index_ = index; | |
| 54 } | |
| 55 const int &rtl_index() const { | |
| 56 return rtl_index_; | |
| 57 } | |
| 58 | |
| 59 // Set the box_ member. | |
| 60 void AddBox(int x, int y, int width, int height); | |
| 61 | |
| 62 void set_page(int page) { | |
| 63 page_ = page; | |
| 64 } | |
| 65 | |
| 66 std::string *mutable_ch() { | |
| 67 return &ch_; | |
| 68 } | |
| 69 Box *mutable_box() { | |
| 70 return box_; | |
| 71 } | |
| 72 | |
| 73 // Sort function for sorting by left edge of box. Note that this will not | |
| 74 // work properly until after InsertNewlines and InsertSpaces. | |
| 75 bool operator<(const BoxChar &other) const { | |
| 76 if (box_ == nullptr) { | |
| 77 return true; | |
| 78 } | |
| 79 if (other.box_ == nullptr) { | |
| 80 return false; | |
| 81 } | |
| 82 return box_->x < other.box_->x; | |
| 83 } | |
| 84 // Increments *num_rtl and *num_ltr according to the directionality of | |
| 85 // characters in the box. | |
| 86 void GetDirection(int *num_rtl, int *num_ltr) const; | |
| 87 // Reverses the order of unicodes within the box. If Pango generates a | |
| 88 // ligature, these will get reversed on output, so reverse now. | |
| 89 void ReverseUnicodesInBox(); | |
| 90 | |
| 91 static void TranslateBoxes(int xshift, int yshift, std::vector<BoxChar *> *boxes); | |
| 92 | |
| 93 // Prepares for writing the boxes to a file by inserting newlines, spaces, | |
| 94 // and re-ordering so the boxes are strictly left-to-right. | |
| 95 static void PrepareToWrite(std::vector<BoxChar *> *boxes); | |
| 96 // Inserts newline (tab) characters into the vector at newline positions. | |
| 97 static void InsertNewlines(bool rtl_rules, bool vertical_rules, std::vector<BoxChar *> *boxes); | |
| 98 // Converts nullptr boxes to space characters, with appropriate bounding | |
| 99 // boxes. | |
| 100 static void InsertSpaces(bool rtl_rules, bool vertical_rules, std::vector<BoxChar *> *boxes); | |
| 101 // Reorders text in a right-to-left script in left-to-right order. | |
| 102 static void ReorderRTLText(std::vector<BoxChar *> *boxes); | |
| 103 // Returns true if the vector contains mostly RTL characters. | |
| 104 static bool ContainsMostlyRTL(const std::vector<BoxChar *> &boxes); | |
| 105 // Returns true if the text is mostly laid out vertically. | |
| 106 static bool MostlyVertical(const std::vector<BoxChar *> &boxes); | |
| 107 | |
| 108 // Returns the total length of all the strings in the boxes. | |
| 109 static int TotalByteLength(const std::vector<BoxChar *> &boxes); | |
| 110 | |
| 111 // Rotate the vector of boxes between start and end by the given rotation. | |
| 112 // The rotation is in radians clockwise about the given center. | |
| 113 static void RotateBoxes(float rotation, int xcenter, int ycenter, int start_box, int end_box, | |
| 114 std::vector<BoxChar *> *boxes); | |
| 115 | |
| 116 // Create a tesseract box file from the vector of boxes. The image height | |
| 117 // is needed to convert to tesseract coordinates. | |
| 118 static void WriteTesseractBoxFile(const std::string &name, int height, | |
| 119 const std::vector<BoxChar *> &boxes); | |
| 120 // Gets the tesseract box file as a string from the vector of boxes. | |
| 121 // The image height is needed to convert to tesseract coordinates. | |
| 122 static std::string GetTesseractBoxStr(int height, const std::vector<BoxChar *> &boxes); | |
| 123 | |
| 124 private: | |
| 125 std::string ch_; | |
| 126 Box *box_; | |
| 127 int page_; | |
| 128 // If the box is an RTL character, contains the original position in the | |
| 129 // array of boxes (before reversal), otherwise -1. | |
| 130 int rtl_index_; | |
| 131 }; | |
| 132 | |
| 133 // Sort predicate to sort a vector of BoxChar*. | |
| 134 struct BoxCharPtrSort { | |
| 135 bool operator()(const BoxChar *box1, const BoxChar *box2) const { | |
| 136 if (box1->rtl_index() >= 0 && box2->rtl_index() >= 0) { | |
| 137 return box2->rtl_index() < box1->rtl_index(); | |
| 138 } | |
| 139 return *box1 < *box2; | |
| 140 } | |
| 141 }; | |
| 142 | |
| 143 } // namespace tesseract | |
| 144 | |
| 145 #endif // TESSERACT_TRAINING_BOXCHAR_H_ |
