Mercurial > hgrepos > Python2 > PyMuPDF
comparison mupdf-source/thirdparty/tesseract/src/ccstruct/ocrpara.h @ 2:b50eed0cc0ef upstream
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
The directory name has changed: no version number in the expanded directory now.
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Mon, 15 Sep 2025 11:43:07 +0200 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| 1:1d09e1dec1d9 | 2:b50eed0cc0ef |
|---|---|
| 1 ///////////////////////////////////////////////////////////////////// | |
| 2 // File: ocrpara.h | |
| 3 // Description: OCR Paragraph Output Type | |
| 4 // Author: David Eger | |
| 5 // | |
| 6 // (C) Copyright 2010, Google Inc. | |
| 7 // Licensed under the Apache License, Version 2.0 (the "License"); | |
| 8 // you may not use this file except in compliance with the License. | |
| 9 // You may obtain a copy of the License at | |
| 10 // http://www.apache.org/licenses/LICENSE-2.0 | |
| 11 // Unless required by applicable law or agreed to in writing, software | |
| 12 // distributed under the License is distributed on an "AS IS" BASIS, | |
| 13 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
| 14 // See the License for the specific language governing permissions and | |
| 15 // limitations under the License. | |
| 16 // | |
| 17 /////////////////////////////////////////////////////////////////////// | |
| 18 | |
| 19 #ifndef TESSERACT_CCSTRUCT_OCRPARA_H_ | |
| 20 #define TESSERACT_CCSTRUCT_OCRPARA_H_ | |
| 21 | |
| 22 #include "elst.h" | |
| 23 | |
| 24 #include <tesseract/publictypes.h> | |
| 25 | |
| 26 namespace tesseract { | |
| 27 | |
| 28 class ParagraphModel; | |
| 29 | |
| 30 struct PARA : public ELIST_LINK { | |
| 31 public: | |
| 32 PARA() | |
| 33 : model(nullptr) | |
| 34 , is_list_item(false) | |
| 35 , is_very_first_or_continuation(false) | |
| 36 , has_drop_cap(false) {} | |
| 37 | |
| 38 // We do not own the model; we only hold a reference to it. | |
| 39 // model may be nullptr (its default) if there is no good model for this paragraph. | |
| 40 const ParagraphModel *model; | |
| 41 | |
| 42 bool is_list_item; | |
| 43 | |
| 44 // The first paragraph on a page often lacks a first-line indent, but should | |
| 45 // still be modeled by the same model as the other body text paragraphs on the | |
| 46 // page. Defaults to false. | |
| 47 bool is_very_first_or_continuation; | |
| 48 | |
| 49 // Does this paragraph begin with a drop cap? Defaults to false. | |
| 50 bool has_drop_cap; | |
| 51 }; | |
| 52 | |
| 53 ELISTIZEH(PARA) | |
| 54 | |
| 55 // A geometric model of paragraph indentation and alignment. | |
| 56 // | |
| 57 // Measurements are in pixels. The meaning of the integer arguments changes | |
| 58 // depending upon the value of justification. Distances less than or equal | |
| 59 // to tolerance apart we take as "equivalent" for the purpose of model | |
| 60 // matching, and in the examples below, we assume tolerance is zero. | |
| 61 // | |
| 62 // justification = LEFT: | |
| 63 // margin the "ignored" margin to the left block edge. | |
| 64 // first_indent indent from the left margin to a typical first text line. | |
| 65 // body_indent indent from the left margin of a typical body text line. | |
| 66 // | |
| 67 // justification = RIGHT: | |
| 68 // margin the "ignored" margin to the right block edge. | |
| 69 // first_indent indent from the right margin to a typical first text line. | |
| 70 // body_indent indent from the right margin of a typical body text line. | |
| 71 // | |
| 72 // justification = CENTER: | |
| 73 // margin ignored | |
| 74 // first_indent ignored | |
| 75 // body_indent ignored | |
| 76 // | |
| 77 // ====== Extended example, assuming each letter is ten pixels wide: ======= | |
| 78 // | |
| 79 // +--------------------------------+ | |
| 80 // | Awesome | ParagraphModel(CENTER, 0, 0, 0) | |
| 81 // | Centered Title | | |
| 82 // | Paragraph Detection | | |
| 83 // | OCR TEAM | | |
| 84 // | 10 November 2010 | | |
| 85 // | | | |
| 86 // | Look here, I have a paragraph.| ParagraphModel(LEFT, 0, 20, 0) | |
| 87 // |This paragraph starts at the top| | |
| 88 // |of the page and takes 3 lines. | | |
| 89 // | Here I have a second paragraph| ParagraphModel(LEFT, 0, 20, 0) | |
| 90 // |which indicates that the first | | |
| 91 // |paragraph is not a continuation | | |
| 92 // |from a previous page, as it is | | |
| 93 // |indented just like this second | | |
| 94 // |paragraph. | | |
| 95 // | Here is a block quote. It | ParagraphModel(LEFT, 30, 0, 0) | |
| 96 // | looks like the prior text | | |
| 97 // | but it is indented more | | |
| 98 // | and is fully justified. | | |
| 99 // | So how does one deal with | ParagraphModel(LEFT, 0, 20, 0) | |
| 100 // |centered text, block quotes, | | |
| 101 // |normal paragraphs, and lists | | |
| 102 // |like what follows? | | |
| 103 // |1. Make a plan. | ParagraphModel(LEFT, 0, 0, 30) | |
| 104 // |2. Use a heuristic, for example,| ParagraphModel(LEFT, 0, 0, 30) | |
| 105 // | looking for lines where the | | |
| 106 // | first word of the next line | | |
| 107 // | would fit on the previous | | |
| 108 // | line. | | |
| 109 // |8. Try to implement the plan in | ParagraphModel(LEFT, 0, 0, 30) | |
| 110 // | Python and try it out. | | |
| 111 // |4. Determine how to fix the | ParagraphModel(LEFT, 0, 0, 30) | |
| 112 // | mistakes. | | |
| 113 // |5. Repeat. | ParagraphModel(LEFT, 0, 0, 30) | |
| 114 // | For extra painful penalty work| ParagraphModel(LEFT, 0, 20, 0) | |
| 115 // |you can try to identify source | | |
| 116 // |code. Ouch! | | |
| 117 // +--------------------------------+ | |
| 118 class TESS_API ParagraphModel { | |
| 119 public: | |
| 120 ParagraphModel(tesseract::ParagraphJustification justification, int margin, int first_indent, | |
| 121 int body_indent, int tolerance) | |
| 122 : justification_(justification) | |
| 123 , margin_(margin) | |
| 124 , first_indent_(first_indent) | |
| 125 , body_indent_(body_indent) | |
| 126 , tolerance_(tolerance) { | |
| 127 // Ensure that one of {first_indent_, body_indent_} is 0 by moving the smaller of the two into margin_. | |
| 128 int added_margin = first_indent; | |
| 129 if (body_indent < added_margin) { | |
| 130 added_margin = body_indent; | |
| 131 } | |
| 132 margin_ += added_margin; | |
| 133 first_indent_ -= added_margin; | |
| 134 body_indent_ -= added_margin; | |
| 135 } | |
| 136 | |
| 137 ParagraphModel() | |
| 138 : justification_(tesseract::JUSTIFICATION_UNKNOWN) | |
| 139 , margin_(0) | |
| 140 , first_indent_(0) | |
| 141 , body_indent_(0) | |
| 142 , tolerance_(0) {} | |
| 143 | |
| 144 // ValidFirstLine() and ValidBodyLine() take arguments describing a text line | |
| 145 // in a block of text which we are trying to model: | |
| 146 // lmargin, lindent: these add up to the distance from the leftmost ink | |
| 147 // in the text line to the surrounding text block's left | |
| 148 // edge. | |
| 149 // rmargin, rindent: these add up to the distance from the rightmost ink | |
| 150 // in the text line to the surrounding text block's right | |
| 151 // edge. | |
| 152 // The caller determines the division between "margin" and "indent", which | |
| 153 // only actually affects whether we think the line may be centered. | |
| 154 // | |
| 155 // If the amount of whitespace matches the amount of whitespace expected on | |
| 156 // the relevant side of the line (within tolerance_) we say it matches. | |
| 157 | |
| 158 // Return whether a given text line could be a first paragraph line according | |
| 159 // to this paragraph model. | |
| 160 bool ValidFirstLine(int lmargin, int lindent, int rindent, int rmargin) const; | |
| 161 | |
| 162 // Return whether a given text line could be a body paragraph line according | |
| 163 // to this paragraph model. | |
| 164 bool ValidBodyLine(int lmargin, int lindent, int rindent, int rmargin) const; | |
| 165 | |
| 166 tesseract::ParagraphJustification justification() const { | |
| 167 return justification_; | |
| 168 } | |
| 169 int margin() const { | |
| 170 return margin_; | |
| 171 } | |
| 172 int first_indent() const { | |
| 173 return first_indent_; | |
| 174 } | |
| 175 int body_indent() const { | |
| 176 return body_indent_; | |
| 177 } | |
| 178 int tolerance() const { | |
| 179 return tolerance_; | |
| 180 } | |
| 181 bool is_flush() const { | |
| 182 return (justification_ == tesseract::JUSTIFICATION_LEFT || | |
| 183 justification_ == tesseract::JUSTIFICATION_RIGHT) && | |
| 184 abs(first_indent_ - body_indent_) <= tolerance_; | |
| 185 } | |
| 186 | |
| 187 // Return whether this model is likely to agree with the other model on most | |
| 188 // of the paragraphs marked with them. | |
| 189 bool Comparable(const ParagraphModel &other) const; | |
| 190 | |
| 191 std::string ToString() const; | |
| 192 | |
| 193 private: | |
| 194 tesseract::ParagraphJustification justification_; | |
| 195 int margin_; | |
| 196 int first_indent_; | |
| 197 int body_indent_; | |
| 198 int tolerance_; | |
| 199 }; | |
| 200 | |
| 201 } // namespace tesseract | |
| 202 | |
| 203 #endif // TESSERACT_CCSTRUCT_OCRPARA_H_ |
