comparison mupdf-source/thirdparty/tesseract/src/ccstruct/ocrpara.h @ 2:b50eed0cc0ef upstream

ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4. The directory name has changed: no version number in the expanded directory now.
author Franz Glasner <fzglas.hg@dom66.de>
date Mon, 15 Sep 2025 11:43:07 +0200
parents
children
comparison
equal deleted inserted replaced
1:1d09e1dec1d9 2:b50eed0cc0ef
1 /////////////////////////////////////////////////////////////////////
2 // File: ocrpara.h
3 // Description: OCR Paragraph Output Type
4 // Author: David Eger
5 //
6 // (C) Copyright 2010, Google Inc.
7 // Licensed under the Apache License, Version 2.0 (the "License");
8 // you may not use this file except in compliance with the License.
9 // You may obtain a copy of the License at
10 // http://www.apache.org/licenses/LICENSE-2.0
11 // Unless required by applicable law or agreed to in writing, software
12 // distributed under the License is distributed on an "AS IS" BASIS,
13 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 // See the License for the specific language governing permissions and
15 // limitations under the License.
16 //
17 ///////////////////////////////////////////////////////////////////////
18
19 #ifndef TESSERACT_CCSTRUCT_OCRPARA_H_
20 #define TESSERACT_CCSTRUCT_OCRPARA_H_
21
#include "elst.h"

#include <tesseract/publictypes.h>

#include <cstdlib> // for std::abs
#include <string>  // for std::string
25
26 namespace tesseract {
27
28 class ParagraphModel;
29
30 struct PARA : public ELIST_LINK {
31 public:
32 PARA()
33 : model(nullptr)
34 , is_list_item(false)
35 , is_very_first_or_continuation(false)
36 , has_drop_cap(false) {}
37
38 // We do not own the model, we just reference it.
39 // model may be nullptr if there is not a good model for this paragraph.
40 const ParagraphModel *model;
41
42 bool is_list_item;
43
44 // The first paragraph on a page often lacks a first line indent, but should
45 // still be modeled by the same model as other body text paragraphs on the
46 // page.
47 bool is_very_first_or_continuation;
48
49 // Does this paragraph begin with a drop cap?
50 bool has_drop_cap;
51 };
52
53 ELISTIZEH(PARA)
54
// A geometric model of paragraph indentation and alignment.
//
// Measurements are in pixels. The meaning of the integer arguments changes
// depending upon the value of justification. Distances less than or equal
// to tolerance apart we take as "equivalent" for the purpose of model
// matching, and in the examples below, we assume tolerance is zero.
//
// justification = LEFT:
//   margin       the "ignored" margin to the left block edge.
//   first_indent indent from the left margin to a typical first text line.
//   body_indent  indent from the left margin of a typical body text line.
//
// justification = RIGHT:
//   margin       the "ignored" margin to the right block edge.
//   first_indent indent from the right margin to a typical first text line.
//   body_indent  indent from the right margin of a typical body text line.
//
// justification = CENTER:
//   margin       ignored
//   first_indent ignored
//   body_indent  ignored
//
// ====== Extended example, assuming each letter is ten pixels wide: =======
//
// +--------------------------------+
// |            Awesome             |   ParagraphModel(CENTER, 0, 0, 0)
// |         Centered Title         |
// |      Paragraph Detection       |
// |            OCR TEAM            |
// |        10 November 2010        |
// |                                |
// |  Look here, I have a paragraph.|   ParagraphModel(LEFT, 0, 20, 0)
// |This paragraph starts at the top|
// |of the page and takes 3 lines.  |
// |  Here I have a second paragraph|   ParagraphModel(LEFT, 0, 20, 0)
// |which indicates that the first  |
// |paragraph is not a continuation |
// |from a previous page, as it is  |
// |indented just like this second  |
// |paragraph.                      |
// |   Here is a block quote. It    |   ParagraphModel(LEFT, 30, 0, 0)
// |   looks like the prior text    |
// |   but it  is  indented more    |
// |   and is  fully  justified.    |
// |  So how does one deal with     |   ParagraphModel(LEFT, 0, 20, 0)
// |centered text, block quotes,    |
// |normal paragraphs, and lists    |
// |like what follows?              |
// |1. Make a plan.                 |   ParagraphModel(LEFT, 0, 0, 30)
// |2. Use a heuristic, for example,|   ParagraphModel(LEFT, 0, 0, 30)
// |   looking for lines where the  |
// |   first word of the next line  |
// |   would fit on the previous    |
// |   line.                        |
// |8. Try to implement the plan in |   ParagraphModel(LEFT, 0, 0, 30)
// |   Python and try it out.       |
// |4. Determine how to fix the     |   ParagraphModel(LEFT, 0, 0, 30)
// |   mistakes.                    |
// |5. Repeat.                      |   ParagraphModel(LEFT, 0, 0, 30)
// |  For extra painful penalty work|   ParagraphModel(LEFT, 0, 20, 0)
// |you can try to identify source  |
// |code. Ouch!                     |
// +--------------------------------+
118 class TESS_API ParagraphModel {
119 public:
120 ParagraphModel(tesseract::ParagraphJustification justification, int margin, int first_indent,
121 int body_indent, int tolerance)
122 : justification_(justification)
123 , margin_(margin)
124 , first_indent_(first_indent)
125 , body_indent_(body_indent)
126 , tolerance_(tolerance) {
127 // Make one of {first_indent, body_indent} is 0.
128 int added_margin = first_indent;
129 if (body_indent < added_margin) {
130 added_margin = body_indent;
131 }
132 margin_ += added_margin;
133 first_indent_ -= added_margin;
134 body_indent_ -= added_margin;
135 }
136
137 ParagraphModel()
138 : justification_(tesseract::JUSTIFICATION_UNKNOWN)
139 , margin_(0)
140 , first_indent_(0)
141 , body_indent_(0)
142 , tolerance_(0) {}
143
144 // ValidFirstLine() and ValidBodyLine() take arguments describing a text line
145 // in a block of text which we are trying to model:
146 // lmargin, lindent: these add up to the distance from the leftmost ink
147 // in the text line to the surrounding text block's left
148 // edge.
149 // rmargin, rindent: these add up to the distance from the rightmost ink
150 // in the text line to the surrounding text block's right
151 // edge.
152 // The caller determines the division between "margin" and "indent", which
153 // only actually affect whether we think the line may be centered.
154 //
155 // If the amount of whitespace matches the amount of whitespace expected on
156 // the relevant side of the line (within tolerance_) we say it matches.
157
158 // Return whether a given text line could be a first paragraph line according
159 // to this paragraph model.
160 bool ValidFirstLine(int lmargin, int lindent, int rindent, int rmargin) const;
161
162 // Return whether a given text line could be a first paragraph line according
163 // to this paragraph model.
164 bool ValidBodyLine(int lmargin, int lindent, int rindent, int rmargin) const;
165
166 tesseract::ParagraphJustification justification() const {
167 return justification_;
168 }
169 int margin() const {
170 return margin_;
171 }
172 int first_indent() const {
173 return first_indent_;
174 }
175 int body_indent() const {
176 return body_indent_;
177 }
178 int tolerance() const {
179 return tolerance_;
180 }
181 bool is_flush() const {
182 return (justification_ == tesseract::JUSTIFICATION_LEFT ||
183 justification_ == tesseract::JUSTIFICATION_RIGHT) &&
184 abs(first_indent_ - body_indent_) <= tolerance_;
185 }
186
187 // Return whether this model is likely to agree with the other model on most
188 // paragraphs they are marked.
189 bool Comparable(const ParagraphModel &other) const;
190
191 std::string ToString() const;
192
193 private:
194 tesseract::ParagraphJustification justification_;
195 int margin_;
196 int first_indent_;
197 int body_indent_;
198 int tolerance_;
199 };
200
201 } // namespace tesseract
202
203 #endif // TESSERACT_CCSTRUCT_OCRPARA_H_