Mercurial > hgrepos > Python2 > PyMuPDF
comparison mupdf-source/thirdparty/tesseract/src/ccmain/paragraphs_internal.h @ 2:b50eed0cc0ef upstream
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
The directory name has changed: no version number in the expanded directory now.
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Mon, 15 Sep 2025 11:43:07 +0200 |
| parents | |
| children | |
comparison
equal
deleted
inserted
replaced
| 1:1d09e1dec1d9 | 2:b50eed0cc0ef |
|---|---|
| 1 /********************************************************************** | |
| 2 * File: paragraphs_internal.h | |
| 3 * Description: Paragraph Detection internal data structures. | |
| 4 * Author: David Eger | |
| 5 * | |
| 6 * (C) Copyright 2011, Google Inc. | |
| 7 ** Licensed under the Apache License, Version 2.0 (the "License"); | |
| 8 ** you may not use this file except in compliance with the License. | |
| 9 ** You may obtain a copy of the License at | |
| 10 ** http://www.apache.org/licenses/LICENSE-2.0 | |
| 11 ** Unless required by applicable law or agreed to in writing, software | |
| 12 ** distributed under the License is distributed on an "AS IS" BASIS, | |
| 13 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
| 14 ** See the License for the specific language governing permissions and | |
| 15 ** limitations under the License. | |
| 16 * | |
| 17 **********************************************************************/ | |
| 18 | |
| 19 #ifndef TESSERACT_CCMAIN_PARAGRAPHS_INTERNAL_H_ | |
| 20 #define TESSERACT_CCMAIN_PARAGRAPHS_INTERNAL_H_ | |
| 21 | |
| 22 #include <tesseract/publictypes.h> // for ParagraphJustification | |
| 23 #include "paragraphs.h" | |
| 24 | |
| 25 // NO CODE OUTSIDE OF paragraphs.cpp AND TESTS SHOULD NEED TO ACCESS | |
| 26 // DATA STRUCTURES OR FUNCTIONS IN THIS FILE. | |
| 27 | |
| 28 namespace tesseract { | |
| 29 | |
| 30 class UNICHARSET; | |
| 31 class WERD_CHOICE; | |
| 32 | |
| 33 // Return whether the given word is likely to be a list item start word. | |
| 34 TESS_API | |
| 35 bool AsciiLikelyListItem(const std::string &word); | |
| 36 | |
| 37 // Set right word attributes given either a unicharset and werd or a utf8 | |
| 38 // string. | |
| 39 TESS_API | |
| 40 void RightWordAttributes(const UNICHARSET *unicharset, const WERD_CHOICE *werd, const std::string &utf8, | |
| 41 bool *is_list, bool *starts_idea, bool *ends_idea); | |
| 42 | |
| 43 // Set left word attributes given either a unicharset and werd or a utf8 string. | |
| 44 TESS_API | |
| 45 void LeftWordAttributes(const UNICHARSET *unicharset, const WERD_CHOICE *werd, const std::string &utf8, | |
| 46 bool *is_list, bool *starts_idea, bool *ends_idea); | |
| 47 | |
| 48 /* Role of a text line within a paragraph; each enumerator's value is a printable character code. */ enum LineType { | |
| 49 LT_START = 'S', // First line of a paragraph. | |
| 50 LT_BODY = 'C', // Continuation line of a paragraph. | |
| 51 LT_UNKNOWN = 'U', // No clues. | |
| 52 LT_MULTIPLE = 'M', // Matches for both LT_START and LT_BODY. | |
| 53 }; | |
| 54 | |
| 55 // The first paragraph in a page of body text is often un-indented. | |
| 56 // This is a typographic convention which is common to indicate either that: | |
| 57 // (1) The paragraph is the continuation of a previous paragraph, or | |
| 58 // (2) The paragraph is the first paragraph in a chapter. | |
| 59 // | |
| 60 // I refer to such paragraphs as "crown"s, and the output of the paragraph | |
| 61 // detection algorithm attempts to give them the same paragraph model as | |
| 62 // the rest of the body text. | |
| 63 // | |
| 64 // Nonetheless, while building hypotheses, it is useful to mark the lines | |
| 65 // of crown paragraphs temporarily as crowns, either aligned left or right. | |
| 66 extern const ParagraphModel *kCrownLeft; | |
| 67 extern const ParagraphModel *kCrownRight; | |
| 68 | |
| 69 /* Returns true when model is non-null and is neither of the kCrownLeft / kCrownRight placeholder models. */ inline bool StrongModel(const ParagraphModel *model) { | |
| 70 return model != nullptr && model != kCrownLeft && model != kCrownRight; | |
| 71 } | |
| 72 | |
| 73 /* Pairs a LineType with a (possibly null) ParagraphModel pointer. */ struct LineHypothesis { | |
| 74 /* Default: LT_UNKNOWN with no model. */ LineHypothesis() : ty(LT_UNKNOWN), model(nullptr) {} | |
| 75 /* Stores the given line type and model pointer as-is; the model is not copied. */ LineHypothesis(LineType line_type, const ParagraphModel *m) : ty(line_type), model(m) {} | |
| 76 LineHypothesis(const LineHypothesis &other) = default; | |
| 77 | |
| 78 // Copy assignment operator (compiler-generated member-wise copy). | |
| 79 LineHypothesis &operator=(const LineHypothesis &other) = default; | |
| 80 | |
| 81 /* Equal when both the line type and the model pointer match (pointer identity, not model contents). */ bool operator==(const LineHypothesis &other) const { | |
| 82 return ty == other.ty && model == other.model; | |
| 83 } | |
| 84 | |
| 85 LineType ty; | |
| 86 const ParagraphModel *model; | |
| 87 }; | |
| 88 | |
| 89 class ParagraphTheory; // Forward Declaration | |
| 90 | |
| 91 using SetOfModels = std::vector<const ParagraphModel *>; | |
| 92 | |
| 93 // Row Scratch Registers are data generated by the paragraph detection | |
| 94 // algorithm based on a RowInfo input. | |
| 95 class RowScratchRegisters { | |
| 96 public: | |
| 97 // We presume row will outlive us. | |
| 98 void Init(const RowInfo &row); | |
| 99 | |
| 100 LineType GetLineType() const; | |
| 101 | |
| 102 LineType GetLineType(const ParagraphModel *model) const; | |
| 103 | |
| 104 // Mark this as a start line type, sans model. This is useful for the | |
| 105 // initial marking of probable body lines or paragraph start lines. | |
| 106 void SetStartLine(); | |
| 107 | |
| 108 // Mark this as a body line type, sans model. This is useful for the | |
| 109 // initial marking of probable body lines or paragraph start lines. | |
| 110 void SetBodyLine(); | |
| 111 | |
| 112 // Record that this row fits as a paragraph start line in the given model. | |
| 113 void AddStartLine(const ParagraphModel *model); | |
| 114 // Record that this row fits as a paragraph body line in the given model. | |
| 115 void AddBodyLine(const ParagraphModel *model); | |
| 116 | |
| 117 // Clear all hypotheses about this line. | |
| 118 void SetUnknown() { | |
| 119 hypotheses_.clear(); | |
| 120 } | |
| 121 | |
| 122 // Append all hypotheses of strong models that match this row as a start. | |
| 123 void StartHypotheses(SetOfModels *models) const; | |
| 124 | |
| 125 // Append all hypotheses of strong models matching this row. | |
| 126 void StrongHypotheses(SetOfModels *models) const; | |
| 127 | |
| 128 // Append all hypotheses for this row. | |
| 129 void NonNullHypotheses(SetOfModels *models) const; | |
| 130 | |
| 131 // Discard any hypotheses whose model is not in the given list. | |
| 132 void DiscardNonMatchingHypotheses(const SetOfModels &models); | |
| 133 | |
| 134 // If we have only one hypothesis and that is that this line is a paragraph | |
| 135 // start line of a certain model, return that model. Else return nullptr. | |
| 136 const ParagraphModel *UniqueStartHypothesis() const; | |
| 137 | |
| 138 // If we have only one hypothesis and that is that this line is a paragraph | |
| 139 // body line of a certain model, return that model. Else return nullptr. | |
| 140 const ParagraphModel *UniqueBodyHypothesis() const; | |
| 141 | |
| 142 // Return the indentation for the side opposite of the aligned side. | |
| 143 int OffsideIndent(tesseract::ParagraphJustification just) const { | |
| 144 switch (just) { | |
| 145 case tesseract::JUSTIFICATION_RIGHT: | |
| 146 return lindent_; | |
| 147 case tesseract::JUSTIFICATION_LEFT: | |
| 148 return rindent_; | |
| 149 default: /* any other justification: the larger of the two indents */ | |
| 150 return lindent_ > rindent_ ? lindent_ : rindent_; | |
| 151 } | |
| 152 } | |
| 153 | |
| 154 // Return the indentation for the side the text is aligned to. | |
| 155 int AlignsideIndent(tesseract::ParagraphJustification just) const { | |
| 156 switch (just) { | |
| 157 case tesseract::JUSTIFICATION_RIGHT: | |
| 158 return rindent_; | |
| 159 case tesseract::JUSTIFICATION_LEFT: | |
| 160 return lindent_; | |
| 161 default: /* same fallback as OffsideIndent: the larger of the two indents */ | |
| 162 return lindent_ > rindent_ ? lindent_ : rindent_; | |
| 163 } | |
| 164 } | |
| 165 | |
| 166 // Append header fields to a vector of row headings. | |
| 167 static void AppendDebugHeaderFields(std::vector<std::string> &header); | |
| 168 | |
| 169 // Append data for this row to a vector of debug strings. | |
| 170 void AppendDebugInfo(const ParagraphTheory &theory, std::vector<std::string> &dbg) const; | |
| 171 | |
| 172 /* No in-class initializer. NOTE(review): presumably set by Init() - confirm in paragraphs.cpp. */ const RowInfo *ri_; | |
| 173 | |
| 174 // These four values form a horizontal box model for the white space | |
| 175 // on the edges of each line. At each point in the algorithm, the following | |
| 176 // shall hold: | |
| 177 // ri_->pix_ldistance = lmargin_ + lindent_ | |
| 178 // ri_->pix_rdistance = rindent_ + rmargin_ | |
| 179 // None of the four has an in-class initializer. | |
| 180 int lindent_; | |
| 181 int rindent_; | |
| 182 int rmargin_; | |
| 183 | |
| 184 private: | |
| 185 // Hypotheses of either LT_START or LT_BODY | |
| 186 std::vector<LineHypothesis> hypotheses_; | |
| 187 }; | |
| 188 | |
| 189 // A collection of convenience functions for wrapping the set of | |
| 190 // Paragraph Models we believe correctly model the paragraphs in the image. | |
| 191 class ParagraphTheory { | |
| 192 public: | |
| 193 // We presume models will outlive us, and that models will take ownership | |
| 194 // of any ParagraphModel *'s we add. | |
| 195 /* Stores the pointer only; the vector itself is not copied. */ explicit ParagraphTheory(std::vector<ParagraphModel *> *models) : models_(models) {} | |
| 196 /* Mutable access to the caller-supplied model vector. */ std::vector<ParagraphModel *> &models() { | |
| 197 return *models_; | |
| 198 } | |
| 199 /* Read-only access to the caller-supplied model vector. */ const std::vector<ParagraphModel *> &models() const { | |
| 200 return *models_; | |
| 201 } | |
| 202 | |
| 203 // Return an existing model if one that is Comparable() can be found. | |
| 204 // Else, allocate a new copy of model to save and return a pointer to it. | |
| 205 const ParagraphModel *AddModel(const ParagraphModel &model); | |
| 206 | |
| 207 // Discard any models we've made that are not in the list of used models. | |
| 208 void DiscardUnusedModels(const SetOfModels &used_models); | |
| 209 | |
| 210 // Output the set of all non-centered models through the models out-parameter. | |
| 211 void NonCenteredModels(SetOfModels *models); | |
| 212 | |
| 213 // If any of the non-centered paragraph models we know about fit | |
| 214 // rows[start, end), return it. Else nullptr. | |
| 215 const ParagraphModel *Fits(const std::vector<RowScratchRegisters> *rows, int start, | |
| 216 int end) const; | |
| 217 | |
| 218 /* NOTE(review): the result for a model that is not found is not visible in this header - confirm in paragraphs.cpp. */ int IndexOf(const ParagraphModel *model) const; | |
| 219 | |
| 220 private: | |
| 221 /* Caller-supplied vector; set once in the constructor. */ std::vector<ParagraphModel *> *models_; | |
| 222 /* NOTE(review): presumably the models created through AddModel() - confirm. */ std::vector<ParagraphModel *> models_we_added_; | |
| 223 }; | |
| 224 | |
| 225 /* NOTE(review): presumably reports whether rows[row] is acceptable as a first line under model - confirm in paragraphs.cpp. */ bool ValidFirstLine(const std::vector<RowScratchRegisters> *rows, int row, | |
| 226 const ParagraphModel *model); | |
| 227 /* NOTE(review): presumably reports whether rows[row] is acceptable as a body line under model - confirm in paragraphs.cpp. */ bool ValidBodyLine(const std::vector<RowScratchRegisters> *rows, int row, | |
| 228 const ParagraphModel *model); | |
| 229 /* NOTE(review): presumably reports whether rows a and b are compatible as lines of a crown paragraph under model - confirm in paragraphs.cpp. */ bool CrownCompatible(const std::vector<RowScratchRegisters> *rows, int a, int b, | |
| 230 const ParagraphModel *model); | |
| 231 | |
| 232 // A class for smearing Paragraph Model hypotheses to surrounding rows. | |
| 233 // The idea here is that StrongEvidenceClassify first marks only exceedingly | |
| 234 // obvious start and body rows and constructs models of them. Thereafter, | |
| 235 // we may have left over unmarked lines (mostly end-of-paragraph lines) which | |
| 236 // were too short to have much confidence about, but which fit the models we've | |
| 237 // constructed perfectly and which we ought to mark. This class is used to | |
| 238 // "smear" our models over the text. | |
| 239 class ParagraphModelSmearer { | |
| 240 public: | |
| 241 ParagraphModelSmearer(std::vector<RowScratchRegisters> *rows, int row_start, int row_end, | |
| 242 ParagraphTheory *theory); | |
| 243 | |
| 244 // Smear forward paragraph models from existing row markings to subsequent | |
| 245 // text lines if they fit, and mark any thereafter still unmodeled rows | |
| 246 // with any model in the theory that fits them. | |
| 247 void Smear(); | |
| 248 | |
| 249 private: | |
| 250 // Record in open_models_ for rows [row_start, row_end) the list of models | |
| 251 // currently open at each row. | |
| 252 // A model is still open in a row if some previous row has said model as a | |
| 253 // start hypothesis, and all rows since (including this row) would fit as | |
| 254 // either a body or start line in that model. | |
| 255 void CalculateOpenModels(int row_start, int row_end); | |
| 256 | |
| 257 /* Open-model set for an absolute row index. */ SetOfModels &OpenModels(int row) { | |
| 258 return open_models_[row - row_start_ + 1]; /* + 1: slot 0 is for row row_start_ - 1 */ | |
| 259 } | |
| 260 | |
| 261 ParagraphTheory *theory_; | |
| 262 std::vector<RowScratchRegisters> *rows_; | |
| 263 int row_start_; | |
| 264 int row_end_; | |
| 265 | |
| 266 // open_models_ corresponds to rows[row_start_ - 1, row_end_] | |
| 267 // | |
| 268 // open_models_: Contains models which there was an active (open) paragraph | |
| 269 // as of the previous line and for which the left and right | |
| 270 // indents admit the possibility that this text line continues | |
| 271 // to fit the same model. | |
| 272 // TODO(eger): Think about whether we can get rid of "Open" models and just | |
| 273 // use the current hypotheses on RowScratchRegisters. | |
| 274 std::vector<SetOfModels> open_models_; | |
| 275 }; | |
| 276 | |
| 277 // Clear all hypotheses about lines [start, end) and reset the margins to the | |
| 278 // percentile (0..100) value of the left and right row edges for this run of | |
| 279 // rows. | |
| 280 void RecomputeMarginsAndClearHypotheses(std::vector<RowScratchRegisters> *rows, int start, | |
| 281 int end, int percentile); | |
| 282 | |
| 283 // Return the median inter-word space in rows[row_start, row_end). | |
| 284 int InterwordSpace(const std::vector<RowScratchRegisters> &rows, int row_start, int row_end); | |
| 285 | |
| 286 // Return whether the first word on the after line can fit in the space at | |
| 287 // the end of the before line (knowing which way the text is aligned and read). | |
| 288 bool FirstWordWouldHaveFit(const RowScratchRegisters &before, const RowScratchRegisters &after, | |
| 289 tesseract::ParagraphJustification justification); | |
| 290 | |
| 291 // Return whether the first word on the after line can fit in the space at | |
| 292 // the end of the before line (not knowing the text alignment). | |
| 293 bool FirstWordWouldHaveFit(const RowScratchRegisters &before, const RowScratchRegisters &after); | |
| 294 | |
| 295 // Do rows[start, end) form a single instance of the given paragraph model? | |
| 296 bool RowsFitModel(const std::vector<RowScratchRegisters> *rows, int start, int end, | |
| 297 const ParagraphModel *model); | |
| 298 | |
| 299 // Given a set of row_owners pointing to PARAs or nullptr (no paragraph known), | |
| 300 // normalize each row_owner to point to an actual PARA, and output the | |
| 301 // paragraphs in order onto paragraphs. | |
| 302 void CanonicalizeDetectionResults(std::vector<PARA *> *row_owners, PARA_LIST *paragraphs); | |
| 303 | |
| 304 } // namespace tesseract | |
| 305 | |
| 306 #endif // TESSERACT_CCMAIN_PARAGRAPHS_INTERNAL_H_ |
