Mercurial > hgrepos > Python2 > PyMuPDF
comparison mupdf-source/thirdparty/tesseract/src/ccmain/paragraphs.h @ 2:b50eed0cc0ef upstream
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
The directory name has changed: no version number in the expanded directory now.
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Mon, 15 Sep 2025 11:43:07 +0200 |
| parents | |
| children | |
comparison
equal
deleted
inserted
replaced
| 1:1d09e1dec1d9 | 2:b50eed0cc0ef |
|---|---|
| 1 /********************************************************************** | |
| 2 * File: paragraphs.h | |
| 3 * Description: Paragraph Detection data structures. | |
| 4 * Author: David Eger | |
| 5 * Created: 25 February 2011 | |
| 6 * | |
| 7 * (C) Copyright 2011, Google Inc. | |
| 8 ** Licensed under the Apache License, Version 2.0 (the "License"); | |
| 9 ** you may not use this file except in compliance with the License. | |
| 10 ** You may obtain a copy of the License at | |
| 11 ** http://www.apache.org/licenses/LICENSE-2.0 | |
| 12 ** Unless required by applicable law or agreed to in writing, software | |
| 13 ** distributed under the License is distributed on an "AS IS" BASIS, | |
| 14 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
| 15 ** See the License for the specific language governing permissions and | |
| 16 ** limitations under the License. | |
| 17 * | |
| 18 **********************************************************************/ | |
| 19 | |
| 20 #ifndef TESSERACT_CCMAIN_PARAGRAPHS_H_ | |
| 21 #define TESSERACT_CCMAIN_PARAGRAPHS_H_ | |
| 22 | |
| 23 #include <list> | |
| 24 #include <string> | |
| 25 #include "rect.h" // for TBOX | |
| 26 | |
| 27 namespace tesseract { | |
| 28 | |
| 29 class MutableIterator; | |
| 30 class ParagraphModel; | |
| 31 class PARA_LIST; | |
| 32 struct PARA; | |
| 33 | |
| 34 // RowInfo captures all information needed about one text line for the | |
| 35 // purposes of paragraph detection. It is meant to be exceedingly light-weight | |
| 36 // so that paragraph detection can be tested independently of the rest of | |
| 37 // Tesseract. Scalar members default to zero/false so none is left indeterminate. | |
| 38 class RowInfo { | |
| 39 public: | |
| 40 // Constant data derived from Tesseract output. | |
| 41 std::string text; // the full UTF-8 text of the line. | |
| 42 bool ltr = false; // whether the majority of the text is left-to-right | |
| 43 // TODO(eger) make this more fine-grained. | |
| 44 | |
| 45 bool has_leaders = false; // does the line contain leader dots (.....)? | |
| 46 bool has_drop_cap = false; // does the line have a drop cap? | |
| 47 int pix_ldistance = 0; // distance to the left pblock boundary in pixels | |
| 48 int pix_rdistance = 0; // distance to the right pblock boundary in pixels | |
| 49 float pix_xheight = 0.0f; // guessed xheight for the line | |
| 50 int average_interword_space = 0; // average space between words in pixels. | |
| 51 | |
| 52 int num_words = 0; // NOTE(review): presumably the number of words in the line - confirm | |
| 53 TBOX lword_box; // in normalized (horiz text rows) space | |
| 54 TBOX rword_box; // in normalized (horiz text rows) space | |
| 55 | |
| 56 std::string lword_text; // the UTF-8 text of the leftmost werd | |
| 57 std::string rword_text; // the UTF-8 text of the rightmost werd | |
| 58 | |
| 59 // The text of a paragraph typically starts with the start of an idea and | |
| 60 // ends with the end of an idea. Here we define paragraph as something that | |
| 61 // may have a first line indent and a body indent which may be different. | |
| 62 // Typical words that start an idea are: | |
| 63 // 1. Words in western scripts that start with | |
| 64 // a capital letter, for example "The" | |
| 65 // 2. Bulleted or numbered list items, for | |
| 66 // example "2." | |
| 67 // Typical words which end an idea are words ending in punctuation marks. In | |
| 68 // this vocabulary, each list item is represented as a paragraph. | |
| 69 bool lword_indicates_list_item = false; | |
| 70 bool lword_likely_starts_idea = false; | |
| 71 bool lword_likely_ends_idea = false; | |
| 72 | |
| 73 bool rword_indicates_list_item = false; | |
| 74 bool rword_likely_starts_idea = false; | |
| 75 bool rword_likely_ends_idea = false; | |
| 76 }; | |
| 77 | |
| 78 // Main entry point for the Paragraph Detection Algorithm. | |
| 79 // | |
| 80 // Given a set of equally spaced textlines (described by row_infos), | |
| 81 // split them into paragraphs. See http://goto/paragraphstalk | |
| 82 // NOTE(review): that looks like an internal short link, and debug_level presumably sets diagnostic verbosity - confirm both. | |
| 83 // Output: | |
| 84 // row_owners - one pointer for each row, to the paragraph it belongs to. | |
| 85 // paragraphs - this is the actual list of PARA objects. | |
| 86 // models - the list of paragraph models referenced by the PARA objects. | |
| 87 // The caller is responsible for deleting the models. | |
| 88 TESS_API | |
| 89 void DetectParagraphs(int debug_level, std::vector<RowInfo> *row_infos, | |
| 90 std::vector<PARA *> *row_owners, PARA_LIST *paragraphs, | |
| 91 std::vector<ParagraphModel *> *models); | |
| 92 | |
| 93 // Given a MutableIterator to the start of a block, run DetectParagraphs on | |
| 94 // that block and commit the results to the underlying ROW and BLOCK structs, | |
| 95 // saving the ParagraphModels in models. The caller owns the models. | |
| 96 // The unicharset is used during the function to answer questions such as "is | |
| 97 // the first letter of this word upper case?" NOTE(review): after_text_recognition presumably tells whether recognized text is available - confirm. | |
| 98 TESS_API | |
| 99 void DetectParagraphs(int debug_level, bool after_text_recognition, | |
| 100 const MutableIterator *block_start, std::vector<ParagraphModel *> *models); | |
| 101 | |
| 102 } // namespace tesseract | |
| 103 | |
| 104 #endif // TESSERACT_CCMAIN_PARAGRAPHS_H_ |
