comparison mupdf-source/thirdparty/tesseract/src/ccmain/paragraphs.h @ 2:b50eed0cc0ef upstream

ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4. The directory name has changed: no version number in the expanded directory now.
author Franz Glasner <fzglas.hg@dom66.de>
date Mon, 15 Sep 2025 11:43:07 +0200
parents
children
comparison
equal deleted inserted replaced
1:1d09e1dec1d9 2:b50eed0cc0ef
1 /**********************************************************************
2 * File: paragraphs.h
3 * Description: Paragraph Detection data structures.
4 * Author: David Eger
5 * Created: 25 February 2011
6 *
7 * (C) Copyright 2011, Google Inc.
8 ** Licensed under the Apache License, Version 2.0 (the "License");
9 ** you may not use this file except in compliance with the License.
10 ** You may obtain a copy of the License at
11 ** http://www.apache.org/licenses/LICENSE-2.0
12 ** Unless required by applicable law or agreed to in writing, software
13 ** distributed under the License is distributed on an "AS IS" BASIS,
14 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 ** See the License for the specific language governing permissions and
16 ** limitations under the License.
17 *
18 **********************************************************************/
19
20 #ifndef TESSERACT_CCMAIN_PARAGRAPHS_H_
21 #define TESSERACT_CCMAIN_PARAGRAPHS_H_
22
23 #include <list>
24 #include <string>
25 #include "rect.h" // for TBOX
26
27 namespace tesseract {
28
// Forward declarations. Within this header these types are only referred to
// through pointers, so their full definitions are not required here.
struct PARA;
class PARA_LIST;
class ParagraphModel;
class MutableIterator;
33
34 // This structure captures all information needed about a text line for the
35 // purposes of paragraph detection. It is meant to be exceedingly light-weight
36 // so that we can easily test paragraph detection independent of the rest of
37 // Tesseract.
38 class RowInfo {
39 public:
40 // Constant data derived from Tesseract output.
41 std::string text; // the full UTF-8 text of the line.
42 bool ltr; // whether the majority of the text is left-to-right
43 // TODO(eger) make this more fine-grained.
44
45 bool has_leaders; // does the line contain leader dots (.....)?
46 bool has_drop_cap; // does the line have a drop cap?
47 int pix_ldistance; // distance to the left pblock boundary in pixels
48 int pix_rdistance; // distance to the right pblock boundary in pixels
49 float pix_xheight; // guessed xheight for the line
50 int average_interword_space; // average space between words in pixels.
51
52 int num_words;
53 TBOX lword_box; // in normalized (horiz text rows) space
54 TBOX rword_box; // in normalized (horiz text rows) space
55
56 std::string lword_text; // the UTF-8 text of the leftmost werd
57 std::string rword_text; // the UTF-8 text of the rightmost werd
58
59 // The text of a paragraph typically starts with the start of an idea and
60 // ends with the end of an idea. Here we define paragraph as something that
61 // may have a first line indent and a body indent which may be different.
62 // Typical words that start an idea are:
63 // 1. Words in western scripts that start with
64 // a capital letter, for example "The"
65 // 2. Bulleted or numbered list items, for
66 // example "2."
67 // Typical words which end an idea are words ending in punctuation marks. In
68 // this vocabulary, each list item is represented as a paragraph.
69 bool lword_indicates_list_item;
70 bool lword_likely_starts_idea;
71 bool lword_likely_ends_idea;
72
73 bool rword_indicates_list_item;
74 bool rword_likely_starts_idea;
75 bool rword_likely_ends_idea;
76 };
77
78 // Main entry point for Paragraph Detection Algorithm.
79 //
80 // Given a set of equally spaced textlines (described by row_infos),
81 // Split them into paragraphs. See http://goto/paragraphstalk
82 //
83 // Output:
84 // row_owners - one pointer for each row, to the paragraph it belongs to.
85 // paragraphs - this is the actual list of PARA objects.
86 // models - the list of paragraph models referenced by the PARA objects.
87 // caller is responsible for deleting the models.
88 TESS_API
89 void DetectParagraphs(int debug_level, std::vector<RowInfo> *row_infos,
90 std::vector<PARA *> *row_owners, PARA_LIST *paragraphs,
91 std::vector<ParagraphModel *> *models);
92
93 // Given a MutableIterator to the start of a block, run DetectParagraphs on
94 // that block and commit the results to the underlying ROW and BLOCK structs,
95 // saving the ParagraphModels in models. Caller owns the models.
96 // We use unicharset during the function to answer questions such as "is the
97 // first letter of this word upper case?"
98 TESS_API
99 void DetectParagraphs(int debug_level, bool after_text_recognition,
100 const MutableIterator *block_start, std::vector<ParagraphModel *> *models);
101
102 } // namespace tesseract
103
104 #endif // TESSERACT_CCMAIN_PARAGRAPHS_H_