Mercurial > hgrepos > Python2 > PyMuPDF
diff mupdf-source/thirdparty/tesseract/src/ccmain/paragraphs.h @ 2:b50eed0cc0ef upstream
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
The directory name has changed: no version number in the expanded directory now.
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Mon, 15 Sep 2025 11:43:07 +0200 |
| parents | |
| children | |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/mupdf-source/thirdparty/tesseract/src/ccmain/paragraphs.h Mon Sep 15 11:43:07 2025 +0200 @@ -0,0 +1,104 @@ +/********************************************************************** + * File: paragraphs.h + * Description: Paragraph Detection data structures. + * Author: David Eger + * Created: 25 February 2011 + * + * (C) Copyright 2011, Google Inc. + ** Licensed under the Apache License, Version 2.0 (the "License"); + ** you may not use this file except in compliance with the License. + ** You may obtain a copy of the License at + ** http://www.apache.org/licenses/LICENSE-2.0 + ** Unless required by applicable law or agreed to in writing, software + ** distributed under the License is distributed on an "AS IS" BASIS, + ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + ** See the License for the specific language governing permissions and + ** limitations under the License. + * + **********************************************************************/ + +#ifndef TESSERACT_CCMAIN_PARAGRAPHS_H_ +#define TESSERACT_CCMAIN_PARAGRAPHS_H_ + +#include <list> +#include <string> +#include "rect.h" // for TBOX + +namespace tesseract { + +class MutableIterator; +class ParagraphModel; +class PARA_LIST; +struct PARA; + +// This structure captures all information needed about a text line for the +// purposes of paragraph detection. It is meant to be exceedingly light-weight +// so that we can easily test paragraph detection independent of the rest of +// Tesseract. +class RowInfo { +public: + // Constant data derived from Tesseract output. + std::string text; // the full UTF-8 text of the line. + bool ltr; // whether the majority of the text is left-to-right + // TODO(eger) make this more fine-grained. + + bool has_leaders; // does the line contain leader dots (.....)? + bool has_drop_cap; // does the line have a drop cap? 
+ int pix_ldistance; // distance to the left pblock boundary in pixels + int pix_rdistance; // distance to the right pblock boundary in pixels + float pix_xheight; // guessed xheight for the line + int average_interword_space; // average space between words in pixels. + + int num_words; + TBOX lword_box; // in normalized (horiz text rows) space + TBOX rword_box; // in normalized (horiz text rows) space + + std::string lword_text; // the UTF-8 text of the leftmost werd + std::string rword_text; // the UTF-8 text of the rightmost werd + + // The text of a paragraph typically starts with the start of an idea and + // ends with the end of an idea. Here we define paragraph as something that + // may have a first line indent and a body indent which may be different. + // Typical words that start an idea are: + // 1. Words in western scripts that start with + // a capital letter, for example "The" + // 2. Bulleted or numbered list items, for + // example "2." + // Typical words which end an idea are words ending in punctuation marks. In + // this vocabulary, each list item is represented as a paragraph. + bool lword_indicates_list_item; + bool lword_likely_starts_idea; + bool lword_likely_ends_idea; + + bool rword_indicates_list_item; + bool rword_likely_starts_idea; + bool rword_likely_ends_idea; +}; + +// Main entry point for Paragraph Detection Algorithm. +// +// Given a set of equally spaced textlines (described by row_infos), +// Split them into paragraphs. See http://goto/paragraphstalk +// +// Output: +// row_owners - one pointer for each row, to the paragraph it belongs to. +// paragraphs - this is the actual list of PARA objects. +// models - the list of paragraph models referenced by the PARA objects. +// caller is responsible for deleting the models. 
+TESS_API +void DetectParagraphs(int debug_level, std::vector<RowInfo> *row_infos, + std::vector<PARA *> *row_owners, PARA_LIST *paragraphs, + std::vector<ParagraphModel *> *models); + +// Given a MutableIterator to the start of a block, run DetectParagraphs on +// that block and commit the results to the underlying ROW and BLOCK structs, +// saving the ParagraphModels in models. Caller owns the models. +// We use unicharset during the function to answer questions such as "is the +// first letter of this word upper case?" +TESS_API +void DetectParagraphs(int debug_level, bool after_text_recognition, + const MutableIterator *block_start, std::vector<ParagraphModel *> *models); + +} // namespace tesseract + +#endif // TESSERACT_CCMAIN_PARAGRAPHS_H_
