diff mupdf-source/thirdparty/tesseract/src/ccmain/paragraphs.h @ 2:b50eed0cc0ef upstream

ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4. The directory name has changed: no version number in the expanded directory now.
author Franz Glasner <fzglas.hg@dom66.de>
date Mon, 15 Sep 2025 11:43:07 +0200
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/mupdf-source/thirdparty/tesseract/src/ccmain/paragraphs.h	Mon Sep 15 11:43:07 2025 +0200
@@ -0,0 +1,104 @@
+/**********************************************************************
+ * File:        paragraphs.h
+ * Description: Paragraph Detection data structures.
+ * Author:      David Eger
+ * Created:     25 February 2011
+ *
+ * (C) Copyright 2011, Google Inc.
+ ** Licensed under the Apache License, Version 2.0 (the "License");
+ ** you may not use this file except in compliance with the License.
+ ** You may obtain a copy of the License at
+ ** http://www.apache.org/licenses/LICENSE-2.0
+ ** Unless required by applicable law or agreed to in writing, software
+ ** distributed under the License is distributed on an "AS IS" BASIS,
+ ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ ** See the License for the specific language governing permissions and
+ ** limitations under the License.
+ *
+ **********************************************************************/
+
+#ifndef TESSERACT_CCMAIN_PARAGRAPHS_H_
+#define TESSERACT_CCMAIN_PARAGRAPHS_H_
+
+#include <list>
+#include <string>
+#include "rect.h"   // for TBOX
+
+namespace tesseract {
+
+class MutableIterator;
+class ParagraphModel;
+class PARA_LIST;
+struct PARA;
+
+// This structure captures all information needed about a text line for the
+// purposes of paragraph detection.  It is meant to be exceedingly light-weight
+// so that we can easily test paragraph detection independent of the rest of
+// Tesseract.
+class RowInfo {
+public:
+  // Constant data derived from Tesseract output.
+  std::string text; // the full UTF-8 text of the line.
+  bool ltr;    // whether the majority of the text is left-to-right
+               // TODO(eger) make this more fine-grained.
+
+  bool has_leaders;            // does the line contain leader dots (.....)?
+  bool has_drop_cap;           // does the line have a drop cap?
+  int pix_ldistance;           // distance to the left pblock boundary in pixels
+  int pix_rdistance;           // distance to the right pblock boundary in pixels
+  float pix_xheight;           // guessed xheight for the line
+  int average_interword_space; // average space between words in pixels.
+
+  int num_words;
+  TBOX lword_box; // in normalized (horiz text rows) space
+  TBOX rword_box; // in normalized (horiz text rows) space
+
+  std::string lword_text; // the UTF-8 text of the leftmost werd
+  std::string rword_text; // the UTF-8 text of the rightmost werd
+
+  //   The text of a paragraph typically starts with the start of an idea and
+  // ends with the end of an idea.  Here we define paragraph as something that
+  // may have a first line indent and a body indent which may be different.
+  // Typical words that start an idea are:
+  //   1. Words in western scripts that start with
+  //      a capital letter, for example "The"
+  //   2. Bulleted or numbered list items, for
+  //      example "2."
+  // Typical words which end an idea are words ending in punctuation marks. In
+  // this vocabulary, each list item is represented as a paragraph.
+  bool lword_indicates_list_item;
+  bool lword_likely_starts_idea;
+  bool lword_likely_ends_idea;
+
+  bool rword_indicates_list_item;
+  bool rword_likely_starts_idea;
+  bool rword_likely_ends_idea;
+};
+
+// Main entry point for Paragraph Detection Algorithm.
+//
+// Given a set of equally spaced textlines (described by row_infos),
+// Split them into paragraphs.  See http://goto/paragraphstalk
+//
+// Output:
+//   row_owners - one pointer for each row, to the paragraph it belongs to.
+//   paragraphs - this is the actual list of PARA objects.
+//   models - the list of paragraph models referenced by the PARA objects.
+//            caller is responsible for deleting the models.
+TESS_API
+void DetectParagraphs(int debug_level, std::vector<RowInfo> *row_infos,
+                      std::vector<PARA *> *row_owners, PARA_LIST *paragraphs,
+                      std::vector<ParagraphModel *> *models);
+
+// Given a MutableIterator to the start of a block, run DetectParagraphs on
+// that block and commit the results to the underlying ROW and BLOCK structs,
+// saving the ParagraphModels in models.  Caller owns the models.
+// We use unicharset during the function to answer questions such as "is the
+// first letter of this word upper case?"
+TESS_API
+void DetectParagraphs(int debug_level, bool after_text_recognition,
+                      const MutableIterator *block_start, std::vector<ParagraphModel *> *models);
+
+} // namespace tesseract
+
+#endif // TESSERACT_CCMAIN_PARAGRAPHS_H_