diff mupdf-source/thirdparty/tesseract/src/ccstruct/ocrpara.h @ 2:b50eed0cc0ef upstream

ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4. The directory name has changed: no version number in the expanded directory now.
author Franz Glasner <fzglas.hg@dom66.de>
date Mon, 15 Sep 2025 11:43:07 +0200
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/mupdf-source/thirdparty/tesseract/src/ccstruct/ocrpara.h	Mon Sep 15 11:43:07 2025 +0200
@@ -0,0 +1,203 @@
+/////////////////////////////////////////////////////////////////////
+// File:        ocrpara.h
+// Description: OCR Paragraph Output Type
+// Author:      David Eger
+//
+// (C) Copyright 2010, Google Inc.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+///////////////////////////////////////////////////////////////////////
+
+#ifndef TESSERACT_CCSTRUCT_OCRPARA_H_
+#define TESSERACT_CCSTRUCT_OCRPARA_H_
+
+#include "elst.h"
+
+#include <tesseract/publictypes.h>
+
+namespace tesseract {
+
+class ParagraphModel;
+
+struct PARA : public ELIST_LINK {
+public:
+  PARA()
+      : model(nullptr)
+      , is_list_item(false)
+      , is_very_first_or_continuation(false)
+      , has_drop_cap(false) {}
+
+  // We do not own the model, we just reference it.
+  // model may be nullptr if there is not a good model for this paragraph.
+  const ParagraphModel *model;
+
+  bool is_list_item;
+
+  // The first paragraph on a page often lacks a first line indent, but should
+  // still be modeled by the same model as other body text paragraphs on the
+  // page.
+  bool is_very_first_or_continuation;
+
+  // Does this paragraph begin with a drop cap?
+  bool has_drop_cap;
+};
+
+ELISTIZEH(PARA)
+
+// A geometric model of paragraph indentation and alignment.
+//
+// Measurements are in pixels. The meaning of the integer arguments changes
+// depending upon the value of justification.  Distances less than or equal
+// to tolerance apart we take as "equivalent" for the purpose of model
+// matching, and in the examples below, we assume tolerance is zero.
+//
+// justification = LEFT:
+//   margin       the "ignored" margin to the left block edge.
+//   first_indent indent from the left margin to a typical first text line.
+//   body_indent  indent from the left margin of a typical body text line.
+//
+// justification = RIGHT:
+//   margin       the "ignored" margin to the right block edge.
+//   first_indent indent from the right margin to a typical first text line.
+//   body_indent  indent from the right margin of a typical body text line.
+//
+// justification = CENTER:
+//   margin       ignored
+//   first_indent ignored
+//   body_indent  ignored
+//
+//  ====== Extended example, assuming each letter is ten pixels wide: =======
+//
+// +--------------------------------+
+// |      Awesome                   | ParagraphModel(CENTER, 0, 0, 0)
+// |   Centered Title               |
+// | Paragraph Detection            |
+// |      OCR TEAM                  |
+// |  10 November 2010              |
+// |                                |
+// |  Look here, I have a paragraph.| ParagraphModel(LEFT, 0, 20, 0)
+// |This paragraph starts at the top|
+// |of the page and takes 3 lines.  |
+// |  Here I have a second paragraph| ParagraphModel(LEFT, 0, 20, 0)
+// |which indicates that the first  |
+// |paragraph is not a continuation |
+// |from a previous page, as it is  |
+// |indented just like this second  |
+// |paragraph.                      |
+// |   Here is a block quote. It    | ParagraphModel(LEFT, 30, 0, 0)
+// |   looks like the prior text    |
+// |   but it  is indented  more    |
+// |   and is fully justified.      |
+// |  So how does one deal with     | ParagraphModel(LEFT, 0, 20, 0)
+// |centered text, block quotes,    |
+// |normal paragraphs, and lists    |
+// |like what follows?              |
+// |1. Make a plan.                 | ParagraphModel(LEFT, 0, 0, 30)
+// |2. Use a heuristic, for example,| ParagraphModel(LEFT, 0, 0, 30)
+// |   looking for lines where the  |
+// |   first word of the next line  |
+// |   would fit on the previous    |
+// |   line.                        |
+// |8. Try to implement the plan in | ParagraphModel(LEFT, 0, 0, 30)
+// |   Python and try it out.       |
+// |4. Determine how to fix the     | ParagraphModel(LEFT, 0, 0, 30)
+// |   mistakes.                    |
+// |5. Repeat.                      | ParagraphModel(LEFT, 0, 0, 30)
+// |  For extra painful penalty work| ParagraphModel(LEFT, 0, 20, 0)
+// |you can try to identify source  |
+// |code.  Ouch!                    |
+// +--------------------------------+
+class TESS_API ParagraphModel {
+public:
+  ParagraphModel(tesseract::ParagraphJustification justification, int margin, int first_indent,
+                 int body_indent, int tolerance)
+      : justification_(justification)
+      , margin_(margin)
+      , first_indent_(first_indent)
+      , body_indent_(body_indent)
+      , tolerance_(tolerance) {
+    // Make one of {first_indent, body_indent} is 0.
+    int added_margin = first_indent;
+    if (body_indent < added_margin) {
+      added_margin = body_indent;
+    }
+    margin_ += added_margin;
+    first_indent_ -= added_margin;
+    body_indent_ -= added_margin;
+  }
+
+  ParagraphModel()
+      : justification_(tesseract::JUSTIFICATION_UNKNOWN)
+      , margin_(0)
+      , first_indent_(0)
+      , body_indent_(0)
+      , tolerance_(0) {}
+
+  // ValidFirstLine() and ValidBodyLine() take arguments describing a text line
+  // in a block of text which we are trying to model:
+  //   lmargin, lindent:  these add up to the distance from the leftmost ink
+  //                      in the text line to the surrounding text block's left
+  //                      edge.
+  //   rmargin, rindent:  these add up to the distance from the rightmost ink
+  //                      in the text line to the surrounding text block's right
+  //                      edge.
+  // The caller determines the division between "margin" and "indent", which
+  // only actually affect whether we think the line may be centered.
+  //
+  // If the amount of whitespace matches the amount of whitespace expected on
+  // the relevant side of the line (within tolerance_) we say it matches.
+
+  // Return whether a given text line could be a first paragraph line according
+  // to this paragraph model.
+  bool ValidFirstLine(int lmargin, int lindent, int rindent, int rmargin) const;
+
+  // Return whether a given text line could be a first paragraph line according
+  // to this paragraph model.
+  bool ValidBodyLine(int lmargin, int lindent, int rindent, int rmargin) const;
+
+  tesseract::ParagraphJustification justification() const {
+    return justification_;
+  }
+  int margin() const {
+    return margin_;
+  }
+  int first_indent() const {
+    return first_indent_;
+  }
+  int body_indent() const {
+    return body_indent_;
+  }
+  int tolerance() const {
+    return tolerance_;
+  }
+  bool is_flush() const {
+    return (justification_ == tesseract::JUSTIFICATION_LEFT ||
+            justification_ == tesseract::JUSTIFICATION_RIGHT) &&
+           abs(first_indent_ - body_indent_) <= tolerance_;
+  }
+
+  // Return whether this model is likely to agree with the other model on most
+  // paragraphs they are marked.
+  bool Comparable(const ParagraphModel &other) const;
+
+  std::string ToString() const;
+
+private:
+  tesseract::ParagraphJustification justification_;
+  int margin_;
+  int first_indent_;
+  int body_indent_;
+  int tolerance_;
+};
+
+} // namespace tesseract
+
+#endif // TESSERACT_CCSTRUCT_OCRPARA_H_