diff mupdf-source/thirdparty/tesseract/src/ccstruct/seam.cpp @ 2:b50eed0cc0ef upstream

ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4. The directory name has changed: no version number in the expanded directory now.
author Franz Glasner <fzglas.hg@dom66.de>
date Mon, 15 Sep 2025 11:43:07 +0200
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/mupdf-source/thirdparty/tesseract/src/ccstruct/seam.cpp	Mon Sep 15 11:43:07 2025 +0200
@@ -0,0 +1,275 @@
+/******************************************************************************
+ *
+ * File:         seam.cpp  (Formerly seam.c)
+ * Author:       Mark Seaman, OCR Technology
+ *
+ * (c) Copyright 1987, Hewlett-Packard Company.
+ ** Licensed under the Apache License, Version 2.0 (the "License");
+ ** you may not use this file except in compliance with the License.
+ ** You may obtain a copy of the License at
+ ** http://www.apache.org/licenses/LICENSE-2.0
+ ** Unless required by applicable law or agreed to in writing, software
+ ** distributed under the License is distributed on an "AS IS" BASIS,
+ ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ ** See the License for the specific language governing permissions and
+ ** limitations under the License.
+ *
+ *****************************************************************************/
+/*----------------------------------------------------------------------
+              I n c l u d e s
+----------------------------------------------------------------------*/
+#include "seam.h"
+
+#include "blobs.h"
+#include "tprintf.h"
+
+namespace tesseract {
+
+/*----------------------------------------------------------------------
+        Public Function Code
+----------------------------------------------------------------------*/
+
+// Returns a box seeded with the single point location_ and extended by each split's box.
+TBOX SEAM::bounding_box() const {
+  TBOX enclosing(location_.x, location_.y, location_.x, location_.y);
+  for (int split_no = 0; split_no < num_splits_; ++split_no) {
+    enclosing += splits_[split_no].bounding_box();  // Fold this split's box into the result.
+  }
+  return enclosing;
+}
+
+// Returns true if the splits appear OK, i.e. they cross no outlines and chop off no tiny pieces.
+// A seam with no splits is healthy. TODO(rays) Test all the splits, not only the first one.
+bool SEAM::IsHealthy(const TBLOB &blob, int min_points, int min_area) const {
+  if (num_splits_ == 0) {
+    return true;
+  }
+  return splits_[0].IsHealthy(blob, min_points, min_area);
+}
+
+// Recomputes the widthp_/widthn_ range of every existing SEAM and of *this, which is about
+// to be inserted at insert_index. Stops and returns false at the first computation that
+// fails, as this indicates an invalid chop; returns true if all of them succeed.
+// widthn_/widthp_ are only changed if modify is true.
+bool SEAM::PrepareToInsertSeam(const std::vector<SEAM *> &seams,
+                               const std::vector<TBLOB *> &blobs, int insert_index, bool modify) {
+  int before = 0;
+  while (before < insert_index) {  // Seams ahead of the insertion point keep their index.
+    if (!seams[before]->FindBlobWidth(blobs, before, modify)) {
+      return false;
+    }
+    ++before;
+  }
+  // *this is checked at the index it is about to occupy.
+  bool all_ok = FindBlobWidth(blobs, insert_index, modify);
+  // Seams at or after the insertion point are checked one index further along.
+  for (unsigned after = insert_index; all_ok && after < seams.size(); ++after) {
+    all_ok = seams[after]->FindBlobWidth(blobs, after + 1, modify);
+  }
+  return all_ok;
+}
+
+// Computes the widthp_/widthn_ range: each split is looked for in blobs[index], then to the
+// right, then to the left; widthp_/widthn_ become the largest distance at which one was found.
+// Returns false if not all the splits are accounted for. Widths only change if modify is true.
+bool SEAM::FindBlobWidth(const std::vector<TBLOB *> &blobs, int index, bool modify) {
+  if (modify) {
+    widthp_ = 0;
+    widthn_ = 0;
+  }
+  int splits_located = 0;
+  for (int s = 0; s < num_splits_; ++s) {
+    const SPLIT &current = splits_[s];
+    bool located = current.ContainedByBlob(*blobs[index]);
+    // Look right; the search ends at the first blob that contains the split.
+    for (unsigned right = index + 1; !located && right < blobs.size(); ++right) {
+      located = current.ContainedByBlob(*blobs[right]);
+      if (located && modify && right - index > widthp_) {
+        widthp_ = right - index;
+      }
+    }
+    // Look left, only if the split is still unaccounted for.
+    for (int left = index - 1; !located && left >= 0; --left) {
+      located = current.ContainedByBlob(*blobs[left]);
+      if (located && modify && index - left > widthn_) {
+        widthn_ = index - left;
+      }
+    }
+    // A split that was found nowhere makes the final count fall short.
+    splits_located += located ? 1 : 0;
+  }
+  return splits_located == num_splits_;
+}
+
+// Splits blob into two blobs by applying the splits included in *this SEAM to its outlines
+// and then calling divide_blobs with location_; other_blob receives the second piece.
+void SEAM::ApplySeam(bool italic_blob, TBLOB *blob, TBLOB *other_blob) const {
+  int applied = 0;
+  while (applied < num_splits_) {
+    splits_[applied].SplitOutlineList(blob->outlines);
+    ++applied;
+  }
+  blob->ComputeBoundingBoxes();  // Done after the splits and before the outlines are divided.
+  divide_blobs(blob, other_blob, italic_blob, location_);
+  // Both halves are de-duplicated before CorrectBlobOrder is applied to the pair.
+  blob->EliminateDuplicateOutlines();
+  other_blob->EliminateDuplicateOutlines();
+  blob->CorrectBlobOrder(other_blob);
+}
+
+// Undoes ApplySeam by removing the seam between these two blobs: other_blob's outlines are
+// joined onto blob's, every split is unsplit on blob, and other_blob is deleted.
+void SEAM::UndoSeam(TBLOB *blob, TBLOB *other_blob) const {
+  if (blob->outlines == nullptr) {  // Empty blob: adopt other_blob's whole list instead.
+    blob->outlines = other_blob->outlines;
+    other_blob->outlines = nullptr;
+  }
+  // blob->outlines is still null if both blobs were empty; there is no tail to walk to then.
+  if (TESSLINE *tail = blob->outlines) {
+    while (tail->next) {
+      tail = tail->next;
+    }
+    tail->next = other_blob->outlines;
+  }
+  other_blob->outlines = nullptr;  // Detached before the delete below.
+  delete other_blob;
+  for (int s = 0; s < num_splits_; ++s) {
+    splits_[s].UnsplitOutlineList(blob);
+  }
+  blob->ComputeBoundingBoxes();
+  blob->EliminateDuplicateOutlines();
+}
+
+// Prints label, then priority, location and widthp_/widthn_, then every split, then a newline.
+void SEAM::Print(const char *label) const {
+  tprintf("%s", label);
+  tprintf(" %6.2f @ (%d,%d), p=%u, n=%u ", priority_, location_.x, location_.y, widthp_, widthn_);
+  for (int split_no = 0; split_no < num_splits_; ++split_no) {
+    if (split_no > 0) {
+      tprintf(",   ");  // Separator goes between splits only, never after the last one.
+    }
+    splits_[split_no].Print();
+  }
+  tprintf("\n");
+}
+
+// Prints label, then every SEAM prefixed by its index. Prints nothing if seams is empty.
+/* static */ void SEAM::PrintSeams(const char *label, const std::vector<SEAM *> &seams) {
+  if (seams.empty()) {
+    return;
+  }
+  tprintf("%s\n", label);
+  for (unsigned index = 0; index < seams.size(); ++index) {
+    tprintf("%2u:   ", index);
+    seams[index]->Print("");
+  }
+  tprintf("\n");
+}
+
+#ifndef GRAPHICS_DISABLED
+// Draws the seam by having each split, in order, mark itself in the given window.
+void SEAM::Mark(ScrollView *window) const {
+  for (int split_no = 0; split_no < num_splits_; ++split_no) {
+    splits_[split_no].Mark(window);
+  }
+}
+#endif  // !GRAPHICS_DISABLED
+
+// Breaks up the blobs first..last of this chain so that they are all independent again.
+// This operation should undo the effect of JoinPieces: seams first..last-1 are revealed and
+// the outline list is cut wherever it runs on into the next blob's outlines.
+/* static */
+void SEAM::BreakPieces(const std::vector<SEAM *> &seams, const std::vector<TBLOB *> &blobs,
+                       int first, int last) {
+  for (int seam_no = first; seam_no < last; ++seam_no) {
+    seams[seam_no]->Reveal();
+  }
+  // Walk the joined list, detaching it at every point where the next blob's list starts.
+  int upcoming = first + 1;
+  for (TESSLINE *cursor = blobs[first]->outlines; cursor != nullptr && upcoming <= last;) {
+    TESSLINE *const upcoming_head = blobs[upcoming]->outlines;
+    if (cursor->next != upcoming_head) {
+      cursor = cursor->next;  // Not yet at the boundary with the upcoming blob.
+      continue;
+    }
+    cursor->next = nullptr;  // Cut the link into the upcoming blob's outlines.
+    cursor = upcoming_head;
+    ++upcoming;
+  }
+}
+
+// Joins the base level pieces blobs[first..last] into a single classifiable blob by chaining
+// each blob's outlines after the previous ones. Seams wholly inside the group are hidden.
+/* static */
+void SEAM::JoinPieces(const std::vector<SEAM *> &seams, const std::vector<TBLOB *> &blobs,
+                      int first, int last) {
+  TESSLINE *tail = blobs[first]->outlines;
+  if (tail == nullptr) {
+    return;  // No outlines to start the chain from; nothing is hidden or linked.
+  }
+  for (int x = first; x < last; ++x) {
+    SEAM *const seam = seams[x];
+    const bool within_group = x - seam->widthn_ >= first && x + seam->widthp_ < last;
+    if (within_group) {
+      seam->Hide();
+    }
+    while (tail->next != nullptr) {
+      tail = tail->next;
+    }
+    tail->next = blobs[x + 1]->outlines;  // Chain the next blob's list onto the end.
+  }
+}
+
+// Hides the seam so the outlines appear not to be cut by it, by hiding each split in turn.
+void SEAM::Hide() const {
+  for (int split_no = 0; split_no < num_splits_; ++split_no) {
+    splits_[split_no].Hide();
+  }
+}
+
+// Undoes Hide, so the outlines are cut by the seam again, by revealing each split in turn.
+void SEAM::Reveal() const {
+  for (int split_no = 0; split_no < num_splits_; ++split_no) {
+    splits_[split_no].Reveal();
+  }
+}
+
+// Computes and returns, but does not set, the full priority of *this SEAM.
+float SEAM::FullPriority(int xmin, int xmax, double overlap_knob, int centered_maxwidth,
+                         double center_knob, double width_change_knob) const {
+  if (num_splits_ == 0) {
+    return 0.0f;  // Without splits priority_ is not consulted at all.
+  }
+  for (int s = 1; s < num_splits_; ++s) {  // Apply all but the first split for the evaluation.
+    splits_[s].SplitOutline();
+  }
+  const float total = priority_ + splits_[0].FullPriority(xmin, xmax, overlap_knob,
+                          centered_maxwidth, center_knob, width_change_knob);
+  for (int s = num_splits_; --s >= 1;) {  // Undo them again, last applied first.
+    splits_[s].UnsplitOutlines();
+  }
+  return total;
+}
+
+/**
+ * @name start_seam_list
+ *
+ * Clears seam_array and fills it with one new SEAM for each pair of adjacent blobs in word,
+ * so the list matches the starting segmentation. Each seam has location information only:
+ * x is midway between the two facing box edges, y the mean of both boxes' bottoms and tops.
+ */
+void start_seam_list(TWERD *word, std::vector<SEAM *> *seam_array) {
+  seam_array->clear();
+  TPOINT midpoint;
+  for (unsigned right = 1; right < word->NumBlobs(); ++right) {
+    const unsigned left = right - 1;
+    TBOX left_box = word->blobs[left]->bounding_box();
+    TBOX right_box = word->blobs[right]->bounding_box();
+    midpoint.x = (left_box.right() + right_box.left()) / 2;
+    midpoint.y = (left_box.bottom() + left_box.top() + right_box.bottom() + right_box.top()) / 4;
+    seam_array->push_back(new SEAM(0.0f, midpoint));
+  }
+}
+
+} // namespace tesseract