diff mupdf-source/thirdparty/tesseract/src/textord/underlin.cpp @ 2:b50eed0cc0ef upstream

ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4. The directory name has changed: no version number in the expanded directory now.
author Franz Glasner <fzglas.hg@dom66.de>
date Mon, 15 Sep 2025 11:43:07 +0200
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/mupdf-source/thirdparty/tesseract/src/textord/underlin.cpp	Mon Sep 15 11:43:07 2025 +0200
@@ -0,0 +1,259 @@
+/**********************************************************************
+ * File:        underlin.cpp  (Formerly undrline.c)
+ * Description: Code to chop blobs apart from underlines.
+ * Author:      Ray Smith
+ *
+ * (C) Copyright 1994, Hewlett-Packard Ltd.
+ ** Licensed under the Apache License, Version 2.0 (the "License");
+ ** you may not use this file except in compliance with the License.
+ ** You may obtain a copy of the License at
+ ** http://www.apache.org/licenses/LICENSE-2.0
+ ** Unless required by applicable law or agreed to in writing, software
+ ** distributed under the License is distributed on an "AS IS" BASIS,
+ ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ ** See the License for the specific language governing permissions and
+ ** limitations under the License.
+ *
+ **********************************************************************/
+
+#include "underlin.h"
+
+namespace tesseract {
+
+double_VAR(textord_underline_offset, 0.1, "Fraction of x to ignore"); // scaled by row->xheight in restore_underlined_blobs to form the baseline offset
+BOOL_VAR(textord_restore_underlines, true, "Chop underlines & put back"); // NOTE(review): not read in this file; presumably tested by the caller - confirm
+
+/**********************************************************************
+ * restore_underlined_blobs
+ *
+ * Find underlined blobs and put them back in the row.
+ *
+ * Every blob in block->underlines is matched to its most overlapping
+ * row and handed to find_underlined_blobs, which reports x ranges
+ * ("cells") as ICOORDs with x() = start and y() = end. For each cell
+ * wider than textord_fp_chop_error + 1 the blob is split at both ends
+ * of the cell: the piece produced by the first split is kept as a
+ * residual underline, the piece produced by the second split is
+ * inserted into the row. Whatever remains to the right of the last
+ * cell also becomes a residual underline. All residual pieces are
+ * finally appended to block->underlines.
+ *
+ * Does nothing if the block has no rows. If no row can be found for a
+ * blob, processing stops; that blob stays in block->underlines and the
+ * residual pieces collected so far are still put back.
+ **********************************************************************/
+
+void restore_underlined_blobs( // get chop points
+    TO_BLOCK *block            // block to do
+) {
+  if (block->get_rows()->empty()) {
+    return; // Don't crash if there are no rows.
+  }
+
+  ICOORDELT_LIST chop_cells;         // blobs to cut out
+  BLOBNBOX_LIST residual_underlines; // real underlines
+  // Outline lists shared by consecutive split_to_blob calls.
+  // NOTE(review): split_to_blob presumably moves everything left of the
+  // chop coordinate into left_coutlines and keeps the rest in
+  // right_coutlines - confirm against its definition.
+  C_OUTLINE_LIST left_coutlines;
+  C_OUTLINE_LIST right_coutlines;
+  // under lines
+  BLOBNBOX_IT under_it = &block->underlines;
+  BLOBNBOX_IT ru_it = &residual_underlines;
+
+  for (under_it.mark_cycle_pt(); !under_it.cycled_list(); under_it.forward()) {
+    // Look the row up BEFORE detaching the blob from the list, so that
+    // bailing out neither leaks the blob nor drops it from the block.
+    TO_ROW *row = most_overlapping_row(block->get_rows(), under_it.data());
+    if (row == nullptr) {
+      break; // Don't crash if there is no row; still restore residuals.
+    }
+    BLOBNBOX *u_line = under_it.extract(); // underline bit, owned here now
+    const TBOX blob_box = u_line->bounding_box();
+    find_underlined_blobs(u_line, &row->baseline, row->xheight,
+                          row->xheight * textord_underline_offset, &chop_cells);
+    ICOORDELT_IT cell_it(&chop_cells);
+    for (cell_it.mark_cycle_pt(); !cell_it.cycled_list(); cell_it.forward()) {
+      int16_t chop_coord = cell_it.data()->x(); // chop boundary
+      if (cell_it.data()->y() - chop_coord > textord_fp_chop_error + 1) {
+        // First split at the start of the cell: the left piece is underline.
+        split_to_blob(u_line, chop_coord, textord_fp_chop_error + 0.5, &left_coutlines,
+                      &right_coutlines);
+        if (!left_coutlines.empty()) {
+          ru_it.add_after_then_move(new BLOBNBOX(new C_BLOB(&left_coutlines)));
+        }
+        // Second split at the end of the cell: the left piece goes to the row.
+        chop_coord = cell_it.data()->y();
+        split_to_blob(nullptr, chop_coord, textord_fp_chop_error + 0.5, &left_coutlines,
+                      &right_coutlines);
+        if (!left_coutlines.empty()) {
+          row->insert_blob(new BLOBNBOX(new C_BLOB(&left_coutlines)));
+        }
+        u_line = nullptr; // no more blobs to add
+      }
+      delete cell_it.extract();
+    }
+    if (!right_coutlines.empty()) {
+      // Flush what is left to the right of the last cell as underline.
+      split_to_blob(nullptr, blob_box.right(), textord_fp_chop_error + 0.5, &left_coutlines,
+                    &right_coutlines);
+      if (!left_coutlines.empty()) {
+        ru_it.add_after_then_move(new BLOBNBOX(new C_BLOB(&left_coutlines)));
+      }
+    }
+    delete u_line; // nullptr once the blob has been handed to split_to_blob
+  }
+  if (!ru_it.empty()) {
+    ru_it.move_to_first();
+    for (ru_it.mark_cycle_pt(); !ru_it.cycled_list(); ru_it.forward()) {
+      under_it.add_after_then_move(ru_it.extract());
+    }
+  }
+}
+
+/**********************************************************************
+ * most_overlapping_row
+ *
+ * Choose the row that best overlaps the blob vertically. Row limits
+ * are evaluated at the horizontal centre of the blob's bounding box:
+ * the lower limit is baseline + descdrop, the upper limit is
+ * baseline + xheight + ascrise. Returns nullptr when the row list is
+ * empty.
+ **********************************************************************/
+
+TO_ROW *most_overlapping_row( // find best row
+    TO_ROW_LIST *rows,        // list of rows
+    BLOBNBOX *blob            // blob to place
+) {
+  const TBOX &box = blob->bounding_box();
+  const int16_t mid_x = (box.left() + box.right()) / 2;
+  const auto blob_top = box.top();
+  const auto blob_bottom = box.bottom();
+
+  // Vertical limits of a row at the blob's centre column.
+  const auto row_floor = [mid_x](TO_ROW *r) { return r->baseline.y(mid_x) + r->descdrop; };
+  const auto row_ceiling = [mid_x](TO_ROW *r) {
+    return r->baseline.y(mid_x) + r->xheight + r->ascrise;
+  };
+
+  TO_ROW_IT row_it = rows; // row iterator
+  if (row_it.empty()) {
+    return nullptr;
+  }
+
+  TO_ROW *winner = nullptr;                               // output row
+  float winning_overlap = static_cast<float>(-INT32_MAX); // best overlap
+  TO_ROW *candidate = row_it.data();                      // current row
+  row_it.mark_cycle_pt();
+
+  // Step past rows whose lower limit lies above the top of the blob,
+  // remembering the last one as a fallback.
+  // NOTE(review): the loop condition groups (baseline + descdrop), yet
+  // the stored value adds descdrop instead of subtracting it; kept
+  // as-is - confirm whether that is intended.
+  while (row_floor(candidate) > blob_top && !row_it.cycled_list()) {
+    winner = candidate;
+    winning_overlap = blob_top - candidate->baseline.y(mid_x) + candidate->descdrop;
+    row_it.forward();
+    candidate = row_it.data();
+  }
+
+  // Score rows whose upper limit reaches the bottom of the blob:
+  // overlap = min(blob top, row upper) - max(blob bottom, row lower).
+  while (row_ceiling(candidate) >= blob_bottom && !row_it.cycled_list()) {
+    float overlap = row_ceiling(candidate); // of blob & row
+    if (blob_top < overlap) {
+      overlap = blob_top;
+    }
+    const auto lower_limit = row_floor(candidate);
+    if (blob_bottom > lower_limit) {
+      overlap -= blob_bottom;
+    } else {
+      overlap -= lower_limit;
+    }
+    if (overlap > winning_overlap) {
+      winner = candidate;
+      winning_overlap = overlap;
+    }
+    row_it.forward();
+    candidate = row_it.data();
+  }
+
+  // Nothing truly overlapped: prefer the row the scan stopped on if its
+  // (negative) gap to the blob is smaller than the best seen so far.
+  if (winning_overlap < 0 && row_ceiling(candidate) - blob_bottom > winning_overlap) {
+    winner = candidate;
+  }
+  return winner;
+}
+
+/**********************************************************************
+ * find_underlined_blobs
+ *
+ * Find the start and end coords of blobs in the underline.
+ *
+ * Projects every outline of the underline's C_BLOB onto the x axis and
+ * appends to chop_cells one ICOORD per maximal run of columns whose
+ * middle projection is positive: x() is the first column of the run,
+ * y() is the column just past its end.
+ **********************************************************************/
+
+void find_underlined_blobs(    // get chop points
+    BLOBNBOX *u_line,          // underlined unit
+    QSPLINE *baseline,         // actual baseline
+    float xheight,             // height of line
+    float baseline_offset,     // amount to shrink it
+    ICOORDELT_LIST *chop_cells // places to chop
+) {
+  TBOX bounds = u_line->bounding_box();
+  ICOORDELT_IT cell_it = chop_cells; // cell iterator
+  // Only the middle projection is inspected below; the other two are
+  // still required by vertical_cunderline_projection.
+  STATS upper_proj(bounds.left(), bounds.right());
+  STATS middle_proj(bounds.left(), bounds.right());
+  STATS lower_proj(bounds.left(), bounds.right());
+
+  ASSERT_HOST(u_line->cblob() != nullptr);
+
+  // outlines of blob
+  C_OUTLINE_IT out_it = u_line->cblob()->out_list();
+  for (out_it.mark_cycle_pt(); !out_it.cycled_list(); out_it.forward()) {
+    vertical_cunderline_projection(out_it.data(), baseline, xheight, baseline_offset, &lower_proj,
+                                   &middle_proj, &upper_proj);
+  }
+
+  auto x = bounds.left();
+  while (x < bounds.right()) {
+    if (middle_proj.pile_count(x) > 0) {
+      // Extend the run to the right while the columns stay occupied.
+      auto run_end = x + 1;
+      while (run_end < bounds.right() && middle_proj.pile_count(run_end) > 0) {
+        run_end++;
+      }
+      cell_it.add_after_then_move(new ICOORDELT(ICOORD(x, run_end)));
+      x = run_end; // run_end itself is empty (or the right edge): skipped below
+    }
+    x++;
+  }
+}
+
+/**********************************************************************
+ * vertical_cunderline_projection
+ *
+ * Compute the vertical projection of an outline from its outlines
+ * and add to the given STATS.
+ *
+ * The outline is walked step by step. Each horizontal step touches one
+ * column (pos.x() for a rightward step, pos.x() - 1 for a leftward
+ * one). For that column two limits are derived from the baseline:
+ *   lower_y = round(baseline(column) + baseline_offset)
+ *   upper_y = round(baseline(column) + baseline_offset + xheight)
+ * and the y of the edge is split between the three projections:
+ * the part up to lower_y goes to lower_proj, the part between lower_y
+ * and upper_y to middle_proj, the part above upper_y to upper_proj.
+ * Rightward steps add these amounts negated, leftward steps add them
+ * as they are. Vertical steps contribute nothing. Child outlines are
+ * processed recursively into the same STATS.
+ **********************************************************************/
+
+void vertical_cunderline_projection( // project outlines
+    C_OUTLINE *outline,              // outline to project
+    QSPLINE *baseline,               // actual baseline
+    float xheight,                   // height of line
+    float baseline_offset,           // amount to shrink it
+    STATS *lower_proj,               // below baseline
+    STATS *middle_proj,              // centre region
+    STATS *upper_proj                // top region
+) {
+  C_OUTLINE_IT out_it = outline->child();
+
+  ICOORD pos = outline->start_pos(); // current point
+  // Keep the step count at full width: a 16-bit counter would truncate
+  // the length of outlines with more than 32767 steps.
+  const int32_t length = outline->pathlength();
+  for (int32_t stepindex = 0; stepindex < length; stepindex++) {
+    const ICOORD step = outline->step(stepindex); // edge step
+    if (step.x() != 0) {
+      const bool rightward = step.x() > 0;
+      const int column = rightward ? pos.x() : pos.x() - 1;
+      const int sign = rightward ? -1 : 1;
+      // region limits
+      const int16_t lower_y =
+          static_cast<int16_t>(floor(baseline->y(column) + baseline_offset + 0.5));
+      const int16_t upper_y =
+          static_cast<int16_t>(floor(baseline->y(column) + baseline_offset + xheight + 0.5));
+      if (pos.y() >= lower_y) {
+        lower_proj->add(column, sign * lower_y);
+        if (pos.y() >= upper_y) {
+          middle_proj->add(column, sign * (upper_y - lower_y));
+          upper_proj->add(column, sign * (pos.y() - upper_y));
+        } else {
+          middle_proj->add(column, sign * (pos.y() - lower_y));
+        }
+      } else {
+        lower_proj->add(column, sign * pos.y());
+      }
+    }
+    pos += step;
+  }
+
+  for (out_it.mark_cycle_pt(); !out_it.cycled_list(); out_it.forward()) {
+    vertical_cunderline_projection(out_it.data(), baseline, xheight, baseline_offset, lower_proj,
+                                   middle_proj, upper_proj);
+  }
+}
+
+} // namespace tesseract