Mercurial > hgrepos > Python2 > PyMuPDF
view mupdf-source/thirdparty/tesseract/src/textord/underlin.cpp @ 21:2f43e400f144
Provide an "all" target to build both the sdist and the wheel
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Fri, 19 Sep 2025 10:28:53 +0200 |
| parents | b50eed0cc0ef |
| children |
line wrap: on
line source
/********************************************************************** * File: underlin.cpp (Formerly undrline.c) * Description: Code to chop blobs apart from underlines. * Author: Ray Smith * * (C) Copyright 1994, Hewlett-Packard Ltd. ** Licensed under the Apache License, Version 2.0 (the "License"); ** you may not use this file except in compliance with the License. ** You may obtain a copy of the License at ** http://www.apache.org/licenses/LICENSE-2.0 ** Unless required by applicable law or agreed to in writing, software ** distributed under the License is distributed on an "AS IS" BASIS, ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ** See the License for the specific language governing permissions and ** limitations under the License. * **********************************************************************/ #include "underlin.h" namespace tesseract { double_VAR(textord_underline_offset, 0.1, "Fraction of x to ignore"); BOOL_VAR(textord_restore_underlines, true, "Chop underlines & put back"); /********************************************************************** * restore_underlined_blobs * * Find underlined blobs and put them back in the row. **********************************************************************/ void restore_underlined_blobs( // get chop points TO_BLOCK *block // block to do ) { int16_t chop_coord; // chop boundary TBOX blob_box; // of underline BLOBNBOX *u_line; // underline bit TO_ROW *row; // best row for blob ICOORDELT_LIST chop_cells; // blobs to cut out // real underlines BLOBNBOX_LIST residual_underlines; C_OUTLINE_LIST left_coutlines; C_OUTLINE_LIST right_coutlines; ICOORDELT_IT cell_it = &chop_cells; // under lines BLOBNBOX_IT under_it = &block->underlines; BLOBNBOX_IT ru_it = &residual_underlines; if (block->get_rows()->empty()) { return; // Don't crash if there are no rows. } for (under_it.mark_cycle_pt(); !under_it.cycled_list(); under_it.forward()) { u_line = under_it.extract(); blob_box = u_line->bounding_box(); row = most_overlapping_row(block->get_rows(), u_line); if (row == nullptr) { return; // Don't crash if there is no row. } find_underlined_blobs(u_line, &row->baseline, row->xheight, row->xheight * textord_underline_offset, &chop_cells); cell_it.set_to_list(&chop_cells); for (cell_it.mark_cycle_pt(); !cell_it.cycled_list(); cell_it.forward()) { chop_coord = cell_it.data()->x(); if (cell_it.data()->y() - chop_coord > textord_fp_chop_error + 1) { split_to_blob(u_line, chop_coord, textord_fp_chop_error + 0.5, &left_coutlines, &right_coutlines); if (!left_coutlines.empty()) { ru_it.add_after_then_move(new BLOBNBOX(new C_BLOB(&left_coutlines))); } chop_coord = cell_it.data()->y(); split_to_blob(nullptr, chop_coord, textord_fp_chop_error + 0.5, &left_coutlines, &right_coutlines); if (!left_coutlines.empty()) { row->insert_blob(new BLOBNBOX(new C_BLOB(&left_coutlines))); } u_line = nullptr; // no more blobs to add } delete cell_it.extract(); } if (!right_coutlines.empty()) { split_to_blob(nullptr, blob_box.right(), textord_fp_chop_error + 0.5, &left_coutlines, &right_coutlines); if (!left_coutlines.empty()) { ru_it.add_after_then_move(new BLOBNBOX(new C_BLOB(&left_coutlines))); } } delete u_line; } if (!ru_it.empty()) { ru_it.move_to_first(); for (ru_it.mark_cycle_pt(); !ru_it.cycled_list(); ru_it.forward()) { under_it.add_after_then_move(ru_it.extract()); } } } /********************************************************************** * most_overlapping_row * * Return the row which most overlaps the blob. **********************************************************************/ TO_ROW *most_overlapping_row( // find best row TO_ROW_LIST *rows, // list of rows BLOBNBOX *blob // blob to place ) { int16_t x = (blob->bounding_box().left() + blob->bounding_box().right()) / 2; TO_ROW_IT row_it = rows; // row iterator TO_ROW *row; // current row TO_ROW *best_row; // output row float overlap; // of blob & row float bestover; // best overlap best_row = nullptr; bestover = static_cast<float>(-INT32_MAX); if (row_it.empty()) { return nullptr; } row = row_it.data(); row_it.mark_cycle_pt(); while (row->baseline.y(x) + row->descdrop > blob->bounding_box().top() && !row_it.cycled_list()) { best_row = row; bestover = blob->bounding_box().top() - row->baseline.y(x) + row->descdrop; row_it.forward(); row = row_it.data(); } while (row->baseline.y(x) + row->xheight + row->ascrise >= blob->bounding_box().bottom() && !row_it.cycled_list()) { overlap = row->baseline.y(x) + row->xheight + row->ascrise; if (blob->bounding_box().top() < overlap) { overlap = blob->bounding_box().top(); } if (blob->bounding_box().bottom() > row->baseline.y(x) + row->descdrop) { overlap -= blob->bounding_box().bottom(); } else { overlap -= row->baseline.y(x) + row->descdrop; } if (overlap > bestover) { bestover = overlap; best_row = row; } row_it.forward(); row = row_it.data(); } if (bestover < 0 && row->baseline.y(x) + row->xheight + row->ascrise - blob->bounding_box().bottom() > bestover) { best_row = row; } return best_row; } /********************************************************************** * find_underlined_blobs * * Find the start and end coords of blobs in the underline. **********************************************************************/ void find_underlined_blobs( // get chop points BLOBNBOX *u_line, // underlined unit QSPLINE *baseline, // actual baseline float xheight, // height of line float baseline_offset, // amount to shrinke it ICOORDELT_LIST *chop_cells // places to chop ) { ICOORD blob_chop; // sides of blob TBOX blob_box = u_line->bounding_box(); // cell iterator ICOORDELT_IT cell_it = chop_cells; STATS upper_proj(blob_box.left(), blob_box.right()); STATS middle_proj(blob_box.left(), blob_box.right()); STATS lower_proj(blob_box.left(), blob_box.right()); C_OUTLINE_IT out_it; // outlines of blob ASSERT_HOST(u_line->cblob() != nullptr); out_it.set_to_list(u_line->cblob()->out_list()); for (out_it.mark_cycle_pt(); !out_it.cycled_list(); out_it.forward()) { vertical_cunderline_projection(out_it.data(), baseline, xheight, baseline_offset, &lower_proj, &middle_proj, &upper_proj); } for (auto x = blob_box.left(); x < blob_box.right(); x++) { if (middle_proj.pile_count(x) > 0) { auto y = x + 1; for (; y < blob_box.right() && middle_proj.pile_count(y) > 0; y++) { ; } blob_chop = ICOORD(x, y); cell_it.add_after_then_move(new ICOORDELT(blob_chop)); x = y; } } } /********************************************************************** * vertical_cunderline_projection * * Compute the vertical projection of an outline from its outlines * and add to the given STATS. **********************************************************************/ void vertical_cunderline_projection( // project outlines C_OUTLINE *outline, // outline to project QSPLINE *baseline, // actual baseline float xheight, // height of line float baseline_offset, // amount to shrinke it STATS *lower_proj, // below baseline STATS *middle_proj, // centre region STATS *upper_proj // top region ) { ICOORD pos; // current point ICOORD step; // edge step int16_t lower_y, upper_y; // region limits C_OUTLINE_IT out_it = outline->child(); pos = outline->start_pos(); int16_t length = outline->pathlength(); for (int16_t stepindex = 0; stepindex < length; stepindex++) { step = outline->step(stepindex); if (step.x() > 0) { lower_y = static_cast<int16_t>(floor(baseline->y(pos.x()) + baseline_offset + 0.5)); upper_y = static_cast<int16_t>(floor(baseline->y(pos.x()) + baseline_offset + xheight + 0.5)); if (pos.y() >= lower_y) { lower_proj->add(pos.x(), -lower_y); if (pos.y() >= upper_y) { middle_proj->add(pos.x(), lower_y - upper_y); upper_proj->add(pos.x(), upper_y - pos.y()); } else { middle_proj->add(pos.x(), lower_y - pos.y()); } } else { lower_proj->add(pos.x(), -pos.y()); } } else if (step.x() < 0) { lower_y = static_cast<int16_t>(floor(baseline->y(pos.x() - 1) + baseline_offset + 0.5)); upper_y = static_cast<int16_t>(floor(baseline->y(pos.x() - 1) + baseline_offset + xheight + 0.5)); if (pos.y() >= lower_y) { lower_proj->add(pos.x() - 1, lower_y); if (pos.y() >= upper_y) { middle_proj->add(pos.x() - 1, upper_y - lower_y); upper_proj->add(pos.x() - 1, pos.y() - upper_y); } else { middle_proj->add(pos.x() - 1, pos.y() - lower_y); } } else { lower_proj->add(pos.x() - 1, pos.y()); } } pos += step; } for (out_it.mark_cycle_pt(); !out_it.cycled_list(); out_it.forward()) { vertical_cunderline_projection(out_it.data(), baseline, xheight, baseline_offset, lower_proj, middle_proj, upper_proj); } } } // namespace tesseract
