Mercurial > hgrepos > Python2 > PyMuPDF
diff mupdf-source/thirdparty/tesseract/src/ccstruct/ocrblock.cpp @ 2:b50eed0cc0ef upstream
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
The directory name has changed: no version number in the expanded directory now.
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Mon, 15 Sep 2025 11:43:07 +0200 |
| parents | |
| children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/mupdf-source/thirdparty/tesseract/src/ccstruct/ocrblock.cpp Mon Sep 15 11:43:07 2025 +0200 @@ -0,0 +1,515 @@ +/********************************************************************** + * File: ocrblock.cpp (Formerly block.c) + * Description: BLOCK member functions and iterator functions. + * Author: Ray Smith + * + * (C) Copyright 1991, Hewlett-Packard Ltd. + ** Licensed under the Apache License, Version 2.0 (the "License"); + ** you may not use this file except in compliance with the License. + ** You may obtain a copy of the License at + ** http://www.apache.org/licenses/LICENSE-2.0 + ** Unless required by applicable law or agreed to in writing, software + ** distributed under the License is distributed on an "AS IS" BASIS, + ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + ** See the License for the specific language governing permissions and + ** limitations under the License. + * + **********************************************************************/ + +#include "ocrblock.h" + +#include "stepblob.h" +#include "tprintf.h" + +#include <cstdlib> +#include <memory> // std::unique_ptr + +namespace tesseract { + +/** + * BLOCK::BLOCK + * + * Constructor for a simple rectangular block. + */ +BLOCK::BLOCK(const char *name, ///< filename + bool prop, ///< proportional + int16_t kern, ///< kerning + int16_t space, ///< spacing + TDimension xmin, ///< bottom left + TDimension ymin, + TDimension xmax, ///< top right + TDimension ymax) + : pdblk(xmin, ymin, xmax, ymax) + , filename(name) + , re_rotation_(1.0f, 0.0f) + , classify_rotation_(1.0f, 0.0f) + , skew_(1.0f, 0.0f) { + ICOORDELT_IT left_it = &pdblk.leftside; + ICOORDELT_IT right_it = &pdblk.rightside; + + proportional = prop; + kerning = kern; + spacing = space; + font_class = -1; // not assigned + cell_over_xheight_ = 2.0f; + pdblk.hand_poly = nullptr; + left_it.set_to_list(&pdblk.leftside); + right_it.set_to_list(&pdblk.rightside); + // make default box + left_it.add_to_end(new ICOORDELT(xmin, ymin)); + left_it.add_to_end(new ICOORDELT(xmin, ymax)); + right_it.add_to_end(new ICOORDELT(xmax, ymin)); + right_it.add_to_end(new ICOORDELT(xmax, ymax)); +} + +/** + * decreasing_top_order + * + * Sort Comparator: Return <0 if row1 top < row2 top + */ + +static int decreasing_top_order(const void *row1, const void *row2) { + return (*reinterpret_cast<ROW *const *>(row2))->bounding_box().top() - + (*reinterpret_cast<ROW *const *>(row1))->bounding_box().top(); +} + +/** + * BLOCK::rotate + * + * Rotate the polygon by the given rotation and recompute the bounding_box. + */ +void BLOCK::rotate(const FCOORD &rotation) { + pdblk.poly_block()->rotate(rotation); + pdblk.box = *pdblk.poly_block()->bounding_box(); +} + +// Returns the bounding box including the desired combination of upper and +// lower noise/diacritic elements. +TBOX BLOCK::restricted_bounding_box(bool upper_dots, bool lower_dots) const { + TBOX box; + // This is a read-only iteration of the rows in the block. + ROW_IT it(const_cast<ROW_LIST *>(&rows)); + for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) { + box += it.data()->restricted_bounding_box(upper_dots, lower_dots); + } + return box; +} + +/** + * BLOCK::reflect_polygon_in_y_axis + * + * Reflects the polygon in the y-axis and recompute the bounding_box. + * Does nothing to any contained rows/words/blobs etc. + */ +void BLOCK::reflect_polygon_in_y_axis() { + pdblk.poly_block()->reflect_in_y_axis(); + pdblk.box = *pdblk.poly_block()->bounding_box(); +} + +/** + * BLOCK::sort_rows + * + * Order rows so that they are in order of decreasing Y coordinate + */ + +void BLOCK::sort_rows() { // order on "top" + ROW_IT row_it(&rows); + + row_it.sort(decreasing_top_order); +} + +/** + * BLOCK::compress + * + * Delete space between the rows. (And maybe one day, compress the rows) + * Fill space of block from top down, left aligning rows. + */ + +void BLOCK::compress() { // squash it up +#define ROW_SPACING 5 + + ROW_IT row_it(&rows); + ROW *row; + ICOORD row_spacing(0, ROW_SPACING); + + ICOORDELT_IT icoordelt_it; + + sort_rows(); + + pdblk.box = TBOX(pdblk.box.topleft(), pdblk.box.topleft()); + pdblk.box.move_bottom_edge(ROW_SPACING); + for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) { + row = row_it.data(); + row->move(pdblk.box.botleft() - row_spacing - row->bounding_box().topleft()); + pdblk.box += row->bounding_box(); + } + + pdblk.leftside.clear(); + icoordelt_it.set_to_list(&pdblk.leftside); + icoordelt_it.add_to_end(new ICOORDELT(pdblk.box.left(), pdblk.box.bottom())); + icoordelt_it.add_to_end(new ICOORDELT(pdblk.box.left(), pdblk.box.top())); + pdblk.rightside.clear(); + icoordelt_it.set_to_list(&pdblk.rightside); + icoordelt_it.add_to_end(new ICOORDELT(pdblk.box.right(), pdblk.box.bottom())); + icoordelt_it.add_to_end(new ICOORDELT(pdblk.box.right(), pdblk.box.top())); +} + +/** + * BLOCK::check_pitch + * + * Check whether the block is fixed or prop, set the flag, and set + * the pitch if it is fixed. + */ + +void BLOCK::check_pitch() { // check prop + // tprintf("Missing FFT fixed pitch stuff!\n"); + pitch = -1; +} + +/** + * BLOCK::compress + * + * Compress and move in a single operation. + */ + +void BLOCK::compress( // squash it up + const ICOORD vec // and move +) { + pdblk.box.move(vec); + compress(); +} + +/** + * BLOCK::print + * + * Print the info on a block + */ + +void BLOCK::print( // print list of sides + FILE *, ///< file to print on + bool dump ///< print full detail +) { + ICOORDELT_IT it = &pdblk.leftside; // iterator + + pdblk.box.print(); + tprintf("Proportional= %s\n", proportional ? "TRUE" : "FALSE"); + tprintf("Kerning= %d\n", kerning); + tprintf("Spacing= %d\n", spacing); + tprintf("Fixed_pitch=%d\n", pitch); + tprintf("Filename= %s\n", filename.c_str()); + + if (dump) { + tprintf("Left side coords are:\n"); + for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) { + tprintf("(%d,%d) ", it.data()->x(), it.data()->y()); + } + tprintf("\n"); + tprintf("Right side coords are:\n"); + it.set_to_list(&pdblk.rightside); + for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) { + tprintf("(%d,%d) ", it.data()->x(), it.data()->y()); + } + tprintf("\n"); + } +} + +/** + * BLOCK::operator= + * + * Assignment - duplicate the block structure, but with an EMPTY row list. + */ + +BLOCK &BLOCK::operator=( // assignment + const BLOCK &source // from this +) { + this->ELIST_LINK::operator=(source); + pdblk = source.pdblk; + proportional = source.proportional; + kerning = source.kerning; + spacing = source.spacing; + filename = source.filename; // STRINGs assign ok + if (!rows.empty()) { + rows.clear(); + } + re_rotation_ = source.re_rotation_; + classify_rotation_ = source.classify_rotation_; + skew_ = source.skew_; + return *this; +} + +// This function is for finding the approximate (horizontal) distance from +// the x-coordinate of the left edge of a symbol to the left edge of the +// text block which contains it. We are passed: +// segments - output of PB_LINE_IT::get_line() which contains x-coordinate +// intervals for the scan line going through the symbol's y-coordinate. +// Each element of segments is of the form (x()=start_x, y()=length). +// x - the x coordinate of the symbol we're interested in. +// margin - return value, the distance from x,y to the left margin of the +// block containing it. +// If all segments were to the right of x, we return false and 0. +static bool LeftMargin(ICOORDELT_LIST *segments, int x, int *margin) { + bool found = false; + *margin = 0; + if (segments->empty()) { + return found; + } + ICOORDELT_IT seg_it(segments); + for (seg_it.mark_cycle_pt(); !seg_it.cycled_list(); seg_it.forward()) { + int cur_margin = x - seg_it.data()->x(); + if (cur_margin >= 0) { + if (!found) { + *margin = cur_margin; + } else if (cur_margin < *margin) { + *margin = cur_margin; + } + found = true; + } + } + return found; +} + +// This function is for finding the approximate (horizontal) distance from +// the x-coordinate of the right edge of a symbol to the right edge of the +// text block which contains it. We are passed: +// segments - output of PB_LINE_IT::get_line() which contains x-coordinate +// intervals for the scan line going through the symbol's y-coordinate. +// Each element of segments is of the form (x()=start_x, y()=length). +// x - the x coordinate of the symbol we're interested in. +// margin - return value, the distance from x,y to the right margin of the +// block containing it. +// If all segments were to the left of x, we return false and 0. +static bool RightMargin(ICOORDELT_LIST *segments, int x, int *margin) { + bool found = false; + *margin = 0; + if (segments->empty()) { + return found; + } + ICOORDELT_IT seg_it(segments); + for (seg_it.mark_cycle_pt(); !seg_it.cycled_list(); seg_it.forward()) { + int cur_margin = seg_it.data()->x() + seg_it.data()->y() - x; + if (cur_margin >= 0) { + if (!found) { + *margin = cur_margin; + } else if (cur_margin < *margin) { + *margin = cur_margin; + } + found = true; + } + } + return found; +} + +// Compute the distance from the left and right ends of each row to the +// left and right edges of the block's polyblock. Illustration: +// ____________________________ _______________________ +// | Howdy neighbor! | |rectangular blocks look| +// | This text is written to| |more like stacked pizza| +// |illustrate how useful poly- |boxes. | +// |blobs are in ----------- ------ The polyblob| +// |dealing with| _________ |for a BLOCK rec-| +// |harder layout| /===========\ |ords the possibly| +// |issues. | | _ _ | |skewed pseudo-| +// | You see this| | |_| \|_| | |rectangular | +// |text is flowed| | } | |boundary that| +// |around a mid-| \ ____ | |forms the ideal-| +// |column portrait._____ \ / __|ized text margin| +// | Polyblobs exist| \ / |from which we should| +// |to account for insets| | | |measure paragraph| +// |which make otherwise| ----- |indentation. | +// ----------------------- ---------------------- +// +// If we identify a drop-cap, we measure the left margin for the lines +// below the first line relative to one space past the drop cap. The +// first line's margin and those past the drop cap area are measured +// relative to the enclosing polyblock. +// +// TODO(rays): Before this will work well, we'll need to adjust the +// polyblob tighter around the text near images, as in: +// UNLV_AUTO:mag.3G0 page 2 +// UNLV_AUTO:mag.3G4 page 16 +void BLOCK::compute_row_margins() { + if (row_list()->empty() || row_list()->singleton()) { + return; + } + + // If Layout analysis was not called, default to this. + POLY_BLOCK rect_block(pdblk.bounding_box(), PT_FLOWING_TEXT); + POLY_BLOCK *pblock = &rect_block; + if (pdblk.poly_block() != nullptr) { + pblock = pdblk.poly_block(); + } + + // Step One: Determine if there is a drop-cap. + // TODO(eger): Fix up drop cap code for RTL languages. + ROW_IT r_it(row_list()); + ROW *first_row = r_it.data(); + ROW *second_row = r_it.data_relative(1); + + // initialize the bottom of a fictitious drop cap far above the first line. + int drop_cap_bottom = first_row->bounding_box().top() + first_row->bounding_box().height(); + int drop_cap_right = first_row->bounding_box().left(); + int mid_second_line = second_row->bounding_box().top() - second_row->bounding_box().height() / 2; + WERD_IT werd_it(r_it.data()->word_list()); // words of line one + if (!werd_it.empty()) { + C_BLOB_IT cblob_it(werd_it.data()->cblob_list()); + for (cblob_it.mark_cycle_pt(); !cblob_it.cycled_list(); cblob_it.forward()) { + TBOX bbox = cblob_it.data()->bounding_box(); + if (bbox.bottom() <= mid_second_line) { + // we found a real drop cap + first_row->set_has_drop_cap(true); + if (drop_cap_bottom > bbox.bottom()) { + drop_cap_bottom = bbox.bottom(); + } + if (drop_cap_right < bbox.right()) { + drop_cap_right = bbox.right(); + } + } + } + } + + // Step Two: Calculate the margin from the text of each row to the block + // (or drop-cap) boundaries. + PB_LINE_IT lines(pblock); + r_it.set_to_list(row_list()); + for (r_it.mark_cycle_pt(); !r_it.cycled_list(); r_it.forward()) { + ROW *row = r_it.data(); + TBOX row_box = row->bounding_box(); + int left_y = row->base_line(row_box.left()) + row->x_height(); + int left_margin; + const std::unique_ptr</*non-const*/ ICOORDELT_LIST> segments_left(lines.get_line(left_y)); + LeftMargin(segments_left.get(), row_box.left(), &left_margin); + + if (row_box.top() >= drop_cap_bottom) { + int drop_cap_distance = row_box.left() - row->space() - drop_cap_right; + if (drop_cap_distance < 0) { + drop_cap_distance = 0; + } + if (drop_cap_distance < left_margin) { + left_margin = drop_cap_distance; + } + } + + int right_y = row->base_line(row_box.right()) + row->x_height(); + int right_margin; + const std::unique_ptr</*non-const*/ ICOORDELT_LIST> segments_right(lines.get_line(right_y)); + RightMargin(segments_right.get(), row_box.right(), &right_margin); + row->set_lmargin(left_margin); + row->set_rmargin(right_margin); + } +} + +/********************************************************************** + * PrintSegmentationStats + * + * Prints segmentation stats for the given block list. + **********************************************************************/ + +void PrintSegmentationStats(BLOCK_LIST *block_list) { + int num_blocks = 0; + int num_rows = 0; + int num_words = 0; + int num_blobs = 0; + BLOCK_IT block_it(block_list); + for (block_it.mark_cycle_pt(); !block_it.cycled_list(); block_it.forward()) { + BLOCK *block = block_it.data(); + ++num_blocks; + ROW_IT row_it(block->row_list()); + for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) { + ++num_rows; + ROW *row = row_it.data(); + // Iterate over all werds in the row. + WERD_IT werd_it(row->word_list()); + for (werd_it.mark_cycle_pt(); !werd_it.cycled_list(); werd_it.forward()) { + WERD *werd = werd_it.data(); + ++num_words; + num_blobs += werd->cblob_list()->length(); + } + } + } + tprintf("Block list stats:\nBlocks = %d\nRows = %d\nWords = %d\nBlobs = %d\n", num_blocks, + num_rows, num_words, num_blobs); +} + +/********************************************************************** + * ExtractBlobsFromSegmentation + * + * Extracts blobs from the given block list and adds them to the output list. + * The block list must have been created by performing a page segmentation. + **********************************************************************/ + +void ExtractBlobsFromSegmentation(BLOCK_LIST *blocks, C_BLOB_LIST *output_blob_list) { + C_BLOB_IT return_list_it(output_blob_list); + BLOCK_IT block_it(blocks); + for (block_it.mark_cycle_pt(); !block_it.cycled_list(); block_it.forward()) { + BLOCK *block = block_it.data(); + ROW_IT row_it(block->row_list()); + for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) { + ROW *row = row_it.data(); + // Iterate over all werds in the row. + WERD_IT werd_it(row->word_list()); + for (werd_it.mark_cycle_pt(); !werd_it.cycled_list(); werd_it.forward()) { + WERD *werd = werd_it.data(); + return_list_it.move_to_last(); + return_list_it.add_list_after(werd->cblob_list()); + return_list_it.move_to_last(); + return_list_it.add_list_after(werd->rej_cblob_list()); + } + } + } +} + +/********************************************************************** + * RefreshWordBlobsFromNewBlobs() + * + * Refreshes the words in the block_list by using blobs in the + * new_blobs list. + * Block list must have word segmentation in it. + * It consumes the blobs provided in the new_blobs list. The blobs leftover in + * the new_blobs list after the call weren't matched to any blobs of the words + * in block list. + * The output not_found_blobs is a list of blobs from the original segmentation + * in the block_list for which no corresponding new blobs were found. + **********************************************************************/ + +void RefreshWordBlobsFromNewBlobs(BLOCK_LIST *block_list, C_BLOB_LIST *new_blobs, + C_BLOB_LIST *not_found_blobs) { + // Now iterate over all the blobs in the segmentation_block_list_, and just + // replace the corresponding c-blobs inside the werds. + BLOCK_IT block_it(block_list); + for (block_it.mark_cycle_pt(); !block_it.cycled_list(); block_it.forward()) { + BLOCK *block = block_it.data(); + if (block->pdblk.poly_block() != nullptr && !block->pdblk.poly_block()->IsText()) { + continue; // Don't touch non-text blocks. + } + // Iterate over all rows in the block. + ROW_IT row_it(block->row_list()); + for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) { + ROW *row = row_it.data(); + // Iterate over all werds in the row. + WERD_IT werd_it(row->word_list()); + WERD_LIST new_words; + WERD_IT new_words_it(&new_words); + for (werd_it.mark_cycle_pt(); !werd_it.cycled_list(); werd_it.forward()) { + WERD *werd = werd_it.extract(); + WERD *new_werd = werd->ConstructWerdWithNewBlobs(new_blobs, not_found_blobs); + if (new_werd) { + // Insert this new werd into the actual row's werd-list. Remove the + // existing one. + new_words_it.add_after_then_move(new_werd); + delete werd; + } else { + // Reinsert the older word back, for lack of better options. + // This is critical since dropping the words messes up segmentation: + // eg. 1st word in the row might otherwise have W_FUZZY_NON turned on. + new_words_it.add_after_then_move(werd); + } + } + // Get rid of the old word list & replace it with the new one. + row->word_list()->clear(); + werd_it.move_to_first(); + werd_it.add_list_after(&new_words); + } + } +} + +} // namespace tesseract
