Mercurial > hgrepos > Python2 > PyMuPDF
diff mupdf-source/thirdparty/tesseract/src/textord/gap_map.cpp @ 2:b50eed0cc0ef upstream
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
The directory name has changed: no version number in the expanded directory now.
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Mon, 15 Sep 2025 11:43:07 +0200 |
| parents | |
| children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/mupdf-source/thirdparty/tesseract/src/textord/gap_map.cpp Mon Sep 15 11:43:07 2025 +0200 @@ -0,0 +1,198 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "gap_map.h" + +#include "statistc.h" + +namespace tesseract { + +BOOL_VAR(gapmap_debug, false, "Say which blocks have tables"); +BOOL_VAR(gapmap_use_ends, false, "Use large space at start and end of rows"); +BOOL_VAR(gapmap_no_isolated_quanta, false, "Ensure gaps not less than 2quanta wide"); +double_VAR(gapmap_big_gaps, 1.75, "xht multiplier"); + +/************************************************************************* + * A block gap map is a quantised histogram of whitespace regions in the + * block. It is a vertical projection of wide gaps WITHIN lines + * + * The map is held as an array of counts of rows which have a wide gap + * covering that region of the row. Each bucket in the map represents a width + * of about half an xheight - (The median of the xhts in the rows is used.) + * + * The block is considered RECTANGULAR - delimited by the left and right + * extremes of the rows in the block. However, ONLY wide gaps WITHIN a row are + * counted. + * + *************************************************************************/ + +GAPMAP::GAPMAP( // Constructor + TO_BLOCK *block // block +) { + TO_ROW *row; // current row + BLOBNBOX_IT blob_it; // iterator + TBOX blob_box; + TBOX prev_blob_box; + int16_t gap_width; + int16_t start_of_row; + int16_t end_of_row; + STATS xht_stats(0, 127); + int16_t min_quantum; + int16_t max_quantum; + int16_t i; + + /* + Find left and right extremes and bucket size +*/ + map = nullptr; + min_left = INT16_MAX; + max_right = -INT16_MAX; + total_rows = 0; + any_tabs = false; + + // row iterator + TO_ROW_IT row_it(block->get_rows()); + for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) { + row = row_it.data(); + if (!row->blob_list()->empty()) { + total_rows++; + xht_stats.add(static_cast<int16_t>(floor(row->xheight + 0.5)), 1); + blob_it.set_to_list(row->blob_list()); + start_of_row = blob_it.data()->bounding_box().left(); + end_of_row = blob_it.data_relative(-1)->bounding_box().right(); + if (min_left > start_of_row) { + min_left = start_of_row; + } + if (max_right < end_of_row) { + max_right = end_of_row; + } + } + } + if ((total_rows < 3) || (min_left >= max_right)) { + bucket_size = 0; + map_max = 0; + total_rows = 0; + min_left = max_right = 0; + return; + } + bucket_size = static_cast<int16_t>(floor(xht_stats.median() + 0.5)) / 2; + map_max = (max_right - min_left) / bucket_size; + map = new int16_t[map_max + 1]; + for (i = 0; i <= map_max; i++) { + map[i] = 0; + } + + for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) { + row = row_it.data(); + if (!row->blob_list()->empty()) { + blob_it.set_to_list(row->blob_list()); + blob_it.mark_cycle_pt(); + blob_box = box_next(&blob_it); + prev_blob_box = blob_box; + if (gapmap_use_ends) { + /* Leading space */ + gap_width = blob_box.left() - min_left; + if ((gap_width > gapmap_big_gaps * row->xheight) && gap_width > 2) { + max_quantum = (blob_box.left() - min_left) / bucket_size; + if (max_quantum > map_max) { + max_quantum = map_max; + } + for (i = 0; i <= max_quantum; i++) { + map[i]++; + } + } + } + while (!blob_it.cycled_list()) { + blob_box = box_next(&blob_it); + gap_width = blob_box.left() - prev_blob_box.right(); + if ((gap_width > gapmap_big_gaps * row->xheight) && gap_width > 2) { + min_quantum = (prev_blob_box.right() - min_left) / bucket_size; + max_quantum = (blob_box.left() - min_left) / bucket_size; + if (max_quantum > map_max) { + max_quantum = map_max; + } + for (i = min_quantum; i <= max_quantum; i++) { + map[i]++; + } + } + prev_blob_box = blob_box; + } + if (gapmap_use_ends) { + /* Trailing space */ + gap_width = max_right - prev_blob_box.right(); + if ((gap_width > gapmap_big_gaps * row->xheight) && gap_width > 2) { + min_quantum = (prev_blob_box.right() - min_left) / bucket_size; + if (min_quantum < 0) { + min_quantum = 0; + } + for (i = min_quantum; i <= map_max; i++) { + map[i]++; + } + } + } + } + } + for (i = 0; i <= map_max; i++) { + if (map[i] > total_rows / 2) { + if (gapmap_no_isolated_quanta && + (((i == 0) && (map[i + 1] <= total_rows / 2)) || + ((i == map_max) && (map[i - 1] <= total_rows / 2)) || + ((i > 0) && (i < map_max) && (map[i - 1] <= total_rows / 2) && + (map[i + 1] <= total_rows / 2)))) { + map[i] = 0; // prevent isolated quantum + } else { + any_tabs = true; + } + } + } + if (gapmap_debug && any_tabs) { + tprintf("Table found\n"); + } +} + +/************************************************************************* + * GAPMAP::table_gap() + * Is there a bucket in the specified range where more than half the rows in the + * block have a wide gap? + *************************************************************************/ + +bool GAPMAP::table_gap( // Is gap a table? + int16_t left, // From here + int16_t right // To here +) { + int16_t min_quantum; + int16_t max_quantum; + int16_t i; + bool tab_found = false; + + if (!any_tabs) { + return false; + } + + min_quantum = (left - min_left) / bucket_size; + max_quantum = (right - min_left) / bucket_size; + // Clip to the bounds of the array. In some circumstances (big blob followed + // by small blob) max_quantum can exceed the map_max bounds, but we clip + // here instead, as it provides better long-term safety. + if (min_quantum < 0) { + min_quantum = 0; + } + if (max_quantum > map_max) { + max_quantum = map_max; + } + for (i = min_quantum; (!tab_found && (i <= max_quantum)); i++) { + if (map[i] > total_rows / 2) { + tab_found = true; + } + } + return tab_found; +} + +} // namespace tesseract
