Mercurial > hgrepos > Python2 > PyMuPDF
view mupdf-source/thirdparty/tesseract/src/textord/gap_map.cpp @ 38:8934ac156ef5
Allow building with the PyPI package "clang" instead of "libclang".
1. It seems to be maintained.
2. In the FreeBSD base system there is no pre-built libclang.so. If you
need this library you have to install llvm from ports additionally.
3. On FreeBSD there is no pre-built wheel "libclang" with a packaged
libclang.so.
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Tue, 23 Sep 2025 10:27:15 +0200 |
| parents | b50eed0cc0ef |
| children |
line wrap: on
line source
// Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // http://www.apache.org/licenses/LICENSE-2.0 // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. #include "gap_map.h" #include "statistc.h" namespace tesseract { BOOL_VAR(gapmap_debug, false, "Say which blocks have tables"); BOOL_VAR(gapmap_use_ends, false, "Use large space at start and end of rows"); BOOL_VAR(gapmap_no_isolated_quanta, false, "Ensure gaps not less than 2quanta wide"); double_VAR(gapmap_big_gaps, 1.75, "xht multiplier"); /************************************************************************* * A block gap map is a quantised histogram of whitespace regions in the * block. It is a vertical projection of wide gaps WITHIN lines * * The map is held as an array of counts of rows which have a wide gap * covering that region of the row. Each bucket in the map represents a width * of about half an xheight - (The median of the xhts in the rows is used.) * * The block is considered RECTANGULAR - delimited by the left and right * extremes of the rows in the block. However, ONLY wide gaps WITHIN a row are * counted. 
* *************************************************************************/ GAPMAP::GAPMAP( // Constructor TO_BLOCK *block // block ) { TO_ROW *row; // current row BLOBNBOX_IT blob_it; // iterator TBOX blob_box; TBOX prev_blob_box; int16_t gap_width; int16_t start_of_row; int16_t end_of_row; STATS xht_stats(0, 127); int16_t min_quantum; int16_t max_quantum; int16_t i; /* Find left and right extremes and bucket size */ map = nullptr; min_left = INT16_MAX; max_right = -INT16_MAX; total_rows = 0; any_tabs = false; // row iterator TO_ROW_IT row_it(block->get_rows()); for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) { row = row_it.data(); if (!row->blob_list()->empty()) { total_rows++; xht_stats.add(static_cast<int16_t>(floor(row->xheight + 0.5)), 1); blob_it.set_to_list(row->blob_list()); start_of_row = blob_it.data()->bounding_box().left(); end_of_row = blob_it.data_relative(-1)->bounding_box().right(); if (min_left > start_of_row) { min_left = start_of_row; } if (max_right < end_of_row) { max_right = end_of_row; } } } if ((total_rows < 3) || (min_left >= max_right)) { bucket_size = 0; map_max = 0; total_rows = 0; min_left = max_right = 0; return; } bucket_size = static_cast<int16_t>(floor(xht_stats.median() + 0.5)) / 2; map_max = (max_right - min_left) / bucket_size; map = new int16_t[map_max + 1]; for (i = 0; i <= map_max; i++) { map[i] = 0; } for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) { row = row_it.data(); if (!row->blob_list()->empty()) { blob_it.set_to_list(row->blob_list()); blob_it.mark_cycle_pt(); blob_box = box_next(&blob_it); prev_blob_box = blob_box; if (gapmap_use_ends) { /* Leading space */ gap_width = blob_box.left() - min_left; if ((gap_width > gapmap_big_gaps * row->xheight) && gap_width > 2) { max_quantum = (blob_box.left() - min_left) / bucket_size; if (max_quantum > map_max) { max_quantum = map_max; } for (i = 0; i <= max_quantum; i++) { map[i]++; } } } while (!blob_it.cycled_list()) { blob_box = 
box_next(&blob_it); gap_width = blob_box.left() - prev_blob_box.right(); if ((gap_width > gapmap_big_gaps * row->xheight) && gap_width > 2) { min_quantum = (prev_blob_box.right() - min_left) / bucket_size; max_quantum = (blob_box.left() - min_left) / bucket_size; if (max_quantum > map_max) { max_quantum = map_max; } for (i = min_quantum; i <= max_quantum; i++) { map[i]++; } } prev_blob_box = blob_box; } if (gapmap_use_ends) { /* Trailing space */ gap_width = max_right - prev_blob_box.right(); if ((gap_width > gapmap_big_gaps * row->xheight) && gap_width > 2) { min_quantum = (prev_blob_box.right() - min_left) / bucket_size; if (min_quantum < 0) { min_quantum = 0; } for (i = min_quantum; i <= map_max; i++) { map[i]++; } } } } } for (i = 0; i <= map_max; i++) { if (map[i] > total_rows / 2) { if (gapmap_no_isolated_quanta && (((i == 0) && (map[i + 1] <= total_rows / 2)) || ((i == map_max) && (map[i - 1] <= total_rows / 2)) || ((i > 0) && (i < map_max) && (map[i - 1] <= total_rows / 2) && (map[i + 1] <= total_rows / 2)))) { map[i] = 0; // prevent isolated quantum } else { any_tabs = true; } } } if (gapmap_debug && any_tabs) { tprintf("Table found\n"); } } /************************************************************************* * GAPMAP::table_gap() * Is there a bucket in the specified range where more than half the rows in the * block have a wide gap? *************************************************************************/ bool GAPMAP::table_gap( // Is gap a table? int16_t left, // From here int16_t right // To here ) { int16_t min_quantum; int16_t max_quantum; int16_t i; bool tab_found = false; if (!any_tabs) { return false; } min_quantum = (left - min_left) / bucket_size; max_quantum = (right - min_left) / bucket_size; // Clip to the bounds of the array. In some circumstances (big blob followed // by small blob) max_quantum can exceed the map_max bounds, but we clip // here instead, as it provides better long-term safety. 
if (min_quantum < 0) { min_quantum = 0; } if (max_quantum > map_max) { max_quantum = map_max; } for (i = min_quantum; (!tab_found && (i <= max_quantum)); i++) { if (map[i] > total_rows / 2) { tab_found = true; } } return tab_found; } } // namespace tesseract
