Mercurial > hgrepos > Python2 > PyMuPDF
comparison mupdf-source/thirdparty/tesseract/src/textord/gap_map.cpp @ 2:b50eed0cc0ef upstream
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
The directory name has changed: no version number in the expanded directory now.
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Mon, 15 Sep 2025 11:43:07 +0200 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| 1:1d09e1dec1d9 | 2:b50eed0cc0ef |
|---|---|
| 1 // Licensed under the Apache License, Version 2.0 (the "License"); | |
| 2 // you may not use this file except in compliance with the License. | |
| 3 // You may obtain a copy of the License at | |
| 4 // http://www.apache.org/licenses/LICENSE-2.0 | |
| 5 // Unless required by applicable law or agreed to in writing, software | |
| 6 // distributed under the License is distributed on an "AS IS" BASIS, | |
| 7 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
| 8 // See the License for the specific language governing permissions and | |
| 9 // limitations under the License. | |
| 10 | |
| 11 #include "gap_map.h" | |
| 12 | |
| 13 #include "statistc.h" | |
| 14 | |
| 15 namespace tesseract { | |
| 16 | |
| 17 BOOL_VAR(gapmap_debug, false, "Say which blocks have tables"); | |
| 18 BOOL_VAR(gapmap_use_ends, false, "Use large space at start and end of rows"); | |
| 19 BOOL_VAR(gapmap_no_isolated_quanta, false, "Ensure gaps not less than 2quanta wide"); | |
| 20 double_VAR(gapmap_big_gaps, 1.75, "xht multiplier"); | |
| 21 | |
| 22 /************************************************************************* | |
| 23 * A block gap map is a quantised histogram of whitespace regions in the | |
| 24 * block. It is a vertical projection of wide gaps WITHIN lines | |
| 25 * | |
| 26 * The map is held as an array of counts of rows which have a wide gap | |
| 27 * covering that region of the row. Each bucket in the map represents a width | |
| 28 * of about half an xheight - (The median of the xhts in the rows is used.) | |
| 29 * | |
| 30 * The block is considered RECTANGULAR - delimited by the left and right | |
| 31 * extremes of the rows in the block. However, ONLY wide gaps WITHIN a row are | |
| 32 * counted. | |
| 33 * | |
| 34 *************************************************************************/ | |
| 35 | |
| 36 GAPMAP::GAPMAP( // Constructor | |
| 37 TO_BLOCK *block // block | |
| 38 ) { | |
| 39 TO_ROW *row; // current row | |
| 40 BLOBNBOX_IT blob_it; // iterator | |
| 41 TBOX blob_box; | |
| 42 TBOX prev_blob_box; | |
| 43 int16_t gap_width; | |
| 44 int16_t start_of_row; | |
| 45 int16_t end_of_row; | |
| 46 STATS xht_stats(0, 127); | |
| 47 int16_t min_quantum; | |
| 48 int16_t max_quantum; | |
| 49 int16_t i; | |
| 50 | |
| 51 /* | |
| 52 Find left and right extremes and bucket size | |
| 53 */ | |
| 54 map = nullptr; | |
| 55 min_left = INT16_MAX; | |
| 56 max_right = -INT16_MAX; | |
| 57 total_rows = 0; | |
| 58 any_tabs = false; | |
| 59 | |
| 60 // row iterator | |
| 61 TO_ROW_IT row_it(block->get_rows()); | |
| 62 for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) { | |
| 63 row = row_it.data(); | |
| 64 if (!row->blob_list()->empty()) { | |
| 65 total_rows++; | |
| 66 xht_stats.add(static_cast<int16_t>(floor(row->xheight + 0.5)), 1); | |
| 67 blob_it.set_to_list(row->blob_list()); | |
| 68 start_of_row = blob_it.data()->bounding_box().left(); | |
| 69 end_of_row = blob_it.data_relative(-1)->bounding_box().right(); | |
| 70 if (min_left > start_of_row) { | |
| 71 min_left = start_of_row; | |
| 72 } | |
| 73 if (max_right < end_of_row) { | |
| 74 max_right = end_of_row; | |
| 75 } | |
| 76 } | |
| 77 } | |
| 78 if ((total_rows < 3) || (min_left >= max_right)) { | |
| 79 bucket_size = 0; | |
| 80 map_max = 0; | |
| 81 total_rows = 0; | |
| 82 min_left = max_right = 0; | |
| 83 return; | |
| 84 } | |
| 85 bucket_size = static_cast<int16_t>(floor(xht_stats.median() + 0.5)) / 2; | |
| 86 map_max = (max_right - min_left) / bucket_size; | |
| 87 map = new int16_t[map_max + 1]; | |
| 88 for (i = 0; i <= map_max; i++) { | |
| 89 map[i] = 0; | |
| 90 } | |
| 91 | |
| 92 for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) { | |
| 93 row = row_it.data(); | |
| 94 if (!row->blob_list()->empty()) { | |
| 95 blob_it.set_to_list(row->blob_list()); | |
| 96 blob_it.mark_cycle_pt(); | |
| 97 blob_box = box_next(&blob_it); | |
| 98 prev_blob_box = blob_box; | |
| 99 if (gapmap_use_ends) { | |
| 100 /* Leading space */ | |
| 101 gap_width = blob_box.left() - min_left; | |
| 102 if ((gap_width > gapmap_big_gaps * row->xheight) && gap_width > 2) { | |
| 103 max_quantum = (blob_box.left() - min_left) / bucket_size; | |
| 104 if (max_quantum > map_max) { | |
| 105 max_quantum = map_max; | |
| 106 } | |
| 107 for (i = 0; i <= max_quantum; i++) { | |
| 108 map[i]++; | |
| 109 } | |
| 110 } | |
| 111 } | |
| 112 while (!blob_it.cycled_list()) { | |
| 113 blob_box = box_next(&blob_it); | |
| 114 gap_width = blob_box.left() - prev_blob_box.right(); | |
| 115 if ((gap_width > gapmap_big_gaps * row->xheight) && gap_width > 2) { | |
| 116 min_quantum = (prev_blob_box.right() - min_left) / bucket_size; | |
| 117 max_quantum = (blob_box.left() - min_left) / bucket_size; | |
| 118 if (max_quantum > map_max) { | |
| 119 max_quantum = map_max; | |
| 120 } | |
| 121 for (i = min_quantum; i <= max_quantum; i++) { | |
| 122 map[i]++; | |
| 123 } | |
| 124 } | |
| 125 prev_blob_box = blob_box; | |
| 126 } | |
| 127 if (gapmap_use_ends) { | |
| 128 /* Trailing space */ | |
| 129 gap_width = max_right - prev_blob_box.right(); | |
| 130 if ((gap_width > gapmap_big_gaps * row->xheight) && gap_width > 2) { | |
| 131 min_quantum = (prev_blob_box.right() - min_left) / bucket_size; | |
| 132 if (min_quantum < 0) { | |
| 133 min_quantum = 0; | |
| 134 } | |
| 135 for (i = min_quantum; i <= map_max; i++) { | |
| 136 map[i]++; | |
| 137 } | |
| 138 } | |
| 139 } | |
| 140 } | |
| 141 } | |
| 142 for (i = 0; i <= map_max; i++) { | |
| 143 if (map[i] > total_rows / 2) { | |
| 144 if (gapmap_no_isolated_quanta && | |
| 145 (((i == 0) && (map[i + 1] <= total_rows / 2)) || | |
| 146 ((i == map_max) && (map[i - 1] <= total_rows / 2)) || | |
| 147 ((i > 0) && (i < map_max) && (map[i - 1] <= total_rows / 2) && | |
| 148 (map[i + 1] <= total_rows / 2)))) { | |
| 149 map[i] = 0; // prevent isolated quantum | |
| 150 } else { | |
| 151 any_tabs = true; | |
| 152 } | |
| 153 } | |
| 154 } | |
| 155 if (gapmap_debug && any_tabs) { | |
| 156 tprintf("Table found\n"); | |
| 157 } | |
| 158 } | |
| 159 | |
| 160 /************************************************************************* | |
| 161 * GAPMAP::table_gap() | |
| 162 * Is there a bucket in the specified range where more than half the rows in the | |
| 163 * block have a wide gap? | |
| 164 *************************************************************************/ | |
| 165 | |
| 166 bool GAPMAP::table_gap( // Is gap a table? | |
| 167 int16_t left, // From here | |
| 168 int16_t right // To here | |
| 169 ) { | |
| 170 int16_t min_quantum; | |
| 171 int16_t max_quantum; | |
| 172 int16_t i; | |
| 173 bool tab_found = false; | |
| 174 | |
| 175 if (!any_tabs) { | |
| 176 return false; | |
| 177 } | |
| 178 | |
| 179 min_quantum = (left - min_left) / bucket_size; | |
| 180 max_quantum = (right - min_left) / bucket_size; | |
| 181 // Clip to the bounds of the array. In some circumstances (big blob followed | |
| 182 // by small blob) max_quantum can exceed the map_max bounds, but we clip | |
| 183 // here instead, as it provides better long-term safety. | |
| 184 if (min_quantum < 0) { | |
| 185 min_quantum = 0; | |
| 186 } | |
| 187 if (max_quantum > map_max) { | |
| 188 max_quantum = map_max; | |
| 189 } | |
| 190 for (i = min_quantum; (!tab_found && (i <= max_quantum)); i++) { | |
| 191 if (map[i] > total_rows / 2) { | |
| 192 tab_found = true; | |
| 193 } | |
| 194 } | |
| 195 return tab_found; | |
| 196 } | |
| 197 | |
| 198 } // namespace tesseract |
