Mercurial > hgrepos > Python2 > PyMuPDF
comparison mupdf-source/source/fitz/stext-table.c @ 2:b50eed0cc0ef upstream
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
The directory name has changed: no version number in the expanded directory now.
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Mon, 15 Sep 2025 11:43:07 +0200 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| 1:1d09e1dec1d9 | 2:b50eed0cc0ef |
|---|---|
| 1 // Copyright (C) 2025 Artifex Software, Inc. | |
| 2 // | |
| 3 // This file is part of MuPDF. | |
| 4 // | |
| 5 // MuPDF is free software: you can redistribute it and/or modify it under the | |
| 6 // terms of the GNU Affero General Public License as published by the Free | |
| 7 // Software Foundation, either version 3 of the License, or (at your option) | |
| 8 // any later version. | |
| 9 // | |
| 10 // MuPDF is distributed in the hope that it will be useful, but WITHOUT ANY | |
| 11 // WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS | |
| 12 // FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more | |
| 13 // details. | |
| 14 // | |
| 15 // You should have received a copy of the GNU Affero General Public License | |
| 16 // along with MuPDF. If not, see <https://www.gnu.org/licenses/agpl-3.0.en.html> | |
| 17 // | |
| 18 // Alternative licensing terms are available from the licensor. | |
| 19 // For commercial licensing, see <https://www.artifex.com/> or contact | |
| 20 // Artifex Software, Inc., 39 Mesa Street, Suite 108A, San Francisco, | |
| 21 // CA 94129, USA, for further information. | |
| 22 | |
| 23 #include "mupdf/fitz.h" | |
| 24 | |
| 25 #include <assert.h> | |
| 26 | |
| 27 /* #define DEBUG_WRITE_AS_PS */ | |
| 28 | |
| 29 /* #define DEBUG_TABLE_STRUCTURE */ | |
| 30 | |
| 31 /* #define DEBUG_TABLE_HUNT */ | |
| 32 | |
| 33 /* | |
| 34 * The algorithm. | |
| 35 * | |
| 36 * The goal of the algorithm is to identify tables on a page. | |
| 37 * First we have to find the tables on a page, then we have to | |
| 38 * figure out where the columns/rows are, and then how the | |
| 39 * cells span them. | |
| 40 * | |
| 41 * We do this as a series of steps. | |
| 42 * | |
| 43 * To illustrate what's going on, let's use an example page | |
| 44 * that we can follow through all the steps. | |
| 45 * | |
| 46 * +---------------------------+ | |
| 47 * | | | |
| 48 * | #### ## ### ## | <- Title | |
| 49 * | | | |
| 50 * | ##### ##### #### ## | \ | |
| 51 * | ## ###### ###### ## | | | |
| 52 * | #### ####### ###### | |- Abstract | |
| 53 * | ####### #### ## ### | | | |
| 54 * | ### ##### ###### | / | |
| 55 * | | | |
| 56 * | ######### ######### | 2 Columns of text | |
| 57 * | ######### ######### | | |
| 58 * | ######## ######### | | |
| 59 * | ######### | | |
| 60 * | +-------+ ####### | <- With an image on the left | |
| 61 * | | | | | |
| 62 * | | | ## ## # # | <- And a table on the right | |
| 63 * | +-------+ ## ## # # | | |
| 64 * | ## ## # # | | |
| 65 * | ######### ## ## # # | | |
| 66 * | ######### ## ## # # | | |
| 67 * | ######### ## ## # # | | |
| 68 * | | | |
| 69 * +---------------------------+ | |
| 70 * | |
| 71 * | |
| 72 * Step 1: Segmentation. | |
| 73 * | |
| 74 * First, we segment the page, trying to break it down into a | |
| 75 * series of non-overlapping rectangles. We do this (in stext-boxer.c) | |
| 76 * by looking for where the content isn't. If we can identify breaks | |
| 77 * that run through the page (either from top to bottom or from left | |
| 78 * to right), then we can split the page there, and recursively consider | |
| 79 * the two halves of the page. | |
| 80 * | |
| 81 * It's not a perfect algorithm, but it manages to in many cases. | |
| 82 * | |
| 83 * After segmenting the above example, first we'll find the horizontal | |
| 84 * splits, giving: | |
| 85 * | |
| 86 * +---------------------------+ | |
| 87 * | | | |
| 88 * | #### ## ### ## | | |
| 89 * +---------------------------+ | |
| 90 * | ##### ##### #### ## | | |
| 91 * | ## ###### ###### ## | | |
| 92 * | #### ####### ###### | | |
| 93 * | ####### #### ## ### | | |
| 94 * | ### ##### ###### | | |
| 95 * +---------------------------+ | |
| 96 * | ######### ######### | | |
| 97 * | ######### ######### | | |
| 98 * | ######## ######### | | |
| 99 * | ######### | | |
| 100 * | +-------+ ####### | | |
| 101 * | | | | | |
| 102 * | | | ## ## # # | | |
| 103 * | +-------+ ## ## # # | | |
| 104 * | ## ## # # | | |
| 105 * | ######### ## ## # # | | |
| 106 * | ######### ## ## # # | | |
| 107 * | ######### ## ## # # | | |
| 108 * | | | |
| 109 * +---------------------------+ | |
| 110 * | |
| 111 * Then we'll recurse and find the vertical split between | |
| 112 * the columns: | |
| 113 * | |
| 114 * +---------------------------+ | |
| 115 * | | | |
| 116 * | #### ## ### ## | | |
| 117 * +---------------------------+ | |
| 118 * | ##### ##### #### ## | | |
| 119 * | ## ###### ###### ## | | |
| 120 * | #### ####### ###### | | |
| 121 * | ####### #### ## ### | | |
| 122 * | ### ##### ###### | | |
| 123 * +-------------+-------------+ | |
| 124 * | ######### | ######### | | |
| 125 * | ######### | ######### | | |
| 126 * | ######## | ######### | | |
| 127 * | | ######### | | |
| 128 * | +-------+ | ####### | | |
| 129 * | | | | | | |
| 130 * | | | | ## ## # # | | |
| 131 * | +-------+ | ## ## # # | | |
| 132 * | | ## ## # # | | |
| 133 * | ######### | ## ## # # | | |
| 134 * | ######### | ## ## # # | | |
| 135 * | ######### | ## ## # # | | |
| 136 * | | | | |
| 137 * +-------------+-------------+ | |
| 138 * | |
| 139 * Then we recurse again and find the horizontal splits | |
| 140 * within the columns: | |
| 141 * | |
| 142 * +---------------------------+ | |
| 143 * | | | |
| 144 * | #### ## ### ## | | |
| 145 * +---------------------------+ | |
| 146 * | ##### ##### #### ## | | |
| 147 * | ## ###### ###### ## | | |
| 148 * | #### ####### ###### | | |
| 149 * | ####### #### ## ### | | |
| 150 * | ### ##### ###### | | |
| 151 * +-------------+-------------+ | |
| 152 * | ######### | ######### | | |
| 153 * | ######### | ######### | | |
| 154 * | ######## | ######### | | |
| 155 * +-------------+ ######### | | |
| 156 * | +-------+ | ####### | | |
| 157 * | | | +-------------+ | |
| 158 * | | | | ## ## # # | | |
| 159 * | +-------+ | ## ## # # | | |
| 160 * +-------------+ ## ## # # | | |
| 161 * | ######### | ## ## # # | | |
| 162 * | ######### | ## ## # # | | |
| 163 * | ######### | ## ## # # | | |
| 164 * | | | | |
| 165 * +-------------+-------------+ | |
| 166 * | |
| 167 * We recurse a fixed maximum number of times (currently | |
| 168 * 6, IIRC) or until we fail to find any suitable splits. | |
| 169 * | |
| 170 * This completes the page segmentation step. | |
| 171 * | |
| 172 * Step 2: Grid finding | |
| 173 * | |
| 174 * Next, we look at each of those segments and try to identify | |
| 175 * where grids might be. | |
| 176 * | |
| 177 * Imagine the bottom right section of that page as | |
| 178 * a board with lego blocks on where there is text. | |
| 179 * Now imagine viewing that from the bottom of the page. | |
| 180 * The gaps between the columns of the table are where you | |
| 181 * can see through to the top between the blocks. | |
| 182 * | |
| 183 * Similarly, if you view it from the side, the gaps between the | |
| 184 * rows of the page are where you can see through to the other | |
| 185 * side. | |
| 186 * | |
| 187 * So, how do we code that? Well, we run through the page content | |
| 188 * (obviously, restricted to the content that falls into this | |
| 189 * segment of the page - that'll go without saying from here on | |
| 190 * in). For each bit of content, we look at the "x extent" of that | |
| 191 * content - for instance a given string might start at position | |
| 192 * 10 and continue to position 100. We build a list of all these | |
| 193 * start, and stop positions, and keep them in a sorted list. | |
| 194 * | |
| 195 * Then we walk this list from left to right, keeping a sum. I | |
| 196 * call this sum "wind", because it's very similar to the winding | |
| 197 * number that you get when doing scan conversion of bezier shapes. | |
| 198 * | |
| 199 * wind starts out as 0. We increment it whenever we pass a 'start' | |
| 200 * position, and decrement it whenever we pass a 'stop' position. | |
| 201 * So at any given x position along the line wind tells us the | |
| 202 * number of pieces of content that overlap that x position. | |
| 203 * So wind(left) = 0 = wind(right), and wind(x) >= x for all x. | |
| 204 * | |
| 205 * So, if we walk from left to right, the trace of wind might | |
| 206 * look something like: | |
| 207 * | |
| 208 * __ | |
| 209 * ___ / \ _ __ | |
| 210 * / \ / \/ \ _/ \_ | |
| 211 * / \___/ \___/ \ | |
| 212 * | |
| 213 * The left and right edges of the table are pretty clear. | |
| 214 * The regions where wind drops to 0 represent the column dividers. | |
| 215 * The left and right hand side of those regions gives us the min | |
| 216 * and max values for that divider. | |
| 217 * | |
| 218 * We can then repeat this process for Y ranges instead of X ranges | |
| 219 * to get the row dividers. | |
| 220 * | |
| 221 * BUT, this only works for pure grid tables. It falls down for | |
| 222 * cases where we have merged cells (which is very common due to | |
| 223 * titles etc). | |
| 224 * | |
| 225 * We can modify the algorithm slightly to allow for this. | |
| 226 * | |
| 227 * Consider the following table: | |
| 228 * | |
| 229 * +-----------------------------------+ | |
| 230 * | Long Table title across the top | | |
| 231 * +---------------+---------+---------+ | |
| 232 * | Name | Result1 | Result2 | | |
| 233 * +---------------+----+----+----+----+ | |
| 234 * | Homer Simpson | 1 | 23 | 4 | 56 | | |
| 235 * | Barney Gumble | 1 | 23 | 4 | 56 | | |
| 236 * | Moe | 1 | 23 | 4 | 56 | | |
| 237 * | Apu | 1 | 23 | 4 | 56 | | |
| 238 * | Ned Flanders | 1 | 23 | 4 | 56 | | |
| 239 * +---------------+----+----+----+----+ | |
| 240 * | |
| 241 * The wind trace for that looks something like | |
| 242 * (with a certain degree of artistic license for the | |
| 243 * limitations of ascii art): | |
| 244 * | |
| 245 * ________ | |
| 246 * / \ _ __ _ _ | |
| 247 * / \____/ \_/ \___/ \_/ \ | |
| 248 * / \ | |
| 249 * | |
| 250 * So, the trace never quite drops back to zero in the | |
| 251 * middle due to the spanning of the top title. | |
| 252 * | |
| 253 * So, instead of just looking for points where the trace | |
| 254 * drops to zero, we instead look for local minima. Each local | |
| 255 * minima represents a place where there might be a grid divider. | |
| 256 * The value of wind at such points can be considered the | |
| 257 * "uncertainty" with which there might be a divider there. | |
| 258 * Clear dividers (with a wind value of 0) have no uncertainty. | |
| 259 * Places where cells are spanned have a higher value of uncertainty. | |
| 260 * | |
| 261 * The output from this step is the list of possible grid positions | |
| 262 * (X and Y), with uncertainty values. | |
| 263 * | |
| 264 * We have skipped over the handling of spaces in the above | |
| 265 * description. On the one hand, we'd quite like to treat | |
| 266 * sentences as long unbroken regions of content. But sometimes | |
| 267 * we can get awkward content like: | |
| 268 * | |
| 269 * P subtelomeric 24.38 8.15 11.31 0.94 1.46 | |
| 270 * 11.08 6.93 15.34 0.00 0.73 | |
| 271 * Pericentromeric region; C band 20.42 9.26 13.64 0.81 0.81 | |
| 272 * 16.50 7.03 14.45 0.17 5.15 | |
| 273 | |
| 274 * where the content is frequently sent with spaces instead of | |
| 275 * small gaps (particular in tables of digits, because numerals | |
| 276 * are often the same width even in proportional fonts). | |
| 277 * | |
| 278 * To cope with this, as we send potential edges into the start/stop | |
| 279 * positions, we send 'weak' edges for the start and stops of runs | |
| 280 * of spaces. We then post process the edges to remove any local | |
| 281 * minima regions in the 'wind' values that are bounded purely by | |
| 282 * 'weak' edges. | |
| 283 * | |
| 284 * Step 3: Cell analysis | |
| 285 * | |
| 286 * So, armed with the output from step 2, we can examine each grid | |
| 287 * found. If we have W x-dividers and H y-dividers, we know we have | |
| 288 * a potential table with (W-1) x (H-1) cells in it. | |
| 289 * | |
| 290 * We represent this as a W x H grid of cells, each like: | |
| 291 * | |
| 292 * . . | |
| 293 * . . | |
| 294 * . . .+-------+. . . Each cell holds information about the | |
| 295 * | . edges above, and to the left of it. | |
| 296 * | . | |
| 297 * | . | |
| 298 * . . .+. . . .+. . . | |
| 299 * . . | |
| 300 * . . | |
| 301 * | |
| 302 * h_line: Is there a horizontal divider drawn on the page that | |
| 303 * corresponds to the top of this cell (i.e. is there a cell border | |
| 304 * here?) | |
| 305 * v_line: Is there a vertical divider drawn on the page that | |
| 306 * corresponds to the left of this cell (i.e. is there a cell border | |
| 307 * here?) | |
| 308 * h_crossed: Does content cross this line (i.e. are we merged | |
| 309 * with the cell above?) | |
| 310 * v_crossed: Does content cross this line (i.e. are we merged | |
| 311 * with the cell to the left?) | |
| 312 * full: Is there any content in this cell at all? | |
| 313 * | |
| 314 * We need a W x H grid of cells to represent the entire table due | |
| 315 * to the potential right and bottom edge lines. The right and | |
| 316 * bottom rows of cells should never be full, or be crossed, but | |
| 317 * it's easiest just to use a simple representation that copes with | |
| 318 * the h_line and v_line values naturally. | |
| 319 * | |
| 320 * So, we start with the cells structure empty, and we run through | |
| 321 * the page content, filling in the details as we go. | |
| 322 * | |
| 323 * At the end of the process, we have enough information to draw | |
| 324 * an asciiart representation of our table. It might look something | |
| 325 * like this (this comes from dotted-gridlines-tables.pdf): | |
| 326 * | |
| 327 * +-+-+-+-+-+-+-+-+-+-+-+-+-+ | |
| 328 * | | | | | |#|#|#|#| | | | | |
| 329 * + + + + + + +v+ +v+v+ + + + | |
| 330 * | | | | | |#|#|#|#| | | | | |
| 331 * + + + + + + + +v+ + + + + + | |
| 332 * | |#| | |#|#|#|#|#|#|#|#| | |
| 333 * + +v+ + + +v+v+ +v+v+v+v+v+ | |
| 334 * | |#| |#|#|#|#|#|#|#|#|#| | |
| 335 * + + + + +v+ + +v+ + + + + + | |
| 336 * |#|#| #|#|#|#|#|#|#|#|#|#| | |
| 337 * +v+v+ +v+ +v+v+ +v+v+v+v+v+ | |
| 338 * |#|#| #|#|#|#|#|#|#|#|#|#| | |
| 339 * + + + + +v+ + +v+ + + + + + | |
| 340 * | |#| |#|#|#|#|#|#|#|#|#| | |
| 341 * + +v+ + + +v+v+ +v+v+v+v+v+ | |
| 342 * | |#| | |#|#|#|#|#|#|#|#| | |
| 343 * + + + + + + + +v+ + + + + + | |
| 344 * | | | | | |#|#|#|#| | | | | |
| 345 * + + + + + + +v+ +v+v+ + + + | |
| 346 * | | | | | |#|#|#|#| | | | | |
| 347 * +-+-+-+-+-+-+-+-+-+-+-+-+-+ | |
| 348 * | |# |#| | |#| | |#|#|#| | |
| 349 * + +-+-+-+-+-+-+-+-+-+-+-+-+ | |
| 350 * | |# |#| | |#| | |#|#|#| | |
| 351 * + +-+-+-+-+-+-+-+-+-+-+-+-+ | |
| 352 * | |#>#|#| | |#| | |#|#|#| | |
| 353 * + +-+-+-+-+-+-+-+-+-+-+-+-+ | |
| 354 * | |#>#|#| | |#| | |#|#|#| | |
| 355 * + +-+-+-+-+-+-+-+-+-+-+-+-+ | |
| 356 * | |#>#|#| |#| | | |#|#|#| | |
| 357 * + +-+-+-+-+-+-+-+-+-+-+-+-+ | |
| 358 * | |# |#| | |#| | |#|#|#| | |
| 359 * + +-+-+-+-+-+-+-+-+-+-+-+-+ | |
| 360 * | |# |#| | |#| | |#|#|#| | |
| 361 * + +-+-+-+-+-+-+-+-+-+-+-+-+ | |
| 362 * | |# |#| | |#| | |#|#|#| | |
| 363 * + +-+-+-+-+-+-+-+-+-+-+-+-+ | |
| 364 * | |# |#| | |#| | |#|#|#| | |
| 365 * + +-+-+-+-+-+-+-+-+-+-+-+-+ | |
| 366 * |# |#|#|#|#|#|#|#|#|#| | |
| 367 * + +-+-+-+-+-+-+-+-+-+-+-+-+ | |
| 368 * | |
| 369 * This shows where lines are detected ( - and | ), | |
| 370 * where they are crossed ( > and v) and where cells | |
| 371 * are full ( # ). | |
| 372 * | |
| 373 * Step 4: Row and column merging. | |
| 374 * | |
| 375 * Based on the information above, we then try to merge | |
| 376 * cells and columns to simplify the table. | |
| 377 * | |
| 378 * The best rules I've come up with this so far are: | |
| 379 * We can merge two adjacent columns if all the pairs of | |
| 380 * cells in the two columns are mergeable. | |
| 381 * | |
| 382 * Cells are held to be mergeable or not based upon the following | |
| 383 * rules: | |
| 384 * If there is a line between 2 cells - not mergeable. | |
| 385 * else if the uncertainty between 2 cells is 0 - not mergeable. | |
| 386 * else if the line between the 2 cells is crossed - mergeable. | |
| 387 * else if strictly one of the cells is full - mergeable. | |
| 388 * else not mergeable. | |
| 389 * | |
| 390 * So in the above example, column 2 (numbered from 0) can be merged | |
| 391 * with column 3. | |
| 392 * | |
| 393 * This gives: | |
| 394 * | |
| 395 * +-+-+-+-+-+-+-+-+-+-+-+-+ | |
| 396 * | | | | | |#|#|#|#| | | | | |
| 397 * + + + + + +v+ +v+v+ + + + | |
| 398 * | | | | | |#|#|#|#| | | | | |
| 399 * + + + + + + +v+ + + + + + | |
| 400 * | |#| | |#|#|#|#|#|#|#|#| | |
| 401 * + +v+ + +v+v+ +v+v+v+v+v+ | |
| 402 * | |#| |#|#|#|#|#|#|#|#|#| | |
| 403 * + + + +v+ + +v+ + + + + + | |
| 404 * |#|#|#|#|#|#|#|#|#|#|#|#| | |
| 405 * +v+v+v+ +v+v+ +v+v+v+v+v+ | |
| 406 * |#|#|#|#|#|#|#|#|#|#|#|#| | |
| 407 * + + + +v+ + +v+ + + + + + | |
| 408 * | |#| |#|#|#|#|#|#|#|#|#| | |
| 409 * + +v+ + +v+v+ +v+v+v+v+v+ | |
| 410 * | |#| | |#|#|#|#|#|#|#|#| | |
| 411 * + + + + + + +v+ + + + + + | |
| 412 * | | | | | |#|#|#|#| | | | | |
| 413 * + + + + + +v+ +v+v+ + + + | |
| 414 * | | | | | |#|#|#|#| | | | | |
| 415 * +-+-+-+-+-+-+-+-+-+-+-+-+ | |
| 416 * | |#|#| | |#| | |#|#|#| | |
| 417 * + +-+-+-+-+-+-+-+-+-+-+-+ | |
| 418 * | |#|#| | |#| | |#|#|#| | |
| 419 * + +-+-+-+-+-+-+-+-+-+-+-+ | |
| 420 * | |#|#| | |#| | |#|#|#| | |
| 421 * + +-+-+-+-+-+-+-+-+-+-+-+ | |
| 422 * | |#|#| | |#| | |#|#|#| | |
| 423 * + +-+-+-+-+-+-+-+-+-+-+-+ | |
| 424 * | |#|#| |#| | | |#|#|#| | |
| 425 * + +-+-+-+-+-+-+-+-+-+-+-+ | |
| 426 * | |#|#| | |#| | |#|#|#| | |
| 427 * + +-+-+-+-+-+-+-+-+-+-+-+ | |
| 428 * | |#|#| | |#| | |#|#|#| | |
| 429 * + +-+-+-+-+-+-+-+-+-+-+-+ | |
| 430 * | |#|#| | |#| | |#|#|#| | |
| 431 * + +-+-+-+-+-+-+-+-+-+-+-+ | |
| 432 * | |#|#| | |#| | |#|#|#| | |
| 433 * + +-+-+-+-+-+-+-+-+-+-+-+ | |
| 434 * |# |#|#|#|#|#|#|#|#|#| | |
| 435 * + +-+-+-+-+-+-+-+-+-+-+-+ | |
| 436 * | |
| 437 * We then perform the same merging process for rows as for | |
| 438 * columns - though there are no rows in the above example | |
| 439 * that can be merged. | |
| 440 * | |
| 441 * You'll note that, for example, we don't merge row 0 and | |
| 442 * row 1 in the above, because we have a pair of cells that | |
| 443 * are both full without crossing. | |
| 444 * | |
| 445 * Step 5: Cell spanning | |
| 446 * | |
| 447 * Now we actually start to output the table. We keep a 'sent_table' | |
| 448 * (a grid of W x H bools) to keep track of whether we've output | |
| 449 * the content for a given cell or not yet. | |
| 450 * | |
| 451 * For each cell we reach, assuming sent_table[x,y] is false, | |
| 452 * we merge it with as many cells on the right as required, | |
| 453 * according to 'v_crossed' values (subject to not passing | |
| 454 * v_lines or uncertainty == 0's). | |
| 455 * | |
| 456 * We then try to merge cells below according to 'h_crossed' | |
| 457 * values (subject to not passing h_lines or uncertainty == 0's). | |
| 458 * | |
| 459 * In theory this can leave us with some cases where we'd like | |
| 460 * to merge some cells (because of crossed) and can't (because | |
| 461 * of lines or sent_table[]) values. In the absence of better | |
| 462 * cell spanning algorithms we have no choice here. | |
| 463 * | |
| 464 * Then we output the contents and set sent_table[] values as | |
| 465 * appropriate. | |
| 466 * | |
| 467 * If a row has no cells in it, we currently omit the TR. If/when | |
| 468 * we figure out how to indicate rowspan/colspan in stext, we can | |
| 469 * revisit that. | |
| 470 */ | |
| 471 | |
| 472 | |
| 473 static fz_stext_block * | |
| 474 add_grid_block(fz_context *ctx, fz_stext_page *page, fz_stext_block **first_block, fz_stext_block **last_block) | |
| 475 { | |
| 476 fz_stext_block *block = fz_pool_alloc(ctx, page->pool, sizeof(**first_block)); | |
| 477 memset(block, 0, sizeof(*block)); | |
| 478 block->type = FZ_STEXT_BLOCK_GRID; | |
| 479 block->bbox = fz_empty_rect; /* Fixes bug 703267. */ | |
| 480 block->next = *first_block; | |
| 481 if (*first_block) | |
| 482 { | |
| 483 (*first_block)->prev = block; | |
| 484 assert(*last_block); | |
| 485 } | |
| 486 else | |
| 487 { | |
| 488 assert(*last_block == NULL); | |
| 489 *last_block = block; | |
| 490 } | |
| 491 *first_block = block; | |
| 492 return block; | |
| 493 } | |
| 494 | |
| 495 static void | |
| 496 insert_block_before(fz_stext_block *block, fz_stext_block *before, fz_stext_page *page, fz_stext_struct *dest) | |
| 497 { | |
| 498 if (before) | |
| 499 { | |
| 500 /* We have a block to insert it before, so we know it's not the last. */ | |
| 501 assert(dest ? (dest->first_block != NULL && dest->last_block != NULL) : (page->first_block != NULL && page->last_block != NULL)); | |
| 502 block->next = before; | |
| 503 block->prev = before->prev; | |
| 504 if (before->prev) | |
| 505 { | |
| 506 assert(before->prev->next == before); | |
| 507 before->prev->next = block; | |
| 508 } | |
| 509 else if (dest) | |
| 510 { | |
| 511 assert(dest->first_block == before); | |
| 512 dest->first_block = block; | |
| 513 } | |
| 514 else | |
| 515 { | |
| 516 assert(page->first_block == before); | |
| 517 page->first_block = block; | |
| 518 } | |
| 519 before->prev = block; | |
| 520 } | |
| 521 else if (dest) | |
| 522 { | |
| 523 /* Will be the last block. */ | |
| 524 block->next = NULL; | |
| 525 block->prev = dest->last_block; | |
| 526 if (dest->last_block) | |
| 527 { | |
| 528 assert(dest->last_block->next == NULL); | |
| 529 dest->last_block->next = block; | |
| 530 } | |
| 531 if (dest->first_block == NULL) | |
| 532 dest->first_block = block; | |
| 533 dest->last_block = block; | |
| 534 } | |
| 535 else | |
| 536 { | |
| 537 /* Will be the last block. */ | |
| 538 block->next = NULL; | |
| 539 block->prev = page->last_block; | |
| 540 if (page->last_block) | |
| 541 { | |
| 542 assert(page->last_block->next == NULL); | |
| 543 page->last_block->next = block; | |
| 544 } | |
| 545 if (page->first_block == NULL) | |
| 546 page->first_block = block; | |
| 547 page->last_block = block; | |
| 548 } | |
| 549 } | |
| 550 | |
| 551 static fz_stext_struct * | |
| 552 add_struct_block_before(fz_context *ctx, fz_stext_block *before, fz_stext_page *page, fz_stext_struct *parent, fz_structure std, const char *raw) | |
| 553 { | |
| 554 fz_stext_block *block; | |
| 555 int idx = 0; | |
| 556 size_t z; | |
| 557 fz_stext_struct *newstruct; | |
| 558 | |
| 559 if (raw == NULL) | |
| 560 raw = ""; | |
| 561 z = strlen(raw); | |
| 562 | |
| 563 /* We're going to insert a struct block. We need an idx, so walk the list */ | |
| 564 for (block = parent ? parent->first_block : page->first_block; block != before; block = block->next) | |
| 565 { | |
| 566 if (block->type == FZ_STEXT_BLOCK_STRUCT) | |
| 567 { | |
| 568 assert(block->u.s.index >= idx); | |
| 569 idx = block->u.s.index + 1; | |
| 570 } | |
| 571 } | |
| 572 /* So we'll add our block as idx. But all the other struct blocks that follow us need to have | |
| 573 * larger values. */ | |
| 574 | |
| 575 /* Update all the subsequent structs to have a higher idx */ | |
| 576 if (before) | |
| 577 { | |
| 578 int idx2 = idx+1; | |
| 579 for (block = before->next; block != NULL; block = block->next) | |
| 580 { | |
| 581 if (block->type != FZ_STEXT_BLOCK_STRUCT) | |
| 582 continue; | |
| 583 if (block->u.s.index > idx2) | |
| 584 break; | |
| 585 block->u.s.index = idx2++; | |
| 586 } | |
| 587 } | |
| 588 | |
| 589 /* Now make our new struct block and insert it. */ | |
| 590 block = fz_pool_alloc(ctx, page->pool, sizeof(*block)); | |
| 591 block->type = FZ_STEXT_BLOCK_STRUCT; | |
| 592 block->bbox = fz_empty_rect; /* Fixes bug 703267. */ | |
| 593 insert_block_before(block, before, page, parent); | |
| 594 | |
| 595 block->u.s.down = newstruct = fz_pool_alloc(ctx, page->pool, sizeof(*block->u.s.down) + z); | |
| 596 block->u.s.index = idx; | |
| 597 newstruct->parent = parent; | |
| 598 newstruct->standard = std; | |
| 599 memcpy(newstruct->raw, raw, z); | |
| 600 newstruct->raw[z] = 0; | |
| 601 newstruct->up = block; | |
| 602 | |
| 603 return newstruct; | |
| 604 } | |
| 605 | |
| 606 typedef struct | |
| 607 { | |
| 608 int len; | |
| 609 int max; | |
| 610 struct { | |
| 611 uint8_t left; | |
| 612 uint8_t weak; | |
| 613 float pos; | |
| 614 int freq; | |
| 615 } *list; | |
| 616 } div_list; | |
| 617 | |
| 618 static void | |
| 619 div_list_push(fz_context *ctx, div_list *div, int left, int weak, float pos) | |
| 620 { | |
| 621 int i; | |
| 622 | |
| 623 /* FIXME: Could be bsearch. */ | |
| 624 for (i = 0; i < div->len; i++) | |
| 625 { | |
| 626 if (div->list[i].pos > pos) | |
| 627 break; | |
| 628 else if (div->list[i].pos == pos && div->list[i].left == left) | |
| 629 { | |
| 630 div->list[i].freq++; | |
| 631 div->list[i].weak &= weak; | |
| 632 return; | |
| 633 } | |
| 634 } | |
| 635 | |
| 636 if (div->len == div->max) | |
| 637 { | |
| 638 int newmax = div->max * 2; | |
| 639 if (newmax == 0) | |
| 640 newmax = 32; | |
| 641 div->list = fz_realloc(ctx, div->list, sizeof(div->list[0]) * newmax); | |
| 642 div->max = newmax; | |
| 643 } | |
| 644 | |
| 645 if (i < div->len) | |
| 646 memmove(&div->list[i+1], &div->list[i], sizeof(div->list[0]) * (div->len - i)); | |
| 647 div->len++; | |
| 648 div->list[i].left = left; | |
| 649 div->list[i].weak = weak; | |
| 650 div->list[i].pos = pos; | |
| 651 div->list[i].freq = 1; | |
| 652 } | |
| 653 | |
| 654 static fz_stext_grid_positions * | |
| 655 make_table_positions(fz_context *ctx, div_list *xs, float min, float max) | |
| 656 { | |
| 657 int wind; | |
| 658 fz_stext_grid_positions *pos; | |
| 659 int len = xs->len; | |
| 660 int i; | |
| 661 int hi = 0; | |
| 662 | |
| 663 /* Count the number of edges */ | |
| 664 int local_min = 0; | |
| 665 int edges = 2; | |
| 666 | |
| 667 if (len == 0) | |
| 668 return NULL; | |
| 669 | |
| 670 assert(xs->list[0].left); | |
| 671 for (i = 0; i < len; i++) | |
| 672 { | |
| 673 if (xs->list[i].pos >= min) | |
| 674 break; | |
| 675 } | |
| 676 for (; i < len; i++) | |
| 677 { | |
| 678 if (xs->list[i].pos >= max) | |
| 679 break; | |
| 680 if (xs->list[i].left) | |
| 681 { | |
| 682 if (local_min) | |
| 683 edges++; | |
| 684 } | |
| 685 else | |
| 686 local_min = 1; | |
| 687 } | |
| 688 assert(!xs->list[len-1].left); | |
| 689 | |
| 690 pos = fz_malloc_flexible(ctx, fz_stext_grid_positions, list, edges); | |
| 691 pos->len = edges; | |
| 692 | |
| 693 /* Copy the edges in */ | |
| 694 wind = 0; | |
| 695 local_min = 0; | |
| 696 edges = 1; | |
| 697 pos->list[0].pos = min; | |
| 698 pos->list[0].min = min; | |
| 699 pos->list[0].max = fz_max(xs->list[0].pos, min); | |
| 700 pos->list[0].uncertainty = 0; | |
| 701 pos->list[0].reinforcement = 0; | |
| 702 #ifdef DEBUG_TABLE_HUNT | |
| 703 printf("|%g ", pos->list[0].pos); | |
| 704 #endif | |
| 705 /* Skip over entries to the left of min. */ | |
| 706 for (i = 0; i < len; i++) | |
| 707 { | |
| 708 if (xs->list[i].pos >= min) | |
| 709 break; | |
| 710 if (xs->list[i].left) | |
| 711 wind += xs->list[i].freq; | |
| 712 else | |
| 713 wind -= xs->list[i].freq; | |
| 714 } | |
| 715 for (; i < len; i++) | |
| 716 { | |
| 717 if (xs->list[i].pos >= max) | |
| 718 break; | |
| 719 if (xs->list[i].left) | |
| 720 { | |
| 721 if (local_min) | |
| 722 { | |
| 723 pos->list[edges].min = xs->list[i-1].pos; | |
| 724 pos->list[edges].max = xs->list[i].pos; | |
| 725 pos->list[edges].pos = (xs->list[i-1].pos + xs->list[i].pos)/2; | |
| 726 pos->list[edges].uncertainty = wind; | |
| 727 #ifdef DEBUG_TABLE_HUNT | |
| 728 if (wind) | |
| 729 printf("?%g(%d) ", pos->list[edges].pos, wind); | |
| 730 else | |
| 731 printf("|%g ", pos->list[edges].pos); | |
| 732 #endif | |
| 733 edges++; | |
| 734 } | |
| 735 wind += xs->list[i].freq; | |
| 736 if (wind > hi) | |
| 737 hi = wind; | |
| 738 } | |
| 739 else | |
| 740 { | |
| 741 wind -= xs->list[i].freq; | |
| 742 local_min = 1; | |
| 743 } | |
| 744 } | |
| 745 assert(i < len || wind == 0); | |
| 746 pos->list[edges].pos = max; | |
| 747 pos->list[edges].min = fz_min(xs->list[i-1].pos, max); | |
| 748 pos->list[edges].max = max; | |
| 749 assert(max >= xs->list[i-1].pos); | |
| 750 pos->list[edges].uncertainty = 0; | |
| 751 pos->list[edges].reinforcement = 0; | |
| 752 pos->max_uncertainty = hi; | |
| 753 #ifdef DEBUG_TABLE_HUNT | |
| 754 printf("|%g\n", pos->list[edges].pos); | |
| 755 #endif | |
| 756 | |
| 757 return pos; | |
| 758 } | |
| 759 | |
| 760 static fz_stext_grid_positions * | |
| 761 copy_grid_positions_to_pool(fz_context *ctx, fz_stext_page *page, fz_stext_grid_positions *xs) | |
| 762 { | |
| 763 size_t z = offsetof(fz_stext_grid_positions, list) + sizeof(xs->list[0]) * (xs->len); | |
| 764 fz_stext_grid_positions *xs2 = fz_pool_alloc(ctx, page->pool, z); | |
| 765 memcpy(xs2, xs, z); | |
| 766 return xs2; | |
| 767 } | |
| 768 | |
| 769 static void | |
| 770 sanitize_positions(fz_context *ctx, div_list *xs) | |
| 771 { | |
| 772 int i, j, wind, changed; | |
| 773 | |
| 774 #ifdef DEBUG_TABLE_HUNT | |
| 775 printf("OK:\n"); | |
| 776 for (i = 0; i < xs->len; i++) | |
| 777 { | |
| 778 if (xs->list[i].left) | |
| 779 printf("["); | |
| 780 printf("%g(%d%s)", xs->list[i].pos, xs->list[i].freq, xs->list[i].weak ? "weak" : ""); | |
| 781 if (!xs->list[i].left) | |
| 782 printf("]"); | |
| 783 printf(" "); | |
| 784 } | |
| 785 printf("\n"); | |
| 786 #endif | |
| 787 | |
| 788 if (xs->len == 0) | |
| 789 return; | |
| 790 | |
| 791 do | |
| 792 { | |
| 793 /* Now, combine runs of left and right */ | |
| 794 for (i = 0; i < xs->len; i++) | |
| 795 { | |
| 796 if (xs->list[i].left) | |
| 797 { | |
| 798 j = i; | |
| 799 while (i < xs->len-1 && xs->list[i+1].left) | |
| 800 { | |
| 801 i++; | |
| 802 xs->list[j].freq += xs->list[i].freq; | |
| 803 xs->list[j].weak &= xs->list[i].weak; | |
| 804 xs->list[i].freq = 0; | |
| 805 } | |
| 806 } | |
| 807 else | |
| 808 { | |
| 809 while (i < xs->len-1 && !xs->list[i+1].left) | |
| 810 { | |
| 811 i++; | |
| 812 xs->list[i].freq += xs->list[i-1].freq; | |
| 813 xs->list[i].weak &= xs->list[i-1].weak; | |
| 814 xs->list[i-1].freq = 0; | |
| 815 } | |
| 816 } | |
| 817 } | |
| 818 | |
| 819 #ifdef DEBUG_TABLE_HUNT | |
| 820 printf("Shrunk:\n"); | |
| 821 for (i = 0; i < xs->len; i++) | |
| 822 { | |
| 823 if (xs->list[i].left) | |
| 824 printf("["); | |
| 825 printf("%g(%d%s)", xs->list[i].pos, xs->list[i].freq, xs->list[i].weak ? "weak" : ""); | |
| 826 if (!xs->list[i].left) | |
| 827 printf("]"); | |
| 828 printf(" "); | |
| 829 } | |
| 830 printf("\n"); | |
| 831 #endif | |
| 832 | |
| 833 /* Now remove the 0 frequency ones. */ | |
| 834 j = 0; | |
| 835 for (i = 0; i < xs->len; i++) | |
| 836 { | |
| 837 if (xs->list[i].freq == 0) | |
| 838 continue; | |
| 839 if (i != j) | |
| 840 xs->list[j] = xs->list[i]; | |
| 841 j++; | |
| 842 } | |
| 843 xs->len = j; | |
| 844 | |
| 845 /* Now run across looking for local minima where at least one | |
| 846 * edge is 'weak'. If the wind at that point is non-zero, then | |
| 847 * remove the weak edges from consideration and retry. */ | |
| 848 wind = 0; | |
| 849 changed = 0; | |
| 850 i = 0; | |
| 851 while (1) | |
| 852 { | |
| 853 assert(xs->list[i].left); | |
| 854 for (; xs->list[i].left; i++) | |
| 855 { | |
| 856 wind += xs->list[i].freq; | |
| 857 } | |
| 858 assert(i < xs->len); | |
| 859 for (; xs->list[i].left == 0 && i < xs->len; i++) | |
| 860 { | |
| 861 wind -= xs->list[i].freq; | |
| 862 } | |
| 863 if (i == xs->len) | |
| 864 break; | |
| 865 if (wind != 0 && (xs->list[i-1].weak || xs->list[i].weak)) | |
| 866 { | |
| 867 int m = fz_mini(xs->list[i-1].freq, xs->list[i].freq); | |
| 868 assert(m > 0); | |
| 869 xs->list[i-1].freq -= m; | |
| 870 xs->list[i].freq -= m; | |
| 871 changed = 1; | |
| 872 } | |
| 873 } | |
| 874 } | |
| 875 while (changed); | |
| 876 | |
| 877 #ifdef DEBUG_TABLE_HUNT | |
| 878 printf("Compacted:\n"); | |
| 879 for (i = 0; i < xs->len; i++) | |
| 880 { | |
| 881 if (xs->list[i].left) | |
| 882 printf("["); | |
| 883 printf("%g(%d%s)", xs->list[i].pos, xs->list[i].freq, xs->list[i].weak ? "weak" : ""); | |
| 884 if (!xs->list[i].left) | |
| 885 printf("]"); | |
| 886 printf(" "); | |
| 887 } | |
| 888 printf("\n"); | |
| 889 #endif | |
| 890 } | |
| 891 | |
| 892 /* We want to check for whether a DIV that we are about to descend into | |
| 893 * contains a column of justified text. We will accept some headers in | |
| 894 * this text, but not JUST headers. */ | |
| 895 static int | |
| 896 all_blocks_are_justified_or_headers(fz_context *ctx, fz_stext_block *block) | |
| 897 { | |
| 898 int just_headers = 1; | |
| 899 | |
| 900 for (; block != NULL; block = block->next) | |
| 901 { | |
| 902 if (block->type == FZ_STEXT_BLOCK_STRUCT) | |
| 903 { | |
| 904 if (block->u.s.down == NULL) | |
| 905 continue; | |
| 906 if (block->u.s.down->standard == FZ_STRUCTURE_H || | |
| 907 block->u.s.down->standard == FZ_STRUCTURE_H1 || | |
| 908 block->u.s.down->standard == FZ_STRUCTURE_H2 || | |
| 909 block->u.s.down->standard == FZ_STRUCTURE_H3 || | |
| 910 block->u.s.down->standard == FZ_STRUCTURE_H4 || | |
| 911 block->u.s.down->standard == FZ_STRUCTURE_H5 || | |
| 912 block->u.s.down->standard == FZ_STRUCTURE_H6) | |
| 913 continue; | |
| 914 if (!all_blocks_are_justified_or_headers(ctx, block->u.s.down->first_block)) | |
| 915 return 0; | |
| 916 } | |
| 917 just_headers = 0; | |
| 918 if (block->type == FZ_STEXT_BLOCK_TEXT && block->u.t.flags != FZ_STEXT_TEXT_JUSTIFY_FULL) | |
| 919 return 0; | |
| 920 } | |
| 921 | |
| 922 if (just_headers) | |
| 923 return 0; | |
| 924 | |
| 925 return 1; | |
| 926 } | |
| 927 | |
| 928 #define TWO_INCHES (72*2) | |
| 929 | |
| 930 /* Walk through the blocks, finding the bbox. */ | |
| 931 static fz_rect | |
| 932 walk_to_find_bounds(fz_context *ctx, fz_stext_block *first_block) | |
| 933 { | |
| 934 fz_rect bounds = fz_empty_rect; | |
| 935 fz_stext_block *block; | |
| 936 fz_stext_line *line; | |
| 937 fz_stext_char *ch; | |
| 938 | |
| 939 for (block = first_block; block != NULL; block = block->next) | |
| 940 { | |
| 941 switch (block->type) | |
| 942 { | |
| 943 case FZ_STEXT_BLOCK_STRUCT: | |
| 944 if (!block->u.s.down) | |
| 945 continue; | |
| 946 if (block->u.s.down->standard == FZ_STRUCTURE_H) | |
| 947 { | |
| 948 if (block->next != NULL && | |
| 949 block->next->type == FZ_STEXT_BLOCK_TEXT && | |
| 950 block->next->u.t.flags == FZ_STEXT_TEXT_JUSTIFY_FULL) | |
| 951 continue; | |
| 952 } | |
| 953 bounds = fz_union_rect(bounds, walk_to_find_bounds(ctx, block->u.s.down->first_block)); | |
| 954 break; | |
| 955 case FZ_STEXT_BLOCK_VECTOR: | |
| 956 bounds = fz_union_rect(bounds, block->bbox); | |
| 957 break; | |
| 958 case FZ_STEXT_BLOCK_TEXT: | |
| 959 if (block->u.t.flags == FZ_STEXT_TEXT_JUSTIFY_FULL && block->bbox.x1 - block->bbox.x0 >= TWO_INCHES) | |
| 960 continue; | |
| 961 for (line = block->u.t.first_line; line != NULL; line = line->next) | |
| 962 { | |
| 963 for (ch = line->first_char; ch != NULL; ch = ch->next) | |
| 964 { | |
| 965 if (ch->c != ' ') | |
| 966 bounds = fz_union_rect(bounds, fz_rect_from_quad(ch->quad)); | |
| 967 } | |
| 968 } | |
| 969 break; | |
| 970 } | |
| 971 } | |
| 972 | |
| 973 return bounds; | |
| 974 } | |
| 975 | |
| 976 static void | |
| 977 walk_to_find_content(fz_context *ctx, div_list *xs, div_list *ys, fz_stext_block *first_block, fz_rect bounds) | |
| 978 { | |
| 979 fz_stext_block *block; | |
| 980 fz_stext_line *line; | |
| 981 fz_stext_char *ch; | |
| 982 | |
| 983 for (block = first_block; block != NULL; block = block->next) | |
| 984 { | |
| 985 switch (block->type) | |
| 986 { | |
| 987 case FZ_STEXT_BLOCK_STRUCT: | |
| 988 if (block->u.s.down && !fz_is_empty_rect(fz_intersect_rect(bounds, block->bbox))) | |
| 989 walk_to_find_content(ctx, xs, ys, block->u.s.down->first_block, bounds); | |
| 990 break; | |
| 991 case FZ_STEXT_BLOCK_VECTOR: | |
| 992 break; | |
| 993 case FZ_STEXT_BLOCK_TEXT: | |
| 994 { | |
| 995 fz_rect justified_region = fz_empty_rect; | |
| 996 for (line = block->u.t.first_line; line != NULL; line = line->next) | |
| 997 { | |
| 998 fz_rect region = fz_empty_rect; | |
| 999 | |
| 1000 region.y0 = line->bbox.y0; | |
| 1001 region.y1 = line->bbox.y1; | |
| 1002 | |
| 1003 if (region.y0 < bounds.y0) | |
| 1004 region.y0 = bounds.y0; | |
| 1005 if (region.y1 > bounds.y1) | |
| 1006 region.y1 = bounds.y1; | |
| 1007 if (region.y0 >= region.y1) | |
| 1008 break; | |
| 1009 | |
| 1010 /* Skip leading spaces. */ | |
| 1011 for (ch = line->first_char; ch != NULL; ch = ch->next) | |
| 1012 if (ch->c != ' ') | |
| 1013 break; | |
| 1014 | |
| 1015 for (; ch != NULL; ch = ch->next) | |
| 1016 { | |
| 1017 if (ch->c == ' ') | |
| 1018 { | |
| 1019 /* Find the last space char in this run. */ | |
| 1020 fz_stext_char *last_space; | |
| 1021 | |
| 1022 for (last_space = ch; last_space->next != NULL && last_space->next->c == ' '; last_space = last_space->next); | |
| 1023 | |
| 1024 /* If we're not the last char in the line (i.e. we're not a trailing space, | |
| 1025 * then send a 'weak' gap for the spaces, assuming it's sane to do so). */ | |
| 1026 if (last_space->next != NULL) | |
| 1027 { | |
| 1028 float rpos = fz_min(ch->quad.ll.x, ch->quad.ul.x); | |
| 1029 float lpos = fz_min(last_space->next->quad.ll.x, last_space->next->quad.ll.x); | |
| 1030 | |
| 1031 /* Clamp these to the bounds */ | |
| 1032 rpos = fz_clamp(rpos, bounds.x0, bounds.x1); | |
| 1033 lpos = fz_clamp(lpos, bounds.x0, bounds.x1); | |
| 1034 | |
| 1035 /* So we have a region (rpos...lpos) to add. */ | |
| 1036 /* This can be positioned in various different ways relative to the | |
| 1037 * current region: | |
| 1038 * [region] | |
| 1039 * (rpos..lpos) OK | |
| 1040 * (rpos..lpos) OK, but adjust lpos | |
| 1041 * (rpos..lpos) OK, but adjust rpos | |
| 1042 * (rpos..lpos) OK | |
| 1043 * (rpos .. lpos) OK | |
| 1044 */ | |
| 1045 if (lpos >= region.x1) | |
| 1046 { | |
| 1047 if (rpos >= region.x0 && rpos < region.x1) | |
| 1048 rpos = region.x1; | |
| 1049 } | |
| 1050 else if (rpos <= region.x0) | |
| 1051 { | |
| 1052 if (lpos > region.x0) | |
| 1053 lpos = region.x0; | |
| 1054 } | |
| 1055 else | |
| 1056 rpos = lpos; /* Make it an invalid region */ | |
| 1057 | |
| 1058 if (rpos < lpos) | |
| 1059 { | |
| 1060 /* Send a weak right at the start of the spaces... */ | |
| 1061 div_list_push(ctx, xs, 0, 1, rpos); | |
| 1062 /* and a weak left at the end. */ | |
| 1063 div_list_push(ctx, xs, 1, 1, lpos); | |
| 1064 } | |
| 1065 | |
| 1066 /* Expand the region as required. */ | |
| 1067 if (rpos < region.x0) | |
| 1068 region.x0 = rpos; | |
| 1069 if (lpos > region.x1) | |
| 1070 region.x1 = lpos; | |
| 1071 } | |
| 1072 /* Jump over the spaces */ | |
| 1073 ch = last_space; | |
| 1074 } | |
| 1075 else | |
| 1076 { | |
| 1077 float lpos = fz_min(ch->quad.ll.x, ch->quad.ul.x); | |
| 1078 float rpos = fz_max(ch->quad.lr.x, ch->quad.ur.x); | |
| 1079 if (lpos < region.x0) | |
| 1080 region.x0 = lpos; | |
| 1081 if (rpos > region.x1) | |
| 1082 region.x1 = rpos; | |
| 1083 } | |
| 1084 } | |
| 1085 if (!fz_is_empty_rect(region)) | |
| 1086 { | |
| 1087 div_list_push(ctx, xs, 1, 0, region.x0); | |
| 1088 div_list_push(ctx, xs, 0, 0, region.x1); | |
| 1089 /* For justified regions, we don't break after each line, but | |
| 1090 * rather before/after the region as a whole. */ | |
| 1091 if (block->u.t.flags != FZ_STEXT_TEXT_JUSTIFY_FULL) | |
| 1092 { | |
| 1093 div_list_push(ctx, ys, 1, 0, region.y0); | |
| 1094 div_list_push(ctx, ys, 0, 0, region.y1); | |
| 1095 } | |
| 1096 else | |
| 1097 justified_region = fz_union_rect(justified_region, region); | |
| 1098 } | |
| 1099 } | |
| 1100 if (!fz_is_empty_rect(justified_region) && block->u.t.flags == FZ_STEXT_TEXT_JUSTIFY_FULL) | |
| 1101 { | |
| 1102 div_list_push(ctx, ys, 1, 0, justified_region.y0); | |
| 1103 div_list_push(ctx, ys, 0, 0, justified_region.y1); | |
| 1104 } | |
| 1105 break; | |
| 1106 } | |
| 1107 } | |
| 1108 } | |
| 1109 } | |
| 1110 | |
| 1111 /* One of our datastructures (cells_t) is an array of details about the | |
| 1112 * cells that make up our table. It's a w * h array of cell_t's. Each | |
| 1113 * cell contains data on one of the cells in the table, as you'd expect. | |
| 1114 * | |
| 1115 * . . | |
| 1116 * . . | |
| 1117 * - - +-------+ - - | |
| 1118 * | . | |
| 1119 * | . | |
| 1120 * | . | |
| 1121 * - - + - - - + - - | |
| 1122 * . . | |
| 1123 * . . | |
| 1124 * | |
| 1125 * For any given cell, we store details about the top (lowest y coord) | |
| 1126 * and left (lowest x coord) edges. Specifically we store whether | |
| 1127 * there is a line at this position (h_line and v_line) (i.e. a drawn | |
| 1128 * border), and we also store whether content crosses this edge (h_crossed | |
| 1129 * and y_crossed). Finally, we store whether the cell has any content | |
| 1130 * in it at all (full). | |
| 1131 * | |
| 1132 * A table which has w positions across and h positions vertically, will | |
| 1133 * only really have (w-1) * (h-1) cells. We store w*h though to allow for | |
| 1134 * the right and bottom edges to have their lines represented. | |
| 1135 */ | |
| 1136 | |
| 1137 typedef struct | |
| 1138 { | |
| 1139 int h_line; | |
| 1140 int v_line; | |
| 1141 int h_crossed; | |
| 1142 int v_crossed; | |
| 1143 int full; | |
| 1144 } cell_t; | |
| 1145 | |
| 1146 typedef struct | |
| 1147 { | |
| 1148 int w; | |
| 1149 int h; | |
| 1150 cell_t cell[FZ_FLEXIBLE_ARRAY]; | |
| 1151 } cells_t; | |
| 1152 | |
| 1153 typedef struct | |
| 1154 { | |
| 1155 cells_t *cells; | |
| 1156 fz_stext_grid_positions *xpos; | |
| 1157 fz_stext_grid_positions *ypos; | |
| 1158 fz_rect bounds; | |
| 1159 } grid_walker_data; | |
| 1160 | |
| 1161 static cell_t * | |
| 1162 get_cell(cells_t *cells, int x, int y) | |
| 1163 { | |
| 1164 return &cells->cell[x + y * cells->w]; | |
| 1165 } | |
| 1166 | |
| 1167 #ifdef DEBUG_TABLE_STRUCTURE | |
| 1168 static void | |
| 1169 asciiart_table(grid_walker_data *gd); | |
| 1170 #endif | |
| 1171 | |
| 1172 static fz_stext_grid_positions * | |
| 1173 split_grid_pos(fz_context *ctx, grid_walker_data *gd, int row, int i, int early) | |
| 1174 { | |
| 1175 fz_stext_grid_positions **posp = row ? &gd->ypos : &gd->xpos; | |
| 1176 fz_stext_grid_positions *pos = *posp; | |
| 1177 int n = pos->len; | |
| 1178 int x, y, w, h; | |
| 1179 cells_t *cells; | |
| 1180 | |
| 1181 /* Realloc the required structs */ | |
| 1182 *posp = pos = fz_realloc_flexible(ctx, pos, fz_stext_grid_positions, list, n+1); | |
| 1183 cells = gd->cells = fz_realloc_flexible(ctx, gd->cells, cells_t, cell, (gd->cells->w + (1-row)) * (gd->cells->h + row)); | |
| 1184 /* If both pass, then we're safe to shuffle the data. */ | |
| 1185 | |
| 1186 #ifdef DEBUG_TABLE_STRUCTURE | |
| 1187 printf("Before split %s %d\n", row ? "row" : "col", i); | |
| 1188 asciiart_table(gd); | |
| 1189 #endif | |
| 1190 | |
| 1191 assert(i >= 0 && i < n); | |
| 1192 memmove(&pos->list[i+1], &pos->list[i], sizeof(pos->list[0]) * (n-i)); | |
| 1193 pos->len++; | |
| 1194 | |
| 1195 /* Next expand the cells. Only h_line and v_line are filled in so far. */ | |
| 1196 w = cells->w; | |
| 1197 h = cells->h; | |
| 1198 if (early && i > 0) | |
| 1199 i--, early = 0; | |
| 1200 if (row) | |
| 1201 { | |
| 1202 /* Add a row */ | |
| 1203 cells->h = h+1; | |
| 1204 /* Expand the table, duplicating row i */ | |
| 1205 memmove(&cells->cell[(i+1)*w], &cells->cell[i*w], sizeof(cells->cell[0])*(h-i)*w); | |
| 1206 | |
| 1207 if (early) | |
| 1208 { | |
| 1209 /* We are splitting row 0 into 0 and 1, with 0 being the new one. */ | |
| 1210 for (x = 0; x < w; x++) | |
| 1211 { | |
| 1212 cells->cell[x].h_line = 0; | |
| 1213 cells->cell[x].v_line = 0; | |
| 1214 } | |
| 1215 } | |
| 1216 else | |
| 1217 { | |
| 1218 /* We are splitting row i into i and i+1, with i+1 being the new one. */ | |
| 1219 /* v_lines are carried over. h_lines need to be unset. */ | |
| 1220 for (x = 0; x < w; x++) | |
| 1221 cells->cell[x + (i+1)*w].h_line = 0; | |
| 1222 } | |
| 1223 } | |
| 1224 else | |
| 1225 { | |
| 1226 /* Add a column */ | |
| 1227 cells->w = w+1; | |
| 1228 /* Expand the table, duplicating column i */ | |
| 1229 for (y = h-1; y >= 0; y--) | |
| 1230 { | |
| 1231 for (x = w; x > i; x--) | |
| 1232 cells->cell[x + y*(w+1)] = cells->cell[x-1 + y*w]; | |
| 1233 for (; x >= 0; x--) | |
| 1234 cells->cell[x + y*(w+1)] = cells->cell[x + y*w]; | |
| 1235 } | |
| 1236 if (early) | |
| 1237 { | |
| 1238 /* We are splitting col 0 into 0 and 1, with 0 being the new one. */ | |
| 1239 for (y = 0; y < h; y++) | |
| 1240 { | |
| 1241 cells->cell[y*(w+1)].h_line = 0; | |
| 1242 cells->cell[y*(w+1)].v_line = 0; | |
| 1243 } | |
| 1244 } | |
| 1245 else | |
| 1246 { | |
| 1247 /* h_lines are carried over. v_lines need to be reset */ | |
| 1248 for (y = 0; y < h; y++) | |
| 1249 cells->cell[i+1 + y*(w+1)].v_line = 0; | |
| 1250 } | |
| 1251 } | |
| 1252 | |
| 1253 #ifdef DEBUG_TABLE_STRUCTURE | |
| 1254 printf("After split\n"); | |
| 1255 asciiart_table(gd); | |
| 1256 #endif | |
| 1257 return pos; | |
| 1258 } | |
| 1259 | |
| 1260 /* This routine finds (and reinforces) grid positions for lines. | |
| 1261 * | |
| 1262 * If we have a thin line from (x0, y0) to (x1, y0), then we are | |
| 1263 * pretty sure that y0 will be on the edge of a cell. We are less | |
| 1264 * sure that x0 and x1 will match up to the edge of a cell. | |
| 1265 * Stylistically some tables overrun or underrun such lines. | |
| 1266 * | |
| 1267 * Similarly from (x0, y0) to (x0, y1), we expect x0 to be accurate | |
| 1268 * but y0 and y1 less so. | |
| 1269 * | |
| 1270 * If we have a wider rectangle, from (x0, y0) to (x1, y1) then | |
| 1271 * we fully expect all sides to be accurate. | |
| 1272 */ | |
| 1273 static int | |
| 1274 find_grid_pos(fz_context *ctx, grid_walker_data *gd, int row, float x, int inaccurate) | |
| 1275 { | |
| 1276 const int WIGGLE_ROOM = 2; | |
| 1277 int i; | |
| 1278 fz_stext_grid_positions *pos = row ? gd->ypos : gd->xpos; | |
| 1279 | |
| 1280 assert(x >= pos->list[0].min && x <= pos->list[pos->len-1].max); | |
| 1281 | |
| 1282 #ifdef DEBUG_TABLE_STRUCTURE | |
| 1283 printf("Looking for %g in %s splits:\n", x, row ? "row" : "col"); | |
| 1284 for (i = 0; i < pos->len; i++) | |
| 1285 { | |
| 1286 printf("%d\t%g\t%g\t%g\t%d\n", i, pos->list[i].min, pos->list[i].pos, pos->list[i].max, pos->list[i].reinforcement); | |
| 1287 } | |
| 1288 #endif | |
| 1289 | |
| 1290 while (inaccurate) /* So we can break out */ | |
| 1291 { | |
| 1292 float prev = 0; | |
| 1293 | |
| 1294 /* If we start/finish outside the range of the table, then we | |
| 1295 * want to extend the table. So ignore 'inaccurate' in this | |
| 1296 * case. Match the logic below. */ | |
| 1297 if (x < pos->list[0].min) | |
| 1298 break; | |
| 1299 if (x < pos->list[0].pos - WIGGLE_ROOM && pos->list[0].reinforcement > 0) | |
| 1300 break; | |
| 1301 if (x > pos->list[pos->len-1].max) | |
| 1302 break; | |
| 1303 if (x > pos->list[pos->len-1].pos + WIGGLE_ROOM && pos->list[pos->len-1].reinforcement > 0) | |
| 1304 break; | |
| 1305 | |
| 1306 /* Just find the closest one. No reinforcement. */ | |
| 1307 for (i = 0; i < pos->len; i++) | |
| 1308 { | |
| 1309 if (x < pos->list[i].min) | |
| 1310 { | |
| 1311 float mid = (prev + pos->list[i].min)/2; | |
| 1312 if (x < mid) | |
| 1313 return i-1; | |
| 1314 return i; | |
| 1315 } | |
| 1316 prev = pos->list[i].max; | |
| 1317 if (x <= prev) | |
| 1318 return i; | |
| 1319 } | |
| 1320 assert("Never happens" == NULL); | |
| 1321 | |
| 1322 return -1; | |
| 1323 } | |
| 1324 | |
| 1325 for (i = 0; i < pos->len; i++) | |
| 1326 { | |
| 1327 if (x < pos->list[i].min) | |
| 1328 { | |
| 1329 /* Split i into i and i+1, and make i the new one. */ | |
| 1330 assert(i > 0); | |
| 1331 #ifdef DEBUG_TABLE_STRUCTURE | |
| 1332 printf("Splitting before %d\n", i); | |
| 1333 #endif | |
| 1334 pos = split_grid_pos(ctx, gd, row, i, 1); | |
| 1335 pos->list[i-1].max = pos->list[i].min = (pos->list[i-1].max + x)/2; | |
| 1336 pos->list[i].pos = x; | |
| 1337 pos->list[i].max = pos->list[i+1].min = (pos->list[i+1].pos + x)/2; | |
| 1338 pos->list[i].reinforcement = 1; | |
| 1339 return i; | |
| 1340 } | |
| 1341 else if (x <= pos->list[i].max) | |
| 1342 { | |
| 1343 /* We are in the range for the ith divider. */ | |
| 1344 if (pos->list[i].reinforcement == 0) | |
| 1345 { | |
| 1346 /* If we've not been reinforced before, reinforce now. */ | |
| 1347 pos->list[i].pos = x; | |
| 1348 pos->list[i].reinforcement = 1; | |
| 1349 return i; | |
| 1350 } | |
| 1351 /* We've been reinforced before. This ought to be a pretty good | |
| 1352 * indication. */ | |
| 1353 if (pos->list[i].pos - WIGGLE_ROOM < x && x < pos->list[i].pos + WIGGLE_ROOM) | |
| 1354 { | |
| 1355 /* We are a close match to the previously predicted pos | |
| 1356 * value. */ | |
| 1357 pos->list[i].pos = pos->list[i].pos * pos->list[i].reinforcement + x; | |
| 1358 pos->list[i].pos /= ++pos->list[i].reinforcement; | |
| 1359 return i; | |
| 1360 } | |
| 1361 /* We need to split i into i and i+1. */ | |
| 1362 pos = split_grid_pos(ctx, gd, row, i, pos->list[i].pos > x); | |
| 1363 if (pos->list[i].pos > x) | |
| 1364 { | |
| 1365 /* Make i the new one */ | |
| 1366 #ifdef DEBUG_TABLE_STRUCTURE | |
| 1367 printf("Splitting %d (early)\n", i); | |
| 1368 #endif | |
| 1369 pos->list[i].pos = x; | |
| 1370 pos->list[i].max = pos->list[i+1].min = (pos->list[i+1].pos + x)/2; | |
| 1371 pos->list[i].reinforcement = 1; | |
| 1372 return i; | |
| 1373 } | |
| 1374 else | |
| 1375 { | |
| 1376 /* Make i+1 the new one */ | |
| 1377 #ifdef DEBUG_TABLE_STRUCTURE | |
| 1378 printf("Splitting %d (late)\n", i); | |
| 1379 #endif | |
| 1380 pos->list[i+1].pos = x; | |
| 1381 pos->list[i].max = pos->list[i+1].min = (pos->list[i].pos + x)/2; | |
| 1382 pos->list[i].reinforcement = 1; | |
| 1383 return i+1; | |
| 1384 } | |
| 1385 } | |
| 1386 } | |
| 1387 assert("Never happens" == NULL); | |
| 1388 | |
| 1389 return -1; | |
| 1390 } | |
| 1391 | |
| 1392 static int | |
| 1393 find_cell_l(fz_stext_grid_positions *pos, float x) | |
| 1394 { | |
| 1395 int i; | |
| 1396 | |
| 1397 for (i = 0; i < pos->len; i++) | |
| 1398 if (x < pos->list[i].pos) | |
| 1399 return i-1; | |
| 1400 | |
| 1401 return -1; | |
| 1402 } | |
| 1403 | |
| 1404 static int | |
| 1405 find_cell_r(fz_stext_grid_positions *pos, float x) | |
| 1406 { | |
| 1407 int i; | |
| 1408 | |
| 1409 for (i = 0; i < pos->len; i++) | |
| 1410 if (x <= pos->list[i].pos) | |
| 1411 return i-1; | |
| 1412 | |
| 1413 return -1; | |
| 1414 } | |
| 1415 | |
| 1416 /* Add a horizontal line. Return 1 if the line doesn't seem to be a border line. | |
| 1417 * Record which cells that was a border for. */ | |
| 1418 static void | |
| 1419 add_h_line(fz_context *ctx, grid_walker_data *gd, float x0, float x1, float y0, float y1) | |
| 1420 { | |
| 1421 int start = find_grid_pos(ctx, gd, 0, x0, 1); | |
| 1422 int end = find_grid_pos(ctx, gd, 0, x1, 1); | |
| 1423 float y = (y0 + y1) / 2; | |
| 1424 int yidx = find_grid_pos(ctx, gd, 1, y, 0); | |
| 1425 int i; | |
| 1426 | |
| 1427 assert(start > 0 && end > 0); | |
| 1428 | |
| 1429 for (i = start; i < end; i++) | |
| 1430 get_cell(gd->cells, i, yidx)->h_line++; | |
| 1431 } | |
| 1432 | |
| 1433 /* Add a vertical line. Return 1 if the line doesn't seem to be a border line. | |
| 1434 * Record which cells that was a border for. */ | |
| 1435 static void | |
| 1436 add_v_line(fz_context *ctx, grid_walker_data *gd, float y0, float y1, float x0, float x1) | |
| 1437 { | |
| 1438 int start = find_grid_pos(ctx, gd, 1, y0, 1); | |
| 1439 int end = find_grid_pos(ctx, gd, 1, y1, 1); | |
| 1440 float x = (x0 + x1) / 2; | |
| 1441 int xidx = find_grid_pos(ctx, gd, 0, x, 0); | |
| 1442 int i; | |
| 1443 | |
| 1444 for (i = start; i < end; i++) | |
| 1445 get_cell(gd->cells, xidx, i)->v_line++; | |
| 1446 } | |
| 1447 | |
| 1448 static void | |
| 1449 add_hv_line(fz_context *ctx, grid_walker_data *gd, float x0, float x1, float y0, float y1, int stroked) | |
| 1450 { | |
| 1451 int ix0 = find_grid_pos(ctx, gd, 0, x0, 0); | |
| 1452 int ix1 = find_grid_pos(ctx, gd, 0, x1, 0); | |
| 1453 int iy0 = find_grid_pos(ctx, gd, 1, y0, 0); | |
| 1454 int iy1 = find_grid_pos(ctx, gd, 1, y1, 0); | |
| 1455 int i; | |
| 1456 | |
| 1457 if (stroked) | |
| 1458 { | |
| 1459 for (i = ix0; i < ix1; i++) | |
| 1460 { | |
| 1461 get_cell(gd->cells, i, iy0)->h_line++; | |
| 1462 get_cell(gd->cells, i, iy1)->h_line++; | |
| 1463 } | |
| 1464 for (i = iy0; i < iy1; i++) | |
| 1465 { | |
| 1466 get_cell(gd->cells, ix0, i)->v_line++; | |
| 1467 get_cell(gd->cells, ix1, i)->v_line++; | |
| 1468 } | |
| 1469 } | |
| 1470 } | |
| 1471 | |
| 1472 /* Shared internal routine with stext-boxer.c */ | |
| 1473 fz_rect fz_collate_small_vector_run(fz_stext_block **blockp); | |
| 1474 | |
| 1475 static void | |
| 1476 walk_grid_lines(fz_context *ctx, grid_walker_data *gd, fz_stext_block *block) | |
| 1477 { | |
| 1478 for (; block != NULL; block = block->next) | |
| 1479 { | |
| 1480 if (block->type == FZ_STEXT_BLOCK_STRUCT) | |
| 1481 { | |
| 1482 if (block->u.s.down) | |
| 1483 walk_grid_lines(ctx, gd, block->u.s.down->first_block); | |
| 1484 continue; | |
| 1485 } | |
| 1486 else if (block->type == FZ_STEXT_BLOCK_VECTOR) | |
| 1487 { | |
| 1488 fz_rect r; | |
| 1489 float w, h; | |
| 1490 | |
| 1491 /* Only process rectangle blocks. */ | |
| 1492 if ((block->u.v.flags & FZ_STEXT_VECTOR_IS_RECTANGLE) == 0) | |
| 1493 continue; | |
| 1494 | |
| 1495 r = fz_collate_small_vector_run(&block); | |
| 1496 r = fz_intersect_rect(r, gd->bounds); | |
| 1497 if (!fz_is_valid_rect(r)) | |
| 1498 continue; | |
| 1499 | |
| 1500 w = r.x1 - r.x0; | |
| 1501 h = r.y1 - r.y0; | |
| 1502 if (w > h && h < 2) | |
| 1503 { | |
| 1504 /* Thin, wide line */ | |
| 1505 (void) add_h_line(ctx, gd, r.x0, r.x1, r.y0, r.y1); | |
| 1506 } | |
| 1507 else if (w < h && w < 2) | |
| 1508 { | |
| 1509 /* Thin, wide line */ | |
| 1510 (void) add_v_line(ctx, gd, r.y0, r.y1, r.x0, r.x1); | |
| 1511 } | |
| 1512 else | |
| 1513 { | |
| 1514 /* Rectangle */ | |
| 1515 (void) add_hv_line(ctx, gd, r.x0, r.x1, r.y0, r.y1, block->u.v.flags & FZ_STEXT_VECTOR_IS_STROKED); | |
| 1516 } | |
| 1517 } | |
| 1518 } | |
| 1519 } | |
| 1520 | |
| 1521 static int | |
| 1522 is_numeric(int c) | |
| 1523 { | |
| 1524 return (c >= '0' && c <= '9'); | |
| 1525 } | |
| 1526 | |
| 1527 static int | |
| 1528 mark_cells_for_content(fz_context *ctx, grid_walker_data *gd, fz_rect s) | |
| 1529 { | |
| 1530 fz_rect r = fz_intersect_rect(gd->bounds, s); | |
| 1531 int x0, x1, y0, y1, x, y; | |
| 1532 | |
| 1533 if (fz_is_empty_rect(r)) | |
| 1534 return 0; | |
| 1535 | |
| 1536 x0 = find_cell_l(gd->xpos, r.x0); | |
| 1537 x1 = find_cell_r(gd->xpos, r.x1); | |
| 1538 y0 = find_cell_l(gd->ypos, r.y0); | |
| 1539 y1 = find_cell_r(gd->ypos, r.y1); | |
| 1540 | |
| 1541 if (x0 < 0 || x1 < 0 || y0 < 0 || y1 < 0) | |
| 1542 return 1; | |
| 1543 if (x0 < x1) | |
| 1544 { | |
| 1545 for (y = y0; y <= y1; y++) | |
| 1546 for (x = x0; x < x1; x++) | |
| 1547 get_cell(gd->cells, x+1, y)->v_crossed++; | |
| 1548 } | |
| 1549 if (y0 < y1) | |
| 1550 { | |
| 1551 for (y = y0; y < y1; y++) | |
| 1552 for (x = x0; x <= x1; x++) | |
| 1553 get_cell(gd->cells, x, y+1)->h_crossed++; | |
| 1554 } | |
| 1555 for (y = y0; y <= y1; y++) | |
| 1556 for (x = x0; x <= x1; x++) | |
| 1557 get_cell(gd->cells, x, y)->full++; | |
| 1558 | |
| 1559 return 0; | |
| 1560 } | |
| 1561 | |
| 1562 #define IN_CELL 0 | |
| 1563 #define IN_BORDER 1 | |
| 1564 #define IN_UNKNOWN 2 | |
| 1565 | |
| 1566 #define SPLIT_MARGIN 4 | |
| 1567 static int | |
| 1568 where_is(fz_stext_grid_positions *pos, float x, int *in) | |
| 1569 { | |
| 1570 int i; | |
| 1571 | |
| 1572 *in = IN_UNKNOWN; | |
| 1573 | |
| 1574 /* If the table is empty, nothing to do. */ | |
| 1575 if (pos->len == 0) | |
| 1576 return -1; | |
| 1577 | |
| 1578 /* If we are completely outside the table, give up. */ | |
| 1579 if (x <= pos->list[0].pos - SPLIT_MARGIN || x >= pos->list[pos->len-1].max + SPLIT_MARGIN) | |
| 1580 return -1; | |
| 1581 | |
| 1582 for (i = 0; i < pos->len; i++) | |
| 1583 { | |
| 1584 /* Calculate the region (below..above) that counts as being | |
| 1585 * on the border of position i. */ | |
| 1586 float prev = i > 0 ? pos->list[i-1].max : pos->list[0].min; | |
| 1587 float next = i < pos->len-1 ? pos->list[i+1].min : pos->list[i].max; | |
| 1588 float below = pos->list[i].pos - SPLIT_MARGIN; | |
| 1589 float above = pos->list[i].pos + SPLIT_MARGIN; | |
| 1590 /* Find the distance half way back to the previous pos as | |
| 1591 * a limit to our margin. */ | |
| 1592 prev = (prev + pos->list[i].pos)/2; | |
| 1593 next = (next + pos->list[i].pos)/2; | |
| 1594 if (below < prev) | |
| 1595 below = prev; | |
| 1596 if (above > next) | |
| 1597 above = next; | |
| 1598 | |
| 1599 if (x < below) | |
| 1600 { | |
| 1601 *in = IN_CELL; | |
| 1602 return i-1; | |
| 1603 } | |
| 1604 else if (x <= above) | |
| 1605 { | |
| 1606 *in = IN_BORDER; | |
| 1607 return i; | |
| 1608 } | |
| 1609 } | |
| 1610 | |
| 1611 *in = IN_BORDER; | |
| 1612 return i-1; | |
| 1613 } | |
| 1614 | |
| 1615 enum | |
| 1616 { | |
| 1617 VECTOR_IS_CONTENT = 0, | |
| 1618 VECTOR_IS_BORDER = 1, | |
| 1619 VECTOR_IS_UNKNOWN = 2, | |
| 1620 VECTOR_IS_IGNORABLE = 3 | |
| 1621 }; | |
| 1622 | |
| 1623 /* So a vector can either be a border, or contained | |
| 1624 * in some cells, or something completely else. */ | |
| 1625 static int | |
| 1626 classify_vector(fz_context *ctx, grid_walker_data *gd, fz_rect r, int is_rect) | |
| 1627 { | |
| 1628 int at_x0, at_x1, at_y0, at_y1; | |
| 1629 int ix0, ix1, iy0, iy1; | |
| 1630 | |
| 1631 r = fz_intersect_rect(r, gd->bounds); | |
| 1632 if (fz_is_empty_rect(r)) | |
| 1633 return VECTOR_IS_IGNORABLE; | |
| 1634 ix0 = where_is(gd->xpos, r.x0, &at_x0); | |
| 1635 ix1 = where_is(gd->xpos, r.x1, &at_x1); | |
| 1636 iy0 = where_is(gd->ypos, r.y0, &at_y0); | |
| 1637 iy1 = where_is(gd->ypos, r.y1, &at_y1); | |
| 1638 | |
| 1639 /* No idea, just treat it as a border. */ | |
| 1640 if (at_x0 == IN_UNKNOWN || at_x1 == IN_UNKNOWN || at_y0 == IN_UNKNOWN || at_y1 == IN_UNKNOWN) | |
| 1641 return VECTOR_IS_IGNORABLE; | |
| 1642 | |
| 1643 if (at_x0 == IN_BORDER && at_x1 == IN_BORDER) | |
| 1644 { | |
| 1645 /* Vector is aligned along sides of cells. */ | |
| 1646 return is_rect ? VECTOR_IS_BORDER : VECTOR_IS_CONTENT; | |
| 1647 } | |
| 1648 if (at_y0 == IN_BORDER && at_y1 == IN_BORDER) | |
| 1649 { | |
| 1650 /* Vector is aligned along sides of cells. */ | |
| 1651 return is_rect ? VECTOR_IS_BORDER : VECTOR_IS_CONTENT; | |
| 1652 } | |
| 1653 if (at_x0 == IN_CELL && at_x1 == IN_CELL) | |
| 1654 { | |
| 1655 /* Content within a cell (or 1d range of cells). */ | |
| 1656 return VECTOR_IS_CONTENT; | |
| 1657 } | |
| 1658 if (at_y0 == IN_CELL && at_y1 == IN_CELL) | |
| 1659 { | |
| 1660 /* Content within a cell (or 1d range of cells). */ | |
| 1661 return VECTOR_IS_CONTENT; | |
| 1662 } | |
| 1663 if (at_x0 == IN_BORDER && at_x1 == IN_CELL && ix0 == ix1) | |
| 1664 { | |
| 1665 return is_rect ? VECTOR_IS_BORDER : VECTOR_IS_CONTENT; | |
| 1666 } | |
| 1667 if (at_x0 == IN_CELL && at_x1 == IN_BORDER && ix0+1 == ix1) | |
| 1668 { | |
| 1669 return is_rect ? VECTOR_IS_BORDER : VECTOR_IS_CONTENT; | |
| 1670 } | |
| 1671 if (at_y0 == IN_BORDER && at_y1 == IN_CELL && iy0 == iy1) | |
| 1672 { | |
| 1673 return is_rect ? VECTOR_IS_BORDER : VECTOR_IS_CONTENT; | |
| 1674 } | |
| 1675 if (at_y0 == IN_CELL && at_y1 == IN_BORDER && iy0+1 == iy1) | |
| 1676 { | |
| 1677 return is_rect ? VECTOR_IS_BORDER : VECTOR_IS_CONTENT; | |
| 1678 } | |
| 1679 | |
| 1680 if (is_rect) | |
| 1681 { | |
| 1682 return VECTOR_IS_IGNORABLE; | |
| 1683 } | |
| 1684 | |
| 1685 /* unknown - take this as indication that this maybe isn't | |
| 1686 * table. */ | |
| 1687 return VECTOR_IS_UNKNOWN; | |
| 1688 } | |
| 1689 | |
| 1690 #undef IN_CELL | |
| 1691 #undef IN_BORDER | |
| 1692 #undef IN_UNKNOWN | |
| 1693 | |
| 1694 | |
| 1695 /* Walk through the content, looking at how it spans our grid. | |
| 1696 * Record gridlines, which cells have content that cross into | |
| 1697 * neighbours, and which cells have any content at all. | |
| 1698 * Return a count of vector graphics that are found that don't | |
| 1699 * look plausibly like cell contents. */ | |
| 1700 static int | |
| 1701 calculate_spanned_content(fz_context *ctx, grid_walker_data *gd, fz_stext_block *block) | |
| 1702 { | |
| 1703 int duff = 0; | |
| 1704 fz_rect bounds = { | |
| 1705 gd->xpos->list[0].pos, | |
| 1706 gd->ypos->list[0].pos, | |
| 1707 gd->xpos->list[gd->xpos->len-1].pos, | |
| 1708 gd->ypos->list[gd->ypos->len-1].pos }; | |
| 1709 | |
| 1710 for (; block != NULL; block = block->next) | |
| 1711 { | |
| 1712 if (block->type == FZ_STEXT_BLOCK_STRUCT) | |
| 1713 { | |
| 1714 if (block->u.s.down) | |
| 1715 duff += calculate_spanned_content(ctx, gd, block->u.s.down->first_block); | |
| 1716 continue; | |
| 1717 } | |
| 1718 else if (block->type == FZ_STEXT_BLOCK_VECTOR) | |
| 1719 { | |
| 1720 switch (classify_vector(ctx, gd, block->bbox, !!(block->u.v.flags & FZ_STEXT_VECTOR_IS_RECTANGLE))) | |
| 1721 { | |
| 1722 case VECTOR_IS_CONTENT: | |
| 1723 mark_cells_for_content(ctx, gd, block->bbox); | |
| 1724 break; | |
| 1725 case VECTOR_IS_BORDER: | |
| 1726 case VECTOR_IS_IGNORABLE: | |
| 1727 break; | |
| 1728 default: | |
| 1729 duff++; | |
| 1730 break; | |
| 1731 } | |
| 1732 } | |
| 1733 else if (block->type == FZ_STEXT_BLOCK_TEXT) | |
| 1734 { | |
| 1735 fz_stext_line *line; | |
| 1736 | |
| 1737 if (block->bbox.x0 >= bounds.x1 || block->bbox.y0 >= bounds.y1 || | |
| 1738 block->bbox.x1 <= bounds.x0 || block->bbox.y1 <= bounds.y0) | |
| 1739 continue; | |
| 1740 | |
| 1741 for (line = block->u.t.first_line; line != NULL; line = line->next) | |
| 1742 { | |
| 1743 fz_stext_char *ch = line->first_char; | |
| 1744 int was_numeric = 0; | |
| 1745 | |
| 1746 /* Skip leading spaces */ | |
| 1747 while (ch != NULL && ch->c == ' ') | |
| 1748 ch = ch->next; | |
| 1749 | |
| 1750 for (; ch != NULL; ch = ch->next) | |
| 1751 { | |
| 1752 if (ch->c == 32) | |
| 1753 { | |
| 1754 /* Trailing space, skip it. */ | |
| 1755 if (ch->next == NULL) | |
| 1756 break; | |
| 1757 if (ch->next->c == 32) | |
| 1758 { | |
| 1759 /* Run of spaces. Skip 'em. */ | |
| 1760 while (ch->next && ch->next->c == 32) | |
| 1761 ch = ch->next; | |
| 1762 was_numeric = 0; | |
| 1763 continue; | |
| 1764 } | |
| 1765 if (was_numeric || is_numeric(ch->next->c)) | |
| 1766 { | |
| 1767 /* Single spaces around numbers are ignored. */ | |
| 1768 was_numeric = 0; | |
| 1769 continue; | |
| 1770 } | |
| 1771 /* A single space. Accept it. */ | |
| 1772 was_numeric = 0; | |
| 1773 } | |
| 1774 else | |
| 1775 was_numeric = is_numeric(ch->c); | |
| 1776 duff += mark_cells_for_content(ctx, gd, fz_rect_from_quad(ch->quad)); | |
| 1777 } | |
| 1778 } | |
| 1779 } | |
| 1780 } | |
| 1781 | |
| 1782 return duff; | |
| 1783 } | |
| 1784 | |
| 1785 static cells_t *new_cells(fz_context *ctx, int w, int h) | |
| 1786 { | |
| 1787 cells_t *cells = fz_malloc_flexible(ctx, cells_t, cell, w * h); | |
| 1788 cells->w = w; | |
| 1789 cells->h = h; | |
| 1790 | |
| 1791 return cells; | |
| 1792 } | |
| 1793 | |
| 1794 #ifdef DEBUG_TABLE_STRUCTURE | |
| 1795 static void | |
| 1796 asciiart_table(grid_walker_data *gd) | |
| 1797 { | |
| 1798 int w = gd->xpos->len; | |
| 1799 int h = gd->ypos->len; | |
| 1800 int x, y; | |
| 1801 | |
| 1802 for (y = 0; y < h; y++) | |
| 1803 { | |
| 1804 for (x = 0; x < w-1; x++) | |
| 1805 { | |
| 1806 cell_t *cell = get_cell(gd->cells, x, y); | |
| 1807 int line = cell->h_line; | |
| 1808 int erase = cell->h_crossed; | |
| 1809 printf("+"); | |
| 1810 if (line && !erase) | |
| 1811 { | |
| 1812 printf("-"); | |
| 1813 } | |
| 1814 else if (!line && erase) | |
| 1815 { | |
| 1816 printf("v"); | |
| 1817 } | |
| 1818 else if (line && erase) | |
| 1819 { | |
| 1820 printf("*"); | |
| 1821 } | |
| 1822 else | |
| 1823 { | |
| 1824 printf(" "); | |
| 1825 } | |
| 1826 } | |
| 1827 printf("+\n"); | |
| 1828 if (y == h-1) | |
| 1829 break; | |
| 1830 for (x = 0; x < w; x++) | |
| 1831 { | |
| 1832 cell_t *cell = get_cell(gd->cells, x, y); | |
| 1833 int line = cell->v_line; | |
| 1834 int erase = cell->v_crossed; | |
| 1835 if (line && !erase) | |
| 1836 { | |
| 1837 printf("|"); | |
| 1838 } | |
| 1839 else if (!line && erase) | |
| 1840 { | |
| 1841 printf(">"); | |
| 1842 } | |
| 1843 else if (line && erase) | |
| 1844 { | |
| 1845 printf("*"); | |
| 1846 } | |
| 1847 else | |
| 1848 { | |
| 1849 printf(" "); | |
| 1850 } | |
| 1851 if (x < w-1) | |
| 1852 { | |
| 1853 if (cell->full) | |
| 1854 printf("#"); | |
| 1855 else | |
| 1856 printf(" "); | |
| 1857 } | |
| 1858 else | |
| 1859 printf("\n"); | |
| 1860 } | |
| 1861 } | |
| 1862 } | |
| 1863 #endif | |
| 1864 | |
| 1865 static void | |
| 1866 recalc_bbox(fz_stext_block *block) | |
| 1867 { | |
| 1868 fz_rect bbox = fz_empty_rect; | |
| 1869 fz_stext_line *line; | |
| 1870 | |
| 1871 for (line = block->u.t.first_line; line != NULL; line = line->next) | |
| 1872 bbox = fz_union_rect(bbox, line->bbox); | |
| 1873 | |
| 1874 block->bbox = bbox; | |
| 1875 } | |
| 1876 | |
| 1877 static void | |
| 1878 unlink_line_from_block(fz_stext_line *line, fz_stext_block *block) | |
| 1879 { | |
| 1880 fz_stext_line *next_line = line->next; | |
| 1881 | |
| 1882 if (line->prev) | |
| 1883 { | |
| 1884 assert(line->prev->next == line); | |
| 1885 line->prev->next = next_line; | |
| 1886 } | |
| 1887 else | |
| 1888 { | |
| 1889 assert(block->u.t.first_line == line); | |
| 1890 block->u.t.first_line = next_line; | |
| 1891 } | |
| 1892 if (next_line) | |
| 1893 { | |
| 1894 assert(next_line->prev == line); | |
| 1895 next_line->prev = line->prev; | |
| 1896 } | |
| 1897 else | |
| 1898 { | |
| 1899 assert(block->u.t.last_line == line); | |
| 1900 block->u.t.last_line = line->prev; | |
| 1901 } | |
| 1902 } | |
| 1903 | |
| 1904 static void | |
| 1905 append_line_to_block(fz_stext_line *line, fz_stext_block *block) | |
| 1906 { | |
| 1907 if (block->u.t.last_line == NULL) | |
| 1908 { | |
| 1909 assert(block->u.t.first_line == NULL); | |
| 1910 block->u.t.first_line = block->u.t.last_line = line; | |
| 1911 line->prev = NULL; | |
| 1912 } | |
| 1913 else | |
| 1914 { | |
| 1915 assert(block->u.t.last_line->next == NULL); | |
| 1916 line->prev = block->u.t.last_line; | |
| 1917 block->u.t.last_line->next = line; | |
| 1918 block->u.t.last_line = line; | |
| 1919 } | |
| 1920 line->next = NULL; | |
| 1921 } | |
| 1922 | |
| 1923 static void | |
| 1924 unlink_block(fz_stext_block *block, fz_stext_block **first, fz_stext_block **last) | |
| 1925 { | |
| 1926 if (block->prev) | |
| 1927 { | |
| 1928 assert(block->prev->next == block); | |
| 1929 block->prev->next = block->next; | |
| 1930 } | |
| 1931 else | |
| 1932 { | |
| 1933 assert(*first == block); | |
| 1934 *first = block->next; | |
| 1935 } | |
| 1936 if (block->next) | |
| 1937 { | |
| 1938 assert(block->next->prev == block); | |
| 1939 block->next->prev = block->prev; | |
| 1940 } | |
| 1941 else | |
| 1942 { | |
| 1943 assert(*last == block); | |
| 1944 *last = block->prev; | |
| 1945 } | |
| 1946 } | |
| 1947 | |
| 1948 #ifndef NDEBUG | |
| 1949 static int | |
| 1950 verify_stext(fz_context *ctx, fz_stext_page *page, fz_stext_struct *src) | |
| 1951 { | |
| 1952 fz_stext_block *block; | |
| 1953 fz_stext_block **first = src ? &src->first_block : &page->first_block; | |
| 1954 fz_stext_block **last = src ? &src->last_block : &page->last_block; | |
| 1955 int max = 0; | |
| 1956 | |
| 1957 assert((*first == NULL) == (*last == NULL)); | |
| 1958 | |
| 1959 for (block = *first; block != NULL; block = block->next) | |
| 1960 { | |
| 1961 fz_stext_line *line; | |
| 1962 | |
| 1963 if (block->prev == NULL) | |
| 1964 assert(*first == block); | |
| 1965 else | |
| 1966 assert(block->prev->next == block); | |
| 1967 if (block->next == NULL) | |
| 1968 assert(*last == block); | |
| 1969 else | |
| 1970 assert(block->next->prev == block); | |
| 1971 | |
| 1972 if (block->type == FZ_STEXT_BLOCK_STRUCT) | |
| 1973 { | |
| 1974 if (block->u.s.down) | |
| 1975 { | |
| 1976 int m = verify_stext(ctx, page, block->u.s.down); | |
| 1977 if (m > max) | |
| 1978 max = m; | |
| 1979 } | |
| 1980 continue; | |
| 1981 } | |
| 1982 if (block->type != FZ_STEXT_BLOCK_TEXT) | |
| 1983 continue; | |
| 1984 assert((block->u.t.first_line == NULL) == (block->u.t.last_line == NULL)); | |
| 1985 for (line = block->u.t.first_line; line != NULL; line = line->next) | |
| 1986 { | |
| 1987 fz_stext_char *ch; | |
| 1988 | |
| 1989 if (line->next == NULL) | |
| 1990 assert(block->u.t.last_line == line); | |
| 1991 else | |
| 1992 assert(line->next->prev == line); | |
| 1993 | |
| 1994 assert((line->first_char == NULL) == (line->last_char == NULL)); | |
| 1995 | |
| 1996 for (ch = line->first_char; ch != NULL; ch = ch->next) | |
| 1997 assert(ch->next != NULL || line->last_char == ch); | |
| 1998 } | |
| 1999 } | |
| 2000 | |
| 2001 return max+1; | |
| 2002 } | |
| 2003 #endif | |
| 2004 | |
| 2005 static fz_rect | |
| 2006 move_contained_content(fz_context *ctx, fz_stext_page *page, fz_stext_struct *dest, fz_stext_struct *src, fz_rect r) | |
| 2007 { | |
| 2008 fz_stext_block *before = dest ? dest->first_block : page->first_block; | |
| 2009 fz_stext_block **sfirst = src ? &src->first_block : &page->first_block; | |
| 2010 fz_stext_block **slast = src ? &src->last_block : &page->last_block; | |
| 2011 fz_stext_block *block, *next_block; | |
| 2012 | |
| 2013 for (block = *sfirst; block != NULL; block = next_block) | |
| 2014 { | |
| 2015 fz_rect bbox = fz_intersect_rect(block->bbox, r); | |
| 2016 next_block = block->next; | |
| 2017 /* Don't use fz_is_empty_rect here, as that will exclude zero height areas like spaces. */ | |
| 2018 if (bbox.x0 > bbox.x1 || bbox.y0 > bbox.y1) | |
| 2019 continue; /* Trivially excluded */ | |
| 2020 if (bbox.x0 == block->bbox.x0 && bbox.y0 == block->bbox.y0 && bbox.x1 == block->bbox.x1 && bbox.y1 == block->bbox.y1) | |
| 2021 { | |
| 2022 /* Trivially included */ | |
| 2023 if (block->type == FZ_STEXT_BLOCK_STRUCT && block->u.s.down) | |
| 2024 { | |
| 2025 if (block->u.s.down->standard == FZ_STRUCTURE_TD) | |
| 2026 { | |
| 2027 /* Don't copy TDs! Just copy the contents */ | |
| 2028 move_contained_content(ctx, page, dest, block->u.s.down, r); | |
| 2029 continue; | |
| 2030 } | |
| 2031 } | |
| 2032 unlink_block(block, sfirst, slast); | |
| 2033 insert_block_before(block, before, page, dest); | |
| 2034 assert(before == block->next); | |
| 2035 continue; | |
| 2036 } | |
| 2037 if (block->type == FZ_STEXT_BLOCK_STRUCT) | |
| 2038 { | |
| 2039 if (block->u.s.down) | |
| 2040 move_contained_content(ctx, page, dest, block->u.s.down, r); | |
| 2041 } | |
| 2042 if (block->type == FZ_STEXT_BLOCK_TEXT) | |
| 2043 { | |
| 2044 /* Partially included text block */ | |
| 2045 fz_stext_line *line, *next_line; | |
| 2046 fz_stext_block *newblock = NULL; | |
| 2047 | |
| 2048 for (line = block->u.t.first_line; line != NULL; line = next_line) | |
| 2049 { | |
| 2050 fz_rect lrect = fz_intersect_rect(line->bbox, r); | |
| 2051 next_line = line->next; | |
| 2052 | |
| 2053 /* Don't use fz_is_empty_rect here, as that will exclude zero height areas like spaces. */ | |
| 2054 if (lrect.x0 > lrect.x1 || lrect.y0 > lrect.y1) | |
| 2055 continue; /* Trivial exclusion */ | |
| 2056 if (line->bbox.x0 == lrect.x0 && line->bbox.y0 == lrect.y0 && line->bbox.x1 == lrect.x1 && line->bbox.y1 == lrect.y1) | |
| 2057 { | |
| 2058 /* Trivial inclusion */ | |
| 2059 if (newblock == NULL) | |
| 2060 { | |
| 2061 newblock = fz_pool_alloc(ctx, page->pool, sizeof(fz_stext_block)); | |
| 2062 insert_block_before(newblock, before, page, dest); | |
| 2063 newblock->u.t.flags = block->u.t.flags; | |
| 2064 assert(before == newblock->next); | |
| 2065 } | |
| 2066 | |
| 2067 unlink_line_from_block(line, block); | |
| 2068 append_line_to_block(line, newblock); | |
| 2069 } | |
| 2070 else | |
| 2071 { | |
| 2072 /* Need to walk the line and just take parts */ | |
| 2073 fz_stext_line *newline = NULL; | |
| 2074 fz_stext_char *ch, *next_ch, *prev_ch = NULL; | |
| 2075 | |
| 2076 for (ch = line->first_char; ch != NULL; ch = next_ch) | |
| 2077 { | |
| 2078 fz_rect crect = fz_rect_from_quad(ch->quad); | |
| 2079 float x = (crect.x0 + crect.x1)/2; | |
| 2080 float y = (crect.y0 + crect.y1)/2; | |
| 2081 next_ch = ch->next; | |
| 2082 if (r.x0 > x || r.x1 < x || r.y0 > y || r.y1 < y) | |
| 2083 { | |
| 2084 prev_ch = ch; | |
| 2085 continue; | |
| 2086 } | |
| 2087 /* Take this char */ | |
| 2088 if (newline == NULL) | |
| 2089 { | |
| 2090 newline = fz_pool_alloc(ctx, page->pool, sizeof(*newline)); | |
| 2091 newline->dir = line->dir; | |
| 2092 newline->wmode = line->wmode; | |
| 2093 newline->bbox = fz_empty_rect; | |
| 2094 } | |
| 2095 /* Unlink char */ | |
| 2096 if (prev_ch == NULL) | |
| 2097 line->first_char = next_ch; | |
| 2098 else | |
| 2099 prev_ch->next = next_ch; | |
| 2100 if (next_ch == NULL) | |
| 2101 line->last_char = prev_ch; | |
| 2102 /* Relink char */ | |
| 2103 ch->next = NULL; | |
| 2104 if (newline->last_char == NULL) | |
| 2105 newline->first_char = ch; | |
| 2106 else | |
| 2107 newline->last_char->next = ch; | |
| 2108 newline->last_char = ch; | |
| 2109 newline->bbox = fz_union_rect(newline->bbox, crect); | |
| 2110 } | |
| 2111 if (line->first_char == NULL) | |
| 2112 { | |
| 2113 /* We've removed all the chars from this line. | |
| 2114 * Better remove the line too. */ | |
| 2115 if (line->prev) | |
| 2116 line->prev->next = next_line; | |
| 2117 else | |
| 2118 block->u.t.first_line = next_line; | |
| 2119 if (next_line) | |
| 2120 next_line->prev = line->prev; | |
| 2121 else | |
| 2122 block->u.t.last_line = line->prev; | |
| 2123 } | |
| 2124 if (newline) | |
| 2125 { | |
| 2126 if (newblock == NULL) | |
| 2127 { | |
| 2128 newblock = fz_pool_alloc(ctx, page->pool, sizeof(fz_stext_block)); | |
| 2129 | |
| 2130 /* Add the block onto our target list */ | |
| 2131 insert_block_before(newblock, before, page, dest); | |
| 2132 newblock->u.t.flags = block->u.t.flags; | |
| 2133 before = newblock->next; | |
| 2134 } | |
| 2135 append_line_to_block(newline, newblock); | |
| 2136 } | |
| 2137 } | |
| 2138 } | |
| 2139 if (newblock) | |
| 2140 { | |
| 2141 recalc_bbox(block); | |
| 2142 recalc_bbox(newblock); | |
| 2143 } | |
| 2144 if (block->u.t.first_line == NULL) | |
| 2145 { | |
| 2146 /* We've removed all the lines from the block. Should remove that too! */ | |
| 2147 if (block->prev) | |
| 2148 { | |
| 2149 assert(block->prev->next == block); | |
| 2150 block->prev->next = next_block; | |
| 2151 } | |
| 2152 else | |
| 2153 { | |
| 2154 assert(*sfirst == block); | |
| 2155 *sfirst = block->next; | |
| 2156 } | |
| 2157 if (next_block) | |
| 2158 { | |
| 2159 assert(next_block->prev == block); | |
| 2160 next_block->prev = block->prev; | |
| 2161 } | |
| 2162 else | |
| 2163 { | |
| 2164 assert(*slast == block); | |
| 2165 *slast = block->prev; | |
| 2166 } | |
| 2167 } | |
| 2168 } | |
| 2169 } | |
| 2170 | |
| 2171 return r; | |
| 2172 } | |
| 2173 | |
| 2174 typedef struct | |
| 2175 { | |
| 2176 fz_stext_block *block; | |
| 2177 fz_stext_struct *parent; | |
| 2178 } tree_pos; | |
| 2179 | |
| 2180 /* This is still not perfect, but it's better! */ | |
| 2181 static tree_pos | |
| 2182 find_table_insertion_point(fz_context *ctx, fz_rect r, tree_pos current, tree_pos best) | |
| 2183 { | |
| 2184 fz_stext_block *block; | |
| 2185 | |
| 2186 for (block = current.block; block != NULL; block = block->next) | |
| 2187 { | |
| 2188 if (block->type == FZ_STEXT_BLOCK_STRUCT) | |
| 2189 { | |
| 2190 if (block->u.s.down != NULL && fz_is_rect_inside_rect(r, block->bbox)) | |
| 2191 { | |
| 2192 tree_pos down; | |
| 2193 | |
| 2194 down.parent = block->u.s.down; | |
| 2195 down.block = block->u.s.down->first_block; | |
| 2196 best = find_table_insertion_point(ctx, r, down, best); | |
| 2197 } | |
| 2198 } | |
| 2199 else | |
| 2200 { | |
| 2201 /* Is block a better precursor than best? (Or a valid precursor, if best.block == NULL) */ | |
| 2202 if (block->bbox.y1 < r.y0 && (best.block == NULL || best.block->bbox.y1 < block->bbox.y1)) | |
| 2203 { | |
| 2204 best.block = block; | |
| 2205 best.parent = current.parent; | |
| 2206 } | |
| 2207 } | |
| 2208 } | |
| 2209 | |
| 2210 return best; | |
| 2211 } | |
| 2212 | |
| 2213 static int | |
| 2214 tr_is_empty(fz_context *ctx, fz_stext_block *block) | |
| 2215 { | |
| 2216 for (; block != NULL; block = block->next) | |
| 2217 { | |
| 2218 if (block->type != FZ_STEXT_BLOCK_STRUCT) | |
| 2219 return 0; | |
| 2220 if (!block->u.s.down) | |
| 2221 { | |
| 2222 } | |
| 2223 else if (block->u.s.down->standard != FZ_STRUCTURE_TD) | |
| 2224 return 0; | |
| 2225 else if (block->u.s.down->first_block != NULL) | |
| 2226 return 0; | |
| 2227 } | |
| 2228 | |
| 2229 return 1; | |
| 2230 } | |
| 2231 | |
| 2232 static int | |
| 2233 table_is_empty(fz_context *ctx, fz_stext_block *block) | |
| 2234 { | |
| 2235 for (; block != NULL; block = block->next) | |
| 2236 { | |
| 2237 if (block->type == FZ_STEXT_BLOCK_GRID) | |
| 2238 continue; | |
| 2239 if (block->type != FZ_STEXT_BLOCK_STRUCT) | |
| 2240 return 0; | |
| 2241 if (!block->u.s.down) | |
| 2242 { | |
| 2243 } | |
| 2244 else if (block->u.s.down->standard != FZ_STRUCTURE_TR) | |
| 2245 return 0; | |
| 2246 else if (!tr_is_empty(ctx, block->u.s.down->first_block)) | |
| 2247 return 0; | |
| 2248 } | |
| 2249 | |
| 2250 return 1; | |
| 2251 } | |
| 2252 | |
| 2253 static void | |
| 2254 tidy_td_divs(fz_context *ctx, fz_stext_struct *parent) | |
| 2255 { | |
| 2256 while (1) | |
| 2257 { | |
| 2258 fz_stext_block *block = parent->first_block; | |
| 2259 | |
| 2260 if (block == NULL || block->next != NULL || block->type != FZ_STEXT_BLOCK_STRUCT || block->u.s.down == NULL || block->u.s.down->standard != FZ_STRUCTURE_DIV) | |
| 2261 return; | |
| 2262 | |
| 2263 parent->first_block = block->u.s.down->first_block; | |
| 2264 parent->last_block = block->u.s.down->last_block; | |
| 2265 } | |
| 2266 } | |
| 2267 | |
| 2268 static void | |
| 2269 tidy_tr_divs(fz_context *ctx, fz_stext_block *block) | |
| 2270 { | |
| 2271 for (; block != NULL; block = block->next) | |
| 2272 { | |
| 2273 if (block->type == FZ_STEXT_BLOCK_STRUCT && block->u.s.down && block->u.s.down->standard == FZ_STRUCTURE_TD) | |
| 2274 tidy_td_divs(ctx, block->u.s.down); | |
| 2275 } | |
| 2276 } | |
| 2277 | |
| 2278 static void | |
| 2279 tidy_table_divs(fz_context *ctx, fz_stext_block *block) | |
| 2280 { | |
| 2281 for (; block != NULL; block = block->next) | |
| 2282 { | |
| 2283 if (block->type == FZ_STEXT_BLOCK_GRID) | |
| 2284 continue; | |
| 2285 if (block->type == FZ_STEXT_BLOCK_STRUCT && block->u.s.down && block->u.s.down->standard == FZ_STRUCTURE_TR) | |
| 2286 tidy_tr_divs(ctx, block->u.s.down->first_block); | |
| 2287 } | |
| 2288 } | |
| 2289 | |
| 2290 static int | |
| 2291 struct_is_empty(fz_context *ctx, fz_stext_block *block) | |
| 2292 { | |
| 2293 for (; block != NULL; block = block->next) | |
| 2294 { | |
| 2295 if (block->type != FZ_STEXT_BLOCK_STRUCT) | |
| 2296 return 0; | |
| 2297 if (!block->u.s.down) | |
| 2298 { | |
| 2299 } | |
| 2300 else if (!struct_is_empty(ctx, block->u.s.down->first_block)) | |
| 2301 return 0; | |
| 2302 } | |
| 2303 | |
| 2304 return 1; | |
| 2305 } | |
| 2306 | |
| 2307 static int | |
| 2308 div_is_empty(fz_context *ctx, fz_stext_block *block) | |
| 2309 { | |
| 2310 for (; block != NULL; block = block->next) | |
| 2311 { | |
| 2312 if (block->type != FZ_STEXT_BLOCK_STRUCT) | |
| 2313 return 0; | |
| 2314 if (!block->u.s.down) | |
| 2315 { | |
| 2316 } | |
| 2317 else if (block->u.s.down->standard == FZ_STRUCTURE_TABLE) | |
| 2318 { | |
| 2319 tidy_table_divs(ctx, block->u.s.down->first_block); | |
| 2320 return table_is_empty(ctx, block->u.s.down->first_block); | |
| 2321 } | |
| 2322 else if (block->u.s.down->standard != FZ_STRUCTURE_DIV) | |
| 2323 { | |
| 2324 if (!struct_is_empty(ctx, block->u.s.down->first_block)) | |
| 2325 return 0; | |
| 2326 } | |
| 2327 else if (!div_is_empty(ctx, block->u.s.down->first_block)) | |
| 2328 return 0; | |
| 2329 } | |
| 2330 | |
| 2331 return 1; | |
| 2332 } | |
| 2333 | |
| 2334 static void | |
| 2335 tidy_orphaned_tables(fz_context *ctx, fz_stext_page *page, fz_stext_struct *parent) | |
| 2336 { | |
| 2337 fz_stext_block **first_blockp = parent ? &parent->first_block : &page->first_block; | |
| 2338 fz_stext_block **last_blockp = parent ? &parent->last_block : &page->last_block; | |
| 2339 fz_stext_block *block, *next_block; | |
| 2340 | |
| 2341 for (block = *first_blockp; block != NULL; block = next_block) | |
| 2342 { | |
| 2343 next_block = block->next; | |
| 2344 if (block->type == FZ_STEXT_BLOCK_STRUCT && block->u.s.down) | |
| 2345 { | |
| 2346 if (block->u.s.down->standard == FZ_STRUCTURE_TABLE) | |
| 2347 { | |
| 2348 tidy_table_divs(ctx, block->u.s.down->first_block); | |
| 2349 if (table_is_empty(ctx, block->u.s.down->first_block)) | |
| 2350 { | |
| 2351 /* Remove block */ | |
| 2352 if (block->prev) | |
| 2353 { | |
| 2354 assert(block->prev->next == block); | |
| 2355 block->prev->next = next_block; | |
| 2356 } | |
| 2357 else | |
| 2358 { | |
| 2359 assert(*first_blockp == block); | |
| 2360 *first_blockp = next_block; | |
| 2361 } | |
| 2362 if (next_block) | |
| 2363 { | |
| 2364 assert(next_block->prev == block); | |
| 2365 next_block->prev = block->prev; | |
| 2366 } | |
| 2367 else | |
| 2368 { | |
| 2369 assert(*last_blockp == block); | |
| 2370 *last_blockp = block->prev; | |
| 2371 } | |
| 2372 } | |
| 2373 else | |
| 2374 { | |
| 2375 tidy_orphaned_tables(ctx, page, block->u.s.down); | |
| 2376 } | |
| 2377 } | |
| 2378 if (block->u.s.down->standard == FZ_STRUCTURE_DIV) | |
| 2379 { | |
| 2380 tidy_orphaned_tables(ctx, page, block->u.s.down); | |
| 2381 if (div_is_empty(ctx, block->u.s.down->first_block)) | |
| 2382 { | |
| 2383 /* Remove block */ | |
| 2384 if (block->prev) | |
| 2385 { | |
| 2386 assert(block->prev->next == block); | |
| 2387 block->prev->next = next_block; | |
| 2388 } | |
| 2389 else | |
| 2390 { | |
| 2391 assert(*first_blockp == block); | |
| 2392 *first_blockp = next_block; | |
| 2393 } | |
| 2394 if (next_block) | |
| 2395 { | |
| 2396 assert(next_block->prev == block); | |
| 2397 next_block->prev = block->prev; | |
| 2398 } | |
| 2399 else | |
| 2400 { | |
| 2401 assert(*last_blockp == block); | |
| 2402 *last_blockp = block->prev; | |
| 2403 } | |
| 2404 } | |
| 2405 } | |
| 2406 } | |
| 2407 } | |
| 2408 } | |
| 2409 | |
| 2410 static fz_stext_struct * | |
| 2411 transcribe_table(fz_context *ctx, grid_walker_data *gd, fz_stext_page *page, fz_stext_struct *parent) | |
| 2412 { | |
| 2413 int w = gd->xpos->len; | |
| 2414 int h = gd->ypos->len; | |
| 2415 int x, y; | |
| 2416 char *sent_tab = fz_calloc(ctx, 1, w*(size_t)h); | |
| 2417 fz_stext_block **first_block = parent ? &parent->first_block : &page->first_block; | |
| 2418 fz_stext_struct *table, *tr, *td; | |
| 2419 fz_rect r; | |
| 2420 tree_pos insert, top; | |
| 2421 | |
| 2422 /* Where should we insert the table in the data? */ | |
| 2423 r.x0 = gd->xpos->list[0].pos; | |
| 2424 r.x1 = gd->xpos->list[w-1].pos; | |
| 2425 r.y0 = gd->ypos->list[0].pos; | |
| 2426 r.y1 = gd->ypos->list[h-1].pos; | |
| 2427 #ifdef DEBUG_TABLE_STRUCTURE | |
| 2428 fz_print_stext_page_as_xml(ctx, fz_stddbg(ctx), page, 0); | |
| 2429 #endif | |
| 2430 top.block = *first_block; | |
| 2431 top.parent = parent; | |
| 2432 insert.block = NULL; | |
| 2433 insert.parent = NULL; | |
| 2434 insert = find_table_insertion_point(ctx, r, top, insert); | |
| 2435 | |
| 2436 /* Convert to 'before' */ | |
| 2437 if (insert.block) | |
| 2438 insert.block = insert.block->next; | |
| 2439 | |
| 2440 /* Make table */ | |
| 2441 table = add_struct_block_before(ctx, insert.block, page, insert.parent, FZ_STRUCTURE_TABLE, "Table"); | |
| 2442 | |
| 2443 /* Run through the cells, and guess at spanning. */ | |
| 2444 for (y = 0; y < h-1; y++) | |
| 2445 { | |
| 2446 /* Have we sent this entire row before? */ | |
| 2447 for (x = 0; x < w-1; x++) | |
| 2448 { | |
| 2449 if (!sent_tab[x+y*w]) | |
| 2450 break; | |
| 2451 } | |
| 2452 if (x == w-1) | |
| 2453 continue; /* No point in sending a row with nothing in it! */ | |
| 2454 | |
| 2455 /* Make TR */ | |
| 2456 tr = add_struct_block_before(ctx, NULL, page, table, FZ_STRUCTURE_TR, "TR"); | |
| 2457 | |
| 2458 for (x = 0; x < w-1; x++) | |
| 2459 { | |
| 2460 int x2, y2; | |
| 2461 int cellw = 1; | |
| 2462 int cellh = 1; | |
| 2463 | |
| 2464 /* Have we sent this cell already? */ | |
| 2465 if (sent_tab[x+y*w]) | |
| 2466 continue; | |
| 2467 | |
| 2468 /* Find the width of the cell */ | |
| 2469 for (x2 = x+1; x2 < w-1; x2++) | |
| 2470 { | |
| 2471 cell_t *cell = get_cell(gd->cells, x2, y); | |
| 2472 if (cell->v_line) | |
| 2473 break; /* Can't go past a line */ | |
| 2474 if (gd->xpos->list[x2].uncertainty == 0) | |
| 2475 break; /* An uncertainty of 0 is as good as a line. */ | |
| 2476 if (!cell->v_crossed) | |
| 2477 break; | |
| 2478 cellw++; | |
| 2479 } | |
| 2480 /* Find the height of the cell */ | |
| 2481 for (y2 = y+1; y2 < h-1; y2++) | |
| 2482 { | |
| 2483 cell_t *cell; | |
| 2484 int h_crossed = 0; | |
| 2485 if (gd->ypos->list[y2].uncertainty == 0) | |
| 2486 break; /* An uncertainty of 0 is as good as a line. */ | |
| 2487 | |
| 2488 cell = get_cell(gd->cells, x, y2); | |
| 2489 if (cell->h_line) | |
| 2490 break; /* Can't extend down through a line. */ | |
| 2491 if (cell->h_crossed) | |
| 2492 h_crossed = 1; | |
| 2493 for (x2 = x+1; x2 < x+cellw; x2++) | |
| 2494 { | |
| 2495 cell = get_cell(gd->cells, x2, y2); | |
| 2496 if (cell->h_line) | |
| 2497 break; | |
| 2498 if (cell->v_line) | |
| 2499 break; /* Can't go past a line */ | |
| 2500 if (gd->xpos->list[x2].uncertainty == 0) | |
| 2501 break; /* An uncertainty of 0 is as good as a line. */ | |
| 2502 if (!cell->v_crossed) | |
| 2503 break; | |
| 2504 if (cell->h_crossed) | |
| 2505 h_crossed = 1; | |
| 2506 } | |
| 2507 if (x2 == x+cellw && h_crossed) | |
| 2508 cellh++; | |
| 2509 else | |
| 2510 break; | |
| 2511 } | |
| 2512 /* Make TD */ | |
| 2513 td = add_struct_block_before(ctx, NULL, page, tr, FZ_STRUCTURE_TD, "TD"); | |
| 2514 r.x0 = gd->xpos->list[x].pos; | |
| 2515 r.x1 = gd->xpos->list[x+cellw].pos; | |
| 2516 r.y0 = gd->ypos->list[y].pos; | |
| 2517 r.y1 = gd->ypos->list[y+cellh].pos; | |
| 2518 /* Use r, not REAL contents bbox, as otherwise spanned rows | |
| 2519 * can end up empty. */ | |
| 2520 td->up->bbox = r; | |
| 2521 move_contained_content(ctx, page, td, parent, r); | |
| 2522 #ifdef DEBUG_TABLE_STRUCTURE | |
| 2523 printf("(%d,%d) + (%d,%d)\n", x, y, cellw, cellh); | |
| 2524 #endif | |
| 2525 for (y2 = y; y2 < y+cellh; y2++) | |
| 2526 for (x2 = x; x2 < x+cellw; x2++) | |
| 2527 sent_tab[x2+y2*w] = 1; | |
| 2528 } | |
| 2529 r.x0 = gd->xpos->list[0].pos; | |
| 2530 r.x1 = gd->xpos->list[gd->xpos->len-1].pos; | |
| 2531 r.y0 = gd->ypos->list[y].pos; | |
| 2532 r.y1 = gd->ypos->list[y+1].pos; | |
| 2533 tr->up->bbox = r; | |
| 2534 table->up->bbox = fz_union_rect(table->up->bbox, tr->up->bbox); | |
| 2535 } | |
| 2536 fz_free(ctx, sent_tab); | |
| 2537 | |
| 2538 { | |
| 2539 fz_stext_block *block; | |
| 2540 fz_stext_grid_positions *xps2 = copy_grid_positions_to_pool(ctx, page, gd->xpos); | |
| 2541 fz_stext_grid_positions *yps2 = copy_grid_positions_to_pool(ctx, page, gd->ypos); | |
| 2542 block = add_grid_block(ctx, page, &table->first_block, &table->last_block); | |
| 2543 block->u.b.xs = xps2; | |
| 2544 block->u.b.ys = yps2; | |
| 2545 block->bbox.x0 = block->u.b.xs->list[0].pos; | |
| 2546 block->bbox.y0 = block->u.b.ys->list[0].pos; | |
| 2547 block->bbox.x1 = block->u.b.xs->list[block->u.b.xs->len-1].pos; | |
| 2548 block->bbox.y1 = block->u.b.ys->list[block->u.b.ys->len-1].pos; | |
| 2549 } | |
| 2550 tidy_orphaned_tables(ctx, page, parent); | |
| 2551 | |
| 2552 return table; | |
| 2553 } | |
| 2554 | |
| 2555 static void | |
| 2556 merge_column(grid_walker_data *gd, int x) | |
| 2557 { | |
| 2558 int y; | |
| 2559 for (y = 0; y < gd->cells->h; y++) | |
| 2560 { | |
| 2561 cell_t *d = &gd->cells->cell[x + y * (gd->cells->w-1)]; | |
| 2562 cell_t *s = &gd->cells->cell[x + y * gd->cells->w]; | |
| 2563 | |
| 2564 if (x > 0) | |
| 2565 memcpy(d-x, s-x, sizeof(*d) * x); | |
| 2566 d->full = s[0].full || s[1].full; | |
| 2567 d->h_crossed = s[0].h_crossed || s[1].h_crossed; | |
| 2568 d->h_line = s[0].h_line; /* == s[1].h_line */ | |
| 2569 d->v_crossed = s[0].v_crossed; | |
| 2570 d->v_line = s[0].v_line; | |
| 2571 if (x < gd->cells->w - 2) | |
| 2572 memcpy(d+1, s+2, sizeof(*d) * (gd->cells->w - 2 - x)); | |
| 2573 } | |
| 2574 gd->cells->w--; | |
| 2575 | |
| 2576 if (x < gd->xpos->len - 2) | |
| 2577 memcpy(&gd->xpos->list[x+1], &gd->xpos->list[x+2], sizeof(gd->xpos->list[0]) * (gd->xpos->len - 2 - x)); | |
| 2578 gd->xpos->len--; | |
| 2579 } | |
| 2580 | |
| 2581 static void | |
| 2582 merge_columns(grid_walker_data *gd) | |
| 2583 { | |
| 2584 int x, y; | |
| 2585 | |
| 2586 for (x = gd->cells->w-3; x >= 0; x--) | |
| 2587 { | |
| 2588 /* Can column x be merged with column x+1? */ | |
| 2589 /* An empty column can certainly be merged if the h_lines are the same, | |
| 2590 * and there is no v_line. */ | |
| 2591 for (y = 0; y < gd->cells->h-1; y++) | |
| 2592 { | |
| 2593 cell_t *a = get_cell(gd->cells, x, y); | |
| 2594 cell_t *b = get_cell(gd->cells, x+1, y); | |
| 2595 if (a->full || !!a->h_line != !!b->h_line || b->v_line) | |
| 2596 break; | |
| 2597 } | |
| 2598 if (y == gd->cells->h-1) | |
| 2599 goto merge_column; | |
| 2600 /* An empty column can certainly be merged if the h_lines are the same. */ | |
| 2601 for (y = 0; y < gd->cells->h-1; y++) | |
| 2602 { | |
| 2603 cell_t *a = get_cell(gd->cells, x, y); | |
| 2604 cell_t *b = get_cell(gd->cells, x+1, y); | |
| 2605 if (b->full || !!a->h_line != !!b->h_line || b->v_line) | |
| 2606 break; | |
| 2607 } | |
| 2608 if (y == gd->cells->h-1) | |
| 2609 goto merge_column; | |
| 2610 /* We only ever want to merge columns if content crossed between them somewhere. | |
| 2611 * Don't use uncertainty for this, because uncertainty doesn't allow for | |
| 2612 * whitespace. */ | |
| 2613 for (y = 0; y < gd->cells->h-1; y++) | |
| 2614 if (get_cell(gd->cells, x+1, y)->v_crossed == 1) | |
| 2615 break; | |
| 2616 if (y == gd->cells->h-1) | |
| 2617 continue; | |
| 2618 /* This requires all the pairs of cells in those 2 columns to be mergeable. */ | |
| 2619 for (y = 0; y < gd->cells->h-1; y++) | |
| 2620 { | |
| 2621 cell_t *a = get_cell(gd->cells, x, y); | |
| 2622 cell_t *b = get_cell(gd->cells, x+1, y); | |
| 2623 /* If there is a divider, we can't merge. */ | |
| 2624 if (b->v_line) | |
| 2625 break; | |
| 2626 /* If either is empty, we can merge. */ | |
| 2627 if (!a->full || !b->full) | |
| 2628 continue; | |
| 2629 /* If we differ in h linedness, we can't merge */ | |
| 2630 if (!!a->h_line != !!b->h_line) | |
| 2631 break; | |
| 2632 /* If both are full, we can only merge if we cross. */ | |
| 2633 if (a->full && b->full && b->v_crossed) | |
| 2634 continue; | |
| 2635 /* Otherwise we can't merge */ | |
| 2636 break; | |
| 2637 } | |
| 2638 if (y == gd->cells->h-1) | |
| 2639 { | |
| 2640 /* Merge the column! */ | |
| 2641 merge_column: | |
| 2642 #ifdef DEBUG_TABLE_STRUCTURE | |
| 2643 printf("Merging column %d\n", x); | |
| 2644 #endif | |
| 2645 merge_column(gd, x); | |
| 2646 #ifdef DEBUG_TABLE_STRUCTURE | |
| 2647 asciiart_table(gd); | |
| 2648 #endif | |
| 2649 } | |
| 2650 } | |
| 2651 } | |
| 2652 | |
| 2653 static void | |
| 2654 merge_row(grid_walker_data *gd, int y) | |
| 2655 { | |
| 2656 int x; | |
| 2657 int w = gd->cells->w; | |
| 2658 cell_t *d = &gd->cells->cell[y * w]; | |
| 2659 for (x = 0; x < gd->cells->w-1; x++) | |
| 2660 { | |
| 2661 if (d->full == 0) | |
| 2662 d->full = d[w].full; | |
| 2663 if (d->v_crossed == 0) | |
| 2664 d->v_crossed = d[w].v_crossed; | |
| 2665 d++; | |
| 2666 } | |
| 2667 if (y < gd->cells->h - 2) | |
| 2668 memcpy(d, d+w, sizeof(*d) * (gd->cells->h - 2 - y) * w); | |
| 2669 gd->cells->h--; | |
| 2670 | |
| 2671 if (y < gd->ypos->len - 2) | |
| 2672 memcpy(&gd->ypos->list[y+1], &gd->ypos->list[y+2], sizeof(gd->ypos->list[0]) * (gd->ypos->len - 2 - y)); | |
| 2673 gd->ypos->len--; | |
| 2674 } | |
| 2675 | |
| 2676 static void | |
| 2677 merge_rows(grid_walker_data *gd) | |
| 2678 { | |
| 2679 int x, y; | |
| 2680 | |
| 2681 for (y = gd->cells->h-3; y >= 0; y--) | |
| 2682 { | |
| 2683 /* Can row y be merged with row y+1? */ | |
| 2684 /* An empty row can certainly be merged if the v_lines are the same, | |
| 2685 * and there is no h_line. */ | |
| 2686 for (x = 0; x < gd->cells->w-1; x++) | |
| 2687 { | |
| 2688 cell_t *a = get_cell(gd->cells, x, y); | |
| 2689 cell_t *b = get_cell(gd->cells, x, y+1); | |
| 2690 if (a->full || !!a->v_line != !!b->v_line || b->h_line) | |
| 2691 break; | |
| 2692 } | |
| 2693 if (x == gd->cells->w-1) | |
| 2694 goto merge_row; | |
| 2695 /* An empty row can certainly be merged if the v_lines are the same. */ | |
| 2696 for (x = 0; x < gd->cells->w-1; x++) | |
| 2697 { | |
| 2698 cell_t *a = get_cell(gd->cells, x, y); | |
| 2699 cell_t *b = get_cell(gd->cells, x, y+1); | |
| 2700 if (b->full || !!a->v_line != !!b->v_line || b->h_line) | |
| 2701 break; | |
| 2702 } | |
| 2703 if (x == gd->cells->w-1) | |
| 2704 goto merge_row; | |
| 2705 /* We only ever want to merge rows if content crossed between them somewhere. | |
| 2706 * Don't use uncertainty for this, because uncertainty doesn't allow for | |
| 2707 * whitespace. */ | |
| 2708 for (x = 0; x < gd->cells->w-1; x++) | |
| 2709 if (get_cell(gd->cells, x, y+1)->h_crossed == 1) | |
| 2710 break; | |
| 2711 if (x == gd->cells->w-1) | |
| 2712 continue; | |
| 2713 /* This requires all the pairs of cells in those 2 rows to be mergeable. */ | |
| 2714 for (x = 0; x < gd->cells->w-1; x++) | |
| 2715 { | |
| 2716 cell_t *a = get_cell(gd->cells, x, y); | |
| 2717 cell_t *b = get_cell(gd->cells, x, y+1); | |
| 2718 /* If there is a divider, we can't merge. */ | |
| 2719 if (b->h_line) | |
| 2720 goto cant_merge; | |
| 2721 /* If either is empty, we can merge. */ | |
| 2722 if (!a->full || !b->full) | |
| 2723 continue; | |
| 2724 /* If we differ in v linedness, we can't merge */ | |
| 2725 if (!!a->v_line != !!b->v_line) | |
| 2726 goto cant_merge; | |
| 2727 /* If both are full, we can only merge if we cross. */ | |
| 2728 if (a->full && b->full && b->h_crossed) | |
| 2729 continue; | |
| 2730 /* Otherwise we can't merge */ | |
| 2731 break; | |
| 2732 } | |
| 2733 if (x == gd->cells->w-1) | |
| 2734 { | |
| 2735 /* Merge the row! */ | |
| 2736 merge_row: | |
| 2737 #ifdef DEBUG_TABLE_STRUCTURE | |
| 2738 printf("Merging row %d\n", y); | |
| 2739 #endif | |
| 2740 merge_row(gd, y); | |
| 2741 #ifdef DEBUG_TABLE_STRUCTURE | |
| 2742 asciiart_table(gd); | |
| 2743 #endif | |
| 2744 continue; | |
| 2745 } | |
| 2746 | |
| 2747 /* OK, so we failed to be able to merge y and y+1. But if we get here, we | |
| 2748 * know this wasn't because of any lines being in the way. So we can try | |
| 2749 * a different set of criteria. If every non-empty cell in line y+1 is either | |
| 2750 * into from line y, or crosses into line y+2, then it's probably a 'straddling' | |
| 2751 * line. Just remove it. */ | |
| 2752 if (y+2 >= gd->cells->h) | |
| 2753 continue; | |
| 2754 for (x = 0; x < gd->cells->w-1; x++) | |
| 2755 { | |
| 2756 cell_t *b = get_cell(gd->cells, x, y+1); | |
| 2757 cell_t *c = get_cell(gd->cells, x, y+2); | |
| 2758 if (!b->full) | |
| 2759 continue; | |
| 2760 if (!b->h_crossed && !c->h_crossed) | |
| 2761 break; | |
| 2762 } | |
| 2763 if (x == gd->cells->w-1) | |
| 2764 { | |
| 2765 /* Merge the row! */ | |
| 2766 #ifdef DEBUG_TABLE_STRUCTURE | |
| 2767 printf("Merging row %d (case 2)\n", y); | |
| 2768 #endif | |
| 2769 merge_row(gd, y); | |
| 2770 #ifdef DEBUG_TABLE_STRUCTURE | |
| 2771 asciiart_table(gd); | |
| 2772 #endif | |
| 2773 } | |
| 2774 | |
| 2775 | |
| 2776 cant_merge: | |
| 2777 {} | |
| 2778 } | |
| 2779 } | |
| 2780 | |
| 2781 static int | |
| 2782 remove_bordered_empty_cells(grid_walker_data *gd) | |
| 2783 { | |
| 2784 int x, y, x1, y1, x2, y2; | |
| 2785 int changed = 0; | |
| 2786 | |
| 2787 /* We are looking for a region of cells that's bordered, | |
| 2788 * and empty, and where the borders don't extend... | |
| 2789 * | |
| 2790 * ' ' | |
| 2791 * ' ' | |
| 2792 * . . .'____'. . . | |
| 2793 * | | | |
| 2794 * | | | |
| 2795 * . . .|____|. . . | |
| 2796 * ' ' | |
| 2797 * ' ' | |
| 2798 * ' ' | |
| 2799 */ | |
| 2800 | |
| 2801 for (y = 0; y < gd->cells->h-1; y++) | |
| 2802 { | |
| 2803 for (x = 0; x < gd->cells->w-1; x++) | |
| 2804 { | |
| 2805 cell_t *u = y > 0 ? get_cell(gd->cells, x, y-1) : NULL; | |
| 2806 cell_t *l = x > 0 ? get_cell(gd->cells, x-1, y) : NULL; | |
| 2807 cell_t *c = get_cell(gd->cells, x, y); | |
| 2808 | |
| 2809 if (c->full) | |
| 2810 continue; | |
| 2811 if (!c->h_line || !c->v_line) | |
| 2812 continue; | |
| 2813 if (l != NULL && l->h_line) | |
| 2814 continue; | |
| 2815 if (u != NULL && u->v_line) | |
| 2816 continue; | |
| 2817 /* So c (x, y) is a reasonable top left of the bounded region. */ | |
| 2818 for (x1 = x+1; x1 < gd->cells->w; x1++) | |
| 2819 { | |
| 2820 u = y > 0 ? get_cell(gd->cells, x1, y-1) : NULL; | |
| 2821 c = get_cell(gd->cells, x1, y); | |
| 2822 | |
| 2823 if (u != NULL && u->v_line) | |
| 2824 goto bad_region; | |
| 2825 if (c->h_line && !c->v_line) | |
| 2826 continue; | |
| 2827 if (c->h_line || !c->v_line) | |
| 2828 goto bad_region; | |
| 2829 break; | |
| 2830 } | |
| 2831 if (x1 == gd->cells->w) | |
| 2832 goto bad_region; | |
| 2833 /* If we get here, then we have the top row of a bounded region | |
| 2834 * that runs from (x,y) to (x1-1, y). So, can we extend that region | |
| 2835 * downwards? */ | |
| 2836 for (y1 = y+1; y1 < gd->cells->h; y1++) | |
| 2837 { | |
| 2838 c = get_cell(gd->cells, x, y1); | |
| 2839 | |
| 2840 if (!c->h_line && c->v_line) | |
| 2841 { | |
| 2842 /* This could be the start of a line. */ | |
| 2843 for (x2 = x+1; x2 < x1; x2++) | |
| 2844 { | |
| 2845 c = get_cell(gd->cells, x2, y1); | |
| 2846 if (c->full || c->h_line || c->v_line) | |
| 2847 goto bad_region; | |
| 2848 } | |
| 2849 c = get_cell(gd->cells, x1, y1); | |
| 2850 if (c->h_line || !c->v_line) | |
| 2851 goto bad_region; | |
| 2852 /* That line was fine! Region is extended. */ | |
| 2853 } | |
| 2854 else if (c->h_line && !c->v_line) | |
| 2855 { | |
| 2856 /* This might be the bottom line of the bounded region. */ | |
| 2857 for (x2 = x+1; x2 < x1; x2++) | |
| 2858 { | |
| 2859 c = get_cell(gd->cells, x2, y1); | |
| 2860 if (!c->h_line || c->v_line) | |
| 2861 goto bad_region; | |
| 2862 } | |
| 2863 c = get_cell(gd->cells, x1, y1); | |
| 2864 if (c->h_line || c->v_line) | |
| 2865 goto bad_region; | |
| 2866 break; | |
| 2867 } | |
| 2868 else | |
| 2869 goto bad_region; | |
| 2870 } | |
| 2871 if (y1 == gd->cells->h) | |
| 2872 goto bad_region; | |
| 2873 /* So we have a bounded region from x,y to x1-1,y1-1 */ | |
| 2874 for (y2 = y; y2 < y1; y2++) | |
| 2875 { | |
| 2876 for (x2 = x; x2 < x1; x2++) | |
| 2877 { | |
| 2878 c = get_cell(gd->cells, x2, y2); | |
| 2879 c->h_line = 0; | |
| 2880 c->v_line = 0; | |
| 2881 c->full = 1; | |
| 2882 if (x2 > x) | |
| 2883 c->v_crossed = 1; | |
| 2884 if (y2 > y) | |
| 2885 c->h_crossed = 1; | |
| 2886 } | |
| 2887 c = get_cell(gd->cells, x2, y2); | |
| 2888 c->v_line = 0; | |
| 2889 } | |
| 2890 for (x2 = x; x2 < x1; x2++) | |
| 2891 get_cell(gd->cells, x2, y2)->h_line = 0; | |
| 2892 changed = 1; | |
| 2893 bad_region: | |
| 2894 {} | |
| 2895 } | |
| 2896 } | |
| 2897 | |
| 2898 #ifdef DEBUG_TABLE_STRUCTURE | |
| 2899 if (changed) | |
| 2900 { | |
| 2901 printf("Simplified empty bounded cells to give:\n"); | |
| 2902 asciiart_table(gd); | |
| 2903 } | |
| 2904 #endif | |
| 2905 | |
| 2906 return changed; | |
| 2907 } | |
| 2908 | |
| 2909 static int | |
| 2910 find_table_within_bounds(fz_context *ctx, grid_walker_data *gd, fz_stext_block *content, fz_rect bounds) | |
| 2911 { | |
| 2912 div_list xs = { 0 }; | |
| 2913 div_list ys = { 0 }; | |
| 2914 int failed = 1; | |
| 2915 | |
| 2916 fz_try(ctx) | |
| 2917 { | |
| 2918 walk_to_find_content(ctx, &xs, &ys, content, bounds); | |
| 2919 | |
| 2920 sanitize_positions(ctx, &xs); | |
| 2921 sanitize_positions(ctx, &ys); | |
| 2922 | |
| 2923 /* Run across the line, counting 'winding' */ | |
| 2924 /* If we don't have at least 2 rows and 2 columns, give up. */ | |
| 2925 if (xs.len <= 2 || ys.len <= 2) | |
| 2926 break; | |
| 2927 | |
| 2928 gd->xpos = make_table_positions(ctx, &xs, bounds.x0, bounds.x1); | |
| 2929 gd->ypos = make_table_positions(ctx, &ys, bounds.y0, bounds.y1); | |
| 2930 gd->cells = new_cells(ctx, gd->xpos->len, gd->ypos->len); | |
| 2931 | |
| 2932 /* Walk the content looking for grid lines. These | |
| 2933 * lines refine our positions. */ | |
| 2934 walk_grid_lines(ctx, gd, content); | |
| 2935 /* Now, we walk the content looking for content that crosses | |
| 2936 * these grid lines. This allows us to spot spanned cells. */ | |
| 2937 if (calculate_spanned_content(ctx, gd, content)) | |
| 2938 break; /* Unlikely to be a table. */ | |
| 2939 | |
| 2940 #ifdef DEBUG_TABLE_STRUCTURE | |
| 2941 asciiart_table(gd); | |
| 2942 #endif | |
| 2943 /* Now, can we remove some columns or rows? i.e. have we oversegmented? */ | |
| 2944 do | |
| 2945 { | |
| 2946 merge_columns(gd); | |
| 2947 merge_rows(gd); | |
| 2948 } | |
| 2949 while (remove_bordered_empty_cells(gd)); | |
| 2950 | |
| 2951 /* Did we shrink the table so much it's not a table any more? */ | |
| 2952 if (gd->xpos->len <= 2 || gd->ypos->len <= 2) | |
| 2953 break; | |
| 2954 | |
| 2955 failed = 0; | |
| 2956 } | |
| 2957 fz_always(ctx) | |
| 2958 { | |
| 2959 fz_free(ctx, xs.list); | |
| 2960 fz_free(ctx, ys.list); | |
| 2961 } | |
| 2962 fz_catch(ctx) | |
| 2963 { | |
| 2964 fz_rethrow(ctx); | |
| 2965 } | |
| 2966 | |
| 2967 return failed; | |
| 2968 } | |
| 2969 | |
| 2970 static int | |
| 2971 do_table_hunt(fz_context *ctx, fz_stext_page *page, fz_stext_struct *parent) | |
| 2972 { | |
| 2973 fz_stext_block *block; | |
| 2974 int count; | |
| 2975 fz_stext_block **first_block = parent ? &parent->first_block : &page->first_block; | |
| 2976 int num_subtables = 0; | |
| 2977 grid_walker_data gd = { 0 }; | |
| 2978 | |
| 2979 /* No content? Just bale. */ | |
| 2980 if (*first_block == NULL) | |
| 2981 return 0; | |
| 2982 | |
| 2983 /* If all the content here looks like a column of text, don't | |
| 2984 * hunt for a table within it. */ | |
| 2985 if (all_blocks_are_justified_or_headers(ctx, *first_block)) | |
| 2986 return num_subtables; | |
| 2987 | |
| 2988 gd.bounds = fz_infinite_rect; | |
| 2989 | |
| 2990 /* First off, descend into any children to see if those look like tables. */ | |
| 2991 count = 0; | |
| 2992 for (block = *first_block; block != NULL; block = block->next) | |
| 2993 { | |
| 2994 if (block->type == FZ_STEXT_BLOCK_STRUCT) | |
| 2995 { | |
| 2996 if (block->u.s.down) | |
| 2997 { | |
| 2998 num_subtables += do_table_hunt(ctx, page, block->u.s.down); | |
| 2999 count++; | |
| 3000 } | |
| 3001 } | |
| 3002 else if (block->type == FZ_STEXT_BLOCK_TEXT) | |
| 3003 count++; | |
| 3004 } | |
| 3005 | |
| 3006 /* If we don't have at least a single child, no more to hunt. */ | |
| 3007 /* We only need a single block, because a single text block can | |
| 3008 * contain an entire unbordered table. */ | |
| 3009 if (count < 1) | |
| 3010 return num_subtables; | |
| 3011 | |
| 3012 /* We only look for a table at this level if we either at the top | |
| 3013 * or on a div. This saves us looking for tables within an 'H' | |
| 3014 * for example. */ | |
| 3015 if (parent != NULL && parent->standard != FZ_STRUCTURE_DIV) | |
| 3016 return num_subtables; | |
| 3017 | |
| 3018 | |
| 3019 fz_var(gd); | |
| 3020 | |
| 3021 fz_try(ctx) | |
| 3022 { | |
| 3023 /* Now see whether the content looks like tables. */ | |
| 3024 fz_rect bounds = walk_to_find_bounds(ctx, *first_block); | |
| 3025 | |
| 3026 if (find_table_within_bounds(ctx, &gd, *first_block, bounds)) | |
| 3027 break; | |
| 3028 | |
| 3029 if (num_subtables > 0) | |
| 3030 { | |
| 3031 /* We are risking throwing away a table we've already found for this | |
| 3032 * one. Only do it if this one is really convincing. */ | |
| 3033 int x, y; | |
| 3034 for (x = 0; x < gd.xpos->len; x++) | |
| 3035 if (gd.xpos->list[x].uncertainty != 0) | |
| 3036 break; | |
| 3037 if (x != gd.xpos->len) | |
| 3038 break; | |
| 3039 for (y = 0; y < gd.ypos->len; y++) | |
| 3040 if (gd.ypos->list[y].uncertainty != 0) | |
| 3041 break; | |
| 3042 if (y != gd.ypos->len) | |
| 3043 break; | |
| 3044 } | |
| 3045 | |
| 3046 #ifdef DEBUG_TABLE_STRUCTURE | |
| 3047 printf("Transcribing table: (%g,%g)->(%g,%g)\n", | |
| 3048 gd.xpos->list[0].pos, | |
| 3049 gd.ypos->list[0].pos, | |
| 3050 gd.xpos->list[gd.xpos->len-1].pos, | |
| 3051 gd.ypos->list[gd.ypos->len-1].pos); | |
| 3052 #endif | |
| 3053 | |
| 3054 /* Now we should have the entire table calculated. */ | |
| 3055 (void)transcribe_table(ctx, &gd, page, parent); | |
| 3056 num_subtables = 1; | |
| 3057 #ifdef DEBUG_WRITE_AS_PS | |
| 3058 { | |
| 3059 int i; | |
| 3060 printf("%% TABLE\n"); | |
| 3061 for (i = 0; i < block->u.b.xs->len; i++) | |
| 3062 { | |
| 3063 if (block->u.b.xs->list[i].uncertainty) | |
| 3064 printf("0 1 0 setrgbcolor\n"); | |
| 3065 else | |
| 3066 printf("0 0.5 0 setrgbcolor\n"); | |
| 3067 printf("%g %g moveto %g %g lineto stroke\n", | |
| 3068 block->u.b.xs->list[i].pos, block->bbox.y0, | |
| 3069 block->u.b.xs->list[i].pos, block->bbox.y1); | |
| 3070 } | |
| 3071 for (i = 0; i < block->u.b.ys->len; i++) | |
| 3072 { | |
| 3073 if (block->u.b.ys->list[i].uncertainty) | |
| 3074 printf("0 1 0 setrgbcolor\n"); | |
| 3075 else | |
| 3076 printf("0 0.5 0 setrgbcolor\n"); | |
| 3077 printf("%g %g moveto %g %g lineto stroke\n", | |
| 3078 block->bbox.x0, block->u.b.ys->list[i].pos, | |
| 3079 block->bbox.x1, block->u.b.ys->list[i].pos); | |
| 3080 } | |
| 3081 } | |
| 3082 #endif | |
| 3083 } | |
| 3084 fz_always(ctx) | |
| 3085 { | |
| 3086 fz_free(ctx, gd.xpos); | |
| 3087 fz_free(ctx, gd.ypos); | |
| 3088 fz_free(ctx, gd.cells); | |
| 3089 } | |
| 3090 fz_catch(ctx) | |
| 3091 fz_rethrow(ctx); | |
| 3092 | |
| 3093 return num_subtables; | |
| 3094 } | |
| 3095 | |
| 3096 void | |
| 3097 fz_table_hunt(fz_context *ctx, fz_stext_page *page) | |
| 3098 { | |
| 3099 if (page == NULL) | |
| 3100 return; | |
| 3101 | |
| 3102 assert(verify_stext(ctx, page, NULL)); | |
| 3103 | |
| 3104 do_table_hunt(ctx, page, NULL); | |
| 3105 | |
| 3106 assert(verify_stext(ctx, page, NULL)); | |
| 3107 } | |
| 3108 | |
| 3109 fz_stext_block * | |
| 3110 fz_find_table_within_bounds(fz_context *ctx, fz_stext_page *page, fz_rect bounds) | |
| 3111 { | |
| 3112 fz_stext_struct *table = NULL; | |
| 3113 grid_walker_data gd = { 0 }; | |
| 3114 | |
| 3115 /* No content? Just bale. */ | |
| 3116 if (page == NULL || page->first_block == NULL) | |
| 3117 return NULL; | |
| 3118 | |
| 3119 fz_var(gd); | |
| 3120 | |
| 3121 fz_try(ctx) | |
| 3122 { | |
| 3123 gd.bounds = bounds; | |
| 3124 if (find_table_within_bounds(ctx, &gd, page->first_block, bounds)) | |
| 3125 break; | |
| 3126 | |
| 3127 #ifdef DEBUG_TABLE_STRUCTURE | |
| 3128 printf("Transcribing table: (%g,%g)->(%g,%g)\n", | |
| 3129 gd.xpos->list[0].pos, | |
| 3130 gd.ypos->list[0].pos, | |
| 3131 gd.xpos->list[gd.xpos->len-1].pos, | |
| 3132 gd.ypos->list[gd.ypos->len-1].pos); | |
| 3133 #endif | |
| 3134 | |
| 3135 /* Now we should have the entire table calculated. */ | |
| 3136 table = transcribe_table(ctx, &gd, page, NULL); | |
| 3137 #ifdef DEBUG_WRITE_AS_PS | |
| 3138 { | |
| 3139 int i; | |
| 3140 printf("%% TABLE\n"); | |
| 3141 for (i = 0; i < block->u.b.xs->len; i++) | |
| 3142 { | |
| 3143 if (block->u.b.xs->list[i].uncertainty) | |
| 3144 printf("0 1 0 setrgbcolor\n"); | |
| 3145 else | |
| 3146 printf("0 0.5 0 setrgbcolor\n"); | |
| 3147 printf("%g %g moveto %g %g lineto stroke\n", | |
| 3148 block->u.b.xs->list[i].pos, block->bbox.y0, | |
| 3149 block->u.b.xs->list[i].pos, block->bbox.y1); | |
| 3150 } | |
| 3151 for (i = 0; i < block->u.b.ys->len; i++) | |
| 3152 { | |
| 3153 if (block->u.b.ys->list[i].uncertainty) | |
| 3154 printf("0 1 0 setrgbcolor\n"); | |
| 3155 else | |
| 3156 printf("0 0.5 0 setrgbcolor\n"); | |
| 3157 printf("%g %g moveto %g %g lineto stroke\n", | |
| 3158 block->bbox.x0, block->u.b.ys->list[i].pos, | |
| 3159 block->bbox.x1, block->u.b.ys->list[i].pos); | |
| 3160 } | |
| 3161 } | |
| 3162 #endif | |
| 3163 } | |
| 3164 fz_always(ctx) | |
| 3165 { | |
| 3166 fz_free(ctx, gd.xpos); | |
| 3167 fz_free(ctx, gd.ypos); | |
| 3168 fz_free(ctx, gd.cells); | |
| 3169 } | |
| 3170 fz_catch(ctx) | |
| 3171 fz_rethrow(ctx); | |
| 3172 | |
| 3173 return table ? table->first_block : NULL; | |
| 3174 } |
