comparison mupdf-source/source/fitz/stext-table.c @ 2:b50eed0cc0ef upstream

ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4. The directory name has changed: no version number in the expanded directory now.
author Franz Glasner <fzglas.hg@dom66.de>
date Mon, 15 Sep 2025 11:43:07 +0200
parents
children
comparison
equal deleted inserted replaced
1:1d09e1dec1d9 2:b50eed0cc0ef
1 // Copyright (C) 2025 Artifex Software, Inc.
2 //
3 // This file is part of MuPDF.
4 //
5 // MuPDF is free software: you can redistribute it and/or modify it under the
6 // terms of the GNU Affero General Public License as published by the Free
7 // Software Foundation, either version 3 of the License, or (at your option)
8 // any later version.
9 //
10 // MuPDF is distributed in the hope that it will be useful, but WITHOUT ANY
11 // WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
12 // FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
13 // details.
14 //
15 // You should have received a copy of the GNU Affero General Public License
16 // along with MuPDF. If not, see <https://www.gnu.org/licenses/agpl-3.0.en.html>
17 //
18 // Alternative licensing terms are available from the licensor.
19 // For commercial licensing, see <https://www.artifex.com/> or contact
20 // Artifex Software, Inc., 39 Mesa Street, Suite 108A, San Francisco,
21 // CA 94129, USA, for further information.
22
23 #include "mupdf/fitz.h"
24
25 #include <assert.h>
26
27 /* #define DEBUG_WRITE_AS_PS */
28
29 /* #define DEBUG_TABLE_STRUCTURE */
30
31 /* #define DEBUG_TABLE_HUNT */
32
33 /*
34 * The algorithm.
35 *
36 * The goal of the algorithm is to identify tables on a page.
37 * First we have to find the tables on a page, then we have to
38 * figure out where the columns/rows are, and then how the
39 * cells span them.
40 *
41 * We do this as a series of steps.
42 *
43 * To illustrate what's going on, let's use an example page
44 * that we can follow through all the steps.
45 *
46 * +---------------------------+
47 * | |
48 * | #### ## ### ## | <- Title
49 * | |
50 * | ##### ##### #### ## | \
51 * | ## ###### ###### ## | |
52 * | #### ####### ###### | |- Abstract
53 * | ####### #### ## ### | |
54 * | ### ##### ###### | /
55 * | |
56 * | ######### ######### | 2 Columns of text
57 * | ######### ######### |
58 * | ######## ######### |
59 * | ######### |
60 * | +-------+ ####### | <- With an image on the left
61 * | | | |
62 * | | | ## ## # # | <- And a table on the right
63 * | +-------+ ## ## # # |
64 * | ## ## # # |
65 * | ######### ## ## # # |
66 * | ######### ## ## # # |
67 * | ######### ## ## # # |
68 * | |
69 * +---------------------------+
70 *
71 *
72 * Step 1: Segmentation.
73 *
74 * First, we segment the page, trying to break it down into a
75 * series of non-overlapping rectangles. We do this (in stext-boxer.c)
76 * by looking for where the content isn't. If we can identify breaks
77 * that run through the page (either from top to bottom or from left
78 * to right), then we can split the page there, and recursively consider
79 * the two halves of the page.
80 *
81 * It's not a perfect algorithm, but it manages to in many cases.
82 *
83 * After segmenting the above example, first we'll find the horizontal
84 * splits, giving:
85 *
86 * +---------------------------+
87 * | |
88 * | #### ## ### ## |
89 * +---------------------------+
90 * | ##### ##### #### ## |
91 * | ## ###### ###### ## |
92 * | #### ####### ###### |
93 * | ####### #### ## ### |
94 * | ### ##### ###### |
95 * +---------------------------+
96 * | ######### ######### |
97 * | ######### ######### |
98 * | ######## ######### |
99 * | ######### |
100 * | +-------+ ####### |
101 * | | | |
102 * | | | ## ## # # |
103 * | +-------+ ## ## # # |
104 * | ## ## # # |
105 * | ######### ## ## # # |
106 * | ######### ## ## # # |
107 * | ######### ## ## # # |
108 * | |
109 * +---------------------------+
110 *
111 * Then we'll recurse and find the vertical split between
112 * the columns:
113 *
114 * +---------------------------+
115 * | |
116 * | #### ## ### ## |
117 * +---------------------------+
118 * | ##### ##### #### ## |
119 * | ## ###### ###### ## |
120 * | #### ####### ###### |
121 * | ####### #### ## ### |
122 * | ### ##### ###### |
123 * +-------------+-------------+
124 * | ######### | ######### |
125 * | ######### | ######### |
126 * | ######## | ######### |
127 * | | ######### |
128 * | +-------+ | ####### |
129 * | | | | |
130 * | | | | ## ## # # |
131 * | +-------+ | ## ## # # |
132 * | | ## ## # # |
133 * | ######### | ## ## # # |
134 * | ######### | ## ## # # |
135 * | ######### | ## ## # # |
136 * | | |
137 * +-------------+-------------+
138 *
139 * Then we recurse again and find the horizontal splits
140 * within the columns:
141 *
142 * +---------------------------+
143 * | |
144 * | #### ## ### ## |
145 * +---------------------------+
146 * | ##### ##### #### ## |
147 * | ## ###### ###### ## |
148 * | #### ####### ###### |
149 * | ####### #### ## ### |
150 * | ### ##### ###### |
151 * +-------------+-------------+
152 * | ######### | ######### |
153 * | ######### | ######### |
154 * | ######## | ######### |
155 * +-------------+ ######### |
156 * | +-------+ | ####### |
157 * | | | +-------------+
158 * | | | | ## ## # # |
159 * | +-------+ | ## ## # # |
160 * +-------------+ ## ## # # |
161 * | ######### | ## ## # # |
162 * | ######### | ## ## # # |
163 * | ######### | ## ## # # |
164 * | | |
165 * +-------------+-------------+
166 *
167 * We recurse a fixed maximum number of times (currently
168 * 6, IIRC) or until we fail to find any suitable splits.
169 *
170 * This completes the page segmentation step.
171 *
172 * Step 2: Grid finding
173 *
174 * Next, we look at each of those segments and try to identify
175 * where grids might be.
176 *
177 * Imagine the bottom right section of that page as
178 * a board with lego blocks on where there is text.
179 * Now imagine viewing that from the bottom of the page.
180 * The gaps between the columns of the table are where you
181 * can see through to the top between the blocks.
182 *
183 * Similarly, if you view it from the side, the gaps between the
184 * rows of the page are where you can see through to the other
185 * side.
186 *
187 * So, how do we code that? Well, we run through the page content
188 * (obviously, restricted to the content that falls into this
189 * segment of the page - that'll go without saying from here on
190 * in). For each bit of content, we look at the "x extent" of that
191 * content - for instance a given string might start at position
192 * 10 and continue to position 100. We build a list of all these
193 * start, and stop positions, and keep them in a sorted list.
194 *
195 * Then we walk this list from left to right, keeping a sum. I
196 * call this sum "wind", because it's very similar to the winding
197 * number that you get when doing scan conversion of bezier shapes.
198 *
199 * wind starts out as 0. We increment it whenever we pass a 'start'
200 * position, and decrement it whenever we pass a 'stop' position.
201 * So at any given x position along the line wind tells us the
202 * number of pieces of content that overlap that x position.
203 * So wind(left) = 0 = wind(right), and wind(x) >= x for all x.
204 *
205 * So, if we walk from left to right, the trace of wind might
206 * look something like:
207 *
208 * __
209 * ___ / \ _ __
210 * / \ / \/ \ _/ \_
211 * / \___/ \___/ \
212 *
213 * The left and right edges of the table are pretty clear.
214 * The regions where wind drops to 0 represent the column dividers.
215 * The left and right hand side of those regions gives us the min
216 * and max values for that divider.
217 *
218 * We can then repeat this process for Y ranges instead of X ranges
219 * to get the row dividers.
220 *
221 * BUT, this only works for pure grid tables. It falls down for
222 * cases where we have merged cells (which is very common due to
223 * titles etc).
224 *
225 * We can modify the algorithm slightly to allow for this.
226 *
227 * Consider the following table:
228 *
229 * +-----------------------------------+
230 * | Long Table title across the top |
231 * +---------------+---------+---------+
232 * | Name | Result1 | Result2 |
233 * +---------------+----+----+----+----+
234 * | Homer Simpson | 1 | 23 | 4 | 56 |
235 * | Barney Gumble | 1 | 23 | 4 | 56 |
236 * | Moe | 1 | 23 | 4 | 56 |
237 * | Apu | 1 | 23 | 4 | 56 |
238 * | Ned Flanders | 1 | 23 | 4 | 56 |
239 * +---------------+----+----+----+----+
240 *
241 * The wind trace for that looks something like
242 * (with a certain degree of artistic license for the
243 * limitations of ascii art):
244 *
245 * ________
246 * / \ _ __ _ _
247 * / \____/ \_/ \___/ \_/ \
248 * / \
249 *
250 * So, the trace never quite drops back to zero in the
251 * middle due to the spanning of the top title.
252 *
253 * So, instead of just looking for points where the trace
254 * drops to zero, we instead look for local minima. Each local
255 * minima represents a place where there might be a grid divider.
256 * The value of wind at such points can be considered the
257 * "uncertainty" with which there might be a divider there.
258 * Clear dividers (with a wind value of 0) have no uncertainty.
259 * Places where cells are spanned have a higher value of uncertainty.
260 *
261 * The output from this step is the list of possible grid positions
262 * (X and Y), with uncertainty values.
263 *
264 * We have skipped over the handling of spaces in the above
265 * description. On the one hand, we'd quite like to treat
266 * sentences as long unbroken regions of content. But sometimes
267 * we can get awkward content like:
268 *
269 * P subtelomeric 24.38 8.15 11.31 0.94 1.46
270 * 11.08 6.93 15.34 0.00 0.73
271 * Pericentromeric region; C band 20.42 9.26 13.64 0.81 0.81
272 * 16.50 7.03 14.45 0.17 5.15
273
274 * where the content is frequently sent with spaces instead of
275 * small gaps (particular in tables of digits, because numerals
276 * are often the same width even in proportional fonts).
277 *
278 * To cope with this, as we send potential edges into the start/stop
279 * positions, we send 'weak' edges for the start and stops of runs
280 * of spaces. We then post process the edges to remove any local
281 * minima regions in the 'wind' values that are bounded purely by
282 * 'weak' edges.
283 *
284 * Step 3: Cell analysis
285 *
286 * So, armed with the output from step 2, we can examine each grid
287 * found. If we have W x-dividers and H y-dividers, we know we have
288 * a potential table with (W-1) x (H-1) cells in it.
289 *
290 * We represent this as a W x H grid of cells, each like:
291 *
292 * . .
293 * . .
294 * . . .+-------+. . . Each cell holds information about the
295 * | . edges above, and to the left of it.
296 * | .
297 * | .
298 * . . .+. . . .+. . .
299 * . .
300 * . .
301 *
302 * h_line: Is there a horizontal divider drawn on the page that
303 * corresponds to the top of this cell (i.e. is there a cell border
304 * here?)
305 * v_line: Is there a vertical divider drawn on the page that
306 * corresponds to the left of this cell (i.e. is there a cell border
307 * here?)
308 * h_crossed: Does content cross this line (i.e. are we merged
309 * with the cell above?)
310 * v_crossed: Does content cross this line (i.e. are we merged
311 * with the cell to the left?)
312 * full: Is there any content in this cell at all?
313 *
314 * We need a W x H grid of cells to represent the entire table due
315 * to the potential right and bottom edge lines. The right and
316 * bottom rows of cells should never be full, or be crossed, but
317 * it's easiest just to use a simple representation that copes with
318 * the h_line and v_line values naturally.
319 *
320 * So, we start with the cells structure empty, and we run through
321 * the page content, filling in the details as we go.
322 *
323 * At the end of the process, we have enough information to draw
324 * an asciiart representation of our table. It might look something
325 * like this (this comes from dotted-gridlines-tables.pdf):
326 *
327 * +-+-+-+-+-+-+-+-+-+-+-+-+-+
328 * | | | | | |#|#|#|#| | | |
329 * + + + + + + +v+ +v+v+ + + +
330 * | | | | | |#|#|#|#| | | |
331 * + + + + + + + +v+ + + + + +
332 * | |#| | |#|#|#|#|#|#|#|#|
333 * + +v+ + + +v+v+ +v+v+v+v+v+
334 * | |#| |#|#|#|#|#|#|#|#|#|
335 * + + + + +v+ + +v+ + + + + +
336 * |#|#| #|#|#|#|#|#|#|#|#|#|
337 * +v+v+ +v+ +v+v+ +v+v+v+v+v+
338 * |#|#| #|#|#|#|#|#|#|#|#|#|
339 * + + + + +v+ + +v+ + + + + +
340 * | |#| |#|#|#|#|#|#|#|#|#|
341 * + +v+ + + +v+v+ +v+v+v+v+v+
342 * | |#| | |#|#|#|#|#|#|#|#|
343 * + + + + + + + +v+ + + + + +
344 * | | | | | |#|#|#|#| | | |
345 * + + + + + + +v+ +v+v+ + + +
346 * | | | | | |#|#|#|#| | | |
347 * +-+-+-+-+-+-+-+-+-+-+-+-+-+
348 * | |# |#| | |#| | |#|#|#|
349 * + +-+-+-+-+-+-+-+-+-+-+-+-+
350 * | |# |#| | |#| | |#|#|#|
351 * + +-+-+-+-+-+-+-+-+-+-+-+-+
352 * | |#>#|#| | |#| | |#|#|#|
353 * + +-+-+-+-+-+-+-+-+-+-+-+-+
354 * | |#>#|#| | |#| | |#|#|#|
355 * + +-+-+-+-+-+-+-+-+-+-+-+-+
356 * | |#>#|#| |#| | | |#|#|#|
357 * + +-+-+-+-+-+-+-+-+-+-+-+-+
358 * | |# |#| | |#| | |#|#|#|
359 * + +-+-+-+-+-+-+-+-+-+-+-+-+
360 * | |# |#| | |#| | |#|#|#|
361 * + +-+-+-+-+-+-+-+-+-+-+-+-+
362 * | |# |#| | |#| | |#|#|#|
363 * + +-+-+-+-+-+-+-+-+-+-+-+-+
364 * | |# |#| | |#| | |#|#|#|
365 * + +-+-+-+-+-+-+-+-+-+-+-+-+
366 * |# |#|#|#|#|#|#|#|#|#|
367 * + +-+-+-+-+-+-+-+-+-+-+-+-+
368 *
369 * This shows where lines are detected ( - and | ),
370 * where they are crossed ( > and v) and where cells
371 * are full ( # ).
372 *
373 * Step 4: Row and column merging.
374 *
375 * Based on the information above, we then try to merge
376 * cells and columns to simplify the table.
377 *
378 * The best rules I've come up with this so far are:
379 * We can merge two adjacent columns if all the pairs of
380 * cells in the two columns are mergeable.
381 *
382 * Cells are held to be mergeable or not based upon the following
383 * rules:
384 * If there is a line between 2 cells - not mergeable.
385 * else if the uncertainty between 2 cells is 0 - not mergeable.
386 * else if the line between the 2 cells is crossed - mergeable.
387 * else if strictly one of the cells is full - mergeable.
388 * else not mergeable.
389 *
390 * So in the above example, column 2 (numbered from 0) can be merged
391 * with column 3.
392 *
393 * This gives:
394 *
395 * +-+-+-+-+-+-+-+-+-+-+-+-+
396 * | | | | | |#|#|#|#| | | |
397 * + + + + + +v+ +v+v+ + + +
398 * | | | | | |#|#|#|#| | | |
399 * + + + + + + +v+ + + + + +
400 * | |#| | |#|#|#|#|#|#|#|#|
401 * + +v+ + +v+v+ +v+v+v+v+v+
402 * | |#| |#|#|#|#|#|#|#|#|#|
403 * + + + +v+ + +v+ + + + + +
404 * |#|#|#|#|#|#|#|#|#|#|#|#|
405 * +v+v+v+ +v+v+ +v+v+v+v+v+
406 * |#|#|#|#|#|#|#|#|#|#|#|#|
407 * + + + +v+ + +v+ + + + + +
408 * | |#| |#|#|#|#|#|#|#|#|#|
409 * + +v+ + +v+v+ +v+v+v+v+v+
410 * | |#| | |#|#|#|#|#|#|#|#|
411 * + + + + + + +v+ + + + + +
412 * | | | | | |#|#|#|#| | | |
413 * + + + + + +v+ +v+v+ + + +
414 * | | | | | |#|#|#|#| | | |
415 * +-+-+-+-+-+-+-+-+-+-+-+-+
416 * | |#|#| | |#| | |#|#|#|
417 * + +-+-+-+-+-+-+-+-+-+-+-+
418 * | |#|#| | |#| | |#|#|#|
419 * + +-+-+-+-+-+-+-+-+-+-+-+
420 * | |#|#| | |#| | |#|#|#|
421 * + +-+-+-+-+-+-+-+-+-+-+-+
422 * | |#|#| | |#| | |#|#|#|
423 * + +-+-+-+-+-+-+-+-+-+-+-+
424 * | |#|#| |#| | | |#|#|#|
425 * + +-+-+-+-+-+-+-+-+-+-+-+
426 * | |#|#| | |#| | |#|#|#|
427 * + +-+-+-+-+-+-+-+-+-+-+-+
428 * | |#|#| | |#| | |#|#|#|
429 * + +-+-+-+-+-+-+-+-+-+-+-+
430 * | |#|#| | |#| | |#|#|#|
431 * + +-+-+-+-+-+-+-+-+-+-+-+
432 * | |#|#| | |#| | |#|#|#|
433 * + +-+-+-+-+-+-+-+-+-+-+-+
434 * |# |#|#|#|#|#|#|#|#|#|
435 * + +-+-+-+-+-+-+-+-+-+-+-+
436 *
437 * We then perform the same merging process for rows as for
438 * columns - though there are no rows in the above example
439 * that can be merged.
440 *
441 * You'll note that, for example, we don't merge row 0 and
442 * row 1 in the above, because we have a pair of cells that
443 * are both full without crossing.
444 *
445 * Step 5: Cell spanning
446 *
447 * Now we actually start to output the table. We keep a 'sent_table'
448 * (a grid of W x H bools) to keep track of whether we've output
449 * the content for a given cell or not yet.
450 *
451 * For each cell we reach, assuming sent_table[x,y] is false,
452 * we merge it with as many cells on the right as required,
453 * according to 'v_crossed' values (subject to not passing
454 * v_lines or uncertainty == 0's).
455 *
456 * We then try to merge cells below according to 'h_crossed'
457 * values (subject to not passing h_lines or uncertainty == 0's).
458 *
459 * In theory this can leave us with some cases where we'd like
460 * to merge some cells (because of crossed) and can't (because
461 * of lines or sent_table[]) values. In the absence of better
462 * cell spanning algorithms we have no choice here.
463 *
464 * Then we output the contents and set sent_table[] values as
465 * appropriate.
466 *
467 * If a row has no cells in it, we currently omit the TR. If/when
468 * we figure out how to indicate rowspan/colspan in stext, we can
469 * revisit that.
470 */
471
472
473 static fz_stext_block *
474 add_grid_block(fz_context *ctx, fz_stext_page *page, fz_stext_block **first_block, fz_stext_block **last_block)
475 {
476 fz_stext_block *block = fz_pool_alloc(ctx, page->pool, sizeof(**first_block));
477 memset(block, 0, sizeof(*block));
478 block->type = FZ_STEXT_BLOCK_GRID;
479 block->bbox = fz_empty_rect; /* Fixes bug 703267. */
480 block->next = *first_block;
481 if (*first_block)
482 {
483 (*first_block)->prev = block;
484 assert(*last_block);
485 }
486 else
487 {
488 assert(*last_block == NULL);
489 *last_block = block;
490 }
491 *first_block = block;
492 return block;
493 }
494
495 static void
496 insert_block_before(fz_stext_block *block, fz_stext_block *before, fz_stext_page *page, fz_stext_struct *dest)
497 {
498 if (before)
499 {
500 /* We have a block to insert it before, so we know it's not the last. */
501 assert(dest ? (dest->first_block != NULL && dest->last_block != NULL) : (page->first_block != NULL && page->last_block != NULL));
502 block->next = before;
503 block->prev = before->prev;
504 if (before->prev)
505 {
506 assert(before->prev->next == before);
507 before->prev->next = block;
508 }
509 else if (dest)
510 {
511 assert(dest->first_block == before);
512 dest->first_block = block;
513 }
514 else
515 {
516 assert(page->first_block == before);
517 page->first_block = block;
518 }
519 before->prev = block;
520 }
521 else if (dest)
522 {
523 /* Will be the last block. */
524 block->next = NULL;
525 block->prev = dest->last_block;
526 if (dest->last_block)
527 {
528 assert(dest->last_block->next == NULL);
529 dest->last_block->next = block;
530 }
531 if (dest->first_block == NULL)
532 dest->first_block = block;
533 dest->last_block = block;
534 }
535 else
536 {
537 /* Will be the last block. */
538 block->next = NULL;
539 block->prev = page->last_block;
540 if (page->last_block)
541 {
542 assert(page->last_block->next == NULL);
543 page->last_block->next = block;
544 }
545 if (page->first_block == NULL)
546 page->first_block = block;
547 page->last_block = block;
548 }
549 }
550
551 static fz_stext_struct *
552 add_struct_block_before(fz_context *ctx, fz_stext_block *before, fz_stext_page *page, fz_stext_struct *parent, fz_structure std, const char *raw)
553 {
554 fz_stext_block *block;
555 int idx = 0;
556 size_t z;
557 fz_stext_struct *newstruct;
558
559 if (raw == NULL)
560 raw = "";
561 z = strlen(raw);
562
563 /* We're going to insert a struct block. We need an idx, so walk the list */
564 for (block = parent ? parent->first_block : page->first_block; block != before; block = block->next)
565 {
566 if (block->type == FZ_STEXT_BLOCK_STRUCT)
567 {
568 assert(block->u.s.index >= idx);
569 idx = block->u.s.index + 1;
570 }
571 }
572 /* So we'll add our block as idx. But all the other struct blocks that follow us need to have
573 * larger values. */
574
575 /* Update all the subsequent structs to have a higher idx */
576 if (before)
577 {
578 int idx2 = idx+1;
579 for (block = before->next; block != NULL; block = block->next)
580 {
581 if (block->type != FZ_STEXT_BLOCK_STRUCT)
582 continue;
583 if (block->u.s.index > idx2)
584 break;
585 block->u.s.index = idx2++;
586 }
587 }
588
589 /* Now make our new struct block and insert it. */
590 block = fz_pool_alloc(ctx, page->pool, sizeof(*block));
591 block->type = FZ_STEXT_BLOCK_STRUCT;
592 block->bbox = fz_empty_rect; /* Fixes bug 703267. */
593 insert_block_before(block, before, page, parent);
594
595 block->u.s.down = newstruct = fz_pool_alloc(ctx, page->pool, sizeof(*block->u.s.down) + z);
596 block->u.s.index = idx;
597 newstruct->parent = parent;
598 newstruct->standard = std;
599 memcpy(newstruct->raw, raw, z);
600 newstruct->raw[z] = 0;
601 newstruct->up = block;
602
603 return newstruct;
604 }
605
606 typedef struct
607 {
608 int len;
609 int max;
610 struct {
611 uint8_t left;
612 uint8_t weak;
613 float pos;
614 int freq;
615 } *list;
616 } div_list;
617
618 static void
619 div_list_push(fz_context *ctx, div_list *div, int left, int weak, float pos)
620 {
621 int i;
622
623 /* FIXME: Could be bsearch. */
624 for (i = 0; i < div->len; i++)
625 {
626 if (div->list[i].pos > pos)
627 break;
628 else if (div->list[i].pos == pos && div->list[i].left == left)
629 {
630 div->list[i].freq++;
631 div->list[i].weak &= weak;
632 return;
633 }
634 }
635
636 if (div->len == div->max)
637 {
638 int newmax = div->max * 2;
639 if (newmax == 0)
640 newmax = 32;
641 div->list = fz_realloc(ctx, div->list, sizeof(div->list[0]) * newmax);
642 div->max = newmax;
643 }
644
645 if (i < div->len)
646 memmove(&div->list[i+1], &div->list[i], sizeof(div->list[0]) * (div->len - i));
647 div->len++;
648 div->list[i].left = left;
649 div->list[i].weak = weak;
650 div->list[i].pos = pos;
651 div->list[i].freq = 1;
652 }
653
654 static fz_stext_grid_positions *
655 make_table_positions(fz_context *ctx, div_list *xs, float min, float max)
656 {
657 int wind;
658 fz_stext_grid_positions *pos;
659 int len = xs->len;
660 int i;
661 int hi = 0;
662
663 /* Count the number of edges */
664 int local_min = 0;
665 int edges = 2;
666
667 if (len == 0)
668 return NULL;
669
670 assert(xs->list[0].left);
671 for (i = 0; i < len; i++)
672 {
673 if (xs->list[i].pos >= min)
674 break;
675 }
676 for (; i < len; i++)
677 {
678 if (xs->list[i].pos >= max)
679 break;
680 if (xs->list[i].left)
681 {
682 if (local_min)
683 edges++;
684 }
685 else
686 local_min = 1;
687 }
688 assert(!xs->list[len-1].left);
689
690 pos = fz_malloc_flexible(ctx, fz_stext_grid_positions, list, edges);
691 pos->len = edges;
692
693 /* Copy the edges in */
694 wind = 0;
695 local_min = 0;
696 edges = 1;
697 pos->list[0].pos = min;
698 pos->list[0].min = min;
699 pos->list[0].max = fz_max(xs->list[0].pos, min);
700 pos->list[0].uncertainty = 0;
701 pos->list[0].reinforcement = 0;
702 #ifdef DEBUG_TABLE_HUNT
703 printf("|%g ", pos->list[0].pos);
704 #endif
705 /* Skip over entries to the left of min. */
706 for (i = 0; i < len; i++)
707 {
708 if (xs->list[i].pos >= min)
709 break;
710 if (xs->list[i].left)
711 wind += xs->list[i].freq;
712 else
713 wind -= xs->list[i].freq;
714 }
715 for (; i < len; i++)
716 {
717 if (xs->list[i].pos >= max)
718 break;
719 if (xs->list[i].left)
720 {
721 if (local_min)
722 {
723 pos->list[edges].min = xs->list[i-1].pos;
724 pos->list[edges].max = xs->list[i].pos;
725 pos->list[edges].pos = (xs->list[i-1].pos + xs->list[i].pos)/2;
726 pos->list[edges].uncertainty = wind;
727 #ifdef DEBUG_TABLE_HUNT
728 if (wind)
729 printf("?%g(%d) ", pos->list[edges].pos, wind);
730 else
731 printf("|%g ", pos->list[edges].pos);
732 #endif
733 edges++;
734 }
735 wind += xs->list[i].freq;
736 if (wind > hi)
737 hi = wind;
738 }
739 else
740 {
741 wind -= xs->list[i].freq;
742 local_min = 1;
743 }
744 }
745 assert(i < len || wind == 0);
746 pos->list[edges].pos = max;
747 pos->list[edges].min = fz_min(xs->list[i-1].pos, max);
748 pos->list[edges].max = max;
749 assert(max >= xs->list[i-1].pos);
750 pos->list[edges].uncertainty = 0;
751 pos->list[edges].reinforcement = 0;
752 pos->max_uncertainty = hi;
753 #ifdef DEBUG_TABLE_HUNT
754 printf("|%g\n", pos->list[edges].pos);
755 #endif
756
757 return pos;
758 }
759
760 static fz_stext_grid_positions *
761 copy_grid_positions_to_pool(fz_context *ctx, fz_stext_page *page, fz_stext_grid_positions *xs)
762 {
763 size_t z = offsetof(fz_stext_grid_positions, list) + sizeof(xs->list[0]) * (xs->len);
764 fz_stext_grid_positions *xs2 = fz_pool_alloc(ctx, page->pool, z);
765 memcpy(xs2, xs, z);
766 return xs2;
767 }
768
769 static void
770 sanitize_positions(fz_context *ctx, div_list *xs)
771 {
772 int i, j, wind, changed;
773
774 #ifdef DEBUG_TABLE_HUNT
775 printf("OK:\n");
776 for (i = 0; i < xs->len; i++)
777 {
778 if (xs->list[i].left)
779 printf("[");
780 printf("%g(%d%s)", xs->list[i].pos, xs->list[i].freq, xs->list[i].weak ? "weak" : "");
781 if (!xs->list[i].left)
782 printf("]");
783 printf(" ");
784 }
785 printf("\n");
786 #endif
787
788 if (xs->len == 0)
789 return;
790
791 do
792 {
793 /* Now, combine runs of left and right */
794 for (i = 0; i < xs->len; i++)
795 {
796 if (xs->list[i].left)
797 {
798 j = i;
799 while (i < xs->len-1 && xs->list[i+1].left)
800 {
801 i++;
802 xs->list[j].freq += xs->list[i].freq;
803 xs->list[j].weak &= xs->list[i].weak;
804 xs->list[i].freq = 0;
805 }
806 }
807 else
808 {
809 while (i < xs->len-1 && !xs->list[i+1].left)
810 {
811 i++;
812 xs->list[i].freq += xs->list[i-1].freq;
813 xs->list[i].weak &= xs->list[i-1].weak;
814 xs->list[i-1].freq = 0;
815 }
816 }
817 }
818
819 #ifdef DEBUG_TABLE_HUNT
820 printf("Shrunk:\n");
821 for (i = 0; i < xs->len; i++)
822 {
823 if (xs->list[i].left)
824 printf("[");
825 printf("%g(%d%s)", xs->list[i].pos, xs->list[i].freq, xs->list[i].weak ? "weak" : "");
826 if (!xs->list[i].left)
827 printf("]");
828 printf(" ");
829 }
830 printf("\n");
831 #endif
832
833 /* Now remove the 0 frequency ones. */
834 j = 0;
835 for (i = 0; i < xs->len; i++)
836 {
837 if (xs->list[i].freq == 0)
838 continue;
839 if (i != j)
840 xs->list[j] = xs->list[i];
841 j++;
842 }
843 xs->len = j;
844
845 /* Now run across looking for local minima where at least one
846 * edge is 'weak'. If the wind at that point is non-zero, then
847 * remove the weak edges from consideration and retry. */
848 wind = 0;
849 changed = 0;
850 i = 0;
851 while (1)
852 {
853 assert(xs->list[i].left);
854 for (; xs->list[i].left; i++)
855 {
856 wind += xs->list[i].freq;
857 }
858 assert(i < xs->len);
859 for (; xs->list[i].left == 0 && i < xs->len; i++)
860 {
861 wind -= xs->list[i].freq;
862 }
863 if (i == xs->len)
864 break;
865 if (wind != 0 && (xs->list[i-1].weak || xs->list[i].weak))
866 {
867 int m = fz_mini(xs->list[i-1].freq, xs->list[i].freq);
868 assert(m > 0);
869 xs->list[i-1].freq -= m;
870 xs->list[i].freq -= m;
871 changed = 1;
872 }
873 }
874 }
875 while (changed);
876
877 #ifdef DEBUG_TABLE_HUNT
878 printf("Compacted:\n");
879 for (i = 0; i < xs->len; i++)
880 {
881 if (xs->list[i].left)
882 printf("[");
883 printf("%g(%d%s)", xs->list[i].pos, xs->list[i].freq, xs->list[i].weak ? "weak" : "");
884 if (!xs->list[i].left)
885 printf("]");
886 printf(" ");
887 }
888 printf("\n");
889 #endif
890 }
891
892 /* We want to check for whether a DIV that we are about to descend into
893 * contains a column of justified text. We will accept some headers in
894 * this text, but not JUST headers. */
895 static int
896 all_blocks_are_justified_or_headers(fz_context *ctx, fz_stext_block *block)
897 {
898 int just_headers = 1;
899
900 for (; block != NULL; block = block->next)
901 {
902 if (block->type == FZ_STEXT_BLOCK_STRUCT)
903 {
904 if (block->u.s.down == NULL)
905 continue;
906 if (block->u.s.down->standard == FZ_STRUCTURE_H ||
907 block->u.s.down->standard == FZ_STRUCTURE_H1 ||
908 block->u.s.down->standard == FZ_STRUCTURE_H2 ||
909 block->u.s.down->standard == FZ_STRUCTURE_H3 ||
910 block->u.s.down->standard == FZ_STRUCTURE_H4 ||
911 block->u.s.down->standard == FZ_STRUCTURE_H5 ||
912 block->u.s.down->standard == FZ_STRUCTURE_H6)
913 continue;
914 if (!all_blocks_are_justified_or_headers(ctx, block->u.s.down->first_block))
915 return 0;
916 }
917 just_headers = 0;
918 if (block->type == FZ_STEXT_BLOCK_TEXT && block->u.t.flags != FZ_STEXT_TEXT_JUSTIFY_FULL)
919 return 0;
920 }
921
922 if (just_headers)
923 return 0;
924
925 return 1;
926 }
927
928 #define TWO_INCHES (72*2)
929
930 /* Walk through the blocks, finding the bbox. */
931 static fz_rect
932 walk_to_find_bounds(fz_context *ctx, fz_stext_block *first_block)
933 {
934 fz_rect bounds = fz_empty_rect;
935 fz_stext_block *block;
936 fz_stext_line *line;
937 fz_stext_char *ch;
938
939 for (block = first_block; block != NULL; block = block->next)
940 {
941 switch (block->type)
942 {
943 case FZ_STEXT_BLOCK_STRUCT:
944 if (!block->u.s.down)
945 continue;
946 if (block->u.s.down->standard == FZ_STRUCTURE_H)
947 {
948 if (block->next != NULL &&
949 block->next->type == FZ_STEXT_BLOCK_TEXT &&
950 block->next->u.t.flags == FZ_STEXT_TEXT_JUSTIFY_FULL)
951 continue;
952 }
953 bounds = fz_union_rect(bounds, walk_to_find_bounds(ctx, block->u.s.down->first_block));
954 break;
955 case FZ_STEXT_BLOCK_VECTOR:
956 bounds = fz_union_rect(bounds, block->bbox);
957 break;
958 case FZ_STEXT_BLOCK_TEXT:
959 if (block->u.t.flags == FZ_STEXT_TEXT_JUSTIFY_FULL && block->bbox.x1 - block->bbox.x0 >= TWO_INCHES)
960 continue;
961 for (line = block->u.t.first_line; line != NULL; line = line->next)
962 {
963 for (ch = line->first_char; ch != NULL; ch = ch->next)
964 {
965 if (ch->c != ' ')
966 bounds = fz_union_rect(bounds, fz_rect_from_quad(ch->quad));
967 }
968 }
969 break;
970 }
971 }
972
973 return bounds;
974 }
975
976 static void
977 walk_to_find_content(fz_context *ctx, div_list *xs, div_list *ys, fz_stext_block *first_block, fz_rect bounds)
978 {
979 fz_stext_block *block;
980 fz_stext_line *line;
981 fz_stext_char *ch;
982
983 for (block = first_block; block != NULL; block = block->next)
984 {
985 switch (block->type)
986 {
987 case FZ_STEXT_BLOCK_STRUCT:
988 if (block->u.s.down && !fz_is_empty_rect(fz_intersect_rect(bounds, block->bbox)))
989 walk_to_find_content(ctx, xs, ys, block->u.s.down->first_block, bounds);
990 break;
991 case FZ_STEXT_BLOCK_VECTOR:
992 break;
993 case FZ_STEXT_BLOCK_TEXT:
994 {
995 fz_rect justified_region = fz_empty_rect;
996 for (line = block->u.t.first_line; line != NULL; line = line->next)
997 {
998 fz_rect region = fz_empty_rect;
999
1000 region.y0 = line->bbox.y0;
1001 region.y1 = line->bbox.y1;
1002
1003 if (region.y0 < bounds.y0)
1004 region.y0 = bounds.y0;
1005 if (region.y1 > bounds.y1)
1006 region.y1 = bounds.y1;
1007 if (region.y0 >= region.y1)
1008 break;
1009
1010 /* Skip leading spaces. */
1011 for (ch = line->first_char; ch != NULL; ch = ch->next)
1012 if (ch->c != ' ')
1013 break;
1014
1015 for (; ch != NULL; ch = ch->next)
1016 {
1017 if (ch->c == ' ')
1018 {
1019 /* Find the last space char in this run. */
1020 fz_stext_char *last_space;
1021
1022 for (last_space = ch; last_space->next != NULL && last_space->next->c == ' '; last_space = last_space->next);
1023
1024 /* If we're not the last char in the line (i.e. we're not a trailing space,
1025 * then send a 'weak' gap for the spaces, assuming it's sane to do so). */
1026 if (last_space->next != NULL)
1027 {
1028 float rpos = fz_min(ch->quad.ll.x, ch->quad.ul.x);
1029 float lpos = fz_min(last_space->next->quad.ll.x, last_space->next->quad.ll.x);
1030
1031 /* Clamp these to the bounds */
1032 rpos = fz_clamp(rpos, bounds.x0, bounds.x1);
1033 lpos = fz_clamp(lpos, bounds.x0, bounds.x1);
1034
1035 /* So we have a region (rpos...lpos) to add. */
1036 /* This can be positioned in various different ways relative to the
1037 * current region:
1038 * [region]
1039 * (rpos..lpos) OK
1040 * (rpos..lpos) OK, but adjust lpos
1041 * (rpos..lpos) OK, but adjust rpos
1042 * (rpos..lpos) OK
1043 * (rpos .. lpos) OK
1044 */
1045 if (lpos >= region.x1)
1046 {
1047 if (rpos >= region.x0 && rpos < region.x1)
1048 rpos = region.x1;
1049 }
1050 else if (rpos <= region.x0)
1051 {
1052 if (lpos > region.x0)
1053 lpos = region.x0;
1054 }
1055 else
1056 rpos = lpos; /* Make it an invalid region */
1057
1058 if (rpos < lpos)
1059 {
1060 /* Send a weak right at the start of the spaces... */
1061 div_list_push(ctx, xs, 0, 1, rpos);
1062 /* and a weak left at the end. */
1063 div_list_push(ctx, xs, 1, 1, lpos);
1064 }
1065
1066 /* Expand the region as required. */
1067 if (rpos < region.x0)
1068 region.x0 = rpos;
1069 if (lpos > region.x1)
1070 region.x1 = lpos;
1071 }
1072 /* Jump over the spaces */
1073 ch = last_space;
1074 }
1075 else
1076 {
1077 float lpos = fz_min(ch->quad.ll.x, ch->quad.ul.x);
1078 float rpos = fz_max(ch->quad.lr.x, ch->quad.ur.x);
1079 if (lpos < region.x0)
1080 region.x0 = lpos;
1081 if (rpos > region.x1)
1082 region.x1 = rpos;
1083 }
1084 }
1085 if (!fz_is_empty_rect(region))
1086 {
1087 div_list_push(ctx, xs, 1, 0, region.x0);
1088 div_list_push(ctx, xs, 0, 0, region.x1);
1089 /* For justified regions, we don't break after each line, but
1090 * rather before/after the region as a whole. */
1091 if (block->u.t.flags != FZ_STEXT_TEXT_JUSTIFY_FULL)
1092 {
1093 div_list_push(ctx, ys, 1, 0, region.y0);
1094 div_list_push(ctx, ys, 0, 0, region.y1);
1095 }
1096 else
1097 justified_region = fz_union_rect(justified_region, region);
1098 }
1099 }
1100 if (!fz_is_empty_rect(justified_region) && block->u.t.flags == FZ_STEXT_TEXT_JUSTIFY_FULL)
1101 {
1102 div_list_push(ctx, ys, 1, 0, justified_region.y0);
1103 div_list_push(ctx, ys, 0, 0, justified_region.y1);
1104 }
1105 break;
1106 }
1107 }
1108 }
1109 }
1110
1111 /* One of our datastructures (cells_t) is an array of details about the
1112 * cells that make up our table. It's a w * h array of cell_t's. Each
1113 * cell contains data on one of the cells in the table, as you'd expect.
1114 *
1115 * . .
1116 * . .
1117 * - - +-------+ - -
1118 * | .
1119 * | .
1120 * | .
1121 * - - + - - - + - -
1122 * . .
1123 * . .
1124 *
1125 * For any given cell, we store details about the top (lowest y coord)
1126 * and left (lowest x coord) edges. Specifically we store whether
1127 * there is a line at this position (h_line and v_line) (i.e. a drawn
1128 * border), and we also store whether content crosses this edge (h_crossed
1129 * and y_crossed). Finally, we store whether the cell has any content
1130 * in it at all (full).
1131 *
1132 * A table which has w positions across and h positions vertically, will
1133 * only really have (w-1) * (h-1) cells. We store w*h though to allow for
1134 * the right and bottom edges to have their lines represented.
1135 */
1136
1137 typedef struct
1138 {
1139 int h_line;
1140 int v_line;
1141 int h_crossed;
1142 int v_crossed;
1143 int full;
1144 } cell_t;
1145
1146 typedef struct
1147 {
1148 int w;
1149 int h;
1150 cell_t cell[FZ_FLEXIBLE_ARRAY];
1151 } cells_t;
1152
1153 typedef struct
1154 {
1155 cells_t *cells;
1156 fz_stext_grid_positions *xpos;
1157 fz_stext_grid_positions *ypos;
1158 fz_rect bounds;
1159 } grid_walker_data;
1160
1161 static cell_t *
1162 get_cell(cells_t *cells, int x, int y)
1163 {
1164 return &cells->cell[x + y * cells->w];
1165 }
1166
1167 #ifdef DEBUG_TABLE_STRUCTURE
1168 static void
1169 asciiart_table(grid_walker_data *gd);
1170 #endif
1171
1172 static fz_stext_grid_positions *
1173 split_grid_pos(fz_context *ctx, grid_walker_data *gd, int row, int i, int early)
1174 {
1175 fz_stext_grid_positions **posp = row ? &gd->ypos : &gd->xpos;
1176 fz_stext_grid_positions *pos = *posp;
1177 int n = pos->len;
1178 int x, y, w, h;
1179 cells_t *cells;
1180
1181 /* Realloc the required structs */
1182 *posp = pos = fz_realloc_flexible(ctx, pos, fz_stext_grid_positions, list, n+1);
1183 cells = gd->cells = fz_realloc_flexible(ctx, gd->cells, cells_t, cell, (gd->cells->w + (1-row)) * (gd->cells->h + row));
1184 /* If both pass, then we're safe to shuffle the data. */
1185
1186 #ifdef DEBUG_TABLE_STRUCTURE
1187 printf("Before split %s %d\n", row ? "row" : "col", i);
1188 asciiart_table(gd);
1189 #endif
1190
1191 assert(i >= 0 && i < n);
1192 memmove(&pos->list[i+1], &pos->list[i], sizeof(pos->list[0]) * (n-i));
1193 pos->len++;
1194
1195 /* Next expand the cells. Only h_line and v_line are filled in so far. */
1196 w = cells->w;
1197 h = cells->h;
1198 if (early && i > 0)
1199 i--, early = 0;
1200 if (row)
1201 {
1202 /* Add a row */
1203 cells->h = h+1;
1204 /* Expand the table, duplicating row i */
1205 memmove(&cells->cell[(i+1)*w], &cells->cell[i*w], sizeof(cells->cell[0])*(h-i)*w);
1206
1207 if (early)
1208 {
1209 /* We are splitting row 0 into 0 and 1, with 0 being the new one. */
1210 for (x = 0; x < w; x++)
1211 {
1212 cells->cell[x].h_line = 0;
1213 cells->cell[x].v_line = 0;
1214 }
1215 }
1216 else
1217 {
1218 /* We are splitting row i into i and i+1, with i+1 being the new one. */
1219 /* v_lines are carried over. h_lines need to be unset. */
1220 for (x = 0; x < w; x++)
1221 cells->cell[x + (i+1)*w].h_line = 0;
1222 }
1223 }
1224 else
1225 {
1226 /* Add a column */
1227 cells->w = w+1;
1228 /* Expand the table, duplicating column i */
1229 for (y = h-1; y >= 0; y--)
1230 {
1231 for (x = w; x > i; x--)
1232 cells->cell[x + y*(w+1)] = cells->cell[x-1 + y*w];
1233 for (; x >= 0; x--)
1234 cells->cell[x + y*(w+1)] = cells->cell[x + y*w];
1235 }
1236 if (early)
1237 {
1238 /* We are splitting col 0 into 0 and 1, with 0 being the new one. */
1239 for (y = 0; y < h; y++)
1240 {
1241 cells->cell[y*(w+1)].h_line = 0;
1242 cells->cell[y*(w+1)].v_line = 0;
1243 }
1244 }
1245 else
1246 {
1247 /* h_lines are carried over. v_lines need to be reset */
1248 for (y = 0; y < h; y++)
1249 cells->cell[i+1 + y*(w+1)].v_line = 0;
1250 }
1251 }
1252
1253 #ifdef DEBUG_TABLE_STRUCTURE
1254 printf("After split\n");
1255 asciiart_table(gd);
1256 #endif
1257 return pos;
1258 }
1259
1260 /* This routine finds (and reinforces) grid positions for lines.
1261 *
1262 * If we have a thin line from (x0, y0) to (x1, y0), then we are
1263 * pretty sure that y0 will be on the edge of a cell. We are less
1264 * sure that x0 and x1 will match up to the edge of a cell.
1265 * Stylistically some tables overrun or underrun such lines.
1266 *
1267 * Similarly from (x0, y0) to (x0, y1), we expect x0 to be accurate
1268 * but y0 and y1 less so.
1269 *
1270 * If we have a wider rectangle, from (x0, y0) to (x1, y1) then
1271 * we fully expect all sides to be accurate.
1272 */
1273 static int
1274 find_grid_pos(fz_context *ctx, grid_walker_data *gd, int row, float x, int inaccurate)
1275 {
1276 const int WIGGLE_ROOM = 2;
1277 int i;
1278 fz_stext_grid_positions *pos = row ? gd->ypos : gd->xpos;
1279
1280 assert(x >= pos->list[0].min && x <= pos->list[pos->len-1].max);
1281
1282 #ifdef DEBUG_TABLE_STRUCTURE
1283 printf("Looking for %g in %s splits:\n", x, row ? "row" : "col");
1284 for (i = 0; i < pos->len; i++)
1285 {
1286 printf("%d\t%g\t%g\t%g\t%d\n", i, pos->list[i].min, pos->list[i].pos, pos->list[i].max, pos->list[i].reinforcement);
1287 }
1288 #endif
1289
1290 while (inaccurate) /* So we can break out */
1291 {
1292 float prev = 0;
1293
1294 /* If we start/finish outside the range of the table, then we
1295 * want to extend the table. So ignore 'inaccurate' in this
1296 * case. Match the logic below. */
1297 if (x < pos->list[0].min)
1298 break;
1299 if (x < pos->list[0].pos - WIGGLE_ROOM && pos->list[0].reinforcement > 0)
1300 break;
1301 if (x > pos->list[pos->len-1].max)
1302 break;
1303 if (x > pos->list[pos->len-1].pos + WIGGLE_ROOM && pos->list[pos->len-1].reinforcement > 0)
1304 break;
1305
1306 /* Just find the closest one. No reinforcement. */
1307 for (i = 0; i < pos->len; i++)
1308 {
1309 if (x < pos->list[i].min)
1310 {
1311 float mid = (prev + pos->list[i].min)/2;
1312 if (x < mid)
1313 return i-1;
1314 return i;
1315 }
1316 prev = pos->list[i].max;
1317 if (x <= prev)
1318 return i;
1319 }
1320 assert("Never happens" == NULL);
1321
1322 return -1;
1323 }
1324
1325 for (i = 0; i < pos->len; i++)
1326 {
1327 if (x < pos->list[i].min)
1328 {
1329 /* Split i into i and i+1, and make i the new one. */
1330 assert(i > 0);
1331 #ifdef DEBUG_TABLE_STRUCTURE
1332 printf("Splitting before %d\n", i);
1333 #endif
1334 pos = split_grid_pos(ctx, gd, row, i, 1);
1335 pos->list[i-1].max = pos->list[i].min = (pos->list[i-1].max + x)/2;
1336 pos->list[i].pos = x;
1337 pos->list[i].max = pos->list[i+1].min = (pos->list[i+1].pos + x)/2;
1338 pos->list[i].reinforcement = 1;
1339 return i;
1340 }
1341 else if (x <= pos->list[i].max)
1342 {
1343 /* We are in the range for the ith divider. */
1344 if (pos->list[i].reinforcement == 0)
1345 {
1346 /* If we've not been reinforced before, reinforce now. */
1347 pos->list[i].pos = x;
1348 pos->list[i].reinforcement = 1;
1349 return i;
1350 }
1351 /* We've been reinforced before. This ought to be a pretty good
1352 * indication. */
1353 if (pos->list[i].pos - WIGGLE_ROOM < x && x < pos->list[i].pos + WIGGLE_ROOM)
1354 {
1355 /* We are a close match to the previously predicted pos
1356 * value. */
1357 pos->list[i].pos = pos->list[i].pos * pos->list[i].reinforcement + x;
1358 pos->list[i].pos /= ++pos->list[i].reinforcement;
1359 return i;
1360 }
1361 /* We need to split i into i and i+1. */
1362 pos = split_grid_pos(ctx, gd, row, i, pos->list[i].pos > x);
1363 if (pos->list[i].pos > x)
1364 {
1365 /* Make i the new one */
1366 #ifdef DEBUG_TABLE_STRUCTURE
1367 printf("Splitting %d (early)\n", i);
1368 #endif
1369 pos->list[i].pos = x;
1370 pos->list[i].max = pos->list[i+1].min = (pos->list[i+1].pos + x)/2;
1371 pos->list[i].reinforcement = 1;
1372 return i;
1373 }
1374 else
1375 {
1376 /* Make i+1 the new one */
1377 #ifdef DEBUG_TABLE_STRUCTURE
1378 printf("Splitting %d (late)\n", i);
1379 #endif
1380 pos->list[i+1].pos = x;
1381 pos->list[i].max = pos->list[i+1].min = (pos->list[i].pos + x)/2;
1382 pos->list[i].reinforcement = 1;
1383 return i+1;
1384 }
1385 }
1386 }
1387 assert("Never happens" == NULL);
1388
1389 return -1;
1390 }
1391
1392 static int
1393 find_cell_l(fz_stext_grid_positions *pos, float x)
1394 {
1395 int i;
1396
1397 for (i = 0; i < pos->len; i++)
1398 if (x < pos->list[i].pos)
1399 return i-1;
1400
1401 return -1;
1402 }
1403
1404 static int
1405 find_cell_r(fz_stext_grid_positions *pos, float x)
1406 {
1407 int i;
1408
1409 for (i = 0; i < pos->len; i++)
1410 if (x <= pos->list[i].pos)
1411 return i-1;
1412
1413 return -1;
1414 }
1415
1416 /* Add a horizontal line. Return 1 if the line doesn't seem to be a border line.
1417 * Record which cells that was a border for. */
1418 static void
1419 add_h_line(fz_context *ctx, grid_walker_data *gd, float x0, float x1, float y0, float y1)
1420 {
1421 int start = find_grid_pos(ctx, gd, 0, x0, 1);
1422 int end = find_grid_pos(ctx, gd, 0, x1, 1);
1423 float y = (y0 + y1) / 2;
1424 int yidx = find_grid_pos(ctx, gd, 1, y, 0);
1425 int i;
1426
1427 assert(start > 0 && end > 0);
1428
1429 for (i = start; i < end; i++)
1430 get_cell(gd->cells, i, yidx)->h_line++;
1431 }
1432
1433 /* Add a vertical line. Return 1 if the line doesn't seem to be a border line.
1434 * Record which cells that was a border for. */
1435 static void
1436 add_v_line(fz_context *ctx, grid_walker_data *gd, float y0, float y1, float x0, float x1)
1437 {
1438 int start = find_grid_pos(ctx, gd, 1, y0, 1);
1439 int end = find_grid_pos(ctx, gd, 1, y1, 1);
1440 float x = (x0 + x1) / 2;
1441 int xidx = find_grid_pos(ctx, gd, 0, x, 0);
1442 int i;
1443
1444 for (i = start; i < end; i++)
1445 get_cell(gd->cells, xidx, i)->v_line++;
1446 }
1447
1448 static void
1449 add_hv_line(fz_context *ctx, grid_walker_data *gd, float x0, float x1, float y0, float y1, int stroked)
1450 {
1451 int ix0 = find_grid_pos(ctx, gd, 0, x0, 0);
1452 int ix1 = find_grid_pos(ctx, gd, 0, x1, 0);
1453 int iy0 = find_grid_pos(ctx, gd, 1, y0, 0);
1454 int iy1 = find_grid_pos(ctx, gd, 1, y1, 0);
1455 int i;
1456
1457 if (stroked)
1458 {
1459 for (i = ix0; i < ix1; i++)
1460 {
1461 get_cell(gd->cells, i, iy0)->h_line++;
1462 get_cell(gd->cells, i, iy1)->h_line++;
1463 }
1464 for (i = iy0; i < iy1; i++)
1465 {
1466 get_cell(gd->cells, ix0, i)->v_line++;
1467 get_cell(gd->cells, ix1, i)->v_line++;
1468 }
1469 }
1470 }
1471
1472 /* Shared internal routine with stext-boxer.c */
1473 fz_rect fz_collate_small_vector_run(fz_stext_block **blockp);
1474
1475 static void
1476 walk_grid_lines(fz_context *ctx, grid_walker_data *gd, fz_stext_block *block)
1477 {
1478 for (; block != NULL; block = block->next)
1479 {
1480 if (block->type == FZ_STEXT_BLOCK_STRUCT)
1481 {
1482 if (block->u.s.down)
1483 walk_grid_lines(ctx, gd, block->u.s.down->first_block);
1484 continue;
1485 }
1486 else if (block->type == FZ_STEXT_BLOCK_VECTOR)
1487 {
1488 fz_rect r;
1489 float w, h;
1490
1491 /* Only process rectangle blocks. */
1492 if ((block->u.v.flags & FZ_STEXT_VECTOR_IS_RECTANGLE) == 0)
1493 continue;
1494
1495 r = fz_collate_small_vector_run(&block);
1496 r = fz_intersect_rect(r, gd->bounds);
1497 if (!fz_is_valid_rect(r))
1498 continue;
1499
1500 w = r.x1 - r.x0;
1501 h = r.y1 - r.y0;
1502 if (w > h && h < 2)
1503 {
1504 /* Thin, wide line */
1505 (void) add_h_line(ctx, gd, r.x0, r.x1, r.y0, r.y1);
1506 }
1507 else if (w < h && w < 2)
1508 {
1509 /* Thin, wide line */
1510 (void) add_v_line(ctx, gd, r.y0, r.y1, r.x0, r.x1);
1511 }
1512 else
1513 {
1514 /* Rectangle */
1515 (void) add_hv_line(ctx, gd, r.x0, r.x1, r.y0, r.y1, block->u.v.flags & FZ_STEXT_VECTOR_IS_STROKED);
1516 }
1517 }
1518 }
1519 }
1520
1521 static int
1522 is_numeric(int c)
1523 {
1524 return (c >= '0' && c <= '9');
1525 }
1526
1527 static int
1528 mark_cells_for_content(fz_context *ctx, grid_walker_data *gd, fz_rect s)
1529 {
1530 fz_rect r = fz_intersect_rect(gd->bounds, s);
1531 int x0, x1, y0, y1, x, y;
1532
1533 if (fz_is_empty_rect(r))
1534 return 0;
1535
1536 x0 = find_cell_l(gd->xpos, r.x0);
1537 x1 = find_cell_r(gd->xpos, r.x1);
1538 y0 = find_cell_l(gd->ypos, r.y0);
1539 y1 = find_cell_r(gd->ypos, r.y1);
1540
1541 if (x0 < 0 || x1 < 0 || y0 < 0 || y1 < 0)
1542 return 1;
1543 if (x0 < x1)
1544 {
1545 for (y = y0; y <= y1; y++)
1546 for (x = x0; x < x1; x++)
1547 get_cell(gd->cells, x+1, y)->v_crossed++;
1548 }
1549 if (y0 < y1)
1550 {
1551 for (y = y0; y < y1; y++)
1552 for (x = x0; x <= x1; x++)
1553 get_cell(gd->cells, x, y+1)->h_crossed++;
1554 }
1555 for (y = y0; y <= y1; y++)
1556 for (x = x0; x <= x1; x++)
1557 get_cell(gd->cells, x, y)->full++;
1558
1559 return 0;
1560 }
1561
1562 #define IN_CELL 0
1563 #define IN_BORDER 1
1564 #define IN_UNKNOWN 2
1565
1566 #define SPLIT_MARGIN 4
1567 static int
1568 where_is(fz_stext_grid_positions *pos, float x, int *in)
1569 {
1570 int i;
1571
1572 *in = IN_UNKNOWN;
1573
1574 /* If the table is empty, nothing to do. */
1575 if (pos->len == 0)
1576 return -1;
1577
1578 /* If we are completely outside the table, give up. */
1579 if (x <= pos->list[0].pos - SPLIT_MARGIN || x >= pos->list[pos->len-1].max + SPLIT_MARGIN)
1580 return -1;
1581
1582 for (i = 0; i < pos->len; i++)
1583 {
1584 /* Calculate the region (below..above) that counts as being
1585 * on the border of position i. */
1586 float prev = i > 0 ? pos->list[i-1].max : pos->list[0].min;
1587 float next = i < pos->len-1 ? pos->list[i+1].min : pos->list[i].max;
1588 float below = pos->list[i].pos - SPLIT_MARGIN;
1589 float above = pos->list[i].pos + SPLIT_MARGIN;
1590 /* Find the distance half way back to the previous pos as
1591 * a limit to our margin. */
1592 prev = (prev + pos->list[i].pos)/2;
1593 next = (next + pos->list[i].pos)/2;
1594 if (below < prev)
1595 below = prev;
1596 if (above > next)
1597 above = next;
1598
1599 if (x < below)
1600 {
1601 *in = IN_CELL;
1602 return i-1;
1603 }
1604 else if (x <= above)
1605 {
1606 *in = IN_BORDER;
1607 return i;
1608 }
1609 }
1610
1611 *in = IN_BORDER;
1612 return i-1;
1613 }
1614
1615 enum
1616 {
1617 VECTOR_IS_CONTENT = 0,
1618 VECTOR_IS_BORDER = 1,
1619 VECTOR_IS_UNKNOWN = 2,
1620 VECTOR_IS_IGNORABLE = 3
1621 };
1622
1623 /* So a vector can either be a border, or contained
1624 * in some cells, or something completely else. */
1625 static int
1626 classify_vector(fz_context *ctx, grid_walker_data *gd, fz_rect r, int is_rect)
1627 {
1628 int at_x0, at_x1, at_y0, at_y1;
1629 int ix0, ix1, iy0, iy1;
1630
1631 r = fz_intersect_rect(r, gd->bounds);
1632 if (fz_is_empty_rect(r))
1633 return VECTOR_IS_IGNORABLE;
1634 ix0 = where_is(gd->xpos, r.x0, &at_x0);
1635 ix1 = where_is(gd->xpos, r.x1, &at_x1);
1636 iy0 = where_is(gd->ypos, r.y0, &at_y0);
1637 iy1 = where_is(gd->ypos, r.y1, &at_y1);
1638
1639 /* No idea, just treat it as a border. */
1640 if (at_x0 == IN_UNKNOWN || at_x1 == IN_UNKNOWN || at_y0 == IN_UNKNOWN || at_y1 == IN_UNKNOWN)
1641 return VECTOR_IS_IGNORABLE;
1642
1643 if (at_x0 == IN_BORDER && at_x1 == IN_BORDER)
1644 {
1645 /* Vector is aligned along sides of cells. */
1646 return is_rect ? VECTOR_IS_BORDER : VECTOR_IS_CONTENT;
1647 }
1648 if (at_y0 == IN_BORDER && at_y1 == IN_BORDER)
1649 {
1650 /* Vector is aligned along sides of cells. */
1651 return is_rect ? VECTOR_IS_BORDER : VECTOR_IS_CONTENT;
1652 }
1653 if (at_x0 == IN_CELL && at_x1 == IN_CELL)
1654 {
1655 /* Content within a cell (or 1d range of cells). */
1656 return VECTOR_IS_CONTENT;
1657 }
1658 if (at_y0 == IN_CELL && at_y1 == IN_CELL)
1659 {
1660 /* Content within a cell (or 1d range of cells). */
1661 return VECTOR_IS_CONTENT;
1662 }
1663 if (at_x0 == IN_BORDER && at_x1 == IN_CELL && ix0 == ix1)
1664 {
1665 return is_rect ? VECTOR_IS_BORDER : VECTOR_IS_CONTENT;
1666 }
1667 if (at_x0 == IN_CELL && at_x1 == IN_BORDER && ix0+1 == ix1)
1668 {
1669 return is_rect ? VECTOR_IS_BORDER : VECTOR_IS_CONTENT;
1670 }
1671 if (at_y0 == IN_BORDER && at_y1 == IN_CELL && iy0 == iy1)
1672 {
1673 return is_rect ? VECTOR_IS_BORDER : VECTOR_IS_CONTENT;
1674 }
1675 if (at_y0 == IN_CELL && at_y1 == IN_BORDER && iy0+1 == iy1)
1676 {
1677 return is_rect ? VECTOR_IS_BORDER : VECTOR_IS_CONTENT;
1678 }
1679
1680 if (is_rect)
1681 {
1682 return VECTOR_IS_IGNORABLE;
1683 }
1684
1685 /* unknown - take this as indication that this maybe isn't
1686 * table. */
1687 return VECTOR_IS_UNKNOWN;
1688 }
1689
1690 #undef IN_CELL
1691 #undef IN_BORDER
1692 #undef IN_UNKNOWN
1693
1694
1695 /* Walk through the content, looking at how it spans our grid.
1696 * Record gridlines, which cells have content that cross into
1697 * neighbours, and which cells have any content at all.
1698 * Return a count of vector graphics that are found that don't
1699 * look plausibly like cell contents. */
1700 static int
1701 calculate_spanned_content(fz_context *ctx, grid_walker_data *gd, fz_stext_block *block)
1702 {
1703 int duff = 0;
1704 fz_rect bounds = {
1705 gd->xpos->list[0].pos,
1706 gd->ypos->list[0].pos,
1707 gd->xpos->list[gd->xpos->len-1].pos,
1708 gd->ypos->list[gd->ypos->len-1].pos };
1709
1710 for (; block != NULL; block = block->next)
1711 {
1712 if (block->type == FZ_STEXT_BLOCK_STRUCT)
1713 {
1714 if (block->u.s.down)
1715 duff += calculate_spanned_content(ctx, gd, block->u.s.down->first_block);
1716 continue;
1717 }
1718 else if (block->type == FZ_STEXT_BLOCK_VECTOR)
1719 {
1720 switch (classify_vector(ctx, gd, block->bbox, !!(block->u.v.flags & FZ_STEXT_VECTOR_IS_RECTANGLE)))
1721 {
1722 case VECTOR_IS_CONTENT:
1723 mark_cells_for_content(ctx, gd, block->bbox);
1724 break;
1725 case VECTOR_IS_BORDER:
1726 case VECTOR_IS_IGNORABLE:
1727 break;
1728 default:
1729 duff++;
1730 break;
1731 }
1732 }
1733 else if (block->type == FZ_STEXT_BLOCK_TEXT)
1734 {
1735 fz_stext_line *line;
1736
1737 if (block->bbox.x0 >= bounds.x1 || block->bbox.y0 >= bounds.y1 ||
1738 block->bbox.x1 <= bounds.x0 || block->bbox.y1 <= bounds.y0)
1739 continue;
1740
1741 for (line = block->u.t.first_line; line != NULL; line = line->next)
1742 {
1743 fz_stext_char *ch = line->first_char;
1744 int was_numeric = 0;
1745
1746 /* Skip leading spaces */
1747 while (ch != NULL && ch->c == ' ')
1748 ch = ch->next;
1749
1750 for (; ch != NULL; ch = ch->next)
1751 {
1752 if (ch->c == 32)
1753 {
1754 /* Trailing space, skip it. */
1755 if (ch->next == NULL)
1756 break;
1757 if (ch->next->c == 32)
1758 {
1759 /* Run of spaces. Skip 'em. */
1760 while (ch->next && ch->next->c == 32)
1761 ch = ch->next;
1762 was_numeric = 0;
1763 continue;
1764 }
1765 if (was_numeric || is_numeric(ch->next->c))
1766 {
1767 /* Single spaces around numbers are ignored. */
1768 was_numeric = 0;
1769 continue;
1770 }
1771 /* A single space. Accept it. */
1772 was_numeric = 0;
1773 }
1774 else
1775 was_numeric = is_numeric(ch->c);
1776 duff += mark_cells_for_content(ctx, gd, fz_rect_from_quad(ch->quad));
1777 }
1778 }
1779 }
1780 }
1781
1782 return duff;
1783 }
1784
1785 static cells_t *new_cells(fz_context *ctx, int w, int h)
1786 {
1787 cells_t *cells = fz_malloc_flexible(ctx, cells_t, cell, w * h);
1788 cells->w = w;
1789 cells->h = h;
1790
1791 return cells;
1792 }
1793
1794 #ifdef DEBUG_TABLE_STRUCTURE
1795 static void
1796 asciiart_table(grid_walker_data *gd)
1797 {
1798 int w = gd->xpos->len;
1799 int h = gd->ypos->len;
1800 int x, y;
1801
1802 for (y = 0; y < h; y++)
1803 {
1804 for (x = 0; x < w-1; x++)
1805 {
1806 cell_t *cell = get_cell(gd->cells, x, y);
1807 int line = cell->h_line;
1808 int erase = cell->h_crossed;
1809 printf("+");
1810 if (line && !erase)
1811 {
1812 printf("-");
1813 }
1814 else if (!line && erase)
1815 {
1816 printf("v");
1817 }
1818 else if (line && erase)
1819 {
1820 printf("*");
1821 }
1822 else
1823 {
1824 printf(" ");
1825 }
1826 }
1827 printf("+\n");
1828 if (y == h-1)
1829 break;
1830 for (x = 0; x < w; x++)
1831 {
1832 cell_t *cell = get_cell(gd->cells, x, y);
1833 int line = cell->v_line;
1834 int erase = cell->v_crossed;
1835 if (line && !erase)
1836 {
1837 printf("|");
1838 }
1839 else if (!line && erase)
1840 {
1841 printf(">");
1842 }
1843 else if (line && erase)
1844 {
1845 printf("*");
1846 }
1847 else
1848 {
1849 printf(" ");
1850 }
1851 if (x < w-1)
1852 {
1853 if (cell->full)
1854 printf("#");
1855 else
1856 printf(" ");
1857 }
1858 else
1859 printf("\n");
1860 }
1861 }
1862 }
1863 #endif
1864
1865 static void
1866 recalc_bbox(fz_stext_block *block)
1867 {
1868 fz_rect bbox = fz_empty_rect;
1869 fz_stext_line *line;
1870
1871 for (line = block->u.t.first_line; line != NULL; line = line->next)
1872 bbox = fz_union_rect(bbox, line->bbox);
1873
1874 block->bbox = bbox;
1875 }
1876
1877 static void
1878 unlink_line_from_block(fz_stext_line *line, fz_stext_block *block)
1879 {
1880 fz_stext_line *next_line = line->next;
1881
1882 if (line->prev)
1883 {
1884 assert(line->prev->next == line);
1885 line->prev->next = next_line;
1886 }
1887 else
1888 {
1889 assert(block->u.t.first_line == line);
1890 block->u.t.first_line = next_line;
1891 }
1892 if (next_line)
1893 {
1894 assert(next_line->prev == line);
1895 next_line->prev = line->prev;
1896 }
1897 else
1898 {
1899 assert(block->u.t.last_line == line);
1900 block->u.t.last_line = line->prev;
1901 }
1902 }
1903
1904 static void
1905 append_line_to_block(fz_stext_line *line, fz_stext_block *block)
1906 {
1907 if (block->u.t.last_line == NULL)
1908 {
1909 assert(block->u.t.first_line == NULL);
1910 block->u.t.first_line = block->u.t.last_line = line;
1911 line->prev = NULL;
1912 }
1913 else
1914 {
1915 assert(block->u.t.last_line->next == NULL);
1916 line->prev = block->u.t.last_line;
1917 block->u.t.last_line->next = line;
1918 block->u.t.last_line = line;
1919 }
1920 line->next = NULL;
1921 }
1922
1923 static void
1924 unlink_block(fz_stext_block *block, fz_stext_block **first, fz_stext_block **last)
1925 {
1926 if (block->prev)
1927 {
1928 assert(block->prev->next == block);
1929 block->prev->next = block->next;
1930 }
1931 else
1932 {
1933 assert(*first == block);
1934 *first = block->next;
1935 }
1936 if (block->next)
1937 {
1938 assert(block->next->prev == block);
1939 block->next->prev = block->prev;
1940 }
1941 else
1942 {
1943 assert(*last == block);
1944 *last = block->prev;
1945 }
1946 }
1947
1948 #ifndef NDEBUG
1949 static int
1950 verify_stext(fz_context *ctx, fz_stext_page *page, fz_stext_struct *src)
1951 {
1952 fz_stext_block *block;
1953 fz_stext_block **first = src ? &src->first_block : &page->first_block;
1954 fz_stext_block **last = src ? &src->last_block : &page->last_block;
1955 int max = 0;
1956
1957 assert((*first == NULL) == (*last == NULL));
1958
1959 for (block = *first; block != NULL; block = block->next)
1960 {
1961 fz_stext_line *line;
1962
1963 if (block->prev == NULL)
1964 assert(*first == block);
1965 else
1966 assert(block->prev->next == block);
1967 if (block->next == NULL)
1968 assert(*last == block);
1969 else
1970 assert(block->next->prev == block);
1971
1972 if (block->type == FZ_STEXT_BLOCK_STRUCT)
1973 {
1974 if (block->u.s.down)
1975 {
1976 int m = verify_stext(ctx, page, block->u.s.down);
1977 if (m > max)
1978 max = m;
1979 }
1980 continue;
1981 }
1982 if (block->type != FZ_STEXT_BLOCK_TEXT)
1983 continue;
1984 assert((block->u.t.first_line == NULL) == (block->u.t.last_line == NULL));
1985 for (line = block->u.t.first_line; line != NULL; line = line->next)
1986 {
1987 fz_stext_char *ch;
1988
1989 if (line->next == NULL)
1990 assert(block->u.t.last_line == line);
1991 else
1992 assert(line->next->prev == line);
1993
1994 assert((line->first_char == NULL) == (line->last_char == NULL));
1995
1996 for (ch = line->first_char; ch != NULL; ch = ch->next)
1997 assert(ch->next != NULL || line->last_char == ch);
1998 }
1999 }
2000
2001 return max+1;
2002 }
2003 #endif
2004
2005 static fz_rect
2006 move_contained_content(fz_context *ctx, fz_stext_page *page, fz_stext_struct *dest, fz_stext_struct *src, fz_rect r)
2007 {
2008 fz_stext_block *before = dest ? dest->first_block : page->first_block;
2009 fz_stext_block **sfirst = src ? &src->first_block : &page->first_block;
2010 fz_stext_block **slast = src ? &src->last_block : &page->last_block;
2011 fz_stext_block *block, *next_block;
2012
2013 for (block = *sfirst; block != NULL; block = next_block)
2014 {
2015 fz_rect bbox = fz_intersect_rect(block->bbox, r);
2016 next_block = block->next;
2017 /* Don't use fz_is_empty_rect here, as that will exclude zero height areas like spaces. */
2018 if (bbox.x0 > bbox.x1 || bbox.y0 > bbox.y1)
2019 continue; /* Trivially excluded */
2020 if (bbox.x0 == block->bbox.x0 && bbox.y0 == block->bbox.y0 && bbox.x1 == block->bbox.x1 && bbox.y1 == block->bbox.y1)
2021 {
2022 /* Trivially included */
2023 if (block->type == FZ_STEXT_BLOCK_STRUCT && block->u.s.down)
2024 {
2025 if (block->u.s.down->standard == FZ_STRUCTURE_TD)
2026 {
2027 /* Don't copy TDs! Just copy the contents */
2028 move_contained_content(ctx, page, dest, block->u.s.down, r);
2029 continue;
2030 }
2031 }
2032 unlink_block(block, sfirst, slast);
2033 insert_block_before(block, before, page, dest);
2034 assert(before == block->next);
2035 continue;
2036 }
2037 if (block->type == FZ_STEXT_BLOCK_STRUCT)
2038 {
2039 if (block->u.s.down)
2040 move_contained_content(ctx, page, dest, block->u.s.down, r);
2041 }
2042 if (block->type == FZ_STEXT_BLOCK_TEXT)
2043 {
2044 /* Partially included text block */
2045 fz_stext_line *line, *next_line;
2046 fz_stext_block *newblock = NULL;
2047
2048 for (line = block->u.t.first_line; line != NULL; line = next_line)
2049 {
2050 fz_rect lrect = fz_intersect_rect(line->bbox, r);
2051 next_line = line->next;
2052
2053 /* Don't use fz_is_empty_rect here, as that will exclude zero height areas like spaces. */
2054 if (lrect.x0 > lrect.x1 || lrect.y0 > lrect.y1)
2055 continue; /* Trivial exclusion */
2056 if (line->bbox.x0 == lrect.x0 && line->bbox.y0 == lrect.y0 && line->bbox.x1 == lrect.x1 && line->bbox.y1 == lrect.y1)
2057 {
2058 /* Trivial inclusion */
2059 if (newblock == NULL)
2060 {
2061 newblock = fz_pool_alloc(ctx, page->pool, sizeof(fz_stext_block));
2062 insert_block_before(newblock, before, page, dest);
2063 newblock->u.t.flags = block->u.t.flags;
2064 assert(before == newblock->next);
2065 }
2066
2067 unlink_line_from_block(line, block);
2068 append_line_to_block(line, newblock);
2069 }
2070 else
2071 {
2072 /* Need to walk the line and just take parts */
2073 fz_stext_line *newline = NULL;
2074 fz_stext_char *ch, *next_ch, *prev_ch = NULL;
2075
2076 for (ch = line->first_char; ch != NULL; ch = next_ch)
2077 {
2078 fz_rect crect = fz_rect_from_quad(ch->quad);
2079 float x = (crect.x0 + crect.x1)/2;
2080 float y = (crect.y0 + crect.y1)/2;
2081 next_ch = ch->next;
2082 if (r.x0 > x || r.x1 < x || r.y0 > y || r.y1 < y)
2083 {
2084 prev_ch = ch;
2085 continue;
2086 }
2087 /* Take this char */
2088 if (newline == NULL)
2089 {
2090 newline = fz_pool_alloc(ctx, page->pool, sizeof(*newline));
2091 newline->dir = line->dir;
2092 newline->wmode = line->wmode;
2093 newline->bbox = fz_empty_rect;
2094 }
2095 /* Unlink char */
2096 if (prev_ch == NULL)
2097 line->first_char = next_ch;
2098 else
2099 prev_ch->next = next_ch;
2100 if (next_ch == NULL)
2101 line->last_char = prev_ch;
2102 /* Relink char */
2103 ch->next = NULL;
2104 if (newline->last_char == NULL)
2105 newline->first_char = ch;
2106 else
2107 newline->last_char->next = ch;
2108 newline->last_char = ch;
2109 newline->bbox = fz_union_rect(newline->bbox, crect);
2110 }
2111 if (line->first_char == NULL)
2112 {
2113 /* We've removed all the chars from this line.
2114 * Better remove the line too. */
2115 if (line->prev)
2116 line->prev->next = next_line;
2117 else
2118 block->u.t.first_line = next_line;
2119 if (next_line)
2120 next_line->prev = line->prev;
2121 else
2122 block->u.t.last_line = line->prev;
2123 }
2124 if (newline)
2125 {
2126 if (newblock == NULL)
2127 {
2128 newblock = fz_pool_alloc(ctx, page->pool, sizeof(fz_stext_block));
2129
2130 /* Add the block onto our target list */
2131 insert_block_before(newblock, before, page, dest);
2132 newblock->u.t.flags = block->u.t.flags;
2133 before = newblock->next;
2134 }
2135 append_line_to_block(newline, newblock);
2136 }
2137 }
2138 }
2139 if (newblock)
2140 {
2141 recalc_bbox(block);
2142 recalc_bbox(newblock);
2143 }
2144 if (block->u.t.first_line == NULL)
2145 {
2146 /* We've removed all the lines from the block. Should remove that too! */
2147 if (block->prev)
2148 {
2149 assert(block->prev->next == block);
2150 block->prev->next = next_block;
2151 }
2152 else
2153 {
2154 assert(*sfirst == block);
2155 *sfirst = block->next;
2156 }
2157 if (next_block)
2158 {
2159 assert(next_block->prev == block);
2160 next_block->prev = block->prev;
2161 }
2162 else
2163 {
2164 assert(*slast == block);
2165 *slast = block->prev;
2166 }
2167 }
2168 }
2169 }
2170
2171 return r;
2172 }
2173
2174 typedef struct
2175 {
2176 fz_stext_block *block;
2177 fz_stext_struct *parent;
2178 } tree_pos;
2179
2180 /* This is still not perfect, but it's better! */
2181 static tree_pos
2182 find_table_insertion_point(fz_context *ctx, fz_rect r, tree_pos current, tree_pos best)
2183 {
2184 fz_stext_block *block;
2185
2186 for (block = current.block; block != NULL; block = block->next)
2187 {
2188 if (block->type == FZ_STEXT_BLOCK_STRUCT)
2189 {
2190 if (block->u.s.down != NULL && fz_is_rect_inside_rect(r, block->bbox))
2191 {
2192 tree_pos down;
2193
2194 down.parent = block->u.s.down;
2195 down.block = block->u.s.down->first_block;
2196 best = find_table_insertion_point(ctx, r, down, best);
2197 }
2198 }
2199 else
2200 {
2201 /* Is block a better precursor than best? (Or a valid precursor, if best.block == NULL) */
2202 if (block->bbox.y1 < r.y0 && (best.block == NULL || best.block->bbox.y1 < block->bbox.y1))
2203 {
2204 best.block = block;
2205 best.parent = current.parent;
2206 }
2207 }
2208 }
2209
2210 return best;
2211 }
2212
2213 static int
2214 tr_is_empty(fz_context *ctx, fz_stext_block *block)
2215 {
2216 for (; block != NULL; block = block->next)
2217 {
2218 if (block->type != FZ_STEXT_BLOCK_STRUCT)
2219 return 0;
2220 if (!block->u.s.down)
2221 {
2222 }
2223 else if (block->u.s.down->standard != FZ_STRUCTURE_TD)
2224 return 0;
2225 else if (block->u.s.down->first_block != NULL)
2226 return 0;
2227 }
2228
2229 return 1;
2230 }
2231
2232 static int
2233 table_is_empty(fz_context *ctx, fz_stext_block *block)
2234 {
2235 for (; block != NULL; block = block->next)
2236 {
2237 if (block->type == FZ_STEXT_BLOCK_GRID)
2238 continue;
2239 if (block->type != FZ_STEXT_BLOCK_STRUCT)
2240 return 0;
2241 if (!block->u.s.down)
2242 {
2243 }
2244 else if (block->u.s.down->standard != FZ_STRUCTURE_TR)
2245 return 0;
2246 else if (!tr_is_empty(ctx, block->u.s.down->first_block))
2247 return 0;
2248 }
2249
2250 return 1;
2251 }
2252
2253 static void
2254 tidy_td_divs(fz_context *ctx, fz_stext_struct *parent)
2255 {
2256 while (1)
2257 {
2258 fz_stext_block *block = parent->first_block;
2259
2260 if (block == NULL || block->next != NULL || block->type != FZ_STEXT_BLOCK_STRUCT || block->u.s.down == NULL || block->u.s.down->standard != FZ_STRUCTURE_DIV)
2261 return;
2262
2263 parent->first_block = block->u.s.down->first_block;
2264 parent->last_block = block->u.s.down->last_block;
2265 }
2266 }
2267
2268 static void
2269 tidy_tr_divs(fz_context *ctx, fz_stext_block *block)
2270 {
2271 for (; block != NULL; block = block->next)
2272 {
2273 if (block->type == FZ_STEXT_BLOCK_STRUCT && block->u.s.down && block->u.s.down->standard == FZ_STRUCTURE_TD)
2274 tidy_td_divs(ctx, block->u.s.down);
2275 }
2276 }
2277
2278 static void
2279 tidy_table_divs(fz_context *ctx, fz_stext_block *block)
2280 {
2281 for (; block != NULL; block = block->next)
2282 {
2283 if (block->type == FZ_STEXT_BLOCK_GRID)
2284 continue;
2285 if (block->type == FZ_STEXT_BLOCK_STRUCT && block->u.s.down && block->u.s.down->standard == FZ_STRUCTURE_TR)
2286 tidy_tr_divs(ctx, block->u.s.down->first_block);
2287 }
2288 }
2289
2290 static int
2291 struct_is_empty(fz_context *ctx, fz_stext_block *block)
2292 {
2293 for (; block != NULL; block = block->next)
2294 {
2295 if (block->type != FZ_STEXT_BLOCK_STRUCT)
2296 return 0;
2297 if (!block->u.s.down)
2298 {
2299 }
2300 else if (!struct_is_empty(ctx, block->u.s.down->first_block))
2301 return 0;
2302 }
2303
2304 return 1;
2305 }
2306
2307 static int
2308 div_is_empty(fz_context *ctx, fz_stext_block *block)
2309 {
2310 for (; block != NULL; block = block->next)
2311 {
2312 if (block->type != FZ_STEXT_BLOCK_STRUCT)
2313 return 0;
2314 if (!block->u.s.down)
2315 {
2316 }
2317 else if (block->u.s.down->standard == FZ_STRUCTURE_TABLE)
2318 {
2319 tidy_table_divs(ctx, block->u.s.down->first_block);
2320 return table_is_empty(ctx, block->u.s.down->first_block);
2321 }
2322 else if (block->u.s.down->standard != FZ_STRUCTURE_DIV)
2323 {
2324 if (!struct_is_empty(ctx, block->u.s.down->first_block))
2325 return 0;
2326 }
2327 else if (!div_is_empty(ctx, block->u.s.down->first_block))
2328 return 0;
2329 }
2330
2331 return 1;
2332 }
2333
2334 static void
2335 tidy_orphaned_tables(fz_context *ctx, fz_stext_page *page, fz_stext_struct *parent)
2336 {
2337 fz_stext_block **first_blockp = parent ? &parent->first_block : &page->first_block;
2338 fz_stext_block **last_blockp = parent ? &parent->last_block : &page->last_block;
2339 fz_stext_block *block, *next_block;
2340
2341 for (block = *first_blockp; block != NULL; block = next_block)
2342 {
2343 next_block = block->next;
2344 if (block->type == FZ_STEXT_BLOCK_STRUCT && block->u.s.down)
2345 {
2346 if (block->u.s.down->standard == FZ_STRUCTURE_TABLE)
2347 {
2348 tidy_table_divs(ctx, block->u.s.down->first_block);
2349 if (table_is_empty(ctx, block->u.s.down->first_block))
2350 {
2351 /* Remove block */
2352 if (block->prev)
2353 {
2354 assert(block->prev->next == block);
2355 block->prev->next = next_block;
2356 }
2357 else
2358 {
2359 assert(*first_blockp == block);
2360 *first_blockp = next_block;
2361 }
2362 if (next_block)
2363 {
2364 assert(next_block->prev == block);
2365 next_block->prev = block->prev;
2366 }
2367 else
2368 {
2369 assert(*last_blockp == block);
2370 *last_blockp = block->prev;
2371 }
2372 }
2373 else
2374 {
2375 tidy_orphaned_tables(ctx, page, block->u.s.down);
2376 }
2377 }
2378 if (block->u.s.down->standard == FZ_STRUCTURE_DIV)
2379 {
2380 tidy_orphaned_tables(ctx, page, block->u.s.down);
2381 if (div_is_empty(ctx, block->u.s.down->first_block))
2382 {
2383 /* Remove block */
2384 if (block->prev)
2385 {
2386 assert(block->prev->next == block);
2387 block->prev->next = next_block;
2388 }
2389 else
2390 {
2391 assert(*first_blockp == block);
2392 *first_blockp = next_block;
2393 }
2394 if (next_block)
2395 {
2396 assert(next_block->prev == block);
2397 next_block->prev = block->prev;
2398 }
2399 else
2400 {
2401 assert(*last_blockp == block);
2402 *last_blockp = block->prev;
2403 }
2404 }
2405 }
2406 }
2407 }
2408 }
2409
2410 static fz_stext_struct *
2411 transcribe_table(fz_context *ctx, grid_walker_data *gd, fz_stext_page *page, fz_stext_struct *parent)
2412 {
2413 int w = gd->xpos->len;
2414 int h = gd->ypos->len;
2415 int x, y;
2416 char *sent_tab = fz_calloc(ctx, 1, w*(size_t)h);
2417 fz_stext_block **first_block = parent ? &parent->first_block : &page->first_block;
2418 fz_stext_struct *table, *tr, *td;
2419 fz_rect r;
2420 tree_pos insert, top;
2421
2422 /* Where should we insert the table in the data? */
2423 r.x0 = gd->xpos->list[0].pos;
2424 r.x1 = gd->xpos->list[w-1].pos;
2425 r.y0 = gd->ypos->list[0].pos;
2426 r.y1 = gd->ypos->list[h-1].pos;
2427 #ifdef DEBUG_TABLE_STRUCTURE
2428 fz_print_stext_page_as_xml(ctx, fz_stddbg(ctx), page, 0);
2429 #endif
2430 top.block = *first_block;
2431 top.parent = parent;
2432 insert.block = NULL;
2433 insert.parent = NULL;
2434 insert = find_table_insertion_point(ctx, r, top, insert);
2435
2436 /* Convert to 'before' */
2437 if (insert.block)
2438 insert.block = insert.block->next;
2439
2440 /* Make table */
2441 table = add_struct_block_before(ctx, insert.block, page, insert.parent, FZ_STRUCTURE_TABLE, "Table");
2442
2443 /* Run through the cells, and guess at spanning. */
2444 for (y = 0; y < h-1; y++)
2445 {
2446 /* Have we sent this entire row before? */
2447 for (x = 0; x < w-1; x++)
2448 {
2449 if (!sent_tab[x+y*w])
2450 break;
2451 }
2452 if (x == w-1)
2453 continue; /* No point in sending a row with nothing in it! */
2454
2455 /* Make TR */
2456 tr = add_struct_block_before(ctx, NULL, page, table, FZ_STRUCTURE_TR, "TR");
2457
2458 for (x = 0; x < w-1; x++)
2459 {
2460 int x2, y2;
2461 int cellw = 1;
2462 int cellh = 1;
2463
2464 /* Have we sent this cell already? */
2465 if (sent_tab[x+y*w])
2466 continue;
2467
2468 /* Find the width of the cell */
2469 for (x2 = x+1; x2 < w-1; x2++)
2470 {
2471 cell_t *cell = get_cell(gd->cells, x2, y);
2472 if (cell->v_line)
2473 break; /* Can't go past a line */
2474 if (gd->xpos->list[x2].uncertainty == 0)
2475 break; /* An uncertainty of 0 is as good as a line. */
2476 if (!cell->v_crossed)
2477 break;
2478 cellw++;
2479 }
2480 /* Find the height of the cell */
2481 for (y2 = y+1; y2 < h-1; y2++)
2482 {
2483 cell_t *cell;
2484 int h_crossed = 0;
2485 if (gd->ypos->list[y2].uncertainty == 0)
2486 break; /* An uncertainty of 0 is as good as a line. */
2487
2488 cell = get_cell(gd->cells, x, y2);
2489 if (cell->h_line)
2490 break; /* Can't extend down through a line. */
2491 if (cell->h_crossed)
2492 h_crossed = 1;
2493 for (x2 = x+1; x2 < x+cellw; x2++)
2494 {
2495 cell = get_cell(gd->cells, x2, y2);
2496 if (cell->h_line)
2497 break;
2498 if (cell->v_line)
2499 break; /* Can't go past a line */
2500 if (gd->xpos->list[x2].uncertainty == 0)
2501 break; /* An uncertainty of 0 is as good as a line. */
2502 if (!cell->v_crossed)
2503 break;
2504 if (cell->h_crossed)
2505 h_crossed = 1;
2506 }
2507 if (x2 == x+cellw && h_crossed)
2508 cellh++;
2509 else
2510 break;
2511 }
2512 /* Make TD */
2513 td = add_struct_block_before(ctx, NULL, page, tr, FZ_STRUCTURE_TD, "TD");
2514 r.x0 = gd->xpos->list[x].pos;
2515 r.x1 = gd->xpos->list[x+cellw].pos;
2516 r.y0 = gd->ypos->list[y].pos;
2517 r.y1 = gd->ypos->list[y+cellh].pos;
2518 /* Use r, not REAL contents bbox, as otherwise spanned rows
2519 * can end up empty. */
2520 td->up->bbox = r;
2521 move_contained_content(ctx, page, td, parent, r);
2522 #ifdef DEBUG_TABLE_STRUCTURE
2523 printf("(%d,%d) + (%d,%d)\n", x, y, cellw, cellh);
2524 #endif
2525 for (y2 = y; y2 < y+cellh; y2++)
2526 for (x2 = x; x2 < x+cellw; x2++)
2527 sent_tab[x2+y2*w] = 1;
2528 }
2529 r.x0 = gd->xpos->list[0].pos;
2530 r.x1 = gd->xpos->list[gd->xpos->len-1].pos;
2531 r.y0 = gd->ypos->list[y].pos;
2532 r.y1 = gd->ypos->list[y+1].pos;
2533 tr->up->bbox = r;
2534 table->up->bbox = fz_union_rect(table->up->bbox, tr->up->bbox);
2535 }
2536 fz_free(ctx, sent_tab);
2537
2538 {
2539 fz_stext_block *block;
2540 fz_stext_grid_positions *xps2 = copy_grid_positions_to_pool(ctx, page, gd->xpos);
2541 fz_stext_grid_positions *yps2 = copy_grid_positions_to_pool(ctx, page, gd->ypos);
2542 block = add_grid_block(ctx, page, &table->first_block, &table->last_block);
2543 block->u.b.xs = xps2;
2544 block->u.b.ys = yps2;
2545 block->bbox.x0 = block->u.b.xs->list[0].pos;
2546 block->bbox.y0 = block->u.b.ys->list[0].pos;
2547 block->bbox.x1 = block->u.b.xs->list[block->u.b.xs->len-1].pos;
2548 block->bbox.y1 = block->u.b.ys->list[block->u.b.ys->len-1].pos;
2549 }
2550 tidy_orphaned_tables(ctx, page, parent);
2551
2552 return table;
2553 }
2554
2555 static void
2556 merge_column(grid_walker_data *gd, int x)
2557 {
2558 int y;
2559 for (y = 0; y < gd->cells->h; y++)
2560 {
2561 cell_t *d = &gd->cells->cell[x + y * (gd->cells->w-1)];
2562 cell_t *s = &gd->cells->cell[x + y * gd->cells->w];
2563
2564 if (x > 0)
2565 memcpy(d-x, s-x, sizeof(*d) * x);
2566 d->full = s[0].full || s[1].full;
2567 d->h_crossed = s[0].h_crossed || s[1].h_crossed;
2568 d->h_line = s[0].h_line; /* == s[1].h_line */
2569 d->v_crossed = s[0].v_crossed;
2570 d->v_line = s[0].v_line;
2571 if (x < gd->cells->w - 2)
2572 memcpy(d+1, s+2, sizeof(*d) * (gd->cells->w - 2 - x));
2573 }
2574 gd->cells->w--;
2575
2576 if (x < gd->xpos->len - 2)
2577 memcpy(&gd->xpos->list[x+1], &gd->xpos->list[x+2], sizeof(gd->xpos->list[0]) * (gd->xpos->len - 2 - x));
2578 gd->xpos->len--;
2579 }
2580
2581 static void
2582 merge_columns(grid_walker_data *gd)
2583 {
2584 int x, y;
2585
2586 for (x = gd->cells->w-3; x >= 0; x--)
2587 {
2588 /* Can column x be merged with column x+1? */
2589 /* An empty column can certainly be merged if the h_lines are the same,
2590 * and there is no v_line. */
2591 for (y = 0; y < gd->cells->h-1; y++)
2592 {
2593 cell_t *a = get_cell(gd->cells, x, y);
2594 cell_t *b = get_cell(gd->cells, x+1, y);
2595 if (a->full || !!a->h_line != !!b->h_line || b->v_line)
2596 break;
2597 }
2598 if (y == gd->cells->h-1)
2599 goto merge_column;
2600 /* An empty column can certainly be merged if the h_lines are the same. */
2601 for (y = 0; y < gd->cells->h-1; y++)
2602 {
2603 cell_t *a = get_cell(gd->cells, x, y);
2604 cell_t *b = get_cell(gd->cells, x+1, y);
2605 if (b->full || !!a->h_line != !!b->h_line || b->v_line)
2606 break;
2607 }
2608 if (y == gd->cells->h-1)
2609 goto merge_column;
2610 /* We only ever want to merge columns if content crossed between them somewhere.
2611 * Don't use uncertainty for this, because uncertainty doesn't allow for
2612 * whitespace. */
2613 for (y = 0; y < gd->cells->h-1; y++)
2614 if (get_cell(gd->cells, x+1, y)->v_crossed == 1)
2615 break;
2616 if (y == gd->cells->h-1)
2617 continue;
2618 /* This requires all the pairs of cells in those 2 columns to be mergeable. */
2619 for (y = 0; y < gd->cells->h-1; y++)
2620 {
2621 cell_t *a = get_cell(gd->cells, x, y);
2622 cell_t *b = get_cell(gd->cells, x+1, y);
2623 /* If there is a divider, we can't merge. */
2624 if (b->v_line)
2625 break;
2626 /* If either is empty, we can merge. */
2627 if (!a->full || !b->full)
2628 continue;
2629 /* If we differ in h linedness, we can't merge */
2630 if (!!a->h_line != !!b->h_line)
2631 break;
2632 /* If both are full, we can only merge if we cross. */
2633 if (a->full && b->full && b->v_crossed)
2634 continue;
2635 /* Otherwise we can't merge */
2636 break;
2637 }
2638 if (y == gd->cells->h-1)
2639 {
2640 /* Merge the column! */
2641 merge_column:
2642 #ifdef DEBUG_TABLE_STRUCTURE
2643 printf("Merging column %d\n", x);
2644 #endif
2645 merge_column(gd, x);
2646 #ifdef DEBUG_TABLE_STRUCTURE
2647 asciiart_table(gd);
2648 #endif
2649 }
2650 }
2651 }
2652
2653 static void
2654 merge_row(grid_walker_data *gd, int y)
2655 {
2656 int x;
2657 int w = gd->cells->w;
2658 cell_t *d = &gd->cells->cell[y * w];
2659 for (x = 0; x < gd->cells->w-1; x++)
2660 {
2661 if (d->full == 0)
2662 d->full = d[w].full;
2663 if (d->v_crossed == 0)
2664 d->v_crossed = d[w].v_crossed;
2665 d++;
2666 }
2667 if (y < gd->cells->h - 2)
2668 memcpy(d, d+w, sizeof(*d) * (gd->cells->h - 2 - y) * w);
2669 gd->cells->h--;
2670
2671 if (y < gd->ypos->len - 2)
2672 memcpy(&gd->ypos->list[y+1], &gd->ypos->list[y+2], sizeof(gd->ypos->list[0]) * (gd->ypos->len - 2 - y));
2673 gd->ypos->len--;
2674 }
2675
2676 static void
2677 merge_rows(grid_walker_data *gd)
2678 {
2679 int x, y;
2680
2681 for (y = gd->cells->h-3; y >= 0; y--)
2682 {
2683 /* Can row y be merged with row y+1? */
2684 /* An empty row can certainly be merged if the v_lines are the same,
2685 * and there is no h_line. */
2686 for (x = 0; x < gd->cells->w-1; x++)
2687 {
2688 cell_t *a = get_cell(gd->cells, x, y);
2689 cell_t *b = get_cell(gd->cells, x, y+1);
2690 if (a->full || !!a->v_line != !!b->v_line || b->h_line)
2691 break;
2692 }
2693 if (x == gd->cells->w-1)
2694 goto merge_row;
2695 /* An empty row can certainly be merged if the v_lines are the same. */
2696 for (x = 0; x < gd->cells->w-1; x++)
2697 {
2698 cell_t *a = get_cell(gd->cells, x, y);
2699 cell_t *b = get_cell(gd->cells, x, y+1);
2700 if (b->full || !!a->v_line != !!b->v_line || b->h_line)
2701 break;
2702 }
2703 if (x == gd->cells->w-1)
2704 goto merge_row;
2705 /* We only ever want to merge rows if content crossed between them somewhere.
2706 * Don't use uncertainty for this, because uncertainty doesn't allow for
2707 * whitespace. */
2708 for (x = 0; x < gd->cells->w-1; x++)
2709 if (get_cell(gd->cells, x, y+1)->h_crossed == 1)
2710 break;
2711 if (x == gd->cells->w-1)
2712 continue;
2713 /* This requires all the pairs of cells in those 2 rows to be mergeable. */
2714 for (x = 0; x < gd->cells->w-1; x++)
2715 {
2716 cell_t *a = get_cell(gd->cells, x, y);
2717 cell_t *b = get_cell(gd->cells, x, y+1);
2718 /* If there is a divider, we can't merge. */
2719 if (b->h_line)
2720 goto cant_merge;
2721 /* If either is empty, we can merge. */
2722 if (!a->full || !b->full)
2723 continue;
2724 /* If we differ in v linedness, we can't merge */
2725 if (!!a->v_line != !!b->v_line)
2726 goto cant_merge;
2727 /* If both are full, we can only merge if we cross. */
2728 if (a->full && b->full && b->h_crossed)
2729 continue;
2730 /* Otherwise we can't merge */
2731 break;
2732 }
2733 if (x == gd->cells->w-1)
2734 {
2735 /* Merge the row! */
2736 merge_row:
2737 #ifdef DEBUG_TABLE_STRUCTURE
2738 printf("Merging row %d\n", y);
2739 #endif
2740 merge_row(gd, y);
2741 #ifdef DEBUG_TABLE_STRUCTURE
2742 asciiart_table(gd);
2743 #endif
2744 continue;
2745 }
2746
2747 /* OK, so we failed to be able to merge y and y+1. But if we get here, we
2748 * know this wasn't because of any lines being in the way. So we can try
2749 * a different set of criteria. If every non-empty cell in line y+1 is either
2750 * into from line y, or crosses into line y+2, then it's probably a 'straddling'
2751 * line. Just remove it. */
2752 if (y+2 >= gd->cells->h)
2753 continue;
2754 for (x = 0; x < gd->cells->w-1; x++)
2755 {
2756 cell_t *b = get_cell(gd->cells, x, y+1);
2757 cell_t *c = get_cell(gd->cells, x, y+2);
2758 if (!b->full)
2759 continue;
2760 if (!b->h_crossed && !c->h_crossed)
2761 break;
2762 }
2763 if (x == gd->cells->w-1)
2764 {
2765 /* Merge the row! */
2766 #ifdef DEBUG_TABLE_STRUCTURE
2767 printf("Merging row %d (case 2)\n", y);
2768 #endif
2769 merge_row(gd, y);
2770 #ifdef DEBUG_TABLE_STRUCTURE
2771 asciiart_table(gd);
2772 #endif
2773 }
2774
2775
2776 cant_merge:
2777 {}
2778 }
2779 }
2780
2781 static int
2782 remove_bordered_empty_cells(grid_walker_data *gd)
2783 {
2784 int x, y, x1, y1, x2, y2;
2785 int changed = 0;
2786
2787 /* We are looking for a region of cells that's bordered,
2788 * and empty, and where the borders don't extend...
2789 *
2790 * ' '
2791 * ' '
2792 * . . .'____'. . .
2793 * | |
2794 * | |
2795 * . . .|____|. . .
2796 * ' '
2797 * ' '
2798 * ' '
2799 */
2800
2801 for (y = 0; y < gd->cells->h-1; y++)
2802 {
2803 for (x = 0; x < gd->cells->w-1; x++)
2804 {
2805 cell_t *u = y > 0 ? get_cell(gd->cells, x, y-1) : NULL;
2806 cell_t *l = x > 0 ? get_cell(gd->cells, x-1, y) : NULL;
2807 cell_t *c = get_cell(gd->cells, x, y);
2808
2809 if (c->full)
2810 continue;
2811 if (!c->h_line || !c->v_line)
2812 continue;
2813 if (l != NULL && l->h_line)
2814 continue;
2815 if (u != NULL && u->v_line)
2816 continue;
2817 /* So c (x, y) is a reasonable top left of the bounded region. */
2818 for (x1 = x+1; x1 < gd->cells->w; x1++)
2819 {
2820 u = y > 0 ? get_cell(gd->cells, x1, y-1) : NULL;
2821 c = get_cell(gd->cells, x1, y);
2822
2823 if (u != NULL && u->v_line)
2824 goto bad_region;
2825 if (c->h_line && !c->v_line)
2826 continue;
2827 if (c->h_line || !c->v_line)
2828 goto bad_region;
2829 break;
2830 }
2831 if (x1 == gd->cells->w)
2832 goto bad_region;
2833 /* If we get here, then we have the top row of a bounded region
2834 * that runs from (x,y) to (x1-1, y). So, can we extend that region
2835 * downwards? */
2836 for (y1 = y+1; y1 < gd->cells->h; y1++)
2837 {
2838 c = get_cell(gd->cells, x, y1);
2839
2840 if (!c->h_line && c->v_line)
2841 {
2842 /* This could be the start of a line. */
2843 for (x2 = x+1; x2 < x1; x2++)
2844 {
2845 c = get_cell(gd->cells, x2, y1);
2846 if (c->full || c->h_line || c->v_line)
2847 goto bad_region;
2848 }
2849 c = get_cell(gd->cells, x1, y1);
2850 if (c->h_line || !c->v_line)
2851 goto bad_region;
2852 /* That line was fine! Region is extended. */
2853 }
2854 else if (c->h_line && !c->v_line)
2855 {
2856 /* This might be the bottom line of the bounded region. */
2857 for (x2 = x+1; x2 < x1; x2++)
2858 {
2859 c = get_cell(gd->cells, x2, y1);
2860 if (!c->h_line || c->v_line)
2861 goto bad_region;
2862 }
2863 c = get_cell(gd->cells, x1, y1);
2864 if (c->h_line || c->v_line)
2865 goto bad_region;
2866 break;
2867 }
2868 else
2869 goto bad_region;
2870 }
2871 if (y1 == gd->cells->h)
2872 goto bad_region;
2873 /* So we have a bounded region from x,y to x1-1,y1-1 */
2874 for (y2 = y; y2 < y1; y2++)
2875 {
2876 for (x2 = x; x2 < x1; x2++)
2877 {
2878 c = get_cell(gd->cells, x2, y2);
2879 c->h_line = 0;
2880 c->v_line = 0;
2881 c->full = 1;
2882 if (x2 > x)
2883 c->v_crossed = 1;
2884 if (y2 > y)
2885 c->h_crossed = 1;
2886 }
2887 c = get_cell(gd->cells, x2, y2);
2888 c->v_line = 0;
2889 }
2890 for (x2 = x; x2 < x1; x2++)
2891 get_cell(gd->cells, x2, y2)->h_line = 0;
2892 changed = 1;
2893 bad_region:
2894 {}
2895 }
2896 }
2897
2898 #ifdef DEBUG_TABLE_STRUCTURE
2899 if (changed)
2900 {
2901 printf("Simplified empty bounded cells to give:\n");
2902 asciiart_table(gd);
2903 }
2904 #endif
2905
2906 return changed;
2907 }
2908
2909 static int
2910 find_table_within_bounds(fz_context *ctx, grid_walker_data *gd, fz_stext_block *content, fz_rect bounds)
2911 {
2912 div_list xs = { 0 };
2913 div_list ys = { 0 };
2914 int failed = 1;
2915
2916 fz_try(ctx)
2917 {
2918 walk_to_find_content(ctx, &xs, &ys, content, bounds);
2919
2920 sanitize_positions(ctx, &xs);
2921 sanitize_positions(ctx, &ys);
2922
2923 /* Run across the line, counting 'winding' */
2924 /* If we don't have at least 2 rows and 2 columns, give up. */
2925 if (xs.len <= 2 || ys.len <= 2)
2926 break;
2927
2928 gd->xpos = make_table_positions(ctx, &xs, bounds.x0, bounds.x1);
2929 gd->ypos = make_table_positions(ctx, &ys, bounds.y0, bounds.y1);
2930 gd->cells = new_cells(ctx, gd->xpos->len, gd->ypos->len);
2931
2932 /* Walk the content looking for grid lines. These
2933 * lines refine our positions. */
2934 walk_grid_lines(ctx, gd, content);
2935 /* Now, we walk the content looking for content that crosses
2936 * these grid lines. This allows us to spot spanned cells. */
2937 if (calculate_spanned_content(ctx, gd, content))
2938 break; /* Unlikely to be a table. */
2939
2940 #ifdef DEBUG_TABLE_STRUCTURE
2941 asciiart_table(gd);
2942 #endif
2943 /* Now, can we remove some columns or rows? i.e. have we oversegmented? */
2944 do
2945 {
2946 merge_columns(gd);
2947 merge_rows(gd);
2948 }
2949 while (remove_bordered_empty_cells(gd));
2950
2951 /* Did we shrink the table so much it's not a table any more? */
2952 if (gd->xpos->len <= 2 || gd->ypos->len <= 2)
2953 break;
2954
2955 failed = 0;
2956 }
2957 fz_always(ctx)
2958 {
2959 fz_free(ctx, xs.list);
2960 fz_free(ctx, ys.list);
2961 }
2962 fz_catch(ctx)
2963 {
2964 fz_rethrow(ctx);
2965 }
2966
2967 return failed;
2968 }
2969
2970 static int
2971 do_table_hunt(fz_context *ctx, fz_stext_page *page, fz_stext_struct *parent)
2972 {
2973 fz_stext_block *block;
2974 int count;
2975 fz_stext_block **first_block = parent ? &parent->first_block : &page->first_block;
2976 int num_subtables = 0;
2977 grid_walker_data gd = { 0 };
2978
2979 /* No content? Just bale. */
2980 if (*first_block == NULL)
2981 return 0;
2982
2983 /* If all the content here looks like a column of text, don't
2984 * hunt for a table within it. */
2985 if (all_blocks_are_justified_or_headers(ctx, *first_block))
2986 return num_subtables;
2987
2988 gd.bounds = fz_infinite_rect;
2989
2990 /* First off, descend into any children to see if those look like tables. */
2991 count = 0;
2992 for (block = *first_block; block != NULL; block = block->next)
2993 {
2994 if (block->type == FZ_STEXT_BLOCK_STRUCT)
2995 {
2996 if (block->u.s.down)
2997 {
2998 num_subtables += do_table_hunt(ctx, page, block->u.s.down);
2999 count++;
3000 }
3001 }
3002 else if (block->type == FZ_STEXT_BLOCK_TEXT)
3003 count++;
3004 }
3005
3006 /* If we don't have at least a single child, no more to hunt. */
3007 /* We only need a single block, because a single text block can
3008 * contain an entire unbordered table. */
3009 if (count < 1)
3010 return num_subtables;
3011
3012 /* We only look for a table at this level if we either at the top
3013 * or on a div. This saves us looking for tables within an 'H'
3014 * for example. */
3015 if (parent != NULL && parent->standard != FZ_STRUCTURE_DIV)
3016 return num_subtables;
3017
3018
3019 fz_var(gd);
3020
3021 fz_try(ctx)
3022 {
3023 /* Now see whether the content looks like tables. */
3024 fz_rect bounds = walk_to_find_bounds(ctx, *first_block);
3025
3026 if (find_table_within_bounds(ctx, &gd, *first_block, bounds))
3027 break;
3028
3029 if (num_subtables > 0)
3030 {
3031 /* We are risking throwing away a table we've already found for this
3032 * one. Only do it if this one is really convincing. */
3033 int x, y;
3034 for (x = 0; x < gd.xpos->len; x++)
3035 if (gd.xpos->list[x].uncertainty != 0)
3036 break;
3037 if (x != gd.xpos->len)
3038 break;
3039 for (y = 0; y < gd.ypos->len; y++)
3040 if (gd.ypos->list[y].uncertainty != 0)
3041 break;
3042 if (y != gd.ypos->len)
3043 break;
3044 }
3045
3046 #ifdef DEBUG_TABLE_STRUCTURE
3047 printf("Transcribing table: (%g,%g)->(%g,%g)\n",
3048 gd.xpos->list[0].pos,
3049 gd.ypos->list[0].pos,
3050 gd.xpos->list[gd.xpos->len-1].pos,
3051 gd.ypos->list[gd.ypos->len-1].pos);
3052 #endif
3053
3054 /* Now we should have the entire table calculated. */
3055 (void)transcribe_table(ctx, &gd, page, parent);
3056 num_subtables = 1;
3057 #ifdef DEBUG_WRITE_AS_PS
3058 {
3059 int i;
3060 printf("%% TABLE\n");
3061 for (i = 0; i < block->u.b.xs->len; i++)
3062 {
3063 if (block->u.b.xs->list[i].uncertainty)
3064 printf("0 1 0 setrgbcolor\n");
3065 else
3066 printf("0 0.5 0 setrgbcolor\n");
3067 printf("%g %g moveto %g %g lineto stroke\n",
3068 block->u.b.xs->list[i].pos, block->bbox.y0,
3069 block->u.b.xs->list[i].pos, block->bbox.y1);
3070 }
3071 for (i = 0; i < block->u.b.ys->len; i++)
3072 {
3073 if (block->u.b.ys->list[i].uncertainty)
3074 printf("0 1 0 setrgbcolor\n");
3075 else
3076 printf("0 0.5 0 setrgbcolor\n");
3077 printf("%g %g moveto %g %g lineto stroke\n",
3078 block->bbox.x0, block->u.b.ys->list[i].pos,
3079 block->bbox.x1, block->u.b.ys->list[i].pos);
3080 }
3081 }
3082 #endif
3083 }
3084 fz_always(ctx)
3085 {
3086 fz_free(ctx, gd.xpos);
3087 fz_free(ctx, gd.ypos);
3088 fz_free(ctx, gd.cells);
3089 }
3090 fz_catch(ctx)
3091 fz_rethrow(ctx);
3092
3093 return num_subtables;
3094 }
3095
3096 void
3097 fz_table_hunt(fz_context *ctx, fz_stext_page *page)
3098 {
3099 if (page == NULL)
3100 return;
3101
3102 assert(verify_stext(ctx, page, NULL));
3103
3104 do_table_hunt(ctx, page, NULL);
3105
3106 assert(verify_stext(ctx, page, NULL));
3107 }
3108
3109 fz_stext_block *
3110 fz_find_table_within_bounds(fz_context *ctx, fz_stext_page *page, fz_rect bounds)
3111 {
3112 fz_stext_struct *table = NULL;
3113 grid_walker_data gd = { 0 };
3114
3115 /* No content? Just bale. */
3116 if (page == NULL || page->first_block == NULL)
3117 return NULL;
3118
3119 fz_var(gd);
3120
3121 fz_try(ctx)
3122 {
3123 gd.bounds = bounds;
3124 if (find_table_within_bounds(ctx, &gd, page->first_block, bounds))
3125 break;
3126
3127 #ifdef DEBUG_TABLE_STRUCTURE
3128 printf("Transcribing table: (%g,%g)->(%g,%g)\n",
3129 gd.xpos->list[0].pos,
3130 gd.ypos->list[0].pos,
3131 gd.xpos->list[gd.xpos->len-1].pos,
3132 gd.ypos->list[gd.ypos->len-1].pos);
3133 #endif
3134
3135 /* Now we should have the entire table calculated. */
3136 table = transcribe_table(ctx, &gd, page, NULL);
3137 #ifdef DEBUG_WRITE_AS_PS
3138 {
3139 int i;
3140 printf("%% TABLE\n");
3141 for (i = 0; i < block->u.b.xs->len; i++)
3142 {
3143 if (block->u.b.xs->list[i].uncertainty)
3144 printf("0 1 0 setrgbcolor\n");
3145 else
3146 printf("0 0.5 0 setrgbcolor\n");
3147 printf("%g %g moveto %g %g lineto stroke\n",
3148 block->u.b.xs->list[i].pos, block->bbox.y0,
3149 block->u.b.xs->list[i].pos, block->bbox.y1);
3150 }
3151 for (i = 0; i < block->u.b.ys->len; i++)
3152 {
3153 if (block->u.b.ys->list[i].uncertainty)
3154 printf("0 1 0 setrgbcolor\n");
3155 else
3156 printf("0 0.5 0 setrgbcolor\n");
3157 printf("%g %g moveto %g %g lineto stroke\n",
3158 block->bbox.x0, block->u.b.ys->list[i].pos,
3159 block->bbox.x1, block->u.b.ys->list[i].pos);
3160 }
3161 }
3162 #endif
3163 }
3164 fz_always(ctx)
3165 {
3166 fz_free(ctx, gd.xpos);
3167 fz_free(ctx, gd.ypos);
3168 fz_free(ctx, gd.cells);
3169 }
3170 fz_catch(ctx)
3171 fz_rethrow(ctx);
3172
3173 return table ? table->first_block : NULL;
3174 }