Mercurial > hgrepos > Python2 > PyMuPDF
diff mupdf-source/source/fitz/stext-table.c @ 2:b50eed0cc0ef upstream
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
The directory name has changed: no version number in the expanded directory now.
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Mon, 15 Sep 2025 11:43:07 +0200 |
| parents | |
| children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/mupdf-source/source/fitz/stext-table.c Mon Sep 15 11:43:07 2025 +0200 @@ -0,0 +1,3174 @@ +// Copyright (C) 2025 Artifex Software, Inc. +// +// This file is part of MuPDF. +// +// MuPDF is free software: you can redistribute it and/or modify it under the +// terms of the GNU Affero General Public License as published by the Free +// Software Foundation, either version 3 of the License, or (at your option) +// any later version. +// +// MuPDF is distributed in the hope that it will be useful, but WITHOUT ANY +// WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +// FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more +// details. +// +// You should have received a copy of the GNU Affero General Public License +// along with MuPDF. If not, see <https://www.gnu.org/licenses/agpl-3.0.en.html> +// +// Alternative licensing terms are available from the licensor. +// For commercial licensing, see <https://www.artifex.com/> or contact +// Artifex Software, Inc., 39 Mesa Street, Suite 108A, San Francisco, +// CA 94129, USA, for further information. + +#include "mupdf/fitz.h" + +#include <assert.h> + +/* #define DEBUG_WRITE_AS_PS */ + +/* #define DEBUG_TABLE_STRUCTURE */ + +/* #define DEBUG_TABLE_HUNT */ + +/* + * The algorithm. + * + * The goal of the algorithm is to identify tables on a page. + * First we have to find the tables on a page, then we have to + * figure out where the columns/rows are, and then how the + * cells span them. + * + * We do this as a series of steps. + * + * To illustrate what's going on, let's use an example page + * that we can follow through all the steps. + * + * +---------------------------+ + * | | + * | #### ## ### ## | <- Title + * | | + * | ##### ##### #### ## | \ + * | ## ###### ###### ## | | + * | #### ####### ###### | |- Abstract + * | ####### #### ## ### | | + * | ### ##### ###### | / + * | | + * | ######### ######### | 2 Columns of text + * | ######### ######### | + * | ######## ######### | + * | ######### | + * | +-------+ ####### | <- With an image on the left + * | | | | + * | | | ## ## # # | <- And a table on the right + * | +-------+ ## ## # # | + * | ## ## # # | + * | ######### ## ## # # | + * | ######### ## ## # # | + * | ######### ## ## # # | + * | | + * +---------------------------+ + * + * + * Step 1: Segmentation. + * + * First, we segment the page, trying to break it down into a + * series of non-overlapping rectangles. We do this (in stext-boxer.c) + * by looking for where the content isn't. If we can identify breaks + * that run through the page (either from top to bottom or from left + * to right), then we can split the page there, and recursively consider + * the two halves of the page. + * + * It's not a perfect algorithm, but it manages to in many cases. + * + * After segmenting the above example, first we'll find the horizontal + * splits, giving: + * + * +---------------------------+ + * | | + * | #### ## ### ## | + * +---------------------------+ + * | ##### ##### #### ## | + * | ## ###### ###### ## | + * | #### ####### ###### | + * | ####### #### ## ### | + * | ### ##### ###### | + * +---------------------------+ + * | ######### ######### | + * | ######### ######### | + * | ######## ######### | + * | ######### | + * | +-------+ ####### | + * | | | | + * | | | ## ## # # | + * | +-------+ ## ## # # | + * | ## ## # # | + * | ######### ## ## # # | + * | ######### ## ## # # | + * | ######### ## ## # # | + * | | + * +---------------------------+ + * + * Then we'll recurse and find the vertical split between + * the columns: + * + * +---------------------------+ + * | | + * | #### ## ### ## | + * +---------------------------+ + * | ##### ##### #### ## | + * | ## ###### ###### ## | + * | #### ####### ###### | + * | ####### #### ## ### | + * | ### ##### ###### | + * +-------------+-------------+ + * | ######### | ######### | + * | ######### | ######### | + * | ######## | ######### | + * | | ######### | + * | +-------+ | ####### | + * | | | | | + * | | | | ## ## # # | + * | +-------+ | ## ## # # | + * | | ## ## # # | + * | ######### | ## ## # # | + * | ######### | ## ## # # | + * | ######### | ## ## # # | + * | | | + * +-------------+-------------+ + * + * Then we recurse again and find the horizontal splits + * within the columns: + * + * +---------------------------+ + * | | + * | #### ## ### ## | + * +---------------------------+ + * | ##### ##### #### ## | + * | ## ###### ###### ## | + * | #### ####### ###### | + * | ####### #### ## ### | + * | ### ##### ###### | + * +-------------+-------------+ + * | ######### | ######### | + * | ######### | ######### | + * | ######## | ######### | + * +-------------+ ######### | + * | +-------+ | ####### | + * | | | +-------------+ + * | | | | ## ## # # | + * | +-------+ | ## ## # # | + * +-------------+ ## ## # # | + * | ######### | ## ## # # | + * | ######### | ## ## # # | + * | ######### | ## ## # # | + * | | | + * +-------------+-------------+ + * + * We recurse a fixed maximum number of times (currently + * 6, IIRC) or until we fail to find any suitable splits. + * + * This completes the page segmentation step. + * + * Step 2: Grid finding + * + * Next, we look at each of those segments and try to identify + * where grids might be. + * + * Imagine the bottom right section of that page as + * a board with lego blocks on where there is text. + * Now imagine viewing that from the bottom of the page. + * The gaps between the columns of the table are where you + * can see through to the top between the blocks. + * + * Similarly, if you view it from the side, the gaps between the + * rows of the page are where you can see through to the other + * side. + * + * So, how do we code that? Well, we run through the page content + * (obviously, restricted to the content that falls into this + * segment of the page - that'll go without saying from here on + * in). For each bit of content, we look at the "x extent" of that + * content - for instance a given string might start at position + * 10 and continue to position 100. We build a list of all these + * start, and stop positions, and keep them in a sorted list. + * + * Then we walk this list from left to right, keeping a sum. I + * call this sum "wind", because it's very similar to the winding + * number that you get when doing scan conversion of bezier shapes. + * + * wind starts out as 0. We increment it whenever we pass a 'start' + * position, and decrement it whenever we pass a 'stop' position. + * So at any given x position along the line wind tells us the + * number of pieces of content that overlap that x position. + * So wind(left) = 0 = wind(right), and wind(x) >= x for all x. + * + * So, if we walk from left to right, the trace of wind might + * look something like: + * + * __ + * ___ / \ _ __ + * / \ / \/ \ _/ \_ + * / \___/ \___/ \ + * + * The left and right edges of the table are pretty clear. + * The regions where wind drops to 0 represent the column dividers. + * The left and right hand side of those regions gives us the min + * and max values for that divider. + * + * We can then repeat this process for Y ranges instead of X ranges + * to get the row dividers. + * + * BUT, this only works for pure grid tables. It falls down for + * cases where we have merged cells (which is very common due to + * titles etc). + * + * We can modify the algorithm slightly to allow for this. + * + * Consider the following table: + * + * +-----------------------------------+ + * | Long Table title across the top | + * +---------------+---------+---------+ + * | Name | Result1 | Result2 | + * +---------------+----+----+----+----+ + * | Homer Simpson | 1 | 23 | 4 | 56 | + * | Barney Gumble | 1 | 23 | 4 | 56 | + * | Moe | 1 | 23 | 4 | 56 | + * | Apu | 1 | 23 | 4 | 56 | + * | Ned Flanders | 1 | 23 | 4 | 56 | + * +---------------+----+----+----+----+ + * + * The wind trace for that looks something like + * (with a certain degree of artistic license for the + * limitations of ascii art): + * + * ________ + * / \ _ __ _ _ + * / \____/ \_/ \___/ \_/ \ + * / \ + * + * So, the trace never quite drops back to zero in the + * middle due to the spanning of the top title. + * + * So, instead of just looking for points where the trace + * drops to zero, we instead look for local minima. Each local + * minima represents a place where there might be a grid divider. + * The value of wind at such points can be considered the + * "uncertainty" with which there might be a divider there. + * Clear dividers (with a wind value of 0) have no uncertainty. + * Places where cells are spanned have a higher value of uncertainty. + * + * The output from this step is the list of possible grid positions + * (X and Y), with uncertainty values. + * + * We have skipped over the handling of spaces in the above + * description. On the one hand, we'd quite like to treat + * sentences as long unbroken regions of content. But sometimes + * we can get awkward content like: + * + * P subtelomeric 24.38 8.15 11.31 0.94 1.46 + * 11.08 6.93 15.34 0.00 0.73 + * Pericentromeric region; C band 20.42 9.26 13.64 0.81 0.81 + * 16.50 7.03 14.45 0.17 5.15 + + * where the content is frequently sent with spaces instead of + * small gaps (particular in tables of digits, because numerals + * are often the same width even in proportional fonts). + * + * To cope with this, as we send potential edges into the start/stop + * positions, we send 'weak' edges for the start and stops of runs + * of spaces. We then post process the edges to remove any local + * minima regions in the 'wind' values that are bounded purely by + * 'weak' edges. + * + * Step 3: Cell analysis + * + * So, armed with the output from step 2, we can examine each grid + * found. If we have W x-dividers and H y-dividers, we know we have + * a potential table with (W-1) x (H-1) cells in it. + * + * We represent this as a W x H grid of cells, each like: + * + * . . + * . . + * . . .+-------+. . . Each cell holds information about the + * | . edges above, and to the left of it. + * | . + * | . + * . . .+. . . .+. . . + * . . + * . . + * + * h_line: Is there a horizontal divider drawn on the page that + * corresponds to the top of this cell (i.e. is there a cell border + * here?) + * v_line: Is there a vertical divider drawn on the page that + * corresponds to the left of this cell (i.e. is there a cell border + * here?) + * h_crossed: Does content cross this line (i.e. are we merged + * with the cell above?) + * v_crossed: Does content cross this line (i.e. are we merged + * with the cell to the left?) + * full: Is there any content in this cell at all? + * + * We need a W x H grid of cells to represent the entire table due + * to the potential right and bottom edge lines. The right and + * bottom rows of cells should never be full, or be crossed, but + * it's easiest just to use a simple representation that copes with + * the h_line and v_line values naturally. + * + * So, we start with the cells structure empty, and we run through + * the page content, filling in the details as we go. + * + * At the end of the process, we have enough information to draw + * an asciiart representation of our table. It might look something + * like this (this comes from dotted-gridlines-tables.pdf): + * + * +-+-+-+-+-+-+-+-+-+-+-+-+-+ + * | | | | | |#|#|#|#| | | | + * + + + + + + +v+ +v+v+ + + + + * | | | | | |#|#|#|#| | | | + * + + + + + + + +v+ + + + + + + * | |#| | |#|#|#|#|#|#|#|#| + * + +v+ + + +v+v+ +v+v+v+v+v+ + * | |#| |#|#|#|#|#|#|#|#|#| + * + + + + +v+ + +v+ + + + + + + * |#|#| #|#|#|#|#|#|#|#|#|#| + * +v+v+ +v+ +v+v+ +v+v+v+v+v+ + * |#|#| #|#|#|#|#|#|#|#|#|#| + * + + + + +v+ + +v+ + + + + + + * | |#| |#|#|#|#|#|#|#|#|#| + * + +v+ + + +v+v+ +v+v+v+v+v+ + * | |#| | |#|#|#|#|#|#|#|#| + * + + + + + + + +v+ + + + + + + * | | | | | |#|#|#|#| | | | + * + + + + + + +v+ +v+v+ + + + + * | | | | | |#|#|#|#| | | | + * +-+-+-+-+-+-+-+-+-+-+-+-+-+ + * | |# |#| | |#| | |#|#|#| + * + +-+-+-+-+-+-+-+-+-+-+-+-+ + * | |# |#| | |#| | |#|#|#| + * + +-+-+-+-+-+-+-+-+-+-+-+-+ + * | |#>#|#| | |#| | |#|#|#| + * + +-+-+-+-+-+-+-+-+-+-+-+-+ + * | |#>#|#| | |#| | |#|#|#| + * + +-+-+-+-+-+-+-+-+-+-+-+-+ + * | |#>#|#| |#| | | |#|#|#| + * + +-+-+-+-+-+-+-+-+-+-+-+-+ + * | |# |#| | |#| | |#|#|#| + * + +-+-+-+-+-+-+-+-+-+-+-+-+ + * | |# |#| | |#| | |#|#|#| + * + +-+-+-+-+-+-+-+-+-+-+-+-+ + * | |# |#| | |#| | |#|#|#| + * + +-+-+-+-+-+-+-+-+-+-+-+-+ + * | |# |#| | |#| | |#|#|#| + * + +-+-+-+-+-+-+-+-+-+-+-+-+ + * |# |#|#|#|#|#|#|#|#|#| + * + +-+-+-+-+-+-+-+-+-+-+-+-+ + * + * This shows where lines are detected ( - and | ), + * where they are crossed ( > and v) and where cells + * are full ( # ). + * + * Step 4: Row and column merging. + * + * Based on the information above, we then try to merge + * cells and columns to simplify the table. + * + * The best rules I've come up with this so far are: + * We can merge two adjacent columns if all the pairs of + * cells in the two columns are mergeable. + * + * Cells are held to be mergeable or not based upon the following + * rules: + * If there is a line between 2 cells - not mergeable. + * else if the uncertainty between 2 cells is 0 - not mergeable. + * else if the line between the 2 cells is crossed - mergeable. + * else if strictly one of the cells is full - mergeable. + * else not mergeable. + * + * So in the above example, column 2 (numbered from 0) can be merged + * with column 3. + * + * This gives: + * + * +-+-+-+-+-+-+-+-+-+-+-+-+ + * | | | | | |#|#|#|#| | | | + * + + + + + +v+ +v+v+ + + + + * | | | | | |#|#|#|#| | | | + * + + + + + + +v+ + + + + + + * | |#| | |#|#|#|#|#|#|#|#| + * + +v+ + +v+v+ +v+v+v+v+v+ + * | |#| |#|#|#|#|#|#|#|#|#| + * + + + +v+ + +v+ + + + + + + * |#|#|#|#|#|#|#|#|#|#|#|#| + * +v+v+v+ +v+v+ +v+v+v+v+v+ + * |#|#|#|#|#|#|#|#|#|#|#|#| + * + + + +v+ + +v+ + + + + + + * | |#| |#|#|#|#|#|#|#|#|#| + * + +v+ + +v+v+ +v+v+v+v+v+ + * | |#| | |#|#|#|#|#|#|#|#| + * + + + + + + +v+ + + + + + + * | | | | | |#|#|#|#| | | | + * + + + + + +v+ +v+v+ + + + + * | | | | | |#|#|#|#| | | | + * +-+-+-+-+-+-+-+-+-+-+-+-+ + * | |#|#| | |#| | |#|#|#| + * + +-+-+-+-+-+-+-+-+-+-+-+ + * | |#|#| | |#| | |#|#|#| + * + +-+-+-+-+-+-+-+-+-+-+-+ + * | |#|#| | |#| | |#|#|#| + * + +-+-+-+-+-+-+-+-+-+-+-+ + * | |#|#| | |#| | |#|#|#| + * + +-+-+-+-+-+-+-+-+-+-+-+ + * | |#|#| |#| | | |#|#|#| + * + +-+-+-+-+-+-+-+-+-+-+-+ + * | |#|#| | |#| | |#|#|#| + * + +-+-+-+-+-+-+-+-+-+-+-+ + * | |#|#| | |#| | |#|#|#| + * + +-+-+-+-+-+-+-+-+-+-+-+ + * | |#|#| | |#| | |#|#|#| + * + +-+-+-+-+-+-+-+-+-+-+-+ + * | |#|#| | |#| | |#|#|#| + * + +-+-+-+-+-+-+-+-+-+-+-+ + * |# |#|#|#|#|#|#|#|#|#| + * + +-+-+-+-+-+-+-+-+-+-+-+ + * + * We then perform the same merging process for rows as for + * columns - though there are no rows in the above example + * that can be merged. + * + * You'll note that, for example, we don't merge row 0 and + * row 1 in the above, because we have a pair of cells that + * are both full without crossing. + * + * Step 5: Cell spanning + * + * Now we actually start to output the table. We keep a 'sent_table' + * (a grid of W x H bools) to keep track of whether we've output + * the content for a given cell or not yet. + * + * For each cell we reach, assuming sent_table[x,y] is false, + * we merge it with as many cells on the right as required, + * according to 'v_crossed' values (subject to not passing + * v_lines or uncertainty == 0's). + * + * We then try to merge cells below according to 'h_crossed' + * values (subject to not passing h_lines or uncertainty == 0's). + * + * In theory this can leave us with some cases where we'd like + * to merge some cells (because of crossed) and can't (because + * of lines or sent_table[]) values. In the absence of better + * cell spanning algorithms we have no choice here. + * + * Then we output the contents and set sent_table[] values as + * appropriate. + * + * If a row has no cells in it, we currently omit the TR. If/when + * we figure out how to indicate rowspan/colspan in stext, we can + * revisit that. + */ + + +static fz_stext_block * +add_grid_block(fz_context *ctx, fz_stext_page *page, fz_stext_block **first_block, fz_stext_block **last_block) +{ + fz_stext_block *block = fz_pool_alloc(ctx, page->pool, sizeof(**first_block)); + memset(block, 0, sizeof(*block)); + block->type = FZ_STEXT_BLOCK_GRID; + block->bbox = fz_empty_rect; /* Fixes bug 703267. */ + block->next = *first_block; + if (*first_block) + { + (*first_block)->prev = block; + assert(*last_block); + } + else + { + assert(*last_block == NULL); + *last_block = block; + } + *first_block = block; + return block; +} + +static void +insert_block_before(fz_stext_block *block, fz_stext_block *before, fz_stext_page *page, fz_stext_struct *dest) +{ + if (before) + { + /* We have a block to insert it before, so we know it's not the last. */ + assert(dest ? (dest->first_block != NULL && dest->last_block != NULL) : (page->first_block != NULL && page->last_block != NULL)); + block->next = before; + block->prev = before->prev; + if (before->prev) + { + assert(before->prev->next == before); + before->prev->next = block; + } + else if (dest) + { + assert(dest->first_block == before); + dest->first_block = block; + } + else + { + assert(page->first_block == before); + page->first_block = block; + } + before->prev = block; + } + else if (dest) + { + /* Will be the last block. */ + block->next = NULL; + block->prev = dest->last_block; + if (dest->last_block) + { + assert(dest->last_block->next == NULL); + dest->last_block->next = block; + } + if (dest->first_block == NULL) + dest->first_block = block; + dest->last_block = block; + } + else + { + /* Will be the last block. */ + block->next = NULL; + block->prev = page->last_block; + if (page->last_block) + { + assert(page->last_block->next == NULL); + page->last_block->next = block; + } + if (page->first_block == NULL) + page->first_block = block; + page->last_block = block; + } +} + +static fz_stext_struct * +add_struct_block_before(fz_context *ctx, fz_stext_block *before, fz_stext_page *page, fz_stext_struct *parent, fz_structure std, const char *raw) +{ + fz_stext_block *block; + int idx = 0; + size_t z; + fz_stext_struct *newstruct; + + if (raw == NULL) + raw = ""; + z = strlen(raw); + + /* We're going to insert a struct block. We need an idx, so walk the list */ + for (block = parent ? parent->first_block : page->first_block; block != before; block = block->next) + { + if (block->type == FZ_STEXT_BLOCK_STRUCT) + { + assert(block->u.s.index >= idx); + idx = block->u.s.index + 1; + } + } + /* So we'll add our block as idx. But all the other struct blocks that follow us need to have + * larger values. */ + + /* Update all the subsequent structs to have a higher idx */ + if (before) + { + int idx2 = idx+1; + for (block = before->next; block != NULL; block = block->next) + { + if (block->type != FZ_STEXT_BLOCK_STRUCT) + continue; + if (block->u.s.index > idx2) + break; + block->u.s.index = idx2++; + } + } + + /* Now make our new struct block and insert it. */ + block = fz_pool_alloc(ctx, page->pool, sizeof(*block)); + block->type = FZ_STEXT_BLOCK_STRUCT; + block->bbox = fz_empty_rect; /* Fixes bug 703267. */ + insert_block_before(block, before, page, parent); + + block->u.s.down = newstruct = fz_pool_alloc(ctx, page->pool, sizeof(*block->u.s.down) + z); + block->u.s.index = idx; + newstruct->parent = parent; + newstruct->standard = std; + memcpy(newstruct->raw, raw, z); + newstruct->raw[z] = 0; + newstruct->up = block; + + return newstruct; +} + +typedef struct +{ + int len; + int max; + struct { + uint8_t left; + uint8_t weak; + float pos; + int freq; + } *list; +} div_list; + +static void +div_list_push(fz_context *ctx, div_list *div, int left, int weak, float pos) +{ + int i; + + /* FIXME: Could be bsearch. */ + for (i = 0; i < div->len; i++) + { + if (div->list[i].pos > pos) + break; + else if (div->list[i].pos == pos && div->list[i].left == left) + { + div->list[i].freq++; + div->list[i].weak &= weak; + return; + } + } + + if (div->len == div->max) + { + int newmax = div->max * 2; + if (newmax == 0) + newmax = 32; + div->list = fz_realloc(ctx, div->list, sizeof(div->list[0]) * newmax); + div->max = newmax; + } + + if (i < div->len) + memmove(&div->list[i+1], &div->list[i], sizeof(div->list[0]) * (div->len - i)); + div->len++; + div->list[i].left = left; + div->list[i].weak = weak; + div->list[i].pos = pos; + div->list[i].freq = 1; +} + +static fz_stext_grid_positions * +make_table_positions(fz_context *ctx, div_list *xs, float min, float max) +{ + int wind; + fz_stext_grid_positions *pos; + int len = xs->len; + int i; + int hi = 0; + + /* Count the number of edges */ + int local_min = 0; + int edges = 2; + + if (len == 0) + return NULL; + + assert(xs->list[0].left); + for (i = 0; i < len; i++) + { + if (xs->list[i].pos >= min) + break; + } + for (; i < len; i++) + { + if (xs->list[i].pos >= max) + break; + if (xs->list[i].left) + { + if (local_min) + edges++; + } + else + local_min = 1; + } + assert(!xs->list[len-1].left); + + pos = fz_malloc_flexible(ctx, fz_stext_grid_positions, list, edges); + pos->len = edges; + + /* Copy the edges in */ + wind = 0; + local_min = 0; + edges = 1; + pos->list[0].pos = min; + pos->list[0].min = min; + pos->list[0].max = fz_max(xs->list[0].pos, min); + pos->list[0].uncertainty = 0; + pos->list[0].reinforcement = 0; +#ifdef DEBUG_TABLE_HUNT + printf("|%g ", pos->list[0].pos); +#endif + /* Skip over entries to the left of min. */ + for (i = 0; i < len; i++) + { + if (xs->list[i].pos >= min) + break; + if (xs->list[i].left) + wind += xs->list[i].freq; + else + wind -= xs->list[i].freq; + } + for (; i < len; i++) + { + if (xs->list[i].pos >= max) + break; + if (xs->list[i].left) + { + if (local_min) + { + pos->list[edges].min = xs->list[i-1].pos; + pos->list[edges].max = xs->list[i].pos; + pos->list[edges].pos = (xs->list[i-1].pos + xs->list[i].pos)/2; + pos->list[edges].uncertainty = wind; +#ifdef DEBUG_TABLE_HUNT + if (wind) + printf("?%g(%d) ", pos->list[edges].pos, wind); + else + printf("|%g ", pos->list[edges].pos); +#endif + edges++; + } + wind += xs->list[i].freq; + if (wind > hi) + hi = wind; + } + else + { + wind -= xs->list[i].freq; + local_min = 1; + } + } + assert(i < len || wind == 0); + pos->list[edges].pos = max; + pos->list[edges].min = fz_min(xs->list[i-1].pos, max); + pos->list[edges].max = max; + assert(max >= xs->list[i-1].pos); + pos->list[edges].uncertainty = 0; + pos->list[edges].reinforcement = 0; + pos->max_uncertainty = hi; +#ifdef DEBUG_TABLE_HUNT + printf("|%g\n", pos->list[edges].pos); +#endif + + return pos; +} + +static fz_stext_grid_positions * +copy_grid_positions_to_pool(fz_context *ctx, fz_stext_page *page, fz_stext_grid_positions *xs) +{ + size_t z = offsetof(fz_stext_grid_positions, list) + sizeof(xs->list[0]) * (xs->len); + fz_stext_grid_positions *xs2 = fz_pool_alloc(ctx, page->pool, z); + memcpy(xs2, xs, z); + return xs2; +} + +static void +sanitize_positions(fz_context *ctx, div_list *xs) +{ + int i, j, wind, changed; + +#ifdef DEBUG_TABLE_HUNT + printf("OK:\n"); + for (i = 0; i < xs->len; i++) + { + if (xs->list[i].left) + printf("["); + printf("%g(%d%s)", xs->list[i].pos, xs->list[i].freq, xs->list[i].weak ? "weak" : ""); + if (!xs->list[i].left) + printf("]"); + printf(" "); + } + printf("\n"); +#endif + + if (xs->len == 0) + return; + + do + { + /* Now, combine runs of left and right */ + for (i = 0; i < xs->len; i++) + { + if (xs->list[i].left) + { + j = i; + while (i < xs->len-1 && xs->list[i+1].left) + { + i++; + xs->list[j].freq += xs->list[i].freq; + xs->list[j].weak &= xs->list[i].weak; + xs->list[i].freq = 0; + } + } + else + { + while (i < xs->len-1 && !xs->list[i+1].left) + { + i++; + xs->list[i].freq += xs->list[i-1].freq; + xs->list[i].weak &= xs->list[i-1].weak; + xs->list[i-1].freq = 0; + } + } + } + +#ifdef DEBUG_TABLE_HUNT + printf("Shrunk:\n"); + for (i = 0; i < xs->len; i++) + { + if (xs->list[i].left) + printf("["); + printf("%g(%d%s)", xs->list[i].pos, xs->list[i].freq, xs->list[i].weak ? "weak" : ""); + if (!xs->list[i].left) + printf("]"); + printf(" "); + } + printf("\n"); +#endif + + /* Now remove the 0 frequency ones. */ + j = 0; + for (i = 0; i < xs->len; i++) + { + if (xs->list[i].freq == 0) + continue; + if (i != j) + xs->list[j] = xs->list[i]; + j++; + } + xs->len = j; + + /* Now run across looking for local minima where at least one + * edge is 'weak'. If the wind at that point is non-zero, then + * remove the weak edges from consideration and retry. */ + wind = 0; + changed = 0; + i = 0; + while (1) + { + assert(xs->list[i].left); + for (; xs->list[i].left; i++) + { + wind += xs->list[i].freq; + } + assert(i < xs->len); + for (; xs->list[i].left == 0 && i < xs->len; i++) + { + wind -= xs->list[i].freq; + } + if (i == xs->len) + break; + if (wind != 0 && (xs->list[i-1].weak || xs->list[i].weak)) + { + int m = fz_mini(xs->list[i-1].freq, xs->list[i].freq); + assert(m > 0); + xs->list[i-1].freq -= m; + xs->list[i].freq -= m; + changed = 1; + } + } + } + while (changed); + +#ifdef DEBUG_TABLE_HUNT + printf("Compacted:\n"); + for (i = 0; i < xs->len; i++) + { + if (xs->list[i].left) + printf("["); + printf("%g(%d%s)", xs->list[i].pos, xs->list[i].freq, xs->list[i].weak ? "weak" : ""); + if (!xs->list[i].left) + printf("]"); + printf(" "); + } + printf("\n"); +#endif +} + +/* We want to check for whether a DIV that we are about to descend into + * contains a column of justified text. We will accept some headers in + * this text, but not JUST headers. */ +static int +all_blocks_are_justified_or_headers(fz_context *ctx, fz_stext_block *block) +{ + int just_headers = 1; + + for (; block != NULL; block = block->next) + { + if (block->type == FZ_STEXT_BLOCK_STRUCT) + { + if (block->u.s.down == NULL) + continue; + if (block->u.s.down->standard == FZ_STRUCTURE_H || + block->u.s.down->standard == FZ_STRUCTURE_H1 || + block->u.s.down->standard == FZ_STRUCTURE_H2 || + block->u.s.down->standard == FZ_STRUCTURE_H3 || + block->u.s.down->standard == FZ_STRUCTURE_H4 || + block->u.s.down->standard == FZ_STRUCTURE_H5 || + block->u.s.down->standard == FZ_STRUCTURE_H6) + continue; + if (!all_blocks_are_justified_or_headers(ctx, block->u.s.down->first_block)) + return 0; + } + just_headers = 0; + if (block->type == FZ_STEXT_BLOCK_TEXT && block->u.t.flags != FZ_STEXT_TEXT_JUSTIFY_FULL) + return 0; + } + + if (just_headers) + return 0; + + return 1; +} + +#define TWO_INCHES (72*2) + +/* Walk through the blocks, finding the bbox. */ +static fz_rect +walk_to_find_bounds(fz_context *ctx, fz_stext_block *first_block) +{ + fz_rect bounds = fz_empty_rect; + fz_stext_block *block; + fz_stext_line *line; + fz_stext_char *ch; + + for (block = first_block; block != NULL; block = block->next) + { + switch (block->type) + { + case FZ_STEXT_BLOCK_STRUCT: + if (!block->u.s.down) + continue; + if (block->u.s.down->standard == FZ_STRUCTURE_H) + { + if (block->next != NULL && + block->next->type == FZ_STEXT_BLOCK_TEXT && + block->next->u.t.flags == FZ_STEXT_TEXT_JUSTIFY_FULL) + continue; + } + bounds = fz_union_rect(bounds, walk_to_find_bounds(ctx, block->u.s.down->first_block)); + break; + case FZ_STEXT_BLOCK_VECTOR: + bounds = fz_union_rect(bounds, block->bbox); + break; + case FZ_STEXT_BLOCK_TEXT: + if (block->u.t.flags == FZ_STEXT_TEXT_JUSTIFY_FULL && block->bbox.x1 - block->bbox.x0 >= TWO_INCHES) + continue; + for (line = block->u.t.first_line; line != NULL; line = line->next) + { + for (ch = line->first_char; ch != NULL; ch = ch->next) + { + if (ch->c != ' ') + bounds = fz_union_rect(bounds, fz_rect_from_quad(ch->quad)); + } + } + break; + } + } + + return bounds; +} + +static void +walk_to_find_content(fz_context *ctx, div_list *xs, div_list *ys, fz_stext_block *first_block, fz_rect bounds) +{ + fz_stext_block *block; + fz_stext_line *line; + fz_stext_char *ch; + + for (block = first_block; block != NULL; block = block->next) + { + switch (block->type) + { + case FZ_STEXT_BLOCK_STRUCT: + if (block->u.s.down && !fz_is_empty_rect(fz_intersect_rect(bounds, block->bbox))) + walk_to_find_content(ctx, xs, ys, block->u.s.down->first_block, bounds); + break; + case FZ_STEXT_BLOCK_VECTOR: + break; + case FZ_STEXT_BLOCK_TEXT: + { + fz_rect justified_region = fz_empty_rect; + for (line = block->u.t.first_line; line != NULL; line = line->next) + { + fz_rect region = fz_empty_rect; + + region.y0 = line->bbox.y0; + region.y1 = line->bbox.y1; + + if (region.y0 < bounds.y0) + region.y0 = bounds.y0; + if (region.y1 > bounds.y1) + region.y1 = bounds.y1; + if (region.y0 >= region.y1) + break; + + /* Skip leading spaces. */ + for (ch = line->first_char; ch != NULL; ch = ch->next) + if (ch->c != ' ') + break; + + for (; ch != NULL; ch = ch->next) + { + if (ch->c == ' ') + { + /* Find the last space char in this run. */ + fz_stext_char *last_space; + + for (last_space = ch; last_space->next != NULL && last_space->next->c == ' '; last_space = last_space->next); + + /* If we're not the last char in the line (i.e. we're not a trailing space, + * then send a 'weak' gap for the spaces, assuming it's sane to do so). */ + if (last_space->next != NULL) + { + float rpos = fz_min(ch->quad.ll.x, ch->quad.ul.x); + float lpos = fz_min(last_space->next->quad.ll.x, last_space->next->quad.ll.x); + + /* Clamp these to the bounds */ + rpos = fz_clamp(rpos, bounds.x0, bounds.x1); + lpos = fz_clamp(lpos, bounds.x0, bounds.x1); + + /* So we have a region (rpos...lpos) to add. */ + /* This can be positioned in various different ways relative to the + * current region: + * [region] + * (rpos..lpos) OK + * (rpos..lpos) OK, but adjust lpos + * (rpos..lpos) OK, but adjust rpos + * (rpos..lpos) OK + * (rpos .. lpos) OK + */ + if (lpos >= region.x1) + { + if (rpos >= region.x0 && rpos < region.x1) + rpos = region.x1; + } + else if (rpos <= region.x0) + { + if (lpos > region.x0) + lpos = region.x0; + } + else + rpos = lpos; /* Make it an invalid region */ + + if (rpos < lpos) + { + /* Send a weak right at the start of the spaces... */ + div_list_push(ctx, xs, 0, 1, rpos); + /* and a weak left at the end. */ + div_list_push(ctx, xs, 1, 1, lpos); + } + + /* Expand the region as required. */ + if (rpos < region.x0) + region.x0 = rpos; + if (lpos > region.x1) + region.x1 = lpos; + } + /* Jump over the spaces */ + ch = last_space; + } + else + { + float lpos = fz_min(ch->quad.ll.x, ch->quad.ul.x); + float rpos = fz_max(ch->quad.lr.x, ch->quad.ur.x); + if (lpos < region.x0) + region.x0 = lpos; + if (rpos > region.x1) + region.x1 = rpos; + } + } + if (!fz_is_empty_rect(region)) + { + div_list_push(ctx, xs, 1, 0, region.x0); + div_list_push(ctx, xs, 0, 0, region.x1); + /* For justified regions, we don't break after each line, but + * rather before/after the region as a whole. */ + if (block->u.t.flags != FZ_STEXT_TEXT_JUSTIFY_FULL) + { + div_list_push(ctx, ys, 1, 0, region.y0); + div_list_push(ctx, ys, 0, 0, region.y1); + } + else + justified_region = fz_union_rect(justified_region, region); + } + } + if (!fz_is_empty_rect(justified_region) && block->u.t.flags == FZ_STEXT_TEXT_JUSTIFY_FULL) + { + div_list_push(ctx, ys, 1, 0, justified_region.y0); + div_list_push(ctx, ys, 0, 0, justified_region.y1); + } + break; + } + } + } +} + +/* One of our datastructures (cells_t) is an array of details about the + * cells that make up our table. It's a w * h array of cell_t's. Each + * cell contains data on one of the cells in the table, as you'd expect. + * + * . . + * . . + * - - +-------+ - - + * | . + * | . + * | . + * - - + - - - + - - + * . . + * . . + * + * For any given cell, we store details about the top (lowest y coord) + * and left (lowest x coord) edges. Specifically we store whether + * there is a line at this position (h_line and v_line) (i.e. a drawn + * border), and we also store whether content crosses this edge (h_crossed + * and y_crossed). Finally, we store whether the cell has any content + * in it at all (full). + * + * A table which has w positions across and h positions vertically, will + * only really have (w-1) * (h-1) cells. We store w*h though to allow for + * the right and bottom edges to have their lines represented. + */ + +typedef struct +{ + int h_line; + int v_line; + int h_crossed; + int v_crossed; + int full; +} cell_t; + +typedef struct +{ + int w; + int h; + cell_t cell[FZ_FLEXIBLE_ARRAY]; +} cells_t; + +typedef struct +{ + cells_t *cells; + fz_stext_grid_positions *xpos; + fz_stext_grid_positions *ypos; + fz_rect bounds; +} grid_walker_data; + +static cell_t * +get_cell(cells_t *cells, int x, int y) +{ + return &cells->cell[x + y * cells->w]; +} + +#ifdef DEBUG_TABLE_STRUCTURE +static void +asciiart_table(grid_walker_data *gd); +#endif + +static fz_stext_grid_positions * +split_grid_pos(fz_context *ctx, grid_walker_data *gd, int row, int i, int early) +{ + fz_stext_grid_positions **posp = row ? &gd->ypos : &gd->xpos; + fz_stext_grid_positions *pos = *posp; + int n = pos->len; + int x, y, w, h; + cells_t *cells; + + /* Realloc the required structs */ + *posp = pos = fz_realloc_flexible(ctx, pos, fz_stext_grid_positions, list, n+1); + cells = gd->cells = fz_realloc_flexible(ctx, gd->cells, cells_t, cell, (gd->cells->w + (1-row)) * (gd->cells->h + row)); + /* If both pass, then we're safe to shuffle the data. */ + +#ifdef DEBUG_TABLE_STRUCTURE + printf("Before split %s %d\n", row ? "row" : "col", i); + asciiart_table(gd); +#endif + + assert(i >= 0 && i < n); + memmove(&pos->list[i+1], &pos->list[i], sizeof(pos->list[0]) * (n-i)); + pos->len++; + + /* Next expand the cells. Only h_line and v_line are filled in so far. */ + w = cells->w; + h = cells->h; + if (early && i > 0) + i--, early = 0; + if (row) + { + /* Add a row */ + cells->h = h+1; + /* Expand the table, duplicating row i */ + memmove(&cells->cell[(i+1)*w], &cells->cell[i*w], sizeof(cells->cell[0])*(h-i)*w); + + if (early) + { + /* We are splitting row 0 into 0 and 1, with 0 being the new one. */ + for (x = 0; x < w; x++) + { + cells->cell[x].h_line = 0; + cells->cell[x].v_line = 0; + } + } + else + { + /* We are splitting row i into i and i+1, with i+1 being the new one. */ + /* v_lines are carried over. h_lines need to be unset. */ + for (x = 0; x < w; x++) + cells->cell[x + (i+1)*w].h_line = 0; + } + } + else + { + /* Add a column */ + cells->w = w+1; + /* Expand the table, duplicating column i */ + for (y = h-1; y >= 0; y--) + { + for (x = w; x > i; x--) + cells->cell[x + y*(w+1)] = cells->cell[x-1 + y*w]; + for (; x >= 0; x--) + cells->cell[x + y*(w+1)] = cells->cell[x + y*w]; + } + if (early) + { + /* We are splitting col 0 into 0 and 1, with 0 being the new one. */ + for (y = 0; y < h; y++) + { + cells->cell[y*(w+1)].h_line = 0; + cells->cell[y*(w+1)].v_line = 0; + } + } + else + { + /* h_lines are carried over. v_lines need to be reset */ + for (y = 0; y < h; y++) + cells->cell[i+1 + y*(w+1)].v_line = 0; + } + } + +#ifdef DEBUG_TABLE_STRUCTURE + printf("After split\n"); + asciiart_table(gd); +#endif + return pos; +} + +/* This routine finds (and reinforces) grid positions for lines. + * + * If we have a thin line from (x0, y0) to (x1, y0), then we are + * pretty sure that y0 will be on the edge of a cell. We are less + * sure that x0 and x1 will match up to the edge of a cell. + * Stylistically some tables overrun or underrun such lines. + * + * Similarly from (x0, y0) to (x0, y1), we expect x0 to be accurate + * but y0 and y1 less so. + * + * If we have a wider rectangle, from (x0, y0) to (x1, y1) then + * we fully expect all sides to be accurate. + */ +static int +find_grid_pos(fz_context *ctx, grid_walker_data *gd, int row, float x, int inaccurate) +{ + const int WIGGLE_ROOM = 2; + int i; + fz_stext_grid_positions *pos = row ? gd->ypos : gd->xpos; + + assert(x >= pos->list[0].min && x <= pos->list[pos->len-1].max); + +#ifdef DEBUG_TABLE_STRUCTURE + printf("Looking for %g in %s splits:\n", x, row ? "row" : "col"); + for (i = 0; i < pos->len; i++) + { + printf("%d\t%g\t%g\t%g\t%d\n", i, pos->list[i].min, pos->list[i].pos, pos->list[i].max, pos->list[i].reinforcement); + } +#endif + + while (inaccurate) /* So we can break out */ + { + float prev = 0; + + /* If we start/finish outside the range of the table, then we + * want to extend the table. So ignore 'inaccurate' in this + * case. Match the logic below. */ + if (x < pos->list[0].min) + break; + if (x < pos->list[0].pos - WIGGLE_ROOM && pos->list[0].reinforcement > 0) + break; + if (x > pos->list[pos->len-1].max) + break; + if (x > pos->list[pos->len-1].pos + WIGGLE_ROOM && pos->list[pos->len-1].reinforcement > 0) + break; + + /* Just find the closest one. No reinforcement. */ + for (i = 0; i < pos->len; i++) + { + if (x < pos->list[i].min) + { + float mid = (prev + pos->list[i].min)/2; + if (x < mid) + return i-1; + return i; + } + prev = pos->list[i].max; + if (x <= prev) + return i; + } + assert("Never happens" == NULL); + + return -1; + } + + for (i = 0; i < pos->len; i++) + { + if (x < pos->list[i].min) + { + /* Split i into i and i+1, and make i the new one. */ + assert(i > 0); +#ifdef DEBUG_TABLE_STRUCTURE + printf("Splitting before %d\n", i); +#endif + pos = split_grid_pos(ctx, gd, row, i, 1); + pos->list[i-1].max = pos->list[i].min = (pos->list[i-1].max + x)/2; + pos->list[i].pos = x; + pos->list[i].max = pos->list[i+1].min = (pos->list[i+1].pos + x)/2; + pos->list[i].reinforcement = 1; + return i; + } + else if (x <= pos->list[i].max) + { + /* We are in the range for the ith divider. */ + if (pos->list[i].reinforcement == 0) + { + /* If we've not been reinforced before, reinforce now. */ + pos->list[i].pos = x; + pos->list[i].reinforcement = 1; + return i; + } + /* We've been reinforced before. This ought to be a pretty good + * indication. */ + if (pos->list[i].pos - WIGGLE_ROOM < x && x < pos->list[i].pos + WIGGLE_ROOM) + { + /* We are a close match to the previously predicted pos + * value. */ + pos->list[i].pos = pos->list[i].pos * pos->list[i].reinforcement + x; + pos->list[i].pos /= ++pos->list[i].reinforcement; + return i; + } + /* We need to split i into i and i+1. */ + pos = split_grid_pos(ctx, gd, row, i, pos->list[i].pos > x); + if (pos->list[i].pos > x) + { + /* Make i the new one */ +#ifdef DEBUG_TABLE_STRUCTURE + printf("Splitting %d (early)\n", i); +#endif + pos->list[i].pos = x; + pos->list[i].max = pos->list[i+1].min = (pos->list[i+1].pos + x)/2; + pos->list[i].reinforcement = 1; + return i; + } + else + { + /* Make i+1 the new one */ +#ifdef DEBUG_TABLE_STRUCTURE + printf("Splitting %d (late)\n", i); +#endif + pos->list[i+1].pos = x; + pos->list[i].max = pos->list[i+1].min = (pos->list[i].pos + x)/2; + pos->list[i].reinforcement = 1; + return i+1; + } + } + } + assert("Never happens" == NULL); + + return -1; +} + +static int +find_cell_l(fz_stext_grid_positions *pos, float x) +{ + int i; + + for (i = 0; i < pos->len; i++) + if (x < pos->list[i].pos) + return i-1; + + return -1; +} + +static int +find_cell_r(fz_stext_grid_positions *pos, float x) +{ + int i; + + for (i = 0; i < pos->len; i++) + if (x <= pos->list[i].pos) + return i-1; + + return -1; +} + +/* Add a horizontal line. Return 1 if the line doesn't seem to be a border line. + * Record which cells that was a border for. */ +static void +add_h_line(fz_context *ctx, grid_walker_data *gd, float x0, float x1, float y0, float y1) +{ + int start = find_grid_pos(ctx, gd, 0, x0, 1); + int end = find_grid_pos(ctx, gd, 0, x1, 1); + float y = (y0 + y1) / 2; + int yidx = find_grid_pos(ctx, gd, 1, y, 0); + int i; + + assert(start > 0 && end > 0); + + for (i = start; i < end; i++) + get_cell(gd->cells, i, yidx)->h_line++; +} + +/* Add a vertical line. Return 1 if the line doesn't seem to be a border line. + * Record which cells that was a border for. */ +static void +add_v_line(fz_context *ctx, grid_walker_data *gd, float y0, float y1, float x0, float x1) +{ + int start = find_grid_pos(ctx, gd, 1, y0, 1); + int end = find_grid_pos(ctx, gd, 1, y1, 1); + float x = (x0 + x1) / 2; + int xidx = find_grid_pos(ctx, gd, 0, x, 0); + int i; + + for (i = start; i < end; i++) + get_cell(gd->cells, xidx, i)->v_line++; +} + +static void +add_hv_line(fz_context *ctx, grid_walker_data *gd, float x0, float x1, float y0, float y1, int stroked) +{ + int ix0 = find_grid_pos(ctx, gd, 0, x0, 0); + int ix1 = find_grid_pos(ctx, gd, 0, x1, 0); + int iy0 = find_grid_pos(ctx, gd, 1, y0, 0); + int iy1 = find_grid_pos(ctx, gd, 1, y1, 0); + int i; + + if (stroked) + { + for (i = ix0; i < ix1; i++) + { + get_cell(gd->cells, i, iy0)->h_line++; + get_cell(gd->cells, i, iy1)->h_line++; + } + for (i = iy0; i < iy1; i++) + { + get_cell(gd->cells, ix0, i)->v_line++; + get_cell(gd->cells, ix1, i)->v_line++; + } + } +} + +/* Shared internal routine with stext-boxer.c */ +fz_rect fz_collate_small_vector_run(fz_stext_block **blockp); + +static void +walk_grid_lines(fz_context *ctx, grid_walker_data *gd, fz_stext_block *block) +{ + for (; block != NULL; block = block->next) + { + if (block->type == FZ_STEXT_BLOCK_STRUCT) + { + if (block->u.s.down) + walk_grid_lines(ctx, gd, block->u.s.down->first_block); + continue; + } + else if (block->type == FZ_STEXT_BLOCK_VECTOR) + { + fz_rect r; + float w, h; + + /* Only process rectangle blocks. */ + if ((block->u.v.flags & FZ_STEXT_VECTOR_IS_RECTANGLE) == 0) + continue; + + r = fz_collate_small_vector_run(&block); + r = fz_intersect_rect(r, gd->bounds); + if (!fz_is_valid_rect(r)) + continue; + + w = r.x1 - r.x0; + h = r.y1 - r.y0; + if (w > h && h < 2) + { + /* Thin, wide line */ + (void) add_h_line(ctx, gd, r.x0, r.x1, r.y0, r.y1); + } + else if (w < h && w < 2) + { + /* Thin, wide line */ + (void) add_v_line(ctx, gd, r.y0, r.y1, r.x0, r.x1); + } + else + { + /* Rectangle */ + (void) add_hv_line(ctx, gd, r.x0, r.x1, r.y0, r.y1, block->u.v.flags & FZ_STEXT_VECTOR_IS_STROKED); + } + } + } +} + +static int +is_numeric(int c) +{ + return (c >= '0' && c <= '9'); +} + +static int +mark_cells_for_content(fz_context *ctx, grid_walker_data *gd, fz_rect s) +{ + fz_rect r = fz_intersect_rect(gd->bounds, s); + int x0, x1, y0, y1, x, y; + + if (fz_is_empty_rect(r)) + return 0; + + x0 = find_cell_l(gd->xpos, r.x0); + x1 = find_cell_r(gd->xpos, r.x1); + y0 = find_cell_l(gd->ypos, r.y0); + y1 = find_cell_r(gd->ypos, r.y1); + + if (x0 < 0 || x1 < 0 || y0 < 0 || y1 < 0) + return 1; + if (x0 < x1) + { + for (y = y0; y <= y1; y++) + for (x = x0; x < x1; x++) + get_cell(gd->cells, x+1, y)->v_crossed++; + } + if (y0 < y1) + { + for (y = y0; y < y1; y++) + for (x = x0; x <= x1; x++) + get_cell(gd->cells, x, y+1)->h_crossed++; + } + for (y = y0; y <= y1; y++) + for (x = x0; x <= x1; x++) + get_cell(gd->cells, x, y)->full++; + + return 0; +} + +#define IN_CELL 0 +#define IN_BORDER 1 +#define IN_UNKNOWN 2 + +#define SPLIT_MARGIN 4 +static int +where_is(fz_stext_grid_positions *pos, float x, int *in) +{ + int i; + + *in = IN_UNKNOWN; + + /* If the table is empty, nothing to do. */ + if (pos->len == 0) + return -1; + + /* If we are completely outside the table, give up. */ + if (x <= pos->list[0].pos - SPLIT_MARGIN || x >= pos->list[pos->len-1].max + SPLIT_MARGIN) + return -1; + + for (i = 0; i < pos->len; i++) + { + /* Calculate the region (below..above) that counts as being + * on the border of position i. */ + float prev = i > 0 ? pos->list[i-1].max : pos->list[0].min; + float next = i < pos->len-1 ? pos->list[i+1].min : pos->list[i].max; + float below = pos->list[i].pos - SPLIT_MARGIN; + float above = pos->list[i].pos + SPLIT_MARGIN; + /* Find the distance half way back to the previous pos as + * a limit to our margin. */ + prev = (prev + pos->list[i].pos)/2; + next = (next + pos->list[i].pos)/2; + if (below < prev) + below = prev; + if (above > next) + above = next; + + if (x < below) + { + *in = IN_CELL; + return i-1; + } + else if (x <= above) + { + *in = IN_BORDER; + return i; + } + } + + *in = IN_BORDER; + return i-1; +} + +enum +{ + VECTOR_IS_CONTENT = 0, + VECTOR_IS_BORDER = 1, + VECTOR_IS_UNKNOWN = 2, + VECTOR_IS_IGNORABLE = 3 +}; + +/* So a vector can either be a border, or contained + * in some cells, or something completely else. */ +static int +classify_vector(fz_context *ctx, grid_walker_data *gd, fz_rect r, int is_rect) +{ + int at_x0, at_x1, at_y0, at_y1; + int ix0, ix1, iy0, iy1; + + r = fz_intersect_rect(r, gd->bounds); + if (fz_is_empty_rect(r)) + return VECTOR_IS_IGNORABLE; + ix0 = where_is(gd->xpos, r.x0, &at_x0); + ix1 = where_is(gd->xpos, r.x1, &at_x1); + iy0 = where_is(gd->ypos, r.y0, &at_y0); + iy1 = where_is(gd->ypos, r.y1, &at_y1); + + /* No idea, just treat it as a border. */ + if (at_x0 == IN_UNKNOWN || at_x1 == IN_UNKNOWN || at_y0 == IN_UNKNOWN || at_y1 == IN_UNKNOWN) + return VECTOR_IS_IGNORABLE; + + if (at_x0 == IN_BORDER && at_x1 == IN_BORDER) + { + /* Vector is aligned along sides of cells. */ + return is_rect ? VECTOR_IS_BORDER : VECTOR_IS_CONTENT; + } + if (at_y0 == IN_BORDER && at_y1 == IN_BORDER) + { + /* Vector is aligned along sides of cells. */ + return is_rect ? VECTOR_IS_BORDER : VECTOR_IS_CONTENT; + } + if (at_x0 == IN_CELL && at_x1 == IN_CELL) + { + /* Content within a cell (or 1d range of cells). */ + return VECTOR_IS_CONTENT; + } + if (at_y0 == IN_CELL && at_y1 == IN_CELL) + { + /* Content within a cell (or 1d range of cells). */ + return VECTOR_IS_CONTENT; + } + if (at_x0 == IN_BORDER && at_x1 == IN_CELL && ix0 == ix1) + { + return is_rect ? VECTOR_IS_BORDER : VECTOR_IS_CONTENT; + } + if (at_x0 == IN_CELL && at_x1 == IN_BORDER && ix0+1 == ix1) + { + return is_rect ? VECTOR_IS_BORDER : VECTOR_IS_CONTENT; + } + if (at_y0 == IN_BORDER && at_y1 == IN_CELL && iy0 == iy1) + { + return is_rect ? VECTOR_IS_BORDER : VECTOR_IS_CONTENT; + } + if (at_y0 == IN_CELL && at_y1 == IN_BORDER && iy0+1 == iy1) + { + return is_rect ? VECTOR_IS_BORDER : VECTOR_IS_CONTENT; + } + + if (is_rect) + { + return VECTOR_IS_IGNORABLE; + } + + /* unknown - take this as indication that this maybe isn't + * table. */ + return VECTOR_IS_UNKNOWN; +} + +#undef IN_CELL +#undef IN_BORDER +#undef IN_UNKNOWN + + +/* Walk through the content, looking at how it spans our grid. + * Record gridlines, which cells have content that cross into + * neighbours, and which cells have any content at all. + * Return a count of vector graphics that are found that don't + * look plausibly like cell contents. */ +static int +calculate_spanned_content(fz_context *ctx, grid_walker_data *gd, fz_stext_block *block) +{ + int duff = 0; + fz_rect bounds = { + gd->xpos->list[0].pos, + gd->ypos->list[0].pos, + gd->xpos->list[gd->xpos->len-1].pos, + gd->ypos->list[gd->ypos->len-1].pos }; + + for (; block != NULL; block = block->next) + { + if (block->type == FZ_STEXT_BLOCK_STRUCT) + { + if (block->u.s.down) + duff += calculate_spanned_content(ctx, gd, block->u.s.down->first_block); + continue; + } + else if (block->type == FZ_STEXT_BLOCK_VECTOR) + { + switch (classify_vector(ctx, gd, block->bbox, !!(block->u.v.flags & FZ_STEXT_VECTOR_IS_RECTANGLE))) + { + case VECTOR_IS_CONTENT: + mark_cells_for_content(ctx, gd, block->bbox); + break; + case VECTOR_IS_BORDER: + case VECTOR_IS_IGNORABLE: + break; + default: + duff++; + break; + } + } + else if (block->type == FZ_STEXT_BLOCK_TEXT) + { + fz_stext_line *line; + + if (block->bbox.x0 >= bounds.x1 || block->bbox.y0 >= bounds.y1 || + block->bbox.x1 <= bounds.x0 || block->bbox.y1 <= bounds.y0) + continue; + + for (line = block->u.t.first_line; line != NULL; line = line->next) + { + fz_stext_char *ch = line->first_char; + int was_numeric = 0; + + /* Skip leading spaces */ + while (ch != NULL && ch->c == ' ') + ch = ch->next; + + for (; ch != NULL; ch = ch->next) + { + if (ch->c == 32) + { + /* Trailing space, skip it. */ + if (ch->next == NULL) + break; + if (ch->next->c == 32) + { + /* Run of spaces. Skip 'em. */ + while (ch->next && ch->next->c == 32) + ch = ch->next; + was_numeric = 0; + continue; + } + if (was_numeric || is_numeric(ch->next->c)) + { + /* Single spaces around numbers are ignored. */ + was_numeric = 0; + continue; + } + /* A single space. Accept it. */ + was_numeric = 0; + } + else + was_numeric = is_numeric(ch->c); + duff += mark_cells_for_content(ctx, gd, fz_rect_from_quad(ch->quad)); + } + } + } + } + + return duff; +} + +static cells_t *new_cells(fz_context *ctx, int w, int h) +{ + cells_t *cells = fz_malloc_flexible(ctx, cells_t, cell, w * h); + cells->w = w; + cells->h = h; + + return cells; +} + +#ifdef DEBUG_TABLE_STRUCTURE +static void +asciiart_table(grid_walker_data *gd) +{ + int w = gd->xpos->len; + int h = gd->ypos->len; + int x, y; + + for (y = 0; y < h; y++) + { + for (x = 0; x < w-1; x++) + { + cell_t *cell = get_cell(gd->cells, x, y); + int line = cell->h_line; + int erase = cell->h_crossed; + printf("+"); + if (line && !erase) + { + printf("-"); + } + else if (!line && erase) + { + printf("v"); + } + else if (line && erase) + { + printf("*"); + } + else + { + printf(" "); + } + } + printf("+\n"); + if (y == h-1) + break; + for (x = 0; x < w; x++) + { + cell_t *cell = get_cell(gd->cells, x, y); + int line = cell->v_line; + int erase = cell->v_crossed; + if (line && !erase) + { + printf("|"); + } + else if (!line && erase) + { + printf(">"); + } + else if (line && erase) + { + printf("*"); + } + else + { + printf(" "); + } + if (x < w-1) + { + if (cell->full) + printf("#"); + else + printf(" "); + } + else + printf("\n"); + } + } +} +#endif + +static void +recalc_bbox(fz_stext_block *block) +{ + fz_rect bbox = fz_empty_rect; + fz_stext_line *line; + + for (line = block->u.t.first_line; line != NULL; line = line->next) + bbox = fz_union_rect(bbox, line->bbox); + + block->bbox = bbox; +} + +static void +unlink_line_from_block(fz_stext_line *line, fz_stext_block *block) +{ + fz_stext_line *next_line = line->next; + + if (line->prev) + { + assert(line->prev->next == line); + line->prev->next = next_line; + } + else + { + assert(block->u.t.first_line == line); + block->u.t.first_line = next_line; + } + if (next_line) + { + assert(next_line->prev == line); + next_line->prev = line->prev; + } + else + { + assert(block->u.t.last_line == line); + block->u.t.last_line = line->prev; + } +} + +static void +append_line_to_block(fz_stext_line *line, fz_stext_block *block) +{ + if (block->u.t.last_line == NULL) + { + assert(block->u.t.first_line == NULL); + block->u.t.first_line = block->u.t.last_line = line; + line->prev = NULL; + } + else + { + assert(block->u.t.last_line->next == NULL); + line->prev = block->u.t.last_line; + block->u.t.last_line->next = line; + block->u.t.last_line = line; + } + line->next = NULL; +} + +static void +unlink_block(fz_stext_block *block, fz_stext_block **first, fz_stext_block **last) +{ + if (block->prev) + { + assert(block->prev->next == block); + block->prev->next = block->next; + } + else + { + assert(*first == block); + *first = block->next; + } + if (block->next) + { + assert(block->next->prev == block); + block->next->prev = block->prev; + } + else + { + assert(*last == block); + *last = block->prev; + } +} + +#ifndef NDEBUG +static int +verify_stext(fz_context *ctx, fz_stext_page *page, fz_stext_struct *src) +{ + fz_stext_block *block; + fz_stext_block **first = src ? &src->first_block : &page->first_block; + fz_stext_block **last = src ? &src->last_block : &page->last_block; + int max = 0; + + assert((*first == NULL) == (*last == NULL)); + + for (block = *first; block != NULL; block = block->next) + { + fz_stext_line *line; + + if (block->prev == NULL) + assert(*first == block); + else + assert(block->prev->next == block); + if (block->next == NULL) + assert(*last == block); + else + assert(block->next->prev == block); + + if (block->type == FZ_STEXT_BLOCK_STRUCT) + { + if (block->u.s.down) + { + int m = verify_stext(ctx, page, block->u.s.down); + if (m > max) + max = m; + } + continue; + } + if (block->type != FZ_STEXT_BLOCK_TEXT) + continue; + assert((block->u.t.first_line == NULL) == (block->u.t.last_line == NULL)); + for (line = block->u.t.first_line; line != NULL; line = line->next) + { + fz_stext_char *ch; + + if (line->next == NULL) + assert(block->u.t.last_line == line); + else + assert(line->next->prev == line); + + assert((line->first_char == NULL) == (line->last_char == NULL)); + + for (ch = line->first_char; ch != NULL; ch = ch->next) + assert(ch->next != NULL || line->last_char == ch); + } + } + + return max+1; +} +#endif + +static fz_rect +move_contained_content(fz_context *ctx, fz_stext_page *page, fz_stext_struct *dest, fz_stext_struct *src, fz_rect r) +{ + fz_stext_block *before = dest ? dest->first_block : page->first_block; + fz_stext_block **sfirst = src ? &src->first_block : &page->first_block; + fz_stext_block **slast = src ? &src->last_block : &page->last_block; + fz_stext_block *block, *next_block; + + for (block = *sfirst; block != NULL; block = next_block) + { + fz_rect bbox = fz_intersect_rect(block->bbox, r); + next_block = block->next; + /* Don't use fz_is_empty_rect here, as that will exclude zero height areas like spaces. */ + if (bbox.x0 > bbox.x1 || bbox.y0 > bbox.y1) + continue; /* Trivially excluded */ + if (bbox.x0 == block->bbox.x0 && bbox.y0 == block->bbox.y0 && bbox.x1 == block->bbox.x1 && bbox.y1 == block->bbox.y1) + { + /* Trivially included */ + if (block->type == FZ_STEXT_BLOCK_STRUCT && block->u.s.down) + { + if (block->u.s.down->standard == FZ_STRUCTURE_TD) + { + /* Don't copy TDs! Just copy the contents */ + move_contained_content(ctx, page, dest, block->u.s.down, r); + continue; + } + } + unlink_block(block, sfirst, slast); + insert_block_before(block, before, page, dest); + assert(before == block->next); + continue; + } + if (block->type == FZ_STEXT_BLOCK_STRUCT) + { + if (block->u.s.down) + move_contained_content(ctx, page, dest, block->u.s.down, r); + } + if (block->type == FZ_STEXT_BLOCK_TEXT) + { + /* Partially included text block */ + fz_stext_line *line, *next_line; + fz_stext_block *newblock = NULL; + + for (line = block->u.t.first_line; line != NULL; line = next_line) + { + fz_rect lrect = fz_intersect_rect(line->bbox, r); + next_line = line->next; + + /* Don't use fz_is_empty_rect here, as that will exclude zero height areas like spaces. */ + if (lrect.x0 > lrect.x1 || lrect.y0 > lrect.y1) + continue; /* Trivial exclusion */ + if (line->bbox.x0 == lrect.x0 && line->bbox.y0 == lrect.y0 && line->bbox.x1 == lrect.x1 && line->bbox.y1 == lrect.y1) + { + /* Trivial inclusion */ + if (newblock == NULL) + { + newblock = fz_pool_alloc(ctx, page->pool, sizeof(fz_stext_block)); + insert_block_before(newblock, before, page, dest); + newblock->u.t.flags = block->u.t.flags; + assert(before == newblock->next); + } + + unlink_line_from_block(line, block); + append_line_to_block(line, newblock); + } + else + { + /* Need to walk the line and just take parts */ + fz_stext_line *newline = NULL; + fz_stext_char *ch, *next_ch, *prev_ch = NULL; + + for (ch = line->first_char; ch != NULL; ch = next_ch) + { + fz_rect crect = fz_rect_from_quad(ch->quad); + float x = (crect.x0 + crect.x1)/2; + float y = (crect.y0 + crect.y1)/2; + next_ch = ch->next; + if (r.x0 > x || r.x1 < x || r.y0 > y || r.y1 < y) + { + prev_ch = ch; + continue; + } + /* Take this char */ + if (newline == NULL) + { + newline = fz_pool_alloc(ctx, page->pool, sizeof(*newline)); + newline->dir = line->dir; + newline->wmode = line->wmode; + newline->bbox = fz_empty_rect; + } + /* Unlink char */ + if (prev_ch == NULL) + line->first_char = next_ch; + else + prev_ch->next = next_ch; + if (next_ch == NULL) + line->last_char = prev_ch; + /* Relink char */ + ch->next = NULL; + if (newline->last_char == NULL) + newline->first_char = ch; + else + newline->last_char->next = ch; + newline->last_char = ch; + newline->bbox = fz_union_rect(newline->bbox, crect); + } + if (line->first_char == NULL) + { + /* We've removed all the chars from this line. + * Better remove the line too. */ + if (line->prev) + line->prev->next = next_line; + else + block->u.t.first_line = next_line; + if (next_line) + next_line->prev = line->prev; + else + block->u.t.last_line = line->prev; + } + if (newline) + { + if (newblock == NULL) + { + newblock = fz_pool_alloc(ctx, page->pool, sizeof(fz_stext_block)); + + /* Add the block onto our target list */ + insert_block_before(newblock, before, page, dest); + newblock->u.t.flags = block->u.t.flags; + before = newblock->next; + } + append_line_to_block(newline, newblock); + } + } + } + if (newblock) + { + recalc_bbox(block); + recalc_bbox(newblock); + } + if (block->u.t.first_line == NULL) + { + /* We've removed all the lines from the block. Should remove that too! */ + if (block->prev) + { + assert(block->prev->next == block); + block->prev->next = next_block; + } + else + { + assert(*sfirst == block); + *sfirst = block->next; + } + if (next_block) + { + assert(next_block->prev == block); + next_block->prev = block->prev; + } + else + { + assert(*slast == block); + *slast = block->prev; + } + } + } + } + + return r; +} + +typedef struct +{ + fz_stext_block *block; + fz_stext_struct *parent; +} tree_pos; + +/* This is still not perfect, but it's better! */ +static tree_pos +find_table_insertion_point(fz_context *ctx, fz_rect r, tree_pos current, tree_pos best) +{ + fz_stext_block *block; + + for (block = current.block; block != NULL; block = block->next) + { + if (block->type == FZ_STEXT_BLOCK_STRUCT) + { + if (block->u.s.down != NULL && fz_is_rect_inside_rect(r, block->bbox)) + { + tree_pos down; + + down.parent = block->u.s.down; + down.block = block->u.s.down->first_block; + best = find_table_insertion_point(ctx, r, down, best); + } + } + else + { + /* Is block a better precursor than best? (Or a valid precursor, if best.block == NULL) */ + if (block->bbox.y1 < r.y0 && (best.block == NULL || best.block->bbox.y1 < block->bbox.y1)) + { + best.block = block; + best.parent = current.parent; + } + } + } + + return best; +} + +static int +tr_is_empty(fz_context *ctx, fz_stext_block *block) +{ + for (; block != NULL; block = block->next) + { + if (block->type != FZ_STEXT_BLOCK_STRUCT) + return 0; + if (!block->u.s.down) + { + } + else if (block->u.s.down->standard != FZ_STRUCTURE_TD) + return 0; + else if (block->u.s.down->first_block != NULL) + return 0; + } + + return 1; +} + +static int +table_is_empty(fz_context *ctx, fz_stext_block *block) +{ + for (; block != NULL; block = block->next) + { + if (block->type == FZ_STEXT_BLOCK_GRID) + continue; + if (block->type != FZ_STEXT_BLOCK_STRUCT) + return 0; + if (!block->u.s.down) + { + } + else if (block->u.s.down->standard != FZ_STRUCTURE_TR) + return 0; + else if (!tr_is_empty(ctx, block->u.s.down->first_block)) + return 0; + } + + return 1; +} + +static void +tidy_td_divs(fz_context *ctx, fz_stext_struct *parent) +{ + while (1) + { + fz_stext_block *block = parent->first_block; + + if (block == NULL || block->next != NULL || block->type != FZ_STEXT_BLOCK_STRUCT || block->u.s.down == NULL || block->u.s.down->standard != FZ_STRUCTURE_DIV) + return; + + parent->first_block = block->u.s.down->first_block; + parent->last_block = block->u.s.down->last_block; + } +} + +static void +tidy_tr_divs(fz_context *ctx, fz_stext_block *block) +{ + for (; block != NULL; block = block->next) + { + if (block->type == FZ_STEXT_BLOCK_STRUCT && block->u.s.down && block->u.s.down->standard == FZ_STRUCTURE_TD) + tidy_td_divs(ctx, block->u.s.down); + } +} + +static void +tidy_table_divs(fz_context *ctx, fz_stext_block *block) +{ + for (; block != NULL; block = block->next) + { + if (block->type == FZ_STEXT_BLOCK_GRID) + continue; + if (block->type == FZ_STEXT_BLOCK_STRUCT && block->u.s.down && block->u.s.down->standard == FZ_STRUCTURE_TR) + tidy_tr_divs(ctx, block->u.s.down->first_block); + } +} + +static int +struct_is_empty(fz_context *ctx, fz_stext_block *block) +{ + for (; block != NULL; block = block->next) + { + if (block->type != FZ_STEXT_BLOCK_STRUCT) + return 0; + if (!block->u.s.down) + { + } + else if (!struct_is_empty(ctx, block->u.s.down->first_block)) + return 0; + } + + return 1; +} + +static int +div_is_empty(fz_context *ctx, fz_stext_block *block) +{ + for (; block != NULL; block = block->next) + { + if (block->type != FZ_STEXT_BLOCK_STRUCT) + return 0; + if (!block->u.s.down) + { + } + else if (block->u.s.down->standard == FZ_STRUCTURE_TABLE) + { + tidy_table_divs(ctx, block->u.s.down->first_block); + return table_is_empty(ctx, block->u.s.down->first_block); + } + else if (block->u.s.down->standard != FZ_STRUCTURE_DIV) + { + if (!struct_is_empty(ctx, block->u.s.down->first_block)) + return 0; + } + else if (!div_is_empty(ctx, block->u.s.down->first_block)) + return 0; + } + + return 1; +} + +static void +tidy_orphaned_tables(fz_context *ctx, fz_stext_page *page, fz_stext_struct *parent) +{ + fz_stext_block **first_blockp = parent ? &parent->first_block : &page->first_block; + fz_stext_block **last_blockp = parent ? &parent->last_block : &page->last_block; + fz_stext_block *block, *next_block; + + for (block = *first_blockp; block != NULL; block = next_block) + { + next_block = block->next; + if (block->type == FZ_STEXT_BLOCK_STRUCT && block->u.s.down) + { + if (block->u.s.down->standard == FZ_STRUCTURE_TABLE) + { + tidy_table_divs(ctx, block->u.s.down->first_block); + if (table_is_empty(ctx, block->u.s.down->first_block)) + { + /* Remove block */ + if (block->prev) + { + assert(block->prev->next == block); + block->prev->next = next_block; + } + else + { + assert(*first_blockp == block); + *first_blockp = next_block; + } + if (next_block) + { + assert(next_block->prev == block); + next_block->prev = block->prev; + } + else + { + assert(*last_blockp == block); + *last_blockp = block->prev; + } + } + else + { + tidy_orphaned_tables(ctx, page, block->u.s.down); + } + } + if (block->u.s.down->standard == FZ_STRUCTURE_DIV) + { + tidy_orphaned_tables(ctx, page, block->u.s.down); + if (div_is_empty(ctx, block->u.s.down->first_block)) + { + /* Remove block */ + if (block->prev) + { + assert(block->prev->next == block); + block->prev->next = next_block; + } + else + { + assert(*first_blockp == block); + *first_blockp = next_block; + } + if (next_block) + { + assert(next_block->prev == block); + next_block->prev = block->prev; + } + else + { + assert(*last_blockp == block); + *last_blockp = block->prev; + } + } + } + } + } +} + +static fz_stext_struct * +transcribe_table(fz_context *ctx, grid_walker_data *gd, fz_stext_page *page, fz_stext_struct *parent) +{ + int w = gd->xpos->len; + int h = gd->ypos->len; + int x, y; + char *sent_tab = fz_calloc(ctx, 1, w*(size_t)h); + fz_stext_block **first_block = parent ? &parent->first_block : &page->first_block; + fz_stext_struct *table, *tr, *td; + fz_rect r; + tree_pos insert, top; + + /* Where should we insert the table in the data? */ + r.x0 = gd->xpos->list[0].pos; + r.x1 = gd->xpos->list[w-1].pos; + r.y0 = gd->ypos->list[0].pos; + r.y1 = gd->ypos->list[h-1].pos; +#ifdef DEBUG_TABLE_STRUCTURE + fz_print_stext_page_as_xml(ctx, fz_stddbg(ctx), page, 0); +#endif + top.block = *first_block; + top.parent = parent; + insert.block = NULL; + insert.parent = NULL; + insert = find_table_insertion_point(ctx, r, top, insert); + + /* Convert to 'before' */ + if (insert.block) + insert.block = insert.block->next; + + /* Make table */ + table = add_struct_block_before(ctx, insert.block, page, insert.parent, FZ_STRUCTURE_TABLE, "Table"); + + /* Run through the cells, and guess at spanning. */ + for (y = 0; y < h-1; y++) + { + /* Have we sent this entire row before? */ + for (x = 0; x < w-1; x++) + { + if (!sent_tab[x+y*w]) + break; + } + if (x == w-1) + continue; /* No point in sending a row with nothing in it! */ + + /* Make TR */ + tr = add_struct_block_before(ctx, NULL, page, table, FZ_STRUCTURE_TR, "TR"); + + for (x = 0; x < w-1; x++) + { + int x2, y2; + int cellw = 1; + int cellh = 1; + + /* Have we sent this cell already? */ + if (sent_tab[x+y*w]) + continue; + + /* Find the width of the cell */ + for (x2 = x+1; x2 < w-1; x2++) + { + cell_t *cell = get_cell(gd->cells, x2, y); + if (cell->v_line) + break; /* Can't go past a line */ + if (gd->xpos->list[x2].uncertainty == 0) + break; /* An uncertainty of 0 is as good as a line. */ + if (!cell->v_crossed) + break; + cellw++; + } + /* Find the height of the cell */ + for (y2 = y+1; y2 < h-1; y2++) + { + cell_t *cell; + int h_crossed = 0; + if (gd->ypos->list[y2].uncertainty == 0) + break; /* An uncertainty of 0 is as good as a line. */ + + cell = get_cell(gd->cells, x, y2); + if (cell->h_line) + break; /* Can't extend down through a line. */ + if (cell->h_crossed) + h_crossed = 1; + for (x2 = x+1; x2 < x+cellw; x2++) + { + cell = get_cell(gd->cells, x2, y2); + if (cell->h_line) + break; + if (cell->v_line) + break; /* Can't go past a line */ + if (gd->xpos->list[x2].uncertainty == 0) + break; /* An uncertainty of 0 is as good as a line. */ + if (!cell->v_crossed) + break; + if (cell->h_crossed) + h_crossed = 1; + } + if (x2 == x+cellw && h_crossed) + cellh++; + else + break; + } + /* Make TD */ + td = add_struct_block_before(ctx, NULL, page, tr, FZ_STRUCTURE_TD, "TD"); + r.x0 = gd->xpos->list[x].pos; + r.x1 = gd->xpos->list[x+cellw].pos; + r.y0 = gd->ypos->list[y].pos; + r.y1 = gd->ypos->list[y+cellh].pos; + /* Use r, not REAL contents bbox, as otherwise spanned rows + * can end up empty. */ + td->up->bbox = r; + move_contained_content(ctx, page, td, parent, r); +#ifdef DEBUG_TABLE_STRUCTURE + printf("(%d,%d) + (%d,%d)\n", x, y, cellw, cellh); +#endif + for (y2 = y; y2 < y+cellh; y2++) + for (x2 = x; x2 < x+cellw; x2++) + sent_tab[x2+y2*w] = 1; + } + r.x0 = gd->xpos->list[0].pos; + r.x1 = gd->xpos->list[gd->xpos->len-1].pos; + r.y0 = gd->ypos->list[y].pos; + r.y1 = gd->ypos->list[y+1].pos; + tr->up->bbox = r; + table->up->bbox = fz_union_rect(table->up->bbox, tr->up->bbox); + } + fz_free(ctx, sent_tab); + + { + fz_stext_block *block; + fz_stext_grid_positions *xps2 = copy_grid_positions_to_pool(ctx, page, gd->xpos); + fz_stext_grid_positions *yps2 = copy_grid_positions_to_pool(ctx, page, gd->ypos); + block = add_grid_block(ctx, page, &table->first_block, &table->last_block); + block->u.b.xs = xps2; + block->u.b.ys = yps2; + block->bbox.x0 = block->u.b.xs->list[0].pos; + block->bbox.y0 = block->u.b.ys->list[0].pos; + block->bbox.x1 = block->u.b.xs->list[block->u.b.xs->len-1].pos; + block->bbox.y1 = block->u.b.ys->list[block->u.b.ys->len-1].pos; + } + tidy_orphaned_tables(ctx, page, parent); + + return table; +} + +static void +merge_column(grid_walker_data *gd, int x) +{ + int y; + for (y = 0; y < gd->cells->h; y++) + { + cell_t *d = &gd->cells->cell[x + y * (gd->cells->w-1)]; + cell_t *s = &gd->cells->cell[x + y * gd->cells->w]; + + if (x > 0) + memcpy(d-x, s-x, sizeof(*d) * x); + d->full = s[0].full || s[1].full; + d->h_crossed = s[0].h_crossed || s[1].h_crossed; + d->h_line = s[0].h_line; /* == s[1].h_line */ + d->v_crossed = s[0].v_crossed; + d->v_line = s[0].v_line; + if (x < gd->cells->w - 2) + memcpy(d+1, s+2, sizeof(*d) * (gd->cells->w - 2 - x)); + } + gd->cells->w--; + + if (x < gd->xpos->len - 2) + memcpy(&gd->xpos->list[x+1], &gd->xpos->list[x+2], sizeof(gd->xpos->list[0]) * (gd->xpos->len - 2 - x)); + gd->xpos->len--; +} + +static void +merge_columns(grid_walker_data *gd) +{ + int x, y; + + for (x = gd->cells->w-3; x >= 0; x--) + { + /* Can column x be merged with column x+1? */ + /* An empty column can certainly be merged if the h_lines are the same, + * and there is no v_line. */ + for (y = 0; y < gd->cells->h-1; y++) + { + cell_t *a = get_cell(gd->cells, x, y); + cell_t *b = get_cell(gd->cells, x+1, y); + if (a->full || !!a->h_line != !!b->h_line || b->v_line) + break; + } + if (y == gd->cells->h-1) + goto merge_column; + /* An empty column can certainly be merged if the h_lines are the same. */ + for (y = 0; y < gd->cells->h-1; y++) + { + cell_t *a = get_cell(gd->cells, x, y); + cell_t *b = get_cell(gd->cells, x+1, y); + if (b->full || !!a->h_line != !!b->h_line || b->v_line) + break; + } + if (y == gd->cells->h-1) + goto merge_column; + /* We only ever want to merge columns if content crossed between them somewhere. + * Don't use uncertainty for this, because uncertainty doesn't allow for + * whitespace. */ + for (y = 0; y < gd->cells->h-1; y++) + if (get_cell(gd->cells, x+1, y)->v_crossed == 1) + break; + if (y == gd->cells->h-1) + continue; + /* This requires all the pairs of cells in those 2 columns to be mergeable. */ + for (y = 0; y < gd->cells->h-1; y++) + { + cell_t *a = get_cell(gd->cells, x, y); + cell_t *b = get_cell(gd->cells, x+1, y); + /* If there is a divider, we can't merge. */ + if (b->v_line) + break; + /* If either is empty, we can merge. */ + if (!a->full || !b->full) + continue; + /* If we differ in h linedness, we can't merge */ + if (!!a->h_line != !!b->h_line) + break; + /* If both are full, we can only merge if we cross. */ + if (a->full && b->full && b->v_crossed) + continue; + /* Otherwise we can't merge */ + break; + } + if (y == gd->cells->h-1) + { + /* Merge the column! */ +merge_column: +#ifdef DEBUG_TABLE_STRUCTURE + printf("Merging column %d\n", x); +#endif + merge_column(gd, x); +#ifdef DEBUG_TABLE_STRUCTURE + asciiart_table(gd); +#endif + } + } +} + +static void +merge_row(grid_walker_data *gd, int y) +{ + int x; + int w = gd->cells->w; + cell_t *d = &gd->cells->cell[y * w]; + for (x = 0; x < gd->cells->w-1; x++) + { + if (d->full == 0) + d->full = d[w].full; + if (d->v_crossed == 0) + d->v_crossed = d[w].v_crossed; + d++; + } + if (y < gd->cells->h - 2) + memcpy(d, d+w, sizeof(*d) * (gd->cells->h - 2 - y) * w); + gd->cells->h--; + + if (y < gd->ypos->len - 2) + memcpy(&gd->ypos->list[y+1], &gd->ypos->list[y+2], sizeof(gd->ypos->list[0]) * (gd->ypos->len - 2 - y)); + gd->ypos->len--; +} + +static void +merge_rows(grid_walker_data *gd) +{ + int x, y; + + for (y = gd->cells->h-3; y >= 0; y--) + { + /* Can row y be merged with row y+1? */ + /* An empty row can certainly be merged if the v_lines are the same, + * and there is no h_line. */ + for (x = 0; x < gd->cells->w-1; x++) + { + cell_t *a = get_cell(gd->cells, x, y); + cell_t *b = get_cell(gd->cells, x, y+1); + if (a->full || !!a->v_line != !!b->v_line || b->h_line) + break; + } + if (x == gd->cells->w-1) + goto merge_row; + /* An empty row can certainly be merged if the v_lines are the same. */ + for (x = 0; x < gd->cells->w-1; x++) + { + cell_t *a = get_cell(gd->cells, x, y); + cell_t *b = get_cell(gd->cells, x, y+1); + if (b->full || !!a->v_line != !!b->v_line || b->h_line) + break; + } + if (x == gd->cells->w-1) + goto merge_row; + /* We only ever want to merge rows if content crossed between them somewhere. + * Don't use uncertainty for this, because uncertainty doesn't allow for + * whitespace. */ + for (x = 0; x < gd->cells->w-1; x++) + if (get_cell(gd->cells, x, y+1)->h_crossed == 1) + break; + if (x == gd->cells->w-1) + continue; + /* This requires all the pairs of cells in those 2 rows to be mergeable. */ + for (x = 0; x < gd->cells->w-1; x++) + { + cell_t *a = get_cell(gd->cells, x, y); + cell_t *b = get_cell(gd->cells, x, y+1); + /* If there is a divider, we can't merge. */ + if (b->h_line) + goto cant_merge; + /* If either is empty, we can merge. */ + if (!a->full || !b->full) + continue; + /* If we differ in v linedness, we can't merge */ + if (!!a->v_line != !!b->v_line) + goto cant_merge; + /* If both are full, we can only merge if we cross. */ + if (a->full && b->full && b->h_crossed) + continue; + /* Otherwise we can't merge */ + break; + } + if (x == gd->cells->w-1) + { + /* Merge the row! */ +merge_row: +#ifdef DEBUG_TABLE_STRUCTURE + printf("Merging row %d\n", y); +#endif + merge_row(gd, y); +#ifdef DEBUG_TABLE_STRUCTURE + asciiart_table(gd); +#endif + continue; + } + + /* OK, so we failed to be able to merge y and y+1. But if we get here, we + * know this wasn't because of any lines being in the way. So we can try + * a different set of criteria. If every non-empty cell in line y+1 is either + * into from line y, or crosses into line y+2, then it's probably a 'straddling' + * line. Just remove it. */ + if (y+2 >= gd->cells->h) + continue; + for (x = 0; x < gd->cells->w-1; x++) + { + cell_t *b = get_cell(gd->cells, x, y+1); + cell_t *c = get_cell(gd->cells, x, y+2); + if (!b->full) + continue; + if (!b->h_crossed && !c->h_crossed) + break; + } + if (x == gd->cells->w-1) + { + /* Merge the row! */ +#ifdef DEBUG_TABLE_STRUCTURE + printf("Merging row %d (case 2)\n", y); +#endif + merge_row(gd, y); +#ifdef DEBUG_TABLE_STRUCTURE + asciiart_table(gd); +#endif + } + + +cant_merge: + {} + } +} + +static int +remove_bordered_empty_cells(grid_walker_data *gd) +{ + int x, y, x1, y1, x2, y2; + int changed = 0; + + /* We are looking for a region of cells that's bordered, + * and empty, and where the borders don't extend... + * + * ' ' + * ' ' + * . . .'____'. . . + * | | + * | | + * . . .|____|. . . + * ' ' + * ' ' + * ' ' + */ + + for (y = 0; y < gd->cells->h-1; y++) + { + for (x = 0; x < gd->cells->w-1; x++) + { + cell_t *u = y > 0 ? get_cell(gd->cells, x, y-1) : NULL; + cell_t *l = x > 0 ? get_cell(gd->cells, x-1, y) : NULL; + cell_t *c = get_cell(gd->cells, x, y); + + if (c->full) + continue; + if (!c->h_line || !c->v_line) + continue; + if (l != NULL && l->h_line) + continue; + if (u != NULL && u->v_line) + continue; + /* So c (x, y) is a reasonable top left of the bounded region. */ + for (x1 = x+1; x1 < gd->cells->w; x1++) + { + u = y > 0 ? get_cell(gd->cells, x1, y-1) : NULL; + c = get_cell(gd->cells, x1, y); + + if (u != NULL && u->v_line) + goto bad_region; + if (c->h_line && !c->v_line) + continue; + if (c->h_line || !c->v_line) + goto bad_region; + break; + } + if (x1 == gd->cells->w) + goto bad_region; + /* If we get here, then we have the top row of a bounded region + * that runs from (x,y) to (x1-1, y). So, can we extend that region + * downwards? */ + for (y1 = y+1; y1 < gd->cells->h; y1++) + { + c = get_cell(gd->cells, x, y1); + + if (!c->h_line && c->v_line) + { + /* This could be the start of a line. */ + for (x2 = x+1; x2 < x1; x2++) + { + c = get_cell(gd->cells, x2, y1); + if (c->full || c->h_line || c->v_line) + goto bad_region; + } + c = get_cell(gd->cells, x1, y1); + if (c->h_line || !c->v_line) + goto bad_region; + /* That line was fine! Region is extended. */ + } + else if (c->h_line && !c->v_line) + { + /* This might be the bottom line of the bounded region. */ + for (x2 = x+1; x2 < x1; x2++) + { + c = get_cell(gd->cells, x2, y1); + if (!c->h_line || c->v_line) + goto bad_region; + } + c = get_cell(gd->cells, x1, y1); + if (c->h_line || c->v_line) + goto bad_region; + break; + } + else + goto bad_region; + } + if (y1 == gd->cells->h) + goto bad_region; + /* So we have a bounded region from x,y to x1-1,y1-1 */ + for (y2 = y; y2 < y1; y2++) + { + for (x2 = x; x2 < x1; x2++) + { + c = get_cell(gd->cells, x2, y2); + c->h_line = 0; + c->v_line = 0; + c->full = 1; + if (x2 > x) + c->v_crossed = 1; + if (y2 > y) + c->h_crossed = 1; + } + c = get_cell(gd->cells, x2, y2); + c->v_line = 0; + } + for (x2 = x; x2 < x1; x2++) + get_cell(gd->cells, x2, y2)->h_line = 0; + changed = 1; +bad_region: + {} + } + } + +#ifdef DEBUG_TABLE_STRUCTURE + if (changed) + { + printf("Simplified empty bounded cells to give:\n"); + asciiart_table(gd); + } +#endif + + return changed; +} + +static int +find_table_within_bounds(fz_context *ctx, grid_walker_data *gd, fz_stext_block *content, fz_rect bounds) +{ + div_list xs = { 0 }; + div_list ys = { 0 }; + int failed = 1; + + fz_try(ctx) + { + walk_to_find_content(ctx, &xs, &ys, content, bounds); + + sanitize_positions(ctx, &xs); + sanitize_positions(ctx, &ys); + + /* Run across the line, counting 'winding' */ + /* If we don't have at least 2 rows and 2 columns, give up. */ + if (xs.len <= 2 || ys.len <= 2) + break; + + gd->xpos = make_table_positions(ctx, &xs, bounds.x0, bounds.x1); + gd->ypos = make_table_positions(ctx, &ys, bounds.y0, bounds.y1); + gd->cells = new_cells(ctx, gd->xpos->len, gd->ypos->len); + + /* Walk the content looking for grid lines. These + * lines refine our positions. */ + walk_grid_lines(ctx, gd, content); + /* Now, we walk the content looking for content that crosses + * these grid lines. This allows us to spot spanned cells. */ + if (calculate_spanned_content(ctx, gd, content)) + break; /* Unlikely to be a table. */ + +#ifdef DEBUG_TABLE_STRUCTURE + asciiart_table(gd); +#endif + /* Now, can we remove some columns or rows? i.e. have we oversegmented? */ + do + { + merge_columns(gd); + merge_rows(gd); + } + while (remove_bordered_empty_cells(gd)); + + /* Did we shrink the table so much it's not a table any more? */ + if (gd->xpos->len <= 2 || gd->ypos->len <= 2) + break; + + failed = 0; + } + fz_always(ctx) + { + fz_free(ctx, xs.list); + fz_free(ctx, ys.list); + } + fz_catch(ctx) + { + fz_rethrow(ctx); + } + + return failed; +} + +static int +do_table_hunt(fz_context *ctx, fz_stext_page *page, fz_stext_struct *parent) +{ + fz_stext_block *block; + int count; + fz_stext_block **first_block = parent ? &parent->first_block : &page->first_block; + int num_subtables = 0; + grid_walker_data gd = { 0 }; + + /* No content? Just bale. */ + if (*first_block == NULL) + return 0; + + /* If all the content here looks like a column of text, don't + * hunt for a table within it. */ + if (all_blocks_are_justified_or_headers(ctx, *first_block)) + return num_subtables; + + gd.bounds = fz_infinite_rect; + + /* First off, descend into any children to see if those look like tables. */ + count = 0; + for (block = *first_block; block != NULL; block = block->next) + { + if (block->type == FZ_STEXT_BLOCK_STRUCT) + { + if (block->u.s.down) + { + num_subtables += do_table_hunt(ctx, page, block->u.s.down); + count++; + } + } + else if (block->type == FZ_STEXT_BLOCK_TEXT) + count++; + } + + /* If we don't have at least a single child, no more to hunt. */ + /* We only need a single block, because a single text block can + * contain an entire unbordered table. */ + if (count < 1) + return num_subtables; + + /* We only look for a table at this level if we either at the top + * or on a div. This saves us looking for tables within an 'H' + * for example. */ + if (parent != NULL && parent->standard != FZ_STRUCTURE_DIV) + return num_subtables; + + + fz_var(gd); + + fz_try(ctx) + { + /* Now see whether the content looks like tables. */ + fz_rect bounds = walk_to_find_bounds(ctx, *first_block); + + if (find_table_within_bounds(ctx, &gd, *first_block, bounds)) + break; + + if (num_subtables > 0) + { + /* We are risking throwing away a table we've already found for this + * one. Only do it if this one is really convincing. */ + int x, y; + for (x = 0; x < gd.xpos->len; x++) + if (gd.xpos->list[x].uncertainty != 0) + break; + if (x != gd.xpos->len) + break; + for (y = 0; y < gd.ypos->len; y++) + if (gd.ypos->list[y].uncertainty != 0) + break; + if (y != gd.ypos->len) + break; + } + +#ifdef DEBUG_TABLE_STRUCTURE + printf("Transcribing table: (%g,%g)->(%g,%g)\n", + gd.xpos->list[0].pos, + gd.ypos->list[0].pos, + gd.xpos->list[gd.xpos->len-1].pos, + gd.ypos->list[gd.ypos->len-1].pos); +#endif + + /* Now we should have the entire table calculated. */ + (void)transcribe_table(ctx, &gd, page, parent); + num_subtables = 1; +#ifdef DEBUG_WRITE_AS_PS + { + int i; + printf("%% TABLE\n"); + for (i = 0; i < block->u.b.xs->len; i++) + { + if (block->u.b.xs->list[i].uncertainty) + printf("0 1 0 setrgbcolor\n"); + else + printf("0 0.5 0 setrgbcolor\n"); + printf("%g %g moveto %g %g lineto stroke\n", + block->u.b.xs->list[i].pos, block->bbox.y0, + block->u.b.xs->list[i].pos, block->bbox.y1); + } + for (i = 0; i < block->u.b.ys->len; i++) + { + if (block->u.b.ys->list[i].uncertainty) + printf("0 1 0 setrgbcolor\n"); + else + printf("0 0.5 0 setrgbcolor\n"); + printf("%g %g moveto %g %g lineto stroke\n", + block->bbox.x0, block->u.b.ys->list[i].pos, + block->bbox.x1, block->u.b.ys->list[i].pos); + } + } +#endif + } + fz_always(ctx) + { + fz_free(ctx, gd.xpos); + fz_free(ctx, gd.ypos); + fz_free(ctx, gd.cells); + } + fz_catch(ctx) + fz_rethrow(ctx); + + return num_subtables; +} + +void +fz_table_hunt(fz_context *ctx, fz_stext_page *page) +{ + if (page == NULL) + return; + + assert(verify_stext(ctx, page, NULL)); + + do_table_hunt(ctx, page, NULL); + + assert(verify_stext(ctx, page, NULL)); +} + +fz_stext_block * +fz_find_table_within_bounds(fz_context *ctx, fz_stext_page *page, fz_rect bounds) +{ + fz_stext_struct *table = NULL; + grid_walker_data gd = { 0 }; + + /* No content? Just bale. */ + if (page == NULL || page->first_block == NULL) + return NULL; + + fz_var(gd); + + fz_try(ctx) + { + gd.bounds = bounds; + if (find_table_within_bounds(ctx, &gd, page->first_block, bounds)) + break; + +#ifdef DEBUG_TABLE_STRUCTURE + printf("Transcribing table: (%g,%g)->(%g,%g)\n", + gd.xpos->list[0].pos, + gd.ypos->list[0].pos, + gd.xpos->list[gd.xpos->len-1].pos, + gd.ypos->list[gd.ypos->len-1].pos); +#endif + + /* Now we should have the entire table calculated. */ + table = transcribe_table(ctx, &gd, page, NULL); +#ifdef DEBUG_WRITE_AS_PS + { + int i; + printf("%% TABLE\n"); + for (i = 0; i < block->u.b.xs->len; i++) + { + if (block->u.b.xs->list[i].uncertainty) + printf("0 1 0 setrgbcolor\n"); + else + printf("0 0.5 0 setrgbcolor\n"); + printf("%g %g moveto %g %g lineto stroke\n", + block->u.b.xs->list[i].pos, block->bbox.y0, + block->u.b.xs->list[i].pos, block->bbox.y1); + } + for (i = 0; i < block->u.b.ys->len; i++) + { + if (block->u.b.ys->list[i].uncertainty) + printf("0 1 0 setrgbcolor\n"); + else + printf("0 0.5 0 setrgbcolor\n"); + printf("%g %g moveto %g %g lineto stroke\n", + block->bbox.x0, block->u.b.ys->list[i].pos, + block->bbox.x1, block->u.b.ys->list[i].pos); + } + } +#endif + } + fz_always(ctx) + { + fz_free(ctx, gd.xpos); + fz_free(ctx, gd.ypos); + fz_free(ctx, gd.cells); + } + fz_catch(ctx) + fz_rethrow(ctx); + + return table ? table->first_block : NULL; +}
