Mercurial > hgrepos > Python2 > PyMuPDF
view mupdf-source/source/fitz/stext-table.c @ 25:575f70dbc259
Do not expose local directory names in mupdf_location in _build.py.
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Fri, 19 Sep 2025 18:20:23 +0200 |
| parents | b50eed0cc0ef |
| children |
line wrap: on
line source
// Copyright (C) 2025 Artifex Software, Inc. // // This file is part of MuPDF. // // MuPDF is free software: you can redistribute it and/or modify it under the // terms of the GNU Affero General Public License as published by the Free // Software Foundation, either version 3 of the License, or (at your option) // any later version. // // MuPDF is distributed in the hope that it will be useful, but WITHOUT ANY // WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS // FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more // details. // // You should have received a copy of the GNU Affero General Public License // along with MuPDF. If not, see <https://www.gnu.org/licenses/agpl-3.0.en.html> // // Alternative licensing terms are available from the licensor. // For commercial licensing, see <https://www.artifex.com/> or contact // Artifex Software, Inc., 39 Mesa Street, Suite 108A, San Francisco, // CA 94129, USA, for further information. #include "mupdf/fitz.h" #include <assert.h> /* #define DEBUG_WRITE_AS_PS */ /* #define DEBUG_TABLE_STRUCTURE */ /* #define DEBUG_TABLE_HUNT */ /* * The algorithm. * * The goal of the algorithm is to identify tables on a page. * First we have to find the tables on a page, then we have to * figure out where the columns/rows are, and then how the * cells span them. * * We do this as a series of steps. * * To illustrate what's going on, let's use an example page * that we can follow through all the steps. * * +---------------------------+ * | | * | #### ## ### ## | <- Title * | | * | ##### ##### #### ## | \ * | ## ###### ###### ## | | * | #### ####### ###### | |- Abstract * | ####### #### ## ### | | * | ### ##### ###### | / * | | * | ######### ######### | 2 Columns of text * | ######### ######### | * | ######## ######### | * | ######### | * | +-------+ ####### | <- With an image on the left * | | | | * | | | ## ## # # | <- And a table on the right * | +-------+ ## ## # # | * | ## ## # # | * | ######### ## ## # # | * | ######### ## ## # # | * | ######### ## ## # # | * | | * +---------------------------+ * * * Step 1: Segmentation. * * First, we segment the page, trying to break it down into a * series of non-overlapping rectangles. We do this (in stext-boxer.c) * by looking for where the content isn't. If we can identify breaks * that run through the page (either from top to bottom or from left * to right), then we can split the page there, and recursively consider * the two halves of the page. * * It's not a perfect algorithm, but it manages to in many cases. * * After segmenting the above example, first we'll find the horizontal * splits, giving: * * +---------------------------+ * | | * | #### ## ### ## | * +---------------------------+ * | ##### ##### #### ## | * | ## ###### ###### ## | * | #### ####### ###### | * | ####### #### ## ### | * | ### ##### ###### | * +---------------------------+ * | ######### ######### | * | ######### ######### | * | ######## ######### | * | ######### | * | +-------+ ####### | * | | | | * | | | ## ## # # | * | +-------+ ## ## # # | * | ## ## # # | * | ######### ## ## # # | * | ######### ## ## # # | * | ######### ## ## # # | * | | * +---------------------------+ * * Then we'll recurse and find the vertical split between * the columns: * * +---------------------------+ * | | * | #### ## ### ## | * +---------------------------+ * | ##### ##### #### ## | * | ## ###### ###### ## | * | #### ####### ###### | * | ####### #### ## ### | * | ### ##### ###### | * +-------------+-------------+ * | ######### | ######### | * | ######### | ######### | * | ######## | ######### | * | | ######### | * | +-------+ | ####### | * | | | | | * | | | | ## ## # # | * | +-------+ | ## ## # # | * | | ## ## # # | * | ######### | ## ## # # | * | ######### | ## ## # # | * | ######### | ## ## # # | * | | | * +-------------+-------------+ * * Then we recurse again and find the horizontal splits * within the columns: * * +---------------------------+ * | | * | #### ## ### ## | * +---------------------------+ * | ##### ##### #### ## | * | ## ###### ###### ## | * | #### ####### ###### | * | ####### #### ## ### | * | ### ##### ###### | * +-------------+-------------+ * | ######### | ######### | * | ######### | ######### | * | ######## | ######### | * +-------------+ ######### | * | +-------+ | ####### | * | | | +-------------+ * | | | | ## ## # # | * | +-------+ | ## ## # # | * +-------------+ ## ## # # | * | ######### | ## ## # # | * | ######### | ## ## # # | * | ######### | ## ## # # | * | | | * +-------------+-------------+ * * We recurse a fixed maximum number of times (currently * 6, IIRC) or until we fail to find any suitable splits. * * This completes the page segmentation step. * * Step 2: Grid finding * * Next, we look at each of those segments and try to identify * where grids might be. * * Imagine the bottom right section of that page as * a board with lego blocks on where there is text. * Now imagine viewing that from the bottom of the page. * The gaps between the columns of the table are where you * can see through to the top between the blocks. * * Similarly, if you view it from the side, the gaps between the * rows of the page are where you can see through to the other * side. * * So, how do we code that? Well, we run through the page content * (obviously, restricted to the content that falls into this * segment of the page - that'll go without saying from here on * in). For each bit of content, we look at the "x extent" of that * content - for instance a given string might start at position * 10 and continue to position 100. We build a list of all these * start, and stop positions, and keep them in a sorted list. * * Then we walk this list from left to right, keeping a sum. I * call this sum "wind", because it's very similar to the winding * number that you get when doing scan conversion of bezier shapes. * * wind starts out as 0. We increment it whenever we pass a 'start' * position, and decrement it whenever we pass a 'stop' position. * So at any given x position along the line wind tells us the * number of pieces of content that overlap that x position. * So wind(left) = 0 = wind(right), and wind(x) >= x for all x. * * So, if we walk from left to right, the trace of wind might * look something like: * * __ * ___ / \ _ __ * / \ / \/ \ _/ \_ * / \___/ \___/ \ * * The left and right edges of the table are pretty clear. * The regions where wind drops to 0 represent the column dividers. * The left and right hand side of those regions gives us the min * and max values for that divider. * * We can then repeat this process for Y ranges instead of X ranges * to get the row dividers. * * BUT, this only works for pure grid tables. It falls down for * cases where we have merged cells (which is very common due to * titles etc). * * We can modify the algorithm slightly to allow for this. * * Consider the following table: * * +-----------------------------------+ * | Long Table title across the top | * +---------------+---------+---------+ * | Name | Result1 | Result2 | * +---------------+----+----+----+----+ * | Homer Simpson | 1 | 23 | 4 | 56 | * | Barney Gumble | 1 | 23 | 4 | 56 | * | Moe | 1 | 23 | 4 | 56 | * | Apu | 1 | 23 | 4 | 56 | * | Ned Flanders | 1 | 23 | 4 | 56 | * +---------------+----+----+----+----+ * * The wind trace for that looks something like * (with a certain degree of artistic license for the * limitations of ascii art): * * ________ * / \ _ __ _ _ * / \____/ \_/ \___/ \_/ \ * / \ * * So, the trace never quite drops back to zero in the * middle due to the spanning of the top title. * * So, instead of just looking for points where the trace * drops to zero, we instead look for local minima. Each local * minima represents a place where there might be a grid divider. * The value of wind at such points can be considered the * "uncertainty" with which there might be a divider there. * Clear dividers (with a wind value of 0) have no uncertainty. * Places where cells are spanned have a higher value of uncertainty. * * The output from this step is the list of possible grid positions * (X and Y), with uncertainty values. * * We have skipped over the handling of spaces in the above * description. On the one hand, we'd quite like to treat * sentences as long unbroken regions of content. But sometimes * we can get awkward content like: * * P subtelomeric 24.38 8.15 11.31 0.94 1.46 * 11.08 6.93 15.34 0.00 0.73 * Pericentromeric region; C band 20.42 9.26 13.64 0.81 0.81 * 16.50 7.03 14.45 0.17 5.15 * where the content is frequently sent with spaces instead of * small gaps (particular in tables of digits, because numerals * are often the same width even in proportional fonts). * * To cope with this, as we send potential edges into the start/stop * positions, we send 'weak' edges for the start and stops of runs * of spaces. We then post process the edges to remove any local * minima regions in the 'wind' values that are bounded purely by * 'weak' edges. * * Step 3: Cell analysis * * So, armed with the output from step 2, we can examine each grid * found. If we have W x-dividers and H y-dividers, we know we have * a potential table with (W-1) x (H-1) cells in it. * * We represent this as a W x H grid of cells, each like: * * . . * . . * . . .+-------+. . . Each cell holds information about the * | . edges above, and to the left of it. * | . * | . * . . .+. . . .+. . . * . . * . . * * h_line: Is there a horizontal divider drawn on the page that * corresponds to the top of this cell (i.e. is there a cell border * here?) * v_line: Is there a vertical divider drawn on the page that * corresponds to the left of this cell (i.e. is there a cell border * here?) * h_crossed: Does content cross this line (i.e. are we merged * with the cell above?) * v_crossed: Does content cross this line (i.e. are we merged * with the cell to the left?) * full: Is there any content in this cell at all? * * We need a W x H grid of cells to represent the entire table due * to the potential right and bottom edge lines. The right and * bottom rows of cells should never be full, or be crossed, but * it's easiest just to use a simple representation that copes with * the h_line and v_line values naturally. * * So, we start with the cells structure empty, and we run through * the page content, filling in the details as we go. * * At the end of the process, we have enough information to draw * an asciiart representation of our table. It might look something * like this (this comes from dotted-gridlines-tables.pdf): * * +-+-+-+-+-+-+-+-+-+-+-+-+-+ * | | | | | |#|#|#|#| | | | * + + + + + + +v+ +v+v+ + + + * | | | | | |#|#|#|#| | | | * + + + + + + + +v+ + + + + + * | |#| | |#|#|#|#|#|#|#|#| * + +v+ + + +v+v+ +v+v+v+v+v+ * | |#| |#|#|#|#|#|#|#|#|#| * + + + + +v+ + +v+ + + + + + * |#|#| #|#|#|#|#|#|#|#|#|#| * +v+v+ +v+ +v+v+ +v+v+v+v+v+ * |#|#| #|#|#|#|#|#|#|#|#|#| * + + + + +v+ + +v+ + + + + + * | |#| |#|#|#|#|#|#|#|#|#| * + +v+ + + +v+v+ +v+v+v+v+v+ * | |#| | |#|#|#|#|#|#|#|#| * + + + + + + + +v+ + + + + + * | | | | | |#|#|#|#| | | | * + + + + + + +v+ +v+v+ + + + * | | | | | |#|#|#|#| | | | * +-+-+-+-+-+-+-+-+-+-+-+-+-+ * | |# |#| | |#| | |#|#|#| * + +-+-+-+-+-+-+-+-+-+-+-+-+ * | |# |#| | |#| | |#|#|#| * + +-+-+-+-+-+-+-+-+-+-+-+-+ * | |#>#|#| | |#| | |#|#|#| * + +-+-+-+-+-+-+-+-+-+-+-+-+ * | |#>#|#| | |#| | |#|#|#| * + +-+-+-+-+-+-+-+-+-+-+-+-+ * | |#>#|#| |#| | | |#|#|#| * + +-+-+-+-+-+-+-+-+-+-+-+-+ * | |# |#| | |#| | |#|#|#| * + +-+-+-+-+-+-+-+-+-+-+-+-+ * | |# |#| | |#| | |#|#|#| * + +-+-+-+-+-+-+-+-+-+-+-+-+ * | |# |#| | |#| | |#|#|#| * + +-+-+-+-+-+-+-+-+-+-+-+-+ * | |# |#| | |#| | |#|#|#| * + +-+-+-+-+-+-+-+-+-+-+-+-+ * |# |#|#|#|#|#|#|#|#|#| * + +-+-+-+-+-+-+-+-+-+-+-+-+ * * This shows where lines are detected ( - and | ), * where they are crossed ( > and v) and where cells * are full ( # ). * * Step 4: Row and column merging. * * Based on the information above, we then try to merge * cells and columns to simplify the table. * * The best rules I've come up with this so far are: * We can merge two adjacent columns if all the pairs of * cells in the two columns are mergeable. * * Cells are held to be mergeable or not based upon the following * rules: * If there is a line between 2 cells - not mergeable. * else if the uncertainty between 2 cells is 0 - not mergeable. * else if the line between the 2 cells is crossed - mergeable. * else if strictly one of the cells is full - mergeable. * else not mergeable. * * So in the above example, column 2 (numbered from 0) can be merged * with column 3. * * This gives: * * +-+-+-+-+-+-+-+-+-+-+-+-+ * | | | | | |#|#|#|#| | | | * + + + + + +v+ +v+v+ + + + * | | | | | |#|#|#|#| | | | * + + + + + + +v+ + + + + + * | |#| | |#|#|#|#|#|#|#|#| * + +v+ + +v+v+ +v+v+v+v+v+ * | |#| |#|#|#|#|#|#|#|#|#| * + + + +v+ + +v+ + + + + + * |#|#|#|#|#|#|#|#|#|#|#|#| * +v+v+v+ +v+v+ +v+v+v+v+v+ * |#|#|#|#|#|#|#|#|#|#|#|#| * + + + +v+ + +v+ + + + + + * | |#| |#|#|#|#|#|#|#|#|#| * + +v+ + +v+v+ +v+v+v+v+v+ * | |#| | |#|#|#|#|#|#|#|#| * + + + + + + +v+ + + + + + * | | | | | |#|#|#|#| | | | * + + + + + +v+ +v+v+ + + + * | | | | | |#|#|#|#| | | | * +-+-+-+-+-+-+-+-+-+-+-+-+ * | |#|#| | |#| | |#|#|#| * + +-+-+-+-+-+-+-+-+-+-+-+ * | |#|#| | |#| | |#|#|#| * + +-+-+-+-+-+-+-+-+-+-+-+ * | |#|#| | |#| | |#|#|#| * + +-+-+-+-+-+-+-+-+-+-+-+ * | |#|#| | |#| | |#|#|#| * + +-+-+-+-+-+-+-+-+-+-+-+ * | |#|#| |#| | | |#|#|#| * + +-+-+-+-+-+-+-+-+-+-+-+ * | |#|#| | |#| | |#|#|#| * + +-+-+-+-+-+-+-+-+-+-+-+ * | |#|#| | |#| | |#|#|#| * + +-+-+-+-+-+-+-+-+-+-+-+ * | |#|#| | |#| | |#|#|#| * + +-+-+-+-+-+-+-+-+-+-+-+ * | |#|#| | |#| | |#|#|#| * + +-+-+-+-+-+-+-+-+-+-+-+ * |# |#|#|#|#|#|#|#|#|#| * + +-+-+-+-+-+-+-+-+-+-+-+ * * We then perform the same merging process for rows as for * columns - though there are no rows in the above example * that can be merged. * * You'll note that, for example, we don't merge row 0 and * row 1 in the above, because we have a pair of cells that * are both full without crossing. * * Step 5: Cell spanning * * Now we actually start to output the table. We keep a 'sent_table' * (a grid of W x H bools) to keep track of whether we've output * the content for a given cell or not yet. * * For each cell we reach, assuming sent_table[x,y] is false, * we merge it with as many cells on the right as required, * according to 'v_crossed' values (subject to not passing * v_lines or uncertainty == 0's). * * We then try to merge cells below according to 'h_crossed' * values (subject to not passing h_lines or uncertainty == 0's). * * In theory this can leave us with some cases where we'd like * to merge some cells (because of crossed) and can't (because * of lines or sent_table[]) values. In the absence of better * cell spanning algorithms we have no choice here. * * Then we output the contents and set sent_table[] values as * appropriate. * * If a row has no cells in it, we currently omit the TR. If/when * we figure out how to indicate rowspan/colspan in stext, we can * revisit that. */ static fz_stext_block * add_grid_block(fz_context *ctx, fz_stext_page *page, fz_stext_block **first_block, fz_stext_block **last_block) { fz_stext_block *block = fz_pool_alloc(ctx, page->pool, sizeof(**first_block)); memset(block, 0, sizeof(*block)); block->type = FZ_STEXT_BLOCK_GRID; block->bbox = fz_empty_rect; /* Fixes bug 703267. */ block->next = *first_block; if (*first_block) { (*first_block)->prev = block; assert(*last_block); } else { assert(*last_block == NULL); *last_block = block; } *first_block = block; return block; } static void insert_block_before(fz_stext_block *block, fz_stext_block *before, fz_stext_page *page, fz_stext_struct *dest) { if (before) { /* We have a block to insert it before, so we know it's not the last. */ assert(dest ? (dest->first_block != NULL && dest->last_block != NULL) : (page->first_block != NULL && page->last_block != NULL)); block->next = before; block->prev = before->prev; if (before->prev) { assert(before->prev->next == before); before->prev->next = block; } else if (dest) { assert(dest->first_block == before); dest->first_block = block; } else { assert(page->first_block == before); page->first_block = block; } before->prev = block; } else if (dest) { /* Will be the last block. */ block->next = NULL; block->prev = dest->last_block; if (dest->last_block) { assert(dest->last_block->next == NULL); dest->last_block->next = block; } if (dest->first_block == NULL) dest->first_block = block; dest->last_block = block; } else { /* Will be the last block. */ block->next = NULL; block->prev = page->last_block; if (page->last_block) { assert(page->last_block->next == NULL); page->last_block->next = block; } if (page->first_block == NULL) page->first_block = block; page->last_block = block; } } static fz_stext_struct * add_struct_block_before(fz_context *ctx, fz_stext_block *before, fz_stext_page *page, fz_stext_struct *parent, fz_structure std, const char *raw) { fz_stext_block *block; int idx = 0; size_t z; fz_stext_struct *newstruct; if (raw == NULL) raw = ""; z = strlen(raw); /* We're going to insert a struct block. We need an idx, so walk the list */ for (block = parent ? parent->first_block : page->first_block; block != before; block = block->next) { if (block->type == FZ_STEXT_BLOCK_STRUCT) { assert(block->u.s.index >= idx); idx = block->u.s.index + 1; } } /* So we'll add our block as idx. But all the other struct blocks that follow us need to have * larger values. */ /* Update all the subsequent structs to have a higher idx */ if (before) { int idx2 = idx+1; for (block = before->next; block != NULL; block = block->next) { if (block->type != FZ_STEXT_BLOCK_STRUCT) continue; if (block->u.s.index > idx2) break; block->u.s.index = idx2++; } } /* Now make our new struct block and insert it. */ block = fz_pool_alloc(ctx, page->pool, sizeof(*block)); block->type = FZ_STEXT_BLOCK_STRUCT; block->bbox = fz_empty_rect; /* Fixes bug 703267. */ insert_block_before(block, before, page, parent); block->u.s.down = newstruct = fz_pool_alloc(ctx, page->pool, sizeof(*block->u.s.down) + z); block->u.s.index = idx; newstruct->parent = parent; newstruct->standard = std; memcpy(newstruct->raw, raw, z); newstruct->raw[z] = 0; newstruct->up = block; return newstruct; } typedef struct { int len; int max; struct { uint8_t left; uint8_t weak; float pos; int freq; } *list; } div_list; static void div_list_push(fz_context *ctx, div_list *div, int left, int weak, float pos) { int i; /* FIXME: Could be bsearch. */ for (i = 0; i < div->len; i++) { if (div->list[i].pos > pos) break; else if (div->list[i].pos == pos && div->list[i].left == left) { div->list[i].freq++; div->list[i].weak &= weak; return; } } if (div->len == div->max) { int newmax = div->max * 2; if (newmax == 0) newmax = 32; div->list = fz_realloc(ctx, div->list, sizeof(div->list[0]) * newmax); div->max = newmax; } if (i < div->len) memmove(&div->list[i+1], &div->list[i], sizeof(div->list[0]) * (div->len - i)); div->len++; div->list[i].left = left; div->list[i].weak = weak; div->list[i].pos = pos; div->list[i].freq = 1; } static fz_stext_grid_positions * make_table_positions(fz_context *ctx, div_list *xs, float min, float max) { int wind; fz_stext_grid_positions *pos; int len = xs->len; int i; int hi = 0; /* Count the number of edges */ int local_min = 0; int edges = 2; if (len == 0) return NULL; assert(xs->list[0].left); for (i = 0; i < len; i++) { if (xs->list[i].pos >= min) break; } for (; i < len; i++) { if (xs->list[i].pos >= max) break; if (xs->list[i].left) { if (local_min) edges++; } else local_min = 1; } assert(!xs->list[len-1].left); pos = fz_malloc_flexible(ctx, fz_stext_grid_positions, list, edges); pos->len = edges; /* Copy the edges in */ wind = 0; local_min = 0; edges = 1; pos->list[0].pos = min; pos->list[0].min = min; pos->list[0].max = fz_max(xs->list[0].pos, min); pos->list[0].uncertainty = 0; pos->list[0].reinforcement = 0; #ifdef DEBUG_TABLE_HUNT printf("|%g ", pos->list[0].pos); #endif /* Skip over entries to the left of min. */ for (i = 0; i < len; i++) { if (xs->list[i].pos >= min) break; if (xs->list[i].left) wind += xs->list[i].freq; else wind -= xs->list[i].freq; } for (; i < len; i++) { if (xs->list[i].pos >= max) break; if (xs->list[i].left) { if (local_min) { pos->list[edges].min = xs->list[i-1].pos; pos->list[edges].max = xs->list[i].pos; pos->list[edges].pos = (xs->list[i-1].pos + xs->list[i].pos)/2; pos->list[edges].uncertainty = wind; #ifdef DEBUG_TABLE_HUNT if (wind) printf("?%g(%d) ", pos->list[edges].pos, wind); else printf("|%g ", pos->list[edges].pos); #endif edges++; } wind += xs->list[i].freq; if (wind > hi) hi = wind; } else { wind -= xs->list[i].freq; local_min = 1; } } assert(i < len || wind == 0); pos->list[edges].pos = max; pos->list[edges].min = fz_min(xs->list[i-1].pos, max); pos->list[edges].max = max; assert(max >= xs->list[i-1].pos); pos->list[edges].uncertainty = 0; pos->list[edges].reinforcement = 0; pos->max_uncertainty = hi; #ifdef DEBUG_TABLE_HUNT printf("|%g\n", pos->list[edges].pos); #endif return pos; } static fz_stext_grid_positions * copy_grid_positions_to_pool(fz_context *ctx, fz_stext_page *page, fz_stext_grid_positions *xs) { size_t z = offsetof(fz_stext_grid_positions, list) + sizeof(xs->list[0]) * (xs->len); fz_stext_grid_positions *xs2 = fz_pool_alloc(ctx, page->pool, z); memcpy(xs2, xs, z); return xs2; } static void sanitize_positions(fz_context *ctx, div_list *xs) { int i, j, wind, changed; #ifdef DEBUG_TABLE_HUNT printf("OK:\n"); for (i = 0; i < xs->len; i++) { if (xs->list[i].left) printf("["); printf("%g(%d%s)", xs->list[i].pos, xs->list[i].freq, xs->list[i].weak ? "weak" : ""); if (!xs->list[i].left) printf("]"); printf(" "); } printf("\n"); #endif if (xs->len == 0) return; do { /* Now, combine runs of left and right */ for (i = 0; i < xs->len; i++) { if (xs->list[i].left) { j = i; while (i < xs->len-1 && xs->list[i+1].left) { i++; xs->list[j].freq += xs->list[i].freq; xs->list[j].weak &= xs->list[i].weak; xs->list[i].freq = 0; } } else { while (i < xs->len-1 && !xs->list[i+1].left) { i++; xs->list[i].freq += xs->list[i-1].freq; xs->list[i].weak &= xs->list[i-1].weak; xs->list[i-1].freq = 0; } } } #ifdef DEBUG_TABLE_HUNT printf("Shrunk:\n"); for (i = 0; i < xs->len; i++) { if (xs->list[i].left) printf("["); printf("%g(%d%s)", xs->list[i].pos, xs->list[i].freq, xs->list[i].weak ? "weak" : ""); if (!xs->list[i].left) printf("]"); printf(" "); } printf("\n"); #endif /* Now remove the 0 frequency ones. */ j = 0; for (i = 0; i < xs->len; i++) { if (xs->list[i].freq == 0) continue; if (i != j) xs->list[j] = xs->list[i]; j++; } xs->len = j; /* Now run across looking for local minima where at least one * edge is 'weak'. If the wind at that point is non-zero, then * remove the weak edges from consideration and retry. */ wind = 0; changed = 0; i = 0; while (1) { assert(xs->list[i].left); for (; xs->list[i].left; i++) { wind += xs->list[i].freq; } assert(i < xs->len); for (; xs->list[i].left == 0 && i < xs->len; i++) { wind -= xs->list[i].freq; } if (i == xs->len) break; if (wind != 0 && (xs->list[i-1].weak || xs->list[i].weak)) { int m = fz_mini(xs->list[i-1].freq, xs->list[i].freq); assert(m > 0); xs->list[i-1].freq -= m; xs->list[i].freq -= m; changed = 1; } } } while (changed); #ifdef DEBUG_TABLE_HUNT printf("Compacted:\n"); for (i = 0; i < xs->len; i++) { if (xs->list[i].left) printf("["); printf("%g(%d%s)", xs->list[i].pos, xs->list[i].freq, xs->list[i].weak ? "weak" : ""); if (!xs->list[i].left) printf("]"); printf(" "); } printf("\n"); #endif } /* We want to check for whether a DIV that we are about to descend into * contains a column of justified text. We will accept some headers in * this text, but not JUST headers. */ static int all_blocks_are_justified_or_headers(fz_context *ctx, fz_stext_block *block) { int just_headers = 1; for (; block != NULL; block = block->next) { if (block->type == FZ_STEXT_BLOCK_STRUCT) { if (block->u.s.down == NULL) continue; if (block->u.s.down->standard == FZ_STRUCTURE_H || block->u.s.down->standard == FZ_STRUCTURE_H1 || block->u.s.down->standard == FZ_STRUCTURE_H2 || block->u.s.down->standard == FZ_STRUCTURE_H3 || block->u.s.down->standard == FZ_STRUCTURE_H4 || block->u.s.down->standard == FZ_STRUCTURE_H5 || block->u.s.down->standard == FZ_STRUCTURE_H6) continue; if (!all_blocks_are_justified_or_headers(ctx, block->u.s.down->first_block)) return 0; } just_headers = 0; if (block->type == FZ_STEXT_BLOCK_TEXT && block->u.t.flags != FZ_STEXT_TEXT_JUSTIFY_FULL) return 0; } if (just_headers) return 0; return 1; } #define TWO_INCHES (72*2) /* Walk through the blocks, finding the bbox. */ static fz_rect walk_to_find_bounds(fz_context *ctx, fz_stext_block *first_block) { fz_rect bounds = fz_empty_rect; fz_stext_block *block; fz_stext_line *line; fz_stext_char *ch; for (block = first_block; block != NULL; block = block->next) { switch (block->type) { case FZ_STEXT_BLOCK_STRUCT: if (!block->u.s.down) continue; if (block->u.s.down->standard == FZ_STRUCTURE_H) { if (block->next != NULL && block->next->type == FZ_STEXT_BLOCK_TEXT && block->next->u.t.flags == FZ_STEXT_TEXT_JUSTIFY_FULL) continue; } bounds = fz_union_rect(bounds, walk_to_find_bounds(ctx, block->u.s.down->first_block)); break; case FZ_STEXT_BLOCK_VECTOR: bounds = fz_union_rect(bounds, block->bbox); break; case FZ_STEXT_BLOCK_TEXT: if (block->u.t.flags == FZ_STEXT_TEXT_JUSTIFY_FULL && block->bbox.x1 - block->bbox.x0 >= TWO_INCHES) continue; for (line = block->u.t.first_line; line != NULL; line = line->next) { for (ch = line->first_char; ch != NULL; ch = ch->next) { if (ch->c != ' ') bounds = fz_union_rect(bounds, fz_rect_from_quad(ch->quad)); } } break; } } return bounds; } static void walk_to_find_content(fz_context *ctx, div_list *xs, div_list *ys, fz_stext_block *first_block, fz_rect bounds) { fz_stext_block *block; fz_stext_line *line; fz_stext_char *ch; for (block = first_block; block != NULL; block = block->next) { switch (block->type) { case FZ_STEXT_BLOCK_STRUCT: if (block->u.s.down && !fz_is_empty_rect(fz_intersect_rect(bounds, block->bbox))) walk_to_find_content(ctx, xs, ys, block->u.s.down->first_block, bounds); break; case FZ_STEXT_BLOCK_VECTOR: break; case FZ_STEXT_BLOCK_TEXT: { fz_rect justified_region = fz_empty_rect; for (line = block->u.t.first_line; line != NULL; line = line->next) { fz_rect region = fz_empty_rect; region.y0 = line->bbox.y0; region.y1 = line->bbox.y1; if (region.y0 < bounds.y0) region.y0 = bounds.y0; if (region.y1 > bounds.y1) region.y1 = bounds.y1; if (region.y0 >= region.y1) break; /* Skip leading spaces. */ for (ch = line->first_char; ch != NULL; ch = ch->next) if (ch->c != ' ') break; for (; ch != NULL; ch = ch->next) { if (ch->c == ' ') { /* Find the last space char in this run. */ fz_stext_char *last_space; for (last_space = ch; last_space->next != NULL && last_space->next->c == ' '; last_space = last_space->next); /* If we're not the last char in the line (i.e. we're not a trailing space, * then send a 'weak' gap for the spaces, assuming it's sane to do so). */ if (last_space->next != NULL) { float rpos = fz_min(ch->quad.ll.x, ch->quad.ul.x); float lpos = fz_min(last_space->next->quad.ll.x, last_space->next->quad.ll.x); /* Clamp these to the bounds */ rpos = fz_clamp(rpos, bounds.x0, bounds.x1); lpos = fz_clamp(lpos, bounds.x0, bounds.x1); /* So we have a region (rpos...lpos) to add. */ /* This can be positioned in various different ways relative to the * current region: * [region] * (rpos..lpos) OK * (rpos..lpos) OK, but adjust lpos * (rpos..lpos) OK, but adjust rpos * (rpos..lpos) OK * (rpos .. lpos) OK */ if (lpos >= region.x1) { if (rpos >= region.x0 && rpos < region.x1) rpos = region.x1; } else if (rpos <= region.x0) { if (lpos > region.x0) lpos = region.x0; } else rpos = lpos; /* Make it an invalid region */ if (rpos < lpos) { /* Send a weak right at the start of the spaces... */ div_list_push(ctx, xs, 0, 1, rpos); /* and a weak left at the end. */ div_list_push(ctx, xs, 1, 1, lpos); } /* Expand the region as required. */ if (rpos < region.x0) region.x0 = rpos; if (lpos > region.x1) region.x1 = lpos; } /* Jump over the spaces */ ch = last_space; } else { float lpos = fz_min(ch->quad.ll.x, ch->quad.ul.x); float rpos = fz_max(ch->quad.lr.x, ch->quad.ur.x); if (lpos < region.x0) region.x0 = lpos; if (rpos > region.x1) region.x1 = rpos; } } if (!fz_is_empty_rect(region)) { div_list_push(ctx, xs, 1, 0, region.x0); div_list_push(ctx, xs, 0, 0, region.x1); /* For justified regions, we don't break after each line, but * rather before/after the region as a whole. */ if (block->u.t.flags != FZ_STEXT_TEXT_JUSTIFY_FULL) { div_list_push(ctx, ys, 1, 0, region.y0); div_list_push(ctx, ys, 0, 0, region.y1); } else justified_region = fz_union_rect(justified_region, region); } } if (!fz_is_empty_rect(justified_region) && block->u.t.flags == FZ_STEXT_TEXT_JUSTIFY_FULL) { div_list_push(ctx, ys, 1, 0, justified_region.y0); div_list_push(ctx, ys, 0, 0, justified_region.y1); } break; } } } } /* One of our datastructures (cells_t) is an array of details about the * cells that make up our table. It's a w * h array of cell_t's. Each * cell contains data on one of the cells in the table, as you'd expect. * * . . * . . * - - +-------+ - - * | . * | . * | . * - - + - - - + - - * . . * . . * * For any given cell, we store details about the top (lowest y coord) * and left (lowest x coord) edges. Specifically we store whether * there is a line at this position (h_line and v_line) (i.e. a drawn * border), and we also store whether content crosses this edge (h_crossed * and y_crossed). Finally, we store whether the cell has any content * in it at all (full). * * A table which has w positions across and h positions vertically, will * only really have (w-1) * (h-1) cells. We store w*h though to allow for * the right and bottom edges to have their lines represented. */ typedef struct { int h_line; int v_line; int h_crossed; int v_crossed; int full; } cell_t; typedef struct { int w; int h; cell_t cell[FZ_FLEXIBLE_ARRAY]; } cells_t; typedef struct { cells_t *cells; fz_stext_grid_positions *xpos; fz_stext_grid_positions *ypos; fz_rect bounds; } grid_walker_data; static cell_t * get_cell(cells_t *cells, int x, int y) { return &cells->cell[x + y * cells->w]; } #ifdef DEBUG_TABLE_STRUCTURE static void asciiart_table(grid_walker_data *gd); #endif static fz_stext_grid_positions * split_grid_pos(fz_context *ctx, grid_walker_data *gd, int row, int i, int early) { fz_stext_grid_positions **posp = row ? &gd->ypos : &gd->xpos; fz_stext_grid_positions *pos = *posp; int n = pos->len; int x, y, w, h; cells_t *cells; /* Realloc the required structs */ *posp = pos = fz_realloc_flexible(ctx, pos, fz_stext_grid_positions, list, n+1); cells = gd->cells = fz_realloc_flexible(ctx, gd->cells, cells_t, cell, (gd->cells->w + (1-row)) * (gd->cells->h + row)); /* If both pass, then we're safe to shuffle the data. */ #ifdef DEBUG_TABLE_STRUCTURE printf("Before split %s %d\n", row ? "row" : "col", i); asciiart_table(gd); #endif assert(i >= 0 && i < n); memmove(&pos->list[i+1], &pos->list[i], sizeof(pos->list[0]) * (n-i)); pos->len++; /* Next expand the cells. Only h_line and v_line are filled in so far. */ w = cells->w; h = cells->h; if (early && i > 0) i--, early = 0; if (row) { /* Add a row */ cells->h = h+1; /* Expand the table, duplicating row i */ memmove(&cells->cell[(i+1)*w], &cells->cell[i*w], sizeof(cells->cell[0])*(h-i)*w); if (early) { /* We are splitting row 0 into 0 and 1, with 0 being the new one. */ for (x = 0; x < w; x++) { cells->cell[x].h_line = 0; cells->cell[x].v_line = 0; } } else { /* We are splitting row i into i and i+1, with i+1 being the new one. */ /* v_lines are carried over. h_lines need to be unset. */ for (x = 0; x < w; x++) cells->cell[x + (i+1)*w].h_line = 0; } } else { /* Add a column */ cells->w = w+1; /* Expand the table, duplicating column i */ for (y = h-1; y >= 0; y--) { for (x = w; x > i; x--) cells->cell[x + y*(w+1)] = cells->cell[x-1 + y*w]; for (; x >= 0; x--) cells->cell[x + y*(w+1)] = cells->cell[x + y*w]; } if (early) { /* We are splitting col 0 into 0 and 1, with 0 being the new one. */ for (y = 0; y < h; y++) { cells->cell[y*(w+1)].h_line = 0; cells->cell[y*(w+1)].v_line = 0; } } else { /* h_lines are carried over. v_lines need to be reset */ for (y = 0; y < h; y++) cells->cell[i+1 + y*(w+1)].v_line = 0; } } #ifdef DEBUG_TABLE_STRUCTURE printf("After split\n"); asciiart_table(gd); #endif return pos; } /* This routine finds (and reinforces) grid positions for lines. * * If we have a thin line from (x0, y0) to (x1, y0), then we are * pretty sure that y0 will be on the edge of a cell. We are less * sure that x0 and x1 will match up to the edge of a cell. * Stylistically some tables overrun or underrun such lines. * * Similarly from (x0, y0) to (x0, y1), we expect x0 to be accurate * but y0 and y1 less so. * * If we have a wider rectangle, from (x0, y0) to (x1, y1) then * we fully expect all sides to be accurate. */ static int find_grid_pos(fz_context *ctx, grid_walker_data *gd, int row, float x, int inaccurate) { const int WIGGLE_ROOM = 2; int i; fz_stext_grid_positions *pos = row ? gd->ypos : gd->xpos; assert(x >= pos->list[0].min && x <= pos->list[pos->len-1].max); #ifdef DEBUG_TABLE_STRUCTURE printf("Looking for %g in %s splits:\n", x, row ? "row" : "col"); for (i = 0; i < pos->len; i++) { printf("%d\t%g\t%g\t%g\t%d\n", i, pos->list[i].min, pos->list[i].pos, pos->list[i].max, pos->list[i].reinforcement); } #endif while (inaccurate) /* So we can break out */ { float prev = 0; /* If we start/finish outside the range of the table, then we * want to extend the table. So ignore 'inaccurate' in this * case. Match the logic below. */ if (x < pos->list[0].min) break; if (x < pos->list[0].pos - WIGGLE_ROOM && pos->list[0].reinforcement > 0) break; if (x > pos->list[pos->len-1].max) break; if (x > pos->list[pos->len-1].pos + WIGGLE_ROOM && pos->list[pos->len-1].reinforcement > 0) break; /* Just find the closest one. No reinforcement. */ for (i = 0; i < pos->len; i++) { if (x < pos->list[i].min) { float mid = (prev + pos->list[i].min)/2; if (x < mid) return i-1; return i; } prev = pos->list[i].max; if (x <= prev) return i; } assert("Never happens" == NULL); return -1; } for (i = 0; i < pos->len; i++) { if (x < pos->list[i].min) { /* Split i into i and i+1, and make i the new one. */ assert(i > 0); #ifdef DEBUG_TABLE_STRUCTURE printf("Splitting before %d\n", i); #endif pos = split_grid_pos(ctx, gd, row, i, 1); pos->list[i-1].max = pos->list[i].min = (pos->list[i-1].max + x)/2; pos->list[i].pos = x; pos->list[i].max = pos->list[i+1].min = (pos->list[i+1].pos + x)/2; pos->list[i].reinforcement = 1; return i; } else if (x <= pos->list[i].max) { /* We are in the range for the ith divider. */ if (pos->list[i].reinforcement == 0) { /* If we've not been reinforced before, reinforce now. */ pos->list[i].pos = x; pos->list[i].reinforcement = 1; return i; } /* We've been reinforced before. This ought to be a pretty good * indication. */ if (pos->list[i].pos - WIGGLE_ROOM < x && x < pos->list[i].pos + WIGGLE_ROOM) { /* We are a close match to the previously predicted pos * value. */ pos->list[i].pos = pos->list[i].pos * pos->list[i].reinforcement + x; pos->list[i].pos /= ++pos->list[i].reinforcement; return i; } /* We need to split i into i and i+1. */ pos = split_grid_pos(ctx, gd, row, i, pos->list[i].pos > x); if (pos->list[i].pos > x) { /* Make i the new one */ #ifdef DEBUG_TABLE_STRUCTURE printf("Splitting %d (early)\n", i); #endif pos->list[i].pos = x; pos->list[i].max = pos->list[i+1].min = (pos->list[i+1].pos + x)/2; pos->list[i].reinforcement = 1; return i; } else { /* Make i+1 the new one */ #ifdef DEBUG_TABLE_STRUCTURE printf("Splitting %d (late)\n", i); #endif pos->list[i+1].pos = x; pos->list[i].max = pos->list[i+1].min = (pos->list[i].pos + x)/2; pos->list[i].reinforcement = 1; return i+1; } } } assert("Never happens" == NULL); return -1; } static int find_cell_l(fz_stext_grid_positions *pos, float x) { int i; for (i = 0; i < pos->len; i++) if (x < pos->list[i].pos) return i-1; return -1; } static int find_cell_r(fz_stext_grid_positions *pos, float x) { int i; for (i = 0; i < pos->len; i++) if (x <= pos->list[i].pos) return i-1; return -1; } /* Add a horizontal line. Return 1 if the line doesn't seem to be a border line. * Record which cells that was a border for. */ static void add_h_line(fz_context *ctx, grid_walker_data *gd, float x0, float x1, float y0, float y1) { int start = find_grid_pos(ctx, gd, 0, x0, 1); int end = find_grid_pos(ctx, gd, 0, x1, 1); float y = (y0 + y1) / 2; int yidx = find_grid_pos(ctx, gd, 1, y, 0); int i; assert(start > 0 && end > 0); for (i = start; i < end; i++) get_cell(gd->cells, i, yidx)->h_line++; } /* Add a vertical line. Return 1 if the line doesn't seem to be a border line. * Record which cells that was a border for. */ static void add_v_line(fz_context *ctx, grid_walker_data *gd, float y0, float y1, float x0, float x1) { int start = find_grid_pos(ctx, gd, 1, y0, 1); int end = find_grid_pos(ctx, gd, 1, y1, 1); float x = (x0 + x1) / 2; int xidx = find_grid_pos(ctx, gd, 0, x, 0); int i; for (i = start; i < end; i++) get_cell(gd->cells, xidx, i)->v_line++; } static void add_hv_line(fz_context *ctx, grid_walker_data *gd, float x0, float x1, float y0, float y1, int stroked) { int ix0 = find_grid_pos(ctx, gd, 0, x0, 0); int ix1 = find_grid_pos(ctx, gd, 0, x1, 0); int iy0 = find_grid_pos(ctx, gd, 1, y0, 0); int iy1 = find_grid_pos(ctx, gd, 1, y1, 0); int i; if (stroked) { for (i = ix0; i < ix1; i++) { get_cell(gd->cells, i, iy0)->h_line++; get_cell(gd->cells, i, iy1)->h_line++; } for (i = iy0; i < iy1; i++) { get_cell(gd->cells, ix0, i)->v_line++; get_cell(gd->cells, ix1, i)->v_line++; } } } /* Shared internal routine with stext-boxer.c */ fz_rect fz_collate_small_vector_run(fz_stext_block **blockp); static void walk_grid_lines(fz_context *ctx, grid_walker_data *gd, fz_stext_block *block) { for (; block != NULL; block = block->next) { if (block->type == FZ_STEXT_BLOCK_STRUCT) { if (block->u.s.down) walk_grid_lines(ctx, gd, block->u.s.down->first_block); continue; } else if (block->type == FZ_STEXT_BLOCK_VECTOR) { fz_rect r; float w, h; /* Only process rectangle blocks. */ if ((block->u.v.flags & FZ_STEXT_VECTOR_IS_RECTANGLE) == 0) continue; r = fz_collate_small_vector_run(&block); r = fz_intersect_rect(r, gd->bounds); if (!fz_is_valid_rect(r)) continue; w = r.x1 - r.x0; h = r.y1 - r.y0; if (w > h && h < 2) { /* Thin, wide line */ (void) add_h_line(ctx, gd, r.x0, r.x1, r.y0, r.y1); } else if (w < h && w < 2) { /* Thin, wide line */ (void) add_v_line(ctx, gd, r.y0, r.y1, r.x0, r.x1); } else { /* Rectangle */ (void) add_hv_line(ctx, gd, r.x0, r.x1, r.y0, r.y1, block->u.v.flags & FZ_STEXT_VECTOR_IS_STROKED); } } } } static int is_numeric(int c) { return (c >= '0' && c <= '9'); } static int mark_cells_for_content(fz_context *ctx, grid_walker_data *gd, fz_rect s) { fz_rect r = fz_intersect_rect(gd->bounds, s); int x0, x1, y0, y1, x, y; if (fz_is_empty_rect(r)) return 0; x0 = find_cell_l(gd->xpos, r.x0); x1 = find_cell_r(gd->xpos, r.x1); y0 = find_cell_l(gd->ypos, r.y0); y1 = find_cell_r(gd->ypos, r.y1); if (x0 < 0 || x1 < 0 || y0 < 0 || y1 < 0) return 1; if (x0 < x1) { for (y = y0; y <= y1; y++) for (x = x0; x < x1; x++) get_cell(gd->cells, x+1, y)->v_crossed++; } if (y0 < y1) { for (y = y0; y < y1; y++) for (x = x0; x <= x1; x++) get_cell(gd->cells, x, y+1)->h_crossed++; } for (y = y0; y <= y1; y++) for (x = x0; x <= x1; x++) get_cell(gd->cells, x, y)->full++; return 0; } #define IN_CELL 0 #define IN_BORDER 1 #define IN_UNKNOWN 2 #define SPLIT_MARGIN 4 static int where_is(fz_stext_grid_positions *pos, float x, int *in) { int i; *in = IN_UNKNOWN; /* If the table is empty, nothing to do. */ if (pos->len == 0) return -1; /* If we are completely outside the table, give up. */ if (x <= pos->list[0].pos - SPLIT_MARGIN || x >= pos->list[pos->len-1].max + SPLIT_MARGIN) return -1; for (i = 0; i < pos->len; i++) { /* Calculate the region (below..above) that counts as being * on the border of position i. */ float prev = i > 0 ? pos->list[i-1].max : pos->list[0].min; float next = i < pos->len-1 ? pos->list[i+1].min : pos->list[i].max; float below = pos->list[i].pos - SPLIT_MARGIN; float above = pos->list[i].pos + SPLIT_MARGIN; /* Find the distance half way back to the previous pos as * a limit to our margin. */ prev = (prev + pos->list[i].pos)/2; next = (next + pos->list[i].pos)/2; if (below < prev) below = prev; if (above > next) above = next; if (x < below) { *in = IN_CELL; return i-1; } else if (x <= above) { *in = IN_BORDER; return i; } } *in = IN_BORDER; return i-1; } enum { VECTOR_IS_CONTENT = 0, VECTOR_IS_BORDER = 1, VECTOR_IS_UNKNOWN = 2, VECTOR_IS_IGNORABLE = 3 }; /* So a vector can either be a border, or contained * in some cells, or something completely else. */ static int classify_vector(fz_context *ctx, grid_walker_data *gd, fz_rect r, int is_rect) { int at_x0, at_x1, at_y0, at_y1; int ix0, ix1, iy0, iy1; r = fz_intersect_rect(r, gd->bounds); if (fz_is_empty_rect(r)) return VECTOR_IS_IGNORABLE; ix0 = where_is(gd->xpos, r.x0, &at_x0); ix1 = where_is(gd->xpos, r.x1, &at_x1); iy0 = where_is(gd->ypos, r.y0, &at_y0); iy1 = where_is(gd->ypos, r.y1, &at_y1); /* No idea, just treat it as a border. */ if (at_x0 == IN_UNKNOWN || at_x1 == IN_UNKNOWN || at_y0 == IN_UNKNOWN || at_y1 == IN_UNKNOWN) return VECTOR_IS_IGNORABLE; if (at_x0 == IN_BORDER && at_x1 == IN_BORDER) { /* Vector is aligned along sides of cells. */ return is_rect ? VECTOR_IS_BORDER : VECTOR_IS_CONTENT; } if (at_y0 == IN_BORDER && at_y1 == IN_BORDER) { /* Vector is aligned along sides of cells. */ return is_rect ? VECTOR_IS_BORDER : VECTOR_IS_CONTENT; } if (at_x0 == IN_CELL && at_x1 == IN_CELL) { /* Content within a cell (or 1d range of cells). */ return VECTOR_IS_CONTENT; } if (at_y0 == IN_CELL && at_y1 == IN_CELL) { /* Content within a cell (or 1d range of cells). */ return VECTOR_IS_CONTENT; } if (at_x0 == IN_BORDER && at_x1 == IN_CELL && ix0 == ix1) { return is_rect ? VECTOR_IS_BORDER : VECTOR_IS_CONTENT; } if (at_x0 == IN_CELL && at_x1 == IN_BORDER && ix0+1 == ix1) { return is_rect ? VECTOR_IS_BORDER : VECTOR_IS_CONTENT; } if (at_y0 == IN_BORDER && at_y1 == IN_CELL && iy0 == iy1) { return is_rect ? VECTOR_IS_BORDER : VECTOR_IS_CONTENT; } if (at_y0 == IN_CELL && at_y1 == IN_BORDER && iy0+1 == iy1) { return is_rect ? VECTOR_IS_BORDER : VECTOR_IS_CONTENT; } if (is_rect) { return VECTOR_IS_IGNORABLE; } /* unknown - take this as indication that this maybe isn't * table. */ return VECTOR_IS_UNKNOWN; } #undef IN_CELL #undef IN_BORDER #undef IN_UNKNOWN /* Walk through the content, looking at how it spans our grid. * Record gridlines, which cells have content that cross into * neighbours, and which cells have any content at all. * Return a count of vector graphics that are found that don't * look plausibly like cell contents. */ static int calculate_spanned_content(fz_context *ctx, grid_walker_data *gd, fz_stext_block *block) { int duff = 0; fz_rect bounds = { gd->xpos->list[0].pos, gd->ypos->list[0].pos, gd->xpos->list[gd->xpos->len-1].pos, gd->ypos->list[gd->ypos->len-1].pos }; for (; block != NULL; block = block->next) { if (block->type == FZ_STEXT_BLOCK_STRUCT) { if (block->u.s.down) duff += calculate_spanned_content(ctx, gd, block->u.s.down->first_block); continue; } else if (block->type == FZ_STEXT_BLOCK_VECTOR) { switch (classify_vector(ctx, gd, block->bbox, !!(block->u.v.flags & FZ_STEXT_VECTOR_IS_RECTANGLE))) { case VECTOR_IS_CONTENT: mark_cells_for_content(ctx, gd, block->bbox); break; case VECTOR_IS_BORDER: case VECTOR_IS_IGNORABLE: break; default: duff++; break; } } else if (block->type == FZ_STEXT_BLOCK_TEXT) { fz_stext_line *line; if (block->bbox.x0 >= bounds.x1 || block->bbox.y0 >= bounds.y1 || block->bbox.x1 <= bounds.x0 || block->bbox.y1 <= bounds.y0) continue; for (line = block->u.t.first_line; line != NULL; line = line->next) { fz_stext_char *ch = line->first_char; int was_numeric = 0; /* Skip leading spaces */ while (ch != NULL && ch->c == ' ') ch = ch->next; for (; ch != NULL; ch = ch->next) { if (ch->c == 32) { /* Trailing space, skip it. */ if (ch->next == NULL) break; if (ch->next->c == 32) { /* Run of spaces. Skip 'em. */ while (ch->next && ch->next->c == 32) ch = ch->next; was_numeric = 0; continue; } if (was_numeric || is_numeric(ch->next->c)) { /* Single spaces around numbers are ignored. */ was_numeric = 0; continue; } /* A single space. Accept it. */ was_numeric = 0; } else was_numeric = is_numeric(ch->c); duff += mark_cells_for_content(ctx, gd, fz_rect_from_quad(ch->quad)); } } } } return duff; } static cells_t *new_cells(fz_context *ctx, int w, int h) { cells_t *cells = fz_malloc_flexible(ctx, cells_t, cell, w * h); cells->w = w; cells->h = h; return cells; } #ifdef DEBUG_TABLE_STRUCTURE static void asciiart_table(grid_walker_data *gd) { int w = gd->xpos->len; int h = gd->ypos->len; int x, y; for (y = 0; y < h; y++) { for (x = 0; x < w-1; x++) { cell_t *cell = get_cell(gd->cells, x, y); int line = cell->h_line; int erase = cell->h_crossed; printf("+"); if (line && !erase) { printf("-"); } else if (!line && erase) { printf("v"); } else if (line && erase) { printf("*"); } else { printf(" "); } } printf("+\n"); if (y == h-1) break; for (x = 0; x < w; x++) { cell_t *cell = get_cell(gd->cells, x, y); int line = cell->v_line; int erase = cell->v_crossed; if (line && !erase) { printf("|"); } else if (!line && erase) { printf(">"); } else if (line && erase) { printf("*"); } else { printf(" "); } if (x < w-1) { if (cell->full) printf("#"); else printf(" "); } else printf("\n"); } } } #endif static void recalc_bbox(fz_stext_block *block) { fz_rect bbox = fz_empty_rect; fz_stext_line *line; for (line = block->u.t.first_line; line != NULL; line = line->next) bbox = fz_union_rect(bbox, line->bbox); block->bbox = bbox; } static void unlink_line_from_block(fz_stext_line *line, fz_stext_block *block) { fz_stext_line *next_line = line->next; if (line->prev) { assert(line->prev->next == line); line->prev->next = next_line; } else { assert(block->u.t.first_line == line); block->u.t.first_line = next_line; } if (next_line) { assert(next_line->prev == line); next_line->prev = line->prev; } else { assert(block->u.t.last_line == line); block->u.t.last_line = line->prev; } } static void append_line_to_block(fz_stext_line *line, fz_stext_block *block) { if (block->u.t.last_line == NULL) { assert(block->u.t.first_line == NULL); block->u.t.first_line = block->u.t.last_line = line; line->prev = NULL; } else { assert(block->u.t.last_line->next == NULL); line->prev = block->u.t.last_line; block->u.t.last_line->next = line; block->u.t.last_line = line; } line->next = NULL; } static void unlink_block(fz_stext_block *block, fz_stext_block **first, fz_stext_block **last) { if (block->prev) { assert(block->prev->next == block); block->prev->next = block->next; } else { assert(*first == block); *first = block->next; } if (block->next) { assert(block->next->prev == block); block->next->prev = block->prev; } else { assert(*last == block); *last = block->prev; } } #ifndef NDEBUG static int verify_stext(fz_context *ctx, fz_stext_page *page, fz_stext_struct *src) { fz_stext_block *block; fz_stext_block **first = src ? &src->first_block : &page->first_block; fz_stext_block **last = src ? &src->last_block : &page->last_block; int max = 0; assert((*first == NULL) == (*last == NULL)); for (block = *first; block != NULL; block = block->next) { fz_stext_line *line; if (block->prev == NULL) assert(*first == block); else assert(block->prev->next == block); if (block->next == NULL) assert(*last == block); else assert(block->next->prev == block); if (block->type == FZ_STEXT_BLOCK_STRUCT) { if (block->u.s.down) { int m = verify_stext(ctx, page, block->u.s.down); if (m > max) max = m; } continue; } if (block->type != FZ_STEXT_BLOCK_TEXT) continue; assert((block->u.t.first_line == NULL) == (block->u.t.last_line == NULL)); for (line = block->u.t.first_line; line != NULL; line = line->next) { fz_stext_char *ch; if (line->next == NULL) assert(block->u.t.last_line == line); else assert(line->next->prev == line); assert((line->first_char == NULL) == (line->last_char == NULL)); for (ch = line->first_char; ch != NULL; ch = ch->next) assert(ch->next != NULL || line->last_char == ch); } } return max+1; } #endif static fz_rect move_contained_content(fz_context *ctx, fz_stext_page *page, fz_stext_struct *dest, fz_stext_struct *src, fz_rect r) { fz_stext_block *before = dest ? dest->first_block : page->first_block; fz_stext_block **sfirst = src ? &src->first_block : &page->first_block; fz_stext_block **slast = src ? &src->last_block : &page->last_block; fz_stext_block *block, *next_block; for (block = *sfirst; block != NULL; block = next_block) { fz_rect bbox = fz_intersect_rect(block->bbox, r); next_block = block->next; /* Don't use fz_is_empty_rect here, as that will exclude zero height areas like spaces. */ if (bbox.x0 > bbox.x1 || bbox.y0 > bbox.y1) continue; /* Trivially excluded */ if (bbox.x0 == block->bbox.x0 && bbox.y0 == block->bbox.y0 && bbox.x1 == block->bbox.x1 && bbox.y1 == block->bbox.y1) { /* Trivially included */ if (block->type == FZ_STEXT_BLOCK_STRUCT && block->u.s.down) { if (block->u.s.down->standard == FZ_STRUCTURE_TD) { /* Don't copy TDs! Just copy the contents */ move_contained_content(ctx, page, dest, block->u.s.down, r); continue; } } unlink_block(block, sfirst, slast); insert_block_before(block, before, page, dest); assert(before == block->next); continue; } if (block->type == FZ_STEXT_BLOCK_STRUCT) { if (block->u.s.down) move_contained_content(ctx, page, dest, block->u.s.down, r); } if (block->type == FZ_STEXT_BLOCK_TEXT) { /* Partially included text block */ fz_stext_line *line, *next_line; fz_stext_block *newblock = NULL; for (line = block->u.t.first_line; line != NULL; line = next_line) { fz_rect lrect = fz_intersect_rect(line->bbox, r); next_line = line->next; /* Don't use fz_is_empty_rect here, as that will exclude zero height areas like spaces. */ if (lrect.x0 > lrect.x1 || lrect.y0 > lrect.y1) continue; /* Trivial exclusion */ if (line->bbox.x0 == lrect.x0 && line->bbox.y0 == lrect.y0 && line->bbox.x1 == lrect.x1 && line->bbox.y1 == lrect.y1) { /* Trivial inclusion */ if (newblock == NULL) { newblock = fz_pool_alloc(ctx, page->pool, sizeof(fz_stext_block)); insert_block_before(newblock, before, page, dest); newblock->u.t.flags = block->u.t.flags; assert(before == newblock->next); } unlink_line_from_block(line, block); append_line_to_block(line, newblock); } else { /* Need to walk the line and just take parts */ fz_stext_line *newline = NULL; fz_stext_char *ch, *next_ch, *prev_ch = NULL; for (ch = line->first_char; ch != NULL; ch = next_ch) { fz_rect crect = fz_rect_from_quad(ch->quad); float x = (crect.x0 + crect.x1)/2; float y = (crect.y0 + crect.y1)/2; next_ch = ch->next; if (r.x0 > x || r.x1 < x || r.y0 > y || r.y1 < y) { prev_ch = ch; continue; } /* Take this char */ if (newline == NULL) { newline = fz_pool_alloc(ctx, page->pool, sizeof(*newline)); newline->dir = line->dir; newline->wmode = line->wmode; newline->bbox = fz_empty_rect; } /* Unlink char */ if (prev_ch == NULL) line->first_char = next_ch; else prev_ch->next = next_ch; if (next_ch == NULL) line->last_char = prev_ch; /* Relink char */ ch->next = NULL; if (newline->last_char == NULL) newline->first_char = ch; else newline->last_char->next = ch; newline->last_char = ch; newline->bbox = fz_union_rect(newline->bbox, crect); } if (line->first_char == NULL) { /* We've removed all the chars from this line. * Better remove the line too. */ if (line->prev) line->prev->next = next_line; else block->u.t.first_line = next_line; if (next_line) next_line->prev = line->prev; else block->u.t.last_line = line->prev; } if (newline) { if (newblock == NULL) { newblock = fz_pool_alloc(ctx, page->pool, sizeof(fz_stext_block)); /* Add the block onto our target list */ insert_block_before(newblock, before, page, dest); newblock->u.t.flags = block->u.t.flags; before = newblock->next; } append_line_to_block(newline, newblock); } } } if (newblock) { recalc_bbox(block); recalc_bbox(newblock); } if (block->u.t.first_line == NULL) { /* We've removed all the lines from the block. Should remove that too! */ if (block->prev) { assert(block->prev->next == block); block->prev->next = next_block; } else { assert(*sfirst == block); *sfirst = block->next; } if (next_block) { assert(next_block->prev == block); next_block->prev = block->prev; } else { assert(*slast == block); *slast = block->prev; } } } } return r; } typedef struct { fz_stext_block *block; fz_stext_struct *parent; } tree_pos; /* This is still not perfect, but it's better! */ static tree_pos find_table_insertion_point(fz_context *ctx, fz_rect r, tree_pos current, tree_pos best) { fz_stext_block *block; for (block = current.block; block != NULL; block = block->next) { if (block->type == FZ_STEXT_BLOCK_STRUCT) { if (block->u.s.down != NULL && fz_is_rect_inside_rect(r, block->bbox)) { tree_pos down; down.parent = block->u.s.down; down.block = block->u.s.down->first_block; best = find_table_insertion_point(ctx, r, down, best); } } else { /* Is block a better precursor than best? (Or a valid precursor, if best.block == NULL) */ if (block->bbox.y1 < r.y0 && (best.block == NULL || best.block->bbox.y1 < block->bbox.y1)) { best.block = block; best.parent = current.parent; } } } return best; } static int tr_is_empty(fz_context *ctx, fz_stext_block *block) { for (; block != NULL; block = block->next) { if (block->type != FZ_STEXT_BLOCK_STRUCT) return 0; if (!block->u.s.down) { } else if (block->u.s.down->standard != FZ_STRUCTURE_TD) return 0; else if (block->u.s.down->first_block != NULL) return 0; } return 1; } static int table_is_empty(fz_context *ctx, fz_stext_block *block) { for (; block != NULL; block = block->next) { if (block->type == FZ_STEXT_BLOCK_GRID) continue; if (block->type != FZ_STEXT_BLOCK_STRUCT) return 0; if (!block->u.s.down) { } else if (block->u.s.down->standard != FZ_STRUCTURE_TR) return 0; else if (!tr_is_empty(ctx, block->u.s.down->first_block)) return 0; } return 1; } static void tidy_td_divs(fz_context *ctx, fz_stext_struct *parent) { while (1) { fz_stext_block *block = parent->first_block; if (block == NULL || block->next != NULL || block->type != FZ_STEXT_BLOCK_STRUCT || block->u.s.down == NULL || block->u.s.down->standard != FZ_STRUCTURE_DIV) return; parent->first_block = block->u.s.down->first_block; parent->last_block = block->u.s.down->last_block; } } static void tidy_tr_divs(fz_context *ctx, fz_stext_block *block) { for (; block != NULL; block = block->next) { if (block->type == FZ_STEXT_BLOCK_STRUCT && block->u.s.down && block->u.s.down->standard == FZ_STRUCTURE_TD) tidy_td_divs(ctx, block->u.s.down); } } static void tidy_table_divs(fz_context *ctx, fz_stext_block *block) { for (; block != NULL; block = block->next) { if (block->type == FZ_STEXT_BLOCK_GRID) continue; if (block->type == FZ_STEXT_BLOCK_STRUCT && block->u.s.down && block->u.s.down->standard == FZ_STRUCTURE_TR) tidy_tr_divs(ctx, block->u.s.down->first_block); } } static int struct_is_empty(fz_context *ctx, fz_stext_block *block) { for (; block != NULL; block = block->next) { if (block->type != FZ_STEXT_BLOCK_STRUCT) return 0; if (!block->u.s.down) { } else if (!struct_is_empty(ctx, block->u.s.down->first_block)) return 0; } return 1; } static int div_is_empty(fz_context *ctx, fz_stext_block *block) { for (; block != NULL; block = block->next) { if (block->type != FZ_STEXT_BLOCK_STRUCT) return 0; if (!block->u.s.down) { } else if (block->u.s.down->standard == FZ_STRUCTURE_TABLE) { tidy_table_divs(ctx, block->u.s.down->first_block); return table_is_empty(ctx, block->u.s.down->first_block); } else if (block->u.s.down->standard != FZ_STRUCTURE_DIV) { if (!struct_is_empty(ctx, block->u.s.down->first_block)) return 0; } else if (!div_is_empty(ctx, block->u.s.down->first_block)) return 0; } return 1; } static void tidy_orphaned_tables(fz_context *ctx, fz_stext_page *page, fz_stext_struct *parent) { fz_stext_block **first_blockp = parent ? &parent->first_block : &page->first_block; fz_stext_block **last_blockp = parent ? &parent->last_block : &page->last_block; fz_stext_block *block, *next_block; for (block = *first_blockp; block != NULL; block = next_block) { next_block = block->next; if (block->type == FZ_STEXT_BLOCK_STRUCT && block->u.s.down) { if (block->u.s.down->standard == FZ_STRUCTURE_TABLE) { tidy_table_divs(ctx, block->u.s.down->first_block); if (table_is_empty(ctx, block->u.s.down->first_block)) { /* Remove block */ if (block->prev) { assert(block->prev->next == block); block->prev->next = next_block; } else { assert(*first_blockp == block); *first_blockp = next_block; } if (next_block) { assert(next_block->prev == block); next_block->prev = block->prev; } else { assert(*last_blockp == block); *last_blockp = block->prev; } } else { tidy_orphaned_tables(ctx, page, block->u.s.down); } } if (block->u.s.down->standard == FZ_STRUCTURE_DIV) { tidy_orphaned_tables(ctx, page, block->u.s.down); if (div_is_empty(ctx, block->u.s.down->first_block)) { /* Remove block */ if (block->prev) { assert(block->prev->next == block); block->prev->next = next_block; } else { assert(*first_blockp == block); *first_blockp = next_block; } if (next_block) { assert(next_block->prev == block); next_block->prev = block->prev; } else { assert(*last_blockp == block); *last_blockp = block->prev; } } } } } } static fz_stext_struct * transcribe_table(fz_context *ctx, grid_walker_data *gd, fz_stext_page *page, fz_stext_struct *parent) { int w = gd->xpos->len; int h = gd->ypos->len; int x, y; char *sent_tab = fz_calloc(ctx, 1, w*(size_t)h); fz_stext_block **first_block = parent ? &parent->first_block : &page->first_block; fz_stext_struct *table, *tr, *td; fz_rect r; tree_pos insert, top; /* Where should we insert the table in the data? */ r.x0 = gd->xpos->list[0].pos; r.x1 = gd->xpos->list[w-1].pos; r.y0 = gd->ypos->list[0].pos; r.y1 = gd->ypos->list[h-1].pos; #ifdef DEBUG_TABLE_STRUCTURE fz_print_stext_page_as_xml(ctx, fz_stddbg(ctx), page, 0); #endif top.block = *first_block; top.parent = parent; insert.block = NULL; insert.parent = NULL; insert = find_table_insertion_point(ctx, r, top, insert); /* Convert to 'before' */ if (insert.block) insert.block = insert.block->next; /* Make table */ table = add_struct_block_before(ctx, insert.block, page, insert.parent, FZ_STRUCTURE_TABLE, "Table"); /* Run through the cells, and guess at spanning. */ for (y = 0; y < h-1; y++) { /* Have we sent this entire row before? */ for (x = 0; x < w-1; x++) { if (!sent_tab[x+y*w]) break; } if (x == w-1) continue; /* No point in sending a row with nothing in it! */ /* Make TR */ tr = add_struct_block_before(ctx, NULL, page, table, FZ_STRUCTURE_TR, "TR"); for (x = 0; x < w-1; x++) { int x2, y2; int cellw = 1; int cellh = 1; /* Have we sent this cell already? */ if (sent_tab[x+y*w]) continue; /* Find the width of the cell */ for (x2 = x+1; x2 < w-1; x2++) { cell_t *cell = get_cell(gd->cells, x2, y); if (cell->v_line) break; /* Can't go past a line */ if (gd->xpos->list[x2].uncertainty == 0) break; /* An uncertainty of 0 is as good as a line. */ if (!cell->v_crossed) break; cellw++; } /* Find the height of the cell */ for (y2 = y+1; y2 < h-1; y2++) { cell_t *cell; int h_crossed = 0; if (gd->ypos->list[y2].uncertainty == 0) break; /* An uncertainty of 0 is as good as a line. */ cell = get_cell(gd->cells, x, y2); if (cell->h_line) break; /* Can't extend down through a line. */ if (cell->h_crossed) h_crossed = 1; for (x2 = x+1; x2 < x+cellw; x2++) { cell = get_cell(gd->cells, x2, y2); if (cell->h_line) break; if (cell->v_line) break; /* Can't go past a line */ if (gd->xpos->list[x2].uncertainty == 0) break; /* An uncertainty of 0 is as good as a line. */ if (!cell->v_crossed) break; if (cell->h_crossed) h_crossed = 1; } if (x2 == x+cellw && h_crossed) cellh++; else break; } /* Make TD */ td = add_struct_block_before(ctx, NULL, page, tr, FZ_STRUCTURE_TD, "TD"); r.x0 = gd->xpos->list[x].pos; r.x1 = gd->xpos->list[x+cellw].pos; r.y0 = gd->ypos->list[y].pos; r.y1 = gd->ypos->list[y+cellh].pos; /* Use r, not REAL contents bbox, as otherwise spanned rows * can end up empty. */ td->up->bbox = r; move_contained_content(ctx, page, td, parent, r); #ifdef DEBUG_TABLE_STRUCTURE printf("(%d,%d) + (%d,%d)\n", x, y, cellw, cellh); #endif for (y2 = y; y2 < y+cellh; y2++) for (x2 = x; x2 < x+cellw; x2++) sent_tab[x2+y2*w] = 1; } r.x0 = gd->xpos->list[0].pos; r.x1 = gd->xpos->list[gd->xpos->len-1].pos; r.y0 = gd->ypos->list[y].pos; r.y1 = gd->ypos->list[y+1].pos; tr->up->bbox = r; table->up->bbox = fz_union_rect(table->up->bbox, tr->up->bbox); } fz_free(ctx, sent_tab); { fz_stext_block *block; fz_stext_grid_positions *xps2 = copy_grid_positions_to_pool(ctx, page, gd->xpos); fz_stext_grid_positions *yps2 = copy_grid_positions_to_pool(ctx, page, gd->ypos); block = add_grid_block(ctx, page, &table->first_block, &table->last_block); block->u.b.xs = xps2; block->u.b.ys = yps2; block->bbox.x0 = block->u.b.xs->list[0].pos; block->bbox.y0 = block->u.b.ys->list[0].pos; block->bbox.x1 = block->u.b.xs->list[block->u.b.xs->len-1].pos; block->bbox.y1 = block->u.b.ys->list[block->u.b.ys->len-1].pos; } tidy_orphaned_tables(ctx, page, parent); return table; } static void merge_column(grid_walker_data *gd, int x) { int y; for (y = 0; y < gd->cells->h; y++) { cell_t *d = &gd->cells->cell[x + y * (gd->cells->w-1)]; cell_t *s = &gd->cells->cell[x + y * gd->cells->w]; if (x > 0) memcpy(d-x, s-x, sizeof(*d) * x); d->full = s[0].full || s[1].full; d->h_crossed = s[0].h_crossed || s[1].h_crossed; d->h_line = s[0].h_line; /* == s[1].h_line */ d->v_crossed = s[0].v_crossed; d->v_line = s[0].v_line; if (x < gd->cells->w - 2) memcpy(d+1, s+2, sizeof(*d) * (gd->cells->w - 2 - x)); } gd->cells->w--; if (x < gd->xpos->len - 2) memcpy(&gd->xpos->list[x+1], &gd->xpos->list[x+2], sizeof(gd->xpos->list[0]) * (gd->xpos->len - 2 - x)); gd->xpos->len--; } static void merge_columns(grid_walker_data *gd) { int x, y; for (x = gd->cells->w-3; x >= 0; x--) { /* Can column x be merged with column x+1? */ /* An empty column can certainly be merged if the h_lines are the same, * and there is no v_line. */ for (y = 0; y < gd->cells->h-1; y++) { cell_t *a = get_cell(gd->cells, x, y); cell_t *b = get_cell(gd->cells, x+1, y); if (a->full || !!a->h_line != !!b->h_line || b->v_line) break; } if (y == gd->cells->h-1) goto merge_column; /* An empty column can certainly be merged if the h_lines are the same. */ for (y = 0; y < gd->cells->h-1; y++) { cell_t *a = get_cell(gd->cells, x, y); cell_t *b = get_cell(gd->cells, x+1, y); if (b->full || !!a->h_line != !!b->h_line || b->v_line) break; } if (y == gd->cells->h-1) goto merge_column; /* We only ever want to merge columns if content crossed between them somewhere. * Don't use uncertainty for this, because uncertainty doesn't allow for * whitespace. */ for (y = 0; y < gd->cells->h-1; y++) if (get_cell(gd->cells, x+1, y)->v_crossed == 1) break; if (y == gd->cells->h-1) continue; /* This requires all the pairs of cells in those 2 columns to be mergeable. */ for (y = 0; y < gd->cells->h-1; y++) { cell_t *a = get_cell(gd->cells, x, y); cell_t *b = get_cell(gd->cells, x+1, y); /* If there is a divider, we can't merge. */ if (b->v_line) break; /* If either is empty, we can merge. */ if (!a->full || !b->full) continue; /* If we differ in h linedness, we can't merge */ if (!!a->h_line != !!b->h_line) break; /* If both are full, we can only merge if we cross. */ if (a->full && b->full && b->v_crossed) continue; /* Otherwise we can't merge */ break; } if (y == gd->cells->h-1) { /* Merge the column! */ merge_column: #ifdef DEBUG_TABLE_STRUCTURE printf("Merging column %d\n", x); #endif merge_column(gd, x); #ifdef DEBUG_TABLE_STRUCTURE asciiart_table(gd); #endif } } } static void merge_row(grid_walker_data *gd, int y) { int x; int w = gd->cells->w; cell_t *d = &gd->cells->cell[y * w]; for (x = 0; x < gd->cells->w-1; x++) { if (d->full == 0) d->full = d[w].full; if (d->v_crossed == 0) d->v_crossed = d[w].v_crossed; d++; } if (y < gd->cells->h - 2) memcpy(d, d+w, sizeof(*d) * (gd->cells->h - 2 - y) * w); gd->cells->h--; if (y < gd->ypos->len - 2) memcpy(&gd->ypos->list[y+1], &gd->ypos->list[y+2], sizeof(gd->ypos->list[0]) * (gd->ypos->len - 2 - y)); gd->ypos->len--; } static void merge_rows(grid_walker_data *gd) { int x, y; for (y = gd->cells->h-3; y >= 0; y--) { /* Can row y be merged with row y+1? */ /* An empty row can certainly be merged if the v_lines are the same, * and there is no h_line. */ for (x = 0; x < gd->cells->w-1; x++) { cell_t *a = get_cell(gd->cells, x, y); cell_t *b = get_cell(gd->cells, x, y+1); if (a->full || !!a->v_line != !!b->v_line || b->h_line) break; } if (x == gd->cells->w-1) goto merge_row; /* An empty row can certainly be merged if the v_lines are the same. */ for (x = 0; x < gd->cells->w-1; x++) { cell_t *a = get_cell(gd->cells, x, y); cell_t *b = get_cell(gd->cells, x, y+1); if (b->full || !!a->v_line != !!b->v_line || b->h_line) break; } if (x == gd->cells->w-1) goto merge_row; /* We only ever want to merge rows if content crossed between them somewhere. * Don't use uncertainty for this, because uncertainty doesn't allow for * whitespace. */ for (x = 0; x < gd->cells->w-1; x++) if (get_cell(gd->cells, x, y+1)->h_crossed == 1) break; if (x == gd->cells->w-1) continue; /* This requires all the pairs of cells in those 2 rows to be mergeable. */ for (x = 0; x < gd->cells->w-1; x++) { cell_t *a = get_cell(gd->cells, x, y); cell_t *b = get_cell(gd->cells, x, y+1); /* If there is a divider, we can't merge. */ if (b->h_line) goto cant_merge; /* If either is empty, we can merge. */ if (!a->full || !b->full) continue; /* If we differ in v linedness, we can't merge */ if (!!a->v_line != !!b->v_line) goto cant_merge; /* If both are full, we can only merge if we cross. */ if (a->full && b->full && b->h_crossed) continue; /* Otherwise we can't merge */ break; } if (x == gd->cells->w-1) { /* Merge the row! */ merge_row: #ifdef DEBUG_TABLE_STRUCTURE printf("Merging row %d\n", y); #endif merge_row(gd, y); #ifdef DEBUG_TABLE_STRUCTURE asciiart_table(gd); #endif continue; } /* OK, so we failed to be able to merge y and y+1. But if we get here, we * know this wasn't because of any lines being in the way. So we can try * a different set of criteria. If every non-empty cell in line y+1 is either * into from line y, or crosses into line y+2, then it's probably a 'straddling' * line. Just remove it. */ if (y+2 >= gd->cells->h) continue; for (x = 0; x < gd->cells->w-1; x++) { cell_t *b = get_cell(gd->cells, x, y+1); cell_t *c = get_cell(gd->cells, x, y+2); if (!b->full) continue; if (!b->h_crossed && !c->h_crossed) break; } if (x == gd->cells->w-1) { /* Merge the row! */ #ifdef DEBUG_TABLE_STRUCTURE printf("Merging row %d (case 2)\n", y); #endif merge_row(gd, y); #ifdef DEBUG_TABLE_STRUCTURE asciiart_table(gd); #endif } cant_merge: {} } } static int remove_bordered_empty_cells(grid_walker_data *gd) { int x, y, x1, y1, x2, y2; int changed = 0; /* We are looking for a region of cells that's bordered, * and empty, and where the borders don't extend... * * ' ' * ' ' * . . .'____'. . . * | | * | | * . . .|____|. . . * ' ' * ' ' * ' ' */ for (y = 0; y < gd->cells->h-1; y++) { for (x = 0; x < gd->cells->w-1; x++) { cell_t *u = y > 0 ? get_cell(gd->cells, x, y-1) : NULL; cell_t *l = x > 0 ? get_cell(gd->cells, x-1, y) : NULL; cell_t *c = get_cell(gd->cells, x, y); if (c->full) continue; if (!c->h_line || !c->v_line) continue; if (l != NULL && l->h_line) continue; if (u != NULL && u->v_line) continue; /* So c (x, y) is a reasonable top left of the bounded region. */ for (x1 = x+1; x1 < gd->cells->w; x1++) { u = y > 0 ? get_cell(gd->cells, x1, y-1) : NULL; c = get_cell(gd->cells, x1, y); if (u != NULL && u->v_line) goto bad_region; if (c->h_line && !c->v_line) continue; if (c->h_line || !c->v_line) goto bad_region; break; } if (x1 == gd->cells->w) goto bad_region; /* If we get here, then we have the top row of a bounded region * that runs from (x,y) to (x1-1, y). So, can we extend that region * downwards? */ for (y1 = y+1; y1 < gd->cells->h; y1++) { c = get_cell(gd->cells, x, y1); if (!c->h_line && c->v_line) { /* This could be the start of a line. */ for (x2 = x+1; x2 < x1; x2++) { c = get_cell(gd->cells, x2, y1); if (c->full || c->h_line || c->v_line) goto bad_region; } c = get_cell(gd->cells, x1, y1); if (c->h_line || !c->v_line) goto bad_region; /* That line was fine! Region is extended. */ } else if (c->h_line && !c->v_line) { /* This might be the bottom line of the bounded region. */ for (x2 = x+1; x2 < x1; x2++) { c = get_cell(gd->cells, x2, y1); if (!c->h_line || c->v_line) goto bad_region; } c = get_cell(gd->cells, x1, y1); if (c->h_line || c->v_line) goto bad_region; break; } else goto bad_region; } if (y1 == gd->cells->h) goto bad_region; /* So we have a bounded region from x,y to x1-1,y1-1 */ for (y2 = y; y2 < y1; y2++) { for (x2 = x; x2 < x1; x2++) { c = get_cell(gd->cells, x2, y2); c->h_line = 0; c->v_line = 0; c->full = 1; if (x2 > x) c->v_crossed = 1; if (y2 > y) c->h_crossed = 1; } c = get_cell(gd->cells, x2, y2); c->v_line = 0; } for (x2 = x; x2 < x1; x2++) get_cell(gd->cells, x2, y2)->h_line = 0; changed = 1; bad_region: {} } } #ifdef DEBUG_TABLE_STRUCTURE if (changed) { printf("Simplified empty bounded cells to give:\n"); asciiart_table(gd); } #endif return changed; } static int find_table_within_bounds(fz_context *ctx, grid_walker_data *gd, fz_stext_block *content, fz_rect bounds) { div_list xs = { 0 }; div_list ys = { 0 }; int failed = 1; fz_try(ctx) { walk_to_find_content(ctx, &xs, &ys, content, bounds); sanitize_positions(ctx, &xs); sanitize_positions(ctx, &ys); /* Run across the line, counting 'winding' */ /* If we don't have at least 2 rows and 2 columns, give up. */ if (xs.len <= 2 || ys.len <= 2) break; gd->xpos = make_table_positions(ctx, &xs, bounds.x0, bounds.x1); gd->ypos = make_table_positions(ctx, &ys, bounds.y0, bounds.y1); gd->cells = new_cells(ctx, gd->xpos->len, gd->ypos->len); /* Walk the content looking for grid lines. These * lines refine our positions. */ walk_grid_lines(ctx, gd, content); /* Now, we walk the content looking for content that crosses * these grid lines. This allows us to spot spanned cells. */ if (calculate_spanned_content(ctx, gd, content)) break; /* Unlikely to be a table. */ #ifdef DEBUG_TABLE_STRUCTURE asciiart_table(gd); #endif /* Now, can we remove some columns or rows? i.e. have we oversegmented? */ do { merge_columns(gd); merge_rows(gd); } while (remove_bordered_empty_cells(gd)); /* Did we shrink the table so much it's not a table any more? */ if (gd->xpos->len <= 2 || gd->ypos->len <= 2) break; failed = 0; } fz_always(ctx) { fz_free(ctx, xs.list); fz_free(ctx, ys.list); } fz_catch(ctx) { fz_rethrow(ctx); } return failed; } static int do_table_hunt(fz_context *ctx, fz_stext_page *page, fz_stext_struct *parent) { fz_stext_block *block; int count; fz_stext_block **first_block = parent ? &parent->first_block : &page->first_block; int num_subtables = 0; grid_walker_data gd = { 0 }; /* No content? Just bale. */ if (*first_block == NULL) return 0; /* If all the content here looks like a column of text, don't * hunt for a table within it. */ if (all_blocks_are_justified_or_headers(ctx, *first_block)) return num_subtables; gd.bounds = fz_infinite_rect; /* First off, descend into any children to see if those look like tables. */ count = 0; for (block = *first_block; block != NULL; block = block->next) { if (block->type == FZ_STEXT_BLOCK_STRUCT) { if (block->u.s.down) { num_subtables += do_table_hunt(ctx, page, block->u.s.down); count++; } } else if (block->type == FZ_STEXT_BLOCK_TEXT) count++; } /* If we don't have at least a single child, no more to hunt. */ /* We only need a single block, because a single text block can * contain an entire unbordered table. */ if (count < 1) return num_subtables; /* We only look for a table at this level if we either at the top * or on a div. This saves us looking for tables within an 'H' * for example. */ if (parent != NULL && parent->standard != FZ_STRUCTURE_DIV) return num_subtables; fz_var(gd); fz_try(ctx) { /* Now see whether the content looks like tables. */ fz_rect bounds = walk_to_find_bounds(ctx, *first_block); if (find_table_within_bounds(ctx, &gd, *first_block, bounds)) break; if (num_subtables > 0) { /* We are risking throwing away a table we've already found for this * one. Only do it if this one is really convincing. */ int x, y; for (x = 0; x < gd.xpos->len; x++) if (gd.xpos->list[x].uncertainty != 0) break; if (x != gd.xpos->len) break; for (y = 0; y < gd.ypos->len; y++) if (gd.ypos->list[y].uncertainty != 0) break; if (y != gd.ypos->len) break; } #ifdef DEBUG_TABLE_STRUCTURE printf("Transcribing table: (%g,%g)->(%g,%g)\n", gd.xpos->list[0].pos, gd.ypos->list[0].pos, gd.xpos->list[gd.xpos->len-1].pos, gd.ypos->list[gd.ypos->len-1].pos); #endif /* Now we should have the entire table calculated. */ (void)transcribe_table(ctx, &gd, page, parent); num_subtables = 1; #ifdef DEBUG_WRITE_AS_PS { int i; printf("%% TABLE\n"); for (i = 0; i < block->u.b.xs->len; i++) { if (block->u.b.xs->list[i].uncertainty) printf("0 1 0 setrgbcolor\n"); else printf("0 0.5 0 setrgbcolor\n"); printf("%g %g moveto %g %g lineto stroke\n", block->u.b.xs->list[i].pos, block->bbox.y0, block->u.b.xs->list[i].pos, block->bbox.y1); } for (i = 0; i < block->u.b.ys->len; i++) { if (block->u.b.ys->list[i].uncertainty) printf("0 1 0 setrgbcolor\n"); else printf("0 0.5 0 setrgbcolor\n"); printf("%g %g moveto %g %g lineto stroke\n", block->bbox.x0, block->u.b.ys->list[i].pos, block->bbox.x1, block->u.b.ys->list[i].pos); } } #endif } fz_always(ctx) { fz_free(ctx, gd.xpos); fz_free(ctx, gd.ypos); fz_free(ctx, gd.cells); } fz_catch(ctx) fz_rethrow(ctx); return num_subtables; } void fz_table_hunt(fz_context *ctx, fz_stext_page *page) { if (page == NULL) return; assert(verify_stext(ctx, page, NULL)); do_table_hunt(ctx, page, NULL); assert(verify_stext(ctx, page, NULL)); } fz_stext_block * fz_find_table_within_bounds(fz_context *ctx, fz_stext_page *page, fz_rect bounds) { fz_stext_struct *table = NULL; grid_walker_data gd = { 0 }; /* No content? Just bale. */ if (page == NULL || page->first_block == NULL) return NULL; fz_var(gd); fz_try(ctx) { gd.bounds = bounds; if (find_table_within_bounds(ctx, &gd, page->first_block, bounds)) break; #ifdef DEBUG_TABLE_STRUCTURE printf("Transcribing table: (%g,%g)->(%g,%g)\n", gd.xpos->list[0].pos, gd.ypos->list[0].pos, gd.xpos->list[gd.xpos->len-1].pos, gd.ypos->list[gd.ypos->len-1].pos); #endif /* Now we should have the entire table calculated. */ table = transcribe_table(ctx, &gd, page, NULL); #ifdef DEBUG_WRITE_AS_PS { int i; printf("%% TABLE\n"); for (i = 0; i < block->u.b.xs->len; i++) { if (block->u.b.xs->list[i].uncertainty) printf("0 1 0 setrgbcolor\n"); else printf("0 0.5 0 setrgbcolor\n"); printf("%g %g moveto %g %g lineto stroke\n", block->u.b.xs->list[i].pos, block->bbox.y0, block->u.b.xs->list[i].pos, block->bbox.y1); } for (i = 0; i < block->u.b.ys->len; i++) { if (block->u.b.ys->list[i].uncertainty) printf("0 1 0 setrgbcolor\n"); else printf("0 0.5 0 setrgbcolor\n"); printf("%g %g moveto %g %g lineto stroke\n", block->bbox.x0, block->u.b.ys->list[i].pos, block->bbox.x1, block->u.b.ys->list[i].pos); } } #endif } fz_always(ctx) { fz_free(ctx, gd.xpos); fz_free(ctx, gd.ypos); fz_free(ctx, gd.cells); } fz_catch(ctx) fz_rethrow(ctx); return table ? table->first_block : NULL; }
