Mercurial > hgrepos > Python2 > PyMuPDF
diff mupdf-source/source/fitz/stext-para.c @ 2:b50eed0cc0ef upstream
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
The directory name has changed: no version number in the expanded directory now.
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Mon, 15 Sep 2025 11:43:07 +0200 |
| parents | |
| children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/mupdf-source/source/fitz/stext-para.c Mon Sep 15 11:43:07 2025 +0200 @@ -0,0 +1,1584 @@ +// Copyright (C) 2025 Artifex Software, Inc. +// +// This file is part of MuPDF. +// +// MuPDF is free software: you can redistribute it and/or modify it under the +// terms of the GNU Affero General Public License as published by the Free +// Software Foundation, either version 3 of the License, or (at your option) +// any later version. +// +// MuPDF is distributed in the hope that it will be useful, but WITHOUT ANY +// WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +// FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more +// details. +// +// You should have received a copy of the GNU Affero General Public License +// along with MuPDF. If not, see <https://www.gnu.org/licenses/agpl-3.0.en.html> +// +// Alternative licensing terms are available from the licensor. +// For commercial licensing, see <https://www.artifex.com/> or contact +// Artifex Software, Inc., 39 Mesa Street, Suite 108A, San Francisco, +// CA 94129, USA, for further information. + +#include "mupdf/fitz.h" + +#include <assert.h> + +/* #define DEBUG_SPLITS */ + +/* #define DEBUG_PARA_SPLITS */ + +static void +recalc_bbox(fz_stext_block *block) +{ + fz_rect bbox = fz_empty_rect; + fz_stext_line *line; + + for (line = block->u.t.first_line; line != NULL; line = line->next) + bbox = fz_union_rect(bbox, line->bbox); + + block->bbox = bbox; +} + +typedef enum +{ + UNDERLINE_UNKNOWN, + UNDERLINE_YES, + UNDERLINE_NO, + UNDERLINE_MIXED +} underline_state; + +/* Some crap heuristics to spot a bold font. */ +static int +font_is_bold(fz_font *font) +{ + const char *c; + + if (font == NULL) + return 0; + if (font->flags.is_bold) + return 1; + + if (fz_strstrcase(font->name, "Bold") != NULL) + return 1; + if (fz_strstrcase(font->name, "Black") != NULL) + return 1; + if (fz_strstrcase(font->name, "Medium") != NULL) + return 0; + if (fz_strstrcase(font->name, "Light") != NULL) + return 0; + + c = fz_strstr(font->name, " B"); + if (c && (c[2] == ' ' || c[2] == 0)) + return 1; + + return 0; +} + +/* Check to see if lines move left to right and downwards. */ +/* FIXME: Maybe allow right to left? checking unicode values? */ +static int +lines_move_plausibly_like_paragraph(fz_stext_block *block) +{ + fz_stext_line *line; + int firstline = 1; + float line_height, line_x, line_y; + + /* Do the lines that make up this block move in an appropriate way? */ + for (line = block->u.t.first_line; line != NULL; line = line->next) + { + float x = (line->bbox.x0 + line->bbox.x1)/2; + float y = (line->bbox.y0 + line->bbox.y1)/2; + float height = line->bbox.y1 - line->bbox.y0; + fz_stext_char *ch; + + /* Ignore any completely empty lines */ + for (ch = line->first_char; ch != NULL; ch = ch->next) + if (ch->c != ' ') + break; + if (ch == NULL) + continue; + + if (firstline) + { + line_height = height; + line_x = x; + line_y = y; + firstline = 0; + } + else if (line_y - line_height/2 < y && line_y + line_height/2 > y) + { + /* We are plausibly the same line. Only accept if we move right. */ + if (x < line_x) + return 0; + else + line_x = x; + } + else if (line_y < y) + { + /* Moving downwards. Plausible. */ + line_y = y; + line_height = height; + line_x = x; + } + else + { + /* Nothing else is plausible. */ + return 0; + } + } + return 1; +} + +#ifdef DEBUG_SPLITS +static void dump_line(fz_context *ctx, const char *str, fz_stext_line *line) +{ + fz_stext_char *ch; + + if (str) + fz_write_printf(ctx, fz_stddbg(ctx), "%s\n", str); + + if (line == NULL) + return; + + for (ch = line->first_char; ch != NULL; ch = ch->next) + fz_write_printf(ctx, fz_stddbg(ctx), "%c", (char)ch->c); + fz_write_printf(ctx, fz_stddbg(ctx), "\n"); +} + +static void dump_block(fz_context *ctx, const char *fmt, fz_stext_block *block) +{ + fz_stext_line *line; + + fz_write_printf(ctx, fz_stddbg(ctx), "%s\n", fmt); + if (block == NULL || block->type != FZ_STEXT_BLOCK_TEXT) + return; + + for (line = block->u.t.first_line; line != NULL; line = line->next) + dump_line(ctx, NULL, line); +} +#endif + +typedef struct +{ + fz_pool *pool; + fz_stext_struct *parent; + int idx; + fz_stext_block **pfirst; + fz_stext_block **plast; +} stext_pos; + +static fz_stext_block *split_block_at_line(fz_context *ctx, stext_pos *pos, fz_stext_block *block, fz_stext_line *line) +{ + fz_stext_block *newblock = fz_pool_alloc(ctx, pos->pool, sizeof *newblock); + +#ifdef DEBUG_SPLITS + dump_block(ctx, "Splitting:", block); + dump_line(ctx, "At line:", line); +#endif + + newblock->bbox = fz_empty_rect; + newblock->prev = block; + newblock->next = block->next; + if (block->next) + block->next->prev = newblock; + else + { + assert(*pos->plast == block); + *pos->plast = newblock; + } + block->next = newblock; + newblock->type = FZ_STEXT_BLOCK_TEXT; + newblock->u.t.flags = block->u.t.flags; + newblock->u.t.first_line = line; + newblock->u.t.last_line = block->u.t.last_line; + block->u.t.last_line = line->prev; + line->prev->next = NULL; + line->prev = NULL; + recalc_bbox(block); + recalc_bbox(newblock); + +#ifdef DEBUG_SPLITS + dump_block(ctx, "Giving:", block); + dump_block(ctx, "and:", newblock); +#endif + + return newblock; +} + +/* Convert a block to being a struct that contains just that block. */ +static void block_to_struct(fz_context *ctx, stext_pos *pos, fz_stext_block *block, int structtype) +{ + fz_stext_struct *str = fz_pool_alloc_flexible(ctx, pos->pool, fz_stext_struct, raw, 1); + fz_stext_block *new_block = fz_pool_alloc(ctx, pos->pool, sizeof(*new_block)); + + str->up = block; + str->parent = pos->parent; + str->first_block = new_block; + str->last_block = new_block; + str->standard = structtype; + str->raw[0] = 0; + + new_block->type = block->type; + new_block->bbox = block->bbox; + new_block->u = block->u; + + block->type = FZ_STEXT_BLOCK_STRUCT; + block->u.s.down = str; + block->u.s.index = pos->idx++; +} + +/* + We are going to repeatedly walk the lines that make up a block. + To reduce the boilerplate here, we'll use a line_walker function. + This will call a bunch of callbacks as it goes. + + newline_fn Called whenever we move to a new horizontal line (i.e. + as if we've got a newline). This is not the same as being + called every fz_stext_line, as we frequently get multiple + fz_stext_line's on a single horizontal line. If this returns + 0, execution continues. Return 1 to stop the walking. + line_fn Called for every fz_stext_line (typically used to process + characters). + end_fn Called at the end of the block (with line being the final + line of the block. + arg An opaque pointer passed to all the callbacks. +*/ +typedef int (line_walker_newline_fn)(fz_context *ctx, fz_stext_block *block, fz_stext_line *line, void *arg, float line_height); +typedef int (line_walker_fn)(fz_context *ctx, fz_stext_block *block, fz_stext_line *line, void *arg); +typedef void (line_walker_end_fn)(fz_context *ctx, fz_stext_block *block, fz_stext_line *line, void *arg); + +static void +line_walker(fz_context *ctx, fz_stext_block *block, line_walker_newline_fn *newline_fn, line_walker_fn *line_fn, line_walker_end_fn *end_fn, void *arg) +{ + int firstline = 1; + fz_stext_line *line; + float line_height, line_y; + + if (block->u.t.first_line == NULL) + return; + + for (line = block->u.t.first_line; line != NULL; line = line->next) + { + float y = (line->bbox.y0 + line->bbox.y1)/2; + float height = line->bbox.y1 - line->bbox.y0; + + if (line->first_char == NULL) + continue; /* Should never happen, but makes life easier to assume this later. */ + + if (firstline) + { + line_height = height; + firstline = 0; + line_y = y; + } + else if (line_y - line_height/2 < y && line_y + line_height/2 > y) + { + /* We are plausibly the same horizontal line. */ + } + else if (line_y < y) + { + /* Moving downwards. */ + line_height = height; + line_y = y; + if (newline_fn && newline_fn(ctx, block, line, arg, line_height)) + return; + } + if (line_fn && line_fn(ctx, block, line, arg)) + return; + } + if (end_fn) + end_fn(ctx, block, block->u.t.last_line, arg); +} + +/* We scan through the block, collecting lines up that look + * "title-ish" (by which here, we mean "are completely + * underlined"). As soon as we finish such a region, we split + * the block (either before or after it as appropriate), and + * mark it as a title. + * + * e.g. + * + * _THIS_IS_LIKELY_A + * _TITLE_ ___ < BREAK HERE + * Lorem ipsum dolor sit + * amet, consectetur + * adipiscing elit. ___ < BREAK HERE + * _LIKELY_ANOTHER_TITLE_ ____< BREAK HERE + * Sed do eiusmod tempor + * incididunt ut labore + * et dolore magna aliqua. + */ +typedef struct +{ + stext_pos *pos; + fz_stext_line *title_start; + fz_stext_line *title_end; + underline_state underlined; + int changed; +} underlined_data; + +static int +underlined_break(fz_context *ctx, fz_stext_block *block, underlined_data *data) +{ + fz_stext_line *line; + + /* We have a block that looks like a title. */ + if (data->title_start != block->u.t.first_line) + { + /* We need to split the block before title_start */ + line = data->title_start; + } + else if (data->title_end != block->u.t.last_line) + { + /* We need to split the block after title_end */ + line = data->title_end->next; + } + else + { + /* This block is already entirely title. */ + line = NULL; + } + if (line) + { + (void)split_block_at_line(ctx, data->pos, block, line); + data->changed = 1; + if (line == data->title_start) + { + /* Don't label the latter part as a title yet, we'll do it when + * we step back in, but we don't know how much of the latter + * block is title yet. */ + } + else + { + block_to_struct(ctx, data->pos, block, FZ_STRUCTURE_H); + } + } + else + { + block_to_struct(ctx, data->pos, block, FZ_STRUCTURE_H); + } + return 1; +} + +static int +underlined_newline(fz_context *ctx, fz_stext_block *block, fz_stext_line *line, void *arg, float line_height) +{ + underlined_data *data = (underlined_data *)arg; + + if (data->underlined == UNDERLINE_YES) + { + /* Add the line we've just finished to the start/stop region */ + if (data->title_start == NULL) + data->title_start = line->prev; + data->title_end = line->prev; + } + else if (data->title_start != NULL) + { + /* We've reached the end of a title region. */ + return underlined_break(ctx, block, data); + } + data->underlined = UNDERLINE_UNKNOWN; + + return 0; +} + +static int +underlined_line(fz_context *ctx, fz_stext_block *block, fz_stext_line *line, void *arg) +{ + underlined_data *data = (underlined_data *)arg; + fz_stext_char *ch; + + /* If we already know that this line is mixed underlined, then no point in + * wasting time. */ + if (data->underlined == UNDERLINE_MIXED) + return 0; + + /* If we haven't started looking yet, prime the value. */ + if (data->underlined == UNDERLINE_UNKNOWN) + data->underlined = (line->first_char->flags & FZ_STEXT_UNDERLINE) ? UNDERLINE_YES : UNDERLINE_NO; + + /* Check that all the rest of the the chars match our expected value. */ + for (ch = line->first_char; ch != NULL; ch = ch->next) + if ((!!(ch->flags & FZ_STEXT_UNDERLINE)) ^ (data->underlined == UNDERLINE_YES)) + { + /* Differs! So, Mixed. */ + data->underlined = UNDERLINE_MIXED; + break; + } + + return 0; +} + +static void +underlined_end(fz_context *ctx, fz_stext_block *block, fz_stext_line *line, void *arg) +{ + underlined_data *data = (underlined_data *)arg; + + if (data->underlined == UNDERLINE_YES) + { + /* Add the line we've just finished to the start/stop region */ + if (data->title_start == NULL) + data->title_start = block->u.t.last_line; + data->title_end = block->u.t.last_line; + } + + /* If we didn't find a region, bale. */ + if (data->title_start) + underlined_break(ctx, block, data); +} + +static int +detect_underlined_titles(fz_context *ctx, stext_pos *pos, fz_stext_block *block) +{ + /* Let's do the title scanning, where our criteria is + * "the entire line is underlined". */ + underlined_data data[1]; + + data->pos = pos; + data->title_start = NULL; + data->title_end = NULL; + data->underlined = UNDERLINE_UNKNOWN; + data->changed = 0; + + line_walker(ctx, block, underlined_newline, underlined_line, underlined_end, data); + + return data->changed; +} + + +/* Now we scan again, where the 'title' criteria is based upon + * the titles being entirely in a different font. */ +typedef struct +{ + stext_pos *pos; + fz_stext_line *title_start; + fz_stext_line *title_end; + fz_font *font; + int changed; +} font_data; + +#define MIXED_FONT ((fz_font *)1) + +static int +font_break(fz_context *ctx, fz_stext_block *block, font_data *data) +{ + fz_stext_line *line; + + /* We have a block that looks like a title. */ + if (data->title_start != block->u.t.first_line) + { + /* We need to split the block before title_start */ + line = data->title_start; + } + else if (data->title_end != block->u.t.last_line) + { + /* We need to split the block after title_end */ + line = data->title_end->next; + } + else + { + /* This block is already entirely title. */ + line = NULL; + } + if (line) + { + (void)split_block_at_line(ctx, data->pos, block, line); + data->changed = 1; + if (line == data->title_start) + { + /* Don't label the latter part as a title yet, we'll do it when + * we step back in, but we don't know how much of the latter + * block is title yet. */ + } + else + { + block_to_struct(ctx, data->pos, block, FZ_STRUCTURE_H); + } + } + else + { + block_to_struct(ctx, data->pos, block, FZ_STRUCTURE_H); + } + + return 1; +} + +static int +font_newline(fz_context *ctx, fz_stext_block *block, fz_stext_line *line, void *arg, float line_height) +{ + font_data *data = (font_data *)arg; + + if (data->font != NULL && data->font != MIXED_FONT && font_is_bold(data->font)) + { + /* Add the line we've just finished to the start/stop region */ + if (data->title_start == NULL) + data->title_start = line->prev; + data->title_end = line->prev; + } + else if (data->title_start != NULL) + { + /* We've reached the end of a title region. */ + return font_break(ctx, block, data); + } + data->font = NULL; + + return 0; +} + +static int +font_line(fz_context *ctx, fz_stext_block *block, fz_stext_line *line, void *arg) +{ + font_data *data = (font_data *)arg; + fz_stext_char *ch; + + /* If we already know that this line is mixed fonts, then no point in + * wasting time. */ + if (data->font == MIXED_FONT) + return 0; + + /* If we are just starting, prime it. */ + if (data->font == NULL) + data->font = line->first_char->font; + + for (ch = line->first_char; ch != NULL; ch = ch->next) + if (ch->font != data->font) + { + data->font = MIXED_FONT; + break; + } + + return 0; +} + +static void +font_end(fz_context *ctx, fz_stext_block *block, fz_stext_line *line, void *arg) +{ + font_data *data = (font_data *)arg; + + if (data->font != NULL && data->font != MIXED_FONT && font_is_bold(data->font)) + { + /* Add the line we've just finished to the start/stop region */ + if (data->title_start == NULL) + data->title_start = block->u.t.last_line; + data->title_end = block->u.t.last_line; + } + + if (data->title_start) + font_break(ctx, block, data); +} + +static int +detect_titles_by_font_usage(fz_context *ctx, stext_pos *pos, fz_stext_block *block) +{ + font_data data[1]; + + data->pos = pos; + data->title_start = NULL; + data->title_end = NULL; + data->font = NULL; + data->changed = 0; + + line_walker(ctx, block, font_newline, font_line, font_end, data); + + return data->changed; +} + +typedef struct +{ + fz_rect bbox; + stext_pos *pos; + int changed; +} indent_data; + +static int +indent_newline(fz_context *ctx, fz_stext_block *block, fz_stext_line *line, void *arg, float line_height) +{ + indent_data *data = (indent_data *)arg; + float indent = line->bbox.x0 - data->bbox.x0; + + if (indent > line_height) + { + /* Break the block here! */ + (void)split_block_at_line(ctx, data->pos, block, line); + data->changed = 1; + return 1; + } + + return 0; +} + +static int +break_paragraphs_by_indent(fz_context *ctx, stext_pos *pos, fz_stext_block *block, fz_rect bbox) +{ + indent_data data[1]; + + data->pos = pos; + data->bbox = bbox; + data->changed = 0; + + line_walker(ctx, block, indent_newline, NULL, NULL, data); + + return data->changed; +} + +typedef struct +{ + fz_rect bbox; + stext_pos *pos; + float line_gap; + float prev_line_gap; + int looking_for_space; + float space_size; + int maybe_ends_paragraph; + int changed; +} trailing_data; + +static int +trailing_newline(fz_context *ctx, fz_stext_block *block, fz_stext_line *line, void *arg, float line_height) +{ + trailing_data *data = (trailing_data *)arg; + + data->prev_line_gap = data->line_gap; + + if (data->looking_for_space) + { + /* We've moved downwards onto a line, and failed to find + * a space on that line. Presumably that means that whole + * line is a single word. */ + float line_len = line->bbox.x1 - line->bbox.x0; + + if (line_len + data->space_size < data->prev_line_gap) + { + /* We could have fitted this word into the previous line. */ + /* So presumably that was a paragraph break. Split here. */ + (void)split_block_at_line(ctx, data->pos, block, line); + data->changed = 1; + return 1; + } + data->looking_for_space = 0; + } + + /* If we the last line we looked at ended plausibly for a paragraph, + * then look for a space in this line... */ + data->looking_for_space = data->maybe_ends_paragraph; + + return 0; +} + +static int +trailing_line(fz_context *ctx, fz_stext_block *block, fz_stext_line *line, void *arg) +{ + trailing_data *data = (trailing_data *)arg; + fz_stext_char *ch; + + data->line_gap = data->bbox.x1 - line->bbox.x1; + if (line->last_char && ( + (line->last_char->c >= 'A' && line->last_char->c <= 'Z') || + (line->last_char->c >= 'a' && line->last_char->c <= 'z') || + (line->last_char->c >= '0' && line->last_char->c <= '9'))) + { + /* In Latin text, paragraphs should always end up some form + * of punctuation. I suspect that's less true of some other + * languages (particularly far-eastern ones). Let's just say + * that if we end in A-Za-z0-9 we can't possibly be the last + * line of a paragraph. */ + data->maybe_ends_paragraph = 0; + } + else + { + /* Plausibly the next line might be the first line of a new paragraph */ + data->maybe_ends_paragraph = 1; + } + for (ch = line->first_char; ch != NULL; ch = ch->next) + { + fz_rect r; + float w, line_len; + + if (ch->c != ' ') + continue; + + r = fz_rect_from_quad(ch->quad); + w = r.x1 - r.x0; + + if (w < data->space_size) + data->space_size = w; + + /* If we aren't looking_for_space, then no point in checking for + * whether the prefix will fit. But keep looping as we want to + * continue to refine our idea of how big a space is. */ + if (!data->looking_for_space) + continue; + + line_len = r.x0 - line->bbox.x0; + if (line_len + data->space_size < data->prev_line_gap) + { + /* We could have fitted this word into the previous line. */ + /* So presumably that was a paragraph break. Split here. */ + (void)split_block_at_line(ctx, data->pos, block, line); + data->changed = 1; + return 1; + } + data->looking_for_space = 0; + } + + return 0; +} + +static int +break_paragraphs_by_analysing_trailing_gaps(fz_context *ctx, stext_pos *pos, fz_stext_block *block, fz_rect bbox) +{ + trailing_data data[1]; + + data->bbox = bbox; + data->pos = pos; + data->line_gap = 0; + data->prev_line_gap = 0; + data->looking_for_space = 0; + data->space_size = 99999; + data->maybe_ends_paragraph = 0; + data->changed = 0; + + line_walker(ctx, block, trailing_newline, trailing_line, NULL, data); + + return data->changed; +} + +typedef struct +{ + fz_rect bbox; + stext_pos *pos; + int count_lines; + int count_justified; + int non_digits_exist_in_this_line; + fz_rect fragment_box; + fz_rect line_box; + int gap_count_this_line; + float gap_size_this_line; + int bad_gap; + float xmin, xmax; + float last_min_space; + int changed; +} justify_data; + +#define JUSTIFY_THRESHOLD 1 + +static int +justify_newline(fz_context *ctx, fz_stext_block *block, fz_stext_line *line, void *arg, float line_height) +{ + justify_data *data = (justify_data *)arg; + + if (line->prev) + line = line->prev; + + data->line_box = fz_union_rect(data->line_box, data->fragment_box); + if (data->line_box.x0 < data->bbox.x0 + JUSTIFY_THRESHOLD && data->line_box.x1 > data->bbox.x1 - JUSTIFY_THRESHOLD && data->gap_count_this_line && data->non_digits_exist_in_this_line) + data->count_justified++; + data->non_digits_exist_in_this_line = 0; + data->count_lines++; + data->gap_size_this_line = 0; + data->gap_count_this_line = 0; + data->fragment_box = fz_empty_rect; + data->line_box = fz_empty_rect; + + data->xmin = INFINITY; + data->xmax = -INFINITY; + + return 0; +} + +static void +fragment_end(justify_data *data) +{ + float gap; + + if (fz_is_empty_rect(data->fragment_box)) + { + /* No fragment. Nothing to do. */ + return; + } + if (fz_is_empty_rect(data->line_box)) + { + /* First fragment of the line; no gap yet. */ + gap = 0; + } + else if (data->fragment_box.x0 > data->line_box.x1) + { + /* This whole fragment is to the right of the line so far. */ + gap = data->fragment_box.x0 - data->line_box.x1; + } + else if (data->fragment_box.x1 < data->line_box.x0) + { + /* This whole fragment is the left of the line so far. */ + gap = data->line_box.x1 - data->fragment_box.x0; + } + else + { + /* Abutting or overlapping fragment. Ignore it. */ + gap = 0; + } + data->line_box = fz_union_rect(data->line_box, data->fragment_box); + data->fragment_box = fz_empty_rect; + if (gap < data->last_min_space) + return; + /* So we have a gap to consider */ + if (data->gap_count_this_line > 0) + { + /* Allow for double spaces, cos some layouts put + * double spaces before full stops. */ + if (fabs(gap - data->gap_size_this_line) > 1 && + fabs(gap/2.0 - data->gap_size_this_line) < 1) + gap /= 2; + if (fabs(gap - data->gap_size_this_line) > 1) + data->bad_gap = 1; + } + data->gap_size_this_line = (data->gap_size_this_line * data->gap_count_this_line + gap) / (data->gap_count_this_line + 1); + data->gap_count_this_line++; +} + +/* This is trickier than you'd imagine. We want to walk the line, looking + * for how large the spaces are. In a justified line, all the spaces should + * be pretty much the same size. (Except maybe before periods). But we want + * to cope with bidirectional text which can send glyphs in unexpected orders. + * e.g. abc fed ghi + * So we have to walk over "fragments" at a time. + */ +static int +justify_line(fz_context *ctx, fz_stext_block *block, fz_stext_line *line, void *arg) +{ + justify_data *data = (justify_data *)arg; + fz_stext_char *ch; + + for (ch = line->first_char; ch != NULL; ch = ch->next) + { + fz_rect r = fz_rect_from_quad(ch->quad); + float min_space = ch->size * 0.15f; /* Matches SPACE_DIST from stext-device. */ + + if (ch->c == ' ') + { + /* This ends a fragment, but we don't treat it as such. + * Just continue, because we'll end the fragment next time + * around the loop (this copes with trailing spaces, and + * multiple spaces, and gaps between 'lines' that are on + * the same line. */ + data->last_min_space = min_space; + continue; + } + if ((ch->c <= '0' || ch->c >= '9') && ch->c != '.') + data->non_digits_exist_in_this_line = 1; + if (!fz_is_empty_rect(data->fragment_box)) + { + if (r.x0 > data->fragment_box.x1 + data->last_min_space) + { + /* Fragment ends due to gap on right. */ + fragment_end(data); + } + else if (r.x1 < data->fragment_box.x0 - data->last_min_space) + { + /* Fragment ends due to gap on left. */ + fragment_end(data); + } + } + /* Extend the fragment */ + data->fragment_box = fz_union_rect(data->fragment_box, r); + data->last_min_space = min_space; + } + + return 0; +} + +static void +justify_end(fz_context *ctx, fz_stext_block *block, fz_stext_line *line, void *arg) +{ + justify_data *data = (justify_data *)arg; + + fragment_end(data); + data->line_box = fz_union_rect(data->line_box, data->fragment_box); + if (data->line_box.x0 < data->bbox.x0 + JUSTIFY_THRESHOLD && data->line_box.x1 > data->bbox.x1 - JUSTIFY_THRESHOLD && data->gap_count_this_line && data->non_digits_exist_in_this_line) + data->count_justified++; + data->count_lines++; +} + +static int +justify2_newline(fz_context *ctx, fz_stext_block *block, fz_stext_line *line, void *arg, float line_height) +{ + justify_data *data = (justify_data *)arg; + + if (data->line_box.x0 < data->bbox.x0 + JUSTIFY_THRESHOLD && data->line_box.x1 > data->bbox.x1 - JUSTIFY_THRESHOLD) + { + /* Justified */ + } + else + { + /* Break after line */ + (void)split_block_at_line(ctx, data->pos, block, line); + data->changed = 1; + return 1; + } + + data->line_box = fz_empty_rect; + + return 0; +} + +static int +justify2_line(fz_context *ctx, fz_stext_block *block, fz_stext_line *line, void *arg) +{ + justify_data *data = (justify_data *)arg; + fz_stext_char *ch; + + for (ch = line->first_char; ch != NULL; ch = ch->next) + { + if (ch->c == ' ') + continue; + + data->line_box = fz_union_rect(data->line_box, fz_rect_from_quad(ch->quad)); + } + + return 0; +} + +static fz_rect +text_block_marked_bbox(fz_context *ctx, fz_stext_block *block) +{ + fz_stext_line *line; + fz_stext_char *ch; + fz_rect r = fz_empty_rect; + + for (line = block->u.t.first_line; line != NULL; line = line->next) + { + for (ch = line->first_char; ch != NULL; ch = ch->next) + { + if (ch->c == ' ') + continue; + r = fz_union_rect(r, fz_rect_from_quad(ch->quad)); + } + } + + return r; +} + +static int +break_paragraphs_within_justified_text(fz_context *ctx, stext_pos *pos, fz_stext_block *block, fz_rect bbox) +{ + justify_data data[1]; + + if (block->u.t.flags != FZ_STEXT_TEXT_JUSTIFY_UNKNOWN) + return 0; + + data->bbox = bbox; + + data->pos = pos; + data->count_lines = 0; + data->count_justified = 0; + data->non_digits_exist_in_this_line = 0; + data->bad_gap = 0; + data->gap_size_this_line = 0; + data->gap_count_this_line = 0; + data->fragment_box = fz_empty_rect; + data->line_box = fz_empty_rect; + data->xmin = INFINITY; + data->xmax = -INFINITY; + data->changed = 0; + + line_walker(ctx, block, justify_newline, justify_line, justify_end, data); + + /* We can't really derive anything about single lines! */ + if (data->count_lines < 2) + return 0; + /* If at least half of the lines don't appear to be justified, then + * don't trust 'em. */ + if (data->count_justified * 2 < data->count_lines) + return 0; + /* If the "badness" we've seen to do with big gaps (i.e. how much + * bigger the gaps are than we'd reasonably expect) is too large + * then we can't be a justified block. We are prepared to forgive + * larger sizes in larger paragraphs. */ + if (data->bad_gap) + return 0; + block->u.t.flags = FZ_STEXT_TEXT_JUSTIFY_FULL; + + line_walker(ctx, block, justify2_newline, justify2_line, NULL, data); + + return data->changed; +} + +typedef enum +{ + LOOKING_FOR_BULLET = 0, + LOOKING_FOR_POST_BULLET = 1, + LOOKING_FOR_POST_NUMERICAL_BULLET = 2, + FOUND_BULLET = 3, + CONTINUATION_LINE = 4, + NO_BULLET = 5 +} list_state; + +typedef struct +{ + stext_pos *pos; + list_state state; + int buffer[10]; + int buffer_fill; + float bullet_r; + float post_bullet_indent; + float l; + fz_stext_line *bullet_line_start; + fz_stext_line *this_line_start; + int changed; +} list_data; + +static int +list_newline(fz_context *ctx, fz_stext_block *block, fz_stext_line *line, void *arg, float line_height) +{ + list_data *data = (list_data *)arg; + + if (data->state == FOUND_BULLET) + { + if (block->u.t.first_line != data->bullet_line_start && data->state == FOUND_BULLET) + { + /* We need to split the block before the bullet started. */ + (void)split_block_at_line(ctx, data->pos, block, data->bullet_line_start); + data->changed = 1; + return 1; + } + if (data->bullet_line_start != data->this_line_start) + { + /* We've found a second bullet. Break before the previous line. */ + (void)split_block_at_line(ctx, data->pos, block, data->this_line_start); + block_to_struct(ctx, data->pos, block, FZ_STRUCTURE_LISTITEM); + data->changed = 1; + return 1; + } + } + else if (data->state == NO_BULLET && data->bullet_line_start) + { + /* We've found a bullet before, and the line we've just completed + * is neither a new bullet line, or a continuation so, we need to + * break that into a new block. */ + (void)split_block_at_line(ctx, data->pos, block, data->this_line_start); + block_to_struct(ctx, data->pos, block, FZ_STRUCTURE_LISTITEM); + data->changed = 1; + return 1; + } + + data->this_line_start = line; + data->state = LOOKING_FOR_BULLET; + data->buffer_fill = 0; + data->l = block->bbox.x1; + data->bullet_r = block->bbox.x0; + + return 0; +} + +static int +approx_eq(float a, float b, float c) +{ + return fabs(a - b) <= c; +} + +static int +is_roman(int c) +{ + switch (c) + { + case 'm': case 'M': + case 'c': case 'C': + case 'l': case 'L': + case 'x': case 'X': + case 'v': case 'V': + case 'i': case 'I': + return 1; + } + return 0; +} + +typedef enum { + NOT_A_BULLET, + BULLET, + NUMERICAL_BULLET +} bullet_t; + +static bullet_t +is_bullet_aux(int *buffer, int len, int contained) +{ + int i, decimal_pos, decimals_found; + + if (len == 1 && ( + buffer[0] == '*' || + buffer[0] == 0x00B7 || /* Middle Dot */ + buffer[0] == 0x2022 || /* Bullet */ + buffer[0] == 0x2023 || /* Triangular Bullet */ + buffer[0] == 0x2043 || /* Hyphen Bullet */ + buffer[0] == 0x204C || /* Back leftwards bullet */ + buffer[0] == 0x204D || /* Back rightwards bullet */ + buffer[0] == 0x2219 || /* Bullet operator */ + buffer[0] == 0x25C9 || /* Fisheye */ + buffer[0] == 0x25CB || /* White circle */ + buffer[0] == 0x25CF || /* Black circle */ + buffer[0] == 0x25D8 || /* Inverse Bullet */ + buffer[0] == 0x25E6 || /* White Bullet */ + buffer[0] == 0x2619 || /* Reversed Rotated Floral Heart Bullet / Fleuron */ + buffer[0] == 0x261a || /* Black left pointing index */ + buffer[0] == 0x261b || /* Black right pointing index */ + buffer[0] == 0x261c || /* White left pointing index */ + buffer[0] == 0x261d || /* White up pointing index */ + buffer[0] == 0x261e || /* White right pointing index */ + buffer[0] == 0x261f || /* White down pointing index */ + buffer[0] == 0x2765 || /* Rotated Heavy Heart Black Heart Bullet */ + buffer[0] == 0x2767 || /* Rotated Floral Heart Bullet / Fleuron */ + buffer[0] == 0x29BE || /* Circled White Bullet */ + buffer[0] == 0x29BF || /* Circled Bullet */ + buffer[0] == 0x2660 || /* Black Spade suit */ + buffer[0] == 0x2661 || /* White Heart suit */ + buffer[0] == 0x2662 || /* White Diamond suit */ + buffer[0] == 0x2663 || /* Black Club suit */ + buffer[0] == 0x2664 || /* White Spade suit */ + buffer[0] == 0x2665 || /* Black Heart suit */ + buffer[0] == 0x2666 || /* Black Diamond suit */ + buffer[0] == 0x2667 || /* White Clud suit */ + buffer[0] == 0x1F446 || /* WHITE UP POINTING BACKHAND INDEX */ + buffer[0] == 0x1F447 || /* WHITE DOWN POINTING BACKHAND INDEX */ + buffer[0] == 0x1F448 || /* WHITE LEFT POINTING BACKHAND INDEX */ + buffer[0] == 0x1F449 || /* WHITE RIGHT POINTING BACKHAND INDEX */ + buffer[0] == 0x1f597 || /* White down pointing left hand index */ + buffer[0] == 0x1F598 || /* SIDEWAYS WHITE LEFT POINTING INDEX */ + buffer[0] == 0x1F599 || /* SIDEWAYS WHITE RIGHT POINTING INDEX */ + buffer[0] == 0x1F59A || /* SIDEWAYS BLACK LEFT POINTING INDEX */ + buffer[0] == 0x1F59B || /* SIDEWAYS BLACK RIGHT POINTING INDEX */ + buffer[0] == 0x1F59C || /* BLACK LEFT POINTING BACKHAND INDEX */ + buffer[0] == 0x1F59D || /* BLACK RIGHT POINTING BACKHAND INDEX */ + buffer[0] == 0x1F59E || /* SIDEWAYS WHITE UP POINTING INDEX */ + buffer[0] == 0x1F59F || /* SIDEWAYS WHITE DOWN POINTING INDEX */ + buffer[0] == 0x1F5A0 || /* SIDEWAYS BLACK UP POINTING INDEX */ + buffer[0] == 0x1F5A1 || /* SIDEWAYS BLACK DOWN POINTING INDEX */ + buffer[0] == 0x1F5A2 || /* BLACK UP POINTING BACKHAND INDEX */ + buffer[0] == 0x1F5A3 || /* BLACK DOWN POINTING BACKHAND INDEX */ + buffer[0] == 0x1FBC1 || /* LEFT THIRD WHITE RIGHT POINTING INDEX */ + buffer[0] == 0x1FBC2 || /* MIDDLE THIRD WHITE RIGHT POINTING INDEX */ + buffer[0] == 0x1FBC3 || /* RIGHT THIRD WHITE RIGHT POINTING INDEX */ + buffer[0] == 0xFFFD || /* UNICODE_REPLACEMENT_CHARACTER */ + 0)) + return BULLET; + + if (!contained) + { + if (len > 2 && buffer[0] == '(' && buffer[len-1] == ')') + return is_bullet_aux(buffer+1, len-2, 1) ? BULLET : NOT_A_BULLET; + if (len > 2 && buffer[0] == '<' && buffer[len-1] == '>') + return is_bullet_aux(buffer+1, len-2, 1) ? BULLET : NOT_A_BULLET; + if (len > 2 && buffer[0] == '[' && buffer[len-1] == ']') + return is_bullet_aux(buffer+1, len-2, 1) ? BULLET : NOT_A_BULLET; + if (len > 2 && buffer[0] == '{' && buffer[len-1] == '}') + return is_bullet_aux(buffer+1, len-2, 1) ? BULLET : NOT_A_BULLET; + + if (len > 1 && buffer[len-1] == ':') + return is_bullet_aux(buffer, len-1, 1) ? BULLET : NOT_A_BULLET; + if (len > 1 && buffer[len-1] == ')') + return is_bullet_aux(buffer, len-1, 1) ? BULLET : NOT_A_BULLET; + } + + /* Look for numbers */ + /* Be careful not to interpret rows of numbers, like: + * 10.02 12.03 + * as bullets. + */ + decimal_pos = 0; + decimals_found = 0; + for (i = 0; i < len; i++) + { + if (buffer[i] >= '0' && buffer[i] <= '9') + { + } + else if (buffer[i] == '.') + { + decimal_pos = i; + decimals_found++; + } + else + break; + } + if (i == len && decimals_found <= 1) + return NUMERICAL_BULLET; + /* or number.something */ + if (decimals_found && i == decimal_pos+1 && i < len) + return is_bullet_aux(buffer+i, len-i, 0) ? BULLET : NOT_A_BULLET;; + + /* Look for roman */ + for (i = 0; i < len; i++) + if (!is_roman(buffer[i])) + break; + if (i == len) + return 1; + /* or roman.something */ + if (buffer[i] == '.' && i < len-1) + return is_bullet_aux(buffer+i+1, len-i-1, 0) ? BULLET : NOT_A_BULLET; + + /* FIXME: Others. */ + return NOT_A_BULLET; +} + +static bullet_t +is_bullet(int *buffer, int len) +{ + return is_bullet_aux(buffer, len, 0); +} + +static int +eval_buffer_for_bullet(fz_context *ctx, list_data *data, float size) +{ + bullet_t bullet_type; + + bullet_type = is_bullet(data->buffer, data->buffer_fill); + if (bullet_type == NUMERICAL_BULLET) + data->state = LOOKING_FOR_POST_NUMERICAL_BULLET; + else if (bullet_type) + data->state = LOOKING_FOR_POST_BULLET; + else + { + if (approx_eq(data->l, data->post_bullet_indent, size/2)) + data->state = CONTINUATION_LINE; + else + data->state = NO_BULLET; + return 1; + } + return 0; +} + +static int +list_line(fz_context *ctx, fz_stext_block *block, fz_stext_line *line, void *arg) +{ + list_data *data = (list_data *)arg; + fz_stext_char *ch; + + for (ch = line->first_char; ch != NULL; ch = ch->next) + { + fz_rect r = fz_rect_from_quad(ch->quad); + + if (r.x0 < data->l) + data->l = line->bbox.x0; + + switch (data->state) + { + case LOOKING_FOR_BULLET: + if (ch->c == ' ') + { + /* We have a space */ + if (data->buffer_fill == 0) + continue; /* Just skip leading spaces */ + if (eval_buffer_for_bullet(ctx, data, ch->size)) + return 0; + } + else if (data->buffer_fill > 0 && r.x0 - data->bullet_r > ch->size/2) + { + /* We have a gap large enough to be a space while we've + * got something in the buffer. */ + if (eval_buffer_for_bullet(ctx, data, ch->size)) + return 0; + } + else if (data->buffer_fill < (int)nelem(data->buffer)) + { + /* Stick it in the buffer for evaluation later. */ + data->buffer[data->buffer_fill++] = ch->c; + } + else + { + /* Buffer overflowed. Can't be a bullet. */ + if (approx_eq(data->l, data->post_bullet_indent, ch->size)) + data->state = CONTINUATION_LINE; + else + data->state = NO_BULLET; + return 0; + } + data->bullet_r = r.x1; + break; + case LOOKING_FOR_POST_BULLET: + if (ch->c != ' ') + { + data->state = FOUND_BULLET; + if (data->bullet_line_start == NULL) + data->bullet_line_start = data->this_line_start; + data->post_bullet_indent = r.x0; + } + break; + case LOOKING_FOR_POST_NUMERICAL_BULLET: + if (ch->c >= '0' && ch->c <= '9') + { + /* Numerical bullets can't be followed by numbers. */ + if (approx_eq(data->l, data->post_bullet_indent, ch->size)) + data->state = CONTINUATION_LINE; + else + data->state = NO_BULLET; + return 0; + } + if (ch->c != ' ') + { + data->state = FOUND_BULLET; + if (data->bullet_line_start == NULL) + data->bullet_line_start = data->this_line_start; + data->post_bullet_indent = r.x0; + } + break; + default: + break; + } + } + + return 0; +} + +static void +list_end(fz_context *ctx, fz_stext_block *block, fz_stext_line *line, void *arg) +{ + list_data *data = (list_data *)arg; + + if (data->state == LOOKING_FOR_BULLET) + { + eval_buffer_for_bullet(ctx, data, 0); + /* If we ended up thinking we'd found a bullet, subject to + * what follows not being of a specific form, then we're + * fine, because nothing follows us! */ + if (data->state == LOOKING_FOR_POST_NUMERICAL_BULLET || + data->state == LOOKING_FOR_POST_BULLET) + { + data->state = FOUND_BULLET; + if (data->bullet_line_start == NULL) + data->bullet_line_start = data->this_line_start; + } + /* FIXME: This block contains just a bullet - not the content + * for the bullet. We see this with page-12.pdf. + * <> Rising commitment to battery... + * committed to in-house battery... + * developing and manufacturing... + * + * The <> is in a whole different DIV to the following text. + * Really we want to look for if the "next" content (for some + * definition of next) is on the same line as the bullet. If + * it is, we want to merge the 2 divs. + * + * But that's a really tricky thing to do given the recursive + * block walk we are current doing. Think about this. + * For now, we just mark the <> as being a list item. + */ + } + if (data->state == FOUND_BULLET) + { + if (block->u.t.first_line != data->bullet_line_start && data->state == FOUND_BULLET) + { + /* We need to split the block before the start of the bullet. */ + (void)split_block_at_line(ctx, data->pos, block, data->bullet_line_start); + data->changed = 1; + return; + } + if (data->bullet_line_start != data->this_line_start) + { + /* We've found a second bullet. Break before the line. */ + (void)split_block_at_line(ctx, data->pos, block, data->this_line_start); + block_to_struct(ctx, data->pos, block, FZ_STRUCTURE_LISTITEM); + data->changed = 1; + return; + } + block_to_struct(ctx, data->pos, block, FZ_STRUCTURE_LISTITEM); + } + else if (data->state == NO_BULLET && data->bullet_line_start) + { + /* We've found a bullet before, and the line we've just completed + * is neither a new bullet line, or a continuation so, we need to + * break that into a new block. */ + (void)split_block_at_line(ctx, data->pos, block, data->this_line_start); + block_to_struct(ctx, data->pos, block, FZ_STRUCTURE_LISTITEM); + data->changed = 1; + return; + } + else if (data->bullet_line_start) + { + /* We've come to the end of the block still in the list item. */ + block_to_struct(ctx, data->pos, block, FZ_STRUCTURE_LISTITEM); + } +} + +static int +break_list_items(fz_context *ctx, stext_pos *pos, fz_stext_block *block) +{ + list_data data[1]; + + if (block->u.t.flags != FZ_STEXT_TEXT_JUSTIFY_UNKNOWN) + return 0; + + data->pos = pos; + data->state = LOOKING_FOR_BULLET; + data->buffer_fill = 0; + data->l = block->bbox.x1; + data->bullet_line_start = NULL; + data->this_line_start = block->u.t.first_line; + data->bullet_r = block->bbox.x0; + data->changed = 0; + + line_walker(ctx, block, list_newline, list_line, list_end, data); + + return data->changed; +} + +static int +is_header(fz_structure s) +{ + return (s == FZ_STRUCTURE_H || + s == FZ_STRUCTURE_H1 || + s == FZ_STRUCTURE_H2 || + s == FZ_STRUCTURE_H3 || + s == FZ_STRUCTURE_H4 || + s == FZ_STRUCTURE_H5 || + s == FZ_STRUCTURE_H6); +} + +static void +do_para_break(fz_context *ctx, fz_stext_page *page, fz_stext_block **pfirst, fz_stext_block **plast, fz_stext_struct *parent, int in_header) +{ + fz_stext_block *block, *next_block; + stext_pos pos; + fz_rect bbox; + + pos.pool = page->pool; + pos.idx = 0; + pos.pfirst = pfirst; + pos.plast = plast; + pos.parent = parent; + + /* First off, in order for us to consider a block to be suitable for paragraph + * splitting, we want it to be a series of lines moving down the page, (or left + * to right within a line). */ + for (block = *pfirst; block != NULL; block = next_block) + { + next_block = block->next; + + switch (block->type) + { + case FZ_STEXT_BLOCK_STRUCT: + if (block->u.s.index < pos.idx) + block->u.s.index = pos.idx++; + else + pos.idx = block->u.s.index+1; + if (block->u.s.down) + { + int header = in_header | is_header(block->u.s.down->standard); + do_para_break(ctx, page, &block->u.s.down->first_block, &block->u.s.down->last_block, block->u.s.down, header); + } + break; + case FZ_STEXT_BLOCK_TEXT: + if (!lines_move_plausibly_like_paragraph(block)) + break; + +#ifdef DEBUG_SPLITS + dump_block(ctx, "Around the top level block loop:", block); +#endif + + /* Firstly, and somewhat annoyingly we need to find the bbox of the + * block that doesn't include for trailing spaces. If we just use + * the normal bbox, then lines that end in "foo " will end further + * to the right of lines that end in "ba-", and consequently we'll + * fail to detect blocks as being justified. + * See PMC2656817_00002.pdf as an example. */ + bbox = text_block_marked_bbox(ctx, block); + +#ifdef DEBUG_PARA_SPLITS + { + fz_stext_line *line; + + for (line = block->u.t.first_line; line != NULL; line = line->next) + { + fz_stext_char *ch; + + for (ch = line->first_char; ch != NULL; ch = ch->next) + { + fz_write_printf(ctx, fz_stddbg(ctx), "%C", ch->c); + } + } + } +#endif + + /* Think about breaking lines at Titles. */ + /* First, underlined ones. */ + if (detect_underlined_titles(ctx, &pos, block)) + next_block = block->next; /* We split the block! */ + if (block->type != FZ_STEXT_BLOCK_TEXT) + { + next_block = block; + break; + } + +#ifdef DEBUG_PARA_SPLITS + fz_write_printf(ctx, fz_stddbg(ctx), "A"); +#endif + + /* Next, ones that use bold fonts. */ + if (!in_header) + { + if (detect_titles_by_font_usage(ctx, &pos, block)) + next_block = block->next; /* We split the block! */ + if (block->type != FZ_STEXT_BLOCK_TEXT) + { + next_block = block; + break; + } + } + +#ifdef DEBUG_PARA_SPLITS + fz_write_printf(ctx, fz_stddbg(ctx), "B"); +#endif + + /* Now look at breaking based upon indents */ + if (break_paragraphs_by_indent(ctx, &pos, block, bbox)) + next_block = block->next; /* We split the block! */ + if (block->type != FZ_STEXT_BLOCK_TEXT) + { + next_block = block; + break; + } + +#ifdef DEBUG_PARA_SPLITS + fz_write_printf(ctx, fz_stddbg(ctx), "C"); +#endif + + /* Now we're going to look for unindented paragraphs. We do this by + * considering if the first word on the next line would have fitted + * into the space left at the end of the previous line. */ + if (break_paragraphs_by_analysing_trailing_gaps(ctx, &pos, block, bbox)) + next_block = block->next; /* We split the block! */ + if (block->type != FZ_STEXT_BLOCK_TEXT) + { + next_block = block; + break; + } + +#ifdef DEBUG_PARA_SPLITS + fz_write_printf(ctx, fz_stddbg(ctx), "D"); +#endif + + /* Now look to see if a block looks like fully justified text. If it + * does, then any line that doesn't reach the right hand side must be + * a paragraph break. */ + if (break_paragraphs_within_justified_text(ctx, &pos, block, bbox)) + next_block = block->next; /* We split the block! */ + if (block->type != FZ_STEXT_BLOCK_TEXT) + { + next_block = block; + break; + } + +#ifdef DEBUG_PARA_SPLITS + fz_write_printf(ctx, fz_stddbg(ctx), "E"); +#endif + + /* Look for bulleted list items. */ + if (break_list_items(ctx, &pos, block)) + next_block = block->next; /* We split the block! */ + + break; + } + } +} + +void +fz_paragraph_break(fz_context *ctx, fz_stext_page *page) +{ + do_para_break(ctx, page, &page->first_block, &page->last_block, NULL, 0); +}
