Mercurial > hgrepos > Python2 > PyMuPDF
diff mupdf-source/source/fitz/stext-search.c @ 2:b50eed0cc0ef upstream
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
The directory name has changed: no version number in the expanded directory now.
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Mon, 15 Sep 2025 11:43:07 +0200 |
| parents | |
| children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/mupdf-source/source/fitz/stext-search.c Mon Sep 15 11:43:07 2025 +0200 @@ -0,0 +1,782 @@ +// Copyright (C) 2004-2024 Artifex Software, Inc. +// +// This file is part of MuPDF. +// +// MuPDF is free software: you can redistribute it and/or modify it under the +// terms of the GNU Affero General Public License as published by the Free +// Software Foundation, either version 3 of the License, or (at your option) +// any later version. +// +// MuPDF is distributed in the hope that it will be useful, but WITHOUT ANY +// WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +// FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more +// details. +// +// You should have received a copy of the GNU Affero General Public License +// along with MuPDF. If not, see <https://www.gnu.org/licenses/agpl-3.0.en.html> +// +// Alternative licensing terms are available from the licensor. +// For commercial licensing, see <https://www.artifex.com/> or contact +// Artifex Software, Inc., 39 Mesa Street, Suite 108A, San Francisco, +// CA 94129, USA, for further information. + +#include "mupdf/fitz.h" + +#include <string.h> +#include <limits.h> +#include <assert.h> + +/* Enumerate marked selection */ + +static float hdist(fz_point *dir, fz_point *a, fz_point *b) +{ + float dx = b->x - a->x; + float dy = b->y - a->y; + return fz_abs(dx * dir->x - dy * dir->y); +} + +static float vdist(fz_point *dir, fz_point *a, fz_point *b) +{ + float dx = b->x - a->x; + float dy = b->y - a->y; + return fz_abs(dx * dir->y - dy * dir->x); +} + +static float vecdot(fz_point a, fz_point b) +{ + return a.x * b.x + a.y * b.y; +} + +static float linedist(fz_point origin, fz_point dir, fz_point q) +{ + return vecdot(dir, fz_make_point(q.x - origin.x, q.y - origin.y)); +} + +static int line_length(fz_stext_line *line) +{ + fz_stext_char *ch; + int n = 0; + for (ch = line->first_char; ch; ch = ch->next) + ++n; + return n; +} + +static float largest_size_in_line(fz_stext_line *line) +{ + fz_stext_char *ch; + float size = 0; + for (ch = line->first_char; ch; ch = ch->next) + if (ch->size > size) + size = ch->size; + return size; +} + +static int find_closest_in_line(fz_stext_line *line, int idx, fz_point q) +{ + fz_stext_char *ch; + float closest_dist = 1e30f; + int closest_idx = idx; + float d1, d2; + + float hsize = largest_size_in_line(line) / 2; + fz_point vdir = fz_make_point(-line->dir.y, line->dir.x); + fz_point hdir = line->dir; + + // Compute mid-line from quads! + fz_point p1 = fz_make_point( + (line->first_char->quad.ll.x + line->first_char->quad.ul.x) / 2, + (line->first_char->quad.ll.y + line->first_char->quad.ul.y) / 2 + ); + + // Signed distance perpendicular mid-line (positive is below) + float vdist = linedist(p1, vdir, q); + if (vdist < -hsize) + return idx; + if (vdist > hsize) + return idx + line_length(line); + + for (ch = line->first_char; ch; ch = ch->next) + { + if (ch->bidi & 1) + { + d1 = fz_abs(linedist(ch->quad.lr, hdir, q)); + d2 = fz_abs(linedist(ch->quad.ll, hdir, q)); + } + else + { + d1 = fz_abs(linedist(ch->quad.ll, hdir, q)); + d2 = fz_abs(linedist(ch->quad.lr, hdir, q)); + } + + if (d1 < closest_dist) + { + closest_dist = d1; + closest_idx = idx; + } + + if (d2 < closest_dist) + { + closest_dist = d2; + closest_idx = idx + 1; + } + + ++idx; + } + + return closest_idx; +} + +static int find_closest_in_page(fz_stext_page *page, fz_point q) +{ + fz_stext_block *block; + fz_stext_line *line; + fz_stext_line *closest_line = NULL; + int closest_idx = 0; + float closest_vdist = 1e30f; + float closest_hdist = 1e30f; + int idx = 0; + + for (block = page->first_block; block; block = block->next) + { + if (block->type != FZ_STEXT_BLOCK_TEXT) + continue; + for (line = block->u.t.first_line; line; line = line->next) + { + float hsize = largest_size_in_line(line) / 2; + fz_point hdir = line->dir; + fz_point vdir = fz_make_point(-line->dir.y, line->dir.x); + + // Compute mid-line from quads! + fz_point p1 = fz_make_point( + (line->first_char->quad.ll.x + line->first_char->quad.ul.x) / 2, + (line->first_char->quad.ll.y + line->first_char->quad.ul.y) / 2 + ); + fz_point p2 = fz_make_point( + (line->last_char->quad.lr.x + line->last_char->quad.ur.x) / 2, + (line->last_char->quad.lr.y + line->last_char->quad.ur.y) / 2 + ); + + // Signed distance perpendicular mid-line (positive is below) + float vdist = linedist(p1, vdir, q); + + // Signed distance tangent to mid-line from end points (positive is to end) + float hdist1 = linedist(p1, hdir, q); + float hdist2 = linedist(p2, hdir, q); + + // Within the line itself! + if (vdist >= -hsize && vdist <= hsize && (hdist1 > 0) != (hdist2 > 0)) + { + closest_vdist = 0; + closest_hdist = 0; + closest_line = line; + closest_idx = idx; + } + else + { + // Vertical distance from mid-line. + float avdist = fz_abs(vdist); + + // Horizontal distance from closest end-point + float ahdist = fz_min(fz_abs(hdist1), fz_abs(hdist2)); + + if (avdist < hsize) + { + // Within extended line + if (ahdist <= closest_hdist) + { + closest_vdist = 0; + closest_hdist = ahdist; + closest_line = line; + closest_idx = idx; + } + } + else + { + // Outside line + // TODO: closest column? + if (avdist <= closest_vdist) + { + closest_vdist = avdist; + closest_line = line; + closest_idx = idx; + } + } + } + + idx += line_length(line); + } + } + + if (closest_line) + return find_closest_in_line(closest_line, closest_idx, q); + + return 0; +} + +struct callbacks +{ + void (*on_char)(fz_context *ctx, void *arg, fz_stext_line *ln, fz_stext_char *ch); + void (*on_line)(fz_context *ctx, void *arg, fz_stext_line *ln); + void *arg; +}; + +static void +fz_enumerate_selection(fz_context *ctx, fz_stext_page *page, fz_point a, fz_point b, struct callbacks *cb) +{ + fz_stext_block *block; + fz_stext_line *line; + fz_stext_char *ch; + int idx, start, end; + int inside; + + start = find_closest_in_page(page, a); + end = find_closest_in_page(page, b); + + if (start > end) + idx = start, start = end, end = idx; + + if (start == end) + return; + + inside = 0; + idx = 0; + for (block = page->first_block; block; block = block->next) + { + if (block->type != FZ_STEXT_BLOCK_TEXT) + continue; + for (line = block->u.t.first_line; line; line = line->next) + { + for (ch = line->first_char; ch; ch = ch->next) + { + if (!inside) + if (idx == start) + inside = 1; + if (inside) + cb->on_char(ctx, cb->arg, line, ch); + if (++idx == end) + return; + } + if (inside) + cb->on_line(ctx, cb->arg, line); + } + } +} + +fz_quad +fz_snap_selection(fz_context *ctx, fz_stext_page *page, fz_point *a, fz_point *b, int mode) +{ + fz_stext_block *block; + fz_stext_line *line; + fz_stext_char *ch; + fz_quad handles; + int idx, start, end; + int pc; + + start = find_closest_in_page(page, *a); + end = find_closest_in_page(page, *b); + + if (start > end) + idx = start, start = end, end = idx; + + handles.ll = handles.ul = *a; + handles.lr = handles.ur = *b; + + idx = 0; + for (block = page->first_block; block; block = block->next) + { + if (block->type != FZ_STEXT_BLOCK_TEXT) + continue; + for (line = block->u.t.first_line; line; line = line->next) + { + pc = '\n'; + for (ch = line->first_char; ch; ch = ch->next) + { + if (idx <= start) + { + if (mode == FZ_SELECT_CHARS + || (mode == FZ_SELECT_WORDS && (pc == ' ' || pc == '\n')) + || (mode == FZ_SELECT_LINES && (pc == '\n'))) + { + handles.ll = ch->quad.ll; + handles.ul = ch->quad.ul; + *a = ch->origin; + } + } + if (idx >= end) + { + if (mode == FZ_SELECT_CHARS + || (mode == FZ_SELECT_WORDS && (ch->c == ' '))) + { + handles.lr = ch->quad.ll; + handles.ur = ch->quad.ul; + *b = ch->origin; + return handles; + } + if (!ch->next) + { + handles.lr = ch->quad.lr; + handles.ur = ch->quad.ur; + *b = ch->quad.lr; + return handles; + } + } + pc = ch->c; + ++idx; + } + } + } + + return handles; +} + +/* Highlight selection */ + +struct highlight +{ + int len, cap; + fz_quad *box; + float hfuzz, vfuzz; +}; + +int same_point(fz_point a, fz_point b) +{ + int dx = fz_abs(a.x - b.x); + int dy = fz_abs(a.y - b.y); + return (dx < 0.1 && dy < 0.1); +} + +int is_near(float hfuzz, float vfuzz, fz_point hdir, fz_point end, fz_point p1, fz_point p2) +{ + float v = fz_abs(linedist(end, fz_make_point(-hdir.y, hdir.x), p1)); + float d1 = fz_abs(linedist(end, hdir, p1)); + float d2 = fz_abs(linedist(end, hdir, p2)); + return (v < vfuzz && d1 < hfuzz && d1 < d2); +} + +static void on_highlight_char(fz_context *ctx, void *arg, fz_stext_line *line, fz_stext_char *ch) +{ + struct highlight *hits = arg; + float vfuzz = hits->vfuzz * ch->size; + float hfuzz = hits->hfuzz * ch->size; + fz_point dir = line->dir; + + // Skip zero-extent quads + if (same_point(ch->quad.ll, ch->quad.lr)) + return; + + if (hits->len > 0) + { + fz_quad *end = &hits->box[hits->len-1]; + + if (is_near(hfuzz, vfuzz, dir, end->lr, ch->quad.ll, ch->quad.lr) && + is_near(hfuzz, vfuzz, dir, end->ur, ch->quad.ul, ch->quad.ur)) + { + end->ur = ch->quad.ur; + end->lr = ch->quad.lr; + return; + } + + if (is_near(hfuzz, vfuzz, dir, end->ll, ch->quad.lr, ch->quad.ll) && + is_near(hfuzz, vfuzz, dir, end->ul, ch->quad.ur, ch->quad.ul)) + { + end->ul = ch->quad.ul; + end->ll = ch->quad.ll; + return; + } + } + + if (hits->len < hits->cap) + hits->box[hits->len++] = ch->quad; +} + +static void on_highlight_line(fz_context *ctx, void *arg, fz_stext_line *line) +{ +} + +int +fz_highlight_selection(fz_context *ctx, fz_stext_page *page, fz_point a, fz_point b, fz_quad *quads, int max_quads) +{ + struct callbacks cb; + struct highlight hits; + + hits.len = 0; + hits.cap = max_quads; + hits.box = quads; + hits.hfuzz = 0.5f; /* merge large gaps */ + hits.vfuzz = 0.1f; + + cb.on_char = on_highlight_char; + cb.on_line = on_highlight_line; + cb.arg = &hits; + + fz_enumerate_selection(ctx, page, a, b, &cb); + + return hits.len; +} + +/* Copy selection */ + +static void on_copy_char(fz_context *ctx, void *arg, fz_stext_line *line, fz_stext_char *ch) +{ + fz_buffer *buffer = arg; + int c = ch->c; + if (c < 32) + c = FZ_REPLACEMENT_CHARACTER; + fz_append_rune(ctx, buffer, c); +} + +static void on_copy_line_crlf(fz_context *ctx, void *arg, fz_stext_line *line) +{ + fz_buffer *buffer = arg; + fz_append_byte(ctx, buffer, '\r'); + fz_append_byte(ctx, buffer, '\n'); +} + +static void on_copy_line_lf(fz_context *ctx, void *arg, fz_stext_line *line) +{ + fz_buffer *buffer = arg; + fz_append_byte(ctx, buffer, '\n'); +} + +char * +fz_copy_selection(fz_context *ctx, fz_stext_page *page, fz_point a, fz_point b, int crlf) +{ + struct callbacks cb; + fz_buffer *buffer; + unsigned char *s; + + buffer = fz_new_buffer(ctx, 1024); + fz_try(ctx) + { + cb.on_char = on_copy_char; + cb.on_line = crlf ? on_copy_line_crlf : on_copy_line_lf; + cb.arg = buffer; + + fz_enumerate_selection(ctx, page, a, b, &cb); + fz_terminate_buffer(ctx, buffer); + } + fz_catch(ctx) + { + fz_drop_buffer(ctx, buffer); + fz_rethrow(ctx); + } + fz_buffer_extract(ctx, buffer, &s); /* take over the data */ + fz_drop_buffer(ctx, buffer); + return (char*)s; +} + +char * +fz_copy_rectangle(fz_context *ctx, fz_stext_page *page, fz_rect area, int crlf) +{ + fz_stext_block *block; + fz_stext_line *line; + fz_stext_char *ch; + fz_buffer *buffer; + unsigned char *s; + + int need_new_line = 0; + + buffer = fz_new_buffer(ctx, 1024); + fz_try(ctx) + { + for (block = page->first_block; block; block = block->next) + { + if (block->type != FZ_STEXT_BLOCK_TEXT) + continue; + for (line = block->u.t.first_line; line; line = line->next) + { + int line_had_text = 0; + for (ch = line->first_char; ch; ch = ch->next) + { + fz_rect r = fz_rect_from_quad(ch->quad); + if (!fz_is_empty_rect(fz_intersect_rect(r, area))) + { + line_had_text = 1; + if (need_new_line) + { + fz_append_string(ctx, buffer, crlf ? "\r\n" : "\n"); + need_new_line = 0; + } + fz_append_rune(ctx, buffer, ch->c < 32 ? FZ_REPLACEMENT_CHARACTER : ch->c); + } + } + if (line_had_text) + need_new_line = 1; + } + } + fz_terminate_buffer(ctx, buffer); + } + fz_catch(ctx) + { + fz_drop_buffer(ctx, buffer); + fz_rethrow(ctx); + } + + fz_buffer_extract(ctx, buffer, &s); /* take over the data */ + fz_drop_buffer(ctx, buffer); + return (char*)s; +} + +/* String search */ + +static inline int canon(int c) +{ + // Map full-width ASCII forms to ASCII: + // U+FF01 .. U+FF5E => U+0021 .. U+007E + if (c >= 0xFF01 && c <= 0xFF5E) + c = c - 0xFF01 + 0x21; + + if (c == 0xA0 || c == 0x2028 || c == 0x2029) + return ' '; + if (c == '\r' || c == '\n' || c == '\t') + return ' '; + + return fz_toupper(c); +} + +static inline int chartocanon(int *c, const char *s) +{ + int n = fz_chartorune(c, s); + *c = canon(*c); + return n; +} + +static const char *match_string(const char *h, const char *n) +{ + int hc, nc; + const char *e = h; + h += chartocanon(&hc, h); + n += chartocanon(&nc, n); + while (hc == nc) + { + e = h; + if (hc == ' ') + do + h += chartocanon(&hc, h); + while (hc == ' '); + else + h += chartocanon(&hc, h); + if (nc == ' ') + do + n += chartocanon(&nc, n); + while (nc == ' '); + else + n += chartocanon(&nc, n); + } + return nc == 0 ? e : NULL; +} + +static const char *find_string(const char *s, const char *needle, const char **endp) +{ + const char *end; + while (*s) + { + end = match_string(s, needle); + if (end) + return *endp = end, s; + ++s; + } + return *endp = NULL, NULL; +} + +struct search_data +{ + /* Number of hits so far.*/ + int count_quads; + int count_hits; + int max_quads; + int quad_fill; + fz_quad locals[32]; + fz_quad *quads; + float hfuzz, vfuzz; + fz_search_callback_fn *cb; + void *opaque; +}; + +static int hit_char(fz_context *ctx, struct search_data *hits, fz_stext_line *line, fz_stext_char *ch, int is_at_start) +{ + float vfuzz = ch->size * hits->vfuzz; + float hfuzz = ch->size * hits->hfuzz; + + /* Can we continue an existing quad? */ + if (hits->quad_fill > 0 && !is_at_start) + { + fz_quad *end = &hits->quads[hits->quad_fill-1]; + if (hdist(&line->dir, &end->lr, &ch->quad.ll) < hfuzz + && vdist(&line->dir, &end->lr, &ch->quad.ll) < vfuzz + && hdist(&line->dir, &end->ur, &ch->quad.ul) < hfuzz + && vdist(&line->dir, &end->ur, &ch->quad.ul) < vfuzz) + { + /* Yes */ + end->ur = ch->quad.ur; + end->lr = ch->quad.lr; + return 0; + } + } + + if (is_at_start && hits->quad_fill > 0) + { + /* Send the quads we have queued. */ + hits->count_hits++; + if (hits->cb && hits->cb(ctx, hits->opaque, hits->quad_fill, hits->quads)) + return 1; + hits->quad_fill = 0; + } + + if (hits->quad_fill == hits->max_quads) + { + int newmax = hits->max_quads * 2; + if (hits->quads == hits->locals) + { + hits->quads = fz_malloc(ctx, sizeof(hits->quads[0]) * newmax); + memcpy(hits->quads, hits->locals, sizeof(hits->quads[0]) * nelem(hits->locals)); + } + else + { + hits->quads = fz_realloc(ctx, hits->quads, sizeof(hits->quads[0]) * newmax); + } + hits->max_quads = newmax; + } + hits->quads[hits->quad_fill++] = ch->quad; + hits->count_quads++; + + return 0; +} + +int +fz_search_stext_page_cb(fz_context *ctx, fz_stext_page *page, const char *needle, fz_search_callback_fn *cb, void *opaque) +{ + struct search_data hits; + fz_stext_block *block; + fz_stext_line *line; + fz_stext_char *ch; + fz_buffer *buffer; + const char *haystack, *begin, *end; + int c, inside; + + if (strlen(needle) == 0) + return 0; + + hits.count_quads = 0; + hits.count_hits = 0; + hits.quad_fill = 0; + hits.max_quads = nelem(hits.locals); + hits.quads = hits.locals; + hits.hfuzz = 0.2f; /* merge kerns but not large gaps */ + hits.vfuzz = 0.1f; + hits.cb = cb; + hits.opaque = opaque; + + buffer = fz_new_buffer_from_stext_page(ctx, page); + fz_try(ctx) + { + haystack = fz_string_from_buffer(ctx, buffer); + begin = find_string(haystack, needle, &end); + if (!begin) + goto no_more_matches; + + inside = 0; + for (block = page->first_block; block; block = block->next) + { + if (block->type != FZ_STEXT_BLOCK_TEXT) + continue; + for (line = block->u.t.first_line; line; line = line->next) + { + for (ch = line->first_char; ch; ch = ch->next) + { +try_new_match: + if (!inside) + { + if (haystack >= begin) + inside = 1; + } + if (inside) + { + if (haystack < end) + { + if (hit_char(ctx, &hits, line, ch, haystack == begin)) + goto no_more_matches; + } + else + { + inside = 0; + begin = find_string(haystack, needle, &end); + if (!begin) + goto no_more_matches; + else + goto try_new_match; + } + } + haystack += fz_chartorune(&c, haystack); + } + assert(*haystack == '\n'); + ++haystack; + } + assert(*haystack == '\n'); + ++haystack; + } +no_more_matches: + /* Send the quads we have queued. */ + if (hits.quad_fill) + { + hits.count_hits++; + if (hits.cb) + (void)hits.cb(ctx, hits.opaque, hits.quad_fill, hits.quads); + } + } + fz_always(ctx) + { + fz_drop_buffer(ctx, buffer); + if (hits.quads != hits.locals) + fz_free(ctx, hits.quads); + } + fz_catch(ctx) + fz_rethrow(ctx); + + return hits.count_hits; +} + +typedef struct +{ + int *hit_mark; + fz_quad *quads; + int max_quads; + int fill; + int hit; +} oldsearch_data; + +static int +oldsearch_cb(fz_context *ctx, void *opaque, int num_quads, fz_quad *quads) +{ + oldsearch_data *data = (oldsearch_data *)opaque; + int i; + int hit = data->hit++; + + for (i = 0; i < num_quads; i++) + { + if (data->fill == data->max_quads) + break; + if (data->hit_mark) + data->hit_mark[data->fill] = hit; + data->quads[data->fill] = quads[i]; + data->fill++; + } + + /* We never return 1 here, even if we fill up the buffer, as we + * want the old API to get the correct total number of quads. */ + return 0; +} + +int +fz_search_stext_page(fz_context *ctx, fz_stext_page *page, const char *needle, int *hit_mark, fz_quad *quads, int max_quads) +{ + oldsearch_data data; + + data.hit_mark = hit_mark; + data.quads = quads; + data.max_quads = max_quads; + data.fill = 0; + data.hit = 0; + (void)fz_search_stext_page_cb(ctx, page, needle, oldsearch_cb, &data); + return data.fill; /* Return the number of quads we have read */ +}
