Mercurial > hgrepos > Python2 > PyMuPDF
view mupdf-source/source/fitz/stext-search.c @ 2:b50eed0cc0ef upstream
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
The directory name has changed: no version number in the expanded directory now.
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Mon, 15 Sep 2025 11:43:07 +0200 |
| parents | |
| children |
line wrap: on
line source
// Copyright (C) 2004-2024 Artifex Software, Inc. // // This file is part of MuPDF. // // MuPDF is free software: you can redistribute it and/or modify it under the // terms of the GNU Affero General Public License as published by the Free // Software Foundation, either version 3 of the License, or (at your option) // any later version. // // MuPDF is distributed in the hope that it will be useful, but WITHOUT ANY // WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS // FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more // details. // // You should have received a copy of the GNU Affero General Public License // along with MuPDF. If not, see <https://www.gnu.org/licenses/agpl-3.0.en.html> // // Alternative licensing terms are available from the licensor. // For commercial licensing, see <https://www.artifex.com/> or contact // Artifex Software, Inc., 39 Mesa Street, Suite 108A, San Francisco, // CA 94129, USA, for further information. #include "mupdf/fitz.h" #include <string.h> #include <limits.h> #include <assert.h> /* Enumerate marked selection */ static float hdist(fz_point *dir, fz_point *a, fz_point *b) { float dx = b->x - a->x; float dy = b->y - a->y; return fz_abs(dx * dir->x - dy * dir->y); } static float vdist(fz_point *dir, fz_point *a, fz_point *b) { float dx = b->x - a->x; float dy = b->y - a->y; return fz_abs(dx * dir->y - dy * dir->x); } static float vecdot(fz_point a, fz_point b) { return a.x * b.x + a.y * b.y; } static float linedist(fz_point origin, fz_point dir, fz_point q) { return vecdot(dir, fz_make_point(q.x - origin.x, q.y - origin.y)); } static int line_length(fz_stext_line *line) { fz_stext_char *ch; int n = 0; for (ch = line->first_char; ch; ch = ch->next) ++n; return n; } static float largest_size_in_line(fz_stext_line *line) { fz_stext_char *ch; float size = 0; for (ch = line->first_char; ch; ch = ch->next) if (ch->size > size) size = ch->size; return size; } static int find_closest_in_line(fz_stext_line *line, int idx, fz_point q) { fz_stext_char *ch; float closest_dist = 1e30f; int closest_idx = idx; float d1, d2; float hsize = largest_size_in_line(line) / 2; fz_point vdir = fz_make_point(-line->dir.y, line->dir.x); fz_point hdir = line->dir; // Compute mid-line from quads! fz_point p1 = fz_make_point( (line->first_char->quad.ll.x + line->first_char->quad.ul.x) / 2, (line->first_char->quad.ll.y + line->first_char->quad.ul.y) / 2 ); // Signed distance perpendicular mid-line (positive is below) float vdist = linedist(p1, vdir, q); if (vdist < -hsize) return idx; if (vdist > hsize) return idx + line_length(line); for (ch = line->first_char; ch; ch = ch->next) { if (ch->bidi & 1) { d1 = fz_abs(linedist(ch->quad.lr, hdir, q)); d2 = fz_abs(linedist(ch->quad.ll, hdir, q)); } else { d1 = fz_abs(linedist(ch->quad.ll, hdir, q)); d2 = fz_abs(linedist(ch->quad.lr, hdir, q)); } if (d1 < closest_dist) { closest_dist = d1; closest_idx = idx; } if (d2 < closest_dist) { closest_dist = d2; closest_idx = idx + 1; } ++idx; } return closest_idx; } static int find_closest_in_page(fz_stext_page *page, fz_point q) { fz_stext_block *block; fz_stext_line *line; fz_stext_line *closest_line = NULL; int closest_idx = 0; float closest_vdist = 1e30f; float closest_hdist = 1e30f; int idx = 0; for (block = page->first_block; block; block = block->next) { if (block->type != FZ_STEXT_BLOCK_TEXT) continue; for (line = block->u.t.first_line; line; line = line->next) { float hsize = largest_size_in_line(line) / 2; fz_point hdir = line->dir; fz_point vdir = fz_make_point(-line->dir.y, line->dir.x); // Compute mid-line from quads! fz_point p1 = fz_make_point( (line->first_char->quad.ll.x + line->first_char->quad.ul.x) / 2, (line->first_char->quad.ll.y + line->first_char->quad.ul.y) / 2 ); fz_point p2 = fz_make_point( (line->last_char->quad.lr.x + line->last_char->quad.ur.x) / 2, (line->last_char->quad.lr.y + line->last_char->quad.ur.y) / 2 ); // Signed distance perpendicular mid-line (positive is below) float vdist = linedist(p1, vdir, q); // Signed distance tangent to mid-line from end points (positive is to end) float hdist1 = linedist(p1, hdir, q); float hdist2 = linedist(p2, hdir, q); // Within the line itself! if (vdist >= -hsize && vdist <= hsize && (hdist1 > 0) != (hdist2 > 0)) { closest_vdist = 0; closest_hdist = 0; closest_line = line; closest_idx = idx; } else { // Vertical distance from mid-line. float avdist = fz_abs(vdist); // Horizontal distance from closest end-point float ahdist = fz_min(fz_abs(hdist1), fz_abs(hdist2)); if (avdist < hsize) { // Within extended line if (ahdist <= closest_hdist) { closest_vdist = 0; closest_hdist = ahdist; closest_line = line; closest_idx = idx; } } else { // Outside line // TODO: closest column? if (avdist <= closest_vdist) { closest_vdist = avdist; closest_line = line; closest_idx = idx; } } } idx += line_length(line); } } if (closest_line) return find_closest_in_line(closest_line, closest_idx, q); return 0; } struct callbacks { void (*on_char)(fz_context *ctx, void *arg, fz_stext_line *ln, fz_stext_char *ch); void (*on_line)(fz_context *ctx, void *arg, fz_stext_line *ln); void *arg; }; static void fz_enumerate_selection(fz_context *ctx, fz_stext_page *page, fz_point a, fz_point b, struct callbacks *cb) { fz_stext_block *block; fz_stext_line *line; fz_stext_char *ch; int idx, start, end; int inside; start = find_closest_in_page(page, a); end = find_closest_in_page(page, b); if (start > end) idx = start, start = end, end = idx; if (start == end) return; inside = 0; idx = 0; for (block = page->first_block; block; block = block->next) { if (block->type != FZ_STEXT_BLOCK_TEXT) continue; for (line = block->u.t.first_line; line; line = line->next) { for (ch = line->first_char; ch; ch = ch->next) { if (!inside) if (idx == start) inside = 1; if (inside) cb->on_char(ctx, cb->arg, line, ch); if (++idx == end) return; } if (inside) cb->on_line(ctx, cb->arg, line); } } } fz_quad fz_snap_selection(fz_context *ctx, fz_stext_page *page, fz_point *a, fz_point *b, int mode) { fz_stext_block *block; fz_stext_line *line; fz_stext_char *ch; fz_quad handles; int idx, start, end; int pc; start = find_closest_in_page(page, *a); end = find_closest_in_page(page, *b); if (start > end) idx = start, start = end, end = idx; handles.ll = handles.ul = *a; handles.lr = handles.ur = *b; idx = 0; for (block = page->first_block; block; block = block->next) { if (block->type != FZ_STEXT_BLOCK_TEXT) continue; for (line = block->u.t.first_line; line; line = line->next) { pc = '\n'; for (ch = line->first_char; ch; ch = ch->next) { if (idx <= start) { if (mode == FZ_SELECT_CHARS || (mode == FZ_SELECT_WORDS && (pc == ' ' || pc == '\n')) || (mode == FZ_SELECT_LINES && (pc == '\n'))) { handles.ll = ch->quad.ll; handles.ul = ch->quad.ul; *a = ch->origin; } } if (idx >= end) { if (mode == FZ_SELECT_CHARS || (mode == FZ_SELECT_WORDS && (ch->c == ' '))) { handles.lr = ch->quad.ll; handles.ur = ch->quad.ul; *b = ch->origin; return handles; } if (!ch->next) { handles.lr = ch->quad.lr; handles.ur = ch->quad.ur; *b = ch->quad.lr; return handles; } } pc = ch->c; ++idx; } } } return handles; } /* Highlight selection */ struct highlight { int len, cap; fz_quad *box; float hfuzz, vfuzz; }; int same_point(fz_point a, fz_point b) { int dx = fz_abs(a.x - b.x); int dy = fz_abs(a.y - b.y); return (dx < 0.1 && dy < 0.1); } int is_near(float hfuzz, float vfuzz, fz_point hdir, fz_point end, fz_point p1, fz_point p2) { float v = fz_abs(linedist(end, fz_make_point(-hdir.y, hdir.x), p1)); float d1 = fz_abs(linedist(end, hdir, p1)); float d2 = fz_abs(linedist(end, hdir, p2)); return (v < vfuzz && d1 < hfuzz && d1 < d2); } static void on_highlight_char(fz_context *ctx, void *arg, fz_stext_line *line, fz_stext_char *ch) { struct highlight *hits = arg; float vfuzz = hits->vfuzz * ch->size; float hfuzz = hits->hfuzz * ch->size; fz_point dir = line->dir; // Skip zero-extent quads if (same_point(ch->quad.ll, ch->quad.lr)) return; if (hits->len > 0) { fz_quad *end = &hits->box[hits->len-1]; if (is_near(hfuzz, vfuzz, dir, end->lr, ch->quad.ll, ch->quad.lr) && is_near(hfuzz, vfuzz, dir, end->ur, ch->quad.ul, ch->quad.ur)) { end->ur = ch->quad.ur; end->lr = ch->quad.lr; return; } if (is_near(hfuzz, vfuzz, dir, end->ll, ch->quad.lr, ch->quad.ll) && is_near(hfuzz, vfuzz, dir, end->ul, ch->quad.ur, ch->quad.ul)) { end->ul = ch->quad.ul; end->ll = ch->quad.ll; return; } } if (hits->len < hits->cap) hits->box[hits->len++] = ch->quad; } static void on_highlight_line(fz_context *ctx, void *arg, fz_stext_line *line) { } int fz_highlight_selection(fz_context *ctx, fz_stext_page *page, fz_point a, fz_point b, fz_quad *quads, int max_quads) { struct callbacks cb; struct highlight hits; hits.len = 0; hits.cap = max_quads; hits.box = quads; hits.hfuzz = 0.5f; /* merge large gaps */ hits.vfuzz = 0.1f; cb.on_char = on_highlight_char; cb.on_line = on_highlight_line; cb.arg = &hits; fz_enumerate_selection(ctx, page, a, b, &cb); return hits.len; } /* Copy selection */ static void on_copy_char(fz_context *ctx, void *arg, fz_stext_line *line, fz_stext_char *ch) { fz_buffer *buffer = arg; int c = ch->c; if (c < 32) c = FZ_REPLACEMENT_CHARACTER; fz_append_rune(ctx, buffer, c); } static void on_copy_line_crlf(fz_context *ctx, void *arg, fz_stext_line *line) { fz_buffer *buffer = arg; fz_append_byte(ctx, buffer, '\r'); fz_append_byte(ctx, buffer, '\n'); } static void on_copy_line_lf(fz_context *ctx, void *arg, fz_stext_line *line) { fz_buffer *buffer = arg; fz_append_byte(ctx, buffer, '\n'); } char * fz_copy_selection(fz_context *ctx, fz_stext_page *page, fz_point a, fz_point b, int crlf) { struct callbacks cb; fz_buffer *buffer; unsigned char *s; buffer = fz_new_buffer(ctx, 1024); fz_try(ctx) { cb.on_char = on_copy_char; cb.on_line = crlf ? on_copy_line_crlf : on_copy_line_lf; cb.arg = buffer; fz_enumerate_selection(ctx, page, a, b, &cb); fz_terminate_buffer(ctx, buffer); } fz_catch(ctx) { fz_drop_buffer(ctx, buffer); fz_rethrow(ctx); } fz_buffer_extract(ctx, buffer, &s); /* take over the data */ fz_drop_buffer(ctx, buffer); return (char*)s; } char * fz_copy_rectangle(fz_context *ctx, fz_stext_page *page, fz_rect area, int crlf) { fz_stext_block *block; fz_stext_line *line; fz_stext_char *ch; fz_buffer *buffer; unsigned char *s; int need_new_line = 0; buffer = fz_new_buffer(ctx, 1024); fz_try(ctx) { for (block = page->first_block; block; block = block->next) { if (block->type != FZ_STEXT_BLOCK_TEXT) continue; for (line = block->u.t.first_line; line; line = line->next) { int line_had_text = 0; for (ch = line->first_char; ch; ch = ch->next) { fz_rect r = fz_rect_from_quad(ch->quad); if (!fz_is_empty_rect(fz_intersect_rect(r, area))) { line_had_text = 1; if (need_new_line) { fz_append_string(ctx, buffer, crlf ? "\r\n" : "\n"); need_new_line = 0; } fz_append_rune(ctx, buffer, ch->c < 32 ? FZ_REPLACEMENT_CHARACTER : ch->c); } } if (line_had_text) need_new_line = 1; } } fz_terminate_buffer(ctx, buffer); } fz_catch(ctx) { fz_drop_buffer(ctx, buffer); fz_rethrow(ctx); } fz_buffer_extract(ctx, buffer, &s); /* take over the data */ fz_drop_buffer(ctx, buffer); return (char*)s; } /* String search */ static inline int canon(int c) { // Map full-width ASCII forms to ASCII: // U+FF01 .. U+FF5E => U+0021 .. U+007E if (c >= 0xFF01 && c <= 0xFF5E) c = c - 0xFF01 + 0x21; if (c == 0xA0 || c == 0x2028 || c == 0x2029) return ' '; if (c == '\r' || c == '\n' || c == '\t') return ' '; return fz_toupper(c); } static inline int chartocanon(int *c, const char *s) { int n = fz_chartorune(c, s); *c = canon(*c); return n; } static const char *match_string(const char *h, const char *n) { int hc, nc; const char *e = h; h += chartocanon(&hc, h); n += chartocanon(&nc, n); while (hc == nc) { e = h; if (hc == ' ') do h += chartocanon(&hc, h); while (hc == ' '); else h += chartocanon(&hc, h); if (nc == ' ') do n += chartocanon(&nc, n); while (nc == ' '); else n += chartocanon(&nc, n); } return nc == 0 ? e : NULL; } static const char *find_string(const char *s, const char *needle, const char **endp) { const char *end; while (*s) { end = match_string(s, needle); if (end) return *endp = end, s; ++s; } return *endp = NULL, NULL; } struct search_data { /* Number of hits so far.*/ int count_quads; int count_hits; int max_quads; int quad_fill; fz_quad locals[32]; fz_quad *quads; float hfuzz, vfuzz; fz_search_callback_fn *cb; void *opaque; }; static int hit_char(fz_context *ctx, struct search_data *hits, fz_stext_line *line, fz_stext_char *ch, int is_at_start) { float vfuzz = ch->size * hits->vfuzz; float hfuzz = ch->size * hits->hfuzz; /* Can we continue an existing quad? */ if (hits->quad_fill > 0 && !is_at_start) { fz_quad *end = &hits->quads[hits->quad_fill-1]; if (hdist(&line->dir, &end->lr, &ch->quad.ll) < hfuzz && vdist(&line->dir, &end->lr, &ch->quad.ll) < vfuzz && hdist(&line->dir, &end->ur, &ch->quad.ul) < hfuzz && vdist(&line->dir, &end->ur, &ch->quad.ul) < vfuzz) { /* Yes */ end->ur = ch->quad.ur; end->lr = ch->quad.lr; return 0; } } if (is_at_start && hits->quad_fill > 0) { /* Send the quads we have queued. */ hits->count_hits++; if (hits->cb && hits->cb(ctx, hits->opaque, hits->quad_fill, hits->quads)) return 1; hits->quad_fill = 0; } if (hits->quad_fill == hits->max_quads) { int newmax = hits->max_quads * 2; if (hits->quads == hits->locals) { hits->quads = fz_malloc(ctx, sizeof(hits->quads[0]) * newmax); memcpy(hits->quads, hits->locals, sizeof(hits->quads[0]) * nelem(hits->locals)); } else { hits->quads = fz_realloc(ctx, hits->quads, sizeof(hits->quads[0]) * newmax); } hits->max_quads = newmax; } hits->quads[hits->quad_fill++] = ch->quad; hits->count_quads++; return 0; } int fz_search_stext_page_cb(fz_context *ctx, fz_stext_page *page, const char *needle, fz_search_callback_fn *cb, void *opaque) { struct search_data hits; fz_stext_block *block; fz_stext_line *line; fz_stext_char *ch; fz_buffer *buffer; const char *haystack, *begin, *end; int c, inside; if (strlen(needle) == 0) return 0; hits.count_quads = 0; hits.count_hits = 0; hits.quad_fill = 0; hits.max_quads = nelem(hits.locals); hits.quads = hits.locals; hits.hfuzz = 0.2f; /* merge kerns but not large gaps */ hits.vfuzz = 0.1f; hits.cb = cb; hits.opaque = opaque; buffer = fz_new_buffer_from_stext_page(ctx, page); fz_try(ctx) { haystack = fz_string_from_buffer(ctx, buffer); begin = find_string(haystack, needle, &end); if (!begin) goto no_more_matches; inside = 0; for (block = page->first_block; block; block = block->next) { if (block->type != FZ_STEXT_BLOCK_TEXT) continue; for (line = block->u.t.first_line; line; line = line->next) { for (ch = line->first_char; ch; ch = ch->next) { try_new_match: if (!inside) { if (haystack >= begin) inside = 1; } if (inside) { if (haystack < end) { if (hit_char(ctx, &hits, line, ch, haystack == begin)) goto no_more_matches; } else { inside = 0; begin = find_string(haystack, needle, &end); if (!begin) goto no_more_matches; else goto try_new_match; } } haystack += fz_chartorune(&c, haystack); } assert(*haystack == '\n'); ++haystack; } assert(*haystack == '\n'); ++haystack; } no_more_matches: /* Send the quads we have queued. */ if (hits.quad_fill) { hits.count_hits++; if (hits.cb) (void)hits.cb(ctx, hits.opaque, hits.quad_fill, hits.quads); } } fz_always(ctx) { fz_drop_buffer(ctx, buffer); if (hits.quads != hits.locals) fz_free(ctx, hits.quads); } fz_catch(ctx) fz_rethrow(ctx); return hits.count_hits; } typedef struct { int *hit_mark; fz_quad *quads; int max_quads; int fill; int hit; } oldsearch_data; static int oldsearch_cb(fz_context *ctx, void *opaque, int num_quads, fz_quad *quads) { oldsearch_data *data = (oldsearch_data *)opaque; int i; int hit = data->hit++; for (i = 0; i < num_quads; i++) { if (data->fill == data->max_quads) break; if (data->hit_mark) data->hit_mark[data->fill] = hit; data->quads[data->fill] = quads[i]; data->fill++; } /* We never return 1 here, even if we fill up the buffer, as we * want the old API to get the correct total number of quads. */ return 0; } int fz_search_stext_page(fz_context *ctx, fz_stext_page *page, const char *needle, int *hit_mark, fz_quad *quads, int max_quads) { oldsearch_data data; data.hit_mark = hit_mark; data.quads = quads; data.max_quads = max_quads; data.fill = 0; data.hit = 0; (void)fz_search_stext_page_cb(ctx, page, needle, oldsearch_cb, &data); return data.fill; /* Return the number of quads we have read */ }
