comparison mupdf-source/source/fitz/stext-device.c @ 2:b50eed0cc0ef upstream

ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4. The directory name has changed: no version number in the expanded directory now.
author Franz Glasner <fzglas.hg@dom66.de>
date Mon, 15 Sep 2025 11:43:07 +0200
parents
children aa33339d6b8a
comparison
equal deleted inserted replaced
1:1d09e1dec1d9 2:b50eed0cc0ef
1 // Copyright (C) 2004-2025 Artifex Software, Inc.
2 //
3 // This file is part of MuPDF.
4 //
5 // MuPDF is free software: you can redistribute it and/or modify it under the
6 // terms of the GNU Affero General Public License as published by the Free
7 // Software Foundation, either version 3 of the License, or (at your option)
8 // any later version.
9 //
10 // MuPDF is distributed in the hope that it will be useful, but WITHOUT ANY
11 // WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
12 // FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
13 // details.
14 // You should have received a copy of the GNU Affero General Public License
15 // along with MuPDF. If not, see <https://www.gnu.org/licenses/agpl-3.0.en.html>
16 //
17 // Alternative licensing terms are available from the licensor.
18 // For commercial licensing, see <https://www.artifex.com/> or contact
19 // Artifex Software, Inc., 39 Mesa Street, Suite 108A, San Francisco,
20 // CA 94129, USA, for further information.
21
22 #include "mupdf/fitz.h"
23
24 #include "glyphbox.h"
25
26 #include <float.h>
27 #include <string.h>
28
29 /* Simple layout structure */
30
31 fz_layout_block *fz_new_layout(fz_context *ctx)
32 {
33 fz_pool *pool = fz_new_pool(ctx);
34 fz_layout_block *block;
35 fz_try(ctx)
36 {
37 block = fz_pool_alloc(ctx, pool, sizeof (fz_layout_block));
38 block->pool = pool;
39 block->head = NULL;
40 block->tailp = &block->head;
41 }
42 fz_catch(ctx)
43 {
44 fz_drop_pool(ctx, pool);
45 fz_rethrow(ctx);
46 }
47 return block;
48 }
49
50 void fz_drop_layout(fz_context *ctx, fz_layout_block *block)
51 {
52 if (block)
53 fz_drop_pool(ctx, block->pool);
54 }
55
56 void fz_add_layout_line(fz_context *ctx, fz_layout_block *block, float x, float y, float font_size, const char *p)
57 {
58 fz_layout_line *line = fz_pool_alloc(ctx, block->pool, sizeof (fz_layout_line));
59 line->x = x;
60 line->y = y;
61 line->font_size = font_size;
62 line->p = p;
63 line->text = NULL;
64 line->next = NULL;
65 *block->tailp = line;
66 block->tailp = &line->next;
67 block->text_tailp = &line->text;
68 }
69
70 void fz_add_layout_char(fz_context *ctx, fz_layout_block *block, float x, float advance, const char *p)
71 {
72 fz_layout_char *ch = fz_pool_alloc(ctx, block->pool, sizeof (fz_layout_char));
73 ch->x = x;
74 ch->advance = advance;
75 ch->p = p;
76 ch->next = NULL;
77 *block->text_tailp = ch;
78 block->text_tailp = &ch->next;
79 }
80
81 /* Extract text into blocks and lines. */
82
/* Layout thresholds. fz_add_stext_char_imp compares these against
 * distances that have already been divided by the text size, so they
 * are expressed as fractions of the font size. */
/* Baseline offset beyond which a new block (paragraph) is started. */
83 #define PARAGRAPH_DIST 1.5f
/* Along-line gap below which the motion is ignored (no space added). */
84 #define SPACE_DIST 0.15f
/* Along-line gap at or beyond which the char goes onto a new line. */
85 #define SPACE_MAX_DIST 0.8f
/* Baseline offset below which a char may stay on the current line. */
86 #define BASE_MAX_DIST 0.8f
/* End-point distance below which a repeated char is dropped as fake bold. */
87 #define FAKE_BOLD_MAX_DIST 0.1f
88
89 /* We keep a stack of the different metatexts that apply at any
90 * given point (normally none!). Whenever we get some content
91 * with a metatext in force, we really want to update the bounds
92 * for that metatext. But running along the whole list each time
93 * would be painful. So we just update the bounds for dev->metatext
94 * and rely on metatext_bounds() propagating it upwards 'just in
95 * time' for us to use metatexts other than the latest one. This
96 * also means we need to propagate bounds upwards when we pop
97 * a metatext.
98 *
99 * Why do we need bounds at all? Well, suppose we get:
100 * /Span <</ActualText (c) >> BDC /Im0 Do EMC
101 * Then where on the page do we put 'c' ? By collecting the
102 * bounds, we can place 'c' wherever the image was.
103 */
/* One entry in the device's stack of active metatexts. */
104 typedef struct metatext_t
105 {
/* Kind of metatext (e.g. FZ_METATEXT_ACTUALTEXT). */
106 fz_metatext type;
/* Text carried by the metatext; for ActualText, the replacement string. */
107 char *text;
/* Bounds collected so far; metatext_bounds() merges them into the enclosing entry. */
108 fz_rect bounds;
/* Enclosing (older) entry on the stack, or NULL at the bottom. */
109 struct metatext_t *prev;
110 } metatext_t;
111
/* One entry in the device's 'rects' list (see fz_stext_device.rects). */
/* NOTE(review): the code that fills and reads these entries is outside
 * this view; from/to/thickness presumably describe a thin rectangle as
 * a centre line plus a width -- confirm against the users. */
112 typedef struct
113 {
114 fz_point from;
115 fz_point to;
116 float thickness;
117 } rect_details;
118
/* State of the structured-text device while it collects a page. */
119 typedef struct
120 {
/* Base device; passed as &dev->super to generic device helpers. */
121 fz_device super;
/* Page that blocks, lines and chars are added to. */
122 fz_stext_page *page;
123 int id;
/* pen: end point of the last char added. start: start point of the
 * first char on the current line. */
124 fz_point pen, start;
/* Start point of the last char added. */
125 fz_point lag_pen;
/* Transform of the last char processed. */
126 fz_matrix trm;
/* Cleared after every char; consulted for the text-indent paragraph check. */
/* NOTE(review): presumably set when a new text object begins -- confirm. */
127 int new_obj;
/* Character code and bidi value of the last char processed. */
128 int lastchar;
129 int lastbidi;
/* FZ_STEXT_* option bits consulted throughout this file. */
130 int flags;
/* Colour stored on each char added (as its argb value). */
131 int color;
/* Non-zero when the previous char was absorbed by check_for_fake_bold(). */
132 int last_was_fake_bold;
133 const fz_text *lasttext;
/* Full option set; opts.flags and opts.clip are read directly. */
134 fz_stext_options opts;
135
/* Innermost active metatext (top of the stack), or NULL. */
136 metatext_t *metatext;
137
138 /* Store the last values we saw. We need this for flushing the actualtext. */
139 struct
140 {
141 int valid;
142 int clipped;
143 fz_matrix trm;
144 int wmode;
145 int bidi_level;
146 fz_font *font;
147 int flags;
148 } last;
149
150 /* The list of 'rects' seen during processing (if we're collecting styles). */
151 int rect_max;
152 int rect_len;
153 rect_details *rects;
154 } fz_stext_device;
155
/* Human-readable help text listing the structured-text option names,
 * one per line, each followed by a short description. */
/* NOTE(review): the parser that accepts these names is outside this
 * view; keep the two in sync when adding options. */
156 const char *fz_stext_options_usage =
157 "Text output options:\n"
158 "\tpreserve-images: keep images in output\n"
159 "\tpreserve-ligatures: do not expand ligatures into constituent characters\n"
160 "\tpreserve-spans: do not merge spans on the same line\n"
161 "\tpreserve-whitespace: do not convert all whitespace into space characters\n"
162 "\tinhibit-spaces: don't add spaces between gaps in the text\n"
163 "\tparagraph-break: break blocks at paragraph boundaries\n"
164 "\tdehyphenate: attempt to join up hyphenated words\n"
165 "\tignore-actualtext: do not apply ActualText replacements\n"
166 "\tuse-cid-for-unknown-unicode: use character code if unicode mapping fails\n"
167 "\tuse-gid-for-unknown-unicode: use glyph index if unicode mapping fails\n"
168 "\taccurate-bboxes: calculate char bboxes from the outlines\n"
169 "\taccurate-ascenders: calculate ascender/descender from font glyphs\n"
170 "\taccurate-side-bearings: expand char bboxes to completely include width of glyphs\n"
171 "\tcollect-styles: attempt to detect text features (fake bold, strikeout, underlined etc)\n"
172 "\tclip: do not include text that is completely clipped\n"
173 "\tclip-rect=x0:y0:x1:y1 specify clipping rectangle within which to collect content\n"
174 "\tstructured: collect structure markup\n"
175 "\tvectors: include vector bboxes in output\n"
176 "\tsegment: attempt to segment the page\n"
177 "\ttable-hunt: hunt for tables within a (segmented) page\n"
178 "\n";
179
180 /* Find the current actualtext, if any. Will abort if dev == NULL. */
181 static metatext_t *
182 find_actualtext(fz_stext_device *dev)
183 {
184 metatext_t *mt = dev->metatext;
185
186 while (mt && mt->type != FZ_METATEXT_ACTUALTEXT)
187 mt = mt->prev;
188
189 return mt;
190 }
191
192 /* Find the bounds of the given metatext. Will abort if mt or
193 * dev are NULL. */
194 static fz_rect *
195 metatext_bounds(metatext_t *mt, fz_stext_device *dev)
196 {
197 metatext_t *mt2 = dev->metatext;
198
199 while (mt2 != mt)
200 {
201 mt2->prev->bounds = fz_union_rect(mt2->prev->bounds, mt2->bounds);
202 mt2 = mt2->prev;
203 }
204
205 return &mt->bounds;
206 }
207
208 /* Find the bounds of the current actualtext, or NULL if there
209 * isn't one. Will abort if dev is NULL. */
210 static fz_rect *
211 actualtext_bounds(fz_stext_device *dev)
212 {
213 metatext_t *mt = find_actualtext(dev);
214
215 if (mt == NULL)
216 return NULL;
217
218 return metatext_bounds(mt, dev);
219 }
220
221 fz_stext_page *
222 fz_new_stext_page(fz_context *ctx, fz_rect mediabox)
223 {
224 fz_pool *pool = fz_new_pool(ctx);
225 fz_stext_page *page = NULL;
226 fz_try(ctx)
227 {
228 page = fz_pool_alloc(ctx, pool, sizeof(*page));
229 page->pool = pool;
230 page->mediabox = mediabox;
231 page->first_block = NULL;
232 page->last_block = NULL;
233 }
234 fz_catch(ctx)
235 {
236 fz_drop_pool(ctx, pool);
237 fz_rethrow(ctx);
238 }
239 return page;
240 }
241
242 static void
243 drop_run(fz_context *ctx, fz_stext_block *block)
244 {
245 fz_stext_line *line;
246 fz_stext_char *ch;
247 while (block)
248 {
249 switch (block->type)
250 {
251 case FZ_STEXT_BLOCK_IMAGE:
252 fz_drop_image(ctx, block->u.i.image);
253 break;
254 case FZ_STEXT_BLOCK_TEXT:
255 for (line = block->u.t.first_line; line; line = line->next)
256 for (ch = line->first_char; ch; ch = ch->next)
257 fz_drop_font(ctx, ch->font);
258 break;
259 case FZ_STEXT_BLOCK_STRUCT:
260 drop_run(ctx, block->u.s.down->first_block);
261 break;
262 default:
263 break;
264 }
265 block = block->next;
266 }
267 }
268
269 void
270 fz_drop_stext_page(fz_context *ctx, fz_stext_page *page)
271 {
272 if (page)
273 {
274 drop_run(ctx, page->first_block);
275 fz_drop_pool(ctx, page->pool);
276 }
277 }
278
279 /*
280 * This adds a new block at the end of the page. This should not be used
281 * to add 'struct' blocks to the page as those have to be added internally,
282 * with more complicated pointer setup.
283 */
284 static fz_stext_block *
285 add_block_to_page(fz_context *ctx, fz_stext_page *page)
286 {
287 fz_stext_block *block = fz_pool_alloc(ctx, page->pool, sizeof *page->first_block);
288 block->bbox = fz_empty_rect; /* Fixes bug 703267. */
289 block->prev = page->last_block;
290 if (page->last_struct)
291 {
292 if (page->last_struct->last_block)
293 {
294 block->prev = page->last_struct->last_block;
295 block->prev->next = block;
296 page->last_struct->last_block = block;
297 }
298 else
299 page->last_struct->last_block = page->last_struct->first_block = block;
300 }
301 else if (!page->last_block)
302 {
303 page->last_block = block;
304 if (!page->first_block)
305 page->first_block = block;
306 }
307 else
308 {
309 page->last_block->next = block;
310 page->last_block = block;
311 }
312 return block;
313 }
314
315 static fz_stext_block *
316 add_text_block_to_page(fz_context *ctx, fz_stext_page *page)
317 {
318 fz_stext_block *block = add_block_to_page(ctx, page);
319 block->type = FZ_STEXT_BLOCK_TEXT;
320 return block;
321 }
322
323 static fz_stext_block *
324 add_image_block_to_page(fz_context *ctx, fz_stext_page *page, fz_matrix ctm, fz_image *image)
325 {
326 fz_stext_block *block = add_block_to_page(ctx, page);
327 block->type = FZ_STEXT_BLOCK_IMAGE;
328 block->u.i.transform = ctm;
329 block->u.i.image = fz_keep_image(ctx, image);
330 block->bbox = fz_transform_rect(fz_unit_rect, ctm);
331 return block;
332 }
333
334 static fz_stext_line *
335 add_line_to_block(fz_context *ctx, fz_stext_page *page, fz_stext_block *block, const fz_point *dir, int wmode, int bidi)
336 {
337 fz_stext_line *line = fz_pool_alloc(ctx, page->pool, sizeof *block->u.t.first_line);
338 line->prev = block->u.t.last_line;
339 if (!block->u.t.first_line)
340 block->u.t.first_line = block->u.t.last_line = line;
341 else
342 {
343 block->u.t.last_line->next = line;
344 block->u.t.last_line = line;
345 }
346
347 line->dir = *dir;
348 line->wmode = wmode;
349
350 return line;
351 }
352
/* Sentinel values for the 'glyph' argument of add_char_to_line. Real
 * glyph ids are only passed when accurate bboxes are requested. */
/* A synthetic space added in accurate mode: zero-height quad. */
353 #define NON_ACCURATE_GLYPH_ADDED_SPACE (-2)
/* Non-accurate mode: use the font's ascender/descender for the quad. */
354 #define NON_ACCURATE_GLYPH (-1)
355
/*
 * Allocate a char from the page pool, append it to line, fill in its
 * fields and compute its quad.
 *
 * p and q are the start and end points of the glyph along the line.
 * Two offset vectors are built in glyph space and transformed by trm:
 * 'a' (ascender side) and 'd' (descender side). The quad corners are
 * then ll = p + d, ul = p + a, lr = q + d and ur = q + a.
 *
 * The char keeps its own reference to font. synthetic marks the char
 * with FZ_STEXT_SYNTHETIC; chars in a bold font get FZ_STEXT_BOLD.
 */
356 static fz_stext_char *
357 add_char_to_line(fz_context *ctx, fz_stext_page *page, fz_stext_line *line, fz_matrix trm, fz_font *font, float size, int c, int glyph, fz_point *p, fz_point *q, int bidi, int color, int synthetic, int flags, int dev_flags)
358 {
359 fz_stext_char *ch = fz_pool_alloc(ctx, page->pool, sizeof *line->first_char);
360 fz_point a, d;
361
362 if (!line->first_char)
363 line->first_char = line->last_char = ch;
364 else
365 {
366 line->last_char->next = ch;
367 line->last_char = ch;
368 }
369
370 ch->c = c;
371 ch->argb = color;
372 ch->bidi = bidi;
373 ch->origin = *p;
374 ch->size = size;
375 ch->font = fz_keep_font(ctx, font);
376 ch->flags = flags | (synthetic ? FZ_STEXT_SYNTHETIC : 0);
377 if (font->flags.is_bold)
378 ch->flags |= FZ_STEXT_BOLD;
379
/* Horizontal writing: the vertical extent depends on the glyph mode. */
380 if (line->wmode == 0)
381 {
382 fz_rect bounds;
383 int bounded = 0;
384 a.x = 0;
385 d.x = 0;
386 if (glyph == NON_ACCURATE_GLYPH_ADDED_SPACE)
387 {
388 /* Added space, in accurate mode. */
389 a.y = d.y = 0;
390 }
391 else if (glyph == NON_ACCURATE_GLYPH)
392 {
393 /* Non accurate mode. */
394 a.y = fz_font_ascender(ctx, font);
395 d.y = fz_font_descender(ctx, font);
396 }
397 else
398 {
399 /* Any glyph in accurate mode */
400 bounds = fz_bound_glyph(ctx, font, glyph, fz_identity);
401 bounded = 1;
402 a.y = bounds.y1;
403 d.y = bounds.y0;
404 }
/* NOTE(review): this branch compares and assigns d.y (a vertical
 * offset) against bounds.x1 (a horizontal extent), whereas the line
 * above it pairs a.x with bounds.x0. That mix of axes looks
 * unintended -- confirm what the right-hand side bearing should
 * adjust. Also note that when 'bounded' is 0 the glyph passed to
 * fz_bound_glyph may be one of the negative sentinels above. */
405 if (dev_flags & FZ_STEXT_ACCURATE_SIDE_BEARINGS)
406 {
407 if (!bounded)
408 bounds = fz_bound_glyph(ctx, font, glyph, fz_identity);
409 if (a.x > bounds.x0)
410 a.x = bounds.x0;
411 if (d.y < bounds.x1)
412 d.y = bounds.x1;
413 }
414 }
/* Vertical writing: fixed unit offset across the line, no descender. */
415 else
416 {
417 a.x = 1;
418 d.x = 0;
419 a.y = 0;
420 d.y = 0;
421 }
422 a = fz_transform_vector(a, trm);
423 d = fz_transform_vector(d, trm);
424
425 ch->quad.ll = fz_make_point(p->x + d.x, p->y + d.y);
426 ch->quad.ul = fz_make_point(p->x + a.x, p->y + a.y);
427 ch->quad.lr = fz_make_point(q->x + d.x, q->y + d.y);
428 ch->quad.ur = fz_make_point(q->x + a.x, q->y + a.y);
429
430 return ch;
431 }
432
433 static void
434 remove_last_char(fz_context *ctx, fz_stext_line *line)
435 {
436 if (line && line->first_char)
437 {
438 fz_stext_char *prev = NULL;
439 fz_stext_char *ch = line->first_char;
440 while (ch->next)
441 {
442 prev = ch;
443 ch = ch->next;
444 }
445 if (prev)
446 {
447 /* The characters are pool allocated, so we don't actually leak the removed node. */
448 /* We do need to drop the char's font reference though. */
449 fz_drop_font(ctx, prev->next->font);
450 line->last_char = prev;
451 line->last_char->next = NULL;
452 }
453 }
454 }
455
456 static fz_stext_char *reverse_bidi_span(fz_stext_char *curr, fz_stext_char *tail)
457 {
458 fz_stext_char *prev, *next;
459 prev = tail;
460 while (curr != tail)
461 {
462 next = curr->next;
463 curr->next = prev;
464 prev = curr;
465 curr = next;
466 }
467 return prev;
468 }
469
470 static void reverse_bidi_line(fz_stext_line *line)
471 {
472 fz_stext_char *a, *b, **prev;
473 prev = &line->first_char;
474 for (a = line->first_char; a; a = a->next)
475 {
476 if (a->bidi)
477 {
478 b = a;
479 while (b->next && b->next->bidi)
480 b = b->next;
481 if (a != b)
482 *prev = reverse_bidi_span(a, b->next);
483 }
484 prev = &a->next;
485 line->last_char = a;
486 }
487 }
488
/* Return 1 if c is a hyphen-like character (hyphen-minus, soft hyphen,
 * hyphen or non-breaking hyphen), 0 otherwise. */
static int is_hyphen(int c)
{
	switch (c)
	{
	case '-':    /* hyphen-minus */
	case 0xAD:   /* soft hyphen */
	case 0x2010: /* hyphen */
	case 0x2011: /* non-breaking hyphen */
		return 1;
	default:
		return 0;
	}
}
494
495 static float
496 vec_dot(const fz_point *a, const fz_point *b)
497 {
498 return a->x * b->x + a->y * b->y;
499 }
500
/*
 * Decide whether a synthetic space may follow lastchar.
 *
 * Never after a space. Otherwise allowed for code points below 0x700
 * (basic latin, greek, cyrillic, hebrew, arabic) and for the range
 * 0x2000..0x20CF (general punctuation, superscripts and subscripts,
 * currency symbols).
 */
static int may_add_space(int lastchar)
{
	if (lastchar == ' ')
		return 0;

	if (lastchar < 0x700)
		return 1;

	return lastchar >= 0x2000 && lastchar <= 0x20CF;
}
510
/* Two coordinates count as coincident when they differ by less than
 * 1/FAKEBOLD_THRESHOLD_RECIP of the text size. */
#define FAKEBOLD_THRESHOLD_RECIP 10

/* Return non-zero if a and b are within size/FAKEBOLD_THRESHOLD_RECIP
 * of each other. */
static int
close(float a, float b, float size)
{
	float gap = a - b;

	if (gap < 0)
		gap = -gap;

	return FAKEBOLD_THRESHOLD_RECIP * gap < size;
}
522
523 static int
524 font_equiv(fz_context *ctx, fz_font *f, fz_font *g)
525 {
526 unsigned char fdigest[16];
527 unsigned char gdigest[16];
528
529 if (f == g)
530 return 1;
531
532 if (strcmp(f->name, g->name) != 0)
533 return 0;
534
535 fz_font_digest(ctx, f, fdigest);
536 fz_font_digest(ctx, g, gdigest);
537
538 return (memcmp(fdigest, gdigest, 16) == 0);
539 }
540
541 static int
542 check_for_fake_bold(fz_context *ctx, fz_stext_block *block, fz_font *font, int c, fz_point p, float size, int flags)
543 {
544 fz_stext_line *line;
545 fz_stext_char *ch;
546
547 for (; block != NULL; block = block->next)
548 {
549 if (block->type == FZ_STEXT_BLOCK_STRUCT)
550 {
551 if (block->u.s.down != NULL && check_for_fake_bold(ctx, block->u.s.down->first_block, font, c, p, size, flags))
552 return 1;
553 }
554 else if (block->type == FZ_STEXT_BLOCK_TEXT)
555 {
556 for (line = block->u.t.first_line; line != NULL; line = line->next)
557 {
558 fz_stext_char *pr = NULL;
559 for (ch = line->first_char; ch != NULL; ch = ch->next)
560 {
561 /* Not perfect, but it'll do! */
562 if (ch->c == c && close(ch->origin.x, p.x, size) && close(ch->origin.y, p.y, size) && font_equiv(ctx, ch->font, font))
563 {
564 /* If we were filled before, and we are stroking now... */
565 if ((ch->flags & (FZ_STEXT_FILLED | FZ_STEXT_STROKED)) == FZ_STEXT_FILLED &&
566 (flags & (FZ_STEXT_FILLED | FZ_STEXT_STROKED)) == FZ_STEXT_STROKED)
567 {
568 /* Update this to be filled + stroked, but don't specifically mark it as fake bold. */
569 ch->flags |= flags;
570 return 1;
571 }
572 /* Overlaying spaces is tricksy. How can that count as boldening when it doesn't mark? We only accept these
573 * as boldening if either the char before, or the char after were also boldened. */
574 ch->flags |= flags;
575
576 if (c == ' ')
577 {
578 if ((pr && (pr->flags & FZ_STEXT_BOLD) != 0) ||
579 (ch->next && (ch->next->flags & FZ_STEXT_BOLD) != 0))
580 {
581 /* OK, we can be bold. */
582 ch->flags |= FZ_STEXT_BOLD;
583 return 1;
584 }
585 /* Ignore this and keep going */
586 }
587 else
588 {
589 ch->flags |= FZ_STEXT_BOLD;
590 return 1;
591 }
592 }
593 pr = ch;
594 }
595 }
596 }
597 }
598
599 return 0;
600 }
601
/*
 * Core of text collection: place one character on the page.
 *
 * c is the character code to store, glyph the glyph id (negative for
 * characters that have no glyph of their own, such as the trailing
 * parts of an expanded ligature), trm the glyph's transform, adv its
 * advance, wmode the writing mode (0 = horizontal) and bidi the bidi
 * level, of which only bit 0 (RTL-ness) is kept. force_new_line
 * requests a fresh line regardless of geometry; flags is stored on
 * the char.
 *
 * The new glyph's start point is compared with the pen position left
 * by the previous call to decide whether to continue the current
 * line, insert a synthetic space first, start a new line, or start a
 * new block. On the normal path the pen, lag pen, last char and last
 * bidi value in dev are updated for the next call.
 */
602 static void
603 fz_add_stext_char_imp(fz_context *ctx, fz_stext_device *dev, fz_font *font, int c, int glyph, fz_matrix trm, float adv, int wmode, int bidi, int force_new_line, int flags)
604 {
605 fz_stext_page *page = dev->page;
606 fz_stext_block *cur_block;
607 fz_stext_line *cur_line;
608
609 int new_para = 0;
610 int new_line = 1;
611 int add_space = 0;
612 fz_point dir, ndir, p, q;
613 float size;
614 fz_point delta;
615 float spacing = 0;
616 float base_offset = 0;
617 float dist;
618
619 /* Preserve RTL-ness only (and ignore level) so we can use bit 2 as "visual" tag for reordering pass. */
620 bidi = bidi & 1;
621
622 /* dir = direction vector for motion. ndir = normalised(dir) */
623 if (wmode == 0)
624 {
625 dir.x = 1;
626 dir.y = 0;
627 }
628 else
629 {
630 dir.x = 0;
631 dir.y = -1;
632 }
633 dir = fz_transform_vector(dir, trm);
634 ndir = fz_normalize_vector(dir);
635
636 size = fz_matrix_expansion(trm);
637
638 /* We need to identify where glyphs 'start' (p) and 'stop' (q).
639 * Each glyph holds its 'start' position, and the next glyph in the
640 * span (or span->max if there is no next glyph) holds its 'end'
641 * position.
642 *
643 * For both horizontal and vertical motion, trm->{e,f} gives the
644 * origin (usually the bottom left) of the glyph.
645 *
646 * In horizontal mode:
647 * + p is bottom left.
648 * + q is the bottom right
649 * In vertical mode:
650 * + p is top left (where it advanced from)
651 * + q is bottom left
652 */
653 if (wmode == 0)
654 {
655 p.x = trm.e;
656 p.y = trm.f;
657 q.x = trm.e + adv * dir.x;
658 q.y = trm.f + adv * dir.y;
659 }
660 else
661 {
662 p.x = trm.e - adv * dir.x;
663 p.y = trm.f - adv * dir.y;
664 q.x = trm.e;
665 q.y = trm.f;
666 }
667
/* When collecting styles, a char that overprints an existing one is
 * folded into that char instead of being added; a glyph-less follower
 * (glyph == -1) shares the fate of the char before it. */
668 if ((dev->opts.flags & FZ_STEXT_COLLECT_STYLES) != 0)
669 {
670 if (glyph == -1)
671 {
672 if (dev->last_was_fake_bold)
673 goto move_pen_and_exit;
674 }
675 else if (check_for_fake_bold(ctx, page->first_block, font, c, p, size, flags))
676 {
677 dev->last_was_fake_bold = 1;
678 goto move_pen_and_exit;
679 }
680 dev->last_was_fake_bold = 0;
681 }
682
683 /* Find current position to enter new text. */
684 cur_block = page->last_struct ? page->last_struct->last_block : page->last_block;
685 if (cur_block && cur_block->type != FZ_STEXT_BLOCK_TEXT)
686 cur_block = NULL;
687 cur_line = cur_block ? cur_block->u.t.last_line : NULL;
688
689 if (cur_line && glyph < 0)
690 {
691 /* Don't advance pen or break lines for no-glyph characters in a cluster */
692 add_char_to_line(ctx, page, cur_line, trm, font, size, c, (dev->flags & FZ_STEXT_ACCURATE_BBOXES) ? glyph : NON_ACCURATE_GLYPH, &dev->pen, &dev->pen, bidi, dev->color, 0, flags, dev->flags);
693 dev->lastbidi = bidi;
694 dev->lastchar = c;
695 return;
696 }
697
698 if (cur_line == NULL || cur_line->wmode != wmode || vec_dot(&ndir, &cur_line->dir) < 0.999f)
699 {
700 /* If the matrix has changed rotation, or the wmode is different (or if we don't have a line at all),
701 * then we can't append to the current block/line. */
702 new_para = 1;
703 new_line = 1;
704 }
705 else
706 {
707 /* Detect fake bold where text is printed twice in the same place. */
708 /* Largely supplanted by the check_for_fake_bold mechanism above,
709 * but we leave this in for backward compatibility as it's cheap,
710 * and works even when FZ_STEXT_COLLECT_STYLES is not set. */
711 dist = hypotf(q.x - dev->pen.x, q.y - dev->pen.y) / size;
712 if (dist < FAKE_BOLD_MAX_DIST && c == dev->lastchar)
713 return;
714
715 /* Calculate how far we've moved since the last character. */
716 delta.x = p.x - dev->pen.x;
717 delta.y = p.y - dev->pen.y;
718
719 /* The transform has not changed, so we know we're in the same
720 * direction. Calculate 2 distances; how far off the previous
721 * baseline we are, together with how far along the baseline
722 * we are from the expected position. */
723 spacing = (ndir.x * delta.x + ndir.y * delta.y) / size;
724 base_offset = (-ndir.y * delta.x + ndir.x * delta.y) / size;
725
726 /* Only a small amount off the baseline - we'll take this */
727 if (fabsf(base_offset) < BASE_MAX_DIST)
728 {
729 /* If mixed LTR and RTL content */
730 if ((bidi & 1) != (dev->lastbidi & 1))
731 {
732 /* Ignore jumps within line when switching between LTR and RTL text. */
733 new_line = 0;
734 }
735
736 /* RTL */
737 else if (bidi & 1)
738 {
739 fz_point logical_delta = fz_make_point(p.x - dev->lag_pen.x, p.y - dev->lag_pen.y);
740 float logical_spacing = (ndir.x * logical_delta.x + ndir.y * logical_delta.y) / size + adv;
741
742 /* If the pen is where we would have been if we
743 * had advanced backwards from the previous
744 * character by this character's advance, we
745 * are probably seeing characters emitted in
746 * logical order.
747 */
748 if (fabsf(logical_spacing) < SPACE_DIST)
749 {
750 new_line = 0;
751 }
752
753 /* However, if the pen has advanced to where we would expect it
754 * in an LTR context, we're seeing them emitted in visual order
755 * and should flag them for reordering!
756 */
757 else if (fabsf(spacing) < SPACE_DIST)
758 {
759 bidi = 3; /* mark line as visual */
760 new_line = 0;
761 }
762
763 /* And any other small jump could be a missing space. */
764 else if (logical_spacing < 0 && logical_spacing > -SPACE_MAX_DIST)
765 {
766 if (wmode == 0 && may_add_space(dev->lastchar))
767 add_space = 1;
768 new_line = 0;
769 }
770 else if (spacing < 0 && spacing > -SPACE_MAX_DIST)
771 {
772 /* Motion is in line, but negative. We've probably got overlapping
773 * chars here. Live with it. */
774 new_line = 0;
775 }
776 else if (spacing > 0 && spacing < SPACE_MAX_DIST)
777 {
778 bidi = 3; /* mark line as visual */
779 if (wmode == 0 && may_add_space(dev->lastchar))
780 add_space = 1;
781 new_line = 0;
782 }
783
784 else
785 {
786 /* Motion is large and unexpected (probably a new table column). */
787 new_line = 1;
788 }
789 }
790
791 /* LTR or neutral character */
792 else
793 {
794 if (fabsf(spacing) < SPACE_DIST)
795 {
796 /* Motion is in line and small enough to ignore. */
797 new_line = 0;
798 }
799 else if (spacing < 0 && spacing > -SPACE_MAX_DIST)
800 {
801 /* Motion is in line, but negative. We've probably got overlapping
802 * chars here. Live with it. */
803 new_line = 0;
804 }
805 else if (spacing > 0 && spacing < SPACE_MAX_DIST)
806 {
807 /* Motion is forward in line and large enough to warrant us adding a space. */
808 if (wmode == 0 && may_add_space(dev->lastchar))
809 add_space = 1;
810 new_line = 0;
811 }
812 else
813 {
814 /* Motion is large and unexpected (probably a new table column). */
815 new_line = 1;
816 }
817 }
818 }
819
820 /* Enough for a new line, but not enough for a new paragraph */
821 else if (fabsf(base_offset) <= PARAGRAPH_DIST)
822 {
823 /* Check indent to spot text-indent style paragraphs */
824 if (wmode == 0 && cur_line && dev->new_obj)
825 if ((p.x - dev->start.x) > 0.5f)
826 new_para = 1;
827 new_line = 1;
828 }
829
830 /* Way off the baseline - open a new paragraph */
831 else
832 {
833 new_para = 1;
834 new_line = 1;
835 }
836 }
837
838 /* Start a new block (but only at the beginning of a text object) */
839 if (new_para || !cur_block)
840 {
841 cur_block = add_text_block_to_page(ctx, page);
842 cur_line = cur_block->u.t.last_line;
843 }
844
/* Dehyphenation: if the previous char was a hyphen and we were about
 * to break the line, drop the hyphen and keep appending instead. */
845 if (new_line && (dev->flags & FZ_STEXT_DEHYPHENATE) && is_hyphen(dev->lastchar))
846 {
847 remove_last_char(ctx, cur_line);
848 new_line = 0;
849 }
850
851 /* Start a new line */
852 if (new_line || !cur_line || force_new_line)
853 {
854 cur_line = add_line_to_block(ctx, page, cur_block, &ndir, wmode, bidi);
855 dev->start = p;
856 }
857
858 /* Add synthetic space */
859 if (add_space && !(dev->flags & FZ_STEXT_INHIBIT_SPACES))
860 add_char_to_line(ctx, page, cur_line, trm, font, size, ' ', (dev->flags & FZ_STEXT_ACCURATE_BBOXES) ? NON_ACCURATE_GLYPH_ADDED_SPACE : NON_ACCURATE_GLYPH, &dev->pen, &p, bidi, dev->color, 1, flags, dev->flags);
861
862 add_char_to_line(ctx, page, cur_line, trm, font, size, c, (dev->flags & FZ_STEXT_ACCURATE_BBOXES) ? glyph : NON_ACCURATE_GLYPH, &p, &q, bidi, dev->color, 0, flags, dev->flags);
863
/* Also reached directly when the char was absorbed as fake bold:
 * nothing was added, but the pen still moves past the glyph. */
864 move_pen_and_exit:
865 dev->lastchar = c;
866 dev->lastbidi = bidi;
867 dev->lag_pen = p;
868 dev->pen = q;
869
870 dev->new_obj = 0;
871 dev->trm = trm;
872 }
873
874 static void
875 fz_add_stext_char(fz_context *ctx,
876 fz_stext_device *dev,
877 fz_font *font,
878 int c,
879 int glyph,
880 fz_matrix trm,
881 float adv,
882 int wmode,
883 int bidi,
884 int force_new_line,
885 int flags)
886 {
887 /* ignore when one unicode character maps to multiple glyphs */
888 if (c == -1)
889 return;
890
891 if (dev->flags & FZ_STEXT_ACCURATE_ASCENDERS)
892 fz_calculate_font_ascender_descender(ctx, font);
893
894 if (!(dev->flags & FZ_STEXT_PRESERVE_LIGATURES))
895 {
896 switch (c)
897 {
898 case 0xFB00: /* ff */
899 fz_add_stext_char_imp(ctx, dev, font, 'f', glyph, trm, adv, wmode, bidi, force_new_line, flags);
900 fz_add_stext_char_imp(ctx, dev, font, 'f', -1, trm, 0, wmode, bidi, 0, flags);
901 return;
902 case 0xFB01: /* fi */
903 fz_add_stext_char_imp(ctx, dev, font, 'f', glyph, trm, adv, wmode, bidi, force_new_line, flags);
904 fz_add_stext_char_imp(ctx, dev, font, 'i', -1, trm, 0, wmode, bidi, 0, flags);
905 return;
906 case 0xFB02: /* fl */
907 fz_add_stext_char_imp(ctx, dev, font, 'f', glyph, trm, adv, wmode, bidi, force_new_line, flags);
908 fz_add_stext_char_imp(ctx, dev, font, 'l', -1, trm, 0, wmode, bidi, 0, flags);
909 return;
910 case 0xFB03: /* ffi */
911 fz_add_stext_char_imp(ctx, dev, font, 'f', glyph, trm, adv, wmode, bidi, force_new_line, flags);
912 fz_add_stext_char_imp(ctx, dev, font, 'f', -1, trm, 0, wmode, bidi, 0, flags);
913 fz_add_stext_char_imp(ctx, dev, font, 'i', -1, trm, 0, wmode, bidi, 0, flags);
914 return;
915 case 0xFB04: /* ffl */
916 fz_add_stext_char_imp(ctx, dev, font, 'f', glyph, trm, adv, wmode, bidi, force_new_line, flags);
917 fz_add_stext_char_imp(ctx, dev, font, 'f', -1, trm, 0, wmode, bidi, 0, flags);
918 fz_add_stext_char_imp(ctx, dev, font, 'l', -1, trm, 0, wmode, bidi, 0, flags);
919 return;
920 case 0xFB05: /* long st */
921 case 0xFB06: /* st */
922 fz_add_stext_char_imp(ctx, dev, font, 's', glyph, trm, adv, wmode, bidi, force_new_line, flags);
923 fz_add_stext_char_imp(ctx, dev, font, 't', -1, trm, 0, wmode, bidi, 0, flags);
924 return;
925 }
926 }
927
928 if (!(dev->flags & FZ_STEXT_PRESERVE_WHITESPACE))
929 {
930 switch (c)
931 {
932 case 0x0009: /* tab */
933 case 0x0020: /* space */
934 case 0x00A0: /* no-break space */
935 case 0x1680: /* ogham space mark */
936 case 0x180E: /* mongolian vowel separator */
937 case 0x2000: /* en quad */
938 case 0x2001: /* em quad */
939 case 0x2002: /* en space */
940 case 0x2003: /* em space */
941 case 0x2004: /* three-per-em space */
942 case 0x2005: /* four-per-em space */
943 case 0x2006: /* six-per-em space */
944 case 0x2007: /* figure space */
945 case 0x2008: /* punctuation space */
946 case 0x2009: /* thin space */
947 case 0x200A: /* hair space */
948 case 0x202F: /* narrow no-break space */
949 case 0x205F: /* medium mathematical space */
950 case 0x3000: /* ideographic space */
951 c = ' ';
952 }
953 }
954
955 fz_add_stext_char_imp(ctx, dev, font, c, glyph, trm, adv, wmode, bidi, force_new_line, flags);
956 }
957
958 static fz_rect
959 current_clip(fz_context *ctx, fz_stext_device *dev)
960 {
961 fz_rect r = fz_infinite_rect;
962
963 if (dev->flags & FZ_STEXT_CLIP)
964 {
965 r = fz_device_current_scissor(ctx, &dev->super);
966 r = fz_intersect_rect(r, dev->page->mediabox);
967 }
968 if (dev->flags & FZ_STEXT_CLIP_RECT)
969 r = fz_intersect_rect(r, dev->opts.clip);
970
971 return r;
972 }
973
974 static void
975 do_extract(fz_context *ctx, fz_stext_device *dev, fz_text_span *span, fz_matrix ctm, int start, int end, int flags)
976 {
977 fz_font *font = span->font;
978 fz_matrix tm = span->trm;
979 float adv;
980 int unicode;
981 int i;
982
983 for (i = start; i < end; i++)
984 {
985 /* Calculate new pen location and delta */
986 tm.e = span->items[i].x;
987 tm.f = span->items[i].y;
988 dev->last.trm = fz_concat(tm, ctm);
989 dev->last.bidi_level = span->bidi_level;
990 dev->last.wmode = span->wmode;
991 if (font != dev->last.font)
992 {
993 fz_drop_font(ctx, dev->last.font);
994 dev->last.font = fz_keep_font(ctx, font);
995 }
996 dev->last.valid = 1;
997 dev->last.flags = flags;
998
999 if (dev->flags & (FZ_STEXT_CLIP | FZ_STEXT_CLIP_RECT))
1000 {
1001 fz_rect r = current_clip(ctx, dev);
1002 if (fz_glyph_entirely_outside_box(ctx, &ctm, span, &span->items[i], &r))
1003 {
1004 dev->last.clipped = 1;
1005 continue;
1006 }
1007 }
1008 dev->last.clipped = 0;
1009
1010 /* Calculate bounding box and new pen position based on font metrics */
1011 if (span->items[i].gid >= 0)
1012 adv = span->items[i].adv;
1013 else
1014 adv = 0;
1015
1016 unicode = span->items[i].ucs;
1017 if (unicode == FZ_REPLACEMENT_CHARACTER)
1018 {
1019 if (dev->flags & FZ_STEXT_USE_CID_FOR_UNKNOWN_UNICODE)
1020 {
1021 unicode = span->items[i].cid;
1022 flags |= FZ_STEXT_UNICODE_IS_CID;
1023 }
1024 else if (dev->flags & FZ_STEXT_USE_GID_FOR_UNKNOWN_UNICODE)
1025 {
1026 unicode = span->items[i].gid;
1027 flags |= FZ_STEXT_UNICODE_IS_GID;
1028 }
1029 }
1030
1031 /* Send the chars we have through. */
1032 fz_add_stext_char(ctx, dev, font,
1033 unicode,
1034 span->items[i].gid,
1035 dev->last.trm,
1036 adv,
1037 dev->last.wmode,
1038 dev->last.bidi_level,
1039 (i == 0) && (dev->flags & FZ_STEXT_PRESERVE_SPANS),
1040 flags);
1041 }
1042 }
1043
/*
 * Return the rune at position idx (0-based, counted in runes) of a
 * UTF-8 string, or -1 if the terminator is reached at or before that
 * position.
 */
static int
rune_index(const char *utf8, size_t idx)
{
	for (;;)
	{
		int rune;

		utf8 += fz_chartorune(&rune, utf8);
		if (rune == 0)
			return -1;
		if (idx-- == 0)
			return rune;
	}
}
1060
1061 static void
1062 flush_actualtext(fz_context *ctx, fz_stext_device *dev, const char *actualtext, int i)
1063 {
1064 if (*actualtext == 0)
1065 return;
1066
1067 while (1)
1068 {
1069 int rune;
1070 actualtext += fz_chartorune(&rune, actualtext);
1071
1072 if (rune == 0)
1073 break;
1074
1075 if (dev->flags & (FZ_STEXT_CLIP | FZ_STEXT_CLIP_RECT))
1076 if (dev->last.clipped)
1077 continue;
1078
1079 fz_add_stext_char(ctx, dev, dev->last.font,
1080 rune,
1081 -1,
1082 dev->last.trm,
1083 0,
1084 dev->last.wmode,
1085 dev->last.bidi_level,
1086 (i == 0) && (dev->flags & FZ_STEXT_PRESERVE_SPANS),
1087 dev->last.flags);
1088 i++;
1089 }
1090 }
1091
/* Extract one text span while an ActualText metatext (mt) is in force.
 *
 * The span is compared against the remaining replacement text in mt->text:
 *  - a leading run of items that matches the start of the text is sent
 *    through do_extract unchanged;
 *  - a trailing run that matches the end of the text is sent through
 *    do_extract last;
 *  - the items in between are emitted one by one, each carrying the next
 *    rune of the replacement text (or -1 once the runes run out).
 *
 * mt->text is updated in place: it is shortened to whatever has not been
 * consumed, or emptied once a matching postfix has been found and the rest
 * has been flushed.
 *
 * NOTE(review): mt->text is passed to fz_utflen without a NULL check, yet
 * fz_stext_begin_metatext can store NULL - confirm callers never do this
 * for ActualText. */
static void
do_extract_within_actualtext(fz_context *ctx, fz_stext_device *dev, fz_text_span *span, fz_matrix ctm, metatext_t *mt, int flags)
{
	/* We are within an actualtext block. This means we can't just add the chars
	 * as they are. We need to add the chars as they are meant to be. Sadly the
	 * actualtext mechanism doesn't help us at all with positioning. */
	fz_font *font = span->font;
	fz_matrix tm = span->trm;
	float adv;
	int start, i, end;
	char *actualtext = mt->text;
	/* z counts the runes of actualtext not yet matched or consumed. */
	size_t z = fz_utflen(actualtext);

	/* If actualtext is empty, nothing to do! */
	if (z == 0)
		return;

	/* Now, we HOPE that the creator of a PDF will minimise the actual text
	 * differences, so that we'll get:
	 *   "Politicians <Actualtext="lie">fib</ActualText>, always."
	 * rather than:
	 *   "<Actualtext="Politicians lie, always">Politicians fib, always.</ActualText>
	 * but experience with PDF files tells us that this won't always be the case.
	 *
	 * We try to minimise the actualtext section here, just in case.
	 */

	/* Spot a matching prefix and send it. */
	/* NOTE(review): the comparison is between the item's gid and the decoded
	 * rune - confirm gid (rather than a unicode field) is intended. */
	for (start = 0; start < span->len; start++)
	{
		int rune;
		int len = fz_chartorune(&rune, actualtext);
		if (span->items[start].gid != rune || rune == 0)
			break;
		actualtext += len; z--;
	}
	if (start != 0)
		do_extract(ctx, dev, span, ctm, 0, start, flags);

	if (start == span->len)
	{
		/* The prefix has consumed all this object. Just shorten the actualtext and we'll
		 * catch the rest next time. */
		z = strlen(actualtext)+1;
		memmove(mt->text, actualtext, z);
		return;
	}

	/* We haven't consumed the whole string, so there must be runes left.
	 * Shut coverity up. */
	assert(z != 0);

	/* Spot a matching postfix. Can't send it til the end. */
	for (end = span->len; end > start; end--)
	{
		/* Nasty n^2 algo here, cos backtracking through utf8 is not trivial. It'll do. */
		int rune = rune_index(actualtext, z-1);
		if (span->items[end-1].gid != rune)
			break;
		z--;
	}
	/* So we can send end -> span->len at the end. */

	/* So we have at least SOME chars that don't match. */
	/* Now, do the difficult bit in the middle.*/
	/* items[start..end] have to be sent with actualtext[start..z] */
	for (i = start; i < end; i++)
	{
		fz_text_item *item = &span->items[i];
		int rune = -1;

		/* NOTE(review): i is an absolute item index while z has already been
		 * reduced by the prefix length; check whether (i - start) was meant. */
		if ((size_t)i < z)
			actualtext += fz_chartorune(&rune, actualtext);

		/* Calculate new pen location and delta */
		tm.e = item->x;
		tm.f = item->y;
		dev->last.trm = fz_concat(tm, ctm);
		dev->last.bidi_level = span->bidi_level;
		dev->last.wmode = span->wmode;
		if (font != dev->last.font)
		{
			fz_drop_font(ctx, dev->last.font);
			dev->last.font = fz_keep_font(ctx, font);
		}
		dev->last.valid = 1;

		/* With clipping enabled, glyphs wholly outside the current clip are
		 * skipped; dev->last.clipped records the outcome for the last glyph. */
		if (dev->flags & (FZ_STEXT_CLIP | FZ_STEXT_CLIP_RECT))
		{
			fz_rect r = current_clip(ctx, dev);
			if (fz_glyph_entirely_outside_box(ctx, &ctm, span, &span->items[i], &r))
			{
				dev->last.clipped = 1;
				continue;
			}
		}
		dev->last.clipped = 0;

		/* Calculate bounding box and new pen position based on font metrics */
		if (item->gid >= 0)
			adv = item->adv;
		else
			adv = 0;

		fz_add_stext_char(ctx, dev, font,
			rune,
			span->items[i].gid,
			dev->last.trm,
			adv,
			dev->last.wmode,
			dev->last.bidi_level,
			(i == 0) && (dev->flags & FZ_STEXT_PRESERVE_SPANS),
			flags);
	}

	/* If we haven't spotted a postfix by this point, then don't force ourselves to output
	 * any more of the actualtext at this point. We might get a new text object that matches
	 * more of it. */
	if (end == span->len)
	{
		/* Shorten actualtext and exit. */
		z = strlen(actualtext)+1;
		memmove(mt->text, actualtext, z);
		return;
	}

	/* We found a matching postfix. It seems likely that this is going to be the only
	 * text object we get, so send any remaining actualtext now. */
	/* NOTE(review): flush_actualtext sends everything up to the terminator;
	 * verify this does not re-send the runes matched as the postfix. */
	flush_actualtext(ctx, dev, actualtext, i);

	/* Send the postfix */
	if (end != span->len)
		do_extract(ctx, dev, span, ctm, end, span->len, flags);

	/* Mark the replacement text as fully consumed. */
	mt->text[0] = 0;
}
1228
1229 static void
1230 fz_stext_extract(fz_context *ctx, fz_stext_device *dev, fz_text_span *span, fz_matrix ctm, int flags)
1231 {
1232 fz_stext_device *tdev = (fz_stext_device*)dev;
1233 metatext_t *mt = NULL;
1234
1235 if (span->len == 0)
1236 return;
1237
1238 /* Are we in an actualtext? */
1239 if (!(tdev->opts.flags & FZ_STEXT_IGNORE_ACTUALTEXT))
1240 mt = find_actualtext(dev);
1241
1242 if (mt)
1243 do_extract_within_actualtext(ctx, dev, span, ctm, mt, flags);
1244 else
1245 do_extract(ctx, dev, span, ctm, 0, span->len, flags);
1246 }
1247
1248 static uint32_t hexrgba_from_color(fz_context *ctx, fz_colorspace *colorspace, const float *color, float alpha)
1249 {
1250 float rgb[3];
1251 fz_convert_color(ctx, colorspace, color, fz_device_rgb(ctx), rgb, NULL, fz_default_color_params);
1252 return
1253 ((uint32_t) (fz_clampi(alpha * 255 + 0.5f, 0, 255) << 24)) |
1254 ((uint32_t) (fz_clampi(rgb[0] * 255 + 0.5f, 0, 255) << 16)) |
1255 ((uint32_t) (fz_clampi(rgb[1] * 255 + 0.5f, 0, 255) << 8)) |
1256 ((uint32_t) (fz_clampi(rgb[2] * 255 + 0.5f, 0, 255)));
1257 }
1258
1259 static void
1260 fz_stext_fill_text(fz_context *ctx, fz_device *dev, const fz_text *text, fz_matrix ctm,
1261 fz_colorspace *colorspace, const float *color, float alpha, fz_color_params color_params)
1262 {
1263 fz_stext_device *tdev = (fz_stext_device*)dev;
1264 fz_text_span *span;
1265 if (text == tdev->lasttext && (tdev->opts.flags & FZ_STEXT_COLLECT_STYLES) == 0)
1266 return;
1267 tdev->color = hexrgba_from_color(ctx, colorspace, color, alpha);
1268 tdev->new_obj = 1;
1269 for (span = text->head; span; span = span->next)
1270 fz_stext_extract(ctx, tdev, span, ctm, FZ_STEXT_FILLED);
1271 fz_drop_text(ctx, tdev->lasttext);
1272 tdev->lasttext = fz_keep_text(ctx, text);
1273 }
1274
1275 static void
1276 fz_stext_stroke_text(fz_context *ctx, fz_device *dev, const fz_text *text, const fz_stroke_state *stroke, fz_matrix ctm,
1277 fz_colorspace *colorspace, const float *color, float alpha, fz_color_params color_params)
1278 {
1279 fz_stext_device *tdev = (fz_stext_device*)dev;
1280 fz_text_span *span;
1281 if (text == tdev->lasttext && (tdev->opts.flags & FZ_STEXT_COLLECT_STYLES) == 0)
1282 return;
1283 tdev->color = hexrgba_from_color(ctx, colorspace, color, alpha);
1284 tdev->new_obj = 1;
1285 for (span = text->head; span; span = span->next)
1286 fz_stext_extract(ctx, tdev, span, ctm, FZ_STEXT_STROKED);
1287 fz_drop_text(ctx, tdev->lasttext);
1288 tdev->lasttext = fz_keep_text(ctx, text);
1289 }
1290
1291 static void
1292 fz_stext_clip_text(fz_context *ctx, fz_device *dev, const fz_text *text, fz_matrix ctm, fz_rect scissor)
1293 {
1294 fz_stext_device *tdev = (fz_stext_device*)dev;
1295 fz_text_span *span;
1296 if (text == tdev->lasttext && (tdev->opts.flags & FZ_STEXT_COLLECT_STYLES) == 0)
1297 return;
1298 tdev->color = 0;
1299 tdev->new_obj = 1;
1300 for (span = text->head; span; span = span->next)
1301 fz_stext_extract(ctx, tdev, span, ctm, FZ_STEXT_FILLED | FZ_STEXT_CLIPPED);
1302 fz_drop_text(ctx, tdev->lasttext);
1303 tdev->lasttext = fz_keep_text(ctx, text);
1304 }
1305
1306 static void
1307 fz_stext_clip_stroke_text(fz_context *ctx, fz_device *dev, const fz_text *text, const fz_stroke_state *stroke, fz_matrix ctm, fz_rect scissor)
1308 {
1309 fz_stext_device *tdev = (fz_stext_device*)dev;
1310 fz_text_span *span;
1311 if (text == tdev->lasttext && (tdev->opts.flags & FZ_STEXT_COLLECT_STYLES) == 0)
1312 return;
1313 tdev->color = 0;
1314 tdev->new_obj = 1;
1315 for (span = text->head; span; span = span->next)
1316 fz_stext_extract(ctx, tdev, span, ctm, FZ_STEXT_STROKED | FZ_STEXT_CLIPPED);
1317 fz_drop_text(ctx, tdev->lasttext);
1318 tdev->lasttext = fz_keep_text(ctx, text);
1319 }
1320
1321 static void
1322 fz_stext_ignore_text(fz_context *ctx, fz_device *dev, const fz_text *text, fz_matrix ctm)
1323 {
1324 fz_stext_device *tdev = (fz_stext_device*)dev;
1325 fz_text_span *span;
1326 if (text == tdev->lasttext && (tdev->opts.flags & FZ_STEXT_COLLECT_STYLES) == 0)
1327 return;
1328 tdev->color = 0;
1329 tdev->new_obj = 1;
1330 for (span = text->head; span; span = span->next)
1331 fz_stext_extract(ctx, tdev, span, ctm, 0);
1332 fz_drop_text(ctx, tdev->lasttext);
1333 tdev->lasttext = fz_keep_text(ctx, text);
1334 }
1335
1336 static void
1337 fz_stext_begin_metatext(fz_context *ctx, fz_device *dev, fz_metatext meta, const char *text)
1338 {
1339 fz_stext_device *tdev = (fz_stext_device*)dev;
1340 metatext_t *mt = fz_malloc_struct(ctx, metatext_t);
1341
1342 mt->prev = tdev->metatext;
1343 tdev->metatext = mt;
1344 mt->type = meta;
1345 mt->text = text ? fz_strdup(ctx, text) : NULL;
1346 mt->bounds = fz_empty_rect;
1347 }
1348
1349 static void
1350 pop_metatext(fz_context *ctx, fz_stext_device *dev)
1351 {
1352 metatext_t *prev;
1353 fz_rect bounds;
1354
1355 if (!dev->metatext)
1356 return;
1357
1358 prev = dev->metatext->prev;
1359 bounds = dev->metatext->bounds;
1360 fz_free(ctx, dev->metatext->text);
1361 fz_free(ctx, dev->metatext);
1362 dev->metatext = prev;
1363 if (prev)
1364 prev->bounds = fz_union_rect(prev->bounds, bounds);
1365 }
1366
1367 static void
1368 fz_stext_end_metatext(fz_context *ctx, fz_device *dev)
1369 {
1370 fz_stext_device *tdev = (fz_stext_device*)dev;
1371 fz_font *myfont = NULL;
1372
1373 if (!tdev->metatext)
1374 return; /* Mismatched pop. Live with it. */
1375
1376 if (tdev->metatext->type != FZ_METATEXT_ACTUALTEXT)
1377 {
1378 /* We only deal with ActualText here. Just pop anything else off,
1379 * and we're done. */
1380 pop_metatext(ctx, tdev);
1381 return;
1382 }
1383
1384 /* If we have a 'last' text position, send the content after that. */
1385 if (tdev->last.valid)
1386 {
1387 flush_actualtext(ctx, tdev, tdev->metatext->text, 0);
1388 pop_metatext(ctx, tdev);
1389 return;
1390 }
1391
1392 /* If we have collected a rectangle for content that encloses the actual text,
1393 * send the content there. */
1394 if (!fz_is_empty_rect(tdev->metatext->bounds))
1395 {
1396 tdev->last.trm.a = tdev->metatext->bounds.x1 - tdev->metatext->bounds.x0;
1397 tdev->last.trm.b = 0;
1398 tdev->last.trm.c = 0;
1399 tdev->last.trm.d = tdev->metatext->bounds.y1 - tdev->metatext->bounds.y0;
1400 tdev->last.trm.e = tdev->metatext->bounds.x0;
1401 tdev->last.trm.f = tdev->metatext->bounds.y0;
1402 }
1403 else
1404 fz_warn(ctx, "Actualtext with no position. Text may be lost or mispositioned.");
1405
1406 fz_var(myfont);
1407
1408 fz_try(ctx)
1409 {
1410 if (tdev->last.font == NULL)
1411 {
1412 myfont = fz_new_base14_font(ctx, "Helvetica");
1413 tdev->last.font = myfont;
1414 }
1415 flush_actualtext(ctx, tdev, tdev->metatext->text, 0);
1416 pop_metatext(ctx, tdev);
1417 }
1418 fz_always(ctx)
1419 {
1420 if (myfont)
1421 {
1422 tdev->last.font = NULL;
1423 fz_drop_font(ctx, myfont);
1424 }
1425 }
1426 fz_catch(ctx)
1427 fz_rethrow(ctx);
1428 }
1429
1430
1431 /* Images and shadings */
1432
1433 static void
1434 fz_stext_fill_image(fz_context *ctx, fz_device *dev, fz_image *img, fz_matrix ctm, float alpha, fz_color_params color_params)
1435 {
1436 fz_stext_device *tdev = (fz_stext_device*)dev;
1437 fz_rect *bounds = actualtext_bounds(tdev);
1438
1439 /* If there is an actualtext in force, update its bounds. */
1440 if (bounds)
1441 {
1442 static const fz_rect unit = { 0, 0, 1, 1 };
1443 *bounds = fz_union_rect(*bounds, fz_transform_rect(unit, ctm));
1444 }
1445
1446 /* Unless we are being told to preserve images, nothing to do here. */
1447 if ((tdev->opts.flags & FZ_STEXT_PRESERVE_IMAGES) == 0)
1448 return;
1449
1450 /* If the alpha is less than 50% then it's probably a watermark or effect or something. Skip it. */
1451 if (alpha >= 0.5f)
1452 add_image_block_to_page(ctx, tdev->page, ctm, img);
1453
1454 }
1455
1456 static void
1457 fz_stext_fill_image_mask(fz_context *ctx, fz_device *dev, fz_image *img, fz_matrix ctm,
1458 fz_colorspace *cspace, const float *color, float alpha, fz_color_params color_params)
1459 {
1460 fz_stext_fill_image(ctx, dev, img, ctm, alpha, color_params);
1461 }
1462
1463 static fz_image *
1464 fz_new_image_from_shade(fz_context *ctx, fz_shade *shade, fz_matrix *in_out_ctm, fz_color_params color_params, fz_rect scissor)
1465 {
1466 fz_matrix ctm = *in_out_ctm;
1467 fz_pixmap *pix;
1468 fz_image *img = NULL;
1469 fz_rect bounds;
1470 fz_irect bbox;
1471
1472 bounds = fz_bound_shade(ctx, shade, ctm);
1473 bounds = fz_intersect_rect(bounds, scissor);
1474 bbox = fz_irect_from_rect(bounds);
1475
1476 pix = fz_new_pixmap_with_bbox(ctx, fz_device_rgb(ctx), bbox, NULL, !shade->use_background);
1477 fz_try(ctx)
1478 {
1479 if (shade->use_background)
1480 fz_fill_pixmap_with_color(ctx, pix, shade->colorspace, shade->background, color_params);
1481 else
1482 fz_clear_pixmap(ctx, pix);
1483 fz_paint_shade(ctx, shade, NULL, ctm, pix, color_params, bbox, NULL, NULL);
1484 img = fz_new_image_from_pixmap(ctx, pix, NULL);
1485 }
1486 fz_always(ctx)
1487 fz_drop_pixmap(ctx, pix);
1488 fz_catch(ctx)
1489 fz_rethrow(ctx);
1490
1491 in_out_ctm->a = pix->w;
1492 in_out_ctm->b = 0;
1493 in_out_ctm->c = 0;
1494 in_out_ctm->d = pix->h;
1495 in_out_ctm->e = pix->x;
1496 in_out_ctm->f = pix->y;
1497 return img;
1498 }
1499
1500 static void
1501 fz_stext_fill_shade(fz_context *ctx, fz_device *dev, fz_shade *shade, fz_matrix ctm, float alpha, fz_color_params color_params)
1502 {
1503 fz_stext_device *tdev = (fz_stext_device*)dev;
1504 fz_rect *bounds = actualtext_bounds(tdev);
1505 fz_matrix local_ctm;
1506 fz_rect scissor;
1507 fz_image *image;
1508
1509 /* If we aren't keeping images, but we are in a bound, update the bounds
1510 * without generating the entire image. */
1511 if ((tdev->opts.flags & FZ_STEXT_PRESERVE_IMAGES) == 0 && bounds)
1512 {
1513 *bounds = fz_union_rect(*bounds, fz_bound_shade(ctx, shade, ctm));
1514 return;
1515 }
1516
1517 /* Unless we are preserving image, nothing to do here. */
1518 if ((tdev->opts.flags & FZ_STEXT_PRESERVE_IMAGES) == 0)
1519 return;
1520
1521 local_ctm = ctm;
1522 scissor = fz_device_current_scissor(ctx, dev);
1523 if (dev->flags & FZ_STEXT_CLIP_RECT)
1524 scissor = fz_intersect_rect(scissor, tdev->opts.clip);
1525 scissor = fz_intersect_rect(scissor, tdev->page->mediabox);
1526 image = fz_new_image_from_shade(ctx, shade, &local_ctm, color_params, scissor);
1527 fz_try(ctx)
1528 fz_stext_fill_image(ctx, dev, image, local_ctm, alpha, color_params);
1529 fz_always(ctx)
1530 fz_drop_image(ctx, image);
1531 fz_catch(ctx)
1532 fz_rethrow(ctx);
1533 }
1534
1535 static void
1536 fixup_bboxes_and_bidi(fz_context *ctx, fz_stext_block *block)
1537 {
1538 fz_stext_line *line;
1539 fz_stext_char *ch;
1540
1541 for ( ; block != NULL; block = block->next)
1542 {
1543 if (block->type == FZ_STEXT_BLOCK_STRUCT)
1544 if (block->u.s.down)
1545 fixup_bboxes_and_bidi(ctx, block->u.s.down->first_block);
1546 if (block->type != FZ_STEXT_BLOCK_TEXT)
1547 continue;
1548 for (line = block->u.t.first_line; line; line = line->next)
1549 {
1550 int reorder = 0;
1551 for (ch = line->first_char; ch; ch = ch->next)
1552 {
1553 fz_rect ch_box = fz_rect_from_quad(ch->quad);
1554 if (ch == line->first_char)
1555 line->bbox = ch_box;
1556 else
1557 line->bbox = fz_union_rect(line->bbox, ch_box);
1558 if (ch->bidi == 3)
1559 reorder = 1;
1560 }
1561 block->bbox = fz_union_rect(block->bbox, line->bbox);
1562 if (reorder)
1563 reverse_bidi_line(line);
1564 }
1565 }
1566 }
1567
1568 static void
1569 advance_to_x(fz_point *a, fz_point b, float x)
1570 {
1571 a->y += (b.y - a->y) * (x - a->x) / (b.x - a->x);
1572 a->x = x;
1573 }
1574
1575 static void
1576 advance_to_y(fz_point *a, fz_point b, float y)
1577 {
1578 a->x += (b.x - a->x) * (y - a->y) / (b.y - a->y);
1579 a->y = y;
1580 }
1581
1582 static int
1583 line_crosses_rect(fz_point a, fz_point b, fz_rect r)
1584 {
1585 /* Cope with trivial exclusions */
1586 if (a.x < r.x0 && b.x < r.x0)
1587 return 0;
1588 if (a.x > r.x1 && b.x > r.x1)
1589 return 0;
1590 if (a.y < r.y0 && b.y < r.y0)
1591 return 0;
1592 if (a.y > r.y1 && b.y > r.y1)
1593 return 0;
1594
1595 if (a.x < r.x0)
1596 advance_to_x(&a, b, r.x0);
1597 if (a.x > r.x1)
1598 advance_to_x(&a, b, r.x1);
1599 if (a.y < r.y0)
1600 advance_to_y(&a, b, r.y0);
1601 if (a.y > r.y1)
1602 advance_to_y(&a, b, r.y1);
1603
1604 return fz_is_point_inside_rect(a, r);
1605 }
1606
1607 static float
1608 calculate_ascent(fz_point p, fz_point origin, fz_point dir)
1609 {
1610 return fabsf((origin.x-p.x)*dir.y - (origin.y-p.y)*dir.x);
1611 }
1612
1613 /* Create us a rect from the given quad, but extend it downwards
1614 * to allow for underlines that pass under the glyphs. */
1615 static fz_rect expanded_rect_from_quad(fz_quad quad, fz_point dir, fz_point origin, float size)
1616 {
1617 /* Consider the two rects from A and g respectively.
1618 *
1619 * ul +------+ ur or
1620 * | /\ | ul +------+ ur
1621 * | /__\ | | /''\ |
1622 * |/ \| |( ||
1623 * ll +------+ lr | ''''||
1624 * | ''' | <-expected underline level
1625 * ll +------+ lr
1626 *
1627 * So an underline won't cross A's rect, but will cross g's.
1628 * We want to make a rect that includes a suitable amount of
1629 * space underneath. The information we have available to us
1630 * is summed up here:
1631 *
1632 * ul +---------+ ur
1633 * | |
1634 * | origin |
1635 * |+----------> dir
1636 * | |
1637 * ll +---------+ lr
1638 *
1639 * Consider the distance from ul to the line that passes through
1640 * the origin with direction dir. Similarly, consider the distance
1641 * from ur to the same line. This can be thought of as the 'ascent'
1642 * of this character.
1643 *
1644 * We'd like the distance from ul to ll to be greater than this, so
1645 * as to ensure we cover the possible location where an underline
1646 * might reasonably go.
1647 *
1648 * If we have a line (l) through point A with direction vector u,
1649 * the distance between point P and line(l) is:
1650 *
1651 * d(P,l) = || AP x u || / || u ||
1652 *
1653 * where x is the cross product.
1654 *
1655 * For us, because || dir || = 1:
1656 *
1657 * d(ul, origin) = || (origin-ul) x dir ||
1658 *
1659 * The cross product is only defined in 3 (or 7!) dimensions, so
1660 * extend both vectors into 3d by defining a 0 z component.
1661 *
1662 * (origin-ul) x dir = [ (origin.y - ul.y) . 0 - 0 . dir.y ]
1663 * [ 0 . dir.x - (origin.x - ul.y) . 0 ]
1664 * [ (origin.x - ul.x) . dir.y - (origin.y - ul.y) . dir.x ]
1665 *
1666 * So d(ul, origin) = abs(D) where D = (origin.x-ul.x).dir.y - (origin.y-ul.y).dir.x
1667 */
1668 float ascent = (calculate_ascent(quad.ul, origin, dir) + calculate_ascent(quad.ur, origin, dir)) / 2;
1669 fz_point left = { quad.ll.x - quad.ul.x, quad.ll.y - quad.ul.y };
1670 fz_point right = { quad.lr.x - quad.ur.x, quad.lr.y - quad.ur.y };
1671 float height = (hypotf(left.x, left.y) + hypotf(right.x, right.y))/2;
1672 int neg = 0;
1673 float extra_rise = 0;
1674
1675 /* Spaces will have 0 ascent. underscores will have small ascent.
1676 * We want a sane ascent to be able to spot strikeouts, but not
1677 * so big that it incorporates lines above the text, like borders. */
1678 if (ascent < 0.75*size)
1679 extra_rise = 0.75*size - ascent;
1680
1681 /* We'd like height to be at least ascent + 1/4 size */
1682 if (height < 0)
1683 neg = 1, height = -height;
1684 if (height < ascent + size * 0.25f)
1685 height = ascent + size * 0.25f;
1686
1687 height -= ascent;
1688 if (neg)
1689 height = -height;
1690 quad.ll.x += - height * dir.y;
1691 quad.ll.y += height * dir.x;
1692 quad.lr.x += - height * dir.y;
1693 quad.lr.y += height * dir.x;
1694 quad.ul.x -= - extra_rise * dir.y;
1695 quad.ul.y -= extra_rise * dir.x;
1696 quad.ur.x -= - extra_rise * dir.y;
1697 quad.ur.y -= extra_rise * dir.x;
1698
1699 return fz_rect_from_quad(quad);
1700 }
1701
/* Approximate float equality: non-zero when a and b differ by less than
 * EPSILON. */
static int feq(float a,float b)
{
#define EPSILON 0.00001
	float diff = a - b;

	if (diff < 0)
		diff = -diff;
	return diff < EPSILON;
}
1710
1711 static void
1712 check_strikeout(fz_context *ctx, fz_stext_block *block, fz_point from, fz_point to, fz_point dir, float thickness)
1713 {
1714 for ( ; block; block = block->next)
1715 {
1716 fz_stext_line *line;
1717
1718 if (block->type != FZ_STEXT_BLOCK_TEXT)
1719 continue;
1720
1721 for (line = block->u.t.first_line; line != NULL; line = line->next)
1722 {
1723 fz_stext_char *ch;
1724
1725 if ((!feq(line->dir.x, dir.x) || !feq(line->dir.y, dir.y)) &&
1726 (!feq(line->dir.x, -dir.x) || !feq(line->dir.y, -dir.y)))
1727 continue;
1728
1729 /* Matching directions... */
1730
1731 /* Unfortunately, we don't have a valid line->bbox at this point, so we need to check
1732 * chars. - FIXME: Now we do! */
1733 for (ch = line->first_char; ch; ch = ch->next)
1734 {
1735 fz_point up;
1736 float dx, dy, dot;
1737 fz_rect ch_box;
1738
1739 /* If the thickness is more than a 1/4 of the size, it's a highlight, not a
1740 * line! */
1741 if (ch->size < thickness*4)
1742 continue;
1743
1744 ch_box = expanded_rect_from_quad(ch->quad, line->dir, ch->origin, ch->size);
1745
1746 if (!line_crosses_rect(from, to, ch_box))
1747 continue;
1748
1749 /* Is this a strikeout or an underline? */
1750
1751 /* The baseline moves from ch->origin in the direction line->dir */
1752 up.x = line->dir.y;
1753 up.y = -line->dir.x;
1754
1755 /* How far is our line displaced from the line through the origin? */
1756 dx = from.x - ch->origin.x;
1757 dy = from.y - ch->origin.y;
1758 /* Dot product with up. up is normalised */
1759 dot = dx * up.x + dy * up.y;
1760
1761 if (dot > 0)
1762 ch->flags |= FZ_STEXT_STRIKEOUT;
1763 else
1764 ch->flags |= FZ_STEXT_UNDERLINE;
1765 }
1766 }
1767 }
1768 }
1769
1770 static void
1771 check_rects_for_strikeout(fz_context *ctx, fz_stext_device *tdev, fz_stext_page *page)
1772 {
1773 int i, n = tdev->rect_len;
1774
1775 for (i = 0; i < n; i++)
1776 {
1777 fz_point from = tdev->rects[i].from;
1778 fz_point to = tdev->rects[i].to;
1779 float thickness = tdev->rects[i].thickness;
1780 fz_point dir;
1781 dir.x = to.x - from.x;
1782 dir.y = to.y - from.y;
1783 dir = fz_normalize_vector(dir);
1784
1785 check_strikeout(ctx, page->first_block, from, to, dir, thickness);
1786 }
1787 }
1788
1789 static void
1790 fz_stext_close_device(fz_context *ctx, fz_device *dev)
1791 {
1792 fz_stext_device *tdev = (fz_stext_device*)dev;
1793 fz_stext_page *page = tdev->page;
1794
1795 fixup_bboxes_and_bidi(ctx, page->first_block);
1796
1797 if (tdev->opts.flags & FZ_STEXT_COLLECT_STYLES)
1798 check_rects_for_strikeout(ctx, tdev, page);
1799
1800 /* TODO: smart sorting of blocks and lines in reading order */
1801 /* TODO: unicode NFC normalization */
1802
1803 if (tdev->opts.flags & FZ_STEXT_SEGMENT)
1804 fz_segment_stext_page(ctx, page);
1805
1806 if (tdev->opts.flags & FZ_STEXT_PARAGRAPH_BREAK)
1807 fz_paragraph_break(ctx, page);
1808
1809 if (tdev->opts.flags & FZ_STEXT_TABLE_HUNT)
1810 fz_table_hunt(ctx, page);
1811 }
1812
1813 static void
1814 fz_stext_drop_device(fz_context *ctx, fz_device *dev)
1815 {
1816 fz_stext_device *tdev = (fz_stext_device*)dev;
1817 fz_drop_text(ctx, tdev->lasttext);
1818 fz_drop_font(ctx, tdev->last.font);
1819 while (tdev->metatext)
1820 pop_metatext(ctx, tdev);
1821
1822 fz_free(ctx, tdev->rects);
1823 }
1824
1825 static int
1826 val_is_rect(const char *val, fz_rect *rp)
1827 {
1828 fz_rect r;
1829 const char *s;
1830
1831 s = strchr(val, ':');
1832 if (s == NULL || s == val)
1833 return 0;
1834 r.x0 = fz_atof(val);
1835 val = s+1;
1836 s = strchr(val, ':');
1837 if (s == NULL || s == val)
1838 return 0;
1839 r.y0 = fz_atof(val);
1840 val = s+1;
1841 s = strchr(val, ':');
1842 if (s == NULL || s == val)
1843 return 0;
1844 r.x1 = fz_atof(val);
1845 val = s+1;
1846 r.y1 = fz_atof(val);
1847
1848 *rp = r;
1849
1850 return 1;
1851 }
1852
1853 fz_stext_options *
1854 fz_parse_stext_options(fz_context *ctx, fz_stext_options *opts, const char *string)
1855 {
1856 const char *val;
1857
1858 memset(opts, 0, sizeof *opts);
1859
1860 if (fz_has_option(ctx, string, "preserve-ligatures", &val) && fz_option_eq(val, "yes"))
1861 opts->flags |= FZ_STEXT_PRESERVE_LIGATURES;
1862 if (fz_has_option(ctx, string, "preserve-whitespace", &val) && fz_option_eq(val, "yes"))
1863 opts->flags |= FZ_STEXT_PRESERVE_WHITESPACE;
1864 if (fz_has_option(ctx, string, "preserve-images", &val) && fz_option_eq(val, "yes"))
1865 opts->flags |= FZ_STEXT_PRESERVE_IMAGES;
1866 if (fz_has_option(ctx, string, "inhibit-spaces", &val) && fz_option_eq(val, "yes"))
1867 opts->flags |= FZ_STEXT_INHIBIT_SPACES;
1868 if (fz_has_option(ctx, string, "dehyphenate", &val) && fz_option_eq(val, "yes"))
1869 opts->flags |= FZ_STEXT_DEHYPHENATE;
1870 if (fz_has_option(ctx, string, "preserve-spans", &val) && fz_option_eq(val, "yes"))
1871 opts->flags |= FZ_STEXT_PRESERVE_SPANS;
1872 if (fz_has_option(ctx, string, "structured", &val) && fz_option_eq(val, "yes"))
1873 opts->flags |= FZ_STEXT_COLLECT_STRUCTURE;
1874 if (fz_has_option(ctx, string, "use-cid-for-unknown-unicode", &val) && fz_option_eq(val, "yes"))
1875 opts->flags |= FZ_STEXT_USE_CID_FOR_UNKNOWN_UNICODE;
1876 if (fz_has_option(ctx, string, "use-gid-for-unknown-unicode", &val) && fz_option_eq(val, "yes"))
1877 opts->flags |= FZ_STEXT_USE_GID_FOR_UNKNOWN_UNICODE;
1878 if (fz_has_option(ctx, string, "accurate-bboxes", &val) && fz_option_eq(val, "yes"))
1879 opts->flags |= FZ_STEXT_ACCURATE_BBOXES;
1880 if (fz_has_option(ctx, string, "vectors", &val) && fz_option_eq(val, "yes"))
1881 opts->flags |= FZ_STEXT_COLLECT_VECTORS;
1882 if (fz_has_option(ctx, string, "ignore-actualtext", & val) && fz_option_eq(val, "yes"))
1883 opts->flags |= FZ_STEXT_IGNORE_ACTUALTEXT;
1884 if (fz_has_option(ctx, string, "segment", &val) && fz_option_eq(val, "yes"))
1885 opts->flags |= FZ_STEXT_SEGMENT;
1886 if (fz_has_option(ctx, string, "paragraph-break", &val) && fz_option_eq(val, "yes"))
1887 opts->flags |= FZ_STEXT_PARAGRAPH_BREAK;
1888 if (fz_has_option(ctx, string, "table-hunt", &val) && fz_option_eq(val, "yes"))
1889 opts->flags |= FZ_STEXT_TABLE_HUNT;
1890 if (fz_has_option(ctx, string, "collect-styles", &val) && fz_option_eq(val, "yes"))
1891 opts->flags |= FZ_STEXT_COLLECT_STYLES;
1892 if (fz_has_option(ctx, string, "accurate-ascenders", &val) && fz_option_eq(val, "yes"))
1893 opts->flags |= FZ_STEXT_ACCURATE_ASCENDERS;
1894 if (fz_has_option(ctx, string, "accurate-side-bearings", &val) && fz_option_eq(val, "yes"))
1895 opts->flags |= FZ_STEXT_ACCURATE_SIDE_BEARINGS;
1896
1897 opts->flags |= FZ_STEXT_CLIP;
1898 if (fz_has_option(ctx, string, "mediabox-clip", &val))
1899 {
1900 fz_warn(ctx, "The 'mediabox-clip' option has been deprecated. Use 'clip' instead.");
1901 if (fz_option_eq(val, "no"))
1902 opts->flags ^= FZ_STEXT_CLIP;
1903 }
1904 if (fz_has_option(ctx, string, "clip", &val) && fz_option_eq(val, "no"))
1905 opts->flags ^= FZ_STEXT_CLIP;
1906 if (fz_has_option(ctx, string, "clip-rect", &val) && val_is_rect(val, &opts->clip))
1907 opts->flags |= FZ_STEXT_CLIP_RECT;
1908
1909 opts->scale = 1;
1910 if (fz_has_option(ctx, string, "resolution", &val))
1911 opts->scale = fz_atof(val) / 96.0f; /* HTML base resolution is 96ppi */
1912
1913 return opts;
1914 }
1915
/* State accumulated by the is_rect_* path walker callbacks while deciding
 * whether a path is a four-cornered shape. */
typedef struct
{
	/* Set to 1 as soon as the path is known not to qualify. */
	int fail;
	/* Number of entries of corners[] filled in so far. */
	int count;
	/* Points recorded by stash_point, in the order they were seen. */
	fz_point corners[4];
} is_rect_data;
1922
1923 static void
1924 stash_point(is_rect_data *rd, float x, float y)
1925 {
1926 if (rd->count > 3)
1927 {
1928 rd->fail = 1;
1929 return;
1930 }
1931
1932 rd->corners[rd->count].x = x;
1933 rd->corners[rd->count].y = y;
1934 rd->count++;
1935 }
1936
1937 static void
1938 is_rect_moveto(fz_context *ctx, void *arg, float x, float y)
1939 {
1940 is_rect_data *rd = arg;
1941 if (rd->fail)
1942 return;
1943
1944 if (rd->count != 0)
1945 {
1946 rd->fail = 1;
1947 return;
1948 }
1949 stash_point(rd, x, y);
1950 }
1951
1952 static void
1953 is_rect_lineto(fz_context *ctx, void *arg, float x, float y)
1954 {
1955 is_rect_data *rd = arg;
1956 if (rd->fail)
1957 return;
1958
1959 if (rd->count == 4 && rd->corners[0].x == x && rd->corners[1].y == y)
1960 return;
1961
1962 stash_point(rd, x, y);
1963 }
1964
1965 static void
1966 is_rect_curveto(fz_context *ctx, void *arg, float x1, float y1, float x2, float y2, float x3, float y3)
1967 {
1968 is_rect_data *rd = arg;
1969 rd->fail = 1;
1970 }
1971
1972 static void
1973 is_rect_closepath(fz_context *ctx, void *arg)
1974 {
1975 is_rect_data *rd = arg;
1976 if (rd->fail)
1977 return;
1978 if (rd->count == 3)
1979 stash_point(rd, rd->corners[0].x, rd->corners[0].y);
1980 if (rd->count != 4)
1981 rd->fail = 1;
1982 }
1983
1984 static int
1985 is_path_rect(fz_context *ctx, const fz_path *path, fz_point *from, fz_point *to, float *thickness, fz_matrix ctm)
1986 {
1987 float d01, d01x, d01y, d03, d03x, d03y, d32x, d32y;
1988 is_rect_data rd = { 0 };
1989 static const fz_path_walker walker =
1990 {
1991 is_rect_moveto, is_rect_lineto, is_rect_curveto, is_rect_closepath
1992 };
1993 int i;
1994
1995 fz_walk_path(ctx, path, &walker, &rd);
1996
1997 if (rd.fail)
1998 return 0;
1999
2000 if (rd.count == 2)
2001 {
2002 stash_point(&rd, rd.corners[1].x, rd.corners[1].y);
2003 stash_point(&rd, rd.corners[0].x, rd.corners[0].y);
2004 }
2005
2006 for (i = 0 ; i < 4; i++)
2007 {
2008 fz_point p = fz_transform_point(rd.corners[i], ctm);
2009
2010 rd.corners[i].x = p.x;
2011 rd.corners[i].y = p.y;
2012 }
2013
2014 /* So we have a 4 cornered path. Hopefully something like:
2015 * 0---------1
2016 * | |
2017 * 3---------2
2018 * but it might be:
2019 * 0---------3
2020 * | |
2021 * 1---------2
2022 */
2023 while (1)
2024 {
2025 d01x = rd.corners[1].x - rd.corners[0].x;
2026 d01y = rd.corners[1].y - rd.corners[0].y;
2027 d01 = d01x * d01x + d01y * d01y;
2028 d03x = rd.corners[3].x - rd.corners[0].x;
2029 d03y = rd.corners[3].y - rd.corners[0].y;
2030 d03 = d03x * d03x + d03y * d03y;
2031 if(d01 < d03)
2032 {
2033 /* We are the latter case. Transpose it. */
2034 fz_point p = rd.corners[1];
2035 rd.corners[1] = rd.corners[3];
2036 rd.corners[3] = p;
2037 }
2038 else
2039 break;
2040 }
2041 d32x = rd.corners[2].x - rd.corners[3].x;
2042 d32y = rd.corners[2].y - rd.corners[3].y;
2043
2044 /* So d32x and d01x need to be the same for this to be a strikeout. */
2045 if (!feq(d32x, d01x) || !feq(d32y, d01y))
2046 return 0;
2047
2048 /* We are plausibly a rectangle. */
2049 *thickness = sqrtf(d03x * d03x + d03y * d03y);
2050
2051 from->x = (rd.corners[0].x + rd.corners[3].x)/2;
2052 from->y = (rd.corners[0].y + rd.corners[3].y)/2;
2053 to->x = (rd.corners[1].x + rd.corners[2].x)/2;
2054 to->y = (rd.corners[1].y + rd.corners[2].y)/2;
2055
2056 return 1;
2057 }
2058
2059 static void
2060 check_for_strikeout(fz_context *ctx, fz_stext_device *tdev, fz_stext_page *page, const fz_path *path, fz_matrix ctm)
2061 {
2062 float thickness;
2063 fz_point from, to;
2064
2065 /* Is this path a thin rectangle (possibly rotated)? If so, then we need to
2066 * consider it as being a strikeout or underline. */
2067 if (!is_path_rect(ctx, path, &from, &to, &thickness, ctm))
2068 return;
2069
2070 /* Add to the list of rects in the device. */
2071 if (tdev->rect_len == tdev->rect_max)
2072 {
2073 int newmax = tdev->rect_max * 2;
2074 if (newmax == 0)
2075 newmax = 32;
2076
2077 tdev->rects = fz_realloc(ctx, tdev->rects, sizeof(*tdev->rects) * newmax);
2078 tdev->rect_max = newmax;
2079 }
2080 tdev->rects[tdev->rect_len].from = from;
2081 tdev->rects[tdev->rect_len].to = to;
2082 tdev->rects[tdev->rect_len].thickness = thickness;
2083 tdev->rect_len++;
2084 }
2085
2086 static void
2087 add_vector(fz_context *ctx, fz_stext_page *page, fz_rect bbox, uint32_t flags, uint32_t argb)
2088 {
2089 fz_stext_block *b = add_block_to_page(ctx, page);
2090
2091 b->type = FZ_STEXT_BLOCK_VECTOR;
2092 b->bbox = bbox;
2093 b->u.v.flags = flags;
2094 b->u.v.argb = argb;
2095 }
2096
/* State used by the split_* callbacks while breaking a path up into
 * rectangles and "everything else". */
typedef struct
{
	/* Transform applied to each incoming point. */
	fz_matrix ctm;
	/* Colour passed to add_vector for each vector block. */
	uint32_t argb;
	/* Base flags passed to add_vector for each vector block. */
	uint32_t flags;
	/* Page that receives the vector blocks. */
	fz_stext_page *page;
	/* Bounds of all points from subpaths that were not recognised as rectangles. */
	fz_rect leftovers;
	/* Bounds of the most recently recognised rectangle, not yet added to the page. */
	fz_rect pending;
	/* Number of points of the current subpath held in p[]; maybe_rect does
	 * nothing when this is negative. */
	int count;
	/* Transformed points of the current subpath. */
	fz_point p[5];
} split_path_data;
2108
2109 static void
2110 maybe_rect(fz_context *ctx, split_path_data *sp)
2111 {
2112 int rect = 0;
2113 int i;
2114
2115 if (sp->count >= 0)
2116 {
2117 if (sp->count == 3)
2118 {
2119 /* Allow for "moveto A, lineto B, lineto A, close" */
2120 if (feq(sp->p[0].x, sp->p[2].x) || feq(sp->p[0].y, sp->p[2].y))
2121 sp->count = 2;
2122 }
2123 if (sp->count == 2)
2124 {
2125 if (feq(sp->p[0].x, sp->p[1].x) || feq(sp->p[0].y, sp->p[1].y))
2126 rect = 1; /* Count that as a rect */
2127 }
2128 else if (sp->count == 4 || sp->count == 5)
2129 {
2130 if (feq(sp->p[0].x, sp->p[1].x) && feq(sp->p[2].x, sp->p[3].x) && feq(sp->p[0].y, sp->p[3].y) && feq(sp->p[1].y, sp->p[2].y))
2131 rect = 1;
2132 else if (feq(sp->p[0].x, sp->p[3].x) && feq(sp->p[1].x, sp->p[2].x) && feq(sp->p[0].y, sp->p[1].y) && feq(sp->p[2].y, sp->p[3].y))
2133 rect = 1;
2134 }
2135 if (rect)
2136 {
2137 fz_rect bounds;
2138
2139 bounds.x0 = bounds.x1 = sp->p[0].x;
2140 bounds.y0 = bounds.y1 = sp->p[0].y;
2141 for (i = 1; i < sp->count; i++)
2142 bounds = fz_include_point_in_rect(bounds, sp->p[i]);
2143 if (fz_is_valid_rect(sp->pending))
2144 add_vector(ctx, sp->page, sp->pending, sp->flags | FZ_STEXT_VECTOR_IS_RECTANGLE | FZ_STEXT_VECTOR_CONTINUES, sp->argb);
2145 sp->pending = bounds;
2146 return;
2147 }
2148
2149 for (i = 0; i < sp->count; i++)
2150 sp->leftovers = fz_include_point_in_rect(sp->leftovers, sp->p[i]);
2151 }
2152 }
2153
2154 static void
2155 split_move(fz_context *ctx, void *arg, float x, float y)
2156 {
2157 split_path_data *sp = (split_path_data *)arg;
2158 fz_point p = fz_transform_point_xy(x, y, sp->ctm);
2159
2160 maybe_rect(ctx, sp);
2161 sp->p[0] = p;
2162 sp->count = 1;
2163 }
2164
2165 static void
2166 split_line(fz_context *ctx, void *arg, float x, float y)
2167 {
2168 split_path_data *sp = (split_path_data *)arg;
2169 fz_point p = fz_transform_point_xy(x, y, sp->ctm);
2170 int i;
2171
2172 if (sp->count >= 0)
2173 {
2174 /* Check for lines to the same point. */
2175 if (feq(sp->p[sp->count-1].x, p.x) && feq(sp->p[sp->count-1].y, p.y))
2176 return;
2177 /* If we're still maybe a rect, just record the point. */
2178 if (sp->count < 4)
2179 {
2180 sp->p[sp->count++] = p;
2181 return;
2182 }
2183 /* Check for close line? */
2184 if (sp->count == 4)
2185 {
2186 if (feq(sp->p[0].x, p.x) && feq(sp->p[0].y, p.y))
2187 {
2188 /* We've just drawn a line back to the start point. */
2189 /* Needless saving of point, but it makes the logic
2190 * easier elsewhere. */
2191 sp->p[sp->count++] = p;
2192 return;
2193 }
2194 }
2195 /* We can no longer be a rect. Output the points we had saved. */
2196 for (i = 0; i < sp->count; i++)
2197 sp->leftovers = fz_include_point_in_rect(sp->leftovers, sp->p[i]);
2198 /* Remember we're not a rect. */
2199 sp->count = -1;
2200 }
2201 /* Roll this point into the non-rect bounds. */
2202 sp->leftovers = fz_include_point_in_rect(sp->leftovers, p);
2203 }
2204
2205 static void
2206 split_curve(fz_context *ctx, void *arg, float x1, float y1, float x2, float y2, float x3, float y3)
2207 {
2208 split_path_data *sp = (split_path_data *)arg;
2209 fz_point p1 = fz_transform_point_xy(x1, y1, sp->ctm);
2210 fz_point p2 = fz_transform_point_xy(x2, y2, sp->ctm);
2211 fz_point p3 = fz_transform_point_xy(x3, y3, sp->ctm);
2212 int i;
2213
2214 if (sp->count >= 0)
2215 {
2216 /* We can no longer be a rect. Output the points we had saved. */
2217 for (i = 0; i < sp->count; i++)
2218 sp->leftovers = fz_include_point_in_rect(sp->leftovers, sp->p[i]);
2219 /* Remember we're not a rect. */
2220 sp->count = -1;
2221 }
2222 /* Roll these points into the non-rect bounds. */
2223 sp->leftovers = fz_include_point_in_rect(sp->leftovers, p1);
2224 sp->leftovers = fz_include_point_in_rect(sp->leftovers, p2);
2225 sp->leftovers = fz_include_point_in_rect(sp->leftovers, p3);
2226 }
2227
2228 static void
2229 split_close(fz_context *ctx, void *arg)
2230 {
2231 split_path_data *sp = (split_path_data *)arg;
2232
2233 maybe_rect(ctx, sp);
2234 sp->count = 0;
2235 }
2236
2237
2238 static const
2239 fz_path_walker split_path_rects =
2240 {
2241 split_move,
2242 split_line,
2243 split_curve,
2244 split_close
2245 };
2246
2247 static void
2248 add_vectors_from_path(fz_context *ctx, fz_stext_page *page, const fz_path *path, fz_matrix ctm, fz_colorspace *cs, const float *color, float alpha, fz_color_params cp, int stroke)
2249 {
2250 int have_leftovers;
2251 split_path_data sp;
2252
2253 sp.ctm = ctm;
2254 sp.argb = hexrgba_from_color(ctx, cs, color, alpha);
2255 sp.flags = stroke ? FZ_STEXT_VECTOR_IS_STROKED : 0;
2256 sp.page = page;
2257 sp.count = 0;
2258 sp.leftovers = fz_empty_rect;
2259 sp.pending = fz_empty_rect;
2260 fz_walk_path(ctx, path, &split_path_rects, &sp);
2261
2262 have_leftovers = fz_is_valid_rect(sp.leftovers);
2263
2264 maybe_rect(ctx, &sp);
2265
2266 if (fz_is_valid_rect(sp.pending))
2267 add_vector(ctx, page, sp.pending, sp.flags | FZ_STEXT_VECTOR_IS_RECTANGLE | (have_leftovers ? FZ_STEXT_VECTOR_CONTINUES : 0), sp.argb);
2268 if (have_leftovers)
2269 add_vector(ctx, page, sp.leftovers, sp.flags, sp.argb);
2270 }
2271
2272 static void
2273 fz_stext_fill_path(fz_context *ctx, fz_device *dev, const fz_path *path, int even_odd, fz_matrix ctm, fz_colorspace *cs, const float *color, float alpha, fz_color_params cp)
2274 {
2275 fz_stext_device *tdev = (fz_stext_device*)dev;
2276 fz_stext_page *page = tdev->page;
2277 fz_rect path_bounds = fz_bound_path(ctx, path, NULL, ctm);
2278 fz_rect *bounds = actualtext_bounds(tdev);
2279
2280 /* If we're in an actualtext, then update the bounds to include this content. */
2281 if (bounds != NULL)
2282 *bounds = fz_union_rect(*bounds, path_bounds);
2283
2284 if (tdev->flags & FZ_STEXT_COLLECT_STYLES)
2285 check_for_strikeout(ctx, tdev, page, path, ctm);
2286
2287 if (tdev->flags & FZ_STEXT_COLLECT_VECTORS)
2288 add_vectors_from_path(ctx, page, path, ctm, cs, color, alpha, cp, 0);
2289 }
2290
2291 static void
2292 fz_stext_stroke_path(fz_context *ctx, fz_device *dev, const fz_path *path, const fz_stroke_state *ss, fz_matrix ctm, fz_colorspace *cs, const float *color, float alpha, fz_color_params cp)
2293 {
2294 fz_stext_device *tdev = (fz_stext_device*)dev;
2295 fz_stext_page *page = tdev->page;
2296 fz_rect path_bounds = fz_bound_path(ctx, path, ss, ctm);
2297 fz_rect *bounds = actualtext_bounds((fz_stext_device *)dev);
2298
2299 /* If we're in an actualtext, then update the bounds to include this content. */
2300 if (bounds != NULL)
2301 *bounds = fz_union_rect(*bounds, path_bounds);
2302
2303 if (tdev->flags & FZ_STEXT_COLLECT_STYLES)
2304 check_for_strikeout(ctx, tdev, page, path, ctm);
2305
2306 if (tdev->flags & FZ_STEXT_COLLECT_VECTORS)
2307 add_vectors_from_path(ctx, page, path, ctm, cs, color, alpha, cp, 1);
2308 }
2309
2310 static void
2311 new_stext_struct(fz_context *ctx, fz_stext_page *page, fz_stext_block *block, fz_structure standard, const char *raw)
2312 {
2313 fz_stext_struct *str;
2314 size_t z;
2315
2316 if (raw == NULL)
2317 raw = "";
2318 z = strlen(raw);
2319
2320 str = fz_pool_alloc(ctx, page->pool, sizeof(*str) + z);
2321 str->first_block = NULL;
2322 str->last_block = NULL;
2323 str->standard = standard;
2324 str->parent = page->last_struct;
2325 str->up = block;
2326 memcpy(str->raw, raw, z+1);
2327
2328 block->u.s.down = str;
2329 }
2330
2331 static void
2332 fz_stext_begin_structure(fz_context *ctx, fz_device *dev, fz_structure standard, const char *raw, int idx)
2333 {
2334 fz_stext_device *tdev = (fz_stext_device*)dev;
2335 fz_stext_page *page = tdev->page;
2336 fz_stext_block *block, *le, *gt, *newblock;
2337
2338 if (raw == NULL)
2339 raw = "";
2340
2341 /* Find a pointer to the last block. */
2342 if (page->last_block)
2343 {
2344 block = page->last_block;
2345 }
2346 else if (page->last_struct)
2347 {
2348 block = page->last_struct->last_block;
2349 }
2350 else
2351 {
2352 block = page->first_block;
2353 }
2354
2355 /* So block is somewhere in the content chain. Let's try and find:
2356 * le = the struct node <= idx before block in the content chain.
2357 * ge = the struct node >= idx after block in the content chain.
2358 * Search backwards to start with.
2359 */
2360 gt = NULL;
2361 le = block;
2362 while (le)
2363 {
2364 if (le->type == FZ_STEXT_BLOCK_STRUCT)
2365 {
2366 if (le->u.s.index > idx)
2367 gt = le;
2368 if (le->u.s.index <= idx)
2369 break;
2370 }
2371 le = le->prev;
2372 }
2373 /* The following loop copes with finding gt (the smallest block with an index higher
2374 * than we want) if we haven't found it already. The while loop in here was designed
2375 * to cope with 'block' being in the middle of a list. In fact, the way the code is
2376 * currently, block will always be at the end of a list, so the while won't do anything.
2377 * But I'm loathe to remove it in case we ever change this code to start from wherever
2378 * we did the last insertion. */
2379 if (gt == NULL)
2380 {
2381 gt = block;
2382 while (gt)
2383 {
2384 if (gt->type == FZ_STEXT_BLOCK_STRUCT)
2385 {
2386 if (gt->u.s.index <= idx)
2387 le = gt;
2388 if (gt->u.s.index >= idx)
2389 break;
2390 }
2391 block = gt;
2392 gt = gt->next;
2393 }
2394 }
2395
2396 if (le && le->u.s.index == idx)
2397 {
2398 /* We want to move down into the le block. Does it have a struct
2399 * attached yet? */
2400 if (le->u.s.down == NULL)
2401 {
2402 /* No. We need to create a new struct node. */
2403 new_stext_struct(ctx, page, le, standard, raw);
2404 }
2405 else if (le->u.s.down->standard != standard || strcmp(raw, le->u.s.down->raw) != 0)
2406 {
2407 /* Yes, but it doesn't match the one we expect! */
2408 fz_warn(ctx, "Mismatched structure type!");
2409 }
2410 page->last_struct = le->u.s.down;
2411 page->last_block = le->u.s.down->last_block;
2412
2413 return;
2414 }
2415
2416 /* We are going to need to create a new block. Create a complete unlinked one here. */
2417 newblock = fz_pool_alloc(ctx, page->pool, sizeof *page->first_block);
2418 newblock->bbox = fz_empty_rect;
2419 newblock->prev = NULL;
2420 newblock->next = NULL;
2421 newblock->type = FZ_STEXT_BLOCK_STRUCT;
2422 newblock->u.s.index = idx;
2423 newblock->u.s.down = NULL;
2424 /* If this throws, we leak newblock but it's within the pool, so it doesn't matter. */
2425 new_stext_struct(ctx, page, newblock, standard, raw);
2426
2427 /* So now we just need to link it in somewhere. */
2428 if (gt)
2429 {
2430 /* Link it in before gt. */
2431 newblock->prev = gt->prev;
2432 if (gt->prev)
2433 gt->prev->next = newblock;
2434 gt->prev = newblock;
2435 newblock->next = gt;
2436 }
2437 else if (block)
2438 {
2439 /* Link it in at the end of the list (i.e. after 'block') */
2440 newblock->prev = block;
2441 block->next = newblock;
2442 }
2443 else if (page->last_struct)
2444 {
2445 /* We have no blocks at all at this level. */
2446 page->last_struct->first_block = newblock;
2447 page->last_struct->last_block = newblock;
2448 }
2449 else
2450 {
2451 /* We have no blocks at ANY level. */
2452 page->first_block = newblock;
2453 }
2454 /* Wherever we linked it in, that's where we want to continue adding content. */
2455 page->last_struct = newblock->u.s.down;
2456 page->last_block = NULL;
2457 }
2458
2459 static void
2460 fz_stext_end_structure(fz_context *ctx, fz_device *dev)
2461 {
2462 fz_stext_device *tdev = (fz_stext_device*)dev;
2463 fz_stext_page *page = tdev->page;
2464 fz_stext_struct *str = page->last_struct;
2465
2466 if (str == NULL)
2467 {
2468 fz_warn(ctx, "Structure out of sync");
2469 return;
2470 }
2471
2472 page->last_struct = str->parent;
2473 if (page->last_struct == NULL)
2474 {
2475 page->last_block = page->first_block;
2476 /* Yuck */
2477 while (page->last_block->next)
2478 page->last_block = page->last_block->next;
2479 }
2480 else
2481 {
2482 page->last_block = page->last_struct->last_block;
2483 }
2484 }
2485
2486 fz_device *
2487 fz_new_stext_device(fz_context *ctx, fz_stext_page *page, const fz_stext_options *opts)
2488 {
2489 fz_stext_device *dev = fz_new_derived_device(ctx, fz_stext_device);
2490
2491 dev->super.close_device = fz_stext_close_device;
2492 dev->super.drop_device = fz_stext_drop_device;
2493
2494 dev->super.fill_text = fz_stext_fill_text;
2495 dev->super.stroke_text = fz_stext_stroke_text;
2496 dev->super.clip_text = fz_stext_clip_text;
2497 dev->super.clip_stroke_text = fz_stext_clip_stroke_text;
2498 dev->super.ignore_text = fz_stext_ignore_text;
2499 dev->super.begin_metatext = fz_stext_begin_metatext;
2500 dev->super.end_metatext = fz_stext_end_metatext;
2501
2502 dev->super.fill_shade = fz_stext_fill_shade;
2503 dev->super.fill_image = fz_stext_fill_image;
2504 dev->super.fill_image_mask = fz_stext_fill_image_mask;
2505
2506 if (opts)
2507 {
2508 dev->flags = opts->flags;
2509 if (opts->flags & FZ_STEXT_COLLECT_STRUCTURE)
2510 {
2511 dev->super.begin_structure = fz_stext_begin_structure;
2512 dev->super.end_structure = fz_stext_end_structure;
2513 }
2514 if (opts->flags & (FZ_STEXT_COLLECT_VECTORS | FZ_STEXT_COLLECT_STYLES))
2515 {
2516 dev->super.fill_path = fz_stext_fill_path;
2517 dev->super.stroke_path = fz_stext_stroke_path;
2518 }
2519 }
2520 dev->page = page;
2521 dev->pen.x = 0;
2522 dev->pen.y = 0;
2523 dev->trm = fz_identity;
2524 dev->lastchar = ' ';
2525 dev->lasttext = NULL;
2526 dev->lastbidi = 0;
2527 dev->last_was_fake_bold = 1;
2528 if (opts)
2529 dev->opts = *opts;
2530
2531 if ((dev->flags & FZ_STEXT_PRESERVE_IMAGES) == 0)
2532 dev->super.hints |= FZ_DONT_DECODE_IMAGES;
2533
2534 dev->rect_max = 0;
2535 dev->rect_len = 0;
2536 dev->rects = NULL;
2537
2538 return (fz_device*)dev;
2539 }