Mercurial > hgrepos > Python2 > PyMuPDF
comparison mupdf-source/source/fitz/stext-device.c @ 2:b50eed0cc0ef upstream
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
The directory name has changed: no version number in the expanded directory now.
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Mon, 15 Sep 2025 11:43:07 +0200 |
| parents | |
| children | aa33339d6b8a |
comparison
equal
deleted
inserted
replaced
| 1:1d09e1dec1d9 | 2:b50eed0cc0ef |
|---|---|
| 1 // Copyright (C) 2004-2025 Artifex Software, Inc. | |
| 2 // | |
| 3 // This file is part of MuPDF. | |
| 4 // | |
| 5 // MuPDF is free software: you can redistribute it and/or modify it under the | |
| 6 // terms of the GNU Affero General Public License as published by the Free | |
| 7 // Software Foundation, either version 3 of the License, or (at your option) | |
| 8 // any later version. | |
| 9 // | |
| 10 // MuPDF is distributed in the hope that it will be useful, but WITHOUT ANY | |
| 11 // WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS | |
| 12 // FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more | |
| 13 // details. | |
| 14 // You should have received a copy of the GNU Affero General Public License | |
| 15 // along with MuPDF. If not, see <https://www.gnu.org/licenses/agpl-3.0.en.html> | |
| 16 // | |
| 17 // Alternative licensing terms are available from the licensor. | |
| 18 // For commercial licensing, see <https://www.artifex.com/> or contact | |
| 19 // Artifex Software, Inc., 39 Mesa Street, Suite 108A, San Francisco, | |
| 20 // CA 94129, USA, for further information. | |
| 21 | |
| 22 #include "mupdf/fitz.h" | |
| 23 | |
| 24 #include "glyphbox.h" | |
| 25 | |
| 26 #include <float.h> | |
| 27 #include <string.h> | |
| 28 | |
| 29 /* Simple layout structure */ | |
| 30 | |
| 31 fz_layout_block *fz_new_layout(fz_context *ctx) | |
| 32 { | |
| 33 fz_pool *pool = fz_new_pool(ctx); | |
| 34 fz_layout_block *block; | |
| 35 fz_try(ctx) | |
| 36 { | |
| 37 block = fz_pool_alloc(ctx, pool, sizeof (fz_layout_block)); | |
| 38 block->pool = pool; | |
| 39 block->head = NULL; | |
| 40 block->tailp = &block->head; | |
| 41 } | |
| 42 fz_catch(ctx) | |
| 43 { | |
| 44 fz_drop_pool(ctx, pool); | |
| 45 fz_rethrow(ctx); | |
| 46 } | |
| 47 return block; | |
| 48 } | |
| 49 | |
| 50 void fz_drop_layout(fz_context *ctx, fz_layout_block *block) | |
| 51 { | |
| 52 if (block) | |
| 53 fz_drop_pool(ctx, block->pool); | |
| 54 } | |
| 55 | |
| 56 void fz_add_layout_line(fz_context *ctx, fz_layout_block *block, float x, float y, float font_size, const char *p) | |
| 57 { | |
| 58 fz_layout_line *line = fz_pool_alloc(ctx, block->pool, sizeof (fz_layout_line)); | |
| 59 line->x = x; | |
| 60 line->y = y; | |
| 61 line->font_size = font_size; | |
| 62 line->p = p; | |
| 63 line->text = NULL; | |
| 64 line->next = NULL; | |
| 65 *block->tailp = line; | |
| 66 block->tailp = &line->next; | |
| 67 block->text_tailp = &line->text; | |
| 68 } | |
| 69 | |
| 70 void fz_add_layout_char(fz_context *ctx, fz_layout_block *block, float x, float advance, const char *p) | |
| 71 { | |
| 72 fz_layout_char *ch = fz_pool_alloc(ctx, block->pool, sizeof (fz_layout_char)); | |
| 73 ch->x = x; | |
| 74 ch->advance = advance; | |
| 75 ch->p = p; | |
| 76 ch->next = NULL; | |
| 77 *block->text_tailp = ch; | |
| 78 block->text_tailp = &ch->next; | |
| 79 } | |
| 80 | |
| 81 /* Extract text into blocks and lines. */ | |
| 82 | |
/*
 * Thresholds used by fz_add_stext_char_imp() when deciding how a new
 * character relates to the previous one. All of them are compared against
 * distances that have been divided by the text size (the expansion of the
 * text matrix), so they are multiples of the text size.
 */
/* Baseline offset above which a new block (paragraph) is started. */
#define PARAGRAPH_DIST 1.5f
/* Gap along the baseline below which the motion is ignored. */
#define SPACE_DIST 0.15f
/* Largest gap along the baseline that still keeps the character on the same line. */
#define SPACE_MAX_DIST 0.8f
/* Largest offset from the previous baseline that still counts as the same line. */
#define BASE_MAX_DIST 0.8f
/* A repeat of the last character ending this close to the pen is dropped as fake bold. */
#define FAKE_BOLD_MAX_DIST 0.1f
| 88 | |
| 89 /* We keep a stack of the different metatexts that apply at any | |
| 90 * given point (normally none!). Whenever we get some content | |
| 91 * with a metatext in force, we really want to update the bounds | |
| 92 * for that metatext. But running along the whole list each time | |
| 93 * would be painful. So we just update the bounds for dev->metatext | |
| 94 * and rely on metatext_bounds() propagating it upwards 'just in | |
| 95 * time' for us to use metatexts other than the latest one. This | |
| 96 * also means we need to propagate bounds upwards when we pop | |
| 97 * a metatext. | |
| 98 * | |
| 99 * Why do we need bounds at all? Well, suppose we get: | |
| 100 * /Span <</ActualText (c) >> BDC /Im0 Do EMC | |
| 101 * Then where on the page do we put 'c' ? By collecting the | |
| 102 * bounds, we can place 'c' wherever the image was. | |
| 103 */ | |
/* One entry in the stack of metatexts currently in force. */
typedef struct metatext_t
{
	fz_metatext type; /* Kind of metatext; find_actualtext() looks for FZ_METATEXT_ACTUALTEXT. */
	char *text; /* presumably the text associated with this metatext - TODO confirm against the code that pushes entries */
	fz_rect bounds; /* Bounds collected for this entry; merged into 'prev' by metatext_bounds(). */
	struct metatext_t *prev; /* Next entry down the stack, or NULL at the bottom. */
} metatext_t;
| 111 | |
/*
 * One entry in the device's list of 'rects' (see fz_stext_device.rects).
 * NOTE(review): the code that fills these in is outside this part of the
 * file; presumably 'from' and 'to' are the end points of a line-like
 * rectangle and 'thickness' its width - confirm.
 */
typedef struct
{
	fz_point from;
	fz_point to;
	float thickness;
} rect_details;
| 118 | |
/* State of the structured text device while it collects a page. */
typedef struct
{
	fz_device super;
	fz_stext_page *page; /* The page that characters and blocks are added to. */
	int id;
	/* pen: end point of the last character added.
	 * start: start point of the first character of the current line. */
	fz_point pen, start;
	fz_point lag_pen; /* Start point of the last character added. */
	fz_matrix trm; /* Matrix of the last character added. */
	int new_obj; /* Cleared after each character is added; tested when spotting indented paragraphs. */
	int lastchar; /* Code of the last character added. */
	int lastbidi; /* bidi value of the last character added. */
	int flags; /* FZ_STEXT_* option bits tested while adding characters. */
	int color; /* Stored as the argb value of each character added. */
	int last_was_fake_bold; /* Set when the last character was absorbed as a fake bold overprint. */
	const fz_text *lasttext;
	fz_stext_options opts; /* opts.flags is tested for FZ_STEXT_COLLECT_STYLES. */

	metatext_t *metatext; /* Top of the metatext stack, or NULL if none is in force. */

	/* Store the last values we saw. We need this for flushing the actualtext. */
	struct
	{
		int valid;
		int clipped;
		fz_matrix trm;
		int wmode;
		int bidi_level;
		fz_font *font;
		int flags;
	} last;

	/* The list of 'rects' seen during processing (if we're collecting styles). */
	int rect_max;
	int rect_len;
	rect_details *rects;
} fz_stext_device;
| 155 | |
/*
 * Human readable summary of the text output options: a heading line, then
 * one tab-indented line per option, then a blank line.
 */
const char *fz_stext_options_usage =
	"Text output options:\n"
	"\tpreserve-images: keep images in output\n"
	"\tpreserve-ligatures: do not expand ligatures into constituent characters\n"
	"\tpreserve-spans: do not merge spans on the same line\n"
	"\tpreserve-whitespace: do not convert all whitespace into space characters\n"
	"\tinhibit-spaces: don't add spaces between gaps in the text\n"
	"\tparagraph-break: break blocks at paragraph boundaries\n"
	"\tdehyphenate: attempt to join up hyphenated words\n"
	"\tignore-actualtext: do not apply ActualText replacements\n"
	"\tuse-cid-for-unknown-unicode: use character code if unicode mapping fails\n"
	"\tuse-gid-for-unknown-unicode: use glyph index if unicode mapping fails\n"
	"\taccurate-bboxes: calculate char bboxes from the outlines\n"
	"\taccurate-ascenders: calculate ascender/descender from font glyphs\n"
	"\taccurate-side-bearings: expand char bboxes to completely include width of glyphs\n"
	"\tcollect-styles: attempt to detect text features (fake bold, strikeout, underlined etc)\n"
	"\tclip: do not include text that is completely clipped\n"
	"\tclip-rect=x0:y0:x1:y1 specify clipping rectangle within which to collect content\n"
	"\tstructured: collect structure markup\n"
	"\tvectors: include vector bboxes in output\n"
	"\tsegment: attempt to segment the page\n"
	"\ttable-hunt: hunt for tables within a (segmented) page\n"
	"\n";
| 179 | |
| 180 /* Find the current actualtext, if any. Will abort if dev == NULL. */ | |
| 181 static metatext_t * | |
| 182 find_actualtext(fz_stext_device *dev) | |
| 183 { | |
| 184 metatext_t *mt = dev->metatext; | |
| 185 | |
| 186 while (mt && mt->type != FZ_METATEXT_ACTUALTEXT) | |
| 187 mt = mt->prev; | |
| 188 | |
| 189 return mt; | |
| 190 } | |
| 191 | |
| 192 /* Find the bounds of the given metatext. Will abort if mt or | |
| 193 * dev are NULL. */ | |
| 194 static fz_rect * | |
| 195 metatext_bounds(metatext_t *mt, fz_stext_device *dev) | |
| 196 { | |
| 197 metatext_t *mt2 = dev->metatext; | |
| 198 | |
| 199 while (mt2 != mt) | |
| 200 { | |
| 201 mt2->prev->bounds = fz_union_rect(mt2->prev->bounds, mt2->bounds); | |
| 202 mt2 = mt2->prev; | |
| 203 } | |
| 204 | |
| 205 return &mt->bounds; | |
| 206 } | |
| 207 | |
| 208 /* Find the bounds of the current actualtext, or NULL if there | |
| 209 * isn't one. Will abort if dev is NULL. */ | |
| 210 static fz_rect * | |
| 211 actualtext_bounds(fz_stext_device *dev) | |
| 212 { | |
| 213 metatext_t *mt = find_actualtext(dev); | |
| 214 | |
| 215 if (mt == NULL) | |
| 216 return NULL; | |
| 217 | |
| 218 return metatext_bounds(mt, dev); | |
| 219 } | |
| 220 | |
| 221 fz_stext_page * | |
| 222 fz_new_stext_page(fz_context *ctx, fz_rect mediabox) | |
| 223 { | |
| 224 fz_pool *pool = fz_new_pool(ctx); | |
| 225 fz_stext_page *page = NULL; | |
| 226 fz_try(ctx) | |
| 227 { | |
| 228 page = fz_pool_alloc(ctx, pool, sizeof(*page)); | |
| 229 page->pool = pool; | |
| 230 page->mediabox = mediabox; | |
| 231 page->first_block = NULL; | |
| 232 page->last_block = NULL; | |
| 233 } | |
| 234 fz_catch(ctx) | |
| 235 { | |
| 236 fz_drop_pool(ctx, pool); | |
| 237 fz_rethrow(ctx); | |
| 238 } | |
| 239 return page; | |
| 240 } | |
| 241 | |
| 242 static void | |
| 243 drop_run(fz_context *ctx, fz_stext_block *block) | |
| 244 { | |
| 245 fz_stext_line *line; | |
| 246 fz_stext_char *ch; | |
| 247 while (block) | |
| 248 { | |
| 249 switch (block->type) | |
| 250 { | |
| 251 case FZ_STEXT_BLOCK_IMAGE: | |
| 252 fz_drop_image(ctx, block->u.i.image); | |
| 253 break; | |
| 254 case FZ_STEXT_BLOCK_TEXT: | |
| 255 for (line = block->u.t.first_line; line; line = line->next) | |
| 256 for (ch = line->first_char; ch; ch = ch->next) | |
| 257 fz_drop_font(ctx, ch->font); | |
| 258 break; | |
| 259 case FZ_STEXT_BLOCK_STRUCT: | |
| 260 drop_run(ctx, block->u.s.down->first_block); | |
| 261 break; | |
| 262 default: | |
| 263 break; | |
| 264 } | |
| 265 block = block->next; | |
| 266 } | |
| 267 } | |
| 268 | |
| 269 void | |
| 270 fz_drop_stext_page(fz_context *ctx, fz_stext_page *page) | |
| 271 { | |
| 272 if (page) | |
| 273 { | |
| 274 drop_run(ctx, page->first_block); | |
| 275 fz_drop_pool(ctx, page->pool); | |
| 276 } | |
| 277 } | |
| 278 | |
| 279 /* | |
| 280 * This adds a new block at the end of the page. This should not be used | |
| 281 * to add 'struct' blocks to the page as those have to be added internally, | |
| 282 * with more complicated pointer setup. | |
| 283 */ | |
| 284 static fz_stext_block * | |
| 285 add_block_to_page(fz_context *ctx, fz_stext_page *page) | |
| 286 { | |
| 287 fz_stext_block *block = fz_pool_alloc(ctx, page->pool, sizeof *page->first_block); | |
| 288 block->bbox = fz_empty_rect; /* Fixes bug 703267. */ | |
| 289 block->prev = page->last_block; | |
| 290 if (page->last_struct) | |
| 291 { | |
| 292 if (page->last_struct->last_block) | |
| 293 { | |
| 294 block->prev = page->last_struct->last_block; | |
| 295 block->prev->next = block; | |
| 296 page->last_struct->last_block = block; | |
| 297 } | |
| 298 else | |
| 299 page->last_struct->last_block = page->last_struct->first_block = block; | |
| 300 } | |
| 301 else if (!page->last_block) | |
| 302 { | |
| 303 page->last_block = block; | |
| 304 if (!page->first_block) | |
| 305 page->first_block = block; | |
| 306 } | |
| 307 else | |
| 308 { | |
| 309 page->last_block->next = block; | |
| 310 page->last_block = block; | |
| 311 } | |
| 312 return block; | |
| 313 } | |
| 314 | |
| 315 static fz_stext_block * | |
| 316 add_text_block_to_page(fz_context *ctx, fz_stext_page *page) | |
| 317 { | |
| 318 fz_stext_block *block = add_block_to_page(ctx, page); | |
| 319 block->type = FZ_STEXT_BLOCK_TEXT; | |
| 320 return block; | |
| 321 } | |
| 322 | |
| 323 static fz_stext_block * | |
| 324 add_image_block_to_page(fz_context *ctx, fz_stext_page *page, fz_matrix ctm, fz_image *image) | |
| 325 { | |
| 326 fz_stext_block *block = add_block_to_page(ctx, page); | |
| 327 block->type = FZ_STEXT_BLOCK_IMAGE; | |
| 328 block->u.i.transform = ctm; | |
| 329 block->u.i.image = fz_keep_image(ctx, image); | |
| 330 block->bbox = fz_transform_rect(fz_unit_rect, ctm); | |
| 331 return block; | |
| 332 } | |
| 333 | |
| 334 static fz_stext_line * | |
| 335 add_line_to_block(fz_context *ctx, fz_stext_page *page, fz_stext_block *block, const fz_point *dir, int wmode, int bidi) | |
| 336 { | |
| 337 fz_stext_line *line = fz_pool_alloc(ctx, page->pool, sizeof *block->u.t.first_line); | |
| 338 line->prev = block->u.t.last_line; | |
| 339 if (!block->u.t.first_line) | |
| 340 block->u.t.first_line = block->u.t.last_line = line; | |
| 341 else | |
| 342 { | |
| 343 block->u.t.last_line->next = line; | |
| 344 block->u.t.last_line = line; | |
| 345 } | |
| 346 | |
| 347 line->dir = *dir; | |
| 348 line->wmode = wmode; | |
| 349 | |
| 350 return line; | |
| 351 } | |
| 352 | |
/*
 * Special values for the 'glyph' argument of add_char_to_line(), used in
 * place of a real glyph index.
 */
#define NON_ACCURATE_GLYPH_ADDED_SPACE (-2)
#define NON_ACCURATE_GLYPH (-1)

/*
 * Allocate a character from the page's pool, append it to 'line' and fill
 * in its fields and quad.
 *
 * trm:       matrix used to transform the ascender/descender vectors.
 * font:      the character keeps its own reference to this font.
 * size:      stored as the character's size.
 * c:         character code to store.
 * glyph:     glyph index, or one of the NON_ACCURATE_* values above.
 * p, q:      start and end points of the character; the quad is built by
 *            offsetting these with the transformed vectors.
 * bidi:      stored as the character's bidi value.
 * color:     stored as the character's argb value.
 * synthetic: nonzero adds FZ_STEXT_SYNTHETIC to the character's flags.
 * flags:     initial flags for the character; FZ_STEXT_BOLD is added when
 *            the font is flagged bold.
 * dev_flags: device option bits; only FZ_STEXT_ACCURATE_SIDE_BEARINGS is
 *            tested here.
 *
 * Returns the new character.
 */
static fz_stext_char *
add_char_to_line(fz_context *ctx, fz_stext_page *page, fz_stext_line *line, fz_matrix trm, fz_font *font, float size, int c, int glyph, fz_point *p, fz_point *q, int bidi, int color, int synthetic, int flags, int dev_flags)
{
	fz_stext_char *ch = fz_pool_alloc(ctx, page->pool, sizeof *line->first_char);
	fz_point a, d;

	if (!line->first_char)
		line->first_char = line->last_char = ch;
	else
	{
		line->last_char->next = ch;
		line->last_char = ch;
	}

	ch->c = c;
	ch->argb = color;
	ch->bidi = bidi;
	ch->origin = *p;
	ch->size = size;
	ch->font = fz_keep_font(ctx, font);
	ch->flags = flags | (synthetic ? FZ_STEXT_SYNTHETIC : 0);
	if (font->flags.is_bold)
		ch->flags |= FZ_STEXT_BOLD;

	/* a and d are the offsets from p/q to the upper and lower quad corners,
	 * built here before transformation by trm. */
	if (line->wmode == 0)
	{
		fz_rect bounds;
		int bounded = 0;
		a.x = 0;
		d.x = 0;
		if (glyph == NON_ACCURATE_GLYPH_ADDED_SPACE)
		{
			/* Added space, in accurate mode. */
			a.y = d.y = 0;
		}
		else if (glyph == NON_ACCURATE_GLYPH)
		{
			/* Non accurate mode. */
			a.y = fz_font_ascender(ctx, font);
			d.y = fz_font_descender(ctx, font);
		}
		else
		{
			/* Any glyph in accurate mode */
			bounds = fz_bound_glyph(ctx, font, glyph, fz_identity);
			bounded = 1;
			a.y = bounds.y1;
			d.y = bounds.y0;
		}
		if (dev_flags & FZ_STEXT_ACCURATE_SIDE_BEARINGS)
		{
			/* NOTE(review): when glyph is one of the NON_ACCURATE_* values
			 * this calls fz_bound_glyph with a negative glyph index -
			 * confirm that is acceptable. */
			if (!bounded)
				bounds = fz_bound_glyph(ctx, font, glyph, fz_identity);
			if (a.x > bounds.x0)
				a.x = bounds.x0;
			/* NOTE(review): this compares and assigns the vertical component
			 * d.y against the horizontal extent bounds.x1, whereas the test
			 * above pairs a.x with bounds.x0 - confirm this is intended. */
			if (d.y < bounds.x1)
				d.y = bounds.x1;
		}
	}
	else
	{
		a.x = 1;
		d.x = 0;
		a.y = 0;
		d.y = 0;
	}
	a = fz_transform_vector(a, trm);
	d = fz_transform_vector(d, trm);

	ch->quad.ll = fz_make_point(p->x + d.x, p->y + d.y);
	ch->quad.ul = fz_make_point(p->x + a.x, p->y + a.y);
	ch->quad.lr = fz_make_point(q->x + d.x, q->y + d.y);
	ch->quad.ur = fz_make_point(q->x + a.x, q->y + a.y);

	return ch;
}
| 432 | |
| 433 static void | |
| 434 remove_last_char(fz_context *ctx, fz_stext_line *line) | |
| 435 { | |
| 436 if (line && line->first_char) | |
| 437 { | |
| 438 fz_stext_char *prev = NULL; | |
| 439 fz_stext_char *ch = line->first_char; | |
| 440 while (ch->next) | |
| 441 { | |
| 442 prev = ch; | |
| 443 ch = ch->next; | |
| 444 } | |
| 445 if (prev) | |
| 446 { | |
| 447 /* The characters are pool allocated, so we don't actually leak the removed node. */ | |
| 448 /* We do need to drop the char's font reference though. */ | |
| 449 fz_drop_font(ctx, prev->next->font); | |
| 450 line->last_char = prev; | |
| 451 line->last_char->next = NULL; | |
| 452 } | |
| 453 } | |
| 454 } | |
| 455 | |
| 456 static fz_stext_char *reverse_bidi_span(fz_stext_char *curr, fz_stext_char *tail) | |
| 457 { | |
| 458 fz_stext_char *prev, *next; | |
| 459 prev = tail; | |
| 460 while (curr != tail) | |
| 461 { | |
| 462 next = curr->next; | |
| 463 curr->next = prev; | |
| 464 prev = curr; | |
| 465 curr = next; | |
| 466 } | |
| 467 return prev; | |
| 468 } | |
| 469 | |
| 470 static void reverse_bidi_line(fz_stext_line *line) | |
| 471 { | |
| 472 fz_stext_char *a, *b, **prev; | |
| 473 prev = &line->first_char; | |
| 474 for (a = line->first_char; a; a = a->next) | |
| 475 { | |
| 476 if (a->bidi) | |
| 477 { | |
| 478 b = a; | |
| 479 while (b->next && b->next->bidi) | |
| 480 b = b->next; | |
| 481 if (a != b) | |
| 482 *prev = reverse_bidi_span(a, b->next); | |
| 483 } | |
| 484 prev = &a->next; | |
| 485 line->last_char = a; | |
| 486 } | |
| 487 } | |
| 488 | |
/* Return 1 if 'c' is one of the hyphen characters listed below, else 0. */
static int is_hyphen(int c)
{
	switch (c)
	{
	case '-':    /* hyphen-minus */
	case 0xAD:   /* soft hyphen */
	case 0x2010: /* hyphen */
	case 0x2011: /* non-breaking hyphen */
		return 1;
	default:
		return 0;
	}
}
| 494 | |
| 495 static float | |
| 496 vec_dot(const fz_point *a, const fz_point *b) | |
| 497 { | |
| 498 return a->x * b->x + a->y * b->y; | |
| 499 } | |
| 500 | |
/*
 * Decide whether a synthetic space may be inserted after 'lastchar'.
 *
 * Returns 0 if 'lastchar' is already a space. Otherwise returns 1 for
 * codes below 0x700 (basic latin, greek, cyrillic, hebrew, arabic) and for
 * codes in 0x2000..0x20CF (general punctuation, superscripts and
 * subscripts, and currency symbols), and 0 for anything else.
 */
static int may_add_space(int lastchar)
{
	if (lastchar == ' ')
		return 0;

	if (lastchar < 0x700)
		return 1;

	return lastchar >= 0x2000 && lastchar <= 0x20CF;
}
| 510 | |
/* Two coordinates count as close when they differ by less than
 * size / FAKEBOLD_THRESHOLD_RECIP. */
#define FAKEBOLD_THRESHOLD_RECIP 10

/*
 * Return nonzero if 'a' and 'b' differ by less than a tenth of 'size'.
 * With size <= 0 the result is always 0.
 */
static int
close(float a, float b, float size)
{
	float gap = a - b;

	return FAKEBOLD_THRESHOLD_RECIP * (gap < 0 ? -gap : gap) < size;
}
| 522 | |
| 523 static int | |
| 524 font_equiv(fz_context *ctx, fz_font *f, fz_font *g) | |
| 525 { | |
| 526 unsigned char fdigest[16]; | |
| 527 unsigned char gdigest[16]; | |
| 528 | |
| 529 if (f == g) | |
| 530 return 1; | |
| 531 | |
| 532 if (strcmp(f->name, g->name) != 0) | |
| 533 return 0; | |
| 534 | |
| 535 fz_font_digest(ctx, f, fdigest); | |
| 536 fz_font_digest(ctx, g, gdigest); | |
| 537 | |
| 538 return (memcmp(fdigest, gdigest, 16) == 0); | |
| 539 } | |
| 540 | |
/*
 * Look for a character already on the page that the new character
 * overprints, which is how 'fake bold' text is commonly produced.
 *
 * The block chain starting at 'block' is searched, recursing into struct
 * blocks. A stored character matches when it has the same code 'c', its
 * origin is close() to 'p' in both x and y for the given 'size', and its
 * font is font_equiv() to 'font'.
 *
 * On a match, 'flags' is merged into the stored character's flags. Then:
 *  - if the stored character was filled only and the new one is stroked
 *    only, 1 is returned without marking it bold;
 *  - if 'c' is a space, it is marked bold and 1 is returned only when the
 *    previous or next character on the line is already bold; otherwise the
 *    search continues;
 *  - for any other character, it is marked bold and 1 is returned.
 *
 * Returns 0 if no match was accepted.
 */
static int
check_for_fake_bold(fz_context *ctx, fz_stext_block *block, fz_font *font, int c, fz_point p, float size, int flags)
{
	fz_stext_line *line;
	fz_stext_char *ch;

	for (; block != NULL; block = block->next)
	{
		if (block->type == FZ_STEXT_BLOCK_STRUCT)
		{
			if (block->u.s.down != NULL && check_for_fake_bold(ctx, block->u.s.down->first_block, font, c, p, size, flags))
				return 1;
		}
		else if (block->type == FZ_STEXT_BLOCK_TEXT)
		{
			for (line = block->u.t.first_line; line != NULL; line = line->next)
			{
				/* pr trails ch by one character within the line. */
				fz_stext_char *pr = NULL;
				for (ch = line->first_char; ch != NULL; ch = ch->next)
				{
					/* Not perfect, but it'll do! */
					if (ch->c == c && close(ch->origin.x, p.x, size) && close(ch->origin.y, p.y, size) && font_equiv(ctx, ch->font, font))
					{
						/* If we were filled before, and we are stroking now... */
						if ((ch->flags & (FZ_STEXT_FILLED | FZ_STEXT_STROKED)) == FZ_STEXT_FILLED &&
							(flags & (FZ_STEXT_FILLED | FZ_STEXT_STROKED)) == FZ_STEXT_STROKED)
						{
							/* Update this to be filled + stroked, but don't specifically mark it as fake bold. */
							ch->flags |= flags;
							return 1;
						}
						/* Overlaying spaces is tricksy. How can that count as boldening when it doesn't mark? We only accept these
						 * as boldening if either the char before, or the char after were also boldened. */
						ch->flags |= flags;

						if (c == ' ')
						{
							if ((pr && (pr->flags & FZ_STEXT_BOLD) != 0) ||
								(ch->next && (ch->next->flags & FZ_STEXT_BOLD) != 0))
							{
								/* OK, we can be bold. */
								ch->flags |= FZ_STEXT_BOLD;
								return 1;
							}
							/* Ignore this and keep going */
						}
						else
						{
							ch->flags |= FZ_STEXT_BOLD;
							return 1;
						}
					}
					pr = ch;
				}
			}
		}
	}

	return 0;
}
| 601 | |
/*
 * Add one character to the page, deciding whether it continues the current
 * line, starts a new line, or starts a new block.
 *
 * font:           font of the character.
 * c:              character code to store.
 * glyph:          glyph index; a negative value marks a character without a
 *                 glyph of its own (fz_add_stext_char passes -1 for the
 *                 extra letters of an expanded ligature).
 * trm:            matrix of the character; e/f give its origin and its
 *                 expansion is used as the text size.
 * adv:            advance of the character along the writing direction.
 * wmode:          0 for horizontal writing, otherwise vertical.
 * bidi:           only bit 0 (RTL) is kept from the caller's value.
 * force_new_line: nonzero forces the character onto a new line.
 * flags:          passed through to the stored character, and used when
 *                 matching fake bold overprints.
 *
 * On the normal exit path the device's lastchar, lastbidi, lag_pen, pen,
 * new_obj and trm are updated. Two early returns bypass that: a no-glyph
 * character appended to an existing line updates only lastbidi/lastchar,
 * and a character dropped by the cheap fake bold test updates nothing.
 */
static void
fz_add_stext_char_imp(fz_context *ctx, fz_stext_device *dev, fz_font *font, int c, int glyph, fz_matrix trm, float adv, int wmode, int bidi, int force_new_line, int flags)
{
	fz_stext_page *page = dev->page;
	fz_stext_block *cur_block;
	fz_stext_line *cur_line;

	int new_para = 0;
	int new_line = 1;
	int add_space = 0;
	fz_point dir, ndir, p, q;
	float size;
	fz_point delta;
	float spacing = 0;
	float base_offset = 0;
	float dist;

	/* Preserve RTL-ness only (and ignore level) so we can use bit 2 as "visual" tag for reordering pass. */
	bidi = bidi & 1;

	/* dir = direction vector for motion. ndir = normalised(dir) */
	if (wmode == 0)
	{
		dir.x = 1;
		dir.y = 0;
	}
	else
	{
		dir.x = 0;
		dir.y = -1;
	}
	dir = fz_transform_vector(dir, trm);
	ndir = fz_normalize_vector(dir);

	size = fz_matrix_expansion(trm);

	/* We need to identify where glyphs 'start' (p) and 'stop' (q).
	 * Each glyph holds its 'start' position, and the next glyph in the
	 * span (or span->max if there is no next glyph) holds its 'end'
	 * position.
	 *
	 * For both horizontal and vertical motion, trm->{e,f} gives the
	 * origin (usually the bottom left) of the glyph.
	 *
	 * In horizontal mode:
	 *   + p is bottom left.
	 *   + q is the bottom right
	 * In vertical mode:
	 *   + p is top left (where it advanced from)
	 *   + q is bottom left
	 */
	if (wmode == 0)
	{
		p.x = trm.e;
		p.y = trm.f;
		q.x = trm.e + adv * dir.x;
		q.y = trm.f + adv * dir.y;
	}
	else
	{
		p.x = trm.e - adv * dir.x;
		p.y = trm.f - adv * dir.y;
		q.x = trm.e;
		q.y = trm.f;
	}

	if ((dev->opts.flags & FZ_STEXT_COLLECT_STYLES) != 0)
	{
		/* A glyph of -1 follows whatever was decided for the character
		 * before it; anything else is checked against the page. */
		if (glyph == -1)
		{
			if (dev->last_was_fake_bold)
				goto move_pen_and_exit;
		}
		else if (check_for_fake_bold(ctx, page->first_block, font, c, p, size, flags))
		{
			dev->last_was_fake_bold = 1;
			goto move_pen_and_exit;
		}
		dev->last_was_fake_bold = 0;
	}

	/* Find current position to enter new text. */
	cur_block = page->last_struct ? page->last_struct->last_block : page->last_block;
	if (cur_block && cur_block->type != FZ_STEXT_BLOCK_TEXT)
		cur_block = NULL;
	cur_line = cur_block ? cur_block->u.t.last_line : NULL;

	if (cur_line && glyph < 0)
	{
		/* Don't advance pen or break lines for no-glyph characters in a cluster */
		add_char_to_line(ctx, page, cur_line, trm, font, size, c, (dev->flags & FZ_STEXT_ACCURATE_BBOXES) ? glyph : NON_ACCURATE_GLYPH, &dev->pen, &dev->pen, bidi, dev->color, 0, flags, dev->flags);
		dev->lastbidi = bidi;
		dev->lastchar = c;
		return;
	}

	if (cur_line == NULL || cur_line->wmode != wmode || vec_dot(&ndir, &cur_line->dir) < 0.999f)
	{
		/* If the matrix has changed rotation, or the wmode is different (or if we don't have a line at all),
		 * then we can't append to the current block/line. */
		new_para = 1;
		new_line = 1;
	}
	else
	{
		/* Detect fake bold where text is printed twice in the same place. */
		/* Largely supplanted by the check_for_fake_bold mechanism above,
		 * but we leave this in for backward compatibility as it's cheap,
		 * and works even when FZ_STEXT_COLLECT_STYLES is not set. */
		dist = hypotf(q.x - dev->pen.x, q.y - dev->pen.y) / size;
		if (dist < FAKE_BOLD_MAX_DIST && c == dev->lastchar)
			return;

		/* Calculate how far we've moved since the last character. */
		delta.x = p.x - dev->pen.x;
		delta.y = p.y - dev->pen.y;

		/* The transform has not changed, so we know we're in the same
		 * direction. Calculate 2 distances; how far off the previous
		 * baseline we are, together with how far along the baseline
		 * we are from the expected position. */
		spacing = (ndir.x * delta.x + ndir.y * delta.y) / size;
		base_offset = (-ndir.y * delta.x + ndir.x * delta.y) / size;

		/* Only a small amount off the baseline - we'll take this */
		if (fabsf(base_offset) < BASE_MAX_DIST)
		{
			/* If mixed LTR and RTL content */
			if ((bidi & 1) != (dev->lastbidi & 1))
			{
				/* Ignore jumps within line when switching between LTR and RTL text. */
				new_line = 0;
			}

			/* RTL */
			else if (bidi & 1)
			{
				fz_point logical_delta = fz_make_point(p.x - dev->lag_pen.x, p.y - dev->lag_pen.y);
				float logical_spacing = (ndir.x * logical_delta.x + ndir.y * logical_delta.y) / size + adv;

				/* If the pen is where we would have been if we
				 * had advanced backwards from the previous
				 * character by this character's advance, we
				 * are probably seeing characters emitted in
				 * logical order.
				 */
				if (fabsf(logical_spacing) < SPACE_DIST)
				{
					new_line = 0;
				}

				/* However, if the pen has advanced to where we would expect it
				 * in an LTR context, we're seeing them emitted in visual order
				 * and should flag them for reordering!
				 */
				else if (fabsf(spacing) < SPACE_DIST)
				{
					bidi = 3; /* mark line as visual */
					new_line = 0;
				}

				/* And any other small jump could be a missing space. */
				else if (logical_spacing < 0 && logical_spacing > -SPACE_MAX_DIST)
				{
					if (wmode == 0 && may_add_space(dev->lastchar))
						add_space = 1;
					new_line = 0;
				}
				else if (spacing < 0 && spacing > -SPACE_MAX_DIST)
				{
					/* Motion is in line, but negative. We've probably got overlapping
					 * chars here. Live with it. */
					new_line = 0;
				}
				else if (spacing > 0 && spacing < SPACE_MAX_DIST)
				{
					bidi = 3; /* mark line as visual */
					if (wmode == 0 && may_add_space(dev->lastchar))
						add_space = 1;
					new_line = 0;
				}

				else
				{
					/* Motion is large and unexpected (probably a new table column). */
					new_line = 1;
				}
			}

			/* LTR or neutral character */
			else
			{
				if (fabsf(spacing) < SPACE_DIST)
				{
					/* Motion is in line and small enough to ignore. */
					new_line = 0;
				}
				else if (spacing < 0 && spacing > -SPACE_MAX_DIST)
				{
					/* Motion is in line, but negative. We've probably got overlapping
					 * chars here. Live with it. */
					new_line = 0;
				}
				else if (spacing > 0 && spacing < SPACE_MAX_DIST)
				{
					/* Motion is forward in line and large enough to warrant us adding a space. */
					if (wmode == 0 && may_add_space(dev->lastchar))
						add_space = 1;
					new_line = 0;
				}
				else
				{
					/* Motion is large and unexpected (probably a new table column). */
					new_line = 1;
				}
			}
		}

		/* Enough for a new line, but not enough for a new paragraph */
		else if (fabsf(base_offset) <= PARAGRAPH_DIST)
		{
			/* Check indent to spot text-indent style paragraphs */
			if (wmode == 0 && cur_line && dev->new_obj)
				if ((p.x - dev->start.x) > 0.5f)
					new_para = 1;
			new_line = 1;
		}

		/* Way off the baseline - open a new paragraph */
		else
		{
			new_para = 1;
			new_line = 1;
		}
	}

	/* Start a new block (but only at the beginning of a text object) */
	if (new_para || !cur_block)
	{
		cur_block = add_text_block_to_page(ctx, page);
		cur_line = cur_block->u.t.last_line;
	}

	/* When dehyphenating, a line break after a hyphen removes the hyphen and
	 * keeps the text on the same line. */
	if (new_line && (dev->flags & FZ_STEXT_DEHYPHENATE) && is_hyphen(dev->lastchar))
	{
		remove_last_char(ctx, cur_line);
		new_line = 0;
	}

	/* Start a new line */
	if (new_line || !cur_line || force_new_line)
	{
		cur_line = add_line_to_block(ctx, page, cur_block, &ndir, wmode, bidi);
		dev->start = p;
	}

	/* Add synthetic space */
	if (add_space && !(dev->flags & FZ_STEXT_INHIBIT_SPACES))
		add_char_to_line(ctx, page, cur_line, trm, font, size, ' ', (dev->flags & FZ_STEXT_ACCURATE_BBOXES) ? NON_ACCURATE_GLYPH_ADDED_SPACE : NON_ACCURATE_GLYPH, &dev->pen, &p, bidi, dev->color, 1, flags, dev->flags);

	add_char_to_line(ctx, page, cur_line, trm, font, size, c, (dev->flags & FZ_STEXT_ACCURATE_BBOXES) ? glyph : NON_ACCURATE_GLYPH, &p, &q, bidi, dev->color, 0, flags, dev->flags);

move_pen_and_exit:
	/* Record where this character started and ended for the next call. */
	dev->lastchar = c;
	dev->lastbidi = bidi;
	dev->lag_pen = p;
	dev->pen = q;

	dev->new_obj = 0;
	dev->trm = trm;
}
| 873 | |
| 874 static void | |
| 875 fz_add_stext_char(fz_context *ctx, | |
| 876 fz_stext_device *dev, | |
| 877 fz_font *font, | |
| 878 int c, | |
| 879 int glyph, | |
| 880 fz_matrix trm, | |
| 881 float adv, | |
| 882 int wmode, | |
| 883 int bidi, | |
| 884 int force_new_line, | |
| 885 int flags) | |
| 886 { | |
| 887 /* ignore when one unicode character maps to multiple glyphs */ | |
| 888 if (c == -1) | |
| 889 return; | |
| 890 | |
| 891 if (dev->flags & FZ_STEXT_ACCURATE_ASCENDERS) | |
| 892 fz_calculate_font_ascender_descender(ctx, font); | |
| 893 | |
| 894 if (!(dev->flags & FZ_STEXT_PRESERVE_LIGATURES)) | |
| 895 { | |
| 896 switch (c) | |
| 897 { | |
| 898 case 0xFB00: /* ff */ | |
| 899 fz_add_stext_char_imp(ctx, dev, font, 'f', glyph, trm, adv, wmode, bidi, force_new_line, flags); | |
| 900 fz_add_stext_char_imp(ctx, dev, font, 'f', -1, trm, 0, wmode, bidi, 0, flags); | |
| 901 return; | |
| 902 case 0xFB01: /* fi */ | |
| 903 fz_add_stext_char_imp(ctx, dev, font, 'f', glyph, trm, adv, wmode, bidi, force_new_line, flags); | |
| 904 fz_add_stext_char_imp(ctx, dev, font, 'i', -1, trm, 0, wmode, bidi, 0, flags); | |
| 905 return; | |
| 906 case 0xFB02: /* fl */ | |
| 907 fz_add_stext_char_imp(ctx, dev, font, 'f', glyph, trm, adv, wmode, bidi, force_new_line, flags); | |
| 908 fz_add_stext_char_imp(ctx, dev, font, 'l', -1, trm, 0, wmode, bidi, 0, flags); | |
| 909 return; | |
| 910 case 0xFB03: /* ffi */ | |
| 911 fz_add_stext_char_imp(ctx, dev, font, 'f', glyph, trm, adv, wmode, bidi, force_new_line, flags); | |
| 912 fz_add_stext_char_imp(ctx, dev, font, 'f', -1, trm, 0, wmode, bidi, 0, flags); | |
| 913 fz_add_stext_char_imp(ctx, dev, font, 'i', -1, trm, 0, wmode, bidi, 0, flags); | |
| 914 return; | |
| 915 case 0xFB04: /* ffl */ | |
| 916 fz_add_stext_char_imp(ctx, dev, font, 'f', glyph, trm, adv, wmode, bidi, force_new_line, flags); | |
| 917 fz_add_stext_char_imp(ctx, dev, font, 'f', -1, trm, 0, wmode, bidi, 0, flags); | |
| 918 fz_add_stext_char_imp(ctx, dev, font, 'l', -1, trm, 0, wmode, bidi, 0, flags); | |
| 919 return; | |
| 920 case 0xFB05: /* long st */ | |
| 921 case 0xFB06: /* st */ | |
| 922 fz_add_stext_char_imp(ctx, dev, font, 's', glyph, trm, adv, wmode, bidi, force_new_line, flags); | |
| 923 fz_add_stext_char_imp(ctx, dev, font, 't', -1, trm, 0, wmode, bidi, 0, flags); | |
| 924 return; | |
| 925 } | |
| 926 } | |
| 927 | |
| 928 if (!(dev->flags & FZ_STEXT_PRESERVE_WHITESPACE)) | |
| 929 { | |
| 930 switch (c) | |
| 931 { | |
| 932 case 0x0009: /* tab */ | |
| 933 case 0x0020: /* space */ | |
| 934 case 0x00A0: /* no-break space */ | |
| 935 case 0x1680: /* ogham space mark */ | |
| 936 case 0x180E: /* mongolian vowel separator */ | |
| 937 case 0x2000: /* en quad */ | |
| 938 case 0x2001: /* em quad */ | |
| 939 case 0x2002: /* en space */ | |
| 940 case 0x2003: /* em space */ | |
| 941 case 0x2004: /* three-per-em space */ | |
| 942 case 0x2005: /* four-per-em space */ | |
| 943 case 0x2006: /* six-per-em space */ | |
| 944 case 0x2007: /* figure space */ | |
| 945 case 0x2008: /* punctuation space */ | |
| 946 case 0x2009: /* thin space */ | |
| 947 case 0x200A: /* hair space */ | |
| 948 case 0x202F: /* narrow no-break space */ | |
| 949 case 0x205F: /* medium mathematical space */ | |
| 950 case 0x3000: /* ideographic space */ | |
| 951 c = ' '; | |
| 952 } | |
| 953 } | |
| 954 | |
| 955 fz_add_stext_char_imp(ctx, dev, font, c, glyph, trm, adv, wmode, bidi, force_new_line, flags); | |
| 956 } | |
| 957 | |
| 958 static fz_rect | |
| 959 current_clip(fz_context *ctx, fz_stext_device *dev) | |
| 960 { | |
| 961 fz_rect r = fz_infinite_rect; | |
| 962 | |
| 963 if (dev->flags & FZ_STEXT_CLIP) | |
| 964 { | |
| 965 r = fz_device_current_scissor(ctx, &dev->super); | |
| 966 r = fz_intersect_rect(r, dev->page->mediabox); | |
| 967 } | |
| 968 if (dev->flags & FZ_STEXT_CLIP_RECT) | |
| 969 r = fz_intersect_rect(r, dev->opts.clip); | |
| 970 | |
| 971 return r; | |
| 972 } | |
| 973 | |
| 974 static void | |
| 975 do_extract(fz_context *ctx, fz_stext_device *dev, fz_text_span *span, fz_matrix ctm, int start, int end, int flags) | |
| 976 { | |
| 977 fz_font *font = span->font; | |
| 978 fz_matrix tm = span->trm; | |
| 979 float adv; | |
| 980 int unicode; | |
| 981 int i; | |
| 982 | |
| 983 for (i = start; i < end; i++) | |
| 984 { | |
| 985 /* Calculate new pen location and delta */ | |
| 986 tm.e = span->items[i].x; | |
| 987 tm.f = span->items[i].y; | |
| 988 dev->last.trm = fz_concat(tm, ctm); | |
| 989 dev->last.bidi_level = span->bidi_level; | |
| 990 dev->last.wmode = span->wmode; | |
| 991 if (font != dev->last.font) | |
| 992 { | |
| 993 fz_drop_font(ctx, dev->last.font); | |
| 994 dev->last.font = fz_keep_font(ctx, font); | |
| 995 } | |
| 996 dev->last.valid = 1; | |
| 997 dev->last.flags = flags; | |
| 998 | |
| 999 if (dev->flags & (FZ_STEXT_CLIP | FZ_STEXT_CLIP_RECT)) | |
| 1000 { | |
| 1001 fz_rect r = current_clip(ctx, dev); | |
| 1002 if (fz_glyph_entirely_outside_box(ctx, &ctm, span, &span->items[i], &r)) | |
| 1003 { | |
| 1004 dev->last.clipped = 1; | |
| 1005 continue; | |
| 1006 } | |
| 1007 } | |
| 1008 dev->last.clipped = 0; | |
| 1009 | |
| 1010 /* Calculate bounding box and new pen position based on font metrics */ | |
| 1011 if (span->items[i].gid >= 0) | |
| 1012 adv = span->items[i].adv; | |
| 1013 else | |
| 1014 adv = 0; | |
| 1015 | |
| 1016 unicode = span->items[i].ucs; | |
| 1017 if (unicode == FZ_REPLACEMENT_CHARACTER) | |
| 1018 { | |
| 1019 if (dev->flags & FZ_STEXT_USE_CID_FOR_UNKNOWN_UNICODE) | |
| 1020 { | |
| 1021 unicode = span->items[i].cid; | |
| 1022 flags |= FZ_STEXT_UNICODE_IS_CID; | |
| 1023 } | |
| 1024 else if (dev->flags & FZ_STEXT_USE_GID_FOR_UNKNOWN_UNICODE) | |
| 1025 { | |
| 1026 unicode = span->items[i].gid; | |
| 1027 flags |= FZ_STEXT_UNICODE_IS_GID; | |
| 1028 } | |
| 1029 } | |
| 1030 | |
| 1031 /* Send the chars we have through. */ | |
| 1032 fz_add_stext_char(ctx, dev, font, | |
| 1033 unicode, | |
| 1034 span->items[i].gid, | |
| 1035 dev->last.trm, | |
| 1036 adv, | |
| 1037 dev->last.wmode, | |
| 1038 dev->last.bidi_level, | |
| 1039 (i == 0) && (dev->flags & FZ_STEXT_PRESERVE_SPANS), | |
| 1040 flags); | |
| 1041 } | |
| 1042 } | |
| 1043 | |
/*
	Return the rune at position idx (0-based, counted in runes) of a
	NUL-terminated UTF-8 string, or -1 if the string ends at or before
	that position.
*/
static int
rune_index(const char *utf8, size_t idx)
{
	int rune;

	for (;;)
	{
		utf8 += fz_chartorune(&rune, utf8);
		if (rune == 0)
			return -1;
		if (idx == 0)
			return rune;
		idx--;
	}
}
| 1060 | |
| 1061 static void | |
| 1062 flush_actualtext(fz_context *ctx, fz_stext_device *dev, const char *actualtext, int i) | |
| 1063 { | |
| 1064 if (*actualtext == 0) | |
| 1065 return; | |
| 1066 | |
| 1067 while (1) | |
| 1068 { | |
| 1069 int rune; | |
| 1070 actualtext += fz_chartorune(&rune, actualtext); | |
| 1071 | |
| 1072 if (rune == 0) | |
| 1073 break; | |
| 1074 | |
| 1075 if (dev->flags & (FZ_STEXT_CLIP | FZ_STEXT_CLIP_RECT)) | |
| 1076 if (dev->last.clipped) | |
| 1077 continue; | |
| 1078 | |
| 1079 fz_add_stext_char(ctx, dev, dev->last.font, | |
| 1080 rune, | |
| 1081 -1, | |
| 1082 dev->last.trm, | |
| 1083 0, | |
| 1084 dev->last.wmode, | |
| 1085 dev->last.bidi_level, | |
| 1086 (i == 0) && (dev->flags & FZ_STEXT_PRESERVE_SPANS), | |
| 1087 dev->last.flags); | |
| 1088 i++; | |
| 1089 } | |
| 1090 } | |
| 1091 | |
| 1092 static void | |
| 1093 do_extract_within_actualtext(fz_context *ctx, fz_stext_device *dev, fz_text_span *span, fz_matrix ctm, metatext_t *mt, int flags) | |
| 1094 { | |
| 1095 /* We are within an actualtext block. This means we can't just add the chars | |
| 1096 * as they are. We need to add the chars as they are meant to be. Sadly the | |
| 1097 * actualtext mechanism doesn't help us at all with positioning. */ | |
| 1098 fz_font *font = span->font; | |
| 1099 fz_matrix tm = span->trm; | |
| 1100 float adv; | |
| 1101 int start, i, end; | |
| 1102 char *actualtext = mt->text; | |
| 1103 size_t z = fz_utflen(actualtext); | |
| 1104 | |
| 1105 /* If actualtext is empty, nothing to do! */ | |
| 1106 if (z == 0) | |
| 1107 return; | |
| 1108 | |
| 1109 /* Now, we HOPE that the creator of a PDF will minimise the actual text | |
| 1110 * differences, so that we'll get: | |
| 1111 * "Politicians <Actualtext="lie">fib</ActualText>, always." | |
| 1112 * rather than: | |
| 1113 * "<Actualtext="Politicians lie, always">Politicians fib, always.</ActualText> | |
| 1114 * but experience with PDF files tells us that this won't always be the case. | |
| 1115 * | |
| 1116 * We try to minimise the actualtext section here, just in case. | |
| 1117 */ | |
| 1118 | |
| 1119 /* Spot a matching prefix and send it. */ | |
| 1120 for (start = 0; start < span->len; start++) | |
| 1121 { | |
| 1122 int rune; | |
| 1123 int len = fz_chartorune(&rune, actualtext); | |
| 1124 if (span->items[start].gid != rune || rune == 0) | |
| 1125 break; | |
| 1126 actualtext += len; z--; | |
| 1127 } | |
| 1128 if (start != 0) | |
| 1129 do_extract(ctx, dev, span, ctm, 0, start, flags); | |
| 1130 | |
| 1131 if (start == span->len) | |
| 1132 { | |
| 1133 /* The prefix has consumed all this object. Just shorten the actualtext and we'll | |
| 1134 * catch the rest next time. */ | |
| 1135 z = strlen(actualtext)+1; | |
| 1136 memmove(mt->text, actualtext, z); | |
| 1137 return; | |
| 1138 } | |
| 1139 | |
| 1140 /* We haven't consumed the whole string, so there must be runes left. | |
| 1141 * Shut coverity up. */ | |
| 1142 assert(z != 0); | |
| 1143 | |
| 1144 /* Spot a matching postfix. Can't send it til the end. */ | |
| 1145 for (end = span->len; end > start; end--) | |
| 1146 { | |
| 1147 /* Nasty n^2 algo here, cos backtracking through utf8 is not trivial. It'll do. */ | |
| 1148 int rune = rune_index(actualtext, z-1); | |
| 1149 if (span->items[end-1].gid != rune) | |
| 1150 break; | |
| 1151 z--; | |
| 1152 } | |
| 1153 /* So we can send end -> span->len at the end. */ | |
| 1154 | |
| 1155 /* So we have at least SOME chars that don't match. */ | |
| 1156 /* Now, do the difficult bit in the middle.*/ | |
| 1157 /* items[start..end] have to be sent with actualtext[start..z] */ | |
| 1158 for (i = start; i < end; i++) | |
| 1159 { | |
| 1160 fz_text_item *item = &span->items[i]; | |
| 1161 int rune = -1; | |
| 1162 | |
| 1163 if ((size_t)i < z) | |
| 1164 actualtext += fz_chartorune(&rune, actualtext); | |
| 1165 | |
| 1166 /* Calculate new pen location and delta */ | |
| 1167 tm.e = item->x; | |
| 1168 tm.f = item->y; | |
| 1169 dev->last.trm = fz_concat(tm, ctm); | |
| 1170 dev->last.bidi_level = span->bidi_level; | |
| 1171 dev->last.wmode = span->wmode; | |
| 1172 if (font != dev->last.font) | |
| 1173 { | |
| 1174 fz_drop_font(ctx, dev->last.font); | |
| 1175 dev->last.font = fz_keep_font(ctx, font); | |
| 1176 } | |
| 1177 dev->last.valid = 1; | |
| 1178 | |
| 1179 if (dev->flags & (FZ_STEXT_CLIP | FZ_STEXT_CLIP_RECT)) | |
| 1180 { | |
| 1181 fz_rect r = current_clip(ctx, dev); | |
| 1182 if (fz_glyph_entirely_outside_box(ctx, &ctm, span, &span->items[i], &r)) | |
| 1183 { | |
| 1184 dev->last.clipped = 1; | |
| 1185 continue; | |
| 1186 } | |
| 1187 } | |
| 1188 dev->last.clipped = 0; | |
| 1189 | |
| 1190 /* Calculate bounding box and new pen position based on font metrics */ | |
| 1191 if (item->gid >= 0) | |
| 1192 adv = item->adv; | |
| 1193 else | |
| 1194 adv = 0; | |
| 1195 | |
| 1196 fz_add_stext_char(ctx, dev, font, | |
| 1197 rune, | |
| 1198 span->items[i].gid, | |
| 1199 dev->last.trm, | |
| 1200 adv, | |
| 1201 dev->last.wmode, | |
| 1202 dev->last.bidi_level, | |
| 1203 (i == 0) && (dev->flags & FZ_STEXT_PRESERVE_SPANS), | |
| 1204 flags); | |
| 1205 } | |
| 1206 | |
| 1207 /* If we haven't spotted a postfix by this point, then don't force ourselves to output | |
| 1208 * any more of the actualtext at this point. We might get a new text object that matches | |
| 1209 * more of it. */ | |
| 1210 if (end == span->len) | |
| 1211 { | |
| 1212 /* Shorten actualtext and exit. */ | |
| 1213 z = strlen(actualtext)+1; | |
| 1214 memmove(mt->text, actualtext, z); | |
| 1215 return; | |
| 1216 } | |
| 1217 | |
| 1218 /* We found a matching postfix. It seems likely that this is going to be the only | |
| 1219 * text object we get, so send any remaining actualtext now. */ | |
| 1220 flush_actualtext(ctx, dev, actualtext, i); | |
| 1221 | |
| 1222 /* Send the postfix */ | |
| 1223 if (end != span->len) | |
| 1224 do_extract(ctx, dev, span, ctm, end, span->len, flags); | |
| 1225 | |
| 1226 mt->text[0] = 0; | |
| 1227 } | |
| 1228 | |
| 1229 static void | |
| 1230 fz_stext_extract(fz_context *ctx, fz_stext_device *dev, fz_text_span *span, fz_matrix ctm, int flags) | |
| 1231 { | |
| 1232 fz_stext_device *tdev = (fz_stext_device*)dev; | |
| 1233 metatext_t *mt = NULL; | |
| 1234 | |
| 1235 if (span->len == 0) | |
| 1236 return; | |
| 1237 | |
| 1238 /* Are we in an actualtext? */ | |
| 1239 if (!(tdev->opts.flags & FZ_STEXT_IGNORE_ACTUALTEXT)) | |
| 1240 mt = find_actualtext(dev); | |
| 1241 | |
| 1242 if (mt) | |
| 1243 do_extract_within_actualtext(ctx, dev, span, ctm, mt, flags); | |
| 1244 else | |
| 1245 do_extract(ctx, dev, span, ctm, 0, span->len, flags); | |
| 1246 } | |
| 1247 | |
| 1248 static uint32_t hexrgba_from_color(fz_context *ctx, fz_colorspace *colorspace, const float *color, float alpha) | |
| 1249 { | |
| 1250 float rgb[3]; | |
| 1251 fz_convert_color(ctx, colorspace, color, fz_device_rgb(ctx), rgb, NULL, fz_default_color_params); | |
| 1252 return | |
| 1253 ((uint32_t) (fz_clampi(alpha * 255 + 0.5f, 0, 255) << 24)) | | |
| 1254 ((uint32_t) (fz_clampi(rgb[0] * 255 + 0.5f, 0, 255) << 16)) | | |
| 1255 ((uint32_t) (fz_clampi(rgb[1] * 255 + 0.5f, 0, 255) << 8)) | | |
| 1256 ((uint32_t) (fz_clampi(rgb[2] * 255 + 0.5f, 0, 255))); | |
| 1257 } | |
| 1258 | |
| 1259 static void | |
| 1260 fz_stext_fill_text(fz_context *ctx, fz_device *dev, const fz_text *text, fz_matrix ctm, | |
| 1261 fz_colorspace *colorspace, const float *color, float alpha, fz_color_params color_params) | |
| 1262 { | |
| 1263 fz_stext_device *tdev = (fz_stext_device*)dev; | |
| 1264 fz_text_span *span; | |
| 1265 if (text == tdev->lasttext && (tdev->opts.flags & FZ_STEXT_COLLECT_STYLES) == 0) | |
| 1266 return; | |
| 1267 tdev->color = hexrgba_from_color(ctx, colorspace, color, alpha); | |
| 1268 tdev->new_obj = 1; | |
| 1269 for (span = text->head; span; span = span->next) | |
| 1270 fz_stext_extract(ctx, tdev, span, ctm, FZ_STEXT_FILLED); | |
| 1271 fz_drop_text(ctx, tdev->lasttext); | |
| 1272 tdev->lasttext = fz_keep_text(ctx, text); | |
| 1273 } | |
| 1274 | |
| 1275 static void | |
| 1276 fz_stext_stroke_text(fz_context *ctx, fz_device *dev, const fz_text *text, const fz_stroke_state *stroke, fz_matrix ctm, | |
| 1277 fz_colorspace *colorspace, const float *color, float alpha, fz_color_params color_params) | |
| 1278 { | |
| 1279 fz_stext_device *tdev = (fz_stext_device*)dev; | |
| 1280 fz_text_span *span; | |
| 1281 if (text == tdev->lasttext && (tdev->opts.flags & FZ_STEXT_COLLECT_STYLES) == 0) | |
| 1282 return; | |
| 1283 tdev->color = hexrgba_from_color(ctx, colorspace, color, alpha); | |
| 1284 tdev->new_obj = 1; | |
| 1285 for (span = text->head; span; span = span->next) | |
| 1286 fz_stext_extract(ctx, tdev, span, ctm, FZ_STEXT_STROKED); | |
| 1287 fz_drop_text(ctx, tdev->lasttext); | |
| 1288 tdev->lasttext = fz_keep_text(ctx, text); | |
| 1289 } | |
| 1290 | |
| 1291 static void | |
| 1292 fz_stext_clip_text(fz_context *ctx, fz_device *dev, const fz_text *text, fz_matrix ctm, fz_rect scissor) | |
| 1293 { | |
| 1294 fz_stext_device *tdev = (fz_stext_device*)dev; | |
| 1295 fz_text_span *span; | |
| 1296 if (text == tdev->lasttext && (tdev->opts.flags & FZ_STEXT_COLLECT_STYLES) == 0) | |
| 1297 return; | |
| 1298 tdev->color = 0; | |
| 1299 tdev->new_obj = 1; | |
| 1300 for (span = text->head; span; span = span->next) | |
| 1301 fz_stext_extract(ctx, tdev, span, ctm, FZ_STEXT_FILLED | FZ_STEXT_CLIPPED); | |
| 1302 fz_drop_text(ctx, tdev->lasttext); | |
| 1303 tdev->lasttext = fz_keep_text(ctx, text); | |
| 1304 } | |
| 1305 | |
| 1306 static void | |
| 1307 fz_stext_clip_stroke_text(fz_context *ctx, fz_device *dev, const fz_text *text, const fz_stroke_state *stroke, fz_matrix ctm, fz_rect scissor) | |
| 1308 { | |
| 1309 fz_stext_device *tdev = (fz_stext_device*)dev; | |
| 1310 fz_text_span *span; | |
| 1311 if (text == tdev->lasttext && (tdev->opts.flags & FZ_STEXT_COLLECT_STYLES) == 0) | |
| 1312 return; | |
| 1313 tdev->color = 0; | |
| 1314 tdev->new_obj = 1; | |
| 1315 for (span = text->head; span; span = span->next) | |
| 1316 fz_stext_extract(ctx, tdev, span, ctm, FZ_STEXT_STROKED | FZ_STEXT_CLIPPED); | |
| 1317 fz_drop_text(ctx, tdev->lasttext); | |
| 1318 tdev->lasttext = fz_keep_text(ctx, text); | |
| 1319 } | |
| 1320 | |
| 1321 static void | |
| 1322 fz_stext_ignore_text(fz_context *ctx, fz_device *dev, const fz_text *text, fz_matrix ctm) | |
| 1323 { | |
| 1324 fz_stext_device *tdev = (fz_stext_device*)dev; | |
| 1325 fz_text_span *span; | |
| 1326 if (text == tdev->lasttext && (tdev->opts.flags & FZ_STEXT_COLLECT_STYLES) == 0) | |
| 1327 return; | |
| 1328 tdev->color = 0; | |
| 1329 tdev->new_obj = 1; | |
| 1330 for (span = text->head; span; span = span->next) | |
| 1331 fz_stext_extract(ctx, tdev, span, ctm, 0); | |
| 1332 fz_drop_text(ctx, tdev->lasttext); | |
| 1333 tdev->lasttext = fz_keep_text(ctx, text); | |
| 1334 } | |
| 1335 | |
| 1336 static void | |
| 1337 fz_stext_begin_metatext(fz_context *ctx, fz_device *dev, fz_metatext meta, const char *text) | |
| 1338 { | |
| 1339 fz_stext_device *tdev = (fz_stext_device*)dev; | |
| 1340 metatext_t *mt = fz_malloc_struct(ctx, metatext_t); | |
| 1341 | |
| 1342 mt->prev = tdev->metatext; | |
| 1343 tdev->metatext = mt; | |
| 1344 mt->type = meta; | |
| 1345 mt->text = text ? fz_strdup(ctx, text) : NULL; | |
| 1346 mt->bounds = fz_empty_rect; | |
| 1347 } | |
| 1348 | |
| 1349 static void | |
| 1350 pop_metatext(fz_context *ctx, fz_stext_device *dev) | |
| 1351 { | |
| 1352 metatext_t *prev; | |
| 1353 fz_rect bounds; | |
| 1354 | |
| 1355 if (!dev->metatext) | |
| 1356 return; | |
| 1357 | |
| 1358 prev = dev->metatext->prev; | |
| 1359 bounds = dev->metatext->bounds; | |
| 1360 fz_free(ctx, dev->metatext->text); | |
| 1361 fz_free(ctx, dev->metatext); | |
| 1362 dev->metatext = prev; | |
| 1363 if (prev) | |
| 1364 prev->bounds = fz_union_rect(prev->bounds, bounds); | |
| 1365 } | |
| 1366 | |
| 1367 static void | |
| 1368 fz_stext_end_metatext(fz_context *ctx, fz_device *dev) | |
| 1369 { | |
| 1370 fz_stext_device *tdev = (fz_stext_device*)dev; | |
| 1371 fz_font *myfont = NULL; | |
| 1372 | |
| 1373 if (!tdev->metatext) | |
| 1374 return; /* Mismatched pop. Live with it. */ | |
| 1375 | |
| 1376 if (tdev->metatext->type != FZ_METATEXT_ACTUALTEXT) | |
| 1377 { | |
| 1378 /* We only deal with ActualText here. Just pop anything else off, | |
| 1379 * and we're done. */ | |
| 1380 pop_metatext(ctx, tdev); | |
| 1381 return; | |
| 1382 } | |
| 1383 | |
| 1384 /* If we have a 'last' text position, send the content after that. */ | |
| 1385 if (tdev->last.valid) | |
| 1386 { | |
| 1387 flush_actualtext(ctx, tdev, tdev->metatext->text, 0); | |
| 1388 pop_metatext(ctx, tdev); | |
| 1389 return; | |
| 1390 } | |
| 1391 | |
| 1392 /* If we have collected a rectangle for content that encloses the actual text, | |
| 1393 * send the content there. */ | |
| 1394 if (!fz_is_empty_rect(tdev->metatext->bounds)) | |
| 1395 { | |
| 1396 tdev->last.trm.a = tdev->metatext->bounds.x1 - tdev->metatext->bounds.x0; | |
| 1397 tdev->last.trm.b = 0; | |
| 1398 tdev->last.trm.c = 0; | |
| 1399 tdev->last.trm.d = tdev->metatext->bounds.y1 - tdev->metatext->bounds.y0; | |
| 1400 tdev->last.trm.e = tdev->metatext->bounds.x0; | |
| 1401 tdev->last.trm.f = tdev->metatext->bounds.y0; | |
| 1402 } | |
| 1403 else | |
| 1404 fz_warn(ctx, "Actualtext with no position. Text may be lost or mispositioned."); | |
| 1405 | |
| 1406 fz_var(myfont); | |
| 1407 | |
| 1408 fz_try(ctx) | |
| 1409 { | |
| 1410 if (tdev->last.font == NULL) | |
| 1411 { | |
| 1412 myfont = fz_new_base14_font(ctx, "Helvetica"); | |
| 1413 tdev->last.font = myfont; | |
| 1414 } | |
| 1415 flush_actualtext(ctx, tdev, tdev->metatext->text, 0); | |
| 1416 pop_metatext(ctx, tdev); | |
| 1417 } | |
| 1418 fz_always(ctx) | |
| 1419 { | |
| 1420 if (myfont) | |
| 1421 { | |
| 1422 tdev->last.font = NULL; | |
| 1423 fz_drop_font(ctx, myfont); | |
| 1424 } | |
| 1425 } | |
| 1426 fz_catch(ctx) | |
| 1427 fz_rethrow(ctx); | |
| 1428 } | |
| 1429 | |
| 1430 | |
| 1431 /* Images and shadings */ | |
| 1432 | |
| 1433 static void | |
| 1434 fz_stext_fill_image(fz_context *ctx, fz_device *dev, fz_image *img, fz_matrix ctm, float alpha, fz_color_params color_params) | |
| 1435 { | |
| 1436 fz_stext_device *tdev = (fz_stext_device*)dev; | |
| 1437 fz_rect *bounds = actualtext_bounds(tdev); | |
| 1438 | |
| 1439 /* If there is an actualtext in force, update its bounds. */ | |
| 1440 if (bounds) | |
| 1441 { | |
| 1442 static const fz_rect unit = { 0, 0, 1, 1 }; | |
| 1443 *bounds = fz_union_rect(*bounds, fz_transform_rect(unit, ctm)); | |
| 1444 } | |
| 1445 | |
| 1446 /* Unless we are being told to preserve images, nothing to do here. */ | |
| 1447 if ((tdev->opts.flags & FZ_STEXT_PRESERVE_IMAGES) == 0) | |
| 1448 return; | |
| 1449 | |
| 1450 /* If the alpha is less than 50% then it's probably a watermark or effect or something. Skip it. */ | |
| 1451 if (alpha >= 0.5f) | |
| 1452 add_image_block_to_page(ctx, tdev->page, ctm, img); | |
| 1453 | |
| 1454 } | |
| 1455 | |
| 1456 static void | |
| 1457 fz_stext_fill_image_mask(fz_context *ctx, fz_device *dev, fz_image *img, fz_matrix ctm, | |
| 1458 fz_colorspace *cspace, const float *color, float alpha, fz_color_params color_params) | |
| 1459 { | |
| 1460 fz_stext_fill_image(ctx, dev, img, ctm, alpha, color_params); | |
| 1461 } | |
| 1462 | |
| 1463 static fz_image * | |
| 1464 fz_new_image_from_shade(fz_context *ctx, fz_shade *shade, fz_matrix *in_out_ctm, fz_color_params color_params, fz_rect scissor) | |
| 1465 { | |
| 1466 fz_matrix ctm = *in_out_ctm; | |
| 1467 fz_pixmap *pix; | |
| 1468 fz_image *img = NULL; | |
| 1469 fz_rect bounds; | |
| 1470 fz_irect bbox; | |
| 1471 | |
| 1472 bounds = fz_bound_shade(ctx, shade, ctm); | |
| 1473 bounds = fz_intersect_rect(bounds, scissor); | |
| 1474 bbox = fz_irect_from_rect(bounds); | |
| 1475 | |
| 1476 pix = fz_new_pixmap_with_bbox(ctx, fz_device_rgb(ctx), bbox, NULL, !shade->use_background); | |
| 1477 fz_try(ctx) | |
| 1478 { | |
| 1479 if (shade->use_background) | |
| 1480 fz_fill_pixmap_with_color(ctx, pix, shade->colorspace, shade->background, color_params); | |
| 1481 else | |
| 1482 fz_clear_pixmap(ctx, pix); | |
| 1483 fz_paint_shade(ctx, shade, NULL, ctm, pix, color_params, bbox, NULL, NULL); | |
| 1484 img = fz_new_image_from_pixmap(ctx, pix, NULL); | |
| 1485 } | |
| 1486 fz_always(ctx) | |
| 1487 fz_drop_pixmap(ctx, pix); | |
| 1488 fz_catch(ctx) | |
| 1489 fz_rethrow(ctx); | |
| 1490 | |
| 1491 in_out_ctm->a = pix->w; | |
| 1492 in_out_ctm->b = 0; | |
| 1493 in_out_ctm->c = 0; | |
| 1494 in_out_ctm->d = pix->h; | |
| 1495 in_out_ctm->e = pix->x; | |
| 1496 in_out_ctm->f = pix->y; | |
| 1497 return img; | |
| 1498 } | |
| 1499 | |
| 1500 static void | |
| 1501 fz_stext_fill_shade(fz_context *ctx, fz_device *dev, fz_shade *shade, fz_matrix ctm, float alpha, fz_color_params color_params) | |
| 1502 { | |
| 1503 fz_stext_device *tdev = (fz_stext_device*)dev; | |
| 1504 fz_rect *bounds = actualtext_bounds(tdev); | |
| 1505 fz_matrix local_ctm; | |
| 1506 fz_rect scissor; | |
| 1507 fz_image *image; | |
| 1508 | |
| 1509 /* If we aren't keeping images, but we are in a bound, update the bounds | |
| 1510 * without generating the entire image. */ | |
| 1511 if ((tdev->opts.flags & FZ_STEXT_PRESERVE_IMAGES) == 0 && bounds) | |
| 1512 { | |
| 1513 *bounds = fz_union_rect(*bounds, fz_bound_shade(ctx, shade, ctm)); | |
| 1514 return; | |
| 1515 } | |
| 1516 | |
| 1517 /* Unless we are preserving image, nothing to do here. */ | |
| 1518 if ((tdev->opts.flags & FZ_STEXT_PRESERVE_IMAGES) == 0) | |
| 1519 return; | |
| 1520 | |
| 1521 local_ctm = ctm; | |
| 1522 scissor = fz_device_current_scissor(ctx, dev); | |
| 1523 if (dev->flags & FZ_STEXT_CLIP_RECT) | |
| 1524 scissor = fz_intersect_rect(scissor, tdev->opts.clip); | |
| 1525 scissor = fz_intersect_rect(scissor, tdev->page->mediabox); | |
| 1526 image = fz_new_image_from_shade(ctx, shade, &local_ctm, color_params, scissor); | |
| 1527 fz_try(ctx) | |
| 1528 fz_stext_fill_image(ctx, dev, image, local_ctm, alpha, color_params); | |
| 1529 fz_always(ctx) | |
| 1530 fz_drop_image(ctx, image); | |
| 1531 fz_catch(ctx) | |
| 1532 fz_rethrow(ctx); | |
| 1533 } | |
| 1534 | |
| 1535 static void | |
| 1536 fixup_bboxes_and_bidi(fz_context *ctx, fz_stext_block *block) | |
| 1537 { | |
| 1538 fz_stext_line *line; | |
| 1539 fz_stext_char *ch; | |
| 1540 | |
| 1541 for ( ; block != NULL; block = block->next) | |
| 1542 { | |
| 1543 if (block->type == FZ_STEXT_BLOCK_STRUCT) | |
| 1544 if (block->u.s.down) | |
| 1545 fixup_bboxes_and_bidi(ctx, block->u.s.down->first_block); | |
| 1546 if (block->type != FZ_STEXT_BLOCK_TEXT) | |
| 1547 continue; | |
| 1548 for (line = block->u.t.first_line; line; line = line->next) | |
| 1549 { | |
| 1550 int reorder = 0; | |
| 1551 for (ch = line->first_char; ch; ch = ch->next) | |
| 1552 { | |
| 1553 fz_rect ch_box = fz_rect_from_quad(ch->quad); | |
| 1554 if (ch == line->first_char) | |
| 1555 line->bbox = ch_box; | |
| 1556 else | |
| 1557 line->bbox = fz_union_rect(line->bbox, ch_box); | |
| 1558 if (ch->bidi == 3) | |
| 1559 reorder = 1; | |
| 1560 } | |
| 1561 block->bbox = fz_union_rect(block->bbox, line->bbox); | |
| 1562 if (reorder) | |
| 1563 reverse_bidi_line(line); | |
| 1564 } | |
| 1565 } | |
| 1566 } | |
| 1567 | |
| 1568 static void | |
| 1569 advance_to_x(fz_point *a, fz_point b, float x) | |
| 1570 { | |
| 1571 a->y += (b.y - a->y) * (x - a->x) / (b.x - a->x); | |
| 1572 a->x = x; | |
| 1573 } | |
| 1574 | |
| 1575 static void | |
| 1576 advance_to_y(fz_point *a, fz_point b, float y) | |
| 1577 { | |
| 1578 a->x += (b.x - a->x) * (y - a->y) / (b.y - a->y); | |
| 1579 a->y = y; | |
| 1580 } | |
| 1581 | |
| 1582 static int | |
| 1583 line_crosses_rect(fz_point a, fz_point b, fz_rect r) | |
| 1584 { | |
| 1585 /* Cope with trivial exclusions */ | |
| 1586 if (a.x < r.x0 && b.x < r.x0) | |
| 1587 return 0; | |
| 1588 if (a.x > r.x1 && b.x > r.x1) | |
| 1589 return 0; | |
| 1590 if (a.y < r.y0 && b.y < r.y0) | |
| 1591 return 0; | |
| 1592 if (a.y > r.y1 && b.y > r.y1) | |
| 1593 return 0; | |
| 1594 | |
| 1595 if (a.x < r.x0) | |
| 1596 advance_to_x(&a, b, r.x0); | |
| 1597 if (a.x > r.x1) | |
| 1598 advance_to_x(&a, b, r.x1); | |
| 1599 if (a.y < r.y0) | |
| 1600 advance_to_y(&a, b, r.y0); | |
| 1601 if (a.y > r.y1) | |
| 1602 advance_to_y(&a, b, r.y1); | |
| 1603 | |
| 1604 return fz_is_point_inside_rect(a, r); | |
| 1605 } | |
| 1606 | |
| 1607 static float | |
| 1608 calculate_ascent(fz_point p, fz_point origin, fz_point dir) | |
| 1609 { | |
| 1610 return fabsf((origin.x-p.x)*dir.y - (origin.y-p.y)*dir.x); | |
| 1611 } | |
| 1612 | |
| 1613 /* Create us a rect from the given quad, but extend it downwards | |
| 1614 * to allow for underlines that pass under the glyphs. */ | |
| 1615 static fz_rect expanded_rect_from_quad(fz_quad quad, fz_point dir, fz_point origin, float size) | |
| 1616 { | |
| 1617 /* Consider the two rects from A and g respectively. | |
| 1618 * | |
| 1619 * ul +------+ ur or | |
| 1620 * | /\ | ul +------+ ur | |
| 1621 * | /__\ | | /''\ | | |
| 1622 * |/ \| |( || | |
| 1623 * ll +------+ lr | ''''|| | |
| 1624 * | ''' | <-expected underline level | |
| 1625 * ll +------+ lr | |
| 1626 * | |
| 1627 * So an underline won't cross A's rect, but will cross g's. | |
| 1628 * We want to make a rect that includes a suitable amount of | |
| 1629 * space underneath. The information we have available to us | |
| 1630 * is summed up here: | |
| 1631 * | |
| 1632 * ul +---------+ ur | |
| 1633 * | | | |
| 1634 * | origin | | |
| 1635 * |+----------> dir | |
| 1636 * | | | |
| 1637 * ll +---------+ lr | |
| 1638 * | |
| 1639 * Consider the distance from ul to the line that passes through | |
| 1640 * the origin with direction dir. Similarly, consider the distance | |
| 1641 * from ur to the same line. This can be thought of as the 'ascent' | |
| 1642 * of this character. | |
| 1643 * | |
| 1644 * We'd like the distance from ul to ll to be greater than this, so | |
| 1645 * as to ensure we cover the possible location where an underline | |
| 1646 * might reasonably go. | |
| 1647 * | |
| 1648 * If we have a line (l) through point A with direction vector u, | |
| 1649 * the distance between point P and line(l) is: | |
| 1650 * | |
| 1651 * d(P,l) = || AP x u || / || u || | |
| 1652 * | |
| 1653 * where x is the cross product. | |
| 1654 * | |
| 1655 * For us, because || dir || = 1: | |
| 1656 * | |
| 1657 * d(ul, origin) = || (origin-ul) x dir || | |
| 1658 * | |
| 1659 * The cross product is only defined in 3 (or 7!) dimensions, so | |
| 1660 * extend both vectors into 3d by defining a 0 z component. | |
| 1661 * | |
| 1662 * (origin-ul) x dir = [ (origin.y - ul.y) . 0 - 0 . dir.y ] | |
| 1663 * [ 0 . dir.x - (origin.x - ul.y) . 0 ] | |
| 1664 * [ (origin.x - ul.x) . dir.y - (origin.y - ul.y) . dir.x ] | |
| 1665 * | |
| 1666 * So d(ul, origin) = abs(D) where D = (origin.x-ul.x).dir.y - (origin.y-ul.y).dir.x | |
| 1667 */ | |
| 1668 float ascent = (calculate_ascent(quad.ul, origin, dir) + calculate_ascent(quad.ur, origin, dir)) / 2; | |
| 1669 fz_point left = { quad.ll.x - quad.ul.x, quad.ll.y - quad.ul.y }; | |
| 1670 fz_point right = { quad.lr.x - quad.ur.x, quad.lr.y - quad.ur.y }; | |
| 1671 float height = (hypotf(left.x, left.y) + hypotf(right.x, right.y))/2; | |
| 1672 int neg = 0; | |
| 1673 float extra_rise = 0; | |
| 1674 | |
| 1675 /* Spaces will have 0 ascent. underscores will have small ascent. | |
| 1676 * We want a sane ascent to be able to spot strikeouts, but not | |
| 1677 * so big that it incorporates lines above the text, like borders. */ | |
| 1678 if (ascent < 0.75*size) | |
| 1679 extra_rise = 0.75*size - ascent; | |
| 1680 | |
| 1681 /* We'd like height to be at least ascent + 1/4 size */ | |
| 1682 if (height < 0) | |
| 1683 neg = 1, height = -height; | |
| 1684 if (height < ascent + size * 0.25f) | |
| 1685 height = ascent + size * 0.25f; | |
| 1686 | |
| 1687 height -= ascent; | |
| 1688 if (neg) | |
| 1689 height = -height; | |
| 1690 quad.ll.x += - height * dir.y; | |
| 1691 quad.ll.y += height * dir.x; | |
| 1692 quad.lr.x += - height * dir.y; | |
| 1693 quad.lr.y += height * dir.x; | |
| 1694 quad.ul.x -= - extra_rise * dir.y; | |
| 1695 quad.ul.y -= extra_rise * dir.x; | |
| 1696 quad.ur.x -= - extra_rise * dir.y; | |
| 1697 quad.ur.y -= extra_rise * dir.x; | |
| 1698 | |
| 1699 return fz_rect_from_quad(quad); | |
| 1700 } | |
| 1701 | |
/* Approximate float equality: non-zero when a and b differ by less than EPSILON. */
static int feq(float a,float b)
{
#define EPSILON 0.00001
	float gap = a;

	gap -= b;
	if (gap < 0)
		gap = -gap;
	return gap < EPSILON;
}
| 1710 | |
| 1711 static void | |
| 1712 check_strikeout(fz_context *ctx, fz_stext_block *block, fz_point from, fz_point to, fz_point dir, float thickness) | |
| 1713 { | |
| 1714 for ( ; block; block = block->next) | |
| 1715 { | |
| 1716 fz_stext_line *line; | |
| 1717 | |
| 1718 if (block->type != FZ_STEXT_BLOCK_TEXT) | |
| 1719 continue; | |
| 1720 | |
| 1721 for (line = block->u.t.first_line; line != NULL; line = line->next) | |
| 1722 { | |
| 1723 fz_stext_char *ch; | |
| 1724 | |
| 1725 if ((!feq(line->dir.x, dir.x) || !feq(line->dir.y, dir.y)) && | |
| 1726 (!feq(line->dir.x, -dir.x) || !feq(line->dir.y, -dir.y))) | |
| 1727 continue; | |
| 1728 | |
| 1729 /* Matching directions... */ | |
| 1730 | |
| 1731 /* Unfortunately, we don't have a valid line->bbox at this point, so we need to check | |
| 1732 * chars. - FIXME: Now we do! */ | |
| 1733 for (ch = line->first_char; ch; ch = ch->next) | |
| 1734 { | |
| 1735 fz_point up; | |
| 1736 float dx, dy, dot; | |
| 1737 fz_rect ch_box; | |
| 1738 | |
| 1739 /* If the thickness is more than a 1/4 of the size, it's a highlight, not a | |
| 1740 * line! */ | |
| 1741 if (ch->size < thickness*4) | |
| 1742 continue; | |
| 1743 | |
| 1744 ch_box = expanded_rect_from_quad(ch->quad, line->dir, ch->origin, ch->size); | |
| 1745 | |
| 1746 if (!line_crosses_rect(from, to, ch_box)) | |
| 1747 continue; | |
| 1748 | |
| 1749 /* Is this a strikeout or an underline? */ | |
| 1750 | |
| 1751 /* The baseline moves from ch->origin in the direction line->dir */ | |
| 1752 up.x = line->dir.y; | |
| 1753 up.y = -line->dir.x; | |
| 1754 | |
| 1755 /* How far is our line displaced from the line through the origin? */ | |
| 1756 dx = from.x - ch->origin.x; | |
| 1757 dy = from.y - ch->origin.y; | |
| 1758 /* Dot product with up. up is normalised */ | |
| 1759 dot = dx * up.x + dy * up.y; | |
| 1760 | |
| 1761 if (dot > 0) | |
| 1762 ch->flags |= FZ_STEXT_STRIKEOUT; | |
| 1763 else | |
| 1764 ch->flags |= FZ_STEXT_UNDERLINE; | |
| 1765 } | |
| 1766 } | |
| 1767 } | |
| 1768 } | |
| 1769 | |
| 1770 static void | |
| 1771 check_rects_for_strikeout(fz_context *ctx, fz_stext_device *tdev, fz_stext_page *page) | |
| 1772 { | |
| 1773 int i, n = tdev->rect_len; | |
| 1774 | |
| 1775 for (i = 0; i < n; i++) | |
| 1776 { | |
| 1777 fz_point from = tdev->rects[i].from; | |
| 1778 fz_point to = tdev->rects[i].to; | |
| 1779 float thickness = tdev->rects[i].thickness; | |
| 1780 fz_point dir; | |
| 1781 dir.x = to.x - from.x; | |
| 1782 dir.y = to.y - from.y; | |
| 1783 dir = fz_normalize_vector(dir); | |
| 1784 | |
| 1785 check_strikeout(ctx, page->first_block, from, to, dir, thickness); | |
| 1786 } | |
| 1787 } | |
| 1788 | |
| 1789 static void | |
| 1790 fz_stext_close_device(fz_context *ctx, fz_device *dev) | |
| 1791 { | |
| 1792 fz_stext_device *tdev = (fz_stext_device*)dev; | |
| 1793 fz_stext_page *page = tdev->page; | |
| 1794 | |
| 1795 fixup_bboxes_and_bidi(ctx, page->first_block); | |
| 1796 | |
| 1797 if (tdev->opts.flags & FZ_STEXT_COLLECT_STYLES) | |
| 1798 check_rects_for_strikeout(ctx, tdev, page); | |
| 1799 | |
| 1800 /* TODO: smart sorting of blocks and lines in reading order */ | |
| 1801 /* TODO: unicode NFC normalization */ | |
| 1802 | |
| 1803 if (tdev->opts.flags & FZ_STEXT_SEGMENT) | |
| 1804 fz_segment_stext_page(ctx, page); | |
| 1805 | |
| 1806 if (tdev->opts.flags & FZ_STEXT_PARAGRAPH_BREAK) | |
| 1807 fz_paragraph_break(ctx, page); | |
| 1808 | |
| 1809 if (tdev->opts.flags & FZ_STEXT_TABLE_HUNT) | |
| 1810 fz_table_hunt(ctx, page); | |
| 1811 } | |
| 1812 | |
| 1813 static void | |
| 1814 fz_stext_drop_device(fz_context *ctx, fz_device *dev) | |
| 1815 { | |
| 1816 fz_stext_device *tdev = (fz_stext_device*)dev; | |
| 1817 fz_drop_text(ctx, tdev->lasttext); | |
| 1818 fz_drop_font(ctx, tdev->last.font); | |
| 1819 while (tdev->metatext) | |
| 1820 pop_metatext(ctx, tdev); | |
| 1821 | |
| 1822 fz_free(ctx, tdev->rects); | |
| 1823 } | |
| 1824 | |
| 1825 static int | |
| 1826 val_is_rect(const char *val, fz_rect *rp) | |
| 1827 { | |
| 1828 fz_rect r; | |
| 1829 const char *s; | |
| 1830 | |
| 1831 s = strchr(val, ':'); | |
| 1832 if (s == NULL || s == val) | |
| 1833 return 0; | |
| 1834 r.x0 = fz_atof(val); | |
| 1835 val = s+1; | |
| 1836 s = strchr(val, ':'); | |
| 1837 if (s == NULL || s == val) | |
| 1838 return 0; | |
| 1839 r.y0 = fz_atof(val); | |
| 1840 val = s+1; | |
| 1841 s = strchr(val, ':'); | |
| 1842 if (s == NULL || s == val) | |
| 1843 return 0; | |
| 1844 r.x1 = fz_atof(val); | |
| 1845 val = s+1; | |
| 1846 r.y1 = fz_atof(val); | |
| 1847 | |
| 1848 *rp = r; | |
| 1849 | |
| 1850 return 1; | |
| 1851 } | |
| 1852 | |
| 1853 fz_stext_options * | |
| 1854 fz_parse_stext_options(fz_context *ctx, fz_stext_options *opts, const char *string) | |
| 1855 { | |
| 1856 const char *val; | |
| 1857 | |
| 1858 memset(opts, 0, sizeof *opts); | |
| 1859 | |
| 1860 if (fz_has_option(ctx, string, "preserve-ligatures", &val) && fz_option_eq(val, "yes")) | |
| 1861 opts->flags |= FZ_STEXT_PRESERVE_LIGATURES; | |
| 1862 if (fz_has_option(ctx, string, "preserve-whitespace", &val) && fz_option_eq(val, "yes")) | |
| 1863 opts->flags |= FZ_STEXT_PRESERVE_WHITESPACE; | |
| 1864 if (fz_has_option(ctx, string, "preserve-images", &val) && fz_option_eq(val, "yes")) | |
| 1865 opts->flags |= FZ_STEXT_PRESERVE_IMAGES; | |
| 1866 if (fz_has_option(ctx, string, "inhibit-spaces", &val) && fz_option_eq(val, "yes")) | |
| 1867 opts->flags |= FZ_STEXT_INHIBIT_SPACES; | |
| 1868 if (fz_has_option(ctx, string, "dehyphenate", &val) && fz_option_eq(val, "yes")) | |
| 1869 opts->flags |= FZ_STEXT_DEHYPHENATE; | |
| 1870 if (fz_has_option(ctx, string, "preserve-spans", &val) && fz_option_eq(val, "yes")) | |
| 1871 opts->flags |= FZ_STEXT_PRESERVE_SPANS; | |
| 1872 if (fz_has_option(ctx, string, "structured", &val) && fz_option_eq(val, "yes")) | |
| 1873 opts->flags |= FZ_STEXT_COLLECT_STRUCTURE; | |
| 1874 if (fz_has_option(ctx, string, "use-cid-for-unknown-unicode", &val) && fz_option_eq(val, "yes")) | |
| 1875 opts->flags |= FZ_STEXT_USE_CID_FOR_UNKNOWN_UNICODE; | |
| 1876 if (fz_has_option(ctx, string, "use-gid-for-unknown-unicode", &val) && fz_option_eq(val, "yes")) | |
| 1877 opts->flags |= FZ_STEXT_USE_GID_FOR_UNKNOWN_UNICODE; | |
| 1878 if (fz_has_option(ctx, string, "accurate-bboxes", &val) && fz_option_eq(val, "yes")) | |
| 1879 opts->flags |= FZ_STEXT_ACCURATE_BBOXES; | |
| 1880 if (fz_has_option(ctx, string, "vectors", &val) && fz_option_eq(val, "yes")) | |
| 1881 opts->flags |= FZ_STEXT_COLLECT_VECTORS; | |
| 1882 if (fz_has_option(ctx, string, "ignore-actualtext", & val) && fz_option_eq(val, "yes")) | |
| 1883 opts->flags |= FZ_STEXT_IGNORE_ACTUALTEXT; | |
| 1884 if (fz_has_option(ctx, string, "segment", &val) && fz_option_eq(val, "yes")) | |
| 1885 opts->flags |= FZ_STEXT_SEGMENT; | |
| 1886 if (fz_has_option(ctx, string, "paragraph-break", &val) && fz_option_eq(val, "yes")) | |
| 1887 opts->flags |= FZ_STEXT_PARAGRAPH_BREAK; | |
| 1888 if (fz_has_option(ctx, string, "table-hunt", &val) && fz_option_eq(val, "yes")) | |
| 1889 opts->flags |= FZ_STEXT_TABLE_HUNT; | |
| 1890 if (fz_has_option(ctx, string, "collect-styles", &val) && fz_option_eq(val, "yes")) | |
| 1891 opts->flags |= FZ_STEXT_COLLECT_STYLES; | |
| 1892 if (fz_has_option(ctx, string, "accurate-ascenders", &val) && fz_option_eq(val, "yes")) | |
| 1893 opts->flags |= FZ_STEXT_ACCURATE_ASCENDERS; | |
| 1894 if (fz_has_option(ctx, string, "accurate-side-bearings", &val) && fz_option_eq(val, "yes")) | |
| 1895 opts->flags |= FZ_STEXT_ACCURATE_SIDE_BEARINGS; | |
| 1896 | |
| 1897 opts->flags |= FZ_STEXT_CLIP; | |
| 1898 if (fz_has_option(ctx, string, "mediabox-clip", &val)) | |
| 1899 { | |
| 1900 fz_warn(ctx, "The 'mediabox-clip' option has been deprecated. Use 'clip' instead."); | |
| 1901 if (fz_option_eq(val, "no")) | |
| 1902 opts->flags ^= FZ_STEXT_CLIP; | |
| 1903 } | |
| 1904 if (fz_has_option(ctx, string, "clip", &val) && fz_option_eq(val, "no")) | |
| 1905 opts->flags ^= FZ_STEXT_CLIP; | |
| 1906 if (fz_has_option(ctx, string, "clip-rect", &val) && val_is_rect(val, &opts->clip)) | |
| 1907 opts->flags |= FZ_STEXT_CLIP_RECT; | |
| 1908 | |
| 1909 opts->scale = 1; | |
| 1910 if (fz_has_option(ctx, string, "resolution", &val)) | |
| 1911 opts->scale = fz_atof(val) / 96.0f; /* HTML base resolution is 96ppi */ | |
| 1912 | |
| 1913 return opts; | |
| 1914 } | |
| 1915 | |
| 1916 typedef struct | |
| 1917 { | |
| 1918 int fail; | |
| 1919 int count; | |
| 1920 fz_point corners[4]; | |
| 1921 } is_rect_data; | |
| 1922 | |
| 1923 static void | |
| 1924 stash_point(is_rect_data *rd, float x, float y) | |
| 1925 { | |
| 1926 if (rd->count > 3) | |
| 1927 { | |
| 1928 rd->fail = 1; | |
| 1929 return; | |
| 1930 } | |
| 1931 | |
| 1932 rd->corners[rd->count].x = x; | |
| 1933 rd->corners[rd->count].y = y; | |
| 1934 rd->count++; | |
| 1935 } | |
| 1936 | |
| 1937 static void | |
| 1938 is_rect_moveto(fz_context *ctx, void *arg, float x, float y) | |
| 1939 { | |
| 1940 is_rect_data *rd = arg; | |
| 1941 if (rd->fail) | |
| 1942 return; | |
| 1943 | |
| 1944 if (rd->count != 0) | |
| 1945 { | |
| 1946 rd->fail = 1; | |
| 1947 return; | |
| 1948 } | |
| 1949 stash_point(rd, x, y); | |
| 1950 } | |
| 1951 | |
| 1952 static void | |
| 1953 is_rect_lineto(fz_context *ctx, void *arg, float x, float y) | |
| 1954 { | |
| 1955 is_rect_data *rd = arg; | |
| 1956 if (rd->fail) | |
| 1957 return; | |
| 1958 | |
| 1959 if (rd->count == 4 && rd->corners[0].x == x && rd->corners[1].y == y) | |
| 1960 return; | |
| 1961 | |
| 1962 stash_point(rd, x, y); | |
| 1963 } | |
| 1964 | |
| 1965 static void | |
| 1966 is_rect_curveto(fz_context *ctx, void *arg, float x1, float y1, float x2, float y2, float x3, float y3) | |
| 1967 { | |
| 1968 is_rect_data *rd = arg; | |
| 1969 rd->fail = 1; | |
| 1970 } | |
| 1971 | |
| 1972 static void | |
| 1973 is_rect_closepath(fz_context *ctx, void *arg) | |
| 1974 { | |
| 1975 is_rect_data *rd = arg; | |
| 1976 if (rd->fail) | |
| 1977 return; | |
| 1978 if (rd->count == 3) | |
| 1979 stash_point(rd, rd->corners[0].x, rd->corners[0].y); | |
| 1980 if (rd->count != 4) | |
| 1981 rd->fail = 1; | |
| 1982 } | |
| 1983 | |
| 1984 static int | |
| 1985 is_path_rect(fz_context *ctx, const fz_path *path, fz_point *from, fz_point *to, float *thickness, fz_matrix ctm) | |
| 1986 { | |
| 1987 float d01, d01x, d01y, d03, d03x, d03y, d32x, d32y; | |
| 1988 is_rect_data rd = { 0 }; | |
| 1989 static const fz_path_walker walker = | |
| 1990 { | |
| 1991 is_rect_moveto, is_rect_lineto, is_rect_curveto, is_rect_closepath | |
| 1992 }; | |
| 1993 int i; | |
| 1994 | |
| 1995 fz_walk_path(ctx, path, &walker, &rd); | |
| 1996 | |
| 1997 if (rd.fail) | |
| 1998 return 0; | |
| 1999 | |
| 2000 if (rd.count == 2) | |
| 2001 { | |
| 2002 stash_point(&rd, rd.corners[1].x, rd.corners[1].y); | |
| 2003 stash_point(&rd, rd.corners[0].x, rd.corners[0].y); | |
| 2004 } | |
| 2005 | |
| 2006 for (i = 0 ; i < 4; i++) | |
| 2007 { | |
| 2008 fz_point p = fz_transform_point(rd.corners[i], ctm); | |
| 2009 | |
| 2010 rd.corners[i].x = p.x; | |
| 2011 rd.corners[i].y = p.y; | |
| 2012 } | |
| 2013 | |
| 2014 /* So we have a 4 cornered path. Hopefully something like: | |
| 2015 * 0---------1 | |
| 2016 * | | | |
| 2017 * 3---------2 | |
| 2018 * but it might be: | |
| 2019 * 0---------3 | |
| 2020 * | | | |
| 2021 * 1---------2 | |
| 2022 */ | |
| 2023 while (1) | |
| 2024 { | |
| 2025 d01x = rd.corners[1].x - rd.corners[0].x; | |
| 2026 d01y = rd.corners[1].y - rd.corners[0].y; | |
| 2027 d01 = d01x * d01x + d01y * d01y; | |
| 2028 d03x = rd.corners[3].x - rd.corners[0].x; | |
| 2029 d03y = rd.corners[3].y - rd.corners[0].y; | |
| 2030 d03 = d03x * d03x + d03y * d03y; | |
| 2031 if(d01 < d03) | |
| 2032 { | |
| 2033 /* We are the latter case. Transpose it. */ | |
| 2034 fz_point p = rd.corners[1]; | |
| 2035 rd.corners[1] = rd.corners[3]; | |
| 2036 rd.corners[3] = p; | |
| 2037 } | |
| 2038 else | |
| 2039 break; | |
| 2040 } | |
| 2041 d32x = rd.corners[2].x - rd.corners[3].x; | |
| 2042 d32y = rd.corners[2].y - rd.corners[3].y; | |
| 2043 | |
| 2044 /* So d32x and d01x need to be the same for this to be a strikeout. */ | |
| 2045 if (!feq(d32x, d01x) || !feq(d32y, d01y)) | |
| 2046 return 0; | |
| 2047 | |
| 2048 /* We are plausibly a rectangle. */ | |
| 2049 *thickness = sqrtf(d03x * d03x + d03y * d03y); | |
| 2050 | |
| 2051 from->x = (rd.corners[0].x + rd.corners[3].x)/2; | |
| 2052 from->y = (rd.corners[0].y + rd.corners[3].y)/2; | |
| 2053 to->x = (rd.corners[1].x + rd.corners[2].x)/2; | |
| 2054 to->y = (rd.corners[1].y + rd.corners[2].y)/2; | |
| 2055 | |
| 2056 return 1; | |
| 2057 } | |
| 2058 | |
| 2059 static void | |
| 2060 check_for_strikeout(fz_context *ctx, fz_stext_device *tdev, fz_stext_page *page, const fz_path *path, fz_matrix ctm) | |
| 2061 { | |
| 2062 float thickness; | |
| 2063 fz_point from, to; | |
| 2064 | |
| 2065 /* Is this path a thin rectangle (possibly rotated)? If so, then we need to | |
| 2066 * consider it as being a strikeout or underline. */ | |
| 2067 if (!is_path_rect(ctx, path, &from, &to, &thickness, ctm)) | |
| 2068 return; | |
| 2069 | |
| 2070 /* Add to the list of rects in the device. */ | |
| 2071 if (tdev->rect_len == tdev->rect_max) | |
| 2072 { | |
| 2073 int newmax = tdev->rect_max * 2; | |
| 2074 if (newmax == 0) | |
| 2075 newmax = 32; | |
| 2076 | |
| 2077 tdev->rects = fz_realloc(ctx, tdev->rects, sizeof(*tdev->rects) * newmax); | |
| 2078 tdev->rect_max = newmax; | |
| 2079 } | |
| 2080 tdev->rects[tdev->rect_len].from = from; | |
| 2081 tdev->rects[tdev->rect_len].to = to; | |
| 2082 tdev->rects[tdev->rect_len].thickness = thickness; | |
| 2083 tdev->rect_len++; | |
| 2084 } | |
| 2085 | |
| 2086 static void | |
| 2087 add_vector(fz_context *ctx, fz_stext_page *page, fz_rect bbox, uint32_t flags, uint32_t argb) | |
| 2088 { | |
| 2089 fz_stext_block *b = add_block_to_page(ctx, page); | |
| 2090 | |
| 2091 b->type = FZ_STEXT_BLOCK_VECTOR; | |
| 2092 b->bbox = bbox; | |
| 2093 b->u.v.flags = flags; | |
| 2094 b->u.v.argb = argb; | |
| 2095 } | |
| 2096 | |
| 2097 typedef struct | |
| 2098 { | |
| 2099 fz_matrix ctm; | |
| 2100 uint32_t argb; | |
| 2101 uint32_t flags; | |
| 2102 fz_stext_page *page; | |
| 2103 fz_rect leftovers; | |
| 2104 fz_rect pending; | |
| 2105 int count; | |
| 2106 fz_point p[5]; | |
| 2107 } split_path_data; | |
| 2108 | |
| 2109 static void | |
| 2110 maybe_rect(fz_context *ctx, split_path_data *sp) | |
| 2111 { | |
| 2112 int rect = 0; | |
| 2113 int i; | |
| 2114 | |
| 2115 if (sp->count >= 0) | |
| 2116 { | |
| 2117 if (sp->count == 3) | |
| 2118 { | |
| 2119 /* Allow for "moveto A, lineto B, lineto A, close" */ | |
| 2120 if (feq(sp->p[0].x, sp->p[2].x) || feq(sp->p[0].y, sp->p[2].y)) | |
| 2121 sp->count = 2; | |
| 2122 } | |
| 2123 if (sp->count == 2) | |
| 2124 { | |
| 2125 if (feq(sp->p[0].x, sp->p[1].x) || feq(sp->p[0].y, sp->p[1].y)) | |
| 2126 rect = 1; /* Count that as a rect */ | |
| 2127 } | |
| 2128 else if (sp->count == 4 || sp->count == 5) | |
| 2129 { | |
| 2130 if (feq(sp->p[0].x, sp->p[1].x) && feq(sp->p[2].x, sp->p[3].x) && feq(sp->p[0].y, sp->p[3].y) && feq(sp->p[1].y, sp->p[2].y)) | |
| 2131 rect = 1; | |
| 2132 else if (feq(sp->p[0].x, sp->p[3].x) && feq(sp->p[1].x, sp->p[2].x) && feq(sp->p[0].y, sp->p[1].y) && feq(sp->p[2].y, sp->p[3].y)) | |
| 2133 rect = 1; | |
| 2134 } | |
| 2135 if (rect) | |
| 2136 { | |
| 2137 fz_rect bounds; | |
| 2138 | |
| 2139 bounds.x0 = bounds.x1 = sp->p[0].x; | |
| 2140 bounds.y0 = bounds.y1 = sp->p[0].y; | |
| 2141 for (i = 1; i < sp->count; i++) | |
| 2142 bounds = fz_include_point_in_rect(bounds, sp->p[i]); | |
| 2143 if (fz_is_valid_rect(sp->pending)) | |
| 2144 add_vector(ctx, sp->page, sp->pending, sp->flags | FZ_STEXT_VECTOR_IS_RECTANGLE | FZ_STEXT_VECTOR_CONTINUES, sp->argb); | |
| 2145 sp->pending = bounds; | |
| 2146 return; | |
| 2147 } | |
| 2148 | |
| 2149 for (i = 0; i < sp->count; i++) | |
| 2150 sp->leftovers = fz_include_point_in_rect(sp->leftovers, sp->p[i]); | |
| 2151 } | |
| 2152 } | |
| 2153 | |
| 2154 static void | |
| 2155 split_move(fz_context *ctx, void *arg, float x, float y) | |
| 2156 { | |
| 2157 split_path_data *sp = (split_path_data *)arg; | |
| 2158 fz_point p = fz_transform_point_xy(x, y, sp->ctm); | |
| 2159 | |
| 2160 maybe_rect(ctx, sp); | |
| 2161 sp->p[0] = p; | |
| 2162 sp->count = 1; | |
| 2163 } | |
| 2164 | |
| 2165 static void | |
| 2166 split_line(fz_context *ctx, void *arg, float x, float y) | |
| 2167 { | |
| 2168 split_path_data *sp = (split_path_data *)arg; | |
| 2169 fz_point p = fz_transform_point_xy(x, y, sp->ctm); | |
| 2170 int i; | |
| 2171 | |
| 2172 if (sp->count >= 0) | |
| 2173 { | |
| 2174 /* Check for lines to the same point. */ | |
| 2175 if (feq(sp->p[sp->count-1].x, p.x) && feq(sp->p[sp->count-1].y, p.y)) | |
| 2176 return; | |
| 2177 /* If we're still maybe a rect, just record the point. */ | |
| 2178 if (sp->count < 4) | |
| 2179 { | |
| 2180 sp->p[sp->count++] = p; | |
| 2181 return; | |
| 2182 } | |
| 2183 /* Check for close line? */ | |
| 2184 if (sp->count == 4) | |
| 2185 { | |
| 2186 if (feq(sp->p[0].x, p.x) && feq(sp->p[0].y, p.y)) | |
| 2187 { | |
| 2188 /* We've just drawn a line back to the start point. */ | |
| 2189 /* Needless saving of point, but it makes the logic | |
| 2190 * easier elsewhere. */ | |
| 2191 sp->p[sp->count++] = p; | |
| 2192 return; | |
| 2193 } | |
| 2194 } | |
| 2195 /* We can no longer be a rect. Output the points we had saved. */ | |
| 2196 for (i = 0; i < sp->count; i++) | |
| 2197 sp->leftovers = fz_include_point_in_rect(sp->leftovers, sp->p[i]); | |
| 2198 /* Remember we're not a rect. */ | |
| 2199 sp->count = -1; | |
| 2200 } | |
| 2201 /* Roll this point into the non-rect bounds. */ | |
| 2202 sp->leftovers = fz_include_point_in_rect(sp->leftovers, p); | |
| 2203 } | |
| 2204 | |
| 2205 static void | |
| 2206 split_curve(fz_context *ctx, void *arg, float x1, float y1, float x2, float y2, float x3, float y3) | |
| 2207 { | |
| 2208 split_path_data *sp = (split_path_data *)arg; | |
| 2209 fz_point p1 = fz_transform_point_xy(x1, y1, sp->ctm); | |
| 2210 fz_point p2 = fz_transform_point_xy(x2, y2, sp->ctm); | |
| 2211 fz_point p3 = fz_transform_point_xy(x3, y3, sp->ctm); | |
| 2212 int i; | |
| 2213 | |
| 2214 if (sp->count >= 0) | |
| 2215 { | |
| 2216 /* We can no longer be a rect. Output the points we had saved. */ | |
| 2217 for (i = 0; i < sp->count; i++) | |
| 2218 sp->leftovers = fz_include_point_in_rect(sp->leftovers, sp->p[i]); | |
| 2219 /* Remember we're not a rect. */ | |
| 2220 sp->count = -1; | |
| 2221 } | |
| 2222 /* Roll these points into the non-rect bounds. */ | |
| 2223 sp->leftovers = fz_include_point_in_rect(sp->leftovers, p1); | |
| 2224 sp->leftovers = fz_include_point_in_rect(sp->leftovers, p2); | |
| 2225 sp->leftovers = fz_include_point_in_rect(sp->leftovers, p3); | |
| 2226 } | |
| 2227 | |
| 2228 static void | |
| 2229 split_close(fz_context *ctx, void *arg) | |
| 2230 { | |
| 2231 split_path_data *sp = (split_path_data *)arg; | |
| 2232 | |
| 2233 maybe_rect(ctx, sp); | |
| 2234 sp->count = 0; | |
| 2235 } | |
| 2236 | |
| 2237 | |
| 2238 static const | |
| 2239 fz_path_walker split_path_rects = | |
| 2240 { | |
| 2241 split_move, | |
| 2242 split_line, | |
| 2243 split_curve, | |
| 2244 split_close | |
| 2245 }; | |
| 2246 | |
| 2247 static void | |
| 2248 add_vectors_from_path(fz_context *ctx, fz_stext_page *page, const fz_path *path, fz_matrix ctm, fz_colorspace *cs, const float *color, float alpha, fz_color_params cp, int stroke) | |
| 2249 { | |
| 2250 int have_leftovers; | |
| 2251 split_path_data sp; | |
| 2252 | |
| 2253 sp.ctm = ctm; | |
| 2254 sp.argb = hexrgba_from_color(ctx, cs, color, alpha); | |
| 2255 sp.flags = stroke ? FZ_STEXT_VECTOR_IS_STROKED : 0; | |
| 2256 sp.page = page; | |
| 2257 sp.count = 0; | |
| 2258 sp.leftovers = fz_empty_rect; | |
| 2259 sp.pending = fz_empty_rect; | |
| 2260 fz_walk_path(ctx, path, &split_path_rects, &sp); | |
| 2261 | |
| 2262 have_leftovers = fz_is_valid_rect(sp.leftovers); | |
| 2263 | |
| 2264 maybe_rect(ctx, &sp); | |
| 2265 | |
| 2266 if (fz_is_valid_rect(sp.pending)) | |
| 2267 add_vector(ctx, page, sp.pending, sp.flags | FZ_STEXT_VECTOR_IS_RECTANGLE | (have_leftovers ? FZ_STEXT_VECTOR_CONTINUES : 0), sp.argb); | |
| 2268 if (have_leftovers) | |
| 2269 add_vector(ctx, page, sp.leftovers, sp.flags, sp.argb); | |
| 2270 } | |
| 2271 | |
| 2272 static void | |
| 2273 fz_stext_fill_path(fz_context *ctx, fz_device *dev, const fz_path *path, int even_odd, fz_matrix ctm, fz_colorspace *cs, const float *color, float alpha, fz_color_params cp) | |
| 2274 { | |
| 2275 fz_stext_device *tdev = (fz_stext_device*)dev; | |
| 2276 fz_stext_page *page = tdev->page; | |
| 2277 fz_rect path_bounds = fz_bound_path(ctx, path, NULL, ctm); | |
| 2278 fz_rect *bounds = actualtext_bounds(tdev); | |
| 2279 | |
| 2280 /* If we're in an actualtext, then update the bounds to include this content. */ | |
| 2281 if (bounds != NULL) | |
| 2282 *bounds = fz_union_rect(*bounds, path_bounds); | |
| 2283 | |
| 2284 if (tdev->flags & FZ_STEXT_COLLECT_STYLES) | |
| 2285 check_for_strikeout(ctx, tdev, page, path, ctm); | |
| 2286 | |
| 2287 if (tdev->flags & FZ_STEXT_COLLECT_VECTORS) | |
| 2288 add_vectors_from_path(ctx, page, path, ctm, cs, color, alpha, cp, 0); | |
| 2289 } | |
| 2290 | |
| 2291 static void | |
| 2292 fz_stext_stroke_path(fz_context *ctx, fz_device *dev, const fz_path *path, const fz_stroke_state *ss, fz_matrix ctm, fz_colorspace *cs, const float *color, float alpha, fz_color_params cp) | |
| 2293 { | |
| 2294 fz_stext_device *tdev = (fz_stext_device*)dev; | |
| 2295 fz_stext_page *page = tdev->page; | |
| 2296 fz_rect path_bounds = fz_bound_path(ctx, path, ss, ctm); | |
| 2297 fz_rect *bounds = actualtext_bounds((fz_stext_device *)dev); | |
| 2298 | |
| 2299 /* If we're in an actualtext, then update the bounds to include this content. */ | |
| 2300 if (bounds != NULL) | |
| 2301 *bounds = fz_union_rect(*bounds, path_bounds); | |
| 2302 | |
| 2303 if (tdev->flags & FZ_STEXT_COLLECT_STYLES) | |
| 2304 check_for_strikeout(ctx, tdev, page, path, ctm); | |
| 2305 | |
| 2306 if (tdev->flags & FZ_STEXT_COLLECT_VECTORS) | |
| 2307 add_vectors_from_path(ctx, page, path, ctm, cs, color, alpha, cp, 1); | |
| 2308 } | |
| 2309 | |
| 2310 static void | |
| 2311 new_stext_struct(fz_context *ctx, fz_stext_page *page, fz_stext_block *block, fz_structure standard, const char *raw) | |
| 2312 { | |
| 2313 fz_stext_struct *str; | |
| 2314 size_t z; | |
| 2315 | |
| 2316 if (raw == NULL) | |
| 2317 raw = ""; | |
| 2318 z = strlen(raw); | |
| 2319 | |
| 2320 str = fz_pool_alloc(ctx, page->pool, sizeof(*str) + z); | |
| 2321 str->first_block = NULL; | |
| 2322 str->last_block = NULL; | |
| 2323 str->standard = standard; | |
| 2324 str->parent = page->last_struct; | |
| 2325 str->up = block; | |
| 2326 memcpy(str->raw, raw, z+1); | |
| 2327 | |
| 2328 block->u.s.down = str; | |
| 2329 } | |
| 2330 | |
| 2331 static void | |
| 2332 fz_stext_begin_structure(fz_context *ctx, fz_device *dev, fz_structure standard, const char *raw, int idx) | |
| 2333 { | |
| 2334 fz_stext_device *tdev = (fz_stext_device*)dev; | |
| 2335 fz_stext_page *page = tdev->page; | |
| 2336 fz_stext_block *block, *le, *gt, *newblock; | |
| 2337 | |
| 2338 if (raw == NULL) | |
| 2339 raw = ""; | |
| 2340 | |
| 2341 /* Find a pointer to the last block. */ | |
| 2342 if (page->last_block) | |
| 2343 { | |
| 2344 block = page->last_block; | |
| 2345 } | |
| 2346 else if (page->last_struct) | |
| 2347 { | |
| 2348 block = page->last_struct->last_block; | |
| 2349 } | |
| 2350 else | |
| 2351 { | |
| 2352 block = page->first_block; | |
| 2353 } | |
| 2354 | |
| 2355 /* So block is somewhere in the content chain. Let's try and find: | |
| 2356 * le = the struct node <= idx before block in the content chain. | |
| 2357 * ge = the struct node >= idx after block in the content chain. | |
| 2358 * Search backwards to start with. | |
| 2359 */ | |
| 2360 gt = NULL; | |
| 2361 le = block; | |
| 2362 while (le) | |
| 2363 { | |
| 2364 if (le->type == FZ_STEXT_BLOCK_STRUCT) | |
| 2365 { | |
| 2366 if (le->u.s.index > idx) | |
| 2367 gt = le; | |
| 2368 if (le->u.s.index <= idx) | |
| 2369 break; | |
| 2370 } | |
| 2371 le = le->prev; | |
| 2372 } | |
| 2373 /* The following loop copes with finding gt (the smallest block with an index higher | |
| 2374 * than we want) if we haven't found it already. The while loop in here was designed | |
| 2375 * to cope with 'block' being in the middle of a list. In fact, the way the code is | |
| 2376 * currently, block will always be at the end of a list, so the while won't do anything. | |
| 2377 * But I'm loathe to remove it in case we ever change this code to start from wherever | |
| 2378 * we did the last insertion. */ | |
| 2379 if (gt == NULL) | |
| 2380 { | |
| 2381 gt = block; | |
| 2382 while (gt) | |
| 2383 { | |
| 2384 if (gt->type == FZ_STEXT_BLOCK_STRUCT) | |
| 2385 { | |
| 2386 if (gt->u.s.index <= idx) | |
| 2387 le = gt; | |
| 2388 if (gt->u.s.index >= idx) | |
| 2389 break; | |
| 2390 } | |
| 2391 block = gt; | |
| 2392 gt = gt->next; | |
| 2393 } | |
| 2394 } | |
| 2395 | |
| 2396 if (le && le->u.s.index == idx) | |
| 2397 { | |
| 2398 /* We want to move down into the le block. Does it have a struct | |
| 2399 * attached yet? */ | |
| 2400 if (le->u.s.down == NULL) | |
| 2401 { | |
| 2402 /* No. We need to create a new struct node. */ | |
| 2403 new_stext_struct(ctx, page, le, standard, raw); | |
| 2404 } | |
| 2405 else if (le->u.s.down->standard != standard || strcmp(raw, le->u.s.down->raw) != 0) | |
| 2406 { | |
| 2407 /* Yes, but it doesn't match the one we expect! */ | |
| 2408 fz_warn(ctx, "Mismatched structure type!"); | |
| 2409 } | |
| 2410 page->last_struct = le->u.s.down; | |
| 2411 page->last_block = le->u.s.down->last_block; | |
| 2412 | |
| 2413 return; | |
| 2414 } | |
| 2415 | |
| 2416 /* We are going to need to create a new block. Create a complete unlinked one here. */ | |
| 2417 newblock = fz_pool_alloc(ctx, page->pool, sizeof *page->first_block); | |
| 2418 newblock->bbox = fz_empty_rect; | |
| 2419 newblock->prev = NULL; | |
| 2420 newblock->next = NULL; | |
| 2421 newblock->type = FZ_STEXT_BLOCK_STRUCT; | |
| 2422 newblock->u.s.index = idx; | |
| 2423 newblock->u.s.down = NULL; | |
| 2424 /* If this throws, we leak newblock but it's within the pool, so it doesn't matter. */ | |
| 2425 new_stext_struct(ctx, page, newblock, standard, raw); | |
| 2426 | |
| 2427 /* So now we just need to link it in somewhere. */ | |
| 2428 if (gt) | |
| 2429 { | |
| 2430 /* Link it in before gt. */ | |
| 2431 newblock->prev = gt->prev; | |
| 2432 if (gt->prev) | |
| 2433 gt->prev->next = newblock; | |
| 2434 gt->prev = newblock; | |
| 2435 newblock->next = gt; | |
| 2436 } | |
| 2437 else if (block) | |
| 2438 { | |
| 2439 /* Link it in at the end of the list (i.e. after 'block') */ | |
| 2440 newblock->prev = block; | |
| 2441 block->next = newblock; | |
| 2442 } | |
| 2443 else if (page->last_struct) | |
| 2444 { | |
| 2445 /* We have no blocks at all at this level. */ | |
| 2446 page->last_struct->first_block = newblock; | |
| 2447 page->last_struct->last_block = newblock; | |
| 2448 } | |
| 2449 else | |
| 2450 { | |
| 2451 /* We have no blocks at ANY level. */ | |
| 2452 page->first_block = newblock; | |
| 2453 } | |
| 2454 /* Wherever we linked it in, that's where we want to continue adding content. */ | |
| 2455 page->last_struct = newblock->u.s.down; | |
| 2456 page->last_block = NULL; | |
| 2457 } | |
| 2458 | |
| 2459 static void | |
| 2460 fz_stext_end_structure(fz_context *ctx, fz_device *dev) | |
| 2461 { | |
| 2462 fz_stext_device *tdev = (fz_stext_device*)dev; | |
| 2463 fz_stext_page *page = tdev->page; | |
| 2464 fz_stext_struct *str = page->last_struct; | |
| 2465 | |
| 2466 if (str == NULL) | |
| 2467 { | |
| 2468 fz_warn(ctx, "Structure out of sync"); | |
| 2469 return; | |
| 2470 } | |
| 2471 | |
| 2472 page->last_struct = str->parent; | |
| 2473 if (page->last_struct == NULL) | |
| 2474 { | |
| 2475 page->last_block = page->first_block; | |
| 2476 /* Yuck */ | |
| 2477 while (page->last_block->next) | |
| 2478 page->last_block = page->last_block->next; | |
| 2479 } | |
| 2480 else | |
| 2481 { | |
| 2482 page->last_block = page->last_struct->last_block; | |
| 2483 } | |
| 2484 } | |
| 2485 | |
| 2486 fz_device * | |
| 2487 fz_new_stext_device(fz_context *ctx, fz_stext_page *page, const fz_stext_options *opts) | |
| 2488 { | |
| 2489 fz_stext_device *dev = fz_new_derived_device(ctx, fz_stext_device); | |
| 2490 | |
| 2491 dev->super.close_device = fz_stext_close_device; | |
| 2492 dev->super.drop_device = fz_stext_drop_device; | |
| 2493 | |
| 2494 dev->super.fill_text = fz_stext_fill_text; | |
| 2495 dev->super.stroke_text = fz_stext_stroke_text; | |
| 2496 dev->super.clip_text = fz_stext_clip_text; | |
| 2497 dev->super.clip_stroke_text = fz_stext_clip_stroke_text; | |
| 2498 dev->super.ignore_text = fz_stext_ignore_text; | |
| 2499 dev->super.begin_metatext = fz_stext_begin_metatext; | |
| 2500 dev->super.end_metatext = fz_stext_end_metatext; | |
| 2501 | |
| 2502 dev->super.fill_shade = fz_stext_fill_shade; | |
| 2503 dev->super.fill_image = fz_stext_fill_image; | |
| 2504 dev->super.fill_image_mask = fz_stext_fill_image_mask; | |
| 2505 | |
| 2506 if (opts) | |
| 2507 { | |
| 2508 dev->flags = opts->flags; | |
| 2509 if (opts->flags & FZ_STEXT_COLLECT_STRUCTURE) | |
| 2510 { | |
| 2511 dev->super.begin_structure = fz_stext_begin_structure; | |
| 2512 dev->super.end_structure = fz_stext_end_structure; | |
| 2513 } | |
| 2514 if (opts->flags & (FZ_STEXT_COLLECT_VECTORS | FZ_STEXT_COLLECT_STYLES)) | |
| 2515 { | |
| 2516 dev->super.fill_path = fz_stext_fill_path; | |
| 2517 dev->super.stroke_path = fz_stext_stroke_path; | |
| 2518 } | |
| 2519 } | |
| 2520 dev->page = page; | |
| 2521 dev->pen.x = 0; | |
| 2522 dev->pen.y = 0; | |
| 2523 dev->trm = fz_identity; | |
| 2524 dev->lastchar = ' '; | |
| 2525 dev->lasttext = NULL; | |
| 2526 dev->lastbidi = 0; | |
| 2527 dev->last_was_fake_bold = 1; | |
| 2528 if (opts) | |
| 2529 dev->opts = *opts; | |
| 2530 | |
| 2531 if ((dev->flags & FZ_STEXT_PRESERVE_IMAGES) == 0) | |
| 2532 dev->super.hints |= FZ_DONT_DECODE_IMAGES; | |
| 2533 | |
| 2534 dev->rect_max = 0; | |
| 2535 dev->rect_len = 0; | |
| 2536 dev->rects = NULL; | |
| 2537 | |
| 2538 return (fz_device*)dev; | |
| 2539 } |
