Mercurial > hgrepos > Python2 > PyMuPDF
comparison mupdf-source/source/fitz/stext-boxer.c @ 2:b50eed0cc0ef upstream
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
The directory name has changed: no version number in the expanded directory now.
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Mon, 15 Sep 2025 11:43:07 +0200 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| 1:1d09e1dec1d9 | 2:b50eed0cc0ef |
|---|---|
| 1 // Copyright (C) 2025 Artifex Software, Inc. | |
| 2 // | |
| 3 // This file is part of MuPDF. | |
| 4 // | |
| 5 // MuPDF is free software: you can redistribute it and/or modify it under the | |
| 6 // terms of the GNU Affero General Public License as published by the Free | |
| 7 // Software Foundation, either version 3 of the License, or (at your option) | |
| 8 // any later version. | |
| 9 // | |
| 10 // MuPDF is distributed in the hope that it will be useful, but WITHOUT ANY | |
| 11 // WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS | |
| 12 // FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more | |
| 13 // details. | |
| 14 // | |
| 15 // You should have received a copy of the GNU Affero General Public License | |
| 16 // along with MuPDF. If not, see <https://www.gnu.org/licenses/agpl-3.0.en.html> | |
| 17 // | |
| 18 // Alternative licensing terms are available from the licensor. | |
| 19 // For commercial licensing, see <https://www.artifex.com/> or contact | |
| 20 // Artifex Software, Inc., 39 Mesa Street, Suite 108A, San Francisco, | |
| 21 // CA 94129, USA, for further information. | |
| 22 | |
| 23 #include "mupdf/fitz.h" | |
| 24 | |
| 25 #undef DEBUG_WRITE_AS_PS | |
| 26 #undef DEBUG_STRUCT | |
| 27 | |
| 28 typedef struct boxer_s boxer_t; | |
| 29 | |
| 30 typedef struct { | |
| 31 int len; | |
| 32 int max; | |
| 33 fz_rect list[FZ_FLEXIBLE_ARRAY]; | |
| 34 } rectlist_t; | |
| 35 | |
| 36 struct boxer_s { | |
| 37 fz_rect mediabox; | |
| 38 rectlist_t *list; | |
| 39 }; | |
| 40 | |
| 41 static int fz_rect_contains_rect(fz_rect a, fz_rect b) | |
| 42 { | |
| 43 if (a.x0 > b.x0) | |
| 44 return 0; | |
| 45 if (a.y0 > b.y0) | |
| 46 return 0; | |
| 47 if (a.x1 < b.x1) | |
| 48 return 0; | |
| 49 if (a.y1 < b.y1) | |
| 50 return 0; | |
| 51 | |
| 52 return 1; | |
| 53 } | |
| 54 | |
| 55 static rectlist_t * | |
| 56 rectlist_create(fz_context *ctx, int max) | |
| 57 { | |
| 58 rectlist_t *list = fz_malloc_flexible(ctx, rectlist_t, list, max); | |
| 59 | |
| 60 list->len = 0; | |
| 61 list->max = max; | |
| 62 | |
| 63 return list; | |
| 64 } | |
| 65 | |
| 66 /* Push box onto rectlist, unless it is completely enclosed by | |
| 67 * another box, or completely encloses others (in which case they | |
| 68 * are replaced by it). */ | |
| 69 static void | |
| 70 rectlist_append(rectlist_t *list, fz_rect *box) | |
| 71 { | |
| 72 int i; | |
| 73 | |
| 74 for (i = 0; i < list->len; i++) | |
| 75 { | |
| 76 fz_rect *r = &list->list[i]; | |
| 77 fz_rect smaller, larger; | |
| 78 /* We allow ourselves a fudge factor of 4 points when checking for inclusion. */ | |
| 79 double r_fudge = 4; | |
| 80 | |
| 81 smaller.x0 = r->x0 + r_fudge; | |
| 82 larger. x0 = r->x0 - r_fudge; | |
| 83 smaller.y0 = r->y0 + r_fudge; | |
| 84 larger. y0 = r->y0 - r_fudge; | |
| 85 smaller.x1 = r->x1 - r_fudge; | |
| 86 larger. x1 = r->x1 + r_fudge; | |
| 87 smaller.y1 = r->y1 - r_fudge; | |
| 88 larger. y1 = r->y1 + r_fudge; | |
| 89 | |
| 90 if (fz_rect_contains_rect(larger, *box)) | |
| 91 return; /* box is enclosed! Nothing to do. */ | |
| 92 if (fz_rect_contains_rect(*box, smaller)) | |
| 93 { | |
| 94 /* box encloses r. Ditch r. */ | |
| 95 /* Shorten the list */ | |
| 96 --list->len; | |
| 97 /* If the one that just got chopped off wasn't r, move it down. */ | |
| 98 if (i < list->len) | |
| 99 { | |
| 100 memcpy(r, &list->list[list->len], sizeof(*r)); | |
| 101 i--; /* Reconsider this entry next time. */ | |
| 102 } | |
| 103 } | |
| 104 } | |
| 105 | |
| 106 assert(list->len < list->max); | |
| 107 memcpy(&list->list[list->len], box, sizeof(*box)); | |
| 108 list->len++; | |
| 109 } | |
| 110 | |
| 111 static boxer_t * | |
| 112 boxer_create_length(fz_context *ctx, fz_rect *mediabox, int len) | |
| 113 { | |
| 114 boxer_t *boxer = fz_malloc_struct(ctx, boxer_t); | |
| 115 | |
| 116 if (boxer == NULL) | |
| 117 return NULL; | |
| 118 | |
| 119 memcpy(&boxer->mediabox, mediabox, sizeof(*mediabox)); | |
| 120 boxer->list = rectlist_create(ctx, len); | |
| 121 | |
| 122 return boxer; | |
| 123 } | |
| 124 | |
| 125 /* Create a boxer structure for a page of size mediabox. */ | |
| 126 static boxer_t * | |
| 127 boxer_create(fz_context *ctx, fz_rect *mediabox) | |
| 128 { | |
| 129 boxer_t *boxer = boxer_create_length(ctx, mediabox, 1); | |
| 130 | |
| 131 if (boxer == NULL) | |
| 132 return NULL; | |
| 133 rectlist_append(boxer->list, mediabox); | |
| 134 | |
| 135 return boxer; | |
| 136 } | |
| 137 | |
| 138 static void | |
| 139 push_if_intersect_suitable(rectlist_t *dst, const fz_rect *a, const fz_rect *b) | |
| 140 { | |
| 141 fz_rect c; | |
| 142 | |
| 143 /* Intersect a and b. */ | |
| 144 c = fz_intersect_rect(*a, *b); | |
| 145 /* If no intersection, nothing to push. */ | |
| 146 if (!fz_is_valid_rect(c)) | |
| 147 return; | |
| 148 | |
| 149 rectlist_append(dst, &c); | |
| 150 } | |
| 151 | |
| 152 static void | |
| 153 boxlist_feed_intersect(rectlist_t *dst, const rectlist_t *src, const fz_rect *box) | |
| 154 { | |
| 155 int i; | |
| 156 | |
| 157 for (i = 0; i < src->len; i++) | |
| 158 push_if_intersect_suitable(dst, &src->list[i], box); | |
| 159 } | |
| 160 | |
| 161 /* Mark a given box as being occupied (typically by a glyph) */ | |
| 162 static void boxer_feed(fz_context *ctx, boxer_t *boxer, fz_rect *bbox) | |
| 163 { | |
| 164 fz_rect box; | |
| 165 /* When we feed a box into a the boxer, we can never make | |
| 166 * the list more than 4 times as long. */ | |
| 167 rectlist_t *newlist = rectlist_create(ctx, boxer->list->len * 4); | |
| 168 | |
| 169 #ifdef DEBUG_WRITE_AS_PS | |
| 170 printf("0 0 1 setrgbcolor\n"); | |
| 171 printf("%g %g moveto %g %g lineto %g %g lineto %g %g lineto closepath fill\n", | |
| 172 bbox->x0, bbox->y0, | |
| 173 bbox->x0, bbox->y1, | |
| 174 bbox->x1, bbox->y1, | |
| 175 bbox->x1, bbox->y0 | |
| 176 ); | |
| 177 #endif | |
| 178 | |
| 179 /* Left (0,0) (x0,H) */ | |
| 180 box.x0 = boxer->mediabox.x0; | |
| 181 box.y0 = boxer->mediabox.y0; | |
| 182 box.x1 = bbox->x0; | |
| 183 box.y1 = boxer->mediabox.y1; | |
| 184 boxlist_feed_intersect(newlist, boxer->list, &box); | |
| 185 | |
| 186 /* Right (x1,0) (W,H) */ | |
| 187 box.x0 = bbox->x1; | |
| 188 box.y0 = boxer->mediabox.y0; | |
| 189 box.x1 = boxer->mediabox.x1; | |
| 190 box.y1 = boxer->mediabox.y1; | |
| 191 boxlist_feed_intersect(newlist, boxer->list, &box); | |
| 192 | |
| 193 /* Bottom (0,0) (W,y0) */ | |
| 194 box.x0 = boxer->mediabox.x0; | |
| 195 box.y0 = boxer->mediabox.y0; | |
| 196 box.x1 = boxer->mediabox.x1; | |
| 197 box.y1 = bbox->y0; | |
| 198 boxlist_feed_intersect(newlist, boxer->list, &box); | |
| 199 | |
| 200 /* Top (0,y1) (W,H) */ | |
| 201 box.x0 = boxer->mediabox.x0; | |
| 202 box.y0 = bbox->y1; | |
| 203 box.x1 = boxer->mediabox.x1; | |
| 204 box.y1 = boxer->mediabox.y1; | |
| 205 boxlist_feed_intersect(newlist, boxer->list, &box); | |
| 206 | |
| 207 fz_free(ctx, boxer->list); | |
| 208 boxer->list = newlist; | |
| 209 } | |
| 210 | |
| 211 #ifdef DEBUG_WRITE_AS_PS | |
| 212 static int | |
| 213 compare_areas(const void *a_, const void *b_) | |
| 214 { | |
| 215 const fz_rect *a = (const fz_rect *)a_; | |
| 216 const fz_rect *b = (const fz_rect *)b_; | |
| 217 double area_a = (a->x1-a->x0) * (a->y1-a->y0); | |
| 218 double area_b = (b->x1-b->x0) * (b->y1-b->y0); | |
| 219 | |
| 220 if (area_a < area_b) | |
| 221 return 1; | |
| 222 else if (area_a > area_b) | |
| 223 return -1; | |
| 224 else | |
| 225 return 0; | |
| 226 } | |
| 227 | |
| 228 /* Sort the rectangle list to be largest area first. For ease of humans | |
| 229 * reading debug output. */ | |
| 230 static void boxer_sort(boxer_t *boxer) | |
| 231 { | |
| 232 qsort(boxer->list->list, boxer->list->len, sizeof(fz_rect), compare_areas); | |
| 233 } | |
| 234 | |
| 235 /* Get the rectangle list for a given boxer. Return value is the length of | |
| 236 * the list. Lifespan is until the boxer is modified or freed. */ | |
| 237 static int boxer_results(boxer_t *boxer, fz_rect **list) | |
| 238 { | |
| 239 *list = boxer->list->list; | |
| 240 return boxer->list->len; | |
| 241 } | |
| 242 #endif | |
| 243 | |
| 244 /* Currently unused debugging routine */ | |
| 245 #if 0 | |
| 246 static void | |
| 247 boxer_dump(fz_context *ctx, boxer_t *boxer) | |
| 248 { | |
| 249 int i; | |
| 250 | |
| 251 printf("bbox = %g %g %g %g\n", boxer->mediabox.x0, boxer->mediabox.y0, boxer->mediabox.x1, boxer->mediabox.y1); | |
| 252 for (i = 0; i < boxer->list->len; i++) | |
| 253 { | |
| 254 printf("%d %g %g %g %g\n", i, boxer->list->list[i].x0, boxer->list->list[i].y0, boxer->list->list[i].x1, boxer->list->list[i].y1); | |
| 255 } | |
| 256 } | |
| 257 #endif | |
| 258 | |
| 259 /* Destroy a boxer. */ | |
| 260 static void boxer_destroy(fz_context *ctx, boxer_t *boxer) | |
| 261 { | |
| 262 if (!boxer) | |
| 263 return; | |
| 264 | |
| 265 fz_free(ctx, boxer->list); | |
| 266 fz_free(ctx, boxer); | |
| 267 } | |
| 268 | |
| 269 /* Find the margins for a given boxer. */ | |
| 270 static fz_rect boxer_margins(boxer_t *boxer) | |
| 271 { | |
| 272 rectlist_t *list = boxer->list; | |
| 273 int i; | |
| 274 fz_rect margins = boxer->mediabox; | |
| 275 | |
| 276 for (i = 0; i < list->len; i++) | |
| 277 { | |
| 278 fz_rect *r = &list->list[i]; | |
| 279 if (r->x0 <= margins.x0 && r->y0 <= margins.y0 && r->y1 >= margins.y1) | |
| 280 margins.x0 = r->x1; /* Left Margin */ | |
| 281 else if (r->x1 >= margins.x1 && r->y0 <= margins.y0 && r->y1 >= margins.y1) | |
| 282 margins.x1 = r->x0; /* Right Margin */ | |
| 283 else if (r->x0 <= margins.x0 && r->x1 >= margins.x1 && r->y0 <= margins.y0) | |
| 284 margins.y0 = r->y1; /* Top Margin */ | |
| 285 else if (r->x0 <= margins.x0 && r->x1 >= margins.x1 && r->y1 >= margins.y1) | |
| 286 margins.y1 = r->y0; /* Bottom Margin */ | |
| 287 } | |
| 288 | |
| 289 return margins; | |
| 290 } | |
| 291 | |
| 292 /* Create a new boxer from a subset of an old one. */ | |
| 293 static boxer_t *boxer_subset(fz_context *ctx, boxer_t *boxer, fz_rect rect) | |
| 294 { | |
| 295 boxer_t *new_boxer = boxer_create_length(ctx, &rect, boxer->list->len); | |
| 296 int i; | |
| 297 | |
| 298 if (new_boxer == NULL) | |
| 299 return NULL; | |
| 300 | |
| 301 for (i = 0; i < boxer->list->len; i++) | |
| 302 { | |
| 303 fz_rect r = fz_intersect_rect(boxer->list->list[i], rect); | |
| 304 | |
| 305 if (fz_is_empty_rect(r)) | |
| 306 continue; | |
| 307 rectlist_append(new_boxer->list, &r); | |
| 308 } | |
| 309 | |
| 310 return new_boxer; | |
| 311 } | |
| 312 | |
| 313 static int analyse_sub(fz_context *ctx, fz_stext_page *page, fz_stext_block **first_block, fz_stext_block **last_block, boxer_t *big_boxer, int depth); | |
| 314 | |
| 315 static void | |
| 316 analyse_subset(fz_context *ctx, fz_stext_page *page, fz_stext_block **first_block, fz_stext_block **last_block, boxer_t *boxer, fz_rect r, int depth) | |
| 317 { | |
| 318 boxer_t *sub_box = boxer_subset(ctx, boxer, r); | |
| 319 | |
| 320 fz_try(ctx) | |
| 321 (void)analyse_sub(ctx, page, first_block, last_block, sub_box, depth); | |
| 322 fz_always(ctx) | |
| 323 boxer_destroy(ctx, sub_box); | |
| 324 fz_catch(ctx) | |
| 325 fz_rethrow(ctx); | |
| 326 } | |
| 327 | |
| 328 /* Consider a boxer for subdivision. | |
| 329 * Returns 0 if no suitable subdivision point found. | |
| 330 * Returns 1 if a subdivision point is found.*/ | |
| 331 static int | |
| 332 boxer_subdivide(fz_context *ctx, fz_stext_page *page, fz_stext_block **first_block, fz_stext_block **last_block, boxer_t *boxer, int depth) | |
| 333 { | |
| 334 rectlist_t *list = boxer->list; | |
| 335 double max_h = 0, max_v = 0; | |
| 336 int i; | |
| 337 | |
| 338 for (i = 0; i < list->len; i++) | |
| 339 { | |
| 340 fz_rect r = boxer->list->list[i]; | |
| 341 | |
| 342 if (r.x0 <= boxer->mediabox.x0 && r.x1 >= boxer->mediabox.x1) | |
| 343 { | |
| 344 /* Horizontal divider */ | |
| 345 double size = r.y1 - r.y0; | |
| 346 if (size > max_h) | |
| 347 { | |
| 348 max_h = size; | |
| 349 } | |
| 350 } | |
| 351 if (r.y0 <= boxer->mediabox.y0 && r.y1 >= boxer->mediabox.y1) | |
| 352 { | |
| 353 /* Vertical divider */ | |
| 354 double size = r.x1 - r.x0; | |
| 355 if (size > max_v) | |
| 356 { | |
| 357 max_v = size; | |
| 358 } | |
| 359 } | |
| 360 } | |
| 361 | |
| 362 if (max_h > max_v) | |
| 363 { | |
| 364 fz_rect r; | |
| 365 float min_gap; | |
| 366 float top; | |
| 367 | |
| 368 /* Divider runs horizontally. */ | |
| 369 | |
| 370 /* We want to list out all the horizontal subregions that are separated | |
| 371 * by an appropriate gap, from top to bottom. */ | |
| 372 | |
| 373 /* Any gap larger than the the gap we ignored will do. */ | |
| 374 min_gap = max_v; | |
| 375 | |
| 376 /* We're going to need to run through the data multiple times to find the | |
| 377 * topmost block each time. We'll use 'top' to cull against, and gradually | |
| 378 * lower that after each successful pass. */ | |
| 379 top = boxer->mediabox.y0; | |
| 380 while (1) | |
| 381 { | |
| 382 /* Find the topmost divider below 'top' of height at least min_gap */ | |
| 383 float found_top; | |
| 384 int found = -1; | |
| 385 for (i = 0; i < list->len; i++) | |
| 386 { | |
| 387 fz_rect *b = &list->list[i]; | |
| 388 if (b->x0 <= boxer->mediabox.x0 && boxer->mediabox.x1 <= b->x1 && b->y0 > top && b->y1 - b->y0 >= min_gap) | |
| 389 { | |
| 390 if (found == -1 || b->y0 < found_top) | |
| 391 { | |
| 392 found = i; | |
| 393 found_top = b->y0; | |
| 394 } | |
| 395 } | |
| 396 } | |
| 397 | |
| 398 /* If we failed to find one, we're done. */ | |
| 399 if (found == -1) | |
| 400 break; | |
| 401 | |
| 402 /* So we have a region from top to found_top */ | |
| 403 r = boxer->mediabox; | |
| 404 r.y0 = top; | |
| 405 r.y1 = found_top; | |
| 406 | |
| 407 analyse_subset(ctx, page, first_block, last_block, boxer, r, depth); | |
| 408 | |
| 409 /* Now move top down for the next go. */ | |
| 410 top = list->list[found].y1; | |
| 411 } | |
| 412 | |
| 413 /* One final region, from top to bottom */ | |
| 414 r = boxer->mediabox; | |
| 415 r.y0 = top; | |
| 416 analyse_subset(ctx, page, first_block, last_block, boxer, r, depth); | |
| 417 | |
| 418 return 1; | |
| 419 } | |
| 420 else if (max_v > 0) | |
| 421 { | |
| 422 fz_rect r; | |
| 423 float min_gap; | |
| 424 float left; | |
| 425 | |
| 426 /* Divider runs vertically. */ | |
| 427 | |
| 428 /* We want to list out all the vertical subregions that are separated | |
| 429 * by an appropriate gap, from left to right. */ | |
| 430 | |
| 431 /* Any gap larger than the the gap we ignored will do. */ | |
| 432 min_gap = max_h; | |
| 433 | |
| 434 /* We're going to need to run through the data multiple times to find the | |
| 435 * leftmost block each time. We'll use 'left' to cull against, and gradually | |
| 436 * lower that after each successful pass. */ | |
| 437 left = boxer->mediabox.x0; | |
| 438 while (1) | |
| 439 { | |
| 440 /* Find the leftmost divider to the right of 'left' of width at least min_gap */ | |
| 441 float found_left; | |
| 442 int found = -1; | |
| 443 for (i = 0; i < list->len; i++) | |
| 444 { | |
| 445 fz_rect *b = &list->list[i]; | |
| 446 if (b->y0 <= boxer->mediabox.y0 && boxer->mediabox.y1 <= b->y1 && b->x0 > left && b->x1 - b->x0 >= min_gap) | |
| 447 { | |
| 448 if (found == -1 || b->x0 < found_left) | |
| 449 { | |
| 450 found = i; | |
| 451 found_left = b->x0; | |
| 452 } | |
| 453 } | |
| 454 } | |
| 455 | |
| 456 /* If we failed to find one, we're done. */ | |
| 457 if (found == -1) | |
| 458 break; | |
| 459 | |
| 460 /* So we have a region from top to found_top */ | |
| 461 r = boxer->mediabox; | |
| 462 r.x0 = left; | |
| 463 r.x1 = found_left; | |
| 464 analyse_subset(ctx, page, first_block, last_block, boxer, r, depth); | |
| 465 | |
| 466 /* Now move left right for the next go. */ | |
| 467 left = list->list[found].x1; | |
| 468 } | |
| 469 | |
| 470 /* One final region, from left to right */ | |
| 471 r = boxer->mediabox; | |
| 472 r.x0 = left; | |
| 473 analyse_subset(ctx, page, first_block, last_block, boxer, r, depth); | |
| 474 | |
| 475 return 1; | |
| 476 } | |
| 477 | |
| 478 return 0; | |
| 479 } | |
| 480 | |
| 481 static void | |
| 482 new_stext_struct(fz_context *ctx, fz_stext_page *page, fz_stext_block *block, fz_structure standard, const char *raw) | |
| 483 { | |
| 484 fz_stext_struct *str; | |
| 485 size_t z; | |
| 486 | |
| 487 if (raw == NULL) | |
| 488 raw = ""; | |
| 489 z = strlen(raw); | |
| 490 | |
| 491 str = fz_pool_alloc(ctx, page->pool, offsetof(fz_stext_struct, raw) + z + 1); | |
| 492 str->first_block = NULL; | |
| 493 str->last_block = NULL; | |
| 494 str->standard = standard; | |
| 495 str->parent = page->last_struct; | |
| 496 str->up = block; | |
| 497 memcpy(str->raw, raw, z+1); | |
| 498 | |
| 499 block->u.s.down = str; | |
| 500 } | |
| 501 | |
| 502 #ifdef DEBUG_STRUCT | |
| 503 static void | |
| 504 do_dump_stext(fz_stext_block *block, int depth) | |
| 505 { | |
| 506 int d; | |
| 507 | |
| 508 while (block) | |
| 509 { | |
| 510 for (d = 0; d < depth; d++) | |
| 511 printf(" "); | |
| 512 switch(block->type) | |
| 513 { | |
| 514 case FZ_STEXT_BLOCK_TEXT: | |
| 515 printf("TEXT %p\n", block); | |
| 516 break; | |
| 517 case FZ_STEXT_BLOCK_IMAGE: | |
| 518 printf("IMAGE %p\n", block); | |
| 519 break; | |
| 520 case FZ_STEXT_BLOCK_VECTOR: | |
| 521 printf("VECTOR %p\n", block); | |
| 522 break; | |
| 523 case FZ_STEXT_BLOCK_STRUCT: | |
| 524 printf("STRUCT %p\n", block); | |
| 525 do_dump_stext(block->u.s.down->first_block, depth+1); | |
| 526 break; | |
| 527 } | |
| 528 block = block->next; | |
| 529 } | |
| 530 } | |
| 531 | |
| 532 static void | |
| 533 dump_stext(char *str, fz_stext_block *block) | |
| 534 { | |
| 535 printf("%s\n", str); | |
| 536 | |
| 537 do_dump_stext(block, 0); | |
| 538 } | |
| 539 #endif | |
| 540 | |
| 541 static void | |
| 542 recalc_bbox(fz_stext_block *block) | |
| 543 { | |
| 544 fz_rect bbox = fz_empty_rect; | |
| 545 fz_stext_line *line; | |
| 546 | |
| 547 for (line = block->u.t.first_line; line != NULL; line = line->next) | |
| 548 bbox = fz_union_rect(bbox, line->bbox); | |
| 549 | |
| 550 block->bbox = bbox; | |
| 551 } | |
| 552 | |
| 553 static fz_stext_struct * | |
| 554 page_subset(fz_context *ctx, fz_stext_page *page, fz_stext_block **first_block, fz_stext_block **last_block, fz_rect mediabox) | |
| 555 { | |
| 556 fz_stext_block *block, *next_block; | |
| 557 fz_stext_block *target = NULL; | |
| 558 fz_stext_block *last = NULL; | |
| 559 fz_stext_block *newblock; | |
| 560 int idx = 0; | |
| 561 | |
| 562 #ifdef DEBUG_STRUCT | |
| 563 dump_stext("BEFORE", *first_block); | |
| 564 #endif | |
| 565 | |
| 566 for (block = *first_block; block != NULL; block = next_block) | |
| 567 { | |
| 568 fz_rect bbox; | |
| 569 | |
| 570 next_block = block->next; | |
| 571 if (block->type != FZ_STEXT_BLOCK_TEXT && block->type != FZ_STEXT_BLOCK_VECTOR) | |
| 572 continue; | |
| 573 | |
| 574 bbox = block->bbox; | |
| 575 | |
| 576 /* Can we take the whole block? */ | |
| 577 if (bbox.x0 >= mediabox.x0 && bbox.y0 >= mediabox.y0 && bbox.x1 <= mediabox.x1 && bbox.y1 <= mediabox.y1) | |
| 578 { | |
| 579 /* Unlink block from the current list. */ | |
| 580 if (block->prev) | |
| 581 block->prev->next = next_block; | |
| 582 else | |
| 583 *first_block = next_block; | |
| 584 if (next_block) | |
| 585 next_block->prev = block->prev; | |
| 586 else | |
| 587 *last_block = block->prev; | |
| 588 | |
| 589 /* Add block onto our target list */ | |
| 590 if (target == NULL) | |
| 591 { | |
| 592 target = block; | |
| 593 block->prev = NULL; | |
| 594 } | |
| 595 else | |
| 596 { | |
| 597 last->next = block; | |
| 598 block->prev = last; | |
| 599 } | |
| 600 last = block; | |
| 601 block->next = NULL; | |
| 602 } | |
| 603 else if (block->type == FZ_STEXT_BLOCK_TEXT && !fz_is_empty_rect(fz_intersect_rect(bbox, mediabox))) | |
| 604 { | |
| 605 /* Need to look at the parts. */ | |
| 606 fz_stext_line *line, *next_line; | |
| 607 | |
| 608 newblock = NULL; | |
| 609 for (line = block->u.t.first_line; line != NULL; line = next_line) | |
| 610 { | |
| 611 next_line = line->next; | |
| 612 if (line->bbox.x0 >= mediabox.x0 && line->bbox.y0 >= mediabox.y0 && line->bbox.x1 <= mediabox.x1 && line->bbox.y1 <= mediabox.y1) | |
| 613 { | |
| 614 /* We need to take this line */ | |
| 615 if (newblock == NULL) | |
| 616 { | |
| 617 newblock = fz_pool_alloc(ctx, page->pool, sizeof(fz_stext_block)); | |
| 618 | |
| 619 /* Add the block onto our target list */ | |
| 620 if (target == NULL) | |
| 621 { | |
| 622 target = newblock; | |
| 623 } | |
| 624 else | |
| 625 { | |
| 626 last->next = newblock; | |
| 627 newblock->prev = last; | |
| 628 } | |
| 629 last = newblock; | |
| 630 } | |
| 631 | |
| 632 /* Unlink line from the current list. */ | |
| 633 if (line->prev) | |
| 634 line->prev->next = next_line; | |
| 635 else | |
| 636 block->u.t.first_line = next_line; | |
| 637 if (next_line) | |
| 638 next_line->prev = line->prev; | |
| 639 else | |
| 640 block->u.t.last_line = line->prev; | |
| 641 | |
| 642 /* Add line onto our new block */ | |
| 643 if (newblock->u.t.last_line == NULL) | |
| 644 { | |
| 645 newblock->u.t.first_line = newblock->u.t.last_line = line; | |
| 646 line->prev = NULL; | |
| 647 } | |
| 648 else | |
| 649 { | |
| 650 line->prev = newblock->u.t.last_line; | |
| 651 newblock->u.t.last_line->next = line; | |
| 652 newblock->u.t.last_line = line; | |
| 653 } | |
| 654 line->next = NULL; | |
| 655 } | |
| 656 } | |
| 657 if (newblock) | |
| 658 { | |
| 659 recalc_bbox(block); | |
| 660 recalc_bbox(newblock); | |
| 661 } | |
| 662 } | |
| 663 } | |
| 664 | |
| 665 /* If no content to add, bale! */ | |
| 666 if (target == NULL) | |
| 667 return NULL; | |
| 668 | |
| 669 /* We want to insert a structure node that contains target as the last structure | |
| 670 * node on this blocklist. Find the first block that's not a structure block. */ | |
| 671 for (block = *first_block; block != NULL; block = block->next) | |
| 672 { | |
| 673 if (block->type != FZ_STEXT_BLOCK_STRUCT) | |
| 674 break; | |
| 675 idx++; | |
| 676 } | |
| 677 | |
| 678 /* So we want to insert just before block. */ | |
| 679 | |
| 680 /* We are going to need to create a new block. Create a complete unlinked one here. */ | |
| 681 newblock = fz_pool_alloc(ctx, page->pool, sizeof *newblock); | |
| 682 newblock->bbox = fz_empty_rect; | |
| 683 newblock->prev = block ? block->prev : *last_block; | |
| 684 newblock->next = block; | |
| 685 newblock->type = FZ_STEXT_BLOCK_STRUCT; | |
| 686 newblock->u.s.index = idx; | |
| 687 newblock->u.s.down = NULL; | |
| 688 /* If this throws, we leak newblock but it's within the pool, so it doesn't matter. */ | |
| 689 /* And create a new struct and have newblock point to it. */ | |
| 690 new_stext_struct(ctx, page, newblock, FZ_STRUCTURE_DIV, "Split"); | |
| 691 | |
| 692 /* Now insert newblock just before block */ | |
| 693 /* If block was first, now we are. */ | |
| 694 if (*first_block == block) | |
| 695 *first_block = newblock; | |
| 696 if (block == NULL) | |
| 697 { | |
| 698 /* Inserting at the end! */ | |
| 699 if (*last_block) | |
| 700 (*last_block)->next = newblock; | |
| 701 *last_block = newblock; | |
| 702 } | |
| 703 else | |
| 704 { | |
| 705 if (block->prev) | |
| 706 block->prev->next = newblock; | |
| 707 block->prev = newblock; | |
| 708 } | |
| 709 | |
| 710 newblock->u.s.down->first_block = target; | |
| 711 target->prev = NULL; | |
| 712 | |
| 713 for (block = target; block->next != NULL; block = block->next) | |
| 714 newblock->bbox = fz_union_rect(newblock->bbox, block->bbox); | |
| 715 newblock->bbox = fz_union_rect(newblock->bbox, block->bbox); | |
| 716 newblock->u.s.down->last_block = block; | |
| 717 | |
| 718 #ifdef DEBUG_STRUCT | |
| 719 dump_stext("AFTER", *first_block); | |
| 720 #endif | |
| 721 | |
| 722 return newblock->u.s.down; | |
| 723 } | |
| 724 | |
| 725 enum { | |
| 726 MAX_ANALYSIS_DEPTH = 6 | |
| 727 }; | |
| 728 | |
| 729 static int | |
| 730 analyse_sub(fz_context *ctx, fz_stext_page *page, fz_stext_block **first_block, fz_stext_block **last_block, boxer_t *big_boxer, int depth) | |
| 731 { | |
| 732 fz_rect margins; | |
| 733 boxer_t *boxer; | |
| 734 boxer_t *boxer1 = NULL; | |
| 735 boxer_t *boxer2 = NULL; | |
| 736 fz_stext_struct *div; | |
| 737 int ret = 0; | |
| 738 | |
| 739 /* Find the margins in the enclosing boxer. This returns | |
| 740 * a subset of the bbox of the original. */ | |
| 741 margins = boxer_margins(big_boxer); | |
| 742 #ifdef DEBUG_WRITE_AS_PS | |
| 743 printf("\n\n%% MARGINS %g %g %g %g\n", margins.x0, margins.y0, margins.x1, margins.y1); | |
| 744 #endif | |
| 745 | |
| 746 /* Now subset the rectangles just to include those that are in our bbox. */ | |
| 747 boxer = boxer_subset(ctx, big_boxer, margins); | |
| 748 | |
| 749 fz_var(boxer1); | |
| 750 fz_var(boxer2); | |
| 751 | |
| 752 fz_try(ctx) | |
| 753 { | |
| 754 div = page_subset(ctx, page, first_block, last_block, boxer->mediabox); | |
| 755 /* If nothing subsetted (no textual content in that region), give up. */ | |
| 756 if (div == NULL) | |
| 757 break; | |
| 758 | |
| 759 ret = 1; | |
| 760 | |
| 761 if (depth < MAX_ANALYSIS_DEPTH) | |
| 762 { | |
| 763 /* Can we subdivide that region any more? */ | |
| 764 if (boxer_subdivide(ctx, page, &div->first_block, &div->last_block, boxer, depth+1)) | |
| 765 break; | |
| 766 } | |
| 767 | |
| 768 #ifdef DEBUG_WRITE_AS_PS | |
| 769 { | |
| 770 int i, n; | |
| 771 fz_rect *list; | |
| 772 boxer_sort(boxer); | |
| 773 n = boxer_results(boxer, &list); | |
| 774 | |
| 775 printf("%% SUBDIVISION\n"); | |
| 776 for (i = 0; i < n; i++) | |
| 777 { | |
| 778 printf("%% %g %g %g %g\n", | |
| 779 list[i].x0, list[i].y0, list[i].x1, list[i].y1); | |
| 780 } | |
| 781 | |
| 782 printf("0 0 0 setrgbcolor\n"); | |
| 783 for (i = 0; i < n; i++) { | |
| 784 printf("%g %g moveto\n%g %g lineto\n%g %g lineto\n%g %g lineto\nclosepath\nstroke\n\n", | |
| 785 list[i].x0, list[i].y0, | |
| 786 list[i].x0, list[i].y1, | |
| 787 list[i].x1, list[i].y1, | |
| 788 list[i].x1, list[i].y0); | |
| 789 } | |
| 790 | |
| 791 printf("1 0 0 setrgbcolor\n"); | |
| 792 printf("%g %g moveto\n%g %g lineto\n%g %g lineto\n%g %g lineto\nclosepath\nstroke\n\n", | |
| 793 margins.x0, margins.y0, | |
| 794 margins.x0, margins.y1, | |
| 795 margins.x1, margins.y1, | |
| 796 margins.x1, margins.y0); | |
| 797 } | |
| 798 #endif | |
| 799 } | |
| 800 fz_always(ctx) | |
| 801 { | |
| 802 boxer_destroy(ctx, boxer1); | |
| 803 boxer_destroy(ctx, boxer2); | |
| 804 boxer_destroy(ctx, boxer); | |
| 805 } | |
| 806 fz_catch(ctx) | |
| 807 fz_rethrow(ctx); | |
| 808 | |
| 809 return ret; | |
| 810 } | |
| 811 | |
| 812 static int | |
| 813 line_isnt_all_spaces(fz_context *ctx, fz_stext_line *line) | |
| 814 { | |
| 815 fz_stext_char *ch; | |
| 816 for (ch = line->first_char; ch != NULL; ch = ch->next) | |
| 817 if (ch->c != 32 && ch->c != 160) | |
| 818 return 1; | |
| 819 return 0; | |
| 820 } | |
| 821 | |
| 822 static void | |
| 823 feed_line(fz_context *ctx, boxer_t *boxer, fz_stext_line *line) | |
| 824 { | |
| 825 fz_stext_char *ch; | |
| 826 | |
| 827 for (ch = line->first_char; ch != NULL; ch = ch->next) | |
| 828 { | |
| 829 fz_rect r = fz_empty_rect; | |
| 830 | |
| 831 if (ch->c == ' ') | |
| 832 continue; | |
| 833 | |
| 834 do | |
| 835 { | |
| 836 fz_rect bbox = fz_rect_from_quad(ch->quad); | |
| 837 float margin = ch->size/2; | |
| 838 bbox.x0 -= margin; | |
| 839 bbox.y0 -= margin; | |
| 840 bbox.x1 += margin; | |
| 841 bbox.y1 += margin; | |
| 842 r = fz_union_rect(r, bbox); | |
| 843 ch = ch->next; | |
| 844 } | |
| 845 while (ch != NULL && ch->c != ' '); | |
| 846 boxer_feed(ctx, boxer, &r); | |
| 847 if (ch == NULL) | |
| 848 break; | |
| 849 } | |
| 850 } | |
| 851 | |
| 852 /* Internal, non-API function, shared with stext-table. */ | |
| 853 fz_rect | |
| 854 fz_collate_small_vector_run(fz_stext_block **blockp) | |
| 855 { | |
| 856 fz_stext_block *block = *blockp; | |
| 857 fz_stext_block *next; | |
| 858 fz_rect r = block->bbox; | |
| 859 int MAX_SIZE = 2; | |
| 860 int MAX_GAP = 2; | |
| 861 | |
| 862 float w = r.x1 - r.x0; | |
| 863 float h = r.y1 - r.y0; | |
| 864 | |
| 865 if (w < MAX_SIZE) | |
| 866 { | |
| 867 while ((next = block->next) != NULL && | |
| 868 next->type == FZ_STEXT_BLOCK_VECTOR && | |
| 869 next->bbox.x0 == r.x0 && | |
| 870 next->bbox.x1 == r.x1 && | |
| 871 ((next->bbox.y1 > r.y1 && next->bbox.y0 <= r.y1 + MAX_GAP) || | |
| 872 (next->bbox.y0 < r.y0 && next->bbox.y1 >= r.y0 - MAX_GAP))) | |
| 873 { | |
| 874 r = fz_union_rect(r, next->bbox); | |
| 875 block = next; | |
| 876 } | |
| 877 } | |
| 878 if (h < MAX_SIZE) | |
| 879 { | |
| 880 while ((next = block->next) != NULL && | |
| 881 next->type == FZ_STEXT_BLOCK_VECTOR && | |
| 882 next->bbox.y0 == r.y0 && | |
| 883 next->bbox.y1 == r.y1 && | |
| 884 ((next->bbox.x1 > r.x1 && next->bbox.x0 <= r.x1 + MAX_GAP) || | |
| 885 (next->bbox.x0 < r.x0 && next->bbox.x1 >= r.x0 - MAX_GAP))) | |
| 886 { | |
| 887 r = fz_union_rect(r, next->bbox); | |
| 888 block = next; | |
| 889 } | |
| 890 } | |
| 891 | |
| 892 *blockp = block; | |
| 893 | |
| 894 return r; | |
| 895 } | |
| 896 | |
| 897 int fz_segment_stext_page(fz_context *ctx, fz_stext_page *page) | |
| 898 { | |
| 899 boxer_t *boxer; | |
| 900 fz_stext_block *block; | |
| 901 int ret = 0; | |
| 902 | |
| 903 /* If we have structure already, give up. We can't hope to beat | |
| 904 * proper structure! */ | |
| 905 for (block = page->first_block; block != NULL; block = block->next) | |
| 906 if (block->type == FZ_STEXT_BLOCK_STRUCT) | |
| 907 return 0; | |
| 908 | |
| 909 #ifdef DEBUG_WRITE_AS_PS | |
| 910 printf("1 -1 scale 0 -%g translate\n", page->mediabox.y1-page->mediabox.y0); | |
| 911 #endif | |
| 912 | |
| 913 boxer = boxer_create(ctx, &page->mediabox); | |
| 914 | |
| 915 fz_try(ctx) | |
| 916 { | |
| 917 /* Just walking the blocks is safe as we're assuming no structure here. */ | |
| 918 for (block = page->first_block; block != NULL; block = block->next) | |
| 919 { | |
| 920 fz_stext_line *line; | |
| 921 switch (block->type) | |
| 922 { | |
| 923 case FZ_STEXT_BLOCK_TEXT: | |
| 924 for (line = block->u.t.first_line; line != NULL; line = line->next) | |
| 925 if (line_isnt_all_spaces(ctx, line)) | |
| 926 feed_line(ctx, boxer, line); | |
| 927 break; | |
| 928 case FZ_STEXT_BLOCK_VECTOR: | |
| 929 { | |
| 930 /* Allow a 1 point margin around vectors to avoid hairline | |
| 931 * cracks between supposedly abutting things. */ | |
| 932 int VECTOR_MARGIN = 1; | |
| 933 fz_rect r = fz_collate_small_vector_run(&block); | |
| 934 | |
| 935 r.x0 -= VECTOR_MARGIN; | |
| 936 r.y0 -= VECTOR_MARGIN; | |
| 937 r.x1 += VECTOR_MARGIN; | |
| 938 r.y1 += VECTOR_MARGIN; | |
| 939 boxer_feed(ctx, boxer, &r); | |
| 940 } | |
| 941 } | |
| 942 } | |
| 943 | |
| 944 ret = analyse_sub(ctx, page, &page->first_block, &page->last_block, boxer, 0); | |
| 945 } | |
| 946 fz_always(ctx) | |
| 947 boxer_destroy(ctx, boxer); | |
| 948 fz_catch(ctx) | |
| 949 fz_rethrow(ctx); | |
| 950 | |
| 951 #ifdef DEBUG_WRITE_AS_PS | |
| 952 printf("showpage\n"); | |
| 953 #endif | |
| 954 | |
| 955 return ret; | |
| 956 } |
