Mercurial > hgrepos > Python2 > PyMuPDF
comparison mupdf-source/thirdparty/extract/src/boxer.c @ 2:b50eed0cc0ef upstream
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
The directory name has changed: no version number in the expanded directory now.
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Mon, 15 Sep 2025 11:43:07 +0200 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| 1:1d09e1dec1d9 | 2:b50eed0cc0ef |
|---|---|
| 1 #include <stdio.h> | |
| 2 #include <stdlib.h> | |
| 3 #include <memory.h> | |
| 4 #include <assert.h> | |
| 5 | |
| 6 #include "document.h" | |
| 7 #include "outf.h" | |
| 8 | |
| 9 #define DEBUG_WRITE_AS_PS | |
| 10 /* #define DEBUG_PRINT */ | |
| 11 | |
| 12 typedef struct boxer_s boxer_t; | |
| 13 | |
| 14 typedef struct { | |
| 15 int len; | |
| 16 int max; | |
| 17 rect_t list[1]; | |
| 18 } rectlist_t; | |
| 19 | |
| 20 struct boxer_s { | |
| 21 extract_alloc_t *alloc; | |
| 22 rect_t mediabox; | |
| 23 rectlist_t *list; | |
| 24 }; | |
| 25 | |
| 26 static rectlist_t * | |
| 27 rectlist_create(extract_alloc_t *alloc, int max) | |
| 28 { | |
| 29 rectlist_t *list; | |
| 30 | |
| 31 if (extract_malloc(alloc, &list, sizeof(rectlist_t) + sizeof(rect_t)*(max-1))) | |
| 32 return NULL; | |
| 33 | |
| 34 list->len = 0; | |
| 35 list->max = max; | |
| 36 | |
| 37 return list; | |
| 38 } | |
| 39 | |
| 40 /* Push box onto rectlist, unless it is completely enclosed by | |
| 41 * another box, or completely encloses others (in which case they | |
| 42 * are replaced by it). */ | |
| 43 static void | |
| 44 rectlist_append(rectlist_t *list, rect_t *box) | |
| 45 { | |
| 46 int i; | |
| 47 | |
| 48 for (i = 0; i < list->len; i++) | |
| 49 { | |
| 50 rect_t *r = &list->list[i]; | |
| 51 rect_t smaller, larger; | |
| 52 /* We allow ourselves a fudge factor of 4 points when checking for inclusion. */ | |
| 53 double r_fudge = 4; | |
| 54 | |
| 55 smaller.min.x = r->min.x + r_fudge; | |
| 56 larger. min.x = r->min.x - r_fudge; | |
| 57 smaller.min.y = r->min.y + r_fudge; | |
| 58 larger. min.y = r->min.y - r_fudge; | |
| 59 smaller.max.x = r->max.x - r_fudge; | |
| 60 larger. max.x = r->max.x + r_fudge; | |
| 61 smaller.max.y = r->max.y - r_fudge; | |
| 62 larger. max.y = r->max.y + r_fudge; | |
| 63 | |
| 64 if (extract_rect_contains_rect(larger, *box)) | |
| 65 return; /* box is enclosed! Nothing to do. */ | |
| 66 if (extract_rect_contains_rect(*box, smaller)) | |
| 67 { | |
| 68 /* box encloses r. Ditch r. */ | |
| 69 /* Shorten the list */ | |
| 70 --list->len; | |
| 71 /* If the one that just got chopped off wasn't r, move it down. */ | |
| 72 if (i < list->len) | |
| 73 { | |
| 74 memcpy(r, &list->list[list->len], sizeof(*r)); | |
| 75 i--; /* Reconsider this entry next time. */ | |
| 76 } | |
| 77 } | |
| 78 } | |
| 79 | |
| 80 assert(list->len < list->max); | |
| 81 memcpy(&list->list[list->len], box, sizeof(*box)); | |
| 82 list->len++; | |
| 83 } | |
| 84 | |
| 85 static boxer_t * | |
| 86 boxer_create_length(extract_alloc_t *alloc, rect_t *mediabox, int len) | |
| 87 { | |
| 88 boxer_t *boxer; | |
| 89 | |
| 90 if (extract_malloc(alloc, &boxer, sizeof(*boxer))) | |
| 91 return NULL; | |
| 92 | |
| 93 boxer->alloc = alloc; | |
| 94 memcpy(&boxer->mediabox, mediabox, sizeof(*mediabox)); | |
| 95 boxer->list = rectlist_create(alloc, len); | |
| 96 | |
| 97 return boxer; | |
| 98 } | |
| 99 | |
| 100 /* Create a boxer structure for a page of size mediabox. */ | |
| 101 static boxer_t * | |
| 102 boxer_create(extract_alloc_t *alloc, rect_t *mediabox) | |
| 103 { | |
| 104 boxer_t *boxer = boxer_create_length(alloc, mediabox, 1); | |
| 105 | |
| 106 if (boxer == NULL) | |
| 107 return NULL; | |
| 108 rectlist_append(boxer->list, mediabox); | |
| 109 | |
| 110 return boxer; | |
| 111 } | |
| 112 | |
| 113 static void | |
| 114 push_if_intersect_suitable(rectlist_t *dst, const rect_t *a, const rect_t *b) | |
| 115 { | |
| 116 rect_t c; | |
| 117 | |
| 118 /* Intersect a and b. */ | |
| 119 c = extract_rect_intersect(*a, *b); | |
| 120 /* If no intersection, nothing to push. */ | |
| 121 if (!extract_rect_valid(c)) | |
| 122 return; | |
| 123 | |
| 124 /* If the intersect is too narrow or too tall, ignore it. | |
| 125 * We don't care about inter character spaces, for example. | |
| 126 * Arbitrary 4 point threshold. */ | |
| 127 #define THRESHOLD 4 | |
| 128 if (c.min.x + THRESHOLD >= c.max.x || c.min.y+THRESHOLD >= c.max.y) | |
| 129 return; | |
| 130 | |
| 131 rectlist_append(dst, &c); | |
| 132 } | |
| 133 | |
| 134 static void | |
| 135 boxlist_feed_intersect(rectlist_t *dst, const rectlist_t *src, const rect_t *box) | |
| 136 { | |
| 137 int i; | |
| 138 | |
| 139 for (i = 0; i < src->len; i++) | |
| 140 push_if_intersect_suitable(dst, &src->list[i], box); | |
| 141 } | |
| 142 | |
| 143 /* Mark a given box as being occupied (typically by a glyph) */ | |
| 144 static int boxer_feed(boxer_t *boxer, rect_t *bbox) | |
| 145 { | |
| 146 rect_t box; | |
| 147 /* When we feed a box into a the boxer, we can never make | |
| 148 * the list more than 4 times as long. */ | |
| 149 rectlist_t *newlist = rectlist_create(boxer->alloc, boxer->list->len * 4); | |
| 150 if (newlist == NULL) | |
| 151 return -1; | |
| 152 | |
| 153 #ifdef DEBUG_WRITE_AS_PS | |
| 154 printf("0 0 1 setrgbcolor\n"); | |
| 155 printf("%g %g moveto %g %g lineto %g %g lineto %g %g lineto closepath fill\n", | |
| 156 bbox->min.x, bbox->min.y, | |
| 157 bbox->min.x, bbox->max.y, | |
| 158 bbox->max.x, bbox->max.y, | |
| 159 bbox->max.x, bbox->min.y | |
| 160 ); | |
| 161 #endif | |
| 162 | |
| 163 /* Left (0,0) (min.x,H) */ | |
| 164 box.min.x = boxer->mediabox.min.x; | |
| 165 box.min.y = boxer->mediabox.min.y; | |
| 166 box.max.x = bbox->min.x; | |
| 167 box.max.y = boxer->mediabox.max.y; | |
| 168 boxlist_feed_intersect(newlist, boxer->list, &box); | |
| 169 | |
| 170 /* Right (max.x,0) (W,H) */ | |
| 171 box.min.x = bbox->max.x; | |
| 172 box.min.y = boxer->mediabox.min.y; | |
| 173 box.max.x = boxer->mediabox.max.x; | |
| 174 box.max.y = boxer->mediabox.max.y; | |
| 175 boxlist_feed_intersect(newlist, boxer->list, &box); | |
| 176 | |
| 177 /* Bottom (0,0) (W,min.y) */ | |
| 178 box.min.x = boxer->mediabox.min.x; | |
| 179 box.min.y = boxer->mediabox.min.y; | |
| 180 box.max.x = boxer->mediabox.max.x; | |
| 181 box.max.y = bbox->min.y; | |
| 182 boxlist_feed_intersect(newlist, boxer->list, &box); | |
| 183 | |
| 184 /* Top (0,max.y) (W,H) */ | |
| 185 box.min.x = boxer->mediabox.min.x; | |
| 186 box.min.y = bbox->max.y; | |
| 187 box.max.x = boxer->mediabox.max.x; | |
| 188 box.max.y = boxer->mediabox.max.y; | |
| 189 boxlist_feed_intersect(newlist, boxer->list, &box); | |
| 190 | |
| 191 extract_free(boxer->alloc, &boxer->list); | |
| 192 boxer->list = newlist; | |
| 193 | |
| 194 return 0; | |
| 195 } | |
| 196 | |
| 197 static int | |
| 198 compare_areas(const void *a_, const void *b_) | |
| 199 { | |
| 200 const rect_t *a = (const rect_t *)a_; | |
| 201 const rect_t *b = (const rect_t *)b_; | |
| 202 double area_a = (a->max.x-a->min.x) * (a->max.y-a->min.y); | |
| 203 double area_b = (b->max.x-b->min.x) * (b->max.y-b->min.y); | |
| 204 | |
| 205 if (area_a < area_b) | |
| 206 return 1; | |
| 207 else if (area_a > area_b) | |
| 208 return -1; | |
| 209 else | |
| 210 return 0; | |
| 211 } | |
| 212 | |
| 213 /* Sort the rectangle list to be largest area first. For ease of humans | |
| 214 * reading debug output. */ | |
| 215 static void boxer_sort(boxer_t *boxer) | |
| 216 { | |
| 217 qsort(boxer->list->list, boxer->list->len, sizeof(rect_t), compare_areas); | |
| 218 } | |
| 219 | |
| 220 /* Get the rectangle list for a given boxer. Return value is the length of | |
| 221 * the list. Lifespan is until the boxer is modified or freed. */ | |
| 222 static int boxer_results(boxer_t *boxer, rect_t **list) | |
| 223 { | |
| 224 *list = boxer->list->list; | |
| 225 return boxer->list->len; | |
| 226 } | |
| 227 | |
| 228 /* Destroy a boxer. */ | |
| 229 static void boxer_destroy(boxer_t *boxer) | |
| 230 { | |
| 231 if (!boxer) | |
| 232 return; | |
| 233 | |
| 234 extract_free(boxer->alloc, &boxer->list); | |
| 235 extract_free(boxer->alloc, &boxer); | |
| 236 } | |
| 237 | |
| 238 /* Find the margins for a given boxer. */ | |
| 239 static rect_t boxer_margins(boxer_t *boxer) | |
| 240 { | |
| 241 rectlist_t *list = boxer->list; | |
| 242 int i; | |
| 243 rect_t margins = boxer->mediabox; | |
| 244 | |
| 245 for (i = 0; i < list->len; i++) | |
| 246 { | |
| 247 rect_t *r = &list->list[i]; | |
| 248 if (r->min.x <= margins.min.x && r->min.y <= margins.min.y && r->max.y >= margins.max.y) | |
| 249 margins.min.x = r->max.x; /* Left Margin */ | |
| 250 else if (r->max.x >= margins.max.x && r->min.y <= margins.min.y && r->max.y >= margins.max.y) | |
| 251 margins.max.x = r->min.x; /* Right Margin */ | |
| 252 else if (r->min.x <= margins.min.x && r->max.x >= margins.max.x && r->min.y <= margins.min.y) | |
| 253 margins.min.y = r->max.y; /* Top Margin */ | |
| 254 else if (r->min.x <= margins.min.x && r->max.x >= margins.max.x && r->max.y >= margins.max.y) | |
| 255 margins.max.y = r->min.y; /* Bottom Margin */ | |
| 256 } | |
| 257 | |
| 258 return margins; | |
| 259 } | |
| 260 | |
| 261 /* Create a new boxer from a subset of an old one. */ | |
| 262 static boxer_t *boxer_subset(boxer_t *boxer, rect_t rect) | |
| 263 { | |
| 264 boxer_t *new_boxer = boxer_create_length(boxer->alloc, &rect, boxer->list->len); | |
| 265 int i; | |
| 266 | |
| 267 if (new_boxer == NULL) | |
| 268 return NULL; | |
| 269 | |
| 270 for (i = 0; i < boxer->list->len; i++) | |
| 271 { | |
| 272 rect_t r = extract_rect_intersect(boxer->list->list[i], rect); | |
| 273 | |
| 274 if (!extract_rect_valid(r)) | |
| 275 continue; | |
| 276 rectlist_append(new_boxer->list, &r); | |
| 277 } | |
| 278 | |
| 279 return new_boxer; | |
| 280 } | |
| 281 | |
| 282 /* Consider a boxer for subdivision. | |
| 283 * Returns 0 if no suitable subdivision point found. | |
| 284 * Returns 1, and sets *boxer1 and *boxer2 to new boxer structures for the the subdivisions | |
| 285 * if a subdivision point is found.*/ | |
| 286 static split_type_t | |
| 287 boxer_subdivide(boxer_t *boxer, boxer_t **boxer1, boxer_t **boxer2) | |
| 288 { | |
| 289 rectlist_t *list = boxer->list; | |
| 290 int num_h = 0, num_v = 0; | |
| 291 double max_h = 0, max_v = 0; | |
| 292 rect_t best_h = {0}, best_v = {0}; | |
| 293 int i; | |
| 294 | |
| 295 *boxer1 = NULL; | |
| 296 *boxer2 = NULL; | |
| 297 | |
| 298 for (i = 0; i < list->len; i++) | |
| 299 { | |
| 300 rect_t r = boxer->list->list[i]; | |
| 301 | |
| 302 if (r.min.x <= boxer->mediabox.min.x && r.max.x >= boxer->mediabox.max.x) | |
| 303 { | |
| 304 /* Horizontal divider */ | |
| 305 double size = r.max.y - r.min.y; | |
| 306 if (size > max_h) | |
| 307 { | |
| 308 max_h = size; | |
| 309 best_h = r; | |
| 310 } | |
| 311 num_h++; | |
| 312 } | |
| 313 if (r.min.y <= boxer->mediabox.min.y && r.max.y >= boxer->mediabox.max.y) | |
| 314 { | |
| 315 /* Vertical divider */ | |
| 316 double size = r.max.x - r.min.x; | |
| 317 if (size > max_v) | |
| 318 { | |
| 319 max_v = size; | |
| 320 best_v = r; | |
| 321 } | |
| 322 num_v++; | |
| 323 } | |
| 324 } | |
| 325 | |
| 326 outf("num_h=%d num_v=%d\n", num_h, num_v); | |
| 327 outf("max_h=%g max_v=%g\n", max_h, max_v); | |
| 328 | |
| 329 if (max_h > max_v) | |
| 330 { | |
| 331 rect_t r; | |
| 332 /* Divider runs horizontally. */ | |
| 333 r = boxer->mediabox; | |
| 334 r.max.y = best_h.min.y; | |
| 335 *boxer1 = boxer_subset(boxer, r); | |
| 336 r = boxer->mediabox; | |
| 337 r.min.y = best_h.max.y; | |
| 338 *boxer2 = boxer_subset(boxer, r); | |
| 339 return SPLIT_VERTICAL; | |
| 340 } | |
| 341 else if (max_v > 0) | |
| 342 { | |
| 343 rect_t r; | |
| 344 /* Divider runs vertically. */ | |
| 345 r = boxer->mediabox; | |
| 346 r.max.x = best_v.min.x; | |
| 347 *boxer1 = boxer_subset(boxer, r); | |
| 348 r = boxer->mediabox; | |
| 349 r.min.x = best_v.max.x; | |
| 350 *boxer2 = boxer_subset(boxer, r); | |
| 351 return SPLIT_HORIZONTAL; | |
| 352 } | |
| 353 | |
| 354 return SPLIT_NONE; | |
| 355 } | |
| 356 | |
| 357 | |
| 358 /* Extract specifics */ | |
| 359 static rect_t | |
| 360 extract_span_bbox(span_t *span) | |
| 361 { | |
| 362 int j; | |
| 363 rect_t bbox = extract_rect_empty; | |
| 364 | |
| 365 for (j = 0; j < span->chars_num; j++) | |
| 366 { | |
| 367 char_t *char_ = &span->chars[j]; | |
| 368 bbox = extract_rect_union(bbox, char_->bbox); | |
| 369 } | |
| 370 return bbox; | |
| 371 } | |
| 372 | |
| 373 | |
| 374 static int | |
| 375 extract_subpage_subset(extract_alloc_t *alloc, extract_page_t *page, subpage_t *subpage, rect_t mediabox) | |
| 376 { | |
| 377 content_span_iterator sit; | |
| 378 span_t *span; | |
| 379 subpage_t *target; | |
| 380 | |
| 381 if (extract_subpage_alloc(alloc, mediabox, page, &target)) | |
| 382 return -1; | |
| 383 | |
| 384 for (span = content_span_iterator_init(&sit, &subpage->content); span != NULL; span = content_span_iterator_next(&sit)) | |
| 385 { | |
| 386 rect_t bbox = extract_span_bbox(span); | |
| 387 | |
| 388 if (bbox.min.x >= mediabox.min.x && bbox.min.y >= mediabox.min.y && bbox.max.x <= mediabox.max.x && bbox.max.y <= mediabox.max.y) | |
| 389 { | |
| 390 content_unlink(&span->base); | |
| 391 content_append_span(&target->content, span); | |
| 392 } | |
| 393 } | |
| 394 | |
| 395 return 0; | |
| 396 } | |
| 397 | |
| 398 enum { | |
| 399 MAX_ANALYSIS_DEPTH = 6 | |
| 400 }; | |
| 401 | |
| 402 static int | |
| 403 analyse_sub(extract_page_t *page, subpage_t *subpage, boxer_t *big_boxer, split_t **psplit, int depth) | |
| 404 { | |
| 405 rect_t margins; | |
| 406 boxer_t *boxer; | |
| 407 boxer_t *boxer1; | |
| 408 boxer_t *boxer2; | |
| 409 int ret; | |
| 410 split_type_t split_type; | |
| 411 split_t *split; | |
| 412 | |
| 413 margins = boxer_margins(big_boxer); | |
| 414 #ifdef DEBUG_WRITE_AS_PS | |
| 415 printf("\n\n%% MARGINS %g %g %g %g\n", margins.min.x, margins.min.y, margins.max.x, margins.max.y); | |
| 416 #endif | |
| 417 | |
| 418 boxer = boxer_subset(big_boxer, margins); | |
| 419 | |
| 420 if (depth < MAX_ANALYSIS_DEPTH && | |
| 421 (split_type = boxer_subdivide(boxer, &boxer1, &boxer2)) != SPLIT_NONE) | |
| 422 { | |
| 423 if (boxer1 == NULL || boxer2 == NULL || | |
| 424 extract_split_alloc(boxer->alloc, split_type, 2, psplit)) | |
| 425 { | |
| 426 ret = -1; | |
| 427 goto fail_mid_split; | |
| 428 } | |
| 429 split = *psplit; | |
| 430 outf("depth=%d %s\n", depth, split_type == SPLIT_HORIZONTAL ? "H" : "V"); | |
| 431 ret = analyse_sub(page, subpage, boxer1, &split->split[0], depth+1); | |
| 432 if (!ret) ret = analyse_sub(page, subpage, boxer2, &split->split[1], depth+1); | |
| 433 if (!ret) | |
| 434 { | |
| 435 if (split_type == SPLIT_HORIZONTAL) | |
| 436 { | |
| 437 split->split[0]->weight = boxer1->mediabox.max.x - boxer1->mediabox.min.x; | |
| 438 split->split[1]->weight = boxer2->mediabox.max.x - boxer2->mediabox.min.x; | |
| 439 } | |
| 440 else | |
| 441 { | |
| 442 split->split[0]->weight = boxer1->mediabox.max.y - boxer1->mediabox.min.y; | |
| 443 split->split[1]->weight = boxer2->mediabox.max.y - boxer2->mediabox.min.y; | |
| 444 } | |
| 445 } | |
| 446 fail_mid_split: | |
| 447 boxer_destroy(boxer1); | |
| 448 boxer_destroy(boxer2); | |
| 449 boxer_destroy(boxer); | |
| 450 return ret; | |
| 451 } | |
| 452 | |
| 453 outf("depth=%d LEAF\n", depth); | |
| 454 | |
| 455 if (extract_split_alloc(boxer->alloc, SPLIT_NONE, 0, psplit)) | |
| 456 { | |
| 457 boxer_destroy(boxer); | |
| 458 return -1; | |
| 459 } | |
| 460 split = *psplit; | |
| 461 | |
| 462 ret = extract_subpage_subset(boxer->alloc, page, subpage, boxer->mediabox); | |
| 463 | |
| 464 #ifdef DEBUG_WRITE_AS_PS | |
| 465 { | |
| 466 int i, n; | |
| 467 rect_t *list; | |
| 468 boxer_sort(boxer); | |
| 469 n = boxer_results(boxer, &list); | |
| 470 | |
| 471 printf("%% SUBDIVISION\n"); | |
| 472 for (i = 0; i < n; i++) | |
| 473 { | |
| 474 printf("%% %g %g %g %g\n", | |
| 475 list[i].min.x, list[i].min.y, list[i].max.x, list[i].max.y); | |
| 476 } | |
| 477 | |
| 478 printf("0 0 0 setrgbcolor\n"); | |
| 479 for (i = 0; i < n; i++) { | |
| 480 printf("%g %g moveto\n%g %g lineto\n%g %g lineto\n%g %g lineto\nclosepath\nstroke\n\n", | |
| 481 list[i].min.x, list[i].min.y, | |
| 482 list[i].min.x, list[i].max.y, | |
| 483 list[i].max.x, list[i].max.y, | |
| 484 list[i].max.x, list[i].min.y); | |
| 485 } | |
| 486 | |
| 487 printf("1 0 0 setrgbcolor\n"); | |
| 488 printf("%g %g moveto\n%g %g lineto\n%g %g lineto\n%g %g lineto\nclosepath\nstroke\n\n", | |
| 489 margins.min.x, margins.min.y, | |
| 490 margins.min.x, margins.max.y, | |
| 491 margins.max.x, margins.max.y, | |
| 492 margins.max.x, margins.min.y); | |
| 493 } | |
| 494 #endif | |
| 495 boxer_destroy(boxer); | |
| 496 | |
| 497 return ret; | |
| 498 } | |
| 499 | |
| 500 | |
| 501 static int | |
| 502 collate_splits(extract_alloc_t *alloc, split_t **psplit) | |
| 503 { | |
| 504 split_t *split = *psplit; | |
| 505 int s; | |
| 506 int n = 0; | |
| 507 int i; | |
| 508 int j; | |
| 509 split_t *newsplit; | |
| 510 | |
| 511 /* Recurse into all our children to ensure they are collated. | |
| 512 * Count how many children we'll have once we pull all the | |
| 513 * children of children that match our type up into us. */ | |
| 514 for (s = 0; s < split->count; s++) | |
| 515 { | |
| 516 if (collate_splits(alloc, &split->split[s])) | |
| 517 return -1; | |
| 518 if (split->split[s]->type == split->type) | |
| 519 n += split->split[s]->count; | |
| 520 else | |
| 521 n++; | |
| 522 } | |
| 523 | |
| 524 /* No change in the number of children? Just exit. */ | |
| 525 if (n == split->count) | |
| 526 return 0; | |
| 527 | |
| 528 if (extract_split_alloc(alloc, split->type, n, &newsplit)) | |
| 529 return -1; | |
| 530 | |
| 531 newsplit->weight = split->weight; | |
| 532 | |
| 533 /* Now, run across our children. */ | |
| 534 i = 0; | |
| 535 for (s = 0; s < split->count; s++) | |
| 536 { | |
| 537 split_t *sub = split->split[s]; | |
| 538 if (sub->type == split->type) | |
| 539 { | |
| 540 /* If the type matches, pull the grandchildren into newsplit. */ | |
| 541 for (j = 0; j < sub->count; j++) | |
| 542 { | |
| 543 newsplit->split[i++] = sub->split[j]; | |
| 544 sub->split[j] = NULL; | |
| 545 } | |
| 546 } | |
| 547 else | |
| 548 { | |
| 549 /* Otherwise just move the child into newsplit. */ | |
| 550 newsplit->split[i++] = sub; | |
| 551 split->split[s] = NULL; | |
| 552 } | |
| 553 } | |
| 554 | |
| 555 extract_split_free(alloc, psplit); | |
| 556 *psplit = newsplit; | |
| 557 | |
| 558 return 0; | |
| 559 } | |
| 560 | |
| 561 int extract_page_analyse(extract_alloc_t *alloc, extract_page_t *page) | |
| 562 { | |
| 563 boxer_t *boxer; | |
| 564 subpage_t *subpage = page->subpages[0]; | |
| 565 content_span_iterator sit; | |
| 566 span_t *span; | |
| 567 | |
| 568 /* This code will only work if the page contains a single subpage. | |
| 569 * This should always be the case if we're called from a page | |
| 570 * generated via extract_page_begin. */ | |
| 571 if (page->subpages_num != 1) return 0; | |
| 572 | |
| 573 /* Take the old subpages out from the page. */ | |
| 574 page->subpages_num = 0; | |
| 575 extract_free(alloc, &page->subpages); | |
| 576 | |
| 577 #ifdef DEBUG_WRITE_AS_PS | |
| 578 printf("1 -1 scale 0 -%g translate\n", page->mediabox.max.y-page->mediabox.min.y); | |
| 579 #endif | |
| 580 | |
| 581 boxer = boxer_create(alloc, (rect_t *)&subpage->mediabox); | |
| 582 | |
| 583 for (span = content_span_iterator_init(&sit, &subpage->content); span != NULL; span = content_span_iterator_next(&sit)) | |
| 584 { | |
| 585 rect_t bbox = extract_span_bbox(span); | |
| 586 if (boxer_feed(boxer, &bbox)) | |
| 587 goto fail; | |
| 588 } | |
| 589 | |
| 590 if (analyse_sub(page, subpage, boxer, &page->split, 0)) | |
| 591 goto fail; | |
| 592 | |
| 593 if (collate_splits(boxer->alloc, &page->split)) | |
| 594 goto fail; | |
| 595 | |
| 596 #ifdef DEBUG_WRITE_AS_PS | |
| 597 printf("showpage\n"); | |
| 598 #endif | |
| 599 | |
| 600 boxer_destroy(boxer); | |
| 601 extract_subpage_free(alloc, &subpage); | |
| 602 | |
| 603 return 0; | |
| 604 | |
| 605 fail: | |
| 606 outf("Analysis failed!\n"); | |
| 607 boxer_destroy(boxer); | |
| 608 extract_subpage_free(alloc, &subpage); | |
| 609 | |
| 610 return -1; | |
| 611 } |
