comparison mupdf-source/source/fitz/stext-boxer.c @ 2:b50eed0cc0ef upstream

ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4. The directory name has changed: no version number in the expanded directory now.
author Franz Glasner <fzglas.hg@dom66.de>
date Mon, 15 Sep 2025 11:43:07 +0200
parents
children
comparison
equal deleted inserted replaced
1:1d09e1dec1d9 2:b50eed0cc0ef
1 // Copyright (C) 2025 Artifex Software, Inc.
2 //
3 // This file is part of MuPDF.
4 //
5 // MuPDF is free software: you can redistribute it and/or modify it under the
6 // terms of the GNU Affero General Public License as published by the Free
7 // Software Foundation, either version 3 of the License, or (at your option)
8 // any later version.
9 //
10 // MuPDF is distributed in the hope that it will be useful, but WITHOUT ANY
11 // WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
12 // FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
13 // details.
14 //
15 // You should have received a copy of the GNU Affero General Public License
16 // along with MuPDF. If not, see <https://www.gnu.org/licenses/agpl-3.0.en.html>
17 //
18 // Alternative licensing terms are available from the licensor.
19 // For commercial licensing, see <https://www.artifex.com/> or contact
20 // Artifex Software, Inc., 39 Mesa Street, Suite 108A, San Francisco,
21 // CA 94129, USA, for further information.
22
23 #include "mupdf/fitz.h"
24
25 #undef DEBUG_WRITE_AS_PS
26 #undef DEBUG_STRUCT
27
28 typedef struct boxer_s boxer_t;
29
30 typedef struct {
31 int len;
32 int max;
33 fz_rect list[FZ_FLEXIBLE_ARRAY];
34 } rectlist_t;
35
36 struct boxer_s {
37 fz_rect mediabox;
38 rectlist_t *list;
39 };
40
41 static int fz_rect_contains_rect(fz_rect a, fz_rect b)
42 {
43 if (a.x0 > b.x0)
44 return 0;
45 if (a.y0 > b.y0)
46 return 0;
47 if (a.x1 < b.x1)
48 return 0;
49 if (a.y1 < b.y1)
50 return 0;
51
52 return 1;
53 }
54
55 static rectlist_t *
56 rectlist_create(fz_context *ctx, int max)
57 {
58 rectlist_t *list = fz_malloc_flexible(ctx, rectlist_t, list, max);
59
60 list->len = 0;
61 list->max = max;
62
63 return list;
64 }
65
66 /* Push box onto rectlist, unless it is completely enclosed by
67 * another box, or completely encloses others (in which case they
68 * are replaced by it). */
69 static void
70 rectlist_append(rectlist_t *list, fz_rect *box)
71 {
72 int i;
73
74 for (i = 0; i < list->len; i++)
75 {
76 fz_rect *r = &list->list[i];
77 fz_rect smaller, larger;
78 /* We allow ourselves a fudge factor of 4 points when checking for inclusion. */
79 double r_fudge = 4;
80
81 smaller.x0 = r->x0 + r_fudge;
82 larger. x0 = r->x0 - r_fudge;
83 smaller.y0 = r->y0 + r_fudge;
84 larger. y0 = r->y0 - r_fudge;
85 smaller.x1 = r->x1 - r_fudge;
86 larger. x1 = r->x1 + r_fudge;
87 smaller.y1 = r->y1 - r_fudge;
88 larger. y1 = r->y1 + r_fudge;
89
90 if (fz_rect_contains_rect(larger, *box))
91 return; /* box is enclosed! Nothing to do. */
92 if (fz_rect_contains_rect(*box, smaller))
93 {
94 /* box encloses r. Ditch r. */
95 /* Shorten the list */
96 --list->len;
97 /* If the one that just got chopped off wasn't r, move it down. */
98 if (i < list->len)
99 {
100 memcpy(r, &list->list[list->len], sizeof(*r));
101 i--; /* Reconsider this entry next time. */
102 }
103 }
104 }
105
106 assert(list->len < list->max);
107 memcpy(&list->list[list->len], box, sizeof(*box));
108 list->len++;
109 }
110
111 static boxer_t *
112 boxer_create_length(fz_context *ctx, fz_rect *mediabox, int len)
113 {
114 boxer_t *boxer = fz_malloc_struct(ctx, boxer_t);
115
116 if (boxer == NULL)
117 return NULL;
118
119 memcpy(&boxer->mediabox, mediabox, sizeof(*mediabox));
120 boxer->list = rectlist_create(ctx, len);
121
122 return boxer;
123 }
124
125 /* Create a boxer structure for a page of size mediabox. */
126 static boxer_t *
127 boxer_create(fz_context *ctx, fz_rect *mediabox)
128 {
129 boxer_t *boxer = boxer_create_length(ctx, mediabox, 1);
130
131 if (boxer == NULL)
132 return NULL;
133 rectlist_append(boxer->list, mediabox);
134
135 return boxer;
136 }
137
138 static void
139 push_if_intersect_suitable(rectlist_t *dst, const fz_rect *a, const fz_rect *b)
140 {
141 fz_rect c;
142
143 /* Intersect a and b. */
144 c = fz_intersect_rect(*a, *b);
145 /* If no intersection, nothing to push. */
146 if (!fz_is_valid_rect(c))
147 return;
148
149 rectlist_append(dst, &c);
150 }
151
152 static void
153 boxlist_feed_intersect(rectlist_t *dst, const rectlist_t *src, const fz_rect *box)
154 {
155 int i;
156
157 for (i = 0; i < src->len; i++)
158 push_if_intersect_suitable(dst, &src->list[i], box);
159 }
160
161 /* Mark a given box as being occupied (typically by a glyph) */
162 static void boxer_feed(fz_context *ctx, boxer_t *boxer, fz_rect *bbox)
163 {
164 fz_rect box;
165 /* When we feed a box into a the boxer, we can never make
166 * the list more than 4 times as long. */
167 rectlist_t *newlist = rectlist_create(ctx, boxer->list->len * 4);
168
169 #ifdef DEBUG_WRITE_AS_PS
170 printf("0 0 1 setrgbcolor\n");
171 printf("%g %g moveto %g %g lineto %g %g lineto %g %g lineto closepath fill\n",
172 bbox->x0, bbox->y0,
173 bbox->x0, bbox->y1,
174 bbox->x1, bbox->y1,
175 bbox->x1, bbox->y0
176 );
177 #endif
178
179 /* Left (0,0) (x0,H) */
180 box.x0 = boxer->mediabox.x0;
181 box.y0 = boxer->mediabox.y0;
182 box.x1 = bbox->x0;
183 box.y1 = boxer->mediabox.y1;
184 boxlist_feed_intersect(newlist, boxer->list, &box);
185
186 /* Right (x1,0) (W,H) */
187 box.x0 = bbox->x1;
188 box.y0 = boxer->mediabox.y0;
189 box.x1 = boxer->mediabox.x1;
190 box.y1 = boxer->mediabox.y1;
191 boxlist_feed_intersect(newlist, boxer->list, &box);
192
193 /* Bottom (0,0) (W,y0) */
194 box.x0 = boxer->mediabox.x0;
195 box.y0 = boxer->mediabox.y0;
196 box.x1 = boxer->mediabox.x1;
197 box.y1 = bbox->y0;
198 boxlist_feed_intersect(newlist, boxer->list, &box);
199
200 /* Top (0,y1) (W,H) */
201 box.x0 = boxer->mediabox.x0;
202 box.y0 = bbox->y1;
203 box.x1 = boxer->mediabox.x1;
204 box.y1 = boxer->mediabox.y1;
205 boxlist_feed_intersect(newlist, boxer->list, &box);
206
207 fz_free(ctx, boxer->list);
208 boxer->list = newlist;
209 }
210
211 #ifdef DEBUG_WRITE_AS_PS
212 static int
213 compare_areas(const void *a_, const void *b_)
214 {
215 const fz_rect *a = (const fz_rect *)a_;
216 const fz_rect *b = (const fz_rect *)b_;
217 double area_a = (a->x1-a->x0) * (a->y1-a->y0);
218 double area_b = (b->x1-b->x0) * (b->y1-b->y0);
219
220 if (area_a < area_b)
221 return 1;
222 else if (area_a > area_b)
223 return -1;
224 else
225 return 0;
226 }
227
228 /* Sort the rectangle list to be largest area first. For ease of humans
229 * reading debug output. */
230 static void boxer_sort(boxer_t *boxer)
231 {
232 qsort(boxer->list->list, boxer->list->len, sizeof(fz_rect), compare_areas);
233 }
234
235 /* Get the rectangle list for a given boxer. Return value is the length of
236 * the list. Lifespan is until the boxer is modified or freed. */
237 static int boxer_results(boxer_t *boxer, fz_rect **list)
238 {
239 *list = boxer->list->list;
240 return boxer->list->len;
241 }
242 #endif
243
244 /* Currently unused debugging routine */
245 #if 0
246 static void
247 boxer_dump(fz_context *ctx, boxer_t *boxer)
248 {
249 int i;
250
251 printf("bbox = %g %g %g %g\n", boxer->mediabox.x0, boxer->mediabox.y0, boxer->mediabox.x1, boxer->mediabox.y1);
252 for (i = 0; i < boxer->list->len; i++)
253 {
254 printf("%d %g %g %g %g\n", i, boxer->list->list[i].x0, boxer->list->list[i].y0, boxer->list->list[i].x1, boxer->list->list[i].y1);
255 }
256 }
257 #endif
258
259 /* Destroy a boxer. */
260 static void boxer_destroy(fz_context *ctx, boxer_t *boxer)
261 {
262 if (!boxer)
263 return;
264
265 fz_free(ctx, boxer->list);
266 fz_free(ctx, boxer);
267 }
268
269 /* Find the margins for a given boxer. */
270 static fz_rect boxer_margins(boxer_t *boxer)
271 {
272 rectlist_t *list = boxer->list;
273 int i;
274 fz_rect margins = boxer->mediabox;
275
276 for (i = 0; i < list->len; i++)
277 {
278 fz_rect *r = &list->list[i];
279 if (r->x0 <= margins.x0 && r->y0 <= margins.y0 && r->y1 >= margins.y1)
280 margins.x0 = r->x1; /* Left Margin */
281 else if (r->x1 >= margins.x1 && r->y0 <= margins.y0 && r->y1 >= margins.y1)
282 margins.x1 = r->x0; /* Right Margin */
283 else if (r->x0 <= margins.x0 && r->x1 >= margins.x1 && r->y0 <= margins.y0)
284 margins.y0 = r->y1; /* Top Margin */
285 else if (r->x0 <= margins.x0 && r->x1 >= margins.x1 && r->y1 >= margins.y1)
286 margins.y1 = r->y0; /* Bottom Margin */
287 }
288
289 return margins;
290 }
291
292 /* Create a new boxer from a subset of an old one. */
293 static boxer_t *boxer_subset(fz_context *ctx, boxer_t *boxer, fz_rect rect)
294 {
295 boxer_t *new_boxer = boxer_create_length(ctx, &rect, boxer->list->len);
296 int i;
297
298 if (new_boxer == NULL)
299 return NULL;
300
301 for (i = 0; i < boxer->list->len; i++)
302 {
303 fz_rect r = fz_intersect_rect(boxer->list->list[i], rect);
304
305 if (fz_is_empty_rect(r))
306 continue;
307 rectlist_append(new_boxer->list, &r);
308 }
309
310 return new_boxer;
311 }
312
313 static int analyse_sub(fz_context *ctx, fz_stext_page *page, fz_stext_block **first_block, fz_stext_block **last_block, boxer_t *big_boxer, int depth);
314
315 static void
316 analyse_subset(fz_context *ctx, fz_stext_page *page, fz_stext_block **first_block, fz_stext_block **last_block, boxer_t *boxer, fz_rect r, int depth)
317 {
318 boxer_t *sub_box = boxer_subset(ctx, boxer, r);
319
320 fz_try(ctx)
321 (void)analyse_sub(ctx, page, first_block, last_block, sub_box, depth);
322 fz_always(ctx)
323 boxer_destroy(ctx, sub_box);
324 fz_catch(ctx)
325 fz_rethrow(ctx);
326 }
327
328 /* Consider a boxer for subdivision.
329 * Returns 0 if no suitable subdivision point found.
330 * Returns 1 if a subdivision point is found.*/
331 static int
332 boxer_subdivide(fz_context *ctx, fz_stext_page *page, fz_stext_block **first_block, fz_stext_block **last_block, boxer_t *boxer, int depth)
333 {
334 rectlist_t *list = boxer->list;
335 double max_h = 0, max_v = 0;
336 int i;
337
338 for (i = 0; i < list->len; i++)
339 {
340 fz_rect r = boxer->list->list[i];
341
342 if (r.x0 <= boxer->mediabox.x0 && r.x1 >= boxer->mediabox.x1)
343 {
344 /* Horizontal divider */
345 double size = r.y1 - r.y0;
346 if (size > max_h)
347 {
348 max_h = size;
349 }
350 }
351 if (r.y0 <= boxer->mediabox.y0 && r.y1 >= boxer->mediabox.y1)
352 {
353 /* Vertical divider */
354 double size = r.x1 - r.x0;
355 if (size > max_v)
356 {
357 max_v = size;
358 }
359 }
360 }
361
362 if (max_h > max_v)
363 {
364 fz_rect r;
365 float min_gap;
366 float top;
367
368 /* Divider runs horizontally. */
369
370 /* We want to list out all the horizontal subregions that are separated
371 * by an appropriate gap, from top to bottom. */
372
373 /* Any gap larger than the the gap we ignored will do. */
374 min_gap = max_v;
375
376 /* We're going to need to run through the data multiple times to find the
377 * topmost block each time. We'll use 'top' to cull against, and gradually
378 * lower that after each successful pass. */
379 top = boxer->mediabox.y0;
380 while (1)
381 {
382 /* Find the topmost divider below 'top' of height at least min_gap */
383 float found_top;
384 int found = -1;
385 for (i = 0; i < list->len; i++)
386 {
387 fz_rect *b = &list->list[i];
388 if (b->x0 <= boxer->mediabox.x0 && boxer->mediabox.x1 <= b->x1 && b->y0 > top && b->y1 - b->y0 >= min_gap)
389 {
390 if (found == -1 || b->y0 < found_top)
391 {
392 found = i;
393 found_top = b->y0;
394 }
395 }
396 }
397
398 /* If we failed to find one, we're done. */
399 if (found == -1)
400 break;
401
402 /* So we have a region from top to found_top */
403 r = boxer->mediabox;
404 r.y0 = top;
405 r.y1 = found_top;
406
407 analyse_subset(ctx, page, first_block, last_block, boxer, r, depth);
408
409 /* Now move top down for the next go. */
410 top = list->list[found].y1;
411 }
412
413 /* One final region, from top to bottom */
414 r = boxer->mediabox;
415 r.y0 = top;
416 analyse_subset(ctx, page, first_block, last_block, boxer, r, depth);
417
418 return 1;
419 }
420 else if (max_v > 0)
421 {
422 fz_rect r;
423 float min_gap;
424 float left;
425
426 /* Divider runs vertically. */
427
428 /* We want to list out all the vertical subregions that are separated
429 * by an appropriate gap, from left to right. */
430
431 /* Any gap larger than the the gap we ignored will do. */
432 min_gap = max_h;
433
434 /* We're going to need to run through the data multiple times to find the
435 * leftmost block each time. We'll use 'left' to cull against, and gradually
436 * lower that after each successful pass. */
437 left = boxer->mediabox.x0;
438 while (1)
439 {
440 /* Find the leftmost divider to the right of 'left' of width at least min_gap */
441 float found_left;
442 int found = -1;
443 for (i = 0; i < list->len; i++)
444 {
445 fz_rect *b = &list->list[i];
446 if (b->y0 <= boxer->mediabox.y0 && boxer->mediabox.y1 <= b->y1 && b->x0 > left && b->x1 - b->x0 >= min_gap)
447 {
448 if (found == -1 || b->x0 < found_left)
449 {
450 found = i;
451 found_left = b->x0;
452 }
453 }
454 }
455
456 /* If we failed to find one, we're done. */
457 if (found == -1)
458 break;
459
460 /* So we have a region from top to found_top */
461 r = boxer->mediabox;
462 r.x0 = left;
463 r.x1 = found_left;
464 analyse_subset(ctx, page, first_block, last_block, boxer, r, depth);
465
466 /* Now move left right for the next go. */
467 left = list->list[found].x1;
468 }
469
470 /* One final region, from left to right */
471 r = boxer->mediabox;
472 r.x0 = left;
473 analyse_subset(ctx, page, first_block, last_block, boxer, r, depth);
474
475 return 1;
476 }
477
478 return 0;
479 }
480
481 static void
482 new_stext_struct(fz_context *ctx, fz_stext_page *page, fz_stext_block *block, fz_structure standard, const char *raw)
483 {
484 fz_stext_struct *str;
485 size_t z;
486
487 if (raw == NULL)
488 raw = "";
489 z = strlen(raw);
490
491 str = fz_pool_alloc(ctx, page->pool, offsetof(fz_stext_struct, raw) + z + 1);
492 str->first_block = NULL;
493 str->last_block = NULL;
494 str->standard = standard;
495 str->parent = page->last_struct;
496 str->up = block;
497 memcpy(str->raw, raw, z+1);
498
499 block->u.s.down = str;
500 }
501
502 #ifdef DEBUG_STRUCT
503 static void
504 do_dump_stext(fz_stext_block *block, int depth)
505 {
506 int d;
507
508 while (block)
509 {
510 for (d = 0; d < depth; d++)
511 printf(" ");
512 switch(block->type)
513 {
514 case FZ_STEXT_BLOCK_TEXT:
515 printf("TEXT %p\n", block);
516 break;
517 case FZ_STEXT_BLOCK_IMAGE:
518 printf("IMAGE %p\n", block);
519 break;
520 case FZ_STEXT_BLOCK_VECTOR:
521 printf("VECTOR %p\n", block);
522 break;
523 case FZ_STEXT_BLOCK_STRUCT:
524 printf("STRUCT %p\n", block);
525 do_dump_stext(block->u.s.down->first_block, depth+1);
526 break;
527 }
528 block = block->next;
529 }
530 }
531
532 static void
533 dump_stext(char *str, fz_stext_block *block)
534 {
535 printf("%s\n", str);
536
537 do_dump_stext(block, 0);
538 }
539 #endif
540
541 static void
542 recalc_bbox(fz_stext_block *block)
543 {
544 fz_rect bbox = fz_empty_rect;
545 fz_stext_line *line;
546
547 for (line = block->u.t.first_line; line != NULL; line = line->next)
548 bbox = fz_union_rect(bbox, line->bbox);
549
550 block->bbox = bbox;
551 }
552
553 static fz_stext_struct *
554 page_subset(fz_context *ctx, fz_stext_page *page, fz_stext_block **first_block, fz_stext_block **last_block, fz_rect mediabox)
555 {
556 fz_stext_block *block, *next_block;
557 fz_stext_block *target = NULL;
558 fz_stext_block *last = NULL;
559 fz_stext_block *newblock;
560 int idx = 0;
561
562 #ifdef DEBUG_STRUCT
563 dump_stext("BEFORE", *first_block);
564 #endif
565
566 for (block = *first_block; block != NULL; block = next_block)
567 {
568 fz_rect bbox;
569
570 next_block = block->next;
571 if (block->type != FZ_STEXT_BLOCK_TEXT && block->type != FZ_STEXT_BLOCK_VECTOR)
572 continue;
573
574 bbox = block->bbox;
575
576 /* Can we take the whole block? */
577 if (bbox.x0 >= mediabox.x0 && bbox.y0 >= mediabox.y0 && bbox.x1 <= mediabox.x1 && bbox.y1 <= mediabox.y1)
578 {
579 /* Unlink block from the current list. */
580 if (block->prev)
581 block->prev->next = next_block;
582 else
583 *first_block = next_block;
584 if (next_block)
585 next_block->prev = block->prev;
586 else
587 *last_block = block->prev;
588
589 /* Add block onto our target list */
590 if (target == NULL)
591 {
592 target = block;
593 block->prev = NULL;
594 }
595 else
596 {
597 last->next = block;
598 block->prev = last;
599 }
600 last = block;
601 block->next = NULL;
602 }
603 else if (block->type == FZ_STEXT_BLOCK_TEXT && !fz_is_empty_rect(fz_intersect_rect(bbox, mediabox)))
604 {
605 /* Need to look at the parts. */
606 fz_stext_line *line, *next_line;
607
608 newblock = NULL;
609 for (line = block->u.t.first_line; line != NULL; line = next_line)
610 {
611 next_line = line->next;
612 if (line->bbox.x0 >= mediabox.x0 && line->bbox.y0 >= mediabox.y0 && line->bbox.x1 <= mediabox.x1 && line->bbox.y1 <= mediabox.y1)
613 {
614 /* We need to take this line */
615 if (newblock == NULL)
616 {
617 newblock = fz_pool_alloc(ctx, page->pool, sizeof(fz_stext_block));
618
619 /* Add the block onto our target list */
620 if (target == NULL)
621 {
622 target = newblock;
623 }
624 else
625 {
626 last->next = newblock;
627 newblock->prev = last;
628 }
629 last = newblock;
630 }
631
632 /* Unlink line from the current list. */
633 if (line->prev)
634 line->prev->next = next_line;
635 else
636 block->u.t.first_line = next_line;
637 if (next_line)
638 next_line->prev = line->prev;
639 else
640 block->u.t.last_line = line->prev;
641
642 /* Add line onto our new block */
643 if (newblock->u.t.last_line == NULL)
644 {
645 newblock->u.t.first_line = newblock->u.t.last_line = line;
646 line->prev = NULL;
647 }
648 else
649 {
650 line->prev = newblock->u.t.last_line;
651 newblock->u.t.last_line->next = line;
652 newblock->u.t.last_line = line;
653 }
654 line->next = NULL;
655 }
656 }
657 if (newblock)
658 {
659 recalc_bbox(block);
660 recalc_bbox(newblock);
661 }
662 }
663 }
664
665 /* If no content to add, bale! */
666 if (target == NULL)
667 return NULL;
668
669 /* We want to insert a structure node that contains target as the last structure
670 * node on this blocklist. Find the first block that's not a structure block. */
671 for (block = *first_block; block != NULL; block = block->next)
672 {
673 if (block->type != FZ_STEXT_BLOCK_STRUCT)
674 break;
675 idx++;
676 }
677
678 /* So we want to insert just before block. */
679
680 /* We are going to need to create a new block. Create a complete unlinked one here. */
681 newblock = fz_pool_alloc(ctx, page->pool, sizeof *newblock);
682 newblock->bbox = fz_empty_rect;
683 newblock->prev = block ? block->prev : *last_block;
684 newblock->next = block;
685 newblock->type = FZ_STEXT_BLOCK_STRUCT;
686 newblock->u.s.index = idx;
687 newblock->u.s.down = NULL;
688 /* If this throws, we leak newblock but it's within the pool, so it doesn't matter. */
689 /* And create a new struct and have newblock point to it. */
690 new_stext_struct(ctx, page, newblock, FZ_STRUCTURE_DIV, "Split");
691
692 /* Now insert newblock just before block */
693 /* If block was first, now we are. */
694 if (*first_block == block)
695 *first_block = newblock;
696 if (block == NULL)
697 {
698 /* Inserting at the end! */
699 if (*last_block)
700 (*last_block)->next = newblock;
701 *last_block = newblock;
702 }
703 else
704 {
705 if (block->prev)
706 block->prev->next = newblock;
707 block->prev = newblock;
708 }
709
710 newblock->u.s.down->first_block = target;
711 target->prev = NULL;
712
713 for (block = target; block->next != NULL; block = block->next)
714 newblock->bbox = fz_union_rect(newblock->bbox, block->bbox);
715 newblock->bbox = fz_union_rect(newblock->bbox, block->bbox);
716 newblock->u.s.down->last_block = block;
717
718 #ifdef DEBUG_STRUCT
719 dump_stext("AFTER", *first_block);
720 #endif
721
722 return newblock->u.s.down;
723 }
724
725 enum {
726 MAX_ANALYSIS_DEPTH = 6
727 };
728
729 static int
730 analyse_sub(fz_context *ctx, fz_stext_page *page, fz_stext_block **first_block, fz_stext_block **last_block, boxer_t *big_boxer, int depth)
731 {
732 fz_rect margins;
733 boxer_t *boxer;
734 boxer_t *boxer1 = NULL;
735 boxer_t *boxer2 = NULL;
736 fz_stext_struct *div;
737 int ret = 0;
738
739 /* Find the margins in the enclosing boxer. This returns
740 * a subset of the bbox of the original. */
741 margins = boxer_margins(big_boxer);
742 #ifdef DEBUG_WRITE_AS_PS
743 printf("\n\n%% MARGINS %g %g %g %g\n", margins.x0, margins.y0, margins.x1, margins.y1);
744 #endif
745
746 /* Now subset the rectangles just to include those that are in our bbox. */
747 boxer = boxer_subset(ctx, big_boxer, margins);
748
749 fz_var(boxer1);
750 fz_var(boxer2);
751
752 fz_try(ctx)
753 {
754 div = page_subset(ctx, page, first_block, last_block, boxer->mediabox);
755 /* If nothing subsetted (no textual content in that region), give up. */
756 if (div == NULL)
757 break;
758
759 ret = 1;
760
761 if (depth < MAX_ANALYSIS_DEPTH)
762 {
763 /* Can we subdivide that region any more? */
764 if (boxer_subdivide(ctx, page, &div->first_block, &div->last_block, boxer, depth+1))
765 break;
766 }
767
768 #ifdef DEBUG_WRITE_AS_PS
769 {
770 int i, n;
771 fz_rect *list;
772 boxer_sort(boxer);
773 n = boxer_results(boxer, &list);
774
775 printf("%% SUBDIVISION\n");
776 for (i = 0; i < n; i++)
777 {
778 printf("%% %g %g %g %g\n",
779 list[i].x0, list[i].y0, list[i].x1, list[i].y1);
780 }
781
782 printf("0 0 0 setrgbcolor\n");
783 for (i = 0; i < n; i++) {
784 printf("%g %g moveto\n%g %g lineto\n%g %g lineto\n%g %g lineto\nclosepath\nstroke\n\n",
785 list[i].x0, list[i].y0,
786 list[i].x0, list[i].y1,
787 list[i].x1, list[i].y1,
788 list[i].x1, list[i].y0);
789 }
790
791 printf("1 0 0 setrgbcolor\n");
792 printf("%g %g moveto\n%g %g lineto\n%g %g lineto\n%g %g lineto\nclosepath\nstroke\n\n",
793 margins.x0, margins.y0,
794 margins.x0, margins.y1,
795 margins.x1, margins.y1,
796 margins.x1, margins.y0);
797 }
798 #endif
799 }
800 fz_always(ctx)
801 {
802 boxer_destroy(ctx, boxer1);
803 boxer_destroy(ctx, boxer2);
804 boxer_destroy(ctx, boxer);
805 }
806 fz_catch(ctx)
807 fz_rethrow(ctx);
808
809 return ret;
810 }
811
812 static int
813 line_isnt_all_spaces(fz_context *ctx, fz_stext_line *line)
814 {
815 fz_stext_char *ch;
816 for (ch = line->first_char; ch != NULL; ch = ch->next)
817 if (ch->c != 32 && ch->c != 160)
818 return 1;
819 return 0;
820 }
821
822 static void
823 feed_line(fz_context *ctx, boxer_t *boxer, fz_stext_line *line)
824 {
825 fz_stext_char *ch;
826
827 for (ch = line->first_char; ch != NULL; ch = ch->next)
828 {
829 fz_rect r = fz_empty_rect;
830
831 if (ch->c == ' ')
832 continue;
833
834 do
835 {
836 fz_rect bbox = fz_rect_from_quad(ch->quad);
837 float margin = ch->size/2;
838 bbox.x0 -= margin;
839 bbox.y0 -= margin;
840 bbox.x1 += margin;
841 bbox.y1 += margin;
842 r = fz_union_rect(r, bbox);
843 ch = ch->next;
844 }
845 while (ch != NULL && ch->c != ' ');
846 boxer_feed(ctx, boxer, &r);
847 if (ch == NULL)
848 break;
849 }
850 }
851
852 /* Internal, non-API function, shared with stext-table. */
853 fz_rect
854 fz_collate_small_vector_run(fz_stext_block **blockp)
855 {
856 fz_stext_block *block = *blockp;
857 fz_stext_block *next;
858 fz_rect r = block->bbox;
859 int MAX_SIZE = 2;
860 int MAX_GAP = 2;
861
862 float w = r.x1 - r.x0;
863 float h = r.y1 - r.y0;
864
865 if (w < MAX_SIZE)
866 {
867 while ((next = block->next) != NULL &&
868 next->type == FZ_STEXT_BLOCK_VECTOR &&
869 next->bbox.x0 == r.x0 &&
870 next->bbox.x1 == r.x1 &&
871 ((next->bbox.y1 > r.y1 && next->bbox.y0 <= r.y1 + MAX_GAP) ||
872 (next->bbox.y0 < r.y0 && next->bbox.y1 >= r.y0 - MAX_GAP)))
873 {
874 r = fz_union_rect(r, next->bbox);
875 block = next;
876 }
877 }
878 if (h < MAX_SIZE)
879 {
880 while ((next = block->next) != NULL &&
881 next->type == FZ_STEXT_BLOCK_VECTOR &&
882 next->bbox.y0 == r.y0 &&
883 next->bbox.y1 == r.y1 &&
884 ((next->bbox.x1 > r.x1 && next->bbox.x0 <= r.x1 + MAX_GAP) ||
885 (next->bbox.x0 < r.x0 && next->bbox.x1 >= r.x0 - MAX_GAP)))
886 {
887 r = fz_union_rect(r, next->bbox);
888 block = next;
889 }
890 }
891
892 *blockp = block;
893
894 return r;
895 }
896
897 int fz_segment_stext_page(fz_context *ctx, fz_stext_page *page)
898 {
899 boxer_t *boxer;
900 fz_stext_block *block;
901 int ret = 0;
902
903 /* If we have structure already, give up. We can't hope to beat
904 * proper structure! */
905 for (block = page->first_block; block != NULL; block = block->next)
906 if (block->type == FZ_STEXT_BLOCK_STRUCT)
907 return 0;
908
909 #ifdef DEBUG_WRITE_AS_PS
910 printf("1 -1 scale 0 -%g translate\n", page->mediabox.y1-page->mediabox.y0);
911 #endif
912
913 boxer = boxer_create(ctx, &page->mediabox);
914
915 fz_try(ctx)
916 {
917 /* Just walking the blocks is safe as we're assuming no structure here. */
918 for (block = page->first_block; block != NULL; block = block->next)
919 {
920 fz_stext_line *line;
921 switch (block->type)
922 {
923 case FZ_STEXT_BLOCK_TEXT:
924 for (line = block->u.t.first_line; line != NULL; line = line->next)
925 if (line_isnt_all_spaces(ctx, line))
926 feed_line(ctx, boxer, line);
927 break;
928 case FZ_STEXT_BLOCK_VECTOR:
929 {
930 /* Allow a 1 point margin around vectors to avoid hairline
931 * cracks between supposedly abutting things. */
932 int VECTOR_MARGIN = 1;
933 fz_rect r = fz_collate_small_vector_run(&block);
934
935 r.x0 -= VECTOR_MARGIN;
936 r.y0 -= VECTOR_MARGIN;
937 r.x1 += VECTOR_MARGIN;
938 r.y1 += VECTOR_MARGIN;
939 boxer_feed(ctx, boxer, &r);
940 }
941 }
942 }
943
944 ret = analyse_sub(ctx, page, &page->first_block, &page->last_block, boxer, 0);
945 }
946 fz_always(ctx)
947 boxer_destroy(ctx, boxer);
948 fz_catch(ctx)
949 fz_rethrow(ctx);
950
951 #ifdef DEBUG_WRITE_AS_PS
952 printf("showpage\n");
953 #endif
954
955 return ret;
956 }