comparison mupdf-source/thirdparty/extract/src/boxer.c @ 3:2c135c81b16c

MERGE: upstream PyMuPDF 1.26.4 with MuPDF 1.26.7
author Franz Glasner <fzglas.hg@dom66.de>
date Mon, 15 Sep 2025 11:44:09 +0200
parents b50eed0cc0ef
children
comparison
equal deleted inserted replaced
0:6015a75abc2d 3:2c135c81b16c
1 #include <stdio.h>
2 #include <stdlib.h>
3 #include <memory.h>
4 #include <assert.h>
5
6 #include "document.h"
7 #include "outf.h"
8
9 #define DEBUG_WRITE_AS_PS
10 /* #define DEBUG_PRINT */
11
12 typedef struct boxer_s boxer_t;
13
14 typedef struct {
15 int len;
16 int max;
17 rect_t list[1];
18 } rectlist_t;
19
20 struct boxer_s {
21 extract_alloc_t *alloc;
22 rect_t mediabox;
23 rectlist_t *list;
24 };
25
26 static rectlist_t *
27 rectlist_create(extract_alloc_t *alloc, int max)
28 {
29 rectlist_t *list;
30
31 if (extract_malloc(alloc, &list, sizeof(rectlist_t) + sizeof(rect_t)*(max-1)))
32 return NULL;
33
34 list->len = 0;
35 list->max = max;
36
37 return list;
38 }
39
40 /* Push box onto rectlist, unless it is completely enclosed by
41 * another box, or completely encloses others (in which case they
42 * are replaced by it). */
43 static void
44 rectlist_append(rectlist_t *list, rect_t *box)
45 {
46 int i;
47
48 for (i = 0; i < list->len; i++)
49 {
50 rect_t *r = &list->list[i];
51 rect_t smaller, larger;
52 /* We allow ourselves a fudge factor of 4 points when checking for inclusion. */
53 double r_fudge = 4;
54
55 smaller.min.x = r->min.x + r_fudge;
56 larger. min.x = r->min.x - r_fudge;
57 smaller.min.y = r->min.y + r_fudge;
58 larger. min.y = r->min.y - r_fudge;
59 smaller.max.x = r->max.x - r_fudge;
60 larger. max.x = r->max.x + r_fudge;
61 smaller.max.y = r->max.y - r_fudge;
62 larger. max.y = r->max.y + r_fudge;
63
64 if (extract_rect_contains_rect(larger, *box))
65 return; /* box is enclosed! Nothing to do. */
66 if (extract_rect_contains_rect(*box, smaller))
67 {
68 /* box encloses r. Ditch r. */
69 /* Shorten the list */
70 --list->len;
71 /* If the one that just got chopped off wasn't r, move it down. */
72 if (i < list->len)
73 {
74 memcpy(r, &list->list[list->len], sizeof(*r));
75 i--; /* Reconsider this entry next time. */
76 }
77 }
78 }
79
80 assert(list->len < list->max);
81 memcpy(&list->list[list->len], box, sizeof(*box));
82 list->len++;
83 }
84
85 static boxer_t *
86 boxer_create_length(extract_alloc_t *alloc, rect_t *mediabox, int len)
87 {
88 boxer_t *boxer;
89
90 if (extract_malloc(alloc, &boxer, sizeof(*boxer)))
91 return NULL;
92
93 boxer->alloc = alloc;
94 memcpy(&boxer->mediabox, mediabox, sizeof(*mediabox));
95 boxer->list = rectlist_create(alloc, len);
96
97 return boxer;
98 }
99
100 /* Create a boxer structure for a page of size mediabox. */
101 static boxer_t *
102 boxer_create(extract_alloc_t *alloc, rect_t *mediabox)
103 {
104 boxer_t *boxer = boxer_create_length(alloc, mediabox, 1);
105
106 if (boxer == NULL)
107 return NULL;
108 rectlist_append(boxer->list, mediabox);
109
110 return boxer;
111 }
112
113 static void
114 push_if_intersect_suitable(rectlist_t *dst, const rect_t *a, const rect_t *b)
115 {
116 rect_t c;
117
118 /* Intersect a and b. */
119 c = extract_rect_intersect(*a, *b);
120 /* If no intersection, nothing to push. */
121 if (!extract_rect_valid(c))
122 return;
123
124 /* If the intersect is too narrow or too tall, ignore it.
125 * We don't care about inter character spaces, for example.
126 * Arbitrary 4 point threshold. */
127 #define THRESHOLD 4
128 if (c.min.x + THRESHOLD >= c.max.x || c.min.y+THRESHOLD >= c.max.y)
129 return;
130
131 rectlist_append(dst, &c);
132 }
133
134 static void
135 boxlist_feed_intersect(rectlist_t *dst, const rectlist_t *src, const rect_t *box)
136 {
137 int i;
138
139 for (i = 0; i < src->len; i++)
140 push_if_intersect_suitable(dst, &src->list[i], box);
141 }
142
143 /* Mark a given box as being occupied (typically by a glyph) */
144 static int boxer_feed(boxer_t *boxer, rect_t *bbox)
145 {
146 rect_t box;
147 /* When we feed a box into a the boxer, we can never make
148 * the list more than 4 times as long. */
149 rectlist_t *newlist = rectlist_create(boxer->alloc, boxer->list->len * 4);
150 if (newlist == NULL)
151 return -1;
152
153 #ifdef DEBUG_WRITE_AS_PS
154 printf("0 0 1 setrgbcolor\n");
155 printf("%g %g moveto %g %g lineto %g %g lineto %g %g lineto closepath fill\n",
156 bbox->min.x, bbox->min.y,
157 bbox->min.x, bbox->max.y,
158 bbox->max.x, bbox->max.y,
159 bbox->max.x, bbox->min.y
160 );
161 #endif
162
163 /* Left (0,0) (min.x,H) */
164 box.min.x = boxer->mediabox.min.x;
165 box.min.y = boxer->mediabox.min.y;
166 box.max.x = bbox->min.x;
167 box.max.y = boxer->mediabox.max.y;
168 boxlist_feed_intersect(newlist, boxer->list, &box);
169
170 /* Right (max.x,0) (W,H) */
171 box.min.x = bbox->max.x;
172 box.min.y = boxer->mediabox.min.y;
173 box.max.x = boxer->mediabox.max.x;
174 box.max.y = boxer->mediabox.max.y;
175 boxlist_feed_intersect(newlist, boxer->list, &box);
176
177 /* Bottom (0,0) (W,min.y) */
178 box.min.x = boxer->mediabox.min.x;
179 box.min.y = boxer->mediabox.min.y;
180 box.max.x = boxer->mediabox.max.x;
181 box.max.y = bbox->min.y;
182 boxlist_feed_intersect(newlist, boxer->list, &box);
183
184 /* Top (0,max.y) (W,H) */
185 box.min.x = boxer->mediabox.min.x;
186 box.min.y = bbox->max.y;
187 box.max.x = boxer->mediabox.max.x;
188 box.max.y = boxer->mediabox.max.y;
189 boxlist_feed_intersect(newlist, boxer->list, &box);
190
191 extract_free(boxer->alloc, &boxer->list);
192 boxer->list = newlist;
193
194 return 0;
195 }
196
197 static int
198 compare_areas(const void *a_, const void *b_)
199 {
200 const rect_t *a = (const rect_t *)a_;
201 const rect_t *b = (const rect_t *)b_;
202 double area_a = (a->max.x-a->min.x) * (a->max.y-a->min.y);
203 double area_b = (b->max.x-b->min.x) * (b->max.y-b->min.y);
204
205 if (area_a < area_b)
206 return 1;
207 else if (area_a > area_b)
208 return -1;
209 else
210 return 0;
211 }
212
213 /* Sort the rectangle list to be largest area first. For ease of humans
214 * reading debug output. */
215 static void boxer_sort(boxer_t *boxer)
216 {
217 qsort(boxer->list->list, boxer->list->len, sizeof(rect_t), compare_areas);
218 }
219
220 /* Get the rectangle list for a given boxer. Return value is the length of
221 * the list. Lifespan is until the boxer is modified or freed. */
222 static int boxer_results(boxer_t *boxer, rect_t **list)
223 {
224 *list = boxer->list->list;
225 return boxer->list->len;
226 }
227
228 /* Destroy a boxer. */
229 static void boxer_destroy(boxer_t *boxer)
230 {
231 if (!boxer)
232 return;
233
234 extract_free(boxer->alloc, &boxer->list);
235 extract_free(boxer->alloc, &boxer);
236 }
237
238 /* Find the margins for a given boxer. */
239 static rect_t boxer_margins(boxer_t *boxer)
240 {
241 rectlist_t *list = boxer->list;
242 int i;
243 rect_t margins = boxer->mediabox;
244
245 for (i = 0; i < list->len; i++)
246 {
247 rect_t *r = &list->list[i];
248 if (r->min.x <= margins.min.x && r->min.y <= margins.min.y && r->max.y >= margins.max.y)
249 margins.min.x = r->max.x; /* Left Margin */
250 else if (r->max.x >= margins.max.x && r->min.y <= margins.min.y && r->max.y >= margins.max.y)
251 margins.max.x = r->min.x; /* Right Margin */
252 else if (r->min.x <= margins.min.x && r->max.x >= margins.max.x && r->min.y <= margins.min.y)
253 margins.min.y = r->max.y; /* Top Margin */
254 else if (r->min.x <= margins.min.x && r->max.x >= margins.max.x && r->max.y >= margins.max.y)
255 margins.max.y = r->min.y; /* Bottom Margin */
256 }
257
258 return margins;
259 }
260
261 /* Create a new boxer from a subset of an old one. */
262 static boxer_t *boxer_subset(boxer_t *boxer, rect_t rect)
263 {
264 boxer_t *new_boxer = boxer_create_length(boxer->alloc, &rect, boxer->list->len);
265 int i;
266
267 if (new_boxer == NULL)
268 return NULL;
269
270 for (i = 0; i < boxer->list->len; i++)
271 {
272 rect_t r = extract_rect_intersect(boxer->list->list[i], rect);
273
274 if (!extract_rect_valid(r))
275 continue;
276 rectlist_append(new_boxer->list, &r);
277 }
278
279 return new_boxer;
280 }
281
282 /* Consider a boxer for subdivision.
283 * Returns 0 if no suitable subdivision point found.
284 * Returns 1, and sets *boxer1 and *boxer2 to new boxer structures for the the subdivisions
285 * if a subdivision point is found.*/
286 static split_type_t
287 boxer_subdivide(boxer_t *boxer, boxer_t **boxer1, boxer_t **boxer2)
288 {
289 rectlist_t *list = boxer->list;
290 int num_h = 0, num_v = 0;
291 double max_h = 0, max_v = 0;
292 rect_t best_h = {0}, best_v = {0};
293 int i;
294
295 *boxer1 = NULL;
296 *boxer2 = NULL;
297
298 for (i = 0; i < list->len; i++)
299 {
300 rect_t r = boxer->list->list[i];
301
302 if (r.min.x <= boxer->mediabox.min.x && r.max.x >= boxer->mediabox.max.x)
303 {
304 /* Horizontal divider */
305 double size = r.max.y - r.min.y;
306 if (size > max_h)
307 {
308 max_h = size;
309 best_h = r;
310 }
311 num_h++;
312 }
313 if (r.min.y <= boxer->mediabox.min.y && r.max.y >= boxer->mediabox.max.y)
314 {
315 /* Vertical divider */
316 double size = r.max.x - r.min.x;
317 if (size > max_v)
318 {
319 max_v = size;
320 best_v = r;
321 }
322 num_v++;
323 }
324 }
325
326 outf("num_h=%d num_v=%d\n", num_h, num_v);
327 outf("max_h=%g max_v=%g\n", max_h, max_v);
328
329 if (max_h > max_v)
330 {
331 rect_t r;
332 /* Divider runs horizontally. */
333 r = boxer->mediabox;
334 r.max.y = best_h.min.y;
335 *boxer1 = boxer_subset(boxer, r);
336 r = boxer->mediabox;
337 r.min.y = best_h.max.y;
338 *boxer2 = boxer_subset(boxer, r);
339 return SPLIT_VERTICAL;
340 }
341 else if (max_v > 0)
342 {
343 rect_t r;
344 /* Divider runs vertically. */
345 r = boxer->mediabox;
346 r.max.x = best_v.min.x;
347 *boxer1 = boxer_subset(boxer, r);
348 r = boxer->mediabox;
349 r.min.x = best_v.max.x;
350 *boxer2 = boxer_subset(boxer, r);
351 return SPLIT_HORIZONTAL;
352 }
353
354 return SPLIT_NONE;
355 }
356
357
358 /* Extract specifics */
359 static rect_t
360 extract_span_bbox(span_t *span)
361 {
362 int j;
363 rect_t bbox = extract_rect_empty;
364
365 for (j = 0; j < span->chars_num; j++)
366 {
367 char_t *char_ = &span->chars[j];
368 bbox = extract_rect_union(bbox, char_->bbox);
369 }
370 return bbox;
371 }
372
373
374 static int
375 extract_subpage_subset(extract_alloc_t *alloc, extract_page_t *page, subpage_t *subpage, rect_t mediabox)
376 {
377 content_span_iterator sit;
378 span_t *span;
379 subpage_t *target;
380
381 if (extract_subpage_alloc(alloc, mediabox, page, &target))
382 return -1;
383
384 for (span = content_span_iterator_init(&sit, &subpage->content); span != NULL; span = content_span_iterator_next(&sit))
385 {
386 rect_t bbox = extract_span_bbox(span);
387
388 if (bbox.min.x >= mediabox.min.x && bbox.min.y >= mediabox.min.y && bbox.max.x <= mediabox.max.x && bbox.max.y <= mediabox.max.y)
389 {
390 content_unlink(&span->base);
391 content_append_span(&target->content, span);
392 }
393 }
394
395 return 0;
396 }
397
398 enum {
399 MAX_ANALYSIS_DEPTH = 6
400 };
401
402 static int
403 analyse_sub(extract_page_t *page, subpage_t *subpage, boxer_t *big_boxer, split_t **psplit, int depth)
404 {
405 rect_t margins;
406 boxer_t *boxer;
407 boxer_t *boxer1;
408 boxer_t *boxer2;
409 int ret;
410 split_type_t split_type;
411 split_t *split;
412
413 margins = boxer_margins(big_boxer);
414 #ifdef DEBUG_WRITE_AS_PS
415 printf("\n\n%% MARGINS %g %g %g %g\n", margins.min.x, margins.min.y, margins.max.x, margins.max.y);
416 #endif
417
418 boxer = boxer_subset(big_boxer, margins);
419
420 if (depth < MAX_ANALYSIS_DEPTH &&
421 (split_type = boxer_subdivide(boxer, &boxer1, &boxer2)) != SPLIT_NONE)
422 {
423 if (boxer1 == NULL || boxer2 == NULL ||
424 extract_split_alloc(boxer->alloc, split_type, 2, psplit))
425 {
426 ret = -1;
427 goto fail_mid_split;
428 }
429 split = *psplit;
430 outf("depth=%d %s\n", depth, split_type == SPLIT_HORIZONTAL ? "H" : "V");
431 ret = analyse_sub(page, subpage, boxer1, &split->split[0], depth+1);
432 if (!ret) ret = analyse_sub(page, subpage, boxer2, &split->split[1], depth+1);
433 if (!ret)
434 {
435 if (split_type == SPLIT_HORIZONTAL)
436 {
437 split->split[0]->weight = boxer1->mediabox.max.x - boxer1->mediabox.min.x;
438 split->split[1]->weight = boxer2->mediabox.max.x - boxer2->mediabox.min.x;
439 }
440 else
441 {
442 split->split[0]->weight = boxer1->mediabox.max.y - boxer1->mediabox.min.y;
443 split->split[1]->weight = boxer2->mediabox.max.y - boxer2->mediabox.min.y;
444 }
445 }
446 fail_mid_split:
447 boxer_destroy(boxer1);
448 boxer_destroy(boxer2);
449 boxer_destroy(boxer);
450 return ret;
451 }
452
453 outf("depth=%d LEAF\n", depth);
454
455 if (extract_split_alloc(boxer->alloc, SPLIT_NONE, 0, psplit))
456 {
457 boxer_destroy(boxer);
458 return -1;
459 }
460 split = *psplit;
461
462 ret = extract_subpage_subset(boxer->alloc, page, subpage, boxer->mediabox);
463
464 #ifdef DEBUG_WRITE_AS_PS
465 {
466 int i, n;
467 rect_t *list;
468 boxer_sort(boxer);
469 n = boxer_results(boxer, &list);
470
471 printf("%% SUBDIVISION\n");
472 for (i = 0; i < n; i++)
473 {
474 printf("%% %g %g %g %g\n",
475 list[i].min.x, list[i].min.y, list[i].max.x, list[i].max.y);
476 }
477
478 printf("0 0 0 setrgbcolor\n");
479 for (i = 0; i < n; i++) {
480 printf("%g %g moveto\n%g %g lineto\n%g %g lineto\n%g %g lineto\nclosepath\nstroke\n\n",
481 list[i].min.x, list[i].min.y,
482 list[i].min.x, list[i].max.y,
483 list[i].max.x, list[i].max.y,
484 list[i].max.x, list[i].min.y);
485 }
486
487 printf("1 0 0 setrgbcolor\n");
488 printf("%g %g moveto\n%g %g lineto\n%g %g lineto\n%g %g lineto\nclosepath\nstroke\n\n",
489 margins.min.x, margins.min.y,
490 margins.min.x, margins.max.y,
491 margins.max.x, margins.max.y,
492 margins.max.x, margins.min.y);
493 }
494 #endif
495 boxer_destroy(boxer);
496
497 return ret;
498 }
499
500
501 static int
502 collate_splits(extract_alloc_t *alloc, split_t **psplit)
503 {
504 split_t *split = *psplit;
505 int s;
506 int n = 0;
507 int i;
508 int j;
509 split_t *newsplit;
510
511 /* Recurse into all our children to ensure they are collated.
512 * Count how many children we'll have once we pull all the
513 * children of children that match our type up into us. */
514 for (s = 0; s < split->count; s++)
515 {
516 if (collate_splits(alloc, &split->split[s]))
517 return -1;
518 if (split->split[s]->type == split->type)
519 n += split->split[s]->count;
520 else
521 n++;
522 }
523
524 /* No change in the number of children? Just exit. */
525 if (n == split->count)
526 return 0;
527
528 if (extract_split_alloc(alloc, split->type, n, &newsplit))
529 return -1;
530
531 newsplit->weight = split->weight;
532
533 /* Now, run across our children. */
534 i = 0;
535 for (s = 0; s < split->count; s++)
536 {
537 split_t *sub = split->split[s];
538 if (sub->type == split->type)
539 {
540 /* If the type matches, pull the grandchildren into newsplit. */
541 for (j = 0; j < sub->count; j++)
542 {
543 newsplit->split[i++] = sub->split[j];
544 sub->split[j] = NULL;
545 }
546 }
547 else
548 {
549 /* Otherwise just move the child into newsplit. */
550 newsplit->split[i++] = sub;
551 split->split[s] = NULL;
552 }
553 }
554
555 extract_split_free(alloc, psplit);
556 *psplit = newsplit;
557
558 return 0;
559 }
560
561 int extract_page_analyse(extract_alloc_t *alloc, extract_page_t *page)
562 {
563 boxer_t *boxer;
564 subpage_t *subpage = page->subpages[0];
565 content_span_iterator sit;
566 span_t *span;
567
568 /* This code will only work if the page contains a single subpage.
569 * This should always be the case if we're called from a page
570 * generated via extract_page_begin. */
571 if (page->subpages_num != 1) return 0;
572
573 /* Take the old subpages out from the page. */
574 page->subpages_num = 0;
575 extract_free(alloc, &page->subpages);
576
577 #ifdef DEBUG_WRITE_AS_PS
578 printf("1 -1 scale 0 -%g translate\n", page->mediabox.max.y-page->mediabox.min.y);
579 #endif
580
581 boxer = boxer_create(alloc, (rect_t *)&subpage->mediabox);
582
583 for (span = content_span_iterator_init(&sit, &subpage->content); span != NULL; span = content_span_iterator_next(&sit))
584 {
585 rect_t bbox = extract_span_bbox(span);
586 if (boxer_feed(boxer, &bbox))
587 goto fail;
588 }
589
590 if (analyse_sub(page, subpage, boxer, &page->split, 0))
591 goto fail;
592
593 if (collate_splits(boxer->alloc, &page->split))
594 goto fail;
595
596 #ifdef DEBUG_WRITE_AS_PS
597 printf("showpage\n");
598 #endif
599
600 boxer_destroy(boxer);
601 extract_subpage_free(alloc, &subpage);
602
603 return 0;
604
605 fail:
606 outf("Analysis failed!\n");
607 boxer_destroy(boxer);
608 extract_subpage_free(alloc, &subpage);
609
610 return -1;
611 }