comparison mupdf-source/thirdparty/extract/src/document.c @ 2:b50eed0cc0ef upstream

ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4. The directory name has changed: no version number in the expanded directory now.
author Franz Glasner <fzglas.hg@dom66.de>
date Mon, 15 Sep 2025 11:43:07 +0200
parents
children
comparison
equal deleted inserted replaced
1:1d09e1dec1d9 2:b50eed0cc0ef
1 #include "document.h"
2 #include "outf.h"
3 #include <assert.h>
4 #include <stdio.h>
5 #include <string.h>
6
7 void
8 content_init(content_t *content, content_type_t type)
9 {
10 content->type = type;
11 content->next = content->prev = (type == content_root) ? content : NULL;
12 }
13
14 void
15 content_init_root(content_root_t *content, content_t *parent)
16 {
17 content->base.type = content_root;
18 content->base.next = content->base.prev = &content->base;
19 content->parent = parent;
20 }
21
22 void
23 content_unlink(content_t *content)
24 {
25 if (content == NULL)
26 return;
27
28 assert(content->type != content_root);
29
30 if (content->prev == NULL)
31 {
32 assert(content->next == NULL);
33 /* Already unlinked */
34 }
35 else
36 {
37 assert(content->next != content && content->prev != content);
38 content->prev->next = content->next;
39 content->next->prev = content->prev;
40 content->next = content->prev = NULL;
41 }
42 }
43
44 void content_unlink_span(span_t *span)
45 {
46 content_unlink(&span->base);
47 }
48
49 void extract_span_init(span_t *span, structure_t *structure)
50 {
51 static const span_t blank = { 0 };
52
53 *span = blank;
54 content_init(&span->base, content_span);
55 span->structure = structure;
56 }
57
58 void extract_span_free(extract_alloc_t *alloc, span_t **pspan)
59 {
60 if (*pspan == NULL)
61 return;
62
63 content_unlink(&(*pspan)->base);
64 extract_free(alloc, &(*pspan)->font_name);
65 extract_free(alloc, &(*pspan)->chars);
66 extract_free(alloc, pspan);
67 }
68
69 void extract_line_init(line_t *line)
70 {
71 static const line_t blank = { 0 };
72
73 *line = blank;
74 content_init(&line->base, content_line);
75 content_init_root(&line->content, &line->base);
76 }
77
78 void extract_paragraph_init(paragraph_t *paragraph)
79 {
80 static const paragraph_t blank = { 0 };
81
82 *paragraph = blank;
83 content_init(&paragraph->base, content_paragraph);
84 content_init_root(&paragraph->content, &paragraph->base);
85 }
86
87 void extract_block_init(block_t *block)
88 {
89 static const block_t blank = { 0 };
90
91 *block = blank;
92 content_init(&block->base, content_block);
93 content_init_root(&block->content, &block->base);
94 }
95
96 void extract_table_init(table_t *table)
97 {
98 static const table_t blank = { 0 };
99
100 *table = blank;
101 content_init(&table->base, content_table);
102 }
103
104 void extract_image_init(image_t *image)
105 {
106 static const image_t blank = { 0 };
107
108 *image = blank;
109 content_init(&image->base, content_image);
110 }
111
112 void
113 content_clear(extract_alloc_t *alloc, content_root_t *proot)
114 {
115 content_t *content, *next;
116
117 assert(proot->base.type == content_root && proot->base.next != NULL && proot->base.prev != NULL);
118 for (content = proot->base.next; content != &proot->base; content = next)
119 {
120 assert(content->type != content_root);
121 next = content->next;
122 switch (content->type)
123 {
124 default:
125 case content_root:
126 assert("This never happens" == NULL);
127 break;
128 case content_span:
129 extract_span_free(alloc, (span_t **)&content);
130 break;
131 case content_line:
132 extract_line_free(alloc, (line_t **)&content);
133 break;
134 case content_paragraph:
135 extract_paragraph_free(alloc, (paragraph_t **)&content);
136 break;
137 case content_block:
138 extract_block_free(alloc, (block_t **)&content);
139 break;
140 case content_table:
141 extract_table_free(alloc, (table_t **)&content);
142 break;
143 case content_image:
144 extract_image_free(alloc, (image_t **)&content);
145 break;
146 }
147 }
148 }
149
150 int
151 content_count(content_root_t *root)
152 {
153 int n = 0;
154 content_t *s;
155
156 for (s = root->base.next; s != &root->base; s = s->next)
157 n++;
158
159 return n;
160 }
161
162 static int
163 content_count_type(content_root_t *root, content_type_t type)
164 {
165 int n = 0;
166 content_t *s;
167
168 for (s = root->base.next; s != &root->base; s = s->next)
169 if (s->type == type) n++;
170
171 return n;
172 }
173
174 int content_count_spans(content_root_t *root)
175 {
176 return content_count_type(root, content_span);
177 }
178
179 int content_count_images(content_root_t *root)
180 {
181 return content_count_type(root, content_image);
182 }
183
184 int content_count_lines(content_root_t *root)
185 {
186 return content_count_type(root, content_line);
187 }
188
189 int content_count_paragraphs(content_root_t *root)
190 {
191 return content_count_type(root, content_paragraph);
192 }
193
194 int content_count_tables(content_root_t *root)
195 {
196 return content_count_type(root, content_table);
197 }
198
199 static content_t *
200 content_first_of_type(const content_root_t *root, content_type_t type)
201 {
202 content_t *content;
203 assert(root && root->base.type == content_root);
204
205 for (content = root->base.next; content != &root->base; content = content->next)
206 {
207 if (content->type == type)
208 return content;
209 }
210 return NULL;
211 }
212
213 static content_t *
214 content_last_of_type(const content_root_t *root, content_type_t type)
215 {
216 content_t *content;
217 assert(root && root->base.type == content_root);
218
219 for (content = root->base.prev; content != &root->base; content = content->prev)
220 {
221 if (content->type == type)
222 return content;
223 }
224 return NULL;
225 }
226
227 span_t *content_first_span(const content_root_t *root)
228 {
229 return (span_t *)content_first_of_type(root, content_span);
230 }
231
232 span_t *content_last_span(const content_root_t *root)
233 {
234 return (span_t *)content_last_of_type(root, content_span);
235 }
236
237 line_t *content_first_line(const content_root_t *root)
238 {
239 return (line_t *)content_first_of_type(root, content_line);
240 }
241
242 line_t *content_last_line(const content_root_t *root)
243 {
244 return (line_t *)content_last_of_type(root, content_line);
245 }
246
247 paragraph_t *content_first_paragraph(const content_root_t *root)
248 {
249 return (paragraph_t *)content_first_of_type(root, content_paragraph);
250 }
251
252 paragraph_t *content_last_paragraph(const content_root_t *root)
253 {
254 return (paragraph_t *)content_last_of_type(root, content_paragraph);
255 }
256
257 static content_t *
258 content_next_of_type(const content_t *node, content_type_t type)
259 {
260 content_t *content;
261 assert(node && node->type != content_root);
262
263 for (content = node->next; content->type != content_root; content = content->next)
264 {
265 if (content->type == type)
266 return content;
267 }
268 return NULL;
269 }
270
271 static content_t *
272 content_prev_of_type(const content_t *node, content_type_t type)
273 {
274 content_t *content;
275 assert(node && node->type != content_root);
276
277 for (content = node->prev; content->type != content_root; content = content->prev)
278 {
279 if (content->type == type)
280 return content;
281 }
282 return NULL;
283 }
284
285 span_t *content_next_span(const content_t *root)
286 {
287 return (span_t *)content_next_of_type(root, content_span);
288 }
289
290 span_t *content_prev_span(const content_t *root)
291 {
292 return (span_t *)content_prev_of_type(root, content_span);
293 }
294
295 line_t *content_next_line(const content_t *root)
296 {
297 return (line_t *)content_next_of_type(root, content_line);
298 }
299
300 line_t *content_prev_line(const content_t *root)
301 {
302 return (line_t *)content_prev_of_type(root, content_line);
303 }
304
305 paragraph_t *content_next_paragraph(const content_t *root)
306 {
307 return (paragraph_t *)content_next_of_type(root, content_paragraph);
308 }
309
310 paragraph_t *content_prev_paragraph(const content_t *root)
311 {
312 return (paragraph_t *)content_prev_of_type(root, content_paragraph);
313 }
314
315 void
316 content_concat(content_root_t *dst, content_root_t *src)
317 {
318 content_t *walk, *walk_next;
319
320 if (src == NULL)
321 return;
322
323 for (walk = src->base.next; walk != &src->base; walk = walk_next)
324 {
325 walk_next = walk->next;
326 content_append(dst, walk);
327 }
328 }
329
330 void extract_line_free(extract_alloc_t* alloc, line_t **pline)
331 {
332 line_t *line = *pline;
333
334 content_unlink(&line->base);
335 content_clear(alloc, &line->content);
336 extract_free(alloc, pline);
337 }
338
339 void extract_image_clear(extract_alloc_t *alloc, image_t *image)
340 {
341 extract_free(alloc, &image->type);
342 extract_free(alloc, &image->name);
343 extract_free(alloc, &image->id);
344 if (image->data_free)
345 {
346 image->data_free(image->data_free_handle, image->data);
347 image->data_free = NULL;
348 image->data_free_handle = NULL;
349 image->data = NULL;
350 }
351 }
352
353 void extract_image_free(extract_alloc_t *alloc, image_t **pimage)
354 {
355 if (*pimage == NULL)
356 return;
357 extract_image_clear(alloc, *pimage);
358 extract_free(alloc, pimage);
359 }
360
361 void extract_cell_free(extract_alloc_t *alloc, cell_t **pcell)
362 {
363 cell_t *cell = *pcell;
364
365 if (cell == NULL)
366 return;
367
368 content_clear(alloc, &cell->content);
369
370 extract_free(alloc, pcell);
371 }
372
373 int
374 extract_split_alloc(extract_alloc_t *alloc, split_type_t type, int count, split_t **psplit)
375 {
376 split_t *split;
377
378 if (extract_malloc(alloc, psplit, sizeof(*split) + (count-1) * sizeof(split_t *)))
379 return -1;
380
381 split = *psplit;
382 split->type = type;
383 split->weight = 0;
384 split->count = count;
385 memset(&split->split[0], 0, sizeof(split_t *) * count);
386
387 return 0;
388 }
389
390 void extract_split_free(extract_alloc_t *alloc, split_t **psplit)
391 {
392 int i;
393 split_t *split = *psplit;
394
395 if (split == NULL)
396 return;
397
398 for (i = 0; i < split->count; i++)
399 extract_split_free(alloc, &split->split[i]);
400 extract_free(alloc, psplit);
401 }
402
403 static void space_prefix(int depth)
404 {
405 while (depth-- > 0)
406 putc(' ', stdout);
407 }
408
409 static void
410 content_dump_aux(const content_root_t *content, int depth);
411
412 static void dump_span(const span_t *span, int depth)
413 {
414 int i;
415 for (i = 0; i < span->chars_num; i++)
416 {
417 char_t *c = &span->chars[i];
418 space_prefix(depth);
419 printf("<char ucs=\"");
420 if (c->ucs >= 32 && c->ucs <= 127)
421 putc((char)c->ucs, stdout);
422 else
423 printf("<%04x>", c->ucs);
424 printf("\" x=%f y=%f adv=%f />\n", c->x, c->y, c->adv);
425 }
426 }
427
428 static void
429 dump_structure_path(structure_t *structure)
430 {
431 if (structure->parent)
432 {
433 dump_structure_path(structure->parent);
434 printf("/");
435 }
436 printf("%s(%d)", extract_struct_string(structure->type), structure->uid);
437 }
438
439 static void
440 content_dump_span_aux(const span_t *span, int depth)
441 {
442 space_prefix(depth);
443 printf("<span ctm=[%f %f %f %f]\n",
444 span->ctm.a, span->ctm.b, span->ctm.c, span->ctm.d);
445 if (span->structure)
446 {
447 space_prefix(depth);
448 printf(" structure=\"");
449 dump_structure_path(span->structure);
450 printf("\"\n");
451 }
452 space_prefix(depth);
453 printf(" font-name=\"%s\" font_bbox=[%f %f %f %f]>\n",
454 span->font_name,
455 span->font_bbox.min.x, span->font_bbox.min.y,
456 span->font_bbox.max.x, span->font_bbox.max.y);
457 dump_span(span, depth+1);
458 space_prefix(depth);
459 printf("</span>\n");
460 }
461
462 void
463 content_dump_span(const span_t *span)
464 {
465 content_dump_span_aux(span, 0);
466 }
467
468 static void
469 content_dump_line_aux(const line_t *line, int depth)
470 {
471 span_t *span0 = content_first_span(&line->content);
472 span_t *span1 = content_last_span(&line->content);
473 char_t *char0 = (span0 && span0->chars_num > 0) ? &span0->chars[0] : NULL;
474 char_t *char1 = (span1 && span1->chars_num > 0) ? &span1->chars[span1->chars_num-1] : NULL;
475 space_prefix(depth);
476 printf("<line");
477 if (char0 && char1)
478 {
479 printf(" x0=%g y0=%g x1=%g y1=%g\n", char0->x, char0->y, char1->x, char1->y);
480 }
481 content_dump_aux(&line->content, depth+1);
482 space_prefix(depth);
483 printf("</line>\n");
484 }
485
486 void
487 content_dump_line(const line_t *line)
488 {
489 content_dump_line_aux(line, 0);
490 }
491
492 static void
493 content_dump_aux(const content_root_t *content, int depth)
494 {
495 const content_t *walk;
496
497 assert(content->base.type == content_root);
498 for (walk = content->base.next; walk != &content->base; walk = walk->next)
499 {
500 assert(walk->next->prev == walk && walk->prev->next == walk);
501 switch (walk->type)
502 {
503 case content_span:
504 content_dump_span_aux((const span_t *)walk, depth);
505 break;
506 case content_line:
507 content_dump_line_aux((const line_t *)walk, depth);
508 break;
509 case content_paragraph:
510 space_prefix(depth);
511 printf("<paragraph>\n");
512 content_dump_aux(&((const paragraph_t *)walk)->content, depth+1);
513 space_prefix(depth);
514 printf("</paragraph>\n");
515 break;
516 case content_block:
517 space_prefix(depth);
518 printf("<block>\n");
519 content_dump_aux(&((const block_t *)walk)->content, depth+1);
520 space_prefix(depth);
521 printf("</block>\n");
522 break;
523 case content_table:
524 {
525 const table_t *table = (const table_t *)walk;
526 int i, j, k;
527 space_prefix(depth);
528 printf("<table w=%d h=%d>\n", table->cells_num_x, table->cells_num_y);
529 k = 0;
530 for (j = 0; j < table->cells_num_y; j++)
531 {
532 for (i = 0; i < table->cells_num_x; i++)
533 {
534 space_prefix(depth+1);
535 printf("<cell>\n");
536 content_dump_aux(&table->cells[k]->content, depth+2);
537 space_prefix(depth+1);
538 printf("</cell>\n");
539 k++;
540 }
541 }
542 space_prefix(depth);
543 printf("</table>\n");
544 break;
545 }
546 case content_image:
547 space_prefix(depth);
548 printf("<image/>\n");
549 break;
550 default:
551 assert("Unexpected type found while dumping content list." == NULL);
552 break;
553 }
554 }
555 }
556
557 void content_dump(const content_root_t *content)
558 {
559 content_dump_aux(content, 0);
560 }
561
562 static void
563 content_dump_brief_aux(const content_root_t *content, int depth);
564
565 static void
566 content_dump_brief_span_aux(const span_t *span)
567 {
568 int i;
569
570 printf("\"");
571 for (i = 0; i < span->chars_num; i++)
572 {
573 char_t *c = &span->chars[i];
574 if (c->ucs >= 32 && c->ucs <= 127)
575 putc((char)c->ucs, stdout);
576 else
577 printf("<%04x>", c->ucs);
578 }
579 printf("\"");
580 }
581
582 static void
583 content_dump_brief_line_aux(const line_t *line, int depth)
584 {
585 printf("<line text=");
586 content_dump_brief_aux(&line->content, depth+1);
587 printf(">\n");
588 }
589
590 static void
591 content_dump_brief_aux(const content_root_t *content, int depth)
592 {
593 const content_t *walk;
594
595 assert(content->base.type == content_root);
596 for (walk = content->base.next; walk != &content->base; walk = walk->next)
597 {
598 assert(walk->next->prev == walk && walk->prev->next == walk);
599 switch (walk->type)
600 {
601 case content_span:
602 content_dump_brief_span_aux((const span_t *)walk);
603 break;
604 case content_line:
605 content_dump_brief_line_aux((const line_t *)walk, depth);
606 break;
607 case content_paragraph:
608 content_dump_brief_aux(&((const paragraph_t *)walk)->content, depth+1);
609 break;
610 case content_block:
611 content_dump_brief_aux(&((const block_t *)walk)->content, depth+1);
612 break;
613 case content_table:
614 {
615 const table_t *table = (const table_t *)walk;
616 int i, j, k;
617 k = 0;
618 for (j = 0; j < table->cells_num_y; j++)
619 {
620 for (i = 0; i < table->cells_num_x; i++)
621 {
622 content_dump_brief_aux(&table->cells[k]->content, depth+2);
623 k++;
624 }
625 }
626 break;
627 }
628 case content_image:
629 break;
630 default:
631 assert("Unexpected type found while dumping content list." == NULL);
632 break;
633 }
634 }
635 }
636
637 void content_dump_brief(const content_root_t *content)
638 {
639 content_dump_brief_aux(content, 0);
640 }
641
642 static content_t *
643 cmp_and_merge(content_t *q1, int q1pos, int len1, int n, content_cmp_fn *cmp)
644 {
645 int len2 = q1pos + len1*2; /* end of both lists assuming we don't overrun */
646 int p;
647 content_t *q2 = q1;
648
649 /* Don't overrun the end, and then convert to length from end. */
650 if (len2 > n)
651 len2 = n;
652 len2 -= q1pos + len1;
653
654 if (len2 <= 0)
655 len1 += len2;
656
657 /* Find the start of q2. We know this fits. */
658 for (p = 0; p < len1; p++)
659 q2 = q2->next;
660
661 if (len2 <= 0)
662 return q2;
663
664 /* So we have [q1..(q1+len1)) as the first list to merge, and
665 * [q2..q2+len2)) as the second list to merge. We know that
666 * q2 = q1+len1. So, if we can reduce len1 or len2 to 0, we have
667 * the lists sorted. */
668 while (1)
669 {
670 if (cmp(q1, q2) > 0)
671 {
672 /* q2 is smaller. q2 should be before q1. Move it. */
673 /* So:
674 * a<->q1<->c..d<->q2<->b => a<->q2<->q1<->c..d<->b
675 * (where c and d can either be the same, or can be q2 and q1!)
676 */
677 content_t *a = q1->prev;
678 content_t *b = q2->next;
679 content_t *d = q2->prev;
680 d->next = b;
681 b->prev = d;
682 a->next = q2;
683 q2->prev = a;
684 q2->next = q1;
685 q1->prev = q2;
686 /* Now advance q2 */
687 q2 = b;
688 len2--;
689 if (len2 == 0)
690 break;
691 } else {
692 /* Advance q1 */
693 q1 = q1->next;
694 len1--;
695 if (len1 == 0)
696 break;
697 }
698 }
699
700 while (len2)
701 {
702 q2 = q2->next;
703 len2--;
704 }
705
706 return q2;
707 }
708
709 /* Spiffy in-place merge sort. */
710 void content_sort(content_root_t *content, content_cmp_fn *cmp)
711 {
712 int n = content_count(content);
713 int size;
714
715 for (size = 1; size < n; size <<= 1)
716 {
717 int q1_idx = 0;
718 content_t *q1 = content->base.next;
719 for (q1_idx = 0; q1_idx < n; q1_idx += size*2)
720 q1 = cmp_and_merge(q1, q1_idx, size, n, cmp);
721 assert(q1->type == content_root);
722 }
723 assert(content_count(content) == n);
724 }