comparison mupdf-source/thirdparty/extract/src/document.h @ 2:b50eed0cc0ef upstream

ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4. The directory name has changed: no version number in the expanded directory now.
author Franz Glasner <fzglas.hg@dom66.de>
date Mon, 15 Sep 2025 11:43:07 +0200
parents
children
comparison
equal deleted inserted replaced
1:1d09e1dec1d9 2:b50eed0cc0ef
1 #ifndef ARTIFEX_EXTRACT_DOCUMENT_H
2 #define ARTIFEX_EXTRACT_DOCUMENT_H
3
4 #include "extract/extract.h"
5 #include "extract/alloc.h"
6
7 #include "compat_stdint.h"
8 #include <assert.h>
9
/* Forward declarations of the content node and structure types whose full
   struct definitions appear further down in this header. */
10 typedef struct span_t span_t;
11 typedef struct line_t line_t;
12 typedef struct paragraph_t paragraph_t;
13 typedef struct image_t image_t;
14 typedef struct table_t table_t;
15 typedef struct block_t block_t;
16 typedef struct structure_t structure_t;
17
/* Pi as a double. Declared 'static const' in a header, so every translation
   unit that includes this file gets its own private copy. */
18 static const double pi = 3.141592653589793;
19
20 /*
21 All content is stored as content_t nodes in a doubly linked-list.
22 The first node in the list is a 'content_root' node. The last
23 node in the list is the same node again.
24
25 Thus:
26 Every node in a list (including the root) has next and prev != NULL.
27 The root node in an empty list has next and prev pointing to itself.
28 Any non-root node with prev and next == NULL is not in a list.
29
30 Content nodes record a 'type' for the node. Each node is 'derived' in
31 an OO style from the basic content_t.
32
33 The different content types form a hierarchy:
34
35 A span is an array of "char_t"s (note, an array, NOT a content list).
36
37 Lines contain a content list, which should mostly consist of spans.
38
39 Paragraphs contain a content list, which should mostly consist of lines.
40
41 Image nodes contain details of a bitmap image.
42
43 Table nodes contain an array of cells, each of which contains a content
44 list that can contain any other type.
45
46 Blocks contain a content list consisting of paragraphs, tables and images.
47 Conceptually these represent a block of content on a page.
48 */
/* Tag stored in content_t.type that says which derived node type a
   content_t really is. content_root identifies the root node of a list
   rather than a piece of content; the iterators in this header use it to
   recognise the end of a list. */
49 typedef enum {
50 content_root,
51 content_span,
52 content_line,
53 content_paragraph,
54 content_image,
55 content_table,
56 content_block
57 } content_type_t;
58
/* Base struct placed as the first member ('base') of every content node
   type declared in this header, so a pointer to a derived node can be
   treated as a content_t * and vice versa once 'type' has been checked. */
59 typedef struct content_t {
60 /* The type field tells us what derived type we actually are. */
61 content_type_t type;
62
63 /* This holds us in the linked list of sibling content nodes. */
/* The list is doubly linked: prev is the preceding sibling, next the
   following one. */
64 struct content_t *prev;
65 struct content_t *next;
66 } content_t;
67
68 /* Initialise a content_t (just the base struct). */
/* NOTE(review): presumably records 'type' in content->type and leaves the
   node unlinked — confirm against the implementation. */
69 void content_init(content_t *content, content_type_t type);
70
71 /* Unlink a (non-root) content_t from any list. */
72 void content_unlink(content_t *content);
73
74 /* Unlink a span_t from any list. */
/* NOTE(review): looks like the span-typed counterpart of content_unlink()
   — confirm against the implementation. */
75 void content_unlink_span(span_t *span);
76
/* Root node of a content list. 'base' is the list node itself, with type
   content_root. 'parent' is the node that content_tree_iterator_next()
   climbs to when it reaches this root; a NULL parent ends that walk. */
77 typedef struct {
78 content_t base;
79 content_t *parent;
80 } content_root_t;
81
/* Initialise a list root.
   NOTE(review): presumably stores 'parent' in root->parent and leaves the
   list empty (root linked to itself) — confirm against the implementation. */
82 void content_init_root(content_root_t *root, content_t *parent);
83
84 /* Free all the content, from a (root) content_t. */
/* 'alloc' is the allocator handle passed through for the frees. */
85 void content_clear(extract_alloc_t* alloc, content_root_t *root);
86
/* Typed accessors for the first/last node of the list under 'root'.
   NOTE(review): what is returned for an empty list, or when the end node
   is of a different type, is decided by the implementation — confirm. */
87 span_t *content_first_span(const content_root_t *root);
88 span_t *content_last_span(const content_root_t *root);
89 line_t *content_first_line(const content_root_t *root);
90 line_t *content_last_line(const content_root_t *root);
91 paragraph_t *content_first_paragraph(const content_root_t *root);
92 paragraph_t *content_last_paragraph(const content_root_t *root);
93
/* Typed accessors for the neighbours of 'node' within its list.
   NOTE(review): behaviour at the ends of the list and for neighbours of a
   different type is decided by the implementation — confirm. */
94 span_t *content_next_span(const content_t *node);
95 span_t *content_prev_span(const content_t *node);
96 line_t *content_next_line(const content_t *node);
97 line_t *content_prev_line(const content_t *node);
98 paragraph_t *content_next_paragraph(const content_t *node);
99 paragraph_t *content_prev_paragraph(const content_t *node);
100
/* Counting helpers for the list under 'root': one general count plus one
   per node type.
   NOTE(review): presumably only direct members of the list are counted
   (no descent into child lists) — confirm against the implementation. */
101 int content_count(content_root_t *root);
102 int content_count_images(content_root_t *root);
103 int content_count_spans(content_root_t *root);
104 int content_count_lines(content_root_t *root);
105 int content_count_paragraphs(content_root_t *root);
106 int content_count_tables(content_root_t *root);
107
/* Constructors taking an allocator and an out-pointer for the new node.
   NOTE(review): judging by the signatures, each allocates a node and
   stores it through the out-pointer, with the int result presumably 0 on
   success and non-zero on failure — confirm against the implementation. */
108 int content_new_root(extract_alloc_t *alloc, content_root_t **proot);
109 int content_new_span(extract_alloc_t *alloc, span_t **pspan, structure_t *structure);
110 int content_new_line(extract_alloc_t *alloc, line_t **pline);
111 int content_new_paragraph(extract_alloc_t *alloc, paragraph_t **pparagraph);
112 int content_new_table(extract_alloc_t *alloc, table_t **ptable);
113 int content_new_block(extract_alloc_t *alloc, block_t **pblock);
114
/* As the constructors above, but additionally given the list 'root' that
   the new node is to be appended to. */
115 int content_append_new_span(extract_alloc_t* alloc, content_root_t *root, span_t **pspan, structure_t *structure);
116 int content_append_new_line(extract_alloc_t* alloc, content_root_t *root, line_t **pline);
117 int content_append_new_paragraph(extract_alloc_t* alloc, content_root_t *root, paragraph_t **pparagraph);
118 int content_append_new_image(extract_alloc_t* alloc, content_root_t *root, image_t **pimage);
119 int content_append_new_table(extract_alloc_t* alloc, content_root_t *root, table_t **ptable);
120 int content_append_new_block(extract_alloc_t* alloc, content_root_t *root, block_t **pblock);
121
/* Replacement helpers acting on a node 'current'.
   NOTE(review): what happens to 'current' afterwards (ownership, whether
   it is re-parented under the new node) is not visible from this header
   — confirm against the implementation. */
122 void content_replace(content_t *current, content_t *replacement);
123 int content_replace_new_line(extract_alloc_t* alloc, content_t *current, line_t **pline);
124 int content_replace_new_paragraph(extract_alloc_t* alloc, content_t *current, paragraph_t **pparagraph);
125 int content_replace_new_block(extract_alloc_t* alloc, content_t *current, block_t **pblock);
126
127
/* Append an existing node to the list under 'root'; the typed variants
   take the derived node type directly.
   NOTE(review): whether a node still linked into another list is unlinked
   first is decided by the implementation — confirm. */
128 void content_append(content_root_t *root, content_t *content);
129 void content_append_span(content_root_t *root, span_t *span);
130 void content_append_line(content_root_t *root, line_t *line);
131 void content_append_paragraph(content_root_t *root, paragraph_t *paragraph);
132 void content_append_table(content_root_t *root, table_t *table);
133 void content_append_block(content_root_t *root, block_t *block);
134
/* NOTE(review): presumably moves every node of 'src' onto the end of
   'dst', leaving 'src' empty — confirm against the implementation. */
135 void content_concat(content_root_t *dst, content_root_t *src);
136
/* Debug dump helpers. NOTE(review): the output destination and format are
   defined by the implementation. */
137 void content_dump(const content_root_t *content);
138 void content_dump_line(const line_t *line);
139 void content_dump_span(const span_t *span);
140 void content_dump_brief(const content_root_t *content);
141
142
/* Comparison callback type accepted by content_sort(): takes two content
   nodes and returns an int.
   NOTE(review): presumably negative/zero/positive in the usual qsort()
   manner — confirm against the implementation. */
143 typedef int (content_cmp_fn)(const content_t *, const content_t *);
144
/* Sort the list under 'content' with 'cmp' as the ordering callback. */
145 void content_sort(content_root_t *content, content_cmp_fn *cmp);
146
147 /* To iterate over the line elements of a content list:
148
149 content_line_iterator it;
150 line_t *line;
151
152 for(line = content_line_iterator_init(&it, content); line != NULL; line = content_line_iterator_next(&it))
153 {
154 }
155
156 */
157
/* State for iterating over the paragraph nodes of a single content list. */
158 typedef struct {
/* Root of the list being walked; arriving back at it ends the iteration. */
159 content_root_t *root;
/* Node that the next call to content_paragraph_iterator_next() examines. */
160 content_t *next;
161 } content_paragraph_iterator;
162
163 static inline paragraph_t *content_paragraph_iterator_next(content_paragraph_iterator *it)
164 {
165 content_t *next;
166
167 do {
168 next = it->next;
169 if (next == &it->root->base)
170 return NULL;
171 assert(next->type != content_root);
172 it->next = next->next;
173 } while (next->type != content_paragraph);
174
175 return (paragraph_t *)next;
176 }
177
178 static inline paragraph_t *content_paragraph_iterator_init(content_paragraph_iterator *it, content_root_t *root)
179 {
180 it->root = root;
181 it->next = root->base.next;
182
183 return content_paragraph_iterator_next(it);
184 }
185
/* State for iterating over the line nodes of a single content list. */
186 typedef struct {
/* Root of the list being walked; arriving back at it ends the iteration. */
187 content_root_t *root;
/* Node that the next call to content_line_iterator_next() examines. */
188 content_t *next;
189 } content_line_iterator;
190
191 static inline line_t *content_line_iterator_next(content_line_iterator *it)
192 {
193 content_t *next;
194
195 do {
196 next = it->next;
197 if (next == &it->root->base)
198 return NULL;
199 assert(next->type != content_root);
200 it->next = next->next;
201 } while (next->type != content_line);
202
203 return (line_t *)next;
204 }
205
206 static inline line_t *content_line_iterator_init(content_line_iterator *it, content_root_t *root)
207 {
208 it->root = root;
209 it->next = root->base.next;
210
211 return content_line_iterator_next(it);
212 }
213
/* State for iterating over the span nodes of a single content list. */
214 typedef struct {
/* Root of the list being walked; arriving back at it ends the iteration. */
215 content_root_t *root;
/* Node that the next call to content_span_iterator_next() examines. */
216 content_t *next;
217 } content_span_iterator;
218
219 static inline span_t *content_span_iterator_next(content_span_iterator *it)
220 {
221 content_t *next;
222
223 do {
224 next = it->next;
225 if (next == &it->root->base)
226 return NULL;
227 assert(next->type != content_root);
228 it->next = next->next;
229 } while (next->type != content_span);
230
231 return (span_t *)next;
232 }
233
234 static inline span_t *content_span_iterator_init(content_span_iterator *it, content_root_t *root)
235 {
236 it->root = root;
237 it->next = root->base.next;
238
239 return content_span_iterator_next(it);
240 }
241
/* State for iterating over the image nodes of a single content list. */
242 typedef struct {
/* Root of the list being walked; arriving back at it ends the iteration. */
243 content_root_t *root;
/* Node that the next call to content_image_iterator_next() examines. */
244 content_t *next;
245 } content_image_iterator;
246
247 static inline image_t *content_image_iterator_next(content_image_iterator *it)
248 {
249 content_t *next;
250
251 do {
252 next = it->next;
253 if (next == &it->root->base)
254 return NULL;
255 assert(next->type != content_root);
256 it->next = next->next;
257 } while (next->type != content_image);
258
259 return (image_t *)next;
260 }
261
262 static inline image_t *content_image_iterator_init(content_image_iterator *it, content_root_t *root)
263 {
264 it->root = root;
265 it->next = root->base.next;
266
267 return content_image_iterator_next(it);
268 }
269
/* State for iterating over the table nodes of a single content list. */
270 typedef struct {
/* Root of the list being walked; arriving back at it ends the iteration. */
271 content_root_t *root;
/* Node that the next call to content_table_iterator_next() examines. */
272 content_t *next;
273 } content_table_iterator;
274
275 static inline table_t *content_table_iterator_next(content_table_iterator *it)
276 {
277 content_t *next;
278
279 do {
280 next = it->next;
281 if (next == &it->root->base)
282 return NULL;
283 assert(next->type != content_root);
284 it->next = next->next;
285 } while (next->type != content_table);
286
287 return (table_t *)next;
288 }
289
290 static inline table_t *content_table_iterator_init(content_table_iterator *it, content_root_t *root)
291 {
292 it->root = root;
293 it->next = root->base.next;
294
295 return content_table_iterator_next(it);
296 }
297
/* State for iterating over every node of a single content list, whatever
   its type. */
298 typedef struct {
/* Root of the list being walked; arriving back at it ends the iteration. */
299 content_root_t *root;
/* Node that the next call to content_iterator_next() returns. */
300 content_t *next;
301 } content_iterator;
302
303 static inline content_t *content_iterator_next(content_iterator *it)
304 {
305 content_t *next = it->next;
306
307 if (next == &it->root->base)
308 return NULL;
309 assert(next->type != content_root);
310 it->next = next->next;
311
312 return next;
313 }
314
315 static inline content_t *content_iterator_init(content_iterator *it, content_root_t *root)
316 {
317 it->root = root;
318 it->next = root->base.next;
319
320 return content_iterator_next(it);
321 }
322
/* A 2D point with double-precision coordinates. */
323 typedef struct
324 {
325 double x;
326 double y;
327 } point_t;
328
/* Returns a string describing *point.
   NOTE(review): the lifetime of the returned string (e.g. a static buffer
   reused between calls) is not visible here — confirm before storing it. */
329 const char *extract_point_string(const point_t *point);
330
/* A rectangle described by two points, 'min' and 'max'. */
331 typedef struct
332 {
333 point_t min;
334 point_t max;
335 } rect_t;
336
/* Constant rectangles; only declared here, defined in another file. */
337 extern const rect_t extract_rect_infinite;
338 extern const rect_t extract_rect_empty;
339
/* The helpers below take and return rect_t by value.
   NOTE(review): their handling of empty or inverted rectangles is decided
   by the implementation — confirm there. */
340 rect_t extract_rect_intersect(rect_t a, rect_t b);
341
342 rect_t extract_rect_union(rect_t a, rect_t b);
343
344 rect_t extract_rect_union_point(rect_t a, point_t b);
345
/* NOTE(review): presumably non-zero when b lies within a — confirm. */
346 int extract_rect_contains_rect(rect_t a, rect_t b);
347
/* NOTE(review): presumably non-zero when a is a usable rectangle; the
   exact criterion is in the implementation. */
348 int extract_rect_valid(rect_t a);
349
/* Returns a string describing *rect.
   NOTE(review): lifetime of the returned string is not visible here. */
350 const char *extract_rect_string(const rect_t *rect);
351
/* Six-coefficient matrix, members a..f.
   NOTE(review): presumably a 2D affine transform with (e, f) as the
   translation part — confirm against the code that uses it. */
352 typedef struct
353 {
354 double a;
355 double b;
356 double c;
357 double d;
358 double e;
359 double f;
360 } matrix_t;
361
/* Four-coefficient matrix, members a..d: the same first four members as
   matrix_t, with no e/f. */
362 typedef struct
363 {
364 double a;
365 double b;
366 double c;
367 double d;
368 } matrix4_t;
369
/* Return a string describing the given matrix.
   NOTE(review): lifetime of the returned string is not visible here. */
370 const char *extract_matrix_string(const matrix_t *matrix);
371 const char *extract_matrix4_string(const matrix4_t *matrix);
372
373 /* Returns a*d - b*c. */
374 double extract_matrix_expansion(matrix4_t m);
375
376 /* Returns the inverse of a matrix (or identity for degenerate). */
377 matrix4_t extract_matrix4_invert(const matrix4_t *ctm);
378
/* Transform and multiply helpers; all arguments and results are passed by
   value. NOTE(review): the multiplication order (m1*m2 versus m2*m1) is
   decided by the implementation — confirm before relying on it. */
379 point_t extract_matrix4_transform_point(matrix4_t m, point_t p);
380 point_t extract_matrix4_transform_xy(matrix4_t m, double x, double y);
381 matrix_t extract_multiply_matrix_matrix(matrix_t m1, matrix_t m2);
382 matrix4_t extract_multiply_matrix4_matrix4(matrix4_t m1, matrix4_t m2);
383
384 /* Returns zero if first four members of *lhs and *rhs are equal, otherwise
385 +/-1. */
386 int extract_matrix4_cmp(const matrix4_t *lhs, const matrix4_t *rhs);
387
388 /* A single char in a span. */
389 typedef struct
390 {
391 /* (x,y) after transformation by ctm. */
392 double x;
393 double y;
394
/* NOTE(review): presumably the character's Unicode code point — confirm. */
395 unsigned ucs;
396 double adv; /* Advance, before transform by ctm */
397
/* NOTE(review): presumably the bounding box of the glyph — confirm which
   coordinate space it is in. */
398 rect_t bbox;
399 } char_t;
400
401 /* List of chars that have same font and are usually adjacent. */
402 struct span_t
403 {
/* List linkage and type tag; must stay first so a span_t * can be used as
   a content_t *. */
404 content_t base;
405 matrix4_t ctm;
406 char *font_name;
407 rect_t font_bbox;
408 structure_t *structure;
409
/* One-bit flags. NOTE(review): wmode is presumably the writing mode
   (horizontal/vertical) — confirm. */
410 struct {
411 unsigned font_bold : 1;
412 unsigned font_italic : 1;
413 unsigned wmode : 1;
414 } flags;
415
/* The span's characters: a plain array (not a content list) together with
   its element count. */
416 char_t *chars;
417 int chars_num;
418 };
419
/* Initialise a span_t. NOTE(review): presumably stores 'structure' in
   span->structure — confirm against the implementation. */
420 void extract_span_init(span_t *span, structure_t *structure);
421
422 /* Frees a span_t, returning with *pspan set to NULL. */
423 void extract_span_free(extract_alloc_t *alloc, span_t **pspan);
424
425 /* Returns last character in span. */
/* NOTE(review): behaviour for a span with no chars is not visible here. */
426 char_t *extract_span_char_last(span_t *span);
427
428 /* Appends new char_t to a span_t with .ucs=c and all other
429 fields zeroed. Returns pointer to new char_t record, or NULL if allocation
430 failed. */
431 char_t *extract_span_append_c(extract_alloc_t *alloc, span_t *span, int c);
432
433 /* Returns static string containing info about span_t. */
434 const char *extract_span_string(extract_alloc_t *alloc, span_t *span);
435
436 /* List of spans that are aligned on same line. */
437 struct line_t
438 {
/* List linkage and type tag; must stay first so a line_t * can be used as
   a content_t *. */
439 content_t base;
440 double ascender;
441 double descender;
/* Root of the child list holding this line's content. */
442 content_root_t content;
443 };
444
/* Initialise a line_t. NOTE(review): details are in the implementation. */
445 void extract_line_init(line_t *line);
446
/* Free a line_t. NOTE(review): presumably also frees the line's content
   and sets *pline to NULL, as extract_span_free() does for spans —
   confirm. */
447 void extract_line_free(extract_alloc_t* alloc, line_t **pline);
448
449 /* Returns first span in a line. */
450 span_t *extract_line_span_first(line_t *line);
451
452 /* Returns last span in a line. */
453 span_t *extract_line_span_last(line_t *line);
454
455 /* List of lines that are aligned and adjacent to each other so as to form a
456 paragraph. */
457 struct paragraph_t
458 {
/* List linkage and type tag; must stay first so a paragraph_t * can be
   used as a content_t *. */
459 content_t base;
/* NOTE(review): presumably a bitwise OR of paragraph_flags values —
   confirm against the code that sets it. */
460 int line_flags;
/* Root of the child list holding this paragraph's content. */
461 content_root_t content;
462 };
463
/* Bit flags describing a paragraph's observed layout. Each value is a
   distinct power of two, so they can be combined with bitwise OR. Every
   flag records that something was seen at least once, so flags are only
   ever set, never cleared, as more lines are examined. */
464 typedef enum
465 {
466 /* If the paragraph is ever not aligned to the left hand edge, we set this flag. */
467 paragraph_not_aligned_left = 1,
468
469 /* If the paragraph is ever not aligned to the right hand edge, we set this flag. */
470 paragraph_not_aligned_right = 2,
471
472 /* If the paragraph ever has a line that doesn't look centred, we set this flag. */
473 paragraph_not_centred = 4,
474
475 /* If the paragraph ever has a line that doesn't look fully justified, we set this flag. */
476 paragraph_not_fully_justified = 8,
477
478 /* If the paragraph ever breaks at a place where it looks like first word from the
479 * next line could have fitted, then set this flag.*/
480 paragraph_breaks_strangely = 16
481 } paragraph_flags;
482
/* Initialise a paragraph_t. NOTE(review): details are in the
   implementation. */
483 void extract_paragraph_init(paragraph_t *paragraph);
484
/* Free a paragraph_t. NOTE(review): presumably also frees its content and
   sets *pparagraph to NULL — confirm against the implementation. */
485 void extract_paragraph_free(extract_alloc_t *alloc, paragraph_t **pparagraph);
486
487 /* List of content that we believe should be treated as a whole. */
488 struct block_t
489 {
/* List linkage and type tag; must stay first so a block_t * can be used
   as a content_t *. */
490 content_t base;
/* Root of the child list holding this block's content. */
491 content_root_t content;
492 };
493
/* Initialise a block_t. NOTE(review): details are in the implementation. */
494 void extract_block_init(block_t *block);
495
/* Free a block_t. NOTE(review): presumably also frees its content and
   sets *pblock to NULL — confirm against the implementation. */
496 void extract_block_free(extract_alloc_t *alloc, block_t **pblock);
497
498
499
500 /* Information about an image. <type> is as passed to extract_add_image();
501 <name> and <id> are created to be unique identifiers for use in generated docx
502 file. */
503 struct image_t
504 {
/* List linkage and type tag; must stay first so an image_t * can be used
   as a content_t *. */
505 content_t base;
506 char *type; /* jpg, png etc. */
507 char *name; /* Name of image file within docx. */
508 char *id; /* ID of image within docx. */
/* NOTE(review): presumably the image's position (x, y) and size (w, h) on
   the page — confirm units and origin. */
509 double x;
510 double y;
511 double w;
512 double h;
/* Image data buffer and its size in bytes. */
513 void *data;
514 size_t data_size;
515
/* NOTE(review): presumably the callback, and the handle passed to it,
   used to release 'data' — confirm against extract_image_clear(). */
516 extract_image_data_free *data_free;
517 void *data_free_handle;
518 };
519
/* Initialise an image_t. NOTE(review): details are in the implementation. */
520 void extract_image_init(image_t *image);
521
/* NOTE(review): presumably releases what the image owns without freeing
   the image_t itself — confirm against the implementation. */
522 void extract_image_clear(extract_alloc_t *alloc, image_t *image);
523
/* Free an image_t. NOTE(review): presumably sets *pimage to NULL. */
524 void extract_image_free(extract_alloc_t *alloc, image_t **pimage);
525
526 /* A line that is part of a table. */
527 typedef struct
528 {
/* NOTE(review): a single float; presumably a grey level — confirm. */
529 float color;
/* The line is stored as a rectangle. */
530 rect_t rect;
531 } tableline_t;
532
/* An array of tableline_t together with its element count. */
533 typedef struct
534 {
535 tableline_t *tablelines;
536 int tablelines_num;
537 } tablelines_t;
538
539
540 /* A cell within a table. */
/* Unlike the content node types, cell_t has no content_t base, so a cell
   is not itself a member of a content list. */
541 typedef struct
542 {
543 rect_t rect;
544
545 /* If left/above is true, this cell is not obscured by cell to its
546 * left/above. */
547 uint8_t left;
548 uint8_t above;
549
550 /* extend_right and extend_down are 1 for normal cells, 2 for cells which
551 * extend right/down to cover an additional column/row, 3 to cover two
552 * additional columns/rows etc. */
553 int extend_right;
554 int extend_down;
555
556 /* Contents of this cell. */
557 content_root_t content;
558 } cell_t;
559
/* Initialise / free a cell, and initialise a table.
   NOTE(review): extract_cell_free() presumably sets *pcell to NULL —
   confirm against the implementation. */
560 void extract_cell_init(cell_t *cell);
561 void extract_cell_free(extract_alloc_t *alloc, cell_t **pcell);
562 void extract_table_init(table_t *table);
563
/* A table: a content node holding a grid of cells. */
564 struct table_t
565 {
/* List linkage and type tag; must stay first so a table_t * can be used
   as a content_t *. */
566 content_t base;
567 point_t pos; /* top-left. */
568
569 /* Array of cells_num_x*cells_num_y cells; cell (x, y) is:
570 * cells_num_x * y + x.
571 */
/* Each element is a pointer to a separately held cell_t. */
572 cell_t **cells;
573 int cells_num_x;
574 int cells_num_y;
575 };
576
/* Free a table_t. NOTE(review): presumably frees the cells too and sets
   *ptable to NULL — confirm against the implementation. */
577 void extract_table_free(extract_alloc_t *alloc, table_t **ptable);
578
/* Kind of split recorded in a split_t. */
579 typedef enum
580 {
581 SPLIT_NONE = 0,
582 SPLIT_HORIZONTAL,
583 SPLIT_VERTICAL
584 } split_type_t;
585
586
/* A node in a tree of splits; 'split' holds pointers to child split_t
   nodes. */
587 typedef struct split_t
588 {
589 split_type_t type;
590 double weight;
591 int count;
/* Declared with one element. NOTE(review): presumably over-allocated by
   extract_split_alloc() so that 'count' entries are usable — confirm
   before indexing past split[0]. */
592 struct split_t *split[1];
593 } split_t;
594
/* A node in the document structure tree, linked to its parent, to its
   siblings and to its children. */
595 struct structure_t
596 {
597 structure_t *parent;
598 structure_t *sibling_next;
599 structure_t *sibling_prev;
600 structure_t *kids_first;
/* NOTE(review): presumably points at the pointer slot where the next
   child is to be linked (tail-insertion) — confirm. */
601 structure_t **kids_tail;
602 int uid;
603 extract_struct_t type;
604 int score;
605 };
606
607 /* A subpage. Contains different representations of the list of spans. */
608 typedef struct
609 {
610 rect_t mediabox;
611
612 int images_num;
613
614 /* All the content on the page. */
615 content_root_t content;
616
/* Table lines, kept in separate collections for the horizontal and the
   vertical direction. */
617 tablelines_t tablelines_horizontal;
618 tablelines_t tablelines_vertical;
619
/* A second content list. NOTE(review): presumably holds the table_t nodes
   found on this subpage — confirm. */
620 content_root_t tables;
621 } subpage_t;
622
623
624 /* A page. Contains a list of subpages. NB not
625 called page_t because this clashes with a system type on hpux. */
626 typedef struct
627 {
628 rect_t mediabox;
629
/* Array of pointers to subpages, with its element count. */
630 subpage_t **subpages;
631 int subpages_num;
632
/* NOTE(review): presumably the root of the split tree for this page, as
   built with extract_split_alloc() — confirm. */
633 split_t *split;
634 } extract_page_t;
635
636
637 /* A list of pages. */
638 typedef struct
639 {
/* Array of pointers to pages, with its element count. */
640 extract_page_t **pages;
641 int pages_num;
642
643 /* All the structure for the document. */
644 structure_t *structure;
645
646 /* During construction, current points to the current point
647 * within the structure tree where things should be added. */
648 structure_t *current;
649 } document_t;
650
651
/* A collection of images plus a collection of image type strings, each
   stored as an array of pointers with its own element count. */
652 typedef struct
653 {
654 image_t **images;
655 int images_num;
656 char **imagetypes;
657 int imagetypes_num;
658 } images_t;
659
660
661 /* This does all the work of finding paragraphs and tables. */
/* NOTE(review): the int result is presumably 0 on success and non-zero on
   failure; the meaning of 'layout_analysis' and 'master_space_guess' is
   defined by the implementation — confirm. */
662 int extract_document_join(extract_alloc_t *alloc, document_t *document, int layout_analysis, double master_space_guess);
663
/* NOTE(review): presumably derives a font size from the scale of *ctm —
   confirm against the implementation. */
664 double extract_font_size(matrix4_t *ctm);
665
666 /* Things below here are used when generating output. */
667
668 /* Basic information about current font. */
669 typedef struct
670 {
671 char *name;
672 double size;
/* Stored as ints; NOTE(review): presumably used as booleans — confirm. */
673 int bold;
674 int italic;
675 } font_t;
676
677 /* Used to keep track of font information when writing paragraphs of odt
678 content, e.g. so we know whether a font has changed so need to start a new odt
679 span. */
680 typedef struct
681 {
682 font_t font;
/* NOTE(review): presumably the ctm of the previously written span; held
   by pointer, so it must remain valid while referenced — confirm. */
683 matrix4_t *ctm_prev;
684 } content_state_t;
685
686 /* Analyse page content for layouts. */
/* NOTE(review): int result presumably 0 on success, non-zero on failure. */
687 int extract_page_analyse(extract_alloc_t *alloc, extract_page_t *page);
688
689 /* subpage_t constructor. */
/* Note that the allocator parameter is named 'extract' here rather than
   'alloc' as in the other prototypes. NOTE(review): presumably the new
   subpage is returned through *psubpage and attached to 'page' — confirm. */
690 int extract_subpage_alloc(extract_alloc_t *extract, rect_t mediabox, extract_page_t *page, subpage_t **psubpage);
691
692 /* subpage_t destructor. */
693 void extract_subpage_free(extract_alloc_t *alloc, subpage_t **psubpage);
694
695 /* Allocate a split_t. */
/* NOTE(review): 'count' presumably sizes the split[] array of the new
   node, which is returned through *psplit — confirm. */
696 int extract_split_alloc(extract_alloc_t *alloc, split_type_t type, int count, split_t **psplit);
697
/* Free a split_t. NOTE(review): presumably frees child splits too and
   sets *psplit to NULL — confirm against the implementation. */
698 void extract_split_free(extract_alloc_t *alloc, split_t **psplit);
699
/* State for walking a whole content tree, descending into child lists. */
700 typedef struct {
/* Root the walk was started from. content_tree_iterator_init() stores it;
   content_tree_iterator_next() does not read it. */
701 content_root_t *root;
/* Node that the next call to content_tree_iterator_next() examines. */
702 content_t *next;
703 } content_tree_iterator;
704
705 static inline content_t *content_tree_iterator_next(content_tree_iterator *it)
706 {
707 content_t *next = it->next;
708
709 while (next->type == content_root)
710 {
711 content_t *parent = ((content_root_t *)next)->parent;
712 if (parent == NULL)
713 return NULL;
714 next = parent->next;
715 }
716 assert(next->type != content_root);
717
718 switch (next->type)
719 {
720 default:
721 case content_root:
722 assert("Never happens!" == NULL);
723 break;
724 case content_span:
725 it->next = next->next;
726 break;
727 case content_line:
728 it->next = ((line_t *)next)->content.base.next;
729 break;
730 case content_paragraph:
731 it->next = ((paragraph_t *)next)->content.base.next;
732 break;
733 }
734
735 return next;
736 }
737
738 static inline content_t *content_tree_iterator_init(content_tree_iterator *it, content_root_t *root)
739 {
740 it->root = root;
741 it->next = root->base.next;
742
743 return content_tree_iterator_next(it);
744 }
745
746 /* Some helper functions */
747
748 /* Return a span_t * pointer to the first element in a content list. */
749 static inline span_t *content_head_as_span(content_root_t *root)
750 {
751 assert(root != NULL && root->base.type == content_root && (root->base.next == NULL || root->base.next->type == content_span));
752 return (span_t *)root->base.next;
753 }
754
755 /* Return a point for the post-advance position of a char in a given span. */
/* 'char_' is the character in question (the trailing underscore avoids
   the 'char' keyword). NOTE(review): presumably it must belong to 'span'
   — confirm. */
756 point_t extract_predicted_end_of_char(char_t *char_, const span_t *span);
757
758 /* Return a point for the post-advance position of the final char in a given span. */
/* NOTE(review): behaviour for a span with no chars is not visible here. */
759 point_t extract_end_of_span(const span_t *span);
760
761 /* Return the bounds for a block before it was rotated around its origin. */
/* NOTE(review): the unit of 'rotate' (radians or degrees) is not stated
   in this header — confirm against the implementation. */
762 rect_t extract_block_pre_rotation_bounds(block_t *block, double rotate);
763
/* NOTE(review): presumably the angle of the text baseline implied by
   *ctm; unit and range are defined by the implementation — confirm. */
764 double extract_baseline_angle(const matrix4_t *ctm);
765
766 #endif