Mercurial > hgrepos > Python2 > PyMuPDF
view mupdf-source/thirdparty/extract/src/document.h @ 42:4621bd954a09
FIX: Need packaging at runtime because of the parsing of mupdf_version into a tuple.
In the merge it was removed erroneously.
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Sat, 11 Oct 2025 17:14:12 +0200 |
| parents | b50eed0cc0ef |
| children |
line wrap: on
line source
#ifndef ARTIFEX_EXTRACT_DOCUMENT_H #define ARTIFEX_EXTRACT_DOCUMENT_H #include "extract/extract.h" #include "extract/alloc.h" #include "compat_stdint.h" #include <assert.h> typedef struct span_t span_t; typedef struct line_t line_t; typedef struct paragraph_t paragraph_t; typedef struct image_t image_t; typedef struct table_t table_t; typedef struct block_t block_t; typedef struct structure_t structure_t; static const double pi = 3.141592653589793; /* All content is stored as content_t nodes in a doubly linked-list. The first node in the list is a 'content_root' node. The last node in the list is the same node again. Thus: Every node in a list (including the root) has next and prev != NULL. The root node in an empty list has next and prev pointing to itself. Any non-root node with prev and next == NULL is not in a list. Content nodes record a 'type' for the node. Each node is 'derived' in an OO style from the basic content_t. The different content types form a heirarchy: A spans is an array of "char_t"s (note, an array, NOT a content list). Lines contain a content list, which should mostly consist of spans. Paragraphs contain a content list, which should mostly consist of lines. Image nodes contains details of a bitmap image. Table nodes contain an array of cells, each of which contains a content list that can contain any other type. Blocks contain a content list consisting of paragraphs, tables and images. Conceptually these represent a block of content on a page. */ typedef enum { content_root, content_span, content_line, content_paragraph, content_image, content_table, content_block } content_type_t; typedef struct content_t { /* The type field tells us what derived type we actually are. */ content_type_t type; /* This holds us in the linked list of sibling content nodes. */ struct content_t *prev; struct content_t *next; } content_t; /* Initialise a content_t (just the base struct). 
*/ void content_init(content_t *content, content_type_t type); /* Unlink a (non-root) content_t from any list. */ void content_unlink(content_t *content); /* Unlink a span_t from any list. */ void content_unlink_span(span_t *span); typedef struct { content_t base; content_t *parent; } content_root_t; void content_init_root(content_root_t *root, content_t *parent); /* Free all the content, from a (root) content_t. */ void content_clear(extract_alloc_t* alloc, content_root_t *root); span_t *content_first_span(const content_root_t *root); span_t *content_last_span(const content_root_t *root); line_t *content_first_line(const content_root_t *root); line_t *content_last_line(const content_root_t *root); paragraph_t *content_first_paragraph(const content_root_t *root); paragraph_t *content_last_paragraph(const content_root_t *root); span_t *content_next_span(const content_t *node); span_t *content_prev_span(const content_t *node); line_t *content_next_line(const content_t *node); line_t *content_prev_line(const content_t *node); paragraph_t *content_next_paragraph(const content_t *node); paragraph_t *content_prev_paragraph(const content_t *node); int content_count(content_root_t *root); int content_count_images(content_root_t *root); int content_count_spans(content_root_t *root); int content_count_lines(content_root_t *root); int content_count_paragraphs(content_root_t *root); int content_count_tables(content_root_t *root); int content_new_root(extract_alloc_t *alloc, content_root_t **proot); int content_new_span(extract_alloc_t *alloc, span_t **pspan, structure_t *structure); int content_new_line(extract_alloc_t *alloc, line_t **pline); int content_new_paragraph(extract_alloc_t *alloc, paragraph_t **pparagraph); int content_new_table(extract_alloc_t *alloc, table_t **ptable); int content_new_block(extract_alloc_t *alloc, block_t **pblock); int content_append_new_span(extract_alloc_t* alloc, content_root_t *root, span_t **pspan, structure_t *structure); int 
content_append_new_line(extract_alloc_t* alloc, content_root_t *root, line_t **pline); int content_append_new_paragraph(extract_alloc_t* alloc, content_root_t *root, paragraph_t **pparagraph); int content_append_new_image(extract_alloc_t* alloc, content_root_t *root, image_t **pimage); int content_append_new_table(extract_alloc_t* alloc, content_root_t *root, table_t **ptable); int content_append_new_block(extract_alloc_t* alloc, content_root_t *root, block_t **pblock); void content_replace(content_t *current, content_t *replacement); int content_replace_new_line(extract_alloc_t* alloc, content_t *current, line_t **pline); int content_replace_new_paragraph(extract_alloc_t* alloc, content_t *current, paragraph_t **pparagraph); int content_replace_new_block(extract_alloc_t* alloc, content_t *current, block_t **pblock); void content_append(content_root_t *root, content_t *content); void content_append_span(content_root_t *root, span_t *span); void content_append_line(content_root_t *root, line_t *line); void content_append_paragraph(content_root_t *root, paragraph_t *paragraph); void content_append_table(content_root_t *root, table_t *table); void content_append_block(content_root_t *root, block_t *block); void content_concat(content_root_t *dst, content_root_t *src); void content_dump(const content_root_t *content); void content_dump_line(const line_t *line); void content_dump_span(const span_t *span); void content_dump_brief(const content_root_t *content); typedef int (content_cmp_fn)(const content_t *, const content_t *); void content_sort(content_root_t *content, content_cmp_fn *cmp); /* To iterate over the line elements of a content list: content_line_iterator it; line_t *line; for(line = content_line_iterator_line_init(&it, content); line != NULL; line = content_line_iterator_next(&it)) { } */ typedef struct { content_root_t *root; content_t *next; } content_paragraph_iterator; static inline paragraph_t *content_paragraph_iterator_next(content_paragraph_iterator 
*it) { content_t *next; do { next = it->next; if (next == &it->root->base) return NULL; assert(next->type != content_root); it->next = next->next; } while (next->type != content_paragraph); return (paragraph_t *)next; } static inline paragraph_t *content_paragraph_iterator_init(content_paragraph_iterator *it, content_root_t *root) { it->root = root; it->next = root->base.next; return content_paragraph_iterator_next(it); } typedef struct { content_root_t *root; content_t *next; } content_line_iterator; static inline line_t *content_line_iterator_next(content_line_iterator *it) { content_t *next; do { next = it->next; if (next == &it->root->base) return NULL; assert(next->type != content_root); it->next = next->next; } while (next->type != content_line); return (line_t *)next; } static inline line_t *content_line_iterator_init(content_line_iterator *it, content_root_t *root) { it->root = root; it->next = root->base.next; return content_line_iterator_next(it); } typedef struct { content_root_t *root; content_t *next; } content_span_iterator; static inline span_t *content_span_iterator_next(content_span_iterator *it) { content_t *next; do { next = it->next; if (next == &it->root->base) return NULL; assert(next->type != content_root); it->next = next->next; } while (next->type != content_span); return (span_t *)next; } static inline span_t *content_span_iterator_init(content_span_iterator *it, content_root_t *root) { it->root = root; it->next = root->base.next; return content_span_iterator_next(it); } typedef struct { content_root_t *root; content_t *next; } content_image_iterator; static inline image_t *content_image_iterator_next(content_image_iterator *it) { content_t *next; do { next = it->next; if (next == &it->root->base) return NULL; assert(next->type != content_root); it->next = next->next; } while (next->type != content_image); return (image_t *)next; } static inline image_t *content_image_iterator_init(content_image_iterator *it, content_root_t *root) { 
it->root = root; it->next = root->base.next; return content_image_iterator_next(it); } typedef struct { content_root_t *root; content_t *next; } content_table_iterator; static inline table_t *content_table_iterator_next(content_table_iterator *it) { content_t *next; do { next = it->next; if (next == &it->root->base) return NULL; assert(next->type != content_root); it->next = next->next; } while (next->type != content_table); return (table_t *)next; } static inline table_t *content_table_iterator_init(content_table_iterator *it, content_root_t *root) { it->root = root; it->next = root->base.next; return content_table_iterator_next(it); } typedef struct { content_root_t *root; content_t *next; } content_iterator; static inline content_t *content_iterator_next(content_iterator *it) { content_t *next = it->next; if (next == &it->root->base) return NULL; assert(next->type != content_root); it->next = next->next; return next; } static inline content_t *content_iterator_init(content_iterator *it, content_root_t *root) { it->root = root; it->next = root->base.next; return content_iterator_next(it); } typedef struct { double x; double y; } point_t; const char *extract_point_string(const point_t *point); typedef struct { point_t min; point_t max; } rect_t; extern const rect_t extract_rect_infinite; extern const rect_t extract_rect_empty; rect_t extract_rect_intersect(rect_t a, rect_t b); rect_t extract_rect_union(rect_t a, rect_t b); rect_t extract_rect_union_point(rect_t a, point_t b); int extract_rect_contains_rect(rect_t a, rect_t b); int extract_rect_valid(rect_t a); const char *extract_rect_string(const rect_t *rect); typedef struct { double a; double b; double c; double d; double e; double f; } matrix_t; typedef struct { double a; double b; double c; double d; } matrix4_t; const char *extract_matrix_string(const matrix_t *matrix); const char *extract_matrix4_string(const matrix4_t *matrix); /* Returns a*d - b*c. 
*/ double extract_matrix_expansion(matrix4_t m); /* Returns the inverse of a matrix (or identity for degenerate). */ matrix4_t extract_matrix4_invert(const matrix4_t *ctm); point_t extract_matrix4_transform_point(matrix4_t m, point_t p); point_t extract_matrix4_transform_xy(matrix4_t m, double x, double y); matrix_t extract_multiply_matrix_matrix(matrix_t m1, matrix_t m2); matrix4_t extract_multiply_matrix4_matrix4(matrix4_t m1, matrix4_t m2); /* Returns zero if first four members of *lhs and *rhs are equal, otherwise +/-1. */ int extract_matrix4_cmp(const matrix4_t *lhs, const matrix4_t *rhs); /* A single char in a span. */ typedef struct { /* (x,y) after transformation by ctm. */ double x; double y; unsigned ucs; double adv; /* Advance, before transform by ctm */ rect_t bbox; } char_t; /* List of chars that have same font and are usually adjacent. */ struct span_t { content_t base; matrix4_t ctm; char *font_name; rect_t font_bbox; structure_t *structure; struct { unsigned font_bold : 1; unsigned font_italic : 1; unsigned wmode : 1; } flags; char_t *chars; int chars_num; }; void extract_span_init(span_t *span, structure_t *structure); /* Frees a span_t, returning with *pspan set to NULL. */ void extract_span_free(extract_alloc_t *alloc, span_t **pspan); /* Returns last character in span. */ char_t *extract_span_char_last(span_t *span); /* Appends new char_t to an span_t with .ucs=c and all other fields zeroed. Returns pointer to new char_t record, or NULL if allocation failed. */ char_t *extract_span_append_c(extract_alloc_t *alloc, span_t *span, int c); /* Returns static string containing info about span_t. */ const char *extract_span_string(extract_alloc_t *alloc, span_t *span); /* List of spans that are aligned on same line. */ struct line_t { content_t base; double ascender; double descender; content_root_t content; }; void extract_line_init(line_t *line); void extract_line_free(extract_alloc_t* alloc, line_t **pline); /* Returns first span in a line. 
*/ span_t *extract_line_span_first(line_t *line); /* Returns last span in a line. */ span_t *extract_line_span_last(line_t *line); /* List of lines that are aligned and adjacent to each other so as to form a paragraph. */ struct paragraph_t { content_t base; int line_flags; content_root_t content; }; typedef enum { /* If the paragraph is ever not aligned to the left hand edge, we set this flag. */ paragraph_not_aligned_left = 1, /* If the paragraph is ever not aligned to the right hand edge, we set this flag. */ paragraph_not_aligned_right = 2, /* If the paragraph ever has a line that doesn't look centred, we set this flag. */ paragraph_not_centred = 4, /* If the paragraph ever has a line that doesn't look fully justified, we set this flag. */ paragraph_not_fully_justified = 8, /* If the paragraph ever breaks at a place where it looks like first word from the * next line could have fitted, then set this flag.*/ paragraph_breaks_strangely = 16 } paragraph_flags; void extract_paragraph_init(paragraph_t *paragraph); void extract_paragraph_free(extract_alloc_t *alloc, paragraph_t **pparagraph); /* List of content that we believe should be treated as a whole. */ struct block_t { content_t base; content_root_t content; }; void extract_block_init(block_t *block); void extract_block_free(extract_alloc_t *alloc, block_t **pblock); /* Information about an image. <type> is as passed to extract_add_image(); <name> and <id> are created to be unique identifiers for use in generated docx file. */ struct image_t { content_t base; char *type; /* jpg, png etc. */ char *name; /* Name of image file within docx. */ char *id; /* ID of image within docx. 
*/ double x; double y; double w; double h; void *data; size_t data_size; extract_image_data_free *data_free; void *data_free_handle; }; void extract_image_init(image_t *image); void extract_image_clear(extract_alloc_t *alloc, image_t *image); void extract_image_free(extract_alloc_t *alloc, image_t **pimage); /* A line that is part of a table. */ typedef struct { float color; rect_t rect; } tableline_t; typedef struct { tableline_t *tablelines; int tablelines_num; } tablelines_t; /* A cell within a table. */ typedef struct { rect_t rect; /* If left/above is true, this cell is not obscured by cell to its * left/above. */ uint8_t left; uint8_t above; /* extend_right and extend_down are 1 for normal cells, 2 for cells which * extend right/down to cover an additional column/row, 3 to cover two * additional columns/rows etc. */ int extend_right; int extend_down; /* Contents of this cell. */ content_root_t content; } cell_t; void extract_cell_init(cell_t *cell); void extract_cell_free(extract_alloc_t *alloc, cell_t **pcell); void extract_table_init(table_t *table); struct table_t { content_t base; point_t pos; /* top-left. */ /* Array of cells_num_x*cells_num_y cells; cell (x, y) is: * cells_num_x * y + x. */ cell_t **cells; int cells_num_x; int cells_num_y; }; void extract_table_free(extract_alloc_t *alloc, table_t **ptable); typedef enum { SPLIT_NONE = 0, SPLIT_HORIZONTAL, SPLIT_VERTICAL } split_type_t; typedef struct split_t { split_type_t type; double weight; int count; struct split_t *split[1]; } split_t; struct structure_t { structure_t *parent; structure_t *sibling_next; structure_t *sibling_prev; structure_t *kids_first; structure_t **kids_tail; int uid; extract_struct_t type; int score; }; /* A subpage. Contains different representations of the list of spans. */ typedef struct { rect_t mediabox; int images_num; /* All the content on the page. 
*/ content_root_t content; tablelines_t tablelines_horizontal; tablelines_t tablelines_vertical; content_root_t tables; } subpage_t; /* A page. Contains a list of subpages. NB not called page_t because this clashes with a system type on hpux. */ typedef struct { rect_t mediabox; subpage_t **subpages; int subpages_num; split_t *split; } extract_page_t; /* A list of pages. */ typedef struct { extract_page_t **pages; int pages_num; /* All the structure for the document. */ structure_t *structure; /* During construction, current points to the current point * within the structure tree where things should be added. */ structure_t *current; } document_t; typedef struct { image_t **images; int images_num; char **imagetypes; int imagetypes_num; } images_t; /* This does all the work of finding paragraphs and tables. */ int extract_document_join(extract_alloc_t *alloc, document_t *document, int layout_analysis, double master_space_guess); double extract_font_size(matrix4_t *ctm); /* Things below here are used when generating output. */ /* Basic information about current font. */ typedef struct { char *name; double size; int bold; int italic; } font_t; /* Used to keep track of font information when writing paragraphs of odt content, e.g. so we know whether a font has changed so need to start a new odt span. */ typedef struct { font_t font; matrix4_t *ctm_prev; } content_state_t; /* Analyse page content for layouts. */ int extract_page_analyse(extract_alloc_t *alloc, extract_page_t *page); /* subpage_t constructor. */ int extract_subpage_alloc(extract_alloc_t *extract, rect_t mediabox, extract_page_t *page, subpage_t **psubpage); /* subpage_t destructor. */ void extract_subpage_free(extract_alloc_t *alloc, subpage_t **psubpage); /* Allocate a split_t. 
*/ int extract_split_alloc(extract_alloc_t *alloc, split_type_t type, int count, split_t **psplit); void extract_split_free(extract_alloc_t *alloc, split_t **psplit); typedef struct { content_root_t *root; content_t *next; } content_tree_iterator; static inline content_t *content_tree_iterator_next(content_tree_iterator *it) { content_t *next = it->next; while (next->type == content_root) { content_t *parent = ((content_root_t *)next)->parent; if (parent == NULL) return NULL; next = parent->next; } assert(next->type != content_root); switch (next->type) { default: case content_root: assert("Never happens!" == NULL); break; case content_span: it->next = next->next; break; case content_line: it->next = ((line_t *)next)->content.base.next; break; case content_paragraph: it->next = ((paragraph_t *)next)->content.base.next; break; } return next; } static inline content_t *content_tree_iterator_init(content_tree_iterator *it, content_root_t *root) { it->root = root; it->next = root->base.next; return content_tree_iterator_next(it); } /* Some helper functions */ /* Return a span_t * pointer to the first element in a content list. */ static inline span_t *content_head_as_span(content_root_t *root) { assert(root != NULL && root->base.type == content_root && (root->base.next == NULL || root->base.next->type == content_span)); return (span_t *)root->base.next; } /* Return a point for the post-advance position of a char in a given span. */ point_t extract_predicted_end_of_char(char_t *char_, const span_t *span); /* Return a point for the post-advance position of the final char in a given span. */ point_t extract_end_of_span(const span_t *span); /* Return the bounds for a block before it was rotated around its origin. */ rect_t extract_block_pre_rotation_bounds(block_t *block, double rotate); double extract_baseline_angle(const matrix4_t *ctm); #endif
