diff mupdf-source/thirdparty/extract/src/document.h @ 2:b50eed0cc0ef upstream

ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4. The directory name has changed: no version number in the expanded directory now.
author Franz Glasner <fzglas.hg@dom66.de>
date Mon, 15 Sep 2025 11:43:07 +0200
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/mupdf-source/thirdparty/extract/src/document.h	Mon Sep 15 11:43:07 2025 +0200
@@ -0,0 +1,766 @@
+#ifndef ARTIFEX_EXTRACT_DOCUMENT_H
+#define ARTIFEX_EXTRACT_DOCUMENT_H
+
+#include "extract/extract.h"
+#include "extract/alloc.h"
+
+#include "compat_stdint.h"
+#include <assert.h>
+
+typedef struct span_t span_t;
+typedef struct line_t line_t;
+typedef struct paragraph_t paragraph_t;
+typedef struct image_t image_t;
+typedef struct table_t table_t;
+typedef struct block_t block_t;
+typedef struct structure_t structure_t;
+
+static const double pi = 3.141592653589793;
+
+/*
+All content is stored as content_t nodes in a doubly linked-list.
+The first node in the list is a 'content_root' node. The last
+node in the list is the same node again.
+
+Thus:
+  Every node in a list (including the root) has next and prev != NULL.
+  The root node in an empty list has next and prev pointing to itself.
+  Any non-root node with prev and next == NULL is not in a list.
+
+Content nodes record a 'type' for the node. Each node is 'derived' in
+an OO style from the basic content_t.
+
+The different content types form a heirarchy:
+
+A spans is an array of "char_t"s (note, an array, NOT a content list).
+
+Lines contain a content list, which should mostly consist of spans.
+
+Paragraphs contain a content list, which should mostly consist of lines.
+
+Image nodes contains details of a bitmap image.
+
+Table nodes contain an array of cells, each of which contains a content
+list that can contain any other type.
+
+Blocks contain a content list consisting of paragraphs, tables and images.
+Conceptually these represent a block of content on a page.
+*/
+typedef enum {
+	content_root,
+	content_span,
+	content_line,
+	content_paragraph,
+	content_image,
+	content_table,
+	content_block
+} content_type_t;
+
+typedef struct content_t {
+	/* The type field tells us what derived type we actually are. */
+	content_type_t type;
+
+	/* This holds us in the linked list of sibling content nodes. */
+	struct content_t *prev;
+	struct content_t *next;
+} content_t;
+
+/* Initialise a content_t (just the base struct). */
+void content_init(content_t *content, content_type_t type);
+
+/* Unlink a (non-root) content_t from any list. */
+void content_unlink(content_t *content);
+
+/* Unlink a span_t from any list. */
+void content_unlink_span(span_t *span);
+
+typedef struct {
+	content_t  base;
+	content_t *parent;
+} content_root_t;
+
+void content_init_root(content_root_t *root, content_t *parent);
+
+/* Free all the content, from a (root) content_t. */
+void content_clear(extract_alloc_t* alloc, content_root_t *root);
+
+span_t *content_first_span(const content_root_t *root);
+span_t *content_last_span(const content_root_t *root);
+line_t *content_first_line(const content_root_t *root);
+line_t *content_last_line(const content_root_t *root);
+paragraph_t *content_first_paragraph(const content_root_t *root);
+paragraph_t *content_last_paragraph(const content_root_t *root);
+
+span_t *content_next_span(const content_t *node);
+span_t *content_prev_span(const content_t *node);
+line_t *content_next_line(const content_t *node);
+line_t *content_prev_line(const content_t *node);
+paragraph_t *content_next_paragraph(const content_t *node);
+paragraph_t *content_prev_paragraph(const content_t *node);
+
+int content_count(content_root_t *root);
+int content_count_images(content_root_t *root);
+int content_count_spans(content_root_t *root);
+int content_count_lines(content_root_t *root);
+int content_count_paragraphs(content_root_t *root);
+int content_count_tables(content_root_t *root);
+
+int content_new_root(extract_alloc_t *alloc, content_root_t **proot);
+int content_new_span(extract_alloc_t *alloc, span_t **pspan, structure_t *structure);
+int content_new_line(extract_alloc_t *alloc, line_t **pline);
+int content_new_paragraph(extract_alloc_t *alloc, paragraph_t **pparagraph);
+int content_new_table(extract_alloc_t *alloc, table_t **ptable);
+int content_new_block(extract_alloc_t *alloc, block_t **pblock);
+
+int content_append_new_span(extract_alloc_t* alloc, content_root_t *root, span_t **pspan, structure_t *structure);
+int content_append_new_line(extract_alloc_t* alloc, content_root_t *root, line_t **pline);
+int content_append_new_paragraph(extract_alloc_t* alloc, content_root_t *root, paragraph_t **pparagraph);
+int content_append_new_image(extract_alloc_t* alloc, content_root_t *root, image_t **pimage);
+int content_append_new_table(extract_alloc_t* alloc, content_root_t *root, table_t **ptable);
+int content_append_new_block(extract_alloc_t* alloc, content_root_t *root, block_t **pblock);
+
+void content_replace(content_t *current, content_t *replacement);
+int content_replace_new_line(extract_alloc_t* alloc, content_t *current, line_t **pline);
+int content_replace_new_paragraph(extract_alloc_t* alloc, content_t *current, paragraph_t **pparagraph);
+int content_replace_new_block(extract_alloc_t* alloc, content_t *current, block_t **pblock);
+
+
+void content_append(content_root_t *root, content_t *content);
+void content_append_span(content_root_t *root, span_t *span);
+void content_append_line(content_root_t *root, line_t *line);
+void content_append_paragraph(content_root_t *root, paragraph_t *paragraph);
+void content_append_table(content_root_t *root, table_t *table);
+void content_append_block(content_root_t *root, block_t *block);
+
+void content_concat(content_root_t *dst, content_root_t *src);
+
+void content_dump(const content_root_t *content);
+void content_dump_line(const line_t *line);
+void content_dump_span(const span_t *span);
+void content_dump_brief(const content_root_t *content);
+
+
+typedef int (content_cmp_fn)(const content_t *, const content_t *);
+
+void content_sort(content_root_t *content, content_cmp_fn *cmp);
+
+/* To iterate over the line elements of a content list:
+
+content_line_iterator it;
+line_t *line;
+
+for(line = content_line_iterator_line_init(&it, content); line != NULL; line = content_line_iterator_next(&it))
+{
+}
+
+*/
+
+typedef struct {
+	content_root_t *root;
+	content_t      *next;
+} content_paragraph_iterator;
+
+static inline paragraph_t *content_paragraph_iterator_next(content_paragraph_iterator *it)
+{
+	content_t *next;
+
+	do {
+		next = it->next;
+		if (next == &it->root->base)
+			return NULL;
+		assert(next->type != content_root);
+		it->next = next->next;
+	} while (next->type != content_paragraph);
+
+	return (paragraph_t *)next;
+}
+
+static inline paragraph_t *content_paragraph_iterator_init(content_paragraph_iterator *it, content_root_t *root)
+{
+	it->root = root;
+	it->next = root->base.next;
+
+	return content_paragraph_iterator_next(it);
+}
+
+typedef struct {
+	content_root_t *root;
+	content_t      *next;
+} content_line_iterator;
+
+static inline line_t *content_line_iterator_next(content_line_iterator *it)
+{
+	content_t *next;
+
+	do {
+		next = it->next;
+		if (next == &it->root->base)
+			return NULL;
+		assert(next->type != content_root);
+			it->next = next->next;
+	} while (next->type != content_line);
+
+	return (line_t *)next;
+}
+
+static inline line_t *content_line_iterator_init(content_line_iterator *it, content_root_t *root)
+{
+	it->root = root;
+	it->next = root->base.next;
+
+	return content_line_iterator_next(it);
+}
+
+typedef struct {
+	content_root_t *root;
+	content_t      *next;
+} content_span_iterator;
+
+static inline span_t *content_span_iterator_next(content_span_iterator *it)
+{
+	content_t *next;
+
+	do {
+		next = it->next;
+		if (next == &it->root->base)
+			return NULL;
+		assert(next->type != content_root);
+		it->next = next->next;
+	} while (next->type != content_span);
+
+	return (span_t *)next;
+}
+
+static inline span_t *content_span_iterator_init(content_span_iterator *it, content_root_t *root)
+{
+	it->root = root;
+	it->next = root->base.next;
+
+	return content_span_iterator_next(it);
+}
+
+typedef struct {
+	content_root_t *root;
+	content_t      *next;
+} content_image_iterator;
+
+static inline image_t *content_image_iterator_next(content_image_iterator *it)
+{
+	content_t *next;
+
+	do {
+		next = it->next;
+		if (next == &it->root->base)
+			return NULL;
+		assert(next->type != content_root);
+		it->next = next->next;
+	} while (next->type != content_image);
+
+	return (image_t *)next;
+}
+
+static inline image_t *content_image_iterator_init(content_image_iterator *it, content_root_t *root)
+{
+	it->root = root;
+	it->next = root->base.next;
+
+	return content_image_iterator_next(it);
+}
+
+typedef struct {
+	content_root_t *root;
+	content_t      *next;
+} content_table_iterator;
+
+static inline table_t *content_table_iterator_next(content_table_iterator *it)
+{
+	content_t *next;
+
+	do {
+		next = it->next;
+		if (next == &it->root->base)
+			return NULL;
+		assert(next->type != content_root);
+		it->next = next->next;
+	} while (next->type != content_table);
+
+    return (table_t *)next;
+}
+
+static inline table_t *content_table_iterator_init(content_table_iterator *it, content_root_t *root)
+{
+	it->root = root;
+	it->next = root->base.next;
+
+	return content_table_iterator_next(it);
+}
+
+typedef struct {
+	content_root_t *root;
+	content_t      *next;
+} content_iterator;
+
+static inline content_t *content_iterator_next(content_iterator *it)
+{
+	content_t *next = it->next;
+
+	if (next == &it->root->base)
+		return NULL;
+	assert(next->type != content_root);
+	it->next = next->next;
+
+	return next;
+}
+
+static inline content_t *content_iterator_init(content_iterator *it, content_root_t *root)
+{
+	it->root = root;
+	it->next = root->base.next;
+
+	return content_iterator_next(it);
+}
+
+typedef struct
+{
+	double x;
+	double y;
+} point_t;
+
+const char *extract_point_string(const point_t *point);
+
+typedef struct
+{
+	point_t min;
+	point_t max;
+} rect_t;
+
+extern const rect_t extract_rect_infinite;
+extern const rect_t extract_rect_empty;
+
+rect_t extract_rect_intersect(rect_t a, rect_t b);
+
+rect_t extract_rect_union(rect_t a, rect_t b);
+
+rect_t extract_rect_union_point(rect_t a, point_t b);
+
+int extract_rect_contains_rect(rect_t a, rect_t b);
+
+int extract_rect_valid(rect_t a);
+
+const char *extract_rect_string(const rect_t *rect);
+
+typedef struct
+{
+	double  a;
+	double  b;
+	double  c;
+	double  d;
+	double  e;
+	double  f;
+} matrix_t;
+
+typedef struct
+{
+	double  a;
+	double  b;
+	double  c;
+	double  d;
+} matrix4_t;
+
+const char *extract_matrix_string(const matrix_t *matrix);
+const char *extract_matrix4_string(const matrix4_t *matrix);
+
+/* Returns a*d - b*c. */
+double      extract_matrix_expansion(matrix4_t m);
+
+/* Returns the inverse of a matrix (or identity for degenerate). */
+matrix4_t   extract_matrix4_invert(const matrix4_t *ctm);
+
+point_t     extract_matrix4_transform_point(matrix4_t m, point_t p);
+point_t     extract_matrix4_transform_xy(matrix4_t m, double x, double y);
+matrix_t    extract_multiply_matrix_matrix(matrix_t m1, matrix_t m2);
+matrix4_t   extract_multiply_matrix4_matrix4(matrix4_t m1, matrix4_t m2);
+
+/* Returns zero if first four members of *lhs and *rhs are equal, otherwise
++/-1. */
+int extract_matrix4_cmp(const matrix4_t *lhs, const matrix4_t *rhs);
+
+/* A single char in a span. */
+typedef struct
+{
+	/* (x,y) after transformation by ctm. */
+	double      x;
+	double      y;
+
+	unsigned    ucs;
+	double      adv; /* Advance, before transform by ctm */
+
+	rect_t      bbox;
+} char_t;
+
+/* List of chars that have same font and are usually adjacent. */
+struct span_t
+{
+	content_t    base;
+	matrix4_t    ctm;
+	char        *font_name;
+	rect_t       font_bbox;
+	structure_t *structure;
+
+	struct {
+		unsigned font_bold      : 1;
+		unsigned font_italic    : 1;
+		unsigned wmode          : 1;
+	} flags;
+
+	char_t     *chars;
+	int         chars_num;
+};
+
+void extract_span_init(span_t *span, structure_t *structure);
+
+/* Frees a span_t, returning with *pspan set to NULL. */
+void extract_span_free(extract_alloc_t *alloc, span_t **pspan);
+
+/* Returns last character in span. */
+char_t *extract_span_char_last(span_t *span);
+
+/* Appends new char_t to an span_t with .ucs=c and all other
+fields zeroed. Returns pointer to new char_t record, or NULL if allocation
+failed. */
+char_t *extract_span_append_c(extract_alloc_t *alloc, span_t *span, int c);
+
+/* Returns static string containing info about span_t. */
+const char *extract_span_string(extract_alloc_t *alloc, span_t *span);
+
+/* List of spans that are aligned on same line. */
+struct line_t
+{
+	content_t base;
+	double ascender;
+	double descender;
+	content_root_t content;
+};
+
+void extract_line_init(line_t *line);
+
+void extract_line_free(extract_alloc_t* alloc, line_t **pline);
+
+/* Returns first span in a line. */
+span_t *extract_line_span_first(line_t *line);
+
+/* Returns last span in a line. */
+span_t *extract_line_span_last(line_t *line);
+
+/* List of lines that are aligned and adjacent to each other so as to form a
+paragraph. */
+struct paragraph_t
+{
+	content_t      base;
+	int            line_flags;
+	content_root_t content;
+};
+
+typedef enum
+{
+	/* If the paragraph is ever not aligned to the left hand edge, we set this flag. */
+	paragraph_not_aligned_left = 1,
+
+	/* If the paragraph is ever not aligned to the right hand edge, we set this flag. */
+	paragraph_not_aligned_right = 2,
+
+	/* If the paragraph ever has a line that doesn't look centred, we set this flag. */
+	paragraph_not_centred = 4,
+
+	/* If the paragraph ever has a line that doesn't look fully justified, we set this flag. */
+	paragraph_not_fully_justified = 8,
+
+	/* If the paragraph ever breaks at a place where it looks like first word from the
+	* next line could have fitted, then set this flag.*/
+	paragraph_breaks_strangely = 16
+} paragraph_flags;
+
+void extract_paragraph_init(paragraph_t *paragraph);
+
+void extract_paragraph_free(extract_alloc_t *alloc, paragraph_t **pparagraph);
+
+/* List of content that we believe should be treated as a whole. */
+struct block_t
+{
+	content_t      base;
+	content_root_t content;
+};
+
+void extract_block_init(block_t *block);
+
+void extract_block_free(extract_alloc_t *alloc, block_t **pblock);
+
+
+
+/* Information about an image. <type> is as passed to extract_add_image();
+<name> and <id> are created to be unique identifiers for use in generated docx
+file. */
+struct image_t
+{
+	content_t                base;
+	char                    *type;   /* jpg, png etc. */
+	char                    *name;   /* Name of image file within docx. */
+	char                    *id;     /* ID of image within docx. */
+	double                   x;
+	double                   y;
+	double                   w;
+	double                   h;
+	void                    *data;
+	size_t                   data_size;
+
+	extract_image_data_free *data_free;
+	void                    *data_free_handle;
+};
+
+void extract_image_init(image_t *image);
+
+void extract_image_clear(extract_alloc_t *alloc, image_t *image);
+
+void extract_image_free(extract_alloc_t *alloc, image_t **pimage);
+
+/* A line that is part of a table. */
+typedef struct
+{
+	float   color;
+	rect_t  rect;
+} tableline_t;
+
+typedef struct
+{
+	tableline_t *tablelines;
+	int          tablelines_num;
+} tablelines_t;
+
+
+/* A cell within a table. */
+typedef struct
+{
+	rect_t          rect;
+
+	/* If left/above is true, this cell is not obscured by cell to its
+	 * left/above. */
+	uint8_t         left;
+	uint8_t         above;
+
+	/* extend_right and extend_down are 1 for normal cells, 2 for cells which
+	 * extend right/down to cover an additional column/row, 3 to cover two
+	 * additional columns/rows etc. */
+	int             extend_right;
+	int             extend_down;
+
+	/* Contents of this cell. */
+	content_root_t  content;
+} cell_t;
+
+void extract_cell_init(cell_t *cell);
+void extract_cell_free(extract_alloc_t *alloc, cell_t **pcell);
+void extract_table_init(table_t *table);
+
+struct table_t
+{
+	content_t   base;
+	point_t     pos;    /* top-left. */
+
+	/* Array of cells_num_x*cells_num_y cells; cell (x, y) is:
+	 * cells_num_x * y + x.
+	 */
+	cell_t    **cells;
+	int         cells_num_x;
+	int         cells_num_y;
+};
+
+void extract_table_free(extract_alloc_t *alloc, table_t **ptable);
+
+typedef enum
+{
+	SPLIT_NONE = 0,
+	SPLIT_HORIZONTAL,
+	SPLIT_VERTICAL
+} split_type_t;
+
+
+typedef struct split_t
+{
+	split_type_t    type;
+	double          weight;
+	int             count;
+	struct split_t *split[1];
+} split_t;
+
+struct structure_t
+{
+	structure_t       *parent;
+	structure_t       *sibling_next;
+	structure_t       *sibling_prev;
+	structure_t       *kids_first;
+	structure_t      **kids_tail;
+	int                uid;
+	extract_struct_t   type;
+	int                score;
+};
+
+/* A subpage. Contains different representations of the list of spans. */
+typedef struct
+{
+	rect_t          mediabox;
+
+	int             images_num;
+
+	/* All the content on the page. */
+	content_root_t  content;
+
+	tablelines_t    tablelines_horizontal;
+	tablelines_t    tablelines_vertical;
+
+	content_root_t  tables;
+} subpage_t;
+
+
+/* A page. Contains a list of subpages. NB not
+called page_t because this clashes with a system type on hpux. */
+typedef struct
+{
+	rect_t      mediabox;
+
+	subpage_t **subpages;
+	int         subpages_num;
+
+	split_t    *split;
+} extract_page_t;
+
+
+/* A list of pages. */
+typedef struct
+{
+	extract_page_t **pages;
+	int              pages_num;
+
+	/* All the structure for the document. */
+	structure_t    *structure;
+
+	/* During construction, current points to the current point
+	* within the structure tree where things should be added. */
+	structure_t    *current;
+} document_t;
+
+
+typedef struct
+{
+	image_t **images;
+	int       images_num;
+	char    **imagetypes;
+	int       imagetypes_num;
+} images_t;
+
+
+/* This does all the work of finding paragraphs and tables. */
+int extract_document_join(extract_alloc_t *alloc, document_t *document, int layout_analysis, double master_space_guess);
+
+double extract_font_size(matrix4_t *ctm);
+
+/* Things below here are used when generating output. */
+
+/* Basic information about current font. */
+typedef struct
+{
+	char   *name;
+	double  size;
+	int     bold;
+	int     italic;
+} font_t;
+
+/* Used to keep track of font information when writing paragraphs of odt
+content, e.g. so we know whether a font has changed so need to start a new odt
+span. */
+typedef struct
+{
+	font_t     font;
+	matrix4_t *ctm_prev;
+} content_state_t;
+
+/* Analyse page content for layouts. */
+int extract_page_analyse(extract_alloc_t *alloc, extract_page_t *page);
+
+/* subpage_t constructor. */
+int extract_subpage_alloc(extract_alloc_t *extract, rect_t mediabox, extract_page_t *page, subpage_t **psubpage);
+
+/* subpage_t destructor. */
+void extract_subpage_free(extract_alloc_t *alloc, subpage_t **psubpage);
+
+/* Allocate a split_t. */
+int extract_split_alloc(extract_alloc_t *alloc, split_type_t type, int count, split_t **psplit);
+
+void extract_split_free(extract_alloc_t *alloc, split_t **psplit);
+
+typedef struct {
+	content_root_t *root;
+	content_t      *next;
+} content_tree_iterator;
+
+static inline content_t *content_tree_iterator_next(content_tree_iterator *it)
+{
+	content_t *next = it->next;
+
+	while (next->type == content_root)
+	{
+		content_t *parent = ((content_root_t *)next)->parent;
+		if (parent == NULL)
+			return NULL;
+		next = parent->next;
+	}
+	assert(next->type != content_root);
+
+	switch (next->type)
+	{
+	default:
+	case content_root:
+		assert("Never happens!" == NULL);
+		break;
+	case content_span:
+		it->next = next->next;
+		break;
+	case content_line:
+		it->next = ((line_t *)next)->content.base.next;
+		break;
+	case content_paragraph:
+		it->next = ((paragraph_t *)next)->content.base.next;
+		break;
+	}
+
+	return next;
+}
+
+static inline content_t *content_tree_iterator_init(content_tree_iterator *it, content_root_t *root)
+{
+	it->root = root;
+	it->next = root->base.next;
+
+	return content_tree_iterator_next(it);
+}
+
+/* Some helper functions */
+
+/* Return a span_t * pointer to the first element in a content list. */
+static inline span_t *content_head_as_span(content_root_t *root)
+{
+	assert(root != NULL && root->base.type == content_root && (root->base.next == NULL || root->base.next->type == content_span));
+	return (span_t *)root->base.next;
+}
+
+/* Return a point for the post-advance position of a char in a given span. */
+point_t extract_predicted_end_of_char(char_t *char_, const span_t *span);
+
+/* Return a point for the post-advance position of the final char in a given span. */
+point_t extract_end_of_span(const span_t *span);
+
+/* Return the bounds for a block before it was rotated around its origin. */
+rect_t extract_block_pre_rotation_bounds(block_t *block, double rotate);
+
+double extract_baseline_angle(const matrix4_t *ctm);
+
+#endif