diff mupdf-source/thirdparty/extract/src/document.c @ 2:b50eed0cc0ef upstream

ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4. The directory name has changed: no version number in the expanded directory now.
author Franz Glasner <fzglas.hg@dom66.de>
date Mon, 15 Sep 2025 11:43:07 +0200
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/mupdf-source/thirdparty/extract/src/document.c	Mon Sep 15 11:43:07 2025 +0200
@@ -0,0 +1,724 @@
+#include "document.h"
+#include "outf.h"
+#include <assert.h>
+#include <stdio.h>
+#include <string.h>
+
+void
+content_init(content_t *content, content_type_t type)
+{
+	content->type = type;
+	content->next = content->prev = (type == content_root) ? content : NULL;
+}
+
+void
+content_init_root(content_root_t *content, content_t *parent)
+{
+	content->base.type = content_root;
+	content->base.next = content->base.prev = &content->base;
+	content->parent = parent;
+}
+
+void
+content_unlink(content_t *content)
+{
+	if (content == NULL)
+		return;
+
+	assert(content->type != content_root);
+
+	if (content->prev == NULL)
+	{
+		assert(content->next == NULL);
+		/* Already unlinked */
+	}
+	else
+	{
+		assert(content->next != content && content->prev != content);
+		content->prev->next = content->next;
+		content->next->prev = content->prev;
+		content->next = content->prev = NULL;
+	}
+}
+
+void content_unlink_span(span_t *span)
+{
+	content_unlink(&span->base);
+}
+
+void extract_span_init(span_t *span, structure_t *structure)
+{
+	static const span_t blank = { 0 };
+
+	*span = blank;
+	content_init(&span->base, content_span);
+	span->structure = structure;
+}
+
+void extract_span_free(extract_alloc_t *alloc, span_t **pspan)
+{
+	if (*pspan == NULL)
+		return;
+
+	content_unlink(&(*pspan)->base);
+	extract_free(alloc, &(*pspan)->font_name);
+	extract_free(alloc, &(*pspan)->chars);
+	extract_free(alloc, pspan);
+}
+
+void extract_line_init(line_t *line)
+{
+	static const line_t blank = { 0 };
+
+	*line = blank;
+	content_init(&line->base, content_line);
+	content_init_root(&line->content, &line->base);
+}
+
+void extract_paragraph_init(paragraph_t *paragraph)
+{
+	static const paragraph_t blank = { 0 };
+
+	*paragraph = blank;
+	content_init(&paragraph->base, content_paragraph);
+	content_init_root(&paragraph->content, &paragraph->base);
+}
+
+void extract_block_init(block_t *block)
+{
+	static const block_t blank = { 0 };
+
+	*block = blank;
+	content_init(&block->base, content_block);
+	content_init_root(&block->content, &block->base);
+}
+
+void extract_table_init(table_t *table)
+{
+	static const table_t blank = { 0 };
+
+	*table = blank;
+	content_init(&table->base, content_table);
+}
+
+void extract_image_init(image_t *image)
+{
+	static const image_t blank = { 0 };
+
+	*image = blank;
+	content_init(&image->base, content_image);
+}
+
+void
+content_clear(extract_alloc_t *alloc, content_root_t *proot)
+{
+	content_t *content, *next;
+
+	assert(proot->base.type == content_root && proot->base.next != NULL && proot->base.prev != NULL);
+	for (content = proot->base.next; content != &proot->base; content = next)
+	{
+		assert(content->type != content_root);
+		next = content->next;
+		switch (content->type)
+		{
+		default:
+		case content_root:
+			assert("This never happens" == NULL);
+			break;
+		case content_span:
+			extract_span_free(alloc, (span_t **)&content);
+			break;
+		case content_line:
+			extract_line_free(alloc, (line_t **)&content);
+			break;
+		case content_paragraph:
+			extract_paragraph_free(alloc, (paragraph_t **)&content);
+			break;
+		case content_block:
+			extract_block_free(alloc, (block_t **)&content);
+			break;
+		case content_table:
+			extract_table_free(alloc, (table_t **)&content);
+			break;
+		case content_image:
+			extract_image_free(alloc, (image_t **)&content);
+			break;
+		}
+	}
+}
+
+int
+content_count(content_root_t *root)
+{
+	int n = 0;
+	content_t *s;
+
+	for (s = root->base.next; s != &root->base; s = s->next)
+		n++;
+
+	return n;
+}
+
+static int
+content_count_type(content_root_t *root, content_type_t type)
+{
+	int n = 0;
+	content_t *s;
+
+	for (s = root->base.next; s != &root->base; s = s->next)
+		if (s->type == type) n++;
+
+	return n;
+}
+
+int content_count_spans(content_root_t *root)
+{
+	return content_count_type(root, content_span);
+}
+
+int content_count_images(content_root_t *root)
+{
+	return content_count_type(root, content_image);
+}
+
+int content_count_lines(content_root_t *root)
+{
+	return content_count_type(root, content_line);
+}
+
+int content_count_paragraphs(content_root_t *root)
+{
+	return content_count_type(root, content_paragraph);
+}
+
+int content_count_tables(content_root_t *root)
+{
+	return content_count_type(root, content_table);
+}
+
+static content_t *
+content_first_of_type(const content_root_t *root, content_type_t type)
+{
+	content_t *content;
+	assert(root && root->base.type == content_root);
+
+	for (content = root->base.next; content != &root->base; content = content->next)
+	{
+		if (content->type == type)
+			return content;
+	}
+	return NULL;
+}
+
+static content_t *
+content_last_of_type(const content_root_t *root, content_type_t type)
+{
+	content_t *content;
+	assert(root && root->base.type == content_root);
+
+	for (content = root->base.prev; content != &root->base; content = content->prev)
+	{
+		if (content->type == type)
+			return content;
+	}
+	return NULL;
+}
+
+span_t *content_first_span(const content_root_t *root)
+{
+	return (span_t *)content_first_of_type(root, content_span);
+}
+
+span_t *content_last_span(const content_root_t *root)
+{
+	return (span_t *)content_last_of_type(root, content_span);
+}
+
+line_t *content_first_line(const content_root_t *root)
+{
+	return (line_t *)content_first_of_type(root, content_line);
+}
+
+line_t *content_last_line(const content_root_t *root)
+{
+	return (line_t *)content_last_of_type(root, content_line);
+}
+
+paragraph_t *content_first_paragraph(const content_root_t *root)
+{
+	return (paragraph_t *)content_first_of_type(root, content_paragraph);
+}
+
+paragraph_t *content_last_paragraph(const content_root_t *root)
+{
+	return (paragraph_t *)content_last_of_type(root, content_paragraph);
+}
+
+static content_t *
+content_next_of_type(const content_t *node, content_type_t type)
+{
+	content_t *content;
+	assert(node && node->type != content_root);
+
+	for (content = node->next; content->type != content_root; content = content->next)
+	{
+		if (content->type == type)
+			return content;
+	}
+	return NULL;
+}
+
+static content_t *
+content_prev_of_type(const content_t *node, content_type_t type)
+{
+	content_t *content;
+	assert(node && node->type != content_root);
+
+	for (content = node->prev; content->type != content_root; content = content->prev)
+	{
+		if (content->type == type)
+			return content;
+	}
+	return NULL;
+}
+
+span_t *content_next_span(const content_t *root)
+{
+	return (span_t *)content_next_of_type(root, content_span);
+}
+
+span_t *content_prev_span(const content_t *root)
+{
+	return (span_t *)content_prev_of_type(root, content_span);
+}
+
+line_t *content_next_line(const content_t *root)
+{
+	return (line_t *)content_next_of_type(root, content_line);
+}
+
+line_t *content_prev_line(const content_t *root)
+{
+	return (line_t *)content_prev_of_type(root, content_line);
+}
+
+paragraph_t *content_next_paragraph(const content_t *root)
+{
+	return (paragraph_t *)content_next_of_type(root, content_paragraph);
+}
+
+paragraph_t *content_prev_paragraph(const content_t *root)
+{
+	return (paragraph_t *)content_prev_of_type(root, content_paragraph);
+}
+
+void
+content_concat(content_root_t *dst, content_root_t *src)
+{
+	content_t *walk, *walk_next;
+
+	if (src == NULL)
+		return;
+
+	for (walk = src->base.next; walk != &src->base; walk = walk_next)
+	{
+		walk_next = walk->next;
+		content_append(dst, walk);
+	}
+}
+
+void extract_line_free(extract_alloc_t* alloc, line_t **pline)
+{
+	line_t *line = *pline;
+
+	content_unlink(&line->base);
+	content_clear(alloc, &line->content);
+	extract_free(alloc, pline);
+}
+
+void extract_image_clear(extract_alloc_t *alloc, image_t *image)
+{
+	extract_free(alloc, &image->type);
+	extract_free(alloc, &image->name);
+	extract_free(alloc, &image->id);
+	if (image->data_free)
+	{
+		image->data_free(image->data_free_handle, image->data);
+		image->data_free = NULL;
+		image->data_free_handle = NULL;
+		image->data = NULL;
+	}
+}
+
+void extract_image_free(extract_alloc_t *alloc, image_t **pimage)
+{
+	if (*pimage == NULL)
+		return;
+	extract_image_clear(alloc, *pimage);
+	extract_free(alloc, pimage);
+}
+
+void extract_cell_free(extract_alloc_t *alloc, cell_t **pcell)
+{
+	cell_t *cell = *pcell;
+
+	if (cell == NULL)
+		return;
+
+	content_clear(alloc, &cell->content);
+
+	extract_free(alloc, pcell);
+}
+
+int
+extract_split_alloc(extract_alloc_t *alloc, split_type_t type, int count, split_t **psplit)
+{
+	split_t *split;
+
+	if (extract_malloc(alloc, psplit, sizeof(*split) + (count-1) * sizeof(split_t *)))
+		return -1;
+
+	split = *psplit;
+	split->type = type;
+	split->weight = 0;
+	split->count = count;
+	memset(&split->split[0], 0, sizeof(split_t *) * count);
+
+	return 0;
+}
+
+void extract_split_free(extract_alloc_t *alloc, split_t **psplit)
+{
+	int i;
+	split_t *split = *psplit;
+
+	if (split == NULL)
+		return;
+
+	for (i = 0; i < split->count; i++)
+		extract_split_free(alloc, &split->split[i]);
+	extract_free(alloc, psplit);
+}
+
+static void space_prefix(int depth)
+{
+	while (depth-- > 0)
+		putc(' ', stdout);
+}
+
+static void
+content_dump_aux(const content_root_t *content, int depth);
+
+static void dump_span(const span_t *span, int depth)
+{
+	int i;
+	for (i = 0; i < span->chars_num; i++)
+	{
+		char_t *c = &span->chars[i];
+		space_prefix(depth);
+		printf("<char ucs=\"");
+		if (c->ucs >= 32 && c->ucs <= 127)
+			putc((char)c->ucs, stdout);
+		else
+			printf("<%04x>", c->ucs);
+		printf("\" x=%f y=%f adv=%f />\n", c->x, c->y, c->adv);
+	}
+}
+
+static void
+dump_structure_path(structure_t *structure)
+{
+	if (structure->parent)
+	{
+		dump_structure_path(structure->parent);
+		printf("/");
+	}
+	printf("%s(%d)", extract_struct_string(structure->type), structure->uid);
+}
+
+static void
+content_dump_span_aux(const span_t *span, int depth)
+{
+	space_prefix(depth);
+	printf("<span ctm=[%f %f %f %f]\n",
+		span->ctm.a, span->ctm.b, span->ctm.c, span->ctm.d);
+	if (span->structure)
+	{
+		space_prefix(depth);
+		printf("      structure=\"");
+		dump_structure_path(span->structure);
+		printf("\"\n");
+	}
+	space_prefix(depth);
+	printf("      font-name=\"%s\" font_bbox=[%f %f %f %f]>\n",
+		span->font_name,
+		span->font_bbox.min.x, span->font_bbox.min.y,
+		span->font_bbox.max.x, span->font_bbox.max.y);
+	dump_span(span, depth+1);
+	space_prefix(depth);
+	printf("</span>\n");
+}
+
+void
+content_dump_span(const span_t *span)
+{
+	content_dump_span_aux(span, 0);
+}
+
+static void
+content_dump_line_aux(const line_t *line, int depth)
+{
+	span_t *span0 = content_first_span(&line->content);
+	span_t *span1 = content_last_span(&line->content);
+	char_t *char0 = (span0 && span0->chars_num > 0) ? &span0->chars[0] : NULL;
+	char_t *char1 = (span1 && span1->chars_num > 0) ? &span1->chars[span1->chars_num-1] : NULL;
+	space_prefix(depth);
+	printf("<line");
+	if (char0 && char1)
+	{
+		printf(" x0=%g y0=%g x1=%g y1=%g\n", char0->x, char0->y, char1->x, char1->y);
+	}
+	content_dump_aux(&line->content, depth+1);
+	space_prefix(depth);
+	printf("</line>\n");
+}
+
+void
+content_dump_line(const line_t *line)
+{
+	content_dump_line_aux(line, 0);
+}
+
+static void
+content_dump_aux(const content_root_t *content, int depth)
+{
+	const content_t *walk;
+
+	assert(content->base.type == content_root);
+	for (walk = content->base.next; walk != &content->base; walk = walk->next)
+	{
+		assert(walk->next->prev == walk && walk->prev->next == walk);
+		switch (walk->type)
+		{
+		case content_span:
+			content_dump_span_aux((const span_t *)walk, depth);
+			break;
+		case content_line:
+			content_dump_line_aux((const line_t *)walk, depth);
+			break;
+		case content_paragraph:
+			space_prefix(depth);
+			printf("<paragraph>\n");
+			content_dump_aux(&((const paragraph_t *)walk)->content, depth+1);
+			space_prefix(depth);
+			printf("</paragraph>\n");
+			break;
+		case content_block:
+			space_prefix(depth);
+			printf("<block>\n");
+			content_dump_aux(&((const block_t *)walk)->content, depth+1);
+			space_prefix(depth);
+			printf("</block>\n");
+			break;
+		case content_table:
+		{
+			const table_t *table = (const table_t *)walk;
+			int i, j, k;
+			space_prefix(depth);
+			printf("<table w=%d h=%d>\n", table->cells_num_x, table->cells_num_y);
+			k = 0;
+			for (j = 0; j < table->cells_num_y; j++)
+			{
+				for (i = 0; i < table->cells_num_x; i++)
+				{
+					space_prefix(depth+1);
+					printf("<cell>\n");
+					content_dump_aux(&table->cells[k]->content, depth+2);
+					space_prefix(depth+1);
+					printf("</cell>\n");
+					k++;
+				}
+			}
+			space_prefix(depth);
+			printf("</table>\n");
+			break;
+		}
+		case content_image:
+			space_prefix(depth);
+			printf("<image/>\n");
+			break;
+		default:
+			assert("Unexpected type found while dumping content list." == NULL);
+			break;
+		}
+	}
+}
+
+void content_dump(const content_root_t *content)
+{
+	content_dump_aux(content, 0);
+}
+
+static void
+content_dump_brief_aux(const content_root_t *content, int depth);
+
+static void
+content_dump_brief_span_aux(const span_t *span)
+{
+	int i;
+
+	printf("\"");
+	for (i = 0; i < span->chars_num; i++)
+	{
+		char_t *c = &span->chars[i];
+		if (c->ucs >= 32 && c->ucs <= 127)
+			putc((char)c->ucs, stdout);
+		else
+			printf("<%04x>", c->ucs);
+	}
+	printf("\"");
+}
+
+static void
+content_dump_brief_line_aux(const line_t *line, int depth)
+{
+	printf("<line text=");
+	content_dump_brief_aux(&line->content, depth+1);
+	printf(">\n");
+}
+
+static void
+content_dump_brief_aux(const content_root_t *content, int depth)
+{
+	const content_t *walk;
+
+	assert(content->base.type == content_root);
+	for (walk = content->base.next; walk != &content->base; walk = walk->next)
+	{
+		assert(walk->next->prev == walk && walk->prev->next == walk);
+		switch (walk->type)
+		{
+		case content_span:
+			content_dump_brief_span_aux((const span_t *)walk);
+			break;
+		case content_line:
+			content_dump_brief_line_aux((const line_t *)walk, depth);
+			break;
+		case content_paragraph:
+			content_dump_brief_aux(&((const paragraph_t *)walk)->content, depth+1);
+			break;
+		case content_block:
+			content_dump_brief_aux(&((const block_t *)walk)->content, depth+1);
+			break;
+		case content_table:
+		{
+			const table_t *table = (const table_t *)walk;
+			int i, j, k;
+			k = 0;
+			for (j = 0; j < table->cells_num_y; j++)
+			{
+				for (i = 0; i < table->cells_num_x; i++)
+				{
+					content_dump_brief_aux(&table->cells[k]->content, depth+2);
+					k++;
+				}
+			}
+			break;
+		}
+		case content_image:
+			break;
+		default:
+			assert("Unexpected type found while dumping content list." == NULL);
+			break;
+		}
+	}
+}
+
+void content_dump_brief(const content_root_t *content)
+{
+	content_dump_brief_aux(content, 0);
+}
+
+static content_t *
+cmp_and_merge(content_t *q1, int q1pos, int len1, int n, content_cmp_fn *cmp)
+{
+	int len2 = q1pos + len1*2; /* end of both lists assuming we don't overrun */
+	int p;
+	content_t *q2 = q1;
+
+	/* Don't overrun the end, and then convert to length from end. */
+	if (len2 > n)
+		len2 = n;
+	len2 -= q1pos + len1;
+
+	if (len2 <= 0)
+		len1 += len2;
+
+	/* Find the start of q2. We know this fits. */
+	for (p = 0; p < len1; p++)
+		q2 = q2->next;
+
+	if (len2 <= 0)
+		return q2;
+
+	/* So we have [q1..(q1+len1)) as the first list to merge, and
+	 * [q2..q2+len2)) as the second list to merge. We know that
+	 * q2 = q1+len1. So, if we can reduce len1 or len2 to 0, we have
+	 * the lists sorted. */
+	while (1)
+	{
+		if (cmp(q1, q2) > 0)
+		{
+			/* q2 is smaller. q2 should be before q1. Move it. */
+			/* So:
+			*    a<->q1<->c..d<->q2<->b  =>  a<->q2<->q1<->c..d<->b
+			* (where c and d can either be the same, or can be q2 and q1!)
+			*/
+			content_t *a = q1->prev;
+			content_t *b = q2->next;
+			content_t *d = q2->prev;
+			d->next = b;
+			b->prev = d;
+			a->next = q2;
+			q2->prev = a;
+			q2->next = q1;
+			q1->prev = q2;
+			/* Now advance q2 */
+			q2 = b;
+			len2--;
+			if (len2 == 0)
+				break;
+		} else {
+			/* Advance q1 */
+			q1 = q1->next;
+			len1--;
+			if (len1 == 0)
+				break;
+		}
+	}
+
+	while (len2)
+	{
+		q2 = q2->next;
+		len2--;
+	}
+
+	return q2;
+}
+
+/* Spiffy in-place merge sort. */
+void content_sort(content_root_t *content, content_cmp_fn *cmp)
+{
+	int n = content_count(content);
+	int size;
+
+	for (size = 1; size < n; size <<= 1)
+	{
+		int q1_idx = 0;
+		content_t *q1 = content->base.next;
+		for (q1_idx = 0; q1_idx < n; q1_idx += size*2)
+			q1 = cmp_and_merge(q1, q1_idx, size, n, cmp);
+		assert(q1->type == content_root);
+	}
+	assert(content_count(content) == n);
+}