diff mupdf-source/thirdparty/extract/src/boxer.c @ 3:2c135c81b16c

MERGE: upstream PyMuPDF 1.26.4 with MuPDF 1.26.7
author Franz Glasner <fzglas.hg@dom66.de>
date Mon, 15 Sep 2025 11:44:09 +0200
parents b50eed0cc0ef
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/mupdf-source/thirdparty/extract/src/boxer.c	Mon Sep 15 11:44:09 2025 +0200
@@ -0,0 +1,611 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <memory.h>
+#include <assert.h>
+
+#include "document.h"
+#include "outf.h"
+
+#define DEBUG_WRITE_AS_PS
+/* #define DEBUG_PRINT */
+
+typedef struct boxer_s boxer_t;
+
+typedef struct {
+	int    len;
+	int    max;
+	rect_t list[1];
+} rectlist_t;
+
+struct boxer_s {
+	extract_alloc_t *alloc;
+	rect_t           mediabox;
+	rectlist_t      *list;
+};
+
+static rectlist_t *
+rectlist_create(extract_alloc_t *alloc, int max)
+{
+	rectlist_t *list;
+
+	if (extract_malloc(alloc, &list, sizeof(rectlist_t) + sizeof(rect_t)*(max-1)))
+		return NULL;
+
+	list->len = 0;
+	list->max = max;
+
+	return list;
+}
+
+/* Push box onto rectlist, unless it is completely enclosed by
+ * another box, or completely encloses others (in which case they
+ * are replaced by it). */
+static void
+rectlist_append(rectlist_t *list, rect_t *box)
+{
+	int i;
+
+	for (i = 0; i < list->len; i++)
+	{
+		rect_t *r = &list->list[i];
+		rect_t smaller, larger;
+		/* We allow ourselves a fudge factor of 4 points when checking for inclusion. */
+		double r_fudge = 4;
+
+		smaller.min.x = r->min.x + r_fudge;
+		larger. min.x = r->min.x - r_fudge;
+		smaller.min.y = r->min.y + r_fudge;
+		larger. min.y = r->min.y - r_fudge;
+		smaller.max.x = r->max.x - r_fudge;
+		larger. max.x = r->max.x + r_fudge;
+		smaller.max.y = r->max.y - r_fudge;
+		larger. max.y = r->max.y + r_fudge;
+
+		if (extract_rect_contains_rect(larger, *box))
+			return; /* box is enclosed! Nothing to do. */
+		if (extract_rect_contains_rect(*box, smaller))
+		{
+			/* box encloses r. Ditch r. */
+			/* Shorten the list */
+			--list->len;
+			/* If the one that just got chopped off wasn't r, move it down. */
+			if (i < list->len)
+			{
+				memcpy(r, &list->list[list->len], sizeof(*r));
+				i--; /* Reconsider this entry next time. */
+			}
+		}
+	}
+
+	assert(list->len < list->max);
+	memcpy(&list->list[list->len], box, sizeof(*box));
+	list->len++;
+}
+
+static boxer_t *
+boxer_create_length(extract_alloc_t *alloc, rect_t *mediabox, int len)
+{
+	boxer_t *boxer;
+
+	if (extract_malloc(alloc, &boxer, sizeof(*boxer)))
+		return NULL;
+
+	boxer->alloc = alloc;
+	memcpy(&boxer->mediabox, mediabox, sizeof(*mediabox));
+	boxer->list = rectlist_create(alloc, len);
+
+	return boxer;
+}
+
+/* Create a boxer structure for a page of size mediabox. */
+static boxer_t *
+boxer_create(extract_alloc_t *alloc, rect_t *mediabox)
+{
+	boxer_t *boxer = boxer_create_length(alloc, mediabox, 1);
+
+	if (boxer == NULL)
+		return NULL;
+	rectlist_append(boxer->list, mediabox);
+
+	return boxer;
+}
+
+static void
+push_if_intersect_suitable(rectlist_t *dst, const rect_t *a, const rect_t *b)
+{
+	rect_t c;
+
+	/* Intersect a and b. */
+	c = extract_rect_intersect(*a, *b);
+	/* If no intersection, nothing to push. */
+	if (!extract_rect_valid(c))
+		return;
+
+	/* If the intersect is too narrow or too tall, ignore it.
+	* We don't care about inter character spaces, for example.
+	* Arbitrary 4 point threshold. */
+#define THRESHOLD 4
+	if (c.min.x + THRESHOLD >= c.max.x || c.min.y+THRESHOLD >= c.max.y)
+		return;
+
+	rectlist_append(dst, &c);
+}
+
+static void
+boxlist_feed_intersect(rectlist_t *dst, const rectlist_t *src, const rect_t *box)
+{
+	int i;
+
+	for (i = 0; i < src->len; i++)
+		push_if_intersect_suitable(dst, &src->list[i], box);
+}
+
+/* Mark a given box as being occupied (typically by a glyph) */
+static int boxer_feed(boxer_t *boxer, rect_t *bbox)
+{
+	rect_t box;
+	/* When we feed a box into a the boxer, we can never make
+	* the list more than 4 times as long. */
+	rectlist_t *newlist = rectlist_create(boxer->alloc, boxer->list->len * 4);
+	if (newlist == NULL)
+		return -1;
+
+#ifdef DEBUG_WRITE_AS_PS
+	printf("0 0 1 setrgbcolor\n");
+	printf("%g %g moveto %g %g lineto %g %g lineto %g %g lineto closepath fill\n",
+		bbox->min.x, bbox->min.y,
+		bbox->min.x, bbox->max.y,
+		bbox->max.x, bbox->max.y,
+		bbox->max.x, bbox->min.y
+	);
+#endif
+
+	/* Left (0,0) (min.x,H) */
+	box.min.x = boxer->mediabox.min.x;
+	box.min.y = boxer->mediabox.min.y;
+	box.max.x = bbox->min.x;
+	box.max.y = boxer->mediabox.max.y;
+	boxlist_feed_intersect(newlist, boxer->list, &box);
+
+	/* Right (max.x,0) (W,H) */
+	box.min.x = bbox->max.x;
+	box.min.y = boxer->mediabox.min.y;
+	box.max.x = boxer->mediabox.max.x;
+	box.max.y = boxer->mediabox.max.y;
+	boxlist_feed_intersect(newlist, boxer->list, &box);
+
+	/* Bottom (0,0) (W,min.y) */
+	box.min.x = boxer->mediabox.min.x;
+	box.min.y = boxer->mediabox.min.y;
+	box.max.x = boxer->mediabox.max.x;
+	box.max.y = bbox->min.y;
+	boxlist_feed_intersect(newlist, boxer->list, &box);
+
+	/* Top (0,max.y) (W,H) */
+	box.min.x = boxer->mediabox.min.x;
+	box.min.y = bbox->max.y;
+	box.max.x = boxer->mediabox.max.x;
+	box.max.y = boxer->mediabox.max.y;
+	boxlist_feed_intersect(newlist, boxer->list, &box);
+
+	extract_free(boxer->alloc, &boxer->list);
+	boxer->list = newlist;
+
+	return 0;
+}
+
+static int
+compare_areas(const void *a_, const void *b_)
+{
+	const rect_t *a = (const rect_t *)a_;
+	const rect_t *b = (const rect_t *)b_;
+	double area_a = (a->max.x-a->min.x) * (a->max.y-a->min.y);
+	double area_b = (b->max.x-b->min.x) * (b->max.y-b->min.y);
+
+	if (area_a < area_b)
+		return 1;
+	else if (area_a > area_b)
+		return -1;
+	else
+		return 0;
+}
+
+/* Sort the rectangle list to be largest area first. For ease of humans
+ * reading debug output. */
+static void boxer_sort(boxer_t *boxer)
+{
+	qsort(boxer->list->list, boxer->list->len, sizeof(rect_t), compare_areas);
+}
+
+/* Get the rectangle list for a given boxer. Return value is the length of
+ * the list. Lifespan is until the boxer is modified or freed. */
+static int boxer_results(boxer_t *boxer, rect_t **list)
+{
+	*list = boxer->list->list;
+	return boxer->list->len;
+}
+
+/* Destroy a boxer. */
+static void boxer_destroy(boxer_t *boxer)
+{
+	if (!boxer)
+		return;
+
+	extract_free(boxer->alloc, &boxer->list);
+	extract_free(boxer->alloc, &boxer);
+}
+
+/* Find the margins for a given boxer. */
+static rect_t boxer_margins(boxer_t *boxer)
+{
+	rectlist_t *list = boxer->list;
+	int i;
+	rect_t margins = boxer->mediabox;
+
+	for (i = 0; i < list->len; i++)
+	{
+		rect_t *r = &list->list[i];
+		if (r->min.x <= margins.min.x && r->min.y <= margins.min.y && r->max.y >= margins.max.y)
+			margins.min.x = r->max.x; /* Left Margin */
+		else if (r->max.x >= margins.max.x && r->min.y <= margins.min.y && r->max.y >= margins.max.y)
+			margins.max.x = r->min.x; /* Right Margin */
+		else if (r->min.x <= margins.min.x && r->max.x >= margins.max.x && r->min.y <= margins.min.y)
+			margins.min.y = r->max.y; /* Top Margin */
+		else if (r->min.x <= margins.min.x && r->max.x >= margins.max.x && r->max.y >= margins.max.y)
+			margins.max.y = r->min.y; /* Bottom Margin */
+	}
+
+	return margins;
+}
+
+/* Create a new boxer from a subset of an old one. */
+static boxer_t *boxer_subset(boxer_t *boxer, rect_t rect)
+{
+	boxer_t *new_boxer = boxer_create_length(boxer->alloc, &rect, boxer->list->len);
+	int i;
+
+	if (new_boxer == NULL)
+		return NULL;
+
+	for (i = 0; i < boxer->list->len; i++)
+	{
+		rect_t r = extract_rect_intersect(boxer->list->list[i], rect);
+
+		if (!extract_rect_valid(r))
+			continue;
+		rectlist_append(new_boxer->list, &r);
+	}
+
+	return new_boxer;
+}
+
+/* Consider a boxer for subdivision.
+ * Returns 0 if no suitable subdivision point found.
+ * Returns 1, and sets *boxer1 and *boxer2 to new boxer structures for the the subdivisions
+ * if a subdivision point is found.*/
+static split_type_t
+boxer_subdivide(boxer_t *boxer, boxer_t **boxer1, boxer_t **boxer2)
+{
+	rectlist_t *list = boxer->list;
+	int num_h = 0, num_v = 0;
+	double max_h = 0, max_v = 0;
+	rect_t best_h = {0}, best_v = {0};
+	int i;
+
+	*boxer1 = NULL;
+	*boxer2 = NULL;
+
+	for (i = 0; i < list->len; i++)
+	{
+		rect_t r = boxer->list->list[i];
+
+		if (r.min.x <= boxer->mediabox.min.x && r.max.x >= boxer->mediabox.max.x)
+		{
+			/* Horizontal divider */
+			double size = r.max.y - r.min.y;
+			if (size > max_h)
+			{
+				max_h = size;
+				best_h = r;
+			}
+			num_h++;
+		}
+		if (r.min.y <= boxer->mediabox.min.y && r.max.y >= boxer->mediabox.max.y)
+		{
+			/* Vertical divider */
+			double size = r.max.x - r.min.x;
+			if (size > max_v)
+			{
+				max_v = size;
+				best_v = r;
+			}
+			num_v++;
+		}
+	}
+
+	outf("num_h=%d num_v=%d\n", num_h, num_v);
+	outf("max_h=%g max_v=%g\n", max_h, max_v);
+
+	if (max_h > max_v)
+	{
+		rect_t r;
+		/* Divider runs horizontally. */
+		r = boxer->mediabox;
+		r.max.y = best_h.min.y;
+		*boxer1 = boxer_subset(boxer, r);
+		r = boxer->mediabox;
+		r.min.y = best_h.max.y;
+		*boxer2 = boxer_subset(boxer, r);
+		return SPLIT_VERTICAL;
+	}
+	else if (max_v > 0)
+	{
+		rect_t r;
+		/* Divider runs vertically. */
+		r = boxer->mediabox;
+		r.max.x = best_v.min.x;
+		*boxer1 = boxer_subset(boxer, r);
+		r = boxer->mediabox;
+		r.min.x = best_v.max.x;
+		*boxer2 = boxer_subset(boxer, r);
+		return SPLIT_HORIZONTAL;
+	}
+
+	return SPLIT_NONE;
+}
+
+
+/* Extract specifics */
+static rect_t
+extract_span_bbox(span_t *span)
+{
+	int j;
+	rect_t bbox = extract_rect_empty;
+
+	for (j = 0; j < span->chars_num; j++)
+	{
+		char_t *char_ = &span->chars[j];
+		bbox = extract_rect_union(bbox, char_->bbox);
+	}
+	return bbox;
+}
+
+
+static int
+extract_subpage_subset(extract_alloc_t *alloc, extract_page_t *page, subpage_t *subpage, rect_t mediabox)
+{
+	content_span_iterator  sit;
+	span_t                *span;
+	subpage_t             *target;
+
+	if (extract_subpage_alloc(alloc, mediabox, page, &target))
+	return -1;
+
+	for (span = content_span_iterator_init(&sit, &subpage->content); span != NULL; span = content_span_iterator_next(&sit))
+	{
+		rect_t bbox = extract_span_bbox(span);
+
+		if (bbox.min.x >= mediabox.min.x && bbox.min.y >= mediabox.min.y && bbox.max.x <= mediabox.max.x && bbox.max.y <= mediabox.max.y)
+		{
+			content_unlink(&span->base);
+			content_append_span(&target->content, span);
+		}
+	}
+
+	return 0;
+}
+
+enum {
+	MAX_ANALYSIS_DEPTH = 6
+};
+
+static int
+analyse_sub(extract_page_t *page, subpage_t *subpage, boxer_t *big_boxer, split_t **psplit, int depth)
+{
+	rect_t margins;
+	boxer_t *boxer;
+	boxer_t *boxer1;
+	boxer_t *boxer2;
+	int ret;
+	split_type_t split_type;
+	split_t *split;
+
+	margins = boxer_margins(big_boxer);
+#ifdef DEBUG_WRITE_AS_PS
+	printf("\n\n%% MARGINS %g %g %g %g\n", margins.min.x, margins.min.y, margins.max.x, margins.max.y);
+#endif
+
+	boxer = boxer_subset(big_boxer, margins);
+
+	if (depth < MAX_ANALYSIS_DEPTH &&
+		(split_type = boxer_subdivide(boxer, &boxer1, &boxer2)) != SPLIT_NONE)
+	{
+		if (boxer1 == NULL || boxer2 == NULL ||
+			extract_split_alloc(boxer->alloc, split_type, 2, psplit))
+		{
+			ret = -1;
+			goto fail_mid_split;
+		}
+		split = *psplit;
+		outf("depth=%d %s\n", depth, split_type == SPLIT_HORIZONTAL ? "H" : "V");
+		ret = analyse_sub(page, subpage, boxer1, &split->split[0], depth+1);
+		if (!ret) ret = analyse_sub(page, subpage, boxer2, &split->split[1], depth+1);
+		if (!ret)
+		{
+			if (split_type == SPLIT_HORIZONTAL)
+			{
+				split->split[0]->weight = boxer1->mediabox.max.x - boxer1->mediabox.min.x;
+				split->split[1]->weight = boxer2->mediabox.max.x - boxer2->mediabox.min.x;
+			}
+			else
+			{
+				split->split[0]->weight = boxer1->mediabox.max.y - boxer1->mediabox.min.y;
+				split->split[1]->weight = boxer2->mediabox.max.y - boxer2->mediabox.min.y;
+			}
+		}
+fail_mid_split:
+		boxer_destroy(boxer1);
+		boxer_destroy(boxer2);
+		boxer_destroy(boxer);
+		return ret;
+	}
+
+	outf("depth=%d LEAF\n", depth);
+
+	if (extract_split_alloc(boxer->alloc, SPLIT_NONE, 0, psplit))
+	{
+		boxer_destroy(boxer);
+		return -1;
+	}
+	split = *psplit;
+
+	ret = extract_subpage_subset(boxer->alloc, page, subpage, boxer->mediabox);
+
+#ifdef DEBUG_WRITE_AS_PS
+	{
+		int i, n;
+		rect_t *list;
+		boxer_sort(boxer);
+		n = boxer_results(boxer, &list);
+
+		printf("%% SUBDIVISION\n");
+		for (i = 0; i < n; i++)
+		{
+			printf("%% %g %g %g %g\n",
+				list[i].min.x, list[i].min.y, list[i].max.x, list[i].max.y);
+		}
+
+		printf("0 0 0 setrgbcolor\n");
+		for (i = 0; i < n; i++) {
+			printf("%g %g moveto\n%g %g lineto\n%g %g lineto\n%g %g lineto\nclosepath\nstroke\n\n",
+				list[i].min.x, list[i].min.y,
+				list[i].min.x, list[i].max.y,
+				list[i].max.x, list[i].max.y,
+				list[i].max.x, list[i].min.y);
+		}
+
+		printf("1 0 0 setrgbcolor\n");
+		printf("%g %g moveto\n%g %g lineto\n%g %g lineto\n%g %g lineto\nclosepath\nstroke\n\n",
+			margins.min.x, margins.min.y,
+			margins.min.x, margins.max.y,
+			margins.max.x, margins.max.y,
+			margins.max.x, margins.min.y);
+	}
+#endif
+	boxer_destroy(boxer);
+
+	return ret;
+}
+
+
+static int
+collate_splits(extract_alloc_t *alloc, split_t **psplit)
+{
+	split_t *split = *psplit;
+	int s;
+	int n = 0;
+	int i;
+	int j;
+	split_t *newsplit;
+
+	/* Recurse into all our children to ensure they are collated.
+	* Count how many children we'll have once we pull all the
+	* children of children that match our type up into us. */
+	for (s = 0; s < split->count; s++)
+	{
+		if (collate_splits(alloc, &split->split[s]))
+			return -1;
+		if (split->split[s]->type == split->type)
+			n += split->split[s]->count;
+		else
+			n++;
+	}
+
+	/* No change in the number of children? Just exit. */
+	if (n == split->count)
+		return 0;
+
+	if (extract_split_alloc(alloc, split->type, n, &newsplit))
+		return -1;
+
+	newsplit->weight = split->weight;
+
+	/* Now, run across our children. */
+	i = 0;
+	for (s = 0; s < split->count; s++)
+	{
+		split_t *sub = split->split[s];
+		if (sub->type == split->type)
+		{
+			/* If the type matches, pull the grandchildren into newsplit. */
+			for (j = 0; j < sub->count; j++)
+			{
+				newsplit->split[i++] = sub->split[j];
+				sub->split[j] = NULL;
+			}
+		}
+		else
+		{
+			/* Otherwise just move the child into newsplit. */
+			newsplit->split[i++] = sub;
+			split->split[s] = NULL;
+		}
+	}
+
+	extract_split_free(alloc, psplit);
+	*psplit = newsplit;
+
+	return 0;
+}
+
+int extract_page_analyse(extract_alloc_t *alloc, extract_page_t *page)
+{
+	boxer_t               *boxer;
+	subpage_t             *subpage = page->subpages[0];
+	content_span_iterator  sit;
+	span_t                *span;
+
+	/* This code will only work if the page contains a single subpage.
+	* This should always be the case if we're called from a page
+	* generated via extract_page_begin. */
+	if (page->subpages_num != 1) return 0;
+
+	/* Take the old subpages out from the page. */
+	page->subpages_num = 0;
+	extract_free(alloc, &page->subpages);
+
+#ifdef DEBUG_WRITE_AS_PS
+	printf("1 -1 scale 0 -%g translate\n", page->mediabox.max.y-page->mediabox.min.y);
+#endif
+
+	boxer = boxer_create(alloc, (rect_t *)&subpage->mediabox);
+
+	for (span = content_span_iterator_init(&sit, &subpage->content); span != NULL; span = content_span_iterator_next(&sit))
+	{
+		rect_t bbox = extract_span_bbox(span);
+		if (boxer_feed(boxer, &bbox))
+			goto fail;
+	}
+
+	if (analyse_sub(page, subpage, boxer, &page->split, 0))
+		goto fail;
+
+	if (collate_splits(boxer->alloc, &page->split))
+		goto fail;
+
+#ifdef DEBUG_WRITE_AS_PS
+	printf("showpage\n");
+#endif
+
+	boxer_destroy(boxer);
+	extract_subpage_free(alloc, &subpage);
+
+	return 0;
+
+fail:
+	outf("Analysis failed!\n");
+	boxer_destroy(boxer);
+	extract_subpage_free(alloc, &subpage);
+
+	return -1;
+}