diff mupdf-source/source/fitz/stext-boxer.c @ 2:b50eed0cc0ef upstream

ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4. The directory name has changed: no version number in the expanded directory now.
author Franz Glasner <fzglas.hg@dom66.de>
date Mon, 15 Sep 2025 11:43:07 +0200
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/mupdf-source/source/fitz/stext-boxer.c	Mon Sep 15 11:43:07 2025 +0200
@@ -0,0 +1,956 @@
+// Copyright (C) 2025 Artifex Software, Inc.
+//
+// This file is part of MuPDF.
+//
+// MuPDF is free software: you can redistribute it and/or modify it under the
+// terms of the GNU Affero General Public License as published by the Free
+// Software Foundation, either version 3 of the License, or (at your option)
+// any later version.
+//
+// MuPDF is distributed in the hope that it will be useful, but WITHOUT ANY
+// WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+// FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
+// details.
+//
+// You should have received a copy of the GNU Affero General Public License
+// along with MuPDF. If not, see <https://www.gnu.org/licenses/agpl-3.0.en.html>
+//
+// Alternative licensing terms are available from the licensor.
+// For commercial licensing, see <https://www.artifex.com/> or contact
+// Artifex Software, Inc., 39 Mesa Street, Suite 108A, San Francisco,
+// CA 94129, USA, for further information.
+
+#include "mupdf/fitz.h"
+
+#undef DEBUG_WRITE_AS_PS
+#undef DEBUG_STRUCT
+
+typedef struct boxer_s boxer_t;
+
+typedef struct {
+	int len;
+	int max;
+	fz_rect list[FZ_FLEXIBLE_ARRAY];
+} rectlist_t;
+
+struct boxer_s {
+	fz_rect mediabox;
+	rectlist_t *list;
+};
+
+static int fz_rect_contains_rect(fz_rect a, fz_rect b)
+{
+	if (a.x0 > b.x0)
+		return 0;
+	if (a.y0 > b.y0)
+		return 0;
+	if (a.x1 < b.x1)
+		return 0;
+	if (a.y1 < b.y1)
+		return 0;
+
+	return 1;
+}
+
+static rectlist_t *
+rectlist_create(fz_context *ctx, int max)
+{
+	rectlist_t *list = fz_malloc_flexible(ctx, rectlist_t, list, max);
+
+	list->len = 0;
+	list->max = max;
+
+	return list;
+}
+
+/* Push box onto rectlist, unless it is completely enclosed by
+ * another box, or completely encloses others (in which case they
+ * are replaced by it). */
+static void
+rectlist_append(rectlist_t *list, fz_rect *box)
+{
+	int i;
+
+	for (i = 0; i < list->len; i++)
+	{
+		fz_rect *r = &list->list[i];
+		fz_rect smaller, larger;
+		/* We allow ourselves a fudge factor of 4 points when checking for inclusion. */
+		double r_fudge = 4;
+
+		smaller.x0 = r->x0 + r_fudge;
+		larger. x0 = r->x0 - r_fudge;
+		smaller.y0 = r->y0 + r_fudge;
+		larger. y0 = r->y0 - r_fudge;
+		smaller.x1 = r->x1 - r_fudge;
+		larger. x1 = r->x1 + r_fudge;
+		smaller.y1 = r->y1 - r_fudge;
+		larger. y1 = r->y1 + r_fudge;
+
+		if (fz_rect_contains_rect(larger, *box))
+			return; /* box is enclosed! Nothing to do. */
+		if (fz_rect_contains_rect(*box, smaller))
+		{
+			/* box encloses r. Ditch r. */
+			/* Shorten the list */
+			--list->len;
+			/* If the one that just got chopped off wasn't r, move it down. */
+			if (i < list->len)
+			{
+				memcpy(r, &list->list[list->len], sizeof(*r));
+				i--; /* Reconsider this entry next time. */
+			}
+		}
+	}
+
+	assert(list->len < list->max);
+	memcpy(&list->list[list->len], box, sizeof(*box));
+	list->len++;
+}
+
+static boxer_t *
+boxer_create_length(fz_context *ctx, fz_rect *mediabox, int len)
+{
+	boxer_t *boxer = fz_malloc_struct(ctx, boxer_t);
+
+	if (boxer == NULL)
+		return NULL;
+
+	memcpy(&boxer->mediabox, mediabox, sizeof(*mediabox));
+	boxer->list = rectlist_create(ctx, len);
+
+	return boxer;
+}
+
+/* Create a boxer structure for a page of size mediabox. */
+static boxer_t *
+boxer_create(fz_context *ctx, fz_rect *mediabox)
+{
+	boxer_t *boxer = boxer_create_length(ctx, mediabox, 1);
+
+	if (boxer == NULL)
+		return NULL;
+	rectlist_append(boxer->list, mediabox);
+
+	return boxer;
+}
+
+static void
+push_if_intersect_suitable(rectlist_t *dst, const fz_rect *a, const fz_rect *b)
+{
+	fz_rect c;
+
+	/* Intersect a and b. */
+	c = fz_intersect_rect(*a, *b);
+	/* If no intersection, nothing to push. */
+	if (!fz_is_valid_rect(c))
+		return;
+
+	rectlist_append(dst, &c);
+}
+
+static void
+boxlist_feed_intersect(rectlist_t *dst, const rectlist_t *src, const fz_rect *box)
+{
+	int i;
+
+	for (i = 0; i < src->len; i++)
+		push_if_intersect_suitable(dst, &src->list[i], box);
+}
+
+/* Mark a given box as being occupied (typically by a glyph) */
+static void boxer_feed(fz_context *ctx, boxer_t *boxer, fz_rect *bbox)
+{
+	fz_rect box;
+	/* When we feed a box into a the boxer, we can never make
+	* the list more than 4 times as long. */
+	rectlist_t *newlist = rectlist_create(ctx, boxer->list->len * 4);
+
+#ifdef DEBUG_WRITE_AS_PS
+	printf("0 0 1 setrgbcolor\n");
+	printf("%g %g moveto %g %g lineto %g %g lineto %g %g lineto closepath fill\n",
+		bbox->x0, bbox->y0,
+		bbox->x0, bbox->y1,
+		bbox->x1, bbox->y1,
+		bbox->x1, bbox->y0
+	);
+#endif
+
+	/* Left (0,0) (x0,H) */
+	box.x0 = boxer->mediabox.x0;
+	box.y0 = boxer->mediabox.y0;
+	box.x1 = bbox->x0;
+	box.y1 = boxer->mediabox.y1;
+	boxlist_feed_intersect(newlist, boxer->list, &box);
+
+	/* Right (x1,0) (W,H) */
+	box.x0 = bbox->x1;
+	box.y0 = boxer->mediabox.y0;
+	box.x1 = boxer->mediabox.x1;
+	box.y1 = boxer->mediabox.y1;
+	boxlist_feed_intersect(newlist, boxer->list, &box);
+
+	/* Bottom (0,0) (W,y0) */
+	box.x0 = boxer->mediabox.x0;
+	box.y0 = boxer->mediabox.y0;
+	box.x1 = boxer->mediabox.x1;
+	box.y1 = bbox->y0;
+	boxlist_feed_intersect(newlist, boxer->list, &box);
+
+	/* Top (0,y1) (W,H) */
+	box.x0 = boxer->mediabox.x0;
+	box.y0 = bbox->y1;
+	box.x1 = boxer->mediabox.x1;
+	box.y1 = boxer->mediabox.y1;
+	boxlist_feed_intersect(newlist, boxer->list, &box);
+
+	fz_free(ctx, boxer->list);
+	boxer->list = newlist;
+}
+
+#ifdef DEBUG_WRITE_AS_PS
+static int
+compare_areas(const void *a_, const void *b_)
+{
+	const fz_rect *a = (const fz_rect *)a_;
+	const fz_rect *b = (const fz_rect *)b_;
+	double area_a = (a->x1-a->x0) * (a->y1-a->y0);
+	double area_b = (b->x1-b->x0) * (b->y1-b->y0);
+
+	if (area_a < area_b)
+		return 1;
+	else if (area_a > area_b)
+		return -1;
+	else
+		return 0;
+}
+
+/* Sort the rectangle list to be largest area first. For ease of humans
+ * reading debug output. */
+static void boxer_sort(boxer_t *boxer)
+{
+	qsort(boxer->list->list, boxer->list->len, sizeof(fz_rect), compare_areas);
+}
+
+/* Get the rectangle list for a given boxer. Return value is the length of
+ * the list. Lifespan is until the boxer is modified or freed. */
+static int boxer_results(boxer_t *boxer, fz_rect **list)
+{
+	*list = boxer->list->list;
+	return boxer->list->len;
+}
+#endif
+
+/* Currently unused debugging routine */
+#if 0
+static void
+boxer_dump(fz_context *ctx, boxer_t *boxer)
+{
+	int i;
+
+	printf("bbox = %g %g %g %g\n", boxer->mediabox.x0, boxer->mediabox.y0, boxer->mediabox.x1, boxer->mediabox.y1);
+	for (i = 0; i < boxer->list->len; i++)
+	{
+		printf("%d %g %g %g %g\n", i, boxer->list->list[i].x0, boxer->list->list[i].y0, boxer->list->list[i].x1, boxer->list->list[i].y1);
+	}
+}
+#endif
+
+/* Destroy a boxer. */
+static void boxer_destroy(fz_context *ctx, boxer_t *boxer)
+{
+	if (!boxer)
+		return;
+
+	fz_free(ctx, boxer->list);
+	fz_free(ctx, boxer);
+}
+
+/* Find the margins for a given boxer. */
+static fz_rect boxer_margins(boxer_t *boxer)
+{
+	rectlist_t *list = boxer->list;
+	int i;
+	fz_rect margins = boxer->mediabox;
+
+	for (i = 0; i < list->len; i++)
+	{
+		fz_rect *r = &list->list[i];
+		if (r->x0 <= margins.x0 && r->y0 <= margins.y0 && r->y1 >= margins.y1)
+			margins.x0 = r->x1; /* Left Margin */
+		else if (r->x1 >= margins.x1 && r->y0 <= margins.y0 && r->y1 >= margins.y1)
+			margins.x1 = r->x0; /* Right Margin */
+		else if (r->x0 <= margins.x0 && r->x1 >= margins.x1 && r->y0 <= margins.y0)
+			margins.y0 = r->y1; /* Top Margin */
+		else if (r->x0 <= margins.x0 && r->x1 >= margins.x1 && r->y1 >= margins.y1)
+			margins.y1 = r->y0; /* Bottom Margin */
+	}
+
+	return margins;
+}
+
+/* Create a new boxer from a subset of an old one. */
+static boxer_t *boxer_subset(fz_context *ctx, boxer_t *boxer, fz_rect rect)
+{
+	boxer_t *new_boxer = boxer_create_length(ctx, &rect, boxer->list->len);
+	int i;
+
+	if (new_boxer == NULL)
+		return NULL;
+
+	for (i = 0; i < boxer->list->len; i++)
+	{
+		fz_rect r = fz_intersect_rect(boxer->list->list[i], rect);
+
+		if (fz_is_empty_rect(r))
+			continue;
+		rectlist_append(new_boxer->list, &r);
+	}
+
+	return new_boxer;
+}
+
+static int analyse_sub(fz_context *ctx, fz_stext_page *page, fz_stext_block **first_block, fz_stext_block **last_block, boxer_t *big_boxer, int depth);
+
+static void
+analyse_subset(fz_context *ctx, fz_stext_page *page, fz_stext_block **first_block, fz_stext_block **last_block, boxer_t *boxer, fz_rect r, int depth)
+{
+	boxer_t *sub_box = boxer_subset(ctx, boxer, r);
+
+	fz_try(ctx)
+		(void)analyse_sub(ctx, page, first_block, last_block, sub_box, depth);
+	fz_always(ctx)
+		boxer_destroy(ctx, sub_box);
+	fz_catch(ctx)
+		fz_rethrow(ctx);
+}
+
+/* Consider a boxer for subdivision.
+ * Returns 0 if no suitable subdivision point found.
+ * Returns 1 if a subdivision point is found.*/
+static int
+boxer_subdivide(fz_context *ctx, fz_stext_page *page, fz_stext_block **first_block, fz_stext_block **last_block, boxer_t *boxer, int depth)
+{
+	rectlist_t *list = boxer->list;
+	double max_h = 0, max_v = 0;
+	int i;
+
+	for (i = 0; i < list->len; i++)
+	{
+		fz_rect r = boxer->list->list[i];
+
+		if (r.x0 <= boxer->mediabox.x0 && r.x1 >= boxer->mediabox.x1)
+		{
+			/* Horizontal divider */
+			double size = r.y1 - r.y0;
+			if (size > max_h)
+			{
+				max_h = size;
+			}
+		}
+		if (r.y0 <= boxer->mediabox.y0 && r.y1 >= boxer->mediabox.y1)
+		{
+			/* Vertical divider */
+			double size = r.x1 - r.x0;
+			if (size > max_v)
+			{
+				max_v = size;
+			}
+		}
+	}
+
+	if (max_h > max_v)
+	{
+		fz_rect r;
+		float min_gap;
+		float top;
+
+		/* Divider runs horizontally. */
+
+		/* We want to list out all the horizontal subregions that are separated
+		 * by an appropriate gap, from top to bottom. */
+
+		/* Any gap larger than the the gap we ignored will do. */
+		min_gap = max_v;
+
+		/* We're going to need to run through the data multiple times to find the
+		 * topmost block each time. We'll use 'top' to cull against, and gradually
+		 * lower that after each successful pass. */
+		top = boxer->mediabox.y0;
+		while (1)
+		{
+			/* Find the topmost divider below 'top' of height at least min_gap */
+			float found_top;
+			int found = -1;
+			for (i = 0; i < list->len; i++)
+			{
+				fz_rect *b = &list->list[i];
+				if (b->x0 <= boxer->mediabox.x0 && boxer->mediabox.x1 <= b->x1 && b->y0 > top && b->y1 - b->y0 >= min_gap)
+				{
+					if (found == -1 || b->y0 < found_top)
+					{
+						found = i;
+						found_top = b->y0;
+					}
+				}
+			}
+
+			/* If we failed to find one, we're done. */
+			if (found == -1)
+				break;
+
+			/* So we have a region from top to found_top */
+			r = boxer->mediabox;
+			r.y0 = top;
+			r.y1 = found_top;
+
+			analyse_subset(ctx, page, first_block, last_block, boxer, r, depth);
+
+			/* Now move top down for the next go. */
+			top = list->list[found].y1;
+		}
+
+		/* One final region, from top to bottom */
+		r = boxer->mediabox;
+		r.y0 = top;
+		analyse_subset(ctx, page, first_block, last_block, boxer, r, depth);
+
+		return 1;
+	}
+	else if (max_v > 0)
+	{
+		fz_rect r;
+		float min_gap;
+		float left;
+
+		/* Divider runs vertically. */
+
+		/* We want to list out all the vertical subregions that are separated
+		 * by an appropriate gap, from left to right. */
+
+		/* Any gap larger than the the gap we ignored will do. */
+		min_gap = max_h;
+
+		/* We're going to need to run through the data multiple times to find the
+		 * leftmost block each time. We'll use 'left' to cull against, and gradually
+		 * lower that after each successful pass. */
+		left = boxer->mediabox.x0;
+		while (1)
+		{
+			/* Find the leftmost divider to the right of 'left' of width at least min_gap */
+			float found_left;
+			int found = -1;
+			for (i = 0; i < list->len; i++)
+			{
+				fz_rect *b = &list->list[i];
+				if (b->y0 <= boxer->mediabox.y0 && boxer->mediabox.y1 <= b->y1 && b->x0 > left && b->x1 - b->x0 >= min_gap)
+				{
+					if (found == -1 || b->x0 < found_left)
+					{
+						found = i;
+						found_left = b->x0;
+					}
+				}
+			}
+
+			/* If we failed to find one, we're done. */
+			if (found == -1)
+				break;
+
+			/* So we have a region from top to found_top */
+			r = boxer->mediabox;
+			r.x0 = left;
+			r.x1 = found_left;
+			analyse_subset(ctx, page, first_block, last_block, boxer, r, depth);
+
+			/* Now move left right for the next go. */
+			left = list->list[found].x1;
+		}
+
+		/* One final region, from left to right */
+		r = boxer->mediabox;
+		r.x0 = left;
+		analyse_subset(ctx, page, first_block, last_block, boxer, r, depth);
+
+		return 1;
+	}
+
+	return 0;
+}
+
+static void
+new_stext_struct(fz_context *ctx, fz_stext_page *page, fz_stext_block *block, fz_structure standard, const char *raw)
+{
+	fz_stext_struct *str;
+	size_t z;
+
+	if (raw == NULL)
+		raw = "";
+	z = strlen(raw);
+
+	str = fz_pool_alloc(ctx, page->pool, offsetof(fz_stext_struct, raw) + z + 1);
+	str->first_block = NULL;
+	str->last_block = NULL;
+	str->standard = standard;
+	str->parent = page->last_struct;
+	str->up = block;
+	memcpy(str->raw, raw, z+1);
+
+	block->u.s.down = str;
+}
+
+#ifdef DEBUG_STRUCT
+static void
+do_dump_stext(fz_stext_block *block, int depth)
+{
+	int d;
+
+	while (block)
+	{
+		for (d = 0; d < depth; d++)
+			printf(" ");
+		switch(block->type)
+		{
+			case FZ_STEXT_BLOCK_TEXT:
+				printf("TEXT %p\n", block);
+				break;
+			case FZ_STEXT_BLOCK_IMAGE:
+				printf("IMAGE %p\n", block);
+				break;
+			case FZ_STEXT_BLOCK_VECTOR:
+				printf("VECTOR %p\n", block);
+				break;
+			case FZ_STEXT_BLOCK_STRUCT:
+				printf("STRUCT %p\n", block);
+				do_dump_stext(block->u.s.down->first_block, depth+1);
+				break;
+		}
+		block = block->next;
+	}
+}
+
+static void
+dump_stext(char *str, fz_stext_block *block)
+{
+	printf("%s\n", str);
+
+	do_dump_stext(block, 0);
+}
+#endif
+
+static void
+recalc_bbox(fz_stext_block *block)
+{
+	fz_rect bbox = fz_empty_rect;
+	fz_stext_line *line;
+
+	for (line = block->u.t.first_line; line != NULL; line = line->next)
+		bbox = fz_union_rect(bbox, line->bbox);
+
+	block->bbox = bbox;
+}
+
+static fz_stext_struct *
+page_subset(fz_context *ctx, fz_stext_page *page, fz_stext_block **first_block, fz_stext_block **last_block, fz_rect mediabox)
+{
+	fz_stext_block *block, *next_block;
+	fz_stext_block *target = NULL;
+	fz_stext_block *last = NULL;
+	fz_stext_block *newblock;
+	int idx = 0;
+
+#ifdef DEBUG_STRUCT
+	dump_stext("BEFORE", *first_block);
+#endif
+
+	for (block = *first_block; block != NULL; block = next_block)
+	{
+		fz_rect bbox;
+
+		next_block = block->next;
+		if (block->type != FZ_STEXT_BLOCK_TEXT && block->type != FZ_STEXT_BLOCK_VECTOR)
+			continue;
+
+		bbox = block->bbox;
+
+		/* Can we take the whole block? */
+		if (bbox.x0 >= mediabox.x0 && bbox.y0 >= mediabox.y0 && bbox.x1 <= mediabox.x1 && bbox.y1 <= mediabox.y1)
+		{
+			/* Unlink block from the current list. */
+			if (block->prev)
+				block->prev->next = next_block;
+			else
+				*first_block = next_block;
+			if (next_block)
+				next_block->prev = block->prev;
+			else
+				*last_block = block->prev;
+
+			/* Add block onto our target list */
+			if (target == NULL)
+			{
+				target = block;
+				block->prev = NULL;
+			}
+			else
+			{
+				last->next = block;
+				block->prev = last;
+			}
+			last = block;
+			block->next = NULL;
+		}
+		else if (block->type == FZ_STEXT_BLOCK_TEXT && !fz_is_empty_rect(fz_intersect_rect(bbox, mediabox)))
+		{
+			/* Need to look at the parts. */
+			fz_stext_line *line, *next_line;
+
+			newblock = NULL;
+			for (line = block->u.t.first_line; line != NULL; line = next_line)
+			{
+				next_line = line->next;
+				if (line->bbox.x0 >= mediabox.x0 && line->bbox.y0 >= mediabox.y0 && line->bbox.x1 <= mediabox.x1 && line->bbox.y1 <= mediabox.y1)
+				{
+					/* We need to take this line */
+					if (newblock == NULL)
+					{
+						newblock = fz_pool_alloc(ctx, page->pool, sizeof(fz_stext_block));
+
+						/* Add the block onto our target list */
+						if (target == NULL)
+						{
+							target = newblock;
+						}
+						else
+						{
+							last->next = newblock;
+							newblock->prev = last;
+						}
+						last = newblock;
+					}
+
+					/* Unlink line from the current list. */
+					if (line->prev)
+						line->prev->next = next_line;
+					else
+						block->u.t.first_line = next_line;
+					if (next_line)
+						next_line->prev = line->prev;
+					else
+						block->u.t.last_line = line->prev;
+
+					/* Add line onto our new block */
+					if (newblock->u.t.last_line == NULL)
+					{
+						newblock->u.t.first_line = newblock->u.t.last_line = line;
+						line->prev = NULL;
+					}
+					else
+					{
+						line->prev = newblock->u.t.last_line;
+						newblock->u.t.last_line->next = line;
+						newblock->u.t.last_line = line;
+					}
+					line->next = NULL;
+				}
+			}
+			if (newblock)
+			{
+				recalc_bbox(block);
+				recalc_bbox(newblock);
+			}
+		}
+	}
+
+	/* If no content to add, bale! */
+	if (target == NULL)
+		return NULL;
+
+	/* We want to insert a structure node that contains target as the last structure
+	 * node on this blocklist. Find the first block that's not a structure block. */
+	for (block = *first_block; block != NULL; block = block->next)
+	{
+		if (block->type != FZ_STEXT_BLOCK_STRUCT)
+			break;
+		idx++;
+	}
+
+	/* So we want to insert just before block. */
+
+	/* We are going to need to create a new block. Create a complete unlinked one here. */
+	newblock = fz_pool_alloc(ctx, page->pool, sizeof *newblock);
+	newblock->bbox = fz_empty_rect;
+	newblock->prev = block ? block->prev : *last_block;
+	newblock->next = block;
+	newblock->type = FZ_STEXT_BLOCK_STRUCT;
+	newblock->u.s.index = idx;
+	newblock->u.s.down = NULL;
+	/* If this throws, we leak newblock but it's within the pool, so it doesn't matter. */
+	/* And create a new struct and have newblock point to it. */
+	new_stext_struct(ctx, page, newblock, FZ_STRUCTURE_DIV, "Split");
+
+	/* Now insert newblock just before block */
+	/* If block was first, now we are. */
+	if (*first_block == block)
+		*first_block = newblock;
+	if (block == NULL)
+	{
+		/* Inserting at the end! */
+		if (*last_block)
+			(*last_block)->next = newblock;
+		*last_block = newblock;
+	}
+	else
+	{
+		if (block->prev)
+			block->prev->next = newblock;
+		block->prev = newblock;
+	}
+
+	newblock->u.s.down->first_block = target;
+	target->prev = NULL;
+
+	for (block = target; block->next != NULL; block = block->next)
+		newblock->bbox = fz_union_rect(newblock->bbox, block->bbox);
+	newblock->bbox = fz_union_rect(newblock->bbox, block->bbox);
+	newblock->u.s.down->last_block = block;
+
+#ifdef DEBUG_STRUCT
+	dump_stext("AFTER", *first_block);
+#endif
+
+	return newblock->u.s.down;
+}
+
+enum {
+	MAX_ANALYSIS_DEPTH = 6
+};
+
+static int
+analyse_sub(fz_context *ctx, fz_stext_page *page, fz_stext_block **first_block, fz_stext_block **last_block, boxer_t *big_boxer, int depth)
+{
+	fz_rect margins;
+	boxer_t *boxer;
+	boxer_t *boxer1 = NULL;
+	boxer_t *boxer2 = NULL;
+	fz_stext_struct *div;
+	int ret = 0;
+
+	/* Find the margins in the enclosing boxer. This returns
+	 * a subset of the bbox of the original. */
+	margins = boxer_margins(big_boxer);
+#ifdef DEBUG_WRITE_AS_PS
+	printf("\n\n%% MARGINS %g %g %g %g\n", margins.x0, margins.y0, margins.x1, margins.y1);
+#endif
+
+	/* Now subset the rectangles just to include those that are in our bbox. */
+	boxer = boxer_subset(ctx, big_boxer, margins);
+
+	fz_var(boxer1);
+	fz_var(boxer2);
+
+	fz_try(ctx)
+	{
+		div = page_subset(ctx, page, first_block, last_block, boxer->mediabox);
+		/* If nothing subsetted (no textual content in that region), give up. */
+		if (div == NULL)
+			break;
+
+		ret = 1;
+
+		if (depth < MAX_ANALYSIS_DEPTH)
+		{
+			/* Can we subdivide that region any more? */
+			if (boxer_subdivide(ctx, page, &div->first_block, &div->last_block, boxer, depth+1))
+				break;
+		}
+
+#ifdef DEBUG_WRITE_AS_PS
+		{
+			int i, n;
+			fz_rect *list;
+			boxer_sort(boxer);
+			n = boxer_results(boxer, &list);
+
+			printf("%% SUBDIVISION\n");
+			for (i = 0; i < n; i++)
+			{
+				printf("%% %g %g %g %g\n",
+					list[i].x0, list[i].y0, list[i].x1, list[i].y1);
+			}
+
+			printf("0 0 0 setrgbcolor\n");
+			for (i = 0; i < n; i++) {
+				printf("%g %g moveto\n%g %g lineto\n%g %g lineto\n%g %g lineto\nclosepath\nstroke\n\n",
+					list[i].x0, list[i].y0,
+					list[i].x0, list[i].y1,
+					list[i].x1, list[i].y1,
+					list[i].x1, list[i].y0);
+			}
+
+			printf("1 0 0 setrgbcolor\n");
+			printf("%g %g moveto\n%g %g lineto\n%g %g lineto\n%g %g lineto\nclosepath\nstroke\n\n",
+				margins.x0, margins.y0,
+				margins.x0, margins.y1,
+				margins.x1, margins.y1,
+				margins.x1, margins.y0);
+		}
+#endif
+	}
+	fz_always(ctx)
+	{
+		boxer_destroy(ctx, boxer1);
+		boxer_destroy(ctx, boxer2);
+		boxer_destroy(ctx, boxer);
+	}
+	fz_catch(ctx)
+		fz_rethrow(ctx);
+
+	return ret;
+}
+
+static int
+line_isnt_all_spaces(fz_context *ctx, fz_stext_line *line)
+{
+	fz_stext_char *ch;
+	for (ch = line->first_char; ch != NULL; ch = ch->next)
+		if (ch->c != 32 && ch->c != 160)
+			return 1;
+	return 0;
+}
+
+static void
+feed_line(fz_context *ctx, boxer_t *boxer, fz_stext_line *line)
+{
+	fz_stext_char *ch;
+
+	for (ch = line->first_char; ch != NULL; ch = ch->next)
+	{
+		fz_rect r = fz_empty_rect;
+
+		if (ch->c == ' ')
+			continue;
+
+		do
+		{
+			fz_rect bbox = fz_rect_from_quad(ch->quad);
+			float margin = ch->size/2;
+			bbox.x0 -= margin;
+			bbox.y0 -= margin;
+			bbox.x1 += margin;
+			bbox.y1 += margin;
+			r = fz_union_rect(r, bbox);
+			ch = ch->next;
+		}
+		while (ch != NULL && ch->c != ' ');
+		boxer_feed(ctx, boxer, &r);
+		if (ch == NULL)
+			break;
+	}
+}
+
+/* Internal, non-API function, shared with stext-table. */
+fz_rect
+fz_collate_small_vector_run(fz_stext_block **blockp)
+{
+	fz_stext_block *block = *blockp;
+	fz_stext_block *next;
+	fz_rect r = block->bbox;
+	int MAX_SIZE = 2;
+	int MAX_GAP = 2;
+
+	float w = r.x1 - r.x0;
+	float h = r.y1 - r.y0;
+
+	if (w < MAX_SIZE)
+	{
+		while ((next = block->next) != NULL &&
+			next->type == FZ_STEXT_BLOCK_VECTOR &&
+			next->bbox.x0 == r.x0 &&
+			next->bbox.x1 == r.x1 &&
+			((next->bbox.y1 > r.y1 && next->bbox.y0 <= r.y1 + MAX_GAP) ||
+			(next->bbox.y0 < r.y0 && next->bbox.y1 >= r.y0 - MAX_GAP)))
+		{
+			r = fz_union_rect(r, next->bbox);
+			block = next;
+		}
+	}
+	if (h < MAX_SIZE)
+	{
+		while ((next = block->next) != NULL &&
+			next->type == FZ_STEXT_BLOCK_VECTOR &&
+			next->bbox.y0 == r.y0 &&
+			next->bbox.y1 == r.y1 &&
+			((next->bbox.x1 > r.x1 && next->bbox.x0 <= r.x1 + MAX_GAP) ||
+			(next->bbox.x0 < r.x0 && next->bbox.x1 >= r.x0 - MAX_GAP)))
+		{
+			r = fz_union_rect(r, next->bbox);
+			block = next;
+		}
+	}
+
+	*blockp = block;
+
+	return r;
+}
+
+int fz_segment_stext_page(fz_context *ctx, fz_stext_page *page)
+{
+	boxer_t *boxer;
+	fz_stext_block *block;
+	int ret = 0;
+
+	/* If we have structure already, give up. We can't hope to beat
+	 * proper structure! */
+	for (block = page->first_block; block != NULL; block = block->next)
+		if (block->type == FZ_STEXT_BLOCK_STRUCT)
+			return 0;
+
+#ifdef DEBUG_WRITE_AS_PS
+	printf("1 -1 scale 0 -%g translate\n", page->mediabox.y1-page->mediabox.y0);
+#endif
+
+	boxer = boxer_create(ctx, &page->mediabox);
+
+	fz_try(ctx)
+	{
+		/* Just walking the blocks is safe as we're assuming no structure here. */
+		for (block = page->first_block; block != NULL; block = block->next)
+		{
+			fz_stext_line *line;
+			switch (block->type)
+			{
+			case FZ_STEXT_BLOCK_TEXT:
+				for (line = block->u.t.first_line; line != NULL; line = line->next)
+					if (line_isnt_all_spaces(ctx, line))
+						feed_line(ctx, boxer, line);
+				break;
+			case FZ_STEXT_BLOCK_VECTOR:
+			{
+				/* Allow a 1 point margin around vectors to avoid hairline
+				 * cracks between supposedly abutting things. */
+				int VECTOR_MARGIN = 1;
+				fz_rect r = fz_collate_small_vector_run(&block);
+
+				r.x0 -= VECTOR_MARGIN;
+				r.y0 -= VECTOR_MARGIN;
+				r.x1 += VECTOR_MARGIN;
+				r.y1 += VECTOR_MARGIN;
+				boxer_feed(ctx, boxer, &r);
+			}
+			}
+		}
+
+		ret = analyse_sub(ctx, page, &page->first_block, &page->last_block, boxer, 0);
+	}
+	fz_always(ctx)
+		boxer_destroy(ctx, boxer);
+	fz_catch(ctx)
+		fz_rethrow(ctx);
+
+#ifdef DEBUG_WRITE_AS_PS
+	printf("showpage\n");
+#endif
+
+	return ret;
+}