diff mupdf-source/source/fitz/stext-device.c @ 2:b50eed0cc0ef upstream

ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4. The directory name has changed: no version number in the expanded directory now.
author Franz Glasner <fzglas.hg@dom66.de>
date Mon, 15 Sep 2025 11:43:07 +0200
parents
children aa33339d6b8a
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/mupdf-source/source/fitz/stext-device.c	Mon Sep 15 11:43:07 2025 +0200
@@ -0,0 +1,2539 @@
+// Copyright (C) 2004-2025 Artifex Software, Inc.
+//
+// This file is part of MuPDF.
+//
+// MuPDF is free software: you can redistribute it and/or modify it under the
+// terms of the GNU Affero General Public License as published by the Free
+// Software Foundation, either version 3 of the License, or (at your option)
+// any later version.
+//
+// MuPDF is distributed in the hope that it will be useful, but WITHOUT ANY
+// WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+// FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
+// details.
+// You should have received a copy of the GNU Affero General Public License
+// along with MuPDF. If not, see <https://www.gnu.org/licenses/agpl-3.0.en.html>
+//
+// Alternative licensing terms are available from the licensor.
+// For commercial licensing, see <https://www.artifex.com/> or contact
+// Artifex Software, Inc., 39 Mesa Street, Suite 108A, San Francisco,
+// CA 94129, USA, for further information.
+
+#include "mupdf/fitz.h"
+
+#include "glyphbox.h"
+
+#include <float.h>
+#include <string.h>
+
+/* Simple layout structure */
+
+/*
+ * Create an empty layout block.
+ *
+ * The block is allocated from a freshly created pool, which it then
+ * owns; fz_drop_layout() releases that pool. If setting the block up
+ * throws, the pool is dropped before the error is rethrown.
+ */
+fz_layout_block *fz_new_layout(fz_context *ctx)
+{
+	fz_pool *arena = fz_new_pool(ctx);
+	fz_layout_block *layout = NULL;
+
+	fz_try(ctx)
+	{
+		layout = fz_pool_alloc(ctx, arena, sizeof *layout);
+		/* No lines yet: the tail pointer refers to the (empty) head. */
+		layout->head = NULL;
+		layout->tailp = &layout->head;
+		layout->pool = arena;
+	}
+	fz_catch(ctx)
+	{
+		fz_drop_pool(ctx, arena);
+		fz_rethrow(ctx);
+	}
+
+	return layout;
+}
+
+/*
+ * Release a layout block by dropping the pool it was allocated from.
+ * Passing NULL is allowed and does nothing.
+ */
+void fz_drop_layout(fz_context *ctx, fz_layout_block *block)
+{
+	if (block == NULL)
+		return;
+
+	fz_drop_pool(ctx, block->pool);
+}
+
+/*
+ * Append a new line, with no characters yet, to the end of a layout
+ * block. The line records its position (x, y), its font size and the
+ * caller supplied pointer p; the pointer is stored, not copied.
+ */
+void fz_add_layout_line(fz_context *ctx, fz_layout_block *block, float x, float y, float font_size, const char *p)
+{
+	fz_layout_line *entry = fz_pool_alloc(ctx, block->pool, sizeof *entry);
+
+	entry->p = p;
+	entry->font_size = font_size;
+	entry->x = x;
+	entry->y = y;
+	entry->text = NULL;
+	entry->next = NULL;
+
+	/* Hook the line onto the tail of the block's list of lines... */
+	*block->tailp = entry;
+	block->tailp = &entry->next;
+
+	/* ...and make it the line that fz_add_layout_char() appends to. */
+	block->text_tailp = &entry->text;
+}
+
+/*
+ * Append a character record (position x, advance, and the caller
+ * supplied pointer p) to the line most recently added with
+ * fz_add_layout_line(). The character is appended through
+ * block->text_tailp, which is only set up by fz_add_layout_line().
+ */
+void fz_add_layout_char(fz_context *ctx, fz_layout_block *block, float x, float advance, const char *p)
+{
+	fz_layout_char *glyph = fz_pool_alloc(ctx, block->pool, sizeof *glyph);
+
+	glyph->p = p;
+	glyph->advance = advance;
+	glyph->x = x;
+	glyph->next = NULL;
+
+	/* Link onto the tail of the current line's character list. */
+	*block->text_tailp = glyph;
+	block->text_tailp = &glyph->next;
+}
+
+/* Extract text into blocks and lines. */
+
+/*
+ * Thresholds used by fz_add_stext_char_imp() when deciding how a new
+ * character relates to the previous one. Each is compared against a
+ * distance that has already been divided by the text size.
+ */
+#define PARAGRAPH_DIST 1.5f /* baseline offset beyond which a new block (paragraph) is started */
+#define SPACE_DIST 0.15f /* in-line gap below which the motion is ignored */
+#define SPACE_MAX_DIST 0.8f /* largest in-line gap that still continues the current line */
+#define BASE_MAX_DIST 0.8f /* baseline offset below which the character stays on the same baseline */
+#define FAKE_BOLD_MAX_DIST 0.1f /* a repeat of the last character ending this close to the pen is dropped */
+
+/* We keep a stack of the different metatexts that apply at any
+ * given point (normally none!). Whenever we get some content
+ * with a metatext in force, we really want to update the bounds
+ * for that metatext. But running along the whole list each time
+ * would be painful. So we just update the bounds for dev->metatext
+ * and rely on metatext_bounds() propagating it upwards 'just in
+ * time' for us to use metatexts other than the latest one. This
+ * also means we need to propagate bounds upwards when we pop
+ * a metatext.
+ *
+ * Why do we need bounds at all? Well, suppose we get:
+ *    /Span <</ActualText (c) >> BDC /Im0 Do EMC
+ * Then where on the page do we put 'c' ? By collecting the
+ * bounds, we can place 'c' wherever the image was.
+ */
+typedef struct metatext_t
+{
+	fz_metatext type; /* Kind of metatext, e.g. FZ_METATEXT_ACTUALTEXT. */
+	char *text; /* The metatext string; shortened in place as ActualText is consumed. */
+	fz_rect bounds; /* Bounds collected for this entry (see above). */
+	struct metatext_t *prev; /* Next entry down the stack, or NULL at the bottom. */
+} metatext_t;
+
+/* One entry in the device's list of 'rects' (see fz_stext_device.rects).
+ * NOTE(review): the code that fills and consumes these is not visible
+ * here; presumably each entry is a line segment with a thickness, used
+ * for detecting underlines/strikeouts - confirm against the users. */
+typedef struct
+{
+	fz_point from; /* One end point. */
+	fz_point to; /* The other end point. */
+	float thickness; /* Thickness associated with the segment. */
+} rect_details;
+
+/* State for the structured text device. */
+typedef struct
+{
+	fz_device super; /* Base device; passed on as &dev->super to generic device helpers. */
+	fz_stext_page *page; /* The page that blocks, lines and chars are added to. */
+	int id; /* NOTE(review): not used in the visible code - confirm purpose. */
+	/* pen: end point of the last char added.
+	 * start: start point of the first char on the current line. */
+	fz_point pen, start;
+	fz_point lag_pen; /* Start point of the last char added. */
+	fz_matrix trm; /* Matrix of the last char processed. */
+	int new_obj; /* Cleared after each char; presumably set when a new text object begins - confirm. */
+	int lastchar; /* Unicode value of the last char processed. */
+	int lastbidi; /* bidi value of the last char processed. */
+	int flags; /* FZ_STEXT_* option bits tested while extracting. */
+	int color; /* Stored as the argb value of each char added. */
+	int last_was_fake_bold; /* Non-zero if the previous glyph was swallowed as a fake-bold overprint. */
+	const fz_text *lasttext; /* NOTE(review): not used in the visible code - confirm purpose. */
+	fz_stext_options opts; /* Options; opts.flags and opts.clip are read during extraction. */
+
+	metatext_t *metatext; /* Top of the metatext stack, or NULL if none is in force. */
+
+	/* Store the last values we saw. We need this for flushing the actualtext. */
+	struct
+	{
+		int valid; /* Set once any text item has been recorded here. */
+		int clipped; /* Whether the last item was entirely outside the clip. */
+		fz_matrix trm; /* Item matrix combined with the ctm. */
+		int wmode; /* Writing mode of the last span. */
+		int bidi_level; /* Bidi level of the last span. */
+		fz_font *font; /* Font of the last span; a reference is held. */
+		int flags; /* Flags passed for the last item. */
+	} last;
+
+	/* The list of 'rects' seen during processing (if we're collecting styles). */
+	int rect_max; /* NOTE(review): presumably the allocated capacity of rects - confirm. */
+	int rect_len; /* NOTE(review): presumably the number of entries in use - confirm. */
+	rect_details *rects;
+} fz_stext_device;
+
+/* Human readable summary of the text output options, one option per
+ * line. This is a runtime string, so the wording is part of the
+ * program's output. */
+const char *fz_stext_options_usage =
+	"Text output options:\n"
+	"\tpreserve-images: keep images in output\n"
+	"\tpreserve-ligatures: do not expand ligatures into constituent characters\n"
+	"\tpreserve-spans: do not merge spans on the same line\n"
+	"\tpreserve-whitespace: do not convert all whitespace into space characters\n"
+	"\tinhibit-spaces: don't add spaces between gaps in the text\n"
+	"\tparagraph-break: break blocks at paragraph boundaries\n"
+	"\tdehyphenate: attempt to join up hyphenated words\n"
+	"\tignore-actualtext: do not apply ActualText replacements\n"
+	"\tuse-cid-for-unknown-unicode: use character code if unicode mapping fails\n"
+	"\tuse-gid-for-unknown-unicode: use glyph index if unicode mapping fails\n"
+	"\taccurate-bboxes: calculate char bboxes from the outlines\n"
+	"\taccurate-ascenders: calculate ascender/descender from font glyphs\n"
+	"\taccurate-side-bearings: expand char bboxes to completely include width of glyphs\n"
+	"\tcollect-styles: attempt to detect text features (fake bold, strikeout, underlined etc)\n"
+	"\tclip: do not include text that is completely clipped\n"
+	"\tclip-rect=x0:y0:x1:y1 specify clipping rectangle within which to collect content\n"
+	"\tstructured: collect structure markup\n"
+	"\tvectors: include vector bboxes in output\n"
+	"\tsegment: attempt to segment the page\n"
+	"\ttable-hunt: hunt for tables within a (segmented) page\n"
+	"\n";
+
+/* Return the topmost ActualText entry on the metatext stack, or NULL
+ * if no ActualText is in force. dev must not be NULL. */
+static metatext_t *
+find_actualtext(fz_stext_device *dev)
+{
+	metatext_t *entry;
+
+	for (entry = dev->metatext; entry != NULL; entry = entry->prev)
+		if (entry->type == FZ_METATEXT_ACTUALTEXT)
+			return entry;
+
+	return NULL;
+}
+
+/* Return a pointer to the bounds of the given metatext entry.
+ *
+ * Bounds are only accumulated on the top of the stack, so before
+ * answering we fold each entry's bounds into the entry below it,
+ * from the top of the stack down to mt. Neither mt nor dev may be
+ * NULL, and mt must be on dev's metatext stack. */
+static fz_rect *
+metatext_bounds(metatext_t *mt, fz_stext_device *dev)
+{
+	metatext_t *cur;
+
+	for (cur = dev->metatext; cur != mt; cur = cur->prev)
+	{
+		metatext_t *below = cur->prev;
+		below->bounds = fz_union_rect(below->bounds, cur->bounds);
+	}
+
+	return &mt->bounds;
+}
+
+/* Return the (up to date) bounds of the ActualText currently in
+ * force, or NULL when there is none. dev must not be NULL. */
+static fz_rect *
+actualtext_bounds(fz_stext_device *dev)
+{
+	metatext_t *actual = find_actualtext(dev);
+
+	return actual != NULL ? metatext_bounds(actual, dev) : NULL;
+}
+
+/*
+ * Create an empty structured text page covering mediabox.
+ *
+ * The page is allocated from a freshly created pool that it then
+ * owns. If setting the page up throws, the pool is dropped before
+ * the error is rethrown.
+ */
+fz_stext_page *
+fz_new_stext_page(fz_context *ctx, fz_rect mediabox)
+{
+	fz_stext_page *fresh = NULL;
+	fz_pool *arena = fz_new_pool(ctx);
+
+	fz_try(ctx)
+	{
+		fresh = fz_pool_alloc(ctx, arena, sizeof *fresh);
+		fresh->first_block = NULL;
+		fresh->last_block = NULL;
+		fresh->mediabox = mediabox;
+		fresh->pool = arena;
+	}
+	fz_catch(ctx)
+	{
+		fz_drop_pool(ctx, arena);
+		fz_rethrow(ctx);
+	}
+
+	return fresh;
+}
+
+/*
+ * Release the references held by a chain of blocks, following the
+ * 'next' links from block. The blocks themselves are not freed here.
+ *
+ *  - image blocks drop their image,
+ *  - text blocks drop the font reference held by every char,
+ *  - struct blocks recurse into the blocks nested beneath them.
+ *
+ * A struct block whose 'down' pointer is NULL is skipped rather than
+ * dereferenced, matching the check made by check_for_fake_bold().
+ */
+static void
+drop_run(fz_context *ctx, fz_stext_block *block)
+{
+	fz_stext_line *line;
+	fz_stext_char *ch;
+	while (block)
+	{
+		switch (block->type)
+		{
+		case FZ_STEXT_BLOCK_IMAGE:
+			fz_drop_image(ctx, block->u.i.image);
+			break;
+		case FZ_STEXT_BLOCK_TEXT:
+			for (line = block->u.t.first_line; line; line = line->next)
+				for (ch = line->first_char; ch; ch = ch->next)
+					fz_drop_font(ctx, ch->font);
+			break;
+		case FZ_STEXT_BLOCK_STRUCT:
+			/* Nothing hangs beneath a struct block without a 'down'. */
+			if (block->u.s.down != NULL)
+				drop_run(ctx, block->u.s.down->first_block);
+			break;
+		default:
+			break;
+		}
+		block = block->next;
+	}
+}
+
+/*
+ * Destroy a structured text page: release the image and font
+ * references held by its blocks, then drop the pool that everything
+ * was allocated from. Passing NULL is allowed and does nothing.
+ */
+void
+fz_drop_stext_page(fz_context *ctx, fz_stext_page *page)
+{
+	if (page == NULL)
+		return;
+
+	drop_run(ctx, page->first_block);
+	fz_drop_pool(ctx, page->pool);
+}
+
+/*
+ * Allocate a new block and append it at the current insertion point:
+ * the end of the structure being built (page->last_struct) if there
+ * is one, otherwise the end of the page's own block list.
+ *
+ * This must not be used for adding 'struct' blocks; those are added
+ * internally with more complicated pointer setup.
+ */
+static fz_stext_block *
+add_block_to_page(fz_context *ctx, fz_stext_page *page)
+{
+	fz_stext_block *fresh = fz_pool_alloc(ctx, page->pool, sizeof *fresh);
+
+	fresh->bbox = fz_empty_rect; /* Fixes bug 703267. */
+	fresh->prev = page->last_block;
+
+	if (page->last_struct != NULL)
+	{
+		if (page->last_struct->last_block == NULL)
+		{
+			/* First block inside this structure. */
+			page->last_struct->first_block = fresh;
+			page->last_struct->last_block = fresh;
+			return fresh;
+		}
+
+		/* Append after the structure's current last block. */
+		fresh->prev = page->last_struct->last_block;
+		fresh->prev->next = fresh;
+		page->last_struct->last_block = fresh;
+		return fresh;
+	}
+
+	if (page->last_block != NULL)
+	{
+		/* Append after the page's current last block. */
+		page->last_block->next = fresh;
+		page->last_block = fresh;
+		return fresh;
+	}
+
+	/* The page has no last block yet. */
+	page->last_block = fresh;
+	if (page->first_block == NULL)
+		page->first_block = fresh;
+	return fresh;
+}
+
+/* Append a new block to the page and mark it as a text block. */
+static fz_stext_block *
+add_text_block_to_page(fz_context *ctx, fz_stext_page *page)
+{
+	fz_stext_block *text_block = add_block_to_page(ctx, page);
+
+	text_block->type = FZ_STEXT_BLOCK_TEXT;
+
+	return text_block;
+}
+
+/*
+ * Append a new image block to the page. The block takes its own
+ * reference to the image, remembers the transform, and gets the
+ * unit rectangle transformed by ctm as its bounding box.
+ */
+static fz_stext_block *
+add_image_block_to_page(fz_context *ctx, fz_stext_page *page, fz_matrix ctm, fz_image *image)
+{
+	fz_stext_block *img_block = add_block_to_page(ctx, page);
+
+	img_block->type = FZ_STEXT_BLOCK_IMAGE;
+	img_block->bbox = fz_transform_rect(fz_unit_rect, ctm);
+	img_block->u.i.transform = ctm;
+	img_block->u.i.image = fz_keep_image(ctx, image);
+
+	return img_block;
+}
+
+/*
+ * Allocate a new line and append it to the end of a text block.
+ * The line records its direction (copied from *dir) and writing
+ * mode. The bidi argument is accepted but not used here.
+ */
+static fz_stext_line *
+add_line_to_block(fz_context *ctx, fz_stext_page *page, fz_stext_block *block, const fz_point *dir, int wmode, int bidi)
+{
+	fz_stext_line *fresh = fz_pool_alloc(ctx, page->pool, sizeof *fresh);
+	fz_stext_line *tail = block->u.t.last_line;
+
+	fresh->wmode = wmode;
+	fresh->dir = *dir;
+	fresh->prev = tail;
+
+	/* Link after the old last line, or become the first line. */
+	if (block->u.t.first_line != NULL)
+		tail->next = fresh;
+	else
+		block->u.t.first_line = fresh;
+	block->u.t.last_line = fresh;
+
+	return fresh;
+}
+
+/* Sentinel values passed in place of a glyph id to add_char_to_line(). */
+#define NON_ACCURATE_GLYPH_ADDED_SPACE (-2) /* a synthetic space, added while in accurate bbox mode */
+#define NON_ACCURATE_GLYPH (-1) /* not in accurate bbox mode: use the font's ascender/descender */
+
+/*
+ * Allocate a new char, append it to the end of line, and fill it in.
+ *
+ * The char takes its own reference to font. Its quad is built from
+ * the start point *p and end point *q, each offset by an 'ascender'
+ * vector (a) and a 'descender' vector (d) that are transformed by trm.
+ *
+ *   c         - the unicode value to store.
+ *   glyph     - a glyph id (its bounds are used for the vertical
+ *               extent), or one of the NON_ACCURATE_* sentinels above.
+ *   color     - stored as the char's argb value.
+ *   synthetic - if non-zero, FZ_STEXT_SYNTHETIC is added to the flags.
+ *   flags     - char flags; FZ_STEXT_BOLD is added if the font is bold.
+ *   dev_flags - device option bits; only
+ *               FZ_STEXT_ACCURATE_SIDE_BEARINGS is examined here.
+ *
+ * Returns the new char.
+ */
+static fz_stext_char *
+add_char_to_line(fz_context *ctx, fz_stext_page *page, fz_stext_line *line, fz_matrix trm, fz_font *font, float size, int c, int glyph, fz_point *p, fz_point *q, int bidi, int color, int synthetic, int flags, int dev_flags)
+{
+	fz_stext_char *ch = fz_pool_alloc(ctx, page->pool, sizeof *line->first_char);
+	fz_point a, d;
+
+	if (!line->first_char)
+		line->first_char = line->last_char = ch;
+	else
+	{
+		line->last_char->next = ch;
+		line->last_char = ch;
+	}
+
+	ch->c = c;
+	ch->argb = color;
+	ch->bidi = bidi;
+	ch->origin = *p;
+	ch->size = size;
+	ch->font = fz_keep_font(ctx, font);
+	ch->flags = flags | (synthetic ? FZ_STEXT_SYNTHETIC : 0);
+	if (font->flags.is_bold)
+		ch->flags |= FZ_STEXT_BOLD;
+
+	if (line->wmode == 0)
+	{
+		fz_rect bounds;
+		int bounded = 0;
+		a.x = 0;
+		d.x = 0;
+		if (glyph == NON_ACCURATE_GLYPH_ADDED_SPACE)
+		{
+			/* Added space, in accurate mode. */
+			a.y = d.y = 0;
+		}
+		else if (glyph == NON_ACCURATE_GLYPH)
+		{
+			/* Non accurate mode. */
+			a.y = fz_font_ascender(ctx, font);
+			d.y = fz_font_descender(ctx, font);
+		}
+		else
+		{
+			/* Any glyph in accurate mode */
+			bounds = fz_bound_glyph(ctx, font, glyph, fz_identity);
+			bounded = 1;
+			a.y = bounds.y1;
+			d.y = bounds.y0;
+		}
+		if (dev_flags & FZ_STEXT_ACCURATE_SIDE_BEARINGS)
+		{
+			/* NOTE(review): when we get here without having bounded the
+			 * glyph above, 'glyph' is one of the negative sentinels and
+			 * is passed to fz_bound_glyph as is - confirm that is OK. */
+			if (!bounded)
+				bounds = fz_bound_glyph(ctx, font, glyph, fz_identity);
+			if (a.x > bounds.x0)
+				a.x = bounds.x0;
+			/* NOTE(review): this compares and assigns the y component
+			 * of d using the x extent of the glyph, which looks like a
+			 * mix up of axes - confirm the intended behaviour. */
+			if (d.y < bounds.x1)
+				d.y = bounds.x1;
+		}
+	}
+	else
+	{
+		a.x = 1;
+		d.x = 0;
+		a.y = 0;
+		d.y = 0;
+	}
+	a = fz_transform_vector(a, trm);
+	d = fz_transform_vector(d, trm);
+
+	/* p is the start of the char and q the end; d and a offset each
+	 * of them to give the lower and upper corners respectively. */
+	ch->quad.ll = fz_make_point(p->x + d.x, p->y + d.y);
+	ch->quad.ul = fz_make_point(p->x + a.x, p->y + a.y);
+	ch->quad.lr = fz_make_point(q->x + d.x, q->y + d.y);
+	ch->quad.ur = fz_make_point(q->x + a.x, q->y + a.y);
+
+	return ch;
+}
+
+/*
+ * Unlink the final char of a line and drop its font reference.
+ *
+ * Nothing happens if line is NULL or has no chars. A line holding
+ * just a single char is also left untouched.
+ */
+static void
+remove_last_char(fz_context *ctx, fz_stext_line *line)
+{
+	fz_stext_char *before_last = NULL;
+	fz_stext_char *last;
+
+	if (line == NULL || line->first_char == NULL)
+		return;
+
+	/* Walk to the final char, remembering the one in front of it. */
+	for (last = line->first_char; last->next != NULL; last = last->next)
+		before_last = last;
+
+	if (before_last == NULL)
+		return;
+
+	/* The chars are pool allocated, so unlinking the node does not leak
+	 * it; the font reference it holds does have to be dropped though. */
+	fz_drop_font(ctx, last->font);
+	before_last->next = NULL;
+	line->last_char = before_last;
+}
+
+/*
+ * Reverse, in place, the run of chars starting at curr and stopping
+ * just before tail. Returns the new first char of the run; the new
+ * last char of the run is left pointing at tail.
+ */
+static fz_stext_char *reverse_bidi_span(fz_stext_char *curr, fz_stext_char *tail)
+{
+	fz_stext_char *reversed_head = tail;
+
+	while (curr != tail)
+	{
+		fz_stext_char *rest = curr->next;
+
+		/* Push curr onto the front of the reversed run. */
+		curr->next = reversed_head;
+		reversed_head = curr;
+		curr = rest;
+	}
+
+	return reversed_head;
+}
+
+/*
+ * Reverse every run of two or more consecutive chars with a non-zero
+ * bidi value within a line, and leave line->last_char pointing at the
+ * final char of the line afterwards.
+ */
+static void reverse_bidi_line(fz_stext_line *line)
+{
+	/* 'link' is the pointer through which the current char is reached. */
+	fz_stext_char **link = &line->first_char;
+	fz_stext_char *run_start = line->first_char;
+
+	while (run_start != NULL)
+	{
+		if (run_start->bidi)
+		{
+			fz_stext_char *run_end = run_start;
+
+			while (run_end->next != NULL && run_end->next->bidi)
+				run_end = run_end->next;
+
+			if (run_end != run_start)
+				*link = reverse_bidi_span(run_start, run_end->next);
+		}
+
+		/* After a reversal run_start is the last char of its run, so
+		 * stepping on from it continues with the char after the run. */
+		line->last_char = run_start;
+		link = &run_start->next;
+		run_start = run_start->next;
+	}
+}
+
+/* Return 1 if c is one of the characters we treat as a hyphen,
+ * otherwise 0. */
+static int is_hyphen(int c)
+{
+	switch (c)
+	{
+	case '-':    /* hyphen-minus */
+	case 0xAD:   /* soft hyphen */
+	case 0x2010: /* hyphen */
+	case 0x2011: /* non-breaking hyphen */
+		return 1;
+	default:
+		return 0;
+	}
+}
+
+/* Return the dot product of the two vectors *a and *b. */
+static float
+vec_dot(const fz_point *a, const fz_point *b)
+{
+	const float ax = a->x, ay = a->y;
+	const float bx = b->x, by = b->y;
+
+	return ax * bx + ay * by;
+}
+
+/*
+ * Decide whether a synthetic space may be inserted after lastchar.
+ *
+ * Never after a space. Otherwise only after characters below U+0700
+ * (basic latin, greek, cyrillic, hebrew, arabic) or in the range
+ * U+2000..U+20CF (general punctuation, superscripts and subscripts,
+ * and currency symbols).
+ */
+static int may_add_space(int lastchar)
+{
+	if (lastchar == ' ')
+		return 0;
+
+	if (lastchar < 0x700)
+		return 1;
+
+	return lastchar >= 0x2000 && lastchar <= 0x20CF;
+}
+
+/* Two coordinates count as 'close' when they differ by less than
+ * size / FAKEBOLD_THRESHOLD_RECIP. */
+#define FAKEBOLD_THRESHOLD_RECIP 10
+
+/* Return non-zero if a and b are close to each other, relative to
+ * the given text size. */
+static int
+close(float a, float b, float size)
+{
+	float gap = a - b;
+
+	if (gap < 0)
+		gap = -gap;
+
+	return FAKEBOLD_THRESHOLD_RECIP * gap < size;
+}
+
+/*
+ * Decide whether two fonts should be considered equivalent: either
+ * they are the very same font object, or they have the same name
+ * and the same digest. Returns 1 if so, 0 otherwise.
+ */
+static int
+font_equiv(fz_context *ctx, fz_font *f, fz_font *g)
+{
+	unsigned char digest_f[16];
+	unsigned char digest_g[16];
+
+	if (f != g)
+	{
+		/* Cheap check first: differently named fonts never match. */
+		if (strcmp(f->name, g->name) != 0)
+			return 0;
+
+		fz_font_digest(ctx, f, digest_f);
+		fz_font_digest(ctx, g, digest_g);
+
+		return memcmp(digest_f, digest_g, sizeof digest_f) == 0;
+	}
+
+	return 1;
+}
+
+/*
+ * Look through the blocks starting at block (descending into struct
+ * blocks) for a char that the new one - character c of the given
+ * font and size at position p - has been drawn on top of.
+ *
+ * A previous char matches if it has the same unicode value, an
+ * origin close to p in both x and y, and an equivalent font. On a
+ * match the new flags are merged into the existing char, and then:
+ *
+ *  - if the existing char was filled only and the new one is stroked
+ *    only, we return 1 without marking anything as bold;
+ *  - if c is a space, the existing char is only marked bold (and 1
+ *    returned) if the char before or after it is bold; otherwise we
+ *    carry on searching;
+ *  - otherwise the existing char is marked bold and we return 1.
+ *
+ * Returns 0 if nothing was matched in this way.
+ */
+static int
+check_for_fake_bold(fz_context *ctx, fz_stext_block *block, fz_font *font, int c, fz_point p, float size, int flags)
+{
+	const int paint_mask = FZ_STEXT_FILLED | FZ_STEXT_STROKED;
+	fz_stext_line *line;
+	fz_stext_char *ch;
+
+	for (; block != NULL; block = block->next)
+	{
+		if (block->type == FZ_STEXT_BLOCK_STRUCT)
+		{
+			if (block->u.s.down != NULL && check_for_fake_bold(ctx, block->u.s.down->first_block, font, c, p, size, flags))
+				return 1;
+			continue;
+		}
+
+		if (block->type != FZ_STEXT_BLOCK_TEXT)
+			continue;
+
+		for (line = block->u.t.first_line; line != NULL; line = line->next)
+		{
+			/* The char before ch on this line, if any. */
+			fz_stext_char *pr = NULL;
+
+			for (ch = line->first_char; ch != NULL; pr = ch, ch = ch->next)
+			{
+				int was_fill_only;
+
+				/* Not perfect, but it'll do! */
+				if (ch->c != c)
+					continue;
+				if (!close(ch->origin.x, p.x, size))
+					continue;
+				if (!close(ch->origin.y, p.y, size))
+					continue;
+				if (!font_equiv(ctx, ch->font, font))
+					continue;
+
+				/* Sample the old paint state before merging in the new flags. */
+				was_fill_only = (ch->flags & paint_mask) == FZ_STEXT_FILLED;
+				ch->flags |= flags;
+
+				/* Filled before and stroked now: this becomes filled + stroked,
+				 * but is not specifically marked as fake bold. */
+				if (was_fill_only && (flags & paint_mask) == FZ_STEXT_STROKED)
+					return 1;
+
+				if (c == ' ')
+				{
+					/* Overlaying spaces is tricksy, as they make no marks.
+					 * Only accept these as boldening if a neighbouring
+					 * char was boldened too. */
+					int prev_bold = pr != NULL && (pr->flags & FZ_STEXT_BOLD) != 0;
+
+					if (!prev_bold && !(ch->next && (ch->next->flags & FZ_STEXT_BOLD) != 0))
+						continue; /* Ignore this and keep going */
+				}
+
+				ch->flags |= FZ_STEXT_BOLD;
+				return 1;
+			}
+		}
+	}
+
+	return 0;
+}
+
+/*
+ * Add one character to the page, deciding from its position relative
+ * to the previous character whether it continues the current line,
+ * starts a new line, or starts a new block.
+ *
+ *   font  - font for the character.
+ *   c     - unicode value to store.
+ *   glyph - glyph id; a negative value means a character with no
+ *           glyph of its own (e.g. the later parts of an expanded
+ *           ligature), which is added at the pen without moving it.
+ *   trm   - matrix for the glyph; its e,f components are the origin.
+ *   adv   - advance of the glyph along the writing direction.
+ *   wmode - 0 for horizontal writing, otherwise vertical.
+ *   bidi  - bidi level; only the bottom bit (RTL-ness) is kept.
+ *   force_new_line - if non-zero, always start a new line.
+ *   flags - char flags passed on to the chars created.
+ *
+ * When styles are being collected and the glyph turns out to overprint
+ * an earlier identical one, no char is added and only the pen state is
+ * updated.
+ */
+static void
+fz_add_stext_char_imp(fz_context *ctx, fz_stext_device *dev, fz_font *font, int c, int glyph, fz_matrix trm, float adv, int wmode, int bidi, int force_new_line, int flags)
+{
+	fz_stext_page *page = dev->page;
+	fz_stext_block *cur_block;
+	fz_stext_line *cur_line;
+
+	int new_para = 0;
+	int new_line = 1;
+	int add_space = 0;
+	fz_point dir, ndir, p, q;
+	float size;
+	fz_point delta;
+	float spacing = 0;
+	float base_offset = 0;
+	float dist;
+
+	/* Preserve RTL-ness only (and ignore level) so we can use bit 2 as "visual" tag for reordering pass. */
+	bidi = bidi & 1;
+
+	/* dir = direction vector for motion. ndir = normalised(dir) */
+	if (wmode == 0)
+	{
+		dir.x = 1;
+		dir.y = 0;
+	}
+	else
+	{
+		dir.x = 0;
+		dir.y = -1;
+	}
+	dir = fz_transform_vector(dir, trm);
+	ndir = fz_normalize_vector(dir);
+
+	size = fz_matrix_expansion(trm);
+
+	/* We need to identify where glyphs 'start' (p) and 'stop' (q).
+	 * Each glyph holds its 'start' position, and the next glyph in the
+	 * span (or span->max if there is no next glyph) holds its 'end'
+	 * position.
+	 *
+	 * For both horizontal and vertical motion, trm->{e,f} gives the
+	 * origin (usually the bottom left) of the glyph.
+	 *
+	 * In horizontal mode:
+	 *   + p is bottom left.
+	 *   + q is the bottom right
+	 * In vertical mode:
+	 *   + p is top left (where it advanced from)
+	 *   + q is bottom left
+	 */
+	if (wmode == 0)
+	{
+		p.x = trm.e;
+		p.y = trm.f;
+		q.x = trm.e + adv * dir.x;
+		q.y = trm.f + adv * dir.y;
+	}
+	else
+	{
+		p.x = trm.e - adv * dir.x;
+		p.y = trm.f - adv * dir.y;
+		q.x = trm.e;
+		q.y = trm.f;
+	}
+
+	if ((dev->opts.flags & FZ_STEXT_COLLECT_STYLES) != 0)
+	{
+		if (glyph == -1)
+		{
+			/* A glyphless follow-on character shares the fate of the
+			 * glyph before it: if that was swallowed, so is this. */
+			if (dev->last_was_fake_bold)
+				goto move_pen_and_exit;
+		}
+		else if (check_for_fake_bold(ctx, page->first_block, font, c, p, size, flags))
+		{
+			dev->last_was_fake_bold = 1;
+			goto move_pen_and_exit;
+		}
+		dev->last_was_fake_bold = 0;
+	}
+
+	/* Find current position to enter new text. */
+	cur_block = page->last_struct ? page->last_struct->last_block : page->last_block;
+	if (cur_block && cur_block->type != FZ_STEXT_BLOCK_TEXT)
+		cur_block = NULL;
+	cur_line = cur_block ? cur_block->u.t.last_line : NULL;
+
+	if (cur_line && glyph < 0)
+	{
+		/* Don't advance pen or break lines for no-glyph characters in a cluster */
+		add_char_to_line(ctx, page, cur_line, trm, font, size, c, (dev->flags & FZ_STEXT_ACCURATE_BBOXES) ? glyph : NON_ACCURATE_GLYPH, &dev->pen, &dev->pen, bidi, dev->color, 0, flags, dev->flags);
+		dev->lastbidi = bidi;
+		dev->lastchar = c;
+		return;
+	}
+
+	if (cur_line == NULL || cur_line->wmode != wmode || vec_dot(&ndir, &cur_line->dir) < 0.999f)
+	{
+		/* If the matrix has changed rotation, or the wmode is different (or if we don't have a line at all),
+		 * then we can't append to the current block/line. */
+		new_para = 1;
+		new_line = 1;
+	}
+	else
+	{
+		/* Detect fake bold where text is printed twice in the same place. */
+		/* Largely supplanted by the check_for_fake_bold mechanism above,
+		 * but we leave this in for backward compatibility as it's cheap,
+		 * and works even when FZ_STEXT_COLLECT_STYLES is not set. */
+		dist = hypotf(q.x - dev->pen.x, q.y - dev->pen.y) / size;
+		/* Note that this returns without touching any of the pen state. */
+		if (dist < FAKE_BOLD_MAX_DIST && c == dev->lastchar)
+			return;
+
+		/* Calculate how far we've moved since the last character. */
+		delta.x = p.x - dev->pen.x;
+		delta.y = p.y - dev->pen.y;
+
+		/* The transform has not changed, so we know we're in the same
+		 * direction. Calculate 2 distances; how far off the previous
+		 * baseline we are, together with how far along the baseline
+		 * we are from the expected position. */
+		spacing = (ndir.x * delta.x + ndir.y * delta.y) / size;
+		base_offset = (-ndir.y * delta.x + ndir.x * delta.y) / size;
+
+		/* Only a small amount off the baseline - we'll take this */
+		if (fabsf(base_offset) < BASE_MAX_DIST)
+		{
+			/* If mixed LTR and RTL content */
+			if ((bidi & 1) != (dev->lastbidi & 1))
+			{
+				/* Ignore jumps within line when switching between LTR and RTL text. */
+				new_line = 0;
+			}
+
+			/* RTL */
+			else if (bidi & 1)
+			{
+				fz_point logical_delta = fz_make_point(p.x - dev->lag_pen.x, p.y - dev->lag_pen.y);
+				float logical_spacing = (ndir.x * logical_delta.x + ndir.y * logical_delta.y) / size + adv;
+
+				/* If the pen is where we would have been if we
+				 * had advanced backwards from the previous
+				 * character by this character's advance, we
+				 * are probably seeing characters emitted in
+				 * logical order.
+				 */
+				if (fabsf(logical_spacing) < SPACE_DIST)
+				{
+					new_line = 0;
+				}
+
+				/* However, if the pen has advanced to where we would expect it
+				 * in an LTR context, we're seeing them emitted in visual order
+				 * and should flag them for reordering!
+				 */
+				else if (fabsf(spacing) < SPACE_DIST)
+				{
+					bidi = 3; /* mark line as visual */
+					new_line = 0;
+				}
+
+				/* And any other small jump could be a missing space. */
+				else if (logical_spacing < 0 && logical_spacing > -SPACE_MAX_DIST)
+				{
+					if (wmode == 0 && may_add_space(dev->lastchar))
+						add_space = 1;
+					new_line = 0;
+				}
+				else if (spacing < 0 && spacing > -SPACE_MAX_DIST)
+				{
+					/* Motion is in line, but negative. We've probably got overlapping
+					 * chars here. Live with it. */
+					new_line = 0;
+				}
+				else if (spacing > 0 && spacing < SPACE_MAX_DIST)
+				{
+					bidi = 3; /* mark line as visual */
+					if (wmode == 0 && may_add_space(dev->lastchar))
+						add_space = 1;
+					new_line = 0;
+				}
+
+				else
+				{
+					/* Motion is large and unexpected (probably a new table column). */
+					new_line = 1;
+				}
+			}
+
+			/* LTR or neutral character */
+			else
+			{
+				if (fabsf(spacing) < SPACE_DIST)
+				{
+					/* Motion is in line and small enough to ignore. */
+					new_line = 0;
+				}
+				else if (spacing < 0 && spacing > -SPACE_MAX_DIST)
+				{
+					/* Motion is in line, but negative. We've probably got overlapping
+					 * chars here. Live with it. */
+					new_line = 0;
+				}
+				else if (spacing > 0 && spacing < SPACE_MAX_DIST)
+				{
+					/* Motion is forward in line and large enough to warrant us adding a space. */
+					if (wmode == 0 && may_add_space(dev->lastchar))
+						add_space = 1;
+					new_line = 0;
+				}
+				else
+				{
+					/* Motion is large and unexpected (probably a new table column). */
+					new_line = 1;
+				}
+			}
+		}
+
+		/* Enough for a new line, but not enough for a new paragraph */
+		else if (fabsf(base_offset) <= PARAGRAPH_DIST)
+		{
+			/* Check indent to spot text-indent style paragraphs */
+			if (wmode == 0 && cur_line && dev->new_obj)
+				if ((p.x - dev->start.x) > 0.5f)
+					new_para = 1;
+			new_line = 1;
+		}
+
+		/* Way off the baseline - open a new paragraph */
+		else
+		{
+			new_para = 1;
+			new_line = 1;
+		}
+	}
+
+	/* Start a new block (but only at the beginning of a text object) */
+	if (new_para || !cur_block)
+	{
+		cur_block = add_text_block_to_page(ctx, page);
+		cur_line = cur_block->u.t.last_line;
+	}
+
+	/* When dehyphenating, a line break straight after a hyphen is undone:
+	 * the hyphen is removed and the text carries on along the same line. */
+	if (new_line && (dev->flags & FZ_STEXT_DEHYPHENATE) && is_hyphen(dev->lastchar))
+	{
+		remove_last_char(ctx, cur_line);
+		new_line = 0;
+	}
+
+	/* Start a new line */
+	if (new_line || !cur_line || force_new_line)
+	{
+		cur_line = add_line_to_block(ctx, page, cur_block, &ndir, wmode, bidi);
+		dev->start = p;
+	}
+
+	/* Add synthetic space */
+	if (add_space && !(dev->flags & FZ_STEXT_INHIBIT_SPACES))
+		add_char_to_line(ctx, page, cur_line, trm, font, size, ' ', (dev->flags & FZ_STEXT_ACCURATE_BBOXES) ? NON_ACCURATE_GLYPH_ADDED_SPACE : NON_ACCURATE_GLYPH, &dev->pen, &p, bidi, dev->color, 1, flags, dev->flags);
+
+	add_char_to_line(ctx, page, cur_line, trm, font, size, c, (dev->flags & FZ_STEXT_ACCURATE_BBOXES) ? glyph : NON_ACCURATE_GLYPH, &p, &q, bidi, dev->color, 0, flags, dev->flags);
+
+move_pen_and_exit:
+	/* Remember where this character started (lag_pen) and ended (pen),
+	 * ready for positioning the next one. */
+	dev->lastchar = c;
+	dev->lastbidi = bidi;
+	dev->lag_pen = p;
+	dev->pen = q;
+
+	dev->new_obj = 0;
+	dev->trm = trm;
+}
+
+/*
+ * Add a character to the page, after applying the character level
+ * options:
+ *
+ *  - a unicode value of -1 (one unicode character mapped to multiple
+ *    glyphs) is ignored;
+ *  - with FZ_STEXT_ACCURATE_ASCENDERS the font's ascender/descender
+ *    are calculated first;
+ *  - unless FZ_STEXT_PRESERVE_LIGATURES is set, the latin ligatures
+ *    U+FB00..U+FB06 are expanded into their constituent letters. The
+ *    first letter carries the glyph, advance and force_new_line; the
+ *    rest are sent with no glyph, no advance and no forced line break;
+ *  - unless FZ_STEXT_PRESERVE_WHITESPACE is set, the various unicode
+ *    space characters (and tab) are turned into a plain space.
+ */
+static void
+fz_add_stext_char(fz_context *ctx,
+	fz_stext_device *dev,
+	fz_font *font,
+	int c,
+	int glyph,
+	fz_matrix trm,
+	float adv,
+	int wmode,
+	int bidi,
+	int force_new_line,
+	int flags)
+{
+	if (c == -1)
+		return;
+
+	if (dev->flags & FZ_STEXT_ACCURATE_ASCENDERS)
+		fz_calculate_font_ascender_descender(ctx, font);
+
+	if (!(dev->flags & FZ_STEXT_PRESERVE_LIGATURES))
+	{
+		const char *letters = NULL;
+
+		switch (c)
+		{
+		case 0xFB00: letters = "ff"; break;
+		case 0xFB01: letters = "fi"; break;
+		case 0xFB02: letters = "fl"; break;
+		case 0xFB03: letters = "ffi"; break;
+		case 0xFB04: letters = "ffl"; break;
+		case 0xFB05: /* long st */
+		case 0xFB06: letters = "st"; break;
+		default: break;
+		}
+
+		if (letters != NULL)
+		{
+			/* The first letter stands in for the ligature glyph itself... */
+			fz_add_stext_char_imp(ctx, dev, font, letters[0], glyph, trm, adv, wmode, bidi, force_new_line, flags);
+			/* ...and the others follow as glyphless characters. */
+			for (letters++; *letters != 0; letters++)
+				fz_add_stext_char_imp(ctx, dev, font, *letters, -1, trm, 0, wmode, bidi, 0, flags);
+			return;
+		}
+	}
+
+	if (!(dev->flags & FZ_STEXT_PRESERVE_WHITESPACE))
+	{
+		/* tab, space, no-break space, ogham space mark, mongolian
+		 * vowel separator, en quad .. hair space, narrow no-break
+		 * space, medium mathematical space, ideographic space. */
+		if (c == 0x0009 || c == 0x0020 || c == 0x00A0 ||
+			c == 0x1680 || c == 0x180E ||
+			(c >= 0x2000 && c <= 0x200A) ||
+			c == 0x202F || c == 0x205F || c == 0x3000)
+			c = ' ';
+	}
+
+	fz_add_stext_char_imp(ctx, dev, font, c, glyph, trm, adv, wmode, bidi, force_new_line, flags);
+}
+
+/*
+ * Work out the rectangle that text is currently being clipped to.
+ *
+ * Starts from the infinite rectangle. With FZ_STEXT_CLIP this becomes
+ * the device's current scissor intersected with the page mediabox;
+ * with FZ_STEXT_CLIP_RECT the result is further intersected with the
+ * clip rectangle given in the options.
+ */
+static fz_rect
+current_clip(fz_context *ctx, fz_stext_device *dev)
+{
+	fz_rect clip = fz_infinite_rect;
+
+	if (dev->flags & FZ_STEXT_CLIP)
+		clip = fz_intersect_rect(fz_device_current_scissor(ctx, &dev->super), dev->page->mediabox);
+
+	if (dev->flags & FZ_STEXT_CLIP_RECT)
+		clip = fz_intersect_rect(clip, dev->opts.clip);
+
+	return clip;
+}
+
+/*
+ * Send items [start, end) of a text span to the page.
+ *
+ * For each item the 'last seen' state in dev->last (matrix, bidi
+ * level, wmode, font, flags) is updated first, so that it is current
+ * even for items that are then skipped for being entirely outside the
+ * clip. Items without a glyph (gid < 0) are sent with a zero advance.
+ *
+ * An item whose unicode value is the replacement character can
+ * optionally be sent with its CID or GID instead; such a character is
+ * marked with FZ_STEXT_UNICODE_IS_CID / FZ_STEXT_UNICODE_IS_GID. That
+ * marker is applied to that one character only: it must not leak into
+ * the flags used for the remaining items of the span.
+ *
+ *   ctm   - matrix applied on top of the span's own matrix.
+ *   flags - char flags common to all the items sent.
+ */
+static void
+do_extract(fz_context *ctx, fz_stext_device *dev, fz_text_span *span, fz_matrix ctm, int start, int end, int flags)
+{
+	fz_font *font = span->font;
+	fz_matrix tm = span->trm;
+	float adv;
+	int unicode;
+	int char_flags;
+	int i;
+
+	for (i = start; i < end; i++)
+	{
+		/* Calculate new pen location and delta */
+		tm.e = span->items[i].x;
+		tm.f = span->items[i].y;
+		dev->last.trm = fz_concat(tm, ctm);
+		dev->last.bidi_level = span->bidi_level;
+		dev->last.wmode = span->wmode;
+		if (font != dev->last.font)
+		{
+			fz_drop_font(ctx, dev->last.font);
+			dev->last.font = fz_keep_font(ctx, font);
+		}
+		dev->last.valid = 1;
+		dev->last.flags = flags;
+
+		if (dev->flags & (FZ_STEXT_CLIP | FZ_STEXT_CLIP_RECT))
+		{
+			fz_rect r = current_clip(ctx, dev);
+			if (fz_glyph_entirely_outside_box(ctx, &ctm, span, &span->items[i], &r))
+			{
+				dev->last.clipped = 1;
+				continue;
+			}
+		}
+		dev->last.clipped = 0;
+
+		/* Calculate bounding box and new pen position based on font metrics */
+		if (span->items[i].gid >= 0)
+			adv = span->items[i].adv;
+		else
+			adv = 0;
+
+		/* Start from the common flags for every item, so that a CID/GID
+		 * marker set for one item does not stick to the ones after it. */
+		char_flags = flags;
+		unicode = span->items[i].ucs;
+		if (unicode == FZ_REPLACEMENT_CHARACTER)
+		{
+			if (dev->flags & FZ_STEXT_USE_CID_FOR_UNKNOWN_UNICODE)
+			{
+				unicode = span->items[i].cid;
+				char_flags |= FZ_STEXT_UNICODE_IS_CID;
+			}
+			else if (dev->flags & FZ_STEXT_USE_GID_FOR_UNKNOWN_UNICODE)
+			{
+				unicode = span->items[i].gid;
+				char_flags |= FZ_STEXT_UNICODE_IS_GID;
+			}
+		}
+
+		/* Send the chars we have through. */
+		fz_add_stext_char(ctx, dev, font,
+			unicode,
+			span->items[i].gid,
+			dev->last.trm,
+			adv,
+			dev->last.wmode,
+			dev->last.bidi_level,
+			(i == 0) && (dev->flags & FZ_STEXT_PRESERVE_SPANS),
+			char_flags);
+	}
+}
+
+/*
+ * Return the rune at (zero based) position idx of a utf8 string, or
+ * -1 if the string ends at or before that position.
+ */
+static int
+rune_index(const char *utf8, size_t idx)
+{
+	size_t remaining = idx;
+
+	for (;;)
+	{
+		int rune;
+		int len = fz_chartorune(&rune, utf8);
+
+		if (rune == 0)
+			return -1;
+		if (remaining == 0)
+			return rune;
+
+		utf8 += len;
+		remaining--;
+	}
+}
+
+/*
+ * Send the runes of an actualtext string to the page as glyphless,
+ * zero advance characters, positioned and styled using the 'last
+ * seen' values held in dev->last.
+ *
+ * If clipping is enabled and the last item seen was clipped, the
+ * runes are consumed without being sent. i counts the characters sent
+ * so far; a new line can only be forced (with
+ * FZ_STEXT_PRESERVE_SPANS) while it is still 0.
+ */
+static void
+flush_actualtext(fz_context *ctx, fz_stext_device *dev, const char *actualtext, int i)
+{
+	if (*actualtext == 0)
+		return;
+
+	for (;;)
+	{
+		int rune;
+		int len = fz_chartorune(&rune, actualtext);
+
+		actualtext += len;
+		if (rune == 0)
+			return;
+
+		if ((dev->flags & (FZ_STEXT_CLIP | FZ_STEXT_CLIP_RECT)) && dev->last.clipped)
+			continue;
+
+		fz_add_stext_char(ctx, dev, dev->last.font,
+			rune,
+			-1,
+			dev->last.trm,
+			0,
+			dev->last.wmode,
+			dev->last.bidi_level,
+			(i == 0) && (dev->flags & FZ_STEXT_PRESERVE_SPANS),
+			dev->last.flags);
+		i++;
+	}
+}
+
+/*
+ * Emit the items of 'span' while the ActualText entry 'mt' is in force.
+ *
+ * A leading run and a trailing run of items that agree with mt->text are sent
+ * unchanged through do_extract. The items in between are sent one by one via
+ * fz_add_stext_char, each carrying the next rune read from mt->text (or -1
+ * once no rune is taken). mt->text is edited in place: it is either shortened
+ * to the part that has not been emitted yet, or emptied once a matching
+ * trailing run has been found and the remainder was flushed.
+ */
+static void
+do_extract_within_actualtext(fz_context *ctx, fz_stext_device *dev, fz_text_span *span, fz_matrix ctm, metatext_t *mt, int flags)
+{
+	/* We are within an actualtext block. This means we can't just add the chars
+	 * as they are. We need to add the chars as they are meant to be. Sadly the
+	 * actualtext mechanism doesn't help us at all with positioning. */
+	fz_font *font = span->font;
+	fz_matrix tm = span->trm;
+	float adv;
+	int start, i, end;
+	char *actualtext = mt->text;
+	/* z counts the runes (not bytes) of actualtext that are still unaccounted for. */
+	size_t z = fz_utflen(actualtext);
+
+	/* If actualtext is empty, nothing to do! */
+	if (z == 0)
+		return;
+
+	/* Now, we HOPE that the creator of a PDF will minimise the actual text
+	 * differences, so that we'll get:
+	 *   "Politicians <Actualtext="lie">fib</ActualText>, always."
+	 * rather than:
+	 *   "<Actualtext="Politicians lie, always">Politicians fib, always.</ActualText>
+	 * but experience with PDF files tells us that this won't always be the case.
+	 *
+	 * We try to minimise the actualtext section here, just in case.
+	 */
+
+	/* Spot a matching prefix and send it. */
+	for (start = 0; start < span->len; start++)
+	{
+		int rune;
+		int len = fz_chartorune(&rune, actualtext);
+		/* NOTE(review): the rune is compared with the item's gid field here
+		 * (and again in the postfix loop below) - confirm that gid, rather
+		 * than a unicode value, is the intended operand. */
+		if (span->items[start].gid != rune || rune == 0)
+			break;
+		actualtext += len; z--;
+	}
+	if (start != 0)
+		do_extract(ctx, dev, span, ctm, 0, start, flags);
+
+	if (start == span->len)
+	{
+		/* The prefix has consumed all this object. Just shorten the actualtext and we'll
+		 * catch the rest next time. */
+		/* From here on z is reused as a byte count that includes the terminator. */
+		z = strlen(actualtext)+1;
+		memmove(mt->text, actualtext, z);
+		return;
+	}
+
+	/* We haven't consumed the whole string, so there must be runes left.
+	 * Shut coverity up. */
+	assert(z != 0);
+
+	/* Spot a matching postfix. Can't send it til the end. */
+	for (end = span->len; end > start; end--)
+	{
+		/* Nasty n^2 algo here, cos backtracking through utf8 is not trivial. It'll do. */
+		int rune = rune_index(actualtext, z-1);
+		if (span->items[end-1].gid != rune)
+			break;
+		z--;
+	}
+	/* So we can send end -> span->len at the end. */
+
+	/* So we have at least SOME chars that don't match. */
+	/* Now, do the difficult bit in the middle.*/
+	/* items[start..end] have to be sent with actualtext[start..z] */
+	for (i = start; i < end; i++)
+	{
+		fz_text_item *item = &span->items[i];
+		int rune = -1;
+
+		/* NOTE(review): i starts at 'start', while z was already reduced by
+		 * the runes of the prefix (and postfix); confirm whether the intended
+		 * bound is (i - start) < z. */
+		if ((size_t)i < z)
+			actualtext += fz_chartorune(&rune, actualtext);
+
+		/* Calculate new pen location and delta */
+		tm.e = item->x;
+		tm.f = item->y;
+		/* Record this position in dev->last, even if the glyph turns out to be clipped below. */
+		dev->last.trm = fz_concat(tm, ctm);
+		dev->last.bidi_level = span->bidi_level;
+		dev->last.wmode = span->wmode;
+		if (font != dev->last.font)
+		{
+			fz_drop_font(ctx, dev->last.font);
+			dev->last.font = fz_keep_font(ctx, font);
+		}
+		dev->last.valid = 1;
+
+		if (dev->flags & (FZ_STEXT_CLIP | FZ_STEXT_CLIP_RECT))
+		{
+			fz_rect r = current_clip(ctx, dev);
+			if (fz_glyph_entirely_outside_box(ctx, &ctm, span, &span->items[i], &r))
+			{
+				/* Skipped glyph: its rune has already been taken from actualtext above. */
+				dev->last.clipped = 1;
+				continue;
+			}
+		}
+		dev->last.clipped = 0;
+
+		/* Calculate bounding box and new pen position based on font metrics */
+		if (item->gid >= 0)
+			adv = item->adv;
+		else
+			adv = 0;
+
+		fz_add_stext_char(ctx, dev, font,
+			rune,
+			span->items[i].gid,
+			dev->last.trm,
+			adv,
+			dev->last.wmode,
+			dev->last.bidi_level,
+			(i == 0) && (dev->flags & FZ_STEXT_PRESERVE_SPANS),
+			flags);
+	}
+
+	/* If we haven't spotted a postfix by this point, then don't force ourselves to output
+	 * any more of the actualtext at this point. We might get a new text object that matches
+	 * more of it. */
+	if (end == span->len)
+	{
+		/* Shorten actualtext and exit. */
+		z = strlen(actualtext)+1;
+		memmove(mt->text, actualtext, z);
+		return;
+	}
+
+	/* We found a matching postfix. It seems likely that this is going to be the only
+	 * text object we get, so send any remaining actualtext now. */
+	flush_actualtext(ctx, dev, actualtext, i);
+
+	/* Send the postfix */
+	/* (The end == span->len case returned above, so this test always holds here.) */
+	if (end != span->len)
+		do_extract(ctx, dev, span, ctm, end, span->len, flags);
+
+	/* Everything has been emitted: mark the ActualText as used up. */
+	mt->text[0] = 0;
+}
+
+/*
+ * Extract a single text span into the device.
+ *
+ * Empty spans are ignored. Unless FZ_STEXT_IGNORE_ACTUALTEXT is set in the
+ * device options, an ActualText entry found by find_actualtext takes over and
+ * the span is routed through do_extract_within_actualtext; otherwise every
+ * item of the span is passed to do_extract.
+ */
+static void
+fz_stext_extract(fz_context *ctx, fz_stext_device *dev, fz_text_span *span, fz_matrix ctm, int flags)
+{
+	metatext_t *actual = NULL;
+
+	/* Nothing to do for a span without items. */
+	if (span->len == 0)
+		return;
+
+	/* Only look for an enclosing ActualText when it has not been disabled. */
+	if ((dev->opts.flags & FZ_STEXT_IGNORE_ACTUALTEXT) == 0)
+		actual = find_actualtext(dev);
+
+	if (actual == NULL)
+	{
+		do_extract(ctx, dev, span, ctm, 0, span->len, flags);
+		return;
+	}
+
+	do_extract_within_actualtext(ctx, dev, span, ctm, actual, flags);
+}
+
+/*
+ * Convert 'color' (given in 'colorspace') and 'alpha' to a packed 32 bit
+ * value: alpha in bits 24-31, red in bits 16-23, green in bits 8-15 and blue
+ * in bits 0-7. Each component is scaled by 255, rounded by adding 0.5 and
+ * clamped to 0..255.
+ */
+static uint32_t hexrgba_from_color(fz_context *ctx, fz_colorspace *colorspace, const float *color, float alpha)
+{
+	float rgb[3];
+	uint32_t a, r, g, b;
+
+	fz_convert_color(ctx, colorspace, color, fz_device_rgb(ctx), rgb, NULL, fz_default_color_params);
+
+	/* Widen each component to uint32_t BEFORE shifting. fz_clampi yields an
+	 * int, and shifting an int holding 255 left by 24 does not fit in a
+	 * signed 32 bit int, which is undefined behaviour. */
+	a = (uint32_t) fz_clampi(alpha * 255 + 0.5f, 0, 255);
+	r = (uint32_t) fz_clampi(rgb[0] * 255 + 0.5f, 0, 255);
+	g = (uint32_t) fz_clampi(rgb[1] * 255 + 0.5f, 0, 255);
+	b = (uint32_t) fz_clampi(rgb[2] * 255 + 0.5f, 0, 255);
+
+	return (a << 24) | (r << 16) | (g << 8) | b;
+}
+
+/*
+ * Return non-zero when 'text' is the very object this device extracted last
+ * and FZ_STEXT_COLLECT_STYLES is not set in the device options. The text
+ * callbacks below return without doing anything in that case.
+ */
+static int
+stext_is_repeated_text(fz_stext_device *tdev, const fz_text *text)
+{
+	return text == tdev->lasttext && (tdev->opts.flags & FZ_STEXT_COLLECT_STYLES) == 0;
+}
+
+/*
+ * Shared body of the five text callbacks: record the colour, mark the start
+ * of a new object, extract every span of 'text' with the given char 'flags',
+ * and remember 'text' as the most recently extracted object.
+ */
+static void
+stext_extract_text(fz_context *ctx, fz_stext_device *tdev, const fz_text *text, fz_matrix ctm, uint32_t color, int flags)
+{
+	fz_text_span *span;
+
+	tdev->color = color;
+	tdev->new_obj = 1;
+	for (span = text->head; span; span = span->next)
+		fz_stext_extract(ctx, tdev, span, ctm, flags);
+	fz_drop_text(ctx, tdev->lasttext);
+	tdev->lasttext = fz_keep_text(ctx, text);
+}
+
+/* Device callback: filled text. Chars are flagged FZ_STEXT_FILLED and carry
+ * the fill colour packed by hexrgba_from_color. */
+static void
+fz_stext_fill_text(fz_context *ctx, fz_device *dev, const fz_text *text, fz_matrix ctm,
+	fz_colorspace *colorspace, const float *color, float alpha, fz_color_params color_params)
+{
+	fz_stext_device *tdev = (fz_stext_device*)dev;
+
+	/* Check for a repeat first so that no colour conversion is done for it. */
+	if (stext_is_repeated_text(tdev, text))
+		return;
+	stext_extract_text(ctx, tdev, text, ctm,
+		hexrgba_from_color(ctx, colorspace, color, alpha), FZ_STEXT_FILLED);
+}
+
+/* Device callback: stroked text. Chars are flagged FZ_STEXT_STROKED and carry
+ * the stroke colour packed by hexrgba_from_color. */
+static void
+fz_stext_stroke_text(fz_context *ctx, fz_device *dev, const fz_text *text, const fz_stroke_state *stroke, fz_matrix ctm,
+	fz_colorspace *colorspace, const float *color, float alpha, fz_color_params color_params)
+{
+	fz_stext_device *tdev = (fz_stext_device*)dev;
+
+	if (stext_is_repeated_text(tdev, text))
+		return;
+	stext_extract_text(ctx, tdev, text, ctm,
+		hexrgba_from_color(ctx, colorspace, color, alpha), FZ_STEXT_STROKED);
+}
+
+/* Device callback: text used as a clip. Colour is 0; chars are flagged
+ * FZ_STEXT_FILLED | FZ_STEXT_CLIPPED. */
+static void
+fz_stext_clip_text(fz_context *ctx, fz_device *dev, const fz_text *text, fz_matrix ctm, fz_rect scissor)
+{
+	fz_stext_device *tdev = (fz_stext_device*)dev;
+
+	if (stext_is_repeated_text(tdev, text))
+		return;
+	stext_extract_text(ctx, tdev, text, ctm, 0, FZ_STEXT_FILLED | FZ_STEXT_CLIPPED);
+}
+
+/* Device callback: stroked text used as a clip. Colour is 0; chars are
+ * flagged FZ_STEXT_STROKED | FZ_STEXT_CLIPPED. */
+static void
+fz_stext_clip_stroke_text(fz_context *ctx, fz_device *dev, const fz_text *text, const fz_stroke_state *stroke, fz_matrix ctm, fz_rect scissor)
+{
+	fz_stext_device *tdev = (fz_stext_device*)dev;
+
+	if (stext_is_repeated_text(tdev, text))
+		return;
+	stext_extract_text(ctx, tdev, text, ctm, 0, FZ_STEXT_STROKED | FZ_STEXT_CLIPPED);
+}
+
+/* Device callback: text that is neither filled nor stroked. Colour is 0 and
+ * no char flags are set. */
+static void
+fz_stext_ignore_text(fz_context *ctx, fz_device *dev, const fz_text *text, fz_matrix ctm)
+{
+	fz_stext_device *tdev = (fz_stext_device*)dev;
+
+	if (stext_is_repeated_text(tdev, text))
+		return;
+	stext_extract_text(ctx, tdev, text, ctm, 0, 0);
+}
+
+/*
+ * Device callback: push a new metatext entry of type 'meta' onto the device's
+ * metatext stack. 'text' (may be NULL) is copied; the entry starts out with
+ * empty bounds.
+ *
+ * The entry is only linked into the stack once it is completely initialised,
+ * so a failing fz_strdup cannot leave a half-built entry behind.
+ */
+static void
+fz_stext_begin_metatext(fz_context *ctx, fz_device *dev, fz_metatext meta, const char *text)
+{
+	fz_stext_device *tdev = (fz_stext_device*)dev;
+	metatext_t *mt = fz_malloc_struct(ctx, metatext_t);
+
+	fz_try(ctx)
+	{
+		mt->text = text ? fz_strdup(ctx, text) : NULL;
+	}
+	fz_catch(ctx)
+	{
+		/* Nothing references mt yet, so release it before passing the error on. */
+		fz_free(ctx, mt);
+		fz_rethrow(ctx);
+	}
+
+	mt->type = meta;
+	mt->bounds = fz_empty_rect;
+
+	/* Fully set up: make it the top of the stack. */
+	mt->prev = tdev->metatext;
+	tdev->metatext = mt;
+}
+
+/*
+ * Remove and free the top entry of the device's metatext stack (no-op when
+ * the stack is empty). The bounds gathered by the removed entry are merged
+ * into the entry below it, if there is one.
+ */
+static void
+pop_metatext(fz_context *ctx, fz_stext_device *dev)
+{
+	metatext_t *top = dev->metatext;
+	metatext_t *parent;
+	fz_rect top_bounds;
+
+	if (top == NULL)
+		return;
+
+	/* Take what we still need out of the entry before it is freed. */
+	parent = top->prev;
+	top_bounds = top->bounds;
+
+	fz_free(ctx, top->text);
+	fz_free(ctx, top);
+
+	dev->metatext = parent;
+	if (parent != NULL)
+		parent->bounds = fz_union_rect(parent->bounds, top_bounds);
+}
+
+/*
+ * Device callback: close the innermost metatext entry.
+ *
+ * Entries other than ActualText are simply popped. For ActualText, whatever
+ * is still stored in the entry's text is handed to flush_actualtext before
+ * the entry is popped. When no previous text position is recorded
+ * (last.valid is 0), the bounds collected for the entry are turned into a
+ * placement matrix, and a temporary Helvetica font is used if the device has
+ * no last font.
+ */
+static void
+fz_stext_end_metatext(fz_context *ctx, fz_device *dev)
+{
+	fz_stext_device *tdev = (fz_stext_device*)dev;
+	fz_font *myfont = NULL;
+
+	if (!tdev->metatext)
+		return; /* Mismatched pop. Live with it. */
+
+	if (tdev->metatext->type != FZ_METATEXT_ACTUALTEXT)
+	{
+		/* We only deal with ActualText here. Just pop anything else off,
+		 * and we're done. */
+		pop_metatext(ctx, tdev);
+		return;
+	}
+
+	/* If we have a 'last' text position, send the content after that. */
+	if (tdev->last.valid)
+	{
+		flush_actualtext(ctx, tdev, tdev->metatext->text, 0);
+		pop_metatext(ctx, tdev);
+		return;
+	}
+
+	/* If we have collected a rectangle for content that encloses the actual text,
+	 * send the content there. */
+	if (!fz_is_empty_rect(tdev->metatext->bounds))
+	{
+		/* Build a matrix that scales by the width/height of the bounds and
+		 * translates to their (x0, y0) corner. */
+		tdev->last.trm.a = tdev->metatext->bounds.x1 - tdev->metatext->bounds.x0;
+		tdev->last.trm.b = 0;
+		tdev->last.trm.c = 0;
+		tdev->last.trm.d = tdev->metatext->bounds.y1 - tdev->metatext->bounds.y0;
+		tdev->last.trm.e = tdev->metatext->bounds.x0;
+		tdev->last.trm.f = tdev->metatext->bounds.y0;
+	}
+	else
+		fz_warn(ctx, "Actualtext with no position. Text may be lost or mispositioned.");
+
+	/* myfont is assigned inside fz_try and read in fz_always. */
+	fz_var(myfont);
+
+	fz_try(ctx)
+	{
+		if (tdev->last.font == NULL)
+		{
+			/* Borrow a font for the duration of the flush only. */
+			myfont = fz_new_base14_font(ctx, "Helvetica");
+			tdev->last.font = myfont;
+		}
+		flush_actualtext(ctx, tdev, tdev->metatext->text, 0);
+		pop_metatext(ctx, tdev);
+	}
+	fz_always(ctx)
+	{
+		if (myfont)
+		{
+			/* Undo the temporary font so last.font does not keep a pointer
+			 * to the font that is dropped here. */
+			tdev->last.font = NULL;
+			fz_drop_font(ctx, myfont);
+		}
+	}
+	fz_catch(ctx)
+		fz_rethrow(ctx);
+}
+
+
+/* Images and shadings */
+
+/*
+ * Device callback: an image is drawn with transform 'ctm'.
+ *
+ * If an ActualText is collecting bounds, the image's footprint (the unit
+ * square transformed by ctm) is added to them. The image is stored on the
+ * page only when FZ_STEXT_PRESERVE_IMAGES is set and alpha is at least 0.5.
+ */
+static void
+fz_stext_fill_image(fz_context *ctx, fz_device *dev, fz_image *img, fz_matrix ctm, float alpha, fz_color_params color_params)
+{
+	fz_stext_device *tdev = (fz_stext_device*)dev;
+	fz_rect *at_bounds = actualtext_bounds(tdev);
+	int keep_images = (tdev->opts.flags & FZ_STEXT_PRESERVE_IMAGES) != 0;
+
+	/* Grow the bounds of any ActualText that is in force. */
+	if (at_bounds != NULL)
+	{
+		static const fz_rect unit_square = { 0, 0, 1, 1 };
+		*at_bounds = fz_union_rect(*at_bounds, fz_transform_rect(unit_square, ctm));
+	}
+
+	/* Images below 50% alpha are probably watermarks or effects; those, and
+	 * all images when they are not being preserved, are left out. */
+	if (keep_images && alpha >= 0.5f)
+		add_image_block_to_page(ctx, tdev->page, ctm, img);
+}
+
+/*
+ * Device callback: an image mask is painted. It is handled exactly like a
+ * plain image; 'cspace' and 'color' are not used.
+ */
+static void
+fz_stext_fill_image_mask(fz_context *ctx, fz_device *dev, fz_image *img, fz_matrix ctm,
+		fz_colorspace *cspace, const float *color, float alpha, fz_color_params color_params)
+{
+	fz_stext_fill_image(ctx, dev, img, ctm, alpha, color_params);
+}
+
+/*
+ * Render 'shade' into an RGB pixmap and wrap it in a new image.
+ *
+ * The rendered area is the shade's bounds under *in_out_ctm, intersected with
+ * 'scissor'. On return *in_out_ctm is replaced by the matrix that places the
+ * image: scale by the pixmap's width/height and translate to its x/y.
+ * On error nothing is returned and *in_out_ctm is left untouched.
+ */
+static fz_image *
+fz_new_image_from_shade(fz_context *ctx, fz_shade *shade, fz_matrix *in_out_ctm, fz_color_params color_params, fz_rect scissor)
+{
+	fz_matrix ctm = *in_out_ctm;
+	fz_pixmap *pix;
+	fz_image *img = NULL;
+	fz_rect bounds;
+	fz_irect bbox;
+
+	bounds = fz_bound_shade(ctx, shade, ctm);
+	bounds = fz_intersect_rect(bounds, scissor);
+	bbox = fz_irect_from_rect(bounds);
+
+	/* The pixmap only gets an alpha channel when there is no background to fill it with. */
+	pix = fz_new_pixmap_with_bbox(ctx, fz_device_rgb(ctx), bbox, NULL, !shade->use_background);
+	fz_try(ctx)
+	{
+		if (shade->use_background)
+			fz_fill_pixmap_with_color(ctx, pix, shade->colorspace, shade->background, color_params);
+		else
+			fz_clear_pixmap(ctx, pix);
+		fz_paint_shade(ctx, shade, NULL, ctm, pix, color_params, bbox, NULL, NULL);
+		img = fz_new_image_from_pixmap(ctx, pix, NULL);
+
+		/* Read the placement while we still own a reference to pix. Doing it
+		 * after the drop below would only be valid for as long as the image
+		 * keeps the pixmap alive. */
+		in_out_ctm->a = pix->w;
+		in_out_ctm->b = 0;
+		in_out_ctm->c = 0;
+		in_out_ctm->d = pix->h;
+		in_out_ctm->e = pix->x;
+		in_out_ctm->f = pix->y;
+	}
+	fz_always(ctx)
+		fz_drop_pixmap(ctx, pix);
+	fz_catch(ctx)
+		fz_rethrow(ctx);
+
+	return img;
+}
+
+/*
+ * Device callback: a shading is painted.
+ *
+ * Without FZ_STEXT_PRESERVE_IMAGES the shading is never rendered; at most its
+ * bounds are added to an ActualText that is collecting them. With
+ * FZ_STEXT_PRESERVE_IMAGES the shading is rendered to an image, limited to
+ * the current scissor, the optional clip rectangle and the page mediabox, and
+ * then handled like any other image.
+ */
+static void
+fz_stext_fill_shade(fz_context *ctx, fz_device *dev, fz_shade *shade, fz_matrix ctm, float alpha, fz_color_params color_params)
+{
+	fz_stext_device *tdev = (fz_stext_device*)dev;
+	fz_rect *bounds = actualtext_bounds(tdev);
+	fz_matrix local_ctm;
+	fz_rect scissor;
+	fz_image *image;
+
+	/* If we aren't keeping images, but we are in a bound, update the bounds
+	 * without generating the entire image. */
+	if ((tdev->opts.flags & FZ_STEXT_PRESERVE_IMAGES) == 0 && bounds)
+	{
+		*bounds = fz_union_rect(*bounds, fz_bound_shade(ctx, shade, ctm));
+		return;
+	}
+
+	/* Unless we are preserving image, nothing to do here. */
+	if ((tdev->opts.flags & FZ_STEXT_PRESERVE_IMAGES) == 0)
+		return;
+
+	local_ctm = ctm;
+	scissor = fz_device_current_scissor(ctx, dev);
+	/* FZ_STEXT_CLIP_RECT is a structured text option, so it has to be tested
+	 * on the stext options like the checks above, not on the flags member of
+	 * the generic fz_device that 'dev' points at. */
+	if (tdev->opts.flags & FZ_STEXT_CLIP_RECT)
+		scissor = fz_intersect_rect(scissor, tdev->opts.clip);
+	scissor = fz_intersect_rect(scissor, tdev->page->mediabox);
+	/* local_ctm is rewritten to the placement of the rendered image. */
+	image = fz_new_image_from_shade(ctx, shade, &local_ctm, color_params, scissor);
+	fz_try(ctx)
+		fz_stext_fill_image(ctx, dev, image, local_ctm, alpha, color_params);
+	fz_always(ctx)
+		fz_drop_image(ctx, image);
+	fz_catch(ctx)
+		fz_rethrow(ctx);
+}
+
+/*
+ * Walk a list of blocks (descending into structure blocks) and, for every
+ * text block:
+ *  - recompute each line's bbox as the union of its chars' quads,
+ *  - grow the block's bbox by each line's bbox,
+ *  - call reverse_bidi_line on lines that contain a char whose bidi value is 3.
+ */
+static void
+fixup_bboxes_and_bidi(fz_context *ctx, fz_stext_block *block)
+{
+	for ( ; block != NULL; block = block->next)
+	{
+		fz_stext_line *line;
+
+		/* Structure blocks only matter for the blocks nested below them. */
+		if (block->type == FZ_STEXT_BLOCK_STRUCT && block->u.s.down)
+			fixup_bboxes_and_bidi(ctx, block->u.s.down->first_block);
+
+		if (block->type != FZ_STEXT_BLOCK_TEXT)
+			continue;
+
+		for (line = block->u.t.first_line; line; line = line->next)
+		{
+			fz_stext_char *ch = line->first_char;
+			int needs_reorder = 0;
+
+			if (ch != NULL)
+			{
+				/* The first char seeds the line box; the rest extend it. */
+				line->bbox = fz_rect_from_quad(ch->quad);
+				if (ch->bidi == 3)
+					needs_reorder = 1;
+
+				for (ch = ch->next; ch != NULL; ch = ch->next)
+				{
+					line->bbox = fz_union_rect(line->bbox, fz_rect_from_quad(ch->quad));
+					if (ch->bidi == 3)
+						needs_reorder = 1;
+				}
+			}
+
+			block->bbox = fz_union_rect(block->bbox, line->bbox);
+
+			if (needs_reorder)
+				reverse_bidi_line(line);
+		}
+	}
+}
+
+/*
+ * Slide point 'a' along the straight line through 'a' and 'b' until its x
+ * coordinate is 'x'; a->y is updated by linear interpolation.
+ * The expression divides by (b.x - a->x).
+ */
+static void
+advance_to_x(fz_point *a, fz_point b, float x)
+{
+	a->y += (b.y - a->y) * (x - a->x) / (b.x - a->x);
+	a->x = x;
+}
+
+/*
+ * Slide point 'a' along the straight line through 'a' and 'b' until its y
+ * coordinate is 'y'; a->x is updated by linear interpolation.
+ * The expression divides by (b.y - a->y).
+ */
+static void
+advance_to_y(fz_point *a, fz_point b, float y)
+{
+	a->x += (b.x - a->x) * (y - a->y) / (b.y - a->y);
+	a->y = y;
+}
+
+/*
+ * Decide whether the segment from 'a' to 'b' touches rectangle 'r'.
+ *
+ * Segments lying entirely beyond one side of r are rejected straight away.
+ * Otherwise 'a' is moved along the segment onto each edge of r that it lies
+ * beyond, and the result is whether the moved point is inside r.
+ */
+static int
+line_crosses_rect(fz_point a, fz_point b, fz_rect r)
+{
+	int both_before_x0 = (a.x < r.x0) && (b.x < r.x0);
+	int both_after_x1 = (a.x > r.x1) && (b.x > r.x1);
+	int both_before_y0 = (a.y < r.y0) && (b.y < r.y0);
+	int both_after_y1 = (a.y > r.y1) && (b.y > r.y1);
+
+	/* Cope with trivial exclusions */
+	if (both_before_x0 || both_after_x1 || both_before_y0 || both_after_y1)
+		return 0;
+
+	/* Pull the start point onto the rectangle, one edge at a time. Each
+	 * test looks at the point as left by the previous step. */
+	if (a.x < r.x0)
+	{
+		advance_to_x(&a, b, r.x0);
+	}
+	if (a.x > r.x1)
+	{
+		advance_to_x(&a, b, r.x1);
+	}
+	if (a.y < r.y0)
+	{
+		advance_to_y(&a, b, r.y0);
+	}
+	if (a.y > r.y1)
+	{
+		advance_to_y(&a, b, r.y1);
+	}
+
+	return fz_is_point_inside_rect(a, r);
+}
+
+/*
+ * Return the absolute value of the 2D cross product of (origin - p) with
+ * 'dir'. For a unit length 'dir' this is the distance of 'p' from the line
+ * through 'origin' with direction 'dir'.
+ */
+static float
+calculate_ascent(fz_point p, fz_point origin, fz_point dir)
+{
+	return fabsf((origin.x-p.x)*dir.y - (origin.y-p.y)*dir.x);
+}
+
+/* Create us a rect from the given quad, but extend it downwards
+ * to allow for underlines that pass under the glyphs.
+ *
+ * 'dir' is the direction of the baseline, 'origin' a point on it and 'size'
+ * the size used to scale the margins. The result is the rect enclosing the
+ * adjusted quad. */
+static fz_rect expanded_rect_from_quad(fz_quad quad, fz_point dir, fz_point origin, float size)
+{
+	/* Consider the two rects from A and g respectively.
+	 *
+	 * ul +------+ ur   or
+	 *    |  /\  |         ul +------+ ur
+	 *    | /__\ |            | /''\ |
+	 *    |/    \|            |(    ||
+	 * ll +------+ lr         | ''''||
+	 *                        |  ''' | <-expected underline level
+	 *                     ll +------+ lr
+	 *
+	 * So an underline won't cross A's rect, but will cross g's.
+	 * We want to make a rect that includes a suitable amount of
+	 * space underneath. The information we have available to us
+	 * is summed up here:
+	 *
+	 *  ul +---------+ ur
+	 *     |         |
+	 *     | origin  |
+	 *     |+----------> dir
+	 *     |         |
+	 *  ll +---------+ lr
+	 *
+	 * Consider the distance from ul to the line that passes through
+	 * the origin with direction dir. Similarly, consider the distance
+	 * from ur to the same line. This can be thought of as the 'ascent'
+	 * of this character.
+	 *
+	 * We'd like the distance from ul to ll to be greater than this, so
+	 * as to ensure we cover the possible location where an underline
+	 * might reasonably go.
+	 *
+	 * If we have a line (l) through point A with direction vector u,
+	 * the distance between point P and line(l) is:
+	 *
+	 * d(P,l) = || AP x u || / || u ||
+	 *
+	 * where x is the cross product.
+	 *
+	 * For us, because || dir || = 1:
+	 *
+	 * d(ul, origin) = || (origin-ul) x dir ||
+	 *
+	 * The cross product is only defined in 3 (or 7!) dimensions, so
+	 * extend both vectors into 3d by defining a 0 z component.
+	 *
+	 * (origin-ul) x dir = [ (origin.y - ul.y) . 0     - 0                 . dir.y ]
+	 *                     [ 0                 . dir.x - (origin.x - ul.x) . 0     ]
+	 *                     [ (origin.x - ul.x) . dir.y - (origin.y - ul.y) . dir.x ]
+	 *
+	 * So d(ul, origin) = abs(D) where D = (origin.x-ul.x).dir.y - (origin.y-ul.y).dir.x
+	 */
+	/* ascent: mean distance of the two upper corners from the baseline. */
+	float ascent = (calculate_ascent(quad.ul, origin, dir) + calculate_ascent(quad.ur, origin, dir)) / 2;
+	fz_point left = { quad.ll.x - quad.ul.x, quad.ll.y - quad.ul.y };
+	fz_point right = { quad.lr.x - quad.ur.x, quad.lr.y - quad.ur.y };
+	/* height: mean length of the left and right edges of the quad. */
+	float height = (hypotf(left.x, left.y) + hypotf(right.x, right.y))/2;
+	int neg = 0;
+	float extra_rise = 0;
+
+	/* Spaces will have 0 ascent. underscores will have small ascent.
+	 * We want a sane ascent to be able to spot strikeouts, but not
+	 * so big that it incorporates lines above the text, like borders. */
+	if (ascent < 0.75*size)
+		extra_rise = 0.75*size - ascent;
+
+	/* We'd like height to be at least ascent + 1/4 size */
+	/* (height is a mean of hypotf results, so this sign handling is not
+	 * expected to trigger.) */
+	if (height < 0)
+		neg = 1, height = -height;
+	if (height < ascent + size * 0.25f)
+		height = ascent + size * 0.25f;
+
+	/* From here on 'height' is the amount by which the lower corners are moved. */
+	height -= ascent;
+	if (neg)
+		height = -height;
+	/* Lower corners are displaced along (-dir.y, dir.x), the perpendicular of dir... */
+	quad.ll.x += - height * dir.y;
+	quad.ll.y +=   height * dir.x;
+	quad.lr.x += - height * dir.y;
+	quad.lr.y +=   height * dir.x;
+	/* ...and upper corners the opposite way, by extra_rise. */
+	quad.ul.x -= - extra_rise * dir.y;
+	quad.ul.y -=   extra_rise * dir.x;
+	quad.ur.x -= - extra_rise * dir.y;
+	quad.ur.y -=   extra_rise * dir.x;
+
+	return fz_rect_from_quad(quad);
+}
+
+/*
+ * Approximate float equality: returns 1 when 'a' and 'b' differ by less
+ * than EPSILON, 0 otherwise (including when either value is a NaN).
+ */
+static int feq(float a,float b)
+{
+#define EPSILON 0.00001
+	float delta = a - b;
+
+	/* Compare the magnitude of the difference against the tolerance. */
+	return (delta < 0 ? -delta : delta) < EPSILON;
+}
+
+/*
+ * Flag chars that are crossed by the thin line running from 'from' to 'to'.
+ *
+ * Only text blocks of the given list are looked at, and only lines whose
+ * direction equals 'dir' or its opposite (within feq tolerance). A char is
+ * tested against its quad as widened by expanded_rect_from_quad. A crossed
+ * char gets FZ_STEXT_STRIKEOUT or FZ_STEXT_UNDERLINE depending on which side
+ * of the char's baseline the point 'from' lies.
+ */
+static void
+check_strikeout(fz_context *ctx, fz_stext_block *block, fz_point from, fz_point to, fz_point dir, float thickness)
+{
+	/* If the thickness is more than a 1/4 of the size, it's a highlight,
+	 * not a line! Chars smaller than this limit are skipped. */
+	const float min_size = thickness*4;
+
+	while (block)
+	{
+		fz_stext_line *line = NULL;
+
+		if (block->type == FZ_STEXT_BLOCK_TEXT)
+			line = block->u.t.first_line;
+
+		for ( ; line != NULL; line = line->next)
+		{
+			int same_dir = feq(line->dir.x, dir.x) && feq(line->dir.y, dir.y);
+			int opposite_dir = feq(line->dir.x, -dir.x) && feq(line->dir.y, -dir.y);
+			fz_stext_char *ch;
+
+			/* Only lines running parallel to the rule can be marked by it. */
+			if (!same_dir && !opposite_dir)
+				continue;
+
+			/* The chars are tested one by one rather than via line->bbox. */
+			for (ch = line->first_char; ch; ch = ch->next)
+			{
+				fz_point up;
+				float dx, dy, dot;
+				fz_rect ch_box;
+
+				if (ch->size < min_size)
+					continue;
+
+				ch_box = expanded_rect_from_quad(ch->quad, line->dir, ch->origin, ch->size);
+				if (!line_crosses_rect(from, to, ch_box))
+					continue;
+
+				/* The baseline moves from ch->origin in the direction
+				 * line->dir; 'up' is perpendicular to it. */
+				up.x = line->dir.y;
+				up.y = -line->dir.x;
+
+				/* Displacement of the rule from the char origin, projected
+				 * onto 'up' (which is normalised). */
+				dx = from.x - ch->origin.x;
+				dy = from.y - ch->origin.y;
+				dot = dx * up.x + dy * up.y;
+
+				/* Positive side: strikeout. Otherwise: underline. */
+				ch->flags |= (dot > 0) ? FZ_STEXT_STRIKEOUT : FZ_STEXT_UNDERLINE;
+			}
+		}
+
+		block = block->next;
+	}
+}
+
+/*
+ * Run check_strikeout over the page's top level block list once for every
+ * thin rectangle the device has recorded, passing the rectangle's centre
+ * line, its normalised direction and its thickness.
+ */
+static void
+check_rects_for_strikeout(fz_context *ctx, fz_stext_device *tdev, fz_stext_page *page)
+{
+	int count = tdev->rect_len;
+	int idx = 0;
+
+	while (idx < count)
+	{
+		fz_point start = tdev->rects[idx].from;
+		fz_point finish = tdev->rects[idx].to;
+		float thickness = tdev->rects[idx].thickness;
+		fz_point heading;
+
+		/* Unit vector pointing from the start to the end of the rule. */
+		heading.x = finish.x - start.x;
+		heading.y = finish.y - start.y;
+		heading = fz_normalize_vector(heading);
+
+		check_strikeout(ctx, page->first_block, start, finish, heading, thickness);
+		idx++;
+	}
+}
+
+/*
+ * Device callback: finish the page once all content has been received.
+ *
+ * Bounding boxes and bidi ordering are always fixed up. The remaining passes
+ * run only when their option flag is set, in this order: strikeout/underline
+ * detection, segmentation, paragraph breaking, table hunting.
+ */
+static void
+fz_stext_close_device(fz_context *ctx, fz_device *dev)
+{
+	fz_stext_device *tdev = (fz_stext_device*)dev;
+	fz_stext_page *page = tdev->page;
+	const int wanted = tdev->opts.flags;
+
+	fixup_bboxes_and_bidi(ctx, page->first_block);
+
+	if (wanted & FZ_STEXT_COLLECT_STYLES)
+	{
+		check_rects_for_strikeout(ctx, tdev, page);
+	}
+
+	/* TODO: smart sorting of blocks and lines in reading order */
+	/* TODO: unicode NFC normalization */
+
+	if (wanted & FZ_STEXT_SEGMENT)
+	{
+		fz_segment_stext_page(ctx, page);
+	}
+
+	if (wanted & FZ_STEXT_PARAGRAPH_BREAK)
+	{
+		fz_paragraph_break(ctx, page);
+	}
+
+	if (wanted & FZ_STEXT_TABLE_HUNT)
+	{
+		fz_table_hunt(ctx, page);
+	}
+}
+
+/*
+ * Device callback: release what the device itself holds - the remembered
+ * text object, the remembered font, any metatext entries still on the stack
+ * and the array of recorded rectangles.
+ */
+static void
+fz_stext_drop_device(fz_context *ctx, fz_device *dev)
+{
+	fz_stext_device *tdev = (fz_stext_device*)dev;
+
+	fz_drop_text(ctx, tdev->lasttext);
+	fz_drop_font(ctx, tdev->last.font);
+
+	/* Unwind entries that were begun but never ended. */
+	for ( ; tdev->metatext != NULL; )
+	{
+		pop_metatext(ctx, tdev);
+	}
+
+	fz_free(ctx, tdev->rects);
+}
+
+/*
+ * Parse an option value of the form "x0:y0:x1:y1" into *rp.
+ *
+ * Returns 1 and fills *rp when all four fields are present and non-empty,
+ * otherwise returns 0 and leaves *rp untouched. Each field is converted with
+ * fz_atof. The search for ':' separators never goes beyond the next ',' so
+ * that colons belonging to a later entry of a comma separated option string
+ * cannot be mistaken for separators of this value.
+ */
+static int
+val_is_rect(const char *val, fz_rect *rp)
+{
+	const char *end = val + strcspn(val, ",");
+	float coord[4];
+	int i;
+
+	for (i = 0; i < 4; i++)
+	{
+		/* An empty field is never acceptable. */
+		if (val == end)
+			return 0;
+
+		coord[i] = fz_atof(val);
+
+		/* The first three fields must be followed by a separator. */
+		if (i < 3)
+		{
+			const char *sep = memchr(val, ':', (size_t)(end - val));
+			if (sep == NULL || sep == val)
+				return 0;
+			val = sep + 1;
+		}
+	}
+
+	rp->x0 = coord[0];
+	rp->y0 = coord[1];
+	rp->x1 = coord[2];
+	rp->y1 = coord[3];
+
+	return 1;
+}
+
+/*
+ * Fill *opts from an option string and return opts.
+ *
+ * *opts is cleared first. Every option listed in the table below sets its
+ * flag when given with the value "yes". Clipping (FZ_STEXT_CLIP) is on by
+ * default and is switched off by "clip=no" or by the deprecated
+ * "mediabox-clip=no"; giving both still leaves it off. "clip-rect" takes a
+ * rectangle in the form accepted by val_is_rect. "resolution" sets the scale
+ * relative to 96ppi; the default scale is 1.
+ */
+fz_stext_options *
+fz_parse_stext_options(fz_context *ctx, fz_stext_options *opts, const char *string)
+{
+	/* Options that simply switch a flag on when set to "yes". */
+	static const struct
+	{
+		const char *name;
+		int flag;
+	} yes_flags[] =
+	{
+		{ "preserve-ligatures", FZ_STEXT_PRESERVE_LIGATURES },
+		{ "preserve-whitespace", FZ_STEXT_PRESERVE_WHITESPACE },
+		{ "preserve-images", FZ_STEXT_PRESERVE_IMAGES },
+		{ "inhibit-spaces", FZ_STEXT_INHIBIT_SPACES },
+		{ "dehyphenate", FZ_STEXT_DEHYPHENATE },
+		{ "preserve-spans", FZ_STEXT_PRESERVE_SPANS },
+		{ "structured", FZ_STEXT_COLLECT_STRUCTURE },
+		{ "use-cid-for-unknown-unicode", FZ_STEXT_USE_CID_FOR_UNKNOWN_UNICODE },
+		{ "use-gid-for-unknown-unicode", FZ_STEXT_USE_GID_FOR_UNKNOWN_UNICODE },
+		{ "accurate-bboxes", FZ_STEXT_ACCURATE_BBOXES },
+		{ "vectors", FZ_STEXT_COLLECT_VECTORS },
+		{ "ignore-actualtext", FZ_STEXT_IGNORE_ACTUALTEXT },
+		{ "segment", FZ_STEXT_SEGMENT },
+		{ "paragraph-break", FZ_STEXT_PARAGRAPH_BREAK },
+		{ "table-hunt", FZ_STEXT_TABLE_HUNT },
+		{ "collect-styles", FZ_STEXT_COLLECT_STYLES },
+		{ "accurate-ascenders", FZ_STEXT_ACCURATE_ASCENDERS },
+		{ "accurate-side-bearings", FZ_STEXT_ACCURATE_SIDE_BEARINGS },
+	};
+	const char *val;
+	size_t i;
+
+	memset(opts, 0, sizeof *opts);
+
+	for (i = 0; i < sizeof yes_flags / sizeof yes_flags[0]; i++)
+	{
+		if (fz_has_option(ctx, string, yes_flags[i].name, &val) && fz_option_eq(val, "yes"))
+			opts->flags |= yes_flags[i].flag;
+	}
+
+	/* Clipping defaults to on. The flag is cleared (not toggled) so that
+	 * "mediabox-clip=no" together with "clip=no" cannot switch it back on. */
+	opts->flags |= FZ_STEXT_CLIP;
+	if (fz_has_option(ctx, string, "mediabox-clip", &val))
+	{
+		fz_warn(ctx, "The 'mediabox-clip' option has been deprecated. Use 'clip' instead.");
+		if (fz_option_eq(val, "no"))
+			opts->flags &= ~FZ_STEXT_CLIP;
+	}
+	if (fz_has_option(ctx, string, "clip", &val) && fz_option_eq(val, "no"))
+		opts->flags &= ~FZ_STEXT_CLIP;
+	if (fz_has_option(ctx, string, "clip-rect", &val) && val_is_rect(val, &opts->clip))
+		opts->flags |= FZ_STEXT_CLIP_RECT;
+
+	opts->scale = 1;
+	if (fz_has_option(ctx, string, "resolution", &val))
+		opts->scale = fz_atof(val) / 96.0f; /* HTML base resolution is 96ppi */
+
+	return opts;
+}
+
+/* State shared by the is_rect_* path walker callbacks while they record the
+ * points of a path that may turn out to be a four cornered shape. */
+typedef struct
+{
+	int fail; /* set to 1 as soon as the path has been rejected */
+	int count; /* number of entries of corners[] filled in so far */
+	fz_point corners[4]; /* recorded points, in the order they were seen */
+} is_rect_data;
+
+/*
+ * Append (x, y) to the recorded corners. A fifth point does not fit and
+ * marks the path as rejected instead.
+ */
+static void
+stash_point(is_rect_data *rd, float x, float y)
+{
+	fz_point *slot;
+
+	if (rd->count >= 4)
+	{
+		rd->fail = 1;
+		return;
+	}
+
+	slot = &rd->corners[rd->count];
+	slot->x = x;
+	slot->y = y;
+	rd->count += 1;
+}
+
+/*
+ * Path walker callback for moveto. Only a moveto that comes before any
+ * recorded point is accepted; a later one rejects the path.
+ */
+static void
+is_rect_moveto(fz_context *ctx, void *arg, float x, float y)
+{
+	is_rect_data *rd = arg;
+
+	if (rd->fail)
+		return;
+
+	if (rd->count == 0)
+		stash_point(rd, x, y);
+	else
+		rd->fail = 1;
+}
+
+/*
+ * Path walker callback for lineto. Each line end point is recorded as the
+ * next corner. Once four corners are known, a line that returns exactly to
+ * the first corner closes the shape and is ignored; any other further point
+ * rejects the path (via stash_point).
+ */
+static void
+is_rect_lineto(fz_context *ctx, void *arg, float x, float y)
+{
+	is_rect_data *rd = arg;
+	if (rd->fail)
+		return;
+
+	/* Both coordinates are compared against the start point, corners[0].
+	 * Taking y from corners[1] would only recognise the closing line of
+	 * shapes whose first edge happens to be horizontal. */
+	if (rd->count == 4 && rd->corners[0].x == x && rd->corners[0].y == y)
+		return;
+
+	stash_point(rd, x, y);
+}
+
+/*
+ * Path walker callback for curveto. Any curve rejects the path; the control
+ * points are not looked at.
+ */
+static void
+is_rect_curveto(fz_context *ctx, void *arg, float x1, float y1, float x2, float y2, float x3, float y3)
+{
+	((is_rect_data *)arg)->fail = 1;
+}
+
+/*
+ * Path walker callback for closepath. With three corners recorded, the start
+ * point is added again as the fourth; with four nothing changes; any other
+ * number of corners rejects the path.
+ */
+static void
+is_rect_closepath(fz_context *ctx, void *arg)
+{
+	is_rect_data *rd = arg;
+
+	if (rd->fail)
+		return;
+
+	switch (rd->count)
+	{
+	case 3:
+		/* Three points recorded: repeat the start point as the fourth. */
+		stash_point(rd, rd->corners[0].x, rd->corners[0].y);
+		break;
+	case 4:
+		break;
+	default:
+		rd->fail = 1;
+		break;
+	}
+}
+
+/*
+ * Decide whether 'path' (under 'ctm') is plausibly a rectangle, possibly
+ * rotated, or a single straight line.
+ *
+ * Returns 0 if it is not. Returns 1 if it is, and then sets:
+ *   *from, *to  - the end points of the centre line running along the
+ *                 longer pair of sides,
+ *   *thickness  - the length of the shorter side.
+ * A path consisting of just two points is treated as a rectangle of zero
+ * thickness. Paths that did not yield exactly four corners are rejected.
+ */
+static int
+is_path_rect(fz_context *ctx, const fz_path *path, fz_point *from, fz_point *to, float *thickness, fz_matrix ctm)
+{
+	float d01, d01x, d01y, d03, d03x, d03y, d32x, d32y;
+	is_rect_data rd = { 0 };
+	static const fz_path_walker walker =
+	{
+		is_rect_moveto, is_rect_lineto, is_rect_curveto, is_rect_closepath
+	};
+	int i;
+
+	fz_walk_path(ctx, path, &walker, &rd);
+
+	if (rd.fail)
+		return 0;
+
+	/* A bare line: double it back on itself to get four corners. */
+	if (rd.count == 2)
+	{
+		stash_point(&rd, rd.corners[1].x, rd.corners[1].y);
+		stash_point(&rd, rd.corners[0].x, rd.corners[0].y);
+	}
+
+	/* Anything else that did not give four corners (an empty path, a lone
+	 * moveto, an unclosed three point path) must not be judged on corners
+	 * that were never recorded. */
+	if (rd.count != 4)
+		return 0;
+
+	for (i = 0 ; i < 4; i++)
+		rd.corners[i] = fz_transform_point(rd.corners[i], ctm);
+
+	/* So we have a 4 cornered path. Hopefully something like:
+	 * 0---------1
+	 * |         |
+	 * 3---------2
+	 * but it might be:
+	 * 0---------3
+	 * |         |
+	 * 1---------2
+	*/
+	d01x = rd.corners[1].x - rd.corners[0].x;
+	d01y = rd.corners[1].y - rd.corners[0].y;
+	d01 = d01x * d01x + d01y * d01y;
+	d03x = rd.corners[3].x - rd.corners[0].x;
+	d03y = rd.corners[3].y - rd.corners[0].y;
+	d03 = d03x * d03x + d03y * d03y;
+	if (d01 < d03)
+	{
+		/* We are the latter case. Transpose it, so that 0->1 is the long
+		 * side, and recompute the side vectors for the new numbering. */
+		fz_point p = rd.corners[1];
+		rd.corners[1] = rd.corners[3];
+		rd.corners[3] = p;
+
+		d01x = rd.corners[1].x - rd.corners[0].x;
+		d01y = rd.corners[1].y - rd.corners[0].y;
+		d03x = rd.corners[3].x - rd.corners[0].x;
+		d03y = rd.corners[3].y - rd.corners[0].y;
+	}
+	d32x = rd.corners[2].x - rd.corners[3].x;
+	d32y = rd.corners[2].y - rd.corners[3].y;
+
+	/* The two long sides (0->1 and 3->2) need to be the same vector for
+	 * this to be a strikeout. */
+	if (!feq(d32x, d01x) || !feq(d32y, d01y))
+		return 0;
+
+	/* We are plausibly a rectangle. */
+	*thickness = sqrtf(d03x * d03x + d03y * d03y);
+
+	/* The centre line joins the midpoints of the two short sides. */
+	from->x = (rd.corners[0].x + rd.corners[3].x)/2;
+	from->y = (rd.corners[0].y + rd.corners[3].y)/2;
+	to->x = (rd.corners[1].x + rd.corners[2].x)/2;
+	to->y = (rd.corners[1].y + rd.corners[2].y)/2;
+
+	return 1;
+}
+
+/*
+ * If 'path' is a (possibly rotated) rectangle according to is_path_rect,
+ * append its centre line and thickness to the device's list of rectangles.
+ * The list starts with room for 32 entries and doubles whenever it is full.
+ */
+static void
+check_for_strikeout(fz_context *ctx, fz_stext_device *tdev, fz_stext_page *page, const fz_path *path, fz_matrix ctm)
+{
+	fz_point line_start, line_end;
+	float line_width;
+	int slot;
+
+	/* Is this path a thin rectangle (possibly rotated)? If so, then we need to
+	 * consider it as being a strikeout or underline. */
+	if (!is_path_rect(ctx, path, &line_start, &line_end, &line_width, ctm))
+		return;
+
+	/* Make room in the device's list when it is full. */
+	if (tdev->rect_len == tdev->rect_max)
+	{
+		int grown = (tdev->rect_max == 0) ? 32 : tdev->rect_max * 2;
+
+		tdev->rects = fz_realloc(ctx, tdev->rects, sizeof(*tdev->rects) * grown);
+		tdev->rect_max = grown;
+	}
+
+	slot = tdev->rect_len;
+	tdev->rects[slot].from = line_start;
+	tdev->rects[slot].to = line_end;
+	tdev->rects[slot].thickness = line_width;
+	tdev->rect_len = slot + 1;
+}
+
+/*
+ * Append a vector block to the page, covering 'bbox' and carrying the given
+ * vector flags and packed colour.
+ */
+static void
+add_vector(fz_context *ctx, fz_stext_page *page, fz_rect bbox, uint32_t flags, uint32_t argb)
+{
+	fz_stext_block *vec = add_block_to_page(ctx, page);
+
+	vec->type = FZ_STEXT_BLOCK_VECTOR;
+	vec->u.v.argb = argb;
+	vec->u.v.flags = flags;
+	vec->bbox = bbox;
+}
+
+/* State of the path walker that splits a path into rectangles and
+ * "everything else" (see the split_* callbacks and maybe_rect). */
+typedef struct
+{
+	fz_matrix ctm; /* applied to every incoming path coordinate */
+	uint32_t argb; /* colour passed on to add_vector */
+	uint32_t flags; /* base vector flags passed on to add_vector */
+	fz_stext_page *page; /* page that receives the vector blocks */
+	fz_rect leftovers; /* bounds of all points that were not part of a rectangle */
+	fz_rect pending; /* most recently found rectangle, not yet emitted */
+	int count; /* points held in p[] for the current subpath; -1 once it cannot be a rectangle */
+	fz_point p[5]; /* transformed points of the current subpath */
+} split_path_data;
+
+/*
+ * Called at the end of a subpath: decide what to do with the points held in
+ * sp->p[].
+ *
+ * If they form an axis aligned line or an axis aligned rectangle, their
+ * bounds become the new 'pending' rectangle; a rectangle that was already
+ * pending is emitted first, flagged FZ_STEXT_VECTOR_CONTINUES. Otherwise the
+ * points are folded into 'leftovers'. Nothing happens when sp->count is
+ * negative, i.e. when the subpath was already found not to be a rectangle.
+ */
+static void
+maybe_rect(fz_context *ctx, split_path_data *sp)
+{
+	int rect = 0;
+	int i;
+
+	if (sp->count >= 0)
+	{
+		if (sp->count == 3)
+		{
+			/* Allow for "moveto A, lineto B, lineto A, close" */
+			/* NOTE(review): the test accepts p[2] sharing just ONE
+			 * coordinate with p[0] ('||'); confirm whether '&&' was
+			 * intended, given the case described above. */
+			if (feq(sp->p[0].x, sp->p[2].x) || feq(sp->p[0].y, sp->p[2].y))
+				sp->count = 2;
+		}
+		if (sp->count == 2)
+		{
+			/* A horizontal or vertical line. */
+			if (feq(sp->p[0].x, sp->p[1].x) || feq(sp->p[0].y, sp->p[1].y))
+				rect = 1; /* Count that as a rect */
+		}
+		else if (sp->count == 4 || sp->count == 5)
+		{
+			/* Four corners whose edges alternate vertical/horizontal, in
+			 * either of the two possible orders. */
+			if (feq(sp->p[0].x, sp->p[1].x) && feq(sp->p[2].x, sp->p[3].x) && feq(sp->p[0].y, sp->p[3].y) && feq(sp->p[1].y, sp->p[2].y))
+				rect = 1;
+			else if (feq(sp->p[0].x, sp->p[3].x) && feq(sp->p[1].x, sp->p[2].x) && feq(sp->p[0].y, sp->p[1].y) && feq(sp->p[2].y, sp->p[3].y))
+				rect = 1;
+		}
+		if (rect)
+		{
+			fz_rect bounds;
+
+			bounds.x0 = bounds.x1 = sp->p[0].x;
+			bounds.y0 = bounds.y1 = sp->p[0].y;
+			for (i = 1; i < sp->count; i++)
+				bounds = fz_include_point_in_rect(bounds, sp->p[i]);
+			/* Emit the previously found rectangle, if any, before
+			 * remembering this one. */
+			if (fz_is_valid_rect(sp->pending))
+				add_vector(ctx, sp->page, sp->pending, sp->flags | FZ_STEXT_VECTOR_IS_RECTANGLE | FZ_STEXT_VECTOR_CONTINUES, sp->argb);
+			sp->pending = bounds;
+			return;
+		}
+
+		/* Not a rectangle: the points only contribute to the leftover bounds. */
+		for (i = 0; i < sp->count; i++)
+			sp->leftovers = fz_include_point_in_rect(sp->leftovers, sp->p[i]);
+	}
+}
+
+/*
+ * Path walker callback for moveto: settle the subpath collected so far via
+ * maybe_rect, then start a new one at the transformed point.
+ */
+static void
+split_move(fz_context *ctx, void *arg, float x, float y)
+{
+	split_path_data *sp = arg;
+	fz_point first = fz_transform_point_xy(x, y, sp->ctm);
+
+	maybe_rect(ctx, sp);
+
+	sp->count = 1;
+	sp->p[0] = first;
+}
+
+/*
+ * Path walker callback for lineto.
+ *
+ * While the current subpath can still be a rectangle, the transformed point
+ * is recorded in sp->p[] (lines to the previous point are dropped, and a
+ * fifth point is only accepted if it returns to the start). As soon as that
+ * is no longer possible, all recorded points and every later point are folded
+ * into sp->leftovers and sp->count is set to -1.
+ */
+static void
+split_line(fz_context *ctx, void *arg, float x, float y)
+{
+	split_path_data *sp = (split_path_data *)arg;
+	fz_point p = fz_transform_point_xy(x, y, sp->ctm);
+	int i;
+
+	if (sp->count >= 0)
+	{
+		/* Check for lines to the same point. There is only a previous point
+		 * to compare with once something has been recorded: after a
+		 * closepath sp->count is 0, and p[count-1] would be read from in
+		 * front of the array. */
+		if (sp->count > 0 && feq(sp->p[sp->count-1].x, p.x) && feq(sp->p[sp->count-1].y, p.y))
+			return;
+		/* If we're still maybe a rect, just record the point. */
+		if (sp->count < 4)
+		{
+			sp->p[sp->count++] = p;
+			return;
+		}
+		/* Check for close line? */
+		if (sp->count == 4)
+		{
+			if (feq(sp->p[0].x, p.x) && feq(sp->p[0].y, p.y))
+			{
+				/* We've just drawn a line back to the start point. */
+				/* Needless saving of point, but it makes the logic
+				 * easier elsewhere. */
+				sp->p[sp->count++] = p;
+				return;
+			}
+		}
+		/* We can no longer be a rect. Output the points we had saved. */
+		for (i = 0; i < sp->count; i++)
+			sp->leftovers = fz_include_point_in_rect(sp->leftovers, sp->p[i]);
+		/* Remember we're not a rect. */
+		sp->count = -1;
+	}
+	/* Roll this point into the non-rect bounds. */
+	sp->leftovers = fz_include_point_in_rect(sp->leftovers, p);
+}
+
+/*
+ * Path walker callback for curveto. A curve means the current subpath cannot
+ * be a rectangle: the points recorded so far, followed by the three
+ * transformed points of the curve, are folded into sp->leftovers and
+ * sp->count becomes -1.
+ */
+static void
+split_curve(fz_context *ctx, void *arg, float x1, float y1, float x2, float y2, float x3, float y3)
+{
+	split_path_data *sp = arg;
+	fz_point curve[3];
+	int k;
+
+	curve[0] = fz_transform_point_xy(x1, y1, sp->ctm);
+	curve[1] = fz_transform_point_xy(x2, y2, sp->ctm);
+	curve[2] = fz_transform_point_xy(x3, y3, sp->ctm);
+
+	/* Flush whatever had been saved as a rectangle candidate. */
+	if (sp->count >= 0)
+	{
+		for (k = 0; k < sp->count; k++)
+			sp->leftovers = fz_include_point_in_rect(sp->leftovers, sp->p[k]);
+		sp->count = -1;
+	}
+
+	/* The curve's own points go straight into the non-rect bounds. */
+	for (k = 0; k < 3; k++)
+		sp->leftovers = fz_include_point_in_rect(sp->leftovers, curve[k]);
+}
+
+/*
+ * Path walker callback for closepath: settle the current subpath via
+ * maybe_rect and reset the point count to zero.
+ */
+static void
+split_close(fz_context *ctx, void *arg)
+{
+	split_path_data *state = arg;
+
+	maybe_rect(ctx, state);
+
+	/* No points are held for whatever follows the close. */
+	state->count = 0;
+}
+
+
+/* Path walker that feeds a path to the split_* callbacks above, in the
+ * order moveto, lineto, curveto, closepath. */
+static const
+fz_path_walker split_path_rects =
+{
+	split_move,
+	split_line,
+	split_curve,
+	split_close
+};
+
+/* Walk 'path' and record vector entries for it on 'page'.
+ *
+ * The path is walked with the split_path_rects callbacks, which accumulate
+ * rectangle candidates in 'pending' and everything else in 'leftovers'.
+ * Afterwards any pending rectangle is added with FZ_STEXT_VECTOR_IS_RECTANGLE
+ * (plus FZ_STEXT_VECTOR_CONTINUES when leftovers follow), and the leftovers
+ * bounds are added as a plain vector.
+ *
+ * 'stroke' selects FZ_STEXT_VECTOR_IS_STROKED for every vector added.
+ * 'cp' is accepted but not used here. */
+static void
+add_vectors_from_path(fz_context *ctx, fz_stext_page *page, const fz_path *path, fz_matrix ctm, fz_colorspace *cs, const float *color, float alpha, fz_color_params cp, int stroke)
+{
+	split_path_data state;
+	int leftovers_seen;
+
+	state.page = page;
+	state.ctm = ctm;
+	state.flags = stroke ? FZ_STEXT_VECTOR_IS_STROKED : 0;
+	state.argb = hexrgba_from_color(ctx, cs, color, alpha);
+	state.count = 0;
+	state.pending = fz_empty_rect;
+	state.leftovers = fz_empty_rect;
+
+	fz_walk_path(ctx, path, &split_path_rects, &state);
+
+	/* Sampled BEFORE the final maybe_rect() call; keep this ordering.
+	 * NOTE(review): if maybe_rect() can itself add to 'leftovers', those
+	 * additions are not reflected in this flag - confirm that is intended. */
+	leftovers_seen = fz_is_valid_rect(state.leftovers);
+
+	/* Give the points of the last (possibly unclosed) subpath a chance. */
+	maybe_rect(ctx, &state);
+
+	if (fz_is_valid_rect(state.pending))
+	{
+		if (leftovers_seen)
+			add_vector(ctx, page, state.pending, state.flags | FZ_STEXT_VECTOR_IS_RECTANGLE | FZ_STEXT_VECTOR_CONTINUES, state.argb);
+		else
+			add_vector(ctx, page, state.pending, state.flags | FZ_STEXT_VECTOR_IS_RECTANGLE | 0, state.argb);
+	}
+
+	if (leftovers_seen)
+		add_vector(ctx, page, state.leftovers, state.flags, state.argb);
+}
+
+/* Device fill_path hook.
+ *
+ * Grows the bounds of any enclosing actualtext span to cover the filled
+ * path, then - depending on the device flags - checks the path for being a
+ * strikeout and/or records it as vector content on the page. */
+static void
+fz_stext_fill_path(fz_context *ctx, fz_device *dev, const fz_path *path, int even_odd, fz_matrix ctm, fz_colorspace *cs, const float *color, float alpha, fz_color_params cp)
+{
+	fz_stext_device *tdev = (fz_stext_device*)dev;
+	fz_stext_page *target = tdev->page;
+	/* No stroke state for a fill, hence NULL. */
+	fz_rect area = fz_bound_path(ctx, path, NULL, ctm);
+	fz_rect *at_bounds = actualtext_bounds(tdev);
+
+	/* Inside an actualtext: its bounds must include this content too. */
+	if (at_bounds)
+		*at_bounds = fz_union_rect(*at_bounds, area);
+
+	if ((tdev->flags & FZ_STEXT_COLLECT_STYLES) != 0)
+		check_for_strikeout(ctx, tdev, target, path, ctm);
+
+	if ((tdev->flags & FZ_STEXT_COLLECT_VECTORS) != 0)
+		add_vectors_from_path(ctx, target, path, ctm, cs, color, alpha, cp, 0);
+}
+
+/* Device stroke_path hook.
+ *
+ * Same processing as the fill hook, except that the path bounds take the
+ * stroke state into account and recorded vectors are marked as stroked. */
+static void
+fz_stext_stroke_path(fz_context *ctx, fz_device *dev, const fz_path *path, const fz_stroke_state *ss, fz_matrix ctm, fz_colorspace *cs, const float *color, float alpha, fz_color_params cp)
+{
+	fz_stext_device *tdev = (fz_stext_device*)dev;
+	fz_stext_page *target = tdev->page;
+	fz_rect area = fz_bound_path(ctx, path, ss, ctm);
+	fz_rect *at_bounds = actualtext_bounds(tdev);
+
+	/* Inside an actualtext: its bounds must include this content too. */
+	if (at_bounds)
+		*at_bounds = fz_union_rect(*at_bounds, area);
+
+	if ((tdev->flags & FZ_STEXT_COLLECT_STYLES) != 0)
+		check_for_strikeout(ctx, tdev, target, path, ctm);
+
+	/* Final argument 1 == stroked. */
+	if ((tdev->flags & FZ_STEXT_COLLECT_VECTORS) != 0)
+		add_vectors_from_path(ctx, target, path, ctm, cs, color, alpha, cp, 1);
+}
+
+/* Allocate a structure node from the page pool and hang it below 'block'.
+ *
+ * The node starts with no child blocks, records 'standard' and a copy of
+ * 'raw' (NULL is treated as the empty string), takes the page's current
+ * last_struct as its parent and 'block' as its 'up' link. On return
+ * block->u.s.down points at the new node. */
+static void
+new_stext_struct(fz_context *ctx, fz_stext_page *page, fz_stext_block *block, fz_structure standard, const char *raw)
+{
+	const char *name = (raw != NULL) ? raw : "";
+	size_t len = strlen(name);
+	fz_stext_struct *node;
+
+	/* Only 'len' extra bytes are requested although len+1 are copied below.
+	 * NOTE(review): presumably the struct's own raw[] member supplies the
+	 * byte for the terminator - confirm against its declaration. */
+	node = fz_pool_alloc(ctx, page->pool, sizeof(*node) + len);
+
+	node->standard = standard;
+	node->up = block;
+	node->parent = page->last_struct;
+	node->first_block = NULL;
+	node->last_block = NULL;
+	memcpy(node->raw, name, len + 1);
+
+	block->u.s.down = node;
+}
+
+/* Device begin_structure hook.
+ *
+ * Moves the page's insertion point down into the structure node with index
+ * 'idx' at the current level of the block tree, creating the
+ * FZ_STEXT_BLOCK_STRUCT block and its structure node if they do not exist.
+ *
+ * Struct blocks at one level are kept ordered by index: a new block is
+ * linked in before the first struct block with a larger index, or at the
+ * end of the list if there is none.
+ *
+ * 'raw' may be NULL, which is treated as the empty string. If a node with
+ * this index already exists but with a different 'standard'/'raw', a
+ * warning is issued and the existing node is entered anyway.
+ *
+ * On return page->last_struct is the entered node; page->last_block is that
+ * node's last child block (NULL for a freshly created node). */
+static void
+fz_stext_begin_structure(fz_context *ctx, fz_device *dev, fz_structure standard, const char *raw, int idx)
+{
+	fz_stext_device *tdev = (fz_stext_device*)dev;
+	fz_stext_page *page = tdev->page;
+	fz_stext_block *block, *le, *gt, *newblock;
+
+	if (raw == NULL)
+		raw = "";
+
+	/* Find a pointer to the last block. */
+	if (page->last_block)
+	{
+		block = page->last_block;
+	}
+	else if (page->last_struct)
+	{
+		block = page->last_struct->last_block;
+	}
+	else
+	{
+		block = page->first_block;
+	}
+
+	/* So block is somewhere in the content chain. Let's try and find:
+	 *   le = the struct node <= idx before block in the content chain.
+	 *   gt = the struct node > idx after block in the content chain.
+	 * Search backwards to start with.
+	 */
+	gt = NULL;
+	le = block;
+	while (le)
+	{
+		if (le->type == FZ_STEXT_BLOCK_STRUCT)
+		{
+			if (le->u.s.index > idx)
+				gt = le;
+			if (le->u.s.index <= idx)
+				break;
+		}
+		le = le->prev;
+	}
+	/* The following loop copes with finding gt (the smallest block with an index higher
+	 * than we want) if we haven't found it already. The while loop in here was designed
+	 * to cope with 'block' being in the middle of a list. In fact, the way the code is
+	 * currently, block will always be at the end of a list, so the while won't do anything.
+	 * But I'm loath to remove it in case we ever change this code to start from wherever
+	 * we did the last insertion. */
+	if (gt == NULL)
+	{
+		gt = block;
+		while (gt)
+		{
+			if (gt->type == FZ_STEXT_BLOCK_STRUCT)
+			{
+				if (gt->u.s.index <= idx)
+					le = gt;
+				if (gt->u.s.index >= idx)
+					break;
+			}
+			/* Track the last block visited, so that if we run off the
+			 * end, 'block' is the true tail of the list. */
+			block = gt;
+			gt = gt->next;
+		}
+	}
+
+	/* le is only ever left pointing at a struct block (or NULL), so
+	 * reading u.s here is safe. */
+	if (le && le->u.s.index == idx)
+	{
+		/* We want to move down into the le block. Does it have a struct
+		 * attached yet? */
+		if (le->u.s.down == NULL)
+		{
+			/* No. We need to create a new struct node. */
+			new_stext_struct(ctx, page, le, standard, raw);
+		}
+		else if (le->u.s.down->standard != standard || strcmp(raw, le->u.s.down->raw) != 0)
+		{
+			/* Yes, but it doesn't match the one we expect! */
+			fz_warn(ctx, "Mismatched structure type!");
+		}
+		page->last_struct = le->u.s.down;
+		page->last_block = le->u.s.down->last_block;
+
+		return;
+	}
+
+	/* We are going to need to create a new block. Create a complete unlinked one here. */
+	newblock = fz_pool_alloc(ctx, page->pool, sizeof *page->first_block);
+	newblock->bbox = fz_empty_rect;
+	newblock->prev = NULL;
+	newblock->next = NULL;
+	newblock->type = FZ_STEXT_BLOCK_STRUCT;
+	newblock->u.s.index = idx;
+	newblock->u.s.down = NULL;
+	/* If this throws, we leak newblock but it's within the pool, so it doesn't matter. */
+	new_stext_struct(ctx, page, newblock, standard, raw);
+
+	/* So now we just need to link it in somewhere. */
+	if (gt)
+	{
+		/* Link it in before gt. */
+		newblock->prev = gt->prev;
+		if (gt->prev)
+			gt->prev->next = newblock;
+		else if (page->last_struct)
+		{
+			/* gt was the first block under the current struct, so the
+			 * struct's head pointer must follow, or newblock would be
+			 * unreachable from it. */
+			page->last_struct->first_block = newblock;
+		}
+		else
+		{
+			/* gt was the first block on the page; same reasoning. */
+			page->first_block = newblock;
+		}
+		gt->prev = newblock;
+		newblock->next = gt;
+	}
+	else if (block)
+	{
+		/* Link it in at the end of the list (i.e. after 'block') */
+		newblock->prev = block;
+		block->next = newblock;
+		/* Keep the enclosing struct's tail pointer accurate, so that
+		 * returning to this level resumes after newblock rather than
+		 * at a stale tail. (At page level the tail is recomputed by
+		 * fz_stext_end_structure, so nothing is needed there.) */
+		if (page->last_struct)
+			page->last_struct->last_block = newblock;
+	}
+	else if (page->last_struct)
+	{
+		/* We have no blocks at all at this level. */
+		page->last_struct->first_block = newblock;
+		page->last_struct->last_block = newblock;
+	}
+	else
+	{
+		/* We have no blocks at ANY level. */
+		page->first_block = newblock;
+	}
+	/* Wherever we linked it in, that's where we want to continue adding content. */
+	page->last_struct = newblock->u.s.down;
+	page->last_block = NULL;
+}
+
+/* Device end_structure hook.
+ *
+ * Pops the current structure node: page->last_struct becomes its parent and
+ * page->last_block becomes the last block at the level we return to. If
+ * there is no current structure node a warning is issued and nothing
+ * changes. */
+static void
+fz_stext_end_structure(fz_context *ctx, fz_device *dev)
+{
+	fz_stext_device *tdev = (fz_stext_device*)dev;
+	fz_stext_page *page = tdev->page;
+	fz_stext_struct *str = page->last_struct;
+	fz_stext_block *tail;
+
+	if (str == NULL)
+	{
+		fz_warn(ctx, "Structure out of sync");
+		return;
+	}
+
+	page->last_struct = str->parent;
+	if (page->last_struct == NULL)
+	{
+		/* Back at page level. The tail is found by walking the list
+		 * from its head (yuck). The walk tolerates an empty list
+		 * rather than dereferencing a NULL head. */
+		tail = page->first_block;
+		while (tail != NULL && tail->next != NULL)
+			tail = tail->next;
+		page->last_block = tail;
+	}
+	else
+	{
+		/* Back inside a parent struct, which records its own tail. */
+		page->last_block = page->last_struct->last_block;
+	}
+}
+
+/* Create a structured-text device that collects content into 'page'.
+ *
+ * 'opts' may be NULL. When given, its flags are copied to the device and
+ * select the optional hooks: the structure hooks are installed for
+ * FZ_STEXT_COLLECT_STRUCTURE, and the path hooks for either of
+ * FZ_STEXT_COLLECT_VECTORS / FZ_STEXT_COLLECT_STYLES. Unless
+ * FZ_STEXT_PRESERVE_IMAGES is set, the FZ_DONT_DECODE_IMAGES hint is set
+ * on the device.
+ *
+ * Returns the new device, viewed as its fz_device base. */
+fz_device *
+fz_new_stext_device(fz_context *ctx, fz_stext_page *page, const fz_stext_options *opts)
+{
+	fz_stext_device *dev = fz_new_derived_device(ctx, fz_stext_device);
+
+	/* Lifetime hooks. */
+	dev->super.close_device = fz_stext_close_device;
+	dev->super.drop_device = fz_stext_drop_device;
+
+	/* Text hooks: always installed. */
+	dev->super.fill_text = fz_stext_fill_text;
+	dev->super.stroke_text = fz_stext_stroke_text;
+	dev->super.clip_text = fz_stext_clip_text;
+	dev->super.clip_stroke_text = fz_stext_clip_stroke_text;
+	dev->super.ignore_text = fz_stext_ignore_text;
+	dev->super.begin_metatext = fz_stext_begin_metatext;
+	dev->super.end_metatext = fz_stext_end_metatext;
+
+	/* Shade and image hooks: always installed. */
+	dev->super.fill_shade = fz_stext_fill_shade;
+	dev->super.fill_image = fz_stext_fill_image;
+	dev->super.fill_image_mask = fz_stext_fill_image_mask;
+
+	/* Text tracking state. */
+	dev->page = page;
+	dev->pen.x = 0;
+	dev->pen.y = 0;
+	dev->trm = fz_identity;
+	dev->lastchar = ' ';
+	dev->lasttext = NULL;
+	dev->lastbidi = 0;
+	dev->last_was_fake_bold = 1;
+
+	/* Rectangle list starts out empty. */
+	dev->rects = NULL;
+	dev->rect_len = 0;
+	dev->rect_max = 0;
+
+	/* Everything driven by the caller's options. Without options the
+	 * flags and the option copy are left as the allocator provided them. */
+	if (opts != NULL)
+	{
+		dev->opts = *opts;
+		dev->flags = opts->flags;
+
+		if ((opts->flags & FZ_STEXT_COLLECT_STRUCTURE) != 0)
+		{
+			dev->super.begin_structure = fz_stext_begin_structure;
+			dev->super.end_structure = fz_stext_end_structure;
+		}
+
+		if ((opts->flags & (FZ_STEXT_COLLECT_VECTORS | FZ_STEXT_COLLECT_STYLES)) != 0)
+		{
+			dev->super.fill_path = fz_stext_fill_path;
+			dev->super.stroke_path = fz_stext_stroke_path;
+		}
+	}
+
+	/* Images need not be decoded unless the caller wants them kept. */
+	if (!(dev->flags & FZ_STEXT_PRESERVE_IMAGES))
+		dev->super.hints |= FZ_DONT_DECODE_IMAGES;
+
+	return (fz_device*)dev;
+}