diff mupdf-source/source/fitz/deskew.c @ 2:b50eed0cc0ef upstream

ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4. The directory name has changed: no version number in the expanded directory now.
author Franz Glasner <fzglas.hg@dom66.de>
date Mon, 15 Sep 2025 11:43:07 +0200
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/mupdf-source/source/fitz/deskew.c	Mon Sep 15 11:43:07 2025 +0200
@@ -0,0 +1,1194 @@
+// Copyright (C) 2004-2025 Artifex Software, Inc.
+//
+// This file is part of MuPDF.
+//
+// MuPDF is free software: you can redistribute it and/or modify it under the
+// terms of the GNU Affero General Public License as published by the Free
+// Software Foundation, either version 3 of the License, or (at your option)
+// any later version.
+//
+// MuPDF is distributed in the hope that it will be useful, but WITHOUT ANY
+// WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+// FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
+// details.
+//
+// You should have received a copy of the GNU Affero General Public License
+// along with MuPDF. If not, see <https://www.gnu.org/licenses/agpl-3.0.en.html>
+//
+// Alternative licensing terms are available from the licensor.
+// For commercial licensing, see <https://www.artifex.com/> or contact
+// Artifex Software, Inc., 39 Mesa Street, Suite 108A, San Francisco,
+// CA 94129, USA, for further information.
+
+#include "mupdf/fitz.h"
+
+//#define DEBUG_DESKEWER
+
+//int errprintf_nomem(const char *string, ...);
+//#define printf errprintf_nomem
+
+/* We up the mallocs by a small amount to allow for SSE
+ * reading overruns. */
+#define SSE_SLOP 16
+
+/* Some notes on the theory.
+ *
+ * It is "well known" that a rotation (of between -90 and 90
+ * degrees at least) can be performed as a process of 3 shears,
+ * one on X, one on Y, one on X. For small angles (such as those
+ * found when deskewing scanned pages), we can do it in just 2;
+ * one on X, then one on Y.
+ *
+ * The standard rotation matrix is:
+ *
+ *  R = ( cos(t)   -sin(t) )
+ *       ( sin(t)    cos(t) )
+ *
+ * We can use the following decomposition (thanks to Michael
+ * Vrhel for pointing this out):
+ *
+ *     = ( 1        0        ) ( cos(t)  -sin(t) )
+ *       ( tan(t)   1/cos(t) ) ( 0       1       )
+ *
+ * (noting that tan(t) = sin(t)/cos(t), and sin(t).sin(t)+cos(t)cos(t) = 1)
+ *
+ *     = ( 1 0        ) ( 1       0 ) ( 1  -tan(t) ) ( cos(t) 0 )
+ *       ( 0 1/cos(t) ) ( sin(t)  1 ) ( 0  1       ) ( 0      1 )
+ *
+ *     = scale(Y)   .   sheer(Y)   .   sheer(X)   .   scale(X)
+ *
+ * The process we use to do sub-pixel capable shears allows us to
+ * incorporate 1 dimensional scales for free. If the expected user of
+ * this API is a scanner driver correcting for small errors in document
+ * alignment, let's consider that they may also want to expand/reduce
+ * at the same time.
+ *
+ * I'm not sure whether pre or post scales will be most useful, so
+ * let's do the maths allowing for both, as it'll end up as no
+ * more work.
+ *
+ * Our transformation pipeline can then be naively expressed as:
+ *
+ * (u') = (post_x 0     ) (1 0) (1 0) (1 c) (r 0) (pre_x 0    ) (u)
+ * (v')   (0      post_y) (0 R) (b 1) (0 1) (0 1) (0     pre_y) (v)
+ *
+ * where b = sin(t), c = -tan(t), R = 1/cos(t), r = cos(t).
+ *
+ * Rearranging gives us that:
+ *
+ * (u') = (post_x 0       ) (1 0) (1 c) (r.pre_x 0    ) (u)
+ * (v')   (0      post_y.R) (b 1) (0 1) (0       pre_y) (v)
+ *
+ * So, let X = post_x, Y = post_y.R, x = r.pre_x, y = pre_y and
+ * we have:
+ *
+ * (u') = (X 0) (1 0) (1 c) (x 0) (u)
+ * (v')   (0 Y) (b 1) (0 1) (0 y) (v)
+ *
+ * We need each step in the operation to be of the form:
+ *
+ *  (C D) for X and (1 0) for Y
+ *  (0 1)           (E F)
+ *
+ * for some constants C, D, E and F to work with our 1-d scale/shear
+ * mechanism. So, rearrange the pipeline:
+ *
+ *  P = (X 0) (1 0) (1 c) (x 0)
+ *      (0 Y) (b 1) (0 1) (0 y)
+ *
+ *    = (X  0) (x cy)
+ *      (bY Y) (0 y )
+ *
+ *    = (1    0) (X 0) (1 0) (x cy)
+ *      (bY/X Y) (0 1) (0 y) (0 1 )
+ *
+ *    = (1    0) (1 0) (X 0) (x cy)
+ *      (bY/X Y) (0 y) (0 1) (0 1 )
+ *
+ *    = (1    0 ) (xX cyX) = (xX   cyX      ) = (xX  cyX     )
+ *      (bY/X yY) (0  1  )   (bxY  bcyY + yY)   (bxY yY(1+bc))
+ *
+ * The first scale/shear involves us taking an input line of data,
+ * and scaling it into temporary storage. Every line of data supplied
+ * has the same scale done to it, but at a different X offset. We
+ * will therefore have to generate sets of weights for several different
+ * subpixel positions, and pick the appropriate one. 4 or 8 should
+ * be enough?
+ *
+ * For the second scale/shear, we don't run down individual scan-columns.
+ * Instead, we use a block of lines (the output from the first phase) and
+ * traverse through it diagonally, copying data out - effectively
+ * producing 1 pixel output for each of the 'scan columns'.
+ *
+ * This gives us a block of scanlines that we can apply the third
+ * shear to.
+ *
+ * Let us now consider the output position of each of the corners of
+ * our src_w * src_h input rectangle under this transformation. For
+ * simplicity of notation, let's call these just 'w' and 'h'.
+ *
+ * We know that by the properties of 2x2 matrices, (0, 0) maps to
+ * (0, 0) and (w, h) maps the the sum of where (w,0) (0,h) map to,
+ * so we only need calculate the image of 2 corners.
+ *
+ *    (1    0 ) (xX cyX) (w 0)
+ *    (bY/X Yy) (0  1  ) (0 h)
+ *
+ *  = (1    0 ) (wxX chyX)
+ *    (bY/X Yy) (0   h   )
+ *
+ *  = (wxX      chyX       )
+ *    (bwxY     cbhYy + hYy)
+ *
+ *  = (C E)
+ *    (D F)
+ *
+ * where C = wxX
+ *       D = bwxY
+ *       E = chyX
+ *       F = cbhYy + hYy = hYy (1 + cb)
+ *
+ * Some ASCII art to illustrate the images after the different
+ * stages:
+ *
+ * For angles 0..90:
+ *
+ *   *--x => *----x   =>    __.x
+ *   |  |     \    \      *'   \
+ *   o--+      o----+      \  __.+
+ *                          o'
+ *
+ * For angles 0 to -90:
+ *
+ *   *--x =>   *----x =>   *.__
+ *   |  |     /    /      /    'x
+ *   o--+    o----+      o.__  /
+ *                           '+
+ *
+ * How wide does temporary buffer 1 (where the results of the first
+ * scale/shear are stored) need to be?
+ *
+ * From the above diagrams we can see that for angles 0..90 it needs
+ * to be wide enough to stretch horizontally from * to +. i.e. wxX + chyX
+ *
+ * Similarly for angles 0..-90, it needs to be wide enough to stretch
+ * horizontally from o to x. i.e. wxX - chyX
+ *
+ * Given the properties of tan, we can see that it's wxX + |c|hyX in
+ * all cases.
+ *
+ *
+ * How tall is the image after the first scale/shear?
+ *
+ * Well, the height doesn't change, so src_h.
+ *
+ *
+ * How tall does temporary buffer 1 (where the results of the first
+ * scale/shear are stored) need to be?
+ *
+ * Consider the second stage. One of the scanlines produced will be
+ * made by reading in a line of pixels from (0, 0) to (wxX+|c|hyX, i)
+ * and outputting a horizontal line of pixels in the result. What is
+ * the value i?
+ *
+ * We know:
+ *
+ *  (1    0 ) (wxX+|c|hyX) = (wx+|c|hy)
+ *  (bY/X Yy) (i         )   (0       )
+ *
+ *  So: bY/X*(wxX+|c|hyX)) + iYy = 0
+ *      bY  *(wx +|c|hy )) + iYy = 0
+ *
+ *      i = -b*(wx+|c|hy)/y     (for non-zero Y and y)
+ *        = -b*(tmp1_width)/Xy  (for non-zero X, Y and y)
+ *
+ * (Sanity check: when t=0, b = 0, no lines needed. As the x
+ * scale increases, we need more lines. Seems sane.)
+ *
+ *
+ * How wide is the image after the second scale/shear?
+ *
+ * Well, the shear on Y doesn't change the width, so it's still
+ * wxX + |c|hyX.
+ *
+ *
+ * How tall is the entire image after second scale/shear?
+ *
+ * For angles 0 to 90, it's the difference in Y between o and x.
+ * i.e. cbhYy + hYy - bwxY
+ *
+ * For angles 0 to -90, it's the difference in Y between + and *.
+ * i.e. cbhYy + hYy + bwxY
+ *
+ * So cbhYy + hYy + |b|wxY in all cases.
+ *  = Y*((1+cb)*hy + |b|wx)
+ *
+ *
+ * What is the size of the final image?
+ *
+ *  W = wxX + |c|hyX
+ *  H = cbhYy + hYy + bwxY
+ *
+ */
+
+
+/*
+Border handling:
+
+We offer 3 styles of border handling, options 0,1,2.
+
+0: Increase the size of the output image from that of the input image. No pixels are dropped, and new border pixels are generated with the given backdrop color.
+This is the default, and the safest way to deskew as no pixels are lost.
+
+1: Keep the size of the page content constant (modulo pre and post scales). Some pixels are dropped, and new border pixels are generated with the given backdrop color.
+
+2: Decrease the size of the output image from that of the input image. Pixels are dropped. No new border pixels should be generated. If the centre of the input is
+correct, then this should exactly return the correct unskewed original page.
+
+*/
+
+#define WEIGHT_SHIFT 12
+#define WEIGHT_SCALE (1<<WEIGHT_SHIFT)
+#define WEIGHT_ROUND (1<<(WEIGHT_SHIFT-1))
+
+/* Auxiliary structures. */
+typedef struct
+{
+	uint32_t index;		/* index of first element in list of */
+				/* contributors */
+	uint16_t n;		/* number of contributors */
+				/* (not multiplied by stride) */
+	uint8_t  slow;		/* Flag */
+	int32_t  first_pixel;	/* offset of first value in source data */
+	int32_t  last_pixel;	/* last pixel number */
+} index_t;
+
+#if ARCH_HAS_NEON
+typedef int16_t weight_t;
+#else
+typedef int32_t weight_t;
+#endif
+
+typedef void (zoom_y_fn)(uint8_t        *            dst,
+			const uint8_t  * __restrict tmp,
+			const index_t  * __restrict index,
+			const weight_t * __restrict weights,
+			uint32_t                    width,
+			uint32_t                    channels,
+			uint32_t                    mod,
+			int32_t                     y);
+typedef void (zoom_x_fn)(uint8_t        * __restrict tmp,
+			const uint8_t  * __restrict src,
+			const index_t  * __restrict index,
+			const weight_t * __restrict weights,
+			uint32_t                    dst_w,
+			uint32_t                    src_w,
+			uint32_t                    channels,
+			const uint8_t  * __restrict bg);
+
+#define CLAMP(v, mn, mx)\
+	(v < mn ? mn : v > mx ? mx : v)
+
+/* Include the cores */
+#include "deskew_c.h"
+
+#if ARCH_HAS_SSE
+#include "deskew_sse.h"
+#endif
+#if ARCH_HAS_NEON
+#include "deskew_neon.h"
+#endif
+
+enum {
+	SUB_PIX_X = 4,
+	SUB_PIX_Y = 4,
+};
+
+typedef struct
+{
+	uint32_t      channels;
+	uint32_t      tmp_w;
+	uint32_t      tmp_h;
+	uint32_t      src_w;
+	uint32_t      src_h;
+	uint32_t      dst_w;
+	uint32_t      dst_h;
+	uint32_t      dst_w_borderless;
+	uint32_t      dst_h_borderless;
+	uint32_t      skip_l;
+	uint32_t      skip_t;
+	index_t      *index_x[SUB_PIX_X];
+	weight_t     *weights_x[SUB_PIX_X];
+	index_t      *index_y[SUB_PIX_Y];
+	weight_t     *weights_y[SUB_PIX_Y];
+	uint32_t      max_y_weights[SUB_PIX_Y];
+	uint8_t       bg[FZ_MAX_COLORS];
+
+	double        pre_x_scale;
+	double        pre_y_scale;
+	double        post_x_scale;
+	double        post_y_scale;
+	double        beta;
+	double        gamma;
+	double        chyX;
+	double        extent1;
+	double        w1;
+	double        h2;
+	double        diagonal_h;
+	double        t_degrees;
+
+	uint32_t      pre_fill_y;
+	double        start_y;
+
+	zoom_x_fn    *zoom_x;
+	zoom_y_fn    *zoom_y;
+} fz_deskewer;
+
+static void
+fill_bg(uint8_t *dst, const uint8_t *fill, uint32_t chan, uint32_t len)
+{
+	while (len--)
+	{
+		memcpy(dst, fill, chan);
+		dst += chan;
+	}
+}
+
+#define B (1.0f / 3.0f)
+#define C (1.0f / 3.0f)
+static double
+Mitchell_filter(double t)
+{
+	double t2 = t * t;
+
+	if (t < 0)
+		t = -t;
+
+	if (t < 1)
+		return ((12 - 9 * B - 6 * C) * (t * t2) +
+			(-18 + 12 * B + 6 * C) * t2 +
+			(6 - 2 * B)) / 6;
+	else if (t < 2)
+		return ((-1 * B - 6 * C) * (t * t2) +
+			(6 * B + 30 * C) * t2 +
+			(-12 * B - 48 * C) * t +
+			(8 * B + 24 * C)) / 6;
+	else
+		return 0;
+}
+
+#define FILTER_WIDTH 2
+
+static void
+make_x_weights(fz_context *ctx,
+		index_t **indexp,
+		weight_t **weightsp,
+		uint32_t src_w,
+		uint32_t dst_w,
+		double factor,
+		uint32_t offset_f,
+		uint32_t offset_n,
+		int sse_slow)
+{
+	double squeeze;
+	index_t  *index;
+	weight_t *weights;
+	uint32_t i;
+	double offset = ((double)offset_f)/offset_n;
+	uint32_t idx;
+	uint32_t max_weights;
+
+	if (factor <= 1)
+	{
+		squeeze = 1;
+		max_weights = 1 + FILTER_WIDTH * 2;
+	}
+	else
+	{
+		squeeze = factor;
+		max_weights = (uint32_t)ceil(1 + squeeze * FILTER_WIDTH * 2);
+		if (max_weights > 10)
+		{
+			max_weights = 10;
+			squeeze = ((double)max_weights) / (FILTER_WIDTH * 2);
+		}
+	}
+
+	max_weights = (max_weights + 3) & ~3;
+
+	weights = (weight_t *)fz_malloc_aligned(ctx, sizeof(*weights) * max_weights * dst_w + SSE_SLOP, sizeof(weight_t) * 4);
+	memset(weights, 0, sizeof(*weights) * max_weights * dst_w);
+	fz_try(ctx)
+		index = (index_t *)fz_malloc(ctx, sizeof(*index) * dst_w + SSE_SLOP);
+	fz_catch(ctx)
+	{
+		fz_free_aligned(ctx, weights);
+		fz_rethrow(ctx);
+	}
+	*indexp   = index;
+	*weightsp = weights;
+
+	idx = 0;
+	for (i = 0; i < dst_w; i++)
+	{
+		/* i is in 0..w (i.e. dst space).
+		 * centre, left, right are in 0..src_w (i.e. src_space)
+		 */
+		double centre = (i+0.5f)*factor - offset;
+		int32_t left = (int32_t)ceil(centre - squeeze*FILTER_WIDTH);
+		int32_t right = (int32_t)floor(centre + squeeze*FILTER_WIDTH);
+		int32_t j, k;
+
+		if ((centre - left) >= squeeze * FILTER_WIDTH)
+			left++;
+		if ((right - centre) >= squeeze * FILTER_WIDTH)
+			right--;
+
+		/* When we're calculating the second set of X weights, the subpixel adjustment can cause us to
+		 * read too far to the right. Adjust for this hackily here. */
+		if (left > (int32_t)src_w) {
+			right -= left - src_w;
+			centre -= left - src_w;
+			left = src_w;
+		}
+
+		assert(right-left+1 <= (int)max_weights && right >= 0 && left <= (int32_t)src_w);
+		index->index = idx;
+		j = left;
+		if (j < 0)
+		{
+			left = -1;
+			weights[idx] = 0;
+			for (; j < 0; j++)
+			{
+				double f = (centre - j) / squeeze;
+				weights[idx] += (int32_t)(Mitchell_filter(f) * WEIGHT_SCALE / squeeze);
+			}
+			idx++;
+		}
+		k = right;
+		if (k > (int32_t)src_w)
+			k = (int32_t)src_w;
+		for (; j <= k; j++)
+		{
+			double f = (centre - j) / squeeze;
+			weights[idx++] = (int32_t)(Mitchell_filter(f) * WEIGHT_SCALE / squeeze);
+		}
+		for (; j < right; j++)
+		{
+			double f = (centre - j) / squeeze;
+			weights[idx-1] += (int32_t)(Mitchell_filter(f) * WEIGHT_SCALE / squeeze);
+		}
+		index->first_pixel = left;
+		index->last_pixel  = k;
+		index->n           = k-left+1;
+		index->slow        = left < 0 || k >= (int32_t)src_w;
+		if (left + sse_slow > (int)src_w)
+			index->slow = 1;
+		idx = (idx + 3) & ~3;
+		index++;
+	}
+}
+
+/* The calculations here are different.
+ * We move from offset...offset+h1 in w steps.
+ * At each point, we calculate the weights vertically
+ * with a scale factor of dst_h/src_h.
+ */
+static void
+make_y_weights(fz_context *ctx,
+		index_t **indexp,
+		weight_t **weightsp,
+		uint32_t *max_weightsp,
+		uint32_t dst_w,
+		double factor,
+		double factor2,
+		uint32_t offset_f,
+		uint32_t offset_n,
+		uint32_t h)
+{
+	double squeeze;
+	index_t  *index;
+	weight_t *weights;
+	uint32_t i;
+	double   offset = ((double)offset_f)/offset_n;
+	uint32_t idx;
+	uint32_t max_weights;
+
+	if (factor >= 1)
+	{
+		squeeze = 1;
+		max_weights = 1 + FILTER_WIDTH * 2;
+	}
+	else
+	{
+		squeeze = 1/factor;
+		max_weights = (uint32_t)ceil(squeeze * FILTER_WIDTH * 2);
+		if (max_weights > 10)
+		{
+			max_weights = 10;
+			squeeze = ((double)max_weights) / (FILTER_WIDTH * 2);
+		}
+	}
+
+	max_weights = (max_weights + 3) & ~3;
+
+	/* Ensure that we never try to access before 0 */
+	offset += (double)FILTER_WIDTH/squeeze;
+
+	weights = (weight_t *)fz_malloc_aligned(ctx, sizeof(*weights) * max_weights * dst_w, sizeof(weight_t) * 4);
+	fz_try(ctx)
+		index = (index_t *)fz_malloc(ctx, sizeof(*index) * dst_w);
+	fz_catch(ctx)
+	{
+		fz_free_aligned(ctx, weights);
+		fz_rethrow(ctx);
+	}
+	*indexp       = index;
+	*weightsp     = weights;
+	*max_weightsp = max_weights;
+
+	if (factor2 < 0)
+		offset -= (dst_w-1) * factor2;
+
+	idx = 0;
+	for (i = 0; i < dst_w; i++)
+	{
+		/* i is in 0..dst_w (i.e. dst space).
+		 * centre, left, right are in 0..src_h (i.e. src_space)
+		 */
+		double centre = (i+0.5f)*factor2 + offset;
+		int32_t left = (int32_t)ceil(centre - squeeze*FILTER_WIDTH);
+		int32_t right = (int32_t)floor(centre + squeeze*FILTER_WIDTH);
+		int32_t j;
+
+		if ((centre - left) >= squeeze * FILTER_WIDTH)
+			left++;
+		if ((right - centre) >= squeeze * FILTER_WIDTH)
+			right--;
+
+		assert(right-left+1 <= (int)max_weights);
+		index->index       = idx;
+		for (j = left; j <= right; j++)
+		{
+			double f = (centre - j) / squeeze;
+			weights[idx++] = (int32_t)(Mitchell_filter(f) * WEIGHT_SCALE / squeeze);
+		}
+		index->last_pixel  = right;
+		index->n           = right-left+1;
+		if (left < 0)
+			left += h;
+		index->first_pixel = left;
+		index->slow        = 0;
+		index++;
+		idx = (idx + 3) & ~3;
+	}
+}
+
+#ifdef DEBUG_DESKEWER
+static void
+dump_weights(const index_t *index,
+		const weight_t *weights,
+		uint32_t w,
+		const char *str)
+{
+	uint32_t i;
+
+	printf("%s weights:\n", str);
+	for (i = 0; i < w; i++)
+	{
+		uint32_t j;
+		int32_t  sum = 0;
+		uint32_t n = index[i].n;
+		uint32_t idx = index[i].index;
+		printf(" %d: %d->%d:", i, index[i].first_pixel, index[i].last_pixel);
+		for (j = 0; j < n; j++)
+		{
+			sum += weights[idx];
+			printf(" %x", weights[idx++]);
+		}
+		printf(" (%x)\n", sum);
+	}
+}
+#endif
+
+static void
+rejig_for_zoom_y1(fz_context *ctx, fz_deskewer *deskewer)
+{
+	int i, k;
+	uint32_t j, z;
+
+	for (i = 0; i < SUB_PIX_Y; i++)
+	{
+		uint32_t  num_w = deskewer->dst_w;
+		weight_t *new_weights = fz_malloc_aligned(ctx, num_w *  deskewer->max_y_weights[i] *
+							sizeof(weight_t) * 4, sizeof(weight_t) * 4);
+		index_t  *index;
+		weight_t *weights;
+
+		index = deskewer->index_y[i];
+		weights = deskewer->weights_y[i];
+		for (j = 0; j < num_w; j++)
+		{
+			weight_t *neww = new_weights + (index->index * 4);
+			uint32_t n = index[0].n;
+			for (z = 0; z < n; z++)
+				neww[4 * z] = weights[index->index + z];
+			for (k = 1; k < 4 && j + k < num_w; k++)
+			{
+				if (index[0].n != index[k].n)
+					break;
+				if (index[0].first_pixel != index[k].first_pixel)
+					break;
+				for (z = 0; z < n; z++)
+					neww[k + 4 * z] = weights[index[k].index + z];
+			}
+			/* So, we can do j to j+k-1 (inclusive) all in one hit. */
+			index->slow = k;
+			index++;
+		}
+		fz_free_aligned(ctx, deskewer->weights_y[i]);
+		deskewer->weights_y[i] = new_weights;
+	}
+}
+
+static void
+fz_drop_deskewer(fz_context *ctx, fz_deskewer *deskewer)
+{
+	int i;
+
+	if (!deskewer)
+		return;
+
+	for (i = 0; i < SUB_PIX_X; i++)
+	{
+		fz_free(ctx, deskewer->index_x[i]);
+		fz_free_aligned(ctx, deskewer->weights_x[i]);
+	}
+	for (i = 0; i < SUB_PIX_Y; i++)
+	{
+		fz_free(ctx, deskewer->index_y[i]);
+		fz_free_aligned(ctx, deskewer->weights_y[i]);
+	}
+
+	fz_free(ctx, deskewer);
+}
+
+static fz_deskewer *
+fz_new_deskewer(fz_context *ctx,
+		unsigned int src_w, unsigned int src_h,
+		unsigned int *dst_w, unsigned int *dst_h,
+		double t_degrees, int border,
+		double pre_x_scale, double pre_y_scale,
+		double post_x_scale, double post_y_scale,
+		unsigned char *bg,
+		unsigned int channels)
+{
+	fz_deskewer *deskewer;
+	double w1, h2, extent1, beta, gamma, chyX, one_plus_cb;
+	double diagonal_h, h2_div_Y;
+	int i;
+
+#define SIMD_SWITCH_CONDITION 1
+
+#if ARCH_HAS_SSE
+#define SIMD_SWITCH(A,B,C) (SIMD_SWITCH_CONDITION ? (A) : (C))
+#elif ARCH_HAS_NEON
+#define SIMD_SWITCH(A,B,C) (SIMD_SWITCH_CONDITION ? (B) : (C))
+#else
+#define SIMD_SWITCH(A,B,C) (C)
+#endif
+
+	deskewer = fz_malloc_struct(ctx, fz_deskewer);
+
+	deskewer->channels    = channels;
+
+	/* Rotation coefficients */
+	gamma = -tan(t_degrees * M_PI / 180);
+	beta  = sin(t_degrees * M_PI / 180);
+
+	/* After the first shear/scale, the image will be w1 x src_h in size. */
+	chyX = gamma * src_h * pre_y_scale * post_x_scale;
+	extent1 = pre_x_scale * post_x_scale * src_w;
+	w1 = extent1 + fabs(chyX);
+	diagonal_h = fabs(beta) * w1 / pre_y_scale / post_x_scale;
+
+	/* After the second shear/scale, the image will be w1 x h2 in size. */
+
+	/* Calculate the size of the destination buffer */
+	one_plus_cb = 1 + gamma * beta;
+	h2_div_Y = src_h * pre_y_scale * one_plus_cb + fabs(src_w * pre_x_scale * beta);
+	h2 = post_y_scale * h2_div_Y;
+
+	deskewer->tmp_h      = (uint32_t)ceil(diagonal_h) + FILTER_WIDTH*2 + 1;
+	if (deskewer->tmp_h == 0) /* Allow for the zero degree case */
+		deskewer->tmp_h = 1;
+	deskewer->tmp_w      = (uint32_t)ceil(extent1)+1; /* +1 allows for subpixel positioning */
+	deskewer->src_w       = src_w;
+	deskewer->src_h       = src_h;
+	deskewer->dst_w       = (uint32_t)ceil(w1);
+	deskewer->dst_h       = (uint32_t)ceil(h2);
+	deskewer->pre_x_scale = pre_x_scale;
+	deskewer->pre_y_scale = pre_y_scale;
+	deskewer->post_x_scale= post_x_scale;
+	deskewer->post_y_scale= post_y_scale;
+	deskewer->beta        = beta;
+	deskewer->gamma       = gamma;
+	deskewer->chyX        = chyX;
+	deskewer->extent1     = extent1;
+	deskewer->w1          = w1;
+	deskewer->diagonal_h  = diagonal_h;
+	deskewer->t_degrees   = t_degrees;
+	deskewer->h2          = h2;
+	deskewer->skip_l      = 0;
+	deskewer->skip_t      = 0;
+	switch(border)
+	{
+	case 2:
+	{
+		int w = (int)(src_w * pre_x_scale * post_x_scale + 0.5);
+		int h = (int)(src_h * pre_y_scale * post_y_scale + 0.5);
+		deskewer->skip_l = (deskewer->dst_w - w);
+		deskewer->skip_t = (deskewer->dst_h - h);
+		*dst_w = w - deskewer->skip_l;
+		*dst_h = h - deskewer->skip_t;
+		break;
+	}
+	case 1:
+	{
+		int expected_w = (int)(src_w * pre_x_scale * post_x_scale + 0.5);
+		int expected_h = (int)(src_h * pre_y_scale * post_y_scale + 0.5);
+		deskewer->skip_l = (deskewer->dst_w - expected_w + 1) >> 1;
+		deskewer->skip_t = (deskewer->dst_h - expected_h + 1) >> 1;
+		*dst_w = expected_w;
+		*dst_h = expected_h;
+		break;
+	}
+	default:
+	case 0:
+		*dst_w = deskewer->dst_w;
+		*dst_h = deskewer->dst_h;
+		break;
+	}
+	deskewer->dst_w_borderless = *dst_w;
+	deskewer->dst_h_borderless = *dst_h;
+
+	fz_try(ctx)
+	{
+		memcpy(deskewer->bg, bg, channels);
+
+		/*
+		 * Once we have scaled/sheared lines into the tmp1, we have
+		 * data such as:
+		 *
+		 *    -ve angles     +ve angles
+		 *      +------+  or +-----+
+		 *     /      /       \     \
+		 *    /      /         \     \
+		 *   /      /           \     \
+		 *  /      /             \     \
+		 * +-----+                +-----+
+		 *
+		 * We then need to copy data out into the output. This is done by
+		 * reading a series of parallel diagonal lines out. The first
+		 * one is as shown here:
+		 *
+		 *            _.   or    ._            ----
+		 *        _.-'             '-._              } pre fill region
+		 *     _.+-----+         +-----+._     ----
+		 *  .-' /     /           \     \ '-.  ____  } lines of data required before we can start
+		 *     /     /             \     \
+		 *    /     /               \     \
+		 *   /     /                 \     \
+		 *  +-----+                   +-----+
+		 *
+		 * We can see that we need to fill the temporary buffer with some
+		 * empty lines to start with.
+		 *
+		 * The total Y extent of the diagonal line has been calculated as
+		 * diagonal_h already. The length of the horizontal edge of the data
+		 * region is extent1, and the remainder of the width is |c|hyX.
+		 *
+		 * Thus we need diagonal_h * extent1 / (extent1 + |c|hyX) in the pre
+		 * fill region. We can generate our first line out once we have
+		 * h1 lines in.
+		 */
+
+		deskewer->pre_fill_y = ((uint32_t)ceil(diagonal_h*extent1/w1) + FILTER_WIDTH*2);
+		deskewer->start_y = diagonal_h - deskewer->pre_fill_y;
+
+		/* Each line we scale into the destination starts at a different
+		 * X position. We hold that as x2.
+		 *
+		 * We calculated earlier where the corners of our source rectangle
+		 * were mapped to to give us the smallest dst_w x dst_h.
+		 *   dst_w = X * (wx + hy|c|)
+		 *   dst_h = Y * (hy(1+cb) + |b|wx)
+		 */
+
+		for (i = 0; i < SUB_PIX_X; i++)
+		{
+			make_x_weights(ctx,
+					&deskewer->index_x[i],
+					&deskewer->weights_x[i],
+					src_w,
+					deskewer->tmp_w,
+					(double)src_w / extent1,
+					i, SUB_PIX_X,
+					(16+channels-1)/channels);
+#ifdef DEBUG_DESKEWER
+			{
+				char text[16];
+				sprintf(text, "X[%d]", i);
+				dump_weights(deskewer->index_x[i],
+						deskewer->weights_x[i],
+						deskewer->tmp_w,
+						text);
+			}
+#endif
+		}
+
+		for (i = 0; i < SUB_PIX_Y; i++)
+		{
+			make_y_weights(ctx,
+				&deskewer->index_y[i],
+				&deskewer->weights_y[i],
+				&deskewer->max_y_weights[i],
+				deskewer->dst_w,
+				post_y_scale * pre_y_scale,
+				(t_degrees >= 0 ? diagonal_h : -diagonal_h) / w1,
+				i, SUB_PIX_Y,
+				deskewer->tmp_h);
+#ifdef DEBUG_DESKEWER
+			{
+				char text[16];
+				sprintf(text, "Y[%d]", i);
+				dump_weights(deskewer->index_y[i],
+						deskewer->weights_y[i],
+						deskewer->dst_w,
+						text);
+			}
+#endif
+		}
+
+		switch (channels)
+		{
+		case 1:
+			deskewer->zoom_x = SIMD_SWITCH(zoom_x1_sse, zoom_x1_neon, zoom_x1);
+			deskewer->zoom_y = SIMD_SWITCH(zoom_y1_sse, zoom_y1_neon, zoom_y1);
+			break;
+		case 3:
+			deskewer->zoom_x = SIMD_SWITCH(zoom_x3_sse, zoom_x3_neon, zoom_x3);
+			deskewer->zoom_y = SIMD_SWITCH(zoom_y3_sse, zoom_y3_neon, zoom_y3);
+			break;
+		case 4:
+			deskewer->zoom_x = SIMD_SWITCH(zoom_x4_sse, zoom_x4_neon, zoom_x4);
+			deskewer->zoom_y = SIMD_SWITCH(zoom_y4_sse, zoom_y4_neon, zoom_y4);
+			break;
+		default:
+			deskewer->zoom_x = zoom_x;
+			deskewer->zoom_y = zoom_y;
+			break;
+		}
+
+		if (channels == 1)
+			rejig_for_zoom_y1(ctx, deskewer);
+	}
+	fz_catch(ctx)
+	{
+		fz_drop_deskewer(ctx, deskewer);
+		fz_rethrow(ctx);
+	}
+
+	return deskewer;
+}
+
+/*
+  Our overall forward transform is:
+
+  (u') = (1    0 ) (xX cyX) (u)
+  (v')   (bY/X yY) (0  1  ) (v)
+
+  (right hand one is the X shear, left hand one is the Y shear).
+
+  (u') = (xX   cyX     ) (u)
+  (v')   (bxY  yY(1+bc)) (v)
+
+  So, inverse...
+
+  (u) = 1/det . (yY(1+bc)  -cyX) (u')
+  (v)           (-bxY       xX ) (v')
+
+  where det = (xyXY.(1+bc) - cyX.bxY) = xyXY(1+bc - bc) = xyXY
+
+  So inverse...
+
+   (u) = ((1+bc)/xX   -c/xY) (u')
+   (v)   (-b/yX        1/yY) (v')
+
+  Sanity check:
+
+   ((1+bc)/xX  -c/xY) (xX   cyX     ) = ((1+bc) - bc     cy(1+bc)/x - cy(1+bc)/x) = (1 0)
+   (-b/yX       1/yY) (bxY  yY(1+bc))   (-bx/y + bx/y    -bc + (1+bc)           )   (0 1)
+*/
+
+typedef struct fz_deskewer_bander_s {
+	fz_deskewer  *deskewer;
+	double        diag_y;
+	double        diag_dy;
+	uint32_t      in_y;
+	uint32_t      out_y;
+	uint32_t      tmp_width;
+	uint8_t      *tmp;
+	uint32_t      tmp_size;
+	double        tmp_x;
+	double        tmp_dx;
+	unsigned int  dst_x0;
+	unsigned int  dst_x1;
+	unsigned int  dst_y0;
+} fz_deskewer_bander;
+
+static fz_deskewer_bander *
+fz_new_deskewer_band(fz_context *ctx,
+			fz_deskewer *deskewer,
+			unsigned int src_y0,
+			unsigned int dst_y0)
+{
+	fz_deskewer_bander *bander = fz_malloc_struct(ctx, fz_deskewer_bander);
+	bander->deskewer = deskewer;
+
+	bander->tmp_width = deskewer->dst_w_borderless;
+	bander->tmp_size = bander->tmp_width * deskewer->channels * deskewer->tmp_h;
+	bander->dst_x0 = deskewer->skip_l;
+	bander->dst_x1 = deskewer->dst_w_borderless + deskewer->skip_l;
+	bander->dst_y0 = dst_y0;
+
+	fz_try(ctx)
+		bander->tmp = (uint8_t *)fz_malloc(ctx, bander->tmp_size + SSE_SLOP);
+	fz_catch(ctx)
+	{
+		fz_free(ctx, bander);
+		fz_rethrow(ctx);
+	}
+
+	bander->dst_y0 += deskewer->skip_t;
+
+	/* Each line we scale into tmp starts at a different x position.
+	 * We hold that as x. Over src_h lines, we want to move from
+	 * 0 to chy (or chy to 0). */
+	bander->tmp_x = (deskewer->chyX >= 0 ? deskewer->chyX : 0);
+	bander->tmp_dx = -deskewer->chyX/deskewer->src_h;
+	/* Do a half step */
+	bander->tmp_x += bander->tmp_dx/2;
+	bander->tmp_x += src_y0 * bander->tmp_dx;
+
+	bander->diag_y = FILTER_WIDTH-(double)deskewer->pre_fill_y;
+	bander->diag_dy = (deskewer->src_h + deskewer->diagonal_h * (2 * deskewer->extent1 / deskewer->w1 - 1)) / deskewer->h2;
+	/* Do a half step on y */
+	bander->diag_y += bander->diag_dy/2;
+	bander->diag_y += bander->dst_y0 * bander->diag_dy;
+
+	/* diag_y = Where we start to read the diagonal line from */
+	/* in_y = All lines < this have been written into tmp. */
+	/* out_y = All lines smaller than this have been written out */
+
+	/* The first source line we have is src_y0. After the X skew this will still be src_y0.
+	 * The Y skew will mean that this line extends across a range of output scanlines.
+	 * Find the range of scanlines that we touch. */
+	bander->in_y = src_y0;
+	/* We need to fill the lines up to and including tmp_y with the background color. */
+	{
+		int first_y = (bander->in_y - deskewer->pre_fill_y + deskewer->tmp_h) % deskewer->tmp_h;
+		int count = deskewer->pre_fill_y;
+		if (first_y + count > (int)deskewer->tmp_h)
+		{
+			int s = deskewer->tmp_h - first_y;
+			fill_bg(bander->tmp + first_y * bander->tmp_width * deskewer->channels, deskewer->bg, deskewer->channels, bander->tmp_width * s);
+			first_y = 0;
+			count -= s;
+		}
+		fill_bg(bander->tmp + first_y * bander->tmp_width * deskewer->channels, deskewer->bg, deskewer->channels, bander->tmp_width * count);
+	}
+
+	bander->out_y = dst_y0;
+
+	return bander;
+}
+
+static int
+fz_deskewer_band_pull(fz_deskewer_bander *bander,
+			unsigned char *dst)
+{
+	/* If we have enough data to make a new y2 line, do so. */
+	double y = bander->diag_y + 1.0/(2 * SUB_PIX_Y);
+	int iy = (int)floor(y);
+	int which_y = (int)floor((y - iy) * SUB_PIX_Y);
+	fz_deskewer *deskewer = bander->deskewer;
+
+	/* Ensure we have enough input lines to generate an output one. */
+	while (1)
+	{
+		/* Which source line do we need to have to generate the next destination line? */
+		int need_y = deskewer->index_y[which_y][deskewer->t_degrees >= 0 ? bander->dst_x1 - 1 : bander->dst_x0].last_pixel + iy;
+
+		/* Have we got enough lines? */
+		if ((int)bander->in_y >= need_y)
+			break; /* Yes, break out of the loop to process it. */
+
+		/* No. Can we ask the caller for one? */
+		if (bander->in_y < deskewer->src_h)
+			/* We are still in the range where we can ask for one. */
+			return 0;
+
+		/* Fill in a line with background pixels. */
+		fill_bg(&bander->tmp[bander->tmp_width * deskewer->channels * (bander->in_y % deskewer->tmp_h)],
+			deskewer->bg, deskewer->channels, bander->tmp_width);
+		bander->in_y++;
+	}
+
+	deskewer->zoom_y(dst,
+			bander->tmp,
+			&deskewer->index_y[which_y][bander->dst_x0],
+			deskewer->weights_y[which_y],
+			bander->tmp_width,
+			deskewer->channels,
+			bander->tmp_size,
+			(iy + deskewer->tmp_h) % deskewer->tmp_h);
+	bander->out_y++;
+	bander->diag_y += bander->diag_dy;
+
+	return 1;
+}
+
+static void
+fz_deskewer_band_push(fz_deskewer_bander *bander,
+			const unsigned char *src)
+{
+	fz_deskewer *deskewer = bander->deskewer;
+	uint8_t *line = &bander->tmp[bander->tmp_width * deskewer->channels * (bander->in_y % deskewer->tmp_h)];
+	double x = bander->tmp_x + 1.0 / (SUB_PIX_X * 2);
+	int xi = (int)floor(x);
+	int which = (int)floor((x - xi) * SUB_PIX_X);
+
+	{
+		int r = (int)bander->dst_x1;
+		if (r > xi)
+			r = xi;
+		if ((int)bander->dst_x0 < r)
+			fill_bg(line,
+				deskewer->bg,
+				deskewer->channels,
+				r - (int)bander->dst_x0);
+	}
+	{
+		int l = xi + deskewer->tmp_w - 1; /* Undo the +1 earlier */
+		if (l < (int)bander->dst_x0)
+			l = (int)bander->dst_x0;
+		if (l < (int)bander->dst_x1)
+			fill_bg(&line[(l - bander->dst_x0) * deskewer->channels],
+				deskewer->bg,
+				deskewer->channels,
+				(int)bander->dst_x1 - l);
+	}
+	{
+		int l = xi;
+		int r = xi + deskewer->tmp_w;
+		if (l < (int)bander->dst_x0)
+			l = (int)bander->dst_x0;
+		if (r > (int)bander->dst_x1)
+			r = (int)bander->dst_x1;
+		if (l < r)
+		{
+#if 0
+			memcpy(&line[(l - (int)bander->dst_x0) * deskewer->channels],
+				src,
+				(r-l) * deskewer->channels);
+#else
+			deskewer->zoom_x(&line[(l - (int)bander->dst_x0) * deskewer->channels],
+					src,
+					&deskewer->index_x[which][l - xi],
+					deskewer->weights_x[which],
+					r-l,
+					deskewer->src_w,
+					deskewer->channels,
+					deskewer->bg);
+#endif
+		}
+	}
+	bander->in_y++;
+	bander->tmp_x += bander->tmp_dx;
+}
+
+static void
+fz_drop_deskewer_band(fz_context *ctx, fz_deskewer_bander *bander)
+{
+	if (bander == NULL)
+		return;
+
+	fz_free(ctx, bander->tmp);
+	fz_free(ctx, bander);
+}
+
+static void
+fz_deskewer_band(fz_context *ctx,
+		fz_deskewer *deskewer,
+		const unsigned char *src,
+		int src_stride,
+		unsigned int src_y0,
+		unsigned int src_y1,
+		unsigned char *dst,
+		int dst_stride,
+		unsigned int dst_y0,
+		unsigned int dst_y1)
+{
+	fz_deskewer_bander *bander;
+	int row = 0;
+
+	bander = fz_new_deskewer_band(ctx, deskewer, src_y0, dst_y0);
+
+	while (dst_y0 < dst_y1)
+	{
+		if (fz_deskewer_band_pull(bander, dst + dst_stride * row) == 1)
+		{
+			row++;
+			dst_y0++;
+			continue;
+		}
+		fz_deskewer_band_push(bander, src + src_stride * row);
+	}
+
+	fz_drop_deskewer_band(ctx, bander);
+}
+
+fz_pixmap *fz_deskew_pixmap(fz_context *ctx,
+			fz_pixmap *src,
+			double degrees,
+			int border)
+{
+	uint8_t bg[FZ_MAX_COLORS] = { 0 };
+	uint8_t bg_rgb[FZ_MAX_COLORS] = { 255, 255, 255 };
+	unsigned int dst_w, dst_h;
+	fz_deskewer *deskewer = fz_new_deskewer(ctx, src->w, src->h, &dst_w, &dst_h, degrees, border, 1, 1, 1, 1, src->n - src->alpha > 3 ? bg : bg_rgb, src->n);
+	fz_pixmap *dst = NULL;
+
+	fz_var(dst);
+
+	fz_try(ctx)
+	{
+		dst = fz_new_pixmap(ctx, src->colorspace, dst_w, dst_h, NULL, src->alpha);
+
+		fz_deskewer_band(ctx, deskewer, src->samples, src->stride, 0, src->h, dst->samples, dst->stride, 0, dst->h);
+	}
+	fz_always(ctx)
+	{
+		fz_drop_deskewer(ctx, deskewer);
+	}
+	fz_catch(ctx)
+	{
+		fz_drop_pixmap(ctx, dst);
+		fz_rethrow(ctx);
+	}
+
+	return dst;
+
+}