mupdf-source/source/fitz/draw-scale-simple.c @ 3:2c135c81b16c

MERGE: upstream PyMuPDF 1.26.4 with MuPDF 1.26.7
author Franz Glasner <fzglas.hg@dom66.de>
date Mon, 15 Sep 2025 11:44:09 +0200
parents b50eed0cc0ef
1 // Copyright (C) 2004-2025 Artifex Software, Inc.
2 //
3 // This file is part of MuPDF.
4 //
5 // MuPDF is free software: you can redistribute it and/or modify it under the
6 // terms of the GNU Affero General Public License as published by the Free
7 // Software Foundation, either version 3 of the License, or (at your option)
8 // any later version.
9 //
10 // MuPDF is distributed in the hope that it will be useful, but WITHOUT ANY
11 // WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
12 // FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
13 // details.
14 //
15 // You should have received a copy of the GNU Affero General Public License
16 // along with MuPDF. If not, see <https://www.gnu.org/licenses/agpl-3.0.en.html>
17 //
18 // Alternative licensing terms are available from the licensor.
19 // For commercial licensing, see <https://www.artifex.com/> or contact
20 // Artifex Software, Inc., 39 Mesa Street, Suite 108A, San Francisco,
21 // CA 94129, USA, for further information.
22
23 /*
24 This code does smooth scaling of a pixmap.
25
26 This function returns a new pixmap representing the area starting at (0,0)
27 given by taking the source pixmap src, scaling it to width w and height h,
28 and then positioning it at (frac(x),frac(y)).
29
30 This is a cut-down version of draw_scale.c that only copes with filters
31 that return values strictly in the 0..1 range, and uses bytes for
32 intermediate results rather than ints.
33 */
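/*
	A worked example of the sizing (illustrative, not from the original
	source): with x = 10.6, y = 5.25, w = 3.3, h = 2.2 the sub pixel
	offsets are frac(x) = 0.6 and frac(y) = 0.25, so the result must
	cover ceil(0.6 + 3.3) = 4 whole pixels horizontally and
	ceil(0.25 + 2.2) = 3 vertically, with its top left pixel at
	(floor(10.6), floor(5.25)) = (10, 5) in destination space.
*/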
34
35 #include "mupdf/fitz.h"
36
37 #include "draw-imp.h"
38 #include "pixmap-imp.h"
39
40 #include <math.h>
41 #include <string.h>
42 #include <assert.h>
43 #include <limits.h>
44
45 /* Do we special case handling of single pixel high/wide images? The
46 * 'purest' handling is given by not special casing them, but certain
47 * files that use such images 'stack' them to give full images. Not
48 * special casing them results in them being fainter and giving noticeable
49 * rounding errors.
50 */
51 #define SINGLE_PIXEL_SPECIALS
52
53 /*
54 Consider a row of source samples, src, of width src_w, positioned at x,
55 scaled to width dst_w.
56
57 src[i] is centred at: x + (i + 0.5)*dst_w/src_w
58
59 Therefore the distance between the centre of the jth output pixel and
60 the centre of the ith source sample is:
61
62 dist[j,i] = j + 0.5 - (x + (i + 0.5)*dst_w/src_w)
63
64 When scaling up, therefore:
65
66 dst[j] = SUM(filter(dist[j,i]) * src[i])
67 (for all ints i)
68
69 This can be simplified by noticing that filters are only non-zero within
70 a given filter width (henceforth called W). So:
71
72 dst[j] = SUM(filter(dist[j,i]) * src[i])
73 (for ints i, s.t. (j*src_w/dst_w)-W < i < (j*src_w/dst_w)+W)
74
75 When scaling down, each filtered source sample is stretched to be wider
76 to avoid aliasing issues. This effectively reduces the distance between
77 centres.
78
79 dst[j] = SUM(filter(dist[j,i] * F) * F * src[i])
80 (where F = dst_w/src_w)
81 (for ints i, s.t. (j-W)/F < i < (j+W)/F)
82
83 */
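/*
	A worked example (illustrative), following add_weight() below: halving
	src_w = 4 to dst_w = 2 with the box filter (W = 1) and x = 0 gives
	F = 0.5, G = 1. For j = 0 the distances to the source sample centres
	are 0.25, -0.25, -0.75 and -1.25; the box filter passes only the
	first two, each with weight F = 0.5, so dst[0] = 0.5*src[0] +
	0.5*src[1], a plain average of the two covered source samples.
*/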
84
85 typedef struct fz_scale_filter
86 {
87 int width;
88 float (*fn)(struct fz_scale_filter *, float);
89 } fz_scale_filter;
90
91 /* Image scale filters */
92
93 static float
94 triangle(fz_scale_filter *filter, float f)
95 {
96 if (f >= 1)
97 return 0;
98 return 1-f;
99 }
100
101 static float
102 box(fz_scale_filter *filter, float f)
103 {
104 if (f >= 0.5f)
105 return 0;
106 return 1;
107 }
108
109 static float
110 simple(fz_scale_filter *filter, float x)
111 {
112 if (x >= 1)
113 return 0;
114 return 1 + (2*x - 3)*x*x;
115 }
116
117 fz_scale_filter fz_scale_filter_box = { 1, box };
118 fz_scale_filter fz_scale_filter_triangle = { 1, triangle };
119 fz_scale_filter fz_scale_filter_simple = { 1, simple };
120
121 /*
122 We build ourselves a set of tables to contain the precalculated weights
123 for a given set of scale settings.
124
125 The first dst_w entries in index are the index into index of the
126 sets of weights for each destination pixel.
127
128 Each of the sets of weights is a set of values consisting of:
129 the minimum source pixel index used for this destination pixel
130 the number of weights used for this destination pixel
131 the weights themselves
132
133 So to calculate dst[i] we do the following:
134
135 weights = &index[index[i]];
136 min = *weights++;
137 len = *weights++;
138 dst[i] = 0;
139 while (len-- > 0)
140 dst[i] += src[min++] * *weights++
141
142 In addition, we guarantee that at the end of this process weights will now
143 point to the weights value for dst pixel i+1.
144
145 In the simplest version of this algorithm, we would scale the whole image
146 horizontally first into a temporary buffer, then scale that temporary
147 buffer again vertically to give us our result. Using such a simple
148 algorithm would mean that we could use the same style of weights for both
149 horizontal and vertical scaling.
150
151 Unfortunately, this would also require a large temporary buffer,
152 particularly in the case where we are scaling up.
153
154 We therefore modify the algorithm as follows; we scale scanlines from the
155 source image horizontally into a temporary buffer, until we have all the
156 contributors for a given output scanline. We then produce that output
157 scanline from the temporary buffer. In this way we restrict the height
158 of the temporary buffer to a small fraction of the final size.
159
160 Unfortunately, this means that the pseudo code for recombining a
161 scanline of fully scaled pixels is as follows:
162
163 weights = &index[index[y]];
164 min = *weights++;
165 len = *weights++;
166 for (x=0 to dst_w)
167 min2 = min
168 len2 = len
169 weights2 = weights
170 dst[x] = 0;
171 while (len2-- > 0)
172 dst[x] += temp[x][(min2++) % tmp_buf_height] * *weights2++
173
174 i.e. it requires a % operation for every source pixel - this is typically
175 expensive.
176
177 To avoid this, we alter the order in which vertical weights are stored,
178 so that they are ordered in the same order as the temporary buffer lines
179 would appear. This simplifies the algorithm to:
180
181 weights = &index[index[y]];
182 min = *weights++;
183 len = *weights++;
184 for (x=0 to dst_w)
185 min2 = 0
186 len2 = len
187 weights2 = weights
188 dst[x] = 0;
189 while (len2-- > 0)
190 dst[x] += temp[x][min2++] * *weights2++
191
192 This means that len may be larger than it needs to be (due to the
193 possible inclusion of a zero weight row or two), but in practice this
194 is only an increase of 1 or 2 at worst.
195
196 We implement this by generating the weights as normal (but ensuring we
197 leave enough space) and then reordering afterwards.
198
199 */
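/*
	A concrete layout (illustrative): halving a 4 sample row to 2 samples
	with the box filter gives min = 0 and 2, len = 2, and weights
	{128, 128} (256 == 1.0) for each destination pixel, so:

	index[0] = 2	index[1] = 6		row pointers for dst 0 and 1
	index[2] = 0	index[3] = 2		min, len for dst 0
	index[4] = 128	index[5] = 128		weights for dst 0
	index[6] = 2	index[7] = 2		min, len for dst 1
	index[8] = 128	index[9] = 128		weights for dst 1
*/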
200
201 /* This structure is accessed from ARM code - bear this in mind before
202 * altering it! */
203 typedef struct
204 {
205 int flip; /* true if outputting reversed */
206 int count; /* number of output pixels we have records for in this table */
207 int max_len; /* Maximum number of weights for any one output pixel */
208 int n; /* number of components (src->n) */
209 int new_line; /* True if no weights for the current output pixel */
210 int patch_l; /* How many output pixels we skip over */
211 int index[FZ_FLEXIBLE_ARRAY];
212 } fz_weights;
213
214 struct fz_scale_cache
215 {
216 int src_w;
217 float x;
218 float dst_w;
219 fz_scale_filter *filter;
220 int vertical;
221 int dst_w_int;
222 int patch_l;
223 int patch_r;
224 int n;
225 int flip;
226 fz_weights *weights;
227 };
228
229 static fz_weights *
230 new_weights(fz_context *ctx, fz_scale_filter *filter, int src_w, float dst_w, int patch_w, int n, int flip, int patch_l)
231 {
232 int max_len;
233 fz_weights *weights;
234
235 if (src_w > dst_w)
236 {
237 /* Scaling down, so there will be a maximum of
238 * 2*filterwidth*src_w/dst_w src pixels
239 * contributing to each dst pixel. */
240 max_len = (int)ceilf((2 * filter->width * src_w)/dst_w);
241 if (max_len > src_w)
242 max_len = src_w;
243 }
244 else
245 {
246 /* Scaling up, so there will be a maximum of
247 * 2*filterwidth src pixels contributing to each dst pixel.
248 */
249 max_len = 2 * filter->width;
250 }
251 /* We need the size of the struct,
252 * plus patch_w*sizeof(int) for the index
253 * plus (2+max_len)*sizeof(int) for the weights
254 * plus room for an extra set of weights for reordering.
255 */
256 weights = fz_malloc_flexible(ctx, fz_weights, index, (max_len+3) * (patch_w+1));
257 if (!weights)
258 return NULL;
259 weights->count = -1;
260 weights->max_len = max_len;
261 weights->index[0] = patch_w;
262 weights->n = n;
263 weights->patch_l = patch_l;
264 weights->flip = flip;
265 return weights;
266 }
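/*
	For example (illustrative): scaling a 300 sample row down to 100 with
	a width 1 filter gives max_len = ceilf(2*1*300/100) = 6, so the
	flexible array reserves max_len+3 = 9 ints per destination pixel
	(row pointer, min, len and up to 6 weights), plus one spare set of
	weights for the reordering pass.
*/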
267
268 /* j is destination pixel in the patch_l..patch_l+patch_w range */
269 static void
270 init_weights(fz_weights *weights, int j)
271 {
272 int index;
273
274 j -= weights->patch_l;
275 assert(weights->count == j-1);
276 weights->count++;
277 weights->new_line = 1;
278 if (j == 0)
279 index = weights->index[0];
280 else
281 {
282 index = weights->index[j-1];
283 index += 2 + weights->index[index+1];
284 }
285 weights->index[j] = index; /* row pointer */
286 weights->index[index] = 0; /* min */
287 weights->index[index+1] = 0; /* len */
288 }
289
290 static void
291 insert_weight(fz_weights *weights, int j, int i, int weight)
292 {
293 int min, len, index;
294
295 /* Move j from patch_l...patch_l+patch_w range to 0..patch_w range */
296 j -= weights->patch_l;
297 if (weights->new_line)
298 {
299 /* New line */
300 weights->new_line = 0;
301 index = weights->index[j]; /* row pointer */
302 weights->index[index] = i; /* min */
303 weights->index[index+1] = 0; /* len */
304 }
305 index = weights->index[j];
306 min = weights->index[index++];
307 len = weights->index[index++];
308 while (i < min)
309 {
310 /* This only happens in rare cases, but we need to insert
311 * one earlier. In exceedingly rare cases we may need to
312 * insert more than one earlier. */
313 int k;
314
315 for (k = len; k > 0; k--)
316 {
317 weights->index[index+k] = weights->index[index+k-1];
318 }
319 weights->index[index] = 0;
320 min--;
321 len++;
322 weights->index[index-2] = min;
323 weights->index[index-1] = len;
324 }
325 if (i-min >= len)
326 {
327 /* The usual case */
328 while (i-min >= ++len)
329 {
330 weights->index[index+len-1] = 0;
331 }
332 assert(len-1 == i-min);
333 weights->index[index+i-min] = weight;
334 weights->index[index-1] = len;
335 assert(len <= weights->max_len);
336 }
337 else
338 {
339 /* Infrequent case */
340 weights->index[index+i-min] += weight;
341 }
342 }
343
344 static void
345 add_weight(fz_weights *weights, int j, int i, fz_scale_filter *filter,
346 float x, float F, float G, int src_w, float dst_w)
347 {
348 float dist = j - x + 0.5f - ((i + 0.5f)*dst_w/src_w);
349 float f;
350 int weight;
351
352 dist *= G;
353 if (dist < 0)
354 dist = -dist;
355 f = filter->fn(filter, dist)*F;
356 weight = (int)(256*f+0.5f);
357
358 /* Ensure i is in range */
359 if (i < 0 || i >= src_w)
360 return;
361 if (weight != 0)
362 insert_weight(weights, j, i, weight);
363 }
364
365 static void
366 reorder_weights(fz_weights *weights, int j, int src_w)
367 {
368 int idx = weights->index[j - weights->patch_l];
369 int min = weights->index[idx++];
370 int len = weights->index[idx++];
371 int max = weights->max_len;
372 int tmp = idx+max;
373 int i, off;
374
375 /* Copy into the temporary area */
376 memcpy(&weights->index[tmp], &weights->index[idx], sizeof(int)*len);
377
378 /* Pad out if required */
379 assert(len <= max);
380 assert(min+len <= src_w);
381 off = 0;
382 if (len < max)
383 {
384 memset(&weights->index[tmp+len], 0, sizeof(int)*(max-len));
385 len = max;
386 if (min + len > src_w)
387 {
388 off = min + len - src_w;
389 min = src_w - len;
390 weights->index[idx-2] = min;
391 }
392 weights->index[idx-1] = len;
393 }
394
395 /* Copy back into the proper places */
396 for (i = 0; i < len; i++)
397 {
398 weights->index[idx+((min+i+off) % max)] = weights->index[tmp+i];
399 }
400 }
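/*
	A worked reordering (illustrative): with max_len = 4 temporary rows,
	min = 6, len = 3 and weights {a, b, c} for source rows 6..8, the set
	is padded to {a, b, c, 0} (rows 6..9) and written back in temp-slot
	order (row % 4): slot 2 = a, slot 3 = b, slot 0 = c, slot 1 = 0,
	i.e. {c, 0, a, b}. The vertical pass can then walk the temporary
	buffer lines in order with no % per pixel.
*/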
401
402 /* Due to rounding and edge effects, the sums for the weights sometimes don't
403 * add up to 256. This causes visible rendering effects. Therefore, we take
404 * pains to ensure that they 1) never exceed 256, and 2) add up to exactly
405 * 256 for all pixels that are completely covered. See bug #691629. */
406 static void
407 check_weights(fz_weights *weights, int j, int w, float x, float wf)
408 {
409 int idx, len;
410 int sum = 0;
411 int max = -256;
412 int maxidx = 0;
413 int i;
414
415 idx = weights->index[j - weights->patch_l];
416 idx++; /* min */
417 len = weights->index[idx++];
418
419 for (i=0; i < len; i++)
420 {
421 int v = weights->index[idx++];
422 sum += v;
423 if (v > max)
424 {
425 max = v;
426 maxidx = idx;
427 }
428 }
429 /* If we aren't the first or last pixel, OR if the sum is too big
430 * then adjust it. */
431 if (((j != 0) && (j != w-1)) || (sum > 256))
432 weights->index[maxidx-1] += 256-sum;
433 /* Otherwise, if we are the first pixel, and it's fully covered, then
434 * adjust it. */
435 else if ((j == 0) && (x < 0.0001f) && (sum != 256))
436 weights->index[maxidx-1] += 256-sum;
437 /* Finally, if we are the last pixel, and it's fully covered, then
438 * adjust it. */
439 else if ((j == w-1) && (w - wf < 0.0001f) && (sum != 256))
440 weights->index[maxidx-1] += 256-sum;
441 }
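/*
	For example (illustrative): if rounding leaves a fully covered pixel
	with weights {127, 128} (sum 255), the shortfall 256-255 = 1 is added
	to the largest weight, giving {127, 129}, so every fully covered
	pixel sums to exactly 256 and flat areas keep constant brightness.
*/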
442
443 static int
444 window_fix(int l, int *rp, float window, float centre)
445 {
446 int r = *rp;
447 while (centre - l > window)
448 l++;
449 while (r - centre > window)
450 r--;
451 *rp = r;
452 return l;
453 }
454
455 static fz_weights *
456 make_weights(fz_context *ctx, int src_w, float x, float dst_w, fz_scale_filter *filter, int vertical, int dst_w_int, int patch_l, int patch_r, int n, int flip, fz_scale_cache *cache)
457 {
458 fz_weights *weights;
459 float F, G;
460 float window;
461 int j;
462
463 if (cache)
464 {
465 if (cache->src_w == src_w && cache->x == x && cache->dst_w == dst_w &&
466 cache->filter == filter && cache->vertical == vertical &&
467 cache->dst_w_int == dst_w_int &&
468 cache->patch_l == patch_l && cache->patch_r == patch_r &&
469 cache->n == n && cache->flip == flip)
470 {
471 return cache->weights;
472 }
473 cache->src_w = src_w;
474 cache->x = x;
475 cache->dst_w = dst_w;
476 cache->filter = filter;
477 cache->vertical = vertical;
478 cache->dst_w_int = dst_w_int;
479 cache->patch_l = patch_l;
480 cache->patch_r = patch_r;
481 cache->n = n;
482 cache->flip = flip;
483 fz_free(ctx, cache->weights);
484 cache->weights = NULL;
485 }
486
487 if (dst_w < src_w)
488 {
489 /* Scaling down */
490 F = dst_w / src_w;
491 G = 1;
492 }
493 else
494 {
495 /* Scaling up */
496 F = 1;
497 G = src_w / dst_w;
498 }
499 window = filter->width / F;
500 weights = new_weights(ctx, filter, src_w, dst_w, patch_r-patch_l, n, flip, patch_l);
501 if (!weights)
502 return NULL;
503 for (j = patch_l; j < patch_r; j++)
504 {
505 /* find the position of the centre of dst[j] in src space */
506 float centre = (j - x + 0.5f)*src_w/dst_w - 0.5f;
507 int l, r;
508 l = ceilf(centre - window);
509 r = floorf(centre + window);
510
511 /* Now, due to the vagaries of floating point, if centre is large, l
512 * and r can actually end up further than 2*window apart. All we care
513 * about in this case is that we don't crash! We want a cheap correction
514 * that avoids the assert and doesn't cost too much in the normal case.
515 * This should do. */
516 if (r - l > 2 * window)
517 l = window_fix(l, &r, window, centre);
518
519 init_weights(weights, j);
520 for (; l <= r; l++)
521 {
522 add_weight(weights, j, l, filter, x, F, G, src_w, dst_w);
523 }
524 if (weights->new_line)
525 {
526 /* In very rare cases (bug 706764) we might not actually
527 * have generated any non-zero weights for this destination
528 * pixel. Just use the central pixel. */
529 int src_x = floorf(centre);
530 if (src_x >= src_w)
531 src_x = src_w-1;
532 if (src_x < 0)
533 src_x = 0;
534 insert_weight(weights, j, src_x, 1);
535 }
536 check_weights(weights, j, dst_w_int, x, dst_w);
537 if (vertical)
538 {
539 reorder_weights(weights, j, src_w);
540 }
541 }
542 weights->count++; /* weights->count = patch_r - patch_l now */
543 if (cache)
544 {
545 cache->weights = weights;
546 }
547 return weights;
548 }
549
550 static void
551 scale_row_to_temp(unsigned char * FZ_RESTRICT dst, const unsigned char * FZ_RESTRICT src, const fz_weights * FZ_RESTRICT weights)
552 {
553 const int *contrib = &weights->index[weights->index[0]];
554 int len, i, j, n;
555 const unsigned char *min;
556 int tmp[FZ_MAX_COLORS];
557 int *t = tmp;
558
559 n = weights->n;
560 for (j = 0; j < n; j++)
561 tmp[j] = 128;
562 if (weights->flip)
563 {
564 dst += (weights->count-1)*n;
565 for (i=weights->count; i > 0; i--)
566 {
567 min = &src[n * *contrib++];
568 len = *contrib++;
569 while (len-- > 0)
570 {
571 for (j = n; j > 0; j--)
572 *t++ += *min++ * *contrib;
573 t -= n;
574 contrib++;
575 }
576 for (j = n; j > 0; j--)
577 {
578 *dst++ = (unsigned char)(*t>>8);
579 *t++ = 128;
580 }
581 t -= n;
582 dst -= n*2;
583 }
584 }
585 else
586 {
587 for (i=weights->count; i > 0; i--)
588 {
589 min = &src[n * *contrib++];
590 len = *contrib++;
591 while (len-- > 0)
592 {
593 for (j = n; j > 0; j--)
594 *t++ += *min++ * *contrib;
595 t -= n;
596 contrib++;
597 }
598 for (j = n; j > 0; j--)
599 {
600 *dst++ = (unsigned char)(*t>>8);
601 *t++ = 128;
602 }
603 t -= n;
604 }
605 }
606 }
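/*
	Note on the fixed point arithmetic above (illustrative numbers):
	weights are 8.8 fixed point (256 == 1.0) and the accumulators start
	at 128 (== 0.5) so that the final >>8 rounds to nearest rather than
	truncating. E.g. weights {128, 128} on samples {10, 11} give
	(128 + 10*128 + 11*128)>>8 = 2816>>8 = 11, i.e. 10.5 rounded up.
*/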
607
608 #ifdef ARCH_ARM
609
610 static void
611 scale_row_to_temp1(unsigned char * FZ_RESTRICT dst, const unsigned char * FZ_RESTRICT src, const fz_weights * FZ_RESTRICT weights)
612 __attribute__((naked));
613
614 static void
615 scale_row_to_temp2(unsigned char * FZ_RESTRICT dst, const unsigned char * FZ_RESTRICT src, const fz_weights * FZ_RESTRICT weights)
616 __attribute__((naked));
617
618 static void
619 scale_row_to_temp3(unsigned char * FZ_RESTRICT dst, const unsigned char * FZ_RESTRICT src, const fz_weights * FZ_RESTRICT weights)
620 __attribute__((naked));
621
622 static void
623 scale_row_to_temp4(unsigned char * FZ_RESTRICT dst, const unsigned char * FZ_RESTRICT src, const fz_weights * FZ_RESTRICT weights)
624 __attribute__((naked));
625
626 static void
627 scale_row_from_temp(unsigned char * FZ_RESTRICT dst, const unsigned char * FZ_RESTRICT src, const fz_weights * FZ_RESTRICT weights, int width, int n, int row)
628 __attribute__((naked));
629
630 static void
631 scale_row_from_temp_alpha(unsigned char * FZ_RESTRICT dst, const unsigned char * FZ_RESTRICT src, const fz_weights * FZ_RESTRICT weights, int width, int n, int row)
632 __attribute__((naked));
633
634 static void
635 scale_row_to_temp1(unsigned char * FZ_RESTRICT dst, const unsigned char * FZ_RESTRICT src, const fz_weights * FZ_RESTRICT weights)
636 {
637 asm volatile(
638 ENTER_ARM
639 ".syntax unified\n"
640 "stmfd r13!,{r4-r7,r9,r14} \n"
641 "@ r0 = dst \n"
642 "@ r1 = src \n"
643 "@ r2 = weights \n"
644 "ldr r12,[r2],#4 @ r12= flip \n"
645 "ldr r3, [r2],#20 @ r3 = count r2 = &index\n"
646 "ldr r4, [r2] @ r4 = index[0] \n"
647 "cmp r12,#0 @ if (flip) \n"
648 "beq 5f @ { \n"
649 "add r2, r2, r4, LSL #2 @ r2 = &index[index[0]] \n"
650 "add r0, r0, r3 @ dst += count \n"
651 "1: \n"
652 "ldr r4, [r2], #4 @ r4 = *contrib++ \n"
653 "ldr r9, [r2], #4 @ r9 = len = *contrib++ \n"
654 "mov r5, #128 @ r5 = a = 128 \n"
655 "add r4, r1, r4 @ r4 = min = &src[r4] \n"
656 "subs r9, r9, #1 @ len-- \n"
657 "blt 3f @ while (len >= 0) \n"
658 "2: @ { \n"
659 "ldrgt r6, [r2], #4 @ r6 = *contrib++ \n"
660 "ldrbgt r7, [r4], #1 @ r7 = *min++ \n"
661 "ldr r12,[r2], #4 @ r12 = *contrib++ \n"
662 "ldrb r14,[r4], #1 @ r14 = *min++ \n"
663 "mlagt r5, r6, r7, r5 @ a += r6 * r7 \n"
664 "subs r9, r9, #2 @ r9 = len -= 2 \n"
665 "mla r5, r12,r14,r5 @ a += r14 * r12 \n"
666 "bge 2b @ } \n"
667 "3: \n"
668 "mov r5, r5, lsr #8 @ g >>= 8 \n"
669 "strb r5,[r0, #-1]! @ *--dst=a \n"
670 "subs r3, r3, #1 @ i-- \n"
671 "bgt 1b @ \n"
672 "ldmfd r13!,{r4-r7,r9,PC} @ pop, return to thumb \n"
673 "5:"
674 "add r2, r2, r4, LSL #2 @ r2 = &index[index[0]] \n"
675 "6:"
676 "ldr r4, [r2], #4 @ r4 = *contrib++ \n"
677 "ldr r9, [r2], #4 @ r9 = len = *contrib++ \n"
678 "mov r5, #128 @ r5 = a = 128 \n"
679 "add r4, r1, r4 @ r4 = min = &src[r4] \n"
680 "subs r9, r9, #1 @ len-- \n"
681 "blt 9f @ while (len > 0) \n"
682 "7: @ { \n"
683 "ldrgt r6, [r2], #4 @ r6 = *contrib++ \n"
684 "ldrbgt r7, [r4], #1 @ r7 = *min++ \n"
685 "ldr r12,[r2], #4 @ r12 = *contrib++ \n"
686 "ldrb r14,[r4], #1 @ r14 = *min++ \n"
687 "mlagt r5, r6,r7,r5 @ a += r6 * r7 \n"
688 "subs r9, r9, #2 @ r9 = len -= 2 \n"
689 "mla r5, r12,r14,r5 @ a += r14 * r12 \n"
690 "bge 7b @ } \n"
691 "9: \n"
692 "mov r5, r5, LSR #8 @ a >>= 8 \n"
693 "strb r5, [r0], #1 @ *dst++=a \n"
694 "subs r3, r3, #1 @ i-- \n"
695 "bgt 6b @ \n"
696 "ldmfd r13!,{r4-r7,r9,PC} @ pop, return to thumb \n"
697 ENTER_THUMB
698 );
699 }
700
701 static void
702 scale_row_to_temp2(unsigned char * FZ_RESTRICT dst, const unsigned char * FZ_RESTRICT src, const fz_weights * FZ_RESTRICT weights)
703 {
704 asm volatile(
705 ENTER_ARM
706 "stmfd r13!,{r4-r6,r9-r11,r14} \n"
707 "@ r0 = dst \n"
708 "@ r1 = src \n"
709 "@ r2 = weights \n"
710 "ldr r12,[r2],#4 @ r12= flip \n"
711 "ldr r3, [r2],#20 @ r3 = count r2 = &index\n"
712 "ldr r4, [r2] @ r4 = index[0] \n"
713 "cmp r12,#0 @ if (flip) \n"
714 "beq 4f @ { \n"
715 "add r2, r2, r4, LSL #2 @ r2 = &index[index[0]] \n"
716 "add r0, r0, r3, LSL #1 @ dst += 2*count \n"
717 "1: \n"
718 "ldr r4, [r2], #4 @ r4 = *contrib++ \n"
719 "ldr r9, [r2], #4 @ r9 = len = *contrib++ \n"
720 "mov r5, #128 @ r5 = g = 128 \n"
721 "mov r6, #128 @ r6 = a = 128 \n"
722 "add r4, r1, r4, LSL #1 @ r4 = min = &src[2*r4] \n"
723 "cmp r9, #0 @ while (len-- > 0) \n"
724 "beq 3f @ { \n"
725 "2: \n"
726 "ldr r14,[r2], #4 @ r14 = *contrib++ \n"
727 "ldrb r11,[r4], #1 @ r11 = *min++ \n"
728 "ldrb r12,[r4], #1 @ r12 = *min++ \n"
729 "subs r9, r9, #1 @ r9 = len-- \n"
730 "mla r5, r14,r11,r5 @ g += r11 * r14 \n"
731 "mla r6, r14,r12,r6 @ a += r12 * r14 \n"
732 "bgt 2b @ } \n"
733 "3: \n"
734 "mov r5, r5, lsr #8 @ g >>= 8 \n"
735 "mov r6, r6, lsr #8 @ a >>= 8 \n"
736 "strb r5, [r0, #-2]! @ dst -= 2; dst[0]=g \n"
737 "strb r6, [r0, #1] @ dst[1]=a \n"
738 "subs r3, r3, #1 @ i-- \n"
739 "bgt 1b @ \n"
740 "ldmfd r13!,{r4-r6,r9-r11,PC} @ pop, return to thumb \n"
741 "4:"
742 "add r2, r2, r4, LSL #2 @ r2 = &index[index[0]] \n"
743 "5:"
744 "ldr r4, [r2], #4 @ r4 = *contrib++ \n"
745 "ldr r9, [r2], #4 @ r9 = len = *contrib++ \n"
746 "mov r5, #128 @ r5 = g = 128 \n"
747 "mov r6, #128 @ r6 = a = 128 \n"
748 "add r4, r1, r4, LSL #1 @ r4 = min = &src[2*r4] \n"
749 "cmp r9, #0 @ while (len-- > 0) \n"
750 "beq 7f @ { \n"
751 "6: \n"
752 "ldr r14,[r2], #4 @ r14 = *contrib++ \n"
753 "ldrb r11,[r4], #1 @ r11 = *min++ \n"
754 "ldrb r12,[r4], #1 @ r12 = *min++ \n"
755 "subs r9, r9, #1 @ r9 = len-- \n"
756 "mla r5, r14,r11,r5 @ g += r11 * r14 \n"
757 "mla r6, r14,r12,r6 @ a += r12 * r14 \n"
758 "bgt 6b @ } \n"
759 "7: \n"
760 "mov r5, r5, lsr #8 @ g >>= 8 \n"
761 "mov r6, r6, lsr #8 @ a >>= 8 \n"
762 "strb r5, [r0], #1 @ *dst++=g \n"
763 "strb r6, [r0], #1 @ *dst++=a \n"
764 "subs r3, r3, #1 @ i-- \n"
765 "bgt 5b @ \n"
766 "ldmfd r13!,{r4-r6,r9-r11,PC} @ pop, return to thumb \n"
767 ENTER_THUMB
768 );
769 }
770
771 static void
772 scale_row_to_temp3(unsigned char * FZ_RESTRICT dst, const unsigned char * FZ_RESTRICT src, const fz_weights * FZ_RESTRICT weights)
773 {
774 asm volatile(
775 ENTER_ARM
776 "stmfd r13!,{r4-r11,r14} \n"
777 "@ r0 = dst \n"
778 "@ r1 = src \n"
779 "@ r2 = weights \n"
780 "ldr r12,[r2],#4 @ r12= flip \n"
781 "ldr r3, [r2],#20 @ r3 = count r2 = &index\n"
782 "ldr r4, [r2] @ r4 = index[0] \n"
783 "cmp r12,#0 @ if (flip) \n"
784 "beq 4f @ { \n"
785 "add r2, r2, r4, LSL #2 @ r2 = &index[index[0]] \n"
786 "add r0, r0, r3, LSL #1 @ \n"
787 "add r0, r0, r3 @ dst += 3*count \n"
788 "1: \n"
789 "ldr r4, [r2], #4 @ r4 = *contrib++ \n"
790 "ldr r9, [r2], #4 @ r9 = len = *contrib++ \n"
791 "mov r5, #128 @ r5 = r = 128 \n"
792 "mov r6, #128 @ r6 = g = 128 \n"
793 "add r7, r1, r4, LSL #1 @ \n"
794 "add r4, r7, r4 @ r4 = min = &src[3*r4] \n"
795 "mov r7, #128 @ r7 = b = 128 \n"
796 "cmp r9, #0 @ while (len-- > 0) \n"
797 "beq 3f @ { \n"
798 "2: \n"
799 "ldr r14,[r2], #4 @ r14 = *contrib++ \n"
800 "ldrb r8, [r4], #1 @ r8 = *min++ \n"
801 "ldrb r11,[r4], #1 @ r11 = *min++ \n"
802 "ldrb r12,[r4], #1 @ r12 = *min++ \n"
803 "subs r9, r9, #1 @ r9 = len-- \n"
804 "mla r5, r14,r8, r5 @ r += r8 * r14 \n"
805 "mla r6, r14,r11,r6 @ g += r11 * r14 \n"
806 "mla r7, r14,r12,r7 @ b += r12 * r14 \n"
807 "bgt 2b @ } \n"
808 "3: \n"
809 "mov r5, r5, lsr #8 @ r >>= 8 \n"
810 "mov r6, r6, lsr #8 @ g >>= 8 \n"
811 "mov r7, r7, lsr #8 @ b >>= 8 \n"
812 "strb r5, [r0, #-3]! @ dst -= 3; dst[0]=r \n"
813 "strb r6, [r0, #1] @ dst[1]=g \n"
814 "strb r7, [r0, #2] @ dst[2]=b \n"
815 "subs r3, r3, #1 @ i-- \n"
816 "bgt 1b @ \n"
817 "ldmfd r13!,{r4-r11,PC} @ pop, return to thumb \n"
818 "4:"
819 "add r2, r2, r4, LSL #2 @ r2 = &index[index[0]] \n"
820 "5:"
821 "ldr r4, [r2], #4 @ r4 = *contrib++ \n"
822 "ldr r9, [r2], #4 @ r9 = len = *contrib++ \n"
823 "mov r5, #128 @ r5 = r = 128 \n"
824 "mov r6, #128 @ r6 = g = 128 \n"
825 "add r7, r1, r4, LSL #1 @ r7 = &src[2*r4] \n"
826 "add r4, r7, r4 @ r4 = min = &src[3*r4] \n"
827 "mov r7, #128 @ r7 = b = 128 \n"
828 "cmp r9, #0 @ while (len-- > 0) \n"
829 "beq 7f @ { \n"
830 "6: \n"
831 "ldr r14,[r2], #4 @ r14 = *contrib++ \n"
832 "ldrb r8, [r4], #1 @ r8 = *min++ \n"
833 "ldrb r11,[r4], #1 @ r11 = *min++ \n"
834 "ldrb r12,[r4], #1 @ r12 = *min++ \n"
835 "subs r9, r9, #1 @ r9 = len-- \n"
836 "mla r5, r14,r8, r5 @ r += r8 * r14 \n"
837 "mla r6, r14,r11,r6 @ g += r11 * r14 \n"
838 "mla r7, r14,r12,r7 @ b += r12 * r14 \n"
839 "bgt 6b @ } \n"
840 "7: \n"
841 "mov r5, r5, lsr #8 @ r >>= 8 \n"
842 "mov r6, r6, lsr #8 @ g >>= 8 \n"
843 "mov r7, r7, lsr #8 @ b >>= 8 \n"
844 "strb r5, [r0], #1 @ *dst++=r \n"
845 "strb r6, [r0], #1 @ *dst++=g \n"
846 "strb r7, [r0], #1 @ *dst++=b \n"
847 "subs r3, r3, #1 @ i-- \n"
848 "bgt 5b @ \n"
849 "ldmfd r13!,{r4-r11,PC} @ pop, return to thumb \n"
850 ENTER_THUMB
851 );
852 }
853
854 static void
855 scale_row_to_temp4(unsigned char * FZ_RESTRICT dst, const unsigned char * FZ_RESTRICT src, const fz_weights * FZ_RESTRICT weights)
856 {
857 asm volatile(
858 ENTER_ARM
859 "stmfd r13!,{r4-r11,r14} \n"
860 "@ r0 = dst \n"
861 "@ r1 = src \n"
862 "@ r2 = weights \n"
863 "ldr r12,[r2],#4 @ r12= flip \n"
864 "ldr r3, [r2],#20 @ r3 = count r2 = &index\n"
865 "ldr r4, [r2] @ r4 = index[0] \n"
866 "ldr r5,=0x00800080 @ r5 = rounding \n"
867 "ldr r6,=0x00FF00FF @ r6 = 0x00FF00FF \n"
868 "cmp r12,#0 @ if (flip) \n"
869 "beq 4f @ { \n"
870 "add r2, r2, r4, LSL #2 @ r2 = &index[index[0]] \n"
871 "add r0, r0, r3, LSL #2 @ dst += 4*count \n"
872 "1: \n"
873 "ldr r4, [r2], #4 @ r4 = *contrib++ \n"
874 "ldr r9, [r2], #4 @ r9 = len = *contrib++ \n"
875 "mov r7, r5 @ r7 = b = rounding \n"
876 "mov r8, r5 @ r8 = a = rounding \n"
877 "add r4, r1, r4, LSL #2 @ r4 = min = &src[4*r4] \n"
878 "cmp r9, #0 @ while (len-- > 0) \n"
879 "beq 3f @ { \n"
880 "2: \n"
881 "ldr r11,[r4], #4 @ r11 = *min++ \n"
882 "ldr r10,[r2], #4 @ r10 = *contrib++ \n"
883 "subs r9, r9, #1 @ r9 = len-- \n"
884 "and r12,r6, r11 @ r12 = __22__00 \n"
885 "and r11,r6, r11,LSR #8 @ r11 = __33__11 \n"
886 "mla r7, r10,r12,r7 @ b += r12 * r10 \n"
887 "mla r8, r10,r11,r8 @ a += r11 * r10 \n"
888 "bgt 2b @ } \n"
889 "3: \n"
890 "and r7, r6, r7, lsr #8 @ r7 = __22__00 \n"
891 "bic r8, r8, r6 @ r8 = 33__11__ \n"
892 "orr r7, r7, r8 @ r7 = 33221100 \n"
893 "str r7, [r0, #-4]! @ *--dst=r \n"
894 "subs r3, r3, #1 @ i-- \n"
895 "bgt 1b @ \n"
896 "ldmfd r13!,{r4-r11,PC} @ pop, return to thumb \n"
897 "4: \n"
898 "add r2, r2, r4, LSL #2 @ r2 = &index[index[0]] \n"
899 "5: \n"
900 "ldr r4, [r2], #4 @ r4 = *contrib++ \n"
901 "ldr r9, [r2], #4 @ r9 = len = *contrib++ \n"
902 "mov r7, r5 @ r7 = b = rounding \n"
903 "mov r8, r5 @ r8 = a = rounding \n"
904 "add r4, r1, r4, LSL #2 @ r4 = min = &src[4*r4] \n"
905 "cmp r9, #0 @ while (len-- > 0) \n"
906 "beq 7f @ { \n"
907 "6: \n"
908 "ldr r11,[r4], #4 @ r11 = *min++ \n"
909 "ldr r10,[r2], #4 @ r10 = *contrib++ \n"
910 "subs r9, r9, #1 @ r9 = len-- \n"
911 "and r12,r6, r11 @ r12 = __22__00 \n"
912 "and r11,r6, r11,LSR #8 @ r11 = __33__11 \n"
913 "mla r7, r10,r12,r7 @ b += r12 * r10 \n"
914 "mla r8, r10,r11,r8 @ a += r11 * r10 \n"
915 "bgt 6b @ } \n"
916 "7: \n"
917 "and r7, r6, r7, lsr #8 @ r7 = __22__00 \n"
918 "bic r8, r8, r6 @ r8 = 33__11__ \n"
919 "orr r7, r7, r8 @ r7 = 33221100 \n"
920 "str r7, [r0], #4 @ *dst++=r \n"
921 "subs r3, r3, #1 @ i-- \n"
922 "bgt 5b @ \n"
923 "ldmfd r13!,{r4-r11,PC} @ pop, return to thumb \n"
924 ENTER_THUMB
925 );
926 }
927
928 static void
929 scale_row_from_temp(unsigned char * FZ_RESTRICT dst, const unsigned char * FZ_RESTRICT src, const fz_weights * FZ_RESTRICT weights, int width, int n, int row)
930 {
931 asm volatile(
932 ENTER_ARM
933 "stmfd r13!,{r4-r11,r14} \n"
934 "@ r0 = dst \n"
935 "@ r1 = src \n"
936 "@ r2 = weights \n"
937 "@ r3 = width \n"
938 "@ r12= row \n"
939 "ldr r14,[r13,#4*9] @ r14= n \n"
940 "ldr r12,[r13,#4*10] @ r12= row \n"
941 "add r2, r2, #24 @ r2 = weights->index \n"
942 "mul r3, r14, r3 @ r3 = width *= n \n"
943 "ldr r4, [r2, r12, LSL #2] @ r4 = index[row] \n"
944 "add r2, r2, #4 @ r2 = &index[1] \n"
945 "subs r6, r3, #4 @ r6 = x = width-4 \n"
946 "ldr r14,[r2, r4, LSL #2]! @ r2 = contrib = index[index[row]+1]\n"
947 " @ r14= len = *contrib \n"
948 "blt 4f @ while (x >= 0) { \n"
949 #ifndef ARCH_UNALIGNED_OK
950 "tst r3, #3 @ if ((r3 & 3) \n"
951 "tsteq r1, #3 @ || (r1 & 3)) \n"
952 "bne 4f @ can't do fast code \n"
953 #endif
954 "ldr r9, =0x00FF00FF @ r9 = 0x00FF00FF \n"
955 "1: \n"
956 "ldr r7, =0x00800080 @ r7 = val1 = round \n"
957 "stmfd r13!,{r1,r2,r7} @ stash r1,r2,r5 \n"
958 " @ r1 = min = src \n"
959 " @ r2 = contrib2-4 \n"
960 "movs r8, r14 @ r8 = len2 = len \n"
961 "mov r5, r7 @ r5 = val0 = round \n"
962 "ble 3f @ while (len2-- > 0) { \n"
963 "2: \n"
964 "ldr r12,[r1], r3 @ r12 = *min; r1 = min += width\n"
965 "ldr r10,[r2, #4]! @ r10 = *contrib2++ \n"
966 "subs r8, r8, #1 @ len2-- \n"
967 "and r11,r9, r12 @ r11= __22__00 \n"
968 "and r12,r9, r12,LSR #8 @ r12= __33__11 \n"
969 "mla r5, r10,r11,r5 @ r5 = val0 += r11 * r10\n"
970 "mla r7, r10,r12,r7 @ r7 = val1 += r12 * r10\n"
971 "bgt 2b @ } \n"
972 "and r5, r9, r5, LSR #8 @ r5 = __22__00 \n"
973 "and r7, r7, r9, LSL #8 @ r7 = 33__11__ \n"
974 "orr r5, r5, r7 @ r5 = 33221100 \n"
975 "3: \n"
976 "ldmfd r13!,{r1,r2,r7} @ restore r1,r2,r7 \n"
977 "subs r6, r6, #4 @ x-- \n"
978 "add r1, r1, #4 @ src++ \n"
979 "str r5, [r0], #4 @ *dst++ = val \n"
980 "bge 1b @ \n"
981 "4: @ } (Less than 4 to go) \n"
982 "adds r6, r6, #4 @ r6 = x += 4 \n"
983 "beq 8f @ if (x == 0) done \n"
984 "5: \n"
985 "mov r5, r1 @ r5 = min = src \n"
986 "mov r7, #128 @ r7 = val = 128 \n"
987 "movs r8, r14 @ r8 = len2 = len \n"
988 "add r9, r2, #4 @ r9 = contrib2 \n"
989 "ble 7f @ while (len2-- > 0) { \n"
990 "6: \n"
991 "ldr r10,[r9], #4 @ r10 = *contrib2++ \n"
992 "ldrb r12,[r5], r3 @ r12 = *min r5 = min += width\n"
993 "subs r8, r8, #1 @ len2-- \n"
994 "@ stall r12 \n"
995 "mla r7, r10,r12,r7 @ val += r12 * r10 \n"
996 "bgt 6b @ } \n"
997 "7: \n"
998 "mov r7, r7, asr #8 @ r7 = val >>= 8 \n"
999 "subs r6, r6, #1 @ x-- \n"
1000 "add r1, r1, #1 @ src++ \n"
1001 "strb r7, [r0], #1 @ *dst++ = val \n"
1002 "bgt 5b @ \n"
1003 "8: \n"
1004 "ldmfd r13!,{r4-r11,PC} @ pop, return to thumb \n"
1005 ".ltorg \n"
1006 ENTER_THUMB
1007 );
1008 }
1009
1010 static void
1011 scale_row_from_temp_alpha(unsigned char * FZ_RESTRICT dst, const unsigned char * FZ_RESTRICT src, const fz_weights * FZ_RESTRICT weights, int width, int n, int row)
1012 {
1013 asm volatile(
1014 ENTER_ARM
1015 "stmfd r13!,{r4-r11,r14} \n"
1016 "mov r11,#255 @ r11= 255 \n"
1017 "ldr r12,[r13,#4*10] @ r12= row \n"
1018 "@ r0 = dst \n"
1019 "@ r1 = src \n"
1020 "@ r2 = weights \n"
1021 "@ r3 = width \n"
1022 "@ r11= 255 \n"
1023 "@ r12= row \n"
1024 "add r2, r2, #24 @ r2 = weights->index \n"
1025 "ldr r4, [r2, r12, LSL #2] @ r4 = index[row] \n"
1026 "add r2, r2, #4 @ r2 = &index[1] \n"
1027 "mov r6, r3 @ r6 = x = width \n"
1028 "ldr r14,[r2, r4, LSL #2]! @ r2 = contrib = index[index[row]+1]\n"
1029 " @ r14= len = *contrib \n"
1030 "5: \n"
1031 "ldr r4,[r13,#4*9] @ r4 = nn = n \n"
1032 "1: \n"
1033 "mov r5, r1 @ r5 = min = src \n"
1034 "mov r7, #128 @ r7 = val = 128 \n"
1035 "movs r8, r14 @ r8 = len2 = len \n"
1036 "add r9, r2, #4 @ r9 = contrib2 \n"
1037 "ble 7f @ while (len2-- > 0) { \n"
1038 "6: \n"
1039 "ldr r10,[r9], #4 @ r10 = *contrib2++ \n"
1040 "ldrb r12,[r5], r3 @ r12 = *min r5 = min += width\n"
1041 "subs r8, r8, #1 @ len2-- \n"
1042 "@ stall r12 \n"
1043 "mla r7, r10,r12,r7 @ val += r12 * r10 \n"
1044 "bgt 6b @ } \n"
1045 "7: \n"
1046 "mov r7, r7, asr #8 @ r7 = val >>= 8 \n"
1047 "subs r4, r4, #1 @ r4 = nn-- \n"
1048 "add r1, r1, #1 @ src++ \n"
1049 "strb r7, [r0], #1 @ *dst++ = val \n"
1050 "bgt 1b @ \n"
1051 "subs r6, r6, #1 @ x-- \n"
1052 "strb r11,[r0], #1 @ *dst++ = 255 \n"
1053 "bgt 5b @ \n"
1054 "ldmfd r13!,{r4-r11,PC} @ pop, return to thumb \n"
1055 ".ltorg \n"
1056 ENTER_THUMB
1057 );
1058 }
1059 #else
1060
1061 static void
1062 scale_row_to_temp1(unsigned char * FZ_RESTRICT dst, const unsigned char * FZ_RESTRICT src, const fz_weights * FZ_RESTRICT weights)
1063 {
1064 const int *contrib = &weights->index[weights->index[0]];
1065 int len, i;
1066 const unsigned char *min;
1067
1068 assert(weights->n == 1);
1069 if (weights->flip)
1070 {
1071 dst += weights->count;
1072 for (i=weights->count; i > 0; i--)
1073 {
1074 int val = 128;
1075 min = &src[*contrib++];
1076 len = *contrib++;
1077 while (len-- > 0)
1078 {
1079 val += *min++ * *contrib++;
1080 }
1081 *--dst = (unsigned char)(val>>8);
1082 }
1083 }
1084 else
1085 {
1086 for (i=weights->count; i > 0; i--)
1087 {
1088 int val = 128;
1089 min = &src[*contrib++];
1090 len = *contrib++;
1091 while (len-- > 0)
1092 {
1093 val += *min++ * *contrib++;
1094 }
1095 *dst++ = (unsigned char)(val>>8);
1096 }
1097 }
1098 }
1099
1100 static void
1101 scale_row_to_temp2(unsigned char * FZ_RESTRICT dst, const unsigned char * FZ_RESTRICT src, const fz_weights * FZ_RESTRICT weights)
1102 {
1103 const int *contrib = &weights->index[weights->index[0]];
1104 int len, i;
1105 const unsigned char *min;
1106
1107 assert(weights->n == 2);
1108 if (weights->flip)
1109 {
1110 dst += 2*weights->count;
1111 for (i=weights->count; i > 0; i--)
1112 {
1113 int c1 = 128;
1114 int c2 = 128;
1115 min = &src[2 * *contrib++];
1116 len = *contrib++;
1117 while (len-- > 0)
1118 {
1119 c1 += *min++ * *contrib;
1120 c2 += *min++ * *contrib++;
1121 }
1122 *--dst = (unsigned char)(c2>>8);
1123 *--dst = (unsigned char)(c1>>8);
1124 }
1125 }
1126 else
1127 {
1128 for (i=weights->count; i > 0; i--)
1129 {
1130 int c1 = 128;
1131 int c2 = 128;
1132 min = &src[2 * *contrib++];
1133 len = *contrib++;
1134 while (len-- > 0)
1135 {
1136 c1 += *min++ * *contrib;
1137 c2 += *min++ * *contrib++;
1138 }
1139 *dst++ = (unsigned char)(c1>>8);
1140 *dst++ = (unsigned char)(c2>>8);
1141 }
1142 }
1143 }
1144
1145 static void
1146 scale_row_to_temp3(unsigned char * FZ_RESTRICT dst, const unsigned char * FZ_RESTRICT src, const fz_weights * FZ_RESTRICT weights)
1147 {
1148 const int *contrib = &weights->index[weights->index[0]];
1149 int len, i;
1150 const unsigned char *min;
1151
1152 assert(weights->n == 3);
1153 if (weights->flip)
1154 {
1155 dst += 3*weights->count;
1156 for (i=weights->count; i > 0; i--)
1157 {
1158 int c1 = 128;
1159 int c2 = 128;
1160 int c3 = 128;
1161 min = &src[3 * *contrib++];
1162 len = *contrib++;
1163 while (len-- > 0)
1164 {
1165 int c = *contrib++;
1166 c1 += *min++ * c;
1167 c2 += *min++ * c;
1168 c3 += *min++ * c;
1169 }
1170 *--dst = (unsigned char)(c3>>8);
1171 *--dst = (unsigned char)(c2>>8);
1172 *--dst = (unsigned char)(c1>>8);
1173 }
1174 }
1175 else
1176 {
1177 for (i=weights->count; i > 0; i--)
1178 {
1179 int c1 = 128;
1180 int c2 = 128;
1181 int c3 = 128;
1182 min = &src[3 * *contrib++];
1183 len = *contrib++;
1184 while (len-- > 0)
1185 {
1186 int c = *contrib++;
1187 c1 += *min++ * c;
1188 c2 += *min++ * c;
1189 c3 += *min++ * c;
1190 }
1191 *dst++ = (unsigned char)(c1>>8);
1192 *dst++ = (unsigned char)(c2>>8);
1193 *dst++ = (unsigned char)(c3>>8);
1194 }
1195 }
1196 }
1197
1198 static void
1199 scale_row_to_temp4(unsigned char * FZ_RESTRICT dst, const unsigned char * FZ_RESTRICT src, const fz_weights * FZ_RESTRICT weights)
1200 {
1201 const int *contrib = &weights->index[weights->index[0]];
1202 int len, i;
1203 const unsigned char *min;
1204
1205 assert(weights->n == 4);
1206 if (weights->flip)
1207 {
1208 dst += 4*weights->count;
1209 for (i=weights->count; i > 0; i--)
1210 {
1211 int r = 128;
1212 int g = 128;
1213 int b = 128;
1214 int a = 128;
1215 min = &src[4 * *contrib++];
1216 len = *contrib++;
1217 while (len-- > 0)
1218 {
1219 r += *min++ * *contrib;
1220 g += *min++ * *contrib;
1221 b += *min++ * *contrib;
1222 a += *min++ * *contrib++;
1223 }
1224 *--dst = (unsigned char)(a>>8);
1225 *--dst = (unsigned char)(b>>8);
1226 *--dst = (unsigned char)(g>>8);
1227 *--dst = (unsigned char)(r>>8);
1228 }
1229 }
1230 else
1231 {
1232 for (i=weights->count; i > 0; i--)
1233 {
1234 int r = 128;
1235 int g = 128;
1236 int b = 128;
1237 int a = 128;
1238 min = &src[4 * *contrib++];
1239 len = *contrib++;
1240 while (len-- > 0)
1241 {
1242 r += *min++ * *contrib;
1243 g += *min++ * *contrib;
1244 b += *min++ * *contrib;
1245 a += *min++ * *contrib++;
1246 }
1247 *dst++ = (unsigned char)(r>>8);
1248 *dst++ = (unsigned char)(g>>8);
1249 *dst++ = (unsigned char)(b>>8);
1250 *dst++ = (unsigned char)(a>>8);
1251 }
1252 }
1253 }
1254
1255 static void
1256 scale_row_from_temp(unsigned char * FZ_RESTRICT dst, const unsigned char * FZ_RESTRICT src, const fz_weights * FZ_RESTRICT weights, int w, int n, int row)
1257 {
1258 const int *contrib = &weights->index[weights->index[row]];
1259 int len, x;
1260 int width = w * n;
1261
1262 contrib++; /* Skip min */
1263 len = *contrib++;
1264 for (x=width; x > 0; x--)
1265 {
1266 const unsigned char *min = src;
1267 int val = 128;
1268 int len2 = len;
1269 const int *contrib2 = contrib;
1270
1271 while (len2-- > 0)
1272 {
1273 val += *min * *contrib2++;
1274 min += width;
1275 }
1276 *dst++ = (unsigned char)(val>>8);
1277 src++;
1278 }
1279 }
1280
1281 static void
1282 scale_row_from_temp_alpha(unsigned char * FZ_RESTRICT dst, const unsigned char * FZ_RESTRICT src, const fz_weights * FZ_RESTRICT weights, int w, int n, int row)
1283 {
1284 const int *contrib = &weights->index[weights->index[row]];
1285 int len, x;
1286 int width = w * n;
1287
1288 contrib++; /* Skip min */
1289 len = *contrib++;
1290 for (x=w; x > 0; x--)
1291 {
1292 int nn;
1293 for (nn = n; nn > 0; nn--)
1294 {
1295 const unsigned char *min = src;
1296 int val = 128;
1297 int len2 = len;
1298 const int *contrib2 = contrib;
1299
1300 while (len2-- > 0)
1301 {
1302 val += *min * *contrib2++;
1303 min += width;
1304 }
1305 *dst++ = (unsigned char)(val>>8);
1306 src++;
1307 }
1308 *dst++ = 255;
1309 }
1310 }
1311 #endif
1312
1313 #ifdef SINGLE_PIXEL_SPECIALS
1314 static void
1315 duplicate_single_pixel(unsigned char * FZ_RESTRICT dst, const unsigned char * FZ_RESTRICT src, int n, int forcealpha, int w, int h, int stride)
1316 {
1317 int i;
1318
1319 for (i = n; i > 0; i--)
1320 *dst++ = *src++;
1321 if (forcealpha)
1322 *dst++ = 255;
1323 n += forcealpha;
1324 for (i = w-1; i > 0; i--)
1325 {
1326 memcpy(dst, dst-n, n);
1327 dst += n;
1328 }
1329 w *= n;
1330 dst -= w;
1331 h--;
1332 while (h--)
1333 {
1334 memcpy(dst+stride, dst, w);
1335 dst += stride;
1336 }
1337 }
1338
1339 static void
1340 scale_single_row(unsigned char * FZ_RESTRICT dst, int dstride, const unsigned char * FZ_RESTRICT src, const fz_weights * FZ_RESTRICT weights, int src_w, int h, int forcealpha)
1341 {
1342 const int *contrib = &weights->index[weights->index[0]];
1343 int min, len, i, j, n, nf;
1344 int tmp[FZ_MAX_COLORS];
1345
1346 n = weights->n;
1347 nf = n + forcealpha;
1348 /* Scale a single row */
1349 for (j = 0; j < nf; j++)
1350 tmp[j] = 128;
1351 if (weights->flip)
1352 {
1353 dst += (weights->count-1)*nf;
1354 for (i=weights->count; i > 0; i--)
1355 {
1356 min = *contrib++;
1357 len = *contrib++;
1358 min *= n;
1359 while (len-- > 0)
1360 {
1361 int c = *contrib++;
1362 for (j = 0; j < n; j++)
1363 tmp[j] += src[min++] * c;
1364 if (forcealpha)
1365 tmp[j] += 255 * c;
1366 }
1367 for (j = 0; j < nf; j++)
1368 {
1369 *dst++ = (unsigned char)(tmp[j]>>8);
1370 tmp[j] = 128;
1371 }
1372 dst -= 2*nf;
1373 }
1374 dst += nf + dstride;
1375 }
1376 else
1377 {
1378 for (i=weights->count; i > 0; i--)
1379 {
1380 min = *contrib++;
1381 len = *contrib++;
1382 min *= n;
1383 while (len-- > 0)
1384 {
1385 int c = *contrib++;
1386 for (j = 0; j < n; j++)
1387 tmp[j] += src[min++] * c;
1388 if (forcealpha)
1389 tmp[j] += 255 * c;
1390 }
1391 for (j = 0; j < nf; j++)
1392 {
1393 *dst++ = (unsigned char)(tmp[j]>>8);
1394 tmp[j] = 128;
1395 }
1396 }
1397 dst += dstride - weights->count * nf;
1398 }
1399 /* And then duplicate it h times */
1400 nf *= weights->count;
1401 while (--h > 0)
1402 {
1403 memcpy(dst, dst-dstride, nf);
1404 dst += dstride;
1405 }
1406 }
1407
1408 static void
1409 scale_single_col(unsigned char * FZ_RESTRICT dst, int dstride, const unsigned char * FZ_RESTRICT src, int sstride, const fz_weights * FZ_RESTRICT weights, int src_w, int n, int w, int forcealpha)
1410 {
1411 const int *contrib = &weights->index[weights->index[0]];
1412 int min, len, i, j;
1413 int tmp[FZ_MAX_COLORS];
1414 int nf = n + forcealpha;
1415
1416 for (j = 0; j < nf; j++)
1417 tmp[j] = 128;
1418 if (weights->flip)
1419 {
1420 src_w = (src_w-1)*sstride;
1421 for (i=weights->count; i > 0; i--)
1422 {
1423 /* Scale the next pixel in the column */
1424 min = *contrib++;
1425 len = *contrib++;
1426 min = src_w-min*sstride;
1427 while (len-- > 0)
1428 {
1429 int c = *contrib++;
1430 for (j = 0; j < n; j++)
1431 tmp[j] += src[min+j] * c;
1432 if (forcealpha)
1433 tmp[j] += 255 * c;
1434 min -= sstride;
1435 }
1436 for (j = 0; j < nf; j++)
1437 {
1438 *dst++ = (unsigned char)(tmp[j]>>8);
1439 tmp[j] = 128;
1440 }
1441 /* And then duplicate it across the row */
1442 for (j = (w-1)*nf; j > 0; j--)
1443 {
1444 *dst = dst[-nf];
1445 dst++;
1446 }
1447 dst += dstride - w*nf;
1448 }
1449 }
1450 else
1451 {
1452 for (i=weights->count; i > 0; i--)
1453 {
1454 /* Scale the next pixel in the column */
1455 min = *contrib++;
1456 len = *contrib++;
1457 min *= sstride;
1458 while (len-- > 0)
1459 {
1460 int c = *contrib++;
1461 for (j = 0; j < n; j++)
1462 tmp[j] += src[min+j] * c;
1463 if (forcealpha)
1464 tmp[j] += 255 * c;
1465 min += sstride;
1466 }
1467 for (j = 0; j < nf; j++)
1468 {
1469 *dst++ = (unsigned char)(tmp[j]>>8);
1470 tmp[j] = 128;
1471 }
1472 /* And then duplicate it across the row */
1473 for (j = (w-1)*nf; j > 0; j--)
1474 {
1475 *dst = dst[-nf];
1476 dst++;
1477 }
1478 dst += dstride - w*nf;
1479 }
1480 }
1481 }
1482 #endif /* SINGLE_PIXEL_SPECIALS */
1483
1484 static void
1485 get_alpha_edge_values(const fz_weights * FZ_RESTRICT rows, int * FZ_RESTRICT tp, int * FZ_RESTRICT bp)
1486 {
1487 const int *contrib = &rows->index[rows->index[0]];
1488 int len, i, t, b;
1489
1490 /* Calculate the edge alpha values */
1491 contrib++; /* Skip min */
1492 len = *contrib++;
1493 t = 0;
1494 while (len--)
1495 t += *contrib++;
1496 for (i=rows->count-2; i > 0; i--)
1497 {
1498 contrib++; /* Skip min */
1499 len = *contrib++;
1500 contrib += len;
1501 }
1502 b = 0;
1503 if (i == 0)
1504 {
1505 contrib++;
1506 len = *contrib++;
1507 while (len--)
1508 b += *contrib++;
1509 }
1510 if (rows->flip && i == 0)
1511 {
1512 *tp = b;
1513 *bp = t;
1514 }
1515 else
1516 {
1517 *tp = t;
1518 *bp = b;
1519 }
1520 }
1521
1522 static void
1523 adjust_alpha_edges(fz_pixmap * FZ_RESTRICT pix, const fz_weights * FZ_RESTRICT rows, const fz_weights * FZ_RESTRICT cols)
1524 {
1525 int t, l, r, b, tl, tr, bl, br, x, y;
1526 unsigned char *dp = pix->samples;
1527 int w = pix->w;
1528 int n = pix->n;
1529 int span = w >= 2 ? (w-1)*n : 0;
1530 int stride = pix->stride;
1531
1532 get_alpha_edge_values(rows, &t, &b);
1533 get_alpha_edge_values(cols, &l, &r);
1534
1535 l = (255 * l + 128)>>8;
1536 r = (255 * r + 128)>>8;
1537 tl = (l * t + 128)>>8;
1538 tr = (r * t + 128)>>8;
1539 bl = (l * b + 128)>>8;
1540 br = (r * b + 128)>>8;
1541 t = (255 * t + 128)>>8;
1542 b = (255 * b + 128)>>8;
1543 dp += n-1;
1544 *dp = tl;
1545 dp += n;
1546 for (x = w-2; x > 0; x--)
1547 {
1548 *dp = t;
1549 dp += n;
1550 }
1551 if (x == 0)
1552 {
1553 *dp = tr;
1554 dp += n;
1555 }
1556 dp += stride - w*n;
1557 for (y = pix->h-2; y > 0; y--)
1558 {
1559 dp[span] = r;
1560 *dp = l;
1561 dp += stride;
1562 }
1563 if (y == 0)
1564 {
1565 *dp = bl;
1566 dp += n;
1567 for (x = w-2; x > 0; x--)
1568 {
1569 *dp = b;
1570 dp += n;
1571 }
1572 if (x == 0)
1573 {
1574 *dp = br;
1575 }
1576 }
1577 }
1578
1579 fz_pixmap *
1580 fz_scale_pixmap(fz_context *ctx, fz_pixmap *src, float x, float y, float w, float h, const fz_irect *clip)
1581 {
1582 return fz_scale_pixmap_cached(ctx, src, x, y, w, h, clip, NULL, NULL);
1583 }
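/*
	A minimal usage sketch (not part of the original file; assumes a
	valid fz_context 'ctx' and a source pixmap 'src' obtained elsewhere):

	fz_pixmap *scaled = fz_scale_pixmap(ctx, src, 10.6f, 5.25f, 3.3f, 2.2f, NULL);
	if (scaled)
	{
		... use the scaled->w x scaled->h samples, placed at (scaled->x, scaled->y) ...
		fz_drop_pixmap(ctx, scaled);
	}

	Note that NULL is returned (rather than throwing) for degenerate or
	fully clipped requests, so the caller must check the result.
*/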
1584
1585 fz_pixmap *
1586 fz_scale_pixmap_cached(fz_context *ctx, const fz_pixmap *src, float x, float y, float w, float h, const fz_irect *clip, fz_scale_cache *cache_x, fz_scale_cache *cache_y)
1587 {
1588 fz_scale_filter *filter = &fz_scale_filter_simple;
1589 fz_weights *contrib_rows = NULL;
1590 fz_weights *contrib_cols = NULL;
1591 fz_pixmap *output = NULL;
1592 unsigned char *temp = NULL;
1593 int max_row, temp_span, temp_rows, row;
1594 int dst_w_int, dst_h_int, dst_x_int, dst_y_int;
1595 int flip_x, flip_y, forcealpha;
1596 fz_rect patch;
1597
1598 fz_var(contrib_cols);
1599 fz_var(contrib_rows);
1600
1601 /* Avoid extreme scales where overflows become problematic. */
1602 if (w > (1<<24) || h > (1<<24) || w < -(1<<24) || h < -(1<<24))
1603 return NULL;
1604 if (x > (1<<24) || y > (1<<24) || x < -(1<<24) || y < -(1<<24))
1605 return NULL;
1606
1607 /* Clamp small ranges of w and h */
1608 if (w <= -1)
1609 {
1610 /* Large negative range. Don't clamp */
1611 }
1612 else if (w < 0)
1613 {
1614 w = -1;
1615 }
1616 else if (w < 1)
1617 {
1618 w = 1;
1619 }
1620 if (h <= -1)
1621 {
1622 /* Large negative range. Don't clamp */
1623 }
1624 else if (h < 0)
1625 {
1626 h = -1;
1627 }
1628 else if (h < 1)
1629 {
1630 h = 1;
1631 }
1632
1633 /* If the src has an alpha, we'll make the dst have an alpha automatically.
1634 * We also need to force the dst to have an alpha if x/y/w/h aren't ints. */
1635 forcealpha = !src->alpha && (x != (float)(int)x || y != (float)(int)y || w != (float)(int)w || h != (float)(int)h);
1636
1637 /* Find the destination bbox, width/height, and sub pixel offset,
1638 * allowing for whether we're flipping or not. */
1639 /* The (x,y) position given describes where the top left corner
1640 * of the source image should be mapped to (i.e. where (0,0) in image
1641 * space ends up). Also there are differences in the way we scale
1642 * horizontally and vertically. When scaling rows horizontally, we
1643 * always read forwards through the source, and store either forwards
1644 * or in reverse as required. When scaling vertically, we always store
1645 * out forwards, but may feed the source rows in a different order.
1646 *
1647 * Consider the image rectangle 'r' to which the image is mapped,
1648 * and the (possibly) larger rectangle 'R', given by expanding 'r' to
1649 * complete pixels.
1650 *
1651 * x can either be r.xmin-R.xmin or R.xmax-r.xmax depending on whether
1652 * the image is x flipped or not. Whatever happens 0 <= x < 1.
1653 * y is always R.ymax - r.ymax.
1654 */
1655 /* dst_x_int is calculated to be the left of the scaled image, and
1656 * x (the sub pixel offset) is the distance in from either the left
1657 * or right pixel expanded edge. */
1658 flip_x = (w < 0);
1659 if (flip_x)
1660 {
1661 float tmp;
1662 w = -w;
1663 dst_x_int = floorf(x-w);
1664 tmp = ceilf(x);
1665 dst_w_int = (int)tmp;
1666 x = tmp - x;
1667 dst_w_int -= dst_x_int;
1668 }
1669 else
1670 {
1671 dst_x_int = floorf(x);
1672 x -= dst_x_int;
1673 dst_w_int = (int)ceilf(x + w);
1674 }
1675 /* dst_y_int is calculated to be the top of the scaled image, and
1676 * y (the sub pixel offset) is the distance in from either the top
1677 * or bottom pixel expanded edge.
1678 */
1679 flip_y = (h < 0);
1680 if (flip_y)
1681 {
1682 float tmp;
1683 h = -h;
1684 dst_y_int = floorf(y-h);
1685 tmp = ceilf(y);
1686 dst_h_int = (int)tmp;
1687 y = tmp - y;
1688 dst_h_int -= dst_y_int;
1689 }
1690 else
1691 {
1692 dst_y_int = floorf(y);
1693 y -= dst_y_int;
1694 dst_h_int = (int)ceilf(y + h);
1695 }
1696
1697 fz_valgrind_pixmap(src);
1698
1699 /* Step 0: Calculate the patch */
1700 patch.x0 = 0;
1701 patch.y0 = 0;
1702 patch.x1 = dst_w_int;
1703 patch.y1 = dst_h_int;
1704 if (clip)
1705 {
1706 if (flip_x)
1707 {
1708 if (dst_x_int + dst_w_int > clip->x1)
1709 patch.x0 = dst_x_int + dst_w_int - clip->x1;
1710 if (clip->x0 > dst_x_int)
1711 {
1712 patch.x1 = dst_w_int - (clip->x0 - dst_x_int);
1713 dst_x_int = clip->x0;
1714 }
1715 }
1716 else
1717 {
1718 if (dst_x_int + dst_w_int > clip->x1)
1719 patch.x1 = clip->x1 - dst_x_int;
1720 if (clip->x0 > dst_x_int)
1721 {
1722 patch.x0 = clip->x0 - dst_x_int;
1723 dst_x_int += patch.x0;
1724 }
1725 }
1726
1727 if (flip_y)
1728 {
1729 if (dst_y_int + dst_h_int > clip->y1)
1730 patch.y1 = clip->y1 - dst_y_int;
1731 if (clip->y0 > dst_y_int)
1732 {
1733 patch.y0 = clip->y0 - dst_y_int;
1734 dst_y_int = clip->y0;
1735 }
1736 }
1737 else
1738 {
1739 if (dst_y_int + dst_h_int > clip->y1)
1740 patch.y1 = clip->y1 - dst_y_int;
1741 if (clip->y0 > dst_y_int)
1742 {
1743 patch.y0 = clip->y0 - dst_y_int;
1744 dst_y_int += patch.y0;
1745 }
1746 }
1747 }
1748 if (patch.x0 >= patch.x1 || patch.y0 >= patch.y1)
1749 return NULL;
1750
1751 fz_try(ctx)
1752 {
1753 /* Step 1: Calculate the weights for columns and rows */
1754 #ifdef SINGLE_PIXEL_SPECIALS
1755 if (src->w == 1)
1756 contrib_cols = NULL;
1757 else
1758 #endif /* SINGLE_PIXEL_SPECIALS */
1759 contrib_cols = Memento_label(make_weights(ctx, src->w, x, w, filter, 0, dst_w_int, patch.x0, patch.x1, src->n, flip_x, cache_x), "contrib_cols");
1760 #ifdef SINGLE_PIXEL_SPECIALS
1761 if (src->h == 1)
1762 contrib_rows = NULL;
1763 else
1764 #endif /* SINGLE_PIXEL_SPECIALS */
1765 contrib_rows = Memento_label(make_weights(ctx, src->h, y, h, filter, 1, dst_h_int, patch.y0, patch.y1, src->n, flip_y, cache_y), "contrib_rows");
1766
1767 output = fz_new_pixmap(ctx, src->colorspace, patch.x1 - patch.x0, patch.y1 - patch.y0, src->seps, src->alpha || forcealpha);
1768 }
1769 fz_catch(ctx)
1770 {
1771 if (!cache_x)
1772 fz_free(ctx, contrib_cols);
1773 if (!cache_y)
1774 fz_free(ctx, contrib_rows);
1775 fz_rethrow(ctx);
1776 }
1777 output->x = dst_x_int;
1778 output->y = dst_y_int;
1779
1780 /* Step 2: Apply the weights */
1781 #ifdef SINGLE_PIXEL_SPECIALS
1782 if (!contrib_rows)
1783 {
1784 /* Only 1 source pixel high. */
1785 if (!contrib_cols)
1786 {
1787 /* Only 1 pixel in the entire image! */
1788 duplicate_single_pixel(output->samples, src->samples, src->n, forcealpha, patch.x1-patch.x0, patch.y1-patch.y0, output->stride);
1789 fz_valgrind_pixmap(output);
1790 }
1791 else
1792 {
1793 /* Scale the row once, then copy it. */
1794 scale_single_row(output->samples, output->stride, src->samples, contrib_cols, src->w, patch.y1-patch.y0, forcealpha);
1795 fz_valgrind_pixmap(output);
1796 }
1797 }
1798 else if (!contrib_cols)
1799 {
1800 /* Only 1 source pixel wide. Scale the col and duplicate. */
1801 scale_single_col(output->samples, output->stride, src->samples, src->stride, contrib_rows, src->h, src->n, patch.x1-patch.x0, forcealpha);
1802 fz_valgrind_pixmap(output);
1803 }
1804 else
1805 #endif /* SINGLE_PIXEL_SPECIALS */
1806 {
1807 void (*row_scale_in)(unsigned char * FZ_RESTRICT dst, const unsigned char * FZ_RESTRICT src, const fz_weights * FZ_RESTRICT weights);
1808 void (*row_scale_out)(unsigned char * FZ_RESTRICT dst, const unsigned char * FZ_RESTRICT src, const fz_weights * FZ_RESTRICT weights, int w, int n, int row);
1809
1810 temp_span = contrib_cols->count * src->n;
1811 temp_rows = contrib_rows->max_len;
1812 if (temp_span <= 0 || temp_rows > INT_MAX / temp_span)
1813 goto cleanup;
1814 fz_try(ctx)
1815 {
1816 temp = fz_calloc(ctx, (size_t)temp_span*temp_rows, sizeof(unsigned char));
1817 }
1818 fz_catch(ctx)
1819 {
1820 fz_drop_pixmap(ctx, output);
1821 if (!cache_x)
1822 fz_free(ctx, contrib_cols);
1823 if (!cache_y)
1824 fz_free(ctx, contrib_rows);
1825 fz_rethrow(ctx);
1826 }
1827 switch (src->n)
1828 {
1829 default:
1830 row_scale_in = scale_row_to_temp;
1831 break;
1832 case 1: /* Image mask case or Greyscale case */
1833 row_scale_in = scale_row_to_temp1;
1834 break;
1835 case 2: /* Greyscale with alpha case */
1836 row_scale_in = scale_row_to_temp2;
1837 break;
1838 case 3: /* RGB case */
1839 row_scale_in = scale_row_to_temp3;
1840 break;
1841 case 4: /* RGBA or CMYK case */
1842 row_scale_in = scale_row_to_temp4;
1843 break;
1844 }
1845 row_scale_out = forcealpha ? scale_row_from_temp_alpha : scale_row_from_temp;
1846 max_row = contrib_rows->index[contrib_rows->index[0]];
1847 for (row = 0; row < contrib_rows->count; row++)
1848 {
1849 /*
1850 Which source rows do we need to have scaled into the
1851 temporary buffer in order to be able to do the final
1852 scale?
1853 */
1854 int row_index = contrib_rows->index[row];
1855 int row_min = contrib_rows->index[row_index++];
1856 int row_len = contrib_rows->index[row_index];
1857 while (max_row < row_min+row_len)
1858 {
1859 /* Scale another row */
1860 assert(max_row < src->h);
1861 (*row_scale_in)(&temp[temp_span*(max_row % temp_rows)], &src->samples[(flip_y ? (src->h-1-max_row): max_row)*src->stride], contrib_cols);
1862 max_row++;
1863 }
1864
1865 (*row_scale_out)(&output->samples[row*output->stride], temp, contrib_rows, contrib_cols->count, src->n, row);
1866 }
1867 fz_free(ctx, temp);
1868
1869 if (forcealpha)
1870 adjust_alpha_edges(output, contrib_rows, contrib_cols);
1871
1872 fz_valgrind_pixmap(output);
1873 }
1874
1875 cleanup:
1876 if (!cache_y)
1877 fz_free(ctx, contrib_rows);
1878 if (!cache_x)
1879 fz_free(ctx, contrib_cols);
1880
1881 return output;
1882 }
1883
1884 void
1885 fz_drop_scale_cache(fz_context *ctx, fz_scale_cache *sc)
1886 {
1887 if (!sc)
1888 return;
1889 fz_free(ctx, sc->weights);
1890 fz_free(ctx, sc);
1891 }
1892
1893 fz_scale_cache *
1894 fz_new_scale_cache(fz_context *ctx)
1895 {
1896 return fz_malloc_struct(ctx, fz_scale_cache);
1897 }
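
/*
	A hedged usage sketch for the cached variant (illustrative): when the
	same source is rescaled repeatedly at the same scale, keeping the two
	caches alive lets fz_scale_pixmap_cached reuse the weight tables
	instead of rebuilding them on every call.

	fz_scale_cache *cache_x = fz_new_scale_cache(ctx);
	fz_scale_cache *cache_y = fz_new_scale_cache(ctx);
	fz_pixmap *out = fz_scale_pixmap_cached(ctx, src, x, y, w, h, NULL, cache_x, cache_y);
	...
	fz_drop_pixmap(ctx, out);
	fz_drop_scale_cache(ctx, cache_x);
	fz_drop_scale_cache(ctx, cache_y);
*/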