comparison mupdf-source/source/fitz/deskew_neon.h @ 2:b50eed0cc0ef upstream

ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4. The directory name has changed: the expanded directory no longer carries a version number.
author Franz Glasner <fzglas.hg@dom66.de>
date Mon, 15 Sep 2025 11:43:07 +0200
comparison of 1:1d09e1dec1d9 with 2:b50eed0cc0ef
1 // Copyright (C) 2004-2024 Artifex Software, Inc.
2 //
3 // This file is part of MuPDF.
4 //
5 // MuPDF is free software: you can redistribute it and/or modify it under the
6 // terms of the GNU Affero General Public License as published by the Free
7 // Software Foundation, either version 3 of the License, or (at your option)
8 // any later version.
9 //
10 // MuPDF is distributed in the hope that it will be useful, but WITHOUT ANY
11 // WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
12 // FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
13 // details.
14 //
15 // You should have received a copy of the GNU Affero General Public License
16 // along with MuPDF. If not, see <https://www.gnu.org/licenses/agpl-3.0.en.html>
17 //
18 // Alternative licensing terms are available from the licensor.
19 // For commercial licensing, see <https://www.artifex.com/> or contact
20 // Artifex Software, Inc., 39 Mesa Street, Suite 108A, San Francisco,
21 // CA 94129, USA, for further information.
22
23 /* This file is included from deskew.c if NEON cores are allowed. */
24
25 #include "arm_neon.h"
26
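/* The cores below are NEON versions of the deskew resampling steps:
 * zoom_x{1,3,4}_neon resample one row horizontally, and zoom_y{1,3,4}_neon
 * combine rows vertically, for 1-, 3- and 4-channel pixels respectively.
 * Weights are fixed-point integers; WEIGHT_SHIFT, WEIGHT_ROUND, weight_t,
 * index_t and CLAMP are assumed to be defined by deskew.c before this file
 * is included. */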
27 static void
28 zoom_x1_neon(uint8_t * FZ_RESTRICT tmp,
29 const uint8_t * FZ_RESTRICT src,
30 const index_t * FZ_RESTRICT index,
31 const weight_t * FZ_RESTRICT weights,
32 uint32_t dst_w,
33 uint32_t src_w,
34 uint32_t channels,
35 const uint8_t * FZ_RESTRICT bg)
36 {
37 int32x4_t round = vdupq_n_s32(WEIGHT_ROUND);
38
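/* "if (0) slow: { ... }" is skipped on normal entry; it is reached only via
 * "goto slow" from the loop below, and when the block finishes it falls
 * through back into that loop. This keeps the rare edge-pixel handling out
 * of the hot path. */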
39 if (0)
40 slow:
41 {
42 /* Do any where we might index off the edge of the source */
43 int pix_num = index->first_pixel;
44 const uint8_t *s = &src[pix_num];
45 const weight_t *w = &weights[index->index];
46 uint32_t j = index->n;
47 int32_t pixel0 = WEIGHT_ROUND;
48 if (pix_num < 0)
49 {
50 int32_t wt = *w++;
51 assert(pix_num == -1);
52 pixel0 += bg[0] * wt;
53 s++;
54 j--;
55 pix_num = 0;
56 }
57 pix_num = (int)src_w - pix_num;
58 if (pix_num > (int)j)
59 pix_num = j;
60 j -= pix_num;
61 while (pix_num > 0)
62 {
63 pixel0 += *s++ * *w++;
64 pix_num--;
65 }
66 if (j > 0)
67 {
68 assert(j == 1);
69 pixel0 += bg[0] * *w;
70 }
71 pixel0 >>= WEIGHT_SHIFT;
72 *tmp++ = CLAMP(pixel0, 0, 255);
73 index++;
74 dst_w--;
75 }
76
77 while (dst_w > 0)
78 {
79 const uint8_t *s;
80 uint32_t j;
81 const weight_t *w;
82
83 /* Jump out of band to do the (rare) slow (edge) pixels */
84 if (index->slow)
85 goto slow;
86
87 s = &src[index->first_pixel];
88 j = index->n;
89 w = &weights[index->index];
90 if (j <= 4)
91 {
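/* Up to 4 taps: widen 4 pixel bytes to 16 bits, multiply-accumulate against
 * 4 weights into 32-bit lanes, then pairwise-add to reduce to a single sum.
 * Only the low half of q_pair_sum is meaningful; its upper half is never
 * initialised and is treated as don't-care. The saturating narrow by
 * WEIGHT_SHIFT-8 followed by taking byte lane 1 (the high byte of the first
 * 16-bit result) is equivalent to (sum >> WEIGHT_SHIFT) clamped to 0..255. */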
92 int32x4_t q_pair_sum;
93 int16x4_t wts = vld1_s16(w);
94 uint8x8_t pix_bytes = vld1_u8(s);
95 int16x4_t pix16 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(pix_bytes)));
96 int32x4_t sum = vmlal_s16(round, pix16, wts);
97 int32x2_t pair_sum = vpadd_s32(vget_high_s32(sum), vget_low_s32(sum));
98 pair_sum = vpadd_s32(pair_sum, pair_sum);
99 q_pair_sum = vcombine_s32(pair_sum, vget_high_s32(q_pair_sum));
100 *tmp++ = vget_lane_u8(vreinterpret_u8_u16(vqshrun_n_s32(q_pair_sum, WEIGHT_SHIFT-8)), 1);
101 }
102 else if (j <= 8)
103 {
104 int32x4_t q_pair_sum;
105 int16x8_t wts = vld1q_s16(w);
106 uint8x8_t pix_bytes = vld1_u8(s);
107 int16x8_t pix16 = vreinterpretq_s16_u16(vmovl_u8(pix_bytes));
108 int32x4_t sum = vmlal_s16(vmlal_s16(round, vget_low_s16(pix16), vget_low_s16(wts)),
109 vget_high_s16(pix16), vget_high_s16(wts));
110 int32x2_t pair_sum = vpadd_s32(vget_high_s32(sum), vget_low_s32(sum));
111 pair_sum = vpadd_s32(pair_sum, pair_sum);
112 q_pair_sum = vcombine_s32(pair_sum, vget_high_s32(q_pair_sum));
113 *tmp++ = vget_lane_u8(vreinterpret_u8_u16(vqshrun_n_s32(q_pair_sum, WEIGHT_SHIFT-8)), 1);
114 }
115 else
116 {
117 int32_t pixel0 = WEIGHT_ROUND;
118 for (j = index->n; j > 0; j--)
119 {
120 pixel0 += *s++ * *w++;
121 }
122 pixel0 >>= WEIGHT_SHIFT;
123 *tmp++ = CLAMP(pixel0, 0, 255);
124 }
125 index++;
126 dst_w--;
127 }
128 }
129
130 static void
131 zoom_x3_neon(uint8_t * FZ_RESTRICT tmp,
132 const uint8_t * FZ_RESTRICT src,
133 const index_t * FZ_RESTRICT index,
134 const weight_t * FZ_RESTRICT weights,
135 uint32_t dst_w,
136 uint32_t src_w,
137 uint32_t channels,
138 const uint8_t * FZ_RESTRICT bg)
139 {
140 int32x4_t round = vdupq_n_s32(WEIGHT_ROUND);
141
142 if (0)
143 slow:
144 {
145 /* Do any where we might index off the edge of the source */
146 int pix_num = index->first_pixel;
147 const uint8_t *s = &src[pix_num * 3];
148 const weight_t *w = &weights[index->index];
149 uint32_t j = index->n;
150 int32_t pixel0 = WEIGHT_ROUND;
151 int32_t pixel1 = WEIGHT_ROUND;
152 int32_t pixel2 = WEIGHT_ROUND;
153 if (pix_num < 0)
154 {
155 int32_t wt = *w++;
156 assert(pix_num == -1);
157 pixel0 += bg[0] * wt;
158 pixel1 += bg[1] * wt;
159 pixel2 += bg[2] * wt;
160 s += 3;
161 j--;
162 pix_num = 0;
163 }
164 pix_num = (int)src_w - pix_num;
165 if (pix_num > (int)j)
166 pix_num = j;
167 j -= pix_num;
168 while (pix_num > 0)
169 {
170 int32_t wt = *w++;
171 pixel0 += *s++ * wt;
172 pixel1 += *s++ * wt;
173 pixel2 += *s++ * wt;
174 pix_num--;
175 }
176 if (j > 0)
177 {
178 int32_t wt = *w++;
179 assert(j == 1);
180 pixel0 += bg[0] * wt;
181 pixel1 += bg[1] * wt;
182 pixel2 += bg[2] * wt;
183 }
184 pixel0 >>= WEIGHT_SHIFT;
185 pixel1 >>= WEIGHT_SHIFT;
186 pixel2 >>= WEIGHT_SHIFT;
187 *tmp++ = CLAMP(pixel0, 0, 255);
188 *tmp++ = CLAMP(pixel1, 0, 255);
189 *tmp++ = CLAMP(pixel2, 0, 255);
190 index++;
191 dst_w--;
192 }
193
194 while (dst_w > 0)
195 {
196 const uint8_t *s;
197 int j;
198 const weight_t *w;
199 uint8x16_t pix_bytes;
200 int32x4_t sum;
201 uint8x8_t out_pix;
202
203 /* Jump out of band to do the (rare) slow (edge) pixels */
204 if (index->slow)
205 goto slow;
206
207 s = &src[index->first_pixel * 3];
208 j = (int)index->n;
209 w = &weights[index->index];
210
211 pix_bytes = vld1q_u8(s); // pix_bytes = ppoonnmmllkkjjiihhggffeeddccbbaa
212 if (j == 4)
213 {
214 int16x4_t pix16;
215 int16x4_t vw;
216 vw = vdup_n_s16(w[0]);
217 pix16 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vget_low_u8(pix_bytes))));
218 pix_bytes = vextq_u8(pix_bytes, pix_bytes, 3);
219 sum = vmlal_s16(round, pix16, vw);
220 vw = vdup_n_s16(w[1]);
221 pix16 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vget_low_u8(pix_bytes))));
222 pix_bytes = vextq_u8(pix_bytes, pix_bytes, 3);
223 sum = vmlal_s16(sum, pix16, vw);
224 vw = vdup_n_s16(w[2]);
225 pix16 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vget_low_u8(pix_bytes))));
226 pix_bytes = vextq_u8(pix_bytes, pix_bytes, 3);
227 sum = vmlal_s16(sum, pix16, vw);
228 vw = vdup_n_s16(w[3]);
229 pix16 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vget_low_u8(pix_bytes))));
230 sum = vmlal_s16(sum, pix16, vw);
231 }
232 else
233 {
234 int off = j & 3;
235 int16x4_t vw;
236 s += (off ? off : 4) * 3;
237 sum = round;
238 /* This is a use of Duff's Device. I'm very sorry, but on the other hand, Yay! */
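/* off = n & 3 selects the entry point: the first, partial iteration consumes
 * only the pixels already loaded above (off taps, or 4 when n is a multiple
 * of 4). Each full iteration then reloads 16 bytes and applies 4 weights,
 * rotating pix_bytes by one 3-byte pixel with vextq_u8 between taps. */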
239 switch (off)
240 {
241 do
242 {
243 int16x4_t pix16;
244 pix_bytes = vld1q_u8(s); // pix_bytes = ppoonnmmllkkjjiihhggffeeddccbbaa
245 s += 4 * 3;
246 case 0:
247 vw = vdup_n_s16(*w++);
248 pix16 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vget_low_u8(pix_bytes))));
249 pix_bytes = vextq_u8(pix_bytes, pix_bytes, 3);
250 sum = vmlal_s16(sum, pix16, vw);
251 case 3:
252 vw = vdup_n_s16(*w++);
253 pix16 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vget_low_u8(pix_bytes))));
254 pix_bytes = vextq_u8(pix_bytes, pix_bytes, 3);
255 sum = vmlal_s16(sum, pix16, vw);
256 case 2:
257 vw = vdup_n_s16(*w++);
258 pix16 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vget_low_u8(pix_bytes))));
259 pix_bytes = vextq_u8(pix_bytes, pix_bytes, 3);
260 sum = vmlal_s16(sum, pix16, vw);
261 case 1:
262 vw = vdup_n_s16(*w++);
263 pix16 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vget_low_u8(pix_bytes))));
264 sum = vmlal_s16(sum, pix16, vw);
265 j -= 4;
266 } while (j > 0);
267 }
268 }
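/* As in zoom_x1_neon, the saturating narrow by WEIGHT_SHIFT-8 plus picking
 * the odd byte lanes yields each channel's sum shifted down by WEIGHT_SHIFT
 * and clamped to 0..255. */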
269 out_pix = vreinterpret_u8_u16(vqshrun_n_s32(sum, WEIGHT_SHIFT-8));
270 *tmp++ = vget_lane_u8(out_pix, 1);
271 *tmp++ = vget_lane_u8(out_pix, 3);
272 *tmp++ = vget_lane_u8(out_pix, 5);
273 index++;
274 dst_w--;
275 }
276
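/* Note: dst_w is already zero when the loop above exits, so the scalar loop
 * below appears to be unreachable; it is presumably a remnant of the
 * non-NEON version. */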
277 while (dst_w > 0)
278 {
279 const uint8_t *s;
280
281 /* Jump out of band to do the (rare) slow (edge) pixels */
282 if (index->slow)
283 goto slow;
284
285 s = &src[index->first_pixel * 3];
286
287 {
288 const weight_t *w = &weights[index->index];
289 uint32_t j = index->n;
290 int32_t pixel0 = WEIGHT_ROUND;
291 int32_t pixel1 = WEIGHT_ROUND;
292 int32_t pixel2 = WEIGHT_ROUND;
293 for (j = index->n; j > 0; j--)
294 {
295 int32_t wt = *w++;
296 pixel0 += *s++ * wt;
297 pixel1 += *s++ * wt;
298 pixel2 += *s++ * wt;
299 }
300 pixel0 >>= WEIGHT_SHIFT;
301 pixel1 >>= WEIGHT_SHIFT;
302 pixel2 >>= WEIGHT_SHIFT;
303 *tmp++ = CLAMP(pixel0, 0, 255);
304 *tmp++ = CLAMP(pixel1, 0, 255);
305 *tmp++ = CLAMP(pixel2, 0, 255);
306 }
307 index++;
308 dst_w--;
309 }
310 }
311
312 static void
313 zoom_x4_neon(uint8_t * FZ_RESTRICT tmp,
314 const uint8_t * FZ_RESTRICT src,
315 const index_t * FZ_RESTRICT index,
316 const weight_t * FZ_RESTRICT weights,
317 uint32_t dst_w,
318 uint32_t src_w,
319 uint32_t channels,
320 const uint8_t * FZ_RESTRICT bg)
321 {
322 int32x4_t round = vdupq_n_s32(WEIGHT_ROUND);
323
324 if (0)
325 slow:
326 {
327 /* Do any where we might index off the edge of the source */
328 int pn = index->first_pixel;
329 const uint8_t *s = &src[pn * 4];
330 const weight_t *w = &weights[index->index];
331 uint32_t j = index->n;
332 int32_t pixel0 = WEIGHT_ROUND;
333 int32_t pixel1 = WEIGHT_ROUND;
334 int32_t pixel2 = WEIGHT_ROUND;
335 int32_t pixel3 = WEIGHT_ROUND;
336 int pix_num = pn;
337 if (pix_num < 0)
338 {
339 int32_t wt = *w++;
340 assert(pix_num == -1);
341 pixel0 += bg[0] * wt;
342 pixel1 += bg[1] * wt;
343 pixel2 += bg[2] * wt;
344 pixel3 += bg[3] * wt;
345 s += 4;
346 j--;
347 pix_num = 0;
348 }
349 pix_num = (int)src_w - pix_num;
350 if (pix_num > (int)j)
351 pix_num = j;
352 j -= pix_num;
353 while (pix_num > 0)
354 {
355 int32_t wt = *w++;
356 pixel0 += *s++ * wt;
357 pixel1 += *s++ * wt;
358 pixel2 += *s++ * wt;
359 pixel3 += *s++ * wt;
360 pix_num--;
361 }
362 if (j > 0)
363 {
364 int32_t wt = *w;
365 assert(j == 1);
366 pixel0 += bg[0] * wt;
367 pixel1 += bg[1] * wt;
368 pixel2 += bg[2] * wt;
369 pixel3 += bg[3] * wt;
370 }
371 pixel0 >>= WEIGHT_SHIFT;
372 pixel1 >>= WEIGHT_SHIFT;
373 pixel2 >>= WEIGHT_SHIFT;
374 pixel3 >>= WEIGHT_SHIFT;
375 *tmp++ = CLAMP(pixel0, 0, 255);
376 *tmp++ = CLAMP(pixel1, 0, 255);
377 *tmp++ = CLAMP(pixel2, 0, 255);
378 *tmp++ = CLAMP(pixel3, 0, 255);
379 index++;
380 dst_w--;
381 }
382
383 while (dst_w > 0)
384 {
385 const uint8_t *s;
386 int j;
387 const weight_t *w;
388 int32x4_t sum;
389 uint8x16_t pix_bytes;
390 uint8x8_t out_pix;
391 //__m128i mm0, mm1, mm4, mw0, mw1;
392
393 /* Jump out of band to do the (rare) slow (edge) pixels */
394 if (index->slow)
395 goto slow;
396
397 s = &src[index->first_pixel * 4];
398 j = (int)index->n;
399 w = &weights[index->index];
400
401 pix_bytes = vld1q_u8(s); // pix_bytes = ppoonnmmllkkjjiihhggffeeddccbbaa
402 if (j == 4)
403 {
404 int16x4_t pix16;
405 int16x4_t vw;
406 vw = vdup_n_s16(w[0]);
407 pix16 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vget_low_u8(pix_bytes))));
408 pix_bytes = vextq_u8(pix_bytes, pix_bytes, 4);
409 sum = vmlal_s16(round, pix16, vw);
410 vw = vdup_n_s16(w[1]);
411 pix16 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vget_low_u8(pix_bytes))));
412 pix_bytes = vextq_u8(pix_bytes, pix_bytes, 4);
413 sum = vmlal_s16(sum, pix16, vw);
414 vw = vdup_n_s16(w[2]);
415 pix16 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vget_low_u8(pix_bytes))));
416 pix_bytes = vextq_u8(pix_bytes, pix_bytes, 4);
417 sum = vmlal_s16(sum, pix16, vw);
418 vw = vdup_n_s16(w[3]);
419 pix16 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vget_low_u8(pix_bytes))));
420 sum = vmlal_s16(sum, pix16, vw);
421 }
422 else
423 {
424 int off = j & 3;
425 int16x4_t vw;
426 s += (off ? off : 4) * 4;
427 /* This is a use of Duff's Device. I'm very sorry, but on the other hand, Yay! */
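/* Same entry-point trick as in zoom_x3_neon above, but rotating pix_bytes by
 * one 4-byte pixel per tap. */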
428 sum = round;
429 switch (off)
430 {
431 do
432 {
433 int16x4_t pixels;
434 pix_bytes = vld1q_u8(s); // pix_bytes = ppoonnmmllkkjjiihhggffeeddccbbaa
435 s += 4 * 4;
436 case 0:
437 vw = vdup_n_s16(*w++);
438 pixels = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vget_low_u8(pix_bytes))));
439 pix_bytes = vextq_u8(pix_bytes, pix_bytes, 4);
440 sum = vmlal_s16(sum, pixels, vw);
441 case 3:
442 vw = vdup_n_s16(*w++);
443 pixels = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vget_low_u8(pix_bytes))));
444 pix_bytes = vextq_u8(pix_bytes, pix_bytes, 4);
445 sum = vmlal_s16(sum, pixels, vw);
446 case 2:
447 vw = vdup_n_s16(*w++);
448 pixels = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vget_low_u8(pix_bytes))));
449 pix_bytes = vextq_u8(pix_bytes, pix_bytes, 4);
450 sum = vmlal_s16(sum, pixels, vw);
451 case 1:
452 vw = vdup_n_s16(*w++);
453 pixels = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vget_low_u8(pix_bytes))));
454 sum = vmlal_s16(sum, pixels, vw);
455 j -= 4;
456 } while (j > 0);
457 }
458 }
459 out_pix = vreinterpret_u8_u16(vqshrun_n_s32(sum, WEIGHT_SHIFT-8));
460 *tmp++ = vget_lane_u8(out_pix, 1);
461 *tmp++ = vget_lane_u8(out_pix, 3);
462 *tmp++ = vget_lane_u8(out_pix, 5);
463 *tmp++ = vget_lane_u8(out_pix, 7);
464 index++;
465 dst_w--;
466 }
467 }
468
469 static void
470 zoom_y1_neon(uint8_t * dst,
471 const uint8_t * FZ_RESTRICT tmp,
472 const index_t * FZ_RESTRICT index,
473 const weight_t * FZ_RESTRICT weights,
474 uint32_t width,
475 uint32_t channels,
476 uint32_t mod,
477 int32_t y)
478 {
479 uint32_t stride = width;
480 uint32_t offset = 0;
481 int32x4_t round = vdupq_n_s32(WEIGHT_ROUND);
482
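/* tmp is a circular buffer of intermediate rows: stride is one row, mod is
 * presumably the total buffer size, and every offset wraps when it reaches
 * mod. The weights for this core appear to be stored four per tap, hence
 * the index->index * 4 indexing and the stride-4 stepping in the slow
 * path. */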
483 if (0)
484 slow:
485 {
486 uint32_t off = (index->first_pixel + y) * stride + offset;
487
488 offset++;
489 if (off >= mod)
490 off -= mod;
491
492 {
493 const weight_t *w = (const weight_t *)&weights[index->index * 4];
494 uint32_t j;
495 int32_t pixel0 = WEIGHT_ROUND;
496
497 for (j = index->n; j > 0; j--)
498 {
499 pixel0 += tmp[off] * *w;
500 w += 4;
501 off += stride;
502 if (off >= mod)
503 off -= mod;
504 }
505 pixel0 >>= WEIGHT_SHIFT;
506 *dst++ = CLAMP(pixel0, 0, 255);
507 }
508 index++;
509 width--;
510 }
511
512 while (width > 0)
513 {
514 uint32_t off;
515 /* The slow flag stops us accessing off the end of the source row.
516 * It also tells us how many pixels we can do at once. This usage
517 * is different for zoom_y1 than for all other cores. */
518 int n = index->slow;
519 if (n <= 1)
520 goto slow;
521 off = (index->first_pixel + y) * stride + offset;
522 offset += n;
523 if (off >= mod)
524 off -= mod;
525
526 {
527 const weight_t *w = &weights[index->index * 4];
528 uint32_t j = index->n;
529 int32x4_t sum;
530 uint16x4_t out16;
531
532 if (j == 4)
533 {
534 uint8x8_t pix0, pix1, pix2, pix3;
535 int16x4_t vw0, vw1, vw2, vw3;
536 pix0 = vld1_u8(&tmp[off]);
537 off += stride;
538 if (off >= mod)
539 off -= mod;
540 vw0 = vld1_s16(w);
541 w += 4;
542 sum = vmlal_s16(round, vreinterpret_s16_u16(vget_low_u16(vmovl_u8(pix0))), vw0);
543 pix1 = vld1_u8(&tmp[off]);
544 off += stride;
545 if (off >= mod)
546 off -= mod;
547 vw1 = vld1_s16(w);
548 w += 4;
549 sum = vmlal_s16(sum, vreinterpret_s16_u16(vget_low_u16(vmovl_u8(pix1))), vw1);
550 pix2 = vld1_u8(&tmp[off]);
551 off += stride;
552 if (off >= mod)
553 off -= mod;
554 vw2 = vld1_s16(w);
555 w += 4;
556 sum = vmlal_s16(sum, vreinterpret_s16_u16(vget_low_u16(vmovl_u8(pix2))), vw2);
557 pix3 = vld1_u8(&tmp[off]);
558 off += stride;
559 if (off >= mod)
560 off -= mod;
561 vw3 = vld1_s16(w);
562 sum = vmlal_s16(sum, vreinterpret_s16_u16(vget_low_u16(vmovl_u8(pix3))), vw3);
563 }
564 else
565 {
566 sum = round;
567 for ( ; j > 0; j--)
568 {
569 uint8x8_t pix0;
570 int16x4_t vw0;
571 pix0 = vld1_u8(&tmp[off]);
572 off += stride;
573 if (off >= mod)
574 off -= mod;
575 vw0 = vld1_s16(w);
576 w += 4;
577 sum = vmlal_s16(sum, vreinterpret_s16_u16(vget_low_u16(vmovl_u8(pix0))), vw0);
578 }
579 }
580 out16 = vqshrun_n_s32(sum, WEIGHT_SHIFT-8);
581 *dst++ = vget_lane_u8(vreinterpret_u8_u16(out16), 1);
582 if (n > 1)
583 {
584 *dst++ = vget_lane_u8(vreinterpret_u8_u16(out16), 3);
585 if (n > 2)
586 {
587 *dst++ = vget_lane_u8(vreinterpret_u8_u16(out16), 5);
588 if (n > 3)
589 {
590 *dst++ = vget_lane_u8(vreinterpret_u8_u16(out16), 7);
591 }
592 }
593 }
594 }
595 index += n;
596 width -= n;
597 }
598 }
599
600 static void
601 zoom_y3_neon(uint8_t * dst,
602 const uint8_t * FZ_RESTRICT tmp,
603 const index_t * FZ_RESTRICT index,
604 const weight_t * FZ_RESTRICT weights,
605 uint32_t width,
606 uint32_t channels,
607 uint32_t mod,
608 int32_t y)
609 {
610 uint32_t stride = width * 3;
611 uint32_t offset = 0;
612
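/* Unlike zoom_y1_neon, each output pixel is handled on its own here: one
 * weight per tap is splatted across the lanes with vdup_n_s16 and applied to
 * adjacent bytes of the row; only the first three results are written out. */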
613 while (width--)
614 {
615 const weight_t *w = &weights[index->index];
616 uint32_t j = index->n;
617 int32x4_t sum;
618 uint16x4_t out16;
619 uint32_t off = (index->first_pixel + y) * stride + offset;
620 offset += 3;
621 if (off >= mod)
622 off -= mod;
623
624 if (j == 4)
625 {
626 const weight_t *w = &weights[index->index];
627 uint8x8_t pix0, pix1, pix2, pix3;
628 int16x4_t vw0, vw1, vw2, vw3;
629 pix0 = vld1_u8(&tmp[off]);
630 off += stride;
631 if (off >= mod)
632 off -= mod;
633 vw0 = vdup_n_s16(*w++);
634 sum = vmlal_s16(vdupq_n_s32(WEIGHT_ROUND), vreinterpret_s16_u16(vget_low_u16(vmovl_u8(pix0))), vw0);
635 pix1 = vld1_u8(&tmp[off]);
636 off += stride;
637 if (off >= mod)
638 off -= mod;
639 vw1 = vdup_n_s16(*w++);
640 sum = vmlal_s16(sum, vreinterpret_s16_u16(vget_low_u16(vmovl_u8(pix1))), vw1);
641 pix2 = vld1_u8(&tmp[off]);
642 off += stride;
643 if (off >= mod)
644 off -= mod;
645 vw2 = vdup_n_s16(*w++);
646 sum = vmlal_s16(sum, vreinterpret_s16_u16(vget_low_u16(vmovl_u8(pix2))), vw2);
647 pix3 = vld1_u8(&tmp[off]);
648 off += stride;
649 if (off >= mod)
650 off -= mod;
651 vw3 = vdup_n_s16(*w++);
652 sum = vmlal_s16(sum, vreinterpret_s16_u16(vget_low_u16(vmovl_u8(pix3))), vw3);
653 }
654 else
655 {
656 sum = vdupq_n_s32(WEIGHT_ROUND);
657 do
658 {
659 uint8x8_t pix0 = vld1_u8(&tmp[off]);
660 int16x4_t vw0;
661 off += stride;
662 if (off >= mod)
663 off -= mod;
664 vw0 = vdup_n_s16(*w++);
665 sum = vmlal_s16(sum, vreinterpret_s16_u16(vget_low_u16(vmovl_u8(pix0))), vw0);
666 }
667 while (--j);
668 }
669 out16 = vqshrun_n_s32(sum, WEIGHT_SHIFT-8);
670 *dst++ = vget_lane_u8(vreinterpret_u8_u16(out16), 1);
671 *dst++ = vget_lane_u8(vreinterpret_u8_u16(out16), 3);
672 *dst++ = vget_lane_u8(vreinterpret_u8_u16(out16), 5);
673 index++;
674 }
675 }
676
677 static void
678 zoom_y4_neon(uint8_t * dst,
679 const uint8_t * FZ_RESTRICT tmp,
680 const index_t * FZ_RESTRICT index,
681 const weight_t * FZ_RESTRICT weights,
682 uint32_t width,
683 uint32_t channels,
684 uint32_t mod,
685 int32_t y)
686 {
687 uint32_t stride = width * 4;
688 uint32_t offset = 0;
689 int32x4_t round = vdupq_n_s32(WEIGHT_ROUND);
690
691 while (width--)
692 {
693 uint32_t off = (index->first_pixel + y) * stride + offset;
694
695 offset += 4;
696 if (off >= mod)
697 off -= mod;
698
699 {
700 const weight_t *w = &weights[index->index];
701 uint32_t j = index->n;
702 int32x4_t sum;
703 uint16x4_t out16;
704
705 if (j == 4)
706 {
707 uint8x8_t pix0, pix1, pix2, pix3;
708 int16x4_t vw0, vw1, vw2, vw3;
709 pix0 = vld1_u8(&tmp[off]);
710 off += stride;
711 if (off >= mod)
712 off -= mod;
713 vw0 = vdup_n_s16(*w++);
714 sum = vmlal_s16(round, vreinterpret_s16_u16(vget_low_u16(vmovl_u8(pix0))), vw0);
715 pix1 = vld1_u8(&tmp[off]);
716 off += stride;
717 if (off >= mod)
718 off -= mod;
719 vw1 = vdup_n_s16(*w++);
720 sum = vmlal_s16(sum, vreinterpret_s16_u16(vget_low_u16(vmovl_u8(pix1))), vw1);
721 pix2 = vld1_u8(&tmp[off]);
722 off += stride;
723 if (off >= mod)
724 off -= mod;
725 vw2 = vdup_n_s16(*w++);
726 sum = vmlal_s16(sum, vreinterpret_s16_u16(vget_low_u16(vmovl_u8(pix2))), vw2);
727 pix3 = vld1_u8(&tmp[off]);
728 off += stride;
729 if (off >= mod)
730 off -= mod;
731 vw3 = vdup_n_s16(*w++);
732 sum = vmlal_s16(sum, vreinterpret_s16_u16(vget_low_u16(vmovl_u8(pix3))), vw3);
733 }
734 else
735 {
736 sum = round;
737 for ( ; j > 0; j--)
738 {
739 uint8x8_t pix0;
740 int16x4_t vw0;
741 pix0 = vld1_u8(&tmp[off]);
742 off += stride;
743 if (off >= mod)
744 off -= mod;
745 vw0 = vdup_n_s16(*w++);
746 sum = vmlal_s16(sum, vreinterpret_s16_u16(vget_low_u16(vmovl_u8(pix0))), vw0);
747 }
748 }
749 out16 = vqshrun_n_s32(sum, WEIGHT_SHIFT-8);
750 *dst++ = vget_lane_u8(vreinterpret_u8_u16(out16), 1);
751 *dst++ = vget_lane_u8(vreinterpret_u8_u16(out16), 3);
752 *dst++ = vget_lane_u8(vreinterpret_u8_u16(out16), 5);
753 *dst++ = vget_lane_u8(vreinterpret_u8_u16(out16), 7);
754 }
755 index++;
756 }
757 }