comparison mupdf-source/source/fitz/deskew_sse.h @ 2:b50eed0cc0ef upstream

ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4. The directory name has changed: the expanded directory no longer carries a version number.
author Franz Glasner <fzglas.hg@dom66.de>
date Mon, 15 Sep 2025 11:43:07 +0200
comparison
1:1d09e1dec1d9 2:b50eed0cc0ef
1 // Copyright (C) 2004-2024 Artifex Software, Inc.
2 //
3 // This file is part of MuPDF.
4 //
5 // MuPDF is free software: you can redistribute it and/or modify it under the
6 // terms of the GNU Affero General Public License as published by the Free
7 // Software Foundation, either version 3 of the License, or (at your option)
8 // any later version.
9 //
10 // MuPDF is distributed in the hope that it will be useful, but WITHOUT ANY
11 // WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
12 // FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
13 // details.
14 //
15 // You should have received a copy of the GNU Affero General Public License
16 // along with MuPDF. If not, see <https://www.gnu.org/licenses/agpl-3.0.en.html>
17 //
18 // Alternative licensing terms are available from the licensor.
19 // For commercial licensing, see <https://www.artifex.com/> or contact
20 // Artifex Software, Inc., 39 Mesa Street, Suite 108A, San Francisco,
21 // CA 94129, USA, for further information.
22
23 /* This file is included from deskew.c if SSE cores are allowed. */
24
25 #include <emmintrin.h>
26 #include <smmintrin.h>
27
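All of the cores below evaluate the same fixed-point filter: each output sample is a sum of src * weight products, pre-biased with WEIGHT_ROUND and shifted back down by WEIGHT_SHIFT at the end. Those macros, together with CLAMP, index_t and FZ_RESTRICT, come from deskew.c and the headers it includes, not from this file. As a point of reference, a stand-alone scalar sketch of that arithmetic (with illustrative shift values, not the ones MuPDF actually uses) is:

#include <stdint.h>

/* Illustrative stand-ins; the real values are defined by deskew.c. */
#define SKETCH_WEIGHT_SHIFT 12
#define SKETCH_WEIGHT_ROUND (1 << (SKETCH_WEIGHT_SHIFT - 1))

/* One output sample: accumulate src[i] * w[i] in fixed point, round,
 * shift back to pixel range and clamp to a byte. The SSE paths below
 * reproduce this, four 32-bit lanes at a time. */
static uint8_t
sketch_weighted_sample(const uint8_t *src, const int32_t *w, int n)
{
    int32_t acc = SKETCH_WEIGHT_ROUND;
    int i;

    for (i = 0; i < n; i++)
        acc += src[i] * w[i];
    acc >>= SKETCH_WEIGHT_SHIFT;
    return (uint8_t)(acc < 0 ? 0 : acc > 255 ? 255 : acc);
}
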
28 static void
29 zoom_x1_sse(uint8_t * FZ_RESTRICT tmp,
30 const uint8_t * FZ_RESTRICT src,
31 const index_t * FZ_RESTRICT index,
32 const int32_t * FZ_RESTRICT weights,
33 uint32_t dst_w,
34 uint32_t src_w,
35 uint32_t channels,
36 const uint8_t * FZ_RESTRICT bg)
37 {
38 __m128i round = _mm_set1_epi32(WEIGHT_ROUND);
39
40 if (0)
41 slow:
42 {
43 /* Do any where we might index off the edge of the source */
44 int pix_num = index->first_pixel;
45 const uint8_t *s = &src[pix_num];
46 const int32_t *w = &weights[index->index];
47 uint32_t j = index->n;
48 int32_t pixel0 = WEIGHT_ROUND;
49 if (pix_num < 0)
50 {
51 int32_t wt = *w++;
52 assert(pix_num == -1);
53 pixel0 += bg[0] * wt;
54 s++;
55 j--;
56 pix_num = 0;
57 }
58 pix_num = (int)src_w - pix_num;
59 if (pix_num > (int)j)
60 pix_num = j;
61 j -= pix_num;
62 while (pix_num > 0)
63 {
64 pixel0 += *s++ * *w++;
65 pix_num--;
66 }
67 if (j > 0)
68 {
69 assert(j == 1);
70 pixel0 += bg[0] * *w;
71 }
72 pixel0 >>= WEIGHT_SHIFT;
73 *tmp++ = CLAMP(pixel0, 0, 255);
74 index++;
75 dst_w--;
76 }
77
78 while (dst_w > 0)
79 {
80 const uint8_t *s;
81 uint32_t j;
82 const int32_t *w;
83
84 /* Jump out of band to do the (rare) slow (edge) pixels */
85 if (index->slow)
86 goto slow;
87
88 s = &src[index->first_pixel];
89 j = index->n;
90 w = &weights[index->index];
91 if (j <= 4)
92 {
93 __m128i mw0, mm0;
94 mw0 = _mm_load_si128((const __m128i *)w);
95 mm0 = _mm_loadu_si128((const __m128i *)s);
96 // mm0 = ppoonnmmllkkjjiihhggffeeddccbbaa SSE2
97 mm0 = _mm_cvtepu8_epi32(mm0); // mm0 = 000000dd000000cc000000bb000000aa SSE4.1
98 mm0 = _mm_mullo_epi32(mm0,mw0); // mm0 = 0000wdwd0000wcwc0000wbwb0000wawa SSE4.1
99 mm0 = _mm_hadd_epi32(mm0,mm0);
100 mm0 = _mm_hadd_epi32(mm0,mm0);
101 mm0 = _mm_add_epi32(mm0, round);// Add round
102 mm0 = _mm_srai_epi32(mm0, WEIGHT_SHIFT-8); // Shift down
103 mm0 = _mm_packus_epi32(mm0,mm0);// Clamp to 0 to 65535 range.
104 *tmp++ = _mm_extract_epi8(mm0,1);
105 }
106 else if (j <= 8)
107 {
108 __m128i mw0, mw1, mm0, mm1;
109 mw0 = _mm_load_si128((const __m128i *)w);
110 mm0 = _mm_loadu_si128((const __m128i *)s);
111 // mm0 = ppoonnmmllkkjjiihhggffeeddccbbaa SSE2
112 mw1 = _mm_load_si128((const __m128i *)(w+4));
113 mm1 = _mm_cvtepu8_epi32(mm0); // mm1 = 000000dd000000cc000000bb000000aa SSE4.1
114 mm0 = _mm_srli_si128(mm0, 4); // mm0 = 000000ppoonnmmllkkjjiihhggffeedd SSE2
115 mm1 = _mm_mullo_epi32(mm1,mw0); // mm1 = 0000wdwd0000wcwc0000wbwb0000wawa SSE4.1
116 mm0 = _mm_cvtepu8_epi32(mm0); // mm0 = 000000hh000000gg000000ff000000ee SSE4.1
117 mm0 = _mm_mullo_epi32(mm0,mw1); // mm0 = 0000whwh0000wgwg0000wfwf0000wewe SSE4.1
118 mm1 = _mm_add_epi32(mm1, mm0);
119 mm1 = _mm_hadd_epi32(mm1,mm1);
120 mm1 = _mm_hadd_epi32(mm1,mm1);
121 mm1 = _mm_add_epi32(mm1, round); // Add round
122 mm1 = _mm_srai_epi32(mm1, WEIGHT_SHIFT-8); // Shift down
123 mm1 = _mm_packus_epi32(mm1,mm1); // Clamp to 0 to 65535 range.
124 *tmp++ = _mm_extract_epi8(mm1,1);
125 }
126 else
127 {
128 int32_t pixel0 = WEIGHT_ROUND;
129 for (j = index->n; j > 0; j--)
130 {
131 pixel0 += *s++ * *w++;
132 }
133 pixel0 >>= WEIGHT_SHIFT;
134 *tmp++ = CLAMP(pixel0, 0, 255);
135 }
136 index++;
137 dst_w--;
138 }
139 }
140
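zoom_x1_sse and the other horizontal cores share an unusual control-flow idiom for the rare edge pixels: the slow handler sits in an "if (0) slow: { ... }" block ahead of the main loop, so it is skipped on entry, is reached only through the goto inside the loop, and falls straight back into the loop after finishing one pixel. A minimal skeleton of that shape, stripped of the SSE details:

/* The block after "if (0)" never runs on entry; "goto slow" jumps into it
 * from the loop, and after one slow pixel control falls through into the
 * while loop again to re-test the remaining count. */
static void
sketch_slow_fast_loop(const unsigned char *flags, unsigned char *out, int n)
{
    if (0)
slow:
    {
        *out++ = 0;     /* stands in for the careful edge-pixel handling */
        flags++;
        n--;
    }

    while (n > 0)
    {
        if (*flags)
            goto slow;  /* rare case: jump out of band */
        *out++ = 1;     /* stands in for the vectorised fast path */
        flags++;
        n--;
    }
}
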
141 static void
142 zoom_x3_sse(uint8_t * FZ_RESTRICT tmp,
143 const uint8_t * FZ_RESTRICT src,
144 const index_t * FZ_RESTRICT index,
145 const int32_t * FZ_RESTRICT weights,
146 uint32_t dst_w,
147 uint32_t src_w,
148 uint32_t channels,
149 const uint8_t * FZ_RESTRICT bg)
150 {
151 __m128i round = _mm_set1_epi32(WEIGHT_ROUND);
152
153 if (0)
154 slow:
155 {
156 /* Do any where we might index off the edge of the source */
157 int pix_num = index->first_pixel;
158 const uint8_t *s = &src[pix_num * 3];
159 const int32_t *w = &weights[index->index];
160 uint32_t j = index->n;
161 int32_t pixel0 = WEIGHT_ROUND;
162 int32_t pixel1 = WEIGHT_ROUND;
163 int32_t pixel2 = WEIGHT_ROUND;
164 if (pix_num < 0)
165 {
166 int32_t wt = *w++;
167 assert(pix_num == -1);
168 pixel0 += bg[0] * wt;
169 pixel1 += bg[1] * wt;
170 pixel2 += bg[2] * wt;
171 s += 3;
172 j--;
173 pix_num = 0;
174 }
175 pix_num = (int)src_w - pix_num;
176 if (pix_num > (int)j)
177 pix_num = j;
178 j -= pix_num;
179 while (pix_num > 0)
180 {
181 int32_t wt = *w++;
182 pixel0 += *s++ * wt;
183 pixel1 += *s++ * wt;
184 pixel2 += *s++ * wt;
185 pix_num--;
186 }
187 if (j > 0)
188 {
189 int32_t wt = *w++;
190 assert(j == 1);
191 pixel0 += bg[0] * wt;
192 pixel1 += bg[1] * wt;
193 pixel2 += bg[2] * wt;
194 }
195 pixel0 >>= WEIGHT_SHIFT;
196 pixel1 >>= WEIGHT_SHIFT;
197 pixel2 >>= WEIGHT_SHIFT;
198 *tmp++ = CLAMP(pixel0, 0, 255);
199 *tmp++ = CLAMP(pixel1, 0, 255);
200 *tmp++ = CLAMP(pixel2, 0, 255);
201 index++;
202 dst_w--;
203 }
204
205 while (dst_w > 0)
206 {
207 const uint8_t *s;
208 int j;
209 const int32_t *w;
210 __m128i mm0, mm1, mm4, mw0, mw1;
211
212 /* Jump out of band to do the (rare) slow (edge) pixels */
213 if (index->slow)
214 goto slow;
215
216 s = &src[index->first_pixel * 3];
217 j = (int)index->n;
218 w = &weights[index->index];
219
220 mm4 = round;
221 mm0 = _mm_loadu_si128((const __m128i *)s); // mm0 = ppoonnmmllkkjjiihhggffeeddccbbaa SSE2
222 if (j == 4)
223 {
224 mw0 = _mm_load_si128((const __m128i *)w);
225 mw1 = _mm_shuffle_epi32(mw0, 0 + (0 << 2) + (0 << 4) + (0 << 6));
226 mm1 = _mm_cvtepu8_epi32(mm0); // mm1 = 000000dd000000cc000000bb000000aa SSE4.1
227 mm1 = _mm_mullo_epi32(mm1, mw1);// mm1 = 0000wdwd0000wcwc0000wbwb0000wawa SSE4.1
228 mm4 = _mm_add_epi32(mm4, mm1); // mm4 = 0000xxxx0000xxxx0000xxxx0000xxxx SSE2
229 mm0 = _mm_srli_si128(mm0, 3); // mm0 = 000000ppoonnmmllkkjjiihhggffeedd SSE2
230 mw1 = _mm_shuffle_epi32(mw0, 1 + (1 << 2) + (1 << 4) + (1 << 6));
231 mm1 = _mm_cvtepu8_epi32(mm0); // mm1 = 000000dd000000cc000000bb000000aa SSE4.1
232 mm1 = _mm_mullo_epi32(mm1, mw1);// mm1 = 0000wdwd0000wcwc0000wbwb0000wawa SSE4.1
233 mm4 = _mm_add_epi32(mm4, mm1); // mm4 = 0000xxxx0000xxxx0000xxxx0000xxxx SSE2
234 mm0 = _mm_srli_si128(mm0, 3); // mm0 = 000000ppoonnmmllkkjjiihhggffeedd SSE2
235 mw1 = _mm_shuffle_epi32(mw0, 2 + (2 << 2) + (2 << 4) + (2 << 6));
236 mm1 = _mm_cvtepu8_epi32(mm0); // mm1 = 000000dd000000cc000000bb000000aa SSE4.1
237 mm1 = _mm_mullo_epi32(mm1, mw1);// mm1 = 0000wdwd0000wcwc0000wbwb0000wawa SSE4.1
238 mm4 = _mm_add_epi32(mm4, mm1); // mm4 = 0000xxxx0000xxxx0000xxxx0000xxxx SSE2
239 mm0 = _mm_srli_si128(mm0, 3); // mm0 = 000000ppoonnmmllkkjjiihhggffeedd SSE2
240 mw1 = _mm_shuffle_epi32(mw0, 3 + (3 << 2) + (3 << 4) + (3 << 6));
241 mm1 = _mm_cvtepu8_epi32(mm0); // mm1 = 000000dd000000cc000000bb000000aa SSE4.1
242 mm1 = _mm_mullo_epi32(mm1, mw1);// mm1 = 0000wdwd0000wcwc0000wbwb0000wawa SSE4.1
243 mm4 = _mm_add_epi32(mm4, mm1); // mm4 = 0000xxxx0000xxxx0000xxxx0000xxxx SSE2
244 }
245 else
246 {
247 int off = j & 3;
248 w -= (4 - j) & 3;
249 s += (off ? off : 4) * 3;
250 mw0 = _mm_loadu_si128((const __m128i *)w);
251 w += 4;
252 /* This is a use of Duff's Device. I'm very sorry, but on the other hand, Yay! */
253 switch (off)
254 {
255 do
256 {
257 mm0 = _mm_loadu_si128((const __m128i *)s);
258 // mm0 = ppoonnmmllkkjjiihhggffeeddccbbaa SSE2
259 s += 4 * 3;
260 mw0 = _mm_load_si128((const __m128i *)w);
261 w += 4;
262 case 0:
263 mw1 = _mm_shuffle_epi32(mw0, 0 + (0 << 2) + (0 << 4) + (0 << 6));
264 mm1 = _mm_cvtepu8_epi32(mm0); // mm1 = 000000dd000000cc000000bb000000aa SSE4.1
265 mm1 = _mm_mullo_epi32(mm1, mw1);// mm1 = 0000wdwd0000wcwc0000wbwb0000wawa SSE4.1
266 mm4 = _mm_add_epi32(mm4, mm1); // mm4 = 0000xxxx0000xxxx0000xxxx0000xxxx SSE2
267 mm0 = _mm_srli_si128(mm0, 3); // mm0 = 000000ppoonnmmllkkjjiihhggffeedd SSE2
268 case 3:
269 mw1 = _mm_shuffle_epi32(mw0, 1 + (1 << 2) + (1 << 4) + (1 << 6));
270 mm1 = _mm_cvtepu8_epi32(mm0); // mm1 = 000000dd000000cc000000bb000000aa SSE4.1
271 mm1 = _mm_mullo_epi32(mm1, mw1);// mm1 = 0000wdwd0000wcwc0000wbwb0000wawa SSE4.1
272 mm4 = _mm_add_epi32(mm4, mm1); // mm4 = 0000xxxx0000xxxx0000xxxx0000xxxx SSE2
273 mm0 = _mm_srli_si128(mm0, 3); // mm0 = 000000ppoonnmmllkkjjiihhggffeedd SSE2
274 case 2:
275 mw1 = _mm_shuffle_epi32(mw0, 2 + (2 << 2) + (2 << 4) + (2 << 6));
276 mm1 = _mm_cvtepu8_epi32(mm0); // mm1 = 000000dd000000cc000000bb000000aa SSE4.1
277 mm1 = _mm_mullo_epi32(mm1, mw1);// mm1 = 0000wdwd0000wcwc0000wbwb0000wawa SSE4.1
278 mm4 = _mm_add_epi32(mm4, mm1); // mm4 = 0000xxxx0000xxxx0000xxxx0000xxxx SSE2
279 mm0 = _mm_srli_si128(mm0, 3); // mm0 = 000000ppoonnmmllkkjjiihhggffeedd SSE2
280 case 1:
281 mw1 = _mm_shuffle_epi32(mw0, 3 + (3 << 2) + (3 << 4) + (3 << 6));
282 mm1 = _mm_cvtepu8_epi32(mm0); // mm1 = 000000dd000000cc000000bb000000aa SSE4.1
283 mm1 = _mm_mullo_epi32(mm1, mw1);// mm1 = 0000wdwd0000wcwc0000wbwb0000wawa SSE4.1
284 mm4 = _mm_add_epi32(mm4, mm1); // mm4 = 0000xxxx0000xxxx0000xxxx0000xxxx SSE2
285 j -= 4;
286 } while (j > 0);
287 }
288 }
289 #if 0
290 mm4 = _mm_srai_epi32(mm4, WEIGHT_SHIFT - 8); // Shift down
291 mm4 = _mm_packus_epi32(mm4, mm4); // Clamp to 0 to 65535 range.
292 *tmp++ = _mm_extract_epi8(mm4, 1);
293 *tmp++ = _mm_extract_epi8(mm4, 3);
294 *tmp++ = _mm_extract_epi8(mm4, 5);
295 #else
296 mm4 = _mm_srai_epi32(mm4, WEIGHT_SHIFT); // Shift down
297 mm4 = _mm_packus_epi32(mm4, mm4); // Clamp to 0 to 65535 range.
298 mm4 = _mm_packus_epi16(mm4, mm4); // Clamp to 0 to 255 range.
299 j = _mm_extract_epi32(mm4, 0);
300 *(int16_t *)tmp = j;
301 ((int8_t *)tmp)[2] = j>>16;
302 tmp += 3;
303 #endif
304 index++;
305 dst_w--;
306 }
307
308 while (dst_w > 0)
309 {
310 const uint8_t *s;
311
312 /* Jump out of band to do the (rare) slow (edge) pixels */
313 if (index->slow)
314 goto slow;
315
316 s = &src[index->first_pixel * 3];
317
318 {
319 const int32_t *w = &weights[index->index];
320 uint32_t j = index->n;
321 int32_t pixel0 = WEIGHT_ROUND;
322 int32_t pixel1 = WEIGHT_ROUND;
323 int32_t pixel2 = WEIGHT_ROUND;
324 for (j = index->n; j > 0; j--)
325 {
326 int32_t wt = *w++;
327 pixel0 += *s++ * wt;
328 pixel1 += *s++ * wt;
329 pixel2 += *s++ * wt;
330 }
331 pixel0 >>= WEIGHT_SHIFT;
332 pixel1 >>= WEIGHT_SHIFT;
333 pixel2 >>= WEIGHT_SHIFT;
334 *tmp++ = CLAMP(pixel0, 0, 255);
335 *tmp++ = CLAMP(pixel1, 0, 255);
336 *tmp++ = CLAMP(pixel2, 0, 255);
337 }
338 index++;
339 dst_w--;
340 }
341 }
342
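The else branch above, and its twin in zoom_x4_sse below, walks the weights four at a time with Duff's Device: the switch jumps into the middle of the do/while, so the first pass consumes only the n & 3 leftover taps and every later pass runs all four cases. Backing the weight pointer up by (4 - n) & 3 entries first puts the valid weights of the partial first group in the trailing lanes, so the next load starts a full group; the leading entries skipped over by the jump are never used. A scalar skeleton of the same structure:

/* Sum v[0..n-1] (n >= 1), four entries per pass with the remainder handled
 * first, using the same switch-into-do/while shape as the cores above. */
static int
sketch_duff_sum(const int *v, int n)
{
    int acc = 0;
    int off = n & 3;

    v -= (4 - n) & 3;   /* back up so the partial first pass ends aligned */
    switch (off)
    {
        do
        {
        case 0:
            acc += v[0];
        case 3:
            acc += v[1];
        case 2:
            acc += v[2];
        case 1:
            acc += v[3];
            v += 4;
            n -= 4;
        } while (n > 0);
    }
    return acc;
}
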
343 static void
344 zoom_x4_sse(uint8_t * FZ_RESTRICT tmp,
345 const uint8_t * FZ_RESTRICT src,
346 const index_t * FZ_RESTRICT index,
347 const int32_t * FZ_RESTRICT weights,
348 uint32_t dst_w,
349 uint32_t src_w,
350 uint32_t channels,
351 const uint8_t * FZ_RESTRICT bg)
352 {
353 __m128i round = _mm_set1_epi32(WEIGHT_ROUND);
354
355 if (0)
356 slow:
357 {
358 /* Do any where we might index off the edge of the source */
359 int pn = index->first_pixel;
360 const uint8_t *s = &src[pn * 4];
361 const int32_t *w = &weights[index->index];
362 uint32_t j = index->n;
363 int32_t pixel0 = WEIGHT_ROUND;
364 int32_t pixel1 = WEIGHT_ROUND;
365 int32_t pixel2 = WEIGHT_ROUND;
366 int32_t pixel3 = WEIGHT_ROUND;
367 int pix_num = pn;
368 if (pix_num < 0)
369 {
370 int32_t wt = *w++;
371 assert(pix_num == -1);
372 pixel0 += bg[0] * wt;
373 pixel1 += bg[1] * wt;
374 pixel2 += bg[2] * wt;
375 pixel3 += bg[3] * wt;
376 s += 4;
377 j--;
378 pix_num = 0;
379 }
380 pix_num = (int)src_w - pix_num;
381 if (pix_num > (int)j)
382 pix_num = j;
383 j -= pix_num;
384 while (pix_num > 0)
385 {
386 int32_t wt = *w++;
387 pixel0 += *s++ * wt;
388 pixel1 += *s++ * wt;
389 pixel2 += *s++ * wt;
390 pixel3 += *s++ * wt;
391 pix_num--;
392 }
393 if (j > 0)
394 {
395 int32_t wt = *w;
396 assert(j == 1);
397 pixel0 += bg[0] * wt;
398 pixel1 += bg[1] * wt;
399 pixel2 += bg[2] * wt;
400 pixel3 += bg[3] * wt;
401 }
402 pixel0 >>= WEIGHT_SHIFT;
403 pixel1 >>= WEIGHT_SHIFT;
404 pixel2 >>= WEIGHT_SHIFT;
405 pixel3 >>= WEIGHT_SHIFT;
406 *tmp++ = CLAMP(pixel0, 0, 255);
407 *tmp++ = CLAMP(pixel1, 0, 255);
408 *tmp++ = CLAMP(pixel2, 0, 255);
409 *tmp++ = CLAMP(pixel3, 0, 255);
410 index++;
411 dst_w--;
412 }
413
414 while (dst_w > 0)
415 {
416 const uint8_t *s;
417 int j;
418 const int32_t *w;
419 __m128i mm0, mm1, mm4, mw0, mw1;
420
421 /* Jump out of band to do the (rare) slow (edge) pixels */
422 if (index->slow)
423 goto slow;
424
425 s = &src[index->first_pixel * 4];
426 j = (int)index->n;
427 w = &weights[index->index];
428
429 mm4 = round;
430 mm0 = _mm_loadu_si128((const __m128i *)s); // mm0 = ppoonnmmllkkjjiihhggffeeddccbbaa SSE2
431 if (j == 4)
432 {
433 mw0 = _mm_load_si128((const __m128i *)w);
434 mw1 = _mm_shuffle_epi32(mw0, 0 + (0 << 2) + (0 << 4) + (0 << 6));
435 mm1 = _mm_cvtepu8_epi32(mm0); // mm1 = 000000dd000000cc000000bb000000aa SSE4.1
436 mm1 = _mm_mullo_epi32(mm1, mw1);// mm1 = 0000wdwd0000wcwc0000wbwb0000wawa SSE4.1
437 mm4 = _mm_add_epi32(mm4, mm1); // mm4 = 0000xxxx0000xxxx0000xxxx0000xxxx SSE2
438 mm0 = _mm_srli_si128(mm0, 4); // mm0 = 000000ppoonnmmllkkjjiihhggffeedd SSE2
439 mw1 = _mm_shuffle_epi32(mw0, 1 + (1 << 2) + (1 << 4) + (1 << 6));
440 mm1 = _mm_cvtepu8_epi32(mm0); // mm1 = 000000dd000000cc000000bb000000aa SSE4.1
441 mm1 = _mm_mullo_epi32(mm1, mw1);// mm1 = 0000wdwd0000wcwc0000wbwb0000wawa SSE4.1
442 mm4 = _mm_add_epi32(mm4, mm1); // mm4 = 0000xxxx0000xxxx0000xxxx0000xxxx SSE2
443 mm0 = _mm_srli_si128(mm0, 4); // mm0 = 000000ppoonnmmllkkjjiihhggffeedd SSE2
444 mw1 = _mm_shuffle_epi32(mw0, 2 + (2 << 2) + (2 << 4) + (2 << 6));
445 mm1 = _mm_cvtepu8_epi32(mm0); // mm1 = 000000dd000000cc000000bb000000aa SSE4.1
446 mm1 = _mm_mullo_epi32(mm1, mw1);// mm1 = 0000wdwd0000wcwc0000wbwb0000wawa SSE4.1
447 mm4 = _mm_add_epi32(mm4, mm1); // mm4 = 0000xxxx0000xxxx0000xxxx0000xxxx SSE2
448 mm0 = _mm_srli_si128(mm0, 4); // mm0 = 000000ppoonnmmllkkjjiihhggffeedd SSE2
449 mw1 = _mm_shuffle_epi32(mw0, 3 + (3 << 2) + (3 << 4) + (3 << 6));
450 mm1 = _mm_cvtepu8_epi32(mm0); // mm1 = 000000dd000000cc000000bb000000aa SSE4.1
451 mm1 = _mm_mullo_epi32(mm1, mw1);// mm1 = 0000wdwd0000wcwc0000wbwb0000wawa SSE4.1
452 mm4 = _mm_add_epi32(mm4, mm1); // mm4 = 0000xxxx0000xxxx0000xxxx0000xxxx SSE2
453 }
454 else
455 {
456 int off = j & 3;
457 w -= (4 - j) & 3;
458 s += (off ? off : 4) * 4;
459 mw0 = _mm_loadu_si128((const __m128i *)w);
460 w += 4;
461 /* This is a use of Duff's Device. I'm very sorry, but on the other hand, Yay! */
462 switch (off)
463 {
464 do
465 {
466 mm0 = _mm_loadu_si128((const __m128i *)s);
467 // mm0 = ppoonnmmllkkjjiihhggffeeddccbbaa SSE2
468 s += 4 * 4;
469 mw0 = _mm_load_si128((const __m128i *)w);
470 w += 4;
471 case 0:
472 mw1 = _mm_shuffle_epi32(mw0, 0 + (0 << 2) + (0 << 4) + (0 << 6));
473 mm1 = _mm_cvtepu8_epi32(mm0); // mm1 = 000000dd000000cc000000bb000000aa SSE4.1
474 mm1 = _mm_mullo_epi32(mm1, mw1);// mm1 = 0000wdwd0000wcwc0000wbwb0000wawa SSE4.1
475 mm4 = _mm_add_epi32(mm4, mm1); // mm4 = 0000xxxx0000xxxx0000xxxx0000xxxx SSE2
476 mm0 = _mm_srli_si128(mm0, 4); // mm0 = 000000ppoonnmmllkkjjiihhggffeedd SSE2
477 case 3:
478 mw1 = _mm_shuffle_epi32(mw0, 1 + (1 << 2) + (1 << 4) + (1 << 6));
479 mm1 = _mm_cvtepu8_epi32(mm0); // mm1 = 000000dd000000cc000000bb000000aa SSE4.1
480 mm1 = _mm_mullo_epi32(mm1, mw1);// mm1 = 0000wdwd0000wcwc0000wbwb0000wawa SSE4.1
481 mm4 = _mm_add_epi32(mm4, mm1); // mm4 = 0000xxxx0000xxxx0000xxxx0000xxxx SSE2
482 mm0 = _mm_srli_si128(mm0, 4); // mm0 = 000000ppoonnmmllkkjjiihhggffeedd SSE2
483 case 2:
484 mw1 = _mm_shuffle_epi32(mw0, 2 + (2 << 2) + (2 << 4) + (2 << 6));
485 mm1 = _mm_cvtepu8_epi32(mm0); // mm1 = 000000dd000000cc000000bb000000aa SSE4.1
486 mm1 = _mm_mullo_epi32(mm1, mw1);// mm1 = 0000wdwd0000wcwc0000wbwb0000wawa SSE4.1
487 mm4 = _mm_add_epi32(mm4, mm1); // mm4 = 0000xxxx0000xxxx0000xxxx0000xxxx SSE2
488 mm0 = _mm_srli_si128(mm0, 4); // mm0 = 000000ppoonnmmllkkjjiihhggffeedd SSE2
489 case 1:
490 mw1 = _mm_shuffle_epi32(mw0, 3 + (3 << 2) + (3 << 4) + (3 << 6));
491 mm1 = _mm_cvtepu8_epi32(mm0); // mm1 = 000000dd000000cc000000bb000000aa SSE4.1
492 mm1 = _mm_mullo_epi32(mm1, mw1);// mm1 = 0000wdwd0000wcwc0000wbwb0000wawa SSE4.1
493 mm4 = _mm_add_epi32(mm4, mm1); // mm4 = 0000xxxx0000xxxx0000xxxx0000xxxx SSE2
494 j -= 4;
495 } while (j > 0);
496 }
497 }
498 #if 0
499 mm4 = _mm_srai_epi32(mm4, WEIGHT_SHIFT - 8); // Shift down
500 mm4 = _mm_packus_epi32(mm4,mm4); // Clamp to 0 to 65535 range.
501 *tmp++ = _mm_extract_epi8(mm4,1);
502 *tmp++ = _mm_extract_epi8(mm4,3);
503 *tmp++ = _mm_extract_epi8(mm4,5);
504 *tmp++ = _mm_extract_epi8(mm4,7);
505 #else
506 mm4 = _mm_srai_epi32(mm4, WEIGHT_SHIFT); // Shift down
507 mm4 = _mm_packus_epi32(mm4,mm4); // Clamp to 0 to 65535 range.
508 mm4 = _mm_packus_epi16(mm4,mm4); // Clamp to 0 to 255 range.
509 *(int32_t *)tmp = _mm_extract_epi32(mm4,0);
510 tmp += 4;
511 #endif
512 index++;
513 dst_w--;
514 }
515 }
516
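One small trick recurs throughout the multi-channel cores: the immediate k + (k << 2) + (k << 4) + (k << 6) fills all four 2-bit selector fields of _mm_shuffle_epi32 with the same lane number k, broadcasting a single 32-bit weight across the register so that one weight can multiply every channel of one source pixel. For lane 2, for instance, the immediate works out to 0xAA, the same value _MM_SHUFFLE(2, 2, 2, 2) produces:

/* Broadcast 32-bit lane 2 of w to all four lanes; the cores above build the
 * same immediate as k + (k << 2) + (k << 4) + (k << 6) for k = 0..3. */
static __m128i
sketch_broadcast_lane2(__m128i w)
{
    return _mm_shuffle_epi32(w, 2 + (2 << 2) + (2 << 4) + (2 << 6));
}
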
517 static void
518 zoom_y1_sse(uint8_t * dst,
519 const uint8_t * FZ_RESTRICT tmp,
520 const index_t * FZ_RESTRICT index,
521 const int32_t * FZ_RESTRICT weights,
522 uint32_t width,
523 uint32_t channels,
524 uint32_t mod,
525 int32_t y)
526 {
527 uint32_t stride = width;
528 uint32_t offset = 0;
529 const __m128i *mm_weights = (const __m128i *)weights;
530 const __m128i mm_weight_round = _mm_set1_epi32(WEIGHT_ROUND);
531
532 if (0)
533 slow:
534 {
535 uint32_t off = (index->first_pixel + y) * stride + offset;
536
537 offset++;
538 if (off >= mod)
539 off -= mod;
540
541 {
542 const int32_t *w = (const int32_t *)&mm_weights[index->index];
543 uint32_t j;
544 int32_t pixel0 = WEIGHT_ROUND;
545
546 for (j = index->n; j > 0; j--)
547 {
548 pixel0 += tmp[off] * *w;
549 w += 4;
550 off += stride;
551 if (off >= mod)
552 off -= mod;
553 }
554 pixel0 >>= WEIGHT_SHIFT;
555 *dst++ = CLAMP(pixel0, 0, 255);
556 }
557 index++;
558 width--;
559 }
560
561 while ((int)width > 0)
562 {
563 uint32_t off;
564 /* The slow flag stops us accessing off the end of the source row.
565 * It also tells us how many pixels we can do at once. This usage
566 * is different for zoom_y1 than for all other cores. */
567 uint8_t n = index->slow;
568 if (n <= 1 || n > width)
569 goto slow;
570 off = (index->first_pixel + y) * stride + offset;
571 offset += n;
572 if (off >= mod)
573 off -= mod;
574
575 {
576 const __m128i *w = &mm_weights[index->index];
577 uint32_t j = index->n;
578 __m128i mm_pixels = mm_weight_round;
579
580 if (j == 4)
581 {
582 __m128i pix0, pix1, pix2;
583 __m128i w0, w1, w2;
584 pix0 = _mm_loadu_si128((const __m128i *)&tmp[off]);
585 off += stride;
586 if (off >= mod)
587 off -= mod;
588 w0 = _mm_load_si128(w++);
589 pix0 = _mm_cvtepu8_epi32(pix0);
590 pix1 = _mm_loadu_si128((const __m128i *)&tmp[off]);
591 off += stride;
592 if (off >= mod)
593 off -= mod;
594 pix0 = _mm_mullo_epi32(pix0, w0);
595 w1 = _mm_load_si128(w++);
596 pix1 = _mm_cvtepu8_epi32(pix1);
597 pix2 = _mm_loadu_si128((const __m128i *)&tmp[off]);
598 off += stride;
599 if (off >= mod)
600 off -= mod;
601 mm_pixels = _mm_add_epi32(mm_pixels, pix0);
602 pix1 = _mm_mullo_epi32(pix1, w1);
603 w2 = _mm_load_si128(w++);
604 pix2 = _mm_cvtepu8_epi32(pix2);
605 pix0 = _mm_loadu_si128((const __m128i *)&tmp[off]);
606 off += stride;
607 if (off >= mod)
608 off -= mod;
609 mm_pixels = _mm_add_epi32(mm_pixels, pix1);
610 pix2 = _mm_mullo_epi32(pix2, w2);
611 w0 = _mm_load_si128(w++);
612 pix0 = _mm_cvtepu8_epi32(pix0);
613 pix0 = _mm_mullo_epi32(pix0, w0);
614 mm_pixels = _mm_add_epi32(mm_pixels, pix2);
615 mm_pixels = _mm_add_epi32(mm_pixels, pix0);
616 }
617 else
618 for ( ; j > 0; j--)
619 {
620 __m128i pix0 = _mm_loadu_si128((const __m128i *)&tmp[off]);
621 __m128i w0 = _mm_load_si128(w++);
622 pix0 = _mm_cvtepu8_epi32(pix0);
623 off += stride;
624 pix0 = _mm_mullo_epi32(pix0, w0);
625 if (off >= mod)
626 off -= mod;
627 mm_pixels = _mm_add_epi32(mm_pixels, pix0);
628 }
629 mm_pixels = _mm_srli_epi32(mm_pixels, WEIGHT_SHIFT);
630 mm_pixels = _mm_packus_epi32(mm_pixels, mm_pixels); // Clamp to 0 to 65535 range.
631 mm_pixels = _mm_packus_epi16(mm_pixels, mm_pixels); // Clamp to 0 to 255 range.
632 j = _mm_extract_epi32(mm_pixels, 0);
633 switch (n)
634 {
635 default:
636 case 4:
637 *(int32_t *)dst = j;
638 dst += 4;
639 break;
640 case 3:
641 *(int16_t *)dst = j;
642 ((uint8_t *)dst)[2] = j >> 16;
643 dst += 3;
644 break;
645 case 2:
646 *(int16_t *)dst = j;
647 dst += 2;
648 break;
649 case 1:
650 *(int8_t *)dst = j;
651 dst += 1;
652 break;
653 }
654 }
655 index += n;
656 width -= n;
657 }
658 }
659
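The vertical cores step down a column of the temporary buffer rather than along a row: off advances by stride per tap and wraps at mod, because tmp is a circular buffer holding only as many rows as the filter needs. Two further details are visible in zoom_y1_sse above: its weights are laid out in groups of four 32-bit values per tap (the scalar slow path reads only the first of each group, hence w += 4), and index->slow counts how many consecutive output pixels can be handled in one SSE pass instead of acting as a flag. A scalar sketch of one vertical sample, reusing the SKETCH_ macros from the top of the file:

/* One vertical output sample: walk down the circular tmp buffer, wrapping
 * at mod, with this core's weights spaced w_stride ints apart (4 for the
 * zoom_y1 layout, 1 for a plain array). */
static uint8_t
sketch_vertical_sample(const uint8_t *tmp, uint32_t off, uint32_t stride,
    uint32_t mod, const int32_t *w, int w_stride, int n)
{
    int32_t acc = SKETCH_WEIGHT_ROUND;
    int i;

    for (i = 0; i < n; i++)
    {
        acc += tmp[off] * w[i * w_stride];
        off += stride;
        if (off >= mod)     /* cheaper than a modulo: never more than one wrap */
            off -= mod;
    }
    acc >>= SKETCH_WEIGHT_SHIFT;
    return (uint8_t)(acc < 0 ? 0 : acc > 255 ? 255 : acc);
}
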
660 static void
661 zoom_y3_sse(uint8_t * dst,
662 const uint8_t * FZ_RESTRICT tmp,
663 const index_t * FZ_RESTRICT index,
664 const int32_t * FZ_RESTRICT weights,
665 uint32_t width,
666 uint32_t channels,
667 uint32_t mod,
668 int32_t y)
669 {
670 uint32_t stride = width * 3;
671 uint32_t offset = 0;
672 __m128i round = _mm_set1_epi32(WEIGHT_ROUND);
673
674 while (width--)
675 {
676 uint32_t off = (index->first_pixel + y) * stride + offset;
677
678 offset += 3;
679 if (off >= mod)
680 off -= mod;
681
682 {
683 const int32_t *w = &weights[index->index];
684 int32_t j = (int32_t)index->n;
685 __m128i mm0, mm1, mm2, mw0, mw1;
686
687 if (j == 4)
688 {
689 mw0 = _mm_load_si128((const __m128i *)w);
690 mm0 = _mm_loadu_si128((const __m128i *)&tmp[off]);
691 // mm0 = ppoonnmmllkkjjiihhggffeeddccbbaa SSE2
692 mw1 = _mm_shuffle_epi32(mw0, 0 + (0 << 2) + (0 << 4) + (0 << 6));
693 mm0 = _mm_cvtepu8_epi32(mm0); // mm0 = 000000dd000000cc000000bb000000aa SSE4.1
694 mm0 = _mm_mullo_epi32(mm0,mw1); // mm0 = 0000wdwd0000wcwc0000wbwb0000wawa SSE4.1
695 mm1 = _mm_add_epi32(round, mm0);// mm1 = 0000xxxx0000xxxx0000xxxx0000xxxx SSE2
696 off += stride;
697 if (off >= mod)
698 off -= mod;
699 mm0 = _mm_loadu_si128((const __m128i *)&tmp[off]);
700 // mm0 = ppoonnmmllkkjjiihhggffeeddccbbaa SSE2
701 mw1 = _mm_shuffle_epi32(mw0, 1 + (1 << 2) + (1 << 4) + (1 << 6));
702 mm0 = _mm_cvtepu8_epi32(mm0); // mm0 = 000000dd000000cc000000bb000000aa SSE4.1
703 mm0 = _mm_mullo_epi32(mm0,mw1); // mm0 = 0000wdwd0000wcwc0000wbwb0000wawa SSE4.1
704 mm1 = _mm_add_epi32(mm1, mm0); // mm1 = 0000xxxx0000xxxx0000xxxx0000xxxx SSE2
705 off += stride;
706 if (off >= mod)
707 off -= mod;
708 mm0 = _mm_loadu_si128((const __m128i *)&tmp[off]);
709 // mm0 = ppoonnmmllkkjjiihhggffeeddccbbaa SSE2
710 mw1 = _mm_shuffle_epi32(mw0, 2 + (2 << 2) + (2 << 4) + (2 << 6));
711 mm0 = _mm_cvtepu8_epi32(mm0); // mm0 = 000000dd000000cc000000bb000000aa SSE4.1
712 mm0 = _mm_mullo_epi32(mm0,mw1); // mm0 = 0000wdwd0000wcwc0000wbwb0000wawa SSE4.1
713 mm1 = _mm_add_epi32(mm1, mm0); // mm1 = 0000xxxx0000xxxx0000xxxx0000xxxx SSE2
714 off += stride;
715 if (off >= mod)
716 off -= mod;
717 mm0 = _mm_loadu_si128((const __m128i *)&tmp[off]);
718 // mm0 = ppoonnmmllkkjjiihhggffeeddccbbaa SSE2
719 mw1 = _mm_shuffle_epi32(mw0, 3 + (3 << 2) + (3 << 4) + (3 << 6));
720 mm0 = _mm_cvtepu8_epi32(mm0); // mm0 = 000000dd000000cc000000bb000000aa SSE4.1
721 mm0 = _mm_mullo_epi32(mm0,mw1); // mm0 = 0000wdwd0000wcwc0000wbwb0000wawa SSE4.1
722 mm1 = _mm_add_epi32(mm1, mm0); // mm1 = 0000xxxx0000xxxx0000xxxx0000xxxx SSE2
723 }
724 else
725 {
726 int duff = j & 3;
727 w -= (4 - j) & 3;
728 mw0 = _mm_loadu_si128((const __m128i *)w);
729 w += 4;
730 mm1 = round;
731 /* This is a use of Duff's Device. I'm very sorry, but on the other hand, Yay! */
732 switch (duff)
733 {
734 do
735 {
736 off += stride;
737 if (off >= mod)
738 off -= mod;
739 mw0 = _mm_load_si128((const __m128i *)w);
740 w += 4;
741 case 0:
742 mm0 = _mm_loadu_si128((const __m128i *)&tmp[off]);
743 // mm0 = ppoonnmmllkkjjiihhggffeeddccbbaa SSE2
744 mw1 = _mm_shuffle_epi32(mw0, 0 + (0 << 2) + (0 << 4) + (0 << 6));
745 off += stride;
746 if (off >= mod)
747 off -= mod;
748 mm2 = _mm_cvtepu8_epi32(mm0); // mm2 = 000000dd000000cc000000bb000000aa SSE4.1
749 mm2 = _mm_mullo_epi32(mm2, mw1);// mm2 = 0000wdwd0000wcwc0000wbwb0000wawa SSE4.1
750 mm1 = _mm_add_epi32(mm1, mm2); // mm1 = 0000xxxx0000xxxx0000xxxx0000xxxx SSE2
751 case 3:
752 mm0 = _mm_loadu_si128((const __m128i *)&tmp[off]);
753 // mm0 = ppoonnmmllkkjjiihhggffeeddccbbaa SSE2
754 mw1 = _mm_shuffle_epi32(mw0, 1 + (1 << 2) + (1 << 4) + (1 << 6));
755 off += stride;
756 if (off >= mod)
757 off -= mod;
758 mm2 = _mm_cvtepu8_epi32(mm0); // mm2 = 000000dd000000cc000000bb000000aa SSE4.1
759 mm2 = _mm_mullo_epi32(mm2, mw1);// mm2 = 0000wdwd0000wcwc0000wbwb0000wawa SSE4.1
760 mm1 = _mm_add_epi32(mm1, mm2); // mm1 = 0000xxxx0000xxxx0000xxxx0000xxxx SSE2
761 case 2:
762 mm0 = _mm_loadu_si128((const __m128i *)&tmp[off]);
763 // mm0 = ppoonnmmllkkjjiihhggffeeddccbbaa SSE2
764 mw1 = _mm_shuffle_epi32(mw0, 2 + (2 << 2) + (2 << 4) + (2 << 6));
765 off += stride;
766 if (off >= mod)
767 off -= mod;
768 mm2 = _mm_cvtepu8_epi32(mm0); // mm2 = 000000dd000000cc000000bb000000aa SSE4.1
769 mm2 = _mm_mullo_epi32(mm2, mw1);// mm2 = 0000wdwd0000wcwc0000wbwb0000wawa SSE4.1
770 mm1 = _mm_add_epi32(mm1, mm2); // mm1 = 0000xxxx0000xxxx0000xxxx0000xxxx SSE2
771 case 1:
772 mm0 = _mm_loadu_si128((const __m128i *)&tmp[off]);
773 // mm0 = ppoonnmmllkkjjiihhggffeeddccbbaa SSE2
774 mw1 = _mm_shuffle_epi32(mw0, 3 + (3 << 2) + (3 << 4) + (3 << 6));
775 mm2 = _mm_cvtepu8_epi32(mm0); // mm2 = 000000dd000000cc000000bb000000aa SSE4.1
776 mm2 = _mm_mullo_epi32(mm2, mw1);// mm2 = 0000wdwd0000wcwc0000wbwb0000wawa SSE4.1
777 mm1 = _mm_add_epi32(mm1, mm2); // mm1 = 0000xxxx0000xxxx0000xxxx0000xxxx SSE2
778 j -= 4;
779 } while (j > 0);
780 }
781 }
782 #if 0
783 mm1 = _mm_srai_epi32(mm1, WEIGHT_SHIFT - 8); // Shift down
784 mm1 = _mm_packus_epi32(mm1,mm1); // Clamp to 0 to 65535 range.
785 *dst++ = _mm_extract_epi8(mm1,1);
786 *dst++ = _mm_extract_epi8(mm1,3);
787 *dst++ = _mm_extract_epi8(mm1,5);
788 #else
789 mm1 = _mm_srai_epi32(mm1, WEIGHT_SHIFT); // Shift down
790 mm1 = _mm_packus_epi32(mm1, mm1); // Clamp to 0 to 65535 range.
791 mm1 = _mm_packus_epi16(mm1, mm1); // Clamp to 0 to 255 range.
792 j = _mm_extract_epi32(mm1, 0);
793 *(int16_t *)dst = j;
794 ((uint8_t *)dst)[2] = j >> 16;
795 dst += 3;
796 #endif
797 }
798 index++;
799 }
800 }
801
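Each core finishes the same way: an arithmetic shift right by WEIGHT_SHIFT, then _mm_packus_epi32 and _mm_packus_epi16 to saturate each lane down to a byte before storing (zoom_x1 instead folds its products together with _mm_hadd_epi32, shifts by WEIGHT_SHIFT - 8 and extracts byte 1, which completes the remaining shift as part of the byte pick). For sums that fit in 16 bits after the shift, the pair of packs behaves like the scalar CLAMP(v, 0, 255); a per-lane model:

/* Per-lane model of the srai + packus_epi32 + packus_epi16 epilogue,
 * assuming the shifted sum fits in 16 bits (true for sensibly normalised
 * weights): negative sums clamp to 0, overshoots clamp to 255. */
static uint8_t
sketch_shift_and_saturate(int32_t acc, int weight_shift)
{
    int32_t v = acc >> weight_shift;    /* _mm_srai_epi32 keeps the sign */

    if (v < 0)
        v = 0;                          /* _mm_packus_epi32 saturates below at 0 */
    else if (v > 255)
        v = 255;                        /* _mm_packus_epi16 saturates above at 255 */
    return (uint8_t)v;
}
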
802 static void
803 zoom_y4_sse(uint8_t * dst,
804 const uint8_t * FZ_RESTRICT tmp,
805 const index_t * FZ_RESTRICT index,
806 const int32_t * FZ_RESTRICT weights,
807 uint32_t width,
808 uint32_t channels,
809 uint32_t mod,
810 int32_t y)
811 {
812 uint32_t stride = width * 4;
813 uint32_t offset = 0;
814 __m128i round = _mm_set1_epi32(WEIGHT_ROUND);
815
816 while (width--)
817 {
818 uint32_t off = (index->first_pixel + y) * stride + offset;
819
820 offset += 4;
821 if (off >= mod)
822 off -= mod;
823
824 {
825 const int32_t *w = &weights[index->index];
826 int32_t j = (int32_t)index->n;
827 __m128i mm0, mm1, mm2, mw0, mw1;
828
829 if (j == 4)
830 {
831 mw0 = _mm_load_si128((const __m128i *)w);
832 mm0 = _mm_loadu_si128((const __m128i *)&tmp[off]);
833 // mm0 = ppoonnmmllkkjjiihhggffeeddccbbaa SSE2
834 mw1 = _mm_shuffle_epi32(mw0, 0 + (0 << 2) + (0 << 4) + (0 << 6));
835 mm0 = _mm_cvtepu8_epi32(mm0); // mm0 = 000000dd000000cc000000bb000000aa SSE4.1
836 mm0 = _mm_mullo_epi32(mm0,mw1); // mm0 = 0000wdwd0000wcwc0000wbwb0000wawa SSE4.1
837 mm1 = _mm_add_epi32(round, mm0);// mm1 = 0000xxxx0000xxxx0000xxxx0000xxxx SSE2
838 off += stride;
839 if (off >= mod)
840 off -= mod;
841 mm0 = _mm_loadu_si128((const __m128i *)&tmp[off]);
842 // mm0 = ppoonnmmllkkjjiihhggffeeddccbbaa SSE2
843 mw1 = _mm_shuffle_epi32(mw0, 1 + (1 << 2) + (1 << 4) + (1 << 6));
844 mm0 = _mm_cvtepu8_epi32(mm0); // mm0 = 000000dd000000cc000000bb000000aa SSE4.1
845 mm0 = _mm_mullo_epi32(mm0,mw1); // mm0 = 0000wdwd0000wcwc0000wbwb0000wawa SSE4.1
846 mm1 = _mm_add_epi32(mm1, mm0); // mm1 = 0000xxxx0000xxxx0000xxxx0000xxxx SSE2
847 off += stride;
848 if (off >= mod)
849 off -= mod;
850 mm0 = _mm_loadu_si128((const __m128i *)&tmp[off]);
851 // mm0 = ppoonnmmllkkjjiihhggffeeddccbbaa SSE2
852 mw1 = _mm_shuffle_epi32(mw0, 2 + (2 << 2) + (2 << 4) + (2 << 6));
853 mm0 = _mm_cvtepu8_epi32(mm0); // mm0 = 000000dd000000cc000000bb000000aa SSE4.1
854 mm0 = _mm_mullo_epi32(mm0,mw1); // mm0 = 0000wdwd0000wcwc0000wbwb0000wawa SSE4.1
855 mm1 = _mm_add_epi32(mm1, mm0); // mm1 = 0000xxxx0000xxxx0000xxxx0000xxxx SSE2
856 off += stride;
857 if (off >= mod)
858 off -= mod;
859 mm0 = _mm_loadu_si128((const __m128i *)&tmp[off]);
860 // mm0 = ppoonnmmllkkjjiihhggffeeddccbbaa SSE2
861 mw1 = _mm_shuffle_epi32(mw0, 3 + (3 << 2) + (3 << 4) + (3 << 6));
862 mm0 = _mm_cvtepu8_epi32(mm0); // mm0 = 000000dd000000cc000000bb000000aa SSE4.1
863 mm0 = _mm_mullo_epi32(mm0,mw1); // mm0 = 0000wdwd0000wcwc0000wbwb0000wawa SSE4.1
864 mm1 = _mm_add_epi32(mm1, mm0); // mm1 = 0000xxxx0000xxxx0000xxxx0000xxxx SSE2
865 }
866 else
867 {
868 int duff = j & 3;
869 w -= (4 - j) & 3;
870 mw0 = _mm_loadu_si128((const __m128i *)w);
871 w += 4;
872 mm1 = round;
873 /* This is a use of Duff's Device. I'm very sorry, but on the other hand, Yay! */
874 switch (duff)
875 {
876 do
877 {
878 off += stride;
879 if (off >= mod)
880 off -= mod;
881 mw0 = _mm_load_si128((const __m128i *)w);
882 w += 4;
883 case 0:
884 mm0 = _mm_loadu_si128((const __m128i *)&tmp[off]);
885 // mm0 = ppoonnmmllkkjjiihhggffeeddccbbaa SSE2
886 mw1 = _mm_shuffle_epi32(mw0, 0 + (0 << 2) + (0 << 4) + (0 << 6));
887 off += stride;
888 if (off >= mod)
889 off -= mod;
890 mm2 = _mm_cvtepu8_epi32(mm0); // mm2 = 000000dd000000cc000000bb000000aa SSE4.1
891 mm2 = _mm_mullo_epi32(mm2, mw1);// mm2 = 0000wdwd0000wcwc0000wbwb0000wawa SSE4.1
892 mm1 = _mm_add_epi32(mm1, mm2); // mm1 = 0000xxxx0000xxxx0000xxxx0000xxxx SSE2
893 case 3:
894 mm0 = _mm_loadu_si128((const __m128i *)&tmp[off]);
895 // mm0 = ppoonnmmllkkjjiihhggffeeddccbbaa SSE2
896 mw1 = _mm_shuffle_epi32(mw0, 1 + (1 << 2) + (1 << 4) + (1 << 6));
897 off += stride;
898 if (off >= mod)
899 off -= mod;
900 mm2 = _mm_cvtepu8_epi32(mm0); // mm2 = 000000dd000000cc000000bb000000aa SSE4.1
901 mm2 = _mm_mullo_epi32(mm2, mw1);// mm2 = 0000wdwd0000wcwc0000wbwb0000wawa SSE4.1
902 mm1 = _mm_add_epi32(mm1, mm2); // mm1 = 0000xxxx0000xxxx0000xxxx0000xxxx SSE2
903 case 2:
904 mm0 = _mm_loadu_si128((const __m128i *)&tmp[off]);
905 // mm0 = ppoonnmmllkkjjiihhggffeeddccbbaa SSE2
906 mw1 = _mm_shuffle_epi32(mw0, 2 + (2 << 2) + (2 << 4) + (2 << 6));
907 off += stride;
908 if (off >= mod)
909 off -= mod;
910 mm2 = _mm_cvtepu8_epi32(mm0); // mm2 = 000000dd000000cc000000bb000000aa SSE4.1
911 mm2 = _mm_mullo_epi32(mm2, mw1);// mm2 = 0000wdwd0000wcwc0000wbwb0000wawa SSE4.1
912 mm1 = _mm_add_epi32(mm1, mm2); // mm1 = 0000xxxx0000xxxx0000xxxx0000xxxx SSE2
913 case 1:
914 mm0 = _mm_loadu_si128((const __m128i *)&tmp[off]);
915 // mm0 = ppoonnmmllkkjjiihhggffeeddccbbaa SSE2
916 mw1 = _mm_shuffle_epi32(mw0, 3 + (3 << 2) + (3 << 4) + (3 << 6));
917 mm2 = _mm_cvtepu8_epi32(mm0); // mm2 = 000000dd000000cc000000bb000000aa SSE4.1
918 mm2 = _mm_mullo_epi32(mm2, mw1);// mm2 = 0000wdwd0000wcwc0000wbwb0000wawa SSE4.1
919 mm1 = _mm_add_epi32(mm1, mm2); // mm1 = 0000xxxx0000xxxx0000xxxx0000xxxx SSE2
920 j -= 4;
921 } while (j > 0);
922 }
923 }
924 #if 0
925 mm1 = _mm_srai_epi32(mm1, WEIGHT_SHIFT - 8); // Shift down
926 mm1 = _mm_packus_epi32(mm1,mm1); // Clamp to 0 to 65535 range.
927 *dst++ = _mm_extract_epi8(mm1,1);
928 *dst++ = _mm_extract_epi8(mm1,3);
929 *dst++ = _mm_extract_epi8(mm1,5);
930 *dst++ = _mm_extract_epi8(mm1,7);
931 #else
932 mm1 = _mm_srai_epi32(mm1, WEIGHT_SHIFT); // Shift down
933 mm1 = _mm_packus_epi32(mm1,mm1); // Clamp to 0 to 65535 range.
934 mm1 = _mm_packus_epi16(mm1,mm1); // Clamp to 0 to 255 range.
935 *(int32_t *)dst = _mm_extract_epi32(mm1, 0);
936 dst += 4;
937 #endif
938 }
939 index++;
940 }
941 }
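
deskew.c is expected to choose between these cores per pass based on the number of channels; that selection logic lives there, not in this header. Purely as a sketch (the function name and the fallback behaviour here are illustrative, not MuPDF's actual code), a dispatch over the horizontal cores could look like:

/* Hypothetical sketch only: pick a horizontal SSE core by channel count.
 * The real selection is made in deskew.c and may differ. */
typedef void (zoom_x_fn)(uint8_t * FZ_RESTRICT tmp,
    const uint8_t * FZ_RESTRICT src,
    const index_t * FZ_RESTRICT index,
    const int32_t * FZ_RESTRICT weights,
    uint32_t dst_w,
    uint32_t src_w,
    uint32_t channels,
    const uint8_t * FZ_RESTRICT bg);

static zoom_x_fn *
sketch_pick_zoom_x_sse(uint32_t channels)
{
    switch (channels)
    {
    case 1: return zoom_x1_sse;
    case 3: return zoom_x3_sse;
    case 4: return zoom_x4_sse;
    default: return 0; /* fall back to the generic C cores in deskew.c */
    }
}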