PyMuPDF: comparison of mupdf-source/source/fitz/deskew_sse.h @ 2:b50eed0cc0ef (upstream)
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
The directory name has changed: the expanded directory no longer includes a version number.
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Mon, 15 Sep 2025 11:43:07 +0200 |
| parents | |
| children | |
| 1:1d09e1dec1d9 | 2:b50eed0cc0ef |
|---|---|
| 1 // Copyright (C) 2004-2024 Artifex Software, Inc. | |
| 2 // | |
| 3 // This file is part of MuPDF. | |
| 4 // | |
| 5 // MuPDF is free software: you can redistribute it and/or modify it under the | |
| 6 // terms of the GNU Affero General Public License as published by the Free | |
| 7 // Software Foundation, either version 3 of the License, or (at your option) | |
| 8 // any later version. | |
| 9 // | |
| 10 // MuPDF is distributed in the hope that it will be useful, but WITHOUT ANY | |
| 11 // WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS | |
| 12 // FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more | |
| 13 // details. | |
| 14 // | |
| 15 // You should have received a copy of the GNU Affero General Public License | |
| 16 // along with MuPDF. If not, see <https://www.gnu.org/licenses/agpl-3.0.en.html> | |
| 17 // | |
| 18 // Alternative licensing terms are available from the licensor. | |
| 19 // For commercial licensing, see <https://www.artifex.com/> or contact | |
| 20 // Artifex Software, Inc., 39 Mesa Street, Suite 108A, San Francisco, | |
| 21 // CA 94129, USA, for further information. | |
| 22 | |
| 23 /* This file is included from deskew.c if SSE cores are allowed. */ | |
| 24 | |
| 25 #include <emmintrin.h> | |
| 26 #include <smmintrin.h> | |
| 27 | |
| 28 static void | |
| 29 zoom_x1_sse(uint8_t * FZ_RESTRICT tmp, | |
| 30 const uint8_t * FZ_RESTRICT src, | |
| 31 const index_t * FZ_RESTRICT index, | |
| 32 const int32_t * FZ_RESTRICT weights, | |
| 33 uint32_t dst_w, | |
| 34 uint32_t src_w, | |
| 35 uint32_t channels, | |
| 36 const uint8_t * FZ_RESTRICT bg) | |
| 37 { | |
| 38 __m128i round = _mm_set1_epi32(WEIGHT_ROUND); | |
| 39 | |
| 40 if (0) | |
| 41 slow: | |
| 42 { | |
| 43 /* Do any where we might index off the edge of the source */ | |
| 44 int pix_num = index->first_pixel; | |
| 45 const uint8_t *s = &src[pix_num]; | |
| 46 const int32_t *w = &weights[index->index]; | |
| 47 uint32_t j = index->n; | |
| 48 int32_t pixel0 = WEIGHT_ROUND; | |
| 49 if (pix_num < 0) | |
| 50 { | |
| 51 int32_t wt = *w++; | |
| 52 assert(pix_num == -1); | |
| 53 pixel0 += bg[0] * wt; | |
| 54 s++; | |
| 55 j--; | |
| 56 pix_num = 0; | |
| 57 } | |
| 58 pix_num = (int)src_w - pix_num; | |
| 59 if (pix_num > (int)j) | |
| 60 pix_num = j; | |
| 61 j -= pix_num; | |
| 62 while (pix_num > 0) | |
| 63 { | |
| 64 pixel0 += *s++ * *w++; | |
| 65 pix_num--; | |
| 66 } | |
| 67 if (j > 0) | |
| 68 { | |
| 69 assert(j == 1); | |
| 70 pixel0 += bg[0] * *w; | |
| 71 } | |
| 72 pixel0 >>= WEIGHT_SHIFT; | |
| 73 *tmp++ = CLAMP(pixel0, 0, 255); | |
| 74 index++; | |
| 75 dst_w--; | |
| 76 } | |
| 77 | |
| 78 while (dst_w > 0) | |
| 79 { | |
| 80 const uint8_t *s; | |
| 81 uint32_t j; | |
| 82 const int32_t *w; | |
| 83 | |
| 84 /* Jump out of band to do the (rare) slow (edge) pixels */ | |
| 85 if (index->slow) | |
| 86 goto slow; | |
| 87 | |
| 88 s = &src[index->first_pixel]; | |
| 89 j = index->n; | |
| 90 w = &weights[index->index]; | |
| 91 if (j <= 4) | |
| 92 { | |
| 93 __m128i mw0, mm0; | |
| 94 mw0 = _mm_load_si128((const __m128i *)w); | |
| 95 mm0 = _mm_loadu_si128((const __m128i *)s); | |
| 96 // mm0 = ppoonnmmllkkjjiihhggffeeddccbbaa SSE2 | |
| 97 mm0 = _mm_cvtepu8_epi32(mm0); // mm0 = 000000dd000000cc000000bb000000aa SSE4.1 | |
| 98 mm0 = _mm_mullo_epi32(mm0,mw0); // mm0 = 0000wdwd0000wcwc0000wbwb0000wawa SSE4.1 | |
| 99 mm0 = _mm_hadd_epi32(mm0,mm0); | |
| 100 mm0 = _mm_hadd_epi32(mm0,mm0); | |
| 101 mm0 = _mm_add_epi32(mm0, round);// Add round | |
| 102 mm0 = _mm_srai_epi32(mm0, WEIGHT_SHIFT-8); // Shift down | |
| 103 mm0 = _mm_packus_epi32(mm0,mm0);// Clamp to 0 to 65535 range. | |
| 104 *tmp++ = _mm_extract_epi8(mm0,1); | |
| 105 } | |
| 106 else if (j <= 8) | |
| 107 { | |
| 108 __m128i mw0, mw1, mm0, mm1; | |
| 109 mw0 = _mm_load_si128((const __m128i *)w); | |
| 110 mm0 = _mm_loadu_si128((const __m128i *)s); | |
| 111 // mm0 = ppoonnmmllkkjjiihhggffeeddccbbaa SSE2 | |
| 112 mw1 = _mm_load_si128((const __m128i *)(w+4)); | |
| 113 mm1 = _mm_cvtepu8_epi32(mm0); // mm1 = 000000dd000000cc000000bb000000aa SSE4.1 | |
| 114 mm0 = _mm_srli_si128(mm0, 4); // mm0 = 000000ppoonnmmllkkjjiihhggffeedd SSE2 | |
| 115 mm1 = _mm_mullo_epi32(mm1,mw0); // mm1 = 0000wdwd0000wcwc0000wbwb0000wawa SSE4.1 | |
| 116 mm0 = _mm_cvtepu8_epi32(mm0); // mm0 = 000000hh000000gg000000ff000000ee SSE4.1 | |
| 117 mm0 = _mm_mullo_epi32(mm0,mw1); // mm0 = 0000whwh0000wgwg0000wfwf0000wewe SSE4.1 | |
| 118 mm1 = _mm_add_epi32(mm1, mm0); | |
| 119 mm1 = _mm_hadd_epi32(mm1,mm1); | |
| 120 mm1 = _mm_hadd_epi32(mm1,mm1); | |
| 121 mm1 = _mm_add_epi32(mm1, round); // Add round | |
| 122 mm1 = _mm_srai_epi32(mm1, WEIGHT_SHIFT-8); // Shift down | |
| 123 mm1 = _mm_packus_epi32(mm1,mm1); // Clamp to 0 to 65535 range. | |
| 124 *tmp++ = _mm_extract_epi8(mm1,1); | |
| 125 } | |
| 126 else | |
| 127 { | |
| 128 int32_t pixel0 = WEIGHT_ROUND; | |
| 129 for (j = index->n; j > 0; j--) | |
| 130 { | |
| 131 pixel0 += *s++ * *w++; | |
| 132 } | |
| 133 pixel0 >>= WEIGHT_SHIFT; | |
| 134 *tmp++ = CLAMP(pixel0, 0, 255); | |
| 135 } | |
| 136 index++; | |
| 137 dst_w--; | |
| 138 } | |
| 139 } | |
| 140 | |
| 141 static void | |
| 142 zoom_x3_sse(uint8_t * FZ_RESTRICT tmp, | |
| 143 const uint8_t * FZ_RESTRICT src, | |
| 144 const index_t * FZ_RESTRICT index, | |
| 145 const int32_t * FZ_RESTRICT weights, | |
| 146 uint32_t dst_w, | |
| 147 uint32_t src_w, | |
| 148 uint32_t channels, | |
| 149 const uint8_t * FZ_RESTRICT bg) | |
| 150 { | |
| 151 __m128i round = _mm_set1_epi32(WEIGHT_ROUND); | |
| 152 | |
| 153 if (0) | |
| 154 slow: | |
| 155 { | |
| 156 /* Do any where we might index off the edge of the source */ | |
| 157 int pix_num = index->first_pixel; | |
| 158 const uint8_t *s = &src[pix_num * 3]; | |
| 159 const int32_t *w = &weights[index->index]; | |
| 160 uint32_t j = index->n; | |
| 161 int32_t pixel0 = WEIGHT_ROUND; | |
| 162 int32_t pixel1 = WEIGHT_ROUND; | |
| 163 int32_t pixel2 = WEIGHT_ROUND; | |
| 164 if (pix_num < 0) | |
| 165 { | |
| 166 int32_t wt = *w++; | |
| 167 assert(pix_num == -1); | |
| 168 pixel0 += bg[0] * wt; | |
| 169 pixel1 += bg[1] * wt; | |
| 170 pixel2 += bg[2] * wt; | |
| 171 s += 3; | |
| 172 j--; | |
| 173 pix_num = 0; | |
| 174 } | |
| 175 pix_num = (int)src_w - pix_num; | |
| 176 if (pix_num > (int)j) | |
| 177 pix_num = j; | |
| 178 j -= pix_num; | |
| 179 while (pix_num > 0) | |
| 180 { | |
| 181 int32_t wt = *w++; | |
| 182 pixel0 += *s++ * wt; | |
| 183 pixel1 += *s++ * wt; | |
| 184 pixel2 += *s++ * wt; | |
| 185 pix_num--; | |
| 186 } | |
| 187 if (j > 0) | |
| 188 { | |
| 189 int32_t wt = *w++; | |
| 190 assert(j == 1); | |
| 191 pixel0 += bg[0] * wt; | |
| 192 pixel1 += bg[1] * wt; | |
| 193 pixel2 += bg[2] * wt; | |
| 194 } | |
| 195 pixel0 >>= WEIGHT_SHIFT; | |
| 196 pixel1 >>= WEIGHT_SHIFT; | |
| 197 pixel2 >>= WEIGHT_SHIFT; | |
| 198 *tmp++ = CLAMP(pixel0, 0, 255); | |
| 199 *tmp++ = CLAMP(pixel1, 0, 255); | |
| 200 *tmp++ = CLAMP(pixel2, 0, 255); | |
| 201 index++; | |
| 202 dst_w--; | |
| 203 } | |
| 204 | |
| 205 while (dst_w > 0) | |
| 206 { | |
| 207 const uint8_t *s; | |
| 208 int j; | |
| 209 const int32_t *w; | |
| 210 __m128i mm0, mm1, mm4, mw0, mw1; | |
| 211 | |
| 212 /* Jump out of band to do the (rare) slow (edge) pixels */ | |
| 213 if (index->slow) | |
| 214 goto slow; | |
| 215 | |
| 216 s = &src[index->first_pixel * 3]; | |
| 217 j = (int)index->n; | |
| 218 w = &weights[index->index]; | |
| 219 | |
| 220 mm4 = round; | |
| 221 mm0 = _mm_loadu_si128((const __m128i *)s); // mm0 = ppoonnmmllkkjjiihhggffeeddccbbaa SSE2 | |
| 222 if (j == 4) | |
| 223 { | |
| 224 mw0 = _mm_load_si128((const __m128i *)w); | |
| 225 mw1 = _mm_shuffle_epi32(mw0, 0 + (0 << 2) + (0 << 4) + (0 << 6)); | |
| 226 mm1 = _mm_cvtepu8_epi32(mm0); // mm1 = 000000dd000000cc000000bb000000aa SSE4.1 | |
| 227 mm1 = _mm_mullo_epi32(mm1, mw1);// mm1 = 0000wdwd0000wcwc0000wbwb0000wawa SSE4.1 | |
| 228 mm4 = _mm_add_epi32(mm4, mm1); // mm4 = 0000xxxx0000xxxx0000xxxx0000xxxx SSE2 | |
| 229 mm0 = _mm_srli_si128(mm0, 3); // mm0 = 000000ppoonnmmllkkjjiihhggffeedd SSE2 | |
| 230 mw1 = _mm_shuffle_epi32(mw0, 1 + (1 << 2) + (1 << 4) + (1 << 6)); | |
| 231 mm1 = _mm_cvtepu8_epi32(mm0); // mm1 = 000000dd000000cc000000bb000000aa SSE4.1 | |
| 232 mm1 = _mm_mullo_epi32(mm1, mw1);// mm1 = 0000wdwd0000wcwc0000wbwb0000wawa SSE4.1 | |
| 233 mm4 = _mm_add_epi32(mm4, mm1); // mm4 = 0000xxxx0000xxxx0000xxxx0000xxxx SSE2 | |
| 234 mm0 = _mm_srli_si128(mm0, 3); // mm0 = 000000ppoonnmmllkkjjiihhggffeedd SSE2 | |
| 235 mw1 = _mm_shuffle_epi32(mw0, 2 + (2 << 2) + (2 << 4) + (2 << 6)); | |
| 236 mm1 = _mm_cvtepu8_epi32(mm0); // mm1 = 000000dd000000cc000000bb000000aa SSE4.1 | |
| 237 mm1 = _mm_mullo_epi32(mm1, mw1);// mm1 = 0000wdwd0000wcwc0000wbwb0000wawa SSE4.1 | |
| 238 mm4 = _mm_add_epi32(mm4, mm1); // mm4 = 0000xxxx0000xxxx0000xxxx0000xxxx SSE2 | |
| 239 mm0 = _mm_srli_si128(mm0, 3); // mm0 = 000000ppoonnmmllkkjjiihhggffeedd SSE2 | |
| 240 mw1 = _mm_shuffle_epi32(mw0, 3 + (3 << 2) + (3 << 4) + (3 << 6)); | |
| 241 mm1 = _mm_cvtepu8_epi32(mm0); // mm1 = 000000dd000000cc000000bb000000aa SSE4.1 | |
| 242 mm1 = _mm_mullo_epi32(mm1, mw1);// mm1 = 0000wdwd0000wcwc0000wbwb0000wawa SSE4.1 | |
| 243 mm4 = _mm_add_epi32(mm4, mm1); // mm4 = 0000xxxx0000xxxx0000xxxx0000xxxx SSE2 | |
| 244 } | |
| 245 else | |
| 246 { | |
| 247 int off = j & 3; | |
| 248 w -= (4 - j) & 3; | |
| 249 s += (off ? off : 4) * 3; | |
| 250 mw0 = _mm_loadu_si128((const __m128i *)w); | |
| 251 w += 4; | |
| 252 /* This is a use of Duff's Device. I'm very sorry, but on the other hand, Yay! */ | |
| 253 switch (off) | |
| 254 { | |
| 255 do | |
| 256 { | |
| 257 mm0 = _mm_loadu_si128((const __m128i *)s); | |
| 258 // mm0 = ppoonnmmllkkjjiihhggffeeddccbbaa SSE2 | |
| 259 s += 4 * 3; | |
| 260 mw0 = _mm_load_si128((const __m128i *)w); | |
| 261 w += 4; | |
| 262 case 0: | |
| 263 mw1 = _mm_shuffle_epi32(mw0, 0 + (0 << 2) + (0 << 4) + (0 << 6)); | |
| 264 mm1 = _mm_cvtepu8_epi32(mm0); // mm1 = 000000dd000000cc000000bb000000aa SSE4.1 | |
| 265 mm1 = _mm_mullo_epi32(mm1, mw1);// mm1 = 0000wdwd0000wcwc0000wbwb0000wawa SSE4.1 | |
| 266 mm4 = _mm_add_epi32(mm4, mm1); // mm4 = 0000xxxx0000xxxx0000xxxx0000xxxx SSE2 | |
| 267 mm0 = _mm_srli_si128(mm0, 3); // mm0 = 000000ppoonnmmllkkjjiihhggffeedd SSE2 | |
| 268 case 3: | |
| 269 mw1 = _mm_shuffle_epi32(mw0, 1 + (1 << 2) + (1 << 4) + (1 << 6)); | |
| 270 mm1 = _mm_cvtepu8_epi32(mm0); // mm1 = 000000dd000000cc000000bb000000aa SSE4.1 | |
| 271 mm1 = _mm_mullo_epi32(mm1, mw1);// mm1 = 0000wdwd0000wcwc0000wbwb0000wawa SSE4.1 | |
| 272 mm4 = _mm_add_epi32(mm4, mm1); // mm4 = 0000xxxx0000xxxx0000xxxx0000xxxx SSE2 | |
| 273 mm0 = _mm_srli_si128(mm0, 3); // mm0 = 000000ppoonnmmllkkjjiihhggffeedd SSE2 | |
| 274 case 2: | |
| 275 mw1 = _mm_shuffle_epi32(mw0, 2 + (2 << 2) + (2 << 4) + (2 << 6)); | |
| 276 mm1 = _mm_cvtepu8_epi32(mm0); // mm1 = 000000dd000000cc000000bb000000aa SSE4.1 | |
| 277 mm1 = _mm_mullo_epi32(mm1, mw1);// mm1 = 0000wdwd0000wcwc0000wbwb0000wawa SSE4.1 | |
| 278 mm4 = _mm_add_epi32(mm4, mm1); // mm4 = 0000xxxx0000xxxx0000xxxx0000xxxx SSE2 | |
| 279 mm0 = _mm_srli_si128(mm0, 3); // mm0 = 000000ppoonnmmllkkjjiihhggffeedd SSE2 | |
| 280 case 1: | |
| 281 mw1 = _mm_shuffle_epi32(mw0, 3 + (3 << 2) + (3 << 4) + (3 << 6)); | |
| 282 mm1 = _mm_cvtepu8_epi32(mm0); // mm1 = 000000dd000000cc000000bb000000aa SSE4.1 | |
| 283 mm1 = _mm_mullo_epi32(mm1, mw1);// mm1 = 0000wdwd0000wcwc0000wbwb0000wawa SSE4.1 | |
| 284 mm4 = _mm_add_epi32(mm4, mm1); // mm4 = 0000xxxx0000xxxx0000xxxx0000xxxx SSE2 | |
| 285 j -= 4; | |
| 286 } while (j > 0); | |
| 287 } | |
| 288 } | |
| 289 #if 0 | |
| 290 mm4 = _mm_srai_epi32(mm4, WEIGHT_SHIFT - 8); // Shift down | |
| 291 mm4 = _mm_packus_epi32(mm4, mm4); // Clamp to 0 to 65535 range. | |
| 292 *tmp++ = _mm_extract_epi8(mm4, 1); | |
| 293 *tmp++ = _mm_extract_epi8(mm4, 3); | |
| 294 *tmp++ = _mm_extract_epi8(mm4, 5); | |
| 295 #else | |
| 296 mm4 = _mm_srai_epi32(mm4, WEIGHT_SHIFT); // Shift down | |
| 297 mm4 = _mm_packus_epi32(mm4, mm4); // Clamp to 0 to 65535 range. | |
| 298 mm4 = _mm_packus_epi16(mm4, mm4); // Clamp to 0 to 255 range. | |
| 299 j = _mm_extract_epi32(mm4, 0); | |
| 300 *(int16_t *)tmp = j; | |
| 301 ((int8_t *)tmp)[2] = j>>16; | |
| 302 tmp += 3; | |
| 303 #endif | |
| 304 index++; | |
| 305 dst_w--; | |
| 306 } | |
| 307 | |
| 308 while (dst_w > 0) | |
| 309 { | |
| 310 const uint8_t *s; | |
| 311 | |
| 312 /* Jump out of band to do the (rare) slow (edge) pixels */ | |
| 313 if (index->slow) | |
| 314 goto slow; | |
| 315 | |
| 316 s = &src[index->first_pixel * 3]; | |
| 317 | |
| 318 { | |
| 319 const int32_t *w = &weights[index->index]; | |
| 320 uint32_t j = index->n; | |
| 321 int32_t pixel0 = WEIGHT_ROUND; | |
| 322 int32_t pixel1 = WEIGHT_ROUND; | |
| 323 int32_t pixel2 = WEIGHT_ROUND; | |
| 324 for (j = index->n; j > 0; j--) | |
| 325 { | |
| 326 int32_t wt = *w++; | |
| 327 pixel0 += *s++ * wt; | |
| 328 pixel1 += *s++ * wt; | |
| 329 pixel2 += *s++ * wt; | |
| 330 } | |
| 331 pixel0 >>= WEIGHT_SHIFT; | |
| 332 pixel1 >>= WEIGHT_SHIFT; | |
| 333 pixel2 >>= WEIGHT_SHIFT; | |
| 334 *tmp++ = CLAMP(pixel0, 0, 255); | |
| 335 *tmp++ = CLAMP(pixel1, 0, 255); | |
| 336 *tmp++ = CLAMP(pixel2, 0, 255); | |
| 337 } | |
| 338 index++; | |
| 339 dst_w--; | |
| 340 } | |
| 341 } | |
| 342 | |
| 343 static void | |
| 344 zoom_x4_sse(uint8_t * FZ_RESTRICT tmp, | |
| 345 const uint8_t * FZ_RESTRICT src, | |
| 346 const index_t * FZ_RESTRICT index, | |
| 347 const int32_t * FZ_RESTRICT weights, | |
| 348 uint32_t dst_w, | |
| 349 uint32_t src_w, | |
| 350 uint32_t channels, | |
| 351 const uint8_t * FZ_RESTRICT bg) | |
| 352 { | |
| 353 __m128i round = _mm_set1_epi32(WEIGHT_ROUND); | |
| 354 | |
| 355 if (0) | |
| 356 slow: | |
| 357 { | |
| 358 /* Do any where we might index off the edge of the source */ | |
| 359 int pn = index->first_pixel; | |
| 360 const uint8_t *s = &src[pn * 4]; | |
| 361 const int32_t *w = &weights[index->index]; | |
| 362 uint32_t j = index->n; | |
| 363 int32_t pixel0 = WEIGHT_ROUND; | |
| 364 int32_t pixel1 = WEIGHT_ROUND; | |
| 365 int32_t pixel2 = WEIGHT_ROUND; | |
| 366 int32_t pixel3 = WEIGHT_ROUND; | |
| 367 int pix_num = pn; | |
| 368 if (pix_num < 0) | |
| 369 { | |
| 370 int32_t wt = *w++; | |
| 371 assert(pix_num == -1); | |
| 372 pixel0 += bg[0] * wt; | |
| 373 pixel1 += bg[1] * wt; | |
| 374 pixel2 += bg[2] * wt; | |
| 375 pixel3 += bg[3] * wt; | |
| 376 s += 4; | |
| 377 j--; | |
| 378 pix_num = 0; | |
| 379 } | |
| 380 pix_num = (int)src_w - pix_num; | |
| 381 if (pix_num > (int)j) | |
| 382 pix_num = j; | |
| 383 j -= pix_num; | |
| 384 while (pix_num > 0) | |
| 385 { | |
| 386 int32_t wt = *w++; | |
| 387 pixel0 += *s++ * wt; | |
| 388 pixel1 += *s++ * wt; | |
| 389 pixel2 += *s++ * wt; | |
| 390 pixel3 += *s++ * wt; | |
| 391 pix_num--; | |
| 392 } | |
| 393 if (j > 0) | |
| 394 { | |
| 395 int32_t wt = *w; | |
| 396 assert(j == 1); | |
| 397 pixel0 += bg[0] * wt; | |
| 398 pixel1 += bg[1] * wt; | |
| 399 pixel2 += bg[2] * wt; | |
| 400 pixel3 += bg[3] * wt; | |
| 401 } | |
| 402 pixel0 >>= WEIGHT_SHIFT; | |
| 403 pixel1 >>= WEIGHT_SHIFT; | |
| 404 pixel2 >>= WEIGHT_SHIFT; | |
| 405 pixel3 >>= WEIGHT_SHIFT; | |
| 406 *tmp++ = CLAMP(pixel0, 0, 255); | |
| 407 *tmp++ = CLAMP(pixel1, 0, 255); | |
| 408 *tmp++ = CLAMP(pixel2, 0, 255); | |
| 409 *tmp++ = CLAMP(pixel3, 0, 255); | |
| 410 index++; | |
| 411 dst_w--; | |
| 412 } | |
| 413 | |
| 414 while (dst_w > 0) | |
| 415 { | |
| 416 const uint8_t *s; | |
| 417 int j; | |
| 418 const int32_t *w; | |
| 419 __m128i mm0, mm1, mm4, mw0, mw1; | |
| 420 | |
| 421 /* Jump out of band to do the (rare) slow (edge) pixels */ | |
| 422 if (index->slow) | |
| 423 goto slow; | |
| 424 | |
| 425 s = &src[index->first_pixel * 4]; | |
| 426 j = (int)index->n; | |
| 427 w = &weights[index->index]; | |
| 428 | |
| 429 mm4 = round; | |
| 430 mm0 = _mm_loadu_si128((const __m128i *)s); // mm0 = ppoonnmmllkkjjiihhggffeeddccbbaa SSE2 | |
| 431 if (j == 4) | |
| 432 { | |
| 433 mw0 = _mm_load_si128((const __m128i *)w); | |
| 434 mw1 = _mm_shuffle_epi32(mw0, 0 + (0 << 2) + (0 << 4) + (0 << 6)); | |
| 435 mm1 = _mm_cvtepu8_epi32(mm0); // mm1 = 000000dd000000cc000000bb000000aa SSE4.1 | |
| 436 mm1 = _mm_mullo_epi32(mm1, mw1);// mm1 = 0000wdwd0000wcwc0000wbwb0000wawa SSE4.1 | |
| 437 mm4 = _mm_add_epi32(mm4, mm1); // mm4 = 0000xxxx0000xxxx0000xxxx0000xxxx SSE2 | |
| 438 mm0 = _mm_srli_si128(mm0, 4); // mm0 = 000000ppoonnmmllkkjjiihhggffeedd SSE2 | |
| 439 mw1 = _mm_shuffle_epi32(mw0, 1 + (1 << 2) + (1 << 4) + (1 << 6)); | |
| 440 mm1 = _mm_cvtepu8_epi32(mm0); // mm1 = 000000dd000000cc000000bb000000aa SSE4.1 | |
| 441 mm1 = _mm_mullo_epi32(mm1, mw1);// mm1 = 0000wdwd0000wcwc0000wbwb0000wawa SSE4.1 | |
| 442 mm4 = _mm_add_epi32(mm4, mm1); // mm4 = 0000xxxx0000xxxx0000xxxx0000xxxx SSE2 | |
| 443 mm0 = _mm_srli_si128(mm0, 4); // mm0 = 000000ppoonnmmllkkjjiihhggffeedd SSE2 | |
| 444 mw1 = _mm_shuffle_epi32(mw0, 2 + (2 << 2) + (2 << 4) + (2 << 6)); | |
| 445 mm1 = _mm_cvtepu8_epi32(mm0); // mm1 = 000000dd000000cc000000bb000000aa SSE4.1 | |
| 446 mm1 = _mm_mullo_epi32(mm1, mw1);// mm1 = 0000wdwd0000wcwc0000wbwb0000wawa SSE4.1 | |
| 447 mm4 = _mm_add_epi32(mm4, mm1); // mm4 = 0000xxxx0000xxxx0000xxxx0000xxxx SSE2 | |
| 448 mm0 = _mm_srli_si128(mm0, 4); // mm0 = 000000ppoonnmmllkkjjiihhggffeedd SSE2 | |
| 449 mw1 = _mm_shuffle_epi32(mw0, 3 + (3 << 2) + (3 << 4) + (3 << 6)); | |
| 450 mm1 = _mm_cvtepu8_epi32(mm0); // mm1 = 000000dd000000cc000000bb000000aa SSE4.1 | |
| 451 mm1 = _mm_mullo_epi32(mm1, mw1);// mm1 = 0000wdwd0000wcwc0000wbwb0000wawa SSE4.1 | |
| 452 mm4 = _mm_add_epi32(mm4, mm1); // mm4 = 0000xxxx0000xxxx0000xxxx0000xxxx SSE2 | |
| 453 } | |
| 454 else | |
| 455 { | |
| 456 int off = j & 3; | |
| 457 w -= (4 - j) & 3; | |
| 458 s += (off ? off : 4) * 4; | |
| 459 mw0 = _mm_loadu_si128((const __m128i *)w); | |
| 460 w += 4; | |
| 461 /* This is a use of Duff's Device. I'm very sorry, but on the other hand, Yay! */ | |
| 462 switch (off) | |
| 463 { | |
| 464 do | |
| 465 { | |
| 466 mm0 = _mm_loadu_si128((const __m128i *)s); | |
| 467 // mm0 = ppoonnmmllkkjjiihhggffeeddccbbaa SSE2 | |
| 468 s += 4 * 4; | |
| 469 mw0 = _mm_load_si128((const __m128i *)w); | |
| 470 w += 4; | |
| 471 case 0: | |
| 472 mw1 = _mm_shuffle_epi32(mw0, 0 + (0 << 2) + (0 << 4) + (0 << 6)); | |
| 473 mm1 = _mm_cvtepu8_epi32(mm0); // mm1 = 000000dd000000cc000000bb000000aa SSE4.1 | |
| 474 mm1 = _mm_mullo_epi32(mm1, mw1);// mm1 = 0000wdwd0000wcwc0000wbwb0000wawa SSE4.1 | |
| 475 mm4 = _mm_add_epi32(mm4, mm1); // mm4 = 0000xxxx0000xxxx0000xxxx0000xxxx SSE2 | |
| 476 mm0 = _mm_srli_si128(mm0, 4); // mm0 = 000000ppoonnmmllkkjjiihhggffeedd SSE2 | |
| 477 case 3: | |
| 478 mw1 = _mm_shuffle_epi32(mw0, 1 + (1 << 2) + (1 << 4) + (1 << 6)); | |
| 479 mm1 = _mm_cvtepu8_epi32(mm0); // mm1 = 000000dd000000cc000000bb000000aa SSE4.1 | |
| 480 mm1 = _mm_mullo_epi32(mm1, mw1);// mm1 = 0000wdwd0000wcwc0000wbwb0000wawa SSE4.1 | |
| 481 mm4 = _mm_add_epi32(mm4, mm1); // mm4 = 0000xxxx0000xxxx0000xxxx0000xxxx SSE2 | |
| 482 mm0 = _mm_srli_si128(mm0, 4); // mm0 = 000000ppoonnmmllkkjjiihhggffeedd SSE2 | |
| 483 case 2: | |
| 484 mw1 = _mm_shuffle_epi32(mw0, 2 + (2 << 2) + (2 << 4) + (2 << 6)); | |
| 485 mm1 = _mm_cvtepu8_epi32(mm0); // mm1 = 000000dd000000cc000000bb000000aa SSE4.1 | |
| 486 mm1 = _mm_mullo_epi32(mm1, mw1);// mm1 = 0000wdwd0000wcwc0000wbwb0000wawa SSE4.1 | |
| 487 mm4 = _mm_add_epi32(mm4, mm1); // mm4 = 0000xxxx0000xxxx0000xxxx0000xxxx SSE2 | |
| 488 mm0 = _mm_srli_si128(mm0, 4); // mm0 = 000000ppoonnmmllkkjjiihhggffeedd SSE2 | |
| 489 case 1: | |
| 490 mw1 = _mm_shuffle_epi32(mw0, 3 + (3 << 2) + (3 << 4) + (3 << 6)); | |
| 491 mm1 = _mm_cvtepu8_epi32(mm0); // mm1 = 000000dd000000cc000000bb000000aa SSE4.1 | |
| 492 mm1 = _mm_mullo_epi32(mm1, mw1);// mm1 = 0000wdwd0000wcwc0000wbwb0000wawa SSE4.1 | |
| 493 mm4 = _mm_add_epi32(mm4, mm1); // mm4 = 0000xxxx0000xxxx0000xxxx0000xxxx SSE2 | |
| 494 j -= 4; | |
| 495 } while (j > 0); | |
| 496 } | |
| 497 } | |
| 498 #if 0 | |
| 499 mm4 = _mm_srai_epi32(mm4, WEIGHT_SHIFT - 8); // Shift down | |
| 500 mm4 = _mm_packus_epi32(mm4,mm4); // Clamp to 0 to 65535 range. | |
| 501 *tmp++ = _mm_extract_epi8(mm4,1); | |
| 502 *tmp++ = _mm_extract_epi8(mm4,3); | |
| 503 *tmp++ = _mm_extract_epi8(mm4,5); | |
| 504 *tmp++ = _mm_extract_epi8(mm4,7); | |
| 505 #else | |
| 506 mm4 = _mm_srai_epi32(mm4, WEIGHT_SHIFT); // Shift down | |
| 507 mm4 = _mm_packus_epi32(mm4,mm4); // Clamp to 0 to 65535 range. | |
| 508 mm4 = _mm_packus_epi16(mm4,mm4); // Clamp to 0 to 255 range. | |
| 509 *(int32_t *)tmp = _mm_extract_epi32(mm4,0); | |
| 510 tmp += 4; | |
| 511 #endif | |
| 512 index++; | |
| 513 dst_w--; | |
| 514 } | |
| 515 } | |
| 516 | |
| 517 static void | |
| 518 zoom_y1_sse(uint8_t * dst, | |
| 519 const uint8_t * FZ_RESTRICT tmp, | |
| 520 const index_t * FZ_RESTRICT index, | |
| 521 const int32_t * FZ_RESTRICT weights, | |
| 522 uint32_t width, | |
| 523 uint32_t channels, | |
| 524 uint32_t mod, | |
| 525 int32_t y) | |
| 526 { | |
| 527 uint32_t stride = width; | |
| 528 uint32_t offset = 0; | |
| 529 const __m128i *mm_weights = (const __m128i *)weights; | |
| 530 const __m128i mm_weight_round = _mm_set1_epi32(WEIGHT_ROUND); | |
| 531 | |
| 532 if (0) | |
| 533 slow: | |
| 534 { | |
| 535 uint32_t off = (index->first_pixel + y) * stride + offset; | |
| 536 | |
| 537 offset++; | |
| 538 if (off >= mod) | |
| 539 off -= mod; | |
| 540 | |
| 541 { | |
| 542 const int32_t *w = (const int32_t *)&mm_weights[index->index]; | |
| 543 uint32_t j; | |
| 544 int32_t pixel0 = WEIGHT_ROUND; | |
| 545 | |
| 546 for (j = index->n; j > 0; j--) | |
| 547 { | |
| 548 pixel0 += tmp[off] * *w; | |
| 549 w += 4; | |
| 550 off += stride; | |
| 551 if (off >= mod) | |
| 552 off -= mod; | |
| 553 } | |
| 554 pixel0 >>= WEIGHT_SHIFT; | |
| 555 *dst++ = CLAMP(pixel0, 0, 255); | |
| 556 } | |
| 557 index++; | |
| 558 width--; | |
| 559 } | |
| 560 | |
| 561 while ((int)width > 0) | |
| 562 { | |
| 563 uint32_t off; | |
| 564 /* The slow flag stops us accessing off the end of the source row. | |
| 565 * It also tells us how many pixels we can do at once. This usage | |
| 566 * is different for zoom_y1 than for all other cores. */ | |
| 567 uint8_t n = index->slow; | |
| 568 if (n <= 1 || n > width) | |
| 569 goto slow; | |
| 570 off = (index->first_pixel + y) * stride + offset; | |
| 571 offset += n; | |
| 572 if (off >= mod) | |
| 573 off -= mod; | |
| 574 | |
| 575 { | |
| 576 const __m128i *w = &mm_weights[index->index]; | |
| 577 uint32_t j = index->n; | |
| 578 __m128i mm_pixels = mm_weight_round; | |
| 579 | |
| 580 if (j == 4) | |
| 581 { | |
| 582 __m128i pix0, pix1, pix2; | |
| 583 __m128i w0, w1, w2; | |
| 584 pix0 = _mm_loadu_si128((const __m128i *)&tmp[off]); | |
| 585 off += stride; | |
| 586 if (off >= mod) | |
| 587 off -= mod; | |
| 588 w0 = _mm_load_si128(w++); | |
| 589 pix0 = _mm_cvtepu8_epi32(pix0); | |
| 590 pix1 = _mm_loadu_si128((const __m128i *)&tmp[off]); | |
| 591 off += stride; | |
| 592 if (off >= mod) | |
| 593 off -= mod; | |
| 594 pix0 = _mm_mullo_epi32(pix0, w0); | |
| 595 w1 = _mm_load_si128(w++); | |
| 596 pix1 = _mm_cvtepu8_epi32(pix1); | |
| 597 pix2 = _mm_loadu_si128((const __m128i *)&tmp[off]); | |
| 598 off += stride; | |
| 599 if (off >= mod) | |
| 600 off -= mod; | |
| 601 mm_pixels = _mm_add_epi32(mm_pixels, pix0); | |
| 602 pix1 = _mm_mullo_epi32(pix1, w1); | |
| 603 w2 = _mm_load_si128(w++); | |
| 604 pix2 = _mm_cvtepu8_epi32(pix2); | |
| 605 pix0 = _mm_loadu_si128((const __m128i *)&tmp[off]); | |
| 606 off += stride; | |
| 607 if (off >= mod) | |
| 608 off -= mod; | |
| 609 mm_pixels = _mm_add_epi32(mm_pixels, pix1); | |
| 610 pix2 = _mm_mullo_epi32(pix2, w2); | |
| 611 w0 = _mm_load_si128(w++); | |
| 612 pix0 = _mm_cvtepu8_epi32(pix0); | |
| 613 pix0 = _mm_mullo_epi32(pix0, w0); | |
| 614 mm_pixels = _mm_add_epi32(mm_pixels, pix2); | |
| 615 mm_pixels = _mm_add_epi32(mm_pixels, pix0); | |
| 616 } | |
| 617 else | |
| 618 for ( ; j > 0; j--) | |
| 619 { | |
| 620 __m128i pix0 = _mm_loadu_si128((const __m128i *)&tmp[off]); | |
| 621 __m128i w0 = _mm_load_si128(w++); | |
| 622 pix0 = _mm_cvtepu8_epi32(pix0); | |
| 623 off += stride; | |
| 624 pix0 = _mm_mullo_epi32(pix0, w0); | |
| 625 if (off >= mod) | |
| 626 off -= mod; | |
| 627 mm_pixels = _mm_add_epi32(mm_pixels, pix0); | |
| 628 } | |
| 629 mm_pixels = _mm_srli_epi32(mm_pixels, WEIGHT_SHIFT); | |
| 630 mm_pixels = _mm_packus_epi32(mm_pixels, mm_pixels); // Clamp to 0 to 65535 range. | |
| 631 mm_pixels = _mm_packus_epi16(mm_pixels, mm_pixels); // Clamp to 0 to 255 range. | |
| 632 j = _mm_extract_epi32(mm_pixels, 0); | |
| 633 switch (n) | |
| 634 { | |
| 635 default: | |
| 636 case 4: | |
| 637 *(int32_t *)dst = j; | |
| 638 dst += 4; | |
| 639 break; | |
| 640 case 3: | |
| 641 *(int16_t *)dst = j; | |
| 642 ((uint8_t *)dst)[2] = j >> 16; | |
| 643 dst += 3; | |
| 644 break; | |
| 645 case 2: | |
| 646 *(int16_t *)dst = j; | |
| 647 dst += 2; | |
| 648 break; | |
| 649 case 1: | |
| 650 *(int8_t *)dst = j; | |
| 651 dst += 1; | |
| 652 break; | |
| 653 } | |
| 654 } | |
| 655 index += n; | |
| 656 width -= n; | |
| 657 } | |
| 658 } | |
| 659 | |
| 660 static void | |
| 661 zoom_y3_sse(uint8_t * dst, | |
| 662 const uint8_t * FZ_RESTRICT tmp, | |
| 663 const index_t * FZ_RESTRICT index, | |
| 664 const int32_t * FZ_RESTRICT weights, | |
| 665 uint32_t width, | |
| 666 uint32_t channels, | |
| 667 uint32_t mod, | |
| 668 int32_t y) | |
| 669 { | |
| 670 uint32_t stride = width * 3; | |
| 671 uint32_t offset = 0; | |
| 672 __m128i round = _mm_set1_epi32(WEIGHT_ROUND); | |
| 673 | |
| 674 while (width--) | |
| 675 { | |
| 676 uint32_t off = (index->first_pixel + y) * stride + offset; | |
| 677 | |
| 678 offset += 3; | |
| 679 if (off >= mod) | |
| 680 off -= mod; | |
| 681 | |
| 682 { | |
| 683 const int32_t *w = &weights[index->index]; | |
| 684 int32_t j = (int32_t)index->n; | |
| 685 __m128i mm0, mm1, mm2, mw0, mw1; | |
| 686 | |
| 687 if (j == 4) | |
| 688 { | |
| 689 mw0 = _mm_load_si128((const __m128i *)w); | |
| 690 mm0 = _mm_loadu_si128((const __m128i *)&tmp[off]); | |
| 691 // mm0 = ppoonnmmllkkjjiihhggffeeddccbbaa SSE2 | |
| 692 mw1 = _mm_shuffle_epi32(mw0, 0 + (0 << 2) + (0 << 4) + (0 << 6)); | |
| 693 mm0 = _mm_cvtepu8_epi32(mm0); // mm0 = 000000dd000000cc000000bb000000aa SSE4.1 | |
| 694 mm0 = _mm_mullo_epi32(mm0,mw1); // mm0 = 0000wdwd0000wcwc0000wbwb0000wawa SSE4.1 | |
| 695 mm1 = _mm_add_epi32(round, mm0);// mm1 = 0000wdwd0000wcwc0000wbwb0000wawa SSE4.1 | |
| 696 off += stride; | |
| 697 if (off >= mod) | |
| 698 off -= mod; | |
| 699 mm0 = _mm_loadu_si128((const __m128i *)&tmp[off]); | |
| 700 // mm0 = ppoonnmmllkkjjiihhggffeeddccbbaa SSE2 | |
| 701 mw1 = _mm_shuffle_epi32(mw0, 1 + (1 << 2) + (1 << 4) + (1 << 6)); | |
| 702 mm0 = _mm_cvtepu8_epi32(mm0); // mm0 = 000000dd000000cc000000bb000000aa SSE4.1 | |
| 703 mm0 = _mm_mullo_epi32(mm0,mw1); // mm0 = 0000wdwd0000wcwc0000wbwb0000wawa SSE4.1 | |
| 704 mm1 = _mm_add_epi32(mm1, mm0); // mm1 = 0000wdwd0000wcwc0000wbwb0000wawa SSE4.1 | |
| 705 off += stride; | |
| 706 if (off >= mod) | |
| 707 off -= mod; | |
| 708 mm0 = _mm_loadu_si128((const __m128i *)&tmp[off]); | |
| 709 // mm0 = ppoonnmmllkkjjiihhggffeeddccbbaa SSE2 | |
| 710 mw1 = _mm_shuffle_epi32(mw0, 2 + (2 << 2) + (2 << 4) + (2 << 6)); | |
| 711 mm0 = _mm_cvtepu8_epi32(mm0); // mm0 = 000000dd000000cc000000bb000000aa SSE4.1 | |
| 712 mm0 = _mm_mullo_epi32(mm0,mw1); // mm0 = 0000wdwd0000wcwc0000wbwb0000wawa SSE4.1 | |
| 713 mm1 = _mm_add_epi32(mm1, mm0); // mm1 = 0000wdwd0000wcwc0000wbwb0000wawa SSE4.1 | |
| 714 off += stride; | |
| 715 if (off >= mod) | |
| 716 off -= mod; | |
| 717 mm0 = _mm_loadu_si128((const __m128i *)&tmp[off]); | |
| 718 // mm0 = ppoonnmmllkkjjiihhggffeeddccbbaa SSE2 | |
| 719 mw1 = _mm_shuffle_epi32(mw0, 3 + (3 << 2) + (3 << 4) + (3 << 6)); | |
| 720 mm0 = _mm_cvtepu8_epi32(mm0); // mm0 = 000000dd000000cc000000bb000000aa SSE4.1 | |
| 721 mm0 = _mm_mullo_epi32(mm0,mw1); // mm0 = 0000wdwd0000wcwc0000wbwb0000wawa SSE4.1 | |
| 722 mm1 = _mm_add_epi32(mm1, mm0); // mm1 = 0000wdwd0000wcwc0000wbwb0000wawa SSE4.1 | |
| 723 } | |
| 724 else | |
| 725 { | |
| 726 int duff = j & 3; | |
| 727 w -= (4 - j) & 3; | |
| 728 mw0 = _mm_loadu_si128((const __m128i *)w); | |
| 729 w += 4; | |
| 730 mm1 = round; | |
| 731 /* This is a use of Duff's Device. I'm very sorry, but on the other hand, Yay! */ | |
| 732 switch (duff) | |
| 733 { | |
| 734 do | |
| 735 { | |
| 736 off += stride; | |
| 737 if (off >= mod) | |
| 738 off -= mod; | |
| 739 mw0 = _mm_load_si128((const __m128i *)w); | |
| 740 w += 4; | |
| 741 case 0: | |
| 742 mm0 = _mm_loadu_si128((const __m128i *)&tmp[off]); | |
| 743 // mm0 = ppoonnmmllkkjjiihhggffeeddccbbaa SSE2 | |
| 744 mw1 = _mm_shuffle_epi32(mw0, 0 + (0 << 2) + (0 << 4) + (0 << 6)); | |
| 745 off += stride; | |
| 746 if (off >= mod) | |
| 747 off -= mod; | |
| 748 mm2 = _mm_cvtepu8_epi32(mm0); // mm2 = 000000dd000000cc000000bb000000aa SSE4.1 | |
| 749 mm2 = _mm_mullo_epi32(mm2, mw1);// mm2 = 0000wdwd0000wcwc0000wbwb0000wawa SSE4.1 | |
| 750 mm1 = _mm_add_epi32(mm1, mm2); // mm1 = 0000xxxx0000xxxx0000xxxx0000xxxx SSE2 | |
| 751 case 3: | |
| 752 mm0 = _mm_loadu_si128((const __m128i *)&tmp[off]); | |
| 753 // mm0 = ppoonnmmllkkjjiihhggffeeddccbbaa SSE2 | |
| 754 mw1 = _mm_shuffle_epi32(mw0, 1 + (1 << 2) + (1 << 4) + (1 << 6)); | |
| 755 off += stride; | |
| 756 if (off >= mod) | |
| 757 off -= mod; | |
| 758 mm2 = _mm_cvtepu8_epi32(mm0); // mm2 = 000000dd000000cc000000bb000000aa SSE4.1 | |
| 759 mm2 = _mm_mullo_epi32(mm2, mw1);// mm2 = 0000wdwd0000wcwc0000wbwb0000wawa SSE4.1 | |
| 760 mm1 = _mm_add_epi32(mm1, mm2); // mm1 = 0000xxxx0000xxxx0000xxxx0000xxxx SSE2 | |
| 761 case 2: | |
| 762 mm0 = _mm_loadu_si128((const __m128i *)&tmp[off]); | |
| 763 // mm0 = ppoonnmmllkkjjiihhggffeeddccbbaa SSE2 | |
| 764 mw1 = _mm_shuffle_epi32(mw0, 2 + (2 << 2) + (2 << 4) + (2 << 6)); | |
| 765 off += stride; | |
| 766 if (off >= mod) | |
| 767 off -= mod; | |
| 768 mm2 = _mm_cvtepu8_epi32(mm0); // mm2 = 000000dd000000cc000000bb000000aa SSE4.1 | |
| 769 mm2 = _mm_mullo_epi32(mm2, mw1);// mm2 = 0000wdwd0000wcwc0000wbwb0000wawa SSE4.1 | |
| 770 mm1 = _mm_add_epi32(mm1, mm2); // mm1 = 0000xxxx0000xxxx0000xxxx0000xxxx SSE2 | |
| 771 case 1: | |
| 772 mm0 = _mm_loadu_si128((const __m128i *)&tmp[off]); | |
| 773 // mm0 = ppoonnmmllkkjjiihhggffeeddccbbaa SSE2 | |
| 774 mw1 = _mm_shuffle_epi32(mw0, 3 + (3 << 2) + (3 << 4) + (3 << 6)); | |
| 775 mm2 = _mm_cvtepu8_epi32(mm0); // mm2 = 000000dd000000cc000000bb000000aa SSE4.1 | |
| 776 mm2 = _mm_mullo_epi32(mm2, mw1);// mm2 = 0000wdwd0000wcwc0000wbwb0000wawa SSE4.1 | |
| 777 mm1 = _mm_add_epi32(mm1, mm2); // mm1 = 0000xxxx0000xxxx0000xxxx0000xxxx SSE2 | |
| 778 j -= 4; | |
| 779 } while (j > 0); | |
| 780 } | |
| 781 } | |
| 782 #if 0 | |
| 783 mm1 = _mm_srai_epi32(mm1, WEIGHT_SHIFT - 8); // Shift down | |
| 784 mm1 = _mm_packus_epi32(mm1,mm1); // Clamp to 0 to 65535 range. | |
| 785 *dst++ = _mm_extract_epi8(mm1,1); | |
| 786 *dst++ = _mm_extract_epi8(mm1,3); | |
| 787 *dst++ = _mm_extract_epi8(mm1,5); | |
| 788 #else | |
| 789 mm1 = _mm_srai_epi32(mm1, WEIGHT_SHIFT); // Shift down | |
| 790 mm1 = _mm_packus_epi32(mm1, mm1); // Clamp to 0 to 65535 range. | |
| 791 mm1 = _mm_packus_epi16(mm1, mm1); // Clamp to 0 to 255 range. | |
| 792 j = _mm_extract_epi32(mm1, 0); | |
| 793 *(int16_t *)dst = j; | |
| 794 ((uint8_t *)dst)[2] = j >> 16; | |
| 795 dst += 3; | |
| 796 #endif | |
| 797 } | |
| 798 index++; | |
| 799 } | |
| 800 } | |
| 801 | |
| 802 static void | |
| 803 zoom_y4_sse(uint8_t * dst, | |
| 804 const uint8_t * FZ_RESTRICT tmp, | |
| 805 const index_t * FZ_RESTRICT index, | |
| 806 const int32_t * FZ_RESTRICT weights, | |
| 807 uint32_t width, | |
| 808 uint32_t channels, | |
| 809 uint32_t mod, | |
| 810 int32_t y) | |
| 811 { | |
| 812 uint32_t stride = width * 4; | |
| 813 uint32_t offset = 0; | |
| 814 __m128i round = _mm_set1_epi32(WEIGHT_ROUND); | |
| 815 | |
| 816 while (width--) | |
| 817 { | |
| 818 uint32_t off = (index->first_pixel + y) * stride + offset; | |
| 819 | |
| 820 offset += 4; | |
| 821 if (off >= mod) | |
| 822 off -= mod; | |
| 823 | |
| 824 { | |
| 825 const int32_t *w = &weights[index->index]; | |
| 826 int32_t j = (int32_t)index->n; | |
| 827 __m128i mm0, mm1, mm2, mw0, mw1; | |
| 828 | |
| 829 if (j == 4) | |
| 830 { | |
| 831 mw0 = _mm_load_si128((const __m128i *)w); | |
| 832 mm0 = _mm_loadu_si128((const __m128i *)&tmp[off]); | |
| 833 // mm0 = ppoonnmmllkkjjiihhggffeeddccbbaa SSE2 | |
| 834 mw1 = _mm_shuffle_epi32(mw0, 0 + (0 << 2) + (0 << 4) + (0 << 6)); | |
| 835 mm0 = _mm_cvtepu8_epi32(mm0); // mm0 = 000000dd000000cc000000bb000000aa SSE4.1 | |
| 836 mm0 = _mm_mullo_epi32(mm0,mw1); // mm0 = 0000wdwd0000wcwc0000wbwb0000wawa SSE4.1 | |
| 837 mm1 = _mm_add_epi32(round, mm0);// mm1 = 0000wdwd0000wcwc0000wbwb0000wawa SSE4.1 | |
| 838 off += stride; | |
| 839 if (off >= mod) | |
| 840 off -= mod; | |
| 841 mm0 = _mm_loadu_si128((const __m128i *)&tmp[off]); | |
| 842 // mm0 = ppoonnmmllkkjjiihhggffeeddccbbaa SSE2 | |
| 843 mw1 = _mm_shuffle_epi32(mw0, 1 + (1 << 2) + (1 << 4) + (1 << 6)); | |
| 844 mm0 = _mm_cvtepu8_epi32(mm0); // mm0 = 000000dd000000cc000000bb000000aa SSE4.1 | |
| 845 mm0 = _mm_mullo_epi32(mm0,mw1); // mm0 = 0000wdwd0000wcwc0000wbwb0000wawa SSE4.1 | |
| 846 mm1 = _mm_add_epi32(mm1, mm0); // mm1 = 0000wdwd0000wcwc0000wbwb0000wawa SSE4.1 | |
| 847 off += stride; | |
| 848 if (off >= mod) | |
| 849 off -= mod; | |
| 850 mm0 = _mm_loadu_si128((const __m128i *)&tmp[off]); | |
| 851 // mm0 = ppoonnmmllkkjjiihhggffeeddccbbaa SSE2 | |
| 852 mw1 = _mm_shuffle_epi32(mw0, 2 + (2 << 2) + (2 << 4) + (2 << 6)); | |
| 853 mm0 = _mm_cvtepu8_epi32(mm0); // mm0 = 000000dd000000cc000000bb000000aa SSE4.1 | |
| 854 mm0 = _mm_mullo_epi32(mm0,mw1); // mm0 = 0000wdwd0000wcwc0000wbwb0000wawa SSE4.1 | |
| 855 mm1 = _mm_add_epi32(mm1, mm0); // mm1 = 0000wdwd0000wcwc0000wbwb0000wawa SSE4.1 | |
| 856 off += stride; | |
| 857 if (off >= mod) | |
| 858 off -= mod; | |
| 859 mm0 = _mm_loadu_si128((const __m128i *)&tmp[off]); | |
| 860 // mm0 = ppoonnmmllkkjjiihhggffeeddccbbaa SSE2 | |
| 861 mw1 = _mm_shuffle_epi32(mw0, 3 + (3 << 2) + (3 << 4) + (3 << 6)); | |
| 862 mm0 = _mm_cvtepu8_epi32(mm0); // mm0 = 000000dd000000cc000000bb000000aa SSE4.1 | |
| 863 mm0 = _mm_mullo_epi32(mm0,mw1); // mm0 = 0000wdwd0000wcwc0000wbwb0000wawa SSE4.1 | |
| 864 mm1 = _mm_add_epi32(mm1, mm0); // mm1 = 0000wdwd0000wcwc0000wbwb0000wawa SSE4.1 | |
| 865 } | |
| 866 else | |
| 867 { | |
| 868 int duff = j & 3; | |
| 869 w -= (4 - j) & 3; | |
| 870 mw0 = _mm_loadu_si128((const __m128i *)w); | |
| 871 w += 4; | |
| 872 mm1 = round; | |
| 873 /* This is a use of Duff's Device. I'm very sorry, but on the other hand, Yay! */ | |
| 874 switch (duff) | |
| 875 { | |
| 876 do | |
| 877 { | |
| 878 off += stride; | |
| 879 if (off >= mod) | |
| 880 off -= mod; | |
| 881 mw0 = _mm_load_si128((const __m128i *)w); | |
| 882 w += 4; | |
| 883 case 0: | |
| 884 mm0 = _mm_loadu_si128((const __m128i *)&tmp[off]); | |
| 885 // mm0 = ppoonnmmllkkjjiihhggffeeddccbbaa SSE2 | |
| 886 mw1 = _mm_shuffle_epi32(mw0, 0 + (0 << 2) + (0 << 4) + (0 << 6)); | |
| 887 off += stride; | |
| 888 if (off >= mod) | |
| 889 off -= mod; | |
| 890 mm2 = _mm_cvtepu8_epi32(mm0); // mm2 = 000000dd000000cc000000bb000000aa SSE4.1 | |
| 891 mm2 = _mm_mullo_epi32(mm2, mw1);// mm2 = 0000wdwd0000wcwc0000wbwb0000wawa SSE4.1 | |
| 892 mm1 = _mm_add_epi32(mm1, mm2); // mm1 = 0000xxxx0000xxxx0000xxxx0000xxxx SSE2 | |
| 893 case 3: | |
| 894 mm0 = _mm_loadu_si128((const __m128i *)&tmp[off]); | |
| 895 // mm0 = ppoonnmmllkkjjiihhggffeeddccbbaa SSE2 | |
| 896 mw1 = _mm_shuffle_epi32(mw0, 1 + (1 << 2) + (1 << 4) + (1 << 6)); | |
| 897 off += stride; | |
| 898 if (off >= mod) | |
| 899 off -= mod; | |
| 900 mm2 = _mm_cvtepu8_epi32(mm0); // mm2 = 000000dd000000cc000000bb000000aa SSE4.1 | |
| 901 mm2 = _mm_mullo_epi32(mm2, mw1);// mm2 = 0000wdwd0000wcwc0000wbwb0000wawa SSE4.1 | |
| 902 mm1 = _mm_add_epi32(mm1, mm2); // mm1 = 0000xxxx0000xxxx0000xxxx0000xxxx SSE2 | |
| 903 case 2: | |
| 904 mm0 = _mm_loadu_si128((const __m128i *)&tmp[off]); | |
| 905 // mm0 = ppoonnmmllkkjjiihhggffeeddccbbaa SSE2 | |
| 906 mw1 = _mm_shuffle_epi32(mw0, 2 + (2 << 2) + (2 << 4) + (2 << 6)); | |
| 907 off += stride; | |
| 908 if (off >= mod) | |
| 909 off -= mod; | |
| 910 mm2 = _mm_cvtepu8_epi32(mm0); // mm2 = 000000dd000000cc000000bb000000aa SSE4.1 | |
| 911 mm2 = _mm_mullo_epi32(mm2, mw1);// mm2 = 0000wdwd0000wcwc0000wbwb0000wawa SSE4.1 | |
| 912 mm1 = _mm_add_epi32(mm1, mm2); // mm1 = 0000xxxx0000xxxx0000xxxx0000xxxx SSE2 | |
| 913 case 1: | |
| 914 mm0 = _mm_loadu_si128((const __m128i *)&tmp[off]); | |
| 915 // mm0 = ppoonnmmllkkjjiihhggffeeddccbbaa SSE2 | |
| 916 mw1 = _mm_shuffle_epi32(mw0, 3 + (3 << 2) + (3 << 4) + (3 << 6)); | |
| 917 mm2 = _mm_cvtepu8_epi32(mm0); // mm2 = 000000dd000000cc000000bb000000aa SSE4.1 | |
| 918 mm2 = _mm_mullo_epi32(mm2, mw1);// mm2 = 0000wdwd0000wcwc0000wbwb0000wawa SSE4.1 | |
| 919 mm1 = _mm_add_epi32(mm1, mm2); // mm1 = 0000xxxx0000xxxx0000xxxx0000xxxx SSE2 | |
| 920 j -= 4; | |
| 921 } while (j > 0); | |
| 922 } | |
| 923 } | |
| 924 #if 0 | |
| 925 mm1 = _mm_srai_epi32(mm1, WEIGHT_SHIFT - 8); // Shift down | |
| 926 mm1 = _mm_packus_epi32(mm1,mm1); // Clamp to 0 to 65535 range. | |
| 927 *dst++ = _mm_extract_epi8(mm1,1); | |
| 928 *dst++ = _mm_extract_epi8(mm1,3); | |
| 929 *dst++ = _mm_extract_epi8(mm1,5); | |
| 930 *dst++ = _mm_extract_epi8(mm1,7); | |
| 931 #else | |
| 932 mm1 = _mm_srai_epi32(mm1, WEIGHT_SHIFT); // Shift down | |
| 933 mm1 = _mm_packus_epi32(mm1,mm1); // Clamp to 0 to 65535 range. | |
| 934 mm1 = _mm_packus_epi16(mm1,mm1); // Clamp to 0 to 255 range. | |
| 935 *(int32_t *)dst = _mm_extract_epi32(mm1, 0); | |
| 936 dst += 4; | |
| 937 #endif | |
| 938 } | |
| 939 index++; | |
| 940 } | |
| 941 } | |
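
All of the SSE cores above evaluate the same fixed-point filter that their scalar slow and fallback paths spell out: accumulate source samples times int32 weights starting from WEIGHT_ROUND, shift right by WEIGHT_SHIFT, and saturate to the 0..255 range (the job of the _mm_packus_* steps). As a reading aid, here is a minimal scalar sketch of that per-pixel arithmetic for one channel; the helper name and the fallback macro values are hypothetical and not taken from MuPDF.

```c
#include <stdint.h>

/* WEIGHT_SHIFT, WEIGHT_ROUND and CLAMP normally come from the including
 * deskew.c; the fallback definitions below exist only so this sketch
 * compiles on its own, and the 14-bit shift is an assumption. */
#ifndef WEIGHT_SHIFT
#define WEIGHT_SHIFT 14
#define WEIGHT_ROUND (1 << (WEIGHT_SHIFT - 1))
#define CLAMP(v, lo, hi) ((v) < (lo) ? (lo) : ((v) > (hi) ? (hi) : (v)))
#endif

/* One single-channel output pixel: multiply-accumulate n source samples
 * against fixed-point weights, round, shift down, and saturate to 0..255,
 * mirroring the scalar slow paths above. */
static uint8_t
zoom_pixel_scalar(const uint8_t *s, const int32_t *w, uint32_t n)
{
    int32_t pixel = WEIGHT_ROUND;          /* pre-bias so the shift rounds */
    uint32_t j;

    for (j = 0; j < n; j++)
        pixel += s[j] * w[j];              /* weighted sum of source samples */
    pixel >>= WEIGHT_SHIFT;                /* drop the fixed-point fraction */
    return (uint8_t)CLAMP(pixel, 0, 255);  /* saturate, like _mm_packus_* */
}
```

Beyond that arithmetic, the vector paths rely on SSE4.1 intrinsics (_mm_cvtepu8_epi32, _mm_mullo_epi32, _mm_packus_epi32, _mm_extract_epi8/_mm_extract_epi32) and the SSSE3 _mm_hadd_epi32, which is presumably why deskew.c only includes this header "if SSE cores are allowed", as the comment at the top of the file says.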
