Mercurial > hgrepos > Python2 > PyMuPDF
comparison mupdf-source/thirdparty/openjpeg/src/lib/openjp2/dwt.c @ 2:b50eed0cc0ef upstream
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
The directory name has changed: no version number in the expanded directory now.
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Mon, 15 Sep 2025 11:43:07 +0200 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| 1:1d09e1dec1d9 | 2:b50eed0cc0ef |
|---|---|
| 1 /* | |
| 2 * The copyright in this software is being made available under the 2-clauses | |
| 3 * BSD License, included below. This software may be subject to other third | |
| 4 * party and contributor rights, including patent rights, and no such rights | |
| 5 * are granted under this license. | |
| 6 * | |
| 7 * Copyright (c) 2002-2014, Universite catholique de Louvain (UCL), Belgium | |
| 8 * Copyright (c) 2002-2014, Professor Benoit Macq | |
| 9 * Copyright (c) 2001-2003, David Janssens | |
| 10 * Copyright (c) 2002-2003, Yannick Verschueren | |
| 11 * Copyright (c) 2003-2007, Francois-Olivier Devaux | |
| 12 * Copyright (c) 2003-2014, Antonin Descampe | |
| 13 * Copyright (c) 2005, Herve Drolon, FreeImage Team | |
| 14 * Copyright (c) 2007, Jonathan Ballard <dzonatas@dzonux.net> | |
| 15 * Copyright (c) 2007, Callum Lerwick <seg@haxxed.com> | |
| 16 * Copyright (c) 2017, IntoPIX SA <support@intopix.com> | |
| 17 * All rights reserved. | |
| 18 * | |
| 19 * Redistribution and use in source and binary forms, with or without | |
| 20 * modification, are permitted provided that the following conditions | |
| 21 * are met: | |
| 22 * 1. Redistributions of source code must retain the above copyright | |
| 23 * notice, this list of conditions and the following disclaimer. | |
| 24 * 2. Redistributions in binary form must reproduce the above copyright | |
| 25 * notice, this list of conditions and the following disclaimer in the | |
| 26 * documentation and/or other materials provided with the distribution. | |
| 27 * | |
| 28 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS `AS IS' | |
| 29 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |
| 30 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |
| 31 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE | |
| 32 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR | |
| 33 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF | |
| 34 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS | |
| 35 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN | |
| 36 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) | |
| 37 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE | |
| 38 * POSSIBILITY OF SUCH DAMAGE. | |
| 39 */ | |
| 40 | |
| 41 #include <assert.h> | |
| 42 | |
| 43 #define OPJ_SKIP_POISON | |
| 44 #include "opj_includes.h" | |
| 45 | |
| 46 #ifdef __SSE__ | |
| 47 #include <xmmintrin.h> | |
| 48 #endif | |
| 49 #ifdef __SSE2__ | |
| 50 #include <emmintrin.h> | |
| 51 #endif | |
| 52 #ifdef __SSSE3__ | |
| 53 #include <tmmintrin.h> | |
| 54 #endif | |
| 55 #if (defined(__AVX2__) || defined(__AVX512F__)) | |
| 56 #include <immintrin.h> | |
| 57 #endif | |
| 58 | |
| 59 #if defined(__GNUC__) | |
| 60 #pragma GCC poison malloc calloc realloc free | |
| 61 #endif | |
| 62 | |
| 63 /** @defgroup DWT DWT - Implementation of a discrete wavelet transform */ | |
| 64 /*@{*/ | |
| 65 | |
| 66 #define OPJ_WS(i) v->mem[(i)*2] | |
| 67 #define OPJ_WD(i) v->mem[(1+(i)*2)] | |
| 68 | |
| 69 #if defined(__AVX512F__) | |
| 70 /** Number of int32 values in an AVX512 register */ | |
| 71 #define VREG_INT_COUNT 16 | |
| 72 #elif defined(__AVX2__) | |
| 73 /** Number of int32 values in an AVX2 register */ | |
| 74 #define VREG_INT_COUNT 8 | |
| 75 #else | |
| 76 /** Number of int32 values in an SSE2 register */ | |
| 77 #define VREG_INT_COUNT 4 | |
| 78 #endif | |
| 79 | |
| 80 /** Number of columns that we can process in parallel in the vertical pass */ | |
| 81 #define PARALLEL_COLS_53 (2*VREG_INT_COUNT) | |
| 82 | |
| 83 /** @name Local data structures */ | |
| 84 /*@{*/ | |
| 85 | |
| 86 typedef struct dwt_local { | |
| 87 OPJ_INT32* mem; /* scratch buffer; opj_idwt53_h passes it to the 1-D kernels as their temporary output area */ | |
| 88 OPJ_INT32 dn; /* number of elements in high pass band */ | |
| 89 OPJ_INT32 sn; /* number of elements in low pass band */ | |
| 90 OPJ_INT32 cas; /* 0 = start on even coord, 1 = start on odd coord */ | |
| 91 } opj_dwt_t; | |
| 92 | |
| 93 #define NB_ELTS_V8 8 | |
| 94 | |
| 95 typedef union { | |
| 96 OPJ_FLOAT32 f[NB_ELTS_V8]; /* NB_ELTS_V8 (= 8) consecutive float values handled as one group */ | |
| 97 } opj_v8_t; | |
| 98 | |
| 99 typedef struct v8dwt_local { | |
| 100 opj_v8_t* wavelet ; /* array of 8-float groups; NOTE(review): presumably the working buffer of the 9-7 transform, analogous to opj_dwt_t.mem - confirm against the users of this struct */ | |
| 101 OPJ_INT32 dn ; /* number of elements in high pass band */ | |
| 102 OPJ_INT32 sn ; /* number of elements in low pass band */ | |
| 103 OPJ_INT32 cas ; /* 0 = start on even coord, 1 = start on odd coord */ | |
| 104 OPJ_UINT32 win_l_x0; /* start coord in low pass band */ | |
| 105 OPJ_UINT32 win_l_x1; /* end coord in low pass band */ | |
| 106 OPJ_UINT32 win_h_x0; /* start coord in high pass band */ | |
| 107 OPJ_UINT32 win_h_x1; /* end coord in high pass band */ | |
| 108 } opj_v8dwt_t ; | |
| 109 | |
| 110 /* Single-precision constants taken from table F.4 of the standard (NOTE(review): presumably the lifting coefficients of the 9-7 transform - confirm against their users) */ | |
| 111 static const OPJ_FLOAT32 opj_dwt_alpha = -1.586134342f; | |
| 112 static const OPJ_FLOAT32 opj_dwt_beta = -0.052980118f; | |
| 113 static const OPJ_FLOAT32 opj_dwt_gamma = 0.882911075f; | |
| 114 static const OPJ_FLOAT32 opj_dwt_delta = 0.443506852f; | |
| 115 | |
| 116 static const OPJ_FLOAT32 opj_K = 1.230174105f; /* scaling constant K */ | |
| 117 static const OPJ_FLOAT32 opj_invK = (OPJ_FLOAT32)(1.0 / 1.230174105); /* reciprocal of opj_K, evaluated in double precision and then narrowed to float */ | |
| 118 | |
| 119 /*@}*/ | |
| 120 | |
| 121 /** @name Local static functions */ | |
| 122 /*@{*/ | |
| 123 | |
| 124 /** | |
| 125 Forward lazy transform (horizontal) | |
| 126 */ | |
| 127 static void opj_dwt_deinterleave_h(const OPJ_INT32 * OPJ_RESTRICT a, | |
| 128 OPJ_INT32 * OPJ_RESTRICT b, | |
| 129 OPJ_INT32 dn, | |
| 130 OPJ_INT32 sn, OPJ_INT32 cas); | |
| 131 | |
| 132 /** | |
| 133 Forward 9-7 wavelet transform in 1-D | |
| 134 */ | |
| 135 static void opj_dwt_encode_1_real(void *a, OPJ_INT32 dn, OPJ_INT32 sn, | |
| 136 OPJ_INT32 cas); | |
| 137 /** | |
| 138 Explicit calculation of the Quantization Stepsizes | |
| 139 */ | |
| 140 static void opj_dwt_encode_stepsize(OPJ_INT32 stepsize, OPJ_INT32 numbps, | |
| 141 opj_stepsize_t *bandno_stepsize); | |
| 142 /** | |
| 143 Inverse wavelet transform in 2-D. | |
| 144 */ | |
| 145 static OPJ_BOOL opj_dwt_decode_tile(opj_thread_pool_t* tp, | |
| 146 opj_tcd_tilecomp_t* tilec, OPJ_UINT32 i); | |
| 147 | |
| 148 static OPJ_BOOL opj_dwt_decode_partial_tile( | |
| 149 opj_tcd_tilecomp_t* tilec, | |
| 150 OPJ_UINT32 numres); | |
| 151 | |
| 152 /* Forward transform, for the vertical pass, processing cols columns */ | |
| 153 /* where cols <= NB_ELTS_V8 */ | |
| 154 /* Where void* is an OPJ_INT32* for 5x3 and an OPJ_FLOAT32* for 9x7 */ | |
| 155 typedef void (*opj_encode_and_deinterleave_v_fnptr_type)( | |
| 156 void *array, | |
| 157 void *tmp, | |
| 158 OPJ_UINT32 height, | |
| 159 OPJ_BOOL even, | |
| 160 OPJ_UINT32 stride_width, | |
| 161 OPJ_UINT32 cols); | |
| 162 | |
| 163 /* Where void* is an OPJ_INT32* for 5x3 and an OPJ_FLOAT32* for 9x7 */ | |
| 164 typedef void (*opj_encode_and_deinterleave_h_one_row_fnptr_type)( | |
| 165 void *row, | |
| 166 void *tmp, | |
| 167 OPJ_UINT32 width, | |
| 168 OPJ_BOOL even); | |
| 169 | |
| 170 static OPJ_BOOL opj_dwt_encode_procedure(opj_thread_pool_t* tp, | |
| 171 opj_tcd_tilecomp_t * tilec, | |
| 172 opj_encode_and_deinterleave_v_fnptr_type p_encode_and_deinterleave_v, | |
| 173 opj_encode_and_deinterleave_h_one_row_fnptr_type | |
| 174 p_encode_and_deinterleave_h_one_row); | |
| 175 | |
| 176 static OPJ_UINT32 opj_dwt_max_resolution(opj_tcd_resolution_t* OPJ_RESTRICT r, | |
| 177 OPJ_UINT32 i); | |
| 178 | |
| 179 /* <summary> */ | |
| 180 /* Inverse 9-7 wavelet transform in 1-D. */ | |
| 181 /* </summary> */ | |
| 182 | |
| 183 /*@}*/ | |
| 184 | |
| 185 /*@}*/ | |
| 186 | |
| 187 #define OPJ_S(i) a[(i)*2] /* i-th even-position element of the interleaved array a (a must be in scope) */ | |
| 188 #define OPJ_D(i) a[(1+(i)*2)] /* i-th odd-position element of the interleaved array a */ | |
| 189 #define OPJ_S_(i) ((i)<0?OPJ_S(0):((i)>=sn?OPJ_S(sn-1):OPJ_S(i))) /* OPJ_S with i clamped to [0, sn-1]; needs sn in scope */ | |
| 190 #define OPJ_D_(i) ((i)<0?OPJ_D(0):((i)>=dn?OPJ_D(dn-1):OPJ_D(i))) /* OPJ_D with i clamped to [0, dn-1]; needs dn in scope */ | |
| 191 /* Variants with the sn/dn bounds swapped; used by the cas != 0 branch of opj_dwt_decode_1_ */ | |
| 192 #define OPJ_SS_(i) ((i)<0?OPJ_S(0):((i)>=dn?OPJ_S(dn-1):OPJ_S(i))) /* OPJ_S with i clamped to [0, dn-1] */ | |
| 193 #define OPJ_DD_(i) ((i)<0?OPJ_D(0):((i)>=sn?OPJ_D(sn-1):OPJ_D(i))) /* OPJ_D with i clamped to [0, sn-1] */ | |
| 194 | |
| 195 /* <summary> */ | |
| 196 /* This table contains the norms of the 5-3 wavelets for different bands. */ | |
| 197 /* </summary> */ | |
| 198 /* FIXME! the array should really be extended up to 33 resolution levels */ | |
| 199 /* See https://github.com/uclouvain/openjpeg/issues/493 */ | |
| 200 static const OPJ_FLOAT64 opj_dwt_norms[4][10] = { /* only the first row lists 10 values; rows 1-3 list 9, so their last element is implicitly 0.0 */ | |
| 201 {1.000, 1.500, 2.750, 5.375, 10.68, 21.34, 42.67, 85.33, 170.7, 341.3}, | |
| 202 {1.038, 1.592, 2.919, 5.703, 11.33, 22.64, 45.25, 90.48, 180.9}, | |
| 203 {1.038, 1.592, 2.919, 5.703, 11.33, 22.64, 45.25, 90.48, 180.9}, | |
| 204 {.7186, .9218, 1.586, 3.043, 6.019, 12.01, 24.00, 47.97, 95.93} | |
| 205 }; | |
| 206 | |
| 207 /* <summary> */ | |
| 208 /* This table contains the norms of the 9-7 wavelets for different bands. */ | |
| 209 /* </summary> */ | |
| 210 /* FIXME! the array should really be extended up to 33 resolution levels */ | |
| 211 /* See https://github.com/uclouvain/openjpeg/issues/493 */ | |
| 212 static const OPJ_FLOAT64 opj_dwt_norms_real[4][10] = { /* only the first row lists 10 values; rows 1-3 list 9, so their last element is implicitly 0.0 */ | |
| 213 {1.000, 1.965, 4.177, 8.403, 16.90, 33.84, 67.69, 135.3, 270.6, 540.9}, | |
| 214 {2.022, 3.989, 8.355, 17.04, 34.27, 68.63, 137.3, 274.6, 549.0}, | |
| 215 {2.022, 3.989, 8.355, 17.04, 34.27, 68.63, 137.3, 274.6, 549.0}, | |
| 216 {2.080, 3.865, 8.307, 17.18, 34.71, 69.59, 139.3, 278.6, 557.2} | |
| 217 }; | |
| 218 | |
| 219 /* | |
| 220 ========================================================== | |
| 221 local functions | |
| 222 ========================================================== | |
| 223 */ | |
| 224 | |
| 225 /* <summary> */ | |
| 226 /* Forward lazy transform (horizontal): splits the interleaved row a into two contiguous halves of b. b[0..sn-1] receives a[cas], a[cas+2], ...; b[sn..sn+dn-1] receives a[1-cas], a[3-cas], ... a and b are declared OPJ_RESTRICT. */ | |
| 227 /* </summary> */ | |
| 228 static void opj_dwt_deinterleave_h(const OPJ_INT32 * OPJ_RESTRICT a, | |
| 229 OPJ_INT32 * OPJ_RESTRICT b, | |
| 230 OPJ_INT32 dn, | |
| 231 OPJ_INT32 sn, OPJ_INT32 cas) | |
| 232 { | |
| 233 OPJ_INT32 i; | |
| 234 OPJ_INT32 * OPJ_RESTRICT l_dest = b; /* first half of b: the sn samples */ | |
| 235 const OPJ_INT32 * OPJ_RESTRICT l_src = a + cas; /* first of the sn samples sits at index cas */ | |
| 236 | |
| 237 for (i = 0; i < sn; ++i) { | |
| 238 *l_dest++ = *l_src; | |
| 239 l_src += 2; /* skip the sample of the other band */ | |
| 240 } | |
| 241 | |
| 242 l_dest = b + sn; /* second half of b: the dn samples */ | |
| 243 l_src = a + 1 - cas; /* the dn samples start at the other parity */ | |
| 244 | |
| 245 for (i = 0; i < dn; ++i) { | |
| 246 *l_dest++ = *l_src; | |
| 247 l_src += 2; | |
| 248 } | |
| 249 } | |
| 250 | |
| 251 #ifdef STANDARD_SLOW_VERSION | |
| 252 /* <summary> */ | |
| 253 /* Inverse lazy transform (horizontal): interleaves the two contiguous halves of a into h->mem. a[0..sn-1] goes to h->mem[cas], h->mem[cas+2], ...; a[sn..sn+dn-1] goes to h->mem[1-cas], h->mem[3-cas], ... (sn, dn, cas taken from h). */ | |
| 254 /* </summary> */ | |
| 255 static void opj_dwt_interleave_h(const opj_dwt_t* h, OPJ_INT32 *a) | |
| 256 { | |
| 257 const OPJ_INT32 *ai = a; /* read cursor in the contiguous input */ | |
| 258 OPJ_INT32 *bi = h->mem + h->cas; /* write cursor, advances by 2 */ | |
| 259 OPJ_INT32 i = h->sn; | |
| 260 while (i--) { | |
| 261 *bi = *(ai++); | |
| 262 bi += 2; | |
| 263 } | |
| 264 ai = a + h->sn; /* second half of the input */ | |
| 265 bi = h->mem + 1 - h->cas; /* opposite parity from the first pass */ | |
| 266 i = h->dn ; | |
| 267 while (i--) { | |
| 268 *bi = *(ai++); | |
| 269 bi += 2; | |
| 270 } | |
| 271 } | |
| 272 | |
| 273 /* <summary> */ | |
| 274 /* Inverse lazy transform (vertical): same interleaving into v->mem as the horizontal variant, but consecutive input samples are x elements apart in a. The first v->sn samples start at a, the following v->dn samples start at a + v->sn * x. */ | |
| 275 /* </summary> */ | |
| 276 static void opj_dwt_interleave_v(const opj_dwt_t* v, OPJ_INT32 *a, OPJ_INT32 x) | |
| 277 { | |
| 278 const OPJ_INT32 *ai = a; /* read cursor, advances by x per sample */ | |
| 279 OPJ_INT32 *bi = v->mem + v->cas; /* write cursor, advances by 2 */ | |
| 280 OPJ_INT32 i = v->sn; | |
| 281 while (i--) { | |
| 282 *bi = *ai; | |
| 283 bi += 2; | |
| 284 ai += x; | |
| 285 } | |
| 286 ai = a + (v->sn * (OPJ_SIZE_T)x); /* offset computed in OPJ_SIZE_T arithmetic */ | |
| 287 bi = v->mem + 1 - v->cas; /* opposite parity from the first pass */ | |
| 288 i = v->dn ; | |
| 289 while (i--) { | |
| 290 *bi = *ai; | |
| 291 bi += 2; | |
| 292 ai += x; | |
| 293 } | |
| 294 } | |
| 295 | |
| 296 #endif /* STANDARD_SLOW_VERSION */ | |
| 297 | |
| 298 #ifdef STANDARD_SLOW_VERSION | |
| 299 /* <summary> */ | |
| 300 /* Inverse 5-3 wavelet transform in 1-D, performed in place on the interleaved array a through the OPJ_S/OPJ_D accessor macros. dn and sn are the element counts used as loop and clamping bounds; cas selects which of the two lifting orders below is applied. */ | |
| 301 /* </summary> */ | |
| 302 static void opj_dwt_decode_1_(OPJ_INT32 *a, OPJ_INT32 dn, OPJ_INT32 sn, | |
| 303 OPJ_INT32 cas) | |
| 304 { | |
| 305 OPJ_INT32 i; | |
| 306 | |
| 307 if (!cas) { | |
| 308 if ((dn > 0) || (sn > 1)) { /* NEW : CASE ONE ELEMENT */ | |
| 309 for (i = 0; i < sn; i++) { | |
| 310 OPJ_S(i) -= (OPJ_D_(i - 1) + OPJ_D_(i) + 2) >> 2; /* first lifting step: update even positions from the neighbouring odd ones (indices clamped) */ | |
| 311 } | |
| 312 for (i = 0; i < dn; i++) { | |
| 313 OPJ_D(i) += (OPJ_S_(i) + OPJ_S_(i + 1)) >> 1; /* second lifting step: update odd positions from the already updated even ones */ | |
| 314 } | |
| 315 } | |
| 316 } else { | |
| 317 if (!sn && dn == 1) { /* NEW : CASE ONE ELEMENT */ | |
| 318 OPJ_S(0) /= 2; /* single-sample case: a[0] is halved */ | |
| 319 } else { | |
| 320 for (i = 0; i < sn; i++) { | |
| 321 OPJ_D(i) -= (OPJ_SS_(i) + OPJ_SS_(i + 1) + 2) >> 2; /* first lifting step, here applied to the odd positions */ | |
| 322 } | |
| 323 for (i = 0; i < dn; i++) { | |
| 324 OPJ_S(i) += (OPJ_DD_(i) + OPJ_DD_(i - 1)) >> 1; /* second lifting step, here applied to the even positions */ | |
| 325 } | |
| 326 } | |
| 327 } | |
| 328 } | |
| 329 | |
| 330 static void opj_dwt_decode_1(const opj_dwt_t *v) | |
| 331 { | |
| 332 opj_dwt_decode_1_(v->mem, v->dn, v->sn, v->cas); /* unpack the descriptor and run the in-place 1-D inverse 5-3 transform on v->mem */ | |
| 333 } | |
| 334 | |
| 335 #endif /* STANDARD_SLOW_VERSION */ | |
| 336 | |
| 337 #if defined(__AVX512F__) | |
| 338 static int32_t loop_short_sse(int32_t len, const int32_t** lf_ptr, | |
| 339 const int32_t** hf_ptr, int32_t** out_ptr, | |
| 340 int32_t* prev_even) | |
| 341 { /* 128-bit helper for the AVX512 path of opj_idwt53_h_cas0: runs (len - 2) / 8 iterations, each producing 8 interleaved outputs from 4 samples read via *lf_ptr and 5 read via *hf_ptr. Advances *lf_ptr, *hf_ptr and *out_ptr, keeps the pending even output in *prev_even, and returns the number of iterations executed. */ | |
| 342 int32_t next_even; | |
| 343 __m128i odd, even_m1, unpack1, unpack2; | |
| 344 const int32_t batch = (len - 2) / 8; | |
| 345 const __m128i two = _mm_set1_epi32(2); | |
| 346 | |
| 347 for (int32_t i = 0; i < batch; i++) { | |
| 348 const __m128i lf_ = _mm_loadu_si128((__m128i*)(*lf_ptr + 1)); /* (*lf_ptr)[1..4] */ | |
| 349 const __m128i hf1_ = _mm_loadu_si128((__m128i*)(*hf_ptr)); /* (*hf_ptr)[0..3] */ | |
| 350 const __m128i hf2_ = _mm_loadu_si128((__m128i*)(*hf_ptr + 1)); /* (*hf_ptr)[1..4] */ | |
| 351 | |
| 352 __m128i even = _mm_add_epi32(hf1_, hf2_); /* even = lf - ((hf1 + hf2 + 2) >> 2), built up over the next lines */ | |
| 353 even = _mm_add_epi32(even, two); | |
| 354 even = _mm_srai_epi32(even, 2); | |
| 355 even = _mm_sub_epi32(lf_, even); | |
| 356 | |
| 357 next_even = _mm_extract_epi32(even, 3); /* last even value is carried into the next iteration */ | |
| 358 even_m1 = _mm_bslli_si128(even, 4); /* move every 32-bit lane up by one position */ | |
| 359 even_m1 = _mm_insert_epi32(even_m1, *prev_even, 0); /* lane 0 gets the even value carried in from before */ | |
| 360 | |
| 361 //out[0] + out[2] | |
| 362 odd = _mm_add_epi32(even_m1, even); | |
| 363 odd = _mm_srai_epi32(odd, 1); | |
| 364 odd = _mm_add_epi32(odd, hf1_); /* odd = hf1 + ((even_m1 + even) >> 1) */ | |
| 365 | |
| 366 unpack1 = _mm_unpacklo_epi32(even_m1, odd); /* interleave (even_m1, odd) pairs of lanes 0-1 */ | |
| 367 unpack2 = _mm_unpackhi_epi32(even_m1, odd); /* interleave (even_m1, odd) pairs of lanes 2-3 */ | |
| 368 | |
| 369 _mm_storeu_si128((__m128i*)(*out_ptr + 0), unpack1); | |
| 370 _mm_storeu_si128((__m128i*)(*out_ptr + 4), unpack2); | |
| 371 | |
| 372 *prev_even = next_even; | |
| 373 | |
| 374 *out_ptr += 8; /* 8 outputs written */ | |
| 375 *lf_ptr += 4; /* 4 inputs consumed from each input stream */ | |
| 376 *hf_ptr += 4; | |
| 377 } | |
| 378 return batch; | |
| 379 } | |
| 380 #endif | |
| 381 | |
| 382 #if !defined(STANDARD_SLOW_VERSION) | |
| 383 static void opj_idwt53_h_cas0(OPJ_INT32* tmp, | |
| 384 const OPJ_INT32 sn, | |
| 385 const OPJ_INT32 len, | |
| 386 OPJ_INT32* tiledp) | |
| 387 { /* Inverse 5-3 transform of one row whose first sample is on an even coordinate (called by opj_idwt53_h when cas == 0 and len > 1). Reads tiledp[0..sn-1] as in_even and tiledp[sn..] as in_odd, writes the len interleaved results to tmp and finally copies them back over tiledp. One of four implementations is compiled in: TWO_PASS_VERSION, AVX512, AVX2, or the generic single-pass code. */ | |
| 388 OPJ_INT32 i, j; | |
| 389 const OPJ_INT32* in_even = &tiledp[0]; | |
| 390 const OPJ_INT32* in_odd = &tiledp[sn]; | |
| 391 | |
| 392 #ifdef TWO_PASS_VERSION | |
| 393 /* For documentation purpose: performs lifting in two iterations, */ | |
| 394 /* but without explicit interleaving */ | |
| 395 | |
| 396 assert(len > 1); | |
| 397 | |
| 398 /* Even */ | |
| 399 tmp[0] = in_even[0] - ((in_odd[0] + 1) >> 1); /* first even output has a single odd neighbour */ | |
| 400 for (i = 2, j = 0; i <= len - 2; i += 2, j++) { | |
| 401 tmp[i] = in_even[j + 1] - ((in_odd[j] + in_odd[j + 1] + 2) >> 2); | |
| 402 } | |
| 403 if (len & 1) { /* if len is odd */ | |
| 404 tmp[len - 1] = in_even[(len - 1) / 2] - ((in_odd[(len - 2) / 2] + 1) >> 1); | |
| 405 } | |
| 406 | |
| 407 /* Odd */ | |
| 408 for (i = 1, j = 0; i < len - 1; i += 2, j++) { | |
| 409 tmp[i] = in_odd[j] + ((tmp[i - 1] + tmp[i + 1]) >> 1); | |
| 410 } | |
| 411 if (!(len & 1)) { /* if len is even */ | |
| 412 tmp[len - 1] = in_odd[(len - 1) / 2] + tmp[len - 2]; | |
| 413 } | |
| 414 #else | |
| 415 #if defined(__AVX512F__) | |
| 416 OPJ_INT32* out_ptr = tmp; | |
| 417 int32_t prev_even = in_even[0] - ((in_odd[0] + 1) >> 1); /* first even output; carried between iterations */ | |
| 418 | |
| 419 const __m512i permutevar_mask = _mm512_setr_epi32( | |
| 420 0x10, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, | |
| 421 0x0c, 0x0d, 0x0e); /* index 0x10 picks lane 0 of the second permute operand, the rest pick lanes 0..14 of the first */ | |
| 422 const __m512i store1_perm = _mm512_setr_epi64(0x00, 0x01, 0x08, 0x09, 0x02, | |
| 423 0x03, 0x0a, 0x0b); | |
| 424 const __m512i store2_perm = _mm512_setr_epi64(0x04, 0x05, 0x0c, 0x0d, 0x06, | |
| 425 0x07, 0x0e, 0x0f); | |
| 426 | |
| 427 const __m512i two = _mm512_set1_epi32(2); | |
| 428 | |
| 429 int32_t simd_batch_512 = (len - 2) / 32; /* each 512-bit iteration produces 32 outputs */ | |
| 430 int32_t leftover; | |
| 431 | |
| 432 for (i = 0; i < simd_batch_512; i++) { | |
| 433 const __m512i lf_avx2 = _mm512_loadu_si512((__m512i*)(in_even + 1)); | |
| 434 const __m512i hf1_avx2 = _mm512_loadu_si512((__m512i*)(in_odd)); | |
| 435 const __m512i hf2_avx2 = _mm512_loadu_si512((__m512i*)(in_odd + 1)); | |
| 436 int32_t next_even; | |
| 437 __m512i duplicate, even_m1, odd, unpack1, unpack2, store1, store2; | |
| 438 | |
| 439 __m512i even = _mm512_add_epi32(hf1_avx2, hf2_avx2); /* even = lf - ((hf1 + hf2 + 2) >> 2), built up over the next lines */ | |
| 440 even = _mm512_add_epi32(even, two); | |
| 441 even = _mm512_srai_epi32(even, 2); | |
| 442 even = _mm512_sub_epi32(lf_avx2, even); | |
| 443 | |
| 444 next_even = _mm_extract_epi32(_mm512_extracti32x4_epi32(even, 3), 3); /* highest lane, carried into the next iteration */ | |
| 445 | |
| 446 duplicate = _mm512_set1_epi32(prev_even); | |
| 447 even_m1 = _mm512_permutex2var_epi32(even, permutevar_mask, duplicate); /* even moved up by one lane, with prev_even in lane 0 */ | |
| 448 | |
| 449 //out[0] + out[2] | |
| 450 odd = _mm512_add_epi32(even_m1, even); | |
| 451 odd = _mm512_srai_epi32(odd, 1); | |
| 452 odd = _mm512_add_epi32(odd, hf1_avx2); /* odd = hf1 + ((even_m1 + even) >> 1) */ | |
| 453 | |
| 454 unpack1 = _mm512_unpacklo_epi32(even_m1, odd); | |
| 455 unpack2 = _mm512_unpackhi_epi32(even_m1, odd); | |
| 456 | |
| 457 store1 = _mm512_permutex2var_epi64(unpack1, store1_perm, unpack2); /* reorder the unpacked pairs for the first 16 outputs */ | |
| 458 store2 = _mm512_permutex2var_epi64(unpack1, store2_perm, unpack2); /* reorder the unpacked pairs for the next 16 outputs */ | |
| 459 | |
| 460 _mm512_storeu_si512(out_ptr, store1); | |
| 461 _mm512_storeu_si512(out_ptr + 16, store2); | |
| 462 | |
| 463 prev_even = next_even; | |
| 464 | |
| 465 out_ptr += 32; | |
| 466 in_even += 16; | |
| 467 in_odd += 16; | |
| 468 } | |
| 469 | |
| 470 leftover = len - simd_batch_512 * 32; /* outputs not yet produced by the 512-bit loop */ | |
| 471 if (leftover > 8) { | |
| 472 leftover -= 8 * loop_short_sse(leftover, &in_even, &in_odd, &out_ptr, | |
| 473 &prev_even); /* 128-bit helper; 8 outputs per executed iteration */ | |
| 474 } | |
| 475 out_ptr[0] = prev_even; | |
| 476 | |
| 477 for (j = 1; j < (leftover - 2); j += 2) { /* scalar tail, one even/odd pair per iteration */ | |
| 478 out_ptr[2] = in_even[1] - ((in_odd[0] + (in_odd[1]) + 2) >> 2); | |
| 479 out_ptr[1] = in_odd[0] + ((out_ptr[0] + out_ptr[2]) >> 1); | |
| 480 in_even++; | |
| 481 in_odd++; | |
| 482 out_ptr += 2; | |
| 483 } | |
| 484 | |
| 485 if (len & 1) { | |
| 486 out_ptr[2] = in_even[1] - ((in_odd[0] + 1) >> 1); /* last even output has a single odd neighbour */ | |
| 487 out_ptr[1] = in_odd[0] + ((out_ptr[0] + out_ptr[2]) >> 1); | |
| 488 } else { //!(len & 1) | |
| 489 out_ptr[1] = in_odd[0] + out_ptr[0]; | |
| 490 } | |
| 491 #elif defined(__AVX2__) | |
| 492 OPJ_INT32* out_ptr = tmp; | |
| 493 int32_t prev_even = in_even[0] - ((in_odd[0] + 1) >> 1); /* first even output; carried between iterations */ | |
| 494 | |
| 495 const __m256i reg_permutevar_mask_move_right = _mm256_setr_epi32(0x00, 0x00, | |
| 496 0x01, 0x02, 0x03, 0x04, 0x05, 0x06); | |
| 497 const __m256i two = _mm256_set1_epi32(2); | |
| 498 | |
| 499 int32_t simd_batch = (len - 2) / 16; /* each 256-bit iteration produces 16 outputs */ | |
| 500 int32_t next_even; | |
| 501 __m256i even_m1, odd, unpack1_avx2, unpack2_avx2; | |
| 502 | |
| 503 for (i = 0; i < simd_batch; i++) { | |
| 504 const __m256i lf_avx2 = _mm256_loadu_si256((__m256i*)(in_even + 1)); | |
| 505 const __m256i hf1_avx2 = _mm256_loadu_si256((__m256i*)(in_odd)); | |
| 506 const __m256i hf2_avx2 = _mm256_loadu_si256((__m256i*)(in_odd + 1)); | |
| 507 | |
| 508 __m256i even = _mm256_add_epi32(hf1_avx2, hf2_avx2); /* even = lf - ((hf1 + hf2 + 2) >> 2), built up over the next lines */ | |
| 509 even = _mm256_add_epi32(even, two); | |
| 510 even = _mm256_srai_epi32(even, 2); | |
| 511 even = _mm256_sub_epi32(lf_avx2, even); | |
| 512 | |
| 513 next_even = _mm_extract_epi32(_mm256_extracti128_si256(even, 1), 3); /* highest lane, carried into the next iteration */ | |
| 514 even_m1 = _mm256_permutevar8x32_epi32(even, reg_permutevar_mask_move_right); /* even moved up by one lane */ | |
| 515 even_m1 = _mm256_blend_epi32(even_m1, _mm256_set1_epi32(prev_even), (1 << 0)); /* lane 0 replaced by prev_even */ | |
| 516 | |
| 517 //out[0] + out[2] | |
| 518 odd = _mm256_add_epi32(even_m1, even); | |
| 519 odd = _mm256_srai_epi32(odd, 1); | |
| 520 odd = _mm256_add_epi32(odd, hf1_avx2); /* odd = hf1 + ((even_m1 + even) >> 1) */ | |
| 521 | |
| 522 unpack1_avx2 = _mm256_unpacklo_epi32(even_m1, odd); | |
| 523 unpack2_avx2 = _mm256_unpackhi_epi32(even_m1, odd); | |
| 524 | |
| 525 _mm_storeu_si128((__m128i*)(out_ptr + 0), _mm256_castsi256_si128(unpack1_avx2)); /* four 128-bit stores write the 16 outputs in order */ | |
| 526 _mm_storeu_si128((__m128i*)(out_ptr + 4), _mm256_castsi256_si128(unpack2_avx2)); | |
| 527 _mm_storeu_si128((__m128i*)(out_ptr + 8), _mm256_extracti128_si256(unpack1_avx2, | |
| 528 0x1)); | |
| 529 _mm_storeu_si128((__m128i*)(out_ptr + 12), | |
| 530 _mm256_extracti128_si256(unpack2_avx2, 0x1)); | |
| 531 | |
| 532 prev_even = next_even; | |
| 533 | |
| 534 out_ptr += 16; | |
| 535 in_even += 8; | |
| 536 in_odd += 8; | |
| 537 } | |
| 538 out_ptr[0] = prev_even; | |
| 539 for (j = simd_batch * 16 + 1; j < (len - 2); j += 2) { /* scalar tail, one even/odd pair per iteration */ | |
| 540 out_ptr[2] = in_even[1] - ((in_odd[0] + in_odd[1] + 2) >> 2); | |
| 541 out_ptr[1] = in_odd[0] + ((out_ptr[0] + out_ptr[2]) >> 1); | |
| 542 in_even++; | |
| 543 in_odd++; | |
| 544 out_ptr += 2; | |
| 545 } | |
| 546 | |
| 547 if (len & 1) { | |
| 548 out_ptr[2] = in_even[1] - ((in_odd[0] + 1) >> 1); /* last even output has a single odd neighbour */ | |
| 549 out_ptr[1] = in_odd[0] + ((out_ptr[0] + out_ptr[2]) >> 1); | |
| 550 } else { //!(len & 1) | |
| 551 out_ptr[1] = in_odd[0] + out_ptr[0]; | |
| 552 } | |
| 553 #else | |
| 554 OPJ_INT32 d1c, d1n, s1n, s0c, s0n; /* suffix c = value of the current step, n = value of the next step */ | |
| 555 | |
| 556 assert(len > 1); | |
| 557 | |
| 558 /* Improved version of the TWO_PASS_VERSION: */ | |
| 559 /* Performs lifting in one single iteration. Saves memory */ | |
| 560 /* accesses and explicit interleaving. */ | |
| 561 s1n = in_even[0]; | |
| 562 d1n = in_odd[0]; | |
| 563 s0n = s1n - ((d1n + 1) >> 1); /* first even output has a single odd neighbour */ | |
| 564 | |
| 565 for (i = 0, j = 1; i < (len - 3); i += 2, j++) { | |
| 566 d1c = d1n; | |
| 567 s0c = s0n; | |
| 568 | |
| 569 s1n = in_even[j]; | |
| 570 d1n = in_odd[j]; | |
| 571 | |
| 572 s0n = s1n - ((d1c + d1n + 2) >> 2); | |
| 573 | |
| 574 tmp[i ] = s0c; | |
| 575 tmp[i + 1] = opj_int_add_no_overflow(d1c, opj_int_add_no_overflow(s0c, | |
| 576 s0n) >> 1); /* d1c + ((s0c + s0n) >> 1), with both additions routed through opj_int_add_no_overflow */ | |
| 577 } | |
| 578 | |
| 579 tmp[i] = s0n; | |
| 580 | |
| 581 if (len & 1) { | |
| 582 tmp[len - 1] = in_even[(len - 1) / 2] - ((d1n + 1) >> 1); | |
| 583 tmp[len - 2] = d1n + ((s0n + tmp[len - 1]) >> 1); | |
| 584 } else { | |
| 585 tmp[len - 1] = d1n + s0n; | |
| 586 } | |
| 587 #endif /*(__AVX512F__ || __AVX2__)*/ | |
| 588 #endif /*TWO_PASS_VERSION*/ | |
| 589 memcpy(tiledp, tmp, (OPJ_UINT32)len * sizeof(OPJ_INT32)); /* copy the reconstructed row back over the input */ | |
| 590 } | |
| 591 | |
| 592 static void opj_idwt53_h_cas1(OPJ_INT32* tmp, | |
| 593 const OPJ_INT32 sn, | |
| 594 const OPJ_INT32 len, | |
| 595 OPJ_INT32* tiledp) | |
| 596 { /* Inverse 5-3 transform of one row whose first sample is on an odd coordinate (called by opj_idwt53_h when cas != 0 and len > 2). Reads tiledp[sn..] as in_even and tiledp[0..] as in_odd, writes the len interleaved results to tmp and finally copies them back over tiledp. */ | |
| 597 OPJ_INT32 i, j; | |
| 598 const OPJ_INT32* in_even = &tiledp[sn]; | |
| 599 const OPJ_INT32* in_odd = &tiledp[0]; | |
| 600 | |
| 601 #ifdef TWO_PASS_VERSION | |
| 602 /* For documentation purpose: performs lifting in two iterations, */ | |
| 603 /* but without explicit interleaving */ | |
| 604 | |
| 605 assert(len > 2); | |
| 606 | |
| 607 /* Odd */ | |
| 608 for (i = 1, j = 0; i < len - 1; i += 2, j++) { | |
| 609 tmp[i] = in_odd[j] - ((in_even[j] + in_even[j + 1] + 2) >> 2); | |
| 610 } | |
| 611 if (!(len & 1)) { | |
| 612 tmp[len - 1] = in_odd[len / 2 - 1] - ((in_even[len / 2 - 1] + 1) >> 1); /* last output has a single neighbour */ | |
| 613 } | |
| 614 | |
| 615 /* Even */ | |
| 616 tmp[0] = in_even[0] + tmp[1]; | |
| 617 for (i = 2, j = 1; i < len - 1; i += 2, j++) { | |
| 618 tmp[i] = in_even[j] + ((tmp[i + 1] + tmp[i - 1]) >> 1); | |
| 619 } | |
| 620 if (len & 1) { | |
| 621 tmp[len - 1] = in_even[len / 2] + tmp[len - 2]; | |
| 622 } | |
| 623 #else | |
| 624 OPJ_INT32 s1, s2, dc, dn; /* s1/s2: consecutive in_even samples; dc/dn: current/next value computed from in_odd */ | |
| 625 | |
| 626 assert(len > 2); | |
| 627 | |
| 628 /* Improved version of the TWO_PASS_VERSION: */ | |
| 629 /* Performs lifting in one single iteration. Saves memory */ | |
| 630 /* accesses and explicit interleaving. */ | |
| 631 | |
| 632 s1 = in_even[1]; | |
| 633 dc = in_odd[0] - ((in_even[0] + s1 + 2) >> 2); | |
| 634 tmp[0] = in_even[0] + dc; | |
| 635 | |
| 636 for (i = 1, j = 1; i < (len - 2 - !(len & 1)); i += 2, j++) { | |
| 637 | |
| 638 s2 = in_even[j + 1]; | |
| 639 | |
| 640 dn = in_odd[j] - ((s1 + s2 + 2) >> 2); | |
| 641 tmp[i ] = dc; | |
| 642 tmp[i + 1] = opj_int_add_no_overflow(s1, opj_int_add_no_overflow(dn, dc) >> 1); /* s1 + ((dn + dc) >> 1), with both additions routed through opj_int_add_no_overflow */ | |
| 643 | |
| 644 dc = dn; | |
| 645 s1 = s2; | |
| 646 } | |
| 647 | |
| 648 tmp[i] = dc; | |
| 649 | |
| 650 if (!(len & 1)) { | |
| 651 dn = in_odd[len / 2 - 1] - ((s1 + 1) >> 1); /* last output has a single neighbour */ | |
| 652 tmp[len - 2] = s1 + ((dn + dc) >> 1); | |
| 653 tmp[len - 1] = dn; | |
| 654 } else { | |
| 655 tmp[len - 1] = s1 + dc; | |
| 656 } | |
| 657 #endif | |
| 658 memcpy(tiledp, tmp, (OPJ_UINT32)len * sizeof(OPJ_INT32)); /* copy the reconstructed row back over the input */ | |
| 659 } | |
| 660 | |
| 661 | |
| 662 #endif /* !defined(STANDARD_SLOW_VERSION) */ | |
| 663 | |
| 664 /* <summary> */ | |
| 665 /* Inverse 5-3 wavelet transform in 1-D for one row. */ | |
| 666 /* </summary> */ | |
| 667 /* Performs interleave, inverse wavelet transform and copy back to buffer. tiledp holds the row and receives the result; dwt supplies sn, dn, cas and the scratch buffer dwt->mem. Rows of length 1 or 2 are handled inline, longer rows by opj_idwt53_h_cas0 / opj_idwt53_h_cas1. */ | |
| 668 static void opj_idwt53_h(const opj_dwt_t *dwt, | |
| 669 OPJ_INT32* tiledp) | |
| 670 { | |
| 671 #ifdef STANDARD_SLOW_VERSION | |
| 672 /* For documentation purpose */ | |
| 673 opj_dwt_interleave_h(dwt, tiledp); | |
| 674 opj_dwt_decode_1(dwt); | |
| 675 memcpy(tiledp, dwt->mem, (OPJ_UINT32)(dwt->sn + dwt->dn) * sizeof(OPJ_INT32)); | |
| 676 #else | |
| 677 const OPJ_INT32 sn = dwt->sn; | |
| 678 const OPJ_INT32 len = sn + dwt->dn; /* total number of samples in the row */ | |
| 679 if (dwt->cas == 0) { /* Left-most sample is on even coordinate */ | |
| 680 if (len > 1) { | |
| 681 opj_idwt53_h_cas0(dwt->mem, sn, len, tiledp); | |
| 682 } else { | |
| 683 /* Unmodified value */ | |
| 684 } | |
| 685 } else { /* Left-most sample is on odd coordinate */ | |
| 686 if (len == 1) { | |
| 687 tiledp[0] /= 2; /* single sample: halved in place */ | |
| 688 } else if (len == 2) { | |
| 689 OPJ_INT32* out = dwt->mem; | |
| 690 const OPJ_INT32* in_even = &tiledp[sn]; | |
| 691 const OPJ_INT32* in_odd = &tiledp[0]; | |
| 692 out[1] = in_odd[0] - ((in_even[0] + 1) >> 1); /* two-sample row: both lifting steps written out directly */ | |
| 693 out[0] = in_even[0] + out[1]; | |
| 694 memcpy(tiledp, dwt->mem, (OPJ_UINT32)len * sizeof(OPJ_INT32)); | |
| 695 } else if (len > 2) { | |
| 696 opj_idwt53_h_cas1(dwt->mem, sn, len, tiledp); | |
| 697 } | |
| 698 } | |
| 699 #endif | |
| 700 } | |
| 701 | |
| 702 #if (defined(__SSE2__) || defined(__AVX2__) || defined(__AVX512F__)) && !defined(STANDARD_SLOW_VERSION) | |
| 703 | |
| 704 /* Convenience macros to improve the readability of the formulas */ | |
| 705 #if defined(__AVX512F__) | |
| 706 #define VREG __m512i /* vector register type holding VREG_INT_COUNT int32 values */ | |
| 707 #define LOAD_CST(x) _mm512_set1_epi32(x) /* broadcast the int32 x to every lane */ | |
| 708 #define LOAD(x) _mm512_loadu_si512((const VREG*)(x)) /* in this branch LOAD maps to the unaligned load, same as LOADU */ | |
| 709 #define LOADU(x) _mm512_loadu_si512((const VREG*)(x)) | |
| 710 #define STORE(x,y) _mm512_storeu_si512((VREG*)(x),(y)) /* in this branch STORE maps to the unaligned store, same as STOREU */ | |
| 711 #define STOREU(x,y) _mm512_storeu_si512((VREG*)(x),(y)) | |
| 712 #define ADD(x,y) _mm512_add_epi32((x),(y)) /* lane-wise 32-bit add */ | |
| 713 #define SUB(x,y) _mm512_sub_epi32((x),(y)) /* lane-wise 32-bit subtract */ | |
| 714 #define SAR(x,y) _mm512_srai_epi32((x),(y)) /* lane-wise arithmetic shift right by y bits */ | |
| 715 #elif defined(__AVX2__) | |
| 716 #define VREG __m256i | |
| 717 #define LOAD_CST(x) _mm256_set1_epi32(x) | |
| 718 #define LOAD(x) _mm256_load_si256((const VREG*)(x)) /* aligned load */ | |
| 719 #define LOADU(x) _mm256_loadu_si256((const VREG*)(x)) /* unaligned load */ | |
| 720 #define STORE(x,y) _mm256_store_si256((VREG*)(x),(y)) /* aligned store */ | |
| 721 #define STOREU(x,y) _mm256_storeu_si256((VREG*)(x),(y)) /* unaligned store */ | |
| 722 #define ADD(x,y) _mm256_add_epi32((x),(y)) | |
| 723 #define SUB(x,y) _mm256_sub_epi32((x),(y)) | |
| 724 #define SAR(x,y) _mm256_srai_epi32((x),(y)) | |
| 725 #else | |
| 726 #define VREG __m128i | |
| 727 #define LOAD_CST(x) _mm_set1_epi32(x) | |
| 728 #define LOAD(x) _mm_load_si128((const VREG*)(x)) /* aligned load */ | |
| 729 #define LOADU(x) _mm_loadu_si128((const VREG*)(x)) /* unaligned load */ | |
| 730 #define STORE(x,y) _mm_store_si128((VREG*)(x),(y)) /* aligned store */ | |
| 731 #define STOREU(x,y) _mm_storeu_si128((VREG*)(x),(y)) /* unaligned store */ | |
| 732 #define ADD(x,y) _mm_add_epi32((x),(y)) | |
| 733 #define SUB(x,y) _mm_sub_epi32((x),(y)) | |
| 734 #define SAR(x,y) _mm_srai_epi32((x),(y)) | |
| 735 #endif | |
| 736 #define ADD3(x,y,z) ADD(ADD(x,y),z) /* three-operand add built from two ADDs */ | |
| 737 | |
| 738 static | |
| 739 void opj_idwt53_v_final_memcpy(OPJ_INT32* tiledp_col, | |
| 740 const OPJ_INT32* tmp, | |
| 741 OPJ_INT32 len, | |
| 742 OPJ_SIZE_T stride) | |
| 743 { /* Copies len rows of PARALLEL_COLS_53 values from tmp (row i starts at tmp[PARALLEL_COLS_53 * i]) to tiledp_col (row i starts at tiledp_col[i * stride]), two vector registers per row. */ | |
| 744 OPJ_INT32 i; | |
| 745 for (i = 0; i < len; ++i) { | |
| 746 /* A memcpy(&tiledp_col[i * stride + 0], | |
| 747 &tmp[PARALLEL_COLS_53 * i + 0], | |
| 748 PARALLEL_COLS_53 * sizeof(OPJ_INT32)) | |
| 749 would do but would be a tiny bit slower. | |
| 750 We can take here advantage of our knowledge of alignment */ | |
| 751 STOREU(&tiledp_col[(OPJ_SIZE_T)i * stride + 0], | |
| 752 LOAD(&tmp[PARALLEL_COLS_53 * i + 0])); /* first half of the row: LOAD from tmp, STOREU to the destination */ | |
| 753 STOREU(&tiledp_col[(OPJ_SIZE_T)i * stride + VREG_INT_COUNT], | |
| 754 LOAD(&tmp[PARALLEL_COLS_53 * i + VREG_INT_COUNT])); /* second half of the row */ | |
| 755 } | |
| 756 } | |
| 757 | |
| 758 /** Vertical inverse 5x3 wavelet transform for 8 columns in SSE2, | |
| 759 * 16 in AVX2, or 32 in AVX512, when top-most pixel is on even coordinate */ | |
| 760 static void opj_idwt53_v_cas0_mcols_SSE2_OR_AVX2( | |
| 761 OPJ_INT32* tmp, | |
| 762 const OPJ_INT32 sn, | |
| 763 const OPJ_INT32 len, | |
| 764 OPJ_INT32* tiledp_col, | |
| 765 const OPJ_SIZE_T stride) | |
| 766 { | |
| 767 const OPJ_INT32* in_even = &tiledp_col[0]; | |
| 768 const OPJ_INT32* in_odd = &tiledp_col[(OPJ_SIZE_T)sn * stride]; | |
| 769 | |
| 770 OPJ_INT32 i; | |
| 771 OPJ_SIZE_T j; | |
| 772 VREG d1c_0, d1n_0, s1n_0, s0c_0, s0n_0; | |
| 773 VREG d1c_1, d1n_1, s1n_1, s0c_1, s0n_1; | |
| 774 const VREG two = LOAD_CST(2); | |
| 775 | |
| 776 assert(len > 1); | |
| 777 #if defined(__AVX512F__) | |
| 778 assert(PARALLEL_COLS_53 == 32); | |
| 779 assert(VREG_INT_COUNT == 16); | |
| 780 #elif defined(__AVX2__) | |
| 781 assert(PARALLEL_COLS_53 == 16); | |
| 782 assert(VREG_INT_COUNT == 8); | |
| 783 #else | |
| 784 assert(PARALLEL_COLS_53 == 8); | |
| 785 assert(VREG_INT_COUNT == 4); | |
| 786 #endif | |
| 787 | |
| 788 //For AVX512 code aligned load/store is set to it's unaligned equivalents | |
| 789 #if !defined(__AVX512F__) | |
| 790 /* Note: loads of input even/odd values must be done in a unaligned */ | |
| 791 /* fashion. But stores in tmp can be done with aligned store, since */ | |
| 792 /* the temporary buffer is properly aligned */ | |
| 793 assert((OPJ_SIZE_T)tmp % (sizeof(OPJ_INT32) * VREG_INT_COUNT) == 0); | |
| 794 #endif | |
| 795 | |
| 796 s1n_0 = LOADU(in_even + 0); | |
| 797 s1n_1 = LOADU(in_even + VREG_INT_COUNT); | |
| 798 d1n_0 = LOADU(in_odd); | |
| 799 d1n_1 = LOADU(in_odd + VREG_INT_COUNT); | |
| 800 | |
| 801 /* s0n = s1n - ((d1n + 1) >> 1); <==> */ | |
| 802 /* s0n = s1n - ((d1n + d1n + 2) >> 2); */ | |
| 803 s0n_0 = SUB(s1n_0, SAR(ADD3(d1n_0, d1n_0, two), 2)); | |
| 804 s0n_1 = SUB(s1n_1, SAR(ADD3(d1n_1, d1n_1, two), 2)); | |
| 805 | |
| 806 for (i = 0, j = 1; i < (len - 3); i += 2, j++) { | |
| 807 d1c_0 = d1n_0; | |
| 808 s0c_0 = s0n_0; | |
| 809 d1c_1 = d1n_1; | |
| 810 s0c_1 = s0n_1; | |
| 811 | |
| 812 s1n_0 = LOADU(in_even + j * stride); | |
| 813 s1n_1 = LOADU(in_even + j * stride + VREG_INT_COUNT); | |
| 814 d1n_0 = LOADU(in_odd + j * stride); | |
| 815 d1n_1 = LOADU(in_odd + j * stride + VREG_INT_COUNT); | |
| 816 | |
| 817 /*s0n = s1n - ((d1c + d1n + 2) >> 2);*/ | |
| 818 s0n_0 = SUB(s1n_0, SAR(ADD3(d1c_0, d1n_0, two), 2)); | |
| 819 s0n_1 = SUB(s1n_1, SAR(ADD3(d1c_1, d1n_1, two), 2)); | |
| 820 | |
| 821 STORE(tmp + PARALLEL_COLS_53 * (i + 0), s0c_0); | |
| 822 STORE(tmp + PARALLEL_COLS_53 * (i + 0) + VREG_INT_COUNT, s0c_1); | |
| 823 | |
| 824 /* d1c + ((s0c + s0n) >> 1) */ | |
| 825 STORE(tmp + PARALLEL_COLS_53 * (i + 1) + 0, | |
| 826 ADD(d1c_0, SAR(ADD(s0c_0, s0n_0), 1))); | |
| 827 STORE(tmp + PARALLEL_COLS_53 * (i + 1) + VREG_INT_COUNT, | |
| 828 ADD(d1c_1, SAR(ADD(s0c_1, s0n_1), 1))); | |
| 829 } | |
| 830 | |
| 831 STORE(tmp + PARALLEL_COLS_53 * (i + 0) + 0, s0n_0); | |
| 832 STORE(tmp + PARALLEL_COLS_53 * (i + 0) + VREG_INT_COUNT, s0n_1); | |
| 833 | |
| 834 if (len & 1) { | |
| 835 VREG tmp_len_minus_1; | |
| 836 s1n_0 = LOADU(in_even + (OPJ_SIZE_T)((len - 1) / 2) * stride); | |
| 837 /* tmp_len_minus_1 = s1n - ((d1n + 1) >> 1); */ | |
| 838 tmp_len_minus_1 = SUB(s1n_0, SAR(ADD3(d1n_0, d1n_0, two), 2)); | |
| 839 STORE(tmp + PARALLEL_COLS_53 * (len - 1), tmp_len_minus_1); | |
| 840 /* d1n + ((s0n + tmp_len_minus_1) >> 1) */ | |
| 841 STORE(tmp + PARALLEL_COLS_53 * (len - 2), | |
| 842 ADD(d1n_0, SAR(ADD(s0n_0, tmp_len_minus_1), 1))); | |
| 843 | |
| 844 s1n_1 = LOADU(in_even + (OPJ_SIZE_T)((len - 1) / 2) * stride + VREG_INT_COUNT); | |
| 845 /* tmp_len_minus_1 = s1n - ((d1n + 1) >> 1); */ | |
| 846 tmp_len_minus_1 = SUB(s1n_1, SAR(ADD3(d1n_1, d1n_1, two), 2)); | |
| 847 STORE(tmp + PARALLEL_COLS_53 * (len - 1) + VREG_INT_COUNT, | |
| 848 tmp_len_minus_1); | |
| 849 /* d1n + ((s0n + tmp_len_minus_1) >> 1) */ | |
| 850 STORE(tmp + PARALLEL_COLS_53 * (len - 2) + VREG_INT_COUNT, | |
| 851 ADD(d1n_1, SAR(ADD(s0n_1, tmp_len_minus_1), 1))); | |
| 852 | |
| 853 | |
| 854 } else { | |
| 855 STORE(tmp + PARALLEL_COLS_53 * (len - 1) + 0, | |
| 856 ADD(d1n_0, s0n_0)); | |
| 857 STORE(tmp + PARALLEL_COLS_53 * (len - 1) + VREG_INT_COUNT, | |
| 858 ADD(d1n_1, s0n_1)); | |
| 859 } | |
| 860 | |
| 861 opj_idwt53_v_final_memcpy(tiledp_col, tmp, len, stride); | |
| 862 } | |
| 863 | |
| 864 | |
| 865 /** Vertical inverse 5x3 wavelet transform for 8 columns in SSE2, or | |
| 866 * 16 in AVX2, when top-most pixel is on odd coordinate */ | |
| 867 static void opj_idwt53_v_cas1_mcols_SSE2_OR_AVX2( | |
| 868 OPJ_INT32* tmp, | |
| 869 const OPJ_INT32 sn, | |
| 870 const OPJ_INT32 len, | |
| 871 OPJ_INT32* tiledp_col, | |
| 872 const OPJ_SIZE_T stride) | |
| 873 { | |
| 874 OPJ_INT32 i; | |
| 875 OPJ_SIZE_T j; | |
| 876 | |
| 877 VREG s1_0, s2_0, dc_0, dn_0; | |
| 878 VREG s1_1, s2_1, dc_1, dn_1; | |
| 879 const VREG two = LOAD_CST(2); | |
| 880 | |
| 881 const OPJ_INT32* in_even = &tiledp_col[(OPJ_SIZE_T)sn * stride]; | |
| 882 const OPJ_INT32* in_odd = &tiledp_col[0]; | |
| 883 | |
| 884 assert(len > 2); | |
| 885 #if defined(__AVX512F__) | |
| 886 assert(PARALLEL_COLS_53 == 32); | |
| 887 assert(VREG_INT_COUNT == 16); | |
| 888 #elif defined(__AVX2__) | |
| 889 assert(PARALLEL_COLS_53 == 16); | |
| 890 assert(VREG_INT_COUNT == 8); | |
| 891 #else | |
| 892 assert(PARALLEL_COLS_53 == 8); | |
| 893 assert(VREG_INT_COUNT == 4); | |
| 894 #endif | |
| 895 | |
| 896 //For AVX512 code aligned load/store is set to it's unaligned equivalents | |
| 897 #if !defined(__AVX512F__) | |
| 898 /* Note: loads of input even/odd values must be done in a unaligned */ | |
| 899 /* fashion. But stores in tmp can be done with aligned store, since */ | |
| 900 /* the temporary buffer is properly aligned */ | |
| 901 assert((OPJ_SIZE_T)tmp % (sizeof(OPJ_INT32) * VREG_INT_COUNT) == 0); | |
| 902 #endif | |
| 903 | |
| 904 s1_0 = LOADU(in_even + stride); | |
| 905 /* in_odd[0] - ((in_even[0] + s1 + 2) >> 2); */ | |
| 906 dc_0 = SUB(LOADU(in_odd + 0), | |
| 907 SAR(ADD3(LOADU(in_even + 0), s1_0, two), 2)); | |
| 908 STORE(tmp + PARALLEL_COLS_53 * 0, ADD(LOADU(in_even + 0), dc_0)); | |
| 909 | |
| 910 s1_1 = LOADU(in_even + stride + VREG_INT_COUNT); | |
| 911 /* in_odd[0] - ((in_even[0] + s1 + 2) >> 2); */ | |
| 912 dc_1 = SUB(LOADU(in_odd + VREG_INT_COUNT), | |
| 913 SAR(ADD3(LOADU(in_even + VREG_INT_COUNT), s1_1, two), 2)); | |
| 914 STORE(tmp + PARALLEL_COLS_53 * 0 + VREG_INT_COUNT, | |
| 915 ADD(LOADU(in_even + VREG_INT_COUNT), dc_1)); | |
| 916 | |
| 917 for (i = 1, j = 1; i < (len - 2 - !(len & 1)); i += 2, j++) { | |
| 918 | |
| 919 s2_0 = LOADU(in_even + (j + 1) * stride); | |
| 920 s2_1 = LOADU(in_even + (j + 1) * stride + VREG_INT_COUNT); | |
| 921 | |
| 922 /* dn = in_odd[j * stride] - ((s1 + s2 + 2) >> 2); */ | |
| 923 dn_0 = SUB(LOADU(in_odd + j * stride), | |
| 924 SAR(ADD3(s1_0, s2_0, two), 2)); | |
| 925 dn_1 = SUB(LOADU(in_odd + j * stride + VREG_INT_COUNT), | |
| 926 SAR(ADD3(s1_1, s2_1, two), 2)); | |
| 927 | |
| 928 STORE(tmp + PARALLEL_COLS_53 * i, dc_0); | |
| 929 STORE(tmp + PARALLEL_COLS_53 * i + VREG_INT_COUNT, dc_1); | |
| 930 | |
| 931 /* tmp[i + 1] = s1 + ((dn + dc) >> 1); */ | |
| 932 STORE(tmp + PARALLEL_COLS_53 * (i + 1) + 0, | |
| 933 ADD(s1_0, SAR(ADD(dn_0, dc_0), 1))); | |
| 934 STORE(tmp + PARALLEL_COLS_53 * (i + 1) + VREG_INT_COUNT, | |
| 935 ADD(s1_1, SAR(ADD(dn_1, dc_1), 1))); | |
| 936 | |
| 937 dc_0 = dn_0; | |
| 938 s1_0 = s2_0; | |
| 939 dc_1 = dn_1; | |
| 940 s1_1 = s2_1; | |
| 941 } | |
| 942 STORE(tmp + PARALLEL_COLS_53 * i, dc_0); | |
| 943 STORE(tmp + PARALLEL_COLS_53 * i + VREG_INT_COUNT, dc_1); | |
| 944 | |
| 945 if (!(len & 1)) { | |
| 946 /*dn = in_odd[(len / 2 - 1) * stride] - ((s1 + 1) >> 1); */ | |
| 947 dn_0 = SUB(LOADU(in_odd + (OPJ_SIZE_T)(len / 2 - 1) * stride), | |
| 948 SAR(ADD3(s1_0, s1_0, two), 2)); | |
| 949 dn_1 = SUB(LOADU(in_odd + (OPJ_SIZE_T)(len / 2 - 1) * stride + VREG_INT_COUNT), | |
| 950 SAR(ADD3(s1_1, s1_1, two), 2)); | |
| 951 | |
| 952 /* tmp[len - 2] = s1 + ((dn + dc) >> 1); */ | |
| 953 STORE(tmp + PARALLEL_COLS_53 * (len - 2) + 0, | |
| 954 ADD(s1_0, SAR(ADD(dn_0, dc_0), 1))); | |
| 955 STORE(tmp + PARALLEL_COLS_53 * (len - 2) + VREG_INT_COUNT, | |
| 956 ADD(s1_1, SAR(ADD(dn_1, dc_1), 1))); | |
| 957 | |
| 958 STORE(tmp + PARALLEL_COLS_53 * (len - 1) + 0, dn_0); | |
| 959 STORE(tmp + PARALLEL_COLS_53 * (len - 1) + VREG_INT_COUNT, dn_1); | |
| 960 } else { | |
| 961 STORE(tmp + PARALLEL_COLS_53 * (len - 1) + 0, ADD(s1_0, dc_0)); | |
| 962 STORE(tmp + PARALLEL_COLS_53 * (len - 1) + VREG_INT_COUNT, | |
| 963 ADD(s1_1, dc_1)); | |
| 964 } | |
| 965 | |
| 966 opj_idwt53_v_final_memcpy(tiledp_col, tmp, len, stride); | |
| 967 } | |
| 968 | |
| 969 #undef VREG | |
| 970 #undef LOAD_CST | |
| 971 #undef LOADU | |
| 972 #undef LOAD | |
| 973 #undef STORE | |
| 974 #undef STOREU | |
| 975 #undef ADD | |
| 976 #undef ADD3 | |
| 977 #undef SUB | |
| 978 #undef SAR | |
| 979 | |
| 980 #endif /* (defined(__SSE2__) || defined(__AVX2__)) && !defined(STANDARD_SLOW_VERSION) */ | |
| 981 | |
| 982 #if !defined(STANDARD_SLOW_VERSION) | |
| 983 /** Vertical inverse 5x3 wavelet transform for one column, when top-most | |
| 984 * pixel is on even coordinate */ | |
| 985 static void opj_idwt3_v_cas0(OPJ_INT32* tmp, | |
| 986 const OPJ_INT32 sn, | |
| 987 const OPJ_INT32 len, | |
| 988 OPJ_INT32* tiledp_col, | |
| 989 const OPJ_SIZE_T stride) | |
| 990 { | |
| 991 OPJ_INT32 i, j; | |
| 992 OPJ_INT32 d1c, d1n, s1n, s0c, s0n; | |
| 993 | |
| 994 assert(len > 1); | |
| 995 | |
| 996 /* Performs lifting in one single iteration. Saves memory */ | |
| 997 /* accesses and explicit interleaving. */ | |
| 998 | |
| 999 s1n = tiledp_col[0]; | |
| 1000 d1n = tiledp_col[(OPJ_SIZE_T)sn * stride]; | |
| 1001 s0n = s1n - ((d1n + 1) >> 1); | |
| 1002 | |
| 1003 for (i = 0, j = 0; i < (len - 3); i += 2, j++) { | |
| 1004 d1c = d1n; | |
| 1005 s0c = s0n; | |
| 1006 | |
| 1007 s1n = tiledp_col[(OPJ_SIZE_T)(j + 1) * stride]; | |
| 1008 d1n = tiledp_col[(OPJ_SIZE_T)(sn + j + 1) * stride]; | |
| 1009 | |
| 1010 s0n = opj_int_sub_no_overflow(s1n, | |
| 1011 opj_int_add_no_overflow(opj_int_add_no_overflow(d1c, d1n), 2) >> 2); | |
| 1012 | |
| 1013 tmp[i ] = s0c; | |
| 1014 tmp[i + 1] = opj_int_add_no_overflow(d1c, opj_int_add_no_overflow(s0c, | |
| 1015 s0n) >> 1); | |
| 1016 } | |
| 1017 | |
| 1018 tmp[i] = s0n; | |
| 1019 | |
| 1020 if (len & 1) { | |
| 1021 tmp[len - 1] = | |
| 1022 tiledp_col[(OPJ_SIZE_T)((len - 1) / 2) * stride] - | |
| 1023 ((d1n + 1) >> 1); | |
| 1024 tmp[len - 2] = d1n + ((s0n + tmp[len - 1]) >> 1); | |
| 1025 } else { | |
| 1026 tmp[len - 1] = d1n + s0n; | |
| 1027 } | |
| 1028 | |
| 1029 for (i = 0; i < len; ++i) { | |
| 1030 tiledp_col[(OPJ_SIZE_T)i * stride] = tmp[i]; | |
| 1031 } | |
| 1032 } | |
| 1033 | |
| 1034 | |
| 1035 /** Vertical inverse 5x3 wavelet transform for one column, when top-most | |
| 1036 * pixel is on odd coordinate */ | |
| 1037 static void opj_idwt3_v_cas1(OPJ_INT32* tmp, | |
| 1038 const OPJ_INT32 sn, | |
| 1039 const OPJ_INT32 len, | |
| 1040 OPJ_INT32* tiledp_col, | |
| 1041 const OPJ_SIZE_T stride) | |
| 1042 { | |
| 1043 OPJ_INT32 i, j; | |
| 1044 OPJ_INT32 s1, s2, dc, dn; | |
| 1045 const OPJ_INT32* in_even = &tiledp_col[(OPJ_SIZE_T)sn * stride]; | |
| 1046 const OPJ_INT32* in_odd = &tiledp_col[0]; | |
| 1047 | |
| 1048 assert(len > 2); | |
| 1049 | |
| 1050 /* Performs lifting in one single iteration. Saves memory */ | |
| 1051 /* accesses and explicit interleaving. */ | |
| 1052 | |
| 1053 s1 = in_even[stride]; | |
| 1054 dc = in_odd[0] - ((in_even[0] + s1 + 2) >> 2); | |
| 1055 tmp[0] = in_even[0] + dc; | |
| 1056 for (i = 1, j = 1; i < (len - 2 - !(len & 1)); i += 2, j++) { | |
| 1057 | |
| 1058 s2 = in_even[(OPJ_SIZE_T)(j + 1) * stride]; | |
| 1059 | |
| 1060 dn = in_odd[(OPJ_SIZE_T)j * stride] - ((s1 + s2 + 2) >> 2); | |
| 1061 tmp[i ] = dc; | |
| 1062 tmp[i + 1] = s1 + ((dn + dc) >> 1); | |
| 1063 | |
| 1064 dc = dn; | |
| 1065 s1 = s2; | |
| 1066 } | |
| 1067 tmp[i] = dc; | |
| 1068 if (!(len & 1)) { | |
| 1069 dn = in_odd[(OPJ_SIZE_T)(len / 2 - 1) * stride] - ((s1 + 1) >> 1); | |
| 1070 tmp[len - 2] = s1 + ((dn + dc) >> 1); | |
| 1071 tmp[len - 1] = dn; | |
| 1072 } else { | |
| 1073 tmp[len - 1] = s1 + dc; | |
| 1074 } | |
| 1075 | |
| 1076 for (i = 0; i < len; ++i) { | |
| 1077 tiledp_col[(OPJ_SIZE_T)i * stride] = tmp[i]; | |
| 1078 } | |
| 1079 } | |
| 1080 #endif /* !defined(STANDARD_SLOW_VERSION) */ | |
| 1081 | |
| 1082 /* <summary> */ | |
| 1083 /* Inverse vertical 5-3 wavelet transform in 1-D for several columns. */ | |
| 1084 /* </summary> */ | |
| 1085 /* Performs interleave, inverse wavelet transform and copy back to buffer */ | |
| 1086 static void opj_idwt53_v(const opj_dwt_t *dwt, | |
| 1087 OPJ_INT32* tiledp_col, | |
| 1088 OPJ_SIZE_T stride, | |
| 1089 OPJ_INT32 nb_cols) | |
| 1090 { | |
| 1091 #ifdef STANDARD_SLOW_VERSION | |
| 1092 /* For documentation purpose */ | |
| 1093 OPJ_INT32 k, c; | |
| 1094 for (c = 0; c < nb_cols; c ++) { | |
| 1095 opj_dwt_interleave_v(dwt, tiledp_col + c, stride); | |
| 1096 opj_dwt_decode_1(dwt); | |
| 1097 for (k = 0; k < dwt->sn + dwt->dn; ++k) { | |
| 1098 tiledp_col[c + k * stride] = dwt->mem[k]; | |
| 1099 } | |
| 1100 } | |
| 1101 #else | |
| 1102 const OPJ_INT32 sn = dwt->sn; | |
| 1103 const OPJ_INT32 len = sn + dwt->dn; | |
| 1104 if (dwt->cas == 0) { | |
| 1105 /* If len == 1, unmodified value */ | |
| 1106 | |
| 1107 #if (defined(__SSE2__) || defined(__AVX2__)) | |
| 1108 if (len > 1 && nb_cols == PARALLEL_COLS_53) { | |
| 1109 /* Same as below general case, except that thanks to SSE2/AVX2 */ | |
| 1110 /* we can efficiently process 8/16 columns in parallel */ | |
| 1111 opj_idwt53_v_cas0_mcols_SSE2_OR_AVX2(dwt->mem, sn, len, tiledp_col, stride); | |
| 1112 return; | |
| 1113 } | |
| 1114 #endif | |
| 1115 if (len > 1) { | |
| 1116 OPJ_INT32 c; | |
| 1117 for (c = 0; c < nb_cols; c++, tiledp_col++) { | |
| 1118 opj_idwt3_v_cas0(dwt->mem, sn, len, tiledp_col, stride); | |
| 1119 } | |
| 1120 return; | |
| 1121 } | |
| 1122 } else { | |
| 1123 if (len == 1) { | |
| 1124 OPJ_INT32 c; | |
| 1125 for (c = 0; c < nb_cols; c++, tiledp_col++) { | |
| 1126 tiledp_col[0] /= 2; | |
| 1127 } | |
| 1128 return; | |
| 1129 } | |
| 1130 | |
| 1131 if (len == 2) { | |
| 1132 OPJ_INT32 c; | |
| 1133 OPJ_INT32* out = dwt->mem; | |
| 1134 for (c = 0; c < nb_cols; c++, tiledp_col++) { | |
| 1135 OPJ_INT32 i; | |
| 1136 const OPJ_INT32* in_even = &tiledp_col[(OPJ_SIZE_T)sn * stride]; | |
| 1137 const OPJ_INT32* in_odd = &tiledp_col[0]; | |
| 1138 | |
| 1139 out[1] = in_odd[0] - ((in_even[0] + 1) >> 1); | |
| 1140 out[0] = in_even[0] + out[1]; | |
| 1141 | |
| 1142 for (i = 0; i < len; ++i) { | |
| 1143 tiledp_col[(OPJ_SIZE_T)i * stride] = out[i]; | |
| 1144 } | |
| 1145 } | |
| 1146 | |
| 1147 return; | |
| 1148 } | |
| 1149 | |
| 1150 #if (defined(__SSE2__) || defined(__AVX2__)) | |
| 1151 if (len > 2 && nb_cols == PARALLEL_COLS_53) { | |
| 1152 /* Same as below general case, except that thanks to SSE2/AVX2 */ | |
| 1153 /* we can efficiently process 8/16 columns in parallel */ | |
| 1154 opj_idwt53_v_cas1_mcols_SSE2_OR_AVX2(dwt->mem, sn, len, tiledp_col, stride); | |
| 1155 return; | |
| 1156 } | |
| 1157 #endif | |
| 1158 if (len > 2) { | |
| 1159 OPJ_INT32 c; | |
| 1160 for (c = 0; c < nb_cols; c++, tiledp_col++) { | |
| 1161 opj_idwt3_v_cas1(dwt->mem, sn, len, tiledp_col, stride); | |
| 1162 } | |
| 1163 return; | |
| 1164 } | |
| 1165 } | |
| 1166 #endif | |
| 1167 } | |
| 1168 | |
| 1169 #if 0 | |
| 1170 static void opj_dwt_encode_step1(OPJ_FLOAT32* fw, | |
| 1171 OPJ_UINT32 end, | |
| 1172 const OPJ_FLOAT32 c) | |
| 1173 { | |
| 1174 OPJ_UINT32 i = 0; | |
| 1175 for (; i < end; ++i) { | |
| 1176 fw[0] *= c; | |
| 1177 fw += 2; | |
| 1178 } | |
| 1179 } | |
| 1180 #else | |
| 1181 static void opj_dwt_encode_step1_combined(OPJ_FLOAT32* fw, | |
| 1182 OPJ_UINT32 iters_c1, | |
| 1183 OPJ_UINT32 iters_c2, | |
| 1184 const OPJ_FLOAT32 c1, | |
| 1185 const OPJ_FLOAT32 c2) | |
| 1186 { | |
| 1187 OPJ_UINT32 i = 0; | |
| 1188 const OPJ_UINT32 iters_common = opj_uint_min(iters_c1, iters_c2); | |
| 1189 assert((((OPJ_SIZE_T)fw) & 0xf) == 0); | |
| 1190 assert(opj_int_abs((OPJ_INT32)iters_c1 - (OPJ_INT32)iters_c2) <= 1); | |
| 1191 for (; i + 3 < iters_common; i += 4) { | |
| 1192 #ifdef __SSE__ | |
| 1193 const __m128 vcst = _mm_set_ps(c2, c1, c2, c1); | |
| 1194 *(__m128*)fw = _mm_mul_ps(*(__m128*)fw, vcst); | |
| 1195 *(__m128*)(fw + 4) = _mm_mul_ps(*(__m128*)(fw + 4), vcst); | |
| 1196 #else | |
| 1197 fw[0] *= c1; | |
| 1198 fw[1] *= c2; | |
| 1199 fw[2] *= c1; | |
| 1200 fw[3] *= c2; | |
| 1201 fw[4] *= c1; | |
| 1202 fw[5] *= c2; | |
| 1203 fw[6] *= c1; | |
| 1204 fw[7] *= c2; | |
| 1205 #endif | |
| 1206 fw += 8; | |
| 1207 } | |
| 1208 for (; i < iters_common; i++) { | |
| 1209 fw[0] *= c1; | |
| 1210 fw[1] *= c2; | |
| 1211 fw += 2; | |
| 1212 } | |
| 1213 if (i < iters_c1) { | |
| 1214 fw[0] *= c1; | |
| 1215 } else if (i < iters_c2) { | |
| 1216 fw[1] *= c2; | |
| 1217 } | |
| 1218 } | |
| 1219 | |
| 1220 #endif | |
| 1221 | |
| 1222 static void opj_dwt_encode_step2(OPJ_FLOAT32* fl, OPJ_FLOAT32* fw, | |
| 1223 OPJ_UINT32 end, | |
| 1224 OPJ_UINT32 m, | |
| 1225 OPJ_FLOAT32 c) | |
| 1226 { | |
| 1227 OPJ_UINT32 i; | |
| 1228 OPJ_UINT32 imax = opj_uint_min(end, m); | |
| 1229 if (imax > 0) { | |
| 1230 fw[-1] += (fl[0] + fw[0]) * c; | |
| 1231 fw += 2; | |
| 1232 i = 1; | |
| 1233 for (; i + 3 < imax; i += 4) { | |
| 1234 fw[-1] += (fw[-2] + fw[0]) * c; | |
| 1235 fw[1] += (fw[0] + fw[2]) * c; | |
| 1236 fw[3] += (fw[2] + fw[4]) * c; | |
| 1237 fw[5] += (fw[4] + fw[6]) * c; | |
| 1238 fw += 8; | |
| 1239 } | |
| 1240 for (; i < imax; ++i) { | |
| 1241 fw[-1] += (fw[-2] + fw[0]) * c; | |
| 1242 fw += 2; | |
| 1243 } | |
| 1244 } | |
| 1245 if (m < end) { | |
| 1246 assert(m + 1 == end); | |
| 1247 fw[-1] += (2 * fw[-2]) * c; | |
| 1248 } | |
| 1249 } | |
| 1250 | |
| 1251 static void opj_dwt_encode_1_real(void *aIn, OPJ_INT32 dn, OPJ_INT32 sn, | |
| 1252 OPJ_INT32 cas) | |
| 1253 { | |
| 1254 OPJ_FLOAT32* w = (OPJ_FLOAT32*)aIn; | |
| 1255 OPJ_INT32 a, b; | |
| 1256 assert(dn + sn > 1); | |
| 1257 if (cas == 0) { | |
| 1258 a = 0; | |
| 1259 b = 1; | |
| 1260 } else { | |
| 1261 a = 1; | |
| 1262 b = 0; | |
| 1263 } | |
| 1264 opj_dwt_encode_step2(w + a, w + b + 1, | |
| 1265 (OPJ_UINT32)dn, | |
| 1266 (OPJ_UINT32)opj_int_min(dn, sn - b), | |
| 1267 opj_dwt_alpha); | |
| 1268 opj_dwt_encode_step2(w + b, w + a + 1, | |
| 1269 (OPJ_UINT32)sn, | |
| 1270 (OPJ_UINT32)opj_int_min(sn, dn - a), | |
| 1271 opj_dwt_beta); | |
| 1272 opj_dwt_encode_step2(w + a, w + b + 1, | |
| 1273 (OPJ_UINT32)dn, | |
| 1274 (OPJ_UINT32)opj_int_min(dn, sn - b), | |
| 1275 opj_dwt_gamma); | |
| 1276 opj_dwt_encode_step2(w + b, w + a + 1, | |
| 1277 (OPJ_UINT32)sn, | |
| 1278 (OPJ_UINT32)opj_int_min(sn, dn - a), | |
| 1279 opj_dwt_delta); | |
| 1280 #if 0 | |
| 1281 opj_dwt_encode_step1(w + b, (OPJ_UINT32)dn, | |
| 1282 opj_K); | |
| 1283 opj_dwt_encode_step1(w + a, (OPJ_UINT32)sn, | |
| 1284 opj_invK); | |
| 1285 #else | |
| 1286 if (a == 0) { | |
| 1287 opj_dwt_encode_step1_combined(w, | |
| 1288 (OPJ_UINT32)sn, | |
| 1289 (OPJ_UINT32)dn, | |
| 1290 opj_invK, | |
| 1291 opj_K); | |
| 1292 } else { | |
| 1293 opj_dwt_encode_step1_combined(w, | |
| 1294 (OPJ_UINT32)dn, | |
| 1295 (OPJ_UINT32)sn, | |
| 1296 opj_K, | |
| 1297 opj_invK); | |
| 1298 } | |
| 1299 #endif | |
| 1300 } | |
| 1301 | |
| 1302 static void opj_dwt_encode_stepsize(OPJ_INT32 stepsize, OPJ_INT32 numbps, | |
| 1303 opj_stepsize_t *bandno_stepsize) | |
| 1304 { | |
| 1305 OPJ_INT32 p, n; | |
| 1306 p = opj_int_floorlog2(stepsize) - 13; | |
| 1307 n = 11 - opj_int_floorlog2(stepsize); | |
| 1308 bandno_stepsize->mant = (n < 0 ? stepsize >> -n : stepsize << n) & 0x7ff; | |
| 1309 bandno_stepsize->expn = numbps - p; | |
| 1310 } | |
| 1311 | |
| 1312 /* | |
| 1313 ========================================================== | |
| 1314 DWT interface | |
| 1315 ========================================================== | |
| 1316 */ | |
| 1317 | |
| 1318 /** Process one line for the horizontal pass of the 5x3 forward transform */ | |
| 1319 static | |
| 1320 void opj_dwt_encode_and_deinterleave_h_one_row(void* rowIn, | |
| 1321 void* tmpIn, | |
| 1322 OPJ_UINT32 width, | |
| 1323 OPJ_BOOL even) | |
| 1324 { | |
| 1325 OPJ_INT32* OPJ_RESTRICT row = (OPJ_INT32*)rowIn; | |
| 1326 OPJ_INT32* OPJ_RESTRICT tmp = (OPJ_INT32*)tmpIn; | |
| 1327 const OPJ_INT32 sn = (OPJ_INT32)((width + (even ? 1 : 0)) >> 1); | |
| 1328 const OPJ_INT32 dn = (OPJ_INT32)(width - (OPJ_UINT32)sn); | |
| 1329 | |
| 1330 if (even) { | |
| 1331 if (width > 1) { | |
| 1332 OPJ_INT32 i; | |
| 1333 for (i = 0; i < sn - 1; i++) { | |
| 1334 tmp[sn + i] = row[2 * i + 1] - ((row[(i) * 2] + row[(i + 1) * 2]) >> 1); | |
| 1335 } | |
| 1336 if ((width % 2) == 0) { | |
| 1337 tmp[sn + i] = row[2 * i + 1] - row[(i) * 2]; | |
| 1338 } | |
| 1339 row[0] += (tmp[sn] + tmp[sn] + 2) >> 2; | |
| 1340 for (i = 1; i < dn; i++) { | |
| 1341 row[i] = row[2 * i] + ((tmp[sn + (i - 1)] + tmp[sn + i] + 2) >> 2); | |
| 1342 } | |
| 1343 if ((width % 2) == 1) { | |
| 1344 row[i] = row[2 * i] + ((tmp[sn + (i - 1)] + tmp[sn + (i - 1)] + 2) >> 2); | |
| 1345 } | |
| 1346 memcpy(row + sn, tmp + sn, (OPJ_SIZE_T)dn * sizeof(OPJ_INT32)); | |
| 1347 } | |
| 1348 } else { | |
| 1349 if (width == 1) { | |
| 1350 row[0] *= 2; | |
| 1351 } else { | |
| 1352 OPJ_INT32 i; | |
| 1353 tmp[sn + 0] = row[0] - row[1]; | |
| 1354 for (i = 1; i < sn; i++) { | |
| 1355 tmp[sn + i] = row[2 * i] - ((row[2 * i + 1] + row[2 * (i - 1) + 1]) >> 1); | |
| 1356 } | |
| 1357 if ((width % 2) == 1) { | |
| 1358 tmp[sn + i] = row[2 * i] - row[2 * (i - 1) + 1]; | |
| 1359 } | |
| 1360 | |
| 1361 for (i = 0; i < dn - 1; i++) { | |
| 1362 row[i] = row[2 * i + 1] + ((tmp[sn + i] + tmp[sn + i + 1] + 2) >> 2); | |
| 1363 } | |
| 1364 if ((width % 2) == 0) { | |
| 1365 row[i] = row[2 * i + 1] + ((tmp[sn + i] + tmp[sn + i] + 2) >> 2); | |
| 1366 } | |
| 1367 memcpy(row + sn, tmp + sn, (OPJ_SIZE_T)dn * sizeof(OPJ_INT32)); | |
| 1368 } | |
| 1369 } | |
| 1370 } | |
| 1371 | |
| 1372 /** Process one line for the horizontal pass of the 9x7 forward transform */ | |
| 1373 static | |
| 1374 void opj_dwt_encode_and_deinterleave_h_one_row_real(void* rowIn, | |
| 1375 void* tmpIn, | |
| 1376 OPJ_UINT32 width, | |
| 1377 OPJ_BOOL even) | |
| 1378 { | |
| 1379 OPJ_FLOAT32* OPJ_RESTRICT row = (OPJ_FLOAT32*)rowIn; | |
| 1380 OPJ_FLOAT32* OPJ_RESTRICT tmp = (OPJ_FLOAT32*)tmpIn; | |
| 1381 const OPJ_INT32 sn = (OPJ_INT32)((width + (even ? 1 : 0)) >> 1); | |
| 1382 const OPJ_INT32 dn = (OPJ_INT32)(width - (OPJ_UINT32)sn); | |
| 1383 if (width == 1) { | |
| 1384 return; | |
| 1385 } | |
| 1386 memcpy(tmp, row, width * sizeof(OPJ_FLOAT32)); | |
| 1387 opj_dwt_encode_1_real(tmp, dn, sn, even ? 0 : 1); | |
| 1388 opj_dwt_deinterleave_h((OPJ_INT32 * OPJ_RESTRICT)tmp, | |
| 1389 (OPJ_INT32 * OPJ_RESTRICT)row, | |
| 1390 dn, sn, even ? 0 : 1); | |
| 1391 } | |
| 1392 | |
/* Work item for opj_dwt_encode_h_func(): a range of rows of a tile */
/* buffer to run through the horizontal forward transform. */
typedef struct {
    opj_dwt_t h; /* h.mem: scratch buffer handed to p_function (released by the job function); h.cas == 0 selects the "even" variant */
    OPJ_UINT32 rw; /* Width of the resolution to process */
    OPJ_UINT32 w; /* Width of tiledp */
    OPJ_INT32 * OPJ_RESTRICT tiledp; /* Buffer holding the rows; row j starts at tiledp + j * w */
    OPJ_UINT32 min_j; /* First row to process (inclusive) */
    OPJ_UINT32 max_j; /* End of the row range (exclusive) */
    opj_encode_and_deinterleave_h_one_row_fnptr_type p_function; /* Per-row transform routine */
} opj_dwt_encode_h_job_t;
| 1402 | |
| 1403 static void opj_dwt_encode_h_func(void* user_data, opj_tls_t* tls) | |
| 1404 { | |
| 1405 OPJ_UINT32 j; | |
| 1406 opj_dwt_encode_h_job_t* job; | |
| 1407 (void)tls; | |
| 1408 | |
| 1409 job = (opj_dwt_encode_h_job_t*)user_data; | |
| 1410 for (j = job->min_j; j < job->max_j; j++) { | |
| 1411 OPJ_INT32* OPJ_RESTRICT aj = job->tiledp + j * job->w; | |
| 1412 (*job->p_function)(aj, job->h.mem, job->rw, | |
| 1413 job->h.cas == 0 ? OPJ_TRUE : OPJ_FALSE); | |
| 1414 } | |
| 1415 | |
| 1416 opj_aligned_free(job->h.mem); | |
| 1417 opj_free(job); | |
| 1418 } | |
| 1419 | |
/* Work item for opj_dwt_encode_v_func(): a range of columns of a tile */
/* buffer to run through the vertical forward transform. */
typedef struct {
    opj_dwt_t v; /* v.mem: scratch buffer handed to the transform routine (released by the job function); v.cas == 0 selects the "even" variant */
    OPJ_UINT32 rh; /* Passed to the transform routine as its height argument */
    OPJ_UINT32 w; /* Passed to the transform routine as its row stride */
    OPJ_INT32 * OPJ_RESTRICT tiledp; /* Buffer holding the columns; column j starts at tiledp + j */
    OPJ_UINT32 min_j; /* First column to process (inclusive) */
    OPJ_UINT32 max_j; /* End of the column range (exclusive) */
    opj_encode_and_deinterleave_v_fnptr_type p_encode_and_deinterleave_v; /* Transform routine, called on groups of up to NB_ELTS_V8 columns */
} opj_dwt_encode_v_job_t;
| 1429 | |
| 1430 static void opj_dwt_encode_v_func(void* user_data, opj_tls_t* tls) | |
| 1431 { | |
| 1432 OPJ_UINT32 j; | |
| 1433 opj_dwt_encode_v_job_t* job; | |
| 1434 (void)tls; | |
| 1435 | |
| 1436 job = (opj_dwt_encode_v_job_t*)user_data; | |
| 1437 for (j = job->min_j; j + NB_ELTS_V8 - 1 < job->max_j; j += NB_ELTS_V8) { | |
| 1438 (*job->p_encode_and_deinterleave_v)(job->tiledp + j, | |
| 1439 job->v.mem, | |
| 1440 job->rh, | |
| 1441 job->v.cas == 0, | |
| 1442 job->w, | |
| 1443 NB_ELTS_V8); | |
| 1444 } | |
| 1445 if (j < job->max_j) { | |
| 1446 (*job->p_encode_and_deinterleave_v)(job->tiledp + j, | |
| 1447 job->v.mem, | |
| 1448 job->rh, | |
| 1449 job->v.cas == 0, | |
| 1450 job->w, | |
| 1451 job->max_j - j); | |
| 1452 } | |
| 1453 | |
| 1454 opj_aligned_free(job->v.mem); | |
| 1455 opj_free(job); | |
| 1456 } | |
| 1457 | |
| 1458 /** Fetch up to cols <= NB_ELTS_V8 for each line, and put them in tmpOut */ | |
| 1459 /* that has a NB_ELTS_V8 interleave factor. */ | |
| 1460 static void opj_dwt_fetch_cols_vertical_pass(const void *arrayIn, | |
| 1461 void *tmpOut, | |
| 1462 OPJ_UINT32 height, | |
| 1463 OPJ_UINT32 stride_width, | |
| 1464 OPJ_UINT32 cols) | |
| 1465 { | |
| 1466 const OPJ_INT32* OPJ_RESTRICT array = (const OPJ_INT32 * OPJ_RESTRICT)arrayIn; | |
| 1467 OPJ_INT32* OPJ_RESTRICT tmp = (OPJ_INT32 * OPJ_RESTRICT)tmpOut; | |
| 1468 if (cols == NB_ELTS_V8) { | |
| 1469 OPJ_UINT32 k; | |
| 1470 for (k = 0; k < height; ++k) { | |
| 1471 memcpy(tmp + NB_ELTS_V8 * k, | |
| 1472 array + k * stride_width, | |
| 1473 NB_ELTS_V8 * sizeof(OPJ_INT32)); | |
| 1474 } | |
| 1475 } else { | |
| 1476 OPJ_UINT32 k; | |
| 1477 for (k = 0; k < height; ++k) { | |
| 1478 OPJ_UINT32 c; | |
| 1479 for (c = 0; c < cols; c++) { | |
| 1480 tmp[NB_ELTS_V8 * k + c] = array[c + k * stride_width]; | |
| 1481 } | |
| 1482 for (; c < NB_ELTS_V8; c++) { | |
| 1483 tmp[NB_ELTS_V8 * k + c] = 0; | |
| 1484 } | |
| 1485 } | |
| 1486 } | |
| 1487 } | |
| 1488 | |
| 1489 /* Deinterleave result of forward transform, where cols <= NB_ELTS_V8 */ | |
| 1490 /* and src contains NB_ELTS_V8 consecutive values for up to NB_ELTS_V8 */ | |
| 1491 /* columns. */ | |
| 1492 static INLINE void opj_dwt_deinterleave_v_cols( | |
| 1493 const OPJ_INT32 * OPJ_RESTRICT src, | |
| 1494 OPJ_INT32 * OPJ_RESTRICT dst, | |
| 1495 OPJ_INT32 dn, | |
| 1496 OPJ_INT32 sn, | |
| 1497 OPJ_UINT32 stride_width, | |
| 1498 OPJ_INT32 cas, | |
| 1499 OPJ_UINT32 cols) | |
| 1500 { | |
| 1501 OPJ_INT32 k; | |
| 1502 OPJ_INT32 i = sn; | |
| 1503 OPJ_INT32 * OPJ_RESTRICT l_dest = dst; | |
| 1504 const OPJ_INT32 * OPJ_RESTRICT l_src = src + cas * NB_ELTS_V8; | |
| 1505 OPJ_UINT32 c; | |
| 1506 | |
| 1507 for (k = 0; k < 2; k++) { | |
| 1508 while (i--) { | |
| 1509 if (cols == NB_ELTS_V8) { | |
| 1510 memcpy(l_dest, l_src, NB_ELTS_V8 * sizeof(OPJ_INT32)); | |
| 1511 } else { | |
| 1512 c = 0; | |
| 1513 switch (cols) { | |
| 1514 case 7: | |
| 1515 l_dest[c] = l_src[c]; | |
| 1516 c++; /* fallthru */ | |
| 1517 case 6: | |
| 1518 l_dest[c] = l_src[c]; | |
| 1519 c++; /* fallthru */ | |
| 1520 case 5: | |
| 1521 l_dest[c] = l_src[c]; | |
| 1522 c++; /* fallthru */ | |
| 1523 case 4: | |
| 1524 l_dest[c] = l_src[c]; | |
| 1525 c++; /* fallthru */ | |
| 1526 case 3: | |
| 1527 l_dest[c] = l_src[c]; | |
| 1528 c++; /* fallthru */ | |
| 1529 case 2: | |
| 1530 l_dest[c] = l_src[c]; | |
| 1531 c++; /* fallthru */ | |
| 1532 default: | |
| 1533 l_dest[c] = l_src[c]; | |
| 1534 break; | |
| 1535 } | |
| 1536 } | |
| 1537 l_dest += stride_width; | |
| 1538 l_src += 2 * NB_ELTS_V8; | |
| 1539 } | |
| 1540 | |
| 1541 l_dest = dst + (OPJ_SIZE_T)sn * (OPJ_SIZE_T)stride_width; | |
| 1542 l_src = src + (1 - cas) * NB_ELTS_V8; | |
| 1543 i = dn; | |
| 1544 } | |
| 1545 } | |
| 1546 | |
| 1547 | |
| 1548 /* Forward 5-3 transform, for the vertical pass, processing cols columns */ | |
| 1549 /* where cols <= NB_ELTS_V8 */ | |
| 1550 static void opj_dwt_encode_and_deinterleave_v( | |
| 1551 void *arrayIn, | |
| 1552 void *tmpIn, | |
| 1553 OPJ_UINT32 height, | |
| 1554 OPJ_BOOL even, | |
| 1555 OPJ_UINT32 stride_width, | |
| 1556 OPJ_UINT32 cols) | |
| 1557 { | |
| 1558 OPJ_INT32* OPJ_RESTRICT array = (OPJ_INT32 * OPJ_RESTRICT)arrayIn; | |
| 1559 OPJ_INT32* OPJ_RESTRICT tmp = (OPJ_INT32 * OPJ_RESTRICT)tmpIn; | |
| 1560 const OPJ_UINT32 sn = (height + (even ? 1 : 0)) >> 1; | |
| 1561 const OPJ_UINT32 dn = height - sn; | |
| 1562 | |
| 1563 opj_dwt_fetch_cols_vertical_pass(arrayIn, tmpIn, height, stride_width, cols); | |
| 1564 | |
| 1565 #define OPJ_Sc(i) tmp[(i)*2* NB_ELTS_V8 + c] | |
| 1566 #define OPJ_Dc(i) tmp[((1+(i)*2))* NB_ELTS_V8 + c] | |
| 1567 | |
| 1568 #ifdef __SSE2__ | |
| 1569 if (height == 1) { | |
| 1570 if (!even) { | |
| 1571 OPJ_UINT32 c; | |
| 1572 for (c = 0; c < NB_ELTS_V8; c++) { | |
| 1573 tmp[c] *= 2; | |
| 1574 } | |
| 1575 } | |
| 1576 } else if (even) { | |
| 1577 OPJ_UINT32 c; | |
| 1578 OPJ_UINT32 i; | |
| 1579 i = 0; | |
| 1580 if (i + 1 < sn) { | |
| 1581 __m128i xmm_Si_0 = *(const __m128i*)(tmp + 4 * 0); | |
| 1582 __m128i xmm_Si_1 = *(const __m128i*)(tmp + 4 * 1); | |
| 1583 for (; i + 1 < sn; i++) { | |
| 1584 __m128i xmm_Sip1_0 = *(const __m128i*)(tmp + | |
| 1585 (i + 1) * 2 * NB_ELTS_V8 + 4 * 0); | |
| 1586 __m128i xmm_Sip1_1 = *(const __m128i*)(tmp + | |
| 1587 (i + 1) * 2 * NB_ELTS_V8 + 4 * 1); | |
| 1588 __m128i xmm_Di_0 = *(const __m128i*)(tmp + | |
| 1589 (1 + i * 2) * NB_ELTS_V8 + 4 * 0); | |
| 1590 __m128i xmm_Di_1 = *(const __m128i*)(tmp + | |
| 1591 (1 + i * 2) * NB_ELTS_V8 + 4 * 1); | |
| 1592 xmm_Di_0 = _mm_sub_epi32(xmm_Di_0, | |
| 1593 _mm_srai_epi32(_mm_add_epi32(xmm_Si_0, xmm_Sip1_0), 1)); | |
| 1594 xmm_Di_1 = _mm_sub_epi32(xmm_Di_1, | |
| 1595 _mm_srai_epi32(_mm_add_epi32(xmm_Si_1, xmm_Sip1_1), 1)); | |
| 1596 *(__m128i*)(tmp + (1 + i * 2) * NB_ELTS_V8 + 4 * 0) = xmm_Di_0; | |
| 1597 *(__m128i*)(tmp + (1 + i * 2) * NB_ELTS_V8 + 4 * 1) = xmm_Di_1; | |
| 1598 xmm_Si_0 = xmm_Sip1_0; | |
| 1599 xmm_Si_1 = xmm_Sip1_1; | |
| 1600 } | |
| 1601 } | |
| 1602 if (((height) % 2) == 0) { | |
| 1603 for (c = 0; c < NB_ELTS_V8; c++) { | |
| 1604 OPJ_Dc(i) -= OPJ_Sc(i); | |
| 1605 } | |
| 1606 } | |
| 1607 for (c = 0; c < NB_ELTS_V8; c++) { | |
| 1608 OPJ_Sc(0) += (OPJ_Dc(0) + OPJ_Dc(0) + 2) >> 2; | |
| 1609 } | |
| 1610 i = 1; | |
| 1611 if (i < dn) { | |
| 1612 __m128i xmm_Dim1_0 = *(const __m128i*)(tmp + (1 + | |
| 1613 (i - 1) * 2) * NB_ELTS_V8 + 4 * 0); | |
| 1614 __m128i xmm_Dim1_1 = *(const __m128i*)(tmp + (1 + | |
| 1615 (i - 1) * 2) * NB_ELTS_V8 + 4 * 1); | |
| 1616 const __m128i xmm_two = _mm_set1_epi32(2); | |
| 1617 for (; i < dn; i++) { | |
| 1618 __m128i xmm_Di_0 = *(const __m128i*)(tmp + | |
| 1619 (1 + i * 2) * NB_ELTS_V8 + 4 * 0); | |
| 1620 __m128i xmm_Di_1 = *(const __m128i*)(tmp + | |
| 1621 (1 + i * 2) * NB_ELTS_V8 + 4 * 1); | |
| 1622 __m128i xmm_Si_0 = *(const __m128i*)(tmp + | |
| 1623 (i * 2) * NB_ELTS_V8 + 4 * 0); | |
| 1624 __m128i xmm_Si_1 = *(const __m128i*)(tmp + | |
| 1625 (i * 2) * NB_ELTS_V8 + 4 * 1); | |
| 1626 xmm_Si_0 = _mm_add_epi32(xmm_Si_0, | |
| 1627 _mm_srai_epi32(_mm_add_epi32(_mm_add_epi32(xmm_Dim1_0, xmm_Di_0), xmm_two), 2)); | |
| 1628 xmm_Si_1 = _mm_add_epi32(xmm_Si_1, | |
| 1629 _mm_srai_epi32(_mm_add_epi32(_mm_add_epi32(xmm_Dim1_1, xmm_Di_1), xmm_two), 2)); | |
| 1630 *(__m128i*)(tmp + (i * 2) * NB_ELTS_V8 + 4 * 0) = xmm_Si_0; | |
| 1631 *(__m128i*)(tmp + (i * 2) * NB_ELTS_V8 + 4 * 1) = xmm_Si_1; | |
| 1632 xmm_Dim1_0 = xmm_Di_0; | |
| 1633 xmm_Dim1_1 = xmm_Di_1; | |
| 1634 } | |
| 1635 } | |
| 1636 if (((height) % 2) == 1) { | |
| 1637 for (c = 0; c < NB_ELTS_V8; c++) { | |
| 1638 OPJ_Sc(i) += (OPJ_Dc(i - 1) + OPJ_Dc(i - 1) + 2) >> 2; | |
| 1639 } | |
| 1640 } | |
| 1641 } else { | |
| 1642 OPJ_UINT32 c; | |
| 1643 OPJ_UINT32 i; | |
| 1644 for (c = 0; c < NB_ELTS_V8; c++) { | |
| 1645 OPJ_Sc(0) -= OPJ_Dc(0); | |
| 1646 } | |
| 1647 i = 1; | |
| 1648 if (i < sn) { | |
| 1649 __m128i xmm_Dim1_0 = *(const __m128i*)(tmp + (1 + | |
| 1650 (i - 1) * 2) * NB_ELTS_V8 + 4 * 0); | |
| 1651 __m128i xmm_Dim1_1 = *(const __m128i*)(tmp + (1 + | |
| 1652 (i - 1) * 2) * NB_ELTS_V8 + 4 * 1); | |
| 1653 for (; i < sn; i++) { | |
| 1654 __m128i xmm_Di_0 = *(const __m128i*)(tmp + | |
| 1655 (1 + i * 2) * NB_ELTS_V8 + 4 * 0); | |
| 1656 __m128i xmm_Di_1 = *(const __m128i*)(tmp + | |
| 1657 (1 + i * 2) * NB_ELTS_V8 + 4 * 1); | |
| 1658 __m128i xmm_Si_0 = *(const __m128i*)(tmp + | |
| 1659 (i * 2) * NB_ELTS_V8 + 4 * 0); | |
| 1660 __m128i xmm_Si_1 = *(const __m128i*)(tmp + | |
| 1661 (i * 2) * NB_ELTS_V8 + 4 * 1); | |
| 1662 xmm_Si_0 = _mm_sub_epi32(xmm_Si_0, | |
| 1663 _mm_srai_epi32(_mm_add_epi32(xmm_Di_0, xmm_Dim1_0), 1)); | |
| 1664 xmm_Si_1 = _mm_sub_epi32(xmm_Si_1, | |
| 1665 _mm_srai_epi32(_mm_add_epi32(xmm_Di_1, xmm_Dim1_1), 1)); | |
| 1666 *(__m128i*)(tmp + (i * 2) * NB_ELTS_V8 + 4 * 0) = xmm_Si_0; | |
| 1667 *(__m128i*)(tmp + (i * 2) * NB_ELTS_V8 + 4 * 1) = xmm_Si_1; | |
| 1668 xmm_Dim1_0 = xmm_Di_0; | |
| 1669 xmm_Dim1_1 = xmm_Di_1; | |
| 1670 } | |
| 1671 } | |
| 1672 if (((height) % 2) == 1) { | |
| 1673 for (c = 0; c < NB_ELTS_V8; c++) { | |
| 1674 OPJ_Sc(i) -= OPJ_Dc(i - 1); | |
| 1675 } | |
| 1676 } | |
| 1677 i = 0; | |
| 1678 if (i + 1 < dn) { | |
| 1679 __m128i xmm_Si_0 = *((const __m128i*)(tmp + 4 * 0)); | |
| 1680 __m128i xmm_Si_1 = *((const __m128i*)(tmp + 4 * 1)); | |
| 1681 const __m128i xmm_two = _mm_set1_epi32(2); | |
| 1682 for (; i + 1 < dn; i++) { | |
| 1683 __m128i xmm_Sip1_0 = *(const __m128i*)(tmp + | |
| 1684 (i + 1) * 2 * NB_ELTS_V8 + 4 * 0); | |
| 1685 __m128i xmm_Sip1_1 = *(const __m128i*)(tmp + | |
| 1686 (i + 1) * 2 * NB_ELTS_V8 + 4 * 1); | |
| 1687 __m128i xmm_Di_0 = *(const __m128i*)(tmp + | |
| 1688 (1 + i * 2) * NB_ELTS_V8 + 4 * 0); | |
| 1689 __m128i xmm_Di_1 = *(const __m128i*)(tmp + | |
| 1690 (1 + i * 2) * NB_ELTS_V8 + 4 * 1); | |
| 1691 xmm_Di_0 = _mm_add_epi32(xmm_Di_0, | |
| 1692 _mm_srai_epi32(_mm_add_epi32(_mm_add_epi32(xmm_Si_0, xmm_Sip1_0), xmm_two), 2)); | |
| 1693 xmm_Di_1 = _mm_add_epi32(xmm_Di_1, | |
| 1694 _mm_srai_epi32(_mm_add_epi32(_mm_add_epi32(xmm_Si_1, xmm_Sip1_1), xmm_two), 2)); | |
| 1695 *(__m128i*)(tmp + (1 + i * 2) * NB_ELTS_V8 + 4 * 0) = xmm_Di_0; | |
| 1696 *(__m128i*)(tmp + (1 + i * 2) * NB_ELTS_V8 + 4 * 1) = xmm_Di_1; | |
| 1697 xmm_Si_0 = xmm_Sip1_0; | |
| 1698 xmm_Si_1 = xmm_Sip1_1; | |
| 1699 } | |
| 1700 } | |
| 1701 if (((height) % 2) == 0) { | |
| 1702 for (c = 0; c < NB_ELTS_V8; c++) { | |
| 1703 OPJ_Dc(i) += (OPJ_Sc(i) + OPJ_Sc(i) + 2) >> 2; | |
| 1704 } | |
| 1705 } | |
| 1706 } | |
| 1707 #else | |
| 1708 if (even) { | |
| 1709 OPJ_UINT32 c; | |
| 1710 if (height > 1) { | |
| 1711 OPJ_UINT32 i; | |
| 1712 for (i = 0; i + 1 < sn; i++) { | |
| 1713 for (c = 0; c < NB_ELTS_V8; c++) { | |
| 1714 OPJ_Dc(i) -= (OPJ_Sc(i) + OPJ_Sc(i + 1)) >> 1; | |
| 1715 } | |
| 1716 } | |
| 1717 if (((height) % 2) == 0) { | |
| 1718 for (c = 0; c < NB_ELTS_V8; c++) { | |
| 1719 OPJ_Dc(i) -= OPJ_Sc(i); | |
| 1720 } | |
| 1721 } | |
| 1722 for (c = 0; c < NB_ELTS_V8; c++) { | |
| 1723 OPJ_Sc(0) += (OPJ_Dc(0) + OPJ_Dc(0) + 2) >> 2; | |
| 1724 } | |
| 1725 for (i = 1; i < dn; i++) { | |
| 1726 for (c = 0; c < NB_ELTS_V8; c++) { | |
| 1727 OPJ_Sc(i) += (OPJ_Dc(i - 1) + OPJ_Dc(i) + 2) >> 2; | |
| 1728 } | |
| 1729 } | |
| 1730 if (((height) % 2) == 1) { | |
| 1731 for (c = 0; c < NB_ELTS_V8; c++) { | |
| 1732 OPJ_Sc(i) += (OPJ_Dc(i - 1) + OPJ_Dc(i - 1) + 2) >> 2; | |
| 1733 } | |
| 1734 } | |
| 1735 } | |
| 1736 } else { | |
| 1737 OPJ_UINT32 c; | |
| 1738 if (height == 1) { | |
| 1739 for (c = 0; c < NB_ELTS_V8; c++) { | |
| 1740 OPJ_Sc(0) *= 2; | |
| 1741 } | |
| 1742 } else { | |
| 1743 OPJ_UINT32 i; | |
| 1744 for (c = 0; c < NB_ELTS_V8; c++) { | |
| 1745 OPJ_Sc(0) -= OPJ_Dc(0); | |
| 1746 } | |
| 1747 for (i = 1; i < sn; i++) { | |
| 1748 for (c = 0; c < NB_ELTS_V8; c++) { | |
| 1749 OPJ_Sc(i) -= (OPJ_Dc(i) + OPJ_Dc(i - 1)) >> 1; | |
| 1750 } | |
| 1751 } | |
| 1752 if (((height) % 2) == 1) { | |
| 1753 for (c = 0; c < NB_ELTS_V8; c++) { | |
| 1754 OPJ_Sc(i) -= OPJ_Dc(i - 1); | |
| 1755 } | |
| 1756 } | |
| 1757 for (i = 0; i + 1 < dn; i++) { | |
| 1758 for (c = 0; c < NB_ELTS_V8; c++) { | |
| 1759 OPJ_Dc(i) += (OPJ_Sc(i) + OPJ_Sc(i + 1) + 2) >> 2; | |
| 1760 } | |
| 1761 } | |
| 1762 if (((height) % 2) == 0) { | |
| 1763 for (c = 0; c < NB_ELTS_V8; c++) { | |
| 1764 OPJ_Dc(i) += (OPJ_Sc(i) + OPJ_Sc(i) + 2) >> 2; | |
| 1765 } | |
| 1766 } | |
| 1767 } | |
| 1768 } | |
| 1769 #endif | |
| 1770 | |
| 1771 if (cols == NB_ELTS_V8) { | |
| 1772 opj_dwt_deinterleave_v_cols(tmp, array, (OPJ_INT32)dn, (OPJ_INT32)sn, | |
| 1773 stride_width, even ? 0 : 1, NB_ELTS_V8); | |
| 1774 } else { | |
| 1775 opj_dwt_deinterleave_v_cols(tmp, array, (OPJ_INT32)dn, (OPJ_INT32)sn, | |
| 1776 stride_width, even ? 0 : 1, cols); | |
| 1777 } | |
| 1778 } | |
| 1779 | |
| 1780 static void opj_v8dwt_encode_step1(OPJ_FLOAT32* fw, | |
| 1781 OPJ_UINT32 end, | |
| 1782 const OPJ_FLOAT32 cst) | |
| 1783 { | |
| 1784 OPJ_UINT32 i; | |
| 1785 #ifdef __SSE__ | |
| 1786 __m128* vw = (__m128*) fw; | |
| 1787 const __m128 vcst = _mm_set1_ps(cst); | |
| 1788 for (i = 0; i < end; ++i) { | |
| 1789 vw[0] = _mm_mul_ps(vw[0], vcst); | |
| 1790 vw[1] = _mm_mul_ps(vw[1], vcst); | |
| 1791 vw += 2 * (NB_ELTS_V8 * sizeof(OPJ_FLOAT32) / sizeof(__m128)); | |
| 1792 } | |
| 1793 #else | |
| 1794 OPJ_UINT32 c; | |
| 1795 for (i = 0; i < end; ++i) { | |
| 1796 for (c = 0; c < NB_ELTS_V8; c++) { | |
| 1797 fw[i * 2 * NB_ELTS_V8 + c] *= cst; | |
| 1798 } | |
| 1799 } | |
| 1800 #endif | |
| 1801 } | |
| 1802 | |
| 1803 static void opj_v8dwt_encode_step2(OPJ_FLOAT32* fl, OPJ_FLOAT32* fw, | |
| 1804 OPJ_UINT32 end, | |
| 1805 OPJ_UINT32 m, | |
| 1806 OPJ_FLOAT32 cst) | |
| 1807 { | |
| 1808 OPJ_UINT32 i; | |
| 1809 OPJ_UINT32 imax = opj_uint_min(end, m); | |
| 1810 #ifdef __SSE__ | |
| 1811 __m128* vw = (__m128*) fw; | |
| 1812 __m128 vcst = _mm_set1_ps(cst); | |
| 1813 if (imax > 0) { | |
| 1814 __m128* vl = (__m128*) fl; | |
| 1815 vw[-2] = _mm_add_ps(vw[-2], _mm_mul_ps(_mm_add_ps(vl[0], vw[0]), vcst)); | |
| 1816 vw[-1] = _mm_add_ps(vw[-1], _mm_mul_ps(_mm_add_ps(vl[1], vw[1]), vcst)); | |
| 1817 vw += 2 * (NB_ELTS_V8 * sizeof(OPJ_FLOAT32) / sizeof(__m128)); | |
| 1818 i = 1; | |
| 1819 | |
| 1820 for (; i < imax; ++i) { | |
| 1821 vw[-2] = _mm_add_ps(vw[-2], _mm_mul_ps(_mm_add_ps(vw[-4], vw[0]), vcst)); | |
| 1822 vw[-1] = _mm_add_ps(vw[-1], _mm_mul_ps(_mm_add_ps(vw[-3], vw[1]), vcst)); | |
| 1823 vw += 2 * (NB_ELTS_V8 * sizeof(OPJ_FLOAT32) / sizeof(__m128)); | |
| 1824 } | |
| 1825 } | |
| 1826 if (m < end) { | |
| 1827 assert(m + 1 == end); | |
| 1828 vcst = _mm_add_ps(vcst, vcst); | |
| 1829 vw[-2] = _mm_add_ps(vw[-2], _mm_mul_ps(vw[-4], vcst)); | |
| 1830 vw[-1] = _mm_add_ps(vw[-1], _mm_mul_ps(vw[-3], vcst)); | |
| 1831 } | |
| 1832 #else | |
| 1833 OPJ_INT32 c; | |
| 1834 if (imax > 0) { | |
| 1835 for (c = 0; c < NB_ELTS_V8; c++) { | |
| 1836 fw[-1 * NB_ELTS_V8 + c] += (fl[0 * NB_ELTS_V8 + c] + fw[0 * NB_ELTS_V8 + c]) * | |
| 1837 cst; | |
| 1838 } | |
| 1839 fw += 2 * NB_ELTS_V8; | |
| 1840 i = 1; | |
| 1841 for (; i < imax; ++i) { | |
| 1842 for (c = 0; c < NB_ELTS_V8; c++) { | |
| 1843 fw[-1 * NB_ELTS_V8 + c] += (fw[-2 * NB_ELTS_V8 + c] + fw[0 * NB_ELTS_V8 + c]) * | |
| 1844 cst; | |
| 1845 } | |
| 1846 fw += 2 * NB_ELTS_V8; | |
| 1847 } | |
| 1848 } | |
| 1849 if (m < end) { | |
| 1850 assert(m + 1 == end); | |
| 1851 for (c = 0; c < NB_ELTS_V8; c++) { | |
| 1852 fw[-1 * NB_ELTS_V8 + c] += (2 * fw[-2 * NB_ELTS_V8 + c]) * cst; | |
| 1853 } | |
| 1854 } | |
| 1855 #endif | |
| 1856 } | |
| 1857 | |
| 1858 /* Forward 9-7 transform, for the vertical pass, processing cols columns */ | |
| 1859 /* where cols <= NB_ELTS_V8 */ | |
| 1860 static void opj_dwt_encode_and_deinterleave_v_real( | |
| 1861 void *arrayIn, | |
| 1862 void *tmpIn, | |
| 1863 OPJ_UINT32 height, | |
| 1864 OPJ_BOOL even, | |
| 1865 OPJ_UINT32 stride_width, | |
| 1866 OPJ_UINT32 cols) | |
| 1867 { | |
| 1868 OPJ_FLOAT32* OPJ_RESTRICT array = (OPJ_FLOAT32 * OPJ_RESTRICT)arrayIn; | |
| 1869 OPJ_FLOAT32* OPJ_RESTRICT tmp = (OPJ_FLOAT32 * OPJ_RESTRICT)tmpIn; | |
| 1870 const OPJ_INT32 sn = (OPJ_INT32)((height + (even ? 1 : 0)) >> 1); | |
| 1871 const OPJ_INT32 dn = (OPJ_INT32)(height - (OPJ_UINT32)sn); | |
| 1872 OPJ_INT32 a, b; | |
| 1873 | |
| 1874 if (height == 1) { | |
| 1875 return; | |
| 1876 } | |
| 1877 | |
| 1878 opj_dwt_fetch_cols_vertical_pass(arrayIn, tmpIn, height, stride_width, cols); | |
| 1879 | |
| 1880 if (even) { | |
| 1881 a = 0; | |
| 1882 b = 1; | |
| 1883 } else { | |
| 1884 a = 1; | |
| 1885 b = 0; | |
| 1886 } | |
| 1887 opj_v8dwt_encode_step2(tmp + a * NB_ELTS_V8, | |
| 1888 tmp + (b + 1) * NB_ELTS_V8, | |
| 1889 (OPJ_UINT32)dn, | |
| 1890 (OPJ_UINT32)opj_int_min(dn, sn - b), | |
| 1891 opj_dwt_alpha); | |
| 1892 opj_v8dwt_encode_step2(tmp + b * NB_ELTS_V8, | |
| 1893 tmp + (a + 1) * NB_ELTS_V8, | |
| 1894 (OPJ_UINT32)sn, | |
| 1895 (OPJ_UINT32)opj_int_min(sn, dn - a), | |
| 1896 opj_dwt_beta); | |
| 1897 opj_v8dwt_encode_step2(tmp + a * NB_ELTS_V8, | |
| 1898 tmp + (b + 1) * NB_ELTS_V8, | |
| 1899 (OPJ_UINT32)dn, | |
| 1900 (OPJ_UINT32)opj_int_min(dn, sn - b), | |
| 1901 opj_dwt_gamma); | |
| 1902 opj_v8dwt_encode_step2(tmp + b * NB_ELTS_V8, | |
| 1903 tmp + (a + 1) * NB_ELTS_V8, | |
| 1904 (OPJ_UINT32)sn, | |
| 1905 (OPJ_UINT32)opj_int_min(sn, dn - a), | |
| 1906 opj_dwt_delta); | |
| 1907 opj_v8dwt_encode_step1(tmp + b * NB_ELTS_V8, (OPJ_UINT32)dn, | |
| 1908 opj_K); | |
| 1909 opj_v8dwt_encode_step1(tmp + a * NB_ELTS_V8, (OPJ_UINT32)sn, | |
| 1910 opj_invK); | |
| 1911 | |
| 1912 | |
| 1913 if (cols == NB_ELTS_V8) { | |
| 1914 opj_dwt_deinterleave_v_cols((OPJ_INT32*)tmp, | |
| 1915 (OPJ_INT32*)array, | |
| 1916 (OPJ_INT32)dn, (OPJ_INT32)sn, | |
| 1917 stride_width, even ? 0 : 1, NB_ELTS_V8); | |
| 1918 } else { | |
| 1919 opj_dwt_deinterleave_v_cols((OPJ_INT32*)tmp, | |
| 1920 (OPJ_INT32*)array, | |
| 1921 (OPJ_INT32)dn, (OPJ_INT32)sn, | |
| 1922 stride_width, even ? 0 : 1, cols); | |
| 1923 } | |
| 1924 } | |
| 1925 | |
| 1926 | |
| 1927 /* <summary> */ | |
| 1928 /* Forward 5-3 wavelet transform in 2-D. */ | |
| 1929 /* </summary> */ | |
| 1930 static INLINE OPJ_BOOL opj_dwt_encode_procedure(opj_thread_pool_t* tp, | |
| 1931 opj_tcd_tilecomp_t * tilec, | |
| 1932 opj_encode_and_deinterleave_v_fnptr_type p_encode_and_deinterleave_v, | |
| 1933 opj_encode_and_deinterleave_h_one_row_fnptr_type | |
| 1934 p_encode_and_deinterleave_h_one_row) | |
| 1935 { | |
| 1936 OPJ_INT32 i; | |
| 1937 OPJ_INT32 *bj = 00; | |
| 1938 OPJ_UINT32 w; | |
| 1939 OPJ_INT32 l; | |
| 1940 | |
| 1941 OPJ_SIZE_T l_data_size; | |
| 1942 | |
| 1943 opj_tcd_resolution_t * l_cur_res = 0; | |
| 1944 opj_tcd_resolution_t * l_last_res = 0; | |
| 1945 const int num_threads = opj_thread_pool_get_thread_count(tp); | |
| 1946 OPJ_INT32 * OPJ_RESTRICT tiledp = tilec->data; | |
| 1947 | |
| 1948 w = (OPJ_UINT32)(tilec->x1 - tilec->x0); | |
| 1949 l = (OPJ_INT32)tilec->numresolutions - 1; | |
| 1950 | |
| 1951 l_cur_res = tilec->resolutions + l; | |
| 1952 l_last_res = l_cur_res - 1; | |
| 1953 | |
| 1954 l_data_size = opj_dwt_max_resolution(tilec->resolutions, tilec->numresolutions); | |
| 1955 /* overflow check */ | |
| 1956 if (l_data_size > (SIZE_MAX / (NB_ELTS_V8 * sizeof(OPJ_INT32)))) { | |
| 1957 /* FIXME event manager error callback */ | |
| 1958 return OPJ_FALSE; | |
| 1959 } | |
| 1960 l_data_size *= NB_ELTS_V8 * sizeof(OPJ_INT32); | |
| 1961 bj = (OPJ_INT32*)opj_aligned_32_malloc(l_data_size); | |
| 1962 /* l_data_size is equal to 0 when numresolutions == 1 but bj is not used */ | |
| 1963 /* in that case, so do not error out */ | |
| 1964 if (l_data_size != 0 && ! bj) { | |
| 1965 return OPJ_FALSE; | |
| 1966 } | |
| 1967 i = l; | |
| 1968 | |
| 1969 while (i--) { | |
| 1970 OPJ_UINT32 j; | |
| 1971 OPJ_UINT32 rw; /* width of the resolution level computed */ | |
| 1972 OPJ_UINT32 rh; /* height of the resolution level computed */ | |
| 1973 OPJ_UINT32 | |
| 1974 rw1; /* width of the resolution level once lower than computed one */ | |
| 1975 OPJ_UINT32 | |
| 1976 rh1; /* height of the resolution level once lower than computed one */ | |
| 1977 OPJ_INT32 cas_col; /* 0 = non inversion on horizontal filtering 1 = inversion between low-pass and high-pass filtering */ | |
| 1978 OPJ_INT32 cas_row; /* 0 = non inversion on vertical filtering 1 = inversion between low-pass and high-pass filtering */ | |
| 1979 OPJ_INT32 dn, sn; | |
| 1980 | |
| 1981 rw = (OPJ_UINT32)(l_cur_res->x1 - l_cur_res->x0); | |
| 1982 rh = (OPJ_UINT32)(l_cur_res->y1 - l_cur_res->y0); | |
| 1983 rw1 = (OPJ_UINT32)(l_last_res->x1 - l_last_res->x0); | |
| 1984 rh1 = (OPJ_UINT32)(l_last_res->y1 - l_last_res->y0); | |
| 1985 | |
| 1986 cas_row = l_cur_res->x0 & 1; | |
| 1987 cas_col = l_cur_res->y0 & 1; | |
| 1988 | |
| 1989 sn = (OPJ_INT32)rh1; | |
| 1990 dn = (OPJ_INT32)(rh - rh1); | |
| 1991 | |
| 1992 /* Perform vertical pass */ | |
| 1993 if (num_threads <= 1 || rw < 2 * NB_ELTS_V8) { | |
| 1994 for (j = 0; j + NB_ELTS_V8 - 1 < rw; j += NB_ELTS_V8) { | |
| 1995 p_encode_and_deinterleave_v(tiledp + j, | |
| 1996 bj, | |
| 1997 rh, | |
| 1998 cas_col == 0, | |
| 1999 w, | |
| 2000 NB_ELTS_V8); | |
| 2001 } | |
| 2002 if (j < rw) { | |
| 2003 p_encode_and_deinterleave_v(tiledp + j, | |
| 2004 bj, | |
| 2005 rh, | |
| 2006 cas_col == 0, | |
| 2007 w, | |
| 2008 rw - j); | |
| 2009 } | |
| 2010 } else { | |
| 2011 OPJ_UINT32 num_jobs = (OPJ_UINT32)num_threads; | |
| 2012 OPJ_UINT32 step_j; | |
| 2013 | |
| 2014 if (rw < num_jobs) { | |
| 2015 num_jobs = rw; | |
| 2016 } | |
| 2017 step_j = ((rw / num_jobs) / NB_ELTS_V8) * NB_ELTS_V8; | |
| 2018 | |
| 2019 for (j = 0; j < num_jobs; j++) { | |
| 2020 opj_dwt_encode_v_job_t* job; | |
| 2021 | |
| 2022 job = (opj_dwt_encode_v_job_t*) opj_malloc(sizeof(opj_dwt_encode_v_job_t)); | |
| 2023 if (!job) { | |
| 2024 opj_thread_pool_wait_completion(tp, 0); | |
| 2025 opj_aligned_free(bj); | |
| 2026 return OPJ_FALSE; | |
| 2027 } | |
| 2028 job->v.mem = (OPJ_INT32*)opj_aligned_32_malloc(l_data_size); | |
| 2029 if (!job->v.mem) { | |
| 2030 opj_thread_pool_wait_completion(tp, 0); | |
| 2031 opj_free(job); | |
| 2032 opj_aligned_free(bj); | |
| 2033 return OPJ_FALSE; | |
| 2034 } | |
| 2035 job->v.dn = dn; | |
| 2036 job->v.sn = sn; | |
| 2037 job->v.cas = cas_col; | |
| 2038 job->rh = rh; | |
| 2039 job->w = w; | |
| 2040 job->tiledp = tiledp; | |
| 2041 job->min_j = j * step_j; | |
| 2042 job->max_j = (j + 1 == num_jobs) ? rw : (j + 1) * step_j; | |
| 2043 job->p_encode_and_deinterleave_v = p_encode_and_deinterleave_v; | |
| 2044 opj_thread_pool_submit_job(tp, opj_dwt_encode_v_func, job); | |
| 2045 } | |
| 2046 opj_thread_pool_wait_completion(tp, 0); | |
| 2047 } | |
| 2048 | |
| 2049 sn = (OPJ_INT32)rw1; | |
| 2050 dn = (OPJ_INT32)(rw - rw1); | |
| 2051 | |
| 2052 /* Perform horizontal pass */ | |
| 2053 if (num_threads <= 1 || rh <= 1) { | |
| 2054 for (j = 0; j < rh; j++) { | |
| 2055 OPJ_INT32* OPJ_RESTRICT aj = tiledp + j * w; | |
| 2056 (*p_encode_and_deinterleave_h_one_row)(aj, bj, rw, | |
| 2057 cas_row == 0 ? OPJ_TRUE : OPJ_FALSE); | |
| 2058 } | |
| 2059 } else { | |
| 2060 OPJ_UINT32 num_jobs = (OPJ_UINT32)num_threads; | |
| 2061 OPJ_UINT32 step_j; | |
| 2062 | |
| 2063 if (rh < num_jobs) { | |
| 2064 num_jobs = rh; | |
| 2065 } | |
| 2066 step_j = (rh / num_jobs); | |
| 2067 | |
| 2068 for (j = 0; j < num_jobs; j++) { | |
| 2069 opj_dwt_encode_h_job_t* job; | |
| 2070 | |
| 2071 job = (opj_dwt_encode_h_job_t*) opj_malloc(sizeof(opj_dwt_encode_h_job_t)); | |
| 2072 if (!job) { | |
| 2073 opj_thread_pool_wait_completion(tp, 0); | |
| 2074 opj_aligned_free(bj); | |
| 2075 return OPJ_FALSE; | |
| 2076 } | |
| 2077 job->h.mem = (OPJ_INT32*)opj_aligned_32_malloc(l_data_size); | |
| 2078 if (!job->h.mem) { | |
| 2079 opj_thread_pool_wait_completion(tp, 0); | |
| 2080 opj_free(job); | |
| 2081 opj_aligned_free(bj); | |
| 2082 return OPJ_FALSE; | |
| 2083 } | |
| 2084 job->h.dn = dn; | |
| 2085 job->h.sn = sn; | |
| 2086 job->h.cas = cas_row; | |
| 2087 job->rw = rw; | |
| 2088 job->w = w; | |
| 2089 job->tiledp = tiledp; | |
| 2090 job->min_j = j * step_j; | |
| 2091 job->max_j = (j + 1U) * step_j; /* this can overflow */ | |
| 2092 if (j == (num_jobs - 1U)) { /* this will take care of the overflow */ | |
| 2093 job->max_j = rh; | |
| 2094 } | |
| 2095 job->p_function = p_encode_and_deinterleave_h_one_row; | |
| 2096 opj_thread_pool_submit_job(tp, opj_dwt_encode_h_func, job); | |
| 2097 } | |
| 2098 opj_thread_pool_wait_completion(tp, 0); | |
| 2099 } | |
| 2100 | |
| 2101 l_cur_res = l_last_res; | |
| 2102 | |
| 2103 --l_last_res; | |
| 2104 } | |
| 2105 | |
| 2106 opj_aligned_free(bj); | |
| 2107 return OPJ_TRUE; | |
| 2108 } | |
| 2109 | |
| 2110 /* Forward 5-3 wavelet transform in 2-D. */ | |
| 2111 /* </summary> */ | |
| 2112 OPJ_BOOL opj_dwt_encode(opj_tcd_t *p_tcd, | |
| 2113 opj_tcd_tilecomp_t * tilec) | |
| 2114 { | |
| 2115 return opj_dwt_encode_procedure(p_tcd->thread_pool, tilec, | |
| 2116 opj_dwt_encode_and_deinterleave_v, | |
| 2117 opj_dwt_encode_and_deinterleave_h_one_row); | |
| 2118 } | |
| 2119 | |
| 2120 /* <summary> */ | |
| 2121 /* Inverse 5-3 wavelet transform in 2-D. */ | |
| 2122 /* </summary> */ | |
| 2123 OPJ_BOOL opj_dwt_decode(opj_tcd_t *p_tcd, opj_tcd_tilecomp_t* tilec, | |
| 2124 OPJ_UINT32 numres) | |
| 2125 { | |
| 2126 if (p_tcd->whole_tile_decoding) { | |
| 2127 return opj_dwt_decode_tile(p_tcd->thread_pool, tilec, numres); | |
| 2128 } else { | |
| 2129 return opj_dwt_decode_partial_tile(tilec, numres); | |
| 2130 } | |
| 2131 } | |
| 2132 | |
| 2133 /* <summary> */ | |
| 2134 /* Get norm of 5-3 wavelet. */ | |
| 2135 /* </summary> */ | |
| 2136 OPJ_FLOAT64 opj_dwt_getnorm(OPJ_UINT32 level, OPJ_UINT32 orient) | |
| 2137 { | |
| 2138 /* FIXME ! This is just a band-aid to avoid a buffer overflow */ | |
| 2139 /* but the array should really be extended up to 33 resolution levels */ | |
| 2140 /* See https://github.com/uclouvain/openjpeg/issues/493 */ | |
| 2141 if (orient == 0 && level >= 10) { | |
| 2142 level = 9; | |
| 2143 } else if (orient > 0 && level >= 9) { | |
| 2144 level = 8; | |
| 2145 } | |
| 2146 return opj_dwt_norms[orient][level]; | |
| 2147 } | |
| 2148 | |
| 2149 /* <summary> */ | |
| 2150 /* Forward 9-7 wavelet transform in 2-D. */ | |
| 2151 /* </summary> */ | |
| 2152 OPJ_BOOL opj_dwt_encode_real(opj_tcd_t *p_tcd, | |
| 2153 opj_tcd_tilecomp_t * tilec) | |
| 2154 { | |
| 2155 return opj_dwt_encode_procedure(p_tcd->thread_pool, tilec, | |
| 2156 opj_dwt_encode_and_deinterleave_v_real, | |
| 2157 opj_dwt_encode_and_deinterleave_h_one_row_real); | |
| 2158 } | |
| 2159 | |
| 2160 /* <summary> */ | |
| 2161 /* Get norm of 9-7 wavelet. */ | |
| 2162 /* </summary> */ | |
| 2163 OPJ_FLOAT64 opj_dwt_getnorm_real(OPJ_UINT32 level, OPJ_UINT32 orient) | |
| 2164 { | |
| 2165 /* FIXME ! This is just a band-aid to avoid a buffer overflow */ | |
| 2166 /* but the array should really be extended up to 33 resolution levels */ | |
| 2167 /* See https://github.com/uclouvain/openjpeg/issues/493 */ | |
| 2168 if (orient == 0 && level >= 10) { | |
| 2169 level = 9; | |
| 2170 } else if (orient > 0 && level >= 9) { | |
| 2171 level = 8; | |
| 2172 } | |
| 2173 return opj_dwt_norms_real[orient][level]; | |
| 2174 } | |
| 2175 | |
| 2176 void opj_dwt_calc_explicit_stepsizes(opj_tccp_t * tccp, OPJ_UINT32 prec) | |
| 2177 { | |
| 2178 OPJ_UINT32 numbands, bandno; | |
| 2179 numbands = 3 * tccp->numresolutions - 2; | |
| 2180 for (bandno = 0; bandno < numbands; bandno++) { | |
| 2181 OPJ_FLOAT64 stepsize; | |
| 2182 OPJ_UINT32 resno, level, orient, gain; | |
| 2183 | |
| 2184 resno = (bandno == 0) ? 0 : ((bandno - 1) / 3 + 1); | |
| 2185 orient = (bandno == 0) ? 0 : ((bandno - 1) % 3 + 1); | |
| 2186 level = tccp->numresolutions - 1 - resno; | |
| 2187 gain = (tccp->qmfbid == 0) ? 0 : ((orient == 0) ? 0 : (((orient == 1) || | |
| 2188 (orient == 2)) ? 1 : 2)); | |
| 2189 if (tccp->qntsty == J2K_CCP_QNTSTY_NOQNT) { | |
| 2190 stepsize = 1.0; | |
| 2191 } else { | |
| 2192 OPJ_FLOAT64 norm = opj_dwt_getnorm_real(level, orient); | |
| 2193 stepsize = (1 << (gain)) / norm; | |
| 2194 } | |
| 2195 opj_dwt_encode_stepsize((OPJ_INT32) floor(stepsize * 8192.0), | |
| 2196 (OPJ_INT32)(prec + gain), &tccp->stepsizes[bandno]); | |
| 2197 } | |
| 2198 } | |
| 2199 | |
| 2200 /* <summary> */ | |
| 2201 /* Determine maximum computed resolution level for inverse wavelet transform */ | |
| 2202 /* </summary> */ | |
| 2203 static OPJ_UINT32 opj_dwt_max_resolution(opj_tcd_resolution_t* OPJ_RESTRICT r, | |
| 2204 OPJ_UINT32 i) | |
| 2205 { | |
| 2206 OPJ_UINT32 mr = 0; | |
| 2207 OPJ_UINT32 w; | |
| 2208 while (--i) { | |
| 2209 ++r; | |
| 2210 if (mr < (w = (OPJ_UINT32)(r->x1 - r->x0))) { | |
| 2211 mr = w ; | |
| 2212 } | |
| 2213 if (mr < (w = (OPJ_UINT32)(r->y1 - r->y0))) { | |
| 2214 mr = w ; | |
| 2215 } | |
| 2216 } | |
| 2217 return mr ; | |
| 2218 } | |
| 2219 | |
| 2220 typedef struct { /* Work item for one horizontal inverse 5-3 job, consumed by opj_dwt_decode_h_func */ | |
| 2221 opj_dwt_t h; /* transform parameters for the rows; h.mem is this job's own scratch buffer, freed by the job */ | |
| 2222 OPJ_UINT32 rw; /* width of the resolution level being reconstructed */ | |
| 2223 OPJ_UINT32 w; /* row stride of tiledp, in samples (row j starts at tiledp[j * w]) */ | |
| 2224 OPJ_INT32 * OPJ_RESTRICT tiledp; /* tile component sample buffer */ | |
| 2225 OPJ_UINT32 min_j; /* first row handled by this job (inclusive) */ | |
| 2226 OPJ_UINT32 max_j; /* end of the row range (exclusive) */ | |
| 2227 } opj_dwt_decode_h_job_t; | |
| 2228 | |
| 2229 static void opj_dwt_decode_h_func(void* user_data, opj_tls_t* tls) | |
| 2230 { | |
| 2231 OPJ_UINT32 j; | |
| 2232 opj_dwt_decode_h_job_t* job; | |
| 2233 (void)tls; | |
| 2234 | |
| 2235 job = (opj_dwt_decode_h_job_t*)user_data; | |
| 2236 for (j = job->min_j; j < job->max_j; j++) { | |
| 2237 opj_idwt53_h(&job->h, &job->tiledp[j * job->w]); | |
| 2238 } | |
| 2239 | |
| 2240 opj_aligned_free(job->h.mem); | |
| 2241 opj_free(job); | |
| 2242 } | |
| 2243 | |
| 2244 typedef struct { /* Work item for one vertical inverse 5-3 job, consumed by opj_dwt_decode_v_func */ | |
| 2245 opj_dwt_t v; /* transform parameters for the columns; v.mem is this job's own scratch buffer, freed by the job */ | |
| 2246 OPJ_UINT32 rh; /* height of the resolution level being reconstructed */ | |
| 2247 OPJ_UINT32 w; /* width of a tiledp row, in samples; forwarded to opj_idwt53_v */ | |
| 2248 OPJ_INT32 * OPJ_RESTRICT tiledp; /* tile component sample buffer */ | |
| 2249 OPJ_UINT32 min_j; /* first column handled by this job (inclusive) */ | |
| 2250 OPJ_UINT32 max_j; /* end of the column range (exclusive) */ | |
| 2251 } opj_dwt_decode_v_job_t; | |
| 2252 | |
| 2253 static void opj_dwt_decode_v_func(void* user_data, opj_tls_t* tls) | |
| 2254 { | |
| 2255 OPJ_UINT32 j; | |
| 2256 opj_dwt_decode_v_job_t* job; | |
| 2257 (void)tls; | |
| 2258 | |
| 2259 job = (opj_dwt_decode_v_job_t*)user_data; | |
| 2260 for (j = job->min_j; j + PARALLEL_COLS_53 <= job->max_j; | |
| 2261 j += PARALLEL_COLS_53) { | |
| 2262 opj_idwt53_v(&job->v, &job->tiledp[j], (OPJ_SIZE_T)job->w, | |
| 2263 PARALLEL_COLS_53); | |
| 2264 } | |
| 2265 if (j < job->max_j) | |
| 2266 opj_idwt53_v(&job->v, &job->tiledp[j], (OPJ_SIZE_T)job->w, | |
| 2267 (OPJ_INT32)(job->max_j - j)); | |
| 2268 | |
| 2269 opj_aligned_free(job->v.mem); | |
| 2270 opj_free(job); | |
| 2271 } | |
| 2272 | |
| 2273 | |
| 2274 /* <summary> */ | |
| 2275 /* Inverse wavelet transform in 2-D. */ | |
| 2276 /* </summary> */ | |
| 2277 static OPJ_BOOL opj_dwt_decode_tile(opj_thread_pool_t* tp, | |
| 2278 opj_tcd_tilecomp_t* tilec, OPJ_UINT32 numres) | |
| 2279 { | |
| 2280 opj_dwt_t h; | |
| 2281 opj_dwt_t v; | |
| 2282 | |
| 2283 opj_tcd_resolution_t* tr = tilec->resolutions; | |
| 2284 | |
| 2285 OPJ_UINT32 rw = (OPJ_UINT32)(tr->x1 - | |
| 2286 tr->x0); /* width of the resolution level computed */ | |
| 2287 OPJ_UINT32 rh = (OPJ_UINT32)(tr->y1 - | |
| 2288 tr->y0); /* height of the resolution level computed */ | |
| 2289 | |
| 2290 OPJ_UINT32 w = (OPJ_UINT32)(tilec->resolutions[tilec->minimum_num_resolutions - | |
| 2291 1].x1 - | |
| 2292 tilec->resolutions[tilec->minimum_num_resolutions - 1].x0); | |
| 2293 OPJ_SIZE_T h_mem_size; | |
| 2294 int num_threads; | |
| 2295 | |
| 2296 /* Not entirely sure for the return code of w == 0 which is triggered per */ | |
| 2297 /* https://github.com/uclouvain/openjpeg/issues/1505 */ | |
| 2298 if (numres == 1U || w == 0) { | |
| 2299 return OPJ_TRUE; | |
| 2300 } | |
| 2301 num_threads = opj_thread_pool_get_thread_count(tp); | |
| 2302 h_mem_size = opj_dwt_max_resolution(tr, numres); | |
| 2303 /* overflow check */ | |
| 2304 if (h_mem_size > (SIZE_MAX / PARALLEL_COLS_53 / sizeof(OPJ_INT32))) { | |
| 2305 /* FIXME event manager error callback */ | |
| 2306 return OPJ_FALSE; | |
| 2307 } | |
| 2308 /* We need PARALLEL_COLS_53 times the height of the array, */ | |
| 2309 /* since for the vertical pass */ | |
| 2310 /* we process PARALLEL_COLS_53 columns at a time */ | |
| 2311 h_mem_size *= PARALLEL_COLS_53 * sizeof(OPJ_INT32); | |
| 2312 h.mem = (OPJ_INT32*)opj_aligned_32_malloc(h_mem_size); | |
| 2313 if (! h.mem) { | |
| 2314 /* FIXME event manager error callback */ | |
| 2315 return OPJ_FALSE; | |
| 2316 } | |
| 2317 | |
| 2318 v.mem = h.mem; | |
| 2319 | |
| 2320 while (--numres) { | |
| 2321 OPJ_INT32 * OPJ_RESTRICT tiledp = tilec->data; | |
| 2322 OPJ_UINT32 j; | |
| 2323 | |
| 2324 ++tr; | |
| 2325 h.sn = (OPJ_INT32)rw; | |
| 2326 v.sn = (OPJ_INT32)rh; | |
| 2327 | |
| 2328 rw = (OPJ_UINT32)(tr->x1 - tr->x0); | |
| 2329 rh = (OPJ_UINT32)(tr->y1 - tr->y0); | |
| 2330 | |
| 2331 h.dn = (OPJ_INT32)(rw - (OPJ_UINT32)h.sn); | |
| 2332 h.cas = tr->x0 % 2; | |
| 2333 | |
| 2334 if (num_threads <= 1 || rh <= 1) { | |
| 2335 for (j = 0; j < rh; ++j) { | |
| 2336 opj_idwt53_h(&h, &tiledp[(OPJ_SIZE_T)j * w]); | |
| 2337 } | |
| 2338 } else { | |
| 2339 OPJ_UINT32 num_jobs = (OPJ_UINT32)num_threads; | |
| 2340 OPJ_UINT32 step_j; | |
| 2341 | |
| 2342 if (rh < num_jobs) { | |
| 2343 num_jobs = rh; | |
| 2344 } | |
| 2345 step_j = (rh / num_jobs); | |
| 2346 | |
| 2347 for (j = 0; j < num_jobs; j++) { | |
| 2348 opj_dwt_decode_h_job_t* job; | |
| 2349 | |
| 2350 job = (opj_dwt_decode_h_job_t*) opj_malloc(sizeof(opj_dwt_decode_h_job_t)); | |
| 2351 if (!job) { | |
| 2352 /* It would be nice to fallback to single thread case, but */ | |
| 2353 /* unfortunately some jobs may be launched and have modified */ | |
| 2354 /* tiledp, so it is not practical to recover from that error */ | |
| 2355 /* FIXME event manager error callback */ | |
| 2356 opj_thread_pool_wait_completion(tp, 0); | |
| 2357 opj_aligned_free(h.mem); | |
| 2358 return OPJ_FALSE; | |
| 2359 } | |
| 2360 job->h = h; | |
| 2361 job->rw = rw; | |
| 2362 job->w = w; | |
| 2363 job->tiledp = tiledp; | |
| 2364 job->min_j = j * step_j; | |
| 2365 job->max_j = (j + 1U) * step_j; /* this can overflow */ | |
| 2366 if (j == (num_jobs - 1U)) { /* this will take care of the overflow */ | |
| 2367 job->max_j = rh; | |
| 2368 } | |
| 2369 job->h.mem = (OPJ_INT32*)opj_aligned_32_malloc(h_mem_size); | |
| 2370 if (!job->h.mem) { | |
| 2371 /* FIXME event manager error callback */ | |
| 2372 opj_thread_pool_wait_completion(tp, 0); | |
| 2373 opj_free(job); | |
| 2374 opj_aligned_free(h.mem); | |
| 2375 return OPJ_FALSE; | |
| 2376 } | |
| 2377 opj_thread_pool_submit_job(tp, opj_dwt_decode_h_func, job); | |
| 2378 } | |
| 2379 opj_thread_pool_wait_completion(tp, 0); | |
| 2380 } | |
| 2381 | |
| 2382 v.dn = (OPJ_INT32)(rh - (OPJ_UINT32)v.sn); | |
| 2383 v.cas = tr->y0 % 2; | |
| 2384 | |
| 2385 if (num_threads <= 1 || rw <= 1) { | |
| 2386 for (j = 0; j + PARALLEL_COLS_53 <= rw; | |
| 2387 j += PARALLEL_COLS_53) { | |
| 2388 opj_idwt53_v(&v, &tiledp[j], (OPJ_SIZE_T)w, PARALLEL_COLS_53); | |
| 2389 } | |
| 2390 if (j < rw) { | |
| 2391 opj_idwt53_v(&v, &tiledp[j], (OPJ_SIZE_T)w, (OPJ_INT32)(rw - j)); | |
| 2392 } | |
| 2393 } else { | |
| 2394 OPJ_UINT32 num_jobs = (OPJ_UINT32)num_threads; | |
| 2395 OPJ_UINT32 step_j; | |
| 2396 | |
| 2397 if (rw < num_jobs) { | |
| 2398 num_jobs = rw; | |
| 2399 } | |
| 2400 step_j = (rw / num_jobs); | |
| 2401 | |
| 2402 for (j = 0; j < num_jobs; j++) { | |
| 2403 opj_dwt_decode_v_job_t* job; | |
| 2404 | |
| 2405 job = (opj_dwt_decode_v_job_t*) opj_malloc(sizeof(opj_dwt_decode_v_job_t)); | |
| 2406 if (!job) { | |
| 2407 /* It would be nice to fallback to single thread case, but */ | |
| 2408 /* unfortunately some jobs may be launched and have modified */ | |
| 2409 /* tiledp, so it is not practical to recover from that error */ | |
| 2410 /* FIXME event manager error callback */ | |
| 2411 opj_thread_pool_wait_completion(tp, 0); | |
| 2412 opj_aligned_free(v.mem); | |
| 2413 return OPJ_FALSE; | |
| 2414 } | |
| 2415 job->v = v; | |
| 2416 job->rh = rh; | |
| 2417 job->w = w; | |
| 2418 job->tiledp = tiledp; | |
| 2419 job->min_j = j * step_j; | |
| 2420 job->max_j = (j + 1U) * step_j; /* this can overflow */ | |
| 2421 if (j == (num_jobs - 1U)) { /* this will take care of the overflow */ | |
| 2422 job->max_j = rw; | |
| 2423 } | |
| 2424 job->v.mem = (OPJ_INT32*)opj_aligned_32_malloc(h_mem_size); | |
| 2425 if (!job->v.mem) { | |
| 2426 /* FIXME event manager error callback */ | |
| 2427 opj_thread_pool_wait_completion(tp, 0); | |
| 2428 opj_free(job); | |
| 2429 opj_aligned_free(v.mem); | |
| 2430 return OPJ_FALSE; | |
| 2431 } | |
| 2432 opj_thread_pool_submit_job(tp, opj_dwt_decode_v_func, job); | |
| 2433 } | |
| 2434 opj_thread_pool_wait_completion(tp, 0); | |
| 2435 } | |
| 2436 } | |
| 2437 opj_aligned_free(h.mem); | |
| 2438 return OPJ_TRUE; | |
| 2439 } | |
| 2440 | |
| 2441 static void opj_dwt_interleave_partial_h(OPJ_INT32 *dest, | |
| 2442 OPJ_INT32 cas, | |
| 2443 opj_sparse_array_int32_t* sa, | |
| 2444 OPJ_UINT32 sa_line, | |
| 2445 OPJ_UINT32 sn, | |
| 2446 OPJ_UINT32 win_l_x0, | |
| 2447 OPJ_UINT32 win_l_x1, | |
| 2448 OPJ_UINT32 win_h_x0, | |
| 2449 OPJ_UINT32 win_h_x1) | |
| 2450 { | |
| 2451 OPJ_BOOL ret; | |
| 2452 ret = opj_sparse_array_int32_read(sa, | |
| 2453 win_l_x0, sa_line, | |
| 2454 win_l_x1, sa_line + 1, | |
| 2455 dest + cas + 2 * win_l_x0, | |
| 2456 2, 0, OPJ_TRUE); | |
| 2457 assert(ret); | |
| 2458 ret = opj_sparse_array_int32_read(sa, | |
| 2459 sn + win_h_x0, sa_line, | |
| 2460 sn + win_h_x1, sa_line + 1, | |
| 2461 dest + 1 - cas + 2 * win_h_x0, | |
| 2462 2, 0, OPJ_TRUE); | |
| 2463 assert(ret); | |
| 2464 OPJ_UNUSED(ret); | |
| 2465 } | |
| 2466 | |
| 2467 | |
| 2468 static void opj_dwt_interleave_partial_v(OPJ_INT32 *dest, | |
| 2469 OPJ_INT32 cas, | |
| 2470 opj_sparse_array_int32_t* sa, | |
| 2471 OPJ_UINT32 sa_col, | |
| 2472 OPJ_UINT32 nb_cols, | |
| 2473 OPJ_UINT32 sn, | |
| 2474 OPJ_UINT32 win_l_y0, | |
| 2475 OPJ_UINT32 win_l_y1, | |
| 2476 OPJ_UINT32 win_h_y0, | |
| 2477 OPJ_UINT32 win_h_y1) | |
| 2478 { | |
| 2479 OPJ_BOOL ret; | |
| 2480 ret = opj_sparse_array_int32_read(sa, | |
| 2481 sa_col, win_l_y0, | |
| 2482 sa_col + nb_cols, win_l_y1, | |
| 2483 dest + cas * 4 + 2 * 4 * win_l_y0, | |
| 2484 1, 2 * 4, OPJ_TRUE); | |
| 2485 assert(ret); | |
| 2486 ret = opj_sparse_array_int32_read(sa, | |
| 2487 sa_col, sn + win_h_y0, | |
| 2488 sa_col + nb_cols, sn + win_h_y1, | |
| 2489 dest + (1 - cas) * 4 + 2 * 4 * win_h_y0, | |
| 2490 1, 2 * 4, OPJ_TRUE); | |
| 2491 assert(ret); | |
| 2492 OPJ_UNUSED(ret); | |
| 2493 } | |
| 2494 | |
| 2495 static void opj_dwt_decode_partial_1(OPJ_INT32 *a, OPJ_INT32 dn, OPJ_INT32 sn, | |
| 2496 OPJ_INT32 cas, | |
| 2497 OPJ_INT32 win_l_x0, | |
| 2498 OPJ_INT32 win_l_x1, | |
| 2499 OPJ_INT32 win_h_x0, | |
| 2500 OPJ_INT32 win_h_x1) | |
| 2501 { | |
| 2502 OPJ_INT32 i; | |
| 2503 | |
| 2504 if (!cas) { | |
| 2505 if ((dn > 0) || (sn > 1)) { /* NEW : CASE ONE ELEMENT */ | |
| 2506 | |
| 2507 /* Naive version is : | |
| 2508 for (i = win_l_x0; i < i_max; i++) { | |
| 2509 OPJ_S(i) -= (OPJ_D_(i - 1) + OPJ_D_(i) + 2) >> 2; | |
| 2510 } | |
| 2511 for (i = win_h_x0; i < win_h_x1; i++) { | |
| 2512 OPJ_D(i) += (OPJ_S_(i) + OPJ_S_(i + 1)) >> 1; | |
| 2513 } | |
| 2514 but the compiler doesn't manage to unroll it to avoid bound | |
| 2515 checking in OPJ_S_ and OPJ_D_ macros | |
| 2516 */ | |
| 2517 | |
| 2518 i = win_l_x0; | |
| 2519 if (i < win_l_x1) { | |
| 2520 OPJ_INT32 i_max; | |
| 2521 | |
| 2522 /* Left-most case */ | |
| 2523 OPJ_S(i) -= (OPJ_D_(i - 1) + OPJ_D_(i) + 2) >> 2; | |
| 2524 i ++; | |
| 2525 | |
| 2526 i_max = win_l_x1; | |
| 2527 if (i_max > dn) { | |
| 2528 i_max = dn; | |
| 2529 } | |
| 2530 for (; i < i_max; i++) { | |
| 2531 /* No bound checking */ | |
| 2532 OPJ_S(i) -= (OPJ_D(i - 1) + OPJ_D(i) + 2) >> 2; | |
| 2533 } | |
| 2534 for (; i < win_l_x1; i++) { | |
| 2535 /* Right-most case */ | |
| 2536 OPJ_S(i) -= (OPJ_D_(i - 1) + OPJ_D_(i) + 2) >> 2; | |
| 2537 } | |
| 2538 } | |
| 2539 | |
| 2540 i = win_h_x0; | |
| 2541 if (i < win_h_x1) { | |
| 2542 OPJ_INT32 i_max = win_h_x1; | |
| 2543 if (i_max >= sn) { | |
| 2544 i_max = sn - 1; | |
| 2545 } | |
| 2546 for (; i < i_max; i++) { | |
| 2547 /* No bound checking */ | |
| 2548 OPJ_D(i) += (OPJ_S(i) + OPJ_S(i + 1)) >> 1; | |
| 2549 } | |
| 2550 for (; i < win_h_x1; i++) { | |
| 2551 /* Right-most case */ | |
| 2552 OPJ_D(i) += (OPJ_S_(i) + OPJ_S_(i + 1)) >> 1; | |
| 2553 } | |
| 2554 } | |
| 2555 } | |
| 2556 } else { | |
| 2557 if (!sn && dn == 1) { /* NEW : CASE ONE ELEMENT */ | |
| 2558 OPJ_S(0) /= 2; | |
| 2559 } else { | |
| 2560 for (i = win_l_x0; i < win_l_x1; i++) { | |
| 2561 OPJ_D(i) = opj_int_sub_no_overflow(OPJ_D(i), | |
| 2562 opj_int_add_no_overflow(opj_int_add_no_overflow(OPJ_SS_(i), OPJ_SS_(i + 1)), | |
| 2563 2) >> 2); | |
| 2564 } | |
| 2565 for (i = win_h_x0; i < win_h_x1; i++) { | |
| 2566 OPJ_S(i) = opj_int_add_no_overflow(OPJ_S(i), | |
| 2567 opj_int_add_no_overflow(OPJ_DD_(i), OPJ_DD_(i - 1)) >> 1); | |
| 2568 } | |
| 2569 } | |
| 2570 } | |
| 2571 } | |
| 2572 | |
/* Accessors for a buffer 'a' holding 4 interleaved columns: position i of */
/* the first sample set starts at a[i * 2 * 4], position i of the second */
/* one 4 entries later; 'off' selects the column inside the group of 4. */
/* The macros expect 'a', 'sn' and 'dn' to be in scope at the point of use. */
/* 'off' is parenthesized so that any expression can be passed safely. */
#define OPJ_S_off(i,off) a[(OPJ_UINT32)(i)*2*4+(off)]
#define OPJ_D_off(i,off) a[(1+(OPJ_UINT32)(i)*2)*4+(off)]
/* Clamped variants: an index below 0 reads position 0, an index at or */
/* above the limit (sn or dn) reads the last valid position. */
#define OPJ_S__off(i,off) ((i)<0?OPJ_S_off(0,off):((i)>=sn?OPJ_S_off(sn-1,off):OPJ_S_off(i,off)))
#define OPJ_D__off(i,off) ((i)<0?OPJ_D_off(0,off):((i)>=dn?OPJ_D_off(dn-1,off):OPJ_D_off(i,off)))
/* Same clamping, but with the limits swapped (dn for S, sn for D). */
#define OPJ_SS__off(i,off) ((i)<0?OPJ_S_off(0,off):((i)>=dn?OPJ_S_off(dn-1,off):OPJ_S_off(i,off)))
#define OPJ_DD__off(i,off) ((i)<0?OPJ_D_off(0,off):((i)>=sn?OPJ_D_off(sn-1,off):OPJ_D_off(i,off)))
| 2579 | |
| 2580 static void opj_dwt_decode_partial_1_parallel(OPJ_INT32 *a, | |
| 2581 OPJ_UINT32 nb_cols, | |
| 2582 OPJ_INT32 dn, OPJ_INT32 sn, | |
| 2583 OPJ_INT32 cas, | |
| 2584 OPJ_INT32 win_l_x0, | |
| 2585 OPJ_INT32 win_l_x1, | |
| 2586 OPJ_INT32 win_h_x0, | |
| 2587 OPJ_INT32 win_h_x1) | |
| 2588 { | |
| 2589 OPJ_INT32 i; | |
| 2590 OPJ_UINT32 off; | |
| 2591 | |
| 2592 (void)nb_cols; | |
| 2593 | |
| 2594 if (!cas) { | |
| 2595 if ((dn > 0) || (sn > 1)) { /* NEW : CASE ONE ELEMENT */ | |
| 2596 | |
| 2597 /* Naive version is : | |
| 2598 for (i = win_l_x0; i < i_max; i++) { | |
| 2599 OPJ_S(i) -= (OPJ_D_(i - 1) + OPJ_D_(i) + 2) >> 2; | |
| 2600 } | |
| 2601 for (i = win_h_x0; i < win_h_x1; i++) { | |
| 2602 OPJ_D(i) += (OPJ_S_(i) + OPJ_S_(i + 1)) >> 1; | |
| 2603 } | |
| 2604 but the compiler doesn't manage to unroll it to avoid bound | |
| 2605 checking in OPJ_S_ and OPJ_D_ macros | |
| 2606 */ | |
| 2607 | |
| 2608 i = win_l_x0; | |
| 2609 if (i < win_l_x1) { | |
| 2610 OPJ_INT32 i_max; | |
| 2611 | |
| 2612 /* Left-most case */ | |
| 2613 for (off = 0; off < 4; off++) { | |
| 2614 OPJ_S_off(i, off) -= (OPJ_D__off(i - 1, off) + OPJ_D__off(i, off) + 2) >> 2; | |
| 2615 } | |
| 2616 i ++; | |
| 2617 | |
| 2618 i_max = win_l_x1; | |
| 2619 if (i_max > dn) { | |
| 2620 i_max = dn; | |
| 2621 } | |
| 2622 | |
| 2623 #ifdef __SSE2__ | |
| 2624 if (i + 1 < i_max) { | |
| 2625 const __m128i two = _mm_set1_epi32(2); | |
| 2626 __m128i Dm1 = _mm_load_si128((__m128i * const)(a + 4 + (i - 1) * 8)); | |
| 2627 for (; i + 1 < i_max; i += 2) { | |
| 2628 /* No bound checking */ | |
| 2629 __m128i S = _mm_load_si128((__m128i * const)(a + i * 8)); | |
| 2630 __m128i D = _mm_load_si128((__m128i * const)(a + 4 + i * 8)); | |
| 2631 __m128i S1 = _mm_load_si128((__m128i * const)(a + (i + 1) * 8)); | |
| 2632 __m128i D1 = _mm_load_si128((__m128i * const)(a + 4 + (i + 1) * 8)); | |
| 2633 S = _mm_sub_epi32(S, | |
| 2634 _mm_srai_epi32(_mm_add_epi32(_mm_add_epi32(Dm1, D), two), 2)); | |
| 2635 S1 = _mm_sub_epi32(S1, | |
| 2636 _mm_srai_epi32(_mm_add_epi32(_mm_add_epi32(D, D1), two), 2)); | |
| 2637 _mm_store_si128((__m128i*)(a + i * 8), S); | |
| 2638 _mm_store_si128((__m128i*)(a + (i + 1) * 8), S1); | |
| 2639 Dm1 = D1; | |
| 2640 } | |
| 2641 } | |
| 2642 #endif | |
| 2643 | |
| 2644 for (; i < i_max; i++) { | |
| 2645 /* No bound checking */ | |
| 2646 for (off = 0; off < 4; off++) { | |
| 2647 OPJ_S_off(i, off) -= (OPJ_D_off(i - 1, off) + OPJ_D_off(i, off) + 2) >> 2; | |
| 2648 } | |
| 2649 } | |
| 2650 for (; i < win_l_x1; i++) { | |
| 2651 /* Right-most case */ | |
| 2652 for (off = 0; off < 4; off++) { | |
| 2653 OPJ_S_off(i, off) -= (OPJ_D__off(i - 1, off) + OPJ_D__off(i, off) + 2) >> 2; | |
| 2654 } | |
| 2655 } | |
| 2656 } | |
| 2657 | |
| 2658 i = win_h_x0; | |
| 2659 if (i < win_h_x1) { | |
| 2660 OPJ_INT32 i_max = win_h_x1; | |
| 2661 if (i_max >= sn) { | |
| 2662 i_max = sn - 1; | |
| 2663 } | |
| 2664 | |
| 2665 #ifdef __SSE2__ | |
| 2666 if (i + 1 < i_max) { | |
| 2667 __m128i S = _mm_load_si128((__m128i * const)(a + i * 8)); | |
| 2668 for (; i + 1 < i_max; i += 2) { | |
| 2669 /* No bound checking */ | |
| 2670 __m128i D = _mm_load_si128((__m128i * const)(a + 4 + i * 8)); | |
| 2671 __m128i S1 = _mm_load_si128((__m128i * const)(a + (i + 1) * 8)); | |
| 2672 __m128i D1 = _mm_load_si128((__m128i * const)(a + 4 + (i + 1) * 8)); | |
| 2673 __m128i S2 = _mm_load_si128((__m128i * const)(a + (i + 2) * 8)); | |
| 2674 D = _mm_add_epi32(D, _mm_srai_epi32(_mm_add_epi32(S, S1), 1)); | |
| 2675 D1 = _mm_add_epi32(D1, _mm_srai_epi32(_mm_add_epi32(S1, S2), 1)); | |
| 2676 _mm_store_si128((__m128i*)(a + 4 + i * 8), D); | |
| 2677 _mm_store_si128((__m128i*)(a + 4 + (i + 1) * 8), D1); | |
| 2678 S = S2; | |
| 2679 } | |
| 2680 } | |
| 2681 #endif | |
| 2682 | |
| 2683 for (; i < i_max; i++) { | |
| 2684 /* No bound checking */ | |
| 2685 for (off = 0; off < 4; off++) { | |
| 2686 OPJ_D_off(i, off) += (OPJ_S_off(i, off) + OPJ_S_off(i + 1, off)) >> 1; | |
| 2687 } | |
| 2688 } | |
| 2689 for (; i < win_h_x1; i++) { | |
| 2690 /* Right-most case */ | |
| 2691 for (off = 0; off < 4; off++) { | |
| 2692 OPJ_D_off(i, off) += (OPJ_S__off(i, off) + OPJ_S__off(i + 1, off)) >> 1; | |
| 2693 } | |
| 2694 } | |
| 2695 } | |
| 2696 } | |
| 2697 } else { | |
| 2698 if (!sn && dn == 1) { /* NEW : CASE ONE ELEMENT */ | |
| 2699 for (off = 0; off < 4; off++) { | |
| 2700 OPJ_S_off(0, off) /= 2; | |
| 2701 } | |
| 2702 } else { | |
| 2703 for (i = win_l_x0; i < win_l_x1; i++) { | |
| 2704 for (off = 0; off < 4; off++) { | |
| 2705 OPJ_D_off(i, off) = opj_int_sub_no_overflow( | |
| 2706 OPJ_D_off(i, off), | |
| 2707 opj_int_add_no_overflow( | |
| 2708 opj_int_add_no_overflow(OPJ_SS__off(i, off), OPJ_SS__off(i + 1, off)), 2) >> 2); | |
| 2709 } | |
| 2710 } | |
| 2711 for (i = win_h_x0; i < win_h_x1; i++) { | |
| 2712 for (off = 0; off < 4; off++) { | |
| 2713 OPJ_S_off(i, off) = opj_int_add_no_overflow( | |
| 2714 OPJ_S_off(i, off), | |
| 2715 opj_int_add_no_overflow(OPJ_DD__off(i, off), OPJ_DD__off(i - 1, off)) >> 1); | |
| 2716 } | |
| 2717 } | |
| 2718 } | |
| 2719 } | |
| 2720 } | |
| 2721 | |
| 2722 static void opj_dwt_get_band_coordinates(opj_tcd_tilecomp_t* tilec, | |
| 2723 OPJ_UINT32 resno, | |
| 2724 OPJ_UINT32 bandno, | |
| 2725 OPJ_UINT32 tcx0, | |
| 2726 OPJ_UINT32 tcy0, | |
| 2727 OPJ_UINT32 tcx1, | |
| 2728 OPJ_UINT32 tcy1, | |
| 2729 OPJ_UINT32* tbx0, | |
| 2730 OPJ_UINT32* tby0, | |
| 2731 OPJ_UINT32* tbx1, | |
| 2732 OPJ_UINT32* tby1) | |
| 2733 { | |
| 2734 /* Compute number of decomposition for this band. See table F-1 */ | |
| 2735 OPJ_UINT32 nb = (resno == 0) ? | |
| 2736 tilec->numresolutions - 1 : | |
| 2737 tilec->numresolutions - resno; | |
| 2738 /* Map above tile-based coordinates to sub-band-based coordinates per */ | |
| 2739 /* equation B-15 of the standard */ | |
| 2740 OPJ_UINT32 x0b = bandno & 1; | |
| 2741 OPJ_UINT32 y0b = bandno >> 1; | |
| 2742 if (tbx0) { | |
| 2743 *tbx0 = (nb == 0) ? tcx0 : | |
| 2744 (tcx0 <= (1U << (nb - 1)) * x0b) ? 0 : | |
| 2745 opj_uint_ceildivpow2(tcx0 - (1U << (nb - 1)) * x0b, nb); | |
| 2746 } | |
| 2747 if (tby0) { | |
| 2748 *tby0 = (nb == 0) ? tcy0 : | |
| 2749 (tcy0 <= (1U << (nb - 1)) * y0b) ? 0 : | |
| 2750 opj_uint_ceildivpow2(tcy0 - (1U << (nb - 1)) * y0b, nb); | |
| 2751 } | |
| 2752 if (tbx1) { | |
| 2753 *tbx1 = (nb == 0) ? tcx1 : | |
| 2754 (tcx1 <= (1U << (nb - 1)) * x0b) ? 0 : | |
| 2755 opj_uint_ceildivpow2(tcx1 - (1U << (nb - 1)) * x0b, nb); | |
| 2756 } | |
| 2757 if (tby1) { | |
| 2758 *tby1 = (nb == 0) ? tcy1 : | |
| 2759 (tcy1 <= (1U << (nb - 1)) * y0b) ? 0 : | |
| 2760 opj_uint_ceildivpow2(tcy1 - (1U << (nb - 1)) * y0b, nb); | |
| 2761 } | |
| 2762 } | |
| 2763 | |
| 2764 static void opj_dwt_segment_grow(OPJ_UINT32 filter_width, | |
| 2765 OPJ_UINT32 max_size, | |
| 2766 OPJ_UINT32* start, | |
| 2767 OPJ_UINT32* end) | |
| 2768 { | |
| 2769 *start = opj_uint_subs(*start, filter_width); | |
| 2770 *end = opj_uint_adds(*end, filter_width); | |
| 2771 *end = opj_uint_min(*end, max_size); | |
| 2772 } | |
| 2773 | |
| 2774 | |
| 2775 static opj_sparse_array_int32_t* opj_dwt_init_sparse_array( | |
| 2776 opj_tcd_tilecomp_t* tilec, | |
| 2777 OPJ_UINT32 numres) | |
| 2778 { | |
| 2779 opj_tcd_resolution_t* tr_max = &(tilec->resolutions[numres - 1]); | |
| 2780 OPJ_UINT32 w = (OPJ_UINT32)(tr_max->x1 - tr_max->x0); | |
| 2781 OPJ_UINT32 h = (OPJ_UINT32)(tr_max->y1 - tr_max->y0); | |
| 2782 OPJ_UINT32 resno, bandno, precno, cblkno; | |
| 2783 opj_sparse_array_int32_t* sa = opj_sparse_array_int32_create( | |
| 2784 w, h, opj_uint_min(w, 64), opj_uint_min(h, 64)); | |
| 2785 if (sa == NULL) { | |
| 2786 return NULL; | |
| 2787 } | |
| 2788 | |
| 2789 for (resno = 0; resno < numres; ++resno) { | |
| 2790 opj_tcd_resolution_t* res = &tilec->resolutions[resno]; | |
| 2791 | |
| 2792 for (bandno = 0; bandno < res->numbands; ++bandno) { | |
| 2793 opj_tcd_band_t* band = &res->bands[bandno]; | |
| 2794 | |
| 2795 for (precno = 0; precno < res->pw * res->ph; ++precno) { | |
| 2796 opj_tcd_precinct_t* precinct = &band->precincts[precno]; | |
| 2797 for (cblkno = 0; cblkno < precinct->cw * precinct->ch; ++cblkno) { | |
| 2798 opj_tcd_cblk_dec_t* cblk = &precinct->cblks.dec[cblkno]; | |
| 2799 if (cblk->decoded_data != NULL) { | |
| 2800 OPJ_UINT32 x = (OPJ_UINT32)(cblk->x0 - band->x0); | |
| 2801 OPJ_UINT32 y = (OPJ_UINT32)(cblk->y0 - band->y0); | |
| 2802 OPJ_UINT32 cblk_w = (OPJ_UINT32)(cblk->x1 - cblk->x0); | |
| 2803 OPJ_UINT32 cblk_h = (OPJ_UINT32)(cblk->y1 - cblk->y0); | |
| 2804 | |
| 2805 if (band->bandno & 1) { | |
| 2806 opj_tcd_resolution_t* pres = &tilec->resolutions[resno - 1]; | |
| 2807 x += (OPJ_UINT32)(pres->x1 - pres->x0); | |
| 2808 } | |
| 2809 if (band->bandno & 2) { | |
| 2810 opj_tcd_resolution_t* pres = &tilec->resolutions[resno - 1]; | |
| 2811 y += (OPJ_UINT32)(pres->y1 - pres->y0); | |
| 2812 } | |
| 2813 | |
| 2814 if (!opj_sparse_array_int32_write(sa, x, y, | |
| 2815 x + cblk_w, y + cblk_h, | |
| 2816 cblk->decoded_data, | |
| 2817 1, cblk_w, OPJ_TRUE)) { | |
| 2818 opj_sparse_array_int32_free(sa); | |
| 2819 return NULL; | |
| 2820 } | |
| 2821 } | |
| 2822 } | |
| 2823 } | |
| 2824 } | |
| 2825 } | |
| 2826 | |
| 2827 return sa; | |
| 2828 } | |
| 2829 | |
| 2830 | |
| 2831 static OPJ_BOOL opj_dwt_decode_partial_tile( | |
| 2832 opj_tcd_tilecomp_t* tilec, | |
| 2833 OPJ_UINT32 numres) | |
| 2834 { | |
| 2835 opj_sparse_array_int32_t* sa; | |
| 2836 opj_dwt_t h; | |
| 2837 opj_dwt_t v; | |
| 2838 OPJ_UINT32 resno; | |
| 2839 /* This value matches the maximum left/right extension given in tables */ | |
| 2840 /* F.2 and F.3 of the standard. */ | |
| 2841 const OPJ_UINT32 filter_width = 2U; | |
| 2842 | |
| 2843 opj_tcd_resolution_t* tr = tilec->resolutions; | |
| 2844 opj_tcd_resolution_t* tr_max = &(tilec->resolutions[numres - 1]); | |
| 2845 | |
| 2846 OPJ_UINT32 rw = (OPJ_UINT32)(tr->x1 - | |
| 2847 tr->x0); /* width of the resolution level computed */ | |
| 2848 OPJ_UINT32 rh = (OPJ_UINT32)(tr->y1 - | |
| 2849 tr->y0); /* height of the resolution level computed */ | |
| 2850 | |
| 2851 OPJ_SIZE_T h_mem_size; | |
| 2852 | |
| 2853 /* Compute the intersection of the area of interest, expressed in tile coordinates */ | |
| 2854 /* with the tile coordinates */ | |
| 2855 OPJ_UINT32 win_tcx0 = tilec->win_x0; | |
| 2856 OPJ_UINT32 win_tcy0 = tilec->win_y0; | |
| 2857 OPJ_UINT32 win_tcx1 = tilec->win_x1; | |
| 2858 OPJ_UINT32 win_tcy1 = tilec->win_y1; | |
| 2859 | |
| 2860 if (tr_max->x0 == tr_max->x1 || tr_max->y0 == tr_max->y1) { | |
| 2861 return OPJ_TRUE; | |
| 2862 } | |
| 2863 | |
| 2864 sa = opj_dwt_init_sparse_array(tilec, numres); | |
| 2865 if (sa == NULL) { | |
| 2866 return OPJ_FALSE; | |
| 2867 } | |
| 2868 | |
| 2869 if (numres == 1U) { | |
| 2870 OPJ_BOOL ret = opj_sparse_array_int32_read(sa, | |
| 2871 tr_max->win_x0 - (OPJ_UINT32)tr_max->x0, | |
| 2872 tr_max->win_y0 - (OPJ_UINT32)tr_max->y0, | |
| 2873 tr_max->win_x1 - (OPJ_UINT32)tr_max->x0, | |
| 2874 tr_max->win_y1 - (OPJ_UINT32)tr_max->y0, | |
| 2875 tilec->data_win, | |
| 2876 1, tr_max->win_x1 - tr_max->win_x0, | |
| 2877 OPJ_TRUE); | |
| 2878 assert(ret); | |
| 2879 OPJ_UNUSED(ret); | |
| 2880 opj_sparse_array_int32_free(sa); | |
| 2881 return OPJ_TRUE; | |
| 2882 } | |
| 2883 h_mem_size = opj_dwt_max_resolution(tr, numres); | |
| 2884 /* overflow check */ | |
| 2885 /* in vertical pass, we process 4 columns at a time */ | |
| 2886 if (h_mem_size > (SIZE_MAX / (4 * sizeof(OPJ_INT32)))) { | |
| 2887 /* FIXME event manager error callback */ | |
| 2888 opj_sparse_array_int32_free(sa); | |
| 2889 return OPJ_FALSE; | |
| 2890 } | |
| 2891 | |
| 2892 h_mem_size *= 4 * sizeof(OPJ_INT32); | |
| 2893 h.mem = (OPJ_INT32*)opj_aligned_32_malloc(h_mem_size); | |
| 2894 if (! h.mem) { | |
| 2895 /* FIXME event manager error callback */ | |
| 2896 opj_sparse_array_int32_free(sa); | |
| 2897 return OPJ_FALSE; | |
| 2898 } | |
| 2899 | |
| 2900 v.mem = h.mem; | |
| 2901 | |
| 2902 for (resno = 1; resno < numres; resno ++) { | |
| 2903 OPJ_UINT32 i, j; | |
| 2904 /* Window of interest subband-based coordinates */ | |
| 2905 OPJ_UINT32 win_ll_x0, win_ll_y0, win_ll_x1, win_ll_y1; | |
| 2906 OPJ_UINT32 win_hl_x0, win_hl_x1; | |
| 2907 OPJ_UINT32 win_lh_y0, win_lh_y1; | |
| 2908 /* Window of interest tile-resolution-based coordinates */ | |
| 2909 OPJ_UINT32 win_tr_x0, win_tr_x1, win_tr_y0, win_tr_y1; | |
| 2910 /* Tile-resolution subband-based coordinates */ | |
| 2911 OPJ_UINT32 tr_ll_x0, tr_ll_y0, tr_hl_x0, tr_lh_y0; | |
| 2912 | |
| 2913 ++tr; | |
| 2914 | |
| 2915 h.sn = (OPJ_INT32)rw; | |
| 2916 v.sn = (OPJ_INT32)rh; | |
| 2917 | |
| 2918 rw = (OPJ_UINT32)(tr->x1 - tr->x0); | |
| 2919 rh = (OPJ_UINT32)(tr->y1 - tr->y0); | |
| 2920 | |
| 2921 h.dn = (OPJ_INT32)(rw - (OPJ_UINT32)h.sn); | |
| 2922 h.cas = tr->x0 % 2; | |
| 2923 | |
| 2924 v.dn = (OPJ_INT32)(rh - (OPJ_UINT32)v.sn); | |
| 2925 v.cas = tr->y0 % 2; | |
| 2926 | |
| 2927 /* Get the subband coordinates for the window of interest */ | |
| 2928 /* LL band */ | |
| 2929 opj_dwt_get_band_coordinates(tilec, resno, 0, | |
| 2930 win_tcx0, win_tcy0, win_tcx1, win_tcy1, | |
| 2931 &win_ll_x0, &win_ll_y0, | |
| 2932 &win_ll_x1, &win_ll_y1); | |
| 2933 | |
| 2934 /* HL band */ | |
| 2935 opj_dwt_get_band_coordinates(tilec, resno, 1, | |
| 2936 win_tcx0, win_tcy0, win_tcx1, win_tcy1, | |
| 2937 &win_hl_x0, NULL, &win_hl_x1, NULL); | |
| 2938 | |
| 2939 /* LH band */ | |
| 2940 opj_dwt_get_band_coordinates(tilec, resno, 2, | |
| 2941 win_tcx0, win_tcy0, win_tcx1, win_tcy1, | |
| 2942 NULL, &win_lh_y0, NULL, &win_lh_y1); | |
| 2943 | |
| 2944 /* Beware: band index for non-LL0 resolution are 0=HL, 1=LH and 2=HH */ | |
| 2945 tr_ll_x0 = (OPJ_UINT32)tr->bands[1].x0; | |
| 2946 tr_ll_y0 = (OPJ_UINT32)tr->bands[0].y0; | |
| 2947 tr_hl_x0 = (OPJ_UINT32)tr->bands[0].x0; | |
| 2948 tr_lh_y0 = (OPJ_UINT32)tr->bands[1].y0; | |
| 2949 | |
| 2950 /* Subtract the origin of the bands for this tile, to the subwindow */ | |
| 2951 /* of interest band coordinates, so as to get them relative to the */ | |
| 2952 /* tile */ | |
| 2953 win_ll_x0 = opj_uint_subs(win_ll_x0, tr_ll_x0); | |
| 2954 win_ll_y0 = opj_uint_subs(win_ll_y0, tr_ll_y0); | |
| 2955 win_ll_x1 = opj_uint_subs(win_ll_x1, tr_ll_x0); | |
| 2956 win_ll_y1 = opj_uint_subs(win_ll_y1, tr_ll_y0); | |
| 2957 win_hl_x0 = opj_uint_subs(win_hl_x0, tr_hl_x0); | |
| 2958 win_hl_x1 = opj_uint_subs(win_hl_x1, tr_hl_x0); | |
| 2959 win_lh_y0 = opj_uint_subs(win_lh_y0, tr_lh_y0); | |
| 2960 win_lh_y1 = opj_uint_subs(win_lh_y1, tr_lh_y0); | |
| 2961 | |
| 2962 opj_dwt_segment_grow(filter_width, (OPJ_UINT32)h.sn, &win_ll_x0, &win_ll_x1); | |
| 2963 opj_dwt_segment_grow(filter_width, (OPJ_UINT32)h.dn, &win_hl_x0, &win_hl_x1); | |
| 2964 | |
| 2965 opj_dwt_segment_grow(filter_width, (OPJ_UINT32)v.sn, &win_ll_y0, &win_ll_y1); | |
| 2966 opj_dwt_segment_grow(filter_width, (OPJ_UINT32)v.dn, &win_lh_y0, &win_lh_y1); | |
| 2967 | |
| 2968 /* Compute the tile-resolution-based coordinates for the window of interest */ | |
| 2969 if (h.cas == 0) { | |
| 2970 win_tr_x0 = opj_uint_min(2 * win_ll_x0, 2 * win_hl_x0 + 1); | |
| 2971 win_tr_x1 = opj_uint_min(opj_uint_max(2 * win_ll_x1, 2 * win_hl_x1 + 1), rw); | |
| 2972 } else { | |
| 2973 win_tr_x0 = opj_uint_min(2 * win_hl_x0, 2 * win_ll_x0 + 1); | |
| 2974 win_tr_x1 = opj_uint_min(opj_uint_max(2 * win_hl_x1, 2 * win_ll_x1 + 1), rw); | |
| 2975 } | |
| 2976 | |
| 2977 if (v.cas == 0) { | |
| 2978 win_tr_y0 = opj_uint_min(2 * win_ll_y0, 2 * win_lh_y0 + 1); | |
| 2979 win_tr_y1 = opj_uint_min(opj_uint_max(2 * win_ll_y1, 2 * win_lh_y1 + 1), rh); | |
| 2980 } else { | |
| 2981 win_tr_y0 = opj_uint_min(2 * win_lh_y0, 2 * win_ll_y0 + 1); | |
| 2982 win_tr_y1 = opj_uint_min(opj_uint_max(2 * win_lh_y1, 2 * win_ll_y1 + 1), rh); | |
| 2983 } | |
| 2984 | |
| 2985 for (j = 0; j < rh; ++j) { | |
| 2986 if ((j >= win_ll_y0 && j < win_ll_y1) || | |
| 2987 (j >= win_lh_y0 + (OPJ_UINT32)v.sn && j < win_lh_y1 + (OPJ_UINT32)v.sn)) { | |
| 2988 | |
| 2989 /* Avoids dwt.c:1584:44 (in opj_dwt_decode_partial_1): runtime error: */ | |
| 2990 /* signed integer overflow: -1094795586 + -1094795586 cannot be represented in type 'int' */ | |
| 2991 /* on opj_decompress -i ../../openjpeg/MAPA.jp2 -o out.tif -d 0,0,256,256 */ | |
| 2992 /* This is less extreme than memsetting the whole buffer to 0 */ | |
| 2993 /* although we could potentially do better with better handling of edge conditions */ | |
| 2994 if (win_tr_x1 >= 1 && win_tr_x1 < rw) { | |
| 2995 h.mem[win_tr_x1 - 1] = 0; | |
| 2996 } | |
| 2997 if (win_tr_x1 < rw) { | |
| 2998 h.mem[win_tr_x1] = 0; | |
| 2999 } | |
| 3000 | |
| 3001 opj_dwt_interleave_partial_h(h.mem, | |
| 3002 h.cas, | |
| 3003 sa, | |
| 3004 j, | |
| 3005 (OPJ_UINT32)h.sn, | |
| 3006 win_ll_x0, | |
| 3007 win_ll_x1, | |
| 3008 win_hl_x0, | |
| 3009 win_hl_x1); | |
| 3010 opj_dwt_decode_partial_1(h.mem, h.dn, h.sn, h.cas, | |
| 3011 (OPJ_INT32)win_ll_x0, | |
| 3012 (OPJ_INT32)win_ll_x1, | |
| 3013 (OPJ_INT32)win_hl_x0, | |
| 3014 (OPJ_INT32)win_hl_x1); | |
| 3015 if (!opj_sparse_array_int32_write(sa, | |
| 3016 win_tr_x0, j, | |
| 3017 win_tr_x1, j + 1, | |
| 3018 h.mem + win_tr_x0, | |
| 3019 1, 0, OPJ_TRUE)) { | |
| 3020 /* FIXME event manager error callback */ | |
| 3021 opj_sparse_array_int32_free(sa); | |
| 3022 opj_aligned_free(h.mem); | |
| 3023 return OPJ_FALSE; | |
| 3024 } | |
| 3025 } | |
| 3026 } | |
| 3027 | |
| 3028 for (i = win_tr_x0; i < win_tr_x1;) { | |
| 3029 OPJ_UINT32 nb_cols = opj_uint_min(4U, win_tr_x1 - i); | |
| 3030 opj_dwt_interleave_partial_v(v.mem, | |
| 3031 v.cas, | |
| 3032 sa, | |
| 3033 i, | |
| 3034 nb_cols, | |
| 3035 (OPJ_UINT32)v.sn, | |
| 3036 win_ll_y0, | |
| 3037 win_ll_y1, | |
| 3038 win_lh_y0, | |
| 3039 win_lh_y1); | |
| 3040 opj_dwt_decode_partial_1_parallel(v.mem, nb_cols, v.dn, v.sn, v.cas, | |
| 3041 (OPJ_INT32)win_ll_y0, | |
| 3042 (OPJ_INT32)win_ll_y1, | |
| 3043 (OPJ_INT32)win_lh_y0, | |
| 3044 (OPJ_INT32)win_lh_y1); | |
| 3045 if (!opj_sparse_array_int32_write(sa, | |
| 3046 i, win_tr_y0, | |
| 3047 i + nb_cols, win_tr_y1, | |
| 3048 v.mem + 4 * win_tr_y0, | |
| 3049 1, 4, OPJ_TRUE)) { | |
| 3050 /* FIXME event manager error callback */ | |
| 3051 opj_sparse_array_int32_free(sa); | |
| 3052 opj_aligned_free(h.mem); | |
| 3053 return OPJ_FALSE; | |
| 3054 } | |
| 3055 | |
| 3056 i += nb_cols; | |
| 3057 } | |
| 3058 } | |
| 3059 opj_aligned_free(h.mem); | |
| 3060 | |
| 3061 { | |
| 3062 OPJ_BOOL ret = opj_sparse_array_int32_read(sa, | |
| 3063 tr_max->win_x0 - (OPJ_UINT32)tr_max->x0, | |
| 3064 tr_max->win_y0 - (OPJ_UINT32)tr_max->y0, | |
| 3065 tr_max->win_x1 - (OPJ_UINT32)tr_max->x0, | |
| 3066 tr_max->win_y1 - (OPJ_UINT32)tr_max->y0, | |
| 3067 tilec->data_win, | |
| 3068 1, tr_max->win_x1 - tr_max->win_x0, | |
| 3069 OPJ_TRUE); | |
| 3070 assert(ret); | |
| 3071 OPJ_UNUSED(ret); | |
| 3072 } | |
| 3073 opj_sparse_array_int32_free(sa); | |
| 3074 return OPJ_TRUE; | |
| 3075 } | |
| 3076 | |
| 3077 static void opj_v8dwt_interleave_h(opj_v8dwt_t* OPJ_RESTRICT dwt, | |
| 3078 OPJ_FLOAT32* OPJ_RESTRICT a, | |
| 3079 OPJ_UINT32 width, | |
| 3080 OPJ_UINT32 remaining_height) | |
| 3081 { | |
| 3082 OPJ_FLOAT32* OPJ_RESTRICT bi = (OPJ_FLOAT32*)(dwt->wavelet + dwt->cas); | |
| 3083 OPJ_UINT32 i, k; | |
| 3084 OPJ_UINT32 x0 = dwt->win_l_x0; | |
| 3085 OPJ_UINT32 x1 = dwt->win_l_x1; | |
| 3086 | |
| 3087 for (k = 0; k < 2; ++k) { | |
| 3088 if (remaining_height >= NB_ELTS_V8 && ((OPJ_SIZE_T) a & 0x0f) == 0 && | |
| 3089 ((OPJ_SIZE_T) bi & 0x0f) == 0) { | |
| 3090 /* Fast code path */ | |
| 3091 for (i = x0; i < x1; ++i) { | |
| 3092 OPJ_UINT32 j = i; | |
| 3093 OPJ_FLOAT32* OPJ_RESTRICT dst = bi + i * 2 * NB_ELTS_V8; | |
| 3094 dst[0] = a[j]; | |
| 3095 j += width; | |
| 3096 dst[1] = a[j]; | |
| 3097 j += width; | |
| 3098 dst[2] = a[j]; | |
| 3099 j += width; | |
| 3100 dst[3] = a[j]; | |
| 3101 j += width; | |
| 3102 dst[4] = a[j]; | |
| 3103 j += width; | |
| 3104 dst[5] = a[j]; | |
| 3105 j += width; | |
| 3106 dst[6] = a[j]; | |
| 3107 j += width; | |
| 3108 dst[7] = a[j]; | |
| 3109 } | |
| 3110 } else { | |
| 3111 /* Slow code path */ | |
| 3112 for (i = x0; i < x1; ++i) { | |
| 3113 OPJ_UINT32 j = i; | |
| 3114 OPJ_FLOAT32* OPJ_RESTRICT dst = bi + i * 2 * NB_ELTS_V8; | |
| 3115 dst[0] = a[j]; | |
| 3116 j += width; | |
| 3117 if (remaining_height == 1) { | |
| 3118 continue; | |
| 3119 } | |
| 3120 dst[1] = a[j]; | |
| 3121 j += width; | |
| 3122 if (remaining_height == 2) { | |
| 3123 continue; | |
| 3124 } | |
| 3125 dst[2] = a[j]; | |
| 3126 j += width; | |
| 3127 if (remaining_height == 3) { | |
| 3128 continue; | |
| 3129 } | |
| 3130 dst[3] = a[j]; | |
| 3131 j += width; | |
| 3132 if (remaining_height == 4) { | |
| 3133 continue; | |
| 3134 } | |
| 3135 dst[4] = a[j]; | |
| 3136 j += width; | |
| 3137 if (remaining_height == 5) { | |
| 3138 continue; | |
| 3139 } | |
| 3140 dst[5] = a[j]; | |
| 3141 j += width; | |
| 3142 if (remaining_height == 6) { | |
| 3143 continue; | |
| 3144 } | |
| 3145 dst[6] = a[j]; | |
| 3146 j += width; | |
| 3147 if (remaining_height == 7) { | |
| 3148 continue; | |
| 3149 } | |
| 3150 dst[7] = a[j]; | |
| 3151 } | |
| 3152 } | |
| 3153 | |
| 3154 bi = (OPJ_FLOAT32*)(dwt->wavelet + 1 - dwt->cas); | |
| 3155 a += dwt->sn; | |
| 3156 x0 = dwt->win_h_x0; | |
| 3157 x1 = dwt->win_h_x1; | |
| 3158 } | |
| 3159 } | |
| 3160 | |
| 3161 static void opj_v8dwt_interleave_partial_h(opj_v8dwt_t* dwt, | |
| 3162 opj_sparse_array_int32_t* sa, | |
| 3163 OPJ_UINT32 sa_line, | |
| 3164 OPJ_UINT32 remaining_height) | |
| 3165 { | |
| 3166 OPJ_UINT32 i; | |
| 3167 for (i = 0; i < remaining_height; i++) { | |
| 3168 OPJ_BOOL ret; | |
| 3169 ret = opj_sparse_array_int32_read(sa, | |
| 3170 dwt->win_l_x0, sa_line + i, | |
| 3171 dwt->win_l_x1, sa_line + i + 1, | |
| 3172 /* Nasty cast from float* to int32* */ | |
| 3173 (OPJ_INT32*)(dwt->wavelet + dwt->cas + 2 * dwt->win_l_x0) + i, | |
| 3174 2 * NB_ELTS_V8, 0, OPJ_TRUE); | |
| 3175 assert(ret); | |
| 3176 ret = opj_sparse_array_int32_read(sa, | |
| 3177 (OPJ_UINT32)dwt->sn + dwt->win_h_x0, sa_line + i, | |
| 3178 (OPJ_UINT32)dwt->sn + dwt->win_h_x1, sa_line + i + 1, | |
| 3179 /* Nasty cast from float* to int32* */ | |
| 3180 (OPJ_INT32*)(dwt->wavelet + 1 - dwt->cas + 2 * dwt->win_h_x0) + i, | |
| 3181 2 * NB_ELTS_V8, 0, OPJ_TRUE); | |
| 3182 assert(ret); | |
| 3183 OPJ_UNUSED(ret); | |
| 3184 } | |
| 3185 } | |
| 3186 | |
| 3187 static INLINE void opj_v8dwt_interleave_v(opj_v8dwt_t* OPJ_RESTRICT dwt, | |
| 3188 OPJ_FLOAT32* OPJ_RESTRICT a, | |
| 3189 OPJ_UINT32 width, | |
| 3190 OPJ_UINT32 nb_elts_read) | |
| 3191 { | |
| 3192 opj_v8_t* OPJ_RESTRICT bi = dwt->wavelet + dwt->cas; | |
| 3193 OPJ_UINT32 i; | |
| 3194 | |
| 3195 for (i = dwt->win_l_x0; i < dwt->win_l_x1; ++i) { | |
| 3196 memcpy(&bi[i * 2], &a[i * (OPJ_SIZE_T)width], | |
| 3197 (OPJ_SIZE_T)nb_elts_read * sizeof(OPJ_FLOAT32)); | |
| 3198 } | |
| 3199 | |
| 3200 a += (OPJ_UINT32)dwt->sn * (OPJ_SIZE_T)width; | |
| 3201 bi = dwt->wavelet + 1 - dwt->cas; | |
| 3202 | |
| 3203 for (i = dwt->win_h_x0; i < dwt->win_h_x1; ++i) { | |
| 3204 memcpy(&bi[i * 2], &a[i * (OPJ_SIZE_T)width], | |
| 3205 (OPJ_SIZE_T)nb_elts_read * sizeof(OPJ_FLOAT32)); | |
| 3206 } | |
| 3207 } | |
| 3208 | |
| 3209 static void opj_v8dwt_interleave_partial_v(opj_v8dwt_t* OPJ_RESTRICT dwt, | |
| 3210 opj_sparse_array_int32_t* sa, | |
| 3211 OPJ_UINT32 sa_col, | |
| 3212 OPJ_UINT32 nb_elts_read) | |
| 3213 { | |
| 3214 OPJ_BOOL ret; | |
| 3215 ret = opj_sparse_array_int32_read(sa, | |
| 3216 sa_col, dwt->win_l_x0, | |
| 3217 sa_col + nb_elts_read, dwt->win_l_x1, | |
| 3218 (OPJ_INT32*)(dwt->wavelet + dwt->cas + 2 * dwt->win_l_x0), | |
| 3219 1, 2 * NB_ELTS_V8, OPJ_TRUE); | |
| 3220 assert(ret); | |
| 3221 ret = opj_sparse_array_int32_read(sa, | |
| 3222 sa_col, (OPJ_UINT32)dwt->sn + dwt->win_h_x0, | |
| 3223 sa_col + nb_elts_read, (OPJ_UINT32)dwt->sn + dwt->win_h_x1, | |
| 3224 (OPJ_INT32*)(dwt->wavelet + 1 - dwt->cas + 2 * dwt->win_h_x0), | |
| 3225 1, 2 * NB_ELTS_V8, OPJ_TRUE); | |
| 3226 assert(ret); | |
| 3227 OPJ_UNUSED(ret); | |
| 3228 } | |
| 3229 | |
| 3230 #ifdef __SSE__ | |
| 3231 | |
| 3232 static void opj_v8dwt_decode_step1_sse(opj_v8_t* w, | |
| 3233 OPJ_UINT32 start, | |
| 3234 OPJ_UINT32 end, | |
| 3235 const __m128 c) | |
| 3236 { | |
| 3237 __m128* OPJ_RESTRICT vw = (__m128*) w; | |
| 3238 OPJ_UINT32 i = start; | |
| 3239 /* To be adapted if NB_ELTS_V8 changes */ | |
| 3240 vw += 4 * start; | |
| 3241 /* Note: attempt at loop unrolling x2 doesn't help */ | |
| 3242 for (; i < end; ++i, vw += 4) { | |
| 3243 vw[0] = _mm_mul_ps(vw[0], c); | |
| 3244 vw[1] = _mm_mul_ps(vw[1], c); | |
| 3245 } | |
| 3246 } | |
| 3247 | |
| 3248 static void opj_v8dwt_decode_step2_sse(opj_v8_t* l, opj_v8_t* w, | |
| 3249 OPJ_UINT32 start, | |
| 3250 OPJ_UINT32 end, | |
| 3251 OPJ_UINT32 m, | |
| 3252 __m128 c) | |
| 3253 { | |
| 3254 __m128* OPJ_RESTRICT vl = (__m128*) l; | |
| 3255 __m128* OPJ_RESTRICT vw = (__m128*) w; | |
| 3256 /* To be adapted if NB_ELTS_V8 changes */ | |
| 3257 OPJ_UINT32 i; | |
| 3258 OPJ_UINT32 imax = opj_uint_min(end, m); | |
| 3259 if (start == 0) { | |
| 3260 if (imax >= 1) { | |
| 3261 vw[-2] = _mm_add_ps(vw[-2], _mm_mul_ps(_mm_add_ps(vl[0], vw[0]), c)); | |
| 3262 vw[-1] = _mm_add_ps(vw[-1], _mm_mul_ps(_mm_add_ps(vl[1], vw[1]), c)); | |
| 3263 vw += 4; | |
| 3264 start = 1; | |
| 3265 } | |
| 3266 } else { | |
| 3267 vw += start * 4; | |
| 3268 } | |
| 3269 | |
| 3270 i = start; | |
| 3271 /* Note: attempt at loop unrolling x2 doesn't help */ | |
| 3272 for (; i < imax; ++i) { | |
| 3273 vw[-2] = _mm_add_ps(vw[-2], _mm_mul_ps(_mm_add_ps(vw[-4], vw[0]), c)); | |
| 3274 vw[-1] = _mm_add_ps(vw[-1], _mm_mul_ps(_mm_add_ps(vw[-3], vw[1]), c)); | |
| 3275 vw += 4; | |
| 3276 } | |
| 3277 if (m < end) { | |
| 3278 assert(m + 1 == end); | |
| 3279 c = _mm_add_ps(c, c); | |
| 3280 vw[-2] = _mm_add_ps(vw[-2], _mm_mul_ps(c, vw[-4])); | |
| 3281 vw[-1] = _mm_add_ps(vw[-1], _mm_mul_ps(c, vw[-3])); | |
| 3282 } | |
| 3283 } | |
| 3284 | |
| 3285 #else | |
| 3286 | |
| 3287 static void opj_v8dwt_decode_step1(opj_v8_t* w, | |
| 3288 OPJ_UINT32 start, | |
| 3289 OPJ_UINT32 end, | |
| 3290 const OPJ_FLOAT32 c) | |
| 3291 { | |
| 3292 OPJ_FLOAT32* OPJ_RESTRICT fw = (OPJ_FLOAT32*) w; | |
| 3293 OPJ_UINT32 i; | |
| 3294 /* To be adapted if NB_ELTS_V8 changes */ | |
| 3295 for (i = start; i < end; ++i) { | |
| 3296 fw[i * 2 * 8 ] = fw[i * 2 * 8 ] * c; | |
| 3297 fw[i * 2 * 8 + 1] = fw[i * 2 * 8 + 1] * c; | |
| 3298 fw[i * 2 * 8 + 2] = fw[i * 2 * 8 + 2] * c; | |
| 3299 fw[i * 2 * 8 + 3] = fw[i * 2 * 8 + 3] * c; | |
| 3300 fw[i * 2 * 8 + 4] = fw[i * 2 * 8 + 4] * c; | |
| 3301 fw[i * 2 * 8 + 5] = fw[i * 2 * 8 + 5] * c; | |
| 3302 fw[i * 2 * 8 + 6] = fw[i * 2 * 8 + 6] * c; | |
| 3303 fw[i * 2 * 8 + 7] = fw[i * 2 * 8 + 7] * c; | |
| 3304 } | |
| 3305 } | |
| 3306 | |
| 3307 static void opj_v8dwt_decode_step2(opj_v8_t* l, opj_v8_t* w, | |
| 3308 OPJ_UINT32 start, | |
| 3309 OPJ_UINT32 end, | |
| 3310 OPJ_UINT32 m, | |
| 3311 OPJ_FLOAT32 c) | |
| 3312 { | |
| 3313 OPJ_FLOAT32* fl = (OPJ_FLOAT32*) l; | |
| 3314 OPJ_FLOAT32* fw = (OPJ_FLOAT32*) w; | |
| 3315 OPJ_UINT32 i; | |
| 3316 OPJ_UINT32 imax = opj_uint_min(end, m); | |
| 3317 if (start > 0) { | |
| 3318 fw += 2 * NB_ELTS_V8 * start; | |
| 3319 fl = fw - 2 * NB_ELTS_V8; | |
| 3320 } | |
| 3321 /* To be adapted if NB_ELTS_V8 changes */ | |
| 3322 for (i = start; i < imax; ++i) { | |
| 3323 fw[-8] = fw[-8] + ((fl[0] + fw[0]) * c); | |
| 3324 fw[-7] = fw[-7] + ((fl[1] + fw[1]) * c); | |
| 3325 fw[-6] = fw[-6] + ((fl[2] + fw[2]) * c); | |
| 3326 fw[-5] = fw[-5] + ((fl[3] + fw[3]) * c); | |
| 3327 fw[-4] = fw[-4] + ((fl[4] + fw[4]) * c); | |
| 3328 fw[-3] = fw[-3] + ((fl[5] + fw[5]) * c); | |
| 3329 fw[-2] = fw[-2] + ((fl[6] + fw[6]) * c); | |
| 3330 fw[-1] = fw[-1] + ((fl[7] + fw[7]) * c); | |
| 3331 fl = fw; | |
| 3332 fw += 2 * NB_ELTS_V8; | |
| 3333 } | |
| 3334 if (m < end) { | |
| 3335 assert(m + 1 == end); | |
| 3336 c += c; | |
| 3337 fw[-8] = fw[-8] + fl[0] * c; | |
| 3338 fw[-7] = fw[-7] + fl[1] * c; | |
| 3339 fw[-6] = fw[-6] + fl[2] * c; | |
| 3340 fw[-5] = fw[-5] + fl[3] * c; | |
| 3341 fw[-4] = fw[-4] + fl[4] * c; | |
| 3342 fw[-3] = fw[-3] + fl[5] * c; | |
| 3343 fw[-2] = fw[-2] + fl[6] * c; | |
| 3344 fw[-1] = fw[-1] + fl[7] * c; | |
| 3345 } | |
| 3346 } | |
| 3347 | |
| 3348 #endif | |
| 3349 | |
| 3350 /* <summary> */ | |
| 3351 /* Inverse 9-7 wavelet transform in 1-D. */ | |
| 3352 /* </summary> */ | |
| 3353 static void opj_v8dwt_decode(opj_v8dwt_t* OPJ_RESTRICT dwt) | |
| 3354 { | |
| 3355 OPJ_INT32 a, b; | |
| 3356 /* BUG_WEIRD_TWO_INVK (look for this identifier in tcd.c) */ | |
| 3357 /* Historic value for 2 / opj_invK */ | |
| 3358 /* Normally, we should use invK, but if we do so, we have failures in the */ | |
| 3359 /* conformance test, due to MSE and peak errors significantly higher than */ | |
| 3360 /* accepted value */ | |
| 3361 /* Due to using two_invK instead of invK, we have to compensate in tcd.c */ | |
| 3362 /* the computation of the stepsize for the non LL subbands */ | |
| 3363 const float two_invK = 1.625732422f; | |
| 3364 if (dwt->cas == 0) { | |
| 3365 if (!((dwt->dn > 0) || (dwt->sn > 1))) { | |
| 3366 return; | |
| 3367 } | |
| 3368 a = 0; | |
| 3369 b = 1; | |
| 3370 } else { | |
| 3371 if (!((dwt->sn > 0) || (dwt->dn > 1))) { | |
| 3372 return; | |
| 3373 } | |
| 3374 a = 1; | |
| 3375 b = 0; | |
| 3376 } | |
| 3377 #ifdef __SSE__ | |
| 3378 opj_v8dwt_decode_step1_sse(dwt->wavelet + a, dwt->win_l_x0, dwt->win_l_x1, | |
| 3379 _mm_set1_ps(opj_K)); | |
| 3380 opj_v8dwt_decode_step1_sse(dwt->wavelet + b, dwt->win_h_x0, dwt->win_h_x1, | |
| 3381 _mm_set1_ps(two_invK)); | |
| 3382 opj_v8dwt_decode_step2_sse(dwt->wavelet + b, dwt->wavelet + a + 1, | |
| 3383 dwt->win_l_x0, dwt->win_l_x1, | |
| 3384 (OPJ_UINT32)opj_int_min(dwt->sn, dwt->dn - a), | |
| 3385 _mm_set1_ps(-opj_dwt_delta)); | |
| 3386 opj_v8dwt_decode_step2_sse(dwt->wavelet + a, dwt->wavelet + b + 1, | |
| 3387 dwt->win_h_x0, dwt->win_h_x1, | |
| 3388 (OPJ_UINT32)opj_int_min(dwt->dn, dwt->sn - b), | |
| 3389 _mm_set1_ps(-opj_dwt_gamma)); | |
| 3390 opj_v8dwt_decode_step2_sse(dwt->wavelet + b, dwt->wavelet + a + 1, | |
| 3391 dwt->win_l_x0, dwt->win_l_x1, | |
| 3392 (OPJ_UINT32)opj_int_min(dwt->sn, dwt->dn - a), | |
| 3393 _mm_set1_ps(-opj_dwt_beta)); | |
| 3394 opj_v8dwt_decode_step2_sse(dwt->wavelet + a, dwt->wavelet + b + 1, | |
| 3395 dwt->win_h_x0, dwt->win_h_x1, | |
| 3396 (OPJ_UINT32)opj_int_min(dwt->dn, dwt->sn - b), | |
| 3397 _mm_set1_ps(-opj_dwt_alpha)); | |
| 3398 #else | |
| 3399 opj_v8dwt_decode_step1(dwt->wavelet + a, dwt->win_l_x0, dwt->win_l_x1, | |
| 3400 opj_K); | |
| 3401 opj_v8dwt_decode_step1(dwt->wavelet + b, dwt->win_h_x0, dwt->win_h_x1, | |
| 3402 two_invK); | |
| 3403 opj_v8dwt_decode_step2(dwt->wavelet + b, dwt->wavelet + a + 1, | |
| 3404 dwt->win_l_x0, dwt->win_l_x1, | |
| 3405 (OPJ_UINT32)opj_int_min(dwt->sn, dwt->dn - a), | |
| 3406 -opj_dwt_delta); | |
| 3407 opj_v8dwt_decode_step2(dwt->wavelet + a, dwt->wavelet + b + 1, | |
| 3408 dwt->win_h_x0, dwt->win_h_x1, | |
| 3409 (OPJ_UINT32)opj_int_min(dwt->dn, dwt->sn - b), | |
| 3410 -opj_dwt_gamma); | |
| 3411 opj_v8dwt_decode_step2(dwt->wavelet + b, dwt->wavelet + a + 1, | |
| 3412 dwt->win_l_x0, dwt->win_l_x1, | |
| 3413 (OPJ_UINT32)opj_int_min(dwt->sn, dwt->dn - a), | |
| 3414 -opj_dwt_beta); | |
| 3415 opj_v8dwt_decode_step2(dwt->wavelet + a, dwt->wavelet + b + 1, | |
| 3416 dwt->win_h_x0, dwt->win_h_x1, | |
| 3417 (OPJ_UINT32)opj_int_min(dwt->dn, dwt->sn - b), | |
| 3418 -opj_dwt_alpha); | |
| 3419 #endif | |
| 3420 } | |
| 3421 | |
| 3422 typedef struct { | |
| 3423 opj_v8dwt_t h; | |
| 3424 OPJ_UINT32 rw; | |
| 3425 OPJ_UINT32 w; | |
| 3426 OPJ_FLOAT32 * OPJ_RESTRICT aj; | |
| 3427 OPJ_UINT32 nb_rows; | |
| 3428 } opj_dwt97_decode_h_job_t; | |
| 3429 | |
| 3430 static void opj_dwt97_decode_h_func(void* user_data, opj_tls_t* tls) | |
| 3431 { | |
| 3432 OPJ_UINT32 j; | |
| 3433 opj_dwt97_decode_h_job_t* job; | |
| 3434 OPJ_FLOAT32 * OPJ_RESTRICT aj; | |
| 3435 OPJ_UINT32 w; | |
| 3436 (void)tls; | |
| 3437 | |
| 3438 job = (opj_dwt97_decode_h_job_t*)user_data; | |
| 3439 w = job->w; | |
| 3440 | |
| 3441 assert((job->nb_rows % NB_ELTS_V8) == 0); | |
| 3442 | |
| 3443 aj = job->aj; | |
| 3444 for (j = 0; j + NB_ELTS_V8 <= job->nb_rows; j += NB_ELTS_V8) { | |
| 3445 OPJ_UINT32 k; | |
| 3446 opj_v8dwt_interleave_h(&job->h, aj, job->w, NB_ELTS_V8); | |
| 3447 opj_v8dwt_decode(&job->h); | |
| 3448 | |
| 3449 /* To be adapted if NB_ELTS_V8 changes */ | |
| 3450 for (k = 0; k < job->rw; k++) { | |
| 3451 aj[k ] = job->h.wavelet[k].f[0]; | |
| 3452 aj[k + (OPJ_SIZE_T)w ] = job->h.wavelet[k].f[1]; | |
| 3453 aj[k + (OPJ_SIZE_T)w * 2] = job->h.wavelet[k].f[2]; | |
| 3454 aj[k + (OPJ_SIZE_T)w * 3] = job->h.wavelet[k].f[3]; | |
| 3455 } | |
| 3456 for (k = 0; k < job->rw; k++) { | |
| 3457 aj[k + (OPJ_SIZE_T)w * 4] = job->h.wavelet[k].f[4]; | |
| 3458 aj[k + (OPJ_SIZE_T)w * 5] = job->h.wavelet[k].f[5]; | |
| 3459 aj[k + (OPJ_SIZE_T)w * 6] = job->h.wavelet[k].f[6]; | |
| 3460 aj[k + (OPJ_SIZE_T)w * 7] = job->h.wavelet[k].f[7]; | |
| 3461 } | |
| 3462 | |
| 3463 aj += w * NB_ELTS_V8; | |
| 3464 } | |
| 3465 | |
| 3466 opj_aligned_free(job->h.wavelet); | |
| 3467 opj_free(job); | |
| 3468 } | |
| 3469 | |
| 3470 | |
| 3471 typedef struct { | |
| 3472 opj_v8dwt_t v; | |
| 3473 OPJ_UINT32 rh; | |
| 3474 OPJ_UINT32 w; | |
| 3475 OPJ_FLOAT32 * OPJ_RESTRICT aj; | |
| 3476 OPJ_UINT32 nb_columns; | |
| 3477 } opj_dwt97_decode_v_job_t; | |
| 3478 | |
| 3479 static void opj_dwt97_decode_v_func(void* user_data, opj_tls_t* tls) | |
| 3480 { | |
| 3481 OPJ_UINT32 j; | |
| 3482 opj_dwt97_decode_v_job_t* job; | |
| 3483 OPJ_FLOAT32 * OPJ_RESTRICT aj; | |
| 3484 (void)tls; | |
| 3485 | |
| 3486 job = (opj_dwt97_decode_v_job_t*)user_data; | |
| 3487 | |
| 3488 assert((job->nb_columns % NB_ELTS_V8) == 0); | |
| 3489 | |
| 3490 aj = job->aj; | |
| 3491 for (j = 0; j + NB_ELTS_V8 <= job->nb_columns; j += NB_ELTS_V8) { | |
| 3492 OPJ_UINT32 k; | |
| 3493 | |
| 3494 opj_v8dwt_interleave_v(&job->v, aj, job->w, NB_ELTS_V8); | |
| 3495 opj_v8dwt_decode(&job->v); | |
| 3496 | |
| 3497 for (k = 0; k < job->rh; ++k) { | |
| 3498 memcpy(&aj[k * (OPJ_SIZE_T)job->w], &job->v.wavelet[k], | |
| 3499 NB_ELTS_V8 * sizeof(OPJ_FLOAT32)); | |
| 3500 } | |
| 3501 aj += NB_ELTS_V8; | |
| 3502 } | |
| 3503 | |
| 3504 opj_aligned_free(job->v.wavelet); | |
| 3505 opj_free(job); | |
| 3506 } | |
| 3507 | |
| 3508 | |
| 3509 /* <summary> */ | |
| 3510 /* Inverse 9-7 wavelet transform in 2-D. */ | |
| 3511 /* </summary> */ | |
| 3512 static | |
| 3513 OPJ_BOOL opj_dwt_decode_tile_97(opj_thread_pool_t* tp, | |
| 3514 opj_tcd_tilecomp_t* OPJ_RESTRICT tilec, | |
| 3515 OPJ_UINT32 numres) | |
| 3516 { | |
| 3517 opj_v8dwt_t h; | |
| 3518 opj_v8dwt_t v; | |
| 3519 | |
| 3520 opj_tcd_resolution_t* res = tilec->resolutions; | |
| 3521 | |
| 3522 OPJ_UINT32 rw = (OPJ_UINT32)(res->x1 - | |
| 3523 res->x0); /* width of the resolution level computed */ | |
| 3524 OPJ_UINT32 rh = (OPJ_UINT32)(res->y1 - | |
| 3525 res->y0); /* height of the resolution level computed */ | |
| 3526 | |
| 3527 OPJ_UINT32 w = (OPJ_UINT32)(tilec->resolutions[tilec->minimum_num_resolutions - | |
| 3528 1].x1 - | |
| 3529 tilec->resolutions[tilec->minimum_num_resolutions - 1].x0); | |
| 3530 | |
| 3531 OPJ_SIZE_T l_data_size; | |
| 3532 const int num_threads = opj_thread_pool_get_thread_count(tp); | |
| 3533 | |
| 3534 if (numres == 1) { | |
| 3535 return OPJ_TRUE; | |
| 3536 } | |
| 3537 | |
| 3538 l_data_size = opj_dwt_max_resolution(res, numres); | |
| 3539 /* overflow check */ | |
| 3540 if (l_data_size > (SIZE_MAX / sizeof(opj_v8_t))) { | |
| 3541 /* FIXME event manager error callback */ | |
| 3542 return OPJ_FALSE; | |
| 3543 } | |
| 3544 h.wavelet = (opj_v8_t*) opj_aligned_malloc(l_data_size * sizeof(opj_v8_t)); | |
| 3545 if (!h.wavelet) { | |
| 3546 /* FIXME event manager error callback */ | |
| 3547 return OPJ_FALSE; | |
| 3548 } | |
| 3549 v.wavelet = h.wavelet; | |
| 3550 | |
| 3551 while (--numres) { | |
| 3552 OPJ_FLOAT32 * OPJ_RESTRICT aj = (OPJ_FLOAT32*) tilec->data; | |
| 3553 OPJ_UINT32 j; | |
| 3554 | |
| 3555 h.sn = (OPJ_INT32)rw; | |
| 3556 v.sn = (OPJ_INT32)rh; | |
| 3557 | |
| 3558 ++res; | |
| 3559 | |
| 3560 rw = (OPJ_UINT32)(res->x1 - | |
| 3561 res->x0); /* width of the resolution level computed */ | |
| 3562 rh = (OPJ_UINT32)(res->y1 - | |
| 3563 res->y0); /* height of the resolution level computed */ | |
| 3564 | |
| 3565 h.dn = (OPJ_INT32)(rw - (OPJ_UINT32)h.sn); | |
| 3566 h.cas = res->x0 % 2; | |
| 3567 | |
| 3568 h.win_l_x0 = 0; | |
| 3569 h.win_l_x1 = (OPJ_UINT32)h.sn; | |
| 3570 h.win_h_x0 = 0; | |
| 3571 h.win_h_x1 = (OPJ_UINT32)h.dn; | |
| 3572 | |
| 3573 if (num_threads <= 1 || rh < 2 * NB_ELTS_V8) { | |
| 3574 for (j = 0; j + (NB_ELTS_V8 - 1) < rh; j += NB_ELTS_V8) { | |
| 3575 OPJ_UINT32 k; | |
| 3576 opj_v8dwt_interleave_h(&h, aj, w, NB_ELTS_V8); | |
| 3577 opj_v8dwt_decode(&h); | |
| 3578 | |
| 3579 /* To be adapted if NB_ELTS_V8 changes */ | |
| 3580 for (k = 0; k < rw; k++) { | |
| 3581 aj[k ] = h.wavelet[k].f[0]; | |
| 3582 aj[k + (OPJ_SIZE_T)w ] = h.wavelet[k].f[1]; | |
| 3583 aj[k + (OPJ_SIZE_T)w * 2] = h.wavelet[k].f[2]; | |
| 3584 aj[k + (OPJ_SIZE_T)w * 3] = h.wavelet[k].f[3]; | |
| 3585 } | |
| 3586 for (k = 0; k < rw; k++) { | |
| 3587 aj[k + (OPJ_SIZE_T)w * 4] = h.wavelet[k].f[4]; | |
| 3588 aj[k + (OPJ_SIZE_T)w * 5] = h.wavelet[k].f[5]; | |
| 3589 aj[k + (OPJ_SIZE_T)w * 6] = h.wavelet[k].f[6]; | |
| 3590 aj[k + (OPJ_SIZE_T)w * 7] = h.wavelet[k].f[7]; | |
| 3591 } | |
| 3592 | |
| 3593 aj += w * NB_ELTS_V8; | |
| 3594 } | |
| 3595 } else { | |
| 3596 OPJ_UINT32 num_jobs = (OPJ_UINT32)num_threads; | |
| 3597 OPJ_UINT32 step_j; | |
| 3598 | |
| 3599 if ((rh / NB_ELTS_V8) < num_jobs) { | |
| 3600 num_jobs = rh / NB_ELTS_V8; | |
| 3601 } | |
| 3602 step_j = ((rh / num_jobs) / NB_ELTS_V8) * NB_ELTS_V8; | |
| 3603 for (j = 0; j < num_jobs; j++) { | |
| 3604 opj_dwt97_decode_h_job_t* job; | |
| 3605 | |
| 3606 job = (opj_dwt97_decode_h_job_t*) opj_malloc(sizeof(opj_dwt97_decode_h_job_t)); | |
| 3607 if (!job) { | |
| 3608 opj_thread_pool_wait_completion(tp, 0); | |
| 3609 opj_aligned_free(h.wavelet); | |
| 3610 return OPJ_FALSE; | |
| 3611 } | |
| 3612 job->h.wavelet = (opj_v8_t*)opj_aligned_malloc(l_data_size * sizeof(opj_v8_t)); | |
| 3613 if (!job->h.wavelet) { | |
| 3614 opj_thread_pool_wait_completion(tp, 0); | |
| 3615 opj_free(job); | |
| 3616 opj_aligned_free(h.wavelet); | |
| 3617 return OPJ_FALSE; | |
| 3618 } | |
| 3619 job->h.dn = h.dn; | |
| 3620 job->h.sn = h.sn; | |
| 3621 job->h.cas = h.cas; | |
| 3622 job->h.win_l_x0 = h.win_l_x0; | |
| 3623 job->h.win_l_x1 = h.win_l_x1; | |
| 3624 job->h.win_h_x0 = h.win_h_x0; | |
| 3625 job->h.win_h_x1 = h.win_h_x1; | |
| 3626 job->rw = rw; | |
| 3627 job->w = w; | |
| 3628 job->aj = aj; | |
| 3629 job->nb_rows = (j + 1 == num_jobs) ? (rh & (OPJ_UINT32)~ | |
| 3630 (NB_ELTS_V8 - 1)) - j * step_j : step_j; | |
| 3631 aj += w * job->nb_rows; | |
| 3632 opj_thread_pool_submit_job(tp, opj_dwt97_decode_h_func, job); | |
| 3633 } | |
| 3634 opj_thread_pool_wait_completion(tp, 0); | |
| 3635 j = rh & (OPJ_UINT32)~(NB_ELTS_V8 - 1); | |
| 3636 } | |
| 3637 | |
| 3638 if (j < rh) { | |
| 3639 OPJ_UINT32 k; | |
| 3640 opj_v8dwt_interleave_h(&h, aj, w, rh - j); | |
| 3641 opj_v8dwt_decode(&h); | |
| 3642 for (k = 0; k < rw; k++) { | |
| 3643 OPJ_UINT32 l; | |
| 3644 for (l = 0; l < rh - j; l++) { | |
| 3645 aj[k + (OPJ_SIZE_T)w * l ] = h.wavelet[k].f[l]; | |
| 3646 } | |
| 3647 } | |
| 3648 } | |
| 3649 | |
| 3650 v.dn = (OPJ_INT32)(rh - (OPJ_UINT32)v.sn); | |
| 3651 v.cas = res->y0 % 2; | |
| 3652 v.win_l_x0 = 0; | |
| 3653 v.win_l_x1 = (OPJ_UINT32)v.sn; | |
| 3654 v.win_h_x0 = 0; | |
| 3655 v.win_h_x1 = (OPJ_UINT32)v.dn; | |
| 3656 | |
| 3657 aj = (OPJ_FLOAT32*) tilec->data; | |
| 3658 if (num_threads <= 1 || rw < 2 * NB_ELTS_V8) { | |
| 3659 for (j = rw; j > (NB_ELTS_V8 - 1); j -= NB_ELTS_V8) { | |
| 3660 OPJ_UINT32 k; | |
| 3661 | |
| 3662 opj_v8dwt_interleave_v(&v, aj, w, NB_ELTS_V8); | |
| 3663 opj_v8dwt_decode(&v); | |
| 3664 | |
| 3665 for (k = 0; k < rh; ++k) { | |
| 3666 memcpy(&aj[k * (OPJ_SIZE_T)w], &v.wavelet[k], NB_ELTS_V8 * sizeof(OPJ_FLOAT32)); | |
| 3667 } | |
| 3668 aj += NB_ELTS_V8; | |
| 3669 } | |
| 3670 } else { | |
| 3671 /* "bench_dwt -I" shows that scaling is poor, likely due to RAM | |
| 3672 transfer being the limiting factor. So limit the number of | |
| 3673 threads. | |
| 3674 */ | |
| 3675 OPJ_UINT32 num_jobs = opj_uint_max((OPJ_UINT32)num_threads / 2, 2U); | |
| 3676 OPJ_UINT32 step_j; | |
| 3677 | |
| 3678 if ((rw / NB_ELTS_V8) < num_jobs) { | |
| 3679 num_jobs = rw / NB_ELTS_V8; | |
| 3680 } | |
| 3681 step_j = ((rw / num_jobs) / NB_ELTS_V8) * NB_ELTS_V8; | |
| 3682 for (j = 0; j < num_jobs; j++) { | |
| 3683 opj_dwt97_decode_v_job_t* job; | |
| 3684 | |
| 3685 job = (opj_dwt97_decode_v_job_t*) opj_malloc(sizeof(opj_dwt97_decode_v_job_t)); | |
| 3686 if (!job) { | |
| 3687 opj_thread_pool_wait_completion(tp, 0); | |
| 3688 opj_aligned_free(h.wavelet); | |
| 3689 return OPJ_FALSE; | |
| 3690 } | |
| 3691 job->v.wavelet = (opj_v8_t*)opj_aligned_malloc(l_data_size * sizeof(opj_v8_t)); | |
| 3692 if (!job->v.wavelet) { | |
| 3693 opj_thread_pool_wait_completion(tp, 0); | |
| 3694 opj_free(job); | |
| 3695 opj_aligned_free(h.wavelet); | |
| 3696 return OPJ_FALSE; | |
| 3697 } | |
| 3698 job->v.dn = v.dn; | |
| 3699 job->v.sn = v.sn; | |
| 3700 job->v.cas = v.cas; | |
| 3701 job->v.win_l_x0 = v.win_l_x0; | |
| 3702 job->v.win_l_x1 = v.win_l_x1; | |
| 3703 job->v.win_h_x0 = v.win_h_x0; | |
| 3704 job->v.win_h_x1 = v.win_h_x1; | |
| 3705 job->rh = rh; | |
| 3706 job->w = w; | |
| 3707 job->aj = aj; | |
| 3708 job->nb_columns = (j + 1 == num_jobs) ? (rw & (OPJ_UINT32)~ | |
| 3709 (NB_ELTS_V8 - 1)) - j * step_j : step_j; | |
| 3710 aj += job->nb_columns; | |
| 3711 opj_thread_pool_submit_job(tp, opj_dwt97_decode_v_func, job); | |
| 3712 } | |
| 3713 opj_thread_pool_wait_completion(tp, 0); | |
| 3714 } | |
| 3715 | |
| 3716 if (rw & (NB_ELTS_V8 - 1)) { | |
| 3717 OPJ_UINT32 k; | |
| 3718 | |
| 3719 j = rw & (NB_ELTS_V8 - 1); | |
| 3720 | |
| 3721 opj_v8dwt_interleave_v(&v, aj, w, j); | |
| 3722 opj_v8dwt_decode(&v); | |
| 3723 | |
| 3724 for (k = 0; k < rh; ++k) { | |
| 3725 memcpy(&aj[k * (OPJ_SIZE_T)w], &v.wavelet[k], | |
| 3726 (OPJ_SIZE_T)j * sizeof(OPJ_FLOAT32)); | |
| 3727 } | |
| 3728 } | |
| 3729 } | |
| 3730 | |
| 3731 opj_aligned_free(h.wavelet); | |
| 3732 return OPJ_TRUE; | |
| 3733 } | |
| 3734 | |
| 3735 static | |
| 3736 OPJ_BOOL opj_dwt_decode_partial_97(opj_tcd_tilecomp_t* OPJ_RESTRICT tilec, | |
| 3737 OPJ_UINT32 numres) | |
| 3738 { | |
| 3739 opj_sparse_array_int32_t* sa; | |
| 3740 opj_v8dwt_t h; | |
| 3741 opj_v8dwt_t v; | |
| 3742 OPJ_UINT32 resno; | |
| 3743 /* This value matches the maximum left/right extension given in tables */ | |
| 3744 /* F.2 and F.3 of the standard. Note: in opj_tcd_is_subband_area_of_interest() */ | |
| 3745 /* we currently use 3. */ | |
| 3746 const OPJ_UINT32 filter_width = 4U; | |
| 3747 | |
| 3748 opj_tcd_resolution_t* tr = tilec->resolutions; | |
| 3749 opj_tcd_resolution_t* tr_max = &(tilec->resolutions[numres - 1]); | |
| 3750 | |
| 3751 OPJ_UINT32 rw = (OPJ_UINT32)(tr->x1 - | |
| 3752 tr->x0); /* width of the resolution level computed */ | |
| 3753 OPJ_UINT32 rh = (OPJ_UINT32)(tr->y1 - | |
| 3754 tr->y0); /* height of the resolution level computed */ | |
| 3755 | |
| 3756 OPJ_SIZE_T l_data_size; | |
| 3757 | |
| 3758 /* Compute the intersection of the area of interest, expressed in tile coordinates */ | |
| 3759 /* with the tile coordinates */ | |
| 3760 OPJ_UINT32 win_tcx0 = tilec->win_x0; | |
| 3761 OPJ_UINT32 win_tcy0 = tilec->win_y0; | |
| 3762 OPJ_UINT32 win_tcx1 = tilec->win_x1; | |
| 3763 OPJ_UINT32 win_tcy1 = tilec->win_y1; | |
| 3764 | |
| 3765 if (tr_max->x0 == tr_max->x1 || tr_max->y0 == tr_max->y1) { | |
| 3766 return OPJ_TRUE; | |
| 3767 } | |
| 3768 | |
| 3769 sa = opj_dwt_init_sparse_array(tilec, numres); | |
| 3770 if (sa == NULL) { | |
| 3771 return OPJ_FALSE; | |
| 3772 } | |
| 3773 | |
| 3774 if (numres == 1U) { | |
| 3775 OPJ_BOOL ret = opj_sparse_array_int32_read(sa, | |
| 3776 tr_max->win_x0 - (OPJ_UINT32)tr_max->x0, | |
| 3777 tr_max->win_y0 - (OPJ_UINT32)tr_max->y0, | |
| 3778 tr_max->win_x1 - (OPJ_UINT32)tr_max->x0, | |
| 3779 tr_max->win_y1 - (OPJ_UINT32)tr_max->y0, | |
| 3780 tilec->data_win, | |
| 3781 1, tr_max->win_x1 - tr_max->win_x0, | |
| 3782 OPJ_TRUE); | |
| 3783 assert(ret); | |
| 3784 OPJ_UNUSED(ret); | |
| 3785 opj_sparse_array_int32_free(sa); | |
| 3786 return OPJ_TRUE; | |
| 3787 } | |
| 3788 | |
| 3789 l_data_size = opj_dwt_max_resolution(tr, numres); | |
| 3790 /* overflow check */ | |
| 3791 if (l_data_size > (SIZE_MAX / sizeof(opj_v8_t))) { | |
| 3792 /* FIXME event manager error callback */ | |
| 3793 opj_sparse_array_int32_free(sa); | |
| 3794 return OPJ_FALSE; | |
| 3795 } | |
| 3796 h.wavelet = (opj_v8_t*) opj_aligned_malloc(l_data_size * sizeof(opj_v8_t)); | |
| 3797 if (!h.wavelet) { | |
| 3798 /* FIXME event manager error callback */ | |
| 3799 opj_sparse_array_int32_free(sa); | |
| 3800 return OPJ_FALSE; | |
| 3801 } | |
| 3802 v.wavelet = h.wavelet; | |
| 3803 | |
| 3804 for (resno = 1; resno < numres; resno ++) { | |
| 3805 OPJ_UINT32 j; | |
| 3806 /* Window of interest subband-based coordinates */ | |
| 3807 OPJ_UINT32 win_ll_x0, win_ll_y0, win_ll_x1, win_ll_y1; | |
| 3808 OPJ_UINT32 win_hl_x0, win_hl_x1; | |
| 3809 OPJ_UINT32 win_lh_y0, win_lh_y1; | |
| 3810 /* Window of interest tile-resolution-based coordinates */ | |
| 3811 OPJ_UINT32 win_tr_x0, win_tr_x1, win_tr_y0, win_tr_y1; | |
| 3812 /* Tile-resolution subband-based coordinates */ | |
| 3813 OPJ_UINT32 tr_ll_x0, tr_ll_y0, tr_hl_x0, tr_lh_y0; | |
| 3814 | |
| 3815 ++tr; | |
| 3816 | |
| 3817 h.sn = (OPJ_INT32)rw; | |
| 3818 v.sn = (OPJ_INT32)rh; | |
| 3819 | |
| 3820 rw = (OPJ_UINT32)(tr->x1 - tr->x0); | |
| 3821 rh = (OPJ_UINT32)(tr->y1 - tr->y0); | |
| 3822 | |
| 3823 h.dn = (OPJ_INT32)(rw - (OPJ_UINT32)h.sn); | |
| 3824 h.cas = tr->x0 % 2; | |
| 3825 | |
| 3826 v.dn = (OPJ_INT32)(rh - (OPJ_UINT32)v.sn); | |
| 3827 v.cas = tr->y0 % 2; | |
| 3828 | |
| 3829 /* Get the subband coordinates for the window of interest */ | |
| 3830 /* LL band */ | |
| 3831 opj_dwt_get_band_coordinates(tilec, resno, 0, | |
| 3832 win_tcx0, win_tcy0, win_tcx1, win_tcy1, | |
| 3833 &win_ll_x0, &win_ll_y0, | |
| 3834 &win_ll_x1, &win_ll_y1); | |
| 3835 | |
| 3836 /* HL band */ | |
| 3837 opj_dwt_get_band_coordinates(tilec, resno, 1, | |
| 3838 win_tcx0, win_tcy0, win_tcx1, win_tcy1, | |
| 3839 &win_hl_x0, NULL, &win_hl_x1, NULL); | |
| 3840 | |
| 3841 /* LH band */ | |
| 3842 opj_dwt_get_band_coordinates(tilec, resno, 2, | |
| 3843 win_tcx0, win_tcy0, win_tcx1, win_tcy1, | |
| 3844 NULL, &win_lh_y0, NULL, &win_lh_y1); | |
| 3845 | |
| 3846 /* Beware: band index for non-LL0 resolution are 0=HL, 1=LH and 2=HH */ | |
| 3847 tr_ll_x0 = (OPJ_UINT32)tr->bands[1].x0; | |
| 3848 tr_ll_y0 = (OPJ_UINT32)tr->bands[0].y0; | |
| 3849 tr_hl_x0 = (OPJ_UINT32)tr->bands[0].x0; | |
| 3850 tr_lh_y0 = (OPJ_UINT32)tr->bands[1].y0; | |
| 3851 | |
| 3852 /* Subtract the origin of the bands for this tile, to the subwindow */ | |
| 3853 /* of interest band coordinates, so as to get them relative to the */ | |
| 3854 /* tile */ | |
| 3855 win_ll_x0 = opj_uint_subs(win_ll_x0, tr_ll_x0); | |
| 3856 win_ll_y0 = opj_uint_subs(win_ll_y0, tr_ll_y0); | |
| 3857 win_ll_x1 = opj_uint_subs(win_ll_x1, tr_ll_x0); | |
| 3858 win_ll_y1 = opj_uint_subs(win_ll_y1, tr_ll_y0); | |
| 3859 win_hl_x0 = opj_uint_subs(win_hl_x0, tr_hl_x0); | |
| 3860 win_hl_x1 = opj_uint_subs(win_hl_x1, tr_hl_x0); | |
| 3861 win_lh_y0 = opj_uint_subs(win_lh_y0, tr_lh_y0); | |
| 3862 win_lh_y1 = opj_uint_subs(win_lh_y1, tr_lh_y0); | |
| 3863 | |
| 3864 opj_dwt_segment_grow(filter_width, (OPJ_UINT32)h.sn, &win_ll_x0, &win_ll_x1); | |
| 3865 opj_dwt_segment_grow(filter_width, (OPJ_UINT32)h.dn, &win_hl_x0, &win_hl_x1); | |
| 3866 | |
| 3867 opj_dwt_segment_grow(filter_width, (OPJ_UINT32)v.sn, &win_ll_y0, &win_ll_y1); | |
| 3868 opj_dwt_segment_grow(filter_width, (OPJ_UINT32)v.dn, &win_lh_y0, &win_lh_y1); | |
| 3869 | |
| 3870 /* Compute the tile-resolution-based coordinates for the window of interest */ | |
| 3871 if (h.cas == 0) { | |
| 3872 win_tr_x0 = opj_uint_min(2 * win_ll_x0, 2 * win_hl_x0 + 1); | |
| 3873 win_tr_x1 = opj_uint_min(opj_uint_max(2 * win_ll_x1, 2 * win_hl_x1 + 1), rw); | |
| 3874 } else { | |
| 3875 win_tr_x0 = opj_uint_min(2 * win_hl_x0, 2 * win_ll_x0 + 1); | |
| 3876 win_tr_x1 = opj_uint_min(opj_uint_max(2 * win_hl_x1, 2 * win_ll_x1 + 1), rw); | |
| 3877 } | |
| 3878 | |
| 3879 if (v.cas == 0) { | |
| 3880 win_tr_y0 = opj_uint_min(2 * win_ll_y0, 2 * win_lh_y0 + 1); | |
| 3881 win_tr_y1 = opj_uint_min(opj_uint_max(2 * win_ll_y1, 2 * win_lh_y1 + 1), rh); | |
| 3882 } else { | |
| 3883 win_tr_y0 = opj_uint_min(2 * win_lh_y0, 2 * win_ll_y0 + 1); | |
| 3884 win_tr_y1 = opj_uint_min(opj_uint_max(2 * win_lh_y1, 2 * win_ll_y1 + 1), rh); | |
| 3885 } | |
| 3886 | |
| 3887 h.win_l_x0 = win_ll_x0; | |
| 3888 h.win_l_x1 = win_ll_x1; | |
| 3889 h.win_h_x0 = win_hl_x0; | |
| 3890 h.win_h_x1 = win_hl_x1; | |
| 3891 for (j = 0; j + (NB_ELTS_V8 - 1) < rh; j += NB_ELTS_V8) { | |
| 3892 if ((j + (NB_ELTS_V8 - 1) >= win_ll_y0 && j < win_ll_y1) || | |
| 3893 (j + (NB_ELTS_V8 - 1) >= win_lh_y0 + (OPJ_UINT32)v.sn && | |
| 3894 j < win_lh_y1 + (OPJ_UINT32)v.sn)) { | |
| 3895 opj_v8dwt_interleave_partial_h(&h, sa, j, opj_uint_min(NB_ELTS_V8, rh - j)); | |
| 3896 opj_v8dwt_decode(&h); | |
| 3897 if (!opj_sparse_array_int32_write(sa, | |
| 3898 win_tr_x0, j, | |
| 3899 win_tr_x1, j + NB_ELTS_V8, | |
| 3900 (OPJ_INT32*)&h.wavelet[win_tr_x0].f[0], | |
| 3901 NB_ELTS_V8, 1, OPJ_TRUE)) { | |
| 3902 /* FIXME event manager error callback */ | |
| 3903 opj_sparse_array_int32_free(sa); | |
| 3904 opj_aligned_free(h.wavelet); | |
| 3905 return OPJ_FALSE; | |
| 3906 } | |
| 3907 } | |
| 3908 } | |
| 3909 | |
| 3910 if (j < rh && | |
| 3911 ((j + (NB_ELTS_V8 - 1) >= win_ll_y0 && j < win_ll_y1) || | |
| 3912 (j + (NB_ELTS_V8 - 1) >= win_lh_y0 + (OPJ_UINT32)v.sn && | |
| 3913 j < win_lh_y1 + (OPJ_UINT32)v.sn))) { | |
| 3914 opj_v8dwt_interleave_partial_h(&h, sa, j, rh - j); | |
| 3915 opj_v8dwt_decode(&h); | |
| 3916 if (!opj_sparse_array_int32_write(sa, | |
| 3917 win_tr_x0, j, | |
| 3918 win_tr_x1, rh, | |
| 3919 (OPJ_INT32*)&h.wavelet[win_tr_x0].f[0], | |
| 3920 NB_ELTS_V8, 1, OPJ_TRUE)) { | |
| 3921 /* FIXME event manager error callback */ | |
| 3922 opj_sparse_array_int32_free(sa); | |
| 3923 opj_aligned_free(h.wavelet); | |
| 3924 return OPJ_FALSE; | |
| 3925 } | |
| 3926 } | |
| 3927 | |
| 3928 v.win_l_x0 = win_ll_y0; | |
| 3929 v.win_l_x1 = win_ll_y1; | |
| 3930 v.win_h_x0 = win_lh_y0; | |
| 3931 v.win_h_x1 = win_lh_y1; | |
| 3932 for (j = win_tr_x0; j < win_tr_x1; j += NB_ELTS_V8) { | |
| 3933 OPJ_UINT32 nb_elts = opj_uint_min(NB_ELTS_V8, win_tr_x1 - j); | |
| 3934 | |
| 3935 opj_v8dwt_interleave_partial_v(&v, sa, j, nb_elts); | |
| 3936 opj_v8dwt_decode(&v); | |
| 3937 | |
| 3938 if (!opj_sparse_array_int32_write(sa, | |
| 3939 j, win_tr_y0, | |
| 3940 j + nb_elts, win_tr_y1, | |
| 3941 (OPJ_INT32*)&h.wavelet[win_tr_y0].f[0], | |
| 3942 1, NB_ELTS_V8, OPJ_TRUE)) { | |
| 3943 /* FIXME event manager error callback */ | |
| 3944 opj_sparse_array_int32_free(sa); | |
| 3945 opj_aligned_free(h.wavelet); | |
| 3946 return OPJ_FALSE; | |
| 3947 } | |
| 3948 } | |
| 3949 } | |
| 3950 | |
| 3951 { | |
| 3952 OPJ_BOOL ret = opj_sparse_array_int32_read(sa, | |
| 3953 tr_max->win_x0 - (OPJ_UINT32)tr_max->x0, | |
| 3954 tr_max->win_y0 - (OPJ_UINT32)tr_max->y0, | |
| 3955 tr_max->win_x1 - (OPJ_UINT32)tr_max->x0, | |
| 3956 tr_max->win_y1 - (OPJ_UINT32)tr_max->y0, | |
| 3957 tilec->data_win, | |
| 3958 1, tr_max->win_x1 - tr_max->win_x0, | |
| 3959 OPJ_TRUE); | |
| 3960 assert(ret); | |
| 3961 OPJ_UNUSED(ret); | |
| 3962 } | |
| 3963 opj_sparse_array_int32_free(sa); | |
| 3964 | |
| 3965 opj_aligned_free(h.wavelet); | |
| 3966 return OPJ_TRUE; | |
| 3967 } | |
| 3968 | |
| 3969 | |
| 3970 OPJ_BOOL opj_dwt_decode_real(opj_tcd_t *p_tcd, | |
| 3971 opj_tcd_tilecomp_t* OPJ_RESTRICT tilec, | |
| 3972 OPJ_UINT32 numres) | |
| 3973 { | |
| 3974 if (p_tcd->whole_tile_decoding) { | |
| 3975 return opj_dwt_decode_tile_97(p_tcd->thread_pool, tilec, numres); | |
| 3976 } else { | |
| 3977 return opj_dwt_decode_partial_97(tilec, numres); | |
| 3978 } | |
| 3979 } |
