Mercurial > hgrepos > Python2 > PyMuPDF
comparison mupdf-source/thirdparty/tesseract/src/ccutil/scanutils.cpp @ 2:b50eed0cc0ef upstream
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
The directory name has changed: no version number in the expanded directory now.
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Mon, 15 Sep 2025 11:43:07 +0200 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| 1:1d09e1dec1d9 | 2:b50eed0cc0ef |
|---|---|
| 1 // Copyright 2006 Google Inc. | |
| 2 // All Rights Reserved. | |
| 3 // Author: renn | |
| 4 // | |
| 5 // Licensed under the Apache License, Version 2.0 (the "License"); | |
| 6 // you may not use this file except in compliance with the License. | |
| 7 // You may obtain a copy of the License at | |
| 8 // http://www.apache.org/licenses/LICENSE-2.0 | |
| 9 // Unless required by applicable law or agreed to in writing, software | |
| 10 // distributed under the License is distributed on an "AS IS" BASIS, | |
| 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
| 12 // See the License for the specific language governing permissions and | |
| 13 // limitations under the License. | |
| 14 | |
| 15 #ifdef HAVE_CONFIG_H | |
| 16 # include "config_auto.h" | |
| 17 #endif | |
| 18 | |
| 19 #include <cctype> | |
| 20 #include <climits> // for CHAR_BIT | |
| 21 #include <cmath> | |
| 22 #include <cstdarg> | |
| 23 #include <cstddef> | |
| 24 #include <cstdint> | |
| 25 #include <cstdio> | |
| 26 #include <cstring> | |
| 27 #include <limits> // for std::numeric_limits | |
| 28 | |
| 29 #include "scanutils.h" | |
| 30 | |
// Bit flags describing the conversion specification currently being parsed.
enum Flags {
  FL_SPLAT = 1 << 0, // '*' seen: parse the field but do not assign it
  FL_INV = 1 << 1,   // Character set is inverted
  FL_WIDTH = 1 << 2, // An explicit field width was given
  FL_MINUS = 1 << 3, // Negative number
};
| 37 | |
// Size "rank" of an integer conversion. 'h' length modifiers lower the rank
// and 'l' modifiers raise it, starting from RANK_INT.
enum Ranks {
  RANK_CHAR = -2,
  RANK_SHORT,    // -1
  RANK_INT,      //  0
  RANK_LONG,     //  1
  RANK_LONGLONG, //  2
  // Special value used for pointers; lies outside the regular rank range.
  RANK_PTR = std::numeric_limits<int>::max()
};
| 46 | |
| 47 const enum Ranks kMinRank = RANK_CHAR; | |
| 48 const enum Ranks kMaxRank = RANK_LONGLONG; | |
| 49 | |
| 50 const enum Ranks kIntMaxRank = RANK_LONGLONG; | |
| 51 const enum Ranks kSizeTRank = RANK_LONG; | |
| 52 const enum Ranks kPtrDiffRank = RANK_LONG; | |
| 53 | |
// Reason why format processing stopped early.
enum Bail {
  BAIL_NONE = 0, // Still running, no error condition
  BAIL_EOF = 1,  // Input ended
  BAIL_ERR = 2   // Input did not match the conversion
};
| 59 | |
| 60 // Helper functions ------------------------------------------------------------ | |
// Returns the number of bits in a long.
inline size_t LongBit() {
  const size_t bytes_per_long = sizeof(long);
  return bytes_per_long * CHAR_BIT;
}
| 64 | |
// Consumes ASCII whitespace from s. The first character that is not
// whitespace is pushed back so the next read sees it again, and is also
// returned (this is EOF when the stream ran out).
static inline int SkipSpace(FILE *s) {
  int next = fgetc(s);
  while (isascii(next) && isspace(next)) {
    next = fgetc(s);
  }
  ungetc(next, s); // Make sure next char is available for reading
  return next;
}
| 73 | |
// Sets bit number `bit` in a bitmap stored as an array of unsigned long words.
static inline void SetBit(unsigned long *bitmap, unsigned int bit) {
  const size_t bits_per_word = CHAR_BIT * sizeof(long);
  const size_t word_index = bit / bits_per_word;
  const size_t bit_offset = bit % bits_per_word;
  bitmap[word_index] |= 1UL << bit_offset;
}
| 77 | |
// Returns 1 if bit number `bit` is set in the bitmap, 0 otherwise.
static inline int TestBit(unsigned long *bitmap, unsigned int bit) {
  const size_t bits_per_word = CHAR_BIT * sizeof(long);
  const unsigned long word = bitmap[bit / bits_per_word];
  const unsigned long shifted = word >> (bit % bits_per_word);
  return static_cast<int>(shifted) & 1;
}
| 81 | |
// Returns the numeric value of digit character ch, or -1 if ch is not a
// valid digit.
//
// Decimal digits are accepted in full for base >= 10; for smaller bases only
// '0'..'7' are accepted. The letters 'A'..'F' / 'a'..'f' are accepted only
// for base 16. Letters beyond 'F' are rejected: they would map to values of
// 16 and above, which are not hexadecimal digits.
static inline int DigitValue(int ch, int base) {
  if (ch >= '0' && ch <= '9') {
    if (base >= 10 || ch <= '7') {
      return ch - '0';
    }
  } else if (base == 16) {
    if (ch >= 'A' && ch <= 'F') {
      return ch - 'A' + 10;
    }
    if (ch >= 'a' && ch <= 'f') {
      return ch - 'a' + 10;
    }
  }
  return -1;
}
| 94 | |
| 95 // IO (re-)implementations ----------------------------------------------------- | |
| 96 static uintmax_t streamtoumax(FILE *s, int base) { | |
| 97 int minus = 0; | |
| 98 uintmax_t v = 0; | |
| 99 int d, c = 0; | |
| 100 | |
| 101 for (c = fgetc(s); isascii(c) && isspace(c); c = fgetc(s)) { | |
| 102 ; | |
| 103 } | |
| 104 | |
| 105 // Single optional + or - | |
| 106 if (c == '-' || c == '+') { | |
| 107 minus = (c == '-'); | |
| 108 c = fgetc(s); | |
| 109 } | |
| 110 | |
| 111 // Assign correct base | |
| 112 if (base == 0) { | |
| 113 if (c == '0') { | |
| 114 c = fgetc(s); | |
| 115 if (c == 'x' || c == 'X') { | |
| 116 base = 16; | |
| 117 c = fgetc(s); | |
| 118 } else { | |
| 119 base = 8; | |
| 120 } | |
| 121 } | |
| 122 } else if (base == 16) { | |
| 123 if (c == '0') { | |
| 124 c = fgetc(s); | |
| 125 if (c == 'x' || c == 'X') { | |
| 126 c = fgetc(s); | |
| 127 } | |
| 128 } | |
| 129 } | |
| 130 | |
| 131 // Actual number parsing | |
| 132 for (; (c != EOF) && (d = DigitValue(c, base)) >= 0; c = fgetc(s)) { | |
| 133 v = v * base + d; | |
| 134 } | |
| 135 | |
| 136 ungetc(c, s); | |
| 137 return minus ? -v : v; | |
| 138 } | |
| 139 | |
// Reads a decimal floating point number from s and returns it as a double.
//
// Accepted syntax: optional leading ASCII whitespace, an optional '+' or '-'
// sign, integer digits, an optional '.' followed by fractional digits, and
// an optional exponent ('e' or 'E', optional sign, digits). The first
// character that does not belong to the number is pushed back onto the stream.
//
// The integer and fractional parts are accumulated in 64 bit integers.
// Digits that no longer fit are still consumed from the stream but handled
// without overflowing:
//  - surplus integer digits only scale the result by a power of ten,
//  - surplus fractional digits are ignored,
//  - the exponent saturates (the result is then 0 or infinity anyway).
static double streamtofloat(FILE *s) {
  // Largest accumulator value to which one more decimal digit can be appended.
  constexpr uint64_t kMaxAppendable = (std::numeric_limits<uint64_t>::max() - 9) / 10;
  // Exponents at or above this magnitude are not extended any further.
  constexpr int kExponentLimit = 10000;

  bool minus = false;
  uint64_t v = 0;            // Integer part
  int dropped_int_digits = 0; // Integer digits that did not fit into v
  uint64_t w = 0;            // Fractional digits read as an integer
  uint64_t k = 1;            // 10^(number of fractional digits kept in w)
  int c;

  for (c = fgetc(s); isascii(c) && isspace(c); c = fgetc(s)) {
    ;
  }

  // Single optional + or -
  if (c == '-' || c == '+') {
    minus = (c == '-');
    c = fgetc(s);
  }

  // Integer part
  for (; c >= '0' && c <= '9'; c = fgetc(s)) {
    if (v <= kMaxAppendable) {
      v = v * 10 + static_cast<uint64_t>(c - '0');
    } else {
      ++dropped_int_digits;
    }
  }

  // Fractional part
  if (c == '.') {
    for (c = fgetc(s); c >= '0' && c <= '9'; c = fgetc(s)) {
      // w < k always holds, so checking k is enough to protect both.
      if (k <= kMaxAppendable) {
        w = w * 10 + static_cast<uint64_t>(c - '0');
        k *= 10;
      }
    }
  }

  double f = static_cast<double>(v);
  if (dropped_int_digits > 0) {
    f *= std::pow(10.0, static_cast<double>(dropped_int_digits));
  }
  f += static_cast<double>(w) / static_cast<double>(k);

  // Optional exponent
  if (c == 'e' || c == 'E') {
    c = fgetc(s);
    int expsign = 1;
    if (c == '-' || c == '+') {
      expsign = (c == '-') ? -1 : 1;
      c = fgetc(s);
    }
    int exponent = 0;
    for (; c >= '0' && c <= '9'; c = fgetc(s)) {
      if (exponent < kExponentLimit) {
        exponent = exponent * 10 + (c - '0');
      }
    }
    exponent *= expsign;
    f *= std::pow(10.0, static_cast<double>(exponent));
  }
  ungetc(c, s); // Leave the terminating character for the next read

  return minus ? -f : f;
}
| 186 | |
static int tvfscanf(FILE *stream, const char *format, va_list ap);

// Variadic front end: packs the trailing arguments into a va_list, forwards
// everything to tvfscanf() and returns its result unchanged.
int tfscanf(FILE *stream, const char *format, ...) {
  va_list args;
  va_start(args, format);
  const int result = tvfscanf(stream, format, args);
  va_end(args);
  return result;
}
| 199 | |
| 200 static int tvfscanf(FILE *stream, const char *format, va_list ap) { | |
| 201 const char *p = format; | |
| 202 char ch; | |
| 203 int q = 0; | |
| 204 uintmax_t val = 0; | |
| 205 int rank = RANK_INT; // Default rank | |
| 206 unsigned int width = UINT_MAX; | |
| 207 int base; | |
| 208 int flags = 0; | |
| 209 enum { | |
| 210 ST_NORMAL, // Ground state | |
| 211 ST_FLAGS, // Special flags | |
| 212 ST_WIDTH, // Field width | |
| 213 ST_MODIFIERS, // Length or conversion modifiers | |
| 214 ST_MATCH_INIT, // Initial state of %[ sequence | |
| 215 ST_MATCH, // Main state of %[ sequence | |
| 216 ST_MATCH_RANGE, // After - in a %[ sequence | |
| 217 } state = ST_NORMAL; | |
| 218 char *sarg = nullptr; // %s %c or %[ string argument | |
| 219 enum Bail bail = BAIL_NONE; | |
| 220 int converted = 0; // Successful conversions | |
| 221 unsigned long | |
| 222 matchmap[((1 << CHAR_BIT) + (CHAR_BIT * sizeof(long) - 1)) / (CHAR_BIT * sizeof(long))]; | |
| 223 int matchinv = 0; // Is match map inverted? | |
| 224 unsigned char range_start = 0; | |
| 225 auto start_off = std::ftell(stream); | |
| 226 | |
| 227 // Skip leading spaces | |
| 228 SkipSpace(stream); | |
| 229 | |
| 230 while ((ch = *p++) && !bail) { | |
| 231 switch (state) { | |
| 232 case ST_NORMAL: | |
| 233 if (ch == '%') { | |
| 234 state = ST_FLAGS; | |
| 235 flags = 0; | |
| 236 rank = RANK_INT; | |
| 237 width = UINT_MAX; | |
| 238 } else if (isascii(ch) && isspace(ch)) { | |
| 239 SkipSpace(stream); | |
| 240 } else { | |
| 241 if (fgetc(stream) != ch) { | |
| 242 bail = BAIL_ERR; // Match failure | |
| 243 } | |
| 244 } | |
| 245 break; | |
| 246 | |
| 247 case ST_FLAGS: | |
| 248 if (ch == '*') { | |
| 249 flags |= FL_SPLAT; | |
| 250 } else if ('0' <= ch && ch <= '9') { | |
| 251 width = (ch - '0'); | |
| 252 state = ST_WIDTH; | |
| 253 flags |= FL_WIDTH; | |
| 254 } else { | |
| 255 state = ST_MODIFIERS; | |
| 256 p--; // Process this character again | |
| 257 } | |
| 258 break; | |
| 259 | |
| 260 case ST_WIDTH: | |
| 261 if (ch >= '0' && ch <= '9') { | |
| 262 width = width * 10 + (ch - '0'); | |
| 263 } else { | |
| 264 state = ST_MODIFIERS; | |
| 265 p--; // Process this character again | |
| 266 } | |
| 267 break; | |
| 268 | |
| 269 case ST_MODIFIERS: | |
| 270 switch (ch) { | |
| 271 // Length modifiers - nonterminal sequences | |
| 272 case 'h': | |
| 273 rank--; // Shorter rank | |
| 274 break; | |
| 275 case 'l': | |
| 276 rank++; // Longer rank | |
| 277 break; | |
| 278 case 'j': | |
| 279 rank = kIntMaxRank; | |
| 280 break; | |
| 281 case 'z': | |
| 282 rank = kSizeTRank; | |
| 283 break; | |
| 284 case 't': | |
| 285 rank = kPtrDiffRank; | |
| 286 break; | |
| 287 case 'L': | |
| 288 case 'q': | |
| 289 rank = RANK_LONGLONG; // long double/long long | |
| 290 break; | |
| 291 | |
| 292 default: | |
| 293 // Output modifiers - terminal sequences | |
| 294 state = ST_NORMAL; // Next state will be normal | |
| 295 if (rank < kMinRank) { // Canonicalize rank | |
| 296 rank = kMinRank; | |
| 297 } else if (rank > kMaxRank) { | |
| 298 rank = kMaxRank; | |
| 299 } | |
| 300 | |
| 301 switch (ch) { | |
| 302 case 'P': // Upper case pointer | |
| 303 case 'p': // Pointer | |
| 304 rank = RANK_PTR; | |
| 305 base = 0; | |
| 306 goto scan_int; | |
| 307 | |
| 308 case 'i': // Base-independent integer | |
| 309 base = 0; | |
| 310 goto scan_int; | |
| 311 | |
| 312 case 'd': // Decimal integer | |
| 313 base = 10; | |
| 314 goto scan_int; | |
| 315 | |
| 316 case 'o': // Octal integer | |
| 317 base = 8; | |
| 318 goto scan_int; | |
| 319 | |
| 320 case 'u': // Unsigned decimal integer | |
| 321 base = 10; | |
| 322 goto scan_int; | |
| 323 | |
| 324 case 'x': // Hexadecimal integer | |
| 325 case 'X': | |
| 326 base = 16; | |
| 327 goto scan_int; | |
| 328 | |
| 329 case 'n': // Number of characters consumed | |
| 330 val = std::ftell(stream) - start_off; | |
| 331 goto set_integer; | |
| 332 | |
| 333 scan_int: | |
| 334 q = SkipSpace(stream); | |
| 335 if (q <= 0) { | |
| 336 bail = BAIL_EOF; | |
| 337 break; | |
| 338 } | |
| 339 val = streamtoumax(stream, base); | |
| 340 // fall through | |
| 341 | |
| 342 set_integer: | |
| 343 if (!(flags & FL_SPLAT)) { | |
| 344 converted++; | |
| 345 switch (rank) { | |
| 346 case RANK_CHAR: | |
| 347 *va_arg(ap, unsigned char *) = static_cast<unsigned char>(val); | |
| 348 break; | |
| 349 case RANK_SHORT: | |
| 350 *va_arg(ap, unsigned short *) = static_cast<unsigned short>(val); | |
| 351 break; | |
| 352 case RANK_INT: | |
| 353 *va_arg(ap, unsigned int *) = static_cast<unsigned int>(val); | |
| 354 break; | |
| 355 case RANK_LONG: | |
| 356 *va_arg(ap, unsigned long *) = static_cast<unsigned long>(val); | |
| 357 break; | |
| 358 case RANK_LONGLONG: | |
| 359 *va_arg(ap, unsigned long long *) = static_cast<unsigned long long>(val); | |
| 360 break; | |
| 361 case RANK_PTR: | |
| 362 *va_arg(ap, void **) = reinterpret_cast<void *>(static_cast<uintptr_t>(val)); | |
| 363 break; | |
| 364 } | |
| 365 } | |
| 366 break; | |
| 367 | |
| 368 case 'f': // Preliminary float value parsing | |
| 369 case 'g': | |
| 370 case 'G': | |
| 371 case 'e': | |
| 372 case 'E': | |
| 373 q = SkipSpace(stream); | |
| 374 if (q <= 0) { | |
| 375 bail = BAIL_EOF; | |
| 376 break; | |
| 377 } | |
| 378 | |
| 379 { | |
| 380 double fval = streamtofloat(stream); | |
| 381 if (!(flags & FL_SPLAT)) { | |
| 382 if (rank == RANK_INT) { | |
| 383 *va_arg(ap, float *) = static_cast<float>(fval); | |
| 384 } else if (rank == RANK_LONG) { | |
| 385 *va_arg(ap, double *) = static_cast<double>(fval); | |
| 386 } | |
| 387 converted++; | |
| 388 } | |
| 389 } | |
| 390 break; | |
| 391 | |
| 392 case 'c': // Character | |
| 393 width = (flags & FL_WIDTH) ? width : 1; // Default width == 1 | |
| 394 sarg = va_arg(ap, char *); | |
| 395 while (width--) { | |
| 396 if ((q = fgetc(stream)) <= 0) { | |
| 397 bail = BAIL_EOF; | |
| 398 break; | |
| 399 } | |
| 400 if (!(flags & FL_SPLAT)) { | |
| 401 *sarg++ = q; | |
| 402 converted++; | |
| 403 } | |
| 404 } | |
| 405 break; | |
| 406 | |
| 407 case 's': // String | |
| 408 { | |
| 409 if (!(flags & FL_SPLAT)) { | |
| 410 sarg = va_arg(ap, char *); | |
| 411 } | |
| 412 unsigned length = 0; | |
| 413 while (width--) { | |
| 414 q = fgetc(stream); | |
| 415 if ((isascii(q) && isspace(q)) || (q <= 0)) { | |
| 416 ungetc(q, stream); | |
| 417 break; | |
| 418 } | |
| 419 if (!(flags & FL_SPLAT)) { | |
| 420 sarg[length] = q; | |
| 421 } | |
| 422 length++; | |
| 423 } | |
| 424 if (length == 0) { | |
| 425 bail = BAIL_EOF; | |
| 426 } else if (!(flags & FL_SPLAT)) { | |
| 427 sarg[length] = '\0'; // Terminate output | |
| 428 converted++; | |
| 429 } | |
| 430 } break; | |
| 431 | |
| 432 case '[': // Character range | |
| 433 sarg = va_arg(ap, char *); | |
| 434 state = ST_MATCH_INIT; | |
| 435 matchinv = 0; | |
| 436 memset(matchmap, 0, sizeof matchmap); | |
| 437 break; | |
| 438 | |
| 439 case '%': // %% sequence | |
| 440 if (fgetc(stream) != '%') { | |
| 441 bail = BAIL_ERR; | |
| 442 } | |
| 443 break; | |
| 444 | |
| 445 default: // Anything else | |
| 446 bail = BAIL_ERR; // Unknown sequence | |
| 447 break; | |
| 448 } | |
| 449 } | |
| 450 break; | |
| 451 | |
| 452 case ST_MATCH_INIT: // Initial state for %[ match | |
| 453 if (ch == '^' && !(flags & FL_INV)) { | |
| 454 matchinv = 1; | |
| 455 } else { | |
| 456 SetBit(matchmap, static_cast<unsigned char>(ch)); | |
| 457 state = ST_MATCH; | |
| 458 } | |
| 459 break; | |
| 460 | |
| 461 case ST_MATCH: // Main state for %[ match | |
| 462 if (ch == ']') { | |
| 463 goto match_run; | |
| 464 } else if (ch == '-') { | |
| 465 range_start = static_cast<unsigned char>(ch); | |
| 466 state = ST_MATCH_RANGE; | |
| 467 } else { | |
| 468 SetBit(matchmap, static_cast<unsigned char>(ch)); | |
| 469 } | |
| 470 break; | |
| 471 | |
| 472 case ST_MATCH_RANGE: // %[ match after - | |
| 473 if (ch == ']') { | |
| 474 SetBit(matchmap, static_cast<unsigned char>('-')); | |
| 475 goto match_run; | |
| 476 } else { | |
| 477 int i; | |
| 478 for (i = range_start; i < (static_cast<unsigned char>(ch)); i++) { | |
| 479 SetBit(matchmap, i); | |
| 480 } | |
| 481 state = ST_MATCH; | |
| 482 } | |
| 483 break; | |
| 484 | |
| 485 match_run: // Match expression finished | |
| 486 char *oarg = sarg; | |
| 487 while (width) { | |
| 488 q = fgetc(stream); | |
| 489 auto qc = static_cast<unsigned char>(q); | |
| 490 if (q <= 0 || !(TestBit(matchmap, qc) ^ matchinv)) { | |
| 491 ungetc(q, stream); | |
| 492 break; | |
| 493 } | |
| 494 if (!(flags & FL_SPLAT)) { | |
| 495 *sarg = q; | |
| 496 } | |
| 497 sarg++; | |
| 498 } | |
| 499 if (oarg == sarg) { | |
| 500 bail = (q <= 0) ? BAIL_EOF : BAIL_ERR; | |
| 501 } else if (!(flags & FL_SPLAT)) { | |
| 502 *sarg = '\0'; | |
| 503 converted++; | |
| 504 } | |
| 505 break; | |
| 506 } | |
| 507 } | |
| 508 | |
| 509 if (bail == BAIL_EOF && !converted) { | |
| 510 converted = -1; // Return EOF (-1) | |
| 511 } | |
| 512 | |
| 513 return converted; | |
| 514 } |
