view mupdf-source/thirdparty/tesseract/src/ccutil/scanutils.cpp @ 2:b50eed0cc0ef upstream

ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4. The directory name has changed: no version number in the expanded directory now.
author Franz Glasner <fzglas.hg@dom66.de>
date Mon, 15 Sep 2025 11:43:07 +0200
parents
children
line wrap: on
line source

// Copyright 2006 Google Inc.
// All Rights Reserved.
// Author: renn
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#ifdef HAVE_CONFIG_H
#  include "config_auto.h"
#endif

#include <cctype>
#include <climits> // for CHAR_BIT
#include <cmath>
#include <cstdarg>
#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <cstring>
#include <limits> // for std::numeric_limits

#include "scanutils.h"

enum Flags {
  FL_SPLAT = 0x01, // Drop the value, do not assign
  FL_INV = 0x02,   // Character-set with inverse
  FL_WIDTH = 0x04, // Field width specified
  FL_MINUS = 0x08, // Negative number
};

enum Ranks {
  RANK_CHAR = -2,
  RANK_SHORT = -1,
  RANK_INT = 0,
  RANK_LONG = 1,
  RANK_LONGLONG = 2,
  RANK_PTR = std::numeric_limits<int>::max() // Special value used for pointers
};

const enum Ranks kMinRank = RANK_CHAR;
const enum Ranks kMaxRank = RANK_LONGLONG;

const enum Ranks kIntMaxRank = RANK_LONGLONG;
const enum Ranks kSizeTRank = RANK_LONG;
const enum Ranks kPtrDiffRank = RANK_LONG;

enum Bail {
  BAIL_NONE = 0, // No error condition
  BAIL_EOF,      // Hit EOF
  BAIL_ERR       // Conversion mismatch
};

// Helper functions ------------------------------------------------------------
inline size_t LongBit() {
  return CHAR_BIT * sizeof(long);
}

static inline int SkipSpace(FILE *s) {
  int p;
  while (isascii(p = fgetc(s)) && isspace(p)) {
    ;
  }
  ungetc(p, s); // Make sure next char is available for reading
  return p;
}

static inline void SetBit(unsigned long *bitmap, unsigned int bit) {
  bitmap[bit / LongBit()] |= 1UL << (bit % LongBit());
}

static inline int TestBit(unsigned long *bitmap, unsigned int bit) {
  return static_cast<int>(bitmap[bit / LongBit()] >> (bit % LongBit())) & 1;
}

static inline int DigitValue(int ch, int base) {
  if (ch >= '0' && ch <= '9') {
    if (base >= 10 || ch <= '7') {
      return ch - '0';
    }
  } else if (ch >= 'A' && ch <= 'Z' && base == 16) {
    return ch - 'A' + 10;
  } else if (ch >= 'a' && ch <= 'z' && base == 16) {
    return ch - 'a' + 10;
  }
  return -1;
}

// IO (re-)implementations -----------------------------------------------------
static uintmax_t streamtoumax(FILE *s, int base) {
  int minus = 0;
  uintmax_t v = 0;
  int d, c = 0;

  for (c = fgetc(s); isascii(c) && isspace(c); c = fgetc(s)) {
    ;
  }

  // Single optional + or -
  if (c == '-' || c == '+') {
    minus = (c == '-');
    c = fgetc(s);
  }

  // Assign correct base
  if (base == 0) {
    if (c == '0') {
      c = fgetc(s);
      if (c == 'x' || c == 'X') {
        base = 16;
        c = fgetc(s);
      } else {
        base = 8;
      }
    }
  } else if (base == 16) {
    if (c == '0') {
      c = fgetc(s);
      if (c == 'x' || c == 'X') {
        c = fgetc(s);
      }
    }
  }

  // Actual number parsing
  for (; (c != EOF) && (d = DigitValue(c, base)) >= 0; c = fgetc(s)) {
    v = v * base + d;
  }

  ungetc(c, s);
  return minus ? -v : v;
}

static double streamtofloat(FILE *s) {
  bool minus = false;
  uint64_t v = 0;
  int d, c;
  uint64_t k = 1;
  uint64_t w = 0;

  for (c = fgetc(s); isascii(c) && isspace(c); c = fgetc(s)) {
    ;
  }

  // Single optional + or -
  if (c == '-' || c == '+') {
    minus = (c == '-');
    c = fgetc(s);
  }

  // Actual number parsing
  for (; c != EOF && (d = DigitValue(c, 10)) >= 0; c = fgetc(s)) {
    v = v * 10 + d;
  }
  if (c == '.') {
    for (c = fgetc(s); c != EOF && (d = DigitValue(c, 10)) >= 0; c = fgetc(s)) {
      w = w * 10 + d;
      k *= 10;
    }
  }
  double f = v + static_cast<double>(w) / k;
  if (c == 'e' || c == 'E') {
    c = fgetc(s);
    int expsign = 1;
    if (c == '-' || c == '+') {
      expsign = (c == '-') ? -1 : 1;
      c = fgetc(s);
    }
    int exponent = 0;
    for (; (c != EOF) && (d = DigitValue(c, 10)) >= 0; c = fgetc(s)) {
      exponent = exponent * 10 + d;
    }
    exponent *= expsign;
    f *= pow(10.0, static_cast<double>(exponent));
  }
  ungetc(c, s);

  return minus ? -f : f;
}

static int tvfscanf(FILE *stream, const char *format, va_list ap);

int tfscanf(FILE *stream, const char *format, ...) {
  va_list ap;
  int rv;

  va_start(ap, format);
  rv = tvfscanf(stream, format, ap);
  va_end(ap);

  return rv;
}

static int tvfscanf(FILE *stream, const char *format, va_list ap) {
  const char *p = format;
  char ch;
  int q = 0;
  uintmax_t val = 0;
  int rank = RANK_INT; // Default rank
  unsigned int width = UINT_MAX;
  int base;
  int flags = 0;
  enum {
    ST_NORMAL,      // Ground state
    ST_FLAGS,       // Special flags
    ST_WIDTH,       // Field width
    ST_MODIFIERS,   // Length or conversion modifiers
    ST_MATCH_INIT,  // Initial state of %[ sequence
    ST_MATCH,       // Main state of %[ sequence
    ST_MATCH_RANGE, // After - in a %[ sequence
  } state = ST_NORMAL;
  char *sarg = nullptr; // %s %c or %[ string argument
  enum Bail bail = BAIL_NONE;
  int converted = 0; // Successful conversions
  unsigned long
      matchmap[((1 << CHAR_BIT) + (CHAR_BIT * sizeof(long) - 1)) / (CHAR_BIT * sizeof(long))];
  int matchinv = 0; // Is match map inverted?
  unsigned char range_start = 0;
  auto start_off = std::ftell(stream);

  // Skip leading spaces
  SkipSpace(stream);

  while ((ch = *p++) && !bail) {
    switch (state) {
      case ST_NORMAL:
        if (ch == '%') {
          state = ST_FLAGS;
          flags = 0;
          rank = RANK_INT;
          width = UINT_MAX;
        } else if (isascii(ch) && isspace(ch)) {
          SkipSpace(stream);
        } else {
          if (fgetc(stream) != ch) {
            bail = BAIL_ERR; // Match failure
          }
        }
        break;

      case ST_FLAGS:
        if (ch == '*') {
          flags |= FL_SPLAT;
        } else if ('0' <= ch && ch <= '9') {
          width = (ch - '0');
          state = ST_WIDTH;
          flags |= FL_WIDTH;
        } else {
          state = ST_MODIFIERS;
          p--; // Process this character again
        }
        break;

      case ST_WIDTH:
        if (ch >= '0' && ch <= '9') {
          width = width * 10 + (ch - '0');
        } else {
          state = ST_MODIFIERS;
          p--; // Process this character again
        }
        break;

      case ST_MODIFIERS:
        switch (ch) {
          // Length modifiers - nonterminal sequences
          case 'h':
            rank--; // Shorter rank
            break;
          case 'l':
            rank++; // Longer rank
            break;
          case 'j':
            rank = kIntMaxRank;
            break;
          case 'z':
            rank = kSizeTRank;
            break;
          case 't':
            rank = kPtrDiffRank;
            break;
          case 'L':
          case 'q':
            rank = RANK_LONGLONG; // long double/long long
            break;

          default:
            // Output modifiers - terminal sequences
            state = ST_NORMAL;   // Next state will be normal
            if (rank < kMinRank) { // Canonicalize rank
              rank = kMinRank;
            } else if (rank > kMaxRank) {
              rank = kMaxRank;
            }

            switch (ch) {
              case 'P': // Upper case pointer
              case 'p': // Pointer
                rank = RANK_PTR;
                base = 0;
                goto scan_int;

              case 'i': // Base-independent integer
                base = 0;
                goto scan_int;

              case 'd': // Decimal integer
                base = 10;
                goto scan_int;

              case 'o': // Octal integer
                base = 8;
                goto scan_int;

              case 'u': // Unsigned decimal integer
                base = 10;
                goto scan_int;

              case 'x': // Hexadecimal integer
              case 'X':
                base = 16;
                goto scan_int;

              case 'n': // Number of characters consumed
                val = std::ftell(stream) - start_off;
                goto set_integer;

              scan_int:
                q = SkipSpace(stream);
                if (q <= 0) {
                  bail = BAIL_EOF;
                  break;
                }
                val = streamtoumax(stream, base);
                // fall through

              set_integer:
                if (!(flags & FL_SPLAT)) {
                  converted++;
                  switch (rank) {
                    case RANK_CHAR:
                      *va_arg(ap, unsigned char *) = static_cast<unsigned char>(val);
                      break;
                    case RANK_SHORT:
                      *va_arg(ap, unsigned short *) = static_cast<unsigned short>(val);
                      break;
                    case RANK_INT:
                      *va_arg(ap, unsigned int *) = static_cast<unsigned int>(val);
                      break;
                    case RANK_LONG:
                      *va_arg(ap, unsigned long *) = static_cast<unsigned long>(val);
                      break;
                    case RANK_LONGLONG:
                      *va_arg(ap, unsigned long long *) = static_cast<unsigned long long>(val);
                      break;
                    case RANK_PTR:
                      *va_arg(ap, void **) = reinterpret_cast<void *>(static_cast<uintptr_t>(val));
                      break;
                  }
                }
                break;

              case 'f': // Preliminary float value parsing
              case 'g':
              case 'G':
              case 'e':
              case 'E':
                q = SkipSpace(stream);
                if (q <= 0) {
                  bail = BAIL_EOF;
                  break;
                }

                {
                  double fval = streamtofloat(stream);
                  if (!(flags & FL_SPLAT)) {
                    if (rank == RANK_INT) {
                      *va_arg(ap, float *) = static_cast<float>(fval);
                    } else if (rank == RANK_LONG) {
                      *va_arg(ap, double *) = static_cast<double>(fval);
                    }
                    converted++;
                  }
                }
                break;

              case 'c':                                 // Character
                width = (flags & FL_WIDTH) ? width : 1; // Default width == 1
                sarg = va_arg(ap, char *);
                while (width--) {
                  if ((q = fgetc(stream)) <= 0) {
                    bail = BAIL_EOF;
                    break;
                  }
                  if (!(flags & FL_SPLAT)) {
                    *sarg++ = q;
                    converted++;
                  }
                }
                break;

              case 's': // String
              {
                if (!(flags & FL_SPLAT)) {
                  sarg = va_arg(ap, char *);
                }
                unsigned length = 0;
                while (width--) {
                  q = fgetc(stream);
                  if ((isascii(q) && isspace(q)) || (q <= 0)) {
                    ungetc(q, stream);
                    break;
                  }
                  if (!(flags & FL_SPLAT)) {
                    sarg[length] = q;
                  }
                  length++;
                }
                if (length == 0) {
                  bail = BAIL_EOF;
                } else if (!(flags & FL_SPLAT)) {
                  sarg[length] = '\0'; // Terminate output
                  converted++;
                }
              } break;

              case '[': // Character range
                sarg = va_arg(ap, char *);
                state = ST_MATCH_INIT;
                matchinv = 0;
                memset(matchmap, 0, sizeof matchmap);
                break;

              case '%': // %% sequence
                if (fgetc(stream) != '%') {
                  bail = BAIL_ERR;
                }
                break;

              default:           // Anything else
                bail = BAIL_ERR; // Unknown sequence
                break;
            }
        }
        break;

      case ST_MATCH_INIT: // Initial state for %[ match
        if (ch == '^' && !(flags & FL_INV)) {
          matchinv = 1;
        } else {
          SetBit(matchmap, static_cast<unsigned char>(ch));
          state = ST_MATCH;
        }
        break;

      case ST_MATCH: // Main state for %[ match
        if (ch == ']') {
          goto match_run;
        } else if (ch == '-') {
          range_start = static_cast<unsigned char>(ch);
          state = ST_MATCH_RANGE;
        } else {
          SetBit(matchmap, static_cast<unsigned char>(ch));
        }
        break;

      case ST_MATCH_RANGE: // %[ match after -
        if (ch == ']') {
          SetBit(matchmap, static_cast<unsigned char>('-'));
          goto match_run;
        } else {
          int i;
          for (i = range_start; i < (static_cast<unsigned char>(ch)); i++) {
            SetBit(matchmap, i);
          }
          state = ST_MATCH;
        }
        break;

      match_run: // Match expression finished
        char *oarg = sarg;
        while (width) {
          q = fgetc(stream);
          auto qc = static_cast<unsigned char>(q);
          if (q <= 0 || !(TestBit(matchmap, qc) ^ matchinv)) {
            ungetc(q, stream);
            break;
          }
          if (!(flags & FL_SPLAT)) {
            *sarg = q;
          }
          sarg++;
        }
        if (oarg == sarg) {
          bail = (q <= 0) ? BAIL_EOF : BAIL_ERR;
        } else if (!(flags & FL_SPLAT)) {
          *sarg = '\0';
          converted++;
        }
        break;
    }
  }

  if (bail == BAIL_EOF && !converted) {
    converted = -1; // Return EOF (-1)
  }

  return converted;
}