Mercurial > hgrepos > Python2 > PyMuPDF
diff mupdf-source/thirdparty/tesseract/src/ccutil/bitvector.cpp @ 2:b50eed0cc0ef upstream
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
The directory name has changed: no version number in the expanded directory now.
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Mon, 15 Sep 2025 11:43:07 +0200 |
| parents | |
| children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/mupdf-source/thirdparty/tesseract/src/ccutil/bitvector.cpp Mon Sep 15 11:43:07 2025 +0200 @@ -0,0 +1,230 @@ +// Copyright 2011 Google Inc. All Rights Reserved. +// Author: rays@google.com (Ray Smith) +/////////////////////////////////////////////////////////////////////// +// File: bitvector.cpp +// Description: Class replacement for BITVECTOR. +// Author: Ray Smith +// +// (C) Copyright 2011, Google Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +/////////////////////////////////////////////////////////////////////// + +#include "bitvector.h" +#include <algorithm> +#include <cstring> +#include "helpers.h" +#include "serialis.h" // for tesseract::Serialize + +namespace tesseract { + +// Fast lookup table to get the first least significant set bit in a byte. +// For zero, the table has 255, but since it is a special case, most code +// that uses this table will check for zero before looking up lsb_index_. +const uint8_t BitVector::lsb_index_[256] = { + 255, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, + 0, 1, 0, 5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, + 1, 0, 2, 0, 1, 0, 6, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, + 0, 3, 0, 1, 0, 2, 0, 1, 0, 5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, + 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 7, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, + 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, + 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 6, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, + 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 5, 0, 1, 0, 2, 0, 1, 0, + 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0}; + +// Fast lookup table to get the residual bits after zeroing the first (lowest) +// set bit in a byte. +const uint8_t BitVector::lsb_eroded_[256] = { + 0, 0, 0, 0x2, 0, 0x4, 0x4, 0x6, 0, 0x8, 0x8, 0x0a, 0x08, 0x0c, 0x0c, 0x0e, + 0, 0x10, 0x10, 0x12, 0x10, 0x14, 0x14, 0x16, 0x10, 0x18, 0x18, 0x1a, 0x18, 0x1c, 0x1c, 0x1e, + 0, 0x20, 0x20, 0x22, 0x20, 0x24, 0x24, 0x26, 0x20, 0x28, 0x28, 0x2a, 0x28, 0x2c, 0x2c, 0x2e, + 0x20, 0x30, 0x30, 0x32, 0x30, 0x34, 0x34, 0x36, 0x30, 0x38, 0x38, 0x3a, 0x38, 0x3c, 0x3c, 0x3e, + 0, 0x40, 0x40, 0x42, 0x40, 0x44, 0x44, 0x46, 0x40, 0x48, 0x48, 0x4a, 0x48, 0x4c, 0x4c, 0x4e, + 0x40, 0x50, 0x50, 0x52, 0x50, 0x54, 0x54, 0x56, 0x50, 0x58, 0x58, 0x5a, 0x58, 0x5c, 0x5c, 0x5e, + 0x40, 0x60, 0x60, 0x62, 0x60, 0x64, 0x64, 0x66, 0x60, 0x68, 0x68, 0x6a, 0x68, 0x6c, 0x6c, 0x6e, + 0x60, 0x70, 0x70, 0x72, 0x70, 0x74, 0x74, 0x76, 0x70, 0x78, 0x78, 0x7a, 0x78, 0x7c, 0x7c, 0x7e, + 0, 0x80, 0x80, 0x82, 0x80, 0x84, 0x84, 0x86, 0x80, 0x88, 0x88, 0x8a, 0x88, 0x8c, 0x8c, 0x8e, + 0x80, 0x90, 0x90, 0x92, 0x90, 0x94, 0x94, 0x96, 0x90, 0x98, 0x98, 0x9a, 0x98, 0x9c, 0x9c, 0x9e, + 0x80, 0xa0, 0xa0, 0xa2, 0xa0, 0xa4, 0xa4, 0xa6, 0xa0, 0xa8, 0xa8, 0xaa, 0xa8, 0xac, 0xac, 0xae, + 0xa0, 0xb0, 0xb0, 0xb2, 0xb0, 0xb4, 0xb4, 0xb6, 0xb0, 0xb8, 0xb8, 0xba, 0xb8, 0xbc, 0xbc, 0xbe, + 0x80, 0xc0, 0xc0, 0xc2, 0xc0, 0xc4, 0xc4, 0xc6, 0xc0, 0xc8, 0xc8, 0xca, 0xc8, 0xcc, 0xcc, 0xce, + 0xc0, 0xd0, 0xd0, 0xd2, 0xd0, 0xd4, 0xd4, 0xd6, 0xd0, 0xd8, 0xd8, 0xda, 0xd8, 0xdc, 0xdc, 0xde, + 0xc0, 0xe0, 0xe0, 0xe2, 0xe0, 0xe4, 0xe4, 0xe6, 0xe0, 0xe8, 0xe8, 0xea, 0xe8, 0xec, 0xec, 0xee, + 0xe0, 0xf0, 0xf0, 0xf2, 0xf0, 0xf4, 0xf4, 0xf6, 0xf0, 0xf8, 0xf8, 0xfa, 0xf8, 0xfc, 0xfc, 0xfe}; + +// Fast lookup table to give the number of set bits in a byte. +const int BitVector::hamming_table_[256] = { + 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, + 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, + 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, + 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, + 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, + 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, + 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, + 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8}; + +BitVector &BitVector::operator=(const BitVector &src) { + array_ = src.array_; + bit_size_ = src.bit_size_; + return *this; +} + +// Initializes the array to length * false. +void BitVector::Init(int length) { + Alloc(length); + SetAllFalse(); +} + +// Writes to the given file. Returns false in case of error. +bool BitVector::Serialize(FILE *fp) const { + if (!tesseract::Serialize(fp, &bit_size_)) { + return false; + } + int wordlen = WordLength(); + return tesseract::Serialize(fp, &array_[0], wordlen); +} + +// Reads from the given file. Returns false in case of error. +// If swap is true, assumes a big/little-endian swap is needed. +bool BitVector::DeSerialize(bool swap, FILE *fp) { + uint32_t new_bit_size; + if (!tesseract::DeSerialize(fp, &new_bit_size)) { + return false; + } + if (swap) { + ReverseN(&new_bit_size, sizeof(new_bit_size)); + } + Alloc(new_bit_size); + int wordlen = WordLength(); + if (!tesseract::DeSerialize(fp, &array_[0], wordlen)) { + return false; + } + if (swap) { + for (int i = 0; i < wordlen; ++i) { + ReverseN(&array_[i], sizeof(array_[i])); + } + } + return true; +} + +void BitVector::SetAllFalse() { + memset(&array_[0], 0, ByteLength()); +} +void BitVector::SetAllTrue() { + memset(&array_[0], ~0, ByteLength()); +} + +// Returns the index of the next set bit after the given index. +// Useful for quickly iterating through the set bits in a sparse vector. +int BitVector::NextSetBit(int prev_bit) const { + // Move on to the next bit. + int next_bit = prev_bit + 1; + if (next_bit >= bit_size_) { + return -1; + } + // Check the remains of the word containing the next_bit first. + int next_word = WordIndex(next_bit); + int bit_index = next_word * kBitFactor; + int word_end = bit_index + kBitFactor; + uint32_t word = array_[next_word]; + uint8_t byte = word & 0xff; + while (bit_index < word_end) { + if (bit_index + 8 > next_bit && byte != 0) { + while (bit_index + lsb_index_[byte] < next_bit && byte != 0) { + byte = lsb_eroded_[byte]; + } + if (byte != 0) { + return bit_index + lsb_index_[byte]; + } + } + word >>= 8; + bit_index += 8; + byte = word & 0xff; + } + // next_word didn't contain a 1, so find the next word with set bit. + ++next_word; + int wordlen = WordLength(); + while (next_word < wordlen && (word = array_[next_word]) == 0) { + ++next_word; + bit_index += kBitFactor; + } + if (bit_index >= bit_size_) { + return -1; + } + // Find the first non-zero byte within the word. + while ((word & 0xff) == 0) { + word >>= 8; + bit_index += 8; + } + return bit_index + lsb_index_[word & 0xff]; +} + +// Returns the number of set bits in the vector. +int BitVector::NumSetBits() const { + int wordlen = WordLength(); + int total_bits = 0; + for (int w = 0; w < wordlen; ++w) { + uint32_t word = array_[w]; + for (int i = 0; i < 4; ++i) { + total_bits += hamming_table_[word & 0xff]; + word >>= 8; + } + } + return total_bits; +} + +// Logical in-place operations on whole bit vectors. Tries to do something +// sensible if they aren't the same size, but they should be really. +void BitVector::operator|=(const BitVector &other) { + int length = std::min(WordLength(), other.WordLength()); + for (int w = 0; w < length; ++w) { + array_[w] |= other.array_[w]; + } +} +void BitVector::operator&=(const BitVector &other) { + int length = std::min(WordLength(), other.WordLength()); + for (int w = 0; w < length; ++w) { + array_[w] &= other.array_[w]; + } + for (int w = WordLength() - 1; w >= length; --w) { + array_[w] = 0; + } +} +void BitVector::operator^=(const BitVector &other) { + int length = std::min(WordLength(), other.WordLength()); + for (int w = 0; w < length; ++w) { + array_[w] ^= other.array_[w]; + } +} +// Set subtraction *this = v1 - v2. +void BitVector::SetSubtract(const BitVector &v1, const BitVector &v2) { + Alloc(v1.size()); + int length = std::min(v1.WordLength(), v2.WordLength()); + for (int w = 0; w < length; ++w) { + array_[w] = v1.array_[w] ^ (v1.array_[w] & v2.array_[w]); + } + for (int w = WordLength() - 1; w >= length; --w) { + array_[w] = v1.array_[w]; + } +} + +// Allocates memory for a vector of the given length. +// Reallocates if the array is a different size, larger or smaller. +void BitVector::Alloc(int length) { + int initial_wordlength = WordLength(); + bit_size_ = length; + int new_wordlength = WordLength(); + if (new_wordlength != initial_wordlength) { + array_.resize(new_wordlength); + } +} + +} // namespace tesseract.
