Mercurial > hgrepos > Python2 > PyMuPDF
comparison mupdf-source/thirdparty/tesseract/src/ccutil/bitvector.cpp @ 2:b50eed0cc0ef upstream
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
The directory name has changed: no version number in the expanded directory now.
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Mon, 15 Sep 2025 11:43:07 +0200 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| 1:1d09e1dec1d9 | 2:b50eed0cc0ef |
|---|---|
| 1 // Copyright 2011 Google Inc. All Rights Reserved. | |
| 2 // Author: rays@google.com (Ray Smith) | |
| 3 /////////////////////////////////////////////////////////////////////// | |
| 4 // File: bitvector.cpp | |
| 5 // Description: Class replacement for BITVECTOR. | |
| 6 // Author: Ray Smith | |
| 7 // | |
| 8 // (C) Copyright 2011, Google Inc. | |
| 9 // Licensed under the Apache License, Version 2.0 (the "License"); | |
| 10 // you may not use this file except in compliance with the License. | |
| 11 // You may obtain a copy of the License at | |
| 12 // http://www.apache.org/licenses/LICENSE-2.0 | |
| 13 // Unless required by applicable law or agreed to in writing, software | |
| 14 // distributed under the License is distributed on an "AS IS" BASIS, | |
| 15 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
| 16 // See the License for the specific language governing permissions and | |
| 17 // limitations under the License. | |
| 18 // | |
| 19 /////////////////////////////////////////////////////////////////////// | |
| 20 | |
| 21 #include "bitvector.h" | |
| 22 #include <algorithm> | |
| 23 #include <cstring> | |
| 24 #include "helpers.h" | |
| 25 #include "serialis.h" // for tesseract::Serialize | |
| 26 | |
| 27 namespace tesseract { | |
| 28 | |
| 29 // Fast lookup table to get the first least significant set bit in a byte. | |
| 30 // For zero, the table has 255, but since it is a special case, most code | |
| 31 // that uses this table will check for zero before looking up lsb_index_. | |
| 32 const uint8_t BitVector::lsb_index_[256] = { | |
| 33 255, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, | |
| 34 0, 1, 0, 5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, | |
| 35 1, 0, 2, 0, 1, 0, 6, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, | |
| 36 0, 3, 0, 1, 0, 2, 0, 1, 0, 5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, | |
| 37 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 7, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, | |
| 38 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, | |
| 39 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 6, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, | |
| 40 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 5, 0, 1, 0, 2, 0, 1, 0, | |
| 41 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0}; | |
| 42 | |
| 43 // Fast lookup table to get the residual bits after zeroing the first (lowest) | |
| 44 // set bit in a byte. | |
| 45 const uint8_t BitVector::lsb_eroded_[256] = { | |
| 46 0, 0, 0, 0x2, 0, 0x4, 0x4, 0x6, 0, 0x8, 0x8, 0x0a, 0x08, 0x0c, 0x0c, 0x0e, | |
| 47 0, 0x10, 0x10, 0x12, 0x10, 0x14, 0x14, 0x16, 0x10, 0x18, 0x18, 0x1a, 0x18, 0x1c, 0x1c, 0x1e, | |
| 48 0, 0x20, 0x20, 0x22, 0x20, 0x24, 0x24, 0x26, 0x20, 0x28, 0x28, 0x2a, 0x28, 0x2c, 0x2c, 0x2e, | |
| 49 0x20, 0x30, 0x30, 0x32, 0x30, 0x34, 0x34, 0x36, 0x30, 0x38, 0x38, 0x3a, 0x38, 0x3c, 0x3c, 0x3e, | |
| 50 0, 0x40, 0x40, 0x42, 0x40, 0x44, 0x44, 0x46, 0x40, 0x48, 0x48, 0x4a, 0x48, 0x4c, 0x4c, 0x4e, | |
| 51 0x40, 0x50, 0x50, 0x52, 0x50, 0x54, 0x54, 0x56, 0x50, 0x58, 0x58, 0x5a, 0x58, 0x5c, 0x5c, 0x5e, | |
| 52 0x40, 0x60, 0x60, 0x62, 0x60, 0x64, 0x64, 0x66, 0x60, 0x68, 0x68, 0x6a, 0x68, 0x6c, 0x6c, 0x6e, | |
| 53 0x60, 0x70, 0x70, 0x72, 0x70, 0x74, 0x74, 0x76, 0x70, 0x78, 0x78, 0x7a, 0x78, 0x7c, 0x7c, 0x7e, | |
| 54 0, 0x80, 0x80, 0x82, 0x80, 0x84, 0x84, 0x86, 0x80, 0x88, 0x88, 0x8a, 0x88, 0x8c, 0x8c, 0x8e, | |
| 55 0x80, 0x90, 0x90, 0x92, 0x90, 0x94, 0x94, 0x96, 0x90, 0x98, 0x98, 0x9a, 0x98, 0x9c, 0x9c, 0x9e, | |
| 56 0x80, 0xa0, 0xa0, 0xa2, 0xa0, 0xa4, 0xa4, 0xa6, 0xa0, 0xa8, 0xa8, 0xaa, 0xa8, 0xac, 0xac, 0xae, | |
| 57 0xa0, 0xb0, 0xb0, 0xb2, 0xb0, 0xb4, 0xb4, 0xb6, 0xb0, 0xb8, 0xb8, 0xba, 0xb8, 0xbc, 0xbc, 0xbe, | |
| 58 0x80, 0xc0, 0xc0, 0xc2, 0xc0, 0xc4, 0xc4, 0xc6, 0xc0, 0xc8, 0xc8, 0xca, 0xc8, 0xcc, 0xcc, 0xce, | |
| 59 0xc0, 0xd0, 0xd0, 0xd2, 0xd0, 0xd4, 0xd4, 0xd6, 0xd0, 0xd8, 0xd8, 0xda, 0xd8, 0xdc, 0xdc, 0xde, | |
| 60 0xc0, 0xe0, 0xe0, 0xe2, 0xe0, 0xe4, 0xe4, 0xe6, 0xe0, 0xe8, 0xe8, 0xea, 0xe8, 0xec, 0xec, 0xee, | |
| 61 0xe0, 0xf0, 0xf0, 0xf2, 0xf0, 0xf4, 0xf4, 0xf6, 0xf0, 0xf8, 0xf8, 0xfa, 0xf8, 0xfc, 0xfc, 0xfe}; | |
| 62 | |
| 63 // Fast lookup table to give the number of set bits in a byte. | |
| 64 const int BitVector::hamming_table_[256] = { | |
| 65 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, | |
| 66 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, | |
| 67 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, | |
| 68 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, | |
| 69 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, | |
| 70 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, | |
| 71 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, | |
| 72 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8}; | |
| 73 | |
| 74 BitVector &BitVector::operator=(const BitVector &src) { | |
| 75 array_ = src.array_; | |
| 76 bit_size_ = src.bit_size_; | |
| 77 return *this; | |
| 78 } | |
| 79 | |
| 80 // Initializes the array to length * false. | |
| 81 void BitVector::Init(int length) { | |
| 82 Alloc(length); | |
| 83 SetAllFalse(); | |
| 84 } | |
| 85 | |
| 86 // Writes to the given file. Returns false in case of error. | |
| 87 bool BitVector::Serialize(FILE *fp) const { | |
| 88 if (!tesseract::Serialize(fp, &bit_size_)) { | |
| 89 return false; | |
| 90 } | |
| 91 int wordlen = WordLength(); | |
| 92 return tesseract::Serialize(fp, &array_[0], wordlen); | |
| 93 } | |
| 94 | |
| 95 // Reads from the given file. Returns false in case of error. | |
| 96 // If swap is true, assumes a big/little-endian swap is needed. | |
| 97 bool BitVector::DeSerialize(bool swap, FILE *fp) { | |
| 98 uint32_t new_bit_size; | |
| 99 if (!tesseract::DeSerialize(fp, &new_bit_size)) { | |
| 100 return false; | |
| 101 } | |
| 102 if (swap) { | |
| 103 ReverseN(&new_bit_size, sizeof(new_bit_size)); | |
| 104 } | |
| 105 Alloc(new_bit_size); | |
| 106 int wordlen = WordLength(); | |
| 107 if (!tesseract::DeSerialize(fp, &array_[0], wordlen)) { | |
| 108 return false; | |
| 109 } | |
| 110 if (swap) { | |
| 111 for (int i = 0; i < wordlen; ++i) { | |
| 112 ReverseN(&array_[i], sizeof(array_[i])); | |
| 113 } | |
| 114 } | |
| 115 return true; | |
| 116 } | |
| 117 | |
| 118 void BitVector::SetAllFalse() { | |
| 119 memset(&array_[0], 0, ByteLength()); | |
| 120 } | |
| 121 void BitVector::SetAllTrue() { | |
| 122 memset(&array_[0], ~0, ByteLength()); | |
| 123 } | |
| 124 | |
| 125 // Returns the index of the next set bit after the given index. | |
| 126 // Useful for quickly iterating through the set bits in a sparse vector. | |
| 127 int BitVector::NextSetBit(int prev_bit) const { | |
| 128 // Move on to the next bit. | |
| 129 int next_bit = prev_bit + 1; | |
| 130 if (next_bit >= bit_size_) { | |
| 131 return -1; | |
| 132 } | |
| 133 // Check the remains of the word containing the next_bit first. | |
| 134 int next_word = WordIndex(next_bit); | |
| 135 int bit_index = next_word * kBitFactor; | |
| 136 int word_end = bit_index + kBitFactor; | |
| 137 uint32_t word = array_[next_word]; | |
| 138 uint8_t byte = word & 0xff; | |
| 139 while (bit_index < word_end) { | |
| 140 if (bit_index + 8 > next_bit && byte != 0) { | |
| 141 while (bit_index + lsb_index_[byte] < next_bit && byte != 0) { | |
| 142 byte = lsb_eroded_[byte]; | |
| 143 } | |
| 144 if (byte != 0) { | |
| 145 return bit_index + lsb_index_[byte]; | |
| 146 } | |
| 147 } | |
| 148 word >>= 8; | |
| 149 bit_index += 8; | |
| 150 byte = word & 0xff; | |
| 151 } | |
| 152 // next_word didn't contain a 1, so find the next word with set bit. | |
| 153 ++next_word; | |
| 154 int wordlen = WordLength(); | |
| 155 while (next_word < wordlen && (word = array_[next_word]) == 0) { | |
| 156 ++next_word; | |
| 157 bit_index += kBitFactor; | |
| 158 } | |
| 159 if (bit_index >= bit_size_) { | |
| 160 return -1; | |
| 161 } | |
| 162 // Find the first non-zero byte within the word. | |
| 163 while ((word & 0xff) == 0) { | |
| 164 word >>= 8; | |
| 165 bit_index += 8; | |
| 166 } | |
| 167 return bit_index + lsb_index_[word & 0xff]; | |
| 168 } | |
| 169 | |
| 170 // Returns the number of set bits in the vector. | |
| 171 int BitVector::NumSetBits() const { | |
| 172 int wordlen = WordLength(); | |
| 173 int total_bits = 0; | |
| 174 for (int w = 0; w < wordlen; ++w) { | |
| 175 uint32_t word = array_[w]; | |
| 176 for (int i = 0; i < 4; ++i) { | |
| 177 total_bits += hamming_table_[word & 0xff]; | |
| 178 word >>= 8; | |
| 179 } | |
| 180 } | |
| 181 return total_bits; | |
| 182 } | |
| 183 | |
| 184 // Logical in-place operations on whole bit vectors. Tries to do something | |
| 185 // sensible if they aren't the same size, but they should be really. | |
| 186 void BitVector::operator|=(const BitVector &other) { | |
| 187 int length = std::min(WordLength(), other.WordLength()); | |
| 188 for (int w = 0; w < length; ++w) { | |
| 189 array_[w] |= other.array_[w]; | |
| 190 } | |
| 191 } | |
| 192 void BitVector::operator&=(const BitVector &other) { | |
| 193 int length = std::min(WordLength(), other.WordLength()); | |
| 194 for (int w = 0; w < length; ++w) { | |
| 195 array_[w] &= other.array_[w]; | |
| 196 } | |
| 197 for (int w = WordLength() - 1; w >= length; --w) { | |
| 198 array_[w] = 0; | |
| 199 } | |
| 200 } | |
| 201 void BitVector::operator^=(const BitVector &other) { | |
| 202 int length = std::min(WordLength(), other.WordLength()); | |
| 203 for (int w = 0; w < length; ++w) { | |
| 204 array_[w] ^= other.array_[w]; | |
| 205 } | |
| 206 } | |
| 207 // Set subtraction *this = v1 - v2. | |
| 208 void BitVector::SetSubtract(const BitVector &v1, const BitVector &v2) { | |
| 209 Alloc(v1.size()); | |
| 210 int length = std::min(v1.WordLength(), v2.WordLength()); | |
| 211 for (int w = 0; w < length; ++w) { | |
| 212 array_[w] = v1.array_[w] ^ (v1.array_[w] & v2.array_[w]); | |
| 213 } | |
| 214 for (int w = WordLength() - 1; w >= length; --w) { | |
| 215 array_[w] = v1.array_[w]; | |
| 216 } | |
| 217 } | |
| 218 | |
| 219 // Allocates memory for a vector of the given length. | |
| 220 // Reallocates if the array is a different size, larger or smaller. | |
| 221 void BitVector::Alloc(int length) { | |
| 222 int initial_wordlength = WordLength(); | |
| 223 bit_size_ = length; | |
| 224 int new_wordlength = WordLength(); | |
| 225 if (new_wordlength != initial_wordlength) { | |
| 226 array_.resize(new_wordlength); | |
| 227 } | |
| 228 } | |
| 229 | |
| 230 } // namespace tesseract. |
