Mercurial > hgrepos > Python2 > PyMuPDF
diff mupdf-source/thirdparty/brotli/c/enc/block_splitter_inc.h @ 2:b50eed0cc0ef upstream
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
The directory name has changed: no version number in the expanded directory now.
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Mon, 15 Sep 2025 11:43:07 +0200 |
| parents | |
| children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/mupdf-source/thirdparty/brotli/c/enc/block_splitter_inc.h Mon Sep 15 11:43:07 2025 +0200 @@ -0,0 +1,487 @@ +/* NOLINT(build/header_guard) */ +/* Copyright 2013 Google Inc. All Rights Reserved. + + Distributed under MIT license. + See file LICENSE for detail or copy at https://opensource.org/licenses/MIT +*/ + +/* template parameters: FN, DataType */ + +#define HistogramType FN(Histogram) + +static void FN(InitialEntropyCodes)(const DataType* data, size_t length, + size_t stride, + size_t num_histograms, + HistogramType* histograms) { + uint32_t seed = 7; + size_t block_length = length / num_histograms; + size_t i; + FN(ClearHistograms)(histograms, num_histograms); + for (i = 0; i < num_histograms; ++i) { + size_t pos = length * i / num_histograms; + if (i != 0) { + pos += MyRand(&seed) % block_length; + } + if (pos + stride >= length) { + pos = length - stride - 1; + } + FN(HistogramAddVector)(&histograms[i], data + pos, stride); + } +} + +static void FN(RandomSample)(uint32_t* seed, + const DataType* data, + size_t length, + size_t stride, + HistogramType* sample) { + size_t pos = 0; + if (stride >= length) { + stride = length; + } else { + pos = MyRand(seed) % (length - stride + 1); + } + FN(HistogramAddVector)(sample, data + pos, stride); +} + +static void FN(RefineEntropyCodes)(const DataType* data, size_t length, + size_t stride, + size_t num_histograms, + HistogramType* histograms, + HistogramType* tmp) { + size_t iters = + kIterMulForRefining * length / stride + kMinItersForRefining; + uint32_t seed = 7; + size_t iter; + iters = ((iters + num_histograms - 1) / num_histograms) * num_histograms; + for (iter = 0; iter < iters; ++iter) { + FN(HistogramClear)(tmp); + FN(RandomSample)(&seed, data, length, stride, tmp); + FN(HistogramAddHistogram)(&histograms[iter % num_histograms], tmp); + } +} + +/* Assigns a block id from the range [0, num_histograms) to each data element + in data[0..length) and fills in block_id[0..length) with the assigned values. + Returns the number of blocks, i.e. one plus the number of block switches. */ +static size_t FN(FindBlocks)(const DataType* data, const size_t length, + const double block_switch_bitcost, + const size_t num_histograms, + const HistogramType* histograms, + double* insert_cost, + double* cost, + uint8_t* switch_signal, + uint8_t* block_id) { + const size_t alphabet_size = FN(HistogramDataSize)(); + const size_t bitmap_len = (num_histograms + 7) >> 3; + size_t num_blocks = 1; + size_t byte_ix; + size_t i; + size_t j; + BROTLI_DCHECK(num_histograms <= 256); + + /* Trivial case: single historgram -> single block type. */ + if (num_histograms <= 1) { + for (i = 0; i < length; ++i) { + block_id[i] = 0; + } + return 1; + } + + /* Fill bitcost for each symbol of all histograms. + * Non-existing symbol cost: 2 + log2(total_count). + * Regular symbol cost: -log2(symbol_count / total_count). */ + memset(insert_cost, 0, + sizeof(insert_cost[0]) * alphabet_size * num_histograms); + for (i = 0; i < num_histograms; ++i) { + insert_cost[i] = FastLog2((uint32_t)histograms[i].total_count_); + } + for (i = alphabet_size; i != 0;) { + /* Reverse order to use the 0-th row as a temporary storage. */ + --i; + for (j = 0; j < num_histograms; ++j) { + insert_cost[i * num_histograms + j] = + insert_cost[j] - BitCost(histograms[j].data_[i]); + } + } + + /* After each iteration of this loop, cost[k] will contain the difference + between the minimum cost of arriving at the current byte position using + entropy code k, and the minimum cost of arriving at the current byte + position. This difference is capped at the block switch cost, and if it + reaches block switch cost, it means that when we trace back from the last + position, we need to switch here. */ + memset(cost, 0, sizeof(cost[0]) * num_histograms); + memset(switch_signal, 0, sizeof(switch_signal[0]) * length * bitmap_len); + for (byte_ix = 0; byte_ix < length; ++byte_ix) { + size_t ix = byte_ix * bitmap_len; + size_t symbol = data[byte_ix]; + size_t insert_cost_ix = symbol * num_histograms; + double min_cost = 1e99; + double block_switch_cost = block_switch_bitcost; + static const size_t prologue_length = 2000; + static const double multiplier = 0.07 / 2000; + size_t k; + for (k = 0; k < num_histograms; ++k) { + /* We are coding the symbol with entropy code k. */ + cost[k] += insert_cost[insert_cost_ix + k]; + if (cost[k] < min_cost) { + min_cost = cost[k]; + block_id[byte_ix] = (uint8_t)k; + } + } + /* More blocks for the beginning. */ + if (byte_ix < prologue_length) { + block_switch_cost *= 0.77 + multiplier * (double)byte_ix; + } + for (k = 0; k < num_histograms; ++k) { + cost[k] -= min_cost; + if (cost[k] >= block_switch_cost) { + const uint8_t mask = (uint8_t)(1u << (k & 7)); + cost[k] = block_switch_cost; + BROTLI_DCHECK((k >> 3) < bitmap_len); + switch_signal[ix + (k >> 3)] |= mask; + } + } + } + + byte_ix = length - 1; + { /* Trace back from the last position and switch at the marked places. */ + size_t ix = byte_ix * bitmap_len; + uint8_t cur_id = block_id[byte_ix]; + while (byte_ix > 0) { + const uint8_t mask = (uint8_t)(1u << (cur_id & 7)); + BROTLI_DCHECK(((size_t)cur_id >> 3) < bitmap_len); + --byte_ix; + ix -= bitmap_len; + if (switch_signal[ix + (cur_id >> 3)] & mask) { + if (cur_id != block_id[byte_ix]) { + cur_id = block_id[byte_ix]; + ++num_blocks; + } + } + block_id[byte_ix] = cur_id; + } + } + return num_blocks; +} + +static size_t FN(RemapBlockIds)(uint8_t* block_ids, const size_t length, + uint16_t* new_id, const size_t num_histograms) { + static const uint16_t kInvalidId = 256; + uint16_t next_id = 0; + size_t i; + for (i = 0; i < num_histograms; ++i) { + new_id[i] = kInvalidId; + } + for (i = 0; i < length; ++i) { + BROTLI_DCHECK(block_ids[i] < num_histograms); + if (new_id[block_ids[i]] == kInvalidId) { + new_id[block_ids[i]] = next_id++; + } + } + for (i = 0; i < length; ++i) { + block_ids[i] = (uint8_t)new_id[block_ids[i]]; + BROTLI_DCHECK(block_ids[i] < num_histograms); + } + BROTLI_DCHECK(next_id <= num_histograms); + return next_id; +} + +static void FN(BuildBlockHistograms)(const DataType* data, const size_t length, + const uint8_t* block_ids, + const size_t num_histograms, + HistogramType* histograms) { + size_t i; + FN(ClearHistograms)(histograms, num_histograms); + for (i = 0; i < length; ++i) { + FN(HistogramAdd)(&histograms[block_ids[i]], data[i]); + } +} + +/* Given the initial partitioning build partitioning with limited number + * of histograms (and block types). */ +static void FN(ClusterBlocks)(MemoryManager* m, + const DataType* data, const size_t length, + const size_t num_blocks, + uint8_t* block_ids, + BlockSplit* split) { + uint32_t* histogram_symbols = BROTLI_ALLOC(m, uint32_t, num_blocks); + uint32_t* u32 = + BROTLI_ALLOC(m, uint32_t, num_blocks + 4 * HISTOGRAMS_PER_BATCH); + const size_t expected_num_clusters = CLUSTERS_PER_BATCH * + (num_blocks + HISTOGRAMS_PER_BATCH - 1) / HISTOGRAMS_PER_BATCH; + size_t all_histograms_size = 0; + size_t all_histograms_capacity = expected_num_clusters; + HistogramType* all_histograms = + BROTLI_ALLOC(m, HistogramType, all_histograms_capacity); + size_t cluster_size_size = 0; + size_t cluster_size_capacity = expected_num_clusters; + uint32_t* cluster_size = BROTLI_ALLOC(m, uint32_t, cluster_size_capacity); + size_t num_clusters = 0; + HistogramType* histograms = BROTLI_ALLOC(m, HistogramType, + BROTLI_MIN(size_t, num_blocks, HISTOGRAMS_PER_BATCH)); + size_t max_num_pairs = + HISTOGRAMS_PER_BATCH * HISTOGRAMS_PER_BATCH / 2; + size_t pairs_capacity = max_num_pairs + 1; + HistogramPair* pairs = BROTLI_ALLOC(m, HistogramPair, pairs_capacity); + size_t pos = 0; + uint32_t* clusters; + size_t num_final_clusters; + static const uint32_t kInvalidIndex = BROTLI_UINT32_MAX; + uint32_t* new_index; + size_t i; + uint32_t* BROTLI_RESTRICT const sizes = + u32 ? (u32 + 0 * HISTOGRAMS_PER_BATCH) : NULL; + uint32_t* BROTLI_RESTRICT const new_clusters = + u32 ? (u32 + 1 * HISTOGRAMS_PER_BATCH) : NULL; + uint32_t* BROTLI_RESTRICT const symbols = + u32 ? (u32 + 2 * HISTOGRAMS_PER_BATCH) : NULL; + uint32_t* BROTLI_RESTRICT const remap = + u32 ? (u32 + 3 * HISTOGRAMS_PER_BATCH) : NULL; + uint32_t* BROTLI_RESTRICT const block_lengths = + u32 ? (u32 + 4 * HISTOGRAMS_PER_BATCH) : NULL; + /* TODO(eustas): move to arena? */ + HistogramType* tmp = BROTLI_ALLOC(m, HistogramType, 2); + + if (BROTLI_IS_OOM(m) || BROTLI_IS_NULL(histogram_symbols) || + BROTLI_IS_NULL(u32) || BROTLI_IS_NULL(all_histograms) || + BROTLI_IS_NULL(cluster_size) || BROTLI_IS_NULL(histograms) || + BROTLI_IS_NULL(pairs) || BROTLI_IS_NULL(tmp)) { + return; + } + + memset(u32, 0, (num_blocks + 4 * HISTOGRAMS_PER_BATCH) * sizeof(uint32_t)); + + /* Calculate block lengths (convert repeating values -> series length). */ + { + size_t block_idx = 0; + for (i = 0; i < length; ++i) { + BROTLI_DCHECK(block_idx < num_blocks); + ++block_lengths[block_idx]; + if (i + 1 == length || block_ids[i] != block_ids[i + 1]) { + ++block_idx; + } + } + BROTLI_DCHECK(block_idx == num_blocks); + } + + /* Pre-cluster blocks (cluster batches). */ + for (i = 0; i < num_blocks; i += HISTOGRAMS_PER_BATCH) { + const size_t num_to_combine = + BROTLI_MIN(size_t, num_blocks - i, HISTOGRAMS_PER_BATCH); + size_t num_new_clusters; + size_t j; + for (j = 0; j < num_to_combine; ++j) { + size_t k; + size_t block_length = block_lengths[i + j]; + FN(HistogramClear)(&histograms[j]); + for (k = 0; k < block_length; ++k) { + FN(HistogramAdd)(&histograms[j], data[pos++]); + } + histograms[j].bit_cost_ = FN(BrotliPopulationCost)(&histograms[j]); + new_clusters[j] = (uint32_t)j; + symbols[j] = (uint32_t)j; + sizes[j] = 1; + } + num_new_clusters = FN(BrotliHistogramCombine)( + histograms, tmp, sizes, symbols, new_clusters, pairs, num_to_combine, + num_to_combine, HISTOGRAMS_PER_BATCH, max_num_pairs); + BROTLI_ENSURE_CAPACITY(m, HistogramType, all_histograms, + all_histograms_capacity, all_histograms_size + num_new_clusters); + BROTLI_ENSURE_CAPACITY(m, uint32_t, cluster_size, + cluster_size_capacity, cluster_size_size + num_new_clusters); + if (BROTLI_IS_OOM(m)) return; + for (j = 0; j < num_new_clusters; ++j) { + all_histograms[all_histograms_size++] = histograms[new_clusters[j]]; + cluster_size[cluster_size_size++] = sizes[new_clusters[j]]; + remap[new_clusters[j]] = (uint32_t)j; + } + for (j = 0; j < num_to_combine; ++j) { + histogram_symbols[i + j] = (uint32_t)num_clusters + remap[symbols[j]]; + } + num_clusters += num_new_clusters; + BROTLI_DCHECK(num_clusters == cluster_size_size); + BROTLI_DCHECK(num_clusters == all_histograms_size); + } + BROTLI_FREE(m, histograms); + + /* Final clustering. */ + max_num_pairs = + BROTLI_MIN(size_t, 64 * num_clusters, (num_clusters / 2) * num_clusters); + if (pairs_capacity < max_num_pairs + 1) { + BROTLI_FREE(m, pairs); + pairs = BROTLI_ALLOC(m, HistogramPair, max_num_pairs + 1); + if (BROTLI_IS_OOM(m) || BROTLI_IS_NULL(pairs)) return; + } + clusters = BROTLI_ALLOC(m, uint32_t, num_clusters); + if (BROTLI_IS_OOM(m) || BROTLI_IS_NULL(clusters)) return; + for (i = 0; i < num_clusters; ++i) { + clusters[i] = (uint32_t)i; + } + num_final_clusters = FN(BrotliHistogramCombine)( + all_histograms, tmp, cluster_size, histogram_symbols, clusters, pairs, + num_clusters, num_blocks, BROTLI_MAX_NUMBER_OF_BLOCK_TYPES, + max_num_pairs); + BROTLI_FREE(m, pairs); + BROTLI_FREE(m, cluster_size); + + /* Assign blocks to final histograms. */ + new_index = BROTLI_ALLOC(m, uint32_t, num_clusters); + if (BROTLI_IS_OOM(m) || BROTLI_IS_NULL(new_index)) return; + for (i = 0; i < num_clusters; ++i) new_index[i] = kInvalidIndex; + pos = 0; + { + uint32_t next_index = 0; + for (i = 0; i < num_blocks; ++i) { + size_t j; + uint32_t best_out; + double best_bits; + FN(HistogramClear)(tmp); + for (j = 0; j < block_lengths[i]; ++j) { + FN(HistogramAdd)(tmp, data[pos++]); + } + /* Among equally good histograms prefer last used. */ + /* TODO(eustas): should we give a block-switch discount here? */ + best_out = (i == 0) ? histogram_symbols[0] : histogram_symbols[i - 1]; + best_bits = FN(BrotliHistogramBitCostDistance)( + tmp, &all_histograms[best_out], tmp + 1); + for (j = 0; j < num_final_clusters; ++j) { + const double cur_bits = FN(BrotliHistogramBitCostDistance)( + tmp, &all_histograms[clusters[j]], tmp + 1); + if (cur_bits < best_bits) { + best_bits = cur_bits; + best_out = clusters[j]; + } + } + histogram_symbols[i] = best_out; + if (new_index[best_out] == kInvalidIndex) { + new_index[best_out] = next_index++; + } + } + } + BROTLI_FREE(m, tmp); + BROTLI_FREE(m, clusters); + BROTLI_FREE(m, all_histograms); + BROTLI_ENSURE_CAPACITY( + m, uint8_t, split->types, split->types_alloc_size, num_blocks); + BROTLI_ENSURE_CAPACITY( + m, uint32_t, split->lengths, split->lengths_alloc_size, num_blocks); + if (BROTLI_IS_OOM(m)) return; + + /* Rewrite final assignment to block-split. There might be less blocks + * than |num_blocks| due to clustering. */ + { + uint32_t cur_length = 0; + size_t block_idx = 0; + uint8_t max_type = 0; + for (i = 0; i < num_blocks; ++i) { + cur_length += block_lengths[i]; + if (i + 1 == num_blocks || + histogram_symbols[i] != histogram_symbols[i + 1]) { + const uint8_t id = (uint8_t)new_index[histogram_symbols[i]]; + split->types[block_idx] = id; + split->lengths[block_idx] = cur_length; + max_type = BROTLI_MAX(uint8_t, max_type, id); + cur_length = 0; + ++block_idx; + } + } + split->num_blocks = block_idx; + split->num_types = (size_t)max_type + 1; + } + BROTLI_FREE(m, new_index); + BROTLI_FREE(m, u32); + BROTLI_FREE(m, histogram_symbols); +} + +/* Create BlockSplit (partitioning) given the limits, estimates and "effort" + * parameters. + * + * NB: max_histograms is often less than number of histograms allowed by format; + * this is done intentionally, to save some "space" for context-aware + * clustering (here entropy is estimated for context-free symbols). */ +static void FN(SplitByteVector)(MemoryManager* m, + const DataType* data, const size_t length, + const size_t symbols_per_histogram, + const size_t max_histograms, + const size_t sampling_stride_length, + const double block_switch_cost, + const BrotliEncoderParams* params, + BlockSplit* split) { + const size_t data_size = FN(HistogramDataSize)(); + HistogramType* histograms; + HistogramType* tmp; + /* Calculate number of histograms; initial estimate is one histogram per + * specified amount of symbols; however, this value is capped. */ + size_t num_histograms = length / symbols_per_histogram + 1; + if (num_histograms > max_histograms) { + num_histograms = max_histograms; + } + + /* Corner case: no input. */ + if (length == 0) { + split->num_types = 1; + return; + } + + if (length < kMinLengthForBlockSplitting) { + BROTLI_ENSURE_CAPACITY(m, uint8_t, + split->types, split->types_alloc_size, split->num_blocks + 1); + BROTLI_ENSURE_CAPACITY(m, uint32_t, + split->lengths, split->lengths_alloc_size, split->num_blocks + 1); + if (BROTLI_IS_OOM(m)) return; + split->num_types = 1; + split->types[split->num_blocks] = 0; + split->lengths[split->num_blocks] = (uint32_t)length; + split->num_blocks++; + return; + } + histograms = BROTLI_ALLOC(m, HistogramType, num_histograms + 1); + tmp = histograms + num_histograms; + if (BROTLI_IS_OOM(m) || BROTLI_IS_NULL(histograms)) return; + /* Find good entropy codes. */ + FN(InitialEntropyCodes)(data, length, + sampling_stride_length, + num_histograms, histograms); + FN(RefineEntropyCodes)(data, length, + sampling_stride_length, + num_histograms, histograms, tmp); + { + /* Find a good path through literals with the good entropy codes. */ + uint8_t* block_ids = BROTLI_ALLOC(m, uint8_t, length); + size_t num_blocks = 0; + const size_t bitmaplen = (num_histograms + 7) >> 3; + double* insert_cost = BROTLI_ALLOC(m, double, data_size * num_histograms); + double* cost = BROTLI_ALLOC(m, double, num_histograms); + uint8_t* switch_signal = BROTLI_ALLOC(m, uint8_t, length * bitmaplen); + uint16_t* new_id = BROTLI_ALLOC(m, uint16_t, num_histograms); + const size_t iters = params->quality < HQ_ZOPFLIFICATION_QUALITY ? 3 : 10; + size_t i; + if (BROTLI_IS_OOM(m) || BROTLI_IS_NULL(block_ids) || + BROTLI_IS_NULL(insert_cost) || BROTLI_IS_NULL(cost) || + BROTLI_IS_NULL(switch_signal) || BROTLI_IS_NULL(new_id)) { + return; + } + for (i = 0; i < iters; ++i) { + num_blocks = FN(FindBlocks)(data, length, + block_switch_cost, + num_histograms, histograms, + insert_cost, cost, switch_signal, + block_ids); + num_histograms = FN(RemapBlockIds)(block_ids, length, + new_id, num_histograms); + FN(BuildBlockHistograms)(data, length, block_ids, + num_histograms, histograms); + } + BROTLI_FREE(m, insert_cost); + BROTLI_FREE(m, cost); + BROTLI_FREE(m, switch_signal); + BROTLI_FREE(m, new_id); + BROTLI_FREE(m, histograms); + FN(ClusterBlocks)(m, data, length, num_blocks, block_ids, split); + if (BROTLI_IS_OOM(m)) return; + BROTLI_FREE(m, block_ids); + } +} + +#undef HistogramType
