Mercurial > hgrepos > Python2 > PyMuPDF
diff mupdf-source/thirdparty/tesseract/src/lstm/networkio.h @ 2:b50eed0cc0ef upstream
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
The directory name has changed: no version number in the expanded directory now.
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Mon, 15 Sep 2025 11:43:07 +0200 |
| parents | |
| children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/mupdf-source/thirdparty/tesseract/src/lstm/networkio.h Mon Sep 15 11:43:07 2025 +0200 @@ -0,0 +1,343 @@ +/////////////////////////////////////////////////////////////////////// +// File: networkio.h +// Description: Network input/output data, allowing float/int implementations. +// Author: Ray Smith +// +// (C) Copyright 2014, Google Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +/////////////////////////////////////////////////////////////////////// + +#ifndef TESSERACT_LSTM_NETWORKIO_H_ +#define TESSERACT_LSTM_NETWORKIO_H_ + +#include "helpers.h" +#include "image.h" +#include "static_shape.h" +#include "stridemap.h" +#include "weightmatrix.h" + +#include <cmath> +#include <cstdio> +#include <vector> + +struct Pix; + +namespace tesseract { + +// Class to contain all the input/output of a network, allowing for fixed or +// variable-strided 2d to 1d mapping, and float or int8_t values. Provides +// enough calculating functions to hide the detail of the implementation. +class TESS_API NetworkIO { +public: + NetworkIO() : int_mode_(false) {} + // Resizes the array (and stride), avoiding realloc if possible, to the given + // size from various size specs: + // Same stride size, but given number of features. + void Resize(const NetworkIO &src, int num_features) { + ResizeToMap(src.int_mode(), src.stride_map(), num_features); + } + // Resizes to a specific size as a 2-d temp buffer. No batches, no y-dim. + void Resize2d(bool int_mode, int width, int num_features); + // Resizes forcing a float representation with the stridemap of src and the + // given number of features. + void ResizeFloat(const NetworkIO &src, int num_features) { + ResizeToMap(false, src.stride_map(), num_features); + } + // Resizes to a specific stride_map. + void ResizeToMap(bool int_mode, const StrideMap &stride_map, int num_features); + // Shrinks image size by x_scale,y_scale, and use given number of features. + void ResizeScaled(const NetworkIO &src, int x_scale, int y_scale, int num_features); + // Resizes to just 1 x-coord, whatever the input. + void ResizeXTo1(const NetworkIO &src, int num_features); + // Initialize all the array to zero. + void Zero(); + // Initializes to zero all elements of the array that do not correspond to + // valid image positions. (If a batch of different-sized images are packed + // together, then there will be padding pixels.) + void ZeroInvalidElements(); + // Sets up the array from the given image, using the currently set int_mode_. + // If the image width doesn't match the shape, the image is truncated or + // padded with noise to match. + void FromPix(const StaticShape &shape, const Image pix, TRand *randomizer); + // Sets up the array from the given set of images, using the currently set + // int_mode_. If the image width doesn't match the shape, the images are + // truncated or padded with noise to match. + void FromPixes(const StaticShape &shape, const std::vector<Image> &pixes, + TRand *randomizer); + // Copies the given pix to *this at the given batch index, stretching and + // clipping the pixel values so that [black, black + 2*contrast] maps to the + // dynamic range of *this, ie [-1,1] for a float and (-127,127) for int. + // This is a 2-d operation in the sense that the output depth is the number + // of input channels, the height is the height of the image, and the width + // is the width of the image, or truncated/padded with noise if the width + // is a fixed size. + void Copy2DImage(int batch, Image pix, float black, float contrast, TRand *randomizer); + // Copies the given pix to *this at the given batch index, as Copy2DImage + // above, except that the output depth is the height of the input image, the + // output height is 1, and the output width as for Copy2DImage. + // The image is thus treated as a 1-d set of vertical pixel strips. + void Copy1DGreyImage(int batch, Image pix, float black, float contrast, TRand *randomizer); + // Helper stores the pixel value in i_ or f_ according to int_mode_. + // t: is the index from the StrideMap corresponding to the current + // [batch,y,x] position + // f: is the index into the depth/channel + // pixel: the value of the pixel from the image (in one channel) + // black: the pixel value to map to the lowest of the range of *this + // contrast: the range of pixel values to stretch to half the range of *this. + void SetPixel(int t, int f, int pixel, float black, float contrast); + // Converts the array to a Pix. Must be pixDestroyed after use. + Image ToPix() const; + // Prints the first and last num timesteps of the array for each feature. + void Print(int num) const; + + // Returns the timestep width. + int Width() const { + return int_mode_ ? i_.dim1() : f_.dim1(); + } + // Returns the number of features. + int NumFeatures() const { + return int_mode_ ? i_.dim2() : f_.dim2(); + } + // Accessor to a timestep of the float matrix. + float *f(int t) { + ASSERT_HOST(!int_mode_); + return f_[t]; + } + const float *f(int t) const { + ASSERT_HOST(!int_mode_); + return f_[t]; + } + const int8_t *i(int t) const { + ASSERT_HOST(int_mode_); + return i_[t]; + } + bool int_mode() const { + return int_mode_; + } + void set_int_mode(bool is_quantized) { + int_mode_ = is_quantized; + } + const StrideMap &stride_map() const { + return stride_map_; + } + void set_stride_map(const StrideMap &map) { + stride_map_ = map; + } + const GENERIC_2D_ARRAY<float> &float_array() const { + return f_; + } + GENERIC_2D_ARRAY<float> *mutable_float_array() { + return &f_; + } + + // Copies a single time step from src. + void CopyTimeStepFrom(int dest_t, const NetworkIO &src, int src_t); + // Copies a part of single time step from src. + void CopyTimeStepGeneral(int dest_t, int dest_offset, int num_features, const NetworkIO &src, + int src_t, int src_offset); + // Zeroes a single time step. + void ZeroTimeStep(int t) { + if (int_mode_) { + memset(i_[t], 0, sizeof(*i_[t]) * NumFeatures()); + } else { + memset(f_[t], 0, sizeof(*f_[t]) * NumFeatures()); + } + } + // Sets the given range to random values. + void Randomize(int t, int offset, int num_features, TRand *randomizer); + + // Helper returns the label and score of the best choice over a range. + int BestChoiceOverRange(int t_start, int t_end, int not_this, int null_ch, float *rating, + float *certainty) const; + // Helper returns the rating and certainty of the choice over a range in t. + void ScoresOverRange(int t_start, int t_end, int choice, int null_ch, float *rating, + float *certainty) const; + // Returns the index (label) of the best value at the given timestep, + // and if not null, sets the score to the log of the corresponding value. + int BestLabel(int t, float *score) const { + return BestLabel(t, -1, -1, score); + } + // Returns the index (label) of the best value at the given timestep, + // excluding not_this and not_that, and if not null, sets the score to the + // log of the corresponding value. + int BestLabel(int t, int not_this, int not_that, float *score) const; + // Returns the best start position out of range (into which both start and end + // must fit) to obtain the highest cumulative score for the given labels. + int PositionOfBestMatch(const std::vector<int> &labels, int start, int end) const; + // Returns the cumulative score of the given labels starting at start, and + // using one label per time-step. + TFloat ScoreOfLabels(const std::vector<int> &labels, int start) const; + // Helper function sets all the outputs for a single timestep, such that + // label has value ok_score, and the other labels share 1 - ok_score. + // Assumes float mode. + void SetActivations(int t, int label, float ok_score); + // Modifies the values, only if needed, so that the given label is + // the winner at the given time step t. + // Assumes float mode. + void EnsureBestLabel(int t, int label); + // Helper function converts prob to certainty taking the minimum into account. + static float ProbToCertainty(float prob); + // Returns true if there is any bad value that is suspiciously like a GT + // error. Assuming that *this is the difference(gradient) between target + // and forward output, returns true if there is a large negative value + // (correcting a very confident output) for which there is no corresponding + // positive value in an adjacent timestep for the same feature index. This + // allows the box-truthed samples to make fine adjustments to position while + // stopping other disagreements of confident output with ground truth. + bool AnySuspiciousTruth(float confidence_thr) const; + + // Reads a single timestep to floats in the range [-1, 1]. + void ReadTimeStep(int t, TFloat *output) const; + // Adds a single timestep to floats. + void AddTimeStep(int t, TFloat *inout) const; + // Adds part of a single timestep to floats. + void AddTimeStepPart(int t, int offset, int num_features, float *inout) const; + // Writes a single timestep from floats in the range [-1, 1]. + void WriteTimeStep(int t, const TFloat *input); + // Writes a single timestep from floats in the range [-1, 1] writing only + // num_features elements of input to (*this)[t], starting at offset. + void WriteTimeStepPart(int t, int offset, int num_features, const TFloat *input); + // Maxpools a single time step from src. + void MaxpoolTimeStep(int dest_t, const NetworkIO &src, int src_t, int *max_line); + // Runs maxpool backward, using maxes to index timesteps in *this. + void MaxpoolBackward(const NetworkIO &fwd, const GENERIC_2D_ARRAY<int> &maxes); + // Returns the min over time of the maxes over features of the outputs. + float MinOfMaxes() const; + // Returns the min over time. + float Max() const { + return int_mode_ ? i_.Max() : f_.Max(); + } + // Computes combined results for a combiner that chooses between an existing + // input and itself, with an additional output to indicate the choice. + void CombineOutputs(const NetworkIO &base_output, const NetworkIO &combiner_output); + // Computes deltas for a combiner that chooses between 2 sets of inputs. + void ComputeCombinerDeltas(const NetworkIO &fwd_deltas, const NetworkIO &base_output); + + // Copies the array checking that the types match. + void CopyAll(const NetworkIO &src); + // Adds the array to a float array, with scaling to [-1, 1] if the src is int. + void AddAllToFloat(const NetworkIO &src); + // Subtracts the array from a float array. src must also be float. + void SubtractAllFromFloat(const NetworkIO &src); + + // Copies src to *this, with maxabs normalization to match scale. + void CopyWithNormalization(const NetworkIO &src, const NetworkIO &scale); + // Multiplies the float data by the given factor. + void ScaleFloatBy(float factor) { + f_ *= factor; + } + // Copies src to *this with independent reversal of the y dimension. + void CopyWithYReversal(const NetworkIO &src); + // Copies src to *this with independent reversal of the x dimension. + void CopyWithXReversal(const NetworkIO &src); + // Copies src to *this with independent transpose of the x and y dimensions. + void CopyWithXYTranspose(const NetworkIO &src); + // Copies src to *this, at the given feature_offset, returning the total + // feature offset after the copy. Multiple calls will stack outputs from + // multiple sources in feature space. + int CopyPacking(const NetworkIO &src, int feature_offset); + // Opposite of CopyPacking, fills *this with a part of src, starting at + // feature_offset, and picking num_features. Resizes *this to match. + void CopyUnpacking(const NetworkIO &src, int feature_offset, int num_features); + // Transposes the float part of *this into dest. + void Transpose(TransposedArray *dest) const; + + // Clips the content of a single time-step to +/-range. + void ClipVector(int t, float range); + + // Applies Func to timestep t of *this (u) and multiplies the result by v + // component-wise, putting the product in *product. + // *this and v may be int or float, but must match. The outputs are TFloat. + template <class Func> + void FuncMultiply(const NetworkIO &v_io, int t, TFloat *product) { + Func f; + ASSERT_HOST(!int_mode_); + ASSERT_HOST(!v_io.int_mode_); + int dim = f_.dim2(); + if (int_mode_) { + const int8_t *u = i_[t]; + const int8_t *v = v_io.i_[t]; + for (int i = 0; i < dim; ++i) { + product[i] = f(u[i] / static_cast<TFloat>(INT8_MAX)) * v[i] / INT8_MAX; + } + } else { + const float *u = f_[t]; + const float *v = v_io.f_[t]; + for (int i = 0; i < dim; ++i) { + product[i] = f(u[i]) * v[i]; + } + } + } + // Applies Func to *this (u) at u_t, and multiplies the result by v[v_t] * w, + // component-wise, putting the product in *product. + // All NetworkIOs are assumed to be float. + template <class Func> + void FuncMultiply3(int u_t, const NetworkIO &v_io, int v_t, const TFloat *w, + TFloat *product) const { + ASSERT_HOST(!int_mode_); + ASSERT_HOST(!v_io.int_mode_); + Func f; + const float *u = f_[u_t]; + const float *v = v_io.f_[v_t]; + int dim = f_.dim2(); + for (int i = 0; i < dim; ++i) { + product[i] = f(u[i]) * v[i] * w[i]; + } + } + // Applies Func to *this (u) at u_t, and multiplies the result by v[v_t] * w, + // component-wise, adding the product to *product. + // All NetworkIOs are assumed to be float. + template <class Func> + void FuncMultiply3Add(const NetworkIO &v_io, int t, const TFloat *w, TFloat *product) const { + ASSERT_HOST(!int_mode_); + ASSERT_HOST(!v_io.int_mode_); + Func f; + const float *u = f_[t]; + const float *v = v_io.f_[t]; + int dim = f_.dim2(); + for (int i = 0; i < dim; ++i) { + product[i] += f(u[i]) * v[i] * w[i]; + } + } + // Applies Func1 to *this (u), Func2 to v, and multiplies the result by w, + // component-wise, putting the product in product, all at timestep t, except + // w, which is a simple array. All NetworkIOs are assumed to be float. + template <class Func1, class Func2> + void Func2Multiply3(const NetworkIO &v_io, int t, const TFloat *w, TFloat *product) const { + ASSERT_HOST(!int_mode_); + ASSERT_HOST(!v_io.int_mode_); + Func1 f; + Func2 g; + const float *u = f_[t]; + const float *v = v_io.f_[t]; + int dim = f_.dim2(); + for (int i = 0; i < dim; ++i) { + product[i] = f(u[i]) * g(v[i]) * w[i]; + } + } + +private: + // Returns the padding required for the given number of features in order + // for the SIMD operations to be safe. + static int GetPadding(int num_features); + + // Choice of float vs 8 bit int for data. + GENERIC_2D_ARRAY<float> f_; + GENERIC_2D_ARRAY<int8_t> i_; + // Which of f_ and i_ are we actually using. + bool int_mode_; + // Stride for 2d input data. + StrideMap stride_map_; +}; + +} // namespace tesseract. + +#endif // TESSERACT_LSTM_NETWORKIO_H_
