Mercurial > hgrepos > Python2 > PyMuPDF
comparison mupdf-source/thirdparty/tesseract/src/lstm/networkio.h @ 2:b50eed0cc0ef upstream
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
The directory name has changed: no version number in the expanded directory now.
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Mon, 15 Sep 2025 11:43:07 +0200 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| 1:1d09e1dec1d9 | 2:b50eed0cc0ef |
|---|---|
| 1 /////////////////////////////////////////////////////////////////////// | |
| 2 // File: networkio.h | |
| 3 // Description: Network input/output data, allowing float/int implementations. | |
| 4 // Author: Ray Smith | |
| 5 // | |
| 6 // (C) Copyright 2014, Google Inc. | |
| 7 // Licensed under the Apache License, Version 2.0 (the "License"); | |
| 8 // you may not use this file except in compliance with the License. | |
| 9 // You may obtain a copy of the License at | |
| 10 // http://www.apache.org/licenses/LICENSE-2.0 | |
| 11 // Unless required by applicable law or agreed to in writing, software | |
| 12 // distributed under the License is distributed on an "AS IS" BASIS, | |
| 13 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
| 14 // See the License for the specific language governing permissions and | |
| 15 // limitations under the License. | |
| 16 /////////////////////////////////////////////////////////////////////// | |
| 17 | |
| 18 #ifndef TESSERACT_LSTM_NETWORKIO_H_ | |
| 19 #define TESSERACT_LSTM_NETWORKIO_H_ | |
| 20 | |
| 21 #include "helpers.h" | |
| 22 #include "image.h" | |
| 23 #include "static_shape.h" | |
| 24 #include "stridemap.h" | |
| 25 #include "weightmatrix.h" | |
| 26 | |
| 27 #include <cmath> | |
| 28 #include <cstdio> | |
| 29 #include <vector> | |
| 30 | |
| 31 struct Pix; | |
| 32 | |
| 33 namespace tesseract { | |
| 34 | |
| 35 // Class to contain all the input/output of a network, allowing for fixed or | |
| 36 // variable-strided 2d to 1d mapping, and float or int8_t values. Provides | |
| 37 // enough calculating functions to hide the detail of the implementation. | |
| 38 class TESS_API NetworkIO { | |
| 39 public: | |
| 40 NetworkIO() : int_mode_(false) {} | |
| 41 // Resizes the array (and stride), avoiding realloc if possible, to the given | |
| 42 // size from various size specs: | |
| 43 // Same stride size, but given number of features. | |
| 44 void Resize(const NetworkIO &src, int num_features) { | |
| 45 ResizeToMap(src.int_mode(), src.stride_map(), num_features); | |
| 46 } | |
| 47 // Resizes to a specific size as a 2-d temp buffer. No batches, no y-dim. | |
| 48 void Resize2d(bool int_mode, int width, int num_features); | |
| 49 // Resizes forcing a float representation with the stridemap of src and the | |
| 50 // given number of features. | |
| 51 void ResizeFloat(const NetworkIO &src, int num_features) { | |
| 52 ResizeToMap(false, src.stride_map(), num_features); | |
| 53 } | |
| 54 // Resizes to a specific stride_map. | |
| 55 void ResizeToMap(bool int_mode, const StrideMap &stride_map, int num_features); | |
| 56 // Shrinks image size by x_scale,y_scale, and use given number of features. | |
| 57 void ResizeScaled(const NetworkIO &src, int x_scale, int y_scale, int num_features); | |
| 58 // Resizes to just 1 x-coord, whatever the input. | |
| 59 void ResizeXTo1(const NetworkIO &src, int num_features); | |
| 60 // Initialize all the array to zero. | |
| 61 void Zero(); | |
| 62 // Initializes to zero all elements of the array that do not correspond to | |
| 63 // valid image positions. (If a batch of different-sized images are packed | |
| 64 // together, then there will be padding pixels.) | |
| 65 void ZeroInvalidElements(); | |
| 66 // Sets up the array from the given image, using the currently set int_mode_. | |
| 67 // If the image width doesn't match the shape, the image is truncated or | |
| 68 // padded with noise to match. | |
| 69 void FromPix(const StaticShape &shape, const Image pix, TRand *randomizer); | |
| 70 // Sets up the array from the given set of images, using the currently set | |
| 71 // int_mode_. If the image width doesn't match the shape, the images are | |
| 72 // truncated or padded with noise to match. | |
| 73 void FromPixes(const StaticShape &shape, const std::vector<Image> &pixes, | |
| 74 TRand *randomizer); | |
| 75 // Copies the given pix to *this at the given batch index, stretching and | |
| 76 // clipping the pixel values so that [black, black + 2*contrast] maps to the | |
| 77 // dynamic range of *this, ie [-1,1] for a float and (-127,127) for int. | |
| 78 // This is a 2-d operation in the sense that the output depth is the number | |
| 79 // of input channels, the height is the height of the image, and the width | |
| 80 // is the width of the image, or truncated/padded with noise if the width | |
| 81 // is a fixed size. | |
| 82 void Copy2DImage(int batch, Image pix, float black, float contrast, TRand *randomizer); | |
| 83 // Copies the given pix to *this at the given batch index, as Copy2DImage | |
| 84 // above, except that the output depth is the height of the input image, the | |
| 85 // output height is 1, and the output width as for Copy2DImage. | |
| 86 // The image is thus treated as a 1-d set of vertical pixel strips. | |
| 87 void Copy1DGreyImage(int batch, Image pix, float black, float contrast, TRand *randomizer); | |
| 88 // Helper stores the pixel value in i_ or f_ according to int_mode_. | |
| 89 // t: is the index from the StrideMap corresponding to the current | |
| 90 // [batch,y,x] position | |
| 91 // f: is the index into the depth/channel | |
| 92 // pixel: the value of the pixel from the image (in one channel) | |
| 93 // black: the pixel value to map to the lowest of the range of *this | |
| 94 // contrast: the range of pixel values to stretch to half the range of *this. | |
| 95 void SetPixel(int t, int f, int pixel, float black, float contrast); | |
| 96 // Converts the array to a Pix. Must be pixDestroyed after use. | |
| 97 Image ToPix() const; | |
| 98 // Prints the first and last num timesteps of the array for each feature. | |
| 99 void Print(int num) const; | |
| 100 | |
| 101 // Returns the timestep width. | |
| 102 int Width() const { | |
| 103 return int_mode_ ? i_.dim1() : f_.dim1(); | |
| 104 } | |
| 105 // Returns the number of features. | |
| 106 int NumFeatures() const { | |
| 107 return int_mode_ ? i_.dim2() : f_.dim2(); | |
| 108 } | |
| 109 // Accessor to a timestep of the float matrix. | |
| 110 float *f(int t) { | |
| 111 ASSERT_HOST(!int_mode_); | |
| 112 return f_[t]; | |
| 113 } | |
| 114 // Const accessor to a timestep of the float matrix. | |
| 115 const float *f(int t) const { | |
| 116 ASSERT_HOST(!int_mode_); | |
| 117 return f_[t]; | |
| 118 } | |
| 119 // Const accessor to a timestep of the int8_t matrix. | |
| 120 const int8_t *i(int t) const { | |
| 121 ASSERT_HOST(int_mode_); | |
| 122 return i_[t]; | |
| 123 } | |
| 124 // Returns true if the data is held as int8_t (i_) rather than float (f_). | |
| 125 bool int_mode() const { | |
| 126 return int_mode_; | |
| 127 } | |
| 128 // Selects which representation is in use: int8_t if true, float if false. | |
| 129 void set_int_mode(bool is_quantized) { | |
| 130 int_mode_ = is_quantized; | |
| 131 } | |
| 132 // Returns the stride map. | |
| 133 const StrideMap &stride_map() const { | |
| 134 return stride_map_; | |
| 135 } | |
| 136 // Replaces the stride map with a copy of map. | |
| 137 void set_stride_map(const StrideMap &map) { | |
| 138 stride_map_ = map; | |
| 139 } | |
| 140 // Read-only access to the underlying float array. | |
| 141 const GENERIC_2D_ARRAY<float> &float_array() const { | |
| 142 return f_; | |
| 143 } | |
| 144 // Mutable access to the underlying float array. | |
| 145 GENERIC_2D_ARRAY<float> *mutable_float_array() { | |
| 146 return &f_; | |
| 147 } | |
| 148 | |
| 149 // Copies a single time step from src. | |
| 150 void CopyTimeStepFrom(int dest_t, const NetworkIO &src, int src_t); | |
| 151 // Copies a part of single time step from src. | |
| 152 void CopyTimeStepGeneral(int dest_t, int dest_offset, int num_features, const NetworkIO &src, | |
| 153 int src_t, int src_offset); | |
| 154 // Zeroes a single time step. | |
| 155 void ZeroTimeStep(int t) { | |
| 156 if (int_mode_) { | |
| 157 memset(i_[t], 0, sizeof(*i_[t]) * NumFeatures()); | |
| 158 } else { | |
| 159 memset(f_[t], 0, sizeof(*f_[t]) * NumFeatures()); | |
| 160 } | |
| 161 } | |
| 162 // Sets the given range to random values. | |
| 163 void Randomize(int t, int offset, int num_features, TRand *randomizer); | |
| 164 | |
| 165 // Helper returns the label and score of the best choice over a range. | |
| 166 int BestChoiceOverRange(int t_start, int t_end, int not_this, int null_ch, float *rating, | |
| 167 float *certainty) const; | |
| 168 // Helper returns the rating and certainty of the choice over a range in t. | |
| 169 void ScoresOverRange(int t_start, int t_end, int choice, int null_ch, float *rating, | |
| 170 float *certainty) const; | |
| 171 // Returns the index (label) of the best value at the given timestep, | |
| 172 // and if not null, sets the score to the log of the corresponding value. | |
| 173 int BestLabel(int t, float *score) const { | |
| 174 return BestLabel(t, -1, -1, score); | |
| 175 } | |
| 176 // Returns the index (label) of the best value at the given timestep, | |
| 177 // excluding not_this and not_that, and if not null, sets the score to the | |
| 178 // log of the corresponding value. | |
| 179 int BestLabel(int t, int not_this, int not_that, float *score) const; | |
| 180 // Returns the best start position out of range (into which both start and end | |
| 181 // must fit) to obtain the highest cumulative score for the given labels. | |
| 182 int PositionOfBestMatch(const std::vector<int> &labels, int start, int end) const; | |
| 183 // Returns the cumulative score of the given labels starting at start, and | |
| 184 // using one label per time-step. | |
| 185 TFloat ScoreOfLabels(const std::vector<int> &labels, int start) const; | |
| 186 // Helper function sets all the outputs for a single timestep, such that | |
| 187 // label has value ok_score, and the other labels share 1 - ok_score. | |
| 188 // Assumes float mode. | |
| 189 void SetActivations(int t, int label, float ok_score); | |
| 190 // Modifies the values, only if needed, so that the given label is | |
| 191 // the winner at the given time step t. | |
| 192 // Assumes float mode. | |
| 193 void EnsureBestLabel(int t, int label); | |
| 194 // Helper function converts prob to certainty taking the minimum into account. | |
| 195 static float ProbToCertainty(float prob); | |
| 196 // Returns true if there is any bad value that is suspiciously like a GT | |
| 197 // error. Assuming that *this is the difference(gradient) between target | |
| 198 // and forward output, returns true if there is a large negative value | |
| 199 // (correcting a very confident output) for which there is no corresponding | |
| 200 // positive value in an adjacent timestep for the same feature index. This | |
| 201 // allows the box-truthed samples to make fine adjustments to position while | |
| 202 // stopping other disagreements of confident output with ground truth. | |
| 203 bool AnySuspiciousTruth(float confidence_thr) const; | |
| 204 | |
| 205 // Reads a single timestep to floats in the range [-1, 1]. | |
| 206 void ReadTimeStep(int t, TFloat *output) const; | |
| 207 // Adds a single timestep to floats. | |
| 208 void AddTimeStep(int t, TFloat *inout) const; | |
| 209 // Adds part of a single timestep to floats. | |
| 210 void AddTimeStepPart(int t, int offset, int num_features, float *inout) const; | |
| 211 // Writes a single timestep from floats in the range [-1, 1]. | |
| 212 void WriteTimeStep(int t, const TFloat *input); | |
| 213 // Writes a single timestep from floats in the range [-1, 1] writing only | |
| 214 // num_features elements of input to (*this)[t], starting at offset. | |
| 215 void WriteTimeStepPart(int t, int offset, int num_features, const TFloat *input); | |
| 216 // Maxpools a single time step from src. | |
| 217 void MaxpoolTimeStep(int dest_t, const NetworkIO &src, int src_t, int *max_line); | |
| 218 // Runs maxpool backward, using maxes to index timesteps in *this. | |
| 219 void MaxpoolBackward(const NetworkIO &fwd, const GENERIC_2D_ARRAY<int> &maxes); | |
| 220 // Returns the min over time of the maxes over features of the outputs. | |
| 221 float MinOfMaxes() const; | |
| 222 // Returns the max over the whole array (i_ or f_ according to int_mode_). | |
| 223 float Max() const { | |
| 224 return int_mode_ ? i_.Max() : f_.Max(); | |
| 225 } | |
| 226 // Computes combined results for a combiner that chooses between an existing | |
| 227 // input and itself, with an additional output to indicate the choice. | |
| 228 void CombineOutputs(const NetworkIO &base_output, const NetworkIO &combiner_output); | |
| 229 // Computes deltas for a combiner that chooses between 2 sets of inputs. | |
| 230 void ComputeCombinerDeltas(const NetworkIO &fwd_deltas, const NetworkIO &base_output); | |
| 231 | |
| 232 // Copies the array checking that the types match. | |
| 233 void CopyAll(const NetworkIO &src); | |
| 234 // Adds the array to a float array, with scaling to [-1, 1] if the src is int. | |
| 235 void AddAllToFloat(const NetworkIO &src); | |
| 236 // Subtracts the array from a float array. src must also be float. | |
| 237 void SubtractAllFromFloat(const NetworkIO &src); | |
| 238 | |
| 239 // Copies src to *this, with maxabs normalization to match scale. | |
| 240 void CopyWithNormalization(const NetworkIO &src, const NetworkIO &scale); | |
| 241 // Multiplies the float data by the given factor. | |
| 242 void ScaleFloatBy(float factor) { | |
| 243 f_ *= factor; | |
| 244 } | |
| 245 // Copies src to *this with independent reversal of the y dimension. | |
| 246 void CopyWithYReversal(const NetworkIO &src); | |
| 247 // Copies src to *this with independent reversal of the x dimension. | |
| 248 void CopyWithXReversal(const NetworkIO &src); | |
| 249 // Copies src to *this with independent transpose of the x and y dimensions. | |
| 250 void CopyWithXYTranspose(const NetworkIO &src); | |
| 251 // Copies src to *this, at the given feature_offset, returning the total | |
| 252 // feature offset after the copy. Multiple calls will stack outputs from | |
| 253 // multiple sources in feature space. | |
| 254 int CopyPacking(const NetworkIO &src, int feature_offset); | |
| 255 // Opposite of CopyPacking, fills *this with a part of src, starting at | |
| 256 // feature_offset, and picking num_features. Resizes *this to match. | |
| 257 void CopyUnpacking(const NetworkIO &src, int feature_offset, int num_features); | |
| 258 // Transposes the float part of *this into dest. | |
| 259 void Transpose(TransposedArray *dest) const; | |
| 260 | |
| 261 // Clips the content of a single time-step to +/-range. | |
| 262 void ClipVector(int t, float range); | |
| 263 | |
| 264 // Applies Func to timestep t of *this (u) and multiplies the result by v | |
| 265 // component-wise, putting the product in *product. | |
| 266 // *this and v_io must both be float (asserted). The outputs are TFloat. | |
| 267 template <class Func> | |
| 268 void FuncMultiply(const NetworkIO &v_io, int t, TFloat *product) { | |
| 269 ASSERT_HOST(!int_mode_); | |
| 270 ASSERT_HOST(!v_io.int_mode_); | |
| 271 Func f; | |
| 272 const float *u = f_[t]; | |
| 273 const float *v = v_io.f_[t]; | |
| 274 int dim = f_.dim2(); | |
| 275 for (int i = 0; i < dim; ++i) { | |
| 276 product[i] = f(u[i]) * v[i]; | |
| 277 } | |
| 278 } | |
| 279 // Applies Func to *this (u) at u_t, and multiplies the result by v[v_t] * w, | |
| 280 // component-wise, putting the product in *product. | |
| 281 // All NetworkIOs are assumed to be float. | |
| 282 template <class Func> | |
| 283 void FuncMultiply3(int u_t, const NetworkIO &v_io, int v_t, const TFloat *w, | |
| 284 TFloat *product) const { | |
| 285 ASSERT_HOST(!int_mode_); | |
| 286 ASSERT_HOST(!v_io.int_mode_); | |
| 287 Func f; | |
| 288 const float *u = f_[u_t]; | |
| 289 const float *v = v_io.f_[v_t]; | |
| 290 int dim = f_.dim2(); | |
| 291 for (int i = 0; i < dim; ++i) { | |
| 292 product[i] = f(u[i]) * v[i] * w[i]; | |
| 293 } | |
| 294 } | |
| 295 // Applies Func to *this (u) at u_t, and multiplies the result by v[v_t] * w, | |
| 296 // component-wise, adding the product to *product. | |
| 297 // All NetworkIOs are assumed to be float. | |
| 298 template <class Func> | |
| 299 void FuncMultiply3Add(const NetworkIO &v_io, int t, const TFloat *w, TFloat *product) const { | |
| 300 ASSERT_HOST(!int_mode_); | |
| 301 ASSERT_HOST(!v_io.int_mode_); | |
| 302 Func f; | |
| 303 const float *u = f_[t]; | |
| 304 const float *v = v_io.f_[t]; | |
| 305 int dim = f_.dim2(); | |
| 306 for (int i = 0; i < dim; ++i) { | |
| 307 product[i] += f(u[i]) * v[i] * w[i]; | |
| 308 } | |
| 309 } | |
| 310 // Applies Func1 to *this (u), Func2 to v, and multiplies the result by w, | |
| 311 // component-wise, putting the product in product, all at timestep t, except | |
| 312 // w, which is a simple array. All NetworkIOs are assumed to be float. | |
| 313 template <class Func1, class Func2> | |
| 314 void Func2Multiply3(const NetworkIO &v_io, int t, const TFloat *w, TFloat *product) const { | |
| 315 ASSERT_HOST(!int_mode_); | |
| 316 ASSERT_HOST(!v_io.int_mode_); | |
| 317 Func1 f; | |
| 318 Func2 g; | |
| 319 const float *u = f_[t]; | |
| 320 const float *v = v_io.f_[t]; | |
| 321 int dim = f_.dim2(); | |
| 322 for (int i = 0; i < dim; ++i) { | |
| 323 product[i] = f(u[i]) * g(v[i]) * w[i]; | |
| 324 } | |
| 325 } | |
| 326 | |
| 327 private: | |
| 328 // Returns the padding required for the given number of features in order | |
| 329 // for the SIMD operations to be safe. | |
| 330 static int GetPadding(int num_features); | |
| 331 | |
| 332 // Choice of float vs 8 bit int for data. | |
| 333 GENERIC_2D_ARRAY<float> f_; | |
| 334 GENERIC_2D_ARRAY<int8_t> i_; | |
| 335 // Which of f_ and i_ are we actually using. | |
| 336 bool int_mode_; | |
| 337 // Stride for 2d input data. | |
| 338 StrideMap stride_map_; | |
| 339 }; | |
| 340 | |
| 341 } // namespace tesseract. | |
| 342 | |
| 343 #endif // TESSERACT_LSTM_NETWORKIO_H_ |
