Mercurial > hgrepos > Python2 > PyMuPDF
comparison mupdf-source/thirdparty/tesseract/src/training/common/networkbuilder.h @ 2:b50eed0cc0ef upstream
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
The directory name has changed: no version number in the expanded directory now.
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Mon, 15 Sep 2025 11:43:07 +0200 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| 1:1d09e1dec1d9 | 2:b50eed0cc0ef |
|---|---|
| 1 /////////////////////////////////////////////////////////////////////// | |
| 2 // File: networkbuilder.h | |
| 3 // Description: Class to parse the network description language and | |
| 4 // build a corresponding network. | |
| 5 // Author: Ray Smith | |
| 6 // | |
| 7 // (C) Copyright 2014, Google Inc. | |
| 8 // Licensed under the Apache License, Version 2.0 (the "License"); | |
| 9 // you may not use this file except in compliance with the License. | |
| 10 // You may obtain a copy of the License at | |
| 11 // http://www.apache.org/licenses/LICENSE-2.0 | |
| 12 // Unless required by applicable law or agreed to in writing, software | |
| 13 // distributed under the License is distributed on an "AS IS" BASIS, | |
| 14 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
| 15 // See the License for the specific language governing permissions and | |
| 16 // limitations under the License. | |
| 17 /////////////////////////////////////////////////////////////////////// | |
| 18 | |
| 19 #ifndef TESSERACT_LSTM_NETWORKBUILDER_H_ | |
| 20 #define TESSERACT_LSTM_NETWORKBUILDER_H_ | |
| 21 | |
| 22 #include "export.h" | |
| 23 #include "static_shape.h" | |
| 24 #include "stridemap.h" | |
| 25 | |
| 26 class UNICHARSET; | |
| 27 | |
| 28 namespace tesseract { | |
| 29 | |
| 30 class Input; | |
| 31 class Network; | |
| 32 class Parallel; | |
| 33 class TRand; | |
| 34 | |
| 35 class TESS_COMMON_TRAINING_API NetworkBuilder { | |
| 36 public: | |
| 37 explicit NetworkBuilder(int num_softmax_outputs) : num_softmax_outputs_(num_softmax_outputs) {} | |
| 38 | |
| 39 // Builds a network with a network_spec in the network description | |
| 40 // language, to recognize a character set of num_outputs size. | |
| 41 // If append_index is non-negative, then *network must be non-null and the | |
| 42 // given network_spec will be appended to *network AFTER append_index, with | |
| 43 // the top of the input *network discarded. | |
| 44 // Note that network_spec is a const char* pointer into the spec string, | |
| 45 // matching the const char** that BuildFromString takes. | |
| 46 // net_flags control network behavior according to the NetworkFlags enum. | |
| 47 // The resulting network is returned via **network. | |
| 48 // Returns false if something failed. | |
| 49 static bool InitNetwork(int num_outputs, const char *network_spec, int append_index, | |
| 50 int net_flags, float weight_range, TRand *randomizer, Network **network); | |
| 51 | |
| 52 // Parses the given string and returns a network according to the following | |
| 53 // language: | |
| 54 // ============ Syntax of description below: ============ | |
| 55 // <d> represents a number. | |
| 56 // <net> represents any single network element, including (recursively) a | |
| 57 // [...] series or (...) parallel construct. | |
| 58 // (s|t|r|l|m) (regex notation) represents a single required letter. | |
| 59 // NOTE THAT THROUGHOUT, x and y are REVERSED from conventional mathematics, | |
| 60 // to use the same convention as TensorFlow. The reason TF adopts this | |
| 61 // convention is to eliminate the need to transpose images on input, since | |
| 62 // adjacent memory locations in images increase x and then y, while adjacent | |
| 63 // memory locations in tensors in TF, and NetworkIO in tesseract increase the | |
| 64 // rightmost index first, then the next-left and so-on, like C arrays. | |
| 65 // ============ INPUTS ============ | |
| 66 // <b>,<h>,<w>,<d> A batch of b images with height h, width w, and depth d. | |
| 67 // b, h and/or w may be zero, to indicate variable size. Some network layer | |
| 68 // (summarizing LSTM) must be used to make a variable h known. | |
| 69 // d may be 1 for greyscale, 3 for color. | |
| 70 // NOTE that throughout the constructed network, the inputs/outputs are all of | |
| 71 // the same [batch,height,width,depth] dimensions, even if a different size. | |
| 72 // ============ PLUMBING ============ | |
| 73 // [...] Execute ... networks in series (layers). | |
| 74 // (...) Execute ... networks in parallel, with their output depths added. | |
| 75 // R<d><net> Execute d replicas of net in parallel, with their output depths | |
| 76 // added. | |
| 77 // Rx<net> Execute <net> with x-dimension reversal. | |
| 78 // Ry<net> Execute <net> with y-dimension reversal. | |
| 79 // S<y>,<x> Rescale 2-D input by shrink factor x,y, rearranging the data by | |
| 80 // increasing the depth of the input by factor xy. | |
| 81 // Mp<y>,<x> Maxpool the input, reducing the size by an (x,y) rectangle. | |
| 82 // ============ FUNCTIONAL UNITS ============ | |
| 83 // C(s|t|r|l|m)<y>,<x>,<d> Convolves using a (x,y) window, with no shrinkage, | |
| 84 // random infill, producing d outputs, then applies a non-linearity: | |
| 85 // s: Sigmoid, t: Tanh, r: Relu, l: Linear, m: Softmax. | |
| 86 // F(s|t|r|l|m)<d> Truly fully-connected with s|t|r|l|m non-linearity and d | |
| 87 // outputs. Connects to every x,y,depth position of the input, reducing | |
| 88 // height, width to 1, producing a single <d> vector as the output. | |
| 89 // Input height and width must be constant. | |
| 90 // For a sliding-window linear or non-linear map that connects just to the | |
| 91 // input depth, and leaves the input image size as-is, use a 1x1 convolution | |
| 92 // eg. Cr1,1,64 instead of Fr64. | |
| 93 // L(f|r|b)(x|y)[s]<n> LSTM cell with n states/outputs. | |
| 94 // The LSTM must have one of: | |
| 95 // f runs the LSTM forward only. | |
| 96 // r runs the LSTM reversed only. | |
| 97 // b runs the LSTM bidirectionally. | |
| 98 // It will operate on either the x- or y-dimension, treating the other | |
| 99 // dimension independently (as if part of the batch). | |
| 100 // s (optional) summarizes the output in the requested dimension, | |
| 101 // outputting only the final step, collapsing the dimension to a | |
| 102 // single element. | |
| 103 // LS<n> Forward-only LSTM cell in the x-direction, with built-in Softmax. | |
| 104 // LE<n> Forward-only LSTM cell in the x-direction, with built-in softmax, | |
| 105 // with binary Encoding. | |
| 106 // L2xy<n> Full 2-d LSTM operating in quad-directions (bidi in x and y) and | |
| 107 // all the output depths added. | |
| 108 // ============ OUTPUTS ============ | |
| 109 // The network description must finish with an output specification: | |
| 110 // O(2|1|0)(l|s|c)<n> output layer with n classes | |
| 111 // 2 (heatmap) Output is a 2-d vector map of the input (possibly at | |
| 112 // different scale). | |
| 113 // 1 (sequence) Output is a 1-d sequence of vector values. | |
| 114 // 0 (category) Output is a 0-d single vector value. | |
| 115 // l uses a logistic non-linearity on the output, allowing multiple | |
| 116 // hot elements in any output vector value. | |
| 117 // s uses a softmax non-linearity, with one-hot output in each value. | |
| 118 // c uses a softmax with CTC. Can only be used with 1 (sequence). | |
| 119 // NOTE1: Only O1s and O1c are currently supported. | |
| 120 // NOTE2: n is totally ignored, and is for compatibility purposes only. The | |
| 121 // output number of classes is obtained automatically from the | |
| 122 // unicharset. | |
| 123 Network *BuildFromString(const StaticShape &input_shape, const char **str); | |
| 124 | |
| 125 private: | |
| 126 // Parses an input specification and returns the result, which may include a | |
| 127 // series. | |
| 128 Network *ParseInput(const char **str); | |
| 129 // Parses a sequential series of networks, defined by [<net><net>...]. | |
| 130 Network *ParseSeries(const StaticShape &input_shape, Input *input_layer, const char **str); | |
| 131 // Parses a parallel set of networks, defined by (<net><net>...). | |
| 132 Network *ParseParallel(const StaticShape &input_shape, const char **str); | |
| 133 // Parses a network that begins with 'R'. | |
| 134 Network *ParseR(const StaticShape &input_shape, const char **str); | |
| 135 // Parses a network that begins with 'S'. | |
| 136 Network *ParseS(const StaticShape &input_shape, const char **str); | |
| 137 // Parses a network that begins with 'C'. | |
| 138 Network *ParseC(const StaticShape &input_shape, const char **str); | |
| 139 // Parses a network that begins with 'M'. | |
| 140 Network *ParseM(const StaticShape &input_shape, const char **str); | |
| 141 // Parses an LSTM network, either individual, bi- or quad-directional. | |
| 142 Network *ParseLSTM(const StaticShape &input_shape, const char **str); | |
| 143 // Builds a set of 4 lstms with x and y reversal, running in true parallel. | |
| 144 static Network *BuildLSTMXYQuad(int num_inputs, int num_states); | |
| 145 // Parses a Fully connected network. | |
| 146 Network *ParseFullyConnected(const StaticShape &input_shape, const char **str); | |
| 147 // Parses an Output spec. | |
| 148 Network *ParseOutput(const StaticShape &input_shape, const char **str); | |
| 149 | |
| 150 private: | |
| 151 int num_softmax_outputs_; | |
| 152 }; | |
| 153 | |
| 154 } // namespace tesseract. | |
| 155 | |
| 156 #endif // TESSERACT_LSTM_NETWORKBUILDER_H_ |
