Mercurial > hgrepos > Python2 > PyMuPDF
comparison mupdf-source/thirdparty/tesseract/src/training/common/sampleiterator.h @ 2:b50eed0cc0ef upstream
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
The directory name has changed: no version number in the expanded directory now.
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Mon, 15 Sep 2025 11:43:07 +0200 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| 1:1d09e1dec1d9 | 2:b50eed0cc0ef |
|---|---|
| 1 // Copyright 2011 Google Inc. All Rights Reserved. | |
| 2 // Author: rays@google.com (Ray Smith) | |
| 3 // | |
| 4 // Licensed under the Apache License, Version 2.0 (the "License"); | |
| 5 // you may not use this file except in compliance with the License. | |
| 6 // You may obtain a copy of the License at | |
| 7 // http://www.apache.org/licenses/LICENSE-2.0 | |
| 8 // Unless required by applicable law or agreed to in writing, software | |
| 9 // distributed under the License is distributed on an "AS IS" BASIS, | |
| 10 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
| 11 // See the License for the specific language governing permissions and | |
| 12 // limitations under the License. | |
| 13 // | |
| 14 /////////////////////////////////////////////////////////////////////// | |
| 15 | |
| 16 #ifndef TESSERACT_CLASSIFY_SAMPLEITERATOR_H_ | |
| 17 #define TESSERACT_CLASSIFY_SAMPLEITERATOR_H_ | |
| 18 | |
| 19 namespace tesseract { | |
| 20 | |
| 21 class IndexMapBiDi; | |
| 22 class IntFeatureMap; | |
| 23 class ShapeTable; | |
| 24 class TrainingSample; | |
| 25 class TrainingSampleSet; | |
| 26 struct UnicharAndFonts; | |
| 27 | |
| 28 // Iterator class to encapsulate the complex iteration involved in getting | |
| 29 // all samples of all shapes needed for a classification problem. | |
| 30 // | |
| 31 // =====INPUTS TO Init FUNCTION===== | |
| 32 // The charset_map defines a subset of the sample_set classes (when shape_table | |
| 33 // is nullptr), or of the shape_table classes (when it is not nullptr). | |
| 34 // | |
| 35 // The shape_table (if not nullptr) defines the mapping from shapes to | |
| 36 // font_id/class_id pairs. Each shape is a list of unichar_id and font lists. | |
| 37 // | |
| 38 // The sample_set holds the samples and provides indexed access to samples | |
| 39 // of font_id/class_id pairs. | |
| 40 // | |
| 41 // If randomize is true, the samples are perturbed slightly, but the | |
| 42 // perturbation is guaranteed to be the same for multiple identical | |
| 43 // iterations. | |
| 44 // | |
| 45 // =====DIFFERENT COMBINATIONS OF INPUTS===== | |
| 46 // nullptr shape_table: | |
| 47 // Without a shape_table, everything works in UNICHAR_IDs. | |
| 48 // | |
| 49 // nullptr shape_table, nullptr charset_map: | |
| 50 // Iterations simply run over the samples in the order the samples occur in the | |
| 51 // input files. | |
| 52 // GetCompactClassID and GetSparseClassID both return the sample UNICHAR_ID. | |
| 53 // | |
| 54 // nullptr shape_table, non-nullptr charset_map: | |
| 55 // When shape_table is nullptr, the charset_map indexes unichar_ids directly, | |
| 56 // and an iteration returns all samples of all chars in the charset_map, which | |
| 57 // is a subset of the full unicharset. | |
| 58 // The iteration will be in groups of the same unichar_id, in the order | |
| 59 // defined by the charset_map. | |
| 60 // GetCompactClassID returns the charset_map index of a sample, and | |
| 61 // GetSparseClassID returns the sample UNICHAR_ID. | |
| 62 // | |
| 63 // Non-nullptr shape_table: | |
| 64 // With a shape_table, samples are grouped according to the shape_table, so | |
| 65 // multiple UNICHAR_IDs and fonts may be grouped together, and everything | |
| 66 // works in shape_ids. | |
| 67 // | |
| 68 // Non-nullptr shape_table, nullptr charset_map: | |
| 69 // Iterations simply run over the samples in the order of shape_id. | |
| 70 // GetCompactClassID and GetSparseClassID both return the shape_id. | |
| 71 // (If you want the unichar_id or font_id, the sample still has them.) | |
| 72 // | |
| 73 // Non-nullptr shape_table, non-nullptr charset_map: | |
| 74 // When shape_table is not nullptr, the charset_map indexes and subsets shapes | |
| 75 // in the shape_table, and iterations will be in shape_table order, not | |
| 76 // charset_map order. | |
| 77 // GetCompactClassID returns the charset_map index of a shape, and | |
| 78 // GetSparseClassID returns the shape_id. | |
| 79 // | |
| 80 // =====What is SampleIterator good for?===== | |
| 81 // Inside a classifier training module, the SampleIterator has abstracted away | |
| 82 // all the different modes above. | |
| 83 // Use the following iteration to train your classifier: | |
| 84 // for (it.Begin(); !it.AtEnd(); it.Next()) { | |
| 85 // const TrainingSample& sample = it.GetSample(); | |
| 86 // int class_id = it.GetCompactClassID(); | |
| 87 // Your classifier may or may not be dealing with a shape_table, and may be | |
| 88 // dealing with some subset of the character/shape set. It doesn't need to | |
| 89 // know and shouldn't care. It is just learning shapes with compact class ids | |
| 90 // in the range [0, it.CompactCharsetSize()). | |
| 91 class SampleIterator { | |
| 92 public: | |
| 93 SampleIterator(); | |
| 94 ~SampleIterator(); | |
| 95 | |
| 96 void Clear(); | |
| 97 | |
| 98 // See class comment for arguments; charset_map and shape_table may each be nullptr. | |
| 99 void Init(const IndexMapBiDi *charset_map, const ShapeTable *shape_table, bool randomize, | |
| 100 TrainingSampleSet *sample_set); | |
| 101 | |
| 102 // Iterator functions designed for use with a simple for loop: | |
| 103 // for (it.Begin(); !it.AtEnd(); it.Next()) { | |
| 104 // const TrainingSample& sample = it.GetSample(); | |
| 105 // int class_id = it.GetCompactClassID(); | |
| 106 // ... | |
| 107 // } | |
| 108 void Begin(); | |
| 109 bool AtEnd() const; | |
| 110 const TrainingSample &GetSample() const; | |
| 111 TrainingSample *MutableSample() const; | |
| 112 // Returns the total index (from the original set of samples) of the current | |
| 113 // sample. | |
| 114 int GlobalSampleIndex() const; | |
| 115 // Returns the index of the current sample in compact charset space, so | |
| 116 // in a 2-class problem between x and y, the returned indices will all be | |
| 117 // 0 or 1, and have nothing to do with the unichar_ids. | |
| 118 // If the charset_map_ is nullptr, then this is equal to GetSparseClassID(). | |
| 119 int GetCompactClassID() const; | |
| 120 // Returns the index of the current sample in sparse charset space, so | |
| 121 // in a 2-class problem between x and y, the returned indices will all be | |
| 122 // x or y, where x and y may be unichar_ids (no shape_table_) or shape_ids | |
| 123 // with a shape_table_. | |
| 124 int GetSparseClassID() const; | |
| 125 // Moves on to the next indexable sample. If the end is reached, leaves | |
| 126 // the state such that AtEnd() is true. | |
| 127 void Next(); | |
| 128 | |
| 129 // Returns the size of the compact charset space. | |
| 130 int CompactCharsetSize() const; | |
| 131 // Returns the size of the sparse charset space. | |
| 132 int SparseCharsetSize() const; | |
| 133 | |
| 134 const IndexMapBiDi &charset_map() const { | |
| 135 return *charset_map_; /* NOTE(review): dereferences charset_map_ unchecked, yet the class comment permits a nullptr charset_map - confirm callers guard this. */ | |
| 136 } | |
| 137 const ShapeTable *shape_table() const { | |
| 138 return shape_table_; | |
| 139 } | |
| 140 // Sample set operations. | |
| 141 const TrainingSampleSet *sample_set() const { | |
| 142 return sample_set_; | |
| 143 } | |
| 144 | |
| 145 // A set of functions that do something to all the samples accessed by the | |
| 146 // iterator, as it is currently set up. | |
| 147 | |
| 148 // Apply the supplied feature_space/feature_map transform to all samples | |
| 149 // accessed by this iterator. | |
| 150 void MapSampleFeatures(const IntFeatureMap &feature_map); | |
| 151 | |
| 152 // Adjust the weights of all the samples to be uniform in the given charset. | |
| 153 // Returns the number of samples in the iterator. | |
| 154 int UniformSamples(); | |
| 155 | |
| 156 // Normalize the weights of all the samples defined by the iterator so they | |
| 157 // sum to 1. Returns the minimum assigned sample weight. | |
| 158 double NormalizeSamples(); | |
| 159 | |
| 160 private: | |
| 161 // Helper returns the current UnicharAndFonts shape entry. | |
| 162 const UnicharAndFonts *GetShapeEntry() const; | |
| 163 | |
| 164 // Map to subset the actual charset space. | |
| 165 const IndexMapBiDi *charset_map_; | |
| 166 // Shape table to recombine character classes into shapes. | |
| 167 const ShapeTable *shape_table_; | |
| 168 // The samples to iterate over. | |
| 169 TrainingSampleSet *sample_set_; | |
| 170 // Flag to control randomizing the sample features. | |
| 171 bool randomize_; | |
| 172 // Shape table owned by this iterator, used to iterate character classes. | |
| 173 ShapeTable *owned_shape_table_; | |
| 174 | |
| 175 // Top-level iteration. Shape index in sparse charset_map space. | |
| 176 int shape_index_; | |
| 177 int num_shapes_; | |
| 178 // Index to the character class within a shape. | |
| 179 int shape_char_index_; | |
| 180 int num_shape_chars_; | |
| 181 // Index to the font within a shape/class pair. | |
| 182 int shape_font_index_; | |
| 183 int num_shape_fonts_; | |
| 184 // The lowest level iteration. sample_index_/num_samples_ counts samples | |
| 185 // in the current shape/class/font combination. | |
| 186 int sample_index_; | |
| 187 int num_samples_; | |
| 188 }; | |
| 189 | |
| 190 } // namespace tesseract. | |
| 191 | |
| 192 #endif // TESSERACT_CLASSIFY_SAMPLEITERATOR_H_ |
