comparison mupdf-source/thirdparty/leptonica/src/recog.h @ 2:b50eed0cc0ef upstream

ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4. The directory name has changed: no version number in the expanded directory now.
author Franz Glasner <fzglas.hg@dom66.de>
date Mon, 15 Sep 2025 11:43:07 +0200
parents
children
comparison
equal deleted inserted replaced
1:1d09e1dec1d9 2:b50eed0cc0ef
1 /*====================================================================*
2 - Copyright (C) 2001 Leptonica. All rights reserved.
3 -
4 - Redistribution and use in source and binary forms, with or without
5 - modification, are permitted provided that the following conditions
6 - are met:
7 - 1. Redistributions of source code must retain the above copyright
8 - notice, this list of conditions and the following disclaimer.
9 - 2. Redistributions in binary form must reproduce the above
10 - copyright notice, this list of conditions and the following
11 - disclaimer in the documentation and/or other materials
12 - provided with the distribution.
13 -
14 - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
15 - ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
16 - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
17 - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL ANY
18 - CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
19 - EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
20 - PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
21 - PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
22 - OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
23 - NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
24 - SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25 *====================================================================*/
26
27 #ifndef LEPTONICA_RECOG_H
28 #define LEPTONICA_RECOG_H
29
30 /*!
31 * \file recog.h
32 *
33 * <pre>
34 * This is a simple utility for training and recognizing individual
35 * machine-printed text characters. It is designed to be adapted
36 * to a particular set of character images; e.g., from a book.
37 *
38 * There are two methods of training the recognizer. In the most
39 * simple, a set of bitmaps has been labeled by some means, such as
40 * a generic OCR program. This is input either one template at a time
41 * or as a pixa of templates, to a function that creates a recog.
42 * If in a pixa, the text string label must be embedded in the
43 * text field of each pix.
44 *
45 * If labeled data is not available, we start with a bootstrap
46 * recognizer (BSR) that has labeled data from a variety of sources.
47 * These images are scaled, typically to a fixed height, and then
48 * fed similarly scaled unlabeled images from the source (e.g., book),
49 * and the BSR attempts to identify them. All images that have
50 * a high enough correlation score with one of the templates in the
51 * BSR are emitted in a pixa, which now holds unscaled and labeled
52 * templates from the source. This is the generator for a book-adapted
53 * recognizer (BAR).
54 *
55 * The pixa should always be thought of as the primary structure.
56 * It is the generator for the recog, because a recog is built
57 * from a pixa of unscaled images.
58 *
59 * New image templates can be added to a recog as long as it is
60 * in training mode. Once training is finished, to add templates
61 * it is necessary to extract the generating pixa, add templates
62 * to that pixa, and make a new recog. Similarly, we do not
63 * join two recog; instead, we simply join their generating pixa,
64 * and make a recog from that.
65 *
66 * To remove outliers from a pixa of labeled pix, make a recog,
67 * determine the outliers, and generate a new pixa with the
68 * outliers removed. The outliers are determined by building
69 * special templates for each character set that are scaled averages
70 * of the individual templates. Then a correlation score is found
71 * between each template and the averaged templates. There are
72 * two implementations; outliers are determined as either:
73 * (1) a template having a correlation score with its class average
74 * that is below a threshold, or
75 * (2) a template having a correlation score with its class average
76 * that is smaller than the correlation score with the average
77 * of another class.
78 * Outliers are removed from the generating pixa. Scaled averaging
79 * is only performed for determining outliers and for splitting
80 * characters; it is never used in a trained recognizer for identifying
81 * unlabeled samples.
82 *
83 * Two methods using averaged templates are provided for splitting
84 * touching characters:
85 * (1) greedy matching
86 * (2) document image decoding (DID)
87 * The DID method is the default. It is about 5x faster and
88 * possibly more accurate.
89 *
90 * Once a BAR has been made, unlabeled sample images are identified
91 * by finding the individual template in the BAR with highest
92 * correlation. The input images and images in the BAR can be
93 * represented in two ways:
94 * (1) as scanned, binarized to 1 bpp
95 * (2) as a width-normalized outline formed by thinning to a
96 * skeleton and then dilating by a fixed amount.
97 *
98 * The recog can be serialized to file and read back. The serialized
99 * version holds the templates used for correlation (which may have
100 * been modified by scaling and turning into lines from the unscaled
101 * templates), plus, for arbitrary character sets, the UTF8
102 * representation and the lookup table mapping from the character
103 * representation to index.
104 *
105 * Why do we not use averaged templates for recognition?
106 * Letterforms can take on significantly different shapes (e.g.,
107 * the letters 'a' and 'g'), and it makes no sense to average these.
108 * The previous version of this utility allowed multiple recognizers
109 * to exist, but this is an unnecessary complication if recognition
110 * is done on all samples instead of on averages.
111 * </pre>
112 */
113
114 #define RECOG_VERSION_NUMBER 2 /* NOTE(review): presumably the version of the serialized recog format -- confirm against the read/write code */
115
116 struct L_Recog { /* recognizer: scaling/binarization parameters, template sets, debug images and temporary decoding data */
117 l_int32 scalew; /*!< scale all examples to this width; */
118 /*!< use 0 to prevent horizontal scaling */
119 l_int32 scaleh; /*!< scale all examples to this height; */
120 /*!< use 0 to prevent vertical scaling */
121 l_int32 linew; /*!< use a value > 0 to convert the bitmap */
122 /*!< to lines of fixed width; 0 to skip */
123 l_int32 templ_use; /*!< template use: use either the average */
124 /*!< or all templates (L_USE_AVERAGE_TEMPLATES */
125 /*!< or L_USE_ALL_TEMPLATES) */
126 l_int32 maxarraysize; /*!< initialize container arrays to this */
127 l_int32 setsize; /*!< size of character set */
128 l_int32 threshold; /*!< for binarizing if depth > 1 */
129 l_int32 maxyshift; /*!< vertical jiggle on nominal centroid */
130 /*!< alignment; typically 0 or 1 */
131 l_int32 charset_type; /*!< one of L_ARABIC_NUMERALS, etc. */
132 l_int32 charset_size; /*!< expected number of classes in charset */
133 l_int32 min_nopad; /*!< min number of samples without padding */
134 l_int32 num_samples; /*!< number of training samples */
135 l_int32 minwidth_u; /*!< min width averaged unscaled templates */
136 l_int32 maxwidth_u; /*!< max width averaged unscaled templates */
137 l_int32 minheight_u; /*!< min height averaged unscaled templates */
138 l_int32 maxheight_u; /*!< max height averaged unscaled templates */
139 l_int32 minwidth; /*!< min width averaged scaled templates */
140 l_int32 maxwidth; /*!< max width averaged scaled templates */
141 l_int32 ave_done; /*!< set to 1 when averaged bitmaps are made */
142 l_int32 train_done; /*!< set to 1 when training is complete or */
143 /*!< identification has started */
144 l_float32 max_wh_ratio; /*!< max width/height ratio to split */
145 l_float32 max_ht_ratio; /*!< max of max/min template height ratio */
146 l_int32 min_splitw; /*!< min component width kept in splitting */
147 l_int32 max_splith; /*!< max component height kept in splitting */
148 struct Sarray *sa_text; /*!< text array for arbitrary char set */
149 struct L_Dna *dna_tochar; /*!< index-to-char lut for arbitrary charset */
150 l_int32 *centtab; /*!< table for finding centroids */
151 l_int32 *sumtab; /*!< table for finding pixel sums */
152 struct Pixaa *pixaa_u; /*!< all unscaled templates for each class */
153 struct Ptaa *ptaa_u; /*!< centroids of all unscaled templates */
154 struct Numaa *naasum_u; /*!< area of all unscaled templates */
155 struct Pixaa *pixaa; /*!< all (scaled) templates for each class */
156 struct Ptaa *ptaa; /*!< centroids of all (scaled) templates */
157 struct Numaa *naasum; /*!< area of all (scaled) templates */
158 struct Pixa *pixa_u; /*!< averaged unscaled templates per class */
159 struct Pta *pta_u; /*!< centroids of unscaled ave. templates */
160 struct Numa *nasum_u; /*!< area of unscaled averaged templates */
161 struct Pixa *pixa; /*!< averaged (scaled) templates per class */
162 struct Pta *pta; /*!< centroids of (scaled) ave. templates */
163 struct Numa *nasum; /*!< area of (scaled) averaged templates */
164 struct Pixa *pixa_tr; /*!< all input training images */
165 struct Pixa *pixadb_ave; /*!< unscaled and scaled averaged bitmaps */
166 struct Pixa *pixa_id; /*!< input images for identifying */
167 struct Pix *pixdb_ave; /*!< debug: best match of input against ave. */
168 struct Pix *pixdb_range; /*!< debug: best matches within range */
169 struct Pixa *pixadb_boot; /*!< debug: bootstrap training results */
170 struct Pixa *pixadb_split; /*!< debug: splitting results */
171 struct L_Bmf *bmf; /*!< bmf fonts */
172 l_int32 bmf_size; /*!< font size of bmf; default is 6 pt */
173 struct L_Rdid *did; /*!< temp data used for image decoding */
174 struct L_Rch *rch; /*!< temp data used for holding best char */
175 struct L_Rcha *rcha; /*!< temp data used for array of best chars */
176 };
177 typedef struct L_Recog L_RECOG;
178
179 /*!
180 * Data returned from correlation matching on a single character (see the rch field of struct L_Recog)
181 */
182 struct L_Rch {
183 l_int32 index; /*!< index of best template */
184 l_float32 score; /*!< correlation score of best template */
185 char *text; /*!< character string of best template */
186 l_int32 sample; /*!< index of best sample (within the best */
187 /*!< template class, if all samples are used) */
188 l_int32 xloc; /*!< x-location of template (delx + shiftx) */
189 l_int32 yloc; /*!< y-location of template (dely + shifty) */
190 l_int32 width; /*!< width of best template */
191 };
192 typedef struct L_Rch L_RCH;
193
194 /*!
195 * Data returned from correlation matching on an array of characters (see the rcha field of struct L_Recog)
196 */
197 struct L_Rcha {
198 struct Numa *naindex; /*!< indices of best templates */
199 struct Numa *nascore; /*!< correlation scores of best templates */
200 struct Sarray *satext; /*!< character strings of best templates */
201 struct Numa *nasample; /*!< indices of best samples */
202 struct Numa *naxloc; /*!< x-locations of templates (delx + shiftx) */
203 struct Numa *nayloc; /*!< y-locations of templates (dely + shifty) */
204 struct Numa *nawidth; /*!< widths of best templates */
205 };
206 typedef struct L_Rcha L_RCHA;
207
208 /*!
209 * Data used for decoding a line of characters (see the did field of struct L_Recog).
210 */
211 struct L_Rdid {
212 struct Pix *pixs; /*!< clone of pix to be decoded */
213 l_int32 **counta; /*!< count array for each averaged template */
214 l_int32 **delya; /*!< best y-shift array per averaged template */
215 l_int32 narray; /*!< number of averaged templates */
216 l_int32 size; /*!< size of count array (width of pixs) */
217 l_int32 *setwidth; /*!< setwidths for each template */
218 struct Numa *nasum; /*!< pixel count in pixs by column */
219 struct Numa *namoment; /*!< first moment of pixels in pixs by cols */
220 l_int32 fullarrays; /*!< 1 if full arrays are made; 0 otherwise */
221 l_float32 *beta; /*!< channel coeffs for template fg term */
222 l_float32 *gamma; /*!< channel coeffs for bit-and term */
223 l_float32 *trellisscore; /*!< score on trellis */
224 l_int32 *trellistempl; /*!< template on trellis (for backtrack) */
225 struct Numa *natempl; /*!< indices of best path templates */
226 struct Numa *naxloc; /*!< x locations of best path templates */
227 struct Numa *nadely; /*!< y locations of best path templates */
228 struct Numa *nawidth; /*!< widths of best path templates */
229 struct Boxa *boxa; /*!< Viterbi result for splitting input pixs */
230 struct Numa *nascore; /*!< correlation scores: best path templates */
231 struct Numa *natempl_r; /*!< indices of best rescored templates */
232 struct Numa *nasample_r; /*!< samples of best scored templates */
233 struct Numa *naxloc_r; /*!< x locations of best rescored templates */
234 struct Numa *nadely_r; /*!< y locations of best rescored templates */
235 struct Numa *nawidth_r; /*!< widths of best rescored templates */
236 struct Numa *nascore_r; /*!< correlation scores: rescored templates */
237 };
238 typedef struct L_Rdid L_RDID;
239
240
241 /*-------------------------------------------------------------------------*
242 * Flags for describing limited character sets *
243 *-------------------------------------------------------------------------*/
244 /*! Character Set: values for the charset_type field of struct L_Recog */
245 enum {
246 L_UNKNOWN = 0, /*!< character set type is not specified */
247 L_ARABIC_NUMERALS = 1, /*!< 10 digits */
248 L_LC_ROMAN_NUMERALS = 2, /*!< 7 lower-case letters (i,v,x,l,c,d,m) */
249 L_UC_ROMAN_NUMERALS = 3, /*!< 7 upper-case letters (I,V,X,L,C,D,M) */
250 L_LC_ALPHA = 4, /*!< 26 lower-case letters */
251 L_UC_ALPHA = 5 /*!< 26 upper-case letters */
252 };
253
254 /*-------------------------------------------------------------------------*
255 * Flags for selecting between using average and all templates: *
256 * recog->templ_use *
257 *-------------------------------------------------------------------------*/
258 /*! Template Select: values for the templ_use field of struct L_Recog */
259 enum {
260 L_USE_ALL_TEMPLATES = 0, /*!< use all templates; default */
261 L_USE_AVERAGE_TEMPLATES = 1 /*!< use average templates; special cases */
262 };
263
264 #endif /* LEPTONICA_RECOG_H */