comparison mupdf-source/thirdparty/leptonica/src/recogbasic.c @ 2:b50eed0cc0ef upstream

ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4. The directory name has changed: no version number in the expanded directory now.
author Franz Glasner <fzglas.hg@dom66.de>
date Mon, 15 Sep 2025 11:43:07 +0200
parents
children
comparison
equal deleted inserted replaced
1:1d09e1dec1d9 2:b50eed0cc0ef
1 /*====================================================================*
2 - Copyright (C) 2001 Leptonica. All rights reserved.
3 -
4 - Redistribution and use in source and binary forms, with or without
5 - modification, are permitted provided that the following conditions
6 - are met:
7 - 1. Redistributions of source code must retain the above copyright
8 - notice, this list of conditions and the following disclaimer.
9 - 2. Redistributions in binary form must reproduce the above
10 - copyright notice, this list of conditions and the following
11 - disclaimer in the documentation and/or other materials
12 - provided with the distribution.
13 -
14 - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
15 - ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
16 - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
17 - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL ANY
18 - CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
19 - EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
20 - PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
21 - PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
22 - OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
23 - NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
24 - SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25 *====================================================================*/
26
27 /*!
28 * \file recogbasic.c
29 * <pre>
30 *
31 * Recog creation, destruction and access
32 * L_RECOG *recogCreateFromRecog()
33 * L_RECOG *recogCreateFromPixa()
34 * L_RECOG *recogCreateFromPixaNoFinish()
35 * L_RECOG *recogCreate()
36 * void recogDestroy()
37 *
38 * Recog accessors
39 * l_int32 recogGetCount()
40 * l_int32 recogSetParams()
41 * static l_int32 recogGetCharsetSize()
42 *
43 * Character/index lookup
44 * l_int32 recogGetClassIndex()
45 * l_int32 recogStringToIndex()
46 * l_int32 recogGetClassString()
47 * l_int32 l_convertCharstrToInt()
48 *
49 * Serialization
50 * L_RECOG *recogRead()
51 * L_RECOG *recogReadStream()
52 * L_RECOG *recogReadMem()
53 * l_int32 recogWrite()
54 * l_int32 recogWriteStream()
55 * l_int32 recogWriteMem()
56 * PIXA *recogExtractPixa()
57 * static l_int32 recogAddCharstrLabels()
58 * static l_int32 recogAddAllSamples()
59 *
60 * The recognizer functionality is split into four files:
61 * recogbasic.c: create, destroy, access, serialize
62 * recogtrain.c: training on labeled and unlabeled data
63 * recogident.c: running the recognizer(s) on input
64 * recogdid.c: running the recognizer(s) on input using a
65 * document image decoding (DID) hidden markov model
66 *
67 * This is a content-adapted (or book-adapted) recognizer (BAR) application.
68 * The recognizers here are typically assembled from data that has
69 * been labeled by a generic recognition system, such as Tesseract.
70 * The general procedure to create a recognizer (recog) from labeled data is
71 * to add the labeled character bitmaps, either one at a time or
72 * all together from a pixa with labeled pix.
73 *
74 * The suggested use for a BAR that consists of labeled templates drawn
75 * from a single source (e.g., a book) is to identify unlabeled samples
76 * by using unscaled character templates in the BAR, picking the
77 * template closest to the unlabeled sample.
78 *
79 * Outliers can be removed from a pixa of labeled pix. This is one of
80 * two methods that use averaged templates (the other is greedy splitting
81 * of characters). See recogtrain.c for a discussion and the implementation.
82 *
83 * A special bootstrap recognizer (BSR) can be used to make a BAR from
84 * unlabeled book data. This is done by comparing character images
85 * from the book with labeled templates in the BSR, where all images
86 * are scaled to h = 40. The templates can be either the scanned images
87 * or images consisting of width-normalized strokes derived from
88 * the skeleton of the character bitmaps.
89 *
90 * Two BARs of labeled character data, that have been made by
91 * different recognizers, can be joined by extracting a pixa of the
92 * labeled templates from each, joining the two pixa, and then
93 * regenerating a BAR from the joined set of templates.
94 * If all the labeled character data is from a single source (e.g., a book),
95 * identification can proceed using unscaled templates (either the input
96 * image or width-normalized lines). But if the labeled data comes from
97 * more than one source, (a "hybrid" recognizer), the templates should
98 * be scaled, and we recommend scaling to a fixed height.
99 *
100 * Suppose it is not possible to generate a BAR with a sufficient number
101 * of templates of each class taken from a single source. In that case,
102 * templates from the BSR itself can be added. This is the condition
103 * described above, where the labeled templates come from multiple
104 * sources, and it is necessary to do all character matches using
105 * templates that have been scaled to a fixed height (e.g., 40).
106 * Likewise, the samples to be identified using this hybrid recognizer
107 * must be modified in the same way. See prog/recogtest3.c for an
108 * example of the steps that can be taken in the construction of a BAR
109 * using a BSR.
110 *
111 * For training numeric input, an example set of calls that scales
112 * each training input to fixed h and will use the line templates of
113 * width linew for identifying unknown characters is:
114 * L_Recog *rec = recogCreate(0, h, linew, 128, 1);
115 * for (i = 0; i < n; i++) { // read in n training digits
116 * Pix *pix = ...
117 * recogTrainLabeled(rec, pix, NULL, text[i], 0);
118 * }
119 * recogTrainingFinished(&rec, 1, -1, -1.0); // required
120 *
121 * It is an error if any function that computes averages, removes
122 * outliers or requests identification of an unlabeled character,
123 * such as:
124 * (1) computing the sample averages: recogAverageSamples()
125 * (2) removing outliers: recogRemoveOutliers1() or recogRemoveOutliers2()
126 * (3) requesting identification of an unlabeled character:
127 * recogIdentifyPix()
128 * is called before an explicit call to finish training. Note that
129 * to do further training on a "finished" recognizer, you can set
130 * recog->train_done = FALSE;
131 * add the new training samples, and again call
132 * recogTrainingFinished(&rec, 1, -1, -1.0); // required
133 *
134 * If not scaling, using the images directly for identification, and
135 * removing outliers, do something like this:
136 * L_Recog *rec = recogCreate(0, 0, 0, 128, 1);
137 * for (i = 0; i < n; i++) { // read in n training characters
138 * Pix *pix = ...
139 * recogTrainLabeled(rec, pix, NULL, text[i], 0);
140 * }
141 * recogTrainingFinished(&rec, 1, -1, -1.0);
142 * if (!rec) ... [return]
143 * // remove outliers
144 * recogRemoveOutliers1(&rec, 0.7, 2, NULL, NULL);
145 *
146 * You can generate a recognizer from a pixa where the text field in
147 * each pix is the character string label for the pix. For example,
148 * the following recognizer will store unscaled line images:
149 * L_Recog *rec = recogCreateFromPixa(pixa, 0, 0, linew, 128, 1);
150 * and in use, it is fed unscaled line images to identify.
151 *
152 * For the following, assume that you have a pixa of labeled templates.
153 * If it is likely that some of the input templates are mislabeled,
154 * there are several things that can be done to remove them.
155 * The first is to put a size and quantity filter on them; e.g.
156 * Pixa *pixa2 = recogFilterPixaBySize(pixa1, 10, 15, 2.6);
157 * Then you can remove outliers; e.g.,
158 * Pixa *pixa3 = pixaRemoveOutliers2(pixa2, -1.0, -1, NULL, NULL);
159 *
160 * To this point, all templates are from a single source, so you
161 * can make a recognizer that uses the unscaled templates and optionally
162 * attempts to split touching characters:
163 * L_Recog *recog1 = recogCreateFromPixa(pixa3, ...);
164 * Alternatively, if you need more templates for some of the classes,
165 * you can pad with templates from a "bootstrap" recognizer (BSR).
166 * If you pad, it is necessary to scale the templates and input
167 * samples to a fixed height, and no attempt will be made to split
168 * the input sample connected components:
169 * L_Recog *recog1 = recogCreateFromPixa(pixa3, 0, 40, 0, 128, 0);
170 * recogPadDigitTrainingSet(&recog1, 40, 0);
171 *
172 * A special case is a pure BSR, that contains images scaled to a fixed
173 * height (we use 40 in these examples).
174 * For this, use either the scanned bitmap:
175 * L_Recog *recboot = recogCreateFromPixa(pixa, 0, 40, 0, 128, 1);
176 * or width-normalized lines (use width of 5 here):
177 * L_Recog *recboot = recogCreateFromPixa(pixa, 0, 40, 5, 128, 1);
178 *
179 * This can be used to train a new book-adapted recognizer (BAR) on
180 * unlabeled data from, e.g., a book. To do this, the following is required:
181 * (1) the input images from the book must be scaled in the same
182 * way as those in the BSR, and
183 * (2) both the BSR and the input images must be set up to be either
184 * input scanned images or width-normalized lines.
185 *
186 * </pre>
187 */
188
189 #ifdef HAVE_CONFIG_H
190 #include <config_auto.h>
191 #endif /* HAVE_CONFIG_H */
192
193 #include <string.h>
194 #include "allheaders.h"
195
196 static const l_int32 MaxExamplesInClass = 256;
197
198 /* Default recog parameters that can be changed */
199 static const l_int32 DefaultCharsetType = L_ARABIC_NUMERALS;
200 static const l_int32 DefaultMinNopad = 1;
201 static const l_float32 DefaultMaxWHRatio = 3.0f; /* max allowed w/h
202 ratio for a component to be split */
203 static const l_float32 DefaultMaxHTRatio = 2.6f; /* max allowed ratio of
204 max/min unscaled averaged template heights */
205 static const l_int32 DefaultThreshold = 150; /* for binarization */
206 static const l_int32 DefaultMaxYShift = 1; /* for identification */
207
208 /* Static functions */
209 static l_int32 recogGetCharsetSize(l_int32 type);
210 static l_int32 recogAddCharstrLabels(L_RECOG *recog);
211 static l_int32 recogAddAllSamples(L_RECOG **precog, PIXAA *paa, l_int32 debug);
212
213
214 /*------------------------------------------------------------------------*
215 * Recog: initialization and destruction *
216 *------------------------------------------------------------------------*/
217 /*!
218 * \brief recogCreateFromRecog()
219 *
220 * \param[in] recs source recog with arbitrary input parameters
221 * \param[in] scalew scale all widths to this; use 0 otherwise
222 * \param[in] scaleh scale all heights to this; use 0 otherwise
223 * \param[in] linew width of normalized strokes; use 0 to skip
224 * \param[in] threshold for binarization; typically ~128
225 * \param[in] maxyshift from nominal centroid alignment; default is 1
226 * \return recd, or NULL on error
227 *
228 * <pre>
229 * Notes:
230 * (1) This is a convenience function that generates a recog using
231 * the unscaled training data in an existing recog.
232 * (2) It is recommended to use %maxyshift = 1 (the default value)
233 * (3) See recogCreate() for use of %scalew, %scaleh and %linew.
234 * </pre>
235 */
236 L_RECOG *
237 recogCreateFromRecog(L_RECOG *recs,
238 l_int32 scalew,
239 l_int32 scaleh,
240 l_int32 linew,
241 l_int32 threshold,
242 l_int32 maxyshift)
243 {
244 L_RECOG *recd;
245 PIXA *pixa;
246
247 if (!recs)
248 return (L_RECOG *)ERROR_PTR("recs not defined", __func__, NULL);
249
250 pixa = recogExtractPixa(recs);
251 recd = recogCreateFromPixa(pixa, scalew, scaleh, linew, threshold,
252 maxyshift);
253 pixaDestroy(&pixa);
254 return recd;
255 }
256
257
258 /*!
259 * \brief recogCreateFromPixa()
260 *
261 * \param[in] pixa of labeled, 1 bpp images
262 * \param[in] scalew scale all widths to this; use 0 otherwise
263 * \param[in] scaleh scale all heights to this; use 0 otherwise
264 * \param[in] linew width of normalized strokes; use 0 to skip
265 * \param[in] threshold for binarization; typically ~150
266 * \param[in] maxyshift from nominal centroid alignment; default is 1
267 * \return recog, or NULL on error
268 *
269 * <pre>
270 * Notes:
271 * (1) This is a convenience function for training from labeled data.
272 * The pixa can be read from file.
273 * (2) The pixa should contain the unscaled bitmaps used for training.
274 * (3) See recogCreate() for use of %scalew, %scaleh and %linew.
275 * (4) It is recommended to use %maxyshift = 1 (the default value)
276 * (5) All examples in the same class (i.e., with the same character
277 * label) should be similar. They can be made similar by invoking
278 * recogRemoveOutliers[1,2]() on %pixa before calling this function.
279 * </pre>
280 */
281 L_RECOG *
282 recogCreateFromPixa(PIXA *pixa,
283 l_int32 scalew,
284 l_int32 scaleh,
285 l_int32 linew,
286 l_int32 threshold,
287 l_int32 maxyshift)
288 {
289 L_RECOG *recog;
290
291 if (!pixa)
292 return (L_RECOG *)ERROR_PTR("pixa not defined", __func__, NULL);
293
294 recog = recogCreateFromPixaNoFinish(pixa, scalew, scaleh, linew,
295 threshold, maxyshift);
296 if (!recog)
297 return (L_RECOG *)ERROR_PTR("recog not made", __func__, NULL);
298
299 recogTrainingFinished(&recog, 1, -1, -1.0);
300 if (!recog)
301 return (L_RECOG *)ERROR_PTR("bad templates", __func__, NULL);
302 return recog;
303 }
304
305
306 /*!
307 * \brief recogCreateFromPixaNoFinish()
308 *
309 * \param[in] pixa of labeled, 1 bpp images
310 * \param[in] scalew scale all widths to this; use 0 otherwise
311 * \param[in] scaleh scale all heights to this; use 0 otherwise
312 * \param[in] linew width of normalized strokes; use 0 to skip
313 * \param[in] threshold for binarization; typically ~150
314 * \param[in] maxyshift from nominal centroid alignment; default is 1
315 * \return recog, or NULL on error
316 *
317 * <pre>
318 * Notes:
319 * (1) See recogCreateFromPixa() for details.
320 * (2) This is also used to generate a pixaa with templates
321 * in each class within a pixa. For that, all args except for
322 * %pixa are ignored.
323 * </pre>
324 */
325 L_RECOG *
326 recogCreateFromPixaNoFinish(PIXA *pixa,
327 l_int32 scalew,
328 l_int32 scaleh,
329 l_int32 linew,
330 l_int32 threshold,
331 l_int32 maxyshift)
332 {
333 char *text;
334 l_int32 full, n, i, ntext, same, maxd;
335 PIX *pix;
336 L_RECOG *recog;
337
338 if (!pixa)
339 return (L_RECOG *)ERROR_PTR("pixa not defined", __func__, NULL);
340 pixaVerifyDepth(pixa, &same, &maxd);
341 if (maxd > 1)
342 return (L_RECOG *)ERROR_PTR("not all pix are 1 bpp", __func__, NULL);
343
344 pixaIsFull(pixa, &full, NULL);
345 if (!full)
346 return (L_RECOG *)ERROR_PTR("not all pix are present", __func__, NULL);
347
348 n = pixaGetCount(pixa);
349 pixaCountText(pixa, &ntext);
350 if (ntext == 0)
351 return (L_RECOG *)ERROR_PTR("no pix have text strings", __func__, NULL);
352 if (ntext < n)
353 L_ERROR("%d text strings < %d pix\n", __func__, ntext, n);
354
355 recog = recogCreate(scalew, scaleh, linew, threshold, maxyshift);
356 if (!recog)
357 return (L_RECOG *)ERROR_PTR("recog not made", __func__, NULL);
358 for (i = 0; i < n; i++) {
359 pix = pixaGetPix(pixa, i, L_CLONE);
360 text = pixGetText(pix);
361 if (!text || strlen(text) == 0) {
362 L_ERROR("pix[%d] has no text\n", __func__, i);
363 pixDestroy(&pix);
364 continue;
365 }
366 recogTrainLabeled(recog, pix, NULL, text, 0);
367 pixDestroy(&pix);
368 }
369
370 return recog;
371 }
372
373
374 /*!
375 * \brief recogCreate()
376 *
377 * \param[in] scalew scale all widths to this; use 0 otherwise
378 * \param[in] scaleh scale all heights to this; use 0 otherwise
379 * \param[in] linew width of normalized strokes; use 0 to skip
380 * \param[in] threshold for binarization; typically ~128; 0 for default
381 * \param[in] maxyshift from nominal centroid alignment; default is 1
382 * \return recog, or NULL on error
383 *
384 * <pre>
385 * Notes:
386 * (1) If %scalew == 0 and %scaleh == 0, no scaling is done.
387 * If one of these is 0 and the other is > 0, scaling is isotropic
388 * to the requested size. We typically do not set both > 0.
389 * (2) Use linew > 0 to convert the templates to images with fixed
390 * width strokes. linew == 0 skips the conversion.
391 * (3) The only valid values for %maxyshift are 0, 1 and 2.
392 * It is recommended to use %maxyshift == 1 (default value).
393 * Using %maxyshift == 0 is much faster than %maxyshift == 1, but
394 * it is much less likely to find the template with the best
395 * correlation. Use of anything but 1 results in a warning.
396 * (4) Scaling is used for finding outliers and for training a
397 * book-adapted recognizer (BAR) from a bootstrap recognizer (BSR).
398 * Scaling the height to a fixed value and scaling the width
399 * accordingly (e.g., %scaleh = 40, %scalew = 0) is recommended.
400 * (5) The storage for most of the arrays is allocated when training
401 * is finished.
402 * </pre>
403 */
404 L_RECOG *
405 recogCreate(l_int32 scalew,
406 l_int32 scaleh,
407 l_int32 linew,
408 l_int32 threshold,
409 l_int32 maxyshift)
410 {
411 L_RECOG *recog;
412
413 if (scalew < 0 || scaleh < 0)
414 return (L_RECOG *)ERROR_PTR("invalid scalew or scaleh", __func__, NULL);
415 if (linew > 10)
416 return (L_RECOG *)ERROR_PTR("invalid linew > 10", __func__, NULL);
417 if (threshold == 0) threshold = DefaultThreshold;
418 if (threshold < 0 || threshold > 255) {
419 L_WARNING("invalid threshold; using default\n", __func__);
420 threshold = DefaultThreshold;
421 }
422 if (maxyshift < 0 || maxyshift > 2) {
423 L_WARNING("invalid maxyshift; using default value\n", __func__);
424 maxyshift = DefaultMaxYShift;
425 } else if (maxyshift == 0) {
426 L_WARNING("Using maxyshift = 0; faster, worse correlation results\n",
427 __func__);
428 } else if (maxyshift == 2) {
429 L_WARNING("Using maxyshift = 2; slower\n", __func__);
430 }
431
432 recog = (L_RECOG *)LEPT_CALLOC(1, sizeof(L_RECOG));
433 recog->templ_use = L_USE_ALL_TEMPLATES; /* default */
434 recog->threshold = threshold;
435 recog->scalew = scalew;
436 recog->scaleh = scaleh;
437 recog->linew = linew;
438 recog->maxyshift = maxyshift;
439 recogSetParams(recog, 1, -1, -1.0, -1.0);
440 recog->bmf = bmfCreate(NULL, 6);
441 recog->bmf_size = 6;
442 recog->maxarraysize = MaxExamplesInClass;
443
444 /* Generate the LUTs */
445 recog->centtab = makePixelCentroidTab8();
446 recog->sumtab = makePixelSumTab8();
447 recog->sa_text = sarrayCreate(0);
448 recog->dna_tochar = l_dnaCreate(0);
449
450 /* Input default values for min component size for splitting.
451 * These are overwritten when pixTrainingFinished() is called. */
452 recog->min_splitw = 6;
453 recog->max_splith = 60;
454
455 /* Allocate the paa for the unscaled training bitmaps */
456 recog->pixaa_u = pixaaCreate(recog->maxarraysize);
457
458 /* Generate the storage for debugging */
459 recog->pixadb_boot = pixaCreate(2);
460 recog->pixadb_split = pixaCreate(2);
461 return recog;
462 }
463
464
465 /*!
466 * \brief recogDestroy()
467 *
468 * \param[in,out] precog will be set to null before returning
469 * \return void
470 */
471 void
472 recogDestroy(L_RECOG **precog)
473 {
474 L_RECOG *recog;
475
476 if (!precog) {
477 L_WARNING("ptr address is null\n", __func__);
478 return;
479 }
480
481 if ((recog = *precog) == NULL) return;
482
483 LEPT_FREE(recog->centtab);
484 LEPT_FREE(recog->sumtab);
485 sarrayDestroy(&recog->sa_text);
486 l_dnaDestroy(&recog->dna_tochar);
487 pixaaDestroy(&recog->pixaa_u);
488 pixaDestroy(&recog->pixa_u);
489 ptaaDestroy(&recog->ptaa_u);
490 ptaDestroy(&recog->pta_u);
491 numaDestroy(&recog->nasum_u);
492 numaaDestroy(&recog->naasum_u);
493 pixaaDestroy(&recog->pixaa);
494 pixaDestroy(&recog->pixa);
495 ptaaDestroy(&recog->ptaa);
496 ptaDestroy(&recog->pta);
497 numaDestroy(&recog->nasum);
498 numaaDestroy(&recog->naasum);
499 pixaDestroy(&recog->pixa_tr);
500 pixaDestroy(&recog->pixadb_ave);
501 pixaDestroy(&recog->pixa_id);
502 pixDestroy(&recog->pixdb_ave);
503 pixDestroy(&recog->pixdb_range);
504 pixaDestroy(&recog->pixadb_boot);
505 pixaDestroy(&recog->pixadb_split);
506 bmfDestroy(&recog->bmf);
507 rchDestroy(&recog->rch);
508 rchaDestroy(&recog->rcha);
509 recogDestroyDid(recog);
510 LEPT_FREE(recog);
511 *precog = NULL;
512 }
513
514
515 /*------------------------------------------------------------------------*
516 * Recog accessors *
517 *------------------------------------------------------------------------*/
518 /*!
519 * \brief recogGetCount()
520 *
521 * \param[in] recog
522 * \return count of classes in recog; 0 if no recog or on error
523 */
524 l_int32
525 recogGetCount(L_RECOG *recog)
526 {
527 if (!recog)
528 return ERROR_INT("recog not defined", __func__, 0);
529 return recog->setsize;
530 }
531
532
533 /*!
534 * \brief recogSetParams()
535 *
536 * \param[in] recog to be padded, if necessary
537 * \param[in] type type of char set; -1 for default;
538 * see enum in recog.h
539 * \param[in] min_nopad min number in a class without padding;
540 * use -1 for default
541 * \param[in] max_wh_ratio max width/height ratio allowed for splitting;
542 * use -1.0 for default
543 * \param[in] max_ht_ratio max of max/min averaged template height ratio;
544 * use -1.0 for default
545 * \return 0 if OK, 1 on error
546 *
547 * <pre>
548 * Notes:
549 * (1) This is called when a recog is created.
550 * (2) Default %min_nopad value allows for some padding.
551 * To disable padding, set %min_nopad = 0. To pad only when
552 * no samples are available for the class, set %min_nopad = 1.
553 * (3) The %max_wh_ratio limits the width/height ratio for components
554 * that we attempt to split. Splitting long components is expensive.
555 * (4) The %max_ht_ratio is a quality requirement on the training data.
556 * The recognizer will not run if the averages are computed and
557 * the templates do not satisfy it.
558 * </pre>
559 */
560 l_ok
561 recogSetParams(L_RECOG *recog,
562 l_int32 type,
563 l_int32 min_nopad,
564 l_float32 max_wh_ratio,
565 l_float32 max_ht_ratio)
566 {
567 if (!recog)
568 return ERROR_INT("recog not defined", __func__, 1);
569
570 recog->charset_type = (type >= 0) ? type : DefaultCharsetType;
571 recog->charset_size = recogGetCharsetSize(recog->charset_type);
572 recog->min_nopad = (min_nopad >= 0) ? min_nopad : DefaultMinNopad;
573 recog->max_wh_ratio = (max_wh_ratio > 0.0) ? max_wh_ratio :
574 DefaultMaxWHRatio;
575 recog->max_ht_ratio = (max_ht_ratio > 1.0) ? max_ht_ratio :
576 DefaultMaxHTRatio;
577 return 0;
578 }
579
580
581 /*!
582 * \brief recogGetCharsetSize()
583 *
584 * \param[in] type of charset
585 * \return size of charset, or 0 if unknown or on error
586 */
587 static l_int32
588 recogGetCharsetSize(l_int32 type)
589 {
590 switch (type) {
591 case L_UNKNOWN:
592 return 0;
593 case L_ARABIC_NUMERALS:
594 return 10;
595 case L_LC_ROMAN_NUMERALS:
596 return 7;
597 case L_UC_ROMAN_NUMERALS:
598 return 7;
599 case L_LC_ALPHA:
600 return 26;
601 case L_UC_ALPHA:
602 return 26;
603 default:
604 L_ERROR("invalid charset_type %d\n", __func__, type);
605 return 0;
606 }
607 return 0; /* shouldn't happen */
608 }
609
610
611 /*------------------------------------------------------------------------*
612 * Character/index lookup *
613 *------------------------------------------------------------------------*/
614 /*!
615 * \brief recogGetClassIndex()
616 *
617 * \param[in] recog with LUT's pre-computed
618 * \param[in] val integer value; can be up to 4 bytes for UTF-8
619 * \param[in] text text from which %val was derived; used if not found
620 * \param[out] pindex index into dna_tochar
621 * \return 0 if found; 1 if not found and added; 2 on error.
622 *
623 * <pre>
624 * Notes:
625 * (1) This is used during training. There is one entry in
626 * recog->dna_tochar (integer value, e.g., ascii) and
627 * one in recog->sa_text (e.g, ascii letter in a string)
628 * for each character class.
629 * (2) This searches the dna character array for %val. If it is
630 * not found, the template represents a character class not
631 * already seen: it increments setsize (the number of character
632 * classes) by 1, and augments both the index (dna_tochar)
633 * and text (sa_text) arrays.
634 * (3) Returns the index in &index, except on error.
635 * (4) Caller must check the function return value.
636 * </pre>
637 */
638 l_int32
639 recogGetClassIndex(L_RECOG *recog,
640 l_int32 val,
641 char *text,
642 l_int32 *pindex)
643 {
644 l_int32 i, n, ival;
645
646 if (!pindex)
647 return ERROR_INT("&index not defined", __func__, 2);
648 *pindex = -1;
649 if (!recog)
650 return ERROR_INT("recog not defined", __func__, 2);
651 if (!text)
652 return ERROR_INT("text not defined", __func__, 2);
653
654 /* Search existing characters */
655 n = l_dnaGetCount(recog->dna_tochar);
656 for (i = 0; i < n; i++) {
657 l_dnaGetIValue(recog->dna_tochar, i, &ival);
658 if (val == ival) { /* found */
659 *pindex = i;
660 return 0;
661 }
662 }
663
664 /* If not found... */
665 l_dnaAddNumber(recog->dna_tochar, val);
666 sarrayAddString(recog->sa_text, text, L_COPY);
667 recog->setsize++;
668 *pindex = n;
669 return 1;
670 }
671
672
673 /*!
674 * \brief recogStringToIndex()
675 *
676 * \param[in] recog
677 * \param[in] text text string for some class
678 * \param[out] pindex index for that class; -1 if not found
679 * \return 0 if OK, 1 on error not finding the string is an error
680 */
681 l_ok
682 recogStringToIndex(L_RECOG *recog,
683 char *text,
684 l_int32 *pindex)
685 {
686 char *charstr;
687 l_int32 i, n, diff;
688
689 if (!pindex)
690 return ERROR_INT("&index not defined", __func__, 1);
691 *pindex = -1;
692 if (!recog)
693 return ERROR_INT("recog not defined", __func__, 1);
694 if (!text)
695 return ERROR_INT("text not defined", __func__, 1);
696
697 /* Search existing characters */
698 n = recog->setsize;
699 for (i = 0; i < n; i++) {
700 recogGetClassString(recog, i, &charstr);
701 if (!charstr) {
702 L_ERROR("string not found for index %d\n", __func__, i);
703 continue;
704 }
705 diff = strcmp(text, charstr);
706 LEPT_FREE(charstr);
707 if (diff) continue;
708 *pindex = i;
709 return 0;
710 }
711
712 return 1; /* not found */
713 }
714
715
716 /*!
717 * \brief recogGetClassString()
718 *
719 * \param[in] recog
720 * \param[in] index into array of char types
721 * \param[out] pcharstr string representation;
722 * returns an empty string on error
723 * \return 0 if found, 1 on error
724 *
725 * <pre>
726 * Notes:
727 * (1) Extracts a copy of the string from sa_text, which
728 * the caller must free.
729 * (2) Caller must check the function return value.
730 * </pre>
731 */
732 l_int32
733 recogGetClassString(L_RECOG *recog,
734 l_int32 index,
735 char **pcharstr)
736 {
737 if (!pcharstr)
738 return ERROR_INT("&charstr not defined", __func__, 1);
739 *pcharstr = stringNew("");
740 if (!recog)
741 return ERROR_INT("recog not defined", __func__, 2);
742
743 if (index < 0 || index >= recog->setsize)
744 return ERROR_INT("invalid index", __func__, 1);
745 LEPT_FREE(*pcharstr);
746 *pcharstr = sarrayGetString(recog->sa_text, index, L_COPY);
747 return 0;
748 }
749
750
751 /*!
752 * \brief l_convertCharstrToInt()
753 *
754 * \param[in] str input string representing one UTF-8 character;
755 * not more than 4 bytes
756 * \param[out] pval integer value for the input. Think of it
757 * as a 1-to-1 hash code.
758 * \return 0 if OK, 1 on error
759 */
760 l_ok
761 l_convertCharstrToInt(const char *str,
762 l_int32 *pval)
763 {
764 l_int32 size;
765 l_uint32 val;
766
767 if (!pval)
768 return ERROR_INT("&val not defined", __func__, 1);
769 *pval = 0;
770 if (!str)
771 return ERROR_INT("str not defined", __func__, 1);
772 size = strlen(str);
773 if (size == 0)
774 return ERROR_INT("empty string", __func__, 1);
775 if (size > 4)
776 return ERROR_INT("invalid string: > 4 bytes", __func__, 1);
777
778 val = (l_uint8)str[0];
779 if (size > 1)
780 val = (val << 8) + (l_uint8)str[1];
781 if (size > 2)
782 val = (val << 8) + (l_uint8)str[2];
783 if (size > 3)
784 val = (val << 8) + (l_uint8)str[3];
785 *pval = (l_int32)(val & 0x7fffffff);
786 return 0;
787 }
788
789
790 /*------------------------------------------------------------------------*
791 * Serialization *
792 *------------------------------------------------------------------------*/
793 /*!
794 * \brief recogRead()
795 *
796 * \param[in] filename
797 * \return recog, or NULL on error
798 *
799 * <pre>
800 * Notes:
801 * (1) When a recog is serialized, a pixaa of the templates that are
802 * actually used for correlation is saved in the pixaa_u array
803 * of the recog. These can be different from the templates that
804 * were used to generate the recog, because those original templates
805 * can be scaled and turned into normalized lines. When recog1
806 * is deserialized to recog2, these templates are put in both the
807 * unscaled array (pixaa_u) and the modified array (pixaa) in recog2.
808 * Why not put it in only the unscaled array and let
809 * recogTrainingFinalized() regenerate the modified templates?
810 * The reason is that with normalized lines, the operation of
811 * thinning to a skeleton and dilating back to a fixed width
812 * is not idempotent. Thinning to a skeleton saves pixels at
813 * the end of a line segment, and thickening the skeleton puts
814 * additional pixels at the end of the lines. This tends to
815 * close gaps.
816 * </pre>
817 */
818 L_RECOG *
819 recogRead(const char *filename)
820 {
821 FILE *fp;
822 L_RECOG *recog;
823
824 if (!filename)
825 return (L_RECOG *)ERROR_PTR("filename not defined", __func__, NULL);
826 if ((fp = fopenReadStream(filename)) == NULL)
827 return (L_RECOG *)ERROR_PTR_1("stream not opened",
828 filename, __func__, NULL);
829
830 if ((recog = recogReadStream(fp)) == NULL) {
831 fclose(fp);
832 return (L_RECOG *)ERROR_PTR_1("recog not read",
833 filename, __func__, NULL);
834 }
835
836 fclose(fp);
837 return recog;
838 }
839
840
841 /*!
842 * \brief recogReadStream()
843 *
844 * \param[in] fp file stream
845 * \return recog, or NULL on error
846 */
847 L_RECOG *
848 recogReadStream(FILE *fp)
849 {
850 l_int32 version, setsize, threshold, scalew, scaleh, linew;
851 l_int32 maxyshift, nc;
852 L_DNA *dna_tochar;
853 PIXAA *paa;
854 L_RECOG *recog;
855 SARRAY *sa_text;
856
857 if (!fp)
858 return (L_RECOG *)ERROR_PTR("stream not defined", __func__, NULL);
859
860 if (fscanf(fp, "\nRecog Version %d\n", &version) != 1)
861 return (L_RECOG *)ERROR_PTR("not a recog file", __func__, NULL);
862 if (version != RECOG_VERSION_NUMBER)
863 return (L_RECOG *)ERROR_PTR("invalid recog version", __func__, NULL);
864 if (fscanf(fp, "Size of character set = %d\n", &setsize) != 1)
865 return (L_RECOG *)ERROR_PTR("setsize not read", __func__, NULL);
866 if (fscanf(fp, "Binarization threshold = %d\n", &threshold) != 1)
867 return (L_RECOG *)ERROR_PTR("binary thresh not read", __func__, NULL);
868 if (fscanf(fp, "Maxyshift = %d\n", &maxyshift) != 1)
869 return (L_RECOG *)ERROR_PTR("maxyshift not read", __func__, NULL);
870 if (fscanf(fp, "Scale to width = %d\n", &scalew) != 1)
871 return (L_RECOG *)ERROR_PTR("width not read", __func__, NULL);
872 if (fscanf(fp, "Scale to height = %d\n", &scaleh) != 1)
873 return (L_RECOG *)ERROR_PTR("height not read", __func__, NULL);
874 if (fscanf(fp, "Normalized line width = %d\n", &linew) != 1)
875 return (L_RECOG *)ERROR_PTR("line width not read", __func__, NULL);
876 if ((recog = recogCreate(scalew, scaleh, linew, threshold,
877 maxyshift)) == NULL)
878 return (L_RECOG *)ERROR_PTR("recog not made", __func__, NULL);
879
880 if (fscanf(fp, "\nLabels for character set:\n") == -1) {
881 recogDestroy(&recog);
882 return (L_RECOG *)ERROR_PTR("label intro not read", __func__, NULL);
883 }
884 l_dnaDestroy(&recog->dna_tochar);
885 if ((dna_tochar = l_dnaReadStream(fp)) == NULL) {
886 recogDestroy(&recog);
887 return (L_RECOG *)ERROR_PTR("dna_tochar not read", __func__, NULL);
888 }
889 recog->dna_tochar = dna_tochar;
890 sarrayDestroy(&recog->sa_text);
891 if ((sa_text = sarrayReadStream(fp)) == NULL) {
892 recogDestroy(&recog);
893 return (L_RECOG *)ERROR_PTR("sa_text not read", __func__, NULL);
894 }
895 recog->sa_text = sa_text;
896
897 if (fscanf(fp, "\nPixaa of all samples in the training set:\n") == -1) {
898 recogDestroy(&recog);
899 return (L_RECOG *)ERROR_PTR("pixaa intro not read", __func__, NULL);
900 }
901 if ((paa = pixaaReadStream(fp)) == NULL) {
902 recogDestroy(&recog);
903 return (L_RECOG *)ERROR_PTR("pixaa not read", __func__, NULL);
904 }
905 recog->setsize = setsize;
906 nc = pixaaGetCount(paa, NULL);
907 if (nc != setsize) {
908 recogDestroy(&recog);
909 pixaaDestroy(&paa);
910 L_ERROR("(setsize = %d) != (paa count = %d)\n", __func__,
911 setsize, nc);
912 return NULL;
913 }
914
915 recogAddAllSamples(&recog, paa, 0); /* this finishes */
916 pixaaDestroy(&paa);
917 if (!recog)
918 return (L_RECOG *)ERROR_PTR("bad templates", __func__, NULL);
919 return recog;
920 }
921
922
923 /*!
924 * \brief recogReadMem()
925 *
926 * \param[in] data serialization of recog (not ascii)
927 * \param[in] size of data in bytes
928 * \return recog, or NULL on error
929 */
930 L_RECOG *
931 recogReadMem(const l_uint8 *data,
932 size_t size)
933 {
934 FILE *fp;
935 L_RECOG *recog;
936
937 if (!data)
938 return (L_RECOG *)ERROR_PTR("data not defined", __func__, NULL);
939 if ((fp = fopenReadFromMemory(data, size)) == NULL)
940 return (L_RECOG *)ERROR_PTR("stream not opened", __func__, NULL);
941
942 recog = recogReadStream(fp);
943 fclose(fp);
944 if (!recog) L_ERROR("recog not read\n", __func__);
945 return recog;
946 }
947
948
949 /*!
950 * \brief recogWrite()
951 *
952 * \param[in] filename
953 * \param[in] recog
954 * \return 0 if OK, 1 on error
955 *
956 * <pre>
957 * Notes:
958 * (1) The pixaa of templates that is written is the modified one
959 * in the pixaa field. It is the pixaa that is actually used
960 * for correlation. This is not the unscaled array of labeled
961 * bitmaps, in pixaa_u, that was used to generate the recog in the
962 * first place. See the notes in recogRead() for the rationale.
963 * </pre>
964 */
965 l_ok
966 recogWrite(const char *filename,
967 L_RECOG *recog)
968 {
969 l_int32 ret;
970 FILE *fp;
971
972 if (!filename)
973 return ERROR_INT("filename not defined", __func__, 1);
974 if (!recog)
975 return ERROR_INT("recog not defined", __func__, 1);
976
977 if ((fp = fopenWriteStream(filename, "wb")) == NULL)
978 return ERROR_INT_1("stream not opened", filename, __func__, 1);
979 ret = recogWriteStream(fp, recog);
980 fclose(fp);
981 if (ret)
982 return ERROR_INT_1("recog not written to stream",
983 filename, __func__, 1);
984 return 0;
985 }
986
987
988 /*!
989 * \brief recogWriteStream()
990 *
991 * \param[in] fp file stream opened for "wb"
992 * \param[in] recog
993 * \return 0 if OK, 1 on error
994 */
995 l_ok
996 recogWriteStream(FILE *fp,
997 L_RECOG *recog)
998 {
999 if (!fp)
1000 return ERROR_INT("stream not defined", __func__, 1);
1001 if (!recog)
1002 return ERROR_INT("recog not defined", __func__, 1);
1003
1004 fprintf(fp, "\nRecog Version %d\n", RECOG_VERSION_NUMBER);
1005 fprintf(fp, "Size of character set = %d\n", recog->setsize);
1006 fprintf(fp, "Binarization threshold = %d\n", recog->threshold);
1007 fprintf(fp, "Maxyshift = %d\n", recog->maxyshift);
1008 fprintf(fp, "Scale to width = %d\n", recog->scalew);
1009 fprintf(fp, "Scale to height = %d\n", recog->scaleh);
1010 fprintf(fp, "Normalized line width = %d\n", recog->linew);
1011 fprintf(fp, "\nLabels for character set:\n");
1012 l_dnaWriteStream(fp, recog->dna_tochar);
1013 sarrayWriteStream(fp, recog->sa_text);
1014 fprintf(fp, "\nPixaa of all samples in the training set:\n");
1015 pixaaWriteStream(fp, recog->pixaa);
1016
1017 return 0;
1018 }
1019
1020
1021 /*!
1022 * \brief recogWriteMem()
1023 *
1024 * \param[out] pdata data of serialized recog (not ascii)
1025 * \param[out] psize size of returned data
1026 * \param[in] recog
1027 * \return 0 if OK, 1 on error
1028 *
1029 * <pre>
1030 * Notes:
1031 * (1) Serializes a recog in memory and puts the result in a buffer.
1032 * </pre>
1033 */
1034 l_ok
1035 recogWriteMem(l_uint8 **pdata,
1036 size_t *psize,
1037 L_RECOG *recog)
1038 {
1039 l_int32 ret;
1040 FILE *fp;
1041
1042 if (pdata) *pdata = NULL;
1043 if (psize) *psize = 0;
1044 if (!pdata)
1045 return ERROR_INT("&data not defined", __func__, 1);
1046 if (!psize)
1047 return ERROR_INT("&size not defined", __func__, 1);
1048 if (!recog)
1049 return ERROR_INT("recog not defined", __func__, 1);
1050
1051 #if HAVE_FMEMOPEN
1052 if ((fp = open_memstream((char **)pdata, psize)) == NULL)
1053 return ERROR_INT("stream not opened", __func__, 1);
1054 ret = recogWriteStream(fp, recog);
1055 fputc('\0', fp);
1056 fclose(fp);
1057 if (*psize > 0) *psize = *psize - 1;
1058 #else
1059 L_INFO("no fmemopen API --> work-around: write to temp file\n", __func__);
1060 #ifdef _WIN32
1061 if ((fp = fopenWriteWinTempfile()) == NULL)
1062 return ERROR_INT("tmpfile stream not opened", __func__, 1);
1063 #else
1064 if ((fp = tmpfile()) == NULL)
1065 return ERROR_INT("tmpfile stream not opened", __func__, 1);
1066 #endif /* _WIN32 */
1067 ret = recogWriteStream(fp, recog);
1068 rewind(fp);
1069 *pdata = l_binaryReadStream(fp, psize);
1070 fclose(fp);
1071 #endif /* HAVE_FMEMOPEN */
1072 return ret;
1073 }
1074
1075
1076 /*!
1077 * \brief recogExtractPixa()
1078 *
1079 * \param[in] recog
1080 * \return pixa if OK, NULL on error
1081 *
1082 * <pre>
1083 * Notes:
1084 * (1) This generates a pixa of all the unscaled images in the
1085 * recognizer, where each one has its character class label in
1086 * the pix text field, by flattening pixaa_u to a pixa.
1087 * </pre>
1088 */
1089 PIXA *
1090 recogExtractPixa(L_RECOG *recog)
1091 {
1092 if (!recog)
1093 return (PIXA *)ERROR_PTR("recog not defined", __func__, NULL);
1094
1095 recogAddCharstrLabels(recog);
1096 return pixaaFlattenToPixa(recog->pixaa_u, NULL, L_CLONE);
1097 }
1098
1099
1100 /*!
1101 * \brief recogAddCharstrLabels()
1102 *
1103 * \param[in] recog
1104 * \return 0 if OK, 1 on error
1105 */
1106 static l_int32
1107 recogAddCharstrLabels(L_RECOG *recog)
1108 {
1109 char *text;
1110 l_int32 i, j, n1, n2;
1111 PIX *pix;
1112 PIXA *pixa;
1113 PIXAA *paa;
1114
1115 if (!recog)
1116 return ERROR_INT("recog not defined", __func__, 1);
1117
1118 /* Add the labels to each unscaled pix */
1119 paa = recog->pixaa_u;
1120 n1 = pixaaGetCount(paa, NULL);
1121 for (i = 0; i < n1; i++) {
1122 pixa = pixaaGetPixa(paa, i, L_CLONE);
1123 text = sarrayGetString(recog->sa_text, i, L_NOCOPY);
1124 n2 = pixaGetCount(pixa);
1125 for (j = 0; j < n2; j++) {
1126 pix = pixaGetPix(pixa, j, L_CLONE);
1127 pixSetText(pix, text);
1128 pixDestroy(&pix);
1129 }
1130 pixaDestroy(&pixa);
1131 }
1132
1133 return 0;
1134 }
1135
1136
1137 /*!
1138 * \brief recogAddAllSamples()
1139 *
1140 * \param[in] precog addr of recog
1141 * \param[in] paa pixaa from previously trained recog
1142 * \param[in] debug
1143 * \return 0 if OK, 1 on error
1144 *
1145 * <pre>
1146 * Notes:
1147 * (1) On error, the input recog is destroyed.
1148 * (2) This is used with the serialization routine recogRead(),
1149 * where each pixa in the pixaa represents a set of characters
1150 * in a different class. Before calling this function, we have
1151 * verified that the number of character classes, given by the
1152 * setsize field in %recog, equals the number of pixa in the paa.
1153 * The character labels for each set are in the sa_text field.
1154 * </pre>
1155 */
1156 static l_int32
1157 recogAddAllSamples(L_RECOG **precog,
1158 PIXAA *paa,
1159 l_int32 debug)
1160 {
1161 char *text;
1162 l_int32 i, j, nc, ns;
1163 PIX *pix;
1164 PIXA *pixa, *pixa1;
1165 L_RECOG *recog;
1166
1167 if (!precog)
1168 return ERROR_INT("&recog not defined", __func__, 1);
1169 if ((recog = *precog) == NULL)
1170 return ERROR_INT("recog not defined", __func__, 1);
1171 if (!paa) {
1172 recogDestroy(&recog);
1173 *precog = NULL;
1174 return ERROR_INT("paa not defined", __func__, 1);
1175 }
1176
1177 nc = pixaaGetCount(paa, NULL);
1178 for (i = 0; i < nc; i++) {
1179 pixa = pixaaGetPixa(paa, i, L_CLONE);
1180 ns = pixaGetCount(pixa);
1181 text = sarrayGetString(recog->sa_text, i, L_NOCOPY);
1182 pixa1 = pixaCreate(ns);
1183 pixaaAddPixa(recog->pixaa_u, pixa1, L_INSERT);
1184 for (j = 0; j < ns; j++) {
1185 pix = pixaGetPix(pixa, j, L_CLONE);
1186 if (debug) lept_stderr("pix[%d,%d]: text = %s\n", i, j, text);
1187 pixaaAddPix(recog->pixaa_u, i, pix, NULL, L_INSERT);
1188 }
1189 pixaDestroy(&pixa);
1190 }
1191
1192 recogTrainingFinished(&recog, 0, -1, -1.0); /* For second parameter,
1193 see comment in recogRead() */
1194 if (!recog)
1195 return ERROR_INT("bad templates; recog destroyed", __func__, 1);
1196 return 0;
1197 }