comparison mupdf-source/thirdparty/leptonica/src/classapp.c @ 2:b50eed0cc0ef upstream

ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4. The directory name has changed: no version number in the expanded directory now.
author Franz Glasner <fzglas.hg@dom66.de>
date Mon, 15 Sep 2025 11:43:07 +0200
parents
children
comparison
equal deleted inserted replaced
1:1d09e1dec1d9 2:b50eed0cc0ef
1 /*====================================================================*
2 - Copyright (C) 2001 Leptonica. All rights reserved.
3 -
4 - Redistribution and use in source and binary forms, with or without
5 - modification, are permitted provided that the following conditions
6 - are met:
7 - 1. Redistributions of source code must retain the above copyright
8 - notice, this list of conditions and the following disclaimer.
9 - 2. Redistributions in binary form must reproduce the above
10 - copyright notice, this list of conditions and the following
11 - disclaimer in the documentation and/or other materials
12 - provided with the distribution.
13 -
14 - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
15 - ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
16 - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
17 - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL ANY
18 - CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
19 - EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
20 - PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
21 - PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
22 - OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
23 - NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
24 - SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25 *====================================================================*/
26
27 /*!
28 * \file classapp.c
29 * <pre>
30 *
31 * Top-level jb2 correlation and rank-hausdorff
32 * l_int32 jbCorrelation()
33 * l_int32 jbRankHaus()
34 *
35 * Extract and classify words in textline order
36 * JBCLASSER *jbWordsInTextlines()
37 * l_int32 pixGetWordsInTextlines()
38 * l_int32 pixGetWordBoxesInTextlines()
39 *
40 * Extract word and character bounding boxes
41 * l_int32 pixFindWordAndCharacterBoxes()
42 *
43 * Use word bounding boxes to compare page images
44 * NUMAA *boxaExtractSortedPattern()
45 * l_int32 numaaCompareImagesByBoxes()
46 * static l_int32 testLineAlignmentX()
47 * static l_int32 countAlignedMatches()
48 * static void printRowIndices()
49 * </pre>
50 */
51
52 #ifdef HAVE_CONFIG_H
53 #include <config_auto.h>
54 #endif /* HAVE_CONFIG_H */
55
56 #include <string.h>
57 #include "allheaders.h"
58
59 #define L_BUF_SIZE 512 /*!< size of filename buffer */
60 static const l_int32 JB_WORDS_MIN_WIDTH = 5; /*!< min. word width in pixels */
61 static const l_int32 JB_WORDS_MIN_HEIGHT = 3; /*!< min. word height in pixels */
62
63 /* Static comparison functions */
64 static l_int32 testLineAlignmentX(NUMA *na1, NUMA *na2, l_int32 shiftx,
65 l_int32 delx, l_int32 nperline);
66 static l_int32 countAlignedMatches(NUMA *nai1, NUMA *nai2, NUMA *nasx,
67 NUMA *nasy, l_int32 n1, l_int32 n2,
68 l_int32 delx, l_int32 dely,
69 l_int32 nreq, l_int32 *psame,
70 l_int32 debugflag);
71 static void printRowIndices(l_int32 *index1, l_int32 n1,
72 l_int32 *index2, l_int32 n2);
73
74 /*------------------------------------------------------------------*
75 * Top-level jb2 correlation and rank-hausdorff *
76 *------------------------------------------------------------------*/
77 /*!
78 * \brief jbCorrelation()
79 *
80 * \param[in] dirin directory of input images
81 * \param[in] thresh typically ~0.8
82 * \param[in] weight typically ~0.6
83 * \param[in] components JB_CONN_COMPS, JB_CHARACTERS, JB_WORDS
84 * \param[in] rootname for output files
85 * \param[in] firstpage 0-based
86 * \param[in] npages use 0 for all pages in dirin
87 * \param[in] renderflag 1 to render from templates; 0 to skip
88 * \return 0 if OK, 1 on error
89 *
90 * <pre>
91 * Notes:
92 * (1) The images must be 1 bpp. If they are not, you can convert
93 * them using convertFilesTo1bpp().
94 * (2) See prog/jbcorrelation for generating more output (e.g.,
95 * for debugging)
96 * </pre>
97 */
98 l_ok
99 jbCorrelation(const char *dirin,
100 l_float32 thresh,
101 l_float32 weight,
102 l_int32 components,
103 const char *rootname,
104 l_int32 firstpage,
105 l_int32 npages,
106 l_int32 renderflag)
107 {
108 char filename[L_BUF_SIZE];
109 l_int32 nfiles, i, numpages;
110 JBDATA *data;
111 JBCLASSER *classer;
112 PIX *pix;
113 PIXA *pixa;
114 SARRAY *safiles;
115
116 if (!dirin)
117 return ERROR_INT("dirin not defined", __func__, 1);
118 if (!rootname)
119 return ERROR_INT("rootname not defined", __func__, 1);
120 if (components != JB_CONN_COMPS && components != JB_CHARACTERS &&
121 components != JB_WORDS)
122 return ERROR_INT("components invalid", __func__, 1);
123
124 safiles = getSortedPathnamesInDirectory(dirin, NULL, firstpage, npages);
125 nfiles = sarrayGetCount(safiles);
126
127 /* Classify components */
128 classer = jbCorrelationInit(components, 0, 0, thresh, weight);
129 jbAddPages(classer, safiles);
130
131 /* Save data */
132 data = jbDataSave(classer);
133 jbDataWrite(rootname, data);
134
135 /* Optionally, render pages using class templates */
136 if (renderflag) {
137 pixa = jbDataRender(data, FALSE);
138 numpages = pixaGetCount(pixa);
139 if (numpages != nfiles)
140 lept_stderr("numpages = %d, nfiles = %d, not equal!\n",
141 numpages, nfiles);
142 for (i = 0; i < numpages; i++) {
143 pix = pixaGetPix(pixa, i, L_CLONE);
144 snprintf(filename, L_BUF_SIZE, "%s.%04d", rootname, i);
145 lept_stderr("filename: %s\n", filename);
146 pixWrite(filename, pix, IFF_PNG);
147 pixDestroy(&pix);
148 }
149 pixaDestroy(&pixa);
150 }
151
152 sarrayDestroy(&safiles);
153 jbClasserDestroy(&classer);
154 jbDataDestroy(&data);
155 return 0;
156 }
157
158
159 /*!
160 * \brief jbRankHaus()
161 *
162 * \param[in] dirin directory of input images
163 * \param[in] size of Sel used for dilation; typ. 2
164 * \param[in] rank rank value of match; typ. 0.97
165 * \param[in] components JB_CONN_COMPS, JB_CHARACTERS, JB_WORDS
166 * \param[in] rootname for output files
167 * \param[in] firstpage 0-based
168 * \param[in] npages use 0 for all pages in dirin
169 * \param[in] renderflag 1 to render from templates; 0 to skip
170 * \return 0 if OK, 1 on error
171 *
172 * <pre>
173 * Notes:
174 * (1) See prog/jbrankhaus for generating more output (e.g.,
175 * for debugging)
176 * </pre>
177 */
178 l_ok
179 jbRankHaus(const char *dirin,
180 l_int32 size,
181 l_float32 rank,
182 l_int32 components,
183 const char *rootname,
184 l_int32 firstpage,
185 l_int32 npages,
186 l_int32 renderflag)
187 {
188 char filename[L_BUF_SIZE];
189 l_int32 nfiles, i, numpages;
190 JBDATA *data;
191 JBCLASSER *classer;
192 PIX *pix;
193 PIXA *pixa;
194 SARRAY *safiles;
195
196 if (!dirin)
197 return ERROR_INT("dirin not defined", __func__, 1);
198 if (!rootname)
199 return ERROR_INT("rootname not defined", __func__, 1);
200 if (components != JB_CONN_COMPS && components != JB_CHARACTERS &&
201 components != JB_WORDS)
202 return ERROR_INT("components invalid", __func__, 1);
203
204 safiles = getSortedPathnamesInDirectory(dirin, NULL, firstpage, npages);
205 nfiles = sarrayGetCount(safiles);
206
207 /* Classify components */
208 classer = jbRankHausInit(components, 0, 0, size, rank);
209 jbAddPages(classer, safiles);
210
211 /* Save data */
212 data = jbDataSave(classer);
213 jbDataWrite(rootname, data);
214
215 /* Optionally, render pages using class templates */
216 if (renderflag) {
217 pixa = jbDataRender(data, FALSE);
218 numpages = pixaGetCount(pixa);
219 if (numpages != nfiles)
220 lept_stderr("numpages = %d, nfiles = %d, not equal!\n",
221 numpages, nfiles);
222 for (i = 0; i < numpages; i++) {
223 pix = pixaGetPix(pixa, i, L_CLONE);
224 snprintf(filename, L_BUF_SIZE, "%s.%04d", rootname, i);
225 lept_stderr("filename: %s\n", filename);
226 pixWrite(filename, pix, IFF_PNG);
227 pixDestroy(&pix);
228 }
229 pixaDestroy(&pixa);
230 }
231
232 sarrayDestroy(&safiles);
233 jbClasserDestroy(&classer);
234 jbDataDestroy(&data);
235 return 0;
236 }
237
238
239
240 /*------------------------------------------------------------------*
241 * Extract and classify words in textline order *
242 *------------------------------------------------------------------*/
243 /*!
244 * \brief jbWordsInTextlines()
245 *
246 * \param[in] dirin directory of input pages
247 * \param[in] reduction 1 for full res; 2 for half-res
248 * \param[in] maxwidth of word mask components, to be kept
249 * \param[in] maxheight of word mask components, to be kept
250 * \param[in] thresh on correlation; 0.80 is reasonable
251 * \param[in] weight for handling thick text; 0.6 is reasonable
252 * \param[out] pnatl numa with textline index for each component
253 * \param[in] firstpage 0-based
254 * \param[in] npages use 0 for all pages in dirin
255 * \return classer for the set of pages
256 *
257 * <pre>
258 * Notes:
259 * (1) This is a high-level function. See prog/jbwords for example
260 * of usage.
261 * (2) Typically, use input of 75 - 150 ppi for finding words.
262 * </pre>
263 */
264 JBCLASSER *
265 jbWordsInTextlines(const char *dirin,
266 l_int32 reduction,
267 l_int32 maxwidth,
268 l_int32 maxheight,
269 l_float32 thresh,
270 l_float32 weight,
271 NUMA **pnatl,
272 l_int32 firstpage,
273 l_int32 npages)
274 {
275 char *fname;
276 l_int32 nfiles, i, w, h;
277 BOXA *boxa;
278 JBCLASSER *classer;
279 NUMA *nai, *natl;
280 PIX *pix1, *pix2;
281 PIXA *pixa;
282 SARRAY *safiles;
283
284 if (!pnatl)
285 return (JBCLASSER *)ERROR_PTR("&natl not defined", __func__, NULL);
286 *pnatl = NULL;
287 if (!dirin)
288 return (JBCLASSER *)ERROR_PTR("dirin not defined", __func__, NULL);
289 if (reduction != 1 && reduction != 2)
290 return (JBCLASSER *)ERROR_PTR("reduction not in {1,2}", __func__, NULL);
291
292 safiles = getSortedPathnamesInDirectory(dirin, NULL, firstpage, npages);
293 nfiles = sarrayGetCount(safiles);
294
295 /* Classify components */
296 classer = jbCorrelationInit(JB_WORDS, maxwidth, maxheight, thresh, weight);
297 classer->safiles = sarrayCopy(safiles);
298 natl = numaCreate(0);
299 *pnatl = natl;
300 for (i = 0; i < nfiles; i++) {
301 fname = sarrayGetString(safiles, i, L_NOCOPY);
302 if ((pix1 = pixRead(fname)) == NULL) {
303 L_WARNING("image file %d not read\n", __func__, i);
304 continue;
305 }
306 if (reduction == 1)
307 pix2 = pixClone(pix1);
308 else /* reduction == 2 */
309 pix2 = pixReduceRankBinaryCascade(pix1, 1, 0, 0, 0);
310 pixGetWordsInTextlines(pix2, JB_WORDS_MIN_WIDTH,
311 JB_WORDS_MIN_HEIGHT, maxwidth, maxheight,
312 &boxa, &pixa, &nai);
313 pixGetDimensions(pix2, &w, &h, NULL);
314 classer->w = w;
315 classer->h = h;
316 jbAddPageComponents(classer, pix2, boxa, pixa);
317 numaJoin(natl, nai, 0, -1);
318 pixDestroy(&pix1);
319 pixDestroy(&pix2);
320 numaDestroy(&nai);
321 boxaDestroy(&boxa);
322 pixaDestroy(&pixa);
323 }
324
325 sarrayDestroy(&safiles);
326 return classer;
327 }
328
329
330 /*!
331 * \brief pixGetWordsInTextlines()
332 *
333 * \param[in] pixs 1 bpp, typ. 75 - 150 ppi
334 * \param[in] minwidth of saved components; smaller are discarded
335 * \param[in] minheight of saved components; smaller are discarded
336 * \param[in] maxwidth of saved components; larger are discarded
337 * \param[in] maxheight of saved components; larger are discarded
338 * \param[out] pboxad word boxes sorted in textline line order
339 * \param[out] ppixad word images sorted in textline line order
340 * \param[out] pnai index of textline for each word
341 * \return 0 if OK, 1 on error
342 *
343 * <pre>
344 * Notes:
345 * (1) The input should be at a resolution of between 75 and 150 ppi.
346 * (2) The four size constraints on saved components are all
347 * scaled by %reduction.
348 * (3) The result are word images (and their b.b.), extracted in
349 * textline order, at either full res or 2x reduction,
350 * and with a numa giving the textline index for each word.
351 * (4) The pixa and boxa interfaces should make this type of
352 * application simple to put together. The steps are:
353 * ~ generate first estimate of word masks
354 * ~ get b.b. of these, and remove the small and big ones
355 * ~ extract pixa of the word images, using the b.b.
356 * ~ sort actual word images in textline order (2d)
357 * ~ flatten them to a pixa (1d), saving the textline index
358 * for each pix
359 * (5) In an actual application, it may be desirable to pre-filter
360 * the input image to remove large components, to extract
361 * single columns of text, and to deskew them. For example,
362 * to remove both large components and small noisy components
363 * that can interfere with the statistics used to estimate
364 * parameters for segmenting by words, but still retain text lines,
365 * the following image preprocessing can be done:
366 * Pix *pixt = pixMorphSequence(pixs, "c40.1", 0);
367 * Pix *pixf = pixSelectBySize(pixt, 0, 60, 8,
368 * L_SELECT_HEIGHT, L_SELECT_IF_LT, NULL);
369 * pixAnd(pixf, pixf, pixs); // the filtered image
370 * The closing turns text lines into long blobs, but does not
371 * significantly increase their height. But if there are many
372 * small connected components in a dense texture, this is likely
373 * to generate tall components that will be eliminated in pixf.
374 * </pre>
375 */
376 l_ok
377 pixGetWordsInTextlines(PIX *pixs,
378 l_int32 minwidth,
379 l_int32 minheight,
380 l_int32 maxwidth,
381 l_int32 maxheight,
382 BOXA **pboxad,
383 PIXA **ppixad,
384 NUMA **pnai)
385 {
386 BOXA *boxa1, *boxad;
387 BOXAA *baa;
388 NUMA *nai;
389 NUMAA *naa;
390 PIXA *pixa1, *pixad;
391 PIXAA *paa;
392
393 if (!pboxad || !ppixad || !pnai)
394 return ERROR_INT("&boxad, &pixad, &nai not all defined", __func__, 1);
395 *pboxad = NULL;
396 *ppixad = NULL;
397 *pnai = NULL;
398 if (!pixs)
399 return ERROR_INT("pixs not defined", __func__, 1);
400
401 /* Get the bounding boxes of the words from the word mask. */
402 pixWordBoxesByDilation(pixs, minwidth, minheight, maxwidth, maxheight,
403 &boxa1, NULL, NULL);
404
405 /* Generate a pixa of the word images */
406 pixa1 = pixaCreateFromBoxa(pixs, boxa1, 0, 0, NULL);
407
408 /* Sort the bounding boxes of these words by line. We use the
409 * index mapping to allow identical sorting of the pixa. */
410 baa = boxaSort2d(boxa1, &naa, -1, -1, 4);
411 paa = pixaSort2dByIndex(pixa1, naa, L_CLONE);
412
413 /* Flatten the word paa */
414 pixad = pixaaFlattenToPixa(paa, &nai, L_CLONE);
415 boxad = pixaGetBoxa(pixad, L_COPY);
416
417 *pnai = nai;
418 *pboxad = boxad;
419 *ppixad = pixad;
420
421 pixaDestroy(&pixa1);
422 boxaDestroy(&boxa1);
423 boxaaDestroy(&baa);
424 pixaaDestroy(&paa);
425 numaaDestroy(&naa);
426 return 0;
427 }
428
429
430 /*!
431 * \brief pixGetWordBoxesInTextlines()
432 *
433 * \param[in] pixs 1 bpp, typ. 75 - 150 ppi
434 * \param[in] minwidth of saved components; smaller are discarded
435 * \param[in] minheight of saved components; smaller are discarded
436 * \param[in] maxwidth of saved components; larger are discarded
437 * \param[in] maxheight of saved components; larger are discarded
438 * \param[out] pboxad word boxes sorted in textline line order
439 * \param[out] pnai [optional] index of textline for each word
440 * \return 0 if OK, 1 on error
441 *
442 * <pre>
443 * Notes:
444 * (1) The input should be at a resolution of between 75 and 150 ppi.
445 * (2) This is a special version of pixGetWordsInTextlines(), that
446 * just finds the word boxes in line order, with a numa
447 * giving the textline index for each word.
448 * See pixGetWordsInTextlines() for more details.
449 * </pre>
450 */
451 l_ok
452 pixGetWordBoxesInTextlines(PIX *pixs,
453 l_int32 minwidth,
454 l_int32 minheight,
455 l_int32 maxwidth,
456 l_int32 maxheight,
457 BOXA **pboxad,
458 NUMA **pnai)
459 {
460 BOXA *boxa1;
461 BOXAA *baa;
462 NUMA *nai;
463
464 if (pnai) *pnai = NULL;
465 if (!pboxad)
466 return ERROR_INT("&boxad and &nai not both defined", __func__, 1);
467 *pboxad = NULL;
468 if (!pixs)
469 return ERROR_INT("pixs not defined", __func__, 1);
470
471 /* Get the bounding boxes of the words from the word mask. */
472 pixWordBoxesByDilation(pixs, minwidth, minheight, maxwidth, maxheight,
473 &boxa1, NULL, NULL);
474
475 /* 2D sort the bounding boxes of these words. */
476 baa = boxaSort2d(boxa1, NULL, 3, -5, 5);
477
478 /* Flatten the boxaa, saving the boxa index for each box */
479 *pboxad = boxaaFlattenToBoxa(baa, &nai, L_CLONE);
480
481 if (pnai)
482 *pnai = nai;
483 else
484 numaDestroy(&nai);
485 boxaDestroy(&boxa1);
486 boxaaDestroy(&baa);
487 return 0;
488 }
489
490
491 /*------------------------------------------------------------------*
492 * Extract word and character bounding boxes *
493 *------------------------------------------------------------------*/
494 /*!
495 * \brief pixFindWordAndCharacterBoxes()
496 *
497 * \param[in] pixs 2, 4, 8 or 32 bpp; colormap OK; typ. 300 ppi
498 * \param[in] boxs [optional] region to select in pixs
499 * \param[in] thresh binarization threshold (typ. 100 - 150)
500 * \param[out] pboxaw return the word boxes
501 * \param[out] pboxaac return the character boxes
502 * \param[in] debugdir [optional] for debug images; use NULL to skip
503 * \return 0 if OK, 1 on error
504 *
505 * <pre>
506 * Notes:
507 * (1) If %boxs == NULL, the entire input image is used.
508 * (2) Having an input pix that is not 1bpp is necessary to reduce
509 * touching characters by using a low binarization threshold.
510 * Suggested thresholds are between 100 and 150.
511 * (3) The coordinates in the output boxes are global, with respect
512 * to the input image.
513 * </pre>
514 */
515 l_ok
516 pixFindWordAndCharacterBoxes(PIX *pixs,
517 BOX *boxs,
518 l_int32 thresh,
519 BOXA **pboxaw,
520 BOXAA **pboxaac,
521 const char *debugdir)
522 {
523 char *debugfile, *subdir;
524 l_int32 i, xs, ys, xb, yb, nb, loc;
525 l_float32 scalefact;
526 BOX *box1, *box2;
527 BOXA *boxa1, *boxa1a, *boxa2, *boxa3, *boxa4, *boxa5, *boxaw;
528 BOXAA *boxaac;
529 PIX *pix1, *pix2, *pix3, *pix3a, *pix4, *pix5;
530
531 if (pboxaw) *pboxaw = NULL;
532 if (pboxaac) *pboxaac = NULL;
533 if (!pboxaw || !pboxaac)
534 return ERROR_INT("&boxaw and &boxaac not defined", __func__, 1);
535 if (!pixs || pixGetDepth(pixs) == 1)
536 return ERROR_INT("pixs not defined or 1 bpp", __func__, 1);
537 if (thresh > 150)
538 L_WARNING("threshold is %d; may be too high\n", __func__, thresh);
539
540 if (boxs) {
541 if ((pix1 = pixClipRectangle(pixs, boxs, NULL)) == NULL)
542 return ERROR_INT("pix1 not made", __func__, 1);
543 boxGetGeometry(boxs, &xs, &ys, NULL, NULL);
544 } else {
545 pix1 = pixClone(pixs);
546 xs = ys = 0;
547 }
548
549 /* Convert pix1 to 8 bpp gray if necessary */
550 pix2 = pixConvertTo8(pix1, FALSE);
551
552 /* To find the words and letters, work with 1 bpp images and use
553 * a low threshold to reduce the number of touching characters. */
554 pix3 = pixConvertTo1(pix2, thresh);
555
556 /* Work at about 120 ppi to find the word bounding boxes. */
557 pix3a = pixScaleToResolution(pix3, 120.0, 300.0, &scalefact);
558
559 /* First find the words, removing the very small things like
560 * dots over the 'i' that weren't included in word boxes. */
561 pixGetWordBoxesInTextlines(pix3a, 1, 4, 150, 40, &boxa1a, NULL);
562 boxa1 = boxaTransform(boxa1a, 0, 0, 1.0 / scalefact, 1.0 / scalefact);
563 if (debugdir) {
564 loc = 0;
565 subdir = stringReplaceSubstr(debugdir, "/tmp/", "", &loc, NULL);
566 lept_mkdir(subdir);
567 LEPT_FREE(subdir);
568 pix4 = pixConvertTo32(pix2);
569 pixRenderBoxaArb(pix4, boxa1, 2, 255, 0, 0);
570 debugfile = stringJoin(debugdir, "/words.png");
571 pixWrite(debugfile, pix4, IFF_PNG);
572 pixDestroy(&pix4);
573 LEPT_FREE(debugfile);
574 }
575
576 /* Now find the letters at 300 ppi */
577 nb = boxaGetCount(boxa1);
578 boxaw = boxaCreate(nb);
579 boxaac = boxaaCreate(nb);
580 *pboxaw = boxaw;
581 *pboxaac = boxaac;
582 for (i = 0; i < nb; i++) {
583 box1 = boxaGetBox(boxa1, i, L_COPY);
584 boxGetGeometry(box1, &xb, &yb, NULL, NULL);
585 pix4 = pixClipRectangle(pix3, box1, NULL);
586 /* Join detached parts of characters vertically */
587 pix5 = pixMorphSequence(pix4, "c1.10", 0);
588 /* The connected components should mostly be characters */
589 boxa2 = pixConnCompBB(pix5, 4);
590 /* Remove very small pieces */
591 boxa3 = boxaSelectBySize(boxa2, 2, 5, L_SELECT_IF_BOTH,
592 L_SELECT_IF_GTE, NULL);
593 /* Order left to right */
594 boxa4 = boxaSort(boxa3, L_SORT_BY_X, L_SORT_INCREASING, NULL);
595 /* Express locations with reference to the full input image */
596 boxa5 = boxaTransform(boxa4, xs + xb, ys + yb, 1.0, 1.0);
597 box2 = boxTransform(box1, xs, ys, 1.0, 1.0);
598
599 /* Ignore any boxa with no boxes after size filtering */
600 if (boxaGetCount(boxa5) > 0) {
601 boxaAddBox(boxaw, box2, L_INSERT);
602 boxaaAddBoxa(boxaac, boxa5, L_INSERT);
603 } else {
604 boxDestroy(&box2);
605 boxaDestroy(&boxa5);
606 }
607 boxDestroy(&box1);
608 pixDestroy(&pix4);
609 pixDestroy(&pix5);
610 boxaDestroy(&boxa2);
611 boxaDestroy(&boxa3);
612 boxaDestroy(&boxa4);
613 }
614 pixDestroy(&pix1);
615 pixDestroy(&pix2);
616 pixDestroy(&pix3);
617 pixDestroy(&pix3a);
618 boxaDestroy(&boxa1);
619 boxaDestroy(&boxa1a);
620 if (debugdir) {
621 pix4 = pixConvertTo32(pixs);
622 boxa2 = boxaaFlattenToBoxa(boxaac, NULL, L_COPY);
623 pixRenderBoxaArb(pix4, boxa2, 2, 255, 0, 0);
624 boxa3 = boxaAdjustSides(boxaw, -2, 2, -2, 2);
625 pixRenderBoxaArb(pix4, boxa3, 2, 0, 255, 0);
626 debugfile = stringJoin(debugdir, "/chars.png");
627 pixWrite(debugfile, pix4, IFF_PNG);
628 pixDestroy(&pix4);
629 boxaDestroy(&boxa2);
630 boxaDestroy(&boxa3);
631 LEPT_FREE(debugfile);
632 }
633 return 0;
634 }
635
636
637 /*------------------------------------------------------------------*
638 * Use word bounding boxes to compare page images *
639 *------------------------------------------------------------------*/
640 /*!
641 * \brief boxaExtractSortedPattern()
642 *
643 * \param[in] boxa typ. of word bounding boxes, in textline order
644 * \param[in] na index of textline for each box in boxa
645 * \return naa NUMAA, where each numa represents one textline,
646 * or NULL on error
647 *
648 * <pre>
649 * Notes:
650 * (1) The input is expected to come from pixGetWordBoxesInTextlines().
651 * (2) Each numa in the output consists of an average y coordinate
652 * of the first box in the textline, followed by pairs of
653 * x coordinates representing the left and right edges of each
654 * of the boxes in the textline.
655 * </pre>
656 */
657 NUMAA *
658 boxaExtractSortedPattern(BOXA *boxa,
659 NUMA *na)
660 {
661 l_int32 index, nbox, row, prevrow, x, y, w, h;
662 BOX *box;
663 NUMA *nad = NULL;
664 NUMAA *naa;
665
666 if (!boxa)
667 return (NUMAA *)ERROR_PTR("boxa not defined", __func__, NULL);
668 if (!na)
669 return (NUMAA *)ERROR_PTR("na not defined", __func__, NULL);
670
671 naa = numaaCreate(0);
672 nbox = boxaGetCount(boxa);
673 if (nbox == 0)
674 return naa;
675
676 prevrow = -1;
677 for (index = 0; index < nbox; index++) {
678 box = boxaGetBox(boxa, index, L_CLONE);
679 numaGetIValue(na, index, &row);
680 if (row > prevrow) {
681 if (index > 0)
682 numaaAddNuma(naa, nad, L_INSERT);
683 nad = numaCreate(0);
684 prevrow = row;
685 boxGetGeometry(box, NULL, &y, NULL, &h);
686 numaAddNumber(nad, y + h / 2);
687 }
688 boxGetGeometry(box, &x, NULL, &w, NULL);
689 numaAddNumber(nad, x);
690 numaAddNumber(nad, x + w - 1);
691 boxDestroy(&box);
692 }
693 numaaAddNuma(naa, nad, L_INSERT);
694
695 return naa;
696 }
697
698
699 /*!
700 * \brief numaaCompareImagesByBoxes()
701 *
702 * \param[in] naa1 for image 1, formatted by boxaExtractSortedPattern()
703 * \param[in] naa2 for image 2, formatted by boxaExtractSortedPattern()
704 * \param[in] nperline number of box regions to be used in each textline
705 * \param[in] nreq number of complete row matches required
706 * \param[in] maxshiftx max allowed x shift between two patterns, in pixels
707 * \param[in] maxshifty max allowed y shift between two patterns, in pixels
708 * \param[in] delx max allowed difference in x data, after alignment
709 * \param[in] dely max allowed difference in y data, after alignment
710 * \param[out] psame 1 if %nreq row matches are found; 0 otherwise
711 * \param[in] debugflag 1 for debug output
712 * \return 0 if OK, 1 on error
713 *
714 * <pre>
715 * Notes:
716 * (1) Each input numaa describes a set of sorted bounding boxes
717 * (sorted by textline and, within each textline, from
718 * left to right) in the images from which they are derived.
719 * See boxaExtractSortedPattern() for a description of the data
720 * format in each of the input numaa.
721 * (2) This function does an alignment between the input
722 * descriptions of bounding boxes for two images. The
723 * input parameter %nperline specifies the number of boxes
724 * to consider in each line when testing for a match, and
725 * %nreq is the required number of lines that must be well-aligned
726 * to get a match.
727 * (3) Testing by alignment has 3 steps:
728 * (a) Generating the location of word bounding boxes from the
729 * images (prior to calling this function).
730 * (b) Listing all possible pairs of aligned rows, based on
731 * tolerances in horizontal and vertical positions of
732 * the boxes. Specifically, all pairs of rows are enumerated
733 * whose first %nperline boxes can be brought into close
734 * alignment, based on the delx parameter for boxes in the
735 * line and within the overall the %maxshiftx and %maxshifty
736 * constraints.
737 * (c) Each pair, starting with the first, is used to search
738 * for a set of %nreq - 1 other pairs that can all be aligned
739 * with a difference in global translation of not more
740 * than (%delx, %dely).
741 * </pre>
742 */
743 l_ok
744 numaaCompareImagesByBoxes(NUMAA *naa1,
745 NUMAA *naa2,
746 l_int32 nperline,
747 l_int32 nreq,
748 l_int32 maxshiftx,
749 l_int32 maxshifty,
750 l_int32 delx,
751 l_int32 dely,
752 l_int32 *psame,
753 l_int32 debugflag)
754 {
755 l_int32 n1, n2, i, j, nbox, y1, y2, xl1, xl2;
756 l_int32 shiftx, shifty, match;
757 l_int32 *line1, *line2; /* indicator for sufficient boxes in a line */
758 l_int32 *yloc1, *yloc2; /* arrays of y value for first box in a line */
759 l_int32 *xleft1, *xleft2; /* arrays of x value for left side of first box */
760 NUMA *na1, *na2, *nai1, *nai2, *nasx, *nasy;
761
762 if (!psame)
763 return ERROR_INT("&same not defined", __func__, 1);
764 *psame = 0;
765 if (!naa1)
766 return ERROR_INT("naa1 not defined", __func__, 1);
767 if (!naa2)
768 return ERROR_INT("naa2 not defined", __func__, 1);
769 if (nperline < 1)
770 return ERROR_INT("nperline < 1", __func__, 1);
771 if (nreq < 1)
772 return ERROR_INT("nreq < 1", __func__, 1);
773
774 n1 = numaaGetCount(naa1);
775 n2 = numaaGetCount(naa2);
776 if (n1 < nreq || n2 < nreq)
777 return 0;
778
779 /* Find the lines in naa1 and naa2 with sufficient boxes.
780 * Also, find the y-values for each of the lines, and the
781 * LH x-values of the first box in each line. */
782 line1 = (l_int32 *)LEPT_CALLOC(n1, sizeof(l_int32));
783 line2 = (l_int32 *)LEPT_CALLOC(n2, sizeof(l_int32));
784 yloc1 = (l_int32 *)LEPT_CALLOC(n1, sizeof(l_int32));
785 yloc2 = (l_int32 *)LEPT_CALLOC(n2, sizeof(l_int32));
786 xleft1 = (l_int32 *)LEPT_CALLOC(n1, sizeof(l_int32));
787 xleft2 = (l_int32 *)LEPT_CALLOC(n2, sizeof(l_int32));
788 if (!line1 || !line2 || !yloc1 || !yloc2 || !xleft1 || !xleft2) {
789 LEPT_FREE(line1);
790 LEPT_FREE(line2);
791 LEPT_FREE(yloc1);
792 LEPT_FREE(yloc2);
793 LEPT_FREE(xleft1);
794 LEPT_FREE(xleft2);
795 return ERROR_INT("calloc failure for an array", __func__, 1);
796 }
797 for (i = 0; i < n1; i++) {
798 na1 = numaaGetNuma(naa1, i, L_CLONE);
799 numaGetIValue(na1, 0, yloc1 + i);
800 numaGetIValue(na1, 1, xleft1 + i);
801 nbox = (numaGetCount(na1) - 1) / 2;
802 if (nbox >= nperline)
803 line1[i] = 1;
804 numaDestroy(&na1);
805 }
806 for (i = 0; i < n2; i++) {
807 na2 = numaaGetNuma(naa2, i, L_CLONE);
808 numaGetIValue(na2, 0, yloc2 + i);
809 numaGetIValue(na2, 1, xleft2 + i);
810 nbox = (numaGetCount(na2) - 1) / 2;
811 if (nbox >= nperline)
812 line2[i] = 1;
813 numaDestroy(&na2);
814 }
815
816 /* Enumerate all possible line matches. A 'possible' line
817 * match is one where the x and y shifts for the first box
818 * in each line are within the maxshiftx and maxshifty
819 * constraints, and the left and right sides of the remaining
820 * (nperline - 1) successive boxes are within delx of each other.
821 * The result is a set of four numas giving parameters of
822 * each set of matching lines. */
823 nai1 = numaCreate(0); /* line index 1 of match */
824 nai2 = numaCreate(0); /* line index 2 of match */
825 nasx = numaCreate(0); /* shiftx for match */
826 nasy = numaCreate(0); /* shifty for match */
827 for (i = 0; i < n1; i++) {
828 if (line1[i] == 0) continue;
829 y1 = yloc1[i];
830 xl1 = xleft1[i];
831 na1 = numaaGetNuma(naa1, i, L_CLONE);
832 for (j = 0; j < n2; j++) {
833 if (line2[j] == 0) continue;
834 y2 = yloc2[j];
835 if (L_ABS(y1 - y2) > maxshifty) continue;
836 xl2 = xleft2[j];
837 if (L_ABS(xl1 - xl2) > maxshiftx) continue;
838 shiftx = xl1 - xl2; /* shift to add to x2 values */
839 shifty = y1 - y2; /* shift to add to y2 values */
840 na2 = numaaGetNuma(naa2, j, L_CLONE);
841
842 /* Now check if 'nperline' boxes in the two lines match */
843 match = testLineAlignmentX(na1, na2, shiftx, delx, nperline);
844 if (match) {
845 numaAddNumber(nai1, i);
846 numaAddNumber(nai2, j);
847 numaAddNumber(nasx, shiftx);
848 numaAddNumber(nasy, shifty);
849 }
850 numaDestroy(&na2);
851 }
852 numaDestroy(&na1);
853 }
854
855 /* Determine if there are a sufficient number of mutually
856 * aligned matches. Mutually aligned matches place an additional
857 * constraint on the 'possible' matches, where the relative
858 * shifts must not exceed the (delx, dely) distances. */
859 countAlignedMatches(nai1, nai2, nasx, nasy, n1, n2, delx, dely,
860 nreq, psame, debugflag);
861
862 LEPT_FREE(line1);
863 LEPT_FREE(line2);
864 LEPT_FREE(yloc1);
865 LEPT_FREE(yloc2);
866 LEPT_FREE(xleft1);
867 LEPT_FREE(xleft2);
868 numaDestroy(&nai1);
869 numaDestroy(&nai2);
870 numaDestroy(&nasx);
871 numaDestroy(&nasy);
872 return 0;
873 }
874
875
876 static l_int32
877 testLineAlignmentX(NUMA *na1,
878 NUMA *na2,
879 l_int32 shiftx,
880 l_int32 delx,
881 l_int32 nperline)
882 {
883 l_int32 i, xl1, xr1, xl2, xr2, diffl, diffr;
884
885 if (!na1)
886 return ERROR_INT("na1 not defined", __func__, 1);
887 if (!na2)
888 return ERROR_INT("na2 not defined", __func__, 1);
889
890 for (i = 0; i < nperline; i++) {
891 numaGetIValue(na1, i + 1, &xl1);
892 numaGetIValue(na1, i + 2, &xr1);
893 numaGetIValue(na2, i + 1, &xl2);
894 numaGetIValue(na2, i + 2, &xr2);
895 diffl = L_ABS(xl1 - xl2 - shiftx);
896 diffr = L_ABS(xr1 - xr2 - shiftx);
897 if (diffl > delx || diffr > delx)
898 return 0;
899 }
900
901 return 1;
902 }
903
904
905 /*
906 * \brief countAlignedMatches()
907 *
908 * \param[in] nai1, nai2 numas of row pairs for matches
909 * \param[in] nasx, nasy numas of x and y shifts for the matches
910 * \param[in] n1, n2 number of rows in images 1 and 2
911 * \param[in] delx, dely allowed difference in shifts of the match,
912 * compared to the reference match
913 * \param[in] nre1 number of required aligned matches
914 * \param[out] psame return 1 if %nreq row matches are found;
915 * 0 otherwise
916 * \return 0 if OK, 1 on error
917 *
918 * <pre>
919 * Notes:
920 * (1) This takes 4 input arrays giving parameters of all the
921 * line matches. It looks for the maximum set of aligned
922 * matches (matches with approximately the same overall shifts)
923 * that do not use rows from either image more than once.
924 * </pre>
925 */
926 static l_ok
927 countAlignedMatches(NUMA *nai1,
928 NUMA *nai2,
929 NUMA *nasx,
930 NUMA *nasy,
931 l_int32 n1,
932 l_int32 n2,
933 l_int32 delx,
934 l_int32 dely,
935 l_int32 nreq,
936 l_int32 *psame,
937 l_int32 debugflag)
938 {
939 l_int32 i, j, nm, shiftx, shifty, nmatch, diffx, diffy;
940 l_int32 *ia1, *ia2, *iasx, *iasy, *index1, *index2;
941
942 if (!nai1 || !nai2 || !nasx || !nasy)
943 return ERROR_INT("4 input numas not defined", __func__, 1);
944 if (!psame)
945 return ERROR_INT("&same not defined", __func__, 1);
946 *psame = 0;
947
948 /* Check for sufficient aligned matches, doing a double iteration
949 * over the set of raw matches. The row index arrays
950 * are used to verify that the same rows in either image
951 * are not used in more than one match. Whenever there
952 * is a match that is properly aligned, those rows are
953 * marked in the index arrays. */
954 nm = numaGetCount(nai1); /* number of matches */
955 if (nm < nreq)
956 return 0;
957
958 ia1 = numaGetIArray(nai1);
959 ia2 = numaGetIArray(nai2);
960 iasx = numaGetIArray(nasx);
961 iasy = numaGetIArray(nasy);
962 index1 = (l_int32 *)LEPT_CALLOC(n1, sizeof(l_int32)); /* watch rows */
963 index2 = (l_int32 *)LEPT_CALLOC(n2, sizeof(l_int32));
964 if (!index1 || !index2)
965 return ERROR_INT("calloc fail for array", __func__, 1);
966 for (i = 0; i < nm; i++) {
967 if (*psame == 1)
968 break;
969
970 /* Reset row index arrays */
971 memset(index1, 0, 4 * n1);
972 memset(index2, 0, 4 * n2);
973 nmatch = 1;
974 index1[ia1[i]] = nmatch; /* mark these rows as taken */
975 index2[ia2[i]] = nmatch;
976 shiftx = iasx[i]; /* reference shift between two rows */
977 shifty = iasy[i]; /* ditto */
978 if (nreq == 1) {
979 *psame = 1;
980 break;
981 }
982 for (j = 0; j < nm; j++) {
983 if (j == i) continue;
984 /* Rows must both be different from any previously seen */
985 if (index1[ia1[j]] > 0 || index2[ia2[j]] > 0) continue;
986 /* Check the shift for this match */
987 diffx = L_ABS(shiftx - iasx[j]);
988 diffy = L_ABS(shifty - iasy[j]);
989 if (diffx > delx || diffy > dely) continue;
990 /* We have a match */
991 nmatch++;
992 index1[ia1[j]] = nmatch; /* mark the rows */
993 index2[ia2[j]] = nmatch;
994 if (nmatch >= nreq) {
995 *psame = 1;
996 if (debugflag)
997 printRowIndices(index1, n1, index2, n2);
998 break;
999 }
1000 }
1001 }
1002
1003 LEPT_FREE(ia1);
1004 LEPT_FREE(ia2);
1005 LEPT_FREE(iasx);
1006 LEPT_FREE(iasy);
1007 LEPT_FREE(index1);
1008 LEPT_FREE(index2);
1009 return 0;
1010 }
1011
1012
1013 static void
1014 printRowIndices(l_int32 *index1,
1015 l_int32 n1,
1016 l_int32 *index2,
1017 l_int32 n2)
1018 {
1019 l_int32 i;
1020
1021 lept_stderr("Index1: ");
1022 for (i = 0; i < n1; i++) {
1023 if (i && (i % 20 == 0))
1024 lept_stderr("\n ");
1025 lept_stderr("%3d", index1[i]);
1026 }
1027 lept_stderr("\n");
1028
1029 lept_stderr("Index2: ");
1030 for (i = 0; i < n2; i++) {
1031 if (i && (i % 20 == 0))
1032 lept_stderr("\n ");
1033 lept_stderr("%3d", index2[i]);
1034 }
1035 lept_stderr("\n");
1036 return;
1037 }