comparison mupdf-source/thirdparty/leptonica/src/dnahash_remnant.c.notused @ 2:b50eed0cc0ef upstream

ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4. The directory name has changed: no version number in the expanded directory now.
author Franz Glasner <fzglas.hg@dom66.de>
date Mon, 15 Sep 2025 11:43:07 +0200
parents
children
comparison
equal deleted inserted replaced
1:1d09e1dec1d9 2:b50eed0cc0ef
1 /*====================================================================*
2 - Copyright (C) 2001 Leptonica. All rights reserved.
3 -
4 - Redistribution and use in source and binary forms, with or without
5 - modification, are permitted provided that the following conditions
6 - are met:
7 - 1. Redistributions of source code must retain the above copyright
8 - notice, this list of conditions and the following disclaimer.
9 - 2. Redistributions in binary form must reproduce the above
10 - copyright notice, this list of conditions and the following
11 - disclaimer in the documentation and/or other materials
12 - provided with the distribution.
13 -
14 - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
15 - ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
16 - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
17 - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL ANY
18 - CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
19 - EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
20 - PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
21 - PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
22 - OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
23 - NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
24 - SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25 *====================================================================*/
26
27 /*!
28 * \file dnahash_remnant.c.notused
29 * <pre>
30 *
31 * NOTE
32 * ==================================================================
33 * This code has been retired from the library. It is just
34 * documentation. It contains dnahash functionality that is no
35 * longer in use. The only current use of dnahash (in dnahash.c)
36 * is for fast template lookup in the jbig2 classifier (jbclass.c).
37 *
38 * The functions in this file hash strings, points and doubles.
39 * Most of them have been replaced by analogous ones using the more
40 * general hashmap. They are saved here for pedagogical purposes;
41 * mostly, to show how misguided one can be trying to implement a
42 * general hashing function using only an array of doubles. (Yes, it can
43 * be done, but it's a lot of work for relatively little functionality.)
44 *
45 * The new hashmap (in hashmap.c) has a much simpler lookup and
46 * add mechanism. It also has a rehashing function to allow the hash
47 * array to grow as items are added. Unlike the simple dnahash,
48 * the hashmap stores the key in the hashitem, so it doesn't require
49 * an auxiliary array to do lookup with checking and creating.
50 * (It does however require the auxiliary array to retrieve the
51 * original data, which is not stored in the hashitem. This may
52 * be changed in the future.)
53 * ==================================================================
54 *
55 * DnaHash: Accessors
56 * l_int32 l_dnaHashGetCount()
57 * l_int32 l_dnaHashGetTotalCount()
58 *
59 * Set operations on dna
60 * L_DNAHASH *l_dnaHashCreateFromDna()
61 * l_int32 l_dnaRemoveDupsByHash()
62 * l_int32 l_dnaMakeHistoByHash()
63 * L_DNA *l_dnaIntersectionByHash()
64 * l_int32 l_dnaFindValByHash()
65 *
66 * Set operations on pta
67 * PTA *ptaUnionByHash()
68 * l_int32 ptaRemoveDupsByHash()
69 * PTA *ptaIntersectionByHash();
70 * l_int32 ptaFindPtByHash()
71 * L_DNAHASH *l_dnaHashCreateFromPta()
72 *
73 * Set operations on sarray
74 * l_int32 sarrayRemoveDupsByHash()
75 * SARRAY *sarrayIntersectionByHash()
76 * l_int32 sarrayFindStringByHash()
77 * L_DNAHASH *l_dnaHashCreateFromSarray()
78 *
79 * (1) The DnaHash is an array of Dna. It can be used for fast
80 * storage and lookup for sets and maps. If the set or map
81 * is on a Dna itself, the hash can be a simple casting from
82 * a double to a l_uint64. For a string or a (x,y) point, we
83 * use a hash to a l_uint64. The result of the hash is
84 * a "key", which is then used with the mod function to select
85 * which Dna array is to be used.
86 * (2) The number of arrays in a DnaHash is a prime number.
87 * If there are N items, we set up the DnaHash array to have
88 * approximately N/20 Dna, so the average size of these arrays
89 * will be about 20 when fully populated. The number 20 was
90 * found empirically to be in a broad maximum of efficiency.
91 * (3) Note that the word "hash" is overloaded. There are actually
92 * two hashing steps: the first hashes the object to a l_uint64,
93 * called the "key", and the second uses the mod function to
94 * "hash" the "key" to the index of a particular Dna in the
95 * DnaHash array.
96 * (4) Insertion and lookup time for DnaHash is O(1). Hash collisions
97 * into the dna array are expected (we might choose to have an average
98 * of 20 for each key). This can be contrasted with using rbtree
99 * for sets and maps, where insertion and lookup are O(logN).
100 * (5) Hash functions that map points and strings to l_uint64 are
101 * given in utils1.c.
102 * (6) This is a very simple implementation, that expects that you
103 * know approximately (i.e., within a factor of 2 or 3) how many
104 * items are to be stored when you initialize the DnaHash.
105 * It cannot modify the size of the Dna array as the occupation grows.
106 * </pre>
107 */
108
109 #ifdef HAVE_CONFIG_H
110 #include <config_auto.h>
111 #endif /* HAVE_CONFIG_H */
112
113 #include "allheaders.h"
114
115 /*--------------------------------------------------------------------------*
116 * Dna hash: Accessors and modifiers *
117 *--------------------------------------------------------------------------*/
118 /*!
119 * \brief l_dnaHashGetCount()
120 *
121 * \param[in] dahash
122 * \return nbuckets allocated, or 0 on error
123 */
124 l_int32
125 l_dnaHashGetCount(L_DNAHASH *dahash)
126 {
127
128 if (!dahash)
129 return ERROR_INT("dahash not defined", __func__, 0);
130 return dahash->nbuckets;
131 }
132
133
134 /*!
135 * \brief l_dnaHashGetTotalCount()
136 *
137 * \param[in] dahash
138 * \return n number of numbers in all dna, or 0 on error
139 */
140 l_int32
141 l_dnaHashGetTotalCount(L_DNAHASH *dahash)
142 {
143 l_int32 i, n;
144 L_DNA *da;
145
146 if (!dahash)
147 return ERROR_INT("dahash not defined", __func__, 0);
148
149 for (i = 0, n = 0; i < dahash->nbuckets; i++) {
150 da = l_dnaHashGetDna(dahash, i, L_NOCOPY);
151 if (da)
152 n += l_dnaGetCount(da);
153 }
154
155 return n;
156 }
157
158
159 /*--------------------------------------------------------------------------*
160 * Set operations on dna using hashing *
161 *--------------------------------------------------------------------------*/
162 /*!
163 * \brief l_dnaHashCreateFromDna()
164 *
165 * \param[in] da
166 * \return dahash if OK; 1 on error
167 *
168 * <pre>
169 * Notes:
170 * (1) The values stored in the %dahash are indices into %da;
171 * %dahash has no use without %da.
172 * </pre>
173 */
174 L_DNAHASH *
175 l_dnaHashCreateFromDna(L_DNA *da)
176 {
177 l_int32 i, n;
178 l_uint32 nsize;
179 l_uint64 key;
180 l_float64 val;
181 L_DNAHASH *dahash;
182
183 if (!da)
184 return (L_DNAHASH *)ERROR_PTR("da not defined", __func__, NULL);
185
186 n = l_dnaGetCount(da);
187 findNextLargerPrime(n / 20, &nsize); /* buckets in hash table */
188
189 dahash = l_dnaHashCreate(nsize, 8);
190 for (i = 0; i < n; i++) {
191 l_dnaGetDValue(da, i, &val);
192 l_dnaHashAdd(dahash, (l_uint64)val, (l_float64)i);
193 }
194
195 return dahash;
196 }
197
198
199 /*!
200 * \brief l_dnaRemoveDupsByHash()
201 *
202 * \param[in] das
203 * \param[out] pdad hash set
204 * \param[out] pdahash [optional] dnahash used for lookup
205 * \return 0 if OK; 1 on error
206 *
207 * <pre>
208 * Notes:
209 * (1) Generates a dna with unique values.
210 * (2) The dnahash is built up with dad to assure uniqueness.
211 * It can be used to find if an element is in the set:
212 * l_dnaFindValByHash(dad, dahash, val, &index)
213 * </pre>
214 */
215 l_ok
216 l_dnaRemoveDupsByHash(L_DNA *das,
217 L_DNA **pdad,
218 L_DNAHASH **pdahash)
219 {
220 l_int32 i, n, index, items;
221 l_uint32 nsize;
222 l_uint64 key;
223 l_float64 val;
224 L_DNA *dad;
225 L_DNAHASH *dahash;
226
227 if (pdahash) *pdahash = NULL;
228 if (!pdad)
229 return ERROR_INT("&dad not defined", __func__, 1);
230 *pdad = NULL;
231 if (!das)
232 return ERROR_INT("das not defined", __func__, 1);
233
234 n = l_dnaGetCount(das);
235 findNextLargerPrime(n / 20, &nsize); /* buckets in hash table */
236 dahash = l_dnaHashCreate(nsize, 8);
237 dad = l_dnaCreate(n);
238 *pdad = dad;
239 for (i = 0, items = 0; i < n; i++) {
240 l_dnaGetDValue(das, i, &val);
241 l_dnaFindValByHash(dad, dahash, val, &index);
242 if (index < 0) { /* not found */
243 l_dnaHashAdd(dahash, (l_uint64)val, (l_float64)items);
244 l_dnaAddNumber(dad, val);
245 items++;
246 }
247 }
248
249 if (pdahash)
250 *pdahash = dahash;
251 else
252 l_dnaHashDestroy(&dahash);
253 return 0;
254 }
255
256
257 /*!
258 * \brief l_dnaMakeHistoByHash()
259 *
260 * \param[in] das
261 * \param[out] pdahash hash map: val --> index
262 * \param[out] pdav [optional] array of values: index --> val
263 * \param[out] pdac [optional] histo array of counts: index --> count
264 * \return 0 if OK; 1 on error
265 *
266 * <pre>
267 * Notes:
268 * (1) Generates and returns a dna of occurrences (histogram),
269 * an aligned dna of values, and an associated hashmap.
270 * The hashmap takes %dav and a value, and points into the
271 * histogram in %dac.
272 * (2) The dna of values, %dav, is aligned with the histogram %dac,
273 * and is needed for fast lookup. It is a hash set, because
274 * the values are unique.
275 * (3) If you only need to make a histogram and get the number of
276 * non-zero entries, here are two methods:
277 * (a) l_dnaMakeHistoByHash(da, &dahash, NULL, NULL);
278 * count = l_dnaHashGetTotalCount(dahash);
279 * (b) l_dnaRemoveDupsByHash(da, &da_nodups, NULL);
280 count = l_dnaGetCount(da_nodups);
281 * (4) Lookup is simple:
282 * l_dnaFindValByHash(dav, dahash, val, &index);
283 * if (index >= 0)
284 * l_dnaGetIValue(dac, index, &icount);
285 * else
286 * icount = 0;
287 * </pre>
288 */
289 l_ok
290 l_dnaMakeHistoByHash(L_DNA *das,
291 L_DNAHASH **pdahash,
292 L_DNA **pdav,
293 L_DNA **pdac)
294 {
295 l_int32 i, n, nitems, index, count;
296 l_uint32 nsize;
297 l_uint64 key;
298 l_float64 val;
299 L_DNA *dac, *dav;
300 L_DNAHASH *dahash;
301
302 if (pdahash) *pdahash = NULL;
303 if (pdac) *pdac = NULL;
304 if (pdav) *pdav = NULL;
305 if (!pdahash)
306 return ERROR_INT("&dahash not defined", __func__, 1);
307 if (!das)
308 return ERROR_INT("das not defined", __func__, 1);
309 if ((n = l_dnaGetCount(das)) == 0)
310 return ERROR_INT("no data in das", __func__, 1);
311
312 findNextLargerPrime(n / 20, &nsize); /* buckets in hash table */
313 dahash = l_dnaHashCreate(nsize, 8);
314 dac = l_dnaCreate(n); /* histogram */
315 dav = l_dnaCreate(n); /* the values */
316 for (i = 0, nitems = 0; i < n; i++) {
317 l_dnaGetDValue(das, i, &val);
318 /* Is this value already stored in dav? */
319 l_dnaFindValByHash(dav, dahash, val, &index);
320 if (index >= 0) { /* found */
321 l_dnaGetIValue(dac, (l_float64)index, &count);
322 l_dnaSetValue(dac, (l_float64)index, count + 1);
323 } else { /* not found */
324 l_dnaHashAdd(dahash, (l_uint64)val, (l_float64)nitems);
325 l_dnaAddNumber(dav, val);
326 l_dnaAddNumber(dac, 1);
327 nitems++;
328 }
329 }
330
331 *pdahash = dahash;
332 if (pdac)
333 *pdac = dac;
334 else
335 l_dnaDestroy(&dac);
336 if (pdav)
337 *pdav = dav;
338 else
339 l_dnaDestroy(&dav);
340 return 0;
341 }
342
343
344 /*!
345 * \brief l_dnaIntersectionByHash()
346 *
347 * \param[in] da1, da2
348 * \return dad intersection of the number arrays, or NULL on error
349 *
350 * <pre>
351 * Notes:
352 * (1) This uses the same method for building the intersection set
353 * as ptaIntersectionByHash() and sarrayIntersectionByHash().
354 * </pre>
355 */
356 L_DNA *
357 l_dnaIntersectionByHash(L_DNA *da1,
358 L_DNA *da2)
359 {
360 l_int32 n1, n2, nsmall, nbuckets, i, index1, index2;
361 l_uint32 nsize2;
362 l_uint64 key;
363 l_float64 val;
364 L_DNAHASH *dahash1, *dahash2;
365 L_DNA *da_small, *da_big, *dad;
366
367 if (!da1)
368 return (L_DNA *)ERROR_PTR("da1 not defined", __func__, NULL);
369 if (!da2)
370 return (L_DNA *)ERROR_PTR("da2 not defined", __func__, NULL);
371
372 /* Put the elements of the biggest array into a dnahash */
373 n1 = l_dnaGetCount(da1);
374 n2 = l_dnaGetCount(da2);
375 da_small = (n1 < n2) ? da1 : da2; /* do not destroy da_small */
376 da_big = (n1 < n2) ? da2 : da1; /* do not destroy da_big */
377 dahash1 = l_dnaHashCreateFromDna(da_big);
378
379 /* Build up the intersection of numbers. Add to %dad
380 * if the number is in da_big (using dahash1) but hasn't
381 * yet been seen in the traversal of da_small (using dahash2). */
382 dad = l_dnaCreate(0);
383 nsmall = l_dnaGetCount(da_small);
384 findNextLargerPrime(nsmall / 20, &nsize2); /* buckets in hash table */
385 dahash2 = l_dnaHashCreate(nsize2, 0);
386 nbuckets = l_dnaHashGetCount(dahash2);
387 for (i = 0; i < nsmall; i++) {
388 l_dnaGetDValue(da_small, i, &val);
389 l_dnaFindValByHash(da_big, dahash1, val, &index1);
390 if (index1 >= 0) { /* found */
391 l_dnaFindValByHash(da_small, dahash2, val, &index2);
392 if (index2 == -1) { /* not found */
393 l_dnaAddNumber(dad, val);
394 l_dnaHashAdd(dahash2, (l_uint64)val, (l_float64)i);
395 }
396 }
397 }
398
399 l_dnaHashDestroy(&dahash1);
400 l_dnaHashDestroy(&dahash2);
401 return dad;
402 }
403
404
405 /*!
406 * \brief l_dnaFindValByHash()
407 *
408 * \param[in] da
409 * \param[in] dahash containing indices into %da
410 * \param[in] val searching for this number in %da
411 * \param[out] pindex index into da if found; -1 otherwise
412 * \return 0 if OK; 1 on error
413 *
414 * <pre>
415 * Notes:
416 * (1) Algo: hash %val into a key; hash the key to get the dna
417 * in %dahash (that holds indices into %da); traverse
418 * the dna of indices looking for %val in %da.
419 * </pre>
420 */
421 l_ok
422 l_dnaFindValByHash(L_DNA *da,
423 L_DNAHASH *dahash,
424 l_float64 val,
425 l_int32 *pindex)
426 {
427 l_int32 i, nbuckets, nvals, indexval;
428 l_float64 vali;
429 l_uint64 key;
430 L_DNA *da1;
431
432 if (!pindex)
433 return ERROR_INT("&index not defined", __func__, 1);
434 *pindex = -1;
435 if (!da)
436 return ERROR_INT("da not defined", __func__, 1);
437 if (!dahash)
438 return ERROR_INT("dahash not defined", __func__, 1);
439
440 nbuckets = l_dnaHashGetCount(dahash);
441 da1 = l_dnaHashGetDna(dahash, (l_uint64)val, L_NOCOPY);
442 if (!da1) return 0;
443
444 /* Run through da1, looking for this %val */
445 nvals = l_dnaGetCount(da1);
446 for (i = 0; i < nvals; i++) {
447 l_dnaGetIValue(da1, i, &indexval);
448 l_dnaGetDValue(da, indexval, &vali);
449 if (val == vali) {
450 *pindex = indexval;
451 return 0;
452 }
453 }
454
455 return 0;
456 }
457
458
459 /*---------------------------------------------------------------------*
460 * Set operations on points using hashing *
461 *---------------------------------------------------------------------*/
462 /*!
463 * \brief ptaUnionByHash()
464 *
465 * \param[in] pta1, pta2
466 * \return ptad with the union of the set of points, or NULL on error
467 *
468 * <pre>
469 * Notes:
470 * (1) This is faster than ptaUnionByAset(), because the
471 * bucket lookup is O(n). It should be used if the pts are
472 * integers (e.g., representing pixel positions).
473 * </pre>
474 */
475 PTA *
476 ptaUnionByHash(PTA *pta1,
477 PTA *pta2)
478 {
479 PTA *pta3, *ptad;
480
481 if (!pta1)
482 return (PTA *)ERROR_PTR("pta1 not defined", __func__, NULL);
483 if (!pta2)
484 return (PTA *)ERROR_PTR("pta2 not defined", __func__, NULL);
485
486 /* Join */
487 pta3 = ptaCopy(pta1);
488 ptaJoin(pta3, pta2, 0, -1);
489
490 /* Eliminate duplicates */
491 ptaRemoveDupsByHash(pta3, &ptad, NULL);
492 ptaDestroy(&pta3);
493 return ptad;
494 }
495
496
497 /*!
498 * \brief ptaRemoveDupsByHash()
499 *
500 * \param[in] ptas assumed to be integer values
501 * \param[out] pptad unique set of pts; duplicates removed
502 * \param[out] pdahash [optional] dnahash used for lookup
503 * \return 0 if OK, 1 on error
504 *
505 * <pre>
506 * Notes:
507 * (1) Generates a pta with unique values.
508 * (2) The dnahash is built up with ptad to assure uniqueness.
509 * It can be used to find if a point is in the set:
510 * ptaFindPtByHash(ptad, dahash, x, y, &index)
511 * (3) The hash of the (x,y) location is simple and fast. It scales
512 * up with the number of buckets to insure a fairly random
513 * bucket selection for adjacent points.
514 * (4) A Dna is used rather than a Numa because we need accurate
515 * representation of 32-bit integers that are indices into ptas.
516 * Integer --> float --> integer conversion makes errors for
517 * integers larger than 10M.
518 * (5) This is faster than ptaRemoveDupsByAset(), because the
519 * bucket lookup is O(n), although there is a double-loop
520 * lookup within the dna in each bucket.
521 * </pre>
522 */
523 l_ok
524 ptaRemoveDupsByHash(PTA *ptas,
525 PTA **pptad,
526 L_DNAHASH **pdahash)
527 {
528 l_int32 i, n, index, items, x, y;
529 l_uint32 nsize;
530 l_uint64 key;
531 PTA *ptad;
532 L_DNAHASH *dahash;
533
534 if (pdahash) *pdahash = NULL;
535 if (!pptad)
536 return ERROR_INT("&ptad not defined", __func__, 1);
537 *pptad = NULL;
538 if (!ptas)
539 return ERROR_INT("ptas not defined", __func__, 1);
540
541 n = ptaGetCount(ptas);
542 findNextLargerPrime(n / 20, &nsize); /* buckets in hash table */
543 dahash = l_dnaHashCreate(nsize, 8);
544 ptad = ptaCreate(n);
545 *pptad = ptad;
546 for (i = 0, items = 0; i < n; i++) {
547 ptaGetIPt(ptas, i, &x, &y);
548 ptaFindPtByHash(ptad, dahash, x, y, &index);
549 if (index < 0) { /* not found */
550 l_hashPtToUint64(x, y, &key);
551 l_dnaHashAdd(dahash, key, (l_float64)items);
552 ptaAddPt(ptad, x, y);
553 items++;
554 }
555 }
556
557 if (pdahash)
558 *pdahash = dahash;
559 else
560 l_dnaHashDestroy(&dahash);
561 return 0;
562 }
563
564
565 /*!
566 * \brief ptaIntersectionByHash()
567 *
568 * \param[in] pta1, pta2
569 * \return ptad intersection of the point sets, or NULL on error
570 *
571 * <pre>
572 * Notes:
573 * (1) This is faster than ptaIntersectionByAset(), because the
574 * bucket lookup is O(n). It should be used if the pts are
575 * integers (e.g., representing pixel positions).
576 * </pre>
577 */
578 PTA *
579 ptaIntersectionByHash(PTA *pta1,
580 PTA *pta2)
581 {
582 l_int32 n1, n2, nsmall, i, x, y, index1, index2;
583 l_uint32 nsize2;
584 l_uint64 key;
585 L_DNAHASH *dahash1, *dahash2;
586 PTA *pta_small, *pta_big, *ptad;
587
588 if (!pta1)
589 return (PTA *)ERROR_PTR("pta1 not defined", __func__, NULL);
590 if (!pta2)
591 return (PTA *)ERROR_PTR("pta2 not defined", __func__, NULL);
592
593 /* Put the elements of the biggest pta into a dnahash */
594 n1 = ptaGetCount(pta1);
595 n2 = ptaGetCount(pta2);
596 pta_small = (n1 < n2) ? pta1 : pta2; /* do not destroy pta_small */
597 pta_big = (n1 < n2) ? pta2 : pta1; /* do not destroy pta_big */
598 dahash1 = l_dnaHashCreateFromPta(pta_big);
599
600 /* Build up the intersection of points. Add to ptad
601 * if the point is in pta_big (using dahash1) but hasn't
602 * yet been seen in the traversal of pta_small (using dahash2). */
603 ptad = ptaCreate(0);
604 nsmall = ptaGetCount(pta_small);
605 findNextLargerPrime(nsmall / 20, &nsize2); /* buckets in hash table */
606 dahash2 = l_dnaHashCreate(nsize2, 0);
607 for (i = 0; i < nsmall; i++) {
608 ptaGetIPt(pta_small, i, &x, &y);
609 ptaFindPtByHash(pta_big, dahash1, x, y, &index1);
610 if (index1 >= 0) { /* found */
611 ptaFindPtByHash(pta_small, dahash2, x, y, &index2);
612 if (index2 == -1) { /* not found */
613 ptaAddPt(ptad, x, y);
614 l_hashPtToUint64(x, y, &key);
615 l_dnaHashAdd(dahash2, key, (l_float64)i);
616 }
617 }
618 }
619
620 l_dnaHashDestroy(&dahash1);
621 l_dnaHashDestroy(&dahash2);
622 return ptad;
623 }
624
625
626 /*!
627 * \brief ptaFindPtByHash()
628 *
629 * \param[in] pta
630 * \param[in] dahash built from pta
631 * \param[in] x, y arbitrary points
632 * \param[out] pindex index into pta if (x,y) is in pta; -1 otherwise
633 * \return 0 if OK, 1 on error
634 *
635 * <pre>
636 * Notes:
637 * (1) Fast lookup in dnaHash associated with a pta, to see if a
638 * random point (x,y) is already stored in the hash table.
639 * (2) We use a strong hash function to minimize the chance that
640 * two different points hash to the same key value.
641 * (3) We select the number of buckets to be about 5% of the size
642 * of the input %pta, so that when fully populated, each
643 * bucket (dna) will have about 20 entries, each being an index
644 * into %pta. In lookup, after hashing to the key, and then
645 * again to the bucket, we traverse the bucket (dna), using the
646 * index into %pta to check if the point (x,y) has been found before.
647 * </pre>
648 */
649 l_ok
650 ptaFindPtByHash(PTA *pta,
651 L_DNAHASH *dahash,
652 l_int32 x,
653 l_int32 y,
654 l_int32 *pindex)
655 {
656 l_int32 i, nvals, index, xi, yi;
657 l_uint64 key;
658 L_DNA *da;
659
660 if (!pindex)
661 return ERROR_INT("&index not defined", __func__, 1);
662 *pindex = -1;
663 if (!pta)
664 return ERROR_INT("pta not defined", __func__, 1);
665 if (!dahash)
666 return ERROR_INT("dahash not defined", __func__, 1);
667
668 l_hashPtToUint64(x, y, &key);
669 da = l_dnaHashGetDna(dahash, key, L_NOCOPY);
670 if (!da) return 0;
671
672 /* Run through the da, looking for this point */
673 nvals = l_dnaGetCount(da);
674 for (i = 0; i < nvals; i++) {
675 l_dnaGetIValue(da, i, &index);
676 ptaGetIPt(pta, index, &xi, &yi);
677 if (x == xi && y == yi) {
678 *pindex = index;
679 return 0;
680 }
681 }
682
683 return 0;
684 }
685
686
687 /*!
688 * \brief l_dnaHashCreateFromPta()
689 *
690 * \param[in] pta
691 * \return dahash, or NULL on error
692 */
693 L_DNAHASH *
694 l_dnaHashCreateFromPta(PTA *pta)
695 {
696 l_int32 i, n, x, y;
697 l_uint32 nsize;
698 l_uint64 key;
699 L_DNAHASH *dahash;
700
701 if (!pta)
702 return (L_DNAHASH *)ERROR_PTR("pta not defined", __func__, NULL);
703
704 /* Build up dnaHash of indices, hashed by a key that is
705 * a large linear combination of x and y values designed to
706 * randomize the key. Having about 20 pts in each bucket is
707 * roughly optimal for speed for large sets. */
708 n = ptaGetCount(pta);
709 findNextLargerPrime(n / 20, &nsize); /* buckets in hash table */
710
711 /* Add each point, using the hash as key and the index into
712 * %ptas as the value. Storing the index enables operations
713 * that check for duplicates. */
714 dahash = l_dnaHashCreate(nsize, 8);
715 for (i = 0; i < n; i++) {
716 ptaGetIPt(pta, i, &x, &y);
717 l_hashPtToUint64(x, y, &key);
718 l_dnaHashAdd(dahash, key, (l_float64)i);
719 }
720
721 return dahash;
722 }
723
724
725 /*----------------------------------------------------------------------*
726 * Set operations on sarray using hashing *
727 *----------------------------------------------------------------------*/
728 /*!
729 * \brief sarrayRemoveDupsByHash()
730 *
731 * \param[in] sas
732 * \param[out] psad unique set of strings; duplicates removed
733 * \param[out] pdahash [optional] dnahash used for lookup
734 * \return 0 if OK, 1 on error
735 *
736 * <pre>
737 * Notes:
738 * (1) Generates a sarray with unique values.
739 * (2) The dnahash is built up with sad to assure uniqueness.
740 * It can be used to find if a string is in the set:
741 * sarrayFindValByHash(sad, dahash, str, &index)
742 * (3) The hash of the string location is simple and fast. It scales
743 * up with the number of buckets to insure a fairly random
744 * bucket selection input strings.
745 * (4) This is faster than sarrayRemoveDupsByAset(), because the
746 * bucket lookup is O(n), although there is a double-loop
747 * lookup within the dna in each bucket.
748 * </pre>
749 */
750 l_ok
751 sarrayRemoveDupsByHash(SARRAY *sas,
752 SARRAY **psad,
753 L_DNAHASH **pdahash)
754 {
755 char *str;
756 l_int32 i, n, index, items;
757 l_uint32 nsize;
758 l_uint64 key;
759 SARRAY *sad;
760 L_DNAHASH *dahash;
761
762 if (pdahash) *pdahash = NULL;
763 if (!psad)
764 return ERROR_INT("&sad not defined", __func__, 1);
765 *psad = NULL;
766 if (!sas)
767 return ERROR_INT("sas not defined", __func__, 1);
768
769 n = sarrayGetCount(sas);
770 findNextLargerPrime(n / 20, &nsize); /* buckets in hash table */
771 dahash = l_dnaHashCreate(nsize, 8);
772 sad = sarrayCreate(n);
773 *psad = sad;
774 for (i = 0, items = 0; i < n; i++) {
775 str = sarrayGetString(sas, i, L_NOCOPY);
776 sarrayFindStringByHash(sad, dahash, str, &index);
777 if (index < 0) { /* not found */
778 l_hashStringToUint64(str, &key);
779 l_dnaHashAdd(dahash, key, (l_float64)items);
780 sarrayAddString(sad, str, L_COPY);
781 items++;
782 }
783 }
784
785 if (pdahash)
786 *pdahash = dahash;
787 else
788 l_dnaHashDestroy(&dahash);
789 return 0;
790 }
791
792
793 /*!
794 * \brief sarrayIntersectionByHash()
795 *
796 * \param[in] sa1, sa2
797 * \return sad intersection of the strings, or NULL on error
798 *
799 * <pre>
800 * Notes:
801 * (1) This is faster than sarrayIntersectionByAset(), because the
802 * bucket lookup is O(n).
803 * </pre>
804 */
805 SARRAY *
806 sarrayIntersectionByHash(SARRAY *sa1,
807 SARRAY *sa2)
808 {
809 char *str;
810 l_int32 n1, n2, nsmall, i, index1, index2;
811 l_uint32 nsize2;
812 l_uint64 key;
813 L_DNAHASH *dahash1, *dahash2;
814 SARRAY *sa_small, *sa_big, *sad;
815
816 if (!sa1)
817 return (SARRAY *)ERROR_PTR("sa1 not defined", __func__, NULL);
818 if (!sa2)
819 return (SARRAY *)ERROR_PTR("sa2 not defined", __func__, NULL);
820
821 /* Put the elements of the biggest sarray into a dnahash */
822 n1 = sarrayGetCount(sa1);
823 n2 = sarrayGetCount(sa2);
824 sa_small = (n1 < n2) ? sa1 : sa2; /* do not destroy sa_small */
825 sa_big = (n1 < n2) ? sa2 : sa1; /* do not destroy sa_big */
826 dahash1 = l_dnaHashCreateFromSarray(sa_big);
827
828 /* Build up the intersection of strings. Add to %sad
829 * if the string is in sa_big (using dahash1) but hasn't
830 * yet been seen in the traversal of sa_small (using dahash2). */
831 sad = sarrayCreate(0);
832 nsmall = sarrayGetCount(sa_small);
833 findNextLargerPrime(nsmall / 20, &nsize2); /* buckets in hash table */
834 dahash2 = l_dnaHashCreate(nsize2, 0);
835 for (i = 0; i < nsmall; i++) {
836 str = sarrayGetString(sa_small, i, L_NOCOPY);
837 sarrayFindStringByHash(sa_big, dahash1, str, &index1);
838 if (index1 >= 0) {
839 sarrayFindStringByHash(sa_small, dahash2, str, &index2);
840 if (index2 == -1) {
841 sarrayAddString(sad, str, L_COPY);
842 l_hashStringToUint64(str, &key);
843 l_dnaHashAdd(dahash2, key, (l_float64)i);
844 }
845 }
846 }
847
848 l_dnaHashDestroy(&dahash1);
849 l_dnaHashDestroy(&dahash2);
850 return sad;
851 }
852
853
854 /*!
855 * \brief sarrayFindStringByHash()
856 *
857 * \param[in] sa
858 * \param[in] dahash built from sa
859 * \param[in] str arbitrary string
860 * \param[out] pindex index into %sa if %str is in %sa; -1 otherwise
861 * \return 0 if OK, 1 on error
862 *
863 * <pre>
864 * Notes:
865 * (1) Fast lookup in dnaHash associated with a sarray, to see if a
866 * random string %str is already stored in the hash table.
867 * (2) We use a strong hash function to minimize the chance that
868 * two different strings hash to the same key value.
869 * (3) We select the number of buckets to be about 5% of the size
870 * of the input sarray, so that when fully populated, each
871 * bucket (dna) will have about 20 entries, each being an index
872 * into sa. In lookup, after hashing to the key, and then
873 * again to the bucket, we traverse the bucket (dna), using the
874 * index into sa to check if %str has been found before.
875 * </pre>
876 */
877 l_ok
878 sarrayFindStringByHash(SARRAY *sa,
879 L_DNAHASH *dahash,
880 const char *str,
881 l_int32 *pindex)
882 {
883 char *stri;
884 l_int32 i, nvals, index;
885 l_uint64 key;
886 L_DNA *da;
887
888 if (!pindex)
889 return ERROR_INT("&index not defined", __func__, 1);
890 *pindex = -1;
891 if (!sa)
892 return ERROR_INT("sa not defined", __func__, 1);
893 if (!dahash)
894 return ERROR_INT("dahash not defined", __func__, 1);
895
896 l_hashStringToUint64(str, &key);
897 da = l_dnaHashGetDna(dahash, key, L_NOCOPY);
898 if (!da) return 0;
899
900 /* Run through the da, looking for this string */
901 nvals = l_dnaGetCount(da);
902 for (i = 0; i < nvals; i++) {
903 l_dnaGetIValue(da, i, &index);
904 stri = sarrayGetString(sa, index, L_NOCOPY);
905 if (!strcmp(str, stri)) { /* duplicate */
906 *pindex = index;
907 return 0;
908 }
909 }
910
911 return 0;
912 }
913
914
915 /*!
916 * \brief l_dnaHashCreateFromSarray()
917 *
918 * \param[in] sa
919 * \return dahash, or NULL on error
920 */
921 L_DNAHASH *
922 l_dnaHashCreateFromSarray(SARRAY *sa)
923 {
924 char *str;
925 l_int32 i, n;
926 l_uint32 nsize;
927 l_uint64 key;
928 L_DNAHASH *dahash;
929
930 /* Build up dnaHash of indices, hashed by a 64-bit key that
931 * should randomize the lower bits used in bucket selection.
932 * Having about 20 pts in each bucket is roughly optimal. */
933 n = sarrayGetCount(sa);
934 findNextLargerPrime(n / 20, &nsize); /* buckets in hash table */
935 /* lept_stderr("Prime used: %d\n", nsize); */
936
937 /* Add each string, using the hash as key and the index into %sa
938 * as the value. Storing the index enables operations that check
939 * for duplicates. */
940 dahash = l_dnaHashCreate(nsize, 8);
941 for (i = 0; i < n; i++) {
942 str = sarrayGetString(sa, i, L_NOCOPY);
943 l_hashStringToUint64(str, &key);
944 l_dnaHashAdd(dahash, key, (l_float64)i);
945 }
946
947 return dahash;
948 }
949