comparison mupdf-source/thirdparty/leptonica/src/sarray2.c @ 2:b50eed0cc0ef upstream

ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4. The directory name has changed: no version number in the expanded directory now.
author Franz Glasner <fzglas.hg@dom66.de>
date Mon, 15 Sep 2025 11:43:07 +0200
parents
children
comparison
equal deleted inserted replaced
1:1d09e1dec1d9 2:b50eed0cc0ef
1 /*====================================================================*
2 - Copyright (C) 2001 Leptonica. All rights reserved.
3 -
4 - Redistribution and use in source and binary forms, with or without
5 - modification, are permitted provided that the following conditions
6 - are met:
7 - 1. Redistributions of source code must retain the above copyright
8 - notice, this list of conditions and the following disclaimer.
9 - 2. Redistributions in binary form must reproduce the above
10 - copyright notice, this list of conditions and the following
11 - disclaimer in the documentation and/or other materials
12 - provided with the distribution.
13 -
14 - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
15 - ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
16 - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
17 - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL ANY
18 - CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
19 - EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
20 - PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
21 - PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
22 - OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
23 - NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
24 - SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25 *====================================================================*/
26
27 /*!
28 * \file sarray2.c
29 * <pre>
30 *
31 * Sort
32 * SARRAY *sarraySort()
33 * SARRAY *sarraySortByIndex()
34 * l_int32 stringCompareLexical()
35 *
36 * Set operations using aset (rbtree)
37 * L_ASET *l_asetCreateFromSarray()
38 * l_int32 sarrayRemoveDupsByAset()
39 * l_int32 sarrayUnionByAset()
40 * l_int32 sarrayIntersectionByAset()
41 *
42 * Hashmap operations
43 * L_HASHMAP *l_hmapCreateFromSarray()
44 * l_int32 sarrayRemoveDupsByHmap()
45 * l_int32 sarrayUnionByHmap()
46 * l_int32 sarrayIntersectionByHmap()
47 *
48 * Miscellaneous operations
49 * SARRAY *sarrayGenerateIntegers()
50 * l_int32 sarrayLookupCSKV()
51 *
52 *
53 * We have two implementations of set operations on an array of strings:
54 *
55 * (1) Using an underlying tree (rbtree).
56 * This uses a good 64 bit hashing function for the key,
57 * that is not expected to have hash collisions (and we do
58 * not test for them). The tree is built up of the hash
59 * values, and if the hash is found in the tree, it is
60 * assumed that the string has already been found.
61 *
62 * (2) Building a hashmap from the keys (hashmap).
63 * This uses a fast 64 bit hashing function for the key, which
64 * is then hashed into a hashtable. Collisions of hashkeys are
65 * very rare, but the hashtable is designed to allow more than one
66 * hashitem in a table entry. The hashitems are put in a list at
67 * each hashtable entry, which is traversed looking for the key.
68 * </pre>
69 */
70
71 #ifdef HAVE_CONFIG_H
72 #include <config_auto.h>
73 #endif /* HAVE_CONFIG_H */
74
75 #include <string.h>
76 #include "allheaders.h"
77 #include "array_internal.h"
78
79 /*----------------------------------------------------------------------*
80 * Sort *
81 *----------------------------------------------------------------------*/
82 /*!
83 * \brief sarraySort()
84 *
85 * \param[in] saout output sarray; can be NULL or equal to sain
86 * \param[in] sain input sarray
87 * \param[in] sortorder L_SORT_INCREASING or L_SORT_DECREASING
88 * \return saout output sarray, sorted by ascii value, or NULL on error
89 *
90 * <pre>
91 * Notes:
92 * (1) Set saout = sain for in-place; otherwise, set naout = NULL.
93 * (2) Shell sort, modified from K&R, 2nd edition, p.62.
94 * Slow but simple O(n logn) sort.
95 * </pre>
96 */
97 SARRAY *
98 sarraySort(SARRAY *saout,
99 SARRAY *sain,
100 l_int32 sortorder)
101 {
102 char **array;
103 char *tmp;
104 l_int32 n, i, j, gap;
105
106 if (!sain)
107 return (SARRAY *)ERROR_PTR("sain not defined", __func__, NULL);
108
109 /* Make saout if necessary; otherwise do in-place */
110 if (!saout)
111 saout = sarrayCopy(sain);
112 else if (sain != saout)
113 return (SARRAY *)ERROR_PTR("invalid: not in-place", __func__, NULL);
114 array = saout->array; /* operate directly on the array */
115 n = sarrayGetCount(saout);
116
117 /* Shell sort */
118 for (gap = n/2; gap > 0; gap = gap / 2) {
119 for (i = gap; i < n; i++) {
120 for (j = i - gap; j >= 0; j -= gap) {
121 if ((sortorder == L_SORT_INCREASING &&
122 stringCompareLexical(array[j], array[j + gap])) ||
123 (sortorder == L_SORT_DECREASING &&
124 stringCompareLexical(array[j + gap], array[j])))
125 {
126 tmp = array[j];
127 array[j] = array[j + gap];
128 array[j + gap] = tmp;
129 }
130 }
131 }
132 }
133
134 return saout;
135 }
136
137
138 /*!
139 * \brief sarraySortByIndex()
140 *
141 * \param[in] sain
142 * \param[in] naindex na that maps from the new sarray to the input sarray
143 * \return saout sorted, or NULL on error
144 */
145 SARRAY *
146 sarraySortByIndex(SARRAY *sain,
147 NUMA *naindex)
148 {
149 char *str;
150 l_int32 i, n, index;
151 SARRAY *saout;
152
153 if (!sain)
154 return (SARRAY *)ERROR_PTR("sain not defined", __func__, NULL);
155 if (!naindex)
156 return (SARRAY *)ERROR_PTR("naindex not defined", __func__, NULL);
157
158 n = sarrayGetCount(sain);
159 saout = sarrayCreate(n);
160 for (i = 0; i < n; i++) {
161 numaGetIValue(naindex, i, &index);
162 str = sarrayGetString(sain, index, L_COPY);
163 sarrayAddString(saout, str, L_INSERT);
164 }
165
166 return saout;
167 }
168
169
170 /*!
171 * \brief stringCompareLexical()
172 *
173 * \param[in] str1
174 * \param[in] str2
175 * \return 1 if str1 > str2 lexically; 0 otherwise
176 *
177 * <pre>
178 * Notes:
179 * (1) If the lexical values are identical, return a 0, to
180 * indicate that no swapping is required to sort the strings.
181 * </pre>
182 */
183 l_int32
184 stringCompareLexical(const char *str1,
185 const char *str2)
186 {
187 l_int32 i, len1, len2, len;
188
189 if (!str1)
190 return ERROR_INT("str1 not defined", __func__, 1);
191 if (!str2)
192 return ERROR_INT("str2 not defined", __func__, 1);
193
194 len1 = strlen(str1);
195 len2 = strlen(str2);
196 len = L_MIN(len1, len2);
197
198 for (i = 0; i < len; i++) {
199 if (str1[i] == str2[i])
200 continue;
201 if (str1[i] > str2[i])
202 return 1;
203 else
204 return 0;
205 }
206
207 if (len1 > len2)
208 return 1;
209 else
210 return 0;
211 }
212
213
214 /*----------------------------------------------------------------------*
215 * Set operations using aset (rbtree) *
216 *----------------------------------------------------------------------*/
217 /*!
218 * \brief l_asetCreateFromSarray()
219 *
220 * \param[in] sa
221 * \return set using a string hash into a uint64 as the key
222 */
223 L_ASET *
224 l_asetCreateFromSarray(SARRAY *sa)
225 {
226 char *str;
227 l_int32 i, n;
228 l_uint64 hash;
229 L_ASET *set;
230 RB_TYPE key;
231
232 if (!sa)
233 return (L_ASET *)ERROR_PTR("sa not defined", __func__, NULL);
234
235 set = l_asetCreate(L_UINT_TYPE);
236 n = sarrayGetCount(sa);
237 for (i = 0; i < n; i++) {
238 str = sarrayGetString(sa, i, L_NOCOPY);
239 l_hashStringToUint64Fast(str, &hash);
240 key.utype = hash;
241 l_asetInsert(set, key);
242 }
243
244 return set;
245 }
246
247
248 /*!
249 * \brief sarrayRemoveDupsByAset()
250 *
251 * \param[in] sas
252 * \param[out] psad with duplicates removed
253 * \return 0 if OK; 1 on error
254 *
255 * <pre>
256 * Notes:
257 * (1) This is O(nlogn), considerably slower than
258 * sarrayRemoveDupsByHmap() for large string arrays.
259 * (2) The key for each string is a 64-bit hash.
260 * (3) Build a set, using hashed strings as keys. As the set is
261 * built, first do a find; if not found, add the key to the
262 * set and add the string to the output sarray.
263 * </pre>
264 */
265 l_ok
266 sarrayRemoveDupsByAset(SARRAY *sas,
267 SARRAY **psad)
268 {
269 char *str;
270 l_int32 i, n;
271 l_uint64 hash;
272 L_ASET *set;
273 RB_TYPE key;
274 SARRAY *sad;
275
276 if (!psad)
277 return ERROR_INT("&sad not defined", __func__, 1);
278 *psad = NULL;
279 if (!sas)
280 return ERROR_INT("sas not defined", __func__, 1);
281
282 set = l_asetCreate(L_UINT_TYPE);
283 sad = sarrayCreate(0);
284 *psad = sad;
285 n = sarrayGetCount(sas);
286 for (i = 0; i < n; i++) {
287 str = sarrayGetString(sas, i, L_NOCOPY);
288 l_hashStringToUint64Fast(str, &hash);
289 key.utype = hash;
290 if (!l_asetFind(set, key)) {
291 sarrayAddString(sad, str, L_COPY);
292 l_asetInsert(set, key);
293 }
294 }
295
296 l_asetDestroy(&set);
297 return 0;
298 }
299
300
301 /*!
302 * \brief sarrayUnionByAset()
303 *
304 * \param[in] sa1
305 * \param[in] sa2
306 * \param[out] psad union of the two string arrays
307 * \return 0 if OK; 1 on error
308 *
309 * <pre>
310 * Notes:
311 * (1) Duplicates are removed from the concatenation of the two arrays.
312 * (2) The key for each string is a 64-bit hash.
313 * (2) Algorithm: Concatenate the two sarrays. Then build a set,
314 * using hashed strings as keys. As the set is built, first do
315 * a find; if not found, add the key to the set and add the string
316 * to the output sarray. This is O(nlogn).
317 * </pre>
318 */
319 l_ok
320 sarrayUnionByAset(SARRAY *sa1,
321 SARRAY *sa2,
322 SARRAY **psad)
323 {
324 SARRAY *sa3;
325
326 if (!psad)
327 return ERROR_INT("&sad not defined", __func__, 1);
328 *psad = NULL;
329 if (!sa1)
330 return ERROR_INT("sa1 not defined", __func__, 1);
331 if (!sa2)
332 return ERROR_INT("sa2 not defined", __func__, 1);
333
334 /* Join */
335 sa3 = sarrayCopy(sa1);
336 if (sarrayJoin(sa3, sa2) == 1) {
337 sarrayDestroy(&sa3);
338 return ERROR_INT("join failed for sa3", __func__, 1);
339 }
340
341 /* Eliminate duplicates */
342 sarrayRemoveDupsByAset(sa3, psad);
343 sarrayDestroy(&sa3);
344 return 0;
345 }
346
347
348 /*!
349 * \brief sarrayIntersectionByAset()
350 *
351 * \param[in] sa1
352 * \param[in] sa2
353 * \param[out] psad intersection of the two string arrays
354 * \return 0 if OK; 1 on error
355 *
356 * <pre>
357 * Notes:
358 * (1) Algorithm: put the larger sarray into a set, using the string
359 * hashes as the key values. Then run through the smaller sarray,
360 * building an output sarray and a second set from the strings
361 * in the larger array: if a string is in the first set but
362 * not in the second, add the string to the output sarray and hash
363 * it into the second set. The second set is required to make
364 * sure only one instance of each string is put into the output sarray.
365 * This is O(mlogn), {m,n} = sizes of {smaller,larger} input arrays.
366 * </pre>
367 */
368 l_ok
369 sarrayIntersectionByAset(SARRAY *sa1,
370 SARRAY *sa2,
371 SARRAY **psad)
372 {
373 char *str;
374 l_int32 n1, n2, i, n;
375 l_uint64 hash;
376 L_ASET *set1, *set2;
377 RB_TYPE key;
378 SARRAY *sa_small, *sa_big, *sad;
379
380 if (!psad)
381 return ERROR_INT("&sad not defined", __func__, 1);
382 *psad = NULL;
383 if (!sa1)
384 return ERROR_INT("sa1 not defined", __func__, 1);
385 if (!sa2)
386 return ERROR_INT("sa2 not defined", __func__, 1);
387
388 /* Put the elements of the biggest array into a set */
389 n1 = sarrayGetCount(sa1);
390 n2 = sarrayGetCount(sa2);
391 sa_small = (n1 < n2) ? sa1 : sa2; /* do not destroy sa_small */
392 sa_big = (n1 < n2) ? sa2 : sa1; /* do not destroy sa_big */
393 set1 = l_asetCreateFromSarray(sa_big);
394
395 /* Build up the intersection of strings */
396 sad = sarrayCreate(0);
397 *psad = sad;
398 n = sarrayGetCount(sa_small);
399 set2 = l_asetCreate(L_UINT_TYPE);
400 for (i = 0; i < n; i++) {
401 str = sarrayGetString(sa_small, i, L_NOCOPY);
402 l_hashStringToUint64Fast(str, &hash);
403 key.utype = hash;
404 if (l_asetFind(set1, key) && !l_asetFind(set2, key)) {
405 sarrayAddString(sad, str, L_COPY);
406 l_asetInsert(set2, key);
407 }
408 }
409
410 l_asetDestroy(&set1);
411 l_asetDestroy(&set2);
412 return 0;
413 }
414
415
416 /*----------------------------------------------------------------------*
417 * Hashmap operations *
418 *----------------------------------------------------------------------*/
419 /*!
420 * \brief l_hmapCreateFromSarray()
421 *
422 * \param[in] sa input sarray
423 * \return hmap hashmap, or NULL on error
424 */
425 L_HASHMAP *
426 l_hmapCreateFromSarray(SARRAY *sa)
427 {
428 l_int32 i, n;
429 l_uint64 key;
430 char *str;
431 L_HASHMAP *hmap;
432
433 if (!sa)
434 return (L_HASHMAP *)ERROR_PTR("sa not defined", __func__, NULL);
435
436 n = sarrayGetCount(sa);
437 if ((hmap = l_hmapCreate(0.51 * n, 2)) == NULL)
438 return (L_HASHMAP *)ERROR_PTR("hmap not made", __func__, NULL);
439 for (i = 0; i < n; i++) {
440 str = sarrayGetString(sa, i, L_NOCOPY);
441 l_hashStringToUint64Fast(str, &key);
442 l_hmapLookup(hmap, key, i, L_HMAP_CREATE);
443 }
444 return hmap;
445 }
446
447
448 /*!
449 * \brief sarrayRemoveDupsByHmap()
450 *
451 * \param[in] sas
452 * \param[out] psad hash set of unique values
453 * \param[out] phmap [optional] hashmap used for lookup
454 * \return 0 if OK; 1 on error
455 */
456 l_ok
457 sarrayRemoveDupsByHmap(SARRAY *sas,
458 SARRAY **psad,
459 L_HASHMAP **phmap)
460 {
461 l_int32 i, tabsize;
462 char *str;
463 SARRAY *sad;
464 L_HASHITEM *hitem;
465 L_HASHMAP *hmap;
466
467 if (phmap) *phmap = NULL;
468 if (!psad)
469 return ERROR_INT("&sad not defined", __func__, 1);
470 *psad = NULL;
471 if (!sas)
472 return ERROR_INT("sas not defined", __func__, 1);
473
474 /* Traverse the hashtable lists */
475 if ((hmap = l_hmapCreateFromSarray(sas)) == NULL)
476 return ERROR_INT("hmap not made", __func__, 1);
477 sad = sarrayCreate(0);
478 *psad = sad;
479 tabsize = hmap->tabsize;
480 for (i = 0; i < tabsize; i++) {
481 hitem = hmap->hashtab[i];
482 while (hitem) {
483 str = sarrayGetString(sas, hitem->val, L_COPY);
484 sarrayAddString(sad, str, L_INSERT);
485 hitem = hitem->next;
486 }
487 }
488
489 if (phmap)
490 *phmap = hmap;
491 else
492 l_hmapDestroy(&hmap);
493 return 0;
494 }
495
496
497 /*!
498 * \brief sarrayUnionByHmap()
499 *
500 * \param[in] sa1
501 * \param[in] sa2
502 * \param[out] *psad union of the array values
503 * \return 0 if OK; 1 on error
504 */
505 l_ok
506 sarrayUnionByHmap(SARRAY *sa1,
507 SARRAY *sa2,
508 SARRAY **psad)
509 {
510 SARRAY *sa3;
511
512 if (!psad)
513 return ERROR_INT("&sad not defined", __func__, 1);
514 *psad = NULL;
515 if (!sa1)
516 return ERROR_INT("sa1 not defined", __func__, 1);
517 if (!sa2)
518 return ERROR_INT("sa2 not defined", __func__, 1);
519
520 sa3 = sarrayCopy(sa1);
521 if (sarrayJoin(sa3, sa2) == 1) {
522 sarrayDestroy(&sa3);
523 return ERROR_INT("sa3 join failed", __func__, 1);
524 }
525 sarrayRemoveDupsByHmap(sa3, psad, NULL);
526 sarrayDestroy(&sa3);
527 return 0;
528 }
529
530
531 /*!
532 * \brief sarrayIntersectionByHmap()
533 *
534 * \param[in] sa1
535 * \param[in] sa2
536 * \param[out] *psad intersection of the array values
537 * \return 0 if OK; 1 on error
538 */
539 l_ok
540 sarrayIntersectionByHmap(SARRAY *sa1,
541 SARRAY *sa2,
542 SARRAY **psad)
543 {
544 l_int32 i, n1, n2, n;
545 l_uint64 key;
546 char *str;
547 SARRAY *sa_small, *sa_big, *sa3, *sad;
548 L_HASHITEM *hitem;
549 L_HASHMAP *hmap;
550
551 if (!psad)
552 return ERROR_INT("&sad not defined", __func__, 1);
553 *psad = NULL;
554 if (!sa1)
555 return ERROR_INT("sa1 not defined", __func__, 1);
556 if (!sa2)
557 return ERROR_INT("sa2 not defined", __func__, 1);
558
559 /* Make a hashmap for the elements of the biggest array */
560 n1 = sarrayGetCount(sa1);
561 n2 = sarrayGetCount(sa2);
562 sa_small = (n1 < n2) ? sa1 : sa2; /* do not destroy sa_small */
563 sa_big = (n1 < n2) ? sa2 : sa1; /* do not destroy sa_big */
564 if ((hmap = l_hmapCreateFromSarray(sa_big)) == NULL)
565 return ERROR_INT("hmap not made", __func__, 1);
566
567 /* Remove duplicates from the smallest array. Alternatively,
568 * we can skip this step and avoid counting duplicates in
569 * sa_small by modifying the count fields in the sa_big hashitems;
570 * e.g., see l_hmapIntersectionDna(). */
571 sarrayRemoveDupsByHmap(sa_small, &sa3, NULL);
572
573 /* Go through sa3, the set of strings derived from the smallest array,
574 * hashing into the big array table. Any string found belongs to both,
575 * so add it to the output array. */
576 sad = sarrayCreate(0);
577 *psad = sad;
578 n = sarrayGetCount(sa3);
579 for (i = 0; i < n; i++) {
580 str = sarrayGetString(sa3, i, L_NOCOPY);
581 l_hashStringToUint64Fast(str, &key);
582 hitem = l_hmapLookup(hmap, key, i, L_HMAP_CHECK);
583 if (hitem)
584 sarrayAddString(sad, str, L_COPY);
585 }
586 l_hmapDestroy(&hmap);
587 sarrayDestroy(&sa3);
588 return 0;
589 }
590
591
592 /*----------------------------------------------------------------------*
593 * Miscellaneous operations *
594 *----------------------------------------------------------------------*/
595 /*!
596 * \brief sarrayGenerateIntegers()
597 *
598 * \param[in] n
599 * \return sa of printed numbers, 1 - n, or NULL on error
600 */
601 SARRAY *
602 sarrayGenerateIntegers(l_int32 n)
603 {
604 char buf[32];
605 l_int32 i;
606 SARRAY *sa;
607
608 if ((sa = sarrayCreate(n)) == NULL)
609 return (SARRAY *)ERROR_PTR("sa not made", __func__, NULL);
610 for (i = 0; i < n; i++) {
611 snprintf(buf, sizeof(buf), "%d", i);
612 sarrayAddString(sa, buf, L_COPY);
613 }
614 return sa;
615 }
616
617
618 /*!
619 * \brief sarrayLookupCSKV()
620 *
621 * \param[in] sa of strings, each being a comma-separated pair
622 * of strings, the first being a key and the
623 * second a value
624 * \param[in] keystring an input string to match with each key in %sa
625 * \param[out] pvalstring the returned value string corresponding to the
626 * input key string, if found; otherwise NULL
627 * \return 0 if OK, 1 on error
628 *
629 * <pre>
630 * Notes:
631 * (1) The input %sa can have other strings that are not in
632 * comma-separated key-value format. These will be ignored.
633 * (2) This returns a copy of the first value string in %sa whose
634 * key string matches the input %keystring.
635 * (3) White space is not ignored; all white space before the ','
636 * is used for the keystring in matching. This allows the
637 * key and val strings to have white space (e.g., multiple words).
638 * </pre>
639 */
640 l_ok
641 sarrayLookupCSKV(SARRAY *sa,
642 const char *keystring,
643 char **pvalstring)
644 {
645 char *key, *val, *str;
646 l_int32 i, n;
647 SARRAY *sa1;
648
649 if (!pvalstring)
650 return ERROR_INT("&valstring not defined", __func__, 1);
651 *pvalstring = NULL;
652 if (!sa)
653 return ERROR_INT("sa not defined", __func__, 1);
654 if (!keystring)
655 return ERROR_INT("keystring not defined", __func__, 1);
656
657 n = sarrayGetCount(sa);
658 for (i = 0; i < n; i++) {
659 str = sarrayGetString(sa, i, L_NOCOPY);
660 sa1 = sarrayCreate(2);
661 sarraySplitString(sa1, str, ",");
662 if (sarrayGetCount(sa1) != 2) {
663 sarrayDestroy(&sa1);
664 continue;
665 }
666 key = sarrayGetString(sa1, 0, L_NOCOPY);
667 val = sarrayGetString(sa1, 1, L_NOCOPY);
668 if (!strcmp(key, keystring)) {
669 *pvalstring = stringNew(val);
670 sarrayDestroy(&sa1);
671 return 0;
672 }
673 sarrayDestroy(&sa1);
674 }
675
676 return 0;
677 }