comparison mupdf-source/thirdparty/leptonica/src/sarray1.c @ 2:b50eed0cc0ef upstream

ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4. The directory name has changed: no version number in the expanded directory now.
author Franz Glasner <fzglas.hg@dom66.de>
date Mon, 15 Sep 2025 11:43:07 +0200
parents
children
comparison
equal deleted inserted replaced
1:1d09e1dec1d9 2:b50eed0cc0ef
1 /*====================================================================*
2 - Copyright (C) 2001 Leptonica. All rights reserved.
3 -
4 - Redistribution and use in source and binary forms, with or without
5 - modification, are permitted provided that the following conditions
6 - are met:
7 - 1. Redistributions of source code must retain the above copyright
8 - notice, this list of conditions and the following disclaimer.
9 - 2. Redistributions in binary form must reproduce the above
10 - copyright notice, this list of conditions and the following
11 - disclaimer in the documentation and/or other materials
12 - provided with the distribution.
13 -
14 - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
15 - ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
16 - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
17 - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL ANY
18 - CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
19 - EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
20 - PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
21 - PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
22 - OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
23 - NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
24 - SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25 *====================================================================*/
26
27 /*!
28 * \file sarray1.c
29 * <pre>
30 *
31 * Create/Destroy/Copy
32 * SARRAY *sarrayCreate()
33 * SARRAY *sarrayCreateInitialized()
34 * SARRAY *sarrayCreateWordsFromString()
35 * SARRAY *sarrayCreateLinesFromString()
36 * void sarrayDestroy()
37 * SARRAY *sarrayCopy()
38 * SARRAY *sarrayClone()
39 *
40 * Add/Remove string
41 * l_int32 sarrayAddString()
42 * static l_int32 sarrayExtendArray()
43 * char *sarrayRemoveString()
44 * l_int32 sarrayReplaceString()
45 * l_int32 sarrayClear()
46 *
47 * Accessors
48 * l_int32 sarrayGetCount()
49 * char **sarrayGetArray()
50 * char *sarrayGetString()
51 *
52 * Conversion back to string
53 * char *sarrayToString()
54 * char *sarrayToStringRange()
55 *
56 * Concatenate strings uniformly within the sarray
57 * SARRAY *sarrayConcatUniformly()
58 *
59 * Join 2 sarrays
60 * l_int32 sarrayJoin()
61 * l_int32 sarrayAppendRange()
62 *
63 * Pad an sarray to be the same size as another sarray
64 * l_int32 sarrayPadToSameSize()
65 *
66 * Convert word sarray to (formatted) line sarray
67 * SARRAY *sarrayConvertWordsToLines()
68 *
69 * Split string on separator list
70 * SARRAY *sarraySplitString()
71 *
72 * Filter sarray
73 * SARRAY *sarraySelectBySubstring()
74 * SARRAY *sarraySelectRange()
75 * l_int32 sarrayParseRange()
76 *
77 * Serialize for I/O
78 * SARRAY *sarrayRead()
79 * SARRAY *sarrayReadStream()
80 * SARRAY *sarrayReadMem()
81 * l_int32 sarrayWrite()
82 * l_int32 sarrayWriteStream()
83 * l_int32 sarrayWriteStderr()
84 * l_int32 sarrayWriteMem()
85 * l_int32 sarrayAppend()
86 *
87 * Directory filenames
88 * SARRAY *getNumberedPathnamesInDirectory()
89 * SARRAY *getSortedPathnamesInDirectory()
90 * SARRAY *convertSortedToNumberedPathnames()
91 * SARRAY *getFilenamesInDirectory()
92 *
93 * These functions are important for efficient manipulation
94 * of string data, and they have found widespread use in
95 * leptonica. For example:
96 * (1) to generate text files: e.g., PostScript and PDF
97 * wrappers around sets of images
98 * (2) to parse text files: e.g., extracting prototypes
99 * from the source to generate allheaders.h
100 * (3) to generate code for compilation: e.g., the fast
101 * dwa code for arbitrary structuring elements.
102 *
103 * Comments on usage:
104 *
105 * The user is responsible for correctly disposing of strings
106 * that have been extracted from sarrays. In the following,
107 * "str-not-owned" means the returned handle does not own the string,
108 * and "str-owned" means the returned handle owns the string.
109 * - To extract a string from an Sarray in order to inspect it
110 * or to make a copy of it later, get a handle to it:
111 * copyflag = L_NOCOPY.
112 * In this case, you must neither free the string nor put it
113 * directly in another array:
114 * str-not-owned = sarrayGetString(sa, index, L_NOCOPY);
115 * - To extract a copy of a string from an Sarray, use:
116 * str-owned = sarrayGetString(sa, index, L_COPY);
117 * - To insert a string that is in one array into another
118 * array (always leaving the first array intact), there are
119 * two options:
120 * (1) use copyflag = L_COPY to make an immediate copy,
121 * which you then add to the second array by insertion:
122 * str-owned = sarrayGetString(sa, index, L_COPY);
123 * sarrayAddString(sa2, str-owned, L_INSERT);
124 * (2) use copyflag = L_NOCOPY to get another handle to
125 * the string; you then add a copy of it to the
126 * second string array:
127 * str-not-owned = sarrayGetString(sa, index, L_NOCOPY);
128 * sarrayAddString(sa2, str-not-owned, L_COPY).
129 * sarrayAddString() transfers ownership to the Sarray, so never
130 * use L_INSERT if the string is owned by another array.
131 *
132 * In all cases, when you use copyflag = L_COPY to extract
133 * a string from an array, you must either free it
134 * or insert it in an array that will be freed later.
135 * </pre>
136 */
137
138 #ifdef HAVE_CONFIG_H
139 #include <config_auto.h>
140 #endif /* HAVE_CONFIG_H */
141
142 #include <string.h>
143 #ifndef _WIN32
144 #include <dirent.h> /* unix only */
145 #include <sys/stat.h>
146 #include <limits.h> /* needed for realpath() */
147 #include <stdlib.h> /* needed for realpath() */
148 #endif /* ! _WIN32 */
149 #include "allheaders.h"
150 #include "array_internal.h"
151
152 static const l_uint32 MaxPtrArraySize = 50000000; /*!< upper bound (50 million) on the number of string ptrs in an sarray */
153 static const l_int32 InitialPtrArraySize = 50; /*!< default ptr array size, used when the requested size is out of range; the value is arbitrary */
154
155 /* Static functions */
156 static l_int32 sarrayExtendArray(SARRAY *sa);
157
158
159 /*--------------------------------------------------------------------------*
160 * String array create/destroy/copy/extend *
161 *--------------------------------------------------------------------------*/
162 /*!
163 * \brief sarrayCreate()
164 *
165 * \param[in] n size of string ptr array to be alloc'd; use 0 for default
166 * \return sarray, or NULL on error
167 */
168 SARRAY *
169 sarrayCreate(l_int32 n)
170 {
171 SARRAY *sa;
172
173 if (n <= 0 || n > (l_int32)MaxPtrArraySize)
174 n = InitialPtrArraySize;
175
176 sa = (SARRAY *)LEPT_CALLOC(1, sizeof(SARRAY));
177 if ((sa->array = (char **)LEPT_CALLOC(n, sizeof(char *))) == NULL) {
178 sarrayDestroy(&sa);
179 return (SARRAY *)ERROR_PTR("ptr array not made", __func__, NULL);
180 }
181
182 sa->nalloc = n;
183 sa->n = 0;
184 sa->refcount = 1;
185 return sa;
186 }
187
188
189 /*!
190 * \brief sarrayCreateInitialized()
191 *
192 * \param[in] n size of string ptr array to be alloc'd
193 * \param[in] initstr string to be initialized on the full array
194 * \return sarray, or NULL on error
195 */
196 SARRAY *
197 sarrayCreateInitialized(l_int32 n,
198 const char *initstr)
199 {
200 l_int32 i;
201 SARRAY *sa;
202
203 if (n <= 0)
204 return (SARRAY *)ERROR_PTR("n must be > 0", __func__, NULL);
205 if (!initstr)
206 return (SARRAY *)ERROR_PTR("initstr not defined", __func__, NULL);
207
208 sa = sarrayCreate(n);
209 for (i = 0; i < n; i++)
210 sarrayAddString(sa, initstr, L_COPY);
211 return sa;
212 }
213
214
215 /*!
216 * \brief sarrayCreateWordsFromString()
217 *
218 * \param[in] string
219 * \return sarray, or NULL on error
220 *
221 * <pre>
222 * Notes:
223 * (1) This finds the number of word substrings, creates an sarray
224 * of this size, and puts copies of each substring into the sarray.
225 * </pre>
226 */
227 SARRAY *
228 sarrayCreateWordsFromString(const char *string)
229 {
230 char separators[] = " \n\t";
231 l_int32 i, nsub, size, inword;
232 SARRAY *sa;
233
234 if (!string)
235 return (SARRAY *)ERROR_PTR("textstr not defined", __func__, NULL);
236
237 /* Find the number of words */
238 size = strlen(string);
239 nsub = 0;
240 inword = FALSE;
241 for (i = 0; i < size; i++) {
242 if (inword == FALSE &&
243 (string[i] != ' ' && string[i] != '\t' && string[i] != '\n')) {
244 inword = TRUE;
245 nsub++;
246 } else if (inword == TRUE &&
247 (string[i] == ' ' || string[i] == '\t' || string[i] == '\n')) {
248 inword = FALSE;
249 }
250 }
251
252 if ((sa = sarrayCreate(nsub)) == NULL)
253 return (SARRAY *)ERROR_PTR("sa not made", __func__, NULL);
254 sarraySplitString(sa, string, separators);
255
256 return sa;
257 }
258
259
260 /*!
261 * \brief sarrayCreateLinesFromString()
262 *
263 * \param[in] string
264 * \param[in] blankflag 0 to exclude blank lines; 1 to include
265 * \return sarray, or NULL on error
266 *
267 * <pre>
268 * Notes:
269 * (1) This finds the number of line substrings, each of which
270 * ends with a newline, and puts a copy of each substring
271 * in a new sarray.
272 * (2) The newline characters are removed from each substring.
273 * </pre>
274 */
275 SARRAY *
276 sarrayCreateLinesFromString(const char *string,
277 l_int32 blankflag)
278 {
279 l_int32 i, nsub, size, startptr;
280 char *cstring, *substring;
281 SARRAY *sa;
282
283 if (!string)
284 return (SARRAY *)ERROR_PTR("textstr not defined", __func__, NULL);
285
286 /* Find the number of lines */
287 size = strlen(string);
288 nsub = 0;
289 for (i = 0; i < size; i++) {
290 if (string[i] == '\n')
291 nsub++;
292 }
293
294 if ((sa = sarrayCreate(nsub)) == NULL)
295 return (SARRAY *)ERROR_PTR("sa not made", __func__, NULL);
296
297 if (blankflag) { /* keep blank lines as null strings */
298 /* Make a copy for munging */
299 if ((cstring = stringNew(string)) == NULL) {
300 sarrayDestroy(&sa);
301 return (SARRAY *)ERROR_PTR("cstring not made", __func__, NULL);
302 }
303 /* We'll insert nulls like strtok */
304 startptr = 0;
305 for (i = 0; i < size; i++) {
306 if (cstring[i] == '\n') {
307 cstring[i] = '\0';
308 if (i > 0 && cstring[i - 1] == '\r')
309 cstring[i - 1] = '\0'; /* also remove Windows CR */
310 if ((substring = stringNew(cstring + startptr)) == NULL) {
311 sarrayDestroy(&sa);
312 LEPT_FREE(cstring);
313 return (SARRAY *)ERROR_PTR("substring not made",
314 __func__, NULL);
315 }
316 sarrayAddString(sa, substring, L_INSERT);
317 /* lept_stderr("substring = %s\n", substring); */
318 startptr = i + 1;
319 }
320 }
321 if (startptr < size) { /* no newline at end of last line */
322 if ((substring = stringNew(cstring + startptr)) == NULL) {
323 sarrayDestroy(&sa);
324 LEPT_FREE(cstring);
325 return (SARRAY *)ERROR_PTR("substring not made",
326 __func__, NULL);
327 }
328 sarrayAddString(sa, substring, L_INSERT);
329 /* lept_stderr("substring = %s\n", substring); */
330 }
331 LEPT_FREE(cstring);
332 } else { /* remove blank lines; use strtok */
333 sarraySplitString(sa, string, "\r\n");
334 }
335
336 return sa;
337 }
338
339
340 /*!
341 * \brief sarrayDestroy()
342 *
343 * \param[in,out] psa will be set to null before returning
344 * \return void
345 *
346 * <pre>
347 * Notes:
348 * (1) Decrements the ref count and, if 0, destroys the sarray.
349 * (2) Always nulls the input ptr.
350 * </pre>
351 */
352 void
353 sarrayDestroy(SARRAY **psa)
354 {
355 l_int32 i;
356 SARRAY *sa;
357
358 if (psa == NULL) {
359 L_WARNING("ptr address is NULL!\n", __func__);
360 return;
361 }
362 if ((sa = *psa) == NULL)
363 return;
364
365 if (--sa->refcount == 0) {
366 if (sa->array) {
367 for (i = 0; i < sa->n; i++) {
368 if (sa->array[i])
369 LEPT_FREE(sa->array[i]);
370 }
371 LEPT_FREE(sa->array);
372 }
373 LEPT_FREE(sa);
374 }
375 *psa = NULL;
376 }
377
378
379 /*!
380 * \brief sarrayCopy()
381 *
382 * \param[in] sa string array
383 * \return copy of sarray, or NULL on error
384 */
385 SARRAY *
386 sarrayCopy(SARRAY *sa)
387 {
388 l_int32 i;
389 SARRAY *csa;
390
391 if (!sa)
392 return (SARRAY *)ERROR_PTR("sa not defined", __func__, NULL);
393
394 if ((csa = sarrayCreate(sa->nalloc)) == NULL)
395 return (SARRAY *)ERROR_PTR("csa not made", __func__, NULL);
396
397 for (i = 0; i < sa->n; i++)
398 sarrayAddString(csa, sa->array[i], L_COPY);
399
400 return csa;
401 }
402
403
404 /*!
405 * \brief sarrayClone()
406 *
407 * \param[in] sa string array
408 * \return ptr to same sarray, or NULL on error
409 */
410 SARRAY *
411 sarrayClone(SARRAY *sa)
412 {
413 if (!sa)
414 return (SARRAY *)ERROR_PTR("sa not defined", __func__, NULL);
415 ++sa->refcount;
416 return sa;
417 }
418
419
420 /*!
421 * \brief sarrayAddString()
422 *
423 * \param[in] sa string array
424 * \param[in] string string to be added
425 * \param[in] copyflag L_INSERT, L_NOCOPY or L_COPY
426 * \return 0 if OK, 1 on error
427 *
428 * <pre>
429 * Notes:
430 * (1) See usage comments at the top of this file. L_INSERT is
431 * equivalent to L_NOCOPY.
432 * </pre>
433 */
434 l_ok
435 sarrayAddString(SARRAY *sa,
436 const char *string,
437 l_int32 copyflag)
438 {
439 l_int32 n;
440
441 if (!sa)
442 return ERROR_INT("sa not defined", __func__, 1);
443 if (!string)
444 return ERROR_INT("string not defined", __func__, 1);
445 if (copyflag != L_INSERT && copyflag != L_NOCOPY && copyflag != L_COPY)
446 return ERROR_INT("invalid copyflag", __func__, 1);
447
448 n = sarrayGetCount(sa);
449 if (n >= sa->nalloc) {
450 if (sarrayExtendArray(sa))
451 return ERROR_INT("extension failed", __func__, 1);
452 }
453
454 if (copyflag == L_COPY)
455 sa->array[n] = stringNew(string);
456 else /* L_INSERT or L_NOCOPY */
457 sa->array[n] = (char *)string;
458 sa->n++;
459 return 0;
460 }
461
462
463 /*!
464 * \brief sarrayExtendArray()
465 *
466 * \param[in] sa string array
467 * \return 0 if OK, 1 on error
468 *
469 * <pre>
470 * Notes:
471 * (1) Doubles the size of the string ptr array.
472 * (2) The max number of strings is 50M.
473 * </pre>
474 */
475 static l_int32
476 sarrayExtendArray(SARRAY *sa)
477 {
478 size_t oldsize, newsize;
479
480 if (!sa)
481 return ERROR_INT("sa not defined", __func__, 1);
482 if (sa->nalloc >= (l_int32)MaxPtrArraySize) /* belt & suspenders */
483 return ERROR_INT("sa at maximum ptr size; can't extend", __func__, 1);
484 oldsize = sa->nalloc * sizeof(char *);
485 if (sa->nalloc > (l_int32)(MaxPtrArraySize / 2)) {
486 newsize = MaxPtrArraySize * sizeof(char *);
487 sa->nalloc = (l_int32)MaxPtrArraySize;
488 } else {
489 newsize = 2 * oldsize;
490 sa->nalloc *= 2;
491 }
492 if ((sa->array = (char **)reallocNew((void **)&sa->array,
493 oldsize, newsize)) == NULL)
494 return ERROR_INT("new ptr array not returned", __func__, 1);
495
496 return 0;
497 }
498
499
500 /*!
501 * \brief sarrayRemoveString()
502 *
503 * \param[in] sa string array
504 * \param[in] index of string within sarray
505 * \return removed string, or NULL on error
506 */
507 char *
508 sarrayRemoveString(SARRAY *sa,
509 l_int32 index)
510 {
511 char *string;
512 char **array;
513 l_int32 i, n, nalloc;
514
515 if (!sa)
516 return (char *)ERROR_PTR("sa not defined", __func__, NULL);
517
518 if ((array = sarrayGetArray(sa, &nalloc, &n)) == NULL)
519 return (char *)ERROR_PTR("array not returned", __func__, NULL);
520
521 if (index < 0 || index >= n)
522 return (char *)ERROR_PTR("array index out of bounds", __func__, NULL);
523
524 string = array[index];
525
526 /* If removed string is not at end of array, shift
527 * to fill in, maintaining original ordering.
528 * Note: if we didn't care about the order, we could
529 * put the last string array[n - 1] directly into the hole. */
530 for (i = index; i < n - 1; i++)
531 array[i] = array[i + 1];
532
533 sa->n--;
534 return string;
535 }
536
537
538 /*!
539 * \brief sarrayReplaceString()
540 *
541 * \param[in] sa string array
542 * \param[in] index of string within sarray to be replaced
543 * \param[in] newstr string to replace existing one
544 * \param[in] copyflag L_INSERT, L_COPY
545 * \return 0 if OK, 1 on error
546 *
547 * <pre>
548 * Notes:
549 * (1) This destroys an existing string and replaces it with
550 * the new string or a copy of it.
551 * (2) By design, an sarray is always compacted, so there are
552 * never any holes (null ptrs) in the ptr array up to the
553 * current count.
554 * </pre>
555 */
556 l_ok
557 sarrayReplaceString(SARRAY *sa,
558 l_int32 index,
559 char *newstr,
560 l_int32 copyflag)
561 {
562 char *str;
563 l_int32 n;
564
565 if (!sa)
566 return ERROR_INT("sa not defined", __func__, 1);
567 n = sarrayGetCount(sa);
568 if (index < 0 || index >= n)
569 return ERROR_INT("array index out of bounds", __func__, 1);
570 if (!newstr)
571 return ERROR_INT("newstr not defined", __func__, 1);
572 if (copyflag != L_INSERT && copyflag != L_COPY)
573 return ERROR_INT("invalid copyflag", __func__, 1);
574
575 LEPT_FREE(sa->array[index]);
576 if (copyflag == L_INSERT)
577 str = newstr;
578 else /* L_COPY */
579 str = stringNew(newstr);
580 sa->array[index] = str;
581 return 0;
582 }
583
584
585 /*!
586 * \brief sarrayClear()
587 *
588 * \param[in] sa string array
589 * \return 0 if OK; 1 on error
590 */
591 l_ok
592 sarrayClear(SARRAY *sa)
593 {
594 l_int32 i;
595
596 if (!sa)
597 return ERROR_INT("sa not defined", __func__, 1);
598 for (i = 0; i < sa->n; i++) { /* free strings and null ptrs */
599 LEPT_FREE(sa->array[i]);
600 sa->array[i] = NULL;
601 }
602 sa->n = 0;
603 return 0;
604 }
605
606
607 /*----------------------------------------------------------------------*
608 * Accessors *
609 *----------------------------------------------------------------------*/
610 /*!
611 * \brief sarrayGetCount()
612 *
613 * \param[in] sa string array
614 * \return count, or 0 if no strings or on error
615 */
616 l_int32
617 sarrayGetCount(SARRAY *sa)
618 {
619 if (!sa)
620 return ERROR_INT("sa not defined", __func__, 0);
621 return sa->n;
622 }
623
624
625 /*!
626 * \brief sarrayGetArray()
627 *
628 * \param[in] sa string array
629 * \param[out] pnalloc [optional] number allocated string ptrs
630 * \param[out] pn [optional] number allocated strings
631 * \return ptr to string array, or NULL on error
632 *
633 * <pre>
634 * Notes:
635 * (1) Caution: the returned array is not a copy, so caller
636 * must not destroy it!
637 * </pre>
638 */
639 char **
640 sarrayGetArray(SARRAY *sa,
641 l_int32 *pnalloc,
642 l_int32 *pn)
643 {
644 char **array;
645
646 if (!sa)
647 return (char **)ERROR_PTR("sa not defined", __func__, NULL);
648
649 array = sa->array;
650 if (pnalloc) *pnalloc = sa->nalloc;
651 if (pn) *pn = sa->n;
652
653 return array;
654 }
655
656
657 /*!
658 * \brief sarrayGetString()
659 *
660 * \param[in] sa string array
661 * \param[in] index to the index-th string
662 * \param[in] copyflag L_NOCOPY or L_COPY
663 * \return string, or NULL on error
664 *
665 * <pre>
666 * Notes:
667 * (1) See usage comments at the top of this file.
668 * (2) To get a pointer to the string itself, use L_NOCOPY.
669 * To get a copy of the string, use L_COPY.
670 * </pre>
671 */
672 char *
673 sarrayGetString(SARRAY *sa,
674 l_int32 index,
675 l_int32 copyflag)
676 {
677 if (!sa)
678 return (char *)ERROR_PTR("sa not defined", __func__, NULL);
679 if (index < 0 || index >= sa->n)
680 return (char *)ERROR_PTR("index not valid", __func__, NULL);
681 if (copyflag != L_NOCOPY && copyflag != L_COPY)
682 return (char *)ERROR_PTR("invalid copyflag", __func__, NULL);
683
684 if (copyflag == L_NOCOPY)
685 return sa->array[index];
686 else /* L_COPY */
687 return stringNew(sa->array[index]);
688 }
689
690
691 /*----------------------------------------------------------------------*
692 * Conversion to string *
693 *----------------------------------------------------------------------*/
694 /*!
695 * \brief sarrayToString()
696 *
697 * \param[in] sa string array
698 * \param[in] addnlflag flag: 0 adds nothing to each substring
699 * 1 adds '\n' to each substring
700 * 2 adds ' ' to each substring
701 * 3 adds ',' to each substring
702 * \return dest string, or NULL on error
703 *
704 * <pre>
705 * Notes:
706 * (1) Concatenates all the strings in the sarray, preserving
707 * all white space.
708 * (2) If addnlflag != 0, adds '\n', ' ' or ',' after each substring.
709 * (3) This function was NOT implemented as:
710 * for (i = 0; i < n; i++)
711 * strcat(dest, sarrayGetString(sa, i, L_NOCOPY));
712 * Do you see why?
713 * </pre>
714 */
715 char *
716 sarrayToString(SARRAY *sa,
717 l_int32 addnlflag)
718 {
719 if (!sa)
720 return (char *)ERROR_PTR("sa not defined", __func__, NULL);
721
722 return sarrayToStringRange(sa, 0, 0, addnlflag);
723 }
724
725
726 /*!
727 * \brief sarrayToStringRange()
728 *
729 * \param[in] sa string array
730 * \param[in] first index of first string to use; starts with 0
731 * \param[in] nstrings number of strings to append into the result; use
732 * 0 to append to the end of the sarray
733 * \param[in] addnlflag flag: 0 adds nothing to each substring
734 * 1 adds '\n' to each substring
735 * 2 adds ' ' to each substring
736 * 3 adds ',' to each substring
737 * \return dest string, or NULL on error
738 *
739 * <pre>
740 * Notes:
741 * (1) Concatenates the specified strings in the sarray, preserving
742 * all white space.
743 * (2) If addnlflag != 0, adds '\n', ' ' or ',' after each substring.
744 * (3) If the sarray is empty, this returns a string with just
745 * the character corresponding to %addnlflag.
746 * </pre>
747 */
748 char *
749 sarrayToStringRange(SARRAY *sa,
750 l_int32 first,
751 l_int32 nstrings,
752 l_int32 addnlflag)
753 {
754 char *dest, *src, *str;
755 l_int32 n, i, last, size, index, len;
756
757 if (!sa)
758 return (char *)ERROR_PTR("sa not defined", __func__, NULL);
759 if (addnlflag != 0 && addnlflag != 1 && addnlflag != 2 && addnlflag != 3)
760 return (char *)ERROR_PTR("invalid addnlflag", __func__, NULL);
761
762 n = sarrayGetCount(sa);
763
764 /* Empty sa; return char corresponding to addnlflag only */
765 if (n == 0) {
766 if (first == 0) {
767 if (addnlflag == 0)
768 return stringNew("");
769 if (addnlflag == 1)
770 return stringNew("\n");
771 if (addnlflag == 2)
772 return stringNew(" ");
773 else /* addnlflag == 3) */
774 return stringNew(",");
775 } else {
776 return (char *)ERROR_PTR("first not valid", __func__, NULL);
777 }
778 }
779
780 /* Determine the range of string indices to be used */
781 if (first < 0 || first >= n)
782 return (char *)ERROR_PTR("first not valid", __func__, NULL);
783 if (nstrings == 0 || (nstrings > n - first))
784 nstrings = n - first; /* no overflow */
785 last = first + nstrings - 1;
786
787 /* Determine the size of the output string */
788 size = 0;
789 for (i = first; i <= last; i++) {
790 if ((str = sarrayGetString(sa, i, L_NOCOPY)) == NULL)
791 return (char *)ERROR_PTR("str not found", __func__, NULL);
792 size += strlen(str) + 2;
793 }
794 if ((dest = (char *)LEPT_CALLOC(size + 1, sizeof(char))) == NULL)
795 return (char *)ERROR_PTR("dest not made", __func__, NULL);
796
797 /* Construct the output */
798 index = 0;
799 for (i = first; i <= last; i++) {
800 src = sarrayGetString(sa, i, L_NOCOPY);
801 len = strlen(src);
802 memcpy(dest + index, src, len);
803 index += len;
804 if (addnlflag == 1) {
805 dest[index] = '\n';
806 index++;
807 } else if (addnlflag == 2) {
808 dest[index] = ' ';
809 index++;
810 } else if (addnlflag == 3) {
811 dest[index] = ',';
812 index++;
813 }
814 }
815
816 return dest;
817 }
818
819
820 /*----------------------------------------------------------------------*
821 * Concatenate strings uniformly within the sarray *
822 *----------------------------------------------------------------------*/
823 /*!
824 * \brief sarrayConcatUniformly()
825 *
826 * \param[in] sa string array
827 * \param[in] n number of strings in output sarray
828 * \param[in] addnlflag flag: 0 adds nothing to each substring
829 * 1 adds '\n' to each substring
830 * 2 adds ' ' to each substring
831 * 3 adds ',' to each substring
832 * \return dest sarray, or NULL on error
833 *
834 * <pre>
835 * Notes:
836 * (1) Divides %sa into %n essentially equal sets of strings,
837 * concatenates each set individually, and makes an output
838 * sarray with the %n concatenations. %n must not exceed the
839 * number of strings in %sa.
840 * (2) If addnlflag != 0, adds '\n', ' ' or ',' after each substring.
841 * </pre>
842 */
843 SARRAY *
844 sarrayConcatUniformly(SARRAY *sa,
845 l_int32 n,
846 l_int32 addnlflag)
847 {
848 l_int32 i, first, ntot, nstr;
849 char *str;
850 NUMA *na;
851 SARRAY *saout;
852
853 if (!sa)
854 return (SARRAY *)ERROR_PTR("sa not defined", __func__, NULL);
855 ntot = sarrayGetCount(sa);
856 if (n < 1)
857 return (SARRAY *)ERROR_PTR("n must be >= 1", __func__, NULL);
858 if (n > ntot) {
859 L_ERROR("n = %d > ntot = %d\n", __func__, n, ntot);
860 return NULL;
861 }
862 if (addnlflag != 0 && addnlflag != 1 && addnlflag != 2 && addnlflag != 3)
863 return (SARRAY *)ERROR_PTR("invalid addnlflag", __func__, NULL);
864
865 saout = sarrayCreate(0);
866 na = numaGetUniformBinSizes(ntot, n);
867 for (i = 0, first = 0; i < n; i++) {
868 numaGetIValue(na, i, &nstr);
869 str = sarrayToStringRange(sa, first, nstr, addnlflag);
870 sarrayAddString(saout, str, L_INSERT);
871 first += nstr;
872 }
873 numaDestroy(&na);
874 return saout;
875 }
876
877
878 /*----------------------------------------------------------------------*
879 * Join 2 sarrays *
880 *----------------------------------------------------------------------*/
881 /*!
882 * \brief sarrayJoin()
883 *
884 * \param[in] sa1 to be added to
885 * \param[in] sa2 append to sa1
886 * \return 0 if OK, 1 on error
887 *
888 * <pre>
889 * Notes:
890 * (1) Copies of the strings in sarray2 are added to sarray1.
891 * </pre>
892 */
893 l_ok
894 sarrayJoin(SARRAY *sa1,
895 SARRAY *sa2)
896 {
897 char *str;
898 l_int32 n, i;
899
900 if (!sa1)
901 return ERROR_INT("sa1 not defined", __func__, 1);
902 if (!sa2)
903 return ERROR_INT("sa2 not defined", __func__, 1);
904
905 n = sarrayGetCount(sa2);
906 for (i = 0; i < n; i++) {
907 str = sarrayGetString(sa2, i, L_NOCOPY);
908 if (sarrayAddString(sa1, str, L_COPY) == 1) {
909 L_ERROR("failed to add string at i = %d\n", __func__, i);
910 return 1;
911 }
912 }
913 return 0;
914 }
915
916
917 /*!
918 * \brief sarrayAppendRange()
919 *
920 * \param[in] sa1 to be added to
921 * \param[in] sa2 append specified range of strings in sa2 to sa1
922 * \param[in] start index of first string of sa2 to append
923 * \param[in] end index of last string of sa2 to append;
924 * -1 to append to end of array
925 * \return 0 if OK, 1 on error
926 *
927 * <pre>
928 * Notes:
929 * (1) Copies of the strings in sarray2 are added to sarray1.
930 * (2) The [start ... end] range is truncated if necessary.
931 * (3) Use end == -1 to append to the end of sa2.
932 * </pre>
933 */
934 l_ok
935 sarrayAppendRange(SARRAY *sa1,
936 SARRAY *sa2,
937 l_int32 start,
938 l_int32 end)
939 {
940 char *str;
941 l_int32 n, i;
942
943 if (!sa1)
944 return ERROR_INT("sa1 not defined", __func__, 1);
945 if (!sa2)
946 return ERROR_INT("sa2 not defined", __func__, 1);
947
948 if (start < 0)
949 start = 0;
950 n = sarrayGetCount(sa2);
951 if (end < 0 || end >= n)
952 end = n - 1;
953 if (start > end)
954 return ERROR_INT("start > end", __func__, 1);
955
956 for (i = start; i <= end; i++) {
957 str = sarrayGetString(sa2, i, L_NOCOPY);
958 sarrayAddString(sa1, str, L_COPY);
959 }
960
961 return 0;
962 }
963
964
965 /*----------------------------------------------------------------------*
966 * Pad an sarray to be the same size as another sarray *
967 *----------------------------------------------------------------------*/
968 /*!
969 * \brief sarrayPadToSameSize()
970 *
971 * \param[in] sa1, sa2
972 * \param[in] padstring
973 * \return 0 if OK, 1 on error
974 *
975 * <pre>
976 * Notes:
977 * (1) If two sarrays have different size, this adds enough
978 * instances of %padstring to the smaller so that they are
979 * the same size. It is useful when two or more sarrays
980 * are being sequenced in parallel, and it is necessary to
981 * find a valid string at each index.
982 * </pre>
983 */
984 l_ok
985 sarrayPadToSameSize(SARRAY *sa1,
986 SARRAY *sa2,
987 const char *padstring)
988 {
989 l_int32 i, n1, n2;
990
991 if (!sa1 || !sa2)
992 return ERROR_INT("both sa1 and sa2 not defined", __func__, 1);
993
994 n1 = sarrayGetCount(sa1);
995 n2 = sarrayGetCount(sa2);
996 if (n1 < n2) {
997 for (i = n1; i < n2; i++)
998 sarrayAddString(sa1, padstring, L_COPY);
999 } else if (n1 > n2) {
1000 for (i = n2; i < n1; i++)
1001 sarrayAddString(sa2, padstring, L_COPY);
1002 }
1003
1004 return 0;
1005 }
1006
1007
1008 /*----------------------------------------------------------------------*
1009 * Convert word sarray to line sarray *
1010 *----------------------------------------------------------------------*/
1011 /*!
1012 * \brief sarrayConvertWordsToLines()
1013 *
1014 * \param[in] sa sa of individual words
1015 * \param[in] linesize max num of chars in each line
1016 * \return saout sa of formatted lines, or NULL on error
1017 *
1018 * <pre>
1019 * Notes:
1020 * (1) This is useful for re-typesetting text to a specific maximum
1021 * line length. The individual words in the input sarray
1022 * are concatenated into textlines. An input word string of zero
1023 * length is taken to be a paragraph separator. Each time
1024 * such a string is found, the current line is ended and
1025 * a new line is also produced that contains just the
1026 * string of zero length "". When the output sarray
1027 * of lines is eventually converted to a string with newlines
1028 * typically appended to each line string, the empty
1029 * strings are just converted to newlines, producing the visible
1030 * paragraph separation.
1031 * (2) What happens when a word is larger than linesize?
1032 * We write it out as a single line anyway! Words preceding
1033 * or following this long word are placed on lines preceding
1034 * or following the line with the long word. Why this choice?
1035 * Long "words" found in text documents are typically URLs, and
1036 * it's often desirable not to put newlines in the middle of a URL.
1037 * The text display program e.g., text editor will typically
1038 * wrap the long "word" to fit in the window.
1039 * </pre>
1040 */
1041 SARRAY *
1042 sarrayConvertWordsToLines(SARRAY *sa,
1043 l_int32 linesize)
1044 {
1045 char *wd, *strl;
1046 char emptystring[] = "";
1047 l_int32 n, i, len, totlen;
1048 SARRAY *sal, *saout;
1049
1050 if (!sa)
1051 return (SARRAY *)ERROR_PTR("sa not defined", __func__, NULL);
1052
1053 saout = sarrayCreate(0);
1054 n = sarrayGetCount(sa);
1055 totlen = 0;
1056 sal = NULL;
1057 for (i = 0; i < n; i++) {
1058 if (!sal)
1059 sal = sarrayCreate(0);
1060 wd = sarrayGetString(sa, i, L_NOCOPY);
1061 len = strlen(wd);
1062 if (len == 0) { /* end of paragraph: end line & insert blank line */
1063 if (totlen > 0) {
1064 strl = sarrayToString(sal, 2);
1065 sarrayAddString(saout, strl, L_INSERT);
1066 }
1067 sarrayAddString(saout, emptystring, L_COPY);
1068 sarrayDestroy(&sal);
1069 totlen = 0;
1070 } else if (totlen == 0 && len + 1 > linesize) { /* long word! */
1071 sarrayAddString(saout, wd, L_COPY); /* copy to one line */
1072 } else if (totlen + len + 1 > linesize) { /* end line & start new */
1073 strl = sarrayToString(sal, 2);
1074 sarrayAddString(saout, strl, L_INSERT);
1075 sarrayDestroy(&sal);
1076 sal = sarrayCreate(0);
1077 sarrayAddString(sal, wd, L_COPY);
1078 totlen = len + 1;
1079 } else { /* add to current line */
1080 sarrayAddString(sal, wd, L_COPY);
1081 totlen += len + 1;
1082 }
1083 }
1084 if (totlen > 0) { /* didn't end with blank line; output last line */
1085 strl = sarrayToString(sal, 2);
1086 sarrayAddString(saout, strl, L_INSERT);
1087 sarrayDestroy(&sal);
1088 }
1089
1090 return saout;
1091 }
1092
1093
1094 /*----------------------------------------------------------------------*
1095 * Split string on separator list *
1096 *----------------------------------------------------------------------*/
1097 /*
1098 * \brief sarraySplitString()
1099 *
1100 * \param[in] sa to append to; typically empty initially
1101 * \param[in] str string to split; not changed
1102 * \param[in] separators characters that split input string
1103 * \return 0 if OK, 1 on error.
1104 *
1105 * <pre>
1106 * Notes:
1107 * (1) This uses strtokSafe(). See the notes there in utils.c.
1108 * </pre>
1109 */
1110 l_int32
1111 sarraySplitString(SARRAY *sa,
1112 const char *str,
1113 const char *separators)
1114 {
1115 char *cstr, *substr, *saveptr;
1116
1117 if (!sa)
1118 return ERROR_INT("sa not defined", __func__, 1);
1119 if (!str)
1120 return ERROR_INT("str not defined", __func__, 1);
1121 if (!separators)
1122 return ERROR_INT("separators not defined", __func__, 1);
1123
1124 cstr = stringNew(str); /* preserves const-ness of input str */
1125 saveptr = NULL;
1126 substr = strtokSafe(cstr, separators, &saveptr);
1127 if (substr)
1128 sarrayAddString(sa, substr, L_INSERT);
1129 while ((substr = strtokSafe(NULL, separators, &saveptr)))
1130 sarrayAddString(sa, substr, L_INSERT);
1131 LEPT_FREE(cstr);
1132
1133 return 0;
1134 }
1135
1136
1137 /*----------------------------------------------------------------------*
1138 * Filter sarray *
1139 *----------------------------------------------------------------------*/
1140 /*!
1141 * \brief sarraySelectBySubstring()
1142 *
1143 * \param[in] sain input sarray
1144 * \param[in] substr [optional] substring for matching; can be NULL
1145 * \return saout output sarray, filtered with substring or NULL on error
1146 *
1147 * <pre>
1148 * Notes:
1149 * (1) This selects all strings in sain that have substr as a substring.
1150 * Note that we can't use strncmp() because we're looking for
1151 * a match to the substring anywhere within each filename.
1152 * (2) If substr == NULL, returns a copy of the sarray.
1153 * </pre>
1154 */
1155 SARRAY *
1156 sarraySelectBySubstring(SARRAY *sain,
1157 const char *substr)
1158 {
1159 char *str;
1160 l_int32 n, i, offset, found;
1161 SARRAY *saout;
1162
1163 if (!sain)
1164 return (SARRAY *)ERROR_PTR("sain not defined", __func__, NULL);
1165
1166 n = sarrayGetCount(sain);
1167 if (!substr || n == 0)
1168 return sarrayCopy(sain);
1169
1170 saout = sarrayCreate(n);
1171 for (i = 0; i < n; i++) {
1172 str = sarrayGetString(sain, i, L_NOCOPY);
1173 arrayFindSequence((l_uint8 *)str, strlen(str), (l_uint8 *)substr,
1174 strlen(substr), &offset, &found);
1175 if (found)
1176 sarrayAddString(saout, str, L_COPY);
1177 }
1178
1179 return saout;
1180 }
1181
1182
1183 /*!
1184 * \brief sarraySelectRange()
1185 *
1186 * \param[in] sain input sarray
1187 * \param[in] first index of first string to be selected
1188 * \param[in] last index of last string to be selected;
1189 * use 0 to go to the end of the sarray
1190 * \return saout output sarray, or NULL on error
1191 *
1192 * <pre>
1193 * Notes:
1194 * (1) This makes %saout consisting of copies of all strings in %sain
1195 * in the index set [first ... last]. Use %last == 0 to get all
1196 * strings from %first to the last string in the sarray.
1197 * </pre>
1198 */
1199 SARRAY *
1200 sarraySelectRange(SARRAY *sain,
1201 l_int32 first,
1202 l_int32 last)
1203 {
1204 char *str;
1205 l_int32 n, i;
1206 SARRAY *saout;
1207
1208 if (!sain)
1209 return (SARRAY *)ERROR_PTR("sain not defined", __func__, NULL);
1210 if (first < 0) first = 0;
1211 n = sarrayGetCount(sain);
1212 if (last <= 0) last = n - 1;
1213 if (last >= n) {
1214 L_WARNING("last > n - 1; setting to n - 1\n", __func__);
1215 last = n - 1;
1216 }
1217 if (first > last)
1218 return (SARRAY *)ERROR_PTR("first must be >= last", __func__, NULL);
1219
1220 saout = sarrayCreate(0);
1221 for (i = first; i <= last; i++) {
1222 str = sarrayGetString(sain, i, L_COPY);
1223 sarrayAddString(saout, str, L_INSERT);
1224 }
1225
1226 return saout;
1227 }
1228
1229
1230 /*!
1231 * \brief sarrayParseRange()
1232 *
1233 * \param[in] sa input sarray
1234 * \param[in] start index to start range search
1235 * \param[out] pactualstart index of actual start; may be > 'start'
1236 * \param[out] pend index of end
1237 * \param[out] pnewstart index of start of next range
1238 * \param[in] substr substring for matching at beginning of string
1239 * \param[in] loc byte offset within the string for the pattern;
1240 * use -1 if the location does not matter.
1241 * \return 0 if valid range found; 1 otherwise
1242 *
1243 * <pre>
1244 * Notes:
1245 * (1) This finds the range of the next set of strings in SA,
1246 * beginning the search at 'start', that does NOT have
1247 * the substring 'substr' either at the indicated location
1248 * in the string or anywhere in the string. The input
1249 * variable 'loc' is the specified offset within the string;
1250 * use -1 to indicate 'anywhere in the string'.
1251 * (2) Always check the return value to verify that a valid range
1252 * was found.
1253 * (3) If a valid range is not found, the values of actstart,
1254 * end and newstart are all set to the size of sa.
1255 * (4) If this is the last valid range, newstart returns the value n.
1256 * In use, this should be tested before calling the function.
1257 * (5) Usage example. To find all the valid ranges in a file
1258 * where the invalid lines begin with two dashes, copy each
1259 * line in the file to a string in an sarray, and do:
1260 * start = 0;
1261 * while (!sarrayParseRange(sa, start, &actstart, &end, &start,
1262 * "--", 0))
1263 * lept_stderr("start = %d, end = %d\n", actstart, end);
1264 * </pre>
1265 */
1266 l_int32
1267 sarrayParseRange(SARRAY *sa,
1268 l_int32 start,
1269 l_int32 *pactualstart,
1270 l_int32 *pend,
1271 l_int32 *pnewstart,
1272 const char *substr,
1273 l_int32 loc)
1274 {
1275 char *str;
1276 l_int32 n, i, offset, found;
1277
1278 if (!sa)
1279 return ERROR_INT("sa not defined", __func__, 1);
1280 if (!pactualstart || !pend || !pnewstart)
1281 return ERROR_INT("not all range addresses defined", __func__, 1);
1282 n = sarrayGetCount(sa);
1283 *pactualstart = *pend = *pnewstart = n;
1284 if (!substr)
1285 return ERROR_INT("substr not defined", __func__, 1);
1286
1287 /* Look for the first string without the marker */
1288 if (start < 0 || start >= n)
1289 return 1;
1290 for (i = start; i < n; i++) {
1291 str = sarrayGetString(sa, i, L_NOCOPY);
1292 arrayFindSequence((l_uint8 *)str, strlen(str), (l_uint8 *)substr,
1293 strlen(substr), &offset, &found);
1294 if (loc < 0) {
1295 if (!found) break;
1296 } else {
1297 if (!found || offset != loc) break;
1298 }
1299 }
1300 start = i;
1301 if (i == n) /* couldn't get started */
1302 return 1;
1303
1304 /* Look for the last string without the marker */
1305 *pactualstart = start;
1306 for (i = start + 1; i < n; i++) {
1307 str = sarrayGetString(sa, i, L_NOCOPY);
1308 arrayFindSequence((l_uint8 *)str, strlen(str), (l_uint8 *)substr,
1309 strlen(substr), &offset, &found);
1310 if (loc < 0) {
1311 if (found) break;
1312 } else {
1313 if (found && offset == loc) break;
1314 }
1315 }
1316 *pend = i - 1;
1317 start = i;
1318 if (i == n) /* no further range */
1319 return 0;
1320
1321 /* Look for the first string after *pend without the marker.
1322 * This will start the next run of strings, if it exists. */
1323 for (i = start; i < n; i++) {
1324 str = sarrayGetString(sa, i, L_NOCOPY);
1325 arrayFindSequence((l_uint8 *)str, strlen(str), (l_uint8 *)substr,
1326 strlen(substr), &offset, &found);
1327 if (loc < 0) {
1328 if (!found) break;
1329 } else {
1330 if (!found || offset != loc) break;
1331 }
1332 }
1333 if (i < n)
1334 *pnewstart = i;
1335
1336 return 0;
1337 }
1338
1339
1340 /*----------------------------------------------------------------------*
1341 * Serialize for I/O *
1342 *----------------------------------------------------------------------*/
1343 /*!
1344 * \brief sarrayRead()
1345 *
1346 * \param[in] filename
1347 * \return sarray, or NULL on error
1348 */
1349 SARRAY *
1350 sarrayRead(const char *filename)
1351 {
1352 FILE *fp;
1353 SARRAY *sa;
1354
1355 if (!filename)
1356 return (SARRAY *)ERROR_PTR("filename not defined", __func__, NULL);
1357
1358 if ((fp = fopenReadStream(filename)) == NULL)
1359 return (SARRAY *)ERROR_PTR_1("stream not opened",
1360 filename, __func__, NULL);
1361 sa = sarrayReadStream(fp);
1362 fclose(fp);
1363 if (!sa)
1364 return (SARRAY *)ERROR_PTR_1("sa not read", filename, __func__, NULL);
1365 return sa;
1366 }
1367
1368
1369 /*!
1370 * \brief sarrayReadStream()
1371 *
1372 * \param[in] fp file stream
1373 * \return sarray, or NULL on error
1374 *
1375 * <pre>
1376 * Notes:
1377 * (1) We store the size of each string along with the string.
1378 * The limit on the number of strings is 50M.
1379 * The limit on the size of any string is 2^30 bytes.
1380 * (2) This allows a string to have embedded newlines. By reading
1381 * the entire string, as determined by its size, we are
1382 * not affected by any number of embedded newlines.
1383 * (3) It is OK for the sarray to be empty.
1384 * </pre>
1385 */
1386 SARRAY *
1387 sarrayReadStream(FILE *fp)
1388 {
1389 char *stringbuf;
1390 l_int32 i, n, size, index, bufsize, version, ignore, success;
1391 SARRAY *sa;
1392
1393 if (!fp)
1394 return (SARRAY *)ERROR_PTR("stream not defined", __func__, NULL);
1395
1396 if (fscanf(fp, "\nSarray Version %d\n", &version) != 1)
1397 return (SARRAY *)ERROR_PTR("not an sarray file", __func__, NULL);
1398 if (version != SARRAY_VERSION_NUMBER)
1399 return (SARRAY *)ERROR_PTR("invalid sarray version", __func__, NULL);
1400 if (fscanf(fp, "Number of strings = %d\n", &n) != 1)
1401 return (SARRAY *)ERROR_PTR("error on # strings", __func__, NULL);
1402 if (n < 0)
1403 return (SARRAY *)ERROR_PTR("num string ptrs <= 0", __func__, NULL);
1404 if (n > (l_int32)MaxPtrArraySize)
1405 return (SARRAY *)ERROR_PTR("too many string ptrs", __func__, NULL);
1406 if (n == 0) L_INFO("the sarray is empty\n", __func__);
1407
1408 success = TRUE;
1409 if ((sa = sarrayCreate(n)) == NULL)
1410 return (SARRAY *)ERROR_PTR("sa not made", __func__, NULL);
1411 bufsize = 512 + 1;
1412 stringbuf = (char *)LEPT_CALLOC(bufsize, sizeof(char));
1413
1414 for (i = 0; i < n; i++) {
1415 /* Get the size of the stored string */
1416 if ((fscanf(fp, "%d[%d]:", &index, &size) != 2) || (size > (1 << 30))) {
1417 success = FALSE;
1418 L_ERROR("error on string size\n", __func__);
1419 goto cleanup;
1420 }
1421 /* Expand the string buffer if necessary */
1422 if (size > bufsize - 5) {
1423 LEPT_FREE(stringbuf);
1424 bufsize = (l_int32)(1.5 * size);
1425 stringbuf = (char *)LEPT_CALLOC(bufsize, sizeof(char));
1426 }
1427 /* Read the stored string, plus leading spaces and trailing \n */
1428 if (fread(stringbuf, 1, size + 3, fp) != size + 3) {
1429 success = FALSE;
1430 L_ERROR("error reading string\n", __func__);
1431 goto cleanup;
1432 }
1433 /* Remove the \n that was added by sarrayWriteStream() */
1434 stringbuf[size + 2] = '\0';
1435 /* Copy it in, skipping the 2 leading spaces */
1436 sarrayAddString(sa, stringbuf + 2, L_COPY);
1437 }
1438 ignore = fscanf(fp, "\n");
1439
1440 cleanup:
1441 LEPT_FREE(stringbuf);
1442 if (!success) sarrayDestroy(&sa);
1443 return sa;
1444 }
1445
1446
1447 /*!
1448 * \brief sarrayReadMem()
1449 *
1450 * \param[in] data serialization in ascii
1451 * \param[in] size of data; can use strlen to get it
1452 * \return sarray, or NULL on error
1453 */
1454 SARRAY *
1455 sarrayReadMem(const l_uint8 *data,
1456 size_t size)
1457 {
1458 FILE *fp;
1459 SARRAY *sa;
1460
1461 if (!data)
1462 return (SARRAY *)ERROR_PTR("data not defined", __func__, NULL);
1463 if ((fp = fopenReadFromMemory(data, size)) == NULL)
1464 return (SARRAY *)ERROR_PTR("stream not opened", __func__, NULL);
1465
1466 sa = sarrayReadStream(fp);
1467 fclose(fp);
1468 if (!sa) L_ERROR("sarray not read\n", __func__);
1469 return sa;
1470 }
1471
1472
1473 /*!
1474 * \brief sarrayWrite()
1475 *
1476 * \param[in] filename
1477 * \param[in] sa string array
1478 * \return 0 if OK; 1 on error
1479 */
1480 l_ok
1481 sarrayWrite(const char *filename,
1482 SARRAY *sa)
1483 {
1484 l_int32 ret;
1485 FILE *fp;
1486
1487 if (!filename)
1488 return ERROR_INT("filename not defined", __func__, 1);
1489 if (!sa)
1490 return ERROR_INT("sa not defined", __func__, 1);
1491
1492 if ((fp = fopenWriteStream(filename, "w")) == NULL)
1493 return ERROR_INT_1("stream not opened", filename, __func__, 1);
1494 ret = sarrayWriteStream(fp, sa);
1495 fclose(fp);
1496 if (ret)
1497 return ERROR_INT_1("sa not written to stream", filename, __func__, 1);
1498 return 0;
1499 }
1500
1501
1502 /*!
1503 * \brief sarrayWriteStream()
1504 *
1505 * \param[in] fp file stream; use NULL to write to stderr
1506 * \param[in] sa string array
1507 * \return 0 if OK; 1 on error
1508 *
1509 * <pre>
1510 * Notes:
1511 * (1) This appends a '\n' to each string, which is stripped
1512 * off by sarrayReadStream().
1513 * </pre>
1514 */
1515 l_ok
1516 sarrayWriteStream(FILE *fp,
1517 SARRAY *sa)
1518 {
1519 l_int32 i, n, len;
1520
1521 if (!fp)
1522 return ERROR_INT("stream not defined", __func__, 1);
1523 if (!sa)
1524 return sarrayWriteStderr(sa);
1525
1526 n = sarrayGetCount(sa);
1527 fprintf(fp, "\nSarray Version %d\n", SARRAY_VERSION_NUMBER);
1528 fprintf(fp, "Number of strings = %d\n", n);
1529 for (i = 0; i < n; i++) {
1530 len = strlen(sa->array[i]);
1531 fprintf(fp, " %d[%d]: %s\n", i, len, sa->array[i]);
1532 }
1533 fprintf(fp, "\n");
1534
1535 return 0;
1536 }
1537
1538
1539 /*!
1540 * \brief sarrayWriteStderr()
1541 *
1542 * \param[in] sa string array
1543 * \return 0 if OK; 1 on error
1544 */
1545 l_ok
1546 sarrayWriteStderr(SARRAY *sa)
1547 {
1548 l_int32 i, n, len;
1549
1550 if (!sa)
1551 return ERROR_INT("sa not defined", __func__, 1);
1552
1553 n = sarrayGetCount(sa);
1554 lept_stderr("\nSarray Version %d\n", SARRAY_VERSION_NUMBER);
1555 lept_stderr("Number of strings = %d\n", n);
1556 for (i = 0; i < n; i++) {
1557 len = strlen(sa->array[i]);
1558 lept_stderr(" %d[%d]: %s\n", i, len, sa->array[i]);
1559 }
1560 lept_stderr("\n");
1561 return 0;
1562 }
1563
1564
1565 /*!
1566 * \brief sarrayWriteMem()
1567 *
1568 * \param[out] pdata data of serialized sarray; ascii
1569 * \param[out] psize size of returned data
1570 * \param[in] sa
1571 * \return 0 if OK, 1 on error
1572 *
1573 * <pre>
1574 * Notes:
1575 * (1) Serializes a sarray in memory and puts the result in a buffer.
1576 * </pre>
1577 */
1578 l_ok
1579 sarrayWriteMem(l_uint8 **pdata,
1580 size_t *psize,
1581 SARRAY *sa)
1582 {
1583 l_int32 ret;
1584 FILE *fp;
1585
1586 if (pdata) *pdata = NULL;
1587 if (psize) *psize = 0;
1588 if (!pdata)
1589 return ERROR_INT("&data not defined", __func__, 1);
1590 if (!psize)
1591 return ERROR_INT("&size not defined", __func__, 1);
1592 if (!sa)
1593 return ERROR_INT("sa not defined", __func__, 1);
1594
1595 #if HAVE_FMEMOPEN
1596 if ((fp = open_memstream((char **)pdata, psize)) == NULL)
1597 return ERROR_INT("stream not opened", __func__, 1);
1598 ret = sarrayWriteStream(fp, sa);
1599 fputc('\0', fp);
1600 fclose(fp);
1601 if (*psize > 0) *psize = *psize - 1;
1602 #else
1603 L_INFO("no fmemopen API --> work-around: write to temp file\n", __func__);
1604 #ifdef _WIN32
1605 if ((fp = fopenWriteWinTempfile()) == NULL)
1606 return ERROR_INT("tmpfile stream not opened", __func__, 1);
1607 #else
1608 if ((fp = tmpfile()) == NULL)
1609 return ERROR_INT("tmpfile stream not opened", __func__, 1);
1610 #endif /* _WIN32 */
1611 ret = sarrayWriteStream(fp, sa);
1612 rewind(fp);
1613 *pdata = l_binaryReadStream(fp, psize);
1614 fclose(fp);
1615 #endif /* HAVE_FMEMOPEN */
1616 return ret;
1617 }
1618
1619
1620 /*!
1621 * \brief sarrayAppend()
1622 *
1623 * \param[in] filename
1624 * \param[in] sa
1625 * \return 0 if OK; 1 on error
1626 */
1627 l_ok
1628 sarrayAppend(const char *filename,
1629 SARRAY *sa)
1630 {
1631 FILE *fp;
1632
1633 if (!filename)
1634 return ERROR_INT("filename not defined", __func__, 1);
1635 if (!sa)
1636 return ERROR_INT("sa not defined", __func__, 1);
1637
1638 if ((fp = fopenWriteStream(filename, "a")) == NULL)
1639 return ERROR_INT_1("stream not opened", filename, __func__, 1);
1640 if (sarrayWriteStream(fp, sa)) {
1641 fclose(fp);
1642 return ERROR_INT_1("sa not appended to stream", filename, __func__, 1);
1643 }
1644
1645 fclose(fp);
1646 return 0;
1647 }
1648
1649
1650 /*---------------------------------------------------------------------*
1651 * Directory filenames *
1652 *---------------------------------------------------------------------*/
1653 /*!
1654 * \brief getNumberedPathnamesInDirectory()
1655 *
1656 * \param[in] dirname directory name
1657 * \param[in] substr [optional] substring filter on filenames; can be NULL
1658 * \param[in] numpre number of characters in name before number
1659 * \param[in] numpost number of characters in name after the number,
1660 * up to a dot before an extension
1661 * \param[in] maxnum only consider page numbers up to this value
1662 * \return sarray of numbered pathnames, or NULL on error
1663 *
1664 * <pre>
1665 * Notes:
1666 * (1) Returns the full pathnames of the numbered filenames in
1667 * the directory. The number in the filename is the index
1668 * into the sarray. For indices for which there are no filenames,
1669 * an empty string ("") is placed into the sarray.
1670 * This makes reading numbered files very simple. For example,
1671 * the image whose filename includes number N can be retrieved using
1672 * pixReadIndexed(sa, N);
1673 * (2) If %substr is not NULL, only filenames that contain
1674 * the substring can be included. If %substr is NULL,
1675 * all matching filenames are used.
1676 * (3) If no numbered files are found, it returns an empty sarray,
1677 * with no initialized strings.
1678 * (4) It is assumed that the page number is contained within
1679 * the basename (the filename without directory or extension).
1680 * %numpre is the number of characters in the basename
1681 * preceding the actual page number; %numpost is the number
1682 * following the page number, up to either the end of the
1683 * basename or a ".", whichever comes first.
1684 * (5) This is useful when all filenames contain numbers that are
1685 * not necessarily consecutive. 0-padding is not required.
1686 * (6) To use a O(n) matching algorithm, the largest page number
1687 * is found and two internal arrays of this size are created.
1688 * This maximum is constrained not to exceed %maxsum,
1689 * to make sure that an unrealistically large number is not
1690 * accidentally used to determine the array sizes.
1691 * </pre>
1692 */
1693 SARRAY *
1694 getNumberedPathnamesInDirectory(const char *dirname,
1695 const char *substr,
1696 l_int32 numpre,
1697 l_int32 numpost,
1698 l_int32 maxnum)
1699 {
1700 l_int32 nfiles;
1701 SARRAY *sa, *saout;
1702
1703 if (!dirname)
1704 return (SARRAY *)ERROR_PTR("dirname not defined", __func__, NULL);
1705
1706 if ((sa = getSortedPathnamesInDirectory(dirname, substr, 0, 0)) == NULL)
1707 return (SARRAY *)ERROR_PTR("sa not made", __func__, NULL);
1708 if ((nfiles = sarrayGetCount(sa)) == 0) {
1709 sarrayDestroy(&sa);
1710 return sarrayCreate(1);
1711 }
1712
1713 saout = convertSortedToNumberedPathnames(sa, numpre, numpost, maxnum);
1714 sarrayDestroy(&sa);
1715 return saout;
1716 }
1717
1718
1719 /*!
1720 * \brief getSortedPathnamesInDirectory()
1721 *
1722 * \param[in] dirname directory name
1723 * \param[in] substr [optional] substring filter on filenames; can be NULL
1724 * \param[in] first 0-based
1725 * \param[in] nfiles use 0 for all to the end
1726 * \return sarray of sorted pathnames, or NULL on error
1727 *
1728 * <pre>
1729 * Notes:
1730 * (1) Use %substr to filter filenames in the directory. If
1731 * %substr == NULL, this takes all files.
1732 * (2) The files in the directory, after optional filtering by
1733 * the substring, are lexically sorted in increasing order.
1734 * Use %first and %nfiles to select a contiguous set of files.
1735 * (3) The full pathnames are returned for the requested sequence.
1736 * If no files are found after filtering, returns an empty sarray.
1737 * </pre>
1738 */
1739 SARRAY *
1740 getSortedPathnamesInDirectory(const char *dirname,
1741 const char *substr,
1742 l_int32 first,
1743 l_int32 nfiles)
1744 {
1745 char *fname, *fullname;
1746 l_int32 i, n, last;
1747 SARRAY *sa, *safiles, *saout;
1748
1749 if (!dirname)
1750 return (SARRAY *)ERROR_PTR("dirname not defined", __func__, NULL);
1751
1752 if ((sa = getFilenamesInDirectory(dirname)) == NULL)
1753 return (SARRAY *)ERROR_PTR("sa not made", __func__, NULL);
1754 safiles = sarraySelectBySubstring(sa, substr);
1755 sarrayDestroy(&sa);
1756 n = sarrayGetCount(safiles);
1757 if (n == 0) {
1758 L_WARNING("no files found\n", __func__);
1759 return safiles;
1760 }
1761
1762 sarraySort(safiles, safiles, L_SORT_INCREASING);
1763
1764 first = L_MIN(L_MAX(first, 0), n - 1);
1765 if (nfiles == 0)
1766 nfiles = n - first;
1767 last = L_MIN(first + nfiles - 1, n - 1);
1768
1769 saout = sarrayCreate(last - first + 1);
1770 for (i = first; i <= last; i++) {
1771 fname = sarrayGetString(safiles, i, L_NOCOPY);
1772 fullname = pathJoin(dirname, fname);
1773 sarrayAddString(saout, fullname, L_INSERT);
1774 }
1775
1776 sarrayDestroy(&safiles);
1777 return saout;
1778 }
1779
1780
1781 /*!
1782 * \brief convertSortedToNumberedPathnames()
1783 *
1784 * \param[in] sa sorted pathnames including zero-padded integers
1785 * \param[in] numpre number of characters in name before number
1786 * \param[in] numpost number of characters in name after the number,
1787 * up to a dot before an extension
1788 * \param[in] maxnum only consider page numbers up to this value
1789 * \return sarray of numbered pathnames, or NULL on error
1790 *
1791 * <pre>
1792 * Notes:
1793 * (1) Typically, numpre = numpost = 0; e.g., when the filename
1794 * just has a number followed by an optional extension.
1795 * </pre>
1796 */
1797 SARRAY *
1798 convertSortedToNumberedPathnames(SARRAY *sa,
1799 l_int32 numpre,
1800 l_int32 numpost,
1801 l_int32 maxnum)
1802 {
1803 char *fname, *str;
1804 l_int32 i, nfiles, num, index;
1805 SARRAY *saout;
1806
1807 if (!sa)
1808 return (SARRAY *)ERROR_PTR("sa not defined", __func__, NULL);
1809 if ((nfiles = sarrayGetCount(sa)) == 0)
1810 return sarrayCreate(1);
1811
1812 /* Find the last file in the sorted array that has a number
1813 * that (a) matches the count pattern and (b) does not
1814 * exceed %maxnum. %maxnum sets an upper limit on the size
1815 * of the sarray. */
1816 num = 0;
1817 for (i = nfiles - 1; i >= 0; i--) {
1818 fname = sarrayGetString(sa, i, L_NOCOPY);
1819 num = extractNumberFromFilename(fname, numpre, numpost);
1820 if (num < 0) continue;
1821 num = L_MIN(num + 1, maxnum);
1822 break;
1823 }
1824
1825 if (num <= 0) /* none found */
1826 return sarrayCreate(1);
1827
1828 /* Insert pathnames into the output sarray.
1829 * Ignore numbers that are out of the range of sarray. */
1830 saout = sarrayCreateInitialized(num, "");
1831 for (i = 0; i < nfiles; i++) {
1832 fname = sarrayGetString(sa, i, L_NOCOPY);
1833 index = extractNumberFromFilename(fname, numpre, numpost);
1834 if (index < 0 || index >= num) continue;
1835 str = sarrayGetString(saout, index, L_NOCOPY);
1836 if (str[0] != '\0') {
1837 L_WARNING("\n Multiple files with same number: %d\n",
1838 __func__, index);
1839 }
1840 sarrayReplaceString(saout, index, fname, L_COPY);
1841 }
1842
1843 return saout;
1844 }
1845
1846
1847 /*!
1848 * \brief getFilenamesInDirectory()
1849 *
1850 * \param[in] dirname directory name
1851 * \return sarray of file names, or NULL on error
1852 *
1853 * <pre>
1854 * Notes:
1855 * (1) The versions compiled under unix and cygwin use the POSIX C
1856 * library commands for handling directories. For Windows,
1857 * there is a separate implementation.
1858 * (2) It returns an array of filename tails; i.e., only the part of
1859 * the path after the last slash.
1860 * (3) Use of the d_type field of dirent is not portable:
1861 * "According to POSIX, the dirent structure contains a field
1862 * char d_name[] of unspecified size, with at most NAME_MAX
1863 * characters preceding the terminating null character. Use
1864 * of other fields will harm the portability of your programs."
1865 * (4) As a consequence of (3), we note several things:
1866 * ~ MINGW doesn't have a d_type member.
1867 * ~ Older versions of gcc (e.g., 2.95.3) return DT_UNKNOWN
1868 * for d_type from all files.
1869 * On these systems, this function will return directories
1870 * (except for '.' and '..', which are eliminated using
1871 * the d_name field).
1872 * (5) For unix, we avoid the bug in earlier versions of realpath()
1873 * by requiring either POSIX 2008 or use of glibc.
1874 *
1875 * </pre>
1876 */
1877
1878 #ifndef _WIN32
1879
1880 SARRAY *
1881 getFilenamesInDirectory(const char *dirname)
1882 {
1883 char *gendir, *realdir, *stat_path;
1884 size_t size;
1885 SARRAY *safiles;
1886 DIR *pdir;
1887 struct dirent *pdirentry;
1888 int dfd, stat_ret;
1889 struct stat st;
1890
1891 if (!dirname)
1892 return (SARRAY *)ERROR_PTR("dirname not defined", __func__, NULL);
1893 if (dirname[0] == '\0')
1894 return (SARRAY *)ERROR_PTR("dirname is empty", __func__, NULL);
1895
1896 /* Who would have thought it was this fiddly to open a directory
1897 and get the files inside? fstatat() works with relative
1898 directory paths, and stat() requires using the absolute path.
1899 realpath() works as follows for files and directories:
1900 * If the file or directory exists, realpath returns its path;
1901 else it returns NULL.
1902 * For realpath() we use the POSIX 2008 implementation, where
1903 the second arg is NULL and the path is malloc'd and returned
1904 if the file or directory exists. All versions of glibc
1905 support this. */
1906 gendir = genPathname(dirname, NULL);
1907 realdir = realpath(gendir, NULL);
1908 LEPT_FREE(gendir);
1909 if (realdir == NULL)
1910 return (SARRAY *)ERROR_PTR("realdir not made", __func__, NULL);
1911 if ((pdir = opendir(realdir)) == NULL) {
1912 L_ERROR("directory %s not opened\n", __func__, realdir);
1913 LEPT_FREE(realdir);
1914 return NULL;
1915 }
1916 safiles = sarrayCreate(0);
1917 while ((pdirentry = readdir(pdir))) {
1918 #if HAVE_DIRFD && HAVE_FSTATAT
1919 /* Platform issues: although Linux has these POSIX functions,
1920 * AIX doesn't have fstatat() and Solaris doesn't have dirfd(). */
1921 dfd = dirfd(pdir);
1922 stat_ret = fstatat(dfd, pdirentry->d_name, &st, 0);
1923 #else
1924 size = strlen(realdir) + strlen(pdirentry->d_name) + 2;
1925 stat_path = (char *)LEPT_CALLOC(size, 1);
1926 snprintf(stat_path, size, "%s/%s", realdir, pdirentry->d_name);
1927 stat_ret = stat(stat_path, &st);
1928 LEPT_FREE(stat_path);
1929 #endif
1930 if (stat_ret == 0 && S_ISDIR(st.st_mode))
1931 continue;
1932 sarrayAddString(safiles, pdirentry->d_name, L_COPY);
1933 }
1934 closedir(pdir);
1935 LEPT_FREE(realdir);
1936 return safiles;
1937 }
1938
1939 #else /* _WIN32 */
1940
1941 /* http://msdn2.microsoft.com/en-us/library/aa365200(VS.85).aspx */
1942 #include <windows.h>
1943
1944 SARRAY *
1945 getFilenamesInDirectory(const char *dirname)
1946 {
1947 char *pszDir;
1948 char *realdir;
1949 HANDLE hFind = INVALID_HANDLE_VALUE;
1950 SARRAY *safiles;
1951 WIN32_FIND_DATAA ffd;
1952
1953 if (!dirname)
1954 return (SARRAY *)ERROR_PTR("dirname not defined", __func__, NULL);
1955
1956 realdir = genPathname(dirname, NULL);
1957 pszDir = stringJoin(realdir, "\\*");
1958 LEPT_FREE(realdir);
1959
1960 if (strlen(pszDir) + 1 > MAX_PATH) {
1961 LEPT_FREE(pszDir);
1962 return (SARRAY *)ERROR_PTR("dirname is too long", __func__, NULL);
1963 }
1964
1965 if ((safiles = sarrayCreate(0)) == NULL) {
1966 LEPT_FREE(pszDir);
1967 return (SARRAY *)ERROR_PTR("safiles not made", __func__, NULL);
1968 }
1969
1970 hFind = FindFirstFileA(pszDir, &ffd);
1971 if (INVALID_HANDLE_VALUE == hFind) {
1972 sarrayDestroy(&safiles);
1973 LEPT_FREE(pszDir);
1974 return (SARRAY *)ERROR_PTR("hFind not opened", __func__, NULL);
1975 }
1976
1977 while (FindNextFileA(hFind, &ffd) != 0) {
1978 if (ffd.dwFileAttributes & FILE_ATTRIBUTE_DIRECTORY) /* skip dirs */
1979 continue;
1980 convertSepCharsInPath(ffd.cFileName, UNIX_PATH_SEPCHAR);
1981 sarrayAddString(safiles, ffd.cFileName, L_COPY);
1982 }
1983
1984 FindClose(hFind);
1985 LEPT_FREE(pszDir);
1986 return safiles;
1987 }
1988 #endif /* _WIN32 */