comparison mupdf-source/thirdparty/leptonica/src/utils2.c @ 2:b50eed0cc0ef upstream

ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4. The directory name has changed: no version number in the expanded directory now.
author Franz Glasner <fzglas.hg@dom66.de>
date Mon, 15 Sep 2025 11:43:07 +0200
parents
children
comparison
equal deleted inserted replaced
1:1d09e1dec1d9 2:b50eed0cc0ef
1 /*====================================================================*
2 - Copyright (C) 2001 Leptonica. All rights reserved.
3 -
4 - Redistribution and use in source and binary forms, with or without
5 - modification, are permitted provided that the following conditions
6 - are met:
7 - 1. Redistributions of source code must retain the above copyright
8 - notice, this list of conditions and the following disclaimer.
9 - 2. Redistributions in binary form must reproduce the above
10 - copyright notice, this list of conditions and the following
11 - disclaimer in the documentation and/or other materials
12 - provided with the distribution.
13 -
14 - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
15 - ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
16 - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
17 - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL ANY
18 - CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
19 - EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
20 - PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
21 - PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
22 - OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
23 - NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
24 - SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25 *====================================================================*/
26
27 /*!
28 * \file utils2.c
29 * <pre>
30 *
31 * ------------------------------------------
32 * This file has these utilities:
33 * - safe string operations
34 * - find/replace operations on strings
35 * - read/write between file and memory
36 * - multi-platform file and directory operations
37 * - file name operations
38 * ------------------------------------------
39 *
40 * Safe string procs
41 * char *stringNew()
42 * l_int32 stringCopy()
43 * l_int32 stringCopySegment()
44 * l_int32 stringReplace()
45 * l_int32 stringLength()
46 * l_int32 stringCat()
47 * char *stringConcatNew()
48 * char *stringJoin()
49 * l_int32 stringJoinIP()
50 * char *stringReverse()
51 * char *strtokSafe()
52 * l_int32 stringSplitOnToken()
53 *
54 * Find and replace string and array procs
55 * l_int32 stringCheckForChars()
56 * char *stringRemoveChars()
57 * char *stringReplaceEachSubstr()
58 * char *stringReplaceSubstr()
59 * L_DNA *stringFindEachSubstr()
60 * l_int32 stringFindSubstr()
61 * l_uint8 *arrayReplaceEachSequence()
62 * L_DNA *arrayFindEachSequence()
63 * l_int32 arrayFindSequence()
64 *
65 * Safe realloc
66 * void *reallocNew()
67 *
68 * Read and write between file and memory
69 * l_uint8 *l_binaryRead()
70 * l_uint8 *l_binaryReadStream()
71 * l_uint8 *l_binaryReadSelect()
72 * l_uint8 *l_binaryReadSelectStream()
73 * l_int32 l_binaryWrite()
74 * l_int32 nbytesInFile()
75 * l_int32 fnbytesInFile()
76 *
77 * Copy and compare in memory
78 * l_uint8 *l_binaryCopy()
79 * l_uint8 *l_binaryCompare()
80 *
81 * File copy operations
82 * l_int32 fileCopy()
83 * l_int32 fileConcatenate()
84 * l_int32 fileAppendString()
85 *
86 * File split operations
87 * l_int32 fileSplitLinesUniform()
88 *
89 * Multi-platform functions for opening file streams
90 * FILE *fopenReadStream()
91 * FILE *fopenWriteStream()
92 * FILE *fopenReadFromMemory()
93 *
94 * Opening a Windows tmpfile for writing
95 * FILE *fopenWriteWinTempfile()
96 *
97 * Multi-platform functions that avoid C-runtime boundary crossing
98 * with Windows DLLs (use in programs only)
99 * FILE *lept_fopen()
100 * l_int32 lept_fclose()
101 * void *lept_calloc()
102 * void lept_free()
103 *
104 * Multi-platform file system operations in temp directories
105 * l_int32 lept_mkdir()
106 * l_int32 lept_rmdir()
107 * l_int32 lept_direxists()
108 * l_int32 lept_mv()
109 * l_int32 lept_rm_match()
110 * l_int32 lept_rm()
111 * l_int32 lept_rmfile()
112 * l_int32 lept_cp()
113 *
114 * Special debug/test function for calling 'system'
115 * l_int32 callSystemDebug()
116 *
117 * General file name operations
118 * l_int32 splitPathAtDirectory()
119 * l_int32 splitPathAtExtension()
120 * char *pathJoin()
121 * char *appendSubdirs()
122 *
123 * Special file name operations
124 * l_int32 convertSepCharsInPath()
125 * char *genPathname()
126 * l_int32 makeTempDirname()
127 * l_int32 modifyTrailingSlash()
128 * char *l_makeTempFilename()
129 * l_int32 extractNumberFromFilename()
130 *
131 *
132 * Notes on multi-platform development
133 * -----------------------------------
134 * This is important:
135 * (1) With the exception of splitPathAtDirectory(), splitPathAtExtension()
136 * and genPathname(), all input pathnames must have unix separators.
137 * (2) On macOS, iOS and Windows, for read or write to "/tmp/..."
138 * the filename is rewritten to use the OS specific temp directory:
139 * /tmp ==> [Temp]/...
140 * (3) This filename rewrite, along with the conversion from unix
141 * to OS specific pathnames, happens in genPathname().
142 * (4) Use fopenReadStream() and fopenWriteStream() to open files,
143 * because these use genPathname() to find the platform-dependent
144 * filenames. Likewise for l_binaryRead() and l_binaryWrite().
145 * (5) For moving, copying and removing files and directories that are in
146 * subdirectories of /tmp, use the lept_*() file system shell wrappers:
147 * lept_mkdir(), lept_rmdir(), lept_mv(), lept_rm() and lept_cp().
148 * (6) For programs use the lept_fopen(), lept_fclose(), lept_calloc()
149 * and lept_free() C library wrappers. These work properly on Windows,
150 * where the same DLL must perform complementary operations on
151 * file streams (open/close) and heap memory (malloc/free).
152 * (7) Why read and write files to temp directories?
153 * The library needs the ability to read and write ephemeral
154 * files to default places, both for generating debugging output
155 * and for supporting regression tests. Applications also need
156 * this ability for debugging.
157 * (8) Why do the pathname rewrite on macOS, iOS and Windows?
158 * The goal is to have the library, and programs using the library,
159 * run on multiple platforms without changes. The location of
160 * temporary files depends on the platform as well as the user's
161 * configuration. Temp files on some operating systems are in some
162 * directory not known a priori. To make everything work seamlessly on
163 * any OS, every time you open a file for reading or writing,
164 * use a special function such as fopenReadStream() or
165 * fopenWriteStream(); these call genPathname() to ensure that
166 * if it is a temp file, the correct path is used. To indicate
167 * that this is a temp file, the application is written with the
168 * root directory of the path in a canonical form: "/tmp".
169 * (9) Why is it that multi-platform directory functions like lept_mkdir()
170 * and lept_rmdir(), as well as associated file functions like
171 * lept_rm(), lept_mv() and lept_cp(), only work in the temp dir?
172 * These functions were designed to provide easy manipulation of
173 * temp files. The restriction to temp files is for safety -- to
174 * prevent an accidental deletion of important files. For example,
175 * lept_rmdir() first deletes all files in a specified subdirectory
176 * of temp, and then removes the directory.
177 *
178 * </pre>
179 */
180
181 #ifdef HAVE_CONFIG_H
182 #include <config_auto.h>
183 #endif /* HAVE_CONFIG_H */
184
185 #ifdef _MSC_VER
186 #include <process.h>
187 #include <direct.h>
188 #define getcwd _getcwd /* fix MSVC warning */
189 #else
190 #include <unistd.h>
191 #endif /* _MSC_VER */
192
193 #ifdef _WIN32
194 #include <windows.h>
195 #include <fcntl.h> /* _O_CREAT, ... */
196 #include <io.h> /* _open */
197 #include <sys/stat.h> /* _S_IREAD, _S_IWRITE */
198 #else
199 #include <sys/stat.h> /* for stat, mkdir(2) */
200 #include <sys/types.h>
201 #endif
202
203 #ifdef __APPLE__
204 #include <unistd.h>
205 #include <errno.h>
206 #endif
207
208 #include <string.h>
209 #include <stddef.h>
210 #include "allheaders.h"
211
212 #if defined(__APPLE__) || defined(_WIN32)
213 /* Rewrite paths starting with /tmp for macOS, iOS and Windows. */
214 #define REWRITE_TMP
215 #endif
216
217 /*--------------------------------------------------------------------*
218 * Safe string operations *
219 *--------------------------------------------------------------------*/
220 /*!
221 * \brief stringNew()
222 *
223 * \param[in] src
224 * \return dest copy of %src string, or NULL on error
225 */
226 char *
227 stringNew(const char *src)
228 {
229 l_int32 len;
230 char *dest;
231
232 if (!src) {
233 L_WARNING("src not defined\n", __func__);
234 return NULL;
235 }
236
237 len = strlen(src);
238 if ((dest = (char *)LEPT_CALLOC(len + 1, sizeof(char))) == NULL)
239 return (char *)ERROR_PTR("dest not made", __func__, NULL);
240
241 stringCopy(dest, src, len);
242 return dest;
243 }
244
245
246 /*!
247 * \brief stringCopy()
248 *
249 * \param[in] dest existing byte buffer
250 * \param[in] src string [optional] can be null
251 * \param[in] n max number of characters to copy
252 * \return 0 if OK, 1 on error
253 *
254 * <pre>
255 * Notes:
256 * (1) Relatively safe wrapper for strncpy, that checks the input,
257 * and does not complain if %src is null or %n < 1.
258 * If %n < 1, this is a no-op.
259 * (2) %dest needs to be at least %n bytes in size.
260 * (3) We don't call strncpy() because valgrind complains about
261 * use of uninitialized values.
262 * </pre>
263 */
264 l_ok
265 stringCopy(char *dest,
266 const char *src,
267 l_int32 n)
268 {
269 l_int32 i;
270
271 if (!dest)
272 return ERROR_INT("dest not defined", __func__, 1);
273 if (!src || n < 1)
274 return 0;
275
276 /* Implementation of strncpy that valgrind doesn't complain about */
277 for (i = 0; i < n && src[i] != '\0'; i++)
278 dest[i] = src[i];
279 for (; i < n; i++)
280 dest[i] = '\0';
281 return 0;
282 }
283
284
285 /*!
286 * \brief stringCopySegment()
287 *
288 *
289 * \param[in] src string
290 * \param[in] start byte position at start of segment
291 * \param[in] nbytes number of bytes in the segment; use 0 to go to end
292 * \return copy of segment, or NULL on error
293 *
294 * <pre>
295 * Notes:
296 * (1) This is a variant of stringNew() that makes a new string
297 * from a segment of the input string. The segment is specified
298 * by the starting position and the number of bytes.
299 * (2) The start location %start must be within the string %src.
300 * (3) The copy is truncated to the end of the source string.
301 * Use %nbytes = 0 to copy to the end of %src.
302 * </pre>
303 */
304 char *
305 stringCopySegment(const char *src,
306 l_int32 start,
307 l_int32 nbytes)
308 {
309 char *dest;
310 l_int32 len;
311
312 if (!src)
313 return (char *)ERROR_PTR("src not defined", __func__, NULL);
314 len = strlen(src);
315 if (start < 0 || start > len - 1)
316 return (char *)ERROR_PTR("invalid start", __func__, NULL);
317 if (nbytes <= 0) /* copy to the end */
318 nbytes = len - start;
319 if (start + nbytes > len) /* truncate to the end */
320 nbytes = len - start;
321 if ((dest = (char *)LEPT_CALLOC(nbytes + 1, sizeof(char))) == NULL)
322 return (char *)ERROR_PTR("dest not made", __func__, NULL);
323 stringCopy(dest, src + start, nbytes);
324 return dest;
325 }
326
327
328 /*!
329 * \brief stringReplace()
330 *
331 * \param[out] pdest string copy
332 * \param[in] src [optional] string; can be null
333 * \return 0 if OK; 1 on error
334 *
335 * <pre>
336 * Notes:
337 * (1) Frees any existing dest string
338 * (2) Puts a copy of src string in the dest
339 * (3) If either or both strings are null, does something reasonable.
340 * </pre>
341 */
342 l_ok
343 stringReplace(char **pdest,
344 const char *src)
345 {
346 if (!pdest)
347 return ERROR_INT("pdest not defined", __func__, 1);
348
349 if (*pdest)
350 LEPT_FREE(*pdest);
351
352 if (src)
353 *pdest = stringNew(src);
354 else
355 *pdest = NULL;
356 return 0;
357 }
358
359
360 /*!
361 * \brief stringLength()
362 *
363 * \param[in] src string can be null or NULL-terminated string
364 * \param[in] size number of bytes to check; e.g., size of src buffer
365 * \return length of src in bytes; 0 if no bytes are found;
366 * %size on error when NUL byte is not found.
367 *
368 * <pre>
369 * Notes:
370 * (1) Safe implementation of strlen that only checks %size bytes
371 * for trailing NUL.
372 * (2) Valid returned string lengths are between 0 and size - 1.
373 * If %size bytes are checked without finding a NUL byte, then
374 * an error is indicated by returning %size.
375 * </pre>
376 */
377 l_int32
378 stringLength(const char *src,
379 size_t size)
380 {
381 l_int32 i;
382
383 if (!src)
384 return 0;
385 if (size < 1)
386 return ERROR_INT("size < 1; too small", __func__, 0);
387
388 for (i = 0; i < size; i++) {
389 if (src[i] == '\0')
390 return i;
391 }
392
393 /* Didn't find a NUL byte */
394 L_ERROR("NUL byte not found in %zu bytes\n", __func__, size);
395 return size;
396 }
397
398
399 /*!
400 * \brief stringCat()
401 *
402 * \param[in] dest null-terminated byte buffer
403 * \param[in] size size of dest buffer
404 * \param[in] src string can be null or NULL-terminated string
405 * \return number of bytes added to dest; -1 on error
406 *
407 * <pre>
408 * Notes:
409 * (1) Alternative implementation of strncat, that checks the input,
410 * is easier to use (since the size of the dest buffer is specified
411 * rather than the number of bytes to copy), and does not complain
412 * if %src is null.
413 * (2) Never writes past end of dest.
414 * (3) If there is not enough room to append the src, which is an error,
415 * it does nothing.
416 * (4) N.B. The order of 2nd and 3rd args is reversed from that in
417 * strncat, as in the Windows function strcat_s().
418 * </pre>
419 */
420 l_int32
421 stringCat(char *dest,
422 size_t size,
423 const char *src)
424 {
425 l_int32 i, n;
426 l_int32 lendest, lensrc;
427
428 if (!dest)
429 return ERROR_INT("dest not defined", __func__, -1);
430 if (size < 1)
431 return ERROR_INT("size < 1; too small", __func__, -1);
432 if (!src)
433 return 0;
434
435 lendest = stringLength(dest, size);
436 if (lendest == size)
437 return ERROR_INT("no terminating nul byte", __func__, -1);
438 lensrc = stringLength(src, size);
439 if (lensrc == 0)
440 return 0; /* nothing added to dest */
441 n = (lendest + lensrc > size - 1) ? 0 : lensrc;
442 if (n == 0)
443 return ERROR_INT("dest too small for append", __func__, -1);
444
445 for (i = 0; i < n; i++)
446 dest[lendest + i] = src[i];
447 dest[lendest + n] = '\0';
448 return n;
449 }
450
451
/*!
 * \brief   stringConcatNew()
 *
 * \param[in]    first   first string in list
 * \param[in]    ...     NULL-terminated list of strings
 * \return  result new string concatenating the input strings, or
 *                 NULL if first == NULL or if the allocation fails
 *
 * <pre>
 * Notes:
 *      (1) The last arg in the list of strings must be NULL.
 *      (2) Caller must free the returned string.
 *      (3) The argument list is walked twice: once to sum the lengths
 *          and once to copy the bytes.  The output buffer is
 *          zero-filled and one byte longer than the summed length,
 *          so the result is NUL-terminated without an explicit store.
 * </pre>
 */
char *
stringConcatNew(const char  *first, ...)
{
    size_t       len;
    char        *result, *ptr;
    const char  *arg;
    va_list      args;

    if (!first) return NULL;

        /* Find the length of the output string */
    va_start(args, first);
    len = strlen(first);
    while ((arg = va_arg(args, const char *)) != NULL)
        len += strlen(arg);
    va_end(args);

        /* Without this check a failed allocation would be written
         * through a null pointer in the copy loops below. */
    if ((result = (char *)LEPT_CALLOC(len + 1, sizeof(char))) == NULL)
        return (char *)ERROR_PTR("result not made", __func__, NULL);

        /* Concatenate the args */
    va_start(args, first);
    ptr = result;
    arg = first;
    while (*arg)
        *ptr++ = *arg++;
    while ((arg = va_arg(args, const char *)) != NULL) {
        while (*arg)
            *ptr++ = *arg++;
    }
    va_end(args);
    return result;
}
497
498
499 /*!
500 * \brief stringJoin()
501 *
502 * \param[in] src1 [optional] string; can be null
503 * \param[in] src2 [optional] string; can be null
504 * \return concatenated string, or NULL on error
505 *
506 * <pre>
507 * Notes:
508 * (1) This is a safe version of strcat; it makes a new string.
509 * (2) It is not an error if either or both of the strings
510 * are empty, or if either or both of the pointers are null.
511 * </pre>
512 */
513 char *
514 stringJoin(const char *src1,
515 const char *src2)
516 {
517 char *dest;
518 l_int32 srclen1, srclen2, destlen;
519
520 srclen1 = (src1) ? strlen(src1) : 0;
521 srclen2 = (src2) ? strlen(src2) : 0;
522 destlen = srclen1 + srclen2 + 3;
523
524 if ((dest = (char *)LEPT_CALLOC(destlen, sizeof(char))) == NULL)
525 return (char *)ERROR_PTR("calloc fail for dest", __func__, NULL);
526
527 if (src1)
528 stringCat(dest, destlen, src1);
529 if (src2)
530 stringCat(dest, destlen, src2);
531 return dest;
532 }
533
534
535 /*!
536 * \brief stringJoinIP()
537 *
538 * \param[in,out] psrc1 address of string src1; cannot be on the stack
539 * \param[in] src2 [optional] string; can be null
540 * \return 0 if OK, 1 on error
541 *
542 * <pre>
543 * Notes:
544 * (1) This is a safe in-place version of strcat. The contents of
545 * src1 is replaced by the concatenation of src1 and src2.
546 * (2) It is not an error if either or both of the strings
547 * are empty (""), or if the pointers to the strings (*psrc1, src2)
548 * are null.
549 * (3) src1 should be initialized to null or an empty string
550 * before the first call. Use one of these:
551 * char *src1 = NULL;
552 * char *src1 = stringNew("");
553 * Then call with:
554 * stringJoinIP(&src1, src2);
555 * (4) This can also be implemented as a macro:
556 * \code
557 * #define stringJoinIP(src1, src2) \
558 * {tmpstr = stringJoin((src1),(src2)); \
559 * LEPT_FREE(src1); \
560 * (src1) = tmpstr;}
561 * \endcode
562 * (5) Another function to consider for joining many strings is
563 * stringConcatNew().
564 * </pre>
565 */
566 l_ok
567 stringJoinIP(char **psrc1,
568 const char *src2)
569 {
570 char *tmpstr;
571
572 if (!psrc1)
573 return ERROR_INT("&src1 not defined", __func__, 1);
574
575 tmpstr = stringJoin(*psrc1, src2);
576 LEPT_FREE(*psrc1);
577 *psrc1 = tmpstr;
578 return 0;
579 }
580
581
582 /*!
583 * \brief stringReverse()
584 *
585 * \param[in] src string
586 * \return dest newly-allocated reversed string
587 */
588 char *
589 stringReverse(const char *src)
590 {
591 char *dest;
592 l_int32 i, len;
593
594 if (!src)
595 return (char *)ERROR_PTR("src not defined", __func__, NULL);
596 len = strlen(src);
597 if ((dest = (char *)LEPT_CALLOC(len + 1, sizeof(char))) == NULL)
598 return (char *)ERROR_PTR("calloc fail for dest", __func__, NULL);
599 for (i = 0; i < len; i++)
600 dest[i] = src[len - 1 - i];
601
602 return dest;
603 }
604
605
606 /*!
607 * \brief strtokSafe()
608 *
609 * \param[in] cstr input string to be sequentially parsed;
610 * use NULL after the first call
611 * \param[in] seps a string of character separators
612 * \param[out] psaveptr ptr to the next char after
613 * the last encountered separator
614 * \return substr a new string that is copied from the previous
615 * saveptr up to but not including the next
616 * separator character, or NULL if end of cstr.
617 *
618 * <pre>
619 * Notes:
620 * (1) This is a thread-safe implementation of strtok.
621 * (2) It has the same interface as strtok_r.
622 * (3) It differs from strtok_r in usage in two respects:
623 * (a) the input string is not altered
624 * (b) each returned substring is newly allocated and must
625 * be freed after use.
626 * (4) Let me repeat that. This is "safe" because the input
627 * string is not altered and because each returned string
628 * is newly allocated on the heap.
629 * (5) It is here because, surprisingly, some C libraries don't
630 * include strtok_r.
631 * (6) Important usage points:
632 * ~ Input the string to be parsed on the first invocation.
633 * ~ Then input NULL after that; the value returned in saveptr
634 * is used in all subsequent calls.
635 * (7) This is only slightly slower than strtok_r.
636 * </pre>
637 */
638 char *
639 strtokSafe(char *cstr,
640 const char *seps,
641 char **psaveptr)
642 {
643 char nextc;
644 char *start, *substr;
645 l_int32 istart, i, j, nchars;
646
647 if (!seps)
648 return (char *)ERROR_PTR("seps not defined", __func__, NULL);
649 if (!psaveptr)
650 return (char *)ERROR_PTR("&saveptr not defined", __func__, NULL);
651
652 if (!cstr) {
653 start = *psaveptr;
654 } else {
655 start = cstr;
656 *psaveptr = NULL;
657 }
658 if (!start) /* nothing to do */
659 return NULL;
660
661 /* First time, scan for the first non-sep character */
662 istart = 0;
663 if (cstr) {
664 for (istart = 0;; istart++) {
665 if ((nextc = start[istart]) == '\0') {
666 *psaveptr = NULL; /* in case caller doesn't check ret value */
667 return NULL;
668 }
669 if (!strchr(seps, nextc))
670 break;
671 }
672 }
673
674 /* Scan through, looking for a sep character; if none is
675 * found, 'i' will be at the end of the string. */
676 for (i = istart;; i++) {
677 if ((nextc = start[i]) == '\0')
678 break;
679 if (strchr(seps, nextc))
680 break;
681 }
682
683 /* Save the substring */
684 nchars = i - istart;
685 substr = (char *)LEPT_CALLOC(nchars + 1, sizeof(char));
686 stringCopy(substr, start + istart, nchars);
687
688 /* Look for the next non-sep character.
689 * If this is the last substring, return a null saveptr. */
690 for (j = i;; j++) {
691 if ((nextc = start[j]) == '\0') {
692 *psaveptr = NULL; /* no more non-sep characters */
693 break;
694 }
695 if (!strchr(seps, nextc)) {
696 *psaveptr = start + j; /* start here on next call */
697 break;
698 }
699 }
700
701 return substr;
702 }
703
704
705 /*!
706 * \brief stringSplitOnToken()
707 *
708 * \param[in] cstr input string to be split; not altered
709 * \param[in] seps a string of character separators
710 * \param[out] phead ptr to copy of the input string, up to
711 * the first separator token encountered
712 * \param[out] ptail ptr to copy of the part of the input string
713 * starting with the first non-separator character
714 * that occurs after the first separator is found
715 * \return 0 if OK, 1 on error
716 *
717 * <pre>
718 * Notes:
719 * (1) The input string is not altered; all split parts are new strings.
720 * (2) The split occurs around the first consecutive sequence of
721 * tokens encountered.
722 * (3) The head goes from the beginning of the string up to
723 * but not including the first token found.
724 * (4) The tail contains the second part of the string, starting
725 * with the first char in that part that is NOT a token.
726 * (5) If no separator token is found, 'head' contains a copy
727 * of the input string and 'tail' is null.
728 * </pre>
729 */
730 l_ok
731 stringSplitOnToken(char *cstr,
732 const char *seps,
733 char **phead,
734 char **ptail)
735 {
736 char *saveptr;
737
738 if (!phead)
739 return ERROR_INT("&head not defined", __func__, 1);
740 if (!ptail)
741 return ERROR_INT("&tail not defined", __func__, 1);
742 *phead = *ptail = NULL;
743 if (!cstr)
744 return ERROR_INT("cstr not defined", __func__, 1);
745 if (!seps)
746 return ERROR_INT("seps not defined", __func__, 1);
747
748 *phead = strtokSafe(cstr, seps, &saveptr);
749 if (saveptr)
750 *ptail = stringNew(saveptr);
751 return 0;
752 }
753
754
755 /*--------------------------------------------------------------------*
756 * Find and replace procs *
757 *--------------------------------------------------------------------*/
758 /*!
759 * \brief stringCheckForChars()
760 *
761 * \param[in] src input string; can be of zero length
762 * \param[in] chars string of chars to be searched for in %src
763 * \param[out] pfound 1 if any characters are found; 0 otherwise
764 * \return 0 if OK, 1 on error
765 *
766 * <pre>
767 * Notes:
768 * (1) This can be used to sanitize an operation by checking for
769 * special characters that don't belong in a string.
770 * </pre>
771 */
772 l_ok
773 stringCheckForChars(const char *src,
774 const char *chars,
775 l_int32 *pfound)
776 {
777 char ch;
778 l_int32 i, n;
779
780 if (!pfound)
781 return ERROR_INT("&found not defined", __func__, 1);
782 *pfound = FALSE;
783 if (!src || !chars)
784 return ERROR_INT("src and chars not both defined", __func__, 1);
785
786 n = strlen(src);
787 for (i = 0; i < n; i++) {
788 ch = src[i];
789 if (strchr(chars, ch)) {
790 *pfound = TRUE;
791 break;
792 }
793 }
794 return 0;
795 }
796
797
/*!
 * \brief   stringRemoveChars()
 *
 * \param[in]    src        input string; can be of zero length
 * \param[in]    remchars   string of chars to be removed from src;
 *                          can be null
 * \return  dest string with specified chars removed, or NULL on error
 *
 * <pre>
 * Notes:
 *      (1) If %remchars is null, a plain copy of %src is returned.
 *      (2) The output buffer is allocated zero-filled at the full size
 *          of %src, so the result is NUL-terminated however many
 *          bytes are dropped.
 * </pre>
 */
char *
stringRemoveChars(const char  *src,
                  const char  *remchars)
{
    const char  *in;
    char        *dest, *out;

    if (src == NULL)
        return (char *)ERROR_PTR("src not defined", __func__, NULL);
    if (remchars == NULL)
        return stringNew(src);

    dest = (char *)LEPT_CALLOC(strlen(src) + 1, sizeof(char));
    if (dest == NULL)
        return (char *)ERROR_PTR("dest not made", __func__, NULL);

        /* Keep only the bytes that do not occur in remchars */
    out = dest;
    for (in = src; *in != '\0'; in++) {
        if (strchr(remchars, *in) == NULL)
            *out++ = *in;
    }

    return dest;
}
829
830
831 /*!
832 * \brief stringReplaceEachSubstr()
833 *
834 * \param[in] src input string; can be of zero length
835 * \param[in] sub1 substring to be replaced
836 * \param[in] sub2 substring to put in; can be ""
837 * \param[out] pcount [optional] the number of times that sub1
838 * is found in src; 0 if not found
839 * \return dest string with substring replaced, or NULL if the
840 * substring not found or on error.
841 *
842 * <pre>
843 * Notes:
844 * (1) This is a wrapper for simple string substitution that uses
845 * the more general function arrayReplaceEachSequence().
846 * (2) This finds every non-overlapping occurrence of %sub1 in
847 * %src, and replaces it with %sub2. By "non-overlapping"
848 * we mean that after it finds each match, it removes the
849 * matching characters, replaces with the substitution string
850 * (if not empty), and continues. For example, if you replace
851 * 'aa' by 'X' in 'baaabbb', you find one match at position 1
852 * and return 'bXabbb'.
853 * (3) To only remove each instance of sub1, use "" for sub2
854 * (4) Returns a copy of %src if sub1 and sub2 are the same.
855 * (5) If the input %src is binary data that can have null characters,
856 * use arrayReplaceEachSequence() directly.
857 * </pre>
858 */
859 char *
860 stringReplaceEachSubstr(const char *src,
861 const char *sub1,
862 const char *sub2,
863 l_int32 *pcount)
864 {
865 size_t datalen;
866
867 if (pcount) *pcount = 0;
868 if (!src || !sub1 || !sub2)
869 return (char *)ERROR_PTR("src, sub1, sub2 not all defined",
870 __func__, NULL);
871
872 if (strlen(sub2) > 0) {
873 return (char *)arrayReplaceEachSequence(
874 (const l_uint8 *)src, strlen(src),
875 (const l_uint8 *)sub1, strlen(sub1),
876 (const l_uint8 *)sub2, strlen(sub2),
877 &datalen, pcount);
878 } else { /* empty replacement string; removal only */
879 return (char *)arrayReplaceEachSequence(
880 (const l_uint8 *)src, strlen(src),
881 (const l_uint8 *)sub1, strlen(sub1),
882 NULL, 0, &datalen, pcount);
883 }
884 }
885
886
887 /*!
888 * \brief stringReplaceSubstr()
889 *
890 * \param[in] src input string; can be of zero length
891 * \param[in] sub1 substring to be replaced
892 * \param[in] sub2 substring to put in; can be ""
893 * \param[in,out] ploc [optional] input start location for search;
894 * returns the loc after replacement
895 * \param[out] pfound [optional] 1 if sub1 is found; 0 otherwise
896 * \return dest string with substring replaced, or NULL on error.
897 *
898 * <pre>
899 * Notes:
900 * (1) Replaces the first instance.
901 * (2) To remove sub1 without replacement, use "" for sub2.
902 * (3) Returns a copy of %src if either no instance of %sub1 is found,
903 * or if %sub1 and %sub2 are the same.
904 * (4) If %ploc == NULL, the search will start at the beginning of %src.
905 * If %ploc != NULL, *ploc must be initialized to the byte offset
906 * within %src from which the search starts. To search the
907 * string from the beginning, set %loc = 0 and input &loc.
908 * After finding %sub1 and replacing it with %sub2, %loc will be
909 * returned as the next position after %sub2 in the output string.
910 * (5) Note that the output string also includes all the characters
911 * from the input string that occur after the single substitution.
912 * </pre>
913 */
914 char *
915 stringReplaceSubstr(const char *src,
916 const char *sub1,
917 const char *sub2,
918 l_int32 *ploc,
919 l_int32 *pfound)
920 {
921 const char *ptr;
922 char *dest;
923 l_int32 nsrc, nsub1, nsub2, len, npre, loc;
924
925 if (pfound) *pfound = 0;
926 if (!src || !sub1 || !sub2)
927 return (char *)ERROR_PTR("src, sub1, sub2 not all defined",
928 __func__, NULL);
929
930 if (ploc)
931 loc = *ploc;
932 else
933 loc = 0;
934 if (!strcmp(sub1, sub2))
935 return stringNew(src);
936 if ((ptr = strstr(src + loc, sub1)) == NULL)
937 return stringNew(src);
938 if (pfound) *pfound = 1;
939
940 nsrc = strlen(src);
941 nsub1 = strlen(sub1);
942 nsub2 = strlen(sub2);
943 len = nsrc + nsub2 - nsub1;
944 if ((dest = (char *)LEPT_CALLOC(len + 1, sizeof(char))) == NULL)
945 return (char *)ERROR_PTR("dest not made", __func__, NULL);
946 npre = ptr - src;
947 memcpy(dest, src, npre);
948 strcpy(dest + npre, sub2);
949 strcpy(dest + npre + nsub2, ptr + nsub1);
950 if (ploc) *ploc = npre + nsub2;
951 return dest;
952 }
953
954
955 /*!
956 * \brief stringFindEachSubstr()
957 *
958 * \param[in] src input string; can be of zero length
959 * \param[in] sub substring to be searched for
960 * \return dna of offsets where the sequence is found, or NULL if
961 * none are found or on error
962 *
963 * <pre>
964 * Notes:
965 * (1) This finds every non-overlapping occurrence in %src of %sub.
966 * After it finds each match, it moves forward in %src by the length
967 * of %sub before continuing the search. So for example,
968 * if you search for the sequence 'aa' in the data 'baaabbb',
969 * you find one match at position 1.
970
971 * </pre>
972 */
973 L_DNA *
974 stringFindEachSubstr(const char *src,
975 const char *sub)
976 {
977 if (!src || !sub)
978 return (L_DNA *)ERROR_PTR("src, sub not both defined", __func__, NULL);
979
980 return arrayFindEachSequence((const l_uint8 *)src, strlen(src),
981 (const l_uint8 *)sub, strlen(sub));
982 }
983
984
985 /*!
986 * \brief stringFindSubstr()
987 *
988 * \param[in] src input string; can be of zero length
989 * \param[in] sub substring to be searched for; must not be empty
990 * \param[out] ploc [optional] location of substring in src
991 * \return 1 if found; 0 if not found or on error
992 *
993 * <pre>
994 * Notes:
995 * (1) This is a wrapper around strstr(). It finds the first
996 * instance of %sub in %src. If the substring is not found
997 * and the location is returned, it has the value -1.
998 * (2) Both %src and %sub must be defined, and %sub must have
999 * length of at least 1.
1000 * </pre>
1001 */
1002 l_int32
1003 stringFindSubstr(const char *src,
1004 const char *sub,
1005 l_int32 *ploc)
1006 {
1007 const char *ptr;
1008
1009 if (ploc) *ploc = -1;
1010 if (!src || !sub)
1011 return ERROR_INT("src and sub not both defined", __func__, 0);
1012 if (strlen(sub) == 0)
1013 return ERROR_INT("substring length 0", __func__, 0);
1014 if (strlen(src) == 0)
1015 return 0;
1016
1017 if ((ptr = strstr(src, sub)) == NULL) /* not found */
1018 return 0;
1019
1020 if (ploc)
1021 *ploc = ptr - src;
1022 return 1;
1023 }
1024
1025
1026 /*!
1027 * \brief arrayReplaceEachSequence()
1028 *
1029 * \param[in] datas source byte array
1030 * \param[in] dataslen length of source data, in bytes
1031 * \param[in] seq subarray of bytes to find in source data
1032 * \param[in] seqlen length of subarray, in bytes
1033 * \param[in] newseq replacement subarray; can be null
1034 * \param[in] newseqlen length of replacement subarray, in bytes
1035 * \param[out] pdatadlen length of dest byte array, in bytes
1036 * \param[out] pcount [optional] the number of times that sub1
1037 * is found in src; 0 if not found
1038 * \return datad with all all subarrays replaced (or removed)
1039 *
1040 * <pre>
1041 * Notes:
1042 * (1) The byte arrays %datas, %seq and %newseq are not C strings,
1043 * because they can contain null bytes. Therefore, for each
1044 * we must give the length of the array.
1045 * (2) If %newseq == NULL, this just removes all instances of %seq.
1046 * Otherwise, it replaces every non-overlapping occurrence of
1047 * %seq in %datas with %newseq. A new array %datad and its
1048 * size are returned. See arrayFindEachSequence() for more
1049 * details on finding non-overlapping occurrences.
1050 * (3) If no instances of %seq are found, this returns a copy of %datas.
1051 * (4) The returned %datad is null terminated.
1052 * (5) Can use stringReplaceEachSubstr() if using C strings.
1053 * </pre>
1054 */
1055 l_uint8 *
1056 arrayReplaceEachSequence(const l_uint8 *datas,
1057 size_t dataslen,
1058 const l_uint8 *seq,
1059 size_t seqlen,
1060 const l_uint8 *newseq,
1061 size_t newseqlen,
1062 size_t *pdatadlen,
1063 l_int32 *pcount)
1064 {
1065 l_uint8 *datad;
1066 size_t newsize;
1067 l_int32 n, i, j, di, si, index, incr;
1068 L_DNA *da;
1069
1070 if (pcount) *pcount = 0;
1071 if (!datas || !seq)
1072 return (l_uint8 *)ERROR_PTR("datas & seq not both defined",
1073 __func__, NULL);
1074 if (!pdatadlen)
1075 return (l_uint8 *)ERROR_PTR("&datadlen not defined", __func__, NULL);
1076 *pdatadlen = 0;
1077
1078 /* Identify the locations of the sequence. If there are none,
1079 * return a copy of %datas. */
1080 if ((da = arrayFindEachSequence(datas, dataslen, seq, seqlen)) == NULL) {
1081 *pdatadlen = dataslen;
1082 return l_binaryCopy(datas, dataslen);
1083 }
1084
1085 /* Allocate the output data; insure null termination */
1086 n = l_dnaGetCount(da);
1087 if (pcount) *pcount = n;
1088 if (!newseq) newseqlen = 0;
1089 newsize = dataslen + n * (newseqlen - seqlen) + 4;
1090 if ((datad = (l_uint8 *)LEPT_CALLOC(newsize, sizeof(l_uint8))) == NULL) {
1091 l_dnaDestroy(&da);
1092 return (l_uint8 *)ERROR_PTR("datad not made", __func__, NULL);
1093 }
1094
1095 /* Replace each sequence instance with a new sequence */
1096 l_dnaGetIValue(da, 0, &si);
1097 for (i = 0, di = 0, index = 0; i < dataslen; i++) {
1098 if (i == si) {
1099 index++;
1100 if (index < n) {
1101 l_dnaGetIValue(da, index, &si);
1102 incr = L_MIN(seqlen, si - i); /* amount to remove from datas */
1103 } else {
1104 incr = seqlen;
1105 }
1106 i += incr - 1; /* jump over the matched sequence in datas */
1107 if (newseq) { /* add new sequence to datad */
1108 for (j = 0; j < newseqlen; j++)
1109 datad[di++] = newseq[j];
1110 }
1111 } else {
1112 datad[di++] = datas[i];
1113 }
1114 }
1115
1116 *pdatadlen = di;
1117 l_dnaDestroy(&da);
1118 return datad;
1119 }
1120
1121
1122 /*!
1123 * \brief arrayFindEachSequence()
1124 *
1125 * \param[in] data byte array
1126 * \param[in] datalen length of data, in bytes
1127 * \param[in] sequence subarray of bytes to find in data
1128 * \param[in] seqlen length of sequence, in bytes
1129 * \return dna of offsets where the sequence is found, or NULL if
1130 * none are found or on error
1131 *
1132 * <pre>
1133 * Notes:
1134 * (1) The byte arrays %data and %sequence are not C strings,
1135 * because they can contain null bytes. Therefore, for each
1136 * we must give the length of the array.
1137 * (2) This finds every non-overlapping occurrence in %data of %sequence.
1138 * After it finds each match, it moves forward by the length
1139 * of the sequence before continuing the search. So for example,
1140 * if you search for the sequence 'aa' in the data 'baaabbb',
1141 * you find one match at position 1.
1142 * </pre>
1143 */
1144 L_DNA *
1145 arrayFindEachSequence(const l_uint8 *data,
1146 size_t datalen,
1147 const l_uint8 *sequence,
1148 size_t seqlen)
1149 {
1150 l_int32 start, offset, realoffset, found;
1151 L_DNA *da;
1152
1153 if (!data || !sequence)
1154 return (L_DNA *)ERROR_PTR("data & sequence not both defined",
1155 __func__, NULL);
1156
1157 da = l_dnaCreate(0);
1158 start = 0;
1159 while (1) {
1160 arrayFindSequence(data + start, datalen - start, sequence, seqlen,
1161 &offset, &found);
1162 if (found == FALSE)
1163 break;
1164
1165 realoffset = start + offset;
1166 l_dnaAddNumber(da, realoffset);
1167 start = realoffset + seqlen;
1168 if (start >= datalen)
1169 break;
1170 }
1171
1172 if (l_dnaGetCount(da) == 0)
1173 l_dnaDestroy(&da);
1174 return da;
1175 }
1176
1177
1178 /*!
1179 * \brief arrayFindSequence()
1180 *
1181 * \param[in] data byte array
1182 * \param[in] datalen length of data, in bytes
1183 * \param[in] sequence subarray of bytes to find in data
1184 * \param[in] seqlen length of sequence, in bytes
1185 * \param[out] poffset offset from beginning of
1186 * data where the sequence begins
1187 * \param[out] pfound 1 if sequence is found; 0 otherwise
1188 * \return 0 if OK, 1 on error
1189 *
1190 * <pre>
1191 * Notes:
1192 * (1) The byte arrays 'data' and 'sequence' are in general not C strings,
1193 * because they can contain null bytes. Therefore, for each
1194 * we must give the length of the array.
1195 * (2) This searches for the first occurrence in %data of %sequence,
1196 * which consists of %seqlen bytes. The parameter %seqlen
1197 * must not exceed the actual length of the %sequence byte array.
1198 * (3) If either byte array is a C string, cast the array to
1199 * (const l_uint8 *) and use strlen() on the string for its length.
1200 * (4) If the sequence is not found, the offset will be 0, so you
1201 * must check %found.
1202 * </pre>
1203 */
1204 l_ok
1205 arrayFindSequence(const l_uint8 *data,
1206 size_t datalen,
1207 const l_uint8 *sequence,
1208 size_t seqlen,
1209 l_int32 *poffset,
1210 l_int32 *pfound)
1211 {
1212 l_int32 i, j, found, lastpos;
1213
1214 if (poffset) *poffset = 0;
1215 if (pfound) *pfound = FALSE;
1216 if (!data || !sequence)
1217 return ERROR_INT("data & sequence not both defined", __func__, 1);
1218 if (!poffset || !pfound)
1219 return ERROR_INT("&offset and &found not defined", __func__, 1);
1220
1221 lastpos = datalen - seqlen + 1;
1222 found = FALSE;
1223 for (i = 0; i < lastpos; i++) {
1224 for (j = 0; j < seqlen; j++) {
1225 if (data[i + j] != sequence[j])
1226 break;
1227 if (j == seqlen - 1)
1228 found = TRUE;
1229 }
1230 if (found == TRUE)
1231 break;
1232 }
1233
1234 if (found == TRUE) {
1235 *poffset = i;
1236 *pfound = TRUE;
1237 }
1238 return 0;
1239 }
1240
1241
1242 /*--------------------------------------------------------------------*
1243 * Safe realloc *
1244 *--------------------------------------------------------------------*/
/*!
 * \brief   reallocNew()
 *
 * \param[in,out]  pindata   address of ptr to the input data; the ptr
 *                           is nulled when the input data is freed
 * \param[in]      oldsize   number of bytes of input data to copy
 * \param[in]      newsize   size in bytes of the new buffer
 * \return  ptr to new data, or NULL on error
 *
 *  Action:  !N.B. (3) and (4)!
 *      (1) Allocates memory, initialized to 0
 *      (2) Copies as much of the input data as possible
 *          to the new block, truncating the copy if necessary
 *      (3) Frees the input data
 *      (4) Zeroes the input data ptr
 *
 * <pre>
 * Notes:
 *      (1) If %newsize == 0, the input data is freed, its ptr is
 *          nulled, and NULL is returned.
 *      (2) If the input data is null, this only callocs new memory.
 *      (3) Unlike realloc(), this always allocates a new zeroed block
 *          (for %newsize > 0), needs to be told how much old data to
 *          copy, and takes the address of the input ptr so that it
 *          can null the handle.
 *      (4) If the allocation fails, the input data is left untouched.
 * </pre>
 */
void *
reallocNew(void   **pindata,
           size_t   oldsize,
           size_t   newsize)
{
    size_t  ncopy;
    void   *olddata;
    void   *newdata;

    if (!pindata)
        return ERROR_PTR("input data not defined", __func__, NULL);
    olddata = *pindata;

        /* Nonstandard usage: a zero size just releases the input */
    if (newsize == 0) {
        if (olddata != NULL) {
            LEPT_FREE(olddata);
            *pindata = NULL;
        }
        return NULL;
    }

        /* Always start from a fresh, zeroed block */
    newdata = (void *)LEPT_CALLOC(1, newsize);
    if (newdata == NULL)
        return ERROR_PTR("newdata not made", __func__, NULL);

        /* Standard usage: carry over what fits, then drop the old
         * block.  With no input data there is nothing more to do. */
    if (olddata != NULL) {
        ncopy = (oldsize < newsize) ? oldsize : newsize;
        memcpy(newdata, olddata, ncopy);
        LEPT_FREE(olddata);
        *pindata = NULL;
    }
    return newdata;
}
1307
1308
1309 /*--------------------------------------------------------------------*
1310 * Read and write between file and memory *
1311 *--------------------------------------------------------------------*/
1312 /*!
1313 * \brief l_binaryRead()
1314 *
1315 * \param[in] filename
1316 * \param[out] pnbytes number of bytes read
1317 * \return data, or NULL on error
1318 */
1319 l_uint8 *
1320 l_binaryRead(const char *filename,
1321 size_t *pnbytes)
1322 {
1323 l_uint8 *data;
1324 FILE *fp;
1325
1326 if (!pnbytes)
1327 return (l_uint8 *)ERROR_PTR("pnbytes not defined", __func__, NULL);
1328 *pnbytes = 0;
1329 if (!filename)
1330 return (l_uint8 *)ERROR_PTR("filename not defined", __func__, NULL);
1331
1332 if ((fp = fopenReadStream(filename)) == NULL)
1333 return (l_uint8 *)ERROR_PTR_1("file stream not opened",
1334 filename, __func__, NULL);
1335 data = l_binaryReadStream(fp, pnbytes);
1336 fclose(fp);
1337 return data;
1338 }
1339
1340
1341 /*!
1342 * \brief l_binaryReadStream()
1343 *
1344 * \param[in] fp file stream opened to read; can be stdin
1345 * \param[out] pnbytes number of bytes read
1346 * \return null-terminated array, or NULL on error; reading 0 bytes
1347 * is not an error
1348 *
1349 * <pre>
1350 * Notes:
1351 * (1) The returned array is terminated with a null byte so that it can
1352 * be used to read ascii data from a file into a proper C string.
1353 * (2) This can be used to capture data that is piped in via stdin,
1354 * because it does not require seeking within the file.
1355 * (3) For example, you can read an image from stdin into memory
1356 * using shell redirection, with one of these shell commands:
1357 * \code
1358 * cat <imagefile> | readprog
1359 * readprog < <imagefile>
1360 * \endcode
1361 * where readprog is:
1362 * \code
1363 * l_uint8 *data = l_binaryReadStream(stdin, &nbytes);
1364 * Pix *pix = pixReadMem(data, nbytes);
1365 * \endcode
1366 * </pre>
1367 */
1368 l_uint8 *
1369 l_binaryReadStream(FILE *fp,
1370 size_t *pnbytes)
1371 {
1372 l_uint8 *data;
1373 l_int32 seekable, navail, nadd, nread;
1374 L_BBUFFER *bb;
1375
1376 if (!pnbytes)
1377 return (l_uint8 *)ERROR_PTR("&nbytes not defined", __func__, NULL);
1378 *pnbytes = 0;
1379 if (!fp)
1380 return (l_uint8 *)ERROR_PTR("fp not defined", __func__, NULL);
1381
1382 /* Test if the stream is seekable, by attempting to seek to
1383 * the start of data. This is a no-op. If it is seekable, use
1384 * l_binaryReadSelectStream() to determine the size of the
1385 * data to be read in advance. */
1386 seekable = (ftell(fp) == 0) ? 1 : 0;
1387 if (seekable)
1388 return l_binaryReadSelectStream(fp, 0, 0, pnbytes);
1389
1390 /* If it is not seekable, use the bbuffer to realloc memory
1391 * as needed during reading. */
1392 bb = bbufferCreate(NULL, 4096);
1393 while (1) {
1394 navail = bb->nalloc - bb->n;
1395 if (navail < 4096) {
1396 nadd = L_MAX(bb->nalloc, 4096);
1397 bbufferExtendArray(bb, nadd);
1398 }
1399 nread = fread((void *)(bb->array + bb->n), 1, 4096, fp);
1400 bb->n += nread;
1401 if (nread != 4096) break;
1402 }
1403
1404 /* Copy the data to a new array sized for the data, because
1405 * the bbuffer array can be nearly twice the size we need. */
1406 if ((data = (l_uint8 *)LEPT_CALLOC(bb->n + 1, sizeof(l_uint8))) != NULL) {
1407 memcpy(data, bb->array, bb->n);
1408 *pnbytes = bb->n;
1409 } else {
1410 L_ERROR("calloc fail for data\n", __func__);
1411 }
1412
1413 bbufferDestroy(&bb);
1414 return data;
1415 }
1416
1417
1418 /*!
1419 * \brief l_binaryReadSelect()
1420 *
1421 * \param[in] filename
1422 * \param[in] start first byte to read
1423 * \param[in] nbytes number of bytes to read; use 0 to read to end of file
1424 * \param[out] pnread number of bytes actually read
1425 * \return data, or NULL on error
1426 *
1427 * <pre>
1428 * Notes:
1429 * (1) The returned array is terminated with a null byte so that it can
1430 * be used to read ascii data from a file into a proper C string.
1431 * </pre>
1432 */
1433 l_uint8 *
1434 l_binaryReadSelect(const char *filename,
1435 size_t start,
1436 size_t nbytes,
1437 size_t *pnread)
1438 {
1439 l_uint8 *data;
1440 FILE *fp;
1441
1442 if (!pnread)
1443 return (l_uint8 *)ERROR_PTR("pnread not defined", __func__, NULL);
1444 *pnread = 0;
1445 if (!filename)
1446 return (l_uint8 *)ERROR_PTR("filename not defined", __func__, NULL);
1447
1448 if ((fp = fopenReadStream(filename)) == NULL)
1449 return (l_uint8 *)ERROR_PTR_1("file stream not opened",
1450 filename, __func__, NULL);
1451 data = l_binaryReadSelectStream(fp, start, nbytes, pnread);
1452 fclose(fp);
1453 return data;
1454 }
1455
1456
1457 /*!
1458 * \brief l_binaryReadSelectStream()
1459 *
1460 * \param[in] fp file stream
1461 * \param[in] start first byte to read
1462 * \param[in] nbytes number of bytes to read; use 0 to read to end of file
1463 * \param[out] pnread number of bytes actually read
1464 * \return null-terminated array, or NULL on error; reading 0 bytes
1465 * is not an error
1466 *
1467 * <pre>
1468 * Notes:
1469 * (1) The returned array is terminated with a null byte so that it can
1470 * be used to read ascii data from a file into a proper C string.
1471 * If the file to be read is empty and %start == 0, an array
1472 * with a single null byte is returned.
1473 * (2) Side effect: the stream pointer is re-positioned to the
1474 * beginning of the file.
1475 * </pre>
1476 */
1477 l_uint8 *
1478 l_binaryReadSelectStream(FILE *fp,
1479 size_t start,
1480 size_t nbytes,
1481 size_t *pnread)
1482 {
1483 l_uint8 *data;
1484 size_t bytesleft, bytestoread, nread, filebytes;
1485
1486 if (!pnread)
1487 return (l_uint8 *)ERROR_PTR("&nread not defined", __func__, NULL);
1488 *pnread = 0;
1489 if (!fp)
1490 return (l_uint8 *)ERROR_PTR("stream not defined", __func__, NULL);
1491
1492 /* Verify and adjust the parameters if necessary */
1493 fseek(fp, 0, SEEK_END); /* EOF */
1494 filebytes = ftell(fp);
1495 fseek(fp, 0, SEEK_SET);
1496 if (start > filebytes) {
1497 L_ERROR("start = %zu but filebytes = %zu\n", __func__,
1498 start, filebytes);
1499 return NULL;
1500 }
1501 if (filebytes == 0) /* start == 0; nothing to read; return null byte */
1502 return (l_uint8 *)LEPT_CALLOC(1, 1);
1503 bytesleft = filebytes - start; /* greater than 0 */
1504 if (nbytes == 0) nbytes = bytesleft;
1505 bytestoread = (bytesleft >= nbytes) ? nbytes : bytesleft;
1506
1507 /* Read the data */
1508 if ((data = (l_uint8 *)LEPT_CALLOC(1, bytestoread + 1)) == NULL)
1509 return (l_uint8 *)ERROR_PTR("calloc fail for data", __func__, NULL);
1510 fseek(fp, start, SEEK_SET);
1511 nread = fread(data, 1, bytestoread, fp);
1512 if (nbytes != nread)
1513 L_INFO("%zu bytes requested; %zu bytes read\n", __func__,
1514 nbytes, nread);
1515 *pnread = nread;
1516 fseek(fp, 0, SEEK_SET);
1517 return data;
1518 }
1519
1520
1521 /*!
1522 * \brief l_binaryWrite()
1523 *
1524 * \param[in] filename output file
1525 * \param[in] operation "w" for write; "a" for append
1526 * \param[in] data binary data to be written
1527 * \param[in] nbytes size of data array
1528 * \return 0 if OK; 1 on error
1529 */
1530 l_ok
1531 l_binaryWrite(const char *filename,
1532 const char *operation,
1533 const void *data,
1534 size_t nbytes)
1535 {
1536 char actualOperation[20];
1537 FILE *fp;
1538
1539 if (!filename)
1540 return ERROR_INT("filename not defined", __func__, 1);
1541 if (!operation)
1542 return ERROR_INT("operation not defined", __func__, 1);
1543 if (!data)
1544 return ERROR_INT("data not defined", __func__, 1);
1545 if (nbytes <= 0)
1546 return ERROR_INT("nbytes must be > 0", __func__, 1);
1547
1548 if (strcmp(operation, "w") && strcmp(operation, "a"))
1549 return ERROR_INT("operation not one of {'w','a'}", __func__, 1);
1550
1551 /* The 'b' flag to fopen() is ignored for all POSIX
1552 * conforming systems. However, Windows needs the 'b' flag. */
1553 stringCopy(actualOperation, operation, 2);
1554 stringCat(actualOperation, 20, "b");
1555
1556 if ((fp = fopenWriteStream(filename, actualOperation)) == NULL)
1557 return ERROR_INT_1("stream not opened", filename, __func__, 1);
1558 fwrite(data, 1, nbytes, fp);
1559 fclose(fp);
1560 return 0;
1561 }
1562
1563
/*!
 * \brief   nbytesInFile()
 *
 * \param[in]    filename   file to measure
 * \return  nbytes in file; 0 on error
 *
 * <pre>
 * Notes:
 *      (1) The file is opened with fopenReadStream() and measured
 *          with fnbytesInFile().
 * </pre>
 */
size_t
nbytesInFile(const char  *filename)
{
    size_t  count;
    FILE   *fp;

    if (!filename)
        return ERROR_INT("filename not defined", __func__, 0);

    fp = fopenReadStream(filename);
    if (fp == NULL)
        return ERROR_INT_1("stream not opened", filename, __func__, 0);

    count = fnbytesInFile(fp);
    fclose(fp);
    return count;
}
1584
1585
1586 /*!
1587 * \brief fnbytesInFile()
1588 *
1589 * \param[in] fp file stream
1590 * \return nbytes in file; 0 on error
1591 */
1592 size_t
1593 fnbytesInFile(FILE *fp)
1594 {
1595 l_int64 pos, nbytes;
1596
1597 if (!fp)
1598 return ERROR_INT("stream not open", __func__, 0);
1599
1600 pos = ftell(fp); /* initial position */
1601 if (pos < 0)
1602 return ERROR_INT("seek position must be > 0", __func__, 0);
1603 fseek(fp, 0, SEEK_END); /* EOF */
1604 nbytes = ftell(fp);
1605 if (nbytes < 0)
1606 return ERROR_INT("nbytes is < 0", __func__, 0);
1607 fseek(fp, pos, SEEK_SET); /* back to initial position */
1608 return nbytes;
1609 }
1610
1611
1612 /*--------------------------------------------------------------------*
1613 * Copy and compare in memory *
1614 *--------------------------------------------------------------------*/
1615 /*!
1616 * \brief l_binaryCopy()
1617 *
1618 * \param[in] datas
1619 * \param[in] size of data array
1620 * \return datad on heap, or NULL on error
1621 *
1622 * <pre>
1623 * Notes:
1624 * (1) We add 4 bytes to the zeroed output because in some cases
1625 * (e.g., string handling) it is important to have the data
1626 * be null terminated. This guarantees that after the memcpy,
1627 * the result is automatically null terminated.
1628 * </pre>
1629 */
1630 l_uint8 *
1631 l_binaryCopy(const l_uint8 *datas,
1632 size_t size)
1633 {
1634 l_uint8 *datad;
1635
1636 if (!datas)
1637 return (l_uint8 *)ERROR_PTR("datas not defined", __func__, NULL);
1638
1639 if ((datad = (l_uint8 *)LEPT_CALLOC(size + 4, sizeof(l_uint8))) == NULL)
1640 return (l_uint8 *)ERROR_PTR("datad not made", __func__, NULL);
1641 memcpy(datad, datas, size);
1642 return datad;
1643 }
1644
1645
1646 /*!
1647 * \brief l_binaryCompare()
1648 *
1649 * \param[in] data1
1650 * \param[in] size1 of data1
1651 * \param[in] data2
1652 * \param[in] size2 of data1
1653 * \param[out] psame (1 if the same, 0 if different)
1654 * \return 0 if OK, 1 on error
1655 *
1656 * <pre>
1657 * Notes:
1658 * (1) This can also be used to compare C strings str1 and str2.
1659 * If the string lengths are not known, use strlen():
1660 * l_binaryCompare((l_uint8 *)str1, strlen(str1),
1661 (l_uint8 *)str2, strlen(str2));
1662 * </pre>
1663 */
1664 l_ok
1665 l_binaryCompare(const l_uint8 *data1,
1666 size_t size1,
1667 const l_uint8 *data2,
1668 size_t size2,
1669 l_int32 *psame)
1670 {
1671 l_int32 i;
1672
1673 if (!psame)
1674 return ERROR_INT("&same not defined", __func__, 1);
1675 *psame = FALSE;
1676 if (!data1 || !data2)
1677 return ERROR_INT("data1 and data2 not both defined", __func__, 1);
1678 if (size1 != size2) return 0;
1679 for (i = 0; i < size1; i++) {
1680 if (data1[i] != data2[i])
1681 return 0;
1682 }
1683 *psame = TRUE;
1684 return 0;
1685 }
1686
1687
1688 /*--------------------------------------------------------------------*
1689 * File copy operations *
1690 *--------------------------------------------------------------------*/
1691 /*!
1692 * \brief fileCopy()
1693 *
1694 * \param[in] srcfile copy from this file
1695 * \param[in] newfile copy to this file
1696 * \return 0 if OK, 1 on error
1697 */
1698 l_ok
1699 fileCopy(const char *srcfile,
1700 const char *newfile)
1701 {
1702 l_int32 ret;
1703 size_t nbytes;
1704 l_uint8 *data;
1705
1706 if (!srcfile)
1707 return ERROR_INT("srcfile not defined", __func__, 1);
1708 if (!newfile)
1709 return ERROR_INT("newfile not defined", __func__, 1);
1710
1711 if ((data = l_binaryRead(srcfile, &nbytes)) == NULL)
1712 return ERROR_INT("data not returned", __func__, 1);
1713 ret = l_binaryWrite(newfile, "w", data, nbytes);
1714 LEPT_FREE(data);
1715 return ret;
1716 }
1717
1718
1719 /*!
1720 * \brief fileConcatenate()
1721 *
1722 * \param[in] srcfile append data from this file
1723 * \param[in] destfile add data to this file
1724 * \return 0 if OK, 1 on error
1725 */
1726 l_ok
1727 fileConcatenate(const char *srcfile,
1728 const char *destfile)
1729 {
1730 size_t nbytes;
1731 l_uint8 *data;
1732
1733 if (!srcfile)
1734 return ERROR_INT("srcfile not defined", __func__, 1);
1735 if (!destfile)
1736 return ERROR_INT("destfile not defined", __func__, 1);
1737
1738 data = l_binaryRead(srcfile, &nbytes);
1739 l_binaryWrite(destfile, "a", data, nbytes);
1740 LEPT_FREE(data);
1741 return 0;
1742 }
1743
1744
1745 /*!
1746 * \brief fileAppendString()
1747 *
1748 * \param[in] filename
1749 * \param[in] str string to append to file
1750 * \return 0 if OK, 1 on error
1751 */
1752 l_ok
1753 fileAppendString(const char *filename,
1754 const char *str)
1755 {
1756 FILE *fp;
1757
1758 if (!filename)
1759 return ERROR_INT("filename not defined", __func__, 1);
1760 if (!str)
1761 return ERROR_INT("str not defined", __func__, 1);
1762
1763 if ((fp = fopenWriteStream(filename, "a")) == NULL)
1764 return ERROR_INT_1("stream not opened", filename, __func__, 1);
1765 fprintf(fp, "%s", str);
1766 fclose(fp);
1767 return 0;
1768 }
1769
1770
1771 /*--------------------------------------------------------------------*
1772 * File split operations *
1773 *--------------------------------------------------------------------*/
1774 /*!
1775 * \brief fileSplitLinesUniform()
1776 *
1777 * \param[in] filename input file
1778 * \param[in] n number of output files (>= 1)
1779 * \param[in] save_empty 1 to save empty lines; 0 to remove them
1780 * \param[in] rootpath root pathname of output files
1781 * \param[in] ext output extension, including the '.'; can be NULL
1782 * \return 0 if OK, 1 on error
1783 *
1784 * <pre>
1785 * Notes:
1786 * (1) This splits an input text file into %n files with roughly
1787 * equal numbers of text lines in each file.
1788 * (2) if %save_empty == 1, empty lines are included, and concatention
1789 * of the text in the split files will be identical to the original.
1790 * (3) The output filenames are in the form:
1791 * <rootpath>_N.<ext>, N = 1, ... n
1792 * (4) This handles the temp directory pathname conversion where needed:
1793 * /tmp ==> [OS specific temp directory]
1794 * (5) Files can also be sharded into sets of lines by the program 'split':
1795 * split -n l/<n> <filename>
1796 * Using 'split', the resulting files have approximately equal
1797 * numbers of bytes, rather than equal numbers of lines.
1798 * </pre>
1799 */
1800 l_ok
1801 fileSplitLinesUniform(const char *filename,
1802 l_int32 n,
1803 l_int32 save_empty,
1804 const char *rootpath,
1805 const char *ext)
1806 {
1807 l_int32 i, totlines, nlines, index;
1808 size_t nbytes;
1809 l_uint8 *data;
1810 char *str;
1811 char outname[512];
1812 NUMA *na;
1813 SARRAY *sa;
1814
1815 if (!filename)
1816 return ERROR_INT("filename not defined", __func__, 1);
1817 if (!rootpath)
1818 return ERROR_INT("rootpath not defined", __func__, 1);
1819 if (n <= 0)
1820 return ERROR_INT("n must be > 0", __func__, 1);
1821 if (save_empty != 0 && save_empty != 1)
1822 return ERROR_INT("save_empty not 0 or 1", __func__, 1);
1823
1824 /* Make sarray of lines; the newlines are stripped off */
1825 if ((data = l_binaryRead(filename, &nbytes)) == NULL)
1826 return ERROR_INT("data not read", __func__, 1);
1827 sa = sarrayCreateLinesFromString((const char *)data, save_empty);
1828 LEPT_FREE(data);
1829 if (!sa)
1830 return ERROR_INT("sa not made", __func__, 1);
1831 totlines = sarrayGetCount(sa);
1832 if (n > totlines) {
1833 sarrayDestroy(&sa);
1834 L_ERROR("num files = %d > num lines = %d\n", __func__, n, totlines);
1835 return 1;
1836 }
1837
1838 /* Write n sets of lines to n files, adding the newlines back */
1839 na = numaGetUniformBinSizes(totlines, n);
1840 index = 0;
1841 for (i = 0; i < n; i++) {
1842 if (ext == NULL)
1843 snprintf(outname, sizeof(outname), "%s_%d", rootpath, i);
1844 else
1845 snprintf(outname, sizeof(outname), "%s_%d%s", rootpath, i, ext);
1846 numaGetIValue(na, i, &nlines);
1847 str = sarrayToStringRange(sa, index, nlines, 1); /* add newlines */
1848 l_binaryWrite(outname, "w", str, strlen(str));
1849 LEPT_FREE(str);
1850 index += nlines;
1851 }
1852 numaDestroy(&na);
1853 sarrayDestroy(&sa);
1854 return 0;
1855 }
1856
1857
1858 /*--------------------------------------------------------------------*
1859 * Multi-platform functions for opening file streams *
1860 *--------------------------------------------------------------------*/
/*!
 * \brief   fopenReadStream()
 *
 * \param[in]    filename   file to open for reading
 * \return  stream, or NULL on error
 *
 * <pre>
 * Notes:
 *      (1) This should be used whenever you want to run fopen() to
 *          read from a stream.  Never call fopen() directly.
 *      (2) This handles the temp directory pathname conversion where needed:
 *              /tmp  ==>  [OS specific temp directory]
 *      (3) The stream is opened in mode "rb".  If the pathname made
 *          from %filename cannot be opened, a second attempt is made
 *          with just the tail of %filename (the directory stripped
 *          off), i.e., relative to the current directory.
 * </pre>
 */
FILE *
fopenReadStream(const char  *filename)
{
    char  *pathname, *tail;
    FILE  *fp;

    if (!filename)
        return (FILE *)ERROR_PTR("filename not defined", __func__, NULL);

        /* First attempt: the pathname generated from the input */
    pathname = genPathname(filename, NULL);
    fp = fopen(pathname, "rb");
    LEPT_FREE(pathname);
    if (fp != NULL)
        return fp;

        /* Second attempt: strip the directory and try locally */
    splitPathAtDirectory(filename, NULL, &tail);
    if (tail == NULL)
        return (FILE*)ERROR_PTR_1("tail not found", filename, __func__, NULL);
    if ((fp = fopen(tail, "rb")) == NULL)
        L_ERROR("failed to open locally with tail %s for filename %s\n",
                __func__, tail, filename);
    LEPT_FREE(tail);
    return fp;
}
1901
1902
/*!
 * \brief   fopenWriteStream()
 *
 * \param[in]    filename     file to open for writing or appending
 * \param[in]    modestring   mode passed unchanged to fopen()
 * \return  stream, or NULL on error
 *
 * <pre>
 * Notes:
 *      (1) This should be used whenever you want to run fopen() to
 *          write or append to a stream.  Never call fopen() directly.
 *      (2) This handles the temp directory pathname conversion where needed:
 *              /tmp  ==>  [OS specific temp directory]
 * </pre>
 */
FILE *
fopenWriteStream(const char  *filename,
                 const char  *modestring)
{
    char  *pathname;
    FILE  *fp;

    if (!filename)
        return (FILE *)ERROR_PTR("filename not defined", __func__, NULL);

        /* Open the pathname generated from the input; the error
         * message names that pathname, so report before freeing it */
    pathname = genPathname(filename, NULL);
    if ((fp = fopen(pathname, modestring)) == NULL)
        fp = (FILE *)ERROR_PTR_1("stream not opened", pathname,
                                 __func__, NULL);
    LEPT_FREE(pathname);
    return fp;
}
1935
1936
1937 /*!
1938 * \brief fopenReadFromMemory()
1939 *
1940 * \param[in] data, size
1941 * \return file stream, or NULL on error
1942 *
1943 * <pre>
1944 * Notes:
1945 * (1) Work-around if fmemopen() not available.
1946 * (2) Windows tmpfile() writes into the root C:\ directory, which
1947 * requires admin privileges. This also works around that.
1948 * </pre>
1949 */
1950 FILE *
1951 fopenReadFromMemory(const l_uint8 *data,
1952 size_t size)
1953 {
1954 FILE *fp;
1955
1956 if (!data)
1957 return (FILE *)ERROR_PTR("data not defined", __func__, NULL);
1958
1959 #if HAVE_FMEMOPEN
1960 if ((fp = fmemopen((void *)data, size, "rb")) == NULL)
1961 return (FILE *)ERROR_PTR("stream not opened", __func__, NULL);
1962 #else /* write to tmp file */
1963 L_INFO("no fmemopen API --> work-around: write to temp file\n", __func__);
1964 #ifdef _WIN32
1965 if ((fp = fopenWriteWinTempfile()) == NULL)
1966 return (FILE *)ERROR_PTR("tmpfile stream not opened", __func__, NULL);
1967 #else
1968 if ((fp = tmpfile()) == NULL)
1969 return (FILE *)ERROR_PTR("tmpfile stream not opened", __func__, NULL);
1970 #endif /* _WIN32 */
1971 fwrite(data, 1, size, fp);
1972 rewind(fp);
1973 #endif /* HAVE_FMEMOPEN */
1974
1975 return fp;
1976 }
1977
1978
1979 /*--------------------------------------------------------------------*
1980 * Opening a Windows tmpfile for writing *
1981 *--------------------------------------------------------------------*/
/*!
 * \brief   fopenWriteWinTempfile()
 *
 * \return  file stream, or NULL on error
 *
 * <pre>
 * Notes:
 *      (1) The Windows version of tmpfile() writes into the root
 *          C:\ directory, which requires admin privileges.  This
 *          function provides an alternative implementation.
 *      (2) The file is opened for reading and writing in binary mode
 *          with the _O_TEMPORARY flag.
 *      (3) On platforms other than Windows this always returns NULL.
 * </pre>
 */
FILE *
fopenWriteWinTempfile(void)
{
#ifdef _WIN32
    l_int32  handle;
    FILE    *fp;
    char    *filename;

    if ((filename = l_makeTempFilename()) == NULL) {
        L_ERROR("l_makeTempFilename failed, %s\n", __func__, strerror(errno));
        return NULL;
    }

    handle = _open(filename, _O_CREAT | _O_RDWR | _O_SHORT_LIVED |
                   _O_TEMPORARY | _O_BINARY, _S_IREAD | _S_IWRITE);
    lept_free(filename);
    if (handle == -1) {
        L_ERROR("_open failed, %s\n", __func__, strerror(errno));
        return NULL;
    }

    if ((fp = _fdopen(handle, "r+b")) == NULL) {
            /* Report first, so errno still describes the _fdopen()
             * failure; then close the descriptor so it is not leaked */
        L_ERROR("_fdopen failed, %s\n", __func__, strerror(errno));
        _close(handle);
        return NULL;
    }

    return fp;
#else
    return NULL;
#endif  /*  _WIN32 */
}
2025
2026
2027 /*--------------------------------------------------------------------*
2028 * Multi-platform functions that avoid C-runtime boundary *
2029 * crossing for applications with Windows DLLs *
2030 *--------------------------------------------------------------------*/
2031 /*
2032 * Problems arise when pointers to streams and data are passed
2033 * between two Windows DLLs that have been generated with different
2034 * C runtimes. To avoid this, leptonica provides wrappers for
2035 * several C library calls.
2036 */
2037 /*!
2038 * \brief lept_fopen()
2039 *
2040 * \param[in] filename
2041 * \param[in] mode same as for fopen(); e.g., "rb"
2042 * \return stream or NULL on error
2043 *
2044 * <pre>
2045 * Notes:
2046 * (1) This must be used by any application that passes
2047 * a file handle to a leptonica Windows DLL.
2048 * </pre>
2049 */
2050 FILE *
2051 lept_fopen(const char *filename,
2052 const char *mode)
2053 {
2054 if (!filename)
2055 return (FILE *)ERROR_PTR("filename not defined", __func__, NULL);
2056 if (!mode)
2057 return (FILE *)ERROR_PTR("mode not defined", __func__, NULL);
2058
2059 if (stringFindSubstr(mode, "r", NULL))
2060 return fopenReadStream(filename);
2061 else
2062 return fopenWriteStream(filename, mode);
2063 }
2064
2065
2066 /*!
2067 * \brief lept_fclose()
2068 *
2069 * \param[in] fp file stream
2070 * \return 0 if OK, 1 on error
2071 *
2072 * <pre>
2073 * Notes:
2074 * (1) This should be used by any application that accepts
2075 * a file handle generated by a leptonica Windows DLL.
2076 * </pre>
2077 */
2078 l_ok
2079 lept_fclose(FILE *fp)
2080 {
2081 if (!fp)
2082 return ERROR_INT("stream not defined", __func__, 1);
2083
2084 return fclose(fp);
2085 }
2086
2087
/*!
 * \brief   lept_calloc()
 *
 * \param[in]    nmemb    number of members
 * \param[in]    size     size of each member, in bytes
 * \return  void ptr, or NULL on error
 *
 * <pre>
 * Notes:
 *      (1) For safety with Windows DLLs, this can be used in conjunction
 *          with lept_free() to avoid C-runtime boundary problems.
 *          Just use these two functions throughout your application.
 *      (2) A request with zero members or zero-sized members returns
 *          NULL without calling the allocator.
 * </pre>
 */
void *
lept_calloc(size_t  nmemb,
            size_t  size)
{
        /* Both arguments are unsigned, so zero is the only invalid value */
    if (nmemb == 0 || size == 0)
        return NULL;

    return LEPT_CALLOC(nmemb, size);
}
2110
2111
/*!
 * \brief   lept_free()
 *
 * \param[in]    ptr    heap block to release; NULL is ignored
 *
 * <pre>
 * Notes:
 *      (1) This should be used by any application that accepts
 *          heap data allocated by a leptonica Windows DLL.
 * </pre>
 */
void
lept_free(void  *ptr)
{
    if (ptr != NULL) {
        LEPT_FREE(ptr);
    }
}
2129
2130
2131 /*--------------------------------------------------------------------*
2132 * Multi-platform file system operations *
2133 * [ These only write to /tmp or its subdirectories ] *
2134 *--------------------------------------------------------------------*/
2135 /*!
2136 * \brief lept_mkdir()
2137 *
2138 * \param[in] subdir of /tmp or its OS specific equivalent
2139 * \return 0 on success, non-zero on failure
2140 *
2141 * <pre>
2142 * Notes:
2143 * (1) %subdir is a partial path that can consist of one or more
2144 * directories.
2145 * (2) This makes any subdirectories of /tmp that are required.
2146 * (3) The root temp directory is:
2147 * /tmp (unix) [default]
2148 * [Temp] (Windows)
2149 * </pre>
2150 */
2151 l_int32
2152 lept_mkdir(const char *subdir)
2153 {
2154 char *dir, *tmpdir;
2155 l_int32 i, n;
2156 l_int32 ret = 0;
2157 SARRAY *sa;
2158 #ifdef _WIN32
2159 l_uint32 attributes;
2160 #endif /* _WIN32 */
2161
2162 if (!LeptDebugOK) {
2163 L_INFO("making named temp subdirectory %s is disabled\n",
2164 __func__, subdir);
2165 return 0;
2166 }
2167
2168 if (!subdir)
2169 return ERROR_INT("subdir not defined", __func__, 1);
2170 if ((strlen(subdir) == 0) || (subdir[0] == '.') || (subdir[0] == '/'))
2171 return ERROR_INT("subdir not an actual subdirectory", __func__, 1);
2172
2173 sa = sarrayCreate(0);
2174 sarraySplitString(sa, subdir, "/");
2175 n = sarrayGetCount(sa);
2176 dir = genPathname("/tmp", NULL);
2177 /* Make sure the tmp directory exists */
2178 #ifndef _WIN32
2179 ret = mkdir(dir, 0777);
2180 #else
2181 attributes = GetFileAttributesA(dir);
2182 if (attributes == INVALID_FILE_ATTRIBUTES)
2183 ret = (CreateDirectoryA(dir, NULL) ? 0 : 1);
2184 #endif
2185 /* Make all the subdirectories */
2186 for (i = 0; i < n; i++) {
2187 tmpdir = pathJoin(dir, sarrayGetString(sa, i, L_NOCOPY));
2188 #ifndef _WIN32
2189 ret += mkdir(tmpdir, 0777);
2190 #else
2191 if (CreateDirectoryA(tmpdir, NULL) == 0)
2192 ret += (GetLastError() != ERROR_ALREADY_EXISTS);
2193 #endif
2194 LEPT_FREE(dir);
2195 dir = tmpdir;
2196 }
2197 LEPT_FREE(dir);
2198 sarrayDestroy(&sa);
2199 if (ret > 0)
2200 L_ERROR("failure to create %d directories\n", __func__, ret);
2201 return ret;
2202 }
2203
2204
2205 /*!
2206 * \brief lept_rmdir()
2207 *
2208 * \param[in] subdir of /tmp or its OS specific equivalent
2209 * \return 0 on success, non-zero on failure
2210 *
2211 * <pre>
2212 * Notes:
2213 * (1) %subdir is a partial path that can consist of one or more
2214 * directories.
2215 * (2) This removes all files from the specified subdirectory of
2216 * the root temp directory:
2217 * /tmp (unix)
2218 * [Temp] (Windows)
2219 * and then removes the subdirectory.
2220 * (3) The combination
2221 * lept_rmdir(subdir);
2222 * lept_mkdir(subdir);
2223 * is guaranteed to give you an empty subdirectory.
2224 * </pre>
2225 */
2226 l_int32
2227 lept_rmdir(const char *subdir)
2228 {
2229 char *dir, *fname, *fullname;
2230 l_int32 exists, ret, i, nfiles;
2231 SARRAY *sa;
2232 #ifdef _WIN32
2233 char *newpath;
2234 #else
2235 char *realdir;
2236 #endif /* _WIN32 */
2237
2238 if (!subdir)
2239 return ERROR_INT("subdir not defined", __func__, 1);
2240 if ((strlen(subdir) == 0) || (subdir[0] == '.') || (subdir[0] == '/'))
2241 return ERROR_INT("subdir not an actual subdirectory", __func__, 1);
2242
2243 /* Find the temp subdirectory */
2244 dir = pathJoin("/tmp", subdir);
2245 if (!dir)
2246 return ERROR_INT("directory name not made", __func__, 1);
2247 lept_direxists(dir, &exists);
2248 if (!exists) { /* fail silently */
2249 LEPT_FREE(dir);
2250 return 0;
2251 }
2252
2253 /* List all the files in that directory */
2254 if ((sa = getFilenamesInDirectory(dir)) == NULL) {
2255 L_ERROR("directory %s does not exist!\n", __func__, dir);
2256 LEPT_FREE(dir);
2257 return 1;
2258 }
2259 nfiles = sarrayGetCount(sa);
2260
2261 for (i = 0; i < nfiles; i++) {
2262 fname = sarrayGetString(sa, i, L_NOCOPY);
2263 fullname = genPathname(dir, fname);
2264 remove(fullname);
2265 LEPT_FREE(fullname);
2266 }
2267
2268 #ifndef _WIN32
2269 realdir = genPathname("/tmp", subdir);
2270 ret = rmdir(realdir);
2271 LEPT_FREE(realdir);
2272 #else
2273 newpath = genPathname(dir, NULL);
2274 ret = (RemoveDirectoryA(newpath) ? 0 : 1);
2275 LEPT_FREE(newpath);
2276 #endif /* !_WIN32 */
2277
2278 sarrayDestroy(&sa);
2279 LEPT_FREE(dir);
2280 return ret;
2281 }
2282
2283
2284 /*!
2285 * \brief lept_direxists()
2286 *
2287 * \param[in] dir
2288 * \param[out] pexists 1 if it exists; 0 otherwise
2289 * \return void
2290 *
2291 * <pre>
2292 * Notes:
2293 * (1) Always use unix pathname separators.
2294 * (2) By calling genPathname(), if the pathname begins with "/tmp"
2295 * this does an automatic directory translation for operating
2296 * systems that use a different path for /tmp.
2297 * </pre>
2298 */
2299 void
2300 lept_direxists(const char *dir,
2301 l_int32 *pexists)
2302 {
2303 char *realdir;
2304
2305 if (!pexists) return;
2306 *pexists = 0;
2307 if (!dir) return;
2308 if ((realdir = genPathname(dir, NULL)) == NULL)
2309 return;
2310
2311 #ifndef _WIN32
2312 {
2313 struct stat s;
2314 l_int32 err = stat(realdir, &s);
2315 if (err != -1 && S_ISDIR(s.st_mode))
2316 *pexists = 1;
2317 }
2318 #else /* _WIN32 */
2319 {
2320 l_uint32 attributes;
2321 attributes = GetFileAttributesA(realdir);
2322 if (attributes != INVALID_FILE_ATTRIBUTES &&
2323 (attributes & FILE_ATTRIBUTE_DIRECTORY))
2324 *pexists = 1;
2325 }
2326 #endif /* _WIN32 */
2327
2328 LEPT_FREE(realdir);
2329 }
2330
2331
2332 /*!
2333 * \brief lept_rm_match()
2334 *
2335 * \param[in] subdir [optional] if NULL, the removed files are in /tmp
2336 * \param[in] substr [optional] pattern to match in filename
2337 * \return 0 on success, non-zero on failure
2338 *
2339 * <pre>
2340 * Notes:
2341 * (1) This removes the matched files in /tmp or a subdirectory of /tmp.
2342 * Use NULL for %subdir if the files are in /tmp.
2343 * (2) If %substr == NULL, this removes all files in the directory.
2344 * If %substr == "" (empty), this removes no files.
2345 * If both %subdir == NULL and %substr == NULL, this removes
2346 * all files in /tmp.
2347 * (3) Use unix pathname separators.
2348 * (4) By calling genPathname(), if the pathname begins with "/tmp"
2349 * this does an automatic directory translation for operating
2350 * systems that use a different path for /tmp.
2351 * (5) Error conditions:
2352 * * returns -1 if the directory is not found
2353 * * returns the number of files (> 0) that it was unable to remove.
2354 * </pre>
2355 */
2356 l_int32
2357 lept_rm_match(const char *subdir,
2358 const char *substr)
2359 {
2360 char *path, *fname;
2361 char tempdir[256];
2362 l_int32 i, n, ret;
2363 SARRAY *sa;
2364
2365 makeTempDirname(tempdir, sizeof(tempdir), subdir);
2366 if ((sa = getSortedPathnamesInDirectory(tempdir, substr, 0, 0)) == NULL)
2367 return ERROR_INT("sa not made", __func__, -1);
2368 n = sarrayGetCount(sa);
2369 if (n == 0) {
2370 L_WARNING("no matching files found\n", __func__);
2371 sarrayDestroy(&sa);
2372 return 0;
2373 }
2374
2375 ret = 0;
2376 for (i = 0; i < n; i++) {
2377 fname = sarrayGetString(sa, i, L_NOCOPY);
2378 path = genPathname(fname, NULL);
2379 if (lept_rmfile(path) != 0) {
2380 L_ERROR("failed to remove %s\n", __func__, path);
2381 ret++;
2382 }
2383 LEPT_FREE(path);
2384 }
2385 sarrayDestroy(&sa);
2386 return ret;
2387 }
2388
2389
2390 /*!
2391 * \brief lept_rm()
2392 *
2393 * \param[in] subdir [optional] subdir of '/tmp'; can be NULL
2394 * \param[in] tail filename without the directory
2395 * \return 0 on success, non-zero on failure
2396 *
2397 * <pre>
2398 * Notes:
2399 * (1) By calling genPathname(), this does an automatic directory
2400 * translation on operating systems which use a different path.
2401 * </pre>
2402 */
2403 l_int32
2404 lept_rm(const char *subdir,
2405 const char *tail)
2406 {
2407 char *path;
2408 char newtemp[256];
2409 l_int32 ret;
2410
2411 if (!tail || strlen(tail) == 0)
2412 return ERROR_INT("tail undefined or empty", __func__, 1);
2413
2414 if (makeTempDirname(newtemp, sizeof(newtemp), subdir))
2415 return ERROR_INT("temp dirname not made", __func__, 1);
2416 path = genPathname(newtemp, tail);
2417 ret = lept_rmfile(path);
2418 LEPT_FREE(path);
2419 return ret;
2420 }
2421
2422
2423 /*!
2424 * \brief
2425 *
2426 * lept_rmfile()
2427 *
2428 * \param[in] filepath full path to file including the directory
2429 * \return 0 on success, non-zero on failure
2430 *
2431 * <pre>
2432 * Notes:
2433 * (1) This removes the named file.
2434 * (2) Use unix pathname separators.
2435 * (3) There is no name translation.
2436 * (4) Unlike the other lept_* functions in this section, this can remove
2437 * any file -- it is not restricted to files that are in /tmp or a
2438 * subdirectory of it.
2439 * (5) For files in /tmp or a subdirectory of it, this does an automatic
2440 * directory translation for operating systems that use a different
2441 * path for /tmp.
2442 * </pre>
2443 */
2444 l_int32
2445 lept_rmfile(const char *filepath)
2446 {
2447 l_int32 ret;
2448
2449 if (!filepath || strlen(filepath) == 0)
2450 return ERROR_INT("filepath undefined or empty", __func__, 1);
2451
2452 #ifndef _WIN32
2453 ret = remove(filepath);
2454 #else
2455 /* Set attributes to allow deletion of read-only files */
2456 SetFileAttributesA(filepath, FILE_ATTRIBUTE_NORMAL);
2457 ret = DeleteFileA(filepath) ? 0 : 1;
2458 #endif /* !_WIN32 */
2459
2460 return ret;
2461 }
2462
2463
2464 /*!
2465 * \brief lept_mv()
2466 *
2467 * \param[in] srcfile
2468 * \param[in] newdir [optional]; can be NULL
2469 * \param[in] newtail [optional]; can be NULL
2470 * \param[out] pnewpath [optional] of actual path; can be NULL
2471 * \return 0 on success, non-zero on failure
2472 *
2473 * <pre>
2474 * Notes:
2475 * (1) This moves %srcfile to /tmp or to a subdirectory of /tmp.
2476 * (2) %srcfile can either be a full path or relative to the
2477 * current directory.
2478 * (3) %newdir can either specify an existing subdirectory of /tmp
2479 * or can be NULL. In the latter case, the file will be written
2480 * into /tmp.
2481 * (4) %newtail can either specify a filename tail or, if NULL,
2482 * the filename is taken from src-tail, the tail of %srcfile.
2483 * (5) For debugging, the computed newpath can be returned. It must
2484 * be freed by the caller.
2485 * (6) Reminders:
2486 * (a) specify files using unix pathnames
2487 * (b) this does an automatic directory translation on operating
2488 * systems that use a different path for /tmp.
2489 * (7) Examples:
2490 * * newdir = NULL, newtail = NULL ==> /tmp/src-tail
2491 * * newdir = NULL, newtail = abc ==> /tmp/abc
2492 * * newdir = def/ghi, newtail = NULL ==> /tmp/def/ghi/src-tail
2493 * * newdir = def/ghi, newtail = abc ==> /tmp/def/ghi/abc
2494 * </pre>
2495 */
2496 l_int32
2497 lept_mv(const char *srcfile,
2498 const char *newdir,
2499 const char *newtail,
2500 char **pnewpath)
2501 {
2502 char *srcpath, *newpath, *dir, *srctail;
2503 char newtemp[256];
2504 l_int32 ret;
2505
2506 if (!srcfile)
2507 return ERROR_INT("srcfile not defined", __func__, 1);
2508
2509 /* Require output pathname to be in /tmp/ or a subdirectory */
2510 if (makeTempDirname(newtemp, sizeof(newtemp), newdir) == 1)
2511 return ERROR_INT("newdir not NULL or a subdir of /tmp", __func__, 1);
2512
2513 /* Get canonical src pathname */
2514 splitPathAtDirectory(srcfile, &dir, &srctail);
2515
2516 #ifndef _WIN32
2517 srcpath = pathJoin(dir, srctail);
2518 LEPT_FREE(dir);
2519
2520 /* Generate output pathname */
2521 if (!newtail || newtail[0] == '\0')
2522 newpath = pathJoin(newtemp, srctail);
2523 else
2524 newpath = pathJoin(newtemp, newtail);
2525 LEPT_FREE(srctail);
2526
2527 /* Overwrite any existing file at 'newpath' */
2528 ret = fileCopy(srcpath, newpath);
2529 if (!ret) { /* and remove srcfile */
2530 char *realpath = genPathname(srcpath, NULL);
2531 remove(realpath);
2532 LEPT_FREE(realpath);
2533 }
2534 #else
2535 srcpath = genPathname(dir, srctail);
2536 LEPT_FREE(dir);
2537
2538 /* Generate output pathname */
2539 if (!newtail || newtail[0] == '\0')
2540 newpath = genPathname(newtemp, srctail);
2541 else
2542 newpath = genPathname(newtemp, newtail);
2543 LEPT_FREE(srctail);
2544
2545 /* Overwrite any existing file at 'newpath' */
2546 ret = MoveFileExA(srcpath, newpath,
2547 MOVEFILE_COPY_ALLOWED | MOVEFILE_REPLACE_EXISTING) ? 0 : 1;
2548 #endif /* ! _WIN32 */
2549
2550 LEPT_FREE(srcpath);
2551 if (pnewpath)
2552 *pnewpath = newpath;
2553 else
2554 LEPT_FREE(newpath);
2555 return ret;
2556 }
2557
2558
2559 /*!
2560 * \brief lept_cp()
2561 *
2562 * \param[in] srcfile
2563 * \param[in] newdir [optional]; can be NULL
2564 * \param[in] newtail [optional]; can be NULL
2565 * \param[out] pnewpath [optional] of actual path; can be NULL
2566 * \return 0 on success, non-zero on failure
2567 *
2568 * <pre>
2569 * Notes:
2570 * (1) This copies %srcfile to /tmp or to a subdirectory of /tmp.
2571 * (2) %srcfile can either be a full path or relative to the
2572 * current directory.
2573 * (3) %newdir can either specify an existing subdirectory of /tmp,
2574 * or can be NULL. In the latter case, the file will be written
2575 * into /tmp.
2576 * (4) %newtail can either specify a filename tail or, if NULL,
2577 * the filename is taken from src-tail, the tail of %srcfile.
2578 * (5) For debugging, the computed newpath can be returned. It must
2579 * be freed by the caller.
2580 * (6) Reminders:
2581 * (a) specify files using unix pathnames
2582 * (b) this does an automatic directory translation for operating
2583 * systems that use a different path for /tmp
2584 * (7) Examples:
2585 * * newdir = NULL, newtail = NULL ==> /tmp/src-tail
2586 * * newdir = NULL, newtail = abc ==> /tmp/abc
2587 * * newdir = def/ghi, newtail = NULL ==> /tmp/def/ghi/src-tail
2588 * * newdir = def/ghi, newtail = abc ==> /tmp/def/ghi/abc
2589 *
2590 * </pre>
2591 */
2592 l_int32
2593 lept_cp(const char *srcfile,
2594 const char *newdir,
2595 const char *newtail,
2596 char **pnewpath)
2597 {
2598 char *srcpath, *newpath, *dir, *srctail;
2599 char newtemp[256];
2600 l_int32 ret;
2601
2602 if (!srcfile)
2603 return ERROR_INT("srcfile not defined", __func__, 1);
2604
2605 /* Require output pathname to be in /tmp or a subdirectory */
2606 if (makeTempDirname(newtemp, sizeof(newtemp), newdir) == 1)
2607 return ERROR_INT("newdir not NULL or a subdir of /tmp", __func__, 1);
2608
2609 /* Get canonical src pathname */
2610 splitPathAtDirectory(srcfile, &dir, &srctail);
2611
2612 #ifndef _WIN32
2613 srcpath = pathJoin(dir, srctail);
2614 LEPT_FREE(dir);
2615
2616 /* Generate output pathname */
2617 if (!newtail || newtail[0] == '\0')
2618 newpath = pathJoin(newtemp, srctail);
2619 else
2620 newpath = pathJoin(newtemp, newtail);
2621 LEPT_FREE(srctail);
2622
2623 /* Overwrite any existing file at 'newpath' */
2624 ret = fileCopy(srcpath, newpath);
2625 #else
2626 srcpath = genPathname(dir, srctail);
2627 LEPT_FREE(dir);
2628
2629 /* Generate output pathname */
2630 if (!newtail || newtail[0] == '\0')
2631 newpath = genPathname(newtemp, srctail);
2632 else
2633 newpath = genPathname(newtemp, newtail);
2634 LEPT_FREE(srctail);
2635
2636 /* Overwrite any existing file at 'newpath' */
2637 ret = CopyFileA(srcpath, newpath, FALSE) ? 0 : 1;
2638 #endif /* !_WIN32 */
2639
2640 LEPT_FREE(srcpath);
2641 if (pnewpath)
2642 *pnewpath = newpath;
2643 else
2644 LEPT_FREE(newpath);
2645 return ret;
2646 }
2647
2648
2649 /*--------------------------------------------------------------------*
2650 * Special debug/test function for calling 'system' *
2651 *--------------------------------------------------------------------*/
2652 #if defined(__APPLE__)
2653 #include "TargetConditionals.h"
2654 #endif /* __APPLE__ */
2655
2656 /*!
2657 * \brief callSystemDebug()
2658 *
2659 * \param[in] cmd command to be exec'd
2660 * \return 0 on success
2661 *
2662 * <pre>
2663 * Notes:
2664 * (1) The C library 'system' call is only made through this function.
2665 * It only works in debug/test mode, where the global variable
2666 * LeptDebugOK == TRUE. This variable is set to FALSE in the
2667 * library as distributed, and calling this function will
2668 * generate an error message.
2669 * </pre>
2670 */
2671 l_int32
2672 callSystemDebug(const char *cmd)
2673 {
2674 l_int32 ret;
2675
2676 if (!cmd) {
2677 L_ERROR("cmd not defined\n", __func__);
2678 return 1;
2679 }
2680 if (LeptDebugOK == FALSE) {
2681 L_INFO("'system' calls are disabled\n", __func__);
2682 return 1;
2683 }
2684
2685 #if defined(__APPLE__) /* iOS 11 does not support system() */
2686
2687 #if (defined(TARGET_OS_OSX) && TARGET_OS_OSX == 1) /* Mac OS X */
2688 ret = system(cmd);
2689 #elif TARGET_OS_IPHONE || defined(OS_IOS) /* iOS */
2690 L_ERROR("iOS 11 does not support system()\n", __func__);
2691 #endif /* TARGET_OS_OSX */
2692
2693 #else /* ! __APPLE__ */
2694
2695 ret = system(cmd);
2696
2697 #endif /* __APPLE__ */
2698
2699 return ret;
2700 }
2701
2702
2703 /*--------------------------------------------------------------------*
2704 * General file name operations *
2705 *--------------------------------------------------------------------*/
2706 /*!
2707 * \brief splitPathAtDirectory()
2708 *
2709 * \param[in] pathname full path; can be a directory
2710 * \param[out] pdir [optional] root directory name of
2711 * input path, including trailing '/'
2712 * \param[out] ptail [optional] path tail, which is either
2713 * the file name within the root directory or
2714 * the last sub-directory in the path
2715 * \return 0 if OK, 1 on error
2716 *
2717 * <pre>
2718 * Notes:
2719 * (1) If you only want the tail, input null for the root directory ptr.
2720 * (2) If you only want the root directory name, input null for the
2721 * tail ptr.
2722 * (3) This function makes decisions based only on the lexical
2723 * structure of the input. Examples:
2724 * /usr/tmp/abc.d --> dir: /usr/tmp/ tail: abc.d
2725 * /usr/tmp/ --> dir: /usr/tmp/ tail: [empty string]
2726 * /usr/tmp --> dir: /usr/ tail: tmp
2727 * abc.d --> dir: [empty string] tail: abc.d
2728 * (4 Consider the first example above: /usr/tmp/abc.d.
2729 * Suppose you want the stem of the file, abc, without either
2730 * the directory or the extension. This can be extracted in two steps:
2731 * splitPathAtDirectory("usr/tmp/abc.d", NULL, &tail);
2732 * [sets tail: "abc.d"]
2733 * splitPathAtExtension(tail, &basename, NULL);
2734 * [sets basename: "abc"]
2735 * (5) The input can have either forward (unix) or backward (win)
2736 * slash separators. The output has unix separators.
2737 * Note that Win32 pathname functions generally accept both
2738 * slash forms, but the Windows command line interpreter
2739 * only accepts backward slashes, because forward slashes are
2740 * used to demarcate switches (vs. dashes in unix).
2741 * </pre>
2742 */
2743 l_ok
2744 splitPathAtDirectory(const char *pathname,
2745 char **pdir,
2746 char **ptail)
2747 {
2748 char *cpathname, *lastslash;
2749
2750 if (!pdir && !ptail)
2751 return ERROR_INT("null input for both strings", __func__, 1);
2752 if (pdir) *pdir = NULL;
2753 if (ptail) *ptail = NULL;
2754 if (!pathname)
2755 return ERROR_INT("pathname not defined", __func__, 1);
2756
2757 cpathname = stringNew(pathname);
2758 convertSepCharsInPath(cpathname, UNIX_PATH_SEPCHAR);
2759 lastslash = strrchr(cpathname, '/');
2760 if (lastslash) {
2761 if (ptail)
2762 *ptail = stringNew(lastslash + 1);
2763 if (pdir) {
2764 *(lastslash + 1) = '\0';
2765 *pdir = cpathname;
2766 } else {
2767 LEPT_FREE(cpathname);
2768 }
2769 } else { /* no directory */
2770 if (pdir)
2771 *pdir = stringNew("");
2772 if (ptail)
2773 *ptail = cpathname;
2774 else
2775 LEPT_FREE(cpathname);
2776 }
2777
2778 return 0;
2779 }
2780
2781
2782 /*!
2783 * \brief splitPathAtExtension()
2784 *
2785 * \param[in] pathname full path; can be a directory
2786 * \param[out] pbasename [optional] pathname not including the
2787 * last dot and characters after that
2788 * \param[out] pextension [optional] path extension, which is
2789 * the last dot and the characters after it. If
2790 * there is no extension, it returns the empty string
2791 * \return 0 if OK, 1 on error
2792 *
2793 * <pre>
2794 * Notes:
2795 * (1) If you only want the extension, input null for the basename ptr.
2796 * (2) If you only want the basename without extension, input null
2797 * for the extension ptr.
2798 * (3) This function makes decisions based only on the lexical
2799 * structure of the input. Examples:
2800 * /usr/tmp/abc.jpg --> basename: /usr/tmp/abc ext: .jpg
2801 * /usr/tmp/.jpg --> basename: /usr/tmp/ ext: .jpg
2802 * /usr/tmp.jpg/ --> basename: /usr/tmp.jpg/ ext: [empty str]
2803 * ./.jpg --> basename: ./ ext: .jpg
2804 * (4) The input can have either forward (unix) or backward (win)
2805 * slash separators. The output has unix separators.
2806 * (5) Note that basename, as used here, is different from the result
2807 * of the unix program 'basename'. Here, basename is the entire
2808 * pathname up to a final extension and its preceding dot.
2809 * </pre>
2810 */
2811 l_ok
2812 splitPathAtExtension(const char *pathname,
2813 char **pbasename,
2814 char **pextension)
2815 {
2816 char *tail, *dir, *lastdot;
2817 char empty[4] = "";
2818
2819 if (!pbasename && !pextension)
2820 return ERROR_INT("null input for both strings", __func__, 1);
2821 if (pbasename) *pbasename = NULL;
2822 if (pextension) *pextension = NULL;
2823 if (!pathname)
2824 return ERROR_INT("pathname not defined", __func__, 1);
2825
2826 /* Split out the directory first */
2827 splitPathAtDirectory(pathname, &dir, &tail);
2828
2829 /* Then look for a "." in the tail part.
2830 * This way we ignore all "." in the directory. */
2831 if ((lastdot = strrchr(tail, '.'))) {
2832 if (pextension)
2833 *pextension = stringNew(lastdot);
2834 if (pbasename) {
2835 *lastdot = '\0';
2836 *pbasename = stringJoin(dir, tail);
2837 }
2838 } else {
2839 if (pextension)
2840 *pextension = stringNew(empty);
2841 if (pbasename)
2842 *pbasename = stringNew(pathname);
2843 }
2844 LEPT_FREE(dir);
2845 LEPT_FREE(tail);
2846 return 0;
2847 }
2848
2849
2850 /*!
2851 * \brief pathJoin()
2852 *
2853 * \param[in] dir [optional] can be null
2854 * \param[in] fname [optional] can be null
2855 * \return specially concatenated path, or NULL on error
2856 *
2857 * <pre>
2858 * Notes:
2859 * (1) Use unix-style pathname separators ('/').
2860 * (2) %fname can be the entire path, or part of the path containing
2861 * at least one directory, or a tail without a directory, or NULL.
2862 * (3) It produces a path that strips multiple slashes to a single
2863 * slash, joins %dir and %fname by a slash, and has no trailing
2864 * slashes (except in the cases where %dir == "/" and
2865 * %fname == NULL, or v.v.).
2866 * (4) If both %dir and %fname are null, produces an empty string.
2867 * (5) Neither %dir nor %fname can begin with '..'.
2868 * (6) The result is not canonicalized or tested for correctness:
2869 * garbage in (e.g., /&%), garbage out.
2870 * (7) Examples:
2871 * //tmp// + //abc/ --> /tmp/abc
2872 * tmp/ + /abc/ --> tmp/abc
2873 * tmp/ + abc/ --> tmp/abc
2874 * /tmp/ + /// --> /tmp
2875 * /tmp/ + NULL --> /tmp
2876 * // + /abc// --> /abc
2877 * // + NULL --> /
2878 * NULL + /abc/def/ --> /abc/def
2879 * NULL + abc// --> abc
2880 * NULL + // --> /
2881 * NULL + NULL --> (empty string)
2882 * "" + "" --> (empty string)
2883 * "" + / --> /
2884 * ".." + /etc/foo --> NULL
2885 * /tmp + ".." --> NULL
2886 * </pre>
2887 */
2888 char *
2889 pathJoin(const char *dir,
2890 const char *fname)
2891 {
2892 const char *slash = "/";
2893 char *str, *dest;
2894 l_int32 i, n1, n2, emptydir;
2895 size_t size;
2896 SARRAY *sa1, *sa2;
2897 L_BYTEA *ba;
2898
2899 if (!dir && !fname)
2900 return stringNew("");
2901 if (dir && strlen(dir) >= 2 && dir[0] == '.' && dir[1] == '.')
2902 return (char *)ERROR_PTR("dir starts with '..'", __func__, NULL);
2903 if (fname && strlen(fname) >= 2 && fname[0] == '.' && fname[1] == '.')
2904 return (char *)ERROR_PTR("fname starts with '..'", __func__, NULL);
2905
2906 sa1 = sarrayCreate(0);
2907 sa2 = sarrayCreate(0);
2908 ba = l_byteaCreate(4);
2909
2910 /* Process %dir */
2911 if (dir && strlen(dir) > 0) {
2912 if (dir[0] == '/')
2913 l_byteaAppendString(ba, slash);
2914 sarraySplitString(sa1, dir, "/"); /* removes all slashes */
2915 n1 = sarrayGetCount(sa1);
2916 for (i = 0; i < n1; i++) {
2917 str = sarrayGetString(sa1, i, L_NOCOPY);
2918 l_byteaAppendString(ba, str);
2919 l_byteaAppendString(ba, slash);
2920 }
2921 }
2922
2923 /* Special case to add leading slash: dir NULL or empty string */
2924 emptydir = dir && strlen(dir) == 0;
2925 if ((!dir || emptydir) && fname && strlen(fname) > 0 && fname[0] == '/')
2926 l_byteaAppendString(ba, slash);
2927
2928 /* Process %fname */
2929 if (fname && strlen(fname) > 0) {
2930 sarraySplitString(sa2, fname, "/");
2931 n2 = sarrayGetCount(sa2);
2932 for (i = 0; i < n2; i++) {
2933 str = sarrayGetString(sa2, i, L_NOCOPY);
2934 l_byteaAppendString(ba, str);
2935 l_byteaAppendString(ba, slash);
2936 }
2937 }
2938
2939 /* Remove trailing slash */
2940 dest = (char *)l_byteaCopyData(ba, &size);
2941 if (size > 1 && dest[size - 1] == '/')
2942 dest[size - 1] = '\0';
2943
2944 sarrayDestroy(&sa1);
2945 sarrayDestroy(&sa2);
2946 l_byteaDestroy(&ba);
2947 return dest;
2948 }
2949
2950
/*!
 * \brief   appendSubdirs()
 *
 * \param[in]    basedir   base directory; may be empty
 * \param[in]    subdirs   path to append; a leading '/' is skipped
 * \return  concatenated full directory path without trailing slash,
 *              or NULL on error
 *
 * <pre>
 * Notes:
 *      (1) Use unix pathname separators
 *      (2) Allocates a new string:  [basedir]/[subdirs]
 *          The caller owns the returned string.
 *      (3) An empty %basedir is handled like any %basedir that lacks
 *          a trailing '/': the separator is still inserted, so the
 *          result starts with '/'.
 * </pre>
 */
char *
appendSubdirs(const char  *basedir,
              const char  *subdirs)
{
    char   *newdir;
    size_t  len1, len2, len3, len4;

    if (!basedir || !subdirs)
        return (char *)ERROR_PTR("basedir and subdirs not both defined",
                                 __func__, NULL);

    len1 = strlen(basedir);
    len2 = strlen(subdirs);
    len3 = len1 + len2 + 8;  /* room for the separator and terminator */
    if ((newdir = (char *)LEPT_CALLOC(len3, 1)) == NULL)
        return (char *)ERROR_PTR("newdir not made", __func__, NULL);
    stringCat(newdir, len3, basedir);

        /* Add '/' if necessary.  Test len1 first: with an empty
         * basedir there is no last character to look at. */
    if (len1 == 0 || newdir[len1 - 1] != '/')
        newdir[len1] = '/';

    if (subdirs[0] == '/')  /* add subdirs, stripping leading '/' */
        stringCat(newdir, len3, subdirs + 1);
    else
        stringCat(newdir, len3, subdirs);

    len4 = strlen(newdir);
    if (len4 > 0 && newdir[len4 - 1] == '/')  /* strip trailing '/' */
        newdir[len4 - 1] = '\0';

    return newdir;
}
2994
2995
2996 /*--------------------------------------------------------------------*
2997 * Special file name operations *
2998 *--------------------------------------------------------------------*/
2999 /*!
3000 * \brief convertSepCharsInPath()
3001 *
3002 * \param[in] path
3003 * \param[in] type UNIX_PATH_SEPCHAR, WIN_PATH_SEPCHAR
3004 * \return 0 if OK, 1 on error
3005 *
3006 * <pre>
3007 * Notes:
3008 * (1) In-place conversion.
3009 * (2) Type is the resulting type:
3010 * * UNIX_PATH_SEPCHAR: '\\' ==> '/'
3011 * * WIN_PATH_SEPCHAR: '/' ==> '\\'
3012 * (3) Virtually all path operations in leptonica use unix separators.
3013 * (4) The backslash is a valid character in unix pathnames and should
3014 * not be converted. Each backslash needs to be escaped with a
3015 * preceding backslash for the shell, but the actual filename
3016 * does not include these escape characters.
3017 * </pre>
3018 */
3019 l_ok
3020 convertSepCharsInPath(char *path,
3021 l_int32 type)
3022 {
3023 l_int32 i;
3024 size_t len;
3025
3026 if (!path)
3027 return ERROR_INT("path not defined", __func__, 1);
3028 if (type != UNIX_PATH_SEPCHAR && type != WIN_PATH_SEPCHAR)
3029 return ERROR_INT("invalid type", __func__, 1);
3030
3031 len = strlen(path);
3032 if (type == UNIX_PATH_SEPCHAR) {
3033 #ifdef _WIN32 /* only convert on Windows */
3034 for (i = 0; i < len; i++) {
3035 if (path[i] == '\\')
3036 path[i] = '/';
3037 }
3038 #endif /* _WIN32 */
3039 } else { /* WIN_PATH_SEPCHAR */
3040 for (i = 0; i < len; i++) {
3041 if (path[i] == '/')
3042 path[i] = '\\';
3043 }
3044 }
3045 return 0;
3046 }
3047
3048
3049 /*!
3050 * \brief genPathname()
3051 *
3052 * \param[in] dir [optional] directory or full path name,
3053 * with or without the trailing '/'
3054 * \param[in] fname [optional] file name within a directory
3055 * \return pathname either a directory or full path, or NULL on error
3056 *
3057 * <pre>
3058 * Notes:
3059 * (1) This function generates actual paths in the following ways:
3060 * * from two sub-parts (e.g., a directory and a file name).
3061 * * from a single path full path, placed in %dir, with
3062 * %fname == NULL.
3063 * * from the name of a file in the local directory placed in
3064 * %fname, with %dir == NULL.
3065 * * if in a "/tmp" directory and on iOS, macOS or Windows,
3066 * the OS specific temp directory is used.
3067 * (2) This does an automatic directory translation for operating
3068 * systems that use a different path for /tmp.
3069 * That path is determined
3070 * * on Windows: by GetTempPath()
3071 * * on macOS, iOS: by confstr() (see man page)
3072 * (3) On unix, the TMPDIR variable is ignored. No rewriting
3073 * of temp directories is permitted.
3074 * (4) There are four cases for the input:
3075 * (a) %dir is a directory and %fname is defined: result is a
3076 * full path
3077 * (b) %dir is a directory and %fname is null: result is a directory
3078 * (c) %dir is a full path and %fname is null: result is a full path
3079 * (d) %dir is null or an empty string: start in the current dir;
3080 * result is a full path
3081 * (5) In all cases, the resulting pathname is not terminated with a slash
3082 * (6) The caller is responsible for freeing the returned pathname.
3083 * </pre>
3084 */
3085 char *
3086 genPathname(const char *dir,
3087 const char *fname)
3088 {
3089 #if defined(REWRITE_TMP)
3090 l_int32 rewrite_tmp = TRUE;
3091 #else
3092 l_int32 rewrite_tmp = FALSE;
3093 #endif /* REWRITE_TMP */
3094 char *cdir, *pathout;
3095 l_int32 dirlen, namelen;
3096 size_t size;
3097
3098 if (!dir && !fname)
3099 return (char *)ERROR_PTR("no input", __func__, NULL);
3100
3101 /* Handle the case where we start from the current directory */
3102 if (!dir || dir[0] == '\0') {
3103 if ((cdir = getcwd(NULL, 0)) == NULL)
3104 return (char *)ERROR_PTR("no current dir found", __func__, NULL);
3105 } else {
3106 if ((cdir = stringNew(dir)) == NULL)
3107 return (char *)ERROR_PTR("stringNew failed", __func__, NULL);
3108 }
3109
3110 /* Convert to unix path separators, and remove the trailing
3111 * slash in the directory, except when dir == "/" */
3112 convertSepCharsInPath(cdir, UNIX_PATH_SEPCHAR);
3113 dirlen = strlen(cdir);
3114 if (cdir[dirlen - 1] == '/' && dirlen != 1) {
3115 cdir[dirlen - 1] = '\0';
3116 dirlen--;
3117 }
3118
3119 namelen = (fname) ? strlen(fname) : 0;
3120 size = dirlen + namelen + 256;
3121 if ((pathout = (char *)LEPT_CALLOC(size, sizeof(char))) == NULL) {
3122 LEPT_FREE(cdir);
3123 return (char *)ERROR_PTR("pathout not made", __func__, NULL);
3124 }
3125
3126 /* First handle %dir (which may be a full pathname).
3127 * There is no path rewriting on unix, and on win32, we do not
3128 * rewrite unless the specified directory is /tmp or
3129 * a subdirectory of /tmp */
3130 if (!rewrite_tmp || dirlen < 4 ||
3131 (dirlen == 4 && strncmp(cdir, "/tmp", 4) != 0) || /* not in "/tmp" */
3132 (dirlen > 4 && strncmp(cdir, "/tmp/", 5) != 0)) { /* not in "/tmp/" */
3133 stringCopy(pathout, cdir, dirlen);
3134 } else { /* Rewrite with "/tmp" specified for the directory. */
3135 #if defined(__APPLE__)
3136 size_t n = confstr(_CS_DARWIN_USER_TEMP_DIR, pathout, size);
3137 if (n == 0 || n > size) {
3138 /* Fall back to using /tmp */
3139 stringCopy(pathout, cdir, dirlen);
3140 } else {
3141 /* Add the rest of cdir */
3142 if (dirlen > 4)
3143 stringCat(pathout, size, cdir + 4);
3144 }
3145 #elif defined(_WIN32)
3146 l_int32 tmpdirlen;
3147 char tmpdir[MAX_PATH];
3148 GetTempPathA(sizeof(tmpdir), tmpdir); /* get the Windows temp dir */
3149 tmpdirlen = strlen(tmpdir);
3150 if (tmpdirlen > 0 && tmpdir[tmpdirlen - 1] == '\\') {
3151 tmpdir[tmpdirlen - 1] = '\0'; /* trim the trailing '\' */
3152 }
3153 tmpdirlen = strlen(tmpdir);
3154 stringCopy(pathout, tmpdir, tmpdirlen);
3155
3156 /* Add the rest of cdir */
3157 if (dirlen > 4)
3158 stringCat(pathout, size, cdir + 4);
3159 #endif /* _WIN32 */
3160 }
3161
3162 /* Now handle %fname */
3163 if (fname && strlen(fname) > 0) {
3164 dirlen = strlen(pathout);
3165 pathout[dirlen] = '/';
3166 stringCat(pathout, size, fname);
3167 }
3168
3169 LEPT_FREE(cdir);
3170 return pathout;
3171 }
3172
3173
3174 /*!
3175 * \brief makeTempDirname()
3176 *
3177 * \param[in] result preallocated on stack or heap and passed in
3178 * \param[in] nbytes size of %result array, in bytes
3179 * \param[in] subdir [optional]; can be NULL or an empty string
3180 * \return 0 if OK, 1 on error
3181 *
3182 * <pre>
3183 * Notes:
3184 * (1) This generates the directory path for output temp files,
3185 * written into %result with unix separators.
3186 * (2) Caller allocates %result, large enough to hold the path,
3187 * which is:
3188 * /tmp/%subdir (unix)
3189 * [Temp]/%subdir (Windows, macOS, iOS)
3190 * where [Temp] is the OS path
3191 * and %subdir is in general a set of nested subdirectories:
3192 * dir1/dir2/.../dirN
3193 * which in use would not typically exceed 2 levels.
3194 * (3) Usage example:
3195 * \code
3196 * char result[256];
3197 * makeTempDirname(result, sizeof(result), "lept/golden");
3198 * \endcode
3199 * </pre>
3200 */
3201 l_ok
3202 makeTempDirname(char *result,
3203 size_t nbytes,
3204 const char *subdir)
3205 {
3206 char *dir, *path;
3207 l_int32 ret = 0;
3208 size_t pathlen;
3209
3210 if (!result)
3211 return ERROR_INT("result not defined", __func__, 1);
3212 if (subdir && ((subdir[0] == '.') || (subdir[0] == '/')))
3213 return ERROR_INT("subdir not an actual subdirectory", __func__, 1);
3214
3215 memset(result, 0, nbytes);
3216
3217 dir = pathJoin("/tmp", subdir);
3218
3219 #if defined(REWRITE_TMP)
3220 path = genPathname(dir, NULL);
3221 #else
3222 path = stringNew(dir);
3223 #endif /* ~ _WIN32 */
3224 pathlen = strlen(path);
3225 if (pathlen < nbytes - 1) {
3226 stringCopy(result, path, nbytes);
3227 } else {
3228 L_ERROR("result array too small for path\n", __func__);
3229 ret = 1;
3230 }
3231
3232 LEPT_FREE(dir);
3233 LEPT_FREE(path);
3234 return ret;
3235 }
3236
3237
3238 /*!
3239 * \brief modifyTrailingSlash()
3240 *
3241 * \param[in] path preallocated on stack or heap and passed in
3242 * \param[in] nbytes size of %path array, in bytes
3243 * \param[in] flag L_ADD_TRAIL_SLASH or L_REMOVE_TRAIL_SLASH
3244 * \return 0 if OK, 1 on error
3245 *
3246 * <pre>
3247 * Notes:
3248 * (1) This carries out the requested action if necessary.
3249 * </pre>
3250 */
3251 l_ok
3252 modifyTrailingSlash(char *path,
3253 size_t nbytes,
3254 l_int32 flag)
3255 {
3256 char lastchar;
3257 size_t len;
3258
3259 if (!path)
3260 return ERROR_INT("path not defined", __func__, 1);
3261 if (flag != L_ADD_TRAIL_SLASH && flag != L_REMOVE_TRAIL_SLASH)
3262 return ERROR_INT("invalid flag", __func__, 1);
3263
3264 len = strlen(path);
3265 lastchar = path[len - 1];
3266 if (flag == L_ADD_TRAIL_SLASH && lastchar != '/' && len < nbytes - 2) {
3267 path[len] = '/';
3268 path[len + 1] = '\0';
3269 } else if (flag == L_REMOVE_TRAIL_SLASH && lastchar == '/') {
3270 path[len - 1] = '\0';
3271 }
3272 return 0;
3273 }
3274
3275
3276 /*!
3277 * \brief l_makeTempFilename()
3278 *
3279 * \return fname : heap allocated filename; returns NULL on failure.
3280 *
3281 * <pre>
3282 * Notes:
3283 * (1) On unix, this makes a filename of the form
3284 * "/tmp/lept.XXXXXX",
3285 * where each X is a random character.
3286 * (2) On Windows, this makes a filename of the form
3287 * "/[Temp]/lp.XXXXXX".
3288 * (3) On all systems, this fails if the file is not writable.
3289 * (4) Safest usage is to write to a subdirectory in debug code.
3290 * (5) The returned filename must be freed by the caller, using lept_free.
3291 * (6) The tail of the filename has a '.', so that cygwin interprets
3292 * the file as having an extension. Otherwise, cygwin assumes it
3293 * is an executable and appends ".exe" to the filename.
3294 * (7) On unix, whenever possible use tmpfile() instead. tmpfile()
3295 * hides the file name, returns a stream opened for write,
3296 * and deletes the temp file when the stream is closed.
3297 * </pre>
3298 */
3299 char *
3300 l_makeTempFilename(void)
3301 {
3302 char dirname[240];
3303
3304 if (makeTempDirname(dirname, sizeof(dirname), NULL) == 1)
3305 return (char *)ERROR_PTR("failed to make dirname", __func__, NULL);
3306
3307 #ifndef _WIN32
3308 {
3309 char *pattern;
3310 l_int32 fd;
3311 pattern = stringConcatNew(dirname, "/lept.XXXXXX", NULL);
3312 fd = mkstemp(pattern);
3313 if (fd == -1) {
3314 LEPT_FREE(pattern);
3315 return (char *)ERROR_PTR("mkstemp failed", __func__, NULL);
3316 }
3317 close(fd);
3318 return pattern;
3319 }
3320 #else
3321 {
3322 char fname[MAX_PATH];
3323 FILE *fp;
3324 if (GetTempFileNameA(dirname, "lp.", 0, fname) == 0)
3325 return (char *)ERROR_PTR("GetTempFileName failed", __func__, NULL);
3326 if ((fp = fopen(fname, "wb")) == NULL)
3327 return (char *)ERROR_PTR("file cannot be written to", __func__, NULL);
3328 fclose(fp);
3329 return stringNew(fname);
3330 }
3331 #endif /* ~ _WIN32 */
3332 }
3333
3334
3335 /*!
3336 * \brief extractNumberFromFilename()
3337 *
3338 * \param[in] fname
3339 * \param[in] numpre number of characters before the digits to be found
3340 * \param[in] numpost number of characters after the digits to be found
3341 * \return num number embedded in the filename; -1 on error or if
3342 * not found
3343 *
3344 * <pre>
3345 * Notes:
3346 * (1) The number is to be found in the basename, which is the
3347 * filename without either the directory or the last extension.
3348 * (2) When a number is found, it is non-negative. If no number
3349 * is found, this returns -1, without an error message. The
3350 * caller needs to check.
3351 * </pre>
3352 */
3353 l_int32
3354 extractNumberFromFilename(const char *fname,
3355 l_int32 numpre,
3356 l_int32 numpost)
3357 {
3358 char *tail, *basename;
3359 l_int32 len, nret, num;
3360
3361 if (!fname)
3362 return ERROR_INT("fname not defined", __func__, -1);
3363
3364 splitPathAtDirectory(fname, NULL, &tail);
3365 splitPathAtExtension(tail, &basename, NULL);
3366 LEPT_FREE(tail);
3367
3368 len = strlen(basename);
3369 if (numpre + numpost > len - 1) {
3370 LEPT_FREE(basename);
3371 return ERROR_INT("numpre + numpost too big", __func__, -1);
3372 }
3373
3374 basename[len - numpost] = '\0';
3375 nret = sscanf(basename + numpre, "%d", &num);
3376 LEPT_FREE(basename);
3377
3378 if (nret == 1)
3379 return num;
3380 else
3381 return -1; /* not found */
3382 }