diff mupdf-source/thirdparty/leptonica/src/sarray1.c @ 2:b50eed0cc0ef upstream

ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4. The directory name has changed: no version number in the expanded directory now.
author Franz Glasner <fzglas.hg@dom66.de>
date Mon, 15 Sep 2025 11:43:07 +0200
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/mupdf-source/thirdparty/leptonica/src/sarray1.c	Mon Sep 15 11:43:07 2025 +0200
@@ -0,0 +1,1988 @@
+/*====================================================================*
+ -  Copyright (C) 2001 Leptonica.  All rights reserved.
+ -
+ -  Redistribution and use in source and binary forms, with or without
+ -  modification, are permitted provided that the following conditions
+ -  are met:
+ -  1. Redistributions of source code must retain the above copyright
+ -     notice, this list of conditions and the following disclaimer.
+ -  2. Redistributions in binary form must reproduce the above
+ -     copyright notice, this list of conditions and the following
+ -     disclaimer in the documentation and/or other materials
+ -     provided with the distribution.
+ -
+ -  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ -  ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ -  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ -  A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL ANY
+ -  CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ -  EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ -  PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ -  PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ -  OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ -  NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ -  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *====================================================================*/
+
+/*!
+ * \file  sarray1.c
+ * <pre>
+ *
+ *      Create/Destroy/Copy
+ *          SARRAY    *sarrayCreate()
+ *          SARRAY    *sarrayCreateInitialized()
+ *          SARRAY    *sarrayCreateWordsFromString()
+ *          SARRAY    *sarrayCreateLinesFromString()
+ *          void       sarrayDestroy()
+ *          SARRAY    *sarrayCopy()
+ *          SARRAY    *sarrayClone()
+ *
+ *      Add/Remove string
+ *          l_int32    sarrayAddString()
+ *          static l_int32  sarrayExtendArray()
+ *          char      *sarrayRemoveString()
+ *          l_int32    sarrayReplaceString()
+ *          l_int32    sarrayClear()
+ *
+ *      Accessors
+ *          l_int32    sarrayGetCount()
+ *          char     **sarrayGetArray()
+ *          char      *sarrayGetString()
+ *
+ *      Conversion back to string
+ *          char      *sarrayToString()
+ *          char      *sarrayToStringRange()
+ *
+ *      Concatenate strings uniformly within the sarray
+ *          SARRAY    *sarrayConcatUniformly()
+ *
+ *      Join 2 sarrays
+ *          l_int32    sarrayJoin()
+ *          l_int32    sarrayAppendRange()
+ *
+ *      Pad an sarray to be the same size as another sarray
+ *          l_int32    sarrayPadToSameSize()
+ *
+ *      Convert word sarray to (formatted) line sarray
+ *          SARRAY    *sarrayConvertWordsToLines()
+ *
+ *      Split string on separator list
+ *          SARRAY    *sarraySplitString()
+ *
+ *      Filter sarray
+ *          SARRAY    *sarraySelectBySubstring()
+ *          SARRAY    *sarraySelectRange()
+ *          l_int32    sarrayParseRange()
+ *
+ *      Serialize for I/O
+ *          SARRAY    *sarrayRead()
+ *          SARRAY    *sarrayReadStream()
+ *          SARRAY    *sarrayReadMem()
+ *          l_int32    sarrayWrite()
+ *          l_int32    sarrayWriteStream()
+ *          l_int32    sarrayWriteStderr()
+ *          l_int32    sarrayWriteMem()
+ *          l_int32    sarrayAppend()
+ *
+ *      Directory filenames
+ *          SARRAY    *getNumberedPathnamesInDirectory()
+ *          SARRAY    *getSortedPathnamesInDirectory()
+ *          SARRAY    *convertSortedToNumberedPathnames()
+ *          SARRAY    *getFilenamesInDirectory()
+ *
+ *      These functions are important for efficient manipulation
+ *      of string data, and they have found widespread use in
+ *      leptonica.  For example:
+ *         (1) to generate text files: e.g., PostScript and PDF
+ *             wrappers around sets of images
+ *         (2) to parse text files: e.g., extracting prototypes
+ *             from the source to generate allheaders.h
+ *         (3) to generate code for compilation: e.g., the fast
+ *             dwa code for arbitrary structuring elements.
+ *
+ *      Comments on usage:
+ *
+ *          The user is responsible for correctly disposing of strings
+ *          that have been extracted from sarrays.  In the following,
+ *          "str-not-owned" means the returned handle does not own the string,
+ *          and "str-owned" means the returned handle owns the string.
+ *            - To extract a string from an Sarray in order to inspect it
+ *              or to make a copy of it later, get a handle to it:
+ *                  copyflag = L_NOCOPY.
+ *              In this case, you must neither free the string nor put it
+ *              directly in another array:
+ *                 str-not-owned = sarrayGetString(sa, index, L_NOCOPY);
+ *            - To extract a copy of a string from an Sarray, use:
+ *                 str-owned = sarrayGetString(sa, index, L_COPY);
+ *            - To insert a string that is in one array into another
+ *              array (always leaving the first array intact), there are
+ *              two options:
+ *                 (1) use copyflag = L_COPY to make an immediate copy,
+ *                     which you then add to the second array by insertion:
+ *                       str-owned = sarrayGetString(sa, index, L_COPY);
+ *                       sarrayAddString(sa2, str-owned, L_INSERT);
+ *                 (2) use copyflag = L_NOCOPY to get another handle to
+ *                     the string; you then add a copy of it to the
+ *                     second string array:
+ *                       str-not-owned = sarrayGetString(sa, index, L_NOCOPY);
+ *                       sarrayAddString(sa2, str-not-owned, L_COPY).
+ *              sarrayAddString() transfers ownership to the Sarray, so never
+ *              use L_INSERT if the string is owned by another array.
+ *
+ *              In all cases, when you use copyflag = L_COPY to extract
+ *              a string from an array, you must either free it
+ *              or insert it in an array that will be freed later.
+ * </pre>
+ */
+
+#ifdef HAVE_CONFIG_H
+#include <config_auto.h>
+#endif  /* HAVE_CONFIG_H */
+
+#include <string.h>
+#ifndef _WIN32
+#include <dirent.h>     /* unix only */
+#include <sys/stat.h>
+#include <limits.h>  /* needed for realpath() */
+#include <stdlib.h>  /* needed for realpath() */
+#endif  /* ! _WIN32 */
+#include "allheaders.h"
+#include "array_internal.h"
+
+    /* Upper bound on the number of string ptrs an sarray may hold;
+     * sarrayCreate() falls back to the default size for requests
+     * above it, and sarrayExtendArray() refuses to grow beyond it. */
+static const l_uint32  MaxPtrArraySize = 50000000;    /* 50 million */
+    /* Default ptr array size that sarrayCreate() uses when the
+     * requested size is not usable. */
+static const l_int32   InitialPtrArraySize = 50;      /*!< arbitrary choice */
+
+    /* Static functions */
+static l_int32 sarrayExtendArray(SARRAY *sa);
+
+
+/*--------------------------------------------------------------------------*
+ *                   String array create/destroy/copy/extend                *
+ *--------------------------------------------------------------------------*/
+/*!
+ * \brief   sarrayCreate()
+ *
+ * \param[in]    n    size of string ptr array to be alloc'd; use 0 for default
+ * \return  sarray, or NULL on error
+ *
+ * <pre>
+ * Notes:
+ *      (1) If %n is not positive, or is larger than MaxPtrArraySize,
+ *          the ptr array is allocated with InitialPtrArraySize entries.
+ *      (2) The returned sarray holds no strings and has a refcount of 1.
+ * </pre>
+ */
+SARRAY *
+sarrayCreate(l_int32  n)
+{
+SARRAY  *sa;
+
+    if (n <= 0 || n > (l_int32)MaxPtrArraySize)
+        n = InitialPtrArraySize;
+
+    if ((sa = (SARRAY *)LEPT_CALLOC(1, sizeof(SARRAY))) == NULL)
+        return (SARRAY *)ERROR_PTR("sa not made", __func__, NULL);
+
+        /* Take the reference before anything else can fail.
+         * sarrayDestroy() only frees when the decremented refcount
+         * reaches 0, so destroying with a refcount of 0 would leak sa. */
+    sa->refcount = 1;
+    if ((sa->array = (char **)LEPT_CALLOC(n, sizeof(char *))) == NULL) {
+        sarrayDestroy(&sa);
+        return (SARRAY *)ERROR_PTR("ptr array not made", __func__, NULL);
+    }
+
+    sa->nalloc = n;
+    sa->n = 0;
+    return sa;
+}
+
+
+/*!
+ * \brief   sarrayCreateInitialized()
+ *
+ * \param[in]    n         size of string ptr array to be alloc'd
+ * \param[in]    initstr   string to be initialized on the full array
+ * \return  sarray, or NULL on error
+ *
+ * <pre>
+ * Notes:
+ *      (1) The returned sarray holds %n independent copies of %initstr.
+ *      (2) If the sarray cannot be created, or any copy cannot be
+ *          added, nothing is returned; a partially filled sarray
+ *          is destroyed rather than handed back.
+ * </pre>
+ */
+SARRAY *
+sarrayCreateInitialized(l_int32      n,
+                        const char  *initstr)
+{
+l_int32  i;
+SARRAY  *sa;
+
+    if (n <= 0)
+        return (SARRAY *)ERROR_PTR("n must be > 0", __func__, NULL);
+    if (!initstr)
+        return (SARRAY *)ERROR_PTR("initstr not defined", __func__, NULL);
+
+    if ((sa = sarrayCreate(n)) == NULL)
+        return (SARRAY *)ERROR_PTR("sa not made", __func__, NULL);
+    for (i = 0; i < n; i++) {
+        if (sarrayAddString(sa, initstr, L_COPY)) {
+            sarrayDestroy(&sa);
+            return (SARRAY *)ERROR_PTR("string not added", __func__, NULL);
+        }
+    }
+    return sa;
+}
+
+
+/*!
+ * \brief   sarrayCreateWordsFromString()
+ *
+ * \param[in]    string
+ * \return  sarray, or NULL on error
+ *
+ * <pre>
+ * Notes:
+ *      (1) A word is a maximal run of characters that are not a
+ *          space, tab or newline.  The words are counted first so
+ *          the sarray can be created with that many ptrs, and then
+ *          sarraySplitString() is called to fill it.
+ * </pre>
+ */
+SARRAY *
+sarrayCreateWordsFromString(const char  *string)
+{
+char     separators[] = " \n\t";
+char     ch;
+l_int32  pos, nwords, len, inword, issep;
+SARRAY  *sa;
+
+    if (!string)
+        return (SARRAY *)ERROR_PTR("textstr not defined", __func__, NULL);
+
+        /* Count word starts: each transition from separator to
+         * non-separator begins a new word */
+    len = strlen(string);
+    nwords = 0;
+    inword = FALSE;
+    for (pos = 0; pos < len; pos++) {
+        ch = string[pos];
+        issep = (ch == ' ' || ch == '\t' || ch == '\n');
+        if (inword == FALSE && !issep) {
+            inword = TRUE;
+            nwords++;
+        } else if (inword == TRUE && issep) {
+            inword = FALSE;
+        }
+    }
+
+    sa = sarrayCreate(nwords);
+    if (sa == NULL)
+        return (SARRAY *)ERROR_PTR("sa not made", __func__, NULL);
+    sarraySplitString(sa, string, separators);
+
+    return sa;
+}
+
+
+/*!
+ * \brief   sarrayCreateLinesFromString()
+ *
+ * \param[in]    string
+ * \param[in]    blankflag    0 to exclude blank lines; 1 to include
+ * \return  sarray, or NULL on error
+ *
+ * <pre>
+ * Notes:
+ *      (1) The input is split at each newline, and a copy of each
+ *          line is placed in a new sarray.  Text following the last
+ *          newline is also kept as a line.
+ *      (2) The newline is not part of the stored line.  When blank
+ *          lines are kept, a carriage return immediately preceding
+ *          the newline is dropped as well.
+ *      (3) When blank lines are excluded, the splitting is delegated
+ *          to sarraySplitString() with "\r\n" as the separators.
+ * </pre>
+ */
+SARRAY *
+sarrayCreateLinesFromString(const char  *string,
+                            l_int32      blankflag)
+{
+l_int32  pos, nlines, len, linestart;
+char    *work, *line;
+SARRAY  *sa;
+
+    if (!string)
+        return (SARRAY *)ERROR_PTR("textstr not defined", __func__, NULL);
+
+        /* Count the newlines to size the ptr array */
+    len = strlen(string);
+    nlines = 0;
+    for (pos = 0; pos < len; pos++) {
+        if (string[pos] == '\n')
+            nlines++;
+    }
+
+    sa = sarrayCreate(nlines);
+    if (sa == NULL)
+        return (SARRAY *)ERROR_PTR("sa not made", __func__, NULL);
+
+        /* Dropping blank lines: let the splitter do all the work */
+    if (!blankflag) {
+        sarraySplitString(sa, string, "\r\n");
+        return sa;
+    }
+
+        /* Keeping blank lines as empty strings: work on a private
+         * copy, terminating each line in place */
+    work = stringNew(string);
+    if (work == NULL) {
+        sarrayDestroy(&sa);
+        return (SARRAY *)ERROR_PTR("cstring not made", __func__, NULL);
+    }
+
+    linestart = 0;
+    for (pos = 0; pos < len; pos++) {
+        if (work[pos] != '\n')
+            continue;
+        work[pos] = '\0';
+        if (pos > 0 && work[pos - 1] == '\r')
+            work[pos - 1] = '\0';  /* strip the CR of a CRLF pair */
+        line = stringNew(work + linestart);
+        if (line == NULL) {
+            sarrayDestroy(&sa);
+            LEPT_FREE(work);
+            return (SARRAY *)ERROR_PTR("substring not made",
+                                        __func__, NULL);
+        }
+        sarrayAddString(sa, line, L_INSERT);
+        linestart = pos + 1;
+    }
+
+        /* Trailing text that was not terminated by a newline */
+    if (linestart < len) {
+        line = stringNew(work + linestart);
+        if (line == NULL) {
+            sarrayDestroy(&sa);
+            LEPT_FREE(work);
+            return (SARRAY *)ERROR_PTR("substring not made",
+                                       __func__, NULL);
+        }
+        sarrayAddString(sa, line, L_INSERT);
+    }
+    LEPT_FREE(work);
+
+    return sa;
+}
+
+
+/*!
+ * \brief   sarrayDestroy()
+ *
+ * \param[in,out]   psa    will be set to null before returning
+ * \return  void
+ *
+ * <pre>
+ * Notes:
+ *      (1) The ref count is decremented; only when it reaches 0 are
+ *          the strings, the ptr array and the sarray itself freed.
+ *      (2) The caller's handle is nulled whenever it was non-null,
+ *          whether or not the sarray was actually freed.
+ * </pre>
+ */
+void
+sarrayDestroy(SARRAY  **psa)
+{
+l_int32  k;
+char   **strs;
+SARRAY  *victim;
+
+    if (!psa) {
+        L_WARNING("ptr address is NULL!\n", __func__);
+        return;
+    }
+    victim = *psa;
+    if (!victim)
+        return;
+
+    if (--victim->refcount == 0) {
+        strs = victim->array;
+        if (strs) {
+                /* Only the first n slots hold owned strings */
+            for (k = 0; k < victim->n; k++) {
+                if (strs[k])
+                    LEPT_FREE(strs[k]);
+            }
+            LEPT_FREE(strs);
+        }
+        LEPT_FREE(victim);
+    }
+    *psa = NULL;
+}
+
+
+/*!
+ * \brief   sarrayCopy()
+ *
+ * \param[in]    sa    string array
+ * \return  copy of sarray, or NULL on error
+ *
+ * <pre>
+ * Notes:
+ *      (1) The copy is created with the same ptr array allocation
+ *          as %sa and receives its own copy of every string.
+ * </pre>
+ */
+SARRAY *
+sarrayCopy(SARRAY  *sa)
+{
+l_int32  k;
+SARRAY  *dup;
+
+    if (!sa)
+        return (SARRAY *)ERROR_PTR("sa not defined", __func__, NULL);
+
+    dup = sarrayCreate(sa->nalloc);
+    if (dup == NULL)
+        return (SARRAY *)ERROR_PTR("csa not made", __func__, NULL);
+
+    k = 0;
+    while (k < sa->n) {
+        sarrayAddString(dup, sa->array[k], L_COPY);
+        k++;
+    }
+
+    return dup;
+}
+
+
+/*!
+ * \brief   sarrayClone()
+ *
+ * \param[in]    sa    string array
+ * \return  ptr to same sarray, or NULL on error
+ *
+ * <pre>
+ * Notes:
+ *      (1) No data is copied; the ref count of %sa is incremented
+ *          and the same handle is returned.
+ * </pre>
+ */
+SARRAY *
+sarrayClone(SARRAY  *sa)
+{
+    if (sa == NULL)
+        return (SARRAY *)ERROR_PTR("sa not defined", __func__, NULL);
+
+    sa->refcount++;
+    return sa;
+}
+
+
+/*!
+ * \brief   sarrayAddString()
+ *
+ * \param[in]    sa         string array
+ * \param[in]    string     string to be added
+ * \param[in]    copyflag   L_INSERT, L_NOCOPY or L_COPY
+ * \return  0 if OK, 1 on error
+ *
+ * <pre>
+ * Notes:
+ *      (1) See usage comments at the top of this file.  L_INSERT is
+ *          equivalent to L_NOCOPY.
+ *      (2) With L_COPY, a copy of %string is stored.  With L_INSERT
+ *          or L_NOCOPY, the %string pointer itself is stored.
+ *      (3) On error the sarray is left unchanged; in particular, if
+ *          the copy for L_COPY cannot be made, no null entry is added.
+ * </pre>
+ */
+l_ok
+sarrayAddString(SARRAY      *sa,
+                const char  *string,
+                l_int32      copyflag)
+{
+char    *str;
+l_int32  n;
+
+    if (!sa)
+        return ERROR_INT("sa not defined", __func__, 1);
+    if (!string)
+        return ERROR_INT("string not defined", __func__, 1);
+    if (copyflag != L_INSERT && copyflag != L_NOCOPY && copyflag != L_COPY)
+        return ERROR_INT("invalid copyflag", __func__, 1);
+
+    n = sarrayGetCount(sa);
+    if (n >= sa->nalloc) {
+        if (sarrayExtendArray(sa))
+            return ERROR_INT("extension failed", __func__, 1);
+    }
+
+    if (copyflag == L_COPY) {
+            /* Do not store a null ptr and bump the count on failure;
+             * that would leave a hole in the array. */
+        if ((str = stringNew(string)) == NULL)
+            return ERROR_INT("string copy not made", __func__, 1);
+    } else {  /* L_INSERT or L_NOCOPY */
+        str = (char *)string;
+    }
+    sa->array[n] = str;
+    sa->n++;
+    return 0;
+}
+
+
+/*!
+ * \brief   sarrayExtendArray()
+ *
+ * \param[in]    sa    string array
+ * \return  0 if OK, 1 on error
+ *
+ * <pre>
+ * Notes:
+ *      (1) Doubles the size of the string ptr array, except that the
+ *          new size is capped at MaxPtrArraySize.
+ *      (2) The max number of strings is 50M.
+ *      (3) The ptr array and its recorded allocation size are only
+ *          updated after the new array has been obtained, so that a
+ *          failure does not replace the array ptr with null or
+ *          record a size larger than what is allocated.
+ * </pre>
+ */
+static l_int32
+sarrayExtendArray(SARRAY  *sa)
+{
+char   **newarray;
+l_int32  newalloc;
+size_t   oldsize, newsize;
+
+    if (!sa)
+        return ERROR_INT("sa not defined", __func__, 1);
+    if (sa->nalloc >= (l_int32)MaxPtrArraySize)  /* belt & suspenders */
+        return ERROR_INT("sa at maximum ptr size; can't extend", __func__, 1);
+
+    oldsize = sa->nalloc * sizeof(char *);
+    if (sa->nalloc > (l_int32)(MaxPtrArraySize / 2)) {
+            /* Doubling would exceed the cap; grow to the cap instead */
+        newalloc = (l_int32)MaxPtrArraySize;
+        newsize = MaxPtrArraySize * sizeof(char *);
+    } else {
+        newalloc = 2 * sa->nalloc;
+        newsize = 2 * oldsize;
+    }
+
+        /* NOTE(review): this relies on reallocNew() leaving the input
+         * array untouched when it cannot allocate -- confirm against
+         * its implementation. */
+    newarray = (char **)reallocNew((void **)&sa->array, oldsize, newsize);
+    if (newarray == NULL)
+        return ERROR_INT("new ptr array not returned", __func__, 1);
+
+    sa->array = newarray;
+    sa->nalloc = newalloc;
+    return 0;
+}
+
+
+/*!
+ * \brief   sarrayRemoveString()
+ *
+ * \param[in]    sa       string array
+ * \param[in]    index    of string within sarray
+ * \return  removed string, or NULL on error
+ *
+ * <pre>
+ * Notes:
+ *      (1) The string is not freed; the ptr that was stored at
+ *          %index is handed back to the caller.
+ *      (2) The strings after %index are each moved down one slot,
+ *          so the order of the remaining strings is preserved.
+ * </pre>
+ */
+char *
+sarrayRemoveString(SARRAY  *sa,
+                   l_int32  index)
+{
+char    *removed;
+char   **strs;
+l_int32  k, count, nalloc;
+
+    if (!sa)
+        return (char *)ERROR_PTR("sa not defined", __func__, NULL);
+
+    strs = sarrayGetArray(sa, &nalloc, &count);
+    if (strs == NULL)
+        return (char *)ERROR_PTR("array not returned", __func__, NULL);
+
+    if (index < 0 || index >= count)
+        return (char *)ERROR_PTR("array index out of bounds", __func__, NULL);
+
+    removed = strs[index];
+
+        /* Close the gap by sliding every later ptr down by one.
+         * Swapping in the last entry would be cheaper but would
+         * not keep the original ordering.  */
+    for (k = index + 1; k < count; k++)
+        strs[k - 1] = strs[k];
+
+    sa->n--;
+    return removed;
+}
+
+
+/*!
+ * \brief   sarrayReplaceString()
+ *
+ * \param[in]    sa         string array
+ * \param[in]    index      of string within sarray to be replaced
+ * \param[in]    newstr     string to replace existing one
+ * \param[in]    copyflag   L_INSERT, L_COPY
+ * \return  0 if OK, 1 on error
+ *
+ * <pre>
+ * Notes:
+ *      (1) This destroys an existing string and replaces it with
+ *          the new string or a copy of it.
+ *      (2) By design, an sarray is always compacted, so there are
+ *          never any holes (null ptrs) in the ptr array up to the
+ *          current count.
+ *      (3) The replacement is obtained before the existing string is
+ *          freed.  This keeps the existing string in place if the
+ *          copy cannot be made, and makes it safe for %newstr to be
+ *          the string that is currently stored at %index.
+ * </pre>
+ */
+l_ok
+sarrayReplaceString(SARRAY  *sa,
+                    l_int32  index,
+                    char    *newstr,
+                    l_int32  copyflag)
+{
+char    *str;
+l_int32  n;
+
+    if (!sa)
+        return ERROR_INT("sa not defined", __func__, 1);
+    n = sarrayGetCount(sa);
+    if (index < 0 || index >= n)
+        return ERROR_INT("array index out of bounds", __func__, 1);
+    if (!newstr)
+        return ERROR_INT("newstr not defined", __func__, 1);
+    if (copyflag != L_INSERT && copyflag != L_COPY)
+        return ERROR_INT("invalid copyflag", __func__, 1);
+
+    if (copyflag == L_INSERT) {
+            /* Inserting the string that is already stored here is a
+             * no-op; freeing it would leave a dangling ptr behind. */
+        if (newstr == sa->array[index])
+            return 0;
+        str = newstr;
+    } else {  /* L_COPY */
+        if ((str = stringNew(newstr)) == NULL)
+            return ERROR_INT("newstr copy not made", __func__, 1);
+    }
+
+    LEPT_FREE(sa->array[index]);
+    sa->array[index] = str;
+    return 0;
+}
+
+
+/*!
+ * \brief   sarrayClear()
+ *
+ * \param[in]    sa    string array
+ * \return  0 if OK; 1 on error
+ *
+ * <pre>
+ * Notes:
+ *      (1) Every stored string is freed and its slot is nulled; the
+ *          count is reset to 0.  The ptr array itself is kept.
+ * </pre>
+ */
+l_ok
+sarrayClear(SARRAY  *sa)
+{
+l_int32  k;
+
+    if (sa == NULL)
+        return ERROR_INT("sa not defined", __func__, 1);
+
+    k = 0;
+    while (k < sa->n) {
+        LEPT_FREE(sa->array[k]);
+        sa->array[k] = NULL;
+        k++;
+    }
+    sa->n = 0;
+    return 0;
+}
+
+
+/*----------------------------------------------------------------------*
+ *                               Accessors                              *
+ *----------------------------------------------------------------------*/
+/*!
+ * \brief   sarrayGetCount()
+ *
+ * \param[in]    sa    string array
+ * \return  number of strings in %sa; 0 if there are none or on error
+ */
+l_int32
+sarrayGetCount(SARRAY  *sa)
+{
+l_int32  count;
+
+    if (sa == NULL)
+        return ERROR_INT("sa not defined", __func__, 0);
+
+    count = sa->n;
+    return count;
+}
+
+
+/*!
+ * \brief   sarrayGetArray()
+ *
+ * \param[in]    sa        string array
+ * \param[out]   pnalloc   [optional] number of allocated string ptrs
+ * \param[out]   pn        [optional] number of strings stored
+ * \return  ptr to string array, or NULL on error
+ *
+ * <pre>
+ * Notes:
+ *      (1) Caution: this returns the internal ptr array of %sa,
+ *          not a copy, so the caller must not destroy it!
+ * </pre>
+ */
+char **
+sarrayGetArray(SARRAY   *sa,
+               l_int32  *pnalloc,
+               l_int32  *pn)
+{
+    if (sa == NULL)
+        return (char **)ERROR_PTR("sa not defined", __func__, NULL);
+
+    if (pnalloc != NULL)
+        *pnalloc = sa->nalloc;
+    if (pn != NULL)
+        *pn = sa->n;
+
+    return sa->array;
+}
+
+
+/*!
+ * \brief   sarrayGetString()
+ *
+ * \param[in]    sa         string array
+ * \param[in]    index      to the index-th string
+ * \param[in]    copyflag   L_NOCOPY or L_COPY
+ * \return  string, or NULL on error
+ *
+ * <pre>
+ * Notes:
+ *      (1) See usage comments at the top of this file.
+ *      (2) L_NOCOPY returns the ptr stored in the sarray;
+ *          L_COPY returns a new copy of that string.
+ * </pre>
+ */
+char *
+sarrayGetString(SARRAY  *sa,
+                l_int32  index,
+                l_int32  copyflag)
+{
+char  *stored;
+
+    if (sa == NULL)
+        return (char *)ERROR_PTR("sa not defined", __func__, NULL);
+    if (index < 0 || index >= sa->n)
+        return (char *)ERROR_PTR("index not valid", __func__, NULL);
+    if (copyflag != L_NOCOPY && copyflag != L_COPY)
+        return (char *)ERROR_PTR("invalid copyflag", __func__, NULL);
+
+    stored = sa->array[index];
+    if (copyflag == L_COPY)
+        return stringNew(stored);
+    return stored;  /* L_NOCOPY */
+}
+
+
+/*----------------------------------------------------------------------*
+ *                      Conversion to string                           *
+ *----------------------------------------------------------------------*/
+/*!
+ * \brief   sarrayToString()
+ *
+ * \param[in]    sa          string array
+ * \param[in]    addnlflag   flag: 0 adds nothing to each substring
+ *                                 1 adds '\n' to each substring
+ *                                 2 adds ' ' to each substring
+ *                                 3 adds ',' to each substring
+ * \return  dest string, or NULL on error
+ *
+ * <pre>
+ * Notes:
+ *      (1) Concatenates all the strings in the sarray, preserving
+ *          all white space.
+ *      (2) If addnlflag != 0, adds '\n', ' ' or ',' after each substring.
+ *      (3) This is sarrayToStringRange() applied to the whole sarray.
+ *      (4) This function was NOT implemented as:
+ *            for (i = 0; i < n; i++)
+ *                strcat(dest, sarrayGetString(sa, i, L_NOCOPY));
+ *          Do you see why?
+ * </pre>
+ */
+char *
+sarrayToString(SARRAY  *sa,
+               l_int32  addnlflag)
+{
+char  *joined;
+
+    if (sa == NULL)
+        return (char *)ERROR_PTR("sa not defined", __func__, NULL);
+
+        /* first = 0 and nstrings = 0 select every string */
+    joined = sarrayToStringRange(sa, 0, 0, addnlflag);
+    return joined;
+}
+
+
+/*!
+ * \brief   sarrayToStringRange()
+ *
+ * \param[in]   sa          string array
+ * \param[in]   first       index of first string to use; starts with 0
+ * \param[in]   nstrings    number of strings to append into the result; use
+ *                          0 to append to the end of the sarray
+ * \param[in]   addnlflag   flag: 0 adds nothing to each substring
+ *                                1 adds '\n' to each substring
+ *                                2 adds ' ' to each substring
+ *                                3 adds ',' to each substring
+ * \return  dest string, or NULL on error
+ *
+ * <pre>
+ * Notes:
+ *      (1) Concatenates the specified strings in the sarray, preserving
+ *          all white space.
+ *      (2) If addnlflag != 0, adds '\n', ' ' or ',' after each substring.
+ *      (3) If the sarray is empty and %first is 0, this returns a
+ *          string with just the character corresponding to %addnlflag
+ *          (an empty string for 0).
+ *      (4) If %nstrings is larger than the number of strings available
+ *          from %first, it is truncated to that number.
+ *      (5) Byte counts are kept in size_t, so that the total length
+ *          of a very large set of strings cannot overflow a signed
+ *          32-bit accumulator.
+ * </pre>
+ */
+char *
+sarrayToStringRange(SARRAY  *sa,
+                    l_int32  first,
+                    l_int32  nstrings,
+                    l_int32  addnlflag)
+{
+char    *dest, *src, *str;
+char     sepchar;
+l_int32  n, i, last;
+size_t   size, index, len;
+
+    if (!sa)
+        return (char *)ERROR_PTR("sa not defined", __func__, NULL);
+    if (addnlflag != 0 && addnlflag != 1 && addnlflag != 2 && addnlflag != 3)
+        return (char *)ERROR_PTR("invalid addnlflag", __func__, NULL);
+
+    n = sarrayGetCount(sa);
+
+        /* Empty sa; return char corresponding to addnlflag only */
+    if (n == 0) {
+        if (first == 0) {
+            if (addnlflag == 0)
+                return stringNew("");
+            if (addnlflag == 1)
+                return stringNew("\n");
+            if (addnlflag == 2)
+                return stringNew(" ");
+            else  /* addnlflag == 3) */
+                return stringNew(",");
+        } else {
+            return (char *)ERROR_PTR("first not valid", __func__, NULL);
+        }
+    }
+
+        /* Determine the range of string indices to be used */
+    if (first < 0 || first >= n)
+        return (char *)ERROR_PTR("first not valid", __func__, NULL);
+    if (nstrings == 0 || (nstrings > n - first))
+        nstrings = n - first;  /* no overflow */
+    last = first + nstrings - 1;
+
+        /* Determine the size of the output string.  Two extra
+         * bytes are reserved per substring, which is more than
+         * enough for the single optional separator. */
+    size = 0;
+    for (i = first; i <= last; i++) {
+        if ((str = sarrayGetString(sa, i, L_NOCOPY)) == NULL)
+            return (char *)ERROR_PTR("str not found", __func__, NULL);
+        size += strlen(str) + 2;
+    }
+    if ((dest = (char *)LEPT_CALLOC(size + 1, sizeof(char))) == NULL)
+        return (char *)ERROR_PTR("dest not made", __func__, NULL);
+
+        /* Separator to put after each substring; '\0' means none */
+    if (addnlflag == 1)
+        sepchar = '\n';
+    else if (addnlflag == 2)
+        sepchar = ' ';
+    else if (addnlflag == 3)
+        sepchar = ',';
+    else
+        sepchar = '\0';
+
+        /* Construct the output.  Every string in the range was
+         * found to be non-null in the sizing pass above; dest was
+         * zero-filled, so it is already null terminated. */
+    index = 0;
+    for (i = first; i <= last; i++) {
+        src = sarrayGetString(sa, i, L_NOCOPY);
+        len = strlen(src);
+        memcpy(dest + index, src, len);
+        index += len;
+        if (sepchar != '\0') {
+            dest[index] = sepchar;
+            index++;
+        }
+    }
+
+    return dest;
+}
+
+
+/*----------------------------------------------------------------------*
+ *           Concatenate strings uniformly within the sarray            *
+ *----------------------------------------------------------------------*/
+/*!
+ * \brief   sarrayConcatUniformly()
+ *
+ * \param[in]    sa          string array
+ * \param[in]    n           number of strings in output sarray
+ * \param[in]    addnlflag   flag: 0 adds nothing to each substring
+ *                                 1 adds '\n' to each substring
+ *                                 2 adds ' ' to each substring
+ *                                 3 adds ',' to each substring
+ * \return  dest sarray, or NULL on error
+ *
+ * <pre>
+ * Notes:
+ *      (1) Divides %sa into %n essentially equal sets of strings,
+ *          concatenates each set individually, and makes an output
+ *          sarray with the %n concatenations.  %n must not exceed the
+ *          number of strings in %sa.
+ *      (2) If addnlflag != 0, adds '\n', ' ' or ',' after each substring.
+ *      (3) If the output sarray, the bin sizes, or any of the
+ *          concatenations cannot be made, everything built so far is
+ *          released and NULL is returned, rather than returning an
+ *          sarray with fewer than %n strings.
+ * </pre>
+ */
+SARRAY *
+sarrayConcatUniformly(SARRAY  *sa,
+                      l_int32  n,
+                      l_int32  addnlflag)
+{
+l_int32  i, first, ntot, nstr;
+char    *str;
+NUMA    *na;
+SARRAY  *saout;
+
+    if (!sa)
+        return (SARRAY *)ERROR_PTR("sa not defined", __func__, NULL);
+    ntot = sarrayGetCount(sa);
+    if (n < 1)
+        return (SARRAY *)ERROR_PTR("n must be >= 1", __func__, NULL);
+    if (n > ntot) {
+        L_ERROR("n = %d > ntot = %d\n", __func__, n, ntot);
+        return NULL;
+    }
+    if (addnlflag != 0 && addnlflag != 1 && addnlflag != 2 && addnlflag != 3)
+        return (SARRAY *)ERROR_PTR("invalid addnlflag", __func__, NULL);
+
+    if ((saout = sarrayCreate(0)) == NULL)
+        return (SARRAY *)ERROR_PTR("saout not made", __func__, NULL);
+    if ((na = numaGetUniformBinSizes(ntot, n)) == NULL) {
+        sarrayDestroy(&saout);
+        return (SARRAY *)ERROR_PTR("na not made", __func__, NULL);
+    }
+
+    for (i = 0, first = 0; i < n; i++) {
+        numaGetIValue(na, i, &nstr);
+        str = sarrayToStringRange(sa, first, nstr, addnlflag);
+        if (str == NULL) {
+            numaDestroy(&na);
+            sarrayDestroy(&saout);
+            return (SARRAY *)ERROR_PTR("str not made", __func__, NULL);
+        }
+        if (sarrayAddString(saout, str, L_INSERT)) {
+                /* Ownership was not transferred, so free it here */
+            LEPT_FREE(str);
+            numaDestroy(&na);
+            sarrayDestroy(&saout);
+            return (SARRAY *)ERROR_PTR("str not added", __func__, NULL);
+        }
+        first += nstr;
+    }
+    numaDestroy(&na);
+    return saout;
+}
+
+
+/*----------------------------------------------------------------------*
+ *                           Join 2 sarrays                             *
+ *----------------------------------------------------------------------*/
+/*!
+ * \brief   sarrayJoin()
+ *
+ * \param[in]    sa1   to be added to
+ * \param[in]    sa2   append to sa1
+ * \return  0 if OK, 1 on error
+ *
+ * <pre>
+ * Notes:
+ *      (1) A copy of each string in %sa2 is appended, in order,
+ *          to %sa1.  %sa2 is not changed.
+ *      (2) This stops and returns 1 at the first string that
+ *          cannot be added.
+ * </pre>
+ */
+l_ok
+sarrayJoin(SARRAY  *sa1,
+           SARRAY  *sa2)
+{
+char    *borrowed;
+l_int32  count, k;
+
+    if (!sa1)
+        return ERROR_INT("sa1 not defined", __func__, 1);
+    if (!sa2)
+        return ERROR_INT("sa2 not defined", __func__, 1);
+
+    count = sarrayGetCount(sa2);
+    k = 0;
+    while (k < count) {
+            /* Borrow the handle from sa2; sa1 gets its own copy */
+        borrowed = sarrayGetString(sa2, k, L_NOCOPY);
+        if (sarrayAddString(sa1, borrowed, L_COPY) == 1) {
+            L_ERROR("failed to add string at i = %d\n", __func__, k);
+            return 1;
+        }
+        k++;
+    }
+    return 0;
+}
+
+
+/*!
+ * \brief   sarrayAppendRange()
+ *
+ * \param[in]    sa1     to be added to
+ * \param[in]    sa2     append specified range of strings in sa2 to sa1
+ * \param[in]    start   index of first string of sa2 to append
+ * \param[in]    end     index of last string of sa2 to append;
+ *                       -1 to append to end of array
+ * \return  0 if OK, 1 on error
+ *
+ * <pre>
+ * Notes:
+ *      (1) Copies of the strings in sarray2 are added to sarray1.
+ *      (2) The [start ... end] range is truncated if necessary:
+ *          a negative %start is taken as 0, and an %end that is
+ *          negative or beyond the last string is taken as the
+ *          index of the last string.
+ *      (3) Use end == -1 to append to the end of sa2.
+ *      (4) It is an error if %start > %end after truncation; this
+ *          includes the case where %sa2 is empty.
+ *      (5) As with sarrayJoin(), this stops and returns 1 at the
+ *          first string that cannot be added.
+ * </pre>
+ */
+l_ok
+sarrayAppendRange(SARRAY  *sa1,
+                  SARRAY  *sa2,
+                  l_int32  start,
+                  l_int32  end)
+{
+char    *str;
+l_int32  n, i;
+
+    if (!sa1)
+        return ERROR_INT("sa1 not defined", __func__, 1);
+    if (!sa2)
+        return ERROR_INT("sa2 not defined", __func__, 1);
+
+    if (start < 0)
+        start = 0;
+    n = sarrayGetCount(sa2);
+    if (end < 0 || end >= n)
+        end = n - 1;
+    if (start > end)
+        return ERROR_INT("start > end", __func__, 1);
+
+    for (i = start; i <= end; i++) {
+        str = sarrayGetString(sa2, i, L_NOCOPY);
+        if (sarrayAddString(sa1, str, L_COPY)) {
+            L_ERROR("failed to add string at i = %d\n", __func__, i);
+            return 1;
+        }
+    }
+
+    return 0;
+}
+
+
+/*----------------------------------------------------------------------*
+ *          Pad an sarray to be the same size as another sarray         *
+ *----------------------------------------------------------------------*/
+/*!
+ * \brief   sarrayPadToSameSize()
+ *
+ * \param[in]    sa1, sa2
+ * \param[in]    padstring
+ * \return  0 if OK, 1 on error
+ *
+ * <pre>
+ * Notes:
+ *      (1) If two sarrays have different size, this adds enough
+ *          instances of %padstring to the smaller so that they are
+ *          the same size.  It is useful when two or more sarrays
+ *          are being sequenced in parallel, and it is necessary to
+ *          find a valid string at each index.
+ *      (2) If the two sarrays already have the same size, nothing
+ *          is done and %padstring is not examined.
+ *      (3) Returns 1 if padding is required but %padstring is not
+ *          defined, or if a copy of it cannot be added.
+ * </pre>
+ */
+l_ok
+sarrayPadToSameSize(SARRAY      *sa1,
+                    SARRAY      *sa2,
+                    const char  *padstring)
+{
+l_int32  i, n1, n2, npad;
+SARRAY  *shorter;
+
+    if (!sa1 || !sa2)
+        return ERROR_INT("both sa1 and sa2 not defined", __func__, 1);
+
+    n1 = sarrayGetCount(sa1);
+    n2 = sarrayGetCount(sa2);
+    if (n1 == n2)
+        return 0;
+    if (!padstring)
+        return ERROR_INT("padstring not defined", __func__, 1);
+
+        /* Pad whichever sarray holds fewer strings */
+    if (n1 < n2) {
+        shorter = sa1;
+        npad = n2 - n1;
+    } else {
+        shorter = sa2;
+        npad = n1 - n2;
+    }
+    for (i = 0; i < npad; i++) {
+        if (sarrayAddString(shorter, padstring, L_COPY))
+            return ERROR_INT("padstring not added", __func__, 1);
+    }
+
+    return 0;
+}
+
+
+/*----------------------------------------------------------------------*
+ *                   Convert word sarray to line sarray                 *
+ *----------------------------------------------------------------------*/
+/*!
+ * \brief   sarrayConvertWordsToLines()
+ *
+ * \param[in]    sa  sa      of individual words
+ * \param[in]    linesize    max num of chars in each line
+ * \return  saout sa of formatted lines, or NULL on error
+ *
+ * <pre>
+ * Notes:
+ *      (1) This is useful for re-typesetting text to a specific maximum
+ *          line length.  The individual words in the input sarray
+ *          are concatenated into textlines.  An input word string of zero
+ *          length is taken to be a paragraph separator.  Each time
+ *          such a string is found, the current line is ended and
+ *          a new line is also produced that contains just the
+ *          string of zero length "".  When the output sarray
+ *          of lines is eventually converted to a string with newlines
+ *          typically appended to each line string, the empty
+ *          strings are just converted to newlines, producing the visible
+ *          paragraph separation.
+ *      (2) What happens when a word is larger than linesize?
+ *          We write it out as a single line anyway!  Words preceding
+ *          or following this long word are placed on lines preceding
+ *          or following the line with the long word.  Why this choice?
+ *          Long "words" found in text documents are typically URLs, and
+ *          it's often desirable not to put newlines in the middle of a URL.
+ *          The text display program e.g., text editor will typically
+ *          wrap the long "word" to fit in the window.
+ *      (3) Each word is accounted for as its length plus one character
+ *          when deciding whether it still fits on the current line.
+ *      (4) The temporary sarray that accumulates the words of the
+ *          current line is always released before returning, including
+ *          the case where the input ends right after an over-long word
+ *          and the accumulator is still empty.
+ * </pre>
+ */
+SARRAY *
+sarrayConvertWordsToLines(SARRAY  *sa,
+                          l_int32  linesize)
+{
+char    *wd, *strl;
+char     emptystring[] = "";
+l_int32  n, i, len, totlen;
+SARRAY  *sal, *saout;
+
+    if (!sa)
+        return (SARRAY *)ERROR_PTR("sa not defined", __func__, NULL);
+
+    if ((saout = sarrayCreate(0)) == NULL)
+        return (SARRAY *)ERROR_PTR("saout not made", __func__, NULL);
+    n = sarrayGetCount(sa);
+    totlen = 0;   /* chars accumulated on the current line */
+    sal = NULL;   /* words of the current line; created lazily */
+    for (i = 0; i < n; i++) {
+        if (!sal)
+            sal = sarrayCreate(0);
+        wd = sarrayGetString(sa, i, L_NOCOPY);
+        len = strlen(wd);
+        if (len == 0) {  /* end of paragraph: end line & insert blank line */
+            if (totlen > 0) {
+                strl = sarrayToString(sal, 2);
+                sarrayAddString(saout, strl, L_INSERT);
+            }
+            sarrayAddString(saout, emptystring, L_COPY);
+            sarrayDestroy(&sal);
+            totlen = 0;
+        } else if (totlen == 0 && len + 1 > linesize) {  /* long word! */
+                /* The (empty) accumulator sal stays allocated here and
+                 * is either reused by the next word or freed below. */
+            sarrayAddString(saout, wd, L_COPY);  /* copy to one line */
+        } else if (totlen + len + 1 > linesize) {  /* end line & start new */
+            strl = sarrayToString(sal, 2);
+            sarrayAddString(saout, strl, L_INSERT);
+            sarrayDestroy(&sal);
+            sal = sarrayCreate(0);
+            sarrayAddString(sal, wd, L_COPY);
+            totlen = len + 1;
+        } else {  /* add to current line */
+            sarrayAddString(sal, wd, L_COPY);
+            totlen += len + 1;
+        }
+    }
+    if (totlen > 0) {   /* didn't end with blank line; output last line */
+        strl = sarrayToString(sal, 2);
+        sarrayAddString(saout, strl, L_INSERT);
+    }
+
+        /* Release the accumulator unconditionally.  Previously it was
+         * only destroyed when totlen > 0, which leaked it whenever the
+         * last word processed was an over-long word on an empty line. */
+    if (sal)
+        sarrayDestroy(&sal);
+
+    return saout;
+}
+
+
+/*----------------------------------------------------------------------*
+ *                    Split string on separator list                    *
+ *----------------------------------------------------------------------*/
+/*!
+ * \brief   sarraySplitString()
+ *
+ * \param[in]   sa            receives the pieces; typically empty on entry
+ * \param[in]   str           string to be split; it is not modified
+ * \param[in]   separators    set of characters that delimit the pieces
+ * \return   0 if OK, 1 on error.
+ *
+ * <pre>
+ * Notes:
+ *      (1) Tokenizing is done by strtokSafe() on a private copy of %str;
+ *          see the notes for that function in utils.c.
+ *      (2) Every token returned by strtokSafe() is handed to %sa with
+ *          L_INSERT; the private copy of %str is freed before returning.
+ * </pre>
+ */
+l_int32
+sarraySplitString(SARRAY      *sa,
+                  const char  *str,
+                  const char  *separators)
+{
+char  *copy, *token, *state;
+
+    if (!sa)
+        return ERROR_INT("sa not defined", __func__, 1);
+    if (!str)
+        return ERROR_INT("str not defined", __func__, 1);
+    if (!separators)
+        return ERROR_INT("separators not defined", __func__, 1);
+
+        /* Work on a duplicate so the caller's const string is untouched */
+    copy = stringNew(str);
+    state = NULL;
+    token = strtokSafe(copy, separators, &state);
+    do {
+        if (token)
+            sarrayAddString(sa, token, L_INSERT);
+        token = strtokSafe(NULL, separators, &state);
+    } while (token);
+    LEPT_FREE(copy);
+
+    return 0;
+}
+
+
+/*----------------------------------------------------------------------*
+ *                              Filter sarray                           *
+ *----------------------------------------------------------------------*/
+/*!
+ * \brief   sarraySelectBySubstring()
+ *
+ * \param[in]    sain     input sarray
+ * \param[in]    substr   [optional] substring to look for; can be NULL
+ * \return  saout output sarray, filtered with substring or NULL on error
+ *
+ * <pre>
+ * Notes:
+ *      (1) The output holds copies of exactly those strings of %sain
+ *          in which arrayFindSequence() reports %substr as found.  The
+ *          match may occur anywhere in the string, which is why a
+ *          prefix comparison such as strncmp() is not sufficient.
+ *      (2) If %substr is NULL, or if %sain is empty, the result is
+ *          whatever sarrayCopy() returns for %sain.
+ * </pre>
+ */
+SARRAY *
+sarraySelectBySubstring(SARRAY      *sain,
+                        const char  *substr)
+{
+char    *candidate;
+size_t   sublen;
+l_int32  count, idx, offset, found;
+SARRAY  *saout;
+
+    if (!sain)
+        return (SARRAY *)ERROR_PTR("sain not defined", __func__, NULL);
+
+    count = sarrayGetCount(sain);
+    if (count == 0 || !substr)
+        return sarrayCopy(sain);
+
+        /* The pattern length is the same for every candidate */
+    sublen = strlen(substr);
+    saout = sarrayCreate(count);
+    for (idx = 0; idx < count; idx++) {
+        candidate = sarrayGetString(sain, idx, L_NOCOPY);
+        arrayFindSequence((l_uint8 *)candidate, strlen(candidate),
+                          (l_uint8 *)substr, sublen, &offset, &found);
+        if (!found)
+            continue;
+        sarrayAddString(saout, candidate, L_COPY);
+    }
+
+    return saout;
+}
+
+
+/*!
+ * \brief   sarraySelectRange()
+ *
+ * \param[in]    sain    input sarray
+ * \param[in]    first   index of first string to be selected
+ * \param[in]    last    index of last string to be selected;
+ *                       use 0 to go to the end of the sarray
+ * \return  saout   output sarray, or NULL on error
+ *
+ * <pre>
+ * Notes:
+ *      (1) This makes %saout consisting of copies of all strings in %sain
+ *          in the index set [first ... last].  Use %last == 0 to get all
+ *          strings from %first to the last string in the sarray.
+ *      (2) A negative %first is treated as 0.  Any %last <= 0 selects
+ *          through the end, and a %last beyond the end is clipped to
+ *          n - 1 with a warning.
+ *      (3) It is an error if, after this clipping, %first > %last.
+ *          In particular this is the outcome for an empty %sain.
+ * </pre>
+ */
+SARRAY *
+sarraySelectRange(SARRAY  *sain,
+                  l_int32  first,
+                  l_int32  last)
+{
+char    *str;
+l_int32  n, i;
+SARRAY  *saout;
+
+    if (!sain)
+        return (SARRAY *)ERROR_PTR("sain not defined", __func__, NULL);
+    if (first < 0) first = 0;
+    n = sarrayGetCount(sain);
+    if (last <= 0) last = n - 1;
+    if (last >= n) {
+        L_WARNING("last > n - 1; setting to n - 1\n", __func__);
+        last = n - 1;
+    }
+        /* The message states the requirement that was violated; it
+         * previously read ">=", which is the opposite of the check. */
+    if (first > last)
+        return (SARRAY *)ERROR_PTR("first must be <= last", __func__, NULL);
+
+    saout = sarrayCreate(0);
+    for (i = first; i <= last; i++) {
+        str = sarrayGetString(sain, i, L_COPY);
+        sarrayAddString(saout, str, L_INSERT);
+    }
+
+    return saout;
+}
+
+
+/*!
+ * \brief   sarrayParseRange()
+ *
+ * \param[in]    sa             input sarray
+ * \param[in]    start          index at which the search begins
+ * \param[out]   pactualstart   index of the first string in the range;
+ *                              may be > 'start'
+ * \param[out]   pend           index of the last string in the range
+ * \param[out]   pnewstart      index at which the next range begins
+ * \param[in]    substr         marker substring
+ * \param[in]    loc            byte offset within the string at which the
+ *                              marker must occur; use -1 if the location
+ *                              does not matter.
+ * \return  0 if valid range found; 1 otherwise
+ *
+ * <pre>
+ * Notes:
+ *      (1) A string is "marked" if it contains 'substr'; when 'loc' is
+ *          non-negative the match must additionally be reported at
+ *          offset 'loc'.  Starting at 'start', this locates the next
+ *          run of consecutive strings in SA that are NOT marked.
+ *      (2) Always check the return value to verify that a valid range
+ *          was found.
+ *      (3) All three outputs are first set to the size of sa, and they
+ *          keep that value whenever no valid range is found.
+ *      (4) If the range found is the last one, newstart is left at n.
+ *          In use, this should be tested before calling the function
+ *          again.
+ *      (5) Usage example.  To find all the valid ranges in a file
+ *          where the invalid lines begin with two dashes, copy each
+ *          line in the file to a string in an sarray, and do:
+ *             start = 0;
+ *             while (!sarrayParseRange(sa, start, &actstart, &end, &start,
+ *                    "--", 0))
+ *                 lept_stderr("start = %d, end = %d\n", actstart, end);
+ * </pre>
+ */
+l_int32
+sarrayParseRange(SARRAY      *sa,
+                 l_int32      start,
+                 l_int32     *pactualstart,
+                 l_int32     *pend,
+                 l_int32     *pnewstart,
+                 const char  *substr,
+                 l_int32      loc)
+{
+char    *str;
+size_t   sublen;
+l_int32  n, i, offset, found;
+
+    if (!sa)
+        return ERROR_INT("sa not defined", __func__, 1);
+    if (!pactualstart || !pend || !pnewstart)
+        return ERROR_INT("not all range addresses defined", __func__, 1);
+    n = sarrayGetCount(sa);
+    *pnewstart = n;
+    *pend = n;
+    *pactualstart = n;
+    if (!substr)
+        return ERROR_INT("substr not defined", __func__, 1);
+    if (start < 0 || start >= n)
+        return 1;
+    sublen = strlen(substr);
+
+        /* Phase 1: step over marked strings until an unmarked one
+         * is reached; that string opens the range. */
+    i = start;
+    while (i < n) {
+        str = sarrayGetString(sa, i, L_NOCOPY);
+        arrayFindSequence((l_uint8 *)str, strlen(str), (l_uint8 *)substr,
+                          sublen, &offset, &found);
+        if (!found || (loc >= 0 && offset != loc))
+            break;
+        i++;
+    }
+    if (i == n)  /* couldn't get started */
+        return 1;
+    *pactualstart = i;
+
+        /* Phase 2: extend the range until the next marked string */
+    i++;
+    while (i < n) {
+        str = sarrayGetString(sa, i, L_NOCOPY);
+        arrayFindSequence((l_uint8 *)str, strlen(str), (l_uint8 *)substr,
+                          sublen, &offset, &found);
+        if (found && (loc < 0 || offset == loc))
+            break;
+        i++;
+    }
+    *pend = i - 1;
+    if (i == n)  /* no further range */
+        return 0;
+
+        /* Phase 3: step over the marked strings that follow *pend.
+         * The first unmarked string, if any, starts the next range. */
+    while (i < n) {
+        str = sarrayGetString(sa, i, L_NOCOPY);
+        arrayFindSequence((l_uint8 *)str, strlen(str), (l_uint8 *)substr,
+                          sublen, &offset, &found);
+        if (!found || (loc >= 0 && offset != loc))
+            break;
+        i++;
+    }
+    if (i < n)
+        *pnewstart = i;
+
+    return 0;
+}
+
+
+/*----------------------------------------------------------------------*
+ *                           Serialize for I/O                          *
+ *----------------------------------------------------------------------*/
+/*!
+ * \brief   sarrayRead()
+ *
+ * \param[in]    filename    path of the serialized sarray file
+ * \return  sarray, or NULL on error
+ *
+ * <pre>
+ * Notes:
+ *      (1) Opens %filename with fopenReadStream(), parses it with
+ *          sarrayReadStream(), and closes the stream again whether or
+ *          not parsing succeeded.
+ * </pre>
+ */
+SARRAY *
+sarrayRead(const char  *filename)
+{
+FILE    *stream;
+SARRAY  *result;
+
+    if (!filename)
+        return (SARRAY *)ERROR_PTR("filename not defined", __func__, NULL);
+
+    stream = fopenReadStream(filename);
+    if (stream == NULL)
+        return (SARRAY *)ERROR_PTR_1("stream not opened",
+                                     filename, __func__, NULL);
+
+    result = sarrayReadStream(stream);
+    fclose(stream);
+    if (result)
+        return result;
+    return (SARRAY *)ERROR_PTR_1("sa not read", filename, __func__, NULL);
+}
+
+
+/*!
+ * \brief   sarrayReadStream()
+ *
+ * \param[in]    fp    file stream
+ * \return  sarray, or NULL on error
+ *
+ * <pre>
+ * Notes:
+ *      (1) We store the size of each string along with the string.
+ *          The number of strings may not exceed MaxPtrArraySize.
+ *          The size of any string must be in [0 ... 2^30] bytes; a
+ *          negative size in the file is rejected as a format error.
+ *      (2) This allows a string to have embedded newlines.  By reading
+ *          the entire string, as determined by its size, we are
+ *          not affected by any number of embedded newlines.
+ *      (3) It is OK for the sarray to be empty.
+ *      (4) Each record is read as size + 3 bytes: two leading spaces,
+ *          the string itself, and the trailing newline written by
+ *          sarrayWriteStream().
+ *      (5) On any error while reading the records, the partially built
+ *          sarray is destroyed and NULL is returned.
+ * </pre>
+ */
+SARRAY *
+sarrayReadStream(FILE  *fp)
+{
+char    *stringbuf;
+l_int32  i, n, size, index, bufsize, version, ignore, success;
+SARRAY  *sa;
+
+    if (!fp)
+        return (SARRAY *)ERROR_PTR("stream not defined", __func__, NULL);
+
+    if (fscanf(fp, "\nSarray Version %d\n", &version) != 1)
+        return (SARRAY *)ERROR_PTR("not an sarray file", __func__, NULL);
+    if (version != SARRAY_VERSION_NUMBER)
+        return (SARRAY *)ERROR_PTR("invalid sarray version", __func__, NULL);
+    if (fscanf(fp, "Number of strings = %d\n", &n) != 1)
+        return (SARRAY *)ERROR_PTR("error on # strings", __func__, NULL);
+    if (n < 0)
+        return (SARRAY *)ERROR_PTR("num string ptrs <= 0", __func__, NULL);
+    if (n > (l_int32)MaxPtrArraySize)
+        return (SARRAY *)ERROR_PTR("too many string ptrs", __func__, NULL);
+    if (n == 0) L_INFO("the sarray is empty\n", __func__);
+
+    success = TRUE;
+    if ((sa = sarrayCreate(n)) == NULL)
+        return (SARRAY *)ERROR_PTR("sa not made", __func__, NULL);
+    bufsize = 512 + 1;
+    stringbuf = (char *)LEPT_CALLOC(bufsize, sizeof(char));
+    if (!stringbuf) {
+        sarrayDestroy(&sa);
+        return (SARRAY *)ERROR_PTR("stringbuf not made", __func__, NULL);
+    }
+
+    for (i = 0; i < n; i++) {
+            /* Get the size of the stored string.  The size comes from
+             * the file and is untrusted: a negative value would turn
+             * into a huge unsigned byte count for fread(), or into a
+             * write in front of the buffer, so it must be rejected. */
+        if ((fscanf(fp, "%d[%d]:", &index, &size) != 2) ||
+            (size < 0) || (size > (1 << 30))) {
+            success = FALSE;
+            L_ERROR("error on string size\n", __func__);
+            goto cleanup;
+        }
+            /* Expand the string buffer if necessary */
+        if (size > bufsize - 5) {
+            LEPT_FREE(stringbuf);
+            bufsize = (l_int32)(1.5 * size);
+            stringbuf = (char *)LEPT_CALLOC(bufsize, sizeof(char));
+            if (!stringbuf) {
+                success = FALSE;
+                L_ERROR("stringbuf not made\n", __func__);
+                goto cleanup;
+            }
+        }
+            /* Read the stored string, plus leading spaces and trailing \n */
+        if (fread(stringbuf, 1, (size_t)size + 3, fp) != (size_t)size + 3) {
+            success = FALSE;
+            L_ERROR("error reading string\n", __func__);
+            goto cleanup;
+        }
+            /* Remove the \n that was added by sarrayWriteStream() */
+        stringbuf[size + 2] = '\0';
+            /* Copy it in, skipping the 2 leading spaces */
+        sarrayAddString(sa, stringbuf + 2, L_COPY);
+    }
+        /* Consume trailing whitespace; the result is stored only so
+         * that the return value of fscanf() is not silently dropped. */
+    ignore = fscanf(fp, "\n");
+
+cleanup:
+    LEPT_FREE(stringbuf);
+    if (!success) sarrayDestroy(&sa);
+    return sa;
+}
+
+
+/*!
+ * \brief   sarrayReadMem()
+ *
+ * \param[in]    data    ascii serialization of an sarray
+ * \param[in]    size    number of bytes in data; strlen can supply it
+ * \return  sarray, or NULL on error
+ *
+ * <pre>
+ * Notes:
+ *      (1) The buffer is wrapped in a stream by fopenReadFromMemory()
+ *          and then parsed by sarrayReadStream().
+ * </pre>
+ */
+SARRAY *
+sarrayReadMem(const l_uint8  *data,
+              size_t          size)
+{
+FILE    *memstream;
+SARRAY  *result;
+
+    if (!data)
+        return (SARRAY *)ERROR_PTR("data not defined", __func__, NULL);
+
+    memstream = fopenReadFromMemory(data, size);
+    if (memstream == NULL)
+        return (SARRAY *)ERROR_PTR("stream not opened", __func__, NULL);
+
+    result = sarrayReadStream(memstream);
+    fclose(memstream);
+    if (result == NULL)
+        L_ERROR("sarray not read\n", __func__);
+    return result;
+}
+
+
+/*!
+ * \brief   sarrayWrite()
+ *
+ * \param[in]    filename    path of the file to be written
+ * \param[in]    sa          string array
+ * \return  0 if OK; 1 on error
+ *
+ * <pre>
+ * Notes:
+ *      (1) The file is opened in "w" mode through fopenWriteStream(),
+ *          filled by sarrayWriteStream(), and closed before the
+ *          result of the write is evaluated.
+ * </pre>
+ */
+l_ok
+sarrayWrite(const char  *filename,
+            SARRAY      *sa)
+{
+l_int32  failed;
+FILE    *stream;
+
+    if (!filename)
+        return ERROR_INT("filename not defined", __func__, 1);
+    if (!sa)
+        return ERROR_INT("sa not defined", __func__, 1);
+
+    stream = fopenWriteStream(filename, "w");
+    if (stream == NULL)
+        return ERROR_INT_1("stream not opened", filename, __func__, 1);
+
+    failed = sarrayWriteStream(stream, sa);
+    fclose(stream);
+    if (!failed)
+        return 0;
+    return ERROR_INT_1("sa not written to stream", filename, __func__, 1);
+}
+
+
+/*!
+ * \brief   sarrayWriteStream()
+ *
+ * \param[in]    fp    file stream; use NULL to write to stderr
+ * \param[in]    sa    string array
+ * \return  0 if OK; 1 on error
+ *
+ * <pre>
+ * Notes:
+ *      (1) This appends a '\n' to each string, which is stripped
+ *          off by sarrayReadStream().
+ *      (2) With %fp == NULL the work is delegated to sarrayWriteStderr(),
+ *          which also validates %sa.
+ *      (3) Each string is written as "  index[length]:  string\n",
+ *          preceded by a version line and a count line.
+ * </pre>
+ */
+l_ok
+sarrayWriteStream(FILE    *fp,
+                  SARRAY  *sa)
+{
+l_int32  i, n, len;
+
+        /* A NULL stream selects stderr, as documented.  The two checks
+         * were previously swapped, which made a NULL stream an error
+         * and forwarded a NULL sa to sarrayWriteStderr(). */
+    if (!fp)
+        return sarrayWriteStderr(sa);
+    if (!sa)
+        return ERROR_INT("sa not defined", __func__, 1);
+
+    n = sarrayGetCount(sa);
+    fprintf(fp, "\nSarray Version %d\n", SARRAY_VERSION_NUMBER);
+    fprintf(fp, "Number of strings = %d\n", n);
+    for (i = 0; i < n; i++) {
+        len = (l_int32)strlen(sa->array[i]);
+        fprintf(fp, "  %d[%d]:  %s\n", i, len, sa->array[i]);
+    }
+    fprintf(fp, "\n");
+
+    return 0;
+}
+
+
+/*!
+ * \brief   sarrayWriteStderr()
+ *
+ * \param[in]    sa    string array
+ * \return  0 if OK; 1 on error
+ *
+ * <pre>
+ * Notes:
+ *      (1) Emits, through lept_stderr(), a version line, a count line,
+ *          and one "  index[length]:  string" line per string.
+ * </pre>
+ */
+l_ok
+sarrayWriteStderr(SARRAY  *sa)
+{
+char    *text;
+l_int32  idx, count, nchars;
+
+    if (!sa)
+        return ERROR_INT("sa not defined", __func__, 1);
+
+    count = sarrayGetCount(sa);
+    lept_stderr("\nSarray Version %d\n", SARRAY_VERSION_NUMBER);
+    lept_stderr("Number of strings = %d\n", count);
+    idx = 0;
+    while (idx < count) {
+        text = sa->array[idx];
+        nchars = strlen(text);
+        lept_stderr("  %d[%d]:  %s\n", idx, nchars, text);
+        idx++;
+    }
+    lept_stderr("\n");
+    return 0;
+}
+
+
+/*!
+ * \brief   sarrayWriteMem()
+ *
+ * \param[out]   pdata    receives the ascii serialization of the sarray
+ * \param[out]   psize    receives the size of the returned data
+ * \param[in]    sa
+ * \return  0 if OK, 1 on error
+ *
+ * <pre>
+ * Notes:
+ *      (1) Serializes a sarray in memory and puts the result in a buffer.
+ *      (2) Whichever of the two output locations is supplied is cleared
+ *          before any argument is rejected.
+ *      (3) When HAVE_FMEMOPEN is set the data is produced through
+ *          open_memstream(); a '\0' is written after the serialization
+ *          and then excluded again from the reported size.  Otherwise
+ *          the serialization goes through a temporary file that is
+ *          read back with l_binaryReadStream().
+ * </pre>
+ */
+l_ok
+sarrayWriteMem(l_uint8  **pdata,
+               size_t    *psize,
+               SARRAY    *sa)
+{
+l_int32  ret;
+FILE    *fp;
+
+    if (pdata != NULL)
+        *pdata = NULL;
+    if (psize != NULL)
+        *psize = 0;
+    if (pdata == NULL)
+        return ERROR_INT("&data not defined", __func__, 1);
+    if (psize == NULL)
+        return ERROR_INT("&size not defined", __func__, 1);
+    if (sa == NULL)
+        return ERROR_INT("sa not defined", __func__, 1);
+
+#if HAVE_FMEMOPEN
+    fp = open_memstream((char **)pdata, psize);
+    if (fp == NULL)
+        return ERROR_INT("stream not opened", __func__, 1);
+    ret = sarrayWriteStream(fp, sa);
+    fputc('\0', fp);
+    fclose(fp);
+        /* Do not count the extra terminating byte written above */
+    if (*psize > 0)
+        *psize -= 1;
+#else
+    L_INFO("no fmemopen API --> work-around: write to temp file\n", __func__);
+  #ifdef _WIN32
+    fp = fopenWriteWinTempfile();
+  #else
+    fp = tmpfile();
+  #endif  /* _WIN32 */
+    if (fp == NULL)
+        return ERROR_INT("tmpfile stream not opened", __func__, 1);
+    ret = sarrayWriteStream(fp, sa);
+    rewind(fp);
+    *pdata = l_binaryReadStream(fp, psize);
+    fclose(fp);
+#endif  /* HAVE_FMEMOPEN */
+    return ret;
+}
+
+
+/*!
+ * \brief   sarrayAppend()
+ *
+ * \param[in]    filename    path of the file to append to
+ * \param[in]    sa
+ * \return  0 if OK; 1 on error
+ *
+ * <pre>
+ * Notes:
+ *      (1) The file is opened in "a" mode through fopenWriteStream()
+ *          and one complete serialization of %sa, as produced by
+ *          sarrayWriteStream(), is written to it.
+ * </pre>
+ */
+l_ok
+sarrayAppend(const char  *filename,
+             SARRAY      *sa)
+{
+l_int32  failed;
+FILE    *stream;
+
+    if (!filename)
+        return ERROR_INT("filename not defined", __func__, 1);
+    if (!sa)
+        return ERROR_INT("sa not defined", __func__, 1);
+
+    stream = fopenWriteStream(filename, "a");
+    if (stream == NULL)
+        return ERROR_INT_1("stream not opened", filename, __func__, 1);
+
+        /* The stream is closed on both the success and the error path */
+    failed = sarrayWriteStream(stream, sa);
+    fclose(stream);
+    if (failed)
+        return ERROR_INT_1("sa not appended to stream", filename, __func__, 1);
+    return 0;
+}
+
+
+/*---------------------------------------------------------------------*
+ *                           Directory filenames                       *
+ *---------------------------------------------------------------------*/
+/*!
+ * \brief   getNumberedPathnamesInDirectory()
+ *
+ * \param[in]    dirname   directory name
+ * \param[in]    substr    [optional] substring filter on filenames; can be NULL
+ * \param[in]    numpre    number of characters in name before number
+ * \param[in]    numpost   number of characters in name after the number,
+ *                         up to a dot before an extension
+ * \param[in]    maxnum    only consider page numbers up to this value
+ * \return  sarray of numbered pathnames, or NULL on error
+ *
+ * <pre>
+ * Notes:
+ *      (1) Returns the full pathnames of the numbered filenames in
+ *          the directory.  The number in the filename is the index
+ *          into the sarray.  For indices for which there are no filenames,
+ *          an empty string ("") is placed into the sarray.
+ *          This makes reading numbered files very simple.  For example,
+ *          the image whose filename includes number N can be retrieved using
+ *               pixReadIndexed(sa, N);
+ *      (2) If %substr is not NULL, only filenames that contain
+ *          the substring can be included.  If %substr is NULL,
+ *          all matching filenames are used.
+ *      (3) If no numbered files are found, it returns an empty sarray,
+ *          with no initialized strings.
+ *      (4) It is assumed that the page number is contained within
+ *          the basename (the filename without directory or extension).
+ *          %numpre is the number of characters in the basename
+ *          preceding the actual page number; %numpost is the number
+ *          following the page number, up to either the end of the
+ *          basename or a ".", whichever comes first.
+ *      (5) This is useful when all filenames contain numbers that are
+ *          not necessarily consecutive.  0-padding is not required.
+ *      (6) To use a O(n) matching algorithm, the largest page number
+ *          is found and two internal arrays of this size are created.
+ *          This maximum is constrained not to exceed %maxnum,
+ *          to make sure that an unrealistically large number is not
+ *          accidentally used to determine the array sizes.
+ *      (7) The work is split between getSortedPathnamesInDirectory(),
+ *          which lists the directory, and
+ *          convertSortedToNumberedPathnames(), which does the indexing.
+ * </pre>
+ */
+SARRAY *
+getNumberedPathnamesInDirectory(const char  *dirname,
+                                const char  *substr,
+                                l_int32      numpre,
+                                l_int32      numpost,
+                                l_int32      maxnum)
+{
+SARRAY  *sorted, *numbered;
+
+    if (!dirname)
+        return (SARRAY *)ERROR_PTR("dirname not defined", __func__, NULL);
+
+    sorted = getSortedPathnamesInDirectory(dirname, substr, 0, 0);
+    if (sorted == NULL)
+        return (SARRAY *)ERROR_PTR("sa not made", __func__, NULL);
+
+        /* Nothing in the directory: hand back an empty sarray */
+    if (sarrayGetCount(sorted) == 0)
+        numbered = sarrayCreate(1);
+    else
+        numbered = convertSortedToNumberedPathnames(sorted, numpre,
+                                                    numpost, maxnum);
+    sarrayDestroy(&sorted);
+    return numbered;
+}
+
+
+/*!
+ * \brief   getSortedPathnamesInDirectory()
+ *
+ * \param[in]    dirname   directory name
+ * \param[in]    substr    [optional] substring filter on filenames; can be NULL
+ * \param[in]    first     0-based
+ * \param[in]    nfiles    use 0 for all to the end
+ * \return  sarray of sorted pathnames, or NULL on error
+ *
+ * <pre>
+ * Notes:
+ *      (1) %substr filters the filenames found in the directory; with
+ *          %substr == NULL every file is taken.
+ *      (2) The surviving filenames are sorted with L_SORT_INCREASING,
+ *          and %first and %nfiles then pick a contiguous run of them.
+ *          %first is clipped to [0 ... n - 1] and the run is clipped
+ *          to end no later than the last file.
+ *      (3) Each selected filename is joined with %dirname by pathJoin()
+ *          to form the returned pathname.
+ *      (4) If no files remain after filtering, a warning is issued and
+ *          the (empty) filtered sarray is returned.
+ * </pre>
+ */
+SARRAY *
+getSortedPathnamesInDirectory(const char  *dirname,
+                              const char  *substr,
+                              l_int32      first,
+                              l_int32      nfiles)
+{
+char    *tail;
+l_int32  idx, count, last;
+SARRAY  *saall, *safiles, *saout;
+
+    if (!dirname)
+        return (SARRAY *)ERROR_PTR("dirname not defined", __func__, NULL);
+
+    saall = getFilenamesInDirectory(dirname);
+    if (saall == NULL)
+        return (SARRAY *)ERROR_PTR("sa not made", __func__, NULL);
+    safiles = sarraySelectBySubstring(saall, substr);
+    sarrayDestroy(&saall);
+    count = sarrayGetCount(safiles);
+    if (count == 0) {
+        L_WARNING("no files found\n", __func__);
+        return safiles;
+    }
+
+    sarraySort(safiles, safiles, L_SORT_INCREASING);
+
+        /* Clip the requested window to the available files */
+    if (first < 0)
+        first = 0;
+    if (first > count - 1)
+        first = count - 1;
+    if (nfiles == 0)
+        nfiles = count - first;
+    last = first + nfiles - 1;
+    if (last > count - 1)
+        last = count - 1;
+
+    saout = sarrayCreate(last - first + 1);
+    for (idx = first; idx <= last; idx++) {
+        tail = sarrayGetString(safiles, idx, L_NOCOPY);
+        sarrayAddString(saout, pathJoin(dirname, tail), L_INSERT);
+    }
+
+    sarrayDestroy(&safiles);
+    return saout;
+}
+
+
+/*!
+ * \brief   convertSortedToNumberedPathnames()
+ *
+ * \param[in]    sa        sorted pathnames including zero-padded integers
+ * \param[in]    numpre    number of characters in name before number
+ * \param[in]    numpost   number of characters in name after the number,
+ *                         up to a dot before an extension
+ * \param[in]    maxnum    only consider page numbers up to this value
+ * \return  sarray of numbered pathnames, or NULL on error
+ *
+ * <pre>
+ * Notes:
+ *      (1) Typically, numpre = numpost = 0; e.g., when the filename
+ *          just has a number followed by an optional extension.
+ *      (2) The output size is taken from the last pathname in %sa for
+ *          which extractNumberFromFilename() gives a non-negative
+ *          number: it is that number plus 1, but at most %maxnum.
+ *      (3) The output is initialized with empty strings.  Each input
+ *          pathname whose number is a valid output index replaces the
+ *          string at that index; a warning is issued if the slot was
+ *          already occupied.
+ *      (4) If %sa is empty or no usable number is found, an empty
+ *          sarray is returned.
+ * </pre>
+ */
+SARRAY *
+convertSortedToNumberedPathnames(SARRAY   *sa,
+                                 l_int32   numpre,
+                                 l_int32   numpost,
+                                 l_int32   maxnum)
+{
+char    *path, *current;
+l_int32  k, count, value, size, slot;
+SARRAY  *saout;
+
+    if (!sa)
+        return (SARRAY *)ERROR_PTR("sa not defined", __func__, NULL);
+    count = sarrayGetCount(sa);
+    if (count == 0)
+        return sarrayCreate(1);
+
+        /* Scan backwards for the last pathname that yields a number;
+         * it determines the output size, which %maxnum caps. */
+    size = 0;
+    for (k = count - 1; k >= 0; k--) {
+        path = sarrayGetString(sa, k, L_NOCOPY);
+        value = extractNumberFromFilename(path, numpre, numpost);
+        if (value >= 0) {
+            size = L_MIN(value + 1, maxnum);
+            break;
+        }
+    }
+    if (size <= 0)  /* none found */
+        return sarrayCreate(1);
+
+        /* Drop every pathname into the slot given by its number,
+         * skipping numbers that fall outside the output array. */
+    saout = sarrayCreateInitialized(size, "");
+    for (k = 0; k < count; k++) {
+        path = sarrayGetString(sa, k, L_NOCOPY);
+        slot = extractNumberFromFilename(path, numpre, numpost);
+        if (slot >= 0 && slot < size) {
+            current = sarrayGetString(saout, slot, L_NOCOPY);
+            if (current[0] != '\0') {
+                L_WARNING("\n  Multiple files with same number: %d\n",
+                          __func__, slot);
+            }
+            sarrayReplaceString(saout, slot, path, L_COPY);
+        }
+    }
+
+    return saout;
+}
+
+
+/*!
+ * \brief   getFilenamesInDirectory()
+ *
+ * \param[in]    dirname     directory name
+ * \return  sarray of file names, or NULL on error
+ *
+ * <pre>
+ * Notes:
+ *      (1) The versions compiled under unix and cygwin use the POSIX C
+ *          library commands for handling directories.  For Windows,
+ *          there is a separate implementation.
+ *      (2) It returns an array of filename tails; i.e., only the part of
+ *          the path after the last slash.
+ *      (3) Use of the d_type field of dirent is not portable:
+ *          "According to POSIX, the dirent structure contains a field
+ *          char d_name[] of unspecified size, with at most NAME_MAX
+ *          characters preceding the terminating null character.  Use
+ *          of other fields will harm the portability of your programs."
+ *      (4) As a consequence of (3), we note several things:
+ *           ~ MINGW doesn't have a d_type member.
+ *           ~ Older versions of gcc (e.g., 2.95.3) return DT_UNKNOWN
+ *             for d_type from all files.
+ *          On these systems, this function will return directories
+ *          (except for '.' and '..', which are eliminated using
+ *          the d_name field).
+ *      (5) For unix, we avoid the bug in earlier versions of realpath()
+ *          by requiring either POSIX 2008 or use of glibc.
+ *          
+ * </pre>
+ */
+
+#ifndef _WIN32
+
+SARRAY *
+getFilenamesInDirectory(const char  *dirname)
+{
+char           *gendir, *realdir, *stat_path;
+size_t          size;
+SARRAY         *safiles;
+DIR            *pdir;
+struct dirent  *pdirentry;
+int             dfd, stat_ret;
+struct stat     st;
+
+    if (!dirname)
+        return (SARRAY *)ERROR_PTR("dirname not defined", __func__, NULL);
+    if (dirname[0] == '\0')
+        return (SARRAY *)ERROR_PTR("dirname is empty", __func__, NULL);
+
+        /* Resolve the directory to a real path first.  realpath() is
+         * called in its POSIX 2008 form: the second argument is NULL
+         * and the resolved path is returned in allocated storage, or
+         * NULL is returned if the path cannot be resolved.
+         * The entries are then classified either with fstatat(), which
+         * works relative to the open directory, or with stat() on a
+         * path assembled from realdir and the entry name. */
+    gendir = genPathname(dirname, NULL);
+    realdir = realpath(gendir, NULL);
+    LEPT_FREE(gendir);
+    if (!realdir)
+        return (SARRAY *)ERROR_PTR("realdir not made", __func__, NULL);
+    pdir = opendir(realdir);
+    if (!pdir) {
+        L_ERROR("directory %s not opened\n", __func__, realdir);
+        LEPT_FREE(realdir);
+        return NULL;
+    }
+
+    safiles = sarrayCreate(0);
+    for (pdirentry = readdir(pdir); pdirentry != NULL;
+         pdirentry = readdir(pdir)) {
+#if HAVE_DIRFD && HAVE_FSTATAT
+            /* Not every platform offers both calls: AIX lacks
+             * fstatat() and Solaris lacks dirfd(). */
+        dfd = dirfd(pdir);
+        stat_ret = fstatat(dfd, pdirentry->d_name, &st, 0);
+#else
+        size = strlen(realdir) + strlen(pdirentry->d_name) + 2;
+        stat_path = (char *)LEPT_CALLOC(size, 1);
+        snprintf(stat_path, size, "%s/%s", realdir, pdirentry->d_name);
+        stat_ret = stat(stat_path, &st);
+        LEPT_FREE(stat_path);
+#endif
+            /* Keep the entry unless it is positively known to be a
+             * directory; entries that cannot be stat'ed are kept. */
+        if (stat_ret != 0 || !S_ISDIR(st.st_mode))
+            sarrayAddString(safiles, pdirentry->d_name, L_COPY);
+    }
+    closedir(pdir);
+    LEPT_FREE(realdir);
+    return safiles;
+}
+
+#else  /* _WIN32 */
+
+    /* http://msdn2.microsoft.com/en-us/library/aa365200(VS.85).aspx */
+#include <windows.h>
+
+SARRAY *
+getFilenamesInDirectory(const char  *dirname)
+{
+char             *pszDir;
+char             *realdir;
+HANDLE            hFind = INVALID_HANDLE_VALUE;
+SARRAY           *safiles;
+WIN32_FIND_DATAA  ffd;
+
+    if (!dirname)
+        return (SARRAY *)ERROR_PTR("dirname not defined", __func__, NULL);
+
+        /* Build the search pattern "<dir>\*" that matches every entry */
+    realdir = genPathname(dirname, NULL);
+    pszDir = stringJoin(realdir, "\\*");
+    LEPT_FREE(realdir);
+    if (!pszDir)
+        return (SARRAY *)ERROR_PTR("pszDir not made", __func__, NULL);
+
+    if (strlen(pszDir) + 1 > MAX_PATH) {
+        LEPT_FREE(pszDir);
+        return (SARRAY *)ERROR_PTR("dirname is too long", __func__, NULL);
+    }
+
+    if ((safiles = sarrayCreate(0)) == NULL) {
+        LEPT_FREE(pszDir);
+        return (SARRAY *)ERROR_PTR("safiles not made", __func__, NULL);
+    }
+
+    hFind = FindFirstFileA(pszDir, &ffd);
+    if (INVALID_HANDLE_VALUE == hFind) {
+        sarrayDestroy(&safiles);
+        LEPT_FREE(pszDir);
+        return (SARRAY *)ERROR_PTR("hFind not opened", __func__, NULL);
+    }
+
+        /* FindFirstFileA() has already filled ffd with the first match,
+         * so that entry must be examined before asking for the next
+         * one.  Starting the loop with FindNextFileA() would silently
+         * drop the first entry whenever it is a regular file. */
+    do {
+        if (ffd.dwFileAttributes & FILE_ATTRIBUTE_DIRECTORY)  /* skip dirs */
+            continue;
+        convertSepCharsInPath(ffd.cFileName, UNIX_PATH_SEPCHAR);
+        sarrayAddString(safiles, ffd.cFileName, L_COPY);
+    } while (FindNextFileA(hFind, &ffd) != 0);
+
+    FindClose(hFind);
+    LEPT_FREE(pszDir);
+    return safiles;
+}
+#endif  /* _WIN32 */