Mercurial > hgrepos > Python2 > PyMuPDF
comparison mupdf-source/thirdparty/leptonica/src/sarray1.c @ 2:b50eed0cc0ef upstream
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
The directory name has changed: no version number in the expanded directory now.
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Mon, 15 Sep 2025 11:43:07 +0200 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| 1:1d09e1dec1d9 | 2:b50eed0cc0ef |
|---|---|
| 1 /*====================================================================* | |
| 2 - Copyright (C) 2001 Leptonica. All rights reserved. | |
| 3 - | |
| 4 - Redistribution and use in source and binary forms, with or without | |
| 5 - modification, are permitted provided that the following conditions | |
| 6 - are met: | |
| 7 - 1. Redistributions of source code must retain the above copyright | |
| 8 - notice, this list of conditions and the following disclaimer. | |
| 9 - 2. Redistributions in binary form must reproduce the above | |
| 10 - copyright notice, this list of conditions and the following | |
| 11 - disclaimer in the documentation and/or other materials | |
| 12 - provided with the distribution. | |
| 13 - | |
| 14 - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS | |
| 15 - ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT | |
| 16 - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR | |
| 17 - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL ANY | |
| 18 - CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, | |
| 19 - EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, | |
| 20 - PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR | |
| 21 - PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY | |
| 22 - OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING | |
| 23 - NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS | |
| 24 - SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |
| 25 *====================================================================*/ | |
| 26 | |
| 27 /*! | |
| 28 * \file sarray1.c | |
| 29 * <pre> | |
| 30 * | |
| 31 * Create/Destroy/Copy | |
| 32 * SARRAY *sarrayCreate() | |
| 33 * SARRAY *sarrayCreateInitialized() | |
| 34 * SARRAY *sarrayCreateWordsFromString() | |
| 35 * SARRAY *sarrayCreateLinesFromString() | |
| 36 * void *sarrayDestroy() | |
| 37 * SARRAY *sarrayCopy() | |
| 38 * SARRAY *sarrayClone() | |
| 39 * | |
| 40 * Add/Remove string | |
| 41 * l_int32 sarrayAddString() | |
| 42 * static l_int32 sarrayExtendArray() | |
| 43 * char *sarrayRemoveString() | |
| 44 * l_int32 sarrayReplaceString() | |
| 45 * l_int32 sarrayClear() | |
| 46 * | |
| 47 * Accessors | |
| 48 * l_int32 sarrayGetCount() | |
| 49 * char **sarrayGetArray() | |
| 50 * char *sarrayGetString() | |
| 51 * | |
| 52 * Conversion back to string | |
| 53 * char *sarrayToString() | |
| 54 * char *sarrayToStringRange() | |
| 55 * | |
| 56 * Concatenate strings uniformly within the sarray | |
| 57 * SARRAY *sarrayConcatUniformly() | |
| 58 * | |
| 59 * Join 2 sarrays | |
| 60 * l_int32 sarrayJoin() | |
| 61 * l_int32 sarrayAppendRange() | |
| 62 * | |
| 63 * Pad an sarray to be the same size as another sarray | |
| 64 * l_int32 sarrayPadToSameSize() | |
| 65 * | |
| 66 * Convert word sarray to (formatted) line sarray | |
| 67 * SARRAY *sarrayConvertWordsToLines() | |
| 68 * | |
| 69 * Split string on separator list | |
| 70 * SARRAY *sarraySplitString() | |
| 71 * | |
| 72 * Filter sarray | |
| 73 * SARRAY *sarraySelectBySubstring() | |
| 74 * SARRAY *sarraySelectRange() | |
| 75 * l_int32 sarrayParseRange() | |
| 76 * | |
| 77 * Serialize for I/O | |
| 78 * SARRAY *sarrayRead() | |
| 79 * SARRAY *sarrayReadStream() | |
| 80 * SARRAY *sarrayReadMem() | |
| 81 * l_int32 sarrayWrite() | |
| 82 * l_int32 sarrayWriteStream() | |
| 83 * l_int32 sarrayWriteStderr() | |
| 84 * l_int32 sarrayWriteMem() | |
| 85 * l_int32 sarrayAppend() | |
| 86 * | |
| 87 * Directory filenames | |
| 88 * SARRAY *getNumberedPathnamesInDirectory() | |
| 89 * SARRAY *getSortedPathnamesInDirectory() | |
| 90 * SARRAY *convertSortedToNumberedPathnames() | |
| 91 * SARRAY *getFilenamesInDirectory() | |
| 92 * | |
| 93 * These functions are important for efficient manipulation | |
| 94 * of string data, and they have found widespread use in | |
| 95 * leptonica. For example: | |
| 96 * (1) to generate text files: e.g., PostScript and PDF | |
| 97 * wrappers around sets of images | |
| 98 * (2) to parse text files: e.g., extracting prototypes | |
| 99 * from the source to generate allheaders.h | |
| 100 * (3) to generate code for compilation: e.g., the fast | |
| 101 * dwa code for arbitrary structuring elements. | |
| 102 * | |
| 103 * Comments on usage: | |
| 104 * | |
| 105 * The user is responsible for correctly disposing of strings | |
| 106 * that have been extracted from sarrays. In the following, | |
| 107 * "str-not-owned" means the returned handle does not own the string, | |
| 108 * and "str-owned" means the returned handle owns the string. | |
| 109 * - To extract a string from an Sarray in order to inspect it | |
| 110 * or to make a copy of it later, get a handle to it: | |
| 111 * copyflag = L_NOCOPY. | |
| 112 * In this case, you must neither free the string nor put it | |
| 113 * directly in another array: | |
| 114 * str-not-owned = sarrayGetString(sa, index, L_NOCOPY); | |
| 115 * - To extract a copy of a string from an Sarray, use: | |
| 116 * str-owned = sarrayGetString(sa, index, L_COPY); | |
| 117 * - To insert a string that is in one array into another | |
| 118 * array (always leaving the first array intact), there are | |
| 119 * two options: | |
| 120 * (1) use copyflag = L_COPY to make an immediate copy, | |
| 121 * which you then add to the second array by insertion: | |
| 122 * str-owned = sarrayGetString(sa, index, L_COPY); | |
| 123 * sarrayAddString(sa, str-owned, L_INSERT); | |
| 124 * (2) use copyflag = L_NOCOPY to get another handle to | |
| 125 * the string; you then add a copy of it to the | |
| 126 * second string array: | |
| 127 * str-not-owned = sarrayGetString(sa, index, L_NOCOPY); | |
| 128 * sarrayAddString(sa, str-not-owned, L_COPY). | |
| 129 * sarrayAddString() transfers ownership to the Sarray, so never | |
| 130 * use L_INSERT if the string is owned by another array. | |
| 131 * | |
| 132 * In all cases, when you use copyflag = L_COPY to extract | |
| 133 * a string from an array, you must either free it | |
| 134 * or insert it in an array that will be freed later. | |
| 135 * </pre> | |
| 136 */ | |
| 137 | |
| 138 #ifdef HAVE_CONFIG_H | |
| 139 #include <config_auto.h> | |
| 140 #endif /* HAVE_CONFIG_H */ | |
| 141 | |
| 142 #include <string.h> | |
| 143 #ifndef _WIN32 | |
| 144 #include <dirent.h> /* unix only */ | |
| 145 #include <sys/stat.h> | |
| 146 #include <limits.h> /* needed for realpath() */ | |
| 147 #include <stdlib.h> /* needed for realpath() */ | |
| 148 #endif /* ! _WIN32 */ | |
| 149 #include "allheaders.h" | |
| 150 #include "array_internal.h" | |
| 151 | |
/* Upper bound on the number of string ptrs in an sarray.  It limits the
 * size accepted by sarrayCreate() and the growth in sarrayExtendArray(). */
static const l_uint32  MaxPtrArraySize = 50000000;  /* 50 million */
/* Ptr array size used by sarrayCreate() when the requested size is
 * <= 0 or larger than MaxPtrArraySize. */
static const l_int32  InitialPtrArraySize = 50;  /*!< arbitrary value */

    /* Static functions */
static l_int32 sarrayExtendArray(SARRAY *sa);
| 157 | |
| 158 | |
| 159 /*--------------------------------------------------------------------------* | |
| 160 * String array create/destroy/copy/extend * | |
| 161 *--------------------------------------------------------------------------*/ | |
| 162 /*! | |
| 163 * \brief sarrayCreate() | |
| 164 * | |
| 165 * \param[in] n size of string ptr array to be alloc'd; use 0 for default | |
| 166 * \return sarray, or NULL on error | |
| 167 */ | |
| 168 SARRAY * | |
| 169 sarrayCreate(l_int32 n) | |
| 170 { | |
| 171 SARRAY *sa; | |
| 172 | |
| 173 if (n <= 0 || n > (l_int32)MaxPtrArraySize) | |
| 174 n = InitialPtrArraySize; | |
| 175 | |
| 176 sa = (SARRAY *)LEPT_CALLOC(1, sizeof(SARRAY)); | |
| 177 if ((sa->array = (char **)LEPT_CALLOC(n, sizeof(char *))) == NULL) { | |
| 178 sarrayDestroy(&sa); | |
| 179 return (SARRAY *)ERROR_PTR("ptr array not made", __func__, NULL); | |
| 180 } | |
| 181 | |
| 182 sa->nalloc = n; | |
| 183 sa->n = 0; | |
| 184 sa->refcount = 1; | |
| 185 return sa; | |
| 186 } | |
| 187 | |
| 188 | |
| 189 /*! | |
| 190 * \brief sarrayCreateInitialized() | |
| 191 * | |
| 192 * \param[in] n size of string ptr array to be alloc'd | |
| 193 * \param[in] initstr string to be initialized on the full array | |
| 194 * \return sarray, or NULL on error | |
| 195 */ | |
| 196 SARRAY * | |
| 197 sarrayCreateInitialized(l_int32 n, | |
| 198 const char *initstr) | |
| 199 { | |
| 200 l_int32 i; | |
| 201 SARRAY *sa; | |
| 202 | |
| 203 if (n <= 0) | |
| 204 return (SARRAY *)ERROR_PTR("n must be > 0", __func__, NULL); | |
| 205 if (!initstr) | |
| 206 return (SARRAY *)ERROR_PTR("initstr not defined", __func__, NULL); | |
| 207 | |
| 208 sa = sarrayCreate(n); | |
| 209 for (i = 0; i < n; i++) | |
| 210 sarrayAddString(sa, initstr, L_COPY); | |
| 211 return sa; | |
| 212 } | |
| 213 | |
| 214 | |
| 215 /*! | |
| 216 * \brief sarrayCreateWordsFromString() | |
| 217 * | |
| 218 * \param[in] string | |
| 219 * \return sarray, or NULL on error | |
| 220 * | |
| 221 * <pre> | |
| 222 * Notes: | |
| 223 * (1) This finds the number of word substrings, creates an sarray | |
| 224 * of this size, and puts copies of each substring into the sarray. | |
| 225 * </pre> | |
| 226 */ | |
| 227 SARRAY * | |
| 228 sarrayCreateWordsFromString(const char *string) | |
| 229 { | |
| 230 char separators[] = " \n\t"; | |
| 231 l_int32 i, nsub, size, inword; | |
| 232 SARRAY *sa; | |
| 233 | |
| 234 if (!string) | |
| 235 return (SARRAY *)ERROR_PTR("textstr not defined", __func__, NULL); | |
| 236 | |
| 237 /* Find the number of words */ | |
| 238 size = strlen(string); | |
| 239 nsub = 0; | |
| 240 inword = FALSE; | |
| 241 for (i = 0; i < size; i++) { | |
| 242 if (inword == FALSE && | |
| 243 (string[i] != ' ' && string[i] != '\t' && string[i] != '\n')) { | |
| 244 inword = TRUE; | |
| 245 nsub++; | |
| 246 } else if (inword == TRUE && | |
| 247 (string[i] == ' ' || string[i] == '\t' || string[i] == '\n')) { | |
| 248 inword = FALSE; | |
| 249 } | |
| 250 } | |
| 251 | |
| 252 if ((sa = sarrayCreate(nsub)) == NULL) | |
| 253 return (SARRAY *)ERROR_PTR("sa not made", __func__, NULL); | |
| 254 sarraySplitString(sa, string, separators); | |
| 255 | |
| 256 return sa; | |
| 257 } | |
| 258 | |
| 259 | |
| 260 /*! | |
| 261 * \brief sarrayCreateLinesFromString() | |
| 262 * | |
| 263 * \param[in] string | |
| 264 * \param[in] blankflag 0 to exclude blank lines; 1 to include | |
| 265 * \return sarray, or NULL on error | |
| 266 * | |
| 267 * <pre> | |
| 268 * Notes: | |
| 269 * (1) This finds the number of line substrings, each of which | |
| 270 * ends with a newline, and puts a copy of each substring | |
| 271 * in a new sarray. | |
| 272 * (2) The newline characters are removed from each substring. | |
| 273 * </pre> | |
| 274 */ | |
| 275 SARRAY * | |
| 276 sarrayCreateLinesFromString(const char *string, | |
| 277 l_int32 blankflag) | |
| 278 { | |
| 279 l_int32 i, nsub, size, startptr; | |
| 280 char *cstring, *substring; | |
| 281 SARRAY *sa; | |
| 282 | |
| 283 if (!string) | |
| 284 return (SARRAY *)ERROR_PTR("textstr not defined", __func__, NULL); | |
| 285 | |
| 286 /* Find the number of lines */ | |
| 287 size = strlen(string); | |
| 288 nsub = 0; | |
| 289 for (i = 0; i < size; i++) { | |
| 290 if (string[i] == '\n') | |
| 291 nsub++; | |
| 292 } | |
| 293 | |
| 294 if ((sa = sarrayCreate(nsub)) == NULL) | |
| 295 return (SARRAY *)ERROR_PTR("sa not made", __func__, NULL); | |
| 296 | |
| 297 if (blankflag) { /* keep blank lines as null strings */ | |
| 298 /* Make a copy for munging */ | |
| 299 if ((cstring = stringNew(string)) == NULL) { | |
| 300 sarrayDestroy(&sa); | |
| 301 return (SARRAY *)ERROR_PTR("cstring not made", __func__, NULL); | |
| 302 } | |
| 303 /* We'll insert nulls like strtok */ | |
| 304 startptr = 0; | |
| 305 for (i = 0; i < size; i++) { | |
| 306 if (cstring[i] == '\n') { | |
| 307 cstring[i] = '\0'; | |
| 308 if (i > 0 && cstring[i - 1] == '\r') | |
| 309 cstring[i - 1] = '\0'; /* also remove Windows CR */ | |
| 310 if ((substring = stringNew(cstring + startptr)) == NULL) { | |
| 311 sarrayDestroy(&sa); | |
| 312 LEPT_FREE(cstring); | |
| 313 return (SARRAY *)ERROR_PTR("substring not made", | |
| 314 __func__, NULL); | |
| 315 } | |
| 316 sarrayAddString(sa, substring, L_INSERT); | |
| 317 /* lept_stderr("substring = %s\n", substring); */ | |
| 318 startptr = i + 1; | |
| 319 } | |
| 320 } | |
| 321 if (startptr < size) { /* no newline at end of last line */ | |
| 322 if ((substring = stringNew(cstring + startptr)) == NULL) { | |
| 323 sarrayDestroy(&sa); | |
| 324 LEPT_FREE(cstring); | |
| 325 return (SARRAY *)ERROR_PTR("substring not made", | |
| 326 __func__, NULL); | |
| 327 } | |
| 328 sarrayAddString(sa, substring, L_INSERT); | |
| 329 /* lept_stderr("substring = %s\n", substring); */ | |
| 330 } | |
| 331 LEPT_FREE(cstring); | |
| 332 } else { /* remove blank lines; use strtok */ | |
| 333 sarraySplitString(sa, string, "\r\n"); | |
| 334 } | |
| 335 | |
| 336 return sa; | |
| 337 } | |
| 338 | |
| 339 | |
| 340 /*! | |
| 341 * \brief sarrayDestroy() | |
| 342 * | |
| 343 * \param[in,out] psa will be set to null before returning | |
| 344 * \return void | |
| 345 * | |
| 346 * <pre> | |
| 347 * Notes: | |
| 348 * (1) Decrements the ref count and, if 0, destroys the sarray. | |
| 349 * (2) Always nulls the input ptr. | |
| 350 * </pre> | |
| 351 */ | |
| 352 void | |
| 353 sarrayDestroy(SARRAY **psa) | |
| 354 { | |
| 355 l_int32 i; | |
| 356 SARRAY *sa; | |
| 357 | |
| 358 if (psa == NULL) { | |
| 359 L_WARNING("ptr address is NULL!\n", __func__); | |
| 360 return; | |
| 361 } | |
| 362 if ((sa = *psa) == NULL) | |
| 363 return; | |
| 364 | |
| 365 if (--sa->refcount == 0) { | |
| 366 if (sa->array) { | |
| 367 for (i = 0; i < sa->n; i++) { | |
| 368 if (sa->array[i]) | |
| 369 LEPT_FREE(sa->array[i]); | |
| 370 } | |
| 371 LEPT_FREE(sa->array); | |
| 372 } | |
| 373 LEPT_FREE(sa); | |
| 374 } | |
| 375 *psa = NULL; | |
| 376 } | |
| 377 | |
| 378 | |
| 379 /*! | |
| 380 * \brief sarrayCopy() | |
| 381 * | |
| 382 * \param[in] sa string array | |
| 383 * \return copy of sarray, or NULL on error | |
| 384 */ | |
| 385 SARRAY * | |
| 386 sarrayCopy(SARRAY *sa) | |
| 387 { | |
| 388 l_int32 i; | |
| 389 SARRAY *csa; | |
| 390 | |
| 391 if (!sa) | |
| 392 return (SARRAY *)ERROR_PTR("sa not defined", __func__, NULL); | |
| 393 | |
| 394 if ((csa = sarrayCreate(sa->nalloc)) == NULL) | |
| 395 return (SARRAY *)ERROR_PTR("csa not made", __func__, NULL); | |
| 396 | |
| 397 for (i = 0; i < sa->n; i++) | |
| 398 sarrayAddString(csa, sa->array[i], L_COPY); | |
| 399 | |
| 400 return csa; | |
| 401 } | |
| 402 | |
| 403 | |
| 404 /*! | |
| 405 * \brief sarrayClone() | |
| 406 * | |
| 407 * \param[in] sa string array | |
| 408 * \return ptr to same sarray, or NULL on error | |
| 409 */ | |
| 410 SARRAY * | |
| 411 sarrayClone(SARRAY *sa) | |
| 412 { | |
| 413 if (!sa) | |
| 414 return (SARRAY *)ERROR_PTR("sa not defined", __func__, NULL); | |
| 415 ++sa->refcount; | |
| 416 return sa; | |
| 417 } | |
| 418 | |
| 419 | |
| 420 /*! | |
| 421 * \brief sarrayAddString() | |
| 422 * | |
| 423 * \param[in] sa string array | |
| 424 * \param[in] string string to be added | |
| 425 * \param[in] copyflag L_INSERT, L_NOCOPY or L_COPY | |
| 426 * \return 0 if OK, 1 on error | |
| 427 * | |
| 428 * <pre> | |
| 429 * Notes: | |
| 430 * (1) See usage comments at the top of this file. L_INSERT is | |
| 431 * equivalent to L_NOCOPY. | |
| 432 * </pre> | |
| 433 */ | |
| 434 l_ok | |
| 435 sarrayAddString(SARRAY *sa, | |
| 436 const char *string, | |
| 437 l_int32 copyflag) | |
| 438 { | |
| 439 l_int32 n; | |
| 440 | |
| 441 if (!sa) | |
| 442 return ERROR_INT("sa not defined", __func__, 1); | |
| 443 if (!string) | |
| 444 return ERROR_INT("string not defined", __func__, 1); | |
| 445 if (copyflag != L_INSERT && copyflag != L_NOCOPY && copyflag != L_COPY) | |
| 446 return ERROR_INT("invalid copyflag", __func__, 1); | |
| 447 | |
| 448 n = sarrayGetCount(sa); | |
| 449 if (n >= sa->nalloc) { | |
| 450 if (sarrayExtendArray(sa)) | |
| 451 return ERROR_INT("extension failed", __func__, 1); | |
| 452 } | |
| 453 | |
| 454 if (copyflag == L_COPY) | |
| 455 sa->array[n] = stringNew(string); | |
| 456 else /* L_INSERT or L_NOCOPY */ | |
| 457 sa->array[n] = (char *)string; | |
| 458 sa->n++; | |
| 459 return 0; | |
| 460 } | |
| 461 | |
| 462 | |
| 463 /*! | |
| 464 * \brief sarrayExtendArray() | |
| 465 * | |
| 466 * \param[in] sa string array | |
| 467 * \return 0 if OK, 1 on error | |
| 468 * | |
| 469 * <pre> | |
| 470 * Notes: | |
| 471 * (1) Doubles the size of the string ptr array. | |
| 472 * (2) The max number of strings is 50M. | |
| 473 * </pre> | |
| 474 */ | |
| 475 static l_int32 | |
| 476 sarrayExtendArray(SARRAY *sa) | |
| 477 { | |
| 478 size_t oldsize, newsize; | |
| 479 | |
| 480 if (!sa) | |
| 481 return ERROR_INT("sa not defined", __func__, 1); | |
| 482 if (sa->nalloc >= (l_int32)MaxPtrArraySize) /* belt & suspenders */ | |
| 483 return ERROR_INT("sa at maximum ptr size; can't extend", __func__, 1); | |
| 484 oldsize = sa->nalloc * sizeof(char *); | |
| 485 if (sa->nalloc > (l_int32)(MaxPtrArraySize / 2)) { | |
| 486 newsize = MaxPtrArraySize * sizeof(char *); | |
| 487 sa->nalloc = (l_int32)MaxPtrArraySize; | |
| 488 } else { | |
| 489 newsize = 2 * oldsize; | |
| 490 sa->nalloc *= 2; | |
| 491 } | |
| 492 if ((sa->array = (char **)reallocNew((void **)&sa->array, | |
| 493 oldsize, newsize)) == NULL) | |
| 494 return ERROR_INT("new ptr array not returned", __func__, 1); | |
| 495 | |
| 496 return 0; | |
| 497 } | |
| 498 | |
| 499 | |
| 500 /*! | |
| 501 * \brief sarrayRemoveString() | |
| 502 * | |
| 503 * \param[in] sa string array | |
| 504 * \param[in] index of string within sarray | |
| 505 * \return removed string, or NULL on error | |
| 506 */ | |
| 507 char * | |
| 508 sarrayRemoveString(SARRAY *sa, | |
| 509 l_int32 index) | |
| 510 { | |
| 511 char *string; | |
| 512 char **array; | |
| 513 l_int32 i, n, nalloc; | |
| 514 | |
| 515 if (!sa) | |
| 516 return (char *)ERROR_PTR("sa not defined", __func__, NULL); | |
| 517 | |
| 518 if ((array = sarrayGetArray(sa, &nalloc, &n)) == NULL) | |
| 519 return (char *)ERROR_PTR("array not returned", __func__, NULL); | |
| 520 | |
| 521 if (index < 0 || index >= n) | |
| 522 return (char *)ERROR_PTR("array index out of bounds", __func__, NULL); | |
| 523 | |
| 524 string = array[index]; | |
| 525 | |
| 526 /* If removed string is not at end of array, shift | |
| 527 * to fill in, maintaining original ordering. | |
| 528 * Note: if we didn't care about the order, we could | |
| 529 * put the last string array[n - 1] directly into the hole. */ | |
| 530 for (i = index; i < n - 1; i++) | |
| 531 array[i] = array[i + 1]; | |
| 532 | |
| 533 sa->n--; | |
| 534 return string; | |
| 535 } | |
| 536 | |
| 537 | |
| 538 /*! | |
| 539 * \brief sarrayReplaceString() | |
| 540 * | |
| 541 * \param[in] sa string array | |
| 542 * \param[in] index of string within sarray to be replaced | |
| 543 * \param[in] newstr string to replace existing one | |
| 544 * \param[in] copyflag L_INSERT, L_COPY | |
| 545 * \return 0 if OK, 1 on error | |
| 546 * | |
| 547 * <pre> | |
| 548 * Notes: | |
| 549 * (1) This destroys an existing string and replaces it with | |
| 550 * the new string or a copy of it. | |
| 551 * (2) By design, an sarray is always compacted, so there are | |
| 552 * never any holes (null ptrs) in the ptr array up to the | |
| 553 * current count. | |
| 554 * </pre> | |
| 555 */ | |
| 556 l_ok | |
| 557 sarrayReplaceString(SARRAY *sa, | |
| 558 l_int32 index, | |
| 559 char *newstr, | |
| 560 l_int32 copyflag) | |
| 561 { | |
| 562 char *str; | |
| 563 l_int32 n; | |
| 564 | |
| 565 if (!sa) | |
| 566 return ERROR_INT("sa not defined", __func__, 1); | |
| 567 n = sarrayGetCount(sa); | |
| 568 if (index < 0 || index >= n) | |
| 569 return ERROR_INT("array index out of bounds", __func__, 1); | |
| 570 if (!newstr) | |
| 571 return ERROR_INT("newstr not defined", __func__, 1); | |
| 572 if (copyflag != L_INSERT && copyflag != L_COPY) | |
| 573 return ERROR_INT("invalid copyflag", __func__, 1); | |
| 574 | |
| 575 LEPT_FREE(sa->array[index]); | |
| 576 if (copyflag == L_INSERT) | |
| 577 str = newstr; | |
| 578 else /* L_COPY */ | |
| 579 str = stringNew(newstr); | |
| 580 sa->array[index] = str; | |
| 581 return 0; | |
| 582 } | |
| 583 | |
| 584 | |
| 585 /*! | |
| 586 * \brief sarrayClear() | |
| 587 * | |
| 588 * \param[in] sa string array | |
| 589 * \return 0 if OK; 1 on error | |
| 590 */ | |
| 591 l_ok | |
| 592 sarrayClear(SARRAY *sa) | |
| 593 { | |
| 594 l_int32 i; | |
| 595 | |
| 596 if (!sa) | |
| 597 return ERROR_INT("sa not defined", __func__, 1); | |
| 598 for (i = 0; i < sa->n; i++) { /* free strings and null ptrs */ | |
| 599 LEPT_FREE(sa->array[i]); | |
| 600 sa->array[i] = NULL; | |
| 601 } | |
| 602 sa->n = 0; | |
| 603 return 0; | |
| 604 } | |
| 605 | |
| 606 | |
| 607 /*----------------------------------------------------------------------* | |
| 608 * Accessors * | |
| 609 *----------------------------------------------------------------------*/ | |
| 610 /*! | |
| 611 * \brief sarrayGetCount() | |
| 612 * | |
| 613 * \param[in] sa string array | |
| 614 * \return count, or 0 if no strings or on error | |
| 615 */ | |
| 616 l_int32 | |
| 617 sarrayGetCount(SARRAY *sa) | |
| 618 { | |
| 619 if (!sa) | |
| 620 return ERROR_INT("sa not defined", __func__, 0); | |
| 621 return sa->n; | |
| 622 } | |
| 623 | |
| 624 | |
| 625 /*! | |
| 626 * \brief sarrayGetArray() | |
| 627 * | |
| 628 * \param[in] sa string array | |
| 629 * \param[out] pnalloc [optional] number allocated string ptrs | |
| 630 * \param[out] pn [optional] number allocated strings | |
| 631 * \return ptr to string array, or NULL on error | |
| 632 * | |
| 633 * <pre> | |
| 634 * Notes: | |
| 635 * (1) Caution: the returned array is not a copy, so caller | |
| 636 * must not destroy it! | |
| 637 * </pre> | |
| 638 */ | |
| 639 char ** | |
| 640 sarrayGetArray(SARRAY *sa, | |
| 641 l_int32 *pnalloc, | |
| 642 l_int32 *pn) | |
| 643 { | |
| 644 char **array; | |
| 645 | |
| 646 if (!sa) | |
| 647 return (char **)ERROR_PTR("sa not defined", __func__, NULL); | |
| 648 | |
| 649 array = sa->array; | |
| 650 if (pnalloc) *pnalloc = sa->nalloc; | |
| 651 if (pn) *pn = sa->n; | |
| 652 | |
| 653 return array; | |
| 654 } | |
| 655 | |
| 656 | |
| 657 /*! | |
| 658 * \brief sarrayGetString() | |
| 659 * | |
| 660 * \param[in] sa string array | |
| 661 * \param[in] index to the index-th string | |
| 662 * \param[in] copyflag L_NOCOPY or L_COPY | |
| 663 * \return string, or NULL on error | |
| 664 * | |
| 665 * <pre> | |
| 666 * Notes: | |
| 667 * (1) See usage comments at the top of this file. | |
| 668 * (2) To get a pointer to the string itself, use L_NOCOPY. | |
| 669 * To get a copy of the string, use L_COPY. | |
| 670 * </pre> | |
| 671 */ | |
| 672 char * | |
| 673 sarrayGetString(SARRAY *sa, | |
| 674 l_int32 index, | |
| 675 l_int32 copyflag) | |
| 676 { | |
| 677 if (!sa) | |
| 678 return (char *)ERROR_PTR("sa not defined", __func__, NULL); | |
| 679 if (index < 0 || index >= sa->n) | |
| 680 return (char *)ERROR_PTR("index not valid", __func__, NULL); | |
| 681 if (copyflag != L_NOCOPY && copyflag != L_COPY) | |
| 682 return (char *)ERROR_PTR("invalid copyflag", __func__, NULL); | |
| 683 | |
| 684 if (copyflag == L_NOCOPY) | |
| 685 return sa->array[index]; | |
| 686 else /* L_COPY */ | |
| 687 return stringNew(sa->array[index]); | |
| 688 } | |
| 689 | |
| 690 | |
| 691 /*----------------------------------------------------------------------* | |
| 692 * Conversion to string * | |
| 693 *----------------------------------------------------------------------*/ | |
| 694 /*! | |
| 695 * \brief sarrayToString() | |
| 696 * | |
| 697 * \param[in] sa string array | |
| 698 * \param[in] addnlflag flag: 0 adds nothing to each substring | |
| 699 * 1 adds '\n' to each substring | |
| 700 * 2 adds ' ' to each substring | |
| 701 * 3 adds ',' to each substring | |
| 702 * \return dest string, or NULL on error | |
| 703 * | |
| 704 * <pre> | |
| 705 * Notes: | |
| 706 * (1) Concatenates all the strings in the sarray, preserving | |
| 707 * all white space. | |
| 708 * (2) If addnlflag != 0, adds '\n', ' ' or ',' after each substring. | |
| 709 * (3) This function was NOT implemented as: | |
| 710 * for (i = 0; i < n; i++) | |
| 711 * strcat(dest, sarrayGetString(sa, i, L_NOCOPY)); | |
| 712 * Do you see why? | |
| 713 * </pre> | |
| 714 */ | |
| 715 char * | |
| 716 sarrayToString(SARRAY *sa, | |
| 717 l_int32 addnlflag) | |
| 718 { | |
| 719 if (!sa) | |
| 720 return (char *)ERROR_PTR("sa not defined", __func__, NULL); | |
| 721 | |
| 722 return sarrayToStringRange(sa, 0, 0, addnlflag); | |
| 723 } | |
| 724 | |
| 725 | |
| 726 /*! | |
| 727 * \brief sarrayToStringRange() | |
| 728 * | |
| 729 * \param[in] sa string array | |
| 730 * \param[in] first index of first string to use; starts with 0 | |
| 731 * \param[in] nstrings number of strings to append into the result; use | |
| 732 * 0 to append to the end of the sarray | |
| 733 * \param[in] addnlflag flag: 0 adds nothing to each substring | |
| 734 * 1 adds '\n' to each substring | |
| 735 * 2 adds ' ' to each substring | |
| 736 * 3 adds ',' to each substring | |
| 737 * \return dest string, or NULL on error | |
| 738 * | |
| 739 * <pre> | |
| 740 * Notes: | |
| 741 * (1) Concatenates the specified strings in the sarray, preserving | |
| 742 * all white space. | |
| 743 * (2) If addnlflag != 0, adds '\n', ' ' or ',' after each substring. | |
| 744 * (3) If the sarray is empty, this returns a string with just | |
| 745 * the character corresponding to %addnlflag. | |
| 746 * </pre> | |
| 747 */ | |
| 748 char * | |
| 749 sarrayToStringRange(SARRAY *sa, | |
| 750 l_int32 first, | |
| 751 l_int32 nstrings, | |
| 752 l_int32 addnlflag) | |
| 753 { | |
| 754 char *dest, *src, *str; | |
| 755 l_int32 n, i, last, size, index, len; | |
| 756 | |
| 757 if (!sa) | |
| 758 return (char *)ERROR_PTR("sa not defined", __func__, NULL); | |
| 759 if (addnlflag != 0 && addnlflag != 1 && addnlflag != 2 && addnlflag != 3) | |
| 760 return (char *)ERROR_PTR("invalid addnlflag", __func__, NULL); | |
| 761 | |
| 762 n = sarrayGetCount(sa); | |
| 763 | |
| 764 /* Empty sa; return char corresponding to addnlflag only */ | |
| 765 if (n == 0) { | |
| 766 if (first == 0) { | |
| 767 if (addnlflag == 0) | |
| 768 return stringNew(""); | |
| 769 if (addnlflag == 1) | |
| 770 return stringNew("\n"); | |
| 771 if (addnlflag == 2) | |
| 772 return stringNew(" "); | |
| 773 else /* addnlflag == 3) */ | |
| 774 return stringNew(","); | |
| 775 } else { | |
| 776 return (char *)ERROR_PTR("first not valid", __func__, NULL); | |
| 777 } | |
| 778 } | |
| 779 | |
| 780 /* Determine the range of string indices to be used */ | |
| 781 if (first < 0 || first >= n) | |
| 782 return (char *)ERROR_PTR("first not valid", __func__, NULL); | |
| 783 if (nstrings == 0 || (nstrings > n - first)) | |
| 784 nstrings = n - first; /* no overflow */ | |
| 785 last = first + nstrings - 1; | |
| 786 | |
| 787 /* Determine the size of the output string */ | |
| 788 size = 0; | |
| 789 for (i = first; i <= last; i++) { | |
| 790 if ((str = sarrayGetString(sa, i, L_NOCOPY)) == NULL) | |
| 791 return (char *)ERROR_PTR("str not found", __func__, NULL); | |
| 792 size += strlen(str) + 2; | |
| 793 } | |
| 794 if ((dest = (char *)LEPT_CALLOC(size + 1, sizeof(char))) == NULL) | |
| 795 return (char *)ERROR_PTR("dest not made", __func__, NULL); | |
| 796 | |
| 797 /* Construct the output */ | |
| 798 index = 0; | |
| 799 for (i = first; i <= last; i++) { | |
| 800 src = sarrayGetString(sa, i, L_NOCOPY); | |
| 801 len = strlen(src); | |
| 802 memcpy(dest + index, src, len); | |
| 803 index += len; | |
| 804 if (addnlflag == 1) { | |
| 805 dest[index] = '\n'; | |
| 806 index++; | |
| 807 } else if (addnlflag == 2) { | |
| 808 dest[index] = ' '; | |
| 809 index++; | |
| 810 } else if (addnlflag == 3) { | |
| 811 dest[index] = ','; | |
| 812 index++; | |
| 813 } | |
| 814 } | |
| 815 | |
| 816 return dest; | |
| 817 } | |
| 818 | |
| 819 | |
| 820 /*----------------------------------------------------------------------* | |
| 821 * Concatenate strings uniformly within the sarray * | |
| 822 *----------------------------------------------------------------------*/ | |
| 823 /*! | |
| 824 * \brief sarrayConcatUniformly() | |
| 825 * | |
| 826 * \param[in] sa string array | |
| 827 * \param[in] n number of strings in output sarray | |
| 828 * \param[in] addnlflag flag: 0 adds nothing to each substring | |
| 829 * 1 adds '\n' to each substring | |
| 830 * 2 adds ' ' to each substring | |
| 831 * 3 adds ',' to each substring | |
| 832 * \return dest sarray, or NULL on error | |
| 833 * | |
| 834 * <pre> | |
| 835 * Notes: | |
| 836 * (1) Divides %sa into %n essentially equal sets of strings, | |
| 837 * concatenates each set individually, and makes an output | |
| 838 * sarray with the %n concatenations. %n must not exceed the | |
| 839 * number of strings in %sa. | |
| 840 * (2) If addnlflag != 0, adds '\n', ' ' or ',' after each substring. | |
| 841 * </pre> | |
| 842 */ | |
| 843 SARRAY * | |
| 844 sarrayConcatUniformly(SARRAY *sa, | |
| 845 l_int32 n, | |
| 846 l_int32 addnlflag) | |
| 847 { | |
| 848 l_int32 i, first, ntot, nstr; | |
| 849 char *str; | |
| 850 NUMA *na; | |
| 851 SARRAY *saout; | |
| 852 | |
| 853 if (!sa) | |
| 854 return (SARRAY *)ERROR_PTR("sa not defined", __func__, NULL); | |
| 855 ntot = sarrayGetCount(sa); | |
| 856 if (n < 1) | |
| 857 return (SARRAY *)ERROR_PTR("n must be >= 1", __func__, NULL); | |
| 858 if (n > ntot) { | |
| 859 L_ERROR("n = %d > ntot = %d\n", __func__, n, ntot); | |
| 860 return NULL; | |
| 861 } | |
| 862 if (addnlflag != 0 && addnlflag != 1 && addnlflag != 2 && addnlflag != 3) | |
| 863 return (SARRAY *)ERROR_PTR("invalid addnlflag", __func__, NULL); | |
| 864 | |
| 865 saout = sarrayCreate(0); | |
| 866 na = numaGetUniformBinSizes(ntot, n); | |
| 867 for (i = 0, first = 0; i < n; i++) { | |
| 868 numaGetIValue(na, i, &nstr); | |
| 869 str = sarrayToStringRange(sa, first, nstr, addnlflag); | |
| 870 sarrayAddString(saout, str, L_INSERT); | |
| 871 first += nstr; | |
| 872 } | |
| 873 numaDestroy(&na); | |
| 874 return saout; | |
| 875 } | |
| 876 | |
| 877 | |
| 878 /*----------------------------------------------------------------------* | |
| 879 * Join 2 sarrays * | |
| 880 *----------------------------------------------------------------------*/ | |
| 881 /*! | |
| 882 * \brief sarrayJoin() | |
| 883 * | |
| 884 * \param[in] sa1 to be added to | |
| 885 * \param[in] sa2 append to sa1 | |
| 886 * \return 0 if OK, 1 on error | |
| 887 * | |
| 888 * <pre> | |
| 889 * Notes: | |
| 890 * (1) Copies of the strings in sarray2 are added to sarray1. | |
| 891 * </pre> | |
| 892 */ | |
| 893 l_ok | |
| 894 sarrayJoin(SARRAY *sa1, | |
| 895 SARRAY *sa2) | |
| 896 { | |
| 897 char *str; | |
| 898 l_int32 n, i; | |
| 899 | |
| 900 if (!sa1) | |
| 901 return ERROR_INT("sa1 not defined", __func__, 1); | |
| 902 if (!sa2) | |
| 903 return ERROR_INT("sa2 not defined", __func__, 1); | |
| 904 | |
| 905 n = sarrayGetCount(sa2); | |
| 906 for (i = 0; i < n; i++) { | |
| 907 str = sarrayGetString(sa2, i, L_NOCOPY); | |
| 908 if (sarrayAddString(sa1, str, L_COPY) == 1) { | |
| 909 L_ERROR("failed to add string at i = %d\n", __func__, i); | |
| 910 return 1; | |
| 911 } | |
| 912 } | |
| 913 return 0; | |
| 914 } | |
| 915 | |
| 916 | |
| 917 /*! | |
| 918 * \brief sarrayAppendRange() | |
| 919 * | |
| 920 * \param[in] sa1 to be added to | |
| 921 * \param[in] sa2 append specified range of strings in sa2 to sa1 | |
| 922 * \param[in] start index of first string of sa2 to append | |
| 923 * \param[in] end index of last string of sa2 to append; | |
| 924 * -1 to append to end of array | |
| 925 * \return 0 if OK, 1 on error | |
| 926 * | |
| 927 * <pre> | |
| 928 * Notes: | |
| 929 * (1) Copies of the strings in sarray2 are added to sarray1. | |
| 930 * (2) The [start ... end] range is truncated if necessary. | |
| 931 * (3) Use end == -1 to append to the end of sa2. | |
| 932 * </pre> | |
| 933 */ | |
| 934 l_ok | |
| 935 sarrayAppendRange(SARRAY *sa1, | |
| 936 SARRAY *sa2, | |
| 937 l_int32 start, | |
| 938 l_int32 end) | |
| 939 { | |
| 940 char *str; | |
| 941 l_int32 n, i; | |
| 942 | |
| 943 if (!sa1) | |
| 944 return ERROR_INT("sa1 not defined", __func__, 1); | |
| 945 if (!sa2) | |
| 946 return ERROR_INT("sa2 not defined", __func__, 1); | |
| 947 | |
| 948 if (start < 0) | |
| 949 start = 0; | |
| 950 n = sarrayGetCount(sa2); | |
| 951 if (end < 0 || end >= n) | |
| 952 end = n - 1; | |
| 953 if (start > end) | |
| 954 return ERROR_INT("start > end", __func__, 1); | |
| 955 | |
| 956 for (i = start; i <= end; i++) { | |
| 957 str = sarrayGetString(sa2, i, L_NOCOPY); | |
| 958 sarrayAddString(sa1, str, L_COPY); | |
| 959 } | |
| 960 | |
| 961 return 0; | |
| 962 } | |
| 963 | |
| 964 | |
| 965 /*----------------------------------------------------------------------* | |
| 966 * Pad an sarray to be the same size as another sarray * | |
| 967 *----------------------------------------------------------------------*/ | |
| 968 /*! | |
| 969 * \brief sarrayPadToSameSize() | |
| 970 * | |
| 971 * \param[in] sa1, sa2 | |
| 972 * \param[in] padstring | |
| 973 * \return 0 if OK, 1 on error | |
| 974 * | |
| 975 * <pre> | |
| 976 * Notes: | |
| 977 * (1) If two sarrays have different size, this adds enough | |
| 978 * instances of %padstring to the smaller so that they are | |
| 979 * the same size. It is useful when two or more sarrays | |
| 980 * are being sequenced in parallel, and it is necessary to | |
| 981 * find a valid string at each index. | |
| 982 * </pre> | |
| 983 */ | |
| 984 l_ok | |
| 985 sarrayPadToSameSize(SARRAY *sa1, | |
| 986 SARRAY *sa2, | |
| 987 const char *padstring) | |
| 988 { | |
| 989 l_int32 i, n1, n2; | |
| 990 | |
| 991 if (!sa1 || !sa2) | |
| 992 return ERROR_INT("both sa1 and sa2 not defined", __func__, 1); | |
| 993 | |
| 994 n1 = sarrayGetCount(sa1); | |
| 995 n2 = sarrayGetCount(sa2); | |
| 996 if (n1 < n2) { | |
| 997 for (i = n1; i < n2; i++) | |
| 998 sarrayAddString(sa1, padstring, L_COPY); | |
| 999 } else if (n1 > n2) { | |
| 1000 for (i = n2; i < n1; i++) | |
| 1001 sarrayAddString(sa2, padstring, L_COPY); | |
| 1002 } | |
| 1003 | |
| 1004 return 0; | |
| 1005 } | |
| 1006 | |
| 1007 | |
| 1008 /*----------------------------------------------------------------------* | |
| 1009 * Convert word sarray to line sarray * | |
| 1010 *----------------------------------------------------------------------*/ | |
| 1011 /*! | |
| 1012 * \brief sarrayConvertWordsToLines() | |
| 1013 * | |
| 1014 * \param[in] sa sa of individual words | |
| 1015 * \param[in] linesize max num of chars in each line | |
| 1016 * \return saout sa of formatted lines, or NULL on error | |
| 1017 * | |
| 1018 * <pre> | |
| 1019 * Notes: | |
| 1020 * (1) This is useful for re-typesetting text to a specific maximum | |
| 1021 * line length. The individual words in the input sarray | |
| 1022 * are concatenated into textlines. An input word string of zero | |
| 1023 * length is taken to be a paragraph separator. Each time | |
| 1024 * such a string is found, the current line is ended and | |
| 1025 * a new line is also produced that contains just the | |
| 1026 * string of zero length "". When the output sarray | |
| 1027 * of lines is eventually converted to a string with newlines | |
| 1028 * typically appended to each line string, the empty | |
| 1029 * strings are just converted to newlines, producing the visible | |
| 1030 * paragraph separation. | |
| 1031 * (2) What happens when a word is larger than linesize? | |
| 1032 * We write it out as a single line anyway! Words preceding | |
| 1033 * or following this long word are placed on lines preceding | |
| 1034 * or following the line with the long word. Why this choice? | |
| 1035 * Long "words" found in text documents are typically URLs, and | |
| 1036 * it's often desirable not to put newlines in the middle of a URL. | |
| 1037 * The text display program e.g., text editor will typically | |
| 1038 * wrap the long "word" to fit in the window. | |
| 1039 * </pre> | |
| 1040 */ | |
| 1041 SARRAY * | |
| 1042 sarrayConvertWordsToLines(SARRAY *sa, | |
| 1043 l_int32 linesize) | |
| 1044 { | |
| 1045 char *wd, *strl; | |
| 1046 char emptystring[] = ""; | |
| 1047 l_int32 n, i, len, totlen; | |
| 1048 SARRAY *sal, *saout; | |
| 1049 | |
| 1050 if (!sa) | |
| 1051 return (SARRAY *)ERROR_PTR("sa not defined", __func__, NULL); | |
| 1052 | |
| 1053 saout = sarrayCreate(0); | |
| 1054 n = sarrayGetCount(sa); | |
| 1055 totlen = 0; | |
| 1056 sal = NULL; | |
| 1057 for (i = 0; i < n; i++) { | |
| 1058 if (!sal) | |
| 1059 sal = sarrayCreate(0); | |
| 1060 wd = sarrayGetString(sa, i, L_NOCOPY); | |
| 1061 len = strlen(wd); | |
| 1062 if (len == 0) { /* end of paragraph: end line & insert blank line */ | |
| 1063 if (totlen > 0) { | |
| 1064 strl = sarrayToString(sal, 2); | |
| 1065 sarrayAddString(saout, strl, L_INSERT); | |
| 1066 } | |
| 1067 sarrayAddString(saout, emptystring, L_COPY); | |
| 1068 sarrayDestroy(&sal); | |
| 1069 totlen = 0; | |
| 1070 } else if (totlen == 0 && len + 1 > linesize) { /* long word! */ | |
| 1071 sarrayAddString(saout, wd, L_COPY); /* copy to one line */ | |
| 1072 } else if (totlen + len + 1 > linesize) { /* end line & start new */ | |
| 1073 strl = sarrayToString(sal, 2); | |
| 1074 sarrayAddString(saout, strl, L_INSERT); | |
| 1075 sarrayDestroy(&sal); | |
| 1076 sal = sarrayCreate(0); | |
| 1077 sarrayAddString(sal, wd, L_COPY); | |
| 1078 totlen = len + 1; | |
| 1079 } else { /* add to current line */ | |
| 1080 sarrayAddString(sal, wd, L_COPY); | |
| 1081 totlen += len + 1; | |
| 1082 } | |
| 1083 } | |
| 1084 if (totlen > 0) { /* didn't end with blank line; output last line */ | |
| 1085 strl = sarrayToString(sal, 2); | |
| 1086 sarrayAddString(saout, strl, L_INSERT); | |
| 1087 sarrayDestroy(&sal); | |
| 1088 } | |
| 1089 | |
| 1090 return saout; | |
| 1091 } | |
| 1092 | |
| 1093 | |
| 1094 /*----------------------------------------------------------------------* | |
| 1095 * Split string on separator list * | |
| 1096 *----------------------------------------------------------------------*/ | |
| 1097 /* | |
| 1098 * \brief sarraySplitString() | |
| 1099 * | |
| 1100 * \param[in] sa to append to; typically empty initially | |
| 1101 * \param[in] str string to split; not changed | |
| 1102 * \param[in] separators characters that split input string | |
| 1103 * \return 0 if OK, 1 on error. | |
| 1104 * | |
| 1105 * <pre> | |
| 1106 * Notes: | |
| 1107 * (1) This uses strtokSafe(). See the notes there in utils.c. | |
| 1108 * </pre> | |
| 1109 */ | |
| 1110 l_int32 | |
| 1111 sarraySplitString(SARRAY *sa, | |
| 1112 const char *str, | |
| 1113 const char *separators) | |
| 1114 { | |
| 1115 char *cstr, *substr, *saveptr; | |
| 1116 | |
| 1117 if (!sa) | |
| 1118 return ERROR_INT("sa not defined", __func__, 1); | |
| 1119 if (!str) | |
| 1120 return ERROR_INT("str not defined", __func__, 1); | |
| 1121 if (!separators) | |
| 1122 return ERROR_INT("separators not defined", __func__, 1); | |
| 1123 | |
| 1124 cstr = stringNew(str); /* preserves const-ness of input str */ | |
| 1125 saveptr = NULL; | |
| 1126 substr = strtokSafe(cstr, separators, &saveptr); | |
| 1127 if (substr) | |
| 1128 sarrayAddString(sa, substr, L_INSERT); | |
| 1129 while ((substr = strtokSafe(NULL, separators, &saveptr))) | |
| 1130 sarrayAddString(sa, substr, L_INSERT); | |
| 1131 LEPT_FREE(cstr); | |
| 1132 | |
| 1133 return 0; | |
| 1134 } | |
| 1135 | |
| 1136 | |
| 1137 /*----------------------------------------------------------------------* | |
| 1138 * Filter sarray * | |
| 1139 *----------------------------------------------------------------------*/ | |
| 1140 /*! | |
| 1141 * \brief sarraySelectBySubstring() | |
| 1142 * | |
| 1143 * \param[in] sain input sarray | |
| 1144 * \param[in] substr [optional] substring for matching; can be NULL | |
| 1145 * \return saout output sarray, filtered with substring or NULL on error | |
| 1146 * | |
| 1147 * <pre> | |
| 1148 * Notes: | |
| 1149 * (1) This selects all strings in sain that have substr as a substring. | |
| 1150 * Note that we can't use strncmp() because we're looking for | |
| 1151 * a match to the substring anywhere within each filename. | |
| 1152 * (2) If substr == NULL, returns a copy of the sarray. | |
| 1153 * </pre> | |
| 1154 */ | |
| 1155 SARRAY * | |
| 1156 sarraySelectBySubstring(SARRAY *sain, | |
| 1157 const char *substr) | |
| 1158 { | |
| 1159 char *str; | |
| 1160 l_int32 n, i, offset, found; | |
| 1161 SARRAY *saout; | |
| 1162 | |
| 1163 if (!sain) | |
| 1164 return (SARRAY *)ERROR_PTR("sain not defined", __func__, NULL); | |
| 1165 | |
| 1166 n = sarrayGetCount(sain); | |
| 1167 if (!substr || n == 0) | |
| 1168 return sarrayCopy(sain); | |
| 1169 | |
| 1170 saout = sarrayCreate(n); | |
| 1171 for (i = 0; i < n; i++) { | |
| 1172 str = sarrayGetString(sain, i, L_NOCOPY); | |
| 1173 arrayFindSequence((l_uint8 *)str, strlen(str), (l_uint8 *)substr, | |
| 1174 strlen(substr), &offset, &found); | |
| 1175 if (found) | |
| 1176 sarrayAddString(saout, str, L_COPY); | |
| 1177 } | |
| 1178 | |
| 1179 return saout; | |
| 1180 } | |
| 1181 | |
| 1182 | |
| 1183 /*! | |
| 1184 * \brief sarraySelectRange() | |
| 1185 * | |
| 1186 * \param[in] sain input sarray | |
| 1187 * \param[in] first index of first string to be selected | |
| 1188 * \param[in] last index of last string to be selected; | |
| 1189 * use 0 to go to the end of the sarray | |
| 1190 * \return saout output sarray, or NULL on error | |
| 1191 * | |
| 1192 * <pre> | |
| 1193 * Notes: | |
| 1194 * (1) This makes %saout consisting of copies of all strings in %sain | |
| 1195 * in the index set [first ... last]. Use %last == 0 to get all | |
| 1196 * strings from %first to the last string in the sarray. | |
| 1197 * </pre> | |
| 1198 */ | |
| 1199 SARRAY * | |
| 1200 sarraySelectRange(SARRAY *sain, | |
| 1201 l_int32 first, | |
| 1202 l_int32 last) | |
| 1203 { | |
| 1204 char *str; | |
| 1205 l_int32 n, i; | |
| 1206 SARRAY *saout; | |
| 1207 | |
| 1208 if (!sain) | |
| 1209 return (SARRAY *)ERROR_PTR("sain not defined", __func__, NULL); | |
| 1210 if (first < 0) first = 0; | |
| 1211 n = sarrayGetCount(sain); | |
| 1212 if (last <= 0) last = n - 1; | |
| 1213 if (last >= n) { | |
| 1214 L_WARNING("last > n - 1; setting to n - 1\n", __func__); | |
| 1215 last = n - 1; | |
| 1216 } | |
| 1217 if (first > last) | |
| 1218 return (SARRAY *)ERROR_PTR("first must be >= last", __func__, NULL); | |
| 1219 | |
| 1220 saout = sarrayCreate(0); | |
| 1221 for (i = first; i <= last; i++) { | |
| 1222 str = sarrayGetString(sain, i, L_COPY); | |
| 1223 sarrayAddString(saout, str, L_INSERT); | |
| 1224 } | |
| 1225 | |
| 1226 return saout; | |
| 1227 } | |
| 1228 | |
| 1229 | |
| 1230 /*! | |
| 1231 * \brief sarrayParseRange() | |
| 1232 * | |
| 1233 * \param[in] sa input sarray | |
| 1234 * \param[in] start index to start range search | |
| 1235 * \param[out] pactualstart index of actual start; may be > 'start' | |
| 1236 * \param[out] pend index of end | |
| 1237 * \param[out] pnewstart index of start of next range | |
| 1238 * \param[in] substr substring for matching at beginning of string | |
| 1239 * \param[in] loc byte offset within the string for the pattern; | |
| 1240 * use -1 if the location does not matter. | |
| 1241 * \return 0 if valid range found; 1 otherwise | |
| 1242 * | |
| 1243 * <pre> | |
| 1244 * Notes: | |
| 1245 * (1) This finds the range of the next set of strings in SA, | |
| 1246 * beginning the search at 'start', that does NOT have | |
| 1247 * the substring 'substr' either at the indicated location | |
| 1248 * in the string or anywhere in the string. The input | |
| 1249 * variable 'loc' is the specified offset within the string; | |
| 1250 * use -1 to indicate 'anywhere in the string'. | |
| 1251 * (2) Always check the return value to verify that a valid range | |
| 1252 * was found. | |
| 1253 * (3) If a valid range is not found, the values of actstart, | |
| 1254 * end and newstart are all set to the size of sa. | |
| 1255 * (4) If this is the last valid range, newstart returns the value n. | |
| 1256 * In use, this should be tested before calling the function. | |
| 1257 * (5) Usage example. To find all the valid ranges in a file | |
| 1258 * where the invalid lines begin with two dashes, copy each | |
| 1259 * line in the file to a string in an sarray, and do: | |
| 1260 * start = 0; | |
| 1261 * while (!sarrayParseRange(sa, start, &actstart, &end, &start, | |
| 1262 * "--", 0)) | |
| 1263 * lept_stderr("start = %d, end = %d\n", actstart, end); | |
| 1264 * </pre> | |
| 1265 */ | |
| 1266 l_int32 | |
| 1267 sarrayParseRange(SARRAY *sa, | |
| 1268 l_int32 start, | |
| 1269 l_int32 *pactualstart, | |
| 1270 l_int32 *pend, | |
| 1271 l_int32 *pnewstart, | |
| 1272 const char *substr, | |
| 1273 l_int32 loc) | |
| 1274 { | |
| 1275 char *str; | |
| 1276 l_int32 n, i, offset, found; | |
| 1277 | |
| 1278 if (!sa) | |
| 1279 return ERROR_INT("sa not defined", __func__, 1); | |
| 1280 if (!pactualstart || !pend || !pnewstart) | |
| 1281 return ERROR_INT("not all range addresses defined", __func__, 1); | |
| 1282 n = sarrayGetCount(sa); | |
| 1283 *pactualstart = *pend = *pnewstart = n; | |
| 1284 if (!substr) | |
| 1285 return ERROR_INT("substr not defined", __func__, 1); | |
| 1286 | |
| 1287 /* Look for the first string without the marker */ | |
| 1288 if (start < 0 || start >= n) | |
| 1289 return 1; | |
| 1290 for (i = start; i < n; i++) { | |
| 1291 str = sarrayGetString(sa, i, L_NOCOPY); | |
| 1292 arrayFindSequence((l_uint8 *)str, strlen(str), (l_uint8 *)substr, | |
| 1293 strlen(substr), &offset, &found); | |
| 1294 if (loc < 0) { | |
| 1295 if (!found) break; | |
| 1296 } else { | |
| 1297 if (!found || offset != loc) break; | |
| 1298 } | |
| 1299 } | |
| 1300 start = i; | |
| 1301 if (i == n) /* couldn't get started */ | |
| 1302 return 1; | |
| 1303 | |
| 1304 /* Look for the last string without the marker */ | |
| 1305 *pactualstart = start; | |
| 1306 for (i = start + 1; i < n; i++) { | |
| 1307 str = sarrayGetString(sa, i, L_NOCOPY); | |
| 1308 arrayFindSequence((l_uint8 *)str, strlen(str), (l_uint8 *)substr, | |
| 1309 strlen(substr), &offset, &found); | |
| 1310 if (loc < 0) { | |
| 1311 if (found) break; | |
| 1312 } else { | |
| 1313 if (found && offset == loc) break; | |
| 1314 } | |
| 1315 } | |
| 1316 *pend = i - 1; | |
| 1317 start = i; | |
| 1318 if (i == n) /* no further range */ | |
| 1319 return 0; | |
| 1320 | |
| 1321 /* Look for the first string after *pend without the marker. | |
| 1322 * This will start the next run of strings, if it exists. */ | |
| 1323 for (i = start; i < n; i++) { | |
| 1324 str = sarrayGetString(sa, i, L_NOCOPY); | |
| 1325 arrayFindSequence((l_uint8 *)str, strlen(str), (l_uint8 *)substr, | |
| 1326 strlen(substr), &offset, &found); | |
| 1327 if (loc < 0) { | |
| 1328 if (!found) break; | |
| 1329 } else { | |
| 1330 if (!found || offset != loc) break; | |
| 1331 } | |
| 1332 } | |
| 1333 if (i < n) | |
| 1334 *pnewstart = i; | |
| 1335 | |
| 1336 return 0; | |
| 1337 } | |
| 1338 | |
| 1339 | |
| 1340 /*----------------------------------------------------------------------* | |
| 1341 * Serialize for I/O * | |
| 1342 *----------------------------------------------------------------------*/ | |
| 1343 /*! | |
| 1344 * \brief sarrayRead() | |
| 1345 * | |
| 1346 * \param[in] filename | |
| 1347 * \return sarray, or NULL on error | |
| 1348 */ | |
| 1349 SARRAY * | |
| 1350 sarrayRead(const char *filename) | |
| 1351 { | |
| 1352 FILE *fp; | |
| 1353 SARRAY *sa; | |
| 1354 | |
| 1355 if (!filename) | |
| 1356 return (SARRAY *)ERROR_PTR("filename not defined", __func__, NULL); | |
| 1357 | |
| 1358 if ((fp = fopenReadStream(filename)) == NULL) | |
| 1359 return (SARRAY *)ERROR_PTR_1("stream not opened", | |
| 1360 filename, __func__, NULL); | |
| 1361 sa = sarrayReadStream(fp); | |
| 1362 fclose(fp); | |
| 1363 if (!sa) | |
| 1364 return (SARRAY *)ERROR_PTR_1("sa not read", filename, __func__, NULL); | |
| 1365 return sa; | |
| 1366 } | |
| 1367 | |
| 1368 | |
| 1369 /*! | |
| 1370 * \brief sarrayReadStream() | |
| 1371 * | |
| 1372 * \param[in] fp file stream | |
| 1373 * \return sarray, or NULL on error | |
| 1374 * | |
| 1375 * <pre> | |
| 1376 * Notes: | |
| 1377 * (1) We store the size of each string along with the string. | |
| 1378 * The limit on the number of strings is 50M. | |
| 1379 * The limit on the size of any string is 2^30 bytes. | |
| 1380 * (2) This allows a string to have embedded newlines. By reading | |
| 1381 * the entire string, as determined by its size, we are | |
| 1382 * not affected by any number of embedded newlines. | |
| 1383 * (3) It is OK for the sarray to be empty. | |
| 1384 * </pre> | |
| 1385 */ | |
| 1386 SARRAY * | |
| 1387 sarrayReadStream(FILE *fp) | |
| 1388 { | |
| 1389 char *stringbuf; | |
| 1390 l_int32 i, n, size, index, bufsize, version, ignore, success; | |
| 1391 SARRAY *sa; | |
| 1392 | |
| 1393 if (!fp) | |
| 1394 return (SARRAY *)ERROR_PTR("stream not defined", __func__, NULL); | |
| 1395 | |
| 1396 if (fscanf(fp, "\nSarray Version %d\n", &version) != 1) | |
| 1397 return (SARRAY *)ERROR_PTR("not an sarray file", __func__, NULL); | |
| 1398 if (version != SARRAY_VERSION_NUMBER) | |
| 1399 return (SARRAY *)ERROR_PTR("invalid sarray version", __func__, NULL); | |
| 1400 if (fscanf(fp, "Number of strings = %d\n", &n) != 1) | |
| 1401 return (SARRAY *)ERROR_PTR("error on # strings", __func__, NULL); | |
| 1402 if (n < 0) | |
| 1403 return (SARRAY *)ERROR_PTR("num string ptrs <= 0", __func__, NULL); | |
| 1404 if (n > (l_int32)MaxPtrArraySize) | |
| 1405 return (SARRAY *)ERROR_PTR("too many string ptrs", __func__, NULL); | |
| 1406 if (n == 0) L_INFO("the sarray is empty\n", __func__); | |
| 1407 | |
| 1408 success = TRUE; | |
| 1409 if ((sa = sarrayCreate(n)) == NULL) | |
| 1410 return (SARRAY *)ERROR_PTR("sa not made", __func__, NULL); | |
| 1411 bufsize = 512 + 1; | |
| 1412 stringbuf = (char *)LEPT_CALLOC(bufsize, sizeof(char)); | |
| 1413 | |
| 1414 for (i = 0; i < n; i++) { | |
| 1415 /* Get the size of the stored string */ | |
| 1416 if ((fscanf(fp, "%d[%d]:", &index, &size) != 2) || (size > (1 << 30))) { | |
| 1417 success = FALSE; | |
| 1418 L_ERROR("error on string size\n", __func__); | |
| 1419 goto cleanup; | |
| 1420 } | |
| 1421 /* Expand the string buffer if necessary */ | |
| 1422 if (size > bufsize - 5) { | |
| 1423 LEPT_FREE(stringbuf); | |
| 1424 bufsize = (l_int32)(1.5 * size); | |
| 1425 stringbuf = (char *)LEPT_CALLOC(bufsize, sizeof(char)); | |
| 1426 } | |
| 1427 /* Read the stored string, plus leading spaces and trailing \n */ | |
| 1428 if (fread(stringbuf, 1, size + 3, fp) != size + 3) { | |
| 1429 success = FALSE; | |
| 1430 L_ERROR("error reading string\n", __func__); | |
| 1431 goto cleanup; | |
| 1432 } | |
| 1433 /* Remove the \n that was added by sarrayWriteStream() */ | |
| 1434 stringbuf[size + 2] = '\0'; | |
| 1435 /* Copy it in, skipping the 2 leading spaces */ | |
| 1436 sarrayAddString(sa, stringbuf + 2, L_COPY); | |
| 1437 } | |
| 1438 ignore = fscanf(fp, "\n"); | |
| 1439 | |
| 1440 cleanup: | |
| 1441 LEPT_FREE(stringbuf); | |
| 1442 if (!success) sarrayDestroy(&sa); | |
| 1443 return sa; | |
| 1444 } | |
| 1445 | |
| 1446 | |
| 1447 /*! | |
| 1448 * \brief sarrayReadMem() | |
| 1449 * | |
| 1450 * \param[in] data serialization in ascii | |
| 1451 * \param[in] size of data; can use strlen to get it | |
| 1452 * \return sarray, or NULL on error | |
| 1453 */ | |
| 1454 SARRAY * | |
| 1455 sarrayReadMem(const l_uint8 *data, | |
| 1456 size_t size) | |
| 1457 { | |
| 1458 FILE *fp; | |
| 1459 SARRAY *sa; | |
| 1460 | |
| 1461 if (!data) | |
| 1462 return (SARRAY *)ERROR_PTR("data not defined", __func__, NULL); | |
| 1463 if ((fp = fopenReadFromMemory(data, size)) == NULL) | |
| 1464 return (SARRAY *)ERROR_PTR("stream not opened", __func__, NULL); | |
| 1465 | |
| 1466 sa = sarrayReadStream(fp); | |
| 1467 fclose(fp); | |
| 1468 if (!sa) L_ERROR("sarray not read\n", __func__); | |
| 1469 return sa; | |
| 1470 } | |
| 1471 | |
| 1472 | |
| 1473 /*! | |
| 1474 * \brief sarrayWrite() | |
| 1475 * | |
| 1476 * \param[in] filename | |
| 1477 * \param[in] sa string array | |
| 1478 * \return 0 if OK; 1 on error | |
| 1479 */ | |
| 1480 l_ok | |
| 1481 sarrayWrite(const char *filename, | |
| 1482 SARRAY *sa) | |
| 1483 { | |
| 1484 l_int32 ret; | |
| 1485 FILE *fp; | |
| 1486 | |
| 1487 if (!filename) | |
| 1488 return ERROR_INT("filename not defined", __func__, 1); | |
| 1489 if (!sa) | |
| 1490 return ERROR_INT("sa not defined", __func__, 1); | |
| 1491 | |
| 1492 if ((fp = fopenWriteStream(filename, "w")) == NULL) | |
| 1493 return ERROR_INT_1("stream not opened", filename, __func__, 1); | |
| 1494 ret = sarrayWriteStream(fp, sa); | |
| 1495 fclose(fp); | |
| 1496 if (ret) | |
| 1497 return ERROR_INT_1("sa not written to stream", filename, __func__, 1); | |
| 1498 return 0; | |
| 1499 } | |
| 1500 | |
| 1501 | |
| 1502 /*! | |
| 1503 * \brief sarrayWriteStream() | |
| 1504 * | |
| 1505 * \param[in] fp file stream; use NULL to write to stderr | |
| 1506 * \param[in] sa string array | |
| 1507 * \return 0 if OK; 1 on error | |
| 1508 * | |
| 1509 * <pre> | |
| 1510 * Notes: | |
| 1511 * (1) This appends a '\n' to each string, which is stripped | |
| 1512 * off by sarrayReadStream(). | |
| 1513 * </pre> | |
| 1514 */ | |
| 1515 l_ok | |
| 1516 sarrayWriteStream(FILE *fp, | |
| 1517 SARRAY *sa) | |
| 1518 { | |
| 1519 l_int32 i, n, len; | |
| 1520 | |
| 1521 if (!fp) | |
| 1522 return ERROR_INT("stream not defined", __func__, 1); | |
| 1523 if (!sa) | |
| 1524 return sarrayWriteStderr(sa); | |
| 1525 | |
| 1526 n = sarrayGetCount(sa); | |
| 1527 fprintf(fp, "\nSarray Version %d\n", SARRAY_VERSION_NUMBER); | |
| 1528 fprintf(fp, "Number of strings = %d\n", n); | |
| 1529 for (i = 0; i < n; i++) { | |
| 1530 len = strlen(sa->array[i]); | |
| 1531 fprintf(fp, " %d[%d]: %s\n", i, len, sa->array[i]); | |
| 1532 } | |
| 1533 fprintf(fp, "\n"); | |
| 1534 | |
| 1535 return 0; | |
| 1536 } | |
| 1537 | |
| 1538 | |
| 1539 /*! | |
| 1540 * \brief sarrayWriteStderr() | |
| 1541 * | |
| 1542 * \param[in] sa string array | |
| 1543 * \return 0 if OK; 1 on error | |
| 1544 */ | |
| 1545 l_ok | |
| 1546 sarrayWriteStderr(SARRAY *sa) | |
| 1547 { | |
| 1548 l_int32 i, n, len; | |
| 1549 | |
| 1550 if (!sa) | |
| 1551 return ERROR_INT("sa not defined", __func__, 1); | |
| 1552 | |
| 1553 n = sarrayGetCount(sa); | |
| 1554 lept_stderr("\nSarray Version %d\n", SARRAY_VERSION_NUMBER); | |
| 1555 lept_stderr("Number of strings = %d\n", n); | |
| 1556 for (i = 0; i < n; i++) { | |
| 1557 len = strlen(sa->array[i]); | |
| 1558 lept_stderr(" %d[%d]: %s\n", i, len, sa->array[i]); | |
| 1559 } | |
| 1560 lept_stderr("\n"); | |
| 1561 return 0; | |
| 1562 } | |
| 1563 | |
| 1564 | |
| 1565 /*! | |
| 1566 * \brief sarrayWriteMem() | |
| 1567 * | |
| 1568 * \param[out] pdata data of serialized sarray; ascii | |
| 1569 * \param[out] psize size of returned data | |
| 1570 * \param[in] sa | |
| 1571 * \return 0 if OK, 1 on error | |
| 1572 * | |
| 1573 * <pre> | |
| 1574 * Notes: | |
| 1575 * (1) Serializes a sarray in memory and puts the result in a buffer. | |
| 1576 * </pre> | |
| 1577 */ | |
| 1578 l_ok | |
| 1579 sarrayWriteMem(l_uint8 **pdata, | |
| 1580 size_t *psize, | |
| 1581 SARRAY *sa) | |
| 1582 { | |
| 1583 l_int32 ret; | |
| 1584 FILE *fp; | |
| 1585 | |
| 1586 if (pdata) *pdata = NULL; | |
| 1587 if (psize) *psize = 0; | |
| 1588 if (!pdata) | |
| 1589 return ERROR_INT("&data not defined", __func__, 1); | |
| 1590 if (!psize) | |
| 1591 return ERROR_INT("&size not defined", __func__, 1); | |
| 1592 if (!sa) | |
| 1593 return ERROR_INT("sa not defined", __func__, 1); | |
| 1594 | |
| 1595 #if HAVE_FMEMOPEN | |
| 1596 if ((fp = open_memstream((char **)pdata, psize)) == NULL) | |
| 1597 return ERROR_INT("stream not opened", __func__, 1); | |
| 1598 ret = sarrayWriteStream(fp, sa); | |
| 1599 fputc('\0', fp); | |
| 1600 fclose(fp); | |
| 1601 if (*psize > 0) *psize = *psize - 1; | |
| 1602 #else | |
| 1603 L_INFO("no fmemopen API --> work-around: write to temp file\n", __func__); | |
| 1604 #ifdef _WIN32 | |
| 1605 if ((fp = fopenWriteWinTempfile()) == NULL) | |
| 1606 return ERROR_INT("tmpfile stream not opened", __func__, 1); | |
| 1607 #else | |
| 1608 if ((fp = tmpfile()) == NULL) | |
| 1609 return ERROR_INT("tmpfile stream not opened", __func__, 1); | |
| 1610 #endif /* _WIN32 */ | |
| 1611 ret = sarrayWriteStream(fp, sa); | |
| 1612 rewind(fp); | |
| 1613 *pdata = l_binaryReadStream(fp, psize); | |
| 1614 fclose(fp); | |
| 1615 #endif /* HAVE_FMEMOPEN */ | |
| 1616 return ret; | |
| 1617 } | |
| 1618 | |
| 1619 | |
| 1620 /*! | |
| 1621 * \brief sarrayAppend() | |
| 1622 * | |
| 1623 * \param[in] filename | |
| 1624 * \param[in] sa | |
| 1625 * \return 0 if OK; 1 on error | |
| 1626 */ | |
| 1627 l_ok | |
| 1628 sarrayAppend(const char *filename, | |
| 1629 SARRAY *sa) | |
| 1630 { | |
| 1631 FILE *fp; | |
| 1632 | |
| 1633 if (!filename) | |
| 1634 return ERROR_INT("filename not defined", __func__, 1); | |
| 1635 if (!sa) | |
| 1636 return ERROR_INT("sa not defined", __func__, 1); | |
| 1637 | |
| 1638 if ((fp = fopenWriteStream(filename, "a")) == NULL) | |
| 1639 return ERROR_INT_1("stream not opened", filename, __func__, 1); | |
| 1640 if (sarrayWriteStream(fp, sa)) { | |
| 1641 fclose(fp); | |
| 1642 return ERROR_INT_1("sa not appended to stream", filename, __func__, 1); | |
| 1643 } | |
| 1644 | |
| 1645 fclose(fp); | |
| 1646 return 0; | |
| 1647 } | |
| 1648 | |
| 1649 | |
| 1650 /*---------------------------------------------------------------------* | |
| 1651 * Directory filenames * | |
| 1652 *---------------------------------------------------------------------*/ | |
| 1653 /*! | |
| 1654 * \brief getNumberedPathnamesInDirectory() | |
| 1655 * | |
| 1656 * \param[in] dirname directory name | |
| 1657 * \param[in] substr [optional] substring filter on filenames; can be NULL | |
| 1658 * \param[in] numpre number of characters in name before number | |
| 1659 * \param[in] numpost number of characters in name after the number, | |
| 1660 * up to a dot before an extension | |
| 1661 * \param[in] maxnum only consider page numbers up to this value | |
| 1662 * \return sarray of numbered pathnames, or NULL on error | |
| 1663 * | |
| 1664 * <pre> | |
| 1665 * Notes: | |
| 1666 * (1) Returns the full pathnames of the numbered filenames in | |
| 1667 * the directory. The number in the filename is the index | |
| 1668 * into the sarray. For indices for which there are no filenames, | |
| 1669 * an empty string ("") is placed into the sarray. | |
| 1670 * This makes reading numbered files very simple. For example, | |
| 1671 * the image whose filename includes number N can be retrieved using | |
| 1672 * pixReadIndexed(sa, N); | |
| 1673 * (2) If %substr is not NULL, only filenames that contain | |
| 1674 * the substring can be included. If %substr is NULL, | |
| 1675 * all matching filenames are used. | |
| 1676 * (3) If no numbered files are found, it returns an empty sarray, | |
| 1677 * with no initialized strings. | |
| 1678 * (4) It is assumed that the page number is contained within | |
| 1679 * the basename (the filename without directory or extension). | |
| 1680 * %numpre is the number of characters in the basename | |
| 1681 * preceding the actual page number; %numpost is the number | |
| 1682 * following the page number, up to either the end of the | |
| 1683 * basename or a ".", whichever comes first. | |
| 1684 * (5) This is useful when all filenames contain numbers that are | |
| 1685 * not necessarily consecutive. 0-padding is not required. | |
| 1686 * (6) To use a O(n) matching algorithm, the largest page number | |
| 1687 * is found and two internal arrays of this size are created. | |
| 1688 * This maximum is constrained not to exceed %maxsum, | |
| 1689 * to make sure that an unrealistically large number is not | |
| 1690 * accidentally used to determine the array sizes. | |
| 1691 * </pre> | |
| 1692 */ | |
| 1693 SARRAY * | |
| 1694 getNumberedPathnamesInDirectory(const char *dirname, | |
| 1695 const char *substr, | |
| 1696 l_int32 numpre, | |
| 1697 l_int32 numpost, | |
| 1698 l_int32 maxnum) | |
| 1699 { | |
| 1700 l_int32 nfiles; | |
| 1701 SARRAY *sa, *saout; | |
| 1702 | |
| 1703 if (!dirname) | |
| 1704 return (SARRAY *)ERROR_PTR("dirname not defined", __func__, NULL); | |
| 1705 | |
| 1706 if ((sa = getSortedPathnamesInDirectory(dirname, substr, 0, 0)) == NULL) | |
| 1707 return (SARRAY *)ERROR_PTR("sa not made", __func__, NULL); | |
| 1708 if ((nfiles = sarrayGetCount(sa)) == 0) { | |
| 1709 sarrayDestroy(&sa); | |
| 1710 return sarrayCreate(1); | |
| 1711 } | |
| 1712 | |
| 1713 saout = convertSortedToNumberedPathnames(sa, numpre, numpost, maxnum); | |
| 1714 sarrayDestroy(&sa); | |
| 1715 return saout; | |
| 1716 } | |
| 1717 | |
| 1718 | |
| 1719 /*! | |
| 1720 * \brief getSortedPathnamesInDirectory() | |
| 1721 * | |
| 1722 * \param[in] dirname directory name | |
| 1723 * \param[in] substr [optional] substring filter on filenames; can be NULL | |
| 1724 * \param[in] first 0-based | |
| 1725 * \param[in] nfiles use 0 for all to the end | |
| 1726 * \return sarray of sorted pathnames, or NULL on error | |
| 1727 * | |
| 1728 * <pre> | |
| 1729 * Notes: | |
| 1730 * (1) Use %substr to filter filenames in the directory. If | |
| 1731 * %substr == NULL, this takes all files. | |
| 1732 * (2) The files in the directory, after optional filtering by | |
| 1733 * the substring, are lexically sorted in increasing order. | |
| 1734 * Use %first and %nfiles to select a contiguous set of files. | |
| 1735 * (3) The full pathnames are returned for the requested sequence. | |
| 1736 * If no files are found after filtering, returns an empty sarray. | |
| 1737 * </pre> | |
| 1738 */ | |
| 1739 SARRAY * | |
| 1740 getSortedPathnamesInDirectory(const char *dirname, | |
| 1741 const char *substr, | |
| 1742 l_int32 first, | |
| 1743 l_int32 nfiles) | |
| 1744 { | |
| 1745 char *fname, *fullname; | |
| 1746 l_int32 i, n, last; | |
| 1747 SARRAY *sa, *safiles, *saout; | |
| 1748 | |
| 1749 if (!dirname) | |
| 1750 return (SARRAY *)ERROR_PTR("dirname not defined", __func__, NULL); | |
| 1751 | |
| 1752 if ((sa = getFilenamesInDirectory(dirname)) == NULL) | |
| 1753 return (SARRAY *)ERROR_PTR("sa not made", __func__, NULL); | |
| 1754 safiles = sarraySelectBySubstring(sa, substr); | |
| 1755 sarrayDestroy(&sa); | |
| 1756 n = sarrayGetCount(safiles); | |
| 1757 if (n == 0) { | |
| 1758 L_WARNING("no files found\n", __func__); | |
| 1759 return safiles; | |
| 1760 } | |
| 1761 | |
| 1762 sarraySort(safiles, safiles, L_SORT_INCREASING); | |
| 1763 | |
| 1764 first = L_MIN(L_MAX(first, 0), n - 1); | |
| 1765 if (nfiles == 0) | |
| 1766 nfiles = n - first; | |
| 1767 last = L_MIN(first + nfiles - 1, n - 1); | |
| 1768 | |
| 1769 saout = sarrayCreate(last - first + 1); | |
| 1770 for (i = first; i <= last; i++) { | |
| 1771 fname = sarrayGetString(safiles, i, L_NOCOPY); | |
| 1772 fullname = pathJoin(dirname, fname); | |
| 1773 sarrayAddString(saout, fullname, L_INSERT); | |
| 1774 } | |
| 1775 | |
| 1776 sarrayDestroy(&safiles); | |
| 1777 return saout; | |
| 1778 } | |
| 1779 | |
| 1780 | |
| 1781 /*! | |
| 1782 * \brief convertSortedToNumberedPathnames() | |
| 1783 * | |
| 1784 * \param[in] sa sorted pathnames including zero-padded integers | |
| 1785 * \param[in] numpre number of characters in name before number | |
| 1786 * \param[in] numpost number of characters in name after the number, | |
| 1787 * up to a dot before an extension | |
| 1788 * \param[in] maxnum only consider page numbers up to this value | |
| 1789 * \return sarray of numbered pathnames, or NULL on error | |
| 1790 * | |
| 1791 * <pre> | |
| 1792 * Notes: | |
| 1793 * (1) Typically, numpre = numpost = 0; e.g., when the filename | |
| 1794 * just has a number followed by an optional extension. | |
| 1795 * </pre> | |
| 1796 */ | |
| 1797 SARRAY * | |
| 1798 convertSortedToNumberedPathnames(SARRAY *sa, | |
| 1799 l_int32 numpre, | |
| 1800 l_int32 numpost, | |
| 1801 l_int32 maxnum) | |
| 1802 { | |
| 1803 char *fname, *str; | |
| 1804 l_int32 i, nfiles, num, index; | |
| 1805 SARRAY *saout; | |
| 1806 | |
| 1807 if (!sa) | |
| 1808 return (SARRAY *)ERROR_PTR("sa not defined", __func__, NULL); | |
| 1809 if ((nfiles = sarrayGetCount(sa)) == 0) | |
| 1810 return sarrayCreate(1); | |
| 1811 | |
| 1812 /* Find the last file in the sorted array that has a number | |
| 1813 * that (a) matches the count pattern and (b) does not | |
| 1814 * exceed %maxnum. %maxnum sets an upper limit on the size | |
| 1815 * of the sarray. */ | |
| 1816 num = 0; | |
| 1817 for (i = nfiles - 1; i >= 0; i--) { | |
| 1818 fname = sarrayGetString(sa, i, L_NOCOPY); | |
| 1819 num = extractNumberFromFilename(fname, numpre, numpost); | |
| 1820 if (num < 0) continue; | |
| 1821 num = L_MIN(num + 1, maxnum); | |
| 1822 break; | |
| 1823 } | |
| 1824 | |
| 1825 if (num <= 0) /* none found */ | |
| 1826 return sarrayCreate(1); | |
| 1827 | |
| 1828 /* Insert pathnames into the output sarray. | |
| 1829 * Ignore numbers that are out of the range of sarray. */ | |
| 1830 saout = sarrayCreateInitialized(num, ""); | |
| 1831 for (i = 0; i < nfiles; i++) { | |
| 1832 fname = sarrayGetString(sa, i, L_NOCOPY); | |
| 1833 index = extractNumberFromFilename(fname, numpre, numpost); | |
| 1834 if (index < 0 || index >= num) continue; | |
| 1835 str = sarrayGetString(saout, index, L_NOCOPY); | |
| 1836 if (str[0] != '\0') { | |
| 1837 L_WARNING("\n Multiple files with same number: %d\n", | |
| 1838 __func__, index); | |
| 1839 } | |
| 1840 sarrayReplaceString(saout, index, fname, L_COPY); | |
| 1841 } | |
| 1842 | |
| 1843 return saout; | |
| 1844 } | |
| 1845 | |
| 1846 | |
| 1847 /*! | |
| 1848 * \brief getFilenamesInDirectory() | |
| 1849 * | |
| 1850 * \param[in] dirname directory name | |
| 1851 * \return sarray of file names, or NULL on error | |
| 1852 * | |
| 1853 * <pre> | |
| 1854 * Notes: | |
| 1855 * (1) The versions compiled under unix and cygwin use the POSIX C | |
| 1856 * library commands for handling directories. For Windows, | |
| 1857 * there is a separate implementation. | |
| 1858 * (2) It returns an array of filename tails; i.e., only the part of | |
| 1859 * the path after the last slash. | |
| 1860 * (3) Use of the d_type field of dirent is not portable: | |
| 1861 * "According to POSIX, the dirent structure contains a field | |
| 1862 * char d_name[] of unspecified size, with at most NAME_MAX | |
| 1863 * characters preceding the terminating null character. Use | |
| 1864 * of other fields will harm the portability of your programs." | |
| 1865 * (4) As a consequence of (3), we note several things: | |
| 1866 * ~ MINGW doesn't have a d_type member. | |
| 1867 * ~ Older versions of gcc (e.g., 2.95.3) return DT_UNKNOWN | |
| 1868 * for d_type from all files. | |
| 1869 * On these systems, this function will return directories | |
| 1870 * (except for '.' and '..', which are eliminated using | |
| 1871 * the d_name field). | |
| 1872 * (5) For unix, we avoid the bug in earlier versions of realpath() | |
| 1873 * by requiring either POSIX 2008 or use of glibc. | |
| 1874 * | |
| 1875 * </pre> | |
| 1876 */ | |
| 1877 | |
| 1878 #ifndef _WIN32 | |
| 1879 | |
| 1880 SARRAY * | |
| 1881 getFilenamesInDirectory(const char *dirname) | |
| 1882 { | |
| 1883 char *gendir, *realdir, *stat_path; | |
| 1884 size_t size; | |
| 1885 SARRAY *safiles; | |
| 1886 DIR *pdir; | |
| 1887 struct dirent *pdirentry; | |
| 1888 int dfd, stat_ret; | |
| 1889 struct stat st; | |
| 1890 | |
| 1891 if (!dirname) | |
| 1892 return (SARRAY *)ERROR_PTR("dirname not defined", __func__, NULL); | |
| 1893 if (dirname[0] == '\0') | |
| 1894 return (SARRAY *)ERROR_PTR("dirname is empty", __func__, NULL); | |
| 1895 | |
| 1896 /* Who would have thought it was this fiddly to open a directory | |
| 1897 and get the files inside? fstatat() works with relative | |
| 1898 directory paths, and stat() requires using the absolute path. | |
| 1899 realpath() works as follows for files and directories: | |
| 1900 * If the file or directory exists, realpath returns its path; | |
| 1901 else it returns NULL. | |
| 1902 * For realpath() we use the POSIX 2008 implementation, where | |
| 1903 the second arg is NULL and the path is malloc'd and returned | |
| 1904 if the file or directory exists. All versions of glibc | |
| 1905 support this. */ | |
| 1906 gendir = genPathname(dirname, NULL); | |
| 1907 realdir = realpath(gendir, NULL); | |
| 1908 LEPT_FREE(gendir); | |
| 1909 if (realdir == NULL) | |
| 1910 return (SARRAY *)ERROR_PTR("realdir not made", __func__, NULL); | |
| 1911 if ((pdir = opendir(realdir)) == NULL) { | |
| 1912 L_ERROR("directory %s not opened\n", __func__, realdir); | |
| 1913 LEPT_FREE(realdir); | |
| 1914 return NULL; | |
| 1915 } | |
| 1916 safiles = sarrayCreate(0); | |
| 1917 while ((pdirentry = readdir(pdir))) { | |
| 1918 #if HAVE_DIRFD && HAVE_FSTATAT | |
| 1919 /* Platform issues: although Linux has these POSIX functions, | |
| 1920 * AIX doesn't have fstatat() and Solaris doesn't have dirfd(). */ | |
| 1921 dfd = dirfd(pdir); | |
| 1922 stat_ret = fstatat(dfd, pdirentry->d_name, &st, 0); | |
| 1923 #else | |
| 1924 size = strlen(realdir) + strlen(pdirentry->d_name) + 2; | |
| 1925 stat_path = (char *)LEPT_CALLOC(size, 1); | |
| 1926 snprintf(stat_path, size, "%s/%s", realdir, pdirentry->d_name); | |
| 1927 stat_ret = stat(stat_path, &st); | |
| 1928 LEPT_FREE(stat_path); | |
| 1929 #endif | |
| 1930 if (stat_ret == 0 && S_ISDIR(st.st_mode)) | |
| 1931 continue; | |
| 1932 sarrayAddString(safiles, pdirentry->d_name, L_COPY); | |
| 1933 } | |
| 1934 closedir(pdir); | |
| 1935 LEPT_FREE(realdir); | |
| 1936 return safiles; | |
| 1937 } | |
| 1938 | |
| 1939 #else /* _WIN32 */ | |
| 1940 | |
| 1941 /* http://msdn2.microsoft.com/en-us/library/aa365200(VS.85).aspx */ | |
| 1942 #include <windows.h> | |
| 1943 | |
| 1944 SARRAY * | |
| 1945 getFilenamesInDirectory(const char *dirname) | |
| 1946 { | |
| 1947 char *pszDir; | |
| 1948 char *realdir; | |
| 1949 HANDLE hFind = INVALID_HANDLE_VALUE; | |
| 1950 SARRAY *safiles; | |
| 1951 WIN32_FIND_DATAA ffd; | |
| 1952 | |
| 1953 if (!dirname) | |
| 1954 return (SARRAY *)ERROR_PTR("dirname not defined", __func__, NULL); | |
| 1955 | |
| 1956 realdir = genPathname(dirname, NULL); | |
| 1957 pszDir = stringJoin(realdir, "\\*"); | |
| 1958 LEPT_FREE(realdir); | |
| 1959 | |
| 1960 if (strlen(pszDir) + 1 > MAX_PATH) { | |
| 1961 LEPT_FREE(pszDir); | |
| 1962 return (SARRAY *)ERROR_PTR("dirname is too long", __func__, NULL); | |
| 1963 } | |
| 1964 | |
| 1965 if ((safiles = sarrayCreate(0)) == NULL) { | |
| 1966 LEPT_FREE(pszDir); | |
| 1967 return (SARRAY *)ERROR_PTR("safiles not made", __func__, NULL); | |
| 1968 } | |
| 1969 | |
| 1970 hFind = FindFirstFileA(pszDir, &ffd); | |
| 1971 if (INVALID_HANDLE_VALUE == hFind) { | |
| 1972 sarrayDestroy(&safiles); | |
| 1973 LEPT_FREE(pszDir); | |
| 1974 return (SARRAY *)ERROR_PTR("hFind not opened", __func__, NULL); | |
| 1975 } | |
| 1976 | |
| 1977 while (FindNextFileA(hFind, &ffd) != 0) { | |
| 1978 if (ffd.dwFileAttributes & FILE_ATTRIBUTE_DIRECTORY) /* skip dirs */ | |
| 1979 continue; | |
| 1980 convertSepCharsInPath(ffd.cFileName, UNIX_PATH_SEPCHAR); | |
| 1981 sarrayAddString(safiles, ffd.cFileName, L_COPY); | |
| 1982 } | |
| 1983 | |
| 1984 FindClose(hFind); | |
| 1985 LEPT_FREE(pszDir); | |
| 1986 return safiles; | |
| 1987 } | |
| 1988 #endif /* _WIN32 */ |
