/*====================================================================* - Copyright (C) 2001 Leptonica. All rights reserved. - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions - are met: - 1. Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - 2. Redistributions in binary form must reproduce the above - copyright notice, this list of conditions and the following - disclaimer in the documentation and/or other materials - provided with the distribution. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL ANY - CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, - EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, - PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR - PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY - OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING - NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *====================================================================*/ /*! * \file sarray1.c *
 *
 *      Create/Destroy/Copy
 *          SARRAY    *sarrayCreate()
 *          SARRAY    *sarrayCreateInitialized()
 *          SARRAY    *sarrayCreateWordsFromString()
 *          SARRAY    *sarrayCreateLinesFromString()
 *          void      *sarrayDestroy()
 *          SARRAY    *sarrayCopy()
 *          SARRAY    *sarrayClone()
 *
 *      Add/Remove string
 *          l_int32    sarrayAddString()
 *          static l_int32  sarrayExtendArray()
 *          char      *sarrayRemoveString()
 *          l_int32    sarrayReplaceString()
 *          l_int32    sarrayClear()
 *
 *      Accessors
 *          l_int32    sarrayGetCount()
 *          char     **sarrayGetArray()
 *          char      *sarrayGetString()
 *          l_int32    sarrayGetRefcount()
 *          l_int32    sarrayChangeRefcount()
 *
 *      Conversion back to string
 *          char      *sarrayToString()
 *          char      *sarrayToStringRange()
 *
 *      Concatenate strings uniformly within the sarray
 *          SARRAY    *sarrayConcatUniformly()
 *
 *      Join 2 sarrays
 *          l_int32    sarrayJoin()
 *          l_int32    sarrayAppendRange()
 *
 *      Pad an sarray to be the same size as another sarray
 *          l_int32    sarrayPadToSameSize()
 *
 *      Convert word sarray to (formatted) line sarray
 *          SARRAY    *sarrayConvertWordsToLines()
 *
 *      Split string on separator list
 *          SARRAY    *sarraySplitString()
 *
 *      Filter sarray
 *          SARRAY    *sarraySelectBySubstring()
 *          SARRAY    *sarraySelectByRange()
 *          l_int32    sarrayParseRange()
 *
 *      Serialize for I/O
 *          SARRAY    *sarrayRead()
 *          SARRAY    *sarrayReadStream()
 *          SARRAY    *sarrayReadMem()
 *          l_int32    sarrayWrite()
 *          l_int32    sarrayWriteStream()
 *          l_int32    sarrayWriteMem()
 *          l_int32    sarrayAppend()
 *
 *      Directory filenames
 *          SARRAY    *getNumberedPathnamesInDirectory()
 *          SARRAY    *getSortedPathnamesInDirectory()
 *          SARRAY    *convertSortedToNumberedPathnames()
 *          SARRAY    *getFilenamesInDirectory()
 *
 *      These functions are important for efficient manipulation
 *      of string data, and they have found widespread use in
 *      leptonica.  For example:
 *         (1) to generate text files: e.g., PostScript and PDF
 *             wrappers around sets of images
 *         (2) to parse text files: e.g., extracting prototypes
 *             from the source to generate allheaders.h
 *         (3) to generate code for compilation: e.g., the fast
 *             dwa code for arbitrary structuring elements.
 *
 *      Comments on usage:
 *
 *          The user is responsible for correctly disposing of strings
 *          that have been extracted from sarrays.  In the following,
 *          "str_not_owned" means the returned handle does not own the string,
 *          and "str_owned" means the returned handle owns the string.
 *            - To extract a string from an Sarray in order to inspect it
 *              or to make a copy of it later, get a handle to it:
 *                  copyflag = L_NOCOPY.
 *              In this case, you must neither free the string nor put it
 *              directly in another array:
 *                 str_not_owned = sarrayGetString(sa, index, L_NOCOPY);
 *            - To extract a copy of a string from an Sarray, use:
 *                 str_owned = sarrayGetString(sa, index, L_COPY);
 *            - To insert a string that is in one array into another
 *              array (always leaving the first array intact), there are
 *              two options:
 *                 (1) use copyflag = L_COPY to make an immediate copy,
 *                     which you then add to the second array by insertion:
 *                       str_owned = sarrayGetString(sa, index, L_COPY);
 *                       sarrayAddString(sa2, str_owned, L_INSERT);
 *                 (2) use copyflag = L_NOCOPY to get another handle to
 *                     the string; you then add a copy of it to the
 *                     second string array:
 *                       str_not_owned = sarrayGetString(sa, index, L_NOCOPY);
 *                       sarrayAddString(sa2, str_not_owned, L_COPY).
 *              With L_INSERT, sarrayAddString() transfers ownership of the
 *              string to the Sarray, so never use L_INSERT if the string
 *              is owned by another array.
 *
 *              In all cases, when you use copyflag = L_COPY to extract
 *              a string from an array, you must either free it
 *              or insert it in an array that will be freed later.
 * 
*/ #ifdef HAVE_CONFIG_H #include #endif /* HAVE_CONFIG_H */ #include #ifndef _WIN32 #include /* unix only */ #include #include /* needed for realpath() */ #include /* needed for realpath() */ #endif /* ! _WIN32 */ #include "allheaders.h" static const l_uint32 MaxPtrArraySize = 25000000; /* 25 million */ static const l_int32 InitialPtrArraySize = 50; /*!< n'importe quoi */ /* Static functions */ static l_int32 sarrayExtendArray(SARRAY *sa); /*--------------------------------------------------------------------------* * String array create/destroy/copy/extend * *--------------------------------------------------------------------------*/ /*! * \brief sarrayCreate() * * \param[in] n size of string ptr array to be alloc'd; use 0 for default * \return sarray, or NULL on error */ SARRAY * sarrayCreate(l_int32 n) { SARRAY *sa; PROCNAME("sarrayCreate"); if (n <= 0 || n > MaxPtrArraySize) n = InitialPtrArraySize; sa = (SARRAY *)LEPT_CALLOC(1, sizeof(SARRAY)); if ((sa->array = (char **)LEPT_CALLOC(n, sizeof(char *))) == NULL) { sarrayDestroy(&sa); return (SARRAY *)ERROR_PTR("ptr array not made", procName, NULL); } sa->nalloc = n; sa->n = 0; sa->refcount = 1; return sa; } /*! * \brief sarrayCreateInitialized() * * \param[in] n size of string ptr array to be alloc'd * \param[in] initstr string to be initialized on the full array * \return sarray, or NULL on error */ SARRAY * sarrayCreateInitialized(l_int32 n, const char *initstr) { l_int32 i; SARRAY *sa; PROCNAME("sarrayCreateInitialized"); if (n <= 0) return (SARRAY *)ERROR_PTR("n must be > 0", procName, NULL); if (!initstr) return (SARRAY *)ERROR_PTR("initstr not defined", procName, NULL); sa = sarrayCreate(n); for (i = 0; i < n; i++) sarrayAddString(sa, initstr, L_COPY); return sa; } /*! * \brief sarrayCreateWordsFromString() * * \param[in] string * \return sarray, or NULL on error * *
 * Notes:
 *      (1) This finds the number of word substrings, creates an sarray
 *          of this size, and puts copies of each substring into the sarray.
 * 
*/
SARRAY *
sarrayCreateWordsFromString(const char  *string)
{
char      separators[] = " \n\t";
char      ch;
l_int32   pos, len, nwords, inside;
SARRAY   *sa;

    PROCNAME("sarrayCreateWordsFromString");

    if (!string)
        return (SARRAY *)ERROR_PTR("textstr not defined", procName, NULL);

        /* Count the words.  A word starts at each transition from
         * whitespace (space, tab, newline) to any other character. */
    len = strlen(string);
    nwords = 0;
    inside = FALSE;
    for (pos = 0; pos < len; pos++) {
        ch = string[pos];
        if (ch == ' ' || ch == '\t' || ch == '\n') {
            inside = FALSE;
        } else if (!inside) {
            inside = TRUE;
            nwords++;
        }
    }

        /* The count only sizes the ptr array; the actual splitting
         * is done by sarraySplitString() on the same separators. */
    sa = sarrayCreate(nwords);
    if (sa == NULL)
        return (SARRAY *)ERROR_PTR("sa not made", procName, NULL);
    sarraySplitString(sa, string, separators);
    return sa;
}


/*!
 * \brief   sarrayCreateLinesFromString()
 *
 * \param[in]    string
 * \param[in]    blankflag    0 to exclude blank lines; 1 to include
 * \return  sarray, or NULL on error
 *
 *
 * Notes:
 *      (1) This finds the number of line substrings, each of which
 *          ends with a newline, and puts a copy of each substring
 *          in a new sarray.
 *      (2) The newline characters are removed from each substring.
 * 
*/
SARRAY *
sarrayCreateLinesFromString(const char  *string,
                            l_int32      blankflag)
{
char     *work, *line;
l_int32   pos, nchars, nlines, linestart;
SARRAY   *sa;

    PROCNAME("sarrayCreateLinesFromString");

    if (!string)
        return (SARRAY *)ERROR_PTR("textstr not defined", procName, NULL);

        /* The number of newlines sets the initial ptr array size */
    nchars = strlen(string);
    nlines = 0;
    for (pos = 0; pos < nchars; pos++) {
        if (string[pos] == '\n')
            nlines++;
    }
    if ((sa = sarrayCreate(nlines)) == NULL)
        return (SARRAY *)ERROR_PTR("sa not made", procName, NULL);

    if (!blankflag) {
            /* Blank lines are dropped: split on both CR and LF */
        sarraySplitString(sa, string, "\r\n");
        return sa;
    }

        /* Blank lines are kept as empty strings.  Work on a private
         * copy so that terminators can be written in place. */
    if ((work = stringNew(string)) == NULL) {
        sarrayDestroy(&sa);
        return (SARRAY *)ERROR_PTR("cstring not made", procName, NULL);
    }

    linestart = 0;
    for (pos = 0; pos < nchars; pos++) {
        if (work[pos] != '\n')
            continue;
        work[pos] = '\0';
        if (pos > 0 && work[pos - 1] == '\r')
            work[pos - 1] = '\0';  /* also remove Windows CR */
        if ((line = stringNew(work + linestart)) == NULL)
            goto fail;
        sarrayAddString(sa, line, L_INSERT);
        linestart = pos + 1;
    }
    if (linestart < nchars) {  /* no newline at end of last line */
        if ((line = stringNew(work + linestart)) == NULL)
            goto fail;
        sarrayAddString(sa, line, L_INSERT);
    }
    LEPT_FREE(work);
    return sa;

fail:
    sarrayDestroy(&sa);
    LEPT_FREE(work);
    return (SARRAY *)ERROR_PTR("substring not made", procName, NULL);
}


/*!
 * \brief   sarrayDestroy()
 *
 * \param[in,out]   psa    will be set to null before returning
 * \return  void
 *
 *
 * Notes:
 *      (1) Decrements the ref count and, if 0, destroys the sarray.
 *      (2) Always nulls the input ptr.
 * 
*/ void sarrayDestroy(SARRAY **psa) { l_int32 i; SARRAY *sa; PROCNAME("sarrayDestroy"); if (psa == NULL) { L_WARNING("ptr address is NULL!\n", procName); return; } if ((sa = *psa) == NULL) return; sarrayChangeRefcount(sa, -1); if (sarrayGetRefcount(sa) <= 0) { if (sa->array) { for (i = 0; i < sa->n; i++) { if (sa->array[i]) LEPT_FREE(sa->array[i]); } LEPT_FREE(sa->array); } LEPT_FREE(sa); } *psa = NULL; } /*! * \brief sarrayCopy() * * \param[in] sa string array * \return copy of sarray, or NULL on error */ SARRAY * sarrayCopy(SARRAY *sa) { l_int32 i; SARRAY *csa; PROCNAME("sarrayCopy"); if (!sa) return (SARRAY *)ERROR_PTR("sa not defined", procName, NULL); if ((csa = sarrayCreate(sa->nalloc)) == NULL) return (SARRAY *)ERROR_PTR("csa not made", procName, NULL); for (i = 0; i < sa->n; i++) sarrayAddString(csa, sa->array[i], L_COPY); return csa; } /*! * \brief sarrayClone() * * \param[in] sa string array * \return ptr to same sarray, or NULL on error */ SARRAY * sarrayClone(SARRAY *sa) { PROCNAME("sarrayClone"); if (!sa) return (SARRAY *)ERROR_PTR("sa not defined", procName, NULL); sarrayChangeRefcount(sa, 1); return sa; } /*! * \brief sarrayAddString() * * \param[in] sa string array * \param[in] string string to be added * \param[in] copyflag L_INSERT, L_NOCOPY or L_COPY * \return 0 if OK, 1 on error * *
 * Notes:
 *      (1) See usage comments at the top of this file.  L_INSERT is
 *          equivalent to L_NOCOPY.
 * 
*/
l_ok
sarrayAddString(SARRAY      *sa,
                const char  *string,
                l_int32      copyflag)
{
char     *entry;
l_int32   n;

    PROCNAME("sarrayAddString");

    if (!sa)
        return ERROR_INT("sa not defined", procName, 1);
    if (!string)
        return ERROR_INT("string not defined", procName, 1);
    if (copyflag != L_INSERT && copyflag != L_NOCOPY && copyflag != L_COPY)
        return ERROR_INT("invalid copyflag", procName, 1);

        /* Grow the ptr array if every slot is in use */
    n = sarrayGetCount(sa);
    if (n >= sa->nalloc) {
        if (sarrayExtendArray(sa))
            return ERROR_INT("extension failed", procName, 1);
    }

        /* Resolve the pointer to store before touching the array, so a
         * failed copy cannot leave a NULL entry inside the counted range.
         * On any error return the input string is neither stored nor
         * freed here. */
    if (copyflag == L_COPY) {
        if ((entry = stringNew(string)) == NULL)
            return ERROR_INT("copy of string not made", procName, 1);
    } else {  /* L_INSERT or L_NOCOPY: store the caller's pointer */
        entry = (char *)string;
    }

    sa->array[n] = entry;
    sa->n++;
    return 0;
}


/*!
 * \brief   sarrayExtendArray()
 *
 * \param[in]    sa    string array
 * \return  0 if OK, 1 on error
 *
 *
 * Notes:
 *      (1) Doubles the size of the string ptr array.
 *      (2) The max number of strings is 25M.
 * 
*/
static l_int32
sarrayExtendArray(SARRAY  *sa)
{
char   **newarray;
size_t   oldsize, newsize;

    PROCNAME("sarrayExtendArray");

    if (!sa)
        return ERROR_INT("sa not defined", procName, 1);
    if (sa->nalloc > MaxPtrArraySize)  /* belt & suspenders */
        return ERROR_INT("sa has too many ptrs", procName, 1);

    oldsize = sa->nalloc * sizeof(char *);
    newsize = 2 * oldsize;
    if (newsize > 8 * MaxPtrArraySize)  /* ptrs for 25 million strings */
        return ERROR_INT("newsize > 200 MB; too large", procName, 1);

        /* Keep the result in a temporary: assigning it directly to
         * sa->array would overwrite the existing pointer with NULL when
         * the reallocation fails.
         * NOTE(review): presumably reallocNew() leaves the old buffer
         * untouched on failure -- confirm against its implementation. */
    newarray = (char **)reallocNew((void **)&sa->array, oldsize, newsize);
    if (newarray == NULL)
        return ERROR_INT("new ptr array not returned", procName, 1);

    sa->array = newarray;
    sa->nalloc *= 2;
    return 0;
}


/*!
 * \brief   sarrayRemoveString()
 *
 * \param[in]    sa      string array
 * \param[in]    index   of string within sarray
 * \return  removed string, or NULL on error
 *
 * The returned pointer is taken out of the array without being copied
 * or freed, and the strings after it are shifted down by one so that
 * the original ordering is kept.
 */
char *
sarrayRemoveString(SARRAY  *sa,
                   l_int32  index)
{
char    *removed;
char   **array;
l_int32  i, n, nalloc;

    PROCNAME("sarrayRemoveString");

    if (!sa)
        return (char *)ERROR_PTR("sa not defined", procName, NULL);

    if ((array = sarrayGetArray(sa, &nalloc, &n)) == NULL)
        return (char *)ERROR_PTR("array not returned", procName, NULL);

    if (index < 0 || index >= n)
        return (char *)ERROR_PTR("array index out of bounds", procName, NULL);

    removed = array[index];

        /* If removed string is not at end of array, shift
         * to fill in, maintaining original ordering.
         * Note: if we didn't care about the order, we could
         * put the last string array[n - 1] directly into the hole.  */
    for (i = index; i < n - 1; i++)
        array[i] = array[i + 1];

    sa->n--;
    return removed;
}


/*!
 * \brief   sarrayReplaceString()
 *
 * \param[in]    sa         string array
 * \param[in]    index      of string within sarray to be replaced
 * \param[in]    newstr     string to replace existing one
 * \param[in]    copyflag   L_INSERT, L_COPY
 * \return  0 if OK, 1 on error
 *
 *
 * Notes:
 *      (1) This destroys an existing string and replaces it with
 *          the new string or a copy of it.
 *      (2) By design, an sarray is always compacted, so there are
 *          never any holes (null ptrs) in the ptr array up to the
 *          current count.
 * 
*/ l_ok sarrayReplaceString(SARRAY *sa, l_int32 index, char *newstr, l_int32 copyflag) { char *str; l_int32 n; PROCNAME("sarrayReplaceString"); if (!sa) return ERROR_INT("sa not defined", procName, 1); n = sarrayGetCount(sa); if (index < 0 || index >= n) return ERROR_INT("array index out of bounds", procName, 1); if (!newstr) return ERROR_INT("newstr not defined", procName, 1); if (copyflag != L_INSERT && copyflag != L_COPY) return ERROR_INT("invalid copyflag", procName, 1); LEPT_FREE(sa->array[index]); if (copyflag == L_INSERT) str = newstr; else /* L_COPY */ str = stringNew(newstr); sa->array[index] = str; return 0; } /*! * \brief sarrayClear() * * \param[in] sa string array * \return 0 if OK; 1 on error */ l_ok sarrayClear(SARRAY *sa) { l_int32 i; PROCNAME("sarrayClear"); if (!sa) return ERROR_INT("sa not defined", procName, 1); for (i = 0; i < sa->n; i++) { /* free strings and null ptrs */ LEPT_FREE(sa->array[i]); sa->array[i] = NULL; } sa->n = 0; return 0; } /*----------------------------------------------------------------------* * Accessors * *----------------------------------------------------------------------*/ /*! * \brief sarrayGetCount() * * \param[in] sa string array * \return count, or 0 if no strings or on error */ l_int32 sarrayGetCount(SARRAY *sa) { PROCNAME("sarrayGetCount"); if (!sa) return ERROR_INT("sa not defined", procName, 0); return sa->n; } /*! * \brief sarrayGetArray() * * \param[in] sa string array * \param[out] pnalloc [optional] number allocated string ptrs * \param[out] pn [optional] number allocated strings * \return ptr to string array, or NULL on error * *
 * Notes:
 *      (1) Caution: the returned array is not a copy, so caller
 *          must not destroy it!
 * 
*/
char **
sarrayGetArray(SARRAY   *sa,
               l_int32  *pnalloc,
               l_int32  *pn)
{
    PROCNAME("sarrayGetArray");

    if (sa == NULL)
        return (char **)ERROR_PTR("sa not defined", procName, NULL);

        /* Both size outputs are optional */
    if (pnalloc != NULL)
        *pnalloc = sa->nalloc;
    if (pn != NULL)
        *pn = sa->n;

        /* This is the sarray's own ptr array, not a copy */
    return sa->array;
}


/*!
 * \brief   sarrayGetString()
 *
 * \param[in]    sa         string array
 * \param[in]    index      to the index-th string
 * \param[in]    copyflag   L_NOCOPY or L_COPY
 * \return  string, or NULL on error
 *
 *
 * Notes:
 *      (1) See usage comments at the top of this file.
 *      (2) To get a pointer to the string itself, use L_NOCOPY.
 *          To get a copy of the string, use L_COPY.
 * 
*/ char * sarrayGetString(SARRAY *sa, l_int32 index, l_int32 copyflag) { PROCNAME("sarrayGetString"); if (!sa) return (char *)ERROR_PTR("sa not defined", procName, NULL); if (index < 0 || index >= sa->n) return (char *)ERROR_PTR("index not valid", procName, NULL); if (copyflag != L_NOCOPY && copyflag != L_COPY) return (char *)ERROR_PTR("invalid copyflag", procName, NULL); if (copyflag == L_NOCOPY) return sa->array[index]; else /* L_COPY */ return stringNew(sa->array[index]); } /*! * \brief sarrayGetRefCount() * * \param[in] sa string array * \return refcount, or UNDEF on error */ l_int32 sarrayGetRefcount(SARRAY *sa) { PROCNAME("sarrayGetRefcount"); if (!sa) return ERROR_INT("sa not defined", procName, UNDEF); return sa->refcount; } /*! * \brief sarrayChangeRefCount() * * \param[in] sa string array * \param[in] delta change to be applied * \return 0 if OK, 1 on error */ l_ok sarrayChangeRefcount(SARRAY *sa, l_int32 delta) { PROCNAME("sarrayChangeRefcount"); if (!sa) return ERROR_INT("sa not defined", procName, UNDEF); sa->refcount += delta; return 0; } /*----------------------------------------------------------------------* * Conversion to string * *----------------------------------------------------------------------*/ /*! * \brief sarrayToString() * * \param[in] sa string array * \param[in] addnlflag flag: 0 adds nothing to each substring * 1 adds '\n' to each substring * 2 adds ' ' to each substring * 3 adds ',' to each substring * \return dest string, or NULL on error * *
 * Notes:
 *      (1) Concatenates all the strings in the sarray, preserving
 *          all white space.
 *      (2) If addnlflag != 0, adds '\n', ' ' or ',' after each substring.
 *      (3) This function was NOT implemented as:
 *            for (i = 0; i < n; i++)
 *                strcat(dest, sarrayGetString(sa, i, L_NOCOPY));
 *          Do you see why?
 * 
*/
char *
sarrayToString(SARRAY  *sa,
               l_int32  addnlflag)
{
    PROCNAME("sarrayToString");

    if (sa == NULL)
        return (char *)ERROR_PTR("sa not defined", procName, NULL);

        /* first = 0 and nstrings = 0 select the whole array */
    return sarrayToStringRange(sa, 0, 0, addnlflag);
}


/*!
 * \brief   sarrayToStringRange()
 *
 * \param[in]   sa          string array
 * \param[in]   first       index of first string to use; starts with 0
 * \param[in]   nstrings    number of strings to append into the result; use
 *                          0 to append to the end of the sarray
 * \param[in]   addnlflag   flag: 0 adds nothing to each substring
 *                                1 adds '\n' to each substring
 *                                2 adds ' ' to each substring
 *                                3 adds ',' to each substring
 * \return  dest string, or NULL on error
 *
 *
 * Notes:
 *      (1) Concatenates the specified strings in the sarray, preserving
 *          all white space.
 *      (2) If addnlflag != 0, adds '\n', ' ' or ',' after each substring.
 *      (3) If the sarray is empty, this returns a string with just
 *          the character corresponding to %addnlflag.
 * 
*/
char *
sarrayToStringRange(SARRAY  *sa,
                    l_int32  first,
                    l_int32  nstrings,
                    l_int32  addnlflag)
{
char      sepstr[2] = { '\0', '\0' };  /* separator as a C string */
char     *dest, *str;
l_int32   n, i, last;
size_t    size, index, len;

    PROCNAME("sarrayToStringRange");

    if (!sa)
        return (char *)ERROR_PTR("sa not defined", procName, NULL);
    if (addnlflag != 0 && addnlflag != 1 && addnlflag != 2 && addnlflag != 3)
        return (char *)ERROR_PTR("invalid addnlflag", procName, NULL);

        /* Character appended after each string; stays '\0' for none */
    if (addnlflag == 1)
        sepstr[0] = '\n';
    else if (addnlflag == 2)
        sepstr[0] = ' ';
    else if (addnlflag == 3)
        sepstr[0] = ',';

    n = sarrayGetCount(sa);

        /* Empty sa; return char corresponding to addnlflag only */
    if (n == 0) {
        if (first != 0)
            return (char *)ERROR_PTR("first not valid", procName, NULL);
        return stringNew(sepstr);
    }

        /* Determine the range of string indices to be used */
    if (first < 0 || first >= n)
        return (char *)ERROR_PTR("first not valid", procName, NULL);
    if (nstrings == 0 || (nstrings > n - first))
        nstrings = n - first;  /* no overflow */
    last = first + nstrings - 1;

        /* Determine the size of the output string.  The total is kept
         * in a size_t so that a very large amount of text cannot
         * overflow a signed int and produce an undersized buffer. */
    size = 0;
    for (i = first; i <= last; i++) {
        if ((str = sarrayGetString(sa, i, L_NOCOPY)) == NULL)
            return (char *)ERROR_PTR("str not found", procName, NULL);
        size += strlen(str) + 2;
    }
    if ((dest = (char *)LEPT_CALLOC(size + 1, sizeof(char))) == NULL)
        return (char *)ERROR_PTR("dest not made", procName, NULL);

        /* Construct the output.  The write offset is tracked explicitly,
         * and the zero-filled buffer supplies the terminating null. */
    index = 0;
    for (i = first; i <= last; i++) {
        str = sarrayGetString(sa, i, L_NOCOPY);
        len = strlen(str);
        memcpy(dest + index, str, len);
        index += len;
        if (sepstr[0] != '\0')
            dest[index++] = sepstr[0];
    }

    return dest;
}


/*----------------------------------------------------------------------*
 *           Concatenate strings uniformly within the sarray            *
 *----------------------------------------------------------------------*/
/*!
 * \brief   sarrayConcatUniformly()
 *
 * \param[in]    sa          string array
 * \param[in]    n           number of strings in output sarray
 * \param[in]    addnlflag   flag: 0 adds nothing to each substring
 *                                 1 adds '\n' to each substring
 *                                 2 adds ' ' to each substring
 *                                 3 adds ',' to each substring
 * \return  dest sarray, or NULL on error
 *
 *
 * Notes:
 *      (1) Divides %sa into %n essentially equal sets of strings,
 *          concatenates each set individually, and makes an output
 *          sarray with the %n concatenations.  %n must not exceed the
 *          number of strings in %sa.
 *      (2) If addnlflag != 0, adds '\n', ' ' or ',' after each substring.
 * 
*/
SARRAY *
sarrayConcatUniformly(SARRAY  *sa,
                      l_int32  n,
                      l_int32  addnlflag)
{
l_int32  i, first, ntot, nstr;
char    *str;
NUMA    *na;
SARRAY  *saout;

    PROCNAME("sarrayConcatUniformly");

    if (!sa)
        return (SARRAY *)ERROR_PTR("sa not defined", procName, NULL);
    ntot = sarrayGetCount(sa);
    if (n < 1)
        return (SARRAY *)ERROR_PTR("n must be >= 1", procName, NULL);
    if (n > ntot) {
        L_ERROR("n = %d > ntot = %d\n", procName, n, ntot);
        return NULL;
    }
    if (addnlflag != 0 && addnlflag != 1 && addnlflag != 2 && addnlflag != 3)
        return (SARRAY *)ERROR_PTR("invalid addnlflag", procName, NULL);

        /* na supplies the number of input strings for each output string */
    if ((na = numaGetUniformBinSizes(ntot, n)) == NULL)
        return (SARRAY *)ERROR_PTR("na not made", procName, NULL);
    if ((saout = sarrayCreate(n)) == NULL) {
        numaDestroy(&na);
        return (SARRAY *)ERROR_PTR("saout not made", procName, NULL);
    }

    for (i = 0, first = 0; i < n; i++) {
        nstr = 0;  /* defined value even if the lookup fails */
        numaGetIValue(na, i, &nstr);
        str = sarrayToStringRange(sa, first, nstr, addnlflag);
        if (str == NULL || sarrayAddString(saout, str, L_INSERT)) {
                /* A failed add does not take ownership of str */
            LEPT_FREE(str);
            sarrayDestroy(&saout);
            numaDestroy(&na);
            return (SARRAY *)ERROR_PTR("concatenation failed", procName, NULL);
        }
        first += nstr;
    }
    numaDestroy(&na);
    return saout;
}


/*----------------------------------------------------------------------*
 *                           Join 2 sarrays                             *
 *----------------------------------------------------------------------*/
/*!
 * \brief   sarrayJoin()
 *
 * \param[in]    sa1     to be added to
 * \param[in]    sa2     append to sa1
 * \return  0 if OK, 1 on error
 *
 *
 * Notes:
 *      (1) Copies of the strings in sarray2 are added to sarray1.
 * 
*/
l_ok
sarrayJoin(SARRAY  *sa1,
           SARRAY  *sa2)
{
char    *str;
l_int32  n, i;

    PROCNAME("sarrayJoin");

    if (!sa1)
        return ERROR_INT("sa1 not defined", procName, 1);
    if (!sa2)
        return ERROR_INT("sa2 not defined", procName, 1);

        /* The count is read once, before any string is appended */
    n = sarrayGetCount(sa2);
    for (i = 0; i < n; i++) {
        str = sarrayGetString(sa2, i, L_NOCOPY);
            /* Report a failed append instead of returning success
             * with only part of sa2 copied. */
        if (sarrayAddString(sa1, str, L_COPY))
            return ERROR_INT("string not added", procName, 1);
    }

    return 0;
}


/*!
 * \brief   sarrayAppendRange()
 *
 * \param[in]    sa1     to be added to
 * \param[in]    sa2     append specified range of strings in sa2 to sa1
 * \param[in]    start   index of first string of sa2 to append
 * \param[in]    end     index of last string of sa2 to append;
 *                       -1 to append to end of array
 * \return  0 if OK, 1 on error
 *
 *
 * Notes:
 *      (1) Copies of the strings in sarray2 are added to sarray1.
 *      (2) The [start ... end] range is truncated if necessary.
 *      (3) Use end == -1 to append to the end of sa2.
 * 
*/
l_ok
sarrayAppendRange(SARRAY  *sa1,
                  SARRAY  *sa2,
                  l_int32  start,
                  l_int32  end)
{
char    *str;
l_int32  n, i;

    PROCNAME("sarrayAppendRange");

    if (!sa1)
        return ERROR_INT("sa1 not defined", procName, 1);
    if (!sa2)
        return ERROR_INT("sa2 not defined", procName, 1);

        /* Clip [start ... end] to the valid indices of sa2.  With an
         * empty sa2, end becomes -1 and the range check below fails. */
    if (start < 0)
        start = 0;
    n = sarrayGetCount(sa2);
    if (end < 0 || end >= n)
        end = n - 1;
    if (start > end)
        return ERROR_INT("start > end", procName, 1);

    for (i = start; i <= end; i++) {
        str = sarrayGetString(sa2, i, L_NOCOPY);
            /* Report a failed append instead of returning success
             * with only part of the range copied. */
        if (sarrayAddString(sa1, str, L_COPY))
            return ERROR_INT("string not added", procName, 1);
    }

    return 0;
}


/*----------------------------------------------------------------------*
 *          Pad an sarray to be the same size as another sarray         *
 *----------------------------------------------------------------------*/
/*!
 * \brief   sarrayPadToSameSize()
 *
 * \param[in]    sa1, sa2
 * \param[in]    padstring
 * \return  0 if OK, 1 on error
 *
 *
 * Notes:
 *      (1) If two sarrays have different size, this adds enough
 *          instances of %padstring to the smaller so that they are
 *          the same size.  It is useful when two or more sarrays
 *          are being sequenced in parallel, and it is necessary to
 *          find a valid string at each index.
 * 
*/
l_ok
sarrayPadToSameSize(SARRAY      *sa1,
                    SARRAY      *sa2,
                    const char  *padstring)
{
l_int32  i, n1, n2, npad;
SARRAY  *shorter;

    PROCNAME("sarrayPadToSameSize");

    if (!sa1 || !sa2)
        return ERROR_INT("both sa1 and sa2 not defined", procName, 1);

    n1 = sarrayGetCount(sa1);
    n2 = sarrayGetCount(sa2);
    if (n1 == n2)  /* nothing to pad; padstring is not used */
        return 0;

        /* Without this check every add below would fail and the
         * function would still return 0 with the sizes unequal. */
    if (!padstring)
        return ERROR_INT("padstring not defined", procName, 1);

        /* Append copies of padstring to whichever array is shorter */
    if (n1 < n2) {
        shorter = sa1;
        npad = n2 - n1;
    } else {
        shorter = sa2;
        npad = n1 - n2;
    }
    for (i = 0; i < npad; i++) {
        if (sarrayAddString(shorter, padstring, L_COPY))
            return ERROR_INT("padstring not added", procName, 1);
    }

    return 0;
}


/*----------------------------------------------------------------------*
 *                   Convert word sarray to line sarray                 *
 *----------------------------------------------------------------------*/
/*!
 * \brief   sarrayConvertWordsToLines()
 *
 * \param[in]    sa         sa of individual words
 * \param[in]    linesize   max num of chars in each line
 * \return  saout sa of formatted lines, or NULL on error
 *
 *
 * Notes:
 *      (1) This is useful for re-typesetting text to a specific maximum
 *          line length.  The individual words in the input sarray
 *          are concatenated into textlines.  An input word string of zero
 *          length is taken to be a paragraph separator.  Each time
 *          such a string is found, the current line is ended and
 *          a new line is also produced that contains just the
 *          string of zero length "".  When the output sarray
 *          of lines is eventually converted to a string with newlines
 *          typically appended to each line string, the empty
 *          strings are just converted to newlines, producing the visible
 *          paragraph separation.
 *      (2) What happens when a word is larger than linesize?
 *          We write it out as a single line anyway!  Words preceding
 *          or following this long word are placed on lines preceding
 *          or following the line with the long word.  Why this choice?
 *          Long "words" found in text documents are typically URLs, and
 *          it's often desirable not to put newlines in the middle of a URL.
 *          The text display program e.g., text editor will typically
 *          wrap the long "word" to fit in the window.
 * 
*/
SARRAY *
sarrayConvertWordsToLines(SARRAY  *sa,
                          l_int32  linesize)
{
char    *wd, *strl;
char     emptystring[] = "";
l_int32  n, i, len, totlen;
SARRAY  *sal, *saout;

    PROCNAME("sarrayConvertWordsToLines");

    if (!sa)
        return (SARRAY *)ERROR_PTR("sa not defined", procName, NULL);

    if ((saout = sarrayCreate(0)) == NULL)
        return (SARRAY *)ERROR_PTR("saout not made", procName, NULL);

        /* sal collects the words of the line being built; totlen is
         * the length of that line, counting one space after each word. */
    n = sarrayGetCount(sa);
    totlen = 0;
    sal = NULL;
    for (i = 0; i < n; i++) {
        if (!sal)
            sal = sarrayCreate(0);
        wd = sarrayGetString(sa, i, L_NOCOPY);
        len = strlen(wd);
        if (len == 0) {  /* end of paragraph: end line & insert blank line */
            if (totlen > 0) {
                strl = sarrayToString(sal, 2);
                sarrayAddString(saout, strl, L_INSERT);
            }
            sarrayAddString(saout, emptystring, L_COPY);
            sarrayDestroy(&sal);
            totlen = 0;
        } else if (totlen == 0 && len + 1 > linesize) {  /* long word! */
            sarrayAddString(saout, wd, L_COPY);  /* copy to one line */
        } else if (totlen + len + 1 > linesize) {  /* end line & start new */
            strl = sarrayToString(sal, 2);
            sarrayAddString(saout, strl, L_INSERT);
            sarrayDestroy(&sal);
            sal = sarrayCreate(0);
            sarrayAddString(sal, wd, L_COPY);
            totlen = len + 1;
        } else {  /* add to current line */
            sarrayAddString(sal, wd, L_COPY);
            totlen += len + 1;
        }
    }
    if (totlen > 0) {   /* didn't end with blank line; output last line */
        strl = sarrayToString(sal, 2);
        sarrayAddString(saout, strl, L_INSERT);
    }

        /* Always release the working array.  It can exist with
         * totlen == 0, e.g. when the last word was written out as a
         * long word; sarrayDestroy() does nothing if it is already NULL. */
    sarrayDestroy(&sal);
    return saout;
}


/*----------------------------------------------------------------------*
 *                    Split string on separator list                    *
 *----------------------------------------------------------------------*/
/*!
 * \brief   sarraySplitString()
 *
 * \param[in]    sa           to append to; typically empty initially
 * \param[in]    str          string to split; not changed
 * \param[in]    separators   characters that split input string
 * \return  0 if OK, 1 on error.
 *
 *
 * Notes:
 *      (1) This uses strtokSafe().  See the notes there in utils.c.
 * 
*/
l_int32
sarraySplitString(SARRAY      *sa,
                  const char  *str,
                  const char  *separators)
{
char  *work, *token, *state;

    PROCNAME("sarraySplitString");

    if (!sa)
        return ERROR_INT("sa not defined", procName, 1);
    if (!str)
        return ERROR_INT("str not defined", procName, 1);
    if (!separators)
        return ERROR_INT("separators not defined", procName, 1);

        /* Tokenize a private copy; the input string is left unchanged */
    work = stringNew(str);
    state = NULL;

        /* The first call passes the string; later calls pass NULL and
         * continue from the saved state.  Each token returned is
         * inserted into sa without further copying. */
    token = strtokSafe(work, separators, &state);
    if (token != NULL)
        sarrayAddString(sa, token, L_INSERT);
    for (;;) {
        token = strtokSafe(NULL, separators, &state);
        if (token == NULL)
            break;
        sarrayAddString(sa, token, L_INSERT);
    }

    LEPT_FREE(work);
    return 0;
}


/*----------------------------------------------------------------------*
 *                              Filter sarray                           *
 *----------------------------------------------------------------------*/
/*!
 * \brief   sarraySelectBySubstring()
 *
 * \param[in]    sain     input sarray
 * \param[in]    substr   [optional] substring for matching; can be NULL
 * \return  saout output sarray, filtered with substring or NULL on error
 *
 *
 * Notes:
 *      (1) This selects all strings in sain that have substr as a substring.
 *          Note that we can't use strncmp() because we're looking for
 *          a match to the substring anywhere within each filename.
 *      (2) If substr == NULL, returns a copy of the sarray.
 * 
*/
SARRAY *
sarraySelectBySubstring(SARRAY      *sain,
                        const char  *substr)
{
char     *candidate;
l_int32   count, k, offset, found;
size_t    sublen;
SARRAY   *saout;

    PROCNAME("sarraySelectBySubstring");

    if (!sain)
        return (SARRAY *)ERROR_PTR("sain not defined", procName, NULL);

        /* With no pattern, or nothing to filter, return a plain copy */
    count = sarrayGetCount(sain);
    if (!substr || count == 0)
        return sarrayCopy(sain);

    saout = sarrayCreate(count);
    sublen = strlen(substr);  /* same for every string tested */
    for (k = 0; k < count; k++) {
        candidate = sarrayGetString(sain, k, L_NOCOPY);
            /* Search for the pattern anywhere within the string */
        arrayFindSequence((l_uint8 *)candidate, strlen(candidate),
                          (l_uint8 *)substr, sublen, &offset, &found);
        if (found)
            sarrayAddString(saout, candidate, L_COPY);
    }

    return saout;
}


/*!
 * \brief   sarraySelectByRange()
 *
 * \param[in]    sain    input sarray
 * \param[in]    first   index of first string to be selected
 * \param[in]    last    index of last string to be selected;
 *                       use 0 to go to the end of the sarray
 * \return  saout output sarray, or NULL on error
 *
 *
 * Notes:
 *      (1) This makes %saout consisting of copies of all strings in %sain
 *          in the index set [first ... last].  Use %last == 0 to get all
 *          strings from %first to the last string in the sarray.
 * 
*/
SARRAY *
sarraySelectByRange(SARRAY  *sain,
                    l_int32  first,
                    l_int32  last)
{
char    *str;
l_int32  n, i;
SARRAY  *saout;

    PROCNAME("sarraySelectByRange");

    if (!sain)
        return (SARRAY *)ERROR_PTR("sain not defined", procName, NULL);

        /* Clamp the requested range to the valid indices [0 ... n - 1].
         * A non-positive %last means "through the final string". */
    if (first < 0) first = 0;
    n = sarrayGetCount(sain);
    if (last <= 0) last = n - 1;
    if (last >= n) {
        L_WARNING("last > n - 1; setting to n - 1\n", procName);
        last = n - 1;
    }

        /* After clamping, an empty range is an error.  This is also the
         * result for an empty input sarray, where last becomes -1. */
    if (first > last)
        return (SARRAY *)ERROR_PTR("first must be <= last", procName, NULL);

        /* Each selected string is copied; the copy is owned by saout */
    saout = sarrayCreate(0);
    for (i = first; i <= last; i++) {
        str = sarrayGetString(sain, i, L_COPY);
        sarrayAddString(saout, str, L_INSERT);
    }

    return saout;
}

/*!
 * \brief   sarrayParseRange()
 *
 * \param[in]    sa             input sarray
 * \param[in]    start          index to start range search
 * \param[out]   pactualstart   index of actual start; may be > 'start'
 * \param[out]   pend           index of end
 * \param[out]   pnewstart      index of start of next range
 * \param[in]    substr         substring for matching at beginning of string
 * \param[in]    loc            byte offset within the string for the pattern;
 *                              use -1 if the location does not matter.
 * \return  0 if valid range found; 1 otherwise
 *
 *
 * Notes:
 *      (1) This finds the range of the next set of strings in SA,
 *          beginning the search at 'start', that does NOT have
 *          the substring 'substr' either at the indicated location
 *          in the string or anywhere in the string.  The input
 *          variable 'loc' is the specified offset within the string;
 *          use -1 to indicate 'anywhere in the string'.
 *      (2) Always check the return value to verify that a valid range
 *          was found.
 *      (3) If a valid range is not found, the values of actstart,
 *          end and newstart are all set to the size of sa.
 *      (4) If this is the last valid range, newstart returns the value n.
 *          In use, this should be tested before calling the function.
 *      (5) Usage example.  To find all the valid ranges in a file
 *          where the invalid lines begin with two dashes, copy each
 *          line in the file to a string in an sarray, and do:
 *             start = 0;
 *             while (!sarrayParseRange(sa, start, &actstart, &end, &start,
 *                    "--", 0))
 *                 lept_stderr("start = %d, end = %d\n", actstart, end);
 * 
*/
l_int32
sarrayParseRange(SARRAY      *sa,
                 l_int32      start,
                 l_int32     *pactualstart,
                 l_int32     *pend,
                 l_int32     *pnewstart,
                 const char  *substr,
                 l_int32      loc)
{
char    *line;
l_int32  count, idx, pos, hit, marked;

    PROCNAME("sarrayParseRange");

    if (!sa)
        return ERROR_INT("sa not defined", procName, 1);
    if (!pactualstart || !pend || !pnewstart)
        return ERROR_INT("not all range addresses defined", procName, 1);

        /* Default all three outputs to the array size, which is what
         * the caller sees whenever no valid range is found. */
    count = sarrayGetCount(sa);
    *pactualstart = *pend = *pnewstart = count;
    if (!substr)
        return ERROR_INT("substr not defined", procName, 1);

    if (start < 0 || start >= count)
        return 1;

        /* A string is "marked" when it contains substr and, if a
         * location was requested (loc >= 0), the match is at that offset.
         * Phase 1: skip marked strings to find where the range begins. */
    for (idx = start; idx < count; idx++) {
        line = sarrayGetString(sa, idx, L_NOCOPY);
        arrayFindSequence((l_uint8 *)line, strlen(line),
                          (l_uint8 *)substr, strlen(substr), &pos, &hit);
        marked = hit && (loc < 0 || pos == loc);
        if (!marked)
            break;
    }
    start = idx;
    if (idx == count)  /* every remaining string was marked */
        return 1;

        /* Phase 2: extend the range until the next marked string */
    *pactualstart = start;
    for (idx = start + 1; idx < count; idx++) {
        line = sarrayGetString(sa, idx, L_NOCOPY);
        arrayFindSequence((l_uint8 *)line, strlen(line),
                          (l_uint8 *)substr, strlen(substr), &pos, &hit);
        marked = hit && (loc < 0 || pos == loc);
        if (marked)
            break;
    }
    *pend = idx - 1;
    start = idx;
    if (idx == count)  /* the range runs to the end; nothing follows */
        return 0;

        /* Phase 3: skip the marked strings after *pend to find the
         * start of the next range, if there is one. */
    for (idx = start; idx < count; idx++) {
        line = sarrayGetString(sa, idx, L_NOCOPY);
        arrayFindSequence((l_uint8 *)line, strlen(line),
                          (l_uint8 *)substr, strlen(substr), &pos, &hit);
        marked = hit && (loc < 0 || pos == loc);
        if (!marked)
            break;
    }
    if (idx < count)
        *pnewstart = idx;

    return 0;
}


/*----------------------------------------------------------------------*
 *                           Serialize for I/O                          *
 *----------------------------------------------------------------------*/
/*!
* \brief sarrayRead() * * \param[in] filename * \return sarray, or NULL on error */ SARRAY * sarrayRead(const char *filename) { FILE *fp; SARRAY *sa; PROCNAME("sarrayRead"); if (!filename) return (SARRAY *)ERROR_PTR("filename not defined", procName, NULL); if ((fp = fopenReadStream(filename)) == NULL) return (SARRAY *)ERROR_PTR("stream not opened", procName, NULL); sa = sarrayReadStream(fp); fclose(fp); if (!sa) return (SARRAY *)ERROR_PTR("sa not read", procName, NULL); return sa; } /*! * \brief sarrayReadStream() * * \param[in] fp file stream * \return sarray, or NULL on error * *
 * Notes:
 *      (1) We store the size of each string along with the string.
 *          The limit on the number of strings is 25M.
 *          The limit on the size of any string is 2^30 bytes.
 *      (2) This allows a string to have embedded newlines.  By reading
 *          the entire string, as determined by its size, we are
 *          not affected by any number of embedded newlines.
 *      (3) It is OK for the sarray to be empty.
 * 
*/
SARRAY *
sarrayReadStream(FILE  *fp)
{
char    *stringbuf;
l_int32  i, n, size, index, bufsize, version, ignore, success;
SARRAY  *sa;

    PROCNAME("sarrayReadStream");

    if (!fp)
        return (SARRAY *)ERROR_PTR("stream not defined", procName, NULL);

        /* Parse and validate the header */
    if (fscanf(fp, "\nSarray Version %d\n", &version) != 1)
        return (SARRAY *)ERROR_PTR("not an sarray file", procName, NULL);
    if (version != SARRAY_VERSION_NUMBER)
        return (SARRAY *)ERROR_PTR("invalid sarray version", procName, NULL);
    if (fscanf(fp, "Number of strings = %d\n", &n) != 1)
        return (SARRAY *)ERROR_PTR("error on # strings", procName, NULL);
    if (n < 0)
        return (SARRAY *)ERROR_PTR("num string ptrs <= 0", procName, NULL);
    if (n > MaxPtrArraySize)
        return (SARRAY *)ERROR_PTR("too many string ptrs", procName, NULL);
    if (n == 0) L_INFO("the sarray is empty\n", procName);

    success = TRUE;
    if ((sa = sarrayCreate(n)) == NULL)
        return (SARRAY *)ERROR_PTR("sa not made", procName, NULL);
    bufsize = 512 + 1;
    stringbuf = (char *)LEPT_CALLOC(bufsize, sizeof(char));
    if (!stringbuf) {
        sarrayDestroy(&sa);
        return (SARRAY *)ERROR_PTR("stringbuf not made", procName, NULL);
    }

    for (i = 0; i < n; i++) {
            /* Get the size of the stored string.  The size comes from
             * the file, so it must be range-checked on both ends: a
             * negative value would turn the fread() count below into a
             * huge unsigned number and index stringbuf out of bounds. */
        if ((fscanf(fp, "%d[%d]:", &index, &size) != 2) ||
            (size < 0) || (size > (1 << 30))) {
            success = FALSE;
            L_ERROR("error on string size\n", procName);
            goto cleanup;
        }
            /* Expand the string buffer if necessary */
        if (size > bufsize - 5) {
            LEPT_FREE(stringbuf);
            bufsize = (l_int32)(1.5 * size);
            stringbuf = (char *)LEPT_CALLOC(bufsize, sizeof(char));
            if (!stringbuf) {
                success = FALSE;
                L_ERROR("stringbuf not made\n", procName);
                goto cleanup;
            }
        }
            /* Read the stored string, plus leading spaces and trailing \n */
        if (fread(stringbuf, 1, size + 3, fp) != (size_t)(size + 3)) {
            success = FALSE;
            L_ERROR("error reading string\n", procName);
            goto cleanup;
        }
            /* Remove the \n that was added by sarrayWriteStream() */
        stringbuf[size + 2] = '\0';
            /* Copy it in, skipping the 2 leading spaces */
        sarrayAddString(sa, stringbuf + 2, L_COPY);
    }
    ignore = fscanf(fp, "\n");

cleanup:
        /* On any failure the partially built sarray is destroyed,
         * which also sets sa to NULL for the return. */
    LEPT_FREE(stringbuf);
    if (!success) sarrayDestroy(&sa);
    return sa;
}

/*!
* \brief sarrayReadMem() * * \param[in] data serialization in ascii * \param[in] size of data; can use strlen to get it * \return sarray, or NULL on error */ SARRAY * sarrayReadMem(const l_uint8 *data, size_t size) { FILE *fp; SARRAY *sa; PROCNAME("sarrayReadMem"); if (!data) return (SARRAY *)ERROR_PTR("data not defined", procName, NULL); if ((fp = fopenReadFromMemory(data, size)) == NULL) return (SARRAY *)ERROR_PTR("stream not opened", procName, NULL); sa = sarrayReadStream(fp); fclose(fp); if (!sa) L_ERROR("sarray not read\n", procName); return sa; } /*! * \brief sarrayWrite() * * \param[in] filename * \param[in] sa string array * \return 0 if OK; 1 on error */ l_ok sarrayWrite(const char *filename, SARRAY *sa) { l_int32 ret; FILE *fp; PROCNAME("sarrayWrite"); if (!filename) return ERROR_INT("filename not defined", procName, 1); if (!sa) return ERROR_INT("sa not defined", procName, 1); if ((fp = fopenWriteStream(filename, "w")) == NULL) return ERROR_INT("stream not opened", procName, 1); ret = sarrayWriteStream(fp, sa); fclose(fp); if (ret) return ERROR_INT("sa not written to stream", procName, 1); return 0; } /*! * \brief sarrayWriteStream() * * \param[in] fp file stream * \param[in] sa string array * \return 0 if OK; 1 on error * *
 * Notes:
 *      (1) This appends a '\n' to each string, which is stripped
 *          off by sarrayReadStream().
 * 
*/
l_ok
sarrayWriteStream(FILE    *fp,
                  SARRAY  *sa)
{
l_int32  i, n, len;

    PROCNAME("sarrayWriteStream");

    if (!fp)
        return ERROR_INT("stream not defined", procName, 1);
    if (!sa)
        return ERROR_INT("sa not defined", procName, 1);

        /* Header: version and string count */
    n = sarrayGetCount(sa);
    fprintf(fp, "\nSarray Version %d\n", SARRAY_VERSION_NUMBER);
    fprintf(fp, "Number of strings = %d\n", n);

        /* One record per string: index, byte length, then the string.
         * The record must have exactly two spaces between the colon
         * and the string, followed by one newline: sarrayReadStream()
         * reads (length + 3) bytes after the colon and skips the
         * first two of them. */
    for (i = 0; i < n; i++) {
        len = strlen(sa->array[i]);
        fprintf(fp, " %d[%d]:  %s\n", i, len, sa->array[i]);
    }
    fprintf(fp, "\n");

    return 0;
}

/*!
 * \brief   sarrayWriteMem()
 *
 * \param[out]   pdata    data of serialized sarray; ascii
 * \param[out]   psize    size of returned data
 * \param[in]    sa
 * \return  0 if OK, 1 on error
 *
 *
 * Notes:
 *      (1) Serializes a sarray in memory and puts the result in a buffer.
 * 
*/ l_ok sarrayWriteMem(l_uint8 **pdata, size_t *psize, SARRAY *sa) { l_int32 ret; FILE *fp; PROCNAME("sarrayWriteMem"); if (pdata) *pdata = NULL; if (psize) *psize = 0; if (!pdata) return ERROR_INT("&data not defined", procName, 1); if (!psize) return ERROR_INT("&size not defined", procName, 1); if (!sa) return ERROR_INT("sa not defined", procName, 1); #if HAVE_FMEMOPEN if ((fp = open_memstream((char **)pdata, psize)) == NULL) return ERROR_INT("stream not opened", procName, 1); ret = sarrayWriteStream(fp, sa); #else L_INFO("work-around: writing to a temp file\n", procName); #ifdef _WIN32 if ((fp = fopenWriteWinTempfile()) == NULL) return ERROR_INT("tmpfile stream not opened", procName, 1); #else if ((fp = tmpfile()) == NULL) return ERROR_INT("tmpfile stream not opened", procName, 1); #endif /* _WIN32 */ ret = sarrayWriteStream(fp, sa); rewind(fp); *pdata = l_binaryReadStream(fp, psize); #endif /* HAVE_FMEMOPEN */ fclose(fp); return ret; } /*! * \brief sarrayAppend() * * \param[in] filename * \param[in] sa * \return 0 if OK; 1 on error */ l_ok sarrayAppend(const char *filename, SARRAY *sa) { FILE *fp; PROCNAME("sarrayAppend"); if (!filename) return ERROR_INT("filename not defined", procName, 1); if (!sa) return ERROR_INT("sa not defined", procName, 1); if ((fp = fopenWriteStream(filename, "a")) == NULL) return ERROR_INT("stream not opened", procName, 1); if (sarrayWriteStream(fp, sa)) { fclose(fp); return ERROR_INT("sa not appended to stream", procName, 1); } fclose(fp); return 0; } /*---------------------------------------------------------------------* * Directory filenames * *---------------------------------------------------------------------*/ /*! 
 * \brief   getNumberedPathnamesInDirectory()
 *
 * \param[in]    dirname   directory name
 * \param[in]    substr    [optional] substring filter on filenames; can be NULL
 * \param[in]    numpre    number of characters in name before number
 * \param[in]    numpost   number of characters in name after the number,
 *                         up to a dot before an extension
 * \param[in]    maxnum    only consider page numbers up to this value
 * \return  sarray of numbered pathnames, or NULL on error
 *
 *
 * Notes:
 *      (1) Returns the full pathnames of the numbered filenames in
 *          the directory.  The number in the filename is the index
 *          into the sarray.  For indices for which there are no filenames,
 *          an empty string ("") is placed into the sarray.
 *          This makes reading numbered files very simple.  For example,
 *          the image whose filename includes number N can be retrieved using
 *               pixReadIndexed(sa, N);
 *      (2) If %substr is not NULL, only filenames that contain
 *          the substring can be included.  If %substr is NULL,
 *          all matching filenames are used.
 *      (3) If no numbered files are found, it returns an empty sarray,
 *          with no initialized strings.
 *      (4) It is assumed that the page number is contained within
 *          the basename (the filename without directory or extension).
 *          %numpre is the number of characters in the basename
 *          preceding the actual page number; %numpost is the number
 *          following the page number, up to either the end of the
 *          basename or a ".", whichever comes first.
 *      (5) This is useful when all filenames contain numbers that are
 *          not necessarily consecutive.  0-padding is not required.
 *      (6) To use a O(n) matching algorithm, the largest page number
 *          is found and two internal arrays of this size are created.
 *          This maximum is constrained not to exceed %maxnum,
 *          to make sure that an unrealistically large number is not
 *          accidentally used to determine the array sizes.
 * 
*/
SARRAY *
getNumberedPathnamesInDirectory(const char  *dirname,
                                const char  *substr,
                                l_int32      numpre,
                                l_int32      numpost,
                                l_int32      maxnum)
{
SARRAY  *sorted, *numbered;

    PROCNAME("getNumberedPathnamesInDirectory");

    if (!dirname)
        return (SARRAY *)ERROR_PTR("dirname not defined", procName, NULL);

        /* Take every matching file in the directory, sorted */
    sorted = getSortedPathnamesInDirectory(dirname, substr, 0, 0);
    if (sorted == NULL)
        return (SARRAY *)ERROR_PTR("sa not made", procName, NULL);

        /* With no files there is nothing to number: return an empty
         * sarray.  Otherwise place each path at its number's index. */
    if (sarrayGetCount(sorted) == 0)
        numbered = sarrayCreate(1);
    else
        numbered = convertSortedToNumberedPathnames(sorted, numpre,
                                                    numpost, maxnum);
    sarrayDestroy(&sorted);
    return numbered;
}

/*!
 * \brief   getSortedPathnamesInDirectory()
 *
 * \param[in]    dirname   directory name
 * \param[in]    substr    [optional] substring filter on filenames; can be NULL
 * \param[in]    first     0-based
 * \param[in]    nfiles    use 0 for all to the end
 * \return  sarray of sorted pathnames, or NULL on error
 *
 *
 * Notes:
 *      (1) Use %substr to filter filenames in the directory.  If
 *          %substr == NULL, this takes all files.
 *      (2) The files in the directory, after optional filtering by
 *          the substring, are lexically sorted in increasing order.
 *          Use %first and %nfiles to select a contiguous set of files.
 *      (3) The full pathnames are returned for the requested sequence.
 *          If no files are found after filtering, returns an empty sarray.
 * 
*/
SARRAY *
getSortedPathnamesInDirectory(const char  *dirname,
                              const char  *substr,
                              l_int32      first,
                              l_int32      nfiles)
{
char    *tail, *path;
l_int32  idx, count, final;
SARRAY  *all, *filtered, *result;

    PROCNAME("getSortedPathnamesInDirectory");

    if (!dirname)
        return (SARRAY *)ERROR_PTR("dirname not defined", procName, NULL);

    all = getFilenamesInDirectory(dirname);
    if (all == NULL)
        return (SARRAY *)ERROR_PTR("sa not made", procName, NULL);

        /* Apply the optional substring filter to the filename tails */
    filtered = sarraySelectBySubstring(all, substr);
    sarrayDestroy(&all);
    count = sarrayGetCount(filtered);
    if (count == 0) {
        L_WARNING("no files found\n", procName);
        return filtered;
    }

        /* Sort in place, then clamp the requested window
         * [first ... final] to the valid indices [0 ... count - 1]. */
    sarraySort(filtered, filtered, L_SORT_INCREASING);
    if (first < 0)
        first = 0;
    if (first > count - 1)
        first = count - 1;
    if (nfiles == 0)
        nfiles = count - first;
    final = first + nfiles - 1;
    if (final > count - 1)
        final = count - 1;

        /* Join the directory to each selected tail; the joined path
         * is handed over to the output sarray with L_INSERT. */
    result = sarrayCreate(final - first + 1);
    for (idx = first; idx <= final; idx++) {
        tail = sarrayGetString(filtered, idx, L_NOCOPY);
        path = pathJoin(dirname, tail);
        sarrayAddString(result, path, L_INSERT);
    }

    sarrayDestroy(&filtered);
    return result;
}

/*!
 * \brief   convertSortedToNumberedPathnames()
 *
 * \param[in]    sa        sorted pathnames including zero-padded integers
 * \param[in]    numpre    number of characters in name before number
 * \param[in]    numpost   number of characters in name after the number,
 *                         up to a dot before an extension
 * \param[in]    maxnum    only consider page numbers up to this value
 * \return  sarray of numbered pathnames, or NULL on error
 *
 *
 * Notes:
 *      (1) Typically, numpre = numpost = 0; e.g., when the filename
 *          just has a number followed by an optional extension.
 * 
*/
SARRAY *
convertSortedToNumberedPathnames(SARRAY  *sa,
                                 l_int32  numpre,
                                 l_int32  numpost,
                                 l_int32  maxnum)
{
char    *path, *slot;
l_int32  idx, count, limit, number;
SARRAY  *numbered;

    PROCNAME("convertSortedToNumberedPathnames");

    if (!sa)
        return (SARRAY *)ERROR_PTR("sa not defined", procName, NULL);
    count = sarrayGetCount(sa);
    if (count == 0)
        return sarrayCreate(1);

        /* Scan from the end of the sorted array for the last file
         * whose name yields a number.  The output size is that number
         * plus 1, capped at %maxnum. */
    limit = 0;
    for (idx = count - 1; idx >= 0; idx--) {
        path = sarrayGetString(sa, idx, L_NOCOPY);
        limit = extractNumberFromFilename(path, numpre, numpost);
        if (limit >= 0) {
            limit = L_MIN(limit + 1, maxnum);
            break;
        }
    }
    if (limit <= 0)  /* no usable number was found */
        return sarrayCreate(1);

        /* Start with all empty strings and drop each pathname into the
         * slot given by its number.  Numbers outside [0 ... limit - 1]
         * are ignored; a slot that is already filled is overwritten,
         * with a warning. */
    numbered = sarrayCreateInitialized(limit, "");
    for (idx = 0; idx < count; idx++) {
        path = sarrayGetString(sa, idx, L_NOCOPY);
        number = extractNumberFromFilename(path, numpre, numpost);
        if (number >= 0 && number < limit) {
            slot = sarrayGetString(numbered, number, L_NOCOPY);
            if (slot[0] != '\0') {
                L_WARNING("\n Multiple files with same number: %d\n",
                          procName, number);
            }
            sarrayReplaceString(numbered, number, path, L_COPY);
        }
    }

    return numbered;
}

/*!
 * \brief   getFilenamesInDirectory()
 *
 * \param[in]    dirname    directory name
 * \return  sarray of file names, or NULL on error
 *
 *
 * Notes:
 *      (1) The versions compiled under unix and cygwin use the POSIX C
 *          library commands for handling directories.  For windows,
 *          there is a separate implementation.
 *      (2) It returns an array of filename tails; i.e., only the part of
 *          the path after the last slash.
 *      (3) Use of the d_type field of dirent is not portable:
 *          "According to POSIX, the dirent structure contains a field
 *          char d_name[] of unspecified size, with at most NAME_MAX
 *          characters preceding the terminating null character.  Use
 *          of other fields will harm the portability of your programs."
 *      (4) As a consequence of (3), we note several things:
 *           ~ MINGW doesn't have a d_type member.
 *           ~ Older versions of gcc (e.g., 2.95.3) return DT_UNKNOWN
 *             for d_type from all files.
 *          On these systems, this function will return directories
 *          (except for '.' and '..', which are eliminated using
 *          the d_name field).
 * 
*/

#ifndef _WIN32

SARRAY *
getFilenamesInDirectory(const char  *dirname)
{
char            dir[PATH_MAX + 1];
char           *realdir, *stat_path, *ignore;
size_t          size;
SARRAY         *safiles;
DIR            *pdir;
struct dirent  *pdirentry;
int             dfd, stat_ret;
struct stat     st;

    PROCNAME("getFilenamesInDirectory");

    if (!dirname)
        return (SARRAY *)ERROR_PTR("dirname not defined", procName, NULL);
    if (dirname[0] == '\0')
        return (SARRAY *)ERROR_PTR("dirname is empty", procName, NULL);

        /* Who would have thought it was this fiddly to open a directory
         * and get the files inside?  fstatat() works with relative
         * directory paths, and stat() requires using the absolute path.
         * realpath() works as follows for files and directories:
         *  * If the file or directory exists, realpath returns its path;
         *    else it returns NULL.
         *  * If the second arg to realpath is passed in, the canonical
         *    path is returned there.  Use a buffer of sufficient size.
         *    If the second arg is NULL, the path is malloc'd and returned
         *    if the file or directory exists.
         * We pass in a buffer for the second arg, and check that the
         * canonical directory path was made.  The existence of the
         * directory is checked later, after its actual path is returned
         * by genPathname(). */
    dir[0] = '\0';  /* init empty in case realpath() fails to write it */
    ignore = realpath(dirname, dir);
    if (dir[0] == '\0')
        return (SARRAY *)ERROR_PTR("dir not made", procName, NULL);
    realdir = genPathname(dir, NULL);
    if ((pdir = opendir(realdir)) == NULL) {
        LEPT_FREE(realdir);
        return (SARRAY *)ERROR_PTR("pdir not opened", procName, NULL);
    }
    if ((safiles = sarrayCreate(0)) == NULL) {
        closedir(pdir);
        LEPT_FREE(realdir);
        return (SARRAY *)ERROR_PTR("safiles not made", procName, NULL);
    }

    dfd = dirfd(pdir);
    while ((pdirentry = readdir(pdir))) {
            /* Stat each entry so that directories can be left out */
#if HAVE_FSTATAT
        stat_ret = fstatat(dfd, pdirentry->d_name, &st, 0);
#else
        size = strlen(realdir) + strlen(pdirentry->d_name) + 2;
        if (size > PATH_MAX) {
            L_ERROR("size = %zu too large; skipping\n", procName, size);
            continue;
        }
        stat_path = (char *)LEPT_CALLOC(size, 1);
        snprintf(stat_path, size, "%s/%s", realdir, pdirentry->d_name);
        stat_ret = stat(stat_path, &st);
        LEPT_FREE(stat_path);
#endif
            /* An entry is dropped only when stat succeeded and reported
             * a directory; if stat failed, the name is kept. */
        if (stat_ret == 0 && S_ISDIR(st.st_mode))
            continue;
        sarrayAddString(safiles, pdirentry->d_name, L_COPY);
    }
    closedir(pdir);
    LEPT_FREE(realdir);
    return safiles;
}

#else  /* _WIN32 */

    /* http://msdn2.microsoft.com/en-us/library/aa365200(VS.85).aspx */
#include <windows.h>

SARRAY *
getFilenamesInDirectory(const char  *dirname)
{
char             *pszDir;
char             *realdir;
HANDLE            hFind = INVALID_HANDLE_VALUE;
SARRAY           *safiles;
WIN32_FIND_DATAA  ffd;

    PROCNAME("getFilenamesInDirectory");

    if (!dirname)
        return (SARRAY *)ERROR_PTR("dirname not defined", procName, NULL);

        /* Build the search pattern "<dir>\*" */
    realdir = genPathname(dirname, NULL);
    pszDir = stringJoin(realdir, "\\*");
    LEPT_FREE(realdir);

    if (strlen(pszDir) + 1 > MAX_PATH) {
        LEPT_FREE(pszDir);
        return (SARRAY *)ERROR_PTR("dirname is too long", procName, NULL);
    }

    if ((safiles = sarrayCreate(0)) == NULL) {
        LEPT_FREE(pszDir);
        return (SARRAY *)ERROR_PTR("safiles not made", procName, NULL);
    }

    hFind = FindFirstFileA(pszDir, &ffd);
    if (INVALID_HANDLE_VALUE == hFind) {
        sarrayDestroy(&safiles);
        LEPT_FREE(pszDir);
        return (SARRAY *)ERROR_PTR("hFind not opened", procName, NULL);
    }

        /* FindFirstFileA() has already put the first entry in ffd, so
         * that entry must be examined before asking for the next one.
         * Otherwise the first file is lost in a directory whose listing
         * does not start with a "." entry. */
    do {
        if (ffd.dwFileAttributes & FILE_ATTRIBUTE_DIRECTORY)  /* skip dirs */
            continue;
        convertSepCharsInPath(ffd.cFileName, UNIX_PATH_SEPCHAR);
        sarrayAddString(safiles, ffd.cFileName, L_COPY);
    } while (FindNextFileA(hFind, &ffd) != 0);

    FindClose(hFind);
    LEPT_FREE(pszDir);
    return safiles;
}
#endif  /* _WIN32 */