/*====================================================================* - Copyright (C) 2001 Leptonica. All rights reserved. - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions - are met: - 1. Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - 2. Redistributions in binary form must reproduce the above - copyright notice, this list of conditions and the following - disclaimer in the documentation and/or other materials - provided with the distribution. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL ANY - CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, - EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, - PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR - PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY - OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING - NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *====================================================================*/ /*! * \file utils2.c *
 *
 *      ------------------------------------------
 *      This file has these utilities:
 *         - safe string operations
 *         - find/replace operations on strings
 *         - read/write between file and memory
 *         - multi-platform file and directory operations
 *         - file name operations
 *      ------------------------------------------
 *
 *       Safe string procs
 *           char      *stringNew()
 *           l_int32    stringCopy()
 *           l_int32    stringCopySegment()
 *           l_int32    stringReplace()
 *           l_int32    stringLength()
 *           l_int32    stringCat()
 *           char      *stringConcatNew()
 *           char      *stringJoin()
 *           l_int32    stringJoinIP()
 *           char      *stringReverse()
 *           char      *strtokSafe()
 *           l_int32    stringSplitOnToken()
 *
 *       Find and replace string and array procs
 *           l_int32    stringCheckForChars()
 *           char      *stringRemoveChars()
 *           char      *stringReplaceEachSubstr()
 *           char      *stringReplaceSubstr()
 *           L_DNA     *stringFindEachSubstr()
 *           l_int32    stringFindSubstr()
 *           l_uint8   *arrayReplaceEachSequence()
 *           L_DNA     *arrayFindEachSequence()
 *           l_int32    arrayFindSequence()
 *
 *       Safe realloc
 *           void      *reallocNew()
 *
 *       Read and write between file and memory
 *           l_uint8   *l_binaryRead()
 *           l_uint8   *l_binaryReadStream()
 *           l_uint8   *l_binaryReadSelect()
 *           l_uint8   *l_binaryReadSelectStream()
 *           l_int32    l_binaryWrite()
 *           l_int32    nbytesInFile()
 *           l_int32    fnbytesInFile()
 *
 *       Copy and compare in memory
 *           l_uint8   *l_binaryCopy()
 *           l_uint8   *l_binaryCompare()
 *
 *       File copy operations
 *           l_int32    fileCopy()
 *           l_int32    fileConcatenate()
 *           l_int32    fileAppendString()
 *
 *       File split operations
 *           l_int32    fileSplitLinesUniform()
 *
 *       Multi-platform functions for opening file streams
 *           FILE      *fopenReadStream()
 *           FILE      *fopenWriteStream()
 *           FILE      *fopenReadFromMemory()
 *
 *       Opening a windows tmpfile for writing
 *           FILE      *fopenWriteWinTempfile()
 *
 *       Multi-platform functions that avoid C-runtime boundary crossing
 *       with Windows DLLs
 *           FILE      *lept_fopen()
 *           l_int32    lept_fclose()
 *           void      *lept_calloc()
 *           void       lept_free()
 *
 *       Multi-platform file system operations in temp directories
 *           l_int32    lept_mkdir()
 *           l_int32    lept_rmdir()
 *           l_int32    lept_direxists()
 *           l_int32    lept_mv()
 *           l_int32    lept_rm_match()
 *           l_int32    lept_rm()
 *           l_int32    lept_rmfile()
 *           l_int32    lept_cp()
 *
 *       Special debug/test function for calling 'system'
 *           void       callSystemDebug()
 *
 *       General file name operations
 *           l_int32    splitPathAtDirectory()
 *           l_int32    splitPathAtExtension()
 *           char      *pathJoin()
 *           char      *appendSubdirs()
 *
 *       Special file name operations
 *           l_int32    convertSepCharsInPath()
 *           char      *genPathname()
 *           l_int32    makeTempDirname()
 *           l_int32    modifyTrailingSlash()
 *           char      *l_makeTempFilename()
 *           l_int32    extractNumberFromFilename()
 *
 *
 *  Notes on multi-platform development
 *  -----------------------------------
 *  This is important:
 *  (1) With the exception of splitPathAtDirectory(), splitPathAtExtension()
 *      and genPathname(), all input pathnames must have unix separators.
 *  (2) On Windows, when you specify a read or write to "/tmp/...",
 *      the filename is rewritten to use the Windows temp directory:
 *         /tmp  ==>   [Temp]...    (windows)
 *  (3) This filename rewrite, along with the conversion from unix
 *      to windows pathnames, happens in genPathname().
 *  (4) Use fopenReadStream() and fopenWriteStream() to open files,
 *      because these use genPathname() to find the platform-dependent
 *      filenames.  Likewise for l_binaryRead() and l_binaryWrite().
 *  (5) For moving, copying and removing files and directories that are in
 *      subdirectories of /tmp, use the lept_*() file system shell wrappers:
 *         lept_mkdir(), lept_rmdir(), lept_mv(), lept_rm() and lept_cp().
 *  (6) Use the lept_*() C library wrappers.  These work properly on
 *      Windows, where the same DLL must perform complementary operations
 *      on file streams (open/close) and heap memory (malloc/free):
 *         lept_fopen(), lept_fclose(), lept_calloc() and lept_free().
 *  (7) Why read and write files to temp directories?
 *      The library needs the ability to read and write ephemeral
 *      files to default places, both for generating debugging output
 *      and for supporting regression tests.  Applications also need
 *      this ability for debugging.
 *  (8) Why do the pathname rewrite on Windows?
 *      The goal is to have the library, and programs using the library,
 *      run on multiple platforms without changes.  The location of
 *      temporary files depends on the platform as well as the user's
 *      configuration.  Temp files on Windows are in some directory
 *      not known a priori.  To make everything work seamlessly on
 *      Windows, every time you open a file for reading or writing,
 *      use a special function such as fopenReadStream() or
 *      fopenWriteStream(); these call genPathname() to ensure that
 *      if it is a temp file, the correct path is used.  To indicate
 *      that this is a temp file, the application is written with the
 *      root directory of the path in a canonical form: "/tmp".
 *  (9) Why is it that multi-platform directory functions like lept_mkdir()
 *      and lept_rmdir(), as well as associated file functions like
 *      lept_rm(), lept_mv() and lept_cp(), only work in the temp dir?
 *      These functions were designed to provide easy manipulation of
 *      temp files.  The restriction to temp files is for safety -- to
 *      prevent an accidental deletion of important files.  For example,
 *      lept_rmdir() first deletes all files in a specified subdirectory
 *      of temp, and then removes the directory.
 *
 * 
*/ #ifdef HAVE_CONFIG_H #include #endif /* HAVE_CONFIG_H */ #ifdef _MSC_VER #include #include #define getcwd _getcwd /* fix MSVC warning */ #else #include #endif /* _MSC_VER */ #ifdef _WIN32 #include #include /* _O_CREAT, ... */ #include /* _open */ #include /* _S_IREAD, _S_IWRITE */ #else #include /* for stat, mkdir(2) */ #include #endif #ifdef OS_IOS #include #include #endif #include #include #include "allheaders.h"

/*--------------------------------------------------------------------*
 *                       Safe string operations                       *
 *--------------------------------------------------------------------*/
/*!
 * \brief   stringNew()
 *
 * \param[in]    src    string to duplicate; a null %src gives a warning
 * \return  dest copy of %src string, or NULL on error
 */
char *
stringNew(const char  *src)
{
char    *copy;
l_int32  nbytes;

    PROCNAME("stringNew");

    if (src == NULL) {
        L_WARNING("src not defined\n", procName);
        return NULL;
    }

        /* One extra byte is allocated beyond the characters copied;
         * it is presumably zero-filled by LEPT_CALLOC and so acts
         * as the terminator. */
    nbytes = strlen(src);
    copy = (char *)LEPT_CALLOC(nbytes + 1, sizeof(char));
    if (copy == NULL)
        return (char *)ERROR_PTR("dest not made", procName, NULL);

    stringCopy(copy, src, nbytes);
    return copy;
}


/*!
 * \brief   stringCopy()
 *
 * \param[in]    dest    existing byte buffer
 * \param[in]    src     string [optional] can be null
 * \param[in]    n       max number of characters to copy
 * \return  0 if OK, 1 on error
 *
 *
 * Notes:
 *      (1) Relatively safe wrapper for strncpy, that checks the input,
 *          and does not complain if %src is null or %n < 1.
 *          If %n < 1, this is a no-op.
 *      (2) %dest needs to be at least %n bytes in size.
 *      (3) We don't call strncpy() because valgrind complains about
 *          use of uninitialized values.
 * 
*/
l_ok
stringCopy(char        *dest,
           const char  *src,
           l_int32      n)
{
l_int32  k;

    PROCNAME("stringCopy");

    if (dest == NULL)
        return ERROR_INT("dest not defined", procName, 1);
    if (src == NULL || n < 1)
        return 0;  /* nothing to copy; not an error */

        /* Hand-rolled strncpy: copy up to %n chars, stopping at the
         * terminator of %src ... */
    k = 0;
    while (k < n && src[k] != '\0') {
        dest[k] = src[k];
        k++;
    }
        /* ... then zero-fill whatever remains of the %n bytes. */
    while (k < n) {
        dest[k] = '\0';
        k++;
    }
    return 0;
}


/*!
 * \brief   stringCopySegment()
 *
 *
 * \param[in]    src      string
 * \param[in]    start    byte position at start of segment
 * \param[in]    nbytes   number of bytes in the segment; use 0 to go to end
 * \return  copy of segment, or NULL on error
 *
 *
 * Notes:
 *      (1) This is a variant of stringNew() that makes a new string
 *          from a segment of the input string.  The segment is specified
 *          by the starting position and the number of bytes.
 *      (2) The start location %start must be within the string %src.
 *      (3) The copy is truncated to the end of the source string.
 *          Use %nbytes = 0 to copy to the end of %src.
 * 
*/
char *
stringCopySegment(const char  *src,
                  l_int32      start,
                  l_int32      nbytes)
{
char    *dest;
l_int32  len, avail;

    PROCNAME("stringCopySegment");

    if (!src)
        return (char *)ERROR_PTR("src not defined", procName, NULL);
    len = strlen(src);
    if (start < 0 || start > len - 1)
        return (char *)ERROR_PTR("invalid start", procName, NULL);

        /* Clamp the request to what is available after %start.
         * Comparing %nbytes against the remaining length (rather than
         * forming start + nbytes) avoids signed overflow when a very
         * large %nbytes is passed in.  A non-positive %nbytes means
         * "copy to the end". */
    avail = len - start;
    if (nbytes <= 0 || nbytes > avail)
        nbytes = avail;

    if ((dest = (char *)LEPT_CALLOC(nbytes + 1, sizeof(char))) == NULL)
        return (char *)ERROR_PTR("dest not made", procName, NULL);
    stringCopy(dest, src + start, nbytes);
    return dest;
}


/*!
 * \brief   stringReplace()
 *
 * \param[out]   pdest    string copy
 * \param[in]    src      [optional] string; can be null
 * \return  0 if OK; 1 on error
 *
 *
 * Notes:
 *      (1) Frees any existing dest string
 *      (2) Puts a copy of src string in the dest
 *      (3) If either or both strings are null, does something reasonable.
 * 
*/
l_ok
stringReplace(char       **pdest,
              const char  *src)
{
char  *newstr;

    PROCNAME("stringReplace");

    if (!pdest)
        return ERROR_INT("pdest not defined", procName, 1);

        /* Make the copy before releasing the old string.  If the caller
         * passes %src pointing at (or into) *pdest, freeing first would
         * leave stringNew() reading freed memory. */
    newstr = (src) ? stringNew(src) : NULL;
    if (*pdest)
        LEPT_FREE(*pdest);
    *pdest = newstr;
    return 0;
}


/*!
 * \brief   stringLength()
 *
 * \param[in]    src    string can be null or NULL-terminated string
 * \param[in]    size   size of src buffer
 * \return  length of src in bytes.
 *
 *
 * Notes:
 *      (1) Safe implementation of strlen that only checks size bytes
 *          for trailing NUL.
 *      (2) Valid returned string lengths are between 0 and size - 1.
 *          If size bytes are checked without finding a NUL byte, then
 *          an error is indicated by returning size.
 * 
*/
l_int32
stringLength(const char  *src,
             size_t       size)
{
l_int32  pos;

    PROCNAME("stringLength");

    if (src == NULL)
        return ERROR_INT("src not defined", procName, 0);
    if (size < 1)
        return 0;

        /* Advance until a NUL byte is seen or %size bytes have been
         * examined; in the latter case %pos ends up equal to %size,
         * which is the "no terminator found" indication. */
    pos = 0;
    while (pos < size && src[pos] != '\0')
        pos++;
    return pos;
}


/*!
 * \brief   stringCat()
 *
 * \param[in]    dest    null-terminated byte buffer
 * \param[in]    size    size of dest
 * \param[in]    src     string can be null or NULL-terminated string
 * \return  number of bytes added to dest; -1 on error
 *
 *
 * Notes:
 *      (1) Alternative implementation of strncat, that checks the input,
 *          is easier to use (since the size of the dest buffer is specified
 *          rather than the number of bytes to copy), and does not complain
 *          if %src is null.
 *      (2) Never writes past end of dest.
 *      (3) If there is not enough room to append the src, which is an error,
 *          it does nothing.
 *      (4) N.B. The order of 2nd and 3rd args is reversed from that in
 *          strncat, as in the Windows function strcat_s().
 * 
*/
l_int32
stringCat(char        *dest,
          size_t       size,
          const char  *src)
{
l_int32  k, nadd;
l_int32  dlen, slen;

    PROCNAME("stringCat");

    if (dest == NULL)
        return ERROR_INT("dest not defined", procName, -1);
    if (size < 1)
        return ERROR_INT("size < 1; too small", procName, -1);
    if (src == NULL)
        return 0;

        /* A length equal to %size means no NUL was found in %dest */
    dlen = stringLength(dest, size);
    if (dlen == size)
        return ERROR_INT("no terminating nul byte", procName, -1);
    slen = stringLength(src, size);
    if (slen == 0)
        return 0;

        /* All-or-nothing: append only if %src and a terminator both fit */
    nadd = slen;
    if (dlen + slen > size - 1)
        nadd = 0;
    if (nadd < 1)
        return ERROR_INT("dest too small for append", procName, -1);

    k = 0;
    while (k < nadd) {
        dest[dlen + k] = src[k];
        k++;
    }
    dest[dlen + nadd] = '\0';
    return nadd;
}


/*!
 * \brief   stringConcatNew()
 *
 * \param[in]    first   first string in list
 * \param[in]    ...     NULL-terminated list of strings
 * \return  result new string concatenating the input strings, or
 *                 NULL if first == NULL
 *
 *
 * Notes:
 *      (1) The last arg in the list of strings must be NULL.
 *      (2) Caller must free the returned string.
 * 
*/
char *
stringConcatNew(const char  *first, ...)
{
size_t       len;
char        *result, *ptr;
const char  *arg;
va_list      args;

    PROCNAME("stringConcatNew");

    if (!first) return NULL;

        /* First pass over the args: find the length of the output string */
    va_start(args, first);
    len = strlen(first);
    while ((arg = va_arg(args, const char *)) != NULL)
        len += strlen(arg);
    va_end(args);

        /* The allocation must be checked; the copy loop below writes
         * through %result unconditionally. */
    result = (char *)LEPT_CALLOC(len + 1, sizeof(char));
    if (!result)
        return (char *)ERROR_PTR("result not made", procName, NULL);

        /* Second pass: concatenate the args.  The final byte of the
         * buffer is never written here and serves as the terminator. */
    va_start(args, first);
    ptr = result;
    arg = first;
    while (*arg)
        *ptr++ = *arg++;
    while ((arg = va_arg(args, const char *)) != NULL) {
        while (*arg)
            *ptr++ = *arg++;
    }
    va_end(args);
    return result;
}


/*!
 * \brief   stringJoin()
 *
 * \param[in]    src1    [optional] string; can be null
 * \param[in]    src2    [optional] string; can be null
 * \return  concatenated string, or NULL on error
 *
 *
 * Notes:
 *      (1) This is a safe version of strcat; it makes a new string.
 *      (2) It is not an error if either or both of the strings
 *          are empty, or if either or both of the pointers are null.
 * 
*/
char *
stringJoin(const char  *src1,
           const char  *src2)
{
char    *joined;
l_int32  n1, n2, nalloc;

    PROCNAME("stringJoin");

        /* A null input contributes zero bytes */
    n1 = 0;
    n2 = 0;
    if (src1)
        n1 = strlen(src1);
    if (src2)
        n2 = strlen(src2);

    nalloc = n1 + n2 + 3;
    joined = (char *)LEPT_CALLOC(nalloc, sizeof(char));
    if (joined == NULL)
        return (char *)ERROR_PTR("calloc fail for dest", procName, NULL);

        /* Append each piece that was supplied, in order */
    if (src1)
        stringCat(joined, nalloc, src1);
    if (src2)
        stringCat(joined, nalloc, src2);
    return joined;
}


/*!
 * \brief   stringJoinIP()
 *
 * \param[in,out]  psrc1   address of string src1; cannot be on the stack
 * \param[in]      src2    [optional] string; can be null
 * \return  0 if OK, 1 on error
 *
 *
 * Notes:
 *      (1) This is a safe in-place version of strcat.  The contents of
 *          src1 is replaced by the concatenation of src1 and src2.
 *      (2) It is not an error if either or both of the strings
 *          are empty (""), or if the pointers to the strings (*psrc1, src2)
 *          are null.
 *      (3) src1 should be initialized to null or an empty string
 *          before the first call.  Use one of these:
 *              char *src1 = NULL;
 *              char *src1 = stringNew("");
 *          Then call with:
 *              stringJoinIP(&src1, src2);
 *      (4) This can also be implemented as a macro:
 * \code
 *              #define stringJoinIP(src1, src2) \
 *                  {tmpstr = stringJoin((src1),(src2)); \
 *                  LEPT_FREE(src1); \
 *                  (src1) = tmpstr;}
 * \endcode
 *      (5) Another function to consider for joining many strings is
 *          stringConcatNew().
 * 
*/ l_ok stringJoinIP(char **psrc1, const char *src2) { char *tmpstr; PROCNAME("stringJoinIP"); if (!psrc1) return ERROR_INT("&src1 not defined", procName, 1); tmpstr = stringJoin(*psrc1, src2); LEPT_FREE(*psrc1); *psrc1 = tmpstr; return 0; } /*! * \brief stringReverse() * * \param[in] src string * \return dest newly-allocated reversed string */ char * stringReverse(const char *src) { char *dest; l_int32 i, len; PROCNAME("stringReverse"); if (!src) return (char *)ERROR_PTR("src not defined", procName, NULL); len = strlen(src); if ((dest = (char *)LEPT_CALLOC(len + 1, sizeof(char))) == NULL) return (char *)ERROR_PTR("calloc fail for dest", procName, NULL); for (i = 0; i < len; i++) dest[i] = src[len - 1 - i]; return dest; } /*! * \brief strtokSafe() * * \param[in] cstr input string to be sequentially parsed; * use NULL after the first call * \param[in] seps a string of character separators * \param[out] psaveptr ptr to the next char after * the last encountered separator * \return substr a new string that is copied from the previous * saveptr up to but not including the next * separator character, or NULL if end of cstr. * *
 * Notes:
 *      (1) This is a thread-safe implementation of strtok.
 *      (2) It has the same interface as strtok_r.
 *      (3) It differs from strtok_r in usage in two respects:
 *          (a) the input string is not altered
 *          (b) each returned substring is newly allocated and must
 *              be freed after use.
 *      (4) Let me repeat that.  This is "safe" because the input
 *          string is not altered and because each returned string
 *          is newly allocated on the heap.
 *      (5) It is here because, surprisingly, some C libraries don't
 *          include strtok_r.
 *      (6) Important usage points:
 *          ~ Input the string to be parsed on the first invocation.
 *          ~ Then input NULL after that; the value returned in saveptr
 *            is used in all subsequent calls.
 *      (7) This is only slightly slower than strtok_r.
 * 
*/
char *
strtokSafe(char        *cstr,
           const char  *seps,
           char       **psaveptr)
{
char    *base, *token;
l_int32  first, end, next, ntoken;

    PROCNAME("strtokSafe");

    if (!seps)
        return (char *)ERROR_PTR("seps not defined", procName, NULL);
    if (!psaveptr)
        return (char *)ERROR_PTR("&saveptr not defined", procName, NULL);

        /* A non-null %cstr starts a new parse; otherwise resume
         * from the position saved by the previous call. */
    if (cstr) {
        base = cstr;
        *psaveptr = NULL;
    } else {
        base = *psaveptr;
    }
    if (!base)  /* nothing to do */
        return NULL;

        /* On the first call only, skip any leading separators.  On
         * later calls the saved position already sits on a non-sep. */
    first = 0;
    if (cstr) {
        while (base[first] != '\0' && strchr(seps, base[first]))
            first++;
        if (base[first] == '\0') {
            *psaveptr = NULL;  /* in case caller doesn't check ret value */
            return NULL;
        }
    }

        /* Find the end of the token: the next separator or the
         * end of the string, whichever comes first. */
    end = first;
    while (base[end] != '\0' && !strchr(seps, base[end]))
        end++;

        /* Save the substring */
    ntoken = end - first;
    token = (char *)LEPT_CALLOC(ntoken + 1, sizeof(char));
    stringCopy(token, base + first, ntoken);

        /* Skip the run of separators that follows.  If nothing but
         * separators remains, this was the last token and the saved
         * pointer is nulled. */
    next = end;
    while (base[next] != '\0' && strchr(seps, base[next]))
        next++;
    if (base[next] == '\0')
        *psaveptr = NULL;
    else
        *psaveptr = base + next;

    return token;
}


/*!
 * \brief   stringSplitOnToken()
 *
 * \param[in]    cstr    input string to be split; not altered
 * \param[in]    seps    a string of character separators
 * \param[out]   phead   ptr to copy of the input string, up to
 *                       the first separator token encountered
 * \param[out]   ptail   ptr to copy of the part of the input string
 *                       starting with the first non-separator character
 *                       that occurs after the first separator is found
 * \return  0 if OK, 1 on error
 *
 *
 * Notes:
 *      (1) The input string is not altered; all split parts are new strings.
 *      (2) The split occurs around the first consecutive sequence of
 *          tokens encountered.
 *      (3) The head goes from the beginning of the string up to
 *          but not including the first token found.
 *      (4) The tail contains the second part of the string, starting
 *          with the first char in that part that is NOT a token.
 *      (5) If no separator token is found, 'head' contains a copy
 *          of the input string and 'tail' is null.
 * 
*/
l_ok
stringSplitOnToken(char        *cstr,
                   const char  *seps,
                   char       **phead,
                   char       **ptail)
{
char  *rest;

    PROCNAME("stringSplitOnToken");

    if (phead == NULL)
        return ERROR_INT("&head not defined", procName, 1);
    if (ptail == NULL)
        return ERROR_INT("&tail not defined", procName, 1);
    *phead = NULL;
    *ptail = NULL;
    if (cstr == NULL)
        return ERROR_INT("cstr not defined", procName, 1);
    if (seps == NULL)
        return ERROR_INT("seps not defined", procName, 1);

        /* The first token becomes the head.  strtokSafe() writes %rest
         * on every path once %seps and the save pointer are non-null,
         * so it is safe to test afterward. */
    *phead = strtokSafe(cstr, seps, &rest);
    if (rest != NULL)
        *ptail = stringNew(rest);
    return 0;
}


/*--------------------------------------------------------------------*
 *                       Find and replace procs                       *
 *--------------------------------------------------------------------*/
/*!
 * \brief   stringCheckForChars()
 *
 * \param[in]    src      input string; can be of zero length
 * \param[in]    chars    string of chars to be searched for in %src
 * \param[out]   pfound   1 if any characters are found; 0 otherwise
 * \return  0 if OK, 1 on error
 *
 *
 * Notes:
 *      (1) This can be used to sanitize an operation by checking for
 *          special characters that don't belong in a string.
 * 
*/ l_ok stringCheckForChars(const char *src, const char *chars, l_int32 *pfound) { char ch; l_int32 i, n; PROCNAME("stringCheckForChars"); if (!pfound) return ERROR_INT("&found not defined", procName, 1); *pfound = FALSE; if (!src || !chars) return ERROR_INT("src and chars not both defined", procName, 1); n = strlen(src); for (i = 0; i < n; i++) { ch = src[i]; if (strchr(chars, ch)) { *pfound = TRUE; break; } } return 0; } /*! * \brief stringRemoveChars() * * \param[in] src input string; can be of zero length * \param[in] remchars string of chars to be removed from src * \return dest string with specified chars removed, or NULL on error */ char * stringRemoveChars(const char *src, const char *remchars) { char ch; char *dest; l_int32 nsrc, i, k; PROCNAME("stringRemoveChars"); if (!src) return (char *)ERROR_PTR("src not defined", procName, NULL); if (!remchars) return stringNew(src); if ((dest = (char *)LEPT_CALLOC(strlen(src) + 1, sizeof(char))) == NULL) return (char *)ERROR_PTR("dest not made", procName, NULL); nsrc = strlen(src); for (i = 0, k = 0; i < nsrc; i++) { ch = src[i]; if (!strchr(remchars, ch)) dest[k++] = ch; } return dest; } /*! * \brief stringReplaceEachSubstr() * * \param[in] src input string; can be of zero length * \param[in] sub1 substring to be replaced * \param[in] sub2 substring to put in; can be "" * \param[out] pcount [optional] the number of times that sub1 * is found in src; 0 if not found * \return dest string with substring replaced, or NULL if the * substring not found or on error. * *
 * Notes:
 *      (1) This is a wrapper for simple string substitution that uses
 *          the more general function arrayReplaceEachSequence().
 *      (2) This finds every non-overlapping occurrence of %sub1 in
 *          %src, and replaces it with %sub2.  By "non-overlapping"
 *          we mean that after it finds each match, it removes the
 *          matching characters, replaces with the substitution string
 *          (if not empty), and continues.  For example, if you replace
 *          'aa' by 'X' in 'baaabbb', you find one match at position 1
 *          and return 'bXabbb'.
 *      (3) To only remove each instance of sub1, use "" for sub2
 *      (4) Returns a copy of %src if sub1 and sub2 are the same.
 *      (5) If the input %src is binary data that can have null characters,
 *          use arrayReplaceEachSequence() directly.
 * 
*/
char *
stringReplaceEachSubstr(const char  *src,
                        const char  *sub1,
                        const char  *sub2,
                        l_int32     *pcount)
{
size_t          outlen, sub2len;
const l_uint8  *newseq;

    PROCNAME("stringReplaceEachSubstr");

    if (pcount) *pcount = 0;
    if (!src || !sub1 || !sub2)
        return (char *)ERROR_PTR("src, sub1, sub2 not all defined",
                                 procName, NULL);

        /* An empty replacement string is passed to the array function
         * as a null sequence of length 0, i.e., removal only. */
    sub2len = strlen(sub2);
    newseq = (sub2len > 0) ? (const l_uint8 *)sub2 : NULL;

    return (char *)arrayReplaceEachSequence(
                       (const l_uint8 *)src, strlen(src),
                       (const l_uint8 *)sub1, strlen(sub1),
                       newseq, sub2len,
                       &outlen, pcount);
}


/*!
 * \brief   stringReplaceSubstr()
 *
 * \param[in]      src      input string; can be of zero length
 * \param[in]      sub1     substring to be replaced
 * \param[in]      sub2     substring to put in; can be ""
 * \param[in,out]  ploc     [optional] input start location for search;
 *                          returns the loc after replacement
 * \param[out]     pfound   [optional] 1 if sub1 is found; 0 otherwise
 * \return  dest string with substring replaced, or NULL on error.
 *
 *
 * Notes:
 *      (1) Replaces the first instance.
 *      (2) To remove sub1 without replacement, use "" for sub2.
 *      (3) Returns a copy of %src if either no instance of %sub1 is found,
 *          or if %sub1 and %sub2 are the same.
 *      (4) If %ploc == NULL, the search will start at the beginning of %src.
 *          If %ploc != NULL, *ploc must be initialized to the byte offset
 *          within %src from which the search starts.  To search the
 *          string from the beginning, set %loc = 0 and input &loc.
 *          After finding %sub1 and replacing it with %sub2, %loc will be
 *          returned as the next position after %sub2 in the output string.
 *      (5) Note that the output string also includes all the characters
 *          from the input string that occur after the single substitution.
 * 
*/
char *
stringReplaceSubstr(const char  *src,
                    const char  *sub1,
                    const char  *sub2,
                    l_int32     *ploc,
                    l_int32     *pfound)
{
const char  *ptr;
char        *dest;
l_int32      nsrc, nsub1, nsub2, len, npre, loc;

    PROCNAME("stringReplaceSubstr");

    if (pfound) *pfound = 0;
    if (!src || !sub1 || !sub2)
        return (char *)ERROR_PTR("src, sub1, sub2 not all defined",
                                 procName, NULL);

    loc = (ploc) ? *ploc : 0;

        /* Identical strings: nothing would change, so just copy */
    if (!strcmp(sub1, sub2))
        return stringNew(src);

        /* The search start must lie within %src (the terminator
         * position is allowed, and simply finds nothing for a non-empty
         * %sub1).  Without this check, src + loc could point outside
         * the string. */
    nsrc = strlen(src);
    if (loc < 0 || loc > nsrc)
        return (char *)ERROR_PTR("invalid start location", procName, NULL);

    if ((ptr = strstr(src + loc, sub1)) == NULL)
        return stringNew(src);
    if (pfound) *pfound = 1;

    nsub1 = strlen(sub1);
    nsub2 = strlen(sub2);
    len = nsrc + nsub2 - nsub1;
    if ((dest = (char *)LEPT_CALLOC(len + 1, sizeof(char))) == NULL)
        return (char *)ERROR_PTR("dest not made", procName, NULL);

        /* Assemble: prefix of %src, then %sub2, then the remainder of
         * %src that follows the matched %sub1. */
    npre = ptr - src;
    memcpy(dest, src, npre);
    strcpy(dest + npre, sub2);
    strcpy(dest + npre + nsub2, ptr + nsub1);

        /* Report the position just past the inserted %sub2 */
    if (ploc) *ploc = npre + nsub2;
    return dest;
}


/*!
 * \brief   stringFindEachSubstr()
 *
 * \param[in]    src    input string; can be of zero length
 * \param[in]    sub    substring to be searched for
 * \return  dna of offsets where the sequence is found, or NULL if
 *              none are found or on error
 *
 *
 * Notes:
 *      (1) This finds every non-overlapping occurrence in %src of %sub.
 *          After it finds each match, it moves forward in %src by the length
 *          of %sub before continuing the search.  So for example,
 *          if you search for the sequence 'aa' in the data 'baaabbb',
 *          you find one match at position 1.

 * 
*/
L_DNA *
stringFindEachSubstr(const char  *src,
                     const char  *sub)
{
const l_uint8  *data, *seq;

    PROCNAME("stringFindEachSubstr");

    if (src == NULL || sub == NULL)
        return (L_DNA *)ERROR_PTR("src, sub not both defined", procName, NULL);

        /* Treat both strings as byte arrays, excluding their
         * terminators, and defer to the array search. */
    data = (const l_uint8 *)src;
    seq = (const l_uint8 *)sub;
    return arrayFindEachSequence(data, strlen(src), seq, strlen(sub));
}


/*!
 * \brief   stringFindSubstr()
 *
 * \param[in]    src    input string; can be of zero length
 * \param[in]    sub    substring to be searched for; must not be empty
 * \param[out]   ploc   [optional] location of substring in src
 * \return  1 if found; 0 if not found or on error
 *
 *
 * Notes:
 *      (1) This is a wrapper around strstr().  It finds the first
 *          instance of %sub in %src.  If the substring is not found
 *          and the location is returned, it has the value -1.
 *      (2) Both %src and %sub must be defined, and %sub must have
 *          length of at least 1.
 * 
*/
l_int32
stringFindSubstr(const char  *src,
                 const char  *sub,
                 l_int32     *ploc)
{
const char  *hit;

    PROCNAME("stringFindSubstr");

        /* Default the location to "not found" */
    if (ploc) *ploc = -1;
    if (src == NULL || sub == NULL)
        return ERROR_INT("src and sub not both defined", procName, 0);
    if (sub[0] == '\0')
        return ERROR_INT("substring length 0", procName, 0);
    if (src[0] == '\0')  /* an empty source cannot contain %sub */
        return 0;

    hit = strstr(src, sub);
    if (hit == NULL)  /* not found */
        return 0;

    if (ploc) *ploc = hit - src;
    return 1;
}


/*!
 * \brief   arrayReplaceEachSequence()
 *
 * \param[in]    datas       source byte array
 * \param[in]    dataslen    length of source data, in bytes
 * \param[in]    seq         subarray of bytes to find in source data
 * \param[in]    seqlen      length of subarray, in bytes
 * \param[in]    newseq      replacement subarray; can be null
 * \param[in]    newseqlen   length of replacement subarray, in bytes
 * \param[out]   pdatadlen   length of dest byte array, in bytes
 * \param[out]   pcount      [optional] the number of times that %seq
 *                           is found in %datas; 0 if not found
 * \return  datad   with all subarrays replaced (or removed)
 *
 *
 * Notes:
 *      (1) The byte arrays %datas, %seq and %newseq are not C strings,
 *          because they can contain null bytes.  Therefore, for each
 *          we must give the length of the array.
 *      (2) If %newseq == NULL, this just removes all instances of %seq.
 *          Otherwise, it replaces every non-overlapping occurrence of
 *          %seq in %datas with %newseq. A new array %datad and its
 *          size are returned.  See arrayFindEachSequence() for more
 *          details on finding non-overlapping occurrences.
 *      (3) If no instances of %seq are found, this returns a copy of %datas.
 *      (4) The returned %datad is null terminated.
 *      (5) Can use stringReplaceEachSubstr() if using C strings.
 * 
*/
l_uint8 *
arrayReplaceEachSequence(const l_uint8  *datas,
                         size_t          dataslen,
                         const l_uint8  *seq,
                         size_t          seqlen,
                         const l_uint8  *newseq,
                         size_t          newseqlen,
                         size_t         *pdatadlen,
                         l_int32        *pcount)
{
l_uint8  *datad;
size_t    newsize;
l_int32   nhits, spos, dpos, hit, ihit, skip, k;
L_DNA    *da;

    PROCNAME("arrayReplaceEachSequence");

    if (pcount) *pcount = 0;
    if (!datas || !seq)
        return (l_uint8 *)ERROR_PTR("datas & seq not both defined",
                                    procName, NULL);
    if (!pdatadlen)
        return (l_uint8 *)ERROR_PTR("&datadlen not defined", procName, NULL);
    *pdatadlen = 0;

        /* Locate every instance of the sequence.  With no instances,
         * the result is whatever l_binaryCopy() makes of %datas. */
    da = arrayFindEachSequence(datas, dataslen, seq, seqlen);
    if (da == NULL) {
        *pdatadlen = dataslen;
        return l_binaryCopy(datas, dataslen);
    }

        /* Size the output for %nhits substitutions, with 4 spare bytes
         * beyond the data so that it stays null terminated. */
    nhits = l_dnaGetCount(da);
    if (pcount) *pcount = nhits;
    if (!newseq) newseqlen = 0;
    newsize = dataslen + nhits * (newseqlen - seqlen) + 4;
    datad = (l_uint8 *)LEPT_CALLOC(newsize, sizeof(l_uint8));
    if (datad == NULL) {
        l_dnaDestroy(&da);
        return (l_uint8 *)ERROR_PTR("datad not made", procName, NULL);
    }

        /* Walk the source.  %hit is the source offset of the next
         * instance to be replaced; everything else is copied through. */
    l_dnaGetIValue(da, 0, &hit);
    for (spos = 0, dpos = 0, ihit = 0; spos < dataslen; spos++) {
        if (spos != hit) {
            datad[dpos++] = datas[spos];
            continue;
        }

            /* At an instance: fetch the offset of the following one,
             * if any, and determine how much of the source to drop. */
        ihit++;
        if (ihit < nhits) {
            l_dnaGetIValue(da, ihit, &hit);
            skip = L_MIN(seqlen, hit - spos);  /* amount to remove */
        } else {
            skip = seqlen;
        }
        spos += skip - 1;  /* the loop increment supplies the last step */

        if (newseq) {  /* add new sequence to datad */
            for (k = 0; k < newseqlen; k++)
                datad[dpos++] = newseq[k];
        }
    }

    *pdatadlen = dpos;
    l_dnaDestroy(&da);
    return datad;
}


/*!
 * \brief   arrayFindEachSequence()
 *
 * \param[in]    data       byte array
 * \param[in]    datalen    length of data, in bytes
 * \param[in]    sequence   subarray of bytes to find in data
 * \param[in]    seqlen     length of sequence, in bytes
 * \return  dna of offsets where the sequence is found, or NULL if
 *              none are found or on error
 *
 *
 * Notes:
 *      (1) The byte arrays %data and %sequence are not C strings,
 *          because they can contain null bytes.  Therefore, for each
 *          we must give the length of the array.
 *      (2) This finds every non-overlapping occurrence in %data of %sequence.
 *          After it finds each match, it moves forward by the length
 *          of the sequence before continuing the search.  So for example,
 *          if you search for the sequence 'aa' in the data 'baaabbb',
 *          you find one match at position 1.
 * 
*/
L_DNA *
arrayFindEachSequence(const l_uint8  *data,
                      size_t          datalen,
                      const l_uint8  *sequence,
                      size_t          seqlen)
{
l_int32  searchpos, reloffset, absoffset, hit;
L_DNA   *da;

    PROCNAME("arrayFindEachSequence");

    if (!data || !sequence)
        return (L_DNA *)ERROR_PTR("data & sequence not both defined",
                                  procName, NULL);

        /* Search the remaining tail of %data repeatedly.  Each hit is
         * stored as an offset from the start of %data, and the search
         * resumes just past the end of that hit. */
    da = l_dnaCreate(0);
    searchpos = 0;
    do {
        arrayFindSequence(data + searchpos, datalen - searchpos, sequence,
                          seqlen, &reloffset, &hit);
        if (hit == FALSE)
            break;

        absoffset = searchpos + reloffset;
        l_dnaAddNumber(da, absoffset);
        searchpos = absoffset + seqlen;
    } while (searchpos < datalen);

        /* An empty result is reported as NULL */
    if (l_dnaGetCount(da) == 0)
        l_dnaDestroy(&da);
    return da;
}


/*!
 * \brief   arrayFindSequence()
 *
 * \param[in]    data       byte array
 * \param[in]    datalen    length of data, in bytes
 * \param[in]    sequence   subarray of bytes to find in data
 * \param[in]    seqlen     length of sequence, in bytes
 * \param[out]   poffset    offset from beginning of
 *                          data where the sequence begins
 * \param[out]   pfound     1 if sequence is found; 0 otherwise
 * \return  0 if OK, 1 on error
 *
 * Notes:
 *      (1) The byte arrays 'data' and 'sequence' are not C strings,
 *          because they can contain null bytes.  Therefore, for each
 *          we must give the length of the array.
 *      (2) This searches for the first occurrence in %data of %sequence,
 *          which consists of %seqlen bytes.  The parameter %seqlen
 *          must not exceed the actual length of the %sequence byte array.
 *      (3) If the sequence is not found, the offset will be 0, so you
 *          must check %found.
 * 
*/
l_ok
arrayFindSequence(const l_uint8  *data,
                  size_t          datalen,
                  const l_uint8  *sequence,
                  size_t          seqlen,
                  l_int32        *poffset,
                  l_int32        *pfound)
{
size_t  i, nstart;

    PROCNAME("arrayFindSequence");

    if (poffset) *poffset = 0;
    if (pfound) *pfound = FALSE;
    if (!data || !sequence)
        return ERROR_INT("data & sequence not both defined", procName, 1);
    if (!poffset || !pfound)
        return ERROR_INT("&offset and &found not defined", procName, 1);

        /* An empty sequence, or one longer than the data, never matches.
         * Deciding this up front avoids computing (datalen - seqlen + 1)
         * with unsigned wraparound.  It is "not found", not an error. */
    if (seqlen == 0 || seqlen > datalen)
        return 0;

        /* Try each start position at which the whole sequence still fits.
         * NOTE(review): the offset is returned as l_int32, so an offset
         * above the l_int32 range cannot be represented. */
    nstart = datalen - seqlen + 1;
    for (i = 0; i < nstart; i++) {
        if (memcmp(data + i, sequence, seqlen) == 0) {
            *poffset = (l_int32)i;
            *pfound = TRUE;
            break;
        }
    }

    return 0;
}


/*--------------------------------------------------------------------*
 *                             Safe realloc                           *
 *--------------------------------------------------------------------*/
/*!
 * \brief   reallocNew()
 *
 * \param[in,out]  pindata    nulls indata before reallocing
 * \param[in]      oldsize    size of input data to be copied, in bytes
 * \param[in]      newsize    size of buffer to be reallocated in bytes
 * \return  ptr to new data, or NULL on error
 *
 *  Action: !N.B. (3) and (4)!
 *      (1) Allocates memory, initialized to 0
 *      (2) Copies as much of the input data as possible
 *          to the new block, truncating the copy if necessary
 *      (3) Frees the input data
 *      (4) Zeroes the input data ptr
 *
 * Notes:
 *      (1) If newsize == 0, frees input data and nulls ptr
 *      (2) If input data is null, only callocs new memory
 *      (3) This differs from realloc in that it always allocates
 *          new memory (if newsize > 0) and initializes it to 0,
 *          it requires the amount of old data to be copied,
 *          and it takes the address of the input ptr and
 *          nulls the handle.
 * 
*/ void * reallocNew(void **pindata, size_t oldsize, size_t newsize) { size_t minsize; void *indata; void *newdata; PROCNAME("reallocNew"); if (!pindata) return ERROR_PTR("input data not defined", procName, NULL); indata = *pindata; if (newsize == 0) { /* nonstandard usage */ if (indata) { LEPT_FREE(indata); *pindata = NULL; } return NULL; } if (!indata) { /* nonstandard usage */ if ((newdata = (void *)LEPT_CALLOC(1, newsize)) == NULL) return ERROR_PTR("newdata not made", procName, NULL); return newdata; } /* Standard usage */ if ((newdata = (void *)LEPT_CALLOC(1, newsize)) == NULL) return ERROR_PTR("newdata not made", procName, NULL); minsize = L_MIN(oldsize, newsize); memcpy(newdata, indata, minsize); LEPT_FREE(indata); *pindata = NULL; return newdata; } /*--------------------------------------------------------------------* * Read and write between file and memory * *--------------------------------------------------------------------*/ /*! * \brief l_binaryRead() * * \param[in] filename * \param[out] pnbytes number of bytes read * \return data, or NULL on error */ l_uint8 * l_binaryRead(const char *filename, size_t *pnbytes) { l_uint8 *data; FILE *fp; PROCNAME("l_binaryRead"); if (!pnbytes) return (l_uint8 *)ERROR_PTR("pnbytes not defined", procName, NULL); *pnbytes = 0; if (!filename) return (l_uint8 *)ERROR_PTR("filename not defined", procName, NULL); if ((fp = fopenReadStream(filename)) == NULL) return (l_uint8 *)ERROR_PTR("file stream not opened", procName, NULL); data = l_binaryReadStream(fp, pnbytes); fclose(fp); return data; } /*! * \brief l_binaryReadStream() * * \param[in] fp file stream opened to read; can be stdin * \param[out] pnbytes number of bytes read * \return null-terminated array, or NULL on error; reading 0 bytes * is not an error * *
 * Notes:
 *      (1) The returned array is terminated with a null byte so that it can
 *          be used to read ascii data from a file into a proper C string.
 *      (2) This can be used to capture data that is piped in via stdin,
 *          because it does not require seeking within the file.
 *      (3) For example, you can read an image from stdin into memory
 *          using shell redirection, with one of these shell commands:
 * \code
 *             cat <imagefile> | readprog
 *             readprog < <imagefile>
 * \endcode
 *          where readprog is:
 * \code
 *             l_uint8 *data = l_binaryReadStream(stdin, &nbytes);
 *             Pix *pix = pixReadMem(data, nbytes);
 * \endcode
 * 
*/
l_uint8 *
l_binaryReadStream(FILE    *fp,
                   size_t  *pnbytes)
{
l_uint8    *data;
l_int32     seekable, navail, nadd, nread;
L_BBUFFER  *bb;

    PROCNAME("l_binaryReadStream");

    if (!pnbytes)
        return (l_uint8 *)ERROR_PTR("&nbytes not defined", procName, NULL);
    *pnbytes = 0;
    if (!fp)
        return (l_uint8 *)ERROR_PTR("fp not defined", procName, NULL);

        /* Treat the stream as seekable when ftell() reports position 0.
         * In that case l_binaryReadSelectStream() can determine the
         * size of the data in advance and read it in one pass. */
    seekable = (ftell(fp) == 0) ? 1 : 0;
    if (seekable)
        return l_binaryReadSelectStream(fp, 0, 0, pnbytes);

        /* Otherwise use the bbuffer to grow memory as needed while
         * reading in 4096 byte chunks. */
    if ((bb = bbufferCreate(NULL, 4096)) == NULL)
        return (l_uint8 *)ERROR_PTR("bb not made", procName, NULL);
    while (1) {
            /* Guarantee room for a full chunk before each fread();
             * a failed extension must stop the read, or fread()
             * would write past the end of the array. */
        navail = bb->nalloc - bb->n;
        if (navail < 4096) {
            nadd = L_MAX(bb->nalloc, 4096);
            if (bbufferExtendArray(bb, nadd)) {
                bbufferDestroy(&bb);
                return (l_uint8 *)ERROR_PTR("bb array not extended",
                                            procName, NULL);
            }
        }
        nread = fread((void *)(bb->array + bb->n), 1, 4096, fp);
        bb->n += nread;
        if (nread != 4096) break;  /* short read: end of data or error */
    }

        /* Copy the data to a new array sized for the data, because
         * the bbuffer array can be nearly twice the size we need.
         * The extra zeroed byte null-terminates the result. */
    if ((data = (l_uint8 *)LEPT_CALLOC(bb->n + 1, sizeof(l_uint8))) != NULL) {
        memcpy(data, bb->array, bb->n);
        *pnbytes = bb->n;
    } else {
        L_ERROR("calloc fail for data\n", procName);
    }

    bbufferDestroy(&bb);
    return data;
}


/*!
 * \brief   l_binaryReadSelect()
 *
 * \param[in]    filename
 * \param[in]    start      first byte to read
 * \param[in]    nbytes     number of bytes to read; use 0 to read to end of file
 * \param[out]   pnread     number of bytes actually read
 * \return  data, or NULL on error
 *
 * Notes:
 *      (1) The returned array is terminated with a null byte so that it can
 *          be used to read ascii data from a file into a proper C string.
 * 
*/
l_uint8 *
l_binaryReadSelect(const char  *filename,
                   size_t       start,
                   size_t       nbytes,
                   size_t      *pnread)
{
l_uint8  *segment;
FILE     *stream;

    PROCNAME("l_binaryReadSelect");

    if (!pnread)
        return (l_uint8 *)ERROR_PTR("pnread not defined", procName, NULL);
    *pnread = 0;
    if (!filename)
        return (l_uint8 *)ERROR_PTR("filename not defined", procName, NULL);

    stream = fopenReadStream(filename);
    if (stream == NULL)
        return (l_uint8 *)ERROR_PTR("file stream not opened", procName, NULL);

        /* Read the requested range, then close the stream regardless
         * of the outcome. */
    segment = l_binaryReadSelectStream(stream, start, nbytes, pnread);
    fclose(stream);
    return segment;
}


/*!
 * \brief   l_binaryReadSelectStream()
 *
 * \param[in]    fp         file stream
 * \param[in]    start      first byte to read
 * \param[in]    nbytes     number of bytes to read; use 0 to read to end of file
 * \param[out]   pnread     number of bytes actually read
 * \return  null-terminated array, or NULL on error; reading 0 bytes
 *              is not an error
 *
 * Notes:
 *      (1) The returned array is terminated with a null byte so that it can
 *          be used to read ascii data from a file into a proper C string.
 *          If the file to be read is empty and %start == 0, an array
 *          with a single null byte is returned.
 *      (2) Side effect: the stream pointer is re-positioned to the
 *          beginning of the file.
 * 
*/
l_uint8 *
l_binaryReadSelectStream(FILE    *fp,
                         size_t   start,
                         size_t   nbytes,
                         size_t  *pnread)
{
long      endpos;
l_uint8  *data;
size_t    bytesleft, bytestoread, nread, filebytes;

    PROCNAME("l_binaryReadSelectStream");

    if (!pnread)
        return (l_uint8 *)ERROR_PTR("&nread not defined", procName, NULL);
    *pnread = 0;
    if (!fp)
        return (l_uint8 *)ERROR_PTR("stream not defined", procName, NULL);

        /* Find the file size by seeking to the end.  A failure here
         * must be caught: ftell() returns -1 on error, and storing
         * that in a size_t would produce an enormous bogus size. */
    if (fseek(fp, 0, SEEK_END) != 0)
        return (l_uint8 *)ERROR_PTR("seek to end of stream failed",
                                    procName, NULL);
    endpos = ftell(fp);
    fseek(fp, 0, SEEK_SET);
    if (endpos < 0)
        return (l_uint8 *)ERROR_PTR("stream size not determined",
                                    procName, NULL);
    filebytes = (size_t)endpos;

        /* Verify and adjust the parameters if necessary */
    if (start > filebytes) {
        L_ERROR("start = %zu but filebytes = %zu\n", procName,
                start, filebytes);
        return NULL;
    }
    if (filebytes == 0)  /* start == 0; nothing to read; return null byte */
        return (l_uint8 *)LEPT_CALLOC(1, 1);
    bytesleft = filebytes - start;  /* >= 0; is 0 when start == filebytes */
    if (nbytes == 0) nbytes = bytesleft;
    bytestoread = (bytesleft >= nbytes) ? nbytes : bytesleft;

        /* Read the data; the extra zeroed byte null-terminates it */
    if ((data = (l_uint8 *)LEPT_CALLOC(1, bytestoread + 1)) == NULL)
        return (l_uint8 *)ERROR_PTR("calloc fail for data", procName, NULL);
    fseek(fp, start, SEEK_SET);
    nread = fread(data, 1, bytestoread, fp);
    if (nbytes != nread)
        L_INFO("%zu bytes requested; %zu bytes read\n", procName,
               nbytes, nread);
    *pnread = nread;

        /* Side effect: leave the stream at the beginning of the file */
    fseek(fp, 0, SEEK_SET);
    return data;
}


/*!
* \brief l_binaryWrite() * * \param[in] filename output file * \param[in] operation "w" for write; "a" for append * \param[in] data binary data to be written * \param[in] nbytes size of data array * \return 0 if OK; 1 on error */ l_ok l_binaryWrite(const char *filename, const char *operation, const void *data, size_t nbytes) { char actualOperation[20]; FILE *fp; PROCNAME("l_binaryWrite"); if (!filename) return ERROR_INT("filename not defined", procName, 1); if (!operation) return ERROR_INT("operation not defined", procName, 1); if (!data) return ERROR_INT("data not defined", procName, 1); if (nbytes <= 0) return ERROR_INT("nbytes must be > 0", procName, 1); if (strcmp(operation, "w") && strcmp(operation, "a")) return ERROR_INT("operation not one of {'w','a'}", procName, 1); /* The 'b' flag to fopen() is ignored for all POSIX * conforming systems. However, Windows needs the 'b' flag. */ stringCopy(actualOperation, operation, 2); stringCat(actualOperation, 20, "b"); if ((fp = fopenWriteStream(filename, actualOperation)) == NULL) return ERROR_INT("stream not opened", procName, 1); fwrite(data, 1, nbytes, fp); fclose(fp); return 0; } /*! * \brief nbytesInFile() * * \param[in] filename * \return nbytes in file; 0 on error */ size_t nbytesInFile(const char *filename) { size_t nbytes; FILE *fp; PROCNAME("nbytesInFile"); if (!filename) return ERROR_INT("filename not defined", procName, 0); if ((fp = fopenReadStream(filename)) == NULL) return ERROR_INT("stream not opened", procName, 0); nbytes = fnbytesInFile(fp); fclose(fp); return nbytes; } /*! 
* \brief fnbytesInFile() * * \param[in] fp file stream * \return nbytes in file; 0 on error */ size_t fnbytesInFile(FILE *fp) { l_int64 pos, nbytes; PROCNAME("fnbytesInFile"); if (!fp) return ERROR_INT("stream not open", procName, 0); pos = ftell(fp); /* initial position */ if (pos < 0) return ERROR_INT("seek position must be > 0", procName, 0); fseek(fp, 0, SEEK_END); /* EOF */ nbytes = ftell(fp); if (nbytes < 0) return ERROR_INT("nbytes is < 0", procName, 0); fseek(fp, pos, SEEK_SET); /* back to initial position */ return nbytes; } /*--------------------------------------------------------------------* * Copy and compare in memory * *--------------------------------------------------------------------*/ /*! * \brief l_binaryCopy() * * \param[in] datas * \param[in] size of data array * \return datad on heap, or NULL on error * *
 * Notes:
 *      (1) We add 4 bytes to the zeroed output because in some cases
 *          (e.g., string handling) it is important to have the data
 *          be null terminated.  This guarantees that after the memcpy,
 *          the result is automatically null terminated.
 * 
*/
l_uint8 *
l_binaryCopy(const l_uint8  *datas,
             size_t          size)
{
l_uint8  *copy;

    PROCNAME("l_binaryCopy");

    if (!datas)
        return (l_uint8 *)ERROR_PTR("datas not defined", procName, NULL);

        /* Four extra zeroed bytes follow the copied data, so the
         * result is always null terminated. */
    copy = (l_uint8 *)LEPT_CALLOC(size + 4, sizeof(l_uint8));
    if (copy == NULL)
        return (l_uint8 *)ERROR_PTR("datad not made", procName, NULL);

    memcpy(copy, datas, size);
    return copy;
}


/*!
 * \brief   l_binaryCompare()
 *
 * \param[in]    data1
 * \param[in]    size1    of data1
 * \param[in]    data2
 * \param[in]    size2    of data2
 * \param[out]   psame    (1 if the same, 0 if different)
 * \return  0 if OK, 1 on error
 *
 * Notes:
 *      (1) This can also be used to compare C strings str1 and str2.
 *          If the string lengths are not known, use strlen():
 *            l_binaryCompare((l_uint8 *)str1, strlen(str1),
 *                            (l_uint8 *)str2, strlen(str2), &same);
 * 
*/ l_ok l_binaryCompare(const l_uint8 *data1, size_t size1, const l_uint8 *data2, size_t size2, l_int32 *psame) { l_int32 i; PROCNAME("l_binaryCompare"); if (!psame) return ERROR_INT("&same not defined", procName, 1); *psame = FALSE; if (!data1 || !data2) return ERROR_INT("data1 and data2 not both defined", procName, 1); if (size1 != size2) return 0; for (i = 0; i < size1; i++) { if (data1[i] != data2[i]) return 0; } *psame = TRUE; return 0; } /*--------------------------------------------------------------------* * File copy operations * *--------------------------------------------------------------------*/ /*! * \brief fileCopy() * * \param[in] srcfile copy from this file * \param[in] newfile copy to this file * \return 0 if OK, 1 on error */ l_ok fileCopy(const char *srcfile, const char *newfile) { l_int32 ret; size_t nbytes; l_uint8 *data; PROCNAME("fileCopy"); if (!srcfile) return ERROR_INT("srcfile not defined", procName, 1); if (!newfile) return ERROR_INT("newfile not defined", procName, 1); if ((data = l_binaryRead(srcfile, &nbytes)) == NULL) return ERROR_INT("data not returned", procName, 1); ret = l_binaryWrite(newfile, "w", data, nbytes); LEPT_FREE(data); return ret; } /*! * \brief fileConcatenate() * * \param[in] srcfile append data from this file * \param[in] destfile add data to this file * \return 0 if OK, 1 on error */ l_ok fileConcatenate(const char *srcfile, const char *destfile) { size_t nbytes; l_uint8 *data; PROCNAME("fileConcatenate"); if (!srcfile) return ERROR_INT("srcfile not defined", procName, 1); if (!destfile) return ERROR_INT("destfile not defined", procName, 1); data = l_binaryRead(srcfile, &nbytes); l_binaryWrite(destfile, "a", data, nbytes); LEPT_FREE(data); return 0; } /*! 
* \brief fileAppendString() * * \param[in] filename * \param[in] str string to append to file * \return 0 if OK, 1 on error */ l_ok fileAppendString(const char *filename, const char *str) { FILE *fp; PROCNAME("fileAppendString"); if (!filename) return ERROR_INT("filename not defined", procName, 1); if (!str) return ERROR_INT("str not defined", procName, 1); if ((fp = fopenWriteStream(filename, "a")) == NULL) return ERROR_INT("stream not opened", procName, 1); fprintf(fp, "%s", str); fclose(fp); return 0; } /*--------------------------------------------------------------------* * File split operations * *--------------------------------------------------------------------*/ /*! * \brief fileSplitLinesUniform() * * \param[in] filename input file * \param[in] n number of output files (>= 1) * \param[in] save_empty 1 to save empty lines; 0 to remove them * \param[in] rootpath root pathname of output files * \param[in] ext output extension, including the '.'; can be NULL * \return 0 if OK, 1 on error * *
 * Notes:
 *      (1) This splits an input text file into %n files with roughly
 *          equal numbers of text lines in each file.
 *      (2) If %save_empty == 1, empty lines are included, and concatenation
 *          of the text in the split files will be identical to the original.
 *      (3) The output filenames are in the form:
 *               <rootpath>_N<ext>, N = 0, ... n-1
 *      (4) This handles the temp directory pathname conversion on windows:
 *              /tmp  ==>  [Windows Temp directory]
 *      (5) Files can also be sharded into sets of lines by the program 'split':
 *              split -n l/<n> <filename>
 *          Using 'split', the resulting files have approximately equal
 *          numbers of bytes, rather than equal numbers of lines.
 * 
*/
l_ok
fileSplitLinesUniform(const char  *filename,
                      l_int32      n,
                      l_int32      save_empty,
                      const char  *rootpath,
                      const char  *ext)
{
l_int32   i, totlines, nlines, index, ret;
size_t    nbytes;
l_uint8  *data;
char     *str;
char      outname[512];
NUMA     *na;
SARRAY   *sa;

    PROCNAME("fileSplitLinesUniform");

    if (!filename)
        return ERROR_INT("filename not defined", procName, 1);
    if (!rootpath)
        return ERROR_INT("rootpath not defined", procName, 1);
    if (n <= 0)
        return ERROR_INT("n must be > 0", procName, 1);
    if (save_empty != 0 && save_empty != 1)
        return ERROR_INT("save_empty not 0 or 1", procName, 1);

        /* Make sarray of lines; the newlines are stripped off */
    if ((data = l_binaryRead(filename, &nbytes)) == NULL)
        return ERROR_INT("data not read", procName, 1);
    sa = sarrayCreateLinesFromString((const char *)data, save_empty);
    LEPT_FREE(data);
    if (!sa)
        return ERROR_INT("sa not made", procName, 1);
    totlines = sarrayGetCount(sa);
    if (n > totlines) {
        sarrayDestroy(&sa);
        L_ERROR("num files = %d > num lines = %d\n", procName, n, totlines);
        return 1;
    }

        /* Decide how many lines go into each of the n files */
    if ((na = numaGetUniformBinSizes(totlines, n)) == NULL) {
        sarrayDestroy(&sa);
        return ERROR_INT("na not made", procName, 1);
    }

        /* Write n sets of lines to n files, adding the newlines back.
         * The files are numbered from 0.  A failure on one file is
         * reported but does not stop the remaining files from being
         * written. */
    ret = 0;
    index = 0;
    for (i = 0; i < n; i++) {
        if (ext == NULL)
            snprintf(outname, sizeof(outname), "%s_%d", rootpath, i);
        else
            snprintf(outname, sizeof(outname), "%s_%d%s", rootpath, i, ext);
        numaGetIValue(na, i, &nlines);
        str = sarrayToStringRange(sa, index, nlines, 1);  /* add newlines */
        if (!str) {
            L_ERROR("string not made for file %d\n", procName, i);
            ret = 1;
        } else {
            if (l_binaryWrite(outname, "w", str, strlen(str)))
                ret = 1;
            LEPT_FREE(str);
        }
        index += nlines;
    }
    numaDestroy(&na);
    sarrayDestroy(&sa);
    return ret;
}


/*--------------------------------------------------------------------*
 *          Multi-platform functions for opening file streams         *
 *--------------------------------------------------------------------*/
/*!
 * \brief   fopenReadStream()
 *
 * \param[in]    filename
 * \return  stream, or NULL on error
 *
 * Notes:
 *      (1) This should be used whenever you want to run fopen() to
 *          read from a stream.  Never call fopen() directly.
 *      (2) This handles the temp directory pathname conversion on windows:
 *              /tmp  ==>  [Windows Temp directory]
 * 
*/
FILE *
fopenReadStream(const char  *filename)
{
char  *fname;
char  *tail = NULL;
FILE  *fp = NULL;

    PROCNAME("fopenReadStream");

    if (!filename)
        return (FILE *)ERROR_PTR("filename not defined", procName, NULL);

        /* Try input filename.  fopen() must not be given a NULL name,
         * so a failed pathname conversion falls through to the
         * local lookup. */
    if ((fname = genPathname(filename, NULL)) != NULL) {
        fp = fopen(fname, "rb");
        LEPT_FREE(fname);
    }
    if (fp) return fp;

        /* Else, strip directory and try locally */
    splitPathAtDirectory(filename, NULL, &tail);
    if (!tail)
        return (FILE *)ERROR_PTR("tail not found", procName, NULL);
    fp = fopen(tail, "rb");
    LEPT_FREE(tail);

    if (!fp)
        return (FILE *)ERROR_PTR("file not found", procName, NULL);
    return fp;
}


/*!
 * \brief   fopenWriteStream()
 *
 * \param[in]    filename
 * \param[in]    modestring
 * \return  stream, or NULL on error
 *
 * Notes:
 *      (1) This should be used whenever you want to run fopen() to
 *          write or append to a stream.  Never call fopen() directly.
 *      (2) This handles the temp directory pathname conversion on windows:
 *              /tmp  ==>  [Windows Temp directory]
 * 
*/
FILE *
fopenWriteStream(const char  *filename,
                 const char  *modestring)
{
char  *fname;
FILE  *fp;

    PROCNAME("fopenWriteStream");

    if (!filename)
        return (FILE *)ERROR_PTR("filename not defined", procName, NULL);
    if (!modestring)
        return (FILE *)ERROR_PTR("modestring not defined", procName, NULL);

        /* Convert the pathname; fopen() must not be given a NULL name */
    if ((fname = genPathname(filename, NULL)) == NULL)
        return (FILE *)ERROR_PTR("pathname not made", procName, NULL);
    fp = fopen(fname, modestring);
    LEPT_FREE(fname);
    if (!fp)
        return (FILE *)ERROR_PTR("stream not opened", procName, NULL);
    return fp;
}


/*!
 * \brief   fopenReadFromMemory()
 *
 * \param[in]    data, size
 * \return  file stream, or NULL on error
 *
 * Notes:
 *      (1) Work-around if fmemopen() not available.
 *      (2) Windows tmpfile() writes into the root C:\ directory, which
 *          requires admin privileges.  This also works around that.
 * 
*/ FILE * fopenReadFromMemory(const l_uint8 *data, size_t size) { FILE *fp; PROCNAME("fopenReadFromMemory"); if (!data) return (FILE *)ERROR_PTR("data not defined", procName, NULL); #if HAVE_FMEMOPEN if ((fp = fmemopen((void *)data, size, "rb")) == NULL) return (FILE *)ERROR_PTR("stream not opened", procName, NULL); #else /* write to tmp file */ L_INFO("work-around: writing to a temp file\n", procName); #ifdef _WIN32 if ((fp = fopenWriteWinTempfile()) == NULL) return (FILE *)ERROR_PTR("tmpfile stream not opened", procName, NULL); #else if ((fp = tmpfile()) == NULL) return (FILE *)ERROR_PTR("tmpfile stream not opened", procName, NULL); #endif /* _WIN32 */ fwrite(data, 1, size, fp); rewind(fp); #endif /* HAVE_FMEMOPEN */ return fp; } /*--------------------------------------------------------------------* * Opening a windows tmpfile for writing * *--------------------------------------------------------------------*/ /*! * \brief fopenWriteWinTempfile() * * \return file stream, or NULL on error * *
 * Notes:
 *      (1) The Windows version of tmpfile() writes into the root
 *          C:\ directory, which requires admin privileges.  This
 *          function provides an alternative implementation.
 * 
*/
FILE *
fopenWriteWinTempfile(void)
{
#ifdef _WIN32
l_int32  handle;
FILE    *fp;
char    *filename;

    PROCNAME("fopenWriteWinTempfile");

    if ((filename = l_makeTempFilename()) == NULL) {
        L_ERROR("l_makeTempFilename failed, %s\n", procName, strerror(errno));
        return NULL;
    }

        /* Create the file as a short-lived temporary in binary mode;
         * the name is not needed once the file has been opened. */
    handle = _open(filename, _O_CREAT | _O_RDWR | _O_SHORT_LIVED |
                   _O_TEMPORARY | _O_BINARY, _S_IREAD | _S_IWRITE);
    lept_free(filename);
    if (handle == -1) {
        L_ERROR("_open failed, %s\n", procName, strerror(errno));
        return NULL;
    }

        /* Wrap the descriptor in a stream.  If that fails, the
         * descriptor must be closed here or it would leak; the
         * error is logged first so that errno is still valid. */
    if ((fp = _fdopen(handle, "r+b")) == NULL) {
        L_ERROR("_fdopen failed, %s\n", procName, strerror(errno));
        _close(handle);
        return NULL;
    }

    return fp;
#else
    return NULL;
#endif  /*  _WIN32 */
}


/*--------------------------------------------------------------------*
 *       Multi-platform functions that avoid C-runtime boundary       *
 *             crossing for applications with Windows DLLs            *
 *--------------------------------------------------------------------*/
/*
 *  Problems arise when pointers to streams and data are passed
 *  between two Windows DLLs that have been generated with different
 *  C runtimes.  To avoid this, leptonica provides wrappers for
 *  several C library calls.
 */
/*!
 * \brief   lept_fopen()
 *
 * \param[in]    filename
 * \param[in]    mode       same as for fopen(); e.g., "rb"
 * \return  stream or NULL on error
 *
 * Notes:
 *      (1) This must be used by any application that passes
 *          a file handle to a leptonica Windows DLL.
 * 
*/
FILE *
lept_fopen(const char  *filename,
           const char  *mode)
{
    PROCNAME("lept_fopen");

    if (!filename)
        return (FILE *)ERROR_PTR("filename not defined", procName, NULL);
    if (!mode)
        return (FILE *)ERROR_PTR("mode not defined", procName, NULL);

        /* Any mode string containing 'r' is opened through the read
         * path; every other mode goes through the write path. */
    if (!stringFindSubstr(mode, "r", NULL))
        return fopenWriteStream(filename, mode);
    return fopenReadStream(filename);
}


/*!
 * \brief   lept_fclose()
 *
 * \param[in]    fp    file stream
 * \return  0 if OK, 1 on error
 *
 * Notes:
 *      (1) This should be used by any application that accepts
 *          a file handle generated by a leptonica Windows DLL.
 * 
*/
l_ok
lept_fclose(FILE *fp)
{
l_int32  status;

    PROCNAME("lept_fclose");

    if (!fp)
        return ERROR_INT("stream not defined", procName, 1);

        /* The result of fclose() is passed straight back */
    status = fclose(fp);
    return status;
}


/*!
 * \brief   lept_calloc()
 *
 * \param[in]    nmemb    number of members
 * \param[in]    size     of each member
 * \return  void ptr, or NULL on error
 *
 * Notes:
 *      (1) For safety with windows DLLs, this can be used in conjunction
 *          with lept_free() to avoid C-runtime boundary problems.
 *          Just use these two functions throughout your application.
 * 
*/
void *
lept_calloc(size_t  nmemb,
            size_t  size)
{
        /* Both arguments are unsigned, so only 0 is rejected */
    if (nmemb == 0 || size == 0)
        return NULL;
    return LEPT_CALLOC(nmemb, size);
}


/*!
 * \brief   lept_free()
 *
 * \param[in]    ptr
 *
 * Notes:
 *      (1) This should be used by any application that accepts
 *          heap data allocated by a leptonica Windows DLL.
 * 
*/
void
lept_free(void *ptr)
{
        /* A null pointer is ignored */
    if (ptr)
        LEPT_FREE(ptr);
}


/*--------------------------------------------------------------------*
 *                Multi-platform file system operations               *
 *         [ These only write to /tmp or its subdirectories ]         *
 *--------------------------------------------------------------------*/
/*!
 * \brief   lept_mkdir()
 *
 * \param[in]    subdir    of /tmp or its equivalent on Windows
 * \return  0 on success, non-zero on failure
 *
 * Notes:
 *      (1) %subdir is a partial path that can consist of one or more
 *          directories.
 *      (2) This makes any subdirectories of /tmp that are required.
 *      (3) The root temp directory is:
 *            /tmp    (unix)  [default]
 *            [Temp]  (windows)
 * 
*/ l_int32 lept_mkdir(const char *subdir) { char *dir, *tmpdir; l_int32 i, n; l_int32 ret = 0; SARRAY *sa; #ifdef _WIN32 l_uint32 attributes; #endif /* _WIN32 */ PROCNAME("lept_mkdir"); if (!LeptDebugOK) { L_INFO("making named temp subdirectory %s is disabled\n", procName, subdir); return 0; } if (!subdir) return ERROR_INT("subdir not defined", procName, 1); if ((strlen(subdir) == 0) || (subdir[0] == '.') || (subdir[0] == '/')) return ERROR_INT("subdir not an actual subdirectory", procName, 1); sa = sarrayCreate(0); sarraySplitString(sa, subdir, "/"); n = sarrayGetCount(sa); dir = genPathname("/tmp", NULL); /* Make sure the tmp directory exists */ #ifndef _WIN32 ret = mkdir(dir, 0777); #else attributes = GetFileAttributes(dir); if (attributes == INVALID_FILE_ATTRIBUTES) ret = (CreateDirectory(dir, NULL) ? 0 : 1); #endif /* Make all the subdirectories */ for (i = 0; i < n; i++) { tmpdir = pathJoin(dir, sarrayGetString(sa, i, L_NOCOPY)); #ifndef _WIN32 ret += mkdir(tmpdir, 0777); #else if (CreateDirectory(tmpdir, NULL) == 0) ret += (GetLastError () != ERROR_ALREADY_EXISTS); #endif LEPT_FREE(dir); dir = tmpdir; } LEPT_FREE(dir); sarrayDestroy(&sa); if (ret > 0) L_ERROR("failure to create %d directories\n", procName, ret); return ret; } /*! * \brief lept_rmdir() * * \param[in] subdir of /tmp or its equivalent on Windows * \return 0 on success, non-zero on failure * *
 * Notes:
 *      (1) %subdir is a partial path that can consist of one or more
 *          directories.
 *      (2) This removes all files from the specified subdirectory of
 *          the root temp directory:
 *            /tmp    (unix)
 *            [Temp]  (windows)
 *          and then removes the subdirectory.
 *      (3) The combination
 *            lept_rmdir(subdir);
 *            lept_mkdir(subdir);
 *          is guaranteed to give you an empty subdirectory.
 * 
*/ l_int32 lept_rmdir(const char *subdir) { char *dir, *fname, *fullname; l_int32 exists, ret, i, nfiles; SARRAY *sa; #ifdef _WIN32 char *newpath; #else char *realdir; #endif /* _WIN32 */ PROCNAME("lept_rmdir"); if (!subdir) return ERROR_INT("subdir not defined", procName, 1); if ((strlen(subdir) == 0) || (subdir[0] == '.') || (subdir[0] == '/')) return ERROR_INT("subdir not an actual subdirectory", procName, 1); /* Find the temp subdirectory */ dir = pathJoin("/tmp", subdir); if (!dir) return ERROR_INT("directory name not made", procName, 1); lept_direxists(dir, &exists); if (!exists) { /* fail silently */ LEPT_FREE(dir); return 0; } /* List all the files in that directory */ if ((sa = getFilenamesInDirectory(dir)) == NULL) { L_ERROR("directory %s does not exist!\n", procName, dir); LEPT_FREE(dir); return 1; } nfiles = sarrayGetCount(sa); for (i = 0; i < nfiles; i++) { fname = sarrayGetString(sa, i, L_NOCOPY); fullname = genPathname(dir, fname); remove(fullname); LEPT_FREE(fullname); } #ifndef _WIN32 realdir = genPathname("/tmp", subdir); ret = rmdir(realdir); LEPT_FREE(realdir); #else newpath = genPathname(dir, NULL); ret = (RemoveDirectory(newpath) ? 0 : 1); LEPT_FREE(newpath); #endif /* !_WIN32 */ sarrayDestroy(&sa); LEPT_FREE(dir); return ret; } /*! * \brief lept_direxists() * * \param[in] dir * \param[out] pexists 1 if it exists; 0 otherwise * \return void * *
 * Notes:
 *      (1) Always use unix pathname separators.
 *      (2) By calling genPathname(), if the pathname begins with "/tmp"
 *          this does an automatic directory translation on windows
 *          to a path in the windows [Temp] directory:
 *             "/tmp"  ==>  [Temp] (windows)
 * 
*/
void
lept_direxists(const char  *dir,
               l_int32     *pexists)
{
char  *syspath;

    if (!pexists) return;
    *pexists = 0;
    if (!dir) return;

    syspath = genPathname(dir, NULL);
    if (syspath == NULL)
        return;

        /* The path counts as existing only when it can be queried
         * and it is a directory. */
#ifndef _WIN32
    {
    struct stat  info;
    if (stat(syspath, &info) != -1 && S_ISDIR(info.st_mode))
        *pexists = 1;
    }
#else  /* _WIN32 */
    {
    l_uint32  attrs = GetFileAttributes(syspath);
    if (attrs != INVALID_FILE_ATTRIBUTES &&
        (attrs & FILE_ATTRIBUTE_DIRECTORY))
        *pexists = 1;
    }
#endif  /* _WIN32 */

    LEPT_FREE(syspath);
}


/*!
 * \brief   lept_rm_match()
 *
 * \param[in]    subdir    [optional] if NULL, the removed files are in /tmp
 * \param[in]    substr    [optional] pattern to match in filename
 * \return  0 on success, non-zero on failure
 *
 * Notes:
 *      (1) This removes the matched files in /tmp or a subdirectory of /tmp.
 *          Use NULL for %subdir if the files are in /tmp.
 *      (2) If %substr == NULL, this removes all files in the directory.
 *          If %substr == "" (empty), this removes no files.
 *          If both %subdir == NULL and %substr == NULL, this removes
 *          all files in /tmp.
 *      (3) Use unix pathname separators.
 *      (4) By calling genPathname(), if the pathname begins with "/tmp"
 *          this does an automatic directory translation on windows
 *          to a path in the windows [Temp] directory:
 *             "/tmp"  ==>  [Temp] (windows)
 *      (5) Error conditions:
 *            * returns -1 if the directory is not found
 *            * returns the number of files (> 0) that it was unable to remove.
 * 
*/
l_int32
lept_rm_match(const char  *subdir,
              const char  *substr)
{
char    *path, *fname;
char     tempdir[256];
l_int32  i, n, ret;
SARRAY  *sa;

    PROCNAME("lept_rm_match");

        /* Build the directory name.  If that fails, %tempdir may not
         * hold a valid string, so stop here as lept_rm() does. */
    if (makeTempDirname(tempdir, sizeof(tempdir), subdir))
        return ERROR_INT("temp dirname not made", procName, -1);

    if ((sa = getSortedPathnamesInDirectory(tempdir, substr, 0, 0)) == NULL)
        return ERROR_INT("sa not made", procName, -1);
    n = sarrayGetCount(sa);
    if (n == 0) {
        L_WARNING("no matching files found\n", procName);
        sarrayDestroy(&sa);
        return 0;
    }

        /* Remove each matching file; count the ones that fail */
    ret = 0;
    for (i = 0; i < n; i++) {
        fname = sarrayGetString(sa, i, L_NOCOPY);
        path = genPathname(fname, NULL);
        if (lept_rmfile(path) != 0) {
            L_ERROR("failed to remove %s\n", procName, path);
            ret++;
        }
        LEPT_FREE(path);
    }
    sarrayDestroy(&sa);
    return ret;
}


/*!
 * \brief   lept_rm()
 *
 * \param[in]    subdir    [optional] subdir of '/tmp'; can be NULL
 * \param[in]    tail      filename without the directory
 * \return  0 on success, non-zero on failure
 *
 * Notes:
 *      (1) By calling genPathname(), this does an automatic directory
 *          translation on windows to a path in the windows [Temp] directory:
 *             "/tmp/..."  ==>  [Temp]/... (windows)
 * 
*/
l_int32
lept_rm(const char  *subdir,
        const char  *tail)
{
    char     tmpdir[256];
    char    *fullpath;
    l_int32  status;

    PROCNAME("lept_rm");

    if (tail == NULL || tail[0] == '\0')
        return ERROR_INT("tail undefined or empty", procName, 1);
    if (makeTempDirname(tmpdir, sizeof(tmpdir), subdir) != 0)
        return ERROR_INT("temp dirname not made", procName, 1);

        /* Build the actual path of the file and remove it */
    fullpath = genPathname(tmpdir, tail);
    status = lept_rmfile(fullpath);
    LEPT_FREE(fullpath);
    return status;
}


/*!
 * \brief   lept_rmfile()
 *
 * \param[in]    filepath     full path to file including the directory
 * \return  0 on success, non-zero on failure
 *
 *
 * Notes:
 *      (1) This removes the named file.
 *      (2) Use unix pathname separators.
 *      (3) There is no name translation.
 *      (4) Unlike the other lept_* functions in this section, this can remove
 *          any file -- it is not restricted to files that are in /tmp or a
 *          subdirectory of it.
 * 
*/
l_int32
lept_rmfile(const char  *filepath)
{
    PROCNAME("lept_rmfile");

    if (filepath == NULL || filepath[0] == '\0')
        return ERROR_INT("filepath undefined or empty", procName, 1);

#ifndef _WIN32
    return remove(filepath);
#else
        /* Set attributes to allow deletion of read-only files */
    SetFileAttributes(filepath, FILE_ATTRIBUTE_NORMAL);
    if (DeleteFile(filepath))
        return 0;
    return 1;
#endif  /* !_WIN32 */
}


/*!
 * \brief   lept_mv()
 *
 * \param[in]    srcfile
 * \param[in]    newdir     [optional]; can be NULL
 * \param[in]    newtail    [optional]; can be NULL
 * \param[out]   pnewpath   [optional] of actual path; can be NULL
 * \return  0 on success, non-zero on failure
 *
 *
 * Notes:
 *      (1) This moves %srcfile to /tmp or to a subdirectory of /tmp.
 *      (2) %srcfile can either be a full path or relative to the
 *          current directory.
 *      (3) %newdir can either specify an existing subdirectory of /tmp
 *          or can be NULL.  In the latter case, the file will be written
 *          into /tmp.
 *      (4) %newtail can either specify a filename tail or, if NULL,
 *          the filename is taken from src-tail, the tail of %srcfile.
 *      (5) For debugging, the computed newpath can be returned.  It must
 *          be freed by the caller.
 *      (6) Reminders:
 *          (a) specify files using unix pathnames
 *          (b) for windows, translates
 *                 /tmp  ==>  [Temp]
 *              where [Temp] is the windows temp directory
 *      (7) Examples:
 *          * newdir = NULL,    newtail = NULL    ==> /tmp/src-tail
 *          * newdir = NULL,    newtail = abc     ==> /tmp/abc
 *          * newdir = def/ghi, newtail = NULL    ==> /tmp/def/ghi/src-tail
 *          * newdir = def/ghi, newtail = abc     ==> /tmp/def/ghi/abc
 * 
*/ l_int32 lept_mv(const char *srcfile, const char *newdir, const char *newtail, char **pnewpath) { char *srcpath, *newpath, *dir, *srctail; char newtemp[256]; l_int32 ret; PROCNAME("lept_mv"); if (!srcfile) return ERROR_INT("srcfile not defined", procName, 1); /* Require output pathname to be in /tmp/ or a subdirectory */ if (makeTempDirname(newtemp, sizeof(newtemp), newdir) == 1) return ERROR_INT("newdir not NULL or a subdir of /tmp", procName, 1); /* Get canonical src pathname */ splitPathAtDirectory(srcfile, &dir, &srctail); #ifndef _WIN32 srcpath = pathJoin(dir, srctail); LEPT_FREE(dir); /* Generate output pathname */ if (!newtail || newtail[0] == '\0') newpath = pathJoin(newtemp, srctail); else newpath = pathJoin(newtemp, newtail); LEPT_FREE(srctail); /* Overwrite any existing file at 'newpath' */ ret = fileCopy(srcpath, newpath); if (!ret) { /* and remove srcfile */ char *realpath = genPathname(srcpath, NULL); remove(realpath); LEPT_FREE(realpath); } #else srcpath = genPathname(dir, srctail); LEPT_FREE(dir); /* Generate output pathname */ if (!newtail || newtail[0] == '\0') newpath = genPathname(newtemp, srctail); else newpath = genPathname(newtemp, newtail); LEPT_FREE(srctail); /* Overwrite any existing file at 'newpath' */ ret = MoveFileEx(srcpath, newpath, MOVEFILE_COPY_ALLOWED | MOVEFILE_REPLACE_EXISTING) ? 0 : 1; #endif /* ! _WIN32 */ LEPT_FREE(srcpath); if (pnewpath) *pnewpath = newpath; else LEPT_FREE(newpath); return ret; } /*! * \brief lept_cp() * * \param[in] srcfile * \param[in] newdir [optional]; can be NULL * \param[in] newtail [optional]; can be NULL * \param[out] pnewpath [optional] of actual path; can be NULL * \return 0 on success, non-zero on failure * *
 * Notes:
 *      (1) This copies %srcfile to /tmp or to a subdirectory of /tmp.
 *      (2) %srcfile can either be a full path or relative to the
 *          current directory.
 *      (3) %newdir can either specify an existing subdirectory of /tmp,
 *          or can be NULL.  In the latter case, the file will be written
 *          into /tmp.
 *      (4) %newtail can either specify a filename tail or, if NULL,
 *          the filename is taken from src-tail, the tail of %srcfile.
 *      (5) For debugging, the computed newpath can be returned.  It must
 *          be freed by the caller.
 *      (6) Reminders:
 *          (a) specify files using unix pathnames
 *          (b) for windows, translates
 *                 /tmp  ==>  [Temp]
 *              where [Temp] is the windows temp directory
 *      (7) Examples:
 *          * newdir = NULL,    newtail = NULL    ==> /tmp/src-tail
 *          * newdir = NULL,    newtail = abc     ==> /tmp/abc
 *          * newdir = def/ghi, newtail = NULL    ==> /tmp/def/ghi/src-tail
 *          * newdir = def/ghi, newtail = abc     ==> /tmp/def/ghi/abc
 *
 * 
*/ l_int32 lept_cp(const char *srcfile, const char *newdir, const char *newtail, char **pnewpath) { char *srcpath, *newpath, *dir, *srctail; char newtemp[256]; l_int32 ret; PROCNAME("lept_cp"); if (!srcfile) return ERROR_INT("srcfile not defined", procName, 1); /* Require output pathname to be in /tmp or a subdirectory */ if (makeTempDirname(newtemp, sizeof(newtemp), newdir) == 1) return ERROR_INT("newdir not NULL or a subdir of /tmp", procName, 1); /* Get canonical src pathname */ splitPathAtDirectory(srcfile, &dir, &srctail); #ifndef _WIN32 srcpath = pathJoin(dir, srctail); LEPT_FREE(dir); /* Generate output pathname */ if (!newtail || newtail[0] == '\0') newpath = pathJoin(newtemp, srctail); else newpath = pathJoin(newtemp, newtail); LEPT_FREE(srctail); /* Overwrite any existing file at 'newpath' */ ret = fileCopy(srcpath, newpath); #else srcpath = genPathname(dir, srctail); LEPT_FREE(dir); /* Generate output pathname */ if (!newtail || newtail[0] == '\0') newpath = genPathname(newtemp, srctail); else newpath = genPathname(newtemp, newtail); LEPT_FREE(srctail); /* Overwrite any existing file at 'newpath' */ ret = CopyFile(srcpath, newpath, FALSE) ? 0 : 1; #endif /* !_WIN32 */ LEPT_FREE(srcpath); if (pnewpath) *pnewpath = newpath; else LEPT_FREE(newpath); return ret; } /*--------------------------------------------------------------------* * Special debug/test function for calling 'system' * *--------------------------------------------------------------------*/ #if defined(__APPLE__) #include "TargetConditionals.h" #endif /* __APPLE__ */ /*! * \brief callSystemDebug() * * \param[in] cmd command to be exec'd * \return void * *
 * Notes:
 *      (1) The C library 'system' call is only made through this function.
 *          It only works in debug/test mode, where the global variable
 *          LeptDebugOK == TRUE.  This variable is set to FALSE in the
 *          library as distributed, and calling this function will
 *          generate an error message.
 * 
*/ void callSystemDebug(const char *cmd) { l_int32 ret; PROCNAME("callSystemDebug"); if (!cmd) { L_ERROR("cmd not defined\n", procName); return; } if (LeptDebugOK == FALSE) { L_INFO("'system' calls are disabled\n", procName); return; } #if defined(__APPLE__) /* iOS 11 does not support system() */ #if TARGET_OS_OSX /* Mac OS X */ ret = system(cmd); #elif TARGET_OS_IPHONE || defined(OS_IOS) /* iOS */ L_ERROR("iOS 11 does not support system()\n", procName); #endif /* TARGET_OS_OSX */ #else /* ! __APPLE__ */ ret = system(cmd); #endif /* __APPLE__ */ } /*--------------------------------------------------------------------* * General file name operations * *--------------------------------------------------------------------*/ /*! * \brief splitPathAtDirectory() * * \param[in] pathname full path; can be a directory * \param[out] pdir [optional] root directory name of * input path, including trailing '/' * \param[out] ptail [optional] path tail, which is either * the file name within the root directory or * the last sub-directory in the path * \return 0 if OK, 1 on error * *
 * Notes:
 *      (1) If you only want the tail, input null for the root directory ptr.
 *      (2) If you only want the root directory name, input null for the
 *          tail ptr.
 *      (3) This function makes decisions based only on the lexical
 *          structure of the input.  Examples:
 *            /usr/tmp/abc.d  -->  dir: /usr/tmp/       tail: abc.d
 *            /usr/tmp/       -->  dir: /usr/tmp/       tail: [empty string]
 *            /usr/tmp        -->  dir: /usr/           tail: tmp
 *            abc.d           -->  dir: [empty string]  tail: abc.d
 *      (4) Consider the first example above: /usr/tmp/abc.d.
 *          Suppose you want the stem of the file, abc, without either
 *          the directory or the extension.  This can be extracted in two steps:
 *              splitPathAtDirectory("/usr/tmp/abc.d", NULL, &tail);
 *                   [sets tail: "abc.d"]
 *              splitPathAtExtension(tail, &basename, NULL);
 *                   [sets basename: "abc"]
 *      (5) The input can have either forward (unix) or backward (win)
 *          slash separators.  The output has unix separators.
 *          Note that Win32 pathname functions generally accept both
 *          slash forms, but the windows command line interpreter
 *          only accepts backward slashes, because forward slashes are
 *          used to demarcate switches (vs. dashes in unix).
 * 
*/
l_ok
splitPathAtDirectory(const char  *pathname,
                     char       **pdir,
                     char       **ptail)
{
    char  *copy, *sep;

    PROCNAME("splitPathAtDirectory");

    if (pdir == NULL && ptail == NULL)
        return ERROR_INT("null input for both strings", procName, 1);
    if (pdir != NULL) *pdir = NULL;
    if (ptail != NULL) *ptail = NULL;
    if (pathname == NULL)
        return ERROR_INT("pathname not defined", procName, 1);

        /* Work on a private copy with unix separators */
    copy = stringNew(pathname);
    convertSepCharsInPath(copy, UNIX_PATH_SEPCHAR);
    sep = strrchr(copy, '/');

    if (sep == NULL) {  /* no directory: the whole input is the tail */
        if (pdir != NULL)
            *pdir = stringNew("");
        if (ptail != NULL)
            *ptail = copy;
        else
            LEPT_FREE(copy);
        return 0;
    }

        /* The tail is everything after the last slash */
    if (ptail != NULL)
        *ptail = stringNew(sep + 1);
    if (pdir == NULL) {
        LEPT_FREE(copy);
        return 0;
    }

        /* Truncate the copy just after the last slash; the copy
         * becomes the returned directory */
    sep[1] = '\0';
    *pdir = copy;
    return 0;
}


/*!
 * \brief   splitPathAtExtension()
 *
 * \param[in]    pathname    full path; can be a directory
 * \param[out]   pbasename   [optional] pathname not including the
 *                           last dot and characters after that
 * \param[out]   pextension  [optional] path extension, which is
 *                           the last dot and the characters after it.  If
 *                           there is no extension, it returns the empty string
 * \return  0 if OK, 1 on error
 *
 *
 * Notes:
 *      (1) If you only want the extension, input null for the basename ptr.
 *      (2) If you only want the basename without extension, input null
 *          for the extension ptr.
 *      (3) This function makes decisions based only on the lexical
 *          structure of the input.  Examples:
 *            /usr/tmp/abc.jpg  -->  basename: /usr/tmp/abc    ext: .jpg
 *            /usr/tmp/.jpg     -->  basename: /usr/tmp/       ext: .jpg
 *            /usr/tmp.jpg/     -->  basename: /usr/tmp.jpg/   ext: [empty str]
 *            ./.jpg            -->  basename: ./              ext: .jpg
 *      (4) The input can have either forward (unix) or backward (win)
 *          slash separators.  The output has unix separators.
 *      (5) Note that basename, as used here, is different from the result
 *          of the unix program 'basename'.  Here, basename is the entire
 *          pathname up to a final extension and its preceding dot.
 * 
*/
l_ok
splitPathAtExtension(const char  *pathname,
                     char       **pbasename,
                     char       **pextension)
{
    char  *tail, *dir, *lastdot;

        /* The name here appears in every error message from this function */
    PROCNAME("splitPathAtExtension");

    if (!pbasename && !pextension)
        return ERROR_INT("null input for both strings", procName, 1);
    if (pbasename) *pbasename = NULL;
    if (pextension) *pextension = NULL;
    if (!pathname)
        return ERROR_INT("pathname not defined", procName, 1);

        /* Split out the directory first */
    dir = tail = NULL;
    splitPathAtDirectory(pathname, &dir, &tail);
    if (!dir || !tail) {  /* do not search a NULL string below */
        LEPT_FREE(dir);
        LEPT_FREE(tail);
        return ERROR_INT("pathname not split", procName, 1);
    }

        /* Then look for a "." in the tail part.
         * This way we ignore all "." in the directory. */
    if ((lastdot = strrchr(tail, '.'))) {
        if (pextension)
            *pextension = stringNew(lastdot);
        if (pbasename) {
            *lastdot = '\0';
            *pbasename = stringJoin(dir, tail);
        }
    } else {
        if (pextension)
            *pextension = stringNew("");
            /* Rebuild from the split parts rather than copying %pathname,
             * so that the basename has unix separators in this case too. */
        if (pbasename)
            *pbasename = stringJoin(dir, tail);
    }
    LEPT_FREE(dir);
    LEPT_FREE(tail);
    return 0;
}


/*!
 * \brief   pathJoin()
 *
 * \param[in]    dir     [optional] can be null
 * \param[in]    fname   [optional] can be null
 * \return  specially concatenated path, or NULL on error
 *
 *
 * Notes:
 *      (1) Use unix-style pathname separators ('/').
 *      (2) %fname can be the entire path, or part of the path containing
 *          at least one directory, or a tail without a directory, or NULL.
 *      (3) It produces a path that strips multiple slashes to a single
 *          slash, joins %dir and %fname by a slash, and has no trailing
 *          slashes (except in the cases where %dir == "/" and
 *          %fname == NULL, or v.v.).
 *      (4) If both %dir and %fname are null, produces an empty string.
 *      (5) Neither %dir nor %fname can begin with '..'.
 *      (6) The result is not canonicalized or tested for correctness:
 *          garbage in (e.g., /&%), garbage out.
 *      (7) Examples:
 *             //tmp// + //abc/  -->  /tmp/abc
 *             tmp/ + /abc/      -->  tmp/abc
 *             tmp/ + abc/       -->  tmp/abc
 *             /tmp/ + ///       -->  /tmp
 *             /tmp/ + NULL      -->  /tmp
 *             // + /abc//       -->  /abc
 *             // + NULL         -->  /
 *             NULL + /abc/def/  -->  /abc/def
 *             NULL + abc//      -->  abc
 *             NULL + //         -->  /
 *             NULL + NULL       -->  (empty string)
 *             "" + ""           -->  (empty string)
 *             "" + /            -->  /
 *             ".." + /etc/foo   -->  NULL
 *             /tmp + ".."       -->  NULL
 * 
*/
char *
pathJoin(const char  *dir,
         const char  *fname)
{
    const char  *sep = "/";
    char        *part, *joined;
    l_int32      idx, count, hasdir, hasname;
    size_t       nbytes;
    SARRAY      *dirparts, *nameparts;
    L_BYTEA     *buf;

    PROCNAME("pathJoin");

    if (dir == NULL && fname == NULL)
        return stringNew("");
    if (dir != NULL && strncmp(dir, "..", 2) == 0)
        return (char *)ERROR_PTR("dir starts with '..'", procName, NULL);
    if (fname != NULL && strncmp(fname, "..", 2) == 0)
        return (char *)ERROR_PTR("fname starts with '..'", procName, NULL);

    hasdir = (dir != NULL && dir[0] != '\0');
    hasname = (fname != NULL && fname[0] != '\0');
    dirparts = sarrayCreate(0);
    nameparts = sarrayCreate(0);
    buf = l_byteaCreate(4);

    if (hasdir) {
            /* Keep one leading slash; the split removes all slashes,
             * and each component is written back followed by a slash */
        if (dir[0] == '/')
            l_byteaAppendString(buf, sep);
        sarraySplitString(dirparts, dir, "/");
        count = sarrayGetCount(dirparts);
        for (idx = 0; idx < count; idx++) {
            part = sarrayGetString(dirparts, idx, L_NOCOPY);
            l_byteaAppendString(buf, part);
            l_byteaAppendString(buf, sep);
        }
    } else if (hasname && fname[0] == '/') {
            /* With %dir null or empty, the leading slash of %fname
             * is the one that is kept */
        l_byteaAppendString(buf, sep);
    }

    if (hasname) {
        sarraySplitString(nameparts, fname, "/");
        count = sarrayGetCount(nameparts);
        for (idx = 0; idx < count; idx++) {
            part = sarrayGetString(nameparts, idx, L_NOCOPY);
            l_byteaAppendString(buf, part);
            l_byteaAppendString(buf, sep);
        }
    }

        /* Drop the trailing slash, unless the path is just "/" */
    joined = (char *)l_byteaCopyData(buf, &nbytes);
    if (nbytes > 1 && joined[nbytes - 1] == '/')
        joined[nbytes - 1] = '\0';

    sarrayDestroy(&dirparts);
    sarrayDestroy(&nameparts);
    l_byteaDestroy(&buf);
    return joined;
}


/*!
 * \brief   appendSubdirs()
 *
 * \param[in]    basedir
 * \param[in]    subdirs
 * \return  concatenated full directory path without trailing slash,
 *          or NULL on error
 *
 *
 * Notes:
 *      (1) Use unix pathname separators
 *      (2) Allocates a new string:  [basedir]/[subdirs]
 * 
*/
char *
appendSubdirs(const char  *basedir,
              const char  *subdirs)
{
    char   *newdir;
    size_t  len1, len2, len3, len4;

    PROCNAME("appendSubdirs");

    if (!basedir || !subdirs)
        return (char *)ERROR_PTR("basedir and subdirs not both defined",
                                 procName, NULL);

        /* The buffer is zeroed, with room for the added separator */
    len1 = strlen(basedir);
    len2 = strlen(subdirs);
    len3 = len1 + len2 + 8;
    if ((newdir = (char *)LEPT_CALLOC(len3, 1)) == NULL)
        return (char *)ERROR_PTR("newdir not made", procName, NULL);
    stringCat(newdir, len3, basedir);

        /* Add '/' if necessary.  Test for an empty %basedir first, so
         * that the byte before the start of the buffer is never read. */
    if (len1 == 0 || newdir[len1 - 1] != '/')
        newdir[len1] = '/';
    if (subdirs[0] == '/')  /* add subdirs, stripping leading '/' */
        stringCat(newdir, len3, subdirs + 1);
    else
        stringCat(newdir, len3, subdirs);

        /* Strip trailing '/', but leave a bare "/" alone so that the
         * result is never reduced to an empty string. */
    len4 = strlen(newdir);
    if (len4 > 1 && newdir[len4 - 1] == '/')
        newdir[len4 - 1] = '\0';

    return newdir;
}


/*--------------------------------------------------------------------*
 *                     Special file name operations                   *
 *--------------------------------------------------------------------*/
/*!
 * \brief   convertSepCharsInPath()
 *
 * \param[in]    path
 * \param[in]    type   UNIX_PATH_SEPCHAR, WIN_PATH_SEPCHAR
 * \return  0 if OK, 1 on error
 *
 *
 * Notes:
 *      (1) In-place conversion.
 *      (2) Type is the resulting type:
 *            * UNIX_PATH_SEPCHAR:  '\\' ==> '/'
 *            * WIN_PATH_SEPCHAR:   '/' ==> '\\'
 *      (3) Virtually all path operations in leptonica use unix separators.
 *      (4) The backslash is a valid character in unix pathnames and should
 *          not be converted.  Each backslash needs to be escaped with a
 *          preceding backslash for the shell, but the actual filename
 *          does not include these escape characters.
 * 
*/
l_ok
convertSepCharsInPath(char    *path,
                      l_int32  type)
{
    char  *p;

    PROCNAME("convertSepCharsInPath");

    if (path == NULL)
        return ERROR_INT("path not defined", procName, 1);
    if (type != UNIX_PATH_SEPCHAR && type != WIN_PATH_SEPCHAR)
        return ERROR_INT("invalid type", procName, 1);

    if (type == WIN_PATH_SEPCHAR) {
            /* Every forward slash becomes a backslash */
        p = path;
        while ((p = strchr(p, '/')) != NULL)
            *p++ = '\\';
        return 0;
    }

        /* UNIX_PATH_SEPCHAR: backslashes are converted only on windows */
#ifdef _WIN32
    p = path;
    while ((p = strchr(p, '\\')) != NULL)
        *p++ = '/';
#endif  /* _WIN32 */
    return 0;
}


/*!
 * \brief   genPathname()
 *
 * \param[in]    dir     [optional] directory or full path name,
 *                       with or without the trailing '/'
 * \param[in]    fname   [optional] file name within a directory
 * \return  pathname either a directory or full path, or NULL on error
 *
 *
 * Notes:
 *      (1) This function generates actual paths in the following ways:
 *            * from two sub-parts (e.g., a directory and a file name).
 *            * from a single full path, placed in %dir, with
 *              %fname == NULL.
 *            * from the name of a file in the local directory placed in
 *              %fname, with %dir == NULL.
 *            * if in a "/tmp" directory and on windows, the windows
 *              temp directory is used.
 *      (2) On windows, if the root of %dir is '/tmp', this does a name
 *          translation:
 *             "/tmp"  ==>  [Temp] (windows)
 *          where [Temp] is the windows temp directory.
 *      (3) On unix, the TMPDIR variable is ignored.  No rewriting
 *          of temp directories is permitted.
 *      (4) There are four cases for the input:
 *          (a) %dir is a directory and %fname is defined: result is a full path
 *          (b) %dir is a directory and %fname is null: result is a directory
 *          (c) %dir is a full path and %fname is null: result is a full path
 *          (d) %dir is null or an empty string: start in the current dir;
 *              result is a full path
 *      (5) In all cases, the resulting pathname is not terminated with a slash
 *      (6) The caller is responsible for freeing the returned pathname.
 * 
*/ char * genPathname(const char *dir, const char *fname) { l_int32 is_win32 = FALSE; char *cdir, *pathout; l_int32 dirlen, namelen; size_t size; PROCNAME("genPathname"); if (!dir && !fname) return (char *)ERROR_PTR("no input", procName, NULL); /* Handle the case where we start from the current directory */ if (!dir || dir[0] == '\0') { if ((cdir = getcwd(NULL, 0)) == NULL) return (char *)ERROR_PTR("no current dir found", procName, NULL); } else { cdir = stringNew(dir); } /* Convert to unix path separators, and remove the trailing * slash in the directory, except when dir == "/" */ convertSepCharsInPath(cdir, UNIX_PATH_SEPCHAR); dirlen = strlen(cdir); if (cdir[dirlen - 1] == '/' && dirlen != 1) { cdir[dirlen - 1] = '\0'; dirlen--; } namelen = (fname) ? strlen(fname) : 0; size = dirlen + namelen + 256; if ((pathout = (char *)LEPT_CALLOC(size, sizeof(char))) == NULL) { LEPT_FREE(cdir); return (char *)ERROR_PTR("pathout not made", procName, NULL); } #ifdef _WIN32 is_win32 = TRUE; #endif /* _WIN32 */ /* First handle %dir (which may be a full pathname). * There is no path rewriting on unix, and on win32, we do not * rewrite unless the specified directory is /tmp or * a subdirectory of /tmp */ if (!is_win32 || dirlen < 4 || (dirlen == 4 && strncmp(cdir, "/tmp", 4) != 0) || /* not in "/tmp" */ (dirlen > 4 && strncmp(cdir, "/tmp/", 5) != 0)) { /* not in "/tmp/" */ stringCopy(pathout, cdir, dirlen); } else { /* Rewrite for win32 with "/tmp" specified for the directory. 
*/ #ifdef _WIN32 l_int32 tmpdirlen; char tmpdir[MAX_PATH]; GetTempPath(sizeof(tmpdir), tmpdir); /* get the windows temp dir */ tmpdirlen = strlen(tmpdir); if (tmpdirlen > 0 && tmpdir[tmpdirlen - 1] == '\\') { tmpdir[tmpdirlen - 1] = '\0'; /* trim the trailing '\' */ } tmpdirlen = strlen(tmpdir); stringCopy(pathout, tmpdir, tmpdirlen); /* Add the rest of cdir */ if (dirlen > 4) stringCat(pathout, size, cdir + 4); #endif /* _WIN32 */ } /* Now handle %fname */ if (fname && strlen(fname) > 0) { dirlen = strlen(pathout); pathout[dirlen] = '/'; stringCat(pathout, size, fname); } LEPT_FREE(cdir); return pathout; } /*! * \brief makeTempDirname() * * \param[in] result preallocated on stack or heap and passed in * \param[in] nbytes size of %result array, in bytes * \param[in] subdir [optional]; can be NULL or an empty string * \return 0 if OK, 1 on error * *
 * Notes:
 *      (1) This generates the directory path for output temp files,
 *          written into %result with unix separators.
 *      (2) Caller allocates %result, large enough to hold the path,
 *          which is:
 *            /tmp/%subdir       (unix, mac)
 *            [Temp]/%subdir     (windows, ios)
 *          where [Temp] is a path determined
 *             - on windows: by GetTempPath()
 *             - on ios: by confstr() (see man page)
 *          and %subdir is in general a set of nested subdirectories:
 *            dir1/dir2/.../dirN
 *          which in use would not typically exceed 2 levels.
 *      (3) Usage example:
 * \code
 *           char  result[256];
 *           makeTempDirname(result, sizeof(result), "lept/golden");
 * \endcode
 * 
*/ l_ok makeTempDirname(char *result, size_t nbytes, const char *subdir) { char *dir, *path; l_int32 ret = 0; size_t pathlen; PROCNAME("makeTempDirname"); if (!result) return ERROR_INT("result not defined", procName, 1); if (subdir && ((subdir[0] == '.') || (subdir[0] == '/'))) return ERROR_INT("subdir not an actual subdirectory", procName, 1); memset(result, 0, nbytes); #ifdef OS_IOS { size_t n = confstr(_CS_DARWIN_USER_TEMP_DIR, result, nbytes); if (n == 0) { L_ERROR("failed to find tmp dir, %s\n", procName, strerror(errno)); return 1; } else if (n > nbytes) { return ERROR_INT("result array too small for path\n", procName, 1); } dir = pathJoin(result, subdir); } #else dir = pathJoin("/tmp", subdir); #endif /* ~ OS_IOS */ #ifndef _WIN32 path = stringNew(dir); #else path = genPathname(dir, NULL); #endif /* ~ _WIN32 */ pathlen = strlen(path); if (pathlen < nbytes - 1) { stringCat(result, nbytes, path); } else { L_ERROR("result array too small for path\n", procName); ret = 1; } LEPT_FREE(dir); LEPT_FREE(path); return ret; } /*! * \brief modifyTrailingSlash() * * \param[in] path preallocated on stack or heap and passed in * \param[in] nbytes size of %path array, in bytes * \param[in] flag L_ADD_TRAIL_SLASH or L_REMOVE_TRAIL_SLASH * \return 0 if OK, 1 on error * *
 * Notes:
 *      (1) This carries out the requested action if necessary.
 * 
*/
l_ok
modifyTrailingSlash(char    *path,
                    size_t   nbytes,
                    l_int32  flag)
{
    char    lastchar;
    size_t  len;

    PROCNAME("modifyTrailingSlash");

    if (!path)
        return ERROR_INT("path not defined", procName, 1);
    if (flag != L_ADD_TRAIL_SLASH && flag != L_REMOVE_TRAIL_SLASH)
        return ERROR_INT("invalid flag", procName, 1);

        /* An empty path has no last character; use the terminator
         * rather than reading the byte before the array. */
    len = strlen(path);
    lastchar = (len > 0) ? path[len - 1] : '\0';

        /* The size test is written as an addition so that it cannot
         * wrap around when %nbytes is less than 2. */
    if (flag == L_ADD_TRAIL_SLASH && lastchar != '/' && len + 2 < nbytes) {
        path[len] = '/';
        path[len + 1] = '\0';
    } else if (flag == L_REMOVE_TRAIL_SLASH && lastchar == '/') {
        path[len - 1] = '\0';
    }
    return 0;
}


/*!
 * \brief   l_makeTempFilename()
 *
 * \return  fname : heap allocated filename; returns NULL on failure.
 *
 *
 * Notes:
 *      (1) On unix, this makes a filename of the form
 *               "/tmp/lept.XXXXXX",
 *          where each X is a random character.
 *      (2) On windows, this makes a filename of the form
 *               "/[Temp]/lp.XXXXXX".
 *      (3) On all systems, this fails if the file is not writable.
 *      (4) Safest usage is to write to a subdirectory in debug code.
 *      (5) The returned filename must be freed by the caller, using lept_free.
 *      (6) The tail of the filename has a '.', so that cygwin interprets
 *          the file as having an extension.  Otherwise, cygwin assumes it
 *          is an executable and appends ".exe" to the filename.
 *      (7) On unix, whenever possible use tmpfile() instead.  tmpfile()
 *          hides the file name, returns a stream opened for write,
 *          and deletes the temp file when the stream is closed.
 * 
*/
char *
l_makeTempFilename(void)
{
    char  tmpdir[240];

    PROCNAME("l_makeTempFilename");

    if (makeTempDirname(tmpdir, sizeof(tmpdir), NULL) == 1)
        return (char *)ERROR_PTR("failed to make dirname", procName, NULL);

#ifndef _WIN32
    {
            /* mkstemp() fills in the X's and creates the file; only
             * the name is wanted, so the descriptor is closed */
        char    *name = stringConcatNew(tmpdir, "/lept.XXXXXX", NULL);
        l_int32  fd = mkstemp(name);
        if (fd != -1) {
            close(fd);
            return name;
        }
        LEPT_FREE(name);
        return (char *)ERROR_PTR("mkstemp failed", procName, NULL);
    }
#else
    {
        char   name[MAX_PATH];
        FILE  *fp;
        if (!GetTempFileName(tmpdir, "lp.", 0, name))
            return (char *)ERROR_PTR("GetTempFileName failed", procName, NULL);
            /* Check that the file can be opened for writing */
        fp = fopen(name, "wb");
        if (fp == NULL)
            return (char *)ERROR_PTR("file cannot be written to",
                                     procName, NULL);
        fclose(fp);
        return stringNew(name);
    }
#endif  /*  ~ _WIN32 */
}


/*!
 * \brief   extractNumberFromFilename()
 *
 * \param[in]    fname
 * \param[in]    numpre    number of characters before the digits to be found
 * \param[in]    numpost   number of characters after the digits to be found
 * \return  num number embedded in the filename; -1 on error or if
 *                   not found
 *
 *
 * Notes:
 *      (1) The number is to be found in the basename, which is the
 *          filename without either the directory or the last extension.
 *      (2) When a number is found, it is non-negative.  If no number
 *          is found, this returns -1, without an error message.  The
 *          caller needs to check.
 * 
*/
l_int32
extractNumberFromFilename(const char  *fname,
                          l_int32      numpre,
                          l_int32      numpost)
{
    char    *tail, *basename;
    l_int32  len, nret, num;

    PROCNAME("extractNumberFromFilename");

    if (!fname)
        return ERROR_INT("fname not defined", procName, -1);
        /* Negative counts would pass the size test below and then
         * index outside of the basename. */
    if (numpre < 0 || numpost < 0)
        return ERROR_INT("numpre and numpost must be >= 0", procName, -1);

        /* Reduce to the basename: no directory and no last extension */
    tail = basename = NULL;
    splitPathAtDirectory(fname, NULL, &tail);
    splitPathAtExtension(tail, &basename, NULL);
    LEPT_FREE(tail);
    if (!basename)
        return ERROR_INT("basename not made", procName, -1);

        /* Same test as (numpre + numpost > len - 1), arranged so that
         * the sum of the two counts is never formed. */
    len = strlen(basename);
    if (numpre > len - 1 - numpost) {
        LEPT_FREE(basename);
        return ERROR_INT("numpre + numpost too big", procName, -1);
    }

        /* Cut off the last %numpost characters, then scan for an
         * integer starting after the first %numpre characters */
    basename[len - numpost] = '\0';
    nret = sscanf(basename + numpre, "%d", &num);
    LEPT_FREE(basename);

    if (nret == 1)
        return num;
    else
        return -1;  /* not found */
}