/*====================================================================* - Copyright (C) 2001 Leptonica. All rights reserved. - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions - are met: - 1. Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - 2. Redistributions in binary form must reproduce the above - copyright notice, this list of conditions and the following - disclaimer in the documentation and/or other materials - provided with the distribution. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL ANY - CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, - EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, - PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR - PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY - OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING - NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *====================================================================*/ /*! * \file sarray2.c *

 *
 *      Sort
 *          SARRAY     *sarraySort()
 *          SARRAY     *sarraySortByIndex()
 *          l_int32     stringCompareLexical()
 *
 *      Set operations using aset (rbtree)
 *          SARRAY     *sarrayUnionByAset()
 *          SARRAY     *sarrayRemoveDupsByAset()
 *          SARRAY     *sarrayIntersectionByAset()
 *          L_ASET     *l_asetCreateFromSarray()
 *
 *      Set operations using hashing (dnahash)
 *          l_int32     sarrayRemoveDupsByHash()
 *          SARRAY     *sarrayIntersectionByHash()
 *          l_int32     sarrayFindStringByHash()
 *          L_DNAHASH  *l_dnaHashCreateFromSarray()
 *
 *      Miscellaneous operations
 *          SARRAY     *sarrayGenerateIntegers()
 *          l_int32     sarrayLookupCSKV()
 *
 *
 * We have two implementations of set operations on an array of strings:
 *
 *   (1) Using an underlying tree (rbtree)
 *       This uses a good 64 bit hashing function for the key,
 *       that is not expected to have hash collisions (and we do
 *       not test for them).  The tree is built up of the hash
 *       values, and if the hash is found in the tree, it is
 *       assumed that the string has already been found.
 *
 *   (2) Using an underlying hashing of the keys (dnahash)
 *       This uses a fast 64 bit hashing function for the key,
 *       which is then hashed into a bucket (a dna in a dnaHash).
 *       Because hash collisions can occur, the index into the
 *       sarray for the string that gave rise to that key is stored,
 *       and the dna (bucket) is traversed, using the stored indices
 *       to determine if that string had already been seen.
 *
 *

*/ #ifdef HAVE_CONFIG_H #include #endif /* HAVE_CONFIG_H */ #include #include "allheaders.h" /*----------------------------------------------------------------------* * Sort * *----------------------------------------------------------------------*/ /*! * \brief sarraySort() * * \param[in] saout output sarray; can be NULL or equal to sain * \param[in] sain input sarray * \param[in] sortorder L_SORT_INCREASING or L_SORT_DECREASING * \return saout output sarray, sorted by ascii value, or NULL on error * *

 * Notes:
 *      (1) Set saout = sain for in-place; otherwise, set naout = NULL.
 *      (2) Shell sort, modified from K&R, 2nd edition, p.62.
 *          Slow but simple O(n logn) sort.
 *

*/ SARRAY * sarraySort(SARRAY *saout, SARRAY *sain, l_int32 sortorder) { char **array; char *tmp; l_int32 n, i, j, gap; PROCNAME("sarraySort"); if (!sain) return (SARRAY *)ERROR_PTR("sain not defined", procName, NULL); /* Make saout if necessary; otherwise do in-place */ if (!saout) saout = sarrayCopy(sain); else if (sain != saout) return (SARRAY *)ERROR_PTR("invalid: not in-place", procName, NULL); array = saout->array; /* operate directly on the array */ n = sarrayGetCount(saout); /* Shell sort */ for (gap = n/2; gap > 0; gap = gap / 2) { for (i = gap; i < n; i++) { for (j = i - gap; j >= 0; j -= gap) { if ((sortorder == L_SORT_INCREASING && stringCompareLexical(array[j], array[j + gap])) || (sortorder == L_SORT_DECREASING && stringCompareLexical(array[j + gap], array[j]))) { tmp = array[j]; array[j] = array[j + gap]; array[j + gap] = tmp; } } } } return saout; } /*! * \brief sarraySortByIndex() * * \param[in] sain * \param[in] naindex na that maps from the new sarray to the input sarray * \return saout sorted, or NULL on error */ SARRAY * sarraySortByIndex(SARRAY *sain, NUMA *naindex) { char *str; l_int32 i, n, index; SARRAY *saout; PROCNAME("sarraySortByIndex"); if (!sain) return (SARRAY *)ERROR_PTR("sain not defined", procName, NULL); if (!naindex) return (SARRAY *)ERROR_PTR("naindex not defined", procName, NULL); n = sarrayGetCount(sain); saout = sarrayCreate(n); for (i = 0; i < n; i++) { numaGetIValue(naindex, i, &index); str = sarrayGetString(sain, index, L_COPY); sarrayAddString(saout, str, L_INSERT); } return saout; } /*! * \brief stringCompareLexical() * * \param[in] str1 * \param[in] str2 * \return 1 if str1 > str2 lexically; 0 otherwise * *

 * Notes:
 *      (1) If the lexical values are identical, return a 0, to
 *          indicate that no swapping is required to sort the strings.
 *

*/ l_int32 stringCompareLexical(const char *str1, const char *str2) { l_int32 i, len1, len2, len; PROCNAME("sarrayCompareLexical"); if (!str1) return ERROR_INT("str1 not defined", procName, 1); if (!str2) return ERROR_INT("str2 not defined", procName, 1); len1 = strlen(str1); len2 = strlen(str2); len = L_MIN(len1, len2); for (i = 0; i < len; i++) { if (str1[i] == str2[i]) continue; if (str1[i] > str2[i]) return 1; else return 0; } if (len1 > len2) return 1; else return 0; } /*----------------------------------------------------------------------* * Set operations using aset (rbtree) * *----------------------------------------------------------------------*/ /*! * \brief sarrayUnionByAset() * * \param[in] sa1, sa2 * \return sad with the union of the string set, or NULL on error * *

 * Notes:
 *      (1) Duplicates are removed from the concatenation of the two arrays.
 *      (2) The key for each string is a 64-bit hash.
 *      (2) Algorithm: Concatenate the two sarrays.  Then build a set,
 *          using hashed strings as keys.  As the set is built, first do
 *          a find; if not found, add the key to the set and add the string
 *          to the output sarray.  This is O(nlogn).
 *

*/ SARRAY * sarrayUnionByAset(SARRAY *sa1, SARRAY *sa2) { SARRAY *sa3, *sad; PROCNAME("sarrayUnionByAset"); if (!sa1) return (SARRAY *)ERROR_PTR("sa1 not defined", procName, NULL); if (!sa2) return (SARRAY *)ERROR_PTR("sa2 not defined", procName, NULL); /* Join */ sa3 = sarrayCopy(sa1); sarrayJoin(sa3, sa2); /* Eliminate duplicates */ sad = sarrayRemoveDupsByAset(sa3); sarrayDestroy(&sa3); return sad; } /*! * \brief sarrayRemoveDupsByAset() * * \param[in] sas * \return sad with duplicates removed, or NULL on error * *

 * Notes:
 *      (1) This is O(nlogn), considerably slower than
 *          sarrayRemoveDupsByHash() for large string arrays.
 *      (2) The key for each string is a 64-bit hash.
 *      (3) Build a set, using hashed strings as keys.  As the set is
 *          built, first do a find; if not found, add the key to the
 *          set and add the string to the output sarray.
 *

*/ SARRAY * sarrayRemoveDupsByAset(SARRAY *sas) { char *str; l_int32 i, n; l_uint64 hash; L_ASET *set; RB_TYPE key; SARRAY *sad; PROCNAME("sarrayRemoveDupsByAset"); if (!sas) return (SARRAY *)ERROR_PTR("sas not defined", procName, NULL); set = l_asetCreate(L_UINT_TYPE); sad = sarrayCreate(0); n = sarrayGetCount(sas); for (i = 0; i < n; i++) { str = sarrayGetString(sas, i, L_NOCOPY); l_hashStringToUint64(str, &hash); key.utype = hash; if (!l_asetFind(set, key)) { sarrayAddString(sad, str, L_COPY); l_asetInsert(set, key); } } l_asetDestroy(&set); return sad; } /*! * \brief sarrayIntersectionByAset() * * \param[in] sa1, sa2 * \return sad with the intersection of the string set, or NULL on error * *

 * Notes:
 *      (1) Algorithm: put the larger sarray into a set, using the string
 *          hashes as the key values.  Then run through the smaller sarray,
 *          building an output sarray and a second set from the strings
 *          in the larger array: if a string is in the first set but
 *          not in the second, add the string to the output sarray and hash
 *          it into the second set.  The second set is required to make
 *          sure only one instance of each string is put into the output sarray.
 *          This is O(mlogn), {m,n} = sizes of {smaller,larger} input arrays.
 *

*/ SARRAY * sarrayIntersectionByAset(SARRAY *sa1, SARRAY *sa2) { char *str; l_int32 n1, n2, i, n; l_uint64 hash; L_ASET *set1, *set2; RB_TYPE key; SARRAY *sa_small, *sa_big, *sad; PROCNAME("sarrayIntersectionByAset"); if (!sa1) return (SARRAY *)ERROR_PTR("sa1 not defined", procName, NULL); if (!sa2) return (SARRAY *)ERROR_PTR("sa2 not defined", procName, NULL); /* Put the elements of the biggest array into a set */ n1 = sarrayGetCount(sa1); n2 = sarrayGetCount(sa2); sa_small = (n1 < n2) ? sa1 : sa2; /* do not destroy sa_small */ sa_big = (n1 < n2) ? sa2 : sa1; /* do not destroy sa_big */ set1 = l_asetCreateFromSarray(sa_big); /* Build up the intersection of strings */ sad = sarrayCreate(0); n = sarrayGetCount(sa_small); set2 = l_asetCreate(L_UINT_TYPE); for (i = 0; i < n; i++) { str = sarrayGetString(sa_small, i, L_NOCOPY); l_hashStringToUint64(str, &hash); key.utype = hash; if (l_asetFind(set1, key) && !l_asetFind(set2, key)) { sarrayAddString(sad, str, L_COPY); l_asetInsert(set2, key); } } l_asetDestroy(&set1); l_asetDestroy(&set2); return sad; } /*! * \brief l_asetCreateFromSarray() * * \param[in] sa * \return set using a string hash into a uint64 as the key */ L_ASET * l_asetCreateFromSarray(SARRAY *sa) { char *str; l_int32 i, n; l_uint64 hash; L_ASET *set; RB_TYPE key; PROCNAME("l_asetCreateFromSarray"); if (!sa) return (L_ASET *)ERROR_PTR("sa not defined", procName, NULL); set = l_asetCreate(L_UINT_TYPE); n = sarrayGetCount(sa); for (i = 0; i < n; i++) { str = sarrayGetString(sa, i, L_NOCOPY); l_hashStringToUint64(str, &hash); key.utype = hash; l_asetInsert(set, key); } return set; } /*----------------------------------------------------------------------* * Set operations using hashing (dnahash) * *----------------------------------------------------------------------*/ /*! * \brief sarrayRemoveDupsByHash() * * \param[in] sas * \param[out] psad unique set of strings; duplicates removed * \param[out] pdahash [optional] dnahash used for lookup * \return 0 if OK, 1 on error * *

 * Notes:
 *      (1) Generates a sarray with unique values.
 *      (2) The dnahash is built up with sad to assure uniqueness.
 *          It can be used to find if a string is in the set:
 *              sarrayFindValByHash(sad, dahash, str, &index)
 *      (3) The hash of the string location is simple and fast.  It scales
 *          up with the number of buckets to insure a fairly random
 *          bucket selection input strings.
 *      (4) This is faster than sarrayRemoveDupsByAset(), because the
 *          bucket lookup is O(n), although there is a double-loop
 *          lookup within the dna in each bucket.
 *

*/ l_ok sarrayRemoveDupsByHash(SARRAY *sas, SARRAY **psad, L_DNAHASH **pdahash) { char *str; l_int32 i, n, index, items; l_uint32 nsize; l_uint64 key; SARRAY *sad; L_DNAHASH *dahash; PROCNAME("sarrayRemoveDupsByHash"); if (pdahash) *pdahash = NULL; if (!psad) return ERROR_INT("&sad not defined", procName, 1); *psad = NULL; if (!sas) return ERROR_INT("sas not defined", procName, 1); n = sarrayGetCount(sas); findNextLargerPrime(n / 20, &nsize); /* buckets in hash table */ dahash = l_dnaHashCreate(nsize, 8); sad = sarrayCreate(n); *psad = sad; for (i = 0, items = 0; i < n; i++) { str = sarrayGetString(sas, i, L_NOCOPY); sarrayFindStringByHash(sad, dahash, str, &index); if (index < 0) { /* not found */ l_hashStringToUint64(str, &key); l_dnaHashAdd(dahash, key, (l_float64)items); sarrayAddString(sad, str, L_COPY); items++; } } if (pdahash) *pdahash = dahash; else l_dnaHashDestroy(&dahash); return 0; } /*! * \brief sarrayIntersectionByHash() * * \param[in] sa1, sa2 * \return sad intersection of the strings, or NULL on error * *

 * Notes:
 *      (1) This is faster than sarrayIntersectionByAset(), because the
 *          bucket lookup is O(n).
 *

*/ SARRAY * sarrayIntersectionByHash(SARRAY *sa1, SARRAY *sa2) { char *str; l_int32 n1, n2, nsmall, i, index1, index2; l_uint32 nsize2; l_uint64 key; L_DNAHASH *dahash1, *dahash2; SARRAY *sa_small, *sa_big, *sad; PROCNAME("sarrayIntersectionByHash"); if (!sa1) return (SARRAY *)ERROR_PTR("sa1 not defined", procName, NULL); if (!sa2) return (SARRAY *)ERROR_PTR("sa2 not defined", procName, NULL); /* Put the elements of the biggest sarray into a dnahash */ n1 = sarrayGetCount(sa1); n2 = sarrayGetCount(sa2); sa_small = (n1 < n2) ? sa1 : sa2; /* do not destroy sa_small */ sa_big = (n1 < n2) ? sa2 : sa1; /* do not destroy sa_big */ dahash1 = l_dnaHashCreateFromSarray(sa_big); /* Build up the intersection of strings. Add to %sad * if the string is in sa_big (using dahash1) but hasn't * yet been seen in the traversal of sa_small (using dahash2). */ sad = sarrayCreate(0); nsmall = sarrayGetCount(sa_small); findNextLargerPrime(nsmall / 20, &nsize2); /* buckets in hash table */ dahash2 = l_dnaHashCreate(nsize2, 0); for (i = 0; i < nsmall; i++) { str = sarrayGetString(sa_small, i, L_NOCOPY); sarrayFindStringByHash(sa_big, dahash1, str, &index1); if (index1 >= 0) { sarrayFindStringByHash(sa_small, dahash2, str, &index2); if (index2 == -1) { sarrayAddString(sad, str, L_COPY); l_hashStringToUint64(str, &key); l_dnaHashAdd(dahash2, key, (l_float64)i); } } } l_dnaHashDestroy(&dahash1); l_dnaHashDestroy(&dahash2); return sad; } /*! * \brief sarrayFindStringByHash() * * \param[in] sa * \param[in] dahash built from sa * \param[in] str arbitrary string * \param[out] pindex index into %sa if %str is in %sa; -1 otherwise * \return 0 if OK, 1 on error * *

 * Notes:
 *      (1) Fast lookup in dnaHash associated with a sarray, to see if a
 *          random string %str is already stored in the hash table.
 *      (2) We use a strong hash function to minimize the chance that
 *          two different strings hash to the same key value.
 *      (3) We select the number of buckets to be about 5% of the size
 *          of the input sarray, so that when fully populated, each
 *          bucket (dna) will have about 20 entries, each being an index
 *          into sa.  In lookup, after hashing to the key, and then
 *          again to the bucket, we traverse the bucket (dna), using the
 *          index into sa to check if %str has been found before.
 *

*/ l_ok sarrayFindStringByHash(SARRAY *sa, L_DNAHASH *dahash, const char *str, l_int32 *pindex) { char *stri; l_int32 i, nvals, index; l_uint64 key; L_DNA *da; PROCNAME("sarrayFindStringByHash"); if (!pindex) return ERROR_INT("&index not defined", procName, 1); *pindex = -1; if (!sa) return ERROR_INT("sa not defined", procName, 1); if (!dahash) return ERROR_INT("dahash not defined", procName, 1); l_hashStringToUint64(str, &key); da = l_dnaHashGetDna(dahash, key, L_NOCOPY); if (!da) return 0; /* Run through the da, looking for this string */ nvals = l_dnaGetCount(da); for (i = 0; i < nvals; i++) { l_dnaGetIValue(da, i, &index); stri = sarrayGetString(sa, index, L_NOCOPY); if (!strcmp(str, stri)) { /* duplicate */ *pindex = index; return 0; } } return 0; } /*! * \brief l_dnaHashCreateFromSarray() * * \param[in] sa * \return dahash, or NULL on error */ L_DNAHASH * l_dnaHashCreateFromSarray(SARRAY *sa) { char *str; l_int32 i, n; l_uint32 nsize; l_uint64 key; L_DNAHASH *dahash; /* Build up dnaHash of indices, hashed by a 64-bit key that * should randomize the lower bits used in bucket selection. * Having about 20 pts in each bucket is roughly optimal. */ n = sarrayGetCount(sa); findNextLargerPrime(n / 20, &nsize); /* buckets in hash table */ /* lept_stderr("Prime used: %d\n", nsize); */ /* Add each string, using the hash as key and the index into %sa * as the value. Storing the index enables operations that check * for duplicates. */ dahash = l_dnaHashCreate(nsize, 8); for (i = 0; i < n; i++) { str = sarrayGetString(sa, i, L_NOCOPY); l_hashStringToUint64(str, &key); l_dnaHashAdd(dahash, key, (l_float64)i); } return dahash; } /*----------------------------------------------------------------------* * Miscellaneous operations * *----------------------------------------------------------------------*/ /*! * \brief sarrayGenerateIntegers() * * \param[in] n * \return sa of printed numbers, 1 - n, or NULL on error */ SARRAY * sarrayGenerateIntegers(l_int32 n) { char buf[32]; l_int32 i; SARRAY *sa; PROCNAME("sarrayGenerateIntegers"); if ((sa = sarrayCreate(n)) == NULL) return (SARRAY *)ERROR_PTR("sa not made", procName, NULL); for (i = 0; i < n; i++) { snprintf(buf, sizeof(buf), "%d", i); sarrayAddString(sa, buf, L_COPY); } return sa; } /*! * \brief sarrayLookupCSKV() * * \param[in] sa of strings, each being a comma-separated pair * of strings, the first being a key and the * second a value * \param[in] keystring an input string to match with each key in %sa * \param[out] pvalstring the returned value string corresponding to the * input key string, if found; otherwise NULL * \return 0 if OK, 1 on error * *

 * Notes:
 *      (1) The input %sa can have other strings that are not in
 *          comma-separated key-value format.  These will be ignored.
 *      (2) This returns a copy of the first value string in %sa whose
 *          key string matches the input %keystring.
 *      (3) White space is not ignored; all white space before the ','
 *          is used for the keystring in matching.  This allows the
 *          key and val strings to have white space (e.g., multiple words).
 *

*/ l_ok sarrayLookupCSKV(SARRAY *sa, const char *keystring, char **pvalstring) { char *key, *val, *str; l_int32 i, n; SARRAY *sa1; PROCNAME("sarrayLookupCSKV"); if (!pvalstring) return ERROR_INT("&valstring not defined", procName, 1); *pvalstring = NULL; if (!sa) return ERROR_INT("sa not defined", procName, 1); if (!keystring) return ERROR_INT("keystring not defined", procName, 1); n = sarrayGetCount(sa); for (i = 0; i < n; i++) { str = sarrayGetString(sa, i, L_NOCOPY); sa1 = sarrayCreate(2); sarraySplitString(sa1, str, ","); if (sarrayGetCount(sa1) != 2) { sarrayDestroy(&sa1); continue; } key = sarrayGetString(sa1, 0, L_NOCOPY); val = sarrayGetString(sa1, 1, L_NOCOPY); if (!strcmp(key, keystring)) { *pvalstring = stringNew(val); sarrayDestroy(&sa1); return 0; } sarrayDestroy(&sa1); } return 0; }