/*====================================================================* - Copyright (C) 2001 Leptonica. All rights reserved. - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions - are met: - 1. Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - 2. Redistributions in binary form must reproduce the above - copyright notice, this list of conditions and the following - disclaimer in the documentation and/or other materials - provided with the distribution. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL ANY - CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, - EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, - PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR - PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY - OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING - NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *====================================================================*/ /*! * \file ptafunc2.c *
* * -------------------------------------- * This file has these Pta utilities: * - sorting * - ordered set operations * - hash map operations * -------------------------------------- * * Sorting * PTA *ptaSort() * l_int32 ptaGetSortIndex() * PTA *ptaSortByIndex() * PTAA *ptaaSortByIndex() * l_int32 ptaGetRankValue() * PTA *ptaSort2d() * l_int32 ptaEqual() * * Set operations using aset (rbtree) * PTA *ptaUnionByAset() * PTA *ptaRemoveDupsByAset() * PTA *ptaIntersectionByAset() * L_ASET *l_asetCreateFromPta() * * Set operations using hashing (dnahash) * PTA *ptaUnionByHash() * l_int32 ptaRemoveDupsByHash() * PTA *ptaIntersectionByHash(); * l_int32 ptaFindPtByHash() * L_DNAHASH *l_dnaHashCreateFromPta() * * * We have two implementations of set operations on an array of points: * * (1) Using an underlying tree (rbtree) * This uses a good 64 bit hashing function for the key, * that is not expected to have hash collisions (and we do * not test for them). The tree is built up of the hash * values, and if the hash is found in the tree, it is * assumed that the point has already been found. * * (2) Using an underlying hashing of the keys (dnahash) * This uses a fast 64 bit hashing function for the key, * which is then hashed into a bucket (a dna in a dnaHash). * Because hash collisions can occur, the index into the * pta for the point that gave rise to that key is stored, * and the dna (bucket) is traversed, using the stored indices * to determine if that point had already been seen. * **/ #ifdef HAVE_CONFIG_H #include
* Notes: * (1) Sort increasing by row-major, scanning down from the UL corner, * where for each value of y, order the pts from left to right. **/ PTA * ptaSort2d(PTA *pta) { l_int32 index, i, j, n, nx, ny, start, end; l_float32 x, y, yp, val; NUMA *na1, *na2, *nas, *nax; PTA *pta1, *ptad; PROCNAME("ptaSort2d"); if (!pta) return (PTA *)ERROR_PTR("pta not defined", procName, NULL); /* Sort by row-major (y first, then x). After sort by y, * the x values at the same y are not sorted. */ pta1 = ptaSort(pta, L_SORT_BY_Y, L_SORT_INCREASING, NULL); /* Find start and ending indices with the same y value */ n = ptaGetCount(pta1); na1 = numaCreate(0); /* holds start index of sequence with same y */ na2 = numaCreate(0); /* holds end index of sequence with same y */ numaAddNumber(na1, 0); ptaGetPt(pta1, 0, &x, &yp); for (i = 1; i < n; i++) { ptaGetPt(pta1, i, &x, &y); if (y != yp) { numaAddNumber(na1, i); numaAddNumber(na2, i - 1); } yp = y; } numaAddNumber(na2, n - 1); /* Sort by increasing x each set with the same y value */ ptad = ptaCreate(n); ny = numaGetCount(na1); /* number of distinct y values */ for (i = 0, index = 0; i < ny; i++) { numaGetIValue(na1, i, &start); numaGetIValue(na2, i, &end); nx = end - start + 1; /* number of points with current y value */ if (nx == 1) { ptaGetPt(pta1, index++, &x, &y); ptaAddPt(ptad, x, y); } else { /* More than 1 point; extract and sort the x values */ nax = numaCreate(nx); for (j = 0; j < nx; j++) { ptaGetPt(pta1, index + j, &x, &y); numaAddNumber(nax, x); } nas = numaSort(NULL, nax, L_SORT_INCREASING); /* Add the points with x sorted */ for (j = 0; j < nx; j++) { numaGetFValue(nas, j, &val); ptaAddPt(ptad, val, y); } index += nx; numaDestroy(&nax); numaDestroy(&nas); } } numaDestroy(&na1); numaDestroy(&na2); ptaDestroy(&pta1); return ptad; } /*! * \brief ptaEqual() * * \param[in] pta1 * \param[in] pta2 * \param[out] psame 1 if same; 0 if different * \return 0 if OK; 1 on error * *
* Notes: * (1) Equality is defined as having the same set of points, * independent of the order in which they are presented. **/ l_ok ptaEqual(PTA *pta1, PTA *pta2, l_int32 *psame) { l_int32 i, n1, n2; l_float32 x1, y1, x2, y2; PTA *ptas1, *ptas2; PROCNAME("ptaEqual"); if (!psame) return ERROR_INT("&same not defined", procName, 1); *psame = 0.0; if (!pta1 || !pta2) return ERROR_INT("pta1 and pta2 not both defined", procName, 1); n1 = ptaGetCount(pta1); n2 = ptaGetCount(pta2); if (n1 != n2) return 0; /* 2d sort each and compare */ ptas1 = ptaSort2d(pta1); ptas2 = ptaSort2d(pta2); for (i = 0; i < n1; i++) { ptaGetPt(ptas1, i, &x1, &y1); ptaGetPt(ptas2, i, &x2, &y2); if (x1 != x2 || y1 != y2) { ptaDestroy(&ptas1); ptaDestroy(&ptas2); return 0; } } *psame = 1; ptaDestroy(&ptas1); ptaDestroy(&ptas2); return 0; } /*---------------------------------------------------------------------* * Set operations using aset (rbtree) * *---------------------------------------------------------------------*/ /*! * \brief ptaUnionByAset() * * \param[in] pta1, pta2 * \return ptad with the union of the set of points, or NULL on error * *
* Notes: * (1) See sarrayRemoveDupsByAset() for the approach. * (2) The key is a 64-bit hash from the (x,y) pair. * (3) This is slower than ptaUnionByHash(), mostly because of the * nlogn sort to build up the rbtree. Do not use for large * numbers of points (say, > 1M). * (4) The *Aset() functions use the sorted l_Aset, which is just * an rbtree in disguise. **/ PTA * ptaUnionByAset(PTA *pta1, PTA *pta2) { PTA *pta3, *ptad; PROCNAME("ptaUnionByAset"); if (!pta1) return (PTA *)ERROR_PTR("pta1 not defined", procName, NULL); if (!pta2) return (PTA *)ERROR_PTR("pta2 not defined", procName, NULL); /* Join */ pta3 = ptaCopy(pta1); ptaJoin(pta3, pta2, 0, -1); /* Eliminate duplicates */ ptad = ptaRemoveDupsByAset(pta3); ptaDestroy(&pta3); return ptad; } /*! * \brief ptaRemoveDupsByAset() * * \param[in] ptas assumed to be integer values * \return ptad with duplicates removed, or NULL on error * *
* Notes: * (1) This is slower than ptaRemoveDupsByHash(), mostly because * of the nlogn sort to build up the rbtree. Do not use for * large numbers of points (say, > 1M). **/ PTA * ptaRemoveDupsByAset(PTA *ptas) { l_int32 i, n, x, y; PTA *ptad; l_uint64 hash; L_ASET *set; RB_TYPE key; PROCNAME("ptaRemoveDupsByAset"); if (!ptas) return (PTA *)ERROR_PTR("ptas not defined", procName, NULL); set = l_asetCreate(L_UINT_TYPE); n = ptaGetCount(ptas); ptad = ptaCreate(n); for (i = 0; i < n; i++) { ptaGetIPt(ptas, i, &x, &y); l_hashPtToUint64(x, y, &hash); key.utype = hash; if (!l_asetFind(set, key)) { ptaAddPt(ptad, x, y); l_asetInsert(set, key); } } l_asetDestroy(&set); return ptad; } /*! * \brief ptaIntersectionByAset() * * \param[in] pta1, pta2 * \return ptad intersection of the point sets, or NULL on error * *
* Notes: * (1) See sarrayIntersectionByAset() for the approach. * (2) The key is a 64-bit hash from the (x,y) pair. * (3) This is slower than ptaIntersectionByHash(), mostly because * of the nlogn sort to build up the rbtree. Do not use for * large numbers of points (say, > 1M). **/ PTA * ptaIntersectionByAset(PTA *pta1, PTA *pta2) { l_int32 n1, n2, i, n, x, y; l_uint64 hash; L_ASET *set1, *set2; RB_TYPE key; PTA *pta_small, *pta_big, *ptad; PROCNAME("ptaIntersectionByAset"); if (!pta1) return (PTA *)ERROR_PTR("pta1 not defined", procName, NULL); if (!pta2) return (PTA *)ERROR_PTR("pta2 not defined", procName, NULL); /* Put the elements of the biggest array into a set */ n1 = ptaGetCount(pta1); n2 = ptaGetCount(pta2); pta_small = (n1 < n2) ? pta1 : pta2; /* do not destroy pta_small */ pta_big = (n1 < n2) ? pta2 : pta1; /* do not destroy pta_big */ set1 = l_asetCreateFromPta(pta_big); /* Build up the intersection of points */ ptad = ptaCreate(0); n = ptaGetCount(pta_small); set2 = l_asetCreate(L_UINT_TYPE); for (i = 0; i < n; i++) { ptaGetIPt(pta_small, i, &x, &y); l_hashPtToUint64(x, y, &hash); key.utype = hash; if (l_asetFind(set1, key) && !l_asetFind(set2, key)) { ptaAddPt(ptad, x, y); l_asetInsert(set2, key); } } l_asetDestroy(&set1); l_asetDestroy(&set2); return ptad; } /*! * \brief l_asetCreateFromPta() * * \param[in] pta * \return set using a 64-bit hash of (x,y) as the key */ L_ASET * l_asetCreateFromPta(PTA *pta) { l_int32 i, n, x, y; l_uint64 hash; L_ASET *set; RB_TYPE key; PROCNAME("l_asetCreateFromPta"); if (!pta) return (L_ASET *)ERROR_PTR("pta not defined", procName, NULL); set = l_asetCreate(L_UINT_TYPE); n = ptaGetCount(pta); for (i = 0; i < n; i++) { ptaGetIPt(pta, i, &x, &y); l_hashPtToUint64(x, y, &hash); key.utype = hash; l_asetInsert(set, key); } return set; } /*---------------------------------------------------------------------* * Set operations using hashing (rbtree) * *---------------------------------------------------------------------*/ /*! * \brief ptaUnionByHash() * * \param[in] pta1, pta2 * \return ptad with the union of the set of points, or NULL on error * *
* Notes: * (1) This is faster than ptaUnionByAset(), because the * bucket lookup is O(n). It should be used if the pts are * integers (e.g., representing pixel positions). **/ PTA * ptaUnionByHash(PTA *pta1, PTA *pta2) { PTA *pta3, *ptad; PROCNAME("ptaUnionByHash"); if (!pta1) return (PTA *)ERROR_PTR("pta1 not defined", procName, NULL); if (!pta2) return (PTA *)ERROR_PTR("pta2 not defined", procName, NULL); /* Join */ pta3 = ptaCopy(pta1); ptaJoin(pta3, pta2, 0, -1); /* Eliminate duplicates */ ptaRemoveDupsByHash(pta3, &ptad, NULL); ptaDestroy(&pta3); return ptad; } /*! * \brief ptaRemoveDupsByHash() * * \param[in] ptas assumed to be integer values * \param[out] pptad unique set of pts; duplicates removed * \param[out] pdahash [optional] dnahash used for lookup * \return 0 if OK, 1 on error * *
* Notes: * (1) Generates a pta with unique values. * (2) The dnahash is built up with ptad to assure uniqueness. * It can be used to find if a point is in the set: * ptaFindPtByHash(ptad, dahash, x, y, &index) * (3) The hash of the (x,y) location is simple and fast. It scales * up with the number of buckets to insure a fairly random * bucket selection for adjacent points. * (4) A Dna is used rather than a Numa because we need accurate * representation of 32-bit integers that are indices into ptas. * Integer --> float --> integer conversion makes errors for * integers larger than 10M. * (5) This is faster than ptaRemoveDupsByAset(), because the * bucket lookup is O(n), although there is a double-loop * lookup within the dna in each bucket. **/ l_ok ptaRemoveDupsByHash(PTA *ptas, PTA **pptad, L_DNAHASH **pdahash) { l_int32 i, n, index, items, x, y; l_uint32 nsize; l_uint64 key; PTA *ptad; L_DNAHASH *dahash; PROCNAME("ptaRemoveDupsByHash"); if (pdahash) *pdahash = NULL; if (!pptad) return ERROR_INT("&ptad not defined", procName, 1); *pptad = NULL; if (!ptas) return ERROR_INT("ptas not defined", procName, 1); n = ptaGetCount(ptas); findNextLargerPrime(n / 20, &nsize); /* buckets in hash table */ dahash = l_dnaHashCreate(nsize, 8); ptad = ptaCreate(n); *pptad = ptad; for (i = 0, items = 0; i < n; i++) { ptaGetIPt(ptas, i, &x, &y); ptaFindPtByHash(ptad, dahash, x, y, &index); if (index < 0) { /* not found */ l_hashPtToUint64(x, y, &key); l_dnaHashAdd(dahash, key, (l_float64)items); ptaAddPt(ptad, x, y); items++; } } if (pdahash) *pdahash = dahash; else l_dnaHashDestroy(&dahash); return 0; } /*! * \brief ptaIntersectionByHash() * * \param[in] pta1, pta2 * \return ptad intersection of the point sets, or NULL on error * *
* Notes: * (1) This is faster than ptaIntersectionByAset(), because the * bucket lookup is O(n). It should be used if the pts are * integers (e.g., representing pixel positions). **/ PTA * ptaIntersectionByHash(PTA *pta1, PTA *pta2) { l_int32 n1, n2, nsmall, i, x, y, index1, index2; l_uint32 nsize2; l_uint64 key; L_DNAHASH *dahash1, *dahash2; PTA *pta_small, *pta_big, *ptad; PROCNAME("ptaIntersectionByHash"); if (!pta1) return (PTA *)ERROR_PTR("pta1 not defined", procName, NULL); if (!pta2) return (PTA *)ERROR_PTR("pta2 not defined", procName, NULL); /* Put the elements of the biggest pta into a dnahash */ n1 = ptaGetCount(pta1); n2 = ptaGetCount(pta2); pta_small = (n1 < n2) ? pta1 : pta2; /* do not destroy pta_small */ pta_big = (n1 < n2) ? pta2 : pta1; /* do not destroy pta_big */ dahash1 = l_dnaHashCreateFromPta(pta_big); /* Build up the intersection of points. Add to ptad * if the point is in pta_big (using dahash1) but hasn't * yet been seen in the traversal of pta_small (using dahash2). */ ptad = ptaCreate(0); nsmall = ptaGetCount(pta_small); findNextLargerPrime(nsmall / 20, &nsize2); /* buckets in hash table */ dahash2 = l_dnaHashCreate(nsize2, 0); for (i = 0; i < nsmall; i++) { ptaGetIPt(pta_small, i, &x, &y); ptaFindPtByHash(pta_big, dahash1, x, y, &index1); if (index1 >= 0) { /* found */ ptaFindPtByHash(pta_small, dahash2, x, y, &index2); if (index2 == -1) { /* not found */ ptaAddPt(ptad, x, y); l_hashPtToUint64(x, y, &key); l_dnaHashAdd(dahash2, key, (l_float64)i); } } } l_dnaHashDestroy(&dahash1); l_dnaHashDestroy(&dahash2); return ptad; } /*! * \brief ptaFindPtByHash() * * \param[in] pta * \param[in] dahash built from pta * \param[in] x, y arbitrary points * \param[out] pindex index into pta if (x,y) is in pta; -1 otherwise * \return 0 if OK, 1 on error * *
* Notes: * (1) Fast lookup in dnaHash associated with a pta, to see if a * random point (x,y) is already stored in the hash table. * (2) We use a strong hash function to minimize the chance that * two different points hash to the same key value. * (3) We select the number of buckets to be about 5% of the size * of the input %pta, so that when fully populated, each * bucket (dna) will have about 20 entries, each being an index * into %pta. In lookup, after hashing to the key, and then * again to the bucket, we traverse the bucket (dna), using the * index into %pta to check if the point (x,y) has been found before. **/ l_ok ptaFindPtByHash(PTA *pta, L_DNAHASH *dahash, l_int32 x, l_int32 y, l_int32 *pindex) { l_int32 i, nvals, index, xi, yi; l_uint64 key; L_DNA *da; PROCNAME("ptaFindPtByHash"); if (!pindex) return ERROR_INT("&index not defined", procName, 1); *pindex = -1; if (!pta) return ERROR_INT("pta not defined", procName, 1); if (!dahash) return ERROR_INT("dahash not defined", procName, 1); l_hashPtToUint64(x, y, &key); da = l_dnaHashGetDna(dahash, key, L_NOCOPY); if (!da) return 0; /* Run through the da, looking for this point */ nvals = l_dnaGetCount(da); for (i = 0; i < nvals; i++) { l_dnaGetIValue(da, i, &index); ptaGetIPt(pta, index, &xi, &yi); if (x == xi && y == yi) { *pindex = index; return 0; } } return 0; } /*! * \brief l_dnaHashCreateFromPta() * * \param[in] pta * \return dahash, or NULL on error */ L_DNAHASH * l_dnaHashCreateFromPta(PTA *pta) { l_int32 i, n, x, y; l_uint32 nsize; l_uint64 key; L_DNAHASH *dahash; PROCNAME("l_dnaHashCreateFromPta"); if (!pta) return (L_DNAHASH *)ERROR_PTR("pta not defined", procName, NULL); /* Build up dnaHash of indices, hashed by a key that is * a large linear combination of x and y values designed to * randomize the key. Having about 20 pts in each bucket is * roughly optimal for speed for large sets. */ n = ptaGetCount(pta); findNextLargerPrime(n / 20, &nsize); /* buckets in hash table */ /* lept_stderr("Prime used: %d\n", nsize); */ /* Add each point, using the hash as key and the index into * %ptas as the value. Storing the index enables operations * that check for duplicates. */ dahash = l_dnaHashCreate(nsize, 8); for (i = 0; i < n; i++) { ptaGetIPt(pta, i, &x, &y); l_hashPtToUint64(x, y, &key); l_dnaHashAdd(dahash, key, (l_float64)i); } return dahash; }