/*====================================================================* - Copyright (C) 2001 Leptonica. All rights reserved. - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions - are met: - 1. Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - 2. Redistributions in binary form must reproduce the above - copyright notice, this list of conditions and the following - disclaimer in the documentation and/or other materials - provided with the distribution. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL ANY - CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, - EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, - PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR - PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY - OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING - NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *====================================================================*/ /*! * \file pageseg.c *
 *
 *      Top level page segmentation
 *          l_int32   pixGetRegionsBinary()
 *
 *      Halftone region extraction
 *          PIX      *pixGenHalftoneMask()    **Deprecated wrapper**
 *          PIX      *pixGenerateHalftoneMask()

 *
 *      Textline extraction
 *          PIX      *pixGenTextlineMask()
 *
 *      Textblock extraction
 *          PIX      *pixGenTextblockMask()
 *
 *      Location of page foreground
 *          PIX      *pixFindPageForeground()
 *
 *      Extraction of characters from image with only text
 *          l_int32   pixSplitIntoCharacters()
 *          BOXA     *pixSplitComponentWithProfile()
 *
 *      Extraction of lines of text
 *          PIXA     *pixExtractTextlines()
 *          PIXA     *pixExtractRawTextlines()
 *
 *      How many text columns
 *          l_int32   pixCountTextColumns()
 *
 *      Decision: text vs photo
 *          l_int32   pixDecideIfText()
 *          l_int32   pixFindThreshFgExtent()
 *
 *      Decision: table vs text
 *          l_int32   pixDecideIfTable()
 *          Pix      *pixPrepare1bpp()
 *
 *      Estimate the grayscale background value
 *          l_int32   pixEstimateBackground()
 *
 *      Largest white or black rectangles in an image
 *          l_int32   pixFindLargeRectangles()
 *          l_int32   pixFindLargestRectangle()
 *
 *      Generate rectangle inside connected component
 *          BOX      *pixFindRectangleInCC()
 *
 *      Automatic photoinvert for OCR
 *          PIX      *pixAutoPhotoinvert()
 * 
*/ #ifdef HAVE_CONFIG_H #include #endif /* HAVE_CONFIG_H */ #include "allheaders.h" #include "math.h" /* These functions are not intended to work on very low-res images */ static const l_int32 MinWidth = 100; static const l_int32 MinHeight = 100; /*------------------------------------------------------------------* * Top level page segmentation * *------------------------------------------------------------------*/ /*! * \brief pixGetRegionsBinary() * * \param[in] pixs 1 bpp, assumed to be 300 to 400 ppi * \param[out] ppixhm [optional] halftone mask * \param[out] ppixtm [optional] textline mask * \param[out] ppixtb [optional] textblock mask * \param[in] pixadb input for collecting debug pix; use NULL to skip * \return 0 if OK, 1 on error * *
 * Notes:
 *      (1) It is best to deskew the image before segmenting.
 *      (2) Passing in %pixadb enables debug output.
 * 
*/ l_ok pixGetRegionsBinary(PIX *pixs, PIX **ppixhm, PIX **ppixtm, PIX **ppixtb, PIXA *pixadb) { l_int32 w, h, htfound, tlfound; PIX *pixr, *pix1, *pix2; PIX *pixtext; /* text pixels only */ PIX *pixhm2; /* halftone mask; 2x reduction */ PIX *pixhm; /* halftone mask; */ PIX *pixtm2; /* textline mask; 2x reduction */ PIX *pixtm; /* textline mask */ PIX *pixvws; /* vertical white space mask */ PIX *pixtb2; /* textblock mask; 2x reduction */ PIX *pixtbf2; /* textblock mask; 2x reduction; small comps filtered */ PIX *pixtb; /* textblock mask */ PROCNAME("pixGetRegionsBinary"); if (ppixhm) *ppixhm = NULL; if (ppixtm) *ppixtm = NULL; if (ppixtb) *ppixtb = NULL; if (!pixs || pixGetDepth(pixs) != 1) return ERROR_INT("pixs undefined or not 1 bpp", procName, 1); pixGetDimensions(pixs, &w, &h, NULL); if (w < MinWidth || h < MinHeight) { L_ERROR("pix too small: w = %d, h = %d\n", procName, w, h); return 1; } /* 2x reduce, to 150 -200 ppi */ pixr = pixReduceRankBinaryCascade(pixs, 1, 0, 0, 0); if (pixadb) pixaAddPix(pixadb, pixr, L_COPY); /* Get the halftone mask */ pixhm2 = pixGenerateHalftoneMask(pixr, &pixtext, &htfound, pixadb); /* Get the textline mask from the text pixels */ pixtm2 = pixGenTextlineMask(pixtext, &pixvws, &tlfound, pixadb); /* Get the textblock mask from the textline mask */ pixtb2 = pixGenTextblockMask(pixtm2, pixvws, pixadb); pixDestroy(&pixr); pixDestroy(&pixtext); pixDestroy(&pixvws); /* Remove small components from the mask, where a small * component is defined as one with both width and height < 60 */ pixtbf2 = NULL; if (pixtb2) { pixtbf2 = pixSelectBySize(pixtb2, 60, 60, 4, L_SELECT_IF_EITHER, L_SELECT_IF_GTE, NULL); pixDestroy(&pixtb2); if (pixadb) pixaAddPix(pixadb, pixtbf2, L_COPY); } /* Expand all masks to full resolution, and do filling or * small dilations for better coverage. */ pixhm = pixExpandReplicate(pixhm2, 2); pix1 = pixSeedfillBinary(NULL, pixhm, pixs, 8); pixOr(pixhm, pixhm, pix1); pixDestroy(&pixhm2); pixDestroy(&pix1); if (pixadb) pixaAddPix(pixadb, pixhm, L_COPY); pix1 = pixExpandReplicate(pixtm2, 2); pixtm = pixDilateBrick(NULL, pix1, 3, 3); pixDestroy(&pixtm2); pixDestroy(&pix1); if (pixadb) pixaAddPix(pixadb, pixtm, L_COPY); if (pixtbf2) { pix1 = pixExpandReplicate(pixtbf2, 2); pixtb = pixDilateBrick(NULL, pix1, 3, 3); pixDestroy(&pixtbf2); pixDestroy(&pix1); if (pixadb) pixaAddPix(pixadb, pixtb, L_COPY); } else { pixtb = pixCreateTemplate(pixs); /* empty mask */ } /* Debug: identify objects that are neither text nor halftone image */ if (pixadb) { pix1 = pixSubtract(NULL, pixs, pixtm); /* remove text pixels */ pix2 = pixSubtract(NULL, pix1, pixhm); /* remove halftone pixels */ pixaAddPix(pixadb, pix2, L_INSERT); pixDestroy(&pix1); } /* Debug: display textline components with random colors */ if (pixadb) { l_int32 w, h; BOXA *boxa; PIXA *pixa; boxa = pixConnComp(pixtm, &pixa, 8); pixGetDimensions(pixtm, &w, &h, NULL); pix1 = pixaDisplayRandomCmap(pixa, w, h); pixcmapResetColor(pixGetColormap(pix1), 0, 255, 255, 255); pixaAddPix(pixadb, pix1, L_INSERT); pixaDestroy(&pixa); boxaDestroy(&boxa); } /* Debug: identify the outlines of each textblock */ if (pixadb) { PIXCMAP *cmap; PTAA *ptaa; ptaa = pixGetOuterBordersPtaa(pixtb); lept_mkdir("lept/pageseg"); ptaaWriteDebug("/tmp/lept/pageseg/tb_outlines.ptaa", ptaa, 1); pix1 = pixRenderRandomCmapPtaa(pixtb, ptaa, 1, 16, 1); cmap = pixGetColormap(pix1); pixcmapResetColor(cmap, 0, 130, 130, 130); pixaAddPix(pixadb, pix1, L_INSERT); ptaaDestroy(&ptaa); } /* Debug: get b.b. for all mask components */ if (pixadb) { BOXA *bahm, *batm, *batb; bahm = pixConnComp(pixhm, NULL, 4); batm = pixConnComp(pixtm, NULL, 4); batb = pixConnComp(pixtb, NULL, 4); boxaWriteDebug("/tmp/lept/pageseg/htmask.boxa", bahm); boxaWriteDebug("/tmp/lept/pageseg/textmask.boxa", batm); boxaWriteDebug("/tmp/lept/pageseg/textblock.boxa", batb); boxaDestroy(&bahm); boxaDestroy(&batm); boxaDestroy(&batb); } if (pixadb) { pixaConvertToPdf(pixadb, 0, 1.0, 0, 0, "Debug page segmentation", "/tmp/lept/pageseg/debug.pdf"); L_INFO("Writing debug pdf to /tmp/lept/pageseg/debug.pdf\n", procName); } if (ppixhm) *ppixhm = pixhm; else pixDestroy(&pixhm); if (ppixtm) *ppixtm = pixtm; else pixDestroy(&pixtm); if (ppixtb) *ppixtb = pixtb; else pixDestroy(&pixtb); return 0; } /*------------------------------------------------------------------* * Halftone region extraction * *------------------------------------------------------------------*/ /*! * \brief pixGenHalftoneMask() * *
 * Deprecated:
 *   This wrapper avoids an ABI change with tesseract 3.0.4.
 *   It should be removed when we no longer need to support 3.0.4.
 *   The debug parameter is ignored (assumed 0).
 * 
*/ PIX * pixGenHalftoneMask(PIX *pixs, PIX **ppixtext, l_int32 *phtfound, l_int32 debug) { return pixGenerateHalftoneMask(pixs, ppixtext, phtfound, NULL); } /*! * \brief pixGenerateHalftoneMask() * * \param[in] pixs 1 bpp, assumed to be 150 to 200 ppi * \param[out] ppixtext [optional] text part of pixs * \param[out] phtfound [optional] 1 if the mask is not empty * \param[in] pixadb input for collecting debug pix; use NULL to skip * \return pixd halftone mask, or NULL on error * *
 * Notes:
 *      (1) This is not intended to work on small thumbnails.  The
 *          dimensions of pixs must be at least MinWidth x MinHeight.
 * 
*/ PIX * pixGenerateHalftoneMask(PIX *pixs, PIX **ppixtext, l_int32 *phtfound, PIXA *pixadb) { l_int32 w, h, empty; PIX *pix1, *pix2, *pixhs, *pixhm, *pixd; PROCNAME("pixGenerateHalftoneMask"); if (ppixtext) *ppixtext = NULL; if (phtfound) *phtfound = 0; if (!pixs || pixGetDepth(pixs) != 1) return (PIX *)ERROR_PTR("pixs undefined or not 1 bpp", procName, NULL); pixGetDimensions(pixs, &w, &h, NULL); if (w < MinWidth || h < MinHeight) { L_ERROR("pix too small: w = %d, h = %d\n", procName, w, h); return NULL; } /* Compute seed for halftone parts at 8x reduction */ pix1 = pixReduceRankBinaryCascade(pixs, 4, 4, 0, 0); pix2 = pixOpenBrick(NULL, pix1, 5, 5); pixhs = pixExpandReplicate(pix2, 4); /* back to 2x reduction */ pixDestroy(&pix1); pixDestroy(&pix2); if (pixadb) pixaAddPix(pixadb, pixhs, L_COPY); /* Compute mask for connected regions */ pixhm = pixCloseSafeBrick(NULL, pixs, 4, 4); if (pixadb) pixaAddPix(pixadb, pixhm, L_COPY); /* Fill seed into mask to get halftone mask */ pixd = pixSeedfillBinary(NULL, pixhs, pixhm, 4); if (pixadb) pixaAddPix(pixadb, pixd, L_COPY); #if 0 pixOpenBrick(pixd, pixd, 9, 9); #endif /* Check if mask is empty */ pixZero(pixd, &empty); if (phtfound && !empty) *phtfound = 1; /* Optionally, get all pixels that are not under the halftone mask */ if (ppixtext) { if (empty) *ppixtext = pixCopy(NULL, pixs); else *ppixtext = pixSubtract(NULL, pixs, pixd); if (pixadb) pixaAddPix(pixadb, *ppixtext, L_COPY); } pixDestroy(&pixhs); pixDestroy(&pixhm); return pixd; } /*------------------------------------------------------------------* * Textline extraction * *------------------------------------------------------------------*/ /*! * \brief pixGenTextlineMask() * * \param[in] pixs 1 bpp, assumed to be 150 to 200 ppi * \param[out] ppixvws vertical whitespace mask * \param[out] ptlfound [optional] 1 if the mask is not empty * \param[in] pixadb input for collecting debug pix; use NULL to skip * \return pixd textline mask, or NULL on error * *
 * Notes:
 *      (1) The input pixs should be deskewed.
 *      (2) pixs should have no halftone pixels.
 *      (3) This is not intended to work on small thumbnails.  The
 *          dimensions of pixs must be at least MinWidth x MinHeight.
 *      (4) Both the input image and the returned textline mask
 *          are at the same resolution.
 * 
*/ PIX * pixGenTextlineMask(PIX *pixs, PIX **ppixvws, l_int32 *ptlfound, PIXA *pixadb) { l_int32 w, h, empty; PIX *pix1, *pix2, *pixvws, *pixd; PROCNAME("pixGenTextlineMask"); if (ptlfound) *ptlfound = 0; if (!ppixvws) return (PIX *)ERROR_PTR("&pixvws not defined", procName, NULL); *ppixvws = NULL; if (!pixs || pixGetDepth(pixs) != 1) return (PIX *)ERROR_PTR("pixs undefined or not 1 bpp", procName, NULL); pixGetDimensions(pixs, &w, &h, NULL); if (w < MinWidth || h < MinHeight) { L_ERROR("pix too small: w = %d, h = %d\n", procName, w, h); return NULL; } /* First we need a vertical whitespace mask. Invert the image. */ pix1 = pixInvert(NULL, pixs); /* The whitespace mask will break textlines where there * is a large amount of white space below or above. * This can be prevented by identifying regions of the * inverted image that have large horizontal extent (bigger than * the separation between columns) and significant * vertical extent (bigger than the separation between * textlines), and subtracting this from the bg. */ pix2 = pixMorphCompSequence(pix1, "o80.60", 0); pixSubtract(pix1, pix1, pix2); if (pixadb) pixaAddPix(pixadb, pix1, L_COPY); pixDestroy(&pix2); /* Identify vertical whitespace by opening the remaining bg. * o5.1 removes thin vertical bg lines and o1.200 extracts * long vertical bg lines. */ pixvws = pixMorphCompSequence(pix1, "o5.1 + o1.200", 0); *ppixvws = pixvws; if (pixadb) pixaAddPix(pixadb, pixvws, L_COPY); pixDestroy(&pix1); /* Three steps to getting text line mask: * (1) close the characters and words in the textlines * (2) open the vertical whitespace corridors back up * (3) small opening to remove noise */ pix1 = pixMorphSequence(pixs, "c30.1", 0); if (pixadb) pixaAddPix(pixadb, pix1, L_COPY); pixd = pixSubtract(NULL, pix1, pixvws); pixOpenBrick(pixd, pixd, 3, 3); if (pixadb) pixaAddPix(pixadb, pixd, L_COPY); pixDestroy(&pix1); /* Check if text line mask is empty */ if (ptlfound) { pixZero(pixd, &empty); if (!empty) *ptlfound = 1; } return pixd; } /*------------------------------------------------------------------* * Textblock extraction * *------------------------------------------------------------------*/ /*! * \brief pixGenTextblockMask() * * \param[in] pixs 1 bpp, textline mask, assumed to be 150 to 200 ppi * \param[in] pixvws vertical white space mask * \param[in] pixadb input for collecting debug pix; use NULL to skip * \return pixd textblock mask, or NULL if empty or on error * *
 * Notes:
 *      (1) Both the input masks (textline and vertical white space) and
 *          the returned textblock mask are at the same resolution.
 *      (2) This is not intended to work on small thumbnails.  The
 *          dimensions of pixs must be at least MinWidth x MinHeight.
 *      (3) The result is somewhat noisy, in that small "blocks" of
 *          text may be included.  These can be removed by post-processing,
 *          using, e.g.,
 *             pixSelectBySize(pix, 60, 60, 4, L_SELECT_IF_EITHER,
 *                             L_SELECT_IF_GTE, NULL);
 * 
*/ PIX * pixGenTextblockMask(PIX *pixs, PIX *pixvws, PIXA *pixadb) { l_int32 w, h, empty; PIX *pix1, *pix2, *pix3, *pixd; PROCNAME("pixGenTextblockMask"); if (!pixs || pixGetDepth(pixs) != 1) return (PIX *)ERROR_PTR("pixs undefined or not 1 bpp", procName, NULL); pixGetDimensions(pixs, &w, &h, NULL); if (w < MinWidth || h < MinHeight) { L_ERROR("pix too small: w = %d, h = %d\n", procName, w, h); return NULL; } if (!pixvws) return (PIX *)ERROR_PTR("pixvws not defined", procName, NULL); /* Join pixels vertically to make a textblock mask */ pix1 = pixMorphSequence(pixs, "c1.10 + o4.1", 0); pixZero(pix1, &empty); if (empty) { pixDestroy(&pix1); L_INFO("no fg pixels in textblock mask\n", procName); return NULL; } if (pixadb) pixaAddPix(pixadb, pix1, L_COPY); /* Solidify the textblock mask and remove noise: * (1) For each cc, close the blocks and dilate slightly * to form a solid mask. * (2) Small horizontal closing between components. * (3) Open the white space between columns, again. * (4) Remove small components. */ pix2 = pixMorphSequenceByComponent(pix1, "c30.30 + d3.3", 8, 0, 0, NULL); pixCloseSafeBrick(pix2, pix2, 10, 1); if (pixadb) pixaAddPix(pixadb, pix2, L_COPY); pix3 = pixSubtract(NULL, pix2, pixvws); if (pixadb) pixaAddPix(pixadb, pix3, L_COPY); pixd = pixSelectBySize(pix3, 25, 5, 8, L_SELECT_IF_BOTH, L_SELECT_IF_GTE, NULL); if (pixadb) pixaAddPix(pixadb, pixd, L_COPY); pixDestroy(&pix1); pixDestroy(&pix2); pixDestroy(&pix3); return pixd; } /*------------------------------------------------------------------* * Location of page foreground * *------------------------------------------------------------------*/ /*! * \brief pixFindPageForeground() * * \param[in] pixs full resolution (any type or depth * \param[in] threshold for binarization; typically about 128 * \param[in] mindist min distance of text from border to allow * cleaning near border; at 2x reduction, this * should be larger than 50; typically about 70 * \param[in] erasedist when conditions are satisfied, erase anything * within this distance of the edge; * typically 20-30 at 2x reduction * \param[in] showmorph debug: set to a negative integer to show steps * in generating masks; this is typically used * for debugging region extraction * \param[in] pixac debug: allocate outside and pass this in to * accumulate results of each call to this function, * which can be displayed in a mosaic or a pdf. * \return box region including foreground, with some pixel noise * removed, or NULL if not found * *
 * Notes:
 *      (1) This doesn't simply crop to the fg.  It attempts to remove
 *          pixel noise and junk at the edge of the image before cropping.
 *          The input %threshold is used if pixs is not 1 bpp.
 *      (2) This is not intended to work on small thumbnails.  The
 *          dimensions of pixs must be at least MinWidth x MinHeight.
 *      (3) Debug: set showmorph to display the intermediate image in
 *          the morphological operations on this page.
 *      (4) Debug: to get pdf output of results when called repeatedly,
 *          call with an existing pixac, which will add an image of this page,
 *          with the fg outlined.  If no foreground is found, there is
 *          no output for this page image.
 * 
*/ BOX * pixFindPageForeground(PIX *pixs, l_int32 threshold, l_int32 mindist, l_int32 erasedist, l_int32 showmorph, PIXAC *pixac) { l_int32 flag, nbox, intersects; l_int32 w, h, bx, by, bw, bh, left, right, top, bottom; PIX *pixb, *pixb2, *pixseed, *pixsf, *pixm, *pix1, *pixg2; BOX *box, *boxfg, *boxin, *boxd; BOXA *ba1, *ba2; PROCNAME("pixFindPageForeground"); if (!pixs) return (BOX *)ERROR_PTR("pixs not defined", procName, NULL); pixGetDimensions(pixs, &w, &h, NULL); if (w < MinWidth || h < MinHeight) { L_ERROR("pix too small: w = %d, h = %d\n", procName, w, h); return NULL; } /* Binarize, downscale by 0.5, remove the noise to generate a seed, * and do a seedfill back from the seed into those 8-connected * components of the binarized image for which there was at least * one seed pixel. Also clear out any components that are within * 10 pixels of the edge at 2x reduction. */ flag = (showmorph) ? 100 : 0; pixb = pixConvertTo1(pixs, threshold); pixb2 = pixScale(pixb, 0.5, 0.5); pixseed = pixMorphSequence(pixb2, "o1.2 + c9.9 + o3.3", flag); pix1 = pixMorphSequence(pixb2, "o50.1", 0); pixOr(pixseed, pixseed, pix1); pixDestroy(&pix1); pix1 = pixMorphSequence(pixb2, "o1.50", 0); pixOr(pixseed, pixseed, pix1); pixDestroy(&pix1); pixsf = pixSeedfillBinary(NULL, pixseed, pixb2, 8); pixSetOrClearBorder(pixsf, 10, 10, 10, 10, PIX_SET); pixm = pixRemoveBorderConnComps(pixsf, 8); /* Now, where is the main block of text? We want to remove noise near * the edge of the image, but to do that, we have to be convinced that * (1) there is noise and (2) it is far enough from the text block * and close enough to the edge. For each edge, if the block * is more than mindist from that edge, then clean 'erasedist' * pixels from the edge. */ pix1 = pixMorphSequence(pixm, "c50.50", flag); ba1 = pixConnComp(pix1, NULL, 8); ba2 = boxaSort(ba1, L_SORT_BY_AREA, L_SORT_DECREASING, NULL); pixGetDimensions(pix1, &w, &h, NULL); nbox = boxaGetCount(ba2); if (nbox > 1) { box = boxaGetBox(ba2, 0, L_CLONE); boxGetGeometry(box, &bx, &by, &bw, &bh); left = (bx > mindist) ? erasedist : 0; right = (w - bx - bw > mindist) ? erasedist : 0; top = (by > mindist) ? erasedist : 0; bottom = (h - by - bh > mindist) ? erasedist : 0; pixSetOrClearBorder(pixm, left, right, top, bottom, PIX_CLR); boxDestroy(&box); } pixDestroy(&pix1); boxaDestroy(&ba1); boxaDestroy(&ba2); /* Locate the foreground region; don't bother cropping */ pixClipToForeground(pixm, NULL, &boxfg); /* Sanity check the fg region. Make sure it's not confined * to a thin boundary on the left and right sides of the image, * in which case it is likely to be noise. */ if (boxfg) { boxin = boxCreate(0.1 * w, 0, 0.8 * w, h); boxIntersects(boxfg, boxin, &intersects); boxDestroy(&boxin); if (!intersects) boxDestroy(&boxfg); } boxd = NULL; if (boxfg) { boxAdjustSides(boxfg, boxfg, -2, 2, -2, 2); /* tiny expansion */ boxd = boxTransform(boxfg, 0, 0, 2.0, 2.0); /* Save the debug image showing the box for this page */ if (pixac) { pixg2 = pixConvert1To4Cmap(pixb); pixRenderBoxArb(pixg2, boxd, 3, 255, 0, 0); pixacompAddPix(pixac, pixg2, IFF_DEFAULT); pixDestroy(&pixg2); } } pixDestroy(&pixb); pixDestroy(&pixb2); pixDestroy(&pixseed); pixDestroy(&pixsf); pixDestroy(&pixm); boxDestroy(&boxfg); return boxd; } /*------------------------------------------------------------------* * Extraction of characters from image with only text * *------------------------------------------------------------------*/ /*! * \brief pixSplitIntoCharacters() * * \param[in] pixs 1 bpp, contains only deskewed text * \param[in] minw min component width for initial filtering; typ. 4 * \param[in] minh min component height for initial filtering; typ. 4 * \param[out] pboxa [optional] character bounding boxes * \param[out] ppixa [optional] character images * \param[out] ppixdebug [optional] showing splittings * * \return 0 if OK, 1 on error * *
 * Notes:
 *      (1) This is a simple function that attempts to find split points
 *          based on vertical pixel profiles.
 *      (2) It should be given an image that has an arbitrary number
 *          of text characters.
 *      (3) The returned pixa includes the boxes from which the
 *          (possibly split) components are extracted.
 * 
*/ l_ok pixSplitIntoCharacters(PIX *pixs, l_int32 minw, l_int32 minh, BOXA **pboxa, PIXA **ppixa, PIX **ppixdebug) { l_int32 ncomp, i, xoff, yoff; BOXA *boxa1, *boxa2, *boxat1, *boxat2, *boxad; BOXAA *baa; PIX *pix, *pix1, *pix2, *pixdb; PIXA *pixa1, *pixadb; PROCNAME("pixSplitIntoCharacters"); if (pboxa) *pboxa = NULL; if (ppixa) *ppixa = NULL; if (ppixdebug) *ppixdebug = NULL; if (!pixs || pixGetDepth(pixs) != 1) return ERROR_INT("pixs not defined or not 1 bpp", procName, 1); /* Remove the small stuff */ pix1 = pixSelectBySize(pixs, minw, minh, 8, L_SELECT_IF_BOTH, L_SELECT_IF_GT, NULL); /* Small vertical close for consolidation */ pix2 = pixMorphSequence(pix1, "c1.10", 0); pixDestroy(&pix1); /* Get the 8-connected components */ boxa1 = pixConnComp(pix2, &pixa1, 8); pixDestroy(&pix2); boxaDestroy(&boxa1); /* Split the components if obvious */ ncomp = pixaGetCount(pixa1); boxa2 = boxaCreate(ncomp); pixadb = (ppixdebug) ? pixaCreate(ncomp) : NULL; for (i = 0; i < ncomp; i++) { pix = pixaGetPix(pixa1, i, L_CLONE); if (ppixdebug) { boxat1 = pixSplitComponentWithProfile(pix, 10, 7, &pixdb); if (pixdb) pixaAddPix(pixadb, pixdb, L_INSERT); } else { boxat1 = pixSplitComponentWithProfile(pix, 10, 7, NULL); } pixaGetBoxGeometry(pixa1, i, &xoff, &yoff, NULL, NULL); boxat2 = boxaTransform(boxat1, xoff, yoff, 1.0, 1.0); boxaJoin(boxa2, boxat2, 0, -1); pixDestroy(&pix); boxaDestroy(&boxat1); boxaDestroy(&boxat2); } pixaDestroy(&pixa1); /* Generate the debug image */ if (ppixdebug) { if (pixaGetCount(pixadb) > 0) { *ppixdebug = pixaDisplayTiledInRows(pixadb, 32, 1500, 1.0, 0, 20, 1); } pixaDestroy(&pixadb); } /* Do a 2D sort on the bounding boxes, and flatten the result to 1D */ baa = boxaSort2d(boxa2, NULL, 0, 0, 5); boxad = boxaaFlattenToBoxa(baa, NULL, L_CLONE); boxaaDestroy(&baa); boxaDestroy(&boxa2); /* Optionally extract the pieces from the input image */ if (ppixa) *ppixa = pixClipRectangles(pixs, boxad); if (pboxa) *pboxa = boxad; else boxaDestroy(&boxad); return 0; } /*! * \brief pixSplitComponentWithProfile() * * \param[in] pixs 1 bpp, exactly one connected component * \param[in] delta distance used in extrema finding in a numa; typ. 10 * \param[in] mindel minimum required difference between profile * minimum and profile values +2 and -2 away; typ. 7 * \param[out] ppixdebug [optional] debug image of splitting * \return boxa of c.c. after splitting, or NULL on error * *
 * Notes:
 *      (1) This will split the most obvious cases of touching characters.
 *          The split points it is searching for are narrow and deep
 *          minimima in the vertical pixel projection profile, after a
 *          large vertical closing has been applied to the component.
 * 
*/ BOXA * pixSplitComponentWithProfile(PIX *pixs, l_int32 delta, l_int32 mindel, PIX **ppixdebug) { l_int32 w, h, n2, i, firstmin, xmin, xshift; l_int32 nmin, nleft, nright, nsplit, isplit, ncomp; l_int32 *array1, *array2; BOX *box; BOXA *boxad; NUMA *na1, *na2, *nasplit; PIX *pix1, *pixdb; PROCNAME("pixSplitComponentsWithProfile"); if (ppixdebug) *ppixdebug = NULL; if (!pixs || pixGetDepth(pixs) != 1) return (BOXA *)ERROR_PTR("pixa undefined or not 1 bpp", procName, NULL); pixGetDimensions(pixs, &w, &h, NULL); /* Closing to consolidate characters vertically */ pix1 = pixCloseSafeBrick(NULL, pixs, 1, 100); /* Get extrema of column projections */ boxad = boxaCreate(2); na1 = pixCountPixelsByColumn(pix1); /* w elements */ pixDestroy(&pix1); na2 = numaFindExtrema(na1, delta, NULL); n2 = numaGetCount(na2); if (n2 < 3) { /* no split possible */ box = boxCreate(0, 0, w, h); boxaAddBox(boxad, box, L_INSERT); numaDestroy(&na1); numaDestroy(&na2); return boxad; } /* Look for sufficiently deep and narrow minima. * All minima of of interest must be surrounded by max on each * side. firstmin is the index of first possible minimum. */ array1 = numaGetIArray(na1); array2 = numaGetIArray(na2); if (ppixdebug) numaWriteStderr(na2); firstmin = (array1[array2[0]] > array1[array2[1]]) ? 1 : 2; nasplit = numaCreate(n2); /* will hold split locations */ for (i = firstmin; i < n2 - 1; i+= 2) { xmin = array2[i]; nmin = array1[xmin]; if (xmin + 2 >= w) break; /* no more splits possible */ nleft = array1[xmin - 2]; nright = array1[xmin + 2]; if (ppixdebug) { lept_stderr( "Splitting: xmin = %d, w = %d; nl = %d, nmin = %d, nr = %d\n", xmin, w, nleft, nmin, nright); } if (nleft - nmin >= mindel && nright - nmin >= mindel) /* split */ numaAddNumber(nasplit, xmin); } nsplit = numaGetCount(nasplit); #if 0 if (ppixdebug && nsplit > 0) { lept_mkdir("lept/split"); gplotSimple1(na1, GPLOT_PNG, "/tmp/lept/split/split", NULL); } #endif numaDestroy(&na1); numaDestroy(&na2); LEPT_FREE(array1); LEPT_FREE(array2); if (nsplit == 0) { /* no splitting */ numaDestroy(&nasplit); box = boxCreate(0, 0, w, h); boxaAddBox(boxad, box, L_INSERT); return boxad; } /* Use split points to generate b.b. after splitting */ for (i = 0, xshift = 0; i < nsplit; i++) { numaGetIValue(nasplit, i, &isplit); box = boxCreate(xshift, 0, isplit - xshift, h); boxaAddBox(boxad, box, L_INSERT); xshift = isplit + 1; } box = boxCreate(xshift, 0, w - xshift, h); boxaAddBox(boxad, box, L_INSERT); numaDestroy(&nasplit); if (ppixdebug) { pixdb = pixConvertTo32(pixs); ncomp = boxaGetCount(boxad); for (i = 0; i < ncomp; i++) { box = boxaGetBox(boxad, i, L_CLONE); pixRenderBoxBlend(pixdb, box, 1, 255, 0, 0, 0.5); boxDestroy(&box); } *ppixdebug = pixdb; } return boxad; } /*------------------------------------------------------------------* * Extraction of lines of text * *------------------------------------------------------------------*/ /*! * \brief pixExtractTextlines() * * \param[in] pixs any depth, assumed to have nearly horizontal text * \param[in] maxw, maxh initial filtering: remove any components in pixs * with components larger than maxw or maxh * \param[in] minw, minh final filtering: remove extracted 'lines' * with sizes smaller than minw or minh; use * 0 for default. * \param[in] adjw, adjh final adjustment of boxes representing each * text line. If > 0, these increase the box * size at each edge by this amount. * \param[in] pixadb pixa for saving intermediate steps; NULL to omit * \return pixa of textline images, including bounding boxes, or * NULL on error * *
 * Notes:
 *      (1) This function assumes that textline fragments have sufficient
 *          vertical separation and small enough skew so that a
 *          horizontal dilation sufficient to join words will not join
 *          textlines.  It does not guarantee that horizontally adjacent
 *          textline fragments on the same line will be joined.
 *      (2) For images with multiple columns, it attempts to avoid joining
 *          textlines across the space between columns.  If that is not
 *          a concern, you can also use pixExtractRawTextlines(),
 *          which will join them with alacrity.
 *      (3) This first removes components from pixs that are either
 *          wide (> %maxw) or tall (> %maxh).
 *      (4) A final filtering operation removes small components, such
 *          that width < %minw or height < %minh.
 *      (5) For reasonable accuracy, the resolution of pixs should be
 *          at least 100 ppi.  For reasonable efficiency, the resolution
 *          should not exceed 600 ppi.
 *      (6) This can be used to determine if some region of a scanned
 *          image is horizontal text.
 *      (7) As an example, for a pix with resolution 300 ppi, a reasonable
 *          set of parameters is:
 *             pixExtractTextlines(pix, 150, 150, 36, 20, 5, 5, NULL);
 *          The defaults minw and minh for 300 ppi are about 36 and 20,
 *          so the same result is obtained with:
 *             pixExtractTextlines(pix, 150, 150, 0, 0, 5, 5, NULL);
 *      (8) The output pixa is composed of subimages, one for each textline,
 *          and the boxa in the pixa tells where in %pixs each textline goes.
 * 
*/ PIXA * pixExtractTextlines(PIX *pixs, l_int32 maxw, l_int32 maxh, l_int32 minw, l_int32 minh, l_int32 adjw, l_int32 adjh, PIXA *pixadb) { char buf[64]; l_int32 res, csize, empty; BOXA *boxa1, *boxa2, *boxa3; PIX *pix1, *pix2, *pix3; PIXA *pixa1, *pixa2, *pixa3; PROCNAME("pixExtractTextlines"); if (!pixs) return (PIXA *)ERROR_PTR("pixs not defined", procName, NULL); /* Binarize carefully, if necessary */ if (pixGetDepth(pixs) > 1) { pix2 = pixConvertTo8(pixs, FALSE); pix3 = pixCleanBackgroundToWhite(pix2, NULL, NULL, 1.0, 70, 190); pix1 = pixThresholdToBinary(pix3, 150); pixDestroy(&pix2); pixDestroy(&pix3); } else { pix1 = pixClone(pixs); } pixZero(pix1, &empty); if (empty) { pixDestroy(&pix1); L_INFO("no fg pixels in input image\n", procName); return NULL; } if (pixadb) pixaAddPix(pixadb, pix1, L_COPY); /* Remove any very tall or very wide connected components */ pix2 = pixSelectBySize(pix1, maxw, maxh, 8, L_SELECT_IF_BOTH, L_SELECT_IF_LT, NULL); if (pixadb) pixaAddPix(pixadb, pix2, L_COPY); pixDestroy(&pix1); /* Filter to solidify the text lines within the x-height region. * The closing (csize) bridges gaps between words. The opening * removes isolated bridges between textlines. */ if ((res = pixGetXRes(pixs)) == 0) { L_INFO("Resolution is not set: setting to 300 ppi\n", procName); res = 300; } csize = L_MIN(120., 60.0 * res / 300.0); snprintf(buf, sizeof(buf), "c%d.1 + o%d.1", csize, csize / 3); pix3 = pixMorphCompSequence(pix2, buf, 0); if (pixadb) pixaAddPix(pixadb, pix3, L_COPY); /* Extract the connected components. These should be dilated lines */ boxa1 = pixConnComp(pix3, &pixa1, 4); if (pixadb) { pix1 = pixaDisplayRandomCmap(pixa1, 0, 0); pixcmapResetColor(pixGetColormap(pix1), 0, 255, 255, 255); pixaAddPix(pixadb, pix1, L_INSERT); } /* Set minw, minh if default is requested */ minw = (minw != 0) ? minw : (l_int32)(0.12 * res); minh = (minh != 0) ? minh : (l_int32)(0.07 * res); /* Remove line components that are too small */ pixa2 = pixaSelectBySize(pixa1, minw, minh, L_SELECT_IF_BOTH, L_SELECT_IF_GTE, NULL); if (pixadb) { pix1 = pixaDisplayRandomCmap(pixa2, 0, 0); pixcmapResetColor(pixGetColormap(pix1), 0, 255, 255, 255); pixaAddPix(pixadb, pix1, L_INSERT); pix1 = pixConvertTo32(pix2); pixRenderBoxaArb(pix1, pixa2->boxa, 2, 255, 0, 0); pixaAddPix(pixadb, pix1, L_INSERT); } /* Selectively AND with the version before dilation, and save */ boxa2 = pixaGetBoxa(pixa2, L_CLONE); boxa3 = boxaAdjustSides(boxa2, -adjw, adjw, -adjh, adjh); pixa3 = pixClipRectangles(pix2, boxa3); if (pixadb) { pix1 = pixaDisplayRandomCmap(pixa3, 0, 0); pixcmapResetColor(pixGetColormap(pix1), 0, 255, 255, 255); pixaAddPix(pixadb, pix1, L_INSERT); } pixDestroy(&pix2); pixDestroy(&pix3); pixaDestroy(&pixa1); pixaDestroy(&pixa2); boxaDestroy(&boxa1); boxaDestroy(&boxa2); boxaDestroy(&boxa3); return pixa3; } /*! * \brief pixExtractRawTextlines() * * \param[in] pixs any depth, assumed to have nearly horizontal text * \param[in] maxw, maxh initial filtering: remove any components in pixs * with components larger than maxw or maxh; * use 0 for default values. * \param[in] adjw, adjh final adjustment of boxes representing each * text line. If > 0, these increase the box * size at each edge by this amount. * \param[in] pixadb pixa for saving intermediate steps; NULL to omit * \return pixa of textline images, including bounding boxes, or * NULL on error * *
 * Notes:
 *      (1) This function assumes that textlines have sufficient
 *          vertical separation and small enough skew so that a
 *          horizontal dilation sufficient to join words will not join
 *          textlines.  It aggressively joins textlines across multiple
 *          columns, so if that is not desired, you must either (a) make
 *          sure that %pixs is a single column of text or (b) use instead
 *          pixExtractTextlines(), which is more conservative
 *          about joining text fragments that have vertical overlap.
 *      (2) This first removes components from pixs that are either
 *          very wide (> %maxw) or very tall (> %maxh).
 *      (3) For reasonable accuracy, the resolution of pixs should be
 *          at least 100 ppi.  For reasonable efficiency, the resolution
 *          should not exceed 600 ppi.
 *      (4) This can be used to determine if some region of a scanned
 *          image is horizontal text.
 *      (5) As an example, for a pix with resolution 300 ppi, a reasonable
 *          set of parameters is:
 *             pixExtractRawTextlines(pix, 150, 150, 0, 0, NULL);
 *      (6) The output pixa is composed of subimages, one for each textline,
 *          and the boxa in the pixa tells where in %pixs each textline goes.
 * 
*/ PIXA * pixExtractRawTextlines(PIX *pixs, l_int32 maxw, l_int32 maxh, l_int32 adjw, l_int32 adjh, PIXA *pixadb) { char buf[64]; l_int32 res, csize, empty; BOXA *boxa1, *boxa2, *boxa3; BOXAA *baa1; PIX *pix1, *pix2, *pix3; PIXA *pixa1, *pixa2; PROCNAME("pixExtractRawTextlines"); if (!pixs) return (PIXA *)ERROR_PTR("pixs not defined", procName, NULL); /* Set maxw, maxh if default is requested */ if ((res = pixGetXRes(pixs)) == 0) { L_INFO("Resolution is not set: setting to 300 ppi\n", procName); res = 300; } maxw = (maxw != 0) ? maxw : (l_int32)(0.5 * res); maxh = (maxh != 0) ? maxh : (l_int32)(0.5 * res); /* Binarize carefully, if necessary */ if (pixGetDepth(pixs) > 1) { pix2 = pixConvertTo8(pixs, FALSE); pix3 = pixCleanBackgroundToWhite(pix2, NULL, NULL, 1.0, 70, 190); pix1 = pixThresholdToBinary(pix3, 150); pixDestroy(&pix2); pixDestroy(&pix3); } else { pix1 = pixClone(pixs); } pixZero(pix1, &empty); if (empty) { pixDestroy(&pix1); L_INFO("no fg pixels in input image\n", procName); return NULL; } if (pixadb) pixaAddPix(pixadb, pix1, L_COPY); /* Remove any very tall or very wide connected components */ pix2 = pixSelectBySize(pix1, maxw, maxh, 8, L_SELECT_IF_BOTH, L_SELECT_IF_LT, NULL); if (pixadb) pixaAddPix(pixadb, pix2, L_COPY); pixDestroy(&pix1); /* Filter to solidify the text lines within the x-height region. * The closing (csize) bridges gaps between words. */ csize = L_MIN(120., 60.0 * res / 300.0); snprintf(buf, sizeof(buf), "c%d.1", csize); pix3 = pixMorphCompSequence(pix2, buf, 0); if (pixadb) pixaAddPix(pixadb, pix3, L_COPY); /* Extract the connected components. These should be dilated lines */ boxa1 = pixConnComp(pix3, &pixa1, 4); if (pixadb) { pix1 = pixaDisplayRandomCmap(pixa1, 0, 0); pixcmapResetColor(pixGetColormap(pix1), 0, 255, 255, 255); pixaAddPix(pixadb, pix1, L_INSERT); } /* Do a 2-d sort, and generate a bounding box for each set of text * line segments that is aligned horizontally (i.e., has vertical * overlap) into a box representing a single text line. */ baa1 = boxaSort2d(boxa1, NULL, -1, -1, 5); boxaaGetExtent(baa1, NULL, NULL, NULL, &boxa2); if (pixadb) { pix1 = pixConvertTo32(pix2); pixRenderBoxaArb(pix1, boxa2, 2, 255, 0, 0); pixaAddPix(pixadb, pix1, L_INSERT); } /* Optionally adjust the sides of each text line box, and then * use the boxes to generate a pixa of the text lines. */ boxa3 = boxaAdjustSides(boxa2, -adjw, adjw, -adjh, adjh); pixa2 = pixClipRectangles(pix2, boxa3); if (pixadb) { pix1 = pixaDisplayRandomCmap(pixa2, 0, 0); pixcmapResetColor(pixGetColormap(pix1), 0, 255, 255, 255); pixaAddPix(pixadb, pix1, L_INSERT); } pixDestroy(&pix2); pixDestroy(&pix3); pixaDestroy(&pixa1); boxaDestroy(&boxa1); boxaDestroy(&boxa2); boxaDestroy(&boxa3); boxaaDestroy(&baa1); return pixa2; } /*------------------------------------------------------------------* * How many text columns * *------------------------------------------------------------------*/ /*! * \brief pixCountTextColumns() * * \param[in] pixs 1 bpp * \param[in] deltafract fraction of (max - min) to be used in the delta * for extrema finding; typ 0.3 * \param[in] peakfract fraction of (max - min) to be used to threshold * the peak value; typ. 0.5 * \param[in] clipfract fraction of image dimension removed on each side; * typ. 0.1, which leaves w and h reduced by 0.8 * \param[out] pncols number of columns; -1 if not determined * \param[in] pixadb [optional] pre-allocated, for showing * intermediate computation; use null to skip * \return 0 if OK, 1 on error * *
 * Notes:
 *      (1) It is assumed that pixs has the correct resolution set.
 *          If the resolution is 0, we set to 300 and issue a warning.
 *      (2) If necessary, the image is scaled to between 37 and 75 ppi;
 *          most of the processing is done at this resolution.
 *      (3) If no text is found (essentially a blank page),
 *          this returns ncols = 0.
 *      (4) For debug output, input a pre-allocated pixa.
 * 
*/ l_ok pixCountTextColumns(PIX *pixs, l_float32 deltafract, l_float32 peakfract, l_float32 clipfract, l_int32 *pncols, PIXA *pixadb) { l_int32 w, h, res, i, n, npeak; l_float32 scalefact, redfact, minval, maxval, val4, val5, fract; BOX *box; NUMA *na1, *na2, *na3, *na4, *na5; PIX *pix1, *pix2, *pix3, *pix4, *pix5; PROCNAME("pixCountTextColumns"); if (!pncols) return ERROR_INT("&ncols not defined", procName, 1); *pncols = -1; /* init */ if (!pixs || pixGetDepth(pixs) != 1) return ERROR_INT("pixs not defined or not 1 bpp", procName, 1); if (deltafract < 0.15 || deltafract > 0.75) L_WARNING("deltafract not in [0.15 ... 0.75]\n", procName); if (peakfract < 0.25 || peakfract > 0.9) L_WARNING("peakfract not in [0.25 ... 0.9]\n", procName); if (clipfract < 0.0 || clipfract >= 0.5) return ERROR_INT("clipfract not in [0.0 ... 0.5)\n", procName, 1); if (pixadb) pixaAddPix(pixadb, pixs, L_COPY); /* Scale to between 37.5 and 75 ppi */ if ((res = pixGetXRes(pixs)) == 0) { L_WARNING("resolution undefined; set to 300\n", procName); pixSetResolution(pixs, 300, 300); res = 300; } if (res < 37) { L_WARNING("resolution %d very low\n", procName, res); scalefact = 37.5 / res; pix1 = pixScale(pixs, scalefact, scalefact); } else { redfact = (l_float32)res / 37.5; if (redfact < 2.0) pix1 = pixClone(pixs); else if (redfact < 4.0) pix1 = pixReduceRankBinaryCascade(pixs, 1, 0, 0, 0); else if (redfact < 8.0) pix1 = pixReduceRankBinaryCascade(pixs, 1, 2, 0, 0); else if (redfact < 16.0) pix1 = pixReduceRankBinaryCascade(pixs, 1, 2, 2, 0); else pix1 = pixReduceRankBinaryCascade(pixs, 1, 2, 2, 2); } if (pixadb) pixaAddPix(pixadb, pix1, L_COPY); /* Crop inner 80% of image */ pixGetDimensions(pix1, &w, &h, NULL); box = boxCreate(clipfract * w, clipfract * h, (1.0 - 2 * clipfract) * w, (1.0 - 2 * clipfract) * h); pix2 = pixClipRectangle(pix1, box, NULL); pixGetDimensions(pix2, &w, &h, NULL); boxDestroy(&box); if (pixadb) pixaAddPix(pixadb, pix2, L_COPY); /* Deskew */ pix3 = pixDeskew(pix2, 0); if (pixadb) pixaAddPix(pixadb, pix3, L_COPY); /* Close to increase column counts for text */ pix4 = pixCloseSafeBrick(NULL, pix3, 5, 21); if (pixadb) pixaAddPix(pixadb, pix4, L_COPY); pixInvert(pix4, pix4); na1 = pixCountByColumn(pix4, NULL); if (pixadb) { gplotSimple1(na1, GPLOT_PNG, "/tmp/lept/plot", NULL); pix5 = pixRead("/tmp/lept/plot.png"); pixaAddPix(pixadb, pix5, L_INSERT); } /* Analyze the column counts. na4 gives the locations of * the extrema in normalized units (0.0 to 1.0) across the * cropped image. na5 gives the magnitude of the * extrema, normalized to the dynamic range. The peaks * are values that are at least peakfract of (max - min). */ numaGetMax(na1, &maxval, NULL); numaGetMin(na1, &minval, NULL); fract = (l_float32)(maxval - minval) / h; /* is there much at all? */ if (fract < 0.05) { L_INFO("very little content on page; 0 text columns\n", procName); *pncols = 0; } else { na2 = numaFindExtrema(na1, deltafract * (maxval - minval), &na3); na4 = numaTransform(na2, 0, 1.0 / w); na5 = numaTransform(na3, -minval, 1.0 / (maxval - minval)); n = numaGetCount(na4); for (i = 0, npeak = 0; i < n; i++) { numaGetFValue(na4, i, &val4); numaGetFValue(na5, i, &val5); if (val4 > 0.3 && val4 < 0.7 && val5 >= peakfract) { npeak++; L_INFO("Peak(loc,val) = (%5.3f,%5.3f)\n", procName, val4, val5); } } *pncols = npeak + 1; numaDestroy(&na2); numaDestroy(&na3); numaDestroy(&na4); numaDestroy(&na5); } pixDestroy(&pix1); pixDestroy(&pix2); pixDestroy(&pix3); pixDestroy(&pix4); numaDestroy(&na1); return 0; } /*------------------------------------------------------------------* * Decision text vs photo * *------------------------------------------------------------------*/ /*! * \brief pixDecideIfText() * * \param[in] pixs any depth * \param[in] box [optional] if null, use entire pixs * \param[out] pistext 1 if text; 0 if photo; -1 if not determined or empty * \param[in] pixadb [optional] pre-allocated, for showing intermediate * computation; use NULL to skip * \return 0 if OK, 1 on error * *
 * Notes:
 *      (1) It is assumed that pixs has the correct resolution set.
 *          If the resolution is 0, we set to 300 and issue a warning.
 *      (2) If necessary, the image is scaled to 300 ppi; most of the
 *          processing is done at this resolution.
 *      (3) Text is assumed to be in horizontal lines.
 *      (4) Because thin vertical lines are removed before filtering for
 *          text lines, this should identify tables as text.
 *      (5) If %box is null and pixs contains both text lines and line art,
 *          this function might return %istext == true.
 *      (6) If the input pixs is empty, or for some other reason the
 *          result can not be determined, return -1.
 *      (7) For debug output, input a pre-allocated pixa.
 * 
*/ l_ok pixDecideIfText(PIX *pixs, BOX *box, l_int32 *pistext, PIXA *pixadb) { l_int32 i, empty, maxw, w, h, n1, n2, n3, minlines, big_comp; l_float32 ratio1, ratio2; L_BMF *bmf; BOXA *boxa1, *boxa2, *boxa3, *boxa4, *boxa5; PIX *pix1, *pix2, *pix3, *pix4, *pix5, *pix6, *pix7; PIXA *pixa1; SEL *sel1; PROCNAME("pixDecideIfText"); if (!pistext) return ERROR_INT("&istext not defined", procName, 1); *pistext = -1; if (!pixs) return ERROR_INT("pixs not defined", procName, 1); /* Crop, convert to 1 bpp, 300 ppi */ if ((pix1 = pixPrepare1bpp(pixs, box, 0.1f, 300)) == NULL) return ERROR_INT("pix1 not made", procName, 1); pixZero(pix1, &empty); if (empty) { pixDestroy(&pix1); L_INFO("pix is empty\n", procName); return 0; } w = pixGetWidth(pix1); /* Identify and remove tall, thin vertical lines (as found in tables) * that are up to 9 pixels wide. Make a hit-miss sel with an * 81 pixel vertical set of hits and with 3 pairs of misses that * are 10 pixels apart horizontally. It is necessary to use a * hit-miss transform; if we only opened with a vertical line of * hits, we would remove solid regions of pixels that are not * text or vertical lines. */ pix2 = pixCreate(11, 81, 1); for (i = 0; i < 81; i++) pixSetPixel(pix2, 5, i, 1); sel1 = selCreateFromPix(pix2, 40, 5, NULL); selSetElement(sel1, 20, 0, SEL_MISS); selSetElement(sel1, 20, 10, SEL_MISS); selSetElement(sel1, 40, 0, SEL_MISS); selSetElement(sel1, 40, 10, SEL_MISS); selSetElement(sel1, 60, 0, SEL_MISS); selSetElement(sel1, 60, 10, SEL_MISS); pix3 = pixHMT(NULL, pix1, sel1); pix4 = pixSeedfillBinaryRestricted(NULL, pix3, pix1, 8, 5, 1000); pix5 = pixXor(NULL, pix1, pix4); pixDestroy(&pix2); selDestroy(&sel1); /* Convert the text lines to separate long horizontal components */ pix6 = pixMorphCompSequence(pix5, "c30.1 + o15.1 + c60.1 + o2.2", 0); /* Estimate the distance to the bottom of the significant region */ if (box) { /* use full height */ pixGetDimensions(pix6, NULL, &h, NULL); } else { /* use height of region that has text lines */ pixFindThreshFgExtent(pix6, 400, NULL, &h); } if (pixadb) { bmf = bmfCreate(NULL, 6); pixaAddPixWithText(pixadb, pix1, 1, bmf, "threshold/crop to binary", 0x0000ff00, L_ADD_BELOW); pixaAddPixWithText(pixadb, pix3, 2, bmf, "hit-miss for vertical line", 0x0000ff00, L_ADD_BELOW); pixaAddPixWithText(pixadb, pix4, 2, bmf, "restricted seed-fill", 0x0000ff00, L_ADD_BELOW); pixaAddPixWithText(pixadb, pix5, 2, bmf, "remove using xor", 0x0000ff00, L_ADD_BELOW); pixaAddPixWithText(pixadb, pix6, 2, bmf, "make long horiz components", 0x0000ff00, L_ADD_BELOW); } /* Extract the connected components */ if (pixadb) { boxa1 = pixConnComp(pix6, &pixa1, 8); pix7 = pixaDisplayRandomCmap(pixa1, 0, 0); pixcmapResetColor(pixGetColormap(pix7), 0, 255, 255, 255); pixaAddPixWithText(pixadb, pix7, 2, bmf, "show connected components", 0x0000ff00, L_ADD_BELOW); pixDestroy(&pix7); pixaDestroy(&pixa1); bmfDestroy(&bmf); } else { boxa1 = pixConnComp(pix6, NULL, 8); } /* Analyze the connected components. The following conditions * at 300 ppi must be satisfied if the image is text: * (1) There are no components that are wider than 400 pixels and * taller than 175 pixels. * (2) The second longest component is at least 60% of the * (possibly cropped) image width. This catches images * that don't have any significant content. * (3) Of the components that are at least 40% of the length * of the longest (n2), at least 80% of them must not exceed * 60 pixels in height. * (4) The number of those long, thin components (n3) must * equal or exceed a minimum that scales linearly with the * image height. * Most images that are not text fail more than one of these * conditions. */ boxa2 = boxaSort(boxa1, L_SORT_BY_WIDTH, L_SORT_DECREASING, NULL); boxaGetBoxGeometry(boxa2, 1, NULL, NULL, &maxw, NULL); /* 2nd longest */ boxa3 = boxaSelectBySize(boxa1, 0.4 * maxw, 0, L_SELECT_WIDTH, L_SELECT_IF_GTE, NULL); boxa4 = boxaSelectBySize(boxa3, 0, 60, L_SELECT_HEIGHT, L_SELECT_IF_LTE, NULL); boxa5 = boxaSelectBySize(boxa1, 400, 175, L_SELECT_IF_BOTH, L_SELECT_IF_GT, NULL); big_comp = (boxaGetCount(boxa5) == 0) ? 0 : 1; n1 = boxaGetCount(boxa1); n2 = boxaGetCount(boxa3); n3 = boxaGetCount(boxa4); ratio1 = (l_float32)maxw / (l_float32)w; ratio2 = (l_float32)n3 / (l_float32)n2; minlines = L_MAX(2, h / 125); if (big_comp || ratio1 < 0.6 || ratio2 < 0.8 || n3 < minlines) *pistext = 0; else *pistext = 1; if (pixadb) { if (*pistext == 1) { L_INFO("This is text: \n n1 = %d, n2 = %d, n3 = %d, " "minlines = %d\n maxw = %d, ratio1 = %4.2f, h = %d, " "big_comp = %d\n", procName, n1, n2, n3, minlines, maxw, ratio1, h, big_comp); } else { L_INFO("This is not text: \n n1 = %d, n2 = %d, n3 = %d, " "minlines = %d\n maxw = %d, ratio1 = %4.2f, h = %d, " "big_comp = %d\n", procName, n1, n2, n3, minlines, maxw, ratio1, h, big_comp); } } boxaDestroy(&boxa1); boxaDestroy(&boxa2); boxaDestroy(&boxa3); boxaDestroy(&boxa4); boxaDestroy(&boxa5); pixDestroy(&pix1); pixDestroy(&pix3); pixDestroy(&pix4); pixDestroy(&pix5); pixDestroy(&pix6); return 0; } /*! * \brief pixFindThreshFgExtent() * * \param[in] pixs 1 bpp * \param[in] thresh threshold number of pixels in row * \param[out] ptop [optional] location of top of region * \param[out] pbot [optional] location of bottom of region * \return 0 if OK, 1 on error */ l_ok pixFindThreshFgExtent(PIX *pixs, l_int32 thresh, l_int32 *ptop, l_int32 *pbot) { l_int32 i, n; l_int32 *array; NUMA *na; PROCNAME("pixFindThreshFgExtent"); if (ptop) *ptop = 0; if (pbot) *pbot = 0; if (!ptop && !pbot) return ERROR_INT("nothing to determine", procName, 1); if (!pixs || pixGetDepth(pixs) != 1) return ERROR_INT("pixs not defined or not 1 bpp", procName, 1); na = pixCountPixelsByRow(pixs, NULL); n = numaGetCount(na); array = numaGetIArray(na); if (ptop) { for (i = 0; i < n; i++) { if (array[i] >= thresh) { *ptop = i; break; } } } if (pbot) { for (i = n - 1; i >= 0; i--) { if (array[i] >= thresh) { *pbot = i; break; } } } LEPT_FREE(array); numaDestroy(&na); return 0; } /*------------------------------------------------------------------* * Decision: table vs text * *------------------------------------------------------------------*/ /*! * \brief pixDecideIfTable() * * \param[in] pixs any depth, any resolution >= 75 ppi * \param[in] box [optional] if null, use entire pixs * \param[in] orient L_PORTRAIT_MODE, L_LANDSCAPE_MODE * \param[out] pscore 0 - 4; -1 if not determined * \param[in] pixadb [optional] pre-allocated, for showing intermediate * computation; use NULL to skip * \return 0 if OK, 1 on error * *
 * Notes:
 *      (1) It is assumed that pixs has the correct resolution set.
 *          If the resolution is 0, we assume it is 300 ppi and issue a warning.
 *      (2) If %orient == L_LANDSCAPE_MODE, the image is rotated 90 degrees
 *          clockwise before being analyzed.
 *      (3) The interpretation of the returned score:
 *            -1     undetermined
 *             0     no table
 *             1     unlikely to have a table
 *             2     likely to have a table
 *             3     even more likely to have a table
 *             4     extremely likely to have a table
 *          * Setting the condition for finding a table at score >= 2 works
 *            well, except for false positives on kanji and landscape text.
 *          * These false positives can be removed by setting the condition
 *            at score >= 3, but recall is lowered because it will not find
 *            tables without either horizontal or vertical lines.
 *      (4) Most of the processing takes place at 75 ppi.
 *      (5) Internally, three numbers are determined, for horizontal and
 *          vertical fg lines, and for vertical bg lines.  From these,
 *          four tests are made to decide if there is a table occupying
 *          a significant part of the image.
 *      (6) Images have arbitrary content and would be likely to trigger
 *          this detector, so they are checked for first, and if found,
 *          return with a 0 (no table) score.
 *      (7) Musical scores (tablature) are likely to trigger the detector.
 *      (8) Tables of content with more than 2 columns are likely to
 *          trigger the detector.
 *      (9) For debug output, input a pre-allocated pixa.
 * 
*/ l_ok pixDecideIfTable(PIX *pixs, BOX *box, l_int32 orient, l_int32 *pscore, PIXA *pixadb) { l_int32 empty, nhb, nvb, nvw, score, htfound; PIX *pix1, *pix2, *pix3, *pix4, *pix5, *pix6, *pix7, *pix8, *pix9; PROCNAME("pixDecideIfTable"); if (!pscore) return ERROR_INT("&score not defined", procName, 1); *pscore = -1; if (!pixs) return ERROR_INT("pixs not defined", procName, 1); /* Check if there is an image region. First convert to 1 bpp * at 175 ppi. If an image is found, assume there is no table. */ pix1 = pixPrepare1bpp(pixs, box, 0.1f, 175); pix2 = pixGenerateHalftoneMask(pix1, NULL, &htfound, NULL); if (htfound && pixadb) pixaAddPix(pixadb, pix2, L_COPY); pixDestroy(&pix1); pixDestroy(&pix2); if (htfound) { *pscore = 0; L_INFO("pix has an image region\n", procName); return 0; } /* Crop, convert to 1 bpp, 75 ppi */ if ((pix1 = pixPrepare1bpp(pixs, box, 0.05f, 75)) == NULL) return ERROR_INT("pix1 not made", procName, 1); pixZero(pix1, &empty); if (empty) { *pscore = 0; pixDestroy(&pix1); L_INFO("pix is empty\n", procName); return 0; } /* The 2x2 dilation on 75 ppi makes these two approaches very similar: * (1) pix1 = pixPrepare1bpp(..., 300); // 300 ppi resolution * pix2 = pixReduceRankBinaryCascade(pix1, 1, 1, 0, 0); * (2) pix1 = pixPrepare1bpp(..., 75); // 75 ppi resolution * pix2 = pixDilateBrick(NULL, pix1, 2, 2); * But (2) is more efficient if the input image to pixPrepare1bpp() * is not at 300 ppi. */ pix2 = pixDilateBrick(NULL, pix1, 2, 2); /* Deskew both horizontally and vertically; rotate by 90 * degrees if in landscape mode. */ pix3 = pixDeskewBoth(pix2, 1); if (pixadb) { pixaAddPix(pixadb, pix2, L_COPY); pixaAddPix(pixadb, pix3, L_COPY); } if (orient == L_LANDSCAPE_MODE) pix4 = pixRotate90(pix3, 1); else pix4 = pixClone(pix3); pixDestroy(&pix1); pixDestroy(&pix2); pixDestroy(&pix3); pix1 = pixClone(pix4); pixDestroy(&pix4); /* Look for horizontal and vertical lines */ pix2 = pixMorphSequence(pix1, "o100.1 + c1.4", 0); pix3 = pixSeedfillBinary(NULL, pix2, pix1, 8); pix4 = pixMorphSequence(pix1, "o1.100 + c4.1", 0); pix5 = pixSeedfillBinary(NULL, pix4, pix1, 8); pix6 = pixOr(NULL, pix3, pix5); if (pixadb) { pixaAddPix(pixadb, pix2, L_COPY); pixaAddPix(pixadb, pix4, L_COPY); pixaAddPix(pixadb, pix3, L_COPY); pixaAddPix(pixadb, pix5, L_COPY); pixaAddPix(pixadb, pix6, L_COPY); } pixCountConnComp(pix2, 8, &nhb); /* number of horizontal black lines */ pixCountConnComp(pix4, 8, &nvb); /* number of vertical black lines */ /* Remove the lines */ pixSubtract(pix1, pix1, pix6); if (pixadb) pixaAddPix(pixadb, pix1, L_COPY); /* Remove noise pixels */ pix7 = pixMorphSequence(pix1, "c4.1 + o8.1", 0); if (pixadb) pixaAddPix(pixadb, pix7, L_COPY); /* Look for vertical white space. Invert to convert white bg * to fg. Use a single rank-1 2x reduction, which closes small * fg holes, for the final processing at 37.5 ppi. * The vertical opening is then about 3 inches on a 300 ppi image. * We also remove vertical whitespace that is less than 5 pixels * wide at this resolution (about 0.1 inches) */ pixInvert(pix7, pix7); pix8 = pixMorphSequence(pix7, "r1 + o1.100", 0); pix9 = pixSelectBySize(pix8, 5, 0, 8, L_SELECT_WIDTH, L_SELECT_IF_GTE, NULL); pixCountConnComp(pix9, 8, &nvw); /* number of vertical white lines */ if (pixadb) { pixaAddPix(pixadb, pixScale(pix8, 2.0, 2.0), L_INSERT); pixaAddPix(pixadb, pixScale(pix9, 2.0, 2.0), L_INSERT); } /* Require at least 2 of the following 4 conditions for a table. * Some tables do not have black (fg) lines, and for those we * require more than 6 long vertical whitespace (bg) lines. */ score = 0; if (nhb > 1) score++; if (nvb > 2) score++; if (nvw > 3) score++; if (nvw > 6) score++; *pscore = score; pixDestroy(&pix1); pixDestroy(&pix2); pixDestroy(&pix3); pixDestroy(&pix4); pixDestroy(&pix5); pixDestroy(&pix6); pixDestroy(&pix7); pixDestroy(&pix8); pixDestroy(&pix9); return 0; } /*! * \brief pixPrepare1bpp() * * \param[in] pixs any depth * \param[in] box [optional] if null, use entire pixs * \param[in] cropfract fraction to be removed from the boundary; * use 0.0 to retain the entire image * \param[in] outres desired resolution of output image; if the * input image resolution is not set, assume * 300 ppi; use 0 to skip scaling. * \return pixd if OK, NULL on error * *
 * Notes:
 *      (1) This handles some common pre-processing operations,
 *          where the page segmentation algorithm takes a 1 bpp image.
 * 
*/ PIX * pixPrepare1bpp(PIX *pixs, BOX *box, l_float32 cropfract, l_int32 outres) { l_int32 w, h, res; l_float32 factor; BOX *box1; PIX *pix1, *pix2, *pix3, *pix4, *pix5; PROCNAME("pixPrepare1bpp"); if (!pixs) return (PIX *)ERROR_PTR("pixs not defined", procName, NULL); /* Crop the image. If no box is given, use %cropfract to remove * pixels near the image boundary; this helps avoid false * negatives from noise that is often found there. */ if (box) { pix1 = pixClipRectangle(pixs, box, NULL); } else { pixGetDimensions(pixs, &w, &h, NULL); box1 = boxCreate((l_int32)(cropfract * w), (l_int32)(cropfract * h), (l_int32)((1.0 - 2 * cropfract) * w), (l_int32)((1.0 - 2 * cropfract) * h)); pix1 = pixClipRectangle(pixs, box1, NULL); boxDestroy(&box1); } /* Convert to 1 bpp with adaptive background cleaning */ if (pixGetDepth(pixs) > 1) { pix2 = pixConvertTo8(pix1, 0); pix3 = pixCleanBackgroundToWhite(pix2, NULL, NULL, 1.0, 70, 160); pixDestroy(&pix1); pixDestroy(&pix2); if (!pix3) { L_INFO("pix cleaning failed\n", procName); return NULL; } pix4 = pixThresholdToBinary(pix3, 200); pixDestroy(&pix3); } else { pix4 = pixClone(pix1); pixDestroy(&pix1); } /* Scale the image to the requested output resolution; do not scale if %outres <= 0 */ if (outres <= 0) return pix4; if ((res = pixGetXRes(pixs)) == 0) { L_WARNING("Resolution is not set: using 300 ppi\n", procName); res = 300; } if (res != outres) { factor = (l_float32)outres / (l_float32)res; pix5 = pixScale(pix4, factor, factor); } else { pix5 = pixClone(pix4); } pixDestroy(&pix4); return pix5; } /*------------------------------------------------------------------* * Estimate the grayscale background value * *------------------------------------------------------------------*/ /*! * \brief pixEstimateBackground() * * \param[in] pixs 8 bpp, with or without colormap * \param[in] darkthresh pixels below this value are never considered * part of the background; typ. 70; use 0 to skip * \param[in] edgecrop fraction of half-width on each side, and of * half-height at top and bottom, that are cropped * \param[out] pbg estimated background, or 0 on error * \return 0 if OK, 1 on error * *
 * Notes:
 *      (1) Caller should check that return bg value is > 0.
 * 
*/ l_ok pixEstimateBackground(PIX *pixs, l_int32 darkthresh, l_float32 edgecrop, l_int32 *pbg) { l_int32 w, h, sampling; l_float32 fbg; BOX *box; PIX *pix1, *pix2, *pixm; PROCNAME("pixEstimateBackground"); if (!pbg) return ERROR_INT("&bg not defined", procName, 1); *pbg = 0; if (!pixs || pixGetDepth(pixs) != 8) return ERROR_INT("pixs not defined or not 8 bpp", procName, 1); if (darkthresh > 128) L_WARNING("darkthresh unusually large\n", procName); if (edgecrop < 0.0 || edgecrop >= 1.0) return ERROR_INT("edgecrop not in [0.0 ... 1.0)", procName, 1); pix1 = pixRemoveColormap(pixs, REMOVE_CMAP_TO_GRAYSCALE); pixGetDimensions(pix1, &w, &h, NULL); /* Optionally crop inner part of image */ if (edgecrop > 0.0) { box = boxCreate(0.5 * edgecrop * w, 0.5 * edgecrop * h, (1.0 - edgecrop) * w, (1.0 - edgecrop) * h); pix2 = pixClipRectangle(pix1, box, NULL); boxDestroy(&box); } else { pix2 = pixClone(pix1); } /* We will use no more than 50K samples */ sampling = L_MAX(1, (l_int32)sqrt((l_float64)(w * h) / 50000. + 0.5)); /* Optionally make a mask over all pixels lighter than %darkthresh */ pixm = NULL; if (darkthresh > 0) { pixm = pixThresholdToBinary(pix2, darkthresh); pixInvert(pixm, pixm); } pixGetRankValueMasked(pix2, pixm, 0, 0, sampling, 0.5, &fbg, NULL); *pbg = (l_int32)(fbg + 0.5); pixDestroy(&pix1); pixDestroy(&pix2); pixDestroy(&pixm); return 0; } /*---------------------------------------------------------------------* * Largest white or black rectangles in an image * *---------------------------------------------------------------------*/ /*! * \brief pixFindLargeRectangles() * * \param[in] pixs 1 bpp * \param[in] polarity 0 within background, 1 within foreground * \param[in] nrect number of rectangles to be found * \param[out] pboxa largest rectangles, sorted by decreasing area * \param[in,out] ppixdb optional return output with rectangles drawn on it * \return 0 if OK, 1 on error * *
 * Notes:
 *      (1) This does a greedy search to find the largest rectangles,
 *          either black or white and without overlaps, in %pix.
 *      (2) See pixFindLargestRectangle(), which is called multiple
 *          times, for details.  On each call, the largest rectangle
 *          found is painted, so that none of its pixels can be
 *          used later, before calling it again.
 *      (3) This function is surprisingly fast.  Although
 *          pixFindLargestRectangle() runs at about 50 MPix/sec, when it
 *          is run multiple times by pixFindLargeRectangles(), it processes
 *          at 150 - 250 MPix/sec, and the time is approximately linear
 *          in %nrect.  For example, for a 1 MPix image, searching for
 *          the largest 50 boxes takes about 0.2 seconds.
 * 
*/ l_ok pixFindLargeRectangles(PIX *pixs, l_int32 polarity, l_int32 nrect, BOXA **pboxa, PIX **ppixdb) { l_int32 i, op, bx, by, bw, bh; BOX *box; BOXA *boxa; PIX *pix; PROCNAME("pixFindLargeRectangles"); if (ppixdb) *ppixdb = NULL; if (!pboxa) return ERROR_INT("&boxa not defined", procName, 1); *pboxa = NULL; if (!pixs || pixGetDepth(pixs) != 1) return ERROR_INT("pixs not defined or not 1 bpp", procName, 1); if (polarity != 0 && polarity != 1) return ERROR_INT("invalid polarity", procName, 1); if (nrect > 1000) { L_WARNING("large num rectangles = %d requested; using 1000\n", procName, nrect); nrect = 1000; } pix = pixCopy(NULL, pixs); boxa = boxaCreate(nrect); *pboxa = boxa; /* Sequentially find largest rectangle and fill with opposite color */ for (i = 0; i < nrect; i++) { if (pixFindLargestRectangle(pix, polarity, &box, NULL) == 1) { boxDestroy(&box); L_ERROR("failure in pixFindLargestRectangle\n", procName); break; } boxaAddBox(boxa, box, L_INSERT); op = (polarity == 0) ? PIX_SET : PIX_CLR; boxGetGeometry(box, &bx, &by, &bw, &bh); pixRasterop(pix, bx, by, bw, bh, op, NULL, 0, 0); } if (ppixdb) *ppixdb = pixDrawBoxaRandom(pixs, boxa, 3); pixDestroy(&pix); return 0; } /*! * \brief pixFindLargestRectangle() * * \param[in] pixs 1 bpp * \param[in] polarity 0 within background, 1 within foreground * \param[out] pbox largest area rectangle * \param[in,out] ppixdb optional return output with rectangle drawn on it * \return 0 if OK, 1 on error * *
 * Notes:
 *      (1) This is a simple and elegant solution to a problem in
 *          computational geometry that at first appears to be quite
 *          difficult: what is the largest rectangle that can be
 *          placed in the image, covering only pixels of one polarity
 *          (bg or fg)?  The solution is O(n), where n is the number
 *          of pixels in the image, and it requires nothing more than
 *          using a simple recursion relation in a single sweep of the image.
 *      (2) In a sweep from UL to LR with left-to-right being the fast
 *          direction, calculate the largest white rectangle at (x, y),
 *          using previously calculated values at pixels #1 and #2:
 *             #1:    (x, y - 1)
 *             #2:    (x - 1, y)
 *          We also need the most recent "black" pixels that were seen
 *          in the current row and column.
 *          Consider the largest area.  There are only two possibilities:
 *             (a)  Min(w(1), horizdist) * (h(1) + 1)
 *             (b)  Min(h(2), vertdist) * (w(2) + 1)
 *          where
 *             horizdist: the distance from the rightmost "black" pixel seen
 *                        in the current row across to the current pixel
 *             vertdist: the distance from the lowest "black" pixel seen
 *                       in the current column down to the current pixel
 *          and we choose the Max of (a) and (b).
 *      (3) To convince yourself that these recursion relations are correct,
 *          it helps to draw the maximum rectangles at #1 and #2.
 *          Then for #1, you try to extend the rectangle down one line,
 *          so that the height is h(1) + 1.  Do you get the full
 *          width of #1, w(1)?  It depends on where the black pixels are
 *          in the current row.  You know the final width is bounded by w(1)
 *          and w(2) + 1, but the actual value depends on the distribution
 *          of black pixels in the current row that are at a distance
 *          from the current pixel that is between these limits.
 *          We call that value "horizdist", and the area is then given
 *          by the expression (a) above.  Using similar reasoning for #2,
 *          where you attempt to extend the rectangle to the right
 *          by 1 pixel, you arrive at (b).  The largest rectangle is
 *          then found by taking the Max.
 * 
*/ l_ok pixFindLargestRectangle(PIX *pixs, l_int32 polarity, BOX **pbox, PIX **ppixdb) { l_int32 i, j, w, h, d, wpls, val; l_int32 wp, hp, w1, w2, h1, h2, wmin, hmin, area1, area2; l_int32 xmax, ymax; /* LR corner of the largest rectangle */ l_int32 maxarea, wmax, hmax, vertdist, horizdist, prevfg; l_int32 *lowestfg; l_uint32 *datas, *lines; l_uint32 **linew, **lineh; BOX *box; PIX *pixw, *pixh; /* keeps the width and height for the largest */ /* rectangles whose LR corner is located there. */ PROCNAME("pixFindLargestRectangle"); if (ppixdb) *ppixdb = NULL; if (!pbox) return ERROR_INT("&box not defined", procName, 1); *pbox = NULL; if (!pixs) return ERROR_INT("pixs not defined", procName, 1); pixGetDimensions(pixs, &w, &h, &d); if (d != 1) return ERROR_INT("pixs not 1 bpp", procName, 1); if (polarity != 0 && polarity != 1) return ERROR_INT("invalid polarity", procName, 1); /* Initialize lowest "fg" seen so far for each column */ lowestfg = (l_int32 *)LEPT_CALLOC(w, sizeof(l_int32)); for (i = 0; i < w; i++) lowestfg[i] = -1; /* The combination (val ^ polarity) is the color for which we * are searching for the maximum rectangle. For polarity == 0, * we search in the bg (white). */ pixw = pixCreate(w, h, 32); /* stores width */ pixh = pixCreate(w, h, 32); /* stores height */ linew = (l_uint32 **)pixGetLinePtrs(pixw, NULL); lineh = (l_uint32 **)pixGetLinePtrs(pixh, NULL); datas = pixGetData(pixs); wpls = pixGetWpl(pixs); maxarea = xmax = ymax = wmax = hmax = 0; for (i = 0; i < h; i++) { lines = datas + i * wpls; prevfg = -1; for (j = 0; j < w; j++) { val = GET_DATA_BIT(lines, j); if ((val ^ polarity) == 0) { /* bg (0) if polarity == 0, etc. */ if (i == 0 && j == 0) { wp = hp = 1; } else if (i == 0) { wp = linew[i][j - 1] + 1; hp = 1; } else if (j == 0) { wp = 1; hp = lineh[i - 1][j] + 1; } else { /* Expand #1 prev rectangle down */ w1 = linew[i - 1][j]; h1 = lineh[i - 1][j]; horizdist = j - prevfg; wmin = L_MIN(w1, horizdist); /* width of new rectangle */ area1 = wmin * (h1 + 1); /* Expand #2 prev rectangle to right */ w2 = linew[i][j - 1]; h2 = lineh[i][j - 1]; vertdist = i - lowestfg[j]; hmin = L_MIN(h2, vertdist); /* height of new rectangle */ area2 = hmin * (w2 + 1); if (area1 > area2) { wp = wmin; hp = h1 + 1; } else { wp = w2 + 1; hp = hmin; } } } else { /* fg (1) if polarity == 0; bg (0) if polarity == 1 */ prevfg = j; lowestfg[j] = i; wp = hp = 0; } linew[i][j] = wp; lineh[i][j] = hp; if (wp * hp > maxarea) { maxarea = wp * hp; xmax = j; ymax = i; wmax = wp; hmax = hp; } } } /* Translate from LR corner to Box coords (UL corner, w, h) */ box = boxCreate(xmax - wmax + 1, ymax - hmax + 1, wmax, hmax); *pbox = box; if (ppixdb) { *ppixdb = pixConvertTo8(pixs, TRUE); pixRenderHashBoxArb(*ppixdb, box, 6, 2, L_NEG_SLOPE_LINE, 1, 255, 0, 0); } LEPT_FREE(linew); LEPT_FREE(lineh); LEPT_FREE(lowestfg); pixDestroy(&pixw); pixDestroy(&pixh); return 0; } /*---------------------------------------------------------------------* * Generate rectangle inside connected component * *---------------------------------------------------------------------*/ /*! * \brief pixFindRectangleInCC() * * \param[in] pixs 1 bpp, with sufficient closings to make the fg be * a single c.c. that is a convex hull * \param[in] boxs [optional] if NULL, %pixs should be a minimum * container of a single c.c. * \param[in] fract first and all consecutive lines found must be at * least this fraction of the fast scan dimension * \param[in] dir L_SCAN_HORIZONTAL, L_SCAN_VERTICAL; direction of * fast scan * \param[in] select L_GEOMETRIC_UNION, L_GEOMETRIC_INTERSECTION, * L_LARGEST_AREA, L_SMALEST_AREA * \param[in] debug if 1, generates output pdf showing intermediate * computation and final result * \return box of included rectangle, or NULL on error * *
 * Notes:
 *      (1) Computation is similar to pixFindLargestRectangle(), but allows
 *          a different set of results to choose from.
 *      (2) Select the fast scan direction.  Then, scanning in the slow
 *          direction, find the longest run of ON pixels in the fast
 *          scan direction and look for the first run that is longer
 *          than %fract of the dimension.  Continue until a shorter run
 *          is found.  This generates a box of ON pixels fitting into the c.c.
 *      (3) Do this from both slow scan directions and use %select to get
 *          a resulting box from these two.
 *      (4) The extracted rectangle is not necessarily the largest that
 *          can fit in the c.c.  To get that, use pixFindLargestRectangle().
 */
BOX *
pixFindRectangleInCC(PIX       *pixs,
                     BOX       *boxs,
                     l_float32  fract,
                     l_int32    dir,
                     l_int32    select,
                     l_int32    debug)
{
l_int32  x, y, i, w, h, w1, h1, w2, h2, found, res;
l_int32  xfirst, xlast, xstart, yfirst, ylast, length;
BOX     *box1, *box2, *box3, *box4, *box5;
PIX     *pix1, *pix2, *pixdb1, *pixdb2;
PIXA    *pixadb;

    PROCNAME("pixFindRectangleInCC");

    if (!pixs || pixGetDepth(pixs) != 1)
        return (BOX *)ERROR_PTR("pixs undefined or not 1 bpp", procName, NULL);
    if (fract <= 0.0 || fract > 1.0)
        return (BOX *)ERROR_PTR("invalid fraction", procName, NULL);
    if (dir != L_SCAN_VERTICAL && dir != L_SCAN_HORIZONTAL)
        return (BOX *)ERROR_PTR("invalid scan direction", procName, NULL);
    if (select != L_GEOMETRIC_UNION && select != L_GEOMETRIC_INTERSECTION &&
        select != L_LARGEST_AREA && select != L_SMALLEST_AREA)
        return (BOX *)ERROR_PTR("invalid select", procName, NULL);

        /* Extract the c.c. if necessary */
    x = y = 0;
    if (boxs) {
        pix1 = pixClipRectangle(pixs, boxs, NULL);
        boxGetGeometry(boxs, &x, &y, NULL, NULL);
    } else {
        pix1 = pixClone(pixs);
    }

        /* All fast scans are horizontal; rotate 90 deg cw if necessary */
    if (dir == L_SCAN_VERTICAL)
        pix2 = pixRotate90(pix1, 1);
    else  /* L_SCAN_HORIZONTAL */
        pix2 = pixClone(pix1);
    pixGetDimensions(pix2, &w, &h, NULL);

    pixadb = (debug) ? pixaCreate(0) : NULL;
    pixdb1 = NULL;
    if (pixadb) {
        lept_mkdir("lept/rect");
        pixaAddPix(pixadb, pix1, L_CLONE);
        pixdb1 = pixConvertTo32(pix2);
    }
    pixDestroy(&pix1);

        /* Scanning down, find the first scanline with a long enough run.
         * That run goes from (xfirst, yfirst) to (xlast, yfirst).  */
    found = FALSE;
    for (i = 0; i < h; i++) {
        pixFindMaxHorizontalRunOnLine(pix2, i, &xstart, &length);
        if (length >= (l_int32)(fract * w + 0.5)) {
            yfirst = i;
            xfirst = xstart;
            xlast = xfirst + length - 1;
            found = TRUE;
            break;
        }
    }
    if (!found) {
        L_WARNING("no run of sufficient size was found\n", procName);
        pixDestroy(&pix2);
        pixDestroy(&pixdb1);
        pixaDestroy(&pixadb);
        return NULL;
    }

         /* Continue down until the condition fails */
    w1 = xlast - xfirst + 1;
    h1 = h - yfirst;  /* init */
    ylast = h - 1;  /* init */
    for (i = yfirst + 1; i < h; i++) {
        pixFindMaxHorizontalRunOnLine(pix2, i, &xstart, &length);
        if (xstart > xfirst || (xstart + length - 1 < xlast) ||
            i == h - 1) {
            ylast = i - 1;
            h1 = ylast - yfirst + 1;
            break;
        }
    }
    box1 = boxCreate(xfirst, yfirst, w1, h1);

        /* Scanning up, find the first scanline with a long enough run.
         * That run goes from (xfirst, ylast) to (xlast, ylast).  */
    for (i = h - 1; i >= 0; i--) {
        pixFindMaxHorizontalRunOnLine(pix2, i, &xstart, &length);
        if (length >= (l_int32)(fract * w + 0.5)) {
            ylast = i;
            xfirst = xstart;
            xlast = xfirst + length - 1;
            break;
        }
    }

         /* Continue up until the condition fails */
    w2 = xlast - xfirst + 1;
    h2 = ylast + 1;  /* initialize */
    for (i = ylast - 1; i >= 0; i--) {
        pixFindMaxHorizontalRunOnLine(pix2, i, &xstart, &length);
        if (xstart > xfirst || (xstart + length - 1 < xlast) ||
            i == 0) {
            yfirst = i + 1;
            h2 = ylast - yfirst + 1;
            break;
        }
    }
    box2 = boxCreate(xfirst, yfirst, w2, h2);
    pixDestroy(&pix2);

    if (pixadb) {
        pixRenderBoxArb(pixdb1, box1, 2, 255, 0, 0);
        pixRenderBoxArb(pixdb1, box2, 2, 0, 255, 0);
        pixaAddPix(pixadb, pixdb1, L_INSERT);
    }

        /* Select the final result from the two boxes */
    if (select == L_GEOMETRIC_UNION)
        box3 = boxBoundingRegion(box1, box2);
    else if (select == L_GEOMETRIC_INTERSECTION)
        box3 = boxOverlapRegion(box1, box2);
    else if (select == L_LARGEST_AREA)
        box3 = (w1 * h1 >= w2 * h2) ? boxCopy(box1) : boxCopy(box2);
    else  /* select == L_SMALLEST_AREA) */
        box3 = (w1 * h1 <= w2 * h2) ? boxCopy(box1) : boxCopy(box2);
    boxDestroy(&box1);
    boxDestroy(&box2);

        /* Rotate the box 90 degrees ccw if necessary */
    box4 = NULL;
    if (box3) {
        if (dir == L_SCAN_VERTICAL)
            box4 = boxRotateOrth(box3, w, h, 3);
        else
            box4 = boxCopy(box3);
    }

        /* Transform back to global coordinates if %boxs exists */
    box5 = (box4) ? boxTransform(box4, x, y, 1.0, 1.0) : NULL;
    boxDestroy(&box3);
    boxDestroy(&box4);

        /* Debug output */
    if (pixadb) {
        pixdb1 = pixConvertTo8(pixs, 0);
        pixAddConstantGray(pixdb1, 190);
        pixdb2 = pixConvertTo32(pixdb1);
        if (box5) pixRenderBoxArb(pixdb2, box5, 4, 0, 0, 255);
        pixaAddPix(pixadb, pixdb2, L_INSERT);
        res = pixGetXRes(pixs);
        L_INFO("Writing debug files to /tmp/lept/rect/\n", procName);
        pixaConvertToPdf(pixadb, res, 1.0, L_DEFAULT_ENCODE, 75, NULL,
                        "/tmp/lept/rect/fitrect.pdf");
        pix1 = pixaDisplayTiledAndScaled(pixadb, 32, 800, 1, 0, 40, 2);
        pixWrite("/tmp/lept/rect/fitrect.png", pix1, IFF_PNG);
        pixDestroy(&pix1);
        pixDestroy(&pixdb1);
        pixaDestroy(&pixadb);
    }

    return box5;
}

/*------------------------------------------------------------------*
 *                    Automatic photoinvert for OCR                 *
 *------------------------------------------------------------------*/
/*!
 * \brief   pixAutoPhotoinvert()
 *
 * \param[in]    pixs       any depth, colormap ok
 * \param[in]    thresh     binarization threshold; use 0 for default
 * \param[out]   ppixm      [optional] image regions to be inverted
 * \param[out]   pixadb     [optional] debug; input NULL to skip
 * \return  pixd   1 bpp image to be sent to OCR, or NULL on error
 *
 * 
 * Notes:
 *      (1) A 1 bpp image is returned, where pixels in image regions are
 *          photo-inverted.
 *      (2) If there is light text with a dark background, this will
 *          identify the region and photoinvert the pixels there if
 *          there are at least 60% fg pixels in the region.
 *      (3) For debug output, input a (typically empty) %pixadb.
 * 
*/ PIX * pixAutoPhotoinvert(PIX *pixs, l_int32 thresh, PIX **ppixm, PIXA *pixadb) { l_int32 i, n, empty, x, y, w, h; l_float32 fgfract; BOX *box1; BOXA *boxa1; PIX *pix1, *pix2, *pix3, *pix4, *pix5; PROCNAME("pixAutoPhotoinvert"); if (ppixm) *ppixm = NULL; if (!pixs) return (PIX *)ERROR_PTR("pixs not defined", procName, NULL); if (thresh == 0) thresh = 128; if ((pix1 = pixConvertTo1(pixs, thresh)) == NULL) return (PIX *)ERROR_PTR("pix1 not made", procName, NULL); if (pixadb) pixaAddPix(pixadb, pix1, L_COPY); /* Identify regions for photo-inversion: * (1) Start with the halftone mask. * (2) Eliminate ordinary text and halftones in the mask. * (3) Some regions of inverted text may have been removed in * steps (1) and (2). Conditionally fill holes in the mask, * but do not fill out to the bounding rect. */ pix2 = pixGenerateHalftoneMask(pix1, NULL, NULL, pixadb); pix3 = pixMorphSequence(pix2, "o15.15 + c25.25", 0); /* remove noise */ pix4 = pixFillHolesToBoundingRect(pix3, 1, 0.5, 1.0); if (pixadb) { pixaAddPix(pixadb, pix2, L_CLONE); pixaAddPix(pixadb, pix3, L_CLONE); pixaAddPix(pixadb, pix4, L_COPY); } pixDestroy(&pix2); pixDestroy(&pix3); pixZero(pix4, &empty); if (empty) { pixDestroy(&pix4); return pix1; } /* Examine each component and validate the inversion. * Require at least 60% of pixels under each component to be FG. */ boxa1 = pixConnCompBB(pix4, 8); n = boxaGetCount(boxa1); for (i = 0; i < n; i++) { box1 = boxaGetBox(boxa1, i, L_COPY); pix5 = pixClipRectangle(pix1, box1, NULL); pixForegroundFraction(pix5, &fgfract); if (pixadb) lept_stderr("fg fraction: %5.3f\n", fgfract); boxGetGeometry(box1, &x, &y, &w, &h); if (fgfract < 0.6) /* erase from the mask */ pixRasterop(pix4, x, y, w, h, PIX_CLR, NULL, 0, 0); pixDestroy(&pix5); boxDestroy(&box1); } boxaDestroy(&boxa1); pixZero(pix4, &empty); if (empty) { pixDestroy(&pix4); return pix1; } /* Combine pixels of the photo-inverted pix with the binarized input */ pix5 = pixInvert(NULL, pix1); pixCombineMasked(pix1, pix5, pix4); if (pixadb) { pixaAddPix(pixadb, pix5, L_CLONE); pixaAddPix(pixadb, pix1, L_COPY); } pixDestroy(&pix5); if (ppixm) *ppixm = pix4; else pixDestroy(&pix4); return pix1; }