liblept/jbclass.c - platform/external/tesseract - Git at Google

 /*====================================================================*
  -  Copyright (C) 2001 Leptonica.  All rights reserved.
  -  This software is distributed in the hope that it will be
  -  useful, but with NO WARRANTY OF ANY KIND.
  -  No author or distributor accepts responsibility to anyone for the
  -  consequences of using this software, or for whether it serves any
  -  particular purpose or works at all, unless he or she says so in
  -  writing.  Everyone is granted permission to copy, modify and
  -  redistribute this source code, for commercial or non-commercial
  -  purposes, with the following restrictions: (1) the origin of this
  -  source code must not be misrepresented; (2) modified versions must
  -  be plainly marked as such; and (3) this notice may not be removed
  -  or altered from any source or modified source distribution.
  *====================================================================*/


 /*
  * jbclass.c
  *
  *     These are functions for unsupervised classification of
  *     collections of connected components -- either characters or
  *     words -- in binary images.  They can be used as image
  *     processing steps in jbig2 compression.
  *
  *     Initialization
  *
  *         JBCLASSER         *jbRankHausInit()      [rank hausdorff encoder]
  *         JBCLASSER         *jbCorrelationInit()   [correlation encoder]
  *         JBCLASSER         *jbCorrelationInitWithoutComponents()  [ditto]
  *         static JBCLASSER  *jbCorrelationInitInternal()
  *
  *     Classify the pages
  *
  *         l_int32     jbAddPages()
  *         l_int32     jbAddPage()
  *         l_int32     jbAddPageComponents()
  *
  *     Rank hausdorff classifier
  *
  *         l_int32     jbClassifyRankHaus()
  *         l_int32     pixHaustest()
  *         l_int32     pixRankHaustest()
  *
  *     Binary correlation classifier
  *
  *         l_int32     jbClassifyCorrelation()
  *
  *     Determine the image components we start with
  *
  *         l_int32     jbGetComponents()
  *         PIX        *pixWordMaskByDilation()
  *
  *     Build grayscale composites (templates)
  *
  *         PIXA       *jbAccumulateComposites
  *         PIXA       *jbTemplatesFromComposites
  *
  *     Utility functions for Classer
  *
  *         JBCLASSER  *jbClasserCreate()
  *         void        jbClasserDestroy()
  *
  *     Utility functions for Data
  *
  *         JBDATA     *jbDataSave()
  *         void        jbDataDestroy()
  *         l_int32     jbDataWrite()
  *         JBDATA     *jbDataRead()
  *         PIXA       *jbDataRender()
  *         l_int32     jbGetULCorners()
  *         l_int32     jbGetLLCorners()
  *
  *     Static helpers
  *
  *         static JBFINDCTX *findSimilarSizedTemplatesInit()
  *         static l_int32    findSimilarSizedTemplatesNext()
  *         static void       findSimilarSizedTemplatesDestroy()
  *         static l_int32    finalPositioningForAlignment()
  *
  *     Note: this is NOT an implementation of the JPEG jbig2
  *     proposed standard encoder, the specifications for which
  *     can be found at http://www.jpeg.org/jbigpt2.html.
  *     (See below for a full implementation.)
  *     It is an implementation of the lower-level part of an encoder that:
  *
  *        (1) identifies connected components that are going to be used
  *        (2) puts them in similarity classes (this is an unsupervised
  *            classifier), and
  *        (3) stores the result in a simple file format (2 files,
  *            one for templates and one for page/coordinate/template-index
  *            quartets).
  *
  *     An actual implementation of the official jbig2 encoder could
  *     start with parts (1) and (2), and would then compress the quartets
  *     according to the standards requirements (e.g., Huffman or
  *     arithmetic coding of coordinate differences and image templates).
  *
  *     The low-level part of the encoder provided here has the
  *     following useful features:
  *
  *         - It is accurate in the identification of templates
  *           and classes because it uses a windowed hausdorff
  *           distance metric.
  *         - It is accurate in the placement of the connected
  *           components, doing a two step process of first aligning
  *           the the centroids of the template with those of each instance,
  *           and then making a further correction of up to +- 1 pixel
  *           in each direction to best align the templates.
  *         - It is fast because it uses a morphologically based
  *           matching algorithm to implement the hausdorff criterion,
  *           and it selects the patterns that are possible matches
  *           based on their size.
  *
  *     We provide two different matching functions, one using Hausdorff
  *     distance and one using a simple image correlation.
  *     The Hausdorff method sometimes produces better results for the
  *     same number of classes, because it gives a relatively small
  *     effective weight to foreground pixels near the boundary,
  *     and a relatively  large weight to foreground pixels that are
  *     not near the boundary.  By effectively ignoring these boundary
  *     pixels, Hausdorff weighting corresponds better to the expected
  *     probabilities of the pixel values in a scanned image, where the
  *     variations in instances of the same printed character are much
  *     more likely to be in pixels near the boundary.  By contrast,
  *     the correlation method gives equal weight to all foreground pixels.
  *
  *     For best results, use the correlation method.  Correlation takes
  *     the number of fg pixels in the AND of instance and template,
  *     divided by the product of the number of fg pixels in instance
  *     and template.  It compares this with a threshold that, in
  *     general, depends on the fractional coverage of the template.
  *     For heavy text, the threshold is raised above that for light
  *     text,  By using both these parameters (basic threshold and
  *     adjustment factor for text weight), one has more flexibility
  *     and can arrive at the fewest substitution errors, although
  *     this comes at the price of more templates.
  *
  *     The strict Hausdorff scoring is not a rank weighting, because a
  *     single pixel beyond the given distance will cause a match
  *     failure.  A rank Hausdorff is more robust to non-boundary noise,
  *     but it is also more susceptible to confusing components that
  *     should be in different classes.  For implementing a jbig2
  *     application for visually lossless binary image compression,
  *     you have two choices:
  *
  *        (1) use a 3x3 structuring element (size = 3) and a strict
  *            Hausdorff comparison (rank = 1.0 in the rank Hausdorff
  *            function).  This will result in a minimal number of classes,
  *            but confusion of small characters, such as italic and
  *            non-italic lower-case 'o', can still occur.
  *        (2) use the correlation method with a threshold of 0.85
  *            and a weighting factor of about 0.7.  This will result in
  *            a larger number of classes, but should not be confused
  *            either by similar small characters or by extremely
  *            thick sans serif characters, such as in prog/cootoots.png.
  *
  *     As mentioned above, if visual substitution errors must be
  *     avoided, you should use the correlation method.
  *
  *     We provide executables that show how to do the encoding:
  *         prog/jbrankhaus.c
  *         prog/jbcorrelation.c
  *
  *     The basic flow for correlation classification goes as follows,
  *     where specific choices have been made for parameters (Hausdorff
  *     is the same except for initialization):
  *
  *             // Initialize and save data in the classer
  *         JBCLASSER *classer =
  *             jbCorrelationInit(JB_CONN_COMPS, 0, 0, 0.8, 0.7);
  *         SARRAY *safiles = getSortedPathnamesInDirectory(directory,
  *                                                         NULL, 0, 0);
  *         jbAddPages(classer, safiles);
  *
  *             // Save the data in a data structure for serialization,
  *             // and write it into two files.
  *         JBDATA *data = jbDataSave(classer);
  *         jbDataWrite(rootname, data);
  *
  *             // Reconstruct (render) the pages from the encoded data.
  *         PIXA *pixa = jbDataRender(data, FALSE);
  *
  *     Adam Langley has recently built a jbig2 standards-compliant
  *     encoder, the first one to appear in open source.  You can get
  *     this encoder at:
  *
  *          http://www.imperialviolet.org/jbig2.html
  *
  *     It uses arithmetic encoding throughout.  It encodes binary images
  *     losslessly with a single arithmetic coding over the full image.
  *     It also does both lossy and lossless encoding from connected
  *     components, using leptonica to generate the templates representing
  *     each cluster.
  */

 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 #include <math.h>
 #include "allheaders.h"

     /* MSVC can't handle arrays dimensioned by static const integers */
 #define  L_BUF_SIZE  512

     /* For jbClassifyRankHaus(): size of border added around
      * pix of each c.c., to allow further processing.  This
      * should be at least the sum of the MAX_DIFF_HEIGHT
      * (or MAX_DIFF_WIDTH) and one-half the size of the Sel  */
 static const l_int32  JB_ADDED_PIXELS = 6;

     /* For pixHaustest(), pixRankHaustest() and pixCorrelationScore():
      * choose these to be 2 or greater */
 static const l_int32  MAX_DIFF_WIDTH = 2;  /* use at least 2 */
 static const l_int32  MAX_DIFF_HEIGHT = 2;  /* use at least 2 */

     /* In initialization, you have the option to discard components
      * (cc, characters or words) that have either width or height larger
      * than a given size.  This is convenient for jbDataSave(), because
      * the components are placed onto a regular lattice with cell
      * dimension equal to the maximum component size.  The default
      * values are given here.  If you want to save all components,
      * use a sufficiently large set of dimensions. */
 static const l_int32  MAX_CONN_COMP_WIDTH = 350;  /* default max cc width */
 static const l_int32  MAX_CHAR_COMP_WIDTH = 350;  /* default max char width */
 static const l_int32  MAX_WORD_COMP_WIDTH = 1000;  /* default max word width */
 static const l_int32  MAX_COMP_HEIGHT = 120;  /* default max component height */

     /* Max allowed dilation to merge characters into words */
 #define  MAX_ALLOWED_DILATION  14

     /* This stores the state of a state machine which fetches
      * similar sized templates */
 struct JbFindTemplatesState
 {
     JBCLASSER       *classer;    /* classer                               */
     l_int32          w;          /* desired width                         */
     l_int32          h;          /* desired height                        */
     l_int32          i;          /* index into two_by_two step array      */
     NUMA            *numa;       /* current number array                  */
     l_int32          n;          /* current element of numa               */
 };
 typedef struct JbFindTemplatesState JBFINDCTX;


     /* Static initialization function */
 static JBCLASSER * jbCorrelationInitInternal(l_int32 components,
                        l_int32 maxwidth, l_int32 maxheight, l_float32 thresh,
                        l_float32 weightfactor, l_int32 keep_components);

     /* Static helper functions */
 static JBFINDCTX * findSimilarSizedTemplatesInit(JBCLASSER *classer, PIX *pixs);
 static l_int32 findSimilarSizedTemplatesNext(JBFINDCTX *context);
 static void findSimilarSizedTemplatesDestroy(JBFINDCTX **pcontext);
 static l_int32 finalPositioningForAlignment(PIX *pixs, l_int32 x, l_int32 y,
                              l_int32 idelx, l_int32 idely, PIX *pixt,
                              l_int32 *sumtab, l_int32 *pdx, l_int32 *pdy);

 #ifndef NO_CONSOLE_IO
 #define  DEBUG_PLOT_CC             0
 #define  DEBUG_CORRELATION_SCORE   0
 #endif  /* ~NO_CONSOLE_IO */


 /*----------------------------------------------------------------------*
  *                            Initialization                            *
  *----------------------------------------------------------------------*/
 /*!
  *  jbRankHausInit()
  *
  *      Input:  components (JB_CONN_COMPS, JB_CHARACTERS, JB_WORDS)
  *              maxwidth (of component; use 0 for default)
  *              maxheight (of component; use 0 for default)
  *              size  (of square structuring element; 2, representing
  *                     2x2 sel, is necessary for reasonable accuracy of
  *                     small components; combine this with rank ~ 0.97
  *                     to avoid undue class expansion)
  *              rank (rank val of match, each way; in [0.5 - 1.0];
  *                    when using size = 2, 0.97 is a reasonable value)
  *      Return: jbclasser if OK; NULL on error
  */
 JBCLASSER *
 jbRankHausInit(l_int32    components,
                l_int32    maxwidth,
                l_int32    maxheight,
                l_int32    size,
                l_float32  rank)
 {
 JBCLASSER  *classer;

     PROCNAME("jbRankHausInit");

     if (components != JB_CONN_COMPS && components != JB_CHARACTERS &&
         components != JB_WORDS)
         return (JBCLASSER *)ERROR_PTR("invalid components", procName, NULL);
     if (size < 1 || size > 10)
         return (JBCLASSER *)ERROR_PTR("size not reasonable", procName, NULL);
     if (rank < 0.5 || rank > 1.0)
         return (JBCLASSER *)ERROR_PTR("rank not in [0.5-1.0]", procName, NULL);
     if (maxwidth == 0) {
         if (components == JB_CONN_COMPS)
             maxwidth = MAX_CONN_COMP_WIDTH;
         else if (components == JB_CHARACTERS)
             maxwidth = MAX_CHAR_COMP_WIDTH;
         else  /* JB_WORDS */
             maxwidth = MAX_WORD_COMP_WIDTH;
     }
     if (maxheight == 0)
         maxheight = MAX_COMP_HEIGHT;

     if ((classer = jbClasserCreate(JB_RANKHAUS, components)) == NULL)
         return (JBCLASSER *)ERROR_PTR("classer not made", procName, NULL);
     classer->maxwidth = maxwidth;
     classer->maxheight = maxheight;
     classer->sizehaus = size;
     classer->rankhaus = rank;
     classer->nahash = numaHashCreate(5507, 4);  /* 5507 is prime */
     return classer;
 }


 /*!
  *  jbCorrelationInit()
  *
  *      Input:  components (JB_CONN_COMPS, JB_CHARACTERS, JB_WORDS)
  *              maxwidth (of component; use 0 for default)
  *              maxheight (of component; use 0 for default)
  *              thresh (value for correlation score: in [0.4 - 0.9])
  *              weightfactor (corrects thresh for thick characters [0.0 - 1.0])
  *      Return: jbclasser if OK; NULL on error
  */
 JBCLASSER *
 jbCorrelationInit(l_int32    components,
                   l_int32    maxwidth,
                   l_int32    maxheight,
                   l_float32  thresh,
                   l_float32  weightfactor)
 {
     return jbCorrelationInitInternal(components, maxwidth, maxheight, thresh,
                                      weightfactor, 1);
 }

 /*!
  *  jbCorrelationInitWithoutComponents()
  *
  *      Input:  same as jbCorrelationInit
  *      Output: same as jbCorrelationInit
  *
  *  Note: acts the same as jbCorrelationInit(), but the resulting
  *        object doesn't keep a list of all the components.
  */
 JBCLASSER *
 jbCorrelationInitWithoutComponents(l_int32    components,
                                    l_int32    maxwidth,
                                    l_int32    maxheight,
                                    l_float32  thresh,
                                    l_float32  weightfactor)
 {
     return jbCorrelationInitInternal(components, maxwidth, maxheight, thresh,
                                      weightfactor, 0);
 }


 static JBCLASSER *
 jbCorrelationInitInternal(l_int32    components,
                           l_int32    maxwidth,
                           l_int32    maxheight,
                           l_float32  thresh,
                           l_float32  weightfactor,
                           l_int32    keep_components)
 {
 JBCLASSER  *classer;

     PROCNAME("jbCorrelationInitInternal");

     if (components != JB_CONN_COMPS && components != JB_CHARACTERS &&
         components != JB_WORDS)
         return (JBCLASSER *)ERROR_PTR("invalid components", procName, NULL);
     if (thresh < 0.4 || thresh > 0.9)
         return (JBCLASSER *)ERROR_PTR("thresh not in range [0.4 - 0.9]",
                 procName, NULL);
     if (weightfactor < 0.0 || weightfactor > 1.0)
         return (JBCLASSER *)ERROR_PTR("weightfactor not in range [0.0 - 1.0]",
                 procName, NULL);
     if (maxwidth == 0) {
         if (components == JB_CONN_COMPS)
             maxwidth = MAX_CONN_COMP_WIDTH;
         else if (components == JB_CHARACTERS)
             maxwidth = MAX_CHAR_COMP_WIDTH;
         else  /* JB_WORDS */
             maxwidth = MAX_WORD_COMP_WIDTH;
     }
     if (maxheight == 0)
         maxheight = MAX_COMP_HEIGHT;


     if ((classer = jbClasserCreate(JB_CORRELATION, components)) == NULL)
         return (JBCLASSER *)ERROR_PTR("classer not made", procName, NULL);
     classer->maxwidth = maxwidth;
     classer->maxheight = maxheight;
     classer->thresh = thresh;
     classer->weightfactor = weightfactor;
     classer->nahash = numaHashCreate(5507, 4);  /* 5507 is prime */
     classer->keep_pixaa = keep_components;
     return classer;
 }


 /*----------------------------------------------------------------------*
  *                       Classify the pages                             *
  *----------------------------------------------------------------------*/
 /*!
  *  jbAddPages()
  *
  *      Input:  jbclasser
  *              safiles (of page image file names)
  *      Return: 0 if OK; 1 on error
  *
  *  Note:
  *      (1) jbclasser makes a copy of the array of file names.
  *      (2) The caller is still responsible for destroying the input array.
  */
 l_int32
 jbAddPages(JBCLASSER  *classer,
            SARRAY     *safiles)
 {
 l_int32  i, nfiles;
 char    *fname;
 PIX     *pix;

     PROCNAME("jbAddPages");

     if (!classer)
         return ERROR_INT("classer not defined", procName, 1);
     if (!safiles)
         return ERROR_INT("safiles not defined", procName, 1);

     classer->safiles = sarrayCopy(safiles);
     nfiles = sarrayGetCount(safiles);
     for (i = 0; i < nfiles; i++) {
         fname = sarrayGetString(safiles, i, 0);
         if ((pix = pixRead(fname)) == NULL) {
             L_WARNING_INT("image file %d not read", procName, i);
             continue;
         }
         jbAddPage(classer, pix);
         pixDestroy(&pix);
     }

     return 0;
 }


 /*!
  *  jbAddPage()
  *
  *      Input:  jbclasser
  *              pixs (of input page)
  *      Return: 0 if OK; 1 on error
  */
 l_int32
 jbAddPage(JBCLASSER  *classer,
           PIX        *pixs)
 {
 BOXA  *boxas;
 PIXA  *pixas;

     PROCNAME("jbAddPage");

     if (!classer)
         return ERROR_INT("classer not defined", procName, 1);
     if (!pixs)
         return ERROR_INT("pix not defined", procName, 1);

     classer->w = pixGetWidth(pixs);
     classer->h = pixGetHeight(pixs);

         /* Get the appropriate components and their bounding boxes */
     if (jbGetComponents(pixs, classer->components, classer->maxwidth,
                         classer->maxheight, &boxas, &pixas)) {
         return ERROR_INT("components not made", procName, 1);
     }

     jbAddPageComponents(classer, pixs, boxas, pixas);
     boxaDestroy(&boxas);
     pixaDestroy(&pixas);
     return 0;
 }


 /*!
  *  jbAddPageComponents()
  *
  *      Input:  jbclasser
  *              pixs (of input page)
  *              boxas (b.b. of components for this page)
  *              pixas (components for this page)
  *      Return: 0 if OK; 1 on error
  *
  *  Notes:
  *      (1) If there are no components on the page, we don't require input
  *          of empty boxas or pixas, although that's the typical situation.
  */
 l_int32
 jbAddPageComponents(JBCLASSER  *classer,
                     PIX        *pixs,
                     BOXA       *boxas,
                     PIXA       *pixas)
 {
 l_int32  n;

     PROCNAME("jbAddPageComponents");

     if (!classer)
         return ERROR_INT("classer not defined", procName, 1);
     if (!pixs)
         return ERROR_INT("pix not defined", procName, 1);

         /* Test for no components on the current page.  Always update the
          * number of pages processed, even if nothing is on it. */
     if (!boxas || !pixas || (boxaGetCount(boxas) == 0)) {
         classer->npages++;
         return 0;
     }

         /* Get classes.  For hausdorff, it uses a specified size of
          * structuring element and specified rank.  For correlation,
          * it uses a specified threshold. */
     if (classer->method == JB_RANKHAUS) {
         if (jbClassifyRankHaus(classer, boxas, pixas))
             return ERROR_INT("rankhaus classification failed", procName, 1);
     }
     else {  /* classer->method == JB_CORRELATION */
         if (jbClassifyCorrelation(classer, boxas, pixas))
             return ERROR_INT("correlation classification failed", procName, 1);
     }

         /* Find the global UL corners, adjusted for each instance so
          * that the class template and instance will have their
          * centroids in the same place.  Then the template can be
          * used to replace the instance. */
     if (jbGetULCorners(classer, pixs, boxas))
         return ERROR_INT("UL corners not found", procName, 1);

         /* Update total component counts and number of pages processed. */
     n = boxaGetCount(boxas);
     classer->baseindex += n;
     numaAddNumber(classer->nacomps, n);
     classer->npages++;

     return 0;
 }


 /*----------------------------------------------------------------------*
  *         Classification using windowed rank hausdorff metric          *
  *----------------------------------------------------------------------*/
 /*!
  *  jbClassifyRankHaus()
  *
  *      Input:  jbclasser
  *              boxa (of new components for classification)
  *              pixas (of new components for classification)
  *      Return: 0 if OK; 1 on error
  */
 l_int32
 jbClassifyRankHaus(JBCLASSER  *classer,
                    BOXA       *boxa,
                    PIXA       *pixas)
 {
 l_int32     n, nt, i, wt, ht, iclass, size, found, testval;
 l_int32    *sumtab;
 l_int32     npages, area1, area3;
 l_int32    *tab8;
 l_float32   rank, x1, y1, x2, y2;
 BOX        *box;
 NUMA       *naclass, *napage;
 NUMA       *nafg;   /* fg area of all instances */
 NUMA       *nafgt;  /* fg area of all templates */
 JBFINDCTX  *findcontext;
 NUMAHASH   *nahash;
 PIX        *pix, *pix1, *pix2, *pix3, *pix4;
 PIXA       *pixa, *pixa1, *pixa2, *pixat, *pixatd;
 PIXAA      *pixaa;
 PTA        *pta, *ptac, *ptact;
 SEL        *sel;

     PROCNAME("jbClassifyRankHaus");

     if (!classer)
         return ERROR_INT("classer not found", procName, 1);
     if (!boxa)
         return ERROR_INT("boxa not found", procName, 1);
     if (!pixas)
         return ERROR_INT("pixas not found", procName, 1);

     npages = classer->npages;
     size = classer->sizehaus;
     sel = selCreateBrick(size, size, size / 2, size / 2, SEL_HIT);

         /* Generate the bordered pixa, with and without dilation.
          * pixa1 and pixa2 contain all the input components. */
     n = pixaGetCount(pixas);
     pixa1 = pixaCreate(n);
     pixa2 = pixaCreate(n);
     for (i = 0; i < n; i++) {
         pix = pixaGetPix(pixas, i, L_CLONE);
         pix1 = pixAddBorderGeneral(pix, JB_ADDED_PIXELS, JB_ADDED_PIXELS,
                     JB_ADDED_PIXELS, JB_ADDED_PIXELS, 0);
         pix2 = pixDilate(NULL, pix1, sel);
         pixaAddPix(pixa1, pix1, L_INSERT);   /* un-dilated */
         pixaAddPix(pixa2, pix2, L_INSERT);   /* dilated */
         pixDestroy(&pix);
     }

         /* Get the centroids of all the bordered images.
          * These are relative to the UL corner of each (bordered) pix.  */
     pta = pixaCentroids(pixa1);  /* centroids for this page; use here */
     ptac = classer->ptac;  /* holds centroids of components up to this page */
     ptaJoin(ptac, pta, 0, 0);  /* save centroids of all components */
     ptact = classer->ptact;  /* holds centroids of templates */

         /* Use these to save the class and page of each component. */
     naclass = classer->naclass;
     napage = classer->napage;
     sumtab = makePixelSumTab8();

         /* Store the unbordered pix in a pixaa, in a hierarchical
          * set of arrays.  There is one pixa for each class,
          * and the pix in each pixa are all the instances found
          * of that class.  This is actually more than one would need
          * for a jbig2 encoder, but there are two reasons to keep
          * them around: (1) the set of instances for each class
          * can be used to make an improved binary (or, better,
          * a grayscale) template, rather than simply using the first
          * one in the set; (2) we can investigate the failures
          * of the classifier.  This pixaa grows as we process
          * successive pages. */
     pixaa = classer->pixaa;

         /* arrays to store class exemplars (templates) */
     pixat = classer->pixat;   /* un-dilated */
     pixatd = classer->pixatd;   /* dilated */

         /* Fill up the pixaa tree with the template exemplars as
          * the first pix in each pixa.  As we add each pix,
          * we also add the associated box to the pixa.
          * We also keep track of the centroid of each pix,
          * and use the difference between centroids (of the
          * pix with the exemplar we are checking it with)
          * to align the two when checking that the Hausdorff
          * distance does not exceed a threshold.
          * The threshold is set by the Sel used for dilating.
          * For example, a 3x3 brick, sel_3, corresponds to a
          * Hausdorff distance of 1.  In general, for an NxN brick,
          * with N odd, corresponds to a Hausdorff distance of (N - 1)/2.
          * It turns out that we actually need to use a sel of size 2x2
          * to avoid small bad components when there is a halftone image
          * from which components can be chosen.
          * The larger the Sel you use, the fewer the number of classes,
          * and the greater the likelihood of putting semantically
          * different objects in the same class.  For simplicity,
          * we do this separately for the case of rank == 1.0 (exact
          * match within the Hausdorff distance) and rank < 1.0.  */
     rank = classer->rankhaus;
     nahash = classer->nahash;
     if (rank == 1.0) {
         for (i = 0; i < n; i++) {
             pix1 = pixaGetPix(pixa1, i, L_CLONE);
             pix2 = pixaGetPix(pixa2, i, L_CLONE);
             ptaGetPt(pta, i, &x1, &y1);
             nt = pixaGetCount(pixat);  /* number of templates */
             found = FALSE;
             findcontext = findSimilarSizedTemplatesInit(classer, pix1);
             while ((iclass = findSimilarSizedTemplatesNext(findcontext)) > -1) {
                     /* Find score for this template */
                 pix3 = pixaGetPix(pixat, iclass, L_CLONE);
                 pix4 = pixaGetPix(pixatd, iclass, L_CLONE);
                 ptaGetPt(ptact, iclass, &x2, &y2);
                 testval = pixHaustest(pix1, pix2, pix3, pix4, x1 - x2, y1 - y2,
                                       MAX_DIFF_WIDTH, MAX_DIFF_HEIGHT);
                 pixDestroy(&pix3);
                 pixDestroy(&pix4);
                 if (testval == 1) {
                     found = TRUE;
                     numaAddNumber(naclass, iclass);
                     numaAddNumber(napage, npages);
                     if (classer->keep_pixaa) {
                         pixa = pixaaGetPixa(pixaa, iclass, L_CLONE);
                         pix = pixaGetPix(pixas, i, L_CLONE);
                         pixaAddPix(pixa, pix, L_INSERT);
                         box = boxaGetBox(boxa, i, L_CLONE);
                         pixaAddBox(pixa, box, L_INSERT);
                         pixaDestroy(&pixa);
                     }
                     break;
                 }
             }
             findSimilarSizedTemplatesDestroy(&findcontext);
             if (found == FALSE) {  /* new class */
                 numaAddNumber(naclass, nt);
                 numaAddNumber(napage, npages);
                 pixa = pixaCreate(0);
                 pix = pixaGetPix(pixas, i, L_CLONE);  /* unbordered instance */
                 pixaAddPix(pixa, pix, L_INSERT);
                 wt = pixGetWidth(pix);
                 ht = pixGetHeight(pix);
                 numaHashAdd(nahash, ht * wt, nt);
                 box = boxaGetBox(boxa, i, L_CLONE);
                 pixaAddBox(pixa, box, L_INSERT);
                 pixaaAddPixa(pixaa, pixa, L_INSERT);  /* unbordered instance */
                 ptaAddPt(ptact, x1, y1);
                 pixaAddPix(pixat, pix1, L_INSERT);  /* bordered template */
                 pixaAddPix(pixatd, pix2, L_INSERT);  /* bordered dil template */
             }
             else {   /* don't save them */
                 pixDestroy(&pix1);
                 pixDestroy(&pix2);
             }
         }
     }
     else {  /* rank < 1.0 */
         if ((nafg = pixaCountPixels(pixas)) == NULL)  /* areas for this page */
             return ERROR_INT("nafg not made", procName, 1);
         nafgt = classer->nafgt;
         tab8 = makePixelSumTab8();
         for (i = 0; i < n; i++) {   /* all instances on this page */
             pix1 = pixaGetPix(pixa1, i, L_CLONE);
             numaGetIValue(nafg, i, &area1);
             pix2 = pixaGetPix(pixa2, i, L_CLONE);
             ptaGetPt(pta, i, &x1, &y1);   /* use pta for this page */
             nt = pixaGetCount(pixat);  /* number of templates */
             found = FALSE;
             findcontext = findSimilarSizedTemplatesInit(classer, pix1);
             while ((iclass = findSimilarSizedTemplatesNext(findcontext)) > -1) {
                     /* Find score for this template */
                 pix3 = pixaGetPix(pixat, iclass, L_CLONE);
                 numaGetIValue(nafgt, iclass, &area3);
                 pix4 = pixaGetPix(pixatd, iclass, L_CLONE);
                 ptaGetPt(ptact, iclass, &x2, &y2);
                 testval = pixRankHaustest(pix1, pix2, pix3, pix4,
                                           x1 - x2, y1 - y2,
                                           MAX_DIFF_WIDTH, MAX_DIFF_HEIGHT,
                                           area1, area3, rank, tab8);
                 pixDestroy(&pix3);
                 pixDestroy(&pix4);
                 if (testval == 1) {  /* greedy match; take the first */
                     found = TRUE;
                     numaAddNumber(naclass, iclass);
                     numaAddNumber(napage, npages);
                     if (classer->keep_pixaa) {
                         pixa = pixaaGetPixa(pixaa, iclass, L_CLONE);
                         pix = pixaGetPix(pixas, i, L_CLONE);
                         pixaAddPix(pixa, pix, L_INSERT);
                         box = boxaGetBox(boxa, i, L_CLONE);
                         pixaAddBox(pixa, box, L_INSERT);
                         pixaDestroy(&pixa);
                     }
                     break;
                 }
             }
             findSimilarSizedTemplatesDestroy(&findcontext);
             if (found == FALSE) {  /* new class */
                 numaAddNumber(naclass, nt);
                 numaAddNumber(napage, npages);
                 pixa = pixaCreate(0);
                 pix = pixaGetPix(pixas, i, L_CLONE);  /* unbordered instance */
                 pixaAddPix(pixa, pix, L_INSERT);
                 wt = pixGetWidth(pix);
                 ht = pixGetHeight(pix);
                 numaHashAdd(nahash, ht * wt, nt);
                 box = boxaGetBox(boxa, i, L_CLONE);
                 pixaAddBox(pixa, box, L_INSERT);
                 pixaaAddPixa(pixaa, pixa, L_INSERT);  /* unbordered instance */
                 ptaAddPt(ptact, x1, y1);
                 pixaAddPix(pixat, pix1, L_INSERT);  /* bordered template */
                 pixaAddPix(pixatd, pix2, L_INSERT);  /* ditto */
                 numaAddNumber(nafgt, area1);
             }
             else {   /* don't save them */
                 pixDestroy(&pix1);
                 pixDestroy(&pix2);
             }
         }
         FREE(tab8);
         numaDestroy(&nafg);
     }
     classer->nclass = pixaGetCount(pixat);

     FREE(sumtab);
     ptaDestroy(&pta);
     pixaDestroy(&pixa1);
     pixaDestroy(&pixa2);
     selDestroy(&sel);
     return 0;
 }


 /*!
  *  pixHaustest()
  *
  *      Input:  pix1   (new pix, not dilated)
  *              pix2   (new pix, dilated)
  *              pix3   (exemplar pix, not dilated)
  *              pix4   (exemplar pix, dilated)
  *              delx   (x comp of centroid difference)
  *              dely   (y comp of centroid difference)
  *              maxdiffw (max width difference of pix1 and pix2)
  *              maxdiffh (max height difference of pix1 and pix2)
  *      Return: 0 (FALSE) if no match, 1 (TRUE) if the new
  *              pix is in the same class as the exemplar.
  *
  *  Note: we check first that the two pix are roughly
  *  the same size.  Only if they meet that criterion do
  *  we compare the bitmaps.  The Hausdorff is a 2-way
  *  check.  The centroid difference is used to align the two
  *  images to the nearest integer for each of the checks.
  *  These check that the dilated image of one contains
  *  ALL the pixels of the undilated image of the other.
  *  Checks are done in both direction.  A single pixel not
  *  contained in either direction results in failure of the test.
  */
 l_int32
 pixHaustest(PIX       *pix1,
             PIX       *pix2,
             PIX       *pix3,
             PIX       *pix4,
             l_float32  delx,   /* x(1) - x(3) */
             l_float32  dely,   /* y(1) - y(3) */
             l_int32    maxdiffw,
             l_int32    maxdiffh)
 {
 l_int32  wi, hi, wt, ht, delw, delh, idelx, idely, boolmatch;
 PIX     *pixt;

         /* Eliminate possible matches based on size difference */
     wi = pixGetWidth(pix1);
     hi = pixGetHeight(pix1);
     wt = pixGetWidth(pix3);
     ht = pixGetHeight(pix3);
     delw = L_ABS(wi - wt);
     if (delw > maxdiffw)
         return FALSE;
     delh = L_ABS(hi - ht);
     if (delh > maxdiffh)
         return FALSE;

         /* Round difference in centroid location to nearest integer;
          * use this as a shift when doing the matching. */
     if (delx >= 0)
         idelx = (l_int32)(delx + 0.5);
     else
         idelx = (l_int32)(delx - 0.5);
     if (dely >= 0)
         idely = (l_int32)(dely + 0.5);
     else
         idely = (l_int32)(dely - 0.5);

         /*  Do 1-direction hausdorff, checking that every pixel in pix1
          *  is within a dilation distance of some pixel in pix3.  Namely,
          *  that pix4 entirely covers pix1:
          *       pixt = pixSubtract(NULL, pix1, pix4), including shift
          *  where pixt has no ON pixels.  */
     pixt = pixCreateTemplate(pix1);
     pixRasterop(pixt, 0, 0, wi, hi, PIX_SRC, pix1, 0, 0);
     pixRasterop(pixt, idelx, idely, wi, hi, PIX_DST & PIX_NOT(PIX_SRC),
                 pix4, 0, 0);
     pixZero(pixt, &boolmatch);
     if (boolmatch == 0) {
         pixDestroy(&pixt);
         return FALSE;
     }

         /*  Do 1-direction hausdorff, checking that every pixel in pix3
          *  is within a dilation distance of some pixel in pix1.  Namely,
          *  that pix2 entirely covers pix3:
          *      pixSubtract(pixt, pix3, pix2), including shift
          *  where pixt has no ON pixels. */
     pixRasterop(pixt, idelx, idely, wt, ht, PIX_SRC, pix3, 0, 0);
     pixRasterop(pixt, 0, 0, wt, ht, PIX_DST & PIX_NOT(PIX_SRC), pix2, 0, 0);
     pixZero(pixt, &boolmatch);
     pixDestroy(&pixt);
     return boolmatch;
 }


 /*!
  *  pixRankHaustest()
  *
  *      Input:  pix1   (new pix, not dilated)
  *              pix2   (new pix, dilated)
  *              pix3   (exemplar pix, not dilated)
  *              pix4   (exemplar pix, dilated)
  *              delx   (x comp of centroid difference)
  *              dely   (y comp of centroid difference)
  *              maxdiffw (max width difference of pix1 and pix2)
  *              maxdiffh (max height difference of pix1 and pix2)
  *              area1  (fg pixels in pix1)
  *              area3  (fg pixels in pix3)
  *              rank   (rank value of test, each way)
  *              tab8   (table of pixel sums for byte)
  *      Return: 0 (FALSE) if no match, 1 (TRUE) if the new
  *                 pix is in the same class as the exemplar.
  *
  *  Note: we check first that the two pix are roughly
  *  the same size.  Only if they meet that criterion do
  *  we compare the bitmaps.  We convert the rank value to
  *  a number of pixels by multiplying the rank fraction by the number
  *  of pixels in the undilated image.  The Hausdorff is a 2-way
  *  check.  The centroid difference is used to align the two
  *  images to the nearest integer for each of the checks.
  *  The rank hausdorff checks that the dilated image of one
  *  contains the rank fraction of the pixels of the undilated
  *  image of the other.   Checks are done in both direction.
  *  Failure of the test in either direction results in failure
  *  of the test.
  */
 l_int32
 pixRankHaustest(PIX       *pix1,
                 PIX       *pix2,
                 PIX       *pix3,
                 PIX       *pix4,
                 l_float32  delx,   /* x(1) - x(3) */
                 l_float32  dely,   /* y(1) - y(3) */
                 l_int32    maxdiffw,
                 l_int32    maxdiffh,
                 l_int32    area1,
                 l_int32    area3,
                 l_float32  rank,
                 l_int32   *tab8)
 {
 l_int32  wi, hi, wt, ht, delw, delh, idelx, idely, boolmatch;
 l_int32  thresh1, thresh3;
 PIX     *pixt;

         /* Eliminate possible matches based on size difference */
     wi = pixGetWidth(pix1);
     hi = pixGetHeight(pix1);
     wt = pixGetWidth(pix3);
     ht = pixGetHeight(pix3);
     delw = L_ABS(wi - wt);
     if (delw > maxdiffw)
         return FALSE;
     delh = L_ABS(hi - ht);
     if (delh > maxdiffh)
         return FALSE;

         /* Upper bounds in remaining pixels for allowable match */
     thresh1 = (l_int32)(area1 * (1. - rank) + 0.5);
     thresh3 = (l_int32)(area3 * (1. - rank) + 0.5);

         /* Round difference in centroid location to nearest integer;
          * use this as a shift when doing the matching. */
     if (delx >= 0)
         idelx = (l_int32)(delx + 0.5);
     else
         idelx = (l_int32)(delx - 0.5);
     if (dely >= 0)
         idely = (l_int32)(dely + 0.5);
     else
         idely = (l_int32)(dely - 0.5);

         /*  Do 1-direction hausdorff, checking that every pixel in pix1
          *  is within a dilation distance of some pixel in pix3.  Namely,
          *  that pix4 entirely covers pix1:
          *       pixt = pixSubtract(NULL, pix1, pix4), including shift
          *  where pixt has no ON pixels.  */
     pixt = pixCreateTemplate(pix1);
     pixRasterop(pixt, 0, 0, wi, hi, PIX_SRC, pix1, 0, 0);
     pixRasterop(pixt, idelx, idely, wi, hi, PIX_DST & PIX_NOT(PIX_SRC),
                 pix4, 0, 0);
     pixThresholdPixels(pixt, thresh1, &boolmatch, tab8);
     if (boolmatch == 1) { /* above thresh1 */
         pixDestroy(&pixt);
         return FALSE;
     }

         /*  Do 1-direction hausdorff, checking that every pixel in pix3
          *  is within a dilation distance of some pixel in pix1.  Namely,
          *  that pix2 entirely covers pix3:
          *      pixSubtract(pixt, pix3, pix2), including shift
          *  where pixt has no ON pixels. */
     pixRasterop(pixt, idelx, idely, wt, ht, PIX_SRC, pix3, 0, 0);
     pixRasterop(pixt, 0, 0, wt, ht, PIX_DST & PIX_NOT(PIX_SRC), pix2, 0, 0);
     pixThresholdPixels(pixt, thresh3, &boolmatch, tab8);
     pixDestroy(&pixt);
     if (boolmatch == 1)  /* above thresh3 */
         return FALSE;
     else
         return TRUE;
 }


 /*----------------------------------------------------------------------*
  *            Classification using windowed correlation score           *
  *----------------------------------------------------------------------*/
 /*!
  *  jbClassifyCorrelation()
  *
  *      Input:  jbclasser
  *              boxa (of new components for classification)
  *              pixas (of new components for classification)
  *      Return: 0 if OK; 1 on error
  */
 l_int32
 jbClassifyCorrelation(JBCLASSER  *classer,
                       BOXA       *boxa,
                       PIXA       *pixas)
 {
 l_int32     n, nt, i, iclass, wt, ht, found, area, area1, area2, npages,
             overthreshold;
 l_int32    *sumtab, *centtab;
 l_uint32   *row, word;
 l_float32   x1, y1, x2, y2, xsum, ysum;
 l_float32   thresh, weight, threshold;
 BOX        *box;
 NUMA       *naclass, *napage;
 NUMA       *nafgt;   /* fg area of all templates */
 NUMA       *naarea;   /* w * h area of all templates */
 JBFINDCTX  *findcontext;
 NUMAHASH   *nahash;
 PIX        *pix, *pix1, *pix2;
 PIXA       *pixa, *pixa1, *pixat;
 PIXAA      *pixaa;
 PTA        *pta, *ptac, *ptact;
 l_int32    *pixcts;  /* pixel counts of each pixa */
 l_int32   **pixrowcts;  /* row-by-row pixel counts of each pixa */
 l_int32     x, y, rowcount, downcount, wpl;
 l_uint8     byte;

     PROCNAME("jbClassifyCorrelation");

     if (!classer)
         return ERROR_INT("classer not found", procName, 1);
     if (!boxa)
         return ERROR_INT("boxa not found", procName, 1);
     if (!pixas)
         return ERROR_INT("pixas not found", procName, 1);

     npages = classer->npages;

         /* Generate the bordered pixa, which contains all the the
          * input components.  This will not be saved.   */
     n = pixaGetCount(pixas);
     pixa1 = pixaCreate(n);
     for (i = 0; i < n; i++) {
         pix = pixaGetPix(pixas, i, L_CLONE);
         pix1 = pixAddBorderGeneral(pix, JB_ADDED_PIXELS, JB_ADDED_PIXELS,
                     JB_ADDED_PIXELS, JB_ADDED_PIXELS, 0);
         pixaAddPix(pixa1, pix1, L_INSERT);
         pixDestroy(&pix);
     }

         /* Use these to save the class and page of each component. */
     naclass = classer->naclass;
     napage = classer->napage;

         /* Get the number of fg pixels in each component.  */
     nafgt = classer->nafgt;    /* holds fg areas of the templates */
     sumtab = makePixelSumTab8();

     pixcts = (l_int32 *)CALLOC(n, sizeof(*pixcts));
     pixrowcts = (l_int32 **)CALLOC(n, sizeof(*pixrowcts));
     centtab = makePixelCentroidTab8();
     if (!pixcts || !pixrowcts || !centtab)
         return ERROR_INT("calloc fail in pix*cts or centtab", procName, 1);

         /* Count the "1" pixels in each row of the pix in pixa1; this
          * allows pixCorrelationScoreThresholded to abort early if a match
          * is impossible.  This loop merges three calculations: the total
          * number of "1" pixels, the number of "1" pixels in each row, and
          * the centroid.  The centroids are relative to the UL corner of
          * each (bordered) pix.  The pixrowcts[i][y] are the total number
          * of fg pixels in pixa[i] below row y. */
     pta = ptaCreate(n);
     for (i = 0; i < n; i++) {
         pix = pixaGetPix(pixa1, i, L_CLONE);
         pixrowcts[i] = (l_int32 *)CALLOC(pixGetHeight(pix),
                                          sizeof(**pixrowcts));
         xsum = 0;
         ysum = 0;
         wpl = pixGetWpl(pix);
         row = pixGetData(pix) + (pixGetHeight(pix) - 1) * wpl;
         downcount = 0;
         for (y = pixGetHeight(pix) - 1; y >= 0; y--, row -= wpl) {
             pixrowcts[i][y] = downcount;
             rowcount = 0;
             for (x = 0; x < wpl; x++) {
                 word = row[x];
                 byte = word & 0xff;
                 rowcount += sumtab[byte];
                 xsum += centtab[byte] + (x * 32 + 24) * sumtab[byte];
                 byte = (word >> 8) & 0xff;
                 rowcount += sumtab[byte];
                 xsum += centtab[byte] + (x * 32 + 16) * sumtab[byte];
                 byte = (word >> 16) & 0xff;
                 rowcount += sumtab[byte];
                 xsum += centtab[byte] + (x * 32 + 8) * sumtab[byte];
                 byte = (word >> 24) & 0xff;
                 rowcount += sumtab[byte];
                 xsum += centtab[byte] + x * 32 * sumtab[byte];
             }
             downcount += rowcount;
             ysum += rowcount * y;
         }
         pixcts[i] = downcount;
         ptaAddPt(pta,
                  xsum / (l_float32)downcount, ysum / (l_float32)downcount);
         pixDestroy(&pix);
     }

     ptac = classer->ptac;  /* holds centroids of components up to this page */
     ptaJoin(ptac, pta, 0, 0);  /* save centroids of all components */
     ptact = classer->ptact;  /* holds centroids of templates */

     /* Store the unbordered pix in a pixaa, in a hierarchical
      * set of arrays.  There is one pixa for each class,
      * and the pix in each pixa are all the instances found
      * of that class.  This is actually more than one would need
      * for a jbig2 encoder, but there are two reasons to keep
      * them around: (1) the set of instances for each class
      * can be used to make an improved binary (or, better,
      * a grayscale) template, rather than simply using the first
      * one in the set; (2) we can investigate the failures
      * of the classifier.  This pixaa grows as we process
      * successive pages. */
     pixaa = classer->pixaa;

         /* Array to store class exemplars */
     pixat = classer->pixat;

         /* Fill up the pixaa tree with the template exemplars as
          * the first pix in each pixa.  As we add each pix,
          * we also add the associated box to the pixa.
          * We also keep track of the centroid of each pix,
          * and use the difference between centroids (of the
          * pix with the exemplar we are checking it with)
          * to align the two when checking that the correlation
          * score exceeds a threshold.  The correlation score
          * is given by the square of the area of the AND
          * between aligned instance and template, divided by
          * the product of areas of each image.  For identical
          * template and instance, the score is 1.0.
          * If the threshold is too small, non-equivalent instances
          * will be placed in the same class; if too large, there will
          * be an unnecessary division of classes representing the
          * same character.  The weightfactor adds in some of the
          * difference (1.0 - thresh), depending on the heaviness
          * of the template (measured as the fraction of fg pixels). */
     thresh = classer->thresh;
     weight = classer->weightfactor;
     naarea = classer->naarea;
     nahash = classer->nahash;
     for (i = 0; i < n; i++) {
         pix1 = pixaGetPix(pixa1, i, L_CLONE);
         area1 = pixcts[i];
         ptaGetPt(pta, i, &x1, &y1);  /* centroid for this instance */
         nt = pixaGetCount(pixat);
         found = FALSE;
         findcontext = findSimilarSizedTemplatesInit(classer, pix1);
         while ( (iclass = findSimilarSizedTemplatesNext(findcontext)) > -1) {
                 /* Get the template */
             pix2 = pixaGetPix(pixat, iclass, L_CLONE);
             numaGetIValue(nafgt, iclass, &area2);
             ptaGetPt(ptact, iclass, &x2, &y2);  /* template centroid */

                 /* Find threshold for this template */
             if (weight > 0.0) {
                 numaGetIValue(naarea, iclass, &area);
                 threshold = thresh + (1. - thresh) * weight * area2 / area;
             }
             else
                 threshold = thresh;

                 /* Find score for this template */
             overthreshold = pixCorrelationScoreThresholded(pix1, pix2,
                                                            area1, area2,
                                                            x1 - x2, y1 - y2,
                                                            MAX_DIFF_WIDTH,
                                                            MAX_DIFF_HEIGHT,
                                                            sumtab,
                                                            pixrowcts[i],
                                                            threshold);
 #if DEBUG_CORRELATION_SCORE
             {
                 l_float32 score, testscore;
                 l_int32 count, testcount;
                 score = pixCorrelationScore(pix1, pix2, area1, area2,
                                             x1 - x2, y1 - y2,
                                             MAX_DIFF_WIDTH, MAX_DIFF_HEIGHT,
                                             sumtab);

                 testscore = pixCorrelationScoreSimple(pix1, pix2, area1, area2,
                                   x1 - x2, y1 - y2, MAX_DIFF_WIDTH,
                                   MAX_DIFF_HEIGHT, sumtab);
                 count = (l_int32)rint(sqrt(score * area1 * area2));
                 testcount = (l_int32)rint(sqrt(testscore * area1 * area2));
                 if ((score >= threshold) != (testscore >= threshold)) {
                     fprintf(stderr, "Correlation score mismatch: %d(%g,%d) vs %d(%g,%d) (%g)\n",
                             count, score, score >= threshold,
                             testcount, testscore, testscore >= threshold,
                             score - testscore);
                 }

                 if ((score >= threshold) != overthreshold) {
                     fprintf(stderr, "Mismatch between correlation/threshold comparison: %g(%g,%d) >= %g(%g) vs %s\n",
                             score, score*area1*area2, count, threshold, threshold*area1*area2, (overthreshold ? "true" : "false"));
                 }
             }
 #endif  /* DEBUG_CORRELATION_SCORE */
             pixDestroy(&pix2);

             if (overthreshold) {  /* greedy match */
                 found = TRUE;
                 numaAddNumber(naclass, iclass);
                 numaAddNumber(napage, npages);
                 if (classer->keep_pixaa) {
                         /* We are keeping a record of all components */
                     pixa = pixaaGetPixa(pixaa, iclass, L_CLONE);
                     pix = pixaGetPix(pixas, i, L_CLONE);
                     pixaAddPix(pixa, pix, L_INSERT);
                     box = boxaGetBox(boxa, i, L_CLONE);
                     pixaAddBox(pixa, box, L_INSERT);
                     pixaDestroy(&pixa);
                 }
                 break;
             }
         }
         findSimilarSizedTemplatesDestroy(&findcontext);
         if (found == FALSE) {  /* new class */
             numaAddNumber(naclass, nt);
             numaAddNumber(napage, npages);
             pixa = pixaCreate(0);
             pix = pixaGetPix(pixas, i, L_CLONE);  /* unbordered instance */
             pixaAddPix(pixa, pix, L_INSERT);
             wt = pixGetWidth(pix);
             ht = pixGetHeight(pix);
             numaHashAdd(nahash, ht * wt, nt);
             box = boxaGetBox(boxa, i, L_CLONE);
             pixaAddBox(pixa, box, L_INSERT);
             pixaaAddPixa(pixaa, pixa, L_INSERT);  /* unbordered instance */
             ptaAddPt(ptact, x1, y1);
             numaAddNumber(nafgt, area1);
             pixaAddPix(pixat, pix1, L_INSERT);   /* bordered template */
             area = (pixGetWidth(pix1) - 2 * JB_ADDED_PIXELS) *
                    (pixGetHeight(pix1) - 2 * JB_ADDED_PIXELS);
             numaAddNumber(naarea, area);
         }
         else {   /* don't save it */
             pixDestroy(&pix1);
         }
     }
     classer->nclass = pixaGetCount(pixat);

     FREE(pixcts);
     FREE(centtab);
     for (i = 0; i < n; i++) {
         FREE(pixrowcts[i]);
     }
     FREE(pixrowcts);

     FREE(sumtab);
     ptaDestroy(&pta);
     pixaDestroy(&pixa1);
     return 0;
 }


 /*----------------------------------------------------------------------*
  *             Determine the image components we start with             *
  *----------------------------------------------------------------------*/
 /*!
  *  jbGetComponents()
  *
  *      Input:  pixs (1 bpp)
  *              components (JB_CONN_COMPS, JB_CHARACTERS, JB_WORDS)
  *              maxwidth, maxheight (of saved components; larger are discarded)
  *              &pboxa (<return> b.b. of component items)
  *              &ppixa (<return> component items)
  *      Return: 0 if OK, 1 on error
  */
 l_int32
 jbGetComponents(PIX     *pixs,
                 l_int32  components,
                 l_int32  maxwidth,
                 l_int32  maxheight,
                 BOXA   **pboxad,
                 PIXA   **ppixad)
 {
 l_int32    empty, res, redfactor;
 BOXA      *boxa;
 PIX       *pixt1, *pixt2, *pixt3;
 PIXA      *pixa, *pixat;

     PROCNAME("jbGetComponents");

     if (!pboxad)
         return ERROR_INT("&boxad not defined", procName, 1);
     *pboxad = NULL;
     if (!ppixad)
         return ERROR_INT("&pixad not defined", procName, 1);
     *ppixad = NULL;
     if (!pixs)
         return ERROR_INT("pixs not defined", procName, 1);
     if (components != JB_CONN_COMPS && components != JB_CHARACTERS &&
         components != JB_WORDS)
         return ERROR_INT("invalid components", procName, 1);

     pixZero(pixs, &empty);
     if (empty) {
         *pboxad = boxaCreate(0);
         *ppixad = pixaCreate(0);
         return 0;
     }

         /* If required, preprocess input pixs.  The method for both
          * characters and words is to generate a connected component
          * mask over the units that we want to aggregrate, which are,
          * in general, sets of related connected components in pixs.
          * For characters, we want to include the dots with
          * 'i', 'j' and '!', so we do a small vertical closing to
          * generate the mask.  For words, we make a mask over all
          * characters in each word.  This is a bit more tricky, because
          * the spacing between words is difficult to predict a priori,
          * and words can be typeset with variable spacing that can
          * in some cases be barely larger than the space between
          * characters.  The first step is to generate the mask and
          * identify each of its connected components.  */
     if (components == JB_CONN_COMPS) {  /* no preprocessing */
         boxa = pixConnComp(pixs, &pixa, 8);
     }
     else if (components == JB_CHARACTERS) {
         pixt1 = pixMorphSequence(pixs, "c1.6", 0);
         boxa = pixConnComp(pixt1, &pixat, 8);
         pixa = pixaClipToPix(pixat, pixs);
         pixDestroy(&pixt1);
         pixaDestroy(&pixat);
     }
     else {  /* components == JB_WORDS */

             /* Do the operations at about 150 ppi resolution.
              * It is much faster at 75 ppi, but the results are
              * more accurate at 150 ppi.  This will segment the
              * words in body text.  It can be expected that relatively
              * infrequent words in a larger font will be split. */
         res = pixGetXRes(pixs);
         if (res <= 200) {
             redfactor = 1;
             pixt1 = pixClone(pixs);
         }
         else if (res <= 400) {
             redfactor = 2;
             pixt1 = pixReduceRankBinaryCascade(pixs, 1, 0, 0, 0);
         }
         else {
             redfactor = 4;
             pixt1 = pixReduceRankBinaryCascade(pixs, 1, 1, 0, 0);
         }

         pixt2 = pixWordMaskByDilation(pixt1, 0, NULL);

             /* Expand the optimally dilated word mask to full res. */
         pixt3 = pixExpandReplicate(pixt2, redfactor);

             /* Pull out the pixels in pixs corresponding to the mask
              * components in pixt3.  Note that above we used threshold
              * levels in the reduction of 1 to insure that the resulting
              * mask fully covers the input pixs.  The downside of using
              * a threshold of 1 is that very close characters from adjacent
              * lines can be joined.  But with a level of 2 or greater,
              * it is necessary to use a seedfill, followed by a pixOr():
              *       pixt4 = pixSeedfillBinary(NULL, pixt3, pixs, 8);
              *       pixOr(pixt3, pixt3, pixt4);
              * to insure that the mask coverage is complete over pixs.  */
         boxa = pixConnComp(pixt3, &pixat, 4);
         pixa = pixaClipToPix(pixat, pixs);
         pixaDestroy(&pixat);
         pixDestroy(&pixt1);
         pixDestroy(&pixt2);
         pixDestroy(&pixt3);
     }

         /* Remove large components, and save the results.  */
     *ppixad = pixaSelectBySize(pixa, maxwidth, maxheight, L_SELECT_IF_BOTH,
                                L_SELECT_IF_LTE, NULL);
     *pboxad = boxaSelectBySize(boxa, maxwidth, maxheight, L_SELECT_IF_BOTH,
                                L_SELECT_IF_LTE, NULL);
     pixaDestroy(&pixa);
     boxaDestroy(&boxa);

     return 0;
 }


 /*!
  *  pixWordMaskByDilation()
  *
  *      Input:  pixs (1 bpp; typ. at 75 to 150 ppi)
  *              maxsize (use 0 for default; not to exceed 14)
  *              &size (<optional return> size of optimal horiz Sel)
  *      Return: pixd (dilated word mask), or null on error
  *
  *  Notes:
  *      (1) For 75 to 150 ppi, the optimal dilation should not exceed 7.
  *          This is the default size chosen if maxsize <= 0.
  *      (2) To run this on images at resolution between 200 and 300, it
  *          is advisable to use a larger maxsize, say between 10 and 14.
  *      (3) The best size for dilating to get word masks is optionally returned.
  */
 PIX *
 pixWordMaskByDilation(PIX      *pixs,
                       l_int32   maxsize,
                       l_int32  *psize)
 {
 l_int32  i, diffmin, ndiff, imin;
 l_int32  ncc[MAX_ALLOWED_DILATION + 1];
 BOXA    *boxa;
 NUMA    *nacc;
 PIX     *pixt1, *pixt2, *pixt3;
 PIXA    *pixa;
 SEL     *sel;

     PROCNAME("pixWordMaskbyDilation");

     if (!pixs)
         return (PIX *)ERROR_PTR("pixs not defined", procName, NULL);

         /* Find the optimal dilation to create the word mask.
          * Look for successively increasing dilations where the
          * number of connected components doesn't decrease.
          * This is the situation where the components in the
          * word mask should properly cover each word.  Use of
          * 4 cc slightly reduces the likelihood that words from
          * different lines are joined.  */
     diffmin = 1000000;
     pixa = pixaCreate(8);
     pixt1 = pixCopy(NULL, pixs);
     pixaAddPix(pixa, pixt1, L_COPY);

     if (maxsize <= 0)
         maxsize = 7;  /* default */
     if (maxsize > MAX_ALLOWED_DILATION)
         maxsize = MAX_ALLOWED_DILATION;
     nacc = numaCreate(maxsize);
     for (i = 0; i <= maxsize; i++) {
         if (i == 0)     /* first one not dilated */
             pixt2 = pixCopy(NULL, pixt1);
         else    /* successive dilation by sel_2h */
             pixt2 = pixMorphSequence(pixt1, "d2.1", 0);
         boxa = pixConnCompBB(pixt2, 4);
         ncc[i] = boxaGetCount(boxa);
         numaAddNumber(nacc, ncc[i]);
         if (i > 0) {
             ndiff = ncc[i - 1] - ncc[i];
 #if  DEBUG_PLOT_CC
             fprintf(stderr, "ndiff[%d] = %d\n", i - 1, ndiff);
 #endif  /* DEBUG_PLOT_CC */
             if (ndiff < diffmin) {
                 imin = i;
                 diffmin = ndiff;
             }
         }
         pixaAddPix(pixa, pixt2, L_COPY);
         pixDestroy(&pixt1);
         pixt1 = pixt2;
         boxaDestroy(&boxa);
     }
     pixDestroy(&pixt1);

 #if  DEBUG_PLOT_CC
     {GPLOT *gplot;
      NUMA  *naseq;
         naseq = numaMakeSequence(1, 1, numaGetCount(nacc));
         gplot = gplotCreate("junk_cc_output", GPLOT_PNG,
                             "Number of cc vs. horizontal dilation",
                             "Sel horiz", "Number of cc");
         gplotAddPlot(gplot, naseq, nacc, GPLOT_LINES, "");
         gplotMakeOutput(gplot);
         gplotDestroy(&gplot);
         numaDestroy(&naseq);
     }
 #endif  /* DEBUG_PLOT_CC */

         /* Save the result of the optimal dilation */
     pixt2 = pixaGetPix(pixa, imin, L_CLONE);
     sel = selCreateBrick(1, imin, 0, imin - 1, SEL_HIT);
     pixt3 = pixErode(NULL, pixt2, sel);  /* remove effect of dilation */
     selDestroy(&sel);
     pixDestroy(&pixt2);
     pixaDestroy(&pixa);
     if (psize)
         *psize = imin + 1;

     numaDestroy(&nacc);
     return pixt3;
 }


 /*----------------------------------------------------------------------*
  *                 Build grayscale composites (templates)               *
  *----------------------------------------------------------------------*/
 /*!
  *  jbAccumulateComposites()
  *
  *      Input:  pixaa (one pixa for each class)
  *              &pna (<return> number of samples used to build each composite)
  *              &ptat (<return> centroids of bordered composites)
  *      Return: pixad (accumulated sum of samples in each class),
  *                     or null on error
  *
  */
 PIXA *
 jbAccumulateComposites(PIXAA  *pixaa,
                        NUMA  **pna,
                        PTA   **pptat)
 {
 l_int32    n, nt, i, j, d, minw, maxw, minh, maxh, xdiff, ydiff;
 l_float32  x, y, xave, yave;
 NUMA      *na;
 PIX       *pix, *pixt1, *pixt2, *pixsum;
 PIXA      *pixa, *pixad;
 PTA       *ptat, *pta;

     PROCNAME("jbAccumulateComposites");

     if (!pptat)
         return (PIXA *)ERROR_PTR("&ptat not defined", procName, NULL);
     *pptat = NULL;
     if (!pna)
         return (PIXA *)ERROR_PTR("&na not defined", procName, NULL);
     *pna = NULL;
     if (!pixaa)
         return (PIXA *)ERROR_PTR("pixaa not defined", procName, NULL);

     n = pixaaGetCount(pixaa);
     if ((ptat = ptaCreate(n)) == NULL)
         return (PIXA *)ERROR_PTR("ptat not made", procName, NULL);
     *pptat = ptat;
     pixad = pixaCreate(n);
     na = numaCreate(n);
     *pna = na;

     for (i = 0; i < n; i++) {
         pixa = pixaaGetPixa(pixaa, i, L_CLONE);
         nt = pixaGetCount(pixa);
         numaAddNumber(na, nt);
         if (nt == 0) {
             L_WARNING("empty pixa found!", procName);
             pixaDestroy(&pixa);
             continue;
         }
         pixaSizeRange(pixa, &minw, &minh, &maxw, &maxh);
         pix = pixaGetPix(pixa, 0, L_CLONE);
         d = pixGetDepth(pix);
         pixDestroy(&pix);
         pixt1 = pixCreate(maxw, maxh, d);
         pixsum = pixInitAccumulate(maxw, maxh, 0);
         pta = pixaCentroids(pixa);

             /* Find the average value of the centroids ... */
         xave = yave = 0;
         for (j = 0; j < nt; j++) {
             ptaGetPt(pta, j, &x, &y);
             xave += x;
             yave += y;
         }
         xave = xave / (l_float32)nt;
         yave = yave / (l_float32)nt;

             /* and place all centroids at their average value */
         for (j = 0; j < nt; j++) {
             pixt2 = pixaGetPix(pixa, j, L_CLONE);
             ptaGetPt(pta, j, &x, &y);
             xdiff = (l_int32)(x - xave);
             ydiff = (l_int32)(y - yave);
             pixClearAll(pixt1);
             pixRasterop(pixt1, xdiff, ydiff, maxw, maxh, PIX_SRC,
                         pixt2, 0, 0);
             pixAccumulate(pixsum, pixt1, L_ARITH_ADD);
             pixDestroy(&pixt2);
         }
         pixaAddPix(pixad, pixsum, L_INSERT);
         ptaAddPt(ptat, xave, yave);

         pixaDestroy(&pixa);
         pixDestroy(&pixt1);
         ptaDestroy(&pta);
     }

     return pixad;
 }


 /*!
  *  jbTemplatesFromComposites()
  *
  *      Input:  pixac (one pix of composites for each class)
  *              na (number of samples used for each class composite)
  *      Return: pixad (8 bpp templates for each class), or null on error
  *
  */
 PIXA *
 jbTemplatesFromComposites(PIXA  *pixac,
                           NUMA  *na)
 {
 l_int32    n, i;
 l_float32  nt;  /* number of samples in the composite; always an integer */
 l_float32  factor;
 PIX       *pixsum;   /* accumulated composite */
 PIX       *pixd;
 PIXA      *pixad;

     PROCNAME("jbTemplatesFromComposites");

     if (!pixac)
         return (PIXA *)ERROR_PTR("pixac not defined", procName, NULL);
     if (!na)
         return (PIXA *)ERROR_PTR("na not defined", procName, NULL);

     n = pixaGetCount(pixac);
     pixad = pixaCreate(n);
     for (i = 0; i < n; i++) {
         pixsum = pixaGetPix(pixac, i, L_COPY);  /* changed internally */
         numaGetFValue(na, i, &nt);
         factor = 255. / nt;
         pixMultConstAccumulate(pixsum, factor, 0);  /* changes pixsum */
         pixd = pixFinalAccumulate(pixsum, 0, 8);
         pixaAddPix(pixad, pixd, L_INSERT);
         pixDestroy(&pixsum);
     }

     return pixad;
 }


 /*----------------------------------------------------------------------*
  *                       jbig2 utility routines                         *
  *----------------------------------------------------------------------*/
 /*!
  *  jbClasserCreate()
  *
  *      Input:  method (JB_RANKHAUS, JB_CORRELATION)
  *              components (JB_CONN_COMPS, JB_CHARACTERS, JB_WORDS)
  *      Return: jbclasser, or null on error
  */
 JBCLASSER *
 jbClasserCreate(l_int32  method,
                 l_int32  components)
 {
 JBCLASSER  *classer;

     PROCNAME("jbClasserCreate");

     if ((classer = (JBCLASSER *)CALLOC(1, sizeof(JBCLASSER))) == NULL)
         return (JBCLASSER *)ERROR_PTR("classer not made", procName, NULL);
     if (method != JB_RANKHAUS && method != JB_CORRELATION)
         return (JBCLASSER *)ERROR_PTR("invalid type", procName, NULL);
     if (components != JB_CONN_COMPS && components != JB_CHARACTERS &&
         components != JB_WORDS)
         return (JBCLASSER *)ERROR_PTR("invalid type", procName, NULL);

     classer->method = method;
     classer->components = components;
     classer->nacomps = numaCreate(0);
     classer->pixaa = pixaaCreate(0);
     classer->pixat = pixaCreate(0);
     classer->pixatd = pixaCreate(0);
     classer->nafgt = numaCreate(0);
     classer->naarea = numaCreate(0);
     classer->ptac = ptaCreate(0);
     classer->ptact = ptaCreate(0);
     classer->naclass = numaCreate(0);
     classer->napage = numaCreate(0);
     classer->ptaul = ptaCreate(0);

     return classer;
 }


 /*
  *  jbClasserDestroy()
  *
  *      Input: &classer (<to be nulled>)
  *      Return: void
  */
 void
 jbClasserDestroy(JBCLASSER  **pclasser)
 {
 JBCLASSER  *classer;

     if (!pclasser)
         return;
     if ((classer = *pclasser) == NULL)
         return;

     sarrayDestroy(&classer->safiles);
     numaDestroy(&classer->nacomps);
     pixaaDestroy(&classer->pixaa);
     pixaDestroy(&classer->pixat);
     pixaDestroy(&classer->pixatd);
     numaHashDestroy(&classer->nahash);
     numaDestroy(&classer->nafgt);
     numaDestroy(&classer->naarea);
     ptaDestroy(&classer->ptac);
     ptaDestroy(&classer->ptact);
     numaDestroy(&classer->naclass);
     numaDestroy(&classer->napage);
     ptaDestroy(&classer->ptaul);
     ptaDestroy(&classer->ptall);
     FREE(classer);
     *pclasser = NULL;
     return;
 }


 /*!
  *  jbDataSave()
  *
  *      Input:  jbclasser
  *              latticew, latticeh (cell size used to store each
  *                  connected component in the composite)
  *      Return: jbdata, or null on error
  *
  *  Notes:
  *      (1) This routine stores the jbig2-type data required for
  *          generating a lossy jbig2 version of the image.
  *          It can be losslessly written to (and read from) two files.
  *      (2) It generates and stores the mosaic of templates.
  *      (3) It clones the Numa and Pta arrays, so these must all
  *          be destroyed by the caller.
  *      (4) Input 0 to use the default values for latticew and/or latticeh,
  */
 JBDATA *
 jbDataSave(JBCLASSER  *classer)
 {
 l_int32  maxw, maxh;
 JBDATA  *data;
 PIX     *pix;

     PROCNAME("jbDataSave");

     if (!classer)
         return (JBDATA *)ERROR_PTR("classer not defined", procName, NULL);

         /* Write the templates into an array. */
     pixaSizeRange(classer->pixat, NULL, NULL, &maxw, &maxh);
     if ((pix = pixaDisplayOnLattice(classer->pixat, maxw + 1, maxh + 1))
             == NULL)
         return (JBDATA *)ERROR_PTR("data not made", procName, NULL);

     if ((data = (JBDATA *)CALLOC(1, sizeof(JBDATA))) == NULL)
         return (JBDATA *)ERROR_PTR("data not made", procName, NULL);
     data->pix = pix;
     data->npages = classer->npages;
     data->w = classer->w;
     data->h = classer->h;
     data->nclass = classer->nclass;
     data->latticew = maxw + 1;
     data->latticeh = maxh + 1;
     data->naclass = numaClone(classer->naclass);
     data->napage = numaClone(classer->napage);
     data->ptaul = ptaClone(classer->ptaul);

     return data;
 }


 /*
  *  jbDataDestroy()
  *
  *      Input: &data (<to be nulled>)
  *      Return: void
  */
 void
 jbDataDestroy(JBDATA  **pdata)
 {
 JBDATA  *data;

     if (!pdata)
         return;
     if ((data = *pdata) == NULL)
         return;

     pixDestroy(&data->pix);
     numaDestroy(&data->naclass);
     numaDestroy(&data->napage);
     ptaDestroy(&data->ptaul);
     FREE(data);
     *pdata = NULL;
     return;
 }


 /*!
  *  jbDataWrite()
  *
  *      Input:  rootname (for output files; everything but the extension)
  *              jbdata
  *      Return: 0 if OK, 1 on error
  *
  *  Notes:
  *      (1) Serialization function that writes data in jbdata to file.
  */
 l_int32
 jbDataWrite(const char  *rootout,
             JBDATA      *jbdata)
 {
 char     buf[L_BUF_SIZE];
 l_int32  w, h, nclass, npages, cellw, cellh, ncomp, i, x, y, iclass, ipage;
 NUMA    *naclass, *napage;
 PTA     *ptaul;
 PIX     *pixt;
 FILE    *fp;

     PROCNAME("jbDataWrite");

     if (!rootout)
         return ERROR_INT("no rootout", procName, 1);
     if (!jbdata)
         return ERROR_INT("no jbdata", procName, 1);

     npages = jbdata->npages;
     w = jbdata->w;
     h = jbdata->h;
     pixt = jbdata->pix;
     nclass = jbdata->nclass;
     cellw = jbdata->latticew;
     cellh = jbdata->latticeh;
     naclass = jbdata->naclass;
     napage = jbdata->napage;
     ptaul = jbdata->ptaul;

     snprintf(buf, L_BUF_SIZE, "%s%s", rootout, JB_TEMPLATE_EXT);
     pixWrite(buf, pixt, IFF_PNG);

     snprintf(buf, L_BUF_SIZE, "%s%s", rootout, JB_DATA_EXT);
     if ((fp = fopen(buf, "w")) == NULL)
         return ERROR_INT("stream not opened", procName, 1);
     ncomp = ptaGetCount(ptaul);
     fprintf(fp, "jb data file\n");
     fprintf(fp, "num pages = %d\n", npages);
     fprintf(fp, "page size: w = %d, h = %d\n", w, h);
     fprintf(fp, "num components = %d\n", ncomp);
     fprintf(fp, "num classes = %d\n", nclass);
     fprintf(fp, "template lattice size: w = %d, h = %d\n", cellw, cellh);
     for (i = 0; i < ncomp; i++) {
         numaGetIValue(napage, i, &ipage);
         numaGetIValue(naclass, i, &iclass);
         ptaGetIPt(ptaul, i, &x, &y);
         fprintf(fp, "%d %d %d %d\n", ipage, iclass, x, y);
     }
     fclose(fp);

     return 0;
 }


 /*!
  *  jbDataRead()
  *
  *      Input:  rootname (for template and data files)
  *      Return: jbdata, or NULL on error
  */
 JBDATA *
 jbDataRead(const char  *rootname)
 {
 char      fname[L_BUF_SIZE];
 char     *linestr;
 l_uint8  *data;
 l_int32   nsa, i, w, h, cellw, cellh, x, y, iclass, ipage;
 l_int32   nbytes, npages, nclass, ncomp;
 JBDATA   *jbdata;
 NUMA     *naclass, *napage;
 PIX      *pixs;
 PTA      *ptaul;
 SARRAY   *sa;

     PROCNAME("jbDataRead");

     if (!rootname)
         return (JBDATA *)ERROR_PTR("rootname not defined", procName, NULL);

     snprintf(fname, L_BUF_SIZE, "%s%s", rootname, JB_TEMPLATE_EXT);
     if ((pixs = pixRead(fname)) == NULL)
         return (JBDATA *)ERROR_PTR("pix not read", procName, NULL);

     snprintf(fname, L_BUF_SIZE, "%s%s", rootname, JB_DATA_EXT);
     if ((data = arrayRead(fname, &nbytes)) == NULL)
         return (JBDATA *)ERROR_PTR("data not read", procName, NULL);

     if ((sa = sarrayCreateLinesFromString((char *)data, 0)) == NULL)
         return (JBDATA *)ERROR_PTR("sa not made", procName, NULL);
     nsa = sarrayGetCount(sa);   /* number of cc + 6 */
     linestr = sarrayGetString(sa, 0, 0);
     if (strcmp(linestr, "jb data file"))
         return (JBDATA *)ERROR_PTR("invalid jb data file", procName, NULL);
     linestr = sarrayGetString(sa, 1, 0);
     sscanf(linestr, "num pages = %d", &npages);
     linestr = sarrayGetString(sa, 2, 0);
     sscanf(linestr, "page size: w = %d, h = %d", &w, &h);
     linestr = sarrayGetString(sa, 3, 0);
     sscanf(linestr, "num components = %d", &ncomp);
     linestr = sarrayGetString(sa, 4, 0);
     sscanf(linestr, "num classes = %d\n", &nclass);
     linestr = sarrayGetString(sa, 5, 0);
     sscanf(linestr, "template lattice size: w = %d, h = %d\n", &cellw, &cellh);

 #if 1
     fprintf(stderr, "num pages = %d\n", npages);
     fprintf(stderr, "page size: w = %d, h = %d\n", w, h);
     fprintf(stderr, "num components = %d\n", ncomp);
     fprintf(stderr, "num classes = %d\n", nclass);
     fprintf(stderr, "template lattice size: w = %d, h = %d\n", cellw, cellh);
 #endif

     if ((naclass = numaCreate(ncomp)) == NULL)
         return (JBDATA *)ERROR_PTR("naclass not made", procName, NULL);
     if ((napage = numaCreate(ncomp)) == NULL)
         return (JBDATA *)ERROR_PTR("napage not made", procName, NULL);
     if ((ptaul = ptaCreate(ncomp)) == NULL)
         return (JBDATA *)ERROR_PTR("pta not made", procName, NULL);
     for (i = 6; i < nsa; i++) {
         linestr = sarrayGetString(sa, i, 0);
         sscanf(linestr, "%d %d %d %d\n", &ipage, &iclass, &x, &y);
         numaAddNumber(napage, ipage);
         numaAddNumber(naclass, iclass);
         ptaAddPt(ptaul, x, y);
     }

     if ((jbdata = (JBDATA *)CALLOC(1, sizeof(JBDATA))) == NULL)
         return (JBDATA *)ERROR_PTR("data not made", procName, NULL);
     jbdata->pix = pixs;
     jbdata->npages = npages;
     jbdata->w = w;
     jbdata->h = h;
     jbdata->nclass = nclass;
     jbdata->latticew = cellw;
     jbdata->latticeh = cellh;
     jbdata->naclass = naclass;
     jbdata->napage = napage;
     jbdata->ptaul = ptaul;

     FREE(data);
     sarrayDestroy(&sa);
     return jbdata;
 }


 /*!
  *  jbDataRender()
  *
  *      Input:  jbdata
  *              debugflag (if TRUE, writes into 2 bpp pix and adds
  *                         component outlines in color)
  *      Return: pixa (reconstruction of original images, using templates) or
  *              null on error
  */
 PIXA *
 jbDataRender(JBDATA  *data,
              l_int32  debugflag)
 {
 l_int32   i, w, h, cellw, cellh, x, y, iclass, ipage;
 l_int32   npages, nclass, ncomp, wp, hp;
 BOX      *box;
 NUMA     *naclass, *napage;
 PIX      *pixt, *pixt2, *pix, *pixd;
 PIXA     *pixat;   /* pixa of templates */
 PIXA     *pixad;   /* pixa of output images */
 PIXCMAP  *cmap;
 PTA      *ptaul;

     PROCNAME("jbDataRender");

     if (!data)
         return (PIXA *)ERROR_PTR("data not defined", procName, NULL);

     npages = data->npages;
     w = data->w;
     h = data->h;
     pixt = data->pix;
     nclass = data->nclass;
     cellw = data->latticew;
     cellh = data->latticeh;
     naclass = data->naclass;
     napage = data->napage;
     ptaul = data->ptaul;
     ncomp = numaGetCount(naclass);

         /* Reconstruct the original set of images from the templates
          * and the data associated with each component.  First,
          * generate the output pixa as a set of empty pix. */
     if ((pixad = pixaCreate(npages)) == NULL)
         return (PIXA *)ERROR_PTR("pixad not made", procName, NULL);
     for (i = 0; i < npages; i++) {
         if (debugflag == FALSE)
             pix = pixCreate(w, h, 1);
         else {
             pix = pixCreate(w, h, 2);
             cmap = pixcmapCreate(2);
             pixcmapAddColor(cmap, 255, 255, 255);
             pixcmapAddColor(cmap, 0, 0, 0);
             pixcmapAddColor(cmap, 255, 0, 0);  /* for box outlines */
             pixSetColormap(pix, cmap);
         }
         pixaAddPix(pixad, pix, L_INSERT);
     }

         /* Put the class templates into a pixa. */
     if ((pixat = pixaCreateFromPix(pixt, nclass, cellw, cellh)) == NULL)
         return (PIXA *)ERROR_PTR("pixat not made", procName, NULL);

         /* Place each component in the right location on its page. */
     for (i = 0; i < ncomp; i++) {
         numaGetIValue(napage, i, &ipage);
         numaGetIValue(naclass, i, &iclass);
         pix = pixaGetPix(pixat, iclass, L_CLONE);  /* the template */
         wp = pixGetWidth(pix);
         hp = pixGetHeight(pix);
         ptaGetIPt(ptaul, i, &x, &y);
         pixd = pixaGetPix(pixad, ipage, L_CLONE);   /* the output page */
         if (debugflag == FALSE)
             pixRasterop(pixd, x, y, wp, hp, PIX_SRC | PIX_DST, pix, 0, 0);
         else {
             pixt2 = pixConvert1To2Cmap(pix);
             pixRasterop(pixd, x, y, wp, hp, PIX_SRC | PIX_DST, pixt2, 0, 0);
             box = boxCreate(x, y, wp, hp);
             pixRenderBoxArb(pixd, box, 1, 255, 0, 0);
             pixDestroy(&pixt2);
             boxDestroy(&box);
         }
         pixDestroy(&pix);   /* the clone only */
         pixDestroy(&pixd);  /* the clone only */
     }

     pixaDestroy(&pixat);
     return pixad;
 }


 /*!
  *  jbGetULCorners()
  *
  *      Input:  jbclasser
  *              pixs (full res image)
  *              boxa (of c.c. bounding rectangles for this page)
  *      Return: 0 if OK, 1 on error
  *
  *  Notes:
  *      (1) This computes the ptaul field, which has the global UL corners,
  *          adjusted for each specific component, so that each component
  *          can be replaced by the template for its class and have the
  *          centroid in the template in the same position as the
  *          centroid of the original connected component.  It is important
  *          that this be done properly to avoid a wavy baseline in the
  *          result.
  *      (2) The array fields ptac and ptact give the centroids of
  *          those components relative to the UL corner of each component.
  *          Here, we compute the difference in each component, round to
  *          nearest integer, and correct the box->x and box->y by
  *          the appropriate integral difference.
  *      (3) The templates and stored instances are all bordered.
  */
 l_int32
 jbGetULCorners(JBCLASSER  *classer,
                PIX        *pixs,
                BOXA       *boxa)
 {
 l_int32    i, baseindex, index, n, iclass, idelx, idely, x, y, dx, dy;
 l_int32   *sumtab;
 l_float32  x1, x2, y1, y2, delx, dely;
 BOX       *box;
 NUMA      *naclass;
 PIX       *pixt;
 PTA       *ptac, *ptact, *ptaul;

     PROCNAME("jbGetULCorners");

     if (!classer)
         return ERROR_INT("classer not defined", procName, 1);
     if (!pixs)
         return ERROR_INT("pixs not defined", procName, 1);
     if (!boxa)
         return ERROR_INT("boxa not defined", procName, 1);

     n = boxaGetCount(boxa);
     ptaul = classer->ptaul;
     naclass = classer->naclass;
     ptac = classer->ptac;
     ptact = classer->ptact;
     baseindex = classer->baseindex;  /* num components before this page */
     sumtab = makePixelSumTab8();
     for (i = 0; i < n; i++) {
         index = baseindex + i;
         ptaGetPt(ptac, index, &x1, &y1);
         numaGetIValue(naclass, index, &iclass);
         ptaGetPt(ptact, iclass, &x2, &y2);
         delx = x2 - x1;
         dely = y2 - y1;
         if (delx >= 0)
             idelx = (l_int32)(delx + 0.5);
         else
             idelx = (l_int32)(delx - 0.5);
         if (dely >= 0)
             idely = (l_int32)(dely + 0.5);
         else
             idely = (l_int32)(dely - 0.5);
         if ((box = boxaGetBox(boxa, i, L_CLONE)) == NULL)
             return ERROR_INT("box not found", procName, 1);
         boxGetGeometry(box, &x, &y, NULL, NULL);

             /* Get final increments dx and dy for best alignment */
         pixt = pixaGetPix(classer->pixat, iclass, L_CLONE);
         finalPositioningForAlignment(pixs, x, y, idelx, idely,
                                      pixt, sumtab, &dx, &dy);
 /*        if (i % 20 == 0)
             fprintf(stderr, "dx = %d, dy = %d\n", dx, dy); */
         ptaAddPt(ptaul, x - idelx + dx, y - idely + dy);
         boxDestroy(&box);
         pixDestroy(&pixt);
     }

     FREE(sumtab);
     return 0;
 }


 /*!
  *  jbGetLLCorners()
  *
  *      Input:  jbclasser
  *      Return: 0 if OK, 1 on error
  *
  *  Notes:
  *      (1) This computes the ptall field, which has the global LL corners,
  *          adjusted for each specific component, so that each component
  *          can be replaced by the template for its class and have the
  *          centroid in the template in the same position as the
  *          centroid of the original connected component. It is important
  *          that this be done properly to avoid a wavy baseline in the result.
  *      (2) It is computed here from the corresponding UL corners, where
  *          the input templates and stored instances are all bordered.
  *          This should be done after all pages have been processed.
  *      (3) For proper substitution, the templates whose LL corners are
  *          placed in these locations must be UN-bordered.
  *          This is available for a realistic jbig2 encoder, which would
  *          (1) encode each template without a border, and (2) encode
  *          the position using the LL corner (rather than the UL
  *          corner) because the difference between y-values
  *          of successive instances is typically close to zero.
  */
 l_int32
 jbGetLLCorners(JBCLASSER  *classer)
 {
 l_int32    i, iclass, n, x1, y1, h;
 NUMA      *naclass;
 PIX       *pix;
 PIXA      *pixat;
 PTA       *ptaul, *ptall;

     PROCNAME("jbGetLLCorners");

     if (!classer)
         return ERROR_INT("classer not defined", procName, 1);

     ptaul = classer->ptaul;
     naclass = classer->naclass;
     pixat = classer->pixat;

     ptaDestroy(&classer->ptall);
     n = ptaGetCount(ptaul);
     ptall = ptaCreate(n);
     classer->ptall = ptall;

         /* If the templates were bordered, we would add h - 1 to the UL
          * corner y-value.  However, because the templates to be used
          * here have their borders removed, and the borders are
          * JB_ADDED_PIXELS on each side, we add h - 1 - 2 * JB_ADDED_PIXELS
          * to the UL corner y-value.  */
     for (i = 0; i < n; i++) {
         ptaGetIPt(ptaul, i, &x1, &y1);
         numaGetIValue(naclass, i, &iclass);
         pix = pixaGetPix(pixat, iclass, L_CLONE);
         h = pixGetHeight(pix);
         ptaAddPt(ptall, x1, y1 + h - 1 - 2 * JB_ADDED_PIXELS);
         pixDestroy(&pix);
     }

     return 0;
 }


 /*----------------------------------------------------------------------*
  *                              Static helpers                          *
  *----------------------------------------------------------------------*/
 /* When looking for similar matches we check templates whose size is +/- 2 in
  * each direction. This involves 25 possible sizes. This array contains the
  * offsets for each of those positions in a spiral pattern. There are 25 pairs
  * of numbers in this array: even positions are x values. */
 static int two_by_two_walk[50] = {
   0, 0,
   0, 1,
   -1, 0,
   0, -1,
   1, 0,
   -1, 1,
   1, 1,
   -1, -1,
   1, -1,
   0, -2,
   2, 0,
   0, 2,
   -2, 0,
   -1, -2,
   1, -2,
   2, -1,
   2, 1,
   1, 2,
   -1, 2,
   -2, 1,
   -2, -1,
   -2, -2,
   2, -2,
   2, 2,
   -2, 2};


 /*!
  *  findSimilarSizedTemplatesInit()
  *
  *      Input:  classer
  *              pixs (instance to be matched)
  *      Return: Allocated context to be used with findSimilar*
  */
 static JBFINDCTX *
 findSimilarSizedTemplatesInit(JBCLASSER  *classer,
                               PIX        *pixs)
 {
 JBFINDCTX  *state;

     state = (JBFINDCTX *)CALLOC(1, sizeof(JBFINDCTX));
     state->w = pixGetWidth(pixs) - 2 * JB_ADDED_PIXELS;
     state->h = pixGetHeight(pixs) - 2 * JB_ADDED_PIXELS;
     state->classer = classer;

     return state;
 }


 static void
 findSimilarSizedTemplatesDestroy(JBFINDCTX  **pstate)
 {
 JBFINDCTX  *state;

     PROCNAME("findSimilarSizedTemplatesDestroy");

     if (pstate == NULL) {
         L_WARNING("ptr address is null", procName);
         return;
     }
     if ((state = *pstate) == NULL)
         return;

     numaDestroy(&state->numa);
     FREE(state);
     *pstate = NULL;
     return;
 }


 /*!
  *  findSimilarSizedTemplatesNext()
  *
  *      Input:  state (from findSimilarSizedTemplatesInit)
  *      Return: Next template number, or -1 when finished
  *
  *  We have a hash table mapping template area to a list of template
  *  numbers with that area.  We wish to find similar sized templates,
  *  so we first look for templates with the same width and height, and
  *  then with width + 1, etc.  This walk is guided by the
  *  two_by_two_walk array, above.
  *
  *  We don't want to have to collect the whole list of templates first because
  *  (we hope) to find it quickly.  So we keep the context for this walk in an
  *  explictit state structure and this function acts like a generator.
  */
 static l_int32
 findSimilarSizedTemplatesNext(JBFINDCTX  *state)
 {
 l_int32  desiredh, desiredw, size, templ;
 PIX     *pixt;

     while(1) {  /* Continue the walk over step 'i' */
         if (state->i >= 25) {  /* all done */
             return -1;
         }

         desiredw = state->w + two_by_two_walk[2 * state->i];
         desiredh = state->h + two_by_two_walk[2 * state->i + 1];
         if (desiredh < 1 || desiredw < 1) {  /* invalid size */
             state->i++;
             continue;
         }

         if (!state->numa) {
                 /* We have yet to start walking the array for the step 'i' */
             state->numa = numaHashGetNuma(state->classer->nahash,
                                           desiredh * desiredw);
             if (!state->numa) {  /* nothing there */
                 state->i++;
                 continue;
             }

             state->n = 0;  /* OK, we got a numa. */
         }

             /* Continue working on this numa */
         size = numaGetCount(state->numa);
         for ( ; state->n < size; ) {
             templ = (l_int32)(state->numa->array[state->n++] + 0.5);
             pixt = pixaGetPix(state->classer->pixat, templ, L_CLONE);
             if (pixGetWidth(pixt) - 2 * JB_ADDED_PIXELS == desiredw &&
                 pixGetHeight(pixt) - 2 * JB_ADDED_PIXELS == desiredh) {
                 pixDestroy(&pixt);
                 return templ;
             }
             pixDestroy(&pixt);
         }

             /* Exhausted the numa; take another step and try again */
         state->i++;
         numaDestroy(&state->numa);
         continue;
     }
 }


 /*!
  *  finalPositioningForAlignment()
  *
  *      Input:  pixs (input page image)
  *              x, y (location of UL corner of bb of component in pixs)
  *              idelx, idely (compensation to match centroids of component
  *                            and template)
  *              pixt (template, with JB_ADDED_PIXELS of padding on all sides)
  *              sumtab (for summing fg pixels in an image)
  *              &dx, &dy (return delta on position for best match; each
  *                        one is in the set {-1, 0, 1})
  *      Return: 0 if OK, 1 on error
  *
  */
 static l_int32
 finalPositioningForAlignment(PIX      *pixs,
                              l_int32   x,
                              l_int32   y,
                              l_int32   idelx,
                              l_int32   idely,
                              PIX      *pixt,
                              l_int32  *sumtab,
                              l_int32  *pdx,
                              l_int32  *pdy)
 {
 l_int32  w, h, i, j, minx, miny, count, mincount;
 PIX     *pixi;  /* clipped from source pixs */
 PIX     *pixr;  /* temporary storage */
 BOX     *box;

     PROCNAME("finalPositioningForAlignment");

     if (!pixs)
         return ERROR_INT("pixs not defined", procName, 1);
     if (!pixt)
         return ERROR_INT("pixt not defined", procName, 1);
     if (!pdx || !pdy)
         return ERROR_INT("&dx and &dy not both defined", procName, 1);
     if (!sumtab)
         return ERROR_INT("sumtab not defined", procName, 1);
     *pdx = *pdy = 0;

         /* Use JB_ADDED_PIXELS pixels padding on each side */
     w = pixGetWidth(pixt);
     h = pixGetHeight(pixt);
     box = boxCreate(x - idelx - JB_ADDED_PIXELS,
                     y - idely - JB_ADDED_PIXELS, w, h);
     pixi = pixClipRectangle(pixs, box, NULL);
     boxDestroy(&box);
     if (!pixi)
         return ERROR_INT("pixi not made", procName, 1);

     pixr = pixCreate(pixGetWidth(pixi), pixGetHeight(pixi), 1);
     mincount = 0x7fffffff;
     for (i = -1; i <= 1; i++) {
         for (j = -1; j <= 1; j++) {
             pixCopy(pixr, pixi);
             pixRasterop(pixr, j, i, w, h, PIX_SRC ^ PIX_DST, pixt, 0, 0);
             pixCountPixels(pixr, &count, sumtab);
             if (count < mincount) {
                 minx = j;
                 miny = i;
                 mincount = count;
             }
         }
     }
     pixDestroy(&pixi);
     pixDestroy(&pixr);

     *pdx = minx;
     *pdy = miny;
     return 0;
 }