blob: 8d01e8d81554b409fcfa71c5fcc2a7bc9e205420 [file] [log] [blame]
/*====================================================================*
- Copyright (C) 2001 Leptonica. All rights reserved.
- This software is distributed in the hope that it will be
- useful, but with NO WARRANTY OF ANY KIND.
- No author or distributor accepts responsibility to anyone for the
- consequences of using this software, or for whether it serves any
- particular purpose or works at all, unless he or she says so in
- writing. Everyone is granted permission to copy, modify and
- redistribute this source code, for commercial or non-commercial
- purposes, with the following restrictions: (1) the origin of this
- source code must not be misrepresented; (2) modified versions must
- be plainly marked as such; and (3) this notice may not be removed
- or altered from any source or modified source distribution.
*====================================================================*/
/*
* baseline.c
*
* Locate text baselines in an image
* NUMA *pixFindBaselines()
*
* Projective transform to remove local skew
* PIX *pixDeskewLocal()
*
* Determine local skew
* l_int32 pixGetLocalSkewTransform()
* NUMA *pixGetLocalSkewAngles()
*
* We have two apparently different functions here:
* - finding baselines
* - finding a projective transform to remove keystone warping
* The function pixGetLocalSkewAngles() returns an array of angles,
* one for each raster line, and the baselines of the text lines
* should intersect the left edge of the image with that angle.
*/
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include "allheaders.h"
#ifndef NO_CONSOLE_IO
#define DEBUG_PLOT 0
#endif /* NO_CONSOLE_IO */
/* Min to travel after finding max before abandoning peak */
static const l_int32 MIN_DIST_IN_PEAK = 35;
/* Thresholds for peaks and zeros, relative to the max peak */
static const l_int32 PEAK_THRESHOLD_RATIO = 20;
static const l_int32 ZERO_THRESHOLD_RATIO = 100;
/* Default values for determining local skew */
static const l_int32 DEFAULT_SLICES = 10;
static const l_int32 DEFAULT_SWEEP_REDUCTION = 2;
static const l_int32 DEFAULT_BS_REDUCTION = 1;
static const l_float32 DEFAULT_SWEEP_RANGE = 5.; /* degrees */
static const l_float32 DEFAULT_SWEEP_DELTA = 1.; /* degrees */
static const l_float32 DEFAULT_MINBS_DELTA = 0.01; /* degrees */
/* Overlap slice fraction added to top and bottom of each slice */
static const l_float32 OVERLAP_FRACTION = 0.5;
/* Minimum allowed confidence (ratio) for accepting a value */
static const l_float32 MIN_ALLOWED_CONFIDENCE = 3.0;
/*---------------------------------------------------------------------*
* Locate text baselines in an image *
*---------------------------------------------------------------------*/
/*!
* pixFindBaselines()
*
* Input: pixs (1 bpp)
* &pta (<optional return> pairs of pts corresponding to
* approx. ends of each text line)
* debug (usually 0; set to 1 for debugging output)
* Return: na (of baseline y values), or null on error
*
* Notes:
* (1) Input binary image must have text lines already aligned
* horizontally. This can be done by either rotating the
* image with pixDeskew(), or, if a projective transform
* is required, by doing pixDeskewLocal() first.
* (2) Input null for &pta if you don't want this returned.
* The pta will come in pairs of points (left and right end
* of each baseline).
* (3) Caution: this will not work properly on text with multiple
* columns, where the lines are not aligned between columns.
* If there are multiple columns, they should be extracted
* separately before finding the baselines.
* (4) This function constructs different types of output
* for baselines; namely, a set of raster line values and
* a set of end points of each baseline.
* (5) This function was designed to handle short and long text lines
* without using dangerous thresholds on the peak heights. It does
* this by combining the differential signal with a morphological
* analysis of the locations of the text lines. One can also
* combine this data to normalize the peak heights, by weighting
* the differential signal in the region of each baseline
* by the inverse of the width of the text line found there.
* (6) There are various debug sections that can be turned on
* with the debug flag.
*/
NUMA *
pixFindBaselines(PIX *pixs,
PTA **ppta,
l_int32 debug)
{
l_int32 w, h, i, j, nbox, val1, val2, ndiff, bx, by, bw, bh;
l_int32 imaxloc, peakthresh, zerothresh, inpeak;
l_int32 mintosearch, max, maxloc, nloc, locval;
l_int32 *array;
l_float32 maxval;
BOXA *boxa1, *boxa2, *boxa3;
GPLOT *gplot;
NUMA *nasum, *nadiff, *naloc, *naval;
PIX *pixt1, *pixt2;
PTA *pta;
PROCNAME("pixFindBaselines");
if (!pixs)
return (NUMA *)ERROR_PTR("pixs not defined", procName, NULL);
pta = NULL;
if (ppta) {
pta = ptaCreate(0);
*ppta = pta;
}
/* Close up the text characters, removing noise */
pixt1 = pixMorphSequence(pixs, "c25.1 + e3.1", 0);
/* Save the difference of adjacent row sums.
* The high positive-going peaks are the baselines */
if ((nasum = pixCountPixelsByRow(pixt1, NULL)) == NULL)
return (NUMA *)ERROR_PTR("nasum not made", procName, NULL);
w = pixGetWidth(pixs);
h = pixGetHeight(pixs);
nadiff = numaCreate(h);
numaGetIValue(nasum, 0, &val2);
for (i = 0; i < h - 1; i++) {
val1 = val2;
numaGetIValue(nasum, i + 1, &val2);
numaAddNumber(nadiff, val1 - val2);
}
if (debug) /* show the difference signal */
gplotSimple1(nadiff, GPLOT_X11, "junkdiff", "difference");
/* Use the zeroes of the profile to locate each baseline. */
array = numaGetIArray(nadiff);
ndiff = numaGetCount(nadiff);
numaGetMax(nadiff, &maxval, &imaxloc);
/* Use this to begin locating a new peak: */
peakthresh = (l_int32)maxval / PEAK_THRESHOLD_RATIO;
/* Use this to begin a region between peaks: */
zerothresh = (l_int32)maxval / ZERO_THRESHOLD_RATIO;
naloc = numaCreate(0);
naval = numaCreate(0);
inpeak = FALSE;
for (i = 0; i < ndiff; i++) {
if (inpeak == FALSE) {
if (array[i] > peakthresh) { /* transition to in-peak */
inpeak = TRUE;
mintosearch = i + MIN_DIST_IN_PEAK; /* accept no zeros
* between i and mintosearch */
max = array[i];
maxloc = i;
}
}
else { /* inpeak == TRUE; look for max */
if (array[i] > max) {
max = array[i];
maxloc = i;
mintosearch = i + MIN_DIST_IN_PEAK;
}
else if (i > mintosearch && array[i] <= zerothresh) { /* leave */
inpeak = FALSE;
numaAddNumber(naval, max);
numaAddNumber(naloc, maxloc);
}
}
}
/* If array[ndiff-1] is max, eg. no descenders, baseline at bottom */
if (inpeak) {
numaAddNumber(naval, max);
numaAddNumber(naloc, maxloc);
}
FREE(array);
if (debug) { /* show the raster locations for the peaks */
gplot = gplotCreate("junkloc", GPLOT_X11, "Peak locations",
"rasterline", "height");
gplotAddPlot(gplot, naloc, naval, GPLOT_POINTS, "locs");
gplotMakeOutput(gplot);
gplotDestroy(&gplot);
}
/* Generate an approximate profile of text line width.
* First, filter the boxes of text, where there may be
* more than one box for a given textline. */
pixt2 = pixMorphSequence(pixt1, "r11 + c25.1 + o7.1 +c1.3", 0);
boxa1 = pixConnComp(pixt2, NULL, 4);
boxa2 = boxaTransform(boxa1, 0, 0, 4., 4.);
boxa3 = boxaSort(boxa2, L_SORT_BY_Y, L_SORT_INCREASING, NULL);
/* Then find the baseline segments */
if (pta) {
nloc = numaGetCount(naloc);
nbox = boxaGetCount(boxa3);
for (i = 0; i < nbox; i++) {
boxaGetBoxGeometry(boxa3, i, &bx, &by, &bw, &bh);
for (j = 0; j < nloc; j++) {
numaGetIValue(naloc, j, &locval);
if (L_ABS(locval - (by + bh)) > 25)
continue;
ptaAddPt(pta, bx, locval);
ptaAddPt(pta, bx + bw, locval);
break;
}
}
}
if (debug) { /* display baselines */
PIX *pixd;
l_int32 npts, x1, y1, x2, y2;
if (pta) {
pixd = pixConvertTo32(pixs);
npts = ptaGetCount(pta);
for (i = 0; i < npts; i += 2) {
ptaGetIPt(pta, i, &x1, &y1);
ptaGetIPt(pta, i + 1, &x2, &y2);
pixRenderLineArb(pixd, x1, y1, x2, y2, 1, 255, 0, 0);
}
pixDisplay(pixd, 200, 200);
pixWrite("junkbaselines", pixd, IFF_PNG);
pixDestroy(&pixd);
}
}
boxaDestroy(&boxa1);
boxaDestroy(&boxa2);
boxaDestroy(&boxa3);
pixDestroy(&pixt1);
pixDestroy(&pixt2);
numaDestroy(&nasum);
numaDestroy(&nadiff);
numaDestroy(&naval);
return naloc;
}
/*---------------------------------------------------------------------*
* Projective transform to remove local skew *
*---------------------------------------------------------------------*/
/*!
* pixDeskewLocal()
*
* Input: pixs
* nslices (the number of horizontal overlapping slices; must
* be larger than 1 and not exceed 20; use 0 for default)
* redsweep (sweep reduction factor: 1, 2, 4 or 8;
* use 0 for default value)
* redsearch (search reduction factor: 1, 2, 4 or 8, and
* not larger than redsweep; use 0 for default value)
* sweeprange (half the full range, assumed about 0; in degrees;
* use 0.0 for default value)
* sweepdelta (angle increment of sweep; in degrees;
* use 0.0 for default value)
* minbsdelta (min binary search increment angle; in degrees;
* use 0.0 for default value)
* Return: pixd, or null on error
*
* Notes:
* (1) This function allows deskew of a page whose skew changes
* approximately linearly with vertical position. It uses
* a projective tranform that in effect does a differential
* shear about the LHS of the page, and makes all text lines
* horizontal.
* (2) The origin of the keystoning can be either a cheap document
* feeder that rotates the page as it is passed through, or a
* camera image taken from either the left or right side
* of the vertical.
* (3) The image transformation is a projective warping,
* not a rotation. Apart from this function, the text lines
* must be properly aligned vertically with respect to each
* other. This can be done by pre-processing the page; e.g.,
* by rotating or horizontally shearing it.
* Typically, this can be achieved by vertically aligning
* the page edge.
*/
PIX *
pixDeskewLocal(PIX *pixs,
l_int32 nslices,
l_int32 redsweep,
l_int32 redsearch,
l_float32 sweeprange,
l_float32 sweepdelta,
l_float32 minbsdelta)
{
l_int32 ret;
PIX *pixd;
PTA *ptas, *ptad;
PROCNAME("pixDeskewLocal");
if (!pixs)
return (PIX *)ERROR_PTR("pixs not defined", procName, NULL);
/* Skew array gives skew angle (deg) as fctn of raster line
* where it intersects the LHS of the image */
ret = pixGetLocalSkewTransform(pixs, nslices, redsweep, redsearch,
sweeprange, sweepdelta, minbsdelta,
&ptas, &ptad);
if (ret != 0)
return (PIX *)ERROR_PTR("transform pts not found", procName, NULL);
/* Use a projective transform */
pixd = pixProjectiveSampledPta(pixs, ptad, ptas, L_BRING_IN_WHITE);
ptaDestroy(&ptas);
ptaDestroy(&ptad);
return pixd;
}
/*---------------------------------------------------------------------*
* Determine the local skew *
*---------------------------------------------------------------------*/
/*!
* pixGetLocalSkewTransform()
*
* Input: pixs
* nslices (the number of horizontal overlapping slices; must
* be larger than 1 and not exceed 20; use 0 for default)
* redsweep (sweep reduction factor: 1, 2, 4 or 8;
* use 0 for default value)
* redsearch (search reduction factor: 1, 2, 4 or 8, and
* not larger than redsweep; use 0 for default value)
* sweeprange (half the full range, assumed about 0; in degrees;
* use 0.0 for default value)
* sweepdelta (angle increment of sweep; in degrees;
* use 0.0 for default value)
* minbsdelta (min binary search increment angle; in degrees;
* use 0.0 for default value)
* &ptas (<return> 4 points in the source)
* &ptad (<return> the corresponding 4 pts in the dest)
* Return: 0 if OK, 1 on error
*
* Notes:
* (1) This generates two pairs of points in the src, each pair
* corresponding to a pair of points that would lie along
* the same raster line in a transformed (dewarped) image.
* (2) The sets of 4 src and 4 dest points returned by this function
* can then be used, in a projective or bilinear transform,
* to remove keystoning in the src.
*/
l_int32
pixGetLocalSkewTransform(PIX *pixs,
l_int32 nslices,
l_int32 redsweep,
l_int32 redsearch,
l_float32 sweeprange,
l_float32 sweepdelta,
l_float32 minbsdelta,
PTA **pptas,
PTA **pptad)
{
l_int32 w, h, i;
l_float32 deg2rad, angr, angd, dely;
NUMA *naskew;
PTA *ptas, *ptad;
PROCNAME("pixGetLocalSkewTransform");
if (!pptas || !pptad)
return ERROR_INT("&ptas and &ptad not defined", procName, 1);
*pptas = *pptad = NULL;
if (!pixs)
return ERROR_INT("pixs not defined", procName, 1);
if (nslices < 2 || nslices > 20)
nslices = DEFAULT_SLICES;
if (redsweep < 1 || redsweep > 8)
redsweep = DEFAULT_SWEEP_REDUCTION;
if (redsearch < 1 || redsearch > redsweep)
redsearch = DEFAULT_BS_REDUCTION;
if (sweeprange == 0.0)
sweeprange = DEFAULT_SWEEP_RANGE;
if (sweepdelta == 0.0)
sweepdelta = DEFAULT_SWEEP_DELTA;
if (minbsdelta == 0.0)
minbsdelta = DEFAULT_MINBS_DELTA;
naskew = pixGetLocalSkewAngles(pixs, nslices, redsweep, redsearch,
sweeprange, sweepdelta, minbsdelta,
NULL, NULL);
if (!naskew)
return ERROR_INT("naskew not made", procName, 1);
deg2rad = 3.14159265 / 180.;
w = pixGetWidth(pixs);
h = pixGetHeight(pixs);
ptas = ptaCreate(4);
ptad = ptaCreate(4);
*pptas = ptas;
*pptad = ptad;
/* Find i for skew line that intersects LHS at i and RHS at h / 20 */
for (i = 0; i < h; i++) {
numaGetFValue(naskew, i, &angd);
angr = angd * deg2rad;
dely = w * tan(angr);
if (i - dely > 0.05 * h)
break;
}
ptaAddPt(ptas, 0, i);
ptaAddPt(ptas, w - 1, i - dely);
ptaAddPt(ptad, 0, i);
ptaAddPt(ptad, w - 1, i);
/* Find i for skew line that intersects LHS at i and RHS at 19h / 20 */
for (i = h - 1; i > 0; i--) {
numaGetFValue(naskew, i, &angd);
angr = angd * deg2rad;
dely = w * tan(angr);
if (i - dely < 0.95 * h)
break;
}
ptaAddPt(ptas, 0, i);
ptaAddPt(ptas, w - 1, i - dely);
ptaAddPt(ptad, 0, i);
ptaAddPt(ptad, w - 1, i);
numaDestroy(&naskew);
return 0;
}
/*!
* pixGetLocalSkewAngles()
*
* Input: pixs
* nslices (the number of horizontal overlapping slices; must
* be larger than 1 and not exceed 20; use 0 for default)
* redsweep (sweep reduction factor: 1, 2, 4 or 8;
* use 0 for default value)
* redsearch (search reduction factor: 1, 2, 4 or 8, and
* not larger than redsweep; use 0 for default value)
* sweeprange (half the full range, assumed about 0; in degrees;
* use 0.0 for default value)
* sweepdelta (angle increment of sweep; in degrees;
* use 0.0 for default value)
* minbsdelta (min binary search increment angle; in degrees;
* use 0.0 for default value)
* &a (<optional return> slope of skew as fctn of y)
* &b (<optional return> intercept at y=0 of skew as fctn of y)
* Return: naskew, or null on error
*
* Notes:
* (1) The local skew is measured in a set of overlapping strips.
* We then do a least square linear fit parameters to get
* the slope and intercept parameters a and b in
* skew-angle = a * y + b (degrees)
* for the local skew as a function of raster line y.
* This is then used to make naskew, which can be interpreted
* as the computed skew angle (in degrees) at the left edge
* of each raster line.
* (2) naskew can then be used to find the baselines of text, because
* each text line has a baseline that should intersect
* the left edge of the image with the angle given by this
* array, evaluated at the raster line of intersection.
*/
NUMA *
pixGetLocalSkewAngles(PIX *pixs,
l_int32 nslices,
l_int32 redsweep,
l_int32 redsearch,
l_float32 sweeprange,
l_float32 sweepdelta,
l_float32 minbsdelta,
l_float32 *pa,
l_float32 *pb)
{
l_int32 w, h, hs, i, ystart, yend, ovlap, npts;
l_float32 angle, conf, ycenter, a, b;
BOX *box;
NUMA *naskew;
PIX *pix;
PTA *pta;
PROCNAME("pixGetLocalSkewAngles");
if (!pixs)
return (NUMA *)ERROR_PTR("pixs not defined", procName, NULL);
if (nslices < 2 || nslices > 20)
nslices = DEFAULT_SLICES;
if (redsweep < 1 || redsweep > 8)
redsweep = DEFAULT_SWEEP_REDUCTION;
if (redsearch < 1 || redsearch > redsweep)
redsearch = DEFAULT_BS_REDUCTION;
if (sweeprange == 0.0)
sweeprange = DEFAULT_SWEEP_RANGE;
if (sweepdelta == 0.0)
sweepdelta = DEFAULT_SWEEP_DELTA;
if (minbsdelta == 0.0)
minbsdelta = DEFAULT_MINBS_DELTA;
h = pixGetHeight(pixs);
w = pixGetWidth(pixs);
hs = h / nslices;
ovlap = (l_int32)(OVERLAP_FRACTION * hs);
pta = ptaCreate(nslices);
for (i = 0; i < nslices; i++) {
ystart = L_MAX(0, hs * i - ovlap);
yend = L_MIN(h - 1, hs * (i + 1) + ovlap);
ycenter = (ystart + yend) / 2;
box = boxCreate(0, ystart, w, yend - ystart + 1);
pix = pixClipRectangle(pixs, box, NULL);
pixFindSkewSweepAndSearch(pix, &angle, &conf, redsweep, redsearch,
sweeprange, sweepdelta, minbsdelta);
if (conf > MIN_ALLOWED_CONFIDENCE)
ptaAddPt(pta, ycenter, angle);
pixDestroy(&pix);
boxDestroy(&box);
}
/* ptaWriteStream(stderr, pta, 0); */
/* Do linear least squares fit */
if ((npts = ptaGetCount(pta)) < 2) {
ptaDestroy(&pta);
return (NUMA *)ERROR_PTR("can't fit skew", procName, NULL);
}
ptaGetLinearLSF(pta, &a, &b, NULL);
if (pa) *pa = a;
if (pb) *pb = b;
/* Make skew angle array as function of raster line */
naskew = numaCreate(h);
for (i = 0; i < h; i++) {
angle = a * i + b;
numaAddNumber(naskew, angle);
}
#if DEBUG_PLOT
{ NUMA *nax, *nay;
GPLOT *gplot;
ptaGetArrays(pta, &nax, &nay);
gplot = gplotCreate("junkskew", GPLOT_X11, "skew as fctn of y",
"y (in raster lines from top)", "angle (in degrees)");
gplotAddPlot(gplot, NULL, naskew, GPLOT_POINTS, "linear lsf");
gplotAddPlot(gplot, nax, nay, GPLOT_POINTS, "actual data pts");
gplotMakeOutput(gplot);
gplotDestroy(&gplot);
numaDestroy(&nax);
numaDestroy(&nay);
}
#endif /* DEBUG_PLOT */
ptaDestroy(&pta);
return naskew;
}