// api/baseapi.cpp - platform/external/tesseract - Git at Google

 /**********************************************************************
  * File:        baseapi.cpp
  * Description: Simple API for calling tesseract.
  * Author:      Ray Smith
  * Created:     Fri Oct 06 15:35:01 PDT 2006
  *
  * (C) Copyright 2006, Google Inc.
  ** Licensed under the Apache License, Version 2.0 (the "License");
  ** you may not use this file except in compliance with the License.
  ** You may obtain a copy of the License at
  ** http://www.apache.org/licenses/LICENSE-2.0
  ** Unless required by applicable law or agreed to in writing, software
  ** distributed under the License is distributed on an "AS IS" BASIS,
  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  ** See the License for the specific language governing permissions and
  ** limitations under the License.
  *
  **********************************************************************/

 // Include automatically generated configuration file if running autoconf.
 #ifdef HAVE_CONFIG_H
 #include "config_auto.h"
 #endif

 #ifdef HAVE_LIBLEPT
 // Include leptonica library only if autoconf (or makefile etc) tell us to.
 #include "allheaders.h"
 #endif

 #include "baseapi.h"

 #include "thresholder.h"
 #include "tesseractmain.h"
 #include "tesseractclass.h"
 #include "tessedit.h"
 #include "ocrclass.h"
 #include "pageres.h"
 #include "tessvars.h"
 #include "control.h"
 #include "applybox.h"
 #include "pgedit.h"
 #include "varabled.h"
 #include "output.h"
 #include "mainblk.h"
 #include "globals.h"
 #include "adaptmatch.h"
 #include "edgblob.h"
 #include "tessbox.h"
 #include "tordvars.h"
 #include "imgs.h"
 #include "makerow.h"
 #include "tstruct.h"
 #include "tessout.h"
 #include "tface.h"
 #include "permute.h"
 #include "otsuthr.h"
 #include "osdetect.h"
 #include "chopper.h"
 #include "matchtab.h"

 namespace tesseract {

 // Minimum sensible image size to be worth running tesseract.
 // TesseractRect returns NULL for rectangles whose width or height is
 // smaller than this.
 const int kMinRectSize = 10;
 // Character returned when Tesseract couldn't recognize as anything.
 const char kTesseractReject = '~';
 // Character used by UNLV error counter as a reject.
 // Note that this is currently the same character as kTesseractReject.
 const char kUNLVReject = '~';
 // Character used by UNLV as a suspect marker.
 const char kUNLVSuspect = '^';
 // Filename used for input image file, from which to derive a name to search
 // for a possible UNLV zone file, if none is specified by SetInputName.
 const char* kInputFile = "noname.tif";

 // Constructs an API object with no engine, no image and no results.
 // All owned pointers start out NULL and all image/rectangle geometry is
 // zeroed; the engine itself is created later by Init and by the setters
 // that need one.
 TessBaseAPI::TessBaseAPI()
   : tesseract_(NULL),
     // Thresholder is initialized to NULL here, but will be set before use by:
     // A constructor of a derived API,  SetThresholder(), or
     // created implicitly when used in InternalSetImage.
     thresholder_(NULL),
     threshold_done_(false),
     block_list_(NULL),
     page_res_(NULL),
     input_file_(NULL),
     output_file_(NULL),
     datapath_(NULL),
     language_(NULL),
     rect_left_(0), rect_top_(0), rect_width_(0), rect_height_(0),
     image_width_(0), image_height_(0) {
 }

 // Destructor: all cleanup is delegated to End().
 TessBaseAPI::~TessBaseAPI() {
   End();
 }

 // Records the name of the input file. This is only required for training
 // and for locating a UNLV zone file.
 void TessBaseAPI::SetInputName(const char* name) {
   if (input_file_ != NULL) {
     // Reuse the existing STRING rather than reallocating it.
     *input_file_ = name;
     return;
   }
   input_file_ = new STRING(name);
 }

 // Records the base name of the output files. Only needed for debugging.
 void TessBaseAPI::SetOutputName(const char* name) {
   if (output_file_ != NULL) {
     // Reuse the existing STRING rather than reallocating it.
     *output_file_ = name;
     return;
   }
   output_file_ = new STRING(name);
 }

 // Assigns a value to an internal "variable" (of either old or new types).
 // Both the variable name and its value are given as strings, exactly as
 // they would appear in a config file.
 // Returns false if the name lookup failed.
 // It is legal to call SetVariable before Init in order to influence
 // initialization, but End discards every setting, so the next Init starts
 // from the defaults unless SetVariable is called again.
 bool TessBaseAPI::SetVariable(const char* variable, const char* value) {
   // Create the engine on demand so the call also works before Init.
   if (!tesseract_) {
     tesseract_ = new Tesseract;
   }
   const bool found = set_variable(variable, value);
   return found;
 }

 // Initializes (or re-initializes) the engine for a data directory and
 // language.
 // The datapath must be the name of the data directory (no ending /) or
 // some other file in which the data directory resides (for instance argv[0].)
 // The language is (usually) an ISO 639-3 string or NULL will default to eng.
 // configs, configs_size and configs_global_only are forwarded unchanged to
 // Tesseract::init_tesseract.
 // If the engine already exists for the same datapath and language, only the
 // adaptive classifier is reset.
 // Returns 0 on success and -1 on initialization failure. After a failure
 // the remembered datapath/language are discarded, so the next call always
 // performs a full initialization.
 int TessBaseAPI::Init(const char* datapath, const char* language,
                       char **configs, int configs_size,
                       bool configs_global_only) {
   // If the datapath or the language have changed, then start again.
   // Note that the language_ field stores the last requested language that was
   // initialized successfully, while tesseract_->lang stores the language
   // actually used. They differ only if the requested language was NULL, in
   // which case tesseract_->lang is set to the Tesseract default ("eng").
   if (tesseract_ != NULL &&
       (datapath_ == NULL || language_ == NULL || *datapath_ != datapath
        || (*language_ != language && tesseract_->lang != language))) {
     tesseract_->end_tesseract();
     delete tesseract_;
     tesseract_ = NULL;
   }

   bool reset_classifier = true;
   if (tesseract_ == NULL) {
     reset_classifier = false;
     tesseract_ = new Tesseract;
     if (tesseract_->init_tesseract(
             datapath, output_file_ != NULL ? output_file_->string() : NULL,
             language, configs, configs_size, configs_global_only) != 0) {
       // The engine left in tesseract_ is not usable. Forget the previously
       // remembered datapath/language so that a later Init with those old
       // values cannot mistake the failed engine for a valid one and skip
       // initialization.
       delete datapath_;
       datapath_ = NULL;
       delete language_;
       language_ = NULL;
       return -1;
     }
   }
   // Update datapath and language requested for the last valid initialization.
   if (datapath_ == NULL)
     datapath_ = new STRING(datapath);
   else
     *datapath_ = datapath;
   if (language_ == NULL)
     language_ = new STRING(language);
   else
     *language_ = language;

   // For same language and datapath, just reset the adaptive classifier.
   if (reset_classifier) tesseract_->ResetAdaptiveClassifier();

   return 0;
 }

 // Initializes just the language model component of Tesseract. After this
 // kind of init the only functions that work are SetVariable and
 // IsValidWord.
 // WARNING: temporary! This function will be removed from here and placed
 // in a separate API at some future time.
 // Returns whatever Tesseract::init_tesseract_lm returns.
 int TessBaseAPI::InitLangMod(const char* datapath, const char* language) {
   if (!tesseract_) {
     tesseract_ = new Tesseract;
   }
   const int status = tesseract_->init_tesseract_lm(datapath, NULL, language);
   return status;
 }

 // Init only the classifier component of Tesseract. Used to initialize the
 // specified language when no dawg models are available.
 // If the engine already exists for the same datapath and language, only the
 // adaptive classifier is reset and 0 is returned.
 // Otherwise returns the result of Tesseract::init_tesseract_classifier; a
 // non-zero result is treated as a failure (as Init does for
 // init_tesseract), in which case the remembered datapath/language are
 // discarded so that the next call performs a full initialization.
 int TessBaseAPI::InitWithoutLangModel(const char* datapath,
                                       const char* language) {
   // If the datapath or the language have changed, then start again.
   if (tesseract_ != NULL &&
       (datapath_ == NULL || language_ == NULL ||
        *datapath_ != datapath || *language_ != language)) {
     tesseract_->end_tesseract();
     delete tesseract_;
     tesseract_ = NULL;
   }

   const bool fresh_engine = (tesseract_ == NULL);
   if (fresh_engine) {
     tesseract_ = new Tesseract;
     const int status = tesseract_->init_tesseract_classifier(
         datapath, output_file_ != NULL ? output_file_->string() : NULL,
         language, NULL, 0, false);
     if (status != 0) {
       // Do not remember a datapath/language pair that failed to initialize:
       // otherwise a retry with the same arguments would skip initialization
       // and report success on an unusable engine.
       delete datapath_;
       datapath_ = NULL;
       delete language_;
       language_ = NULL;
       return status;
     }
   }

   // Remember the datapath and language of the last valid initialization.
   if (datapath_ == NULL)
     datapath_ = new STRING(datapath);
   else
     *datapath_ = datapath;
   if (language_ == NULL)
     language_ = new STRING(language);
   else
     *language_ = language;

   // For same language and datapath, just reset the adaptive classifier.
   if (!fresh_engine)
     tesseract_->ResetAdaptiveClassifier();
   return 0;
 }

 // Read a "config" file containing a set of variable, value pairs.
 // Searches the standard places: tessdata/configs, tessdata/tessconfigs
 // and also accepts a relative or absolute path name.
 void TessBaseAPI::ReadConfigFile(const char* filename, bool global_only) {
   tesseract_->read_config_file(filename, global_only);
 }

 // Selects the page segmentation mode. Defaults to PSM_AUTO.
 // The mode lives in an INT_VARIABLE, so it can equally be changed through
 // ReadConfigFile or SetVariable("tessedit_pageseg_mode", mode as string).
 void TessBaseAPI::SetPageSegMode(PageSegMode mode) {
   // Create the engine on demand so the mode can be chosen before Init.
   if (!tesseract_) {
     tesseract_ = new Tesseract;
   }
   tesseract_->tessedit_pageseg_mode.set_value(mode);
 }

 // Reports the page segmentation mode currently in effect.
 // When no engine has been created yet, PSM_SINGLE_BLOCK is returned.
 PageSegMode TessBaseAPI::GetPageSegMode() const {
   if (tesseract_ != NULL) {
     const int mode = static_cast<int>(tesseract_->tessedit_pageseg_mode);
     return static_cast<PageSegMode>(mode);
   }
   return PSM_SINGLE_BLOCK;
 }

 // Supplies a hint for how to trade accuracy against speed.
 // The default, AVS_FASTEST, corresponds to the old behaviour.
 // This is a hint only: depending on the language and/or the build
 // configuration, speed and accuracy may not be tradeable at all.
 // Although the parameter is an enum, any value in the range AVS_FASTEST to
 // AVS_MOST_ACCURATE may be passed, and whether it has any effect depends
 // on the implementation.
 // The mode lives in an INT_VARIABLE, so it can equally be changed through
 // ReadConfigFile or SetVariable("tessedit_accuracyvspeed", mode as string).
 void TessBaseAPI::SetAccuracyVSpeed(AccuracyVSpeed mode) {
   // Create the engine on demand so the hint can be given before Init.
   if (!tesseract_) {
     tesseract_ = new Tesseract;
   }
   tesseract_->tessedit_accuracyvspeed.set_value(mode);
 }

 // Recognizes one rectangle of an image and returns the text found in it.
 // May be called many times for a single Init.
 // Currently has no error checking.
 // Accepted inputs are 8 bit greyscale and 24 or 32 bit color. Palette
 // color images will not work properly and must be converted to 24 bit.
 // 1 bit per pixel binary images are accepted as well, provided they are
 // byte packed with the MSB of the first byte being the first pixel, and a
 // one pixel is WHITE. For binary images set bytes_per_pixel=0.
 // The result is a UTF8 coded char* that must be freed with the delete []
 // operator. NULL is returned when there is no engine or when the rectangle
 // is smaller than kMinRectSize in either dimension.
 char* TessBaseAPI::TesseractRect(const unsigned char* imagedata,
                                  int bytes_per_pixel,
                                  int bytes_per_line,
                                  int left, int top,
                                  int width, int height) {
   const bool too_small = width < kMinRectSize || height < kMinRectSize;
   if (tesseract_ == NULL || too_small) {
     return NULL;  // Nothing worth doing.
   }

   // This original api never said how wide the image is, so derive a
   // plausible width from the line stride.
   int bits_per_pixel = bytes_per_pixel * 8;
   if (bytes_per_pixel == 0) {
     bits_per_pixel = 1;
   }
   const int image_width = bytes_per_line * 8 / bits_per_pixel;
   SetImage(imagedata, image_width, height, bytes_per_pixel, bytes_per_line);
   SetRectangle(left, top, width, height);

   return GetUTF8Text();
 }

 // Frees memory and forgets adaptive data; intended to be called between
 // pages or documents. Does nothing when no engine exists.
 void TessBaseAPI::ClearAdaptiveClassifier() {
   if (tesseract_ != NULL) {
     tesseract_->ResetAdaptiveClassifier();
   }
 }

 // Hands Tesseract a raw image buffer to recognize, in the same format as
 // TesseractRect above. The buffer is neither copied nor owned. The source
 // image may be destroyed after Recognize is called, either explicitly or
 // implicitly via one of the Get*Text functions.
 // SetImage clears all recognition results and sets the rectangle to the
 // full image, so it may be followed immediately by a GetUTF8Text, which
 // will then perform recognition automatically.
 void TessBaseAPI::SetImage(const unsigned char* imagedata,
                            int width, int height,
                            int bytes_per_pixel, int bytes_per_line) {
   // Nothing is passed to the thresholder unless InternalSetImage succeeds.
   if (!InternalSetImage())
     return;
   thresholder_->SetImage(imagedata, width, height,
                          bytes_per_pixel, bytes_per_line);
 }

 // Hands Tesseract a Leptonica Pix to recognize. As with the raw-buffer
 // SetImage above, Tesseract neither copies, owns nor pixDestroys the image,
 // so it must persist until after Recognize.
 // Pix vs raw, which to use?
 // Use Pix where possible. A future version of Tesseract may choose to use
 // Pix as its internal representation and discard IMAGE altogether, in which
 // case an implementation that sources and targets Pix may end up with
 // fewer copies than one that does not.
 // Without HAVE_LIBLEPT this function does nothing.
 void TessBaseAPI::SetImage(const Pix* pix) {
 #ifdef HAVE_LIBLEPT
   // Nothing is passed to the thresholder unless InternalSetImage succeeds.
   if (!InternalSetImage())
     return;
   thresholder_->SetImage(pix);
 #endif
 }

 // Limits recognition to a sub-rectangle of the image. Call after SetImage.
 // Every SetRectangle clears the recognition results, so that several
 // rectangles can be recognized from the same image.
 // Does nothing when there is no thresholder.
 void TessBaseAPI::SetRectangle(int left, int top, int width, int height) {
   if (thresholder_ != NULL) {
     thresholder_->SetRectangle(left, top, width, height);
     ClearResults();
   }
 }

 // ONLY available if you have Leptonica installed.
 // Returns the internal thresholded image from Tesseract, obtained via
 // pixClone; thresholding is performed first if no binary image exists yet.
 // Returns NULL when there is no engine or Leptonica is not compiled in.
 Pix* TessBaseAPI::GetThresholdedImage() {
 #ifdef HAVE_LIBLEPT
   if (tesseract_ != NULL) {
     if (tesseract_->pix_binary() == NULL) {
       Threshold(tesseract_->mutable_pix_binary());
     }
     return pixClone(tesseract_->pix_binary());
   }
 #endif
   return NULL;
 }

 // Get the result of page layout analysis as a leptonica-style
 // Boxa, Pixa pair, in reading order.
 // Can be called before or after Recognize.
 // For now only gets text regions.
 // If pixa is not NULL, *pixa receives a newly created Pixa holding one
 // 1 bpp image per returned box, copied from the binary page image.
 // Returns the Boxa of region bounding boxes, or NULL without HAVE_LIBLEPT.
 // NOTE(review): tesseract_ and block_list_ are dereferenced without a NULL
 // check after FindLines(), and the result of FindLines() is ignored -
 // confirm that FindLines guarantees both are valid.
 Boxa* TessBaseAPI::GetRegions(Pixa** pixa) {
 #ifdef HAVE_LIBLEPT
   // Run layout analysis on demand if there are no blocks yet.
   if (block_list_ == NULL || block_list_->empty()) {
     FindLines();
   }
   // The image height is needed to flip the y coordinate: the expressions
   // below convert a box top t into im_height - t for the Leptonica calls.
   int im_height = pixGetHeight(tesseract_->pix_binary());
   Boxa* boxa = boxaCreate(block_list_->length());
   if (pixa != NULL) {
     *pixa = pixaCreate(boxaGetCount(boxa));
   }
   BLOCK_IT it(block_list_);
   for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
     BLOCK* block = it.data();
     POLY_BLOCK* poly = block->poly_block();
     TBOX box;
     if (poly != NULL) {
       if (!poly->IsText())
         continue;  // Use only text blocks.
       // Work on a copy of the polygon rotated by the block's re_rotation,
       // leaving the block's own poly_block untouched.
       POLY_BLOCK image_block(poly->points(), poly->isA());
       image_block.rotate(block->re_rotation());
       box = *image_block.bounding_box();
       if (pixa != NULL) {
         Pix* pix = pixCreate(box.width(), box.height(), 1);
         PB_LINE_IT *lines;
         // Block outline is a polygon, so use a PB_LINE_IT to get the
         // rasterized interior. (Runs of interior pixels on a line.)
         lines = new PB_LINE_IT(&image_block);
         for (int y = box.bottom(); y < box.top(); ++y) {
           // get_line returns a list that this loop owns and deletes below.
           ICOORDELT_LIST* segments = lines->get_line(y);
           if (!segments->empty()) {
             ICOORDELT_IT s_it(segments);
             // Each element of segments is a start x and x size of the
             // run of interior pixels.
             for (s_it.mark_cycle_pt(); !s_it.cycled_list(); s_it.forward()) {
               int start = s_it.data()->x();
               int xext = s_it.data()->y();
               // Copy the run from the source image to the block image.
               pixRasterop(pix, start - box.left(),
                           box.height() - 1 - (y - box.bottom()),
                           xext, 1, PIX_SRC, tesseract_->pix_binary(),
                           start, im_height - 1 - y);
             }
           }
           delete segments;
         }
         delete lines;
         pixaAddPix(*pixa, pix, L_INSERT);
       }
     } else {
       if (!block_list_->singleton())
         continue;  // A null poly block can only be used if it is the only block.
       box = block->bounding_box();
       if (pixa != NULL) {
         Pix* pix = pixCreate(box.width(), box.height(), 1);
         // Just copy the whole block as there is only a bounding box.
         pixRasterop(pix, 0, 0, box.width(), box.height(),
                     PIX_SRC, tesseract_->pix_binary(),
                     box.left(), im_height - box.top());
         pixaAddPix(*pixa, pix, L_INSERT);
       }
     }
     // Record the region's bounding box with the y coordinate flipped.
     Box* lbox = boxCreate(box.left(), im_height - box.top(),
                           box.width(), box.height());
     boxaAddBox(boxa, lbox, L_INSERT);
   }
   return boxa;
 #else
   return NULL;
 #endif
 }

 // Get the textlines as a leptonica-style
 // Boxa, Pixa pair, in reading order.
 // Can be called before or after Recognize.
 // If pixa is not NULL, *pixa receives a newly created Pixa with one 1 bpp
 // image (and a clone of the matching box) per text line.
 // If blockids is not NULL, the block-id of each line is also returned as an
 // array of one element per line. delete [] after use.
 // Returns the Boxa of line bounding boxes, or NULL without HAVE_LIBLEPT.
 // NOTE(review): tesseract_ and block_list_ are used without a NULL check
 // after FindLines() - confirm that FindLines guarantees both are valid.
 Boxa* TessBaseAPI::GetTextlines(Pixa** pixa, int** blockids) {
 #ifdef HAVE_LIBLEPT
   // Run layout analysis on demand if there are no blocks yet.
   if (block_list_ == NULL || block_list_->empty()) {
     FindLines();
   }
   // A local PAGE_RES prevents the clear if Recognize is called after.
   PAGE_RES page_res(block_list_);
   PAGE_RES_IT page_res_it(page_res_ != NULL ? page_res_ : &page_res);
   // Count the lines to get a size for the arrays.
   int line_count = 0;
   for (page_res_it.restart_page(); page_res_it.word() != NULL;
        page_res_it.forward()) {
     // The last word of a row is the one whose successor is in another row.
     if (page_res_it.row() != page_res_it.next_row()) {
       ++line_count;
     }
   }

   // The image height is needed to flip the y coordinate: a box top t is
   // passed to Leptonica as im_height - t.
   int im_height = pixGetHeight(tesseract_->pix_binary());
   Boxa* boxa = boxaCreate(line_count);
   if (pixa != NULL)
     *pixa = pixaCreate(line_count);
   if (blockids != NULL)
     *blockids = new int[line_count];
   int blockid = 0;
   int lineindex = 0;
   // Each pass of the outer loop consumes one whole text line.
   for (page_res_it.restart_page(); page_res_it.word() != NULL;
        page_res_it.forward(), ++lineindex) {
     WERD_RES *word = page_res_it.word();
     BLOCK* block = page_res_it.block()->block;
     // Get the line bounding box.
     PAGE_RES_IT word_it(page_res_it);  // Save start of line.
     TBOX line_box = word->word->bounding_box();
     // The first word must be rotated just like every other word of the
     // line; otherwise the line box mixes rotated and unrotated boxes
     // whenever the block has a non-trivial re_rotation.
     line_box.rotate(block->re_rotation());
     while (page_res_it.next_row() == page_res_it.row()) {
       page_res_it.forward();
       word = page_res_it.word();
       TBOX word_box = word->word->bounding_box();
       word_box.rotate(block->re_rotation());
       line_box += word_box;
     }
     Box* lbox = boxCreate(line_box.left(), im_height - line_box.top(),
                           line_box.width(), line_box.height());
     boxaAddBox(boxa, lbox, L_INSERT);
     if (pixa != NULL) {
       Pix* pix = pixCreate(line_box.width(), line_box.height(), 1);
       // Copy all the words to the output pix.
       while (word_it.row() == page_res_it.row()) {
         word = word_it.word();
         TBOX word_box = word->word->bounding_box();
         word_box.rotate(block->re_rotation());
         pixRasterop(pix, word_box.left() - line_box.left(),
                     line_box.top() - word_box.top(),
                     word_box.width(), word_box.height(),
                     PIX_SRC, tesseract_->pix_binary(),
                     word_box.left(), im_height - word_box.top());
         word_it.forward();
       }
       pixaAddPix(*pixa, pix, L_INSERT);
       pixaAddBox(*pixa, lbox, L_CLONE);
     }
     if (blockids != NULL) {
       (*blockids)[lineindex] = blockid;
       // Move on to the next id once the last line of a block is stored.
       if (page_res_it.block() != page_res_it.next_block())
         ++blockid;
     }
   }
   return boxa;
 #else
   return NULL;
 #endif
 }

 // Returns the words as a leptonica-style Boxa, Pixa pair, in reading order.
 // Can be called before or after Recognize.
 // If pixa is not NULL, *pixa receives a newly created Pixa with one 1 bpp
 // image (and a clone of the matching box) per word.
 // Returns NULL without HAVE_LIBLEPT.
 Boxa* TessBaseAPI::GetWords(Pixa** pixa) {
 #ifdef HAVE_LIBLEPT
   if (block_list_ == NULL || block_list_->empty())
     FindLines();
   // Iterate over a local PAGE_RES when there are no results yet, so that
   // a later Recognize does not trigger a clear.
   PAGE_RES local_results(block_list_);
   PAGE_RES_IT res_it(page_res_ != NULL ? page_res_ : &local_results);

   // First pass: count the words so the arrays can be sized.
   int num_words = 0;
   res_it.restart_page();
   while (res_it.word() != NULL) {
     ++num_words;
     res_it.forward();
   }

   int image_height = pixGetHeight(tesseract_->pix_binary());
   Boxa* word_boxes = boxaCreate(num_words);
   if (pixa != NULL)
     *pixa = pixaCreate(num_words);

   // Second pass: emit one box (and optionally one image) per word.
   res_it.restart_page();
   while (res_it.word() != NULL) {
     WERD_RES *word_res = res_it.word();
     BLOCK* word_block = res_it.block()->block;
     TBOX word_box = word_res->word->bounding_box();
     word_box.rotate(word_block->re_rotation());
     Box* lept_box = boxCreate(word_box.left(),
                               image_height - word_box.top(),
                               word_box.width(), word_box.height());
     boxaAddBox(word_boxes, lept_box, L_INSERT);
     if (pixa != NULL) {
       // Copy the word's entire bounding box out of the binary image.
       Pix* word_pix = pixCreate(word_box.width(), word_box.height(), 1);
       pixRasterop(word_pix, 0, 0, word_box.width(), word_box.height(),
                   PIX_SRC, tesseract_->pix_binary(),
                   word_box.left(), image_height - word_box.top());
       pixaAddPix(*pixa, word_pix, L_INSERT);
       pixaAddBox(*pixa, lept_box, L_CLONE);
     }
     res_it.forward();
   }
   return word_boxes;
 #else
   return NULL;
 #endif  // HAVE_LIBLEPT
 }

 // Dump the internal binary image to a PGM file.
 void TessBaseAPI::DumpPGM(const char* filename) {
   if (tesseract_ == NULL)
     return;
   IMAGELINE line;
   line.init(page_image.get_xsize());
   FILE *fp = fopen(filename, "w");
   fprintf(fp, "P5 " INT32FORMAT " " INT32FORMAT " 255\n",
           page_image.get_xsize(), page_image.get_ysize());
   for (int j = page_image.get_ysize()-1; j >= 0 ; --j) {
     page_image.get_line(0, j, page_image.get_xsize(), &line, 0);
     for (int i = 0; i < page_image.get_xsize(); ++i) {
       uinT8 b = line.pixels[i] ? 255 : 0;
       fwrite(&b, 1, 1, fp);
     }
   }
   fclose(fp);
 }

 // Recognize the tesseract global image and return the result as Tesseract
 // internal structures.
 // monitor is passed through to the recognition routines.
 // Depending on the engine's variables this either runs an interactive
 // session, trains from boxes, runs ambigs training, or performs normal
 // recognition (the latter only if the inttemp for the language is loaded).
 // Returns 0 on success and -1 if there is no engine, no image, line finding
 // fails, the session was interactive, or box training was requested without
 // an output name.
 int TessBaseAPI::Recognize(struct ETEXT_STRUCT* monitor) {
   if (tesseract_ == NULL)
     return -1;
   if (thresholder_ == NULL || thresholder_->IsEmpty()) {
     tprintf("Please call SetImage before attempting recognition.\n");
     return -1;
   }
   if (page_res_ != NULL)
     ClearResults();
   if (FindLines() != 0)
     return -1;
   // NOTE(review): input_file_ is dereferenced here and in the ambigs
   // training branch below; presumably FindLines substitutes kInputFile when
   // SetInputName was never called - confirm.
   if (tesseract_->tessedit_resegment_from_boxes)
     tesseract_->apply_boxes(*input_file_, block_list_);
   tesseract_->SetBlackAndWhitelist();

   page_res_ = new PAGE_RES(block_list_);
   if (interactive_mode) {
 #ifndef GRAPHICS_DISABLED
     tesseract_->pgeditor_main(block_list_);
 #endif
     // The page_res is invalid after an interactive session, so cleanup
     // in a way that lets us continue to the next page without crashing.
     delete page_res_;
     page_res_ = NULL;
     return -1;
   } else if (tesseract_->tessedit_train_from_boxes) {
     if (output_file_ == NULL) {
       // The output name is optional everywhere else (see Init), so it may
       // legitimately be missing here. Fail cleanly instead of dereferencing
       // NULL, and drop page_res_ so callers do not see bogus results.
       tprintf("Please call SetOutputName before training from boxes.\n");
       delete page_res_;
       page_res_ = NULL;
       return -1;
     }
     apply_box_training(*output_file_, block_list_);
   } else if (tesseract_->global_tessedit_ambigs_training) {
     FILE *ambigs_output_file = tesseract_->init_ambigs_training(*input_file_);
     // OCR the page segmented into words by tesseract.
     tesseract_->ambigs_training_segmented(
         *input_file_, page_res_, monitor, ambigs_output_file);
     fclose(ambigs_output_file);
   } else {
     // Now run the main recognition.
     // Running base tesseract if the inttemp for the current language loaded.
     if (tesseract_->inttemp_loaded_) {
       tesseract_->recog_all_words(page_res_, monitor);
     }
   }
   return 0;
 }

 // Tests the chopper by exhaustively running chop_one_blob.
 // For every word with at least one blob, the word is normalized, each blob
 // is classified, chop_one_blob is called until it returns false, and the
 // resulting choices are permuted into best_choice/raw_choice.
 // monitor is not used by this function.
 // Returns 0 on completion and -1 if there is no engine, no image, line
 // finding fails, or the current mode does not allow the test.
 int TessBaseAPI::RecognizeForChopTest(struct ETEXT_STRUCT* monitor) {
   if (tesseract_ == NULL)
     return -1;
   if (thresholder_ == NULL || thresholder_->IsEmpty()) {
     tprintf("Please call SetImage before attempting recognition.");
     return -1;
   }
   if (page_res_ != NULL)
     ClearResults();
   if (FindLines() != 0)
     return -1;
   // Additional conditions under which chopper test cannot be run
   if (tesseract_->tessedit_train_from_boxes_word_level || interactive_mode)
     return -1;
   ASSERT_HOST(tesseract_->inttemp_loaded_);

   page_res_ = new PAGE_RES(block_list_);

   PAGE_RES_IT page_res_it(page_res_);

   // Use the default matcher, with no tester and no trainer.
   tesseract_->tess_matcher = &Tesseract::tess_default_matcher;
   tesseract_->tess_tester = NULL;
   tesseract_->tess_trainer = NULL;

   while (page_res_it.word() != NULL) {
     WERD_RES *word_res = page_res_it.word();
     WERD *word = word_res->word;
     // Skip words that have no blobs at all.
     if (word->cblob_list()->empty()) {
       page_res_it.forward();
       continue;
     }
     WERD *bln_word = make_bln_copy(word, page_res_it.row()->row,
                                    page_res_it.block()->block,
                                    word_res->x_height, &word_res->denorm);
     ASSERT_HOST(!bln_word->blob_list()->empty());
     TWERD *tessword = make_tess_word(bln_word, NULL);
     // NOTE(review): the result of this second make_tess_word call is
     // discarded, so it cannot repair tessword; the ASSERT_HOST below still
     // fires if blobs is NULL. Looks like leftover debugging code - confirm.
     if (tessword->blobs == NULL) {
       make_tess_word(bln_word, NULL);
     }
     TBLOB *pblob;
     TBLOB *blob;
     init_match_table();
     BLOB_CHOICE_LIST *match_result;
     // NOTE(review): char_choices, bln_word and tessword are not released
     // anywhere in this function - confirm whether ownership passes
     // elsewhere or whether they leak.
     BLOB_CHOICE_LIST_VECTOR *char_choices = new BLOB_CHOICE_LIST_VECTOR();
     tesseract_->tess_denorm = &word_res->denorm;
     tesseract_->tess_word = bln_word;
     ASSERT_HOST(tessword->blobs != NULL);
     // Classify every blob, passing its previous and next blob along, and
     // collect one choice list per blob.
     for (blob = tessword->blobs, pblob = NULL;
          blob != NULL; blob = blob->next) {
       match_result = tesseract_->classify_blob(pblob, blob, blob->next, NULL,
                                    "chop_word:", Green);
       // NOTE(review): a NULL match_result is only reported here; it is
       // still dereferenced by the calls below.
       if (match_result == NULL)
         tprintf("Null classifier output!\n");
       tesseract_->modify_blob_choice(match_result, 0);
       ASSERT_HOST(!match_result->empty());
       *char_choices += match_result;
       pblob = blob;
     }
     inT32 blob_number;
     SEAMS seam_list = start_seam_list(tessword->blobs);
     int right_chop_index = 0;
     // Keep calling chop_one_blob until it returns false.
     while (tesseract_->chop_one_blob(tessword, char_choices,
                                     &blob_number, &seam_list,
                                     &right_chop_index))   {
     }

     // Start from "bad" choices and let the dictionary permuter fill them in.
     word_res->best_choice = new WERD_CHOICE();
     word_res->raw_choice = new WERD_CHOICE();
     word_res->best_choice->make_bad();
     word_res->raw_choice->make_bad();
     tesseract_->getDict().permute_characters(*char_choices, 1000.0,
                                              word_res->best_choice,
                                              word_res->raw_choice);

     word_res->outword = make_ed_word(tessword, bln_word);
     page_res_it.forward();
   }
   return 0;
 }

 // Make a text string from the internal data structures.
 char* TessBaseAPI::GetUTF8Text() {
   if (tesseract_ == NULL ||
       (page_res_ == NULL && Recognize(NULL) < 0))
     return NULL;
   int total_length = TextLength(NULL);
   PAGE_RES_IT   page_res_it(page_res_);
   char* result = new char[total_length];
   char* ptr = result;
   for (page_res_it.restart_page(); page_res_it.word () != NULL;
        page_res_it.forward()) {
     WERD_RES *word = page_res_it.word();
     WERD_CHOICE* choice = word->best_choice;
     if (choice != NULL) {
       strcpy(ptr, choice->unichar_string().string());
       ptr += choice->unichar_string().length();
       if (word->word->flag(W_EOL))
         *ptr++ = '\n';
       else
         *ptr++ = ' ';
     }
   }
   *ptr++ = '\n';
   *ptr = '\0';
   return result;
 }

 // Appends the box-file text for one word to word_str: for every blob of the
 // word, the UTF-8 bytes of its classification followed by
 // " left bottom right top\n". left and bottom are added to the blob
 // coordinates. The row argument is not used.
 // Returns the number of characters written (excluding the terminating NUL
 // left by sprintf). The caller must provide a large enough buffer.
 static int ConvertWordToBoxText(WERD_RES *word,
                                 ROW_RES* row,
                                 int left,
                                 int bottom,
                                 char* word_str) {
   // Work on a copy of the output word, denormalized back to image coords.
   WERD denormed_word;
   denormed_word = *(word->outword);
   denormed_word.baseline_denormalise(&word->denorm);
   PBLOB_IT blob_it;
   blob_it.set_to_list(denormed_word.blob_list());
   const int num_blobs = denormed_word.blob_list()->length();

   int written = 0;
   int utf8_offset = 0;
   for (int blob_index = 0; blob_index < num_blobs; ++blob_index) {
     TBOX blob_box = blob_it.data()->bounding_box();
     const bool unusable_box =
         word->tess_failed ||
         blob_box.left() < 0 ||
         blob_box.right() > page_image.get_xsize() ||
         blob_box.bottom() < 0 ||
         blob_box.top() > page_image.get_ysize();
     if (unusable_box) {
       // Bounding boxes can be illegal when tess fails on a word, so fall
       // back to the box of the original word.
       blob_box = word->word->bounding_box();
       tprintf("Using substitute bounding box at (%d,%d)->(%d,%d)\n",
               blob_box.left(), blob_box.bottom(),
               blob_box.right(), blob_box.top());
     }

     // One classification unit may consist of several UTF-8 bytes; copy
     // each of them to the output.
     const int unichar_len = word->best_choice->unichar_lengths()[blob_index];
     for (int byte = 0; byte < unichar_len; ++byte) {
       char ch = word->best_choice->unichar_string()[utf8_offset + byte];
       // Tesseract uses space for recognition failure. Replace it with the
       // reject character, kTesseractReject, so we don't create illegal
       // box files.
       if (ch == ' ')
         ch = kTesseractReject;
       word_str[written++] = ch;
     }
     // sprintf reports how many characters it appended.
     written += sprintf(word_str + written, " %d %d %d %d\n",
                        blob_box.left() + left, blob_box.bottom() + bottom,
                        blob_box.right() + left, blob_box.top() + bottom);

     utf8_offset += unichar_len;
     blob_it.forward();
   }
   return written;
 }

 // Multiplier for max expected textlength assumes typically 4 numbers @
 // (5 digits and a space) plus the newline = 4*(5+1)+1. Add to this the
 // original UTF8 characters, and one kMaxCharsPerChar.
 // GetBoxText uses this as the per-blob allowance when sizing its buffer.
 const int kCharsPerChar = 25;
 // A maximal single box could occupy 4 numbers at 20 digits (for 64 bit) and a
 // space plus the newline 4*(20+1)+1 and the maximum length of a UNICHAR.
 // Test against this on each iteration for safety.
 const int kMaxCharsPerChar = 85 + UNICHAR_LEN;

 // The recognized text is returned as a char* which is coded
 // as a UTF8 box file and must be freed with the delete [] operator.
 char* TessBaseAPI::GetBoxText() {
   int bottom = image_height_ - (rect_top_ + rect_height_);
   if (tesseract_ == NULL ||
       (page_res_ == NULL && Recognize(NULL) < 0))
     return NULL;
   int blob_count;
   int utf8_length = TextLength(&blob_count);
   int total_length = blob_count*kCharsPerChar + utf8_length + kMaxCharsPerChar;
   PAGE_RES_IT   page_res_it(page_res_);
   char* result = new char[total_length];
   char* ptr = result;
   for (page_res_it.restart_page(); page_res_it.word () != NULL;
        page_res_it.forward()) {
     WERD_RES *word = page_res_it.word();
     ptr += ConvertWordToBoxText(word, page_res_it.row(), rect_left_, bottom,
                                 ptr);
     // Just in case...
     if (ptr - result + kMaxCharsPerChar > total_length)
       break;
   }
   *ptr = '\0';
   return result;
 }

 // Conversion table for non-latin characters.
 // Maps characters out of the latin set into the latin set.
 // kUniChs and kLatinChs are parallel arrays: the entry at a given index of
 // kUniChs corresponds to the entry at the same index of kLatinChs, and both
 // arrays end with a 0 entry.
 // TODO(rays) incorporate this translation into unicharset.
 const int kUniChs[] = {
   0x20ac, 0x201c, 0x201d, 0x2018, 0x2019, 0x2022, 0x2014, 0
 };
 // Latin chars corresponding to the unicode chars above.
 const int kLatinChs[] = {
   0x00a2, 0x0022, 0x0022, 0x0027, 0x0027, 0x00b7, 0x002d, 0
 };

 // The recognized text is returned as a char* which is coded
 // as UNLV format Latin-1 with specific reject and suspect codes
 // and must be freed with the delete [] operator.
 // Returns NULL if tesseract_ is NULL, or if there are no results yet and
 // Recognize(NULL) reports failure (< 0).
 char* TessBaseAPI::GetUNLVText() {
   if (tesseract_ == NULL ||
       (page_res_ == NULL && Recognize(NULL) < 0))
     return NULL;
   // Output state carried from word to word:
   //   tilde_crunch_written  - a reject char has already been written for the
   //                           current run of crunched words.
   //   last_char_was_newline - nothing has been written on the current line.
   //   last_char_was_tilde   - the previous char written was kUNLVReject.
   bool tilde_crunch_written = false;
   bool last_char_was_newline = true;
   bool last_char_was_tilde = false;

   // NOTE(review): the buffer is sized solely from TextLength(NULL) and ptr is
   // never checked against total_length below - confirm the estimate always
   // covers the spaces, reject/suspect markers and newlines written here.
   int total_length = TextLength(NULL);
   PAGE_RES_IT   page_res_it(page_res_);
   char* result = new char[total_length];
   char* ptr = result;
   for (page_res_it.restart_page(); page_res_it.word () != NULL;
        page_res_it.forward()) {
     WERD_RES *word = page_res_it.word();
     // Process the current word.
     if (word->unlv_crunch_mode != CR_NONE) {
       // Crunched word: at most one reject char is written for it, and
       // nothing at all for CR_DELETE. A further reject char within the same
       // run is only written for a CR_KEEP_SPACE word with a definite
       // (non-fuzzy) preceding space.
       if (word->unlv_crunch_mode != CR_DELETE &&
           (!tilde_crunch_written ||
            (word->unlv_crunch_mode == CR_KEEP_SPACE &&
             word->word->space() > 0 &&
             !word->word->flag(W_FUZZY_NON) &&
             !word->word->flag(W_FUZZY_SP)))) {
         if (!word->word->flag(W_BOL) &&
             word->word->space() > 0 &&
             !word->word->flag(W_FUZZY_NON) &&
             !word->word->flag(W_FUZZY_SP)) {
           /* Write a space to separate from preceding good text */
           *ptr++ = ' ';
           last_char_was_tilde = false;
         }
         if (!last_char_was_tilde) {
           // Write a reject char.
           last_char_was_tilde = true;
           *ptr++ = kUNLVReject;
           tilde_crunch_written = true;
           last_char_was_newline = false;
         }
       }
     } else {
       // NORMAL PROCESSING of non tilde crunched words.
       tilde_crunch_written = false;

       if (word->word->flag(W_REP_CHAR) && tessedit_consistent_reps)
         ensure_rep_chars_are_consistent(word);

       tesseract_->set_unlv_suspects(word);
       // wordstr holds the UTF-8 text of the word; lengths[k] is the number of
       // bytes of wordstr occupied by the k-th character.
       // NOTE(review): best_choice is dereferenced without a NULL check here,
       // whereas TextLength() tolerates a NULL best_choice - confirm it can
       // never be NULL at this point.
       const char* wordstr = word->best_choice->unichar_string().string();
       const STRING& lengths = word->best_choice->unichar_lengths();
       int length = lengths.length();
       int i = 0;
       int offset = 0;

       if (last_char_was_tilde &&
           word->word->space() == 0 && wordstr[offset] == ' ') {
         // Prevent adjacent tilde across words - we know that adjacent tildes
         // within words have been removed.
         // Skip the first character.
         offset = lengths[i++];
       }
       if (i < length && wordstr[offset] != 0) {
         // Separate from the previous word on this line with a single space;
         // the first word on a line gets no leading space.
         if (!last_char_was_newline)
           *ptr++ = ' ';
         else
           last_char_was_newline = false;
         for (; i < length; offset += lengths[i++]) {
           if (wordstr[offset] == ' ' ||
               wordstr[offset] == kTesseractReject) {
             // A space or Tesseract reject inside a word becomes a UNLV reject.
             *ptr++ = kUNLVReject;
             last_char_was_tilde = true;
           } else {
             // Rejected characters are prefixed with the suspect marker.
             if (word->reject_map[i].rejected())
               *ptr++ = kUNLVSuspect;
             UNICHAR ch(wordstr + offset, lengths[i]);
             int uni_ch = ch.first_uni();
             // Translate via the kUniChs -> kLatinChs table if listed there.
             for (int j = 0; kUniChs[j] != 0; ++j) {
               if (kUniChs[j] == uni_ch) {
                 uni_ch = kLatinChs[j];
                 break;
               }
             }
             // Only code points that fit in one byte are written; anything
             // else is replaced by a reject char.
             if (uni_ch <= 0xff) {
               *ptr++ = static_cast<char>(uni_ch);
               last_char_was_tilde = false;
             } else {
               *ptr++ = kUNLVReject;
               last_char_was_tilde = true;
             }
           }
         }
       }
     }
     if (word->word->flag(W_EOL) && !last_char_was_newline) {
       /* Add a new line output */
       *ptr++ = '\n';
       tilde_crunch_written = false;
       last_char_was_newline = true;
       last_char_was_tilde = false;
     }
   }
   // The output always ends with one extra newline before the terminator.
   *ptr++ = '\n';
   *ptr = '\0';
   return result;
 }

 // Returns the average word confidence for Tesseract page result.
 int TessBaseAPI::MeanTextConf() {
   int* conf = AllWordConfidences();
   if (!conf) return 0;
   int sum = 0;
   int *pt = conf;
   while (*pt >= 0) sum += *pt++;
   if (pt != conf) sum /= pt - conf;
   delete [] conf;
   return sum;
 }

 // Returns an array of all word confidences, terminated by -1.
 int* TessBaseAPI::AllWordConfidences() {
   if (tesseract_ == NULL ||
       (page_res_ == NULL && Recognize(NULL) < 0))
     return NULL;
   int n_word = 0;
   PAGE_RES_IT res_it(page_res_);
   for (res_it.restart_page(); res_it.word() != NULL; res_it.forward())
     n_word++;

   int* conf = new int[n_word+1];
   n_word = 0;
   for (res_it.restart_page(); res_it.word() != NULL; res_it.forward()) {
     WERD_RES *word = res_it.word();
     WERD_CHOICE* choice = word->best_choice;
     int w_conf = static_cast<int>(100 + 5 * choice->certainty());
                  // This is the eq for converting Tesseract confidence to 1..100
     if (w_conf < 0) w_conf = 0;
     if (w_conf > 100) w_conf = 100;
     conf[n_word++] = w_conf;
   }
   conf[n_word] = -1;
   return conf;
 }

 // Releases the recognition results and the stored image data while keeping
 // the loaded recognition data, which would be slow to reload.
 // After this call, SetImage or TesseractRect must be called before any
 // Recognize or Get* operation.
 void TessBaseAPI::Clear() {
   if (thresholder_) {
     thresholder_->Clear();
   }
   ClearResults();
   page_image.destroy();
 }

 // Shuts tesseract down and frees all memory held by this object. End() is
 // equivalent to destructing and reconstructing your TessBaseAPI.
 // After End(), the only API functions that may be used are Init and
 // anything declared above it in the class definition.
 void TessBaseAPI::End() {
   // delete on a null pointer is a no-op, so the members are released and
   // reset unconditionally, in the same order as before.
   delete thresholder_;
   thresholder_ = NULL;

   delete page_res_;
   page_res_ = NULL;

   delete block_list_;
   block_list_ = NULL;

   // The engine needs an explicit shutdown call before it is destroyed.
   if (tesseract_ != NULL) {
     tesseract_->end_tesseract();
     delete tesseract_;
     tesseract_ = NULL;
   }

   delete input_file_;
   input_file_ = NULL;

   delete output_file_;
   output_file_ = NULL;

   delete datapath_;
   datapath_ = NULL;

   delete language_;
   language_ = NULL;
 }

 // Check whether a word is valid according to Tesseract's language model
 // returns 0 if the word is invalid, non-zero if valid
 int TessBaseAPI::IsValidWord(const char *word) {
   return tesseract_->getDict().valid_word(word);
 }


 // Reports the baseline of the single text line in the first block.
 // On success, *out_offset receives the baseline value at x = 0 (truncated
 // to int) and *out_slope the baseline change between x = 0 and x = 1, and
 // the function returns true.
 // Returns false if line finding fails, if there is no block list or it is
 // empty, or if the first block does not contain exactly one row; the
 // outputs are left untouched in those cases.
 bool TessBaseAPI::GetTextDirection(int* out_offset, float* out_slope) {
   // Run line finding only when there are no results yet, and give up if it
   // reports an error instead of reading a possibly incomplete block list.
   if (page_res_ == NULL && FindLines() < 0)
     return false;
   if (block_list_ == NULL || block_list_->length() < 1) {
     return false;
   }

   // Get first block
   BLOCK_IT block_it(block_list_);
   block_it.move_to_first();
   ROW_LIST* rows = block_it.data()->row_list();
   if (rows->length() != 1) {
     return false;
   }

   // Get first line of block
   ROW_IT row_it(rows);
   row_it.move_to_first();
   ROW* row = row_it.data();

   // Calculate offset and slope (NOTE: Kind of ugly)
   *out_offset = static_cast<int>(row->base_line(0.0));
   *out_slope = row->base_line(1.0) - row->base_line(0.0);

   return true;
 }

 // Redirects the dictionary's letter_is_okay_ function pointer to f.
 // Does nothing when tesseract_ is NULL.
 void TessBaseAPI::SetDictFunc(DictFunc f) {
   if (tesseract_ == NULL)
     return;
   tesseract_->getDict().letter_is_okay_ = f;
 }

 // Shared preparation for accepting a new image: makes sure a thresholder
 // exists and discards any previous results.
 // Returns false, after printing a message, if tesseract_ is NULL.
 bool TessBaseAPI::InternalSetImage() {
   if (tesseract_ != NULL) {
     if (!thresholder_) {
       thresholder_ = new ImageThresholder;
     }
     ClearResults();
     return true;
   }
   tprintf("Please call Init before attempting to send an image.");
   return false;
 }

 // Runs the thresholder to produce the thresholded image. With leptonica
 // available and a non-NULL pix, the result goes to *pix; otherwise it goes
 // to the internal IMAGE (page_image). Afterwards the rectangle and image
 // dimensions are refreshed from the thresholder and threshold_done_ is set.
 void TessBaseAPI::Threshold(Pix** pix) {
 #ifdef HAVE_LIBLEPT
   if (pix == NULL) {
     thresholder_->ThresholdToIMAGE(&page_image);
   } else {
     thresholder_->ThresholdToPix(pix);
   }
 #else
   thresholder_->ThresholdToIMAGE(&page_image);
 #endif
   thresholder_->GetImageSizes(&rect_left_, &rect_top_,
                               &rect_width_, &rect_height_,
                               &image_width_, &image_height_);
   threshold_done_ = true;
 }

 // Find lines from the image making the BLOCK_LIST.
 int TessBaseAPI::FindLines() {
   if (!block_list_->empty()) {
     return 0;
   }
   if (tesseract_ == NULL) {
     tesseract_ = new Tesseract;
     tesseract_->InitAdaptiveClassifier();
   }
 #ifdef HAVE_LIBLEPT
   if (tesseract_->pix_binary() == NULL)
     Threshold(tesseract_->mutable_pix_binary());
 #endif
   if (!threshold_done_)
     Threshold(NULL);

   if (tesseract_->SegmentPage(input_file_, &page_image, block_list_) < 0)
     return -1;
   ASSERT_HOST(page_image.get_xsize() == rect_width_ ||
               page_image.get_xsize() == rect_width_ - 1);
   ASSERT_HOST(page_image.get_ysize() == rect_height_ ||
               page_image.get_ysize() == rect_height_ - 1);
   return 0;
 }

 // Discards the page results and leaves an empty block list in place, ready
 // for a new page. Also marks thresholding as not done and clears the
 // engine's per-page state when an engine exists.
 void TessBaseAPI::ClearResults() {
   threshold_done_ = false;
   if (tesseract_ != NULL) {
     tesseract_->Clear();
   }

   // delete on a null pointer is a no-op, so no guard is needed.
   delete page_res_;
   page_res_ = NULL;

   if (block_list_ != NULL) {
     block_list_->clear();
   } else {
     block_list_ = new BLOCK_LIST;
   }
 }

 // Return the length of the output text string, as UTF8, assuming
 // one newline per line and one per block, with a terminator,
 // and assuming a single character reject marker for each rejected character.
 // Also return the number of recognized blobs in blob_count.
 int TessBaseAPI::TextLength(int* blob_count) {
   if (tesseract_ == NULL || page_res_ == NULL)
     return 0;

   PAGE_RES_IT   page_res_it(page_res_);
   int total_length = 2;
   int total_blobs = 0;
   // Iterate over the data structures to extract the recognition result.
   for (page_res_it.restart_page(); page_res_it.word () != NULL;
        page_res_it.forward()) {
     WERD_RES *word = page_res_it.word();
     WERD_CHOICE* choice = word->best_choice;
     if (choice != NULL) {
       total_blobs += choice->length() + 1;
       total_length += choice->unichar_string().length() + 1;
       for (int i = 0; i < word->reject_map.length(); ++i) {
         if (word->reject_map[i].rejected())
           ++total_length;
       }
     }
   }
   if (blob_count != NULL)
     *blob_count = total_blobs;
   return total_length;
 }

 // Runs orientation and script detection on the current image, writing the
 // outcome to osr.
 // Returns false when tesseract_ is NULL; otherwise returns the result of
 // orientation_and_script_detection().
 bool TessBaseAPI::DetectOS(OSResults* osr) {
   if (tesseract_ != NULL) {
     // Start from a clean slate and a freshly thresholded internal image.
     ClearResults();
     Threshold(NULL);
     if (!input_file_) {
       input_file_ = new STRING(kInputFile);
     }
     return orientation_and_script_detection(*input_file_, osr, tesseract_);
   }
   return false;
 }

 // ____________________________________________________________________________
 // Ocropus add-ons.

 // Runs FindLines() and hands the resulting BLOCK_LIST to the caller.
 // The internal block_list_ pointer is reset to NULL, so this object no
 // longer refers to the returned list.
 BLOCK_LIST* TessBaseAPI::FindLinesCreateBlockList() {
   FindLines();
   BLOCK_LIST* detached = block_list_;
   block_list_ = NULL;
   return detached;
 }

 // Delete a block list.
 // This is to keep BLOCK_LIST pointer opaque
 // and let go of including the other headers.
 // block_list is released with plain delete, so it must have been allocated
 // with new (as the lists returned by FindLinesCreateBlockList are).
 void TessBaseAPI::DeleteBlockList(BLOCK_LIST *block_list) {
   delete block_list;
 }


 // Allocates a ROW with a single baseline segment starting at x = -32000
 // whose quadratic coefficients are {0, 0, baseline}, i.e. a horizontal
 // baseline. The remaining constructor arguments are xheight,
 // ascender - (baseline + xheight), descender - baseline, and two zeros.
 // The caller owns the returned ROW and must delete it.
 static ROW *make_tess_ocrrow(float baseline,
                              float xheight,
                              float descender,
                              float ascender) {
   inT32 segment_starts[] = {-32000};
   double baseline_coeffs[] = {0, 0, baseline};
   ROW *row = new ROW(1, segment_starts, baseline_coeffs,
                      xheight,
                      ascender - (baseline + xheight),
                      descender - baseline,
                      0, 0);
   return row;
 }

 // Almost a copy of make_tess_row() from ccmain/tstruct.cpp.
 // Fills *tessrow with a flat, single-segment baseline and xheight spline
 // placed at the bln_* normalized positions, and scales the ascender rise
 // and descender drop of the input line into bln_x_height units.
 // Note that xheight is used as a divisor, so it must be non-zero.
 static void fill_dummy_row(float baseline, float xheight,
                            float descender, float ascender,
                            TEXTROW* tessrow) {
   // Baseline: one segment spanning [-32767, 32767] with a = b = 0, so it is
   // the constant bln_baseline_offset.
   tessrow->baseline.segments = 1;
   tessrow->baseline.xstarts[0] = -32767;
   tessrow->baseline.xstarts[1] = 32767;
   tessrow->baseline.quads[0].a = 0;
   tessrow->baseline.quads[0].b = 0;
   tessrow->baseline.quads[0].c = bln_baseline_offset;
   // Xheight line: same shape, bln_x_height above the baseline.
   tessrow->xheight.segments = 1;
   tessrow->xheight.xstarts[0] = -32767;
   tessrow->xheight.xstarts[1] = 32767;
   tessrow->xheight.quads[0].a = 0;
   tessrow->xheight.quads[0].b = 0;
   tessrow->xheight.quads[0].c = bln_baseline_offset + bln_x_height;
   tessrow->lineheight = bln_x_height;
   // Rescale the input ascender/descender extents by bln_x_height / xheight.
   tessrow->ascrise = bln_x_height * (ascender - (xheight + baseline)) / xheight;
   tessrow->descdrop = bln_x_height * (descender - baseline) / xheight;
 }


 // Return a TBLOB * from the whole page_image.
 // To be freed later with free_blob().
 // All connected components found in page_image are merged into a single
 // PBLOB, which is normalized against a flat row built from baseline,
 // xheight, descender and ascender and then converted to a TBLOB.
 TBLOB *make_tesseract_blob(float baseline, float xheight,
                            float descender, float ascender) {
   BLOCK *block = new BLOCK("a character",
                            TRUE,
                            0, 0,
                            0, 0,
                            page_image.get_xsize(),
                            page_image.get_ysize());

   // Create C_BLOBs from the page
   extract_edges(
 #ifndef GRAPHICS_DISABLED
                 NULL,
 #endif
                 &page_image, &page_image,
                 ICOORD(page_image.get_xsize(), page_image.get_ysize()),
                 block);

   // Create one PBLOB from all C_BLOBs
   C_BLOB_LIST *list = block->blob_list();
   C_BLOB_IT c_blob_it(list);
   PBLOB *pblob = new PBLOB;  // will be (hopefully) deleted by the pblob_list
   for (c_blob_it.mark_cycle_pt();
        !c_blob_it.cycled_list();
        c_blob_it.forward()) {
       C_BLOB *c_blob = c_blob_it.data();
       PBLOB c_as_p(c_blob, baseline + xheight);
       merge_blobs(pblob, &c_as_p);
   }
   PBLOB_LIST *pblob_list = new PBLOB_LIST;  // will be deleted by the word
   PBLOB_IT pblob_it(pblob_list);
   pblob_it.add_after_then_move(pblob);

   // Normalize PBLOB
   WERD word(pblob_list, 0, " ");
   ROW *row = make_tess_ocrrow(baseline, xheight, descender, ascender);
   word.baseline_normalise(row);
   delete row;

   // Create a TBLOB from PBLOB
   TBLOB *tblob = make_tess_blob(pblob, /* flatten: */ TRUE);

   // The block and its C_BLOBs were only needed to build the PBLOB; release
   // them here so they are not leaked on every call.
   delete block;
   return tblob;
 }


 // Adapt to recognize the current image as the given character.
 // The image must be preloaded and be just an image of a single character.
 void TessBaseAPI::AdaptToCharacter(const char *unichar_repr,
                                    int length,
                                    float baseline,
                                    float xheight,
                                    float descender,
                                    float ascender) {
   UNICHAR_ID id = tesseract_->unicharset.unichar_to_id(unichar_repr, length);
   LINE_STATS LineStats;
   TEXTROW row;
   fill_dummy_row(baseline, xheight, descender, ascender, &row);
   GetLineStatsFromRow(&row, &LineStats);

   TBLOB *blob = make_tesseract_blob(baseline, xheight, descender, ascender);
   float threshold;
   UNICHAR_ID best_class = 0;
   float best_rating = -100;


   // Classify to get a raw choice.
   BLOB_CHOICE_LIST choices;
   tesseract_->AdaptiveClassifier(blob, NULL, &row, &choices, NULL);
   BLOB_CHOICE_IT choice_it;
   choice_it.set_to_list(&choices);
   for (choice_it.mark_cycle_pt(); !choice_it.cycled_list();
        choice_it.forward()) {
     if (choice_it.data()->rating() > best_rating) {
       best_rating = choice_it.data()->rating();
       best_class = choice_it.data()->unichar_id();
     }
   }

   if (id == best_class) {
     threshold = matcher_good_threshold;
   } else {
     /* the blob was incorrectly classified - find the rating threshold
        needed to create a template which will correct the error with
        some margin.  However, don't waste time trying to make
        templates which are too tight. */
     threshold = tesseract_->GetBestRatingFor(blob, &LineStats, id);
     threshold *= .9;
     const float max_threshold = .125;
     const float min_threshold = .02;

     if (threshold > max_threshold)
         threshold = max_threshold;

     // I have cuddled the following line to set it out of the strike
     // of the coverage testing tool. I have no idea how to trigger
     // this situation nor I have any necessity to do it. --mezhirov
     if (threshold < min_threshold) threshold = min_threshold;
   }

   if (blob->outlines)
     tesseract_->AdaptToChar(blob, &LineStats, id, threshold);
   free_blob(blob);
 }


 // Builds a fresh PAGE_RES over block_list, runs recog_all_words on it with
 // pass number 1, and returns it. The caller owns the returned PAGE_RES.
 PAGE_RES* TessBaseAPI::RecognitionPass1(BLOCK_LIST* block_list) {
   PAGE_RES *pass1_result = new PAGE_RES(block_list);
   tesseract_->recog_all_words(pass1_result, NULL, NULL, 1);
   return pass1_result;
 }

 // Runs recog_all_words with pass number 2 on pass1_result and returns it.
 // When pass1_result is NULL, a fresh PAGE_RES over block_list is created
 // and used instead.
 PAGE_RES* TessBaseAPI::RecognitionPass2(BLOCK_LIST* block_list,
                                         PAGE_RES* pass1_result) {
   PAGE_RES *page_res =
       pass1_result != NULL ? pass1_result : new PAGE_RES(block_list);
   tesseract_->recog_all_words(page_res, NULL, NULL, 2);
   return page_res;
 }

 // One recognized character: its UTF-8 bytes, a cost and a bounding box.
 // Instances are list elements (see the ELISTIZE macros below).
 // NOTE(review): the struct owns unicode_repr but defines no copy
 // constructor or assignment; copying an instance would double-delete the
 // buffer - confirm nothing copies TESS_CHAR by value.
 struct TESS_CHAR : ELIST_LINK {
   char *unicode_repr;  // Owned, NUL-terminated copy of the character bytes.
   int length;  // of unicode_repr
   float cost;
   TBOX box;

   // Copies len bytes of repr, or the whole NUL-terminated string when len
   // is -1.
   TESS_CHAR(float _cost, const char *repr, int len = -1) : cost(_cost) {
     length = (len == -1 ? static_cast<int>(strlen(repr)) : len);
     unicode_repr = new char[length + 1];
     strncpy(unicode_repr, repr, length);
     // strncpy does not terminate the copy when it stops at length bytes.
     unicode_repr[length] = '\0';
   }

   // Satisfies ELISTIZE. Members are initialized so that the destructor
   // never calls delete [] on an indeterminate pointer.
   TESS_CHAR() : unicode_repr(NULL), length(0), cost(0) {
   }
   ~TESS_CHAR() {
     delete [] unicode_repr;
   }
 };

 ELISTIZEH(TESS_CHAR)
 ELISTIZE(TESS_CHAR)

 // Appends a zero-cost TESS_CHAR holding a single space after the
 // iterator's current position and moves the iterator onto it.
 static void add_space(TESS_CHAR_IT* it) {
   it->add_after_then_move(new TESS_CHAR(0, " "));
 }


 // Maps a rating to a cost of 100 + 5 * rating, with negative results
 // clamped to 0.
 static float rating_to_cost(float rating) {
   const float cost = 100 + 5*rating;
   // Ratings worse than -100 have never been observed, but the clamp is
   // cheap insurance.
   return cost < 0 ? 0 : cost;
 }


 // Extract the OCR results, costs (penalty points for uncertainty),
 // and the bounding boxes of the characters.
 // For every word, one TESS_CHAR is appended to *out per entry of the best
 // choice's unichar_lengths string; words are separated by a single-space
 // TESS_CHAR. Every character of a word gets the same cost, derived from
 // the word's certainty. Character boxes are taken from the word's outword
 // blobs and then rescaled so that their union maps onto the bounding box
 // of the original word.
 // NOTE(review): this assumes outword has at least one blob per character;
 // confirm against the callers.
 static void extract_result(TESS_CHAR_IT* out,
                            PAGE_RES* page_res) {
   PAGE_RES_IT page_res_it(page_res);
   int word_count = 0;
   while (page_res_it.word() != NULL) {
     WERD_RES *word = page_res_it.word();
     const char *str = word->best_choice->unichar_string().string();
     const char *len = word->best_choice->unichar_lengths().string();

     if (word_count)
       add_space(out);
     TBOX bln_rect;
     PBLOB_LIST *blobs = word->outword->blob_list();
     PBLOB_IT it(blobs);
     int n = strlen(len);
     // Pointers to the boxes stored in the new TESS_CHARs, so they can be
     // rewritten once the union box of the whole word is known.
     TBOX** boxes_to_fix = new TBOX*[n];
     for (int i = 0; i < n; i++) {
       PBLOB *blob = it.data();
       TBOX current = blob->bounding_box();
       bln_rect = bln_rect.bounding_union(current);
       TESS_CHAR *tc = new TESS_CHAR(rating_to_cost(word->best_choice->certainty()),
                                     str, *len);
       tc->box = current;
       boxes_to_fix[i] = &tc->box;

       out->add_after_then_move(tc);
       it.forward();
       str += *len;
       len++;
     }

     // Find the word bbox before normalization.
     // Here we can't use the C_BLOB bboxes directly,
     // since connected letters are not yet cut.
     TBOX real_rect = word->word->bounding_box();

     // Denormalize boxes by transforming the bbox of the whole bln word
     // into the denorm bbox (`real_rect') of the whole word.
     // A zero-width or zero-height bln box would make the division produce
     // inf/NaN, and converting that to int is undefined. In that case every
     // box offset along that axis is 0 anyway, so a stretch of 0 is used.
     const int bln_width = bln_rect.width();
     const int bln_height = bln_rect.height();
     double x_stretch = 0.0;
     if (bln_width != 0)
       x_stretch = static_cast<double>(real_rect.width()) / bln_width;
     double y_stretch = 0.0;
     if (bln_height != 0)
       y_stretch = static_cast<double>(real_rect.height()) / bln_height;
     for (int j = 0; j < n; j++) {
       TBOX *box = boxes_to_fix[j];
       int x0 = static_cast<int>(real_rect.left() +
                    x_stretch * (box->left() - bln_rect.left()) + 0.5);
       int x1 = static_cast<int>(real_rect.left() +
                    x_stretch * (box->right() - bln_rect.left()) + 0.5);
       int y0 = static_cast<int>(real_rect.bottom() +
                    y_stretch * (box->bottom() - bln_rect.bottom()) + 0.5);
       int y1 = static_cast<int>(real_rect.bottom() +
                    y_stretch * (box->top() - bln_rect.bottom()) + 0.5);
       *box = TBOX(ICOORD(x0, y0), ICOORD(x1, y1));
     }
     delete [] boxes_to_fix;

     page_res_it.forward();
     word_count++;
   }
 }


 // Extract the OCR results, costs (penalty points for uncertainty),
 // and the bounding boxes of the characters.
 // Returns the number of characters n. Each of *lengths, *costs, *x0, *y0,
 // *x1 and *y1 receives a new[]-allocated array of n entries, and *text a
 // new[]-allocated buffer holding the concatenated character bytes, where
 // (*lengths)[i] is the byte count of character i. None of these arrays is
 // freed here.
 // *text is additionally NUL-terminated, so it can safely be handled as a
 // C string; the terminator is not included in any length.
 int TessBaseAPI::TesseractExtractResult(char** text,
                                         int** lengths,
                                         float** costs,
                                         int** x0,
                                         int** y0,
                                         int** x1,
                                         int** y1,
                                         PAGE_RES* page_res) {
   TESS_CHAR_LIST tess_chars;
   TESS_CHAR_IT tess_chars_it(&tess_chars);
   extract_result(&tess_chars_it, page_res);
   tess_chars_it.move_to_first();
   int n = tess_chars.length();
   int text_len = 0;
   *lengths = new int[n];
   *costs = new float[n];
   *x0 = new int[n];
   *y0 = new int[n];
   *x1 = new int[n];
   *y1 = new int[n];
   // First pass: per-character metadata and the total text size.
   int i = 0;
   for (tess_chars_it.mark_cycle_pt();
        !tess_chars_it.cycled_list();
        tess_chars_it.forward(), i++) {
     TESS_CHAR *tc = tess_chars_it.data();
     text_len += (*lengths)[i] = tc->length;
     (*costs)[i] = tc->cost;
     (*x0)[i] = tc->box.left();
     (*y0)[i] = tc->box.bottom();
     (*x1)[i] = tc->box.right();
     (*y1)[i] = tc->box.top();
   }
   // One extra byte for the terminator.
   char *p = *text = new char[text_len + 1];

   // Second pass: concatenate the character bytes.
   tess_chars_it.move_to_first();
   for (tess_chars_it.mark_cycle_pt();
         !tess_chars_it.cycled_list();
        tess_chars_it.forward()) {
     TESS_CHAR *tc = tess_chars_it.data();
     strncpy(p, tc->unicode_repr, tc->length);
     p += tc->length;
   }
   *p = '\0';
   return n;
 }

 // This method returns the features associated with the current image.
 // Make sure setimage has been called before calling this method.
 // The whole page is treated as one block: all connected components are
 // merged into a single blob, which is normalized against a row whose
 // xheight is the image height, and its char-norm features are written to
 // int_features. *num_features receives the value returned by
 // GetCharNormFeatures.
 void TessBaseAPI::GetFeatures(INT_FEATURE_ARRAY int_features,
                               int* num_features) {
   if (page_res_ != NULL)
     ClearResults();
   if (!threshold_done_)
     Threshold(NULL);
   // We have only one block, which is of the size of the page.
   BLOCK_LIST* blocks = new BLOCK_LIST;
   BLOCK *block = new BLOCK("",                       // filename.
                            TRUE,                     // proportional.
                            0,                        // kerning.
                            0,                        // spacing.
                            0,                        // Left.
                            0,                        // Bottom.
                            page_image.get_xsize(),   // Right.
                            page_image.get_ysize());  // Top.
   ICOORD bleft, tright;
   block->bounding_box (bleft, tright);

   // The list takes the block; both are released by "delete blocks" below.
   BLOCK_IT block_it_add = blocks;
   block_it_add.add_to_end(block);

   ICOORD page_tr(page_image.get_xsize(), page_image.get_ysize());
   TEXTROW tessrow;
   make_tess_row(NULL,       // Denormalizer.
                 &tessrow);  // Output row.
   LINE_STATS line_stats;
   GetLineStatsFromRow(&tessrow, &line_stats);

   // Perform a CC analysis to detect the blobs.
   BLOCK_IT block_it = blocks;
   for (block_it.mark_cycle_pt (); !block_it.cycled_list ();
        block_it.forward ()) {
     BLOCK* page_block = block_it.data();
 #ifndef GRAPHICS_DISABLED
     extract_edges(NULL,         // Scrollview window.
                   &page_image,  // Image.
                   &page_image,  // Thresholded image.
                   page_tr,      // corner of page.
                   page_block);  // block.
 #else
     extract_edges(&page_image,  // Image.
                   &page_image,  // Thresholded image.
                   page_tr,      // corner of page.
                   page_block);  // block.
 #endif
     C_BLOB_IT blob_it = page_block->blob_list();
     PBLOB *pblob = new PBLOB;
     // Iterate over all blobs found and merge them into one PBLOB.
     for (blob_it.mark_cycle_pt(); !blob_it.cycled_list();
          blob_it.forward()) {
       C_BLOB* blob = blob_it.data();
       PBLOB c_as_p(blob, page_image.get_ysize());
       merge_blobs(pblob, &c_as_p);
     }

     PBLOB_LIST *pblob_list = new PBLOB_LIST;
     PBLOB_IT pblob_it(pblob_list);
     pblob_it.add_after_then_move(pblob);
     WERD word(pblob_list,  // Blob list.
               0,           // Blanks in front.
               " ");        // Correct text.
     ROW *row = make_tess_ocrrow(0,                       // baseline.
                                 page_image.get_ysize(),  // xheight.
                                 0,                       // descender.
                                 0);                      // ascender.
     word.baseline_normalise(row);
     delete row;
     if (pblob->out_list () == NULL) {
       tprintf("Blob list is empty");
     }
     TBLOB* tblob = make_tess_blob(pblob,  // Blob.
                                   TRUE);  // Flatten.

     CLASS_NORMALIZATION_ARRAY norm_array;
     inT32 len;
     *num_features = tesseract_->GetCharNormFeatures(
         tblob, &line_stats,
         tesseract_->PreTrainedTemplates,
         int_features, norm_array, &len);
     // The TBLOB was only needed for feature extraction; free it as
     // AdaptToCharacter does, so it is not leaked.
     free_blob(tblob);
   }
   delete blocks;
 }

 // Return the pointer to the i-th dawg loaded into tesseract_ object.
 // Returns NULL if tesseract_ is NULL or i is outside [0, NumDawgs()).
 const Dawg *TessBaseAPI::GetDawg(int i) const {
   // Negative indices are rejected as well as indices past the end.
   if (tesseract_ == NULL || i < 0 || i >= NumDawgs()) return NULL;
   return tesseract_->getDict().GetDawg(i);
 }

 // Reports how many dawgs the dictionary of tesseract_ holds, or 0 when
 // tesseract_ is NULL.
 int TessBaseAPI::NumDawgs() const {
   if (tesseract_ == NULL)
     return 0;
   return tesseract_->getDict().NumDawgs();
 }

 // Returns the language string stored in tesseract_, or "" when tesseract_
 // is NULL or the stored string is NULL.
 const char* TessBaseAPI::GetLastInitLanguage() const {
   if (tesseract_ == NULL)
     return "";
   const char* language = tesseract_->lang.string();
   return language == NULL ? "" : language;
 }
 }  // namespace tesseract.