ccmain/baseapi.cpp - platform/external/tesseract - Git at Google

 /**********************************************************************
  * File:        baseapi.cpp
  * Description: Simple API for calling tesseract.
  * Author:      Ray Smith
  * Created:     Fri Oct 06 15:35:01 PDT 2006
  *
  * (C) Copyright 2006, Google Inc.
  ** Licensed under the Apache License, Version 2.0 (the "License");
  ** you may not use this file except in compliance with the License.
  ** You may obtain a copy of the License at
  ** http://www.apache.org/licenses/LICENSE-2.0
  ** Unless required by applicable law or agreed to in writing, software
  ** distributed under the License is distributed on an "AS IS" BASIS,
  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  ** See the License for the specific language governing permissions and
  ** limitations under the License.
  *
  **********************************************************************/

 // Include automatically generated configuration file if running autoconf.
 #ifdef HAVE_CONFIG_H
 #include "config_auto.h"
 #endif

 #ifdef HAVE_LIBLEPT
 // Include leptonica library only if autoconf (or makefile etc) tell us to.
 #include "allheaders.h"
 #endif

 #include "baseapi.h"

 #include "thresholder.h"
 #include "tesseractmain.h"
 #include "tesseractclass.h"
 #include "tessedit.h"
 #include "ocrclass.h"
 #include "pageres.h"
 #include "tessvars.h"
 #include "control.h"
 #include "applybox.h"
 #include "pgedit.h"
 #include "varabled.h"
 #include "variables.h"
 #include "output.h"
 #include "mainblk.h"
 #include "globals.h"
 #include "adaptmatch.h"
 #include "edgblob.h"
 #include "tessbox.h"
 #include "tordvars.h"
 #include "imgs.h"
 #include "makerow.h"
 #include "tstruct.h"
 #include "tessout.h"
 #include "tface.h"
 #include "permute.h"
 #include "otsuthr.h"
 #include "osdetect.h"
 #ifndef HAVE_LIBLEPT
 #include "pageseg.h"
 #include "blread.h"
 #endif//HAVE_LIBLEPT


 namespace tesseract {

 // Minimum sensible image size to be worth running tesseract.
 // TesseractRect returns NULL for rectangles narrower or shorter than this.
 const int kMinRectSize = 10;
 // Character returned when Tesseract couldn't recognize as anything.
 // Substituted for ' ' in box output and mapped to kUNLVReject in UNLV output.
 const char kTesseractReject = '~';
 // Character used by UNLV error counter as a reject.
 const char kUNLVReject = '~';
 // Character used by UNLV as a suspect marker.
 const char kUNLVSuspect = '^';
 // Filename used for input image file, from which to derive a name to search
 // for a possible UNLV zone file, if none is specified by SetInputName.
 const char* kInputFile = "noname.tif";

 // Constructs an API object with no engine, no thresholder, no results and
 // no file names; all geometry members start at zero.
 TessBaseAPI::TessBaseAPI()
   : tesseract_(NULL),
     // Thresholder is initialized to NULL here, but will be set before use by:
     // A constructor of a derived API,  SetThresholder(), or
     // created implicitly when used in InternalSetImage.
     thresholder_(NULL),
     threshold_done_(false),
     block_list_(NULL),
     page_res_(NULL),
     input_file_(NULL),
     output_file_(NULL),
     datapath_(NULL),
     language_(NULL),
     rect_left_(0), rect_top_(0), rect_width_(0), rect_height_(0),
     image_width_(0), image_height_(0) {
 }

 // Destructor: releases everything the object owns by delegating to End().
 TessBaseAPI::~TessBaseAPI() {
   End();
 }

 // Records the input file name (needed only for training and for locating
 // a UNLV zone file). The string is copied; the holder is created lazily
 // on first use and overwritten on later calls.
 void TessBaseAPI::SetInputName(const char* name) {
   if (input_file_ != NULL) {
     *input_file_ = name;
     return;
   }
   input_file_ = new STRING(name);
 }

 // Records the base name for output files (needed only for debugging).
 // The string is copied; the holder is created lazily on first use and
 // overwritten on later calls.
 void TessBaseAPI::SetOutputName(const char* name) {
   if (output_file_ != NULL) {
     *output_file_ = name;
     return;
   }
   output_file_ = new STRING(name);
 }

 // Assigns a value to an internal "variable" of either the new or the old
 // style. Both the name and the value are given as strings, exactly as they
 // would appear in a config file.
 // Returns false if neither lookup recognized the name.
 // May be called before Init to influence initialization (an engine object
 // is created on demand), but End discards all settings, so they must be
 // re-applied before the next Init.
 bool TessBaseAPI::SetVariable(const char* variable, const char* value) {
   if (tesseract_ == NULL) {
     tesseract_ = new Tesseract;
   }
   // New-style variables take precedence; old-style is only the fallback.
   return set_new_style_variable(variable, value) ||
          set_old_style_variable(variable, value);
 }

 // The datapath must be the name of the data directory (no ending /) or
 // some other file in which the data directory resides (for instance argv[0].)
 // The language is (usually) an ISO 639-3 string or NULL will default to eng.
 // If numeric_mode is true, then only digits and Roman numerals will
 // be returned.
 int TessBaseAPI::Init(const char* datapath, const char* language) {
   // If the datapath or the language have changed, then start again.
   if (tesseract_ != NULL &&
       (datapath_ == NULL || language_ == NULL ||
        *datapath_ != datapath || *language_ != language)) {
     tesseract_->end_tesseract();
     delete tesseract_;
     tesseract_ = NULL;
   }
   if (datapath_ == NULL)
     datapath_ = new STRING(datapath);
   else
     *datapath_ = datapath;
   if (language_ == NULL)
     language_ = new STRING(language);
   else
     *language_ = language;
   if (tesseract_ == NULL) {
     tesseract_ = new Tesseract;
     return tesseract_->init_tesseract(datapath,
                           output_file_ != NULL ? output_file_->string() : NULL,
                           language, NULL, 0, NULL);
   }
   // For same language and datapath, just reseth the adaptive classifier.
   tesseract_->ResetAdaptiveClassifier();
   return 0;
 }

 // Initializes only the language-model component of Tesseract, creating the
 // engine object first if there is none. After this call the only functions
 // that work are SetVariable and IsValidWord.
 // Returns whatever init_tesseract_lm returns.
 // WARNING: temporary! This function will be removed from here and placed
 // in a separate API at some future time.
 int TessBaseAPI::InitLangMod(const char* datapath, const char* language) {
   if (!tesseract_) {
     tesseract_ = new Tesseract;
   }
   const int status =
       tesseract_->init_tesseract_lm(datapath, NULL, language, NULL, 0, NULL);
   return status;
 }

 // Read a "config" file containing a set of variable, value pairs.
 // Searches the standard places: <datadir>configs/ and <datadir>tessconfigs/
 // (in that order), and otherwise treats filename as a relative or absolute
 // path name.
 // Returns false if no engine exists yet (the data directory is unknown
 // before Init/SetVariable) or if filename is NULL; otherwise returns true,
 // as it always has, regardless of what the parse reports.
 bool TessBaseAPI::ReadConfigFile(const char* filename) {
   // Without an engine there is no datadir to search and nothing to
   // configure; previously this dereferenced a NULL pointer.
   if (tesseract_ == NULL || filename == NULL)
     return false;

   STRING path = tesseract_->datadir;
   path += "configs/";
   path += filename;
   // fopen is used purely as an existence/readability probe.
   FILE* fp = fopen(path.string(), "r");
   if (fp != NULL) {
     fclose(fp);
   } else {
     path = tesseract_->datadir;
     path += "tessconfigs/";
     path += filename;
     fp = fopen(path.string(), "r");
     if (fp != NULL) {
       fclose(fp);
     } else {
       // Not in either standard directory: use the name as given.
       path = filename;
     }
   }
   // NOTE(review): the engine-level pass runs only when read_variables_file
   // returns non-zero; confirm what that return value signals.
   if (read_variables_file(path.string())) {
     tesseract_->read_variables(path.string());
   }
   return true;
 }

 // Recognizes a rectangle of a raw image and returns the text.
 // May be called many times for a single Init.
 // Currently has no error checking beyond the minimum-size test.
 // Greyscale of 8 and color of 24 or 32 bits per pixel may be given.
 // Palette color images will not work properly and must be converted to
 // 24 bit.
 // Binary images of 1 bit per pixel may also be given but they must be
 // byte packed with the MSB of the first byte being the first pixel, and a
 // one pixel is WHITE. For binary images set bytes_per_pixel=0.
 // Returns NULL if there is no engine or the rectangle is smaller than
 // kMinRectSize in either dimension; otherwise returns GetUTF8Text()'s
 // result, a UTF8 char* that must be freed with the delete [] operator.
 char* TessBaseAPI::TesseractRect(const unsigned char* imagedata,
                                  int bytes_per_pixel,
                                  int bytes_per_line,
                                  int left, int top,
                                  int width, int height) {
   const bool too_small = width < kMinRectSize || height < kMinRectSize;
   if (tesseract_ == NULL || too_small)
     return NULL;  // Nothing worth doing.

   // This original api doesn't give the exact size of the image, so the
   // full image width is derived from the line stride instead.
   int bits_per_pixel = 1;
   if (bytes_per_pixel != 0)
     bits_per_pixel = bytes_per_pixel * 8;
   const int full_width = bytes_per_line * 8 / bits_per_pixel;

   SetImage(imagedata, full_width, height, bytes_per_pixel, bytes_per_line);
   SetRectangle(left, top, width, height);
   return GetUTF8Text();
 }

 // Call between pages or documents etc to free up memory and forget
 // adaptive data. Does nothing when no engine exists.
 void TessBaseAPI::ClearAdaptiveClassifier() {
   if (tesseract_ != NULL) {
     tesseract_->ResetAdaptiveClassifier();
   }
 }

 // Provide an image for Tesseract to recognize. Format is as
 // TesseractRect above. Does not copy the image buffer, or take
 // ownership. The source image may be destroyed after Recognize is called,
 // either explicitly or implicitly via one of the Get*Text functions.
 // SetImage clears all recognition results, and sets the rectangle to the
 // full image, so it may be followed immediately by a GetUTF8Text, and it
 // will automatically perform recognition.
 // Silently does nothing (beyond the message printed by InternalSetImage)
 // when no engine has been initialized.
 void TessBaseAPI::SetImage(const unsigned char* imagedata,
                            int width, int height,
                            int bytes_per_pixel, int bytes_per_line) {
   if (!InternalSetImage())
     return;
   thresholder_->SetImage(imagedata, width, height,
                          bytes_per_pixel, bytes_per_line);
 }

 #ifdef HAVE_LIBLEPT
 // Provide an image for Tesseract to recognize. As with the raw-buffer
 // SetImage, Tesseract doesn't take a copy or ownership or pixDestroy the
 // image, so it must persist until after Recognize.
 // Pix vs raw, which to use?
 // Use Pix where possible. A future version of Tesseract may choose to use Pix
 // as its internal representation and discard IMAGE altogether.
 // Because of that, an implementation that sources and targets Pix may end up
 // with fewer copies than an implementation that does not.
 void TessBaseAPI::SetImage(const Pix* pix) {
   if (!InternalSetImage())
     return;
   thresholder_->SetImage(pix);
 }
 #endif

 // Restrict recognition to a sub-rectangle of the image. Call after SetImage.
 // Each SetRectangle clears the recognition results so multiple rectangles
 // can be recognized with the same image.
 // Does nothing if no thresholder exists yet.
 void TessBaseAPI::SetRectangle(int left, int top, int width, int height) {
   if (thresholder_ != NULL) {
     thresholder_->SetRectangle(left, top, width, height);
     ClearResults();
   }
 }

 #ifdef HAVE_LIBLEPT
 // ONLY available if you have Leptonica installed.
 // Get a copy of the internal thresholded image from Tesseract.
 // Runs the thresholder first if that has not been done for the current
 // image. Returns NULL if there is no engine, or if thresholding is still
 // pending but no image has been supplied (same precondition that
 // Recognize enforces).
 Pix* TessBaseAPI::GetThresholdedImage() {
   if (tesseract_ == NULL)
     return NULL;
   if (!threshold_done_) {
     // Threshold() dereferences thresholder_ unconditionally, so refuse to
     // call it without a usable source image.
     if (thresholder_ == NULL || thresholder_->IsEmpty())
       return NULL;
     Threshold();
   }
   return page_image.ToPix();
 }
 #endif  // HAVE_LIBLEPT

 // Dump the internal binary image to a PGM file.
 void TessBaseAPI::DumpPGM(const char* filename) {
   if (tesseract_ == NULL)
     return;
   IMAGELINE line;
   line.init(page_image.get_xsize());
   FILE *fp = fopen(filename, "w");
   fprintf(fp, "P5 " INT32FORMAT " " INT32FORMAT " 255\n",
           page_image.get_xsize(), page_image.get_ysize());
   for (int j = page_image.get_ysize()-1; j >= 0 ; --j) {
     page_image.get_line(0, j, page_image.get_xsize(), &line, 0);
     for (int i = 0; i < page_image.get_xsize(); ++i) {
       uinT8 b = line.pixels[i] ? 255 : 0;
       fwrite(&b, 1, 1, fp);
     }
   }
   fclose(fp);
 }

 // Recognize the tesseract global image and return the result as Tesseract
 // internal structures.
 int TessBaseAPI::Recognize(struct ETEXT_STRUCT* monitor) {
   if (tesseract_ == NULL)
     return -1;
   if (thresholder_ == NULL || thresholder_->IsEmpty()) {
     tprintf("Please call SetImage before attempting recognition.");
     return -1;
   }
   if (page_res_ != NULL)
     ClearResults();
   if (!threshold_done_)
     Threshold();
   if (FindLines() != 0)
     return -1;
   if (tesseract_->tessedit_resegment_from_boxes)
     tesseract_->apply_boxes(*input_file_, block_list_);
   tesseract_->SetBlackAndWhitelist();

   page_res_ = new PAGE_RES(block_list_);
   if (interactive_mode) {
 #ifdef HAVE_LIBLEPT
     tesseract_->pgeditor_main(block_list_);
 #endif
   } else if (tesseract_->tessedit_train_from_boxes) {
     apply_box_training(*output_file_, block_list_);
   } else {
     // Now run the main recognition.
     tesseract_->recog_all_words(page_res_, monitor);
   }
   return 0;
 }

 // Make a text string from the internal data structures.
 char* TessBaseAPI::GetUTF8Text() {
   if (tesseract_ == NULL ||
       (page_res_ == NULL && Recognize(NULL) < 0))
     return NULL;
   int total_length = TextLength(NULL);
   PAGE_RES_IT   page_res_it(page_res_);
   char* result = new char[total_length];
   char* ptr = result;
   for (page_res_it.restart_page(); page_res_it.word () != NULL;
        page_res_it.forward()) {
     WERD_RES *word = page_res_it.word();
     WERD_CHOICE* choice = word->best_choice;
     if (choice != NULL) {
       strcpy(ptr, choice->unichar_string().string());
       ptr += choice->unichar_string().length();
       if (word->word->flag(W_EOL))
         *ptr++ = '\n';
       else
         *ptr++ = ' ';
     }
   }
   *ptr++ = '\n';
   *ptr = '\0';
   return result;
 }

 // Appends one box-file line per blob of the word to word_str and returns
 // the number of characters written (the terminating NUL is not counted).
 // Each line is the blob's UTF-8 text followed by
 // " left bottom right top\n", with the given left/bottom offsets added to
 // the blob's denormalized bounding box.
 // The row argument is not used. The caller must supply a large enough
 // buffer; no bounds checking is done here.
 static int ConvertWordToBoxText(WERD_RES *word,
                                 ROW_RES* row,
                                 int left,
                                 int bottom,
                                 char* word_str) {
   // Work on a copy of the output word, denormalized back to image coords.
   WERD copy_outword;
   copy_outword = *(word->outword);
   copy_outword.baseline_denormalise(&word->denorm);
   PBLOB_IT blob_it;
   blob_it.set_to_list(copy_outword.blob_list());
   const int blob_total = copy_outword.blob_list()->length();

   int written = 0;      // Characters emitted into word_str so far.
   int utf8_offset = 0;  // Start of the current unichar in the UTF-8 string.
   for (int blob_index = 0; blob_index < blob_total; ++blob_index) {
     TBOX blob_box = blob_it.data()->bounding_box();
     const bool box_is_unusable =
         word->tess_failed ||
         blob_box.left() < 0 ||
         blob_box.right() > page_image.get_xsize() ||
         blob_box.bottom() < 0 ||
         blob_box.top() > page_image.get_ysize();
     if (box_is_unusable) {
       // Bounding boxes can be illegal when tess fails on a word.
       blob_box = word->word->bounding_box();  // Use original word as backup.
       tprintf("Using substitute bounding box at (%d,%d)->(%d,%d)\n",
               blob_box.left(), blob_box.bottom(),
               blob_box.right(), blob_box.top());
     }

     // A single classification unit can be composed of several UTF-8
     // characters. Append each of them to the result.
     const int unichar_len = word->best_choice->unichar_lengths()[blob_index];
     for (int sub = 0; sub < unichar_len; ++sub) {
       char ch = word->best_choice->unichar_string()[utf8_offset + sub];
       // Tesseract uses space for recognition failure. Fix to a reject
       // character, kTesseractReject so we don't create illegal box files.
       if (ch == ' ')
         ch = kTesseractReject;
       word_str[written++] = ch;
     }
     // sprintf reports how many characters it produced, NUL excluded.
     written += sprintf(word_str + written, " %d %d %d %d\n",
                        blob_box.left() + left, blob_box.bottom() + bottom,
                        blob_box.right() + left, blob_box.top() + bottom);

     utf8_offset += unichar_len;
     blob_it.forward();
   }
   return written;
 }

 // Multiplier for max expected textlength assumes typically 4 numbers @
 // (5 digits and a space) plus the newline = 4*(5+1)+1. Add to this the
 // original UTF8 characters, and one kMaxCharsPerChar.
 const int kCharsPerChar = 25;
 // A maximal single box could occupy 4 numbers at 20 digits (for 64 bit) and a
 // space plus the newline 4*(20+1)+1 and the maximum length of a UNICHAR.
 // Test against this on each iteration for safety.
 const int kMaxCharsPerChar = 85 + UNICHAR_LEN;

 // The recognized text is returned as a char* which is coded
 // as a UTF8 box file and must be freed with the delete [] operator.
 // Runs Recognize first if there are no results yet. Returns NULL if there
 // is no engine or recognition fails.
 // Box coordinates are offset by rect_left_ horizontally and by the distance
 // from the bottom of the recognition rectangle to the bottom of the image
 // vertically.
 char* TessBaseAPI::GetBoxText() {
   if (tesseract_ == NULL ||
       (page_res_ == NULL && Recognize(NULL) < 0))
     return NULL;
   // Must be computed after Recognize: the rectangle and image sizes are
   // only refreshed by Threshold(), which Recognize runs. Computing this
   // first used stale (initially zero) geometry on the first call for an
   // image.
   const int bottom = image_height_ - (rect_top_ + rect_height_);
   int blob_count;
   int utf8_length = TextLength(&blob_count);
   int total_length = blob_count*kCharsPerChar + utf8_length + kMaxCharsPerChar;
   PAGE_RES_IT   page_res_it(page_res_);
   char* result = new char[total_length];
   char* ptr = result;
   for (page_res_it.restart_page(); page_res_it.word () != NULL;
        page_res_it.forward()) {
     WERD_RES *word = page_res_it.word();
     ptr += ConvertWordToBoxText(word, page_res_it.row(), rect_left_, bottom,
                                 ptr);
     // Just in case the size estimate was too small: stop while there is
     // still room for one maximal box line plus the terminator.
     if (ptr - result + kMaxCharsPerChar > total_length)
       break;
   }
   *ptr = '\0';
   return result;
 }

 // Conversion table for non-latin characters.
 // Maps characters out of the latin set into the latin set.
 // kUniChs[i] is replaced by kLatinChs[i]; both arrays are 0-terminated and
 // must stay the same length and in the same order.
 // TODO(rays) incorporate this translation into unicharset.
 const int kUniChs[] = {
   0x20ac, 0x201c, 0x201d, 0x2018, 0x2019, 0x2022, 0x2014, 0
 };
 // Latin chars corresponding to the unicode chars above.
 const int kLatinChs[] = {
   0x00a2, 0x0022, 0x0022, 0x0027, 0x0027, 0x00b7, 0x002d, 0
 };

 // The recognized text is returned as a char* which is coded
 // as UNLV format Latin-1 with specific reject and suspect codes
 // and must be freed with the delete [] operator.
 // Runs Recognize first if there are no results yet. Returns NULL if there
 // is no engine or recognition fails.
 // Output rules visible below:
 //  - "crunched" words (unlv_crunch_mode != CR_NONE) collapse to at most one
 //    kUNLVReject, never two in a row;
 //  - in normal words, ' ' and kTesseractReject become kUNLVReject, rejected
 //    characters are prefixed with kUNLVSuspect, characters listed in
 //    kUniChs are replaced from kLatinChs, and anything still above 0xff
 //    becomes kUNLVReject;
 //  - a '\n' follows each end-of-line word, and one more ends the text.
 char* TessBaseAPI::GetUNLVText() {
   if (tesseract_ == NULL ||
       (page_res_ == NULL && Recognize(NULL) < 0))
     return NULL;
   // State carried from word to word while emitting.
   bool tilde_crunch_written = false;
   bool last_char_was_newline = true;
   bool last_char_was_tilde = false;

   // The buffer is sized by TextLength(), which budgets one extra byte per
   // word and one per rejected character.
   int total_length = TextLength(NULL);
   PAGE_RES_IT   page_res_it(page_res_);
   char* result = new char[total_length];
   char* ptr = result;
   for (page_res_it.restart_page(); page_res_it.word () != NULL;
        page_res_it.forward()) {
     WERD_RES *word = page_res_it.word();
     // Process the current word.
     if (word->unlv_crunch_mode != CR_NONE) {
       // Crunched word: emit a reject marker unless the word is to be
       // deleted or a marker for this run of crunched words already exists
       // (a CR_KEEP_SPACE word with a definite space starts a new one).
       if (word->unlv_crunch_mode != CR_DELETE &&
           (!tilde_crunch_written ||
            (word->unlv_crunch_mode == CR_KEEP_SPACE &&
             word->word->space() > 0 &&
             !word->word->flag(W_FUZZY_NON) &&
             !word->word->flag(W_FUZZY_SP)))) {
         if (!word->word->flag(W_BOL) &&
             word->word->space() > 0 &&
             !word->word->flag(W_FUZZY_NON) &&
             !word->word->flag(W_FUZZY_SP)) {
           /* Write a space to separate from preceding good text */
           *ptr++ = ' ';
           last_char_was_tilde = false;
         }
         if (!last_char_was_tilde) {
           // Write a reject char.
           last_char_was_tilde = true;
           *ptr++ = kUNLVReject;
           tilde_crunch_written = true;
           last_char_was_newline = false;
         }
       }
     } else {
       // NORMAL PROCESSING of non tilde crunched words.
       tilde_crunch_written = false;

       if (word->word->flag(W_REP_CHAR) && tessedit_consistent_reps)
         ensure_rep_chars_are_consistent(word);

       tesseract_->set_unlv_suspects(word);
       // wordstr holds the UTF-8 bytes; lengths[i] is the byte length of
       // the i-th unichar, so offset always indexes the start of unichar i.
       const char* wordstr = word->best_choice->unichar_string().string();
       const STRING& lengths = word->best_choice->unichar_lengths();
       int length = lengths.length();
       int i = 0;
       int offset = 0;

       if (last_char_was_tilde &&
           word->word->space() == 0 && wordstr[offset] == ' ') {
         // Prevent adjacent tilde across words - we know that adjacent tildes
         // within words have been removed.
         // Skip the first character.
         offset = lengths[i++];
       }
       if (i < length && wordstr[offset] != 0) {
         // Separate from the previous word unless this one starts a line.
         if (!last_char_was_newline)
           *ptr++ = ' ';
         else
           last_char_was_newline = false;
         for (; i < length; offset += lengths[i++]) {
           if (wordstr[offset] == ' ' ||
               wordstr[offset] == kTesseractReject) {
             *ptr++ = kUNLVReject;
             last_char_was_tilde = true;
           } else {
             if (word->reject_map[i].rejected())
               *ptr++ = kUNLVSuspect;
             UNICHAR ch(wordstr + offset, lengths[i]);
             int uni_ch = ch.first_uni();
             // Substitute a Latin equivalent where the table has one.
             for (int j = 0; kUniChs[j] != 0; ++j) {
               if (kUniChs[j] == uni_ch) {
                 uni_ch = kLatinChs[j];
                 break;
               }
             }
             if (uni_ch <= 0xff) {
               *ptr++ = static_cast<char>(uni_ch);
               last_char_was_tilde = false;
             } else {
               // Not representable in a single byte: emit a reject.
               *ptr++ = kUNLVReject;
               last_char_was_tilde = true;
             }
           }
         }
       }
     }
     if (word->word->flag(W_EOL) && !last_char_was_newline) {
       /* Add a new line output */
       *ptr++ = '\n';
       tilde_crunch_written = false;
       last_char_was_newline = true;
       last_char_was_tilde = false;
     }
   }
   *ptr++ = '\n';
   *ptr = '\0';
   return result;
 }

 // Returns the average word confidence for Tesseract page result.
 int TessBaseAPI::MeanTextConf() {
   int* conf = AllWordConfidences();
   if (!conf) return 0;
   int sum = 0;
   int *pt = conf;
   while (*pt >= 0) sum += *pt++;
   if (pt != conf) sum /= pt - conf;
   delete [] conf;
   return sum;
 }

 // Returns an array of all word confidences, terminated by -1.
 // Runs Recognize first if there are no results yet. Returns NULL if there
 // is no engine or recognition fails.
 // Each entry is 100 + 5 * certainty of the word's best choice, clamped to
 // 0..100. A word with no best choice gets confidence 0.
 // The caller must free the array with the delete [] operator.
 int* TessBaseAPI::AllWordConfidences() {
   if (tesseract_ == NULL ||
       (page_res_ == NULL && Recognize(NULL) < 0))
     return NULL;
   // First pass: count the words so the array can be sized exactly.
   int n_word = 0;
   PAGE_RES_IT res_it(page_res_);
   for (res_it.restart_page(); res_it.word() != NULL; res_it.forward())
     n_word++;

   int* conf = new int[n_word+1];
   n_word = 0;
   for (res_it.restart_page(); res_it.word() != NULL; res_it.forward()) {
     WERD_RES *word = res_it.word();
     WERD_CHOICE* choice = word->best_choice;
     // best_choice can be NULL (GetUTF8Text and TextLength both guard for
     // it); report such words as zero confidence instead of crashing.
     int w_conf = 0;
     if (choice != NULL) {
       // This is the eq for converting Tesseract confidence to 0..100
       w_conf = static_cast<int>(100 + 5 * choice->certainty());
       if (w_conf < 0) w_conf = 0;
       if (w_conf > 100) w_conf = 100;
     }
     conf[n_word++] = w_conf;
   }
   conf[n_word] = -1;
   return conf;
 }

 // Releases the recognition results and any stored image data while
 // keeping the engine itself, whose data would be time-consuming to
 // reload. Afterwards, you must call SetImage or TesseractRect before
 // doing any Recognize or Get* operation.
 void TessBaseAPI::Clear() {
   if (thresholder_) {
     thresholder_->Clear();
   }
   ClearResults();
   page_image.destroy();
 }

 // Close down tesseract and free up all memory. End() is equivalent to
 // destructing and reconstructing your TessBaseAPI.
 // Once End() has been used, none of the other API functions may be used
 // other than Init and anything declared above it in the class definition.
 void TessBaseAPI::End() {
   // delete on a NULL pointer is a no-op, so no per-member guards are
   // needed; each pointer is reset so End() is safe to call again.
   delete thresholder_;
   thresholder_ = NULL;

   delete page_res_;
   page_res_ = NULL;

   delete block_list_;
   block_list_ = NULL;

   // The engine needs an explicit shutdown call before it is destroyed.
   if (tesseract_ != NULL) {
     tesseract_->end_tesseract();
     delete tesseract_;
     tesseract_ = NULL;
   }

   delete input_file_;
   input_file_ = NULL;

   delete output_file_;
   output_file_ = NULL;

   delete datapath_;
   datapath_ = NULL;

   delete language_;
   language_ = NULL;
 }

 // Check whether a word is valid according to Tesseract's language model
 // returns 0 if the word is invalid, non-zero if valid
 int TessBaseAPI::IsValidWord(const char *word) {
   return tesseract_->getDict().valid_word(word);
 }


 bool TessBaseAPI::GetTextDirection(int* out_offset, float* out_slope) {
   if (thresholder_ != NULL && !threshold_done_)
     Threshold();
   if (page_res_ == NULL)
     FindLines();
   if (block_list_->length() < 1) {
     return false;
   }

   // Get first block
   BLOCK_IT block_it(block_list_);
   block_it.move_to_first();
   ROW_LIST* rows = block_it.data()->row_list();
   if (rows->length() != 1) {
     return false;
   }

   // Get first line of block
   ROW_IT row_it(rows);
   row_it.move_to_first();
   ROW* row = row_it.data();

   // Calculate offset and slope (NOTE: Kind of ugly)
   *out_offset = static_cast<int>(row->base_line(0.0));
   *out_slope = row->base_line(1.0) - row->base_line(0.0);

   return true;
 }

 // Redirects the dictionary's letter_is_okay_ hook to the given function.
 // Has no effect when no engine exists.
 void TessBaseAPI::SetDictFunc(DictFunc f) {
   if (tesseract_ == NULL)
     return;
   tesseract_->getDict().letter_is_okay_ = f;
 }

 // Common preparation shared by the SetImage overloads: makes sure a
 // thresholder exists and discards any previous results.
 // Returns false (after printing a message) if no engine exists, in which
 // case the caller must not touch thresholder_.
 bool TessBaseAPI::InternalSetImage() {
   if (tesseract_ != NULL) {
     if (thresholder_ == NULL) {
       thresholder_ = new ImageThresholder;
     }
     ClearResults();
     return true;
   }
   tprintf("Please call Init before attempting to send an image.");
   return false;
 }

 #ifndef HAVE_LIBLEPT
 namespace tesseract {
 // Non-Leptonica page setup: fills blocks from a zone file named after
 // filename with its extension removed, or by segmenting page_image when
 // read_pd_file reports failure, then finds the connected components and
 // makes the text lines.
 void pgeditor_read_file(STRING &filename,
                         BLOCK_LIST *blocks,  // block list to add to
                         Tesseract *tess) {
   // Strip everything from the last '.' onwards on a private copy.
   STRING base_name = filename;
   const char *last_dot = strrchr(base_name.string(), '.');
   if (last_dot != NULL)
     base_name[last_dot - base_name.string()] = '\0';

   const bool blocks_read = read_pd_file(base_name,
                                         page_image.get_xsize(),
                                         page_image.get_ysize(),
                                         blocks);
   if (!blocks_read)
     segment_page(blocks);

   TO_BLOCK_LIST land_blocks, port_blocks;
   TBOX page_box;
   find_components(blocks, &land_blocks, &port_blocks, &page_box);
   textord_page(page_box.topright(), blocks, &land_blocks, &port_blocks, tess);
 }
 }
 #endif//HAVE_LIBLEPT

 // Run the thresholder to make the thresholded image.
 // Writes the result into the global page_image, refreshes the cached
 // rectangle and full-image dimensions from the thresholder, and marks
 // thresholding as done. Callers must ensure thresholder_ is non-NULL.
 void TessBaseAPI::Threshold() {
   thresholder_->ThresholdToIMAGE(&page_image);
   thresholder_->GetImageSizes(&rect_left_, &rect_top_,
                               &rect_width_, &rect_height_,
                               &image_width_, &image_height_);
   threshold_done_ = true;
 }

 // Find lines from the image making the BLOCK_LIST.
 // Defaults the input name to kInputFile when none was set, and creates an
 // engine with only its adaptive classifier initialized when none exists.
 // Always returns 0.
 int TessBaseAPI::FindLines() {
   if (!input_file_) {
     input_file_ = new STRING(kInputFile);
   }
   if (!tesseract_) {
     tesseract_ = new Tesseract;
     tesseract_->InitAdaptiveClassifier();
   }
   // The following call creates a full-page block and then runs connected
   // component analysis and text line creation.
 #ifdef HAVE_LIBLEPT
   tesseract_->pgeditor_read_file(*input_file_, block_list_);
 #else
   tesseract::pgeditor_read_file(*input_file_, block_list_, tesseract_);
 #endif
   return 0;
 }

 // Discards the page results, marks the image as not yet thresholded and
 // leaves an empty block list (creating it on first use) ready for a new
 // page.
 void TessBaseAPI::ClearResults() {
   threshold_done_ = false;

   // Deleting a NULL pointer is a no-op, so no guard is required.
   delete page_res_;
   page_res_ = NULL;

   if (block_list_ != NULL) {
     block_list_->clear();
   } else {
     block_list_ = new BLOCK_LIST;
   }
 }

 // Return the length of the output text string, as UTF8, assuming
 // one newline per line and one per block, with a terminator,
 // and assuming a single character reject marker for each rejected character.
 // Also return the number of recognized blobs in blob_count.
 int TessBaseAPI::TextLength(int* blob_count) {
   if (tesseract_ == NULL || page_res_ == NULL)
     return 0;

   PAGE_RES_IT   page_res_it(page_res_);
   int total_length = 2;
   int total_blobs = 0;
   // Iterate over the data structures to extract the recognition result.
   for (page_res_it.restart_page(); page_res_it.word () != NULL;
        page_res_it.forward()) {
     WERD_RES *word = page_res_it.word();
     WERD_CHOICE* choice = word->best_choice;
     if (choice != NULL) {
       total_blobs += choice->length() + 1;
       total_length += choice->unichar_string().length() + 1;
       for (int i = 0; i < word->reject_map.length(); ++i) {
         if (word->reject_map[i].rejected())
           ++total_length;
       }
     }
   }
   if (blob_count != NULL)
     *blob_count = total_blobs;
   return total_length;
 }

 // Return the Orientation And Script.
 // Discards any previous results, re-thresholds the current image and runs
 // orientation_and_script_detection, which fills osr.
 // Returns without touching osr if there is no engine or no image has been
 // set (the same precondition Recognize enforces).
 void TessBaseAPI::DetectOS(OSResults* osr) {
   if (tesseract_ == NULL)
     return;
   // Threshold() dereferences thresholder_ unconditionally, so it must not
   // be reached without a usable source image.
   if (thresholder_ == NULL || thresholder_->IsEmpty()) {
     tprintf("Please call SetImage before attempting detection.\n");
     return;
   }
   ClearResults();
   Threshold();
   if (input_file_ == NULL)
     input_file_ = new STRING(kInputFile);
   orientation_and_script_detection(*input_file_, osr, tesseract_);
 }

 // ____________________________________________________________________________
 // Ocropus add-ons.

 // Find lines from the image making the BLOCK_LIST.
 // Runs FindLines() and returns the member block list pointer itself, not
 // a copy.
 // NOTE(review): block_list_ is still held by this object afterwards, and
 // both End() and ClearResults() act on it; confirm whether callers are
 // expected to pass the result to DeleteBlockList.
 BLOCK_LIST* TessBaseAPI::FindLinesCreateBlockList() {
   FindLines();
   return block_list_;
 }

 // Delete a block list.
 // This is to keep BLOCK_LIST pointer opaque
 // and let go of including the other headers.
 // Does not use or modify any member of this object.
 void TessBaseAPI::DeleteBlockList(BLOCK_LIST *block_list) {
   delete block_list;
 }


 // Builds a heap-allocated ROW with one baseline segment starting at
 // x = -32000 and quadratic coefficients {0, 0, baseline}, i.e. a flat
 // baseline at the given height. The ascender and descender are passed to
 // ROW relative to the x-height line and the baseline respectively.
 // The caller owns the returned ROW and must delete it.
 static ROW *make_tess_ocrrow(float baseline,
                              float xheight,
                              float descender,
                              float ascender) {
   inT32 xstarts[] = {-32000};
   double quad_coeffs[] = {0, 0, baseline};
   const float ascender_rise = ascender - (baseline + xheight);
   const float descender_drop = descender - baseline;
   ROW* flat_row = new ROW(1,
                           xstarts,
                           quad_coeffs,
                           xheight,
                           ascender_rise,
                           descender_drop,
                           0,
                           0);
   return flat_row;
 }

 // Almost a copy of make_tess_row() from ccmain/tstruct.cpp.
 // Fills tessrow with a flat, single-segment baseline at
 // bln_baseline_offset and a flat x-height line bln_x_height above it.
 // The ascender rise and descender drop are scaled from the caller's units
 // by bln_x_height / xheight.
 static void fill_dummy_row(float baseline, float xheight,
                            float descender, float ascender,
                            TEXTROW* tessrow) {
   // Baseline: one segment spanning -32767..32767 with constant term only.
   tessrow->baseline.segments = 1;
   tessrow->baseline.xstarts[0] = -32767;
   tessrow->baseline.xstarts[1] = 32767;
   tessrow->baseline.quads[0].a = 0;
   tessrow->baseline.quads[0].b = 0;
   tessrow->baseline.quads[0].c = bln_baseline_offset;
   // X-height line: same shape, raised by bln_x_height.
   tessrow->xheight.segments = 1;
   tessrow->xheight.xstarts[0] = -32767;
   tessrow->xheight.xstarts[1] = 32767;
   tessrow->xheight.quads[0].a = 0;
   tessrow->xheight.quads[0].b = 0;
   tessrow->xheight.quads[0].c = bln_baseline_offset + bln_x_height;
   tessrow->lineheight = bln_x_height;
   // Rescale the caller's ascender/descender distances to bln units.
   tessrow->ascrise = bln_x_height * (ascender - (xheight + baseline)) / xheight;
   tessrow->descdrop = bln_x_height * (descender - baseline) / xheight;
 }


 // Return a TBLOB * from the whole page_image.
 // To be freed later with free_blob().
 // All connected components found in page_image are merged into a single
 // polygonal blob, which is baseline-normalized against a flat row built
 // from the given metrics and then converted to a TBLOB.
 TBLOB *make_tesseract_blob(float baseline, float xheight,
                            float descender, float ascender) {
   BLOCK *block = new BLOCK("a character",
                            TRUE,
                            0, 0,
                            0, 0,
                            page_image.get_xsize(),
                            page_image.get_ysize());

   // Create C_BLOBs from the page
   extract_edges(
 #ifndef GRAPHICS_DISABLED
                 NULL,
 #endif
                 &page_image, &page_image,
                 ICOORD(page_image.get_xsize(), page_image.get_ysize()),
                 block);

   // Create one PBLOB from all C_BLOBs
   PBLOB *pblob = new PBLOB;  // will be (hopefully) deleted by the pblob_list
   {
     // Scoped so the iterator is gone before the block is deleted below.
     C_BLOB_IT c_blob_it(block->blob_list());
     for (c_blob_it.mark_cycle_pt();
          !c_blob_it.cycled_list();
          c_blob_it.forward()) {
       C_BLOB *c_blob = c_blob_it.data();
       PBLOB c_as_p(c_blob, baseline + xheight);
       merge_blobs(pblob, &c_as_p);
     }
   }
   PBLOB_LIST *pblob_list = new PBLOB_LIST;  // will be deleted by the word
   PBLOB_IT pblob_it(pblob_list);
   pblob_it.add_after_then_move(pblob);

   // Normalize PBLOB
   WERD word(pblob_list, 0, " ");
   ROW *row = make_tess_ocrrow(baseline, xheight, descender, ascender);
   word.baseline_normalise(row);
   delete row;

   // Create a TBLOB from PBLOB
   TBLOB *result = make_tess_blob(pblob, /* flatten: */ TRUE);

   // The block was only scaffolding for edge extraction and was previously
   // leaked on every call; nothing references it once the TBLOB exists.
   delete block;
   return result;
 }


 // Adapt to recognize the current image as the given character.
 // The image must be preloaded and be just an image of a single character.
 void TessBaseAPI::AdaptToCharacter(const char *unichar_repr,
                                    int length,
                                    float baseline,
                                    float xheight,
                                    float descender,
                                    float ascender) {
   UNICHAR_ID id = tesseract_->unicharset.unichar_to_id(unichar_repr, length);
   LINE_STATS LineStats;
   TEXTROW row;
   fill_dummy_row(baseline, xheight, descender, ascender, &row);
   GetLineStatsFromRow(&row, &LineStats);

   TBLOB *blob = make_tesseract_blob(baseline, xheight, descender, ascender);
   float threshold;
   UNICHAR_ID best_class = 0;
   float best_rating = -100;


   // Classify to get a raw choice.
   BLOB_CHOICE_LIST choices;
   tesseract_->AdaptiveClassifier(blob, NULL, &row, &choices);
   BLOB_CHOICE_IT choice_it;
   choice_it.set_to_list(&choices);
   for (choice_it.mark_cycle_pt(); !choice_it.cycled_list();
        choice_it.forward()) {
     if (choice_it.data()->rating() > best_rating) {
       best_rating = choice_it.data()->rating();
       best_class = choice_it.data()->unichar_id();
     }
   }

   if (id == best_class) {
     threshold = matcher_good_threshold;
   } else {
     /* the blob was incorrectly classified - find the rating threshold
        needed to create a template which will correct the error with
        some margin.  However, don't waste time trying to make
        templates which are too tight. */
     threshold = tesseract_->GetBestRatingFor(blob, &LineStats, id);
     threshold *= .9;
     const float max_threshold = .125;
     const float min_threshold = .02;

     if (threshold > max_threshold)
         threshold = max_threshold;

     // I have cuddled the following line to set it out of the strike
     // of the coverage testing tool. I have no idea how to trigger
     // this situation nor I have any necessity to do it. --mezhirov
     if (threshold < min_threshold) threshold = min_threshold;
   }

   if (blob->outlines)
     tesseract_->AdaptToChar(blob, &LineStats, id, threshold);
   free_blob(blob);
 }


 // Runs recog_all_words with pass number 1 over a freshly allocated
 // PAGE_RES built from block_list and returns that PAGE_RES.
 // The result is allocated with new and is not released here.
 PAGE_RES* TessBaseAPI::RecognitionPass1(BLOCK_LIST* block_list) {
   PAGE_RES *results = new PAGE_RES(block_list);
   tesseract_->recog_all_words(results, NULL, NULL, 1);
   return results;
 }

 // Runs recog_all_words with pass number 2.
 // When pass1_result is NULL a new PAGE_RES is built from block_list and
 // used instead; otherwise pass1_result is processed in place.
 // Returns the PAGE_RES that was processed.
 PAGE_RES* TessBaseAPI::RecognitionPass2(BLOCK_LIST* block_list,
                                         PAGE_RES* pass1_result) {
   PAGE_RES *results =
       (pass1_result != NULL) ? pass1_result : new PAGE_RES(block_list);
   tesseract_->recog_all_words(results, NULL, NULL, 2);
   return results;
 }

 // One recognized character (or inter-word space) together with its cost
 // and bounding box.  Stored in an ELIST-based list.
 struct TESS_CHAR : ELIST_LINK {
   char *unicode_repr;  // Owned, NUL-terminated copy of the text (or NULL).
   int length;  // of unicode_repr, in bytes, excluding the terminator
   float cost;
   TBOX box;

   // Copies the first len bytes of repr; when len is -1 the whole
   // NUL-terminated string is copied.
   TESS_CHAR(float _cost, const char *repr, int len = -1) : cost(_cost) {
     length = (len == -1 ? strlen(repr) : len);
     unicode_repr = new char[length + 1];
     strncpy(unicode_repr, repr, length);
     // strncpy does not terminate the copy when repr has at least
     // length characters, so do it explicitly.
     unicode_repr[length] = '\0';
   }

   // Satisfies ELISTIZE.  All members are initialized so that the
   // destructor never calls delete [] on an indeterminate pointer.
   TESS_CHAR() : unicode_repr(NULL), length(0), cost(0.0f) {
   }
   ~TESS_CHAR() {
     delete [] unicode_repr;
   }
 };

 // Instantiate the ELIST list machinery for TESS_CHAR.  The TESS_CHAR_LIST
 // and TESS_CHAR_IT types used later in this file presumably come from these
 // two macros (their definitions are not visible here -- confirm).
 ELISTIZEH(TESS_CHAR)
 ELISTIZE(TESS_CHAR)

 // Inserts a zero-cost TESS_CHAR holding a single blank into the list
 // behind `it`, using add_after_then_move.
 static void add_space(TESS_CHAR_IT* it) {
   it->add_after_then_move(new TESS_CHAR(0, " "));
 }


 // Converts a rating into a non-negative cost: the rating is shifted up by
 // 100 and anything that would still be below zero is clamped to zero.
 static float rating_to_cost(float rating) {
   const float shifted = 100 + rating;
   // Ratings below -100 are not expected, but clamp them just in case.
   return (shifted < 0) ? 0 : shifted;
 }


 // Extract the OCR results, costs (penalty points for uncertainty),
 // and the bounding boxes of the characters.
 //
 // For every word in page_res one TESS_CHAR per character is appended to the
 // list behind `out`; consecutive words are separated by a single space
 // entry.  Every character of a word receives the same cost, derived from
 // the rating of the word's best choice.  Character boxes are taken from the
 // blobs of the normalized outword and mapped linearly onto the bounding box
 // of the original word.
 static void extract_result(TESS_CHAR_IT* out,
                            PAGE_RES* page_res) {
   PAGE_RES_IT page_res_it(page_res);
   int word_count = 0;
   while (page_res_it.word() != NULL) {
     WERD_RES *word = page_res_it.word();
     // str holds the concatenated characters, len holds one byte per
     // character giving the number of bytes that character occupies in str.
     const char *str = word->best_choice->unichar_string().string();
     const char *len = word->best_choice->unichar_lengths().string();

     if (word_count)
       add_space(out);
     TBOX bln_rect;
     PBLOB_LIST *blobs = word->outword->blob_list();
     PBLOB_IT it(blobs);
     int n = strlen(len);
     // The boxes are remembered so that they can be denormalized once the
     // bounding box of the whole normalized word is known.
     TBOX** boxes_to_fix = new TBOX*[n];
     for (int i = 0; i < n; i++) {
       PBLOB *blob = it.data();
       TBOX current = blob->bounding_box();
       bln_rect = bln_rect.bounding_union(current);
       TESS_CHAR *tc = new TESS_CHAR(rating_to_cost(word->best_choice->rating()),
                                     str, *len);
       tc->box = current;
       boxes_to_fix[i] = &tc->box;

       out->add_after_then_move(tc);
       it.forward();
       str += *len;
       len++;
     }

     // Find the word bbox before normalization.
     // Here we can't use the C_BLOB bboxes directly,
     // since connected letters are not yet cut.
     TBOX real_rect = word->word->bounding_box();

     // Denormalize boxes by transforming the bbox of the whole bln word
     // into the denorm bbox (`real_rect') of the whole word.
     // A degenerate (zero width or height) bln box would otherwise cause a
     // division by zero and an undefined conversion of inf/NaN to int; in
     // that case every offset from the bln origin is zero anyway, so a
     // stretch of 0 maps the boxes onto the corresponding edge of real_rect.
     const int bln_width = bln_rect.width();
     const int bln_height = bln_rect.height();
     double x_stretch = 0.0;
     if (bln_width != 0)
       x_stretch = static_cast<double>(real_rect.width()) / bln_width;
     double y_stretch = 0.0;
     if (bln_height != 0)
       y_stretch = static_cast<double>(real_rect.height()) / bln_height;
     for (int j = 0; j < n; j++) {
       TBOX *box = boxes_to_fix[j];
       int x0 = static_cast<int>(real_rect.left() +
                    x_stretch * (box->left() - bln_rect.left()) + 0.5);
       int x1 = static_cast<int>(real_rect.left() +
                    x_stretch * (box->right() - bln_rect.left()) + 0.5);
       int y0 = static_cast<int>(real_rect.bottom() +
                    y_stretch * (box->bottom() - bln_rect.bottom()) + 0.5);
       int y1 = static_cast<int>(real_rect.bottom() +
                    y_stretch * (box->top() - bln_rect.bottom()) + 0.5);
       *box = TBOX(ICOORD(x0, y0), ICOORD(x1, y1));
     }
     delete [] boxes_to_fix;

     page_res_it.forward();
     word_count++;
   }
 }


 // Extract the OCR results, costs (penalty points for uncertainty),
 // and the bounding boxes of the characters.
 //
 // On return each output pointer refers to an array allocated here with
 // new[] and not released by this function:
 //   *text            - the bytes of all characters, concatenated and
 //                      followed by a terminating NUL.
 //   *lengths         - number of bytes each character occupies in *text.
 //   *costs           - cost of each character.
 //   *x0,*y0,*x1,*y1  - left, bottom, right and top of each character box.
 // Returns the number of entries in the per-character arrays; this count
 // includes the space entries inserted between words.
 int TessBaseAPI::TesseractExtractResult(char** text,
                                         int** lengths,
                                         float** costs,
                                         int** x0,
                                         int** y0,
                                         int** x1,
                                         int** y1,
                                         PAGE_RES* page_res) {
   TESS_CHAR_LIST tess_chars;
   TESS_CHAR_IT tess_chars_it(&tess_chars);
   extract_result(&tess_chars_it, page_res);
   tess_chars_it.move_to_first();
   int n = tess_chars.length();
   int text_len = 0;
   *lengths = new int[n];
   *costs = new float[n];
   *x0 = new int[n];
   *y0 = new int[n];
   *x1 = new int[n];
   *y1 = new int[n];

   // First pass: per-character data and the total text size.
   int i = 0;
   for (tess_chars_it.mark_cycle_pt();
        !tess_chars_it.cycled_list();
        tess_chars_it.forward(), i++) {
     TESS_CHAR *tc = tess_chars_it.data();
     text_len += (*lengths)[i] = tc->length;
     (*costs)[i] = tc->cost;
     (*x0)[i] = tc->box.left();
     (*y0)[i] = tc->box.bottom();
     (*x1)[i] = tc->box.right();
     (*y1)[i] = tc->box.top();
   }

   // Second pass: concatenate the text.  One extra byte is reserved so the
   // buffer can be NUL-terminated; the first text_len bytes are unchanged
   // for callers that rely on *lengths only.
   char *p = *text = new char[text_len + 1];

   tess_chars_it.move_to_first();
   for (tess_chars_it.mark_cycle_pt();
        !tess_chars_it.cycled_list();
        tess_chars_it.forward()) {
     TESS_CHAR *tc = tess_chars_it.data();
     strncpy(p, tc->unicode_repr, tc->length);
     p += tc->length;
   }
   *p = '\0';
   return n;
 }

 // This method returns the features associated with the current image.
 // Make sure setimage has been called before calling this method.
 //
 // The whole page is treated as one block; all blobs found in it are merged
 // into a single blob, which is normalized and passed to
 // GetCharNormFeatures.
 //   int_features - receives the extracted features.
 //   num_features - receives the value returned by GetCharNormFeatures.
 // Any existing recognition results are cleared first, and the image is
 // thresholded if that has not been done yet.
 void TessBaseAPI::GetFeatures(INT_FEATURE_ARRAY int_features,
                               int* num_features) {
   if (page_res_ != NULL)
     ClearResults();
   if (!threshold_done_)
     Threshold();
   // We have only one block, which is of the size of the page.
   BLOCK_LIST* blocks = new BLOCK_LIST;
   BLOCK *block = new BLOCK("",                       // filename.
                            TRUE,                     // proportional.
                            0,                        // kerning.
                            0,                        // spacing.
                            0,                        // Left.
                            0,                        // Bottom.
                            page_image.get_xsize(),   // Right.
                            page_image.get_ysize());  // Top.
   ICOORD bleft, tright;
   block->bounding_box (bleft, tright);

   BLOCK_IT block_it_add = blocks;
   block_it_add.add_to_end(block);

   ICOORD page_tr(page_image.get_xsize(), page_image.get_ysize());
   TEXTROW tessrow;
   make_tess_row(NULL,       // Denormalizer.
                 &tessrow);  // Output row.
   LINE_STATS line_stats;
   GetLineStatsFromRow(&tessrow, &line_stats);

   // Perform a CC analysis to detect the blobs.
   BLOCK_IT block_it = blocks;
   for (block_it.mark_cycle_pt (); !block_it.cycled_list ();
        block_it.forward ()) {
     BLOCK* cur_block = block_it.data();
     extract_edges(
 #ifndef GRAPHICS_DISABLED
                   NULL,         // Scrollview window.
 #endif
                   &page_image,  // Image.
                   &page_image,  // Thresholded image.
                   page_tr,      // corner of page.
                   cur_block);   // block.
     C_BLOB_IT blob_it = cur_block->blob_list();
     PBLOB *pblob = new PBLOB;
     // Iterate over all blobs found and merge them into a single PBLOB.
     for (blob_it.mark_cycle_pt(); !blob_it.cycled_list();
          blob_it.forward()) {
       C_BLOB* blob = blob_it.data();
       PBLOB c_as_p(blob, page_image.get_ysize());
       merge_blobs(pblob, &c_as_p);
     }

     PBLOB_LIST *pblob_list = new PBLOB_LIST;
     PBLOB_IT pblob_it(pblob_list);
     pblob_it.add_after_then_move(pblob);
     WERD word(pblob_list,  // Blob list.
               0,           // Blanks in front.
               " ");        // Correct text.
     // Argument order follows the make_tess_ocrrow call in
     // make_tesseract_blob: baseline, xheight, descender, ascender.
     ROW *row = make_tess_ocrrow(0,                       // baseline.
                                 page_image.get_ysize(),  // xheight.
                                 0,                       // descender.
                                 0);                      // ascender.
     word.baseline_normalise(row);
     delete row;
     if (pblob->out_list () == NULL) {
       tprintf("Blob list is empty");
     }
     TBLOB* tblob = make_tess_blob(pblob,  // Blob.
                                   TRUE);  // Flatten.

     CLASS_NORMALIZATION_ARRAY norm_array;
     inT32 len;
     *num_features = tesseract_->GetCharNormFeatures(
         tblob, &line_stats,
         tesseract_->PreTrainedTemplates,
         int_features, norm_array, &len);
     // The TBLOB is only needed for feature extraction; release it the same
     // way AdaptToCharacter does instead of leaking it.
     free_blob(tblob);
   }
   delete blocks;
 }

 }  // namespace tesseract.