| /********************************************************************** |
| * File: baseapi.cpp |
| * Description: Simple API for calling tesseract. |
| * Author: Ray Smith |
| * Created: Fri Oct 06 15:35:01 PDT 2006 |
| * |
| * (C) Copyright 2006, Google Inc. |
| ** Licensed under the Apache License, Version 2.0 (the "License"); |
| ** you may not use this file except in compliance with the License. |
| ** You may obtain a copy of the License at |
| ** http://www.apache.org/licenses/LICENSE-2.0 |
| ** Unless required by applicable law or agreed to in writing, software |
| ** distributed under the License is distributed on an "AS IS" BASIS, |
| ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| ** See the License for the specific language governing permissions and |
| ** limitations under the License. |
| * |
| **********************************************************************/ |
| |
| // Include automatically generated configuration file if running autoconf. |
| #ifdef HAVE_CONFIG_H |
| #include "config_auto.h" |
| #endif |
| |
| #ifdef HAVE_LIBLEPT |
| // Include leptonica library only if autoconf (or makefile etc) tell us to. |
| #include "allheaders.h" |
| #endif |
| |
| #include "baseapi.h" |
| |
| #include "thresholder.h" |
| #include "tesseractmain.h" |
| #include "tesseractclass.h" |
| #include "tessedit.h" |
| #include "ocrclass.h" |
| #include "pageres.h" |
| #include "tessvars.h" |
| #include "control.h" |
| #include "applybox.h" |
| #include "pgedit.h" |
| #include "varabled.h" |
| #include "variables.h" |
| #include "output.h" |
| #include "mainblk.h" |
| #include "globals.h" |
| #include "adaptmatch.h" |
| #include "edgblob.h" |
| #include "tessbox.h" |
| #include "tordvars.h" |
| #include "imgs.h" |
| #include "makerow.h" |
| #include "tstruct.h" |
| #include "tessout.h" |
| #include "tface.h" |
| #include "permute.h" |
| #include "otsuthr.h" |
| #include "osdetect.h" |
| #ifndef HAVE_LIBLEPT |
| #include "pageseg.h" |
| #include "blread.h" |
| #endif//HAVE_LIBLEPT |
| |
| |
| namespace tesseract { |
| |
| // Smallest rectangle width and height that TesseractRect will try to recognize. |
| const int kMinRectSize = 10; |
| // Character Tesseract emits when it could not recognize a blob as anything. |
| const char kTesseractReject = '~'; |
| // Character used by the UNLV error counter as a reject marker. |
| const char kUNLVReject = '~'; |
| // Character used by UNLV as a suspect marker. |
| const char kUNLVSuspect = '^'; |
| // Placeholder input file name, used to derive the name of a possible UNLV zone |
| // file when no name has been supplied through SetInputName. |
| const char* kInputFile = "noname.tif"; |
| |
| TessBaseAPI::TessBaseAPI() |
| : tesseract_(NULL), |
| // Thresholder is initialized to NULL here, but will be set before use by: |
| // A constructor of a derived API, SetThresholder(), or |
| // created implicitly when used in InternalSetImage. |
| thresholder_(NULL), |
| threshold_done_(false), |
| block_list_(NULL), |
| page_res_(NULL), |
| input_file_(NULL), |
| output_file_(NULL), |
| datapath_(NULL), |
| language_(NULL), |
| rect_left_(0), rect_top_(0), rect_width_(0), rect_height_(0), |
| image_width_(0), image_height_(0) { |
| } |
| |
| TessBaseAPI::~TessBaseAPI() { |
| End(); |
| } |
| |
| // Set the name of the input file. Needed only for training and |
| // loading a UNLV zone file. |
| void TessBaseAPI::SetInputName(const char* name) { |
| if (input_file_ == NULL) |
| input_file_ = new STRING(name); |
| else |
| *input_file_ = name; |
| } |
| |
| // Set the name of the output files. Needed only for debugging. |
| void TessBaseAPI::SetOutputName(const char* name) { |
| if (output_file_ == NULL) |
| output_file_ = new STRING(name); |
| else |
| *output_file_ = name; |
| } |
| |
| // Set the value of an internal "variable" (of either old or new types). |
| // Supply the name of the variable and the value as a string, just as |
| // you would in a config file. |
| // Returns false if the name lookup failed. |
| // SetVariable may be used before Init, to set things that control |
| // initialization, but note that on End all settings are lost and |
| // the next Init will use the defaults unless SetVariable is used again. |
| bool TessBaseAPI::SetVariable(const char* variable, const char* value) { |
| if (tesseract_ == NULL) |
| tesseract_ = new Tesseract; |
| |
| if (set_new_style_variable(variable, value)) |
| return true; |
| return set_old_style_variable(variable, value); |
| } |
| |
| // The datapath must be the name of the data directory (no ending /) or |
| // some other file in which the data directory resides (for instance argv[0].) |
| // The language is (usually) an ISO 639-3 string or NULL will default to eng. |
| // If numeric_mode is true, then only digits and Roman numerals will |
| // be returned. |
| int TessBaseAPI::Init(const char* datapath, const char* language) { |
| // If the datapath or the language have changed, then start again. |
| if (tesseract_ != NULL && |
| (datapath_ == NULL || language_ == NULL || |
| *datapath_ != datapath || *language_ != language)) { |
| tesseract_->end_tesseract(); |
| delete tesseract_; |
| tesseract_ = NULL; |
| } |
| if (datapath_ == NULL) |
| datapath_ = new STRING(datapath); |
| else |
| *datapath_ = datapath; |
| if (language_ == NULL) |
| language_ = new STRING(language); |
| else |
| *language_ = language; |
| if (tesseract_ == NULL) { |
| tesseract_ = new Tesseract; |
| return tesseract_->init_tesseract(datapath, |
| output_file_ != NULL ? output_file_->string() : NULL, |
| language, NULL, 0, NULL); |
| } |
| // For same language and datapath, just reseth the adaptive classifier. |
| tesseract_->ResetAdaptiveClassifier(); |
| return 0; |
| } |
| |
| // Init only the lang model component of Tesseract. The only functions |
| // that work after this init are SetVariable and IsValidWord. |
| // WARNING: temporary! This function will be removed from here and placed |
| // in a separate API at some future time. |
| int TessBaseAPI::InitLangMod(const char* datapath, const char* language) { |
| if (tesseract_ == NULL) |
| tesseract_ = new Tesseract; |
| return tesseract_->init_tesseract_lm(datapath, NULL, |
| language, NULL, 0, NULL); |
| } |
| |
| // Read a "config" file containing a set of variable, value pairs. |
| // Searches the standard places: tessdata/configs, tessdata/tessconfigs |
| // and also accepts a relative or absolute path name. |
| bool TessBaseAPI::ReadConfigFile(const char* filename) { |
| STRING path = tesseract_->datadir; |
| path += "configs/"; |
| path += filename; |
| FILE* fp; |
| if ((fp = fopen(path.string(), "r")) != NULL) { |
| fclose(fp); |
| } else { |
| path = tesseract_->datadir; |
| path += "tessconfigs/"; |
| path += filename; |
| if ((fp = fopen(path.string(), "r")) != NULL) { |
| fclose(fp); |
| } else { |
| path = filename; |
| } |
| } |
| if (read_variables_file(path.string())) { |
| tesseract_->read_variables(path.string()); |
| } |
| return true; |
| } |
| |
| // Recognize a rectangle from an image and return the result as a string. |
| // May be called many times for a single Init. |
| // Currently has no error checking. |
| // Greyscale of 8 and color of 24 or 32 bits per pixel may be given. |
| // Palette color images will not work properly and must be converted to |
| // 24 bit. |
| // Binary images of 1 bit per pixel may also be given but they must be |
| // byte packed with the MSB of the first byte being the first pixel, and a |
| // one pixel is WHITE. For binary images set bytes_per_pixel=0. |
| // The recognized text is returned as a char* which is coded |
| // as UTF8 and must be freed with the delete [] operator. |
| char* TessBaseAPI::TesseractRect(const unsigned char* imagedata, |
| int bytes_per_pixel, |
| int bytes_per_line, |
| int left, int top, |
| int width, int height) { |
| if (tesseract_ == NULL || width < kMinRectSize || height < kMinRectSize) |
| return NULL; // Nothing worth doing. |
| |
| // Since this original api didn't give the exact size of the image, |
| // we have to invent a reasonable value. |
| int bits_per_pixel = bytes_per_pixel == 0 ? 1 : bytes_per_pixel * 8; |
| SetImage(imagedata, bytes_per_line * 8 / bits_per_pixel, height, |
| bytes_per_pixel, bytes_per_line); |
| SetRectangle(left, top, width, height); |
| |
| return GetUTF8Text(); |
| } |
| |
| // Call between pages or documents etc to free up memory and forget |
| // adaptive data. |
| void TessBaseAPI::ClearAdaptiveClassifier() { |
| if (tesseract_ == NULL) |
| return; |
| tesseract_->ResetAdaptiveClassifier(); |
| } |
| |
| // Provide an image for Tesseract to recognize. Format is as |
| // TesseractRect above. Does not copy the image buffer, or take |
| // ownership. The source image may be destroyed after Recognize is called, |
| // either explicitly or implicitly via one of the Get*Text functions. |
| // SetImage clears all recognition results, and sets the rectangle to the |
| // full image, so it may be followed immediately by a GetUTF8Text, and it |
| // will automatically perform recognition. |
| void TessBaseAPI::SetImage(const unsigned char* imagedata, |
| int width, int height, |
| int bytes_per_pixel, int bytes_per_line) { |
| if (InternalSetImage()) |
| thresholder_->SetImage(imagedata, width, height, |
| bytes_per_pixel, bytes_per_line); |
| } |
| |
| #ifdef HAVE_LIBLEPT |
| // Provide an image for Tesseract to recognize. As with SetImage above, |
| // Tesseract doesn't take a copy or ownership or pixDestroy the image, so |
| // it must persist until after Recognize. |
| // Pix vs raw, which to use? |
| // Use Pix where possible. A future version of Tesseract may choose to use Pix |
| // as its internal representation and discard IMAGE altogether. |
| // Because of that, an implementation that sources and targets Pix may end up |
| // with less copies than an implementation that does not. |
| void TessBaseAPI::SetImage(const Pix* pix) { |
| if (InternalSetImage()) |
| thresholder_->SetImage(pix); |
| } |
| #endif |
| |
| // Restrict recognition to a sub-rectangle of the image. Call after SetImage. |
| // Each SetRectangle clears the recogntion results so multiple rectangles |
| // can be recognized with the same image. |
| void TessBaseAPI::SetRectangle(int left, int top, int width, int height) { |
| if (thresholder_ == NULL) |
| return; |
| thresholder_->SetRectangle(left, top, width, height); |
| ClearResults(); |
| } |
| |
| #ifdef HAVE_LIBLEPT |
| // ONLY available if you have Leptonica installed. |
| // Get a copy of the internal thresholded image from Tesseract. |
| Pix* TessBaseAPI::GetThresholdedImage() { |
| if (tesseract_ == NULL) |
| return NULL; |
| if (!threshold_done_) |
| Threshold(); |
| return page_image.ToPix(); |
| } |
| #endif // HAVE_LIBLEPT |
| |
| // Dump the internal binary image to a PGM file. |
| void TessBaseAPI::DumpPGM(const char* filename) { |
| if (tesseract_ == NULL) |
| return; |
| IMAGELINE line; |
| line.init(page_image.get_xsize()); |
| FILE *fp = fopen(filename, "w"); |
| fprintf(fp, "P5 " INT32FORMAT " " INT32FORMAT " 255\n", |
| page_image.get_xsize(), page_image.get_ysize()); |
| for (int j = page_image.get_ysize()-1; j >= 0 ; --j) { |
| page_image.get_line(0, j, page_image.get_xsize(), &line, 0); |
| for (int i = 0; i < page_image.get_xsize(); ++i) { |
| uinT8 b = line.pixels[i] ? 255 : 0; |
| fwrite(&b, 1, 1, fp); |
| } |
| } |
| fclose(fp); |
| } |
| |
| // Recognize the tesseract global image and return the result as Tesseract |
| // internal structures. |
| int TessBaseAPI::Recognize(struct ETEXT_STRUCT* monitor) { |
| if (tesseract_ == NULL) |
| return -1; |
| if (thresholder_ == NULL || thresholder_->IsEmpty()) { |
| tprintf("Please call SetImage before attempting recognition."); |
| return -1; |
| } |
| if (page_res_ != NULL) |
| ClearResults(); |
| if (!threshold_done_) |
| Threshold(); |
| if (FindLines() != 0) |
| return -1; |
| if (tesseract_->tessedit_resegment_from_boxes) |
| tesseract_->apply_boxes(*input_file_, block_list_); |
| tesseract_->SetBlackAndWhitelist(); |
| |
| page_res_ = new PAGE_RES(block_list_); |
| if (interactive_mode) { |
| #ifdef HAVE_LIBLEPT |
| tesseract_->pgeditor_main(block_list_); |
| #endif |
| } else if (tesseract_->tessedit_train_from_boxes) { |
| apply_box_training(*output_file_, block_list_); |
| } else { |
| // Now run the main recognition. |
| tesseract_->recog_all_words(page_res_, monitor); |
| } |
| return 0; |
| } |
| |
| // Make a text string from the internal data structures. |
| char* TessBaseAPI::GetUTF8Text() { |
| if (tesseract_ == NULL || |
| (page_res_ == NULL && Recognize(NULL) < 0)) |
| return NULL; |
| int total_length = TextLength(NULL); |
| PAGE_RES_IT page_res_it(page_res_); |
| char* result = new char[total_length]; |
| char* ptr = result; |
| for (page_res_it.restart_page(); page_res_it.word () != NULL; |
| page_res_it.forward()) { |
| WERD_RES *word = page_res_it.word(); |
| WERD_CHOICE* choice = word->best_choice; |
| if (choice != NULL) { |
| strcpy(ptr, choice->unichar_string().string()); |
| ptr += choice->unichar_string().length(); |
| if (word->word->flag(W_EOL)) |
| *ptr++ = '\n'; |
| else |
| *ptr++ = ' '; |
| } |
| } |
| *ptr++ = '\n'; |
| *ptr = '\0'; |
| return result; |
| } |
| |
| static int ConvertWordToBoxText(WERD_RES *word, |
| ROW_RES* row, |
| int left, |
| int bottom, |
| char* word_str) { |
| // Copy the output word and denormalize it back to image coords. |
| WERD copy_outword; |
| copy_outword = *(word->outword); |
| copy_outword.baseline_denormalise(&word->denorm); |
| PBLOB_IT blob_it; |
| blob_it.set_to_list(copy_outword.blob_list()); |
| int length = copy_outword.blob_list()->length(); |
| int output_size = 0; |
| |
| if (length > 0) { |
| for (int index = 0, offset = 0; index < length; |
| offset += word->best_choice->unichar_lengths()[index++], |
| blob_it.forward()) { |
| PBLOB* blob = blob_it.data(); |
| TBOX blob_box = blob->bounding_box(); |
| if (word->tess_failed || |
| blob_box.left() < 0 || |
| blob_box.right() > page_image.get_xsize() || |
| blob_box.bottom() < 0 || |
| blob_box.top() > page_image.get_ysize()) { |
| // Bounding boxes can be illegal when tess fails on a word. |
| blob_box = word->word->bounding_box(); // Use original word as backup. |
| tprintf("Using substitute bounding box at (%d,%d)->(%d,%d)\n", |
| blob_box.left(), blob_box.bottom(), |
| blob_box.right(), blob_box.top()); |
| } |
| |
| // A single classification unit can be composed of several UTF-8 |
| // characters. Append each of them to the result. |
| for (int sub = 0; |
| sub < word->best_choice->unichar_lengths()[index]; ++sub) { |
| char ch = word->best_choice->unichar_string()[offset + sub]; |
| // Tesseract uses space for recognition failure. Fix to a reject |
| // character, kTesseractReject so we don't create illegal box files. |
| if (ch == ' ') |
| ch = kTesseractReject; |
| word_str[output_size++] = ch; |
| } |
| sprintf(word_str + output_size, " %d %d %d %d\n", |
| blob_box.left() + left, blob_box.bottom() + bottom, |
| blob_box.right() + left, blob_box.top() + bottom); |
| output_size += strlen(word_str + output_size); |
| } |
| } |
| return output_size; |
| } |
| |
| // Multiplier for max expected text length: assumes typically 4 numbers @ |
| // (5 digits and a space) plus the newline = 4*(5+1)+1. Add to this the |
| // original UTF8 characters, and one kMaxCharsPerChar. |
| const int kCharsPerChar = 25; |
| // A maximal single box could occupy 4 numbers at 20 digits (for 64 bit) and a |
| // space plus the newline 4*(20+1)+1 and the maximum length of a UNICHAR. |
| // GetBoxText tests against this on each iteration for safety. |
| const int kMaxCharsPerChar = 85 + UNICHAR_LEN; |
| |
| // The recognized text is returned as a char* which is coded |
| // as a UTF8 box file and must be freed with the delete [] operator. |
| char* TessBaseAPI::GetBoxText() { |
| int bottom = image_height_ - (rect_top_ + rect_height_); |
| if (tesseract_ == NULL || |
| (page_res_ == NULL && Recognize(NULL) < 0)) |
| return NULL; |
| int blob_count; |
| int utf8_length = TextLength(&blob_count); |
| int total_length = blob_count*kCharsPerChar + utf8_length + kMaxCharsPerChar; |
| PAGE_RES_IT page_res_it(page_res_); |
| char* result = new char[total_length]; |
| char* ptr = result; |
| for (page_res_it.restart_page(); page_res_it.word () != NULL; |
| page_res_it.forward()) { |
| WERD_RES *word = page_res_it.word(); |
| ptr += ConvertWordToBoxText(word, page_res_it.row(), rect_left_, bottom, |
| ptr); |
| // Just in case... |
| if (ptr - result + kMaxCharsPerChar > total_length) |
| break; |
| } |
| *ptr = '\0'; |
| return result; |
| } |
| |
| // Conversion table for non-latin characters. |
| // Zero-terminated list of unicode values; entry i is replaced by kLatinChs[i]. |
| // TODO(rays) incorporate this translation into unicharset. |
| const int kUniChs[] = { |
| 0x20ac, 0x201c, 0x201d, 0x2018, 0x2019, 0x2022, 0x2014, 0 |
| }; |
| // Latin-1 replacements, index-aligned with the unicode values in kUniChs. |
| const int kLatinChs[] = { |
| 0x00a2, 0x0022, 0x0022, 0x0027, 0x0027, 0x00b7, 0x002d, 0 |
| }; |
| |
| // The recognized text is returned as a char* which is coded |
| // as UNLV format Latin-1 with specific reject and suspect codes |
| // and must be freed with the delete [] operator. |
| char* TessBaseAPI::GetUNLVText() { |
| if (tesseract_ == NULL || |
| (page_res_ == NULL && Recognize(NULL) < 0)) |
| return NULL; |
| bool tilde_crunch_written = false; |
| bool last_char_was_newline = true; |
| bool last_char_was_tilde = false; |
| |
| int total_length = TextLength(NULL); |
| PAGE_RES_IT page_res_it(page_res_); |
| char* result = new char[total_length]; |
| char* ptr = result; |
| for (page_res_it.restart_page(); page_res_it.word () != NULL; |
| page_res_it.forward()) { |
| WERD_RES *word = page_res_it.word(); |
| // Process the current word. |
| if (word->unlv_crunch_mode != CR_NONE) { |
| if (word->unlv_crunch_mode != CR_DELETE && |
| (!tilde_crunch_written || |
| (word->unlv_crunch_mode == CR_KEEP_SPACE && |
| word->word->space() > 0 && |
| !word->word->flag(W_FUZZY_NON) && |
| !word->word->flag(W_FUZZY_SP)))) { |
| if (!word->word->flag(W_BOL) && |
| word->word->space() > 0 && |
| !word->word->flag(W_FUZZY_NON) && |
| !word->word->flag(W_FUZZY_SP)) { |
| /* Write a space to separate from preceeding good text */ |
| *ptr++ = ' '; |
| last_char_was_tilde = false; |
| } |
| if (!last_char_was_tilde) { |
| // Write a reject char. |
| last_char_was_tilde = true; |
| *ptr++ = kUNLVReject; |
| tilde_crunch_written = true; |
| last_char_was_newline = false; |
| } |
| } |
| } else { |
| // NORMAL PROCESSING of non tilde crunched words. |
| tilde_crunch_written = false; |
| |
| if (word->word->flag(W_REP_CHAR) && tessedit_consistent_reps) |
| ensure_rep_chars_are_consistent(word); |
| |
| tesseract_->set_unlv_suspects(word); |
| const char* wordstr = word->best_choice->unichar_string().string(); |
| const STRING& lengths = word->best_choice->unichar_lengths(); |
| int length = lengths.length(); |
| int i = 0; |
| int offset = 0; |
| |
| if (last_char_was_tilde && |
| word->word->space() == 0 && wordstr[offset] == ' ') { |
| // Prevent adjacent tilde across words - we know that adjacent tildes |
| // within words have been removed. |
| // Skip the first character. |
| offset = lengths[i++]; |
| } |
| if (i < length && wordstr[offset] != 0) { |
| if (!last_char_was_newline) |
| *ptr++ = ' '; |
| else |
| last_char_was_newline = false; |
| for (; i < length; offset += lengths[i++]) { |
| if (wordstr[offset] == ' ' || |
| wordstr[offset] == kTesseractReject) { |
| *ptr++ = kUNLVReject; |
| last_char_was_tilde = true; |
| } else { |
| if (word->reject_map[i].rejected()) |
| *ptr++ = kUNLVSuspect; |
| UNICHAR ch(wordstr + offset, lengths[i]); |
| int uni_ch = ch.first_uni(); |
| for (int j = 0; kUniChs[j] != 0; ++j) { |
| if (kUniChs[j] == uni_ch) { |
| uni_ch = kLatinChs[j]; |
| break; |
| } |
| } |
| if (uni_ch <= 0xff) { |
| *ptr++ = static_cast<char>(uni_ch); |
| last_char_was_tilde = false; |
| } else { |
| *ptr++ = kUNLVReject; |
| last_char_was_tilde = true; |
| } |
| } |
| } |
| } |
| } |
| if (word->word->flag(W_EOL) && !last_char_was_newline) { |
| /* Add a new line output */ |
| *ptr++ = '\n'; |
| tilde_crunch_written = false; |
| last_char_was_newline = true; |
| last_char_was_tilde = false; |
| } |
| } |
| *ptr++ = '\n'; |
| *ptr = '\0'; |
| return result; |
| } |
| |
| // Returns the average word confidence for Tesseract page result. |
| int TessBaseAPI::MeanTextConf() { |
| int* conf = AllWordConfidences(); |
| if (!conf) return 0; |
| int sum = 0; |
| int *pt = conf; |
| while (*pt >= 0) sum += *pt++; |
| if (pt != conf) sum /= pt - conf; |
| delete [] conf; |
| return sum; |
| } |
| |
| // Returns an array of all word confidences, terminated by -1. |
| int* TessBaseAPI::AllWordConfidences() { |
| if (tesseract_ == NULL || |
| (page_res_ == NULL && Recognize(NULL) < 0)) |
| return NULL; |
| int n_word = 0; |
| PAGE_RES_IT res_it(page_res_); |
| for (res_it.restart_page(); res_it.word() != NULL; res_it.forward()) |
| n_word++; |
| |
| int* conf = new int[n_word+1]; |
| n_word = 0; |
| for (res_it.restart_page(); res_it.word() != NULL; res_it.forward()) { |
| WERD_RES *word = res_it.word(); |
| WERD_CHOICE* choice = word->best_choice; |
| int w_conf = static_cast<int>(100 + 5 * choice->certainty()); |
| // This is the eq for converting Tesseract confidence to 1..100 |
| if (w_conf < 0) w_conf = 0; |
| if (w_conf > 100) w_conf = 100; |
| conf[n_word++] = w_conf; |
| } |
| conf[n_word] = -1; |
| return conf; |
| } |
| |
| // Free up recognition results and any stored image data, without actually |
| // freeing any recognition data that would be time-consuming to reload. |
| // Afterwards, you must call SetImage or TesseractRect before doing |
| // any Recognize or Get* operation. |
| void TessBaseAPI::Clear() { |
| if (thresholder_ != NULL) |
| thresholder_->Clear(); |
| ClearResults(); |
| page_image.destroy(); |
| } |
| |
| // Close down tesseract and free up all memory. End() is equivalent to |
| // destructing and reconstructing your TessBaseAPI. |
| // Once End() has been used, none of the other API functions may be used |
| // other than Init and anything declared above it in the class definition. |
| void TessBaseAPI::End() { |
| if (thresholder_ != NULL) { |
| delete thresholder_; |
| thresholder_ = NULL; |
| } |
| if (page_res_ != NULL) { |
| delete page_res_; |
| page_res_ = NULL; |
| } |
| if (block_list_ != NULL) { |
| delete block_list_; |
| block_list_ = NULL; |
| } |
| if (tesseract_ != NULL) { |
| tesseract_->end_tesseract(); |
| delete tesseract_; |
| tesseract_ = NULL; |
| } |
| if (input_file_ != NULL) { |
| delete input_file_; |
| input_file_ = NULL; |
| } |
| if (output_file_ != NULL) { |
| delete output_file_; |
| output_file_ = NULL; |
| } |
| if (datapath_ != NULL) { |
| delete datapath_; |
| datapath_ = NULL; |
| } |
| if (language_ != NULL) { |
| delete language_; |
| language_ = NULL; |
| } |
| } |
| |
| // Check whether a word is valid according to Tesseract's language model |
| // returns 0 if the word is invalid, non-zero if valid |
| int TessBaseAPI::IsValidWord(const char *word) { |
| return tesseract_->getDict().valid_word(word); |
| } |
| |
| |
| bool TessBaseAPI::GetTextDirection(int* out_offset, float* out_slope) { |
| if (thresholder_ != NULL && !threshold_done_) |
| Threshold(); |
| if (page_res_ == NULL) |
| FindLines(); |
| if (block_list_->length() < 1) { |
| return false; |
| } |
| |
| // Get first block |
| BLOCK_IT block_it(block_list_); |
| block_it.move_to_first(); |
| ROW_LIST* rows = block_it.data()->row_list(); |
| if (rows->length() != 1) { |
| return false; |
| } |
| |
| // Get first line of block |
| ROW_IT row_it(rows); |
| row_it.move_to_first(); |
| ROW* row = row_it.data(); |
| |
| // Calculate offset and slope (NOTE: Kind of ugly) |
| *out_offset = static_cast<int>(row->base_line(0.0)); |
| *out_slope = row->base_line(1.0) - row->base_line(0.0); |
| |
| return true; |
| } |
| |
| // Set the letter_is_okay function to point somewhere else. |
| void TessBaseAPI::SetDictFunc(DictFunc f) { |
| if (tesseract_ != NULL) { |
| tesseract_->getDict().letter_is_okay_ = f; |
| } |
| } |
| |
| // Common code for setting the image. |
| bool TessBaseAPI::InternalSetImage() { |
| if (tesseract_ == NULL) { |
| tprintf("Please call Init before attempting to send an image."); |
| return false; |
| } |
| if (thresholder_ == NULL) |
| thresholder_ = new ImageThresholder; |
| ClearResults(); |
| return true; |
| } |
| |
| #ifndef HAVE_LIBLEPT |
| namespace tesseract { |
| void pgeditor_read_file(STRING &filename, |
| BLOCK_LIST *blocks, // block list to add to |
| Tesseract *tess) |
| { |
| STRING name = filename; //truncated name |
| const char *lastdot; //of name |
| TO_BLOCK_LIST land_blocks, port_blocks; |
| TBOX page_box; |
| |
| lastdot = strrchr (name.string (), '.'); |
| if (lastdot != NULL) |
| name[lastdot-name.string()] = '\0'; |
| if (!read_pd_file (name, page_image.get_xsize (), page_image.get_ysize (), |
| blocks)) { |
| segment_page(blocks); |
| } |
| find_components(blocks, &land_blocks, &port_blocks, &page_box); |
| textord_page(page_box.topright(), blocks, &land_blocks, &port_blocks, tess); |
| } |
| } |
| #endif//HAVE_LIBLEPT |
| |
| // Run the thresholder to make the thresholded image. |
| void TessBaseAPI::Threshold() { |
| thresholder_->ThresholdToIMAGE(&page_image); |
| thresholder_->GetImageSizes(&rect_left_, &rect_top_, |
| &rect_width_, &rect_height_, |
| &image_width_, &image_height_); |
| threshold_done_ = true; |
| } |
| |
| // Find lines from the image making the BLOCK_LIST. |
| int TessBaseAPI::FindLines() { |
| // The following call creates a full-page block and then runs connected |
| // component analysis and text line creation. |
| if (input_file_ == NULL) |
| input_file_ = new STRING(kInputFile); |
| if (tesseract_ == NULL) { |
| tesseract_ = new Tesseract; |
| tesseract_->InitAdaptiveClassifier(); |
| } |
| #ifdef HAVE_LIBLEPT |
| tesseract_->pgeditor_read_file(*input_file_, block_list_); |
| #else |
| tesseract::pgeditor_read_file(*input_file_, block_list_, tesseract_); |
| #endif |
| return 0; |
| } |
| |
| // Delete the pageres and clear the block list ready for a new page. |
| void TessBaseAPI::ClearResults() { |
| threshold_done_ = false; |
| if (page_res_ != NULL) { |
| delete page_res_; |
| page_res_ = NULL; |
| } |
| if (block_list_ == NULL) |
| block_list_ = new BLOCK_LIST; |
| else |
| block_list_->clear(); |
| } |
| |
| // Return the length of the output text string, as UTF8, assuming |
| // one newline per line and one per block, with a terminator, |
| // and assuming a single character reject marker for each rejected character. |
| // Also return the number of recognized blobs in blob_count. |
| int TessBaseAPI::TextLength(int* blob_count) { |
| if (tesseract_ == NULL || page_res_ == NULL) |
| return 0; |
| |
| PAGE_RES_IT page_res_it(page_res_); |
| int total_length = 2; |
| int total_blobs = 0; |
| // Iterate over the data structures to extract the recognition result. |
| for (page_res_it.restart_page(); page_res_it.word () != NULL; |
| page_res_it.forward()) { |
| WERD_RES *word = page_res_it.word(); |
| WERD_CHOICE* choice = word->best_choice; |
| if (choice != NULL) { |
| total_blobs += choice->length() + 1; |
| total_length += choice->unichar_string().length() + 1; |
| for (int i = 0; i < word->reject_map.length(); ++i) { |
| if (word->reject_map[i].rejected()) |
| ++total_length; |
| } |
| } |
| } |
| if (blob_count != NULL) |
| *blob_count = total_blobs; |
| return total_length; |
| } |
| |
| // Return the Orientation And Script |
| void TessBaseAPI::DetectOS(OSResults* osr) { |
| if (tesseract_ == NULL) |
| return; |
| ClearResults(); |
| Threshold(); |
| if (input_file_ == NULL) |
| input_file_ = new STRING(kInputFile); |
| orientation_and_script_detection(*input_file_, osr, tesseract_); |
| } |
| |
| // ____________________________________________________________________________ |
| // Ocropus add-ons. |
| |
| // Find lines from the image making the BLOCK_LIST. |
| BLOCK_LIST* TessBaseAPI::FindLinesCreateBlockList() { |
| FindLines(); |
| return block_list_; |
| } |
| |
| // Delete a block list. |
| // This is to keep BLOCK_LIST pointer opaque |
| // and let go of including the other headers. |
| void TessBaseAPI::DeleteBlockList(BLOCK_LIST *block_list) { |
| delete block_list; |
| } |
| |
| |
| static ROW *make_tess_ocrrow(float baseline, |
| float xheight, |
| float descender, |
| float ascender) { |
| inT32 xstarts[] = {-32000}; |
| double quad_coeffs[] = {0, 0, baseline}; |
| return new ROW(1, |
| xstarts, |
| quad_coeffs, |
| xheight, |
| ascender - (baseline + xheight), |
| descender - baseline, |
| 0, |
| 0); |
| } |
| |
| // Almost a copy of make_tess_row() from ccmain/tstruct.cpp. |
| static void fill_dummy_row(float baseline, float xheight, |
| float descender, float ascender, |
| TEXTROW* tessrow) { |
| tessrow->baseline.segments = 1; |
| tessrow->baseline.xstarts[0] = -32767; |
| tessrow->baseline.xstarts[1] = 32767; |
| tessrow->baseline.quads[0].a = 0; |
| tessrow->baseline.quads[0].b = 0; |
| tessrow->baseline.quads[0].c = bln_baseline_offset; |
| tessrow->xheight.segments = 1; |
| tessrow->xheight.xstarts[0] = -32767; |
| tessrow->xheight.xstarts[1] = 32767; |
| tessrow->xheight.quads[0].a = 0; |
| tessrow->xheight.quads[0].b = 0; |
| tessrow->xheight.quads[0].c = bln_baseline_offset + bln_x_height; |
| tessrow->lineheight = bln_x_height; |
| tessrow->ascrise = bln_x_height * (ascender - (xheight + baseline)) / xheight; |
| tessrow->descdrop = bln_x_height * (descender - baseline) / xheight; |
| } |
| |
| |
| // Return a TBLOB * from the whole page_image. |
| // To be freed later with free_blob(). |
| TBLOB *make_tesseract_blob(float baseline, float xheight, |
| float descender, float ascender) { |
| BLOCK *block = new BLOCK("a character", |
| TRUE, |
| 0, 0, |
| 0, 0, |
| page_image.get_xsize(), |
| page_image.get_ysize()); |
| |
| // Create C_BLOBs from the page |
| extract_edges( |
| #ifndef GRAPHICS_DISABLED |
| NULL, |
| #endif |
| &page_image, &page_image, |
| ICOORD(page_image.get_xsize(), page_image.get_ysize()), |
| block); |
| |
| // Create one PBLOB from all C_BLOBs |
| C_BLOB_LIST *list = block->blob_list(); |
| C_BLOB_IT c_blob_it(list); |
| PBLOB *pblob = new PBLOB; // will be (hopefully) deleted by the pblob_list |
| for (c_blob_it.mark_cycle_pt(); |
| !c_blob_it.cycled_list(); |
| c_blob_it.forward()) { |
| C_BLOB *c_blob = c_blob_it.data(); |
| PBLOB c_as_p(c_blob, baseline + xheight); |
| merge_blobs(pblob, &c_as_p); |
| } |
| PBLOB_LIST *pblob_list = new PBLOB_LIST; // will be deleted by the word |
| PBLOB_IT pblob_it(pblob_list); |
| pblob_it.add_after_then_move(pblob); |
| |
| // Normalize PBLOB |
| WERD word(pblob_list, 0, " "); |
| ROW *row = make_tess_ocrrow(baseline, xheight, descender, ascender); |
| word.baseline_normalise(row); |
| delete row; |
| |
| // Create a TBLOB from PBLOB |
| return make_tess_blob(pblob, /* flatten: */ TRUE); |
| } |
| |
| |
| // Adapt to recognize the current image as the given character. |
| // The image must be preloaded and be just an image of a single character. |
| void TessBaseAPI::AdaptToCharacter(const char *unichar_repr, |
| int length, |
| float baseline, |
| float xheight, |
| float descender, |
| float ascender) { |
| UNICHAR_ID id = tesseract_->unicharset.unichar_to_id(unichar_repr, length); |
| LINE_STATS LineStats; |
| TEXTROW row; |
| fill_dummy_row(baseline, xheight, descender, ascender, &row); |
| GetLineStatsFromRow(&row, &LineStats); |
| |
| TBLOB *blob = make_tesseract_blob(baseline, xheight, descender, ascender); |
| float threshold; |
| UNICHAR_ID best_class = 0; |
| float best_rating = -100; |
| |
| |
| // Classify to get a raw choice. |
| BLOB_CHOICE_LIST choices; |
| tesseract_->AdaptiveClassifier(blob, NULL, &row, &choices); |
| BLOB_CHOICE_IT choice_it; |
| choice_it.set_to_list(&choices); |
| for (choice_it.mark_cycle_pt(); !choice_it.cycled_list(); |
| choice_it.forward()) { |
| if (choice_it.data()->rating() > best_rating) { |
| best_rating = choice_it.data()->rating(); |
| best_class = choice_it.data()->unichar_id(); |
| } |
| } |
| |
| if (id == best_class) { |
| threshold = matcher_good_threshold; |
| } else { |
| /* the blob was incorrectly classified - find the rating threshold |
| needed to create a template which will correct the error with |
| some margin. However, don't waste time trying to make |
| templates which are too tight. */ |
| threshold = tesseract_->GetBestRatingFor(blob, &LineStats, id); |
| threshold *= .9; |
| const float max_threshold = .125; |
| const float min_threshold = .02; |
| |
| if (threshold > max_threshold) |
| threshold = max_threshold; |
| |
| // I have cuddled the following line to set it out of the strike |
| // of the coverage testing tool. I have no idea how to trigger |
| // this situation nor I have any necessity to do it. --mezhirov |
| if (threshold < min_threshold) threshold = min_threshold; |
| } |
| |
| if (blob->outlines) |
| tesseract_->AdaptToChar(blob, &LineStats, id, threshold); |
| free_blob(blob); |
| } |
| |
| |
| PAGE_RES* TessBaseAPI::RecognitionPass1(BLOCK_LIST* block_list) { |
| PAGE_RES *page_res = new PAGE_RES(block_list); |
| tesseract_->recog_all_words(page_res, NULL, NULL, 1); |
| return page_res; |
| } |
| |
| PAGE_RES* TessBaseAPI::RecognitionPass2(BLOCK_LIST* block_list, |
| PAGE_RES* pass1_result) { |
| if (!pass1_result) |
| pass1_result = new PAGE_RES(block_list); |
| tesseract_->recog_all_words(pass1_result, NULL, NULL, 2); |
| return pass1_result; |
| } |
| |
| struct TESS_CHAR : ELIST_LINK { |
| char *unicode_repr; |
| int length; // of unicode_repr |
| float cost; |
| TBOX box; |
| |
| TESS_CHAR(float _cost, const char *repr, int len = -1) : cost(_cost) { |
| length = (len == -1 ? strlen(repr) : len); |
| unicode_repr = new char[length + 1]; |
| strncpy(unicode_repr, repr, length); |
| } |
| |
| TESS_CHAR() { // Satisfies ELISTIZE. |
| } |
| ~TESS_CHAR() { |
| delete [] unicode_repr; |
| } |
| }; |
| |
| ELISTIZEH(TESS_CHAR) |
| ELISTIZE(TESS_CHAR) |
| |
| static void add_space(TESS_CHAR_IT* it) { |
| TESS_CHAR *t = new TESS_CHAR(0, " "); |
| it->add_after_then_move(t); |
| } |
| |
| |
// Converts a rating into a cost by adding 100 and flooring the result at 0,
// so ratings below -100 all map to a cost of 0.
static float rating_to_cost(float rating) {
  const float shifted = 100 + rating;
  return (shifted < 0) ? 0 : shifted;
}
| |
| |
| // Extract the OCR results, costs (penalty points for uncertainty), |
| // and the bounding boxes of the characters. |
| static void extract_result(TESS_CHAR_IT* out, |
| PAGE_RES* page_res) { |
| PAGE_RES_IT page_res_it(page_res); |
| int word_count = 0; |
| while (page_res_it.word() != NULL) { |
| WERD_RES *word = page_res_it.word(); |
| const char *str = word->best_choice->unichar_string().string(); |
| const char *len = word->best_choice->unichar_lengths().string(); |
| |
| if (word_count) |
| add_space(out); |
| TBOX bln_rect; |
| PBLOB_LIST *blobs = word->outword->blob_list(); |
| PBLOB_IT it(blobs); |
| int n = strlen(len); |
| TBOX** boxes_to_fix = new TBOX*[n]; |
| for (int i = 0; i < n; i++) { |
| PBLOB *blob = it.data(); |
| TBOX current = blob->bounding_box(); |
| bln_rect = bln_rect.bounding_union(current); |
| TESS_CHAR *tc = new TESS_CHAR(rating_to_cost(word->best_choice->rating()), |
| str, *len); |
| tc->box = current; |
| boxes_to_fix[i] = &tc->box; |
| |
| out->add_after_then_move(tc); |
| it.forward(); |
| str += *len; |
| len++; |
| } |
| |
| // Find the word bbox before normalization. |
| // Here we can't use the C_BLOB bboxes directly, |
| // since connected letters are not yet cut. |
| TBOX real_rect = word->word->bounding_box(); |
| |
| // Denormalize boxes by transforming the bbox of the whole bln word |
| // into the denorm bbox (`real_rect') of the whole word. |
| double x_stretch = static_cast<double>(real_rect.width()) |
| / bln_rect.width(); |
| double y_stretch = static_cast<double>(real_rect.height()) |
| / bln_rect.height(); |
| for (int j = 0; j < n; j++) { |
| TBOX *box = boxes_to_fix[j]; |
| int x0 = static_cast<int>(real_rect.left() + |
| x_stretch * (box->left() - bln_rect.left()) + 0.5); |
| int x1 = static_cast<int>(real_rect.left() + |
| x_stretch * (box->right() - bln_rect.left()) + 0.5); |
| int y0 = static_cast<int>(real_rect.bottom() + |
| y_stretch * (box->bottom() - bln_rect.bottom()) + 0.5); |
| int y1 = static_cast<int>(real_rect.bottom() + |
| y_stretch * (box->top() - bln_rect.bottom()) + 0.5); |
| *box = TBOX(ICOORD(x0, y0), ICOORD(x1, y1)); |
| } |
| delete [] boxes_to_fix; |
| |
| page_res_it.forward(); |
| word_count++; |
| } |
| } |
| |
| |
| // Extract the OCR results, costs (penalty points for uncertainty), |
| // and the bounding boxes of the characters. |
| int TessBaseAPI::TesseractExtractResult(char** text, |
| int** lengths, |
| float** costs, |
| int** x0, |
| int** y0, |
| int** x1, |
| int** y1, |
| PAGE_RES* page_res) { |
| TESS_CHAR_LIST tess_chars; |
| TESS_CHAR_IT tess_chars_it(&tess_chars); |
| extract_result(&tess_chars_it, page_res); |
| tess_chars_it.move_to_first(); |
| int n = tess_chars.length(); |
| int text_len = 0; |
| *lengths = new int[n]; |
| *costs = new float[n]; |
| *x0 = new int[n]; |
| *y0 = new int[n]; |
| *x1 = new int[n]; |
| *y1 = new int[n]; |
| int i = 0; |
| for (tess_chars_it.mark_cycle_pt(); |
| !tess_chars_it.cycled_list(); |
| tess_chars_it.forward(), i++) { |
| TESS_CHAR *tc = tess_chars_it.data(); |
| text_len += (*lengths)[i] = tc->length; |
| (*costs)[i] = tc->cost; |
| (*x0)[i] = tc->box.left(); |
| (*y0)[i] = tc->box.bottom(); |
| (*x1)[i] = tc->box.right(); |
| (*y1)[i] = tc->box.top(); |
| } |
| char *p = *text = new char[text_len]; |
| |
| tess_chars_it.move_to_first(); |
| for (tess_chars_it.mark_cycle_pt(); |
| !tess_chars_it.cycled_list(); |
| tess_chars_it.forward()) { |
| TESS_CHAR *tc = tess_chars_it.data(); |
| strncpy(p, tc->unicode_repr, tc->length); |
| p += tc->length; |
| } |
| return n; |
| } |
| |
| // This method returns the features associated with the current image. |
| // Make sure setimage has been called before calling this method. |
| void TessBaseAPI::GetFeatures(INT_FEATURE_ARRAY int_features, |
| int* num_features) { |
| if (page_res_ != NULL) |
| ClearResults(); |
| if (!threshold_done_) |
| Threshold(); |
| // We have only one block, which is of the size of the page. |
| BLOCK_LIST* blocks = new BLOCK_LIST; |
| BLOCK *block = new BLOCK("", // filename. |
| TRUE, // proportional. |
| 0, // kerning. |
| 0, // spacing. |
| 0, // Left. |
| 0, // Bottom. |
| page_image.get_xsize(), // Right. |
| page_image.get_ysize()); // Top. |
| ICOORD bleft, tright; |
| block->bounding_box (bleft, tright); |
| |
| BLOCK_IT block_it_add = blocks; |
| block_it_add.add_to_end(block); |
| |
| ICOORD page_tr(page_image.get_xsize(), page_image.get_ysize()); |
| TEXTROW tessrow; |
| make_tess_row(NULL, // Denormalizer. |
| &tessrow); // Output row. |
| LINE_STATS line_stats; |
| GetLineStatsFromRow(&tessrow, &line_stats); |
| |
| // Perform a CC analysis to detect the blobs. |
| BLOCK_IT block_it = blocks; |
| for (block_it.mark_cycle_pt (); !block_it.cycled_list (); |
| block_it.forward ()) { |
| BLOCK* block = block_it.data(); |
| #ifndef GRAPHICS_DISABLED |
| extract_edges(NULL, // Scrollview window. |
| &page_image, // Image. |
| &page_image, // Thresholded image. |
| page_tr, // corner of page. |
| block); // block. |
| #else |
| extract_edges(&page_image, // Image. |
| &page_image, // Thresholded image. |
| page_tr, // corner of page. |
| block); // block. |
| #endif |
| C_BLOB_IT blob_it = block->blob_list(); |
| PBLOB *pblob = new PBLOB; |
| // Iterate over all blobs found and get their features. |
| for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); |
| blob_it.forward()) { |
| C_BLOB* blob = blob_it.data(); |
| blob = blob; |
| PBLOB c_as_p(blob, page_image.get_ysize()); |
| merge_blobs(pblob, &c_as_p); |
| } |
| |
| PBLOB_LIST *pblob_list = new PBLOB_LIST; |
| PBLOB_IT pblob_it(pblob_list); |
| pblob_it.add_after_then_move(pblob); |
| WERD word(pblob_list, // Blob list. |
| 0, // Blanks in front. |
| " "); // Correct text. |
| ROW *row = make_tess_ocrrow(0, // baseline. |
| page_image.get_ysize(), // xheight. |
| 0, // ascent. |
| 0); // descent. |
| word.baseline_normalise(row); |
| delete row; |
| if (pblob->out_list () == NULL) { |
| tprintf("Blob list is empty"); |
| } |
| TBLOB* tblob = make_tess_blob(pblob, // Blob. |
| TRUE); // Flatten. |
| |
| CLASS_NORMALIZATION_ARRAY norm_array; |
| inT32 len; |
| *num_features = tesseract_->GetCharNormFeatures( |
| tblob, &line_stats, |
| tesseract_->PreTrainedTemplates, |
| int_features, norm_array, &len); |
| } |
| delete blocks; |
| } |
| |
| } // namespace tesseract. |