| /********************************************************************** |
| * File: baseapi.cpp |
| * Description: Simple API for calling tesseract. |
| * Author: Ray Smith |
| * Created: Fri Oct 06 15:35:01 PDT 2006 |
| * |
| * (C) Copyright 2006, Google Inc. |
| ** Licensed under the Apache License, Version 2.0 (the "License"); |
| ** you may not use this file except in compliance with the License. |
| ** You may obtain a copy of the License at |
| ** http://www.apache.org/licenses/LICENSE-2.0 |
| ** Unless required by applicable law or agreed to in writing, software |
| ** distributed under the License is distributed on an "AS IS" BASIS, |
| ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| ** See the License for the specific language governing permissions and |
| ** limitations under the License. |
| * |
| **********************************************************************/ |
| |
| // Include automatically generated configuration file if running autoconf. |
| #ifdef HAVE_CONFIG_H |
| #include "config_auto.h" |
| #endif |
| |
| #ifdef HAVE_LIBLEPT |
| // Include leptonica library only if autoconf (or makefile etc) tell us to. |
| #include "allheaders.h" |
| #endif |
| |
| #include "baseapi.h" |
| |
| #include "thresholder.h" |
| #include "tesseractmain.h" |
| #include "tesseractclass.h" |
| #include "tessedit.h" |
| #include "ocrclass.h" |
| #include "pageres.h" |
| #include "tessvars.h" |
| #include "control.h" |
| #include "applybox.h" |
| #include "pgedit.h" |
| #include "varabled.h" |
| #include "output.h" |
| #include "mainblk.h" |
| #include "globals.h" |
| #include "adaptmatch.h" |
| #include "edgblob.h" |
| #include "tessbox.h" |
| #include "tordvars.h" |
| #include "imgs.h" |
| #include "makerow.h" |
| #include "tstruct.h" |
| #include "tessout.h" |
| #include "tface.h" |
| #include "permute.h" |
| #include "otsuthr.h" |
| #include "osdetect.h" |
| #include "chopper.h" |
| #include "matchtab.h" |
| |
| namespace tesseract { |
| |
// ---- Size limits --------------------------------------------------------
// Rectangles narrower or shorter than this many pixels are not considered
// worth running tesseract on.
const int kMinRectSize = 10;

// ---- Reject / suspect markers -------------------------------------------
// Emitted by Tesseract for something it could not recognize at all.
const char kTesseractReject = '~';
// Reject character understood by the UNLV error counter.
const char kUNLVReject = '~';
// Suspect marker understood by UNLV.
const char kUNLVSuspect = '^';

// ---- Default names ------------------------------------------------------
// Stand-in name for the input image file when SetInputName has not supplied
// one; a name to search for a possible UNLV zone file is derived from it.
const char* kInputFile = "noname.tif";
| |
| TessBaseAPI::TessBaseAPI() |
| : tesseract_(NULL), |
| // Thresholder is initialized to NULL here, but will be set before use by: |
| // A constructor of a derived API, SetThresholder(), or |
| // created implicitly when used in InternalSetImage. |
| thresholder_(NULL), |
| threshold_done_(false), |
| block_list_(NULL), |
| page_res_(NULL), |
| input_file_(NULL), |
| output_file_(NULL), |
| datapath_(NULL), |
| language_(NULL), |
| rect_left_(0), rect_top_(0), rect_width_(0), rect_height_(0), |
| image_width_(0), image_height_(0) { |
| } |
| |
| TessBaseAPI::~TessBaseAPI() { |
| End(); |
| } |
| |
| // Set the name of the input file. Needed only for training and |
| // loading a UNLV zone file. |
| void TessBaseAPI::SetInputName(const char* name) { |
| if (input_file_ == NULL) |
| input_file_ = new STRING(name); |
| else |
| *input_file_ = name; |
| } |
| |
| // Set the name of the output files. Needed only for debugging. |
| void TessBaseAPI::SetOutputName(const char* name) { |
| if (output_file_ == NULL) |
| output_file_ = new STRING(name); |
| else |
| *output_file_ = name; |
| } |
| |
| // Set the value of an internal "variable" (of either old or new types). |
| // Supply the name of the variable and the value as a string, just as |
| // you would in a config file. |
| // Returns false if the name lookup failed. |
| // SetVariable may be used before Init, to set things that control |
| // initialization, but note that on End all settings are lost and |
| // the next Init will use the defaults unless SetVariable is used again. |
| bool TessBaseAPI::SetVariable(const char* variable, const char* value) { |
| if (tesseract_ == NULL) |
| tesseract_ = new Tesseract; |
| return set_variable(variable, value); |
| } |
| |
| // The datapath must be the name of the data directory (no ending /) or |
| // some other file in which the data directory resides (for instance argv[0].) |
| // The language is (usually) an ISO 639-3 string or NULL will default to eng. |
| // If numeric_mode is true, then only digits and Roman numerals will |
| // be returned. |
| // Returns 0 on success and -1 on initialization failure. |
| int TessBaseAPI::Init(const char* datapath, const char* language, |
| char **configs, int configs_size, |
| bool configs_global_only) { |
| // If the datapath or the language have changed, then start again. |
| // Note that the language_ field stores the last requested language that was |
| // initialized successfully, while tesseract_->lang stores the language |
| // actually used. They differ only if the requested language was NULL, in |
| // which case tesseract_->lang is set to the Tesseract default ("eng"). |
| if (tesseract_ != NULL && |
| (datapath_ == NULL || language_ == NULL || *datapath_ != datapath |
| || (*language_ != language && tesseract_->lang != language))) { |
| tesseract_->end_tesseract(); |
| delete tesseract_; |
| tesseract_ = NULL; |
| } |
| |
| bool reset_classifier = true; |
| if (tesseract_ == NULL) { |
| reset_classifier = false; |
| tesseract_ = new Tesseract; |
| if (tesseract_->init_tesseract( |
| datapath, output_file_ != NULL ? output_file_->string() : NULL, |
| language, configs, configs_size, configs_global_only) != 0) { |
| return -1; |
| } |
| } |
| // Update datapath and language requested for the last valid initialization. |
| if (datapath_ == NULL) |
| datapath_ = new STRING(datapath); |
| else |
| *datapath_ = datapath; |
| if (language_ == NULL) |
| language_ = new STRING(language); |
| else |
| *language_ = language; |
| |
| // For same language and datapath, just reset the adaptive classifier. |
| if (reset_classifier) tesseract_->ResetAdaptiveClassifier(); |
| |
| return 0; |
| } |
| |
| // Init only the lang model component of Tesseract. The only functions |
| // that work after this init are SetVariable and IsValidWord. |
| // WARNING: temporary! This function will be removed from here and placed |
| // in a separate API at some future time. |
| int TessBaseAPI::InitLangMod(const char* datapath, const char* language) { |
| if (tesseract_ == NULL) |
| tesseract_ = new Tesseract; |
| return tesseract_->init_tesseract_lm(datapath, NULL, language); |
| } |
| |
| // Init only the classifer component of Tesseract. Used to initialize the |
| // specified language when no dawg models are available. |
| int TessBaseAPI::InitWithoutLangModel(const char* datapath, |
| const char* language) { |
| // If the datapath or the language have changed, then start again. |
| if (tesseract_ != NULL && |
| (datapath_ == NULL || language_ == NULL || |
| *datapath_ != datapath || *language_ != language)) { |
| tesseract_->end_tesseract(); |
| delete tesseract_; |
| tesseract_ = NULL; |
| } |
| if (datapath_ == NULL) |
| datapath_ = new STRING(datapath); |
| else |
| *datapath_ = datapath; |
| if (language_ == NULL) |
| language_ = new STRING(language); |
| else |
| *language_ = language; |
| if (tesseract_ == NULL) { |
| tesseract_ = new Tesseract; |
| return tesseract_->init_tesseract_classifier( |
| datapath, output_file_ != NULL ? output_file_->string() : NULL, |
| language, NULL, 0, false); |
| } |
| // For same language and datapath, just reset the adaptive classifier. |
| tesseract_->ResetAdaptiveClassifier(); |
| return 0; |
| } |
| |
| // Read a "config" file containing a set of variable, value pairs. |
| // Searches the standard places: tessdata/configs, tessdata/tessconfigs |
| // and also accepts a relative or absolute path name. |
| void TessBaseAPI::ReadConfigFile(const char* filename, bool global_only) { |
| tesseract_->read_config_file(filename, global_only); |
| } |
| |
| // Set the current page segmentation mode. Defaults to PSM_AUTO. |
| // The mode is stored as an INT_VARIABLE so it can also be modified by |
| // ReadConfigFile or SetVariable("tessedit_pageseg_mode", mode as string). |
| void TessBaseAPI::SetPageSegMode(PageSegMode mode) { |
| if (tesseract_ == NULL) |
| tesseract_ = new Tesseract; |
| tesseract_->tessedit_pageseg_mode.set_value(mode); |
| } |
| |
| // Return the current page segmentation mode. |
| PageSegMode TessBaseAPI::GetPageSegMode() const { |
| if (tesseract_ == NULL) |
| return PSM_SINGLE_BLOCK; |
| return static_cast<PageSegMode>( |
| static_cast<int>(tesseract_->tessedit_pageseg_mode)); |
| } |
| |
| // Set the hint for trading accuracy against speed. |
| // Default is AVS_FASTEST, which is the old behaviour. |
| // Note that this is only a hint. Depending on the language and/or |
| // build configuration, speed and accuracy may not be tradeable. |
| // Also note that despite being an enum, any value in the range |
| // AVS_FASTEST to AVS_MOST_ACCURATE can be provided, and may or may not |
| // have an effect, depending on the implementation. |
| // The mode is stored as an INT_VARIABLE so it can also be modified by |
| // ReadConfigFile or SetVariable("tessedit_accuracyvspeed", mode as string). |
| void TessBaseAPI::SetAccuracyVSpeed(AccuracyVSpeed mode) { |
| if (tesseract_ == NULL) |
| tesseract_ = new Tesseract; |
| tesseract_->tessedit_accuracyvspeed.set_value(mode); |
| } |
| |
| // Recognize a rectangle from an image and return the result as a string. |
| // May be called many times for a single Init. |
| // Currently has no error checking. |
| // Greyscale of 8 and color of 24 or 32 bits per pixel may be given. |
| // Palette color images will not work properly and must be converted to |
| // 24 bit. |
| // Binary images of 1 bit per pixel may also be given but they must be |
| // byte packed with the MSB of the first byte being the first pixel, and a |
| // one pixel is WHITE. For binary images set bytes_per_pixel=0. |
| // The recognized text is returned as a char* which is coded |
| // as UTF8 and must be freed with the delete [] operator. |
| char* TessBaseAPI::TesseractRect(const unsigned char* imagedata, |
| int bytes_per_pixel, |
| int bytes_per_line, |
| int left, int top, |
| int width, int height) { |
| if (tesseract_ == NULL || width < kMinRectSize || height < kMinRectSize) |
| return NULL; // Nothing worth doing. |
| |
| // Since this original api didn't give the exact size of the image, |
| // we have to invent a reasonable value. |
| int bits_per_pixel = bytes_per_pixel == 0 ? 1 : bytes_per_pixel * 8; |
| SetImage(imagedata, bytes_per_line * 8 / bits_per_pixel, height, |
| bytes_per_pixel, bytes_per_line); |
| SetRectangle(left, top, width, height); |
| |
| return GetUTF8Text(); |
| } |
| |
| // Call between pages or documents etc to free up memory and forget |
| // adaptive data. |
| void TessBaseAPI::ClearAdaptiveClassifier() { |
| if (tesseract_ == NULL) |
| return; |
| tesseract_->ResetAdaptiveClassifier(); |
| } |
| |
| // Provide an image for Tesseract to recognize. Format is as |
| // TesseractRect above. Does not copy the image buffer, or take |
| // ownership. The source image may be destroyed after Recognize is called, |
| // either explicitly or implicitly via one of the Get*Text functions. |
| // SetImage clears all recognition results, and sets the rectangle to the |
| // full image, so it may be followed immediately by a GetUTF8Text, and it |
| // will automatically perform recognition. |
| void TessBaseAPI::SetImage(const unsigned char* imagedata, |
| int width, int height, |
| int bytes_per_pixel, int bytes_per_line) { |
| if (InternalSetImage()) |
| thresholder_->SetImage(imagedata, width, height, |
| bytes_per_pixel, bytes_per_line); |
| } |
| |
| // Provide an image for Tesseract to recognize. As with SetImage above, |
| // Tesseract doesn't take a copy or ownership or pixDestroy the image, so |
| // it must persist until after Recognize. |
| // Pix vs raw, which to use? |
| // Use Pix where possible. A future version of Tesseract may choose to use Pix |
| // as its internal representation and discard IMAGE altogether. |
| // Because of that, an implementation that sources and targets Pix may end up |
| // with less copies than an implementation that does not. |
| void TessBaseAPI::SetImage(const Pix* pix) { |
| #ifdef HAVE_LIBLEPT |
| if (InternalSetImage()) |
| thresholder_->SetImage(pix); |
| #endif |
| } |
| |
| // Restrict recognition to a sub-rectangle of the image. Call after SetImage. |
| // Each SetRectangle clears the recogntion results so multiple rectangles |
| // can be recognized with the same image. |
| void TessBaseAPI::SetRectangle(int left, int top, int width, int height) { |
| if (thresholder_ == NULL) |
| return; |
| thresholder_->SetRectangle(left, top, width, height); |
| ClearResults(); |
| } |
| |
| // ONLY available if you have Leptonica installed. |
| // Get a copy of the internal thresholded image from Tesseract. |
| Pix* TessBaseAPI::GetThresholdedImage() { |
| #ifdef HAVE_LIBLEPT |
| if (tesseract_ == NULL) |
| return NULL; |
| if (tesseract_->pix_binary() == NULL) |
| Threshold(tesseract_->mutable_pix_binary()); |
| return pixClone(tesseract_->pix_binary()); |
| #else |
| return NULL; |
| #endif |
| } |
| |
| // Get the result of page layout analysis as a leptonica-style |
| // Boxa, Pixa pair, in reading order. |
| // Can be called before or after Recognize. |
| // For now only gets text regions. |
| Boxa* TessBaseAPI::GetRegions(Pixa** pixa) { |
| #ifdef HAVE_LIBLEPT |
| if (block_list_ == NULL || block_list_->empty()) { |
| FindLines(); |
| } |
| int im_height = pixGetHeight(tesseract_->pix_binary()); |
| Boxa* boxa = boxaCreate(block_list_->length()); |
| if (pixa != NULL) { |
| *pixa = pixaCreate(boxaGetCount(boxa)); |
| } |
| BLOCK_IT it(block_list_); |
| for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) { |
| BLOCK* block = it.data(); |
| POLY_BLOCK* poly = block->poly_block(); |
| TBOX box; |
| if (poly != NULL) { |
| if (!poly->IsText()) |
| continue; // Use only text blocks. |
| POLY_BLOCK image_block(poly->points(), poly->isA()); |
| image_block.rotate(block->re_rotation()); |
| box = *image_block.bounding_box(); |
| if (pixa != NULL) { |
| Pix* pix = pixCreate(box.width(), box.height(), 1); |
| PB_LINE_IT *lines; |
| // Block outline is a polygon, so use a PC_LINE_IT to get the |
| // rasterized interior. (Runs of interior pixels on a line.) |
| lines = new PB_LINE_IT(&image_block); |
| for (int y = box.bottom(); y < box.top(); ++y) { |
| ICOORDELT_LIST* segments = lines->get_line(y); |
| if (!segments->empty()) { |
| ICOORDELT_IT s_it(segments); |
| // Each element of segments is a start x and x size of the |
| // run of interior pixels. |
| for (s_it.mark_cycle_pt(); !s_it.cycled_list(); s_it.forward()) { |
| int start = s_it.data()->x(); |
| int xext = s_it.data()->y(); |
| // Copy the run from the source image to the block image. |
| pixRasterop(pix, start - box.left(), |
| box.height() - 1 - (y - box.bottom()), |
| xext, 1, PIX_SRC, tesseract_->pix_binary(), |
| start, im_height - 1 - y); |
| } |
| } |
| delete segments; |
| } |
| delete lines; |
| pixaAddPix(*pixa, pix, L_INSERT); |
| } |
| } else { |
| if (!block_list_->singleton()) |
| continue; // A null poly block can only be used if it is the only block. |
| box = block->bounding_box(); |
| if (pixa != NULL) { |
| Pix* pix = pixCreate(box.width(), box.height(), 1); |
| // Just copy the whole block as there is only a bounding box. |
| pixRasterop(pix, 0, 0, box.width(), box.height(), |
| PIX_SRC, tesseract_->pix_binary(), |
| box.left(), im_height - box.top()); |
| pixaAddPix(*pixa, pix, L_INSERT); |
| } |
| } |
| Box* lbox = boxCreate(box.left(), im_height - box.top(), |
| box.width(), box.height()); |
| boxaAddBox(boxa, lbox, L_INSERT); |
| } |
| return boxa; |
| #else |
| return NULL; |
| #endif |
| } |
| |
| // Get the textlines as a leptonica-style |
| // Boxa, Pixa pair, in reading order. |
| // Can be called before or after Recognize. |
| // If blockids is not NULL, the block-id of each line is also returned as an |
| // array of one element per line. delete [] after use. |
| Boxa* TessBaseAPI::GetTextlines(Pixa** pixa, int** blockids) { |
| #ifdef HAVE_LIBLEPT |
| if (block_list_ == NULL || block_list_->empty()) { |
| FindLines(); |
| } |
| // A local PAGE_RES prevents the clear if Recognize is called after. |
| PAGE_RES page_res(block_list_); |
| PAGE_RES_IT page_res_it(page_res_ != NULL ? page_res_ : &page_res); |
| // Count the lines to get a size for the arrays. |
| int line_count = 0; |
| for (page_res_it.restart_page(); page_res_it.word() != NULL; |
| page_res_it.forward()) { |
| if (page_res_it.row() != page_res_it.next_row()) { |
| ++line_count; |
| } |
| } |
| |
| int im_height = pixGetHeight(tesseract_->pix_binary()); |
| Boxa* boxa = boxaCreate(line_count); |
| if (pixa != NULL) |
| *pixa = pixaCreate(line_count); |
| if (blockids != NULL) |
| *blockids = new int[line_count]; |
| int blockid = 0; |
| int lineindex = 0; |
| for (page_res_it.restart_page(); page_res_it.word() != NULL; |
| page_res_it.forward(), ++lineindex) { |
| WERD_RES *word = page_res_it.word(); |
| BLOCK* block = page_res_it.block()->block; |
| // Get the line bounding box. |
| PAGE_RES_IT word_it(page_res_it); // Save start of line. |
| TBOX line_box = word->word->bounding_box(); |
| while (page_res_it.next_row() == page_res_it.row()) { |
| page_res_it.forward(); |
| word = page_res_it.word(); |
| TBOX word_box = word->word->bounding_box(); |
| word_box.rotate(block->re_rotation()); |
| line_box += word_box; |
| } |
| Box* lbox = boxCreate(line_box.left(), im_height - line_box.top(), |
| line_box.width(), line_box.height()); |
| boxaAddBox(boxa, lbox, L_INSERT); |
| if (pixa != NULL) { |
| Pix* pix = pixCreate(line_box.width(), line_box.height(), 1); |
| // Copy all the words to the output pix. |
| while (word_it.row() == page_res_it.row()) { |
| word = word_it.word(); |
| TBOX word_box = word->word->bounding_box(); |
| word_box.rotate(block->re_rotation()); |
| pixRasterop(pix, word_box.left() - line_box.left(), |
| line_box.top() - word_box.top(), |
| word_box.width(), word_box.height(), |
| PIX_SRC, tesseract_->pix_binary(), |
| word_box.left(), im_height - word_box.top()); |
| word_it.forward(); |
| } |
| pixaAddPix(*pixa, pix, L_INSERT); |
| pixaAddBox(*pixa, lbox, L_CLONE); |
| } |
| if (blockids != NULL) { |
| (*blockids)[lineindex] = blockid; |
| if (page_res_it.block() != page_res_it.next_block()) |
| ++blockid; |
| } |
| } |
| return boxa; |
| #else |
| return NULL; |
| #endif |
| } |
| |
| // Get the words as a leptonica-style |
| // Boxa, Pixa pair, in reading order. |
| // Can be called before or after Recognize. |
| Boxa* TessBaseAPI::GetWords(Pixa** pixa) { |
| #ifdef HAVE_LIBLEPT |
| if (block_list_ == NULL || block_list_->empty()) { |
| FindLines(); |
| } |
| // A local PAGE_RES prevents the clear if Recognize is called after. |
| PAGE_RES page_res(block_list_); |
| PAGE_RES_IT page_res_it(page_res_ != NULL ? page_res_ : &page_res); |
| // Count the words to get a size for the arrays. |
| int word_count = 0; |
| for (page_res_it.restart_page(); page_res_it.word () != NULL; |
| page_res_it.forward()) |
| ++word_count; |
| |
| int im_height = pixGetHeight(tesseract_->pix_binary()); |
| Boxa* boxa = boxaCreate(word_count); |
| if (pixa != NULL) { |
| *pixa = pixaCreate(word_count); |
| } |
| for (page_res_it.restart_page(); page_res_it.word () != NULL; |
| page_res_it.forward()) { |
| WERD_RES *word = page_res_it.word(); |
| BLOCK* block = page_res_it.block()->block; |
| TBOX box = word->word->bounding_box(); |
| box.rotate(block->re_rotation()); |
| Box* lbox = boxCreate(box.left(), im_height - box.top(), |
| box.width(), box.height()); |
| boxaAddBox(boxa, lbox, L_INSERT); |
| if (pixa != NULL) { |
| Pix* pix = pixCreate(box.width(), box.height(), 1); |
| // Copy the whole word bounding box to the output pix. |
| pixRasterop(pix, 0, 0, box.width(), box.height(), |
| PIX_SRC, tesseract_->pix_binary(), |
| box.left(), im_height - box.top()); |
| pixaAddPix(*pixa, pix, L_INSERT); |
| pixaAddBox(*pixa, lbox, L_CLONE); |
| } |
| } |
| return boxa; |
| #else |
| return NULL; |
| #endif // HAVE_LIBLEPT |
| } |
| |
| // Dump the internal binary image to a PGM file. |
| void TessBaseAPI::DumpPGM(const char* filename) { |
| if (tesseract_ == NULL) |
| return; |
| IMAGELINE line; |
| line.init(page_image.get_xsize()); |
| FILE *fp = fopen(filename, "w"); |
| fprintf(fp, "P5 " INT32FORMAT " " INT32FORMAT " 255\n", |
| page_image.get_xsize(), page_image.get_ysize()); |
| for (int j = page_image.get_ysize()-1; j >= 0 ; --j) { |
| page_image.get_line(0, j, page_image.get_xsize(), &line, 0); |
| for (int i = 0; i < page_image.get_xsize(); ++i) { |
| uinT8 b = line.pixels[i] ? 255 : 0; |
| fwrite(&b, 1, 1, fp); |
| } |
| } |
| fclose(fp); |
| } |
| |
| // Recognize the tesseract global image and return the result as Tesseract |
| // internal structures. |
| int TessBaseAPI::Recognize(struct ETEXT_STRUCT* monitor) { |
| if (tesseract_ == NULL) |
| return -1; |
| if (thresholder_ == NULL || thresholder_->IsEmpty()) { |
| tprintf("Please call SetImage before attempting recognition."); |
| return -1; |
| } |
| if (page_res_ != NULL) |
| ClearResults(); |
| if (FindLines() != 0) |
| return -1; |
| if (tesseract_->tessedit_resegment_from_boxes) |
| tesseract_->apply_boxes(*input_file_, block_list_); |
| tesseract_->SetBlackAndWhitelist(); |
| |
| page_res_ = new PAGE_RES(block_list_); |
| int result = 0; |
| if (interactive_mode) { |
| #ifndef GRAPHICS_DISABLED |
| tesseract_->pgeditor_main(block_list_); |
| #endif |
| // The page_res is invalid after an interactive session, so cleanup |
| // in a way that lets us continue to the next page without crashing. |
| delete page_res_; |
| page_res_ = NULL; |
| return -1; |
| } else if (tesseract_->tessedit_train_from_boxes) { |
| apply_box_training(*output_file_, block_list_); |
| } else if (tesseract_->global_tessedit_ambigs_training) { |
| FILE *ambigs_output_file = tesseract_->init_ambigs_training(*input_file_); |
| // OCR the page segmented into words by tesseract. |
| tesseract_->ambigs_training_segmented( |
| *input_file_, page_res_, monitor, ambigs_output_file); |
| fclose(ambigs_output_file); |
| } else { |
| // Now run the main recognition. |
| // Running base tesseract if the inttemp for the current language loaded. |
| if (tesseract_->inttemp_loaded_) { |
| tesseract_->recog_all_words(page_res_, monitor); |
| } |
| } |
| return result; |
| } |
| |
| // Tests the chopper by exhaustively running chop_one_blob. |
| int TessBaseAPI::RecognizeForChopTest(struct ETEXT_STRUCT* monitor) { |
| if (tesseract_ == NULL) |
| return -1; |
| if (thresholder_ == NULL || thresholder_->IsEmpty()) { |
| tprintf("Please call SetImage before attempting recognition."); |
| return -1; |
| } |
| if (page_res_ != NULL) |
| ClearResults(); |
| if (FindLines() != 0) |
| return -1; |
| // Additional conditions under which chopper test cannot be run |
| if (tesseract_->tessedit_train_from_boxes_word_level || interactive_mode) |
| return -1; |
| ASSERT_HOST(tesseract_->inttemp_loaded_); |
| |
| page_res_ = new PAGE_RES(block_list_); |
| |
| PAGE_RES_IT page_res_it(page_res_); |
| |
| tesseract_->tess_matcher = &Tesseract::tess_default_matcher; |
| tesseract_->tess_tester = NULL; |
| tesseract_->tess_trainer = NULL; |
| |
| while (page_res_it.word() != NULL) { |
| WERD_RES *word_res = page_res_it.word(); |
| WERD *word = word_res->word; |
| if (word->cblob_list()->empty()) { |
| page_res_it.forward(); |
| continue; |
| } |
| WERD *bln_word = make_bln_copy(word, page_res_it.row()->row, |
| page_res_it.block()->block, |
| word_res->x_height, &word_res->denorm); |
| ASSERT_HOST(!bln_word->blob_list()->empty()); |
| TWERD *tessword = make_tess_word(bln_word, NULL); |
| if (tessword->blobs == NULL) { |
| make_tess_word(bln_word, NULL); |
| } |
| TBLOB *pblob; |
| TBLOB *blob; |
| init_match_table(); |
| BLOB_CHOICE_LIST *match_result; |
| BLOB_CHOICE_LIST_VECTOR *char_choices = new BLOB_CHOICE_LIST_VECTOR(); |
| tesseract_->tess_denorm = &word_res->denorm; |
| tesseract_->tess_word = bln_word; |
| ASSERT_HOST(tessword->blobs != NULL); |
| for (blob = tessword->blobs, pblob = NULL; |
| blob != NULL; blob = blob->next) { |
| match_result = tesseract_->classify_blob(pblob, blob, blob->next, NULL, |
| "chop_word:", Green); |
| if (match_result == NULL) |
| tprintf("Null classifier output!\n"); |
| tesseract_->modify_blob_choice(match_result, 0); |
| ASSERT_HOST(!match_result->empty()); |
| *char_choices += match_result; |
| pblob = blob; |
| } |
| inT32 blob_number; |
| SEAMS seam_list = start_seam_list(tessword->blobs); |
| int right_chop_index = 0; |
| while (tesseract_->chop_one_blob(tessword, char_choices, |
| &blob_number, &seam_list, |
| &right_chop_index)) { |
| } |
| |
| word_res->best_choice = new WERD_CHOICE(); |
| word_res->raw_choice = new WERD_CHOICE(); |
| word_res->best_choice->make_bad(); |
| word_res->raw_choice->make_bad(); |
| tesseract_->getDict().permute_characters(*char_choices, 1000.0, |
| word_res->best_choice, |
| word_res->raw_choice); |
| |
| word_res->outword = make_ed_word(tessword, bln_word); |
| page_res_it.forward(); |
| } |
| return 0; |
| } |
| |
| // Make a text string from the internal data structures. |
| char* TessBaseAPI::GetUTF8Text() { |
| if (tesseract_ == NULL || |
| (page_res_ == NULL && Recognize(NULL) < 0)) |
| return NULL; |
| int total_length = TextLength(NULL); |
| PAGE_RES_IT page_res_it(page_res_); |
| char* result = new char[total_length]; |
| char* ptr = result; |
| for (page_res_it.restart_page(); page_res_it.word () != NULL; |
| page_res_it.forward()) { |
| WERD_RES *word = page_res_it.word(); |
| WERD_CHOICE* choice = word->best_choice; |
| if (choice != NULL) { |
| strcpy(ptr, choice->unichar_string().string()); |
| ptr += choice->unichar_string().length(); |
| if (word->word->flag(W_EOL)) |
| *ptr++ = '\n'; |
| else |
| *ptr++ = ' '; |
| } |
| } |
| *ptr++ = '\n'; |
| *ptr = '\0'; |
| return result; |
| } |
| |
| static int ConvertWordToBoxText(WERD_RES *word, |
| ROW_RES* row, |
| int left, |
| int bottom, |
| char* word_str) { |
| // Copy the output word and denormalize it back to image coords. |
| WERD copy_outword; |
| copy_outword = *(word->outword); |
| copy_outword.baseline_denormalise(&word->denorm); |
| PBLOB_IT blob_it; |
| blob_it.set_to_list(copy_outword.blob_list()); |
| int length = copy_outword.blob_list()->length(); |
| int output_size = 0; |
| |
| if (length > 0) { |
| for (int index = 0, offset = 0; index < length; |
| offset += word->best_choice->unichar_lengths()[index++], |
| blob_it.forward()) { |
| PBLOB* blob = blob_it.data(); |
| TBOX blob_box = blob->bounding_box(); |
| if (word->tess_failed || |
| blob_box.left() < 0 || |
| blob_box.right() > page_image.get_xsize() || |
| blob_box.bottom() < 0 || |
| blob_box.top() > page_image.get_ysize()) { |
| // Bounding boxes can be illegal when tess fails on a word. |
| blob_box = word->word->bounding_box(); // Use original word as backup. |
| tprintf("Using substitute bounding box at (%d,%d)->(%d,%d)\n", |
| blob_box.left(), blob_box.bottom(), |
| blob_box.right(), blob_box.top()); |
| } |
| |
| // A single classification unit can be composed of several UTF-8 |
| // characters. Append each of them to the result. |
| for (int sub = 0; |
| sub < word->best_choice->unichar_lengths()[index]; ++sub) { |
| char ch = word->best_choice->unichar_string()[offset + sub]; |
| // Tesseract uses space for recognition failure. Fix to a reject |
| // character, kTesseractReject so we don't create illegal box files. |
| if (ch == ' ') |
| ch = kTesseractReject; |
| word_str[output_size++] = ch; |
| } |
| sprintf(word_str + output_size, " %d %d %d %d\n", |
| blob_box.left() + left, blob_box.bottom() + bottom, |
| blob_box.right() + left, blob_box.top() + bottom); |
| output_size += strlen(word_str + output_size); |
| } |
| } |
| return output_size; |
| } |
| |
| // Multiplier for max expected textlength assumes typically 4 numbers @ |
| // (5 digits and a space) plus the newline = 4*(5+1)+1. Add to this the |
| // orginal UTF8 characters, and one kMaxCharsPerChar. |
| const int kCharsPerChar = 25; |
| // A maximal single box could occupy 4 numbers at 20 digits (for 64 bit) and a |
| // space plus the newline 4*(20+1)+1 and the maximum length of a UNICHAR. |
| // Test against this on each iteration for safety. |
| const int kMaxCharsPerChar = 85 + UNICHAR_LEN; |
| |
| // The recognized text is returned as a char* which is coded |
| // as a UTF8 box file and must be freed with the delete [] operator. |
| char* TessBaseAPI::GetBoxText() { |
| int bottom = image_height_ - (rect_top_ + rect_height_); |
| if (tesseract_ == NULL || |
| (page_res_ == NULL && Recognize(NULL) < 0)) |
| return NULL; |
| int blob_count; |
| int utf8_length = TextLength(&blob_count); |
| int total_length = blob_count*kCharsPerChar + utf8_length + kMaxCharsPerChar; |
| PAGE_RES_IT page_res_it(page_res_); |
| char* result = new char[total_length]; |
| char* ptr = result; |
| for (page_res_it.restart_page(); page_res_it.word () != NULL; |
| page_res_it.forward()) { |
| WERD_RES *word = page_res_it.word(); |
| ptr += ConvertWordToBoxText(word, page_res_it.row(), rect_left_, bottom, |
| ptr); |
| // Just in case... |
| if (ptr - result + kMaxCharsPerChar > total_length) |
| break; |
| } |
| *ptr = '\0'; |
| return result; |
| } |
| |
// Conversion table for non-latin characters: maps characters from outside
// the latin set onto characters inside it. kUniChs[i] is replaced by
// kLatinChs[i]; both tables end with a 0 sentinel.
// TODO(rays) incorporate this translation into unicharset.
const int kUniChs[] = {
  0x20ac,  // euro sign
  0x201c,  // left double quotation mark
  0x201d,  // right double quotation mark
  0x2018,  // left single quotation mark
  0x2019,  // right single quotation mark
  0x2022,  // bullet
  0x2014,  // em dash
  0        // end of table
};
// Latin chars corresponding, entry by entry, to the unicode chars above.
const int kLatinChs[] = {
  0x00a2,  // cent sign
  0x0022,  // quotation mark
  0x0022,  // quotation mark
  0x0027,  // apostrophe
  0x0027,  // apostrophe
  0x00b7,  // middle dot
  0x002d,  // hyphen-minus
  0        // end of table
};
| |
| // The recognized text is returned as a char* which is coded |
| // as UNLV format Latin-1 with specific reject and suspect codes |
| // and must be freed with the delete [] operator. |
| char* TessBaseAPI::GetUNLVText() { |
| if (tesseract_ == NULL || |
| (page_res_ == NULL && Recognize(NULL) < 0)) |
| return NULL; |
| bool tilde_crunch_written = false; |
| bool last_char_was_newline = true; |
| bool last_char_was_tilde = false; |
| |
| int total_length = TextLength(NULL); |
| PAGE_RES_IT page_res_it(page_res_); |
| char* result = new char[total_length]; |
| char* ptr = result; |
| for (page_res_it.restart_page(); page_res_it.word () != NULL; |
| page_res_it.forward()) { |
| WERD_RES *word = page_res_it.word(); |
| // Process the current word. |
| if (word->unlv_crunch_mode != CR_NONE) { |
| if (word->unlv_crunch_mode != CR_DELETE && |
| (!tilde_crunch_written || |
| (word->unlv_crunch_mode == CR_KEEP_SPACE && |
| word->word->space() > 0 && |
| !word->word->flag(W_FUZZY_NON) && |
| !word->word->flag(W_FUZZY_SP)))) { |
| if (!word->word->flag(W_BOL) && |
| word->word->space() > 0 && |
| !word->word->flag(W_FUZZY_NON) && |
| !word->word->flag(W_FUZZY_SP)) { |
| /* Write a space to separate from preceeding good text */ |
| *ptr++ = ' '; |
| last_char_was_tilde = false; |
| } |
| if (!last_char_was_tilde) { |
| // Write a reject char. |
| last_char_was_tilde = true; |
| *ptr++ = kUNLVReject; |
| tilde_crunch_written = true; |
| last_char_was_newline = false; |
| } |
| } |
| } else { |
| // NORMAL PROCESSING of non tilde crunched words. |
| tilde_crunch_written = false; |
| |
| if (word->word->flag(W_REP_CHAR) && tessedit_consistent_reps) |
| ensure_rep_chars_are_consistent(word); |
| |
| tesseract_->set_unlv_suspects(word); |
| const char* wordstr = word->best_choice->unichar_string().string(); |
| const STRING& lengths = word->best_choice->unichar_lengths(); |
| int length = lengths.length(); |
| int i = 0; |
| int offset = 0; |
| |
| if (last_char_was_tilde && |
| word->word->space() == 0 && wordstr[offset] == ' ') { |
| // Prevent adjacent tilde across words - we know that adjacent tildes |
| // within words have been removed. |
| // Skip the first character. |
| offset = lengths[i++]; |
| } |
| if (i < length && wordstr[offset] != 0) { |
| if (!last_char_was_newline) |
| *ptr++ = ' '; |
| else |
| last_char_was_newline = false; |
| for (; i < length; offset += lengths[i++]) { |
| if (wordstr[offset] == ' ' || |
| wordstr[offset] == kTesseractReject) { |
| *ptr++ = kUNLVReject; |
| last_char_was_tilde = true; |
| } else { |
| if (word->reject_map[i].rejected()) |
| *ptr++ = kUNLVSuspect; |
| UNICHAR ch(wordstr + offset, lengths[i]); |
| int uni_ch = ch.first_uni(); |
| for (int j = 0; kUniChs[j] != 0; ++j) { |
| if (kUniChs[j] == uni_ch) { |
| uni_ch = kLatinChs[j]; |
| break; |
| } |
| } |
| if (uni_ch <= 0xff) { |
| *ptr++ = static_cast<char>(uni_ch); |
| last_char_was_tilde = false; |
| } else { |
| *ptr++ = kUNLVReject; |
| last_char_was_tilde = true; |
| } |
| } |
| } |
| } |
| } |
| if (word->word->flag(W_EOL) && !last_char_was_newline) { |
| /* Add a new line output */ |
| *ptr++ = '\n'; |
| tilde_crunch_written = false; |
| last_char_was_newline = true; |
| last_char_was_tilde = false; |
| } |
| } |
| *ptr++ = '\n'; |
| *ptr = '\0'; |
| return result; |
| } |
| |
| // Returns the average word confidence for Tesseract page result. |
| int TessBaseAPI::MeanTextConf() { |
| int* conf = AllWordConfidences(); |
| if (!conf) return 0; |
| int sum = 0; |
| int *pt = conf; |
| while (*pt >= 0) sum += *pt++; |
| if (pt != conf) sum /= pt - conf; |
| delete [] conf; |
| return sum; |
| } |
| |
| // Returns an array of all word confidences, terminated by -1. |
| int* TessBaseAPI::AllWordConfidences() { |
| if (tesseract_ == NULL || |
| (page_res_ == NULL && Recognize(NULL) < 0)) |
| return NULL; |
| int n_word = 0; |
| PAGE_RES_IT res_it(page_res_); |
| for (res_it.restart_page(); res_it.word() != NULL; res_it.forward()) |
| n_word++; |
| |
| int* conf = new int[n_word+1]; |
| n_word = 0; |
| for (res_it.restart_page(); res_it.word() != NULL; res_it.forward()) { |
| WERD_RES *word = res_it.word(); |
| WERD_CHOICE* choice = word->best_choice; |
| int w_conf = static_cast<int>(100 + 5 * choice->certainty()); |
| // This is the eq for converting Tesseract confidence to 1..100 |
| if (w_conf < 0) w_conf = 0; |
| if (w_conf > 100) w_conf = 100; |
| conf[n_word++] = w_conf; |
| } |
| conf[n_word] = -1; |
| return conf; |
| } |
| |
| // Free up recognition results and any stored image data, without actually |
| // freeing any recognition data that would be time-consuming to reload. |
| // Afterwards, you must call SetImage or TesseractRect before doing |
| // any Recognize or Get* operation. |
| void TessBaseAPI::Clear() { |
| if (thresholder_ != NULL) |
| thresholder_->Clear(); |
| ClearResults(); |
| page_image.destroy(); |
| } |
| |
| // Close down tesseract and free up all memory. End() is equivalent to |
| // destructing and reconstructing your TessBaseAPI. |
| // Once End() has been used, none of the other API functions may be used |
| // other than Init and anything declared above it in the class definition. |
| void TessBaseAPI::End() { |
| if (thresholder_ != NULL) { |
| delete thresholder_; |
| thresholder_ = NULL; |
| } |
| if (page_res_ != NULL) { |
| delete page_res_; |
| page_res_ = NULL; |
| } |
| if (block_list_ != NULL) { |
| delete block_list_; |
| block_list_ = NULL; |
| } |
| if (tesseract_ != NULL) { |
| tesseract_->end_tesseract(); |
| delete tesseract_; |
| tesseract_ = NULL; |
| } |
| if (input_file_ != NULL) { |
| delete input_file_; |
| input_file_ = NULL; |
| } |
| if (output_file_ != NULL) { |
| delete output_file_; |
| output_file_ = NULL; |
| } |
| if (datapath_ != NULL) { |
| delete datapath_; |
| datapath_ = NULL; |
| } |
| if (language_ != NULL) { |
| delete language_; |
| language_ = NULL; |
| } |
| } |
| |
| // Check whether a word is valid according to Tesseract's language model |
| // returns 0 if the word is invalid, non-zero if valid |
| int TessBaseAPI::IsValidWord(const char *word) { |
| return tesseract_->getDict().valid_word(word); |
| } |
| |
| |
| bool TessBaseAPI::GetTextDirection(int* out_offset, float* out_slope) { |
| if (page_res_ == NULL) |
| FindLines(); |
| if (block_list_->length() < 1) { |
| return false; |
| } |
| |
| // Get first block |
| BLOCK_IT block_it(block_list_); |
| block_it.move_to_first(); |
| ROW_LIST* rows = block_it.data()->row_list(); |
| if (rows->length() != 1) { |
| return false; |
| } |
| |
| // Get first line of block |
| ROW_IT row_it(rows); |
| row_it.move_to_first(); |
| ROW* row = row_it.data(); |
| |
| // Calculate offset and slope (NOTE: Kind of ugly) |
| *out_offset = static_cast<int>(row->base_line(0.0)); |
| *out_slope = row->base_line(1.0) - row->base_line(0.0); |
| |
| return true; |
| } |
| |
| // Set the letter_is_okay function to point somewhere else. |
| void TessBaseAPI::SetDictFunc(DictFunc f) { |
| if (tesseract_ != NULL) { |
| tesseract_->getDict().letter_is_okay_ = f; |
| } |
| } |
| |
| // Common code for setting the image. |
| bool TessBaseAPI::InternalSetImage() { |
| if (tesseract_ == NULL) { |
| tprintf("Please call Init before attempting to send an image."); |
| return false; |
| } |
| if (thresholder_ == NULL) |
| thresholder_ = new ImageThresholder; |
| ClearResults(); |
| return true; |
| } |
| |
| // Run the thresholder to make the thresholded image. If pix is not NULL, |
| // the source is thresholded to pix instead of the internal IMAGE. |
| void TessBaseAPI::Threshold(Pix** pix) { |
| #ifdef HAVE_LIBLEPT |
| if (pix != NULL) |
| thresholder_->ThresholdToPix(pix); |
| else |
| thresholder_->ThresholdToIMAGE(&page_image); |
| #else |
| thresholder_->ThresholdToIMAGE(&page_image); |
| #endif |
| thresholder_->GetImageSizes(&rect_left_, &rect_top_, |
| &rect_width_, &rect_height_, |
| &image_width_, &image_height_); |
| threshold_done_ = true; |
| } |
| |
| // Find lines from the image making the BLOCK_LIST. |
| int TessBaseAPI::FindLines() { |
| if (!block_list_->empty()) { |
| return 0; |
| } |
| if (tesseract_ == NULL) { |
| tesseract_ = new Tesseract; |
| tesseract_->InitAdaptiveClassifier(); |
| } |
| #ifdef HAVE_LIBLEPT |
| if (tesseract_->pix_binary() == NULL) |
| Threshold(tesseract_->mutable_pix_binary()); |
| #endif |
| if (!threshold_done_) |
| Threshold(NULL); |
| |
| if (tesseract_->SegmentPage(input_file_, &page_image, block_list_) < 0) |
| return -1; |
| ASSERT_HOST(page_image.get_xsize() == rect_width_ || |
| page_image.get_xsize() == rect_width_ - 1); |
| ASSERT_HOST(page_image.get_ysize() == rect_height_ || |
| page_image.get_ysize() == rect_height_ - 1); |
| return 0; |
| } |
| |
| // Delete the pageres and clear the block list ready for a new page. |
| void TessBaseAPI::ClearResults() { |
| threshold_done_ = false; |
| if (tesseract_ != NULL) |
| tesseract_->Clear(); |
| if (page_res_ != NULL) { |
| delete page_res_; |
| page_res_ = NULL; |
| } |
| if (block_list_ == NULL) |
| block_list_ = new BLOCK_LIST; |
| else |
| block_list_->clear(); |
| } |
| |
| // Return the length of the output text string, as UTF8, assuming |
| // one newline per line and one per block, with a terminator, |
| // and assuming a single character reject marker for each rejected character. |
| // Also return the number of recognized blobs in blob_count. |
| int TessBaseAPI::TextLength(int* blob_count) { |
| if (tesseract_ == NULL || page_res_ == NULL) |
| return 0; |
| |
| PAGE_RES_IT page_res_it(page_res_); |
| int total_length = 2; |
| int total_blobs = 0; |
| // Iterate over the data structures to extract the recognition result. |
| for (page_res_it.restart_page(); page_res_it.word () != NULL; |
| page_res_it.forward()) { |
| WERD_RES *word = page_res_it.word(); |
| WERD_CHOICE* choice = word->best_choice; |
| if (choice != NULL) { |
| total_blobs += choice->length() + 1; |
| total_length += choice->unichar_string().length() + 1; |
| for (int i = 0; i < word->reject_map.length(); ++i) { |
| if (word->reject_map[i].rejected()) |
| ++total_length; |
| } |
| } |
| } |
| if (blob_count != NULL) |
| *blob_count = total_blobs; |
| return total_length; |
| } |
| |
| // Estimates the Orientation And Script of the image. |
| // Returns true if the image was processed successfully. |
| bool TessBaseAPI::DetectOS(OSResults* osr) { |
| if (tesseract_ == NULL) |
| return false; |
| ClearResults(); |
| Threshold(NULL); |
| if (input_file_ == NULL) |
| input_file_ = new STRING(kInputFile); |
| return orientation_and_script_detection(*input_file_, osr, tesseract_); |
| } |
| |
| // ____________________________________________________________________________ |
| // Ocropus add-ons. |
| |
| // Find lines from the image making the BLOCK_LIST. |
| BLOCK_LIST* TessBaseAPI::FindLinesCreateBlockList() { |
| FindLines(); |
| BLOCK_LIST* result = block_list_; |
| block_list_ = NULL; |
| return result; |
| } |
| |
| // Delete a block list. |
| // This is to keep BLOCK_LIST pointer opaque |
| // and let go of including the other headers. |
| void TessBaseAPI::DeleteBlockList(BLOCK_LIST *block_list) { |
| delete block_list; |
| } |
| |
| |
| static ROW *make_tess_ocrrow(float baseline, |
| float xheight, |
| float descender, |
| float ascender) { |
| inT32 xstarts[] = {-32000}; |
| double quad_coeffs[] = {0, 0, baseline}; |
| return new ROW(1, |
| xstarts, |
| quad_coeffs, |
| xheight, |
| ascender - (baseline + xheight), |
| descender - baseline, |
| 0, |
| 0); |
| } |
| |
| // Almost a copy of make_tess_row() from ccmain/tstruct.cpp. |
| static void fill_dummy_row(float baseline, float xheight, |
| float descender, float ascender, |
| TEXTROW* tessrow) { |
| tessrow->baseline.segments = 1; |
| tessrow->baseline.xstarts[0] = -32767; |
| tessrow->baseline.xstarts[1] = 32767; |
| tessrow->baseline.quads[0].a = 0; |
| tessrow->baseline.quads[0].b = 0; |
| tessrow->baseline.quads[0].c = bln_baseline_offset; |
| tessrow->xheight.segments = 1; |
| tessrow->xheight.xstarts[0] = -32767; |
| tessrow->xheight.xstarts[1] = 32767; |
| tessrow->xheight.quads[0].a = 0; |
| tessrow->xheight.quads[0].b = 0; |
| tessrow->xheight.quads[0].c = bln_baseline_offset + bln_x_height; |
| tessrow->lineheight = bln_x_height; |
| tessrow->ascrise = bln_x_height * (ascender - (xheight + baseline)) / xheight; |
| tessrow->descdrop = bln_x_height * (descender - baseline) / xheight; |
| } |
| |
| |
| // Return a TBLOB * from the whole page_image. |
| // To be freed later with free_blob(). |
| TBLOB *make_tesseract_blob(float baseline, float xheight, |
| float descender, float ascender) { |
| BLOCK *block = new BLOCK("a character", |
| TRUE, |
| 0, 0, |
| 0, 0, |
| page_image.get_xsize(), |
| page_image.get_ysize()); |
| |
| // Create C_BLOBs from the page |
| extract_edges( |
| #ifndef GRAPHICS_DISABLED |
| NULL, |
| #endif |
| &page_image, &page_image, |
| ICOORD(page_image.get_xsize(), page_image.get_ysize()), |
| block); |
| |
| // Create one PBLOB from all C_BLOBs |
| C_BLOB_LIST *list = block->blob_list(); |
| C_BLOB_IT c_blob_it(list); |
| PBLOB *pblob = new PBLOB; // will be (hopefully) deleted by the pblob_list |
| for (c_blob_it.mark_cycle_pt(); |
| !c_blob_it.cycled_list(); |
| c_blob_it.forward()) { |
| C_BLOB *c_blob = c_blob_it.data(); |
| PBLOB c_as_p(c_blob, baseline + xheight); |
| merge_blobs(pblob, &c_as_p); |
| } |
| PBLOB_LIST *pblob_list = new PBLOB_LIST; // will be deleted by the word |
| PBLOB_IT pblob_it(pblob_list); |
| pblob_it.add_after_then_move(pblob); |
| |
| // Normalize PBLOB |
| WERD word(pblob_list, 0, " "); |
| ROW *row = make_tess_ocrrow(baseline, xheight, descender, ascender); |
| word.baseline_normalise(row); |
| delete row; |
| |
| // Create a TBLOB from PBLOB |
| return make_tess_blob(pblob, /* flatten: */ TRUE); |
| } |
| |
| |
| // Adapt to recognize the current image as the given character. |
| // The image must be preloaded and be just an image of a single character. |
| void TessBaseAPI::AdaptToCharacter(const char *unichar_repr, |
| int length, |
| float baseline, |
| float xheight, |
| float descender, |
| float ascender) { |
| UNICHAR_ID id = tesseract_->unicharset.unichar_to_id(unichar_repr, length); |
| LINE_STATS LineStats; |
| TEXTROW row; |
| fill_dummy_row(baseline, xheight, descender, ascender, &row); |
| GetLineStatsFromRow(&row, &LineStats); |
| |
| TBLOB *blob = make_tesseract_blob(baseline, xheight, descender, ascender); |
| float threshold; |
| UNICHAR_ID best_class = 0; |
| float best_rating = -100; |
| |
| |
| // Classify to get a raw choice. |
| BLOB_CHOICE_LIST choices; |
| tesseract_->AdaptiveClassifier(blob, NULL, &row, &choices, NULL); |
| BLOB_CHOICE_IT choice_it; |
| choice_it.set_to_list(&choices); |
| for (choice_it.mark_cycle_pt(); !choice_it.cycled_list(); |
| choice_it.forward()) { |
| if (choice_it.data()->rating() > best_rating) { |
| best_rating = choice_it.data()->rating(); |
| best_class = choice_it.data()->unichar_id(); |
| } |
| } |
| |
| if (id == best_class) { |
| threshold = matcher_good_threshold; |
| } else { |
| /* the blob was incorrectly classified - find the rating threshold |
| needed to create a template which will correct the error with |
| some margin. However, don't waste time trying to make |
| templates which are too tight. */ |
| threshold = tesseract_->GetBestRatingFor(blob, &LineStats, id); |
| threshold *= .9; |
| const float max_threshold = .125; |
| const float min_threshold = .02; |
| |
| if (threshold > max_threshold) |
| threshold = max_threshold; |
| |
| // I have cuddled the following line to set it out of the strike |
| // of the coverage testing tool. I have no idea how to trigger |
| // this situation nor I have any necessity to do it. --mezhirov |
| if (threshold < min_threshold) threshold = min_threshold; |
| } |
| |
| if (blob->outlines) |
| tesseract_->AdaptToChar(blob, &LineStats, id, threshold); |
| free_blob(blob); |
| } |
| |
| |
| PAGE_RES* TessBaseAPI::RecognitionPass1(BLOCK_LIST* block_list) { |
| PAGE_RES *page_res = new PAGE_RES(block_list); |
| tesseract_->recog_all_words(page_res, NULL, NULL, 1); |
| return page_res; |
| } |
| |
| PAGE_RES* TessBaseAPI::RecognitionPass2(BLOCK_LIST* block_list, |
| PAGE_RES* pass1_result) { |
| if (!pass1_result) |
| pass1_result = new PAGE_RES(block_list); |
| tesseract_->recog_all_words(pass1_result, NULL, NULL, 2); |
| return pass1_result; |
| } |
| |
| struct TESS_CHAR : ELIST_LINK { |
| char *unicode_repr; |
| int length; // of unicode_repr |
| float cost; |
| TBOX box; |
| |
| TESS_CHAR(float _cost, const char *repr, int len = -1) : cost(_cost) { |
| length = (len == -1 ? strlen(repr) : len); |
| unicode_repr = new char[length + 1]; |
| strncpy(unicode_repr, repr, length); |
| } |
| |
| TESS_CHAR() { // Satisfies ELISTIZE. |
| } |
| ~TESS_CHAR() { |
| delete [] unicode_repr; |
| } |
| }; |
| |
| ELISTIZEH(TESS_CHAR) |
| ELISTIZE(TESS_CHAR) |
| |
| static void add_space(TESS_CHAR_IT* it) { |
| TESS_CHAR *t = new TESS_CHAR(0, " "); |
| it->add_after_then_move(t); |
| } |
| |
| |
// Maps a rating to a cost of 100 + 5 * rating, with negative results
// raised to 0.
static float rating_to_cost(float rating) {
  const float cost = 100 + 5 * rating;
  // (I have never seen ratings worse than -100,
  // but the check won't hurt)
  return cost < 0 ? 0 : cost;
}
| |
| |
| // Extract the OCR results, costs (penalty points for uncertainty), |
| // and the bounding boxes of the characters. |
| static void extract_result(TESS_CHAR_IT* out, |
| PAGE_RES* page_res) { |
| PAGE_RES_IT page_res_it(page_res); |
| int word_count = 0; |
| while (page_res_it.word() != NULL) { |
| WERD_RES *word = page_res_it.word(); |
| const char *str = word->best_choice->unichar_string().string(); |
| const char *len = word->best_choice->unichar_lengths().string(); |
| |
| if (word_count) |
| add_space(out); |
| TBOX bln_rect; |
| PBLOB_LIST *blobs = word->outword->blob_list(); |
| PBLOB_IT it(blobs); |
| int n = strlen(len); |
| TBOX** boxes_to_fix = new TBOX*[n]; |
| for (int i = 0; i < n; i++) { |
| PBLOB *blob = it.data(); |
| TBOX current = blob->bounding_box(); |
| bln_rect = bln_rect.bounding_union(current); |
| TESS_CHAR *tc = new TESS_CHAR(rating_to_cost(word->best_choice->certainty()), |
| str, *len); |
| tc->box = current; |
| boxes_to_fix[i] = &tc->box; |
| |
| out->add_after_then_move(tc); |
| it.forward(); |
| str += *len; |
| len++; |
| } |
| |
| // Find the word bbox before normalization. |
| // Here we can't use the C_BLOB bboxes directly, |
| // since connected letters are not yet cut. |
| TBOX real_rect = word->word->bounding_box(); |
| |
| // Denormalize boxes by transforming the bbox of the whole bln word |
| // into the denorm bbox (`real_rect') of the whole word. |
| double x_stretch = static_cast<double>(real_rect.width()) |
| / bln_rect.width(); |
| double y_stretch = static_cast<double>(real_rect.height()) |
| / bln_rect.height(); |
| for (int j = 0; j < n; j++) { |
| TBOX *box = boxes_to_fix[j]; |
| int x0 = static_cast<int>(real_rect.left() + |
| x_stretch * (box->left() - bln_rect.left()) + 0.5); |
| int x1 = static_cast<int>(real_rect.left() + |
| x_stretch * (box->right() - bln_rect.left()) + 0.5); |
| int y0 = static_cast<int>(real_rect.bottom() + |
| y_stretch * (box->bottom() - bln_rect.bottom()) + 0.5); |
| int y1 = static_cast<int>(real_rect.bottom() + |
| y_stretch * (box->top() - bln_rect.bottom()) + 0.5); |
| *box = TBOX(ICOORD(x0, y0), ICOORD(x1, y1)); |
| } |
| delete [] boxes_to_fix; |
| |
| page_res_it.forward(); |
| word_count++; |
| } |
| } |
| |
| |
| // Extract the OCR results, costs (penalty points for uncertainty), |
| // and the bounding boxes of the characters. |
| int TessBaseAPI::TesseractExtractResult(char** text, |
| int** lengths, |
| float** costs, |
| int** x0, |
| int** y0, |
| int** x1, |
| int** y1, |
| PAGE_RES* page_res) { |
| TESS_CHAR_LIST tess_chars; |
| TESS_CHAR_IT tess_chars_it(&tess_chars); |
| extract_result(&tess_chars_it, page_res); |
| tess_chars_it.move_to_first(); |
| int n = tess_chars.length(); |
| int text_len = 0; |
| *lengths = new int[n]; |
| *costs = new float[n]; |
| *x0 = new int[n]; |
| *y0 = new int[n]; |
| *x1 = new int[n]; |
| *y1 = new int[n]; |
| int i = 0; |
| for (tess_chars_it.mark_cycle_pt(); |
| !tess_chars_it.cycled_list(); |
| tess_chars_it.forward(), i++) { |
| TESS_CHAR *tc = tess_chars_it.data(); |
| text_len += (*lengths)[i] = tc->length; |
| (*costs)[i] = tc->cost; |
| (*x0)[i] = tc->box.left(); |
| (*y0)[i] = tc->box.bottom(); |
| (*x1)[i] = tc->box.right(); |
| (*y1)[i] = tc->box.top(); |
| } |
| char *p = *text = new char[text_len]; |
| |
| tess_chars_it.move_to_first(); |
| for (tess_chars_it.mark_cycle_pt(); |
| !tess_chars_it.cycled_list(); |
| tess_chars_it.forward()) { |
| TESS_CHAR *tc = tess_chars_it.data(); |
| strncpy(p, tc->unicode_repr, tc->length); |
| p += tc->length; |
| } |
| return n; |
| } |
| |
| // This method returns the features associated with the current image. |
| // Make sure setimage has been called before calling this method. |
| void TessBaseAPI::GetFeatures(INT_FEATURE_ARRAY int_features, |
| int* num_features) { |
| if (page_res_ != NULL) |
| ClearResults(); |
| if (!threshold_done_) |
| Threshold(NULL); |
| // We have only one block, which is of the size of the page. |
| BLOCK_LIST* blocks = new BLOCK_LIST; |
| BLOCK *block = new BLOCK("", // filename. |
| TRUE, // proportional. |
| 0, // kerning. |
| 0, // spacing. |
| 0, // Left. |
| 0, // Bottom. |
| page_image.get_xsize(), // Right. |
| page_image.get_ysize()); // Top. |
| ICOORD bleft, tright; |
| block->bounding_box (bleft, tright); |
| |
| BLOCK_IT block_it_add = blocks; |
| block_it_add.add_to_end(block); |
| |
| ICOORD page_tr(page_image.get_xsize(), page_image.get_ysize()); |
| TEXTROW tessrow; |
| make_tess_row(NULL, // Denormalizer. |
| &tessrow); // Output row. |
| LINE_STATS line_stats; |
| GetLineStatsFromRow(&tessrow, &line_stats); |
| |
| // Perform a CC analysis to detect the blobs. |
| BLOCK_IT block_it = blocks; |
| for (block_it.mark_cycle_pt (); !block_it.cycled_list (); |
| block_it.forward ()) { |
| BLOCK* block = block_it.data(); |
| #ifndef GRAPHICS_DISABLED |
| extract_edges(NULL, // Scrollview window. |
| &page_image, // Image. |
| &page_image, // Thresholded image. |
| page_tr, // corner of page. |
| block); // block. |
| #else |
| extract_edges(&page_image, // Image. |
| &page_image, // Thresholded image. |
| page_tr, // corner of page. |
| block); // block. |
| #endif |
| C_BLOB_IT blob_it = block->blob_list(); |
| PBLOB *pblob = new PBLOB; |
| // Iterate over all blobs found and get their features. |
| for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); |
| blob_it.forward()) { |
| C_BLOB* blob = blob_it.data(); |
| blob = blob; |
| PBLOB c_as_p(blob, page_image.get_ysize()); |
| merge_blobs(pblob, &c_as_p); |
| } |
| |
| PBLOB_LIST *pblob_list = new PBLOB_LIST; |
| PBLOB_IT pblob_it(pblob_list); |
| pblob_it.add_after_then_move(pblob); |
| WERD word(pblob_list, // Blob list. |
| 0, // Blanks in front. |
| " "); // Correct text. |
| ROW *row = make_tess_ocrrow(0, // baseline. |
| page_image.get_ysize(), // xheight. |
| 0, // ascent. |
| 0); // descent. |
| word.baseline_normalise(row); |
| delete row; |
| if (pblob->out_list () == NULL) { |
| tprintf("Blob list is empty"); |
| } |
| TBLOB* tblob = make_tess_blob(pblob, // Blob. |
| TRUE); // Flatten. |
| |
| CLASS_NORMALIZATION_ARRAY norm_array; |
| inT32 len; |
| *num_features = tesseract_->GetCharNormFeatures( |
| tblob, &line_stats, |
| tesseract_->PreTrainedTemplates, |
| int_features, norm_array, &len); |
| } |
| delete blocks; |
| } |
| |
| // Return the pointer to the i-th dawg loaded into tesseract_ object. |
| const Dawg *TessBaseAPI::GetDawg(int i) const { |
| if (tesseract_ == NULL || i >= NumDawgs()) return NULL; |
| return tesseract_->getDict().GetDawg(i); |
| } |
| |
| // Return the number of dawgs loaded into tesseract_ object. |
| int TessBaseAPI::NumDawgs() const { |
| return tesseract_ == NULL ? 0 : tesseract_->getDict().NumDawgs(); |
| } |
| |
| // Return the language used in the last valid initialization. |
| const char* TessBaseAPI::GetLastInitLanguage() const { |
| return (tesseract_ == NULL || tesseract_->lang.string() == NULL) ? |
| "" : tesseract_->lang.string(); |
| } |
| } // namespace tesseract. |