wordrec/chopper.cpp - platform/external/tesseract - Git at Google

 /* -*-C-*-
  ********************************************************************************
  *
  * File:        chopper.c  (Formerly chopper.c)
  * Description:
  * Author:       Mark Seaman, OCR Technology
  * Created:      Fri Oct 16 14:37:00 1987
  * Modified:     Tue Jul 30 16:18:52 1991 (Mark Seaman) marks@hpgrlt
  * Language:     C
  * Package:      N/A
  * Status:       Reusable Software Component
  *
  * (c) Copyright 1987, Hewlett-Packard Company.
  ** Licensed under the Apache License, Version 2.0 (the "License");
  ** you may not use this file except in compliance with the License.
  ** You may obtain a copy of the License at
  ** http://www.apache.org/licenses/LICENSE-2.0
  ** Unless required by applicable law or agreed to in writing, software
  ** distributed under the License is distributed on an "AS IS" BASIS,
  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  ** See the License for the specific language governing permissions and
  ** limitations under the License.
  *
  **************************************************************************/

 /*----------------------------------------------------------------------
           I n c l u d e s
 ----------------------------------------------------------------------*/
 #include <math.h>

 #include "chopper.h"

 #include "assert.h"
 #include "associate.h"
 #include "callcpp.h"
 #include "choices.h"
 #include "const.h"
 #include "findseam.h"
 #include "freelist.h"
 #include "globals.h"
 #include "makechop.h"
 #include "metrics.h"
 #include "render.h"
 #include "permute.h"
 #include "pieces.h"
 #include "seam.h"
 #include "stopper.h"
 #include "structures.h"
 #include "tordvars.h"
 #include "unicharset.h"
 #include "wordclass.h"
 #include "wordrec.h"

 // Flag defined elsewhere.  The retry loops in improve_one_blob and
 // improve_by_chopping stop iterating once it is non-zero.
 extern int blob_skip;
 // When non-zero, attempt_blob_chop marks a blob's outline points before
 // chopping and restores the outlines if the chop is rejected.
 INT_VAR (repair_unchopped_blobs, 1, "Fix blobs that aren't chopped");

 //?extern int tessedit_dangambigs_chop;
 // select_blob_to_split only considers blobs whose first choice has a
 // certainty below this value.
 double_VAR(tessedit_certainty_threshold, -2.25, "Good blob limit");

 // When set, and the current best choice reports a fragment mark,
 // improve_by_chopping asks select_blob_to_split to prefer blobs that sit
 // next to character fragments.
 BOOL_VAR(fragments_guide_chopper, FALSE,
          "Use information from fragments to guide chopping process");

 /*----------------------------------------------------------------------
           M a c r o s
 ----------------------------------------------------------------------*/
 /**********************************************************************
  * bounds_inside
  *
  * Evaluates to true when the box (inner_tl, inner_br) lies inside, or
  * coincides with, the box (outer_tl, outer_br).  Each argument must
  * provide .x and .y members.  The comparisons treat the "tl" corner as
  * having the smaller x and the larger y of a box.
  *
  * Every argument is parenthesized so that expressions such as *ptr can
  * be passed.  Each argument is expanded twice, so avoid arguments with
  * side effects.
  **********************************************************************/
 #define bounds_inside(inner_tl,inner_br,outer_tl,outer_br)  \
 (((inner_tl).x >= (outer_tl).x) && \
  ((inner_tl).y <= (outer_tl).y) && \
  ((inner_br).x <= (outer_br).x) && \
  ((inner_br).y >= (outer_br).y))

 /*----------------------------------------------------------------------
           F u n c t i o n s
 ----------------------------------------------------------------------*/
 /**********************************************************************
  * preserve_outline
  *
  * Tag every point of the closed loop beginning at start by setting
  * flags[1] to 1, then tag start itself with 2 so that restore_outline
  * can find it again.  A NULL start is ignored.
  **********************************************************************/
 void preserve_outline(EDGEPT *start) {
   EDGEPT *pt;

   if (start != NULL) {
     start->flags[1] = 1;
     for (pt = start->next; pt != start; pt = pt->next)
       pt->flags[1] = 1;
     /* The starting point gets its own distinguishing mark. */
     start->flags[1] = 2;
   }
 }


 /**********************************************************************
  * preserve_outline_tree
  *
  * Call preserve_outline on the loop of every outline in the sibling
  * list that starts at srcline, then recurse into srcline->child.
  * A NULL srcline (e.g. a blob without outlines) is a no-op.
  **********************************************************************/
 void preserve_outline_tree(TESSLINE *srcline) {
   TESSLINE *outline;

   if (srcline == NULL)
     return;

   for (outline = srcline; outline != NULL; outline = outline->next) {
     preserve_outline (outline->loop);
   }
   /* Only the child list of the first outline is descended into. */
   if (srcline->child != NULL)
     preserve_outline_tree (srcline->child);
 }


 /**********************************************************************
  * restore_outline
  *
  * Search the loop for the point whose flags[1] is 2 (the mark written
  * by preserve_outline); if none is found the search ends back at start.
  * Walking once around from that point, unlink every point whose
  * flags[1] is 0, recompute the vec of its predecessor so it reaches the
  * following point, and release the removed point with oldedgept.
  * Returns the point the walk started from, or NULL if start is NULL.
  **********************************************************************/
 EDGEPT *restore_outline(EDGEPT *start) {
   EDGEPT *anchor;
   EDGEPT *cursor;
   EDGEPT *following;
   EDGEPT *before;

   if (start == NULL)
     return NULL;

   /* Find the preserved starting point. */
   anchor = start;
   do {
     if (anchor->flags[1] == 2)
       break;
     anchor = anchor->next;
   } while (anchor != start);

   cursor = anchor;
   do {
     following = cursor->next;
     if (cursor->flags[1] == 0) {
       /* Splice the unmarked point out and bridge its neighbours. */
       before = cursor->prev;
       following->prev = before;
       before->next = following;
       before->vec.x = following->pos.x - before->pos.x;
       before->vec.y = following->pos.y - before->pos.y;
       oldedgept(cursor);
     }
     cursor = following;
   } while (cursor != anchor);

   return anchor;
 }


 /**********************************************************************
  * restore_outline_tree
  *
  * Run restore_outline on the loop of every outline in the sibling list
  * that starts at srcline, refresh each outline's start position from
  * its restored loop, then recurse into srcline->child.
  * A NULL srcline is a no-op, and an outline whose loop is NULL keeps
  * its current start position.
  **********************************************************************/
 void restore_outline_tree(TESSLINE *srcline) {
   TESSLINE *outline;

   if (srcline == NULL)
     return;

   for (outline = srcline; outline != NULL; outline = outline->next) {
     outline->loop = restore_outline (outline->loop);
     /* restore_outline returns NULL for an empty loop. */
     if (outline->loop != NULL)
       outline->start = outline->loop->pos;
   }
   /* Only the child list of the first outline is descended into. */
   if (srcline->child != NULL)
     restore_outline_tree (srcline->child);
 }


 /**********************************************************************
  * attempt_blob_chop
  *
  * Try to split blob number blob_number of word into two blobs.  A new
  * blob is linked in after the chosen one, a seam is picked with
  * pick_good_seam and applied, and the result is validated.  If any of
  * the checks fails the chop is undone, the outlines are restored (when
  * repair_unchopped_blobs is set) and NULL is returned.  On success the
  * seam that was applied is returned.
  **********************************************************************/
 SEAM *attempt_blob_chop(TWERD *word, inT32 blob_number, SEAMS seam_list) {
   TBLOB *blob;
   TBLOB *new_half;
   TBLOB *following;
   SEAM *seam;
   inT16 i;
   bool chop_ok;

   if (first_pass)
     chops_attempted1++;
   else
     chops_attempted2++;

   /* Walk to the blob that is to be chopped. */
   blob = word->blobs;
   for (i = 0; i < blob_number; i++)
     blob = blob->next;
   following = blob->next;

   if (repair_unchopped_blobs)
     preserve_outline_tree (blob->outlines);

   /* Link an empty blob in right after the one being chopped. */
   new_half = newblob ();
   new_half->next = blob->next;
   new_half->outlines = NULL;
   blob->next = new_half;

   seam = pick_good_seam (blob);
   if (chop_debug) {
     if (seam != NULL)
       print_seam ("Good seam picked=", seam);
     else
       cprintf ("\n** no seam picked *** \n");
   }
   if (seam)
     apply_seam(blob, new_half, seam);

   /* Every condition must hold, checked in this order, for the chop to
      be kept. */
   chop_ok = (seam != NULL) &&
     (blob->outlines != NULL) &&
     (new_half->outlines != NULL) &&
     !total_containment (blob, new_half) &&
     !check_blob (new_half) &&
     (check_seam_order (blob, seam) &&
     check_seam_order (new_half, seam)) &&
     !any_shared_split_points (seam_list, seam) &&
     test_insert_seam(seam_list, blob_number, blob, word->blobs);

   if (chop_ok)
     return (seam);

   /* Rejected: unlink the new blob and put things back. */
   blob->next = following;
   if (seam) {
     undo_seam(blob, new_half, seam);
     delete_seam(seam);
 #ifndef GRAPHICS_DISABLED
     if (chop_debug) {
       if (chop_debug >2)
         display_blob(blob, Red);
       cprintf ("\n** seam being removed ** \n");
     }
 #endif
   }
   else {
     oldblob(new_half);
   }

   if (repair_unchopped_blobs)
     restore_outline_tree (blob->outlines);
   return (NULL);
 }


 /**********************************************************************
  * any_shared_split_points
  *
  * Return TRUE if shared_split_points reports a shared point between
  * seam and any seam stored in seam_list, otherwise FALSE.
  **********************************************************************/
 int any_shared_split_points(SEAMS seam_list, SEAM *seam) {
   const int seam_count = array_count (seam_list);
   int i = 0;

   while (i < seam_count) {
     SEAM *candidate = (SEAM *) array_value (seam_list, i);
     if (shared_split_points (candidate, seam))
       return TRUE;
     ++i;
   }
   return FALSE;
 }


 /**********************************************************************
  * check_blob
  *
  * Return 1 if any outline of blob has a loop that is empty or runs
  * into a NULL next pointer before getting back to its first point;
  * return 0 when every loop closes on itself.
  **********************************************************************/
 int check_blob(TBLOB *blob) {
   TESSLINE *outline;
   EDGEPT *pt;

   for (outline = blob->outlines; outline != NULL; outline = outline->next) {
     pt = outline->loop;
     /* Follow the chain until it closes or breaks. */
     while (pt != NULL) {
       pt = pt->next;
       if (pt == outline->loop)
         break;
     }
     if (pt == NULL)
       return 1;
   }
   return 0;
 }


 /**********************************************************************
  * improve_one_blob
  *
  * Start with the current word of blobs and its classification.  Find
  * the worst blob and try to divide it up to improve the ratings.
  *********************************************************************/
 namespace tesseract {
 // Selects a blob with select_blob_to_split and tries to chop it with
 // attempt_blob_chop.  After a failed attempt the rating ceiling is lowered
 // to the rating of that blob's first choice so that a different blob is
 // selected next time; retries stop once blob_skip is set.
 //
 // On success the seam is inserted into *seam_list, the old choice list of
 // the chopped blob is deleted, and the two resulting blobs are classified
 // again, with their choice lists stored at *blob_number and
 // *blob_number + 1 in char_choices.
 //
 //   word                    word whose blobs are chopped
 //   char_choices            per-blob choice lists, updated in place
 //   fx, fixpt               not used by this function
 //   blob_number             out: index of the blob that was selected
 //   seam_list               in/out: seams of the word
 //   split_next_to_fragment  forwarded to select_blob_to_split
 //
 // Returns true if a blob was split.  Returns false if no blob could be
 // selected, if the selected blob has no choice list, or if no seam was
 // obtained before the retry loop ended.
 bool Wordrec::improve_one_blob(TWERD *word,
                                BLOB_CHOICE_LIST_VECTOR *char_choices,
                                int fx,
                                inT32 *blob_number,
                                SEAMS *seam_list,
                                DANGERR *fixpt,
                                bool split_next_to_fragment) {
   TBLOB *pblob;
   TBLOB *blob;
   inT16 x = 0;
   float rating_ceiling = MAX_FLOAT32;
   BLOB_CHOICE_LIST *answer;
   BLOB_CHOICE_IT answer_it;
   SEAM *seam = NULL;

   do {
     *blob_number = select_blob_to_split(*char_choices, rating_ceiling,
                                         split_next_to_fragment);
     if (chop_debug)
       cprintf("blob_number = %d\n", *blob_number);
     if (*blob_number == -1)
       return false;

     seam = attempt_blob_chop (word, *blob_number, *seam_list);
     if (seam != NULL)
       break;
     /* Must split null blobs */
     answer = char_choices->get(*blob_number);
     if (answer == NULL)
       return false;
     answer_it.set_to_list(answer);
     rating_ceiling = answer_it.data()->rating();  // try a different blob
   } while (!blob_skip);

   // The loop also ends when blob_skip is set right after a failed attempt.
   // There is no seam in that case, so nothing must be inserted.
   if (seam == NULL)
     return false;

   /* Split OK */
   for (blob = word->blobs, pblob = NULL; x < *blob_number; x++) {
     pblob = blob;
     blob = blob->next;
   }

   *seam_list =
     insert_seam (*seam_list, *blob_number, seam, blob, word->blobs);

   delete char_choices->get(*blob_number);

   answer = classify_blob(pblob, blob, blob->next, NULL, "improve 1:", Red);
   char_choices->insert(answer, *blob_number);

   answer = classify_blob(blob, blob->next, blob->next->next, NULL,
                          "improve 2:", Yellow);
   char_choices->set(answer, *blob_number + 1);

   return true;
 }
 }  // namespace tesseract


 /**********************************************************************
  * check_seam_order
  *
  * Make sure that each of the splits in this seam matches an outline
  * of this blob.  A split that is NULL counts as matched as soon as the
  * blob has at least one outline.  Returns FALSE if any split could not
  * be matched, TRUE otherwise.
  *
  * A seam without a first split, or a NULL blob, is accepted (TRUE)
  * without further checks.  seam itself must not be NULL.
  **********************************************************************/
 inT16 check_seam_order(TBLOB *blob, SEAM *seam) {
   TESSLINE *outline;
   inT8 found_em[3];

   if (seam->split1 == NULL || blob == NULL)
     return (TRUE);

   found_em[0] = found_em[1] = found_em[2] = FALSE;

   for (outline = blob->outlines; outline; outline = outline->next) {
     /* split1 is known to be non-NULL past the guard above. */
     if (!found_em[0] && is_split_outline (outline, seam->split1)) {
       found_em[0] = TRUE;
     }
     if (!found_em[1] &&
       ((seam->split2 == NULL) ||
     is_split_outline (outline, seam->split2))) {
       found_em[1] = TRUE;
     }
     if (!found_em[2] &&
       ((seam->split3 == NULL) ||
     is_split_outline (outline, seam->split3))) {
       found_em[2] = TRUE;
     }
   }

   if (!found_em[0] || !found_em[1] || !found_em[2])
     return (FALSE);
   else
     return (TRUE);
 }


 /**********************************************************************
  * chop_word_main
  *
  * Classify the blobs in this word and permute the results.  Find the
  * worst blob in the word and chop it up.  Continue this process until
  * a good answer has been found or all the blobs have been chopped up
  * enough.  Return the word level ratings.
  **********************************************************************/
 namespace tesseract {
 // Classifies every blob of word, permutes the per-blob choices into
 // best_choice / raw_choice and, if the result is not acceptable, chops
 // blobs (improve_by_chopping) and then re-associates the pieces
 // (word_associator).
 //
 //   word          word to recognize; word->correct is compared against the
 //                 best choice when tester or trainer is set
 //   fx            passed through to the chopping / association helpers
 //   best_choice   out: reset with make_bad() and then filled in
 //   raw_choice    out: reset with make_bad() and then filled in
 //   tester        non-zero forces chopping whenever the best choice string
 //   trainer       differs from word->correct
 //
 // Returns the choice lists produced by rebuild_current_state.
 //
 // Note: best_state and chop_states are function-local statics, so their
 // contents are shared between calls.
 BLOB_CHOICE_LIST_VECTOR *Wordrec::chop_word_main(TWERD *word,
                                                  int fx,
                                                  WERD_CHOICE *best_choice,
                                                  WERD_CHOICE *raw_choice,
                                                  BOOL8 tester,
                                                  BOOL8 trainer) {
   TBLOB *pblob;
   TBLOB *blob;
   int index;
   int did_chopping;
   float rating_limit = 1000.0;
   STATE state;
   SEAMS seam_list = start_seam_list(word->blobs);
   BLOB_CHOICE_LIST *match_result;
   MATRIX *ratings = NULL;
   DANGERR fixpt;                 /*dangerous ambig */
   inT32 state_count;             //no of states
   inT32 bit_count;               //no of bits
   static STATE best_state;
   static STATE chop_states[64];  //in between states

   state_count = 0;
   best_choice->make_bad();
   raw_choice->make_bad();

   BLOB_CHOICE_LIST_VECTOR *char_choices = new BLOB_CHOICE_LIST_VECTOR();

   // Initial classification: one choice list per blob, in blob order.
   did_chopping = 0;
   for (blob = word->blobs, pblob = NULL, index = 0;
        blob != NULL; blob = blob->next, index++) {
     match_result = classify_blob(pblob, blob, blob->next, NULL,
                                  "chop_word:", Green);
     if (match_result == NULL)
       cprintf("Null classifier output!\n");
     *char_choices += match_result;
     pblob = blob;
   }
   bit_count = index - 1;
   getDict().permute_characters(*char_choices, rating_limit,
                                best_choice, raw_choice);
   set_n_ones(&state, char_choices->length() - 1);
   if (matcher_fp != NULL) {
     bits_in_states = bit_count;
     chop_states[state_count] = state;
     state_count++;
   }
   // Chop only if the first answer is not good enough, or if it does not
   // match the expected text while testing / training.
   if (!getDict().AcceptableChoice(*char_choices, *best_choice, *raw_choice,
                                   &fixpt, CHOPPER_CALLER) ||
       ((tester || trainer) &&
        strcmp(word->correct, best_choice->unichar_string().string()))) {
     did_chopping = 1;
     if (first_pass)
       words_chopped1++;
     else
       words_chopped2++;

     if (chop_enable)
       improve_by_chopping(word,
                           char_choices,
                           fx,
                           &state,
                           best_choice,
                           raw_choice,
                           &seam_list,
                           &fixpt,
                           chop_states,
                           &state_count);
     if (chop_debug)
       print_seams ("Final seam list:", seam_list);
     // Still not acceptable after chopping: try re-associating the pieces.
     if ((enable_assoc &&
          !getDict().AcceptableChoice(*char_choices, *best_choice,
                                      *raw_choice, NULL, CHOPPER_CALLER)) ||
         ((tester || trainer) &&
          strcmp(word->correct, best_choice->unichar_string().string()))) {
       ratings = word_associator (word->blobs, seam_list, &state, fx,
         best_choice, raw_choice, word->correct,
         /*0, */ &fixpt, &best_state);
     }
     bits_in_states = bit_count + state_count - 1;

   }

   char_choices =
     rebuild_current_state(word->blobs, seam_list, &state, char_choices, fx,
                           (did_chopping || tester || trainer), *best_choice);

   // Release what was built for this word.
   if (ratings != NULL) {
     ratings->delete_matrix_pointers();
     delete ratings;
   }
   if (seam_list != NULL)
     free_seam_list(seam_list);
   if (matcher_fp != NULL) {
     best_state = state;
   }
   FilterWordChoices();
   return char_choices;
 }


 /**********************************************************************
  * improve_by_chopping
  *
  * Start with the current word of blobs and its classification.  Find
  * the worst blobs and try to divide them up to improve the ratings.
  * As long as ratings are produced by the new blob splitting.  When
  * all the splitting has been accomplished all the ratings memory is
  * reclaimed.
  **********************************************************************/
 void Wordrec::improve_by_chopping(register TWERD *word,
                                   BLOB_CHOICE_LIST_VECTOR *char_choices,
                                   int fx,
                                   STATE *best_state,
                                   WERD_CHOICE *best_choice,
                                   WERD_CHOICE *raw_choice,
                                   SEAMS *seam_list,
                                   DANGERR *fixpt,
                                   STATE *chop_states,
                                   inT32 *state_count) {
   inT32 blob_number;
   inT32 index;                   //to states
   float old_best;
   int fixpt_valid = 1;
   static inT32 old_count;        //from pass1

   do {  // improvement loop
     if (!fixpt_valid)
       fixpt->index = -1;
     old_best = best_choice->rating();
     if (improve_one_blob(word, char_choices, fx, &blob_number, seam_list,
                          fixpt, (fragments_guide_chopper &&
                                  best_choice->fragment_mark()))) {
       LogNewSplit(blob_number);
       getDict().permute_characters(*char_choices, best_choice->rating(),
                                    best_choice, raw_choice);

       if (old_best > best_choice->rating()) {
         set_n_ones(best_state, char_choices->length() - 1);
         fixpt_valid = 1;
       }
       else {
         insert_new_chunk(best_state, blob_number, char_choices->length() - 2);
         fixpt_valid = 0;
       }
       if (*state_count > 0) {
         for (index = 0; index < *state_count; index++) {
           insert_new_chunk(&chop_states[index], blob_number,
                            char_choices->length() - 2);
         }
         set_n_ones(&chop_states[index], char_choices->length() - 1);
         (*state_count)++;
       }

       if (chop_debug)
         print_state ("best state = ",
           best_state, count_blobs (word->blobs) - 1);
       if (first_pass)
         chops_performed1++;
       else
         chops_performed2++;
     } else {
       break;
     }
   } while (!getDict().AcceptableChoice(*char_choices, *best_choice,
                                        *raw_choice, fixpt, CHOPPER_CALLER) &&
            !blob_skip && char_choices->length() < MAX_NUM_CHUNKS);
   old_count = *state_count;
   if (!fixpt_valid)
     fixpt->index = -1;
 }


 /**********************************************************************
  * select_blob_to_split
  *
  * Look through the results of the last classification and pick the
  * blob to split next.
  *
  * The first position without a choice list is returned immediately.
  * Otherwise the candidates are the blobs whose first choice has a
  * rating below rating_ceiling and a certainty below
  * tessedit_certainty_threshold; the one with the highest rating wins.
  * With split_next_to_fragment set, a candidate next to a fragment that
  * can be extended (a following fragment that is not a beginning, or a
  * preceding fragment that is not an ending) takes precedence.
  * Returns -1 if there is no candidate.
  **********************************************************************/
 inT16 Wordrec::select_blob_to_split(const BLOB_CHOICE_LIST_VECTOR &char_choices,
                                     float rating_ceiling,
                                     bool split_next_to_fragment) {
   BLOB_CHOICE_IT choice_it;
   BLOB_CHOICE_IT lookahead_it;
   BLOB_CHOICE *top_choice;
   int pos;
   float worst_rating = -MAX_FLOAT32;
   int worst_pos = -1;
   float worst_rating_by_fragment = -MAX_FLOAT32;
   int worst_pos_by_fragment = -1;
   const CHAR_FRAGMENT **fragments = NULL;

   if (chop_debug) {
     if (rating_ceiling < MAX_FLOAT32)
       cprintf("rating_ceiling = %8.4f\n", rating_ceiling);
     else
       cprintf("rating_ceiling = No Limit\n");
   }

   // The fragment table is filled one position ahead of the scan below;
   // entry 0 is seeded here.
   if (split_next_to_fragment && char_choices.length() > 0) {
     fragments = new const CHAR_FRAGMENT *[char_choices.length()];
     fragments[0] = NULL;
     if (char_choices.get(0) != NULL) {
       lookahead_it.set_to_list(char_choices.get(0));
       fragments[0] = getDict().getUnicharset().get_fragment(
           lookahead_it.data()->unichar_id());
     }
   }

   for (pos = 0; pos < char_choices.length(); ++pos) {
     // A blob without any choices must be split first.
     if (char_choices.get(pos) == NULL) {
       delete[] fragments;  // deleting NULL is harmless
       return pos;
     }

     choice_it.set_to_list(char_choices.get(pos));
     top_choice = choice_it.data();

     // Record the fragment (if any) of the next position.
     if (split_next_to_fragment && pos+1 < char_choices.length()) {
       fragments[pos+1] = NULL;
       if (char_choices.get(pos+1) != NULL) {
         lookahead_it.set_to_list(char_choices.get(pos+1));
         fragments[pos+1] = getDict().getUnicharset().get_fragment(
             lookahead_it.data()->unichar_id());
       }
     }

     // Skip blobs that are rated at or above the ceiling or are certain
     // enough already.
     if (!(top_choice->rating() < rating_ceiling &&
           top_choice->certainty() < tessedit_certainty_threshold))
       continue;

     if (top_choice->rating() > worst_rating) {
       worst_pos = pos;
       worst_rating = top_choice->rating();
     }

     if (!split_next_to_fragment)
       continue;

     bool expand_following_fragment =
       (pos + 1 < char_choices.length() &&
        fragments[pos+1] != NULL && !fragments[pos+1]->is_beginning());
     bool expand_preceding_fragment =
       (pos > 0 && fragments[pos-1] != NULL && !fragments[pos-1]->is_ending());
     if ((expand_following_fragment || expand_preceding_fragment) &&
         top_choice->rating() > worst_rating_by_fragment) {
       worst_pos_by_fragment = pos;
       worst_rating_by_fragment = top_choice->rating();
       if (chop_debug) {
         cprintf("worst_index_near_fragment=%d"
                 " expand_following_fragment=%d"
                 " expand_preceding_fragment=%d\n",
                 worst_pos_by_fragment,
                 expand_following_fragment,
                 expand_preceding_fragment);
       }
     }
   }

   delete[] fragments;  // deleting NULL is harmless
   // TODO(daria): maybe a threshold of badness for
   // worst_near_fragment would be useful.
   if (worst_pos_by_fragment != -1)
     return worst_pos_by_fragment;
   return worst_pos;
 }
 }  // namespace tesseract


 /**********************************************************************
  * start_seam_list
  *
  * Initialize a list of seams that match the original number of blobs
  * present in the starting segmentation: one seam between each pair of
  * neighbouring blobs.  Each seam carries location information only
  * (no splits); the location is the midpoint between the right edge of
  * one blob's bounding box and the left edge of the next one's.
  *
  * An empty blob list (NULL) or a single blob yields an empty seam list.
  **********************************************************************/
 SEAMS start_seam_list(TBLOB *blobs) {
   TBLOB *blob;
   SEAMS seam_list;
   TPOINT topleft;
   TPOINT botright;
   int location;
   /* Seam slot per char */
   seam_list = new_seam_list ();

   for (blob = blobs; blob != NULL && blob->next != NULL; blob = blob->next) {

     blob_bounding_box(blob, &topleft, &botright);
     location = botright.x;
     blob_bounding_box (blob->next, &topleft, &botright);
     location += topleft.x;
     location /= 2;

     seam_list = add_seam (seam_list,
       new_seam (0.0, location, NULL, NULL, NULL));
   }

   return (seam_list);
 }


 /**********************************************************************
  * total_containment
  *
  * Return non-zero if the bounding box of either blob lies inside the
  * bounding box of the other (see bounds_inside), zero otherwise.
  **********************************************************************/
 inT16 total_containment(TBLOB *blob1, TBLOB *blob2) {
   TPOINT tl_first, br_first;
   TPOINT tl_second, br_second;

   blob_bounding_box(blob1, &tl_first, &br_first);
   blob_bounding_box(blob2, &tl_second, &br_second);

   const bool first_in_second =
     bounds_inside (tl_first, br_first, tl_second, br_second);
   return (first_in_second ||
     bounds_inside (tl_second, br_second, tl_first, br_first));
 }


 /**********************************************************************
  * word_associator
  *
  * Reassociate and classify the blobs in a word.  Continue this process
  * until a good answer is found or all the possibilities have been tried.
  **********************************************************************/
 namespace tesseract {
 // Builds a CHUNKS_RECORD for the given blobs and seams and runs
 // best_first_search on it, updating best_choice, raw_choice, state, fixpt
 // and best_state through that call.
 //
 // Each chunk gets a weight derived from the first choice of its diagonal
 // ratings entry: -(10 * rating / certainty) truncated to inT16, or 0 when
 // the certainty is 0.  The `correct` parameter is not used here.
 //
 // Returns the ratings matrix created by record_piece_ratings; the width
 // arrays are released before returning.
 MATRIX *Wordrec::word_associator(TBLOB *blobs,
                                  SEAMS seams,
                                  STATE *state,
                                  int fxid,
                                  WERD_CHOICE *best_choice,
                                  WERD_CHOICE *raw_choice,
                                  char *correct,
                                  DANGERR *fixpt,
                                  STATE *best_state) {
   CHUNKS_RECORD chunks_record;
   BLOB_WEIGHTS blob_weights;
   BLOB_CHOICE_IT top_it;
   BLOB_CHOICE *top;
   int chunk;
   const int num_chunks = array_count (seams) + 1;

   chunks_record.chunks = blobs;
   chunks_record.splits = seams;
   chunks_record.ratings = record_piece_ratings (blobs);
   chunks_record.char_widths = blobs_widths (blobs);
   chunks_record.chunk_widths = blobs_widths (blobs);
   chunks_record.fx = fxid;

   /* Save chunk weights */
   for (chunk = 0; chunk < num_chunks; ++chunk) {
     top_it.set_to_list(chunks_record.ratings->get(chunk, chunk));
     top = top_it.data();
     // Guard against a division by zero.
     if (top->certainty() == 0) {
       blob_weights[chunk]=0;
     } else {
       blob_weights[chunk] =
         -(inT16) (10 * top->rating() / top->certainty());
     }
   }
   chunks_record.weights = blob_weights;

   if (chop_debug)
     chunks_record.ratings->print(getDict().getUnicharset());

   best_first_search(&chunks_record,
                     best_choice,
                     raw_choice,
                     state,
                     fixpt,
                     best_state);

   free_widths (chunks_record.chunk_widths);
   free_widths (chunks_record.char_widths);
   return chunks_record.ratings;
 }
 }  // namespace tesseract