| /////////////////////////////////////////////////////////////////////// |
| // File: tablefind.cpp |
| // Description: Helper classes to find tables from ColPartitions. |
| // Author: Faisal Shafait (faisal.shafait@dfki.de) |
| // Created: Tue Jan 06 11:13:01 PST 2009 |
| // |
| // (C) Copyright 2009, Google Inc. |
| // Licensed under the Apache License, Version 2.0 (the "License"); |
| // you may not use this file except in compliance with the License. |
| // You may obtain a copy of the License at |
| // http://www.apache.org/licenses/LICENSE-2.0 |
| // Unless required by applicable law or agreed to in writing, software |
| // distributed under the License is distributed on an "AS IS" BASIS, |
| // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| // See the License for the specific language governing permissions and |
| // limitations under the License. |
| // |
| /////////////////////////////////////////////////////////////////////// |
| |
| #include "colfind.h" |
| #include <math.h> |
| #ifdef HAVE_CONFIG_H |
| #include "config_auto.h" |
| #endif |
| #ifdef HAVE_LIBLEPT |
| #include "allheaders.h" |
| #endif |
| |
| namespace tesseract { |
| |
| // Maximum vertical spacing between neighbor partitions |
| const int kMaxVerticalSpacing = 500; |
| |
| // Minimum number of components in a text partition. A partition having fewer |
| // components than that is more likely a data partition and is a candidate |
| // table cell. |
| const int kMinBoxesInTextPartition = 10; |
| |
| // Maximum number of components that a data partition can have |
| const int kMaxBoxesInDataPartition = 20; |
| |
| // Maximum allowed gap in a text partitions as a multiple of its median size. |
| const double kMaxGapInTextPartition = 4.0; |
| |
| // Minimum value that the maximum gap in a text partition should have as a |
| // factor of its median size. |
| const double kMinMaxGapInTextPartition = 0.5; |
| |
| // Maximum x-height a table partition can have as a multiple of global |
| // median x-height |
| const double kMaxTableCellXheight = 2.0; |
| |
| // Maximum line spacing between a table column header and column contents |
| // for merging the two |
| const int kMaxColumnHeaderDistance = 100; |
| |
| // Minimum ratio of num_table_partitions to num_text_partitions in a column |
| // block to be called it a table column |
| const double kTableColumnThreshold = 3.0; |
| |
| // Search for horizontal ruling lines within the vertical margin as a |
| // multiple of grid size |
| const int kRulingVerticalMargin = 3; |
| |
| // Minimum overlap that a colpartition must have with a table region |
| // to become part of that table |
| const double kMinOverlapWithTable = 0.6; |
| |
| // Maximum side space (distance from column boundary) that a typical |
| // text-line in flowing text should have as a multiple of its x-height |
| // (Median size). |
| const int kSideSpaceMargin = 10; |
| |
| // Fraction of the peak of x-projection of a table region to set the |
| // threshold for the x-projection histogram |
| const double kProjectionThreshold = 0.35; |
| |
| // Minmimum number of rows in a table |
| const int kMinRowsInTable = 3; |
| |
| BOOL_VAR(textord_dump_table_images, false, "Paint table detection output"); |
| BOOL_VAR(textord_show_tables, false, "Show table regions"); |
| |
| ELISTIZE(ColSegment) |
| CLISTIZE(ColSegment) |
| |
| // Copy cleaned partitions from part_grid_ to clean_part_grid_ and |
| // insert dot-like noise into period_grid_ |
| void ColumnFinder::GetCleanPartitions(TO_BLOCK* block) { |
| double min_dim = block->line_size/3.0; |
| // Initialize clean partitions list and grid |
| clean_part_grid_.Init(gridsize(), bleft(), tright()); |
| period_grid_.Init(gridsize(), bleft(), tright()); |
| // Iterate the ColPartitions in the grid. |
| GridSearch<ColPartition, ColPartition_CLIST, ColPartition_C_IT> |
| gsearch(&part_grid_); |
| gsearch.StartFullSearch(); |
| ColPartition* part; |
| while ((part = gsearch.NextFullSearch()) != NULL) { |
| ColPartition* clean_part = part->ShallowCopy(); |
| // Insert all non-text partitions to clean_parts |
| if (!part->IsTextType()) { |
| clean_part_grid_.InsertBBox(true, true, clean_part); |
| continue; |
| } |
| // Insert text colpartitions after removing noisy components from them |
| BLOBNBOX_CLIST* part_boxes = part->boxes(); |
| BLOBNBOX_C_IT pit(part_boxes); |
| for (pit.mark_cycle_pt(); !pit.cycled_list(); pit.forward()) { |
| BLOBNBOX *pblob = pit.data(); |
| if (!pblob->noise_flag()) { |
| clean_part->AddBox(pblob); |
| } else { |
| TBOX blob_box = pblob->bounding_box(); |
| if (blob_box.height() < min_dim && blob_box.width() < 2*min_dim) { |
| period_grid_.InsertBBox(false, false, pblob); |
| } |
| } |
| } |
| if (!clean_part->IsEmpty()) |
| clean_part_grid_.InsertBBox(true, true, clean_part); |
| } |
| |
| // TODO(rays) This is the previous period blob code. Neither is completely |
| // satisfactory, as a more disciplined approach to noise removal would be |
| // better, so revisit this choice and decide what to keep when the earlier |
| // stages do a better job of noise removal. |
| #if 0 |
| BLOBNBOX_IT sit(&block->small_blobs); |
| BLOBNBOX_IT nit(&block->noise_blobs); |
| BLOBNBOX_IT it(&period_blobs_); |
| // Insert dot sized boxes from small_blobs into period_blobs_ |
| for (sit.mark_cycle_pt(); !sit.cycled_list(); sit.forward()) { |
| BLOBNBOX * blob = sit.data(); |
| TBOX blob_box = blob->bounding_box(); |
| if (blob_box.height() < min_dim && blob_box.width() < 2*min_dim) { |
| it.add_after_then_move(sit.extract()); |
| } |
| } |
| // Insert dot sized boxes from noise_blobs into period_blobs_ |
| for (nit.mark_cycle_pt(); !nit.cycled_list(); nit.forward()) { |
| BLOBNBOX * blob = nit.data(); |
| TBOX blob_box = blob->bounding_box(); |
| if (blob_box.height() < min_dim && blob_box.width() < 2*min_dim) { |
| it.add_after_then_move(nit.extract()); |
| } |
| } |
| InsertBlobList(false, false, false, &period_blobs_, false, &period_grid_); |
| #endif |
| } |
| |
| // High level function to perform table detection |
| void ColumnFinder::LocateTables() { |
| // Make single-column blocks from good_columns_ partitions. col_segments are |
| // moved to a grid later which takes the ownership |
| ColSegment_LIST column_blocks; |
| GetColumnBlocks(&column_blocks); |
| |
| SetPartitionSpacings(); |
| |
| // Mark ColPartitions as being candidate table partition depending on |
| // the inter-word spacing |
| GridMarkTablePartitions(); |
| FilterFalseAlarms(); |
| SmoothTablePartitionRuns(); |
| |
| // Set the ratio of candidate table partitions in each column |
| SetColumnsType(&column_blocks); |
| |
| // Move column segments to col_seg_grid_ |
| MoveColSegmentsToGrid(&column_blocks, &col_seg_grid_); |
| |
| // Detect split in column layout that might have occured due to the |
| // presence of a table. In such a case, merge the corresponding columns. |
| GridMergeColumnBlocks(); |
| |
| // Group horizontally overlapping table partitions into table columns. |
| // table_columns created here get deleted at the end of this method. |
| ColSegment_LIST table_columns; |
| GetTableColumns(&table_columns); |
| |
| // Within each column, mark the range table regions occupy based on the |
| // table columns detected. table_regions are moved to a grid later which |
| // takes the ownership |
| ColSegment_LIST table_regions; |
| GetTableRegions(&table_columns, &table_regions); |
| |
| // Merge table regions across columns for tables spanning multiple |
| // columns |
| MoveColSegmentsToGrid(&table_regions, &table_grid_); |
| GridMergeTableRegions(); |
| |
| // Adjust table boundaries by including nearby horizontal lines and left |
| // out column headers |
| AdjustTableBoundaries(); |
| GridMergeTableRegions(); |
| |
| // Remove false alarms consiting of a single column |
| DeleteSingleColumnTables(); |
| |
| if (textord_show_tables) { |
| ScrollView* table_win = MakeWindow(1500, 300, "Detected Tables"); |
| DisplayColPartitions(table_win, ScrollView::BLUE); |
| DisplayColSegments(&table_columns, table_win, ScrollView::GREEN); |
| table_grid_.DisplayBoxes(table_win); |
| } |
| |
| if (textord_dump_table_images) |
| WriteToPix(); |
| |
| // Merge all colpartitions in table regions to make them a single |
| // colpartition and revert types of isolated table cells not |
| // assigned to any table to their original types. |
| MakeTableBlocks(); |
| } |
| |
| // Make single-column blocks from good_columns_ partitions. |
| void ColumnFinder::GetColumnBlocks(ColSegment_LIST* column_blocks) { |
| for (int i = 0; i < gridheight_; ++i) { |
| ColPartitionSet* columns = best_columns_[i]; |
| if (columns != NULL) { |
| ColSegment_LIST new_blocks; |
| // Get boxes from the current vertical position on the grid |
| columns->GetColumnBoxes(i*gridsize_, (i + 1) * gridsize_, &new_blocks); |
| // Merge the new_blocks boxes into column_blocks if they are well-aligned |
| GroupColumnBlocks(&new_blocks, column_blocks); |
| } |
| } |
| } |
| |
| // Merge column segments into the current list if they are well aligned. |
| void ColumnFinder::GroupColumnBlocks(ColSegment_LIST* new_blocks, |
| ColSegment_LIST* column_blocks) { |
| ColSegment_IT src_it(new_blocks); |
| ColSegment_IT dest_it(column_blocks); |
| // iterate through the source list |
| for (src_it.mark_cycle_pt(); !src_it.cycled_list(); src_it.forward()) { |
| ColSegment* src_seg = src_it.data(); |
| TBOX src_box = src_seg->bounding_box(); |
| bool match_found = false; |
| // iterate through the destination list to find a matching column block |
| for (dest_it.mark_cycle_pt(); !dest_it.cycled_list(); dest_it.forward()) { |
| ColSegment* dest_seg = dest_it.data(); |
| TBOX dest_box = dest_seg->bounding_box(); |
| if (ConsecutiveBoxes(src_box, dest_box)) { |
| // If matching block is found, insert the current block into it |
| // and delete the soure block |
| dest_seg->InsertBox(src_box); |
| match_found = true; |
| delete src_it.extract(); |
| break; |
| } |
| } |
| // If no match is found, just append the source block to column_blocks |
| if (!match_found) { |
| dest_it.add_after_then_move(src_it.extract()); |
| } |
| } |
| } |
| |
| // are the two boxes immediate neighbors along the vertical direction |
| bool ColumnFinder::ConsecutiveBoxes(const TBOX &b1, const TBOX &b2) { |
| int x_margin = 20; |
| int y_margin = 5; |
| return (abs(b1.left() - b2.left()) < x_margin) && |
| (abs(b1.right() - b2.right()) < x_margin) && |
| (abs(b1.top()-b2.bottom()) < y_margin || |
| abs(b2.top()-b1.bottom()) < y_margin); |
| } |
| |
| // Set left, right and top, bottom spacings of each colpartition. |
| void ColumnFinder::SetPartitionSpacings() { |
| // Iterate the ColPartitions in the grid. |
| GridSearch<ColPartition, ColPartition_CLIST, ColPartition_C_IT> |
| gsearch(&clean_part_grid_); |
| gsearch.StartFullSearch(); |
| ColPartition* part; |
| while ((part = gsearch.NextFullSearch()) != NULL) { |
| ColPartitionSet* columns = best_columns_[gsearch.GridY()]; |
| TBOX box = part->bounding_box(); |
| int y = part->MidY(); |
| ColPartition* left_column = columns->ColumnContaining(box.left(), y); |
| ColPartition* right_column = columns->ColumnContaining(box.right(), y); |
| // set distance from left column as space to the left |
| if (left_column) { |
| int left_space = MAX(0, box.left() - left_column->LeftAtY(y)); |
| part->set_space_to_left(left_space); |
| } |
| // set distance from right column as space to the right |
| if (right_column) { |
| int right_space = MAX(0, right_column->RightAtY(y) - box.right()); |
| part->set_space_to_right(right_space); |
| } |
| SetVerticalSpacing(part); |
| } |
| SetGlobalSpacings(); |
| } |
| |
| // Set spacing and closest neighbors above and below a given colpartition. |
| void ColumnFinder::SetVerticalSpacing(ColPartition* part) { |
| TBOX box = part->bounding_box(); |
| int top_range = MIN(box.top() + kMaxVerticalSpacing, tright().y()); |
| int bottom_range = MAX(box.bottom() - kMaxVerticalSpacing, bleft().y()); |
| box.set_top(top_range); |
| box.set_bottom(bottom_range); |
| |
| TBOX part_box = part->bounding_box(); |
| // Start a rect search |
| GridSearch<ColPartition, ColPartition_CLIST, ColPartition_C_IT> |
| rectsearch(&clean_part_grid_); |
| rectsearch.StartRectSearch(box); |
| ColPartition* neighbor; |
| int min_space_above = kMaxVerticalSpacing; |
| int min_space_below = kMaxVerticalSpacing; |
| ColPartition* above_neighbor = NULL; |
| ColPartition* below_neighbor = NULL; |
| while ((neighbor = rectsearch.NextRectSearch()) != NULL) { |
| if (neighbor == part) |
| continue; |
| TBOX neighbor_box = neighbor->bounding_box(); |
| if (neighbor_box.major_x_overlap(part_box)) { |
| int gap = abs(part->median_bottom() - neighbor->median_bottom()); |
| // If neighbor is below current partition |
| if (neighbor_box.top() < part_box.bottom() && |
| gap < min_space_below) { |
| min_space_below = gap; |
| below_neighbor = neighbor; |
| } // If neighbor is above current partition |
| else if (part_box.top() < neighbor_box.bottom() && |
| gap < min_space_above) { |
| min_space_above = gap; |
| above_neighbor = neighbor; |
| } |
| } |
| } |
| part->set_space_above(min_space_above); |
| part->set_space_below(min_space_below); |
| part->set_nearest_neighbor_above(above_neighbor); |
| part->set_nearest_neighbor_below(below_neighbor); |
| } |
| |
| // Set global spacing and x-height estimates |
| void ColumnFinder::SetGlobalSpacings() { |
| STATS xheight_stats(0, kMaxVerticalSpacing + 1); |
| STATS ledding_stats(0, kMaxVerticalSpacing + 1); |
| // Iterate the ColPartitions in the grid. |
| GridSearch<ColPartition, ColPartition_CLIST, ColPartition_C_IT> |
| gsearch(&clean_part_grid_); |
| gsearch.StartFullSearch(); |
| ColPartition* part; |
| while ((part = gsearch.NextFullSearch()) != NULL) { |
| if (part->IsTextType()) { |
| xheight_stats.add(part->median_size(), 1); |
| ledding_stats.add(part->space_above(), 1); |
| ledding_stats.add(part->space_below(), 1); |
| } |
| } |
| // Set estimates based on median of statistics obtained |
| global_median_xheight_ = static_cast<int>(xheight_stats.median() + 0.5); |
| global_median_ledding_ = static_cast<int>(ledding_stats.median() + 0.5); |
| if (textord_show_tables) { |
| ScrollView* stats_win = MakeWindow(500, 10, |
| "X-height and ledding histograms"); |
| xheight_stats.plot(stats_win, 10, 200, 2, 15, ScrollView::GREEN); |
| ledding_stats.plot(stats_win, 10, 200, 2, 15, ScrollView::RED); |
| } |
| } |
| |
| // Three types of partitions are maked as table partitions: |
| // 1- Partitions that have at lease one large gap between words |
| // 2- Partitions that consist of only one word (no significant gap |
| // between components) |
| // 3- Partitions that vertically overlap with other partitions within the |
| // same column. |
| void ColumnFinder::GridMarkTablePartitions() { |
| // Iterate the ColPartitions in the grid. |
| GridSearch<ColPartition, ColPartition_CLIST, ColPartition_C_IT> |
| gsearch(&clean_part_grid_); |
| gsearch.StartFullSearch(); |
| ColPartition* part; |
| while ((part = gsearch.NextFullSearch()) != NULL) { |
| if (!part->IsTextType()) // Only consider text partitions |
| continue; |
| // Only consider partitions in dominant font size or smaller |
| if (part->median_size() > kMaxTableCellXheight * global_median_xheight_) |
| continue; |
| // Mark partitions with a large gap, or no significant gap as |
| // table partitions. |
| // Comments: It produces several false alarms at: |
| // - last line of a paragraph |
| // - single word section headings |
| // - page headers and footers |
| // - numbered equations |
| // - line drawing regions |
| // TODO(faisal): detect and fix above-mentioned cases |
| if (HasWideOrNoInterWordGap(part)) { |
| part->set_table_type(); |
| } |
| } |
| } |
| |
| // Check if the partition has at lease one large gap between words or no |
| // significant gap at all |
| bool ColumnFinder::HasWideOrNoInterWordGap(ColPartition* part) { |
| BLOBNBOX_CLIST* part_boxes = part->boxes(); |
| BLOBNBOX_C_IT pit(part_boxes); |
| |
| if (part->bounding_box().width() < |
| kMinBoxesInTextPartition * part->median_size() && |
| pit.length() < kMinBoxesInTextPartition) |
| return true; |
| |
| // Make a copy of the components in the current partition and insert periods |
| // into it to compute gaps while taking periods into account. |
| BLOBNBOX_CLIST boxes; |
| BLOBNBOX_C_IT it(&boxes); |
| for (pit.mark_cycle_pt(); !pit.cycled_list(); pit.forward()) { |
| BLOBNBOX *pblob = pit.data(); |
| it.add_after_then_move(pblob); |
| } |
| // Start rectangular search to find periods in this partition |
| GridSearch<BLOBNBOX, BLOBNBOX_CLIST, BLOBNBOX_C_IT> rectsearch(&period_grid_); |
| TBOX part_box = part->bounding_box(); |
| rectsearch.StartRectSearch(part_box); |
| BLOBNBOX* period; |
| while ((period = rectsearch.NextRectSearch()) != NULL) { |
| // Insert a period only if it lies in a gap between two consecutive boxes |
| if (LiesInGap(period, &boxes)) |
| boxes.add_sorted(SortByBoxLeft<BLOBNBOX>, true, period); |
| } |
| |
| int current_x0; |
| int current_x1; |
| int previous_x1 = -1; |
| int max_partition_gap = -1; |
| double max_gap = kMaxGapInTextPartition * part->median_size(); |
| double min_gap = kMinMaxGapInTextPartition * part->median_size(); |
| |
| for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) { |
| BLOBNBOX *blob = it.data(); |
| current_x0 = blob->bounding_box().left(); |
| current_x1 = blob->bounding_box().right(); |
| if (previous_x1 != -1) { |
| int gap = current_x0 - previous_x1; |
| // If a large enough gap is found, mark it as a table cell (return true) |
| if (gap > max_gap) |
| return true; |
| if (gap > max_partition_gap) |
| max_partition_gap = gap; |
| } |
| previous_x1 = current_x1; |
| } |
| // Since no large gap was found, return false if the partition is too |
| // long to be a data cell |
| if (part->bounding_box().width() > |
| kMaxBoxesInDataPartition * part->median_size() || |
| pit.length() > kMaxBoxesInDataPartition) |
| return false; |
| |
| // return true if the maximum gap found is smaller than the minimum allowed |
| // max_gap in a text partition |
| return (max_partition_gap < min_gap); |
| } |
| |
| // Check if the period lies in a gap between consecutive boxes |
| bool ColumnFinder::LiesInGap(BLOBNBOX* period, BLOBNBOX_CLIST* boxes) { |
| BLOBNBOX_C_IT it(boxes); |
| TBOX period_box = period->bounding_box(); |
| int num_boxes = it.length(); |
| // skip the first element since it has no gap to its left. |
| it.forward(); |
| for (int i = 1; i < num_boxes; i++) { |
| TBOX box = it.data()->bounding_box(); |
| TBOX previous_blob = it.data_relative(-1)->bounding_box(); |
| TBOX gap_box = TBOX(previous_blob.botright(), box.topleft()); |
| if (gap_box.major_overlap(period_box)) { |
| return true; |
| } |
| it.forward(); |
| } |
| return false; |
| } |
| |
| // Filter individual text partitions marked as table partitions |
| // consisting of paragraph endings, small section headings, and |
| // headers and footers. |
| void ColumnFinder::FilterFalseAlarms() { |
| // Detect last line of paragraph |
| // Iterate the ColPartitions in the grid. |
| GridSearch<ColPartition, ColPartition_CLIST, ColPartition_C_IT> |
| gsearch(&clean_part_grid_); |
| gsearch.StartFullSearch(); |
| ColPartition* part; |
| while ((part = gsearch.NextFullSearch()) != NULL) { |
| if (part->type() != PT_TABLE) |
| continue; // Consider only table partitions |
| |
| // Paragraph ending should have flowing text above it. |
| ColPartition* upper_part = part->nearest_neighbor_above(); |
| if (!upper_part) |
| continue; |
| if (upper_part->type() != PT_FLOWING_TEXT) |
| continue; |
| if (upper_part->bounding_box().width() < |
| 2 * part->bounding_box().width()) |
| continue; |
| // Paragraph ending should be left-aligned to text line above it. |
| if (abs(part->bounding_box().left() - upper_part->bounding_box().left()) |
| > global_median_xheight_) |
| continue; |
| // Ledding above the line should be less than ledding below |
| if (part->space_above() < part->space_below() && |
| part->space_above() <= 2 * global_median_ledding_) |
| part->clear_table_type(); |
| } |
| // Consider top-most text colpartition as header and bottom most as footer |
| ColPartition* header = NULL; |
| ColPartition* footer = NULL; |
| int max_top = -MAX_INT32; |
| int min_bottom = MAX_INT32; |
| gsearch.StartFullSearch(); |
| while ((part = gsearch.NextFullSearch()) != NULL) { |
| if (!part->IsTextType()) |
| continue; // Consider only text partitions |
| int top = part->bounding_box().top(); |
| int bottom = part->bounding_box().bottom(); |
| if (top > max_top) { |
| max_top = top; |
| header = part; |
| } |
| if (bottom < min_bottom) { |
| min_bottom = bottom; |
| footer = part; |
| } |
| } |
| if (header) |
| header->clear_table_type(); |
| if (footer) |
| footer->clear_table_type(); |
| } |
| |
| // Mark all ColPartitions as table cells that have a table cell above |
| // and below them |
| // TODO(faisal): This is too aggressive at the moment. The method needs to |
| // consider spacing and alignment as well. Detection of false alarm table cells |
| // should also be done as part of it. |
| void ColumnFinder::SmoothTablePartitionRuns() { |
| // Iterate the ColPartitions in the grid. |
| GridSearch<ColPartition, ColPartition_CLIST, ColPartition_C_IT> |
| gsearch(&clean_part_grid_); |
| gsearch.StartFullSearch(); |
| ColPartition* part; |
| while ((part = gsearch.NextFullSearch()) != NULL) { |
| if (part->type() >= PT_TABLE) |
| continue; // Consider only text partitions |
| ColPartition* upper_part = part->nearest_neighbor_above(); |
| ColPartition* lower_part = part->nearest_neighbor_below(); |
| if (!upper_part || !lower_part) |
| continue; |
| if (upper_part->type() == PT_TABLE && lower_part->type() == PT_TABLE) |
| part->set_table_type(); |
| } |
| } |
| |
| // Set the type of a column segment based on the ratio of table to text cells |
| void ColumnFinder::SetColumnsType(ColSegment_LIST* column_blocks) { |
| ColSegment_IT it(column_blocks); |
| for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) { |
| ColSegment* seg = it.data(); |
| TBOX box = seg->bounding_box(); |
| int num_table_cells = 0; |
| int num_text_cells = 0; |
| GridSearch<ColPartition, ColPartition_CLIST, ColPartition_C_IT> |
| rsearch(&clean_part_grid_); |
| rsearch.StartRectSearch(box); |
| ColPartition* part; |
| while ((part = rsearch.NextRectSearch()) != NULL) { |
| if (!rsearch.ReturnedSeedElement()) |
| continue; // Consider each partition only once |
| if (part->type() == PT_TABLE) { |
| num_table_cells++; |
| } else if (part->type() == PT_FLOWING_TEXT) { |
| num_text_cells++; |
| } |
| } |
| // If a column block has no text or table partition in it, it is not needed |
| // for table detection. |
| if (!num_table_cells && !num_text_cells) { |
| delete it.extract(); |
| } else { |
| seg->set_num_table_cells(num_table_cells); |
| seg->set_num_text_cells(num_text_cells); |
| // set column type based on the ratio of table to text cells |
| seg->set_type(); |
| } |
| } |
| } |
| |
| // Move column blocks to grid |
| void ColumnFinder::MoveColSegmentsToGrid(ColSegment_LIST *segments, |
| ColSegmentGrid *col_seg_grid) { |
| col_seg_grid->Init(gridsize(), bleft(), tright()); |
| ColSegment_IT it(segments); |
| for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) { |
| ColSegment* seg = it.extract(); |
| col_seg_grid->InsertBBox(true, true, seg); |
| } |
| } |
| |
| // Merge column blocks if a split is detected due to the presence of a |
| // table. A text block is considered split if it has multiple |
| // neighboring blocks above/below it, and at least one of the |
| // neighboring blocks is of table type (has a high density of table |
| // partitions). In this case neighboring blocks in the direction |
| // (above/below) of the table block are merged with the text block. |
| |
| // Comment: This method does not handle split due to a full page table |
| // since table columns in this case do not have a text column on which |
| // split decision can be based. |
| void ColumnFinder::GridMergeColumnBlocks() { |
| int margin = gridsize_; |
| |
| // Iterate the Column Blocks in the grid. |
| GridSearch<ColSegment, ColSegment_CLIST, ColSegment_C_IT> |
| gsearch(&col_seg_grid_); |
| gsearch.StartFullSearch(); |
| ColSegment* seg; |
| while ((seg = gsearch.NextFullSearch()) != NULL) { |
| if (seg->type() != COL_TEXT) |
| continue; // only consider text blocks for split detection |
| bool neighbor_found = false; |
| bool modified = false; // Modified at least once |
| // keep expanding current box as long as neighboring table columns |
| // are found above or below it. |
| do { |
| TBOX box = seg->bounding_box(); |
| // slightly expand the search region vertically |
| int top_range = MIN(box.top() + margin, tright().y()); |
| int bottom_range = MAX(box.bottom() - margin, bleft().y()); |
| box.set_top(top_range); |
| box.set_bottom(bottom_range); |
| neighbor_found = false; |
| GridSearch<ColSegment, ColSegment_CLIST, ColSegment_C_IT> |
| rectsearch(&col_seg_grid_); |
| rectsearch.StartRectSearch(box); |
| ColSegment* neighbor; |
| while ((neighbor = rectsearch.NextRectSearch()) != NULL) { |
| if (neighbor == seg) |
| continue; |
| TBOX neighbor_box = neighbor->bounding_box(); |
| // If the neighbor box significantly overlaps with the current |
| // box (due to the expansion of the current box in the |
| // previous iteration of this loop), remove the neighbor box |
| // and expand the current box to include it. |
| if (neighbor_box.overlap_fraction(box) >= 0.9) { |
| seg->InsertBox(neighbor_box); |
| modified = true; |
| rectsearch.RemoveBBox(); |
| gsearch.RepositionIterator(); |
| delete neighbor; |
| continue; |
| } |
| // Only expand if the neighbor box is of table type |
| if (neighbor->type() != COL_TABLE) |
| continue; |
| // Insert the neighbor box into the current column block |
| if (neighbor_box.major_x_overlap(box) && |
| !box.contains(neighbor_box)) { |
| seg->InsertBox(neighbor_box); |
| neighbor_found = true; |
| modified = true; |
| rectsearch.RemoveBBox(); |
| gsearch.RepositionIterator(); |
| delete neighbor; |
| } |
| } |
| } while (neighbor_found); |
| if (modified) { |
| // Because the box has changed, it has to be removed first. |
| gsearch.RemoveBBox(); |
| col_seg_grid_.InsertBBox(true, true, seg); |
| gsearch.RepositionIterator(); |
| } |
| } |
| } |
| |
| // Group horizontally overlapping table partitions into table columns. |
| // TODO(faisal): This is too aggressive at the moment. The method should |
| // consider more attributes to group table partitions together. Some common |
| // errors are: |
| // 1- page number is merged with a table column above it even |
| // if there is a large vertical gap between them. |
| // 2- column headers go on to catch one of the columns arbitrarily |
| // 3- an isolated noise blob near page top or bottom merges with the table |
| // column below/above it |
| // 4- cells from two vertically adjacent tables merge together to make a |
| // single column resulting in merging of the two tables |
| void ColumnFinder::GetTableColumns(ColSegment_LIST *table_columns) { |
| ColSegment_IT it(table_columns); |
| // Iterate the ColPartitions in the grid. |
| GridSearch<ColPartition, ColPartition_CLIST, ColPartition_C_IT> |
| gsearch(&clean_part_grid_); |
| gsearch.StartFullSearch(); |
| ColPartition* part; |
| while ((part = gsearch.NextFullSearch()) != NULL) { |
| if (part->inside_table_column() || part->type() != PT_TABLE) |
| continue; // prevent a partition to be assigned to multiple columns |
| TBOX box = part->bounding_box(); |
| ColSegment* col = new ColSegment(); |
| col->InsertBox(box); |
| part->set_inside_table_column(true); |
| // Start a search below the current cell to find bottom neighbours |
| GridSearch<ColPartition, ColPartition_CLIST, ColPartition_C_IT> |
| vsearch(&clean_part_grid_); |
| vsearch.StartVerticalSearch(box.left(), box.right(), box.bottom()); |
| ColPartition* neighbor; |
| bool found_neighbours = false; |
| while ((neighbor = vsearch.NextVerticalSearch(true)) != NULL) { |
| // only consider neighbors not assigned to any column yet |
| if (neighbor->inside_table_column()) |
| continue; |
| |
| // presence of a non-table neighbor marks the end of current |
| // table column |
| if (neighbor->type() != PT_TABLE) { |
| // Horizontal lines should not break the flow |
| if (neighbor->IsLineType()) |
| continue; |
| else |
| break; |
| } |
| TBOX neighbor_box = neighbor->bounding_box(); |
| col->InsertBox(neighbor_box); |
| neighbor->set_inside_table_column(true); |
| found_neighbours = true; |
| } |
| if (found_neighbours) { |
| it.add_after_then_move(col); |
| } else { |
| part->set_inside_table_column(false); |
| delete col; |
| } |
| } |
| } |
| |
| // Mark regions in a column that are x-bounded by the column boundaries and |
| // y-bounded by the table columns' projection on the y-axis as table regions |
| void ColumnFinder::GetTableRegions(ColSegment_LIST* table_columns, |
| ColSegment_LIST* table_regions) { |
| ColSegment_IT cit(table_columns); |
| ColSegment_IT rit(table_regions); |
| // Iterate through column blocks |
| GridSearch<ColSegment, ColSegment_CLIST, ColSegment_C_IT> |
| gsearch(&col_seg_grid_); |
| gsearch.StartFullSearch(); |
| ColSegment* part; |
| int page_height = tright().y() - bleft().y(); |
| ASSERT_HOST(page_height > 0); |
| // create a bool array to hold projection on y-axis |
| bool* table_region = new bool[page_height]; |
| while ((part = gsearch.NextFullSearch()) != NULL) { |
| TBOX part_box = part->bounding_box(); |
| // reset the projection array |
| for (int i = 0; i < page_height; i++) { |
| table_region[i] = false; |
| } |
| // iterate through all table columns to find regions in the current |
| // page column block |
| cit.move_to_first(); |
| for (cit.mark_cycle_pt(); !cit.cycled_list(); cit.forward()) { |
| TBOX col_box = cit.data()->bounding_box(); |
| // find intersection region of table column and page column |
| TBOX intersection_box = col_box.intersection(part_box); |
| // project table column on the y-axis |
| for (int i = intersection_box.bottom(); i < intersection_box.top(); i++) { |
| table_region[i - bleft().y()] = true; |
| } |
| } |
| // set x-limits of table regions to page column width |
| TBOX current_table_box; |
| current_table_box.set_left(part_box.left()); |
| current_table_box.set_right(part_box.right()); |
| // go through the y-axis projection to find runs of table |
| // regions. Each run makes one table region. |
| for (int i = 1; i < page_height; i++) { |
| // detect start of a table region |
| if (!table_region[i - 1] && table_region[i]) { |
| current_table_box.set_bottom(i + bleft().y()); |
| } |
| // detect end of a table region |
| if (table_region[i - 1] && !table_region[i]) { |
| current_table_box.set_top(i + bleft().y()); |
| if (!current_table_box.null_box()) { |
| ColSegment* seg = new ColSegment(); |
| seg->InsertBox(current_table_box); |
| rit.add_after_then_move(seg); |
| } |
| } |
| } |
| } |
| delete[] table_region; |
| } |
| |
| // Merge table regions corresponding to tables spanning multiple columns if |
| // there is a colpartition (horizontal ruling line or normal text) that |
| // touches both regions. |
| // TODO(faisal): A rare error occurs if there are two horizontally adjacent |
| // tables with aligned ruling lines. In this case, line finder returns a |
| // single line and hence the tables get merged together |
| void ColumnFinder::GridMergeTableRegions() { |
| // Iterate the table regions in the grid. |
| GridSearch<ColSegment, ColSegment_CLIST, ColSegment_C_IT> |
| gsearch(&table_grid_); |
| gsearch.StartFullSearch(); |
| ColSegment* seg; |
| while ((seg = gsearch.NextFullSearch()) != NULL) { |
| bool neighbor_found = false; |
| bool modified = false; // Modified at least once |
| do { |
| // Start a rectangle search x-bounded by the image and y by the table |
| TBOX box = seg->bounding_box(); |
| TBOX search_region(box); |
| search_region.set_left(bleft().x()); |
| search_region.set_right(tright().x()); |
| neighbor_found = false; |
| GridSearch<ColSegment, ColSegment_CLIST, ColSegment_C_IT> |
| rectsearch(&table_grid_); |
| rectsearch.StartRectSearch(search_region); |
| ColSegment* neighbor; |
| while ((neighbor = rectsearch.NextRectSearch()) != NULL) { |
| if (neighbor == seg) |
| continue; |
| TBOX neighbor_box = neighbor->bounding_box(); |
| // Check if a neighbor box has a large overlap with the table |
| // region. This may happen as a result of merging two table |
| // regions in the previous iteration. |
| if (neighbor_box.overlap_fraction(box) >= 0.9) { |
| seg->InsertBox(neighbor_box); |
| rectsearch.RemoveBBox(); |
| gsearch.RepositionIterator(); |
| delete neighbor; |
| modified = true; |
| continue; |
| } |
| // Check if two table regions belong together based on a common |
| // horizontal ruling line |
| if (BelongToOneTable(box, neighbor_box)) { |
| seg->InsertBox(neighbor_box); |
| neighbor_found = true; |
| modified = true; |
| } |
| } |
| } while (neighbor_found); |
| if (modified) { |
| // Because the box has changed, it has to be removed first. |
| gsearch.RemoveBBox(); |
| table_grid_.InsertBBox(true, true, seg); |
| gsearch.RepositionIterator(); |
| } |
| } |
| } |
| |
| // Decide if two table regions belong to one table based on a common |
| // horizontal ruling line or another colpartition |
| bool ColumnFinder::BelongToOneTable(const TBOX &box1, const TBOX &box2) { |
| // Check for ColPartitions spanning both table regions |
| TBOX bbox = box1.bounding_union(box2); |
| // Start a rect search on bbox |
| GridSearch<ColPartition, ColPartition_CLIST, ColPartition_C_IT> |
| rectsearch(&clean_part_grid_); |
| rectsearch.StartRectSearch(bbox); |
| ColPartition* part; |
| while ((part = rectsearch.NextRectSearch()) != NULL) { |
| TBOX part_box = part->bounding_box(); |
| // return true if a colpartition spanning both table regions is found |
| if (part_box.overlap(box1) && part_box.overlap(box2)) |
| return true; |
| } |
| return false; |
| } |
| |
| // Adjust table boundaries by: |
| // - building a tight bounding box around all ColPartitions contained in it. |
| // - expanding table boundaries to include all colpartitions that overlap the |
| // table by more than half of their area |
| // - expanding table boundaries to include nearby horizontal rule lines |
| // - expanding table vertically to include left out column headers |
| // TODO(faisal): Expansion of table boundaries is quite aggressive. It usually |
| // makes following errors: |
| // 1- horizontal lines consisting of underlines are included in the table if |
| // they are close enough |
| // 2- horizontal lines originating from noise tend to get merged with a table |
| // near the top of the page |
| // 3- the criteria for including horizontal lines is very generous. Many times |
| // horizontal lines separating headers and footers get merged with a |
| // single-column table in a multi-column page thereby including text |
| // from the neighboring column inside the table |
| // 4- the criteria for including left out column headers also tends to |
| // occasionally include text-lines above the tables, typically from |
| // table caption |
| void ColumnFinder::AdjustTableBoundaries() { |
| // Iterate the table regions in the grid |
| ColSegment_CLIST adjusted_tables; |
| ColSegment_C_IT it(&adjusted_tables); |
| GridSearch<ColSegment, ColSegment_CLIST, ColSegment_C_IT> |
| gsearch(&table_grid_); |
| gsearch.StartFullSearch(); |
| ColSegment* table; |
| // search for horizontal ruling lines within the vertical margin |
| int vertical_margin = kRulingVerticalMargin * gridsize_; |
| while ((table = gsearch.NextFullSearch()) != NULL) { |
| TBOX table_box = table->bounding_box(); |
| TBOX search_box = table_box; |
| int top = MIN(search_box.top() + vertical_margin, tright().y()); |
| int bottom = MAX(search_box.bottom() - vertical_margin, bleft().y()); |
| search_box.set_top(top); |
| search_box.set_bottom(bottom); |
| TBOX box; |
| // Start a rect search on table_box |
| GridSearch<ColPartition, ColPartition_CLIST, ColPartition_C_IT> |
| rectsearch(&clean_part_grid_); |
| rectsearch.StartRectSearch(search_box); |
| ColPartition* part; |
| while ((part = rectsearch.NextRectSearch()) != NULL) { |
| // Do not consider image partitions |
| if (part->IsImageType()) |
| continue; |
| TBOX part_box = part->bounding_box(); |
| // Include partition in the table if more than half of it |
| // is covered by the table |
| if (part_box.overlap_fraction(table_box) > kMinOverlapWithTable) { |
| box = box.bounding_union(part_box); |
| continue; |
| } |
| // Include a partially overlapping horizontal line only if the |
| // extra ColPartitions that will be included due to expansion |
| // have large side spacing w.r.t. columns containing them. |
| if (HLineBelongsToTable(part, table_box)) { |
| box = box.bounding_union(part_box); |
| } |
| } |
| IncludeLeftOutColumnHeaders(box); |
| // To prevent a table from expanding again, do not insert the |
| // modified box back to the grid. Instead move it to a list and |
| // and remove it from the grid. The list is moved later back to the grid. |
| if (!box.null_box()) { |
| ColSegment* col = new ColSegment(); |
| col->InsertBox(box); |
| it.add_after_then_move(col); |
| } |
| gsearch.RemoveBBox(); |
| delete table; |
| } |
| // clear table grid to move final tables in it |
| table_grid_.Clear(); |
| it.move_to_first(); |
| // move back final tables to table_grid_ |
| for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) { |
| ColSegment* seg = it.extract(); |
| table_grid_.InsertBBox(true, true, seg); |
| } |
| } |
| |
| // Checks whether the horizontal line belong to the table by looking at the |
| // side spacing of extra ColParitions that will be included in the table |
| // due to expansion |
| bool ColumnFinder::HLineBelongsToTable(ColPartition* part, |
| const TBOX& table_box) { |
| TBOX part_box = part->bounding_box(); |
| if (!part->IsLineType() || !part_box.major_x_overlap(table_box)) |
| return false; |
| // Do not consider top-most horizontal line since it usually |
| // originates from noise |
| if (!part->nearest_neighbor_above()) |
| return false; |
| TBOX bbox = part_box.bounding_union(table_box); |
| // Start a rect search on bbox |
| GridSearch<ColPartition, ColPartition_CLIST, ColPartition_C_IT> |
| rectsearch(&clean_part_grid_); |
| rectsearch.StartRectSearch(bbox); |
| ColPartition* extra_part; |
| int num_extra_partitions = 0; |
| int extra_space_to_right = 0; |
| int extra_space_to_left = 0; |
| while ((extra_part = rectsearch.NextRectSearch()) != NULL) { |
| if (!rectsearch.ReturnedSeedElement()) |
| continue; |
| TBOX extra_part_box = extra_part->bounding_box(); |
| if (extra_part_box.overlap_fraction(table_box) > 0.6) |
| continue; // ColPartition already in table |
| if (extra_part->IsImageType()) // Non-text ColPartitions do not contribute |
| continue; |
| num_extra_partitions++; |
| // presense of a table cell is a strong hint, so just increment the scores |
| // without looking at the spacing. |
| if (extra_part->type() == PT_TABLE || extra_part->IsLineType()) { |
| extra_space_to_right++; |
| extra_space_to_left++; |
| continue; |
| } |
| int space_threshold = kSideSpaceMargin * part->median_size(); |
| if (extra_part->space_to_right() > space_threshold) |
| extra_space_to_right++; |
| if (extra_part->space_to_left() > space_threshold) |
| extra_space_to_left++; |
| } |
| // tprintf("%d %d %d\n", |
| // num_extra_partitions,extra_space_to_right,extra_space_to_left); |
| return (extra_space_to_right > num_extra_partitions / 2) || |
| (extra_space_to_left > num_extra_partitions / 2); |
| } |
| |
| // Look for isolated column headers above the given table box and |
| // include them in the table |
| void ColumnFinder::IncludeLeftOutColumnHeaders(TBOX& table_box) { |
| // Start a search above the current table to look for column headers |
| GridSearch<ColPartition, ColPartition_CLIST, ColPartition_C_IT> |
| vsearch(&clean_part_grid_); |
| vsearch.StartVerticalSearch(table_box.left(), table_box.right(), |
| table_box.top()); |
| ColPartition* neighbor; |
| ColPartition* previous_neighbor = NULL; |
| while ((neighbor = vsearch.NextVerticalSearch(false)) != NULL) { |
| int table_top = table_box.top(); |
| TBOX box = neighbor->bounding_box(); |
| // Do not continue if the next box is way above |
| // TODO(faisal): make the threshold some factor of line spacing |
| if (box.bottom() - table_top > kMaxColumnHeaderDistance) |
| break; |
| // Unconditionally include partitions of type TABLE or LINE |
| // TODO(faisal): add some reasonable conditions here |
| if (neighbor->type() == PT_TABLE || neighbor->IsLineType()) { |
| table_box.set_top(box.top()); |
| previous_neighbor = NULL; |
| continue; |
| } |
| // If there are two text partitions, one above the other, without a table |
| // cell on their left or right side, consider them a barrier and quit |
| if (previous_neighbor == NULL) { |
| previous_neighbor = neighbor; |
| } else { |
| TBOX previous_box = previous_neighbor->bounding_box(); |
| if (!box.major_y_overlap(previous_box)) |
| break; |
| } |
| } |
| } |
| |
| // Remove false alarms consiting of a single column based on their |
| // projection on the x-axis. Projection of a real table on the x-axis |
| // should have at least one zero-valley larger than the global median |
| // x-height of the page. |
| void ColumnFinder::DeleteSingleColumnTables() { |
| int page_width = tright().x() - bleft().x(); |
| ASSERT_HOST(page_width > 0); |
| // create an integer array to hold projection on x-axis |
| int* table_xprojection = new int[page_width]; |
| // Iterate through all tables in the table grid |
| GridSearch<ColSegment, ColSegment_CLIST, ColSegment_C_IT> |
| table_search(&table_grid_); |
| table_search.StartFullSearch(); |
| ColSegment* table; |
| while ((table = table_search.NextFullSearch()) != NULL) { |
| TBOX table_box = table->bounding_box(); |
| // reset the projection array |
| for (int i = 0; i < page_width; i++) { |
| table_xprojection[i] = 0; |
| } |
| // Start a rect search on table_box |
| GridSearch<ColPartition, ColPartition_CLIST, ColPartition_C_IT> |
| rectsearch(&clean_part_grid_); |
| rectsearch.StartRectSearch(table_box); |
| ColPartition* part; |
| while ((part = rectsearch.NextRectSearch()) != NULL) { |
| if (!rectsearch.ReturnedSeedElement()) |
| continue; // Consider each partition only once |
| if (!part->IsTextType()) |
| continue; // Do not consider non-text partitions |
| TBOX part_box = part->bounding_box(); |
| // Do not consider partitions partially covered by the table |
| if (part_box.overlap_fraction(table_box) < kMinOverlapWithTable) |
| continue; |
| BLOBNBOX_CLIST* part_boxes = part->boxes(); |
| BLOBNBOX_C_IT pit(part_boxes); |
| for (pit.mark_cycle_pt(); !pit.cycled_list(); pit.forward()) { |
| BLOBNBOX *pblob = pit.data(); |
| // ignore blob height for the purpose of projection since we |
| // are only interested in finding valleys |
| int xstart = pblob->bounding_box().left(); |
| int xend = pblob->bounding_box().right(); |
| for (int i = xstart; i < xend; i++) |
| table_xprojection[i - bleft().x()]++; |
| } |
| } |
| // Find largest valley between two reasonable peaks in the table |
| if (!GapInXProjection(table_xprojection, page_width)) { |
| table_search.RemoveBBox(); |
| delete table; |
| } |
| } |
| delete[] table_xprojection; |
| } |
| |
| // Return true if at least one gap larger than the global x-height |
| // exists in the horizontal projection |
| bool ColumnFinder::GapInXProjection(int* xprojection, int length) { |
| // Find peak value of the histogram |
| int peak_value = 0; |
| for (int i = 0; i < length; i++) { |
| if (xprojection[i] > peak_value) { |
| peak_value = xprojection[i]; |
| } |
| } |
| // Peak value represents the maximum number of horizontally |
| // overlapping colpartitions, so this can be considered as the |
| // number of rows in the table |
| if (peak_value < kMinRowsInTable) |
| return false; |
| double projection_threshold = kProjectionThreshold * peak_value; |
| // Threshold the histogram |
| for (int i = 0; i < length; i++) { |
| xprojection[i] = (xprojection[i] > projection_threshold) ? 1 : 0; |
| } |
| // Find the largest run of zeros between two ones |
| int largest_gap = 0; |
| int run_start = -1; |
| for (int i = 1; i < length; i++) { |
| // detect start of a run of zeros |
| if (xprojection[i - 1] && !xprojection[i]) { |
| run_start = i; |
| } |
| // detect end of a run of zeros and update the value of largest gap |
| if (run_start != -1 && !xprojection[i - 1] && xprojection[i]) { |
| int gap = i - run_start; |
| if (gap > largest_gap) |
| largest_gap = gap; |
| run_start = -1; |
| } |
| } |
| return (largest_gap > global_median_xheight_); |
| } |
| |
| // Displays the column segments in some window. |
| void ColumnFinder::DisplayColSegments(ColSegment_LIST *segments, |
| ScrollView* win, |
| ScrollView::Color color) { |
| #ifndef GRAPHICS_DISABLED |
| win->Pen(color); |
| win->Brush(ScrollView::NONE); |
| ColSegment_IT it(segments); |
| for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) { |
| ColSegment *col = it.data(); |
| TBOX box = col->bounding_box(); |
| int left_x = box.left(); |
| int right_x = box.right(); |
| int top_y = box.top(); |
| int bottom_y = box.bottom(); |
| win->Rectangle(left_x, bottom_y, right_x, top_y); |
| } |
| win->Update(); |
| #endif |
| } |
| |
| // Displays the colpartitions using a new coloring on an existing window. |
| // Note: This method is only for debug purpose during development and |
| // would not be part of checked in code |
| void ColumnFinder::DisplayColPartitions(ScrollView* win, |
| ScrollView::Color default_color) { |
| #ifndef GRAPHICS_DISABLED |
| // Iterate the ColPartitions in the grid. |
| GridSearch<ColPartition, ColPartition_CLIST, ColPartition_C_IT> |
| gsearch(&clean_part_grid_); |
| gsearch.StartFullSearch(); |
| ColPartition* part; |
| win->Brush(ScrollView::NONE); |
| ScrollView::Color color; |
| while ((part = gsearch.NextFullSearch()) != NULL) { |
| color = default_color; |
| TBOX box = part->bounding_box(); |
| // ColPartition* upper_part = part->nearest_neighbor_above(); |
| // ColPartition* lower_part = part->nearest_neighbor_below(); |
| // if (!upper_part && !lower_part) |
| // color = ScrollView::ORANGE; |
| // else if (upper_part && !lower_part) |
| // color = ScrollView::CYAN; |
| // else if (!upper_part && lower_part) |
| // color = ScrollView::MAGENTA; |
| if (part->type() == PT_TABLE) |
| color = ScrollView::YELLOW; |
| |
| int left_x = box.left(); |
| int right_x = box.right(); |
| int top_y = box.top(); |
| int bottom_y = box.bottom(); |
| win->Pen(color); |
| win->Rectangle(left_x, bottom_y, right_x, top_y); |
| } |
| win->Update(); |
| #endif |
| } |
| |
| // Write debug image and text file. |
| // Note: This method is only for debug purpose during development and |
| // would not be part of checked in code |
| void ColumnFinder::WriteToPix() { |
| #ifdef HAVE_LIBLEPT |
| // Input file must be named test1.tif |
| PIX* pix = pixRead("test1.tif"); |
| if (!pix) { |
| tprintf("Input file test1.tif not found.\n"); |
| return; |
| } |
| int img_height = pixGetHeight(pix); |
| int img_width = pixGetWidth(pix); |
| // Maximum number of text or table partitions |
| int num_boxes = 10; |
| BOXA* text_box_array = boxaCreate(num_boxes); |
| BOXA* table_box_array = boxaCreate(num_boxes); |
| GridSearch<ColPartition, ColPartition_CLIST, ColPartition_C_IT> |
| gsearch(&clean_part_grid_); |
| gsearch.StartFullSearch(); |
| ColPartition* part; |
| // load colpartitions into text_box_array and table_box_array |
| while ((part = gsearch.NextFullSearch()) != NULL) { |
| TBOX box = part->bounding_box(); |
| box.rotate_large(reskew_); |
| BOX* lept_box = boxCreate(box.left(), img_height - box.top(), |
| box.right() - box.left(), |
| box.top() - box.bottom()); |
| if (part->type() == PT_TABLE) |
| boxaAddBox(table_box_array, lept_box, L_INSERT); |
| else |
| boxaAddBox(text_box_array, lept_box, L_INSERT); |
| } |
| // draw colpartitions on the output image |
| PIX* out = pixDrawBoxa(pix, text_box_array, 3, 0xff000000); |
| out = pixDrawBoxa(out, table_box_array, 3, 0x0000ff00); |
| |
| BOXA* table_array = boxaCreate(num_boxes); |
| // text file containing detected table bounding boxes |
| FILE* fptr = fopen("tess-table.txt", "w"); |
| GridSearch<ColSegment, ColSegment_CLIST, ColSegment_C_IT> |
| table_search(&table_grid_); |
| table_search.StartFullSearch(); |
| ColSegment* table; |
| // load table boxes to table_array and write them to text file as well |
| while ((table = table_search.NextFullSearch()) != NULL) { |
| TBOX box = table->bounding_box(); |
| box.rotate_large(reskew_); |
| // Since deskewing introduces negative coordinates, reskewing |
| // might not completely recover from that since both steps enlarge |
| // the actual box. Hence a box that undergoes deskewing/reskewing |
| // may go out of image boundaries. Crop a table box if needed to |
| // contain it inside the image dimensions. |
| box = box.intersection(TBOX(0, 0, img_width - 1, img_height - 1)); |
| BOX* lept_box = boxCreate(box.left(), img_height - box.top(), |
| box.right() - box.left(), |
| box.top() - box.bottom()); |
| boxaAddBox(table_array, lept_box, L_INSERT); |
| fprintf(fptr, "%d %d %d %d TABLE\n", box.left(), |
| img_height - box.top(), box.right(), img_height - box.bottom()); |
| } |
| fclose(fptr); |
| // paint table boxes on the debug image |
| out = pixDrawBoxa(out, table_array, 5, 0x7fff0000); |
| |
| pixWrite("out.png", out, IFF_PNG); |
| // memory cleanup |
| boxaDestroy(&text_box_array); |
| boxaDestroy(&table_box_array); |
| boxaDestroy(&table_array); |
| pixDestroy(&pix); |
| pixDestroy(&out); |
| #endif |
| } |
| |
| // Merge all colpartitions in table regions to make them a single |
| // colpartition and revert types of isolated table cells not |
| // assigned to any table to their original types. |
| void ColumnFinder::MakeTableBlocks() { |
| // Since we have table blocks already, remove table tags from all |
| // colpartitions |
| GridSearch<ColPartition, ColPartition_CLIST, ColPartition_C_IT> |
| gsearch(&part_grid_); |
| gsearch.StartFullSearch(); |
| ColPartition* part; |
| while ((part = gsearch.NextFullSearch()) != NULL) { |
| if (part->type() == PT_TABLE) { |
| part->clear_table_type(); |
| } |
| } |
| // Now make a single colpartition out of each table block and remove |
| // all colpartitions contained within a table |
| GridSearch<ColSegment, ColSegment_CLIST, ColSegment_C_IT> |
| table_search(&table_grid_); |
| table_search.StartFullSearch(); |
| ColSegment* table; |
| while ((table = table_search.NextFullSearch()) != NULL) { |
| TBOX table_box = table->bounding_box(); |
| // Start a rect search on table_box |
| GridSearch<ColPartition, ColPartition_CLIST, ColPartition_C_IT> |
| rectsearch(&part_grid_); |
| rectsearch.StartRectSearch(table_box); |
| ColPartition* part; |
| ColPartition* table_partition = NULL; |
| while ((part = rectsearch.NextRectSearch()) != NULL) { |
| // Do not consider image partitions |
| if (!part->IsTextType()) |
| continue; |
| TBOX part_box = part->bounding_box(); |
| // Include partition in the table if more than half of it |
| // is covered by the table |
| if (part_box.overlap_fraction(table_box) > kMinOverlapWithTable) { |
| rectsearch.RemoveBBox(); |
| if (table_partition) { |
| table_partition->Absorb(part, WidthCB()); |
| } else { |
| table_partition = part; |
| } |
| } |
| } |
| // Insert table colpartition back to part_grid_ |
| if (table_partition) { |
| table_partition->SetPartitionType(best_columns_[table_search.GridY()]); |
| table_partition->set_table_type(); |
| part_grid_.InsertBBox(true, true, table_partition); |
| } |
| } |
| } |
| |
| // Provides a color for BBGrid to draw the rectangle. |
| ScrollView::Color ColSegment::BoxColor() const { |
| const ScrollView::Color kBoxColors[PT_COUNT] = { |
| ScrollView::YELLOW, |
| ScrollView::BLUE, |
| ScrollView::YELLOW, |
| ScrollView::MAGENTA, |
| }; |
| return kBoxColors[type_]; |
| } |
| |
| // Insert a box into this column segment |
| void ColSegment::InsertBox(const TBOX& other) { |
| bounding_box_ = bounding_box_.bounding_union(other); |
| } |
| |
| // Set column segment type based on the ratio of text and table partitions |
| // in it. |
| void ColSegment::set_type() { |
| if (num_table_cells_ > kTableColumnThreshold * num_text_cells_) |
| type_ = COL_TABLE; |
| else if (num_text_cells_ > num_table_cells_) |
| type_ = COL_TEXT; |
| else |
| type_ = COL_MIXED; |
| } |
| |
| } // namespace tesseract. |