21 #pragma warning(disable:4244) // Conversion warnings
27 #include "config_auto.h"
30 #include "allheaders.h"
153 "Debug table marking steps in detail");
155 "Show page stats used in table finding");
157 "Enables the table recognizer for table layout and filtering.");
170 global_median_xheight_(0),
171 global_median_blob_width_(0),
172 global_median_ledding_(0),
173 left_to_right_language_(true) {
191 const ICOORD& top_right) {
230 BLOBNBOX_CLIST* part_boxes = part->
boxes();
231 BLOBNBOX_C_IT pit(part_boxes);
232 for (pit.mark_cycle_pt(); !pit.cycled_list(); pit.forward()) {
239 if (leader_part ==
NULL) {
243 leader_part->
AddBox(pblob);
245 clean_part->
AddBox(pblob);
252 if (leader_part !=
NULL) {
282 table_win =
MakeWindow(100, 300,
"Fragmented Text");
290 ColSegment_LIST column_blocks;
304 ColSegment_LIST table_columns;
310 ColSegment_LIST table_regions;
439 if (part->
boxes()->empty()) {
449 bool found_split =
true;
450 while (found_split) {
452 BLOBNBOX_C_IT box_it(right_part->
boxes());
460 for (box_it.mark_cycle_pt(); !box_it.cycled_list(); box_it.forward()) {
461 const TBOX& box = box_it.data()->bounding_box();
463 box.
left() - previous_right > kThreshold) {
466 int mid_x = (box.
left() + previous_right) / 2;
468 right_part = left_part->
SplitAt(mid_x);
476 previous_right =
MAX(previous_right, box.
right());
509 return box.
height() > kHeightRequired &&
510 box.
width() > kWidthRequired &&
511 box.
area() > kAreaRequired;
524 ColSegment_LIST* column_blocks) {
527 if (columns !=
NULL) {
528 ColSegment_LIST new_blocks;
539 ColSegment_LIST* column_blocks) {
540 ColSegment_IT src_it(new_blocks);
541 ColSegment_IT dest_it(column_blocks);
543 for (src_it.mark_cycle_pt(); !src_it.cycled_list(); src_it.forward()) {
546 bool match_found =
false;
548 for (dest_it.mark_cycle_pt(); !dest_it.cycled_list(); dest_it.forward()) {
556 delete src_it.extract();
562 dest_it.add_after_then_move(src_it.extract());
571 return (abs(b1.
left() - b2.
left()) < x_margin) &&
595 int y = part->
MidY();
619 if (right < box.
left()) {
632 if (left > box.
right()) {
684 if (neighbor == part)
690 if (neighbor_box.
top() < part_box.
bottom() &&
691 gap < min_space_below) {
692 min_space_below = gap;
693 below_neighbor = neighbor;
695 else if (part_box.
top() < neighbor_box.
bottom() &&
696 gap < min_space_above) {
697 min_space_above = gap;
698 above_neighbor = neighbor;
731 BLOBNBOX_C_IT it(part->
boxes());
732 for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
733 xheight_stats.
add(it.data()->bounding_box().height(), 1);
734 width_stats.
add(it.data()->bounding_box().width(), 1);
745 #ifndef GRAPHICS_DISABLED
747 const char* kWindowName =
"X-height (R), X-width (G), and ledding (B)";
753 #endif // GRAPHICS_DISABLED
861 BLOBNBOX_CLIST* part_boxes = part->
boxes();
862 BLOBNBOX_C_IT it(part_boxes);
872 int previous_x1 = -1;
874 int largest_partition_gap_found = -1;
881 for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
885 if (previous_x1 != -1) {
886 int gap = current_x0 - previous_x1;
898 previous_x1 =
MAX(previous_x1, current_x1);
912 if (gap > largest_partition_gap_found)
913 largest_partition_gap_found = gap;
915 previous_x1 = current_x1;
929 if (largest_partition_gap_found == -1)
935 return largest_partition_gap_found < min_gap;
955 const int top = box.
top() + search_size;
956 const int bottom = box.
bottom() - search_size;
960 int x = right_to_left ? box.
right() : box.
left();
1022 int current_spacing = 0;
1023 int upper_spacing = 0;
1029 current_spacing = mid - left;
1030 upper_spacing = upper_mid - left;
1036 current_spacing = right - mid;
1037 upper_spacing = right - upper_mid;
1088 if (top > max_top) {
1092 if (bottom < min_bottom) {
1093 min_bottom = bottom;
1118 if (!upper_part || !lower_part)
1144 ColSegment_IT it(column_blocks);
1145 for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
1148 int num_table_cells = 0;
1149 int num_text_cells = 0;
1164 if (!num_table_cells && !num_text_cells) {
1165 delete it.extract();
1178 ColSegment_IT it(segments);
1179 for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
1206 bool neighbor_found =
false;
1207 bool modified =
false;
1217 neighbor_found =
false;
1223 if (neighbor == seg)
1245 neighbor_found =
true;
1252 }
while (neighbor_found);
1274 ColSegment_IT it(table_columns);
1294 bool found_neighbours =
false;
1310 found_neighbours =
true;
1312 if (found_neighbours) {
1313 it.add_after_then_move(col);
1324 ColSegment_LIST* table_regions) {
1325 ColSegment_IT cit(table_columns);
1326 ColSegment_IT rit(table_regions);
1335 bool* table_region =
new bool[page_height];
1339 for (
int i = 0; i < page_height; i++) {
1340 table_region[i] =
false;
1344 cit.move_to_first();
1345 for (cit.mark_cycle_pt(); !cit.cycled_list(); cit.forward()) {
1346 TBOX col_box = cit.data()->bounding_box();
1350 for (
int i = intersection_box.
bottom(); i < intersection_box.
top(); i++) {
1351 table_region[i -
bleft().
y()] =
true;
1355 TBOX current_table_box;
1360 for (
int i = 1; i < page_height; i++) {
1362 if (!table_region[i - 1] && table_region[i]) {
1367 if (table_region[i - 1] && !table_region[i]) {
1369 if (!current_table_box.
null_box()) {
1372 rit.add_after_then_move(seg);
1377 delete[] table_region;
1393 bool neighbor_found =
false;
1394 bool modified =
false;
1398 TBOX search_region(box);
1401 neighbor_found =
false;
1407 if (neighbor == seg)
1425 neighbor_found =
true;
1432 }
while (neighbor_found);
1487 ColSegment_CLIST adjusted_tables;
1488 ColSegment_C_IT it(&adjusted_tables);
1494 TBOX grown_box = table_box;
1502 it.add_after_then_move(col);
1513 for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
1528 TBOX search_box = table_box;
1542 const TBOX& search_range,
1546 for (
int i = 0; i < 2; ++i) {
1570 const TBOX& search_range,
1585 if (result_box->
contains(part_box))
1600 const TBOX& table_box) {
1618 int num_extra_partitions = 0;
1619 int extra_space_to_right = 0;
1620 int extra_space_to_left = 0;
1623 for (
int i = 0; i < 2; ++i) {
1640 num_extra_partitions++;
1644 extra_space_to_right++;
1645 extra_space_to_left++;
1650 extra_space_to_right++;
1652 extra_space_to_left++;
1657 return (extra_space_to_right > num_extra_partitions / 2) ||
1658 (extra_space_to_left > num_extra_partitions / 2);
1674 int table_top = table_box->
top();
1677 if (box.
bottom() - table_top > max_distance)
1683 previous_neighbor =
NULL;
1688 if (previous_neighbor ==
NULL) {
1689 previous_neighbor = neighbor;
1706 int* table_xprojection =
new int[page_width];
1715 for (
int i = 0; i < page_width; i++) {
1716 table_xprojection[i] = 0;
1733 BLOBNBOX_CLIST* part_boxes = part->
boxes();
1734 BLOBNBOX_C_IT pit(part_boxes);
1741 int next_position_to_write = 0;
1743 for (pit.mark_cycle_pt(); !pit.cycled_list(); pit.forward()) {
1750 xstart =
MAX(xstart, next_position_to_write);
1751 for (
int i = xstart; i < xend; i++)
1752 table_xprojection[i -
bleft().
x()]++;
1753 next_position_to_write = xend;
1762 delete[] table_xprojection;
1770 for (
int i = 0; i < length; i++) {
1771 if (xprojection[i] > peak_value) {
1772 peak_value = xprojection[i];
1784 for (
int i = 0; i < length; i++) {
1785 xprojection[i] = (xprojection[i] >= projection_threshold) ? 1 : 0;
1788 int largest_gap = 0;
1790 for (
int i = 1; i < length; i++) {
1792 if (xprojection[i - 1] && !xprojection[i]) {
1796 if (run_start != -1 && !xprojection[i - 1] && xprojection[i]) {
1797 int gap = i - run_start;
1798 if (gap > largest_gap)
1819 table_win =
MakeWindow(0, 0,
"Table Structure");
1834 ColSegment_CLIST good_tables;
1835 ColSegment_C_IT good_it(&good_tables);
1850 if (table_structure !=
NULL) {
1855 delete table_structure;
1856 good_it.add_after_then_move(found_table);
1865 for (good_it.mark_cycle_pt(); !good_it.cycled_list(); good_it.forward())
1871 ColSegment_LIST *segments,
1873 #ifndef GRAPHICS_DISABLED
1876 ColSegment_IT it(segments);
1877 for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
1880 int left_x = box.
left();
1881 int right_x = box.
right();
1882 int top_y = box.
top();
1883 int bottom_y = box.
bottom();
1884 win->
Rectangle(left_x, bottom_y, right_x, top_y);
1892 #ifndef GRAPHICS_DISABLED
1900 int left_x = box.
left();
1901 int right_x = box.
right();
1902 int top_y = box.
top();
1903 int bottom_y = box.
bottom();
1906 win->
Rectangle(left_x, bottom_y, right_x, top_y);
1919 #ifndef GRAPHICS_DISABLED
1927 color = default_color;
1929 color = table_color;
1932 int left_x = box.
left();
1933 int right_x = box.
right();
1934 int top_y = box.
top();
1935 int bottom_y = box.
bottom();
1938 win->
Rectangle(left_x, bottom_y, right_x, top_y);
1953 #ifndef GRAPHICS_DISABLED
1961 int left_x = box.
left();
1962 int right_x = box.
right();
1963 int top_y = box.
top();
1964 int bottom_y = box.
bottom();
1969 int mid_x = (left_x + right_x) / 2;
1970 int mid_y = (top_y + bottom_y) / 2;
1971 int other_x = (upper_box.
left() + upper_box.
right()) / 2;
1972 int other_y = (upper_box.
top() + upper_box.
bottom()) / 2;
1975 win->
Line(mid_x, mid_y, other_x, other_y);
1980 int mid_x = (left_x + right_x) / 2;
1981 int mid_y = (top_y + bottom_y) / 2;
1982 int other_x = (lower_box.
left() + lower_box.
right()) / 2;
1983 int other_y = (lower_box.
top() + lower_box.
bottom()) / 2;
1986 win->
Line(mid_x, mid_y, other_x, other_y);
1999 PIX* pix = pixRead(
"test1.tif");
2001 tprintf(
"Input file test1.tif not found.\n");
2004 int img_height = pixGetHeight(pix);
2005 int img_width = pixGetWidth(pix);
2008 BOXA* text_box_array = boxaCreate(num_boxes);
2009 BOXA* table_box_array = boxaCreate(num_boxes);
2018 BOX* lept_box = boxCreate(box.
left(), img_height - box.
top(),
2022 boxaAddBox(table_box_array, lept_box, L_INSERT);
2024 boxaAddBox(text_box_array, lept_box, L_INSERT);
2027 PIX* out = pixDrawBoxa(pix, text_box_array, 3, 0xff000000);
2028 out = pixDrawBoxa(out, table_box_array, 3, 0x0000ff00);
2030 BOXA* table_array = boxaCreate(num_boxes);
2032 FILE* fptr = fopen(
"tess-table.txt",
"wb");
2047 BOX* lept_box = boxCreate(box.
left(), img_height - box.
top(),
2050 boxaAddBox(table_array, lept_box, L_INSERT);
2051 fprintf(fptr,
"%d %d %d %d TABLE\n", box.
left(),
2056 out = pixDrawBoxa(out, table_array, 5, 0x7fff0000);
2058 pixWrite(
"out.png", out, IFF_PNG);
2060 boxaDestroy(&text_box_array);
2061 boxaDestroy(&table_box_array);
2062 boxaDestroy(&table_array);
2108 if (table_partition) {
2109 table_partition->
Absorb(part, width_cb);
2111 table_partition = part;
2116 if (table_partition) {
2128 grid->
InsertBBox(
true,
true, table_partition);
2137 num_table_cells_(0),
2152 return kBoxColors[type_];
2165 else if (num_text_cells_ > num_table_cells_)