Tesseract  3.02
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
output.cpp
Go to the documentation of this file.
1 /******************************************************************
2  * File: output.cpp (Formerly output.c)
3  * Description: Output pass
4  * Author: Phil Cheatle
5  * Created: Thu Aug 4 10:56:08 BST 1994
6  *
7  * (C) Copyright 1994, Hewlett-Packard Ltd.
8  ** Licensed under the Apache License, Version 2.0 (the "License");
9  ** you may not use this file except in compliance with the License.
10  ** You may obtain a copy of the License at
11  ** http://www.apache.org/licenses/LICENSE-2.0
12  ** Unless required by applicable law or agreed to in writing, software
13  ** distributed under the License is distributed on an "AS IS" BASIS,
14  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15  ** See the License for the specific language governing permissions and
16  ** limitations under the License.
17  *
18  **********************************************************************/
19 
20 #ifdef _MSC_VER
21 #pragma warning(disable:4244) // Conversion warnings
22 #endif
23 
24 #include "mfcpch.h"
25 #include <string.h>
26 #include <ctype.h>
27 #ifdef __UNIX__
28 #include <assert.h>
29 #include <unistd.h>
30 #include <errno.h>
31 #endif
32 #include "helpers.h"
33 #include "tfacep.h"
34 #include "tessvars.h"
35 #include "control.h"
36 #include "secname.h"
37 #include "reject.h"
38 #include "docqual.h"
39 #include "output.h"
40 #include "bestfirst.h"
41 #include "globals.h"
42 #include "tesseractclass.h"
43 
44 #define EPAPER_EXT ".ep"
45 #define PAGE_YSIZE 3508
46 #define CTRL_INSET '\024' //dc4=text inset
47 #define CTRL_FONT '\016' //so=font change
48 #define CTRL_DEFAULT '\017' //si=default font
49 #define CTRL_SHIFT '\022' //dc2=x shift
50 #define CTRL_TAB '\011' //tab
51 #define CTRL_NEWLINE '\012' //newline
52 #define CTRL_HARDLINE '\015' //cr
53 
54 /**********************************************************************
55  * pixels_to_pts
56  *
57  * Convert an integer number of pixels to the nearest integer
58  * number of points.
59  **********************************************************************/
60 
61 inT32 pixels_to_pts( //convert coords
62  inT32 pixels,
63  inT32 pix_res //resolution
64  ) {
65  float pts; //converted value
66 
67  pts = pixels * 72.0 / pix_res;
68  return (inT32) (pts + 0.5); //round it
69 }
70 
71 namespace tesseract {
// Output pass over one page: restarts page_res_it and, for every word,
// calls write_results() with the newline type computed by
// determine_newline_type() and a force_eol flag.
//
// page_res_it      iterator over the page results; it is restarted here and
//                  advanced to the end of the page.
// target_word_box  optional filter; when non-NULL, a word is skipped unless
//                  the centre of its bounding box lies inside this box.
72 void Tesseract::output_pass( //Tess output pass //send to api
73  PAGE_RES_IT &page_res_it,
74  const TBOX *target_word_box) {
75  BLOCK_RES *block_of_last_word;
76  inT16 block_id; // assigned below but not read again in this function
77  BOOL8 force_eol; //During output
78  BLOCK *nextblock; //block of next word
79  WERD *nextword; //next word
80 
81  page_res_it.restart_page ();
82  block_of_last_word = NULL;
83  while (page_res_it.word () != NULL) {
84  check_debug_pt (page_res_it.word (), 120);
85 
86  if (target_word_box)
87  {
88 
89  TBOX current_word_box=page_res_it.word ()->word->bounding_box();
  // Centre point of the current word's bounding box.
90  FCOORD center_pt((current_word_box.right()+current_word_box.left())/2,(current_word_box.bottom()+current_word_box.top())/2);
91  if (!target_word_box->contains(center_pt))
92  {
  // Word lies outside the requested box: move on without writing it.
93  page_res_it.forward ();
94  continue;
95  }
96 
97  }
  // NOTE(review): Doxygen line 98, which opens the condition continued on
  // the next line, is absent from this listing.
99  block_of_last_word != page_res_it.block ()) {
100  block_of_last_word = page_res_it.block ();
101  block_id = block_of_last_word->block->index();
102  }
103 
  // Force an end of line on the last word of the page, and on a change of
  // block when tessedit_write_block_separators is set.
104  force_eol = (tessedit_write_block_separators &&
105  (page_res_it.block () != page_res_it.next_block ())) ||
106  (page_res_it.next_word () == NULL);
107 
  // Look ahead to the next word and block; either may be NULL.
108  if (page_res_it.next_word () != NULL)
109  nextword = page_res_it.next_word ()->word;
110  else
111  nextword = NULL;
112  if (page_res_it.next_block () != NULL)
113  nextblock = page_res_it.next_block ()->block;
114  else
115  nextblock = NULL;
116  //regardless of tilde crunching
117  write_results(page_res_it,
118  determine_newline_type(page_res_it.word()->word,
119  page_res_it.block()->block,
120  nextword, nextblock), force_eol);
121  page_res_it.forward();
122  }
123 }
124 
125 
126 /*************************************************************************
127  * write_results()
128  *
129  * All recognition and rejection has now been done. Generate the following:
130  * .txt file - giving the final best choices with NO highlighting
131  * .raw file - giving the tesseract top choice output for each word
132  * .map file - showing how the .txt file has been rejected in the .ep file
133  * epchoice list - a list of one element per word, containing the text for the
134  * epaper. Reject strings are inserted.
135  * inset list - a list of bounding boxes of reject insets - indexed by the
136  * reject strings in the epchoice text.
137  *************************************************************************/
139  char newline_type, // type of newline
140  BOOL8 force_eol) { // override tilde crunch?
  // NOTE(review): Doxygen lines 138 (function name and first parameter),
  // 157, 242, 255, 273, 285 and 292 are absent from this listing, so the
  // conditions they carried cannot be seen here. page_res_it, used below,
  // is presumably the missing first parameter.
  //
  // Two paths follow:
  //  - crunched or empty words: build a short control string in ep_chars,
  //    store it as word->ep_choice and return;
  //  - all other words: update the stats_ flags, strip a leading reject
  //    character that would sit next to a previous one, then adjust the
  //    reject map.
141  WERD_RES *word = page_res_it.word();
142  const UNICHARSET &uchset = *word->uch_set;
143  STRING repetition_code;
144  const STRING *wordstr;
145  STRING wordstr_lengths;
146  int i;
147  char unrecognised = STRING (unrecognised_char)[0];
  // txt_chs and map_chs are filled and terminated in step with ep_chars,
  // but only ep_chars is consumed in the visible code.
148  char ep_chars[32]; //Only for unlv_tilde_crunch
149  int ep_chars_index = 0;
150  char txt_chs[32]; //Only for unlv_tilde_crunch
151  char map_chs[32]; //Only for unlv_tilde_crunch
152  int txt_index = 0;
153  BOOL8 need_reject = FALSE;
  // Id of the " " unichar; the "tilde" checks below compare against it.
154  UNICHAR_ID space = uchset.unichar_to_id(" ");
155  if ((word->unlv_crunch_mode != CR_NONE ||
156  word->best_choice->length() == 0) &&
158  if ((word->unlv_crunch_mode != CR_DELETE) &&
159  (!stats_.tilde_crunch_written ||
160  ((word->unlv_crunch_mode == CR_KEEP_SPACE) &&
161  (word->word->space () > 0) &&
162  !word->word->flag (W_FUZZY_NON) &&
163  !word->word->flag (W_FUZZY_SP)))) {
164  if (!word->word->flag (W_BOL) &&
165  (word->word->space () > 0) &&
166  !word->word->flag (W_FUZZY_NON) &&
167  !word->word->flag (W_FUZZY_SP)) {
168  // Write a space to separate from preceding good text.
169  txt_chs[txt_index] = ' ';
170  map_chs[txt_index++] = '1';
171  ep_chars[ep_chars_index++] = ' ';
172  stats_.last_char_was_tilde = false;
173  }
174  need_reject = TRUE;
175  }
176  if ((need_reject && !stats_.last_char_was_tilde) ||
177  (force_eol && stats_.write_results_empty_block)) {
178  /* Write a reject char - mark as rejected unless zero_rejection mode */
179  stats_.last_char_was_tilde = TRUE;
180  txt_chs[txt_index] = unrecognised;
181  if (tessedit_zero_rejection || (suspect_level == 0)) {
182  map_chs[txt_index++] = '1';
183  ep_chars[ep_chars_index++] = unrecognised;
184  }
185  else {
186  map_chs[txt_index++] = '0';
187  /*
188  The ep_choice string is a faked reject to allow newdiff to sync the
189  .etx with the .txt and .map files.
190  */
191  ep_chars[ep_chars_index++] = CTRL_INSET; // escape code
192  //dummy reject
193  ep_chars[ep_chars_index++] = 1;
194  //dummy reject
195  ep_chars[ep_chars_index++] = 1;
196  //type
197  ep_chars[ep_chars_index++] = 2;
198  //dummy reject
199  ep_chars[ep_chars_index++] = 1;
200  //dummy reject
201  ep_chars[ep_chars_index++] = 1;
202  }
203  stats_.tilde_crunch_written = true;
204  stats_.last_char_was_newline = false;
205  stats_.write_results_empty_block = false;
206  }
207 
208  if ((word->word->flag (W_EOL) && !stats_.last_char_was_newline) || force_eol) {
209  /* Add a new line output */
210  txt_chs[txt_index] = '\n';
211  map_chs[txt_index++] = '\n';
212  //end line
213  ep_chars[ep_chars_index++] = newline_type;
214 
215  //Cos of the real newline
216  stats_.tilde_crunch_written = false;
217  stats_.last_char_was_newline = true;
218  stats_.last_char_was_tilde = false;
219  }
220  txt_chs[txt_index] = '\0';
221  map_chs[txt_index] = '\0';
222  ep_chars[ep_chars_index] = '\0'; // terminate string
  // At most 1 + 6 + 1 characters plus the terminator are written above,
  // which fits the 32-byte buffers.
223  word->ep_choice = new WERD_CHOICE(ep_chars, uchset);
224 
225  if (force_eol)
226  stats_.write_results_empty_block = true;
227  return;
228  }
229 
230  /* NORMAL PROCESSING of non tilde crunched words */
231 
232  stats_.tilde_crunch_written = false;
233  if (newline_type)
234  stats_.last_char_was_newline = true;
235  else
236  stats_.last_char_was_newline = false;
237  stats_.write_results_empty_block = force_eol; // about to write a real word
238 
239  if (unlv_tilde_crunching &&
240  stats_.last_char_was_tilde &&
241  (word->word->space() == 0) &&
243  (word->best_choice->unichar_id(0) == space)) {
244  /* Prevent adjacent tilde across words - we know that adjacent tildes within
245  words have been removed */
  // Drop position 0 from the best choice, its blob choices (if any), the
  // reject map and the box word so that all four stay aligned.
246  word->best_choice->remove_unichar_id(0);
247  if (word->best_choice->blob_choices() != NULL) {
248  BLOB_CHOICE_LIST_C_IT blob_choices_it(word->best_choice->blob_choices());
249  if (!blob_choices_it.empty()) delete blob_choices_it.extract();
250  }
251  word->reject_map.remove_pos (0);
252  word->box_word->DeleteBox(0);
253  }
254  if (newline_type ||
256  stats_.last_char_was_tilde = false;
257  else {
  // Record whether this word ends in the reject (" ") unichar, for the
  // adjacent-tilde check on the next word.
258  if (word->reject_map.length () > 0) {
259  if (word->best_choice->unichar_id(word->reject_map.length() - 1) == space)
260  stats_.last_char_was_tilde = true;
261  else
262  stats_.last_char_was_tilde = false;
263  }
264  else if (word->word->space () > 0)
265  stats_.last_char_was_tilde = false;
266  /* else it is unchanged as there are no output chars */
267  }
268 
  // The best choice and the reject map must have one entry per character.
269  ASSERT_HOST (word->best_choice->length() == word->reject_map.length());
270 
271  set_unlv_suspects(word);
272  check_debug_pt (word, 120);
274  tprintf ("Dict word: \"%s\": %d\n",
275  word->best_choice->debug_string().string(),
276  dict_word(*(word->best_choice)));
277  }
278  if (word->word->flag (W_REP_CHAR) && tessedit_write_rep_codes) {
  // Repeated-character word: build "|^~R" followed by the repeated
  // character, with a matching per-character length string.
279  repetition_code = "|^~R";
280  wordstr_lengths = "\001\001\001\001";
281  repetition_code += uchset.id_to_unichar(get_rep_char(word));
282  wordstr_lengths += strlen(uchset.id_to_unichar(get_rep_char(word)));
283  wordstr = &repetition_code;
284  } else {
286  /* OVERRIDE ALL REJECTION MECHANISMS - ONLY REJECT TESS FAILURES */
287  for (i = 0; i < word->best_choice->length(); ++i) {
288  if (word->reject_map[i].rejected())
289  word->reject_map[i].setrej_minimal_rej_accept();
290  }
291  }
293  /* OVERRIDE ALL REJECTION MECHANISMS - ONLY REJECT TESS FAILURES */
  // Unlike the loop above, this one leaves " " (reject) characters rejected.
294  for (i = 0; i < word->best_choice->length(); ++i) {
295  if ((word->best_choice->unichar_id(i) != space) &&
296  word->reject_map[i].rejected())
297  word->reject_map[i].setrej_minimal_rej_accept();
298  }
299  }
300  }
301 }
302 } // namespace tesseract
303 
304 /**********************************************************************
305  * determine_newline_type
306  *
307  * Find whether we have a wrapping or hard newline.
308  * Return FALSE if not at end of line.
309  **********************************************************************/
310 
311 char determine_newline_type( //test line ends
312  WERD *word, //word to do
313  BLOCK *block, //current block
314  WERD *next_word, //next word
315  BLOCK *next_block //block of next word
316  ) {
317  inT16 end_gap; //to right edge
318  inT16 width; //of next word
319  TBOX word_box; //bounding
320  TBOX next_box; //next word
321  TBOX block_box; //block bounding
322 
323  if (!word->flag (W_EOL))
324  return FALSE; //not end of line
325  if (next_word == NULL || next_block == NULL || block != next_block)
326  return CTRL_NEWLINE;
327  if (next_word->space () > 0)
328  return CTRL_HARDLINE; //it is tabbed
329  word_box = word->bounding_box ();
330  next_box = next_word->bounding_box ();
331  block_box = block->bounding_box ();
332  //gap to eol
333  end_gap = block_box.right () - word_box.right ();
334  end_gap -= (inT32) block->space ();
335  width = next_box.right () - next_box.left ();
336  // tprintf("end_gap=%d-%d=%d, width=%d-%d=%d, nl=%d\n",
337  // block_box.right(),word_box.right(),end_gap,
338  // next_box.right(),next_box.left(),width,
339  // end_gap>width ? CTRL_HARDLINE : CTRL_NEWLINE);
340  return end_gap > width ? CTRL_HARDLINE : CTRL_NEWLINE;
341 }
342 
343 /*************************************************************************
344  * get_rep_char()
345  * Return the first accepted character from the repetition string. This is the
346  * character which is repeated - as determined earlier by fix_rep_char()
347  *************************************************************************/
348 namespace tesseract {
349 UNICHAR_ID Tesseract::get_rep_char(WERD_RES *word) { // what char is repeated?
350  int i;
351  for (i = 0; ((i < word->reject_map.length()) &&
352  (word->reject_map[i].rejected())); ++i);
353 
354  if (i < word->reject_map.length()) {
355  return word->best_choice->unichar_id(i);
356  } else {
357  return word->uch_set->unichar_to_id(unrecognised_char.string());
358  }
359 }
360 
361 /*************************************************************************
362  * SUSPECT LEVELS
363  *
364  * 0 - don't reject ANYTHING
365  * 1,2 - partial rejection
366  * 3 - BEST
367  *
368  * NOTE: to reject JUST tess failures in the .map file set suspect_level 3 and
369  * tessedit_minimal_rejection.
370  *************************************************************************/
  // NOTE(review): the signature line (Doxygen line 371) is absent from this
  // listing; the body operates on a WERD_RES pointer named word_res.
  //
  // Relaxes rejections in word_res->reject_map according to suspect_level
  // by calling setrej_minimal_rej_accept() on selected positions.
372  int len = word_res->reject_map.length();
373  const WERD_CHOICE &word = *(word_res->best_choice);
374  const UNICHARSET &uchset = *word.unicharset();
375  int i;
376  float rating_per_ch;
377 
  // Level 0: accept every rejected position.
378  if (suspect_level == 0) {
379  for (i = 0; i < len; i++) {
380  if (word_res->reject_map[i].rejected())
381  word_res->reject_map[i].setrej_minimal_rej_accept();
382  }
383  return;
384  }
385 
  // Level 3 and above: leave the reject map untouched.
386  if (suspect_level >= 3)
387  return; //Use defaults
388 
389  /* NOW FOR LEVELS 1 and 2 Find some stuff to unreject*/
390 
391  if (safe_dict_word(word_res) &&
392  (count_alphas(word) > suspect_short_words)) {
393  /* Unreject alphas in dictionary words */
394  for (i = 0; i < len; ++i) {
395  if (word_res->reject_map[i].rejected() &&
396  uchset.get_isalpha(word.unichar_id(i)))
397  word_res->reject_map[i].setrej_minimal_rej_accept();
398  }
399  }
400 
  // NOTE(review): the divisor is the reject map length; a zero-length map
  // is not guarded against here.
401  rating_per_ch = word.rating() / word_res->reject_map.length();
402 
403  if (rating_per_ch >= suspect_rating_per_ch)
404  return; //Don't touch bad ratings
405 
406  if ((word_res->tess_accepted) || (rating_per_ch < suspect_accept_rating)) {
407  /* Unreject any Tess Acceptable word - but NOT tess reject chs*/
408  for (i = 0; i < len; ++i) {
409  if (word_res->reject_map[i].rejected() &&
410  (!uchset.eq(word.unichar_id(i), " ")))
411  word_res->reject_map[i].setrej_minimal_rej_accept();
412  }
413  }
414 
  // Accept positions rejected at document, block or row level.
415  for (i = 0; i < len; i++) {
416  if (word_res->reject_map[i].rejected()) {
417  if (word_res->reject_map[i].flag(R_DOC_REJ))
418  word_res->reject_map[i].setrej_minimal_rej_accept();
419  if (word_res->reject_map[i].flag(R_BLOCK_REJ))
420  word_res->reject_map[i].setrej_minimal_rej_accept();
421  if (word_res->reject_map[i].flag(R_ROW_REJ))
422  word_res->reject_map[i].setrej_minimal_rej_accept();
423  }
424  }
425 
  // Level 2 stops here; what follows applies to level 1 only.
426  if (suspect_level == 2)
427  return;
428 
429  if (!suspect_constrain_1Il ||
430  (word_res->reject_map.length() <= suspect_short_words)) {
431  for (i = 0; i < len; i++) {
432  if (word_res->reject_map[i].rejected()) {
433  if ((word_res->reject_map[i].flag(R_1IL_CONFLICT) ||
434  word_res->reject_map[i].flag(R_POSTNN_1IL)))
435  word_res->reject_map[i].setrej_minimal_rej_accept();
436 
437  if (!suspect_constrain_1Il &&
438  word_res->reject_map[i].flag(R_MM_REJECT))
439  word_res->reject_map[i].setrej_minimal_rej_accept();
440  }
441  }
442  }
443 
444  if (acceptable_word_string(*word_res->uch_set,
445  word.unichar_string().string(),
446  word.unichar_lengths().string()) !=
447  AC_UNACCEPTABLE ||
  // NOTE(review): Doxygen line 448, the start of the second alternative of
  // this condition, is absent from this listing.
449  word.unichar_lengths().string())) {
450  if (word_res->reject_map.length() > suspect_short_words) {
451  for (i = 0; i < len; i++) {
452  if (word_res->reject_map[i].rejected() &&
453  (!word_res->reject_map[i].perm_rejected() ||
454  word_res->reject_map[i].flag (R_1IL_CONFLICT) ||
455  word_res->reject_map[i].flag (R_POSTNN_1IL) ||
456  word_res->reject_map[i].flag (R_MM_REJECT))) {
457  word_res->reject_map[i].setrej_minimal_rej_accept();
458  }
459  }
460  }
461  }
462 }
463 
  // NOTE(review): the signature line (Doxygen line 464) is absent from this
  // listing; this is presumably the count_alphas() called from the suspect
  // level code above.
  // Returns how many unichars of `word` its unicharset reports as alphabetic.
465  int count = 0;
466  for (int i = 0; i < word.length(); ++i) {
467  if (word.unicharset()->get_isalpha(word.unichar_id(i)))
468  count++;
469  }
470  return count;
471 }
472 
473 
  // NOTE(review): the signature line (Doxygen line 474) is absent from this
  // listing, so the function name cannot be confirmed here.
  // Returns how many unichars of `word` its unicharset reports as either
  // alphabetic or a digit.
475  int count = 0;
476  for (int i = 0; i < word.length(); ++i) {
477  if (word.unicharset()->get_isalpha(word.unichar_id(i)) ||
478  word.unicharset()->get_isdigit(word.unichar_id(i)))
479  count++;
480  }
481  return count;
482 }
483 
484 
486  const char *lengths) {
  // NOTE(review): the first signature line (Doxygen line 485) is absent
  // from this listing; the body reads a character string `s` and a member
  // `unicharset`.
  //
  // Checks whether s looks like a number. lengths[k] is used as the byte
  // count of the k-th character of s.
  // Accepted shape, as coded below: an optional leading '(' then an optional
  // one of '$' '.' '+' '-', followed by digits in which a single '.', ',' or
  // '-' may follow a digit, optionally ending in '%', ')' or "%)" after a
  // digit. Any other character returns FALSE; reaching the end of s returns
  // TRUE (this includes an empty string).
487  BOOL8 prev_digit = FALSE;
488 
  // NOTE(review): the two prefix skips below advance s but not lengths, so
  // s and lengths are out of step afterwards whenever a later entry of
  // lengths is not 1 - confirm whether that is intended.
489  if (*lengths == 1 && *s == '(')
490  s++;
491 
492  if (*lengths == 1 &&
493  ((*s == '$') || (*s == '.') || (*s == '+') || (*s == '-')))
494  s++;
495 
496  for (; *s != '\0'; s += *(lengths++)) {
497  if (unicharset.get_isdigit(s, *lengths))
498  prev_digit = TRUE;
499  else if (prev_digit &&
500  (*lengths == 1 && ((*s == '.') || (*s == ',') || (*s == '-'))))
501  prev_digit = FALSE;
502  else if (prev_digit && *lengths == 1 &&
503  (*(s + *lengths) == '\0') && ((*s == '%') || (*s == ')')))
504  return TRUE;
505  else if (prev_digit &&
506  *lengths == 1 && (*s == '%') &&
507  (*(lengths + 1) == 1 && *(s + *lengths) == ')') &&
508  (*(s + *lengths + *(lengths + 1)) == '\0'))
509  return TRUE;
510  else
511  return FALSE;
512  }
513  return TRUE;
514 }
515 } // namespace tesseract