Tesseract  3.02
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
tesseractclass.h
Go to the documentation of this file.
1 
2 // File: tesseractclass.h
3 // Description: An instance of Tesseract. For thread safety, *every*
4 // global variable goes in here, directly, or indirectly.
5 // Author: Ray Smith
6 // Created: Fri Mar 07 08:17:01 PST 2008
7 //
8 // (C) Copyright 2008, Google Inc.
9 // Licensed under the Apache License, Version 2.0 (the "License");
10 // you may not use this file except in compliance with the License.
11 // You may obtain a copy of the License at
12 // http://www.apache.org/licenses/LICENSE-2.0
13 // Unless required by applicable law or agreed to in writing, software
14 // distributed under the License is distributed on an "AS IS" BASIS,
15 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 // See the License for the specific language governing permissions and
17 // limitations under the License.
18 //
20 
21 #ifndef TESSERACT_CCMAIN_TESSERACTCLASS_H__
22 #define TESSERACT_CCMAIN_TESSERACTCLASS_H__
23 
24 #include "allheaders.h"
25 #include "control.h"
26 #include "docqual.h"
27 #include "devanagari_processing.h"
28 #include "genericvector.h"
29 #include "params.h"
30 #include "ocrclass.h"
31 #include "textord.h"
32 #include "wordrec.h"
33 
34 class PAGE_RES;
35 class PAGE_RES_IT;
36 class BLOCK_LIST;
37 class CharSamp;
38 class TO_BLOCK_LIST;
39 class IMAGE;
40 class WERD_RES;
41 class ROW;
42 class TBOX;
43 class SVMenuNode;
44 struct Pix;
45 class WERD_CHOICE;
46 class WERD;
47 class BLOB_CHOICE_LIST_CLIST;
48 struct OSResults;
49 
50 
51 // Top-level class for all tesseract global instance data.
52 // This class either holds or points to all data used by an instance
53 // of Tesseract, including the memory allocator. When this is
54 // complete, Tesseract will be thread-safe. UNTIL THEN, IT IS NOT!
55 //
56 // NOTE to developers: Do not create cyclic dependencies through this class!
57 // The directory dependency tree must remain a tree! To keep this clean,
58 // lower-level code (eg in ccutil, the bottom level) must never need to
59 // know about the content of a higher-level directory.
60 // The following scheme will grant the easiest access to lower-level
61 // global members without creating a cyclic dependency:
62 //
63 // Class Hierarchy (^ = inheritance):
64 //
65 // CCUtil (ccutil/ccutil.h)
66 // ^ Members include: UNICHARSET
67 // CUtil (cutil/cutil_class.h)
68 // ^ Members include: TBLOB*, TEXTBLOCK*
69 // CCStruct (ccstruct/ccstruct.h)
70 // ^ Members include: Image
71 // Classify (classify/classify.h)
72 // ^ Members include: Dict
73 // WordRec (wordrec/wordrec.h)
74 // ^ Members include: WERD*, DENORM*
75 // Tesseract (ccmain/tesseractclass.h)
76 // Members include: Pix*, CubeRecoContext*,
77 // TesseractCubeCombiner*
78 //
79 // Other important classes:
80 //
81 // TessBaseAPI (api/baseapi.h)
82 // Members include: BLOCK_LIST*, PAGE_RES*,
83 // Tesseract*, ImageThresholder*
84 // Dict (dict/dict.h)
85 // Members include: Image* (private)
86 //
87 // NOTE: that each level contains members that correspond to global
88 // data that is defined (and used) at that level, not necessarily where
89 // the type is defined so for instance:
90 // BOOL_VAR_H(textord_show_blobs, false, "Display unsorted blobs");
91 // goes inside the Textord class, not the cc_util class.
92 
93 namespace tesseract {
94 
95 class ColumnFinder;
96 class CubeLineObject;
97 class CubeObject;
98 class CubeRecoContext;
99 class EquationDetect;
100 class Tesseract;
101 class TesseractCubeCombiner;
102 
103 typedef void (Tesseract::*WordRecognizer)(BLOCK* block,  // Pointer-to-member-function type for Tesseract methods that work on one word.
104  ROW *row,
105  WERD_RES *word);  // A matching method takes (block, row, word) and returns void.
106 
107 // A collection of various variables for statistics and debugging.
111  doc_blob_quality(0),
112  doc_outline_errs(0),
113  doc_char_quality(0),
114  good_char_count(0),
116  word_count(0),
117  dict_words(0),
118  tilde_crunch_written(false),
119  last_char_was_newline(true),
120  last_char_was_tilde(false),
122 
129  inT32 word_count; // count of words in the document
130  inT32 dict_words; // number of dictionary words in the document
131  STRING dump_words_str; // accumulator used by dump_words()
132  // Flags used by write_results()
137 };
138 
139 class Tesseract : public Wordrec {
140  public:
141  Tesseract();
142  ~Tesseract();
143 
144  // Clear as much used memory as possible without resetting the adaptive
145  // classifier or losing any other classifier data.
146  void Clear();
147  // Clear all memory of adaption for this and all subclassifiers.
149  // Clear the document dictionary for this and all subclassifiers.
151 
152  // Set the equation detector.
153  void SetEquationDetect(EquationDetect* detector);
154 
155  // Simple accessors.
156  const FCOORD& reskew() const {  // Read-only access to the reskew_ member.
157  return reskew_;  // Returned by const reference, not copied.
158  }
159  // Destroy any existing pix and return a pointer to the pointer.
161  Clear();
162  return &pix_binary_;
163  }
164  Pix* pix_binary() const {  // Returns the stored pix_binary_ pointer.
165  return pix_binary_;  // The pointer is handed back as-is; nothing is copied or cloned here.
166  }
167  Pix* pix_grey() const {  // Returns the stored pix_grey_ pointer.
168  return pix_grey_;  // The pointer is handed back as-is; nothing is copied or cloned here.
169  }
170  void set_pix_grey(Pix* grey_pix) {  // Replaces the stored grey image with grey_pix.
171  pixDestroy(&pix_grey_);  // Hand the currently stored grey Pix to pixDestroy before overwriting it.
172  pix_grey_ = grey_pix;  // NOTE(review): presumably this object owns grey_pix from here on -- confirm with callers.
173  }
174  // Returns a pointer to a Pix representing the best available image of the
175  // page. The image will be 8-bit grey if the input was grey or color. Note
176  // that in grey 0 is black and 255 is white. If the input was binary, then
177  // the returned Pix will be binary. Note that here black is 1 and white is 0.
178  // To tell the difference pixGetDepth() will return 8 or 1.
179  // In either case, the return value is a borrowed Pix, and should not be
180  // deleted or pixDestroyed.
181  Pix* BestPix() const {  // Chooses between the grey and the binary page image.
182  return pix_grey_ != NULL ? pix_grey_ : pix_binary_;  // Grey is returned whenever it is non-NULL; otherwise the binary image.
183  }
184  int source_resolution() const {  // Returns the stored source_resolution_ value.
185  return source_resolution_;
186  }
187  void set_source_resolution(int ppi) {  // Stores ppi as source_resolution_; no validation is done here.
188  source_resolution_ = ppi;  // NOTE(review): units presumably pixels per inch, going by the parameter name -- confirm.
189  }
190  int ImageWidth() const {  // Width of the binary image, as reported by pixGetWidth.
191  return pixGetWidth(pix_binary_);  // Always queries pix_binary_; pix_grey_ is not consulted.
192  }
193  int ImageHeight() const {  // Height of the binary image, as reported by pixGetHeight.
194  return pixGetHeight(pix_binary_);  // Always queries pix_binary_; pix_grey_ is not consulted.
195  }
196  Pix* scaled_color() const {  // Returns the stored scaled_color_ pointer (set via SetScaledColor).
197  return scaled_color_;
198  }
199  int scaled_factor() const {  // Returns the stored scaled_factor_ value (set via SetScaledColor).
200  return scaled_factor_;
201  }
202  void SetScaledColor(int factor, Pix* color) {  // Records a scale factor and its color Pix together.
203  scaled_factor_ = factor;
204  scaled_color_ = color;  // The previous scaled_color_ is overwritten without being destroyed in this method.
205  }
206  const Textord& textord() const {  // Read-only access to the textord_ member.
207  return textord_;  // Returned by const reference, not copied.
208  }
210  return &textord_;
211  }
212 
213  bool right_to_left() const {  // Returns the stored right_to_left_ flag.
214  return right_to_left_;
215  }
216  int num_sub_langs() const {  // Number of entries currently held in sub_langs_.
217  return sub_langs_.size();
218  }
219  Tesseract* get_sub_lang(int index) const {  // Returns the Tesseract pointer stored at sub_langs_[index].
220  return sub_langs_[index];  // index is not validated in this accessor; see num_sub_langs() for the entry count.
221  }
222 
223  void SetBlackAndWhitelist();
224 
225  // Perform steps to prepare underlying binary image/other data structures for
226  // page segmentation. Uses the strategy specified in the global variable
227  // pageseg_devanagari_split_strategy for performing splitting while preparing for
228  // page segmentation.
229  void PrepareForPageseg();
230 
231  // Perform steps to prepare underlying binary image/other data structures for
232  // Tesseract OCR. The current segmentation is required by this method.
233  // Uses the strategy specified in the global variable
234  // ocr_devanagari_split_strategy for performing splitting while preparing for
235  // Tesseract ocr.
236  void PrepareForTessOCR(BLOCK_LIST* block_list,
237  Tesseract* osd_tess, OSResults* osr);
238 
239  int SegmentPage(const STRING* input_file, BLOCK_LIST* blocks,
240  Tesseract* osd_tess, OSResults* osr);
241  void SetupWordScripts(BLOCK_LIST* blocks);
242  int AutoPageSeg(bool single_column, bool osd, bool only_osd,
243  BLOCK_LIST* blocks, TO_BLOCK_LIST* to_blocks,
244  Tesseract* osd_tess, OSResults* osr);
246  bool single_column, bool osd, bool only_osd,
247  BLOCK_LIST* blocks, Tesseract* osd_tess, OSResults* osr,
248  TO_BLOCK_LIST* to_blocks, Pix** photo_mask_pix, Pix** music_mask_pix);
249 
251  bool ProcessTargetWord(const TBOX& word_box, const TBOX& target_word_box,
252  const char* word_config, int pass);
253  bool recog_all_words(PAGE_RES* page_res,
254  ETEXT_DESC* monitor,
255  const TBOX* target_word_box,
256  const char* word_config,
257  int dopasses);
258  void rejection_passes(PAGE_RES* page_res,
259  ETEXT_DESC* monitor,
260  const TBOX* target_word_box,
261  const char* word_config);
262  void bigram_correction_pass(PAGE_RES *page_res);
263  void blamer_pass(PAGE_RES* page_res);
264  // Helper to recognize the word using the given (language-specific) tesseract.
265  // Returns true if the result was better than previously.
266  bool RetryWithLanguage(WERD_RES *word, BLOCK* block, ROW *row,
267  WordRecognizer recognizer);
269  BLOCK* block, ROW *row, WERD_RES *word);
270  void classify_word_pass1(BLOCK* block, ROW *row, WERD_RES *word);
271  void recog_pseudo_word(PAGE_RES* page_res, // blocks to check
272  TBOX &selection_box);
273 
274  void fix_rep_char(PAGE_RES_IT* page_res_it);
275  void ExplodeRepeatedWord(BLOB_CHOICE* best_choice, PAGE_RES_IT* page_res_it);
276 
278  const char *s,
279  const char *lengths);
280  void match_word_pass2( //recog one word
281  WERD_RES *word, //word to do
282  ROW *row,
283  BLOCK* block);
284  void classify_word_pass2(BLOCK* block, ROW *row, WERD_RES *word);
285  void ReportXhtFixResult(bool accept_new_word, float new_x_ht,
286  WERD_RES* word, WERD_RES* new_word);
287  bool RunOldFixXht(WERD_RES *word, BLOCK* block, ROW *row);
288  bool TrainedXheightFix(WERD_RES *word, BLOCK* block, ROW *row);
289  BOOL8 recog_interactive(BLOCK* block, ROW* row, WERD_RES* word_res);
290 
291  void set_word_fonts(
292  WERD_RES *word, // set fonts of this word
293  BLOB_CHOICE_LIST_CLIST *blob_choices); // detailed results
294  void font_recognition_pass(PAGE_RES* page_res);
295  BOOL8 check_debug_pt(WERD_RES *word, int location);
296 
298  bool init_cube_objects(bool load_combiner,
300  // Iterates through tesseract's results and calls cube on each word,
301  // combining the results with the existing tesseract result.
302  void run_cube_combiner(PAGE_RES *page_res);
303  // Recognizes a single word using (only) cube. Compatible with
304  // Tesseract's classify_word_pass1/classify_word_pass2.
305  void cube_word_pass1(BLOCK* block, ROW *row, WERD_RES *word);
306  // Cube recognizer to recognize a single word as with classify_word_pass1
307  // but also returns the cube object in case the combiner is needed.
309  // Combines the cube and tesseract results for a single word, leaving the
310  // result in tess_word.
311  void cube_combine_word(CubeObject* cube_obj, WERD_RES* cube_word,
312  WERD_RES* tess_word);
313  // Call cube on the current word, and write the result to word.
314  // Sets up a fake result and returns false if something goes wrong.
315  bool cube_recognize(CubeObject *cube_obj, BLOCK* block, WERD_RES *word);
316  void fill_werd_res(const BoxWord& cube_box_word,
317  WERD_CHOICE* cube_werd_choice,
318  const char* cube_best_str,
319  WERD_RES* tess_werd_res);
320  bool extract_cube_state(CubeObject* cube_obj, int* num_chars,
321  Boxa** char_boxes, CharSamp*** char_samples);
322  bool create_cube_box_word(Boxa *char_boxes, int num_chars,
323  TBOX word_box, BoxWord* box_word);
325 
326  void output_pass(PAGE_RES_IT &page_res_it, const TBOX *target_word_box);
327  void write_results(PAGE_RES_IT &page_res_it, // full info
328  char newline_type, // type of newline
329  BOOL8 force_eol // override tilde crunch?
330  );
331  void set_unlv_suspects(WERD_RES *word);
332  UNICHAR_ID get_rep_char(WERD_RES *word); // what char is repeated?
333  BOOL8 acceptable_number_string(const char *s,
334  const char *lengths);
335  inT16 count_alphanums(const WERD_CHOICE &word);
336  inT16 count_alphas(const WERD_CHOICE &word);
338  void read_config_file(const char *filename, SetParamConstraint constraint);
339  // Initialize for potentially a set of languages defined by the language
340  // string and recursively any additional languages required by any language
341  // traineddata file (via tessedit_load_sublangs in its config) that is loaded.
342  // See init_tesseract_internal for args.
343  int init_tesseract(const char *arg0,
344  const char *textbase,
345  const char *language,
346  OcrEngineMode oem,
347  char **configs,
348  int configs_size,
349  const GenericVector<STRING> *vars_vec,
350  const GenericVector<STRING> *vars_values,
351  bool set_only_init_params);
352  int init_tesseract(const char *datapath,  // Convenience overload taking only datapath, language and engine mode.
353  const char *language,
354  OcrEngineMode oem) {
355  return init_tesseract(datapath, NULL, language, oem,  // Forwards to the full overload: datapath fills the arg0 slot, textbase is NULL.
356  NULL, 0, NULL, NULL, false);  // No config files, no variable vectors, and set_only_init_params is false.
357  }
358  // Common initialization for a single language.
359  // arg0 is the datapath for the tessdata directory, which could be the
360  // path of the tessdata directory with no trailing /, or (if tessdata
361  // lives in the same directory as the executable), the path of the executable,
362  // hence the name arg0.
363  // textbase is an optional output file basename (used only for training)
364  // language is the language code to load.
365  // oem controls which engine(s) will operate on the image
366  // configs (argv) is an array of config filenames to load variables from.
367  // May be NULL.
368  // configs_size (argc) is the number of elements in configs.
369  // vars_vec is an optional vector of variables to set.
370  // vars_values is an optional corresponding vector of values for the variables
371  // in vars_vec.
372  // If set_only_init_params is true, then only the initialization variables
373  // will be set.
374  int init_tesseract_internal(const char *arg0,
375  const char *textbase,
376  const char *language,
377  OcrEngineMode oem,
378  char **configs,
379  int configs_size,
380  const GenericVector<STRING> *vars_vec,
381  const GenericVector<STRING> *vars_values,
382  bool set_only_init_params);
383 
384  // Set the universal_id member of each font to be unique among all
385  // instances of the same font loaded.
386  void SetupUniversalFontIds();
387 
388  int init_tesseract_lm(const char *arg0,
389  const char *textbase,
390  const char *language);
391 
392  void recognize_page(STRING& image_name);
393  void end_tesseract();
394 
395  bool init_tesseract_lang_data(const char *arg0,
396  const char *textbase,
397  const char *language,
398  OcrEngineMode oem,
399  char **configs,
400  int configs_size,
401  const GenericVector<STRING> *vars_vec,
402  const GenericVector<STRING> *vars_values,
403  bool set_only_init_params);
404 
405  void ParseLanguageString(const char* lang_str,
406  GenericVector<STRING>* to_load,
407  GenericVector<STRING>* not_to_load);
408 
411  #ifndef GRAPHICS_DISABLED
412  void pgeditor_main(int width, int height, PAGE_RES* page_res);
413  #endif // GRAPHICS_DISABLED
414  void process_image_event( // action in image win
415  const SVEvent &event);
416  BOOL8 process_cmd_win_event( // UI command semantics
417  inT32 cmd_event, // which menu item?
418  char *new_value // any prompt data
419  );
420  void debug_word(PAGE_RES* page_res, const TBOX &selection_box);
421  void do_re_display(
422  BOOL8 (tesseract::Tesseract::*word_painter)(BLOCK* block,
423  ROW* row,
424  WERD_RES* word_res));
425  BOOL8 word_display(BLOCK* block, ROW* row, WERD_RES* word_res);
426  BOOL8 word_bln_display(BLOCK* block, ROW* row, WERD_RES* word_res);
427  BOOL8 word_blank_and_set_display(BLOCK* block, ROW* row, WERD_RES* word_res);
428  BOOL8 word_set_display(BLOCK* block, ROW* row, WERD_RES* word_res);
429  // #ifndef GRAPHICS_DISABLED
430  BOOL8 word_dumper(BLOCK* block, ROW* row, WERD_RES* word_res);
431  // #endif // GRAPHICS_DISABLED
433  void make_reject_map( //make rej map for wd //detailed results
434  WERD_RES *word,
435  BLOB_CHOICE_LIST_CLIST *blob_choices,
436  ROW *row,
437  inT16 pass //1st or 2nd?
438  );
439  BOOL8 one_ell_conflict(WERD_RES *word_res, BOOL8 update_map);
440  inT16 first_alphanum_index(const char *word,
441  const char *word_lengths);
442  inT16 first_alphanum_offset(const char *word,
443  const char *word_lengths);
444  inT16 alpha_count(const char *word,
445  const char *word_lengths);
446  BOOL8 word_contains_non_1_digit(const char *word,
447  const char *word_lengths);
448  void dont_allow_1Il(WERD_RES *word);
449  inT16 count_alphanums( //how many alphanums
450  WERD_RES *word);
451  void flip_0O(WERD_RES *word);
452  BOOL8 non_0_digit(const UNICHARSET& ch_set, UNICHAR_ID unichar_id);
453  BOOL8 non_O_upper(const UNICHARSET& ch_set, UNICHAR_ID unichar_id);
455  void nn_match_word( //Match a word
456  WERD_RES *word,
457  ROW *row);
458  void nn_recover_rejects(WERD_RES *word, ROW *row);
459  BOOL8 test_ambig_word( //test for ambiguity
460  WERD_RES *word);
461  void set_done( //set done flag
462  WERD_RES *word,
463  inT16 pass);
464  inT16 safe_dict_word(const WERD_RES *werd_res); // is best_choice in dict?
465  void flip_hyphens(WERD_RES *word);
466  void reject_I_1_L(WERD_RES *word);
467  void reject_edge_blobs(WERD_RES *word);
468  void reject_mostly_rejects(WERD_RES *word);
470  BOOL8 word_adaptable( //should we adapt?
471  WERD_RES *word,
472  uinT16 mode);
473 
475  void recog_word_recursive(WERD_RES* word,
476  BLOB_CHOICE_LIST_CLIST *blob_choices);
477  void recog_word(WERD_RES *word,
478  BLOB_CHOICE_LIST_CLIST *blob_choices);
479  void split_and_recog_word(WERD_RES* word,
480  BLOB_CHOICE_LIST_CLIST *blob_choices);
482  BOOL8 digit_or_numeric_punct(WERD_RES *word, int char_position);
483  inT16 eval_word_spacing(WERD_RES_LIST &word_res_list);
484  void match_current_words(WERD_RES_LIST &words, ROW *row, BLOCK* block);
485  inT16 fp_eval_word_spacing(WERD_RES_LIST &word_res_list);
486  void fix_noisy_space_list(WERD_RES_LIST &best_perm, ROW *row, BLOCK* block);
487  void fix_fuzzy_space_list( //space explorer
488  WERD_RES_LIST &best_perm,
489  ROW *row,
490  BLOCK* block);
491  void fix_sp_fp_word(WERD_RES_IT &word_res_it, ROW *row, BLOCK* block);
492  void fix_fuzzy_spaces( //find fuzzy words
493  ETEXT_DESC *monitor, //progress monitor
494  inT32 word_count, //count of words in doc
495  PAGE_RES *page_res);
496  void dump_words(WERD_RES_LIST &perm, inT16 score,
497  inT16 mode, BOOL8 improved);
500  inT16 worst_noise_blob(WERD_RES *word_res, float *worst_noise_score);
501  float blob_noise_score(TBLOB *blob);
502  void break_noisiest_blob_word(WERD_RES_LIST &words);
504  GARBAGE_LEVEL garbage_word(WERD_RES *word, BOOL8 ok_dict_word);
506  GARBAGE_LEVEL garbage_level,
507  BOOL8 ok_dict_word);
508  void tilde_crunch(PAGE_RES_IT &page_res_it);
509  void unrej_good_quality_words( //unreject potential
510  PAGE_RES_IT &page_res_it);
511  void doc_and_block_rejection( //reject big chunks
512  PAGE_RES_IT &page_res_it,
513  BOOL8 good_quality_doc);
514  void quality_based_rejection(PAGE_RES_IT &page_res_it,
515  BOOL8 good_quality_doc);
516  void convert_bad_unlv_chs(WERD_RES *word_res);
517  void tilde_delete(PAGE_RES_IT &page_res_it);
518  inT16 word_blob_quality(WERD_RES *word, ROW *row);
519  void word_char_quality(WERD_RES *word, ROW *row, inT16 *match_count,
520  inT16 *accepted_match_count);
521  void unrej_good_chs(WERD_RES *word, ROW *row);
522  inT16 count_outline_errs(char c, inT16 outline_count);
524  BOOL8 terrible_word_crunch(WERD_RES *word, GARBAGE_LEVEL garbage_level);
525  CRUNCH_MODE word_deletable(WERD_RES *word, inT16 &delete_mode);
527  BOOL8 noise_outlines(TWERD *word);
529  void
531  PAGE_RES* page_res, // blocks to check
532  //function to call
533  TBOX & selection_box,
534  BOOL8 (tesseract::Tesseract::*word_processor) (BLOCK* block,
535  ROW* row,
536  WERD_RES* word_res));
538  void tess_add_doc_word( //test acceptability
539  WERD_CHOICE *word_choice //after context
540  );
541  void tess_segment_pass1(WERD_RES *word,
542  BLOB_CHOICE_LIST_CLIST *blob_choices);
543  void tess_segment_pass2(WERD_RES *word,
544  BLOB_CHOICE_LIST_CLIST *blob_choices);
545  BOOL8 tess_acceptable_word( //test acceptability
546  WERD_CHOICE *word_choice, //after context
547  WERD_CHOICE *raw_choice //before context
548  );
550  // Applies the box file based on the image name fname, and resegments
551  // the words in the block_list (page), with:
552  // blob-mode: one blob per line in the box file, words as input.
553  // word/line-mode: one blob per space-delimited unit after the #, and one word
554  // per line in the box file. (See comment above for box file format.)
555  // If find_segmentation is true, (word/line mode) then the classifier is used
556  // to re-segment words/lines to match the space-delimited truth string for
557  // each box. In this case, the input box may be for a word or even a whole
558  // text line, and the output words will contain multiple blobs corresponding
559  // to the space-delimited input string.
560  // With find_segmentation false, no classifier is needed, but the chopper
561  // can still be used to correctly segment touching characters with the help
562  // of the input boxes.
563  // In the returned PAGE_RES, the WERD_RES are setup as they would be returned
564  // from normal classification, ie. with a word, chopped_word, rebuild_word,
565  // seam_array, denorm, box_word, and best_state, but NO best_choice or
566  // raw_choice, as they would require a UNICHARSET, which we aim to avoid.
567  // Instead, the correct_text member of WERD_RES is set, and this may be later
568  // converted to a best_choice using CorrectClassifyWords. CorrectClassifyWords
569  // is not required before calling ApplyBoxTraining.
570  PAGE_RES* ApplyBoxes(const STRING& fname, bool find_segmentation,
571  BLOCK_LIST *block_list);
572 
573  // Builds a PAGE_RES from the block_list in the way required for ApplyBoxes:
574  // All fuzzy spaces are removed, and all the words are maximally chopped.
576  BLOCK_LIST *block_list);
577  // Tests the chopper by exhaustively running chop_one_blob.
578  // The word_res will contain filled chopped_word, seam_array, denorm,
579  // box_word and best_state for the maximally chopped word.
580  void MaximallyChopWord(const GenericVector<TBOX>& boxes,
581  BLOCK* block, ROW* row, WERD_RES* word_res);
582  // Gather consecutive blobs that match the given box into the best_state
583  // and corresponding correct_text.
584  // Fights over which box owns which blobs are settled by pre-chopping and
585  // applying the blobs to box or next_box with the least non-overlap.
586  // Returns false if the box was in error, which can only be caused by
587  // failing to find an appropriate blob for a box.
588  // This means that occasionally, blobs may be incorrectly segmented if the
589  // chopper fails to find a suitable chop point.
590  bool ResegmentCharBox(PAGE_RES* page_res, const TBOX *prev_box,
591  const TBOX& box, const TBOX& next_box,
592  const char* correct_text);
593  // Consume all source blobs that strongly overlap the given box,
594  // putting them into a new word, with the correct_text label.
595  // Fights over which box owns which blobs are settled by
596  // applying the blobs to box or next_box with the least non-overlap.
597  // Returns false if the box was in error, which can only be caused by
598  // failing to find an overlapping blob for a box.
599  bool ResegmentWordBox(BLOCK_LIST *block_list,
600  const TBOX& box, const TBOX& next_box,
601  const char* correct_text);
602  // Resegments the words by running the classifier in an attempt to find the
603  // correct segmentation that produces the required string.
604  void ReSegmentByClassification(PAGE_RES* page_res);
605  // Converts the space-delimited string of utf8 text to a vector of UNICHAR_ID.
606  // Returns false if an invalid UNICHAR_ID is encountered.
607  bool ConvertStringToUnichars(const char* utf8,
608  GenericVector<UNICHAR_ID>* class_ids);
609  // Resegments the word to achieve the target_text from the classifier.
610  // Returns false if the re-segmentation fails.
611  // Uses brute-force combination of up to kMaxGroupSize adjacent blobs, and
612  // applies a full search on the classifier results to find the best classified
613  // segmentation. As a compromise to obtain better recall, 1-1 ambiguity
614  // substitutions ARE used.
615  bool FindSegmentation(const GenericVector<UNICHAR_ID>& target_text,
616  WERD_RES* word_res);
617  // Recursive helper to find a match to the target_text (from text_index
618  // position) in the choices (from choices_pos position).
619  // Choices is an array of GenericVectors, of length choices_length, with each
620  // element representing a starting position in the word, and the
621  // GenericVector holding classification results for a sequence of consecutive
622  // blobs, with index 0 being a single blob, index 1 being 2 blobs etc.
624  int choices_pos, int choices_length,
625  const GenericVector<UNICHAR_ID>& target_text,
626  int text_index,
627  float rating, GenericVector<int>* segmentation,
628  float* best_rating, GenericVector<int>* best_segmentation);
629  // Counts up the labelled words and the blobs within.
630  // Deletes all unused or emptied words, counting the unused ones.
631  // Resets W_BOL and W_EOL flags correctly.
632  // Builds the rebuild_word and rebuilds the box_word.
633  void TidyUp(PAGE_RES* page_res);
634  // Logs a bad box by line in the box file and box coords.
635  void ReportFailedBox(int boxfile_lineno, TBOX box, const char *box_ch,
636  const char *err_msg);
637  // Creates a fake best_choice entry in each WERD_RES with the correct text.
638  void CorrectClassifyWords(PAGE_RES* page_res);
639  // Call LearnWord to extract features for labelled blobs within each word.
640  // Features are written to the given filename.
641  void ApplyBoxTraining(const STRING& filename, PAGE_RES* page_res);
642 
644  // Returns the number of misfit blob tops in this word.
645  int CountMisfitTops(WERD_RES *word_res);
646  // Returns a new x-height in pixels (original image coords) that is
647  // maximally compatible with the result in word_res.
648  // Returns 0.0f if no x-height is found that is better than the current
649  // estimate.
650  float ComputeCompatibleXheight(WERD_RES *word_res);
652  // TODO(ocr-team): Remove obsolete parameters.
654  "Take segmentation and labeling from box file");
656  "Conversion of word/line box file to char box file");
658  "Generate training data from boxed chars");
660  "Generate more boxes from boxed chars");
662  "Dump intermediate images made during page segmentation");
664  "Page seg mode: 0=osd only, 1=auto+osd, 2=auto, 3=col, 4=block,"
665  " 5=line, 6=word, 7=char"
666  " (Values from PageSegMode enum in publictypes.h)");
668  "Which OCR engine(s) to run (Tesseract, Cube, both). Defaults"
669  " to loading and running only Tesseract (no Cube, no combiner)."
670  " (Values from OcrEngineMode enum in tesseractclass.h)");
672  "Blacklist of chars not to recognize");
674  "Whitelist of chars to recognize");
676  "Perform training for ambiguities");
679  "Whether to use the top-line splitting process for Devanagari "
680  "documents while performing page-segmentation.");
683  "Whether to use the top-line splitting process for Devanagari "
684  "documents while performing ocr.");
686  "Write all parameters to the given file.");
688  "Adapt to words that contain "
689  " a character composed form fragments");
691  "Generate and print debug information for adaption");
692  INT_VAR_H(bidi_debug, 0, "Debug level for BiDi");
693  INT_VAR_H(applybox_debug, 1, "Debug level");
694  INT_VAR_H(applybox_page, 0, "Page number to apply boxes from");
696  "Exposure value follows this pattern in the image"
697  " filename. The name of the image files are expected"
698  " to be in the form [lang].[fontname].exp[num].tif");
700  "Learn both character fragments (as is done in the"
701  " special low exposure mode) as well as unfragmented"
702  " characters.");
704  "Each bounding box is assumed to contain ngrams. Only"
705  " learn the ngrams whose outlines overlap horizontally.");
706  BOOL_VAR_H(tessedit_display_outwords, false, "Draw output words");
707  BOOL_VAR_H(tessedit_training_tess, false, "Call Tess to learn blobs");
708  BOOL_VAR_H(tessedit_dump_choices, false, "Dump char choices");
710  "Try to improve fuzzy spaces");
712  "Dont bother with word plausibility");
713  BOOL_VAR_H(tessedit_fix_hyphens, true, "Crunch double hyphens?");
714  BOOL_VAR_H(tessedit_redo_xheight, true, "Check/Correct x-height");
716  "Add words to the document dictionary");
717  BOOL_VAR_H(tessedit_debug_fonts, false, "Output font info per char");
718  BOOL_VAR_H(tessedit_debug_block_rejection, false, "Block and Row stats");
720  "Enable correction based on the word bigram dictionary.");
721  INT_VAR_H(tessedit_bigram_debug, 0, "Amount of debug output for bigram "
722  "correction.");
723  INT_VAR_H(debug_x_ht_level, 0, "Reestimate debug");
724  BOOL_VAR_H(debug_acceptable_wds, false, "Dump word pass/fail chk");
725  STRING_VAR_H(chs_leading_punct, "('`\"", "Leading punctuation");
726  STRING_VAR_H(chs_trailing_punct1, ").,;:?!", "1st Trailing punctuation");
727  STRING_VAR_H(chs_trailing_punct2, ")'`\"", "2nd Trailing punctuation");
728  double_VAR_H(quality_rej_pc, 0.08, "good_quality_doc lte rejection limit");
729  double_VAR_H(quality_blob_pc, 0.0, "good_quality_doc gte good blobs limit");
731  "good_quality_doc lte outline error limit");
732  double_VAR_H(quality_char_pc, 0.95, "good_quality_doc gte good char limit");
733  INT_VAR_H(quality_min_initial_alphas_reqd, 2, "alphas in a good word");
735  "Use reject map to control Tesseract adaption");
737  "Adaptation decision algorithm for tess");
739  "Do minimal rejection on pass 1 output");
740  BOOL_VAR_H(tessedit_test_adaption, false, "Test adaption criteria");
741  BOOL_VAR_H(tessedit_matcher_log, false, "Log matcher activity");
743  "Adaptation decision algorithm for tess");
745  "Save the results of the recognition step"
746  " (blob_choices) within the corresponding WERD_CHOICE");
747  BOOL_VAR_H(test_pt, false, "Test for point");
748  double_VAR_H(test_pt_x, 99999.99, "xcoord");
749  double_VAR_H(test_pt_y, 99999.99, "ycoord");
750  INT_VAR_H(paragraph_debug_level, 0, "Print paragraph debug info.");
751  INT_VAR_H(cube_debug_level, 1, "Print cube debug info.");
752  STRING_VAR_H(outlines_odd, "%| ", "Non standard number of outlines");
753  STRING_VAR_H(outlines_2, "ij!?%\":;", "Non standard number of outlines");
755  "Allow outline errs in unrejection?");
757  "Reduce rejection on good docs");
758  BOOL_VAR_H(tessedit_use_reject_spaces, true, "Reject spaces?");
760  "%rej allowed before rej whole doc");
762  "%rej allowed before rej whole block");
764  "%rej allowed before rej whole row");
766  "Number of row rejects in whole word rejects"
767  "which prevents whole row rejection");
769  "Only rej partially rejected words in block rejection");
771  "Only rej partially rejected words in row rejection");
773  "Use word segmentation quality metric");
775  "Use word segmentation quality metric");
777  "Only preserve wds longer than this");
779  "Apply row rejection to good docs");
781  "rej good doc wd if more than this fraction rejected");
783  "Reject all bad quality wds");
786  "Output data to debug file");
787  BOOL_VAR_H(bland_unrej, false, "unrej potential with no chekcs");
// Thresholds and switches named crunch_*. NOTE(review): "crunch" itself is not
// defined anywhere in this listing; the help strings are the only description
// available here.
// NOTE(review): lines holding only a quoted string (listing lines 789, 791, 797,
// 808, 816, 819, 821, 825) are the tails of declarations whose opening line is
// absent from this listing.
789  "good_quality_doc gte good char limit");
791  "Mark v.bad words for tilde crunch");
792  BOOL_VAR_H(crunch_early_merge_tess_fails, true, "Before word crunch?");
793  BOOL_VAR_H(crunch_early_convert_bad_unlv_chs, false, "Take out ~^ early?");
794  double_VAR_H(crunch_terrible_rating, 80.0, "crunch rating lt this");
795  BOOL_VAR_H(crunch_terrible_garbage, true, "As it says");
797  "crunch garbage cert lt this");
798  double_VAR_H(crunch_poor_garbage_rate, 60, "crunch garbage rating lt this");
799  double_VAR_H(crunch_pot_poor_rate, 40, "POTENTIAL crunch rating lt this");
800  double_VAR_H(crunch_pot_poor_cert, -8.0, "POTENTIAL crunch cert lt this");
801  BOOL_VAR_H(crunch_pot_garbage, true, "POTENTIAL crunch garbage");
// NOTE(review): crunch_del_rating / crunch_del_cert repeat the "POTENTIAL crunch"
// help text of the crunch_pot_poor_* pair above - possibly copy-paste; confirm.
802  double_VAR_H(crunch_del_rating, 60, "POTENTIAL crunch rating lt this");
803  double_VAR_H(crunch_del_cert, -10.0, "POTENTIAL crunch cert lt this");
// Deletion limits expressed as multiples of "xht" (presumably x-height - confirm).
804  double_VAR_H(crunch_del_min_ht, 0.7, "Del if word ht lt xht x this");
805  double_VAR_H(crunch_del_max_ht, 3.0, "Del if word ht gt xht x this");
806  double_VAR_H(crunch_del_min_width, 3.0, "Del if word width lt xht x this");
808  "Del if word gt xht x this above bl");
809  double_VAR_H(crunch_del_low_word, 0.5, "Del if word gt xht x this below bl");
810  double_VAR_H(crunch_small_outlines_size, 0.6, "Small if lt xht x this");
811  INT_VAR_H(crunch_rating_max, 10, "For adj length in rating per ch");
812  INT_VAR_H(crunch_pot_indicators, 1, "How many potential indicators needed");
813  BOOL_VAR_H(crunch_leave_ok_strings, true, "Dont touch sensible strings");
814  BOOL_VAR_H(crunch_accept_ok, true, "Use acceptability in okstring");
816  "Dont pot crunch sensible strings");
817  BOOL_VAR_H(crunch_include_numerals, false, "Fiddle alpha figures");
// NOTE(review): the next two fragments have identical help text but belong to
// two separate declarations - confirm that both descriptions are intended.
819  "Dont crunch words with long lower case strings");
821  "Dont crunch words with long lower case strings");
822  INT_VAR_H(crunch_long_repetitions, 3, "Crunch words with long repetitions");
823  INT_VAR_H(crunch_debug, 0, "As it says");
825  "How many non-noise blbs either side?");
// Fix-space parameters: a small-outline size limit (help text: relative to "xht"),
// the joined-punctuation preference, the "done" mode for spacing and a debug level.
// Help-text spelling corrected ("punctuation", "constitutes"); names and values
// are unchanged.
826  double_VAR_H(fixsp_small_outlines_size, 0.28, "Small if lt xht x this");
827  BOOL_VAR_H(tessedit_prefer_joined_punct, false, "Reward punctuation joins");
828  INT_VAR_H(fixsp_done_mode, 1, "What constitutes done for spacing");
829  INT_VAR_H(debug_fix_space_level, 0, "Contextual fixspace debug");
// x-height change threshold and output-file switches (.unlv, hOCR .html).
// NOTE(review): lines holding only a quoted string (listing lines 831, 833, 836,
// 838, 842) are the tails of declarations whose opening line is absent from this
// listing, so their variable names and values are not visible here.
831  "Punct. chs expected WITHIN numbers");
833  "Max allowed deviation of blob top outside of font data");
834  INT_VAR_H(x_ht_min_change, 8, "Min change in xht before actually trying it");
836  "Write block separators in output");
838  "Write repetition char code");
839  BOOL_VAR_H(tessedit_write_unlv, false, "Write .unlv output file");
840  BOOL_VAR_H(tessedit_create_hocr, false, "Write .html hOCR output file");
842  "Output char for unidentified blobs");
// Suspect-marking limits (suspect_*) followed by the coarse rejection switches
// tessedit_minimal_rejection and tessedit_zero_rejection.
// NOTE(review): lines holding only a quoted string (listing lines 845, 847, 854,
// 856) are the tails of declarations whose opening line is absent from this listing.
843  INT_VAR_H(suspect_level, 99, "Suspect marker level");
845  "Min suspect level for rejecting spaces");
847  "Dont Suspect dict wds longer than this");
848  BOOL_VAR_H(suspect_constrain_1Il, false, "UNLV keep 1Il chars rejected");
849  double_VAR_H(suspect_rating_per_ch, 999.9, "Dont touch bad rating limit");
850  double_VAR_H(suspect_accept_rating, -999.9, "Accept good rating limit");
851  BOOL_VAR_H(tessedit_minimal_rejection, false, "Only reject tess failures");
852  BOOL_VAR_H(tessedit_zero_rejection, false, "Dont reject ANYTHING");
854  "Make output have exactly one word per WERD");
856  "Dont reject ANYTHING AT ALL");
// Rejection / acceptance algorithm selectors and the individual rej_* controls.
// NOTE(review): lines holding only a quoted string (listing lines 863, 865, 877)
// are the tails of declarations whose opening line is absent from this listing.
857  BOOL_VAR_H(tessedit_consistent_reps, true, "Force all rep chars the same");
858  INT_VAR_H(tessedit_reject_mode, 0, "Rejection algorithm");
859  INT_VAR_H(tessedit_ok_mode, 5, "Acceptance decision algorithm");
// NOTE(review): the help text says "Adaption debug" while the variable is named
// tessedit_rejection_debug - confirm which is intended.
860  BOOL_VAR_H(tessedit_rejection_debug, false, "Adaption debug");
861  BOOL_VAR_H(tessedit_flip_0O, true, "Contextual 0O O0 flips");
863  "Aspect ratio dot/hyphen test");
865  "Aspect ratio dot/hyphen test");
// NOTE(review): "11l" in the help text below is spelled "1Il" in the neighbouring
// variable names (rej_1Il_*) - possibly a typo; confirm.
866  BOOL_VAR_H(rej_trust_doc_dawg, false, "Use DOC dawg in 11l conf. detector");
867  BOOL_VAR_H(rej_1Il_use_dict_word, false, "Use dictword test");
868  BOOL_VAR_H(rej_1Il_trust_permuter_type, true, "Dont double check");
869  BOOL_VAR_H(rej_use_tess_accepted, true, "Individual rejection control");
870  BOOL_VAR_H(rej_use_tess_blanks, true, "Individual rejection control");
871  BOOL_VAR_H(rej_use_good_perm, true, "Individual rejection control");
872  BOOL_VAR_H(rej_use_sensible_wd, false, "Extend permuter check");
873  BOOL_VAR_H(rej_alphas_in_number_perm, false, "Extend permuter check");
875  INT_VAR_H(tessedit_image_border, 2, "Rej blbs near image edge limit");
877  "Allow NN to unrej");
// Miscellaneous parameters: the I/l/1 conflict set, the minimum sane x-height in
// pixels, box-file and image output switches, interactive mode, the file-name
// extension and the permuter override.
// NOTE(review): lines holding only a quoted string (listing lines 882, 888, 890,
// 894) are the tails of declarations whose opening line is absent from this listing.
878  STRING_VAR_H(conflict_set_I_l_1, "Il1[]", "Il1 conflict set");
879  INT_VAR_H(min_sane_x_ht_pixels, 8, "Reject any x-ht lt or eq than this");
880  BOOL_VAR_H(tessedit_create_boxfile, false, "Output text with boxes");
// NOTE(review): "specifc" in the help text below is a misspelling of "specific".
882  "-1 -> All pages, else specifc page to process");
883  BOOL_VAR_H(tessedit_write_images, false, "Capture the image from the IPE");
884  BOOL_VAR_H(interactive_display_mode, false, "Run interactively?");
885  STRING_VAR_H(file_type, ".tif", "Filename extension");
886  BOOL_VAR_H(tessedit_override_permuter, true, "According to dict_word");
888  "Debug level for TessdataManager functions.");
890  "List of languages to load with this one");
891  // Min acceptable orientation margin (difference in scores between top and 2nd
892  // choice in OSResults::orientations) to believe the page orientation.
894  "Min acceptable orientation margin");
895  BOOL_VAR_H(textord_tabfind_show_vlines, false, "Debug line finding");
// Boolean parameter textord_use_cjk_fp_model. The value argument is written with
// the C++ keyword `false`, matching the other BOOL_VAR_H declarations here,
// instead of the FALSE macro.
896  BOOL_VAR_H(textord_use_cjk_fp_model, false, "Use CJK fixed pitch model");
// NOTE(review): listing lines 898-899 are the help-string tail of a declaration
// whose opening line (897) is absent from this listing; the two literals are
// concatenated into a single help text.
898  "Only initialize with the config file. Useful if the instance is "
899  "not going to be used for OCR but say only for layout analysis.");
// Boolean switch for the equation detector; value argument is false.
900  BOOL_VAR_H(textord_equation_detect, false, "Turn on equation detector");
901 
// Declaration only; the definition is not part of this listing.
// Takes a file name (fname) and returns a FILE*.
// NOTE(review): presumably the returned stream is the output_file later handed to
// recog_training_segmented - confirm, and confirm who is responsible for closing it.
903  FILE *init_recog_training(const STRING &fname);
// Declaration only; the definition is not part of this listing.
// Parameters: fname - a file name; page_res - page results; monitor - a volatile
// ETEXT_DESC pointer; output_file - an already-open FILE*.
// NOTE(review): how each argument is used (read vs. modified, whether monitor may
// be NULL) cannot be determined from this header - confirm against the definition.
904  void recog_training_segmented(const STRING &fname,
905  PAGE_RES *page_res,
906  volatile ETEXT_DESC *monitor,
907  FILE *output_file);
// Declaration only; the definition is not part of this listing.
// Parameters: werd_res / row_res / block_res - word, row and block result
// objects; label - a C string; output_file - an already-open FILE*.
// NOTE(review): going by the name, this presumably classifies one word and writes
// the outcome to output_file - confirm against the definition.
908  void ambigs_classify_and_output(WERD_RES *werd_res,
909  ROW_RES *row_res,
910  BLOCK_RES *block_res,
911  const char *label,
912  FILE *output_file);
913 
// Hands back the cube recognition context pointer stored in cube_cntxt_,
// exactly as held (no null check, no copy).
914  inline CubeRecoContext* GetCubeRecoContext() {
       return this->cube_cntxt_;
     }
915 
916  private:
// Private instance state. Declaration order is left untouched because it fixes
// member layout and initialisation order.
917  // The filename of a backup config file. If not null, then we currently
918  // have a temporary debug config file loaded, and backup_config_file_
919  // will be loaded, and set to null when debug is complete.
920  const char* backup_config_file_;
921  // The filename of a config file to read when processing a debug word.
922  STRING word_config_;
923  // Image used for input to layout analysis and tesseract recognition.
924  // May be modified by the ShiroRekhaSplitter to eliminate the top-line.
925  Pix* pix_binary_;
926  // Unmodified image used for input to cube. Always valid.
927  Pix* cube_binary_;
928  // Grey-level input image if the input was not binary, otherwise NULL.
929  Pix* pix_grey_;
930  // Input image resolution after any scaling. The resolution is not well
931  // transmitted by operations on Pix, so we keep an independent record here.
932  int source_resolution_;
933  // The shiro-rekha splitter object which is used to split top-lines in
934  // Devanagari words to provide a better word and grapheme segmentation.
935  ShiroRekhaSplitter splitter_;
936  // Page segmentation/layout
937  Textord textord_;
938  // True if the primary language uses right_to_left reading order.
939  bool right_to_left_;
// NOTE(review): the next five members have no description in this header.
// Presumably scaled_color_ / scaled_factor_ are a scaled colour image and its
// scale factor, deskew_ / reskew_ are a rotation vector and its inverse, and
// stats_ holds run statistics - confirm against the code that sets them.
940  Pix* scaled_color_;
941  int scaled_factor_;
942  FCOORD deskew_;
943  FCOORD reskew_;
944  TesseractStats stats_;
945  // Sub-languages to be tried in addition to this.
946  GenericVector<Tesseract*> sub_langs_;
947  // Most recently used Tesseract out of this and sub_langs_. The default
948  // language for the next word.
949  Tesseract* most_recently_used_;
950  // The size of the font table, ie max possible font id + 1.
951  int font_table_size_;
952  // Cube objects.
// NOTE(review): ownership of cube_cntxt_ and tess_cube_combiner_ is not stated
// here (unlike equ_detect_ below) - confirm who deletes them.
953  CubeRecoContext* cube_cntxt_;
954  TesseractCubeCombiner *tess_cube_combiner_;
955  // Equation detector. Note: this pointer is NOT owned by the class.
956  EquationDetect* equ_detect_;
957 };
958 
959 } // namespace tesseract
960 
961 
962 #endif // TESSERACT_CCMAIN_TESSERACTCLASS_H__