Tesseract  3.02
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
permute.cpp
Go to the documentation of this file.
1 /* -*-C-*-
2  ********************************************************************************
3  *
4  * File: permute.c (Formerly permute.c)
5  * Description: Choose OCR text given character-probability maps
6  * for sequences of glyph fragments and a dictionary provided as
7  * a Dual Acyclic Word Graph.
8  * In this file, "permute" should be read "combine."
9  * Author: Mark Seaman, OCR Technology
10  * Created: Fri Sep 22 14:05:51 1989
11  * Modified: Thu Jan 3 16:38:46 1991 (Mark Seaman) marks@hpgrlt
12  * Language: C
13  * Package: N/A
14  * Status: Experimental (Do Not Distribute)
15  *
16  * (c) Copyright 1989, Hewlett-Packard Company.
17  ** Licensed under the Apache License, Version 2.0 (the "License");
18  ** you may not use this file except in compliance with the License.
19  ** You may obtain a copy of the License at
20  ** http://www.apache.org/licenses/LICENSE-2.0
21  ** Unless required by applicable law or agreed to in writing, software
22  ** distributed under the License is distributed on an "AS IS" BASIS,
23  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
24  ** See the License for the specific language governing permissions and
25  ** limitations under the License.
26  *
27  *********************************************************************************/
28 /*----------------------------------------------------------------------
29  I n c l u d e s
30 ---------------------------------------------------------------------*/
31 
32 #ifdef _MSC_VER
33 #pragma warning(disable:4244) // Conversion warnings
34 #pragma warning(disable:4800) // int/bool warnings
35 #endif
36 
37 #include <assert.h>
38 #include <math.h>
39 
40 #include "const.h"
41 
42 #include "permute.h"
43 
44 #include "callcpp.h"
45 #include "ccutil.h"
46 #include "dict.h"
47 #include "freelist.h"
48 #include "helpers.h"
49 #include "image.h"
50 #include "globals.h"
51 #include "ndminx.h"
52 #include "ratngs.h"
53 #include "stopper.h"
54 #include "tprintf.h"
55 #include "trie.h"
56 #include "params.h"
57 #include "unicharset.h"
58 
59 
60 namespace tesseract {
61 
62 /*----------------------------------------------------------------------
63  F u n c t i o n s
64 ----------------------------------------------------------------------*/
65 
75  WERD_CHOICE *choice2) {
76  if (!choice1) return choice2;
77  if (!choice2) return choice1;
78  if (choice1->rating() < choice2->rating() || choice2->length() == 0) {
79  delete choice2;
80  return choice1;
81  } else {
82  delete choice1;
83  return choice2;
84  }
85 }
86 
91 BLOB_CHOICE* get_nth_choice(BLOB_CHOICE_LIST* blob_list, int n) {
92  BLOB_CHOICE_IT c_it(blob_list);
93  while (n-- > 0 && !c_it.at_last())
94  c_it.forward();
95  return c_it.data();
96 }
97 
99 UNICHAR_ID get_top_choice_uid(BLOB_CHOICE_LIST *blob_list) {
100  if (!blob_list) return INVALID_UNICHAR_ID;
101  BLOB_CHOICE_IT blob_choice_it(blob_list);
102  return (blob_choice_it.data()) ? blob_choice_it.data()->unichar_id()
103  : INVALID_UNICHAR_ID;
104 }
105 
110 int find_choice_by_uid(BLOB_CHOICE_LIST *blob_list, UNICHAR_ID target_uid) {
111  BLOB_CHOICE_IT c_it(blob_list);
112  int pos = 0;
113  while (1) {
114  if (c_it.data()->unichar_id() == target_uid) return pos;
115  if (c_it.at_last()) break;
116  c_it.forward();
117  pos++;
118  }
119  return -1;
120 }
121 
130  const BLOB_CHOICE_LIST_VECTOR &char_choices,
131  int start_pos,
132  const char* pos_str,
133  float *certainties) {
134  int pos_str_len = strlen(pos_str);
135  WERD_CHOICE* wchoice = new WERD_CHOICE(unicharset);
136  if (start_pos + pos_str_len > char_choices.length()) {
137  wchoice->make_bad();
138  return wchoice;
139  }
140  for (int x = 0; x < pos_str_len; x++) {
141  int pos = pos_str[x]-'0';
142  if (pos < 0) pos = 0; // use the top choice by default, eg. '.'
143  if (pos >= 10)
144  tprintf("PosStr[%d](%d)=%c %d\n", x, pos_str_len, pos_str[x], pos);
145  ASSERT_HOST(pos < 10);
146  BLOB_CHOICE* blob_it = get_nth_choice(char_choices.get(start_pos+x), pos);
147  wchoice->set_permuter(NO_PERM);
148  wchoice->append_unichar_id(blob_it->unichar_id(), 1,
149  blob_it->rating(),
150  blob_it->certainty());
151  if (certainties != NULL) certainties[x] = blob_it->certainty();
152  }
153  return wchoice;
154 }
155 
162  WERD_CHOICE* word_choice,
163  int start_pos,
164  char* pos_str) {
165  for (int i = 0; i < word_choice->length(); i++) {
166  UNICHAR_ID target_id = word_choice->unichar_id(i);
167  BLOB_CHOICE_LIST* blob_choice_list = char_choices.get(start_pos + i);
168  int pos = find_choice_by_uid(blob_choice_list, target_id);
169  if (pos < 0) pos = 0;
170  pos_str[i] = pos + '0';
171  }
172  pos_str[word_choice->length()] = '\0';
173 }
174 
182  BLOB_CHOICE_LIST *blob_choices,
183  char target_type,
184  const UNICHARSET &unicharset) {
185  BLOB_CHOICE_IT c_it(blob_choices);
186  for (c_it.mark_cycle_pt(); !c_it.cycled_list(); c_it.forward()) {
187  if (c_it.data() &&
188  unicharset.get_chartype(c_it.data()->unichar_id()) == target_type)
189  return c_it.data();
190  }
191  return NULL;
192 }
193 
207  BLOB_CHOICE_LIST *blob_choices,
208  int target_sid,
209  int backup_sid,
210  int secondary_sid) {
211  BLOB_CHOICE_IT c_it(blob_choices);
212  for (c_it.mark_cycle_pt(); !c_it.cycled_list(); c_it.forward()) {
213  bool found = false;
214  if (c_it.data()->script_id() == 0) continue;
215  if (c_it.data()->script_id() == target_sid) found = true;
216  if (backup_sid > 0 && c_it.data()->script_id() == backup_sid) found = true;
217  if (found) return c_it.data();
218  }
219  if (secondary_sid > 0) {
220  c_it.set_to_list(blob_choices);
221  for (c_it.mark_cycle_pt(); !c_it.cycled_list(); c_it.forward()) {
222  if (c_it.data()->script_id() == 0) continue;
223  if (c_it.data()->script_id() == secondary_sid)
224  return c_it.data();
225  }
226  }
227  return NULL;
228 }
229 
230 
232  unicharset_ = NULL;
233  char_choices_ = NULL;
234  adjust_factor_ = 1.0f;
235  allow_collision_ = false;
236  word_length_ = 0;
237  debug_ = false;
238 }
239 
241  const UNICHARSET& unicharset,
242  float default_bias,
243  bool debug) {
244  ASSERT_HOST(char_choices.length() < MAX_PERM_LENGTH);
245  unicharset_ = &unicharset;
246  char_choices_ = &char_choices;
247  word_length_ = char_choices.length();
248  for (int i = 0; i < word_length_; ++i)
249  perm_state_[i] = kPosFree;
250  perm_state_[word_length_] = '\0';
251  // Skip fragment choice and the mark positions so they remain unchanged.
252  for (int i = 0; i < word_length_; ++i) {
253  UNICHAR_ID unichar_id = get_top_choice_uid(char_choices.get(i));
254  if (unicharset.get_fragment(unichar_id) != NULL)
255  perm_state_[i] = '1';
256  }
257  adjust_factor_ = default_bias;
258  allow_collision_ = false;
259  debug_ = debug;
260 }
261 
262 // Promote char positions specified in pos_str with given weight.
263 // For example, AddPreference(5, "234", 0.95f)
264 // would promote the 3rd, 4th and 5th choice for character 5, 6, 7
265 // (starting at 0) in the word with a rating adjustment of 0.95.
266 void PermuterState::AddPreference(int start_pos, char* pos_str, float weight) {
267  ASSERT_HOST(char_choices_ != NULL);
268  ASSERT_HOST(start_pos + strlen(pos_str) - 1 < word_length_);
269  if (debug_) {
270  tprintf("Copy over %s -> %s @ %d ", pos_str, perm_state_, start_pos);
271  }
272  // copy over preferred position without the terminating null
273  if (!allow_collision_) {
274  int len = strlen(pos_str);
275  for (int i = 0; i < len; ++i)
276  if (position_marked(start_pos + i)) return;
277  }
278  strncpy(&perm_state_[start_pos], pos_str, strlen(pos_str));
279  adjust_factor_ *= weight;
280  if (debug_) tprintf("==> %s %f\n", perm_state_, adjust_factor_);
281 }
282 
283 // Promote the input blob_choice at specified position with given weight.
284 void PermuterState::AddPreference(int char_pos, BLOB_CHOICE* blob_choice,
285  float weight) {
286  ASSERT_HOST(char_choices_ != NULL);
287  ASSERT_HOST(char_pos < word_length_);
288  // avoid collision (but this doesn't work if the first choice is favored!
289  if (!allow_collision_ && position_marked(char_pos)) return;
290 
291  if (debug_) {
292  tprintf("Set UID %d -> %s @ %d ",
293  blob_choice->unichar_id(), perm_state_, char_pos);
294  }
295  int pos = find_choice_by_uid(char_choices_->get(char_pos),
296  blob_choice->unichar_id());
297  perm_state_[char_pos] = pos + '0';
298  adjust_factor_ *= weight;
299  if (debug_) tprintf("==> %s %f\n", perm_state_, adjust_factor_);
300 }
301 
302 // Get the best word permutation so far. Caller should destroy WERD_CHOICE.
304  float *adjust_factor) {
305  ASSERT_HOST(char_choices_ != NULL);
306  WERD_CHOICE *word_choice = get_choice_from_posstr(
307  unicharset_, *char_choices_, 0, perm_state_, certainties);
308  float rating = word_choice->rating() * adjust_factor_;
309  word_choice->set_rating(rating);
310  *adjust_factor = adjust_factor_;
311  return word_choice;
312 }
313 
314 
315 /**********************************************************************
316  * permute_all
317  *
318  * Permute all the characters together using all of the different types
319  * of permuters/selectors available. Each of the characters must have
320  * a non-NULL choice list.
321  *
322  * Note: order of applying permuters does matter, since the latter
323  * permuter will be recorded if the resulting word ratings are the same.
324  *
325  * Note: the function assumes that best_choice and raw_choice are not NULL.
326  *
327  * Note: Since permuter_all maybe called recursively (through permuter_
328  * compound_words), there must be a separate instance of permuter_state
329  * for each invocation.
330  **********************************************************************/
332  const WERD_CHOICE *best_choice,
333  WERD_CHOICE *raw_choice) {
334  WERD_CHOICE *result1 = NULL;
335  WERD_CHOICE *result2 = NULL;
336  BOOL8 any_alpha;
337  float top_choice_rating_limit = best_choice->rating();
338  int word_script_id = get_top_word_script(char_choices, getUnicharset());
339 
340  PermuterState permuter_state;
341  if (getUnicharset().han_sid() != getUnicharset().null_sid() &&
342  word_script_id == getUnicharset().han_sid()) {
343  permuter_state.Init(char_choices, getUnicharset(), 1.0f, permute_debug);
344 
345  result1 = get_top_choice_word(char_choices);
346 
347  // Note that we no longer need the returned word from these permuters,
348  // except to delete the memory. The word choice from all permutations
349  // is returned by permuter_state.GetpermutedWord() at the end.
351  result2 = permute_fixed_length_words(char_choices, &permuter_state);
352  delete result2;
353  }
354  if (permute_chartype_word) {
355  result2 = permute_chartype_words(char_choices, &permuter_state);
356  delete result2;
357  }
358  if (permute_script_word) {
359  result2 = permute_script_words(char_choices, &permuter_state);
360  delete result2;
361  }
362 
363  float certainties[MAX_PERM_LENGTH];
364  float adjust_factor;
365  result2 = permuter_state.GetPermutedWord(certainties, &adjust_factor);
366  LogNewChoice(adjust_factor, certainties, false, result2, char_choices);
367  result1 = get_best_delete_other(result1, result2);
368 
370  } else {
371  result1 = permute_top_choice(char_choices, &top_choice_rating_limit,
372  raw_choice, &any_alpha);
373  if (result1 == NULL)
374  return (NULL);
375  if (permute_only_top)
376  return result1;
377 
378  if (permute_chartype_word) {
379  permuter_state.Init(char_choices, getUnicharset(),
381  result2 = permute_chartype_words(char_choices, &permuter_state);
382  result1 = get_best_delete_other(result1, result2);
383  }
384 
385  // Permute character fragments if necessary.
386  if (result1 == NULL || result1->fragment_mark()) {
387  result2 = top_fragments_permute_and_select(char_choices,
388  top_choice_rating_limit);
389  result1 = get_best_delete_other(result1, result2);
390  }
391 
392  result2 = dawg_permute_and_select(char_choices, best_choice->rating());
393  result1 = get_best_delete_other(result1, result2);
394 
395  result2 = permute_compound_words(char_choices, best_choice->rating());
396  result1 = get_best_delete_other(result1, result2);
397  }
398  return result1;
399 }
400 
410  if (!word || wordseg_rating_adjust_factor_ <= 0) return;
411 
412  float old_rating = word->rating();
413  float new_rating = old_rating * wordseg_rating_adjust_factor_;
414  word->set_rating(new_rating);
415  if (permute_debug)
416  tprintf("Permute segadjust %f * %f --> %f\n",
417  old_rating, wordseg_rating_adjust_factor_, new_rating);
418 }
419 
420 
431  const BLOB_CHOICE_LIST_VECTOR &char_choices,
432  PermuterState *permuter_state) {
433  if (permute_debug)
434  print_char_choices_list("\n\nPermute FixedLength Word",
435  char_choices, getUnicharset(), false);
436  WERD_CHOICE* best_choice =
437  new WERD_CHOICE(&getUnicharset(), char_choices.length());
438  const int max_dict_len = max_fixed_length_dawgs_wdlen_;
439  const int min_dict_len = 2;
440  char posstr[256];
441  int match_score = 0;
442  int anchor_pos = 0;
443  while (anchor_pos < char_choices.length()) {
444  // search from longest phrase to shortest, stop when we find a match
445  WERD_CHOICE* part_choice = NULL;
446  int step = max_dict_len;
447  while (step >= min_dict_len) {
448  int end_pos = anchor_pos + step - 1;
449  if (end_pos < char_choices.length()) {
450  part_choice = dawg_permute_and_select(char_choices,
451  200.0, // rate limit
452  step,
453  anchor_pos);
454  if (part_choice->length() == step) {
455  if (permute_debug)
456  tprintf("match found at pos=%d len=%d\n%s\n", anchor_pos, step,
457  part_choice->unichar_string().string());
458  break;
459  }
460  delete part_choice;
461  part_choice = NULL;
462  }
463  step--;
464  }
465 
466  if (part_choice && step > 1) { // found lexicon match
467  get_posstr_from_choice(char_choices, part_choice, anchor_pos, posstr);
468  float adjust_factor = pow(0.95, 1.0 + step*2.0/char_choices.length());
469  if (permuter_state)
470  permuter_state->AddPreference(anchor_pos, posstr, adjust_factor);
471  match_score += step - 1; // single chars don't count
472  if (permute_debug)
473  tprintf("Promote word rating %d-len%d\n%s\n", anchor_pos, step,
474  part_choice->unichar_string().string());
475  } else { // no lexicon match
476  step = 1;
477  part_choice = get_choice_from_posstr(&getUnicharset(), char_choices,
478  anchor_pos, "0", NULL);
479  if (permute_debug)
480  tprintf("Single char %d %s\n", anchor_pos,
481  part_choice->unichar_string().string());
482  }
483  if (part_choice && part_choice->length() > 0)
484  (*best_choice) += (*part_choice);
485  if (part_choice) delete part_choice;
486  anchor_pos += step;
487  }
488 
489  if (match_score > 0) {
490  float adjust_factor = pow(0.8, // 1.0/segment_penalty_dict_nonword,
491  match_score * 2.0 / char_choices.length());
492  float adjusted_score = best_choice->rating() * adjust_factor;
493  if (permute_debug)
494  tprintf("Adjusting score %f @ %d -> %f\n",
495  best_choice->rating(), match_score, adjusted_score);
496  best_choice->set_rating(adjusted_score);
497  }
498  if (permute_debug)
499  tprintf("Found Best CJK word %f: %s\n",
500  best_choice->rating(), best_choice->unichar_string().string());
501  return best_choice;
502 }
503 
504 
505 /**********************************************************************
506  * Returns the dominant chartype for the word. Only the "main" chartype
507  * of each character is used, and a consistent chartype is defined by
508  * the majority chartype from non-ambiguous character positions.
509  * If pos_chartypes is not NULL, then the "main" chartype at each pos
510  * is also returned. The caller must allocate and deallocate the space.
511  **********************************************************************/
513  char* pos_chartypes) {
514  const UNICHARSET &unicharset = getUnicharset();
515  const int hist_size = 128; // to contain 'A','a','0','x','p'
516  int chprop[hist_size];
517  int x;
518  for (x = 0; x < hist_size; x++) chprop[x] = 0;
519  for (x = 0; x < char_choices.length(); ++x) {
520  UNICHAR_ID unichar_id = get_top_choice_uid(char_choices.get(x));
521  char ctype = unicharset.get_chartype(unichar_id);
522  if (pos_chartypes) pos_chartypes[x] = ctype;
523  if (ctype == 0 || ctype == 'p') continue;
524  if (getUnicharAmbigs().OneToOneDefiniteAmbigs(unichar_id) != NULL) continue;
525  chprop[ctype]++;
526  if (x == 0 && ctype == 'A') // first-cap also counts as lower
527  chprop['a']++;
528  }
529  int max_prop = 0;
530  for (x = 1; x < hist_size; x++)
531  if (chprop[x] >= chprop[max_prop]) max_prop = x;
532  return (chprop[max_prop] > 0) ? max_prop : 0;
533 }
534 
535 /**********************************************************************
536  * Promote consistent character type based on neighboring characters.
537  * For each character position, if the top choice property is inconsistent
538  * with that of the word or previous character, then its likely
539  * subsitutions, as defined by DangAmbigs, will be examined and the one
540  * with a matching property will be selected.
541  **********************************************************************/
543  const BLOB_CHOICE_LIST_VECTOR &char_choices,
544  PermuterState *permuter_state) {
545 
546  if (char_choices.length() >= MAX_PERM_LENGTH)
547  return NULL;
548  // Store main character property of top choice at every position
549  char pos_chartypes[MAX_PERM_LENGTH];
550  char word_type = top_word_chartype(char_choices, pos_chartypes);
551  if (word_type == 0 || word_type == 'p')
552  return NULL; // skip if word type is punctuation or unknown
553  if (permute_debug) {
554  tprintf("\n\nPermuteCharType[%c]\n", word_type);
555  print_char_choices_list("", char_choices, getUnicharset(), true);
556  }
557 
558  WERD_CHOICE *current_word = new WERD_CHOICE(&getUnicharset());
559  BLOB_CHOICE_IT blob_choice_it;
560  const UNICHARSET& unicharset = getUnicharset();
561  bool replaced = false; // has any character choice been replaced
562  int prev_unambig_type = 0; // the last chartype of an unambiguous char
563  float certainties[MAX_PERM_LENGTH + 1];
564  for (int x = 0; x < char_choices.length(); ++x) {
565  BLOB_CHOICE_LIST* pos_choice = char_choices.get(x);
566  UNICHAR_ID unichar_id = get_top_choice_uid(pos_choice);
567  if (unichar_id == 0) {
568  delete current_word;
569  return NULL;
570  }
571  blob_choice_it.set_to_list(pos_choice);
572  BLOB_CHOICE *first_choice = blob_choice_it.data();
573  ASSERT_HOST(first_choice != NULL);
574 
575  const UnicharIdVector* ambig_uids =
577  bool is_ambiguous = (ambig_uids != NULL);
578  bool is_punct = unicharset.get_ispunctuation(unichar_id);
579  bool is_consistent = is_punct ||
580  unicharset.get_chartype(unichar_id) == prev_unambig_type ||
581  unicharset.get_chartype(unichar_id) == word_type;
582  bool is_fragment = getUnicharset().get_fragment(unichar_id) != NULL;
583  if (permute_debug)
584  tprintf("char[%d]:%s is_ambig %c is_punct %c is_consistent %c\n",
585  x, unicharset.id_to_unichar(unichar_id),
586  is_ambiguous?'T':'F', is_punct?'T':'F', is_consistent?'T':'F');
587 
588  if (is_fragment) {
589  // Ignore any fragmented characters by skipping them to next choice
590  // (originally first choice).
591  first_choice = get_nth_choice(pos_choice, 1);
592  ASSERT_HOST(first_choice != NULL);
593  } else if (is_ambiguous && !is_consistent) {
594  // Check every confusable blob choice where the top choice is inconsistent
595  // with the character type of the previous unambiguous character.
596  if (permute_debug) {
597  tprintf("Checking %s r%g PrevCharType %c\n",
598  unicharset.id_to_unichar(unichar_id),
599  first_choice->rating(), prev_unambig_type);
600  print_ratings_list("\t", pos_choice, getUnicharset());
601  }
602  BLOB_CHOICE* c_it = NULL;
603  if (c_it == NULL) {
604  c_it = find_choice_by_type(pos_choice, word_type, unicharset);
605  }
606 
607  // Prefer a character choice whose type is the same as the previous
608  // unambiguous character and the confusion appears in the ambig list.
609  if (c_it == NULL && prev_unambig_type > 0) {
610  c_it = find_choice_by_type(pos_choice, prev_unambig_type, unicharset);
611  if (c_it &&
612  UnicharIdArrayUtils::find_in(*ambig_uids, c_it->unichar_id()) < 0)
613  c_it = NULL;
614  }
615 
616  // Otherwise, perfer a punctuation
617  if (c_it == NULL) {
618  c_it = find_choice_by_type(pos_choice, 'p', unicharset);
619  if (c_it &&
620  UnicharIdArrayUtils::find_in(*ambig_uids, c_it->unichar_id()) < 0)
621  c_it = NULL;
622  }
623 
624  // save any preference other than the top choice
625  if (c_it != NULL) {
626  if (permute_debug) {
627  tprintf("Replacing %s r%g ==> %s r%g\n",
628  unicharset.id_to_unichar(unichar_id), first_choice->rating(),
629  unicharset.id_to_unichar(c_it->unichar_id()), c_it->rating());
630  tprintf("\n\nPermuteCharType[%c]\n", word_type);
631  print_char_choices_list("", char_choices, getUnicharset(), false);
632  }
633  if (permuter_state)
634  permuter_state->AddPreference(x, c_it, segment_reward_chartype);
635  first_choice = c_it;
636  replaced = true;
637  }
638  } else if (!is_ambiguous && !is_punct) {
639  // keep the last unambiguous character type
640  prev_unambig_type = pos_chartypes[x];
641  }
642  current_word->append_unichar_id(first_choice->unichar_id(), 1,
643  first_choice->rating(),
644  first_choice->certainty());
645  certainties[x] = first_choice->certainty();
646  }
647  // All permuter choices should go through adjust_non_word so the choice
648  // rating would be adjusted on the same scale.
649  adjust_non_word(current_word, certainties, &char_choices, permute_debug);
650  if (replaced) {
651  // Apply a reward multiplier on rating if an chartype permutation is made.
652  float rating = current_word->rating();
653  current_word->set_rating(rating * segment_reward_chartype);
654  if (permute_debug)
655  current_word->print("<== permute_chartype_word **");
656  }
657  return current_word;
658 }
659 
670  const BLOB_CHOICE_LIST_VECTOR &char_choices,
671  PermuterState *permuter_state) {
672  if (char_choices.length() >= MAX_WERD_LENGTH)
673  return NULL;
674 
675  int word_sid = get_top_word_script(char_choices, getUnicharset());
676  if (word_sid == getUnicharset().null_sid())
677  return NULL;
678 
679  if (permute_debug) {
680  tprintf("\n\nPermuteScript %s\n",
681  getUnicharset().get_script_from_script_id(word_sid));
682  print_char_choices_list("", char_choices, getUnicharset(),
683  permute_debug > 1);
684  }
685 
686  WERD_CHOICE *current_word = new WERD_CHOICE(&getUnicharset());
687  BLOB_CHOICE_IT blob_choice_it;
688  bool replaced = false;
689  bool prev_is_consistent = false;
690  float certainties[MAX_PERM_LENGTH + 1];
691  for (int x = 0; x < char_choices.length(); ++x) {
692  blob_choice_it.set_to_list(char_choices.get(x));
693  BLOB_CHOICE *first_choice = blob_choice_it.data();
694  if (!first_choice) {
695  delete current_word;
696  return NULL;
697  }
698  UNICHAR_ID unichar_id = first_choice->unichar_id();
699  if (unichar_id == 0) {
700  delete current_word;
701  return NULL;
702  }
703  bool sid_consistent = (getUnicharset().get_script(unichar_id) == word_sid);
704  bool this_is_punct = getUnicharset().get_chartype(unichar_id) == 'p';
705  bool is_fragment = getUnicharset().get_fragment(unichar_id) != NULL;
706 
707  if (is_fragment) {
708  // Ignore any fragmented characters by skipping them to next choice
709  // (originally first choice).
710  first_choice = get_nth_choice(char_choices.get(x), 1);
711  ASSERT_HOST(first_choice != NULL);
712  } else if (!sid_consistent && !this_is_punct && prev_is_consistent) {
713  // If the previous char is CJK, we prefer a cjk over non-cjk char
714  if (permute_debug) {
715  tprintf("Checking %s r%g\n", getUnicharset().id_to_unichar(unichar_id),
716  first_choice->rating());
717  print_ratings_list("\t", char_choices.get(x), getUnicharset());
718  }
719  // prefer a script consistent choice
720  BLOB_CHOICE* c_it = find_choice_by_script(char_choices.get(x),
721  word_sid, 0, 0);
722  // otherwise, prefer a punctuation
723  if (c_it == NULL)
724  c_it = find_choice_by_type(char_choices.get(x), 'p', getUnicharset());
725 
726  if (c_it != NULL) {
727  if (permute_debug)
728  tprintf("Replacing %s r%g ==> %s r%g\n",
729  getUnicharset().id_to_unichar(unichar_id),
730  first_choice->rating(),
732  c_it->rating());
733  if (permuter_state)
734  permuter_state->AddPreference(x, c_it, segment_reward_script);
735  first_choice = c_it;
736  replaced = true;
737  }
738  }
739  current_word->append_unichar_id(first_choice->unichar_id(), 1,
740  first_choice->rating(),
741  first_choice->certainty());
742  certainties[x] = first_choice->certainty();
743  prev_is_consistent = sid_consistent;
744  }
745  // All permuter choices should go through adjust_non_word so the choice
746  // rating would be adjusted on the same scale.
747  adjust_non_word(current_word, certainties, &char_choices, permute_debug);
748  if (replaced) {
749  // Apply a reward multiplier on rating if an script permutation is made.
750  float rating = current_word->rating();
751  current_word->set_rating(rating * segment_reward_script);
752  if (permute_debug)
753  current_word->print("<== permute_script_word **");
754  }
755  return current_word;
756 }
757 
766  WERD_CHOICE *best_choice,
767  WERD_CHOICE *raw_choice) {
768  if (permute_debug) {
769  tprintf("\n\n\n##### Permute_Characters #######\n");
770  print_char_choices_list("\n==> Input CharChoices", char_choices,
772  tprintf("\n");
773  }
774 
775  if (char_choices.length() == 1 &&
776  get_top_choice_uid(char_choices.get(0)) == 0) return false;
777  WERD_CHOICE *this_choice = permute_all(char_choices, best_choice, raw_choice);
778 
779  if (this_choice && this_choice->rating() < best_choice->rating()) {
780  *best_choice = *this_choice;
781 
782  if (permute_debug) {
783  best_choice->print("\n**** Populate BestChoice");
784  cprintf("populate best_choice\n\t%s\n",
785  best_choice->debug_string().string());
786  }
787  delete this_choice;
788  return true;
789  }
790  delete this_choice;
791  return false;
792 }
793 
800  const BLOB_CHOICE_LIST_VECTOR &char_choices,
801  float rating_limit) {
802  BLOB_CHOICE *first_choice;
803  WERD_CHOICE *best_choice = NULL;
804  WERD_CHOICE current_word(&getUnicharset(), MAX_WERD_LENGTH);
805  int first_index = 0;
806  int x;
807  BLOB_CHOICE_IT blob_choice_it;
808 
809  if (char_choices.length() > MAX_WERD_LENGTH) {
810  WERD_CHOICE *bad_word_choice = new WERD_CHOICE(&getUnicharset());
811  bad_word_choice->make_bad();
812  return bad_word_choice;
813  }
814 
815  UNICHAR_ID slash = getUnicharset().unichar_to_id("/");
816  UNICHAR_ID dash = getUnicharset().unichar_to_id("-");
817  for (x = 0; x < char_choices.length(); ++x) {
818  blob_choice_it.set_to_list(char_choices.get(x));
819  first_choice = blob_choice_it.data();
820  if (first_choice->unichar_id() == slash ||
821  first_choice->unichar_id() == dash) {
822  if (x > first_index) {
823  if (segment_debug)
824  cprintf ("Hyphenated word found\n");
825  permute_subword(char_choices, rating_limit, first_index,
826  x - 1, &current_word);
827  if (current_word.rating() > rating_limit)
828  break;
829  }
830  // Append hyphen/slash separator to current_word.
832  first_choice->unichar_id(), 1,
833  first_choice->rating(), first_choice->certainty());
834 
835  first_index = x + 1; // update first_index
836  }
837  }
838 
839  if (first_index > 0 && first_index < x &&
840  current_word.rating() <= rating_limit) {
841  permute_subword(char_choices, rating_limit, first_index,
842  x - 1, &current_word);
843  best_choice = new WERD_CHOICE(current_word);
844  best_choice->set_permuter(COMPOUND_PERM);
845  }
846  return (best_choice);
847 }
848 
849 
860  float rating_limit,
861  int start,
862  int end,
863  WERD_CHOICE *current_word) {
864  int x;
865  BLOB_CHOICE_LIST_VECTOR subchoices;
866  WERD_CHOICE *best_choice = NULL;
867  WERD_CHOICE raw_choice(&getUnicharset());
868  raw_choice.make_bad();
869 
871 
872  for (x = start; x <= end; x++) {
873  if (char_choices.get(x) != NULL) {
874  subchoices += char_choices.get(x);
875  }
876  }
877 
878  if (!subchoices.empty()) {
879  WERD_CHOICE initial_choice(&getUnicharset());
880  initial_choice.make_bad();
881  initial_choice.set_rating(rating_limit);
882 
883  best_choice = permute_all(subchoices, &initial_choice, &raw_choice);
884 
885  if (best_choice && best_choice->length() > 0) {
886  *current_word += *best_choice;
887  } else {
888  current_word->set_rating(MAX_FLOAT32);
889  }
890  } else {
891  current_word->set_rating(MAX_FLOAT32);
892  }
893 
894  if (best_choice)
895  delete best_choice;
896 
897  if (segment_debug && current_word->rating() < MAX_FLOAT32) {
898  cprintf ("Subword permuted = %s, %5.2f, %5.2f\n\n",
899  current_word->debug_string().string(),
900  current_word->rating(), current_word->certainty());
901  }
903 }
904 
909  const BLOB_CHOICE_LIST_VECTOR &char_choices) {
911  float certainties[MAX_PERM_LENGTH];
912  top_word->set_permuter(TOP_CHOICE_PERM);
913  for (int x = 0; x < char_choices.length(); x++) {
914  BLOB_CHOICE_IT blob_choice_it;
915  blob_choice_it.set_to_list(char_choices.get(x));
916  BLOB_CHOICE *top_choice = blob_choice_it.data();
917  top_word->append_unichar_id_space_allocated(top_choice->unichar_id(), 1,
918  top_choice->rating(),
919  top_choice->certainty());
920  certainties[x] = top_choice->certainty();
921  }
922  LogNewChoice(1.0, certainties, true, top_word, char_choices);
923  return top_word;
924 }
925 
935  const BLOB_CHOICE_LIST_VECTOR &char_choices,
936  float* rating_limit,
937  WERD_CHOICE *raw_choice,
938  BOOL8 *any_alpha) {
939  BLOB_CHOICE *first_choice;
940  const char *first_char; //first choice
941  const char *second_char; //second choice
942  const char *third_char; //third choice
943  char prev_char[UNICHAR_LEN + 1]; //prev in word
944  const char *next_char = ""; //next in word
945  const char *next_next_char = ""; //after next next in word
946 
948  word.set_permuter(TOP_CHOICE_PERM);
949  WERD_CHOICE capital_word(&getUnicharset(), MAX_PERM_LENGTH);
950  capital_word.set_permuter(UPPER_CASE_PERM);
951  WERD_CHOICE lower_word(&getUnicharset(), MAX_PERM_LENGTH);
952  lower_word.set_permuter(LOWER_CASE_PERM);
953 
954  int x;
955  BOOL8 char_alpha;
956  float first_rating = 0;
957 
958  float certainties[MAX_PERM_LENGTH + 1];
959  float lower_certainties[MAX_PERM_LENGTH + 1];
960  float upper_certainties[MAX_PERM_LENGTH + 1];
961 
962  BLOB_CHOICE_IT blob_choice_it;
963  UNICHAR_ID temp_id;
964  UNICHAR_ID unichar_id;
965  UNICHAR_ID space = getUnicharset().unichar_to_id(" ");
966  register const char* ch;
967  register inT8 lower_done;
968  register inT8 upper_done;
969 
970  prev_char[0] = '\0';
971 
972  if (any_alpha != NULL)
973  *any_alpha = FALSE;
974 
975  if (char_choices.length() > MAX_PERM_LENGTH) {
976  return (NULL);
977  }
978 
979  for (x = 0; x < char_choices.length(); ++x) {
980  if (x + 1 < char_choices.length()) {
981  unichar_id = get_top_choice_uid(char_choices.get(x+1));
982  next_char = unichar_id != INVALID_UNICHAR_ID ?
983  getUnicharset().id_to_unichar(unichar_id) : "";
984  } else {
985  next_char = "";
986  }
987 
988  if (x + 2 < char_choices.length()) {
989  unichar_id = get_top_choice_uid(char_choices.get(x+2));
990  next_next_char = unichar_id != INVALID_UNICHAR_ID ?
991  getUnicharset().id_to_unichar(unichar_id) : "";
992  } else {
993  next_next_char = "";
994  }
995 
996  blob_choice_it.set_to_list(char_choices.get(x));
997  ASSERT_HOST(!blob_choice_it.empty());
998  first_choice = NULL;
999  for (blob_choice_it.mark_cycle_pt(); !blob_choice_it.cycled_list();
1000  blob_choice_it.forward()) { // find the best non-fragment char choice
1001  temp_id = blob_choice_it.data()->unichar_id();
1002  if (!(getUnicharset().get_fragment(temp_id))) {
1003  first_choice = blob_choice_it.data();
1004  break;
1005  } else if (char_choices.length() > 1) {
1006  word.set_fragment_mark(true);
1007  capital_word.set_fragment_mark(true);
1008  lower_word.set_fragment_mark(true);
1009  }
1010  }
1011  if (first_choice == NULL) {
1012  cprintf("Permuter found only fragments for"
1013  " character at position %d; word=%s\n",
1014  x, word.debug_string().string());
1015  }
1016  ASSERT_HOST(first_choice != NULL);
1017 
1018  unichar_id = first_choice->unichar_id() != INVALID_UNICHAR_ID ?
1019  first_choice->unichar_id() : space;
1020  first_char = getUnicharset().id_to_unichar(unichar_id);
1021  first_rating = first_choice->rating();
1023  unichar_id, 1, first_choice->rating(), first_choice->certainty());
1024  capital_word.append_unichar_id_space_allocated(
1025  unichar_id, 1, first_choice->rating(), first_choice->certainty());
1027  unichar_id, 1, first_choice->rating(), first_choice->certainty());
1028 
1029  certainties[x] = first_choice->certainty();
1030  lower_certainties[x] = first_choice->certainty();
1031  upper_certainties[x] = first_choice->certainty();
1032 
1033  lower_done = FALSE;
1034  upper_done = FALSE;
1035  char_alpha = FALSE;
1036  second_char = "";
1037  third_char = "";
1038  for (; !blob_choice_it.cycled_list(); blob_choice_it.forward()) {
1039  unichar_id = blob_choice_it.data()->unichar_id();
1040  if (getUnicharset().eq(unichar_id, "l") && !blob_choice_it.at_last() &&
1041  blob_choice_it.data_relative(1)->rating() == first_rating) {
1042  temp_id = blob_choice_it.data_relative(1)->unichar_id();
1043  if (getUnicharset().eq(temp_id, "1") ||
1044  getUnicharset().eq(temp_id, "I")) {
1045  second_char = getUnicharset().id_to_unichar(temp_id);
1046  blob_choice_it.forward();
1047  if (!blob_choice_it.at_last() &&
1048  blob_choice_it.data_relative(1)->rating() == first_rating) {
1049  temp_id = blob_choice_it.data_relative(1)->unichar_id();
1050  if (getUnicharset().eq(temp_id, "1") ||
1051  getUnicharset().eq(temp_id, "I")) {
1052  third_char = getUnicharset().id_to_unichar(temp_id);
1053  blob_choice_it.forward();
1054  }
1055  }
1056  ch = choose_il1 (first_char, second_char, third_char,
1057  prev_char, next_char, next_next_char);
1058  unichar_id = (ch != NULL && *ch != '\0') ?
1059  getUnicharset().unichar_to_id(ch) : INVALID_UNICHAR_ID;
1060  if (strcmp(ch, "l") != 0 &&
1061  getUnicharset().eq(word.unichar_id(x), "l")) {
1062  word.set_unichar_id(unichar_id, x);
1063  lower_word.set_unichar_id(unichar_id, x);
1064  capital_word.set_unichar_id(unichar_id, x);
1065  }
1066  }
1067  }
1068  if (unichar_id != INVALID_UNICHAR_ID) {
1069  /* Find lower case */
1070  if (!lower_done &&
1071  (getUnicharset().get_islower(unichar_id) ||
1072  (getUnicharset().get_isupper(unichar_id) && x == 0))) {
1073  lower_word.set_unichar_id(unichar_id, x);
1074  lower_word.set_rating(lower_word.rating() -
1075  first_choice->rating() + blob_choice_it.data()->rating());
1076  if (blob_choice_it.data()->certainty() < lower_word.certainty()) {
1077  lower_word.set_certainty(blob_choice_it.data()->certainty());
1078  }
1079  lower_certainties[x] = blob_choice_it.data()->certainty();
1080  lower_done = TRUE;
1081  }
1082  /* Find upper case */
1083  if (!upper_done && getUnicharset().get_isupper(unichar_id)) {
1084  capital_word.set_unichar_id(unichar_id, x);
1085  capital_word.set_rating(capital_word.rating() -
1086  first_choice->rating() + blob_choice_it.data()->rating());
1087  if (blob_choice_it.data()->certainty() < capital_word.certainty()) {
1088  capital_word.set_certainty(blob_choice_it.data()->certainty());
1089  }
1090  upper_certainties[x] = blob_choice_it.data()->certainty();
1091  upper_done = TRUE;
1092  }
1093  if (!char_alpha) {
1094  const CHAR_FRAGMENT *fragment =
1095  getUnicharset().get_fragment(unichar_id);
1096  temp_id = !fragment ? unichar_id :
1097  getUnicharset().unichar_to_id(fragment->get_unichar());
1098  if (getUnicharset().get_isalpha(temp_id)) {
1099  char_alpha = TRUE;
1100  }
1101  }
1102  if (lower_done && upper_done)
1103  break;
1104  }
1105  }
1106  if (char_alpha && any_alpha != NULL)
1107  *any_alpha = TRUE;
1108 
1109  if (word.rating() > bestrate_pruning_factor * *rating_limit) {
1110  if (permute_debug)
1111  tprintf("\n***** Aborting high-cost word: %g > limit %g\n",
1112  word.rating(), bestrate_pruning_factor * *rating_limit);
1113  return (NULL);
1114  }
1115 
1116  *prev_char = '\0';
1117  temp_id = word.unichar_id(word.length()-1);
1118  if (temp_id != INVALID_UNICHAR_ID) {
1119  strcpy(prev_char, getUnicharset().id_to_unichar(temp_id));
1120  }
1121  }
1122 
1123  if (raw_choice != NULL && word.rating() < raw_choice->rating()) {
1124  *raw_choice = word;
1125  LogNewChoice(1.0, certainties, true, raw_choice, char_choices);
1126  }
1127  float rating = word.rating();
1128  adjust_non_word(&word, certainties, &char_choices, permute_debug);
1129 
1130  float lower_rating = lower_word.rating();
1131  adjust_non_word(&lower_word, lower_certainties, &char_choices,
1132  permute_debug);
1133 
1134  float upper_rating = capital_word.rating();
1135  adjust_non_word(&capital_word, upper_certainties, &char_choices,
1136  permute_debug);
1137 
1138  WERD_CHOICE *best_choice = &word;
1139  *rating_limit = rating;
1140  if (lower_word.rating() < best_choice->rating()) {
1141  best_choice = &lower_word;
1142  *rating_limit = lower_rating;
1143  }
1144  if (capital_word.rating() < best_choice->rating()) {
1145  best_choice = &capital_word;
1146  *rating_limit = upper_rating;
1147  }
1148  return new WERD_CHOICE(*best_choice);
1149 }
1150 
1151 
1163 const char* Dict::choose_il1(const char *first_char,
1164  const char *second_char,
1165  const char *third_char,
1166  const char *prev_char,
1167  const char *next_char,
1168  const char *next_next_char) {
1169  inT32 type1; //1/I/l type of first choice
1170  inT32 type2; //1/I/l type of second choice
1171  inT32 type3; //1/I/l type of third choice
1172 
1173  int first_char_length = strlen(first_char);
1174  int prev_char_length = strlen(prev_char);
1175  int next_char_length = strlen(next_char);
1176  int next_next_char_length = strlen(next_next_char);
1177 
1178  if (*first_char == 'l' && *second_char != '\0') {
1179  if (*second_char == 'I'
1180  && (((prev_char_length != 0 &&
1181  getUnicharset().get_isupper (prev_char, prev_char_length)) &&
1182  (next_char_length == 0 ||
1183  !getUnicharset().get_islower (next_char, next_char_length)) &&
1184  (next_char_length == 0 ||
1185  !getUnicharset().get_isdigit (next_char, next_char_length))) ||
1186  ((next_char_length != 0 &&
1187  getUnicharset().get_isupper (next_char, next_char_length)) &&
1188  (prev_char_length == 0 ||
1189  !getUnicharset().get_islower (prev_char, prev_char_length)) &&
1190  (prev_char_length == 0 ||
1191  !getUnicharset().get_isdigit (prev_char, prev_char_length)))))
1192  first_char = second_char; //override
1193  else if (*second_char == '1' || *third_char == '1') {
1194  if ((next_char_length != 0 &&
1195  getUnicharset().get_isdigit (next_char, next_char_length)) ||
1196  (prev_char_length != 0 &&
1197  getUnicharset().get_isdigit (prev_char, prev_char_length))
1198  || (*next_char == 'l' &&
1199  (next_next_char_length != 0 &&
1200  getUnicharset().get_isdigit (next_next_char,
1201  next_next_char_length)))) {
1202  first_char = "1";
1203  first_char_length = 1;
1204  }
1205  else if ((prev_char_length == 0 ||
1206  !getUnicharset().get_islower (prev_char, prev_char_length)) &&
1207  ((next_char_length == 0 ||
1208  !getUnicharset().get_islower (next_char, next_char_length)) ||
1209  (*next_char == 's' &&
1210  *next_next_char == 't'))) {
1211  if (((*prev_char != '\'' && *prev_char != '`') || *next_char != '\0')
1212  && ((*next_char != '\'' && *next_char != '`')
1213  || *prev_char != '\0')) {
1214  first_char = "1";
1215  first_char_length = 1;
1216  }
1217  }
1218  }
1219  if (*first_char == 'l' && *next_char != '\0' &&
1220  (prev_char_length == 0 ||
1221  !getUnicharset().get_isalpha (prev_char, prev_char_length))) {
1222  type1 = 2;
1223 
1224  if (*second_char == '1')
1225  type2 = 0;
1226  else if (*second_char == 'I')
1227  type2 = 1;
1228  else if (*second_char == 'l')
1229  type2 = 2;
1230  else
1231  type2 = type1;
1232 
1233  if (*third_char == '1')
1234  type3 = 0;
1235  else if (*third_char == 'I')
1236  type3 = 1;
1237  else if (*third_char == 'l')
1238  type3 = 2;
1239  else
1240  type3 = type1;
1241 
1242 #if 0
1243  if (bigram_counts[*next_char][type2] >
1244  bigram_counts[*next_char][type1]) {
1245  first_char = second_char;
1246  type1 = type2;
1247  }
1248  if (bigram_counts[*next_char][type3] >
1249  bigram_counts[*next_char][type1]) {
1250  first_char = third_char;
1251  }
1252 #endif
1253  }
1254  }
1255  return first_char;
1256 }
1257 
1284  float curr_rating, float curr_certainty,
1285  const CHAR_FRAGMENT_INFO *prev_char_frag_info,
1286  const char *debug, int word_ending,
1287  CHAR_FRAGMENT_INFO *char_frag_info) {
1288  const CHAR_FRAGMENT *this_fragment =
1289  getUnicharset().get_fragment(curr_unichar_id);
1290  const CHAR_FRAGMENT *prev_fragment =
1291  prev_char_frag_info != NULL ? prev_char_frag_info->fragment : NULL;
1292 
1293  // Print debug info for fragments.
1294  if (debug && (prev_fragment || this_fragment)) {
1295  cprintf("%s check fragments: choice=%s word_ending=%d\n", debug,
1296  getUnicharset().debug_str(curr_unichar_id).string(),
1297  word_ending);
1298  if (prev_fragment) {
1299  cprintf("prev_fragment %s\n", prev_fragment->to_string().string());
1300  }
1301  if (this_fragment) {
1302  cprintf("this_fragment %s\n", this_fragment->to_string().string());
1303  }
1304  }
1305 
1306  char_frag_info->unichar_id = curr_unichar_id;
1307  char_frag_info->fragment = this_fragment;
1308  char_frag_info->rating = curr_rating;
1309  char_frag_info->certainty = curr_certainty;
1310  char_frag_info->num_fragments = 1;
1311  if (prev_fragment && !this_fragment) {
1312  if (debug) tprintf("Skip choice with incomplete fragment\n");
1313  return false;
1314  }
1315  if (this_fragment) {
1316  // We are dealing with a fragment.
1317  char_frag_info->unichar_id = INVALID_UNICHAR_ID;
1318  if (prev_fragment) {
1319  if (!this_fragment->is_continuation_of(prev_fragment)) {
1320  if (debug) tprintf("Non-matching fragment piece\n");
1321  return false;
1322  }
1323  if (this_fragment->is_ending()) {
1324  char_frag_info->unichar_id =
1325  getUnicharset().unichar_to_id(this_fragment->get_unichar());
1326  char_frag_info->fragment = NULL;
1327  if (debug) {
1328  tprintf("Built character %s from fragments\n",
1329  getUnicharset().debug_str(
1330  char_frag_info->unichar_id).string());
1331  }
1332  } else {
1333  if (debug) tprintf("Record fragment continuation\n");
1334  char_frag_info->fragment = this_fragment;
1335  }
1336  // Update certainty and rating.
1337  char_frag_info->rating =
1338  prev_char_frag_info->rating + curr_rating;
1339  char_frag_info->num_fragments = prev_char_frag_info->num_fragments + 1;
1340  char_frag_info->certainty =
1341  MIN(curr_certainty, prev_char_frag_info->certainty);
1342  } else {
1343  if (this_fragment->is_beginning()) {
1344  if (debug) cprintf("Record fragment beginning\n");
1345  } else {
1346  if (debug) {
1347  tprintf("Non-starting fragment piece with no prev_fragment\n");
1348  }
1349  return false;
1350  }
1351  }
1352  }
1353  if (word_ending && char_frag_info->fragment) {
1354  if (debug) tprintf("Word can not end with a fragment\n");
1355  return false;
1356  }
1357  return true;
1358 }
1368  const BLOB_CHOICE_LIST_VECTOR &char_choices,
1369  float rating_limit) {
1370  if (char_choices.length() <= 1 ||
1371  char_choices.length() > MAX_PERM_LENGTH) {
1372  return NULL;
1373  }
1374  // See it would be possible to benefit from permuting fragments.
1375  int x;
1376  float min_rating = 0.0;
1377  BLOB_CHOICE_IT blob_choice_it;
1378  for (x = 0; x < char_choices.length(); ++x) {
1379  blob_choice_it.set_to_list(char_choices.get(x));
1380  if (blob_choice_it.data()) {
1381  min_rating += blob_choice_it.data()->rating();
1382  }
1383  if (min_rating >= rating_limit) {
1384  return NULL;
1385  }
1386  }
1387  if (fragments_debug > 1) {
1388  tprintf("A choice with fragment beats top choice\n");
1389  tprintf("Running fragment permuter...\n");
1390  }
1391 
1392  // Construct a modified choices list that contains (for each position):
1393  // the best choice, all fragments and at least one choice for
1394  // a non-fragmented character.
1395  BLOB_CHOICE_LIST_VECTOR frag_char_choices(char_choices.length());
1396  for (x = 0; x < char_choices.length(); ++x) {
1397  bool need_nonfrag_char = true;
1398  BLOB_CHOICE_LIST *frag_choices = new BLOB_CHOICE_LIST();
1399  BLOB_CHOICE_IT frag_choices_it;
1400  frag_choices_it.set_to_list(frag_choices);
1401  blob_choice_it.set_to_list(char_choices.get(x));
1402  for (blob_choice_it.mark_cycle_pt(); !blob_choice_it.cycled_list();
1403  blob_choice_it.forward()) {
1404  if (getUnicharset().get_fragment(blob_choice_it.data()->unichar_id())) {
1405  frag_choices_it.add_after_then_move(
1406  new BLOB_CHOICE(*(blob_choice_it.data())));
1407  } else if (need_nonfrag_char) {
1408  frag_choices_it.add_after_then_move(
1409  new BLOB_CHOICE(*(blob_choice_it.data())));
1410  need_nonfrag_char = false;
1411  }
1412  }
1413  frag_char_choices += frag_choices;
1414  }
1415 
1416  WERD_CHOICE *best_choice = new WERD_CHOICE(&getUnicharset());
1417  best_choice->make_bad();
1419  word.set_permuter(TOP_CHOICE_PERM);
1420  float certainties[MAX_PERM_LENGTH];
1422  int attempts_left = max_permuter_attempts;
1423  permute_choices((fragments_debug > 1) ? "fragments_debug" : NULL,
1424  frag_char_choices, 0, NULL, &word, certainties,
1425  &rating_limit, best_choice, &attempts_left, NULL);
1426 
1427  frag_char_choices.delete_data_pointers();
1428  return best_choice;
1429 }
1430 
1438  const char *debug,
1439  const BLOB_CHOICE_LIST_VECTOR &char_choices,
1440  int char_choice_index,
1441  const CHAR_FRAGMENT_INFO *prev_char_frag_info,
1442  WERD_CHOICE *word,
1443  float certainties[],
1444  float *limit,
1445  WERD_CHOICE *best_choice,
1446  int *attempts_left,
1447  void *more_args) {
1448  if (debug) {
1449  tprintf("%s permute_choices: char_choice_index=%d"
1450  " limit=%g rating=%g, certainty=%g word=%s\n",
1451  debug, char_choice_index, *limit, word->rating(),
1452  word->certainty(), word->debug_string().string());
1453  }
1454  if (char_choice_index < char_choices.length()) {
1455  BLOB_CHOICE_IT blob_choice_it;
1456  blob_choice_it.set_to_list(char_choices.get(char_choice_index));
1457  for (blob_choice_it.mark_cycle_pt(); !blob_choice_it.cycled_list();
1458  blob_choice_it.forward()) {
1459  (*attempts_left)--;
1460  append_choices(debug, char_choices, *(blob_choice_it.data()),
1461  char_choice_index, prev_char_frag_info, word,
1462  certainties, limit, best_choice, attempts_left, more_args);
1463  if (*attempts_left <= 0) {
1464  if (debug) tprintf("permute_choices(): attempts_left is 0\n");
1465  break;
1466  }
1467  }
1468  }
1469 }
1470 
1480  const char *debug,
1481  const BLOB_CHOICE_LIST_VECTOR &char_choices,
1482  const BLOB_CHOICE &blob_choice,
1483  int char_choice_index,
1484  const CHAR_FRAGMENT_INFO *prev_char_frag_info,
1485  WERD_CHOICE *word,
1486  float certainties[],
1487  float *limit,
1488  WERD_CHOICE *best_choice,
1489  int *attempts_left,
1490  void *more_args) {
1491  int word_ending =
1492  (char_choice_index == char_choices.length() - 1) ? true : false;
1493 
1494  // Deal with fragments.
1495  CHAR_FRAGMENT_INFO char_frag_info;
1496  if (!fragment_state_okay(blob_choice.unichar_id(), blob_choice.rating(),
1497  blob_choice.certainty(), prev_char_frag_info, debug,
1498  word_ending, &char_frag_info)) {
1499  return; // blob_choice must be an invalid fragment
1500  }
1501  // Search the next letter if this character is a fragment.
1502  if (char_frag_info.unichar_id == INVALID_UNICHAR_ID) {
1503  permute_choices(debug, char_choices, char_choice_index + 1,
1504  &char_frag_info, word, certainties, limit,
1505  best_choice, attempts_left, more_args);
1506  return;
1507  }
1508 
1509  // Add the next unichar.
1510  float old_rating = word->rating();
1511  float old_certainty = word->certainty();
1512  uinT8 old_permuter = word->permuter();
1513  certainties[word->length()] = char_frag_info.certainty;
1515  char_frag_info.unichar_id, char_frag_info.num_fragments,
1516  char_frag_info.rating, char_frag_info.certainty);
1517 
1518  // Explore the next unichar.
1519  (this->*go_deeper_fxn_)(debug, char_choices, char_choice_index,
1520  &char_frag_info, word_ending, word, certainties,
1521  limit, best_choice, attempts_left, more_args);
1522 
1523  // Remove the unichar we added to explore other choices in it's place.
1524  word->remove_last_unichar_id();
1525  word->set_rating(old_rating);
1526  word->set_certainty(old_certainty);
1527  word->set_permuter(old_permuter);
1528 }
1529 
1539  const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices,
1540  int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info,
1541  bool word_ending, WERD_CHOICE *word, float certainties[], float *limit,
1542  WERD_CHOICE *best_choice, int *attempts_left, void *more_args) {
1543  if (word->rating() < *limit) {
1544  if (word_ending) {
1545  if (fragments_debug > 1) {
1546  tprintf("fragments_debug new choice = %s\n",
1547  word->debug_string().string());
1548  }
1549  *limit = word->rating();
1550  adjust_non_word(word, certainties, &char_choices, permute_debug);
1551  update_best_choice(*word, best_choice);
1552  } else { // search the next letter
1553  permute_choices(debug, char_choices, char_choice_index + 1,
1554  prev_char_frag_info, word, certainties, limit,
1555  best_choice, attempts_left, more_args);
1556  }
1557  } else {
1558  if (fragments_debug > 1) {
1559  tprintf("fragments_debug pruned word (%s, rating=%4.2f, limit=%4.2f)\n",
1560  word->debug_string().string(), word->rating(), *limit);
1561  }
1562  }
1563 }
1564 
1565 } // namespace tesseract