Mercurial > hgrepos > Python2 > PyMuPDF
diff mupdf-source/thirdparty/tesseract/src/ccmain/docqual.cpp @ 2:b50eed0cc0ef upstream
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
The directory name has changed: no version number in the expanded directory now.
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Mon, 15 Sep 2025 11:43:07 +0200 |
| parents | |
| children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/mupdf-source/thirdparty/tesseract/src/ccmain/docqual.cpp Mon Sep 15 11:43:07 2025 +0200 @@ -0,0 +1,926 @@ +/****************************************************************** + * File: docqual.cpp (Formerly docqual.c) + * Description: Document Quality Metrics + * Author: Phil Cheatle + * + * (C) Copyright 1994, Hewlett-Packard Ltd. + ** Licensed under the Apache License, Version 2.0 (the "License"); + ** you may not use this file except in compliance with the License. + ** You may obtain a copy of the License at + ** http://www.apache.org/licenses/LICENSE-2.0 + ** Unless required by applicable law or agreed to in writing, software + ** distributed under the License is distributed on an "AS IS" BASIS, + ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + ** See the License for the specific language governing permissions and + ** limitations under the License. + * + **********************************************************************/ + +#include "docqual.h" +#include <cctype> +#include "reject.h" +#include "tesseractclass.h" +#include "tessvars.h" + +namespace tesseract { + +static void countMatchingBlobs(int16_t &match_count, int /*index*/) { + ++match_count; +} + +static void countAcceptedBlobs(WERD_RES *word, int16_t &match_count, int16_t &accepted_match_count, + int index) { + if (word->reject_map[index].accepted()) { + ++accepted_match_count; + } + ++match_count; +} + +static void acceptIfGoodQuality(WERD_RES *word, int index) { + if (word->reject_map[index].accept_if_good_quality()) { + word->reject_map[index].setrej_quality_accept(); + } +} + +/************************************************************************* + * word_blob_quality() + * How many blobs in the box_word are identical to those of the inword? + * ASSUME blobs in both initial word and box_word are in ascending order of + * left hand blob edge. 
+ *************************************************************************/ +int16_t Tesseract::word_blob_quality(WERD_RES *word) { + int16_t match_count = 0; + if (word->bln_boxes != nullptr && word->rebuild_word != nullptr && + !word->rebuild_word->blobs.empty()) { + using namespace std::placeholders; // for _1 + word->bln_boxes->ProcessMatchedBlobs(*word->rebuild_word, + std::bind(countMatchingBlobs, match_count, _1)); + } + return match_count; +} + +int16_t Tesseract::word_outline_errs(WERD_RES *word) { + int16_t err_count = 0; + + if (word->rebuild_word != nullptr) { + int16_t i = 0; + for (unsigned b = 0; b < word->rebuild_word->NumBlobs(); ++b) { + TBLOB *blob = word->rebuild_word->blobs[b]; + err_count += count_outline_errs(word->best_choice->unichar_string()[i], blob->NumOutlines()); + i++; + } + } + return err_count; +} + +/************************************************************************* + * word_char_quality() + * Combination of blob quality and outline quality - how many good chars are + * there? - I.e chars which pass the blob AND outline tests. 
+ *************************************************************************/ +void Tesseract::word_char_quality(WERD_RES *word, int16_t *match_count, + int16_t *accepted_match_count) { + *match_count = 0; + *accepted_match_count = 0; + if (word->bln_boxes != nullptr && word->rebuild_word != nullptr && + !word->rebuild_word->blobs.empty()) { + using namespace std::placeholders; // for _1 + word->bln_boxes->ProcessMatchedBlobs( + *word->rebuild_word, + std::bind(countAcceptedBlobs, word, *match_count, *accepted_match_count, _1)); + } +} + +/************************************************************************* + * unrej_good_chs() + * Unreject POTENTIAL rejects if the blob passes the blob and outline checks + *************************************************************************/ +void Tesseract::unrej_good_chs(WERD_RES *word) { + if (word->bln_boxes != nullptr && word->rebuild_word != nullptr && + word->rebuild_word->blobs.empty()) { + using namespace std::placeholders; // for _1 + word->bln_boxes->ProcessMatchedBlobs(*word->rebuild_word, + std::bind(acceptIfGoodQuality, word, _1)); + } +} + +int16_t Tesseract::count_outline_errs(char c, int16_t outline_count) { + int expected_outline_count; + + if (outlines_odd.contains(c)) { + return 0; // Don't use this char + } else if (outlines_2.contains(c)) { + expected_outline_count = 2; + } else { + expected_outline_count = 1; + } + return abs(outline_count - expected_outline_count); +} + +void Tesseract::quality_based_rejection(PAGE_RES_IT &page_res_it, bool good_quality_doc) { + if ((tessedit_good_quality_unrej && good_quality_doc)) { + unrej_good_quality_words(page_res_it); + } + doc_and_block_rejection(page_res_it, good_quality_doc); + if (unlv_tilde_crunching) { + tilde_crunch(page_res_it); + tilde_delete(page_res_it); + } +} + +/************************************************************************* + * unrej_good_quality_words() + * Accept potential rejects in words which pass the following checks: + * - 
Contains a potential reject
 * - Word looks like a sensible alpha word.
 * - Word segmentation is the same as the original image
 * - All characters have the expected number of outlines
 * NOTE - the rejection counts are recalculated after unrejection
 * - CAN'T do it in a single pass without a bit of fiddling
 * - keep it simple but inefficient
 *************************************************************************/
void Tesseract::unrej_good_quality_words( // unreject potential
    PAGE_RES_IT &page_res_it) {
  WERD_RES *word;
  ROW_RES *current_row;
  BLOCK_RES *current_block;
  int i;

  // Pass 1: walk all words and unreject where permitted.
  page_res_it.restart_page();
  while (page_res_it.word() != nullptr) {
    check_debug_pt(page_res_it.word(), 100);
    if (bland_unrej) {
      // Accept every entry that reports accept_if_good_quality(), with no
      // row or word level checks.
      word = page_res_it.word();
      for (i = 0; i < word->reject_map.length(); i++) {
        if (word->reject_map[i].accept_if_good_quality()) {
          word->reject_map[i].setrej_quality_accept();
        }
      }
      page_res_it.forward();
    } else if ((page_res_it.row()->char_count > 0) &&
               ((page_res_it.row()->rej_count /
                 static_cast<float>(page_res_it.row()->char_count)) <= quality_rowrej_pc)) {
      // Row reject fraction is within quality_rowrej_pc: consider this word.
      word = page_res_it.word();
      if (word->reject_map.quality_recoverable_rejects() &&
          (tessedit_unrej_any_wd ||
           acceptable_word_string(*word->uch_set, word->best_choice->unichar_string().c_str(),
                                  word->best_choice->unichar_lengths().c_str()) !=
               AC_UNACCEPTABLE)) {
        unrej_good_chs(word);
      }
      page_res_it.forward();
    } else {
      // Skip to end of dodgy row.
      current_row = page_res_it.row();
      while ((page_res_it.word() != nullptr) && (page_res_it.row() == current_row)) {
        page_res_it.forward();
      }
    }
    check_debug_pt(page_res_it.word(), 110);
  }
  // Pass 2: zero the page/block/row counters and rebuild them word by word
  // via rej_stat_word().
  page_res_it.restart_page();
  page_res_it.page_res->char_count = 0;
  page_res_it.page_res->rej_count = 0;
  current_block = nullptr;
  current_row = nullptr;
  while (page_res_it.word() != nullptr) {
    if (current_block != page_res_it.block()) {
      current_block = page_res_it.block();
      current_block->char_count = 0;
      current_block->rej_count = 0;
    }
    if (current_row != page_res_it.row()) {
      current_row = page_res_it.row();
      current_row->char_count = 0;
      current_row->rej_count = 0;
      current_row->whole_word_rej_count = 0;
    }
    page_res_it.rej_stat_word();
    page_res_it.forward();
  }
}

/*************************************************************************
 * doc_and_block_rejection()
 *
 * If the page has too many rejects - reject all of it.
 * If any block has too many rejects - reject all words in the block
 * Otherwise each row of the block is tested for row rejection.
 *************************************************************************/

void Tesseract::doc_and_block_rejection( // reject big chunks
    PAGE_RES_IT &page_res_it, bool good_quality_doc) {
  BLOCK_RES *current_block;

  int16_t char_quality = 0;
  int16_t accepted_char_quality; // written by word_char_quality(), never read here

  // NOTE(review): unlike the block and row tests below, the page level test
  // does not check char_count > 0 before dividing.
  if (page_res_it.page_res->rej_count * 100.0 / page_res_it.page_res->char_count >
      tessedit_reject_doc_percent) {
    reject_whole_page(page_res_it);
    if (tessedit_debug_doc_rejection) {
      tprintf("REJECT ALL #chars: %d #Rejects: %d; \n", page_res_it.page_res->char_count,
              page_res_it.page_res->rej_count);
    }
  } else {
    if (tessedit_debug_doc_rejection) {
      tprintf("NO PAGE REJECTION #chars: %d # Rejects: %d; \n", page_res_it.page_res->char_count,
              page_res_it.page_res->rej_count);
    }

    /* Walk blocks testing for block rejection */

    page_res_it.restart_page();
    WERD_RES *word;
    while ((word = page_res_it.word()) != nullptr) {
      current_block = page_res_it.block();
      int16_t block_no = current_block->block->pdblk.index();
      if (current_block->char_count > 0 &&
          (current_block->rej_count * 100.0 / current_block->char_count) >
              tessedit_reject_block_percent) {
        if (tessedit_debug_block_rejection) {
          tprintf("REJECTING BLOCK %d #chars: %d; #Rejects: %d\n", block_no,
                  current_block->char_count, current_block->rej_count);
        }
        bool prev_word_rejected = false;
        while ((word = page_res_it.word()) != nullptr && (page_res_it.block() == current_block)) {
          bool rej_word;
          if (tessedit_preserve_blk_rej_perfect_wds) {
            // Keep words with no rejects that are at least the minimum length.
            rej_word = word->reject_map.reject_count() > 0 ||
                       word->reject_map.length() < tessedit_preserve_min_wd_len;
            if (rej_word && tessedit_dont_blkrej_good_wds &&
                word->reject_map.length() >= tessedit_preserve_min_wd_len &&
                acceptable_word_string(*word->uch_set, word->best_choice->unichar_string().c_str(),
                                       word->best_choice->unichar_lengths().c_str()) !=
                    AC_UNACCEPTABLE) {
              // Acceptable looking word: keep it only if every character
              // position was counted as a blob match.
              word_char_quality(word, &char_quality, &accepted_char_quality);
              rej_word = char_quality != word->reject_map.length();
            }
          } else {
            rej_word = true;
          }
          if (rej_word) {
            /*
  Reject spacing if both current and prev words are rejected.
  NOTE - this is NOT restricted to FUZZY spaces. - When tried this
  generated more space errors.
*/
            if (tessedit_use_reject_spaces && prev_word_rejected &&
                page_res_it.prev_row() == page_res_it.row() && word->word->space() == 1) {
              word->reject_spaces = true;
            }
            word->reject_map.rej_word_block_rej();
          }
          prev_word_rejected = rej_word;
          page_res_it.forward();
        }
      } else {
        if (tessedit_debug_block_rejection) {
          tprintf("NOT REJECTING BLOCK %d #chars: %d # Rejects: %d; \n", block_no,
                  page_res_it.block()->char_count, page_res_it.block()->rej_count);
        }

        /* Walk rows in block testing for row rejection */
        int16_t row_no = 0;
        while (page_res_it.word() != nullptr && page_res_it.block() == current_block) {
          ROW_RES *current_row = page_res_it.row();
          row_no++;
          /* Reject whole row if:
  fraction of chars on row which are rejected exceed a limit AND
  fraction rejects which occur in WHOLE WERD rejects is LESS THAN a
  limit
*/
          if (current_row->char_count > 0 &&
              (current_row->rej_count * 100.0 / current_row->char_count) >
                  tessedit_reject_row_percent &&
              (current_row->whole_word_rej_count * 100.0 / current_row->rej_count) <
                  tessedit_whole_wd_rej_row_percent) {
            if (tessedit_debug_block_rejection) {
              tprintf("REJECTING ROW %d #chars: %d; #Rejects: %d\n", row_no,
                      current_row->char_count, current_row->rej_count);
            }
            bool prev_word_rejected = false;
            while ((word = page_res_it.word()) != nullptr && page_res_it.row() == current_row) {
              /* Preserve words on good docs unless they are mostly rejected*/
              bool rej_word;
              if (!tessedit_row_rej_good_docs && good_quality_doc) {
                rej_word = word->reject_map.reject_count() /
                               static_cast<float>(word->reject_map.length()) >
                           tessedit_good_doc_still_rowrej_wd;
              } else if (tessedit_preserve_row_rej_perfect_wds) {
                /* Preserve perfect words anyway */
                rej_word = word->reject_map.reject_count() > 0 ||
                           word->reject_map.length() < tessedit_preserve_min_wd_len;
                if (rej_word && tessedit_dont_rowrej_good_wds &&
                    word->reject_map.length() >= tessedit_preserve_min_wd_len &&
                    acceptable_word_string(
                        *word->uch_set, word->best_choice->unichar_string().c_str(),
                        word->best_choice->unichar_lengths().c_str()) != AC_UNACCEPTABLE) {
                  word_char_quality(word, &char_quality, &accepted_char_quality);
                  rej_word = char_quality != word->reject_map.length();
                }
              } else {
                rej_word = true;
              }
              if (rej_word) {
                /*
  Reject spacing if both current and prev words are rejected.
  NOTE - this is NOT restricted to FUZZY spaces. - When tried
  this generated more space errors.
*/
                if (tessedit_use_reject_spaces && prev_word_rejected &&
                    page_res_it.prev_row() == page_res_it.row() && word->word->space() == 1) {
                  word->reject_spaces = true;
                }
                word->reject_map.rej_word_row_rej();
              }
              prev_word_rejected = rej_word;
              page_res_it.forward();
            }
          } else {
            if (tessedit_debug_block_rejection) {
              tprintf("NOT REJECTING ROW %d #chars: %d # Rejects: %d; \n", row_no,
                      current_row->char_count, current_row->rej_count);
            }
            // Row kept: step over its words without touching them.
            while (page_res_it.word() != nullptr && page_res_it.row() == current_row) {
              page_res_it.forward();
            }
          }
        }
      }
    }
  }
}

/*************************************************************************
 * reject_whole_page()
 * Don't believe any of it - set the reject map to 00..00 in all words
 *
 *************************************************************************/

void reject_whole_page(PAGE_RES_IT &page_res_it) {
  page_res_it.restart_page();
  while (page_res_it.word() != nullptr) {
    page_res_it.word()->reject_map.rej_word_doc_rej();
    page_res_it.forward();
  }
  // whole page is rejected
  page_res_it.page_res->rejected = true;
}

/*************************************************************************
 * tilde_crunch()
 * Walks all words (skipping words in non-text poly blocks) and sets
 * unlv_crunch_mode = CR_KEEP_SPACE on:
 *  - words with no accepted chars that terrible_word_crunch() flags, and
 *  - runs of potential_word_crunch() words directly before or after such
 *    a terrible word.
 * copy_it remembers where the current pending run of potential words began.
 *************************************************************************/
void Tesseract::tilde_crunch(PAGE_RES_IT &page_res_it) {
  WERD_RES *word;
  GARBAGE_LEVEL garbage_level;
  PAGE_RES_IT copy_it;
  bool prev_potential_marked = false;
  bool found_terrible_word = false;
  bool ok_dict_word;

  page_res_it.restart_page();
  while (page_res_it.word() != nullptr) {
    POLY_BLOCK *pb = page_res_it.block()->block->pdblk.poly_block();
    if (pb != nullptr && !pb->IsText()) {
      page_res_it.forward();
      continue;
    }
    word = page_res_it.word();

    if (crunch_early_convert_bad_unlv_chs) {
      convert_bad_unlv_chs(word);
    }

    if (crunch_early_merge_tess_fails) {
      word->merge_tess_fails();
    }

    if (word->reject_map.accept_count() != 0) {
      // Word has at least one accepted char: never crunched, and it breaks
      // any run in progress.
      found_terrible_word = false;
      // Forget earlier potential crunches
      prev_potential_marked = false;
    } else {
      ok_dict_word = safe_dict_word(word);
      garbage_level = garbage_word(word, ok_dict_word);

      if ((garbage_level != G_NEVER_CRUNCH) && (terrible_word_crunch(word, garbage_level))) {
        if (crunch_debug > 0) {
          tprintf("T CRUNCHING: \"%s\"\n", word->best_choice->unichar_string().c_str());
        }
        word->unlv_crunch_mode = CR_KEEP_SPACE;
        if (prev_potential_marked) {
          // Crunch the pending potential words that led up to this one.
          while (copy_it.word() != word) {
            if (crunch_debug > 0) {
              tprintf("P1 CRUNCHING: \"%s\"\n",
                      copy_it.word()->best_choice->unichar_string().c_str());
            }
            copy_it.word()->unlv_crunch_mode = CR_KEEP_SPACE;
            copy_it.forward();
          }
          prev_potential_marked = false;
        }
        found_terrible_word = true;
      } else if ((garbage_level != G_NEVER_CRUNCH) &&
                 (potential_word_crunch(word, garbage_level, ok_dict_word))) {
        if (found_terrible_word) {
          // Potential word following a terrible word: crunch immediately.
          if (crunch_debug > 0) {
            tprintf("P2 CRUNCHING: \"%s\"\n", word->best_choice->unichar_string().c_str());
          }
          word->unlv_crunch_mode = CR_KEEP_SPACE;
        } else if (!prev_potential_marked) {
          // Start of a run of potential words: remember it, decide later.
          copy_it = page_res_it;
          prev_potential_marked = true;
          if (crunch_debug > 1) {
            tprintf("P3 CRUNCHING: \"%s\"\n", word->best_choice->unichar_string().c_str());
          }
        }
      } else {
        found_terrible_word = false;
        // Forget earlier potential crunches
        prev_potential_marked = false;
        if (crunch_debug > 2) {
          tprintf("NO CRUNCH: \"%s\"\n", word->best_choice->unichar_string().c_str());
        }
      }
    }
    page_res_it.forward();
  }
}

/*************************************************************************
 * terrible_word_crunch()
 * Returns true when the word meets any of the criteria below. crunch_mode
 * records which criterion fired and is only used in the debug message:
 *  1 - best choice string is empty or consists only of spaces
 *  2 - rating per char > crunch_terrible_rating
 *  3 - crunch_terrible_garbage is set and garbage_level is G_TERRIBLE
 *  4 - certainty < crunch_poor_garbage_cert and garbage_level != G_OK
 *  5 - rating per char > crunch_poor_garbage_rate and garbage_level != G_OK
 *************************************************************************/
bool Tesseract::terrible_word_crunch(WERD_RES *word, GARBAGE_LEVEL garbage_level) {
  int crunch_mode = 0;

  if (word->best_choice->unichar_string().empty() ||
      (strspn(word->best_choice->unichar_string().c_str(), " ") ==
       word->best_choice->unichar_string().size())) {
    crunch_mode = 1;
  } else {
    // The divisor is the reject map length capped at crunch_rating_max.
    int adjusted_len = word->reject_map.length();
    if (adjusted_len > crunch_rating_max) {
      adjusted_len = crunch_rating_max;
    }
    float rating_per_ch = word->best_choice->rating() / adjusted_len;

    if (rating_per_ch > crunch_terrible_rating) {
      crunch_mode = 2;
    } else if (crunch_terrible_garbage && (garbage_level == G_TERRIBLE)) {
      crunch_mode = 3;
    } else if ((word->best_choice->certainty() < crunch_poor_garbage_cert) &&
               (garbage_level != G_OK)) {
      crunch_mode = 4;
    } else if ((rating_per_ch > crunch_poor_garbage_rate) && (garbage_level != G_OK)) {
      crunch_mode = 5;
    }
  }
  if (crunch_mode > 0) {
    if (crunch_debug > 2) {
      tprintf("Terrible_word_crunch (%d) on \"%s\"\n", crunch_mode,
              word->best_choice->unichar_string().c_str());
    }
    return true;
  } else {
    return false;
  }
}

/*************************************************************************
 * potential_word_crunch()
 * Counts up to three "poor" indicators and returns true when the count
 * reaches crunch_pot_indicators:
 *  - rating per char (length capped at 10) > crunch_pot_poor_rate
 *  - word is crunchable and certainty < crunch_pot_poor_cert
 *  - garbage_level != G_OK
 *************************************************************************/
bool Tesseract::potential_word_crunch(WERD_RES *word, GARBAGE_LEVEL garbage_level,
                                      bool ok_dict_word) {
  float rating_per_ch;
  int adjusted_len;
  const char *str = word->best_choice->unichar_string().c_str();
  const char *lengths = word->best_choice->unichar_lengths().c_str();
  bool word_crunchable;
  int poor_indicator_count = 0;

  // Only words shorter than 3, or unacceptable non-dictionary strings, are
  // crunchable, unless crunch_leave_accept_strings is off.
  word_crunchable =
      !crunch_leave_accept_strings || word->reject_map.length() < 3 ||
      (acceptable_word_string(*word->uch_set, str, lengths) == AC_UNACCEPTABLE && !ok_dict_word);

  adjusted_len = word->reject_map.length();
  if (adjusted_len > 10) {
    adjusted_len = 10;
  }
  rating_per_ch = word->best_choice->rating() / adjusted_len;

  if (rating_per_ch > crunch_pot_poor_rate) {
    if (crunch_debug > 2) {
      tprintf("Potential poor rating on \"%s\"\n", word->best_choice->unichar_string().c_str());
    }
    poor_indicator_count++;
  }

  if (word_crunchable && word->best_choice->certainty() < crunch_pot_poor_cert) {
    if (crunch_debug > 2) {
      tprintf("Potential poor cert on \"%s\"\n", word->best_choice->unichar_string().c_str());
    }
    poor_indicator_count++;
  }

  if (garbage_level != G_OK) {
    if (crunch_debug > 2) {
      tprintf("Potential garbage on \"%s\"\n", word->best_choice->unichar_string().c_str());
    }
    poor_indicator_count++;
  }
  return poor_indicator_count >= crunch_pot_indicators;
}

/*************************************************************************
 * tilde_delete()
 * Walks all words and sets unlv_crunch_mode to the mode returned by
 * word_deletable() for:
 *  - deletable words starting at a W_BOL word and continuing while the
 *    following words are deletable too, and
 *  - a run of deletable words that ends with a W_EOL word (copy_it marks
 *    where that run began).
 * A non-deletable word resets both kinds of run.
 *************************************************************************/
void Tesseract::tilde_delete(PAGE_RES_IT &page_res_it) {
  PAGE_RES_IT copy_it;
  bool deleting_from_bol = false;
  bool marked_delete_point = false;
  int16_t debug_delete_mode;
  CRUNCH_MODE delete_mode;
  int16_t x_debug_delete_mode;
  CRUNCH_MODE x_delete_mode;

  page_res_it.restart_page();
  while (page_res_it.word() != nullptr) {
    WERD_RES *word = page_res_it.word();

    delete_mode = word_deletable(word, debug_delete_mode);
    if (delete_mode != CR_NONE) {
      if (word->word->flag(W_BOL) || deleting_from_bol) {
        if (crunch_debug > 0) {
          tprintf("BOL CRUNCH DELETING(%d): \"%s\"\n", debug_delete_mode,
                  word->best_choice->unichar_string().c_str());
        }
        word->unlv_crunch_mode = delete_mode;
        deleting_from_bol = true;
      } else if (word->word->flag(W_EOL)) {
        if (marked_delete_point) {
          // Apply the deletion to the pending run that led up to this word.
          while (copy_it.word() != word) {
            x_delete_mode = word_deletable(copy_it.word(), x_debug_delete_mode);
            if (crunch_debug > 0) {
              tprintf("EOL CRUNCH DELETING(%d): \"%s\"\n", x_debug_delete_mode,
                      copy_it.word()->best_choice->unichar_string().c_str());
            }
            copy_it.word()->unlv_crunch_mode = x_delete_mode;
            copy_it.forward();
          }
        }
        if (crunch_debug > 0) {
          tprintf("EOL CRUNCH DELETING(%d): \"%s\"\n", debug_delete_mode,
                  word->best_choice->unichar_string().c_str());
        }
        word->unlv_crunch_mode = delete_mode;
        deleting_from_bol = false;
        marked_delete_point = false;
      } else {
        // Deletable word in mid row: remember where the run started.
        if (!marked_delete_point) {
          copy_it = page_res_it;
          marked_delete_point = true;
        }
      }
    } else {
      deleting_from_bol = false;
      // Forget earlier potential crunches
      marked_delete_point = false;
    }
    /*
  The following step has been left till now as the tess fails are used to
  determine if the word is deletable.
*/
    if (!crunch_early_merge_tess_fails) {
      word->merge_tess_fails();
    }
    page_res_it.forward();
  }
}

/*************************************************************************
 * convert_bad_unlv_chs()
 * Rewrites the best choice in place: "~" becomes "-" and "^" becomes " ".
 * A replaced character that was accepted is marked setrej_unlv_rej().
 *************************************************************************/
void Tesseract::convert_bad_unlv_chs(WERD_RES *word_res) {
  int i;
  UNICHAR_ID unichar_dash = word_res->uch_set->unichar_to_id("-");
  UNICHAR_ID unichar_space = word_res->uch_set->unichar_to_id(" ");
  UNICHAR_ID unichar_tilde = word_res->uch_set->unichar_to_id("~");
  UNICHAR_ID unichar_pow = word_res->uch_set->unichar_to_id("^");
  for (i = 0; i < word_res->reject_map.length(); ++i) {
    if (word_res->best_choice->unichar_id(i) == unichar_tilde) {
      word_res->best_choice->set_unichar_id(unichar_dash, i);
      if (word_res->reject_map[i].accepted()) {
        word_res->reject_map[i].setrej_unlv_rej();
      }
    }
    if (word_res->best_choice->unichar_id(i) == unichar_pow) {
      word_res->best_choice->set_unichar_id(unichar_space, i);
      if (word_res->reject_map[i].accepted()) {
        word_res->reject_map[i].setrej_unlv_rej();
      }
    }
  }
}

/*************************************************************************
 * garbage_word()
 * Grades the best choice string as G_NEVER_CRUNCH, G_OK, G_DODGY or
 * G_TERRIBLE. A single pass over the unichars runs a small state machine
 * (upper / lower / digit runs, anything else is JUNK) that collects:
 *  - isolated digits and isolated alphas (a run of exactly one),
 *  - longest upper and lower case runs,
 *  - longest repetition of the same alpha,
 *  - tess rejects (single byte ' ') and other bad chars.
 * The grade is then derived from those counters.
 *************************************************************************/
GARBAGE_LEVEL Tesseract::garbage_word(WERD_RES *word, bool ok_dict_word) {
  enum STATES {
    JUNK,
    FIRST_UPPER,
    FIRST_LOWER,
    FIRST_NUM,
    SUBSEQUENT_UPPER,
    SUBSEQUENT_LOWER,
    SUBSEQUENT_NUM
  };
  const char *str = word->best_choice->unichar_string().c_str();
  const char *lengths = word->best_choice->unichar_lengths().c_str();
  STATES state = JUNK;
  int len = 0;
  int isolated_digits = 0;
  int isolated_alphas = 0;
  int bad_char_count = 0;
  int tess_rejs = 0;
  int dodgy_chars = 0;
  int ok_chars;
  UNICHAR_ID last_char = -1;
  int alpha_repetition_count = 0;
  int longest_alpha_repetition_count = 0;
  int longest_lower_run_len = 0;
  int lower_string_count = 0;
  int longest_upper_run_len = 0;
  int upper_string_count = 0;
  int total_alpha_count = 0;
  int total_digit_count = 0;

  // Step one unichar at a time: *lengths is the byte length of the unichar
  // that starts at str.
  for (; *str != '\0'; str += *(lengths++)) {
    len++;
    if (word->uch_set->get_isupper(str, *lengths)) {
      total_alpha_count++;
      switch (state) {
        case SUBSEQUENT_UPPER:
        case FIRST_UPPER:
          state = SUBSEQUENT_UPPER;
          upper_string_count++;
          if (longest_upper_run_len < upper_string_count) {
            longest_upper_run_len = upper_string_count;
          }
          if (last_char == word->uch_set->unichar_to_id(str, *lengths)) {
            alpha_repetition_count++;
            if (longest_alpha_repetition_count < alpha_repetition_count) {
              longest_alpha_repetition_count = alpha_repetition_count;
            }
          } else {
            last_char = word->uch_set->unichar_to_id(str, *lengths);
            alpha_repetition_count = 1;
          }
          break;
        case FIRST_NUM:
          isolated_digits++;
          // Fall through.
        default:
          state = FIRST_UPPER;
          last_char = word->uch_set->unichar_to_id(str, *lengths);
          alpha_repetition_count = 1;
          upper_string_count = 1;
          break;
      }
    } else if (word->uch_set->get_islower(str, *lengths)) {
      total_alpha_count++;
      switch (state) {
        case SUBSEQUENT_LOWER:
        case FIRST_LOWER:
          state = SUBSEQUENT_LOWER;
          lower_string_count++;
          if (longest_lower_run_len < lower_string_count) {
            longest_lower_run_len = lower_string_count;
          }
          if (last_char == word->uch_set->unichar_to_id(str, *lengths)) {
            alpha_repetition_count++;
            if (longest_alpha_repetition_count < alpha_repetition_count) {
              longest_alpha_repetition_count = alpha_repetition_count;
            }
          } else {
            last_char = word->uch_set->unichar_to_id(str, *lengths);
            alpha_repetition_count = 1;
          }
          break;
        case FIRST_NUM:
          isolated_digits++;
          // Fall through.
        default:
          state = FIRST_LOWER;
          last_char = word->uch_set->unichar_to_id(str, *lengths);
          alpha_repetition_count = 1;
          lower_string_count = 1;
          break;
      }
    } else if (word->uch_set->get_isdigit(str, *lengths)) {
      total_digit_count++;
      switch (state) {
        case FIRST_NUM:
          state = SUBSEQUENT_NUM;
          // Falls through to the break below.
        case SUBSEQUENT_NUM:
          break;
        case FIRST_UPPER:
        case FIRST_LOWER:
          isolated_alphas++;
          // Fall through.
        default:
          state = FIRST_NUM;
          break;
      }
    } else {
      // Neither alpha nor digit: a single byte space counts as a tess
      // reject, anything else as a bad char.
      if (*lengths == 1 && *str == ' ') {
        tess_rejs++;
      } else {
        bad_char_count++;
      }
      switch (state) {
        case FIRST_NUM:
          isolated_digits++;
          break;
        case FIRST_UPPER:
        case FIRST_LOWER:
          isolated_alphas++;
        default:
          break;
      }
      state = JUNK;
    }
  }

  // Account for a run of length one that ends the string.
  switch (state) {
    case FIRST_NUM:
      isolated_digits++;
      break;
    case FIRST_UPPER:
    case FIRST_LOWER:
      isolated_alphas++;
    default:
      break;
  }

  if (crunch_include_numerals) {
    total_alpha_count += total_digit_count - isolated_digits;
  }

  // NOTE(review): the loop above advanced str (and lengths) to the string
  // terminator, so the acceptable_word_string() and strpbrk() calls below
  // are handed the empty tail, not the whole word - confirm whether that is
  // intended before relying on these tests.
  if (crunch_leave_ok_strings && len >= 4 && 2 * (total_alpha_count - isolated_alphas) > len &&
      longest_alpha_repetition_count < crunch_long_repetitions) {
    if ((crunch_accept_ok &&
         acceptable_word_string(*word->uch_set, str, lengths) != AC_UNACCEPTABLE) ||
        longest_lower_run_len > crunch_leave_lc_strings ||
        longest_upper_run_len > crunch_leave_uc_strings) {
      return G_NEVER_CRUNCH;
    }
  }
  if (word->reject_map.length() > 1 && strpbrk(str, " ") == nullptr &&
      (word->best_choice->permuter() == SYSTEM_DAWG_PERM ||
       word->best_choice->permuter() == FREQ_DAWG_PERM ||
       word->best_choice->permuter() == USER_DAWG_PERM ||
       word->best_choice->permuter() == NUMBER_PERM ||
       acceptable_word_string(*word->uch_set, str, lengths) != AC_UNACCEPTABLE || ok_dict_word)) {
    return G_OK;
  }

  ok_chars = len - bad_char_count - isolated_digits - isolated_alphas - tess_rejs;

  if (crunch_debug > 3) {
    tprintf("garbage_word: \"%s\"\n", word->best_choice->unichar_string().c_str());
    tprintf("LEN: %d bad: %d iso_N: %d iso_A: %d rej: %d\n", len, bad_char_count,
            isolated_digits, isolated_alphas, tess_rejs);
  }
  if (bad_char_count == 0 && tess_rejs == 0 &&
      (len > isolated_digits + isolated_alphas || len <= 2)) {
    return G_OK;
  }

  if (tess_rejs > ok_chars || (tess_rejs > 0 && (bad_char_count + tess_rejs) * 2 > len)) {
    return G_TERRIBLE;
  }

  // Tess rejects count double; isolated chars only count for longer words.
  if (len > 4) {
    dodgy_chars = 2 * tess_rejs + bad_char_count + isolated_digits + isolated_alphas;
    if (dodgy_chars > 5 || (dodgy_chars / static_cast<float>(len)) > 0.5) {
      return G_DODGY;
    } else {
      return G_OK;
    }
  } else {
    dodgy_chars = 2 * tess_rejs + bad_char_count;
    if ((len == 4 && dodgy_chars > 2) || (len == 3 && dodgy_chars > 2) || dodgy_chars >= len) {
      return G_DODGY;
    } else {
      return G_OK;
    }
  }
}

/*************************************************************************
 * word_deletable()
 * DELETE WERDS AT ENDS OF ROWS IF
 * Word is crunched &&
 * ( string length = 0 OR
 * > 50% of chars are "|" (before merging) OR
 * certainty < -10 OR
 * rating /char > 60 OR
 * TOP of word is more than 0.5 xht BELOW baseline OR
 * BOTTOM of word is more than 0.5 xht ABOVE xht OR
 * length of word < 3xht OR
 * height of word < 0.7 xht OR
 * height of word > 3.0 xht OR
 * >75% of the outline BBs have longest dimension < 0.5xht
 *
 * Returns the crunch mode to apply (CR_NONE when the word is kept) and
 * stores in delete_mode a number identifying the test that fired (0 when
 * none did); callers in this file only print it in debug output.
 * NOTE(review): the thresholds actually used are the crunch_del_* values
 * read below; the figures quoted above are not visible in this code.
 *************************************************************************/

CRUNCH_MODE Tesseract::word_deletable(WERD_RES *word, int16_t &delete_mode) {
  int word_len = word->reject_map.length();
  float rating_per_ch;
  TBOX box; // BB of word

  if (word->unlv_crunch_mode == CR_NONE) {
    delete_mode = 0;
    return CR_NONE;
  }

  if (word_len == 0) {
    delete_mode = 1;
    return CR_DELETE;
  }

  if (word->rebuild_word != nullptr) {
    // Cube leaves rebuild_word nullptr.
    box = word->rebuild_word->bounding_box();
    if (box.height() < crunch_del_min_ht * kBlnXHeight) {
      delete_mode = 4;
      return CR_DELETE;
    }

    if (noise_outlines(word->rebuild_word)) {
      delete_mode = 5;
      return CR_DELETE;
    }
  }

  if ((failure_count(word) * 1.5) > word_len) {
    delete_mode = 2;
    return CR_LOOSE_SPACE;
  }

  if (word->best_choice->certainty() < crunch_del_cert) {
    delete_mode = 7;
    return CR_LOOSE_SPACE;
  }

  rating_per_ch = word->best_choice->rating() / word_len;

  if (rating_per_ch > crunch_del_rating) {
    delete_mode = 8;
    return CR_LOOSE_SPACE;
  }

  // When rebuild_word is nullptr the tests below run on the default
  // constructed box.
  if (box.top() < kBlnBaselineOffset - crunch_del_low_word * kBlnXHeight) {
    delete_mode = 9;
    return CR_LOOSE_SPACE;
  }

  if (box.bottom() > kBlnBaselineOffset + crunch_del_high_word * kBlnXHeight) {
    delete_mode = 10;
    return CR_LOOSE_SPACE;
  }

  if (box.height() > crunch_del_max_ht * kBlnXHeight) {
    delete_mode = 11;
    return CR_LOOSE_SPACE;
  }

  if (box.width() < crunch_del_min_width * kBlnXHeight) {
    delete_mode = 3;
    return CR_LOOSE_SPACE;
  }

  delete_mode = 0;
  return CR_NONE;
}

/*************************************************************************
 * failure_count()
 * Returns the number of ' ' bytes in the best choice string.
 *************************************************************************/
int16_t Tesseract::failure_count(WERD_RES *word) {
  const char *str = word->best_choice->unichar_string().c_str();
  int tess_rejs = 0;

  for (; *str != '\0'; str++) {
    if (*str == ' ') {
      tess_rejs++;
    }
  }
  return tess_rejs;
}

/*************************************************************************
 * noise_outlines()
 * Counts the outlines of all blobs in the word, and how many of them have
 * a bounding box whose larger dimension is below
 * kBlnXHeight * crunch_small_outlines_size.
 * Returns true when every outline is that small, which includes a word
 * with no outlines at all.
 * NOTE(review): the word_deletable() header comment speaks of ">75%" of
 * the outlines; this test requires all of them.
 *************************************************************************/
bool Tesseract::noise_outlines(TWERD *word) {
  TBOX box; // BB of outline
  int16_t outline_count = 0;
  int16_t small_outline_count = 0;
  int16_t max_dimension;
  float small_limit = kBlnXHeight * crunch_small_outlines_size;

  for (unsigned b = 0; b < word->NumBlobs(); ++b) {
    TBLOB *blob = word->blobs[b];
    for (TESSLINE *ol = blob->outlines; ol != nullptr; ol = ol->next) {
      outline_count++;
      box = ol->bounding_box();
      if (box.height() > box.width()) {
        max_dimension = box.height();
      } else {
        max_dimension = box.width();
      }
      if (max_dimension < small_limit) {
        small_outline_count++;
      }
    }
  }
  return small_outline_count >= outline_count;
}

} // namespace tesseract
