comparison mupdf-source/thirdparty/tesseract/src/classify/adaptmatch.cpp @ 2:b50eed0cc0ef upstream
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
The directory name has changed: the expanded directory no longer contains a version number.
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Mon, 15 Sep 2025 11:43:07 +0200 |
| parents | |
| children | |
| 1:1d09e1dec1d9 | 2:b50eed0cc0ef |
|---|---|
| 1 /****************************************************************************** | |
| 2 ** Filename: adaptmatch.cpp | |
| 3 ** Purpose: High level adaptive matcher. | |
| 4 ** Author: Dan Johnson | |
| 5 ** | |
| 6 ** (c) Copyright Hewlett-Packard Company, 1988. | |
| 7 ** Licensed under the Apache License, Version 2.0 (the "License"); | |
| 8 ** you may not use this file except in compliance with the License. | |
| 9 ** You may obtain a copy of the License at | |
| 10 ** http://www.apache.org/licenses/LICENSE-2.0 | |
| 11 ** Unless required by applicable law or agreed to in writing, software | |
| 12 ** distributed under the License is distributed on an "AS IS" BASIS, | |
| 13 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
| 14 ** See the License for the specific language governing permissions and | |
| 15 ** limitations under the License. | |
| 16 ******************************************************************************/ | |
| 17 | |
| 18 /*----------------------------------------------------------------------------- | |
| 19 Include Files and Type Defines | |
| 20 -----------------------------------------------------------------------------*/ | |
| 21 #ifdef HAVE_CONFIG_H | |
| 22 # include "config_auto.h" | |
| 23 #endif | |
| 24 | |
| 25 #include "adaptive.h" // for ADAPT_CLASS | |
| 26 #include "ambigs.h" // for UnicharIdVector, UnicharAmbigs | |
| 27 #include "bitvec.h" // for FreeBitVector, NewBitVector, BIT_VECTOR | |
| 28 #include "blobs.h" // for TBLOB, TWERD | |
| 29 #include "classify.h" // for Classify, CST_FRAGMENT, CST_WHOLE | |
| 30 #include "dict.h" // for Dict | |
| 31 #include "errcode.h" // for ASSERT_HOST | |
| 32 #include "featdefs.h" // for CharNormDesc | |
| 33 #include "float2int.h" // for BASELINE_Y_SHIFT | |
| 34 #include "fontinfo.h" // for ScoredFont, FontSet | |
| 35 #include "intfx.h" // for BlobToTrainingSample, INT_FX_RESULT_S... | |
| 36 #include "intmatcher.h" // for CP_RESULT_STRUCT, IntegerMatcher | |
| 37 #include "intproto.h" // for INT_FEATURE_STRUCT, (anonymous), Clas... | |
| 38 #include "matchdefs.h" // for CLASS_ID, FEATURE_ID, PROTO_ID, NO_PROTO | |
| 39 #include "mfoutline.h" // for baseline, character, MF_SCALE_FACTOR | |
| 40 #include "normalis.h" // for DENORM, kBlnBaselineOffset, kBlnXHeight | |
| 41 #include "normfeat.h" // for ActualOutlineLength, CharNormLength | |
| 42 #include "ocrfeatures.h" // for FEATURE_STRUCT, FEATURE | |
| 43 #include "oldlist.h" // for push, delete_d | |
| 44 #include "outfeat.h" // for OutlineFeatDir, OutlineFeatLength | |
| 45 #include "pageres.h" // for WERD_RES | |
| 46 #include "params.h" // for IntParam, BoolParam, DoubleParam, Str... | |
| 47 #include "picofeat.h" // for PicoFeatDir, PicoFeatX, PicoFeatY | |
| 48 #include "protos.h" // for PROTO_STRUCT, FillABC | |
| 49 #include "ratngs.h" // for BLOB_CHOICE_IT, BLOB_CHOICE_LIST, BLO... | |
| 50 #include "rect.h" // for TBOX | |
| 51 #include "scrollview.h" // for ScrollView, ScrollView::BROWN, Scroll... | |
| 52 #include "seam.h" // for SEAM | |
| 53 #include "shapeclassifier.h" // for ShapeClassifier | |
| 54 #include "shapetable.h" // for UnicharRating, ShapeTable, Shape, Uni... | |
| 55 #include "tessclassifier.h" // for TessClassifier | |
| 56 #include "tessdatamanager.h" // for TessdataManager, TESSDATA_INTTEMP | |
| 57 #include "tprintf.h" // for tprintf | |
| 58 #include "trainingsample.h" // for TrainingSample | |
| 59 #include "unicharset.h" // for UNICHARSET, CHAR_FRAGMENT, UNICHAR_SPACE | |
| 60 #include "unicity_table.h" // for UnicityTable | |
| 61 | |
| 62 #include <tesseract/unichar.h> // for UNICHAR_ID, INVALID_UNICHAR_ID | |
| 63 #include "helpers.h" // for IntCastRounded, ClipToRange | |
| 64 #include "serialis.h" // for TFile | |
| 65 | |
| 66 #include <algorithm> // for max, min | |
| 67 #include <cassert> // for assert | |
| 68 #include <cmath> // for fabs | |
| 69 #include <cstdint> // for INT32_MAX, UINT8_MAX | |
| 70 #include <cstdio> // for fflush, fclose, fopen, stdout, FILE | |
| 71 #include <cstring> // for strstr, memset, strcmp | |
| 72 | |
| 73 namespace tesseract { | |
| 74 | |
| 75 // TODO: The parameter classify_enable_adaptive_matcher can cause | |
| 76 // a segmentation fault if it is set to false (issue #256), | |
| 77 // so override it here. | |
| 78 #define classify_enable_adaptive_matcher true | |
| 79 | |
| 80 #define ADAPT_TEMPLATE_SUFFIX ".a" | |
| 81 | |
| 82 #define MAX_MATCHES 10 | |
| 83 #define UNLIKELY_NUM_FEAT 200 | |
| 84 #define NO_DEBUG 0 | |
| 85 #define MAX_ADAPTABLE_WERD_SIZE 40 | |
| 86 | |
| 87 #define ADAPTABLE_WERD_ADJUSTMENT (0.05) | |
| 88 | |
| 89 #define Y_DIM_OFFSET (Y_SHIFT - BASELINE_Y_SHIFT) | |
| 90 | |
| 91 #define WORST_POSSIBLE_RATING (0.0f) | |
| 92 | |
| 93 struct ADAPT_RESULTS { | |
| 94 int32_t BlobLength; | |
| 95 bool HasNonfragment; | |
| 96 UNICHAR_ID best_unichar_id; | |
| 97 int best_match_index; | |
| 98 float best_rating; | |
| 99 std::vector<UnicharRating> match; | |
| 100 std::vector<CP_RESULT_STRUCT> CPResults; | |
| 101 | |
| 102 /// Initializes data members to the default values. Sets the initial | |
| 103 /// rating of each class to be the worst possible rating (1.0). | |
| 104 inline void Initialize() { | |
| 105 BlobLength = INT32_MAX; | |
| 106 HasNonfragment = false; | |
| 107 ComputeBest(); | |
| 108 } | |
| 109 // Computes best_unichar_id, best_match_index and best_rating. | |
| 110 void ComputeBest() { | |
| 111 best_unichar_id = INVALID_UNICHAR_ID; | |
| 112 best_match_index = -1; | |
| 113 best_rating = WORST_POSSIBLE_RATING; | |
| 114 for (unsigned i = 0; i < match.size(); ++i) { | |
| 115 if (match[i].rating > best_rating) { | |
| 116 best_rating = match[i].rating; | |
| 117 best_unichar_id = match[i].unichar_id; | |
| 118 best_match_index = i; | |
| 119 } | |
| 120 } | |
| 121 } | |
| 122 }; | |
| 123 | |
| 124 struct PROTO_KEY { | |
| 125 ADAPT_TEMPLATES_STRUCT *Templates; | |
| 126 CLASS_ID ClassId; | |
| 127 int ConfigId; | |
| 128 }; | |
| 129 | |
| 130 // Sort function to sort ratings appropriately by descending rating. | |
| 131 static bool SortDescendingRating(const UnicharRating &a, const UnicharRating &b) { | |
| 132 if (a.rating != b.rating) { | |
| 133 return a.rating > b.rating; | |
| 134 } else { | |
| 135 return a.unichar_id < b.unichar_id; | |
| 136 } | |
| 137 } | |
| 138 | |
| 139 /*----------------------------------------------------------------------------- | |
| 140 Private Macros | |
| 141 -----------------------------------------------------------------------------*/ | |
| 142 inline bool MarginalMatch(float confidence, float matcher_great_threshold) { | |
| 143 return (1.0f - confidence) > matcher_great_threshold; | |
| 144 } | |
| 145 | |
| 146 /*----------------------------------------------------------------------------- | |
| 147 Private Function Prototypes | |
| 148 -----------------------------------------------------------------------------*/ | |
| 149 // Returns the index of the given id in results, if present, or the size of the | |
| 150 // vector (index it will go at) if not present. | |
| 151 static unsigned FindScoredUnichar(UNICHAR_ID id, const ADAPT_RESULTS &results) { | |
| 152 for (unsigned i = 0; i < results.match.size(); i++) { | |
| 153 if (results.match[i].unichar_id == id) { | |
| 154 return i; | |
| 155 } | |
| 156 } | |
| 157 return results.match.size(); | |
| 158 } | |
| 159 | |
| 160 // Returns the current rating for a unichar id if we have rated it, defaulting | |
| 161 // to WORST_POSSIBLE_RATING. | |
| 162 static float ScoredUnichar(UNICHAR_ID id, const ADAPT_RESULTS &results) { | |
| 163 unsigned index = FindScoredUnichar(id, results); | |
| 164 if (index >= results.match.size()) { | |
| 165 return WORST_POSSIBLE_RATING; | |
| 166 } | |
| 167 return results.match[index].rating; | |
| 168 } | |
| 169 | |
| 170 void InitMatcherRatings(float *Rating); | |
| 171 | |
| 172 int MakeTempProtoPerm(void *item1, void *item2); | |
| 173 | |
| 174 void SetAdaptiveThreshold(float Threshold); | |
| 175 | |
| 176 /*----------------------------------------------------------------------------- | |
| 177 Public Code | |
| 178 -----------------------------------------------------------------------------*/ | |
| 179 /** | |
| 180 * This routine calls the adaptive matcher | |
| 181 * which returns (in an array) the class id of each | |
| 182 * class matched. | |
| 183 * | |
| 184 * It also returns the number of classes matched. | |
| 185 * For each class matched it places the best rating | |
| 186 * found for that class into the Ratings array. | |
| 187 * | |
| 188 * Bad matches are then removed so that they don't | |
| 189 * need to be sorted. The remaining good matches are | |
| 190 * then sorted and converted to choices. | |
| 191 * | |
| 192 * This routine also performs some simple speckle | |
| 193 * filtering. | |
| 194 * | |
| 195 * @param Blob blob to be classified | |
| 196 * @param[out] Choices List of choices found by adaptive matcher. | |
| 197 * filled on return with the choices found by the | |
| 198 * class pruner and the ratings there from. Also | |
| 199 * contains the detailed results of the integer matcher. | |
| 200 * | |
| 201 */ | |
| 202 void Classify::AdaptiveClassifier(TBLOB *Blob, BLOB_CHOICE_LIST *Choices) { | |
| 203 assert(Choices != nullptr); | |
| 204 auto *Results = new ADAPT_RESULTS; | |
| 205 Results->Initialize(); | |
| 206 | |
| 207 ASSERT_HOST(AdaptedTemplates != nullptr); | |
| 208 | |
| 209 DoAdaptiveMatch(Blob, Results); | |
| 210 | |
| 211 RemoveBadMatches(Results); | |
| 212 std::sort(Results->match.begin(), Results->match.end(), SortDescendingRating); | |
| 213 RemoveExtraPuncs(Results); | |
| 214 Results->ComputeBest(); | |
| 215 ConvertMatchesToChoices(Blob->denorm(), Blob->bounding_box(), Results, Choices); | |
| 216 | |
| 217 // TODO(rays) Move to before ConvertMatchesToChoices! | |
| 218 if (LargeSpeckle(*Blob) || Choices->empty()) { | |
| 219 AddLargeSpeckleTo(Results->BlobLength, Choices); | |
| 220 } | |
| 221 | |
| 222 if (matcher_debug_level >= 1) { | |
| 223 tprintf("AD Matches = "); | |
| 224 PrintAdaptiveMatchResults(*Results); | |
| 225 } | |
| 226 | |
| 227 #ifndef GRAPHICS_DISABLED | |
| 228 if (classify_enable_adaptive_debugger) { | |
| 229 DebugAdaptiveClassifier(Blob, Results); | |
| 230 } | |
| 231 #endif | |
| 232 | |
| 233 delete Results; | |
| 234 } /* AdaptiveClassifier */ | |
| 235 | |
| 236 #ifndef GRAPHICS_DISABLED | |
| 237 | |
| 238 // If *win is nullptr, sets it to a new ScrollView() object with title msg. | |
| 239 // Clears the window and draws baselines. | |
| 240 void Classify::RefreshDebugWindow(ScrollView **win, const char *msg, int y_offset, | |
| 241 const TBOX &wbox) { | |
| 242 const int kSampleSpaceWidth = 500; | |
| 243 if (*win == nullptr) { | |
| 244 *win = new ScrollView(msg, 100, y_offset, kSampleSpaceWidth * 2, 200, kSampleSpaceWidth * 2, | |
| 245 200, true); | |
| 246 } | |
| 247 (*win)->Clear(); | |
| 248 (*win)->Pen(64, 64, 64); | |
| 249 (*win)->Line(-kSampleSpaceWidth, kBlnBaselineOffset, kSampleSpaceWidth, kBlnBaselineOffset); | |
| 250 (*win)->Line(-kSampleSpaceWidth, kBlnXHeight + kBlnBaselineOffset, kSampleSpaceWidth, | |
| 251 kBlnXHeight + kBlnBaselineOffset); | |
| 252 (*win)->ZoomToRectangle(wbox.left(), wbox.top(), wbox.right(), wbox.bottom()); | |
| 253 } | |
| 254 | |
| 255 #endif // !GRAPHICS_DISABLED | |
| 256 | |
| 257 // Learns the given word using its chopped_word, seam_array, denorm, | |
| 258 // box_word, best_state, and correct_text to learn both correctly and | |
| 259 // incorrectly segmented blobs. If fontname is not nullptr, then LearnBlob | |
| 260 // is called and the data will be saved in an internal buffer. | |
| 261 // Otherwise AdaptToBlob is called for adaption within a document. | |
| 262 void Classify::LearnWord(const char *fontname, WERD_RES *word) { | |
| 263 int word_len = word->correct_text.size(); | |
| 264 if (word_len == 0) { | |
| 265 return; | |
| 266 } | |
| 267 | |
| 268 float *thresholds = nullptr; | |
| 269 if (fontname == nullptr) { | |
| 270 // Adaption mode. | |
| 271 if (!EnableLearning || word->best_choice == nullptr) { | |
| 272 return; // Can't or won't adapt. | |
| 273 } | |
| 274 | |
| 275 if (classify_learning_debug_level >= 1) { | |
| 276 tprintf("\n\nAdapting to word = %s\n", word->best_choice->debug_string().c_str()); | |
| 277 } | |
| 278 thresholds = new float[word_len]; | |
| 279 word->ComputeAdaptionThresholds(getDict().certainty_scale, matcher_perfect_threshold, | |
| 280 matcher_good_threshold, matcher_rating_margin, thresholds); | |
| 281 } | |
| 282 int start_blob = 0; | |
| 283 | |
| 284 #ifndef GRAPHICS_DISABLED | |
| 285 if (classify_debug_character_fragments) { | |
| 286 if (learn_fragmented_word_debug_win_ != nullptr) { | |
| 287 learn_fragmented_word_debug_win_->Wait(); | |
| 288 } | |
| 289 RefreshDebugWindow(&learn_fragments_debug_win_, "LearnPieces", 400, | |
| 290 word->chopped_word->bounding_box()); | |
| 291 RefreshDebugWindow(&learn_fragmented_word_debug_win_, "LearnWord", 200, | |
| 292 word->chopped_word->bounding_box()); | |
| 293 word->chopped_word->plot(learn_fragmented_word_debug_win_); | |
| 294 ScrollView::Update(); | |
| 295 } | |
| 296 #endif // !GRAPHICS_DISABLED | |
| 297 | |
| 298 for (int ch = 0; ch < word_len; ++ch) { | |
| 299 if (classify_debug_character_fragments) { | |
| 300 tprintf("\nLearning %s\n", word->correct_text[ch].c_str()); | |
| 301 } | |
| 302 if (word->correct_text[ch].length() > 0) { | |
| 303 float threshold = thresholds != nullptr ? thresholds[ch] : 0.0f; | |
| 304 | |
| 305 LearnPieces(fontname, start_blob, word->best_state[ch], threshold, CST_WHOLE, | |
| 306 word->correct_text[ch].c_str(), word); | |
| 307 | |
| 308 if (word->best_state[ch] > 1 && !disable_character_fragments) { | |
| 309 // Check that the character breaks into meaningful fragments | |
| 310 // that each match a whole character with at least | |
| 311 // classify_character_fragments_garbage_certainty_threshold | |
| 312 bool garbage = false; | |
| 313 int frag; | |
| 314 for (frag = 0; frag < word->best_state[ch]; ++frag) { | |
| 315 TBLOB *frag_blob = word->chopped_word->blobs[start_blob + frag]; | |
| 316 if (classify_character_fragments_garbage_certainty_threshold < 0) { | |
| 317 garbage |= LooksLikeGarbage(frag_blob); | |
| 318 } | |
| 319 } | |
| 320 // Learn the fragments. | |
| 321 if (!garbage) { | |
| 322 bool pieces_all_natural = word->PiecesAllNatural(start_blob, word->best_state[ch]); | |
| 323 if (pieces_all_natural || !prioritize_division) { | |
| 324 for (frag = 0; frag < word->best_state[ch]; ++frag) { | |
| 325 std::vector<std::string> tokens = split(word->correct_text[ch], ' '); | |
| 326 | |
| 327 tokens[0] = CHAR_FRAGMENT::to_string(tokens[0].c_str(), frag, word->best_state[ch], | |
| 328 pieces_all_natural); | |
| 329 | |
| 330 std::string full_string; | |
| 331 for (unsigned i = 0; i < tokens.size(); i++) { | |
| 332 full_string += tokens[i]; | |
| 333 if (i != tokens.size() - 1) { | |
| 334 full_string += ' '; | |
| 335 } | |
| 336 } | |
| 337 LearnPieces(fontname, start_blob + frag, 1, threshold, CST_FRAGMENT, | |
| 338 full_string.c_str(), word); | |
| 339 } | |
| 340 } | |
| 341 } | |
| 342 } | |
| 343 | |
| 344 // TODO(rays): re-enable this part of the code when we switch to the | |
| 345 // new classifier that needs to see examples of garbage. | |
| 346 /* | |
| 347 if (word->best_state[ch] > 1) { | |
| 348 // If the next blob is good, make junk with the rightmost fragment. | |
| 349 if (ch + 1 < word_len && word->correct_text[ch + 1].length() > 0) { | |
| 350 LearnPieces(fontname, start_blob + word->best_state[ch] - 1, | |
| 351 word->best_state[ch + 1] + 1, | |
| 352 threshold, CST_IMPROPER, INVALID_UNICHAR, word); | |
| 353 } | |
| 354 // If the previous blob is good, make junk with the leftmost fragment. | |
| 355 if (ch > 0 && word->correct_text[ch - 1].length() > 0) { | |
| 356 LearnPieces(fontname, start_blob - word->best_state[ch - 1], | |
| 357 word->best_state[ch - 1] + 1, | |
| 358 threshold, CST_IMPROPER, INVALID_UNICHAR, word); | |
| 359 } | |
| 360 } | |
| 361 // If the next blob is good, make a join with it. | |
| 362 if (ch + 1 < word_len && word->correct_text[ch + 1].length() > 0) { | |
| 363 std::string joined_text = word->correct_text[ch]; | |
| 364 joined_text += word->correct_text[ch + 1]; | |
| 365 LearnPieces(fontname, start_blob, | |
| 366 word->best_state[ch] + word->best_state[ch + 1], | |
| 367 threshold, CST_NGRAM, joined_text.c_str(), word); | |
| 368 } | |
| 369 */ | |
| 370 } | |
| 371 start_blob += word->best_state[ch]; | |
| 372 } | |
| 373 delete[] thresholds; | |
| 374 } // LearnWord. | |
| 375 | |
| 376 // Builds a blob of length fragments, from the word, starting at start, | |
| 377 // and then learns it, as having the given correct_text. | |
| 378 // If fontname is not nullptr, then LearnBlob is called and the data will be | |
| 379 // saved in an internal buffer for static training. | |
| 380 // Otherwise AdaptToBlob is called for adaption within a document. | |
| 381 // threshold is a magic number required by AdaptToChar and generated by | |
| 382 // ComputeAdaptionThresholds. | |
| 383 // Although it can be partly inferred from the string, segmentation is | |
| 384 // provided to explicitly clarify the character segmentation. | |
| 385 void Classify::LearnPieces(const char *fontname, int start, int length, float threshold, | |
| 386 CharSegmentationType segmentation, const char *correct_text, | |
| 387 WERD_RES *word) { | |
| 388 // TODO(daria) Remove/modify this if/when we want | |
| 389 // to train and/or adapt to n-grams. | |
| 390 if (segmentation != CST_WHOLE && (segmentation != CST_FRAGMENT || disable_character_fragments)) { | |
| 391 return; | |
| 392 } | |
| 393 | |
| 394 if (length > 1) { | |
| 395 SEAM::JoinPieces(word->seam_array, word->chopped_word->blobs, start, start + length - 1); | |
| 396 } | |
| 397 TBLOB *blob = word->chopped_word->blobs[start]; | |
| 398 // Rotate the blob if needed for classification. | |
| 399 TBLOB *rotated_blob = blob->ClassifyNormalizeIfNeeded(); | |
| 400 if (rotated_blob == nullptr) { | |
| 401 rotated_blob = blob; | |
| 402 } | |
| 403 | |
| 404 #ifndef GRAPHICS_DISABLED | |
| 405 // Draw debug windows showing the blob that is being learned if needed. | |
| 406 if (strcmp(classify_learn_debug_str.c_str(), correct_text) == 0) { | |
| 407 RefreshDebugWindow(&learn_debug_win_, "LearnPieces", 600, word->chopped_word->bounding_box()); | |
| 408 rotated_blob->plot(learn_debug_win_, ScrollView::GREEN, ScrollView::BROWN); | |
| 409 learn_debug_win_->Update(); | |
| 410 learn_debug_win_->Wait(); | |
| 411 } | |
| 412 if (classify_debug_character_fragments && segmentation == CST_FRAGMENT) { | |
| 413 ASSERT_HOST(learn_fragments_debug_win_ != nullptr); // set up in LearnWord | |
| 414 blob->plot(learn_fragments_debug_win_, ScrollView::BLUE, ScrollView::BROWN); | |
| 415 learn_fragments_debug_win_->Update(); | |
| 416 } | |
| 417 #endif // !GRAPHICS_DISABLED | |
| 418 | |
| 419 if (fontname != nullptr) { | |
| 420 classify_norm_method.set_value(character); // force char norm spc 30/11/93 | |
| 421 tess_bn_matching.set_value(false); // turn it off | |
| 422 tess_cn_matching.set_value(false); | |
| 423 DENORM bl_denorm, cn_denorm; | |
| 424 INT_FX_RESULT_STRUCT fx_info; | |
| 425 SetupBLCNDenorms(*rotated_blob, classify_nonlinear_norm, &bl_denorm, &cn_denorm, &fx_info); | |
| 426 LearnBlob(fontname, rotated_blob, cn_denorm, fx_info, correct_text); | |
| 427 } else if (unicharset.contains_unichar(correct_text)) { | |
| 428 UNICHAR_ID class_id = unicharset.unichar_to_id(correct_text); | |
| 429 int font_id = word->fontinfo != nullptr ? fontinfo_table_.get_index(*word->fontinfo) : 0; | |
| 430 if (classify_learning_debug_level >= 1) { | |
| 431 tprintf("Adapting to char = %s, thr= %g font_id= %d\n", unicharset.id_to_unichar(class_id), | |
| 432 threshold, font_id); | |
| 433 } | |
| 434 // If filename is not nullptr we are doing recognition | |
| 435 // (as opposed to training), so we must have already set word fonts. | |
| 436 AdaptToChar(rotated_blob, class_id, font_id, threshold, AdaptedTemplates); | |
| 437 if (BackupAdaptedTemplates != nullptr) { | |
| 438 // Adapt the backup templates too. They will be used if the primary gets | |
| 439 // too full. | |
| 440 AdaptToChar(rotated_blob, class_id, font_id, threshold, BackupAdaptedTemplates); | |
| 441 } | |
| 442 } else if (classify_debug_level >= 1) { | |
| 443 tprintf("Can't adapt to %s not in unicharset\n", correct_text); | |
| 444 } | |
| 445 if (rotated_blob != blob) { | |
| 446 delete rotated_blob; | |
| 447 } | |
| 448 | |
| 449 SEAM::BreakPieces(word->seam_array, word->chopped_word->blobs, start, start + length - 1); | |
| 450 } // LearnPieces. | |
| 451 | |
| 452 /*---------------------------------------------------------------------------*/ | |
| 453 /** | |
| 454 * This routine performs cleanup operations | |
| 455 * on the adaptive classifier. It should be called | |
| 456 * before the program is terminated. Its main function | |
| 457 * is to save the adapted templates to a file. | |
| 458 * | |
| 459 * Globals: | |
| 460 * - #AdaptedTemplates current set of adapted templates | |
| 461 * - #classify_save_adapted_templates true if templates should be saved | |
| 462 * - #classify_enable_adaptive_matcher true if adaptive matcher is enabled | |
| 463 */ | |
| 464 void Classify::EndAdaptiveClassifier() { | |
| 465 std::string Filename; | |
| 466 FILE *File; | |
| 467 | |
| 468 if (AdaptedTemplates != nullptr && classify_enable_adaptive_matcher && | |
| 469 classify_save_adapted_templates) { | |
| 470 Filename = imagefile + ADAPT_TEMPLATE_SUFFIX; | |
| 471 File = fopen(Filename.c_str(), "wb"); | |
| 472 if (File == nullptr) { | |
| 473 tprintf("Unable to save adapted templates to %s!\n", Filename.c_str()); | |
| 474 } else { | |
| 475 tprintf("\nSaving adapted templates to %s ...", Filename.c_str()); | |
| 476 fflush(stdout); | |
| 477 WriteAdaptedTemplates(File, AdaptedTemplates); | |
| 478 tprintf("\n"); | |
| 479 fclose(File); | |
| 480 } | |
| 481 } | |
| 482 | |
| 483 delete AdaptedTemplates; | |
| 484 AdaptedTemplates = nullptr; | |
| 485 delete BackupAdaptedTemplates; | |
| 486 BackupAdaptedTemplates = nullptr; | |
| 487 | |
| 488 if (PreTrainedTemplates != nullptr) { | |
| 489 delete PreTrainedTemplates; | |
| 490 PreTrainedTemplates = nullptr; | |
| 491 } | |
| 492 getDict().EndDangerousAmbigs(); | |
| 493 FreeNormProtos(); | |
| 494 if (AllProtosOn != nullptr) { | |
| 495 FreeBitVector(AllProtosOn); | |
| 496 FreeBitVector(AllConfigsOn); | |
| 497 FreeBitVector(AllConfigsOff); | |
| 498 FreeBitVector(TempProtoMask); | |
| 499 AllProtosOn = nullptr; | |
| 500 AllConfigsOn = nullptr; | |
| 501 AllConfigsOff = nullptr; | |
| 502 TempProtoMask = nullptr; | |
| 503 } | |
| 504 delete shape_table_; | |
| 505 shape_table_ = nullptr; | |
| 506 delete static_classifier_; | |
| 507 static_classifier_ = nullptr; | |
| 508 } /* EndAdaptiveClassifier */ | |
| 509 | |
| 510 /*---------------------------------------------------------------------------*/ | |
| 511 /** | |
| 512 * This routine reads in the training | |
| 513 * information needed by the adaptive classifier | |
| 514 * and saves it into global variables. | |
| 515 * Parameters: | |
| 516 * load_pre_trained_templates Indicates whether the pre-trained | |
| 517 * templates (inttemp, normproto and pffmtable components) | |
| 518 * should be loaded. Should only be set to true if the | |
| 519 * necessary classifier components are present in the | |
| 520 * [lang].traineddata file. | |
| 521 * Globals: | |
| 522 * BuiltInTemplatesFile file to get built-in temps from | |
| 523 * BuiltInCutoffsFile file to get avg. feat per class from | |
| 524 * classify_use_pre_adapted_templates | |
| 525 * enables use of pre-adapted templates | |
| 526 */ | |
| 527 void Classify::InitAdaptiveClassifier(TessdataManager *mgr) { | |
| 528 if (!classify_enable_adaptive_matcher) { | |
| 529 return; | |
| 530 } | |
| 531 if (AllProtosOn != nullptr) { | |
| 532 EndAdaptiveClassifier(); // Don't leak with multiple inits. | |
| 533 } | |
| 534 | |
| 535 // If there is no language_data_path_prefix, the classifier will be | |
| 536 // adaptive only. | |
| 537 if (language_data_path_prefix.length() > 0 && mgr != nullptr) { | |
| 538 TFile fp; | |
| 539 ASSERT_HOST(mgr->GetComponent(TESSDATA_INTTEMP, &fp)); | |
| 540 PreTrainedTemplates = ReadIntTemplates(&fp); | |
| 541 | |
| 542 if (mgr->GetComponent(TESSDATA_SHAPE_TABLE, &fp)) { | |
| 543 shape_table_ = new ShapeTable(unicharset); | |
| 544 if (!shape_table_->DeSerialize(&fp)) { | |
| 545 tprintf("Error loading shape table!\n"); | |
| 546 delete shape_table_; | |
| 547 shape_table_ = nullptr; | |
| 548 } | |
| 549 } | |
| 550 | |
| 551 ASSERT_HOST(mgr->GetComponent(TESSDATA_PFFMTABLE, &fp)); | |
| 552 ReadNewCutoffs(&fp, CharNormCutoffs); | |
| 553 | |
| 554 ASSERT_HOST(mgr->GetComponent(TESSDATA_NORMPROTO, &fp)); | |
| 555 NormProtos = ReadNormProtos(&fp); | |
| 556 static_classifier_ = new TessClassifier(false, this); | |
| 557 } | |
| 558 | |
| 559 InitIntegerFX(); | |
| 560 | |
| 561 AllProtosOn = NewBitVector(MAX_NUM_PROTOS); | |
| 562 AllConfigsOn = NewBitVector(MAX_NUM_CONFIGS); | |
| 563 AllConfigsOff = NewBitVector(MAX_NUM_CONFIGS); | |
| 564 TempProtoMask = NewBitVector(MAX_NUM_PROTOS); | |
| 565 set_all_bits(AllProtosOn, WordsInVectorOfSize(MAX_NUM_PROTOS)); | |
| 566 set_all_bits(AllConfigsOn, WordsInVectorOfSize(MAX_NUM_CONFIGS)); | |
| 567 zero_all_bits(AllConfigsOff, WordsInVectorOfSize(MAX_NUM_CONFIGS)); | |
| 568 | |
| 569 for (uint16_t &BaselineCutoff : BaselineCutoffs) { | |
| 570 BaselineCutoff = 0; | |
| 571 } | |
| 572 | |
| 573 if (classify_use_pre_adapted_templates) { | |
| 574 TFile fp; | |
| 575 std::string Filename = imagefile; | |
| 576 Filename += ADAPT_TEMPLATE_SUFFIX; | |
| 577 if (!fp.Open(Filename.c_str(), nullptr)) { | |
| 578 AdaptedTemplates = new ADAPT_TEMPLATES_STRUCT(unicharset); | |
| 579 } else { | |
| 580 tprintf("\nReading pre-adapted templates from %s ...\n", Filename.c_str()); | |
| 581 fflush(stdout); | |
| 582 AdaptedTemplates = ReadAdaptedTemplates(&fp); | |
| 583 tprintf("\n"); | |
| 584 PrintAdaptedTemplates(stdout, AdaptedTemplates); | |
| 585 | |
| 586 for (unsigned i = 0; i < AdaptedTemplates->Templates->NumClasses; i++) { | |
| 587 BaselineCutoffs[i] = CharNormCutoffs[i]; | |
| 588 } | |
| 589 } | |
| 590 } else { | |
| 591 delete AdaptedTemplates; | |
| 592 AdaptedTemplates = new ADAPT_TEMPLATES_STRUCT(unicharset); | |
| 593 } | |
| 594 } /* InitAdaptiveClassifier */ | |
| 595 | |
| 596 void Classify::ResetAdaptiveClassifierInternal() { | |
| 597 if (classify_learning_debug_level > 0) { | |
| 598 tprintf("Resetting adaptive classifier (NumAdaptationsFailed=%d)\n", NumAdaptationsFailed); | |
| 599 } | |
| 600 delete AdaptedTemplates; | |
| 601 AdaptedTemplates = new ADAPT_TEMPLATES_STRUCT(unicharset); | |
| 602 delete BackupAdaptedTemplates; | |
| 603 BackupAdaptedTemplates = nullptr; | |
| 604 NumAdaptationsFailed = 0; | |
| 605 } | |
| 606 | |
| 607 // If there are backup adapted templates, switches to those, otherwise resets | |
| 608 // the main adaptive classifier (because it is full.) | |
| 609 void Classify::SwitchAdaptiveClassifier() { | |
| 610 if (BackupAdaptedTemplates == nullptr) { | |
| 611 ResetAdaptiveClassifierInternal(); | |
| 612 return; | |
| 613 } | |
| 614 if (classify_learning_debug_level > 0) { | |
| 615 tprintf("Switch to backup adaptive classifier (NumAdaptationsFailed=%d)\n", | |
| 616 NumAdaptationsFailed); | |
| 617 } | |
| 618 delete AdaptedTemplates; | |
| 619 AdaptedTemplates = BackupAdaptedTemplates; | |
| 620 BackupAdaptedTemplates = nullptr; | |
| 621 NumAdaptationsFailed = 0; | |
| 622 } | |
| 623 | |
| 624 // Resets the backup adaptive classifier to empty. | |
| 625 void Classify::StartBackupAdaptiveClassifier() { | |
| 626 delete BackupAdaptedTemplates; | |
| 627 BackupAdaptedTemplates = new ADAPT_TEMPLATES_STRUCT(unicharset); | |
| 628 } | |
| 629 | |
| 630 /*---------------------------------------------------------------------------*/ | |
| 631 /** | |
| 632 * This routine prepares the adaptive | |
| 633 * matcher for the start | |
| 634 * of the first pass. Learning is enabled (unless it | |
| 635 * is disabled for the whole program). | |
| 636 * | |
| 637 * @note this is somewhat redundant, it simply says that if learning is | |
| 638 * enabled then it will remain enabled on the first pass. If it is | |
| 639 * disabled, then it will remain disabled. This is only put here to | |
| 640 * make it very clear that learning is controlled directly by the global | |
| 641 * setting of EnableLearning. | |
| 642 * | |
| 643 * Globals: | |
| 644 * - #EnableLearning | |
| 645 * set to true by this routine | |
| 646 */ | |
| 647 void Classify::SetupPass1() { | |
| 648 EnableLearning = classify_enable_learning; | |
| 649 | |
| 650 getDict().SetupStopperPass1(); | |
| 651 | |
| 652 } /* SetupPass1 */ | |
| 653 | |
| 654 /*---------------------------------------------------------------------------*/ | |
| 655 /** | |
| 656 * This routine prepares the adaptive | |
| 657 * matcher for the start of the second pass. Further | |
| 658 * learning is disabled. | |
| 659 * | |
| 660 * Globals: | |
| 661 * - #EnableLearning set to false by this routine | |
| 662 */ | |
| 663 void Classify::SetupPass2() { | |
| 664 EnableLearning = false; | |
| 665 getDict().SetupStopperPass2(); | |
| 666 | |
| 667 } /* SetupPass2 */ | |
| 668 | |
| 669 /*---------------------------------------------------------------------------*/ | |
| 670 /** | |
| 671 * This routine creates a new adapted | |
| 672 * class and uses Blob as the model for the first | |
| 673 * config in that class. | |
| 674 * | |
| 675 * @param Blob blob to model new class after | |
| 676 * @param ClassId id of the class to be initialized | |
| 677 * @param FontinfoId font information inferred from pre-trained templates | |
| 678 * @param Class adapted class to be initialized | |
| 679 * @param Templates adapted templates to add new class to | |
| 680 * | |
| 681 * Globals: | |
| 682 * - #AllProtosOn dummy mask with all 1's | |
| 683 * - BaselineCutoffs kludge needed to get cutoffs | |
| 684 * - #PreTrainedTemplates kludge needed to get cutoffs | |
| 685 */ | |
| 686 void Classify::InitAdaptedClass(TBLOB *Blob, CLASS_ID ClassId, int FontinfoId, ADAPT_CLASS_STRUCT *Class, | |
| 687 ADAPT_TEMPLATES_STRUCT *Templates) { | |
| 688 FEATURE_SET Features; | |
| 689 int Fid, Pid; | |
| 690 FEATURE Feature; | |
| 691 int NumFeatures; | |
| 692 PROTO_STRUCT *Proto; | |
| 693 INT_CLASS_STRUCT *IClass; | |
| 694 TEMP_CONFIG_STRUCT *Config; | |
| 695 | |
| 696 classify_norm_method.set_value(baseline); | |
| 697 Features = ExtractOutlineFeatures(Blob); | |
| 698 NumFeatures = Features->NumFeatures; | |
| 699 if (NumFeatures > UNLIKELY_NUM_FEAT || NumFeatures <= 0) { | |
| 700 delete Features; | |
| 701 return; | |
| 702 } | |
| 703 | |
| 704 Config = new TEMP_CONFIG_STRUCT(NumFeatures - 1, FontinfoId); | |
| 705 TempConfigFor(Class, 0) = Config; | |
| 706 | |
| 707 /* this is a kludge to construct cutoffs for adapted templates */ | |
| 708 if (Templates == AdaptedTemplates) { | |
| 709 BaselineCutoffs[ClassId] = CharNormCutoffs[ClassId]; | |
| 710 } | |
| 711 | |
| 712 IClass = ClassForClassId(Templates->Templates, ClassId); | |
| 713 | |
| 714 for (Fid = 0; Fid < Features->NumFeatures; Fid++) { | |
| 715 Pid = AddIntProto(IClass); | |
| 716 assert(Pid != NO_PROTO); | |
| 717 | |
| 718 Feature = Features->Features[Fid]; | |
| 719 auto TempProto = new TEMP_PROTO_STRUCT; | |
| 720 Proto = &(TempProto->Proto); | |
| 721 | |
| 722 /* compute proto params - NOTE that Y_DIM_OFFSET must be used because | |
| 723 ConvertProto assumes that the Y dimension varies from -0.5 to 0.5 | |
| 724 instead of the -0.25 to 0.75 used in baseline normalization */ | |
| 725 Proto->Angle = Feature->Params[OutlineFeatDir]; | |
| 726 Proto->X = Feature->Params[OutlineFeatX]; | |
| 727 Proto->Y = Feature->Params[OutlineFeatY] - Y_DIM_OFFSET; | |
| 728 Proto->Length = Feature->Params[OutlineFeatLength]; | |
| 729 FillABC(Proto); | |
| 730 | |
| 731 TempProto->ProtoId = Pid; | |
| 732 SET_BIT(Config->Protos, Pid); | |
| 733 | |
| 734 ConvertProto(Proto, Pid, IClass); | |
| 735 AddProtoToProtoPruner(Proto, Pid, IClass, classify_learning_debug_level >= 2); | |
| 736 | |
| 737 Class->TempProtos = push(Class->TempProtos, TempProto); | |
| 738 } | |
| 739 delete Features; | |
| 740 | |
| 741 AddIntConfig(IClass); | |
| 742 ConvertConfig(AllProtosOn, 0, IClass); | |
| 743 | |
| 744 if (classify_learning_debug_level >= 1) { | |
| 745 tprintf("Added new class '%s' with class id %d and %d protos.\n", | |
| 746 unicharset.id_to_unichar(ClassId), ClassId, NumFeatures); | |
| 747 #ifndef GRAPHICS_DISABLED | |
| 748 if (classify_learning_debug_level > 1) { | |
| 749 DisplayAdaptedChar(Blob, IClass); | |
| 750 } | |
| 751 #endif | |
| 752 } | |
| 753 | |
| 754 if (IsEmptyAdaptedClass(Class)) { | |
| 755 (Templates->NumNonEmptyClasses)++; | |
| 756 } | |
| 757 } /* InitAdaptedClass */ | |
| 758 | |
| 759 /*---------------------------------------------------------------------------*/ | |
| 760 /** | |
| 761 * This routine sets up the feature | |
| 762 * extractor to extract baseline normalized | |
| 763 * pico-features. | |
| 764 * | |
| 765 * The extracted pico-features are converted | |
| 766 * to integer form and placed in IntFeatures. The | |
| 767 * original floating-pt. features are returned in | |
| 768 * FloatFeatures. | |
| 769 * | |
| 770 * Globals: none | |
| 771 * @param Blob blob to extract features from | |
| 772 * @param[out] IntFeatures array to fill with integer features | |
| 773 * @param[out] FloatFeatures place to return actual floating-pt features | |
| 774 * | |
| 775 * @return Number of pico-features returned (0 if | |
| 776 * an error occurred) | |
| 777 */ | |
| 778 int Classify::GetAdaptiveFeatures(TBLOB *Blob, INT_FEATURE_ARRAY IntFeatures, | |
| 779 FEATURE_SET *FloatFeatures) { | |
| 780 FEATURE_SET Features; | |
| 781 int NumFeatures; | |
| 782 | |
| 783 classify_norm_method.set_value(baseline); | |
| 784 Features = ExtractPicoFeatures(Blob); | |
| 785 | |
| 786 NumFeatures = Features->NumFeatures; | |
| 787 if (NumFeatures == 0 || NumFeatures > UNLIKELY_NUM_FEAT) { | |
| 788 delete Features; | |
| 789 return 0; | |
| 790 } | |
| 791 | |
| 792 ComputeIntFeatures(Features, IntFeatures); | |
| 793 *FloatFeatures = Features; | |
| 794 | |
| 795 return NumFeatures; | |
| 796 } /* GetAdaptiveFeatures */ | |
| 797 | |
| 798 /*----------------------------------------------------------------------------- | |
| 799 Private Code | |
| 800 -----------------------------------------------------------------------------*/ | |
| 801 /*---------------------------------------------------------------------------*/ | |
| 802 /** | |
| 803 * Return true if the specified word is acceptable for adaptation. | |
| 804 * | |
| 805 * Globals: none | |
| 806 * | |
| 807 * @param word current word | |
| 808 * | |
| 809 * @return true or false | |
| 810 */ | |
| 811 bool Classify::AdaptableWord(WERD_RES *word) { | |
| 812 if (word->best_choice == nullptr) { | |
| 813 return false; | |
| 814 } | |
| 815 auto BestChoiceLength = word->best_choice->length(); | |
| 816 float adaptable_score = getDict().segment_penalty_dict_case_ok + ADAPTABLE_WERD_ADJUSTMENT; | |
| 817 return // rules that apply in general - simplest to compute first | |
| 818 BestChoiceLength > 0 && BestChoiceLength == word->rebuild_word->NumBlobs() && | |
| 819 BestChoiceLength <= MAX_ADAPTABLE_WERD_SIZE && | |
| 820 // This basically ensures that the word is at least a dictionary match | |
| 821 // (freq word, user word, system dawg word, etc). | |
| 822 // Since all the other adjustments will make adjust factor higher | |
| 823 // than higher than adaptable_score=1.1+0.05=1.15 | |
| 824 // Since these are other flags that ensure that the word is dict word, | |
| 825 // this check could be at times redundant. | |
| 826 word->best_choice->adjust_factor() <= adaptable_score && | |
| 827 // Make sure that alternative choices are not dictionary words. | |
| 828 word->AlternativeChoiceAdjustmentsWorseThan(adaptable_score); | |
| 829 } | |
| 830 | |
| 831 /*---------------------------------------------------------------------------*/ | |
| 832 /** | |
| 833 * @param Blob blob to add to templates for ClassId | |
| 834 * @param ClassId class to add blob to | |
| 835 * @param FontinfoId font information from pre-trained templates | |
| 836 * @param Threshold minimum match rating to existing template | |
| 837 * @param adaptive_templates current set of adapted templates | |
| 838 * | |
| 839 * Globals: | |
| 840 * - AllProtosOn dummy mask to match against all protos | |
| 841 * - AllConfigsOn dummy mask to match against all configs | |
| 842 */ | |
| 843 void Classify::AdaptToChar(TBLOB *Blob, CLASS_ID ClassId, int FontinfoId, float Threshold, | |
| 844 ADAPT_TEMPLATES_STRUCT *adaptive_templates) { | |
| 845 int NumFeatures; | |
| 846 INT_FEATURE_ARRAY IntFeatures; | |
| 847 UnicharRating int_result; | |
| 848 INT_CLASS_STRUCT *IClass; | |
| 849 ADAPT_CLASS_STRUCT *Class; | |
| 850 TEMP_CONFIG_STRUCT *TempConfig; | |
| 851 FEATURE_SET FloatFeatures; | |
| 852 int NewTempConfigId; | |
| 853 | |
| 854 if (!LegalClassId(ClassId)) { | |
| 855 return; | |
| 856 } | |
| 857 | |
| 858 int_result.unichar_id = ClassId; | |
| 859 Class = adaptive_templates->Class[ClassId]; | |
| 860 assert(Class != nullptr); | |
| 861 if (IsEmptyAdaptedClass(Class)) { | |
| 862 InitAdaptedClass(Blob, ClassId, FontinfoId, Class, adaptive_templates); | |
| 863 } else { | |
| 864 IClass = ClassForClassId(adaptive_templates->Templates, ClassId); | |
| 865 | |
| 866 NumFeatures = GetAdaptiveFeatures(Blob, IntFeatures, &FloatFeatures); | |
| 867 if (NumFeatures <= 0) { | |
| 868 return; // Features already freed by GetAdaptiveFeatures. | |
| 869 } | |
| 870 | |
| 871 // Only match configs with the matching font. | |
| 872 BIT_VECTOR MatchingFontConfigs = NewBitVector(MAX_NUM_PROTOS); | |
| 873 for (int cfg = 0; cfg < IClass->NumConfigs; ++cfg) { | |
| 874 if (GetFontinfoId(Class, cfg) == FontinfoId) { | |
| 875 SET_BIT(MatchingFontConfigs, cfg); | |
| 876 } else { | |
| 877 reset_bit(MatchingFontConfigs, cfg); | |
| 878 } | |
| 879 } | |
| 880 im_.Match(IClass, AllProtosOn, MatchingFontConfigs, NumFeatures, IntFeatures, &int_result, | |
| 881 classify_adapt_feature_threshold, NO_DEBUG, matcher_debug_separate_windows); | |
| 882 FreeBitVector(MatchingFontConfigs); | |
| 883 | |
| 884 SetAdaptiveThreshold(Threshold); | |
| 885 | |
| 886 if (1.0f - int_result.rating <= Threshold) { | |
| 887 if (ConfigIsPermanent(Class, int_result.config)) { | |
| 888 if (classify_learning_debug_level >= 1) { | |
| 889 tprintf("Found good match to perm config %d = %4.1f%%.\n", int_result.config, | |
| 890 int_result.rating * 100.0); | |
| 891 } | |
| 892 delete FloatFeatures; | |
| 893 return; | |
| 894 } | |
| 895 | |
| 896 TempConfig = TempConfigFor(Class, int_result.config); | |
| 897 IncreaseConfidence(TempConfig); | |
| 898 if (TempConfig->NumTimesSeen > Class->MaxNumTimesSeen) { | |
| 899 Class->MaxNumTimesSeen = TempConfig->NumTimesSeen; | |
| 900 } | |
| 901 if (classify_learning_debug_level >= 1) { | |
| 902 tprintf("Increasing reliability of temp config %d to %d.\n", int_result.config, | |
| 903 TempConfig->NumTimesSeen); | |
| 904 } | |
| 905 | |
| 906 if (TempConfigReliable(ClassId, TempConfig)) { | |
| 907 MakePermanent(adaptive_templates, ClassId, int_result.config, Blob); | |
| 908 UpdateAmbigsGroup(ClassId, Blob); | |
| 909 } | |
| 910 } else { | |
| 911 if (classify_learning_debug_level >= 1) { | |
| 912 tprintf("Found poor match to temp config %d = %4.1f%%.\n", int_result.config, | |
| 913 int_result.rating * 100.0); | |
| 914 #ifndef GRAPHICS_DISABLED | |
| 915 if (classify_learning_debug_level > 2) { | |
| 916 DisplayAdaptedChar(Blob, IClass); | |
| 917 } | |
| 918 #endif | |
| 919 } | |
| 920 NewTempConfigId = MakeNewTemporaryConfig(adaptive_templates, ClassId, FontinfoId, NumFeatures, | |
| 921 IntFeatures, FloatFeatures); | |
| 922 if (NewTempConfigId >= 0 && | |
| 923 TempConfigReliable(ClassId, TempConfigFor(Class, NewTempConfigId))) { | |
| 924 MakePermanent(adaptive_templates, ClassId, NewTempConfigId, Blob); | |
| 925 UpdateAmbigsGroup(ClassId, Blob); | |
| 926 } | |
| 927 | |
| 928 #ifndef GRAPHICS_DISABLED | |
| 929 if (classify_learning_debug_level > 1) { | |
| 930 DisplayAdaptedChar(Blob, IClass); | |
| 931 } | |
| 932 #endif | |
| 933 } | |
| 934 delete FloatFeatures; | |
| 935 } | |
| 936 } /* AdaptToChar */ | |
| 937 | |
| 938 #ifndef GRAPHICS_DISABLED | |
| 939 | |
| 940 void Classify::DisplayAdaptedChar(TBLOB *blob, INT_CLASS_STRUCT *int_class) { | |
| 941 INT_FX_RESULT_STRUCT fx_info; | |
| 942 std::vector<INT_FEATURE_STRUCT> bl_features; | |
| 943 TrainingSample *sample = | |
| 944 BlobToTrainingSample(*blob, classify_nonlinear_norm, &fx_info, &bl_features); | |
| 945 if (sample == nullptr) { | |
| 946 return; | |
| 947 } | |
| 948 | |
| 949 UnicharRating int_result; | |
| 950 im_.Match(int_class, AllProtosOn, AllConfigsOn, bl_features.size(), &bl_features[0], &int_result, | |
| 951 classify_adapt_feature_threshold, NO_DEBUG, matcher_debug_separate_windows); | |
| 952 tprintf("Best match to temp config %d = %4.1f%%.\n", int_result.config, | |
| 953 int_result.rating * 100.0); | |
| 954 if (classify_learning_debug_level >= 2) { | |
| 955 uint32_t ConfigMask; | |
| 956 ConfigMask = 1 << int_result.config; | |
| 957 ShowMatchDisplay(); | |
| 958 im_.Match(int_class, AllProtosOn, static_cast<BIT_VECTOR>(&ConfigMask), bl_features.size(), | |
| 959 &bl_features[0], &int_result, classify_adapt_feature_threshold, 6 | 0x19, | |
| 960 matcher_debug_separate_windows); | |
| 961 UpdateMatchDisplay(); | |
| 962 } | |
| 963 | |
| 964 delete sample; | |
| 965 } | |
| 966 | |
| 967 #endif | |
| 968 | |
| 969 /** | |
| 970 * This routine adds the result of a classification into | |
| 971 * Results. If the new rating is much worse than the current | |
| 972 * best rating, it is not entered into results because it | |
| 973 * would end up being stripped later anyway. If the new rating | |
| 974 * is better than the old rating for the class, it replaces the | |
| 975 * old rating. If this is the first rating for the class, the | |
| 976 * class is added to the list of matched classes in Results. | |
| 977 * If the new rating is better than the best so far, it | |
| 978 * becomes the best so far. | |
| 979 * | |
| 980 * Globals: | |
| 981 * - #matcher_bad_match_pad defines limits of an acceptable match | |
| 982 * | |
| 983 * @param new_result new result to add | |
| 984 * @param[out] results results to add new result to | |
| 985 */ | |
| 986 void Classify::AddNewResult(const UnicharRating &new_result, ADAPT_RESULTS *results) { | |
| 987 auto old_match = FindScoredUnichar(new_result.unichar_id, *results); | |
| 988 | |
| 989 if (new_result.rating + matcher_bad_match_pad < results->best_rating || | |
| 990 (old_match < results->match.size() && | |
| 991 new_result.rating <= results->match[old_match].rating)) { | |
| 992 return; // New one not good enough. | |
| 993 } | |
| 994 | |
| 995 if (!unicharset.get_fragment(new_result.unichar_id)) { | |
| 996 results->HasNonfragment = true; | |
| 997 } | |
| 998 | |
| 999 if (old_match < results->match.size()) { | |
| 1000 results->match[old_match].rating = new_result.rating; | |
| 1001 } else { | |
| 1002 results->match.push_back(new_result); | |
| 1003 } | |
| 1004 | |
| 1005 if (new_result.rating > results->best_rating && | |
| 1006 // Ensure that fragments do not affect best rating, class and config. | |
| 1007 // This is needed so that at least one non-fragmented character is | |
| 1008 // always present in the results. | |
| 1009 // TODO(daria): verify that this helps accuracy and does not | |
| 1010 // hurt performance. | |
| 1011 !unicharset.get_fragment(new_result.unichar_id)) { | |
| 1012 results->best_match_index = old_match; | |
| 1013 results->best_rating = new_result.rating; | |
| 1014 results->best_unichar_id = new_result.unichar_id; | |
| 1015 } | |
| 1016 } /* AddNewResult */ | |
| 1017 | |
| 1018 /*---------------------------------------------------------------------------*/ | |
| 1019 /** | |
| 1020 * This routine is identical to CharNormClassifier() | |
| 1021 * except that it does no class pruning. It simply matches | |
| 1022 * the unknown blob against the classes listed in | |
| 1023 * Ambiguities. | |
| 1024 * | |
| 1025 * Globals: | |
| 1026 * - #AllProtosOn mask that enables all protos | |
| 1027 * - #AllConfigsOn mask that enables all configs | |
| 1028 * | |
| 1029 * @param blob blob to be classified | |
| 1030 * @param templates built-in templates to classify against | |
| 1031 * @param classes adapted class templates | |
| 1032 * @param ambiguities array of unichar id's to match against | |
| 1033 * @param[out] results place to put match results | |
| 1034 * @param int_features | |
| 1035 * @param fx_info | |
| 1036 */ | |
| 1037 void Classify::AmbigClassifier(const std::vector<INT_FEATURE_STRUCT> &int_features, | |
| 1038 const INT_FX_RESULT_STRUCT &fx_info, const TBLOB *blob, | |
| 1039 INT_TEMPLATES_STRUCT *templates, ADAPT_CLASS_STRUCT **classes, | |
| 1040 UNICHAR_ID *ambiguities, ADAPT_RESULTS *results) { | |
| 1041 if (int_features.empty()) { | |
| 1042 return; | |
| 1043 } | |
| 1044 auto *CharNormArray = new uint8_t[unicharset.size()]; | |
| 1045 UnicharRating int_result; | |
| 1046 | |
| 1047 results->BlobLength = GetCharNormFeature(fx_info, templates, nullptr, CharNormArray); | |
| 1048 bool debug = matcher_debug_level >= 2 || classify_debug_level > 1; | |
| 1049 if (debug) { | |
| 1050 tprintf("AM Matches = "); | |
| 1051 } | |
| 1052 | |
| 1053 int top = blob->bounding_box().top(); | |
| 1054 int bottom = blob->bounding_box().bottom(); | |
| 1055 while (*ambiguities >= 0) { | |
| 1056 CLASS_ID class_id = *ambiguities; | |
| 1057 | |
| 1058 int_result.unichar_id = class_id; | |
| 1059 im_.Match(ClassForClassId(templates, class_id), AllProtosOn, AllConfigsOn, int_features.size(), | |
| 1060 &int_features[0], &int_result, classify_adapt_feature_threshold, NO_DEBUG, | |
| 1061 matcher_debug_separate_windows); | |
| 1062 | |
| 1063 ExpandShapesAndApplyCorrections(nullptr, debug, class_id, bottom, top, 0, results->BlobLength, | |
| 1064 classify_integer_matcher_multiplier, CharNormArray, &int_result, | |
| 1065 results); | |
| 1066 ambiguities++; | |
| 1067 } | |
| 1068 delete[] CharNormArray; | |
| 1069 } /* AmbigClassifier */ | |
| 1070 | |
| 1071 /*---------------------------------------------------------------------------*/ | |
| 1072 /// Factored-out calls to IntegerMatcher based on class pruner results. | |
| 1073 /// Returns integer matcher results inside CLASS_PRUNER_RESULTS structure. | |
| 1074 void Classify::MasterMatcher(INT_TEMPLATES_STRUCT *templates, int16_t num_features, | |
| 1075 const INT_FEATURE_STRUCT *features, const uint8_t *norm_factors, | |
| 1076 ADAPT_CLASS_STRUCT **classes, int debug, int matcher_multiplier, | |
| 1077 const TBOX &blob_box, const std::vector<CP_RESULT_STRUCT> &results, | |
| 1078 ADAPT_RESULTS *final_results) { | |
| 1079 int top = blob_box.top(); | |
| 1080 int bottom = blob_box.bottom(); | |
| 1081 UnicharRating int_result; | |
| 1082 for (auto &&result : results) { | |
| 1083 CLASS_ID class_id = result.Class; | |
| 1084 BIT_VECTOR protos = classes != nullptr ? classes[class_id]->PermProtos : AllProtosOn; | |
| 1085 BIT_VECTOR configs = classes != nullptr ? classes[class_id]->PermConfigs : AllConfigsOn; | |
| 1086 | |
| 1087 int_result.unichar_id = class_id; | |
| 1088 im_.Match(ClassForClassId(templates, class_id), protos, configs, num_features, features, | |
| 1089 &int_result, classify_adapt_feature_threshold, debug, matcher_debug_separate_windows); | |
| 1090 bool is_debug = matcher_debug_level >= 2 || classify_debug_level > 1; | |
| 1091 ExpandShapesAndApplyCorrections(classes, is_debug, class_id, bottom, top, result.Rating, | |
| 1092 final_results->BlobLength, matcher_multiplier, norm_factors, | |
| 1093 &int_result, final_results); | |
| 1094 } | |
| 1095 } | |
| 1096 | |
| 1097 // Converts configs to fonts, and if the result is not adapted, and a | |
| 1098 // shape_table_ is present, the shape is expanded to include all | |
| 1099 // unichar_ids represented, before applying a set of corrections to the | |
| 1100 // distance rating in int_result, (see ComputeCorrectedRating.) | |
| 1101 // The results are added to the final_results output. | |
| 1102 void Classify::ExpandShapesAndApplyCorrections(ADAPT_CLASS_STRUCT **classes, bool debug, int class_id, | |
| 1103 int bottom, int top, float cp_rating, | |
| 1104 int blob_length, int matcher_multiplier, | |
| 1105 const uint8_t *cn_factors, UnicharRating *int_result, | |
| 1106 ADAPT_RESULTS *final_results) { | |
| 1107 if (classes != nullptr) { | |
| 1108 // Adapted result. Convert configs to fontinfo_ids. | |
| 1109 int_result->adapted = true; | |
| 1110 for (auto &font : int_result->fonts) { | |
| 1111 font.fontinfo_id = GetFontinfoId(classes[class_id], font.fontinfo_id); | |
| 1112 } | |
| 1113 } else { | |
| 1114 // Pre-trained result. Map fonts using font_sets_. | |
| 1115 int_result->adapted = false; | |
| 1116 for (auto &font : int_result->fonts) { | |
| 1117 font.fontinfo_id = ClassAndConfigIDToFontOrShapeID(class_id, font.fontinfo_id); | |
| 1118 } | |
| 1119 if (shape_table_ != nullptr) { | |
| 1120 // Two possible cases: | |
| 1121 // 1. Flat shapetable. All unichar-ids of the shapes referenced by | |
| 1122 // int_result->fonts are the same. In this case build a new vector of | |
| 1123 // mapped fonts and replace the fonts in int_result. | |
| 1124 // 2. Multi-unichar shapetable. Variable unichars in the shapes referenced | |
| 1125 // by int_result. In this case, build a vector of UnicharRating to | |
| 1126 // gather together different font-ids for each unichar. Also covers case1. | |
| 1127 std::vector<UnicharRating> mapped_results; | |
| 1128 for (auto &f : int_result->fonts) { | |
| 1129 int shape_id = f.fontinfo_id; | |
| 1130 const Shape &shape = shape_table_->GetShape(shape_id); | |
| 1131 for (int c = 0; c < shape.size(); ++c) { | |
| 1132 int unichar_id = shape[c].unichar_id; | |
| 1133 if (!unicharset.get_enabled(unichar_id)) { | |
| 1134 continue; | |
| 1135 } | |
| 1136 // Find the mapped_result for unichar_id. | |
| 1137 unsigned r = 0; | |
| 1138 for (r = 0; r < mapped_results.size() && mapped_results[r].unichar_id != unichar_id; | |
| 1139 ++r) { | |
| 1140 } | |
| 1141 if (r == mapped_results.size()) { | |
| 1142 mapped_results.push_back(*int_result); | |
| 1143 mapped_results[r].unichar_id = unichar_id; | |
| 1144 mapped_results[r].fonts.clear(); | |
| 1145 } | |
| 1146 for (int font_id : shape[c].font_ids) { | |
| 1147 mapped_results[r].fonts.emplace_back(font_id, f.score); | |
| 1148 } | |
| 1149 } | |
| 1150 } | |
| 1151 for (auto &m : mapped_results) { | |
| 1152 m.rating = ComputeCorrectedRating(debug, m.unichar_id, cp_rating, int_result->rating, | |
| 1153 int_result->feature_misses, bottom, top, blob_length, | |
| 1154 matcher_multiplier, cn_factors); | |
| 1155 AddNewResult(m, final_results); | |
| 1156 } | |
| 1157 return; | |
| 1158 } | |
| 1159 } | |
| 1160 if (unicharset.get_enabled(class_id)) { | |
| 1161 int_result->rating = ComputeCorrectedRating(debug, class_id, cp_rating, int_result->rating, | |
| 1162 int_result->feature_misses, bottom, top, | |
| 1163 blob_length, matcher_multiplier, cn_factors); | |
| 1164 AddNewResult(*int_result, final_results); | |
| 1165 } | |
| 1166 } | |
| 1167 | |
| 1168 // Applies a set of corrections to the confidence im_rating, | |
| 1169 // including the cn_correction, miss penalty and additional penalty | |
| 1170 // for non-alnums being vertical misfits. Returns the corrected confidence. | |
| 1171 double Classify::ComputeCorrectedRating(bool debug, int unichar_id, double cp_rating, | |
| 1172 double im_rating, int feature_misses, int bottom, int top, | |
| 1173 int blob_length, int matcher_multiplier, | |
| 1174 const uint8_t *cn_factors) { | |
| 1175 // Compute class feature corrections. | |
| 1176 double cn_corrected = im_.ApplyCNCorrection(1.0 - im_rating, blob_length, cn_factors[unichar_id], | |
| 1177 matcher_multiplier); | |
| 1178 double miss_penalty = tessedit_class_miss_scale * feature_misses; | |
| 1179 double vertical_penalty = 0.0; | |
| 1180 // Penalize non-alnums for being vertical misfits. | |
| 1181 if (!unicharset.get_isalpha(unichar_id) && !unicharset.get_isdigit(unichar_id) && | |
| 1182 cn_factors[unichar_id] != 0 && classify_misfit_junk_penalty > 0.0) { | |
| 1183 int min_bottom, max_bottom, min_top, max_top; | |
| 1184 unicharset.get_top_bottom(unichar_id, &min_bottom, &max_bottom, &min_top, &max_top); | |
| 1185 if (debug) { | |
| 1186 tprintf("top=%d, vs [%d, %d], bottom=%d, vs [%d, %d]\n", top, min_top, max_top, bottom, | |
| 1187 min_bottom, max_bottom); | |
| 1188 } | |
| 1189 if (top < min_top || top > max_top || bottom < min_bottom || bottom > max_bottom) { | |
| 1190 vertical_penalty = classify_misfit_junk_penalty; | |
| 1191 } | |
| 1192 } | |
| 1193 double result = 1.0 - (cn_corrected + miss_penalty + vertical_penalty); | |
| 1194 if (result < WORST_POSSIBLE_RATING) { | |
| 1195 result = WORST_POSSIBLE_RATING; | |
| 1196 } | |
| 1197 if (debug) { | |
| 1198 tprintf("%s: %2.1f%%(CP%2.1f, IM%2.1f + CN%.2f(%d) + MP%2.1f + VP%2.1f)\n", | |
| 1199 unicharset.id_to_unichar(unichar_id), result * 100.0, cp_rating * 100.0, | |
| 1200 (1.0 - im_rating) * 100.0, (cn_corrected - (1.0 - im_rating)) * 100.0, | |
| 1201 cn_factors[unichar_id], miss_penalty * 100.0, vertical_penalty * 100.0); | |
| 1202 } | |
| 1203 return result; | |
| 1204 } | |
| 1205 | |
| 1206 /*---------------------------------------------------------------------------*/ | |
| 1207 /** | |
| 1208 * This routine extracts baseline normalized features | |
| 1209 * from the unknown character and matches them against the | |
| 1210 * specified set of templates. The classes which match | |
| 1211 * are added to Results. | |
| 1212 * | |
| 1213 * Globals: | |
| 1214 * - BaselineCutoffs expected num features for each class | |
| 1215 * | |
| 1216 * @param Blob blob to be classified | |
| 1217 * @param Templates current set of adapted templates | |
| 1218 * @param Results place to put match results | |
| 1219 * @param int_features | |
| 1220 * @param fx_info | |
| 1221 * | |
| 1222 * @return Array of possible ambiguous chars that should be checked. | |
| 1223 */ | |
| 1224 UNICHAR_ID *Classify::BaselineClassifier(TBLOB *Blob, | |
| 1225 const std::vector<INT_FEATURE_STRUCT> &int_features, | |
| 1226 const INT_FX_RESULT_STRUCT &fx_info, | |
| 1227 ADAPT_TEMPLATES_STRUCT *Templates, ADAPT_RESULTS *Results) { | |
| 1228 if (int_features.empty()) { | |
| 1229 return nullptr; | |
| 1230 } | |
| 1231 auto *CharNormArray = new uint8_t[unicharset.size()]; | |
| 1232 ClearCharNormArray(CharNormArray); | |
| 1233 | |
| 1234 Results->BlobLength = IntCastRounded(fx_info.Length / kStandardFeatureLength); | |
| 1235 PruneClasses(Templates->Templates, int_features.size(), -1, &int_features[0], CharNormArray, | |
| 1236 BaselineCutoffs, &Results->CPResults); | |
| 1237 | |
| 1238 if (matcher_debug_level >= 2 || classify_debug_level > 1) { | |
| 1239 tprintf("BL Matches = "); | |
| 1240 } | |
| 1241 | |
| 1242 MasterMatcher(Templates->Templates, int_features.size(), &int_features[0], CharNormArray, | |
| 1243 Templates->Class, matcher_debug_flags, 0, Blob->bounding_box(), Results->CPResults, | |
| 1244 Results); | |
| 1245 | |
| 1246 delete[] CharNormArray; | |
| 1247 CLASS_ID ClassId = Results->best_unichar_id; | |
| 1248 if (ClassId == INVALID_UNICHAR_ID || Results->best_match_index < 0) { | |
| 1249 return nullptr; | |
| 1250 } | |
| 1251 | |
| 1252 return Templates->Class[ClassId] | |
| 1253 ->Config[Results->match[Results->best_match_index].config] | |
| 1254 .Perm->Ambigs; | |
| 1255 } /* BaselineClassifier */ | |
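// Sketch of the two-stage matching pattern used by BaselineClassifier above:
// a cheap class pruner builds a shortlist, and only the shortlist is handed
// to the detailed matcher. The types and scores below are invented stand-ins,
// not Tesseract APIs; illustrative only.
struct SketchPrunedClass {
  int class_id;
  float coarse_score;  // cheap score from the pruning pass
};

static std::vector<int> SketchPruneThenMatch(
    const std::vector<SketchPrunedClass> &all_classes, float prune_threshold) {
  std::vector<int> shortlist;
  for (const auto &c : all_classes) {
    if (c.coarse_score >= prune_threshold) {
      shortlist.push_back(c.class_id);  // survives pruning (PruneClasses above)
    }
  }
  // The expensive stage (MasterMatcher above) would now score only the
  // shortlisted classes against the full feature set.
  return shortlist;
}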
| 1256 | |
| 1257 /*---------------------------------------------------------------------------*/ | |
| 1258 /** | |
| 1259 * This routine classifies the character normalized features | |
| 1260 * of the unknown character with the static (pre-trained) | |
| 1261 * classifier. The classes which match | |
| 1262 * are added to adapt_results. | |
| 1263 * | |
| 1264 * @param blob blob to be classified | |
| 1265 * @param sample training sample extracted from blob | |
| 1266 * @param adapt_results place to put match results | |
| 1267 * | |
| 1268 * Globals: | |
| 1269 * - CharNormCutoffs expected num features for each class | |
| 1270 * - AllProtosOn mask that enables all protos | |
| 1271 * - AllConfigsOn mask that enables all configs | |
| 1272 */ | |
| 1273 int Classify::CharNormClassifier(TBLOB *blob, const TrainingSample &sample, | |
| 1274 ADAPT_RESULTS *adapt_results) { | |
| 1275 // This is the length that is used for scaling ratings vs certainty. | |
| 1276 adapt_results->BlobLength = IntCastRounded(sample.outline_length() / kStandardFeatureLength); | |
| 1277 std::vector<UnicharRating> unichar_results; | |
| 1278 static_classifier_->UnicharClassifySample(sample, blob->denorm().pix(), 0, -1, &unichar_results); | |
| 1279 // Convert results to the format used internally by AdaptiveClassifier. | |
| 1280 for (auto &r : unichar_results) { | |
| 1281 AddNewResult(r, adapt_results); | |
| 1282 } | |
| 1283 return sample.num_features(); | |
| 1284 } /* CharNormClassifier */ | |
| 1285 | |
| 1286 // As CharNormClassifier, but operates on a TrainingSample and outputs to | |
| 1287 // a vector of UnicharRating without conversion to classes. | |
| 1288 int Classify::CharNormTrainingSample(bool pruner_only, int keep_this, const TrainingSample &sample, | |
| 1289 std::vector<UnicharRating> *results) { | |
| 1290 results->clear(); | |
| 1291 std::unique_ptr<ADAPT_RESULTS> adapt_results(new ADAPT_RESULTS()); | |
| 1292 adapt_results->Initialize(); | |
| 1293 // Compute the bounding box of the features. | |
| 1294 uint32_t num_features = sample.num_features(); | |
| 1295 // Only the top and bottom of the blob_box are used by MasterMatcher, so | |
| 1296 // fabricate right and left using top and bottom. | |
| 1297 TBOX blob_box(sample.geo_feature(GeoBottom), sample.geo_feature(GeoBottom), | |
| 1298 sample.geo_feature(GeoTop), sample.geo_feature(GeoTop)); | |
| 1299 // Compute the char_norm_array from the saved cn_feature. | |
| 1300 FEATURE norm_feature = sample.GetCNFeature(); | |
| 1301 std::vector<uint8_t> char_norm_array(unicharset.size()); | |
| 1302 auto num_pruner_classes = std::max(static_cast<unsigned>(unicharset.size()), PreTrainedTemplates->NumClasses); | |
| 1303 std::vector<uint8_t> pruner_norm_array(num_pruner_classes); | |
| 1304 adapt_results->BlobLength = static_cast<int>(ActualOutlineLength(norm_feature) * 20 + 0.5f); | |
| 1305 ComputeCharNormArrays(norm_feature, PreTrainedTemplates, &char_norm_array[0], &pruner_norm_array[0]); | |
| 1306 | |
| 1307 PruneClasses(PreTrainedTemplates, num_features, keep_this, sample.features(), &pruner_norm_array[0], | |
| 1308 shape_table_ != nullptr ? &shapetable_cutoffs_[0] : CharNormCutoffs, | |
| 1309 &adapt_results->CPResults); | |
| 1310 if (keep_this >= 0) { | |
| 1311 adapt_results->CPResults[0].Class = keep_this; | |
| 1312 adapt_results->CPResults.resize(1); | |
| 1313 } | |
| 1314 if (pruner_only) { | |
| 1315 // Convert pruner results to output format. | |
| 1316 for (auto &it : adapt_results->CPResults) { | |
| 1317 int class_id = it.Class; | |
| 1318 results->push_back(UnicharRating(class_id, 1.0f - it.Rating)); | |
| 1319 } | |
| 1320 } else { | |
| 1321 MasterMatcher(PreTrainedTemplates, num_features, sample.features(), &char_norm_array[0], nullptr, | |
| 1322 matcher_debug_flags, classify_integer_matcher_multiplier, blob_box, | |
| 1323 adapt_results->CPResults, adapt_results.get()); | |
| 1324 // Convert master matcher results to output format. | |
| 1325 for (auto &i : adapt_results->match) { | |
| 1326 results->push_back(i); | |
| 1327 } | |
| 1328 if (results->size() > 1) { | |
| 1329 std::sort(results->begin(), results->end(), SortDescendingRating); | |
| 1330 } | |
| 1331 } | |
| 1332 return num_features; | |
| 1333 } /* CharNormTrainingSample */ | |
| 1334 | |
| 1335 /*---------------------------------------------------------------------------*/ | |
| 1336 /** | |
| 1337 * This routine computes a rating which reflects the | |
| 1338 * likelihood that the blob being classified is a noise | |
| 1339 * blob. NOTE: assumes that the blob length has already been | |
| 1340 * computed and placed into Results. | |
| 1341 * | |
| 1342 * @param results results to add noise classification to | |
| 1343 * | |
| 1344 * Globals: | |
| 1345 * - matcher_avg_noise_size avg. length of a noise blob | |
| 1346 */ | |
| 1347 void Classify::ClassifyAsNoise(ADAPT_RESULTS *results) { | |
| 1348 float rating = results->BlobLength / matcher_avg_noise_size; | |
| 1349 rating *= rating; | |
| 1350 rating /= 1 + rating; | |
| 1351 | |
| 1352 AddNewResult(UnicharRating(UNICHAR_SPACE, 1.0f - rating), results); | |
| 1353 } /* ClassifyAsNoise */ | |
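// The noise rating above follows x^2 / (1 + x^2) with
// x = BlobLength / matcher_avg_noise_size, so the stored "goodness"
// 1 - x^2 / (1 + x^2) is high for tiny blobs and falls toward 0 as the blob
// grows much longer than an average noise blob. Minimal sketch, illustrative
// only:
static float SketchNoiseGoodness(float blob_length, float avg_noise_size) {
  float x = blob_length / avg_noise_size;
  float rating = x * x;
  rating /= 1.0f + rating;  // maps [0, inf) smoothly onto [0, 1)
  return 1.0f - rating;     // the value passed to AddNewResult above
}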
| 1354 | |
| 1355 /// The function converts the given match ratings to the list of blob | |
| 1356 /// choices with ratings and certainties (used by the context checkers). | |
| 1357 /// If character fragments are present in the results, this function also makes | |
| 1358 /// sure that there is at least one non-fragmented classification included. | |
| 1359 /// For each classification result check the unicharset for "definite" | |
| 1360 /// ambiguities and modify the resulting Choices accordingly. | |
| 1361 void Classify::ConvertMatchesToChoices(const DENORM &denorm, const TBOX &box, | |
| 1362 ADAPT_RESULTS *Results, BLOB_CHOICE_LIST *Choices) { | |
| 1363 assert(Choices != nullptr); | |
| 1364 float Rating; | |
| 1365 float Certainty; | |
| 1366 BLOB_CHOICE_IT temp_it; | |
| 1367 bool contains_nonfrag = false; | |
| 1368 temp_it.set_to_list(Choices); | |
| 1369 int choices_length = 0; | |
| 1370 // With no shape_table_ maintain the previous MAX_MATCHES as the maximum | |
| 1371 // number of returned results, but with a shape_table_ we want to have room | |
| 1372 // for at least the biggest shape (which might contain hundreds of Indic | |
| 1373 // grapheme fragments) and more, so use double the size of the biggest shape | |
| 1374 // if that is more than the default. | |
| 1375 int max_matches = MAX_MATCHES; | |
| 1376 if (shape_table_ != nullptr) { | |
| 1377 max_matches = shape_table_->MaxNumUnichars() * 2; | |
| 1378 if (max_matches < MAX_MATCHES) { | |
| 1379 max_matches = MAX_MATCHES; | |
| 1380 } | |
| 1381 } | |
| 1382 | |
| 1383 float best_certainty = -FLT_MAX; | |
| 1384 for (auto &it : Results->match) { | |
| 1385 const UnicharRating &result = it; | |
| 1386 bool adapted = result.adapted; | |
| 1387 bool current_is_frag = (unicharset.get_fragment(result.unichar_id) != nullptr); | |
| 1388 if (temp_it.length() + 1 == max_matches && !contains_nonfrag && current_is_frag) { | |
| 1389 continue; // look for a non-fragmented character to fill the | |
| 1390 // last spot in Choices if only fragments are present | |
| 1391 } | |
| 1392 // BlobLength can never legally be 0; if it is, recognition failed. | |
| 1393 // But we must return a classification result because some invoking | |
| 1394 // functions (chopper/permuter) do not anticipate a null blob choice. | |
| 1395 // So we need to assign a poor, but not infinitely bad score. | |
| 1396 if (Results->BlobLength == 0) { | |
| 1397 Certainty = -20; | |
| 1398 Rating = 100; // should be -certainty * real_blob_length | |
| 1399 } else { | |
| 1400 Rating = Certainty = (1.0f - result.rating); | |
| 1401 Rating *= rating_scale * Results->BlobLength; | |
| 1402 Certainty *= -(getDict().certainty_scale); | |
| 1403 } | |
| 1404 // Adapted results, by their very nature, should have good certainty. | |
| 1405 // Those that don't are at best misleading, and often lead to errors, | |
| 1406 // so don't accept adapted results that are too far behind the best result, | |
| 1407 // whether adapted or static. | |
| 1408 // TODO(rays) find some way of automatically tuning these constants. | |
| 1409 if (Certainty > best_certainty) { | |
| 1410 best_certainty = std::min(Certainty, static_cast<float>(classify_adapted_pruning_threshold)); | |
| 1411 } else if (adapted && Certainty / classify_adapted_pruning_factor < best_certainty) { | |
| 1412 continue; // Don't accept bad adapted results. | |
| 1413 } | |
| 1414 | |
| 1415 float min_xheight, max_xheight, yshift; | |
| 1416 denorm.XHeightRange(result.unichar_id, unicharset, box, &min_xheight, &max_xheight, &yshift); | |
| 1417 auto *choice = new BLOB_CHOICE( | |
| 1418 result.unichar_id, Rating, Certainty, unicharset.get_script(result.unichar_id), min_xheight, | |
| 1419 max_xheight, yshift, adapted ? BCC_ADAPTED_CLASSIFIER : BCC_STATIC_CLASSIFIER); | |
| 1420 choice->set_fonts(result.fonts); | |
| 1421 temp_it.add_to_end(choice); | |
| 1422 contains_nonfrag |= !current_is_frag; // update contains_nonfrag | |
| 1423 choices_length++; | |
| 1424 if (choices_length >= max_matches) { | |
| 1425 break; | |
| 1426 } | |
| 1427 } | |
| 1428 Results->match.resize(choices_length); | |
| 1429 } // ConvertMatchesToChoices | |
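// Sketch of the score conversion performed above: a classifier "goodness"
// r in [0, 1] becomes a word-level rating scaled by blob length and a
// negative certainty. The parameters stand in for rating_scale,
// certainty_scale and Results->BlobLength; illustrative only.
static void SketchChoiceScores(float goodness, int blob_length,
                               float rating_scale, float certainty_scale,
                               float *rating, float *certainty) {
  float badness = 1.0f - goodness;            // result.rating is a goodness
  *rating = badness * rating_scale * blob_length;
  *certainty = -(badness * certainty_scale);  // more negative == less certain
}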
| 1430 | |
| 1431 /*---------------------------------------------------------------------------*/ | |
| 1432 #ifndef GRAPHICS_DISABLED | |
| 1433 /** | |
| 1434 * Displays debug information for the static classifier's match on the given blob. | |
| 1435 * @param blob blob whose classification is being debugged | |
| 1436 * @param Results results of match being debugged | |
| 1437 * | |
| 1438 * Globals: none | |
| 1439 */ | |
| 1440 void Classify::DebugAdaptiveClassifier(TBLOB *blob, ADAPT_RESULTS *Results) { | |
| 1441 if (static_classifier_ == nullptr) { | |
| 1442 return; | |
| 1443 } | |
| 1444 INT_FX_RESULT_STRUCT fx_info; | |
| 1445 std::vector<INT_FEATURE_STRUCT> bl_features; | |
| 1446 TrainingSample *sample = BlobToTrainingSample(*blob, false, &fx_info, &bl_features); | |
| 1447 if (sample == nullptr) { | |
| 1448 return; | |
| 1449 } | |
| 1450 static_classifier_->DebugDisplay(*sample, blob->denorm().pix(), Results->best_unichar_id); | |
| 1451 } /* DebugAdaptiveClassifier */ | |
| 1452 #endif | |
| 1453 | |
| 1454 /*---------------------------------------------------------------------------*/ | |
| 1455 /** | |
| 1456 * This routine performs an adaptive classification. | |
| 1457 * If we have not yet adapted to enough classes, a simple | |
| 1458 * classification to the pre-trained templates is performed. | |
| 1459 * Otherwise, we match the blob against the adapted templates. | |
| 1460 * If the adapted templates do not match well, we try a | |
| 1461 * match against the pre-trained templates. If an adapted | |
| 1462 * template match is found, we do a match to any pre-trained | |
| 1463 * templates which could be ambiguous. The results from all | |
| 1464 * of these classifications are merged together into Results. | |
| 1465 * | |
| 1466 * @param Blob blob to be classified | |
| 1467 * @param Results place to put match results | |
| 1468 * | |
| 1469 * Globals: | |
| 1470 * - PreTrainedTemplates built-in training templates | |
| 1471 * - AdaptedTemplates templates adapted for this page | |
| 1472 * - matcher_reliable_adaptive_result rating limit for a great match | |
| 1473 */ | |
| 1474 void Classify::DoAdaptiveMatch(TBLOB *Blob, ADAPT_RESULTS *Results) { | |
| 1475 UNICHAR_ID *Ambiguities; | |
| 1476 | |
| 1477 INT_FX_RESULT_STRUCT fx_info; | |
| 1478 std::vector<INT_FEATURE_STRUCT> bl_features; | |
| 1479 TrainingSample *sample = | |
| 1480 BlobToTrainingSample(*Blob, classify_nonlinear_norm, &fx_info, &bl_features); | |
| 1481 if (sample == nullptr) { | |
| 1482 return; | |
| 1483 } | |
| 1484 | |
| 1485 // TODO: With LSTM, static_classifier_ is nullptr. | |
| 1486 // Return to avoid crash in CharNormClassifier. | |
| 1487 if (static_classifier_ == nullptr) { | |
| 1488 delete sample; | |
| 1489 return; | |
| 1490 } | |
| 1491 | |
| 1492 if (AdaptedTemplates->NumPermClasses < matcher_permanent_classes_min || tess_cn_matching) { | |
| 1493 CharNormClassifier(Blob, *sample, Results); | |
| 1494 } else { | |
| 1495 Ambiguities = BaselineClassifier(Blob, bl_features, fx_info, AdaptedTemplates, Results); | |
| 1496 if ((!Results->match.empty() && | |
| 1497 MarginalMatch(Results->best_rating, matcher_reliable_adaptive_result) && | |
| 1498 !tess_bn_matching) || | |
| 1499 Results->match.empty()) { | |
| 1500 CharNormClassifier(Blob, *sample, Results); | |
| 1501 } else if (Ambiguities && *Ambiguities >= 0 && !tess_bn_matching) { | |
| 1502 AmbigClassifier(bl_features, fx_info, Blob, PreTrainedTemplates, AdaptedTemplates->Class, | |
| 1503 Ambiguities, Results); | |
| 1504 } | |
| 1505 } | |
| 1506 | |
| 1507 // Force the blob to be classified as noise | |
| 1508 // if the results contain only fragments. | |
| 1509 // TODO(daria): verify that this is better than | |
| 1510 // just adding a nullptr classification. | |
| 1511 if (!Results->HasNonfragment || Results->match.empty()) { | |
| 1512 ClassifyAsNoise(Results); | |
| 1513 } | |
| 1514 delete sample; | |
| 1515 } /* DoAdaptiveMatch */ | |
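// Compact sketch of the decision flow in DoAdaptiveMatch above. The booleans
// are stand-ins for the actual tests, the tess_cn_matching/tess_bn_matching
// debug switches are ignored, and the "no adapted matches" case is folded
// into the marginal test; illustrative only.
enum class SketchMatchPath { kStaticOnly, kAdaptedOnly, kAdaptedThenStatic, kAdaptedThenAmbigs };

static SketchMatchPath SketchAdaptiveMatchPath(bool enough_permanent_classes,
                                               bool adapted_match_is_marginal,
                                               bool has_ambiguities) {
  if (!enough_permanent_classes) {
    return SketchMatchPath::kStaticOnly;         // CharNormClassifier only
  }
  if (adapted_match_is_marginal) {
    return SketchMatchPath::kAdaptedThenStatic;  // fall back to CharNormClassifier
  }
  if (has_ambiguities) {
    return SketchMatchPath::kAdaptedThenAmbigs;  // re-check ambiguous classes only
  }
  return SketchMatchPath::kAdaptedOnly;
}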
| 1516 | |
| 1517 /*---------------------------------------------------------------------------*/ | |
| 1518 /** | |
| 1519 * This routine matches blob to the built-in templates | |
| 1520 * to find out if there are any classes other than the correct | |
| 1521 * class which are potential ambiguities. | |
| 1522 * | |
| 1523 * @param Blob blob to get classification ambiguities for | |
| 1524 * @param CorrectClass correct class for Blob | |
| 1525 * | |
| 1526 * Globals: | |
| 1528 * - PreTrainedTemplates built-in templates | |
| 1529 * | |
| 1530 * @return Array of unichar ids of all possible ambiguous classes, terminated by -1. | |
| 1531 */ | |
| 1532 UNICHAR_ID *Classify::GetAmbiguities(TBLOB *Blob, CLASS_ID CorrectClass) { | |
| 1533 auto *Results = new ADAPT_RESULTS(); | |
| 1534 UNICHAR_ID *Ambiguities; | |
| 1535 | |
| 1536 Results->Initialize(); | |
| 1537 INT_FX_RESULT_STRUCT fx_info; | |
| 1538 std::vector<INT_FEATURE_STRUCT> bl_features; | |
| 1539 TrainingSample *sample = | |
| 1540 BlobToTrainingSample(*Blob, classify_nonlinear_norm, &fx_info, &bl_features); | |
| 1541 if (sample == nullptr) { | |
| 1542 delete Results; | |
| 1543 return nullptr; | |
| 1544 } | |
| 1545 | |
| 1546 CharNormClassifier(Blob, *sample, Results); | |
| 1547 delete sample; | |
| 1548 RemoveBadMatches(Results); | |
| 1549 std::sort(Results->match.begin(), Results->match.end(), SortDescendingRating); | |
| 1550 | |
| 1551 /* copy the class ids into an array of ambiguities - don't copy if | |
| 1552 the correct class is the only class id matched */ | |
| 1553 Ambiguities = new UNICHAR_ID[Results->match.size() + 1]; | |
| 1554 if (Results->match.size() > 1 || | |
| 1555 (Results->match.size() == 1 && Results->match[0].unichar_id != CorrectClass)) { | |
| 1556 unsigned i; | |
| 1557 for (i = 0; i < Results->match.size(); i++) { | |
| 1558 Ambiguities[i] = Results->match[i].unichar_id; | |
| 1559 } | |
| 1560 Ambiguities[i] = -1; | |
| 1561 } else { | |
| 1562 Ambiguities[0] = -1; | |
| 1563 } | |
| 1564 | |
| 1565 delete Results; | |
| 1566 return Ambiguities; | |
| 1567 } /* GetAmbiguities */ | |
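// Sketch of the -1 terminated id-array convention used for Ambiguities above:
// producers append a negative sentinel and consumers (such as the printing
// loop in MakePermanent below) walk the array until they reach it. Plain int
// stands in for UNICHAR_ID; illustrative only.
static int SketchTerminatedIdCount(const int *ids) {
  int n = 0;
  while (ids != nullptr && ids[n] >= 0) {
    ++n;  // count entries before the -1 terminator
  }
  return n;
}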
| 1568 | |
| 1569 // Returns true if the given blob looks too dissimilar to any character | |
| 1570 // present in the classifier templates. | |
| 1571 bool Classify::LooksLikeGarbage(TBLOB *blob) { | |
| 1572 auto *ratings = new BLOB_CHOICE_LIST(); | |
| 1573 AdaptiveClassifier(blob, ratings); | |
| 1574 BLOB_CHOICE_IT ratings_it(ratings); | |
| 1575 const UNICHARSET &unicharset = getDict().getUnicharset(); | |
| 1576 if (classify_debug_character_fragments) { | |
| 1577 print_ratings_list("======================\nLooksLikeGarbage() got ", ratings, unicharset); | |
| 1578 } | |
| 1579 for (ratings_it.mark_cycle_pt(); !ratings_it.cycled_list(); ratings_it.forward()) { | |
| 1580 if (unicharset.get_fragment(ratings_it.data()->unichar_id()) != nullptr) { | |
| 1581 continue; | |
| 1582 } | |
| 1583 float certainty = ratings_it.data()->certainty(); | |
| 1584 delete ratings; | |
| 1585 return certainty < classify_character_fragments_garbage_certainty_threshold; | |
| 1586 } | |
| 1587 delete ratings; | |
| 1588 return true; // no whole characters in ratings | |
| 1589 } | |
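// Sketch of the garbage test above: fragment classifications are skipped and
// the certainty of the first whole-character choice is compared against a
// threshold; with no whole characters at all the blob is treated as garbage.
// The struct is a stand-in for BLOB_CHOICE; illustrative only.
struct SketchChoice {
  bool is_fragment;
  float certainty;
};

static bool SketchLooksLikeGarbage(const std::vector<SketchChoice> &choices,
                                   float garbage_certainty_threshold) {
  for (const auto &c : choices) {
    if (c.is_fragment) {
      continue;  // fragments are not evidence of a real character
    }
    return c.certainty < garbage_certainty_threshold;
  }
  return true;  // no whole characters in the ratings
}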
| 1590 | |
| 1591 /*---------------------------------------------------------------------------*/ | |
| 1592 /** | |
| 1593 * This routine builds the character normalization feature | |
| 1594 * from the integer feature extractor results for this blob | |
| 1595 * (vertical position, outline length and second moments). | |
| 1596 * | |
| 1597 * It then fills the char norm and pruner norm arrays | |
| 1598 * provided by the caller with the per-unichar and per-class | |
| 1599 * adjustments derived from that feature. | |
| 1602 * | |
| 1603 * @param templates used to compute char norm adjustments | |
| 1604 * @param pruner_norm_array array to fill with per-class pruner norm | |
| 1605 * adjustments | |
| 1606 * @param char_norm_array array to fill with per-unichar char norm adjustments | |
| 1607 * @param fx_info feature extraction results for the blob | |
| 1608 * | |
| 1609 * Globals: | |
| 1610 * | |
| 1611 * @return Number of features extracted or 0 if an error occurred. | |
| 1612 */ | |
| 1613 int Classify::GetCharNormFeature(const INT_FX_RESULT_STRUCT &fx_info, INT_TEMPLATES_STRUCT *templates, | |
| 1614 uint8_t *pruner_norm_array, uint8_t *char_norm_array) { | |
| 1615 auto norm_feature = new FEATURE_STRUCT(&CharNormDesc); | |
| 1616 float baseline = kBlnBaselineOffset; | |
| 1617 float scale = MF_SCALE_FACTOR; | |
| 1618 norm_feature->Params[CharNormY] = (fx_info.Ymean - baseline) * scale; | |
| 1619 norm_feature->Params[CharNormLength] = fx_info.Length * scale / LENGTH_COMPRESSION; | |
| 1620 norm_feature->Params[CharNormRx] = fx_info.Rx * scale; | |
| 1621 norm_feature->Params[CharNormRy] = fx_info.Ry * scale; | |
| 1622 // Deletes norm_feature. | |
| 1623 ComputeCharNormArrays(norm_feature, templates, char_norm_array, pruner_norm_array); | |
| 1624 return IntCastRounded(fx_info.Length / kStandardFeatureLength); | |
| 1625 } /* GetCharNormFeature */ | |
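// Sketch of the character normalization feature built above: the blob's mean
// vertical position, outline length and second moments are rescaled into the
// classifier's normalized space. The parameters stand in for
// kBlnBaselineOffset, MF_SCALE_FACTOR and LENGTH_COMPRESSION; illustrative only.
struct SketchCharNormParams {
  float y, length, rx, ry;
};

static SketchCharNormParams SketchBuildCharNorm(float ymean, float length,
                                                float rx, float ry,
                                                float baseline, float scale,
                                                float length_compression) {
  SketchCharNormParams p;
  p.y = (ymean - baseline) * scale;
  p.length = length * scale / length_compression;
  p.rx = rx * scale;
  p.ry = ry * scale;
  return p;
}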
| 1626 | |
| 1627 // Computes the char_norm_array for the unicharset and, if not nullptr, the | |
| 1628 // pruner_array as appropriate according to the existence of the shape_table. | |
| 1629 void Classify::ComputeCharNormArrays(FEATURE_STRUCT *norm_feature, INT_TEMPLATES_STRUCT *templates, | |
| 1630 uint8_t *char_norm_array, uint8_t *pruner_array) { | |
| 1631 ComputeIntCharNormArray(*norm_feature, char_norm_array); | |
| 1632 //if (pruner_array != nullptr) { | |
| 1633 if (shape_table_ == nullptr) { | |
| 1634 ComputeIntCharNormArray(*norm_feature, pruner_array); | |
| 1635 } else { | |
| 1636 memset(&pruner_array[0], UINT8_MAX, templates->NumClasses * sizeof(pruner_array[0])); | |
| 1637 // Each entry in the pruner norm array is the MIN of all the entries of | |
| 1638 // the corresponding unichars in the CharNormArray. | |
| 1639 for (unsigned id = 0; id < templates->NumClasses; ++id) { | |
| 1640 int font_set_id = templates->Class[id]->font_set_id; | |
| 1641 const FontSet &fs = fontset_table_.at(font_set_id); | |
| 1642 for (auto f : fs) { | |
| 1643 const Shape &shape = shape_table_->GetShape(f); | |
| 1644 for (int c = 0; c < shape.size(); ++c) { | |
| 1645 if (char_norm_array[shape[c].unichar_id] < pruner_array[id]) { | |
| 1646 pruner_array[id] = char_norm_array[shape[c].unichar_id]; | |
| 1647 } | |
| 1648 } | |
| 1649 } | |
| 1650 } | |
| 1651 } | |
| 1652 //} | |
| 1653 delete norm_feature; | |
| 1654 } | |
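// Sketch of the pruner-array computation above when a shape table is present:
// each class entry is the minimum char-norm value over all the unichars
// covered by that class's shapes. Plain vectors stand in for the Tesseract
// font-set and shape tables; illustrative only.
static std::vector<uint8_t> SketchPrunerNormArray(
    const std::vector<uint8_t> &char_norm_array,                   // per unichar
    const std::vector<std::vector<int>> &unichar_ids_per_class) {  // per class
  std::vector<uint8_t> pruner(unichar_ids_per_class.size(), UINT8_MAX);
  for (size_t id = 0; id < unichar_ids_per_class.size(); ++id) {
    for (int unichar_id : unichar_ids_per_class[id]) {
      pruner[id] = std::min(pruner[id], char_norm_array[unichar_id]);
    }
  }
  return pruner;
}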
| 1655 | |
| 1656 /*---------------------------------------------------------------------------*/ | |
| 1657 /** | |
| 1658 * Creates a new temporary config for ClassId in Templates from the given features. | |
| 1659 * @param Templates adapted templates to add new config to | |
| 1660 * @param ClassId class id to associate with new config | |
| 1661 * @param FontinfoId font information inferred from pre-trained templates | |
| 1662 * @param NumFeatures number of features in IntFeatures | |
| 1663 * @param Features features describing model for new config | |
| 1664 * @param FloatFeatures floating-pt representation of features | |
| 1665 * | |
| 1666 * @return The id of the new config created, a negative integer in | |
| 1667 * case of error. | |
| 1668 */ | |
| 1669 int Classify::MakeNewTemporaryConfig(ADAPT_TEMPLATES_STRUCT *Templates, CLASS_ID ClassId, int FontinfoId, | |
| 1670 int NumFeatures, INT_FEATURE_ARRAY Features, | |
| 1671 FEATURE_SET FloatFeatures) { | |
| 1672 INT_CLASS_STRUCT *IClass; | |
| 1673 ADAPT_CLASS_STRUCT *Class; | |
| 1674 PROTO_ID OldProtos[MAX_NUM_PROTOS]; | |
| 1675 FEATURE_ID BadFeatures[MAX_NUM_INT_FEATURES]; | |
| 1676 int NumOldProtos; | |
| 1677 int NumBadFeatures; | |
| 1678 int MaxProtoId, OldMaxProtoId; | |
| 1679 int MaskSize; | |
| 1680 int ConfigId; | |
| 1681 int i; | |
| 1682 int debug_level = NO_DEBUG; | |
| 1683 | |
| 1684 if (classify_learning_debug_level >= 3) { | |
| 1685 debug_level = PRINT_MATCH_SUMMARY | PRINT_FEATURE_MATCHES | PRINT_PROTO_MATCHES; | |
| 1686 } | |
| 1687 | |
| 1688 IClass = ClassForClassId(Templates->Templates, ClassId); | |
| 1689 Class = Templates->Class[ClassId]; | |
| 1690 | |
| 1691 if (IClass->NumConfigs >= MAX_NUM_CONFIGS) { | |
| 1692 ++NumAdaptationsFailed; | |
| 1693 if (classify_learning_debug_level >= 1) { | |
| 1694 tprintf("Cannot make new temporary config: maximum number exceeded.\n"); | |
| 1695 } | |
| 1696 return -1; | |
| 1697 } | |
| 1698 | |
| 1699 OldMaxProtoId = IClass->NumProtos - 1; | |
| 1700 | |
| 1701 NumOldProtos = im_.FindGoodProtos(IClass, AllProtosOn, AllConfigsOff, NumFeatures, Features, | |
| 1702 OldProtos, classify_adapt_proto_threshold, debug_level); | |
| 1703 | |
| 1704 MaskSize = WordsInVectorOfSize(MAX_NUM_PROTOS); | |
| 1705 zero_all_bits(TempProtoMask, MaskSize); | |
| 1706 for (i = 0; i < NumOldProtos; i++) { | |
| 1707 SET_BIT(TempProtoMask, OldProtos[i]); | |
| 1708 } | |
| 1709 | |
| 1710 NumBadFeatures = im_.FindBadFeatures(IClass, TempProtoMask, AllConfigsOn, NumFeatures, Features, | |
| 1711 BadFeatures, classify_adapt_feature_threshold, debug_level); | |
| 1712 | |
| 1713 MaxProtoId = | |
| 1714 MakeNewTempProtos(FloatFeatures, NumBadFeatures, BadFeatures, IClass, Class, TempProtoMask); | |
| 1715 if (MaxProtoId == NO_PROTO) { | |
| 1716 ++NumAdaptationsFailed; | |
| 1717 if (classify_learning_debug_level >= 1) { | |
| 1718 tprintf("Cannot make new temp protos: maximum number exceeded.\n"); | |
| 1719 } | |
| 1720 return -1; | |
| 1721 } | |
| 1722 | |
| 1723 ConfigId = AddIntConfig(IClass); | |
| 1724 ConvertConfig(TempProtoMask, ConfigId, IClass); | |
| 1725 auto Config = new TEMP_CONFIG_STRUCT(MaxProtoId, FontinfoId); | |
| 1726 TempConfigFor(Class, ConfigId) = Config; | |
| 1727 copy_all_bits(TempProtoMask, Config->Protos, Config->ProtoVectorSize); | |
| 1728 | |
| 1729 if (classify_learning_debug_level >= 1) { | |
| 1730 tprintf( | |
| 1731 "Making new temp config %d fontinfo id %d" | |
| 1732 " using %d old and %d new protos.\n", | |
| 1733 ConfigId, Config->FontinfoId, NumOldProtos, MaxProtoId - OldMaxProtoId); | |
| 1734 } | |
| 1735 | |
| 1736 return ConfigId; | |
| 1737 } /* MakeNewTemporaryConfig */ | |
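// Sketch of the bit-vector bookkeeping above: the set of usable protos is
// kept in a packed mask (TempProtoMask), good protos are switched on, and
// later code tests individual bits. Plain uint32_t words stand in for
// BIT_VECTOR and the SET_BIT/test_bit macros; illustrative only.
static void SketchSetBit(std::vector<uint32_t> *mask, int bit) {
  (*mask)[bit / 32] |= 1u << (bit % 32);
}

static bool SketchTestBit(const std::vector<uint32_t> &mask, int bit) {
  return (mask[bit / 32] & (1u << (bit % 32))) != 0;
}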
| 1738 | |
| 1739 /*---------------------------------------------------------------------------*/ | |
| 1740 /** | |
| 1741 * This routine finds sets of sequential bad features | |
| 1742 * that all have the same angle and converts each set into | |
| 1743 * a new temporary proto. The temp proto is added to the | |
| 1744 * proto pruner for IClass, pushed onto the list of temp | |
| 1745 * protos in Class, and added to TempProtoMask. | |
| 1746 * | |
| 1747 * @param Features floating-pt features describing new character | |
| 1748 * @param NumBadFeat number of bad features to turn into protos | |
| 1749 * @param BadFeat feature id's of bad features | |
| 1750 * @param IClass integer class templates to add new protos to | |
| 1751 * @param Class adapted class templates to add new protos to | |
| 1752 * @param TempProtoMask proto mask to add new protos to | |
| 1753 * | |
| 1754 * Globals: none | |
| 1755 * | |
| 1756 * @return Max proto id in class after all protos have been added. | |
| 1757 */ | |
| 1758 PROTO_ID Classify::MakeNewTempProtos(FEATURE_SET Features, int NumBadFeat, FEATURE_ID BadFeat[], | |
| 1759 INT_CLASS_STRUCT *IClass, ADAPT_CLASS_STRUCT *Class, | |
| 1760 BIT_VECTOR TempProtoMask) { | |
| 1761 FEATURE_ID *ProtoStart; | |
| 1762 FEATURE_ID *ProtoEnd; | |
| 1763 FEATURE_ID *LastBad; | |
| 1764 PROTO_STRUCT *Proto; | |
| 1765 FEATURE F1, F2; | |
| 1766 float X1, X2, Y1, Y2; | |
| 1767 float A1, A2, AngleDelta; | |
| 1768 float SegmentLength; | |
| 1769 PROTO_ID Pid; | |
| 1770 | |
| 1771 for (ProtoStart = BadFeat, LastBad = ProtoStart + NumBadFeat; ProtoStart < LastBad; | |
| 1772 ProtoStart = ProtoEnd) { | |
| 1773 F1 = Features->Features[*ProtoStart]; | |
| 1774 X1 = F1->Params[PicoFeatX]; | |
| 1775 Y1 = F1->Params[PicoFeatY]; | |
| 1776 A1 = F1->Params[PicoFeatDir]; | |
| 1777 | |
| 1778 for (ProtoEnd = ProtoStart + 1, SegmentLength = GetPicoFeatureLength(); ProtoEnd < LastBad; | |
| 1779 ProtoEnd++, SegmentLength += GetPicoFeatureLength()) { | |
| 1780 F2 = Features->Features[*ProtoEnd]; | |
| 1781 X2 = F2->Params[PicoFeatX]; | |
| 1782 Y2 = F2->Params[PicoFeatY]; | |
| 1783 A2 = F2->Params[PicoFeatDir]; | |
| 1784 | |
| 1785 AngleDelta = std::fabs(A1 - A2); | |
| 1786 if (AngleDelta > 0.5f) { | |
| 1787 AngleDelta = 1 - AngleDelta; | |
| 1788 } | |
| 1789 | |
| 1790 if (AngleDelta > matcher_clustering_max_angle_delta || std::fabs(X1 - X2) > SegmentLength || | |
| 1791 std::fabs(Y1 - Y2) > SegmentLength) { | |
| 1792 break; | |
| 1793 } | |
| 1794 } | |
| 1795 | |
| 1796 F2 = Features->Features[*(ProtoEnd - 1)]; | |
| 1797 X2 = F2->Params[PicoFeatX]; | |
| 1798 Y2 = F2->Params[PicoFeatY]; | |
| 1799 A2 = F2->Params[PicoFeatDir]; | |
| 1800 | |
| 1801 Pid = AddIntProto(IClass); | |
| 1802 if (Pid == NO_PROTO) { | |
| 1803 return (NO_PROTO); | |
| 1804 } | |
| 1805 | |
| 1806 auto TempProto = new TEMP_PROTO_STRUCT; | |
| 1807 Proto = &(TempProto->Proto); | |
| 1808 | |
| 1809 /* compute proto params - NOTE that Y_DIM_OFFSET must be used because | |
| 1810 ConvertProto assumes that the Y dimension varies from -0.5 to 0.5 | |
| 1811 instead of the -0.25 to 0.75 used in baseline normalization */ | |
| 1812 Proto->Length = SegmentLength; | |
| 1813 Proto->Angle = A1; | |
| 1814 Proto->X = (X1 + X2) / 2; | |
| 1815 Proto->Y = (Y1 + Y2) / 2 - Y_DIM_OFFSET; | |
| 1816 FillABC(Proto); | |
| 1817 | |
| 1818 TempProto->ProtoId = Pid; | |
| 1819 SET_BIT(TempProtoMask, Pid); | |
| 1820 | |
| 1821 ConvertProto(Proto, Pid, IClass); | |
| 1822 AddProtoToProtoPruner(Proto, Pid, IClass, classify_learning_debug_level >= 2); | |
| 1823 | |
| 1824 Class->TempProtos = push(Class->TempProtos, TempProto); | |
| 1825 } | |
| 1826 return IClass->NumProtos - 1; | |
| 1827 } /* MakeNewTempProtos */ | |
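// Sketch of the angle test above: pico-feature directions are normalized to
// [0, 1), so the raw difference is wrapped to at most 0.5 before comparing
// with the clustering limit (e.g. directions 0.95 and 0.05 differ by 0.1,
// not 0.9). Illustrative only.
static bool SketchSameDirection(float a1, float a2, float max_angle_delta) {
  float delta = std::fabs(a1 - a2);
  if (delta > 0.5f) {
    delta = 1.0f - delta;  // wrap around the circular direction space
  }
  return delta <= max_angle_delta;
}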
| 1828 | |
| 1829 /*---------------------------------------------------------------------------*/ | |
| 1830 /** | |
| 1831 * Makes the temporary config ConfigId of class ClassId permanent. | |
| 1832 * @param Templates current set of adaptive templates | |
| 1833 * @param ClassId class containing config to be made permanent | |
| 1834 * @param ConfigId config to be made permanent | |
| 1835 * @param Blob current blob being adapted to | |
| 1836 * | |
| 1837 * Globals: none | |
| 1838 */ | |
| 1839 void Classify::MakePermanent(ADAPT_TEMPLATES_STRUCT *Templates, CLASS_ID ClassId, int ConfigId, | |
| 1840 TBLOB *Blob) { | |
| 1841 UNICHAR_ID *Ambigs; | |
| 1842 PROTO_KEY ProtoKey; | |
| 1843 | |
| 1844 auto Class = Templates->Class[ClassId]; | |
| 1845 auto Config = TempConfigFor(Class, ConfigId); | |
| 1846 | |
| 1847 MakeConfigPermanent(Class, ConfigId); | |
| 1848 if (Class->NumPermConfigs == 0) { | |
| 1849 Templates->NumPermClasses++; | |
| 1850 } | |
| 1851 Class->NumPermConfigs++; | |
| 1852 | |
| 1853 // Initialize permanent config. | |
| 1854 Ambigs = GetAmbiguities(Blob, ClassId); | |
| 1855 auto Perm = new PERM_CONFIG_STRUCT; | |
| 1856 Perm->Ambigs = Ambigs; | |
| 1857 Perm->FontinfoId = Config->FontinfoId; | |
| 1858 | |
| 1859 // Free memory associated with temporary config (since ADAPTED_CONFIG | |
| 1860 // is a union we need to clean up before we record permanent config). | |
| 1861 ProtoKey.Templates = Templates; | |
| 1862 ProtoKey.ClassId = ClassId; | |
| 1863 ProtoKey.ConfigId = ConfigId; | |
| 1864 Class->TempProtos = delete_d(Class->TempProtos, &ProtoKey, MakeTempProtoPerm); | |
| 1865 delete Config; | |
| 1866 | |
| 1867 // Record permanent config. | |
| 1868 PermConfigFor(Class, ConfigId) = Perm; | |
| 1869 | |
| 1870 if (classify_learning_debug_level >= 1) { | |
| 1871 tprintf( | |
| 1872 "Making config %d for %s (ClassId %d) permanent:" | |
| 1873 " fontinfo id %d, ambiguities '", | |
| 1874 ConfigId, getDict().getUnicharset().debug_str(ClassId).c_str(), ClassId, | |
| 1875 PermConfigFor(Class, ConfigId)->FontinfoId); | |
| 1876 for (UNICHAR_ID *AmbigsPointer = Ambigs; *AmbigsPointer >= 0; ++AmbigsPointer) { | |
| 1877 tprintf("%s", unicharset.id_to_unichar(*AmbigsPointer)); | |
| 1878 } | |
| 1879 tprintf("'.\n"); | |
| 1880 } | |
| 1881 } /* MakePermanent */ | |
| 1882 | |
| 1883 /*---------------------------------------------------------------------------*/ | |
| 1884 /** | |
| 1885 * This routine converts TempProto to be permanent if | |
| 1886 * its proto id is used by the configuration specified in | |
| 1887 * ProtoKey. | |
| 1888 * | |
| 1889 * @param item1 (TEMP_PROTO) temporary proto to compare to key | |
| 1890 * @param item2 (PROTO_KEY) defines which protos to make permanent | |
| 1891 * | |
| 1892 * Globals: none | |
| 1893 * | |
| 1894 * @return true if TempProto is converted, false otherwise | |
| 1895 */ | |
| 1896 int MakeTempProtoPerm(void *item1, void *item2) { | |
| 1897 auto TempProto = static_cast<TEMP_PROTO_STRUCT *>(item1); | |
| 1898 auto ProtoKey = static_cast<PROTO_KEY *>(item2); | |
| 1899 | |
| 1900 auto Class = ProtoKey->Templates->Class[ProtoKey->ClassId]; | |
| 1901 auto Config = TempConfigFor(Class, ProtoKey->ConfigId); | |
| 1902 | |
| 1903 if (TempProto->ProtoId > Config->MaxProtoId || !test_bit(Config->Protos, TempProto->ProtoId)) { | |
| 1904 return false; | |
| 1905 } | |
| 1906 | |
| 1907 MakeProtoPermanent(Class, TempProto->ProtoId); | |
| 1908 AddProtoToClassPruner(&(TempProto->Proto), ProtoKey->ClassId, ProtoKey->Templates->Templates); | |
| 1909 delete TempProto; | |
| 1910 | |
| 1911 return true; | |
| 1912 } /* MakeTempProtoPerm */ | |
| 1913 | |
| 1914 /*---------------------------------------------------------------------------*/ | |
| 1915 /** | |
| 1916 * This routine prints the matches in results to the debug output. | |
| 1917 * | |
| 1918 * @param results match results to print | |
| 1919 * | |
| 1920 * Globals: none | |
| 1921 */ | |
| 1922 void Classify::PrintAdaptiveMatchResults(const ADAPT_RESULTS &results) { | |
| 1923 for (auto &it : results.match) { | |
| 1924 tprintf("%s ", unicharset.debug_str(it.unichar_id).c_str()); | |
| 1925 it.Print(); | |
| 1926 } | |
| 1927 } /* PrintAdaptiveMatchResults */ | |
| 1928 | |
| 1929 /*---------------------------------------------------------------------------*/ | |
| 1930 /** | |
| 1931 * This routine steps through each matching class in Results | |
| 1932 * and removes it from the match list if its rating falls | |
| 1933 * more than matcher_bad_match_pad below the best rating. In other words, | |
| 1934 * all good matches get moved to the front of the classes | |
| 1935 * array. | |
| 1936 * | |
| 1937 * @param Results contains matches to be filtered | |
| 1938 * | |
| 1939 * Globals: | |
| 1940 * - matcher_bad_match_pad defines a "bad match" | |
| 1941 */ | |
| 1942 void Classify::RemoveBadMatches(ADAPT_RESULTS *Results) { | |
| 1943 unsigned Next, NextGood; | |
| 1944 float BadMatchThreshold; | |
| 1945 static const char *romans = "i v x I V X"; | |
| 1946 BadMatchThreshold = Results->best_rating - matcher_bad_match_pad; | |
| 1947 | |
| 1948 if (classify_bln_numeric_mode) { | |
| 1949 UNICHAR_ID unichar_id_one = | |
| 1950 unicharset.contains_unichar("1") ? unicharset.unichar_to_id("1") : -1; | |
| 1951 UNICHAR_ID unichar_id_zero = | |
| 1952 unicharset.contains_unichar("0") ? unicharset.unichar_to_id("0") : -1; | |
| 1953 float scored_one = ScoredUnichar(unichar_id_one, *Results); | |
| 1954 float scored_zero = ScoredUnichar(unichar_id_zero, *Results); | |
| 1955 | |
| 1956 for (Next = NextGood = 0; Next < Results->match.size(); Next++) { | |
| 1957 const UnicharRating &match = Results->match[Next]; | |
| 1958 if (match.rating >= BadMatchThreshold) { | |
| 1959 if (!unicharset.get_isalpha(match.unichar_id) || | |
| 1960 strstr(romans, unicharset.id_to_unichar(match.unichar_id)) != nullptr) { | |
| 1961 } else if (unicharset.eq(match.unichar_id, "l") && scored_one < BadMatchThreshold) { | |
| 1962 Results->match[Next].unichar_id = unichar_id_one; | |
| 1963 } else if (unicharset.eq(match.unichar_id, "O") && scored_zero < BadMatchThreshold) { | |
| 1964 Results->match[Next].unichar_id = unichar_id_zero; | |
| 1965 } else { | |
| 1966 Results->match[Next].unichar_id = INVALID_UNICHAR_ID; // Don't copy. | |
| 1967 } | |
| 1968 if (Results->match[Next].unichar_id != INVALID_UNICHAR_ID) { | |
| 1969 if (NextGood == Next) { | |
| 1970 ++NextGood; | |
| 1971 } else { | |
| 1972 Results->match[NextGood++] = Results->match[Next]; | |
| 1973 } | |
| 1974 } | |
| 1975 } | |
| 1976 } | |
| 1977 } else { | |
| 1978 for (Next = NextGood = 0; Next < Results->match.size(); Next++) { | |
| 1979 if (Results->match[Next].rating >= BadMatchThreshold) { | |
| 1980 if (NextGood == Next) { | |
| 1981 ++NextGood; | |
| 1982 } else { | |
| 1983 Results->match[NextGood++] = Results->match[Next]; | |
| 1984 } | |
| 1985 } | |
| 1986 } | |
| 1987 } | |
| 1988 Results->match.resize(NextGood); | |
| 1989 } /* RemoveBadMatches */ | |
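// Sketch of the in-place compaction used above (and again in RemoveExtraPuncs
// below): surviving elements are copied forward over rejected ones and the
// vector is truncated, avoiding any extra allocation. Illustrative only.
static void SketchFilterInPlace(std::vector<float> *ratings, float threshold) {
  size_t next_good = 0;
  for (size_t next = 0; next < ratings->size(); ++next) {
    if ((*ratings)[next] >= threshold) {
      if (next_good != next) {
        (*ratings)[next_good] = (*ratings)[next];  // shift the keeper forward
      }
      ++next_good;
    }
  }
  ratings->resize(next_good);
}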
| 1990 | |
| 1991 /*----------------------------------------------------------------------------*/ | |
| 1992 /** | |
| 1993 * This routine discards extra digits or punctuation from the results. | |
| 1994 * We keep only the top 2 punctuation answers and the top 1 digit answer if | |
| 1995 * present. | |
| 1996 * | |
| 1997 * @param Results contains matches to be filtered | |
| 1998 */ | |
| 1999 void Classify::RemoveExtraPuncs(ADAPT_RESULTS *Results) { | |
| 2000 unsigned Next, NextGood; | |
| 2001 int punc_count; /* number of punctuation matches seen so far */ | |
| 2002 int digit_count; /* number of digit matches seen so far */ | |
| 2004 static char punc_chars[] = ". , ; : / ` ~ ' - = \\ | \" ! _ ^"; | |
| 2005 static char digit_chars[] = "0 1 2 3 4 5 6 7 8 9"; | |
| 2006 | |
| 2007 punc_count = 0; | |
| 2008 digit_count = 0; | |
| 2009 for (Next = NextGood = 0; Next < Results->match.size(); Next++) { | |
| 2010 const UnicharRating &match = Results->match[Next]; | |
| 2011 bool keep = true; | |
| 2012 if (strstr(punc_chars, unicharset.id_to_unichar(match.unichar_id)) != nullptr) { | |
| 2013 if (punc_count >= 2) { | |
| 2014 keep = false; | |
| 2015 } | |
| 2016 punc_count++; | |
| 2017 } else { | |
| 2018 if (strstr(digit_chars, unicharset.id_to_unichar(match.unichar_id)) != nullptr) { | |
| 2019 if (digit_count >= 1) { | |
| 2020 keep = false; | |
| 2021 } | |
| 2022 digit_count++; | |
| 2023 } | |
| 2024 } | |
| 2025 if (keep) { | |
| 2026 if (NextGood == Next) { | |
| 2027 ++NextGood; | |
| 2028 } else { | |
| 2029 Results->match[NextGood++] = match; | |
| 2030 } | |
| 2031 } | |
| 2032 } | |
| 2033 Results->match.resize(NextGood); | |
| 2034 } /* RemoveExtraPuncs */ | |
| 2035 | |
| 2036 /*---------------------------------------------------------------------------*/ | |
| 2037 /** | |
| 2038 * This routine resets the internal thresholds inside | |
| 2039 * the integer matcher to correspond to the specified | |
| 2040 * threshold. | |
| 2041 * | |
| 2042 * @param Threshold threshold for creating new templates | |
| 2043 * | |
| 2044 * Globals: | |
| 2045 * - matcher_good_threshold default good match rating | |
| 2046 */ | |
| 2047 void Classify::SetAdaptiveThreshold(float Threshold) { | |
| 2048 Threshold = (Threshold == matcher_good_threshold) ? 0.9f : (1 - Threshold); | |
| 2049 classify_adapt_proto_threshold.set_value(ClipToRange<int>(255 * Threshold, 0, 255)); | |
| 2050 classify_adapt_feature_threshold.set_value(ClipToRange<int>(255 * Threshold, 0, 255)); | |
| 2051 } /* SetAdaptiveThreshold */ | |
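// Sketch of the mapping above: a match-rating threshold in [0, 1] becomes an
// integer proto/feature threshold in [0, 255], with the default good-match
// threshold pinned to 0.9. The clipping mirrors ClipToRange<int>; illustrative
// only.
static int SketchAdaptiveThreshold(float threshold, float good_threshold) {
  float t = (threshold == good_threshold) ? 0.9f : (1.0f - threshold);
  int value = static_cast<int>(255 * t);
  if (value < 0) {
    value = 0;
  }
  if (value > 255) {
    value = 255;
  }
  return value;
}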
| 2052 | |
| 2053 #ifndef GRAPHICS_DISABLED | |
| 2054 | |
| 2055 /*---------------------------------------------------------------------------*/ | |
| 2056 /** | |
| 2057 * This routine displays debug information for the best config | |
| 2058 * of the given shape_id for the given set of features. | |
| 2059 * | |
| 2060 * @param shape_id classifier id to work with | |
| 2061 * @param features features of the unknown character | |
| 2062 * @param num_features Number of features in the features array. | |
| 2063 */ | |
| 2064 | |
| 2065 void Classify::ShowBestMatchFor(int shape_id, const INT_FEATURE_STRUCT *features, | |
| 2066 int num_features) { | |
| 2067 uint32_t config_mask; | |
| 2068 if (UnusedClassIdIn(PreTrainedTemplates, shape_id)) { | |
| 2069 tprintf("No built-in templates for class/shape %d\n", shape_id); | |
| 2070 return; | |
| 2071 } | |
| 2072 if (num_features <= 0) { | |
| 2073 tprintf("Illegal blob (char norm features)!\n"); | |
| 2074 return; | |
| 2075 } | |
| 2076 UnicharRating cn_result; | |
| 2077 classify_norm_method.set_value(character); | |
| 2078 im_.Match(ClassForClassId(PreTrainedTemplates, shape_id), AllProtosOn, AllConfigsOn, num_features, | |
| 2079 features, &cn_result, classify_adapt_feature_threshold, NO_DEBUG, | |
| 2080 matcher_debug_separate_windows); | |
| 2081 tprintf("\n"); | |
| 2082 config_mask = 1 << cn_result.config; | |
| 2083 | |
| 2084 tprintf("Static Shape ID: %d\n", shape_id); | |
| 2085 ShowMatchDisplay(); | |
| 2086 im_.Match(ClassForClassId(PreTrainedTemplates, shape_id), AllProtosOn, &config_mask, num_features, | |
| 2087 features, &cn_result, classify_adapt_feature_threshold, matcher_debug_flags, | |
| 2088 matcher_debug_separate_windows); | |
| 2089 UpdateMatchDisplay(); | |
| 2090 } /* ShowBestMatchFor */ | |
| 2091 | |
| 2092 #endif // !GRAPHICS_DISABLED | |
| 2093 | |
| 2094 // Returns a string for the classifier class_id: either the corresponding | |
| 2095 // unicharset debug_str or the shape_table_ debug str. | |
| 2096 std::string Classify::ClassIDToDebugStr(const INT_TEMPLATES_STRUCT *templates, int class_id, | |
| 2097 int config_id) const { | |
| 2098 std::string class_string; | |
| 2099 if (templates == PreTrainedTemplates && shape_table_ != nullptr) { | |
| 2100 int shape_id = ClassAndConfigIDToFontOrShapeID(class_id, config_id); | |
| 2101 class_string = shape_table_->DebugStr(shape_id); | |
| 2102 } else { | |
| 2103 class_string = unicharset.debug_str(class_id); | |
| 2104 } | |
| 2105 return class_string; | |
| 2106 } | |
| 2107 | |
| 2108 // Converts a classifier class_id and config to a font id or shape_table_ index. | |
| 2109 int Classify::ClassAndConfigIDToFontOrShapeID(int class_id, int int_result_config) const { | |
| 2110 int font_set_id = PreTrainedTemplates->Class[class_id]->font_set_id; | |
| 2111 // Older inttemps have no font_ids. | |
| 2112 if (font_set_id < 0) { | |
| 2113 return kBlankFontinfoId; | |
| 2114 } | |
| 2115 const FontSet &fs = fontset_table_.at(font_set_id); | |
| 2116 return fs.at(int_result_config); | |
| 2117 } | |
| 2118 | |
| 2119 // Converts a shape_table_ index to a classifier class_id index (not a | |
| 2120 // unichar-id!). Uses a search, so not fast. | |
| 2121 int Classify::ShapeIDToClassID(int shape_id) const { | |
| 2122 for (unsigned id = 0; id < PreTrainedTemplates->NumClasses; ++id) { | |
| 2123 int font_set_id = PreTrainedTemplates->Class[id]->font_set_id; | |
| 2124 ASSERT_HOST(font_set_id >= 0); | |
| 2125 const FontSet &fs = fontset_table_.at(font_set_id); | |
| 2126 for (auto f : fs) { | |
| 2127 if (f == shape_id) { | |
| 2128 return id; | |
| 2129 } | |
| 2130 } | |
| 2131 } | |
| 2132 tprintf("Shape %d not found\n", shape_id); | |
| 2133 return -1; | |
| 2134 } | |
| 2135 | |
| 2136 // Returns true if the given TEMP_CONFIG_STRUCT is good enough to make it | |
| 2137 // a permanent config. | |
| 2138 bool Classify::TempConfigReliable(CLASS_ID class_id, const TEMP_CONFIG_STRUCT *config) { | |
| 2139 if (classify_learning_debug_level >= 1) { | |
| 2140 tprintf("NumTimesSeen for config of %s is %d\n", | |
| 2141 getDict().getUnicharset().debug_str(class_id).c_str(), config->NumTimesSeen); | |
| 2142 } | |
| 2143 if (config->NumTimesSeen >= matcher_sufficient_examples_for_prototyping) { | |
| 2144 return true; | |
| 2145 } else if (config->NumTimesSeen < matcher_min_examples_for_prototyping) { | |
| 2146 return false; | |
| 2147 } else if (use_ambigs_for_adaption) { | |
| 2148 // Go through the ambigs vector and see whether we have already seen | |
| 2149 // enough times all the characters represented by the ambigs vector. | |
| 2150 const UnicharIdVector *ambigs = getDict().getUnicharAmbigs().AmbigsForAdaption(class_id); | |
| 2151 int ambigs_size = (ambigs == nullptr) ? 0 : ambigs->size(); | |
| 2152 for (int ambig = 0; ambig < ambigs_size; ++ambig) { | |
| 2153 ADAPT_CLASS_STRUCT *ambig_class = AdaptedTemplates->Class[(*ambigs)[ambig]]; | |
| 2154 assert(ambig_class != nullptr); | |
| 2155 if (ambig_class->NumPermConfigs == 0 && | |
| 2156 ambig_class->MaxNumTimesSeen < matcher_min_examples_for_prototyping) { | |
| 2157 if (classify_learning_debug_level >= 1) { | |
| 2158 tprintf( | |
| 2159 "Ambig %s has not been seen enough times," | |
| 2160 " not making config for %s permanent\n", | |
| 2161 getDict().getUnicharset().debug_str((*ambigs)[ambig]).c_str(), | |
| 2162 getDict().getUnicharset().debug_str(class_id).c_str()); | |
| 2163 } | |
| 2164 return false; | |
| 2165 } | |
| 2166 } | |
| 2167 } | |
| 2168 return true; | |
| 2169 } | |
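// Sketch of the reliability rule above: a temporary config becomes permanent
// once it has been seen often enough, with the ambigs check collapsed here
// into a single boolean (it passes when ambig adaption is off or every ambig
// of the class has itself been seen enough times). Illustrative only.
static bool SketchConfigReliable(int num_times_seen, int sufficient_examples,
                                 int min_examples, bool ambigs_seen_enough) {
  if (num_times_seen >= sufficient_examples) {
    return true;
  }
  if (num_times_seen < min_examples) {
    return false;
  }
  return ambigs_seen_enough;
}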
| 2170 | |
| 2171 void Classify::UpdateAmbigsGroup(CLASS_ID class_id, TBLOB *Blob) { | |
| 2172 const UnicharIdVector *ambigs = getDict().getUnicharAmbigs().ReverseAmbigsForAdaption(class_id); | |
| 2173 int ambigs_size = (ambigs == nullptr) ? 0 : ambigs->size(); | |
| 2174 if (classify_learning_debug_level >= 1) { | |
| 2175 tprintf("Running UpdateAmbigsGroup for %s class_id=%d\n", | |
| 2176 getDict().getUnicharset().debug_str(class_id).c_str(), class_id); | |
| 2177 } | |
| 2178 for (int ambig = 0; ambig < ambigs_size; ++ambig) { | |
| 2179 CLASS_ID ambig_class_id = (*ambigs)[ambig]; | |
| 2180 const ADAPT_CLASS_STRUCT *ambigs_class = AdaptedTemplates->Class[ambig_class_id]; | |
| 2181 for (int cfg = 0; cfg < MAX_NUM_CONFIGS; ++cfg) { | |
| 2182 if (ConfigIsPermanent(ambigs_class, cfg)) { | |
| 2183 continue; | |
| 2184 } | |
| 2185 const TEMP_CONFIG_STRUCT *config = TempConfigFor(AdaptedTemplates->Class[ambig_class_id], cfg); | |
| 2186 if (config != nullptr && TempConfigReliable(ambig_class_id, config)) { | |
| 2187 if (classify_learning_debug_level >= 1) { | |
| 2188 tprintf("Making config %d of %s permanent\n", cfg, | |
| 2189 getDict().getUnicharset().debug_str(ambig_class_id).c_str()); | |
| 2190 } | |
| 2191 MakePermanent(AdaptedTemplates, ambig_class_id, cfg, Blob); | |
| 2192 } | |
| 2193 } | |
| 2194 } | |
| 2195 } | |
| 2196 | |
| 2197 } // namespace tesseract |
