diff mupdf-source/thirdparty/tesseract/src/ccstruct/params_training_featdef.h @ 2:b50eed0cc0ef upstream

ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4. The directory name has changed: no version number in the expanded directory now.
author Franz Glasner <fzglas.hg@dom66.de>
date Mon, 15 Sep 2025 11:43:07 +0200
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/mupdf-source/thirdparty/tesseract/src/ccstruct/params_training_featdef.h	Mon Sep 15 11:43:07 2025 +0200
@@ -0,0 +1,154 @@
+///////////////////////////////////////////////////////////////////////
+// File:        params_training_featdef.h
+// Description: Feature definitions for params training.
+// Author:      Rika Antonova
+//
+// (C) Copyright 2011, Google Inc.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+///////////////////////////////////////////////////////////////////////
+
+#ifndef TESSERACT_WORDREC_PARAMS_TRAINING_FEATDEF_H_
+#define TESSERACT_WORDREC_PARAMS_TRAINING_FEATDEF_H_
+
+#include <cstring> // for memset
+#include <string>
+#include <vector>
+
+namespace tesseract {
+
+// Largest unichar counts of the small and of the medium sized words.
+static constexpr int kMaxSmallWordUnichars{3};
+static constexpr int kMaxMediumWordUnichars{6};
+
+// Raw features extracted from a single OCR hypothesis.
+// The features are normalized (by outline length or number of unichars as
+// appropriate) real-valued quantities with unbounded range and
+// unknown distribution.
+// Normalization / binarization of these features is done at a later stage.
+// Note: when adding new fields to this enum make sure to modify
+// kParamsTrainingFeatureTypeName
+enum kParamsTrainingFeatureType {
+  // Digits
+  PTRAIN_DIGITS_SHORT, // 0
+  PTRAIN_DIGITS_MED,   // 1
+  PTRAIN_DIGITS_LONG,  // 2
+  // Number or pattern (NUMBER_PERM, USER_PATTERN_PERM)
+  PTRAIN_NUM_SHORT, // 3
+  PTRAIN_NUM_MED,   // 4
+  PTRAIN_NUM_LONG,  // 5
+  // Document word (DOC_DAWG_PERM)
+  PTRAIN_DOC_SHORT, // 6
+  PTRAIN_DOC_MED,   // 7
+  PTRAIN_DOC_LONG,  // 8
+  // Word (SYSTEM_DAWG_PERM, USER_DAWG_PERM, COMPOUND_PERM)
+  PTRAIN_DICT_SHORT, // 9
+  PTRAIN_DICT_MED,   // 10
+  PTRAIN_DICT_LONG,  // 11
+  // Frequent word (FREQ_DAWG_PERM)
+  PTRAIN_FREQ_SHORT,          // 12
+  PTRAIN_FREQ_MED,            // 13
+  PTRAIN_FREQ_LONG,           // 14
+  PTRAIN_SHAPE_COST_PER_CHAR, // 15
+  PTRAIN_NGRAM_COST_PER_CHAR, // 16
+  PTRAIN_NUM_BAD_PUNC,        // 17
+  PTRAIN_NUM_BAD_CASE,        // 18
+  PTRAIN_XHEIGHT_CONSISTENCY, // 19
+  PTRAIN_NUM_BAD_CHAR_TYPE,   // 20
+  PTRAIN_NUM_BAD_SPACING,     // 21
+  PTRAIN_NUM_BAD_FONT,        // 22
+  PTRAIN_RATING_PER_CHAR,     // 23
+
+  // Count of the feature types above, not a feature itself. Must stay last so
+  // that its value equals the number of preceding enumerators.
+  PTRAIN_NUM_FEATURE_TYPES
+};
+
+// Printable name of every feature type; entry i names the enumerator whose
+// value is i, so the order here must follow the enum exactly.
+static const char *const kParamsTrainingFeatureTypeName[] = {
+    "PTRAIN_DIGITS_SHORT",        // 0
+    "PTRAIN_DIGITS_MED",          // 1
+    "PTRAIN_DIGITS_LONG",         // 2
+    "PTRAIN_NUM_SHORT",           // 3
+    "PTRAIN_NUM_MED",             // 4
+    "PTRAIN_NUM_LONG",            // 5
+    "PTRAIN_DOC_SHORT",           // 6
+    "PTRAIN_DOC_MED",             // 7
+    "PTRAIN_DOC_LONG",            // 8
+    "PTRAIN_DICT_SHORT",          // 9
+    "PTRAIN_DICT_MED",            // 10
+    "PTRAIN_DICT_LONG",           // 11
+    "PTRAIN_FREQ_SHORT",          // 12
+    "PTRAIN_FREQ_MED",            // 13
+    "PTRAIN_FREQ_LONG",           // 14
+    "PTRAIN_SHAPE_COST_PER_CHAR", // 15
+    "PTRAIN_NGRAM_COST_PER_CHAR", // 16
+    "PTRAIN_NUM_BAD_PUNC",        // 17
+    "PTRAIN_NUM_BAD_CASE",        // 18
+    "PTRAIN_XHEIGHT_CONSISTENCY", // 19
+    "PTRAIN_NUM_BAD_CHAR_TYPE",   // 20
+    "PTRAIN_NUM_BAD_SPACING",     // 21
+    "PTRAIN_NUM_BAD_FONT",        // 22
+    "PTRAIN_RATING_PER_CHAR",     // 23
+};
+
+// Compile-time guard: a feature added to (or removed from) the enum without
+// the matching change to the name table fails the build, instead of leaving
+// the table shorter or longer than the range of valid feature indices.
+static_assert(sizeof(kParamsTrainingFeatureTypeName) / sizeof(kParamsTrainingFeatureTypeName[0]) ==
+                  PTRAIN_NUM_FEATURE_TYPES,
+              "kParamsTrainingFeatureTypeName must have one entry per kParamsTrainingFeatureType");
+
+// Looks up a feature by its name and returns the index of that feature,
+// or -1 meaning the feature is unknown.
+// NOTE(review): only the declaration is visible here; presumably the names
+// are those listed in kParamsTrainingFeatureTypeName - confirm against the
+// definition.
+int ParamsTrainingFeatureByName(const char *name);
+
+// Entry with features extracted from a single OCR hypothesis for a word.
+struct ParamsTrainingHypothesis {
+  // No constructor, assignment operator or destructor is declared on purpose.
+  // The compiler-generated ones copy or move every member, and the default
+  // member initializers below zero the numeric fields. Hand-writing the copy
+  // operations would suppress the implicit move operations, forcing a deep
+  // copy of `str` each time a std::vector holding these entries reallocates.
+
+  // String corresponding to the word hypothesis (for debugging).
+  std::string str;
+  // One value per kParamsTrainingFeatureType, all zero until filled in.
+  float features[PTRAIN_NUM_FEATURE_TYPES] = {};
+  // Path cost computed by segsearch.
+  float cost = 0.0f;
+};
+
+// The hypotheses explored during one run of segmentation search, held in a
+// std::vector in the order in which they were appended.
+using ParamsTrainingHypothesisList = std::vector<ParamsTrainingHypothesis>;
+
+// A bundle that accumulates all of the hypothesis lists explored during all
+// of the runs of segmentation search on a word (e.g. a list of hypotheses
+// explored on PASS1, PASS2, fix xheight pass, etc).
+class ParamsTrainingBundle {
+public:
+  ParamsTrainingBundle() = default;
+
+  // Starts a new, empty hypothesis list, which becomes the current one.
+  // Should be called at the beginning of a new run of the segmentation search.
+  void StartHypothesisList() {
+    hyp_list_vec.emplace_back();
+  }
+
+  // Appends a copy of `other` to the current (most recently started)
+  // hypothesis list, starting a list first when none exists yet.
+  // Returns a reference to the stored copy. The reference points into vector
+  // storage, so later calls that grow either vector may invalidate it.
+  ParamsTrainingHypothesis &AddHypothesis(const ParamsTrainingHypothesis &other) {
+    if (hyp_list_vec.empty()) {
+      StartHypothesisList();
+    }
+    ParamsTrainingHypothesisList &current = hyp_list_vec.back();
+    // Copy straight into the list: wrapping `other` in a temporary first would
+    // only add a second copy of the entry.
+    current.push_back(other);
+    return current.back();
+  }
+
+  // One hypothesis list per StartHypothesisList() call, oldest first.
+  std::vector<ParamsTrainingHypothesisList> hyp_list_vec;
+};
+
+} // namespace tesseract
+
+#endif // TESSERACT_WORDREC_PARAMS_TRAINING_FEATDEF_H_