Mercurial > hgrepos > Python2 > PyMuPDF
diff mupdf-source/thirdparty/tesseract/src/classify/blobclass.cpp @ 2:b50eed0cc0ef upstream
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
The directory name has changed: no version number in the expanded directory now.
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Mon, 15 Sep 2025 11:43:07 +0200 |
| parents | |
| children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/mupdf-source/thirdparty/tesseract/src/classify/blobclass.cpp Mon Sep 15 11:43:07 2025 +0200 @@ -0,0 +1,73 @@ +/****************************************************************************** + ** Filename: blobclass.c + ** Purpose: High level blob classification and training routines. + ** Author: Dan Johnson + ** + ** (c) Copyright Hewlett-Packard Company, 1988. + ** Licensed under the Apache License, Version 2.0 (the "License"); + ** you may not use this file except in compliance with the License. + ** You may obtain a copy of the License at + ** http://www.apache.org/licenses/LICENSE-2.0 + ** Unless required by applicable law or agreed to in writing, software + ** distributed under the License is distributed on an "AS IS" BASIS, + ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + ** See the License for the specific language governing permissions and + ** limitations under the License. + ******************************************************************************/ + +#include <cstdio> + +#include "classify.h" +#include "featdefs.h" +#include "mf.h" +#include "normfeat.h" + +namespace tesseract { + +/*---------------------------------------------------------------------------*/ + +// Extracts features from the given blob and saves them in the tr_file_data_ +// member variable. +// fontname: Name of font that this blob was printed in. +// cn_denorm: Character normalization transformation to apply to the blob. +// fx_info: Character normalization parameters computed with cn_denorm. +// blob_text: Ground truth text for the blob. +void Classify::LearnBlob(const std::string &fontname, TBLOB *blob, const DENORM &cn_denorm, + const INT_FX_RESULT_STRUCT &fx_info, const char *blob_text) { + std::unique_ptr<CHAR_DESC_STRUCT> CharDesc(new CHAR_DESC_STRUCT(feature_defs_)); + CharDesc->FeatureSets[0] = ExtractMicros(blob, cn_denorm); + CharDesc->FeatureSets[1] = ExtractCharNormFeatures(fx_info); + CharDesc->FeatureSets[2] = ExtractIntCNFeatures(*blob, fx_info); + CharDesc->FeatureSets[3] = ExtractIntGeoFeatures(*blob, fx_info); + + if (ValidCharDescription(feature_defs_, CharDesc.get())) { + // Label the features with a class name and font name. + tr_file_data_ += "\n"; + tr_file_data_ += fontname; + tr_file_data_ += " "; + tr_file_data_ += blob_text; + tr_file_data_ += "\n"; + + // write micro-features to file and clean up + WriteCharDescription(feature_defs_, CharDesc.get(), tr_file_data_); + } else { + tprintf("Blob learned was invalid!\n"); + } +} // LearnBlob + +// Writes stored training data to a .tr file based on the given filename. +// Returns false on error. +bool Classify::WriteTRFile(const char *filename) { + bool result = false; + std::string tr_filename = filename; + tr_filename += ".tr"; + FILE *fp = fopen(tr_filename.c_str(), "wb"); + if (fp) { + result = tesseract::Serialize(fp, &tr_file_data_[0], tr_file_data_.length()); + fclose(fp); + } + tr_file_data_.resize(0); + return result; +} + +} // namespace tesseract
