Mercurial > hgrepos > Python2 > PyMuPDF
diff mupdf-source/thirdparty/tesseract/src/classify/picofeat.cpp @ 2:b50eed0cc0ef upstream
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
The directory name has changed: no version number in the expanded directory now.
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Mon, 15 Sep 2025 11:43:07 +0200 |
| parents | |
| children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/mupdf-source/thirdparty/tesseract/src/classify/picofeat.cpp Mon Sep 15 11:43:07 2025 +0200 @@ -0,0 +1,256 @@ +/****************************************************************************** + ** Filename: picofeat.c + ** Purpose: Definition of pico-features. + ** Author: Dan Johnson + ** + ** (c) Copyright Hewlett-Packard Company, 1988. + ** Licensed under the Apache License, Version 2.0 (the "License"); + ** you may not use this file except in compliance with the License. + ** You may obtain a copy of the License at + ** http://www.apache.org/licenses/LICENSE-2.0 + ** Unless required by applicable law or agreed to in writing, software + ** distributed under the License is distributed on an "AS IS" BASIS, + ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + ** See the License for the specific language governing permissions and + ** limitations under the License. + ******************************************************************************/ + +#include "picofeat.h" + +#include "classify.h" +#include "featdefs.h" +#include "fpoint.h" +#include "mfoutline.h" +#include "ocrfeatures.h" +#include "params.h" +#include "trainingsample.h" + +#include <cmath> +#include <cstdio> + +namespace tesseract { + +/*--------------------------------------------------------------------------- + Variables +----------------------------------------------------------------------------*/ + +double_VAR(classify_pico_feature_length, 0.05, "Pico Feature Length"); + +/*--------------------------------------------------------------------------- + Private Function Prototypes +----------------------------------------------------------------------------*/ +void ConvertSegmentToPicoFeat(FPOINT *Start, FPOINT *End, FEATURE_SET FeatureSet); + +void ConvertToPicoFeatures2(MFOUTLINE Outline, FEATURE_SET FeatureSet); + +void NormalizePicoX(FEATURE_SET FeatureSet); + +/*---------------------------------------------------------------------------- + Public Code +----------------------------------------------------------------------------*/ +/*---------------------------------------------------------------------------*/ +/** + * Operation: Dummy for now. + * + * Globals: + * - classify_norm_method normalization method currently specified + * @param Blob blob to extract pico-features from + * @return Pico-features for Blob. + */ +FEATURE_SET Classify::ExtractPicoFeatures(TBLOB *Blob) { + auto FeatureSet = new FEATURE_SET_STRUCT(MAX_PICO_FEATURES); + auto Outlines = ConvertBlob(Blob); + float XScale, YScale; + NormalizeOutlines(Outlines, &XScale, &YScale); + auto RemainingOutlines = Outlines; + iterate(RemainingOutlines) { + auto Outline = static_cast<MFOUTLINE>(RemainingOutlines->first_node()); + ConvertToPicoFeatures2(Outline, FeatureSet); + } + if (classify_norm_method == baseline) { + NormalizePicoX(FeatureSet); + } + FreeOutlines(Outlines); + return (FeatureSet); + +} /* ExtractPicoFeatures */ + +/*---------------------------------------------------------------------------- + Private Code +----------------------------------------------------------------------------*/ +/*---------------------------------------------------------------------------*/ +/** + * This routine converts an entire segment of an outline + * into a set of pico features which are added to + * FeatureSet. The length of the segment is rounded to the + * nearest whole number of pico-features. The pico-features + * are spaced evenly over the entire segment. + * Results are placed in FeatureSet. + * Globals: + * - classify_pico_feature_length length of a single pico-feature + * @param Start starting point of pico-feature + * @param End ending point of pico-feature + * @param FeatureSet set to add pico-feature to + */ +void ConvertSegmentToPicoFeat(FPOINT *Start, FPOINT *End, FEATURE_SET FeatureSet) { + float Angle; + float Length; + int NumFeatures; + FPOINT Center; + FPOINT Delta; + int i; + + Angle = NormalizedAngleFrom(Start, End, 1.0); + Length = DistanceBetween(*Start, *End); + NumFeatures = static_cast<int>(floor(Length / classify_pico_feature_length + 0.5)); + if (NumFeatures < 1) { + NumFeatures = 1; + } + + /* compute vector for one pico feature */ + Delta.x = XDelta(*Start, *End) / NumFeatures; + Delta.y = YDelta(*Start, *End) / NumFeatures; + + /* compute position of first pico feature */ + Center.x = Start->x + Delta.x / 2.0; + Center.y = Start->y + Delta.y / 2.0; + + /* compute each pico feature in segment and add to feature set */ + for (i = 0; i < NumFeatures; i++) { + auto Feature = new FEATURE_STRUCT(&PicoFeatDesc); + Feature->Params[PicoFeatDir] = Angle; + Feature->Params[PicoFeatX] = Center.x; + Feature->Params[PicoFeatY] = Center.y; + AddFeature(FeatureSet, Feature); + + Center.x += Delta.x; + Center.y += Delta.y; + } +} /* ConvertSegmentToPicoFeat */ + +/*---------------------------------------------------------------------------*/ +/** + * This routine steps through the specified outline and cuts it + * up into pieces of equal length. These pieces become the + * desired pico-features. Each segment in the outline + * is converted into an integral number of pico-features. + * Results are returned in FeatureSet. + * + * Globals: + * - classify_pico_feature_length length of features to be extracted + * @param Outline outline to extract micro-features from + * @param FeatureSet set of features to add pico-features to + */ +void ConvertToPicoFeatures2(MFOUTLINE Outline, FEATURE_SET FeatureSet) { + MFOUTLINE Next; + MFOUTLINE First; + MFOUTLINE Current; + + if (DegenerateOutline(Outline)) { + return; + } + + First = Outline; + Current = First; + Next = NextPointAfter(Current); + do { + /* note that an edge is hidden if the ending point of the edge is + marked as hidden. This situation happens because the order of + the outlines is reversed when they are converted from the old + format. In the old format, a hidden edge is marked by the + starting point for that edge. */ + if (!(PointAt(Next)->Hidden)) { + ConvertSegmentToPicoFeat(&(PointAt(Current)->Point), &(PointAt(Next)->Point), FeatureSet); + } + + Current = Next; + Next = NextPointAfter(Current); + } while (Current != First); + +} /* ConvertToPicoFeatures2 */ + +/*---------------------------------------------------------------------------*/ +/** + * This routine computes the average x position over all + * of the pico-features in FeatureSet and then renormalizes + * the pico-features to force this average to be the x origin + * (i.e. x=0). + * FeatureSet is changed. + * @param FeatureSet pico-features to be normalized + */ +void NormalizePicoX(FEATURE_SET FeatureSet) { + int i; + FEATURE Feature; + float Origin = 0.0; + + for (i = 0; i < FeatureSet->NumFeatures; i++) { + Feature = FeatureSet->Features[i]; + Origin += Feature->Params[PicoFeatX]; + } + Origin /= FeatureSet->NumFeatures; + + for (i = 0; i < FeatureSet->NumFeatures; i++) { + Feature = FeatureSet->Features[i]; + Feature->Params[PicoFeatX] -= Origin; + } +} /* NormalizePicoX */ + +/*---------------------------------------------------------------------------*/ +/** + * @param blob blob to extract features from + * @param fx_info + * @return Integer character-normalized features for blob. + */ +FEATURE_SET Classify::ExtractIntCNFeatures(const TBLOB &blob, const INT_FX_RESULT_STRUCT &fx_info) { + INT_FX_RESULT_STRUCT local_fx_info(fx_info); + std::vector<INT_FEATURE_STRUCT> bl_features; + tesseract::TrainingSample *sample = + tesseract::BlobToTrainingSample(blob, false, &local_fx_info, &bl_features); + if (sample == nullptr) { + return nullptr; + } + + uint32_t num_features = sample->num_features(); + const INT_FEATURE_STRUCT *features = sample->features(); + auto feature_set = new FEATURE_SET_STRUCT(num_features); + for (uint32_t f = 0; f < num_features; ++f) { + auto feature = new FEATURE_STRUCT(&IntFeatDesc); + feature->Params[IntX] = features[f].X; + feature->Params[IntY] = features[f].Y; + feature->Params[IntDir] = features[f].Theta; + AddFeature(feature_set, feature); + } + delete sample; + + return feature_set; +} /* ExtractIntCNFeatures */ + +/*---------------------------------------------------------------------------*/ +/** + * @param blob blob to extract features from + * @param fx_info + * @return Geometric (top/bottom/width) features for blob. + */ +FEATURE_SET Classify::ExtractIntGeoFeatures(const TBLOB &blob, + const INT_FX_RESULT_STRUCT &fx_info) { + INT_FX_RESULT_STRUCT local_fx_info(fx_info); + std::vector<INT_FEATURE_STRUCT> bl_features; + tesseract::TrainingSample *sample = + tesseract::BlobToTrainingSample(blob, false, &local_fx_info, &bl_features); + if (sample == nullptr) { + return nullptr; + } + + auto feature_set = new FEATURE_SET_STRUCT(1); + auto feature = new FEATURE_STRUCT(&IntFeatDesc); + + feature->Params[GeoBottom] = sample->geo_feature(GeoBottom); + feature->Params[GeoTop] = sample->geo_feature(GeoTop); + feature->Params[GeoWidth] = sample->geo_feature(GeoWidth); + AddFeature(feature_set, feature); + delete sample; + + return feature_set; +} /* ExtractIntGeoFeatures */ + +} // namespace tesseract.
