view mupdf-source/thirdparty/tesseract/src/classify/normfeat.cpp @ 2:b50eed0cc0ef upstream

ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4. The directory name has changed: no version number in the expanded directory now.
author Franz Glasner <fzglas.hg@dom66.de>
date Mon, 15 Sep 2025 11:43:07 +0200
parents
children
line wrap: on
line source

/******************************************************************************
 ** Filename:    normfeat.cpp
 ** Purpose:     Definition of char normalization features.
 ** Author:      Dan Johnson
 **
 ** (c) Copyright Hewlett-Packard Company, 1988.
 ** Licensed under the Apache License, Version 2.0 (the "License");
 ** you may not use this file except in compliance with the License.
 ** You may obtain a copy of the License at
 ** http://www.apache.org/licenses/LICENSE-2.0
 ** Unless required by applicable law or agreed to in writing, software
 ** distributed under the License is distributed on an "AS IS" BASIS,
 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 ** See the License for the specific language governing permissions and
 ** limitations under the License.
 ******************************************************************************/

#include "normfeat.h"

#include "featdefs.h"
#include "intfx.h"
#include "mfoutline.h"

namespace tesseract {

/**
 * Return the length of the outline in baseline normalized form.
 *
 * The value is obtained by multiplying the feature's CharNormLength
 * parameter by LENGTH_COMPRESSION.
 *
 * @param Feature  feature whose CharNormLength parameter is read; it is
 *                 dereferenced unconditionally, so it must not be null.
 * @return the CharNormLength parameter scaled by LENGTH_COMPRESSION.
 */
float ActualOutlineLength(FEATURE Feature) {
  const auto compressed_length = Feature->Params[CharNormLength];
  return compressed_length * LENGTH_COMPRESSION;
}

/**
 * Build the character normalization feature set for a blob.
 *
 * Exactly one feature (described by CharNormDesc) is created, filled from
 * fx_info, and added to a newly allocated feature set of capacity 1.
 *
 * All parameters are expressed in a scale where the x-height has been
 * normalized to occupy the region y = [-0.25 .. 0.25].  The example ranges
 * for English given below are based on the Linux font collection on
 * 2009-12-04:
 *
 *  - Params[CharNormY]
 *     - y coordinate of the grapheme's centroid.
 *     - English: [-0.27, 0.71]
 *
 *  - Params[CharNormLength]
 *     - length of the grapheme's outline (tiny segments discarded),
 *     divided by 10.0=LENGTH_COMPRESSION.
 *     - English: [0.16, 0.85]
 *
 *  - Params[CharNormRx]
 *     - radius of gyration about the x axis, as measured from CharNormY.
 *     - English: [0.011, 0.34]
 *
 *  - Params[CharNormRy]
 *     - radius of gyration about the y axis, as measured from the x center
 *     of the grapheme's bounding box.
 *     - English: [0.011, 0.31]
 *
 * @param fx_info  feature extraction results supplying Ymean, Length, Rx
 *                 and Ry for the blob.
 * @return a feature set allocated with new.
 *         NOTE(review): presumably the caller is responsible for releasing
 *         it - confirm against the call sites.
 */
FEATURE_SET ExtractCharNormFeatures(const INT_FX_RESULT_STRUCT &fx_info) {
  FEATURE char_norm = new FEATURE_STRUCT(&CharNormDesc);
  auto &params = char_norm->Params;

  // Vertical position of the centroid, measured relative to the baseline.
  params[CharNormY] = MF_SCALE_FACTOR * (fx_info.Ymean - kBlnBaselineOffset);
  // Outline length, compressed by LENGTH_COMPRESSION.
  params[CharNormLength] = MF_SCALE_FACTOR * fx_info.Length / LENGTH_COMPRESSION;
  // Radii of gyration about the x and y axes.
  params[CharNormRx] = MF_SCALE_FACTOR * fx_info.Rx;
  params[CharNormRy] = MF_SCALE_FACTOR * fx_info.Ry;

  // The set only ever holds this single feature.
  FEATURE_SET char_norm_set = new FEATURE_SET_STRUCT(1);
  AddFeature(char_norm_set, char_norm);
  return char_norm_set;
} /* ExtractCharNormFeatures */

} // namespace tesseract