Mercurial > hgrepos > Python2 > PyMuPDF
diff mupdf-source/thirdparty/tesseract/src/wordrec/associate.h @ 2:b50eed0cc0ef upstream
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
The directory name has changed: no version number in the expanded directory now.
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Mon, 15 Sep 2025 11:43:07 +0200 |
| parents | |
| children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/mupdf-source/thirdparty/tesseract/src/wordrec/associate.h Mon Sep 15 11:43:07 2025 +0200 @@ -0,0 +1,120 @@ +/////////////////////////////////////////////////////////////////////// +// File: associate.h +// Description: Structs, classes, typedefs useful for the segmentation +// search. Functions for scoring segmentation paths according +// to their character widths, gap widths and seam cuts. +// Author: Daria Antonova +// Created: Mon Mar 8 11:26:43 PDT 2010 +// +// (C) Copyright 2010, Google Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +/////////////////////////////////////////////////////////////////////// + +#ifndef ASSOCIATE_H +#define ASSOCIATE_H + +#include "blobs.h" +#include "elst.h" +#include "ratngs.h" +#include "seam.h" +#include "split.h" + +namespace tesseract { + +class WERD_RES; + +// Statistics about character widths, gaps and seams. +struct AssociateStats { + AssociateStats() { + Clear(); + } + + void Clear() { + shape_cost = 0.0f; + bad_shape = false; + full_wh_ratio = 0.0f; + full_wh_ratio_total = 0.0f; + full_wh_ratio_var = 0.0f; + bad_fixed_pitch_right_gap = false; + bad_fixed_pitch_wh_ratio = false; + gap_sum = 0; + } + + void Print() { + tprintf("AssociateStats: s(%g %d)\n", shape_cost, bad_shape); + } + + float shape_cost; // cost of blob shape + bool bad_shape; // true if the shape of the blob is unacceptable + float full_wh_ratio; // width-to-height ratio + gap on the right + float full_wh_ratio_total; // sum of width-to-height ratios + // on the path terminating at this blob + float full_wh_ratio_var; // variance of full_wh_ratios on the path + bool bad_fixed_pitch_right_gap; // true if there is no gap before + // the blob on the right + bool bad_fixed_pitch_wh_ratio; // true if the blobs has width-to-height + // ratio > kMaxFixedPitchCharAspectRatio + int gap_sum; // sum of gaps within the blob +}; + +// Utility functions for scoring segmentation paths according to their +// character widths, gap widths, seam characteristics. +class AssociateUtils { +public: + static const float kMaxFixedPitchCharAspectRatio; + static const float kMinGap; + + // Returns outline length of the given blob is computed as: + // rating_cert_scale * rating / certainty + // Since from Wordrec::SegSearch() in segsearch.cpp + // rating_cert_scale = -1.0 * getDict().certainty_scale / rating_scale + // And from Classify::ConvertMatchesToChoices() in adaptmatch.cpp + // Rating = Certainty = next.rating + // Rating *= rating_scale * Results->BlobLength + // Certainty *= -(getDict().certainty_scale) + static inline float ComputeOutlineLength(float rating_cert_scale, const BLOB_CHOICE &b) { + return rating_cert_scale * b.rating() / b.certainty(); + } + static inline float ComputeRating(float rating_cert_scale, float cert, int width) { + return static_cast<float>(width) * cert / rating_cert_scale; + } + + // Computes character widths, gaps and seams stats given the + // AssociateStats of the path so far, col, row of the blob that + // is being added to the path, and WERD_RES containing information + // about character widths, gaps and seams. + // Fills associate_cost with the combined shape, gap and seam cost + // of adding a unichar from (col, row) to the path (note that since + // this function could be used to compute the prioritization for + // pain points, (col, row) entry might not be classified yet; thus + // information in the (col, row) entry of the ratings matrix is not used). + // + // Note: the function assumes that word_res, stats and + // associate_cost pointers are not nullptr. + static void ComputeStats(int col, int row, const AssociateStats *parent_stats, + int parent_path_length, bool fixed_pitch, float max_char_wh_ratio, + WERD_RES *word_res, bool debug, AssociateStats *stats); + + // Returns the width cost for fixed-pitch text. + static float FixedPitchWidthCost(float norm_width, float right_gap, bool end_pos, + float max_char_wh_ratio); + + // Returns the gap cost for fixed-pitch text (penalizes vertically + // overlapping components). + static inline float FixedPitchGapCost(float norm_gap, bool end_pos) { + return (norm_gap < 0.05 && !end_pos) ? 5.0f : 0.0f; + } +}; + +} // namespace tesseract + +#endif
