comparison mupdf-source/thirdparty/tesseract/src/wordrec/associate.h @ 2:b50eed0cc0ef upstream

ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4. The directory name has changed: no version number in the expanded directory now.
author Franz Glasner <fzglas.hg@dom66.de>
date Mon, 15 Sep 2025 11:43:07 +0200
parents
children
comparison
equal deleted inserted replaced
1:1d09e1dec1d9 2:b50eed0cc0ef
1 ///////////////////////////////////////////////////////////////////////
2 // File: associate.h
3 // Description: Structs, classes, typedefs useful for the segmentation
4 // search. Functions for scoring segmentation paths according
5 // to their character widths, gap widths and seam cuts.
6 // Author: Daria Antonova
7 // Created: Mon Mar 8 11:26:43 PDT 2010
8 //
9 // (C) Copyright 2010, Google Inc.
10 // Licensed under the Apache License, Version 2.0 (the "License");
11 // you may not use this file except in compliance with the License.
12 // You may obtain a copy of the License at
13 // http://www.apache.org/licenses/LICENSE-2.0
14 // Unless required by applicable law or agreed to in writing, software
15 // distributed under the License is distributed on an "AS IS" BASIS,
16 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17 // See the License for the specific language governing permissions and
18 // limitations under the License.
19 //
20 ///////////////////////////////////////////////////////////////////////
21
22 #ifndef ASSOCIATE_H
23 #define ASSOCIATE_H
24
25 #include "blobs.h"
26 #include "elst.h"
27 #include "ratngs.h"
28 #include "seam.h"
29 #include "split.h"
30
31 namespace tesseract {
32
33 class WERD_RES;
34
35 // Statistics about character widths, gaps and seams.
36 struct AssociateStats {
37 AssociateStats() {
38 Clear();
39 }
40
41 void Clear() {
42 shape_cost = 0.0f;
43 bad_shape = false;
44 full_wh_ratio = 0.0f;
45 full_wh_ratio_total = 0.0f;
46 full_wh_ratio_var = 0.0f;
47 bad_fixed_pitch_right_gap = false;
48 bad_fixed_pitch_wh_ratio = false;
49 gap_sum = 0;
50 }
51
52 void Print() {
53 tprintf("AssociateStats: s(%g %d)\n", shape_cost, bad_shape);
54 }
55
56 float shape_cost; // cost of blob shape
57 bool bad_shape; // true if the shape of the blob is unacceptable
58 float full_wh_ratio; // width-to-height ratio + gap on the right
59 float full_wh_ratio_total; // sum of width-to-height ratios
60 // on the path terminating at this blob
61 float full_wh_ratio_var; // variance of full_wh_ratios on the path
62 bool bad_fixed_pitch_right_gap; // true if there is no gap before
63 // the blob on the right
64 bool bad_fixed_pitch_wh_ratio; // true if the blobs has width-to-height
65 // ratio > kMaxFixedPitchCharAspectRatio
66 int gap_sum; // sum of gaps within the blob
67 };
68
69 // Utility functions for scoring segmentation paths according to their
70 // character widths, gap widths, seam characteristics.
71 class AssociateUtils {
72 public:
73 static const float kMaxFixedPitchCharAspectRatio;
74 static const float kMinGap;
75
76 // Returns outline length of the given blob is computed as:
77 // rating_cert_scale * rating / certainty
78 // Since from Wordrec::SegSearch() in segsearch.cpp
79 // rating_cert_scale = -1.0 * getDict().certainty_scale / rating_scale
80 // And from Classify::ConvertMatchesToChoices() in adaptmatch.cpp
81 // Rating = Certainty = next.rating
82 // Rating *= rating_scale * Results->BlobLength
83 // Certainty *= -(getDict().certainty_scale)
84 static inline float ComputeOutlineLength(float rating_cert_scale, const BLOB_CHOICE &b) {
85 return rating_cert_scale * b.rating() / b.certainty();
86 }
87 static inline float ComputeRating(float rating_cert_scale, float cert, int width) {
88 return static_cast<float>(width) * cert / rating_cert_scale;
89 }
90
91 // Computes character widths, gaps and seams stats given the
92 // AssociateStats of the path so far, col, row of the blob that
93 // is being added to the path, and WERD_RES containing information
94 // about character widths, gaps and seams.
95 // Fills associate_cost with the combined shape, gap and seam cost
96 // of adding a unichar from (col, row) to the path (note that since
97 // this function could be used to compute the prioritization for
98 // pain points, (col, row) entry might not be classified yet; thus
99 // information in the (col, row) entry of the ratings matrix is not used).
100 //
101 // Note: the function assumes that word_res, stats and
102 // associate_cost pointers are not nullptr.
103 static void ComputeStats(int col, int row, const AssociateStats *parent_stats,
104 int parent_path_length, bool fixed_pitch, float max_char_wh_ratio,
105 WERD_RES *word_res, bool debug, AssociateStats *stats);
106
107 // Returns the width cost for fixed-pitch text.
108 static float FixedPitchWidthCost(float norm_width, float right_gap, bool end_pos,
109 float max_char_wh_ratio);
110
111 // Returns the gap cost for fixed-pitch text (penalizes vertically
112 // overlapping components).
113 static inline float FixedPitchGapCost(float norm_gap, bool end_pos) {
114 return (norm_gap < 0.05 && !end_pos) ? 5.0f : 0.0f;
115 }
116 };
117
118 } // namespace tesseract
119
120 #endif