Mercurial > hgrepos > Python2 > PyMuPDF
comparison mupdf-source/thirdparty/tesseract/src/wordrec/associate.cpp @ 2:b50eed0cc0ef upstream
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
The directory name has changed: no version number in the expanded directory now.
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Mon, 15 Sep 2025 11:43:07 +0200 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| 1:1d09e1dec1d9 | 2:b50eed0cc0ef |
|---|---|
| 1 /////////////////////////////////////////////////////////////////////// | |
| 2 // File: associate.cpp | |
| 3 // Description: Functions for scoring segmentation paths according to | |
| 4 // their character widths, gap widths and seam cuts. | |
| 5 // Author: Daria Antonova | |
| 6 // Created: Mon Mar 8 11:26:43 PDT 2010 | |
| 7 // | |
| 8 // (C) Copyright 2010, Google Inc. | |
| 9 // Licensed under the Apache License, Version 2.0 (the "License"); | |
| 10 // you may not use this file except in compliance with the License. | |
| 11 // You may obtain a copy of the License at | |
| 12 // http://www.apache.org/licenses/LICENSE-2.0 | |
| 13 // Unless required by applicable law or agreed to in writing, software | |
| 14 // distributed under the License is distributed on an "AS IS" BASIS, | |
| 15 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
| 16 // See the License for the specific language governing permissions and | |
| 17 // limitations under the License. | |
| 18 // | |
| 19 /////////////////////////////////////////////////////////////////////// | |
| 20 | |
| 21 #include <cmath> | |
| 22 #include <cstdio> | |
| 23 | |
| 24 #include "associate.h" | |
| 25 #include "normalis.h" | |
| 26 #include "pageres.h" | |
| 27 | |
| 28 namespace tesseract { | |
| 29 | |
| 30 const float AssociateUtils::kMaxFixedPitchCharAspectRatio = 2.0f; | |
| 31 const float AssociateUtils::kMinGap = 0.03f; | |
| 32 | |
| 33 void AssociateUtils::ComputeStats(int col, int row, const AssociateStats *parent_stats, | |
| 34 int parent_path_length, bool fixed_pitch, float max_char_wh_ratio, | |
| 35 WERD_RES *word_res, bool debug, AssociateStats *stats) { | |
| 36 stats->Clear(); | |
| 37 | |
| 38 ASSERT_HOST(word_res != nullptr); | |
| 39 if (word_res->blob_widths.empty()) { | |
| 40 return; | |
| 41 } | |
| 42 if (debug) { | |
| 43 tprintf("AssociateUtils::ComputeStats() for col=%d, row=%d%s\n", col, row, | |
| 44 fixed_pitch ? " (fixed pitch)" : ""); | |
| 45 } | |
| 46 float normalizing_height = kBlnXHeight; | |
| 47 ROW *blob_row = word_res->blob_row; | |
| 48 // TODO(rays/daria) Can unicharset.script_has_xheight be useful here? | |
| 49 if (fixed_pitch && blob_row != nullptr) { | |
| 50 // For fixed pitch language like CJK, we use the full text height | |
| 51 // as the normalizing factor so we are not dependent on xheight | |
| 52 // calculation. | |
| 53 if (blob_row->body_size() > 0.0f) { | |
| 54 normalizing_height = word_res->denorm.y_scale() * blob_row->body_size(); | |
| 55 } else { | |
| 56 normalizing_height = | |
| 57 word_res->denorm.y_scale() * (blob_row->x_height() + blob_row->ascenders()); | |
| 58 } | |
| 59 if (debug) { | |
| 60 tprintf("normalizing height = %g (scale %g xheight %g ascenders %g)\n", normalizing_height, | |
| 61 word_res->denorm.y_scale(), blob_row->x_height(), blob_row->ascenders()); | |
| 62 } | |
| 63 } | |
| 64 float wh_ratio = word_res->GetBlobsWidth(col, row) / normalizing_height; | |
| 65 if (wh_ratio > max_char_wh_ratio) { | |
| 66 stats->bad_shape = true; | |
| 67 } | |
| 68 // Compute the gap sum for this shape. If there are only negative or only | |
| 69 // positive gaps, record their sum in stats->gap_sum. However, if there is | |
| 70 // a mixture, record only the sum of the positive gaps. | |
| 71 // TODO(antonova): explain fragment. | |
| 72 int negative_gap_sum = 0; | |
| 73 for (int c = col; c < row; ++c) { | |
| 74 int gap = word_res->GetBlobsGap(c); | |
| 75 (gap > 0) ? stats->gap_sum += gap : negative_gap_sum += gap; | |
| 76 } | |
| 77 if (stats->gap_sum == 0) { | |
| 78 stats->gap_sum = negative_gap_sum; | |
| 79 } | |
| 80 if (debug) { | |
| 81 tprintf("wh_ratio=%g (max_char_wh_ratio=%g) gap_sum=%d %s\n", wh_ratio, max_char_wh_ratio, | |
| 82 stats->gap_sum, stats->bad_shape ? "bad_shape" : ""); | |
| 83 } | |
| 84 // Compute shape_cost (for fixed pitch mode). | |
| 85 if (fixed_pitch) { | |
| 86 bool end_row = (row == (word_res->ratings->dimension() - 1)); | |
| 87 | |
| 88 // Ensure that the blob has gaps on the left and the right sides | |
| 89 // (except for beginning and ending punctuation) and that there is | |
| 90 // no cutting through ink at the blob boundaries. | |
| 91 if (col > 0) { | |
| 92 float left_gap = word_res->GetBlobsGap(col - 1) / normalizing_height; | |
| 93 SEAM *left_seam = word_res->seam_array[col - 1]; | |
| 94 if ((!end_row && left_gap < kMinGap) || left_seam->priority() > 0.0f) { | |
| 95 stats->bad_shape = true; | |
| 96 } | |
| 97 if (debug) { | |
| 98 tprintf("left_gap %g, left_seam %g %s\n", left_gap, left_seam->priority(), | |
| 99 stats->bad_shape ? "bad_shape" : ""); | |
| 100 } | |
| 101 } | |
| 102 float right_gap = 0.0f; | |
| 103 if (!end_row) { | |
| 104 right_gap = word_res->GetBlobsGap(row) / normalizing_height; | |
| 105 SEAM *right_seam = word_res->seam_array[row]; | |
| 106 if (right_gap < kMinGap || right_seam->priority() > 0.0f) { | |
| 107 stats->bad_shape = true; | |
| 108 if (right_gap < kMinGap) { | |
| 109 stats->bad_fixed_pitch_right_gap = true; | |
| 110 } | |
| 111 } | |
| 112 if (debug) { | |
| 113 tprintf("right_gap %g right_seam %g %s\n", right_gap, right_seam->priority(), | |
| 114 stats->bad_shape ? "bad_shape" : ""); | |
| 115 } | |
| 116 } | |
| 117 | |
| 118 // Impose additional segmentation penalties if blob widths or gaps | |
| 119 // distribution don't fit a fixed-pitch model. | |
| 120 // Since we only know the widths and gaps of the path explored so far, | |
| 121 // the means and variances are computed for the path so far (not | |
| 122 // considering characters to the right of the last character on the path). | |
| 123 stats->full_wh_ratio = wh_ratio + right_gap; | |
| 124 if (parent_stats != nullptr) { | |
| 125 stats->full_wh_ratio_total = (parent_stats->full_wh_ratio_total + stats->full_wh_ratio); | |
| 126 float mean = stats->full_wh_ratio_total / static_cast<float>(parent_path_length + 1); | |
| 127 stats->full_wh_ratio_var = | |
| 128 parent_stats->full_wh_ratio_var + pow(mean - stats->full_wh_ratio, 2); | |
| 129 } else { | |
| 130 stats->full_wh_ratio_total = stats->full_wh_ratio; | |
| 131 } | |
| 132 if (debug) { | |
| 133 tprintf("full_wh_ratio %g full_wh_ratio_total %g full_wh_ratio_var %g\n", | |
| 134 stats->full_wh_ratio, stats->full_wh_ratio_total, stats->full_wh_ratio_var); | |
| 135 } | |
| 136 | |
| 137 stats->shape_cost = FixedPitchWidthCost(wh_ratio, right_gap, end_row, max_char_wh_ratio); | |
| 138 | |
| 139 // For some reason Tesseract prefers to treat the whole CJ words | |
| 140 // as one blob when the initial segmentation is particularly bad. | |
| 141 // This hack is to avoid favoring such states. | |
| 142 if (col == 0 && end_row && wh_ratio > max_char_wh_ratio) { | |
| 143 stats->shape_cost += 10; | |
| 144 } | |
| 145 stats->shape_cost += stats->full_wh_ratio_var; | |
| 146 if (debug) { | |
| 147 tprintf("shape_cost %g\n", stats->shape_cost); | |
| 148 } | |
| 149 } | |
| 150 } | |
| 151 | |
| 152 float AssociateUtils::FixedPitchWidthCost(float norm_width, float right_gap, bool end_pos, | |
| 153 float max_char_wh_ratio) { | |
| 154 float cost = 0.0f; | |
| 155 if (norm_width > max_char_wh_ratio) { | |
| 156 cost += norm_width; | |
| 157 } | |
| 158 if (norm_width > kMaxFixedPitchCharAspectRatio) { | |
| 159 cost += norm_width * norm_width; // extra penalty for merging CJK chars | |
| 160 } | |
| 161 // Penalize skinny blobs, except for punctuation in the last position. | |
| 162 if (norm_width + right_gap < 0.5f && !end_pos) { | |
| 163 cost += 1.0f - (norm_width + right_gap); | |
| 164 } | |
| 165 return cost; | |
| 166 } | |
| 167 | |
| 168 } // namespace tesseract |
