Mercurial > hgrepos > Python2 > PyMuPDF
diff mupdf-source/thirdparty/tesseract/src/wordrec/associate.cpp @ 2:b50eed0cc0ef upstream
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
The directory name has changed: no version number in the expanded directory now.
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Mon, 15 Sep 2025 11:43:07 +0200 |
| parents | |
| children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/mupdf-source/thirdparty/tesseract/src/wordrec/associate.cpp Mon Sep 15 11:43:07 2025 +0200 @@ -0,0 +1,168 @@ +/////////////////////////////////////////////////////////////////////// +// File: associate.cpp +// Description: Functions for scoring segmentation paths according to +// their character widths, gap widths and seam cuts. +// Author: Daria Antonova +// Created: Mon Mar 8 11:26:43 PDT 2010 +// +// (C) Copyright 2010, Google Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +/////////////////////////////////////////////////////////////////////// + +#include <cmath> +#include <cstdio> + +#include "associate.h" +#include "normalis.h" +#include "pageres.h" + +namespace tesseract { + +const float AssociateUtils::kMaxFixedPitchCharAspectRatio = 2.0f; +const float AssociateUtils::kMinGap = 0.03f; + +void AssociateUtils::ComputeStats(int col, int row, const AssociateStats *parent_stats, + int parent_path_length, bool fixed_pitch, float max_char_wh_ratio, + WERD_RES *word_res, bool debug, AssociateStats *stats) { + stats->Clear(); + + ASSERT_HOST(word_res != nullptr); + if (word_res->blob_widths.empty()) { + return; + } + if (debug) { + tprintf("AssociateUtils::ComputeStats() for col=%d, row=%d%s\n", col, row, + fixed_pitch ? " (fixed pitch)" : ""); + } + float normalizing_height = kBlnXHeight; + ROW *blob_row = word_res->blob_row; + // TODO(rays/daria) Can unicharset.script_has_xheight be useful here? + if (fixed_pitch && blob_row != nullptr) { + // For fixed pitch language like CJK, we use the full text height + // as the normalizing factor so we are not dependent on xheight + // calculation. + if (blob_row->body_size() > 0.0f) { + normalizing_height = word_res->denorm.y_scale() * blob_row->body_size(); + } else { + normalizing_height = + word_res->denorm.y_scale() * (blob_row->x_height() + blob_row->ascenders()); + } + if (debug) { + tprintf("normalizing height = %g (scale %g xheight %g ascenders %g)\n", normalizing_height, + word_res->denorm.y_scale(), blob_row->x_height(), blob_row->ascenders()); + } + } + float wh_ratio = word_res->GetBlobsWidth(col, row) / normalizing_height; + if (wh_ratio > max_char_wh_ratio) { + stats->bad_shape = true; + } + // Compute the gap sum for this shape. If there are only negative or only + // positive gaps, record their sum in stats->gap_sum. However, if there is + // a mixture, record only the sum of the positive gaps. + // TODO(antonova): explain fragment. + int negative_gap_sum = 0; + for (int c = col; c < row; ++c) { + int gap = word_res->GetBlobsGap(c); + (gap > 0) ? stats->gap_sum += gap : negative_gap_sum += gap; + } + if (stats->gap_sum == 0) { + stats->gap_sum = negative_gap_sum; + } + if (debug) { + tprintf("wh_ratio=%g (max_char_wh_ratio=%g) gap_sum=%d %s\n", wh_ratio, max_char_wh_ratio, + stats->gap_sum, stats->bad_shape ? "bad_shape" : ""); + } + // Compute shape_cost (for fixed pitch mode). + if (fixed_pitch) { + bool end_row = (row == (word_res->ratings->dimension() - 1)); + + // Ensure that the blob has gaps on the left and the right sides + // (except for beginning and ending punctuation) and that there is + // no cutting through ink at the blob boundaries. + if (col > 0) { + float left_gap = word_res->GetBlobsGap(col - 1) / normalizing_height; + SEAM *left_seam = word_res->seam_array[col - 1]; + if ((!end_row && left_gap < kMinGap) || left_seam->priority() > 0.0f) { + stats->bad_shape = true; + } + if (debug) { + tprintf("left_gap %g, left_seam %g %s\n", left_gap, left_seam->priority(), + stats->bad_shape ? "bad_shape" : ""); + } + } + float right_gap = 0.0f; + if (!end_row) { + right_gap = word_res->GetBlobsGap(row) / normalizing_height; + SEAM *right_seam = word_res->seam_array[row]; + if (right_gap < kMinGap || right_seam->priority() > 0.0f) { + stats->bad_shape = true; + if (right_gap < kMinGap) { + stats->bad_fixed_pitch_right_gap = true; + } + } + if (debug) { + tprintf("right_gap %g right_seam %g %s\n", right_gap, right_seam->priority(), + stats->bad_shape ? "bad_shape" : ""); + } + } + + // Impose additional segmentation penalties if blob widths or gaps + // distribution don't fit a fixed-pitch model. + // Since we only know the widths and gaps of the path explored so far, + // the means and variances are computed for the path so far (not + // considering characters to the right of the last character on the path). + stats->full_wh_ratio = wh_ratio + right_gap; + if (parent_stats != nullptr) { + stats->full_wh_ratio_total = (parent_stats->full_wh_ratio_total + stats->full_wh_ratio); + float mean = stats->full_wh_ratio_total / static_cast<float>(parent_path_length + 1); + stats->full_wh_ratio_var = + parent_stats->full_wh_ratio_var + pow(mean - stats->full_wh_ratio, 2); + } else { + stats->full_wh_ratio_total = stats->full_wh_ratio; + } + if (debug) { + tprintf("full_wh_ratio %g full_wh_ratio_total %g full_wh_ratio_var %g\n", + stats->full_wh_ratio, stats->full_wh_ratio_total, stats->full_wh_ratio_var); + } + + stats->shape_cost = FixedPitchWidthCost(wh_ratio, right_gap, end_row, max_char_wh_ratio); + + // For some reason Tesseract prefers to treat the whole CJ words + // as one blob when the initial segmentation is particularly bad. + // This hack is to avoid favoring such states. + if (col == 0 && end_row && wh_ratio > max_char_wh_ratio) { + stats->shape_cost += 10; + } + stats->shape_cost += stats->full_wh_ratio_var; + if (debug) { + tprintf("shape_cost %g\n", stats->shape_cost); + } + } +} + +float AssociateUtils::FixedPitchWidthCost(float norm_width, float right_gap, bool end_pos, + float max_char_wh_ratio) { + float cost = 0.0f; + if (norm_width > max_char_wh_ratio) { + cost += norm_width; + } + if (norm_width > kMaxFixedPitchCharAspectRatio) { + cost += norm_width * norm_width; // extra penalty for merging CJK chars + } + // Penalize skinny blobs, except for punctuation in the last position. + if (norm_width + right_gap < 0.5f && !end_pos) { + cost += 1.0f - (norm_width + right_gap); + } + return cost; +} + +} // namespace tesseract
