Mercurial > hgrepos > Python2 > PyMuPDF
diff mupdf-source/thirdparty/tesseract/src/training/pango/boxchar.cpp @ 2:b50eed0cc0ef upstream
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
The directory name has changed: no version number in the expanded directory now.
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Mon, 15 Sep 2025 11:43:07 +0200 |
| parents | |
| children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/mupdf-source/thirdparty/tesseract/src/training/pango/boxchar.cpp Mon Sep 15 11:43:07 2025 +0200 @@ -0,0 +1,348 @@ +/********************************************************************** + * File: boxchar.cpp + * Description: Simple class to associate a Tesseract classification unit with + * its bounding box so that the boxes can be rotated as the image + * is rotated for degradation. Also includes routines to output + * the character-tagged boxes to a boxfile. + * Author: Ray Smith + * + * (C) Copyright 2013, Google Inc. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * http://www.apache.org/licenses/LICENSE-2.0 + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + **********************************************************************/ + +#include "boxchar.h" + +#include "fileio.h" +#include "normstrngs.h" +#include "tesserrstream.h" // for tesserr +#include "tprintf.h" +#include "unicharset.h" +#include "unicode/uchar.h" // from libicu + +#include <algorithm> +#include <cstddef> +#include <vector> + +// Absolute Ratio of dx:dy or dy:dx to be a newline. +const int kMinNewlineRatio = 5; + +namespace tesseract { + +BoxChar::BoxChar(const char *utf8_str, int len) + : ch_(utf8_str, len), box_(nullptr), page_(0), rtl_index_(-1) {} + +BoxChar::~BoxChar() { + boxDestroy(&box_); +} + +void BoxChar::AddBox(int x, int y, int width, int height) { + box_ = boxCreate(x, y, width, height); +} + +// Increments *num_rtl and *num_ltr according to the directionality of +// characters in the box. +void BoxChar::GetDirection(int *num_rtl, int *num_ltr) const { + // Convert the unichar to UTF32 representation + std::vector<char32> uni_vector = UNICHAR::UTF8ToUTF32(ch_.c_str()); + if (uni_vector.empty()) { + tprintf("Illegal utf8 in boxchar string:%s = ", ch_.c_str()); + for (char c : ch_) { + tprintf(" 0x%x", c); + } + tprintf("\n"); + return; + } + for (char32 ch : uni_vector) { + UCharDirection dir = u_charDirection(ch); + if (dir == U_RIGHT_TO_LEFT || dir == U_RIGHT_TO_LEFT_ARABIC || dir == U_RIGHT_TO_LEFT_ISOLATE) { + ++*num_rtl; + } else if ((dir == U_ARABIC_NUMBER) || + (dir != U_DIR_NON_SPACING_MARK && dir != U_BOUNDARY_NEUTRAL)) { + ++*num_ltr; + } + } +} + +// Reverses the order of unicodes within the box. If Pango generates a +// ligature, these will get reversed on output, so reverse now. +void BoxChar::ReverseUnicodesInBox() { + std::vector<char32> unicodes = UNICHAR::UTF8ToUTF32(ch_.c_str()); + std::reverse(unicodes.begin(), unicodes.end()); + ch_ = UNICHAR::UTF32ToUTF8(unicodes); +} + +/* static */ +void BoxChar::TranslateBoxes(int xshift, int yshift, std::vector<BoxChar *> *boxes) { + for (auto &boxe : *boxes) { + Box *box = boxe->box_; + if (box != nullptr) { + box->x += xshift; + box->y += yshift; + } + } +} + +// Prepares for writing the boxes to a file by inserting newlines, spaces, +// and re-ordering so the boxes are strictly left-to-right. +/* static */ +void BoxChar::PrepareToWrite(std::vector<BoxChar *> *boxes) { + bool rtl_rules = ContainsMostlyRTL(*boxes); + bool vertical_rules = MostlyVertical(*boxes); + InsertNewlines(rtl_rules, vertical_rules, boxes); + InsertSpaces(rtl_rules, vertical_rules, boxes); + for (size_t i = 0; i < boxes->size(); ++i) { + if ((*boxes)[i]->box_ == nullptr) { + tesserr << "Null box at index " << i << '\n'; + } + } + if (rtl_rules) { + ReorderRTLText(boxes); + } +} + +// Inserts newline (tab) characters into the vector at newline positions. +/* static */ +void BoxChar::InsertNewlines(bool rtl_rules, bool vertical_rules, std::vector<BoxChar *> *boxes) { + size_t prev_i = SIZE_MAX; + int max_shift = 0; + for (size_t i = 0; i < boxes->size(); ++i) { + Box *box = (*boxes)[i]->box_; + if (box == nullptr) { + if (prev_i == SIZE_MAX || prev_i + 1 < i || i + 1 == boxes->size()) { + // Erase null boxes at the start of a line and after another null box. + do { + delete (*boxes)[i]; + boxes->erase(boxes->begin() + i); + if (i == 0) { + break; + } + } while (i-- == boxes->size() && (*boxes)[i]->box_ == nullptr); + } + continue; + } + if (prev_i != SIZE_MAX) { + Box *prev_box = (*boxes)[prev_i]->box_; + int shift = box->x - prev_box->x; + if (vertical_rules) { + shift = box->y - prev_box->y; + } else if (rtl_rules) { + shift = -shift; + } + if (-shift > max_shift) { + // This is a newline. Since nothing cares about the size of the box, + // except the out-of-bounds checker, minimize the chance of creating + // a box outside the image by making the width and height 1. + int width = 1; + int height = 1; + int x = prev_box->x + prev_box->w; + int y = prev_box->y; + if (vertical_rules) { + x = prev_box->x; + y = prev_box->y + prev_box->h; + } else if (rtl_rules) { + x = prev_box->x - width; + if (x < 0) { + tprintf("prev x = %d, width=%d\n", prev_box->x, width); + x = 0; + } + } + if (prev_i + 1 == i) { + // New character needed. + auto *new_box = new BoxChar("\t", 1); + new_box->AddBox(x, y, width, height); + new_box->page_ = (*boxes)[i]->page_; + boxes->insert(boxes->begin() + i, new_box); + ++i; + } else { + (*boxes)[i - 1]->AddBox(x, y, width, height); + (*boxes)[i - 1]->ch_ = "\t"; + } + max_shift = 0; + } else if (shift > max_shift) { + max_shift = shift; + } + } + prev_i = i; + } +} + +// Converts nullptr boxes to space characters, with appropriate bounding boxes. +/* static */ +void BoxChar::InsertSpaces(bool rtl_rules, bool vertical_rules, std::vector<BoxChar *> *boxes) { + // After InsertNewlines, any remaining null boxes are not newlines, and are + // singletons, so add a box to each remaining null box. + for (size_t i = 1; i + 1 < boxes->size(); ++i) { + Box *box = (*boxes)[i]->box_; + if (box == nullptr) { + Box *prev = (*boxes)[i - 1]->box_; + Box *next = (*boxes)[i + 1]->box_; + ASSERT_HOST(prev != nullptr && next != nullptr); + int top = std::min(prev->y, next->y); + int bottom = std::max(prev->y + prev->h, next->y + next->h); + int left = prev->x + prev->w; + int right = next->x; + if (vertical_rules) { + top = prev->y + prev->h; + bottom = next->y; + left = std::min(prev->x, next->x); + right = std::max(prev->x + prev->w, next->x + next->w); + } else if (rtl_rules) { + // With RTL we have to account for BiDi. + // Right becomes the min left of all prior boxes back to the first + // space or newline. + right = prev->x; + left = next->x + next->w; + for (int j = i - 2; j >= 0 && (*boxes)[j]->ch_ != " " && (*boxes)[j]->ch_ != "\t"; --j) { + prev = (*boxes)[j]->box_; + ASSERT_HOST(prev != nullptr); + if (prev->x < right) { + right = prev->x; + } + } + // Left becomes the max right of all next boxes forward to the first + // space or newline. + for (size_t j = i + 2; + j < boxes->size() && (*boxes)[j]->box_ != nullptr && (*boxes)[j]->ch_ != "\t"; ++j) { + next = (*boxes)[j]->box_; + if (next->x + next->w > left) { + left = next->x + next->w; + } + } + } + // Italic and stylized characters can produce negative spaces, which + // Leptonica doesn't like, so clip to a positive size. + if (right <= left) { + right = left + 1; + } + if (bottom <= top) { + bottom = top + 1; + } + (*boxes)[i]->AddBox(left, top, right - left, bottom - top); + (*boxes)[i]->ch_ = " "; + } + } +} + +// Reorders text in a right-to-left script in left-to-right order. +/* static */ +void BoxChar::ReorderRTLText(std::vector<BoxChar *> *boxes) { + // Ideally we need the inverse of the algorithm used by ResultIterator. + // For now, let's try a sort that reverses original positions for RTL + // characters, otherwise by x-position. This should be much closer to + // correct than just sorting by x-position. + size_t num_boxes = boxes->size(); + for (size_t i = 0; i < num_boxes; ++i) { + int num_rtl = 0, num_ltr = 0; + (*boxes)[i]->GetDirection(&num_rtl, &num_ltr); + if (num_rtl > num_ltr) { + (*boxes)[i]->set_rtl_index(i); + (*boxes)[i]->ReverseUnicodesInBox(); + } + } + BoxCharPtrSort sorter; + size_t end = 0; + for (size_t start = 0; start < boxes->size(); start = end + 1) { + end = start + 1; + while (end < boxes->size() && (*boxes)[end]->ch_ != "\t") { + ++end; + } + std::sort(boxes->begin() + start, boxes->begin() + end, sorter); + } +} + +// Returns true if the vector contains mostly RTL characters. +/* static */ +bool BoxChar::ContainsMostlyRTL(const std::vector<BoxChar *> &boxes) { + int num_rtl = 0, num_ltr = 0; + for (auto boxe : boxes) { + boxe->GetDirection(&num_rtl, &num_ltr); + } + return num_rtl > num_ltr; +} + +// Returns true if the text is mostly laid out vertically. +/* static */ +bool BoxChar::MostlyVertical(const std::vector<BoxChar *> &boxes) { + int64_t total_dx = 0, total_dy = 0; + for (size_t i = 1; i < boxes.size(); ++i) { + if (boxes[i - 1]->box_ != nullptr && boxes[i]->box_ != nullptr && + boxes[i - 1]->page_ == boxes[i]->page_) { + int dx = boxes[i]->box_->x - boxes[i - 1]->box_->x; + int dy = boxes[i]->box_->y - boxes[i - 1]->box_->y; + if (abs(dx) > abs(dy) * kMinNewlineRatio || abs(dy) > abs(dx) * kMinNewlineRatio) { + total_dx += static_cast<int64_t>(dx) * dx; + total_dy += static_cast<int64_t>(dy) * dy; + } + } + } + return total_dy > total_dx; +} + +// Returns the total length of all the strings in the boxes. +/* static */ +int BoxChar::TotalByteLength(const std::vector<BoxChar *> &boxes) { + int total_length = 0; + for (auto boxe : boxes) { + total_length += boxe->ch_.size(); + } + return total_length; +} + +// Rotate the boxes in [start_box, end_box) by the given rotation. +// The rotation is in radians clockwise about the given center. +/* static */ +void BoxChar::RotateBoxes(float rotation, int xcenter, int ycenter, int start_box, int end_box, + std::vector<BoxChar *> *boxes) { + Boxa *orig = boxaCreate(0); + for (int i = start_box; i < end_box; ++i) { + Box *box = (*boxes)[i]->box_; + if (box) { + boxaAddBox(orig, box, L_CLONE); + } + } + Boxa *rotated = boxaRotate(orig, xcenter, ycenter, rotation); + boxaDestroy(&orig); + for (int i = start_box, box_ind = 0; i < end_box; ++i) { + if ((*boxes)[i]->box_) { + boxDestroy(&((*boxes)[i]->box_)); + (*boxes)[i]->box_ = boxaGetBox(rotated, box_ind++, L_CLONE); + } + } + boxaDestroy(&rotated); +} + +const int kMaxLineLength = 1024; +/* static */ +void BoxChar::WriteTesseractBoxFile(const std::string &filename, int height, + const std::vector<BoxChar *> &boxes) { + std::string output = GetTesseractBoxStr(height, boxes); + File::WriteStringToFileOrDie(output, filename); +} + +/* static */ +std::string BoxChar::GetTesseractBoxStr(int height, const std::vector<BoxChar *> &boxes) { + std::string output; + char buffer[kMaxLineLength]; + for (auto boxe : boxes) { + const Box *box = boxe->box_; + if (box == nullptr) { + tprintf("Error: Call PrepareToWrite before WriteTesseractBoxFile!!\n"); + return ""; + } + int nbytes = snprintf(buffer, kMaxLineLength, "%s %d %d %d %d %d\n", boxe->ch_.c_str(), box->x, + height - box->y - box->h, box->x + box->w, height - box->y, boxe->page_); + output.append(buffer, nbytes); + } + return output; +} + +} // namespace tesseract
