Mercurial > hgrepos > Python2 > PyMuPDF
comparison mupdf-source/thirdparty/tesseract/src/training/pango/stringrenderer.cpp @ 2:b50eed0cc0ef upstream
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
The directory name has changed: no version number in the expanded directory now.
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Mon, 15 Sep 2025 11:43:07 +0200 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| 1:1d09e1dec1d9 | 2:b50eed0cc0ef |
|---|---|
| 1 /********************************************************************** | |
| 2 * File: stringrenderer.cpp | |
| 3 * Description: Class for rendering UTF-8 text to an image, and retrieving | |
| 4 * bounding boxes around each grapheme cluster. | |
| 5 * Author: Ranjith Unnikrishnan | |
| 6 * | |
| 7 * (C) Copyright 2013, Google Inc. | |
| 8 * Licensed under the Apache License, Version 2.0 (the "License"); | |
| 9 * you may not use this file except in compliance with the License. | |
| 10 * You may obtain a copy of the License at | |
| 11 * http://www.apache.org/licenses/LICENSE-2.0 | |
| 12 * Unless required by applicable law or agreed to in writing, software | |
| 13 * distributed under the License is distributed on an "AS IS" BASIS, | |
| 14 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
| 15 * See the License for the specific language governing permissions and | |
| 16 * limitations under the License. | |
| 17 * | |
| 18 **********************************************************************/ | |
| 19 | |
| 20 #include "stringrenderer.h" | |
| 21 | |
| 22 #include <allheaders.h> // from leptonica | |
| 23 #include "boxchar.h" | |
| 24 #include "helpers.h" // for TRand | |
| 25 #include "ligature_table.h" | |
| 26 #include "normstrngs.h" | |
| 27 #include "tlog.h" | |
| 28 | |
| 29 #include <tesseract/unichar.h> | |
| 30 | |
| 31 #include "pango/pango-font.h" | |
| 32 #include "pango/pango-glyph-item.h" | |
| 33 #include "unicode/uchar.h" // from libicu | |
| 34 | |
| 35 #include <algorithm> | |
| 36 #include <cassert> | |
| 37 #include <cstdio> | |
| 38 #include <cstring> | |
| 39 #include <map> | |
| 40 #include <utility> | |
| 41 #include <vector> | |
| 42 | |
| 43 #define DISABLE_HEAP_LEAK_CHECK | |
| 44 | |
| 45 namespace tesseract { | |
| 46 | |
// Resolution handed to set_resolution() by the StringRenderer constructor.
static constexpr int kDefaultOutputResolution = 300;

// Word joiner (U+2060) inserted after letters in ngram mode, as per
// recommendation in http://unicode.org/reports/tr14/ to avoid line-breaks at
// hyphens and other non-alpha characters.
// The value is spelled as explicit UTF-8 bytes so that it does not depend on
// the compiler's execution character set, and the pointer itself is const so
// the constant cannot be re-pointed by accident.
static const char *const kWordJoinerUTF8 = "\xE2\x81\xA0";
| 53 | |
| 54 static bool IsCombiner(int ch) { | |
| 55 const int char_type = u_charType(ch); | |
| 56 return ((char_type == U_NON_SPACING_MARK) || (char_type == U_ENCLOSING_MARK) || | |
| 57 (char_type == U_COMBINING_SPACING_MARK)); | |
| 58 } | |
| 59 | |
| 60 static std::string EncodeAsUTF8(const char32 ch32) { | |
| 61 UNICHAR uni_ch(ch32); | |
| 62 return std::string(uni_ch.utf8(), uni_ch.utf8_len()); | |
| 63 } | |
| 64 | |
| 65 // Returns true with probability 'prob'. | |
| 66 static bool RandBool(const double prob, TRand *rand) { | |
| 67 if (prob == 1.0) { | |
| 68 return true; | |
| 69 } | |
| 70 if (prob == 0.0) { | |
| 71 return false; | |
| 72 } | |
| 73 return rand->UnsignedRand(1.0) < prob; | |
| 74 } | |
| 75 | |
| 76 /* static */ | |
| 77 static Image CairoARGB32ToPixFormat(cairo_surface_t *surface) { | |
| 78 if (cairo_image_surface_get_format(surface) != CAIRO_FORMAT_ARGB32) { | |
| 79 printf("Unexpected surface format %d\n", cairo_image_surface_get_format(surface)); | |
| 80 return nullptr; | |
| 81 } | |
| 82 const int width = cairo_image_surface_get_width(surface); | |
| 83 const int height = cairo_image_surface_get_height(surface); | |
| 84 Image pix = pixCreate(width, height, 32); | |
| 85 int byte_stride = cairo_image_surface_get_stride(surface); | |
| 86 | |
| 87 for (int i = 0; i < height; ++i) { | |
| 88 memcpy(reinterpret_cast<unsigned char *>(pixGetData(pix) + i * pixGetWpl(pix)) + 1, | |
| 89 cairo_image_surface_get_data(surface) + i * byte_stride, | |
| 90 byte_stride - ((i == height - 1) ? 1 : 0)); | |
| 91 } | |
| 92 return pix; | |
| 93 } | |
| 94 | |
| 95 StringRenderer::StringRenderer(const std::string &font_desc, int page_width, int page_height) | |
| 96 : font_(font_desc) | |
| 97 , page_width_(page_width) | |
| 98 , page_height_(page_height) | |
| 99 , h_margin_(50) | |
| 100 , v_margin_(50) | |
| 101 , pen_color_{0.0, 0.0, 0.0} | |
| 102 , char_spacing_(0) | |
| 103 , leading_(0) | |
| 104 , vertical_text_(false) | |
| 105 , gravity_hint_strong_(false) | |
| 106 , render_fullwidth_latin_(false) | |
| 107 , underline_start_prob_(0) | |
| 108 , underline_continuation_prob_(0) | |
| 109 , underline_style_(PANGO_UNDERLINE_SINGLE) | |
| 110 , drop_uncovered_chars_(true) | |
| 111 , strip_unrenderable_words_(false) | |
| 112 , add_ligatures_(false) | |
| 113 , output_word_boxes_(false) | |
| 114 , surface_(nullptr) | |
| 115 , cr_(nullptr) | |
| 116 , layout_(nullptr) | |
| 117 , start_box_(0) | |
| 118 , page_(0) | |
| 119 , box_padding_(0) | |
| 120 , page_boxes_(nullptr) | |
| 121 , total_chars_(0) | |
| 122 , font_index_(0) | |
| 123 , last_offset_(0) { | |
| 124 set_resolution(kDefaultOutputResolution); | |
| 125 set_font(font_desc); | |
| 126 } | |
| 127 | |
| 128 bool StringRenderer::set_font(const std::string &desc) { | |
| 129 bool success = font_.ParseFontDescriptionName(desc); | |
| 130 font_.set_resolution(resolution_); | |
| 131 return success; | |
| 132 } | |
| 133 | |
| 134 void StringRenderer::set_resolution(const int resolution) { | |
| 135 resolution_ = resolution; | |
| 136 font_.set_resolution(resolution); | |
| 137 } | |
| 138 | |
| 139 void StringRenderer::set_underline_start_prob(const double frac) { | |
| 140 underline_start_prob_ = std::min(std::max(frac, 0.0), 1.0); | |
| 141 } | |
| 142 | |
| 143 void StringRenderer::set_underline_continuation_prob(const double frac) { | |
| 144 underline_continuation_prob_ = std::min(std::max(frac, 0.0), 1.0); | |
| 145 } | |
| 146 | |
| 147 StringRenderer::~StringRenderer() { | |
| 148 ClearBoxes(); | |
| 149 FreePangoCairo(); | |
| 150 } | |
| 151 | |
| 152 void StringRenderer::InitPangoCairo() { | |
| 153 FreePangoCairo(); | |
| 154 surface_ = cairo_image_surface_create(CAIRO_FORMAT_ARGB32, page_width_, page_height_); | |
| 155 cr_ = cairo_create(surface_); | |
| 156 { | |
| 157 DISABLE_HEAP_LEAK_CHECK; | |
| 158 layout_ = pango_cairo_create_layout(cr_); | |
| 159 } | |
| 160 | |
| 161 if (vertical_text_) { | |
| 162 PangoContext *context = pango_layout_get_context(layout_); | |
| 163 pango_context_set_base_gravity(context, PANGO_GRAVITY_EAST); | |
| 164 if (gravity_hint_strong_) { | |
| 165 pango_context_set_gravity_hint(context, PANGO_GRAVITY_HINT_STRONG); | |
| 166 } | |
| 167 pango_layout_context_changed(layout_); | |
| 168 } | |
| 169 | |
| 170 SetLayoutProperties(); | |
| 171 } | |
| 172 | |
| 173 void StringRenderer::SetLayoutProperties() { | |
| 174 std::string font_desc = font_.DescriptionName(); | |
| 175 // Specify the font via a description name | |
| 176 PangoFontDescription *desc = pango_font_description_from_string(font_desc.c_str()); | |
| 177 // Assign the font description to the layout | |
| 178 pango_layout_set_font_description(layout_, desc); | |
| 179 pango_font_description_free(desc); // free the description | |
| 180 pango_cairo_context_set_resolution(pango_layout_get_context(layout_), resolution_); | |
| 181 | |
| 182 int max_width = page_width_ - 2 * h_margin_; | |
| 183 int max_height = page_height_ - 2 * v_margin_; | |
| 184 tlog(3, "max_width = %d, max_height = %d\n", max_width, max_height); | |
| 185 if (vertical_text_) { | |
| 186 using std::swap; | |
| 187 swap(max_width, max_height); | |
| 188 } | |
| 189 pango_layout_set_width(layout_, max_width * PANGO_SCALE); | |
| 190 // Ultra-wide Thai strings need to wrap at char level. | |
| 191 pango_layout_set_wrap(layout_, PANGO_WRAP_WORD_CHAR); | |
| 192 | |
| 193 // Adjust character spacing | |
| 194 PangoAttrList *attr_list = pango_attr_list_new(); | |
| 195 if (char_spacing_) { | |
| 196 PangoAttribute *spacing_attr = pango_attr_letter_spacing_new(char_spacing_ * PANGO_SCALE); | |
| 197 spacing_attr->start_index = 0; | |
| 198 spacing_attr->end_index = static_cast<guint>(-1); | |
| 199 pango_attr_list_change(attr_list, spacing_attr); | |
| 200 } | |
| 201 | |
| 202 if (add_ligatures_) { | |
| 203 set_features("liga, clig, dlig, hlig"); | |
| 204 PangoAttribute *feature_attr = pango_attr_font_features_new(features_.c_str()); | |
| 205 pango_attr_list_change(attr_list, feature_attr); | |
| 206 } | |
| 207 | |
| 208 pango_layout_set_attributes(layout_, attr_list); | |
| 209 pango_attr_list_unref(attr_list); | |
| 210 // Adjust line spacing | |
| 211 if (leading_) { | |
| 212 pango_layout_set_spacing(layout_, leading_ * PANGO_SCALE); | |
| 213 } | |
| 214 } | |
| 215 | |
| 216 void StringRenderer::FreePangoCairo() { | |
| 217 if (layout_) { | |
| 218 g_object_unref(layout_); | |
| 219 layout_ = nullptr; | |
| 220 } | |
| 221 if (cr_) { | |
| 222 cairo_destroy(cr_); | |
| 223 cr_ = nullptr; | |
| 224 } | |
| 225 if (surface_) { | |
| 226 cairo_surface_destroy(surface_); | |
| 227 surface_ = nullptr; | |
| 228 } | |
| 229 } | |
| 230 | |
| 231 void StringRenderer::SetWordUnderlineAttributes(const std::string &page_text) { | |
| 232 if (underline_start_prob_ == 0) { | |
| 233 return; | |
| 234 } | |
| 235 PangoAttrList *attr_list = pango_layout_get_attributes(layout_); | |
| 236 | |
| 237 const char *text = page_text.c_str(); | |
| 238 size_t offset = 0; | |
| 239 TRand rand; | |
| 240 bool started_underline = false; | |
| 241 PangoAttribute *und_attr = nullptr; | |
| 242 | |
| 243 while (offset < page_text.length()) { | |
| 244 offset += SpanUTF8Whitespace(text + offset); | |
| 245 if (offset == page_text.length()) { | |
| 246 break; | |
| 247 } | |
| 248 | |
| 249 int word_start = offset; | |
| 250 int word_len = SpanUTF8NotWhitespace(text + offset); | |
| 251 offset += word_len; | |
| 252 if (started_underline) { | |
| 253 // Should we continue the underline to the next word? | |
| 254 if (RandBool(underline_continuation_prob_, &rand)) { | |
| 255 // Continue the current underline to this word. | |
| 256 und_attr->end_index = word_start + word_len; | |
| 257 } else { | |
| 258 // Otherwise end the current underline attribute at the end of the | |
| 259 // previous word. | |
| 260 pango_attr_list_insert(attr_list, und_attr); | |
| 261 started_underline = false; | |
| 262 und_attr = nullptr; | |
| 263 } | |
| 264 } | |
| 265 if (!started_underline && RandBool(underline_start_prob_, &rand)) { | |
| 266 // Start a new underline attribute | |
| 267 und_attr = pango_attr_underline_new(underline_style_); | |
| 268 und_attr->start_index = word_start; | |
| 269 und_attr->end_index = word_start + word_len; | |
| 270 started_underline = true; | |
| 271 } | |
| 272 } | |
| 273 // Finish the current underline attribute at the end of the page. | |
| 274 if (started_underline) { | |
| 275 und_attr->end_index = page_text.length(); | |
| 276 pango_attr_list_insert(attr_list, und_attr); | |
| 277 } | |
| 278 } | |
| 279 | |
| 280 // Returns offset in utf8 bytes to first page. | |
| 281 int StringRenderer::FindFirstPageBreakOffset(const char *text, int text_length) { | |
| 282 if (!text_length) { | |
| 283 return 0; | |
| 284 } | |
| 285 const int max_height = (page_height_ - 2 * v_margin_); | |
| 286 const int max_width = (page_width_ - 2 * h_margin_); | |
| 287 const int max_layout_height = vertical_text_ ? max_width : max_height; | |
| 288 | |
| 289 UNICHAR::const_iterator it = UNICHAR::begin(text, text_length); | |
| 290 const UNICHAR::const_iterator it_end = UNICHAR::end(text, text_length); | |
| 291 const int kMaxUnicodeBufLength = 15000; | |
| 292 for (int i = 0; i < kMaxUnicodeBufLength && it != it_end; ++it, ++i) { | |
| 293 ; | |
| 294 } | |
| 295 int buf_length = it.utf8_data() - text; | |
| 296 tlog(1, "len = %d buf_len = %d\n", text_length, buf_length); | |
| 297 pango_layout_set_text(layout_, text, buf_length); | |
| 298 | |
| 299 PangoLayoutIter *line_iter = nullptr; | |
| 300 { // Fontconfig caches some info here that is not freed before exit. | |
| 301 DISABLE_HEAP_LEAK_CHECK; | |
| 302 line_iter = pango_layout_get_iter(layout_); | |
| 303 } | |
| 304 bool first_page = true; | |
| 305 int page_top = 0; | |
| 306 int offset = buf_length; | |
| 307 do { | |
| 308 // Get bounding box of the current line | |
| 309 PangoRectangle line_ink_rect; | |
| 310 pango_layout_iter_get_line_extents(line_iter, &line_ink_rect, nullptr); | |
| 311 pango_extents_to_pixels(&line_ink_rect, nullptr); | |
| 312 PangoLayoutLine *line = pango_layout_iter_get_line_readonly(line_iter); | |
| 313 if (first_page) { | |
| 314 page_top = line_ink_rect.y; | |
| 315 first_page = false; | |
| 316 } | |
| 317 int line_bottom = line_ink_rect.y + line_ink_rect.height; | |
| 318 if (line_bottom - page_top > max_layout_height) { | |
| 319 offset = line->start_index; | |
| 320 tlog(1, "Found offset = %d\n", offset); | |
| 321 break; | |
| 322 } | |
| 323 } while (pango_layout_iter_next_line(line_iter)); | |
| 324 pango_layout_iter_free(line_iter); | |
| 325 return offset; | |
| 326 } | |
| 327 | |
| 328 const std::vector<BoxChar *> &StringRenderer::GetBoxes() const { | |
| 329 return boxchars_; | |
| 330 } | |
| 331 | |
| 332 Boxa *StringRenderer::GetPageBoxes() const { | |
| 333 return page_boxes_; | |
| 334 } | |
| 335 | |
| 336 void StringRenderer::RotatePageBoxes(float rotation) { | |
| 337 BoxChar::RotateBoxes(rotation, page_width_ / 2, page_height_ / 2, start_box_, boxchars_.size(), | |
| 338 &boxchars_); | |
| 339 } | |
| 340 | |
| 341 void StringRenderer::ClearBoxes() { | |
| 342 for (auto &boxchar : boxchars_) { | |
| 343 delete boxchar; | |
| 344 } | |
| 345 boxchars_.clear(); | |
| 346 boxaDestroy(&page_boxes_); | |
| 347 } | |
| 348 | |
| 349 std::string StringRenderer::GetBoxesStr() { | |
| 350 BoxChar::PrepareToWrite(&boxchars_); | |
| 351 return BoxChar::GetTesseractBoxStr(page_height_, boxchars_); | |
| 352 } | |
| 353 | |
| 354 void StringRenderer::WriteAllBoxes(const std::string &filename) { | |
| 355 BoxChar::PrepareToWrite(&boxchars_); | |
| 356 BoxChar::WriteTesseractBoxFile(filename, page_height_, boxchars_); | |
| 357 } | |
| 358 | |
| 359 // Returns cluster strings in logical order. | |
| 360 bool StringRenderer::GetClusterStrings(std::vector<std::string> *cluster_text) { | |
| 361 std::map<int, std::string> start_byte_to_text; | |
| 362 PangoLayoutIter *run_iter = pango_layout_get_iter(layout_); | |
| 363 const char *full_text = pango_layout_get_text(layout_); | |
| 364 do { | |
| 365 PangoLayoutRun *run = pango_layout_iter_get_run_readonly(run_iter); | |
| 366 if (!run) { | |
| 367 // End of line nullptr run marker | |
| 368 tlog(2, "Found end of line marker\n"); | |
| 369 continue; | |
| 370 } | |
| 371 PangoGlyphItemIter cluster_iter; | |
| 372 gboolean have_cluster; | |
| 373 for (have_cluster = pango_glyph_item_iter_init_start(&cluster_iter, run, full_text); | |
| 374 have_cluster; have_cluster = pango_glyph_item_iter_next_cluster(&cluster_iter)) { | |
| 375 const int start_byte_index = cluster_iter.start_index; | |
| 376 const int end_byte_index = cluster_iter.end_index; | |
| 377 std::string text = | |
| 378 std::string(full_text + start_byte_index, end_byte_index - start_byte_index); | |
| 379 if (IsUTF8Whitespace(text.c_str())) { | |
| 380 tlog(2, "Found whitespace\n"); | |
| 381 text = " "; | |
| 382 } | |
| 383 tlog(2, "start_byte=%d end_byte=%d : '%s'\n", start_byte_index, end_byte_index, text.c_str()); | |
| 384 if (add_ligatures_) { | |
| 385 // Make sure the output box files have ligatured text in case the font | |
| 386 // decided to use an unmapped glyph. | |
| 387 text = LigatureTable::Get()->AddLigatures(text, nullptr); | |
| 388 } | |
| 389 start_byte_to_text[start_byte_index] = std::move(text); | |
| 390 } | |
| 391 } while (pango_layout_iter_next_run(run_iter)); | |
| 392 pango_layout_iter_free(run_iter); | |
| 393 | |
| 394 cluster_text->clear(); | |
| 395 for (auto it = start_byte_to_text.begin(); it != start_byte_to_text.end(); ++it) { | |
| 396 cluster_text->push_back(it->second); | |
| 397 } | |
| 398 return !cluster_text->empty(); | |
| 399 } | |
| 400 | |
| 401 // Merges an array of BoxChars into words based on the identification of | |
| 402 // BoxChars containing the space character as inter-word separators. | |
| 403 // | |
| 404 // Sometime two adjacent characters in the sequence may be detected as lying on | |
| 405 // different lines based on their spatial positions. This may be the result of a | |
| 406 // newline character at end of the last word on a line in the source text, or of | |
| 407 // a discretionary line-break created by Pango at intra-word locations like | |
| 408 // hyphens. When this is detected the word is split at that location into | |
| 409 // multiple BoxChars. Otherwise, each resulting BoxChar will contain a word and | |
| 410 // its bounding box. | |
| 411 static void MergeBoxCharsToWords(std::vector<BoxChar *> *boxchars) { | |
| 412 std::vector<BoxChar *> result; | |
| 413 bool started_word = false; | |
| 414 for (auto &boxchar : *boxchars) { | |
| 415 if (boxchar->ch() == " " || boxchar->box() == nullptr) { | |
| 416 result.push_back(boxchar); | |
| 417 boxchar = nullptr; | |
| 418 started_word = false; | |
| 419 continue; | |
| 420 } | |
| 421 | |
| 422 if (!started_word) { | |
| 423 // Begin new word | |
| 424 started_word = true; | |
| 425 result.push_back(boxchar); | |
| 426 boxchar = nullptr; | |
| 427 } else { | |
| 428 BoxChar *last_boxchar = result.back(); | |
| 429 // Compute bounding box union | |
| 430 const Box *box = boxchar->box(); | |
| 431 Box *last_box = last_boxchar->mutable_box(); | |
| 432 int left = std::min(last_box->x, box->x); | |
| 433 int right = std::max(last_box->x + last_box->w, box->x + box->w); | |
| 434 int top = std::min(last_box->y, box->y); | |
| 435 int bottom = std::max(last_box->y + last_box->h, box->y + box->h); | |
| 436 // Conclude that the word was broken to span multiple lines based on the | |
| 437 // size of the merged bounding box in relation to those of the individual | |
| 438 // characters seen so far. | |
| 439 if (right - left > last_box->w + 5 * box->w) { | |
| 440 tlog(1, "Found line break after '%s'", last_boxchar->ch().c_str()); | |
| 441 // Insert a fake interword space and start a new word with the current | |
| 442 // boxchar. | |
| 443 result.push_back(new BoxChar(" ", 1)); | |
| 444 result.push_back(boxchar); | |
| 445 boxchar = nullptr; | |
| 446 continue; | |
| 447 } | |
| 448 // Append to last word | |
| 449 last_boxchar->mutable_ch()->append(boxchar->ch()); | |
| 450 last_box->x = left; | |
| 451 last_box->w = right - left; | |
| 452 last_box->y = top; | |
| 453 last_box->h = bottom - top; | |
| 454 delete boxchar; | |
| 455 boxchar = nullptr; | |
| 456 } | |
| 457 } | |
| 458 boxchars->swap(result); | |
| 459 } | |
| 460 | |
| 461 void StringRenderer::ComputeClusterBoxes() { | |
| 462 const char *text = pango_layout_get_text(layout_); | |
| 463 PangoLayoutIter *cluster_iter = pango_layout_get_iter(layout_); | |
| 464 | |
| 465 // Do a first pass to store cluster start indexes. | |
| 466 std::vector<int> cluster_start_indices; | |
| 467 do { | |
| 468 cluster_start_indices.push_back(pango_layout_iter_get_index(cluster_iter)); | |
| 469 tlog(3, "Added %d\n", cluster_start_indices.back()); | |
| 470 } while (pango_layout_iter_next_cluster(cluster_iter)); | |
| 471 pango_layout_iter_free(cluster_iter); | |
| 472 cluster_start_indices.push_back(strlen(text)); | |
| 473 tlog(3, "Added last index %d\n", cluster_start_indices.back()); | |
| 474 // Sort the indices and create a map from start to end indices. | |
| 475 std::sort(cluster_start_indices.begin(), cluster_start_indices.end()); | |
| 476 std::map<int, int> cluster_start_to_end_index; | |
| 477 for (size_t i = 0; i + 1 < cluster_start_indices.size(); ++i) { | |
| 478 cluster_start_to_end_index[cluster_start_indices[i]] = cluster_start_indices[i + 1]; | |
| 479 } | |
| 480 | |
| 481 // Iterate again to compute cluster boxes and their text with the obtained | |
| 482 // cluster extent information. | |
| 483 cluster_iter = pango_layout_get_iter(layout_); | |
| 484 // Store BoxChars* sorted by their byte start positions | |
| 485 std::map<int, BoxChar *> start_byte_to_box; | |
| 486 do { | |
| 487 PangoRectangle cluster_rect; | |
| 488 pango_layout_iter_get_cluster_extents(cluster_iter, &cluster_rect, nullptr); | |
| 489 pango_extents_to_pixels(&cluster_rect, nullptr); | |
| 490 const int start_byte_index = pango_layout_iter_get_index(cluster_iter); | |
| 491 const int end_byte_index = cluster_start_to_end_index[start_byte_index]; | |
| 492 std::string cluster_text = | |
| 493 std::string(text + start_byte_index, end_byte_index - start_byte_index); | |
| 494 if (!cluster_text.empty() && cluster_text[0] == '\n') { | |
| 495 tlog(2, "Skipping newlines at start of text.\n"); | |
| 496 continue; | |
| 497 } | |
| 498 if (!cluster_rect.width || !cluster_rect.height || IsUTF8Whitespace(cluster_text.c_str())) { | |
| 499 tlog(2, "Skipping whitespace with boxdim (%d,%d) '%s'\n", cluster_rect.width, | |
| 500 cluster_rect.height, cluster_text.c_str()); | |
| 501 auto *boxchar = new BoxChar(" ", 1); | |
| 502 boxchar->set_page(page_); | |
| 503 start_byte_to_box[start_byte_index] = boxchar; | |
| 504 continue; | |
| 505 } | |
| 506 // Prepare a boxchar for addition at this byte position. | |
| 507 tlog(2, "[%d %d], %d, %d : start_byte=%d end_byte=%d : '%s'\n", cluster_rect.x, cluster_rect.y, | |
| 508 cluster_rect.width, cluster_rect.height, start_byte_index, end_byte_index, | |
| 509 cluster_text.c_str()); | |
| 510 ASSERT_HOST_MSG(cluster_rect.width, "cluster_text:%s start_byte_index:%d\n", | |
| 511 cluster_text.c_str(), start_byte_index); | |
| 512 ASSERT_HOST_MSG(cluster_rect.height, "cluster_text:%s start_byte_index:%d\n", | |
| 513 cluster_text.c_str(), start_byte_index); | |
| 514 if (box_padding_) { | |
| 515 cluster_rect.x = std::max(0, cluster_rect.x - box_padding_); | |
| 516 cluster_rect.width += 2 * box_padding_; | |
| 517 cluster_rect.y = std::max(0, cluster_rect.y - box_padding_); | |
| 518 cluster_rect.height += 2 * box_padding_; | |
| 519 } | |
| 520 if (add_ligatures_) { | |
| 521 // Make sure the output box files have ligatured text in case the font | |
| 522 // decided to use an unmapped glyph. | |
| 523 cluster_text = LigatureTable::Get()->AddLigatures(cluster_text, nullptr); | |
| 524 } | |
| 525 auto *boxchar = new BoxChar(cluster_text.c_str(), cluster_text.size()); | |
| 526 boxchar->set_page(page_); | |
| 527 boxchar->AddBox(cluster_rect.x, cluster_rect.y, cluster_rect.width, cluster_rect.height); | |
| 528 start_byte_to_box[start_byte_index] = boxchar; | |
| 529 } while (pango_layout_iter_next_cluster(cluster_iter)); | |
| 530 pango_layout_iter_free(cluster_iter); | |
| 531 | |
| 532 // There is a subtle bug in the cluster text reported by the PangoLayoutIter | |
| 533 // on ligatured characters (eg. The word "Lam-Aliph" in arabic). To work | |
| 534 // around this, we use text reported using the PangoGlyphIter which is | |
| 535 // accurate. | |
| 536 // TODO(ranjith): Revisit whether this is still needed in newer versions of | |
| 537 // pango. | |
| 538 std::vector<std::string> cluster_text; | |
| 539 if (GetClusterStrings(&cluster_text)) { | |
| 540 ASSERT_HOST(cluster_text.size() == start_byte_to_box.size()); | |
| 541 int ind = 0; | |
| 542 for (auto it = start_byte_to_box.begin(); it != start_byte_to_box.end(); ++it, ++ind) { | |
| 543 it->second->mutable_ch()->swap(cluster_text[ind]); | |
| 544 } | |
| 545 } | |
| 546 | |
| 547 // Append to the boxchars list in byte order. | |
| 548 std::vector<BoxChar *> page_boxchars; | |
| 549 page_boxchars.reserve(start_byte_to_box.size()); | |
| 550 std::string last_ch; | |
| 551 for (auto it = start_byte_to_box.begin(); it != start_byte_to_box.end(); ++it) { | |
| 552 if (it->second->ch() == kWordJoinerUTF8) { | |
| 553 // Skip zero-width joiner characters (ZWJs) here. | |
| 554 delete it->second; | |
| 555 } else { | |
| 556 page_boxchars.push_back(it->second); | |
| 557 } | |
| 558 } | |
| 559 CorrectBoxPositionsToLayout(&page_boxchars); | |
| 560 | |
| 561 if (render_fullwidth_latin_) { | |
| 562 for (auto &it : start_byte_to_box) { | |
| 563 // Convert fullwidth Latin characters to their halfwidth forms. | |
| 564 std::string half(ConvertFullwidthLatinToBasicLatin(it.second->ch())); | |
| 565 it.second->mutable_ch()->swap(half); | |
| 566 } | |
| 567 } | |
| 568 | |
| 569 // Merge the character boxes into word boxes if we are rendering n-grams. | |
| 570 if (output_word_boxes_) { | |
| 571 MergeBoxCharsToWords(&page_boxchars); | |
| 572 } | |
| 573 | |
| 574 boxchars_.insert(boxchars_.end(), page_boxchars.begin(), page_boxchars.end()); | |
| 575 | |
| 576 // Compute the page bounding box | |
| 577 Box *page_box = nullptr; | |
| 578 Boxa *all_boxes = nullptr; | |
| 579 for (auto &page_boxchar : page_boxchars) { | |
| 580 if (page_boxchar->box() == nullptr) { | |
| 581 continue; | |
| 582 } | |
| 583 if (all_boxes == nullptr) { | |
| 584 all_boxes = boxaCreate(0); | |
| 585 } | |
| 586 boxaAddBox(all_boxes, page_boxchar->mutable_box(), L_CLONE); | |
| 587 } | |
| 588 if (all_boxes != nullptr) { | |
| 589 boxaGetExtent(all_boxes, nullptr, nullptr, &page_box); | |
| 590 boxaDestroy(&all_boxes); | |
| 591 if (page_boxes_ == nullptr) { | |
| 592 page_boxes_ = boxaCreate(0); | |
| 593 } | |
| 594 boxaAddBox(page_boxes_, page_box, L_INSERT); | |
| 595 } | |
| 596 } | |
| 597 | |
| 598 void StringRenderer::CorrectBoxPositionsToLayout(std::vector<BoxChar *> *boxchars) { | |
| 599 if (vertical_text_) { | |
| 600 const double rotation = -pango_gravity_to_rotation( | |
| 601 pango_context_get_base_gravity(pango_layout_get_context(layout_))); | |
| 602 BoxChar::TranslateBoxes(page_width_ - h_margin_, v_margin_, boxchars); | |
| 603 BoxChar::RotateBoxes(rotation, page_width_ - h_margin_, v_margin_, 0, boxchars->size(), | |
| 604 boxchars); | |
| 605 } else { | |
| 606 BoxChar::TranslateBoxes(h_margin_, v_margin_, boxchars); | |
| 607 } | |
| 608 } | |
| 609 | |
| 610 int StringRenderer::StripUnrenderableWords(std::string *utf8_text) const { | |
| 611 std::string output_text; | |
| 612 std::string unrenderable_words; | |
| 613 const char *text = utf8_text->c_str(); | |
| 614 size_t offset = 0; | |
| 615 int num_dropped = 0; | |
| 616 while (offset < utf8_text->length()) { | |
| 617 int space_len = SpanUTF8Whitespace(text + offset); | |
| 618 output_text.append(text + offset, space_len); | |
| 619 offset += space_len; | |
| 620 if (offset == utf8_text->length()) { | |
| 621 break; | |
| 622 } | |
| 623 | |
| 624 int word_len = SpanUTF8NotWhitespace(text + offset); | |
| 625 if (font_.CanRenderString(text + offset, word_len)) { | |
| 626 output_text.append(text + offset, word_len); | |
| 627 } else { | |
| 628 ++num_dropped; | |
| 629 unrenderable_words.append(text + offset, word_len); | |
| 630 unrenderable_words.append(" "); | |
| 631 } | |
| 632 offset += word_len; | |
| 633 } | |
| 634 utf8_text->swap(output_text); | |
| 635 | |
| 636 if (num_dropped > 0) { | |
| 637 tprintf("Stripped %d unrenderable word(s): '%s'\n", num_dropped, unrenderable_words.c_str()); | |
| 638 } | |
| 639 return num_dropped; | |
| 640 } | |
| 641 | |
| 642 int StringRenderer::RenderToGrayscaleImage(const char *text, int text_length, Image *pix) { | |
| 643 Image orig_pix = nullptr; | |
| 644 int offset = RenderToImage(text, text_length, &orig_pix); | |
| 645 if (orig_pix) { | |
| 646 *pix = pixConvertTo8(orig_pix, false); | |
| 647 orig_pix.destroy(); | |
| 648 } | |
| 649 return offset; | |
| 650 } | |
| 651 | |
| 652 int StringRenderer::RenderToBinaryImage(const char *text, int text_length, int threshold, | |
| 653 Image *pix) { | |
| 654 Image orig_pix = nullptr; | |
| 655 int offset = RenderToImage(text, text_length, &orig_pix); | |
| 656 if (orig_pix) { | |
| 657 Image gray_pix = pixConvertTo8(orig_pix, false); | |
| 658 orig_pix.destroy(); | |
| 659 *pix = pixThresholdToBinary(gray_pix, threshold); | |
| 660 gray_pix.destroy(); | |
| 661 } else { | |
| 662 *pix = orig_pix; | |
| 663 } | |
| 664 return offset; | |
| 665 } | |
| 666 | |
| 667 // Add word joiner (WJ) characters between adjacent non-space characters except | |
| 668 // immediately before a combiner. | |
| 669 /* static */ | |
| 670 std::string StringRenderer::InsertWordJoiners(const std::string &text) { | |
| 671 std::string out_str; | |
| 672 const UNICHAR::const_iterator it_end = UNICHAR::end(text.c_str(), text.length()); | |
| 673 for (UNICHAR::const_iterator it = UNICHAR::begin(text.c_str(), text.length()); it != it_end; | |
| 674 ++it) { | |
| 675 // Add the symbol to the output string. | |
| 676 out_str.append(it.utf8_data(), it.utf8_len()); | |
| 677 // Check the next symbol. | |
| 678 UNICHAR::const_iterator next_it = it; | |
| 679 ++next_it; | |
| 680 bool next_char_is_boundary = (next_it == it_end || *next_it == ' '); | |
| 681 bool next_char_is_combiner = (next_it == it_end) ? false : IsCombiner(*next_it); | |
| 682 if (*it != ' ' && *it != '\n' && !next_char_is_boundary && !next_char_is_combiner) { | |
| 683 out_str += kWordJoinerUTF8; | |
| 684 } | |
| 685 } | |
| 686 return out_str; | |
| 687 } | |
| 688 | |
| 689 // Convert halfwidth Basic Latin characters to their fullwidth forms. | |
| 690 std::string StringRenderer::ConvertBasicLatinToFullwidthLatin(const std::string &str) { | |
| 691 std::string full_str; | |
| 692 const UNICHAR::const_iterator it_end = UNICHAR::end(str.c_str(), str.length()); | |
| 693 for (UNICHAR::const_iterator it = UNICHAR::begin(str.c_str(), str.length()); it != it_end; ++it) { | |
| 694 // Convert printable and non-space 7-bit ASCII characters to | |
| 695 // their fullwidth forms. | |
| 696 if (IsInterchangeValid7BitAscii(*it) && isprint(*it) && !isspace(*it)) { | |
| 697 // Convert by adding 0xFEE0 to the codepoint of 7-bit ASCII. | |
| 698 char32 full_char = *it + 0xFEE0; | |
| 699 full_str.append(EncodeAsUTF8(full_char)); | |
| 700 } else { | |
| 701 full_str.append(it.utf8_data(), it.utf8_len()); | |
| 702 } | |
| 703 } | |
| 704 return full_str; | |
| 705 } | |
| 706 | |
| 707 // Convert fullwidth Latin characters to their halfwidth forms. | |
| 708 std::string StringRenderer::ConvertFullwidthLatinToBasicLatin(const std::string &str) { | |
| 709 std::string half_str; | |
| 710 UNICHAR::const_iterator it_end = UNICHAR::end(str.c_str(), str.length()); | |
| 711 for (UNICHAR::const_iterator it = UNICHAR::begin(str.c_str(), str.length()); it != it_end; ++it) { | |
| 712 char32 half_char = FullwidthToHalfwidth(*it); | |
| 713 // Convert fullwidth Latin characters to their halfwidth forms | |
| 714 // only if halfwidth forms are printable and non-space 7-bit ASCII. | |
| 715 if (IsInterchangeValid7BitAscii(half_char) && isprint(half_char) && !isspace(half_char)) { | |
| 716 half_str.append(EncodeAsUTF8(half_char)); | |
| 717 } else { | |
| 718 half_str.append(it.utf8_data(), it.utf8_len()); | |
| 719 } | |
| 720 } | |
| 721 return half_str; | |
| 722 } | |
| 723 | |
| 724 // Returns offset to end of text substring rendered in this method. | |
| 725 int StringRenderer::RenderToImage(const char *text, int text_length, Image *pix) { | |
| 726 if (pix && *pix) { | |
| 727 pix->destroy(); | |
| 728 } | |
| 729 InitPangoCairo(); | |
| 730 | |
| 731 const int page_offset = FindFirstPageBreakOffset(text, text_length); | |
| 732 if (!page_offset) { | |
| 733 return 0; | |
| 734 } | |
| 735 start_box_ = boxchars_.size(); | |
| 736 | |
| 737 if (!vertical_text_) { | |
| 738 // Translate by the specified margin | |
| 739 cairo_translate(cr_, h_margin_, v_margin_); | |
| 740 } else { | |
| 741 // Vertical text rendering is achieved by a two-step process of first | |
| 742 // performing regular horizontal layout with character orientation set to | |
| 743 // EAST, and then translating and rotating the layout before rendering onto | |
| 744 // the desired image surface. The settings required for the former step are | |
| 745 // done within InitPangoCairo(). | |
| 746 // | |
| 747 // Translate to the top-right margin of page | |
| 748 cairo_translate(cr_, page_width_ - h_margin_, v_margin_); | |
| 749 // Rotate the layout | |
| 750 double rotation = -pango_gravity_to_rotation( | |
| 751 pango_context_get_base_gravity(pango_layout_get_context(layout_))); | |
| 752 tlog(2, "Rotating by %f radians\n", rotation); | |
| 753 cairo_rotate(cr_, rotation); | |
| 754 pango_cairo_update_layout(cr_, layout_); | |
| 755 } | |
| 756 std::string page_text(text, page_offset); | |
| 757 if (render_fullwidth_latin_) { | |
| 758 // Convert Basic Latin to their fullwidth forms. | |
| 759 page_text = ConvertBasicLatinToFullwidthLatin(page_text); | |
| 760 } | |
| 761 if (strip_unrenderable_words_) { | |
| 762 StripUnrenderableWords(&page_text); | |
| 763 } | |
| 764 if (drop_uncovered_chars_ && !font_.CoversUTF8Text(page_text.c_str(), page_text.length())) { | |
| 765 int num_dropped = font_.DropUncoveredChars(&page_text); | |
| 766 if (num_dropped) { | |
| 767 tprintf("WARNING: Dropped %d uncovered characters\n", num_dropped); | |
| 768 } | |
| 769 } | |
| 770 if (add_ligatures_) { | |
| 771 // Add ligatures wherever possible, including custom ligatures. | |
| 772 page_text = LigatureTable::Get()->AddLigatures(page_text, &font_); | |
| 773 } | |
| 774 if (underline_start_prob_ > 0) { | |
| 775 SetWordUnderlineAttributes(page_text); | |
| 776 } | |
| 777 | |
| 778 pango_layout_set_text(layout_, page_text.c_str(), page_text.length()); | |
| 779 | |
| 780 if (pix) { | |
| 781 // Set a white background for the target image surface. | |
| 782 cairo_set_source_rgb(cr_, 1.0, 1.0, 1.0); // sets drawing colour to white | |
| 783 // Fill the surface with the active colour (if you don't do this, you will | |
| 784 // be given a surface with a transparent background to draw on) | |
| 785 cairo_paint(cr_); | |
| 786 // Set the ink color to black | |
| 787 cairo_set_source_rgb(cr_, pen_color_[0], pen_color_[1], pen_color_[2]); | |
| 788 // If the target surface or transformation properties of the cairo instance | |
| 789 // have changed, update the pango layout to reflect this | |
| 790 pango_cairo_update_layout(cr_, layout_); | |
| 791 { | |
| 792 DISABLE_HEAP_LEAK_CHECK; // for Fontconfig | |
| 793 // Draw the pango layout onto the cairo surface | |
| 794 pango_cairo_show_layout(cr_, layout_); | |
| 795 } | |
| 796 *pix = CairoARGB32ToPixFormat(surface_); | |
| 797 } | |
| 798 ComputeClusterBoxes(); | |
| 799 FreePangoCairo(); | |
| 800 // Update internal state variables. | |
| 801 ++page_; | |
| 802 return page_offset; | |
| 803 } | |
| 804 | |
| 805 // Render a string to an image, returning it as an 8 bit pix. Behaves as | |
| 806 // RenderString, except that it ignores the font set at construction and works | |
| 807 // through all the fonts, returning 0 until they are exhausted, at which point | |
| 808 // it returns the value it should have returned all along, but no pix this time. | |
| 809 // Fonts that don't contain a given proportion of the characters in the string | |
| 810 // get skipped. | |
| 811 // Fonts that work each get rendered and the font name gets added | |
| 812 // to the image. | |
| 813 // NOTE that no boxes are produced by this function. | |
| 814 // | |
| 815 // Example usage: To render a null terminated char-array "txt" | |
| 816 // | |
| 817 // int offset = 0; | |
| 818 // do { | |
| 819 // Image pix; | |
| 820 // offset += renderer.RenderAllFontsToImage(min_proportion, txt + offset, | |
| 821 // strlen(txt + offset), nullptr, | |
| 822 // &pix); | |
| 823 // ... | |
| 824 // } while (offset < strlen(text)); | |
| 825 // | |
| 826 int StringRenderer::RenderAllFontsToImage(double min_coverage, const char *text, int text_length, | |
| 827 std::string *font_used, Image *image) { | |
| 828 *image = nullptr; | |
| 829 // Select a suitable font to render the title with. | |
| 830 const char kTitleTemplate[] = "%s : %d hits = %.2f%%, raw = %d = %.2f%%"; | |
| 831 std::string title_font; | |
| 832 if (!FontUtils::SelectFont(kTitleTemplate, strlen(kTitleTemplate), &title_font, nullptr)) { | |
| 833 tprintf("WARNING: Could not find a font to render image title with!\n"); | |
| 834 title_font = "Arial"; | |
| 835 } | |
| 836 title_font += " 8"; | |
| 837 tlog(1, "Selected title font: %s\n", title_font.c_str()); | |
| 838 if (font_used) { | |
| 839 font_used->clear(); | |
| 840 } | |
| 841 | |
| 842 std::string orig_font = font_.DescriptionName(); | |
| 843 if (char_map_.empty()) { | |
| 844 total_chars_ = 0; | |
| 845 // Fill the hash table and use that for computing which fonts to use. | |
| 846 for (UNICHAR::const_iterator it = UNICHAR::begin(text, text_length); | |
| 847 it != UNICHAR::end(text, text_length); ++it) { | |
| 848 ++total_chars_; | |
| 849 ++char_map_[*it]; | |
| 850 } | |
| 851 tprintf("Total chars = %d\n", total_chars_); | |
| 852 } | |
| 853 const std::vector<std::string> &all_fonts = FontUtils::ListAvailableFonts(); | |
| 854 | |
| 855 for (size_t i = font_index_; i < all_fonts.size(); ++i) { | |
| 856 ++font_index_; | |
| 857 int raw_score = 0; | |
| 858 int ok_chars = FontUtils::FontScore(char_map_, all_fonts[i], &raw_score, nullptr); | |
| 859 if (ok_chars > 0 && ok_chars >= total_chars_ * min_coverage) { | |
| 860 set_font(all_fonts[i]); | |
| 861 int offset = RenderToBinaryImage(text, text_length, 128, image); | |
| 862 ClearBoxes(); // Get rid of them as they are garbage. | |
| 863 const int kMaxTitleLength = 1024; | |
| 864 char title[kMaxTitleLength]; | |
| 865 snprintf(title, kMaxTitleLength, kTitleTemplate, all_fonts[i].c_str(), ok_chars, | |
| 866 100.0 * ok_chars / total_chars_, raw_score, 100.0 * raw_score / char_map_.size()); | |
| 867 tprintf("%s\n", title); | |
| 868 // This is a good font! Store the offset to return once we've tried all | |
| 869 // the fonts. | |
| 870 if (offset) { | |
| 871 last_offset_ = offset; | |
| 872 if (font_used) { | |
| 873 *font_used = all_fonts[i]; | |
| 874 } | |
| 875 } | |
| 876 // Add the font to the image. | |
| 877 set_font(title_font); | |
| 878 v_margin_ /= 8; | |
| 879 Image title_image = nullptr; | |
| 880 RenderToBinaryImage(title, strlen(title), 128, &title_image); | |
| 881 *image |= title_image; | |
| 882 title_image.destroy(); | |
| 883 | |
| 884 v_margin_ *= 8; | |
| 885 set_font(orig_font); | |
| 886 // We return the real offset only after cycling through the list of fonts. | |
| 887 return 0; | |
| 888 } else { | |
| 889 tprintf("Font %s failed with %d hits = %.2f%%\n", all_fonts[i].c_str(), ok_chars, | |
| 890 100.0 * ok_chars / total_chars_); | |
| 891 } | |
| 892 } | |
| 893 font_index_ = 0; | |
| 894 char_map_.clear(); | |
| 895 return last_offset_ == 0 ? -1 : last_offset_; | |
| 896 } | |
| 897 | |
| 898 } // namespace tesseract |
