comparison mupdf-source/thirdparty/tesseract/src/training/pango/pango_font_info.cpp @ 2:b50eed0cc0ef upstream

ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4. The directory name has changed: no version number in the expanded directory now.
author Franz Glasner <fzglas.hg@dom66.de>
date Mon, 15 Sep 2025 11:43:07 +0200
parents
children
comparison
equal deleted inserted replaced
1:1d09e1dec1d9 2:b50eed0cc0ef
1 /**********************************************************************
2 * File: pango_font_info.cpp
3 * Description: Font-related objects and helper functions
4 * Author: Ranjith Unnikrishnan
5 *
6 * (C) Copyright 2013, Google Inc.
7 * Licensed under the Apache License, Version 2.0 (the "License");
8 * you may not use this file except in compliance with the License.
9 * You may obtain a copy of the License at
10 * http://www.apache.org/licenses/LICENSE-2.0
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
16 *
17 **********************************************************************/
18
19 // Include automatically generated configuration file if running autoconf.
20 #ifdef HAVE_CONFIG_H
21 # include "config_auto.h"
22 #endif
23
24 #if (defined __CYGWIN__)
25 // workaround for stdlib.h and putenv
26 # undef __STRICT_ANSI__
27 #endif
28
29 #include "commandlineflags.h"
30 #include "fileio.h"
31 #include "normstrngs.h"
32 #include "pango_font_info.h"
33 #include "tlog.h"
34
35 #include <tesseract/unichar.h>
36
37 #include "pango/pango.h"
38 #include "pango/pangocairo.h"
39 #include "pango/pangofc-font.h"
40
41 #include <algorithm>
42 #include <cstdio>
43 #include <cstdlib>
44 #include <cstring>
45
46 #ifndef _MSC_VER
47 # include <sys/param.h>
48 #endif
49
50 #define DISABLE_HEAP_LEAK_CHECK
51
52 using namespace tesseract;
53
54 namespace tesseract {
55
56 // Default assumed output resolution. Required only for providing font metrics
57 // in pixels.
58 const int kDefaultResolution = 300;
59
60 std::string PangoFontInfo::fonts_dir_;
61 std::string PangoFontInfo::cache_dir_;
62
63 static PangoGlyph get_glyph(PangoFont *font, gunichar wc) {
64 #if PANGO_VERSION_CHECK(1, 44, 0)
65 // pango_font_get_hb_font requires Pango 1.44 or newer.
66 hb_font_t *hb_font = pango_font_get_hb_font(font);
67 hb_codepoint_t glyph;
68 hb_font_get_nominal_glyph(hb_font, wc, &glyph);
69 #else
70 // Use deprecated pango_fc_font_get_glyph for older Pango versions.
71 PangoGlyph glyph = pango_fc_font_get_glyph(PANGO_FC_FONT(font), wc);
72 #endif
73 return glyph;
74 }
75
76 PangoFontInfo::PangoFontInfo() : desc_(nullptr), resolution_(kDefaultResolution) {
77 Clear();
78 }
79
80 PangoFontInfo::PangoFontInfo(const std::string &desc)
81 : desc_(nullptr), resolution_(kDefaultResolution) {
82 if (!ParseFontDescriptionName(desc)) {
83 tprintf("ERROR: Could not parse %s\n", desc.c_str());
84 Clear();
85 }
86 }
87
88 void PangoFontInfo::Clear() {
89 font_size_ = 0;
90 family_name_.clear();
91 font_type_ = UNKNOWN;
92 if (desc_) {
93 pango_font_description_free(desc_);
94 desc_ = nullptr;
95 }
96 }
97
98 PangoFontInfo::~PangoFontInfo() {
99 pango_font_description_free(desc_);
100 }
101
102 std::string PangoFontInfo::DescriptionName() const {
103 if (!desc_) {
104 return "";
105 }
106 char *desc_str = pango_font_description_to_string(desc_);
107 std::string desc_name(desc_str);
108 g_free(desc_str);
109 return desc_name;
110 }
111
112 // If not already initialized, initializes FontConfig by setting its
113 // environment variable and creating a fonts.conf file that points to the
114 // FLAGS_fonts_dir and the cache to FLAGS_fontconfig_tmpdir.
115 /* static */
116 void PangoFontInfo::SoftInitFontConfig() {
117 if (fonts_dir_.empty()) {
118 HardInitFontConfig(FLAGS_fonts_dir.c_str(), FLAGS_fontconfig_tmpdir.c_str());
119 }
120 }
121
122 // Re-initializes font config, whether or not already initialized.
123 // If already initialized, any existing cache is deleted, just to be sure.
124 /* static */
125 void PangoFontInfo::HardInitFontConfig(const char *fonts_dir, const char *cache_dir) {
126 if (!cache_dir_.empty()) {
127 File::DeleteMatchingFiles(File::JoinPath(cache_dir_.c_str(), "*cache-?").c_str());
128 }
129 const int MAX_FONTCONF_FILESIZE = 1024;
130 char fonts_conf_template[MAX_FONTCONF_FILESIZE];
131 cache_dir_ = cache_dir;
132 fonts_dir_ = fonts_dir;
133 snprintf(fonts_conf_template, MAX_FONTCONF_FILESIZE,
134 "<?xml version=\"1.0\"?>\n"
135 "<!DOCTYPE fontconfig SYSTEM \"fonts.dtd\">\n"
136 "<fontconfig>\n"
137 "<dir>%s</dir>\n"
138 "<cachedir>%s</cachedir>\n"
139 "<config></config>\n"
140 "</fontconfig>\n",
141 fonts_dir, cache_dir);
142 std::string fonts_conf_file = File::JoinPath(cache_dir, "fonts.conf");
143 File::WriteStringToFileOrDie(fonts_conf_template, fonts_conf_file);
144 #ifdef _WIN32
145 std::string env("FONTCONFIG_PATH=");
146 env.append(cache_dir);
147 _putenv(env.c_str());
148 _putenv("LANG=en_US.utf8");
149 #else
150 setenv("FONTCONFIG_PATH", cache_dir, true);
151 // Fix the locale so that the reported font names are consistent.
152 setenv("LANG", "en_US.utf8", true);
153 #endif // _WIN32
154
155 if (FcInitReinitialize() != FcTrue) {
156 tprintf("FcInitiReinitialize failed!!\n");
157 }
158 FontUtils::ReInit();
159 // Clear Pango's font cache too.
160 pango_cairo_font_map_set_default(nullptr);
161 }
162
163 static void ListFontFamilies(PangoFontFamily ***families, int *n_families) {
164 PangoFontInfo::SoftInitFontConfig();
165 PangoFontMap *font_map = pango_cairo_font_map_get_default();
166 DISABLE_HEAP_LEAK_CHECK;
167 pango_font_map_list_families(font_map, families, n_families);
168 }
169
170 bool PangoFontInfo::ParseFontDescription(const PangoFontDescription *desc) {
171 Clear();
172 const char *family = pango_font_description_get_family(desc);
173 if (!family) {
174 char *desc_str = pango_font_description_to_string(desc);
175 tprintf("WARNING: Could not parse family name from description: '%s'\n", desc_str);
176 g_free(desc_str);
177 return false;
178 }
179 family_name_ = std::string(family);
180 desc_ = pango_font_description_copy(desc);
181
182 // Set font size in points
183 font_size_ = pango_font_description_get_size(desc);
184 if (!pango_font_description_get_size_is_absolute(desc)) {
185 font_size_ /= PANGO_SCALE;
186 }
187
188 return true;
189 }
190
191 bool PangoFontInfo::ParseFontDescriptionName(const std::string &name) {
192 PangoFontDescription *desc = pango_font_description_from_string(name.c_str());
193 bool success = ParseFontDescription(desc);
194 pango_font_description_free(desc);
195 return success;
196 }
197
198 // Returns the PangoFont structure corresponding to the closest available font
199 // in the font map. Note that if the font is wholly missing, this could
200 // correspond to a completely different font family and face.
201 PangoFont *PangoFontInfo::ToPangoFont() const {
202 SoftInitFontConfig();
203 PangoFontMap *font_map = pango_cairo_font_map_get_default();
204 PangoContext *context = pango_context_new();
205 pango_cairo_context_set_resolution(context, resolution_);
206 pango_context_set_font_map(context, font_map);
207 PangoFont *font = nullptr;
208 {
209 DISABLE_HEAP_LEAK_CHECK;
210 font = pango_font_map_load_font(font_map, context, desc_);
211 }
212 g_object_unref(context);
213 return font;
214 }
215
216 bool PangoFontInfo::CoversUTF8Text(const char *utf8_text, int byte_length) const {
217 PangoFont *font = ToPangoFont();
218 if (font == nullptr) {
219 // Font not found.
220 return false;
221 }
222 PangoCoverage *coverage = pango_font_get_coverage(font, nullptr);
223 for (UNICHAR::const_iterator it = UNICHAR::begin(utf8_text, byte_length);
224 it != UNICHAR::end(utf8_text, byte_length); ++it) {
225 if (IsWhitespace(*it) || pango_is_zero_width(*it)) {
226 continue;
227 }
228 if (pango_coverage_get(coverage, *it) != PANGO_COVERAGE_EXACT) {
229 char tmp[5];
230 int len = it.get_utf8(tmp);
231 tmp[len] = '\0';
232 tlog(2, "'%s' (U+%x) not covered by font\n", tmp, *it);
233 #if PANGO_VERSION_CHECK(1, 52, 0)
234 g_object_unref(coverage);
235 #else
236 pango_coverage_unref(coverage);
237 #endif
238 g_object_unref(font);
239 return false;
240 }
241 }
242 #if PANGO_VERSION_CHECK(1, 52, 0)
243 g_object_unref(coverage);
244 #else
245 pango_coverage_unref(coverage);
246 #endif
247 g_object_unref(font);
248 return true;
249 }
250
// This variant of strncpy permits src and dest to overlap. It will copy the
// first byte first, so it is safe when dest starts at or before src.
//
// Copies at most n bytes from src to dest, stopping at the first nul byte in
// src; any remaining bytes of the n are set to nul. Nothing is written when n
// is zero, and src is never read beyond its nul terminator. Returns dest.
static char *my_strnmove(char *dest, const char *src, size_t n) {
  char *out = dest;

  // Copy characters until n reaches zero or the src byte is a nul.
  while (n != 0 && *src != '\0') {
    *out = *src;
    ++out;
    ++src;
    --n;
  }

  // If we reached a nul byte and there are more 'n' left, zero them out.
  while (n != 0) {
    *out = '\0';
    ++out;
    --n;
  }
  return dest;
}
272
273 int PangoFontInfo::DropUncoveredChars(std::string *utf8_text) const {
274 int num_dropped_chars = 0;
275 PangoFont *font = ToPangoFont();
276 if (font == nullptr) {
277 // Font not found, drop all characters.
278 num_dropped_chars = utf8_text->length();
279 utf8_text->clear();
280 return num_dropped_chars;
281 }
282 PangoCoverage *coverage = pango_font_get_coverage(font, nullptr);
283 // Maintain two iterators that point into the string. For space efficiency, we
284 // will repeatedly copy one covered UTF8 character from one to the other, and
285 // at the end resize the string to the right length.
286 char *out = const_cast<char *>(utf8_text->c_str());
287 const UNICHAR::const_iterator it_begin = UNICHAR::begin(utf8_text->c_str(), utf8_text->length());
288 const UNICHAR::const_iterator it_end = UNICHAR::end(utf8_text->c_str(), utf8_text->length());
289 for (UNICHAR::const_iterator it = it_begin; it != it_end;) {
290 // Skip bad utf-8.
291 if (!it.is_legal()) {
292 ++it; // One suitable error message will still be issued.
293 continue;
294 }
295 int unicode = *it;
296 int utf8_len = it.utf8_len();
297 const char *utf8_char = it.utf8_data();
298 // Move it forward before the data gets modified.
299 ++it;
300 if (!IsWhitespace(unicode) && !pango_is_zero_width(unicode) &&
301 pango_coverage_get(coverage, unicode) != PANGO_COVERAGE_EXACT) {
302 if (TLOG_IS_ON(2)) {
303 UNICHAR unichar(unicode);
304 char *str = unichar.utf8_str();
305 tlog(2, "'%s' (U+%x) not covered by font\n", str, unicode);
306 delete[] str;
307 }
308 ++num_dropped_chars;
309 continue;
310 }
311 my_strnmove(out, utf8_char, utf8_len);
312 out += utf8_len;
313 }
314 #if PANGO_VERSION_CHECK(1, 52, 0)
315 g_object_unref(coverage);
316 #else
317 pango_coverage_unref(coverage);
318 #endif
319 g_object_unref(font);
320 utf8_text->resize(out - utf8_text->c_str());
321 return num_dropped_chars;
322 }
323
324 bool PangoFontInfo::GetSpacingProperties(const std::string &utf8_char, int *x_bearing,
325 int *x_advance) const {
326 // Convert to equivalent PangoFont structure
327 PangoFont *font = ToPangoFont();
328 if (!font) {
329 return false;
330 }
331 // Find the glyph index in the font for the supplied utf8 character.
332 int total_advance = 0;
333 int min_bearing = 0;
334 // Handle multi-unicode strings by reporting the left-most position of the
335 // x-bearing, and right-most position of the x-advance if the string were to
336 // be rendered.
337 const UNICHAR::const_iterator it_begin = UNICHAR::begin(utf8_char.c_str(), utf8_char.length());
338 const UNICHAR::const_iterator it_end = UNICHAR::end(utf8_char.c_str(), utf8_char.length());
339 for (UNICHAR::const_iterator it = it_begin; it != it_end; ++it) {
340 PangoGlyph glyph_index = get_glyph(font, *it);
341 if (!glyph_index) {
342 // Glyph for given unicode character doesn't exist in font.
343 g_object_unref(font);
344 return false;
345 }
346 // Find the ink glyph extents for the glyph
347 PangoRectangle ink_rect, logical_rect;
348 pango_font_get_glyph_extents(font, glyph_index, &ink_rect, &logical_rect);
349 pango_extents_to_pixels(&ink_rect, nullptr);
350 pango_extents_to_pixels(&logical_rect, nullptr);
351
352 int bearing = total_advance + PANGO_LBEARING(ink_rect);
353 if (it == it_begin || bearing < min_bearing) {
354 min_bearing = bearing;
355 }
356 total_advance += PANGO_RBEARING(logical_rect);
357 }
358 *x_bearing = min_bearing;
359 *x_advance = total_advance;
360 g_object_unref(font);
361 return true;
362 }
363
364 bool PangoFontInfo::CanRenderString(const char *utf8_word, int len) const {
365 std::vector<std::string> graphemes;
366 return CanRenderString(utf8_word, len, &graphemes);
367 }
368
369 bool PangoFontInfo::CanRenderString(const char *utf8_word, int len,
370 std::vector<std::string> *graphemes) const {
371 if (graphemes) {
372 graphemes->clear();
373 }
374 // We check for font coverage of the text first, as otherwise Pango could
375 // (undesirably) fall back to another font that does have the required
376 // coverage.
377 if (!CoversUTF8Text(utf8_word, len)) {
378 return false;
379 }
380 // U+25CC dotted circle character that often (but not always) gets rendered
381 // when there is an illegal grapheme sequence.
382 const char32 kDottedCircleGlyph = 9676;
383 bool bad_glyph = false;
384 PangoFontMap *font_map = pango_cairo_font_map_get_default();
385 PangoContext *context = pango_context_new();
386 pango_context_set_font_map(context, font_map);
387 PangoLayout *layout;
388 {
389 // Pango is not releasing the cached layout.
390 DISABLE_HEAP_LEAK_CHECK;
391 layout = pango_layout_new(context);
392 }
393 if (desc_) {
394 pango_layout_set_font_description(layout, desc_);
395 } else {
396 PangoFontDescription *desc = pango_font_description_from_string(DescriptionName().c_str());
397 pango_layout_set_font_description(layout, desc);
398 pango_font_description_free(desc);
399 }
400 pango_layout_set_text(layout, utf8_word, len);
401 PangoLayoutIter *run_iter = nullptr;
402 { // Fontconfig caches some information here that is not freed before exit.
403 DISABLE_HEAP_LEAK_CHECK;
404 run_iter = pango_layout_get_iter(layout);
405 }
406 do {
407 PangoLayoutRun *run = pango_layout_iter_get_run_readonly(run_iter);
408 if (!run) {
409 tlog(2, "Found end of line nullptr run marker\n");
410 continue;
411 }
412 PangoGlyph dotted_circle_glyph;
413 PangoFont *font = run->item->analysis.font;
414
415 dotted_circle_glyph = get_glyph(font, kDottedCircleGlyph);
416
417 if (TLOG_IS_ON(2)) {
418 PangoFontDescription *desc = pango_font_describe(font);
419 char *desc_str = pango_font_description_to_string(desc);
420 tlog(2, "Desc of font in run: %s\n", desc_str);
421 g_free(desc_str);
422 pango_font_description_free(desc);
423 }
424
425 PangoGlyphItemIter cluster_iter;
426 gboolean have_cluster;
427 for (have_cluster = pango_glyph_item_iter_init_start(&cluster_iter, run, utf8_word);
428 have_cluster && !bad_glyph;
429 have_cluster = pango_glyph_item_iter_next_cluster(&cluster_iter)) {
430 const int start_byte_index = cluster_iter.start_index;
431 const int end_byte_index = cluster_iter.end_index;
432 int start_glyph_index = cluster_iter.start_glyph;
433 int end_glyph_index = cluster_iter.end_glyph;
434 std::string cluster_text =
435 std::string(utf8_word + start_byte_index, end_byte_index - start_byte_index);
436 if (graphemes) {
437 graphemes->push_back(cluster_text);
438 }
439 if (IsUTF8Whitespace(cluster_text.c_str())) {
440 tlog(2, "Skipping whitespace\n");
441 continue;
442 }
443 if (TLOG_IS_ON(2)) {
444 printf("start_byte=%d end_byte=%d start_glyph=%d end_glyph=%d ", start_byte_index,
445 end_byte_index, start_glyph_index, end_glyph_index);
446 }
447 for (int i = start_glyph_index, step = (end_glyph_index > start_glyph_index) ? 1 : -1;
448 !bad_glyph && i != end_glyph_index; i += step) {
449 const bool unknown_glyph =
450 (cluster_iter.glyph_item->glyphs->glyphs[i].glyph & PANGO_GLYPH_UNKNOWN_FLAG);
451 const bool illegal_glyph =
452 (cluster_iter.glyph_item->glyphs->glyphs[i].glyph == dotted_circle_glyph);
453 bad_glyph = unknown_glyph || illegal_glyph;
454 if (TLOG_IS_ON(2)) {
455 printf("(%d=%d)", cluster_iter.glyph_item->glyphs->glyphs[i].glyph, bad_glyph ? 1 : 0);
456 }
457 }
458 if (TLOG_IS_ON(2)) {
459 printf(" '%s'\n", cluster_text.c_str());
460 }
461 if (bad_glyph)
462 tlog(1, "Found illegal glyph!\n");
463 }
464 } while (!bad_glyph && pango_layout_iter_next_run(run_iter));
465
466 pango_layout_iter_free(run_iter);
467 g_object_unref(context);
468 g_object_unref(layout);
469 if (bad_glyph && graphemes) {
470 graphemes->clear();
471 }
472 return !bad_glyph;
473 }
474
475 // ------------------------ FontUtils ------------------------------------
476 std::vector<std::string> FontUtils::available_fonts_; // cache list
477
478 // Returns whether the specified font description is available in the fonts
479 // directory.
480 //
481 // The generated list of font families and faces includes "synthesized" font
482 // faces that are not truly loadable. Pango versions >=1.18 have a
483 // pango_font_face_is_synthesized method that can be used to prune the list.
484 // Until then, we are restricted to using a hack where we try to load the font
485 // from the font_map, and then check what we loaded to see if it has the
486 // description we expected. If it is not, then the font is deemed unavailable.
487 //
488 // TODO: This function reports also some not synthesized fonts as not available
489 // e.g. 'Bitstream Charter Medium Italic', 'LMRoman17', so we need this hack
490 // until other solution is found.
491 /* static */
492 bool FontUtils::IsAvailableFont(const char *input_query_desc, std::string *best_match) {
493 std::string query_desc(input_query_desc);
494 PangoFontDescription *desc = pango_font_description_from_string(query_desc.c_str());
495 PangoFont *selected_font = nullptr;
496 {
497 PangoFontInfo::SoftInitFontConfig();
498 PangoFontMap *font_map = pango_cairo_font_map_get_default();
499 PangoContext *context = pango_context_new();
500 pango_context_set_font_map(context, font_map);
501 {
502 DISABLE_HEAP_LEAK_CHECK;
503 selected_font = pango_font_map_load_font(font_map, context, desc);
504 }
505 g_object_unref(context);
506 }
507 if (selected_font == nullptr) {
508 pango_font_description_free(desc);
509 tlog(4, "** Font '%s' failed to load from font map!\n", input_query_desc);
510 return false;
511 }
512 PangoFontDescription *selected_desc = pango_font_describe(selected_font);
513
514 bool equal = pango_font_description_equal(desc, selected_desc);
515 tlog(3, "query weight = %d \t selected weight =%d\n", pango_font_description_get_weight(desc),
516 pango_font_description_get_weight(selected_desc));
517
518 char *selected_desc_str = pango_font_description_to_string(selected_desc);
519 tlog(2, "query_desc: '%s' Selected: '%s'\n", query_desc.c_str(), selected_desc_str);
520 if (!equal && best_match != nullptr) {
521 *best_match = selected_desc_str;
522 // Clip the ending ' 0' if there is one. It seems that, if there is no
523 // point size on the end of the fontname, then Pango always appends ' 0'.
524 auto len = best_match->size();
525 if (len > 2 && best_match->at(len - 1) == '0' && best_match->at(len - 2) == ' ') {
526 best_match->resize(len - 2);
527 }
528 }
529 g_free(selected_desc_str);
530 pango_font_description_free(selected_desc);
531 g_object_unref(selected_font);
532 pango_font_description_free(desc);
533 if (!equal)
534 tlog(4, "** Font '%s' failed pango_font_description_equal!\n", input_query_desc);
535 return equal;
536 }
537
// Returns true if `query` is exactly one of the family names "Sans", "Serif"
// or "Monospace". The comparison is case-sensitive.
static bool ShouldIgnoreFontFamilyName(const char *query) {
  static const char *const kIgnoredFamilyNames[] = {"Sans", "Serif", "Monospace"};
  for (const char *ignored_name : kIgnoredFamilyNames) {
    if (strcmp(ignored_name, query) == 0) {
      return true;
    }
  }
  return false;
}
548
549 // Outputs description names of available fonts.
550 /* static */
551 const std::vector<std::string> &FontUtils::ListAvailableFonts() {
552 if (!available_fonts_.empty()) {
553 return available_fonts_;
554 }
555
556 PangoFontFamily **families = nullptr;
557 int n_families = 0;
558 ListFontFamilies(&families, &n_families);
559 for (int i = 0; i < n_families; ++i) {
560 const char *family_name = pango_font_family_get_name(families[i]);
561 tlog(2, "Listing family %s\n", family_name);
562 if (ShouldIgnoreFontFamilyName(family_name)) {
563 continue;
564 }
565
566 int n_faces;
567 PangoFontFace **faces = nullptr;
568 pango_font_family_list_faces(families[i], &faces, &n_faces);
569 for (int j = 0; j < n_faces; ++j) {
570 PangoFontDescription *desc = pango_font_face_describe(faces[j]);
571 char *desc_str = pango_font_description_to_string(desc);
572 // "synthesized" font faces that are not truly loadable, so we skip it
573 if (!pango_font_face_is_synthesized(faces[j]) && IsAvailableFont(desc_str)) {
574 available_fonts_.emplace_back(desc_str);
575 }
576 pango_font_description_free(desc);
577 g_free(desc_str);
578 }
579 g_free(faces);
580 }
581 g_free(families);
582 std::sort(available_fonts_.begin(), available_fonts_.end());
583 return available_fonts_;
584 }
585
586 // Utilities written to be backward compatible with StringRender
587
588 /* static */
589 int FontUtils::FontScore(const std::unordered_map<char32, int64_t> &ch_map,
590 const std::string &fontname, int *raw_score, std::vector<bool> *ch_flags) {
591 PangoFontInfo font_info;
592 if (!font_info.ParseFontDescriptionName(fontname)) {
593 tprintf("ERROR: Could not parse %s\n", fontname.c_str());
594 }
595 PangoFont *font = font_info.ToPangoFont();
596 PangoCoverage *coverage = nullptr;
597 if (font != nullptr) {
598 coverage = pango_font_get_coverage(font, nullptr);
599 }
600 if (ch_flags) {
601 ch_flags->clear();
602 ch_flags->reserve(ch_map.size());
603 }
604 *raw_score = 0;
605 int ok_chars = 0;
606 for (auto &&it : ch_map) {
607 bool covered =
608 (coverage != nullptr) && (IsWhitespace(it.first) ||
609 (pango_coverage_get(coverage, it.first) == PANGO_COVERAGE_EXACT));
610 if (covered) {
611 ++(*raw_score);
612 ok_chars += it.second;
613 }
614 if (ch_flags) {
615 ch_flags->push_back(covered);
616 }
617 }
618 #if PANGO_VERSION_CHECK(1, 52, 0)
619 g_object_unref(coverage);
620 #else
621 pango_coverage_unref(coverage);
622 #endif
623 g_object_unref(font);
624 return ok_chars;
625 }
626
627 /* static */
628 std::string FontUtils::BestFonts(const std::unordered_map<char32, int64_t> &ch_map,
629 std::vector<std::pair<const char *, std::vector<bool>>> *fonts) {
630 const double kMinOKFraction = 0.99;
631 // Weighted fraction of characters that must be renderable in a font to make
632 // it OK even if the raw count is not good.
633 const double kMinWeightedFraction = 0.99995;
634
635 fonts->clear();
636 std::vector<std::vector<bool>> font_flags;
637 std::vector<int> font_scores;
638 std::vector<int> raw_scores;
639 int most_ok_chars = 0;
640 int best_raw_score = 0;
641 const std::vector<std::string> &font_names = FontUtils::ListAvailableFonts();
642 for (const auto &font_name : font_names) {
643 std::vector<bool> ch_flags;
644 int raw_score = 0;
645 int ok_chars = FontScore(ch_map, font_name, &raw_score, &ch_flags);
646 most_ok_chars = std::max(ok_chars, most_ok_chars);
647 best_raw_score = std::max(raw_score, best_raw_score);
648
649 font_flags.push_back(ch_flags);
650 font_scores.push_back(ok_chars);
651 raw_scores.push_back(raw_score);
652 }
653
654 // Now select the fonts with a score above a threshold fraction
655 // of both the raw and weighted best scores. To prevent bogus fonts being
656 // selected for CJK, we require a high fraction (kMinOKFraction = 0.99) of
657 // BOTH weighted and raw scores.
658 // In low character-count scripts, the issue is more getting enough fonts,
659 // when only 1 or 2 might have all those rare dingbats etc in them, so we
660 // allow a font with a very high weighted (coverage) score
661 // (kMinWeightedFraction = 0.99995) to be used even if its raw score is poor.
662 int least_good_enough = static_cast<int>(most_ok_chars * kMinOKFraction);
663 int least_raw_enough = static_cast<int>(best_raw_score * kMinOKFraction);
664 int override_enough = static_cast<int>(most_ok_chars * kMinWeightedFraction);
665
666 std::string font_list;
667 for (unsigned i = 0; i < font_names.size(); ++i) {
668 int score = font_scores[i];
669 int raw_score = raw_scores[i];
670 if ((score >= least_good_enough && raw_score >= least_raw_enough) || score >= override_enough) {
671 fonts->push_back(std::make_pair(font_names[i].c_str(), font_flags[i]));
672 tlog(1, "OK font %s = %.4f%%, raw = %d = %.2f%%\n", font_names[i].c_str(),
673 100.0 * score / most_ok_chars, raw_score, 100.0 * raw_score / best_raw_score);
674 font_list += font_names[i];
675 font_list += "\n";
676 } else if (score >= least_good_enough || raw_score >= least_raw_enough) {
677 tlog(1, "Runner-up font %s = %.4f%%, raw = %d = %.2f%%\n", font_names[i].c_str(),
678 100.0 * score / most_ok_chars, raw_score, 100.0 * raw_score / best_raw_score);
679 }
680 }
681 return font_list;
682 }
683
684 /* static */
685 bool FontUtils::SelectFont(const char *utf8_word, const int utf8_len, std::string *font_name,
686 std::vector<std::string> *graphemes) {
687 return SelectFont(utf8_word, utf8_len, ListAvailableFonts(), font_name, graphemes);
688 }
689
690 /* static */
691 bool FontUtils::SelectFont(const char *utf8_word, const int utf8_len,
692 const std::vector<std::string> &all_fonts, std::string *font_name,
693 std::vector<std::string> *graphemes) {
694 if (font_name) {
695 font_name->clear();
696 }
697 if (graphemes) {
698 graphemes->clear();
699 }
700 for (const auto &all_font : all_fonts) {
701 PangoFontInfo font;
702 std::vector<std::string> found_graphemes;
703 ASSERT_HOST_MSG(font.ParseFontDescriptionName(all_font), "Could not parse font desc name %s\n",
704 all_font.c_str());
705 if (font.CanRenderString(utf8_word, utf8_len, &found_graphemes)) {
706 if (graphemes) {
707 graphemes->swap(found_graphemes);
708 }
709 if (font_name) {
710 *font_name = all_font;
711 }
712 return true;
713 }
714 }
715 return false;
716 }
717
718 // PangoFontInfo is reinitialized, so clear the static list of fonts.
719 /* static */
720 void FontUtils::ReInit() {
721 available_fonts_.clear();
722 }
723
724 // Print info about used font backend
725 /* static */
726 void FontUtils::PangoFontTypeInfo() {
727 PangoFontMap *font_map = pango_cairo_font_map_get_default();
728 if (pango_cairo_font_map_get_font_type(reinterpret_cast<PangoCairoFontMap *>(font_map)) ==
729 CAIRO_FONT_TYPE_TOY) {
730 printf("Using CAIRO_FONT_TYPE_TOY.\n");
731 } else if (pango_cairo_font_map_get_font_type(reinterpret_cast<PangoCairoFontMap *>(font_map)) ==
732 CAIRO_FONT_TYPE_FT) {
733 printf("Using CAIRO_FONT_TYPE_FT.\n");
734 } else if (pango_cairo_font_map_get_font_type(reinterpret_cast<PangoCairoFontMap *>(font_map)) ==
735 CAIRO_FONT_TYPE_WIN32) {
736 printf("Using CAIRO_FONT_TYPE_WIN32.\n");
737 } else if (pango_cairo_font_map_get_font_type(reinterpret_cast<PangoCairoFontMap *>(font_map)) ==
738 CAIRO_FONT_TYPE_QUARTZ) {
739 printf("Using CAIRO_FONT_TYPE_QUARTZ.\n");
740 } else if (pango_cairo_font_map_get_font_type(reinterpret_cast<PangoCairoFontMap *>(font_map)) ==
741 CAIRO_FONT_TYPE_USER) {
742 printf("Using CAIRO_FONT_TYPE_USER.\n");
743 } else if (!font_map) {
744 printf("Cannot create pango cairo font map!\n");
745 }
746 }
747
748 } // namespace tesseract