Mercurial > hgrepos > Python2 > PyMuPDF
comparison mupdf-source/thirdparty/tesseract/src/training/text2image.cpp @ 2:b50eed0cc0ef upstream
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
The directory name has changed: no version number in the expanded directory now.
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Mon, 15 Sep 2025 11:43:07 +0200 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| 1:1d09e1dec1d9 | 2:b50eed0cc0ef |
|---|---|
| 1 /********************************************************************** | |
| 2 * File: text2image.cpp | |
| 3 * Description: Program to generate OCR training pages. Given a text file it | |
| 4 * outputs an image with a given font and degradation. | |
| 5 * | |
| 6 * Note that since the results depend on the fonts available on | |
| 7 * your system, running the code on a different machine, or | |
| 8 * different OS, or even at a different time on the same machine, | |
| 9 * may produce different fonts even if --font is given explicitly. | |
| 10 * To see names of available fonts, use --list_available_fonts with | |
| 11 * the appropriate --fonts_dir path. | |
| 12 * Specifying --use_only_legacy_fonts will restrict the available | |
| 13 * fonts to those listed in legacy_fonts.h | |
| 14 * Authors: Ranjith Unnikrishnan, Ray Smith | |
| 15 * | |
| 16 * (C) Copyright 2013, Google Inc. | |
| 17 * Licensed under the Apache License, Version 2.0 (the "License"); | |
| 18 * you may not use this file except in compliance with the License. | |
| 19 * You may obtain a copy of the License at | |
| 20 * http://www.apache.org/licenses/LICENSE-2.0 | |
| 21 * Unless required by applicable law or agreed to in writing, software | |
| 22 * distributed under the License is distributed on an "AS IS" BASIS, | |
| 23 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
| 24 * See the License for the specific language governing permissions and | |
| 25 * limitations under the License. | |
| 26 * | |
| 27 **********************************************************************/ | |
| 28 | |
| 29 #include "boxchar.h" | |
| 30 #include "commandlineflags.h" | |
| 31 #include "commontraining.h" // CheckSharedLibraryVersion | |
| 32 #include "degradeimage.h" | |
| 33 #include "errcode.h" | |
| 34 #include "fileio.h" | |
| 35 #include "helpers.h" | |
| 36 #include "normstrngs.h" | |
| 37 #include "stringrenderer.h" | |
| 38 #include "tlog.h" | |
| 39 #include "unicharset.h" | |
| 40 | |
| 41 #include <allheaders.h> // from leptonica | |
| 42 | |
| 43 #include <algorithm> | |
| 44 #include <cstdlib> | |
| 45 #include <cstring> | |
| 46 #include <iostream> | |
| 47 #include <map> | |
| 48 #include <random> | |
| 49 #include <string> | |
| 50 #include <utility> | |
| 51 #include <vector> | |
| 52 | |
| 53 #ifdef _MSC_VER | |
| 54 # define putenv(s) _putenv(s) | |
| 55 #endif | |
| 56 | |
| 57 using namespace tesseract; | |
| 58 | |
| 59 // A number with which to initialize the random number generator. | |
| 60 const int kRandomSeed = 0x18273645; | |
| 61 | |
| 62 // The text input file. | |
| 63 static STRING_PARAM_FLAG(text, "", "File name of text input to process"); | |
| 64 | |
| 65 // The text output file. | |
| 66 static STRING_PARAM_FLAG(outputbase, "", "Basename for output image/box file"); | |
| 67 | |
| 68 // Degrade the rendered image to mimic scanner quality. | |
| 69 static BOOL_PARAM_FLAG(degrade_image, true, | |
| 70 "Degrade rendered image with speckle noise, dilation/erosion " | |
| 71 "and rotation"); | |
| 72 | |
| 73 // Rotate the rendered image to have more realistic glyph borders | |
| 74 static BOOL_PARAM_FLAG(rotate_image, true, "Rotate the image in a random way."); | |
| 75 | |
| 76 // Degradation to apply to the image. | |
| 77 static INT_PARAM_FLAG(exposure, 0, "Exposure level in photocopier"); | |
| 78 | |
| 79 // Distort the rendered image by various means according to the bool flags. | |
| 80 static BOOL_PARAM_FLAG(distort_image, false, "Degrade rendered image with noise, blur, invert."); | |
| 81 | |
| 82 // Distortion to apply to the image. | |
| 83 static BOOL_PARAM_FLAG(invert, true, "Invert the image"); | |
| 84 | |
| 85 // Distortion to apply to the image. | |
| 86 static BOOL_PARAM_FLAG(white_noise, true, "Add Gaussian Noise"); | |
| 87 | |
| 88 // Distortion to apply to the image. | |
| 89 static BOOL_PARAM_FLAG(smooth_noise, true, "Smoothen Noise"); | |
| 90 | |
| 91 // Distortion to apply to the image. | |
| 92 static BOOL_PARAM_FLAG(blur, true, "Blur the image"); | |
| 93 | |
| 94 #if 0 | |
| 95 | |
| 96 // Distortion to apply to the image. | |
| 97 static BOOL_PARAM_FLAG(perspective, false, "Generate Perspective Distortion"); | |
| 98 | |
| 99 // Distortion to apply to the image. | |
| 100 static INT_PARAM_FLAG(box_reduction, 0, "Integer reduction factor box_scale"); | |
| 101 | |
| 102 #endif | |
| 103 | |
| 104 // Output image resolution. | |
| 105 static INT_PARAM_FLAG(resolution, 300, "Pixels per inch"); | |
| 106 | |
| 107 // Width of output image (in pixels). | |
| 108 static INT_PARAM_FLAG(xsize, 3600, "Width of output image"); | |
| 109 | |
| 110 // Max height of output image (in pixels). | |
| 111 static INT_PARAM_FLAG(ysize, 4800, "Height of output image"); | |
| 112 | |
| 113 // Max number of pages to produce. | |
| 114 static INT_PARAM_FLAG(max_pages, 0, "Maximum number of pages to output (0=unlimited)"); | |
| 115 | |
| 116 // Margin around text (in pixels). | |
| 117 static INT_PARAM_FLAG(margin, 100, "Margin round edges of image"); | |
| 118 | |
| 119 // Size of text (in points). | |
| 120 static INT_PARAM_FLAG(ptsize, 12, "Size of printed text"); | |
| 121 | |
| 122 // Inter-character space (in ems). | |
| 123 static DOUBLE_PARAM_FLAG(char_spacing, 0, "Inter-character space in ems"); | |
| 124 | |
| 125 // Sets the probability (value in [0, 1]) of starting to render a word with an | |
| 126 // underline. Words are assumed to be space-delimited. | |
| 127 static DOUBLE_PARAM_FLAG(underline_start_prob, 0, | |
| 128 "Fraction of words to underline (value in [0,1])"); | |
| 129 // Set the probability (value in [0, 1]) of continuing a started underline to | |
| 130 // the next word. | |
| 131 static DOUBLE_PARAM_FLAG(underline_continuation_prob, 0, | |
| 132 "Fraction of words to underline (value in [0,1])"); | |
| 133 | |
| 134 // Inter-line space (in pixels). | |
| 135 static INT_PARAM_FLAG(leading, 12, "Inter-line space (in pixels)"); | |
| 136 | |
| 137 // Layout and glyph orientation on rendering. | |
| 138 static STRING_PARAM_FLAG(writing_mode, "horizontal", | |
| 139 "Specify one of the following writing" | |
| 140 " modes.\n" | |
| 141 "'horizontal' : Render regular horizontal text. (default)\n" | |
| 142 "'vertical' : Render vertical text. Glyph orientation is" | |
| 143 " selected by Pango.\n" | |
| 144 "'vertical-upright' : Render vertical text. Glyph " | |
| 145 " orientation is set to be upright."); | |
| 146 | |
| 147 static INT_PARAM_FLAG(box_padding, 0, "Padding around produced bounding boxes"); | |
| 148 | |
| 149 static BOOL_PARAM_FLAG(strip_unrenderable_words, true, | |
| 150 "Remove unrenderable words from source text"); | |
| 151 | |
| 152 // Font name. | |
| 153 static STRING_PARAM_FLAG(font, "Arial", "Font description name to use"); | |
| 154 | |
| 155 static BOOL_PARAM_FLAG(ligatures, false, "Rebuild and render ligatures"); | |
| 156 | |
| 157 static BOOL_PARAM_FLAG(find_fonts, false, "Search for all fonts that can render the text"); | |
| 158 static BOOL_PARAM_FLAG(render_per_font, true, | |
| 159 "If find_fonts==true, render each font to its own image. " | |
| 160 "Image filenames are of the form output_name.font_name.tif"); | |
| 161 static DOUBLE_PARAM_FLAG(min_coverage, 1.0, | |
| 162 "If find_fonts==true, the minimum coverage the font has of " | |
| 163 "the characters in the text file to include it, between " | |
| 164 "0 and 1."); | |
| 165 | |
| 166 static BOOL_PARAM_FLAG(list_available_fonts, false, "List available fonts and quit."); | |
| 167 | |
| 168 static BOOL_PARAM_FLAG(render_ngrams, false, | |
| 169 "Put each space-separated entity from the" | |
| 170 " input file into one bounding box. The ngrams in the input" | |
| 171 " file will be randomly permuted before rendering (so that" | |
| 172 " there is sufficient variety of characters on each line)."); | |
| 173 | |
| 174 static BOOL_PARAM_FLAG(output_word_boxes, false, | |
| 175 "Output word bounding boxes instead of character boxes. " | |
| 176 "This is used for Cube training, and implied by " | |
| 177 "--render_ngrams."); | |
| 178 | |
| 179 static STRING_PARAM_FLAG(unicharset_file, "", | |
| 180 "File with characters in the unicharset. If --render_ngrams" | |
| 181 " is true and --unicharset_file is specified, ngrams with" | |
| 182 " characters that are not in unicharset will be omitted"); | |
| 183 | |
| 184 static BOOL_PARAM_FLAG(bidirectional_rotation, false, "Rotate the generated characters both ways."); | |
| 185 | |
| 186 static BOOL_PARAM_FLAG(only_extract_font_properties, false, | |
| 187 "Assumes that the input file contains a list of ngrams. Renders" | |
| 188 " each ngram, extracts spacing properties and records them in" | |
| 189 " output_base/[font_name].fontinfo file."); | |
| 190 | |
| 191 // Use these flags to output zero-padded, square individual character images | |
| 192 static BOOL_PARAM_FLAG(output_individual_glyph_images, false, | |
| 193 "If true also outputs individual character images"); | |
| 194 static INT_PARAM_FLAG(glyph_resized_size, 0, | |
| 195 "Each glyph is square with this side length in pixels"); | |
| 196 static INT_PARAM_FLAG(glyph_num_border_pixels_to_pad, 0, | |
| 197 "Final_size=glyph_resized_size+2*glyph_num_border_pixels_to_pad"); | |
| 198 | |
| 199 namespace tesseract { | |
| 200 | |
// Horizontal spacing metrics recorded for one unichar, plus the measured gap
// to each following unichar whose rendered gap differed from the plain sum of
// the two glyphs' side bearings (i.e. kerned pairs).
struct SpacingProperties {
  SpacingProperties() = default;
  SpacingProperties(int b, int a) : x_gap_before(b), x_gap_after(a) {}

  // These values are obtained from FT_Glyph_Metrics struct
  // used by the FreeType font engine.
  int x_gap_before = 0; // horizontal x bearing
  int x_gap_after = 0;  // horizontal advance - x_gap_before - width
  // Keyed by the following unichar; value is the rendered gap in pixels.
  std::map<std::string, int> kerned_x_gaps;
};
| 210 | |
| 211 static bool IsWhitespaceBox(const BoxChar *boxchar) { | |
| 212 return (boxchar->box() == nullptr || SpanUTF8Whitespace(boxchar->ch().c_str())); | |
| 213 } | |
| 214 | |
// Returns a copy of `in` with every non-overlapping occurrence of `oldsub`
// (scanned left to right) replaced by `newsub`.
// An empty `oldsub` matches nothing, so `in` is returned unchanged; without
// this guard find() would keep matching at the same position forever.
static std::string StringReplace(const std::string &in, const std::string &oldsub,
                                 const std::string &newsub) {
  if (oldsub.empty()) {
    return in;
  }
  std::string out;
  size_t start_pos = 0, pos;
  while ((pos = in.find(oldsub, start_pos)) != std::string::npos) {
    // Copy the untouched text before the match, then the replacement.
    out.append(in, start_pos, pos - start_pos);
    out.append(newsub);
    start_pos = pos + oldsub.length();
  }
  // Copy whatever follows the last match (the whole input if none matched).
  out.append(in, start_pos, std::string::npos);
  return out;
}
| 227 | |
| 228 // Assumes that each word (whitespace-separated entity) in text is a bigram. | |
| 229 // Renders the bigrams and calls FontInfo::GetSpacingProperties() to | |
| 230 // obtain spacing information. Produces the output .fontinfo file with a line | |
| 231 // per unichar of the form: | |
| 232 // unichar space_before space_after kerned1 kerned_space1 kerned2 ... | |
| 233 // Fox example, if unichar "A" has spacing of 0 pixels before and -1 pixels | |
| 234 // after, is kerned with "V" resulting in spacing of "AV" to be -7 and kerned | |
| 235 // with "T", such that "AT" has spacing of -5, the entry/line for unichar "A" | |
| 236 // in .fontinfo file will be: | |
| 237 // A 0 -1 T -5 V -7 | |
| 238 static void ExtractFontProperties(const std::string &utf8_text, StringRenderer *render, | |
| 239 const std::string &output_base) { | |
| 240 std::map<std::string, SpacingProperties> spacing_map; | |
| 241 std::map<std::string, SpacingProperties>::iterator spacing_map_it0; | |
| 242 std::map<std::string, SpacingProperties>::iterator spacing_map_it1; | |
| 243 int x_bearing, x_advance; | |
| 244 int len = utf8_text.length(); | |
| 245 int offset = 0; | |
| 246 const char *text = utf8_text.c_str(); | |
| 247 while (offset < len) { | |
| 248 offset += render->RenderToImage(text + offset, strlen(text + offset), nullptr); | |
| 249 const std::vector<BoxChar *> &boxes = render->GetBoxes(); | |
| 250 | |
| 251 // If the page break split a bigram, correct the offset so we try the bigram | |
| 252 // on the next iteration. | |
| 253 if (boxes.size() > 2 && !IsWhitespaceBox(boxes[boxes.size() - 1]) && | |
| 254 IsWhitespaceBox(boxes[boxes.size() - 2])) { | |
| 255 if (boxes.size() > 3) { | |
| 256 tprintf("WARNING: Adjusting to bad page break after '%s%s'\n", | |
| 257 boxes[boxes.size() - 4]->ch().c_str(), boxes[boxes.size() - 3]->ch().c_str()); | |
| 258 } | |
| 259 offset -= boxes[boxes.size() - 1]->ch().size(); | |
| 260 } | |
| 261 | |
| 262 for (size_t b = 0; b < boxes.size(); b += 2) { | |
| 263 while (b < boxes.size() && IsWhitespaceBox(boxes[b])) { | |
| 264 ++b; | |
| 265 } | |
| 266 if (b + 1 >= boxes.size()) { | |
| 267 break; | |
| 268 } | |
| 269 const std::string &ch0 = boxes[b]->ch(); | |
| 270 // We encountered a ligature. This happens in at least two scenarios: | |
| 271 // One is when the rendered bigram forms a grapheme cluster (eg. the | |
| 272 // second character in the bigram is a combining vowel), in which case we | |
| 273 // correctly output only one bounding box. | |
| 274 // A second far less frequent case is when caused some fonts like 'DejaVu | |
| 275 // Sans Ultra-Light' force Pango to render a ligatured character even if | |
| 276 // the input consists of the separated characters. NOTE(ranjith): As per | |
| 277 // behdad@ this is not currently controllable at the level of the Pango | |
| 278 // API. | |
| 279 // The most frequent of all is a single character "word" made by the CJK | |
| 280 // segmenter. | |
| 281 // Safeguard against these cases here by just skipping the bigram. | |
| 282 if (IsWhitespaceBox(boxes[b + 1])) { | |
| 283 continue; | |
| 284 } | |
| 285 int xgap = (boxes[b + 1]->box()->x - (boxes[b]->box()->x + boxes[b]->box()->w)); | |
| 286 spacing_map_it0 = spacing_map.find(ch0); | |
| 287 int ok_count = 0; | |
| 288 if (spacing_map_it0 == spacing_map.end() && | |
| 289 render->font().GetSpacingProperties(ch0, &x_bearing, &x_advance)) { | |
| 290 spacing_map[ch0] = SpacingProperties(x_bearing, x_advance - x_bearing - boxes[b]->box()->w); | |
| 291 spacing_map_it0 = spacing_map.find(ch0); | |
| 292 ++ok_count; | |
| 293 } | |
| 294 const std::string &ch1 = boxes[b + 1]->ch(); | |
| 295 tlog(3, "%s%s\n", ch0.c_str(), ch1.c_str()); | |
| 296 spacing_map_it1 = spacing_map.find(ch1); | |
| 297 if (spacing_map_it1 == spacing_map.end() && | |
| 298 render->font().GetSpacingProperties(ch1, &x_bearing, &x_advance)) { | |
| 299 spacing_map[ch1] = | |
| 300 SpacingProperties(x_bearing, x_advance - x_bearing - boxes[b + 1]->box()->w); | |
| 301 spacing_map_it1 = spacing_map.find(ch1); | |
| 302 ++ok_count; | |
| 303 } | |
| 304 if (ok_count == 2 && | |
| 305 xgap != (spacing_map_it0->second.x_gap_after + spacing_map_it1->second.x_gap_before)) { | |
| 306 spacing_map_it0->second.kerned_x_gaps[ch1] = xgap; | |
| 307 } | |
| 308 } | |
| 309 render->ClearBoxes(); | |
| 310 } | |
| 311 std::string output_string; | |
| 312 const int kBufSize = 1024; | |
| 313 char buf[kBufSize]; | |
| 314 snprintf(buf, kBufSize, "%d\n", static_cast<int>(spacing_map.size())); | |
| 315 output_string.append(buf); | |
| 316 std::map<std::string, SpacingProperties>::const_iterator spacing_map_it; | |
| 317 for (spacing_map_it = spacing_map.begin(); spacing_map_it != spacing_map.end(); | |
| 318 ++spacing_map_it) { | |
| 319 snprintf(buf, kBufSize, "%s %d %d %d", spacing_map_it->first.c_str(), | |
| 320 spacing_map_it->second.x_gap_before, spacing_map_it->second.x_gap_after, | |
| 321 static_cast<int>(spacing_map_it->second.kerned_x_gaps.size())); | |
| 322 output_string.append(buf); | |
| 323 std::map<std::string, int>::const_iterator kern_it; | |
| 324 for (kern_it = spacing_map_it->second.kerned_x_gaps.begin(); | |
| 325 kern_it != spacing_map_it->second.kerned_x_gaps.end(); ++kern_it) { | |
| 326 snprintf(buf, kBufSize, " %s %d", kern_it->first.c_str(), kern_it->second); | |
| 327 output_string.append(buf); | |
| 328 } | |
| 329 output_string.append("\n"); | |
| 330 } | |
| 331 File::WriteStringToFileOrDie(output_string, output_base + ".fontinfo"); | |
| 332 } | |
| 333 | |
| 334 static bool MakeIndividualGlyphs(Image pix, const std::vector<BoxChar *> &vbox, | |
| 335 const int input_tiff_page) { | |
| 336 // If checks fail, return false without exiting text2image | |
| 337 if (!pix) { | |
| 338 tprintf("ERROR: MakeIndividualGlyphs(): Input Pix* is nullptr\n"); | |
| 339 return false; | |
| 340 } else if (FLAGS_glyph_resized_size <= 0) { | |
| 341 tprintf("ERROR: --glyph_resized_size must be positive\n"); | |
| 342 return false; | |
| 343 } else if (FLAGS_glyph_num_border_pixels_to_pad < 0) { | |
| 344 tprintf("ERROR: --glyph_num_border_pixels_to_pad must be 0 or positive\n"); | |
| 345 return false; | |
| 346 } | |
| 347 | |
| 348 const int n_boxes = vbox.size(); | |
| 349 int n_boxes_saved = 0; | |
| 350 int current_tiff_page = 0; | |
| 351 int y_previous = 0; | |
| 352 static int glyph_count = 0; | |
| 353 for (int i = 0; i < n_boxes; i++) { | |
| 354 // Get one bounding box | |
| 355 Box *b = vbox[i]->mutable_box(); | |
| 356 if (!b) { | |
| 357 continue; | |
| 358 } | |
| 359 const int x = b->x; | |
| 360 const int y = b->y; | |
| 361 const int w = b->w; | |
| 362 const int h = b->h; | |
| 363 // Check present tiff page (for multipage tiff) | |
| 364 if (y < y_previous - pixGetHeight(pix) / 10) { | |
| 365 tprintf("ERROR: Wrap-around encountered, at i=%d\n", i); | |
| 366 current_tiff_page++; | |
| 367 } | |
| 368 if (current_tiff_page < input_tiff_page) { | |
| 369 continue; | |
| 370 } else if (current_tiff_page > input_tiff_page) { | |
| 371 break; | |
| 372 } | |
| 373 // Check box validity | |
| 374 if (x < 0 || y < 0 || (x + w - 1) >= pixGetWidth(pix) || (y + h - 1) >= pixGetHeight(pix)) { | |
| 375 tprintf( | |
| 376 "ERROR: MakeIndividualGlyphs(): Index out of range, at i=%d" | |
| 377 " (x=%d, y=%d, w=%d, h=%d\n)", | |
| 378 i, x, y, w, h); | |
| 379 continue; | |
| 380 } else if (w < FLAGS_glyph_num_border_pixels_to_pad && | |
| 381 h < FLAGS_glyph_num_border_pixels_to_pad) { | |
| 382 tprintf("ERROR: Input image too small to be a character, at i=%d\n", i); | |
| 383 continue; | |
| 384 } | |
| 385 // Crop the boxed character | |
| 386 Image pix_glyph = pixClipRectangle(pix, b, nullptr); | |
| 387 if (!pix_glyph) { | |
| 388 tprintf("ERROR: MakeIndividualGlyphs(): Failed to clip, at i=%d\n", i); | |
| 389 continue; | |
| 390 } | |
| 391 // Resize to square | |
| 392 Image pix_glyph_sq = | |
| 393 pixScaleToSize(pix_glyph, FLAGS_glyph_resized_size, FLAGS_glyph_resized_size); | |
| 394 if (!pix_glyph_sq) { | |
| 395 tprintf("ERROR: MakeIndividualGlyphs(): Failed to resize, at i=%d\n", i); | |
| 396 continue; | |
| 397 } | |
| 398 // Zero-pad | |
| 399 Image pix_glyph_sq_pad = pixAddBorder(pix_glyph_sq, FLAGS_glyph_num_border_pixels_to_pad, 0); | |
| 400 if (!pix_glyph_sq_pad) { | |
| 401 tprintf("ERROR: MakeIndividualGlyphs(): Failed to zero-pad, at i=%d\n", i); | |
| 402 continue; | |
| 403 } | |
| 404 // Write out | |
| 405 Image pix_glyph_sq_pad_8 = pixConvertTo8(pix_glyph_sq_pad, false); | |
| 406 char filename[1024]; | |
| 407 snprintf(filename, 1024, "%s_%d.jpg", FLAGS_outputbase.c_str(), glyph_count++); | |
| 408 if (pixWriteJpeg(filename, pix_glyph_sq_pad_8, 100, 0)) { | |
| 409 tprintf( | |
| 410 "ERROR: MakeIndividualGlyphs(): Failed to write JPEG to %s," | |
| 411 " at i=%d\n", | |
| 412 filename, i); | |
| 413 continue; | |
| 414 } | |
| 415 | |
| 416 pix_glyph.destroy(); | |
| 417 pix_glyph_sq.destroy(); | |
| 418 pix_glyph_sq_pad.destroy(); | |
| 419 pix_glyph_sq_pad_8.destroy(); | |
| 420 n_boxes_saved++; | |
| 421 y_previous = y; | |
| 422 } | |
| 423 if (n_boxes_saved == 0) { | |
| 424 return false; | |
| 425 } else { | |
| 426 tprintf("Total number of characters saved = %d\n", n_boxes_saved); | |
| 427 return true; | |
| 428 } | |
| 429 } | |
| 430 } // namespace tesseract | |
| 431 | |
| 432 using tesseract::DegradeImage; | |
| 433 using tesseract::ExtractFontProperties; | |
| 434 using tesseract::File; | |
| 435 using tesseract::FontUtils; | |
| 436 using tesseract::SpanUTF8NotWhitespace; | |
| 437 using tesseract::SpanUTF8Whitespace; | |
| 438 using tesseract::StringRenderer; | |
| 439 | |
| 440 static int Main() { | |
| 441 if (FLAGS_list_available_fonts) { | |
| 442 const std::vector<std::string> &all_fonts = FontUtils::ListAvailableFonts(); | |
| 443 for (unsigned int i = 0; i < all_fonts.size(); ++i) { | |
| 444 // Remove trailing comma: pango-font-description-to-string adds a comma | |
| 445 // to some fonts. | |
| 446 // See https://github.com/tesseract-ocr/tesseract/issues/408 | |
| 447 std::string font_name(all_fonts[i].c_str()); | |
| 448 if (font_name.back() == ',') { | |
| 449 font_name.pop_back(); | |
| 450 } | |
| 451 printf("%3u: %s\n", i, font_name.c_str()); | |
| 452 ASSERT_HOST_MSG(FontUtils::IsAvailableFont(all_fonts[i].c_str()), | |
| 453 "Font %s is unrecognized.\n", all_fonts[i].c_str()); | |
| 454 } | |
| 455 return EXIT_SUCCESS; | |
| 456 } | |
| 457 | |
| 458 // Check validity of input flags. | |
| 459 if (FLAGS_text.empty()) { | |
| 460 tprintf("'--text' option is missing!\n"); | |
| 461 return EXIT_FAILURE; | |
| 462 } | |
| 463 if (FLAGS_outputbase.empty()) { | |
| 464 tprintf("'--outputbase' option is missing!\n"); | |
| 465 return EXIT_FAILURE; | |
| 466 } | |
| 467 if (!FLAGS_unicharset_file.empty() && FLAGS_render_ngrams) { | |
| 468 tprintf("Use '--unicharset_file' only if '--render_ngrams' is set.\n"); | |
| 469 return EXIT_FAILURE; | |
| 470 } | |
| 471 | |
| 472 std::string font_name = FLAGS_font.c_str(); | |
| 473 if (!FLAGS_find_fonts && !FontUtils::IsAvailableFont(font_name.c_str())) { | |
| 474 font_name += ','; | |
| 475 std::string pango_name; | |
| 476 if (!FontUtils::IsAvailableFont(font_name.c_str(), &pango_name)) { | |
| 477 tprintf("Could not find font named '%s'.\n", FLAGS_font.c_str()); | |
| 478 if (!pango_name.empty()) { | |
| 479 tprintf("Pango suggested font '%s'.\n", pango_name.c_str()); | |
| 480 } | |
| 481 tprintf("Please correct --font arg.\n"); | |
| 482 return EXIT_FAILURE; | |
| 483 } | |
| 484 } | |
| 485 | |
| 486 if (FLAGS_render_ngrams) { | |
| 487 FLAGS_output_word_boxes = true; | |
| 488 } | |
| 489 | |
| 490 char font_desc_name[1024]; | |
| 491 snprintf(font_desc_name, 1024, "%s %d", font_name.c_str(), static_cast<int>(FLAGS_ptsize)); | |
| 492 | |
| 493 StringRenderer render(font_desc_name, FLAGS_xsize, FLAGS_ysize); | |
| 494 render.set_add_ligatures(FLAGS_ligatures); | |
| 495 render.set_leading(FLAGS_leading); | |
| 496 render.set_resolution(FLAGS_resolution); | |
| 497 render.set_char_spacing(FLAGS_char_spacing * FLAGS_ptsize); | |
| 498 render.set_h_margin(FLAGS_margin); | |
| 499 render.set_v_margin(FLAGS_margin); | |
| 500 render.set_output_word_boxes(FLAGS_output_word_boxes); | |
| 501 render.set_box_padding(FLAGS_box_padding); | |
| 502 render.set_strip_unrenderable_words(FLAGS_strip_unrenderable_words); | |
| 503 render.set_underline_start_prob(FLAGS_underline_start_prob); | |
| 504 render.set_underline_continuation_prob(FLAGS_underline_continuation_prob); | |
| 505 | |
| 506 // Set text rendering orientation and their forms. | |
| 507 if (FLAGS_writing_mode == "horizontal") { | |
| 508 // Render regular horizontal text (default). | |
| 509 render.set_vertical_text(false); | |
| 510 render.set_gravity_hint_strong(false); | |
| 511 render.set_render_fullwidth_latin(false); | |
| 512 } else if (FLAGS_writing_mode == "vertical") { | |
| 513 // Render vertical text. Glyph orientation is selected by Pango. | |
| 514 render.set_vertical_text(true); | |
| 515 render.set_gravity_hint_strong(false); | |
| 516 render.set_render_fullwidth_latin(false); | |
| 517 } else if (FLAGS_writing_mode == "vertical-upright") { | |
| 518 // Render vertical text. Glyph orientation is set to be upright. | |
| 519 // Also Basic Latin characters are converted to their fullwidth forms | |
| 520 // on rendering, since fullwidth Latin characters are well designed to fit | |
| 521 // vertical text lines, while .box files store halfwidth Basic Latin | |
| 522 // unichars. | |
| 523 render.set_vertical_text(true); | |
| 524 render.set_gravity_hint_strong(true); | |
| 525 render.set_render_fullwidth_latin(true); | |
| 526 } else { | |
| 527 tprintf("Invalid writing mode: %s\n", FLAGS_writing_mode.c_str()); | |
| 528 return EXIT_FAILURE; | |
| 529 } | |
| 530 | |
| 531 std::string src_utf8; | |
| 532 // This c_str is NOT redundant! | |
| 533 if (!File::ReadFileToString(FLAGS_text.c_str(), &src_utf8)) { | |
| 534 tprintf("Failed to read file: %s\n", FLAGS_text.c_str()); | |
| 535 return EXIT_FAILURE; | |
| 536 } | |
| 537 | |
| 538 // Remove the unicode mark if present. | |
| 539 if (strncmp(src_utf8.c_str(), "\xef\xbb\xbf", 3) == 0) { | |
| 540 src_utf8.erase(0, 3); | |
| 541 } | |
| 542 tlog(1, "Render string of size %zu\n", src_utf8.length()); | |
| 543 | |
| 544 if (FLAGS_render_ngrams || FLAGS_only_extract_font_properties) { | |
| 545 // Try to preserve behavior of old text2image by expanding inter-word | |
| 546 // spaces by a factor of 4. | |
| 547 const std::string kSeparator = FLAGS_render_ngrams ? " " : " "; | |
| 548 // Also restrict the number of characters per line to try and avoid | |
| 549 // line-breaking in the middle of words like "-A", "R$" etc. which are | |
| 550 // otherwise allowed by the standard unicode line-breaking rules. | |
| 551 const unsigned int kCharsPerLine = (FLAGS_ptsize > 20) ? 50 : 100; | |
| 552 std::string rand_utf8; | |
| 553 UNICHARSET unicharset; | |
| 554 if (FLAGS_render_ngrams && !FLAGS_unicharset_file.empty() && | |
| 555 !unicharset.load_from_file(FLAGS_unicharset_file.c_str())) { | |
| 556 tprintf("Failed to load unicharset from file %s\n", FLAGS_unicharset_file.c_str()); | |
| 557 return EXIT_FAILURE; | |
| 558 } | |
| 559 | |
| 560 // If we are rendering ngrams that will be OCRed later, shuffle them so that | |
| 561 // tesseract does not have difficulties finding correct baseline, word | |
| 562 // spaces, etc. | |
| 563 const char *str8 = src_utf8.c_str(); | |
| 564 int len = src_utf8.length(); | |
| 565 int step; | |
| 566 std::vector<std::pair<int, int>> offsets; | |
| 567 int offset = SpanUTF8Whitespace(str8); | |
| 568 while (offset < len) { | |
| 569 step = SpanUTF8NotWhitespace(str8 + offset); | |
| 570 offsets.emplace_back(offset, step); | |
| 571 offset += step; | |
| 572 offset += SpanUTF8Whitespace(str8 + offset); | |
| 573 } | |
| 574 if (FLAGS_render_ngrams) { | |
| 575 std::seed_seq seed{kRandomSeed}; | |
| 576 std::mt19937 random_gen(seed); | |
| 577 std::shuffle(offsets.begin(), offsets.end(), random_gen); | |
| 578 } | |
| 579 | |
| 580 for (size_t i = 0, line = 1; i < offsets.size(); ++i) { | |
| 581 const char *curr_pos = str8 + offsets[i].first; | |
| 582 int ngram_len = offsets[i].second; | |
| 583 // Skip words that contain characters not in found in unicharset. | |
| 584 std::string cleaned = UNICHARSET::CleanupString(curr_pos, ngram_len); | |
| 585 if (!FLAGS_unicharset_file.empty() && | |
| 586 !unicharset.encodable_string(cleaned.c_str(), nullptr)) { | |
| 587 continue; | |
| 588 } | |
| 589 rand_utf8.append(curr_pos, ngram_len); | |
| 590 if (rand_utf8.length() > line * kCharsPerLine) { | |
| 591 rand_utf8.append(" \n"); | |
| 592 ++line; | |
| 593 if (line & 0x1) { | |
| 594 rand_utf8.append(kSeparator); | |
| 595 } | |
| 596 } else { | |
| 597 rand_utf8.append(kSeparator); | |
| 598 } | |
| 599 } | |
| 600 tlog(1, "Rendered ngram string of size %zu\n", rand_utf8.length()); | |
| 601 src_utf8.swap(rand_utf8); | |
| 602 } | |
| 603 if (FLAGS_only_extract_font_properties) { | |
| 604 tprintf("Extracting font properties only\n"); | |
| 605 ExtractFontProperties(src_utf8, &render, FLAGS_outputbase.c_str()); | |
| 606 tprintf("Done!\n"); | |
| 607 return EXIT_SUCCESS; | |
| 608 } | |
| 609 | |
| 610 int im = 0; | |
| 611 std::vector<float> page_rotation; | |
| 612 const char *to_render_utf8 = src_utf8.c_str(); | |
| 613 | |
| 614 tesseract::TRand randomizer; | |
| 615 randomizer.set_seed(kRandomSeed); | |
| 616 std::vector<std::string> font_names; | |
| 617 // We use a two pass mechanism to rotate images in both direction. | |
| 618 // The first pass(0) will rotate the images in random directions and | |
| 619 // the second pass(1) will mirror those rotations. | |
| 620 int num_pass = FLAGS_bidirectional_rotation ? 2 : 1; | |
| 621 for (int pass = 0; pass < num_pass; ++pass) { | |
| 622 int page_num = 0; | |
| 623 std::string font_used; | |
| 624 for (size_t offset = 0; | |
| 625 offset < strlen(to_render_utf8) && (FLAGS_max_pages == 0 || page_num < FLAGS_max_pages); | |
| 626 ++im, ++page_num) { | |
| 627 tlog(1, "Starting page %d\n", im); | |
| 628 Image pix = nullptr; | |
| 629 if (FLAGS_find_fonts) { | |
| 630 offset += render.RenderAllFontsToImage(FLAGS_min_coverage, to_render_utf8 + offset, | |
| 631 strlen(to_render_utf8 + offset), &font_used, &pix); | |
| 632 } else { | |
| 633 offset += | |
| 634 render.RenderToImage(to_render_utf8 + offset, strlen(to_render_utf8 + offset), &pix); | |
| 635 } | |
| 636 if (pix != nullptr) { | |
| 637 float rotation = 0; | |
| 638 if (pass == 1) { | |
| 639 // Pass 2, do mirror rotation. | |
| 640 rotation = -1 * page_rotation[page_num]; | |
| 641 } | |
| 642 if (FLAGS_degrade_image) { | |
| 643 pix = DegradeImage(pix, FLAGS_exposure, &randomizer, | |
| 644 FLAGS_rotate_image ? &rotation : nullptr); | |
| 645 } | |
| 646 if (FLAGS_distort_image) { | |
| 647 // TODO: perspective is set to false and box_reduction to 1. | |
| 648 pix = PrepareDistortedPix(pix, false, FLAGS_invert, FLAGS_white_noise, FLAGS_smooth_noise, | |
| 649 FLAGS_blur, 1, &randomizer, nullptr); | |
| 650 } | |
| 651 render.RotatePageBoxes(rotation); | |
| 652 | |
| 653 if (pass == 0) { | |
| 654 // Pass 1, rotate randomly and store the rotation.. | |
| 655 page_rotation.push_back(rotation); | |
| 656 } | |
| 657 | |
| 658 Image gray_pix = pixConvertTo8(pix, false); | |
| 659 pix.destroy(); | |
| 660 Image binary = pixThresholdToBinary(gray_pix, 128); | |
| 661 gray_pix.destroy(); | |
| 662 char tiff_name[1024]; | |
| 663 if (FLAGS_find_fonts) { | |
| 664 if (FLAGS_render_per_font) { | |
| 665 std::string fontname_for_file = tesseract::StringReplace(font_used, " ", "_"); | |
| 666 snprintf(tiff_name, 1024, "%s.%s.tif", FLAGS_outputbase.c_str(), | |
| 667 fontname_for_file.c_str()); | |
| 668 pixWriteTiff(tiff_name, binary, IFF_TIFF_G4, "w"); | |
| 669 tprintf("Rendered page %d to file %s\n", im, tiff_name); | |
| 670 } else { | |
| 671 font_names.push_back(font_used); | |
| 672 } | |
| 673 } else { | |
| 674 snprintf(tiff_name, 1024, "%s.tif", FLAGS_outputbase.c_str()); | |
| 675 pixWriteTiff(tiff_name, binary, IFF_TIFF_G4, im == 0 ? "w" : "a"); | |
| 676 tprintf("Rendered page %d to file %s\n", im, tiff_name); | |
| 677 } | |
| 678 // Make individual glyphs | |
| 679 if (FLAGS_output_individual_glyph_images) { | |
| 680 if (!MakeIndividualGlyphs(binary, render.GetBoxes(), im)) { | |
| 681 tprintf("ERROR: Individual glyphs not saved\n"); | |
| 682 } | |
| 683 } | |
| 684 binary.destroy(); | |
| 685 } | |
| 686 if (FLAGS_find_fonts && offset != 0) { | |
| 687 // We just want a list of names, or some sample images so we don't need | |
| 688 // to render more than the first page of the text. | |
| 689 break; | |
| 690 } | |
| 691 } | |
| 692 } | |
| 693 if (!FLAGS_find_fonts) { | |
| 694 std::string box_name = FLAGS_outputbase.c_str(); | |
| 695 box_name += ".box"; | |
| 696 render.WriteAllBoxes(box_name); | |
| 697 } else if (!FLAGS_render_per_font && !font_names.empty()) { | |
| 698 std::string filename = FLAGS_outputbase.c_str(); | |
| 699 filename += ".fontlist.txt"; | |
| 700 FILE *fp = fopen(filename.c_str(), "wb"); | |
| 701 if (fp == nullptr) { | |
| 702 tprintf("Failed to create output font list %s\n", filename.c_str()); | |
| 703 } else { | |
| 704 for (auto &font_name : font_names) { | |
| 705 fprintf(fp, "%s\n", font_name.c_str()); | |
| 706 } | |
| 707 fclose(fp); | |
| 708 } | |
| 709 } | |
| 710 | |
| 711 return EXIT_SUCCESS; | |
| 712 } | |
| 713 | |
| 714 int main(int argc, char **argv) { | |
| 715 // Respect environment variable. could be: | |
| 716 // fc (fontconfig), win32, and coretext | |
| 717 // If not set force fontconfig for Mac OS. | |
| 718 // See https://github.com/tesseract-ocr/tesseract/issues/736 | |
| 719 char *backend; | |
| 720 backend = getenv("PANGOCAIRO_BACKEND"); | |
| 721 if (backend == nullptr) { | |
| 722 static char envstring[] = "PANGOCAIRO_BACKEND=fc"; | |
| 723 putenv(envstring); | |
| 724 } else { | |
| 725 printf( | |
| 726 "Using '%s' as pango cairo backend based on environment " | |
| 727 "variable.\n", | |
| 728 backend); | |
| 729 } | |
| 730 tesseract::CheckSharedLibraryVersion(); | |
| 731 if (argc > 1) { | |
| 732 if ((strcmp(argv[1], "-v") == 0) || (strcmp(argv[1], "--version") == 0)) { | |
| 733 FontUtils::PangoFontTypeInfo(); | |
| 734 printf("Pango version: %s\n", pango_version_string()); | |
| 735 } | |
| 736 } | |
| 737 tesseract::ParseCommandLineFlags(argv[0], &argc, &argv, true); | |
| 738 return Main(); | |
| 739 } |
