Mercurial > hgrepos > Python2 > PyMuPDF
diff mupdf-source/thirdparty/tesseract/src/training/text2image.cpp @ 2:b50eed0cc0ef upstream
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
The directory name has changed: no version number in the expanded directory now.
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Mon, 15 Sep 2025 11:43:07 +0200 |
| parents | |
| children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/mupdf-source/thirdparty/tesseract/src/training/text2image.cpp Mon Sep 15 11:43:07 2025 +0200 @@ -0,0 +1,739 @@ +/********************************************************************** + * File: text2image.cpp + * Description: Program to generate OCR training pages. Given a text file it + * outputs an image with a given font and degradation. + * + * Note that since the results depend on the fonts available on + * your system, running the code on a different machine, or + * different OS, or even at a different time on the same machine, + * may produce different fonts even if --font is given explicitly. + * To see names of available fonts, use --list_available_fonts with + * the appropriate --fonts_dir path. + * Specifying --use_only_legacy_fonts will restrict the available + * fonts to those listed in legacy_fonts.h + * Authors: Ranjith Unnikrishnan, Ray Smith + * + * (C) Copyright 2013, Google Inc. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * http://www.apache.org/licenses/LICENSE-2.0 + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + **********************************************************************/ + +#include "boxchar.h" +#include "commandlineflags.h" +#include "commontraining.h" // CheckSharedLibraryVersion +#include "degradeimage.h" +#include "errcode.h" +#include "fileio.h" +#include "helpers.h" +#include "normstrngs.h" +#include "stringrenderer.h" +#include "tlog.h" +#include "unicharset.h" + +#include <allheaders.h> // from leptonica + +#include <algorithm> +#include <cstdlib> +#include <cstring> +#include <iostream> +#include <map> +#include <random> +#include <string> +#include <utility> +#include <vector> + +#ifdef _MSC_VER +# define putenv(s) _putenv(s) +#endif + +using namespace tesseract; + +// A number with which to initialize the random number generator. +const int kRandomSeed = 0x18273645; + +// The text input file. +static STRING_PARAM_FLAG(text, "", "File name of text input to process"); + +// The text output file. +static STRING_PARAM_FLAG(outputbase, "", "Basename for output image/box file"); + +// Degrade the rendered image to mimic scanner quality. +static BOOL_PARAM_FLAG(degrade_image, true, + "Degrade rendered image with speckle noise, dilation/erosion " + "and rotation"); + +// Rotate the rendered image to have more realistic glyph borders +static BOOL_PARAM_FLAG(rotate_image, true, "Rotate the image in a random way."); + +// Degradation to apply to the image. +static INT_PARAM_FLAG(exposure, 0, "Exposure level in photocopier"); + +// Distort the rendered image by various means according to the bool flags. +static BOOL_PARAM_FLAG(distort_image, false, "Degrade rendered image with noise, blur, invert."); + +// Distortion to apply to the image. +static BOOL_PARAM_FLAG(invert, true, "Invert the image"); + +// Distortion to apply to the image. +static BOOL_PARAM_FLAG(white_noise, true, "Add Gaussian Noise"); + +// Distortion to apply to the image. +static BOOL_PARAM_FLAG(smooth_noise, true, "Smoothen Noise"); + +// Distortion to apply to the image. +static BOOL_PARAM_FLAG(blur, true, "Blur the image"); + +#if 0 + +// Distortion to apply to the image. +static BOOL_PARAM_FLAG(perspective, false, "Generate Perspective Distortion"); + +// Distortion to apply to the image. +static INT_PARAM_FLAG(box_reduction, 0, "Integer reduction factor box_scale"); + +#endif + +// Output image resolution. +static INT_PARAM_FLAG(resolution, 300, "Pixels per inch"); + +// Width of output image (in pixels). +static INT_PARAM_FLAG(xsize, 3600, "Width of output image"); + +// Max height of output image (in pixels). +static INT_PARAM_FLAG(ysize, 4800, "Height of output image"); + +// Max number of pages to produce. +static INT_PARAM_FLAG(max_pages, 0, "Maximum number of pages to output (0=unlimited)"); + +// Margin around text (in pixels). +static INT_PARAM_FLAG(margin, 100, "Margin round edges of image"); + +// Size of text (in points). +static INT_PARAM_FLAG(ptsize, 12, "Size of printed text"); + +// Inter-character space (in ems). +static DOUBLE_PARAM_FLAG(char_spacing, 0, "Inter-character space in ems"); + +// Sets the probability (value in [0, 1]) of starting to render a word with an +// underline. Words are assumed to be space-delimited. +static DOUBLE_PARAM_FLAG(underline_start_prob, 0, + "Fraction of words to underline (value in [0,1])"); +// Set the probability (value in [0, 1]) of continuing a started underline to +// the next word. +static DOUBLE_PARAM_FLAG(underline_continuation_prob, 0, + "Fraction of words to underline (value in [0,1])"); + +// Inter-line space (in pixels). +static INT_PARAM_FLAG(leading, 12, "Inter-line space (in pixels)"); + +// Layout and glyph orientation on rendering. +static STRING_PARAM_FLAG(writing_mode, "horizontal", + "Specify one of the following writing" + " modes.\n" + "'horizontal' : Render regular horizontal text. (default)\n" + "'vertical' : Render vertical text. Glyph orientation is" + " selected by Pango.\n" + "'vertical-upright' : Render vertical text. Glyph " + " orientation is set to be upright."); + +static INT_PARAM_FLAG(box_padding, 0, "Padding around produced bounding boxes"); + +static BOOL_PARAM_FLAG(strip_unrenderable_words, true, + "Remove unrenderable words from source text"); + +// Font name. +static STRING_PARAM_FLAG(font, "Arial", "Font description name to use"); + +static BOOL_PARAM_FLAG(ligatures, false, "Rebuild and render ligatures"); + +static BOOL_PARAM_FLAG(find_fonts, false, "Search for all fonts that can render the text"); +static BOOL_PARAM_FLAG(render_per_font, true, + "If find_fonts==true, render each font to its own image. " + "Image filenames are of the form output_name.font_name.tif"); +static DOUBLE_PARAM_FLAG(min_coverage, 1.0, + "If find_fonts==true, the minimum coverage the font has of " + "the characters in the text file to include it, between " + "0 and 1."); + +static BOOL_PARAM_FLAG(list_available_fonts, false, "List available fonts and quit."); + +static BOOL_PARAM_FLAG(render_ngrams, false, + "Put each space-separated entity from the" + " input file into one bounding box. The ngrams in the input" + " file will be randomly permuted before rendering (so that" + " there is sufficient variety of characters on each line)."); + +static BOOL_PARAM_FLAG(output_word_boxes, false, + "Output word bounding boxes instead of character boxes. " + "This is used for Cube training, and implied by " + "--render_ngrams."); + +static STRING_PARAM_FLAG(unicharset_file, "", + "File with characters in the unicharset. If --render_ngrams" + " is true and --unicharset_file is specified, ngrams with" + " characters that are not in unicharset will be omitted"); + +static BOOL_PARAM_FLAG(bidirectional_rotation, false, "Rotate the generated characters both ways."); + +static BOOL_PARAM_FLAG(only_extract_font_properties, false, + "Assumes that the input file contains a list of ngrams. Renders" + " each ngram, extracts spacing properties and records them in" + " output_base/[font_name].fontinfo file."); + +// Use these flags to output zero-padded, square individual character images +static BOOL_PARAM_FLAG(output_individual_glyph_images, false, + "If true also outputs individual character images"); +static INT_PARAM_FLAG(glyph_resized_size, 0, + "Each glyph is square with this side length in pixels"); +static INT_PARAM_FLAG(glyph_num_border_pixels_to_pad, 0, + "Final_size=glyph_resized_size+2*glyph_num_border_pixels_to_pad"); + +namespace tesseract { + +struct SpacingProperties { + SpacingProperties() : x_gap_before(0), x_gap_after(0) {} + SpacingProperties(int b, int a) : x_gap_before(b), x_gap_after(a) {} + // These values are obtained from FT_Glyph_Metrics struct + // used by the FreeType font engine. + int x_gap_before; // horizontal x bearing + int x_gap_after; // horizontal advance - x_gap_before - width + std::map<std::string, int> kerned_x_gaps; +}; + +static bool IsWhitespaceBox(const BoxChar *boxchar) { + return (boxchar->box() == nullptr || SpanUTF8Whitespace(boxchar->ch().c_str())); +} + +static std::string StringReplace(const std::string &in, const std::string &oldsub, + const std::string &newsub) { + std::string out; + size_t start_pos = 0, pos; + while ((pos = in.find(oldsub, start_pos)) != std::string::npos) { + out.append(in.data() + start_pos, pos - start_pos); + out.append(newsub.data(), newsub.length()); + start_pos = pos + oldsub.length(); + } + out.append(in.data() + start_pos, in.length() - start_pos); + return out; +} + +// Assumes that each word (whitespace-separated entity) in text is a bigram. +// Renders the bigrams and calls FontInfo::GetSpacingProperties() to +// obtain spacing information. Produces the output .fontinfo file with a line +// per unichar of the form: +// unichar space_before space_after kerned1 kerned_space1 kerned2 ... +// Fox example, if unichar "A" has spacing of 0 pixels before and -1 pixels +// after, is kerned with "V" resulting in spacing of "AV" to be -7 and kerned +// with "T", such that "AT" has spacing of -5, the entry/line for unichar "A" +// in .fontinfo file will be: +// A 0 -1 T -5 V -7 +static void ExtractFontProperties(const std::string &utf8_text, StringRenderer *render, + const std::string &output_base) { + std::map<std::string, SpacingProperties> spacing_map; + std::map<std::string, SpacingProperties>::iterator spacing_map_it0; + std::map<std::string, SpacingProperties>::iterator spacing_map_it1; + int x_bearing, x_advance; + int len = utf8_text.length(); + int offset = 0; + const char *text = utf8_text.c_str(); + while (offset < len) { + offset += render->RenderToImage(text + offset, strlen(text + offset), nullptr); + const std::vector<BoxChar *> &boxes = render->GetBoxes(); + + // If the page break split a bigram, correct the offset so we try the bigram + // on the next iteration. + if (boxes.size() > 2 && !IsWhitespaceBox(boxes[boxes.size() - 1]) && + IsWhitespaceBox(boxes[boxes.size() - 2])) { + if (boxes.size() > 3) { + tprintf("WARNING: Adjusting to bad page break after '%s%s'\n", + boxes[boxes.size() - 4]->ch().c_str(), boxes[boxes.size() - 3]->ch().c_str()); + } + offset -= boxes[boxes.size() - 1]->ch().size(); + } + + for (size_t b = 0; b < boxes.size(); b += 2) { + while (b < boxes.size() && IsWhitespaceBox(boxes[b])) { + ++b; + } + if (b + 1 >= boxes.size()) { + break; + } + const std::string &ch0 = boxes[b]->ch(); + // We encountered a ligature. This happens in at least two scenarios: + // One is when the rendered bigram forms a grapheme cluster (eg. the + // second character in the bigram is a combining vowel), in which case we + // correctly output only one bounding box. + // A second far less frequent case is when caused some fonts like 'DejaVu + // Sans Ultra-Light' force Pango to render a ligatured character even if + // the input consists of the separated characters. NOTE(ranjith): As per + // behdad@ this is not currently controllable at the level of the Pango + // API. + // The most frequent of all is a single character "word" made by the CJK + // segmenter. + // Safeguard against these cases here by just skipping the bigram. + if (IsWhitespaceBox(boxes[b + 1])) { + continue; + } + int xgap = (boxes[b + 1]->box()->x - (boxes[b]->box()->x + boxes[b]->box()->w)); + spacing_map_it0 = spacing_map.find(ch0); + int ok_count = 0; + if (spacing_map_it0 == spacing_map.end() && + render->font().GetSpacingProperties(ch0, &x_bearing, &x_advance)) { + spacing_map[ch0] = SpacingProperties(x_bearing, x_advance - x_bearing - boxes[b]->box()->w); + spacing_map_it0 = spacing_map.find(ch0); + ++ok_count; + } + const std::string &ch1 = boxes[b + 1]->ch(); + tlog(3, "%s%s\n", ch0.c_str(), ch1.c_str()); + spacing_map_it1 = spacing_map.find(ch1); + if (spacing_map_it1 == spacing_map.end() && + render->font().GetSpacingProperties(ch1, &x_bearing, &x_advance)) { + spacing_map[ch1] = + SpacingProperties(x_bearing, x_advance - x_bearing - boxes[b + 1]->box()->w); + spacing_map_it1 = spacing_map.find(ch1); + ++ok_count; + } + if (ok_count == 2 && + xgap != (spacing_map_it0->second.x_gap_after + spacing_map_it1->second.x_gap_before)) { + spacing_map_it0->second.kerned_x_gaps[ch1] = xgap; + } + } + render->ClearBoxes(); + } + std::string output_string; + const int kBufSize = 1024; + char buf[kBufSize]; + snprintf(buf, kBufSize, "%d\n", static_cast<int>(spacing_map.size())); + output_string.append(buf); + std::map<std::string, SpacingProperties>::const_iterator spacing_map_it; + for (spacing_map_it = spacing_map.begin(); spacing_map_it != spacing_map.end(); + ++spacing_map_it) { + snprintf(buf, kBufSize, "%s %d %d %d", spacing_map_it->first.c_str(), + spacing_map_it->second.x_gap_before, spacing_map_it->second.x_gap_after, + static_cast<int>(spacing_map_it->second.kerned_x_gaps.size())); + output_string.append(buf); + std::map<std::string, int>::const_iterator kern_it; + for (kern_it = spacing_map_it->second.kerned_x_gaps.begin(); + kern_it != spacing_map_it->second.kerned_x_gaps.end(); ++kern_it) { + snprintf(buf, kBufSize, " %s %d", kern_it->first.c_str(), kern_it->second); + output_string.append(buf); + } + output_string.append("\n"); + } + File::WriteStringToFileOrDie(output_string, output_base + ".fontinfo"); +} + +static bool MakeIndividualGlyphs(Image pix, const std::vector<BoxChar *> &vbox, + const int input_tiff_page) { + // If checks fail, return false without exiting text2image + if (!pix) { + tprintf("ERROR: MakeIndividualGlyphs(): Input Pix* is nullptr\n"); + return false; + } else if (FLAGS_glyph_resized_size <= 0) { + tprintf("ERROR: --glyph_resized_size must be positive\n"); + return false; + } else if (FLAGS_glyph_num_border_pixels_to_pad < 0) { + tprintf("ERROR: --glyph_num_border_pixels_to_pad must be 0 or positive\n"); + return false; + } + + const int n_boxes = vbox.size(); + int n_boxes_saved = 0; + int current_tiff_page = 0; + int y_previous = 0; + static int glyph_count = 0; + for (int i = 0; i < n_boxes; i++) { + // Get one bounding box + Box *b = vbox[i]->mutable_box(); + if (!b) { + continue; + } + const int x = b->x; + const int y = b->y; + const int w = b->w; + const int h = b->h; + // Check present tiff page (for multipage tiff) + if (y < y_previous - pixGetHeight(pix) / 10) { + tprintf("ERROR: Wrap-around encountered, at i=%d\n", i); + current_tiff_page++; + } + if (current_tiff_page < input_tiff_page) { + continue; + } else if (current_tiff_page > input_tiff_page) { + break; + } + // Check box validity + if (x < 0 || y < 0 || (x + w - 1) >= pixGetWidth(pix) || (y + h - 1) >= pixGetHeight(pix)) { + tprintf( + "ERROR: MakeIndividualGlyphs(): Index out of range, at i=%d" + " (x=%d, y=%d, w=%d, h=%d\n)", + i, x, y, w, h); + continue; + } else if (w < FLAGS_glyph_num_border_pixels_to_pad && + h < FLAGS_glyph_num_border_pixels_to_pad) { + tprintf("ERROR: Input image too small to be a character, at i=%d\n", i); + continue; + } + // Crop the boxed character + Image pix_glyph = pixClipRectangle(pix, b, nullptr); + if (!pix_glyph) { + tprintf("ERROR: MakeIndividualGlyphs(): Failed to clip, at i=%d\n", i); + continue; + } + // Resize to square + Image pix_glyph_sq = + pixScaleToSize(pix_glyph, FLAGS_glyph_resized_size, FLAGS_glyph_resized_size); + if (!pix_glyph_sq) { + tprintf("ERROR: MakeIndividualGlyphs(): Failed to resize, at i=%d\n", i); + continue; + } + // Zero-pad + Image pix_glyph_sq_pad = pixAddBorder(pix_glyph_sq, FLAGS_glyph_num_border_pixels_to_pad, 0); + if (!pix_glyph_sq_pad) { + tprintf("ERROR: MakeIndividualGlyphs(): Failed to zero-pad, at i=%d\n", i); + continue; + } + // Write out + Image pix_glyph_sq_pad_8 = pixConvertTo8(pix_glyph_sq_pad, false); + char filename[1024]; + snprintf(filename, 1024, "%s_%d.jpg", FLAGS_outputbase.c_str(), glyph_count++); + if (pixWriteJpeg(filename, pix_glyph_sq_pad_8, 100, 0)) { + tprintf( + "ERROR: MakeIndividualGlyphs(): Failed to write JPEG to %s," + " at i=%d\n", + filename, i); + continue; + } + + pix_glyph.destroy(); + pix_glyph_sq.destroy(); + pix_glyph_sq_pad.destroy(); + pix_glyph_sq_pad_8.destroy(); + n_boxes_saved++; + y_previous = y; + } + if (n_boxes_saved == 0) { + return false; + } else { + tprintf("Total number of characters saved = %d\n", n_boxes_saved); + return true; + } +} +} // namespace tesseract + +using tesseract::DegradeImage; +using tesseract::ExtractFontProperties; +using tesseract::File; +using tesseract::FontUtils; +using tesseract::SpanUTF8NotWhitespace; +using tesseract::SpanUTF8Whitespace; +using tesseract::StringRenderer; + +static int Main() { + if (FLAGS_list_available_fonts) { + const std::vector<std::string> &all_fonts = FontUtils::ListAvailableFonts(); + for (unsigned int i = 0; i < all_fonts.size(); ++i) { + // Remove trailing comma: pango-font-description-to-string adds a comma + // to some fonts. + // See https://github.com/tesseract-ocr/tesseract/issues/408 + std::string font_name(all_fonts[i].c_str()); + if (font_name.back() == ',') { + font_name.pop_back(); + } + printf("%3u: %s\n", i, font_name.c_str()); + ASSERT_HOST_MSG(FontUtils::IsAvailableFont(all_fonts[i].c_str()), + "Font %s is unrecognized.\n", all_fonts[i].c_str()); + } + return EXIT_SUCCESS; + } + + // Check validity of input flags. + if (FLAGS_text.empty()) { + tprintf("'--text' option is missing!\n"); + return EXIT_FAILURE; + } + if (FLAGS_outputbase.empty()) { + tprintf("'--outputbase' option is missing!\n"); + return EXIT_FAILURE; + } + if (!FLAGS_unicharset_file.empty() && FLAGS_render_ngrams) { + tprintf("Use '--unicharset_file' only if '--render_ngrams' is set.\n"); + return EXIT_FAILURE; + } + + std::string font_name = FLAGS_font.c_str(); + if (!FLAGS_find_fonts && !FontUtils::IsAvailableFont(font_name.c_str())) { + font_name += ','; + std::string pango_name; + if (!FontUtils::IsAvailableFont(font_name.c_str(), &pango_name)) { + tprintf("Could not find font named '%s'.\n", FLAGS_font.c_str()); + if (!pango_name.empty()) { + tprintf("Pango suggested font '%s'.\n", pango_name.c_str()); + } + tprintf("Please correct --font arg.\n"); + return EXIT_FAILURE; + } + } + + if (FLAGS_render_ngrams) { + FLAGS_output_word_boxes = true; + } + + char font_desc_name[1024]; + snprintf(font_desc_name, 1024, "%s %d", font_name.c_str(), static_cast<int>(FLAGS_ptsize)); + + StringRenderer render(font_desc_name, FLAGS_xsize, FLAGS_ysize); + render.set_add_ligatures(FLAGS_ligatures); + render.set_leading(FLAGS_leading); + render.set_resolution(FLAGS_resolution); + render.set_char_spacing(FLAGS_char_spacing * FLAGS_ptsize); + render.set_h_margin(FLAGS_margin); + render.set_v_margin(FLAGS_margin); + render.set_output_word_boxes(FLAGS_output_word_boxes); + render.set_box_padding(FLAGS_box_padding); + render.set_strip_unrenderable_words(FLAGS_strip_unrenderable_words); + render.set_underline_start_prob(FLAGS_underline_start_prob); + render.set_underline_continuation_prob(FLAGS_underline_continuation_prob); + + // Set text rendering orientation and their forms. + if (FLAGS_writing_mode == "horizontal") { + // Render regular horizontal text (default). + render.set_vertical_text(false); + render.set_gravity_hint_strong(false); + render.set_render_fullwidth_latin(false); + } else if (FLAGS_writing_mode == "vertical") { + // Render vertical text. Glyph orientation is selected by Pango. + render.set_vertical_text(true); + render.set_gravity_hint_strong(false); + render.set_render_fullwidth_latin(false); + } else if (FLAGS_writing_mode == "vertical-upright") { + // Render vertical text. Glyph orientation is set to be upright. + // Also Basic Latin characters are converted to their fullwidth forms + // on rendering, since fullwidth Latin characters are well designed to fit + // vertical text lines, while .box files store halfwidth Basic Latin + // unichars. + render.set_vertical_text(true); + render.set_gravity_hint_strong(true); + render.set_render_fullwidth_latin(true); + } else { + tprintf("Invalid writing mode: %s\n", FLAGS_writing_mode.c_str()); + return EXIT_FAILURE; + } + + std::string src_utf8; + // This c_str is NOT redundant! + if (!File::ReadFileToString(FLAGS_text.c_str(), &src_utf8)) { + tprintf("Failed to read file: %s\n", FLAGS_text.c_str()); + return EXIT_FAILURE; + } + + // Remove the unicode mark if present. + if (strncmp(src_utf8.c_str(), "\xef\xbb\xbf", 3) == 0) { + src_utf8.erase(0, 3); + } + tlog(1, "Render string of size %zu\n", src_utf8.length()); + + if (FLAGS_render_ngrams || FLAGS_only_extract_font_properties) { + // Try to preserve behavior of old text2image by expanding inter-word + // spaces by a factor of 4. + const std::string kSeparator = FLAGS_render_ngrams ? " " : " "; + // Also restrict the number of characters per line to try and avoid + // line-breaking in the middle of words like "-A", "R$" etc. which are + // otherwise allowed by the standard unicode line-breaking rules. + const unsigned int kCharsPerLine = (FLAGS_ptsize > 20) ? 50 : 100; + std::string rand_utf8; + UNICHARSET unicharset; + if (FLAGS_render_ngrams && !FLAGS_unicharset_file.empty() && + !unicharset.load_from_file(FLAGS_unicharset_file.c_str())) { + tprintf("Failed to load unicharset from file %s\n", FLAGS_unicharset_file.c_str()); + return EXIT_FAILURE; + } + + // If we are rendering ngrams that will be OCRed later, shuffle them so that + // tesseract does not have difficulties finding correct baseline, word + // spaces, etc. + const char *str8 = src_utf8.c_str(); + int len = src_utf8.length(); + int step; + std::vector<std::pair<int, int>> offsets; + int offset = SpanUTF8Whitespace(str8); + while (offset < len) { + step = SpanUTF8NotWhitespace(str8 + offset); + offsets.emplace_back(offset, step); + offset += step; + offset += SpanUTF8Whitespace(str8 + offset); + } + if (FLAGS_render_ngrams) { + std::seed_seq seed{kRandomSeed}; + std::mt19937 random_gen(seed); + std::shuffle(offsets.begin(), offsets.end(), random_gen); + } + + for (size_t i = 0, line = 1; i < offsets.size(); ++i) { + const char *curr_pos = str8 + offsets[i].first; + int ngram_len = offsets[i].second; + // Skip words that contain characters not in found in unicharset. + std::string cleaned = UNICHARSET::CleanupString(curr_pos, ngram_len); + if (!FLAGS_unicharset_file.empty() && + !unicharset.encodable_string(cleaned.c_str(), nullptr)) { + continue; + } + rand_utf8.append(curr_pos, ngram_len); + if (rand_utf8.length() > line * kCharsPerLine) { + rand_utf8.append(" \n"); + ++line; + if (line & 0x1) { + rand_utf8.append(kSeparator); + } + } else { + rand_utf8.append(kSeparator); + } + } + tlog(1, "Rendered ngram string of size %zu\n", rand_utf8.length()); + src_utf8.swap(rand_utf8); + } + if (FLAGS_only_extract_font_properties) { + tprintf("Extracting font properties only\n"); + ExtractFontProperties(src_utf8, &render, FLAGS_outputbase.c_str()); + tprintf("Done!\n"); + return EXIT_SUCCESS; + } + + int im = 0; + std::vector<float> page_rotation; + const char *to_render_utf8 = src_utf8.c_str(); + + tesseract::TRand randomizer; + randomizer.set_seed(kRandomSeed); + std::vector<std::string> font_names; + // We use a two pass mechanism to rotate images in both direction. + // The first pass(0) will rotate the images in random directions and + // the second pass(1) will mirror those rotations. + int num_pass = FLAGS_bidirectional_rotation ? 2 : 1; + for (int pass = 0; pass < num_pass; ++pass) { + int page_num = 0; + std::string font_used; + for (size_t offset = 0; + offset < strlen(to_render_utf8) && (FLAGS_max_pages == 0 || page_num < FLAGS_max_pages); + ++im, ++page_num) { + tlog(1, "Starting page %d\n", im); + Image pix = nullptr; + if (FLAGS_find_fonts) { + offset += render.RenderAllFontsToImage(FLAGS_min_coverage, to_render_utf8 + offset, + strlen(to_render_utf8 + offset), &font_used, &pix); + } else { + offset += + render.RenderToImage(to_render_utf8 + offset, strlen(to_render_utf8 + offset), &pix); + } + if (pix != nullptr) { + float rotation = 0; + if (pass == 1) { + // Pass 2, do mirror rotation. + rotation = -1 * page_rotation[page_num]; + } + if (FLAGS_degrade_image) { + pix = DegradeImage(pix, FLAGS_exposure, &randomizer, + FLAGS_rotate_image ? &rotation : nullptr); + } + if (FLAGS_distort_image) { + // TODO: perspective is set to false and box_reduction to 1. + pix = PrepareDistortedPix(pix, false, FLAGS_invert, FLAGS_white_noise, FLAGS_smooth_noise, + FLAGS_blur, 1, &randomizer, nullptr); + } + render.RotatePageBoxes(rotation); + + if (pass == 0) { + // Pass 1, rotate randomly and store the rotation.. + page_rotation.push_back(rotation); + } + + Image gray_pix = pixConvertTo8(pix, false); + pix.destroy(); + Image binary = pixThresholdToBinary(gray_pix, 128); + gray_pix.destroy(); + char tiff_name[1024]; + if (FLAGS_find_fonts) { + if (FLAGS_render_per_font) { + std::string fontname_for_file = tesseract::StringReplace(font_used, " ", "_"); + snprintf(tiff_name, 1024, "%s.%s.tif", FLAGS_outputbase.c_str(), + fontname_for_file.c_str()); + pixWriteTiff(tiff_name, binary, IFF_TIFF_G4, "w"); + tprintf("Rendered page %d to file %s\n", im, tiff_name); + } else { + font_names.push_back(font_used); + } + } else { + snprintf(tiff_name, 1024, "%s.tif", FLAGS_outputbase.c_str()); + pixWriteTiff(tiff_name, binary, IFF_TIFF_G4, im == 0 ? "w" : "a"); + tprintf("Rendered page %d to file %s\n", im, tiff_name); + } + // Make individual glyphs + if (FLAGS_output_individual_glyph_images) { + if (!MakeIndividualGlyphs(binary, render.GetBoxes(), im)) { + tprintf("ERROR: Individual glyphs not saved\n"); + } + } + binary.destroy(); + } + if (FLAGS_find_fonts && offset != 0) { + // We just want a list of names, or some sample images so we don't need + // to render more than the first page of the text. + break; + } + } + } + if (!FLAGS_find_fonts) { + std::string box_name = FLAGS_outputbase.c_str(); + box_name += ".box"; + render.WriteAllBoxes(box_name); + } else if (!FLAGS_render_per_font && !font_names.empty()) { + std::string filename = FLAGS_outputbase.c_str(); + filename += ".fontlist.txt"; + FILE *fp = fopen(filename.c_str(), "wb"); + if (fp == nullptr) { + tprintf("Failed to create output font list %s\n", filename.c_str()); + } else { + for (auto &font_name : font_names) { + fprintf(fp, "%s\n", font_name.c_str()); + } + fclose(fp); + } + } + + return EXIT_SUCCESS; +} + +int main(int argc, char **argv) { + // Respect environment variable. could be: + // fc (fontconfig), win32, and coretext + // If not set force fontconfig for Mac OS. + // See https://github.com/tesseract-ocr/tesseract/issues/736 + char *backend; + backend = getenv("PANGOCAIRO_BACKEND"); + if (backend == nullptr) { + static char envstring[] = "PANGOCAIRO_BACKEND=fc"; + putenv(envstring); + } else { + printf( + "Using '%s' as pango cairo backend based on environment " + "variable.\n", + backend); + } + tesseract::CheckSharedLibraryVersion(); + if (argc > 1) { + if ((strcmp(argv[1], "-v") == 0) || (strcmp(argv[1], "--version") == 0)) { + FontUtils::PangoFontTypeInfo(); + printf("Pango version: %s\n", pango_version_string()); + } + } + tesseract::ParseCommandLineFlags(argv[0], &argc, &argv, true); + return Main(); +}
