comparison mupdf-source/thirdparty/tesseract/src/training/unicharset/unicharset_training_utils.cpp @ 2:b50eed0cc0ef upstream

ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4. The directory name has changed: no version number in the expanded directory now.
author Franz Glasner <fzglas.hg@dom66.de>
date Mon, 15 Sep 2025 11:43:07 +0200
parents
children
comparison
equal deleted inserted replaced
1:1d09e1dec1d9 2:b50eed0cc0ef
1 ///////////////////////////////////////////////////////////////////////
2 // File: unicharset_training_utils.cpp
3 // Description: Training utilities for UNICHARSET.
4 // Author: Ray Smith
5 //
6 // (C) Copyright 2014, Google Inc.
7 // Licensed under the Apache License, Version 2.0 (the "License");
8 // you may not use this file except in compliance with the License.
9 // You may obtain a copy of the License at
10 // http://www.apache.org/licenses/LICENSE-2.0
11 // Unless required by applicable law or agreed to in writing, software
12 // distributed under the License is distributed on an "AS IS" BASIS,
13 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 // See the License for the specific language governing permissions and
15 // limitations under the License.
16 //
17 ///////////////////////////////////////////////////////////////////////
18
19 #include "unicharset_training_utils.h"
20
21 #include <cstdlib>
22 #include <cstring>
23 #include <string>
24 #include <vector>
25
26 #include <tesseract/unichar.h>
27 #include "fileio.h"
28 #include "icuerrorcode.h"
29 #include "normstrngs.h"
30 #include "statistc.h"
31 #include "tesserrstream.h" // for tesserr
32 #include "unicharset.h"
33 #include "unicode/uchar.h" // from libicu
34 #include "unicode/uscript.h" // from libicu
35
36 namespace tesseract {
37
38 // Helper sets the character attribute properties and sets up the script table.
39 // Does not set tops and bottoms.
40 void SetupBasicProperties(bool report_errors, bool decompose, UNICHARSET *unicharset) {
41 for (size_t unichar_id = 0; unichar_id < unicharset->size(); ++unichar_id) {
42 // Convert any custom ligatures.
43 const char *unichar_str = unicharset->id_to_unichar(unichar_id);
44 for (int i = 0; UNICHARSET::kCustomLigatures[i][0] != nullptr; ++i) {
45 if (!strcmp(UNICHARSET::kCustomLigatures[i][1], unichar_str)) {
46 unichar_str = UNICHARSET::kCustomLigatures[i][0];
47 break;
48 }
49 }
50
51 // Convert the unichar to UTF32 representation
52 std::vector<char32> uni_vector = UNICHAR::UTF8ToUTF32(unichar_str);
53
54 // Assume that if the property is true for any character in the string,
55 // then it holds for the whole "character".
56 bool unichar_isalpha = false;
57 bool unichar_islower = false;
58 bool unichar_isupper = false;
59 bool unichar_isdigit = false;
60 bool unichar_ispunct = false;
61
62 for (char32 u_ch : uni_vector) {
63 if (u_isalpha(u_ch)) {
64 unichar_isalpha = true;
65 }
66 if (u_islower(u_ch)) {
67 unichar_islower = true;
68 }
69 if (u_isupper(u_ch)) {
70 unichar_isupper = true;
71 }
72 if (u_isdigit(u_ch)) {
73 unichar_isdigit = true;
74 }
75 if (u_ispunct(u_ch)) {
76 unichar_ispunct = true;
77 }
78 }
79
80 unicharset->set_isalpha(unichar_id, unichar_isalpha);
81 unicharset->set_islower(unichar_id, unichar_islower);
82 unicharset->set_isupper(unichar_id, unichar_isupper);
83 unicharset->set_isdigit(unichar_id, unichar_isdigit);
84 unicharset->set_ispunctuation(unichar_id, unichar_ispunct);
85
86 tesseract::IcuErrorCode err;
87 unicharset->set_script(unichar_id, uscript_getName(uscript_getScript(uni_vector[0], err)));
88
89 const int num_code_points = uni_vector.size();
90 // Obtain the lower/upper case if needed and record it in the properties.
91 unicharset->set_other_case(unichar_id, unichar_id);
92 if (unichar_islower || unichar_isupper) {
93 std::vector<char32> other_case(num_code_points, 0);
94 for (int i = 0; i < num_code_points; ++i) {
95 // TODO(daria): Ideally u_strToLower()/ustrToUpper() should be used.
96 // However since they deal with UChars (so need a conversion function
97 // from char32 or UTF8string) and require a meaningful locale string,
98 // for now u_tolower()/u_toupper() are used.
99 other_case[i] = unichar_islower ? u_toupper(uni_vector[i]) : u_tolower(uni_vector[i]);
100 }
101 std::string other_case_uch = UNICHAR::UTF32ToUTF8(other_case);
102 UNICHAR_ID other_case_id = unicharset->unichar_to_id(other_case_uch.c_str());
103 if (other_case_id != INVALID_UNICHAR_ID) {
104 unicharset->set_other_case(unichar_id, other_case_id);
105 } else if (unichar_id >= SPECIAL_UNICHAR_CODES_COUNT && report_errors) {
106 tprintf("Other case %s of %s is not in unicharset\n", other_case_uch.c_str(), unichar_str);
107 }
108 }
109
110 // Set RTL property and obtain mirror unichar ID from ICU.
111 std::vector<char32> mirrors(num_code_points, 0);
112 for (int i = 0; i < num_code_points; ++i) {
113 mirrors[i] = u_charMirror(uni_vector[i]);
114 if (i == 0) { // set directionality to that of the 1st code point
115 unicharset->set_direction(
116 unichar_id, static_cast<UNICHARSET::Direction>(u_charDirection(uni_vector[i])));
117 }
118 }
119 std::string mirror_uch = UNICHAR::UTF32ToUTF8(mirrors);
120 UNICHAR_ID mirror_uch_id = unicharset->unichar_to_id(mirror_uch.c_str());
121 if (mirror_uch_id != INVALID_UNICHAR_ID) {
122 unicharset->set_mirror(unichar_id, mirror_uch_id);
123 } else if (report_errors) {
124 tprintf("Mirror %s of %s is not in unicharset\n", mirror_uch.c_str(), unichar_str);
125 }
126
127 // Record normalized version of this unichar.
128 std::string normed_str;
129 if (unichar_id != 0 &&
130 tesseract::NormalizeUTF8String(
131 decompose ? tesseract::UnicodeNormMode::kNFD : tesseract::UnicodeNormMode::kNFC,
132 tesseract::OCRNorm::kNormalize, tesseract::GraphemeNorm::kNone, unichar_str,
133 &normed_str) &&
134 !normed_str.empty()) {
135 unicharset->set_normed(unichar_id, normed_str.c_str());
136 } else {
137 unicharset->set_normed(unichar_id, unichar_str);
138 }
139 ASSERT_HOST(unicharset->get_other_case(unichar_id) < unicharset->size());
140 }
141 unicharset->post_load_setup();
142 }
143
144 // Helper sets the properties from universal script unicharsets, if found.
145 void SetScriptProperties(const std::string &script_dir, UNICHARSET *unicharset) {
146 for (int s = 0; s < unicharset->get_script_table_size(); ++s) {
147 // Load the unicharset for the script if available.
148 std::string filename =
149 script_dir + "/" + unicharset->get_script_from_script_id(s) + ".unicharset";
150 UNICHARSET script_set;
151 if (script_set.load_from_file(filename.c_str())) {
152 unicharset->SetPropertiesFromOther(script_set);
153 } else if (s != unicharset->common_sid() && s != unicharset->null_sid()) {
154 tprintf("Failed to load script unicharset from:%s\n", filename.c_str());
155 }
156 }
157 for (int c = SPECIAL_UNICHAR_CODES_COUNT; c < unicharset->size(); ++c) {
158 if (unicharset->PropertiesIncomplete(c)) {
159 tprintf("Warning: properties incomplete for index %d = %s\n", c,
160 unicharset->id_to_unichar(c));
161 }
162 }
163 }
164
165 // Helper gets the combined x-heights string.
166 std::string GetXheightString(const std::string &script_dir, const UNICHARSET &unicharset) {
167 std::string xheights_str;
168 for (int s = 0; s < unicharset.get_script_table_size(); ++s) {
169 // Load the xheights for the script if available.
170 std::string filename = script_dir + "/" + unicharset.get_script_from_script_id(s) + ".xheights";
171 std::string script_heights;
172 if (File::ReadFileToString(filename, &script_heights)) {
173 xheights_str += script_heights;
174 }
175 }
176 return xheights_str;
177 }
178
179 // Helper to set the properties for an input unicharset file, writes to the
180 // output file. If an appropriate script unicharset can be found in the
181 // script_dir directory, then the tops and bottoms are expanded using the
182 // script unicharset.
183 // If non-empty, xheight data for the fonts are written to the xheights_file.
184 void SetPropertiesForInputFile(const std::string &script_dir,
185 const std::string &input_unicharset_file,
186 const std::string &output_unicharset_file,
187 const std::string &output_xheights_file) {
188 UNICHARSET unicharset;
189
190 // Load the input unicharset
191 unicharset.load_from_file(input_unicharset_file.c_str());
192 tesserr << "Loaded unicharset of size " << unicharset.size()
193 << " from file " << input_unicharset_file << '\n';
194
195 // Set unichar properties
196 tprintf("Setting unichar properties\n");
197 SetupBasicProperties(true, false, &unicharset);
198 tprintf("Setting script properties\n");
199 SetScriptProperties(script_dir, &unicharset);
200 if (!output_xheights_file.empty()) {
201 std::string xheights_str = GetXheightString(script_dir, unicharset);
202 File::WriteStringToFileOrDie(xheights_str, output_xheights_file);
203 }
204
205 // Write the output unicharset
206 tprintf("Writing unicharset to file %s\n", output_unicharset_file.c_str());
207 unicharset.save_to_file(output_unicharset_file.c_str());
208 }
209
210 } // namespace tesseract