Mercurial > hgrepos > Python2 > PyMuPDF
comparison mupdf-source/thirdparty/tesseract/src/classify/shapetable.cpp @ 2:b50eed0cc0ef upstream
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
The directory name has changed: no version number in the expanded directory now.
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Mon, 15 Sep 2025 11:43:07 +0200 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| 1:1d09e1dec1d9 | 2:b50eed0cc0ef |
|---|---|
| 1 // Copyright 2010 Google Inc. All Rights Reserved. | |
| 2 // Author: rays@google.com (Ray Smith) | |
| 3 /////////////////////////////////////////////////////////////////////// | |
| 4 // File: shapetable.cpp | |
| 5 // Description: Class to map a classifier shape index to unicharset | |
| 6 // indices and font indices. | |
| 7 // Author: Ray Smith | |
| 8 // | |
| 9 // (C) Copyright 2010, Google Inc. | |
| 10 // Licensed under the Apache License, Version 2.0 (the "License"); | |
| 11 // you may not use this file except in compliance with the License. | |
| 12 // You may obtain a copy of the License at | |
| 13 // http://www.apache.org/licenses/LICENSE-2.0 | |
| 14 // Unless required by applicable law or agreed to in writing, software | |
| 15 // distributed under the License is distributed on an "AS IS" BASIS, | |
| 16 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
| 17 // See the License for the specific language governing permissions and | |
| 18 // limitations under the License. | |
| 19 // | |
| 20 /////////////////////////////////////////////////////////////////////// | |
| 21 | |
| 22 #include "shapetable.h" | |
| 23 | |
| 24 #include "bitvector.h" | |
| 25 #include "fontinfo.h" | |
| 26 #include "intfeaturespace.h" | |
| 27 #include "unicharset.h" | |
| 28 #include "unicity_table.h" | |
| 29 | |
| 30 #include <algorithm> | |
| 31 | |
| 32 namespace tesseract { | |
| 33 | |
| 34 // Helper function to get the index of the first result with the required | |
| 35 // unichar_id. If the results are sorted by rating, this will also be the | |
| 36 // best result with the required unichar_id. | |
| 37 // Returns -1 if the unichar_id is not found | |
| 38 int ShapeRating::FirstResultWithUnichar(const std::vector<ShapeRating> &results, | |
| 39 const ShapeTable &shape_table, UNICHAR_ID unichar_id) { | |
| 40 for (unsigned r = 0; r < results.size(); ++r) { | |
| 41 const auto shape_id = results[r].shape_id; | |
| 42 const Shape &shape = shape_table.GetShape(shape_id); | |
| 43 if (shape.ContainsUnichar(unichar_id)) { | |
| 44 return r; | |
| 45 } | |
| 46 } | |
| 47 return -1; | |
| 48 } | |
| 49 | |
| 50 // Helper function to get the index of the first result with the required | |
| 51 // unichar_id. If the results are sorted by rating, this will also be the | |
| 52 // best result with the required unichar_id. | |
| 53 // Returns -1 if the unichar_id is not found | |
| 54 int UnicharRating::FirstResultWithUnichar(const std::vector<UnicharRating> &results, | |
| 55 UNICHAR_ID unichar_id) { | |
| 56 for (unsigned r = 0; r < results.size(); ++r) { | |
| 57 if (results[r].unichar_id == unichar_id) { | |
| 58 return r; | |
| 59 } | |
| 60 } | |
| 61 return -1; | |
| 62 } | |
| 63 | |
| 64 // Writes to the given file. Returns false in case of error. | |
| 65 bool UnicharAndFonts::Serialize(FILE *fp) const { | |
| 66 return tesseract::Serialize(fp, &unichar_id) && tesseract::Serialize(fp, font_ids); | |
| 67 } | |
| 68 | |
| 69 // Reads from the given file. Returns false in case of error. | |
| 70 bool UnicharAndFonts::DeSerialize(TFile *fp) { | |
| 71 return fp->DeSerialize(&unichar_id) && fp->DeSerialize(font_ids); | |
| 72 } | |
| 73 | |
| 74 // Sort function to sort a pair of UnicharAndFonts by unichar_id. | |
| 75 int UnicharAndFonts::SortByUnicharId(const void *v1, const void *v2) { | |
| 76 const auto *p1 = static_cast<const UnicharAndFonts *>(v1); | |
| 77 const auto *p2 = static_cast<const UnicharAndFonts *>(v2); | |
| 78 return p1->unichar_id - p2->unichar_id; | |
| 79 } | |
| 80 | |
| 81 bool UnicharAndFonts::StdSortByUnicharId(const UnicharAndFonts &v1, const UnicharAndFonts &v2) { | |
| 82 return v1.unichar_id < v2.unichar_id; | |
| 83 } | |
| 84 | |
| 85 // Writes to the given file. Returns false in case of error. | |
| 86 bool Shape::Serialize(FILE *fp) const { | |
| 87 uint8_t sorted = unichars_sorted_; | |
| 88 return tesseract::Serialize(fp, &sorted) && tesseract::Serialize(fp, unichars_); | |
| 89 } | |
| 90 // Reads from the given file. Returns false in case of error. | |
| 91 | |
| 92 bool Shape::DeSerialize(TFile *fp) { | |
| 93 uint8_t sorted; | |
| 94 if (!fp->DeSerialize(&sorted)) { | |
| 95 return false; | |
| 96 } | |
| 97 unichars_sorted_ = sorted != 0; | |
| 98 return fp->DeSerialize(unichars_); | |
| 99 } | |
| 100 | |
| 101 // Adds a font_id for the given unichar_id. If the unichar_id is not | |
| 102 // in the shape, it is added. | |
| 103 void Shape::AddToShape(int unichar_id, int font_id) { | |
| 104 for (auto &unichar : unichars_) { | |
| 105 if (unichar.unichar_id == unichar_id) { | |
| 106 // Found the unichar in the shape table. | |
| 107 std::vector<int> &font_list = unichar.font_ids; | |
| 108 for (int f : font_list) { | |
| 109 if (f == font_id) { | |
| 110 return; // Font is already there. | |
| 111 } | |
| 112 } | |
| 113 font_list.push_back(font_id); | |
| 114 return; | |
| 115 } | |
| 116 } | |
| 117 // Unichar_id is not in shape, so add it to shape. | |
| 118 unichars_.emplace_back(unichar_id, font_id); | |
| 119 unichars_sorted_ = unichars_.size() <= 1; | |
| 120 } | |
| 121 | |
| 122 // Adds everything in other to this. | |
| 123 void Shape::AddShape(const Shape &other) { | |
| 124 for (const auto &unichar : other.unichars_) { | |
| 125 for (unsigned f = 0; f < unichar.font_ids.size(); ++f) { | |
| 126 AddToShape(unichar.unichar_id, unichar.font_ids[f]); | |
| 127 } | |
| 128 } | |
| 129 unichars_sorted_ = unichars_.size() <= 1; | |
| 130 } | |
| 131 | |
| 132 // Returns true if the shape contains the given unichar_id, font_id pair. | |
| 133 bool Shape::ContainsUnicharAndFont(int unichar_id, int font_id) const { | |
| 134 for (const auto &unichar : unichars_) { | |
| 135 if (unichar.unichar_id == unichar_id) { | |
| 136 // Found the unichar, so look for the font. | |
| 137 auto &font_list = unichar.font_ids; | |
| 138 for (int f : font_list) { | |
| 139 if (f == font_id) { | |
| 140 return true; | |
| 141 } | |
| 142 } | |
| 143 return false; | |
| 144 } | |
| 145 } | |
| 146 return false; | |
| 147 } | |
| 148 | |
| 149 // Returns true if the shape contains the given unichar_id, ignoring font. | |
| 150 bool Shape::ContainsUnichar(int unichar_id) const { | |
| 151 for (const auto &unichar : unichars_) { | |
| 152 if (unichar.unichar_id == unichar_id) { | |
| 153 return true; | |
| 154 } | |
| 155 } | |
| 156 return false; | |
| 157 } | |
| 158 | |
| 159 // Returns true if the shape contains the given font, ignoring unichar_id. | |
| 160 bool Shape::ContainsFont(int font_id) const { | |
| 161 for (const auto &unichar : unichars_) { | |
| 162 auto &font_list = unichar.font_ids; | |
| 163 for (int f : font_list) { | |
| 164 if (f == font_id) { | |
| 165 return true; | |
| 166 } | |
| 167 } | |
| 168 } | |
| 169 return false; | |
| 170 } | |
| 171 // Returns true if the shape contains the given font properties, ignoring | |
| 172 // unichar_id. | |
| 173 bool Shape::ContainsFontProperties(const FontInfoTable &font_table, uint32_t properties) const { | |
| 174 for (const auto &unichar : unichars_) { | |
| 175 auto &font_list = unichar.font_ids; | |
| 176 for (int f : font_list) { | |
| 177 if (font_table.at(f).properties == properties) { | |
| 178 return true; | |
| 179 } | |
| 180 } | |
| 181 } | |
| 182 return false; | |
| 183 } | |
| 184 // Returns true if the shape contains multiple different font properties, | |
| 185 // ignoring unichar_id. | |
| 186 bool Shape::ContainsMultipleFontProperties(const FontInfoTable &font_table) const { | |
| 187 uint32_t properties = font_table.at(unichars_[0].font_ids[0]).properties; | |
| 188 for (const auto &unichar : unichars_) { | |
| 189 auto &font_list = unichar.font_ids; | |
| 190 for (int f : font_list) { | |
| 191 if (font_table.at(f).properties != properties) { | |
| 192 return true; | |
| 193 } | |
| 194 } | |
| 195 } | |
| 196 return false; | |
| 197 } | |
| 198 | |
| 199 // Returns true if this shape is equal to other (ignoring order of unichars | |
| 200 // and fonts). | |
| 201 bool Shape::operator==(const Shape &other) const { | |
| 202 return IsSubsetOf(other) && other.IsSubsetOf(*this); | |
| 203 } | |
| 204 | |
| 205 // Returns true if this is a subset (including equal) of other. | |
| 206 bool Shape::IsSubsetOf(const Shape &other) const { | |
| 207 for (const auto &unichar : unichars_) { | |
| 208 int unichar_id = unichar.unichar_id; | |
| 209 const std::vector<int> &font_list = unichar.font_ids; | |
| 210 for (int f : font_list) { | |
| 211 if (!other.ContainsUnicharAndFont(unichar_id, f)) { | |
| 212 return false; | |
| 213 } | |
| 214 } | |
| 215 } | |
| 216 return true; | |
| 217 } | |
| 218 | |
| 219 // Returns true if the lists of unichar ids are the same in this and other, | |
| 220 // ignoring fonts. | |
| 221 // NOT const, as it will sort the unichars on demand. | |
| 222 bool Shape::IsEqualUnichars(Shape *other) { | |
| 223 if (unichars_.size() != other->unichars_.size()) { | |
| 224 return false; | |
| 225 } | |
| 226 if (!unichars_sorted_) { | |
| 227 SortUnichars(); | |
| 228 } | |
| 229 if (!other->unichars_sorted_) { | |
| 230 other->SortUnichars(); | |
| 231 } | |
| 232 for (unsigned c = 0; c < unichars_.size(); ++c) { | |
| 233 if (unichars_[c].unichar_id != other->unichars_[c].unichar_id) { | |
| 234 return false; | |
| 235 } | |
| 236 } | |
| 237 return true; | |
| 238 } | |
| 239 | |
| 240 // Sorts the unichars_ vector by unichar. | |
| 241 void Shape::SortUnichars() { | |
| 242 std::sort(unichars_.begin(), unichars_.end(), UnicharAndFonts::StdSortByUnicharId); | |
| 243 unichars_sorted_ = true; | |
| 244 } | |
| 245 | |
| 246 ShapeTable::ShapeTable() : unicharset_(nullptr), num_fonts_(0) {} | |
| 247 ShapeTable::ShapeTable(const UNICHARSET &unicharset) : unicharset_(&unicharset), num_fonts_(0) {} | |
| 248 | |
| 249 // Writes to the given file. Returns false in case of error. | |
| 250 bool ShapeTable::Serialize(FILE *fp) const { | |
| 251 return tesseract::Serialize(fp, shape_table_); | |
| 252 } | |
| 253 // Reads from the given file. Returns false in case of error. | |
| 254 | |
| 255 bool ShapeTable::DeSerialize(TFile *fp) { | |
| 256 if (!fp->DeSerialize(shape_table_)) { | |
| 257 return false; | |
| 258 } | |
| 259 num_fonts_ = 0; | |
| 260 return true; | |
| 261 } | |
| 262 | |
| 263 // Returns the number of fonts used in this ShapeTable, computing it if | |
| 264 // necessary. | |
| 265 int ShapeTable::NumFonts() const { | |
| 266 if (num_fonts_ <= 0) { | |
| 267 for (auto shape_id : shape_table_) { | |
| 268 const Shape &shape = *shape_id; | |
| 269 for (int c = 0; c < shape.size(); ++c) { | |
| 270 for (int font_id : shape[c].font_ids) { | |
| 271 if (font_id >= num_fonts_) { | |
| 272 num_fonts_ = font_id + 1; | |
| 273 } | |
| 274 } | |
| 275 } | |
| 276 } | |
| 277 } | |
| 278 return num_fonts_; | |
| 279 } | |
| 280 | |
| 281 // Re-indexes the class_ids in the shapetable according to the given map. | |
| 282 // Useful in conjunction with set_unicharset. | |
| 283 void ShapeTable::ReMapClassIds(const std::vector<int> &unicharset_map) { | |
| 284 for (auto shape : shape_table_) { | |
| 285 for (int c = 0; c < shape->size(); ++c) { | |
| 286 shape->SetUnicharId(c, unicharset_map[(*shape)[c].unichar_id]); | |
| 287 } | |
| 288 } | |
| 289 } | |
| 290 | |
| 291 // Returns a string listing the classes/fonts in a shape. | |
| 292 std::string ShapeTable::DebugStr(unsigned shape_id) const { | |
| 293 if (shape_id >= shape_table_.size()) { | |
| 294 return "INVALID_UNICHAR_ID"; | |
| 295 } | |
| 296 const Shape &shape = GetShape(shape_id); | |
| 297 std::string result; | |
| 298 result += "Shape" + std::to_string(shape_id); | |
| 299 if (shape.size() > 100) { | |
| 300 result += " Num unichars=" + std::to_string(shape.size()); | |
| 301 return result; | |
| 302 } | |
| 303 for (int c = 0; c < shape.size(); ++c) { | |
| 304 result += " c_id=" + std::to_string(shape[c].unichar_id); | |
| 305 result += "="; | |
| 306 result += unicharset_->id_to_unichar(shape[c].unichar_id); | |
| 307 if (shape.size() < 10) { | |
| 308 result += ", " + std::to_string(shape[c].font_ids.size()); | |
| 309 result += " fonts ="; | |
| 310 int num_fonts = shape[c].font_ids.size(); | |
| 311 if (num_fonts > 10) { | |
| 312 result += " " + std::to_string(shape[c].font_ids[0]); | |
| 313 result += " ... " + std::to_string(shape[c].font_ids[num_fonts - 1]); | |
| 314 } else { | |
| 315 for (int f = 0; f < num_fonts; ++f) { | |
| 316 result += " " + std::to_string(shape[c].font_ids[f]); | |
| 317 } | |
| 318 } | |
| 319 } | |
| 320 } | |
| 321 return result; | |
| 322 } | |
| 323 | |
| 324 // Returns a debug string summarizing the table. | |
| 325 std::string ShapeTable::SummaryStr() const { | |
| 326 int max_unichars = 0; | |
| 327 int num_multi_shapes = 0; | |
| 328 int num_master_shapes = 0; | |
| 329 for (unsigned s = 0; s < shape_table_.size(); ++s) { | |
| 330 if (MasterDestinationIndex(s) != s) { | |
| 331 continue; | |
| 332 } | |
| 333 ++num_master_shapes; | |
| 334 int shape_size = GetShape(s).size(); | |
| 335 if (shape_size > 1) { | |
| 336 ++num_multi_shapes; | |
| 337 } | |
| 338 if (shape_size > max_unichars) { | |
| 339 max_unichars = shape_size; | |
| 340 } | |
| 341 } | |
| 342 std::string result; | |
| 343 result += "Number of shapes = " + std::to_string(num_master_shapes); | |
| 344 result += " max unichars = " + std::to_string(max_unichars); | |
| 345 result += " number with multiple unichars = " + std::to_string(num_multi_shapes); | |
| 346 return result; | |
| 347 } | |
| 348 | |
| 349 // Adds a new shape starting with the given unichar_id and font_id. | |
| 350 // Returns the assigned index. | |
| 351 unsigned ShapeTable::AddShape(int unichar_id, int font_id) { | |
| 352 auto index = shape_table_.size(); | |
| 353 auto *shape = new Shape; | |
| 354 shape->AddToShape(unichar_id, font_id); | |
| 355 shape_table_.push_back(shape); | |
| 356 num_fonts_ = std::max(num_fonts_, font_id + 1); | |
| 357 return index; | |
| 358 } | |
| 359 | |
| 360 // Adds a copy of the given shape unless it is already present. | |
| 361 // Returns the assigned index or index of existing shape if already present. | |
| 362 unsigned ShapeTable::AddShape(const Shape &other) { | |
| 363 unsigned index; | |
| 364 for (index = 0; index < shape_table_.size() && !(other == *shape_table_[index]); ++index) { | |
| 365 continue; | |
| 366 } | |
| 367 if (index == shape_table_.size()) { | |
| 368 auto *shape = new Shape(other); | |
| 369 shape_table_.push_back(shape); | |
| 370 } | |
| 371 num_fonts_ = 0; | |
| 372 return index; | |
| 373 } | |
| 374 | |
| 375 // Removes the shape given by the shape index. | |
| 376 void ShapeTable::DeleteShape(unsigned shape_id) { | |
| 377 delete shape_table_[shape_id]; | |
| 378 shape_table_.erase(shape_table_.begin() + shape_id); | |
| 379 } | |
| 380 | |
| 381 // Adds a font_id to the given existing shape index for the given | |
| 382 // unichar_id. If the unichar_id is not in the shape, it is added. | |
| 383 void ShapeTable::AddToShape(unsigned shape_id, int unichar_id, int font_id) { | |
| 384 Shape &shape = *shape_table_[shape_id]; | |
| 385 shape.AddToShape(unichar_id, font_id); | |
| 386 num_fonts_ = std::max(num_fonts_, font_id + 1); | |
| 387 } | |
| 388 | |
| 389 // Adds the given shape to the existing shape with the given index. | |
| 390 void ShapeTable::AddShapeToShape(unsigned shape_id, const Shape &other) { | |
| 391 Shape &shape = *shape_table_[shape_id]; | |
| 392 shape.AddShape(other); | |
| 393 num_fonts_ = 0; | |
| 394 } | |
| 395 | |
| 396 // Returns the id of the shape that contains the given unichar and font. | |
| 397 // If not found, returns -1. | |
| 398 // If font_id < 0, the font_id is ignored and the first shape that matches | |
| 399 // the unichar_id is returned. | |
| 400 int ShapeTable::FindShape(int unichar_id, int font_id) const { | |
| 401 for (unsigned s = 0; s < shape_table_.size(); ++s) { | |
| 402 const Shape &shape = GetShape(s); | |
| 403 for (int c = 0; c < shape.size(); ++c) { | |
| 404 if (shape[c].unichar_id == unichar_id) { | |
| 405 if (font_id < 0) { | |
| 406 return s; // We don't care about the font. | |
| 407 } | |
| 408 for (int f : shape[c].font_ids) { | |
| 409 if (f == font_id) { | |
| 410 return s; | |
| 411 } | |
| 412 } | |
| 413 } | |
| 414 } | |
| 415 } | |
| 416 return -1; | |
| 417 } | |
| 418 | |
| 419 // Returns the first unichar_id and font_id in the given shape. | |
| 420 void ShapeTable::GetFirstUnicharAndFont(unsigned shape_id, int *unichar_id, int *font_id) const { | |
| 421 const UnicharAndFonts &unichar_and_fonts = (*shape_table_[shape_id])[0]; | |
| 422 *unichar_id = unichar_and_fonts.unichar_id; | |
| 423 *font_id = unichar_and_fonts.font_ids[0]; | |
| 424 } | |
| 425 | |
| 426 // Expands all the classes/fonts in the shape individually to build | |
| 427 // a ShapeTable. | |
| 428 int ShapeTable::BuildFromShape(const Shape &shape, const ShapeTable &master_shapes) { | |
| 429 BitVector shape_map(master_shapes.NumShapes()); | |
| 430 for (int u_ind = 0; u_ind < shape.size(); ++u_ind) { | |
| 431 for (unsigned f_ind = 0; f_ind < shape[u_ind].font_ids.size(); ++f_ind) { | |
| 432 int c = shape[u_ind].unichar_id; | |
| 433 int f = shape[u_ind].font_ids[f_ind]; | |
| 434 int master_id = master_shapes.FindShape(c, f); | |
| 435 if (master_id >= 0) { | |
| 436 shape_map.SetBit(master_id); | |
| 437 } else if (FindShape(c, f) < 0) { | |
| 438 AddShape(c, f); | |
| 439 } | |
| 440 } | |
| 441 } | |
| 442 int num_masters = 0; | |
| 443 for (unsigned s = 0; s < master_shapes.NumShapes(); ++s) { | |
| 444 if (shape_map[s]) { | |
| 445 AddShape(master_shapes.GetShape(s)); | |
| 446 ++num_masters; | |
| 447 } | |
| 448 } | |
| 449 return num_masters; | |
| 450 } | |
| 451 | |
| 452 // Returns true if the shapes are already merged. | |
| 453 bool ShapeTable::AlreadyMerged(unsigned shape_id1, unsigned shape_id2) const { | |
| 454 return MasterDestinationIndex(shape_id1) == MasterDestinationIndex(shape_id2); | |
| 455 } | |
| 456 | |
| 457 // Returns true if any shape contains multiple unichars. | |
| 458 bool ShapeTable::AnyMultipleUnichars() const { | |
| 459 auto num_shapes = NumShapes(); | |
| 460 for (unsigned s1 = 0; s1 < num_shapes; ++s1) { | |
| 461 if (MasterDestinationIndex(s1) != s1) { | |
| 462 continue; | |
| 463 } | |
| 464 if (GetShape(s1).size() > 1) { | |
| 465 return true; | |
| 466 } | |
| 467 } | |
| 468 return false; | |
| 469 } | |
| 470 | |
| 471 // Returns the maximum number of unichars over all shapes. | |
| 472 int ShapeTable::MaxNumUnichars() const { | |
| 473 int max_num_unichars = 0; | |
| 474 int num_shapes = NumShapes(); | |
| 475 for (int s = 0; s < num_shapes; ++s) { | |
| 476 if (GetShape(s).size() > max_num_unichars) { | |
| 477 max_num_unichars = GetShape(s).size(); | |
| 478 } | |
| 479 } | |
| 480 return max_num_unichars; | |
| 481 } | |
| 482 | |
| 483 // Merges shapes with a common unichar over the [start, end) interval. | |
| 484 // Assumes single unichar per shape. | |
| 485 void ShapeTable::ForceFontMerges(unsigned start, unsigned end) { | |
| 486 for (unsigned s1 = start; s1 < end; ++s1) { | |
| 487 if (MasterDestinationIndex(s1) == s1 && GetShape(s1).size() == 1) { | |
| 488 int unichar_id = GetShape(s1)[0].unichar_id; | |
| 489 for (auto s2 = s1 + 1; s2 < end; ++s2) { | |
| 490 if (MasterDestinationIndex(s2) == s2 && GetShape(s2).size() == 1 && | |
| 491 unichar_id == GetShape(s2)[0].unichar_id) { | |
| 492 MergeShapes(s1, s2); | |
| 493 } | |
| 494 } | |
| 495 } | |
| 496 } | |
| 497 ShapeTable compacted(*unicharset_); | |
| 498 compacted.AppendMasterShapes(*this, nullptr); | |
| 499 *this = compacted; | |
| 500 } | |
| 501 | |
| 502 // Returns the number of unichars in the master shape. | |
| 503 unsigned ShapeTable::MasterUnicharCount(unsigned shape_id) const { | |
| 504 int master_id = MasterDestinationIndex(shape_id); | |
| 505 return GetShape(master_id).size(); | |
| 506 } | |
| 507 | |
| 508 // Returns the sum of the font counts in the master shape. | |
| 509 int ShapeTable::MasterFontCount(unsigned shape_id) const { | |
| 510 int master_id = MasterDestinationIndex(shape_id); | |
| 511 const Shape &shape = GetShape(master_id); | |
| 512 int font_count = 0; | |
| 513 for (int c = 0; c < shape.size(); ++c) { | |
| 514 font_count += shape[c].font_ids.size(); | |
| 515 } | |
| 516 return font_count; | |
| 517 } | |
| 518 | |
| 519 // Returns the number of unichars that would result from merging the shapes. | |
| 520 int ShapeTable::MergedUnicharCount(unsigned shape_id1, unsigned shape_id2) const { | |
| 521 // Do it the easy way for now. | |
| 522 int master_id1 = MasterDestinationIndex(shape_id1); | |
| 523 int master_id2 = MasterDestinationIndex(shape_id2); | |
| 524 Shape combined_shape(*shape_table_[master_id1]); | |
| 525 combined_shape.AddShape(*shape_table_[master_id2]); | |
| 526 return combined_shape.size(); | |
| 527 } | |
| 528 | |
| 529 // Merges two shape_ids, leaving shape_id2 marked as merged. | |
| 530 void ShapeTable::MergeShapes(unsigned shape_id1, unsigned shape_id2) { | |
| 531 auto master_id1 = MasterDestinationIndex(shape_id1); | |
| 532 auto master_id2 = MasterDestinationIndex(shape_id2); | |
| 533 // Point master_id2 (and all merged shapes) to master_id1. | |
| 534 shape_table_[master_id2]->set_destination_index(master_id1); | |
| 535 // Add all the shapes of master_id2 to master_id1. | |
| 536 shape_table_[master_id1]->AddShape(*shape_table_[master_id2]); | |
| 537 } | |
| 538 | |
| 539 // Swaps two shape_ids. | |
| 540 void ShapeTable::SwapShapes(unsigned shape_id1, unsigned shape_id2) { | |
| 541 Shape *tmp = shape_table_[shape_id1]; | |
| 542 shape_table_[shape_id1] = shape_table_[shape_id2]; | |
| 543 shape_table_[shape_id2] = tmp; | |
| 544 } | |
| 545 | |
| 546 // Returns the destination of this shape, (if merged), taking into account | |
| 547 // the fact that the destination may itself have been merged. | |
| 548 unsigned ShapeTable::MasterDestinationIndex(unsigned shape_id) const { | |
| 549 auto dest_id = shape_table_[shape_id]->destination_index(); | |
| 550 if (static_cast<unsigned>(dest_id) == shape_id || dest_id < 0) { | |
| 551 return shape_id; // Is master already. | |
| 552 } | |
| 553 auto master_id = shape_table_[dest_id]->destination_index(); | |
| 554 if (master_id == dest_id || master_id < 0) { | |
| 555 return dest_id; // Dest is the master and shape_id points to it. | |
| 556 } | |
| 557 master_id = MasterDestinationIndex(master_id); | |
| 558 return master_id; | |
| 559 } | |
| 560 | |
| 561 // Returns false if the unichars in neither shape is a subset of the other. | |
| 562 bool ShapeTable::SubsetUnichar(unsigned shape_id1, unsigned shape_id2) const { | |
| 563 const Shape &shape1 = GetShape(shape_id1); | |
| 564 const Shape &shape2 = GetShape(shape_id2); | |
| 565 int c1, c2; | |
| 566 for (c1 = 0; c1 < shape1.size(); ++c1) { | |
| 567 int unichar_id1 = shape1[c1].unichar_id; | |
| 568 if (!shape2.ContainsUnichar(unichar_id1)) { | |
| 569 break; | |
| 570 } | |
| 571 } | |
| 572 for (c2 = 0; c2 < shape2.size(); ++c2) { | |
| 573 int unichar_id2 = shape2[c2].unichar_id; | |
| 574 if (!shape1.ContainsUnichar(unichar_id2)) { | |
| 575 break; | |
| 576 } | |
| 577 } | |
| 578 return c1 == shape1.size() || c2 == shape2.size(); | |
| 579 } | |
| 580 | |
| 581 // Returns false if the unichars in neither shape is a subset of the other. | |
| 582 bool ShapeTable::MergeSubsetUnichar(int merge_id1, int merge_id2, unsigned shape_id) const { | |
| 583 const Shape &merge1 = GetShape(merge_id1); | |
| 584 const Shape &merge2 = GetShape(merge_id2); | |
| 585 const Shape &shape = GetShape(shape_id); | |
| 586 int cm1, cm2, cs; | |
| 587 for (cs = 0; cs < shape.size(); ++cs) { | |
| 588 int unichar_id = shape[cs].unichar_id; | |
| 589 if (!merge1.ContainsUnichar(unichar_id) && !merge2.ContainsUnichar(unichar_id)) { | |
| 590 break; // Shape is not a subset of the merge. | |
| 591 } | |
| 592 } | |
| 593 for (cm1 = 0; cm1 < merge1.size(); ++cm1) { | |
| 594 int unichar_id1 = merge1[cm1].unichar_id; | |
| 595 if (!shape.ContainsUnichar(unichar_id1)) { | |
| 596 break; // Merge is not a subset of shape | |
| 597 } | |
| 598 } | |
| 599 for (cm2 = 0; cm2 < merge2.size(); ++cm2) { | |
| 600 int unichar_id2 = merge2[cm2].unichar_id; | |
| 601 if (!shape.ContainsUnichar(unichar_id2)) { | |
| 602 break; // Merge is not a subset of shape | |
| 603 } | |
| 604 } | |
| 605 return cs == shape.size() || (cm1 == merge1.size() && cm2 == merge2.size()); | |
| 606 } | |
| 607 | |
| 608 // Returns true if the unichar sets are equal between the shapes. | |
| 609 bool ShapeTable::EqualUnichars(unsigned shape_id1, unsigned shape_id2) const { | |
| 610 const Shape &shape1 = GetShape(shape_id1); | |
| 611 const Shape &shape2 = GetShape(shape_id2); | |
| 612 for (int c1 = 0; c1 < shape1.size(); ++c1) { | |
| 613 int unichar_id1 = shape1[c1].unichar_id; | |
| 614 if (!shape2.ContainsUnichar(unichar_id1)) { | |
| 615 return false; | |
| 616 } | |
| 617 } | |
| 618 for (int c2 = 0; c2 < shape2.size(); ++c2) { | |
| 619 int unichar_id2 = shape2[c2].unichar_id; | |
| 620 if (!shape1.ContainsUnichar(unichar_id2)) { | |
| 621 return false; | |
| 622 } | |
| 623 } | |
| 624 return true; | |
| 625 } | |
| 626 | |
| 627 // Returns true if the unichar sets are equal between the shapes. | |
| 628 bool ShapeTable::MergeEqualUnichars(int merge_id1, int merge_id2, unsigned shape_id) const { | |
| 629 const Shape &merge1 = GetShape(merge_id1); | |
| 630 const Shape &merge2 = GetShape(merge_id2); | |
| 631 const Shape &shape = GetShape(shape_id); | |
| 632 for (int cs = 0; cs < shape.size(); ++cs) { | |
| 633 int unichar_id = shape[cs].unichar_id; | |
| 634 if (!merge1.ContainsUnichar(unichar_id) && !merge2.ContainsUnichar(unichar_id)) { | |
| 635 return false; // Shape has a unichar that appears in neither merge. | |
| 636 } | |
| 637 } | |
| 638 for (int cm1 = 0; cm1 < merge1.size(); ++cm1) { | |
| 639 int unichar_id1 = merge1[cm1].unichar_id; | |
| 640 if (!shape.ContainsUnichar(unichar_id1)) { | |
| 641 return false; // Merge has a unichar that is not in shape. | |
| 642 } | |
| 643 } | |
| 644 for (int cm2 = 0; cm2 < merge2.size(); ++cm2) { | |
| 645 int unichar_id2 = merge2[cm2].unichar_id; | |
| 646 if (!shape.ContainsUnichar(unichar_id2)) { | |
| 647 return false; // Merge has a unichar that is not in shape. | |
| 648 } | |
| 649 } | |
| 650 return true; | |
| 651 } | |
| 652 | |
| 653 // Returns true if there is a common unichar between the shapes. | |
| 654 bool ShapeTable::CommonUnichars(unsigned shape_id1, unsigned shape_id2) const { | |
| 655 const Shape &shape1 = GetShape(shape_id1); | |
| 656 const Shape &shape2 = GetShape(shape_id2); | |
| 657 for (int c1 = 0; c1 < shape1.size(); ++c1) { | |
| 658 int unichar_id1 = shape1[c1].unichar_id; | |
| 659 if (shape2.ContainsUnichar(unichar_id1)) { | |
| 660 return true; | |
| 661 } | |
| 662 } | |
| 663 return false; | |
| 664 } | |
| 665 | |
| 666 // Returns true if there is a common font id between the shapes. | |
| 667 bool ShapeTable::CommonFont(unsigned shape_id1, unsigned shape_id2) const { | |
| 668 const Shape &shape1 = GetShape(shape_id1); | |
| 669 const Shape &shape2 = GetShape(shape_id2); | |
| 670 for (int c1 = 0; c1 < shape1.size(); ++c1) { | |
| 671 const std::vector<int> &font_list1 = shape1[c1].font_ids; | |
| 672 for (int f : font_list1) { | |
| 673 if (shape2.ContainsFont(f)) { | |
| 674 return true; | |
| 675 } | |
| 676 } | |
| 677 } | |
| 678 return false; | |
| 679 } | |
| 680 | |
| 681 // Appends the master shapes from other to this. | |
| 682 // If not nullptr, shape_map is set to map other shape_ids to this's shape_ids. | |
| 683 void ShapeTable::AppendMasterShapes(const ShapeTable &other, std::vector<int> *shape_map) { | |
| 684 if (shape_map != nullptr) { | |
| 685 shape_map->clear(); | |
| 686 shape_map->resize(other.NumShapes(), -1); | |
| 687 } | |
| 688 for (unsigned s = 0; s < other.shape_table_.size(); ++s) { | |
| 689 if (other.shape_table_[s]->destination_index() < 0) { | |
| 690 int index = AddShape(*other.shape_table_[s]); | |
| 691 if (shape_map != nullptr) { | |
| 692 (*shape_map)[s] = index; | |
| 693 } | |
| 694 } | |
| 695 } | |
| 696 } | |
| 697 | |
| 698 // Returns the number of master shapes remaining after merging. | |
| 699 int ShapeTable::NumMasterShapes() const { | |
| 700 int num_shapes = 0; | |
| 701 for (auto s : shape_table_) { | |
| 702 if (s->destination_index() < 0) { | |
| 703 ++num_shapes; | |
| 704 } | |
| 705 } | |
| 706 return num_shapes; | |
| 707 } | |
| 708 | |
| 709 // Adds the unichars of the given shape_id to the vector of results. Any | |
| 710 // unichar_id that is already present just has the fonts added to the | |
| 711 // font set for that result without adding a new entry in the vector. | |
| 712 // NOTE: it is assumed that the results are given to this function in order | |
| 713 // of decreasing rating. | |
| 714 // The unichar_map vector indicates the index of the results entry containing | |
| 715 // each unichar, or -1 if the unichar is not yet included in results. | |
| 716 void ShapeTable::AddShapeToResults(const ShapeRating &shape_rating, std::vector<int> *unichar_map, | |
| 717 std::vector<UnicharRating> *results) const { | |
| 718 if (shape_rating.joined) { | |
| 719 AddUnicharToResults(UNICHAR_JOINED, shape_rating.rating, unichar_map, results); | |
| 720 } | |
| 721 if (shape_rating.broken) { | |
| 722 AddUnicharToResults(UNICHAR_BROKEN, shape_rating.rating, unichar_map, results); | |
| 723 } | |
| 724 const Shape &shape = GetShape(shape_rating.shape_id); | |
| 725 for (int u = 0; u < shape.size(); ++u) { | |
| 726 int result_index = | |
| 727 AddUnicharToResults(shape[u].unichar_id, shape_rating.rating, unichar_map, results); | |
| 728 for (int font_id : shape[u].font_ids) { | |
| 729 (*results)[result_index].fonts.emplace_back(font_id, | |
| 730 IntCastRounded(shape_rating.rating * INT16_MAX)); | |
| 731 } | |
| 732 } | |
| 733 } | |
| 734 | |
| 735 // Adds the given unichar_id to the results if needed, updating unichar_map | |
| 736 // and returning the index of unichar in results. | |
| 737 int ShapeTable::AddUnicharToResults(int unichar_id, float rating, std::vector<int> *unichar_map, | |
| 738 std::vector<UnicharRating> *results) const { | |
| 739 int result_index = unichar_map->at(unichar_id); | |
| 740 if (result_index < 0) { | |
| 741 UnicharRating result(unichar_id, rating); | |
| 742 result_index = results->size(); | |
| 743 results->push_back(result); | |
| 744 (*unichar_map)[unichar_id] = result_index; | |
| 745 } | |
| 746 return result_index; | |
| 747 } | |
| 748 | |
| 749 } // namespace tesseract |
