Mercurial > hgrepos > Python2 > PyMuPDF
comparison mupdf-source/thirdparty/tesseract/src/ccstruct/imagedata.cpp @ 2:b50eed0cc0ef upstream
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
The directory name has changed: no version number in the expanded directory now.
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Mon, 15 Sep 2025 11:43:07 +0200 |
| parents | |
| children | |
comparison
equal
deleted
inserted
replaced
| 1:1d09e1dec1d9 | 2:b50eed0cc0ef |
|---|---|
| 1 /////////////////////////////////////////////////////////////////////// | |
| 2 // File: imagedata.cpp | |
| 3 // Description: Class to hold information about a single multi-page tiff | |
| 4 // training file and its corresponding boxes or text file. | |
| 5 // Author: Ray Smith | |
| 6 // | |
| 7 // (C) Copyright 2013, Google Inc. | |
| 8 // Licensed under the Apache License, Version 2.0 (the "License"); | |
| 9 // you may not use this file except in compliance with the License. | |
| 10 // You may obtain a copy of the License at | |
| 11 // http://www.apache.org/licenses/LICENSE-2.0 | |
| 12 // Unless required by applicable law or agreed to in writing, software | |
| 13 // distributed under the License is distributed on an "AS IS" BASIS, | |
| 14 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
| 15 // See the License for the specific language governing permissions and | |
| 16 // limitations under the License. | |
| 17 /////////////////////////////////////////////////////////////////////// | |
| 18 | |
| 19 // Include automatically generated configuration file if running autoconf. | |
| 20 #ifdef HAVE_CONFIG_H | |
| 21 # include "config_auto.h" | |
| 22 #endif | |
| 23 | |
| 24 #include "imagedata.h" | |
| 25 | |
| 26 #include "boxread.h" // for ReadMemBoxes | |
| 27 #include "rect.h" // for TBOX | |
| 28 #include "scrollview.h" // for ScrollView, ScrollView::CYAN, ScrollView::NONE | |
| 29 #include "tprintf.h" // for tprintf | |
| 30 #include "tesserrstream.h" // for tesserr | |
| 31 | |
| 32 #include "helpers.h" // for IntCastRounded, TRand, ClipToRange, Modulo | |
| 33 #include "serialis.h" // for TFile | |
| 34 | |
| 35 #include <allheaders.h> // for pixDestroy, pixGetHeight, pixGetWidth, lept_... | |
| 36 | |
| 37 #include <cinttypes> // for PRId64 | |
| 38 #include <fstream> // for std::ifstream | |
| 39 | |
| 40 namespace tesseract { | |
| 41 | |
// Upper bound on how many documents are asked to load ahead of the current
// one while training. A small value is sufficient.
constexpr int kMaxReadAhead = 8;
| 45 | |
| 46 ImageData::ImageData() : page_number_(-1), vertical_text_(false) {} | |
| 47 // Takes ownership of the pix and destroys it. | |
| 48 ImageData::ImageData(bool vertical, Image pix) | |
| 49 : page_number_(0), vertical_text_(vertical) { | |
| 50 SetPix(pix); | |
| 51 } | |
| 52 ImageData::~ImageData() { | |
| 53 #ifdef TESSERACT_IMAGEDATA_AS_PIX | |
| 54 internal_pix_.destroy(); | |
| 55 #endif | |
| 56 } | |
| 57 | |
| 58 // Builds and returns an ImageData from the basic data. Note that imagedata, | |
| 59 // truth_text, and box_text are all the actual file data, NOT filenames. | |
| 60 ImageData *ImageData::Build(const char *name, int page_number, const char *lang, | |
| 61 const char *imagedata, int imagedatasize, | |
| 62 const char *truth_text, const char *box_text) { | |
| 63 auto *image_data = new ImageData(); | |
| 64 image_data->imagefilename_ = name; | |
| 65 image_data->page_number_ = page_number; | |
| 66 image_data->language_ = lang; | |
| 67 // Save the imagedata. | |
| 68 // TODO: optimize resize (no init). | |
| 69 image_data->image_data_.resize(imagedatasize); | |
| 70 memcpy(&image_data->image_data_[0], imagedata, imagedatasize); | |
| 71 if (!image_data->AddBoxes(box_text)) { | |
| 72 if (truth_text == nullptr || truth_text[0] == '\0') { | |
| 73 tprintf("Error: No text corresponding to page %d from image %s!\n", | |
| 74 page_number, name); | |
| 75 delete image_data; | |
| 76 return nullptr; | |
| 77 } | |
| 78 image_data->transcription_ = truth_text; | |
| 79 // If we have no boxes, the transcription is in the 0th box_texts_. | |
| 80 image_data->box_texts_.emplace_back(truth_text); | |
| 81 // We will create a box for the whole image on PreScale, to save unpacking | |
| 82 // the image now. | |
| 83 } else if (truth_text != nullptr && truth_text[0] != '\0' && | |
| 84 image_data->transcription_ != truth_text) { | |
| 85 // Save the truth text as it is present and disagrees with the box text. | |
| 86 image_data->transcription_ = truth_text; | |
| 87 } | |
| 88 return image_data; | |
| 89 } | |
| 90 | |
| 91 // Writes to the given file. Returns false in case of error. | |
| 92 bool ImageData::Serialize(TFile *fp) const { | |
| 93 if (!fp->Serialize(imagefilename_)) { | |
| 94 return false; | |
| 95 } | |
| 96 if (!fp->Serialize(&page_number_)) { | |
| 97 return false; | |
| 98 } | |
| 99 if (!fp->Serialize(image_data_)) { | |
| 100 return false; | |
| 101 } | |
| 102 if (!fp->Serialize(language_)) { | |
| 103 return false; | |
| 104 } | |
| 105 if (!fp->Serialize(transcription_)) { | |
| 106 return false; | |
| 107 } | |
| 108 if (!fp->Serialize(boxes_)) { | |
| 109 return false; | |
| 110 } | |
| 111 if (!fp->Serialize(box_texts_)) { | |
| 112 return false; | |
| 113 } | |
| 114 int8_t vertical = vertical_text_; | |
| 115 return fp->Serialize(&vertical); | |
| 116 } | |
| 117 | |
| 118 // Reads from the given file. Returns false in case of error. | |
| 119 bool ImageData::DeSerialize(TFile *fp) { | |
| 120 if (!fp->DeSerialize(imagefilename_)) { | |
| 121 return false; | |
| 122 } | |
| 123 if (!fp->DeSerialize(&page_number_)) { | |
| 124 return false; | |
| 125 } | |
| 126 if (!fp->DeSerialize(image_data_)) { | |
| 127 return false; | |
| 128 } | |
| 129 if (!fp->DeSerialize(language_)) { | |
| 130 return false; | |
| 131 } | |
| 132 if (!fp->DeSerialize(transcription_)) { | |
| 133 return false; | |
| 134 } | |
| 135 if (!fp->DeSerialize(boxes_)) { | |
| 136 return false; | |
| 137 } | |
| 138 if (!fp->DeSerialize(box_texts_)) { | |
| 139 return false; | |
| 140 } | |
| 141 int8_t vertical = 0; | |
| 142 if (!fp->DeSerialize(&vertical)) { | |
| 143 return false; | |
| 144 } | |
| 145 vertical_text_ = vertical != 0; | |
| 146 return true; | |
| 147 } | |
| 148 | |
| 149 // As DeSerialize, but only seeks past the data - hence a static method. | |
| 150 bool ImageData::SkipDeSerialize(TFile *fp) { | |
| 151 if (!fp->DeSerializeSkip()) { | |
| 152 return false; | |
| 153 } | |
| 154 int32_t page_number; | |
| 155 if (!fp->DeSerialize(&page_number)) { | |
| 156 return false; | |
| 157 } | |
| 158 if (!fp->DeSerializeSkip()) { | |
| 159 return false; | |
| 160 } | |
| 161 if (!fp->DeSerializeSkip()) { | |
| 162 return false; | |
| 163 } | |
| 164 if (!fp->DeSerializeSkip()) { | |
| 165 return false; | |
| 166 } | |
| 167 if (!fp->DeSerializeSkip(sizeof(TBOX))) { | |
| 168 return false; | |
| 169 } | |
| 170 int32_t number; | |
| 171 if (!fp->DeSerialize(&number)) { | |
| 172 return false; | |
| 173 } | |
| 174 for (int i = 0; i < number; i++) { | |
| 175 if (!fp->DeSerializeSkip()) { | |
| 176 return false; | |
| 177 } | |
| 178 } | |
| 179 int8_t vertical = 0; | |
| 180 return fp->DeSerialize(&vertical); | |
| 181 } | |
| 182 | |
| 183 // Saves the given Pix as a PNG-encoded string and destroys it. | |
| 184 // In case of missing PNG support in Leptonica use PNM format, | |
| 185 // which requires more memory. | |
| 186 void ImageData::SetPix(Image pix) { | |
| 187 #ifdef TESSERACT_IMAGEDATA_AS_PIX | |
| 188 internal_pix_ = pix; | |
| 189 #else | |
| 190 SetPixInternal(pix, &image_data_); | |
| 191 #endif | |
| 192 } | |
| 193 | |
| 194 // Returns the Pix image for *this. Must be pixDestroyed after use. | |
| 195 Image ImageData::GetPix() const { | |
| 196 #ifdef TESSERACT_IMAGEDATA_AS_PIX | |
| 197 # ifdef GRAPHICS_DISABLED | |
| 198 /* The only caller of this is the scaling functions to prescale the | |
| 199 * source. Thus we can just return a new pointer to the same data. */ | |
| 200 return internal_pix_.clone(); | |
| 201 # else | |
| 202 /* pixCopy always does an actual copy, so the caller can modify the | |
| 203 * changed data. */ | |
| 204 return internal_pix_.copy(); | |
| 205 # endif | |
| 206 #else | |
| 207 return GetPixInternal(image_data_); | |
| 208 #endif | |
| 209 } | |
| 210 | |
| 211 // Gets anything and everything with a non-nullptr pointer, prescaled to a | |
| 212 // given target_height (if 0, then the original image height), and aligned. | |
| 213 // Also returns (if not nullptr) the width and height of the scaled image. | |
| 214 // The return value is the scaled Pix, which must be pixDestroyed after use, | |
| 215 // and scale_factor (if not nullptr) is set to the scale factor that was applied | |
| 216 // to the image to achieve the target_height. | |
| 217 Image ImageData::PreScale(int target_height, int max_height, | |
| 218 float *scale_factor, int *scaled_width, | |
| 219 int *scaled_height, std::vector<TBOX> *boxes) const { | |
| 220 int input_width = 0; | |
| 221 int input_height = 0; | |
| 222 Image src_pix = GetPix(); | |
| 223 ASSERT_HOST(src_pix != nullptr); | |
| 224 input_width = pixGetWidth(src_pix); | |
| 225 input_height = pixGetHeight(src_pix); | |
| 226 if (target_height == 0) { | |
| 227 target_height = std::min(input_height, max_height); | |
| 228 } | |
| 229 float im_factor = static_cast<float>(target_height) / input_height; | |
| 230 if (scaled_width != nullptr) { | |
| 231 *scaled_width = IntCastRounded(im_factor * input_width); | |
| 232 } | |
| 233 if (scaled_height != nullptr) { | |
| 234 *scaled_height = target_height; | |
| 235 } | |
| 236 // Get the scaled image. | |
| 237 Image pix = pixScale(src_pix, im_factor, im_factor); | |
| 238 if (pix == nullptr) { | |
| 239 tprintf("Scaling pix of size %d, %d by factor %g made null pix!!\n", | |
| 240 input_width, input_height, im_factor); | |
| 241 src_pix.destroy(); | |
| 242 return nullptr; | |
| 243 } | |
| 244 if (scaled_width != nullptr) { | |
| 245 *scaled_width = pixGetWidth(pix); | |
| 246 } | |
| 247 if (scaled_height != nullptr) { | |
| 248 *scaled_height = pixGetHeight(pix); | |
| 249 } | |
| 250 src_pix.destroy(); | |
| 251 if (boxes != nullptr) { | |
| 252 // Get the boxes. | |
| 253 boxes->clear(); | |
| 254 for (auto box : boxes_) { | |
| 255 box.scale(im_factor); | |
| 256 boxes->push_back(box); | |
| 257 } | |
| 258 if (boxes->empty()) { | |
| 259 // Make a single box for the whole image. | |
| 260 TBOX box(0, 0, im_factor * input_width, target_height); | |
| 261 boxes->push_back(box); | |
| 262 } | |
| 263 } | |
| 264 if (scale_factor != nullptr) { | |
| 265 *scale_factor = im_factor; | |
| 266 } | |
| 267 return pix; | |
| 268 } | |
| 269 | |
| 270 int ImageData::MemoryUsed() const { | |
| 271 return image_data_.size(); | |
| 272 } | |
| 273 | |
| 274 #ifndef GRAPHICS_DISABLED | |
| 275 | |
| 276 // Draws the data in a new window. | |
| 277 void ImageData::Display() const { | |
| 278 const int kTextSize = 64; | |
| 279 // Draw the image. | |
| 280 Image pix = GetPix(); | |
| 281 if (pix == nullptr) { | |
| 282 return; | |
| 283 } | |
| 284 int width = pixGetWidth(pix); | |
| 285 int height = pixGetHeight(pix); | |
| 286 auto *win = new ScrollView("Imagedata", 100, 100, 2 * (width + 2 * kTextSize), | |
| 287 2 * (height + 4 * kTextSize), width + 10, | |
| 288 height + 3 * kTextSize, true); | |
| 289 win->Draw(pix, 0, height - 1); | |
| 290 pix.destroy(); | |
| 291 // Draw the boxes. | |
| 292 win->Pen(ScrollView::RED); | |
| 293 win->Brush(ScrollView::NONE); | |
| 294 int text_size = kTextSize; | |
| 295 if (!boxes_.empty() && boxes_[0].height() * 2 < text_size) { | |
| 296 text_size = boxes_[0].height() * 2; | |
| 297 } | |
| 298 win->TextAttributes("Arial", text_size, false, false, false); | |
| 299 if (!boxes_.empty()) { | |
| 300 for (unsigned b = 0; b < boxes_.size(); ++b) { | |
| 301 boxes_[b].plot(win); | |
| 302 win->Text(boxes_[b].left(), height + kTextSize, box_texts_[b].c_str()); | |
| 303 } | |
| 304 } else { | |
| 305 // The full transcription. | |
| 306 win->Pen(ScrollView::CYAN); | |
| 307 win->Text(0, height + kTextSize * 2, transcription_.c_str()); | |
| 308 } | |
| 309 win->Update(); | |
| 310 win->Wait(); | |
| 311 } | |
| 312 | |
| 313 #endif | |
| 314 | |
| 315 // Adds the supplied boxes and transcriptions that correspond to the correct | |
| 316 // page number. | |
| 317 void ImageData::AddBoxes(const std::vector<TBOX> &boxes, | |
| 318 const std::vector<std::string> &texts, | |
| 319 const std::vector<int> &box_pages) { | |
| 320 // Copy the boxes and make the transcription. | |
| 321 for (unsigned i = 0; i < box_pages.size(); ++i) { | |
| 322 if (page_number_ >= 0 && box_pages[i] != page_number_) { | |
| 323 continue; | |
| 324 } | |
| 325 transcription_ += texts[i]; | |
| 326 boxes_.push_back(boxes[i]); | |
| 327 box_texts_.push_back(texts[i]); | |
| 328 } | |
| 329 } | |
| 330 | |
| 331 #ifndef TESSERACT_IMAGEDATA_AS_PIX | |
| 332 // Saves the given Pix as a PNG-encoded string and destroys it. | |
| 333 // In case of missing PNG support in Leptonica use PNM format, | |
| 334 // which requires more memory. | |
| 335 void ImageData::SetPixInternal(Image pix, std::vector<char> *image_data) { | |
| 336 l_uint8 *data; | |
| 337 size_t size; | |
| 338 l_int32 ret; | |
| 339 ret = pixWriteMem(&data, &size, pix, IFF_PNG); | |
| 340 if (ret) { | |
| 341 ret = pixWriteMem(&data, &size, pix, IFF_PNM); | |
| 342 } | |
| 343 pix.destroy(); | |
| 344 // TODO: optimize resize (no init). | |
| 345 image_data->resize(size); | |
| 346 memcpy(&(*image_data)[0], data, size); | |
| 347 lept_free(data); | |
| 348 } | |
| 349 | |
| 350 // Returns the Pix image for the image_data. Must be pixDestroyed after use. | |
| 351 Image ImageData::GetPixInternal(const std::vector<char> &image_data) { | |
| 352 Image pix = nullptr; | |
| 353 if (!image_data.empty()) { | |
| 354 // Convert the array to an image. | |
| 355 const auto *u_data = | |
| 356 reinterpret_cast<const unsigned char *>(&image_data[0]); | |
| 357 pix = pixReadMem(u_data, image_data.size()); | |
| 358 } | |
| 359 return pix; | |
| 360 } | |
| 361 #endif | |
| 362 | |
| 363 // Parses the text string as a box file and adds any discovered boxes that | |
| 364 // match the page number. Returns false on error. | |
| 365 bool ImageData::AddBoxes(const char *box_text) { | |
| 366 if (box_text != nullptr && box_text[0] != '\0') { | |
| 367 std::vector<TBOX> boxes; | |
| 368 std::vector<std::string> texts; | |
| 369 std::vector<int> box_pages; | |
| 370 if (ReadMemBoxes(page_number_, /*skip_blanks*/ false, box_text, | |
| 371 /*continue_on_failure*/ true, &boxes, &texts, nullptr, | |
| 372 &box_pages)) { | |
| 373 AddBoxes(boxes, texts, box_pages); | |
| 374 return true; | |
| 375 } else { | |
| 376 tprintf("Error: No boxes for page %d from image %s!\n", page_number_, | |
| 377 imagefilename_.c_str()); | |
| 378 } | |
| 379 } | |
| 380 return false; | |
| 381 } | |
| 382 | |
| 383 DocumentData::DocumentData(const std::string &name) | |
| 384 : document_name_(name), | |
| 385 pages_offset_(-1), | |
| 386 total_pages_(-1), | |
| 387 memory_used_(0), | |
| 388 max_memory_(0), | |
| 389 reader_(nullptr) {} | |
| 390 | |
| 391 DocumentData::~DocumentData() { | |
| 392 if (thread.joinable()) { | |
| 393 thread.join(); | |
| 394 } | |
| 395 std::lock_guard<std::mutex> lock_p(pages_mutex_); | |
| 396 std::lock_guard<std::mutex> lock_g(general_mutex_); | |
| 397 for (auto data : pages_) { | |
| 398 delete data; | |
| 399 } | |
| 400 } | |
| 401 | |
| 402 // Reads all the pages in the given lstmf filename to the cache. The reader | |
| 403 // is used to read the file. | |
| 404 bool DocumentData::LoadDocument(const char *filename, int start_page, | |
| 405 int64_t max_memory, FileReader reader) { | |
| 406 SetDocument(filename, max_memory, reader); | |
| 407 pages_offset_ = start_page; | |
| 408 return ReCachePages(); | |
| 409 } | |
| 410 | |
| 411 // Sets up the document, without actually loading it. | |
| 412 void DocumentData::SetDocument(const char *filename, int64_t max_memory, | |
| 413 FileReader reader) { | |
| 414 std::lock_guard<std::mutex> lock_p(pages_mutex_); | |
| 415 std::lock_guard<std::mutex> lock(general_mutex_); | |
| 416 document_name_ = filename; | |
| 417 pages_offset_ = -1; | |
| 418 max_memory_ = max_memory; | |
| 419 reader_ = reader; | |
| 420 } | |
| 421 | |
| 422 // Writes all the pages to the given filename. Returns false on error. | |
| 423 bool DocumentData::SaveDocument(const char *filename, FileWriter writer) { | |
| 424 std::lock_guard<std::mutex> lock(pages_mutex_); | |
| 425 TFile fp; | |
| 426 fp.OpenWrite(nullptr); | |
| 427 if (!fp.Serialize(pages_) || !fp.CloseWrite(filename, writer)) { | |
| 428 tprintf("Serialize failed: %s\n", filename); | |
| 429 return false; | |
| 430 } | |
| 431 return true; | |
| 432 } | |
| 433 | |
| 434 // Adds the given page data to this document, counting up memory. | |
| 435 void DocumentData::AddPageToDocument(ImageData *page) { | |
| 436 std::lock_guard<std::mutex> lock(pages_mutex_); | |
| 437 pages_.push_back(page); | |
| 438 set_memory_used(memory_used() + page->MemoryUsed()); | |
| 439 } | |
| 440 | |
| 441 // If the given index is not currently loaded, loads it using a separate | |
| 442 // thread. | |
| 443 void DocumentData::LoadPageInBackground(int index) { | |
| 444 ImageData *page = nullptr; | |
| 445 if (IsPageAvailable(index, &page)) { | |
| 446 return; | |
| 447 } | |
| 448 { | |
| 449 std::lock_guard<std::mutex> lock(pages_mutex_); | |
| 450 if (pages_offset_ == index) { | |
| 451 return; | |
| 452 } | |
| 453 pages_offset_ = index; | |
| 454 for (auto page : pages_) { | |
| 455 delete page; | |
| 456 } | |
| 457 pages_.clear(); | |
| 458 } | |
| 459 if (thread.joinable()) { | |
| 460 thread.join(); | |
| 461 } | |
| 462 // Don't run next statement asynchronously because that would | |
| 463 // create too many threads on Linux (see issue #3111). | |
| 464 ReCachePages(); | |
| 465 } | |
| 466 | |
| 467 // Returns a pointer to the page with the given index, modulo the total | |
| 468 // number of pages. Blocks until the background load is completed. | |
| 469 const ImageData *DocumentData::GetPage(int index) { | |
| 470 ImageData *page = nullptr; | |
| 471 while (!IsPageAvailable(index, &page)) { | |
| 472 // If there is no background load scheduled, schedule one now. | |
| 473 pages_mutex_.lock(); | |
| 474 bool needs_loading = pages_offset_ != index; | |
| 475 pages_mutex_.unlock(); | |
| 476 if (needs_loading) { | |
| 477 LoadPageInBackground(index); | |
| 478 } | |
| 479 // We can't directly load the page, or the background load will delete it | |
| 480 // while the caller is using it, so give it a chance to work. | |
| 481 std::this_thread::yield(); | |
| 482 } | |
| 483 return page; | |
| 484 } | |
| 485 | |
| 486 // Returns true if the requested page is available, and provides a pointer, | |
| 487 // which may be nullptr if the document is empty. May block, even though it | |
| 488 // doesn't guarantee to return true. | |
| 489 bool DocumentData::IsPageAvailable(int index, ImageData **page) { | |
| 490 std::lock_guard<std::mutex> lock(pages_mutex_); | |
| 491 int num_pages = NumPages(); | |
| 492 if (num_pages == 0 || index < 0) { | |
| 493 *page = nullptr; // Empty Document. | |
| 494 return true; | |
| 495 } | |
| 496 if (num_pages > 0) { | |
| 497 index = Modulo(index, num_pages); | |
| 498 if (pages_offset_ <= index && | |
| 499 static_cast<unsigned>(index) < pages_offset_ + pages_.size()) { | |
| 500 *page = pages_[index - pages_offset_]; // Page is available already. | |
| 501 return true; | |
| 502 } | |
| 503 } | |
| 504 return false; | |
| 505 } | |
| 506 | |
| 507 // Removes all pages from memory and frees the memory, but does not forget | |
| 508 // the document metadata. | |
| 509 int64_t DocumentData::UnCache() { | |
| 510 std::lock_guard<std::mutex> lock(pages_mutex_); | |
| 511 int64_t memory_saved = memory_used(); | |
| 512 for (auto page : pages_) { | |
| 513 delete page; | |
| 514 } | |
| 515 pages_.clear(); | |
| 516 pages_offset_ = -1; | |
| 517 set_total_pages(-1); | |
| 518 set_memory_used(0); | |
| 519 tprintf("Unloaded document %s, saving %" PRId64 " memory\n", | |
| 520 document_name_.c_str(), memory_saved); | |
| 521 return memory_saved; | |
| 522 } | |
| 523 | |
| 524 // Shuffles all the pages in the document. | |
| 525 void DocumentData::Shuffle() { | |
| 526 TRand random; | |
| 527 // Different documents get shuffled differently, but the same for the same | |
| 528 // name. | |
| 529 random.set_seed(document_name_.c_str()); | |
| 530 int num_pages = pages_.size(); | |
| 531 // Execute one random swap for each page in the document. | |
| 532 for (int i = 0; i < num_pages; ++i) { | |
| 533 int src = random.IntRand() % num_pages; | |
| 534 int dest = random.IntRand() % num_pages; | |
| 535 std::swap(pages_[src], pages_[dest]); | |
| 536 } | |
| 537 } | |
| 538 | |
| 539 // Locks the pages_mutex_ and loads as many pages as will fit into max_memory_ | |
| 540 // starting at index pages_offset_. | |
| 541 bool DocumentData::ReCachePages() { | |
| 542 std::lock_guard<std::mutex> lock(pages_mutex_); | |
| 543 // Read the file. | |
| 544 set_total_pages(0); | |
| 545 set_memory_used(0); | |
| 546 int loaded_pages = 0; | |
| 547 for (auto page : pages_) { | |
| 548 delete page; | |
| 549 } | |
| 550 pages_.clear(); | |
| 551 #if !defined(TESSERACT_IMAGEDATA_AS_PIX) | |
| 552 auto name_size = document_name_.size(); | |
| 553 if (name_size > 4 && document_name_.substr(name_size - 4) == ".png") { | |
| 554 // PNG image given instead of LSTMF file. | |
| 555 std::string gt_name = document_name_.substr(0, name_size - 3) + "gt.txt"; | |
| 556 std::ifstream t(gt_name); | |
| 557 std::string line; | |
| 558 std::getline(t, line); | |
| 559 t.close(); | |
| 560 ImageData *image_data = ImageData::Build(document_name_.c_str(), 0, "", nullptr, 0, line.c_str(), nullptr); | |
| 561 Image image = pixRead(document_name_.c_str()); | |
| 562 image_data->SetPix(image); | |
| 563 pages_.push_back(image_data); | |
| 564 loaded_pages = 1; | |
| 565 pages_offset_ %= loaded_pages; | |
| 566 set_total_pages(loaded_pages); | |
| 567 set_memory_used(memory_used() + image_data->MemoryUsed()); | |
| 568 #if 0 | |
| 569 tprintf("Loaded %zu/%d lines (%d-%zu) of document %s\n", pages_.size(), | |
| 570 loaded_pages, pages_offset_ + 1, pages_offset_ + pages_.size(), | |
| 571 document_name_.c_str()); | |
| 572 #endif | |
| 573 return !pages_.empty(); | |
| 574 } | |
| 575 #endif | |
| 576 TFile fp; | |
| 577 if (!fp.Open(document_name_.c_str(), reader_) || | |
| 578 !fp.DeSerializeSize(&loaded_pages) || loaded_pages <= 0) { | |
| 579 tprintf("Deserialize header failed: %s\n", document_name_.c_str()); | |
| 580 return false; | |
| 581 } | |
| 582 pages_offset_ %= loaded_pages; | |
| 583 // Skip pages before the first one we want, and load the rest until max | |
| 584 // memory and skip the rest after that. | |
| 585 int page; | |
| 586 for (page = 0; page < loaded_pages; ++page) { | |
| 587 uint8_t non_null; | |
| 588 if (!fp.DeSerialize(&non_null)) { | |
| 589 break; | |
| 590 } | |
| 591 if (page < pages_offset_ || | |
| 592 (max_memory_ > 0 && memory_used() > max_memory_)) { | |
| 593 if (non_null && !ImageData::SkipDeSerialize(&fp)) { | |
| 594 break; | |
| 595 } | |
| 596 } else { | |
| 597 ImageData *image_data = nullptr; | |
| 598 if (non_null) { | |
| 599 image_data = new ImageData; | |
| 600 if (!image_data->DeSerialize(&fp)) { | |
| 601 delete image_data; | |
| 602 break; | |
| 603 } | |
| 604 } | |
| 605 pages_.push_back(image_data); | |
| 606 if (image_data->imagefilename().empty()) { | |
| 607 image_data->set_imagefilename(document_name_); | |
| 608 image_data->set_page_number(page); | |
| 609 } | |
| 610 set_memory_used(memory_used() + image_data->MemoryUsed()); | |
| 611 } | |
| 612 } | |
| 613 if (page < loaded_pages) { | |
| 614 tprintf("Deserialize failed: %s read %d/%d lines\n", document_name_.c_str(), | |
| 615 page, loaded_pages); | |
| 616 for (auto page : pages_) { | |
| 617 delete page; | |
| 618 } | |
| 619 pages_.clear(); | |
| 620 } else if (loaded_pages > 1) { | |
| 621 // Avoid lots of messages for training with single line images. | |
| 622 tesserr << "Loaded " << pages_.size() << '/' << loaded_pages << " lines (" | |
| 623 << pages_offset_ + 1 << '-' | |
| 624 << pages_offset_ + pages_.size() << ") of document " | |
| 625 << document_name_ << '\n'; | |
| 626 } | |
| 627 set_total_pages(loaded_pages); | |
| 628 return !pages_.empty(); | |
| 629 } | |
| 630 | |
| 631 // A collection of DocumentData that knows roughly how much memory it is using. | |
| 632 DocumentCache::DocumentCache(int64_t max_memory) : max_memory_(max_memory) {} | |
| 633 | |
| 634 DocumentCache::~DocumentCache() { | |
| 635 for (auto *document : documents_) { | |
| 636 delete document; | |
| 637 } | |
| 638 } | |
| 639 | |
| 640 // Adds all the documents in the list of filenames, counting memory. | |
| 641 // The reader is used to read the files. | |
| 642 bool DocumentCache::LoadDocuments(const std::vector<std::string> &filenames, | |
| 643 CachingStrategy cache_strategy, | |
| 644 FileReader reader) { | |
| 645 cache_strategy_ = cache_strategy; | |
| 646 int64_t fair_share_memory = 0; | |
| 647 // In the round-robin case, each DocumentData handles restricting its content | |
| 648 // to its fair share of memory. In the sequential case, DocumentCache | |
| 649 // determines which DocumentDatas are held entirely in memory. | |
| 650 if (cache_strategy_ == CS_ROUND_ROBIN) { | |
| 651 fair_share_memory = max_memory_ / filenames.size(); | |
| 652 } | |
| 653 for (const auto &filename : filenames) { | |
| 654 auto *document = new DocumentData(filename); | |
| 655 document->SetDocument(filename.c_str(), fair_share_memory, reader); | |
| 656 AddToCache(document); | |
| 657 } | |
| 658 if (!documents_.empty()) { | |
| 659 // Try to get the first page now to verify the list of filenames. | |
| 660 if (GetPageBySerial(0) != nullptr) { | |
| 661 return true; | |
| 662 } | |
| 663 tprintf("Load of page 0 failed!\n"); | |
| 664 } | |
| 665 return false; | |
| 666 } | |
| 667 | |
| 668 // Adds document to the cache. | |
| 669 bool DocumentCache::AddToCache(DocumentData *data) { | |
| 670 documents_.push_back(data); | |
| 671 return true; | |
| 672 } | |
| 673 | |
| 674 // Finds and returns a document by name. | |
| 675 DocumentData *DocumentCache::FindDocument( | |
| 676 const std::string &document_name) const { | |
| 677 for (auto *document : documents_) { | |
| 678 if (document->document_name() == document_name) { | |
| 679 return document; | |
| 680 } | |
| 681 } | |
| 682 return nullptr; | |
| 683 } | |
| 684 | |
| 685 // Returns the total number of pages in an epoch. For CS_ROUND_ROBIN cache | |
| 686 // strategy, could take a long time. | |
| 687 int DocumentCache::TotalPages() { | |
| 688 if (cache_strategy_ == CS_SEQUENTIAL) { | |
| 689 // In sequential mode, we assume each doc has the same number of pages | |
| 690 // whether it is true or not. | |
| 691 if (num_pages_per_doc_ == 0) { | |
| 692 GetPageSequential(0); | |
| 693 } | |
| 694 return num_pages_per_doc_ * documents_.size(); | |
| 695 } | |
| 696 int total_pages = 0; | |
| 697 for (auto *document : documents_) { | |
| 698 // We have to load a page to make NumPages() valid. | |
| 699 document->GetPage(0); | |
| 700 total_pages += document->NumPages(); | |
| 701 } | |
| 702 return total_pages; | |
| 703 } | |
| 704 | |
| 705 // Returns a page by serial number, selecting them in a round-robin fashion | |
| 706 // from all the documents. Highly disk-intensive, but doesn't need samples | |
| 707 // to be shuffled between files to begin with. | |
| 708 const ImageData *DocumentCache::GetPageRoundRobin(int serial) { | |
| 709 int num_docs = documents_.size(); | |
| 710 int doc_index = serial % num_docs; | |
| 711 const ImageData *doc = documents_[doc_index]->GetPage(serial / num_docs); | |
| 712 for (int offset = 1; offset <= kMaxReadAhead && offset < num_docs; ++offset) { | |
| 713 doc_index = (serial + offset) % num_docs; | |
| 714 int page = (serial + offset) / num_docs; | |
| 715 documents_[doc_index]->LoadPageInBackground(page); | |
| 716 } | |
| 717 return doc; | |
| 718 } | |
| 719 | |
| 720 // Returns a page by serial number, selecting them in sequence from each file. | |
| 721 // Requires the samples to be shuffled between the files to give a random or | |
| 722 // uniform distribution of data. Less disk-intensive than GetPageRoundRobin. | |
| 723 const ImageData *DocumentCache::GetPageSequential(int serial) { | |
| 724 int num_docs = documents_.size(); | |
| 725 ASSERT_HOST(num_docs > 0); | |
| 726 if (num_pages_per_doc_ == 0) { | |
| 727 // Use the pages in the first doc as the number of pages in each doc. | |
| 728 documents_[0]->GetPage(0); | |
| 729 num_pages_per_doc_ = documents_[0]->NumPages(); | |
| 730 if (num_pages_per_doc_ == 0) { | |
| 731 tprintf("First document cannot be empty!!\n"); | |
| 732 ASSERT_HOST(num_pages_per_doc_ > 0); | |
| 733 } | |
| 734 // Get rid of zero now if we don't need it. | |
| 735 if (serial / num_pages_per_doc_ % num_docs > 0) { | |
| 736 documents_[0]->UnCache(); | |
| 737 } | |
| 738 } | |
| 739 int doc_index = serial / num_pages_per_doc_ % num_docs; | |
| 740 const ImageData *doc = | |
| 741 documents_[doc_index]->GetPage(serial % num_pages_per_doc_); | |
| 742 // Count up total memory. Background loading makes it more complicated to | |
| 743 // keep a running count. | |
| 744 int64_t total_memory = 0; | |
| 745 for (auto *document : documents_) { | |
| 746 total_memory += document->memory_used(); | |
| 747 } | |
| 748 if (total_memory >= max_memory_) { | |
| 749 // Find something to un-cache. | |
| 750 // If there are more than 3 in front, then serial is from the back reader | |
| 751 // of a pair of readers. If we un-cache from in-front-2 to 2-ahead, then | |
| 752 // we create a hole between them and then un-caching the backmost occupied | |
| 753 // will work for both. | |
| 754 int num_in_front = CountNeighbourDocs(doc_index, 1); | |
| 755 for (int offset = num_in_front - 2; | |
| 756 offset > 1 && total_memory >= max_memory_; --offset) { | |
| 757 int next_index = (doc_index + offset) % num_docs; | |
| 758 total_memory -= documents_[next_index]->UnCache(); | |
| 759 } | |
| 760 // If that didn't work, the best solution is to un-cache from the back. If | |
| 761 // we take away the document that a 2nd reader is using, it will put it | |
| 762 // back and make a hole between. | |
| 763 int num_behind = CountNeighbourDocs(doc_index, -1); | |
| 764 for (int offset = num_behind; offset < 0 && total_memory >= max_memory_; | |
| 765 ++offset) { | |
| 766 int next_index = (doc_index + offset + num_docs) % num_docs; | |
| 767 total_memory -= documents_[next_index]->UnCache(); | |
| 768 } | |
| 769 } | |
| 770 int next_index = (doc_index + 1) % num_docs; | |
| 771 if (!documents_[next_index]->IsCached() && total_memory < max_memory_) { | |
| 772 documents_[next_index]->LoadPageInBackground(0); | |
| 773 } | |
| 774 return doc; | |
| 775 } | |
| 776 | |
| 777 // Helper counts the number of adjacent cached neighbours of index looking in | |
| 778 // direction dir, ie index+dir, index+2*dir etc. | |
| 779 int DocumentCache::CountNeighbourDocs(int index, int dir) { | |
| 780 int num_docs = documents_.size(); | |
| 781 for (int offset = dir; abs(offset) < num_docs; offset += dir) { | |
| 782 int offset_index = (index + offset + num_docs) % num_docs; | |
| 783 if (!documents_[offset_index]->IsCached()) { | |
| 784 return offset - dir; | |
| 785 } | |
| 786 } | |
| 787 return num_docs; | |
| 788 } | |
| 789 | |
| 790 } // namespace tesseract. |
