comparison mupdf-source/thirdparty/tesseract/src/ccstruct/imagedata.cpp @ 2:b50eed0cc0ef upstream

ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4. The directory name has changed: no version number in the expanded directory now.
author Franz Glasner <fzglas.hg@dom66.de>
date Mon, 15 Sep 2025 11:43:07 +0200
parents
children
comparison
equal deleted inserted replaced
1:1d09e1dec1d9 2:b50eed0cc0ef
1 ///////////////////////////////////////////////////////////////////////
2 // File: imagedata.cpp
3 // Description: Class to hold information about a single multi-page tiff
4 // training file and its corresponding boxes or text file.
5 // Author: Ray Smith
6 //
7 // (C) Copyright 2013, Google Inc.
8 // Licensed under the Apache License, Version 2.0 (the "License");
9 // you may not use this file except in compliance with the License.
10 // You may obtain a copy of the License at
11 // http://www.apache.org/licenses/LICENSE-2.0
12 // Unless required by applicable law or agreed to in writing, software
13 // distributed under the License is distributed on an "AS IS" BASIS,
14 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 // See the License for the specific language governing permissions and
16 // limitations under the License.
17 ///////////////////////////////////////////////////////////////////////
18
19 // Include automatically generated configuration file if running autoconf.
20 #ifdef HAVE_CONFIG_H
21 # include "config_auto.h"
22 #endif
23
24 #include "imagedata.h"
25
26 #include "boxread.h" // for ReadMemBoxes
27 #include "rect.h" // for TBOX
28 #include "scrollview.h" // for ScrollView, ScrollView::CYAN, ScrollView::NONE
29 #include "tprintf.h" // for tprintf
30 #include "tesserrstream.h" // for tesserr
31
32 #include "helpers.h" // for IntCastRounded, TRand, ClipToRange, Modulo
33 #include "serialis.h" // for TFile
34
35 #include <allheaders.h> // for pixDestroy, pixGetHeight, pixGetWidth, lept_...
36
37 #include <cinttypes> // for PRId64
38 #include <fstream> // for std::ifstream
39
40 namespace tesseract {
41
// How many documents are requested ahead of the current one while training.
// A small value is sufficient.
constexpr int kMaxReadAhead = 8;
45
46 ImageData::ImageData() : page_number_(-1), vertical_text_(false) {}
47 // Takes ownership of the pix and destroys it.
48 ImageData::ImageData(bool vertical, Image pix)
49 : page_number_(0), vertical_text_(vertical) {
50 SetPix(pix);
51 }
52 ImageData::~ImageData() {
53 #ifdef TESSERACT_IMAGEDATA_AS_PIX
54 internal_pix_.destroy();
55 #endif
56 }
57
58 // Builds and returns an ImageData from the basic data. Note that imagedata,
59 // truth_text, and box_text are all the actual file data, NOT filenames.
60 ImageData *ImageData::Build(const char *name, int page_number, const char *lang,
61 const char *imagedata, int imagedatasize,
62 const char *truth_text, const char *box_text) {
63 auto *image_data = new ImageData();
64 image_data->imagefilename_ = name;
65 image_data->page_number_ = page_number;
66 image_data->language_ = lang;
67 // Save the imagedata.
68 // TODO: optimize resize (no init).
69 image_data->image_data_.resize(imagedatasize);
70 memcpy(&image_data->image_data_[0], imagedata, imagedatasize);
71 if (!image_data->AddBoxes(box_text)) {
72 if (truth_text == nullptr || truth_text[0] == '\0') {
73 tprintf("Error: No text corresponding to page %d from image %s!\n",
74 page_number, name);
75 delete image_data;
76 return nullptr;
77 }
78 image_data->transcription_ = truth_text;
79 // If we have no boxes, the transcription is in the 0th box_texts_.
80 image_data->box_texts_.emplace_back(truth_text);
81 // We will create a box for the whole image on PreScale, to save unpacking
82 // the image now.
83 } else if (truth_text != nullptr && truth_text[0] != '\0' &&
84 image_data->transcription_ != truth_text) {
85 // Save the truth text as it is present and disagrees with the box text.
86 image_data->transcription_ = truth_text;
87 }
88 return image_data;
89 }
90
91 // Writes to the given file. Returns false in case of error.
92 bool ImageData::Serialize(TFile *fp) const {
93 if (!fp->Serialize(imagefilename_)) {
94 return false;
95 }
96 if (!fp->Serialize(&page_number_)) {
97 return false;
98 }
99 if (!fp->Serialize(image_data_)) {
100 return false;
101 }
102 if (!fp->Serialize(language_)) {
103 return false;
104 }
105 if (!fp->Serialize(transcription_)) {
106 return false;
107 }
108 if (!fp->Serialize(boxes_)) {
109 return false;
110 }
111 if (!fp->Serialize(box_texts_)) {
112 return false;
113 }
114 int8_t vertical = vertical_text_;
115 return fp->Serialize(&vertical);
116 }
117
118 // Reads from the given file. Returns false in case of error.
119 bool ImageData::DeSerialize(TFile *fp) {
120 if (!fp->DeSerialize(imagefilename_)) {
121 return false;
122 }
123 if (!fp->DeSerialize(&page_number_)) {
124 return false;
125 }
126 if (!fp->DeSerialize(image_data_)) {
127 return false;
128 }
129 if (!fp->DeSerialize(language_)) {
130 return false;
131 }
132 if (!fp->DeSerialize(transcription_)) {
133 return false;
134 }
135 if (!fp->DeSerialize(boxes_)) {
136 return false;
137 }
138 if (!fp->DeSerialize(box_texts_)) {
139 return false;
140 }
141 int8_t vertical = 0;
142 if (!fp->DeSerialize(&vertical)) {
143 return false;
144 }
145 vertical_text_ = vertical != 0;
146 return true;
147 }
148
149 // As DeSerialize, but only seeks past the data - hence a static method.
150 bool ImageData::SkipDeSerialize(TFile *fp) {
151 if (!fp->DeSerializeSkip()) {
152 return false;
153 }
154 int32_t page_number;
155 if (!fp->DeSerialize(&page_number)) {
156 return false;
157 }
158 if (!fp->DeSerializeSkip()) {
159 return false;
160 }
161 if (!fp->DeSerializeSkip()) {
162 return false;
163 }
164 if (!fp->DeSerializeSkip()) {
165 return false;
166 }
167 if (!fp->DeSerializeSkip(sizeof(TBOX))) {
168 return false;
169 }
170 int32_t number;
171 if (!fp->DeSerialize(&number)) {
172 return false;
173 }
174 for (int i = 0; i < number; i++) {
175 if (!fp->DeSerializeSkip()) {
176 return false;
177 }
178 }
179 int8_t vertical = 0;
180 return fp->DeSerialize(&vertical);
181 }
182
183 // Saves the given Pix as a PNG-encoded string and destroys it.
184 // In case of missing PNG support in Leptonica use PNM format,
185 // which requires more memory.
186 void ImageData::SetPix(Image pix) {
187 #ifdef TESSERACT_IMAGEDATA_AS_PIX
188 internal_pix_ = pix;
189 #else
190 SetPixInternal(pix, &image_data_);
191 #endif
192 }
193
194 // Returns the Pix image for *this. Must be pixDestroyed after use.
195 Image ImageData::GetPix() const {
196 #ifdef TESSERACT_IMAGEDATA_AS_PIX
197 # ifdef GRAPHICS_DISABLED
198 /* The only caller of this is the scaling functions to prescale the
199 * source. Thus we can just return a new pointer to the same data. */
200 return internal_pix_.clone();
201 # else
202 /* pixCopy always does an actual copy, so the caller can modify the
203 * changed data. */
204 return internal_pix_.copy();
205 # endif
206 #else
207 return GetPixInternal(image_data_);
208 #endif
209 }
210
211 // Gets anything and everything with a non-nullptr pointer, prescaled to a
212 // given target_height (if 0, then the original image height), and aligned.
213 // Also returns (if not nullptr) the width and height of the scaled image.
214 // The return value is the scaled Pix, which must be pixDestroyed after use,
215 // and scale_factor (if not nullptr) is set to the scale factor that was applied
216 // to the image to achieve the target_height.
217 Image ImageData::PreScale(int target_height, int max_height,
218 float *scale_factor, int *scaled_width,
219 int *scaled_height, std::vector<TBOX> *boxes) const {
220 int input_width = 0;
221 int input_height = 0;
222 Image src_pix = GetPix();
223 ASSERT_HOST(src_pix != nullptr);
224 input_width = pixGetWidth(src_pix);
225 input_height = pixGetHeight(src_pix);
226 if (target_height == 0) {
227 target_height = std::min(input_height, max_height);
228 }
229 float im_factor = static_cast<float>(target_height) / input_height;
230 if (scaled_width != nullptr) {
231 *scaled_width = IntCastRounded(im_factor * input_width);
232 }
233 if (scaled_height != nullptr) {
234 *scaled_height = target_height;
235 }
236 // Get the scaled image.
237 Image pix = pixScale(src_pix, im_factor, im_factor);
238 if (pix == nullptr) {
239 tprintf("Scaling pix of size %d, %d by factor %g made null pix!!\n",
240 input_width, input_height, im_factor);
241 src_pix.destroy();
242 return nullptr;
243 }
244 if (scaled_width != nullptr) {
245 *scaled_width = pixGetWidth(pix);
246 }
247 if (scaled_height != nullptr) {
248 *scaled_height = pixGetHeight(pix);
249 }
250 src_pix.destroy();
251 if (boxes != nullptr) {
252 // Get the boxes.
253 boxes->clear();
254 for (auto box : boxes_) {
255 box.scale(im_factor);
256 boxes->push_back(box);
257 }
258 if (boxes->empty()) {
259 // Make a single box for the whole image.
260 TBOX box(0, 0, im_factor * input_width, target_height);
261 boxes->push_back(box);
262 }
263 }
264 if (scale_factor != nullptr) {
265 *scale_factor = im_factor;
266 }
267 return pix;
268 }
269
270 int ImageData::MemoryUsed() const {
271 return image_data_.size();
272 }
273
274 #ifndef GRAPHICS_DISABLED
275
276 // Draws the data in a new window.
277 void ImageData::Display() const {
278 const int kTextSize = 64;
279 // Draw the image.
280 Image pix = GetPix();
281 if (pix == nullptr) {
282 return;
283 }
284 int width = pixGetWidth(pix);
285 int height = pixGetHeight(pix);
286 auto *win = new ScrollView("Imagedata", 100, 100, 2 * (width + 2 * kTextSize),
287 2 * (height + 4 * kTextSize), width + 10,
288 height + 3 * kTextSize, true);
289 win->Draw(pix, 0, height - 1);
290 pix.destroy();
291 // Draw the boxes.
292 win->Pen(ScrollView::RED);
293 win->Brush(ScrollView::NONE);
294 int text_size = kTextSize;
295 if (!boxes_.empty() && boxes_[0].height() * 2 < text_size) {
296 text_size = boxes_[0].height() * 2;
297 }
298 win->TextAttributes("Arial", text_size, false, false, false);
299 if (!boxes_.empty()) {
300 for (unsigned b = 0; b < boxes_.size(); ++b) {
301 boxes_[b].plot(win);
302 win->Text(boxes_[b].left(), height + kTextSize, box_texts_[b].c_str());
303 }
304 } else {
305 // The full transcription.
306 win->Pen(ScrollView::CYAN);
307 win->Text(0, height + kTextSize * 2, transcription_.c_str());
308 }
309 win->Update();
310 win->Wait();
311 }
312
313 #endif
314
315 // Adds the supplied boxes and transcriptions that correspond to the correct
316 // page number.
317 void ImageData::AddBoxes(const std::vector<TBOX> &boxes,
318 const std::vector<std::string> &texts,
319 const std::vector<int> &box_pages) {
320 // Copy the boxes and make the transcription.
321 for (unsigned i = 0; i < box_pages.size(); ++i) {
322 if (page_number_ >= 0 && box_pages[i] != page_number_) {
323 continue;
324 }
325 transcription_ += texts[i];
326 boxes_.push_back(boxes[i]);
327 box_texts_.push_back(texts[i]);
328 }
329 }
330
331 #ifndef TESSERACT_IMAGEDATA_AS_PIX
332 // Saves the given Pix as a PNG-encoded string and destroys it.
333 // In case of missing PNG support in Leptonica use PNM format,
334 // which requires more memory.
335 void ImageData::SetPixInternal(Image pix, std::vector<char> *image_data) {
336 l_uint8 *data;
337 size_t size;
338 l_int32 ret;
339 ret = pixWriteMem(&data, &size, pix, IFF_PNG);
340 if (ret) {
341 ret = pixWriteMem(&data, &size, pix, IFF_PNM);
342 }
343 pix.destroy();
344 // TODO: optimize resize (no init).
345 image_data->resize(size);
346 memcpy(&(*image_data)[0], data, size);
347 lept_free(data);
348 }
349
350 // Returns the Pix image for the image_data. Must be pixDestroyed after use.
351 Image ImageData::GetPixInternal(const std::vector<char> &image_data) {
352 Image pix = nullptr;
353 if (!image_data.empty()) {
354 // Convert the array to an image.
355 const auto *u_data =
356 reinterpret_cast<const unsigned char *>(&image_data[0]);
357 pix = pixReadMem(u_data, image_data.size());
358 }
359 return pix;
360 }
361 #endif
362
363 // Parses the text string as a box file and adds any discovered boxes that
364 // match the page number. Returns false on error.
365 bool ImageData::AddBoxes(const char *box_text) {
366 if (box_text != nullptr && box_text[0] != '\0') {
367 std::vector<TBOX> boxes;
368 std::vector<std::string> texts;
369 std::vector<int> box_pages;
370 if (ReadMemBoxes(page_number_, /*skip_blanks*/ false, box_text,
371 /*continue_on_failure*/ true, &boxes, &texts, nullptr,
372 &box_pages)) {
373 AddBoxes(boxes, texts, box_pages);
374 return true;
375 } else {
376 tprintf("Error: No boxes for page %d from image %s!\n", page_number_,
377 imagefilename_.c_str());
378 }
379 }
380 return false;
381 }
382
383 DocumentData::DocumentData(const std::string &name)
384 : document_name_(name),
385 pages_offset_(-1),
386 total_pages_(-1),
387 memory_used_(0),
388 max_memory_(0),
389 reader_(nullptr) {}
390
391 DocumentData::~DocumentData() {
392 if (thread.joinable()) {
393 thread.join();
394 }
395 std::lock_guard<std::mutex> lock_p(pages_mutex_);
396 std::lock_guard<std::mutex> lock_g(general_mutex_);
397 for (auto data : pages_) {
398 delete data;
399 }
400 }
401
402 // Reads all the pages in the given lstmf filename to the cache. The reader
403 // is used to read the file.
404 bool DocumentData::LoadDocument(const char *filename, int start_page,
405 int64_t max_memory, FileReader reader) {
406 SetDocument(filename, max_memory, reader);
407 pages_offset_ = start_page;
408 return ReCachePages();
409 }
410
411 // Sets up the document, without actually loading it.
412 void DocumentData::SetDocument(const char *filename, int64_t max_memory,
413 FileReader reader) {
414 std::lock_guard<std::mutex> lock_p(pages_mutex_);
415 std::lock_guard<std::mutex> lock(general_mutex_);
416 document_name_ = filename;
417 pages_offset_ = -1;
418 max_memory_ = max_memory;
419 reader_ = reader;
420 }
421
422 // Writes all the pages to the given filename. Returns false on error.
423 bool DocumentData::SaveDocument(const char *filename, FileWriter writer) {
424 std::lock_guard<std::mutex> lock(pages_mutex_);
425 TFile fp;
426 fp.OpenWrite(nullptr);
427 if (!fp.Serialize(pages_) || !fp.CloseWrite(filename, writer)) {
428 tprintf("Serialize failed: %s\n", filename);
429 return false;
430 }
431 return true;
432 }
433
434 // Adds the given page data to this document, counting up memory.
435 void DocumentData::AddPageToDocument(ImageData *page) {
436 std::lock_guard<std::mutex> lock(pages_mutex_);
437 pages_.push_back(page);
438 set_memory_used(memory_used() + page->MemoryUsed());
439 }
440
441 // If the given index is not currently loaded, loads it using a separate
442 // thread.
443 void DocumentData::LoadPageInBackground(int index) {
444 ImageData *page = nullptr;
445 if (IsPageAvailable(index, &page)) {
446 return;
447 }
448 {
449 std::lock_guard<std::mutex> lock(pages_mutex_);
450 if (pages_offset_ == index) {
451 return;
452 }
453 pages_offset_ = index;
454 for (auto page : pages_) {
455 delete page;
456 }
457 pages_.clear();
458 }
459 if (thread.joinable()) {
460 thread.join();
461 }
462 // Don't run next statement asynchronously because that would
463 // create too many threads on Linux (see issue #3111).
464 ReCachePages();
465 }
466
467 // Returns a pointer to the page with the given index, modulo the total
468 // number of pages. Blocks until the background load is completed.
469 const ImageData *DocumentData::GetPage(int index) {
470 ImageData *page = nullptr;
471 while (!IsPageAvailable(index, &page)) {
472 // If there is no background load scheduled, schedule one now.
473 pages_mutex_.lock();
474 bool needs_loading = pages_offset_ != index;
475 pages_mutex_.unlock();
476 if (needs_loading) {
477 LoadPageInBackground(index);
478 }
479 // We can't directly load the page, or the background load will delete it
480 // while the caller is using it, so give it a chance to work.
481 std::this_thread::yield();
482 }
483 return page;
484 }
485
486 // Returns true if the requested page is available, and provides a pointer,
487 // which may be nullptr if the document is empty. May block, even though it
488 // doesn't guarantee to return true.
489 bool DocumentData::IsPageAvailable(int index, ImageData **page) {
490 std::lock_guard<std::mutex> lock(pages_mutex_);
491 int num_pages = NumPages();
492 if (num_pages == 0 || index < 0) {
493 *page = nullptr; // Empty Document.
494 return true;
495 }
496 if (num_pages > 0) {
497 index = Modulo(index, num_pages);
498 if (pages_offset_ <= index &&
499 static_cast<unsigned>(index) < pages_offset_ + pages_.size()) {
500 *page = pages_[index - pages_offset_]; // Page is available already.
501 return true;
502 }
503 }
504 return false;
505 }
506
507 // Removes all pages from memory and frees the memory, but does not forget
508 // the document metadata.
509 int64_t DocumentData::UnCache() {
510 std::lock_guard<std::mutex> lock(pages_mutex_);
511 int64_t memory_saved = memory_used();
512 for (auto page : pages_) {
513 delete page;
514 }
515 pages_.clear();
516 pages_offset_ = -1;
517 set_total_pages(-1);
518 set_memory_used(0);
519 tprintf("Unloaded document %s, saving %" PRId64 " memory\n",
520 document_name_.c_str(), memory_saved);
521 return memory_saved;
522 }
523
524 // Shuffles all the pages in the document.
525 void DocumentData::Shuffle() {
526 TRand random;
527 // Different documents get shuffled differently, but the same for the same
528 // name.
529 random.set_seed(document_name_.c_str());
530 int num_pages = pages_.size();
531 // Execute one random swap for each page in the document.
532 for (int i = 0; i < num_pages; ++i) {
533 int src = random.IntRand() % num_pages;
534 int dest = random.IntRand() % num_pages;
535 std::swap(pages_[src], pages_[dest]);
536 }
537 }
538
539 // Locks the pages_mutex_ and loads as many pages as will fit into max_memory_
540 // starting at index pages_offset_.
541 bool DocumentData::ReCachePages() {
542 std::lock_guard<std::mutex> lock(pages_mutex_);
543 // Read the file.
544 set_total_pages(0);
545 set_memory_used(0);
546 int loaded_pages = 0;
547 for (auto page : pages_) {
548 delete page;
549 }
550 pages_.clear();
551 #if !defined(TESSERACT_IMAGEDATA_AS_PIX)
552 auto name_size = document_name_.size();
553 if (name_size > 4 && document_name_.substr(name_size - 4) == ".png") {
554 // PNG image given instead of LSTMF file.
555 std::string gt_name = document_name_.substr(0, name_size - 3) + "gt.txt";
556 std::ifstream t(gt_name);
557 std::string line;
558 std::getline(t, line);
559 t.close();
560 ImageData *image_data = ImageData::Build(document_name_.c_str(), 0, "", nullptr, 0, line.c_str(), nullptr);
561 Image image = pixRead(document_name_.c_str());
562 image_data->SetPix(image);
563 pages_.push_back(image_data);
564 loaded_pages = 1;
565 pages_offset_ %= loaded_pages;
566 set_total_pages(loaded_pages);
567 set_memory_used(memory_used() + image_data->MemoryUsed());
568 #if 0
569 tprintf("Loaded %zu/%d lines (%d-%zu) of document %s\n", pages_.size(),
570 loaded_pages, pages_offset_ + 1, pages_offset_ + pages_.size(),
571 document_name_.c_str());
572 #endif
573 return !pages_.empty();
574 }
575 #endif
576 TFile fp;
577 if (!fp.Open(document_name_.c_str(), reader_) ||
578 !fp.DeSerializeSize(&loaded_pages) || loaded_pages <= 0) {
579 tprintf("Deserialize header failed: %s\n", document_name_.c_str());
580 return false;
581 }
582 pages_offset_ %= loaded_pages;
583 // Skip pages before the first one we want, and load the rest until max
584 // memory and skip the rest after that.
585 int page;
586 for (page = 0; page < loaded_pages; ++page) {
587 uint8_t non_null;
588 if (!fp.DeSerialize(&non_null)) {
589 break;
590 }
591 if (page < pages_offset_ ||
592 (max_memory_ > 0 && memory_used() > max_memory_)) {
593 if (non_null && !ImageData::SkipDeSerialize(&fp)) {
594 break;
595 }
596 } else {
597 ImageData *image_data = nullptr;
598 if (non_null) {
599 image_data = new ImageData;
600 if (!image_data->DeSerialize(&fp)) {
601 delete image_data;
602 break;
603 }
604 }
605 pages_.push_back(image_data);
606 if (image_data->imagefilename().empty()) {
607 image_data->set_imagefilename(document_name_);
608 image_data->set_page_number(page);
609 }
610 set_memory_used(memory_used() + image_data->MemoryUsed());
611 }
612 }
613 if (page < loaded_pages) {
614 tprintf("Deserialize failed: %s read %d/%d lines\n", document_name_.c_str(),
615 page, loaded_pages);
616 for (auto page : pages_) {
617 delete page;
618 }
619 pages_.clear();
620 } else if (loaded_pages > 1) {
621 // Avoid lots of messages for training with single line images.
622 tesserr << "Loaded " << pages_.size() << '/' << loaded_pages << " lines ("
623 << pages_offset_ + 1 << '-'
624 << pages_offset_ + pages_.size() << ") of document "
625 << document_name_ << '\n';
626 }
627 set_total_pages(loaded_pages);
628 return !pages_.empty();
629 }
630
631 // A collection of DocumentData that knows roughly how much memory it is using.
632 DocumentCache::DocumentCache(int64_t max_memory) : max_memory_(max_memory) {}
633
634 DocumentCache::~DocumentCache() {
635 for (auto *document : documents_) {
636 delete document;
637 }
638 }
639
640 // Adds all the documents in the list of filenames, counting memory.
641 // The reader is used to read the files.
642 bool DocumentCache::LoadDocuments(const std::vector<std::string> &filenames,
643 CachingStrategy cache_strategy,
644 FileReader reader) {
645 cache_strategy_ = cache_strategy;
646 int64_t fair_share_memory = 0;
647 // In the round-robin case, each DocumentData handles restricting its content
648 // to its fair share of memory. In the sequential case, DocumentCache
649 // determines which DocumentDatas are held entirely in memory.
650 if (cache_strategy_ == CS_ROUND_ROBIN) {
651 fair_share_memory = max_memory_ / filenames.size();
652 }
653 for (const auto &filename : filenames) {
654 auto *document = new DocumentData(filename);
655 document->SetDocument(filename.c_str(), fair_share_memory, reader);
656 AddToCache(document);
657 }
658 if (!documents_.empty()) {
659 // Try to get the first page now to verify the list of filenames.
660 if (GetPageBySerial(0) != nullptr) {
661 return true;
662 }
663 tprintf("Load of page 0 failed!\n");
664 }
665 return false;
666 }
667
668 // Adds document to the cache.
669 bool DocumentCache::AddToCache(DocumentData *data) {
670 documents_.push_back(data);
671 return true;
672 }
673
674 // Finds and returns a document by name.
675 DocumentData *DocumentCache::FindDocument(
676 const std::string &document_name) const {
677 for (auto *document : documents_) {
678 if (document->document_name() == document_name) {
679 return document;
680 }
681 }
682 return nullptr;
683 }
684
685 // Returns the total number of pages in an epoch. For CS_ROUND_ROBIN cache
686 // strategy, could take a long time.
687 int DocumentCache::TotalPages() {
688 if (cache_strategy_ == CS_SEQUENTIAL) {
689 // In sequential mode, we assume each doc has the same number of pages
690 // whether it is true or not.
691 if (num_pages_per_doc_ == 0) {
692 GetPageSequential(0);
693 }
694 return num_pages_per_doc_ * documents_.size();
695 }
696 int total_pages = 0;
697 for (auto *document : documents_) {
698 // We have to load a page to make NumPages() valid.
699 document->GetPage(0);
700 total_pages += document->NumPages();
701 }
702 return total_pages;
703 }
704
705 // Returns a page by serial number, selecting them in a round-robin fashion
706 // from all the documents. Highly disk-intensive, but doesn't need samples
707 // to be shuffled between files to begin with.
708 const ImageData *DocumentCache::GetPageRoundRobin(int serial) {
709 int num_docs = documents_.size();
710 int doc_index = serial % num_docs;
711 const ImageData *doc = documents_[doc_index]->GetPage(serial / num_docs);
712 for (int offset = 1; offset <= kMaxReadAhead && offset < num_docs; ++offset) {
713 doc_index = (serial + offset) % num_docs;
714 int page = (serial + offset) / num_docs;
715 documents_[doc_index]->LoadPageInBackground(page);
716 }
717 return doc;
718 }
719
720 // Returns a page by serial number, selecting them in sequence from each file.
721 // Requires the samples to be shuffled between the files to give a random or
722 // uniform distribution of data. Less disk-intensive than GetPageRoundRobin.
723 const ImageData *DocumentCache::GetPageSequential(int serial) {
724 int num_docs = documents_.size();
725 ASSERT_HOST(num_docs > 0);
726 if (num_pages_per_doc_ == 0) {
727 // Use the pages in the first doc as the number of pages in each doc.
728 documents_[0]->GetPage(0);
729 num_pages_per_doc_ = documents_[0]->NumPages();
730 if (num_pages_per_doc_ == 0) {
731 tprintf("First document cannot be empty!!\n");
732 ASSERT_HOST(num_pages_per_doc_ > 0);
733 }
734 // Get rid of zero now if we don't need it.
735 if (serial / num_pages_per_doc_ % num_docs > 0) {
736 documents_[0]->UnCache();
737 }
738 }
739 int doc_index = serial / num_pages_per_doc_ % num_docs;
740 const ImageData *doc =
741 documents_[doc_index]->GetPage(serial % num_pages_per_doc_);
742 // Count up total memory. Background loading makes it more complicated to
743 // keep a running count.
744 int64_t total_memory = 0;
745 for (auto *document : documents_) {
746 total_memory += document->memory_used();
747 }
748 if (total_memory >= max_memory_) {
749 // Find something to un-cache.
750 // If there are more than 3 in front, then serial is from the back reader
751 // of a pair of readers. If we un-cache from in-front-2 to 2-ahead, then
752 // we create a hole between them and then un-caching the backmost occupied
753 // will work for both.
754 int num_in_front = CountNeighbourDocs(doc_index, 1);
755 for (int offset = num_in_front - 2;
756 offset > 1 && total_memory >= max_memory_; --offset) {
757 int next_index = (doc_index + offset) % num_docs;
758 total_memory -= documents_[next_index]->UnCache();
759 }
760 // If that didn't work, the best solution is to un-cache from the back. If
761 // we take away the document that a 2nd reader is using, it will put it
762 // back and make a hole between.
763 int num_behind = CountNeighbourDocs(doc_index, -1);
764 for (int offset = num_behind; offset < 0 && total_memory >= max_memory_;
765 ++offset) {
766 int next_index = (doc_index + offset + num_docs) % num_docs;
767 total_memory -= documents_[next_index]->UnCache();
768 }
769 }
770 int next_index = (doc_index + 1) % num_docs;
771 if (!documents_[next_index]->IsCached() && total_memory < max_memory_) {
772 documents_[next_index]->LoadPageInBackground(0);
773 }
774 return doc;
775 }
776
777 // Helper counts the number of adjacent cached neighbours of index looking in
778 // direction dir, ie index+dir, index+2*dir etc.
779 int DocumentCache::CountNeighbourDocs(int index, int dir) {
780 int num_docs = documents_.size();
781 for (int offset = dir; abs(offset) < num_docs; offset += dir) {
782 int offset_index = (index + offset + num_docs) % num_docs;
783 if (!documents_[offset_index]->IsCached()) {
784 return offset - dir;
785 }
786 }
787 return num_docs;
788 }
789
790 } // namespace tesseract.