Mercurial > hgrepos > Python2 > PyMuPDF
diff mupdf-source/thirdparty/tesseract/src/ccstruct/imagedata.cpp @ 2:b50eed0cc0ef upstream
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
The directory name has changed: no version number in the expanded directory now.
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Mon, 15 Sep 2025 11:43:07 +0200 |
| parents | |
| children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/mupdf-source/thirdparty/tesseract/src/ccstruct/imagedata.cpp Mon Sep 15 11:43:07 2025 +0200 @@ -0,0 +1,790 @@ +/////////////////////////////////////////////////////////////////////// +// File: imagedata.cpp +// Description: Class to hold information about a single multi-page tiff +// training file and its corresponding boxes or text file. +// Author: Ray Smith +// +// (C) Copyright 2013, Google Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +/////////////////////////////////////////////////////////////////////// + +// Include automatically generated configuration file if running autoconf. +#ifdef HAVE_CONFIG_H +# include "config_auto.h" +#endif + +#include "imagedata.h" + +#include "boxread.h" // for ReadMemBoxes +#include "rect.h" // for TBOX +#include "scrollview.h" // for ScrollView, ScrollView::CYAN, ScrollView::NONE +#include "tprintf.h" // for tprintf +#include "tesserrstream.h" // for tesserr + +#include "helpers.h" // for IntCastRounded, TRand, ClipToRange, Modulo +#include "serialis.h" // for TFile + +#include <allheaders.h> // for pixDestroy, pixGetHeight, pixGetWidth, lept_... + +#include <cinttypes> // for PRId64 +#include <fstream> // for std::ifstream + +namespace tesseract { + +// Number of documents to read ahead while training. Doesn't need to be very +// large. +const int kMaxReadAhead = 8; + +ImageData::ImageData() : page_number_(-1), vertical_text_(false) {} +// Takes ownership of the pix and destroys it. +ImageData::ImageData(bool vertical, Image pix) + : page_number_(0), vertical_text_(vertical) { + SetPix(pix); +} +ImageData::~ImageData() { +#ifdef TESSERACT_IMAGEDATA_AS_PIX + internal_pix_.destroy(); +#endif +} + +// Builds and returns an ImageData from the basic data. Note that imagedata, +// truth_text, and box_text are all the actual file data, NOT filenames. +ImageData *ImageData::Build(const char *name, int page_number, const char *lang, + const char *imagedata, int imagedatasize, + const char *truth_text, const char *box_text) { + auto *image_data = new ImageData(); + image_data->imagefilename_ = name; + image_data->page_number_ = page_number; + image_data->language_ = lang; + // Save the imagedata. + // TODO: optimize resize (no init). + image_data->image_data_.resize(imagedatasize); + memcpy(&image_data->image_data_[0], imagedata, imagedatasize); + if (!image_data->AddBoxes(box_text)) { + if (truth_text == nullptr || truth_text[0] == '\0') { + tprintf("Error: No text corresponding to page %d from image %s!\n", + page_number, name); + delete image_data; + return nullptr; + } + image_data->transcription_ = truth_text; + // If we have no boxes, the transcription is in the 0th box_texts_. + image_data->box_texts_.emplace_back(truth_text); + // We will create a box for the whole image on PreScale, to save unpacking + // the image now. + } else if (truth_text != nullptr && truth_text[0] != '\0' && + image_data->transcription_ != truth_text) { + // Save the truth text as it is present and disagrees with the box text. + image_data->transcription_ = truth_text; + } + return image_data; +} + +// Writes to the given file. Returns false in case of error. +bool ImageData::Serialize(TFile *fp) const { + if (!fp->Serialize(imagefilename_)) { + return false; + } + if (!fp->Serialize(&page_number_)) { + return false; + } + if (!fp->Serialize(image_data_)) { + return false; + } + if (!fp->Serialize(language_)) { + return false; + } + if (!fp->Serialize(transcription_)) { + return false; + } + if (!fp->Serialize(boxes_)) { + return false; + } + if (!fp->Serialize(box_texts_)) { + return false; + } + int8_t vertical = vertical_text_; + return fp->Serialize(&vertical); +} + +// Reads from the given file. Returns false in case of error. +bool ImageData::DeSerialize(TFile *fp) { + if (!fp->DeSerialize(imagefilename_)) { + return false; + } + if (!fp->DeSerialize(&page_number_)) { + return false; + } + if (!fp->DeSerialize(image_data_)) { + return false; + } + if (!fp->DeSerialize(language_)) { + return false; + } + if (!fp->DeSerialize(transcription_)) { + return false; + } + if (!fp->DeSerialize(boxes_)) { + return false; + } + if (!fp->DeSerialize(box_texts_)) { + return false; + } + int8_t vertical = 0; + if (!fp->DeSerialize(&vertical)) { + return false; + } + vertical_text_ = vertical != 0; + return true; +} + +// As DeSerialize, but only seeks past the data - hence a static method. +bool ImageData::SkipDeSerialize(TFile *fp) { + if (!fp->DeSerializeSkip()) { + return false; + } + int32_t page_number; + if (!fp->DeSerialize(&page_number)) { + return false; + } + if (!fp->DeSerializeSkip()) { + return false; + } + if (!fp->DeSerializeSkip()) { + return false; + } + if (!fp->DeSerializeSkip()) { + return false; + } + if (!fp->DeSerializeSkip(sizeof(TBOX))) { + return false; + } + int32_t number; + if (!fp->DeSerialize(&number)) { + return false; + } + for (int i = 0; i < number; i++) { + if (!fp->DeSerializeSkip()) { + return false; + } + } + int8_t vertical = 0; + return fp->DeSerialize(&vertical); +} + +// Saves the given Pix as a PNG-encoded string and destroys it. +// In case of missing PNG support in Leptonica use PNM format, +// which requires more memory. +void ImageData::SetPix(Image pix) { +#ifdef TESSERACT_IMAGEDATA_AS_PIX + internal_pix_ = pix; +#else + SetPixInternal(pix, &image_data_); +#endif +} + +// Returns the Pix image for *this. Must be pixDestroyed after use. +Image ImageData::GetPix() const { +#ifdef TESSERACT_IMAGEDATA_AS_PIX +# ifdef GRAPHICS_DISABLED + /* The only caller of this is the scaling functions to prescale the + * source. Thus we can just return a new pointer to the same data. */ + return internal_pix_.clone(); +# else + /* pixCopy always does an actual copy, so the caller can modify the + * changed data. */ + return internal_pix_.copy(); +# endif +#else + return GetPixInternal(image_data_); +#endif +} + +// Gets anything and everything with a non-nullptr pointer, prescaled to a +// given target_height (if 0, then the original image height), and aligned. +// Also returns (if not nullptr) the width and height of the scaled image. +// The return value is the scaled Pix, which must be pixDestroyed after use, +// and scale_factor (if not nullptr) is set to the scale factor that was applied +// to the image to achieve the target_height. +Image ImageData::PreScale(int target_height, int max_height, + float *scale_factor, int *scaled_width, + int *scaled_height, std::vector<TBOX> *boxes) const { + int input_width = 0; + int input_height = 0; + Image src_pix = GetPix(); + ASSERT_HOST(src_pix != nullptr); + input_width = pixGetWidth(src_pix); + input_height = pixGetHeight(src_pix); + if (target_height == 0) { + target_height = std::min(input_height, max_height); + } + float im_factor = static_cast<float>(target_height) / input_height; + if (scaled_width != nullptr) { + *scaled_width = IntCastRounded(im_factor * input_width); + } + if (scaled_height != nullptr) { + *scaled_height = target_height; + } + // Get the scaled image. + Image pix = pixScale(src_pix, im_factor, im_factor); + if (pix == nullptr) { + tprintf("Scaling pix of size %d, %d by factor %g made null pix!!\n", + input_width, input_height, im_factor); + src_pix.destroy(); + return nullptr; + } + if (scaled_width != nullptr) { + *scaled_width = pixGetWidth(pix); + } + if (scaled_height != nullptr) { + *scaled_height = pixGetHeight(pix); + } + src_pix.destroy(); + if (boxes != nullptr) { + // Get the boxes. + boxes->clear(); + for (auto box : boxes_) { + box.scale(im_factor); + boxes->push_back(box); + } + if (boxes->empty()) { + // Make a single box for the whole image. + TBOX box(0, 0, im_factor * input_width, target_height); + boxes->push_back(box); + } + } + if (scale_factor != nullptr) { + *scale_factor = im_factor; + } + return pix; +} + +int ImageData::MemoryUsed() const { + return image_data_.size(); +} + +#ifndef GRAPHICS_DISABLED + +// Draws the data in a new window. +void ImageData::Display() const { + const int kTextSize = 64; + // Draw the image. + Image pix = GetPix(); + if (pix == nullptr) { + return; + } + int width = pixGetWidth(pix); + int height = pixGetHeight(pix); + auto *win = new ScrollView("Imagedata", 100, 100, 2 * (width + 2 * kTextSize), + 2 * (height + 4 * kTextSize), width + 10, + height + 3 * kTextSize, true); + win->Draw(pix, 0, height - 1); + pix.destroy(); + // Draw the boxes. + win->Pen(ScrollView::RED); + win->Brush(ScrollView::NONE); + int text_size = kTextSize; + if (!boxes_.empty() && boxes_[0].height() * 2 < text_size) { + text_size = boxes_[0].height() * 2; + } + win->TextAttributes("Arial", text_size, false, false, false); + if (!boxes_.empty()) { + for (unsigned b = 0; b < boxes_.size(); ++b) { + boxes_[b].plot(win); + win->Text(boxes_[b].left(), height + kTextSize, box_texts_[b].c_str()); + } + } else { + // The full transcription. + win->Pen(ScrollView::CYAN); + win->Text(0, height + kTextSize * 2, transcription_.c_str()); + } + win->Update(); + win->Wait(); +} + +#endif + +// Adds the supplied boxes and transcriptions that correspond to the correct +// page number. +void ImageData::AddBoxes(const std::vector<TBOX> &boxes, + const std::vector<std::string> &texts, + const std::vector<int> &box_pages) { + // Copy the boxes and make the transcription. + for (unsigned i = 0; i < box_pages.size(); ++i) { + if (page_number_ >= 0 && box_pages[i] != page_number_) { + continue; + } + transcription_ += texts[i]; + boxes_.push_back(boxes[i]); + box_texts_.push_back(texts[i]); + } +} + +#ifndef TESSERACT_IMAGEDATA_AS_PIX +// Saves the given Pix as a PNG-encoded string and destroys it. +// In case of missing PNG support in Leptonica use PNM format, +// which requires more memory. +void ImageData::SetPixInternal(Image pix, std::vector<char> *image_data) { + l_uint8 *data; + size_t size; + l_int32 ret; + ret = pixWriteMem(&data, &size, pix, IFF_PNG); + if (ret) { + ret = pixWriteMem(&data, &size, pix, IFF_PNM); + } + pix.destroy(); + // TODO: optimize resize (no init). + image_data->resize(size); + memcpy(&(*image_data)[0], data, size); + lept_free(data); +} + +// Returns the Pix image for the image_data. Must be pixDestroyed after use. +Image ImageData::GetPixInternal(const std::vector<char> &image_data) { + Image pix = nullptr; + if (!image_data.empty()) { + // Convert the array to an image. + const auto *u_data = + reinterpret_cast<const unsigned char *>(&image_data[0]); + pix = pixReadMem(u_data, image_data.size()); + } + return pix; +} +#endif + +// Parses the text string as a box file and adds any discovered boxes that +// match the page number. Returns false on error. +bool ImageData::AddBoxes(const char *box_text) { + if (box_text != nullptr && box_text[0] != '\0') { + std::vector<TBOX> boxes; + std::vector<std::string> texts; + std::vector<int> box_pages; + if (ReadMemBoxes(page_number_, /*skip_blanks*/ false, box_text, + /*continue_on_failure*/ true, &boxes, &texts, nullptr, + &box_pages)) { + AddBoxes(boxes, texts, box_pages); + return true; + } else { + tprintf("Error: No boxes for page %d from image %s!\n", page_number_, + imagefilename_.c_str()); + } + } + return false; +} + +DocumentData::DocumentData(const std::string &name) + : document_name_(name), + pages_offset_(-1), + total_pages_(-1), + memory_used_(0), + max_memory_(0), + reader_(nullptr) {} + +DocumentData::~DocumentData() { + if (thread.joinable()) { + thread.join(); + } + std::lock_guard<std::mutex> lock_p(pages_mutex_); + std::lock_guard<std::mutex> lock_g(general_mutex_); + for (auto data : pages_) { + delete data; + } +} + +// Reads all the pages in the given lstmf filename to the cache. The reader +// is used to read the file. +bool DocumentData::LoadDocument(const char *filename, int start_page, + int64_t max_memory, FileReader reader) { + SetDocument(filename, max_memory, reader); + pages_offset_ = start_page; + return ReCachePages(); +} + +// Sets up the document, without actually loading it. +void DocumentData::SetDocument(const char *filename, int64_t max_memory, + FileReader reader) { + std::lock_guard<std::mutex> lock_p(pages_mutex_); + std::lock_guard<std::mutex> lock(general_mutex_); + document_name_ = filename; + pages_offset_ = -1; + max_memory_ = max_memory; + reader_ = reader; +} + +// Writes all the pages to the given filename. Returns false on error. +bool DocumentData::SaveDocument(const char *filename, FileWriter writer) { + std::lock_guard<std::mutex> lock(pages_mutex_); + TFile fp; + fp.OpenWrite(nullptr); + if (!fp.Serialize(pages_) || !fp.CloseWrite(filename, writer)) { + tprintf("Serialize failed: %s\n", filename); + return false; + } + return true; +} + +// Adds the given page data to this document, counting up memory. +void DocumentData::AddPageToDocument(ImageData *page) { + std::lock_guard<std::mutex> lock(pages_mutex_); + pages_.push_back(page); + set_memory_used(memory_used() + page->MemoryUsed()); +} + +// If the given index is not currently loaded, loads it using a separate +// thread. +void DocumentData::LoadPageInBackground(int index) { + ImageData *page = nullptr; + if (IsPageAvailable(index, &page)) { + return; + } + { + std::lock_guard<std::mutex> lock(pages_mutex_); + if (pages_offset_ == index) { + return; + } + pages_offset_ = index; + for (auto page : pages_) { + delete page; + } + pages_.clear(); + } + if (thread.joinable()) { + thread.join(); + } + // Don't run next statement asynchronously because that would + // create too many threads on Linux (see issue #3111). + ReCachePages(); +} + +// Returns a pointer to the page with the given index, modulo the total +// number of pages. Blocks until the background load is completed. +const ImageData *DocumentData::GetPage(int index) { + ImageData *page = nullptr; + while (!IsPageAvailable(index, &page)) { + // If there is no background load scheduled, schedule one now. + pages_mutex_.lock(); + bool needs_loading = pages_offset_ != index; + pages_mutex_.unlock(); + if (needs_loading) { + LoadPageInBackground(index); + } + // We can't directly load the page, or the background load will delete it + // while the caller is using it, so give it a chance to work. + std::this_thread::yield(); + } + return page; +} + +// Returns true if the requested page is available, and provides a pointer, +// which may be nullptr if the document is empty. May block, even though it +// doesn't guarantee to return true. +bool DocumentData::IsPageAvailable(int index, ImageData **page) { + std::lock_guard<std::mutex> lock(pages_mutex_); + int num_pages = NumPages(); + if (num_pages == 0 || index < 0) { + *page = nullptr; // Empty Document. + return true; + } + if (num_pages > 0) { + index = Modulo(index, num_pages); + if (pages_offset_ <= index && + static_cast<unsigned>(index) < pages_offset_ + pages_.size()) { + *page = pages_[index - pages_offset_]; // Page is available already. + return true; + } + } + return false; +} + +// Removes all pages from memory and frees the memory, but does not forget +// the document metadata. +int64_t DocumentData::UnCache() { + std::lock_guard<std::mutex> lock(pages_mutex_); + int64_t memory_saved = memory_used(); + for (auto page : pages_) { + delete page; + } + pages_.clear(); + pages_offset_ = -1; + set_total_pages(-1); + set_memory_used(0); + tprintf("Unloaded document %s, saving %" PRId64 " memory\n", + document_name_.c_str(), memory_saved); + return memory_saved; +} + +// Shuffles all the pages in the document. +void DocumentData::Shuffle() { + TRand random; + // Different documents get shuffled differently, but the same for the same + // name. + random.set_seed(document_name_.c_str()); + int num_pages = pages_.size(); + // Execute one random swap for each page in the document. + for (int i = 0; i < num_pages; ++i) { + int src = random.IntRand() % num_pages; + int dest = random.IntRand() % num_pages; + std::swap(pages_[src], pages_[dest]); + } +} + +// Locks the pages_mutex_ and loads as many pages as will fit into max_memory_ +// starting at index pages_offset_. +bool DocumentData::ReCachePages() { + std::lock_guard<std::mutex> lock(pages_mutex_); + // Read the file. + set_total_pages(0); + set_memory_used(0); + int loaded_pages = 0; + for (auto page : pages_) { + delete page; + } + pages_.clear(); +#if !defined(TESSERACT_IMAGEDATA_AS_PIX) + auto name_size = document_name_.size(); + if (name_size > 4 && document_name_.substr(name_size - 4) == ".png") { + // PNG image given instead of LSTMF file. + std::string gt_name = document_name_.substr(0, name_size - 3) + "gt.txt"; + std::ifstream t(gt_name); + std::string line; + std::getline(t, line); + t.close(); + ImageData *image_data = ImageData::Build(document_name_.c_str(), 0, "", nullptr, 0, line.c_str(), nullptr); + Image image = pixRead(document_name_.c_str()); + image_data->SetPix(image); + pages_.push_back(image_data); + loaded_pages = 1; + pages_offset_ %= loaded_pages; + set_total_pages(loaded_pages); + set_memory_used(memory_used() + image_data->MemoryUsed()); +#if 0 + tprintf("Loaded %zu/%d lines (%d-%zu) of document %s\n", pages_.size(), + loaded_pages, pages_offset_ + 1, pages_offset_ + pages_.size(), + document_name_.c_str()); +#endif + return !pages_.empty(); + } +#endif + TFile fp; + if (!fp.Open(document_name_.c_str(), reader_) || + !fp.DeSerializeSize(&loaded_pages) || loaded_pages <= 0) { + tprintf("Deserialize header failed: %s\n", document_name_.c_str()); + return false; + } + pages_offset_ %= loaded_pages; + // Skip pages before the first one we want, and load the rest until max + // memory and skip the rest after that. + int page; + for (page = 0; page < loaded_pages; ++page) { + uint8_t non_null; + if (!fp.DeSerialize(&non_null)) { + break; + } + if (page < pages_offset_ || + (max_memory_ > 0 && memory_used() > max_memory_)) { + if (non_null && !ImageData::SkipDeSerialize(&fp)) { + break; + } + } else { + ImageData *image_data = nullptr; + if (non_null) { + image_data = new ImageData; + if (!image_data->DeSerialize(&fp)) { + delete image_data; + break; + } + } + pages_.push_back(image_data); + if (image_data->imagefilename().empty()) { + image_data->set_imagefilename(document_name_); + image_data->set_page_number(page); + } + set_memory_used(memory_used() + image_data->MemoryUsed()); + } + } + if (page < loaded_pages) { + tprintf("Deserialize failed: %s read %d/%d lines\n", document_name_.c_str(), + page, loaded_pages); + for (auto page : pages_) { + delete page; + } + pages_.clear(); + } else if (loaded_pages > 1) { + // Avoid lots of messages for training with single line images. + tesserr << "Loaded " << pages_.size() << '/' << loaded_pages << " lines (" + << pages_offset_ + 1 << '-' + << pages_offset_ + pages_.size() << ") of document " + << document_name_ << '\n'; + } + set_total_pages(loaded_pages); + return !pages_.empty(); +} + +// A collection of DocumentData that knows roughly how much memory it is using. +DocumentCache::DocumentCache(int64_t max_memory) : max_memory_(max_memory) {} + +DocumentCache::~DocumentCache() { + for (auto *document : documents_) { + delete document; + } +} + +// Adds all the documents in the list of filenames, counting memory. +// The reader is used to read the files. +bool DocumentCache::LoadDocuments(const std::vector<std::string> &filenames, + CachingStrategy cache_strategy, + FileReader reader) { + cache_strategy_ = cache_strategy; + int64_t fair_share_memory = 0; + // In the round-robin case, each DocumentData handles restricting its content + // to its fair share of memory. In the sequential case, DocumentCache + // determines which DocumentDatas are held entirely in memory. + if (cache_strategy_ == CS_ROUND_ROBIN) { + fair_share_memory = max_memory_ / filenames.size(); + } + for (const auto &filename : filenames) { + auto *document = new DocumentData(filename); + document->SetDocument(filename.c_str(), fair_share_memory, reader); + AddToCache(document); + } + if (!documents_.empty()) { + // Try to get the first page now to verify the list of filenames. + if (GetPageBySerial(0) != nullptr) { + return true; + } + tprintf("Load of page 0 failed!\n"); + } + return false; +} + +// Adds document to the cache. +bool DocumentCache::AddToCache(DocumentData *data) { + documents_.push_back(data); + return true; +} + +// Finds and returns a document by name. +DocumentData *DocumentCache::FindDocument( + const std::string &document_name) const { + for (auto *document : documents_) { + if (document->document_name() == document_name) { + return document; + } + } + return nullptr; +} + +// Returns the total number of pages in an epoch. For CS_ROUND_ROBIN cache +// strategy, could take a long time. +int DocumentCache::TotalPages() { + if (cache_strategy_ == CS_SEQUENTIAL) { + // In sequential mode, we assume each doc has the same number of pages + // whether it is true or not. + if (num_pages_per_doc_ == 0) { + GetPageSequential(0); + } + return num_pages_per_doc_ * documents_.size(); + } + int total_pages = 0; + for (auto *document : documents_) { + // We have to load a page to make NumPages() valid. + document->GetPage(0); + total_pages += document->NumPages(); + } + return total_pages; +} + +// Returns a page by serial number, selecting them in a round-robin fashion +// from all the documents. Highly disk-intensive, but doesn't need samples +// to be shuffled between files to begin with. +const ImageData *DocumentCache::GetPageRoundRobin(int serial) { + int num_docs = documents_.size(); + int doc_index = serial % num_docs; + const ImageData *doc = documents_[doc_index]->GetPage(serial / num_docs); + for (int offset = 1; offset <= kMaxReadAhead && offset < num_docs; ++offset) { + doc_index = (serial + offset) % num_docs; + int page = (serial + offset) / num_docs; + documents_[doc_index]->LoadPageInBackground(page); + } + return doc; +} + +// Returns a page by serial number, selecting them in sequence from each file. +// Requires the samples to be shuffled between the files to give a random or +// uniform distribution of data. Less disk-intensive than GetPageRoundRobin. +const ImageData *DocumentCache::GetPageSequential(int serial) { + int num_docs = documents_.size(); + ASSERT_HOST(num_docs > 0); + if (num_pages_per_doc_ == 0) { + // Use the pages in the first doc as the number of pages in each doc. + documents_[0]->GetPage(0); + num_pages_per_doc_ = documents_[0]->NumPages(); + if (num_pages_per_doc_ == 0) { + tprintf("First document cannot be empty!!\n"); + ASSERT_HOST(num_pages_per_doc_ > 0); + } + // Get rid of zero now if we don't need it. + if (serial / num_pages_per_doc_ % num_docs > 0) { + documents_[0]->UnCache(); + } + } + int doc_index = serial / num_pages_per_doc_ % num_docs; + const ImageData *doc = + documents_[doc_index]->GetPage(serial % num_pages_per_doc_); + // Count up total memory. Background loading makes it more complicated to + // keep a running count. + int64_t total_memory = 0; + for (auto *document : documents_) { + total_memory += document->memory_used(); + } + if (total_memory >= max_memory_) { + // Find something to un-cache. + // If there are more than 3 in front, then serial is from the back reader + // of a pair of readers. If we un-cache from in-front-2 to 2-ahead, then + // we create a hole between them and then un-caching the backmost occupied + // will work for both. + int num_in_front = CountNeighbourDocs(doc_index, 1); + for (int offset = num_in_front - 2; + offset > 1 && total_memory >= max_memory_; --offset) { + int next_index = (doc_index + offset) % num_docs; + total_memory -= documents_[next_index]->UnCache(); + } + // If that didn't work, the best solution is to un-cache from the back. If + // we take away the document that a 2nd reader is using, it will put it + // back and make a hole between. + int num_behind = CountNeighbourDocs(doc_index, -1); + for (int offset = num_behind; offset < 0 && total_memory >= max_memory_; + ++offset) { + int next_index = (doc_index + offset + num_docs) % num_docs; + total_memory -= documents_[next_index]->UnCache(); + } + } + int next_index = (doc_index + 1) % num_docs; + if (!documents_[next_index]->IsCached() && total_memory < max_memory_) { + documents_[next_index]->LoadPageInBackground(0); + } + return doc; +} + +// Helper counts the number of adjacent cached neighbours of index looking in +// direction dir, ie index+dir, index+2*dir etc. +int DocumentCache::CountNeighbourDocs(int index, int dir) { + int num_docs = documents_.size(); + for (int offset = dir; abs(offset) < num_docs; offset += dir) { + int offset_index = (index + offset + num_docs) % num_docs; + if (!documents_[offset_index]->IsCached()) { + return offset - dir; + } + } + return num_docs; +} + +} // namespace tesseract.
