Mercurial > hgrepos > Python2 > PyMuPDF
comparison mupdf-source/thirdparty/tesseract/src/ccstruct/imagedata.h @ 2:b50eed0cc0ef upstream
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
The directory name has changed: no version number in the expanded directory now.
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Mon, 15 Sep 2025 11:43:07 +0200 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| 1:1d09e1dec1d9 | 2:b50eed0cc0ef |
|---|---|
| 1 /////////////////////////////////////////////////////////////////////// | |
| 2 // File: imagedata.h | |
| 3 // Description: Class to hold information about a single image and its | |
| 4 // corresponding boxes or text file. | |
| 5 // Author: Ray Smith | |
| 6 // | |
| 7 // (C) Copyright 2013, Google Inc. | |
| 8 // Licensed under the Apache License, Version 2.0 (the "License"); | |
| 9 // you may not use this file except in compliance with the License. | |
| 10 // You may obtain a copy of the License at | |
| 11 // http://www.apache.org/licenses/LICENSE-2.0 | |
| 12 // Unless required by applicable law or agreed to in writing, software | |
| 13 // distributed under the License is distributed on an "AS IS" BASIS, | |
| 14 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
| 15 // See the License for the specific language governing permissions and | |
| 16 // limitations under the License. | |
| 17 /////////////////////////////////////////////////////////////////////// | |
| 18 | |
| 19 #ifndef TESSERACT_IMAGE_IMAGEDATA_H_ | |
| 20 #define TESSERACT_IMAGE_IMAGEDATA_H_ | |
| 21 | |
| 22 #include "image.h" | |
| 23 #include "points.h" // for FCOORD | |
| 24 | |
| 25 #include <mutex> // for std::mutex | |
| 26 #include <thread> // for std::thread | |
| 27 | |
| 28 struct Pix; | |
| 29 | |
| 30 namespace tesseract { | |
| 31 | |
| 32 class TFile; | |
| 33 class ScrollView; | |
| 34 class TBOX; | |
| 35 | |
// Padding, measured in output pixels, that is applied in feature mode.
constexpr int kFeaturePadding = 2;
// Padding, measured in pixels, that is added around text boxes.
constexpr int kImagePadding = 4;
| 40 | |
// Selects how samples are cached and in which order they are served.
enum CachingStrategy {
  // One file is read completely before the next one is touched, so the
  // samples have to be shuffled across the files beforehand. To get fast
  // random access, the number of samples in the FIRST file is taken to be the
  // number in every file. Consequently a later file that is smaller has some
  // of its entries used more than once, and a later file that is larger has
  // some entries that are never used.
  // Best suited to larger data sets that do not fit in memory.
  CS_SEQUENTIAL = 0,
  // One sample is taken from each file in turn. The samples need not be
  // shuffled, but the disk load is extremely high, and samples in smaller
  // files are used more often than samples in larger files.
  // Best suited to smaller data sets that mostly fit in memory.
  CS_ROUND_ROBIN = 1,
};
| 56 | |
| 57 // Class to hold information on a single image: | |
| 58 // Filename, cached image as a Pix*, character boxes, text transcription. | |
| 59 // The text transcription is the ground truth UTF-8 text for the image. | |
| 60 // Character boxes are optional and indicate the desired segmentation of | |
| 61 // the text into recognition units. | |
| 62 class TESS_API ImageData { | |
| 63 public: | |
| 64 ImageData(); | |
| 65 // Takes ownership of the pix. | |
| 66 ImageData(bool vertical, Image pix); | |
| 67 ~ImageData(); | |
| 68 | |
| 69 // Builds and returns an ImageData from the basic data. Note that imagedata, | |
| 70 // truth_text, and box_text are all the actual file data, NOT filenames. | |
| 71 static ImageData *Build(const char *name, int page_number, const char *lang, | |
| 72 const char *imagedata, int imagedatasize, const char *truth_text, | |
| 73 const char *box_text); | |
| 74 | |
| 75 // Writes to the given file. Returns false in case of error. | |
| 76 bool Serialize(TFile *fp) const; | |
| 77 // Reads from the given file. Returns false in case of error. | |
| 78 bool DeSerialize(TFile *fp); | |
| 79 // As DeSerialize, but only seeks past the data - hence a static method. | |
| 80 static bool SkipDeSerialize(TFile *fp); | |
| 81 | |
| 82 // Other accessors. | |
| 83 const std::string &imagefilename() const { | |
| 84 return imagefilename_; | |
| 85 } | |
| 86 void set_imagefilename(const std::string &name) { | |
| 87 imagefilename_ = name; | |
| 88 } | |
| 89 int page_number() const { | |
| 90 return page_number_; | |
| 91 } | |
| 92 void set_page_number(int num) { | |
| 93 page_number_ = num; | |
| 94 } | |
| 95 const std::vector<char> &image_data() const { | |
| 96 return image_data_; | |
| 97 } | |
| 98 const std::string &language() const { | |
| 99 return language_; | |
| 100 } | |
| 101 void set_language(const std::string &lang) { | |
| 102 language_ = lang; | |
| 103 } | |
| 104 const std::string &transcription() const { | |
| 105 return transcription_; | |
| 106 } | |
| 107 const std::vector<TBOX> &boxes() const { | |
| 108 return boxes_; | |
| 109 } | |
| 110 const std::vector<std::string> &box_texts() const { | |
| 111 return box_texts_; | |
| 112 } | |
| 113 const std::string &box_text(int index) const { | |
| 114 return box_texts_[index]; | |
| 115 } | |
| 116 // Saves the given Pix as a PNG-encoded string and destroys it. | |
| 117 // In case of missing PNG support in Leptonica use PNM format, | |
| 118 // which requires more memory. | |
| 119 void SetPix(Image pix); | |
| 120 // Returns the Pix image for *this. Must be pixDestroyed after use. | |
| 121 Image GetPix() const; | |
| 122 // Gets anything and everything with a non-nullptr pointer, prescaled to a | |
| 123 // given target_height (if 0, then the original image height), and aligned. | |
| 124 // Also returns (if not nullptr) the width and height of the scaled image. | |
| 125 // The return value is the scaled Pix, which must be pixDestroyed after use, | |
| 126 // and scale_factor (if not nullptr) is set to the scale factor that was | |
| 127 // applied to the image to achieve the target_height. | |
| 128 Image PreScale(int target_height, int max_height, float *scale_factor, int *scaled_width, | |
| 129 int *scaled_height, std::vector<TBOX> *boxes) const; | |
| 130 | |
| 131 int MemoryUsed() const; | |
| 132 | |
| 133 // Draws the data in a new window. | |
| 134 void Display() const; | |
| 135 | |
| 136 // Adds the supplied boxes and transcriptions that correspond to the correct | |
| 137 // page number. | |
| 138 void AddBoxes(const std::vector<TBOX> &boxes, const std::vector<std::string> &texts, | |
| 139 const std::vector<int> &box_pages); | |
| 140 | |
| 141 private: | |
| 142 // Saves the given Pix as a PNG-encoded string and destroys it. | |
| 143 // In case of missing PNG support in Leptonica use PNM format, | |
| 144 // which requires more memory. | |
| 145 static void SetPixInternal(Image pix, std::vector<char> *image_data); | |
| 146 // Returns the Pix image for the image_data. Must be pixDestroyed after use. | |
| 147 static Image GetPixInternal(const std::vector<char> &image_data); | |
| 148 // Parses the text string as a box file and adds any discovered boxes that | |
| 149 // match the page number. Returns false on error. | |
| 150 bool AddBoxes(const char *box_text); | |
| 151 | |
| 152 private: | |
| 153 std::string imagefilename_; // File to read image from. | |
| 154 int32_t page_number_; // Page number if multi-page tif or -1. | |
| 155 // see https://github.com/tesseract-ocr/tesseract/pull/2965 | |
| 156 // EP: reconsider for tess6.0/opencv | |
| 157 #ifdef TESSERACT_IMAGEDATA_AS_PIX | |
| 158 Image internal_pix_; | |
| 159 #endif | |
| 160 std::vector<char> image_data_; // PNG/PNM file data. | |
| 161 std::string language_; // Language code for image. | |
| 162 std::string transcription_; // UTF-8 ground truth of image. | |
| 163 std::vector<TBOX> boxes_; // If non-empty boxes of the image. | |
| 164 std::vector<std::string> box_texts_; // String for text in each box. | |
| 165 bool vertical_text_; // Image has been rotated from vertical. | |
| 166 }; | |
| 167 | |
| 168 // A collection of ImageData that knows roughly how much memory it is using. | |
| 169 class DocumentData { | |
| 170 public: | |
| 171 TESS_API | |
| 172 explicit DocumentData(const std::string &name); | |
| 173 TESS_API | |
| 174 ~DocumentData(); | |
| 175 | |
| 176 // Reads all the pages in the given lstmf filename to the cache. The reader | |
| 177 // is used to read the file. | |
| 178 TESS_API | |
| 179 bool LoadDocument(const char *filename, int start_page, int64_t max_memory, FileReader reader); | |
| 180 // Sets up the document, without actually loading it. | |
| 181 void SetDocument(const char *filename, int64_t max_memory, FileReader reader); | |
| 182 // Writes all the pages to the given filename. Returns false on error. | |
| 183 TESS_API | |
| 184 bool SaveDocument(const char *filename, FileWriter writer); | |
| 185 | |
| 186 // Adds the given page data to this document, counting up memory. | |
| 187 TESS_API | |
| 188 void AddPageToDocument(ImageData *page); | |
| 189 | |
| 190 const std::string &document_name() const { | |
| 191 std::lock_guard<std::mutex> lock(general_mutex_); | |
| 192 return document_name_; | |
| 193 } | |
| 194 int NumPages() const { | |
| 195 std::lock_guard<std::mutex> lock(general_mutex_); | |
| 196 return total_pages_; | |
| 197 } | |
| 198 size_t PagesSize() const { | |
| 199 return pages_.size(); | |
| 200 } | |
| 201 int64_t memory_used() const { | |
| 202 std::lock_guard<std::mutex> lock(general_mutex_); | |
| 203 return memory_used_; | |
| 204 } | |
| 205 // If the given index is not currently loaded, loads it using a separate | |
| 206 // thread. Note: there are 4 cases: | |
| 207 // Document uncached: IsCached() returns false, total_pages_ < 0. | |
| 208 // Required page is available: IsPageAvailable returns true. In this case, | |
| 209 // total_pages_ > 0 and | |
| 210 // pages_offset_ <= index%total_pages_ <= pages_offset_+pages_.size() | |
| 211 // Pages are loaded, but the required one is not. | |
| 212 // The requested page is being loaded by LoadPageInBackground. In this case, | |
| 213 // index == pages_offset_. Once the loading starts, the pages lock is held | |
| 214 // until it completes, at which point IsPageAvailable will unblock and return | |
| 215 // true. | |
| 216 void LoadPageInBackground(int index); | |
| 217 // Returns a pointer to the page with the given index, modulo the total | |
| 218 // number of pages. Blocks until the background load is completed. | |
| 219 TESS_API | |
| 220 const ImageData *GetPage(int index); | |
| 221 // Returns true if the requested page is available, and provides a pointer, | |
| 222 // which may be nullptr if the document is empty. May block, even though it | |
| 223 // doesn't guarantee to return true. | |
| 224 bool IsPageAvailable(int index, ImageData **page); | |
| 225 // Takes ownership of the given page index. The page is made nullptr in *this. | |
| 226 ImageData *TakePage(int index) { | |
| 227 std::lock_guard<std::mutex> lock(pages_mutex_); | |
| 228 ImageData *page = pages_[index]; | |
| 229 pages_[index] = nullptr; | |
| 230 return page; | |
| 231 } | |
| 232 // Returns true if the document is currently loaded or in the process of | |
| 233 // loading. | |
| 234 bool IsCached() const { | |
| 235 return NumPages() >= 0; | |
| 236 } | |
| 237 // Removes all pages from memory and frees the memory, but does not forget | |
| 238 // the document metadata. Returns the memory saved. | |
| 239 int64_t UnCache(); | |
| 240 // Shuffles all the pages in the document. | |
| 241 void Shuffle(); | |
| 242 | |
| 243 private: | |
| 244 // Sets the value of total_pages_ behind a mutex. | |
| 245 void set_total_pages(int total) { | |
| 246 std::lock_guard<std::mutex> lock(general_mutex_); | |
| 247 total_pages_ = total; | |
| 248 } | |
| 249 void set_memory_used(int64_t memory_used) { | |
| 250 std::lock_guard<std::mutex> lock(general_mutex_); | |
| 251 memory_used_ = memory_used; | |
| 252 } | |
| 253 // Locks the pages_mutex_ and loads as many pages as will fit into max_memory_ | |
| 254 // starting at index pages_offset_. | |
| 255 bool ReCachePages(); | |
| 256 | |
| 257 private: | |
| 258 // A name for this document. | |
| 259 std::string document_name_; | |
| 260 // A group of pages that corresponds in some loose way to a document. | |
| 261 std::vector<ImageData *> pages_; | |
| 262 // Page number of the first index in pages_. | |
| 263 int pages_offset_; | |
| 264 // Total number of pages in document (may exceed size of pages_.) | |
| 265 int total_pages_; | |
| 266 // Total of all pix sizes in the document. | |
| 267 int64_t memory_used_; | |
| 268 // Max memory to use at any time. | |
| 269 int64_t max_memory_; | |
| 270 // Saved reader from LoadDocument to allow re-caching. | |
| 271 FileReader reader_; | |
| 272 // Mutex that protects pages_ and pages_offset_ against multiple parallel | |
| 273 // loads, and provides a wait for page. | |
| 274 std::mutex pages_mutex_; | |
| 275 // Mutex that protects other data members that callers want to access without | |
| 276 // waiting for a load operation. | |
| 277 mutable std::mutex general_mutex_; | |
| 278 | |
| 279 // Thread which loads document. | |
| 280 std::thread thread; | |
| 281 }; | |
| 282 | |
| 283 // A collection of DocumentData that knows roughly how much memory it is using. | |
| 284 // Note that while it supports background read-ahead, it assumes that a single | |
| 285 // thread is accessing documents, ie it is not safe for multiple threads to | |
| 286 // access different documents in parallel, as one may de-cache the other's | |
| 287 // content. | |
| 288 class DocumentCache { | |
| 289 public: | |
| 290 TESS_API | |
| 291 explicit DocumentCache(int64_t max_memory); | |
| 292 TESS_API | |
| 293 ~DocumentCache(); | |
| 294 | |
| 295 // Deletes all existing documents from the cache. | |
| 296 void Clear() { | |
| 297 for (auto *document : documents_) { | |
| 298 delete document; | |
| 299 } | |
| 300 documents_.clear(); | |
| 301 num_pages_per_doc_ = 0; | |
| 302 } | |
| 303 // Adds all the documents in the list of filenames, counting memory. | |
| 304 // The reader is used to read the files. | |
| 305 TESS_API | |
| 306 bool LoadDocuments(const std::vector<std::string> &filenames, CachingStrategy cache_strategy, | |
| 307 FileReader reader); | |
| 308 | |
| 309 // Adds document to the cache. | |
| 310 bool AddToCache(DocumentData *data); | |
| 311 | |
| 312 // Finds and returns a document by name. | |
| 313 DocumentData *FindDocument(const std::string &document_name) const; | |
| 314 | |
| 315 // Returns a page by serial number using the current cache_strategy_ to | |
| 316 // determine the mapping from serial number to page. | |
| 317 const ImageData *GetPageBySerial(int serial) { | |
| 318 if (cache_strategy_ == CS_SEQUENTIAL) { | |
| 319 return GetPageSequential(serial); | |
| 320 } else { | |
| 321 return GetPageRoundRobin(serial); | |
| 322 } | |
| 323 } | |
| 324 | |
| 325 const std::vector<DocumentData *> &documents() const { | |
| 326 return documents_; | |
| 327 } | |
| 328 // Returns the total number of pages in an epoch. For CS_ROUND_ROBIN cache | |
| 329 // strategy, could take a long time. | |
| 330 TESS_API | |
| 331 int TotalPages(); | |
| 332 | |
| 333 private: | |
| 334 // Returns a page by serial number, selecting them in a round-robin fashion | |
| 335 // from all the documents. Highly disk-intensive, but doesn't need samples | |
| 336 // to be shuffled between files to begin with. | |
| 337 TESS_API | |
| 338 const ImageData *GetPageRoundRobin(int serial); | |
| 339 // Returns a page by serial number, selecting them in sequence from each file. | |
| 340 // Requires the samples to be shuffled between the files to give a random or | |
| 341 // uniform distribution of data. Less disk-intensive than GetPageRoundRobin. | |
| 342 TESS_API | |
| 343 const ImageData *GetPageSequential(int serial); | |
| 344 | |
| 345 // Helper counts the number of adjacent cached neighbour documents_ of index | |
| 346 // looking in direction dir, ie index+dir, index+2*dir etc. | |
| 347 int CountNeighbourDocs(int index, int dir); | |
| 348 | |
| 349 // A group of pages that corresponds in some loose way to a document. | |
| 350 std::vector<DocumentData *> documents_; | |
| 351 // Strategy to use for caching and serializing data samples. | |
| 352 CachingStrategy cache_strategy_ = CS_SEQUENTIAL; | |
| 353 // Number of pages in the first document, used as a divisor in | |
| 354 // GetPageSequential to determine the document index. | |
| 355 int num_pages_per_doc_ = 0; | |
| 356 // Max memory allowed in this cache. | |
| 357 int64_t max_memory_ = 0; | |
| 358 }; | |
| 359 | |
| 360 } // namespace tesseract | |
| 361 | |
| 362 #endif // TESSERACT_IMAGE_IMAGEDATA_H_ |
