diff mupdf-source/thirdparty/tesseract/src/dict/dawg_cache.cpp @ 2:b50eed0cc0ef upstream

ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4. The directory name has changed: no version number in the expanded directory now.
author Franz Glasner <fzglas.hg@dom66.de>
date Mon, 15 Sep 2025 11:43:07 +0200
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/mupdf-source/thirdparty/tesseract/src/dict/dawg_cache.cpp	Mon Sep 15 11:43:07 2025 +0200
@@ -0,0 +1,97 @@
+///////////////////////////////////////////////////////////////////////
+// File:        dawg_cache.cpp
+// Description: A class that knows about loading and caching dawgs.
+// Author:      David Eger
+//
+// (C) Copyright 2012, Google Inc.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+///////////////////////////////////////////////////////////////////////
+
+#include "dawg_cache.h"
+
+#include "dawg.h"
+#include "object_cache.h"
+#include "tessdatamanager.h"
+
+namespace tesseract {
+
+// Holds the arguments needed to load one dawg so that the load itself can be
+// performed later through the argument-less Load() member.
+struct DawgLoader {
+  std::string lang_;                // language string passed on to the dawg
+  TessdataManager *data_file_;      // source of the dawg component (not deleted here)
+  TessdataType tessdata_dawg_type_; // which tessdata component to read
+  int dawg_debug_level_;            // debug level passed on to the dawg
+
+  // Stores the given arguments; nothing is loaded until Load() is called.
+  DawgLoader(const std::string &lang, TessdataType tessdata_dawg_type, int dawg_debug_level,
+             TessdataManager *data_file)
+      : lang_(lang),
+        data_file_(data_file),
+        tessdata_dawg_type_(tessdata_dawg_type),
+        dawg_debug_level_(dawg_debug_level) {}
+
+  // Reads the configured component and returns the resulting dawg, or nullptr
+  // if it could not be produced.
+  Dawg *Load();
+};
+
+// Returns the dawg of the given tessdata type for data_file, obtained through
+// the dawgs_ cache. The cache id is the data file name followed by the file
+// suffix of the requested component; a DawgLoader is supplied to the cache as
+// the callback that can produce the dawg.
+Dawg *DawgCache::GetSquishedDawg(const std::string &lang, TessdataType tessdata_dawg_type,
+                                 int debug_level, TessdataManager *data_file) {
+  std::string cache_key = data_file->GetDataFileName();
+  cache_key += kTessdataFileSuffixes[tessdata_dawg_type];
+
+  // The loader lives on this stack frame, so the callback below must only be
+  // invoked during the Get() call itself.
+  DawgLoader loader(lang, tessdata_dawg_type, debug_level, data_file);
+  return dawgs_.Get(cache_key, [&loader]() { return loader.Load(); });
+}
+
+// Reads the tessdata component selected by tessdata_dawg_type_ and builds a
+// heap-allocated SquishedDawg from it.
+// Returns nullptr when GetComponent() fails, when the component type is not
+// one of the dawg types handled below, or when SquishedDawg::Load() fails.
+Dawg *DawgLoader::Load() {
+  TFile component;
+  if (!data_file_->GetComponent(tessdata_dawg_type_, &component)) {
+    return nullptr;
+  }
+
+  // Map the tessdata component type to the dawg type / permuter pair that the
+  // SquishedDawg is constructed with.
+  DawgType kind;
+  PermuterType permuter;
+  switch (tessdata_dawg_type_) {
+    case TESSDATA_PUNC_DAWG:
+    case TESSDATA_LSTM_PUNC_DAWG:
+      kind = DAWG_TYPE_PUNCTUATION;
+      permuter = PUNC_PERM;
+      break;
+    case TESSDATA_NUMBER_DAWG:
+    case TESSDATA_LSTM_NUMBER_DAWG:
+      kind = DAWG_TYPE_NUMBER;
+      permuter = NUMBER_PERM;
+      break;
+    case TESSDATA_SYSTEM_DAWG:
+    case TESSDATA_LSTM_SYSTEM_DAWG:
+    case TESSDATA_UNAMBIG_DAWG:
+      kind = DAWG_TYPE_WORD;
+      permuter = SYSTEM_DAWG_PERM;
+      break;
+    case TESSDATA_FREQ_DAWG:
+      kind = DAWG_TYPE_WORD;
+      permuter = FREQ_DAWG_PERM;
+      break;
+    case TESSDATA_BIGRAM_DAWG:
+      kind = DAWG_TYPE_WORD;    // doesn't actually matter
+      permuter = COMPOUND_PERM; // doesn't actually matter
+      break;
+    default:
+      return nullptr;
+  }
+
+  auto *dawg = new SquishedDawg(kind, lang_, permuter, dawg_debug_level_);
+  if (!dawg->Load(&component)) {
+    // Loading failed: release the partially set up dawg and report failure.
+    delete dawg;
+    return nullptr;
+  }
+  return dawg;
+}
+
+} // namespace tesseract