diff mupdf-source/source/fitz/tessocr.cpp @ 2:b50eed0cc0ef upstream

ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4. The directory name has changed: no version number in the expanded directory now.
author Franz Glasner <fzglas.hg@dom66.de>
date Mon, 15 Sep 2025 11:43:07 +0200
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/mupdf-source/source/fitz/tessocr.cpp	Mon Sep 15 11:43:07 2025 +0200
@@ -0,0 +1,351 @@
+// Copyright (C) 2020-2024 Artifex Software, Inc.
+//
+// This file is part of MuPDF.
+//
+// MuPDF is free software: you can redistribute it and/or modify it under the
+// terms of the GNU Affero General Public License as published by the Free
+// Software Foundation, either version 3 of the License, or (at your option)
+// any later version.
+//
+// MuPDF is distributed in the hope that it will be useful, but WITHOUT ANY
+// WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+// FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
+// details.
+//
+// You should have received a copy of the GNU Affero General Public License
+// along with MuPDF. If not, see <https://www.gnu.org/licenses/agpl-3.0.en.html>
+//
+// Alternative licensing terms are available from the licensor.
+// For commercial licensing, see <https://www.artifex.com/> or contact
+// Artifex Software, Inc., 39 Mesa Street, Suite 108A, San Francisco,
+// CA 94129, USA, for further information.
+
+#include "mupdf/fitz/config.h"
+
+#ifndef OCR_DISABLED
+
+#include <climits>
+#include <new>                       // for std::nothrow
+#include "tesseract/baseapi.h"
+#include "tesseract/capi.h"          // for ETEXT_DESC
+
+extern "C" {
+
+#include "allheaders.h"
+
+#include "tessocr.h"
+#include "leptonica-wrap.h"
+
+#if TESSERACT_MAJOR_VERSION >= 5
+
+/*
+	Read the entire file called 'filename' into 'data'.
+
+	Returns true only when the file could be opened, its measured
+	size is strictly between 0 and LONG_MAX, and exactly that many
+	bytes were read. An empty file therefore yields false. 'data'
+	is left untouched unless the size test passes.
+*/
+static bool
+load_file(const char* filename, std::vector<char>* data)
+{
+	FILE *stream = fopen(filename, "rb");
+	if (stream == NULL)
+		return false;
+
+	/* Measure the file by seeking to its end, then rewind for reading. */
+	fseek(stream, 0, SEEK_END);
+	long length = ftell(stream);
+	fseek(stream, 0, SEEK_SET);
+
+	bool ok = false;
+
+	// Opening a directory on Linux reports LONG_MAX as the size; reject
+	// that here. A zero or negative length is rejected by the same test.
+	if (length > 0 && length < LONG_MAX)
+	{
+		// Keep one spare byte of capacity in case the caller wants to
+		// append a '\0' character.
+		data->reserve(length + 1);
+		data->resize(length);
+
+		size_t got = fread(&(*data)[0], 1, length, stream);
+		ok = (static_cast<long>(got) == length);
+	}
+
+	fclose(stream);
+	return ok;
+}
+
+/*
+	File reader callback passed to TessBaseAPI::Init by ocr_init
+	(Tesseract >= 5 signature). Loads the file named 'fname' into
+	'out' using load_file and returns load_file's result.
+*/
+static bool
+tess_file_reader(const char *fname, std::vector<char> *out)
+{
+	/* FIXME: Look for inbuilt ones. */
+
+	/* Then under TESSDATA */
+	return load_file(fname, out);
+}
+
+#else
+
+/*
+	Read the entire file called 'filename' into 'data' (variant for
+	Tesseract versions before 5, which use GenericVector).
+
+	Returns true only when the file could be opened, its measured
+	size is strictly between 0 and LONG_MAX, and exactly that many
+	bytes were read. An empty file therefore yields false. 'data'
+	is left untouched unless the size test passes.
+*/
+static bool
+load_file(const char* filename, GenericVector<char>* data)
+{
+	FILE *stream = fopen(filename, "rb");
+	if (stream == NULL)
+		return false;
+
+	/* Measure the file by seeking to its end, then rewind for reading. */
+	fseek(stream, 0, SEEK_END);
+	long length = ftell(stream);
+	fseek(stream, 0, SEEK_SET);
+
+	bool ok = false;
+
+	// Opening a directory on Linux reports LONG_MAX as the size; reject
+	// that here. A zero or negative length is rejected by the same test.
+	if (length > 0 && length < LONG_MAX)
+	{
+		// Keep one spare byte of capacity in case the caller wants to
+		// append a '\0' character.
+		data->reserve(length + 1);
+		data->resize_no_init(length);
+
+		size_t got = fread(&(*data)[0], 1, length, stream);
+		ok = (static_cast<long>(got) == length);
+	}
+
+	fclose(stream);
+	return ok;
+}
+
+/*
+	File reader callback passed to TessBaseAPI::Init by ocr_init
+	(signature used when TESSERACT_MAJOR_VERSION < 5). Loads the file
+	named by 'fname' into 'out' using load_file and returns
+	load_file's result.
+*/
+static bool
+tess_file_reader(const STRING& fname, GenericVector<char> *out)
+{
+	/* FIXME: Look for inbuilt ones. */
+
+	/* Then under TESSDATA */
+	return load_file(fname.c_str(), out);
+}
+#endif
+
+/*
+	Create and initialise a Tesseract engine.
+
+	language: Tesseract language name. NULL or the empty string
+	selects "eng".
+
+	datadir: handed to TessBaseAPI::Init as its first argument
+	together with a size of 0.
+
+	Returns the new tesseract::TessBaseAPI as an opaque pointer, to
+	be released with ocr_fin.
+
+	Throws FZ_ERROR_LIBRARY if the engine object cannot be allocated
+	or if Init reports failure. In both cases the leptonica memory
+	hooks installed here are cleared again before throwing.
+*/
+void *ocr_init(fz_context *ctx, const char *language, const char *datadir)
+{
+	tesseract::TessBaseAPI *api;
+
+	fz_set_leptonica_mem(ctx);
+
+	/* A plain 'new' signals exhaustion by throwing and never yields
+	 * NULL, which would leave the check below unreachable and let a
+	 * C++ exception escape from this extern "C" function. Use the
+	 * non-throwing form so failure is reported through fz_throw. */
+	api = new (std::nothrow) tesseract::TessBaseAPI();
+
+	if (api == NULL)
+	{
+		fz_clear_leptonica_mem(ctx);
+		fz_throw(ctx, FZ_ERROR_LIBRARY, "Tesseract base initialisation failed");
+	}
+
+	if (language == NULL || language[0] == 0)
+		language = "eng";
+
+	// Initialise Tesseract for the requested language, reading files
+	// through tess_file_reader. A non-zero result is treated as failure.
+	if (api->Init(datadir, 0, /* data, data_size */
+		language,
+		tesseract::OcrEngineMode::OEM_DEFAULT,
+		NULL, 0, /* configs, configs_size */
+		NULL, NULL, /* vars_vec */
+		false, /* set_only_non_debug_params */
+		&tess_file_reader))
+	{
+		delete api;
+		fz_clear_leptonica_mem(ctx);
+		fz_throw(ctx, FZ_ERROR_LIBRARY, "Tesseract language initialisation failed");
+	}
+
+	return api;
+}
+
+/*
+	Shut down and free an engine returned by ocr_init, then clear
+	the leptonica memory hooks. A NULL handle is ignored (nothing is
+	cleared in that case).
+*/
+void ocr_fin(fz_context *ctx, void *api_)
+{
+	tesseract::TessBaseAPI *engine = static_cast<tesseract::TessBaseAPI *>(api_);
+
+	if (engine != NULL)
+	{
+		engine->End();
+		delete engine;
+		fz_clear_leptonica_mem(ctx);
+	}
+}
+
+/*
+	Return non-zero when the first byte in memory of an int holding
+	the value 1 is zero, i.e. when the low-order byte is not stored
+	first.
+*/
+static inline int isbigendian(void)
+{
+	static const int probe = 1;
+	const char *first_byte = (const char *)&probe;
+
+	return first_byte[0] == 0;
+}
+
+
+/*
+	Wrap the samples of 'pix' in an 8 bit deep leptonica Pix header
+	and hand that image to Tesseract.
+
+	The header points directly at pix->samples; nothing is copied.
+	On hosts where isbigendian() is false, the bytes of each 32 bit
+	word of the samples are reversed in place before the image is
+	set, so the caller's pixmap is modified.
+
+	NOTE(review): the swap walks h * (w >> 2) consecutive words, so
+	this presumably relies on rows being a multiple of 4 bytes with
+	no padding between them - confirm against the callers.
+
+	Returns the Pix header, to be released with ocr_clear_image.
+	Throws FZ_ERROR_LIBRARY if the header cannot be created.
+*/
+static Pix *
+ocr_set_image(fz_context *ctx, tesseract::TessBaseAPI *api, fz_pixmap *pix)
+{
+	Pix *header = pixCreateHeader(pix->w, pix->h, 8);
+
+	if (header == NULL)
+		fz_throw(ctx, FZ_ERROR_LIBRARY, "Tesseract image creation failed");
+
+	/* Borrow the pixmap's sample buffer and copy over its resolution. */
+	pixSetData(header, (l_uint32 *)pix->samples);
+	pixSetPadBits(header, 1);
+	pixSetXRes(header, pix->xres);
+	pixSetYRes(header, pix->yres);
+
+	if (!isbigendian())
+	{
+		/* Frizzle the image: reverse the bytes of every 32 bit word. */
+		uint32_t *word = (uint32_t *)pix->samples;
+		int words_per_row = pix->w>>2;
+		int row, col;
+
+		for (row = 0; row < pix->h; row++)
+		{
+			for (col = 0; col < words_per_row; col++, word++)
+			{
+				uint32_t v = *word;
+				uint8_t *bytes = (uint8_t *)word;
+
+				bytes[0] = v>>24;
+				bytes[1] = v>>16;
+				bytes[2] = v>>8;
+				bytes[3] = v;
+			}
+		}
+	}
+	/* pixWrite("test.pnm", header, IFF_PNM); */
+
+	api->SetImage(header);
+
+	return header;
+}
+
+/*
+	Release a Pix header created by ocr_set_image.
+
+	The data pointer is reset to NULL before the header is
+	destroyed; ocr_set_image pointed it at the pixmap's own samples,
+	so this is presumably what stops pixDestroy from freeing memory
+	the header does not own. 'ctx' is unused.
+*/
+static void
+ocr_clear_image(fz_context *ctx, Pix *image)
+{
+	pixSetData(image, NULL);
+	pixDestroy(&image);
+}
+
+/*
+	State that ocr_recognise attaches to the Tesseract monitor (via
+	its cancel_this field) so that progress_callback can reach the
+	caller's progress function.
+*/
+typedef struct {
+	/* Context passed through to 'progress'. */
+	fz_context *ctx;
+	/* Opaque caller argument passed through to 'progress'. */
+	void *arg;
+	/* Caller's progress function; may be NULL. A non-zero return
+	 * value asks for recognition to be cancelled. */
+	int (*progress)(fz_context *, void *, int progress);
+} progress_arg;
+
+/*
+	Cancel function that progress_callback installs on the monitor
+	once the caller's progress function has returned non-zero.
+	Ignores both arguments and always returns true.
+*/
+static bool
+do_cancel(void *arg, int dummy)
+{
+	return true;
+}
+
+/*
+	Progress hook installed on the Tesseract monitor by
+	ocr_recognise.
+
+	Forwards monitor->progress to the caller's progress function, if
+	there is one. When that function returns non-zero, do_cancel is
+	installed as the monitor's cancel function. The bounding box
+	arguments (l, r, t, b) are not used. Always returns false.
+*/
+static bool
+progress_callback(ETEXT_DESC *monitor, int l, int r, int t, int b)
+{
+	progress_arg *state = static_cast<progress_arg *>(monitor->cancel_this);
+
+	if (state->progress != NULL &&
+		state->progress(state->ctx, state->arg, monitor->progress) != 0)
+	{
+		monitor->cancel = do_cancel;
+	}
+
+	return false;
+}
+
+/*
+	Run OCR over 'pix' and report every recognised symbol.
+
+	api_: engine returned by ocr_init. If NULL, nothing is done.
+
+	pix: pixmap to recognise. On hosts where isbigendian() is false
+	its samples are byte-swapped in place for the duration of the
+	recognition and swapped back afterwards, whether or not the
+	recognition succeeded.
+
+	callback: called once for each non-empty symbol with the code
+	point decoded from the symbol's UTF-8 text by fz_chartorune, the
+	font name and point size Tesseract reports for the enclosing
+	word, and the bounding boxes of the enclosing text line, the
+	enclosing word and the symbol itself. Each box is four ints in
+	the order written by Tesseract's BoundingBox (left, top, right,
+	bottom).
+
+	progress: optional (may be NULL). Called with Tesseract's
+	progress value; a non-zero return asks for cancellation.
+
+	arg: opaque pointer passed to both callback and progress.
+
+	Throws FZ_ERROR_LIBRARY if recognition fails or no result
+	iterator can be obtained, and rethrows anything thrown by
+	callback after releasing the iterator and the image header.
+*/
+void ocr_recognise(fz_context *ctx,
+		void *api_,
+		fz_pixmap *pix,
+		void (*callback)(fz_context *ctx,
+				void *arg,
+				int unicode,
+				const char *font_name,
+				const int *line_bbox,
+				const int *word_bbox,
+				const int *char_bbox,
+				int pointsize),
+		int (*progress)(fz_context *ctx,
+				void *arg,
+				int progress),
+		void *arg)
+{
+	tesseract::TessBaseAPI *api = (tesseract::TessBaseAPI *)api_;
+	Pix *image;
+	int code;
+	int word_bbox[4];
+	int char_bbox[4];
+	int line_bbox[4];
+	bool bold, italic, underlined, monospace, serif, smallcaps;
+	int pointsize, font_id;
+	const char* font_name;
+	ETEXT_DESC monitor;
+	progress_arg details;
+
+	if (api == NULL)
+		return;
+
+	image = ocr_set_image(ctx, api, pix);
+
+	/* Route Tesseract's progress reports to the caller's function. */
+	monitor.cancel = nullptr;
+	monitor.cancel_this = &details;
+	details.ctx = ctx;
+	details.arg = arg;
+	details.progress = progress;
+	monitor.progress_callback2 = progress_callback;
+
+	code = api->Recognize(&monitor);
+
+	/* Undo the byte swap done by ocr_set_image. This must happen
+	 * before the failure check below, otherwise a failed recognition
+	 * would hand the caller's pixmap back with scrambled samples. */
+	if (!isbigendian())
+	{
+		/* Frizzle the image */
+		int x, y;
+		uint32_t *d = (uint32_t *)pix->samples;
+		for (y = pix->h; y > 0; y--)
+			for (x = pix->w>>2; x > 0; x--)
+			{
+				uint32_t v = *d;
+				((uint8_t *)d)[0] = v>>24;
+				((uint8_t *)d)[1] = v>>16;
+				((uint8_t *)d)[2] = v>>8;
+				((uint8_t *)d)[3] = v;
+				d++;
+			}
+	}
+
+	if (code < 0)
+	{
+		ocr_clear_image(ctx, image);
+		fz_throw(ctx, FZ_ERROR_LIBRARY, "OCR recognise failed");
+	}
+
+	tesseract::ResultIterator *res_it = api->GetIterator();
+
+	/* Do not dereference a missing iterator. */
+	if (res_it == NULL)
+	{
+		ocr_clear_image(ctx, image);
+		fz_throw(ctx, FZ_ERROR_LIBRARY, "OCR result iterator creation failed");
+	}
+
+	fz_try(ctx)
+	{
+		while (!res_it->Empty(tesseract::RIL_BLOCK))
+		{
+			/* Skip over anything that contains no word. */
+			if (res_it->Empty(tesseract::RIL_WORD))
+			{
+				res_it->Next(tesseract::RIL_WORD);
+				continue;
+			}
+
+			/* Line, word and font details are the same for every
+			 * symbol of the word, so fetch them once. */
+			res_it->BoundingBox(tesseract::RIL_TEXTLINE,
+					line_bbox, line_bbox+1,
+					line_bbox+2, line_bbox+3);
+			res_it->BoundingBox(tesseract::RIL_WORD,
+					word_bbox, word_bbox+1,
+					word_bbox+2, word_bbox+3);
+			font_name = res_it->WordFontAttributes(&bold,
+							&italic,
+							&underlined,
+							&monospace,
+							&serif,
+							&smallcaps,
+							&pointsize,
+							&font_id);
+			do
+			{
+				const char *graph = res_it->GetUTF8Text(tesseract::RIL_SYMBOL);
+				bool have_char = (graph && graph[0] != 0);
+				int unicode = 0;
+
+				if (have_char)
+				{
+					res_it->BoundingBox(tesseract::RIL_SYMBOL,
+							char_bbox, char_bbox+1,
+							char_bbox+2, char_bbox+3);
+					fz_chartorune(&unicode, graph);
+				}
+
+				/* Free the text before calling out: if the callback
+				 * throws, control leaves this loop immediately and
+				 * the buffer would otherwise be leaked. */
+				delete[] graph;
+
+				if (have_char)
+					callback(ctx, arg, unicode, font_name, line_bbox, word_bbox, char_bbox, pointsize);
+
+				res_it->Next(tesseract::RIL_SYMBOL);
+			}
+			while (!res_it->Empty(tesseract::RIL_BLOCK) &&
+				!res_it->IsAtBeginningOf(tesseract::RIL_WORD));
+		}
+	}
+	fz_always(ctx)
+	{
+		delete res_it;
+		ocr_clear_image(ctx, image);
+	}
+	fz_catch(ctx)
+		fz_rethrow(ctx);
+}
+
+}
+
+#endif