comparison mupdf-source/source/fitz/tessocr.cpp @ 2:b50eed0cc0ef upstream

ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4. The directory name has changed: no version number in the expanded directory now.
author Franz Glasner <fzglas.hg@dom66.de>
date Mon, 15 Sep 2025 11:43:07 +0200
parents
children
comparison
equal deleted inserted replaced
1:1d09e1dec1d9 2:b50eed0cc0ef
1 // Copyright (C) 2020-2024 Artifex Software, Inc.
2 //
3 // This file is part of MuPDF.
4 //
5 // MuPDF is free software: you can redistribute it and/or modify it under the
6 // terms of the GNU Affero General Public License as published by the Free
7 // Software Foundation, either version 3 of the License, or (at your option)
8 // any later version.
9 //
10 // MuPDF is distributed in the hope that it will be useful, but WITHOUT ANY
11 // WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
12 // FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
13 // details.
14 //
15 // You should have received a copy of the GNU Affero General Public License
16 // along with MuPDF. If not, see <https://www.gnu.org/licenses/agpl-3.0.en.html>
17 //
18 // Alternative licensing terms are available from the licensor.
19 // For commercial licensing, see <https://www.artifex.com/> or contact
20 // Artifex Software, Inc., 39 Mesa Street, Suite 108A, San Francisco,
21 // CA 94129, USA, for further information.
22
23 #include "mupdf/fitz/config.h"
24
25 #ifndef OCR_DISABLED
26
#include <climits>
#include <new>

#include "tesseract/baseapi.h"
#include "tesseract/capi.h" // for ETEXT_DESC
30
31 extern "C" {
32
33 #include "allheaders.h"
34
35 #include "tessocr.h"
36 #include "leptonica-wrap.h"
37
38 #if TESSERACT_MAJOR_VERSION >= 5
39
/*
	Read the whole of a file into a byte vector.

	filename: path of the file to read (opened in binary mode).
	data: vector that receives the file contents. On success it is
	resized to exactly the file length, with capacity for one extra
	byte so the caller can append a terminating '\0'.

	Returns true only if the complete file was read. Returns false if
	either argument is NULL, the file cannot be opened, it cannot be
	measured or rewound, it is empty, or the read comes up short.
*/
static bool
load_file(const char* filename, std::vector<char>* data)
{
	if (filename == NULL || data == NULL)
		return false;

	FILE *fp = fopen(filename, "rb");
	if (fp == NULL)
		return false;

	bool result = false;
	long size = -1;

	/* Measure the file by seeking to its end; a failed seek leaves
	 * size negative so that nothing is read below. */
	if (fseek(fp, 0, SEEK_END) == 0)
		size = ftell(fp);

	// Trying to open a directory on Linux sets size to LONG_MAX. Catch it here.
	// Only read once we know we are positioned back at the start of the file.
	if (size > 0 && size < LONG_MAX && fseek(fp, 0, SEEK_SET) == 0)
	{
		const size_t wanted = static_cast<size_t>(size);

		// reserve an extra byte in case caller wants to append a '\0' character
		data->reserve(wanted + 1);
		data->resize(wanted);
		result = fread(data->data(), 1, wanted, fp) == wanted;
	}
	fclose(fp);
	return result;
}
63
64 static bool
65 tess_file_reader(const char *fname, std::vector<char> *out)
66 {
67 /* FIXME: Look for inbuilt ones. */
68
69 /* Then under TESSDATA */
70 return load_file(fname, out);
71 }
72
73 #else
74
75 static bool
76 load_file(const char* filename, GenericVector<char>* data)
77 {
78 bool result = false;
79 FILE *fp = fopen(filename, "rb");
80 if (fp == NULL)
81 return false;
82
83 fseek(fp, 0, SEEK_END);
84 long size = ftell(fp);
85 fseek(fp, 0, SEEK_SET);
86
87 // Trying to open a directory on Linux sets size to LONG_MAX. Catch it here.
88 if (size > 0 && size < LONG_MAX)
89 {
90 // reserve an extra byte in case caller wants to append a '\0' character
91 data->reserve(size + 1);
92 data->resize_no_init(size);
93 result = static_cast<long>(fread(&(*data)[0], 1, size, fp)) == size;
94 }
95 fclose(fp);
96 return result;
97 }
98
99 static bool
100 tess_file_reader(const STRING& fname, GenericVector<char> *out)
101 {
102 /* FIXME: Look for inbuilt ones. */
103
104 /* Then under TESSDATA */
105 return load_file(fname.c_str(), out);
106 }
107 #endif
108
109 void *ocr_init(fz_context *ctx, const char *language, const char *datadir)
110 {
111 tesseract::TessBaseAPI *api;
112
113 fz_set_leptonica_mem(ctx);
114 api = new tesseract::TessBaseAPI();
115
116 if (api == NULL)
117 {
118 fz_clear_leptonica_mem(ctx);
119 fz_throw(ctx, FZ_ERROR_LIBRARY, "Tesseract base initialisation failed");
120 }
121
122 if (language == NULL || language[0] == 0)
123 language = "eng";
124
125 // Initialize tesseract-ocr with English, without specifying tessdata path
126 if (api->Init(datadir, 0, /* data, data_size */
127 language,
128 tesseract::OcrEngineMode::OEM_DEFAULT,
129 NULL, 0, /* configs, configs_size */
130 NULL, NULL, /* vars_vec */
131 false, /* set_only_non_debug_params */
132 &tess_file_reader))
133 {
134 delete api;
135 fz_clear_leptonica_mem(ctx);
136 fz_throw(ctx, FZ_ERROR_LIBRARY, "Tesseract language initialisation failed");
137 }
138
139 return api;
140 }
141
142 void ocr_fin(fz_context *ctx, void *api_)
143 {
144 tesseract::TessBaseAPI *api = (tesseract::TessBaseAPI *)api_;
145
146 if (api == NULL)
147 return;
148
149 api->End();
150 delete api;
151 fz_clear_leptonica_mem(ctx);
152 }
153
/*
	Report the host byte order: returns 1 when the byte at the lowest
	address of an int holding 1 is zero (big-endian), 0 otherwise.
*/
static inline int isbigendian(void)
{
	static const int probe = 1;
	const char *lowest_byte = (const char *)&probe;

	return lowest_byte[0] == 0;
}
159
160
161 static Pix *
162 ocr_set_image(fz_context *ctx, tesseract::TessBaseAPI *api, fz_pixmap *pix)
163 {
164 Pix *image = pixCreateHeader(pix->w, pix->h, 8);
165
166 if (image == NULL)
167 fz_throw(ctx, FZ_ERROR_LIBRARY, "Tesseract image creation failed");
168 pixSetData(image, (l_uint32 *)pix->samples);
169 pixSetPadBits(image, 1);
170 pixSetXRes(image, pix->xres);
171 pixSetYRes(image, pix->yres);
172
173 if (!isbigendian())
174 {
175 /* Frizzle the image */
176 int x, y;
177 uint32_t *d = (uint32_t *)pix->samples;
178 for (y = pix->h; y > 0; y--)
179 for (x = pix->w>>2; x > 0; x--)
180 {
181 uint32_t v = *d;
182 ((uint8_t *)d)[0] = v>>24;
183 ((uint8_t *)d)[1] = v>>16;
184 ((uint8_t *)d)[2] = v>>8;
185 ((uint8_t *)d)[3] = v;
186 d++;
187 }
188 }
189 /* pixWrite("test.pnm", image, IFF_PNM); */
190
191 api->SetImage(image);
192
193 return image;
194 }
195
196 static void
197 ocr_clear_image(fz_context *ctx, Pix *image)
198 {
199 pixSetData(image, NULL);
200 pixDestroy(&image);
201 }
202
203 typedef struct {
204 fz_context *ctx;
205 void *arg;
206 int (*progress)(fz_context *, void *, int progress);
207 } progress_arg;
208
/*
	Cancel hook installed on the Tesseract monitor once the caller has
	asked to stop: it ignores both arguments and always answers true.
*/
static bool
do_cancel(void *arg, int dummy)
{
	(void)arg;
	(void)dummy;

	return true;
}
214
215 static bool
216 progress_callback(ETEXT_DESC *monitor, int l, int r, int t, int b)
217 {
218 progress_arg *details = (progress_arg *)monitor->cancel_this;
219 int cancel;
220
221 if (!details->progress)
222 return false;
223
224 cancel = details->progress(details->ctx, details->arg, monitor->progress);
225 if (cancel)
226 monitor->cancel = do_cancel;
227
228 return false;
229 }
230
231 void ocr_recognise(fz_context *ctx,
232 void *api_,
233 fz_pixmap *pix,
234 void (*callback)(fz_context *ctx,
235 void *arg,
236 int unicode,
237 const char *font_name,
238 const int *line_bbox,
239 const int *word_bbox,
240 const int *char_bbox,
241 int pointsize),
242 int (*progress)(fz_context *ctx,
243 void *arg,
244 int progress),
245 void *arg)
246 {
247 tesseract::TessBaseAPI *api = (tesseract::TessBaseAPI *)api_;
248 Pix *image;
249 int code;
250 int word_bbox[4];
251 int char_bbox[4];
252 int line_bbox[4];
253 bool bold, italic, underlined, monospace, serif, smallcaps;
254 int pointsize, font_id;
255 const char* font_name;
256 ETEXT_DESC monitor;
257 progress_arg details;
258
259 if (api == NULL)
260 return;
261
262 image = ocr_set_image(ctx, api, pix);
263
264 monitor.cancel = nullptr;
265 monitor.cancel_this = &details;
266 details.ctx = ctx;
267 details.arg = arg;
268 details.progress = progress;
269 monitor.progress_callback2 = progress_callback;
270
271 code = api->Recognize(&monitor);
272 if (code < 0)
273 {
274 ocr_clear_image(ctx, image);
275 fz_throw(ctx, FZ_ERROR_LIBRARY, "OCR recognise failed");
276 }
277
278 if (!isbigendian())
279 {
280 /* Frizzle the image */
281 int x, y;
282 uint32_t *d = (uint32_t *)pix->samples;
283 for (y = pix->h; y > 0; y--)
284 for (x = pix->w>>2; x > 0; x--)
285 {
286 uint32_t v = *d;
287 ((uint8_t *)d)[0] = v>>24;
288 ((uint8_t *)d)[1] = v>>16;
289 ((uint8_t *)d)[2] = v>>8;
290 ((uint8_t *)d)[3] = v;
291 d++;
292 }
293 }
294
295 tesseract::ResultIterator *res_it = api->GetIterator();
296
297 fz_try(ctx)
298 {
299 while (!res_it->Empty(tesseract::RIL_BLOCK))
300 {
301 if (res_it->Empty(tesseract::RIL_WORD))
302 {
303 res_it->Next(tesseract::RIL_WORD);
304 continue;
305 }
306
307 res_it->BoundingBox(tesseract::RIL_TEXTLINE,
308 line_bbox, line_bbox+1,
309 line_bbox+2, line_bbox+3);
310 res_it->BoundingBox(tesseract::RIL_WORD,
311 word_bbox, word_bbox+1,
312 word_bbox+2, word_bbox+3);
313 font_name = res_it->WordFontAttributes(&bold,
314 &italic,
315 &underlined,
316 &monospace,
317 &serif,
318 &smallcaps,
319 &pointsize,
320 &font_id);
321 do
322 {
323 const char *graph = res_it->GetUTF8Text(tesseract::RIL_SYMBOL);
324 if (graph && graph[0] != 0)
325 {
326 int unicode;
327 res_it->BoundingBox(tesseract::RIL_SYMBOL,
328 char_bbox, char_bbox+1,
329 char_bbox+2, char_bbox+3);
330 fz_chartorune(&unicode, graph);
331 callback(ctx, arg, unicode, font_name, line_bbox, word_bbox, char_bbox, pointsize);
332 }
333 delete[] graph;
334 res_it->Next(tesseract::RIL_SYMBOL);
335 }
336 while (!res_it->Empty(tesseract::RIL_BLOCK) &&
337 !res_it->IsAtBeginningOf(tesseract::RIL_WORD));
338 }
339 }
340 fz_always(ctx)
341 {
342 delete res_it;
343 ocr_clear_image(ctx, image);
344 }
345 fz_catch(ctx)
346 fz_rethrow(ctx);
347 }
348
349 }
350
351 #endif