comparison mupdf-source/source/fitz/ocr-device.c @ 2:b50eed0cc0ef upstream

ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4. The directory name has changed: no version number in the expanded directory now.
author Franz Glasner <fzglas.hg@dom66.de>
date Mon, 15 Sep 2025 11:43:07 +0200
parents
children
comparison
equal deleted inserted replaced
1:1d09e1dec1d9 2:b50eed0cc0ef
1 // Copyright (C) 2004-2025 Artifex Software, Inc.
2 //
3 // This file is part of MuPDF.
4 //
5 // MuPDF is free software: you can redistribute it and/or modify it under the
6 // terms of the GNU Affero General Public License as published by the Free
7 // Software Foundation, either version 3 of the License, or (at your option)
8 // any later version.
9 //
10 // MuPDF is distributed in the hope that it will be useful, but WITHOUT ANY
11 // WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
12 // FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
13 // details.
14 //
15 // You should have received a copy of the GNU Affero General Public License
16 // along with MuPDF. If not, see <https://www.gnu.org/licenses/agpl-3.0.en.html>
17 //
18 // Alternative licensing terms are available from the licensor.
19 // For commercial licensing, see <https://www.artifex.com/> or contact
20 // Artifex Software, Inc., 39 Mesa Street, Suite 108A, San Francisco,
21 // CA 94129, USA, for further information.
22
23 #include "mupdf/fitz.h"
24
25 #include <assert.h>
26 #include <string.h>
27 #include <errno.h>
28
29 #undef DEBUG_OCR
30
31 #ifndef OCR_DISABLED
32 #include "tessocr.h"
33
34 /*
35
36 This device can be used in 2 modes, with or without a list.
37
38 In both modes the OCR device is created with a target device. The
39 caller runs the page to the device, and the device processes the calls
40 and (eventually) calls through to the target.
41
42 In both modes, all incoming calls are forwarded to an internal draw
43 device to render the page, so the page rendering is always complete.
44 The incoming calls are also forwarded (mostly, eventually) to the
45 target. Where the 2 modes differ is in the timing/content of those
46 forwarded calls.
47
48 In the first mode (without a list), the device instantly forwards all
49 non-text calls to the target. When the OCR device is closed, an OCR pass
50 is performed, and the recovered text is forwarded to the target. All
51 recovered text is listed as Courier, and ends up on top of the content.
52
53 This is fine for text extraction and probably for most cases of document
54 conversion. It's no good for correcting the unicode values within a
55 document though.
56
57 So, we have concocted a second way of working, using a display list. In
58 this mode, as well as rendering every device call that comes in, it
59 forwards them to a display list (and not the target). When the device
60 is closed we OCR the text image, and store the results. We then play
61 the list back through a 'rewrite' device to the target. The rewrite
62 device rewrites the text objects with the correct unicode values. Any
63 characters given by the OCR pass that aren't used by the rewrite step
64 are then sent through as invisible text.
65
66 This means that all the target device sees is the exact same graphical
67 objects in the exact same order, but with corrected unicode values.
68 Also, any text that appears in the document as a result of images or
69 line art is sent through as 'invisible' text at the end, so it will work
70 for cut/paste or search.
71
72 Or, at least, that was the plan. Unfortunately, it turns out that
73 Tesseract (with the LSTM engine (the most modern one)) is really bad at
74 giving bounding boxes for characters. It seems that the neural network
75 can say "hey, there is an 'X'", but it can't actually say where the X
76 occurred within the word. So tesseract knows where the words are, and
77 knows the order of the letters within the word, but basically guesses
78 at bboxes for the letters.
79
80 Because of this, we can't rely on character bboxes from tesseract to be
81 correct. We have to work off the word bboxes alone, together with the
82 order in which characters are passed to us.
83
84 So, as Tesseract gives us data, we store the word bbox, together with
85 the list of chars within that word.
86
87 When we play the list back through the display device, we then have to
88 rewrite text objects based on which word they are in. For the first
89 version, we'll make the extremely dodgy assumption that characters
90 come in the same order within the word.
91
92 For future versions we may want to collect bboxes for each text char
93 on our initial list building pass, collate those into matching 'words'
94 and sort them accordingly.
95 */
96
97
98 typedef struct word_record_s {
99 int len;
100 fz_rect bbox;
101 int n;
102 int unicode[FZ_FLEXIBLE_ARRAY];
103 } word_record;
104
105 typedef struct fz_ocr_device_s
106 {
107 fz_device super;
108
109 /* Progress monitoring */
110 int (*progress)(fz_context *, void *, int progress);
111 void *progress_arg;
112
113 fz_device *target;
114 fz_display_list *list;
115 fz_device *list_dev;
116 fz_device *draw_dev;
117 fz_pixmap *pixmap;
118
119 fz_rect mediabox;
120 fz_matrix ctm;
121
122 fz_rect word_bbox;
123 fz_font *font;
124
125 /* Current word */
126 int char_max;
127 int char_len;
128 int *chars;
129
130 /* Entire page */
131 int words_max;
132 int words_len;
133 word_record **words;
134
135 char *language;
136 char *datadir;
137 } fz_ocr_device;
138
139 static void
140 fz_ocr_fill_path(fz_context *ctx, fz_device *dev, const fz_path *path, int even_odd, fz_matrix ctm,
141 fz_colorspace *colorspace, const float *color, float alpha, fz_color_params color_params)
142 {
143 fz_ocr_device *ocr = (fz_ocr_device *)dev;
144
145 fz_fill_path(ctx, ocr->list_dev, path, even_odd, ctm, colorspace, color, alpha, color_params);
146 fz_fill_path(ctx, ocr->draw_dev, path, even_odd, ctm, colorspace, color, alpha, color_params);
147 }
148
149 static void
150 fz_ocr_stroke_path(fz_context *ctx, fz_device *dev, const fz_path *path, const fz_stroke_state *stroke,
151 fz_matrix ctm, fz_colorspace *colorspace, const float *color, float alpha, fz_color_params color_params)
152 {
153 fz_ocr_device *ocr = (fz_ocr_device *)dev;
154
155 fz_stroke_path(ctx, ocr->list_dev, path, stroke, ctm, colorspace, color, alpha, color_params);
156 fz_stroke_path(ctx, ocr->draw_dev, path, stroke, ctm, colorspace, color, alpha, color_params);
157 }
158
159 static void
160 fz_ocr_fill_text(fz_context *ctx, fz_device *dev, const fz_text *text, fz_matrix ctm,
161 fz_colorspace *colorspace, const float *color, float alpha, fz_color_params color_params)
162 {
163 fz_ocr_device *ocr = (fz_ocr_device *)dev;
164
165 if (ocr->list_dev != ocr->target)
166 fz_fill_text(ctx, ocr->list_dev, text, ctm, colorspace, color, alpha, color_params);
167 fz_fill_text(ctx, ocr->draw_dev, text, ctm, colorspace, color, alpha, color_params);
168 }
169
170 static void
171 fz_ocr_stroke_text(fz_context *ctx, fz_device *dev, const fz_text *text, const fz_stroke_state *stroke,
172 fz_matrix ctm, fz_colorspace *colorspace, const float *color, float alpha, fz_color_params color_params)
173 {
174 fz_ocr_device *ocr = (fz_ocr_device *)dev;
175
176 if (ocr->list_dev != ocr->target)
177 fz_stroke_text(ctx, ocr->list_dev, text, stroke, ctm, colorspace, color, alpha, color_params);
178 fz_stroke_text(ctx, ocr->draw_dev, text, stroke, ctm, colorspace, color, alpha, color_params);
179 }
180
181 static void
182 fz_ocr_fill_shade(fz_context *ctx, fz_device *dev, fz_shade *shade, fz_matrix ctm, float alpha, fz_color_params color_params)
183 {
184 fz_ocr_device *ocr = (fz_ocr_device *)dev;
185
186 fz_fill_shade(ctx, ocr->list_dev, shade, ctm, alpha, color_params);
187 fz_fill_shade(ctx, ocr->draw_dev, shade, ctm, alpha, color_params);
188 }
189
190 static void
191 fz_ocr_fill_image(fz_context *ctx, fz_device *dev, fz_image *image, fz_matrix ctm, float alpha, fz_color_params color_params)
192 {
193 fz_ocr_device *ocr = (fz_ocr_device *)dev;
194
195 fz_fill_image(ctx, ocr->list_dev, image, ctm, alpha, color_params);
196 fz_fill_image(ctx, ocr->draw_dev, image, ctm, alpha, color_params);
197 }
198
199 static void
200 fz_ocr_fill_image_mask(fz_context *ctx, fz_device *dev, fz_image *image, fz_matrix ctm,
201 fz_colorspace *colorspace, const float *color, float alpha, fz_color_params color_params)
202 {
203 fz_ocr_device *ocr = (fz_ocr_device *)dev;
204
205 fz_fill_image_mask(ctx, ocr->list_dev, image, ctm, colorspace, color, alpha, color_params);
206 fz_fill_image_mask(ctx, ocr->draw_dev, image, ctm, colorspace, color, alpha, color_params);
207 }
208
209 static void
210 fz_ocr_clip_path(fz_context *ctx, fz_device *dev, const fz_path *path, int even_odd, fz_matrix ctm, fz_rect scissor)
211 {
212 fz_ocr_device *ocr = (fz_ocr_device *)dev;
213
214 fz_clip_path(ctx, ocr->list_dev, path, even_odd, ctm, scissor);
215 fz_clip_path(ctx, ocr->draw_dev, path, even_odd, ctm, scissor);
216 }
217
218 static void
219 fz_ocr_clip_stroke_path(fz_context *ctx, fz_device *dev, const fz_path *path, const fz_stroke_state *stroke, fz_matrix ctm, fz_rect scissor)
220 {
221 fz_ocr_device *ocr = (fz_ocr_device *)dev;
222
223 fz_clip_stroke_path(ctx, ocr->list_dev, path, stroke, ctm, scissor);
224 fz_clip_stroke_path(ctx, ocr->draw_dev, path, stroke, ctm, scissor);
225 }
226
227 static void
228 fz_ocr_clip_text(fz_context *ctx, fz_device *dev, const fz_text *text, fz_matrix ctm, fz_rect scissor)
229 {
230 fz_ocr_device *ocr = (fz_ocr_device *)dev;
231
232 if (ocr->list_dev != ocr->target)
233 fz_clip_text(ctx, ocr->list_dev, text, ctm, scissor);
234 fz_clip_text(ctx, ocr->draw_dev, text, ctm, scissor);
235 }
236
237 static void
238 fz_ocr_clip_stroke_text(fz_context *ctx, fz_device *dev, const fz_text *text, const fz_stroke_state *stroke, fz_matrix ctm, fz_rect scissor)
239 {
240 fz_ocr_device *ocr = (fz_ocr_device *)dev;
241
242 if (ocr->list_dev != ocr->target)
243 fz_clip_stroke_text(ctx, ocr->list_dev, text, stroke, ctm, scissor);
244 fz_clip_stroke_text(ctx, ocr->draw_dev, text, stroke, ctm, scissor);
245 }
246
247 static void
248 fz_ocr_ignore_text(fz_context *ctx, fz_device *dev, const fz_text *text, fz_matrix ctm)
249 {
250 fz_ocr_device *ocr = (fz_ocr_device *)dev;
251
252 /* Ignore text is generally used when text has been sent as
253 * part of other graphics - such as line art or images. As such
254 * we'll pick up the 'true' unicode values of such text in the
255 * OCR phase. We therefore send text to the list device (so
256 * it can be rewritten), but not direct to the target. */
257 if (ocr->list_dev != ocr->target)
258 fz_ignore_text(ctx, ocr->list_dev, text, ctm);
259 fz_ignore_text(ctx, ocr->draw_dev, text, ctm);
260 }
261
262 static void
263 fz_ocr_clip_image_mask(fz_context *ctx, fz_device *dev, fz_image *image, fz_matrix ctm, fz_rect scissor)
264 {
265 fz_ocr_device *ocr = (fz_ocr_device *)dev;
266
267 fz_clip_image_mask(ctx, ocr->list_dev, image, ctm, scissor);
268 fz_clip_image_mask(ctx, ocr->draw_dev, image, ctm, scissor);
269 }
270
271 static void
272 fz_ocr_pop_clip(fz_context *ctx, fz_device *dev)
273 {
274 fz_ocr_device *ocr = (fz_ocr_device *)dev;
275
276 fz_pop_clip(ctx, ocr->list_dev);
277 fz_pop_clip(ctx, ocr->draw_dev);
278 }
279
280 static void
281 fz_ocr_begin_mask(fz_context *ctx, fz_device *dev, fz_rect rect, int luminosity, fz_colorspace *colorspace, const float *color, fz_color_params color_params)
282 {
283 fz_ocr_device *ocr = (fz_ocr_device *)dev;
284
285 fz_begin_mask(ctx, ocr->list_dev, rect, luminosity, colorspace, color, color_params);
286 fz_begin_mask(ctx, ocr->draw_dev, rect, luminosity, colorspace, color, color_params);
287 }
288
289 static void
290 fz_ocr_end_mask(fz_context *ctx, fz_device *dev, fz_function *tr)
291 {
292 fz_ocr_device *ocr = (fz_ocr_device *)dev;
293
294 fz_end_mask_tr(ctx, ocr->list_dev, tr);
295 fz_end_mask_tr(ctx, ocr->draw_dev, tr);
296 }
297
298 static void
299 fz_ocr_begin_group(fz_context *ctx, fz_device *dev, fz_rect rect, fz_colorspace *cs, int isolated, int knockout, int blendmode, float alpha)
300 {
301 fz_ocr_device *ocr = (fz_ocr_device *)dev;
302
303 fz_begin_group(ctx, ocr->list_dev, rect, cs, isolated, knockout, blendmode, alpha);
304 fz_begin_group(ctx, ocr->draw_dev, rect, cs, isolated, knockout, blendmode, alpha);
305 }
306
307 static void
308 fz_ocr_end_group(fz_context *ctx, fz_device *dev)
309 {
310 fz_ocr_device *ocr = (fz_ocr_device *)dev;
311
312 fz_end_group(ctx, ocr->list_dev);
313 fz_end_group(ctx, ocr->draw_dev);
314 }
315
316 static int
317 fz_ocr_begin_tile(fz_context *ctx, fz_device *dev, fz_rect area, fz_rect view, float xstep, float ystep, fz_matrix ctm, int id, int doc_id)
318 {
319 fz_ocr_device *ocr = (fz_ocr_device *)dev;
320
321 /* Always pass 0 as tile id here so that neither device can
322 * disagree about whether the contents need to be sent. */
323 (void)fz_begin_tile_tid(ctx, ocr->list_dev, area, view, xstep, ystep, ctm, 0, 0);
324 (void)fz_begin_tile_tid(ctx, ocr->draw_dev, area, view, xstep, ystep, ctm, 0, 0);
325
326 return 0;
327 }
328
329 static void
330 fz_ocr_end_tile(fz_context *ctx, fz_device *dev)
331 {
332 fz_ocr_device *ocr = (fz_ocr_device *)dev;
333
334 fz_end_tile(ctx, ocr->list_dev);
335 fz_end_tile(ctx, ocr->draw_dev);
336 }
337
338 static void
339 fz_ocr_render_flags(fz_context *ctx, fz_device *dev, int set, int clear)
340 {
341 fz_ocr_device *ocr = (fz_ocr_device *)dev;
342
343 fz_render_flags(ctx, ocr->list_dev, set, clear);
344 fz_render_flags(ctx, ocr->draw_dev, set, clear);
345 }
346
347 static void
348 fz_ocr_set_default_colorspaces(fz_context *ctx, fz_device *dev, fz_default_colorspaces *cs)
349 {
350 fz_ocr_device *ocr = (fz_ocr_device *)dev;
351
352 fz_set_default_colorspaces(ctx, ocr->list_dev, cs);
353 fz_set_default_colorspaces(ctx, ocr->draw_dev, cs);
354 }
355
356 static void
357 fz_ocr_begin_layer(fz_context *ctx, fz_device *dev, const char *layer_name)
358 {
359 fz_ocr_device *ocr = (fz_ocr_device *)dev;
360
361 fz_begin_layer(ctx, ocr->list_dev, layer_name);
362 fz_begin_layer(ctx, ocr->draw_dev, layer_name);
363 }
364
365 static void
366 fz_ocr_end_layer(fz_context *ctx, fz_device *dev)
367 {
368 fz_ocr_device *ocr = (fz_ocr_device *)dev;
369
370 fz_end_layer(ctx, ocr->list_dev);
371 fz_end_layer(ctx, ocr->draw_dev);
372 }
373
374 static void
375 drop_ocr_device(fz_context *ctx, fz_ocr_device *ocr)
376 {
377 int i;
378
379 if (ocr == NULL)
380 return;
381
382 if (ocr->list_dev != ocr->target)
383 fz_drop_device(ctx, ocr->list_dev);
384 fz_drop_display_list(ctx, ocr->list);
385 fz_drop_device(ctx, ocr->draw_dev);
386 fz_drop_pixmap(ctx, ocr->pixmap);
387 for (i = 0; i < ocr->words_len; i++)
388 fz_free(ctx, ocr->words[i]);
389 fz_free(ctx, ocr->words);
390 fz_free(ctx, ocr->chars);
391 fz_free(ctx, ocr->language);
392 fz_free(ctx, ocr->datadir);
393 }
394
395 static void
396 flush_word(fz_context *ctx, fz_ocr_device *ocr)
397 {
398 float color = 1;
399 fz_color_params params = { 0 };
400 int i;
401 fz_text *text = NULL;
402 fz_matrix trm;
403 float step;
404 fz_rect char_bbox;
405
406 if (ocr->char_len == 0)
407 return;
408
409 /* If we're not sending direct to the target device, then insert
410 * all the chars we've found into a table so we can rewrite
411 * the text objects that come from the list device on the fly.
412 */
413 if (ocr->list_dev != ocr->target)
414 {
415 word_record *word;
416
417 if (ocr->words_len == ocr->words_max)
418 {
419 int new_max = ocr->words_max * 2;
420 if (new_max == 0)
421 new_max = 32;
422 ocr->words = fz_realloc_array(ctx, ocr->words, new_max, word_record *);
423 ocr->words_max = new_max;
424 }
425 word = fz_malloc_flexible(ctx, word_record, unicode, ocr->char_len);
426 word->len = ocr->char_len;
427 word->bbox = ocr->word_bbox;
428 word->n = 0;
429 memcpy(word->unicode, ocr->chars, ocr->char_len * sizeof(int));
430 ocr->words[ocr->words_len++] = word;
431 ocr->char_len = 0;
432 return;
433 }
434 /* FIXME: Look at font-name. */
435 /* All this is a bit horrid, because the detection of sizes for
436 * the glyphs depends on the width of the glyphs. Use Courier
437 * because it's monospaced. */
438 if (ocr->font == NULL)
439 ocr->font = fz_new_base14_font(ctx, "Courier");
440
441 fz_var(text);
442
443 fz_try(ctx)
444 {
445 text = fz_new_text(ctx);
446
447 /* Divide the word box into equal lengths. */
448 /* This falls down when we have words with chars of
449 * different widths in, but it's acceptable for these
450 * purposes. */
451 /* FIXME: This assumes L2R motion of text. */
452 step = (ocr->word_bbox.x1 - ocr->word_bbox.x0) / ocr->char_len;
453 char_bbox.x1 = ocr->word_bbox.x0;
454 char_bbox.y0 = ocr->word_bbox.y0;
455 char_bbox.y1 = ocr->word_bbox.y1;
456 for (i = 0; i < ocr->char_len; i++)
457 {
458 char_bbox.x0 = char_bbox.x1;
459 char_bbox.x1 += step;
460 /* Horrid constants that happen to work with Courier. */
461 trm.a = 10.0f/6 * (char_bbox.x1 - char_bbox.x0);
462 trm.b = 0;
463 trm.c = 0;
464 trm.d = 10.0f/6 * (char_bbox.y1 - char_bbox.y0);
465 trm.e = char_bbox.x0;
466 trm.f = char_bbox.y0;
467 fz_show_glyph(ctx, text, ocr->font, trm,
468 fz_encode_character(ctx, ocr->font, ocr->chars[i]), ocr->chars[i],
469 0, 0, FZ_BIDI_LTR, 0);
470 }
471
472 fz_fill_text(ctx, ocr->target, text, fz_identity,
473 fz_device_gray(ctx), &color, 1, params);
474 }
475 fz_always(ctx)
476 {
477 fz_drop_text(ctx, text);
478 }
479 fz_catch(ctx)
480 fz_rethrow(ctx);
481
482 ocr->char_len = 0;
483 }
484
485 static void
486 char_callback(fz_context *ctx, void *arg, int unicode,
487 const char *font_name,
488 const int *line_bbox, const int *word_bbox,
489 const int *char_bbox, int pointsize)
490 {
491 fz_ocr_device *ocr = (fz_ocr_device *)arg;
492 fz_rect bbox = { word_bbox[0]-1, word_bbox[1]-1, word_bbox[2]+1, word_bbox[3]+1 };
493
494 if (bbox.x0 != ocr->word_bbox.x0 ||
495 bbox.y0 != ocr->word_bbox.y0 ||
496 bbox.x1 != ocr->word_bbox.x1 ||
497 bbox.y1 != ocr->word_bbox.y1)
498 {
499 flush_word(ctx, ocr);
500 ocr->word_bbox = bbox;
501 }
502
503 if (ocr->char_max == ocr->char_len)
504 {
505 int new_max = ocr->char_max * 2;
506 if (new_max == 0)
507 new_max = 32;
508 ocr->chars = fz_realloc_array(ctx, ocr->chars, new_max, int);
509 ocr->char_max = new_max;
510 }
511
512 ocr->chars[ocr->char_len++] = unicode;
513 }
514
515
516 typedef struct
517 {
518 fz_device super;
519
520 fz_device *target;
521 int words_len;
522 word_record **words;
523 int current;
524 } fz_rewrite_device;
525
526 static fz_text_span *
527 fz_clone_text_span(fz_context *ctx, const fz_text_span *span)
528 {
529 fz_text_span *cspan;
530
531 if (span == NULL)
532 return NULL;
533
534 cspan = fz_malloc_struct(ctx, fz_text_span);
535 *cspan = *span;
536 cspan->cap = cspan->len;
537 cspan->items = fz_calloc_no_throw(ctx, cspan->len, sizeof(*cspan->items));
538 if (cspan->items == NULL)
539 {
540 fz_free(ctx, cspan);
541 errno = ENOMEM;
542 fz_throw(ctx, FZ_ERROR_SYSTEM, "calloc (%zu x %zu bytes) failed", (size_t)cspan->len, sizeof(*cspan->items));
543 }
544 memcpy(cspan->items, span->items, sizeof(*cspan->items) * cspan->len);
545 fz_keep_font(ctx, cspan->font);
546
547 return cspan;
548 }
549
#ifdef DEBUG_OCR
/* Print one code point: printable ASCII as itself, anything else as <hex>. */
static void
debug_word_char(fz_context *ctx, int unicode)
{
	if (unicode >= 32 && unicode < 127)
		fz_write_printf(ctx, fz_stdout(ctx), "%c", unicode);
	else
		fz_write_printf(ctx, fz_stdout(ctx), "<%04x>", unicode);
}

/*
	Dump a word record to stdout: its box, then its characters. The
	first character not yet consumed (index word->n) is wrapped in
	braces so the read position is visible.
*/
static void
debug_word(fz_context *ctx, word_record *word)
{
	int k;

	fz_write_printf(ctx, fz_stdout(ctx), " %g %g %g %g:",
		word->bbox.x0,
		word->bbox.y0,
		word->bbox.x1,
		word->bbox.y1);

	/* Characters already handed out. */
	for (k = 0; k < word->n; k++)
		debug_word_char(ctx, word->unicode[k]);

	if (word->n < word->len)
	{
		/* The next character to be handed out, in braces. */
		int next = word->unicode[k];

		if (next >= 32 && next < 127)
			fz_write_printf(ctx, fz_stdout(ctx), "{%c}", next);
		else
			fz_write_printf(ctx, fz_stdout(ctx), "{<%04x>}", next);

		/* Whatever remains after it. */
		for (k++; k < word->len; k++)
			debug_word_char(ctx, word->unicode[k]);
	}
	fz_write_printf(ctx, fz_stdout(ctx), "\n");
}
#endif
589
590 static void
591 rewrite_char(fz_context *ctx, fz_rewrite_device *dev, fz_matrix ctm, fz_text_item *item, fz_point vadv)
592 {
593 int i, start;
594 fz_point p = { item->x, item->y };
595
596 /* No point in trying to rewrite spaces! */
597 if (item->ucs == 32)
598 return;
599
600 p = fz_transform_point(p, ctm);
601 p.x += vadv.x/2;
602 p.y += vadv.y/2;
603
604 #ifdef DEBUG_OCR
605 fz_write_printf(ctx, fz_stdout(ctx), "Looking for '%c' at %g %g\n", item->ucs, p.x, p.y);
606 #endif
607
608 start = dev->current;
609 for (i = start; i < dev->words_len; i++)
610 {
611 #ifdef DEBUG_OCR
612 debug_word(ctx, dev->words[i]);
613 #endif
614 if (dev->words[i]->n >= dev->words[i]->len)
615 continue;
616 if (dev->words[i]->bbox.x0 <= p.x &&
617 dev->words[i]->bbox.x1 >= p.x &&
618 dev->words[i]->bbox.y0 <= p.y &&
619 dev->words[i]->bbox.y1 >= p.y)
620 {
621 item->ucs = dev->words[i]->unicode[dev->words[i]->n++];
622 dev->current = i;
623 return;
624 }
625 }
626 for (i = 0; i < start; i++)
627 {
628 #ifdef DEBUG_OCR
629 debug_word(ctx, dev->words[i]);
630 #endif
631 if (dev->words[i]->n >= dev->words[i]->len)
632 continue;
633 if (dev->words[i]->bbox.x0 <= p.x &&
634 dev->words[i]->bbox.x1 >= p.x &&
635 dev->words[i]->bbox.y0 <= p.y &&
636 dev->words[i]->bbox.y1 >= p.y)
637 {
638 item->ucs = dev->words[i]->unicode[dev->words[i]->n++];
639 dev->current = i;
640 return;
641 }
642 }
643 }
644
645 static fz_text_span *
646 rewrite_span(fz_context *ctx, fz_rewrite_device *dev, fz_matrix ctm, const fz_text_span *span)
647 {
648 fz_text_span *rspan = fz_clone_text_span(ctx, span);
649 int wmode = span->wmode;
650 int i;
651 fz_point dir;
652 fz_matrix trm = span->trm;
653
654 trm.e = 0;
655 trm.f = 0;
656 trm = fz_concat(trm, ctm);
657
658 if (wmode == 0)
659 {
660 dir.x = 1;
661 dir.y = 0;
662 }
663 else
664 {
665 dir.x = 0;
666 dir.y = -1;
667 }
668 dir = fz_transform_vector(dir, trm);
669
670 /* And do the actual rewriting */
671 for (i = 0; i < rspan->len; i++) {
672 float advance = rspan->items[i].adv;
673 fz_point vadv = { dir.x * advance, dir.y * advance };
674 rewrite_char(ctx, dev, ctm, &rspan->items[i], vadv);
675 }
676
677 return rspan;
678 }
679
680 static fz_text *
681 rewrite_text(fz_context *ctx, fz_rewrite_device *dev, fz_matrix ctm, const fz_text *text)
682 {
683 fz_text *rtext = fz_new_text(ctx);
684 fz_text_span *span = text->head;
685 fz_text_span **dspan = &rtext->head;
686
687 fz_try(ctx)
688 {
689 while (span)
690 {
691 *dspan = rewrite_span(ctx, dev, ctm, span);
692 rtext->tail = *dspan;
693 dspan = &(*dspan)->next;
694 span = span->next;
695 }
696 }
697 fz_catch(ctx)
698 {
699 fz_drop_text(ctx, rtext);
700 fz_rethrow(ctx);
701 }
702
703 return rtext;
704 }
705
706 static void
707 rewrite_fill_path(fz_context *ctx, fz_device *dev, const fz_path *path, int even_odd, fz_matrix ctm, fz_colorspace *cs, const float *color, float alpha, fz_color_params params)
708 {
709 fz_rewrite_device *rewrite = (fz_rewrite_device *)dev;
710
711 fz_fill_path(ctx, rewrite->target, path, even_odd, ctm, cs, color, alpha, params);
712 }
713
714 static void
715 rewrite_stroke_path(fz_context *ctx, fz_device *dev, const fz_path *path, const fz_stroke_state *stroke, fz_matrix ctm, fz_colorspace *cs, const float *color, float alpha, fz_color_params params)
716 {
717 fz_rewrite_device *rewrite = (fz_rewrite_device *)dev;
718
719 fz_stroke_path(ctx, rewrite->target, path, stroke, ctm, cs, color, alpha, params);
720 }
721
722 static void
723 rewrite_clip_path(fz_context *ctx, fz_device *dev, const fz_path *path, int even_odd, fz_matrix ctm, fz_rect scissor)
724 {
725 fz_rewrite_device *rewrite = (fz_rewrite_device *)dev;
726
727 fz_clip_path(ctx, rewrite->target, path, even_odd, ctm, scissor);
728 }
729
730 static void
731 rewrite_clip_stroke_path(fz_context *ctx, fz_device *dev, const fz_path *path, const fz_stroke_state *stroke, fz_matrix ctm, fz_rect scissor)
732 {
733 fz_rewrite_device *rewrite = (fz_rewrite_device *)dev;
734
735 fz_clip_stroke_path(ctx, rewrite->target, path, stroke, ctm, scissor);
736 }
737
738 static void
739 rewrite_fill_text(fz_context *ctx, fz_device *dev, const fz_text *text, fz_matrix ctm, fz_colorspace *cs, const float *color, float alpha, fz_color_params params)
740 {
741 fz_rewrite_device *rewrite = (fz_rewrite_device *)dev;
742 fz_text *rtext = rewrite_text(ctx, rewrite, ctm, text);
743
744 fz_try(ctx)
745 fz_fill_text(ctx, rewrite->target, rtext, ctm, cs, color, alpha, params);
746 fz_always(ctx)
747 fz_drop_text(ctx, rtext);
748 fz_catch(ctx)
749 fz_rethrow(ctx);
750 }
751
752 static void
753 rewrite_stroke_text(fz_context *ctx, fz_device *dev, const fz_text *text, const fz_stroke_state *stroke, fz_matrix ctm, fz_colorspace *cs, const float *color, float alpha, fz_color_params params)
754 {
755 fz_rewrite_device *rewrite = (fz_rewrite_device *)dev;
756 fz_text *rtext = rewrite_text(ctx, rewrite, ctm, text);
757
758 fz_try(ctx)
759 fz_stroke_text(ctx, rewrite->target, rtext, stroke, ctm, cs, color, alpha, params);
760 fz_always(ctx)
761 fz_drop_text(ctx, rtext);
762 fz_catch(ctx)
763 fz_rethrow(ctx);
764 }
765
766 static void
767 rewrite_clip_text(fz_context *ctx, fz_device *dev, const fz_text *text, fz_matrix ctm, fz_rect scissor)
768 {
769 fz_rewrite_device *rewrite = (fz_rewrite_device *)dev;
770 fz_text *rtext = rewrite_text(ctx, rewrite, ctm, text);
771
772 fz_try(ctx)
773 fz_clip_text(ctx, rewrite->target, rtext, ctm, scissor);
774 fz_always(ctx)
775 fz_drop_text(ctx, rtext);
776 fz_catch(ctx)
777 fz_rethrow(ctx);
778 }
779
780 static void
781 rewrite_clip_stroke_text(fz_context *ctx, fz_device *dev, const fz_text *text, const fz_stroke_state *stroke, fz_matrix ctm, fz_rect scissor)
782 {
783 fz_rewrite_device *rewrite = (fz_rewrite_device *)dev;
784 fz_text *rtext = rewrite_text(ctx, rewrite, ctm, text);
785
786 fz_try(ctx)
787 fz_clip_stroke_text(ctx, rewrite->target, rtext, stroke, ctm, scissor);
788 fz_always(ctx)
789 fz_drop_text(ctx, rtext);
790 fz_catch(ctx)
791 fz_rethrow(ctx);
792 }
793
794 static void
795 rewrite_ignore_text(fz_context *ctx, fz_device *dev, const fz_text *text, fz_matrix ctm)
796 {
797 fz_rewrite_device *rewrite = (fz_rewrite_device *)dev;
798 fz_text *rtext = rewrite_text(ctx, rewrite, ctm, text);
799
800 fz_try(ctx)
801 fz_ignore_text(ctx, rewrite->target, rtext, ctm);
802 fz_always(ctx)
803 fz_drop_text(ctx, rtext);
804 fz_catch(ctx)
805 fz_rethrow(ctx);
806 }
807
808 static void
809 rewrite_fill_shade(fz_context *ctx, fz_device *dev, fz_shade *shd, fz_matrix ctm, float alpha, fz_color_params color_params)
810 {
811 fz_rewrite_device *rewrite = (fz_rewrite_device *)dev;
812
813 fz_fill_shade(ctx, rewrite->target, shd, ctm, alpha, color_params);
814 }
815
816 static void
817 rewrite_fill_image(fz_context *ctx, fz_device *dev, fz_image *img, fz_matrix ctm, float alpha, fz_color_params color_params)
818 {
819 fz_rewrite_device *rewrite = (fz_rewrite_device *)dev;
820
821 fz_fill_image(ctx, rewrite->target, img, ctm, alpha, color_params);
822 }
823
824 static void
825 rewrite_fill_image_mask(fz_context *ctx, fz_device *dev, fz_image *img, fz_matrix ctm, fz_colorspace *cs, const float *color, float alpha, fz_color_params color_params)
826 {
827 fz_rewrite_device *rewrite = (fz_rewrite_device *)dev;
828
829 fz_fill_image_mask(ctx, rewrite->target, img, ctm, cs, color, alpha, color_params);
830 }
831
832 static void
833 rewrite_clip_image_mask(fz_context *ctx, fz_device *dev, fz_image *img, fz_matrix ctm, fz_rect scissor)
834 {
835 fz_rewrite_device *rewrite = (fz_rewrite_device *)dev;
836
837 fz_clip_image_mask(ctx, rewrite->target, img, ctm, scissor);
838 }
839
840 static void
841 rewrite_pop_clip(fz_context *ctx, fz_device *dev)
842 {
843 fz_rewrite_device *rewrite = (fz_rewrite_device *)dev;
844
845 fz_pop_clip(ctx, rewrite->target);
846 }
847
848 static void
849 rewrite_begin_mask(fz_context *ctx, fz_device *dev, fz_rect area, int luminosity, fz_colorspace *cs, const float *bc, fz_color_params params)
850 {
851 fz_rewrite_device *rewrite = (fz_rewrite_device *)dev;
852
853 fz_begin_mask(ctx, rewrite->target, area, luminosity, cs, bc, params);
854 }
855
856 static void
857 rewrite_end_mask(fz_context *ctx, fz_device *dev, fz_function *tr)
858 {
859 fz_rewrite_device *rewrite = (fz_rewrite_device *)dev;
860
861 fz_end_mask_tr(ctx, rewrite->target, tr);
862 }
863
864 static void
865 rewrite_begin_group(fz_context *ctx, fz_device *dev, fz_rect area, fz_colorspace *cs, int isolated, int knockout, int blendmode, float alpha)
866 {
867 fz_rewrite_device *rewrite = (fz_rewrite_device *)dev;
868
869 fz_begin_group(ctx, rewrite->target, area, cs, isolated, knockout, blendmode, alpha);
870 }
871
872 static void
873 rewrite_end_group(fz_context *ctx, fz_device *dev)
874 {
875 fz_rewrite_device *rewrite = (fz_rewrite_device *)dev;
876
877 fz_end_group(ctx, rewrite->target);
878 }
879
880 static int
881 rewrite_begin_tile(fz_context *ctx, fz_device *dev, fz_rect area, fz_rect view, float xstep, float ystep, fz_matrix ctm, int id, int doc_id)
882 {
883 fz_rewrite_device *rewrite = (fz_rewrite_device *)dev;
884
885 return fz_begin_tile_tid(ctx, rewrite->target, area, view, xstep, ystep, ctm, id, doc_id);
886 }
887
888 static void
889 rewrite_end_tile(fz_context *ctx, fz_device *dev)
890 {
891 fz_rewrite_device *rewrite = (fz_rewrite_device *)dev;
892
893 fz_end_tile(ctx, rewrite->target);
894 }
895
896 static void
897 rewrite_render_flags(fz_context *ctx, fz_device *dev, int set, int clear)
898 {
899 fz_rewrite_device *rewrite = (fz_rewrite_device *)dev;
900
901 fz_render_flags(ctx, rewrite->target, set, clear);
902 }
903
904 static void
905 rewrite_set_default_colorspaces(fz_context *ctx, fz_device *dev, fz_default_colorspaces *cs)
906 {
907 fz_rewrite_device *rewrite = (fz_rewrite_device *)dev;
908
909 fz_set_default_colorspaces(ctx, rewrite->target, cs);
910 }
911
912 static void
913 rewrite_begin_layer(fz_context *ctx, fz_device *dev, const char *layer_name)
914 {
915 fz_rewrite_device *rewrite = (fz_rewrite_device *)dev;
916
917 fz_begin_layer(ctx, rewrite->target, layer_name);
918 }
919
920 static void
921 rewrite_end_layer(fz_context *ctx, fz_device *dev)
922 {
923 fz_rewrite_device *rewrite = (fz_rewrite_device *)dev;
924
925 fz_end_layer(ctx, rewrite->target);
926 }
927
928 static void
929 rewrite_close(fz_context *ctx, fz_device *dev)
930 {
931 fz_rewrite_device *rewrite = (fz_rewrite_device *)dev;
932 fz_font *font;
933 fz_text *text = NULL;
934 fz_matrix trm;
935 int i, j;
936
937 /* All this is a bit horrid, because the detection of sizes for
938 * the glyphs depends on the width of the glyphs. Use Courier
939 * because it's monospaced. */
940 font = fz_new_base14_font(ctx, "Courier");
941
942 fz_var(text);
943
944 fz_try(ctx)
945 {
946 text = fz_new_text(ctx);
947
948 for (i = 0; i < rewrite->words_len; i++)
949 {
950 word_record *word = rewrite->words[i];
951 fz_rect char_bbox;
952 float step;
953
954 if (word->n >= word->len)
955 continue;
956 step = (word->bbox.x1 - word->bbox.x0) / word->len;
957 char_bbox.x1 = word->bbox.x0;
958 char_bbox.y0 = word->bbox.y0;
959 char_bbox.y1 = word->bbox.y1;
960 for (j = 0; j < word->len; j++)
961 {
962 char_bbox.x0 = char_bbox.x1;
963 char_bbox.x1 += step;
964 /* Horrid constants that happen to work with Courier. */
965 trm.a = 10.0f/6 * (char_bbox.x1 - char_bbox.x0);
966 trm.b = 0;
967 trm.c = 0;
968 trm.d = (char_bbox.y1 - char_bbox.y0);
969 trm.e = char_bbox.x0;
970 trm.f = char_bbox.y0;
971 fz_show_glyph(ctx, text, font, trm,
972 word->unicode[j], word->unicode[j],
973 0, 0, FZ_BIDI_LTR, 0);
974 }
975 }
976
977 fz_ignore_text(ctx, rewrite->target, text, fz_identity);
978 }
979 fz_always(ctx)
980 {
981 fz_drop_text(ctx, text);
982 fz_drop_font(ctx, font);
983 }
984 fz_catch(ctx)
985 fz_rethrow(ctx);
986 }
987
988 static fz_device *
989 new_rewrite_device(fz_context *ctx, fz_device *target, word_record **words, int words_len)
990 {
991 fz_rewrite_device *rewrite;
992
993 rewrite = fz_new_derived_device(ctx, fz_rewrite_device);
994
995 rewrite->super.close_device = rewrite_close;
996
997 rewrite->super.fill_path = rewrite_fill_path;
998 rewrite->super.stroke_path = rewrite_stroke_path;
999 rewrite->super.clip_path = rewrite_clip_path;
1000 rewrite->super.clip_stroke_path = rewrite_clip_stroke_path;
1001
1002 rewrite->super.fill_text = rewrite_fill_text;
1003 rewrite->super.stroke_text = rewrite_stroke_text;
1004 rewrite->super.clip_text = rewrite_clip_text;
1005 rewrite->super.clip_stroke_text = rewrite_clip_stroke_text;
1006 rewrite->super.ignore_text = rewrite_ignore_text;
1007
1008 rewrite->super.fill_shade = rewrite_fill_shade;
1009 rewrite->super.fill_image = rewrite_fill_image;
1010 rewrite->super.fill_image_mask = rewrite_fill_image_mask;
1011 rewrite->super.clip_image_mask = rewrite_clip_image_mask;
1012
1013 rewrite->super.pop_clip = rewrite_pop_clip;
1014
1015 rewrite->super.begin_mask = rewrite_begin_mask;
1016 rewrite->super.end_mask = rewrite_end_mask;
1017 rewrite->super.begin_group = rewrite_begin_group;
1018 rewrite->super.end_group = rewrite_end_group;
1019
1020 rewrite->super.begin_tile = rewrite_begin_tile;
1021 rewrite->super.end_tile = rewrite_end_tile;
1022
1023 rewrite->super.render_flags = rewrite_render_flags;
1024 rewrite->super.set_default_colorspaces = rewrite_set_default_colorspaces;
1025
1026 rewrite->super.begin_layer = rewrite_begin_layer;
1027 rewrite->super.end_layer = rewrite_end_layer;
1028
1029 rewrite->target = target;
1030 rewrite->words = words;
1031 rewrite->words_len = words_len;
1032 rewrite->current = 0;
1033
1034 return &rewrite->super;
1035 }
1036
1037 static int
1038 fz_ocr_progress(fz_context *ctx, void *arg, int prog)
1039 {
1040 fz_ocr_device *ocr = (fz_ocr_device *)arg;
1041
1042 if (ocr->progress == NULL)
1043 return 0;
1044
1045 return ocr->progress(ctx, ocr->progress_arg, prog);
1046 }
1047
1048 static void
1049 fz_ocr_close_device(fz_context *ctx, fz_device *dev)
1050 {
1051 fz_ocr_device *ocr = (fz_ocr_device *)dev;
1052 void *tessapi;
1053 fz_device *rewrite_device;
1054 fz_rect bbox;
1055
1056 fz_close_device(ctx, ocr->draw_dev);
1057
1058 /* Now run the OCR */
1059 tessapi = ocr_init(ctx, ocr->language, ocr->datadir);
1060
1061 fz_try(ctx)
1062 {
1063 ocr_recognise(ctx, tessapi, ocr->pixmap, char_callback, &fz_ocr_progress, ocr);
1064 flush_word(ctx, ocr);
1065 }
1066 fz_always(ctx)
1067 ocr_fin(ctx, tessapi);
1068 fz_catch(ctx)
1069 fz_rethrow(ctx);
1070
1071 /* If we're not using a list, we're done! */
1072 if (ocr->list_dev == ocr->target)
1073 return;
1074
1075 fz_close_device(ctx, ocr->list_dev);
1076
1077 bbox = fz_transform_rect(ocr->mediabox, ocr->ctm);
1078 rewrite_device = new_rewrite_device(ctx, ocr->target, ocr->words, ocr->words_len);
1079 fz_try(ctx)
1080 {
1081 fz_run_display_list(ctx, ocr->list, rewrite_device,
1082 fz_identity, bbox, NULL);
1083 }
1084 fz_always(ctx)
1085 {
1086 fz_close_device(ctx, rewrite_device);
1087 fz_drop_device(ctx, rewrite_device);
1088 }
1089 fz_catch(ctx)
1090 fz_rethrow(ctx);
1091 }
1092
1093 static void
1094 fz_ocr_drop_device(fz_context *ctx, fz_device *dev)
1095 {
1096 drop_ocr_device(ctx, (fz_ocr_device *)dev);
1097 }
1098 #endif
1099
1100 fz_device *
1101 fz_new_ocr_device(fz_context *ctx,
1102 fz_device *target,
1103 fz_matrix ctm,
1104 fz_rect mediabox,
1105 int with_list,
1106 const char *language,
1107 const char *datadir,
1108 int (*progress)(fz_context *, void *, int),
1109 void *progress_arg)
1110 {
1111 #ifdef OCR_DISABLED
1112 fz_throw(ctx, FZ_ERROR_UNSUPPORTED, "OCR Disabled in this build");
1113 #else
1114 fz_ocr_device *dev;
1115
1116 if (target == NULL)
1117 fz_throw(ctx, FZ_ERROR_ARGUMENT, "OCR devices require a target");
1118
1119 dev = fz_new_derived_device(ctx, fz_ocr_device);
1120
1121 dev->super.close_device = fz_ocr_close_device;
1122 dev->super.drop_device = fz_ocr_drop_device;
1123
1124 dev->super.fill_path = fz_ocr_fill_path;
1125 dev->super.stroke_path = fz_ocr_stroke_path;
1126 dev->super.clip_path = fz_ocr_clip_path;
1127 dev->super.clip_stroke_path = fz_ocr_clip_stroke_path;
1128
1129 dev->super.fill_text = fz_ocr_fill_text;
1130 dev->super.stroke_text = fz_ocr_stroke_text;
1131 dev->super.clip_text = fz_ocr_clip_text;
1132 dev->super.clip_stroke_text = fz_ocr_clip_stroke_text;
1133 dev->super.ignore_text = fz_ocr_ignore_text;
1134
1135 dev->super.fill_shade = fz_ocr_fill_shade;
1136 dev->super.fill_image = fz_ocr_fill_image;
1137 dev->super.fill_image_mask = fz_ocr_fill_image_mask;
1138 dev->super.clip_image_mask = fz_ocr_clip_image_mask;
1139
1140 dev->super.pop_clip = fz_ocr_pop_clip;
1141
1142 dev->super.begin_mask = fz_ocr_begin_mask;
1143 dev->super.end_mask = fz_ocr_end_mask;
1144 dev->super.begin_group = fz_ocr_begin_group;
1145 dev->super.end_group = fz_ocr_end_group;
1146
1147 dev->super.begin_tile = fz_ocr_begin_tile;
1148 dev->super.end_tile = fz_ocr_end_tile;
1149
1150 dev->super.render_flags = fz_ocr_render_flags;
1151 dev->super.set_default_colorspaces = fz_ocr_set_default_colorspaces;
1152 dev->super.begin_layer = fz_ocr_begin_layer;
1153 dev->super.end_layer = fz_ocr_end_layer;
1154
1155 dev->progress = progress;
1156 dev->progress_arg = progress_arg;
1157
1158 fz_try(ctx)
1159 {
1160 fz_rect bbox;
1161 fz_irect ibox;
1162 fz_point res;
1163
1164 dev->target = target;
1165 dev->mediabox = mediabox;
1166 dev->ctm = ctm;
1167
1168 bbox = fz_transform_rect(mediabox, ctm);
1169 ibox = fz_round_rect(bbox);
1170 /* Fudge the width to be a multiple of 4. */
1171 ibox.x1 += (4-(ibox.x1-ibox.x0)) & 3;
1172 dev->pixmap = fz_new_pixmap_with_bbox(ctx, fz_device_gray(ctx),
1173 ibox, NULL, 0);
1174 fz_clear_pixmap(ctx, dev->pixmap);
1175 res = fz_transform_point_xy(72, 72, ctm);
1176 if (res.x < 0)
1177 res.x = -res.x;
1178 if (res.x < 1)
1179 res.x = 1;
1180 if (res.y < 0)
1181 res.y = -res.y;
1182 if (res.y < 1)
1183 res.y = 1;
1184 fz_set_pixmap_resolution(ctx, dev->pixmap, res.x, res.y);
1185
1186 dev->language = fz_strdup(ctx, language ? language : "eng");
1187 dev->datadir = fz_strdup(ctx, datadir ? datadir : "");
1188
1189 dev->draw_dev = fz_new_draw_device(ctx, fz_identity, dev->pixmap);
1190 if (with_list)
1191 {
1192 dev->list = fz_new_display_list(ctx, mediabox);
1193 dev->list_dev = fz_new_list_device(ctx, dev->list);
1194 } else
1195 dev->list_dev = dev->target;
1196 }
1197 fz_catch(ctx)
1198 {
1199 drop_ocr_device(ctx, dev);
1200 fz_rethrow(ctx);
1201 }
1202
1203 return (fz_device*)dev;
1204 #endif
1205 }