comparison src_classic/helper-stext.i @ 1:1d09e1dec1d9 upstream

ADD: PyMuPDF v1.26.4: the original sdist. It does not yet contain MuPDF. This normally will be downloaded when building PyMuPDF.
author Franz Glasner <fzglas.hg@dom66.de>
date Mon, 15 Sep 2025 11:37:51 +0200
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 1:1d09e1dec1d9
1 %{
2 /*
3 # ------------------------------------------------------------------------
4 # Copyright 2020-2022, Harald Lieder, mailto:harald.lieder@outlook.com
5 # License: GNU AFFERO GPL 3.0, https://www.gnu.org/licenses/agpl-3.0.html
6 #
7 # Part of "PyMuPDF", a Python binding for "MuPDF" (http://mupdf.com), a
8 # lightweight PDF, XPS, and E-book viewer, renderer and toolkit which is
9 # maintained and developed by Artifex Software, Inc. https://artifex.com.
10 # ------------------------------------------------------------------------
11 */
12 // need own versions of ascender / descender
13 static const float
14 JM_font_ascender(fz_context *ctx, fz_font *font)
15 {
16 if (skip_quad_corrections) {
17 return 0.8f;
18 }
19 return fz_font_ascender(ctx, font);
20 }
21
22 static const float
23 JM_font_descender(fz_context *ctx, fz_font *font)
24 {
25 if (skip_quad_corrections) {
26 return -0.2f;
27 }
28 return fz_font_descender(ctx, font);
29 }
30
31
32 //----------------------------------------------------------------
33 // Return true if character is considered to be a word delimiter
34 //----------------------------------------------------------------
35 static const int
36 JM_is_word_delimiter(int c, PyObject *delimiters)
37 {
38 if (c <= 32 || c == 160) return 1; // a standard delimiter
39
40 // extra delimiters must be a non-empty sequence
41 if (!delimiters || PyObject_Not(delimiters) || !PySequence_Check(delimiters)) {
42 return 0;
43 }
44
45 // convert to tuple for easier looping
46 PyObject *delims = PySequence_Tuple(delimiters);
47 if (!delims) {
48 PyErr_Clear();
49 return 0;
50 }
51
52 // Make 1-char PyObject from character given as integer
53 PyObject *cchar = Py_BuildValue("C", c); // single character PyObject
54 Py_ssize_t i, len = PyTuple_Size(delims);
55 for (i = 0; i < len; i++) {
56 int rc = PyUnicode_Compare(cchar, PyTuple_GET_ITEM(delims, i));
57 if (rc == 0) { // equal to a delimiter character
58 Py_DECREF(cchar);
59 Py_DECREF(delims);
60 PyErr_Clear();
61 return 1;
62 }
63 }
64
65 Py_DECREF(delims);
66 PyErr_Clear();
67 return 0;
68 }
69
70 /* inactive
71 //-----------------------------------------------------------------------------
72 // Make OCR text page directly from an fz_page
73 //-----------------------------------------------------------------------------
74 fz_stext_page *
75 JM_new_stext_page_ocr_from_page(fz_context *ctx, fz_page *page, fz_rect rect, int flags,
76 const char *lang, const char *tessdata)
77 {
78 if (!page) return NULL;
79 int with_list = 1;
80 fz_stext_page *tp = NULL;
81 fz_device *dev = NULL, *ocr_dev = NULL;
82 fz_var(dev);
83 fz_var(ocr_dev);
84 fz_var(tp);
85 fz_stext_options options;
86 memset(&options, 0, sizeof options);
87 options.flags = flags;
88 //fz_matrix ctm = fz_identity;
89 fz_matrix ctm1 = fz_make_matrix(100/72, 0, 0, 100/72, 0, 0);
90 fz_matrix ctm2 = fz_make_matrix(400/72, 0, 0, 400/72, 0, 0);
91
92 fz_try(ctx) {
93 tp = fz_new_stext_page(ctx, rect);
94 dev = fz_new_stext_device(ctx, tp, &options);
95 ocr_dev = fz_new_ocr_device(ctx, dev, fz_identity, rect, with_list, lang, tessdata, NULL);
96 fz_run_page(ctx, page, ocr_dev, fz_identity, NULL);
97 fz_close_device(ctx, ocr_dev);
98 fz_close_device(ctx, dev);
99 }
100 fz_always(ctx) {
101 fz_drop_device(ctx, dev);
102 fz_drop_device(ctx, ocr_dev);
103 }
104 fz_catch(ctx) {
105 fz_drop_stext_page(ctx, tp);
106 fz_rethrow(ctx);
107 }
108 return tp;
109 }
110 */
111
112 //---------------------------------------------------------------------------
113 // APPEND non-ascii runes in unicode escape format to fz_buffer
114 //---------------------------------------------------------------------------
115 void JM_append_rune(fz_context *ctx, fz_buffer *buff, int ch)
116 {
117 if (ch == 92) { // prevent accidental "\u" etc.
118 fz_append_string(ctx, buff, "\\u005c");
119 } else if ((ch >= 32 && ch <= 255) || ch == 10) {
120 fz_append_byte(ctx, buff, ch);
121 } else if (ch >= 0xd800 && ch <= 0xdfff) { // surrogate Unicode range
122 fz_append_string(ctx, buff, "\\ufffd");
123 } else if (ch <= 0xffff) { // 4 hex digits
124 fz_append_printf(ctx, buff, "\\u%04x", ch);
125 } else { // 8 hex digits
126 fz_append_printf(ctx, buff, "\\U%08x", ch);
127 }
128 }
129
130
131 // re-compute char quad if ascender/descender values make no sense
132 static fz_quad
133 JM_char_quad(fz_context *ctx, fz_stext_line *line, fz_stext_char *ch)
134 {
135 if (skip_quad_corrections) { // no special handling
136 return ch->quad;
137 }
138 if (line->wmode) { // never touch vertical write mode
139 return ch->quad;
140 }
141 fz_font *font = ch->font;
142 float asc = JM_font_ascender(ctx, font);
143 float dsc = JM_font_descender(ctx, font);
144 float c, s, fsize = ch->size;
145 float asc_dsc = asc - dsc + FLT_EPSILON;
146 if (asc_dsc >= 1 && small_glyph_heights == 0) { // no problem
147 return ch->quad;
148 }
149 if (asc < 1e-3) { // probably Tesseract glyphless font
150 dsc = -0.1f;
151 asc = 0.9f;
152 asc_dsc = 1.0f;
153 }
154
155 if (small_glyph_heights || asc_dsc < 1) {
156 dsc = dsc / asc_dsc;
157 asc = asc / asc_dsc;
158 }
159 asc_dsc = asc - dsc;
160 asc = asc * fsize / asc_dsc;
161 dsc = dsc * fsize / asc_dsc;
162
163 /* ------------------------------
164 Re-compute quad with the adjusted ascender / descender values:
165 Move ch->origin to (0,0) and de-rotate quad, then adjust the corners,
166 re-rotate and move back to ch->origin location.
167 ------------------------------ */
168 fz_matrix trm1, trm2, xlate1, xlate2;
169 fz_quad quad;
170 c = line->dir.x; // cosine
171 s = line->dir.y; // sine
172 trm1 = fz_make_matrix(c, -s, s, c, 0, 0); // derotate
173 trm2 = fz_make_matrix(c, s, -s, c, 0, 0); // rotate
174 if (c == -1) { // left-right flip
175 trm1.d = 1;
176 trm2.d = 1;
177 }
178 xlate1 = fz_make_matrix(1, 0, 0, 1, -ch->origin.x, -ch->origin.y);
179 xlate2 = fz_make_matrix(1, 0, 0, 1, ch->origin.x, ch->origin.y);
180
181 quad = fz_transform_quad(ch->quad, xlate1); // move origin to (0,0)
182 quad = fz_transform_quad(quad, trm1); // de-rotate corners
183
184 // adjust vertical coordinates
185 if (c == 1 && quad.ul.y > 0) { // up-down flip
186 quad.ul.y = asc;
187 quad.ur.y = asc;
188 quad.ll.y = dsc;
189 quad.lr.y = dsc;
190 } else {
191 quad.ul.y = -asc;
192 quad.ur.y = -asc;
193 quad.ll.y = -dsc;
194 quad.lr.y = -dsc;
195 }
196
197 // adjust horizontal coordinates that are too crazy:
198 // (1) left x must be >= 0
199 // (2) if bbox width is 0, lookup char advance in font.
200 if (quad.ll.x < 0) {
201 quad.ll.x = 0;
202 quad.ul.x = 0;
203 }
204 float cwidth = quad.lr.x - quad.ll.x;
205 if (cwidth < FLT_EPSILON) {
206 int glyph = fz_encode_character(ctx, font, ch->c);
207 if (glyph) {
208 float fwidth = fz_advance_glyph(ctx, font, glyph, line->wmode);
209 quad.lr.x = quad.ll.x + fwidth * fsize;
210 quad.ur.x = quad.lr.x;
211 }
212 }
213
214 quad = fz_transform_quad(quad, trm2); // rotate back
215 quad = fz_transform_quad(quad, xlate2); // translate back
216 return quad;
217 }
218
219
220 // return rect of char quad
221 static fz_rect
222 JM_char_bbox(fz_context *ctx, fz_stext_line *line, fz_stext_char *ch)
223 {
224 fz_rect r = fz_rect_from_quad(JM_char_quad(ctx, line, ch));
225 if (!line->wmode) {
226 return r;
227 }
228 if (r.y1 < r.y0 + ch->size) {
229 r.y0 = r.y1 - ch->size;
230 }
231 return r;
232 }
233
234
235 //-------------------------------------------
236 // make a buffer from an stext_page's text
237 //-------------------------------------------
238 fz_buffer *
239 JM_new_buffer_from_stext_page(fz_context *ctx, fz_stext_page *page)
240 {
241 fz_stext_block *block;
242 fz_stext_line *line;
243 fz_stext_char *ch;
244 fz_rect rect = page->mediabox;
245 fz_buffer *buf = NULL;
246
247 fz_try(ctx)
248 {
249 buf = fz_new_buffer(ctx, 256);
250 for (block = page->first_block; block; block = block->next) {
251 if (block->type == FZ_STEXT_BLOCK_TEXT) {
252 for (line = block->u.t.first_line; line; line = line->next) {
253 for (ch = line->first_char; ch; ch = ch->next) {
254 if (!JM_rects_overlap(rect, JM_char_bbox(ctx, line, ch)) &&
255 !fz_is_infinite_rect(rect)) {
256 continue;
257 }
258 fz_append_rune(ctx, buf, ch->c);
259 }
260 fz_append_byte(ctx, buf, '\n');
261 }
262 fz_append_byte(ctx, buf, '\n');
263 }
264 }
265 }
266 fz_catch(ctx) {
267 fz_drop_buffer(ctx, buf);
268 fz_rethrow(ctx);
269 }
270 return buf;
271 }
272
273
274 static float hdist(fz_point *dir, fz_point *a, fz_point *b)
275 {
276 float dx = b->x - a->x;
277 float dy = b->y - a->y;
278 return fz_abs(dx * dir->x + dy * dir->y);
279 }
280
281
282 static float vdist(fz_point *dir, fz_point *a, fz_point *b)
283 {
284 float dx = b->x - a->x;
285 float dy = b->y - a->y;
286 return fz_abs(dx * dir->y + dy * dir->x);
287 }
288
289
290 struct highlight
291 {
292 Py_ssize_t len;
293 PyObject *quads;
294 float hfuzz, vfuzz;
295 };
296
297
298 static void on_highlight_char(fz_context *ctx, void *arg, fz_stext_line *line, fz_stext_char *ch)
299 {
300 struct highlight *hits = arg;
301 float vfuzz = ch->size * hits->vfuzz;
302 float hfuzz = ch->size * hits->hfuzz;
303 fz_quad ch_quad = JM_char_quad(ctx, line, ch);
304 if (hits->len > 0) {
305 PyObject *quad = PySequence_ITEM(hits->quads, hits->len - 1);
306 fz_quad end = JM_quad_from_py(quad);
307 Py_DECREF(quad);
308 if (hdist(&line->dir, &end.lr, &ch_quad.ll) < hfuzz
309 && vdist(&line->dir, &end.lr, &ch_quad.ll) < vfuzz
310 && hdist(&line->dir, &end.ur, &ch_quad.ul) < hfuzz
311 && vdist(&line->dir, &end.ur, &ch_quad.ul) < vfuzz)
312 {
313 end.ur = ch_quad.ur;
314 end.lr = ch_quad.lr;
315 quad = JM_py_from_quad(end);
316 PyList_SetItem(hits->quads, hits->len - 1, quad);
317 return;
318 }
319 }
320 LIST_APPEND_DROP(hits->quads, JM_py_from_quad(ch_quad));
321 hits->len++;
322 }
323
324
// Map a code point to its canonical form for searching:
// no-break space, line / paragraph separator, CR, LF and TAB become a
// space; ASCII upper case letters become lower case; all else is unchanged.
static inline int canon(int c)
{
    /* TODO: proper unicode case folding */
    /* TODO: character equivalence (a matches ä, etc) */
    switch (c) {
    case 0xA0:
    case 0x2028:
    case 0x2029:
    case '\r':
    case '\n':
    case '\t':
        return ' ';
    default:
        break;
    }
    return (c >= 'A' && c <= 'Z') ? c - 'A' + 'a' : c;
}
337
338
// Decode one rune from 's' with fz_chartorune(), store its canonical form
// (see canon()) in '*c' and return fz_chartorune()'s result, which the
// callers use to advance the string pointer.
static inline int chartocanon(int *c, const char *s)
{
    int rune;
    int consumed = fz_chartorune(&rune, s);
    *c = canon(rune);
    return consumed;
}
345
346
347 static const char *match_string(const char *h, const char *n)
348 {
349 int hc, nc;
350 const char *e = h;
351 h += chartocanon(&hc, h);
352 n += chartocanon(&nc, n);
353 while (hc == nc)
354 {
355 e = h;
356 if (hc == ' ')
357 do
358 h += chartocanon(&hc, h);
359 while (hc == ' ');
360 else
361 h += chartocanon(&hc, h);
362 if (nc == ' ')
363 do
364 n += chartocanon(&nc, n);
365 while (nc == ' ');
366 else
367 n += chartocanon(&nc, n);
368 }
369 return nc == 0 ? e : NULL;
370 }
371
372
373 static const char *find_string(const char *s, const char *needle, const char **endp)
374 {
375 const char *end;
376 while (*s)
377 {
378 end = match_string(s, needle);
379 if (end)
380 return *endp = end, s;
381 ++s;
382 }
383 return *endp = NULL, NULL;
384 }
385
386
387 PyObject *
388 JM_search_stext_page(fz_context *ctx, fz_stext_page *page, const char *needle)
389 {
390 struct highlight hits;
391 fz_stext_block *block;
392 fz_stext_line *line;
393 fz_stext_char *ch;
394 fz_buffer *buffer = NULL;
395 const char *haystack, *begin, *end;
396 fz_rect rect = page->mediabox;
397 int c, inside;
398
399 if (strlen(needle) == 0) Py_RETURN_NONE;
400 PyObject *quads = PyList_New(0);
401 hits.len = 0;
402 hits.quads = quads;
403 hits.hfuzz = 0.2f; /* merge kerns but not large gaps */
404 hits.vfuzz = 0.1f;
405
406 fz_try(ctx) {
407 buffer = JM_new_buffer_from_stext_page(ctx, page);
408 haystack = fz_string_from_buffer(ctx, buffer);
409 begin = find_string(haystack, needle, &end);
410 if (!begin) goto no_more_matches;
411
412 inside = 0;
413 for (block = page->first_block; block; block = block->next) {
414 if (block->type != FZ_STEXT_BLOCK_TEXT) {
415 continue;
416 }
417 for (line = block->u.t.first_line; line; line = line->next) {
418 for (ch = line->first_char; ch; ch = ch->next) {
419 if (!fz_is_infinite_rect(rect) &&
420 !JM_rects_overlap(rect, JM_char_bbox(ctx, line, ch))) {
421 goto next_char;
422 }
423 try_new_match:
424 if (!inside) {
425 if (haystack >= begin) inside = 1;
426 }
427 if (inside) {
428 if (haystack < end) {
429 on_highlight_char(ctx, &hits, line, ch);
430 } else {
431 inside = 0;
432 begin = find_string(haystack, needle, &end);
433 if (!begin) goto no_more_matches;
434 else goto try_new_match;
435 }
436 }
437 haystack += fz_chartorune(&c, haystack);
438 next_char:;
439 }
440 assert(*haystack == '\n');
441 ++haystack;
442 }
443 assert(*haystack == '\n');
444 ++haystack;
445 }
446 no_more_matches:;
447 }
448 fz_always(ctx)
449 fz_drop_buffer(ctx, buffer);
450 fz_catch(ctx)
451 fz_rethrow(ctx);
452
453 return quads;
454 }
455
456
457 //-----------------------------------------------------------------------------
458 // Plain text output. An identical copy of fz_print_stext_page_as_text,
459 // but lines within a block are concatenated by space instead a new-line
460 // character (which else leads to 2 new-lines).
461 //-----------------------------------------------------------------------------
462 void
463 JM_print_stext_page_as_text(fz_context *ctx, fz_buffer *buff, fz_stext_page *page)
464 {
465 fz_stext_block *block;
466 fz_stext_line *line;
467 fz_stext_char *ch;
468 fz_rect rect = page->mediabox;
469 fz_rect chbbox;
470 int last_char = 0;
471 char utf[10];
472 int i, n;
473
474 for (block = page->first_block; block; block = block->next) {
475 if (block->type == FZ_STEXT_BLOCK_TEXT) {
476 for (line = block->u.t.first_line; line; line = line->next) {
477 last_char = 0;
478 for (ch = line->first_char; ch; ch = ch->next) {
479 chbbox = JM_char_bbox(ctx, line, ch);
480 if (fz_is_infinite_rect(rect) ||
481 JM_rects_overlap(rect, chbbox)) {
482 last_char = ch->c;
483 JM_append_rune(ctx, buff, ch->c);
484 }
485 }
486 if (last_char != 10 && last_char > 0) {
487 fz_append_string(ctx, buff, "\n");
488 }
489 }
490 }
491 }
492 }
493
494 //-----------------------------------------------------------------------------
495 // Functions for wordlist output
496 //-----------------------------------------------------------------------------
497 int JM_append_word(fz_context *ctx, PyObject *lines, fz_buffer *buff, fz_rect *wbbox,
498 int block_n, int line_n, int word_n)
499 {
500 PyObject *s = JM_EscapeStrFromBuffer(ctx, buff);
501 PyObject *litem = Py_BuildValue("ffffOiii",
502 wbbox->x0,
503 wbbox->y0,
504 wbbox->x1,
505 wbbox->y1,
506 s,
507 block_n, line_n, word_n);
508 LIST_APPEND_DROP(lines, litem);
509 Py_DECREF(s);
510 *wbbox = fz_empty_rect;
511 return word_n + 1; // word counter
512 }
513
514 //-----------------------------------------------------------------------------
515 // Functions for dictionary output
516 //-----------------------------------------------------------------------------
517
518 static int detect_super_script(fz_stext_line *line, fz_stext_char *ch)
519 {
520 if (line->wmode == 0 && line->dir.x == 1 && line->dir.y == 0)
521 return ch->origin.y < line->first_char->origin.y - ch->size * 0.1f;
522 return 0;
523 }
524
525 static int JM_char_font_flags(fz_context *ctx, fz_font *font, fz_stext_line *line, fz_stext_char *ch)
526 {
527 int flags = detect_super_script(line, ch);
528 flags += fz_font_is_italic(ctx, font) * TEXT_FONT_ITALIC;
529 flags += fz_font_is_serif(ctx, font) * TEXT_FONT_SERIFED;
530 flags += fz_font_is_monospaced(ctx, font) * TEXT_FONT_MONOSPACED;
531 flags += fz_font_is_bold(ctx, font) * TEXT_FONT_BOLD;
532 return flags;
533 }
534
535 static const char *
536 JM_font_name(fz_context *ctx, fz_font *font)
537 {
538 const char *name = fz_font_name(ctx, font);
539 const char *s = strchr(name, '+');
540 if (subset_fontnames || s == NULL || s-name != 6) {
541 return name;
542 }
543 return s + 1;
544 }
545
546
547 static fz_rect
548 JM_make_spanlist(fz_context *ctx, PyObject *line_dict,
549 fz_stext_line *line, int raw, fz_buffer *buff,
550 fz_rect tp_rect)
551 {
552 PyObject *span = NULL, *char_list = NULL, *char_dict;
553 PyObject *span_list = PyList_New(0);
554 fz_clear_buffer(ctx, buff);
555 fz_stext_char *ch;
556 fz_rect span_rect = fz_empty_rect;
557 fz_rect line_rect = fz_empty_rect;
558 fz_point span_origin = {0, 0};
559 typedef struct style_s {
560 float size; int flags; const char *font; int color;
561 float asc; float desc;
562 } char_style;
563 char_style old_style = { -1, -1, "", -1, 0, 0 }, style;
564
565 for (ch = line->first_char; ch; ch = ch->next) {
566 fz_rect r = JM_char_bbox(ctx, line, ch);
567 if (!JM_rects_overlap(tp_rect, r) &&
568 !fz_is_infinite_rect(tp_rect)) {
569 continue;
570 }
571 int flags = JM_char_font_flags(ctx, ch->font, line, ch);
572 fz_point origin = ch->origin;
573 style.size = ch->size;
574 style.flags = flags;
575 style.font = JM_font_name(ctx, ch->font);
576 style.color = ch->color;
577 style.asc = JM_font_ascender(ctx, ch->font);
578 style.desc = JM_font_descender(ctx, ch->font);
579
580 if (style.size != old_style.size ||
581 style.flags != old_style.flags ||
582 style.color != old_style.color ||
583 strcmp(style.font, old_style.font) != 0) {
584
585 if (old_style.size >= 0) {
586 // not first one, output previous
587 if (raw) {
588 // put character list in the span
589 DICT_SETITEM_DROP(span, dictkey_chars, char_list);
590 char_list = NULL;
591 } else {
592 // put text string in the span
593 DICT_SETITEM_DROP(span, dictkey_text, JM_EscapeStrFromBuffer(ctx, buff));
594 fz_clear_buffer(ctx, buff);
595 }
596
597 DICT_SETITEM_DROP(span, dictkey_origin,
598 JM_py_from_point(span_origin));
599 DICT_SETITEM_DROP(span, dictkey_bbox,
600 JM_py_from_rect(span_rect));
601 line_rect = fz_union_rect(line_rect, span_rect);
602 LIST_APPEND_DROP(span_list, span);
603 span = NULL;
604 }
605
606 span = PyDict_New();
607 float asc = style.asc, desc = style.desc;
608 if (style.asc < 1e-3) {
609 asc = 0.9f;
610 desc = -0.1f;
611 }
612
613 DICT_SETITEM_DROP(span, dictkey_size, Py_BuildValue("f", style.size));
614 DICT_SETITEM_DROP(span, dictkey_flags, Py_BuildValue("i", style.flags));
615 DICT_SETITEM_DROP(span, dictkey_font, JM_EscapeStrFromStr(style.font));
616 DICT_SETITEM_DROP(span, dictkey_color, Py_BuildValue("i", style.color));
617 DICT_SETITEMSTR_DROP(span, "ascender", Py_BuildValue("f", asc));
618 DICT_SETITEMSTR_DROP(span, "descender", Py_BuildValue("f", desc));
619
620 old_style = style;
621 span_rect = r;
622 span_origin = origin;
623
624 }
625 span_rect = fz_union_rect(span_rect, r);
626
627 if (raw) { // make and append a char dict
628 char_dict = PyDict_New();
629 DICT_SETITEM_DROP(char_dict, dictkey_origin,
630 JM_py_from_point(ch->origin));
631
632 DICT_SETITEM_DROP(char_dict, dictkey_bbox,
633 JM_py_from_rect(r));
634
635 DICT_SETITEM_DROP(char_dict, dictkey_c,
636 Py_BuildValue("C", ch->c));
637
638 if (!char_list) {
639 char_list = PyList_New(0);
640 }
641 LIST_APPEND_DROP(char_list, char_dict);
642 } else { // add character byte to buffer
643 JM_append_rune(ctx, buff, ch->c);
644 }
645 }
646 // all characters processed, now flush remaining span
647 if (span) {
648 if (raw) {
649 DICT_SETITEM_DROP(span, dictkey_chars, char_list);
650 char_list = NULL;
651 } else {
652 DICT_SETITEM_DROP(span, dictkey_text, JM_EscapeStrFromBuffer(ctx, buff));
653 fz_clear_buffer(ctx, buff);
654 }
655 DICT_SETITEM_DROP(span, dictkey_origin, JM_py_from_point(span_origin));
656 DICT_SETITEM_DROP(span, dictkey_bbox, JM_py_from_rect(span_rect));
657
658 if (!fz_is_empty_rect(span_rect)) {
659 LIST_APPEND_DROP(span_list, span);
660 line_rect = fz_union_rect(line_rect, span_rect);
661 } else {
662 Py_DECREF(span);
663 }
664 span = NULL;
665 }
666 if (!fz_is_empty_rect(line_rect)) {
667 DICT_SETITEM_DROP(line_dict, dictkey_spans, span_list);
668 } else {
669 DICT_SETITEM_DROP(line_dict, dictkey_spans, span_list);
670 }
671 return line_rect;
672 }
673
674 static void JM_make_image_block(fz_context *ctx, fz_stext_block *block, PyObject *block_dict)
675 {
676 fz_image *image = block->u.i.image;
677 fz_buffer *buf = NULL, *freebuf = NULL;
678 fz_compressed_buffer *buffer = fz_compressed_image_buffer(ctx, image);
679 fz_var(buf);
680 fz_var(freebuf);
681 int n = fz_colorspace_n(ctx, image->colorspace);
682 int w = image->w;
683 int h = image->h;
684 const char *ext = NULL;
685 int type = FZ_IMAGE_UNKNOWN;
686 if (buffer)
687 type = buffer->params.type;
688 if (type < FZ_IMAGE_BMP || type == FZ_IMAGE_JBIG2)
689 type = FZ_IMAGE_UNKNOWN;
690 PyObject *bytes = NULL;
691 fz_var(bytes);
692 fz_try(ctx) {
693 if (buffer && type != FZ_IMAGE_UNKNOWN) {
694 buf = buffer->buffer;
695 ext = JM_image_extension(type);
696 } else {
697 buf = freebuf = fz_new_buffer_from_image_as_png(ctx, image, fz_default_color_params);
698 ext = "png";
699 }
700 bytes = JM_BinFromBuffer(ctx, buf);
701 }
702 fz_always(ctx) {
703 if (!bytes)
704 bytes = JM_BinFromChar("");
705
706 DICT_SETITEM_DROP(block_dict, dictkey_width,
707 Py_BuildValue("i", w));
708 DICT_SETITEM_DROP(block_dict, dictkey_height,
709 Py_BuildValue("i", h));
710 DICT_SETITEM_DROP(block_dict, dictkey_ext,
711 Py_BuildValue("s", ext));
712 DICT_SETITEM_DROP(block_dict, dictkey_colorspace,
713 Py_BuildValue("i", n));
714 DICT_SETITEM_DROP(block_dict, dictkey_xres,
715 Py_BuildValue("i", image->xres));
716 DICT_SETITEM_DROP(block_dict, dictkey_yres,
717 Py_BuildValue("i", image->xres));
718 DICT_SETITEM_DROP(block_dict, dictkey_bpc,
719 Py_BuildValue("i", (int) image->bpc));
720 DICT_SETITEM_DROP(block_dict, dictkey_matrix,
721 JM_py_from_matrix(block->u.i.transform));
722 DICT_SETITEM_DROP(block_dict, dictkey_size,
723 Py_BuildValue("n", PyBytes_Size(bytes)));
724 DICT_SETITEM_DROP(block_dict, dictkey_image, bytes);
725
726 fz_drop_buffer(ctx, freebuf);
727 }
728 fz_catch(ctx) {;}
729 return;
730 }
731
732 static void JM_make_text_block(fz_context *ctx, fz_stext_block *block, PyObject *block_dict, int raw, fz_buffer *buff, fz_rect tp_rect)
733 {
734 fz_stext_line *line;
735 PyObject *line_list = PyList_New(0), *line_dict;
736 fz_rect block_rect = fz_empty_rect;
737 for (line = block->u.t.first_line; line; line = line->next) {
738 if (fz_is_empty_rect(fz_intersect_rect(tp_rect, line->bbox)) &&
739 !fz_is_infinite_rect(tp_rect)) {
740 continue;
741 }
742 line_dict = PyDict_New();
743 fz_rect line_rect = JM_make_spanlist(ctx, line_dict, line, raw, buff, tp_rect);
744 block_rect = fz_union_rect(block_rect, line_rect);
745 DICT_SETITEM_DROP(line_dict, dictkey_wmode,
746 Py_BuildValue("i", line->wmode));
747 DICT_SETITEM_DROP(line_dict, dictkey_dir, JM_py_from_point(line->dir));
748 DICT_SETITEM_DROP(line_dict, dictkey_bbox,
749 JM_py_from_rect(line_rect));
750 LIST_APPEND_DROP(line_list, line_dict);
751 }
752 DICT_SETITEM_DROP(block_dict, dictkey_bbox, JM_py_from_rect(block_rect));
753 DICT_SETITEM_DROP(block_dict, dictkey_lines, line_list);
754 return;
755 }
756
757 void JM_make_textpage_dict(fz_context *ctx, fz_stext_page *tp, PyObject *page_dict, int raw)
758 {
759 fz_stext_block *block;
760 fz_buffer *text_buffer = fz_new_buffer(ctx, 128);
761 PyObject *block_dict, *block_list = PyList_New(0);
762 fz_rect tp_rect = tp->mediabox;
763 int block_n = -1;
764 for (block = tp->first_block; block; block = block->next) {
765 block_n++;
766 if (!fz_contains_rect(tp_rect, block->bbox) &&
767 !fz_is_infinite_rect(tp_rect) &&
768 block->type == FZ_STEXT_BLOCK_IMAGE) {
769 continue;
770 }
771 if (!fz_is_infinite_rect(tp_rect) &&
772 fz_is_empty_rect(fz_intersect_rect(tp_rect, block->bbox))) {
773 continue;
774 }
775
776 block_dict = PyDict_New();
777 DICT_SETITEM_DROP(block_dict, dictkey_number, Py_BuildValue("i", block_n));
778 DICT_SETITEM_DROP(block_dict, dictkey_type, Py_BuildValue("i", block->type));
779 if (block->type == FZ_STEXT_BLOCK_IMAGE) {
780 DICT_SETITEM_DROP(block_dict, dictkey_bbox, JM_py_from_rect(block->bbox));
781 JM_make_image_block(ctx, block, block_dict);
782 } else {
783 JM_make_text_block(ctx, block, block_dict, raw, text_buffer, tp_rect);
784 }
785
786 LIST_APPEND_DROP(block_list, block_dict);
787 }
788 DICT_SETITEM_DROP(page_dict, dictkey_blocks, block_list);
789 fz_drop_buffer(ctx, text_buffer);
790 }
791
792
793
794 //---------------------------------------------------------------------
795 PyObject *
796 JM_copy_rectangle(fz_context *ctx, fz_stext_page *page, fz_rect area)
797 {
798 fz_stext_block *block;
799 fz_stext_line *line;
800 fz_stext_char *ch;
801 fz_buffer *buffer;
802 int need_new_line = 0;
803 PyObject *rc = NULL;
804 fz_try(ctx) {
805 buffer = fz_new_buffer(ctx, 1024);
806 for (block = page->first_block; block; block = block->next) {
807 if (block->type != FZ_STEXT_BLOCK_TEXT)
808 continue;
809 for (line = block->u.t.first_line; line; line = line->next) {
810 int line_had_text = 0;
811 for (ch = line->first_char; ch; ch = ch->next) {
812 fz_rect r = JM_char_bbox(ctx, line, ch);
813 if (JM_rects_overlap(area, r)) {
814 line_had_text = 1;
815 if (need_new_line) {
816 fz_append_string(ctx, buffer, "\n");
817 need_new_line = 0;
818 }
819 JM_append_rune(ctx, buffer, ch->c);
820 }
821 }
822 if (line_had_text)
823 need_new_line = 1;
824 }
825 }
826 fz_terminate_buffer(ctx, buffer);
827 rc = JM_EscapeStrFromBuffer(ctx, buffer);
828 if (!rc) {
829 rc = EMPTY_STRING;
830 PyErr_Clear();
831 }
832 }
833 fz_always(ctx) {
834 fz_drop_buffer(ctx, buffer);
835 }
836 fz_catch(ctx) {
837 fz_rethrow(ctx);
838 }
839 return rc;
840 }
841 //---------------------------------------------------------------------
842
843
844
845
846 fz_buffer *JM_object_to_buffer(fz_context *ctx, pdf_obj *what, int compress, int ascii)
847 {
848 fz_buffer *res=NULL;
849 fz_output *out=NULL;
850 fz_try(ctx) {
851 res = fz_new_buffer(ctx, 512);
852 out = fz_new_output_with_buffer(ctx, res);
853 pdf_print_obj(ctx, out, what, compress, ascii);
854 }
855 fz_always(ctx) {
856 fz_drop_output(ctx, out);
857 }
858 fz_catch(ctx) {
859 fz_rethrow(ctx);
860 }
861 fz_terminate_buffer(ctx, res);
862 return res;
863 }
864
865 //-----------------------------------------------------------------------------
866 // Merge the /Resources object created by a text pdf device into the page.
867 // The device may have created multiple /ExtGState/Alp? and /Font/F? objects.
868 // These need to be renamed (renumbered) to not overwrite existing page
869 // objects from previous executions.
870 // Returns the next available numbers n, m for objects /Alp<n>, /F<m>.
871 //-----------------------------------------------------------------------------
872 PyObject *JM_merge_resources(fz_context *ctx, pdf_page *page, pdf_obj *temp_res)
873 {
874 // page objects /Resources, /Resources/ExtGState, /Resources/Font
875 pdf_obj *resources = pdf_dict_get(ctx, page->obj, PDF_NAME(Resources));
876 pdf_obj *main_extg = pdf_dict_get(ctx, resources, PDF_NAME(ExtGState));
877 pdf_obj *main_fonts = pdf_dict_get(ctx, resources, PDF_NAME(Font));
878
879 // text pdf device objects /ExtGState, /Font
880 pdf_obj *temp_extg = pdf_dict_get(ctx, temp_res, PDF_NAME(ExtGState));
881 pdf_obj *temp_fonts = pdf_dict_get(ctx, temp_res, PDF_NAME(Font));
882
883
884 int max_alp = -1, max_fonts = -1, i, n;
885 char text[20];
886
887 // Handle /Alp objects
888 if (pdf_is_dict(ctx, temp_extg)) // any created at all?
889 {
890 n = pdf_dict_len(ctx, temp_extg);
891 if (pdf_is_dict(ctx, main_extg)) { // does page have /ExtGState yet?
892 for (i = 0; i < pdf_dict_len(ctx, main_extg); i++) {
893 // get highest number of objects named /Alpxxx
894 char *alp = (char *) pdf_to_name(ctx, pdf_dict_get_key(ctx, main_extg, i));
895 if (strncmp(alp, "Alp", 3) != 0) continue;
896 int j = fz_atoi(alp + 3);
897 if (j > max_alp) max_alp = j;
898 }
899 }
900 else // create a /ExtGState for the page
901 main_extg = pdf_dict_put_dict(ctx, resources, PDF_NAME(ExtGState), n);
902
903 max_alp += 1;
904 for (i = 0; i < n; i++) // copy over renumbered /Alp objects
905 {
906 char *alp = (char *) pdf_to_name(ctx, pdf_dict_get_key(ctx, temp_extg, i));
907 int j = fz_atoi(alp + 3) + max_alp;
908 fz_snprintf(text, sizeof(text), "Alp%d", j); // new name
909 pdf_obj *val = pdf_dict_get_val(ctx, temp_extg, i);
910 pdf_dict_puts(ctx, main_extg, text, val);
911 }
912 }
913
914
915 if (pdf_is_dict(ctx, main_fonts)) { // has page any fonts yet?
916 for (i = 0; i < pdf_dict_len(ctx, main_fonts); i++) { // get max font number
917 char *font = (char *) pdf_to_name(ctx, pdf_dict_get_key(ctx, main_fonts, i));
918 if (strncmp(font, "F", 1) != 0) continue;
919 int j = fz_atoi(font + 1);
920 if (j > max_fonts) max_fonts = j;
921 }
922 }
923 else // create a Resources/Font for the page
924 main_fonts = pdf_dict_put_dict(ctx, resources, PDF_NAME(Font), 2);
925
926 max_fonts += 1;
927 for (i = 0; i < pdf_dict_len(ctx, temp_fonts); i++) { // copy renumbered fonts
928 char *font = (char *) pdf_to_name(ctx, pdf_dict_get_key(ctx, temp_fonts, i));
929 int j = fz_atoi(font + 1) + max_fonts;
930 fz_snprintf(text, sizeof(text), "F%d", j);
931 pdf_obj *val = pdf_dict_get_val(ctx, temp_fonts, i);
932 pdf_dict_puts(ctx, main_fonts, text, val);
933 }
934 return Py_BuildValue("ii", max_alp, max_fonts); // next available numbers
935 }
936
937
938 //-----------------------------------------------------------------------------
939 // version of fz_show_string, which covers SMALL CAPS
940 //-----------------------------------------------------------------------------
941 fz_matrix
942 JM_show_string_cs(fz_context *ctx, fz_text *text, fz_font *user_font, fz_matrix trm, const char *s,
943 int wmode, int bidi_level, fz_bidi_direction markup_dir, fz_text_language language)
944 {
945 fz_font *font=NULL;
946 int gid, ucs;
947 float adv;
948
949 while (*s)
950 {
951 s += fz_chartorune(&ucs, s);
952 gid = fz_encode_character_sc(ctx, user_font, ucs);
953 if (gid == 0) {
954 gid = fz_encode_character_with_fallback(ctx, user_font, ucs, 0, language, &font);
955 } else {
956 font = user_font;
957 }
958 fz_show_glyph(ctx, text, font, trm, gid, ucs, wmode, bidi_level, markup_dir, language);
959 adv = fz_advance_glyph(ctx, font, gid, wmode);
960 if (wmode == 0)
961 trm = fz_pre_translate(trm, adv, 0);
962 else
963 trm = fz_pre_translate(trm, 0, -adv);
964 }
965
966 return trm;
967 }
968
969
970 //-----------------------------------------------------------------------------
971 // version of fz_show_string, which also covers UCDN script
972 //-----------------------------------------------------------------------------
973 fz_matrix JM_show_string(fz_context *ctx, fz_text *text, fz_font *user_font, fz_matrix trm, const char *s, int wmode, int bidi_level, fz_bidi_direction markup_dir, fz_text_language language, int script)
974 {
975 fz_font *font;
976 int gid, ucs;
977 float adv;
978
979 while (*s) {
980 s += fz_chartorune(&ucs, s);
981 gid = fz_encode_character_with_fallback(ctx, user_font, ucs, script, language, &font);
982 fz_show_glyph(ctx, text, font, trm, gid, ucs, wmode, bidi_level, markup_dir, language);
983 adv = fz_advance_glyph(ctx, font, gid, wmode);
984 if (wmode == 0)
985 trm = fz_pre_translate(trm, adv, 0);
986 else
987 trm = fz_pre_translate(trm, 0, -adv);
988 }
989 return trm;
990 }
991
992
993 //-----------------------------------------------------------------------------
994 // return a fz_font from a number of parameters
995 //-----------------------------------------------------------------------------
996 fz_font *JM_get_font(fz_context *ctx,
997 char *fontname,
998 char *fontfile,
999 PyObject *fontbuffer,
1000 int script,
1001 int lang,
1002 int ordering,
1003 int is_bold,
1004 int is_italic,
1005 int is_serif,
1006 int embed)
1007 {
1008 const unsigned char *data = NULL;
1009 int size, index=0;
1010 fz_buffer *res = NULL;
1011 fz_font *font = NULL;
1012 fz_try(ctx) {
1013 if (fontfile) goto have_file;
1014 if (EXISTS(fontbuffer)) goto have_buffer;
1015 if (ordering > -1) goto have_cjk;
1016 if (fontname) goto have_base14;
1017 goto have_noto;
1018
1019 // Base-14 or a MuPDF builtin font
1020 have_base14:;
1021 font = fz_new_base14_font(ctx, fontname);
1022 if (font) {
1023 goto fertig;
1024 }
1025 font = fz_new_builtin_font(ctx, fontname, is_bold, is_italic);
1026 goto fertig;
1027
1028 // CJK font
1029 have_cjk:;
1030 font = fz_new_cjk_font(ctx, ordering);
1031 goto fertig;
1032
1033 // fontfile
1034 have_file:;
1035 font = fz_new_font_from_file(ctx, NULL, fontfile, index, 0);
1036 goto fertig;
1037
1038 // fontbuffer
1039 have_buffer:;
1040 res = JM_BufferFromBytes(ctx, fontbuffer);
1041 font = fz_new_font_from_buffer(ctx, NULL, res, index, 0);
1042 goto fertig;
1043
1044 // Check for NOTO font
1045 have_noto:;
1046 data = fz_lookup_noto_font(ctx, script, lang, &size, &index);
1047 if (data) font = fz_new_font_from_memory(ctx, NULL, data, size, index, 0);
1048 if (font) goto fertig;
1049 font = fz_load_fallback_font(ctx, script, lang, is_serif, is_bold, is_italic);
1050 goto fertig;
1051
1052 fertig:;
1053 if (!font) {
1054 RAISEPY(ctx, MSG_FONT_FAILED, PyExc_RuntimeError);
1055 }
1056 #if FZ_VERSION_MAJOR == 1 && FZ_VERSION_MINOR >= 22
1057 // if font allows this, set embedding
1058 if (!font->flags.never_embed) {
1059 fz_set_font_embedding(ctx, font, embed);
1060 }
1061 #endif
1062 }
1063 fz_always(ctx) {
1064 fz_drop_buffer(ctx, res);
1065 }
1066 fz_catch(ctx) {
1067 fz_rethrow(ctx);
1068 }
1069 return font;
1070 }
1071
1072 %}