Mercurial > hgrepos > Python2 > PyMuPDF
view src_classic/helper-stext.i @ 1:1d09e1dec1d9 upstream
ADD: PyMuPDF v1.26.4: the original sdist.
It does not yet contain MuPDF. This normally will be downloaded when
building PyMuPDF.
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Mon, 15 Sep 2025 11:37:51 +0200 |
| parents | |
| children |
line wrap: on
line source
%{
/*
# ------------------------------------------------------------------------
# Copyright 2020-2022, Harald Lieder, mailto:harald.lieder@outlook.com
# License: GNU AFFERO GPL 3.0, https://www.gnu.org/licenses/agpl-3.0.html
#
# Part of "PyMuPDF", a Python binding for "MuPDF" (http://mupdf.com), a
# lightweight PDF, XPS, and E-book viewer, renderer and toolkit which is
# maintained and developed by Artifex Software, Inc. https://artifex.com.
# ------------------------------------------------------------------------
*/

//----------------------------------------------------------------
// Own versions of the font ascender / descender lookups.
// When the file-level switch 'skip_quad_corrections' is set, fixed
// values (0.8 / -0.2) are returned instead of asking the font.
//----------------------------------------------------------------
static const float
JM_font_ascender(fz_context *ctx, fz_font *font)
{
    if (skip_quad_corrections) {
        return 0.8f;
    }
    return fz_font_ascender(ctx, font);
}

static const float
JM_font_descender(fz_context *ctx, fz_font *font)
{
    if (skip_quad_corrections) {
        return -0.2f;
    }
    return fz_font_descender(ctx, font);
}


//----------------------------------------------------------------
// Return true if character is considered to be a word delimiter.
//
// c:          the character as an integer code point.
// delimiters: optional Python sequence of extra delimiter characters;
//             ignored if NULL, empty (falsy) or not a sequence.
// Returns 1 for c <= 32, c == 160 or a match in 'delimiters', else 0.
// Python errors raised by the comparisons are cleared before returning.
//----------------------------------------------------------------
static const int
JM_is_word_delimiter(int c, PyObject *delimiters)
{
    if (c <= 32 || c == 160) return 1;  // a standard delimiter

    // extra delimiters must be a non-empty sequence
    if (!delimiters || PyObject_Not(delimiters) || !PySequence_Check(delimiters)) {
        return 0;
    }

    // convert to tuple for easier looping
    PyObject *delims = PySequence_Tuple(delimiters);
    if (!delims) {
        PyErr_Clear();
        return 0;
    }

    // Make 1-char PyObject from character given as integer
    PyObject *cchar = Py_BuildValue("C", c);  // single character PyObject
    if (!cchar) {  // cannot compare without it
        Py_DECREF(delims);
        PyErr_Clear();
        return 0;
    }

    int found = 0;
    Py_ssize_t i, len = PyTuple_Size(delims);
    for (i = 0; i < len; i++) {
        int rc = PyUnicode_Compare(cchar, PyTuple_GET_ITEM(delims, i));
        if (rc == 0) {  // equal to a delimiter character
            found = 1;
            break;
        }
    }

    // release both temporaries on every path (match or no match)
    Py_DECREF(cchar);
    Py_DECREF(delims);
    PyErr_Clear();
    return found;
}

/* inactive
//-----------------------------------------------------------------------------
// Make OCR text page directly from an fz_page
//-----------------------------------------------------------------------------
fz_stext_page *
JM_new_stext_page_ocr_from_page(fz_context *ctx, fz_page *page, fz_rect rect, int flags,
                                const char *lang, const char *tessdata)
{
    if (!page) return NULL;
    int with_list = 1;
    fz_stext_page *tp = NULL;
    fz_device *dev = NULL, *ocr_dev = NULL;
    fz_var(dev);
    fz_var(ocr_dev);
    fz_var(tp);
    fz_stext_options options;
    memset(&options, 0, sizeof options);
    options.flags = flags;
    //fz_matrix ctm = fz_identity;
    fz_matrix ctm1 = fz_make_matrix(100/72, 0, 0, 100/72, 0, 0);
    fz_matrix ctm2 = fz_make_matrix(400/72, 0, 0, 400/72, 0, 0);

    fz_try(ctx) {
        tp = fz_new_stext_page(ctx, rect);
        dev = fz_new_stext_device(ctx, tp, &options);
        ocr_dev = fz_new_ocr_device(ctx, dev, fz_identity, rect, with_list, lang, tessdata, NULL);
        fz_run_page(ctx, page, ocr_dev, fz_identity, NULL);
        fz_close_device(ctx, ocr_dev);
        fz_close_device(ctx, dev);
    }
    fz_always(ctx) {
        fz_drop_device(ctx, dev);
        fz_drop_device(ctx, ocr_dev);
    }
    fz_catch(ctx) {
        fz_drop_stext_page(ctx, tp);
        fz_rethrow(ctx);
    }
    return tp;
}
*/

//---------------------------------------------------------------------------
// APPEND non-ascii runes in unicode escape format to fz_buffer
//
// Code points 32..255 and line feed (10) are appended as a single byte.
// The backslash (92) and everything else is written as an escape
// sequence; surrogates are replaced by the escape for U+FFFD.
//---------------------------------------------------------------------------
void JM_append_rune(fz_context *ctx, fz_buffer *buff, int ch)
{
    if (ch == 92) {  // prevent accidental "\u" etc.
        fz_append_string(ctx, buff, "\\u005c");
    } else if ((ch >= 32 && ch <= 255) || ch == 10) {
        fz_append_byte(ctx, buff, ch);
    } else if (ch >= 0xd800 && ch <= 0xdfff) {  // surrogate Unicode range
        fz_append_string(ctx, buff, "\\ufffd");
    } else if (ch <= 0xffff) {  // 4 hex digits
        fz_append_printf(ctx, buff, "\\u%04x", ch);
    } else {  // 8 hex digits
        fz_append_printf(ctx, buff, "\\U%08x", ch);
    }
}


//---------------------------------------------------------------------------
// Re-compute char quad if ascender/descender values make no sense.
//
// Returns ch->quad unchanged if 'skip_quad_corrections' is set, if the line
// uses vertical writing mode, or if (ascender - descender) is at least 1
// and 'small_glyph_heights' is off. Otherwise a quad is built from
// normalized ascender / descender values scaled by the font size.
//---------------------------------------------------------------------------
static fz_quad
JM_char_quad(fz_context *ctx, fz_stext_line *line, fz_stext_char *ch)
{
    if (skip_quad_corrections) {  // no special handling
        return ch->quad;
    }
    if (line->wmode) {  // never touch vertical write mode
        return ch->quad;
    }
    fz_font *font = ch->font;
    float asc = JM_font_ascender(ctx, font);
    float dsc = JM_font_descender(ctx, font);
    float c, s, fsize = ch->size;
    float asc_dsc = asc - dsc + FLT_EPSILON;
    if (asc_dsc >= 1 && small_glyph_heights == 0) {  // no problem
        return ch->quad;
    }
    if (asc < 1e-3) {  // probably Tesseract glyphless font
        dsc = -0.1f;
        asc = 0.9f;
        asc_dsc = 1.0f;
    }

    if (small_glyph_heights || asc_dsc < 1) {
        dsc = dsc / asc_dsc;
        asc = asc / asc_dsc;
    }
    asc_dsc = asc - dsc;
    asc = asc * fsize / asc_dsc;
    dsc = dsc * fsize / asc_dsc;

    /* ------------------------------
    Re-compute quad with the adjusted ascender / descender values:
    Move ch->origin to (0,0) and de-rotate quad, then adjust the corners,
    re-rotate and move back to ch->origin location.
    ------------------------------ */
    fz_matrix trm1, trm2, xlate1, xlate2;
    fz_quad quad;
    c = line->dir.x;  // cosine
    s = line->dir.y;  // sine
    trm1 = fz_make_matrix(c, -s, s, c, 0, 0);  // derotate
    trm2 = fz_make_matrix(c, s, -s, c, 0, 0);  // rotate
    if (c == -1) {  // left-right flip
        trm1.d = 1;
        trm2.d = 1;
    }
    xlate1 = fz_make_matrix(1, 0, 0, 1, -ch->origin.x, -ch->origin.y);
    xlate2 = fz_make_matrix(1, 0, 0, 1, ch->origin.x, ch->origin.y);

    quad = fz_transform_quad(ch->quad, xlate1);  // move origin to (0,0)
    quad = fz_transform_quad(quad, trm1);  // de-rotate corners

    // adjust vertical coordinates
    if (c == 1 && quad.ul.y > 0) {  // up-down flip
        quad.ul.y = asc;
        quad.ur.y = asc;
        quad.ll.y = dsc;
        quad.lr.y = dsc;
    } else {
        quad.ul.y = -asc;
        quad.ur.y = -asc;
        quad.ll.y = -dsc;
        quad.lr.y = -dsc;
    }

    // adjust horizontal coordinates that are too crazy:
    // (1) left x must be >= 0
    // (2) if bbox width is 0, lookup char advance in font.
    if (quad.ll.x < 0) {
        quad.ll.x = 0;
        quad.ul.x = 0;
    }
    float cwidth = quad.lr.x - quad.ll.x;
    if (cwidth < FLT_EPSILON) {
        int glyph = fz_encode_character(ctx, font, ch->c);
        if (glyph) {
            float fwidth = fz_advance_glyph(ctx, font, glyph, line->wmode);
            quad.lr.x = quad.ll.x + fwidth * fsize;
            quad.ur.x = quad.lr.x;
        }
    }

    quad = fz_transform_quad(quad, trm2);  // rotate back
    quad = fz_transform_quad(quad, xlate2);  // translate back
    return quad;
}


// Return rect of char quad. For vertical writing mode the rect is
// extended upwards so that its height is at least the font size.
static fz_rect
JM_char_bbox(fz_context *ctx, fz_stext_line *line, fz_stext_char *ch)
{
    fz_rect r = fz_rect_from_quad(JM_char_quad(ctx, line, ch));
    if (!line->wmode) {
        return r;
    }
    if (r.y1 < r.y0 + ch->size) {
        r.y0 = r.y1 - ch->size;
    }
    return r;
}


//-------------------------------------------
// Make a buffer from an stext_page's text.
//
// Characters whose bbox does not overlap page->mediabox are skipped
// (unless that rect is infinite). A '\n' is appended after every line
// and another one after every text block; JM_search_stext_page relies
// on exactly this layout. The caller owns the returned buffer.
//-------------------------------------------
fz_buffer *
JM_new_buffer_from_stext_page(fz_context *ctx, fz_stext_page *page)
{
    fz_stext_block *block;
    fz_stext_line *line;
    fz_stext_char *ch;
    fz_rect rect = page->mediabox;
    fz_buffer *buf = NULL;
    fz_var(buf);  // assigned inside fz_try, read in fz_catch

    fz_try(ctx) {
        buf = fz_new_buffer(ctx, 256);
        for (block = page->first_block; block; block = block->next) {
            if (block->type == FZ_STEXT_BLOCK_TEXT) {
                for (line = block->u.t.first_line; line; line = line->next) {
                    for (ch = line->first_char; ch; ch = ch->next) {
                        if (!JM_rects_overlap(rect, JM_char_bbox(ctx, line, ch)) &&
                            !fz_is_infinite_rect(rect)) {
                            continue;
                        }
                        fz_append_rune(ctx, buf, ch->c);
                    }
                    fz_append_byte(ctx, buf, '\n');
                }
                fz_append_byte(ctx, buf, '\n');
            }
        }
    }
    fz_catch(ctx) {
        fz_drop_buffer(ctx, buf);
        fz_rethrow(ctx);
    }
    return buf;
}


// Absolute distance between points a and b, projected on direction 'dir'.
static float hdist(fz_point *dir, fz_point *a, fz_point *b)
{
    float dx = b->x - a->x;
    float dy = b->y - a->y;
    return fz_abs(dx * dir->x + dy * dir->y);
}

// Companion of hdist using the swapped components of 'dir'.
static float vdist(fz_point *dir, fz_point *a, fz_point *b)
{
    float dx = b->x - a->x;
    float dy = b->y - a->y;
    return fz_abs(dx * dir->y + dy * dir->x);
}

// State shared between JM_search_stext_page and on_highlight_char.
struct highlight
{
    Py_ssize_t len;      // number of quads currently in 'quads'
    PyObject *quads;     // Python list of hit quads
    float hfuzz, vfuzz;  // merge tolerances, as fractions of the font size
};

// Record the quad of one matched character: either extend the last quad in
// the hit list (if the new quad starts where the last one ends, within the
// fuzz tolerances) or append it as a new list item.
static void on_highlight_char(fz_context *ctx, void *arg, fz_stext_line *line, fz_stext_char *ch)
{
    struct highlight *hits = arg;
    float vfuzz = ch->size * hits->vfuzz;
    float hfuzz = ch->size * hits->hfuzz;
    fz_quad ch_quad = JM_char_quad(ctx, line, ch);
    if (hits->len > 0) {
        PyObject *quad = PySequence_ITEM(hits->quads, hits->len - 1);
        fz_quad end = JM_quad_from_py(quad);
        Py_DECREF(quad);
        if (hdist(&line->dir, &end.lr, &ch_quad.ll) < hfuzz
            && vdist(&line->dir, &end.lr, &ch_quad.ll) < vfuzz
            && hdist(&line->dir, &end.ur, &ch_quad.ul) < hfuzz
            && vdist(&line->dir, &end.ur, &ch_quad.ul) < vfuzz)
        {
            end.ur = ch_quad.ur;
            end.lr = ch_quad.lr;
            quad = JM_py_from_quad(end);
            PyList_SetItem(hits->quads, hits->len - 1, quad);  // steals 'quad'
            return;
        }
    }
    LIST_APPEND_DROP(hits->quads, JM_py_from_quad(ch_quad));
    hits->len++;
}

// Map a code point to its canonical form for searching: several space-like
// and line-break characters become ' ', ASCII upper case becomes lower case.
static inline int canon(int c)
{
    /* TODO: proper unicode case folding */
    /* TODO: character equivalence (a matches ä, etc) */
    if (c == 0xA0 || c == 0x2028 || c == 0x2029)
        return ' ';
    if (c == '\r' || c == '\n' || c == '\t')
        return ' ';
    if (c >= 'A' && c <= 'Z')
        return c - 'A' + 'a';
    return c;
}

// Decode one UTF-8 character from s into *c (canonicalized);
// returns the number of bytes consumed.
static inline int chartocanon(int *c, const char *s)
{
    int n = fz_chartorune(c, s);
    *c = canon(*c);
    return n;
}

// Test whether needle 'n' matches at the start of haystack 'h'. Runs of
// spaces on either side are treated as a single space. Returns a pointer
// to the end of the match in the haystack, or NULL if there is no match.
static const char *match_string(const char *h, const char *n)
{
    int hc, nc;
    const char *e = h;
    h += chartocanon(&hc, h);
    n += chartocanon(&nc, n);
    while (hc == nc) {
        e = h;
        if (hc == ' ') {
            do {
                h += chartocanon(&hc, h);
            } while (hc == ' ');
        } else {
            h += chartocanon(&hc, h);
        }
        if (nc == ' ') {
            do {
                n += chartocanon(&nc, n);
            } while (nc == ' ');
        } else {
            n += chartocanon(&nc, n);
        }
    }
    return nc == 0 ? e : NULL;
}

// Find the first occurrence of 'needle' in 's'. Returns the start of the
// match and stores its end in *endp; returns NULL (and *endp = NULL) if
// there is none.
static const char *find_string(const char *s, const char *needle, const char **endp)
{
    const char *end;
    while (*s) {
        end = match_string(s, needle);
        if (end) {
            return *endp = end, s;
        }
        ++s;
    }
    return *endp = NULL, NULL;
}


//-----------------------------------------------------------------------------
// Search an stext page for 'needle'.
//
// Returns None for an empty needle, otherwise a Python list of quads
// covering the matched characters (adjacent character quads are merged,
// see on_highlight_char). MuPDF exceptions are rethrown; the partially
// built list is released in that case.
//-----------------------------------------------------------------------------
PyObject *
JM_search_stext_page(fz_context *ctx, fz_stext_page *page, const char *needle)
{
    struct highlight hits;
    fz_stext_block *block;
    fz_stext_line *line;
    fz_stext_char *ch;
    fz_buffer *buffer = NULL;
    const char *haystack, *begin, *end;
    fz_rect rect = page->mediabox;
    int c, inside;

    if (strlen(needle) == 0) Py_RETURN_NONE;
    PyObject *quads = PyList_New(0);
    hits.len = 0;
    hits.quads = quads;
    hits.hfuzz = 0.2f; /* merge kerns but not large gaps */
    hits.vfuzz = 0.1f;

    fz_var(buffer);  // assigned inside fz_try, read in fz_always

    fz_try(ctx) {
        buffer = JM_new_buffer_from_stext_page(ctx, page);
        haystack = fz_string_from_buffer(ctx, buffer);
        begin = find_string(haystack, needle, &end);
        if (!begin) goto no_more_matches;

        inside = 0;
        // Walk the page in the same order in which the haystack was built,
        // so 'haystack' always points at the bytes of the current char.
        for (block = page->first_block; block; block = block->next) {
            if (block->type != FZ_STEXT_BLOCK_TEXT) {
                continue;
            }
            for (line = block->u.t.first_line; line; line = line->next) {
                for (ch = line->first_char; ch; ch = ch->next) {
                    // chars outside the clip are not part of the haystack
                    if (!fz_is_infinite_rect(rect) &&
                        !JM_rects_overlap(rect, JM_char_bbox(ctx, line, ch))) {
                        goto next_char;
                    }
try_new_match:
                    if (!inside) {
                        if (haystack >= begin) inside = 1;
                    }
                    if (inside) {
                        if (haystack < end) {
                            on_highlight_char(ctx, &hits, line, ch);
                        } else {
                            inside = 0;
                            begin = find_string(haystack, needle, &end);
                            if (!begin) goto no_more_matches;
                            else goto try_new_match;
                        }
                    }
                    haystack += fz_chartorune(&c, haystack);
next_char:;
                }
                assert(*haystack == '\n');  // end-of-line marker
                ++haystack;
            }
            assert(*haystack == '\n');  // end-of-block marker
            ++haystack;
        }
no_more_matches:;
    }
    fz_always(ctx) {
        fz_drop_buffer(ctx, buffer);
    }
    fz_catch(ctx) {
        Py_CLEAR(quads);  // do not leak the hit list on errors
        fz_rethrow(ctx);
    }

    return quads;
}


//-----------------------------------------------------------------------------
// Plain text output, modelled after fz_print_stext_page_as_text.
// Characters are written via JM_append_rune (i.e. escaped); chars outside
// page->mediabox are skipped unless that rect is infinite. After each line
// that produced output, a single "\n" is appended unless the last char
// written already was a line feed.
//-----------------------------------------------------------------------------
void
JM_print_stext_page_as_text(fz_context *ctx, fz_buffer *buff, fz_stext_page *page)
{
    fz_stext_block *block;
    fz_stext_line *line;
    fz_stext_char *ch;
    fz_rect rect = page->mediabox;
    fz_rect chbbox;
    int last_char = 0;

    for (block = page->first_block; block; block = block->next) {
        if (block->type == FZ_STEXT_BLOCK_TEXT) {
            for (line = block->u.t.first_line; line; line = line->next) {
                last_char = 0;
                for (ch = line->first_char; ch; ch = ch->next) {
                    chbbox = JM_char_bbox(ctx, line, ch);
                    if (fz_is_infinite_rect(rect) ||
                        JM_rects_overlap(rect, chbbox)) {
                        last_char = ch->c;
                        JM_append_rune(ctx, buff, ch->c);
                    }
                }
                if (last_char != 10 && last_char > 0) {
                    fz_append_string(ctx, buff, "\n");
                }
            }
        }
    }
}


//-----------------------------------------------------------------------------
// Functions for wordlist output
//
// Append the tuple (x0, y0, x1, y1, word, block_n, line_n, word_n) to the
// list 'lines', taking the word text from 'buff'. Resets *wbbox to the
// empty rect and returns the next word number.
//-----------------------------------------------------------------------------
int JM_append_word(fz_context *ctx, PyObject *lines, fz_buffer *buff, fz_rect *wbbox,
                   int block_n, int line_n, int word_n)
{
    PyObject *s = JM_EscapeStrFromBuffer(ctx, buff);
    PyObject *litem = Py_BuildValue("ffffOiii",
                                    wbbox->x0,
                                    wbbox->y0,
                                    wbbox->x1,
                                    wbbox->y1,
                                    s,
                                    block_n, line_n, word_n);
    LIST_APPEND_DROP(lines, litem);
    Py_DECREF(s);  // "O" took its own reference
    *wbbox = fz_empty_rect;
    return word_n + 1;  // word counter
}
//----------------------------------------------------------------------------- // Functions for dictionary output //----------------------------------------------------------------------------- static int detect_super_script(fz_stext_line *line, fz_stext_char *ch) { if (line->wmode == 0 && line->dir.x == 1 && line->dir.y == 0) return ch->origin.y < line->first_char->origin.y - ch->size * 0.1f; return 0; } static int JM_char_font_flags(fz_context *ctx, fz_font *font, fz_stext_line *line, fz_stext_char *ch) { int flags = detect_super_script(line, ch); flags += fz_font_is_italic(ctx, font) * TEXT_FONT_ITALIC; flags += fz_font_is_serif(ctx, font) * TEXT_FONT_SERIFED; flags += fz_font_is_monospaced(ctx, font) * TEXT_FONT_MONOSPACED; flags += fz_font_is_bold(ctx, font) * TEXT_FONT_BOLD; return flags; } static const char * JM_font_name(fz_context *ctx, fz_font *font) { const char *name = fz_font_name(ctx, font); const char *s = strchr(name, '+'); if (subset_fontnames || s == NULL || s-name != 6) { return name; } return s + 1; } static fz_rect JM_make_spanlist(fz_context *ctx, PyObject *line_dict, fz_stext_line *line, int raw, fz_buffer *buff, fz_rect tp_rect) { PyObject *span = NULL, *char_list = NULL, *char_dict; PyObject *span_list = PyList_New(0); fz_clear_buffer(ctx, buff); fz_stext_char *ch; fz_rect span_rect = fz_empty_rect; fz_rect line_rect = fz_empty_rect; fz_point span_origin = {0, 0}; typedef struct style_s { float size; int flags; const char *font; int color; float asc; float desc; } char_style; char_style old_style = { -1, -1, "", -1, 0, 0 }, style; for (ch = line->first_char; ch; ch = ch->next) { fz_rect r = JM_char_bbox(ctx, line, ch); if (!JM_rects_overlap(tp_rect, r) && !fz_is_infinite_rect(tp_rect)) { continue; } int flags = JM_char_font_flags(ctx, ch->font, line, ch); fz_point origin = ch->origin; style.size = ch->size; style.flags = flags; style.font = JM_font_name(ctx, ch->font); style.color = ch->color; style.asc = JM_font_ascender(ctx, ch->font); 
style.desc = JM_font_descender(ctx, ch->font); if (style.size != old_style.size || style.flags != old_style.flags || style.color != old_style.color || strcmp(style.font, old_style.font) != 0) { if (old_style.size >= 0) { // not first one, output previous if (raw) { // put character list in the span DICT_SETITEM_DROP(span, dictkey_chars, char_list); char_list = NULL; } else { // put text string in the span DICT_SETITEM_DROP(span, dictkey_text, JM_EscapeStrFromBuffer(ctx, buff)); fz_clear_buffer(ctx, buff); } DICT_SETITEM_DROP(span, dictkey_origin, JM_py_from_point(span_origin)); DICT_SETITEM_DROP(span, dictkey_bbox, JM_py_from_rect(span_rect)); line_rect = fz_union_rect(line_rect, span_rect); LIST_APPEND_DROP(span_list, span); span = NULL; } span = PyDict_New(); float asc = style.asc, desc = style.desc; if (style.asc < 1e-3) { asc = 0.9f; desc = -0.1f; } DICT_SETITEM_DROP(span, dictkey_size, Py_BuildValue("f", style.size)); DICT_SETITEM_DROP(span, dictkey_flags, Py_BuildValue("i", style.flags)); DICT_SETITEM_DROP(span, dictkey_font, JM_EscapeStrFromStr(style.font)); DICT_SETITEM_DROP(span, dictkey_color, Py_BuildValue("i", style.color)); DICT_SETITEMSTR_DROP(span, "ascender", Py_BuildValue("f", asc)); DICT_SETITEMSTR_DROP(span, "descender", Py_BuildValue("f", desc)); old_style = style; span_rect = r; span_origin = origin; } span_rect = fz_union_rect(span_rect, r); if (raw) { // make and append a char dict char_dict = PyDict_New(); DICT_SETITEM_DROP(char_dict, dictkey_origin, JM_py_from_point(ch->origin)); DICT_SETITEM_DROP(char_dict, dictkey_bbox, JM_py_from_rect(r)); DICT_SETITEM_DROP(char_dict, dictkey_c, Py_BuildValue("C", ch->c)); if (!char_list) { char_list = PyList_New(0); } LIST_APPEND_DROP(char_list, char_dict); } else { // add character byte to buffer JM_append_rune(ctx, buff, ch->c); } } // all characters processed, now flush remaining span if (span) { if (raw) { DICT_SETITEM_DROP(span, dictkey_chars, char_list); char_list = NULL; } else { 
DICT_SETITEM_DROP(span, dictkey_text, JM_EscapeStrFromBuffer(ctx, buff)); fz_clear_buffer(ctx, buff); } DICT_SETITEM_DROP(span, dictkey_origin, JM_py_from_point(span_origin)); DICT_SETITEM_DROP(span, dictkey_bbox, JM_py_from_rect(span_rect)); if (!fz_is_empty_rect(span_rect)) { LIST_APPEND_DROP(span_list, span); line_rect = fz_union_rect(line_rect, span_rect); } else { Py_DECREF(span); } span = NULL; } if (!fz_is_empty_rect(line_rect)) { DICT_SETITEM_DROP(line_dict, dictkey_spans, span_list); } else { DICT_SETITEM_DROP(line_dict, dictkey_spans, span_list); } return line_rect; } static void JM_make_image_block(fz_context *ctx, fz_stext_block *block, PyObject *block_dict) { fz_image *image = block->u.i.image; fz_buffer *buf = NULL, *freebuf = NULL; fz_compressed_buffer *buffer = fz_compressed_image_buffer(ctx, image); fz_var(buf); fz_var(freebuf); int n = fz_colorspace_n(ctx, image->colorspace); int w = image->w; int h = image->h; const char *ext = NULL; int type = FZ_IMAGE_UNKNOWN; if (buffer) type = buffer->params.type; if (type < FZ_IMAGE_BMP || type == FZ_IMAGE_JBIG2) type = FZ_IMAGE_UNKNOWN; PyObject *bytes = NULL; fz_var(bytes); fz_try(ctx) { if (buffer && type != FZ_IMAGE_UNKNOWN) { buf = buffer->buffer; ext = JM_image_extension(type); } else { buf = freebuf = fz_new_buffer_from_image_as_png(ctx, image, fz_default_color_params); ext = "png"; } bytes = JM_BinFromBuffer(ctx, buf); } fz_always(ctx) { if (!bytes) bytes = JM_BinFromChar(""); DICT_SETITEM_DROP(block_dict, dictkey_width, Py_BuildValue("i", w)); DICT_SETITEM_DROP(block_dict, dictkey_height, Py_BuildValue("i", h)); DICT_SETITEM_DROP(block_dict, dictkey_ext, Py_BuildValue("s", ext)); DICT_SETITEM_DROP(block_dict, dictkey_colorspace, Py_BuildValue("i", n)); DICT_SETITEM_DROP(block_dict, dictkey_xres, Py_BuildValue("i", image->xres)); DICT_SETITEM_DROP(block_dict, dictkey_yres, Py_BuildValue("i", image->xres)); DICT_SETITEM_DROP(block_dict, dictkey_bpc, Py_BuildValue("i", (int) image->bpc)); 
DICT_SETITEM_DROP(block_dict, dictkey_matrix, JM_py_from_matrix(block->u.i.transform)); DICT_SETITEM_DROP(block_dict, dictkey_size, Py_BuildValue("n", PyBytes_Size(bytes))); DICT_SETITEM_DROP(block_dict, dictkey_image, bytes); fz_drop_buffer(ctx, freebuf); } fz_catch(ctx) {;} return; } static void JM_make_text_block(fz_context *ctx, fz_stext_block *block, PyObject *block_dict, int raw, fz_buffer *buff, fz_rect tp_rect) { fz_stext_line *line; PyObject *line_list = PyList_New(0), *line_dict; fz_rect block_rect = fz_empty_rect; for (line = block->u.t.first_line; line; line = line->next) { if (fz_is_empty_rect(fz_intersect_rect(tp_rect, line->bbox)) && !fz_is_infinite_rect(tp_rect)) { continue; } line_dict = PyDict_New(); fz_rect line_rect = JM_make_spanlist(ctx, line_dict, line, raw, buff, tp_rect); block_rect = fz_union_rect(block_rect, line_rect); DICT_SETITEM_DROP(line_dict, dictkey_wmode, Py_BuildValue("i", line->wmode)); DICT_SETITEM_DROP(line_dict, dictkey_dir, JM_py_from_point(line->dir)); DICT_SETITEM_DROP(line_dict, dictkey_bbox, JM_py_from_rect(line_rect)); LIST_APPEND_DROP(line_list, line_dict); } DICT_SETITEM_DROP(block_dict, dictkey_bbox, JM_py_from_rect(block_rect)); DICT_SETITEM_DROP(block_dict, dictkey_lines, line_list); return; } void JM_make_textpage_dict(fz_context *ctx, fz_stext_page *tp, PyObject *page_dict, int raw) { fz_stext_block *block; fz_buffer *text_buffer = fz_new_buffer(ctx, 128); PyObject *block_dict, *block_list = PyList_New(0); fz_rect tp_rect = tp->mediabox; int block_n = -1; for (block = tp->first_block; block; block = block->next) { block_n++; if (!fz_contains_rect(tp_rect, block->bbox) && !fz_is_infinite_rect(tp_rect) && block->type == FZ_STEXT_BLOCK_IMAGE) { continue; } if (!fz_is_infinite_rect(tp_rect) && fz_is_empty_rect(fz_intersect_rect(tp_rect, block->bbox))) { continue; } block_dict = PyDict_New(); DICT_SETITEM_DROP(block_dict, dictkey_number, Py_BuildValue("i", block_n)); DICT_SETITEM_DROP(block_dict, dictkey_type, 
Py_BuildValue("i", block->type)); if (block->type == FZ_STEXT_BLOCK_IMAGE) { DICT_SETITEM_DROP(block_dict, dictkey_bbox, JM_py_from_rect(block->bbox)); JM_make_image_block(ctx, block, block_dict); } else { JM_make_text_block(ctx, block, block_dict, raw, text_buffer, tp_rect); } LIST_APPEND_DROP(block_list, block_dict); } DICT_SETITEM_DROP(page_dict, dictkey_blocks, block_list); fz_drop_buffer(ctx, text_buffer); } //--------------------------------------------------------------------- PyObject * JM_copy_rectangle(fz_context *ctx, fz_stext_page *page, fz_rect area) { fz_stext_block *block; fz_stext_line *line; fz_stext_char *ch; fz_buffer *buffer; int need_new_line = 0; PyObject *rc = NULL; fz_try(ctx) { buffer = fz_new_buffer(ctx, 1024); for (block = page->first_block; block; block = block->next) { if (block->type != FZ_STEXT_BLOCK_TEXT) continue; for (line = block->u.t.first_line; line; line = line->next) { int line_had_text = 0; for (ch = line->first_char; ch; ch = ch->next) { fz_rect r = JM_char_bbox(ctx, line, ch); if (JM_rects_overlap(area, r)) { line_had_text = 1; if (need_new_line) { fz_append_string(ctx, buffer, "\n"); need_new_line = 0; } JM_append_rune(ctx, buffer, ch->c); } } if (line_had_text) need_new_line = 1; } } fz_terminate_buffer(ctx, buffer); rc = JM_EscapeStrFromBuffer(ctx, buffer); if (!rc) { rc = EMPTY_STRING; PyErr_Clear(); } } fz_always(ctx) { fz_drop_buffer(ctx, buffer); } fz_catch(ctx) { fz_rethrow(ctx); } return rc; } //--------------------------------------------------------------------- fz_buffer *JM_object_to_buffer(fz_context *ctx, pdf_obj *what, int compress, int ascii) { fz_buffer *res=NULL; fz_output *out=NULL; fz_try(ctx) { res = fz_new_buffer(ctx, 512); out = fz_new_output_with_buffer(ctx, res); pdf_print_obj(ctx, out, what, compress, ascii); } fz_always(ctx) { fz_drop_output(ctx, out); } fz_catch(ctx) { fz_rethrow(ctx); } fz_terminate_buffer(ctx, res); return res; } 
//----------------------------------------------------------------------------- // Merge the /Resources object created by a text pdf device into the page. // The device may have created multiple /ExtGState/Alp? and /Font/F? objects. // These need to be renamed (renumbered) to not overwrite existing page // objects from previous executions. // Returns the next available numbers n, m for objects /Alp<n>, /F<m>. //----------------------------------------------------------------------------- PyObject *JM_merge_resources(fz_context *ctx, pdf_page *page, pdf_obj *temp_res) { // page objects /Resources, /Resources/ExtGState, /Resources/Font pdf_obj *resources = pdf_dict_get(ctx, page->obj, PDF_NAME(Resources)); pdf_obj *main_extg = pdf_dict_get(ctx, resources, PDF_NAME(ExtGState)); pdf_obj *main_fonts = pdf_dict_get(ctx, resources, PDF_NAME(Font)); // text pdf device objects /ExtGState, /Font pdf_obj *temp_extg = pdf_dict_get(ctx, temp_res, PDF_NAME(ExtGState)); pdf_obj *temp_fonts = pdf_dict_get(ctx, temp_res, PDF_NAME(Font)); int max_alp = -1, max_fonts = -1, i, n; char text[20]; // Handle /Alp objects if (pdf_is_dict(ctx, temp_extg)) // any created at all? { n = pdf_dict_len(ctx, temp_extg); if (pdf_is_dict(ctx, main_extg)) { // does page have /ExtGState yet? 
for (i = 0; i < pdf_dict_len(ctx, main_extg); i++) { // get highest number of objects named /Alpxxx char *alp = (char *) pdf_to_name(ctx, pdf_dict_get_key(ctx, main_extg, i)); if (strncmp(alp, "Alp", 3) != 0) continue; int j = fz_atoi(alp + 3); if (j > max_alp) max_alp = j; } } else // create a /ExtGState for the page main_extg = pdf_dict_put_dict(ctx, resources, PDF_NAME(ExtGState), n); max_alp += 1; for (i = 0; i < n; i++) // copy over renumbered /Alp objects { char *alp = (char *) pdf_to_name(ctx, pdf_dict_get_key(ctx, temp_extg, i)); int j = fz_atoi(alp + 3) + max_alp; fz_snprintf(text, sizeof(text), "Alp%d", j); // new name pdf_obj *val = pdf_dict_get_val(ctx, temp_extg, i); pdf_dict_puts(ctx, main_extg, text, val); } } if (pdf_is_dict(ctx, main_fonts)) { // has page any fonts yet? for (i = 0; i < pdf_dict_len(ctx, main_fonts); i++) { // get max font number char *font = (char *) pdf_to_name(ctx, pdf_dict_get_key(ctx, main_fonts, i)); if (strncmp(font, "F", 1) != 0) continue; int j = fz_atoi(font + 1); if (j > max_fonts) max_fonts = j; } } else // create a Resources/Font for the page main_fonts = pdf_dict_put_dict(ctx, resources, PDF_NAME(Font), 2); max_fonts += 1; for (i = 0; i < pdf_dict_len(ctx, temp_fonts); i++) { // copy renumbered fonts char *font = (char *) pdf_to_name(ctx, pdf_dict_get_key(ctx, temp_fonts, i)); int j = fz_atoi(font + 1) + max_fonts; fz_snprintf(text, sizeof(text), "F%d", j); pdf_obj *val = pdf_dict_get_val(ctx, temp_fonts, i); pdf_dict_puts(ctx, main_fonts, text, val); } return Py_BuildValue("ii", max_alp, max_fonts); // next available numbers } //----------------------------------------------------------------------------- // version of fz_show_string, which covers SMALL CAPS //----------------------------------------------------------------------------- fz_matrix JM_show_string_cs(fz_context *ctx, fz_text *text, fz_font *user_font, fz_matrix trm, const char *s, int wmode, int bidi_level, fz_bidi_direction markup_dir, fz_text_language 
language) { fz_font *font=NULL; int gid, ucs; float adv; while (*s) { s += fz_chartorune(&ucs, s); gid = fz_encode_character_sc(ctx, user_font, ucs); if (gid == 0) { gid = fz_encode_character_with_fallback(ctx, user_font, ucs, 0, language, &font); } else { font = user_font; } fz_show_glyph(ctx, text, font, trm, gid, ucs, wmode, bidi_level, markup_dir, language); adv = fz_advance_glyph(ctx, font, gid, wmode); if (wmode == 0) trm = fz_pre_translate(trm, adv, 0); else trm = fz_pre_translate(trm, 0, -adv); } return trm; } //----------------------------------------------------------------------------- // version of fz_show_string, which also covers UCDN script //----------------------------------------------------------------------------- fz_matrix JM_show_string(fz_context *ctx, fz_text *text, fz_font *user_font, fz_matrix trm, const char *s, int wmode, int bidi_level, fz_bidi_direction markup_dir, fz_text_language language, int script) { fz_font *font; int gid, ucs; float adv; while (*s) { s += fz_chartorune(&ucs, s); gid = fz_encode_character_with_fallback(ctx, user_font, ucs, script, language, &font); fz_show_glyph(ctx, text, font, trm, gid, ucs, wmode, bidi_level, markup_dir, language); adv = fz_advance_glyph(ctx, font, gid, wmode); if (wmode == 0) trm = fz_pre_translate(trm, adv, 0); else trm = fz_pre_translate(trm, 0, -adv); } return trm; } //----------------------------------------------------------------------------- // return a fz_font from a number of parameters //----------------------------------------------------------------------------- fz_font *JM_get_font(fz_context *ctx, char *fontname, char *fontfile, PyObject *fontbuffer, int script, int lang, int ordering, int is_bold, int is_italic, int is_serif, int embed) { const unsigned char *data = NULL; int size, index=0; fz_buffer *res = NULL; fz_font *font = NULL; fz_try(ctx) { if (fontfile) goto have_file; if (EXISTS(fontbuffer)) goto have_buffer; if (ordering > -1) goto have_cjk; if (fontname) goto 
have_base14; goto have_noto; // Base-14 or a MuPDF builtin font have_base14:; font = fz_new_base14_font(ctx, fontname); if (font) { goto fertig; } font = fz_new_builtin_font(ctx, fontname, is_bold, is_italic); goto fertig; // CJK font have_cjk:; font = fz_new_cjk_font(ctx, ordering); goto fertig; // fontfile have_file:; font = fz_new_font_from_file(ctx, NULL, fontfile, index, 0); goto fertig; // fontbuffer have_buffer:; res = JM_BufferFromBytes(ctx, fontbuffer); font = fz_new_font_from_buffer(ctx, NULL, res, index, 0); goto fertig; // Check for NOTO font have_noto:; data = fz_lookup_noto_font(ctx, script, lang, &size, &index); if (data) font = fz_new_font_from_memory(ctx, NULL, data, size, index, 0); if (font) goto fertig; font = fz_load_fallback_font(ctx, script, lang, is_serif, is_bold, is_italic); goto fertig; fertig:; if (!font) { RAISEPY(ctx, MSG_FONT_FAILED, PyExc_RuntimeError); } #if FZ_VERSION_MAJOR == 1 && FZ_VERSION_MINOR >= 22 // if font allows this, set embedding if (!font->flags.never_embed) { fz_set_font_embedding(ctx, font, embed); } #endif } fz_always(ctx) { fz_drop_buffer(ctx, res); } fz_catch(ctx) { fz_rethrow(ctx); } return font; } %}
