Mercurial > hgrepos > Python2 > PyMuPDF
diff src_classic/helper-stext.i @ 3:2c135c81b16c
MERGE: upstream PyMuPDF 1.26.4 with MuPDF 1.26.7
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Mon, 15 Sep 2025 11:44:09 +0200 |
| parents | 1d09e1dec1d9 |
| children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/src_classic/helper-stext.i Mon Sep 15 11:44:09 2025 +0200 @@ -0,0 +1,1072 @@ +%{ +/* +# ------------------------------------------------------------------------ +# Copyright 2020-2022, Harald Lieder, mailto:harald.lieder@outlook.com +# License: GNU AFFERO GPL 3.0, https://www.gnu.org/licenses/agpl-3.0.html +# +# Part of "PyMuPDF", a Python binding for "MuPDF" (http://mupdf.com), a +# lightweight PDF, XPS, and E-book viewer, renderer and toolkit which is +# maintained and developed by Artifex Software, Inc. https://artifex.com. +# ------------------------------------------------------------------------ +*/ +// need own versions of ascender / descender +static const float +JM_font_ascender(fz_context *ctx, fz_font *font) +{ + if (skip_quad_corrections) { + return 0.8f; + } + return fz_font_ascender(ctx, font); +} + +static const float +JM_font_descender(fz_context *ctx, fz_font *font) +{ + if (skip_quad_corrections) { + return -0.2f; + } + return fz_font_descender(ctx, font); +} + + +//---------------------------------------------------------------- +// Return true if character is considered to be a word delimiter +//---------------------------------------------------------------- +static const int +JM_is_word_delimiter(int c, PyObject *delimiters) +{ + if (c <= 32 || c == 160) return 1; // a standard delimiter + + // extra delimiters must be a non-empty sequence + if (!delimiters || PyObject_Not(delimiters) || !PySequence_Check(delimiters)) { + return 0; + } + + // convert to tuple for easier looping + PyObject *delims = PySequence_Tuple(delimiters); + if (!delims) { + PyErr_Clear(); + return 0; + } + + // Make 1-char PyObject from character given as integer + PyObject *cchar = Py_BuildValue("C", c); // single character PyObject + Py_ssize_t i, len = PyTuple_Size(delims); + for (i = 0; i < len; i++) { + int rc = PyUnicode_Compare(cchar, PyTuple_GET_ITEM(delims, i)); + if (rc == 0) { // equal to a delimiter character + Py_DECREF(cchar); + Py_DECREF(delims); + PyErr_Clear(); + return 1; + } + } + + Py_DECREF(delims); + PyErr_Clear(); + return 0; +} + +/* inactive +//----------------------------------------------------------------------------- +// Make OCR text page directly from an fz_page +//----------------------------------------------------------------------------- +fz_stext_page * +JM_new_stext_page_ocr_from_page(fz_context *ctx, fz_page *page, fz_rect rect, int flags, + const char *lang, const char *tessdata) +{ + if (!page) return NULL; + int with_list = 1; + fz_stext_page *tp = NULL; + fz_device *dev = NULL, *ocr_dev = NULL; + fz_var(dev); + fz_var(ocr_dev); + fz_var(tp); + fz_stext_options options; + memset(&options, 0, sizeof options); + options.flags = flags; + //fz_matrix ctm = fz_identity; + fz_matrix ctm1 = fz_make_matrix(100/72, 0, 0, 100/72, 0, 0); + fz_matrix ctm2 = fz_make_matrix(400/72, 0, 0, 400/72, 0, 0); + + fz_try(ctx) { + tp = fz_new_stext_page(ctx, rect); + dev = fz_new_stext_device(ctx, tp, &options); + ocr_dev = fz_new_ocr_device(ctx, dev, fz_identity, rect, with_list, lang, tessdata, NULL); + fz_run_page(ctx, page, ocr_dev, fz_identity, NULL); + fz_close_device(ctx, ocr_dev); + fz_close_device(ctx, dev); + } + fz_always(ctx) { + fz_drop_device(ctx, dev); + fz_drop_device(ctx, ocr_dev); + } + fz_catch(ctx) { + fz_drop_stext_page(ctx, tp); + fz_rethrow(ctx); + } + return tp; +} +*/ + +//--------------------------------------------------------------------------- +// APPEND non-ascii runes in unicode escape format to fz_buffer +//--------------------------------------------------------------------------- +void JM_append_rune(fz_context *ctx, fz_buffer *buff, int ch) +{ + if (ch == 92) { // prevent accidental "\u" etc. + fz_append_string(ctx, buff, "\\u005c"); + } else if ((ch >= 32 && ch <= 255) || ch == 10) { + fz_append_byte(ctx, buff, ch); + } else if (ch >= 0xd800 && ch <= 0xdfff) { // surrogate Unicode range + fz_append_string(ctx, buff, "\\ufffd"); + } else if (ch <= 0xffff) { // 4 hex digits + fz_append_printf(ctx, buff, "\\u%04x", ch); + } else { // 8 hex digits + fz_append_printf(ctx, buff, "\\U%08x", ch); + } +} + + +// re-compute char quad if ascender/descender values make no sense +static fz_quad +JM_char_quad(fz_context *ctx, fz_stext_line *line, fz_stext_char *ch) +{ + if (skip_quad_corrections) { // no special handling + return ch->quad; + } + if (line->wmode) { // never touch vertical write mode + return ch->quad; + } + fz_font *font = ch->font; + float asc = JM_font_ascender(ctx, font); + float dsc = JM_font_descender(ctx, font); + float c, s, fsize = ch->size; + float asc_dsc = asc - dsc + FLT_EPSILON; + if (asc_dsc >= 1 && small_glyph_heights == 0) { // no problem + return ch->quad; + } + if (asc < 1e-3) { // probably Tesseract glyphless font + dsc = -0.1f; + asc = 0.9f; + asc_dsc = 1.0f; + } + + if (small_glyph_heights || asc_dsc < 1) { + dsc = dsc / asc_dsc; + asc = asc / asc_dsc; + } + asc_dsc = asc - dsc; + asc = asc * fsize / asc_dsc; + dsc = dsc * fsize / asc_dsc; + + /* ------------------------------ + Re-compute quad with the adjusted ascender / descender values: + Move ch->origin to (0,0) and de-rotate quad, then adjust the corners, + re-rotate and move back to ch->origin location. + ------------------------------ */ + fz_matrix trm1, trm2, xlate1, xlate2; + fz_quad quad; + c = line->dir.x; // cosine + s = line->dir.y; // sine + trm1 = fz_make_matrix(c, -s, s, c, 0, 0); // derotate + trm2 = fz_make_matrix(c, s, -s, c, 0, 0); // rotate + if (c == -1) { // left-right flip + trm1.d = 1; + trm2.d = 1; + } + xlate1 = fz_make_matrix(1, 0, 0, 1, -ch->origin.x, -ch->origin.y); + xlate2 = fz_make_matrix(1, 0, 0, 1, ch->origin.x, ch->origin.y); + + quad = fz_transform_quad(ch->quad, xlate1); // move origin to (0,0) + quad = fz_transform_quad(quad, trm1); // de-rotate corners + + // adjust vertical coordinates + if (c == 1 && quad.ul.y > 0) { // up-down flip + quad.ul.y = asc; + quad.ur.y = asc; + quad.ll.y = dsc; + quad.lr.y = dsc; + } else { + quad.ul.y = -asc; + quad.ur.y = -asc; + quad.ll.y = -dsc; + quad.lr.y = -dsc; + } + + // adjust horizontal coordinates that are too crazy: + // (1) left x must be >= 0 + // (2) if bbox width is 0, lookup char advance in font. + if (quad.ll.x < 0) { + quad.ll.x = 0; + quad.ul.x = 0; + } + float cwidth = quad.lr.x - quad.ll.x; + if (cwidth < FLT_EPSILON) { + int glyph = fz_encode_character(ctx, font, ch->c); + if (glyph) { + float fwidth = fz_advance_glyph(ctx, font, glyph, line->wmode); + quad.lr.x = quad.ll.x + fwidth * fsize; + quad.ur.x = quad.lr.x; + } + } + + quad = fz_transform_quad(quad, trm2); // rotate back + quad = fz_transform_quad(quad, xlate2); // translate back + return quad; +} + + +// return rect of char quad +static fz_rect +JM_char_bbox(fz_context *ctx, fz_stext_line *line, fz_stext_char *ch) +{ + fz_rect r = fz_rect_from_quad(JM_char_quad(ctx, line, ch)); + if (!line->wmode) { + return r; + } + if (r.y1 < r.y0 + ch->size) { + r.y0 = r.y1 - ch->size; + } + return r; +} + + +//------------------------------------------- +// make a buffer from an stext_page's text +//------------------------------------------- +fz_buffer * +JM_new_buffer_from_stext_page(fz_context *ctx, fz_stext_page *page) +{ + fz_stext_block *block; + fz_stext_line *line; + fz_stext_char *ch; + fz_rect rect = page->mediabox; + fz_buffer *buf = NULL; + + fz_try(ctx) + { + buf = fz_new_buffer(ctx, 256); + for (block = page->first_block; block; block = block->next) { + if (block->type == FZ_STEXT_BLOCK_TEXT) { + for (line = block->u.t.first_line; line; line = line->next) { + for (ch = line->first_char; ch; ch = ch->next) { + if (!JM_rects_overlap(rect, JM_char_bbox(ctx, line, ch)) && + !fz_is_infinite_rect(rect)) { + continue; + } + fz_append_rune(ctx, buf, ch->c); + } + fz_append_byte(ctx, buf, '\n'); + } + fz_append_byte(ctx, buf, '\n'); + } + } + } + fz_catch(ctx) { + fz_drop_buffer(ctx, buf); + fz_rethrow(ctx); + } + return buf; +} + + +static float hdist(fz_point *dir, fz_point *a, fz_point *b) +{ + float dx = b->x - a->x; + float dy = b->y - a->y; + return fz_abs(dx * dir->x + dy * dir->y); +} + + +static float vdist(fz_point *dir, fz_point *a, fz_point *b) +{ + float dx = b->x - a->x; + float dy = b->y - a->y; + return fz_abs(dx * dir->y + dy * dir->x); +} + + +struct highlight +{ + Py_ssize_t len; + PyObject *quads; + float hfuzz, vfuzz; +}; + + +static void on_highlight_char(fz_context *ctx, void *arg, fz_stext_line *line, fz_stext_char *ch) +{ + struct highlight *hits = arg; + float vfuzz = ch->size * hits->vfuzz; + float hfuzz = ch->size * hits->hfuzz; + fz_quad ch_quad = JM_char_quad(ctx, line, ch); + if (hits->len > 0) { + PyObject *quad = PySequence_ITEM(hits->quads, hits->len - 1); + fz_quad end = JM_quad_from_py(quad); + Py_DECREF(quad); + if (hdist(&line->dir, &end.lr, &ch_quad.ll) < hfuzz + && vdist(&line->dir, &end.lr, &ch_quad.ll) < vfuzz + && hdist(&line->dir, &end.ur, &ch_quad.ul) < hfuzz + && vdist(&line->dir, &end.ur, &ch_quad.ul) < vfuzz) + { + end.ur = ch_quad.ur; + end.lr = ch_quad.lr; + quad = JM_py_from_quad(end); + PyList_SetItem(hits->quads, hits->len - 1, quad); + return; + } + } + LIST_APPEND_DROP(hits->quads, JM_py_from_quad(ch_quad)); + hits->len++; +} + + +static inline int canon(int c) +{ + /* TODO: proper unicode case folding */ + /* TODO: character equivalence (a matches รค, etc) */ + if (c == 0xA0 || c == 0x2028 || c == 0x2029) + return ' '; + if (c == '\r' || c == '\n' || c == '\t') + return ' '; + if (c >= 'A' && c <= 'Z') + return c - 'A' + 'a'; + return c; +} + + +static inline int chartocanon(int *c, const char *s) +{ + int n = fz_chartorune(c, s); + *c = canon(*c); + return n; +} + + +static const char *match_string(const char *h, const char *n) +{ + int hc, nc; + const char *e = h; + h += chartocanon(&hc, h); + n += chartocanon(&nc, n); + while (hc == nc) + { + e = h; + if (hc == ' ') + do + h += chartocanon(&hc, h); + while (hc == ' '); + else + h += chartocanon(&hc, h); + if (nc == ' ') + do + n += chartocanon(&nc, n); + while (nc == ' '); + else + n += chartocanon(&nc, n); + } + return nc == 0 ? e : NULL; +} + + +static const char *find_string(const char *s, const char *needle, const char **endp) +{ + const char *end; + while (*s) + { + end = match_string(s, needle); + if (end) + return *endp = end, s; + ++s; + } + return *endp = NULL, NULL; +} + + +PyObject * +JM_search_stext_page(fz_context *ctx, fz_stext_page *page, const char *needle) +{ + struct highlight hits; + fz_stext_block *block; + fz_stext_line *line; + fz_stext_char *ch; + fz_buffer *buffer = NULL; + const char *haystack, *begin, *end; + fz_rect rect = page->mediabox; + int c, inside; + + if (strlen(needle) == 0) Py_RETURN_NONE; + PyObject *quads = PyList_New(0); + hits.len = 0; + hits.quads = quads; + hits.hfuzz = 0.2f; /* merge kerns but not large gaps */ + hits.vfuzz = 0.1f; + + fz_try(ctx) { + buffer = JM_new_buffer_from_stext_page(ctx, page); + haystack = fz_string_from_buffer(ctx, buffer); + begin = find_string(haystack, needle, &end); + if (!begin) goto no_more_matches; + + inside = 0; + for (block = page->first_block; block; block = block->next) { + if (block->type != FZ_STEXT_BLOCK_TEXT) { + continue; + } + for (line = block->u.t.first_line; line; line = line->next) { + for (ch = line->first_char; ch; ch = ch->next) { + if (!fz_is_infinite_rect(rect) && + !JM_rects_overlap(rect, JM_char_bbox(ctx, line, ch))) { + goto next_char; + } +try_new_match: + if (!inside) { + if (haystack >= begin) inside = 1; + } + if (inside) { + if (haystack < end) { + on_highlight_char(ctx, &hits, line, ch); + } else { + inside = 0; + begin = find_string(haystack, needle, &end); + if (!begin) goto no_more_matches; + else goto try_new_match; + } + } + haystack += fz_chartorune(&c, haystack); +next_char:; + } + assert(*haystack == '\n'); + ++haystack; + } + assert(*haystack == '\n'); + ++haystack; + } +no_more_matches:; + } + fz_always(ctx) + fz_drop_buffer(ctx, buffer); + fz_catch(ctx) + fz_rethrow(ctx); + + return quads; +} + + +//----------------------------------------------------------------------------- +// Plain text output. An identical copy of fz_print_stext_page_as_text, +// but lines within a block are concatenated by space instead a new-line +// character (which else leads to 2 new-lines). +//----------------------------------------------------------------------------- +void +JM_print_stext_page_as_text(fz_context *ctx, fz_buffer *buff, fz_stext_page *page) +{ + fz_stext_block *block; + fz_stext_line *line; + fz_stext_char *ch; + fz_rect rect = page->mediabox; + fz_rect chbbox; + int last_char = 0; + char utf[10]; + int i, n; + + for (block = page->first_block; block; block = block->next) { + if (block->type == FZ_STEXT_BLOCK_TEXT) { + for (line = block->u.t.first_line; line; line = line->next) { + last_char = 0; + for (ch = line->first_char; ch; ch = ch->next) { + chbbox = JM_char_bbox(ctx, line, ch); + if (fz_is_infinite_rect(rect) || + JM_rects_overlap(rect, chbbox)) { + last_char = ch->c; + JM_append_rune(ctx, buff, ch->c); + } + } + if (last_char != 10 && last_char > 0) { + fz_append_string(ctx, buff, "\n"); + } + } + } + } +} + +//----------------------------------------------------------------------------- +// Functions for wordlist output +//----------------------------------------------------------------------------- +int JM_append_word(fz_context *ctx, PyObject *lines, fz_buffer *buff, fz_rect *wbbox, + int block_n, int line_n, int word_n) +{ + PyObject *s = JM_EscapeStrFromBuffer(ctx, buff); + PyObject *litem = Py_BuildValue("ffffOiii", + wbbox->x0, + wbbox->y0, + wbbox->x1, + wbbox->y1, + s, + block_n, line_n, word_n); + LIST_APPEND_DROP(lines, litem); + Py_DECREF(s); + *wbbox = fz_empty_rect; + return word_n + 1; // word counter +} + +//----------------------------------------------------------------------------- +// Functions for dictionary output +//----------------------------------------------------------------------------- + +static int detect_super_script(fz_stext_line *line, fz_stext_char *ch) +{ + if (line->wmode == 0 && line->dir.x == 1 && line->dir.y == 0) + return ch->origin.y < line->first_char->origin.y - ch->size * 0.1f; + return 0; +} + +static int JM_char_font_flags(fz_context *ctx, fz_font *font, fz_stext_line *line, fz_stext_char *ch) +{ + int flags = detect_super_script(line, ch); + flags += fz_font_is_italic(ctx, font) * TEXT_FONT_ITALIC; + flags += fz_font_is_serif(ctx, font) * TEXT_FONT_SERIFED; + flags += fz_font_is_monospaced(ctx, font) * TEXT_FONT_MONOSPACED; + flags += fz_font_is_bold(ctx, font) * TEXT_FONT_BOLD; + return flags; +} + +static const char * +JM_font_name(fz_context *ctx, fz_font *font) +{ + const char *name = fz_font_name(ctx, font); + const char *s = strchr(name, '+'); + if (subset_fontnames || s == NULL || s-name != 6) { + return name; + } + return s + 1; +} + + +static fz_rect +JM_make_spanlist(fz_context *ctx, PyObject *line_dict, + fz_stext_line *line, int raw, fz_buffer *buff, + fz_rect tp_rect) +{ + PyObject *span = NULL, *char_list = NULL, *char_dict; + PyObject *span_list = PyList_New(0); + fz_clear_buffer(ctx, buff); + fz_stext_char *ch; + fz_rect span_rect = fz_empty_rect; + fz_rect line_rect = fz_empty_rect; + fz_point span_origin = {0, 0}; + typedef struct style_s { + float size; int flags; const char *font; int color; + float asc; float desc; + } char_style; + char_style old_style = { -1, -1, "", -1, 0, 0 }, style; + + for (ch = line->first_char; ch; ch = ch->next) { + fz_rect r = JM_char_bbox(ctx, line, ch); + if (!JM_rects_overlap(tp_rect, r) && + !fz_is_infinite_rect(tp_rect)) { + continue; + } + int flags = JM_char_font_flags(ctx, ch->font, line, ch); + fz_point origin = ch->origin; + style.size = ch->size; + style.flags = flags; + style.font = JM_font_name(ctx, ch->font); + style.color = ch->color; + style.asc = JM_font_ascender(ctx, ch->font); + style.desc = JM_font_descender(ctx, ch->font); + + if (style.size != old_style.size || + style.flags != old_style.flags || + style.color != old_style.color || + strcmp(style.font, old_style.font) != 0) { + + if (old_style.size >= 0) { + // not first one, output previous + if (raw) { + // put character list in the span + DICT_SETITEM_DROP(span, dictkey_chars, char_list); + char_list = NULL; + } else { + // put text string in the span + DICT_SETITEM_DROP(span, dictkey_text, JM_EscapeStrFromBuffer(ctx, buff)); + fz_clear_buffer(ctx, buff); + } + + DICT_SETITEM_DROP(span, dictkey_origin, + JM_py_from_point(span_origin)); + DICT_SETITEM_DROP(span, dictkey_bbox, + JM_py_from_rect(span_rect)); + line_rect = fz_union_rect(line_rect, span_rect); + LIST_APPEND_DROP(span_list, span); + span = NULL; + } + + span = PyDict_New(); + float asc = style.asc, desc = style.desc; + if (style.asc < 1e-3) { + asc = 0.9f; + desc = -0.1f; + } + + DICT_SETITEM_DROP(span, dictkey_size, Py_BuildValue("f", style.size)); + DICT_SETITEM_DROP(span, dictkey_flags, Py_BuildValue("i", style.flags)); + DICT_SETITEM_DROP(span, dictkey_font, JM_EscapeStrFromStr(style.font)); + DICT_SETITEM_DROP(span, dictkey_color, Py_BuildValue("i", style.color)); + DICT_SETITEMSTR_DROP(span, "ascender", Py_BuildValue("f", asc)); + DICT_SETITEMSTR_DROP(span, "descender", Py_BuildValue("f", desc)); + + old_style = style; + span_rect = r; + span_origin = origin; + + } + span_rect = fz_union_rect(span_rect, r); + + if (raw) { // make and append a char dict + char_dict = PyDict_New(); + DICT_SETITEM_DROP(char_dict, dictkey_origin, + JM_py_from_point(ch->origin)); + + DICT_SETITEM_DROP(char_dict, dictkey_bbox, + JM_py_from_rect(r)); + + DICT_SETITEM_DROP(char_dict, dictkey_c, + Py_BuildValue("C", ch->c)); + + if (!char_list) { + char_list = PyList_New(0); + } + LIST_APPEND_DROP(char_list, char_dict); + } else { // add character byte to buffer + JM_append_rune(ctx, buff, ch->c); + } + } + // all characters processed, now flush remaining span + if (span) { + if (raw) { + DICT_SETITEM_DROP(span, dictkey_chars, char_list); + char_list = NULL; + } else { + DICT_SETITEM_DROP(span, dictkey_text, JM_EscapeStrFromBuffer(ctx, buff)); + fz_clear_buffer(ctx, buff); + } + DICT_SETITEM_DROP(span, dictkey_origin, JM_py_from_point(span_origin)); + DICT_SETITEM_DROP(span, dictkey_bbox, JM_py_from_rect(span_rect)); + + if (!fz_is_empty_rect(span_rect)) { + LIST_APPEND_DROP(span_list, span); + line_rect = fz_union_rect(line_rect, span_rect); + } else { + Py_DECREF(span); + } + span = NULL; + } + if (!fz_is_empty_rect(line_rect)) { + DICT_SETITEM_DROP(line_dict, dictkey_spans, span_list); + } else { + DICT_SETITEM_DROP(line_dict, dictkey_spans, span_list); + } + return line_rect; +} + +static void JM_make_image_block(fz_context *ctx, fz_stext_block *block, PyObject *block_dict) +{ + fz_image *image = block->u.i.image; + fz_buffer *buf = NULL, *freebuf = NULL; + fz_compressed_buffer *buffer = fz_compressed_image_buffer(ctx, image); + fz_var(buf); + fz_var(freebuf); + int n = fz_colorspace_n(ctx, image->colorspace); + int w = image->w; + int h = image->h; + const char *ext = NULL; + int type = FZ_IMAGE_UNKNOWN; + if (buffer) + type = buffer->params.type; + if (type < FZ_IMAGE_BMP || type == FZ_IMAGE_JBIG2) + type = FZ_IMAGE_UNKNOWN; + PyObject *bytes = NULL; + fz_var(bytes); + fz_try(ctx) { + if (buffer && type != FZ_IMAGE_UNKNOWN) { + buf = buffer->buffer; + ext = JM_image_extension(type); + } else { + buf = freebuf = fz_new_buffer_from_image_as_png(ctx, image, fz_default_color_params); + ext = "png"; + } + bytes = JM_BinFromBuffer(ctx, buf); + } + fz_always(ctx) { + if (!bytes) + bytes = JM_BinFromChar(""); + + DICT_SETITEM_DROP(block_dict, dictkey_width, + Py_BuildValue("i", w)); + DICT_SETITEM_DROP(block_dict, dictkey_height, + Py_BuildValue("i", h)); + DICT_SETITEM_DROP(block_dict, dictkey_ext, + Py_BuildValue("s", ext)); + DICT_SETITEM_DROP(block_dict, dictkey_colorspace, + Py_BuildValue("i", n)); + DICT_SETITEM_DROP(block_dict, dictkey_xres, + Py_BuildValue("i", image->xres)); + DICT_SETITEM_DROP(block_dict, dictkey_yres, + Py_BuildValue("i", image->xres)); + DICT_SETITEM_DROP(block_dict, dictkey_bpc, + Py_BuildValue("i", (int) image->bpc)); + DICT_SETITEM_DROP(block_dict, dictkey_matrix, + JM_py_from_matrix(block->u.i.transform)); + DICT_SETITEM_DROP(block_dict, dictkey_size, + Py_BuildValue("n", PyBytes_Size(bytes))); + DICT_SETITEM_DROP(block_dict, dictkey_image, bytes); + + fz_drop_buffer(ctx, freebuf); + } + fz_catch(ctx) {;} + return; +} + +static void JM_make_text_block(fz_context *ctx, fz_stext_block *block, PyObject *block_dict, int raw, fz_buffer *buff, fz_rect tp_rect) +{ + fz_stext_line *line; + PyObject *line_list = PyList_New(0), *line_dict; + fz_rect block_rect = fz_empty_rect; + for (line = block->u.t.first_line; line; line = line->next) { + if (fz_is_empty_rect(fz_intersect_rect(tp_rect, line->bbox)) && + !fz_is_infinite_rect(tp_rect)) { + continue; + } + line_dict = PyDict_New(); + fz_rect line_rect = JM_make_spanlist(ctx, line_dict, line, raw, buff, tp_rect); + block_rect = fz_union_rect(block_rect, line_rect); + DICT_SETITEM_DROP(line_dict, dictkey_wmode, + Py_BuildValue("i", line->wmode)); + DICT_SETITEM_DROP(line_dict, dictkey_dir, JM_py_from_point(line->dir)); + DICT_SETITEM_DROP(line_dict, dictkey_bbox, + JM_py_from_rect(line_rect)); + LIST_APPEND_DROP(line_list, line_dict); + } + DICT_SETITEM_DROP(block_dict, dictkey_bbox, JM_py_from_rect(block_rect)); + DICT_SETITEM_DROP(block_dict, dictkey_lines, line_list); + return; +} + +void JM_make_textpage_dict(fz_context *ctx, fz_stext_page *tp, PyObject *page_dict, int raw) +{ + fz_stext_block *block; + fz_buffer *text_buffer = fz_new_buffer(ctx, 128); + PyObject *block_dict, *block_list = PyList_New(0); + fz_rect tp_rect = tp->mediabox; + int block_n = -1; + for (block = tp->first_block; block; block = block->next) { + block_n++; + if (!fz_contains_rect(tp_rect, block->bbox) && + !fz_is_infinite_rect(tp_rect) && + block->type == FZ_STEXT_BLOCK_IMAGE) { + continue; + } + if (!fz_is_infinite_rect(tp_rect) && + fz_is_empty_rect(fz_intersect_rect(tp_rect, block->bbox))) { + continue; + } + + block_dict = PyDict_New(); + DICT_SETITEM_DROP(block_dict, dictkey_number, Py_BuildValue("i", block_n)); + DICT_SETITEM_DROP(block_dict, dictkey_type, Py_BuildValue("i", block->type)); + if (block->type == FZ_STEXT_BLOCK_IMAGE) { + DICT_SETITEM_DROP(block_dict, dictkey_bbox, JM_py_from_rect(block->bbox)); + JM_make_image_block(ctx, block, block_dict); + } else { + JM_make_text_block(ctx, block, block_dict, raw, text_buffer, tp_rect); + } + + LIST_APPEND_DROP(block_list, block_dict); + } + DICT_SETITEM_DROP(page_dict, dictkey_blocks, block_list); + fz_drop_buffer(ctx, text_buffer); +} + + + +//--------------------------------------------------------------------- +PyObject * +JM_copy_rectangle(fz_context *ctx, fz_stext_page *page, fz_rect area) +{ + fz_stext_block *block; + fz_stext_line *line; + fz_stext_char *ch; + fz_buffer *buffer; + int need_new_line = 0; + PyObject *rc = NULL; + fz_try(ctx) { + buffer = fz_new_buffer(ctx, 1024); + for (block = page->first_block; block; block = block->next) { + if (block->type != FZ_STEXT_BLOCK_TEXT) + continue; + for (line = block->u.t.first_line; line; line = line->next) { + int line_had_text = 0; + for (ch = line->first_char; ch; ch = ch->next) { + fz_rect r = JM_char_bbox(ctx, line, ch); + if (JM_rects_overlap(area, r)) { + line_had_text = 1; + if (need_new_line) { + fz_append_string(ctx, buffer, "\n"); + need_new_line = 0; + } + JM_append_rune(ctx, buffer, ch->c); + } + } + if (line_had_text) + need_new_line = 1; + } + } + fz_terminate_buffer(ctx, buffer); + rc = JM_EscapeStrFromBuffer(ctx, buffer); + if (!rc) { + rc = EMPTY_STRING; + PyErr_Clear(); + } + } + fz_always(ctx) { + fz_drop_buffer(ctx, buffer); + } + fz_catch(ctx) { + fz_rethrow(ctx); + } + return rc; +} +//--------------------------------------------------------------------- + + + + +fz_buffer *JM_object_to_buffer(fz_context *ctx, pdf_obj *what, int compress, int ascii) +{ + fz_buffer *res=NULL; + fz_output *out=NULL; + fz_try(ctx) { + res = fz_new_buffer(ctx, 512); + out = fz_new_output_with_buffer(ctx, res); + pdf_print_obj(ctx, out, what, compress, ascii); + } + fz_always(ctx) { + fz_drop_output(ctx, out); + } + fz_catch(ctx) { + fz_rethrow(ctx); + } + fz_terminate_buffer(ctx, res); + return res; +} + +//----------------------------------------------------------------------------- +// Merge the /Resources object created by a text pdf device into the page. +// The device may have created multiple /ExtGState/Alp? and /Font/F? objects. +// These need to be renamed (renumbered) to not overwrite existing page +// objects from previous executions. +// Returns the next available numbers n, m for objects /Alp<n>, /F<m>. +//----------------------------------------------------------------------------- +PyObject *JM_merge_resources(fz_context *ctx, pdf_page *page, pdf_obj *temp_res) +{ + // page objects /Resources, /Resources/ExtGState, /Resources/Font + pdf_obj *resources = pdf_dict_get(ctx, page->obj, PDF_NAME(Resources)); + pdf_obj *main_extg = pdf_dict_get(ctx, resources, PDF_NAME(ExtGState)); + pdf_obj *main_fonts = pdf_dict_get(ctx, resources, PDF_NAME(Font)); + + // text pdf device objects /ExtGState, /Font + pdf_obj *temp_extg = pdf_dict_get(ctx, temp_res, PDF_NAME(ExtGState)); + pdf_obj *temp_fonts = pdf_dict_get(ctx, temp_res, PDF_NAME(Font)); + + + int max_alp = -1, max_fonts = -1, i, n; + char text[20]; + + // Handle /Alp objects + if (pdf_is_dict(ctx, temp_extg)) // any created at all? + { + n = pdf_dict_len(ctx, temp_extg); + if (pdf_is_dict(ctx, main_extg)) { // does page have /ExtGState yet? + for (i = 0; i < pdf_dict_len(ctx, main_extg); i++) { + // get highest number of objects named /Alpxxx + char *alp = (char *) pdf_to_name(ctx, pdf_dict_get_key(ctx, main_extg, i)); + if (strncmp(alp, "Alp", 3) != 0) continue; + int j = fz_atoi(alp + 3); + if (j > max_alp) max_alp = j; + } + } + else // create a /ExtGState for the page + main_extg = pdf_dict_put_dict(ctx, resources, PDF_NAME(ExtGState), n); + + max_alp += 1; + for (i = 0; i < n; i++) // copy over renumbered /Alp objects + { + char *alp = (char *) pdf_to_name(ctx, pdf_dict_get_key(ctx, temp_extg, i)); + int j = fz_atoi(alp + 3) + max_alp; + fz_snprintf(text, sizeof(text), "Alp%d", j); // new name + pdf_obj *val = pdf_dict_get_val(ctx, temp_extg, i); + pdf_dict_puts(ctx, main_extg, text, val); + } + } + + + if (pdf_is_dict(ctx, main_fonts)) { // has page any fonts yet? + for (i = 0; i < pdf_dict_len(ctx, main_fonts); i++) { // get max font number + char *font = (char *) pdf_to_name(ctx, pdf_dict_get_key(ctx, main_fonts, i)); + if (strncmp(font, "F", 1) != 0) continue; + int j = fz_atoi(font + 1); + if (j > max_fonts) max_fonts = j; + } + } + else // create a Resources/Font for the page + main_fonts = pdf_dict_put_dict(ctx, resources, PDF_NAME(Font), 2); + + max_fonts += 1; + for (i = 0; i < pdf_dict_len(ctx, temp_fonts); i++) { // copy renumbered fonts + char *font = (char *) pdf_to_name(ctx, pdf_dict_get_key(ctx, temp_fonts, i)); + int j = fz_atoi(font + 1) + max_fonts; + fz_snprintf(text, sizeof(text), "F%d", j); + pdf_obj *val = pdf_dict_get_val(ctx, temp_fonts, i); + pdf_dict_puts(ctx, main_fonts, text, val); + } + return Py_BuildValue("ii", max_alp, max_fonts); // next available numbers +} + + +//----------------------------------------------------------------------------- +// version of fz_show_string, which covers SMALL CAPS +//----------------------------------------------------------------------------- +fz_matrix +JM_show_string_cs(fz_context *ctx, fz_text *text, fz_font *user_font, fz_matrix trm, const char *s, + int wmode, int bidi_level, fz_bidi_direction markup_dir, fz_text_language language) +{ + fz_font *font=NULL; + int gid, ucs; + float adv; + + while (*s) + { + s += fz_chartorune(&ucs, s); + gid = fz_encode_character_sc(ctx, user_font, ucs); + if (gid == 0) { + gid = fz_encode_character_with_fallback(ctx, user_font, ucs, 0, language, &font); + } else { + font = user_font; + } + fz_show_glyph(ctx, text, font, trm, gid, ucs, wmode, bidi_level, markup_dir, language); + adv = fz_advance_glyph(ctx, font, gid, wmode); + if (wmode == 0) + trm = fz_pre_translate(trm, adv, 0); + else + trm = fz_pre_translate(trm, 0, -adv); + } + + return trm; +} + + +//----------------------------------------------------------------------------- +// version of fz_show_string, which also covers UCDN script +//----------------------------------------------------------------------------- +fz_matrix JM_show_string(fz_context *ctx, fz_text *text, fz_font *user_font, fz_matrix trm, const char *s, int wmode, int bidi_level, fz_bidi_direction markup_dir, fz_text_language language, int script) +{ + fz_font *font; + int gid, ucs; + float adv; + + while (*s) { + s += fz_chartorune(&ucs, s); + gid = fz_encode_character_with_fallback(ctx, user_font, ucs, script, language, &font); + fz_show_glyph(ctx, text, font, trm, gid, ucs, wmode, bidi_level, markup_dir, language); + adv = fz_advance_glyph(ctx, font, gid, wmode); + if (wmode == 0) + trm = fz_pre_translate(trm, adv, 0); + else + trm = fz_pre_translate(trm, 0, -adv); + } + return trm; +} + + +//----------------------------------------------------------------------------- +// return a fz_font from a number of parameters +//----------------------------------------------------------------------------- +fz_font *JM_get_font(fz_context *ctx, + char *fontname, + char *fontfile, + PyObject *fontbuffer, + int script, + int lang, + int ordering, + int is_bold, + int is_italic, + int is_serif, + int embed) +{ + const unsigned char *data = NULL; + int size, index=0; + fz_buffer *res = NULL; + fz_font *font = NULL; + fz_try(ctx) { + if (fontfile) goto have_file; + if (EXISTS(fontbuffer)) goto have_buffer; + if (ordering > -1) goto have_cjk; + if (fontname) goto have_base14; + goto have_noto; + + // Base-14 or a MuPDF builtin font + have_base14:; + font = fz_new_base14_font(ctx, fontname); + if (font) { + goto fertig; + } + font = fz_new_builtin_font(ctx, fontname, is_bold, is_italic); + goto fertig; + + // CJK font + have_cjk:; + font = fz_new_cjk_font(ctx, ordering); + goto fertig; + + // fontfile + have_file:; + font = fz_new_font_from_file(ctx, NULL, fontfile, index, 0); + goto fertig; + + // fontbuffer + have_buffer:; + res = JM_BufferFromBytes(ctx, fontbuffer); + font = fz_new_font_from_buffer(ctx, NULL, res, index, 0); + goto fertig; + + // Check for NOTO font + have_noto:; + data = fz_lookup_noto_font(ctx, script, lang, &size, &index); + if (data) font = fz_new_font_from_memory(ctx, NULL, data, size, index, 0); + if (font) goto fertig; + font = fz_load_fallback_font(ctx, script, lang, is_serif, is_bold, is_italic); + goto fertig; + + fertig:; + if (!font) { + RAISEPY(ctx, MSG_FONT_FAILED, PyExc_RuntimeError); + } + #if FZ_VERSION_MAJOR == 1 && FZ_VERSION_MINOR >= 22 + // if font allows this, set embedding + if (!font->flags.never_embed) { + fz_set_font_embedding(ctx, font, embed); + } + #endif + } + fz_always(ctx) { + fz_drop_buffer(ctx, res); + } + fz_catch(ctx) { + fz_rethrow(ctx); + } + return font; +} + +%}
