Mercurial > hgrepos > Python2 > PyMuPDF
view src/extra.i @ 1:1d09e1dec1d9 upstream
ADD: PyMuPDF v1.26.4: the original sdist.
It does not yet contain MuPDF. This normally will be downloaded when
building PyMuPDF.
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Mon, 15 Sep 2025 11:37:51 +0200 |
| parents | |
| children | a6bc019ac0b2 |
line wrap: on
line source
%module fitz_extra %pythoncode %{ # pylint: disable=all %} %begin %{ #define SWIG_PYTHON_INTERPRETER_NO_DEBUG /* This seems to be necessary on some Windows machines with Py_LIMITED_API, otherwise compilation can fail because free() and malloc() are not declared. */ #include <stdlib.h> %} %init %{ /* Initialise some globals that require Python functions. [Prior to 2023-08-18 we initialised these global variables inline, but this causes a SEGV on Windows with Python-3.10 for `dictkey_c` (actually any string of length 1 failed).] */ dictkey_align = PyUnicode_InternFromString("align"); dictkey_ascender = PyUnicode_InternFromString("ascender"); dictkey_bidi = PyUnicode_InternFromString("bidi"); dictkey_bbox = PyUnicode_InternFromString("bbox"); dictkey_blocks = PyUnicode_InternFromString("blocks"); dictkey_bpc = PyUnicode_InternFromString("bpc"); dictkey_c = PyUnicode_InternFromString("c"); dictkey_chars = PyUnicode_InternFromString("chars"); dictkey_color = PyUnicode_InternFromString("color"); dictkey_colorspace = PyUnicode_InternFromString("colorspace"); dictkey_content = PyUnicode_InternFromString("content"); dictkey_creationDate = PyUnicode_InternFromString("creationDate"); dictkey_cs_name = PyUnicode_InternFromString("cs-name"); dictkey_da = PyUnicode_InternFromString("da"); dictkey_dashes = PyUnicode_InternFromString("dashes"); dictkey_desc = PyUnicode_InternFromString("descender"); dictkey_descender = PyUnicode_InternFromString("descender"); dictkey_dir = PyUnicode_InternFromString("dir"); dictkey_effect = PyUnicode_InternFromString("effect"); dictkey_ext = PyUnicode_InternFromString("ext"); dictkey_filename = PyUnicode_InternFromString("filename"); dictkey_fill = PyUnicode_InternFromString("fill"); dictkey_flags = PyUnicode_InternFromString("flags"); dictkey_char_flags = PyUnicode_InternFromString("char_flags"); /* Only used with mupdf >= 1.25.2. 
*/ dictkey_font = PyUnicode_InternFromString("font"); dictkey_glyph = PyUnicode_InternFromString("glyph"); dictkey_height = PyUnicode_InternFromString("height"); dictkey_id = PyUnicode_InternFromString("id"); dictkey_image = PyUnicode_InternFromString("image"); dictkey_items = PyUnicode_InternFromString("items"); dictkey_length = PyUnicode_InternFromString("length"); dictkey_lines = PyUnicode_InternFromString("lines"); dictkey_matrix = PyUnicode_InternFromString("transform"); dictkey_modDate = PyUnicode_InternFromString("modDate"); dictkey_name = PyUnicode_InternFromString("name"); dictkey_number = PyUnicode_InternFromString("number"); dictkey_origin = PyUnicode_InternFromString("origin"); dictkey_rect = PyUnicode_InternFromString("rect"); dictkey_size = PyUnicode_InternFromString("size"); dictkey_smask = PyUnicode_InternFromString("smask"); dictkey_spans = PyUnicode_InternFromString("spans"); dictkey_stroke = PyUnicode_InternFromString("stroke"); dictkey_style = PyUnicode_InternFromString("style"); dictkey_subject = PyUnicode_InternFromString("subject"); dictkey_text = PyUnicode_InternFromString("text"); dictkey_title = PyUnicode_InternFromString("title"); dictkey_type = PyUnicode_InternFromString("type"); dictkey_ufilename = PyUnicode_InternFromString("ufilename"); dictkey_width = PyUnicode_InternFromString("width"); dictkey_wmode = PyUnicode_InternFromString("wmode"); dictkey_xref = PyUnicode_InternFromString("xref"); dictkey_xres = PyUnicode_InternFromString("xres"); dictkey_yres = PyUnicode_InternFromString("yres"); %} %include std_string.i %include exception.i %exception { try { $action } /* this might not be ok on windows. catch (Swig::DirectorException &e) { SWIG_fail; }*/ catch(std::exception& e) { SWIG_exception(SWIG_RuntimeError, e.what()); } catch(...) 
{ SWIG_exception(SWIG_RuntimeError, "Unknown exception"); } } %{ #include "mupdf/classes2.h" #include "mupdf/exceptions.h" #include "mupdf/internal.h" #include <algorithm> #include <float.h> #define MAKE_MUPDF_VERSION_INT(major, minor, patch) ((major << 16) + (minor << 8) + (patch << 0)) #define MUPDF_VERSION_INT MAKE_MUPDF_VERSION_INT(FZ_VERSION_MAJOR, FZ_VERSION_MINOR, FZ_VERSION_PATCH) #define MUPDF_VERSION_GE(major, minor, patch) \ MUPDF_VERSION_INT >= MAKE_MUPDF_VERSION_INT(major, minor, patch) /* Define a wrapper for PDF_NAME that returns a mupdf::PdfObj instead of a pdf_obj*. This avoids implicit construction of a mupdf::PdfObj, which is deliberately prohibited (with `explicit` on constructors) by recent MuPDF. */ #define PDF_NAME2(X) mupdf::PdfObj(PDF_NAME(X)) /* Returns equivalent of `repr(x)`. */ static std::string repr(PyObject* x) { PyObject* repr = PyObject_Repr(x); PyObject* repr_str = PyUnicode_AsEncodedString(repr, "utf-8", "~E~"); #ifdef Py_LIMITED_API const char* repr_str_s = PyBytes_AsString(repr_str); #else const char* repr_str_s = PyBytes_AS_STRING(repr_str); #endif std::string ret = repr_str_s; Py_DECREF(repr_str); Py_DECREF(repr); return ret; } #ifdef Py_LIMITED_API static PyObject* PySequence_ITEM(PyObject* o, Py_ssize_t i) { return PySequence_GetItem(o, i); } static const char* PyUnicode_AsUTF8(PyObject* o) { static PyObject* string = nullptr; Py_XDECREF(string); string = PyUnicode_AsUTF8String(o); return PyBytes_AsString(string); } #endif /* These are also in pymupdf/__init__.py. 
*/
/* Message texts, one constant per error condition. */
const char MSG_BAD_ANNOT_TYPE[] = "bad annot type";
const char MSG_BAD_APN[] = "bad or missing annot AP/N";
const char MSG_BAD_ARG_INK_ANNOT[] = "arg must be seq of seq of float pairs";
const char MSG_BAD_ARG_POINTS[] = "bad seq of points";
const char MSG_BAD_BUFFER[] = "bad type: 'buffer'";
const char MSG_BAD_COLOR_SEQ[] = "bad color sequence";
const char MSG_BAD_DOCUMENT[] = "cannot open broken document";
const char MSG_BAD_FILETYPE[] = "bad filetype";
const char MSG_BAD_LOCATION[] = "bad location";
const char MSG_BAD_OC_CONFIG[] = "bad config number";
const char MSG_BAD_OC_LAYER[] = "bad layer number";
const char MSG_BAD_OC_REF[] = "bad 'oc' reference";
const char MSG_BAD_PAGEID[] = "bad page id";
const char MSG_BAD_PAGENO[] = "bad page number(s)";
const char MSG_BAD_PDFROOT[] = "PDF has no root";
const char MSG_BAD_RECT[] = "rect is infinite or empty";
const char MSG_BAD_TEXT[] = "bad type: 'text'";
const char MSG_BAD_XREF[] = "bad xref";
const char MSG_COLOR_COUNT_FAILED[] = "color count failed";
const char MSG_FILE_OR_BUFFER[] = "need font file or buffer";
const char MSG_FONT_FAILED[] = "cannot create font";
const char MSG_IS_NO_ANNOT[] = "is no annotation";
const char MSG_IS_NO_IMAGE[] = "is no image";
const char MSG_IS_NO_PDF[] = "is no PDF";
const char MSG_IS_NO_DICT[] = "object is no PDF dict";
const char MSG_PIX_NOALPHA[] = "source pixmap has no alpha";
const char MSG_PIXEL_OUTSIDE[] = "pixel(s) outside image";

/* Convert a C truth value into a new reference to Py_True / Py_False. */
#define JM_BOOL(x) PyBool_FromLong((long) (x))

/* Forward declaration; the definition is not in this part of the file. */
static PyObject *JM_UnicodeFromStr(const char *c);


#ifdef _WIN32

/* These functions are not provided on Windows.
*/ int vasprintf(char** str, const char* fmt, va_list ap) { va_list ap2; va_copy(ap2, ap); int len = vsnprintf(nullptr, 0, fmt, ap2); va_end(ap2); char* buffer = (char*) malloc(len + 1); if (!buffer) { *str = nullptr; return -1; } va_copy(ap2, ap); int len2 = vsnprintf(buffer, len + 1, fmt, ap2); va_end(ap2); assert(len2 == len); *str = buffer; return len; } int asprintf(char** str, const char* fmt, ...) { va_list ap; va_start(ap, fmt); int ret = vasprintf(str, fmt, ap); va_end(ap); return ret; } #endif static void messagev(const char* format, va_list va) { static PyObject* pymupdf_module = PyImport_ImportModule("pymupdf"); static PyObject* message_fn = PyObject_GetAttrString(pymupdf_module, "message"); char* text; vasprintf(&text, format, va); PyObject* text_py = PyString_FromString(text); PyObject* args = PyTuple_Pack(1, text_py); PyObject* ret = PyObject_CallObject(message_fn, args); Py_XDECREF(ret); Py_XDECREF(args); Py_XDECREF(text_py); free(text); } static void messagef(const char* format, ...) { va_list args; va_start(args, format); messagev(format, args); va_end(args); } PyObject* JM_EscapeStrFromStr(const char* c) { if (!c) return PyUnicode_FromString(""); PyObject* val = PyUnicode_DecodeRawUnicodeEscape(c, (Py_ssize_t) strlen(c), "replace"); if (!val) { val = PyUnicode_FromString(""); PyErr_Clear(); } return val; } PyObject* JM_EscapeStrFromBuffer(fz_buffer* buff) { if (!buff) return PyUnicode_FromString(""); unsigned char* s = nullptr; size_t len = mupdf::ll_fz_buffer_storage(buff, &s); PyObject* val = PyUnicode_DecodeRawUnicodeEscape((const char*) s, (Py_ssize_t) len, "replace"); if (!val) { val = PyUnicode_FromString(""); PyErr_Clear(); } return val; } //---------------------------------------------------------------------------- // Deep-copies a source page to the target. // Modified version of function of pdfmerge.c: we also copy annotations, but // we skip some subtypes. In addition we rotate output. 
//---------------------------------------------------------------------------- static void page_merge( mupdf::PdfDocument& doc_des, mupdf::PdfDocument& doc_src, int page_from, int page_to, int rotate, int links, int copy_annots, mupdf::PdfGraftMap& graft_map ) { // list of object types (per page) we want to copy /* Fixme: on linux these get destructed /after/ mupdf/platform/c++/implementation/internal.cpp:s_thread_state, which causes problems - s_thread_state::m_ctx will have been freed. We have a hack that sets s_thread_state::m_ctx when destructed, so it mostly works when s_thread_state.get_context() is called after destruction, but this causes memento leaks and is clearly incorrect. Perhaps we could use pdf_obj* known_page_objs[] = {...} and create PdfObj wrappers as used - this would avoid any cleanup at exit. And it's a general solution to problem of ordering of cleanup of globals. */ static pdf_obj* known_page_objs[] = { PDF_NAME(Contents), PDF_NAME(Resources), PDF_NAME(MediaBox), PDF_NAME(CropBox), PDF_NAME(BleedBox), PDF_NAME(TrimBox), PDF_NAME(ArtBox), PDF_NAME(Rotate), PDF_NAME(UserUnit) }; int known_page_objs_num = sizeof(known_page_objs) / sizeof(known_page_objs[0]); mupdf::PdfObj page_ref = mupdf::pdf_lookup_page_obj(doc_src, page_from); // make new page dict in dest doc mupdf::PdfObj page_dict = mupdf::pdf_new_dict(doc_des, 4); mupdf::pdf_dict_put(page_dict, PDF_NAME2(Type), PDF_NAME2(Page)); for (int i = 0; i < known_page_objs_num; ++i) { mupdf::PdfObj known_page_obj(known_page_objs[i]); mupdf::PdfObj obj = mupdf::pdf_dict_get_inheritable(page_ref, known_page_obj); if (obj.m_internal) { mupdf::pdf_dict_put( page_dict, known_page_obj, mupdf::pdf_graft_mapped_object(graft_map, obj) ); } } // Copy annotations, but skip Link, Popup, IRT, Widget types // If selected, remove dict keys P (parent) and Popup if (copy_annots) { mupdf::PdfObj old_annots = mupdf::pdf_dict_get(page_ref, PDF_NAME2(Annots)); int n = mupdf::pdf_array_len(old_annots); if (n > 0) { 
mupdf::PdfObj new_annots = mupdf::pdf_dict_put_array(page_dict, PDF_NAME2(Annots), n); for (int i = 0; i < n; i++) { mupdf::PdfObj o = mupdf::pdf_array_get(old_annots, i); if (!o.m_internal || !mupdf::pdf_is_dict(o)) // skip non-dict items { continue; // skip invalid/null/non-dict items } if (mupdf::pdf_dict_get(o, PDF_NAME2(IRT)).m_internal) continue; mupdf::PdfObj subtype = mupdf::pdf_dict_get(o, PDF_NAME2(Subtype)); if (mupdf::pdf_name_eq(subtype, PDF_NAME2(Link))) continue; if (mupdf::pdf_name_eq(subtype, PDF_NAME2(Popup))) continue; if (mupdf::pdf_name_eq(subtype, PDF_NAME2(Widget))) continue; mupdf::pdf_dict_del(o, PDF_NAME2(Popup)); mupdf::pdf_dict_del(o, PDF_NAME2(P)); mupdf::PdfObj copy_o = mupdf::pdf_graft_mapped_object(graft_map, o); mupdf::PdfObj annot = mupdf::pdf_new_indirect( doc_des, mupdf::pdf_to_num(copy_o), 0 ); mupdf::pdf_array_push(new_annots, annot); } } } // rotate the page if (rotate != -1) { mupdf::pdf_dict_put_int(page_dict, PDF_NAME2(Rotate), rotate); } // Now add the page dictionary to dest PDF mupdf::PdfObj ref = mupdf::pdf_add_object(doc_des, page_dict); // Insert new page at specified location mupdf::pdf_insert_page(doc_des, page_to, ref); } //----------------------------------------------------------------------------- // Copy a range of pages (spage, epage) from a source PDF to a specified // location (apage) of the target PDF. // If spage > epage, the sequence of source pages is reversed. 
//----------------------------------------------------------------------------- static void JM_merge_range( mupdf::PdfDocument& doc_des, mupdf::PdfDocument& doc_src, int spage, int epage, int apage, int rotate, int links, int annots, int show_progress, mupdf::PdfGraftMap& graft_map ) { int afterpage = apage; int counter = 0; // copied pages counter int total = mupdf::ll_fz_absi(epage - spage) + 1; // total pages to copy if (spage < epage) { for (int page = spage; page <= epage; page++, afterpage++) { page_merge(doc_des, doc_src, page, afterpage, rotate, links, annots, graft_map); counter++; if (show_progress > 0 && counter % show_progress == 0) { messagef("Inserted %i of %i pages.", counter, total); } } } else { for (int page = spage; page >= epage; page--, afterpage++) { page_merge(doc_des, doc_src, page, afterpage, rotate, links, annots, graft_map); counter++; if (show_progress > 0 && counter % show_progress == 0) { messagef("Inserted %i of %i pages.", counter, total); } } } } static bool JM_have_operation(mupdf::PdfDocument& pdf) { // Ensure valid journalling state if (pdf.m_internal->journal and !mupdf::pdf_undoredo_step(pdf, 0)) { return 0; } return 1; } static void JM_ensure_operation(mupdf::PdfDocument& pdf) { if (!JM_have_operation(pdf)) { throw std::runtime_error("No journalling operation started"); } } static void FzDocument_insert_pdf( mupdf::FzDocument& doc, mupdf::FzDocument& src, int from_page, int to_page, int start_at, int rotate, int links, int annots, int show_progress, int final, mupdf::PdfGraftMap& graft_map ) { //std::cerr << __FILE__ << ":" << __LINE__ << ":" << __FUNCTION__ << "\n"; mupdf::PdfDocument pdfout = mupdf::pdf_specifics(doc); mupdf::PdfDocument pdfsrc = mupdf::pdf_specifics(src); int outCount = mupdf::fz_count_pages(doc); int srcCount = mupdf::fz_count_pages(src); // local copies of page numbers int fp = from_page; int tp = to_page; int sa = start_at; // normalize page numbers fp = std::max(fp, 0); // -1 = first page fp = 
std::min(fp, srcCount - 1); // but do not exceed last page if (tp < 0) tp = srcCount - 1; // -1 = last page tp = std::min(tp, srcCount - 1); // but do not exceed last page if (sa < 0) sa = outCount; // -1 = behind last page sa = std::min(sa, outCount); // but that is also the limit if (!pdfout.m_internal || !pdfsrc.m_internal) { throw std::runtime_error("source or target not a PDF"); } JM_ensure_operation(pdfout); JM_merge_range(pdfout, pdfsrc, fp, tp, sa, rotate, links, annots, show_progress, graft_map); } static int page_xref(mupdf::FzDocument& this_doc, int pno) { int page_count = mupdf::fz_count_pages(this_doc); int n = pno; while (n < 0) { n += page_count; } mupdf::PdfDocument pdf = mupdf::pdf_specifics(this_doc); assert(pdf.m_internal); int xref = 0; if (n >= page_count) { throw std::runtime_error(MSG_BAD_PAGENO);//, PyExc_ValueError); } xref = mupdf::pdf_to_num(mupdf::pdf_lookup_page_obj(pdf, n)); return xref; } static void _newPage(mupdf::PdfDocument& pdf, int pno=-1, float width=595, float height=842) { if (!pdf.m_internal) { throw std::runtime_error("is no PDF"); } mupdf::FzRect mediabox(0, 0, width, height); if (pno < -1) { throw std::runtime_error("bad page number(s)"); // Should somehow be Python ValueError } JM_ensure_operation(pdf); // create /Resources and /Contents objects mupdf::PdfObj resources = mupdf::pdf_add_new_dict(pdf, 1); mupdf::FzBuffer contents; mupdf::PdfObj page_obj = mupdf::pdf_add_page(pdf, mediabox, 0, resources, contents); mupdf::pdf_insert_page(pdf, pno, page_obj); } static void _newPage(mupdf::FzDocument& self, int pno=-1, float width=595, float height=842) { mupdf::PdfDocument pdf = mupdf::pdf_specifics(self); _newPage(pdf, pno, width, height); } //------------------------------------------------------------------------ // return the annotation names (list of /NM entries) //------------------------------------------------------------------------ static std::vector< std::string> JM_get_annot_id_list(mupdf::PdfPage& page) { 
std::vector< std::string> names; mupdf::PdfObj annots = mupdf::pdf_dict_get(page.obj(), PDF_NAME2(Annots)); if (!annots.m_internal) return names; int n = mupdf::pdf_array_len(annots); for (int i = 0; i < n; i++) { mupdf::PdfObj annot_obj = mupdf::pdf_array_get(annots, i); mupdf::PdfObj name = mupdf::pdf_dict_gets(annot_obj, "NM"); if (name.m_internal) { names.push_back(mupdf::pdf_to_text_string(name)); } } return names; } //------------------------------------------------------------------------ // Add a unique /NM key to an annotation or widget. // Append a number to 'stem' such that the result is a unique name. //------------------------------------------------------------------------ static void JM_add_annot_id(mupdf::PdfAnnot& annot, const char* stem) { mupdf::PdfPage page = mupdf::pdf_annot_page(annot); mupdf::PdfObj annot_obj = mupdf::pdf_annot_obj(annot); std::vector< std::string> names = JM_get_annot_id_list(page); char* stem_id = nullptr; for (int i=0; ; ++i) { free(stem_id); asprintf(&stem_id, "fitz-%s%d", stem, i); if (std::find(names.begin(), names.end(), stem_id) == names.end()) { break; } } mupdf::PdfObj name = mupdf::pdf_new_string(stem_id, strlen(stem_id)); free(stem_id); mupdf::pdf_dict_puts(annot_obj, "NM", name); page.m_internal->doc->resynth_required = 0; } //---------------------------------------------------------------- // page add_caret_annot //---------------------------------------------------------------- static mupdf::PdfAnnot _add_caret_annot(mupdf::PdfPage& page, mupdf::FzPoint& point) { mupdf::PdfAnnot annot = mupdf::pdf_create_annot(page, ::PDF_ANNOT_CARET); mupdf::FzPoint p = point; mupdf::FzRect r = mupdf::pdf_annot_rect(annot); r = mupdf::fz_make_rect(p.x, p.y, p.x + r.x1 - r.x0, p.y + r.y1 - r.y0); mupdf::pdf_set_annot_rect(annot, r); mupdf::pdf_update_annot(annot); JM_add_annot_id(annot, "A"); return annot; } static mupdf::PdfAnnot _add_caret_annot(mupdf::FzPage& page, mupdf::FzPoint& point) { mupdf::PdfPage pdf_page = 
mupdf::pdf_page_from_fz_page(page); return _add_caret_annot(pdf_page, point); } static const char* Tools_parse_da(mupdf::PdfAnnot& this_annot) { const char* da_str = nullptr; mupdf::PdfObj this_annot_obj = mupdf::pdf_annot_obj(this_annot); mupdf::PdfDocument pdf = mupdf::pdf_get_bound_document(this_annot_obj); try { mupdf::PdfObj da = mupdf::pdf_dict_get_inheritable(this_annot_obj, PDF_NAME2(DA)); if (!da.m_internal) { mupdf::PdfObj trailer = mupdf::pdf_trailer(pdf); da = mupdf::pdf_dict_getl( &trailer, PDF_NAME(Root), PDF_NAME(AcroForm), PDF_NAME(DA), nullptr ); } da_str = mupdf::pdf_to_text_string(da); } catch (std::exception&) { return nullptr; } return da_str; } //---------------------------------------------------------------------------- // Turn fz_buffer into a Python bytes object //---------------------------------------------------------------------------- static PyObject* JM_BinFromBuffer(fz_buffer* buffer) { if (!buffer) { return PyBytes_FromStringAndSize("", 0); } unsigned char* c = nullptr; size_t len = mupdf::ll_fz_buffer_storage(buffer, &c); return PyBytes_FromStringAndSize((const char*) c, len); } static PyObject* JM_BinFromBuffer(mupdf::FzBuffer& buffer) { return JM_BinFromBuffer( buffer.m_internal); } static PyObject* Annot_getAP(mupdf::PdfAnnot& annot) { mupdf::PdfObj annot_obj = mupdf::pdf_annot_obj(annot); mupdf::PdfObj ap = mupdf::pdf_dict_getl( &annot_obj, PDF_NAME(AP), PDF_NAME(N), nullptr ); if (mupdf::pdf_is_stream(ap)) { mupdf::FzBuffer res = mupdf::pdf_load_stream(ap); return JM_BinFromBuffer(res); } return PyBytes_FromStringAndSize("", 0); } void Tools_update_da(mupdf::PdfAnnot& this_annot, const char* da_str) { mupdf::PdfObj this_annot_obj = mupdf::pdf_annot_obj(this_annot); mupdf::pdf_dict_put_text_string(this_annot_obj, PDF_NAME2(DA), da_str); mupdf::pdf_dict_del(this_annot_obj, PDF_NAME2(DS)); /* not supported */ mupdf::pdf_dict_del(this_annot_obj, PDF_NAME2(RC)); /* not supported */ } static int jm_float_item(PyObject* obj, 
Py_ssize_t idx, double* result) { PyObject* temp = PySequence_ITEM(obj, idx); if (!temp) return 1; *result = PyFloat_AsDouble(temp); Py_DECREF(temp); if (PyErr_Occurred()) { PyErr_Clear(); return 1; } return 0; } static mupdf::FzPoint JM_point_from_py(PyObject* p) { fz_point p0 = fz_make_point(FZ_MIN_INF_RECT, FZ_MIN_INF_RECT); if (!p || !PySequence_Check(p) || PySequence_Size(p) != 2) { return p0; } double x; double y; if (jm_float_item(p, 0, &x) == 1) return p0; if (jm_float_item(p, 1, &y) == 1) return p0; if (x < FZ_MIN_INF_RECT) x = FZ_MIN_INF_RECT; if (y < FZ_MIN_INF_RECT) y = FZ_MIN_INF_RECT; if (x > FZ_MAX_INF_RECT) x = FZ_MAX_INF_RECT; if (y > FZ_MAX_INF_RECT) y = FZ_MAX_INF_RECT; return fz_make_point(x, y); } static int s_list_append_drop(PyObject* list, PyObject* item) { if (!list || !PyList_Check(list) || !item) { return -2; } int rc = PyList_Append(list, item); Py_DECREF(item); return rc; } static int LIST_APPEND_DROP(PyObject *list, PyObject *item) { if (!list || !PyList_Check(list) || !item) return -2; int rc = PyList_Append(list, item); Py_DECREF(item); return rc; } static int LIST_APPEND(PyObject *list, PyObject *item) { if (!list || !PyList_Check(list) || !item) return -2; int rc = PyList_Append(list, item); return rc; } static int DICT_SETITEM_DROP(PyObject *dict, PyObject *key, PyObject *value) { if (!dict || !PyDict_Check(dict) || !key || !value) return -2; int rc = PyDict_SetItem(dict, key, value); Py_DECREF(value); return rc; } static int DICT_SETITEMSTR_DROP(PyObject *dict, const char *key, PyObject *value) { if (!dict || !PyDict_Check(dict) || !key || !value) return -2; int rc = PyDict_SetItemString(dict, key, value); Py_DECREF(value); return rc; } //----------------------------------------------------------------------------- // Functions converting between PySequences and pymupdf geometry objects //----------------------------------------------------------------------------- static int jm_init_item(PyObject* obj, Py_ssize_t idx, int* 
result) { PyObject* temp = PySequence_ITEM(obj, idx); if (!temp) { return 1; } if (PyLong_Check(temp)) { *result = (int) PyLong_AsLong(temp); Py_DECREF(temp); } else if (PyFloat_Check(temp)) { *result = (int) PyFloat_AsDouble(temp); Py_DECREF(temp); } else { Py_DECREF(temp); return 1; } if (PyErr_Occurred()) { PyErr_Clear(); return 1; } return 0; } // TODO: ------------------------------------------------------------------ // This is a temporary solution and should be replaced by a C++ extension: // There is no way in Python specify an array of fz_point - as is required // for function pdf_set_annot_callout_line(). static void JM_set_annot_callout_line(mupdf::PdfAnnot& annot, PyObject *callout, int count) { fz_point points[3]; mupdf::FzPoint p; for (int i = 0; i < count; i++) { p = JM_point_from_py(PyTuple_GetItem(callout, (Py_ssize_t) i)); points[i] = fz_make_point(p.x, p.y); } mupdf::pdf_set_annot_callout_line(annot, points, count); } //---------------------------------------------------------------------------- // Return list of outline xref numbers. Recursive function. 
// Arguments:
// 'obj' first OL item
// 'xrefs' empty Python list
//
// Appends the object number of each visited outline item to 'xrefs' and
// returns 'xrefs'. For every item the /First chain is followed (recursively)
// before moving on to /Next.
//----------------------------------------------------------------------------
static PyObject* JM_outline_xrefs(mupdf::PdfObj obj, PyObject* xrefs)
{
    if (!obj.m_internal)
    {
        return xrefs;
    }
    PyObject* newxref = nullptr;
    mupdf::PdfObj thisobj = obj;
    while (thisobj.m_internal)
    {
        int nxr = mupdf::pdf_to_num(thisobj);
        newxref = PyLong_FromLong((long) nxr);
        // Stop if this number was already collected or the object has a
        // /Type entry.
        if (PySequence_Contains(xrefs, newxref)
                or mupdf::pdf_dict_get(thisobj, PDF_NAME2(Type)).m_internal
                )
        {
            // circular ref or top of chain: terminate
            Py_DECREF(newxref);
            break;
        }
        // Hands our reference to newxref over to the helper.
        s_list_append_drop(xrefs, newxref);
        mupdf::PdfObj first = mupdf::pdf_dict_get(thisobj, PDF_NAME2(First));  // try go down
        if (mupdf::pdf_is_dict(first))
        {
            xrefs = JM_outline_xrefs(first, xrefs);
        }
        thisobj = mupdf::pdf_dict_get(thisobj, PDF_NAME2(Next));  // try go next
        // Note: /Parent is looked up on the /Next object, not on the item
        // just processed.
        mupdf::PdfObj parent = mupdf::pdf_dict_get(thisobj, PDF_NAME2(Parent));  // get parent
        if (!mupdf::pdf_is_dict(thisobj))
        {
            thisobj = parent;
        }
    }
    return xrefs;
}


// Dictionary key objects. They start out as NULL here; the %init block of
// this file assigns interned Python strings to them.
// NOTE(review): the visible %init block assigns no value to
// dictkey_char_bidi - confirm whether it is initialised elsewhere or unused.
PyObject* dictkey_align = NULL;
PyObject* dictkey_ascender = NULL;
PyObject* dictkey_bidi = NULL;
PyObject* dictkey_bbox = NULL;
PyObject* dictkey_blocks = NULL;
PyObject* dictkey_bpc = NULL;
PyObject* dictkey_c = NULL;
PyObject* dictkey_chars = NULL;
PyObject* dictkey_color = NULL;
PyObject* dictkey_colorspace = NULL;
PyObject* dictkey_content = NULL;
PyObject* dictkey_creationDate = NULL;
PyObject* dictkey_cs_name = NULL;
PyObject* dictkey_da = NULL;
PyObject* dictkey_dashes = NULL;
PyObject* dictkey_desc = NULL;
PyObject* dictkey_descender = NULL;
PyObject* dictkey_dir = NULL;
PyObject* dictkey_effect = NULL;
PyObject* dictkey_ext = NULL;
PyObject* dictkey_filename = NULL;
PyObject* dictkey_fill = NULL;
PyObject* dictkey_flags = NULL;
PyObject* dictkey_char_bidi = NULL;
PyObject* dictkey_char_flags = NULL;
PyObject* dictkey_font = NULL;
PyObject* dictkey_glyph = NULL;
PyObject* dictkey_height = NULL;
PyObject* dictkey_id = NULL;
PyObject* dictkey_image = NULL;
PyObject* dictkey_items = NULL; PyObject* dictkey_length = NULL; PyObject* dictkey_lines = NULL; PyObject* dictkey_matrix = NULL; PyObject* dictkey_modDate = NULL; PyObject* dictkey_name = NULL; PyObject* dictkey_number = NULL; PyObject* dictkey_origin = NULL; PyObject* dictkey_rect = NULL; PyObject* dictkey_size = NULL; PyObject* dictkey_smask = NULL; PyObject* dictkey_spans = NULL; PyObject* dictkey_stroke = NULL; PyObject* dictkey_style = NULL; PyObject* dictkey_subject = NULL; PyObject* dictkey_text = NULL; PyObject* dictkey_title = NULL; PyObject* dictkey_type = NULL; PyObject* dictkey_ufilename = NULL; PyObject* dictkey_width = NULL; PyObject* dictkey_wmode = NULL; PyObject* dictkey_xref = NULL; PyObject* dictkey_xres = NULL; PyObject* dictkey_yres = NULL; static int dict_setitem_drop(PyObject* dict, PyObject* key, PyObject* value) { if (!dict || !PyDict_Check(dict) || !key || !value) { return -2; } int rc = PyDict_SetItem(dict, key, value); Py_DECREF(value); return rc; } static int dict_setitemstr_drop(PyObject* dict, const char* key, PyObject* value) { if (!dict || !PyDict_Check(dict) || !key || !value) { return -2; } int rc = PyDict_SetItemString(dict, key, value); Py_DECREF(value); return rc; } static void Document_extend_toc_items(mupdf::PdfDocument& pdf, PyObject* items) { PyObject* item=nullptr; PyObject* itemdict=nullptr; PyObject* xrefs=nullptr; PyObject* bold = PyUnicode_FromString("bold"); PyObject* italic = PyUnicode_FromString("italic"); PyObject* collapse = PyUnicode_FromString("collapse"); PyObject* zoom = PyUnicode_FromString("zoom"); try { /* Need to define these things early because later code uses `goto`; otherwise we get compiler warnings 'jump bypasses variable initialization' */ int xref = 0; mupdf::PdfObj root; mupdf::PdfObj olroot; mupdf::PdfObj first; Py_ssize_t n; Py_ssize_t m; root = mupdf::pdf_dict_get(mupdf::pdf_trailer(pdf), PDF_NAME2(Root)); if (!root.m_internal) goto end; olroot = mupdf::pdf_dict_get(root, PDF_NAME2(Outlines)); 
if (!olroot.m_internal) goto end; first = mupdf::pdf_dict_get(olroot, PDF_NAME2(First)); if (!first.m_internal) goto end; xrefs = PyList_New(0); // pre-allocate an empty list xrefs = JM_outline_xrefs(first, xrefs); n = PySequence_Size(xrefs); m = PySequence_Size(items); if (!n) goto end; if (n != m) { throw std::runtime_error("internal error finding outline xrefs"); } // update all TOC item dictionaries for (int i = 0; i < n; i++) { jm_init_item(xrefs, i, &xref); item = PySequence_ITEM(items, i); itemdict = PySequence_ITEM(item, 3); if (!itemdict || !PyDict_Check(itemdict)) { throw std::runtime_error("need non-simple TOC format"); } PyDict_SetItem(itemdict, dictkey_xref, PySequence_ITEM(xrefs, i)); mupdf::PdfObj bm = mupdf::pdf_load_object(pdf, xref); int flags = mupdf::pdf_to_int(mupdf::pdf_dict_get(bm, PDF_NAME2(F))); if (flags == 1) { PyDict_SetItem(itemdict, italic, Py_True); } else if (flags == 2) { PyDict_SetItem(itemdict, bold, Py_True); } else if (flags == 3) { PyDict_SetItem(itemdict, italic, Py_True); PyDict_SetItem(itemdict, bold, Py_True); } int count = mupdf::pdf_to_int(mupdf::pdf_dict_get(bm, PDF_NAME2(Count))); if (count < 0) { PyDict_SetItem(itemdict, collapse, Py_True); } else if (count > 0) { PyDict_SetItem(itemdict, collapse, Py_False); } mupdf::PdfObj col = mupdf::pdf_dict_get(bm, PDF_NAME2(C)); if (mupdf::pdf_is_array(col) && mupdf::pdf_array_len(col) == 3) { PyObject* color = PyTuple_New(3); PyTuple_SET_ITEM(color, 0, Py_BuildValue("f", mupdf::pdf_to_real(mupdf::pdf_array_get(col, 0)))); PyTuple_SET_ITEM(color, 1, Py_BuildValue("f", mupdf::pdf_to_real(mupdf::pdf_array_get(col, 1)))); PyTuple_SET_ITEM(color, 2, Py_BuildValue("f", mupdf::pdf_to_real(mupdf::pdf_array_get(col, 2)))); dict_setitem_drop(itemdict, dictkey_color, color); } float z=0; mupdf::PdfObj obj = mupdf::pdf_dict_get(bm, PDF_NAME2(Dest)); if (!obj.m_internal || !mupdf::pdf_is_array(obj)) { obj = mupdf::pdf_dict_getl(&bm, PDF_NAME(A), PDF_NAME(D), nullptr); } if 
(mupdf::pdf_is_array(obj) && mupdf::pdf_array_len(obj) == 5) { z = mupdf::pdf_to_real(mupdf::pdf_array_get(obj, 4)); } dict_setitem_drop(itemdict, zoom, Py_BuildValue("f", z)); PyList_SetItem(item, 3, itemdict); PyList_SetItem(items, i, item); } end:; } catch (std::exception&) { } Py_CLEAR(xrefs); Py_CLEAR(bold); Py_CLEAR(italic); Py_CLEAR(collapse); Py_CLEAR(zoom); } static void Document_extend_toc_items(mupdf::FzDocument& document, PyObject* items) { mupdf::PdfDocument pdf = mupdf::pdf_document_from_fz_document(document); return Document_extend_toc_items(pdf, items); } //----------------------------------------------------------------------------- // PySequence from fz_rect //----------------------------------------------------------------------------- static PyObject* JM_py_from_rect(fz_rect r) { return Py_BuildValue("ffff", r.x0, r.y0, r.x1, r.y1); } static PyObject* JM_py_from_rect(mupdf::FzRect r) { return JM_py_from_rect(*r.internal()); } //----------------------------------------------------------------------------- // PySequence from fz_point //----------------------------------------------------------------------------- static PyObject* JM_py_from_point(fz_point p) { return Py_BuildValue("ff", p.x, p.y); } //----------------------------------------------------------------------------- // PySequence from fz_quad. 
//----------------------------------------------------------------------------- static PyObject * JM_py_from_quad(fz_quad q) { return Py_BuildValue("((f,f),(f,f),(f,f),(f,f))", q.ul.x, q.ul.y, q.ur.x, q.ur.y, q.ll.x, q.ll.y, q.lr.x, q.lr.y); } //---------------------------------------------------------------- // annotation rectangle //---------------------------------------------------------------- static mupdf::FzRect Annot_rect(mupdf::PdfAnnot& annot) { mupdf::FzRect rect = mupdf::pdf_bound_annot(annot); return rect; } static PyObject* Annot_rect3(mupdf::PdfAnnot& annot) { fz_rect rect = mupdf::ll_pdf_bound_annot(annot.m_internal); return JM_py_from_rect(rect); } //----------------------------------------------------------------------------- // PySequence to fz_rect. Default: infinite rect //----------------------------------------------------------------------------- static fz_rect JM_rect_from_py(PyObject* r) { if (!r || !PySequence_Check(r) || PySequence_Size(r) != 4) { return *mupdf::FzRect(mupdf::FzRect::Fixed_INFINITE).internal();// fz_infinite_rect; } double f[4]; for (int i = 0; i < 4; i++) { if (jm_float_item(r, i, &f[i]) == 1) { return *mupdf::FzRect(mupdf::FzRect::Fixed_INFINITE).internal(); } if (f[i] < FZ_MIN_INF_RECT) f[i] = FZ_MIN_INF_RECT; if (f[i] > FZ_MAX_INF_RECT) f[i] = FZ_MAX_INF_RECT; } return mupdf::ll_fz_make_rect( (float) f[0], (float) f[1], (float) f[2], (float) f[3] ); } //----------------------------------------------------------------------------- // PySequence to fz_matrix. 
Default: fz_identity //----------------------------------------------------------------------------- static fz_matrix JM_matrix_from_py(PyObject* m) { double a[6]; if (!m || !PySequence_Check(m) || PySequence_Size(m) != 6) { return fz_identity; } for (int i = 0; i < 6; i++) { if (jm_float_item(m, i, &a[i]) == 1) { return *mupdf::FzMatrix().internal(); } } return mupdf::ll_fz_make_matrix( (float) a[0], (float) a[1], (float) a[2], (float) a[3], (float) a[4], (float) a[5] ); } PyObject* util_transform_rect(PyObject* rect, PyObject* matrix) { return JM_py_from_rect( mupdf::ll_fz_transform_rect( JM_rect_from_py(rect), JM_matrix_from_py(matrix) ) ); } //---------------------------------------------------------------------------- // return normalized /Rotate value:one of 0, 90, 180, 270 //---------------------------------------------------------------------------- static int JM_norm_rotation(int rotate) { while (rotate < 0) rotate += 360; while (rotate >= 360) rotate -= 360; if (rotate % 90 != 0) return 0; return rotate; } //---------------------------------------------------------------------------- // return a PDF page's /Rotate value: one of (0, 90, 180, 270) //---------------------------------------------------------------------------- static int JM_page_rotation(mupdf::PdfPage& page) { int rotate = 0; rotate = mupdf::pdf_to_int( mupdf::pdf_dict_get_inheritable(page.obj(), PDF_NAME2(Rotate)) ); rotate = JM_norm_rotation(rotate); return rotate; } //---------------------------------------------------------------------------- // return a PDF page's MediaBox //---------------------------------------------------------------------------- static mupdf::FzRect JM_mediabox(mupdf::PdfObj& page_obj) { mupdf::FzRect mediabox = mupdf::pdf_to_rect( mupdf::pdf_dict_get_inheritable(page_obj, PDF_NAME2(MediaBox)) ); if (mupdf::fz_is_empty_rect(mediabox) || mupdf::fz_is_infinite_rect(mediabox)) { mediabox.x0 = 0; mediabox.y0 = 0; mediabox.x1 = 612; mediabox.y1 = 792; } mupdf::FzRect 
page_mediabox; page_mediabox.x0 = mupdf::fz_min(mediabox.x0, mediabox.x1); page_mediabox.y0 = mupdf::fz_min(mediabox.y0, mediabox.y1); page_mediabox.x1 = mupdf::fz_max(mediabox.x0, mediabox.x1); page_mediabox.y1 = mupdf::fz_max(mediabox.y0, mediabox.y1); if (0 || page_mediabox.x1 - page_mediabox.x0 < 1 || page_mediabox.y1 - page_mediabox.y0 < 1 ) { page_mediabox = *mupdf::FzRect(mupdf::FzRect::Fixed_UNIT).internal(); //fz_unit_rect; } return page_mediabox; } //---------------------------------------------------------------------------- // return a PDF page's CropBox //---------------------------------------------------------------------------- mupdf::FzRect JM_cropbox(mupdf::PdfObj& page_obj) { mupdf::FzRect mediabox = JM_mediabox(page_obj); mupdf::FzRect cropbox = mupdf::pdf_to_rect( mupdf::pdf_dict_get_inheritable(page_obj, PDF_NAME2(CropBox)) ); if (mupdf::fz_is_infinite_rect(cropbox) || mupdf::fz_is_empty_rect(cropbox)) { cropbox = mediabox; } float y0 = mediabox.y1 - cropbox.y1; float y1 = mediabox.y1 - cropbox.y0; cropbox.y0 = y0; cropbox.y1 = y1; return cropbox; } //---------------------------------------------------------------------------- // calculate width and height of the UNROTATED page //---------------------------------------------------------------------------- static mupdf::FzPoint JM_cropbox_size(mupdf::PdfObj& page_obj) { mupdf::FzPoint size; mupdf::FzRect rect = JM_cropbox(page_obj); float w = (rect.x0 < rect.x1) ? rect.x1 - rect.x0 : rect.x0 - rect.x1; float h = (rect.y0 < rect.y1) ? 
rect.y1 - rect.y0 : rect.y0 - rect.y1; size = fz_make_point(w, h); return size; } //---------------------------------------------------------------------------- // calculate page rotation matrices //---------------------------------------------------------------------------- static mupdf::FzMatrix JM_rotate_page_matrix(mupdf::PdfPage& page) { if (!page.m_internal) { return *mupdf::FzMatrix().internal(); // no valid pdf page given } int rotation = JM_page_rotation(page); if (rotation == 0) { return *mupdf::FzMatrix().internal(); // no rotation } auto po = page.obj(); mupdf::FzPoint cb_size = JM_cropbox_size(po); float w = cb_size.x; float h = cb_size.y; mupdf::FzMatrix m; if (rotation == 90) { m = mupdf::fz_make_matrix(0, 1, -1, 0, h, 0); } else if (rotation == 180) { m = mupdf::fz_make_matrix(-1, 0, 0, -1, w, h); } else { m = mupdf::fz_make_matrix(0, -1, 1, 0, 0, w); } return m; } static mupdf::FzMatrix JM_derotate_page_matrix(mupdf::PdfPage& page) { // just the inverse of rotation return mupdf::fz_invert_matrix(JM_rotate_page_matrix(page)); } //----------------------------------------------------------------------------- // PySequence from fz_matrix //----------------------------------------------------------------------------- static PyObject* JM_py_from_matrix(mupdf::FzMatrix m) { return Py_BuildValue("ffffff", m.a, m.b, m.c, m.d, m.e, m.f); } static mupdf::FzMatrix Page_derotate_matrix(mupdf::PdfPage& pdfpage) { if (!pdfpage.m_internal) { return mupdf::FzMatrix(); } return JM_derotate_page_matrix(pdfpage); } static mupdf::FzMatrix Page_derotate_matrix(mupdf::FzPage& page) { mupdf::PdfPage pdf_page = mupdf::pdf_page_from_fz_page(page); return Page_derotate_matrix(pdf_page); } static PyObject *lll_JM_get_annot_xref_list(pdf_obj *page_obj) { fz_context* ctx = mupdf::internal_context_get(); PyObject *names = PyList_New(0); pdf_obj *id, *subtype, *annots, *annot_obj; int xref, type, i, n; fz_try(ctx) { annots = pdf_dict_get(ctx, page_obj, PDF_NAME(Annots)); n = 
pdf_array_len(ctx, annots); for (i = 0; i < n; i++) { annot_obj = pdf_array_get(ctx, annots, i); xref = pdf_to_num(ctx, annot_obj); subtype = pdf_dict_get(ctx, annot_obj, PDF_NAME(Subtype)); if (!subtype) { continue; // subtype is required } type = pdf_annot_type_from_string(ctx, pdf_to_name(ctx, subtype)); if (type == PDF_ANNOT_UNKNOWN) { continue; // only accept valid annot types } id = pdf_dict_gets(ctx, annot_obj, "NM"); LIST_APPEND_DROP(names, Py_BuildValue("iis", xref, type, pdf_to_text_string(ctx, id))); } } fz_catch(ctx) { return names; } return names; } //------------------------------------------------------------------------ // return the xrefs and /NM ids of a page's annots, links and fields //------------------------------------------------------------------------ static PyObject* JM_get_annot_xref_list(const mupdf::PdfObj& page_obj) { PyObject* names = PyList_New(0); if (!page_obj.m_internal) { return names; } return lll_JM_get_annot_xref_list( page_obj.m_internal); } static mupdf::FzBuffer JM_object_to_buffer(const mupdf::PdfObj& what, int compress, int ascii) { mupdf::FzBuffer res = mupdf::fz_new_buffer(512); mupdf::FzOutput out(res); mupdf::pdf_print_obj(out, what, compress, ascii); out.fz_close_output(); mupdf::fz_terminate_buffer(res); return res; } static PyObject* JM_EscapeStrFromBuffer(mupdf::FzBuffer& buff) { if (!buff.m_internal) { return PyUnicode_FromString(""); } unsigned char* s = nullptr; size_t len = mupdf::fz_buffer_storage(buff, &s); PyObject* val = PyUnicode_DecodeRawUnicodeEscape((const char*) s, (Py_ssize_t) len, "replace"); if (!val) { val = PyUnicode_FromString(""); PyErr_Clear(); } return val; } static PyObject* xref_object(mupdf::PdfDocument& pdf, int xref, int compressed=0, int ascii=0) { if (!pdf.m_internal) { throw std::runtime_error(MSG_IS_NO_PDF); } int xreflen = mupdf::pdf_xref_len(pdf); if ((xref < 1 || xref >= xreflen) and xref != -1) { throw std::runtime_error(MSG_BAD_XREF); } mupdf::PdfObj obj = (xref > 0) ? 
mupdf::pdf_load_object(pdf, xref) : mupdf::pdf_trailer(pdf); mupdf::FzBuffer res = JM_object_to_buffer(mupdf::pdf_resolve_indirect(obj), compressed, ascii); PyObject* text = JM_EscapeStrFromBuffer(res); return text; } static PyObject* xref_object(mupdf::FzDocument& document, int xref, int compressed=0, int ascii=0) { mupdf::PdfDocument pdf = mupdf::pdf_document_from_fz_document(document); return xref_object(pdf, xref, compressed, ascii); } //------------------------------------- // fz_output for Python file objects //------------------------------------- static PyObject* Link_is_external(mupdf::FzLink& this_link) { const char* uri = this_link.m_internal->uri; if (!uri) { return PyBool_FromLong(0); } bool ret = mupdf::fz_is_external_link(uri); return PyBool_FromLong((long) ret); } static mupdf::FzLink Link_next(mupdf::FzLink& this_link) { return this_link.next(); } //----------------------------------------------------------------------------- // create PDF object from given string //----------------------------------------------------------------------------- static pdf_obj *lll_JM_pdf_obj_from_str(fz_context *ctx, pdf_document *doc, const char *src) { pdf_obj *result = NULL; pdf_lexbuf lexbuf; fz_stream *stream = fz_open_memory(ctx, (unsigned char *)src, strlen(src)); pdf_lexbuf_init(ctx, &lexbuf, PDF_LEXBUF_SMALL); fz_try(ctx) { result = pdf_parse_stm_obj(ctx, doc, stream, &lexbuf); } fz_always(ctx) { pdf_lexbuf_fin(ctx, &lexbuf); fz_drop_stream(ctx, stream); } fz_catch(ctx) { mupdf::internal_throw_exception(ctx); } return result; } /*********************************************************************/ // Page._addAnnot_FromString // Add new links provided as an array of string object definitions. 
/*********************************************************************/ PyObject* Page_addAnnot_FromString(mupdf::PdfPage& page, PyObject* linklist) { PyObject* txtpy = nullptr; int lcount = (int) PySequence_Size(linklist); // link count //printf("Page_addAnnot_FromString(): lcount=%i\n", lcount); if (lcount < 1) { Py_RETURN_NONE; } try { // insert links from the provided sources if (!page.m_internal) { throw std::runtime_error(MSG_IS_NO_PDF); } if (!mupdf::pdf_dict_get(page.obj(), PDF_NAME2(Annots)).m_internal) { mupdf::pdf_dict_put_array(page.obj(), PDF_NAME2(Annots), lcount); } mupdf::PdfObj annots = mupdf::pdf_dict_get(page.obj(), PDF_NAME2(Annots)); mupdf::PdfDocument doc = page.doc(); //printf("lcount=%i\n", lcount); fz_context* ctx = mupdf::internal_context_get(); for (int i = 0; i < lcount; i++) { const char* text = nullptr; txtpy = PySequence_ITEM(linklist, (Py_ssize_t) i); text = PyUnicode_AsUTF8(txtpy); Py_CLEAR(txtpy); if (!text) { messagef("skipping bad link / annot item %i.", i); continue; } try { pdf_obj* obj = lll_JM_pdf_obj_from_str(ctx, doc.m_internal, text); pdf_obj* annot = pdf_add_object_drop( ctx, doc.m_internal, obj ); pdf_obj* ind_obj = pdf_new_indirect(ctx, doc.m_internal, pdf_to_num(ctx, annot), 0); pdf_array_push_drop(ctx, annots.m_internal, ind_obj); pdf_drop_obj(ctx, annot); } catch (std::exception&) { messagef("skipping bad link / annot item %i.", i); } } } catch (std::exception&) { PyErr_Clear(); return nullptr; } Py_RETURN_NONE; } PyObject* Page_addAnnot_FromString(mupdf::FzPage& page, PyObject* linklist) { mupdf::PdfPage pdf_page = mupdf::pdf_page_from_fz_page(page); return Page_addAnnot_FromString(pdf_page, linklist); } static int page_count_fz2(void* document) { mupdf::FzDocument* document2 = (mupdf::FzDocument*) document; return mupdf::fz_count_pages(*document2); } static int page_count_fz(mupdf::FzDocument& document) { return mupdf::fz_count_pages(document); } static int page_count_pdf(mupdf::PdfDocument& pdf) { 
mupdf::FzDocument document = pdf.super(); return page_count_fz(document); } static int page_count(mupdf::FzDocument& document) { return mupdf::fz_count_pages(document); } static int page_count(mupdf::PdfDocument& pdf) { mupdf::FzDocument document = pdf.super(); return page_count(document); } static PyObject* page_annot_xrefs(mupdf::FzDocument& document, mupdf::PdfDocument& pdf, int pno) { int page_count = mupdf::fz_count_pages(document); int n = pno; while (n < 0) { n += page_count; } PyObject* annots = nullptr; if (n >= page_count) { throw std::runtime_error(MSG_BAD_PAGENO); } if (!pdf.m_internal) { throw std::runtime_error(MSG_IS_NO_PDF); } annots = JM_get_annot_xref_list(mupdf::pdf_lookup_page_obj(pdf, n)); return annots; } static PyObject* page_annot_xrefs(mupdf::FzDocument& document, int pno) { mupdf::PdfDocument pdf = mupdf::pdf_specifics(document); return page_annot_xrefs(document, pdf, pno); } static PyObject* page_annot_xrefs(mupdf::PdfDocument& pdf, int pno) { mupdf::FzDocument document = pdf.super(); return page_annot_xrefs(document, pdf, pno); } static bool Outline_is_external(mupdf::FzOutline* outline) { if (!outline->m_internal->uri) { return false; } return mupdf::ll_fz_is_external_link(outline->m_internal->uri); } int ll_fz_absi(int i) { return mupdf::ll_fz_absi(i); } enum { TEXT_FONT_SUPERSCRIPT = 1, TEXT_FONT_ITALIC = 2, TEXT_FONT_SERIFED = 4, TEXT_FONT_MONOSPACED = 8, TEXT_FONT_BOLD = 16, }; int g_skip_quad_corrections = 0; int g_subset_fontnames = 0; int g_small_glyph_heights = 0; void set_skip_quad_corrections(int on) { g_skip_quad_corrections = on; } void set_subset_fontnames(int on) { g_subset_fontnames = on; } void set_small_glyph_heights(int on) { g_small_glyph_heights = on; } struct jm_lineart_device { fz_device super; PyObject* out = {}; PyObject* method = {}; PyObject* pathdict = {}; PyObject* scissors = {}; float pathfactor = {}; fz_matrix ctm = {}; fz_matrix ptm = {}; fz_matrix rot = {}; fz_point lastpoint = {}; fz_point firstpoint = 
{}; int havemove = 0; fz_rect pathrect = {}; int clips = {}; int linecount = {}; float linewidth = {}; int path_type = {}; long depth = {}; size_t seqno = {}; char* layer_name; }; static void jm_lineart_drop_device(fz_context *ctx, fz_device *dev_) { jm_lineart_device *dev = (jm_lineart_device *)dev_; if (PyList_Check(dev->out)) { Py_CLEAR(dev->out); } Py_CLEAR(dev->method); Py_CLEAR(dev->scissors); mupdf::ll_fz_free(dev->layer_name); dev->layer_name = nullptr; } typedef jm_lineart_device jm_tracedraw_device; // need own versions of ascender / descender static float JM_font_ascender(fz_font* font) { if (g_skip_quad_corrections) { return 0.8f; } return mupdf::ll_fz_font_ascender(font); } static float JM_font_descender(fz_font* font) { if (g_skip_quad_corrections) { return -0.2f; } return mupdf::ll_fz_font_descender(font); } //---------------------------------------------------------------- // Return true if character is considered to be a word delimiter //---------------------------------------------------------------- static int JM_is_word_delimiter(int c, PyObject *delimiters) { if (c <= 32 || c == 160) return 1; // a standard delimiter if (0x202a <= c && c <= 0x202e) { return 1; // change between writing directions } // extra delimiters must be a non-empty sequence if (!delimiters || PyObject_Not(delimiters) || !PySequence_Check(delimiters)) { return 0; } // convert to tuple for easier looping PyObject *delims = PySequence_Tuple(delimiters); if (!delims) { PyErr_Clear(); return 0; } // Make 1-char PyObject from character given as integer PyObject *cchar = Py_BuildValue("C", c); // single character PyObject Py_ssize_t i, len = PyTuple_Size(delims); for (i = 0; i < len; i++) { int rc = PyUnicode_Compare(cchar, PyTuple_GET_ITEM(delims, i)); if (rc == 0) { // equal to a delimiter character Py_DECREF(cchar); Py_DECREF(delims); PyErr_Clear(); return 1; } } Py_DECREF(delims); PyErr_Clear(); return 0; } static int JM_is_rtl_char(int c) { if (c < 0x590 || c > 0x900) 
return 0; return 1; } static const char* JM_font_name(fz_font* font) { const char* name = mupdf::ll_fz_font_name(font); const char* s = strchr(name, '+'); if (g_subset_fontnames || !s || s-name != 6) { return name; } return s + 1; } static int detect_super_script(fz_stext_line *line, fz_stext_char *ch) { if (line->wmode == 0 && line->dir.x == 1 && line->dir.y == 0) { return ch->origin.y < line->first_char->origin.y - ch->size * 0.1f; } return 0; } static int JM_char_font_flags(fz_font *font, fz_stext_line *line, fz_stext_char *ch) { int flags = 0; if (line && ch) { flags += detect_super_script(line, ch) * TEXT_FONT_SUPERSCRIPT; } flags += mupdf::ll_fz_font_is_italic(font) * TEXT_FONT_ITALIC; flags += mupdf::ll_fz_font_is_serif(font) * TEXT_FONT_SERIFED; flags += mupdf::ll_fz_font_is_monospaced(font) * TEXT_FONT_MONOSPACED; flags += mupdf::ll_fz_font_is_bold(font) * TEXT_FONT_BOLD; return flags; } static void jm_trace_text_span( jm_tracedraw_device* dev, fz_text_span* span, int type, fz_matrix ctm, fz_colorspace* colorspace, const float* color, float alpha, size_t seqno ) { //printf("extra.jm_trace_text_span(): seqno=%zi\n", seqno); //fz_matrix join = mupdf::ll_fz_concat(span->trm, ctm); //double fsize = sqrt(fabs((double) span->trm.a * (double) span->trm.d)); fz_matrix mat = mupdf::ll_fz_concat(span->trm, ctm); // text transformation matrix fz_point dir = mupdf::ll_fz_transform_vector(mupdf::ll_fz_make_point(1, 0), mat); // writing direction double fsize = sqrt(dir.x * dir.x + dir.y * dir.y); // font size dir = mupdf::ll_fz_normalize_vector(dir); // compute effective ascender / descender double asc = (double) JM_font_ascender(span->font); double dsc = (double) JM_font_descender(span->font); if (asc < 1e-3) { // probably Tesseract font dsc = -0.1; asc = 0.9; } double ascsize = asc * fsize / (asc - dsc); double dscsize = dsc * fsize / (asc - dsc); int fflags = 0; // font flags int mono = mupdf::ll_fz_font_is_monospaced(span->font); fflags += mono * 
TEXT_FONT_MONOSPACED; fflags += mupdf::ll_fz_font_is_italic(span->font) * TEXT_FONT_ITALIC; fflags += mupdf::ll_fz_font_is_serif(span->font) * TEXT_FONT_SERIFED; fflags += mupdf::ll_fz_font_is_bold(span->font) * TEXT_FONT_BOLD; // walk through characters of span fz_matrix rot = mupdf::ll_fz_make_matrix(dir.x, dir.y, -dir.y, dir.x, 0, 0); if (dir.x == -1) { // left-right flip rot.d = 1; } PyObject* chars = PyTuple_New(span->len); double space_adv = 0; double last_adv = 0; fz_rect span_bbox; for (int i = 0; i < span->len; i++) { double adv = 0; if (span->items[i].gid >= 0) { adv = (double) mupdf::ll_fz_advance_glyph(span->font, span->items[i].gid, span->wmode); } adv *= fsize; last_adv = adv; if (span->items[i].ucs == 32) { space_adv = adv; } fz_point char_orig; char_orig = fz_make_point(span->items[i].x, span->items[i].y); char_orig = fz_transform_point(char_orig, ctm); fz_matrix m1 = mupdf::ll_fz_make_matrix(1, 0, 0, 1, -char_orig.x, -char_orig.y); m1 = mupdf::ll_fz_concat(m1, rot); m1 = mupdf::ll_fz_concat(m1, mupdf::ll_fz_make_matrix(1, 0, 0, 1, char_orig.x, char_orig.y)); float x0 = char_orig.x; float x1 = x0 + adv; float y0; float y1; if ( (mat.d > 0 && (dir.x == 1 || dir.x == -1)) || (mat.b !=0 && mat.b == -mat.c) ) // up-down flip { // up-down flip y0 = char_orig.y + dscsize; y1 = char_orig.y + ascsize; } else { y0 = char_orig.y - ascsize; y1 = char_orig.y - dscsize; } fz_rect char_bbox = mupdf::ll_fz_make_rect(x0, y0, x1, y1); char_bbox = mupdf::ll_fz_transform_rect(char_bbox, m1); PyTuple_SET_ITEM( chars, (Py_ssize_t) i, Py_BuildValue( "ii(ff)(ffff)", span->items[i].ucs, span->items[i].gid, char_orig.x, char_orig.y, char_bbox.x0, char_bbox.y0, char_bbox.x1, char_bbox.y1 ) ); if (i > 0) { span_bbox = mupdf::ll_fz_union_rect(span_bbox, char_bbox); } else { span_bbox = char_bbox; } } if (!space_adv) { if (!(fflags & TEXT_FONT_MONOSPACED)) { fz_font* out_font = nullptr; space_adv = mupdf::ll_fz_advance_glyph( span->font, 
mupdf::ll_fz_encode_character_with_fallback(span->font, 32, 0, 0, &out_font), span->wmode ); space_adv *= fsize; if (!space_adv) { space_adv = last_adv; } } else { space_adv = last_adv; // for mono any char width suffices } } // make the span dictionary PyObject* span_dict = PyDict_New(); dict_setitemstr_drop(span_dict, "dir", JM_py_from_point(dir)); dict_setitem_drop(span_dict, dictkey_font, JM_EscapeStrFromStr(JM_font_name(span->font))); dict_setitem_drop(span_dict, dictkey_wmode, PyLong_FromLong((long) span->wmode)); dict_setitem_drop(span_dict, dictkey_flags, PyLong_FromLong((long) fflags)); dict_setitemstr_drop(span_dict, "bidi_lvl", PyLong_FromLong((long) span->bidi_level)); dict_setitemstr_drop(span_dict, "bidi_dir", PyLong_FromLong((long) span->markup_dir)); dict_setitem_drop(span_dict, dictkey_ascender, PyFloat_FromDouble(asc)); dict_setitem_drop(span_dict, dictkey_descender, PyFloat_FromDouble(dsc)); dict_setitem_drop(span_dict, dictkey_colorspace, PyLong_FromLong(3)); float rgb[3]; if (colorspace) { mupdf::ll_fz_convert_color( colorspace, color, mupdf::ll_fz_device_rgb(), rgb, nullptr, fz_default_color_params ); } else { rgb[0] = rgb[1] = rgb[2] = 0; } double linewidth; if (dev->linewidth > 0) // width of character border { linewidth = (double) dev->linewidth; } else { linewidth = fsize * 0.05; // default: 5% of font size } if (0) std::cout << " dev->linewidth=" << dev->linewidth << " fsize=" << fsize << " linewidth=" << linewidth << "\n"; dict_setitem_drop(span_dict, dictkey_color, Py_BuildValue("fff", rgb[0], rgb[1], rgb[2])); dict_setitem_drop(span_dict, dictkey_size, PyFloat_FromDouble(fsize)); dict_setitemstr_drop(span_dict, "opacity", PyFloat_FromDouble((double) alpha)); dict_setitemstr_drop(span_dict, "linewidth", PyFloat_FromDouble((double) linewidth)); dict_setitemstr_drop(span_dict, "spacewidth", PyFloat_FromDouble(space_adv)); dict_setitem_drop(span_dict, dictkey_type, PyLong_FromLong((long) type)); dict_setitem_drop(span_dict, dictkey_bbox, 
JM_py_from_rect(span_bbox)); dict_setitemstr_drop(span_dict, "layer", JM_UnicodeFromStr(dev->layer_name)); dict_setitemstr_drop(span_dict, "seqno", PyLong_FromSize_t(seqno)); dict_setitem_drop(span_dict, dictkey_chars, chars); //std::cout << "span_dict=" << repr(span_dict) << "\n"; s_list_append_drop(dev->out, span_dict); } static inline void jm_increase_seqno(fz_context* ctx, fz_device* dev_) { jm_tracedraw_device* dev = (jm_tracedraw_device*) dev_; dev->seqno += 1; } static void jm_fill_path( fz_context* ctx, fz_device* dev, const fz_path*, int even_odd, fz_matrix, fz_colorspace*, const float* color, float alpha, fz_color_params ) { jm_increase_seqno(ctx, dev); } static void jm_fill_shade( fz_context* ctx, fz_device* dev, fz_shade* shd, fz_matrix ctm, float alpha, fz_color_params color_params ) { jm_increase_seqno(ctx, dev); } static void jm_fill_image( fz_context* ctx, fz_device* dev, fz_image* img, fz_matrix ctm, float alpha, fz_color_params color_params ) { jm_increase_seqno(ctx, dev); } static void jm_fill_image_mask( fz_context* ctx, fz_device* dev, fz_image* img, fz_matrix ctm, fz_colorspace* cs, const float* color, float alpha, fz_color_params color_params ) { jm_increase_seqno(ctx, dev); } static void jm_dev_linewidth( fz_context* ctx, fz_device* dev_, const fz_path* path, const fz_stroke_state* stroke, fz_matrix ctm, fz_colorspace* colorspace, const float* color, float alpha, fz_color_params color_params ) { jm_tracedraw_device* dev = (jm_tracedraw_device*) dev_; if (0) std::cout << "jm_dev_linewidth(): changing dev->linewidth from " << dev->linewidth << " to stroke->linewidth=" << stroke->linewidth << "\n"; dev->linewidth = stroke->linewidth; jm_increase_seqno(ctx, dev_); } static void jm_trace_text( jm_tracedraw_device* dev, const fz_text* text, int type, fz_matrix ctm, fz_colorspace* colorspace, const float* color, float alpha, size_t seqno ) { fz_text_span* span; for (span = text->head; span; span = span->next) { jm_trace_text_span(dev, span, type, 
ctm, colorspace, color, alpha, seqno); } } /*--------------------------------------------------------- There are 3 text trace types: 0 - fill text (PDF Tr 0) 1 - stroke text (PDF Tr 1) 3 - ignore text (PDF Tr 3) ---------------------------------------------------------*/ static void jm_tracedraw_fill_text( fz_context* ctx, fz_device* dev_, const fz_text* text, fz_matrix ctm, fz_colorspace* colorspace, const float* color, float alpha, fz_color_params color_params ) { jm_tracedraw_device* dev = (jm_tracedraw_device*) dev_; jm_trace_text(dev, text, 0, ctm, colorspace, color, alpha, dev->seqno); dev->seqno += 1; } static void jm_tracedraw_stroke_text( fz_context* ctx, fz_device* dev_, const fz_text* text, const fz_stroke_state* stroke, fz_matrix ctm, fz_colorspace* colorspace, const float* color, float alpha, fz_color_params color_params ) { jm_tracedraw_device* dev = (jm_tracedraw_device*) dev_; jm_trace_text(dev, text, 1, ctm, colorspace, color, alpha, dev->seqno); dev->seqno += 1; } static void jm_tracedraw_ignore_text( fz_context* ctx, fz_device* dev_, const fz_text* text, fz_matrix ctm ) { jm_tracedraw_device* dev = (jm_tracedraw_device*) dev_; jm_trace_text(dev, text, 3, ctm, nullptr, nullptr, 1, dev->seqno); dev->seqno += 1; } static void jm_lineart_begin_layer(fz_context *ctx, fz_device *dev_, const char *name) { jm_tracedraw_device* dev = (jm_tracedraw_device*) dev_; mupdf::ll_fz_free(dev->layer_name); dev->layer_name = mupdf::ll_fz_strdup(name); } static void jm_lineart_end_layer(fz_context *ctx, fz_device *dev_) { jm_tracedraw_device* dev = (jm_tracedraw_device*) dev_; mupdf::ll_fz_free(dev->layer_name); dev->layer_name = nullptr; } mupdf::FzDevice JM_new_texttrace_device(PyObject* out) { mupdf::FzDevice device(sizeof(jm_tracedraw_device)); jm_tracedraw_device* dev = (jm_tracedraw_device*) device.m_internal; dev->super.close_device = nullptr; dev->super.drop_device = jm_lineart_drop_device; dev->super.fill_path = jm_fill_path; dev->super.stroke_path = 
jm_dev_linewidth; dev->super.clip_path = nullptr; dev->super.clip_stroke_path = nullptr; dev->super.fill_text = jm_tracedraw_fill_text; dev->super.stroke_text = jm_tracedraw_stroke_text; dev->super.clip_text = nullptr; dev->super.clip_stroke_text = nullptr; dev->super.ignore_text = jm_tracedraw_ignore_text; dev->super.fill_shade = jm_fill_shade; dev->super.fill_image = jm_fill_image; dev->super.fill_image_mask = jm_fill_image_mask; dev->super.clip_image_mask = nullptr; dev->super.pop_clip = nullptr; dev->super.begin_mask = nullptr; dev->super.end_mask = nullptr; dev->super.begin_group = nullptr; dev->super.end_group = nullptr; dev->super.begin_tile = nullptr; dev->super.end_tile = nullptr; dev->super.begin_layer = jm_lineart_begin_layer; dev->super.end_layer = jm_lineart_end_layer; dev->super.begin_structure = nullptr; dev->super.end_structure = nullptr; dev->super.begin_metatext = nullptr; dev->super.end_metatext = nullptr; dev->super.render_flags = nullptr; dev->super.set_default_colorspaces = nullptr; Py_XINCREF(out); dev->out = out; dev->seqno = 0; return device; } static fz_quad JM_char_quad(fz_stext_line *line, fz_stext_char *ch) { if (g_skip_quad_corrections) { // no special handling return ch->quad; } if (line->wmode) { // never touch vertical write mode return ch->quad; } fz_font *font = ch->font; float asc = JM_font_ascender(font); float dsc = JM_font_descender(font); float c, s, fsize = ch->size; float asc_dsc = asc - dsc + FLT_EPSILON; if (asc_dsc >= 1 && g_small_glyph_heights == 0) { // no problem return ch->quad; } if (asc < 1e-3) { // probably Tesseract glyphless font dsc = -0.1f; asc = 0.9f; asc_dsc = 1.0f; } if (g_small_glyph_heights || asc_dsc < 1) { dsc = dsc / asc_dsc; asc = asc / asc_dsc; } asc_dsc = asc - dsc; asc = asc * fsize / asc_dsc; dsc = dsc * fsize / asc_dsc; /* ------------------------------ Re-compute quad with the adjusted ascender / descender values: Move ch->origin to (0,0) and de-rotate quad, then adjust the corners, re-rotate 
and move back to ch->origin location. ------------------------------ */ fz_matrix trm1, trm2, xlate1, xlate2; fz_quad quad; c = line->dir.x; // cosine s = line->dir.y; // sine trm1 = mupdf::ll_fz_make_matrix(c, -s, s, c, 0, 0); // derotate trm2 = mupdf::ll_fz_make_matrix(c, s, -s, c, 0, 0); // rotate if (c == -1) { // left-right flip trm1.d = 1; trm2.d = 1; } xlate1 = mupdf::ll_fz_make_matrix(1, 0, 0, 1, -ch->origin.x, -ch->origin.y); xlate2 = mupdf::ll_fz_make_matrix(1, 0, 0, 1, ch->origin.x, ch->origin.y); quad = mupdf::ll_fz_transform_quad(ch->quad, xlate1); // move origin to (0,0) quad = mupdf::ll_fz_transform_quad(quad, trm1); // de-rotate corners // adjust vertical coordinates if (c == 1 && quad.ul.y > 0) { // up-down flip quad.ul.y = asc; quad.ur.y = asc; quad.ll.y = dsc; quad.lr.y = dsc; } else { quad.ul.y = -asc; quad.ur.y = -asc; quad.ll.y = -dsc; quad.lr.y = -dsc; } // adjust horizontal coordinates that are too crazy: // (1) left x must be >= 0 // (2) if bbox width is 0, lookup char advance in font. 
if (quad.ll.x < 0) { quad.ll.x = 0; quad.ul.x = 0; } float cwidth = quad.lr.x - quad.ll.x; if (cwidth < FLT_EPSILON) { int glyph = mupdf::ll_fz_encode_character( font, ch->c); if (glyph) { float fwidth = mupdf::ll_fz_advance_glyph( font, glyph, line->wmode); quad.lr.x = quad.ll.x + fwidth * fsize; quad.ur.x = quad.lr.x; } } quad = mupdf::ll_fz_transform_quad(quad, trm2); // rotate back quad = mupdf::ll_fz_transform_quad(quad, xlate2); // translate back return quad; } static fz_rect JM_char_bbox(fz_stext_line* line, fz_stext_char* ch) { fz_rect r = mupdf::ll_fz_rect_from_quad(JM_char_quad( line, ch)); if (!line->wmode) { return r; } if (r.y1 < r.y0 + ch->size) { r.y0 = r.y1 - ch->size; } return r; } fz_rect JM_char_bbox(const mupdf::FzStextLine& line, const mupdf::FzStextChar& ch) { return JM_char_bbox( line.m_internal, ch.m_internal); } static int JM_rects_overlap(const fz_rect a, const fz_rect b) { if (0 || a.x0 >= b.x1 || a.y0 >= b.y1 || a.x1 <= b.x0 || a.y1 <= b.y0 ) return 0; return 1; } // void JM_append_rune(fz_buffer *buff, int ch); //----------------------------------------------------------------------------- // Plain text output. An identical copy of fz_print_stext_page_as_text, // but lines within a block are concatenated by space instead a new-line // character (which else leads to 2 new-lines). 
//----------------------------------------------------------------------------- void JM_print_stext_page_as_text(mupdf::FzBuffer& res, mupdf::FzStextPage& page) { fz_rect rect = page.m_internal->mediabox; for (auto block: page) { if (block.m_internal->type == FZ_STEXT_BLOCK_TEXT) { for (auto line: block) { int last_char = 0; for (auto ch: line) { fz_rect chbbox = JM_char_bbox( line, ch); if (mupdf::ll_fz_is_infinite_rect(rect) || JM_rects_overlap(rect, chbbox) ) { last_char = ch.m_internal->c; JM_append_rune(res.m_internal, last_char); } } if (last_char != 10 && last_char > 0) { mupdf::ll_fz_append_string(res.m_internal, "\n"); } } } } } // path_type is one of: #define FILL_PATH 1 #define STROKE_PATH 2 #define CLIP_PATH 3 #define CLIP_STROKE_PATH 4 // Every scissor of a clip is a sub rectangle of the preceding clip scissor if // the clip level is larger. static fz_rect compute_scissor(jm_lineart_device *dev) { PyObject *last_scissor = NULL; fz_rect scissor; if (!dev->scissors) { dev->scissors = PyList_New(0); } Py_ssize_t num_scissors = PyList_Size(dev->scissors); if (num_scissors > 0) { last_scissor = PyList_GET_ITEM(dev->scissors, num_scissors-1); scissor = JM_rect_from_py(last_scissor); scissor = fz_intersect_rect(scissor, dev->pathrect); } else { scissor = dev->pathrect; } LIST_APPEND_DROP(dev->scissors, JM_py_from_rect(scissor)); return scissor; } /* -------------------------------------------------------------------------- Check whether the last 4 lines represent a quad. Because of how we count, the lines are a polyline already, i.e. last point of a line equals 1st point of next line. So we check for a polygon (last line's end point equals start point). If not true we return 0. 
-------------------------------------------------------------------------- */ static int jm_checkquad(jm_lineart_device* dev) { PyObject *items = PyDict_GetItem(dev->pathdict, dictkey_items); Py_ssize_t i, len = PyList_Size(items); float f[8]; // coordinates of the 4 corners mupdf::FzPoint temp, lp; // line = (temp, lp) PyObject *rect; PyObject *line; // fill the 8 floats in f, start from items[-4:] for (i = 0; i < 4; i++) { // store line start points line = PyList_GET_ITEM(items, len - 4 + i); temp = JM_point_from_py(PyTuple_GET_ITEM(line, 1)); f[i * 2] = temp.x; f[i * 2 + 1] = temp.y; lp = JM_point_from_py(PyTuple_GET_ITEM(line, 2)); } if (lp.x != f[0] || lp.y != f[1]) { // not a polygon! //dev_linecount -= 1; return 0; } // we have detected a quad dev->linecount = 0; // reset this // a quad item is ("qu", (ul, ur, ll, lr)), where the tuple items // are pairs of floats representing a quad corner each. rect = PyTuple_New(2); PyTuple_SET_ITEM(rect, 0, PyUnicode_FromString("qu")); /* ---------------------------------------------------- * relationship of float array to quad points: * (0, 1) = ul, (2, 3) = ll, (6, 7) = ur, (4, 5) = lr ---------------------------------------------------- */ fz_quad q = fz_make_quad(f[0], f[1], f[6], f[7], f[2], f[3], f[4], f[5]); PyTuple_SET_ITEM(rect, 1, JM_py_from_quad(q)); PyList_SetItem(items, len - 4, rect); // replace item -4 by rect PyList_SetSlice(items, len - 3, len, NULL); // delete remaining 3 items return 1; } /* -------------------------------------------------------------------------- Check whether the last 3 path items represent a rectangle. Line 1 and 3 must be horizontal, line 2 must be vertical. Returns 1 if we have modified the path, otherwise 0. 
-------------------------------------------------------------------------- */ static int jm_checkrect(jm_lineart_device* dev) { dev->linecount = 0; // reset line count long orientation = 0; // area orientation of rectangle mupdf::FzPoint ll, lr, ur, ul; mupdf::FzRect r; PyObject *rect; PyObject *line0, *line2; PyObject *items = PyDict_GetItem(dev->pathdict, dictkey_items); Py_ssize_t len = PyList_Size(items); line0 = PyList_GET_ITEM(items, len - 3); ll = JM_point_from_py(PyTuple_GET_ITEM(line0, 1)); lr = JM_point_from_py(PyTuple_GET_ITEM(line0, 2)); // no need to extract "line1"! line2 = PyList_GET_ITEM(items, len - 1); ur = JM_point_from_py(PyTuple_GET_ITEM(line2, 1)); ul = JM_point_from_py(PyTuple_GET_ITEM(line2, 2)); /* --------------------------------------------------------------------- Assumption: When decomposing rects, MuPDF always starts with a horizontal line, followed by a vertical line, followed by a horizontal line. First line: (ll, lr), third line: (ul, ur). If 1st line is below 3rd line, we record anti-clockwise (+1), else clockwise (-1) orientation. --------------------------------------------------------------------- */ if (ll.y != lr.y || ll.x != ul.x || ur.y != ul.y || ur.x != lr.x) { goto drop_out; // not a rectangle } // we have a rect, replace last 3 "l" items by one "re" item. 
if (ul.y < lr.y) { r = fz_make_rect(ul.x, ul.y, lr.x, lr.y); orientation = 1; } else { r = fz_make_rect(ll.x, ll.y, ur.x, ur.y); orientation = -1; } rect = PyTuple_New(3); PyTuple_SET_ITEM(rect, 0, PyUnicode_FromString("re")); PyTuple_SET_ITEM(rect, 1, JM_py_from_rect(r)); PyTuple_SET_ITEM(rect, 2, PyLong_FromLong(orientation)); PyList_SetItem(items, len - 3, rect); // replace item -3 by rect PyList_SetSlice(items, len - 2, len, NULL); // delete remaining 2 items return 1; drop_out:; return 0; } static PyObject * jm_lineart_color(fz_colorspace *colorspace, const float *color) { float rgb[3]; if (colorspace) { mupdf::ll_fz_convert_color(colorspace, color, mupdf::ll_fz_device_rgb(), rgb, NULL, fz_default_color_params); return Py_BuildValue("fff", rgb[0], rgb[1], rgb[2]); } return PyTuple_New(0); } static void trace_moveto(fz_context *ctx, void *dev_, float x, float y) { jm_lineart_device* dev = (jm_lineart_device*) dev_; dev->lastpoint = mupdf::ll_fz_transform_point(fz_make_point(x, y), dev->ctm); if (mupdf::ll_fz_is_infinite_rect(dev->pathrect)) { dev->pathrect = mupdf::ll_fz_make_rect( dev->lastpoint.x, dev->lastpoint.y, dev->lastpoint.x, dev->lastpoint.y ); } dev->firstpoint = dev->lastpoint; dev->havemove = 1; dev->linecount = 0; // reset # of consec. 
lines } static void trace_lineto(fz_context *ctx, void *dev_, float x, float y) { jm_lineart_device* dev = (jm_lineart_device*) dev_; fz_point p1 = fz_transform_point(fz_make_point(x, y), dev->ctm); dev->pathrect = fz_include_point_in_rect(dev->pathrect, p1); PyObject *list = PyTuple_New(3); PyTuple_SET_ITEM(list, 0, PyUnicode_FromString("l")); PyTuple_SET_ITEM(list, 1, JM_py_from_point(dev->lastpoint)); PyTuple_SET_ITEM(list, 2, JM_py_from_point(p1)); dev->lastpoint = p1; PyObject *items = PyDict_GetItem(dev->pathdict, dictkey_items); LIST_APPEND_DROP(items, list); dev->linecount += 1; // counts consecutive lines if (dev->linecount == 4 && dev->path_type != FILL_PATH) { // shrink to "re" or "qu" item jm_checkquad(dev); } } static void trace_curveto(fz_context *ctx, void *dev_, float x1, float y1, float x2, float y2, float x3, float y3) { jm_lineart_device* dev = (jm_lineart_device*) dev_; dev->linecount = 0; // reset # of consec. lines fz_point p1 = fz_make_point(x1, y1); fz_point p2 = fz_make_point(x2, y2); fz_point p3 = fz_make_point(x3, y3); p1 = fz_transform_point(p1, dev->ctm); p2 = fz_transform_point(p2, dev->ctm); p3 = fz_transform_point(p3, dev->ctm); dev->pathrect = fz_include_point_in_rect(dev->pathrect, p1); dev->pathrect = fz_include_point_in_rect(dev->pathrect, p2); dev->pathrect = fz_include_point_in_rect(dev->pathrect, p3); PyObject *list = PyTuple_New(5); PyTuple_SET_ITEM(list, 0, PyUnicode_FromString("c")); PyTuple_SET_ITEM(list, 1, JM_py_from_point(dev->lastpoint)); PyTuple_SET_ITEM(list, 2, JM_py_from_point(p1)); PyTuple_SET_ITEM(list, 3, JM_py_from_point(p2)); PyTuple_SET_ITEM(list, 4, JM_py_from_point(p3)); dev->lastpoint = p3; PyObject *items = PyDict_GetItem(dev->pathdict, dictkey_items); LIST_APPEND_DROP(items, list); } static void trace_close(fz_context *ctx, void *dev_) { jm_lineart_device* dev = (jm_lineart_device*) dev_; if (dev->linecount == 3) { if (jm_checkrect(dev)) { return; } } dev->linecount = 0; // reset # of consec. 
lines if (dev->havemove) { if (dev->firstpoint.x != dev->lastpoint.x || dev->firstpoint.y != dev->lastpoint.y) { PyObject *list = PyTuple_New(3); PyTuple_SET_ITEM(list, 0, PyUnicode_FromString("l")); PyTuple_SET_ITEM(list, 1, JM_py_from_point(dev->lastpoint)); PyTuple_SET_ITEM(list, 2, JM_py_from_point(dev->firstpoint)); dev->lastpoint = dev->firstpoint; PyObject *items = PyDict_GetItem(dev->pathdict, dictkey_items); LIST_APPEND_DROP(items, list); } dev->havemove = 0; DICT_SETITEMSTR_DROP(dev->pathdict, "closePath", JM_BOOL(0)); } else { DICT_SETITEMSTR_DROP(dev->pathdict, "closePath", JM_BOOL(1)); } } static const fz_path_walker trace_path_walker = { trace_moveto, trace_lineto, trace_curveto, trace_close }; /* --------------------------------------------------------------------- Create the "items" list of the path dictionary * either create or empty the path dictionary * reset the end point of the path * reset count of consecutive lines * invoke fz_walk_path(), which create the single items * if no items detected, empty path dict again --------------------------------------------------------------------- */ static void jm_lineart_path(jm_lineart_device *dev, const fz_path *path) { dev->pathrect = fz_infinite_rect; dev->linecount = 0; dev->lastpoint = fz_make_point(0, 0); dev->firstpoint = fz_make_point(0, 0); if (dev->pathdict) { Py_CLEAR(dev->pathdict); } dev->pathdict = PyDict_New(); DICT_SETITEM_DROP(dev->pathdict, dictkey_items, PyList_New(0)); mupdf::ll_fz_walk_path(path, &trace_path_walker, dev); // Check if any items were added ... if (!PyDict_GetItem(dev->pathdict, dictkey_items) || !PyList_Size(PyDict_GetItem(dev->pathdict, dictkey_items))) { Py_CLEAR(dev->pathdict); } } //--------------------------------------------------------------------------- // Append current path to list or merge into last path of the list. 
// (1) Append if first path, different item lists or not a 'stroke' version // of previous path // (2) If new path has the same items, merge its content into previous path // and change path["type"] to "fs". // (3) If "out" is callable, skip the previous and pass dictionary to it. //--------------------------------------------------------------------------- static void // todo: remove `method` arg - it is dev->method. jm_append_merge(jm_lineart_device *dev) { Py_ssize_t len; int rc; PyObject *prev; PyObject *previtems; PyObject *thisitems; const char *thistype; const char *prevtype; if (PyCallable_Check(dev->out) || dev->method != Py_None) { // function or method goto callback; } len = PyList_Size(dev->out); // len of output list so far if (len == 0) { // always append first path goto append; } thistype = PyUnicode_AsUTF8(PyDict_GetItem(dev->pathdict, dictkey_type)); if (strcmp(thistype, "s") != 0) { // if not stroke, then append goto append; } prev = PyList_GET_ITEM(dev->out, len - 1); // get prev path prevtype = PyUnicode_AsUTF8(PyDict_GetItem(prev, dictkey_type)); if (strcmp(prevtype, "f") != 0) { // if previous not fill, append goto append; } // last check: there must be the same list of items for "f" and "s". previtems = PyDict_GetItem(prev, dictkey_items); thisitems = PyDict_GetItem(dev->pathdict, dictkey_items); if (PyObject_RichCompareBool(previtems, thisitems, Py_NE)) { goto append; } rc = PyDict_Merge(prev, dev->pathdict, 0); // merge, do not override if (rc == 0) { DICT_SETITEM_DROP(prev, dictkey_type, PyUnicode_FromString("fs")); goto postappend; } else { messagef("could not merge stroke and fill path"); goto append; } append:; //printf("Appending to dev->out. 
len(dev->out)=%zi\n", PyList_Size(dev->out)); PyList_Append(dev->out, dev->pathdict); postappend:; Py_CLEAR(dev->pathdict); return; callback:; // callback function or method PyObject *resp = NULL; if (dev->method == Py_None) { resp = PyObject_CallFunctionObjArgs(dev->out, dev->pathdict, NULL); } else { resp = PyObject_CallMethodObjArgs(dev->out, dev->method, dev->pathdict, NULL); } if (resp) { Py_DECREF(resp); } else { messagef("calling cdrawings callback function/method failed!"); PyErr_Clear(); } Py_CLEAR(dev->pathdict); return; } static void jm_lineart_fill_path(fz_context *ctx, fz_device *dev_, const fz_path *path, int even_odd, fz_matrix ctm, fz_colorspace *colorspace, const float *color, float alpha, fz_color_params color_params) { jm_lineart_device *dev = (jm_lineart_device *) dev_; //printf("extra.jm_lineart_fill_path(): dev->seqno=%zi\n", dev->seqno); dev->ctm = ctm; //fz_concat(ctm, trace_device_ptm); dev->path_type = FILL_PATH; jm_lineart_path(dev, path); if (!dev->pathdict) { return; } DICT_SETITEM_DROP(dev->pathdict, dictkey_type, PyUnicode_FromString("f")); DICT_SETITEMSTR_DROP(dev->pathdict, "even_odd", JM_BOOL(even_odd)); DICT_SETITEMSTR_DROP(dev->pathdict, "fill_opacity", Py_BuildValue("f", alpha)); DICT_SETITEMSTR_DROP(dev->pathdict, "fill", jm_lineart_color(colorspace, color)); DICT_SETITEM_DROP(dev->pathdict, dictkey_rect, JM_py_from_rect(dev->pathrect)); DICT_SETITEMSTR_DROP(dev->pathdict, "seqno", PyLong_FromSize_t(dev->seqno)); DICT_SETITEMSTR_DROP(dev->pathdict, "layer", JM_UnicodeFromStr(dev->layer_name)); if (dev->clips) { DICT_SETITEMSTR_DROP(dev->pathdict, "level", PyLong_FromLong(dev->depth)); } jm_append_merge(dev); dev->seqno += 1; } static void jm_lineart_stroke_path(fz_context *ctx, fz_device *dev_, const fz_path *path, const fz_stroke_state *stroke, fz_matrix ctm, fz_colorspace *colorspace, const float *color, float alpha, fz_color_params color_params) { jm_lineart_device *dev = (jm_lineart_device *)dev_; 
//printf("extra.jm_lineart_stroke_path(): dev->seqno=%zi\n", dev->seqno); int i; dev->pathfactor = 1; if (ctm.a != 0 && fz_abs(ctm.a) == fz_abs(ctm.d)) { dev->pathfactor = fz_abs(ctm.a); } else { if (ctm.b != 0 && fz_abs(ctm.b) == fz_abs(ctm.c)) { dev->pathfactor = fz_abs(ctm.b); } } dev->ctm = ctm; // fz_concat(ctm, trace_device_ptm); dev->path_type = STROKE_PATH; jm_lineart_path(dev, path); if (!dev->pathdict) { return; } DICT_SETITEM_DROP(dev->pathdict, dictkey_type, PyUnicode_FromString("s")); DICT_SETITEMSTR_DROP(dev->pathdict, "stroke_opacity", Py_BuildValue("f", alpha)); DICT_SETITEMSTR_DROP(dev->pathdict, "color", jm_lineart_color(colorspace, color)); DICT_SETITEM_DROP(dev->pathdict, dictkey_width, Py_BuildValue("f", dev->pathfactor * stroke->linewidth)); DICT_SETITEMSTR_DROP(dev->pathdict, "lineCap", Py_BuildValue("iii", stroke->start_cap, stroke->dash_cap, stroke->end_cap)); DICT_SETITEMSTR_DROP(dev->pathdict, "lineJoin", Py_BuildValue("f", dev->pathfactor * stroke->linejoin)); if (!PyDict_GetItemString(dev->pathdict, "closePath")) { DICT_SETITEMSTR_DROP(dev->pathdict, "closePath", JM_BOOL(0)); } // output the "dashes" string if (stroke->dash_len) { mupdf::FzBuffer buff(256); mupdf::fz_append_string(buff, "[ "); // left bracket for (i = 0; i < stroke->dash_len; i++) { fz_append_printf(ctx, buff.m_internal, "%g ", dev->pathfactor * stroke->dash_list[i]); } fz_append_printf(ctx, buff.m_internal, "] %g", dev->pathfactor * stroke->dash_phase); DICT_SETITEMSTR_DROP(dev->pathdict, "dashes", JM_EscapeStrFromBuffer(buff)); } else { DICT_SETITEMSTR_DROP(dev->pathdict, "dashes", PyUnicode_FromString("[] 0")); } DICT_SETITEM_DROP(dev->pathdict, dictkey_rect, JM_py_from_rect(dev->pathrect)); DICT_SETITEMSTR_DROP(dev->pathdict, "layer", JM_UnicodeFromStr(dev->layer_name)); DICT_SETITEMSTR_DROP(dev->pathdict, "seqno", PyLong_FromSize_t(dev->seqno)); if (dev->clips) { DICT_SETITEMSTR_DROP(dev->pathdict, "level", PyLong_FromLong(dev->depth)); } // output the dict - 
potentially merging it with a previous fill_path twin jm_append_merge(dev); dev->seqno += 1; } static void jm_lineart_clip_path(fz_context *ctx, fz_device *dev_, const fz_path *path, int even_odd, fz_matrix ctm, fz_rect scissor) { jm_lineart_device *dev = (jm_lineart_device *)dev_; if (!dev->clips) return; dev->ctm = ctm; //fz_concat(ctm, trace_device_ptm); dev->path_type = CLIP_PATH; jm_lineart_path(dev, path); if (!dev->pathdict) { return; } DICT_SETITEM_DROP(dev->pathdict, dictkey_type, PyUnicode_FromString("clip")); DICT_SETITEMSTR_DROP(dev->pathdict, "even_odd", JM_BOOL(even_odd)); if (!PyDict_GetItemString(dev->pathdict, "closePath")) { DICT_SETITEMSTR_DROP(dev->pathdict, "closePath", JM_BOOL(0)); } DICT_SETITEMSTR_DROP(dev->pathdict, "scissor", JM_py_from_rect(compute_scissor(dev))); DICT_SETITEMSTR_DROP(dev->pathdict, "level", PyLong_FromLong(dev->depth)); DICT_SETITEMSTR_DROP(dev->pathdict, "layer", JM_UnicodeFromStr(dev->layer_name)); jm_append_merge(dev); dev->depth++; } static void jm_lineart_clip_stroke_path(fz_context *ctx, fz_device *dev_, const fz_path *path, const fz_stroke_state *stroke, fz_matrix ctm, fz_rect scissor) { jm_lineart_device *dev = (jm_lineart_device *)dev_; if (!dev->clips) return; dev->ctm = ctm; //fz_concat(ctm, trace_device_ptm); dev->path_type = CLIP_STROKE_PATH; jm_lineart_path(dev, path); if (!dev->pathdict) { return; } DICT_SETITEM_DROP(dev->pathdict, dictkey_type, PyUnicode_FromString("clip")); DICT_SETITEMSTR_DROP(dev->pathdict, "even_odd", Py_BuildValue("s", NULL)); if (!PyDict_GetItemString(dev->pathdict, "closePath")) { DICT_SETITEMSTR_DROP(dev->pathdict, "closePath", JM_BOOL(0)); } DICT_SETITEMSTR_DROP(dev->pathdict, "scissor", JM_py_from_rect(compute_scissor(dev))); DICT_SETITEMSTR_DROP(dev->pathdict, "level", PyLong_FromLong(dev->depth)); DICT_SETITEMSTR_DROP(dev->pathdict, "layer", JM_UnicodeFromStr(dev->layer_name)); jm_append_merge(dev); dev->depth++; } static void jm_lineart_clip_stroke_text(fz_context *ctx, 
fz_device *dev_, const fz_text *text, const fz_stroke_state *stroke, fz_matrix ctm, fz_rect scissor) { jm_lineart_device *dev = (jm_lineart_device *)dev_; if (!dev->clips) return; compute_scissor(dev); dev->depth++; } static void jm_lineart_clip_text(fz_context *ctx, fz_device *dev_, const fz_text *text, fz_matrix ctm, fz_rect scissor) { jm_lineart_device *dev = (jm_lineart_device *)dev_; if (!dev->clips) return; compute_scissor(dev); dev->depth++; } static void jm_lineart_clip_image_mask(fz_context *ctx, fz_device *dev_, fz_image *image, fz_matrix ctm, fz_rect scissor) { jm_lineart_device *dev = (jm_lineart_device *)dev_; if (!dev->clips) return; compute_scissor(dev); dev->depth++; } static void jm_lineart_pop_clip(fz_context *ctx, fz_device *dev_) { jm_lineart_device *dev = (jm_lineart_device *)dev_; if (!dev->clips) return; if (!dev->scissors) return; Py_ssize_t len = PyList_Size(dev->scissors); if (len < 1) return; PyList_SetSlice(dev->scissors, len - 1, len, NULL); dev->depth--; } static void jm_lineart_begin_group(fz_context *ctx, fz_device *dev_, fz_rect bbox, fz_colorspace *cs, int isolated, int knockout, int blendmode, float alpha) { jm_lineart_device *dev = (jm_lineart_device *)dev_; if (!dev->clips) return; dev->pathdict = Py_BuildValue("{s:s,s:N,s:N,s:N,s:s,s:f,s:i,s:N}", "type", "group", "rect", JM_py_from_rect(bbox), "isolated", JM_BOOL(isolated), "knockout", JM_BOOL(knockout), "blendmode", fz_blendmode_name(blendmode), "opacity", alpha, "level", dev->depth, "layer", JM_UnicodeFromStr(dev->layer_name) ); jm_append_merge(dev); dev->depth++; } static void jm_lineart_end_group(fz_context *ctx, fz_device *dev_) { jm_lineart_device *dev = (jm_lineart_device *)dev_; if (!dev->clips) return; dev->depth--; } static void jm_lineart_fill_text(fz_context *ctx, fz_device *dev, const fz_text *, fz_matrix, fz_colorspace *, const float *color, float alpha, fz_color_params) { jm_increase_seqno(ctx, dev); } static void jm_lineart_stroke_text(fz_context *ctx, fz_device 
*dev, const fz_text *, const fz_stroke_state *, fz_matrix, fz_colorspace *, const float *color, float alpha, fz_color_params) { jm_increase_seqno(ctx, dev); } static void jm_lineart_fill_shade(fz_context *ctx, fz_device *dev, fz_shade *shd, fz_matrix ctm, float alpha, fz_color_params color_params) { jm_increase_seqno(ctx, dev); } static void jm_lineart_fill_image(fz_context *ctx, fz_device *dev, fz_image *img, fz_matrix ctm, float alpha, fz_color_params color_params) { jm_increase_seqno(ctx, dev); } static void jm_lineart_fill_image_mask(fz_context *ctx, fz_device *dev, fz_image *img, fz_matrix ctm, fz_colorspace *, const float *color, float alpha, fz_color_params color_params) { jm_increase_seqno(ctx, dev); } static void jm_lineart_ignore_text(fz_context *ctx, fz_device *dev, const fz_text *, fz_matrix) { jm_increase_seqno(ctx, dev); } //------------------------------------------------------------------- // LINEART device for Python method Page.get_cdrawings() //------------------------------------------------------------------- mupdf::FzDevice JM_new_lineart_device(PyObject *out, int clips, PyObject *method) { //printf("extra.JM_new_lineart_device()\n"); jm_lineart_device* dev = (jm_lineart_device*) mupdf::ll_fz_new_device_of_size(sizeof(jm_lineart_device)); dev->super.close_device = NULL; dev->super.drop_device = jm_lineart_drop_device; dev->super.fill_path = jm_lineart_fill_path; dev->super.stroke_path = jm_lineart_stroke_path; dev->super.clip_path = jm_lineart_clip_path; dev->super.clip_stroke_path = jm_lineart_clip_stroke_path; dev->super.fill_text = jm_lineart_fill_text; dev->super.stroke_text = jm_lineart_stroke_text; dev->super.clip_text = jm_lineart_clip_text; dev->super.clip_stroke_text = jm_lineart_clip_stroke_text; dev->super.ignore_text = jm_lineart_ignore_text; dev->super.fill_shade = jm_lineart_fill_shade; dev->super.fill_image = jm_lineart_fill_image; dev->super.fill_image_mask = jm_lineart_fill_image_mask; dev->super.clip_image_mask = 
jm_lineart_clip_image_mask; dev->super.pop_clip = jm_lineart_pop_clip; dev->super.begin_mask = NULL; dev->super.end_mask = NULL; dev->super.begin_group = jm_lineart_begin_group; dev->super.end_group = jm_lineart_end_group; dev->super.begin_tile = NULL; dev->super.end_tile = NULL; dev->super.begin_layer = jm_lineart_begin_layer; dev->super.end_layer = jm_lineart_end_layer; dev->super.begin_structure = NULL; dev->super.end_structure = NULL; dev->super.begin_metatext = NULL; dev->super.end_metatext = NULL; dev->super.render_flags = NULL; dev->super.set_default_colorspaces = NULL; if (PyList_Check(out)) { Py_INCREF(out); } Py_INCREF(method); dev->out = out; dev->seqno = 0; dev->depth = 0; dev->clips = clips; dev->method = method; dev->pathdict = nullptr; return mupdf::FzDevice(&dev->super); } PyObject* get_cdrawings(mupdf::FzPage& page, PyObject *extended=NULL, PyObject *callback=NULL, PyObject *method=NULL) { //fz_page *page = (fz_page *) $self; //fz_device *dev = NULL; PyObject *rc = NULL; int clips = PyObject_IsTrue(extended); mupdf::FzDevice dev; if (PyCallable_Check(callback) || method != Py_None) { dev = JM_new_lineart_device(callback, clips, method); } else { rc = PyList_New(0); dev = JM_new_lineart_device(rc, clips, method); } mupdf::FzRect prect = mupdf::fz_bound_page(page); ((jm_lineart_device*) dev.m_internal)->ptm = mupdf::ll_fz_make_matrix(1, 0, 0, -1, 0, prect.y1); mupdf::FzCookie cookie; mupdf::FzMatrix identity; mupdf::fz_run_page( page, dev, *identity.internal(), cookie); mupdf::fz_close_device( dev); if (PyCallable_Check(callback) || method != Py_None) { Py_RETURN_NONE; } return rc; } //--------------------------------------------------------------------------- // APPEND non-ascii runes in unicode escape format to fz_buffer //--------------------------------------------------------------------------- void JM_append_rune(fz_buffer *buff, int ch) { char text[32]; if (ch == 92) // prevent accidental "\u", "\U" sequences { mupdf::ll_fz_append_string(buff, 
"\\u005c"); } else if ((ch >= 32 && ch <= 127) || ch == 10) { mupdf::ll_fz_append_byte(buff, ch); } else if (ch >= 0xd800 && ch <= 0xdfff) // orphaned surrogate Unicodes { mupdf::ll_fz_append_string(buff, "\\ufffd"); } else if (ch <= 0xffff) { // 4 hex digits snprintf(text, sizeof(text), "\\u%04x", ch); mupdf::ll_fz_append_string(buff, text); } else { // 8 hex digits snprintf(text, sizeof(text), "\\U%08x", ch); mupdf::ll_fz_append_string(buff, text); } } mupdf::FzRect JM_make_spanlist( PyObject *line_dict, mupdf::FzStextLine& line, int raw, mupdf::FzBuffer& buff, mupdf::FzRect& tp_rect ) { PyObject *span = NULL, *char_list = NULL, *char_dict; PyObject *span_list = PyList_New(0); mupdf::fz_clear_buffer(buff); fz_rect span_rect = fz_empty_rect; fz_rect line_rect = fz_empty_rect; fz_point span_origin = {0, 0}; struct char_style { float size = -1; unsigned flags = 0; #if MUPDF_VERSION_GE(1, 25, 2) /* From mupdf:include/mupdf/fitz/structured-text.h:fz_stext_char::flags, which uses anonymous enum values: FZ_STEXT_STRIKEOUT = 1, FZ_STEXT_UNDERLINE = 2, FZ_STEXT_SYNTHETIC = 4, FZ_STEXT_FILLED = 16, FZ_STEXT_STROKED = 32, FZ_STEXT_CLIPPED = 64 */ unsigned char_flags = 0; #endif const char *font = ""; unsigned argb = 0; float asc = 0; float desc = 0; uint16_t bidi = 0; }; char_style old_style; char_style style; for (mupdf::FzStextChar ch: line) { fz_rect r = JM_char_bbox(line, ch); if (!JM_rects_overlap(*tp_rect.internal(), r) && !fz_is_infinite_rect(tp_rect)) { continue; } /* Info from: detect_super_script() fz_font_is_italic() fz_font_is_serif() fz_font_is_monospaced() fz_font_is_bold() */ int flags = JM_char_font_flags( ch.m_internal->font, line.m_internal, ch.m_internal); fz_point origin = ch.m_internal->origin; style.size = ch.m_internal->size; style.flags = flags; #if MUPDF_VERSION_GE(1, 25, 2) /* FZ_STEXT_SYNTHETIC is per-char, not per-span. 
*/ style.char_flags = ch.m_internal->flags & ~FZ_STEXT_SYNTHETIC; #endif style.font = JM_font_name(ch.m_internal->font); #if MUPDF_VERSION_GE(1, 25, 0) style.argb = ch.m_internal->argb; #else style.argb = ch.m_internal->color; #endif style.asc = JM_font_ascender(ch.m_internal->font); style.desc = JM_font_descender(ch.m_internal->font); if (0 || style.size != old_style.size || style.flags != old_style.flags #if MUPDF_VERSION_GE(1, 25, 2) || style.char_flags != old_style.char_flags #endif || style.argb != old_style.argb || strcmp(style.font, old_style.font) != 0 || style.bidi != old_style.bidi ) { if (old_style.size >= 0) { // not first one, output previous if (raw) { // put character list in the span DICT_SETITEM_DROP(span, dictkey_chars, char_list); char_list = NULL; } else { // put text string in the span DICT_SETITEM_DROP(span, dictkey_text, JM_EscapeStrFromBuffer(buff)); mupdf::fz_clear_buffer(buff); } DICT_SETITEM_DROP(span, dictkey_origin, JM_py_from_point(span_origin)); DICT_SETITEM_DROP(span, dictkey_bbox, JM_py_from_rect(span_rect)); line_rect = mupdf::ll_fz_union_rect(line_rect, span_rect); LIST_APPEND_DROP(span_list, span); span = NULL; } span = PyDict_New(); float asc = style.asc, desc = style.desc; if (style.asc < 1e-3) { asc = 0.9f; desc = -0.1f; } DICT_SETITEM_DROP(span, dictkey_size, Py_BuildValue("f", style.size)); DICT_SETITEM_DROP(span, dictkey_flags, Py_BuildValue("I", style.flags)); DICT_SETITEM_DROP(span, dictkey_bidi, Py_BuildValue("I", style.bidi)); #if MUPDF_VERSION_GE(1, 25, 2) DICT_SETITEM_DROP(span, dictkey_char_flags, Py_BuildValue("I", style.char_flags)); #endif DICT_SETITEM_DROP(span, dictkey_font, JM_EscapeStrFromStr(style.font)); DICT_SETITEM_DROP(span, dictkey_color, Py_BuildValue("I", style.argb & 0xffffff)); #if MUPDF_VERSION_GE(1, 25, 0) DICT_SETITEMSTR_DROP(span, "alpha", Py_BuildValue("I", style.argb >> 24)); #endif DICT_SETITEMSTR_DROP(span, "ascender", Py_BuildValue("f", asc)); DICT_SETITEMSTR_DROP(span, "descender", 
Py_BuildValue("f", desc)); old_style = style; span_rect = r; span_origin = origin; } span_rect = mupdf::ll_fz_union_rect(span_rect, r); if (raw) { // make and append a char dict char_dict = PyDict_New(); DICT_SETITEM_DROP(char_dict, dictkey_origin, JM_py_from_point(ch.m_internal->origin)); DICT_SETITEM_DROP(char_dict, dictkey_bbox, JM_py_from_rect(r)); DICT_SETITEM_DROP(char_dict, dictkey_c, Py_BuildValue("C", ch.m_internal->c)); DICT_SETITEMSTR_DROP(char_dict, "synthetic", Py_BuildValue("O", (ch.m_internal->flags & FZ_STEXT_SYNTHETIC) ? Py_True : Py_False)); if (!char_list) { char_list = PyList_New(0); } LIST_APPEND_DROP(char_list, char_dict); } else { // add character byte to buffer JM_append_rune(buff.m_internal, ch.m_internal->c); } } // all characters processed, now flush remaining span if (span) { if (raw) { DICT_SETITEM_DROP(span, dictkey_chars, char_list); char_list = NULL; } else { DICT_SETITEM_DROP(span, dictkey_text, JM_EscapeStrFromBuffer(buff)); mupdf::fz_clear_buffer(buff); } DICT_SETITEM_DROP(span, dictkey_origin, JM_py_from_point(span_origin)); DICT_SETITEM_DROP(span, dictkey_bbox, JM_py_from_rect(span_rect)); if (!fz_is_empty_rect(span_rect)) { LIST_APPEND_DROP(span_list, span); line_rect = fz_union_rect(line_rect, span_rect); } else { Py_DECREF(span); } span = NULL; } if (!mupdf::fz_is_empty_rect(line_rect)) { DICT_SETITEM_DROP(line_dict, dictkey_spans, span_list); } else { DICT_SETITEM_DROP(line_dict, dictkey_spans, span_list); } return line_rect; } //----------------------------------------------------------------------------- // Functions for wordlist output //----------------------------------------------------------------------------- int JM_append_word( PyObject* lines, fz_buffer* buff, fz_rect* wbbox, int block_n, int line_n, int word_n ) { PyObject* s = JM_EscapeStrFromBuffer(buff); PyObject* litem = Py_BuildValue( "ffffOiii", wbbox->x0, wbbox->y0, wbbox->x1, wbbox->y1, s, block_n, line_n, word_n ); LIST_APPEND_DROP(lines, litem); 
Py_DECREF(s); *wbbox = fz_empty_rect; return word_n + 1; // word counter } PyObject* extractWORDS(mupdf::FzStextPage& this_tpage, PyObject *delimiters) { int block_n = -1; fz_rect wbbox = fz_empty_rect; // word bbox fz_rect tp_rect = this_tpage.m_internal->mediabox; PyObject *lines = NULL; mupdf::FzBuffer buff = mupdf::fz_new_buffer(64); lines = PyList_New(0); for (mupdf::FzStextBlock block: this_tpage) { block_n++; if (block.m_internal->type != FZ_STEXT_BLOCK_TEXT) { continue; } int line_n = -1; for (mupdf::FzStextLine line: block) { line_n++; int word_n = 0; // word counter per line mupdf::fz_clear_buffer(buff); // reset word buffer size_t buflen = 0; // reset char counter int last_char_rtl = 0; // was last character RTL? for (mupdf::FzStextChar ch: line) { mupdf::FzRect cbbox = JM_char_bbox(line, ch); if (!JM_rects_overlap(tp_rect, *cbbox.internal()) && !fz_is_infinite_rect(tp_rect)) { continue; } int word_delimiter = JM_is_word_delimiter(ch.m_internal->c, delimiters); int this_char_rtl = JM_is_rtl_char(ch.m_internal->c); if (word_delimiter || this_char_rtl != last_char_rtl) { if (buflen == 0 && word_delimiter) { continue; // skip delimiters at line start } if (!fz_is_empty_rect(wbbox)) { word_n = JM_append_word( lines, buff.m_internal, &wbbox, block_n, line_n, word_n ); } mupdf::fz_clear_buffer(buff); buflen = 0; // reset char counter if (word_delimiter) continue; } // append one unicode character to the word JM_append_rune(buff.m_internal, ch.m_internal->c); last_char_rtl = this_char_rtl; buflen++; // enlarge word bbox wbbox = fz_union_rect(wbbox, JM_char_bbox(line, ch)); } if (buflen && !fz_is_empty_rect(wbbox)) { word_n = JM_append_word( lines, buff.m_internal, &wbbox, block_n, line_n, word_n ); } mupdf::fz_clear_buffer(buff); buflen = 0; } } return lines; } struct ScopedPyObject /* PyObject* wrapper, destructor calls Py_CLEAR() unless `release()` has been called. 
*/ { ScopedPyObject(PyObject* rhs=nullptr) : m_pyobject(rhs) {} PyObject*& get() { return m_pyobject; } ScopedPyObject& operator= (PyObject* rhs) { Py_CLEAR(m_pyobject); m_pyobject = rhs; return *this; } PyObject* release() { PyObject* ret = m_pyobject; m_pyobject = nullptr; return ret; } ~ScopedPyObject() { Py_CLEAR(m_pyobject); } PyObject* m_pyobject = nullptr; }; PyObject* extractBLOCKS(mupdf::FzStextPage& self) { fz_stext_page *this_tpage = self.m_internal; fz_rect tp_rect = this_tpage->mediabox; mupdf::FzBuffer res(1024); ScopedPyObject lines( PyList_New(0)); int block_n = -1; for (fz_stext_block* block = this_tpage->first_block; block; block = block->next) { ScopedPyObject text; block_n++; fz_rect blockrect = fz_empty_rect; if (block->type == FZ_STEXT_BLOCK_TEXT) { mupdf::fz_clear_buffer(res); // set text buffer to empty int line_n = -1; int last_char = 0; (void) line_n; /* Not actually used, but keeping in the code for now. */ for (fz_stext_line* line = block->u.t.first_line; line; line = line->next) { line_n++; fz_rect linerect = fz_empty_rect; for (fz_stext_char* ch = line->first_char; ch; ch = ch->next) { fz_rect cbbox = JM_char_bbox(line, ch); if (!JM_rects_overlap(tp_rect, cbbox) && !fz_is_infinite_rect(tp_rect)) { continue; } JM_append_rune(res.m_internal, ch->c); last_char = ch->c; linerect = fz_union_rect(linerect, cbbox); } if (last_char != 10 && !fz_is_empty_rect(linerect)) { mupdf::fz_append_byte(res, 10); } blockrect = fz_union_rect(blockrect, linerect); } text = JM_EscapeStrFromBuffer(res); } else if (JM_rects_overlap(tp_rect, block->bbox) || fz_is_infinite_rect(tp_rect)) { fz_image *img = block->u.i.image; fz_colorspace *cs = img->colorspace; text = PyUnicode_FromFormat( "<image: %s, width: %d, height: %d, bpc: %d>", mupdf::ll_fz_colorspace_name(cs), img->w, img->h, img->bpc ); blockrect = fz_union_rect(blockrect, block->bbox); } if (!fz_is_empty_rect(blockrect)) { ScopedPyObject litem = PyTuple_New(7); PyTuple_SET_ITEM(litem.get(), 0, 
Py_BuildValue("f", blockrect.x0)); PyTuple_SET_ITEM(litem.get(), 1, Py_BuildValue("f", blockrect.y0)); PyTuple_SET_ITEM(litem.get(), 2, Py_BuildValue("f", blockrect.x1)); PyTuple_SET_ITEM(litem.get(), 3, Py_BuildValue("f", blockrect.y1)); PyTuple_SET_ITEM(litem.get(), 4, Py_BuildValue("O", text.get())); PyTuple_SET_ITEM(litem.get(), 5, Py_BuildValue("i", block_n)); PyTuple_SET_ITEM(litem.get(), 6, Py_BuildValue("i", block->type)); LIST_APPEND(lines.get(), litem.get()); } } return lines.release(); } #define EMPTY_STRING PyUnicode_FromString("") static PyObject *JM_UnicodeFromStr(const char *c) { if (!c) return EMPTY_STRING; PyObject *val = Py_BuildValue("s", c); if (!val) { val = EMPTY_STRING; PyErr_Clear(); } return val; } PyObject* link_uri(mupdf::FzLink& link) { return JM_UnicodeFromStr( link.m_internal->uri); } fz_stext_page* page_get_textpage( mupdf::FzPage& self, PyObject* clip, int flags, PyObject* matrix ) { fz_context* ctx = mupdf::internal_context_get(); fz_stext_page *tpage=NULL; fz_page *page = self.m_internal; fz_device *dev = NULL; fz_stext_options options; memset(&options, 0, sizeof options); options.flags = flags; fz_try(ctx) { // Default to page's rect if `clip` not specified, for #2048. fz_rect rect = (clip==Py_None) ? 
fz_bound_page(ctx, page) : JM_rect_from_py(clip); fz_matrix ctm = JM_matrix_from_py(matrix); tpage = fz_new_stext_page(ctx, rect); dev = fz_new_stext_device(ctx, tpage, &options); fz_run_page(ctx, page, dev, ctm, NULL); fz_close_device(ctx, dev); } fz_always(ctx) { fz_drop_device(ctx, dev); } fz_catch(ctx) { mupdf::internal_throw_exception(ctx); } return tpage; } // return extension for pymupdf image type const char *JM_image_extension(int type) { switch (type) { case(FZ_IMAGE_RAW): return "raw"; case(FZ_IMAGE_FLATE): return "flate"; case(FZ_IMAGE_LZW): return "lzw"; case(FZ_IMAGE_RLD): return "rld"; case(FZ_IMAGE_BMP): return "bmp"; case(FZ_IMAGE_GIF): return "gif"; case(FZ_IMAGE_JBIG2): return "jb2"; case(FZ_IMAGE_JPEG): return "jpeg"; case(FZ_IMAGE_JPX): return "jpx"; case(FZ_IMAGE_JXR): return "jxr"; case(FZ_IMAGE_PNG): return "png"; case(FZ_IMAGE_PNM): return "pnm"; case(FZ_IMAGE_TIFF): return "tiff"; default: return "n/a"; } } void JM_make_image_block(fz_stext_block *block, PyObject *block_dict) { fz_context* ctx = mupdf::internal_context_get(); fz_image *image = block->u.i.image; fz_buffer *buf = NULL, *freebuf = NULL, *mask_buf = NULL; fz_compressed_buffer *buffer = fz_compressed_image_buffer(ctx, image); fz_var(buf); fz_var(freebuf); fz_var(mask_buf); int n = fz_colorspace_n(ctx, image->colorspace); int w = image->w; int h = image->h; const char *ext = ""; int type = FZ_IMAGE_UNKNOWN; if (buffer) { type = buffer->params.type; ext = JM_image_extension(type); } if (type < FZ_IMAGE_BMP || type == FZ_IMAGE_JBIG2) type = FZ_IMAGE_UNKNOWN; PyObject *bytes = NULL; fz_var(bytes); PyObject *mask_bytes = NULL; fz_var(mask_bytes); fz_try(ctx) { if (!buffer || type == FZ_IMAGE_UNKNOWN) { buf = freebuf = fz_new_buffer_from_image_as_png(ctx, image, fz_default_color_params); ext = "png"; } else if (n == 4 && strcmp(ext, "jpeg") == 0) // JPEG CMYK needs another step { buf = freebuf = fz_new_buffer_from_image_as_jpeg(ctx, image, fz_default_color_params, 95, 1); } else { 
buf = buffer->buffer; } bytes = JM_BinFromBuffer(buf); if (image->mask) { mask_buf = fz_new_buffer_from_image_as_png(ctx, image->mask, fz_default_color_params); mask_bytes = JM_BinFromBuffer(mask_buf); } else { mask_bytes = Py_BuildValue("s", NULL); } } fz_always(ctx) { if (!bytes) bytes = PyBytes_FromString(""); DICT_SETITEM_DROP(block_dict, dictkey_width, Py_BuildValue("i", w)); DICT_SETITEM_DROP(block_dict, dictkey_height, Py_BuildValue("i", h)); DICT_SETITEM_DROP(block_dict, dictkey_ext, Py_BuildValue("s", ext)); DICT_SETITEM_DROP(block_dict, dictkey_colorspace, Py_BuildValue("i", n)); DICT_SETITEM_DROP(block_dict, dictkey_xres, Py_BuildValue("i", image->xres)); DICT_SETITEM_DROP(block_dict, dictkey_yres, Py_BuildValue("i", image->xres)); DICT_SETITEM_DROP(block_dict, dictkey_bpc, Py_BuildValue("i", (int) image->bpc)); DICT_SETITEM_DROP(block_dict, dictkey_matrix, JM_py_from_matrix(block->u.i.transform)); DICT_SETITEM_DROP(block_dict, dictkey_size, Py_BuildValue("n", PyBytes_Size(bytes))); DICT_SETITEM_DROP(block_dict, dictkey_image, bytes); DICT_SETITEMSTR_DROP(block_dict, "mask", mask_bytes); fz_drop_buffer(ctx, mask_buf); fz_drop_buffer(ctx, freebuf); } fz_catch(ctx) {;} return; } static void JM_make_text_block(fz_stext_block *block, PyObject *block_dict, int raw, fz_buffer *buff, fz_rect tp_rect) { fz_stext_line *line; PyObject *line_list = PyList_New(0), *line_dict; fz_rect block_rect = fz_empty_rect; for (line = block->u.t.first_line; line; line = line->next) { if (fz_is_empty_rect(fz_intersect_rect(tp_rect, line->bbox)) && !fz_is_infinite_rect(tp_rect)) { continue; } line_dict = PyDict_New(); mupdf::FzStextLine line2(line); mupdf::FzBuffer buff2( mupdf::ll_fz_keep_buffer( buff)); mupdf::FzRect tp_rect2( tp_rect); mupdf::FzRect line_rect2 = JM_make_spanlist( line_dict, line2, raw, buff2, tp_rect2 ); fz_rect& line_rect = *line_rect2.internal(); block_rect = fz_union_rect(block_rect, line_rect); DICT_SETITEM_DROP(line_dict, dictkey_wmode, Py_BuildValue("i", 
line->wmode)); DICT_SETITEM_DROP(line_dict, dictkey_dir, JM_py_from_point(line->dir)); DICT_SETITEM_DROP(line_dict, dictkey_bbox, JM_py_from_rect(line_rect)); LIST_APPEND_DROP(line_list, line_dict); } DICT_SETITEM_DROP(block_dict, dictkey_bbox, JM_py_from_rect(block_rect)); DICT_SETITEM_DROP(block_dict, dictkey_lines, line_list); return; } void JM_make_textpage_dict(fz_stext_page *tp, PyObject *page_dict, int raw) { fz_context* ctx = mupdf::internal_context_get(); fz_stext_block *block; fz_buffer *text_buffer = fz_new_buffer(ctx, 128); PyObject *block_dict, *block_list = PyList_New(0); fz_rect tp_rect = tp->mediabox; int block_n = -1; for (block = tp->first_block; block; block = block->next) { block_n++; if (!fz_contains_rect(tp_rect, block->bbox) && !fz_is_infinite_rect(tp_rect) && block->type == FZ_STEXT_BLOCK_IMAGE) { continue; } if (!fz_is_infinite_rect(tp_rect) && fz_is_empty_rect(fz_intersect_rect(tp_rect, block->bbox))) { continue; } block_dict = PyDict_New(); DICT_SETITEM_DROP(block_dict, dictkey_number, Py_BuildValue("i", block_n)); DICT_SETITEM_DROP(block_dict, dictkey_type, Py_BuildValue("i", block->type)); if (block->type == FZ_STEXT_BLOCK_IMAGE) { DICT_SETITEM_DROP(block_dict, dictkey_bbox, JM_py_from_rect(block->bbox)); JM_make_image_block(block, block_dict); } else { JM_make_text_block(block, block_dict, raw, text_buffer, tp_rect); } LIST_APPEND_DROP(block_list, block_dict); } DICT_SETITEM_DROP(page_dict, dictkey_blocks, block_list); fz_drop_buffer(ctx, text_buffer); } //----------------------------------------------------------------- // get one pixel as a list //----------------------------------------------------------------- PyObject *pixmap_pixel(fz_pixmap* pm, int x, int y) { fz_context* ctx = mupdf::internal_context_get(); PyObject *p = NULL; if (0 || x < 0 || x >= pm->w || y < 0 || y >= pm->h ) { throw std::range_error( MSG_PIXEL_OUTSIDE); } int n = pm->n; int stride = fz_pixmap_stride(ctx, pm); int i = stride * y + n * x; p = PyTuple_New(n); 
for (int j = 0; j < n; j++) { PyTuple_SET_ITEM(p, j, Py_BuildValue("i", pm->samples[i + j])); } return p; } int pixmap_n(mupdf::FzPixmap& pixmap) { return mupdf::fz_pixmap_components( pixmap); } static int JM_INT_ITEM(PyObject *obj, Py_ssize_t idx, int *result) { PyObject *temp = PySequence_ITEM(obj, idx); if (!temp) return 1; if (PyLong_Check(temp)) { *result = (int) PyLong_AsLong(temp); Py_DECREF(temp); } else if (PyFloat_Check(temp)) { *result = (int) PyFloat_AsDouble(temp); Py_DECREF(temp); } else { Py_DECREF(temp); return 1; } if (PyErr_Occurred()) { PyErr_Clear(); return 1; } return 0; } PyObject *set_pixel(fz_pixmap* pm, int x, int y, PyObject *color) { fz_context* ctx = mupdf::internal_context_get(); if (0 || x < 0 || x >= pm->w || y < 0 || y >= pm->h ) { throw std::range_error( MSG_PIXEL_OUTSIDE); } int n = pm->n; if (!PySequence_Check(color) || PySequence_Size(color) != n) { throw std::range_error(MSG_BAD_COLOR_SEQ); } int i, j; unsigned char c[5]; for (j = 0; j < n; j++) { if (JM_INT_ITEM(color, j, &i) == 1) { throw std::range_error(MSG_BAD_COLOR_SEQ); } if (i < 0 or i >= 256) { throw std::range_error(MSG_BAD_COLOR_SEQ); } c[j] = (unsigned char) i; } int stride = fz_pixmap_stride(ctx, pm); i = stride * y + n * x; for (j = 0; j < n; j++) { pm->samples[i + j] = c[j]; } Py_RETURN_NONE; } //------------------------------------------- // make a buffer from an stext_page's text //------------------------------------------- fz_buffer * JM_new_buffer_from_stext_page(fz_stext_page *page) { fz_context* ctx = mupdf::internal_context_get(); fz_stext_block *block; fz_stext_line *line; fz_stext_char *ch; fz_rect rect = page->mediabox; fz_buffer *buf = NULL; fz_try(ctx) { buf = fz_new_buffer(ctx, 256); for (block = page->first_block; block; block = block->next) { if (block->type == FZ_STEXT_BLOCK_TEXT) { for (line = block->u.t.first_line; line; line = line->next) { for (ch = line->first_char; ch; ch = ch->next) { if (!JM_rects_overlap(rect, JM_char_bbox(line, ch)) && 
!fz_is_infinite_rect(rect)) { continue; } fz_append_rune(ctx, buf, ch->c); } fz_append_byte(ctx, buf, '\n'); } fz_append_byte(ctx, buf, '\n'); } } } fz_catch(ctx) { fz_drop_buffer(ctx, buf); mupdf::internal_throw_exception(ctx); } return buf; } static inline int canon(int c) { /* TODO: proper unicode case folding */ /* TODO: character equivalence (a matches ä, etc) */ if (c == 0xA0 || c == 0x2028 || c == 0x2029) return ' '; if (c == '\r' || c == '\n' || c == '\t') return ' '; if (c >= 'A' && c <= 'Z') return c - 'A' + 'a'; return c; } static inline int chartocanon(int *c, const char *s) { int n = fz_chartorune(c, s); *c = canon(*c); return n; } static const char *match_string(const char *h, const char *n) { int hc, nc; const char *e = h; h += chartocanon(&hc, h); n += chartocanon(&nc, n); while (hc == nc) { e = h; if (hc == ' ') do h += chartocanon(&hc, h); while (hc == ' '); else h += chartocanon(&hc, h); if (nc == ' ') do n += chartocanon(&nc, n); while (nc == ' '); else n += chartocanon(&nc, n); } return nc == 0 ? e : NULL; } static const char *find_string(const char *s, const char *needle, const char **endp) { const char *end; while (*s) { end = match_string(s, needle); if (end) { *endp = end; return s; } ++s; } *endp = NULL; return NULL; } struct highlight { Py_ssize_t len; PyObject *quads; float hfuzz, vfuzz; }; static int JM_FLOAT_ITEM(PyObject *obj, Py_ssize_t idx, double *result) { PyObject *temp = PySequence_ITEM(obj, idx); if (!temp) return 1; *result = PyFloat_AsDouble(temp); Py_DECREF(temp); if (PyErr_Occurred()) { PyErr_Clear(); return 1; } return 0; } //----------------------------------------------------------------------------- // fz_quad from PySequence. Four floats are treated as rect. // Else must be four pairs of floats. 
//----------------------------------------------------------------------------- static fz_quad JM_quad_from_py(PyObject *r) { fz_quad q = fz_make_quad(FZ_MIN_INF_RECT, FZ_MIN_INF_RECT, FZ_MAX_INF_RECT, FZ_MIN_INF_RECT, FZ_MIN_INF_RECT, FZ_MAX_INF_RECT, FZ_MAX_INF_RECT, FZ_MAX_INF_RECT); fz_point p[4]; double test, x, y; Py_ssize_t i; PyObject *obj = NULL; if (!r || !PySequence_Check(r) || PySequence_Size(r) != 4) return q; if (JM_FLOAT_ITEM(r, 0, &test) == 0) return fz_quad_from_rect(JM_rect_from_py(r)); for (i = 0; i < 4; i++) { obj = PySequence_ITEM(r, i); // next point item if (!obj || !PySequence_Check(obj) || PySequence_Size(obj) != 2) goto exit_result; // invalid: cancel the rest if (JM_FLOAT_ITEM(obj, 0, &x) == 1) goto exit_result; if (JM_FLOAT_ITEM(obj, 1, &y) == 1) goto exit_result; if (x < FZ_MIN_INF_RECT) x = FZ_MIN_INF_RECT; if (y < FZ_MIN_INF_RECT) y = FZ_MIN_INF_RECT; if (x > FZ_MAX_INF_RECT) x = FZ_MAX_INF_RECT; if (y > FZ_MAX_INF_RECT) y = FZ_MAX_INF_RECT; p[i] = fz_make_point((float) x, (float) y); Py_CLEAR(obj); } q.ul = p[0]; q.ur = p[1]; q.ll = p[2]; q.lr = p[3]; return q; exit_result:; Py_CLEAR(obj); return q; } static float hdist(fz_point *dir, fz_point *a, fz_point *b) { float dx = b->x - a->x; float dy = b->y - a->y; return fz_abs(dx * dir->x + dy * dir->y); } static float vdist(fz_point *dir, fz_point *a, fz_point *b) { float dx = b->x - a->x; float dy = b->y - a->y; return fz_abs(dx * dir->y + dy * dir->x); } static void on_highlight_char(fz_context *ctx, void *arg, fz_stext_line *line, fz_stext_char *ch) { struct highlight* hits = (struct highlight*) arg; float vfuzz = ch->size * hits->vfuzz; float hfuzz = ch->size * hits->hfuzz; fz_quad ch_quad = JM_char_quad(line, ch); if (hits->len > 0) { PyObject *quad = PySequence_ITEM(hits->quads, hits->len - 1); fz_quad end = JM_quad_from_py(quad); Py_DECREF(quad); if (hdist(&line->dir, &end.lr, &ch_quad.ll) < hfuzz && vdist(&line->dir, &end.lr, &ch_quad.ll) < vfuzz && hdist(&line->dir, &end.ur, 
&ch_quad.ul) < hfuzz && vdist(&line->dir, &end.ur, &ch_quad.ul) < vfuzz) { end.ur = ch_quad.ur; end.lr = ch_quad.lr; quad = JM_py_from_quad(end); PyList_SetItem(hits->quads, hits->len - 1, quad); return; } } LIST_APPEND_DROP(hits->quads, JM_py_from_quad(ch_quad)); hits->len++; } PyObject* JM_search_stext_page(fz_stext_page *page, const char *needle) { fz_context* ctx = mupdf::internal_context_get(); struct highlight hits; fz_stext_block *block; fz_stext_line *line; fz_stext_char *ch; fz_buffer *buffer = NULL; const char *haystack, *begin, *end; fz_rect rect = page->mediabox; int c, inside; if (strlen(needle) == 0) Py_RETURN_NONE; PyObject *quads = PyList_New(0); hits.len = 0; hits.quads = quads; hits.hfuzz = 0.2f; /* merge kerns but not large gaps */ hits.vfuzz = 0.1f; fz_try(ctx) { buffer = JM_new_buffer_from_stext_page( page); haystack = fz_string_from_buffer(ctx, buffer); begin = find_string(haystack, needle, &end); if (!begin) goto no_more_matches; inside = 0; for (block = page->first_block; block; block = block->next) { if (block->type != FZ_STEXT_BLOCK_TEXT) { continue; } for (line = block->u.t.first_line; line; line = line->next) { for (ch = line->first_char; ch; ch = ch->next) { if (!fz_is_infinite_rect(rect) && !JM_rects_overlap(rect, JM_char_bbox(line, ch))) { goto next_char; } try_new_match: if (!inside) { if (haystack >= begin) inside = 1; } if (inside) { if (haystack < end) { on_highlight_char(ctx, &hits, line, ch); } else { inside = 0; begin = find_string(haystack, needle, &end); if (!begin) goto no_more_matches; else goto try_new_match; } } haystack += fz_chartorune(&c, haystack); next_char:; } assert(*haystack == '\n'); ++haystack; } assert(*haystack == '\n'); ++haystack; } no_more_matches:; } fz_always(ctx) fz_drop_buffer(ctx, buffer); fz_catch(ctx) mupdf::internal_throw_exception(ctx); return quads; } void pixmap_copy( fz_pixmap* pm, const fz_pixmap* src, int n) { assert(pm->w == src->w); assert(pm->h == src->h); assert(n <= pm->n); assert(n <= 
src->n); if (pm->n == src->n) { // identical samples assert(pm->stride == src->stride); memcpy(pm->samples, src->samples, pm->w * pm->h * pm->n); } else { int nn; int do_alpha; if (pm->n > src->n) { assert(pm->n == src->n + 1); nn = src->n; assert(!src->alpha); assert(pm->alpha); do_alpha = 1; } else { assert(src->n == pm->n + 1); nn = pm->n; assert(src->alpha); assert(!pm->alpha); do_alpha = 0; } for (int y=0; y<pm->h; ++y) { for (int x=0; x<pm->w; ++x) { memcpy( pm->samples + pm->stride * y + pm->n * x, src->samples + src->stride * y + src->n * x, nn ); if (do_alpha) { pm->samples[pm->stride * y + pm->n * x + pm->n-1] = 255; } } } } } PyObject* ll_JM_color_count(fz_pixmap *pm, PyObject *clip) { fz_context* ctx = mupdf::internal_context_get(); PyObject* rc = PyDict_New(); fz_irect irect = fz_pixmap_bbox(ctx, pm); irect = fz_intersect_irect(irect, fz_round_rect(JM_rect_from_py(clip))); if (fz_is_empty_irect(irect)) { return rc; } size_t stride = pm->stride; size_t width = irect.x1 - irect.x0; size_t height = irect.y1 - irect.y0; size_t n = (size_t) pm->n; size_t substride = width * n; unsigned char* s = pm->samples + stride * (irect.y0 - pm->y) + n * (irect.x0 - pm->x); // Cache previous pixel. char oldpix[10]; assert(n <= sizeof(oldpix)); memcpy(oldpix, s, n); long cnt = 0; for (size_t i = 0; i < height; i++) { for (size_t j = 0; j < substride; j += n) { const char* newpix = (const char*) s + j; if (memcmp(oldpix, newpix, n)) { /* Pixel differs from previous pixel, so update results with last run of pixels. We get a PyObject representation of pixel so we can look up in Python dict <rc>. */ PyObject* pixel = PyBytes_FromStringAndSize(&oldpix[0], n); PyObject* c = PyDict_GetItem(rc, pixel); if (c) cnt += PyLong_AsLong(c); DICT_SETITEM_DROP(rc, pixel, PyLong_FromLong(cnt)); Py_DECREF(pixel); /* Start next run of identical pixels. */ cnt = 1; memcpy(oldpix, newpix, n); } else { cnt += 1; } } s += stride; } /* Update results with last pixel. 
*/ PyObject* pixel = PyBytes_FromStringAndSize(&oldpix[0], n); PyObject* c = PyDict_GetItem(rc, pixel); if (c) cnt += PyLong_AsLong(c); DICT_SETITEM_DROP(rc, pixel, PyLong_FromLong(cnt)); Py_DECREF(pixel); PyErr_Clear(); return rc; } %} /* Declarations for functions defined above. */ void page_merge( mupdf::PdfDocument& doc_des, mupdf::PdfDocument& doc_src, int page_from, int page_to, int rotate, int links, int copy_annots, mupdf::PdfGraftMap& graft_map ); void JM_merge_range( mupdf::PdfDocument& doc_des, mupdf::PdfDocument& doc_src, int spage, int epage, int apage, int rotate, int links, int annots, int show_progress, mupdf::PdfGraftMap& graft_map ); void FzDocument_insert_pdf( mupdf::FzDocument& doc, mupdf::FzDocument& src, int from_page, int to_page, int start_at, int rotate, int links, int annots, int show_progress, int final, mupdf::PdfGraftMap& graft_map ); int page_xref(mupdf::FzDocument& this_doc, int pno); void _newPage(mupdf::FzDocument& self, int pno=-1, float width=595, float height=842); void _newPage(mupdf::PdfDocument& self, int pno=-1, float width=595, float height=842); void JM_add_annot_id(mupdf::PdfAnnot& annot, const char* stem); void JM_set_annot_callout_line(mupdf::PdfAnnot& annot, PyObject *callout, int count); std::vector< std::string> JM_get_annot_id_list(mupdf::PdfPage& page); mupdf::PdfAnnot _add_caret_annot(mupdf::PdfPage& self, mupdf::FzPoint& point); mupdf::PdfAnnot _add_caret_annot(mupdf::FzPage& self, mupdf::FzPoint& point); const char* Tools_parse_da(mupdf::PdfAnnot& this_annot); PyObject* Annot_getAP(mupdf::PdfAnnot& annot); void Tools_update_da(mupdf::PdfAnnot& this_annot, const char* da_str); mupdf::FzPoint JM_point_from_py(PyObject* p); mupdf::FzRect Annot_rect(mupdf::PdfAnnot& annot); PyObject* util_transform_rect(PyObject* rect, PyObject* matrix); PyObject* Annot_rect3(mupdf::PdfAnnot& annot); mupdf::FzMatrix Page_derotate_matrix(mupdf::PdfPage& pdfpage); mupdf::FzMatrix Page_derotate_matrix(mupdf::FzPage& pdfpage); PyObject* 
JM_get_annot_xref_list(const mupdf::PdfObj& page_obj); PyObject* xref_object(mupdf::PdfDocument& pdf, int xref, int compressed=0, int ascii=0); PyObject* xref_object(mupdf::FzDocument& document, int xref, int compressed=0, int ascii=0); PyObject* Link_is_external(mupdf::FzLink& this_link); PyObject* Page_addAnnot_FromString(mupdf::PdfPage& page, PyObject* linklist); PyObject* Page_addAnnot_FromString(mupdf::FzPage& page, PyObject* linklist); mupdf::FzLink Link_next(mupdf::FzLink& this_link); static int page_count_fz2(void* document); int page_count_fz(mupdf::FzDocument& document); int page_count_pdf(mupdf::PdfDocument& pdf); int page_count(mupdf::FzDocument& document); int page_count(mupdf::PdfDocument& pdf); PyObject* page_annot_xrefs(mupdf::PdfDocument& pdf, int pno); PyObject* page_annot_xrefs(mupdf::FzDocument& document, int pno); bool Outline_is_external(mupdf::FzOutline* outline); void Document_extend_toc_items(mupdf::PdfDocument& pdf, PyObject* items); void Document_extend_toc_items(mupdf::FzDocument& document, PyObject* items); int ll_fz_absi(int i); mupdf::FzDevice JM_new_texttrace_device(PyObject* out); fz_rect JM_char_bbox(const mupdf::FzStextLine& line, const mupdf::FzStextChar& ch); static fz_quad JM_char_quad( fz_stext_line *line, fz_stext_char *ch); void JM_print_stext_page_as_text(mupdf::FzBuffer& res, mupdf::FzStextPage& page); void set_skip_quad_corrections(int on); void set_subset_fontnames(int on); void set_small_glyph_heights(int on); mupdf::FzRect JM_cropbox(mupdf::PdfObj& page_obj); PyObject* get_cdrawings(mupdf::FzPage& page, PyObject *extended=NULL, PyObject *callback=NULL, PyObject *method=NULL); mupdf::FzRect JM_make_spanlist( PyObject *line_dict, mupdf::FzStextLine& line, int raw, mupdf::FzBuffer& buff, mupdf::FzRect& tp_rect ); PyObject* extractWORDS(mupdf::FzStextPage& this_tpage, PyObject *delimiters); PyObject* extractBLOCKS(mupdf::FzStextPage& self); PyObject* link_uri(mupdf::FzLink& link); fz_stext_page* page_get_textpage( 
mupdf::FzPage& self, PyObject* clip, int flags, PyObject* matrix ); void JM_make_textpage_dict(fz_stext_page *tp, PyObject *page_dict, int raw); PyObject *pixmap_pixel(fz_pixmap* pm, int x, int y); int pixmap_n(mupdf::FzPixmap& pixmap); PyObject* JM_search_stext_page(fz_stext_page *page, const char *needle); PyObject *set_pixel(fz_pixmap* pm, int x, int y, PyObject *color); /* Copies from <src> to <pm>, which must have same width and height. pm->n - src->n must be -1, 0 or +1. If -1, <src> must have alpha and <pm> must not have alpha, and we copy the non-alpha bytes. If +1 <src> must not have alpha and <pm> must have alpha and we set <pm>'s alpha bytes all to 255.*/ void pixmap_copy(fz_pixmap* pm, const fz_pixmap* src, int n); PyObject* ll_JM_color_count(fz_pixmap *pm, PyObject *clip);
