Mercurial > hgrepos > Python2 > PyMuPDF
view src/__init__.py @ 46:7ee69f120f19 default tip
>>>>> tag v1.26.5+1 for changeset b74429b0f5c4
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Sat, 11 Oct 2025 17:17:30 +0200 |
| parents | 4621bd954a09 |
| children |
line wrap: on
line source
'''
PyMuPDF implemented on top of MuPDF Python bindings.

License:

    SPDX-License-Identifier: GPL-3.0-only
'''

# To reduce startup times, we don't import everything we require here.
#
import atexit
import binascii
import collections
import inspect
import io
import math
import os
import packaging.version
import pathlib
import glob
import re
import string
import sys
import tarfile
import time
import typing
import warnings
import weakref
import zipfile

from . import extra


# Set up g_out_log and g_out_message from environment variables.
#
# PYMUPDF_MESSAGE controls the destination of user messages (from function
# `pymupdf.message()`).
#
# PYMUPDF_LOG controls the destination of internal development logging (from
# function `pymupdf.log()`).
#
# For syntax, see _make_output()'s `text` arg.
#

def _make_output(
        *,
        text=None,
        fd=None,
        stream=None,
        path=None,
        path_append=None,
        pylogging=None,
        pylogging_logger=None,
        pylogging_level=None,
        pylogging_name=None,
        default=None,
        ):
    '''
    Returns a stream that writes to a specified destination, which can be a
    file descriptor, a file, an existing stream or Python's `logging` system.

    Args:
        text: text specification of destination.
            fd:<int> - write to file descriptor.
            path:<str> - write to file.
            path+:<str> - append to file.
            logging:<items> - write to Python `logging` module.
                items: comma-separated <name=value> pairs.
                    level=<int>
                    name=<str>.
                Other names are ignored.

        fd: an int file descriptor.
        stream: something with methods .write(text) and .flush().
            If specified we simply return <stream>.
        path: a file path.
            If specified we return a stream that writes to this file.
        path_append: a file path.
            If specified we return a stream that appends to this file.
        pylogging*:
            if any of these args is not None, we return a stream that writes to
            Python's `logging` module.

        pylogging:
            Unused other than to activate use of logging module.
        pylogging_logger:
            A logging.Logger; If None, set from <pylogging_name>.
        pylogging_level:
            An int log level, if None we use
            pylogging_logger.getEffectiveLevel().
        pylogging_name:
            Only used if <pylogging_logger> is None:
                If <pylogging_name> is None, we set it to 'pymupdf'.
                Then we do: pylogging_logger = logging.getLogger(pylogging_name)
        default:
            Returned unchanged if no destination is specified by any of the
            other args.

    Returns:
        An object with .write() and .flush() methods, or <default>.

    Precedence if several destinations are given: fd, stream, path,
    path_append, pylogging*.
    '''
    if text is not None:
        # Textual specification, for example from an environment variable.
        if text.startswith('fd:'):
            fd = int(text[3:])
        elif text.startswith('path:'):
            path = text[5:]
        elif text.startswith('path+:'):
            # Documented form. The prefix is six characters long; slicing at
            # 5 would leave the ':' at the start of the file name.
            path_append = text[6:]
        elif text.startswith('path+'):
            # Legacy colon-less form `path+<file>`; kept so that existing
            # settings continue to work.
            path_append = text[5:]
        elif text.startswith('logging:'):
            pylogging = True
            items_d = dict()
            items = text[8:].split(',')
            for item in items:
                if not item:
                    continue
                nv = item.split('=', 1)
                assert len(nv) == 2, f'Need `=` in {item=}.'
                n, v = nv
                items_d[n] = v
            pylogging_level = items_d.get('level')
            if pylogging_level is not None:
                pylogging_level = int(pylogging_level)
            pylogging_name = items_d.get('name', 'pymupdf')
        else:
            assert 0, f'Expected prefix `fd:`, `path:`. `path+:` or `logging:` in {text=}.'

    if fd is not None:
        # closefd=False: we do not own the descriptor, so never close it.
        ret = open(fd, mode='w', closefd=False)
    elif stream is not None:
        assert hasattr(stream, 'write')
        assert hasattr(stream, 'flush')
        ret = stream
    elif path is not None:
        ret = open(path, 'w')
    elif path_append is not None:
        ret = open(path_append, 'a')
    elif (0
            or pylogging is not None
            or pylogging_logger is not None
            or pylogging_level is not None
            or pylogging_name is not None
            ):
        # Imported lazily to keep startup time down.
        import logging
        if pylogging_logger is None:
            if pylogging_name is None:
                pylogging_name = 'pymupdf'
            pylogging_logger = logging.getLogger(pylogging_name)
        assert isinstance(pylogging_logger, logging.Logger)
        if pylogging_level is None:
            pylogging_level = pylogging_logger.getEffectiveLevel()
        class Out:
            def write(self, text):
                # `logging` module appends newlines, but so does the `print()`
                # functions in our caller message() and log() fns, so we need to
                # remove them here.
                text = text.rstrip('\n')
                if text:
                    pylogging_logger.log(pylogging_level, text)
            def flush(self):
                pass
        ret = Out()
    else:
        ret = default
    return ret

# Set stream used by PyMuPDF messaging.
_g_out_message = _make_output(text=os.environ.get('PYMUPDF_MESSAGE'), default=sys.stdout)

# Set stream used by PyMuPDF development/debugging logging.
_g_out_log = _make_output(text=os.environ.get('PYMUPDF_LOG'), default=sys.stdout)

# Things for testing logging.
_g_log_items = list()
_g_log_items_active = False

def _log_items():
    '''
    Returns the list of log texts collected while item collection is active.
    '''
    return _g_log_items

def _log_items_active(active):
    '''
    Enables/disables collection of log() texts into the list returned by
    _log_items().
    '''
    global _g_log_items_active
    _g_log_items_active = active

def _log_items_clear():
    '''
    Empties the list of collected log texts in place.
    '''
    del _g_log_items[:]


def set_messages(
        *,
        text=None,
        fd=None,
        stream=None,
        path=None,
        path_append=None,
        pylogging=None,
        pylogging_logger=None,
        pylogging_level=None,
        pylogging_name=None,
        ):
    '''
    Sets destination of PyMuPDF messages. See _make_output() for details.

    If no destination is specified, the current destination is kept.
    '''
    global _g_out_message
    _g_out_message = _make_output(
            text=text,
            fd=fd,
            stream=stream,
            path=path,
            path_append=path_append,
            pylogging=pylogging,
            pylogging_logger=pylogging_logger,
            pylogging_level=pylogging_level,
            pylogging_name=pylogging_name,
            default=_g_out_message,
            )

def set_log(
        *,
        text=None,
        fd=None,
        stream=None,
        path=None,
        path_append=None,
        pylogging=None,
        pylogging_logger=None,
        pylogging_level=None,
        pylogging_name=None,
        ):
    '''
    Sets destination of PyMuPDF development/debugging logging. See
    _make_output() for details.

    If no destination is specified, the current destination is kept.
    '''
    global _g_out_log
    _g_out_log = _make_output(
            text=text,
            fd=fd,
            stream=stream,
            path=path,
            path_append=path_append,
            pylogging=pylogging,
            pylogging_logger=pylogging_logger,
            pylogging_level=pylogging_level,
            pylogging_name=pylogging_name,
            default=_g_out_log,
            )

def log( text='', caller=1):
    '''
    For development/debugging diagnostics.

    Args:
        text: the text to log.
        caller: index into the call stack of the frame whose file name, line
            number and function name are prefixed to <text>; 1 is our direct
            caller.
    '''
    try:
        stack = inspect.stack(context=0)
    except StopIteration:
        # No stack information available; log <text> without a prefix.
        pass
    else:
        frame_record = stack[caller]
        try:
            filename = os.path.relpath(frame_record.filename)
        except Exception:   # Can fail on windows.
            filename = frame_record.filename
        line = frame_record.lineno
        function = frame_record.function
        text = f'{filename}:{line}:{function}(): {text}'
    if _g_log_items_active:
        _g_log_items.append(text)
    if _g_out_log:
        print(text, file=_g_out_log, flush=1)


def message(text=''):
    '''
    For user messages.
    '''
    # It looks like `print()` does nothing if sys.stdout is None (without
    # raising an exception), but we don't rely on this.
    if _g_out_message:
        print(text, file=_g_out_message, flush=1)


def exception_info():
    '''
    Logs the traceback of the exception currently being handled.
    '''
    import traceback
    log('exception_info:')
    log(traceback.format_exc())


# PDF names must not contain these characters:
INVALID_NAME_CHARS = set(string.whitespace + "()<>[]{}/%" + chr(0))

def get_env_bool( name, default):
    '''
    Returns `True`, `False` or `default` depending on whether $<name> is '1',
    '0' or unset. Otherwise assert-fails.
    '''
    v = os.environ.get( name)
    if v is None:
        ret = default
    elif v == '1':
        ret = True
    elif v == '0':
        ret = False
    else:
        assert 0, f'Unrecognised value for {name}: {v!r}'
    if ret != default:
        log(f'Using non-default setting from {name}: {v!r}')
    return ret

def get_env_int( name, default):
    '''
    Returns $<name> converted with `int()`, or `default` if $<name> is unset.
    Raises ValueError if $<name> is set but is not a valid integer.
    '''
    v = os.environ.get( name)
    if v is None:
        ret = default
    else:
        ret = int(v)
    if ret != default:
        log(f'Using non-default setting from {name}: {v}')
    return ret

# All our `except ...` blocks output diagnostics if `g_exceptions_verbose` is
# true.
g_exceptions_verbose = get_env_int( 'PYMUPDF_EXCEPTIONS_VERBOSE', 1)

# $PYMUPDF_USE_EXTRA overrides whether to use optimised C fns in `extra`.
#
g_use_extra = get_env_bool( 'PYMUPDF_USE_EXTRA', True)


# Global switches
#

class _Globals:
    '''
    Holder for global integer switches; all start as 0.
    '''
    def __init__(self):
        self.no_device_caching = 0
        self.small_glyph_heights = 0
        self.subset_fontnames = 0
        self.skip_quad_corrections = 0

_globals = _Globals()


# Optionally use MuPDF via cppyy bindings; experimental and not tested recently
# as of 2023-01-20 11:51:40
#
mupdf_cppyy = os.environ.get( 'MUPDF_CPPYY')
if mupdf_cppyy is not None:
    # pylint: disable=all
    log( f'{__file__}: $MUPDF_CPPYY={mupdf_cppyy!r} so attempting to import mupdf_cppyy.')
    # Use .get() so that an unset $PYTHONPATH cannot abort the import with a
    # KeyError raised by a purely diagnostic message.
    log( f'{__file__}: $PYTHONPATH={os.environ.get("PYTHONPATH")}')
    if mupdf_cppyy == '':
        # Empty value: import mupdf_cppyy from the normal module search path.
        import mupdf_cppyy
    else:
        # Non-empty value: treat it as the path of the mupdf_cppyy source file.
        # Import the submodule explicitly rather than relying on it already
        # being an attribute of `importlib`.
        import importlib.machinery
        mupdf_cppyy = importlib.machinery.SourceFileLoader(
                'mupdf_cppyy',
                mupdf_cppyy
                ).load_module()
    mupdf = mupdf_cppyy.cppyy.gbl.mupdf
else:
    # Use MuPDF Python SWIG bindings. We allow import from either our own
    # directory for conventional wheel installs, or from separate place in case
    # we are using a separately-installed system installation of mupdf.
    #
    try:
        from . import mupdf
    except Exception:
        import mupdf
    if hasattr(mupdf, 'internal_check_ndebug'):
        mupdf.internal_check_ndebug()
    mupdf.reinit_singlethreaded()

def _int_rc(text):
    '''
    Converts string to int, ignoring trailing 'rc...'.
    '''
    rc = text.find('rc')
    if rc >= 0:
        text = text[:rc]
    return int(text)

# Basic version information.
#
# (We use `noqa F401` to avoid flake8 errors such as `F401
# '._build.mupdf_location' imported but unused`.)
#
from ._build import mupdf_location # noqa F401
from ._build import pymupdf_git_branch # noqa F401
from ._build import pymupdf_git_diff # noqa F401
from ._build import pymupdf_git_sha # noqa F401
from ._build import pymupdf_version # noqa F401
from ._build import pymupdf_version_tuple # noqa F401
from ._build import swig_version # noqa F401
from ._build import swig_version_tuple # noqa F401

mupdf_version = mupdf.FZ_VERSION

# Removed in PyMuPDF-1.26.1.
pymupdf_date = None

# Versions as tuples; useful when comparing versions.
# mupdf_version_tuple = packaging.version.Version(mupdf_version).release assert mupdf_version_tuple == (mupdf.FZ_VERSION_MAJOR, mupdf.FZ_VERSION_MINOR, mupdf.FZ_VERSION_PATCH), \ f'Inconsistent MuPDF version numbers: {mupdf_version_tuple=} != {(mupdf.FZ_VERSION_MAJOR, mupdf.FZ_VERSION_MINOR, mupdf.FZ_VERSION_PATCH)=}' # Legacy version information. # version = (pymupdf_version, mupdf_version, None) VersionFitz = mupdf_version VersionBind = pymupdf_version VersionDate = None # String formatting. def _format_g(value, *, fmt='%g'): ''' Returns `value` formatted with mupdf.fz_format_double() if available, otherwise with Python's `%`. If `value` is a list or tuple, we return a space-separated string of formatted values. ''' if isinstance(value, (list, tuple)): ret = '' for v in value: if ret: ret += ' ' ret += _format_g(v, fmt=fmt) return ret else: return mupdf.fz_format_double(fmt, value) format_g = _format_g # ByteString is gone from typing in 3.14. # collections.abc.Buffer available from 3.12 only try: ByteString = typing.ByteString except AttributeError: ByteString = bytes | bytearray | memoryview # Names required by class method typing annotations. OptBytes = typing.Optional[ByteString] OptDict = typing.Optional[dict] OptFloat = typing.Optional[float] OptInt = typing.Union[int, None] OptSeq = typing.Optional[typing.Sequence] OptStr = typing.Optional[str] Page = 'Page_forward_decl' Point = 'Point_forward_decl' matrix_like = 'matrix_like' point_like = 'point_like' quad_like = 'quad_like' rect_like = 'rect_like' def _as_fz_document(document): ''' Returns document as a mupdf.FzDocument, upcasting as required. Raises 'document closed' exception if closed. 
''' if isinstance(document, Document): if document.is_closed: raise ValueError('document closed') document = document.this if isinstance(document, mupdf.FzDocument): return document elif isinstance(document, mupdf.PdfDocument): return document.super() elif document is None: assert 0, f'document is None' else: assert 0, f'Unrecognised {type(document)=}' def _as_pdf_document(document, required=True): ''' Returns `document` downcast to a mupdf.PdfDocument. If downcast fails (i.e. `document` is not actually a `PdfDocument`) then we assert-fail if `required` is true (the default) else return a `mupdf.PdfDocument` with `.m_internal` false. ''' if isinstance(document, Document): if document.is_closed: raise ValueError('document closed') document = document.this if isinstance(document, mupdf.PdfDocument): return document elif isinstance(document, mupdf.FzDocument): ret = mupdf.PdfDocument(document) if required: assert ret.m_internal return ret elif document is None: assert 0, f'document is None' else: assert 0, f'Unrecognised {type(document)=}' def _as_fz_page(page): ''' Returns page as a mupdf.FzPage, upcasting as required. ''' if isinstance(page, Page): page = page.this if isinstance(page, mupdf.PdfPage): return page.super() elif isinstance(page, mupdf.FzPage): return page elif page is None: assert 0, f'page is None' else: assert 0, f'Unrecognised {type(page)=}' def _as_pdf_page(page, required=True): ''' Returns `page` downcast to a mupdf.PdfPage. If downcast fails (i.e. `page` is not actually a `PdfPage`) then we assert-fail if `required` is true (the default) else return a `mupdf.PdfPage` with `.m_internal` false. 
''' if isinstance(page, Page): page = page.this if isinstance(page, mupdf.PdfPage): return page elif isinstance(page, mupdf.FzPage): ret = mupdf.pdf_page_from_fz_page(page) if required: assert ret.m_internal return ret elif page is None: assert 0, f'page is None' else: assert 0, f'Unrecognised {type(page)=}' def _pdf_annot_page(annot): ''' Wrapper for mupdf.pdf_annot_page() which raises an exception if <annot> is not bound to a page instead of returning a mupdf.PdfPage with `.m_internal=None`. [Some other MuPDF functions such as pdf_update_annot()` already raise a similar exception if a pdf_annot's .page field is null.] ''' page = mupdf.pdf_annot_page(annot) if not page.m_internal: raise RuntimeError('Annot is not bound to a page') return page # Fixme: we don't support JM_MEMORY=1. JM_MEMORY = 0 # Classes # class Annot: def __init__(self, annot): assert isinstance( annot, mupdf.PdfAnnot) self.this = annot def __repr__(self): parent = getattr(self, 'parent', '<>') return "'%s' annotation on %s" % (self.type[1], str(parent)) def __str__(self): return self.__repr__() def _erase(self): if getattr(self, "thisown", False): self.thisown = False def _get_redact_values(self): annot = self.this if mupdf.pdf_annot_type(annot) != mupdf.PDF_ANNOT_REDACT: return values = dict() try: obj = mupdf.pdf_dict_gets(mupdf.pdf_annot_obj(annot), "RO") if obj.m_internal: message_warning("Ignoring redaction key '/RO'.") xref = mupdf.pdf_to_num(obj) values[dictkey_xref] = xref obj = mupdf.pdf_dict_gets(mupdf.pdf_annot_obj(annot), "OverlayText") if obj.m_internal: text = mupdf.pdf_to_text_string(obj) values[dictkey_text] = JM_UnicodeFromStr(text) else: values[dictkey_text] = '' obj = mupdf.pdf_dict_get(mupdf.pdf_annot_obj(annot), PDF_NAME('Q')) align = 0 if obj.m_internal: align = mupdf.pdf_to_int(obj) values[dictkey_align] = align except Exception: if g_exceptions_verbose: exception_info() return val = values if not val: return val val["rect"] = self.rect text_color, fontname, fontsize = 
TOOLS._parse_da(self) val["text_color"] = text_color val["fontname"] = fontname val["fontsize"] = fontsize fill = self.colors["fill"] val["fill"] = fill return val def _getAP(self): if g_use_extra: assert isinstance( self.this, mupdf.PdfAnnot) ret = extra.Annot_getAP(self.this) assert isinstance( ret, bytes) return ret else: r = None res = None annot = self.this assert isinstance( annot, mupdf.PdfAnnot) annot_obj = mupdf.pdf_annot_obj( annot) ap = mupdf.pdf_dict_getl( annot_obj, PDF_NAME('AP'), PDF_NAME('N')) if mupdf.pdf_is_stream( ap): res = mupdf.pdf_load_stream( ap) if res and res.m_internal: r = JM_BinFromBuffer(res) return r def _setAP(self, buffer_, rect=0): try: annot = self.this annot_obj = mupdf.pdf_annot_obj( annot) page = _pdf_annot_page(annot) apobj = mupdf.pdf_dict_getl( annot_obj, PDF_NAME('AP'), PDF_NAME('N')) if not apobj.m_internal: raise RuntimeError( MSG_BAD_APN) if not mupdf.pdf_is_stream( apobj): raise RuntimeError( MSG_BAD_APN) res = JM_BufferFromBytes( buffer_) if not res.m_internal: raise ValueError( MSG_BAD_BUFFER) JM_update_stream( page.doc(), apobj, res, 1) if rect: bbox = mupdf.pdf_dict_get_rect( annot_obj, PDF_NAME('Rect')) mupdf.pdf_dict_put_rect( apobj, PDF_NAME('BBox'), bbox) except Exception: if g_exceptions_verbose: exception_info() def _update_appearance(self, opacity=-1, blend_mode=None, fill_color=None, rotate=-1): annot = self.this assert annot.m_internal annot_obj = mupdf.pdf_annot_obj( annot) page = _pdf_annot_page(annot) pdf = page.doc() type_ = mupdf.pdf_annot_type( annot) nfcol, fcol = JM_color_FromSequence(fill_color) try: # remove fill color from unsupported annots # or if so requested if nfcol == 0 or type_ not in ( mupdf.PDF_ANNOT_SQUARE, mupdf.PDF_ANNOT_CIRCLE, mupdf.PDF_ANNOT_LINE, mupdf.PDF_ANNOT_POLY_LINE, mupdf.PDF_ANNOT_POLYGON ): mupdf.pdf_dict_del( annot_obj, PDF_NAME('IC')) elif nfcol > 0: mupdf.pdf_set_annot_interior_color( annot, fcol[:nfcol]) insert_rot = 1 if rotate >= 0 else 0 if type_ not in ( 
mupdf.PDF_ANNOT_CARET, mupdf.PDF_ANNOT_CIRCLE, mupdf.PDF_ANNOT_FREE_TEXT, mupdf.PDF_ANNOT_FILE_ATTACHMENT, mupdf.PDF_ANNOT_INK, mupdf.PDF_ANNOT_LINE, mupdf.PDF_ANNOT_POLY_LINE, mupdf.PDF_ANNOT_POLYGON, mupdf.PDF_ANNOT_SQUARE, mupdf.PDF_ANNOT_STAMP, mupdf.PDF_ANNOT_TEXT, ): insert_rot = 0 if insert_rot: mupdf.pdf_dict_put_int(annot_obj, PDF_NAME('Rotate'), rotate) # insert fill color if type_ == mupdf.PDF_ANNOT_FREE_TEXT: if nfcol > 0: mupdf.pdf_set_annot_color(annot, fcol[:nfcol]) elif nfcol > 0: col = mupdf.pdf_new_array(page.doc(), nfcol) for i in range( nfcol): mupdf.pdf_array_push_real(col, fcol[i]) mupdf.pdf_dict_put(annot_obj, PDF_NAME('IC'), col) mupdf.pdf_dirty_annot(annot) mupdf.pdf_update_annot(annot) # let MuPDF update pdf.resynth_required = 0 except Exception as e: if g_exceptions_verbose: exception_info() message( f'cannot update annot: {e}') raise if (opacity < 0 or opacity >= 1) and not blend_mode: # no opacity, no blend_mode return True try: # create or update /ExtGState ap = mupdf.pdf_dict_getl( mupdf.pdf_annot_obj(annot), PDF_NAME('AP'), PDF_NAME('N') ) if not ap.m_internal: # should never happen raise RuntimeError( MSG_BAD_APN) resources = mupdf.pdf_dict_get( ap, PDF_NAME('Resources')) if not resources.m_internal: # no Resources yet: make one resources = mupdf.pdf_dict_put_dict( ap, PDF_NAME('Resources'), 2) alp0 = mupdf.pdf_new_dict( page.doc(), 3) if opacity >= 0 and opacity < 1: mupdf.pdf_dict_put_real( alp0, PDF_NAME('CA'), opacity) mupdf.pdf_dict_put_real( alp0, PDF_NAME('ca'), opacity) mupdf.pdf_dict_put_real( annot_obj, PDF_NAME('CA'), opacity) if blend_mode: mupdf.pdf_dict_put_name( alp0, PDF_NAME('BM'), blend_mode) mupdf.pdf_dict_put_name( annot_obj, PDF_NAME('BM'), blend_mode) extg = mupdf.pdf_dict_get( resources, PDF_NAME('ExtGState')) if not extg.m_internal: # no ExtGState yet: make one extg = mupdf.pdf_dict_put_dict( resources, PDF_NAME('ExtGState'), 2) mupdf.pdf_dict_put( extg, PDF_NAME('H'), alp0) except Exception as e: if 
g_exceptions_verbose: exception_info() message( f'cannot set opacity or blend mode\n: {e}') raise return True @property def apn_bbox(self): """annotation appearance bbox""" CheckParent(self) annot = self.this annot_obj = mupdf.pdf_annot_obj(annot) ap = mupdf.pdf_dict_getl(annot_obj, PDF_NAME('AP'), PDF_NAME('N')) if not ap.m_internal: val = JM_py_from_rect(mupdf.FzRect(mupdf.FzRect.Fixed_INFINITE)) else: rect = mupdf.pdf_dict_get_rect(ap, PDF_NAME('BBox')) val = JM_py_from_rect(rect) val = Rect(val) * self.get_parent().transformation_matrix val *= self.get_parent().derotation_matrix return val @property def apn_matrix(self): """annotation appearance matrix""" try: CheckParent(self) annot = self.this assert isinstance(annot, mupdf.PdfAnnot) ap = mupdf.pdf_dict_getl( mupdf.pdf_annot_obj(annot), mupdf.PDF_ENUM_NAME_AP, mupdf.PDF_ENUM_NAME_N ) if not ap.m_internal: return JM_py_from_matrix(mupdf.FzMatrix()) mat = mupdf.pdf_dict_get_matrix(ap, mupdf.PDF_ENUM_NAME_Matrix) val = JM_py_from_matrix(mat) val = Matrix(val) return val except Exception: if g_exceptions_verbose: exception_info() raise @property def blendmode(self): """annotation BlendMode""" CheckParent(self) annot = self.this annot_obj = mupdf.pdf_annot_obj(annot) obj = mupdf.pdf_dict_get(annot_obj, PDF_NAME('BM')) blend_mode = None if obj.m_internal: blend_mode = JM_UnicodeFromStr(mupdf.pdf_to_name(obj)) return blend_mode # loop through the /AP/N/Resources/ExtGState objects obj = mupdf.pdf_dict_getl( annot_obj, PDF_NAME('AP'), PDF_NAME('N'), PDF_NAME('Resources'), PDF_NAME('ExtGState'), ) if mupdf.pdf_is_dict(obj): n = mupdf.pdf_dict_len(obj) for i in range(n): obj1 = mupdf.pdf_dict_get_val(obj, i) if mupdf.pdf_is_dict(obj1): m = mupdf.pdf_dict_len(obj1) for j in range(m): obj2 = mupdf.pdf_dict_get_key(obj1, j) if mupdf.pdf_objcmp(obj2, PDF_NAME('BM')) == 0: blend_mode = JM_UnicodeFromStr(mupdf.pdf_to_name(mupdf.pdf_dict_get_val(obj1, j))) return blend_mode return blend_mode @property def border(self): 
"""Border information.""" CheckParent(self) atype = self.type[0] if atype not in ( mupdf.PDF_ANNOT_CIRCLE, mupdf.PDF_ANNOT_FREE_TEXT, mupdf.PDF_ANNOT_INK, mupdf.PDF_ANNOT_LINE, mupdf.PDF_ANNOT_POLY_LINE, mupdf.PDF_ANNOT_POLYGON, mupdf.PDF_ANNOT_SQUARE, ): return dict() ao = mupdf.pdf_annot_obj(self.this) ret = JM_annot_border(ao) return ret def clean_contents(self, sanitize=1): """Clean appearance contents stream.""" CheckParent(self) annot = self.this pdf = mupdf.pdf_get_bound_document(mupdf.pdf_annot_obj(annot)) filter_ = _make_PdfFilterOptions(recurse=1, instance_forms=0, ascii=0, sanitize=sanitize) mupdf.pdf_filter_annot_contents(pdf, annot, filter_) @property def colors(self): """Color definitions.""" try: CheckParent(self) annot = self.this assert isinstance(annot, mupdf.PdfAnnot) return JM_annot_colors(mupdf.pdf_annot_obj(annot)) except Exception: if g_exceptions_verbose: exception_info() raise def delete_responses(self): """Delete 'Popup' and responding annotations.""" CheckParent(self) annot = self.this annot_obj = mupdf.pdf_annot_obj(annot) page = _pdf_annot_page(annot) while 1: irt_annot = JM_find_annot_irt(annot) if not irt_annot: break mupdf.pdf_delete_annot(page, irt_annot) mupdf.pdf_dict_del(annot_obj, PDF_NAME('Popup')) annots = mupdf.pdf_dict_get(page.obj(), PDF_NAME('Annots')) n = mupdf.pdf_array_len(annots) found = 0 for i in range(n-1, -1, -1): o = mupdf.pdf_array_get(annots, i) p = mupdf.pdf_dict_get(o, PDF_NAME('Parent')) if not o.m_internal: continue if not mupdf.pdf_objcmp(p, annot_obj): mupdf.pdf_array_delete(annots, i) found = 1 if found: mupdf.pdf_dict_put(page.obj(), PDF_NAME('Annots'), annots) @property def file_info(self): """Attached file information.""" CheckParent(self) res = dict() length = -1 size = -1 desc = None annot = self.this annot_obj = mupdf.pdf_annot_obj(annot) type_ = mupdf.pdf_annot_type(annot) if type_ != mupdf.PDF_ANNOT_FILE_ATTACHMENT: raise TypeError( MSG_BAD_ANNOT_TYPE) stream = mupdf.pdf_dict_getl( annot_obj, 
PDF_NAME('FS'), PDF_NAME('EF'), PDF_NAME('F'), ) if not stream.m_internal: RAISEPY( "bad PDF: file entry not found", JM_Exc_FileDataError) fs = mupdf.pdf_dict_get(annot_obj, PDF_NAME('FS')) o = mupdf.pdf_dict_get(fs, PDF_NAME('UF')) if o.m_internal: filename = mupdf.pdf_to_text_string(o) else: o = mupdf.pdf_dict_get(fs, PDF_NAME('F')) if o.m_internal: filename = mupdf.pdf_to_text_string(o) o = mupdf.pdf_dict_get(fs, PDF_NAME('Desc')) if o.m_internal: desc = mupdf.pdf_to_text_string(o) o = mupdf.pdf_dict_get(stream, PDF_NAME('Length')) if o.m_internal: length = mupdf.pdf_to_int(o) o = mupdf.pdf_dict_getl(stream, PDF_NAME('Params'), PDF_NAME('Size')) if o.m_internal: size = mupdf.pdf_to_int(o) res[ dictkey_filename] = JM_EscapeStrFromStr(filename) res[ dictkey_descr] = JM_UnicodeFromStr(desc) res[ dictkey_length] = length res[ dictkey_size] = size return res @property def flags(self): """Flags field.""" CheckParent(self) annot = self.this return mupdf.pdf_annot_flags(annot) def get_file(self): """Retrieve attached file content.""" CheckParent(self) annot = self.this annot_obj = mupdf.pdf_annot_obj(annot) type = mupdf.pdf_annot_type(annot) if type != mupdf.PDF_ANNOT_FILE_ATTACHMENT: raise TypeError( MSG_BAD_ANNOT_TYPE) stream = mupdf.pdf_dict_getl(annot_obj, PDF_NAME('FS'), PDF_NAME('EF'), PDF_NAME('F')) if not stream.m_internal: RAISEPY( "bad PDF: file entry not found", JM_Exc_FileDataError) buf = mupdf.pdf_load_stream(stream) res = JM_BinFromBuffer(buf) return res def get_oc(self): """Get annotation optional content reference.""" CheckParent(self) oc = 0 annot = self.this annot_obj = mupdf.pdf_annot_obj(annot) obj = mupdf.pdf_dict_get(annot_obj, PDF_NAME('OC')) if obj.m_internal: oc = mupdf.pdf_to_num(obj) return oc # PyMuPDF doesn't seem to have this .parent member, but removing it breaks # 11 tests...? 
#@property def get_parent(self): try: ret = getattr( self, 'parent') except AttributeError: page = _pdf_annot_page(self.this) assert isinstance( page, mupdf.PdfPage) document = Document( page.doc()) if page.m_internal else None ret = Page(page, document) #self.parent = weakref.proxy( ret) self.parent = ret #log(f'No attribute .parent: {type(self)=} {id(self)=}: have set {id(self.parent)=}.') #log( f'Have set self.parent') return ret def get_pixmap(self, matrix=None, dpi=None, colorspace=None, alpha=0): """annotation Pixmap""" CheckParent(self) cspaces = {"gray": csGRAY, "rgb": csRGB, "cmyk": csCMYK} if type(colorspace) is str: colorspace = cspaces.get(colorspace.lower(), None) if dpi: matrix = Matrix(dpi / 72, dpi / 72) ctm = JM_matrix_from_py(matrix) cs = colorspace if not cs: cs = mupdf.fz_device_rgb() pix = mupdf.pdf_new_pixmap_from_annot(self.this, ctm, cs, mupdf.FzSeparations(0), alpha) ret = Pixmap(pix) if dpi: ret.set_dpi(dpi, dpi) return ret def get_sound(self): """Retrieve sound stream.""" CheckParent(self) annot = self.this annot_obj = mupdf.pdf_annot_obj(annot) type = mupdf.pdf_annot_type(annot) sound = mupdf.pdf_dict_get(annot_obj, PDF_NAME('Sound')) if type != mupdf.PDF_ANNOT_SOUND or not sound.m_internal: raise TypeError( MSG_BAD_ANNOT_TYPE) if mupdf.pdf_dict_get(sound, PDF_NAME('F')).m_internal: RAISEPY( "unsupported sound stream", JM_Exc_FileDataError) res = dict() obj = mupdf.pdf_dict_get(sound, PDF_NAME('R')) if obj.m_internal: res['rate'] = mupdf.pdf_to_real(obj) obj = mupdf.pdf_dict_get(sound, PDF_NAME('C')) if obj.m_internal: res['channels'] = mupdf.pdf_to_int(obj) obj = mupdf.pdf_dict_get(sound, PDF_NAME('B')) if obj.m_internal: res['bps'] = mupdf.pdf_to_int(obj) obj = mupdf.pdf_dict_get(sound, PDF_NAME('E')) if obj.m_internal: res['encoding'] = mupdf.pdf_to_name(obj) obj = mupdf.pdf_dict_gets(sound, "CO") if obj.m_internal: res['compression'] = mupdf.pdf_to_name(obj) buf = mupdf.pdf_load_stream(sound) stream = JM_BinFromBuffer(buf) 
res['stream'] = stream return res def get_text(self, *args, **kwargs): return utils.get_text(self, *args, **kwargs) def get_textbox(self, *args, **kwargs): return utils.get_textbox(self, *args, **kwargs) def get_textpage(self, clip=None, flags=0): """Make annotation TextPage.""" CheckParent(self) options = mupdf.FzStextOptions(flags) if clip: assert hasattr(mupdf, 'FZ_STEXT_CLIP_RECT'), f'MuPDF-{mupdf_version} does not support FZ_STEXT_CLIP_RECT.' clip2 = JM_rect_from_py(clip) options.clip = clip2.internal() options.flags |= mupdf.FZ_STEXT_CLIP_RECT annot = self.this stextpage = mupdf.FzStextPage(annot, options) ret = TextPage(stextpage) p = self.get_parent() if isinstance(p, weakref.ProxyType): ret.parent = p else: ret.parent = weakref.proxy(p) return ret @property def has_popup(self): """Check if annotation has a Popup.""" CheckParent(self) annot = self.this obj = mupdf.pdf_dict_get(mupdf.pdf_annot_obj(annot), PDF_NAME('Popup')) return True if obj.m_internal else False @property def info(self): """Various information details.""" CheckParent(self) annot = self.this res = dict() res[dictkey_content] = JM_UnicodeFromStr(mupdf.pdf_annot_contents(annot)) o = mupdf.pdf_dict_get(mupdf.pdf_annot_obj(annot), PDF_NAME('Name')) res[dictkey_name] = JM_UnicodeFromStr(mupdf.pdf_to_name(o)) # Title (= author) o = mupdf.pdf_dict_get(mupdf.pdf_annot_obj(annot), PDF_NAME('T')) res[dictkey_title] = JM_UnicodeFromStr(mupdf.pdf_to_text_string(o)) # CreationDate o = mupdf.pdf_dict_gets(mupdf.pdf_annot_obj(annot), "CreationDate") res[dictkey_creationDate] = JM_UnicodeFromStr(mupdf.pdf_to_text_string(o)) # ModDate o = mupdf.pdf_dict_get(mupdf.pdf_annot_obj(annot), PDF_NAME('M')) res[dictkey_modDate] = JM_UnicodeFromStr(mupdf.pdf_to_text_string(o)) # Subj o = mupdf.pdf_dict_gets(mupdf.pdf_annot_obj(annot), "Subj") res[dictkey_subject] = mupdf.pdf_to_text_string(o) # Identification (PDF key /NM) o = mupdf.pdf_dict_gets(mupdf.pdf_annot_obj(annot), "NM") res[dictkey_id] = 
JM_UnicodeFromStr(mupdf.pdf_to_text_string(o)) return res @property def irt_xref(self): ''' annotation IRT xref ''' annot = self.this annot_obj = mupdf.pdf_annot_obj( annot) irt = mupdf.pdf_dict_get( annot_obj, PDF_NAME('IRT')) if not irt.m_internal: return 0 return mupdf.pdf_to_num( irt) @property def is_open(self): """Get 'open' status of annotation or its Popup.""" CheckParent(self) return mupdf.pdf_annot_is_open(self.this) @property def language(self): """annotation language""" this_annot = self.this lang = mupdf.pdf_annot_language(this_annot) if lang == mupdf.FZ_LANG_UNSET: return assert hasattr(mupdf, 'fz_string_from_text_language2') return mupdf.fz_string_from_text_language2(lang) @property def line_ends(self): """Line end codes.""" CheckParent(self) annot = self.this # return nothing for invalid annot types if not mupdf.pdf_annot_has_line_ending_styles(annot): return lstart = mupdf.pdf_annot_line_start_style(annot) lend = mupdf.pdf_annot_line_end_style(annot) return lstart, lend @property def next(self): """Next annotation.""" CheckParent(self) this_annot = self.this assert isinstance(this_annot, mupdf.PdfAnnot) assert this_annot.m_internal type_ = mupdf.pdf_annot_type(this_annot) if type_ != mupdf.PDF_ANNOT_WIDGET: annot = mupdf.pdf_next_annot(this_annot) else: annot = mupdf.pdf_next_widget(this_annot) val = Annot(annot) if annot.m_internal else None if not val: return None val.thisown = True assert val.get_parent().this.m_internal_value() == self.get_parent().this.m_internal_value() val.parent._annot_refs[id(val)] = val if val.type[0] == mupdf.PDF_ANNOT_WIDGET: widget = Widget() TOOLS._fill_widget(val, widget) val = widget return val @property def opacity(self): """Opacity.""" CheckParent(self) annot = self.this opy = -1 ca = mupdf.pdf_dict_get( mupdf.pdf_annot_obj(annot), mupdf.PDF_ENUM_NAME_CA) if mupdf.pdf_is_number(ca): opy = mupdf.pdf_to_real(ca) return opy @property def popup_rect(self): """annotation 'Popup' rectangle""" CheckParent(self) rect = 
mupdf.FzRect(mupdf.FzRect.Fixed_INFINITE) annot = self.this annot_obj = mupdf.pdf_annot_obj( annot) obj = mupdf.pdf_dict_get( annot_obj, PDF_NAME('Popup')) if obj.m_internal: rect = mupdf.pdf_dict_get_rect(obj, PDF_NAME('Rect')) #log( '{rect=}') val = JM_py_from_rect(rect) #log( '{val=}') val = Rect(val) * self.get_parent().transformation_matrix val *= self.get_parent().derotation_matrix return val @property def popup_xref(self): """annotation 'Popup' xref""" CheckParent(self) xref = 0 annot = self.this annot_obj = mupdf.pdf_annot_obj(annot) obj = mupdf.pdf_dict_get(annot_obj, PDF_NAME('Popup')) if obj.m_internal: xref = mupdf.pdf_to_num(obj) return xref @property def rect(self): """annotation rectangle""" if g_use_extra: val = extra.Annot_rect3( self.this) else: val = mupdf.pdf_bound_annot(self.this) val = Rect(val) # Caching self.parent_() reduces 1000x from 0.07 to 0.04. # p = self.get_parent() #p = getattr( self, 'parent', None) #if p is None: # p = self.parent # self.parent = p #p = self.parent_() val *= p.derotation_matrix return val @property def rect_delta(self): ''' annotation delta values to rectangle ''' annot_obj = mupdf.pdf_annot_obj(self.this) arr = mupdf.pdf_dict_get( annot_obj, PDF_NAME('RD')) if mupdf.pdf_array_len( arr) == 4: return ( mupdf.pdf_to_real( mupdf.pdf_array_get( arr, 0)), mupdf.pdf_to_real( mupdf.pdf_array_get( arr, 1)), -mupdf.pdf_to_real( mupdf.pdf_array_get( arr, 2)), -mupdf.pdf_to_real( mupdf.pdf_array_get( arr, 3)), ) @property def rotation(self): """annotation rotation""" CheckParent(self) annot = self.this rotation = mupdf.pdf_dict_get( mupdf.pdf_annot_obj(annot), mupdf.PDF_ENUM_NAME_Rotate) if not rotation.m_internal: return -1 return mupdf.pdf_to_int( rotation) def set_apn_bbox(self, bbox): """ Set annotation appearance bbox. 
""" CheckParent(self) page = self.get_parent() rot = page.rotation_matrix mat = page.transformation_matrix bbox *= rot * ~mat annot = self.this annot_obj = mupdf.pdf_annot_obj(annot) ap = mupdf.pdf_dict_getl(annot_obj, PDF_NAME('AP'), PDF_NAME('N')) if not ap.m_internal: raise RuntimeError( MSG_BAD_APN) rect = JM_rect_from_py(bbox) mupdf.pdf_dict_put_rect(ap, PDF_NAME('BBox'), rect) def set_apn_matrix(self, matrix): """Set annotation appearance matrix.""" CheckParent(self) annot = self.this annot_obj = mupdf.pdf_annot_obj(annot) ap = mupdf.pdf_dict_getl(annot_obj, PDF_NAME('AP'), PDF_NAME('N')) if not ap.m_internal: raise RuntimeError( MSG_BAD_APN) mat = JM_matrix_from_py(matrix) mupdf.pdf_dict_put_matrix(ap, PDF_NAME('Matrix'), mat) def set_blendmode(self, blend_mode): """Set annotation BlendMode.""" CheckParent(self) annot = self.this annot_obj = mupdf.pdf_annot_obj(annot) mupdf.pdf_dict_put_name(annot_obj, PDF_NAME('BM'), blend_mode) def set_border(self, border=None, width=-1, style=None, dashes=None, clouds=-1): """Set border properties. 
Either a dict, or direct arguments width, style, dashes or clouds.""" CheckParent(self) atype, atname = self.type[:2] # annotation type if atype not in ( mupdf.PDF_ANNOT_CIRCLE, mupdf.PDF_ANNOT_FREE_TEXT, mupdf.PDF_ANNOT_INK, mupdf.PDF_ANNOT_LINE, mupdf.PDF_ANNOT_POLY_LINE, mupdf.PDF_ANNOT_POLYGON, mupdf.PDF_ANNOT_SQUARE, ): message(f"Cannot set border for '{atname}'.") return None if atype not in ( mupdf.PDF_ANNOT_CIRCLE, mupdf.PDF_ANNOT_FREE_TEXT, mupdf.PDF_ANNOT_POLYGON, mupdf.PDF_ANNOT_SQUARE, ): if clouds > 0: message(f"Cannot set cloudy border for '{atname}'.") clouds = -1 # do not set border effect if type(border) is not dict: border = {"width": width, "style": style, "dashes": dashes, "clouds": clouds} border.setdefault("width", -1) border.setdefault("style", None) border.setdefault("dashes", None) border.setdefault("clouds", -1) if border["width"] is None: border["width"] = -1 if border["clouds"] is None: border["clouds"] = -1 if hasattr(border["dashes"], "__getitem__"): # ensure sequence items are integers border["dashes"] = tuple(border["dashes"]) for item in border["dashes"]: if not isinstance(item, int): border["dashes"] = None break annot = self.this annot_obj = mupdf.pdf_annot_obj( annot) pdf = mupdf.pdf_get_bound_document( annot_obj) return JM_annot_set_border( border, pdf, annot_obj) def set_colors(self, colors=None, stroke=None, fill=None): """Set 'stroke' and 'fill' colors. Use either a dict or the direct arguments. 
""" if self.type[0] == mupdf.PDF_ANNOT_FREE_TEXT: raise ValueError("cannot be used for FreeText annotations") CheckParent(self) doc = self.get_parent().parent if type(colors) is not dict: colors = {"fill": fill, "stroke": stroke} fill = colors.get("fill") stroke = colors.get("stroke") fill_annots = (mupdf.PDF_ANNOT_CIRCLE, mupdf.PDF_ANNOT_SQUARE, mupdf.PDF_ANNOT_LINE, mupdf.PDF_ANNOT_POLY_LINE, mupdf.PDF_ANNOT_POLYGON, mupdf.PDF_ANNOT_REDACT,) if stroke in ([], ()): doc.xref_set_key(self.xref, "C", "[]") elif stroke is not None: if hasattr(stroke, "__float__"): stroke = [float(stroke)] CheckColor(stroke) assert len(stroke) in (1, 3, 4) s = f"[{_format_g(stroke)}]" doc.xref_set_key(self.xref, "C", s) if fill and self.type[0] not in fill_annots: message("Warning: fill color ignored for annot type '%s'." % self.type[1]) return if fill in ([], ()): doc.xref_set_key(self.xref, "IC", "[]") elif fill is not None: if hasattr(fill, "__float__"): fill = [float(fill)] CheckColor(fill) assert len(fill) in (1, 3, 4) s = f"[{_format_g(fill)}]" doc.xref_set_key(self.xref, "IC", s) def set_flags(self, flags): """Set annotation flags.""" CheckParent(self) annot = self.this mupdf.pdf_set_annot_flags(annot, flags) def set_info(self, info=None, content=None, title=None, creationDate=None, modDate=None, subject=None): """Set various properties.""" CheckParent(self) if type(info) is dict: # build the args from the dictionary content = info.get("content", None) title = info.get("title", None) creationDate = info.get("creationDate", None) modDate = info.get("modDate", None) subject = info.get("subject", None) info = None annot = self.this # use this to indicate a 'markup' annot type is_markup = mupdf.pdf_annot_has_author(annot) # contents if content: mupdf.pdf_set_annot_contents(annot, content) if is_markup: # title (= author) if title: mupdf.pdf_set_annot_author(annot, title) # creation date if creationDate: mupdf.pdf_dict_put_text_string(mupdf.pdf_annot_obj(annot), 
PDF_NAME('CreationDate'), creationDate) # mod date if modDate: mupdf.pdf_dict_put_text_string(mupdf.pdf_annot_obj(annot), PDF_NAME('M'), modDate) # subject if subject: mupdf.pdf_dict_puts(mupdf.pdf_annot_obj(annot), "Subj", mupdf.pdf_new_text_string(subject)) def set_irt_xref(self, xref): ''' Set annotation IRT xref ''' annot = self.this annot_obj = mupdf.pdf_annot_obj( annot) page = _pdf_annot_page(annot) if xref < 1 or xref >= mupdf.pdf_xref_len( page.doc()): raise ValueError( MSG_BAD_XREF) irt = mupdf.pdf_new_indirect( page.doc(), xref, 0) subt = mupdf.pdf_dict_get( irt, PDF_NAME('Subtype')) irt_subt = mupdf.pdf_annot_type_from_string( mupdf.pdf_to_name( subt)) if irt_subt < 0: raise ValueError( MSG_IS_NO_ANNOT) mupdf.pdf_dict_put( annot_obj, PDF_NAME('IRT'), irt) def set_language(self, language=None): """Set annotation language.""" CheckParent(self) this_annot = self.this if not language: lang = mupdf.FZ_LANG_UNSET else: lang = mupdf.fz_text_language_from_string(language) mupdf.pdf_set_annot_language(this_annot, lang) def set_line_ends(self, start, end): """Set line end codes.""" CheckParent(self) annot = self.this if mupdf.pdf_annot_has_line_ending_styles(annot): mupdf.pdf_set_annot_line_ending_styles(annot, start, end) else: message_warning("bad annot type for line ends") def set_name(self, name): """Set /Name (icon) of annotation.""" CheckParent(self) annot = self.this annot_obj = mupdf.pdf_annot_obj(annot) mupdf.pdf_dict_put_name(annot_obj, PDF_NAME('Name'), name) def set_oc(self, oc=0): """Set / remove annotation OC xref.""" CheckParent(self) annot = self.this annot_obj = mupdf.pdf_annot_obj(annot) if not oc: mupdf.pdf_dict_del(annot_obj, PDF_NAME('OC')) else: JM_add_oc_object(mupdf.pdf_get_bound_document(annot_obj), annot_obj, oc) def set_opacity(self, opacity): """Set opacity.""" CheckParent(self) annot = self.this if not _INRANGE(opacity, 0.0, 1.0): mupdf.pdf_set_annot_opacity(annot, 1) return mupdf.pdf_set_annot_opacity(annot, opacity) if opacity < 
1.0: page = _pdf_annot_page(annot) page.transparency = 1 def set_open(self, is_open): """Set 'open' status of annotation or its Popup.""" CheckParent(self) annot = self.this mupdf.pdf_set_annot_is_open(annot, is_open) def set_popup(self, rect): ''' Create annotation 'Popup' or update rectangle. ''' CheckParent(self) annot = self.this pdfpage = _pdf_annot_page(annot) rot = JM_rotate_page_matrix(pdfpage) r = mupdf.fz_transform_rect(JM_rect_from_py(rect), rot) mupdf.pdf_set_annot_popup(annot, r) def set_rect(self, rect): """Set annotation rectangle.""" CheckParent(self) annot = self.this pdfpage = _pdf_annot_page(annot) rot = JM_rotate_page_matrix(pdfpage) r = mupdf.fz_transform_rect(JM_rect_from_py(rect), rot) if mupdf.fz_is_empty_rect(r) or mupdf.fz_is_infinite_rect(r): raise ValueError( MSG_BAD_RECT) try: mupdf.pdf_set_annot_rect(annot, r) except Exception as e: message(f'cannot set rect: {e}') return False def set_rotation(self, rotate=0): """Set annotation rotation.""" CheckParent(self) annot = self.this type = mupdf.pdf_annot_type(annot) if type not in ( mupdf.PDF_ANNOT_CARET, mupdf.PDF_ANNOT_CIRCLE, mupdf.PDF_ANNOT_FREE_TEXT, mupdf.PDF_ANNOT_FILE_ATTACHMENT, mupdf.PDF_ANNOT_INK, mupdf.PDF_ANNOT_LINE, mupdf.PDF_ANNOT_POLY_LINE, mupdf.PDF_ANNOT_POLYGON, mupdf.PDF_ANNOT_SQUARE, mupdf.PDF_ANNOT_STAMP, mupdf.PDF_ANNOT_TEXT, ): return rot = rotate while rot < 0: rot += 360 while rot >= 360: rot -= 360 if type == mupdf.PDF_ANNOT_FREE_TEXT and rot % 90 != 0: rot = 0 annot_obj = mupdf.pdf_annot_obj(annot) mupdf.pdf_dict_put_int(annot_obj, PDF_NAME('Rotate'), rot) @property def type(self): """annotation type""" CheckParent(self) if not self.this.m_internal: return 'null' type_ = mupdf.pdf_annot_type(self.this) c = mupdf.pdf_string_from_annot_type(type_) o = mupdf.pdf_dict_gets( mupdf.pdf_annot_obj(self.this), 'IT') if not o.m_internal or mupdf.pdf_is_name(o): return (type_, c) it = mupdf.pdf_to_name(o) return (type_, c, it) def update(self, blend_mode: OptStr =None, 
opacity: OptFloat =None, fontsize: float =0, fontname: OptStr =None, text_color: OptSeq =None, border_color: OptSeq =None, fill_color: OptSeq =None, cross_out: bool =True, rotate: int =-1, ): """Update annot appearance. Notes: Depending on the annot type, some parameters make no sense, while others are only available in this method to achieve the desired result. This is especially true for 'FreeText' annots. Args: blend_mode: set the blend mode, all annotations. opacity: set the opacity, all annotations. fontsize: set fontsize, 'FreeText' only. fontname: set the font, 'FreeText' only. border_color: set border color, 'FreeText' only. text_color: set text color, 'FreeText' only. fill_color: set fill color, all annotations. cross_out: draw diagonal lines, 'Redact' only. rotate: set rotation, 'FreeText' and some others. """ annot_obj = mupdf.pdf_annot_obj(self.this) if border_color: is_rich_text = mupdf.pdf_dict_get(annot_obj, PDF_NAME("RC")) if not is_rich_text: raise ValueError("cannot set border_color if rich_text is False") Annot.update_timing_test() CheckParent(self) def color_string(cs, code): """Return valid PDF color operator for a given color sequence. 
""" cc = ColorCode(cs, code) if not cc: return b"" return (cc + "\n").encode() annot_type = self.type[0] # get the annot type dt = self.border.get("dashes", None) # get the dashes spec bwidth = self.border.get("width", -1) # get border line width stroke = self.colors["stroke"] # get the stroke color if fill_color is not None: fill = fill_color else: fill = self.colors["fill"] rect = None # self.rect # prevent MuPDF fiddling with it apnmat = self.apn_matrix # prevent MuPDF fiddling with it if rotate != -1: # sanitize rotation value while rotate < 0: rotate += 360 while rotate >= 360: rotate -= 360 if annot_type == mupdf.PDF_ANNOT_FREE_TEXT and rotate % 90 != 0: rotate = 0 #------------------------------------------------------------------ # handle opacity and blend mode #------------------------------------------------------------------ if blend_mode is None: blend_mode = self.blendmode if not hasattr(opacity, "__float__"): opacity = self.opacity if 0 <= opacity < 1 or blend_mode: opa_code = "/H gs\n" # then we must reference this 'gs' else: opa_code = "" if annot_type == mupdf.PDF_ANNOT_FREE_TEXT: CheckColor(text_color) CheckColor(fill_color) tcol, fname, fsize = TOOLS._parse_da(self) # read and update default appearance as necessary if fsize <= 0: fsize = 12 if text_color: tcol = text_color if fontname: fname = fontname if fontsize > 0: fsize = fontsize JM_make_annot_DA(self, len(tcol), tcol, fname, fsize) blend_mode = None # not supported for free text annotations! 
#------------------------------------------------------------------ # now invoke MuPDF to update the annot appearance #------------------------------------------------------------------ val = self._update_appearance( opacity=opacity, blend_mode=blend_mode, fill_color=fill, rotate=rotate, ) if val is False: raise RuntimeError("Error updating annotation.") if annot_type == mupdf.PDF_ANNOT_FREE_TEXT: # in absence of previous opacity, we may need to modify the AP ap = self._getAP() if 0 <= opacity < 1 and not ap.startswith(b"/H gs"): self._setAP(b"/H gs\n" + ap) return bfill = color_string(fill, "f") bstroke = color_string(stroke, "c") p_ctm = self.get_parent().transformation_matrix imat = ~p_ctm # inverse page transf. matrix if dt: dashes = "[" + " ".join(map(str, dt)) + "] 0 d\n" dashes = dashes.encode("utf-8") else: dashes = None if self.line_ends: line_end_le, line_end_ri = self.line_ends else: line_end_le, line_end_ri = 0, 0 # init line end codes # read contents as created by MuPDF ap = self._getAP() ap_tab = ap.splitlines() # split in single lines ap_updated = False # assume we did nothing if annot_type == mupdf.PDF_ANNOT_REDACT: if cross_out: # create crossed-out rect ap_updated = True ap_tab = ap_tab[:-1] _, LL, LR, UR, UL = ap_tab ap_tab.append(LR) ap_tab.append(LL) ap_tab.append(UR) ap_tab.append(LL) ap_tab.append(UL) ap_tab.append(b"S") if bwidth > 0 or bstroke != b"": ap_updated = True ntab = [_format_g(bwidth).encode() + b" w"] if bwidth > 0 else [] for line in ap_tab: if line.endswith(b"w"): continue if line.endswith(b"RG") and bstroke != b"": line = bstroke[:-1] ntab.append(line) ap_tab = ntab ap = b"\n".join(ap_tab) if annot_type in (mupdf.PDF_ANNOT_POLYGON, mupdf.PDF_ANNOT_POLY_LINE): ap = b"\n".join(ap_tab[:-1]) + b"\n" ap_updated = True if bfill != b"": if annot_type == mupdf.PDF_ANNOT_POLYGON: ap = ap + bfill + b"b" # close, fill, and stroke elif annot_type == mupdf.PDF_ANNOT_POLY_LINE: ap = ap + b"S" # stroke else: if annot_type == 
mupdf.PDF_ANNOT_POLYGON: ap = ap + b"s" # close and stroke elif annot_type == mupdf.PDF_ANNOT_POLY_LINE: ap = ap + b"S" # stroke if dashes is not None: # handle dashes ap = dashes + ap # reset dashing - only applies for LINE annots with line ends given ap = ap.replace(b"\nS\n", b"\nS\n[] 0 d\n", 1) ap_updated = True if opa_code: ap = opa_code.encode("utf-8") + ap ap_updated = True ap = b"q\n" + ap + b"\nQ\n" #---------------------------------------------------------------------- # the following handles line end symbols for 'Polygon' and 'Polyline' #---------------------------------------------------------------------- if line_end_le + line_end_ri > 0 and annot_type in (mupdf.PDF_ANNOT_POLYGON, mupdf.PDF_ANNOT_POLY_LINE): le_funcs = (None, TOOLS._le_square, TOOLS._le_circle, TOOLS._le_diamond, TOOLS._le_openarrow, TOOLS._le_closedarrow, TOOLS._le_butt, TOOLS._le_ropenarrow, TOOLS._le_rclosedarrow, TOOLS._le_slash) le_funcs_range = range(1, len(le_funcs)) d = 2 * max(1, self.border["width"]) rect = self.rect + (-d, -d, d, d) ap_updated = True points = self.vertices if line_end_le in le_funcs_range: p1 = Point(points[0]) * imat p2 = Point(points[1]) * imat left = le_funcs[line_end_le](self, p1, p2, False, fill_color) ap += left.encode() if line_end_ri in le_funcs_range: p1 = Point(points[-2]) * imat p2 = Point(points[-1]) * imat left = le_funcs[line_end_ri](self, p1, p2, True, fill_color) ap += left.encode() if ap_updated: if rect: # rect modified here? 
self.set_rect(rect) self._setAP(ap, rect=1) else: self._setAP(ap, rect=0) #------------------------------- # handle annotation rotations #------------------------------- if annot_type not in ( # only these types are supported mupdf.PDF_ANNOT_CARET, mupdf.PDF_ANNOT_CIRCLE, mupdf.PDF_ANNOT_FILE_ATTACHMENT, mupdf.PDF_ANNOT_INK, mupdf.PDF_ANNOT_LINE, mupdf.PDF_ANNOT_POLY_LINE, mupdf.PDF_ANNOT_POLYGON, mupdf.PDF_ANNOT_SQUARE, mupdf.PDF_ANNOT_STAMP, mupdf.PDF_ANNOT_TEXT, ): return rot = self.rotation # get value from annot object if rot == -1: # nothing to change return M = (self.rect.tl + self.rect.br) / 2 # center of annot rect if rot == 0: # undo rotations if abs(apnmat - Matrix(1, 1)) < 1e-5: return # matrix already is a no-op quad = self.rect.morph(M, ~apnmat) # derotate rect self.setRect(quad.rect) self.set_apn_matrix(Matrix(1, 1)) # appearance matrix = no-op return mat = Matrix(rot) quad = self.rect.morph(M, mat) self.set_rect(quad.rect) self.set_apn_matrix(apnmat * mat) def update_file(self, buffer_=None, filename=None, ufilename=None, desc=None): """Update attached file.""" CheckParent(self) annot = self.this annot_obj = mupdf.pdf_annot_obj(annot) pdf = mupdf.pdf_get_bound_document(annot_obj) # the owning PDF type = mupdf.pdf_annot_type(annot) if type != mupdf.PDF_ANNOT_FILE_ATTACHMENT: raise TypeError( MSG_BAD_ANNOT_TYPE) stream = mupdf.pdf_dict_getl(annot_obj, PDF_NAME('FS'), PDF_NAME('EF'), PDF_NAME('F')) # the object for file content if not stream.m_internal: RAISEPY( "bad PDF: no /EF object", JM_Exc_FileDataError) fs = mupdf.pdf_dict_get(annot_obj, PDF_NAME('FS')) # file content given res = JM_BufferFromBytes(buffer_) if buffer_ and not res.m_internal: raise ValueError( MSG_BAD_BUFFER) if res: JM_update_stream(pdf, stream, res, 1) # adjust /DL and /Size parameters len, _ = mupdf.fz_buffer_storage(res) l = mupdf.pdf_new_int(len) mupdf.pdf_dict_put(stream, PDF_NAME('DL'), l) mupdf.pdf_dict_putl(stream, l, PDF_NAME('Params'), PDF_NAME('Size')) if filename: 
mupdf.pdf_dict_put_text_string(stream, PDF_NAME('F'), filename) mupdf.pdf_dict_put_text_string(fs, PDF_NAME('F'), filename) mupdf.pdf_dict_put_text_string(stream, PDF_NAME('UF'), filename) mupdf.pdf_dict_put_text_string(fs, PDF_NAME('UF'), filename) mupdf.pdf_dict_put_text_string(annot_obj, PDF_NAME('Contents'), filename) if ufilename: mupdf.pdf_dict_put_text_string(stream, PDF_NAME('UF'), ufilename) mupdf.pdf_dict_put_text_string(fs, PDF_NAME('UF'), ufilename) if desc: mupdf.pdf_dict_put_text_string(stream, PDF_NAME('Desc'), desc) mupdf.pdf_dict_put_text_string(fs, PDF_NAME('Desc'), desc) @staticmethod def update_timing_test(): total = 0 for i in range( 30*1000): total += i return total @property def vertices(self): """annotation vertex points""" CheckParent(self) annot = self.this assert isinstance(annot, mupdf.PdfAnnot) annot_obj = mupdf.pdf_annot_obj(annot) page = _pdf_annot_page(annot) page_ctm = mupdf.FzMatrix() # page transformation matrix dummy = mupdf.FzRect() # Out-param for mupdf.pdf_page_transform(). mupdf.pdf_page_transform(page, dummy, page_ctm) derot = JM_derotate_page_matrix(page) page_ctm = mupdf.fz_concat(page_ctm, derot) #---------------------------------------------------------------- # The following objects occur in different annotation types. # So we are sure that (!o) occurs at most once. # Every pair of floats is one point, that needs to be separately # transformed with the page transformation matrix. 
#---------------------------------------------------------------- o = mupdf.pdf_dict_get(annot_obj, PDF_NAME('Vertices')) if not o.m_internal: o = mupdf.pdf_dict_get(annot_obj, PDF_NAME('L')) if not o.m_internal: o = mupdf.pdf_dict_get(annot_obj, PDF_NAME('QuadPoints')) if not o.m_internal: o = mupdf.pdf_dict_gets(annot_obj, 'CL') if o.m_internal: # handle lists with 1-level depth # weiter res = [] for i in range(0, mupdf.pdf_array_len(o), 2): x = mupdf.pdf_to_real(mupdf.pdf_array_get(o, i)) y = mupdf.pdf_to_real(mupdf.pdf_array_get(o, i+1)) point = mupdf.FzPoint(x, y) point = mupdf.fz_transform_point(point, page_ctm) res.append( (point.x, point.y)) return res o = mupdf.pdf_dict_gets(annot_obj, 'InkList') if o.m_internal: # InkList has 2-level lists #inklist: res = [] for i in range(mupdf.pdf_array_len(o)): res1 = [] o1 = mupdf.pdf_array_get(o, i) for j in range(0, mupdf.pdf_array_len(o1), 2): x = mupdf.pdf_to_real(mupdf.pdf_array_get(o1, j)) y = mupdf.pdf_to_real(mupdf.pdf_array_get(o1, j+1)) point = mupdf.FzPoint(x, y) point = mupdf.fz_transform_point(point, page_ctm) res1.append( (point.x, point.y)) res.append(res1) return res @property def xref(self): """annotation xref number""" CheckParent(self) annot = self.this return mupdf.pdf_to_num(mupdf.pdf_annot_obj(annot)) class Archive: def __init__( self, *args): ''' Archive(dirname [, path]) - from folder Archive(file [, path]) - from file name or object Archive(data, name) - from memory item Archive() - empty archive Archive(archive [, path]) - from archive ''' self._subarchives = list() self.this = mupdf.fz_new_multi_archive() if args: self.add( *args) def __repr__( self): return f'Archive, sub-archives: {len(self._subarchives)}' def _add_arch( self, subarch, path=None): mupdf.fz_mount_multi_archive( self.this, subarch, path) def _add_dir( self, folder, path=None): sub = mupdf.fz_open_directory( folder) mupdf.fz_mount_multi_archive( self.this, sub, path) def _add_treeitem( self, memory, name, path=None): buff = 
JM_BufferFromBytes( memory) sub = mupdf.fz_new_tree_archive( mupdf.FzTree()) mupdf.fz_tree_archive_add_buffer( sub, name, buff) mupdf.fz_mount_multi_archive( self.this, sub, path) def _add_ziptarfile( self, filepath, type_, path=None): if type_ == 1: sub = mupdf.fz_open_zip_archive( filepath) else: sub = mupdf.fz_open_tar_archive( filepath) mupdf.fz_mount_multi_archive( self.this, sub, path) def _add_ziptarmemory( self, memory, type_, path=None): buff = JM_BufferFromBytes( memory) stream = mupdf.fz_open_buffer( buff) if type_==1: sub = mupdf.fz_open_zip_archive_with_stream( stream) else: sub = mupdf.fz_open_tar_archive_with_stream( stream) mupdf.fz_mount_multi_archive( self.this, sub, path) def add( self, content, path=None): ''' Add a sub-archive. Args: content: The content to be added. May be one of: `str` - must be path of directory or file. `bytes`, `bytearray`, `io.BytesIO` - raw data. `zipfile.Zipfile`. `tarfile.TarFile`. `pymupdf.Archive`. A two-item tuple `(data, name)`. List or tuple (but not tuple with length 2) of the above. path: (str) a "virtual" path name, under which the elements of content can be retrieved. Use it to e.g. cope with duplicate element names. ''' def is_binary_data(x): return isinstance(x, (bytes, bytearray, io.BytesIO)) def make_subarch(entries, mount, fmt): subarch = dict(fmt=fmt, entries=entries, path=mount) if fmt != "tree" or self._subarchives == []: self._subarchives.append(subarch) else: ltree = self._subarchives[-1] if ltree["fmt"] != "tree" or ltree["path"] != subarch["path"]: self._subarchives.append(subarch) else: ltree["entries"].extend(subarch["entries"]) self._subarchives[-1] = ltree if isinstance(content, pathlib.Path): content = str(content) if isinstance(content, str): if os.path.isdir(content): self._add_dir(content, path) return make_subarch(os.listdir(content), path, 'dir') elif os.path.isfile(content): assert isinstance(path, str) and path != '', \ f'Need name for binary content, but {path=}.' 
with open(content) as f: ff = f.read() self._add_treeitem(ff, path) return make_subarch([path], None, 'tree') else: raise ValueError(f'Not a file or directory: {content!r}') elif is_binary_data(content): assert isinstance(path, str) and path != '' \ f'Need name for binary content, but {path=}.' self._add_treeitem(content, path) return make_subarch([path], None, 'tree') elif isinstance(content, zipfile.ZipFile): filename = getattr(content, "filename", None) if filename is None: fp = content.fp.getvalue() self._add_ziptarmemory(fp, 1, path) else: self._add_ziptarfile(filename, 1, path) return make_subarch(content.namelist(), path, 'zip') elif isinstance(content, tarfile.TarFile): filename = getattr(content.fileobj, "name", None) if filename is None: fp = content.fileobj if not isinstance(fp, io.BytesIO): fp = fp.fileobj self._add_ziptarmemory(fp.getvalue(), 0, path) else: self._add_ziptarfile(filename, 0, path) return make_subarch(content.getnames(), path, 'tar') elif isinstance(content, Archive): self._add_arch(content, path) return make_subarch([], path, 'multi') if isinstance(content, tuple) and len(content) == 2: # covers the tree item plus path data, name = content assert isinstance(name, str), f'Unexpected {type(name)=}' if is_binary_data(data): self._add_treeitem(data, name, path=path) elif isinstance(data, str): if os.path.isfile(data): with open(data, 'rb') as f: ff = f.read() self._add_treeitem(ff, name, path=path) else: assert 0, f'Unexpected {type(data)=}.' return make_subarch([name], path, 'tree') elif hasattr(content, '__getitem__'): # Deal with sequence of disparate items. for item in content: self.add(item, path) return else: raise TypeError(f'Unrecognised type {type(content)}.') assert 0 @property def entry_list( self): ''' List of sub archives. 
''' return self._subarchives def has_entry( self, name): return mupdf.fz_has_archive_entry( self.this, name) def read_entry( self, name): buff = mupdf.fz_read_archive_entry( self.this, name) return JM_BinFromBuffer( buff) class Xml: def __enter__(self): return self def __exit__(self, *args): pass def __init__(self, rhs): if isinstance(rhs, mupdf.FzXml): self.this = rhs elif isinstance(rhs, str): buff = mupdf.fz_new_buffer_from_copied_data(rhs) self.this = mupdf.fz_parse_xml_from_html5(buff) else: assert 0, f'Unsupported type for rhs: {type(rhs)}' def _get_node_tree( self): def show_node(node, items, shift): while node is not None: if node.is_text: items.append((shift, f'"{node.text}"')) node = node.next continue items.append((shift, f"({node.tagname}")) for k, v in node.get_attributes().items(): items.append((shift, f"={k} '{v}'")) child = node.first_child if child: items = show_node(child, items, shift + 1) items.append((shift, f"){node.tagname}")) node = node.next return items shift = 0 items = [] items = show_node(self, items, shift) return items def add_bullet_list(self): """Add bulleted list ("ul" tag)""" child = self.create_element("ul") self.append_child(child) return child def add_class(self, text): """Set some class via CSS. 
Replaces complete class spec.""" cls = self.get_attribute_value("class") if cls is not None and text in cls: return self self.remove_attribute("class") if cls is None: cls = text else: cls += " " + text self.set_attribute("class", cls) return self def add_code(self, text=None): """Add a "code" tag""" child = self.create_element("code") if type(text) is str: child.append_child(self.create_text_node(text)) prev = self.span_bottom() if prev is None: prev = self prev.append_child(child) return self def add_codeblock(self): """Add monospaced lines ("pre" node)""" child = self.create_element("pre") self.append_child(child) return child def add_description_list(self): """Add description list ("dl" tag)""" child = self.create_element("dl") self.append_child(child) return child def add_division(self): """Add "div" tag""" child = self.create_element("div") self.append_child(child) return child def add_header(self, level=1): """Add header tag""" if level not in range(1, 7): raise ValueError("Header level must be in [1, 6]") this_tag = self.tagname new_tag = f"h{level}" child = self.create_element(new_tag) if this_tag not in ("h1", "h2", "h3", "h4", "h5", "h6", "p"): self.append_child(child) return child self.parent.append_child(child) return child def add_horizontal_line(self): """Add horizontal line ("hr" tag)""" child = self.create_element("hr") self.append_child(child) return child def add_image(self, name, width=None, height=None, imgfloat=None, align=None): """Add image node (tag "img").""" child = self.create_element("img") if width is not None: child.set_attribute("width", f"{width}") if height is not None: child.set_attribute("height", f"{height}") if imgfloat is not None: child.set_attribute("style", f"float: {imgfloat}") if align is not None: child.set_attribute("align", f"{align}") child.set_attribute("src", f"{name}") self.append_child(child) return child def add_link(self, href, text=None): """Add a hyperlink ("a" tag)""" child = self.create_element("a") if not 
isinstance(text, str): text = href child.set_attribute("href", href) child.append_child(self.create_text_node(text)) prev = self.span_bottom() if prev is None: prev = self prev.append_child(child) return self def add_list_item(self): """Add item ("li" tag) under a (numbered or bulleted) list.""" if self.tagname not in ("ol", "ul"): raise ValueError("cannot add list item to", self.tagname) child = self.create_element("li") self.append_child(child) return child def add_number_list(self, start=1, numtype=None): """Add numbered list ("ol" tag)""" child = self.create_element("ol") if start > 1: child.set_attribute("start", str(start)) if numtype is not None: child.set_attribute("type", numtype) self.append_child(child) return child def add_paragraph(self): """Add "p" tag""" child = self.create_element("p") if self.tagname != "p": self.append_child(child) else: self.parent.append_child(child) return child def add_span(self): child = self.create_element("span") self.append_child(child) return child def add_style(self, text): """Set some style via CSS style. Replaces complete style spec.""" style = self.get_attribute_value("style") if style is not None and text in style: return self self.remove_attribute("style") if style is None: style = text else: style += ";" + text self.set_attribute("style", style) return self def add_subscript(self, text=None): """Add a subscript ("sub" tag)""" child = self.create_element("sub") if type(text) is str: child.append_child(self.create_text_node(text)) prev = self.span_bottom() if prev is None: prev = self prev.append_child(child) return self def add_superscript(self, text=None): """Add a superscript ("sup" tag)""" child = self.create_element("sup") if type(text) is str: child.append_child(self.create_text_node(text)) prev = self.span_bottom() if prev is None: prev = self prev.append_child(child) return self def add_text(self, text): """Add text. 
Line breaks are honored.""" lines = text.splitlines() line_count = len(lines) prev = self.span_bottom() if prev is None: prev = self for i, line in enumerate(lines): prev.append_child(self.create_text_node(line)) if i < line_count - 1: prev.append_child(self.create_element("br")) return self def append_child( self, child): mupdf.fz_dom_append_child( self.this, child.this) def append_styled_span(self, style): span = self.create_element("span") span.add_style(style) prev = self.span_bottom() if prev is None: prev = self prev.append_child(span) return prev def bodytag( self): return Xml( mupdf.fz_dom_body( self.this)) def clone( self): ret = mupdf.fz_dom_clone( self.this) return Xml( ret) @staticmethod def color_text(color): if type(color) is str: return color if type(color) is int: return f"rgb({sRGB_to_rgb(color)})" if type(color) in (tuple, list): return f"rgb{tuple(color)}" return color def create_element( self, tag): return Xml( mupdf.fz_dom_create_element( self.this, tag)) def create_text_node( self, text): return Xml( mupdf.fz_dom_create_text_node( self.this, text)) def debug(self): """Print a list of the node tree below self.""" items = self._get_node_tree() for item in items: message(" " * item[0] + item[1].replace("\n", "\\n")) def find( self, tag, att, match): ret = mupdf.fz_dom_find( self.this, tag, att, match) if ret.m_internal: return Xml( ret) def find_next( self, tag, att, match): ret = mupdf.fz_dom_find_next( self.this, tag, att, match) if ret.m_internal: return Xml( ret) @property def first_child( self): if mupdf.fz_xml_text( self.this): # text node, has no child. return ret = mupdf.fz_dom_first_child( self) if ret.m_internal: return Xml( ret) def get_attribute_value( self, key): assert key return mupdf.fz_dom_attribute( self.this, key) def get_attributes( self): if mupdf.fz_xml_text( self.this): # text node, has no attributes. 
return result = dict() i = 0 while 1: val, key = mupdf.fz_dom_get_attribute( self.this, i) if not val or not key: break result[ key] = val i += 1 return result def insert_after( self, node): mupdf.fz_dom_insert_after( self.this, node.this) def insert_before( self, node): mupdf.fz_dom_insert_before( self.this, node.this) def insert_text(self, text): lines = text.splitlines() line_count = len(lines) for i, line in enumerate(lines): self.append_child(self.create_text_node(line)) if i < line_count - 1: self.append_child(self.create_element("br")) return self @property def is_text(self): """Check if this is a text node.""" return self.text is not None @property def last_child(self): """Return last child node.""" child = self.first_child if child is None: return None while True: next = child.next if not next: return child child = next @property def next( self): ret = mupdf.fz_dom_next( self.this) if ret.m_internal: return Xml( ret) @property def parent( self): ret = mupdf.fz_dom_parent( self.this) if ret.m_internal: return Xml( ret) @property def previous( self): ret = mupdf.fz_dom_previous( self.this) if ret.m_internal: return Xml( ret) def remove( self): mupdf.fz_dom_remove( self.this) def remove_attribute( self, key): assert key mupdf.fz_dom_remove_attribute( self.this, key) @property def root( self): return Xml( mupdf.fz_xml_root( self.this)) def set_align(self, align): """Set text alignment via CSS style""" text = "text-align: %s" if isinstance( align, str): t = align elif align == TEXT_ALIGN_LEFT: t = "left" elif align == TEXT_ALIGN_CENTER: t = "center" elif align == TEXT_ALIGN_RIGHT: t = "right" elif align == TEXT_ALIGN_JUSTIFY: t = "justify" else: raise ValueError(f"Unrecognised {align=}") text = text % t self.add_style(text) return self def set_attribute( self, key, value): assert key mupdf.fz_dom_add_attribute( self.this, key, value) def set_bgcolor(self, color): """Set background color via CSS style""" text = f"background-color: %s" % self.color_text(color) 
def set_fontsize(self, fontsize):
    """Set the font size via a CSS style on a newly appended span.

    Args:
        fontsize: a str is used verbatim (so it may carry its own unit);
            any other value gets the unit "px" appended.

    Returns:
        self, so that calls can be chained.
    """
    unit = "" if type(fontsize) is str else "px"
    self.append_styled_span(f"font-size: {fontsize}{unit}")
    return self
def set_properties(
        self,
        align=None,
        bgcolor=None,
        bold=None,
        color=None,
        columns=None,
        font=None,
        fontsize=None,
        indent=None,
        italic=None,
        leading=None,
        letter_spacing=None,
        lineheight=None,
        margins=None,
        pagebreak_after=None,
        pagebreak_before=None,
        word_spacing=None,
        unqid=None,
        cls=None,
        ):
    """Set any or all properties of a node.

    To be used for existing nodes preferably.

    Every argument that is not None is applied. The style related ones are
    first applied to a temporary division created under the root node. The
    "style" values found on that division and on the chain of first
    children below it are then joined with ";" and written to the "style"
    attribute of this node, after which the temporary division is removed.
    `unqid` and `cls` are applied to this node directly.

    Returns:
        self, so that calls can be chained.
    """
    scratch = self.root.add_division()

    # (setter name, value) in the order in which the setters must run.
    valued_setters = (
        ("set_align", align),
        ("set_bgcolor", bgcolor),
        ("set_bold", bold),
        ("set_color", color),
        ("set_columns", columns),
        ("set_font", font),
        ("set_fontsize", fontsize),
        ("set_text_indent", indent),
        ("set_italic", italic),
        ("set_leading", leading),
        ("set_letter_spacing", letter_spacing),
        ("set_lineheight", lineheight),
        ("set_margins", margins),
    )
    for setter_name, value in valued_setters:
        if value is not None:
            getattr(scratch, setter_name)(value)

    # The page break setters take no value: any non-None argument enables them.
    if pagebreak_after is not None:
        scratch.set_pagebreak_after()
    if pagebreak_before is not None:
        scratch.set_pagebreak_before()
    if word_spacing is not None:
        scratch.set_word_spacing(word_spacing)

    if unqid is not None:
        self.set_id(unqid)
    if cls is not None:
        self.add_class(cls)

    # Collect the style of the scratch node itself (if any) ...
    collected = []
    own_style = scratch.get_attribute_value("style")
    if own_style is not None:
        collected.append(own_style)
    # ... followed by the styles along the chain of first children.
    node = scratch.first_child
    while node:
        collected.append(node.get_attribute_value("style"))
        node = node.first_child

    self.set_attribute("style", ";".join(collected))
    scratch.remove()
    return self
class Colorspace:
    """Wrapper around a MuPDF colorspace object (mupdf.FzColorspace)."""

    def __init__(self, type_):
        """Supported are GRAY, RGB and CMYK.

        Args:
            type_: an existing mupdf.FzColorspace (wrapped as is) or one of
                CS_GRAY, CS_CMYK, CS_RGB. Any other value results in RGB.
        """
        if isinstance(type_, mupdf.FzColorspace):
            self.this = type_
        elif type_ == CS_GRAY:
            self.this = mupdf.FzColorspace(mupdf.FzColorspace.Fixed_GRAY)
        elif type_ == CS_CMYK:
            self.this = mupdf.FzColorspace(mupdf.FzColorspace.Fixed_CMYK)
        else:
            # CS_RGB and every unrecognised value both map to RGB.
            self.this = mupdf.FzColorspace(mupdf.FzColorspace.Fixed_RGB)

    def __repr__(self):
        # Map the value of `n` to the suffix of the CS_* constant. A lookup
        # with an empty default is used instead of tuple indexing, which
        # raised IndexError for n > 4 and picked a wrong entry for
        # negative n.
        x = {1: "GRAY", 3: "RGB", 4: "CMYK"}.get(self.n, "")
        return "Colorspace(CS_%s) - %s" % (x, self.name)

    def _name(self):
        """Return the result of mupdf.fz_colorspace_name() for this object."""
        return mupdf.fz_colorspace_name(self.this)

    @property
    def n(self):
        """Size of one pixel (result of mupdf.fz_colorspace_n())."""
        return mupdf.fz_colorspace_n(self.this)

    @property
    def name(self):
        """Name of the Colorspace."""
        return self._name()
class DisplayList:
    """Wrapper around a MuPDF display list (mupdf.FzDisplayList)."""

    def __del__(self):
        # Only exact DisplayList instances are touched; instances of
        # subclasses are left alone.
        if not type(self) is DisplayList:
            return
        self.thisown = False

    def __init__(self, *args):
        """Create a display list wrapper.

        Accepts exactly one argument: a mupdf.FzRect (a mupdf.FzDisplayList
        is constructed from it) or an existing mupdf.FzDisplayList (wrapped
        as is). Anything else fails the assertion.
        """
        if len(args) == 1 and isinstance(args[0], mupdf.FzRect):
            self.this = mupdf.FzDisplayList(args[0])
        elif len(args) == 1 and isinstance(args[0], mupdf.FzDisplayList):
            self.this = args[0]
        else:
            assert 0, f'Unrecognised {args=}'

    @staticmethod
    def _device_handle(dw):
        """Return the device object carried by the wrapper `dw`.

        A `device` attribute is preferred when present. Otherwise `dw.this`
        is used, which is where the DeviceWrapper class of this file stores
        its device.
        """
        device = getattr(dw, "device", None)
        if device is None:
            device = dw.this
        return device

    def get_pixmap(self, matrix=None, colorspace=None, alpha=0, clip=None):
        """Make a pixmap via JM_pixmap_from_display_list().

        A `colorspace` that is not a Colorspace instance (including None)
        is replaced by RGB.
        """
        if isinstance(colorspace, Colorspace):
            colorspace = colorspace.this
        else:
            colorspace = mupdf.FzColorspace(mupdf.FzColorspace.Fixed_RGB)
        val = JM_pixmap_from_display_list(self.this, matrix, colorspace, alpha, clip, None)
        val.thisown = True
        return val

    def get_textpage(self, flags=3):
        """Make a TextPage from a DisplayList."""
        stext_options = mupdf.FzStextOptions()
        stext_options.flags = flags
        val = mupdf.FzStextPage(self.this, stext_options)
        val.thisown = True
        return val

    @property
    def rect(self):
        """Bounds of the display list (mupdf.fz_bound_display_list) as a Rect."""
        val = JM_py_from_rect(mupdf.fz_bound_display_list(self.this))
        val = Rect(val)
        return val

    def run(self, dw, m, area):
        """Run the display list through the device of wrapper `dw`.

        Args:
            dw: device wrapper, see `_device_handle()`.
            m: matrix, converted with JM_matrix_from_py().
            area: rectangle, converted with JM_rect_from_py().
        """
        mupdf.fz_run_display_list(
                self.this,
                # `dw.device` alone raised AttributeError for wrappers that
                # only provide `.this`.
                self._device_handle(dw),
                JM_matrix_from_py(m),
                JM_rect_from_py(area),
                mupdf.FzCookie(),
                )
def __delitem__(self, i)->None:
    """Delete pages: `del doc[i]`.

    Args:
        i: an int (passed to `delete_page()`), a list / tuple / range
            (passed to `delete_pages()`), or a slice. A slice is converted
            to a range of page numbers and passed to `delete_pages()`.

    Raises:
        ValueError: if the document is no PDF, if `i` has an unsupported
            type, or if a slice bound lies outside the document.
    """
    if not self.is_pdf:
        raise ValueError("is no PDF")
    if type(i) is int:
        return self.delete_page(i)
    if type(i) in (list, tuple, range):
        return self.delete_pages(i)
    if type(i) is not slice:
        raise ValueError("bad argument type")

    pc = self.page_count
    if pc <= 0:
        # No valid page number exists. This also avoids normalising negative
        # bounds against a zero page count, which used to loop forever.
        raise ValueError("bad page number(s)")

    start = i.start if i.start else 0
    # Only a missing stop means "up to the end". An explicit 0 must stay 0,
    # otherwise `del doc[0:0]` would delete every page.
    stop = pc if i.stop is None else i.stop
    step = i.step if i.step else 1

    # Negative bounds count from the end; for negative values the modulo
    # equals adding `pc` repeatedly until the value is no longer negative.
    if start < 0:
        start %= pc
    if start >= pc:
        raise ValueError("bad page number(s)")
    if stop < 0:
        stop %= pc
    if stop > pc:
        raise ValueError("bad page number(s)")
    return self.delete_pages(range(start, stop, step))
open(filename, fileype=type) - filename with unrecognized extension. rect, width, height, fontsize: layout reflowable document on open (e.g. EPUB). Ignored if n/a. """ # We temporarily set JM_mupdf_show_errors=0 while we are constructing, # then restore its original value in a `finally:` block. # global JM_mupdf_show_errors JM_mupdf_show_errors_old = JM_mupdf_show_errors JM_mupdf_show_errors = 0 try: self.is_closed = False self.is_encrypted = False self.is_encrypted = False self.metadata = None self.FontInfos = [] self.Graftmaps = {} self.ShownPages = {} self.InsertedImages = {} self._page_refs = weakref.WeakValueDictionary() if isinstance(filename, mupdf.PdfDocument): pdf_document = filename self.this = pdf_document self.this_is_pdf = True return w = width h = height r = JM_rect_from_py(rect) if not mupdf.fz_is_infinite_rect(r): w = r.x1 - r.x0 h = r.y1 - r.y0 self._name = filename self.stream = stream if stream is not None: if filename is not None and filetype is None: # 2025-05-06: Use <filename> as the filetype. This is # reversing precedence - we used to use <filename> if both # were set. filetype = filename if isinstance(stream, (bytes, memoryview)): pass elif isinstance(stream, bytearray): stream = bytes(stream) elif isinstance(stream, io.BytesIO): stream = stream.getvalue() else: raise TypeError(f"bad stream: {type(stream)=}.") self.stream = stream assert isinstance(stream, (bytes, memoryview)) if len(stream) == 0: # MuPDF raise an exception for this but also generates # warnings, which is not very helpful for us. So instead we # raise a specific exception. 
raise EmptyFileError('Cannot open empty stream.') stream2 = mupdf.fz_open_memory(mupdf.python_buffer_data(stream), len(stream)) try: doc = mupdf.fz_open_document_with_stream(filetype if filetype else '', stream2) except Exception as e: if g_exceptions_verbose > 1: exception_info() raise FileDataError('Failed to open stream') from e elif filename: assert not stream if isinstance(filename, str): pass elif hasattr(filename, "absolute"): filename = str(filename) elif hasattr(filename, "name"): filename = filename.name else: raise TypeError(f"bad filename: {type(filename)=} {filename=}.") self._name = filename # Generate our own specific exceptions. This avoids MuPDF # generating warnings etc. if not os.path.exists(filename): raise FileNotFoundError(f"no such file: '{filename}'") elif not os.path.isfile(filename): raise FileDataError(f"'{filename}' is no file") elif os.path.getsize(filename) == 0: raise EmptyFileError(f'Cannot open empty file: {filename=}.') if filetype: # Override the type implied by <filename>. MuPDF does not # have a way to do this directly so we open via a stream. try: fz_stream = mupdf.fz_open_file(filename) doc = mupdf.fz_open_document_with_stream(filetype, fz_stream) except Exception as e: if g_exceptions_verbose > 1: exception_info() raise FileDataError(f'Failed to open file {filename!r} as type {filetype!r}.') from e else: try: doc = mupdf.fz_open_document(filename) except Exception as e: if g_exceptions_verbose > 1: exception_info() raise FileDataError(f'Failed to open file {filename!r}.') from e else: pdf = mupdf.PdfDocument() doc = mupdf.FzDocument(pdf) if w > 0 and h > 0: mupdf.fz_layout_document(doc, w, h, fontsize) elif mupdf.fz_is_document_reflowable(doc): mupdf.fz_layout_document(doc, 400, 600, 11) self.this = doc # fixme: not sure where self.thisown gets initialised in PyMuPDF. 
def __repr__(self) -> str:
    """Return a short description of the document.

    The text starts with "closed " for a closed document. It then shows
    either the name only (no stream), a "new PDF" marker with the graft id
    (no stream and empty name), or the name plus a "memory" marker with the
    graft id (document opened from a stream).
    """
    prefix = "closed " if self.is_closed else ""
    if self.stream is not None:
        description = "Document('%s', <memory, doc# %i>)" % (self.name, self._graft_id)
    elif self.name == "":
        description = "Document(<new PDF, doc# %i>)" % self._graft_id
    else:
        description = "Document('%s')" % (self.name,)
    return prefix + description
def _deleteObject(self, xref):
    """Delete the PDF object with number `xref`.

    Raises:
        ValueError: with MSG_BAD_XREF if `_INRANGE()` rejects `xref` for the
            range 1 .. pdf_xref_len(pdf) - 1.
    """
    pdf = _as_pdf_document(self)
    highest = mupdf.pdf_xref_len(pdf) - 1
    if _INRANGE(xref, 1, highest):
        mupdf.pdf_delete_object(pdf, xref)
        return
    raise ValueError(MSG_BAD_XREF)
""" #pymupdf.log( 'utils.do_links()') # -------------------------------------------------------------------------- # internal function to create the actual "/Annots" object string # -------------------------------------------------------------------------- def cre_annot(lnk, xref_dst, pno_src, ctm): """Create annotation object string for a passed-in link.""" r = lnk["from"] * ctm # rect in PDF coordinates rect = _format_g(tuple(r)) if lnk["kind"] == LINK_GOTO: txt = annot_skel["goto1"] # annot_goto idx = pno_src.index(lnk["page"]) p = lnk["to"] * ctm # target point in PDF coordinates annot = txt(xref_dst[idx], p.x, p.y, lnk["zoom"], rect) elif lnk["kind"] == LINK_GOTOR: if lnk["page"] >= 0: txt = annot_skel["gotor1"] # annot_gotor pnt = lnk.get("to", Point(0, 0)) # destination point if type(pnt) is not Point: pnt = Point(0, 0) annot = txt( lnk["page"], pnt.x, pnt.y, lnk["zoom"], lnk["file"], lnk["file"], rect, ) else: txt = annot_skel["gotor2"] # annot_gotor_n to = get_pdf_str(lnk["to"]) to = to[1:-1] f = lnk["file"] annot = txt(to, f, rect) elif lnk["kind"] == LINK_LAUNCH: txt = annot_skel["launch"] # annot_launch annot = txt(lnk["file"], lnk["file"], rect) elif lnk["kind"] == LINK_URI: txt = annot_skel["uri"] # annot_uri annot = txt(lnk["uri"], rect) else: annot = "" return annot # -------------------------------------------------------------------------- # validate & normalize parameters if from_page < 0: fp = 0 elif from_page >= doc2.page_count: fp = doc2.page_count - 1 else: fp = from_page if to_page < 0 or to_page >= doc2.page_count: tp = doc2.page_count - 1 else: tp = to_page if start_at < 0: raise ValueError("'start_at' must be >= 0") sa = start_at incr = 1 if fp <= tp else -1 # page range could be reversed # lists of source / destination page numbers pno_src = list(range(fp, tp + incr, incr)) pno_dst = [sa + i for i in range(len(pno_src))] # lists of source / destination page xrefs xref_src = [] xref_dst = [] for i in range(len(pno_src)): p_src = 
pno_src[i] p_dst = pno_dst[i] old_xref = doc2.page_xref(p_src) new_xref = doc1.page_xref(p_dst) xref_src.append(old_xref) xref_dst.append(new_xref) # create the links for each copied page in destination PDF for i in range(len(xref_src)): page_src = doc2[pno_src[i]] # load source page links = page_src.get_links() # get all its links #log( '{pno_src=}') #log( '{type(page_src)=}') #log( '{page_src=}') #log( '{=i len(links)}') if len(links) == 0: # no links there page_src = None continue ctm = ~page_src.transformation_matrix # calc page transformation matrix page_dst = doc1[pno_dst[i]] # load destination page link_tab = [] # store all link definitions here for l in links: if l["kind"] == LINK_GOTO and (l["page"] not in pno_src): continue # GOTO link target not in copied pages annot_text = cre_annot(l, xref_dst, pno_src, ctm) if annot_text: link_tab.append(annot_text) if link_tab != []: page_dst._addAnnot_FromString( tuple(link_tab)) #log( 'utils.do_links() returning.') def _do_widgets( tar: 'Document', src: 'Document', graftmap, from_page: int = -1, to_page: int = -1, start_at: int = -1, join_duplicates=0, ) -> None: """Insert widgets of copied page range into target PDF. Parameter values **must** equal those of method insert_pdf() which must have been previously executed. """ if not src.is_form_pdf: # nothing to do: source PDF has no fields return def clean_kid_parents(acro_fields): """ Make sure all kids have correct "Parent" pointers.""" for i in range(acro_fields.pdf_array_len()): parent = acro_fields.pdf_array_get(i) kids = parent.pdf_dict_get(PDF_NAME("Kids")) for j in range(kids.pdf_array_len()): kid = kids.pdf_array_get(j) kid.pdf_dict_put(PDF_NAME("Parent"), parent) def join_widgets(pdf, acro_fields, xref1, xref2, name): """Called for each pair of widgets having the same name. 
Args: pdf: target MuPDF document acro_fields: object Root/AcroForm/Fields xref1, xref2: widget xrefs having same names name: (str) the name Result: Defined or updated widget parent that points to both widgets. """ def re_target(pdf, acro_fields, xref1, kids1, xref2, kids2): """Merge widget in xref2 into "Kids" list of widget xref1. Args: xref1, kids1: target widget and its "Kids" array. xref2, kids2: source wwidget and its "Kids" array (may be empty). """ # make indirect objects from widgets w1_ind = mupdf.pdf_new_indirect(pdf, xref1, 0) w2_ind = mupdf.pdf_new_indirect(pdf, xref2, 0) # find source widget in "Fields" array idx = acro_fields.pdf_array_find(w2_ind) acro_fields.pdf_array_delete(idx) if not kids2.pdf_is_array(): # source widget has no kids widget = mupdf.pdf_load_object(pdf, xref2) # delete name from widget and insert target as parent widget.pdf_dict_del(PDF_NAME("T")) widget.pdf_dict_put(PDF_NAME("Parent"), w1_ind) # put in target Kids kids1.pdf_array_push(w2_ind) else: # copy source kids to target kids for i in range(kids2.pdf_array_len()): kid = kids2.pdf_array_get(i) kid.pdf_dict_put(PDF_NAME("Parent"), w1_ind) kid_ind = mupdf.pdf_new_indirect(pdf, kid.pdf_to_num(), 0) kids1.pdf_array_push(kid_ind) def new_target(pdf, acro_fields, xref1, w1, xref2, w2, name): """Make new "Parent" for two widgets with same name. Args: xref1, w1: first widget xref2, w2: second widget name: field name Result: Both widgets have no "Kids". We create a new object with the name and a "Kids" array containing the widgets. Original widgets must be removed from AcroForm/Fields. 
""" # make new "Parent" object new = mupdf.pdf_new_dict(pdf, 5) new.pdf_dict_put_text_string(PDF_NAME("T"), name) kids = new.pdf_dict_put_array(PDF_NAME("Kids"), 2) new_obj = mupdf.pdf_add_object(pdf, new) new_obj_xref = new_obj.pdf_to_num() new_ind = mupdf.pdf_new_indirect(pdf, new_obj_xref, 0) # copy over some required source widget properties ft = w1.pdf_dict_get(PDF_NAME("FT")) w1.pdf_dict_del(PDF_NAME("FT")) new_obj.pdf_dict_put(PDF_NAME("FT"), ft) aa = w1.pdf_dict_get(PDF_NAME("AA")) w1.pdf_dict_del(PDF_NAME("AA")) new_obj.pdf_dict_put(PDF_NAME("AA"), aa) # remove name field, insert "Parent" field in source widgets w1.pdf_dict_del(PDF_NAME("T")) w1.pdf_dict_put(PDF_NAME("Parent"), new_ind) w2.pdf_dict_del(PDF_NAME("T")) w2.pdf_dict_put(PDF_NAME("Parent"), new_ind) # put source widgets in "kids" array ind1 = mupdf.pdf_new_indirect(pdf, xref1, 0) ind2 = mupdf.pdf_new_indirect(pdf, xref2, 0) kids.pdf_array_push(ind1) kids.pdf_array_push(ind2) # remove source widgets from "AcroForm/Fields" idx = acro_fields.pdf_array_find(ind1) acro_fields.pdf_array_delete(idx) idx = acro_fields.pdf_array_find(ind2) acro_fields.pdf_array_delete(idx) acro_fields.pdf_array_push(new_ind) w1 = mupdf.pdf_load_object(pdf, xref1) w2 = mupdf.pdf_load_object(pdf, xref2) kids1 = w1.pdf_dict_get(PDF_NAME("Kids")) kids2 = w2.pdf_dict_get(PDF_NAME("Kids")) # check which widget has a suitable "Kids" array if kids1.pdf_is_array(): re_target(pdf, acro_fields, xref1, kids1, xref2, kids2) # pylint: disable=arguments-out-of-order elif kids2.pdf_is_array(): re_target(pdf, acro_fields, xref2, kids2, xref1, kids1) # pylint: disable=arguments-out-of-order else: new_target(pdf, acro_fields, xref1, w1, xref2, w2, name) # pylint: disable=arguments-out-of-order def get_kids(parent, kids_list): """Return xref list of leaf kids for a parent. Call with an empty list. 
""" kids = mupdf.pdf_dict_get(parent, PDF_NAME("Kids")) if not kids.pdf_is_array(): return kids_list for i in range(kids.pdf_array_len()): kid = kids.pdf_array_get(i) if mupdf.pdf_is_dict(mupdf.pdf_dict_get(kid, PDF_NAME("Kids"))): kids_list = get_kids(kid, kids_list) else: kids_list.append(kid.pdf_to_num()) return kids_list def kids_xrefs(widget): """Get the xref of top "Parent" and the list of leaf widgets.""" kids_list = [] parent = mupdf.pdf_dict_get(widget, PDF_NAME("Parent")) parent_xref = parent.pdf_to_num() if parent_xref == 0: return parent_xref, kids_list kids_list = get_kids(parent, kids_list) return parent_xref, kids_list def deduplicate_names(pdf, acro_fields, join_duplicates=False): """Handle any widget name duplicates caused by the merge.""" names = {} # key is a widget name, value a list of widgets having it. # extract all names and widgets in "AcroForm/Fields" for i in range(mupdf.pdf_array_len(acro_fields)): wobject = mupdf.pdf_array_get(acro_fields, i) xref = wobject.pdf_to_num() # extract widget name and collect widget(s) using it T = mupdf.pdf_dict_get_text_string(wobject, PDF_NAME("T")) xrefs = names.get(T, []) xrefs.append(xref) names[T] = xrefs for name, xrefs in names.items(): if len(xrefs) < 2: continue xref0, xref1 = xrefs[:2] # only exactly 2 should occur! 
if join_duplicates: # combine fields with equal names join_widgets(pdf, acro_fields, xref0, xref1, name) else: # make field names unique newname = name + f" [{xref1}]" # append this to the name wobject = mupdf.pdf_load_object(pdf, xref1) wobject.pdf_dict_put_text_string(PDF_NAME("T"), newname) clean_kid_parents(acro_fields) def get_acroform(doc): """Retrieve the AcroForm dictionary form a PDF.""" pdf = mupdf.pdf_document_from_fz_document(doc) # AcroForm (= central form field info) return mupdf.pdf_dict_getp(mupdf.pdf_trailer(pdf), "Root/AcroForm") tarpdf = mupdf.pdf_document_from_fz_document(tar) srcpdf = mupdf.pdf_document_from_fz_document(src) if tar.is_form_pdf: # target is a Form PDF, so use it to include source fields acro = get_acroform(tar) # Important arrays in AcroForm acro_fields = acro.pdf_dict_get(PDF_NAME("Fields")) tar_co = acro.pdf_dict_get(PDF_NAME("CO")) if not tar_co.pdf_is_array(): tar_co = acro.pdf_dict_put_array(PDF_NAME("CO"), 5) else: # target is no Form PDF, so copy over source AcroForm acro = mupdf.pdf_deep_copy_obj(get_acroform(src)) # make a copy # Clear "Fields" and "CO" arrays: will be populated by page fields. # This is required to avoid copying unneeded objects. 
acro.pdf_dict_del(PDF_NAME("Fields")) acro.pdf_dict_put_array(PDF_NAME("Fields"), 5) acro.pdf_dict_del(PDF_NAME("CO")) acro.pdf_dict_put_array(PDF_NAME("CO"), 5) # Enrich AcroForm for copying to target acro_graft = mupdf.pdf_graft_mapped_object(graftmap, acro) # Insert AcroForm into target PDF acro_tar = mupdf.pdf_add_object(tarpdf, acro_graft) acro_fields = acro_tar.pdf_dict_get(PDF_NAME("Fields")) tar_co = acro_tar.pdf_dict_get(PDF_NAME("CO")) # get its xref and insert it into target catalog tar_xref = acro_tar.pdf_to_num() acro_tar_ind = mupdf.pdf_new_indirect(tarpdf, tar_xref, 0) root = mupdf.pdf_dict_get(mupdf.pdf_trailer(tarpdf), PDF_NAME("Root")) root.pdf_dict_put(PDF_NAME("AcroForm"), acro_tar_ind) if from_page <= to_page: src_range = range(from_page, to_page + 1) else: src_range = range(from_page, to_page - 1, -1) parents = {} # information about widget parents # remove "P" owning page reference from all widgets of all source pages for i in src_range: src_page = src[i] for xref in [ xref for xref, wtype, _ in src_page.annot_xrefs() if wtype == mupdf.PDF_ANNOT_WIDGET # pylint: disable=no-member ]: w_obj = mupdf.pdf_load_object(srcpdf, xref) w_obj.pdf_dict_del(PDF_NAME("P")) # get the widget's parent structure parent_xref, old_kids = kids_xrefs(w_obj) if parent_xref: parents[parent_xref] = { "new_xref": 0, "old_kids": old_kids, "new_kids": [], } # Copy over Parent widgets first - they are not page-dependent for xref in parents.keys(): # pylint: disable=consider-using-dict-items parent = mupdf.pdf_load_object(srcpdf, xref) parent_graft = mupdf.pdf_graft_mapped_object(graftmap, parent) parent_tar = mupdf.pdf_add_object(tarpdf, parent_graft) kids_xrefs_new = get_kids(parent_tar, []) parent_xref_new = parent_tar.pdf_to_num() parent_ind = mupdf.pdf_new_indirect(tarpdf, parent_xref_new, 0) acro_fields.pdf_array_push(parent_ind) parents[xref]["new_xref"] = parent_xref_new parents[xref]["new_kids"] = kids_xrefs_new for i in range(len(src_range)): # read first copied 
over page in target tar_page = tar[start_at + i] # read the original page in the source PDF src_page = src[src_range[i]] # now walk through source page widgets and copy over w_xrefs = [ # widget xrefs of the source page xref for xref, wtype, _ in src_page.annot_xrefs() if wtype == mupdf.PDF_ANNOT_WIDGET # pylint: disable=no-member ] if not w_xrefs: # no widgets on this source page continue # convert to formal PDF page tar_page_pdf = mupdf.pdf_page_from_fz_page(tar_page) # extract annotations array tar_annots = mupdf.pdf_dict_get(tar_page_pdf.obj(), PDF_NAME("Annots")) if not mupdf.pdf_is_array(tar_annots): tar_annots = mupdf.pdf_dict_put_array( tar_page_pdf.obj(), PDF_NAME("Annots"), 5 ) for xref in w_xrefs: w_obj = mupdf.pdf_load_object(srcpdf, xref) # check if field takes part in inter-field validations is_aac = mupdf.pdf_is_dict(mupdf.pdf_dict_getp(w_obj, "AA/C")) # check if parent of widget already in target parent_xref = mupdf.pdf_to_num( w_obj.pdf_dict_get(PDF_NAME("Parent")) ) if parent_xref == 0: # parent not in target yet try: w_obj_graft = mupdf.pdf_graft_mapped_object(graftmap, w_obj) except Exception as e: message_warning(f"cannot copy widget at {xref=}: {e}") continue w_obj_tar = mupdf.pdf_add_object(tarpdf, w_obj_graft) tar_xref = w_obj_tar.pdf_to_num() w_obj_tar_ind = mupdf.pdf_new_indirect(tarpdf, tar_xref, 0) mupdf.pdf_array_push(tar_annots, w_obj_tar_ind) mupdf.pdf_array_push(acro_fields, w_obj_tar_ind) else: parent = parents[parent_xref] idx = parent["old_kids"].index(xref) # search for xref in parent tar_xref = parent["new_kids"][idx] w_obj_tar_ind = mupdf.pdf_new_indirect(tarpdf, tar_xref, 0) mupdf.pdf_array_push(tar_annots, w_obj_tar_ind) # Into "AcroForm/CO" if a computation field. 
def _embeddedFileIndex(self, item: typing.Union[int, str]) -> int:
    """Translate `item` into an index into the list from `embfile_names()`.

    Args:
        item: either one of the names returned by `embfile_names()` or an
            integer in `range(len(names))`.

    Returns:
        The position of the name in the list, or `item` itself when it is
        a valid integer index.

    Raises:
        ValueError: if `item` is neither a known name nor a valid index.
    """
    known = self.embfile_names()
    # A name match takes precedence over the interpretation as an index.
    if item in known:
        return known.index(item)
    if item in range(len(known)):
        return item
    raise ValueError("'%s' not in EmbeddedFiles array." % str(item))
def _embfile_info(self, idx, infodict):
    """Store information about embedded file number `idx` in `infodict`.

    The entry is read from position 2*idx+1 of the array found under
    Root/Names/EmbeddedFiles/Names in the PDF trailer.

    Keys written to `infodict`:
        "collection": object number of the entry's /CI item, 0 if absent.
        dictkey_filename, dictkey_ufilename: /F and /UF, passed through
            JM_EscapeStrFromStr().
        dictkey_descr: /Desc, passed through JM_UnicodeFromStr().
        dictkey_size: /DL of the /EF/F object, else its /Params/Size,
            else -1.
        dictkey_length: /Length of the /EF/F object, else -1.

    Returns:
        The object number of the entry's /EF/F object.
    """
    pdf = _as_pdf_document(self)
    names = mupdf.pdf_dict_getl(
            mupdf.pdf_trailer(pdf),
            PDF_NAME('Root'),
            PDF_NAME('Names'),
            PDF_NAME('EmbeddedFiles'),
            PDF_NAME('Names'),
            )
    spec = mupdf.pdf_array_get(names, 2*idx+1)

    collection = mupdf.pdf_dict_get(spec, PDF_NAME('CI'))
    infodict["collection"] = mupdf.pdf_to_num(collection) if collection.m_internal else 0

    # The three text entries only differ in PDF key, target key and converter.
    text_entries = (
        ('F', dictkey_filename, JM_EscapeStrFromStr),
        ('UF', dictkey_ufilename, JM_EscapeStrFromStr),
        ('Desc', dictkey_descr, JM_UnicodeFromStr),
    )
    for pdf_key, target_key, convert in text_entries:
        raw = mupdf.pdf_to_text_string(mupdf.pdf_dict_get(spec, PDF_NAME(pdf_key)))
        infodict[target_key] = convert(raw)

    stream_obj = mupdf.pdf_dict_getl(spec, PDF_NAME('EF'), PDF_NAME('F'))
    xref = mupdf.pdf_to_num(stream_obj)

    length = -1
    length_obj = mupdf.pdf_dict_get(stream_obj, PDF_NAME('Length'))
    if length_obj.m_internal:
        length = mupdf.pdf_to_int(length_obj)

    # /DL is preferred; /Params/Size is only consulted when /DL is missing.
    size = -1
    size_obj = mupdf.pdf_dict_get(stream_obj, PDF_NAME('DL'))
    if not size_obj.m_internal:
        size_obj = mupdf.pdf_dict_getl(stream_obj, PDF_NAME('Params'), PDF_NAME('Size'))
    if size_obj.m_internal:
        size = mupdf.pdf_to_int(size_obj)

    infodict[dictkey_size] = size
    infodict[dictkey_length] = length
    return xref
def _embfile_upd(self, idx, buffer_=None, filename=None, ufilename=None, desc=None):
    """Update the content and/or the descriptive keys of one embedded file.

    Args:
        idx: zero-based entry number; the file specification is taken from
            position 2*idx+1 of the /EmbeddedFiles /Names array.
        buffer_: new file content (typically bytes). A falsy value
            (None, b"") leaves the stored stream untouched.
        filename: new value for /F; ignored if falsy.
        ufilename: new value for /UF; ignored if falsy.
        desc: new value for /Desc; ignored if falsy.

    Returns:
        The xref number of the /EF /F stream object.

    Raises:
        TypeError: `buffer_` was given but could not be converted into a
            MuPDF buffer.
    """
    pdf = _as_pdf_document(self)
    names = mupdf.pdf_dict_getl(
            mupdf.pdf_trailer(pdf),
            PDF_NAME('Root'),
            PDF_NAME('Names'),
            PDF_NAME('EmbeddedFiles'),
            PDF_NAME('Names'),
            )
    entry = mupdf.pdf_array_get(names, 2*idx+1)

    filespec = mupdf.pdf_dict_getl(entry, PDF_NAME('EF'), PDF_NAME('F'))
    if not filespec.m_internal:
        RAISEPY( "bad PDF: no /EF object", JM_Exc_FileDataError)

    if buffer_:
        # `buffer_` is the caller's Python data and has no `.m_internal`;
        # only the converted MuPDF buffer can be checked for validity.
        res = JM_BufferFromBytes(buffer_)
        if not res.m_internal:
            raise TypeError( MSG_BAD_BUFFER)
        JM_update_stream(pdf, filespec, res, 1)
        # adjust /DL and /Size parameters
        size, _ = mupdf.fz_buffer_storage(res)
        size_obj = mupdf.pdf_new_int(size)
        mupdf.pdf_dict_put(filespec, PDF_NAME('DL'), size_obj)
        mupdf.pdf_dict_putl(filespec, size_obj, PDF_NAME('Params'), PDF_NAME('Size'))

    xref = mupdf.pdf_to_num(filespec)
    if filename:
        mupdf.pdf_dict_put_text_string(entry, PDF_NAME('F'), filename)
    if ufilename:
        mupdf.pdf_dict_put_text_string(entry, PDF_NAME('UF'), ufilename)
    if desc:
        mupdf.pdf_dict_put_text_string(entry, PDF_NAME('Desc'), desc)
    return xref
def _forget_page(self, page: Page):
    """Drop `page` from the document's page reference dictionary.

    Entries are keyed by id(page); a page that is not registered is
    silently ignored.
    """
    key = id(page)
    if key not in self._page_refs:
        return
    del self._page_refs[key]
def _get_page_labels(self):
    """Collect the page label rules stored under /Root/PageLabels.

    Three layouts are tried in this order, and the first one found wins:
      1. a /Nums entry directly inside /PageLabels,
      2. the path /Kids/Nums,
      3. /Kids being an array whose items each carry their own /Nums.

    Returns:
        The list filled by JM_get_page_labels(); empty when the document
        has no /PageLabels or none of the layouts applies.
    """
    pdf = _as_pdf_document(self)
    labels = []
    label_key = mupdf.pdf_new_name("PageLabels")
    label_tree = mupdf.pdf_dict_getl(
            mupdf.pdf_trailer(pdf),
            PDF_NAME('Root'),
            label_key,
            )
    if not label_tree.m_internal:
        return labels

    # layout 1: direct /Nums object
    direct = mupdf.pdf_resolve_indirect(
            mupdf.pdf_dict_get(label_tree, PDF_NAME('Nums'))
            )
    if direct.m_internal:
        JM_get_page_labels(labels, direct)
        return labels

    # layout 2: /Kids/Nums
    nested = mupdf.pdf_resolve_indirect(
            mupdf.pdf_dict_getl(label_tree, PDF_NAME('Kids'), PDF_NAME('Nums'))
            )
    if nested.m_internal:
        JM_get_page_labels(labels, nested)
        return labels

    # layout 3: /Kids is an array of dictionaries, each with its own /Nums
    kids = mupdf.pdf_resolve_indirect(
            mupdf.pdf_dict_get(label_tree, PDF_NAME('Kids'))
            )
    if kids.m_internal and mupdf.pdf_is_array(kids):
        for pos in range(mupdf.pdf_array_len(kids)):
            kid = mupdf.pdf_array_get(kids, pos)
            kid_nums = mupdf.pdf_resolve_indirect(
                    mupdf.pdf_dict_get(kid, PDF_NAME('Nums'))
                    )
            JM_get_page_labels(labels, kid_nums)
    return labels
def _getPDFfileid(self):
    """Get PDF file id.

    Returns:
        None if the document is not a PDF. Otherwise a list with one
        hexlified item (as produced by binascii.hexlify) per entry of the
        trailer's /ID array; the list is empty if there is no /ID.
    """
    pdf = _as_pdf_document(self, required=0)
    if not pdf.m_internal:
        return
    idlist = []
    identity = mupdf.pdf_dict_get(mupdf.pdf_trailer(pdf), PDF_NAME('ID'))
    if identity.m_internal:
        for i in range(mupdf.pdf_array_len(identity)):
            text = mupdf.pdf_to_text_string(mupdf.pdf_array_get(identity, i))
            # binascii.hexlify() only accepts bytes-like objects, so a text
            # string must be encoded first (same approach as the checksum
            # handling in embfile_info).
            if isinstance(text, str):
                text = text.encode()
            idlist.append(binascii.hexlify(text))
    return idlist
def _move_copy_page(self, pno, nb, before, copy):
    """Move or copy a PDF page reference.

    Args:
        pno: number of the source page.
        nb: number of the page relative to which the source is inserted.
        before: if truthy insert in front of page `nb`, else behind it.
        copy: if truthy keep the original reference (copy), else remove
            it (move).
    """
    pdf = _as_pdf_document(self)

    def shift_counts(node, delta):
        # Walk from `node` up the /Parent chain, adding `delta` to every /Count.
        while node.m_internal:
            current = mupdf.pdf_dict_get_int(node, PDF_NAME('Count'))
            mupdf.pdf_dict_put_int(node, PDF_NAME('Count'), current + delta)
            node = mupdf.pdf_dict_get(node, PDF_NAME('Parent'))

    # locate both pages: page object, parent node and index in its /Kids
    src_page, src_parent, src_idx = pdf_lookup_page_loc(pdf, pno)
    src_kids = mupdf.pdf_dict_get(src_parent, PDF_NAME('Kids'))

    _dst_page, dst_parent, dst_idx = pdf_lookup_page_loc(pdf, nb)
    dst_kids = mupdf.pdf_dict_get(dst_parent, PDF_NAME('Kids'))

    # index of the source page inside the target /Kids array
    insert_at = dst_idx if before else dst_idx + 1

    # non-zero comparison result: source and target use different /Kids arrays
    different_kids = mupdf.pdf_objcmp(src_kids, dst_kids) != 0

    # put source page into the target /Kids array
    if different_kids and not copy:
        # a moved page gets the target node as its new parent
        mupdf.pdf_dict_put(src_page, PDF_NAME('Parent'), dst_parent)
    mupdf.pdf_array_insert(dst_kids, src_page, insert_at)

    if different_kids:
        shift_counts(dst_parent, 1)
        if not copy:
            # delete original item and account for it in the old parents
            mupdf.pdf_array_delete(src_kids, src_idx)
            shift_counts(src_parent, -1)
    elif copy:
        # same /Kids array, which now holds one more reference
        shift_counts(dst_parent, 1)
    elif src_idx < insert_at:
        # same /Kids array, move: the old slot is unaffected by the insert
        mupdf.pdf_array_delete(src_kids, src_idx)
    else:
        # same /Kids array, move: the insert shifted the old slot by one
        mupdf.pdf_array_delete(src_kids, src_idx + 1)

    if pdf.m_internal.rev_page_map:
        # page map no longer valid: drop it
        mupdf.ll_pdf_drop_page_tree(pdf.m_internal)

    self._reset_page_refs()
def _reset_page_refs(self):
    """Invalidate all pages in document dictionary.

    Does nothing when the document is closed (or has no `is_closed`
    attribute yet). Otherwise every registered, truthy page gets its
    `_erase()` method called and the reference dictionary is emptied.
    """
    if getattr(self, "is_closed", True):
        return
    # Iterate over a snapshot of the values, not the live dictionary.
    for entry in list(self._page_refs.values()):
        if not entry:
            continue
        entry._erase()
    self._page_refs.clear()
def add_ocg(self, name, config=-1, on=1, intent=None, usage=None):
    """Add new optional content group.

    Args:
        name: text for the /Name entry of the new OCG.
        config: index into the /Configs array of /OCProperties; a value
            below 0 selects the default configuration /D.
        on: if truthy the OCG is added to the /ON array of the chosen
            configuration, otherwise to its /OFF array.
        intent: a name string or an iterable of name strings for the
            /Intent array; if empty or None, 'View' is used.
        usage: name for /Usage/CreatorInfo/Subtype; 'Artwork' if falsy.

    Returns:
        The xref number of the new OCG object.

    Raises:
        ValueError: `config` does not identify an existing configuration.
    """
    pdf = _as_pdf_document(self)

    # make the OCG
    ocg = mupdf.pdf_add_new_dict(pdf, 3)
    mupdf.pdf_dict_put(ocg, PDF_NAME('Type'), PDF_NAME('OCG'))
    mupdf.pdf_dict_put_text_string(ocg, PDF_NAME('Name'), name)
    intents = mupdf.pdf_dict_put_array(ocg, PDF_NAME('Intent'), 2)
    if not intent:
        mupdf.pdf_array_push(intents, PDF_NAME('View'))
    elif isinstance(intent, str):
        mupdf.pdf_array_push(intents, mupdf.pdf_new_name(intent))
    else:
        # several intents: one name per non-empty item
        for item in intent:
            if item:
                mupdf.pdf_array_push(intents, mupdf.pdf_new_name(item))
    use_for = mupdf.pdf_dict_put_dict(ocg, PDF_NAME('Usage'), 3)
    ci_name = mupdf.pdf_new_name("CreatorInfo")
    cre_info = mupdf.pdf_dict_put_dict(use_for, ci_name, 2)
    mupdf.pdf_dict_put_text_string(cre_info, PDF_NAME('Creator'), "PyMuPDF")
    if usage:
        mupdf.pdf_dict_put_name(cre_info, PDF_NAME('Subtype'), usage)
    else:
        mupdf.pdf_dict_put_name(cre_info, PDF_NAME('Subtype'), "Artwork")
    indocg = mupdf.pdf_add_object(pdf, ocg)

    # Insert OCG in the right config
    ocp = JM_ensure_ocproperties(pdf)
    obj = mupdf.pdf_dict_get(ocp, PDF_NAME('OCGs'))
    mupdf.pdf_array_push(obj, indocg)

    if config > -1:
        obj = mupdf.pdf_dict_get(ocp, PDF_NAME('Configs'))
        if not mupdf.pdf_is_array(obj):
            raise ValueError( MSG_BAD_OC_CONFIG)
        cfg = mupdf.pdf_array_get(obj, config)
        if not cfg.m_internal:
            raise ValueError( MSG_BAD_OC_CONFIG)
    else:
        cfg = mupdf.pdf_dict_get(ocp, PDF_NAME('D'))

    obj = mupdf.pdf_dict_get(cfg, PDF_NAME('Order'))
    if not obj.m_internal:
        obj = mupdf.pdf_dict_put_array(cfg, PDF_NAME('Order'), 1)
    mupdf.pdf_array_push(obj, indocg)

    # register the OCG's initial state, creating the array if missing
    state_key = PDF_NAME('ON') if on else PDF_NAME('OFF')
    obj = mupdf.pdf_dict_get(cfg, state_key)
    if not obj.m_internal:
        obj = mupdf.pdf_dict_put_array(cfg, state_key, 1)
    mupdf.pdf_array_push(obj, indocg)

    # let MuPDF take note: re-read OCProperties
    mupdf.ll_pdf_read_ocg(pdf.m_internal)

    return mupdf.pdf_to_num(indocg)
def close(self):
    """Close document.

    Raises:
        ValueError: the document is already closed (or was never fully
            initialised, i.e. has no `is_closed` attribute).
    """
    already_closed = getattr(self, "is_closed", True)
    if already_closed:
        raise ValueError("document closed")

    if getattr(self, "_outline", None):
        self._outline = None
    # Must run before `is_closed` is set: _reset_page_refs() is a no-op on
    # closed documents.
    self._reset_page_refs()
    self.is_closed = True
    self.Graftmaps = {}  # Fixes test_3140().
    self.this = None
def delete_pages(self, *args, **kw):
    """Delete pages from a PDF.

    Args:
        Either keywords 'from_page'/'to_page', or two integers to
        specify the first/last page to delete.
        Or a list/tuple/range object, which can contain arbitrary
        page numbers.
        Or a single integer page number.

    Negative values of 'from_page', 'to_page' and of a single integer
    are counted from the end of the document.

    Raises:
        ValueError: not a PDF, document closed, wrong number or mix of
            arguments, or page numbers outside the document.
    """
    if not self.is_pdf:
        raise ValueError("is no PDF")
    if self.is_closed:
        raise ValueError("document closed")

    page_count = self.page_count  # page count of document

    def from_end(n):
        # Map a negative page number to its non-negative equivalent.
        if n >= 0:
            return n
        if page_count <= 0:
            # nothing to count back from; repeated addition would never end
            raise ValueError("bad page number(s)")
        return n % page_count

    if kw:  # check if keywords were used
        if args:  # then no positional args are allowed
            raise ValueError("cannot mix keyword and positional argument")
        f = from_end(kw.get("from_page", -1))  # first page to delete
        t = from_end(kw.get("to_page", -1))  # last page to delete
        if not f <= t < page_count:
            raise ValueError("bad page number(s)")
        numbers = tuple(range(f, t + 1))
    else:
        # `args` is a tuple, so emptiness must be tested by truth value
        if not args or len(args) > 2:
            raise ValueError("need 1 or 2 positional arguments")
        if len(args) == 2:
            f, t = args
            if not (type(f) is int and type(t) is int):
                raise ValueError("both arguments must be int")
            if f > t:
                f, t = t, f
            if not f <= t < page_count:
                raise ValueError("bad page number(s)")
            numbers = tuple(range(f, t + 1))
        elif isinstance(args[0], int):
            numbers = (from_end(args[0]),)
        else:
            numbers = tuple(args[0])

    numbers = sorted(map(int, set(numbers)))  # ensure unique integers
    if not numbers:
        message("nothing to delete")
        return
    if numbers[0] < 0 or numbers[-1] >= page_count:
        raise ValueError("bad page number(s)")

    frozen_numbers = frozenset(numbers)
    toc = self.get_toc()
    for i, xref in enumerate(self.get_outline_xrefs()):
        if toc[i][2] - 1 in frozen_numbers:
            self._remove_toc_item(xref)  # remove target in PDF object

    self._remove_links_to(frozen_numbers)

    for i in reversed(numbers):  # delete pages, last to first
        self._delete_page(i)

    self._reset_page_refs()
def embfile_info(self, item: typing.Union[int, str]) -> dict:
    """Get information of an item in the EmbeddedFiles array.

    Args:
        item: number or name of item.
    Returns:
        Information dictionary. It always has the key "name" plus whatever
        _embfile_info() stores; "creationDate", "modDate" and "checksum"
        are added only if the corresponding /Params key is not null.
    """
    entry_no = self._embeddedFileIndex(item)
    info = {"name": self.embfile_names()[entry_no]}
    stream_xref = self._embfile_info(entry_no, info)

    # date entries are copied as they are
    date_keys = (
        ("Params/CreationDate", "creationDate"),
        ("Params/ModDate", "modDate"),
    )
    for pdf_key, out_key in date_keys:
        kind, value = self.xref_get_key(stream_xref, pdf_key)
        if kind != "null":
            info[out_key] = value

    # the checksum is reported as a hexadecimal string
    kind, digest = self.xref_get_key(stream_xref, "Params/CheckSum")
    if kind != "null":
        info["checksum"] = digest.encode().hex()
    return info
''' #log( '{=xref info_only}') pdf = _as_pdf_document(self) obj = mupdf.pdf_load_object(pdf, xref) type_ = mupdf.pdf_dict_get(obj, PDF_NAME('Type')) subtype = mupdf.pdf_dict_get(obj, PDF_NAME('Subtype')) if (mupdf.pdf_name_eq(type_, PDF_NAME('Font')) and not mupdf.pdf_to_name( subtype).startswith('CIDFontType') ): basefont = mupdf.pdf_dict_get(obj, PDF_NAME('BaseFont')) if not basefont.m_internal or mupdf.pdf_is_null(basefont): bname = mupdf.pdf_dict_get(obj, PDF_NAME('Name')) else: bname = basefont ext = JM_get_fontextension(pdf, xref) if ext != 'n/a' and not info_only: buffer_ = JM_get_fontbuffer(pdf, xref) bytes_ = JM_BinFromBuffer(buffer_) else: bytes_ = b'' if not named: rc = ( JM_EscapeStrFromStr(mupdf.pdf_to_name(bname)), JM_UnicodeFromStr(ext), JM_UnicodeFromStr(mupdf.pdf_to_name(subtype)), bytes_, ) else: rc = { dictkey_name: JM_EscapeStrFromStr(mupdf.pdf_to_name(bname)), dictkey_ext: JM_UnicodeFromStr(ext), dictkey_type: JM_UnicodeFromStr(mupdf.pdf_to_name(subtype)), dictkey_content: bytes_, } else: if not named: rc = '', '', '', b'' else: rc = { dictkey_name: '', dictkey_ext: '', dictkey_type: '', dictkey_content: b'', } return rc def extract_image(self, xref): """Get image by xref. 
def ez_save(
        self,
        filename,
        garbage=3,
        clean=False,
        deflate=True,
        deflate_images=True,
        deflate_fonts=True,
        incremental=False,
        ascii=False,
        expand=False,
        linear=False,
        pretty=False,
        encryption=1,
        permissions=4095,
        owner_pw=None,
        user_pw=None,
        no_new_id=True,
        preserve_metadata=1,
        use_objstms=1,
        compression_effort=0,
        ):
    '''
    Save PDF using some different defaults.

    Forwards every argument unchanged to save() as a keyword argument and
    returns whatever save() returns.
    '''
    options = {
        "garbage": garbage,
        "clean": clean,
        "deflate": deflate,
        "deflate_images": deflate_images,
        "deflate_fonts": deflate_fonts,
        "incremental": incremental,
        "ascii": ascii,
        "expand": expand,
        "linear": linear,
        "pretty": pretty,
        "encryption": encryption,
        "permissions": permissions,
        "owner_pw": owner_pw,
        "user_pw": user_pw,
        "no_new_id": no_new_id,
        "preserve_metadata": preserve_metadata,
        "use_objstms": use_objstms,
        "compression_effort": compression_effort,
    }
    return self.save(filename, **options)
def get_char_widths(
        doc: 'Document',
        xref: int,
        limit: int = 256,
        idx: int = 0,
        fontdict: OptDict = None,
        ) -> list:
    """Get list of glyph information of a font.

    Notes:
        Must be provided by its XREF number. If we already dealt with the
        font, it will be recorded in doc.FontInfos. Otherwise we insert an
        entry there.
        Finally we return the glyphs for the font. This is a list of
        (glyph, width) where glyph is an integer controlling the char
        appearance, and width is a float controlling the char's spacing:
        width * fontsize is the actual space.
        For 'simple' fonts, glyph == ord(char) will usually be true.
        Exceptions are 'Symbol' and 'ZapfDingbats'. We are providing data for
        these directly here.

    Args:
        xref: xref number of the font.
        limit: requested number of glyph entries; at least 256 are used.
        idx: passed through unchanged to doc._get_char_widths().
        fontdict: optional font properties with at least the keys "name",
            "ext" and "type". The keys "simple", "glyphs" and "ordering" are
            (re)computed here and stored into this dict.

    Returns:
        The glyph list, or None for CJK fonts.

    Raises:
        ValueError: the font extension is empty ("xref is not a font").
    """
    # Reserved CJK font names and the ordering value recorded for them.
    cjk_orderings = {
        "Fangti": 0, "Ming": 0,
        "Heiti": 1, "Song": 1,
        "Gothic": 2, "Mincho": 2,
        "Dotum": 3, "Batang": 3,
    }

    fontinfo = CheckFontInfo(doc, xref)
    if fontinfo is None:  # not recorded yet: create it
        if fontdict is None:
            name, ext, stype, asc, dsc = utils._get_font_properties(doc, xref)
            fontdict = {
                "name": name,
                "type": stype,
                "ext": ext,
                "ascender": asc,
                "descender": dsc,
            }
        else:
            # "simple" and "ordering" are always derived below, so they are
            # not required to be present in a caller-supplied dict.
            name = fontdict["name"]
            ext = fontdict["ext"]
            stype = fontdict["type"]

        if ext == "":
            raise ValueError("xref is not a font")

        # check for 'simple' fonts
        fontdict["simple"] = stype in ("Type1", "MMType1", "TrueType")

        # check for CJK fonts (-1 means: not a CJK font)
        ordering = cjk_orderings.get(name, -1)

        # glyph data for these two fonts is provided directly
        if name == "ZapfDingbats":
            glyphs = zapf_glyphs
        elif name == "Symbol":
            glyphs = symbol_glyphs
        else:
            glyphs = None

        fontdict["glyphs"] = glyphs
        fontdict["ordering"] = ordering

        fontinfo = [xref, fontdict]
        doc.FontInfos.append(fontinfo)
    else:
        fontdict = fontinfo[1]
        glyphs = fontdict["glyphs"]
        ordering = fontdict["ordering"]

    oldlimit = 0 if glyphs is None else len(glyphs)
    mylimit = max(256, limit)

    # enough glyphs already recorded: nothing to do
    if mylimit <= oldlimit:
        return glyphs

    if ordering < 0:  # not a CJK font
        glyphs = doc._get_char_widths(
            xref, fontdict["name"], fontdict["ext"], fontdict["ordering"], mylimit, idx
        )
    else:  # CJK fonts use char codes and width = 1
        glyphs = None

    fontdict["glyphs"] = glyphs
    fontinfo[1] = fontdict
    UpdateFontInfo(doc, fontinfo)

    return glyphs
""" if doc.is_closed or doc.is_encrypted: raise ValueError("document close or encrypted") t, name = doc.xref_get_key(xref, "Subtype") if t != "name" or name not in ("/Image", "/Form"): raise ValueError("bad object type at xref %i" % xref) t, oc = doc.xref_get_key(xref, "OC") if t != "xref": return 0 rc = int(oc.replace("0 R", "")) return rc def get_ocgs(self): """Show existing optional content groups.""" ci = mupdf.pdf_new_name( "CreatorInfo") pdf = _as_pdf_document(self) ocgs = mupdf.pdf_dict_getl( mupdf.pdf_dict_get( mupdf.pdf_trailer( pdf), PDF_NAME('Root')), PDF_NAME('OCProperties'), PDF_NAME('OCGs'), ) rc = dict() if not mupdf.pdf_is_array( ocgs): return rc n = mupdf.pdf_array_len( ocgs) for i in range(n): ocg = mupdf.pdf_array_get( ocgs, i) xref = mupdf.pdf_to_num( ocg) name = mupdf.pdf_to_text_string( mupdf.pdf_dict_get( ocg, PDF_NAME('Name'))) obj = mupdf.pdf_dict_getl( ocg, PDF_NAME('Usage'), ci, PDF_NAME('Subtype')) usage = None if obj.m_internal: usage = mupdf.pdf_to_name( obj) intents = list() intent = mupdf.pdf_dict_get( ocg, PDF_NAME('Intent')) if intent.m_internal: if mupdf.pdf_is_name( intent): intents.append( mupdf.pdf_to_name( intent)) elif mupdf.pdf_is_array( intent): m = mupdf.pdf_array_len( intent) for j in range(m): o = mupdf.pdf_array_get( intent, j) if mupdf.pdf_is_name( o): intents.append( mupdf.pdf_to_name( o)) if mupdf_version_tuple >= (1, 27): resource_stack = mupdf.PdfResourceStack() hidden = mupdf.pdf_is_ocg_hidden( pdf, resource_stack, usage, ocg) else: hidden = mupdf.pdf_is_ocg_hidden( pdf, mupdf.PdfObj(), usage, ocg) item = { "name": name, "intent": intents, "on": not hidden, "usage": usage, } temp = xref rc[ temp] = item return rc def get_ocmd(doc: 'Document', xref: int) -> dict: """Return the definition of an OCMD (optional content membership dictionary). Recognizes PDF dict keys /OCGs (PDF array of OCGs), /P (policy string) and /VE (visibility expression, PDF array). 
Via string manipulation, this info is converted to a Python dictionary with keys "xref", "ocgs", "policy" and "ve" - ready to recycle as input for 'set_ocmd()'. """ if xref not in range(doc.xref_length()): raise ValueError("bad xref") text = doc.xref_object(xref, compressed=True) if "/Type/OCMD" not in text: raise ValueError("bad object type") textlen = len(text) p0 = text.find("/OCGs[") # look for /OCGs key p1 = text.find("]", p0) if p0 < 0 or p1 < 0: # no OCGs found ocgs = None else: ocgs = text[p0 + 6 : p1].replace("0 R", " ").split() ocgs = list(map(int, ocgs)) p0 = text.find("/P/") # look for /P policy key if p0 < 0: policy = None else: p1 = text.find("ff", p0) if p1 < 0: p1 = text.find("on", p0) if p1 < 0: # some irregular syntax raise ValueError("bad object at xref") else: policy = text[p0 + 3 : p1 + 2] p0 = text.find("/VE[") # look for /VE visibility expression key if p0 < 0: # no visibility expression found ve = None else: lp = rp = 0 # find end of /VE by finding last ']'. p1 = p0 while lp < 1 or lp != rp: p1 += 1 if not p1 < textlen: # some irregular syntax raise ValueError("bad object at xref") if text[p1] == "[": lp += 1 if text[p1] == "]": rp += 1 # p1 now positioned at the last "]" ve = text[p0 + 3 : p1 + 1] # the PDF /VE array ve = ( ve.replace("/And", '"and",') .replace("/Not", '"not",') .replace("/Or", '"or",') ) ve = ve.replace(" 0 R]", "]").replace(" 0 R", ",").replace("][", "],[") import json try: ve = json.loads(ve) except Exception: exception_info() message(f"bad /VE key: {ve!r}") raise return {"xref": xref, "ocgs": ocgs, "policy": policy, "ve": ve} def get_outline_xrefs(self): """Get list of outline xref numbers.""" xrefs = [] pdf = _as_pdf_document(self, required=0) if not pdf.m_internal: return xrefs root = mupdf.pdf_dict_get(mupdf.pdf_trailer(pdf), PDF_NAME('Root')) if not root.m_internal: return xrefs olroot = mupdf.pdf_dict_get(root, PDF_NAME('Outlines')) if not olroot.m_internal: return xrefs first = mupdf.pdf_dict_get(olroot, 
def get_page_numbers(doc, label, only_one=False):
    """Return a list of page numbers with the given label.

    Args:
        doc: PDF document object (resp. 'self').
        label: (str) label.
        only_one: (bool) stop searching after first hit.

    Returns:
        List of page numbers having this label. Empty if no label is given
        or the document defines no page labels.
    """
    # Jorj McKie, 2021-01-06

    hits = []
    if not label:
        return hits
    rules = doc._get_page_labels()
    if rules == []:
        return hits

    # lazily walk the pages, yielding those whose label matches
    matching = (
        pno
        for pno in range(doc.page_count)
        if utils.get_label_pno(pno, rules) == label
    )
    if only_one:
        first = next(matching, None)
        if first is not None:
            hits.append(first)
        return hits
    hits.extend(matching)
    return hits
def get_page_text(
        doc: 'Document',
        pno: int,
        option: str = "text",
        clip: rect_like = None,
        flags: OptInt = None,
        textpage: 'TextPage' = None,
        sort: bool = False,
        ) -> typing.Any:
    """Extract a document page's text by page number.

    Notes:
        Convenience function calling page.get_text().
    Args:
        pno: page number
        option: (str) text, words, blocks, html, dict, json, rawdict, xhtml or xml.
        clip: (rect-like) passed on to page.get_text().
        flags: (int) passed on to page.get_text().
        textpage: a TextPage to use, passed on to page.get_text().
        sort: (bool) passed on to page.get_text().
    Returns:
        output from page.get_text().
    """
    # 'textpage' must be forwarded too, otherwise a caller-supplied TextPage
    # is silently ignored.
    return doc[pno].get_text(
        option,
        clip=clip,
        flags=flags,
        textpage=textpage,
        sort=sort,
    )
""" def recurse(olItem, liste, lvl): """Recursively follow the outline item chain and record item information in a list.""" while olItem and olItem.this.m_internal: if olItem.title: title = olItem.title else: title = " " if not olItem.is_external: if olItem.uri: if olItem.page == -1: resolve = doc.resolve_link(olItem.uri) page = resolve[0] + 1 else: page = olItem.page + 1 else: page = -1 else: page = -1 if not simple: link = utils.getLinkDict(olItem, doc) liste.append([lvl, title, page, link]) else: liste.append([lvl, title, page]) if olItem.down: liste = recurse(olItem.down, liste, lvl + 1) olItem = olItem.next return liste # ensure document is open if doc.is_closed: raise ValueError("document closed") doc.init_doc() olItem = doc.outline if not olItem: return [] lvl = 1 liste = [] toc = recurse(olItem, liste, lvl) if doc.is_pdf and not simple: doc._extend_toc_items(toc) return toc def get_xml_metadata(self): """Get document XML metadata.""" xml = None pdf = _as_pdf_document(self, required=0) if pdf.m_internal: xml = mupdf.pdf_dict_getl( mupdf.pdf_trailer(pdf), PDF_NAME('Root'), PDF_NAME('Metadata'), ) if xml is not None and xml.m_internal: buff = mupdf.pdf_load_stream(xml) rc = JM_UnicodeFromBuffer(buff) else: rc = '' return rc def has_annots(doc: 'Document') -> bool: """Check whether there are annotations on any page.""" if doc.is_closed: raise ValueError("document closed") if not doc.is_pdf: raise ValueError("is no PDF") for i in range(doc.page_count): for item in doc.page_annot_xrefs(i): # pylint: disable=no-member if not (item[1] == mupdf.PDF_ANNOT_LINK or item[1] == mupdf.PDF_ANNOT_WIDGET): # pylint: disable=no-member return True return False def has_links(doc: 'Document') -> bool: """Check whether there are links on any page.""" if doc.is_closed: raise ValueError("document closed") if not doc.is_pdf: raise ValueError("is no PDF") for i in range(doc.page_count): for item in doc.page_annot_xrefs(i): if item[1] == mupdf.PDF_ANNOT_LINK: # pylint: 
def init_doc(self):
    """(Re)load the outline and the metadata dictionary of the document.

    Sets self._outline and self.metadata. The 'encryption' entry is None if
    the underlying value is the string 'None'.

    Raises:
        ValueError: the document is still encrypted.
    """
    if self.is_encrypted:
        raise ValueError("cannot initialize - document still encrypted")
    self._outline = self._loadOutline()
    # metadata key -> lookup key understood by _getMetadata()
    lookup_keys = {
        'format': 'format',
        'title': 'info:Title',
        'author': 'info:Author',
        'subject': 'info:Subject',
        'keywords': 'info:Keywords',
        'creator': 'info:Creator',
        'producer': 'info:Producer',
        'creationDate': 'info:CreationDate',
        'modDate': 'info:ModDate',
        'trapped': 'info:Trapped',
    }
    self.metadata = {k: self._getMetadata(v) for k, v in lookup_keys.items()}
    # fetch once; the literal string 'None' means "no encryption"
    encryption = self._getMetadata('encryption')
    self.metadata['encryption'] = None if encryption == 'None' else encryption
""" page = doc.new_page(pno=pno, width=width, height=height) if not bool(text): return 0 rc = page.insert_text( (50, 72), text, fontsize=fontsize, fontname=fontname, fontfile=fontfile, color=color, ) return rc def insert_pdf( self, docsrc, *, from_page=-1, to_page=-1, start_at=-1, rotate=-1, links=1, annots=1, widgets=1, join_duplicates=0, show_progress=0, final=1, _gmap=None, ): """Insert a page range from another PDF. Args: docsrc: PDF to copy from. Must be different object, but may be same file. from_page: (int) first source page to copy, 0-based, default 0. to_page: (int) last source page to copy, 0-based, default last page. start_at: (int) from_page will become this page number in target. rotate: (int) rotate copied pages, default -1 is no change. links: (int/bool) whether to also copy links. annots: (int/bool) whether to also copy annotations. widgets: (int/bool) whether to also copy form fields. join_duplicates: (int/bool) join or rename duplicate widget names. show_progress: (int) progress message interval, 0 is no messages. final: (bool) indicates last insertion from this source PDF. _gmap: internal use only Copy sequence reversed if from_page > to_page.""" # Insert pages from a source PDF into this PDF. # For reconstructing the links (_do_links method), we must save the # insertion point (start_at) if it was specified as -1. 
#log( 'insert_pdf(): start') if self.is_closed or self.is_encrypted: raise ValueError("document closed or encrypted") if self._graft_id == docsrc._graft_id: raise ValueError("source and target cannot be same object") sa = start_at if sa < 0: sa = self.page_count outCount = self.page_count srcCount = docsrc.page_count # local copies of page numbers fp = from_page tp = to_page sa = start_at # normalize page numbers fp = max(fp, 0) # -1 = first page fp = min(fp, srcCount - 1) # but do not exceed last page if tp < 0: tp = srcCount - 1 # -1 = last page tp = min(tp, srcCount - 1) # but do not exceed last page if sa < 0: sa = outCount # -1 = behind last page sa = min(sa, outCount) # but that is also the limit if len(docsrc) > show_progress > 0: inname = os.path.basename(docsrc.name) if not inname: inname = "memory PDF" outname = os.path.basename(self.name) if not outname: outname = "memory PDF" message("Inserting '%s' at '%s'" % (inname, outname)) # retrieve / make a Graftmap to avoid duplicate objects #log( 'insert_pdf(): Graftmaps') isrt = docsrc._graft_id _gmap = self.Graftmaps.get(isrt, None) if _gmap is None: #log( 'insert_pdf(): Graftmaps2') _gmap = Graftmap(self) self.Graftmaps[isrt] = _gmap if g_use_extra: #log( 'insert_pdf(): calling extra_FzDocument_insert_pdf()') extra_FzDocument_insert_pdf( self.this, docsrc.this, from_page, to_page, start_at, rotate, links, annots, show_progress, final, _gmap, ) #log( 'insert_pdf(): extra_FzDocument_insert_pdf() returned.') else: pdfout = _as_pdf_document(self) pdfsrc = _as_pdf_document(docsrc) if not pdfout.m_internal or not pdfsrc.m_internal: raise TypeError( "source or target not a PDF") ENSURE_OPERATION(pdfout) JM_merge_range(pdfout, pdfsrc, fp, tp, sa, rotate, links, annots, show_progress, _gmap) #log( 'insert_pdf(): calling self._reset_page_refs()') self._reset_page_refs() if links: #log( 'insert_pdf(): calling self._do_links()') self._do_links(docsrc, from_page=fp, to_page=tp, start_at=sa) if widgets: 
self._do_widgets(docsrc, _gmap, from_page=fp, to_page=tp, start_at=sa, join_duplicates=join_duplicates) if final == 1: self.Graftmaps[isrt] = None #log( 'insert_pdf(): returning') @property def is_dirty(self): pdf = _as_pdf_document(self, required=0) if not pdf.m_internal: return False r = mupdf.pdf_has_unsaved_changes(pdf) return True if r else False @property def is_fast_webaccess(self): ''' Check whether we have a linearized PDF. ''' pdf = _as_pdf_document(self, required=0) if pdf.m_internal: return mupdf.pdf_doc_was_linearized(pdf) return False # gracefully handle non-PDF @property def is_form_pdf(self): """Either False or PDF field count.""" pdf = _as_pdf_document(self, required=0) if not pdf.m_internal: return False count = -1 try: fields = mupdf.pdf_dict_getl( mupdf.pdf_trailer(pdf), mupdf.PDF_ENUM_NAME_Root, mupdf.PDF_ENUM_NAME_AcroForm, mupdf.PDF_ENUM_NAME_Fields, ) if mupdf.pdf_is_array(fields): count = mupdf.pdf_array_len(fields) except Exception: if g_exceptions_verbose: exception_info() return False if count >= 0: return count return False @property def is_pdf(self): """Check for PDF.""" if isinstance(self.this, mupdf.PdfDocument): return True # Avoid calling smupdf.pdf_specifics because it will end up creating # a new PdfDocument which will call pdf_create_document(), which is ok # but a little unnecessary. 
@property
def is_reflowable(self):
    """Check if document is layoutable.

    Raises:
        ValueError: the document is closed.
    """
    if self.is_closed:
        raise ValueError("document closed")
    # Hand the underlying MuPDF document to the binding (as layout() and
    # next_location() do), not this Python-level wrapper object.
    return bool(mupdf.fz_is_document_reflowable(_as_fz_document(self)))
def journal_start_op(self, name=None):
    """Begin a journalling operation.

    Args:
        name: name of the operation. If empty or None, an implicit
            operation is begun instead.

    Raises:
        ValueError: document is closed or encrypted.
        RuntimeError: journalling has not been enabled.
    """
    unusable = self.is_closed or self.is_encrypted
    if unusable:
        raise ValueError("document closed or encrypted")
    pdf_doc = _as_pdf_document(self)
    if not pdf_doc.m_internal.journal:
        raise RuntimeError( "Journalling not enabled")
    if not name:
        mupdf.pdf_begin_implicit_operation(pdf_doc)
        return
    mupdf.pdf_begin_operation(pdf_doc, name)
def layer_ui_configs(self):
    """Show OC visibility status modifiable by user.

    Returns:
        A list with one dict per layer UI entry, having the keys "number",
        "text", "depth", "type" ("checkbox", "radiobox" or "label"), "on"
        and "locked".
    """
    type_names = {1: "checkbox", 2: "radiobox"}  # anything else: "label"
    pdf_doc = _as_pdf_document(self)
    ui_info = mupdf.PdfLayerConfigUi()
    count = mupdf.pdf_count_layer_config_ui( pdf_doc)
    configs = []
    for number in range(count):
        # fills 'ui_info' with the data of entry 'number'
        mupdf.pdf_layer_config_ui_info( pdf_doc, number, ui_info)
        configs.append(
            {
                "number": number,
                "text": ui_info.text,
                "depth": ui_info.depth,
                "type": type_names.get(ui_info.type, "label"),
                "on": ui_info.selected,
                "locked": ui_info.locked,
            }
        )
    return configs
""" if self.is_closed or self.is_encrypted: raise ValueError("document closed or encrypted") if page_id is None: page_id = 0 if page_id not in self: raise ValueError("page not in document") if type(page_id) is int and page_id < 0: np = self.page_count while page_id < 0: page_id += np if isinstance(page_id, int): page = mupdf.fz_load_page(self.this, page_id) else: chapter, pagenum = page_id page = mupdf.fz_load_chapter_page(self.this, chapter, pagenum) val = Page(page, self) val.thisown = True val.parent = self self._page_refs[id(val)] = val val._annot_refs = weakref.WeakValueDictionary() val.number = page_id return val def location_from_page_number(self, pno): """Convert pno to (chapter, page).""" if self.is_closed: raise ValueError("document closed") this_doc = self.this loc = mupdf.fz_make_location(-1, -1) page_count = mupdf.fz_count_pages(this_doc) while pno < 0: pno += page_count if pno >= page_count: raise ValueError( MSG_BAD_PAGENO) loc = mupdf.fz_location_from_page_number(this_doc, pno) return loc.chapter, loc.page def make_bookmark(self, loc): """Make a page pointer before layouting document.""" if self.is_closed or self.is_encrypted: raise ValueError("document closed or encrypted") loc = mupdf.FzLocation(*loc) mark = mupdf.ll_fz_make_bookmark2( self.this.m_internal, loc.internal()) return mark @property def markinfo(self) -> dict: """Return the PDF MarkInfo value.""" xref = self.pdf_catalog() if xref == 0: return None rc = self.xref_get_key(xref, "MarkInfo") if rc[0] == "null": return {} if rc[0] == "xref": xref = int(rc[1].split()[0]) val = self.xref_object(xref, compressed=True) elif rc[0] == "dict": val = rc[1] else: val = None if val is None or not (val[:2] == "<<" and val[-2:] == ">>"): return {} valid = {"Marked": False, "UserProperties": False, "Suspects": False} val = val[2:-2].split("/") for v in val[1:]: try: key, value = v.split() except Exception: if g_exceptions_verbose > 1: exception_info() return valid if value == "true": valid[key] = True 
def need_appearances(self, value=None):
    """Get/set the NeedAppearances value.

    Args:
        value: None to only inquire the current value; otherwise its
            truthiness is written as /NeedAppearances into the AcroForm
            dictionary.

    Returns:
        None if the document is not a Form PDF, or - when inquiring - if
        the key is not present as a boolean.
        When inquiring: the stored boolean.
        When setting: 'value' as it was passed in.
    """
    if not self.is_form_pdf:
        return None

    pdf = _as_pdf_document(self)
    appkey = "NeedAppearances"

    form = mupdf.pdf_dict_getp(
            mupdf.pdf_trailer(pdf),
            "Root/AcroForm",
            )

    if value is None:
        # Pure inquiry: report the stored value and leave the form
        # untouched. (Writing here would silently reset the flag to false.)
        app = mupdf.pdf_dict_gets(form, appkey)
        if mupdf.pdf_is_bool(app):
            return bool(mupdf.pdf_to_bool(app))
        return None

    if value:
        mupdf.pdf_dict_puts(form, appkey, mupdf.PDF_TRUE)
    else:
        mupdf.pdf_dict_puts(form, appkey, mupdf.PDF_FALSE)
    return value
""" doc._newPage(pno, width=width, height=height) return doc[pno] def next_location(self, page_id): """Get (chapter, page) of next page.""" if self.is_closed or self.is_encrypted: raise ValueError("document closed or encrypted") if type(page_id) is int: page_id = (0, page_id) if page_id not in self: raise ValueError("page id not in document") if tuple(page_id) == self.last_location: return () this_doc = _as_fz_document(self) val = page_id[ 0] if not isinstance(val, int): RAISEPY(MSG_BAD_PAGEID, PyExc_ValueError) chapter = val val = page_id[ 1] pno = val loc = mupdf.fz_make_location(chapter, pno) next_loc = mupdf.fz_next_page( this_doc, loc) return next_loc.chapter, next_loc.page def page_annot_xrefs(self, n): if g_use_extra: return extra.page_annot_xrefs( self.this, n) if isinstance(self.this, mupdf.PdfDocument): page_count = mupdf.pdf_count_pages(self.this) pdf_document = self.this else: page_count = mupdf.fz_count_pages(self.this) pdf_document = _as_pdf_document(self) while n < 0: n += page_count if n > page_count: raise ValueError( MSG_BAD_PAGENO) page_obj = mupdf.pdf_lookup_page_obj(pdf_document, n) annots = JM_get_annot_xref_list(page_obj) return annots @property def page_count(self): """Number of pages.""" if self.is_closed: raise ValueError('document closed') if g_use_extra: return self.page_count2(self) if isinstance( self.this, mupdf.FzDocument): return mupdf.fz_count_pages( self.this) else: return mupdf.pdf_count_pages( self.this) def page_cropbox(self, pno): """Get CropBox of page number (without loading page).""" if self.is_closed: raise ValueError("document closed") this_doc = self.this page_count = mupdf.fz_count_pages( this_doc) n = pno while n < 0: n += page_count pdf = _as_pdf_document(self) if n >= page_count: raise ValueError( MSG_BAD_PAGENO) pageref = mupdf.pdf_lookup_page_obj( pdf, n) cropbox = JM_cropbox(pageref) val = JM_py_from_rect(cropbox) val = Rect(val) return val def page_number_from_location(self, page_id): """Convert (chapter, pno) to 
page number.""" if type(page_id) is int: np = self.page_count while page_id < 0: page_id += np page_id = (0, page_id) if page_id not in self: raise ValueError("page id not in document") chapter, pno = page_id loc = mupdf.fz_make_location( chapter, pno) page_n = mupdf.fz_page_number_from_location( self.this, loc) return page_n def page_xref(self, pno): """Get xref of page number.""" if g_use_extra: return extra.page_xref( self.this, pno) if self.is_closed: raise ValueError("document closed") page_count = mupdf.fz_count_pages(self.this) n = pno while n < 0: n += page_count pdf = _as_pdf_document(self) xref = 0 if n >= page_count: raise ValueError( MSG_BAD_PAGENO) xref = mupdf.pdf_to_num(mupdf.pdf_lookup_page_obj(pdf, n)) return xref @property def pagelayout(self) -> str: """Return the PDF PageLayout value. """ xref = self.pdf_catalog() if xref == 0: return None rc = self.xref_get_key(xref, "PageLayout") if rc[0] == "null": return "SinglePage" if rc[0] == "name": return rc[1][1:] return "SinglePage" @property def pagemode(self) -> str: """Return the PDF PageMode value. """ xref = self.pdf_catalog() if xref == 0: return None rc = self.xref_get_key(xref, "PageMode") if rc[0] == "null": return "UseNone" if rc[0] == "name": return rc[1][1:] return "UseNone" if sys.implementation.version < (3, 9): # Appending `[Page]` causes `TypeError: 'ABCMeta' object is not subscriptable`. _pages_ret = collections.abc.Iterable else: _pages_ret = collections.abc.Iterable[Page] def pages(self, start: OptInt =None, stop: OptInt =None, step: OptInt =None) -> _pages_ret: """Return a generator iterator over a page range. Arguments have the same meaning as for the range() built-in. 
def prev_location(self, page_id):
    """Return the location preceding ``page_id`` as ``(chapter, page)``.

    Args:
        page_id: either an ``int`` page number (interpreted as a page of
            chapter 0) or a ``(chapter, page)`` pair.

    Returns:
        ``(chapter, page)`` of the previous page, or the empty tuple ``()``
        when ``page_id`` is ``(0, 0)``.

    Raises:
        ValueError: if the document is closed or encrypted, or if
            ``page_id`` is not contained in the document.
    """
    unusable = self.is_closed or self.is_encrypted
    if unusable:
        raise ValueError("document closed or encrypted")

    # A plain integer is shorthand for "page N of chapter 0".  The exact
    # type test is deliberate: subclasses of int are left untouched.
    location = (0, page_id) if type(page_id) is int else page_id

    if not (location in self):
        raise ValueError("page id not in document")
    if location == (0, 0):
        # nothing precedes the very first page
        return ()

    chapter_no, page_no = location
    previous = mupdf.fz_previous_page(
        self.this,
        mupdf.fz_make_location(chapter_no, page_no),
    )
    return previous.chapter, previous.page
def rewrite_images(
    self,
    dpi_threshold=None,
    dpi_target=0,
    quality=0,
    lossy=True,
    lossless=True,
    bitonal=True,
    color=True,
    gray=True,
    set_to_gray=False,
    options=None,
):
    """Rewrite images in a PDF document.

    The typical use case is to reduce the size of the PDF by recompressing
    images. Default parameters will convert all images to JPEG where
    possible, using the specified resolutions and quality. Exclude
    undesired images by setting parameters to False.

    Args:
        dpi_threshold: look at images with a larger DPI only. A falsy
            value disables subsampling (threshold and target become 0).
        dpi_target: change eligible images to this DPI.
        quality: Quality of the recompressed images (0-100).
        lossy: process lossy image types (e.g. JPEG).
        lossless: process lossless image types (e.g. PNG).
        bitonal: process black-and-white images (e.g. FAX)
        color: process colored images.
        gray: process gray images.
        set_to_gray: whether to change the PDF to gray at process start.
        options: (PdfImageRewriterOptions) Custom options for image
            rewriting (optional). Expert use only. If provided, other
            parameters are ignored, except set_to_gray.

    Raises:
        ValueError: if ``dpi_target`` is positive and not smaller than
            ``dpi_threshold``, or if ``options`` carries attributes that
            a fresh PdfImageRewriterOptions object does not have.
    """
    quality_str = str(quality)
    if not dpi_threshold:
        dpi_threshold = dpi_target = 0
    if dpi_target > 0 and dpi_target >= dpi_threshold:
        # The f-prefix is essential: without it the user sees the literal
        # placeholders instead of the offending values.
        raise ValueError(f"{dpi_target=} must be less than {dpi_threshold=}")

    # Attribute names of a pristine options object; used below to check
    # that only existing options are set.
    template_opts = mupdf.PdfImageRewriterOptions()
    dir1 = set(dir(template_opts))

    if not options:
        opts = mupdf.PdfImageRewriterOptions()

        # Collect (attribute prefix, recompression method) for every image
        # class the caller selected.  Bitonal images have no lossy/lossless
        # distinction and are recompressed as FAX, all others as JPEG.
        selected = []
        if bitonal:
            selected.append(("bitonal_image", mupdf.FZ_RECOMPRESS_FAX))
        for enabled, colorspace in ((color, "color"), (gray, "gray")):
            if not enabled:
                continue
            if lossless:
                selected.append(
                    (f"{colorspace}_lossless_image", mupdf.FZ_RECOMPRESS_JPEG)
                )
            if lossy:
                selected.append(
                    (f"{colorspace}_lossy_image", mupdf.FZ_RECOMPRESS_JPEG)
                )

        # Every selected image class gets the same five settings.
        for prefix, method in selected:
            setattr(opts, f"{prefix}_recompress_method", method)
            setattr(opts, f"{prefix}_subsample_method", mupdf.FZ_SUBSAMPLE_AVERAGE)
            setattr(opts, f"{prefix}_subsample_to", dpi_target)
            setattr(opts, f"{prefix}_subsample_threshold", dpi_threshold)
            setattr(opts, f"{prefix}_recompress_quality", quality_str)
    else:
        opts = options

    # checking that only possible options were used
    dir2 = set(dir(opts))
    invalid_options = dir2 - dir1
    if invalid_options:
        raise ValueError(f"Invalid options: {invalid_options}")

    if set_to_gray:
        self.recolor(1)
    pdf = _as_pdf_document(self)
    mupdf.pdf_rewrite_images(pdf, opts)
Examples: {'__bookmark_1': {'page': 0, 'to': (0.0, 541.0), 'zoom': 0.0}, '__bookmark_2': {'page': 0, 'to': (0.0, 481.45), 'zoom': 0.0}} or '21154a7c20684ceb91f9c9adc3b677c40': {'page': -1, 'dest': '/XYZ 15.75 1486 0'}, ... """ if hasattr(self, "_resolved_names"): # do not execute multiple times! return self._resolved_names # this is a backward listing of page xref to page number page_xrefs = {self.page_xref(i): i for i in range(self.page_count)} def obj_string(obj): """Return string version of a PDF object definition.""" buffer = mupdf.fz_new_buffer(512) output = mupdf.FzOutput(buffer) mupdf.pdf_print_obj(output, obj, 1, 0) output.fz_close_output() return JM_UnicodeFromBuffer(buffer) def get_array(val): """Generate value of one item of the names dictionary.""" templ_dict = {"page": -1, "dest": ""} # value template if val.pdf_is_indirect(): val = mupdf.pdf_resolve_indirect(val) if val.pdf_is_array(): array = obj_string(val) elif val.pdf_is_dict(): array = obj_string(mupdf.pdf_dict_gets(val, "D")) else: # if all fails return the empty template return templ_dict # replace PDF "null" by zero, omit the square brackets array = array.replace("null", "0")[1:-1] # find stuff before first "/" idx = array.find("/") if idx < 1: # this has no target page spec templ_dict["dest"] = array # return the orig. 
string return templ_dict subval = array[:idx].strip() # stuff before "/" array = array[idx:] # stuff from "/" onwards templ_dict["dest"] = array # if we start with /XYZ: extract x, y, zoom # 1, 2 or 3 of these values may actually be supplied if array.startswith("/XYZ"): del templ_dict["dest"] # don't return orig string in this case # make a list of the 3 tokens following "/XYZ" array_list = array.split()[1:4] # omit "/XYZ" # fill up missing tokens with "0" strings while len(array_list) < 3: # fill up if too short array_list.append("0") # add missing values # make list of 3 floats: x, y and zoom t = list(map(float, array_list)) # the resulting x, y, z values templ_dict["to"] = (t[0], t[1]) templ_dict["zoom"] = t[2] # extract page number if subval.endswith("0 R"): # page xref given? templ_dict["page"] = page_xrefs.get(int(subval.split()[0]),-1) else: # naked page number given templ_dict["page"] = int(subval) return templ_dict def fill_dict(dest_dict, pdf_dict): """Generate name resolution items for pdf_dict. This may be either "/Names/Dests" or just "/Dests" """ # length of the PDF dictionary name_count = mupdf.pdf_dict_len(pdf_dict) # extract key-val of each dict item for i in range(name_count): key = mupdf.pdf_dict_get_key(pdf_dict, i) val = mupdf.pdf_dict_get_val(pdf_dict, i) if key.pdf_is_name(): # this should always be true! 
def save(
    self,
    filename,
    garbage=0,
    clean=0,
    deflate=0,
    deflate_images=0,
    deflate_fonts=0,
    incremental=0,
    ascii=0,
    expand=0,
    linear=0,
    no_new_id=0,
    appearance=0,
    pretty=0,
    encryption=1,
    permissions=4095,
    owner_pw=None,
    user_pw=None,
    preserve_metadata=1,
    use_objstms=0,
    compression_effort=0,
):
    """Save PDF to file, pathlib.Path or file pointer.

    ``filename`` may be a ``str``, an object with an ``open`` attribute
    (treated as a path and converted with ``str()``), an object with a
    ``name`` attribute (its ``name`` is used as the target path), or an
    object with a ``seek`` attribute (written to as a file pointer).
    The remaining arguments are copied onto a ``PdfWriteOptions`` object.

    Raises:
        ValueError: for a closed or encrypted document, an unusable
            ``filename``, a non-incremental save onto the original file,
            ``linear`` combined with ``use_objstms``, a document without
            pages, an incremental save that does not target the original
            file, or a password longer than 40 characters.
    """
    if self.is_closed or self.is_encrypted:
        raise ValueError("document closed or encrypted")

    # Reduce the accepted target kinds to "str path" or "file pointer".
    if type(filename) is not str:
        if hasattr(filename, "open"):  # assume: pathlib.Path
            filename = str(filename)
        elif hasattr(filename, "name"):  # assume: file object
            filename = filename.name
        elif not hasattr(filename, "seek"):  # assume file object
            raise ValueError("filename must be str, Path or file object")

    if filename == self.name and not incremental:
        raise ValueError("save to original must be incremental")
    if linear and use_objstms:
        raise ValueError("'linear' and 'use_objstms' cannot both be requested")
    if self.page_count < 1:
        raise ValueError("cannot save with zero pages")
    if incremental and (self.name != filename or self.stream):
        raise ValueError("incremental needs original file")
    if any(pw and len(pw) > 40 for pw in (user_pw, owner_pw)):
        raise ValueError("password length must not exceed 40")

    pdf = _as_pdf_document(self)

    write_opts = mupdf.PdfWriteOptions()
    write_opts.do_incremental = incremental
    write_opts.do_ascii = ascii
    write_opts.do_compress = deflate
    write_opts.do_compress_images = deflate_images
    write_opts.do_compress_fonts = deflate_fonts
    write_opts.do_decompress = expand
    write_opts.do_garbage = garbage
    write_opts.do_pretty = pretty
    write_opts.do_linear = linear
    # 'clean' drives both the clean and the sanitize switch
    write_opts.do_clean = clean
    write_opts.do_sanitize = clean
    write_opts.dont_regenerate_id = no_new_id
    write_opts.do_appearance = appearance
    write_opts.do_encrypt = encryption
    write_opts.permissions = permissions

    # Without an explicit owner password the user password doubles as one.
    owner_secret = user_pw if owner_pw is None else owner_pw
    if owner_secret is not None:
        write_opts.opwd_utf8_set_value(owner_secret)
    if user_pw is not None:
        write_opts.upwd_utf8_set_value(user_pw)

    write_opts.do_preserve_metadata = preserve_metadata
    write_opts.do_use_objstms = use_objstms
    write_opts.compression_effort = compression_effort

    pdf.m_internal.resynth_required = 0
    JM_embedded_clean(pdf)
    if no_new_id == 0:
        JM_ensure_identity(pdf)

    if isinstance(filename, str):
        mupdf.pdf_save_document(pdf, filename, write_opts)
    else:
        sink = JM_new_output_fileptr(filename)
        mupdf.pdf_write_document(pdf, sink, write_opts)
        sink.fz_close_output()
def scrub(
    doc: 'Document',
    attached_files: bool = True,
    clean_pages: bool = True,
    embedded_files: bool = True,
    hidden_text: bool = True,
    javascript: bool = True,
    metadata: bool = True,
    redactions: bool = True,
    redact_images: int = 0,
    remove_links: bool = True,
    reset_fields: bool = True,
    reset_responses: bool = True,
    thumbnails: bool = True,
    xml_metadata: bool = True,
) -> None:
    """Remove potentially sensitive data from a PDF.

    Similar to the Adobe Acrobat 'sanitize' function.

    Args:
        attached_files: overwrite the content of file attachment
            annotations with a single space.
        clean_pages: clean the page /Contents. If False, 'hidden_text'
            and 'redactions' are switched off as well.
        embedded_files: delete all embedded files.
        hidden_text: remove text drawn while render mode "3 Tr" is active.
        javascript: replace /JavaScript action objects by an empty script.
        metadata: reset the standard metadata via set_metadata({}).
        redactions: apply redaction annotations found on a page.
        redact_images: passed on as 'images' to page.apply_redactions().
        remove_links: delete all links of every page.
        reset_fields: reset all form fields (widgets).
        reset_responses: delete the responses of every annotation.
        thumbnails: set the /Thumb key of every page to null.
        xml_metadata: delete XML metadata and /Metadata references.

    Raises:
        ValueError: if the document is no PDF, is closed or encrypted,
            or contains an xref without an object definition.
    """

    def remove_hidden(cont_lines):
        """Remove hidden text from a PDF page.

        Args:
            cont_lines: list of lines with /Contents content. Should have
                status from after page.cleanContents().

        Returns:
            List of /Contents lines from which hidden text has been
            removed, or None if no "3 Tr" operator was encountered.

        Notes:
            The input must have been created after the page's /Contents
            object(s) have been cleaned with page.cleanContents(). This
            ensures a standard formatting: one command per line, single
            spaces between operators. This allows for drastic
            simplification of this code.
        """
        out_lines = []  # will return this
        in_text = False  # indicate if within BT/ET object
        suppress = False  # indicate text suppression active
        make_return = False
        for line in cont_lines:
            if line == b"BT":  # start of text object
                in_text = True
                out_lines.append(line)
                continue
            if line == b"ET":  # end of text object
                in_text = False
                out_lines.append(line)
                continue
            if line == b"3 Tr":  # text suppression operator
                suppress = True
                make_return = True
                continue
            # Slice instead of indexing: indexing bytes yields an int,
            # which never equals b"3", making the test meaningless.
            if line[-2:] == b"Tr" and line[:1] != b"3":
                suppress = False  # text rendering changed
                out_lines.append(line)
                continue
            if line == b"Q":  # unstack command also switches off
                suppress = False
                out_lines.append(line)
                continue
            if suppress and in_text:  # suppress hidden lines
                continue
            out_lines.append(line)
        return out_lines if make_return else None

    if not doc.is_pdf:  # only works for PDF
        raise ValueError("is no PDF")
    if doc.is_encrypted or doc.is_closed:
        raise ValueError("closed or encrypted doc")

    if not clean_pages:
        # both of these depend on cleaned page contents
        hidden_text = False
        redactions = False

    if metadata:
        doc.set_metadata({})  # remove standard metadata

    for page in doc:
        if reset_fields:
            # reset form fields (widgets)
            for widget in page.widgets():
                widget.reset()

        if remove_links:
            for link in page.get_links():
                page.delete_link(link)

        found_redacts = False
        for annot in page.annots():
            if annot.type[0] == mupdf.PDF_ANNOT_FILE_ATTACHMENT and attached_files:
                annot.update_file(buffer_=b" ")  # set file content to empty
            if reset_responses:
                annot.delete_responses()
            if annot.type[0] == mupdf.PDF_ANNOT_REDACT:  # pylint: disable=no-member
                found_redacts = True

        if redactions and found_redacts:
            page.apply_redactions(images=redact_images)

        if clean_pages:
            page.clean_contents()
            xrefs = page.get_contents()
            if hidden_text and xrefs:
                assert len(xrefs) == 1  # only one because of cleaning.
                xref = xrefs[0]
                cont = doc.xref_stream(xref)
                cont_lines = remove_hidden(cont.splitlines())
                if cont_lines:  # something was actually removed
                    doc.update_stream(xref, b"\n".join(cont_lines))

        # Thumbnail removal does not depend on the page contents, so it
        # must also run when contents are not cleaned or are empty.
        if thumbnails:
            if doc.xref_get_key(page.xref, "Thumb")[0] != "null":
                doc.xref_set_key(page.xref, "Thumb", "null")

    # pages are scrubbed, now perform document-wide scrubbing
    if embedded_files:
        for name in doc.embfile_names():
            doc.embfile_del(name)

    if xml_metadata:
        doc.del_xml_metadata()

    # the xref table only needs scanning for these two options
    if not (xml_metadata or javascript):
        xref_limit = 0
    else:
        xref_limit = doc.xref_length()

    for xref in range(1, xref_limit):
        if not doc.xref_object(xref):
            msg = "bad xref %i - clean PDF before scrubbing" % xref
            raise ValueError(msg)

        if javascript and doc.xref_get_key(xref, "S")[1] == "/JavaScript":
            # a /JavaScript action object: replace with a null JavaScript
            doc.update_object(xref, "<</S/JavaScript/JS()>>")
            continue  # no further handling

        if not xml_metadata:
            continue

        if doc.xref_get_key(xref, "Type")[1] == "/Metadata":
            # delete any metadata object directly
            doc.update_object(xref, "<<>>")
            doc.update_stream(xref, b"deleted", new=True)
            continue

        if doc.xref_get_key(xref, "Metadata")[0] != "null":
            doc.xref_set_key(xref, "Metadata", "null")
def set_layer(self, config, basestate=None, on=None, off=None, rbgroups=None, locked=None):
    """Set the PDF keys /ON, /OFF, /RBGroups of an OC layer.

    Args:
        config: index into the /Configs array of /OCProperties, or -1
            to address the default configuration /D.
        basestate: "ON", "OFF" or "Unchanged" (any casing).
        on, off, locked: list or tuple of OCG xref numbers.
        rbgroups: list or tuple of lists / tuples of OCG xref numbers.

    Raises:
        ValueError: for a closed document, a document without optional
            content, arguments of the wrong type, xref numbers that are
            no OCGs, a bad 'basestate' or a non-existing configuration.
    """
    if self.is_closed:
        raise ValueError("document closed")

    known = set(self.get_ocgs().keys())
    if not known:
        raise ValueError("document has no optional content")

    # 'on', 'off' and 'locked' obey exactly the same rules
    for label, xrefs in (("on", on), ("off", off), ("locked", locked)):
        if not xrefs:
            continue
        if type(xrefs) not in (list, tuple):
            raise ValueError("bad type: '%s'" % label)
        unknown = set(xrefs).difference(known)
        if unknown:
            raise ValueError("bad OCGs in '%s': %s" % (label, unknown))

    if rbgroups:
        if type(rbgroups) not in (list, tuple):
            raise ValueError("bad type: 'rbgroups'")
        for group in rbgroups:
            if type(group) not in (list, tuple):
                raise ValueError("bad RBGroup '%s'" % group)
            unknown = set(group).difference(known)
            if unknown:
                raise ValueError("bad OCGs in RBGroup: %s" % unknown)

    if basestate:
        # normalise casing; only "Unchanged" is spelled in mixed case
        basestate = str(basestate).upper()
        if basestate == "UNCHANGED":
            basestate = "Unchanged"
        if basestate not in ("ON", "OFF", "Unchanged"):
            raise ValueError("bad 'basestate'")

    pdf = _as_pdf_document(self)
    root = mupdf.pdf_trailer(pdf)
    ocp = mupdf.pdf_dict_getl(root, PDF_NAME('Root'), PDF_NAME('OCProperties'))
    if not ocp.m_internal:
        return

    if config == -1:
        target = mupdf.pdf_dict_get(ocp, PDF_NAME('D'))
    else:
        configs = mupdf.pdf_dict_get(ocp, PDF_NAME('Configs'))
        target = mupdf.pdf_array_get(configs, config)
    if not target.m_internal:
        raise ValueError(MSG_BAD_OC_CONFIG)

    JM_set_ocg_arrays(target, basestate, on, off, rbgroups, locked)
    mupdf.ll_pdf_read_ocg(pdf.m_internal)
def set_markinfo(self, markinfo: dict) -> bool:
    """Set the PDF MarkInfo values.

    Args:
        markinfo: dictionary with any of the keys "Marked",
            "UserProperties" and "Suspects". Keys that are not supplied
            are written as false.

    Returns:
        False if 'markinfo' is empty or no dictionary, else True.

    Raises:
        ValueError: if the document has no PDF catalog, if unknown keys
            are present, or if a value does not stringify to
            "true" / "false" (case-insensitively).
    """
    catalog = self.pdf_catalog()
    if catalog == 0:
        raise ValueError("not a PDF")
    if not (markinfo and isinstance(markinfo, dict)):
        return False

    # start from the defaults, then overlay what the caller supplied
    settings = dict.fromkeys(("Marked", "UserProperties", "Suspects"), False)
    unknown = set(markinfo.keys()).difference(settings.keys())
    if unknown:
        raise ValueError(f"bad MarkInfo key(s): {unknown}")
    settings.update(markinfo)

    entries = []
    for name, flag in settings.items():
        text = str(flag).lower()
        if text not in ("true", "false"):
            raise ValueError(f"bad key value '{name}': '{text}'")
        entries.append(f"/{name} {text}")

    self.xref_set_key(catalog, "MarkInfo", "<<" + "".join(entries) + ">>")
    return True
def set_oc(doc: 'Document', xref: int, oc: int) -> None:
    """Attach optional content object to image or form xobject.

    Args:
        xref: (int) xref number of an image or form xobject
        oc: (int) xref number of an OCG or OCMD. Use 0 to remove an
            existing /OC entry; if the object has none, nothing is
            written.

    Raises:
        ValueError: if the document is closed or encrypted, if 'xref'
            is no /Image or /Form object, or if a positive 'oc' is no
            /OCG or /OCMD object.
    """
    if doc.is_closed or doc.is_encrypted:
        raise ValueError("document closed or encrypted")

    t, name = doc.xref_get_key(xref, "Subtype")
    if t != "name" or name not in ("/Image", "/Form"):
        raise ValueError("bad object type at xref %i" % xref)

    if oc > 0:
        t, name = doc.xref_get_key(oc, "Type")
        if t != "name" or name not in ("/OCG", "/OCMD"):
            raise ValueError("bad object type at xref %i" % oc)

    if oc == 0:
        # Removal request. Only touch the object when there is something
        # to remove; never fall through, because that would store the
        # invalid reference "0 0 R".
        if "OC" in doc.xref_get_keys(xref):
            doc.xref_set_key(xref, "OC", "null")
        return None

    doc.xref_set_key(xref, "OC", "%i 0 R" % oc)
    return None
def set_pagelayout(self, pagelayout: str):
    """Set the PDF PageLayout value.

    Args:
        pagelayout: one of "SinglePage", "OneColumn", "TwoColumnLeft",
            "TwoColumnRight", "TwoPageLeft", "TwoPageRight". Casing is
            ignored and a single leading "/" is accepted.

    Returns:
        True after the catalog entry /PageLayout has been written.

    Raises:
        ValueError: if the document has no PDF catalog or the value is
            empty or not one of the names above.
    """
    # lower-cased spelling -> spelling written to the PDF
    canonical = {
        name.lower(): name
        for name in (
            "SinglePage",
            "OneColumn",
            "TwoColumnLeft",
            "TwoColumnRight",
            "TwoPageLeft",
            "TwoPageRight",
        )
    }
    catalog = self.pdf_catalog()
    if catalog == 0:
        raise ValueError("not a PDF")
    if not pagelayout:
        raise ValueError("bad PageLayout value")

    requested = pagelayout[1:] if pagelayout[0] == "/" else pagelayout
    spelling = canonical.get(requested.lower())
    if spelling is None:
        raise ValueError("bad PageLayout value")

    self.xref_set_key(catalog, "PageLayout", f"/{spelling}")
    return True
def set_page_labels(doc, labels):
    """Add / replace page label definitions in PDF document.

    Args:
        doc: PDF document (resp. 'self').
        labels: list of label dictionaries like:
            {'startpage': int, 'prefix': str, 'style': str, 'firstpagenum': int},
            as returned by get_page_labels(). The list is not modified.
    """
    # William Chapman, 2021-01-06

    def create_label_str(label):
        """Convert Python label dict to corresponding PDF rule string.

        Args:
            label: (dict) build rule for the label.
        Returns:
            PDF label rule string wrapped in "<<", ">>".
        """
        s = "%i<<" % label["startpage"]
        if label.get("prefix", "") != "":
            s += "/P(%s)" % label["prefix"]
        if label.get("style", "") != "":
            s += "/S/%s" % label["style"]
        # a first page number of 1 is omitted
        if label.get("firstpagenum", 1) > 1:
            s += "/St %i" % label["firstpagenum"]
        s += ">>"
        return s

    def create_nums(labels):
        """Return concatenated string of all labels rules.

        Args:
            labels: (list) dictionaries as created by function 'rule_dict'.
        Returns:
            PDF compatible string for page label definitions, ready to be
            enclosed in PDF array 'Nums[...]'.
        """
        # sorted() instead of list.sort(): same (stable) ordering, but the
        # caller's list keeps its original order.
        ordered = sorted(labels, key=lambda x: x["startpage"])
        return "".join(create_label_str(label) for label in ordered)

    doc._set_page_labels(create_nums(labels))
""" if doc.is_closed or doc.is_encrypted: raise ValueError("document closed or encrypted") if not doc.is_pdf: raise ValueError("is no PDF") if not toc: # remove all entries return len(doc._delToC()) # validity checks -------------------------------------------------------- if type(toc) not in (list, tuple): raise ValueError("'toc' must be list or tuple") toclen = len(toc) page_count = doc.page_count t0 = toc[0] if type(t0) not in (list, tuple): raise ValueError("items must be sequences of 3 or 4 items") if t0[0] != 1: raise ValueError("hierarchy level of item 0 must be 1") for i in list(range(toclen - 1)): t1 = toc[i] t2 = toc[i + 1] if not -1 <= t1[2] <= page_count: raise ValueError("row %i: page number out of range" % i) if (type(t2) not in (list, tuple)) or len(t2) not in (3, 4): raise ValueError("bad row %i" % (i + 1)) if (type(t2[0]) is not int) or t2[0] < 1: raise ValueError("bad hierarchy level in row %i" % (i + 1)) if t2[0] > t1[0] + 1: raise ValueError("bad hierarchy level in row %i" % (i + 1)) # no formal errors in toc -------------------------------------------------- # -------------------------------------------------------------------------- # make a list of xref numbers, which we can use for our TOC entries # -------------------------------------------------------------------------- old_xrefs = doc._delToC() # del old outlines, get their xref numbers # prepare table of xrefs for new bookmarks old_xrefs = [] xref = [0] + old_xrefs xref[0] = doc._getOLRootNumber() # entry zero is outline root xref number if toclen > len(old_xrefs): # too few old xrefs? 
for i in range((toclen - len(old_xrefs))): xref.append(doc.get_new_xref()) # acquire new ones lvltab = {0: 0} # to store last entry per hierarchy level # ------------------------------------------------------------------------------ # contains new outline objects as strings - first one is the outline root # ------------------------------------------------------------------------------ olitems = [{"count": 0, "first": -1, "last": -1, "xref": xref[0]}] # ------------------------------------------------------------------------------ # build olitems as a list of PDF-like connected dictionaries # ------------------------------------------------------------------------------ for i in range(toclen): o = toc[i] lvl = o[0] # level title = get_pdf_str(o[1]) # title pno = min(doc.page_count - 1, max(0, o[2] - 1)) # page number page_xref = doc.page_xref(pno) page_height = doc.page_cropbox(pno).height top = Point(72, page_height - 36) dest_dict = {"to": top, "kind": LINK_GOTO} # fall back target if o[2] < 0: dest_dict["kind"] = LINK_NONE if len(o) > 3: # some target is specified if type(o[3]) in (int, float): # convert a number to a point dest_dict["to"] = Point(72, page_height - o[3]) else: # if something else, make sure we have a dict # We make a copy of o[3] to avoid modifying our caller's data. dest_dict = o[3].copy() if type(o[3]) is dict else dest_dict if "to" not in dest_dict: # target point not in dict? 
dest_dict["to"] = top # put default in else: # transform target to PDF coordinates page = doc[pno] point = Point(dest_dict["to"]) point.y = page.cropbox.height - point.y point = point * page.rotation_matrix dest_dict["to"] = (point.x, point.y) d = {} d["first"] = -1 d["count"] = 0 d["last"] = -1 d["prev"] = -1 d["next"] = -1 d["dest"] = utils.getDestStr(page_xref, dest_dict) d["top"] = dest_dict["to"] d["title"] = title d["parent"] = lvltab[lvl - 1] d["xref"] = xref[i + 1] d["color"] = dest_dict.get("color") d["flags"] = dest_dict.get("italic", 0) + 2 * dest_dict.get("bold", 0) lvltab[lvl] = i + 1 parent = olitems[lvltab[lvl - 1]] # the parent entry if ( dest_dict.get("collapse") or collapse and lvl > collapse ): # suppress expansion parent["count"] -= 1 # make /Count negative else: parent["count"] += 1 # positive /Count if parent["first"] == -1: parent["first"] = i + 1 parent["last"] = i + 1 else: d["prev"] = parent["last"] prev = olitems[parent["last"]] prev["next"] = i + 1 parent["last"] = i + 1 olitems.append(d) # ------------------------------------------------------------------------------ # now create each outline item as a string and insert it in the PDF # ------------------------------------------------------------------------------ for i, ol in enumerate(olitems): txt = "<<" if ol["count"] != 0: txt += "/Count %i" % ol["count"] try: txt += ol["dest"] except Exception: # Verbose in PyMuPDF/tests. if g_exceptions_verbose >= 2: exception_info() pass try: if ol["first"] > -1: txt += "/First %i 0 R" % xref[ol["first"]] except Exception: if g_exceptions_verbose >= 2: exception_info() pass try: if ol["last"] > -1: txt += "/Last %i 0 R" % xref[ol["last"]] except Exception: if g_exceptions_verbose >= 2: exception_info() pass try: if ol["next"] > -1: txt += "/Next %i 0 R" % xref[ol["next"]] except Exception: # Verbose in PyMuPDF/tests. 
if g_exceptions_verbose >= 2: exception_info() pass try: if ol["parent"] > -1: txt += "/Parent %i 0 R" % xref[ol["parent"]] except Exception: # Verbose in PyMuPDF/tests. if g_exceptions_verbose >= 2: exception_info() pass try: if ol["prev"] > -1: txt += "/Prev %i 0 R" % xref[ol["prev"]] except Exception: # Verbose in PyMuPDF/tests. if g_exceptions_verbose >= 2: exception_info() pass try: txt += "/Title" + ol["title"] except Exception: # Verbose in PyMuPDF/tests. if g_exceptions_verbose >= 2: exception_info() pass if ol.get("color") and len(ol["color"]) == 3: txt += f"/C[ {_format_g(tuple(ol['color']))}]" if ol.get("flags", 0) > 0: txt += "/F %i" % ol["flags"] if i == 0: # special: this is the outline root txt += "/Type/Outlines" # so add the /Type entry txt += ">>" doc.update_object(xref[i], txt) # insert the PDF object doc.init_doc() return toclen def set_toc_item( doc: 'Document', idx: int, dest_dict: OptDict = None, kind: OptInt = None, pno: OptInt = None, uri: OptStr = None, title: OptStr = None, to: point_like = None, filename: OptStr = None, zoom: float = 0, ) -> None: """Update TOC item by index. It allows changing the item's title and link destination. Args: idx: (int) desired index of the TOC list, as created by get_toc. dest_dict: (dict) destination dictionary as created by get_toc(False). Outrules all other parameters. If None, the remaining parameters are used to make a dest dictionary. kind: (int) kind of link (pymupdf.LINK_GOTO, etc.). If None, then only the title will be updated. If pymupdf.LINK_NONE, the TOC item will be deleted. pno: (int) page number (1-based like in get_toc). Required if pymupdf.LINK_GOTO. uri: (str) the URL, required if pymupdf.LINK_URI. title: (str) the new title. No change if None. to: (point-like) destination on the target page. If omitted, (72, 36) will be used as target coordinates. filename: (str) destination filename, required for pymupdf.LINK_GOTOR and pymupdf.LINK_LAUNCH. 
name: (str) a destination name for pymupdf.LINK_NAMED. zoom: (float) a zoom factor for the target location (pymupdf.LINK_GOTO). """ xref = doc.get_outline_xrefs()[idx] page_xref = 0 if type(dest_dict) is dict: if dest_dict["kind"] == LINK_GOTO: pno = dest_dict["page"] page_xref = doc.page_xref(pno) page_height = doc.page_cropbox(pno).height to = dest_dict.get('to', Point(72, 36)) to.y = page_height - to.y dest_dict["to"] = to action = utils.getDestStr(page_xref, dest_dict) if not action.startswith("/A"): raise ValueError("bad bookmark dest") color = dest_dict.get("color") if color: color = list(map(float, color)) if len(color) != 3 or min(color) < 0 or max(color) > 1: raise ValueError("bad color value") bold = dest_dict.get("bold", False) italic = dest_dict.get("italic", False) flags = italic + 2 * bold collapse = dest_dict.get("collapse") return doc._update_toc_item( xref, action=action[2:], title=title, color=color, flags=flags, collapse=collapse, ) if kind == LINK_NONE: # delete bookmark item return doc.del_toc_item(idx) if kind is None and title is None: # treat as no-op return None if kind is None: # only update title text return doc._update_toc_item(xref, action=None, title=title) if kind == LINK_GOTO: if pno is None or pno not in range(1, doc.page_count + 1): raise ValueError("bad page number") page_xref = doc.page_xref(pno - 1) page_height = doc.page_cropbox(pno - 1).height if to is None: to = Point(72, page_height - 36) else: to = Point(to) to.y = page_height - to.y ddict = { "kind": kind, "to": to, "uri": uri, "page": pno, "file": filename, "zoom": zoom, } action = utils.getDestStr(page_xref, ddict) if action == "" or not action.startswith("/A"): raise ValueError("bad bookmark dest") return doc._update_toc_item(xref, action=action[2:], title=title) def set_xml_metadata(self, metadata): """Store XML document level metadata.""" if self.is_closed or self.is_encrypted: raise ValueError("document closed or encrypted") pdf = _as_pdf_document(self) root = 
mupdf.pdf_dict_get( mupdf.pdf_trailer( pdf), PDF_NAME('Root')) if not root.m_internal: RAISEPY( MSG_BAD_PDFROOT, JM_Exc_FileDataError) res = mupdf.fz_new_buffer_from_copied_data( metadata.encode('utf-8')) xml = mupdf.pdf_dict_get( root, PDF_NAME('Metadata')) if xml.m_internal: JM_update_stream( pdf, xml, res, 0) else: xml = mupdf.pdf_add_stream( pdf, res, mupdf.PdfObj(), 0) mupdf.pdf_dict_put( xml, PDF_NAME('Type'), PDF_NAME('Metadata')) mupdf.pdf_dict_put( xml, PDF_NAME('Subtype'), PDF_NAME('XML')) mupdf.pdf_dict_put( root, PDF_NAME('Metadata'), xml) def subset_fonts(doc: 'Document', verbose: bool = False, fallback: bool = False) -> OptInt: """Build font subsets in a PDF. Eligible fonts are potentially replaced by smaller versions. Page text is NOT rewritten and thus should retain properties like being hidden or controlled by optional content. This method by default uses MuPDF's own internal feature to create subset fonts. As this is a new function, errors may still occur. In this case, please fall back to using the previous version by using "fallback=True". Fallback mode requires the external package 'fontTools'. Args: fallback: use the older deprecated implementation. verbose: only used by fallback mode. Returns: The new MuPDF-based code returns None. The deprecated fallback mode returns 0 if there are no fonts to subset. Otherwise, it returns the decrease in fontsize (the difference in fontsize), measured in bytes. """ # Font binaries: - "buffer" -> (names, xrefs, (unicodes, glyphs)) # An embedded font is uniquely defined by its fontbuffer only. It may have # multiple names and xrefs. # Once the sets of used unicodes and glyphs are known, we compute a # smaller version of the buffer user package fontTools. 
if not fallback: # by default use MuPDF function pdf = mupdf.pdf_document_from_fz_document(doc) mupdf.pdf_subset_fonts2(pdf, list(range(doc.page_count))) return font_buffers = {} def get_old_widths(xref): """Retrieve old font '/W' and '/DW' values.""" df = doc.xref_get_key(xref, "DescendantFonts") if df[0] != "array": # only handle xref specifications return None, None df_xref = int(df[1][1:-1].replace("0 R", "")) widths = doc.xref_get_key(df_xref, "W") if widths[0] != "array": # no widths key found widths = None else: widths = widths[1] dwidths = doc.xref_get_key(df_xref, "DW") if dwidths[0] != "int": dwidths = None else: dwidths = dwidths[1] return widths, dwidths def set_old_widths(xref, widths, dwidths): """Restore the old '/W' and '/DW' in subsetted font. If either parameter is None or evaluates to False, the corresponding dictionary key will be set to null. """ df = doc.xref_get_key(xref, "DescendantFonts") if df[0] != "array": # only handle xref specs return None df_xref = int(df[1][1:-1].replace("0 R", "")) if (type(widths) is not str or not widths) and doc.xref_get_key(df_xref, "W")[ 0 ] != "null": doc.xref_set_key(df_xref, "W", "null") else: doc.xref_set_key(df_xref, "W", widths) if (type(dwidths) is not str or not dwidths) and doc.xref_get_key( df_xref, "DW" )[0] != "null": doc.xref_set_key(df_xref, "DW", "null") else: doc.xref_set_key(df_xref, "DW", dwidths) return None def set_subset_fontname(new_xref): """Generate a name prefix to tag a font as subset. We use a random generator to select 6 upper case ASCII characters. The prefixed name must be put in the font xref as the "/BaseFont" value and in the FontDescriptor object as the '/FontName' value. 
""" # The following generates a prefix like 'ABCDEF+' import random import string prefix = "".join(random.choices(tuple(string.ascii_uppercase), k=6)) + "+" font_str = doc.xref_object(new_xref, compressed=True) font_str = font_str.replace("/BaseFont/", "/BaseFont/" + prefix) df = doc.xref_get_key(new_xref, "DescendantFonts") if df[0] == "array": df_xref = int(df[1][1:-1].replace("0 R", "")) fd = doc.xref_get_key(df_xref, "FontDescriptor") if fd[0] == "xref": fd_xref = int(fd[1].replace("0 R", "")) fd_str = doc.xref_object(fd_xref, compressed=True) fd_str = fd_str.replace("/FontName/", "/FontName/" + prefix) doc.update_object(fd_xref, fd_str) doc.update_object(new_xref, font_str) def build_subset(buffer, unc_set, gid_set): """Build font subset using fontTools. Args: buffer: (bytes) the font given as a binary buffer. unc_set: (set) required glyph ids. Returns: Either None if subsetting is unsuccessful or the subset font buffer. """ try: import fontTools.subset as fts except ImportError: if g_exceptions_verbose: exception_info() message("This method requires fontTools to be installed.") raise import tempfile with tempfile.TemporaryDirectory() as tmp_dir: oldfont_path = f"{tmp_dir}/oldfont.ttf" newfont_path = f"{tmp_dir}/newfont.ttf" uncfile_path = f"{tmp_dir}/uncfile.txt" args = [ oldfont_path, "--retain-gids", f"--output-file={newfont_path}", "--layout-features=*", "--passthrough-tables", "--ignore-missing-glyphs", "--ignore-missing-unicodes", "--symbol-cmap", ] # store glyph ids or unicodes as file with open(f"{tmp_dir}/uncfile.txt", "w", encoding='utf8') as unc_file: if 0xFFFD in unc_set: # error unicode exists -> use glyphs args.append(f"--gids-file={uncfile_path}") gid_set.add(189) unc_list = list(gid_set) for unc in unc_list: unc_file.write("%i\n" % unc) else: args.append(f"--unicodes-file={uncfile_path}") unc_set.add(255) unc_list = list(unc_set) for unc in unc_list: unc_file.write("%04x\n" % unc) # store fontbuffer as a file with open(oldfont_path, "wb") as 
fontfile: fontfile.write(buffer) try: os.remove(newfont_path) # remove old file except Exception: pass try: # invoke fontTools subsetter fts.main(args) font = Font(fontfile=newfont_path) new_buffer = font.buffer # subset font binary if font.glyph_count == 0: # intercept empty font new_buffer = None except Exception: exception_info() new_buffer = None return new_buffer def repl_fontnames(doc): """Populate 'font_buffers'. For each font candidate, store its xref and the list of names by which PDF text may refer to it (there may be multiple). """ def norm_name(name): """Recreate font name that contains PDF hex codes. E.g. #20 -> space, chr(32) """ while "#" in name: p = name.find("#") c = int(name[p + 1 : p + 3], 16) name = name.replace(name[p : p + 3], chr(c)) return name def get_fontnames(doc, item): """Return a list of fontnames for an item of page.get_fonts(). There may be multiple names e.g. for Type0 fonts. """ fontname = item[3] names = [fontname] fontname = doc.xref_get_key(item[0], "BaseFont")[1][1:] fontname = norm_name(fontname) if fontname not in names: names.append(fontname) descendents = doc.xref_get_key(item[0], "DescendantFonts") if descendents[0] != "array": return names descendents = descendents[1][1:-1] if descendents.endswith(" 0 R"): xref = int(descendents[:-4]) descendents = doc.xref_object(xref, compressed=True) p1 = descendents.find("/BaseFont") if p1 >= 0: p2 = descendents.find("/", p1 + 1) p1 = min(descendents.find("/", p2 + 1), descendents.find(">>", p2 + 1)) fontname = descendents[p2 + 1 : p1] fontname = norm_name(fontname) if fontname not in names: names.append(fontname) return names for i in range(doc.page_count): for f in doc.get_page_fonts(i, full=True): font_xref = f[0] # font xref font_ext = f[1] # font file extension basename = f[3] # font basename if font_ext not in ( # skip if not supported by fontTools "otf", "ttf", "woff", "woff2", ): continue # skip fonts which already are subsets if len(basename) > 6 and basename[6] == "+": 
continue extr = doc.extract_font(font_xref) fontbuffer = extr[-1] names = get_fontnames(doc, f) name_set, xref_set, subsets = font_buffers.get( fontbuffer, (set(), set(), (set(), set())) ) xref_set.add(font_xref) for name in names: name_set.add(name) font = Font(fontbuffer=fontbuffer) name_set.add(font.name) del font font_buffers[fontbuffer] = (name_set, xref_set, subsets) def find_buffer_by_name(name): for buffer, (name_set, _, _) in font_buffers.items(): if name in name_set: return buffer return None # ----------------- # main function # ----------------- repl_fontnames(doc) # populate font information if not font_buffers: # nothing found to do if verbose: message(f'No fonts to subset.') return 0 old_fontsize = 0 new_fontsize = 0 for fontbuffer in font_buffers.keys(): old_fontsize += len(fontbuffer) # Scan page text for usage of subsettable fonts for page in doc: # go through the text and extend set of used glyphs by font # we use a modified MuPDF trace device, which delivers us glyph ids. 
for span in page.get_texttrace(): if type(span) is not dict: # skip useless information continue fontname = span["font"][:33] # fontname for the span buffer = find_buffer_by_name(fontname) if buffer is None: continue name_set, xref_set, (set_ucs, set_gid) = font_buffers[buffer] for c in span["chars"]: set_ucs.add(c[0]) # unicode set_gid.add(c[1]) # glyph id font_buffers[buffer] = (name_set, xref_set, (set_ucs, set_gid)) # build the font subsets for old_buffer, (name_set, xref_set, subsets) in font_buffers.items(): new_buffer = build_subset(old_buffer, subsets[0], subsets[1]) fontname = list(name_set)[0] if new_buffer is None or len(new_buffer) >= len(old_buffer): # subset was not created or did not get smaller if verbose: message(f'Cannot subset {fontname!r}.') continue if verbose: message(f"Built subset of font {fontname!r}.") val = doc._insert_font(fontbuffer=new_buffer) # store subset font in PDF new_xref = val[0] # get its xref set_subset_fontname(new_xref) # tag fontname as subset font font_str = doc.xref_object( # get its object definition new_xref, compressed=True, ) # walk through the original font xrefs and replace each by the subset def for font_xref in xref_set: # we need the original '/W' and '/DW' width values width_table, def_width = get_old_widths(font_xref) # ... and replace original font definition at xref with it doc.update_object(font_xref, font_str) # now copy over old '/W' and '/DW' values if width_table or def_width: set_old_widths(font_xref, width_table, def_width) # 'new_xref' remains unused in the PDF and must be removed # by garbage collection. 
new_fontsize += len(new_buffer) return old_fontsize - new_fontsize def switch_layer(self, config, as_default=0): """Activate an OC layer.""" pdf = _as_pdf_document(self) cfgs = mupdf.pdf_dict_getl( mupdf.pdf_trailer( pdf), PDF_NAME('Root'), PDF_NAME('OCProperties'), PDF_NAME('Configs') ) if not mupdf.pdf_is_array( cfgs) or not mupdf.pdf_array_len( cfgs): if config < 1: return raise ValueError( MSG_BAD_OC_LAYER) if config < 0: return mupdf.pdf_select_layer_config( pdf, config) if as_default: mupdf.pdf_set_layer_config_as_default( pdf) mupdf.ll_pdf_read_ocg( pdf.m_internal) def update_object(self, xref, text, page=None): """Replace object definition source.""" if self.is_closed or self.is_encrypted: raise ValueError("document closed or encrypted") pdf = _as_pdf_document(self) xreflen = mupdf.pdf_xref_len(pdf) if not _INRANGE(xref, 1, xreflen-1): RAISEPY("bad xref", MSG_BAD_XREF) ENSURE_OPERATION(pdf) # create new object with passed-in string new_obj = JM_pdf_obj_from_str(pdf, text) mupdf.pdf_update_object(pdf, xref, new_obj) if page: JM_refresh_links( _as_pdf_page(page)) def update_stream(self, xref=0, stream=None, new=1, compress=1): """Replace xref stream part.""" if self.is_closed or self.is_encrypted: raise ValueError("document closed or encrypted") pdf = _as_pdf_document(self) xreflen = mupdf.pdf_xref_len(pdf) if xref < 1 or xref > xreflen: raise ValueError( MSG_BAD_XREF) # get the object obj = mupdf.pdf_new_indirect(pdf, xref, 0) if not mupdf.pdf_is_dict(obj): raise ValueError( MSG_IS_NO_DICT) res = JM_BufferFromBytes(stream) if not res.m_internal: raise TypeError( MSG_BAD_BUFFER) JM_update_stream(pdf, obj, res, compress) pdf.dirty = 1 @property def version_count(self): ''' Count versions of PDF document. 
''' pdf = _as_pdf_document(self, required=0) if pdf.m_internal: return mupdf.pdf_count_versions(pdf) return 0 def write( self, garbage=False, clean=False, deflate=False, deflate_images=False, deflate_fonts=False, incremental=False, ascii=False, expand=False, linear=False, no_new_id=False, appearance=False, pretty=False, encryption=1, permissions=4095, owner_pw=None, user_pw=None, preserve_metadata=1, use_objstms=0, compression_effort=0, ): from io import BytesIO bio = BytesIO() self.save( bio, garbage=garbage, clean=clean, no_new_id=no_new_id, appearance=appearance, deflate=deflate, deflate_images=deflate_images, deflate_fonts=deflate_fonts, incremental=incremental, ascii=ascii, expand=expand, linear=linear, pretty=pretty, encryption=encryption, permissions=permissions, owner_pw=owner_pw, user_pw=user_pw, preserve_metadata=preserve_metadata, use_objstms=use_objstms, compression_effort=compression_effort, ) return bio.getvalue() def tobytes(self, *args, **kwargs): return self.write(*args, **kwargs) @property def xref(self): """PDF xref number of page.""" CheckParent(self) return self.parent.page_xref(self.number) def xref_copy(doc: 'Document', source: int, target: int, *, keep: list = None) -> None: """Copy a PDF dictionary object to another one given their xref numbers. Args: doc: PDF document object source: source xref number target: target xref number, the xref must already exist keep: an optional list of 1st level keys in target that should not be removed before copying. Notes: This works similar to the copy() method of dictionaries in Python. The source may be a stream object. 
""" if doc.xref_is_stream(source): # read new xref stream, maintaining compression stream = doc.xref_stream_raw(source) doc.update_stream( target, stream, compress=False, # keeps source compression new=True, # in case target is no stream ) # empty the target completely, observe exceptions if keep is None: keep = [] for key in doc.xref_get_keys(target): if key in keep: continue doc.xref_set_key(target, key, "null") # copy over all source dict items for key in doc.xref_get_keys(source): item = doc.xref_get_key(source, key) doc.xref_set_key(target, key, item[1]) def xref_get_key(self, xref, key): """Get PDF dict key value of object at 'xref'.""" pdf = _as_pdf_document(self) xreflen = mupdf.pdf_xref_len(pdf) if not _INRANGE(xref, 1, xreflen-1) and xref != -1: raise ValueError( MSG_BAD_XREF) if xref > 0: obj = mupdf.pdf_load_object(pdf, xref) else: obj = mupdf.pdf_trailer(pdf) if not obj.m_internal: return ("null", "null") subobj = mupdf.pdf_dict_getp(obj, key) if not subobj.m_internal: return ("null", "null") text = None if mupdf.pdf_is_indirect(subobj): type = "xref" text = "%i 0 R" % mupdf.pdf_to_num(subobj) elif mupdf.pdf_is_array(subobj): type = "array" elif mupdf.pdf_is_dict(subobj): type = "dict" elif mupdf.pdf_is_int(subobj): type = "int" text = "%i" % mupdf.pdf_to_int(subobj) elif mupdf.pdf_is_real(subobj): type = "float" elif mupdf.pdf_is_null(subobj): type = "null" text = "null" elif mupdf.pdf_is_bool(subobj): type = "bool" if mupdf.pdf_to_bool(subobj): text = "true" else: text = "false" elif mupdf.pdf_is_name(subobj): type = "name" text = "/%s" % mupdf.pdf_to_name(subobj) elif mupdf.pdf_is_string(subobj): type = "string" text = JM_UnicodeFromStr(mupdf.pdf_to_text_string(subobj)) else: type = "unknown" if text is None: res = JM_object_to_buffer(subobj, 1, 0) text = JM_UnicodeFromBuffer(res) return (type, text) def xref_get_keys(self, xref): """Get the keys of PDF dict object at 'xref'. 
Use -1 for the PDF trailer.""" pdf = _as_pdf_document(self) xreflen = mupdf.pdf_xref_len( pdf) if not _INRANGE(xref, 1, xreflen-1) and xref != -1: raise ValueError( MSG_BAD_XREF) if xref > 0: obj = mupdf.pdf_load_object( pdf, xref) else: obj = mupdf.pdf_trailer( pdf) n = mupdf.pdf_dict_len( obj) rc = [] if n == 0: return rc for i in range(n): key = mupdf.pdf_to_name( mupdf.pdf_dict_get_key( obj, i)) rc.append(key) return rc def xref_is_font(self, xref): """Check if xref is a font object.""" if self.is_closed or self.is_encrypted: raise ValueError("document closed or encrypted") if self.xref_get_key(xref, "Type")[1] == "/Font": return True return False def xref_is_image(self, xref): """Check if xref is an image object.""" if self.is_closed or self.is_encrypted: raise ValueError("document closed or encrypted") if self.xref_get_key(xref, "Subtype")[1] == "/Image": return True return False def xref_is_stream(self, xref=0): """Check if xref is a stream object.""" pdf = _as_pdf_document(self, required=0) if not pdf.m_internal: return False # not a PDF return bool(mupdf.pdf_obj_num_is_stream(pdf, xref)) def xref_is_xobject(self, xref): """Check if xref is a form xobject.""" if self.is_closed or self.is_encrypted: raise ValueError("document closed or encrypted") if self.xref_get_key(xref, "Subtype")[1] == "/Form": return True return False def xref_length(self): """Get length of xref table.""" xreflen = 0 pdf = _as_pdf_document(self, required=0) if pdf.m_internal: xreflen = mupdf.pdf_xref_len(pdf) return xreflen def xref_object(self, xref, compressed=0, ascii=0): """Get xref object source as a string.""" if self.is_closed: raise ValueError("document closed") if g_use_extra: ret = extra.xref_object( self.this, xref, compressed, ascii) return ret pdf = _as_pdf_document(self) xreflen = mupdf.pdf_xref_len(pdf) if not _INRANGE(xref, 1, xreflen-1) and xref != -1: raise ValueError( MSG_BAD_XREF) if xref > 0: obj = mupdf.pdf_load_object(pdf, xref) else: obj = mupdf.pdf_trailer(pdf) 
res = JM_object_to_buffer(mupdf.pdf_resolve_indirect(obj), compressed, ascii) text = JM_EscapeStrFromBuffer(res) return text def xref_set_key(self, xref, key, value): """Set the value of a PDF dictionary key.""" if self.is_closed: raise ValueError("document closed") if not key or not isinstance(key, str) or INVALID_NAME_CHARS.intersection(key) not in (set(), {"/"}): raise ValueError("bad 'key'") if not isinstance(value, str) or not value or value[0] == "/" and INVALID_NAME_CHARS.intersection(value[1:]) != set(): raise ValueError("bad 'value'") pdf = _as_pdf_document(self) xreflen = mupdf.pdf_xref_len(pdf) #if not _INRANGE(xref, 1, xreflen-1) and xref != -1: # THROWMSG("bad xref") #if len(value) == 0: # THROWMSG("bad 'value'") #if len(key) == 0: # THROWMSG("bad 'key'") if not _INRANGE(xref, 1, xreflen-1) and xref != -1: raise ValueError( MSG_BAD_XREF) if xref != -1: obj = mupdf.pdf_load_object(pdf, xref) else: obj = mupdf.pdf_trailer(pdf) new_obj = JM_set_object_value(obj, key, value) if not new_obj.m_internal: return # did not work: skip update if xref != -1: mupdf.pdf_update_object(pdf, xref, new_obj) else: n = mupdf.pdf_dict_len(new_obj) for i in range(n): mupdf.pdf_dict_put( obj, mupdf.pdf_dict_get_key(new_obj, i), mupdf.pdf_dict_get_val(new_obj, i), ) def xref_stream(self, xref): """Get decompressed xref stream.""" if self.is_closed or self.is_encrypted: raise ValueError("document closed or encrypted") pdf = _as_pdf_document(self) xreflen = mupdf.pdf_xref_len( pdf) if not _INRANGE(xref, 1, xreflen-1) and xref != -1: raise ValueError( MSG_BAD_XREF) if xref >= 0: obj = mupdf.pdf_new_indirect( pdf, xref, 0) else: obj = mupdf.pdf_trailer( pdf) r = None if mupdf.pdf_is_stream( obj): res = mupdf.pdf_load_stream_number( pdf, xref) r = JM_BinFromBuffer( res) return r def xref_stream_raw(self, xref): """Get xref stream without decompression.""" if self.is_closed or self.is_encrypted: raise ValueError("document closed or encrypted") pdf = _as_pdf_document(self) xreflen = 
mupdf.pdf_xref_len( pdf) if not _INRANGE(xref, 1, xreflen-1) and xref != -1: raise ValueError( MSG_BAD_XREF) if xref >= 0: obj = mupdf.pdf_new_indirect( pdf, xref, 0) else: obj = mupdf.pdf_trailer( pdf) r = None if mupdf.pdf_is_stream( obj): res = mupdf.pdf_load_raw_stream_number( pdf, xref) r = JM_BinFromBuffer( res) return r def xref_xml_metadata(self): """Get xref of document XML metadata.""" pdf = _as_pdf_document(self) root = mupdf.pdf_dict_get( mupdf.pdf_trailer( pdf), PDF_NAME('Root')) if not root.m_internal: RAISEPY( MSG_BAD_PDFROOT, JM_Exc_FileDataError) xml = mupdf.pdf_dict_get( root, PDF_NAME('Metadata')) xref = 0 if xml.m_internal: xref = mupdf.pdf_to_num( xml) return xref __slots__ = ('this', 'page_count2', 'this_is_pdf', '__dict__') outline = property(lambda self: self._outline) is_stream = xref_is_stream open = Document class DocumentWriter: def __enter__(self): return self def __exit__(self, *args): self.close() def __init__(self, path, options=''): if isinstance( path, str): pass elif hasattr( path, 'absolute'): path = str( path) elif hasattr( path, 'name'): path = path.name if isinstance( path, str): self.this = mupdf.FzDocumentWriter( path, options, mupdf.FzDocumentWriter.PathType_PDF) else: # Need to keep the Python JM_new_output_fileptr_Output instance # alive for the lifetime of this DocumentWriter, otherwise calls # to virtual methods implemented in Python fail. So we make it a # member of this DocumentWriter. # # Unrelated to this, mupdf.FzDocumentWriter will set # self._out.m_internal to null because ownership is passed in. 
# out = JM_new_output_fileptr( path) self.this = mupdf.FzDocumentWriter( out, options, mupdf.FzDocumentWriter.OutputType_PDF) assert out.m_internal_value() == 0 assert hasattr( self.this, '_out') def begin_page( self, mediabox): mediabox2 = JM_rect_from_py(mediabox) device = mupdf.fz_begin_page( self.this, mediabox2) device_wrapper = DeviceWrapper( device) return device_wrapper def close( self): mupdf.fz_close_document_writer( self.this) def end_page( self): mupdf.fz_end_page( self.this) class Font: def __del__(self): if type(self) is not Font: return None def __init__( self, fontname=None, fontfile=None, fontbuffer=None, script=0, language=None, ordering=-1, is_bold=0, is_italic=0, is_serif=0, embed=1, ): if fontbuffer: if hasattr(fontbuffer, "getvalue"): fontbuffer = fontbuffer.getvalue() elif isinstance(fontbuffer, bytearray): fontbuffer = bytes(fontbuffer) if not isinstance(fontbuffer, bytes): raise ValueError("bad type: 'fontbuffer'") if isinstance(fontname, str): fname_lower = fontname.lower() if "/" in fname_lower or "\\" in fname_lower or "." 
in fname_lower: message("Warning: did you mean a fontfile?") if fname_lower in ("cjk", "china-t", "china-ts"): ordering = 0 elif fname_lower.startswith("china-s"): ordering = 1 elif fname_lower.startswith("korea"): ordering = 3 elif fname_lower.startswith("japan"): ordering = 2 elif fname_lower in fitz_fontdescriptors.keys(): import pymupdf_fonts # optional fonts fontbuffer = pymupdf_fonts.myfont(fname_lower) # make a copy fontname = None # ensure using fontbuffer only del pymupdf_fonts # remove package again elif ordering < 0: fontname = Base14_fontdict.get(fontname, fontname) lang = mupdf.fz_text_language_from_string(language) font = JM_get_font(fontname, fontfile, fontbuffer, script, lang, ordering, is_bold, is_italic, is_serif, embed) self.this = font def __repr__(self): return "Font('%s')" % self.name @property def ascender(self): """Return the glyph ascender value.""" return mupdf.fz_font_ascender(self.this) @property def bbox(self): return self.this.fz_font_bbox() @property def buffer(self): buffer_ = mupdf.FzBuffer( mupdf.ll_fz_keep_buffer( self.this.m_internal.buffer)) return mupdf.fz_buffer_extract_copy( buffer_) def char_lengths(self, text, fontsize=11, language=None, script=0, wmode=0, small_caps=0): """Return tuple of char lengths of unicode 'text' under a fontsize.""" lang = mupdf.fz_text_language_from_string(language) rc = [] for ch in text: c = ord(ch) if small_caps: gid = mupdf.fz_encode_character_sc(self.this, c) if gid >= 0: font = self.this else: gid, font = mupdf.fz_encode_character_with_fallback(self.this, c, script, lang) rc.append(fontsize * mupdf.fz_advance_glyph(font, gid, wmode)) return rc @property def descender(self): """Return the glyph descender value.""" return mupdf.fz_font_descender(self.this) @property def flags(self): f = mupdf.ll_fz_font_flags(self.this.m_internal) if not f: return assert isinstance( f, mupdf.fz_font_flags_t) #log( '{=f}') if mupdf_cppyy: # cppyy includes remaining higher bits. 
v = [f.is_mono] def b(bits): ret = v[0] & ((1 << bits)-1) v[0] = v[0] >> bits return ret is_mono = b(1) is_serif = b(1) is_bold = b(1) is_italic = b(1) ft_substitute = b(1) ft_stretch = b(1) fake_bold = b(1) fake_italic = b(1) has_opentype = b(1) invalid_bbox = b(1) cjk_lang = b(1) embed = b(1) never_embed = b(1) return { "mono": is_mono if mupdf_cppyy else f.is_mono, "serif": is_serif if mupdf_cppyy else f.is_serif, "bold": is_bold if mupdf_cppyy else f.is_bold, "italic": is_italic if mupdf_cppyy else f.is_italic, "substitute": ft_substitute if mupdf_cppyy else f.ft_substitute, "stretch": ft_stretch if mupdf_cppyy else f.ft_stretch, "fake-bold": fake_bold if mupdf_cppyy else f.fake_bold, "fake-italic": fake_italic if mupdf_cppyy else f.fake_italic, "opentype": has_opentype if mupdf_cppyy else f.has_opentype, "invalid-bbox": invalid_bbox if mupdf_cppyy else f.invalid_bbox, 'cjk': cjk_lang if mupdf_cppyy else f.cjk, 'cjk-lang': cjk_lang if mupdf_cppyy else f.cjk_lang, 'embed': embed if mupdf_cppyy else f.embed, 'never-embed': never_embed if mupdf_cppyy else f.never_embed, } def glyph_advance(self, chr_, language=None, script=0, wmode=0, small_caps=0): """Return the glyph width of a unicode (font size 1).""" lang = mupdf.fz_text_language_from_string(language) if small_caps: gid = mupdf.fz_encode_character_sc(self.this, chr_) if gid >= 0: font = self.this else: gid, font = mupdf.fz_encode_character_with_fallback(self.this, chr_, script, lang) return mupdf.fz_advance_glyph(font, gid, wmode) def glyph_bbox(self, char, language=None, script=0, small_caps=0): """Return the glyph bbox of a unicode (font size 1).""" lang = mupdf.fz_text_language_from_string(language) if small_caps: gid = mupdf.fz_encode_character_sc( self.this, char) if gid >= 0: font = self.this else: gid, font = mupdf.fz_encode_character_with_fallback( self.this, char, script, lang) return Rect(mupdf.fz_bound_glyph( font, gid, mupdf.FzMatrix())) @property def glyph_count(self): return 
self.this.m_internal.glyph_count def glyph_name_to_unicode(self, name): """Return the unicode for a glyph name.""" return glyph_name_to_unicode(name) def has_glyph(self, chr, language=None, script=0, fallback=0, small_caps=0): """Check whether font has a glyph for this unicode.""" if fallback: lang = mupdf.fz_text_language_from_string(language) gid, font = mupdf.fz_encode_character_with_fallback(self.this, chr, script, lang) else: if small_caps: gid = mupdf.fz_encode_character_sc(self.this, chr) else: gid = mupdf.fz_encode_character(self.this, chr) return gid @property def is_bold(self): return mupdf.fz_font_is_bold( self.this) @property def is_italic(self): return mupdf.fz_font_is_italic( self.this) @property def is_monospaced(self): return mupdf.fz_font_is_monospaced( self.this) @property def is_serif(self): return mupdf.fz_font_is_serif( self.this) @property def is_writable(self): return True # see pymupdf commit ef4056ee4da2 font = self.this flags = mupdf.ll_fz_font_flags(font.m_internal) if mupdf_cppyy: # cppyy doesn't handle bitfields correctly. 
            import cppyy
            ft_substitute = cppyy.gbl.mupdf_mfz_font_flags_ft_substitute( flags)
        else:
            ft_substitute = flags.ft_substitute

        if (
                mupdf.ll_fz_font_t3_procs(font.m_internal)
                or ft_substitute
                or not mupdf.pdf_font_writing_supported(font)
                ):
            return False
        return True

    @property
    def name(self):
        """Font name as returned by `fz_font_name`."""
        ret = mupdf.fz_font_name(self.this)
        #log( '{ret=}')
        return ret

    def text_length(self, text, fontsize=11, language=None, script=0, wmode=0, small_caps=0):
        """Return length of unicode 'text' under a fontsize.

        Sums the glyph advance of every character of `text` and multiplies
        the total by `fontsize`.

        Raises:
            TypeError: if `text` is not a `str`.
        """
        thisfont = self.this
        lang = mupdf.fz_text_language_from_string(language)
        rc = 0
        if not isinstance(text, str):
            raise TypeError( MSG_BAD_TEXT)
        for ch in text:
            c = ord(ch)
            if small_caps:
                gid = mupdf.fz_encode_character_sc(thisfont, c)
                # NOTE(review): `font` is only (re)bound when gid >= 0.
                if gid >= 0:
                    font = thisfont
            else:
                gid, font = mupdf.fz_encode_character_with_fallback(thisfont, c, script, lang)
            rc += mupdf.fz_advance_glyph(font, gid, wmode)
        rc *= fontsize
        return rc

    def unicode_to_glyph_name(self, ch):
        """Return the glyph name for a unicode."""
        # Delegates to the module-level function of the same name.
        return unicode_to_glyph_name(ch)

    def valid_codepoints(self):
        '''
        Returns sorted list of valid unicodes of a fz_font.
        '''
        ucs_gids = mupdf.fz_enumerate_font_cmap2(self.this)
        ucss = [i.ucs for i in ucs_gids]
        # De-duplicate before sorting.
        ucss_unique = set(ucss)
        ucss_unique_sorted = sorted(ucss_unique)
        return ucss_unique_sorted


class Graftmap:
    """Wrapper around a MuPDF graft map created for a PDF document."""

    def __del__(self):
        # Only act for exact Graftmap instances, not subclasses.
        if not type(self) is Graftmap:
            return
        self.thisown = False

    def __init__(self, doc):
        """Create a graft map via `pdf_new_graft_map` for `doc`."""
        dst = _as_pdf_document(doc)
        map_ = mupdf.pdf_new_graft_map(dst)
        self.this = map_
        self.thisown = True


class Link:
    """Python wrapper around a `mupdf.FzLink`."""

    def __del__(self):
        self._erase()

    def __init__( self, this):
        assert isinstance( this, mupdf.FzLink)
        self.this = this

    def __repr__(self):
        CheckParent(self)
        return "link on " + str(self.parent)

    def __str__(self):
        CheckParent(self)
        return "link on " + str(self.parent)

    def _border(self, doc, xref):
        """Return `JM_annot_border` of the link object at `xref`.

        Returns None if `doc` is not a PDF or the indirect object is
        unavailable.
        """
        pdf = _as_pdf_document(doc, required=0)
        if not pdf.m_internal:
            return
        link_obj = mupdf.pdf_new_indirect(pdf, xref, 0)
        if not link_obj.m_internal:
            return
        b = JM_annot_border(link_obj)
        return b

    def _colors(self, doc, xref):
        """Return `JM_annot_colors` of the link object at `xref`.

        Returns None if `doc` is not a PDF.

        Raises:
            ValueError: if the indirect object is unavailable.
        """
        pdf = _as_pdf_document(doc, required=0)
        if not pdf.m_internal:
            return
        link_obj = mupdf.pdf_new_indirect( pdf, xref, 0)
        if not link_obj.m_internal:
            raise ValueError( MSG_BAD_XREF)
        b = JM_annot_colors( link_obj)
        return b

    def _erase(self):
        """Detach the link from its parent and drop ownership."""
        self.parent = None
        self.thisown = False

    def _setBorder(self, border, doc, xref):
        """Apply `border` to the link object at `xref` via `JM_annot_set_border`.

        Returns None if `doc` is not a PDF or the indirect object is
        unavailable.
        """
        pdf = _as_pdf_document(doc, required=0)
        if not pdf.m_internal:
            return
        link_obj = mupdf.pdf_new_indirect(pdf, xref, 0)
        if not link_obj.m_internal:
            return
        b = JM_annot_set_border(border, pdf, link_obj)
        return b

    @property
    def border(self):
        # self.parent is the page, self.parent.parent the document.
        return self._border(self.parent.parent.this, self.xref)

    @property
    def colors(self):
        return self._colors(self.parent.parent.this, self.xref)

    @property
    def dest(self):
        """Create link destination details."""
        if hasattr(self, "parent") and self.parent is None:
            raise ValueError("orphaned object: parent is None")
        if self.parent.parent.is_closed or self.parent.parent.is_encrypted:
            raise ValueError("document closed or encrypted")
        doc = self.parent.parent

        # External links and '#...' URIs are not resolved by the document.
        if self.is_external or self.uri.startswith("#"):
            uri = None
        else:
            uri = doc.resolve_link(self.uri)

        return linkDest(self, uri, doc)

    @property
    def flags(self)->int:
        """Integer value of the link's "F" key, or 0.

        0 is returned for non-PDF documents and when the key reads "null".
        """
        CheckParent(self)
        doc = self.parent.parent
        if not doc.is_pdf:
            return 0
        f = doc.xref_get_key(self.xref, "F")
        # f[1] holds the key's value as a string.
        if f[1] != "null":
            return int(f[1])
        return 0

    @property
    def is_external(self):
        """Flag the link as external."""
        CheckParent(self)
        if g_use_extra:
            return extra.Link_is_external( self.this)
        this_link = self.this
        # No underlying link or no URI: not external.
        if not this_link.m_internal or not this_link.m_internal.uri:
            return False
        return bool( mupdf.fz_is_external_link( this_link.m_internal.uri))

    @property
    def next(self):
        """Next link."""
        if not self.this.m_internal:
            return None
        CheckParent(self)
        # The `0 and ...` condition permanently disables the extra path.
        if 0 and g_use_extra:
            val = extra.Link_next( self.this)
        else:
            val = self.this.next()
        if not val.m_internal:
            return None
        val = Link( val)
        if val:
            val.thisown = True
            val.parent = self.parent  # copy owning page from prev link
            val.parent._annot_refs[id(val)] = val
            if self.xref > 0:  # prev link has an xref
                # Collect xrefs and ids of the page's link annotations; the
                # new link takes the entry following this link's entry.
                link_xrefs = [x[0] for x in self.parent.annot_xrefs() if x[1] == mupdf.PDF_ANNOT_LINK]
                link_ids = [x[2] for x in self.parent.annot_xrefs() if x[1] == mupdf.PDF_ANNOT_LINK]
                idx = link_xrefs.index(self.xref)
                val.xref = link_xrefs[idx + 1]
                val.id = link_ids[idx + 1]
            else:
                val.xref = 0
                val.id = ""
        return val

    @property
    def rect(self):
        """Rectangle ('hot area')."""
        CheckParent(self)
        # utils.py:getLinkDict() appears to expect exceptions from us, so we
        # ensure that we raise on error.
        if self.this is None or not self.this.m_internal:
            raise Exception( 'self.this.m_internal not available')
        val = JM_py_from_rect( self.this.rect())
        val = Rect(val)
        return val

    def set_border(self, border=None, width=0, dashes=None, style=None):
        """Set the link border.

        If `border` is not a dict, one is built from `width`, `style` and
        `dashes`.
        """
        if type(border) is not dict:
            border = {"width": width, "style": style, "dashes": dashes}
        return self._setBorder(border, self.parent.parent.this, self.xref)

    def set_colors(self, colors=None, stroke=None, fill=None):
        """Set border colors."""
        CheckParent(self)
        doc = self.parent.parent
        # If `colors` is not a dict, build one from the keyword arguments.
        if type(colors) is not dict:
            colors = {"fill": fill, "stroke": stroke}
        fill = colors.get("fill")
        stroke = colors.get("stroke")
        if fill is not None:
            message("warning: links have no fill color")
        # An empty sequence writes an empty color array.
        if stroke in ([], ()):
            doc.xref_set_key(self.xref, "C", "[]")
            return
        # A single number is treated as a one-component color.
        if hasattr(stroke, "__float__"):
            stroke = [float(stroke)]
        CheckColor(stroke)
        assert len(stroke) in (1, 3, 4)
        s = f"[{_format_g(stroke)}]"
        doc.xref_set_key(self.xref, "C", s)

    def set_flags(self, flags):
        """Write `flags` to the link's "F" key.

        Raises:
            ValueError: if the document is not a PDF or `flags` is not an
                `int`.
        """
        CheckParent(self)
        doc = self.parent.parent
        if not doc.is_pdf:
            raise ValueError("is no PDF")
        if not type(flags) is int:
            raise ValueError("bad 'flags' value")
        doc.xref_set_key(self.xref, "F", str(flags))
        return None

    @property
    def uri(self):
        """Uri string."""
        #CheckParent(self)
        if g_use_extra:
            return extra.link_uri(self.this)
        this_link = self.this
        # Empty string when there is no underlying link.
        return this_link.m_internal.uri if this_link.m_internal else ''

    # Class-level default for the `page` attribute.
    page = -1


class Matrix:
    """Six-component matrix (a, b, c, d, e, f) with sequence behaviour."""

    def __abs__(self):
        # Euclidean norm over the six components.
        return math.sqrt(sum([c*c for c in self]))

    def __add__(self, m):
        # A number is added to every component; otherwise `m` must be a
        # sequence of length 6 and is added component-wise.
        if hasattr(m, "__float__"):
            return Matrix(self.a + m, self.b + m, self.c + m,
                          self.d + m, self.e + m, self.f + m)
        if len(m) != 6:
            raise ValueError("Matrix: bad seq len")
        return Matrix(self.a + m[0], self.b + m[1], self.c + m[2],
                          self.d + m[3], self.e + m[4], self.f + m[5])

    def __bool__(self):
        # False only if all six components are zero.
        return not (max(self) == min(self) == 0)

    def __eq__(self, mat):
        if not hasattr(mat, "__len__"):
            return False
        # Equal if the component-wise difference is the all-zero matrix.
        return len(mat) == 6 and not (self - mat)

    def __getitem__(self, i):
        return (self.a, self.b, self.c, self.d, self.e, self.f)[i]

    def __init__(self, *args, a=None, b=None, c=None, d=None, e=None, f=None):
        """
        Matrix() - all zeros
        Matrix(a, b, c, d, e, f)
        Matrix(zoom-x, zoom-y) - zoom
        Matrix(shear-x, shear-y, 1) - shear
        Matrix(degree) - rotate
        Matrix(Matrix) - new copy
        Matrix(sequence) - from 'sequence'
        Matrix(mupdf.FzMatrix) - from MuPDF class wrapper for fz_matrix.

        Explicit keyword args a, b, c, d, e, f override any earlier settings if
        not None.
        """
        if not args:
            self.a = self.b = self.c = self.d = self.e = self.f = 0.0
        elif len(args) > 6:
            raise ValueError("Matrix: bad seq len")
        elif len(args) == 6:  # 6 numbers
            self.a, self.b, self.c, self.d, self.e, self.f = map(float, args)
        elif len(args) == 1:  # either an angle or a sequ
            if isinstance(args[0], mupdf.FzMatrix):
                self.a = args[0].a
                self.b = args[0].b
                self.c = args[0].c
                self.d = args[0].d
                self.e = args[0].e
                self.f = args[0].f
            elif hasattr(args[0], "__float__"):
                # Rotation by args[0] degrees; sine and cosine are rounded
                # to 8 decimals.
                theta = math.radians(args[0])
                c_ = round(math.cos(theta), 8)
                s_ = round(math.sin(theta), 8)
                self.a = self.d = c_
                self.b = s_
                self.c = -s_
                self.e = self.f = 0.0
            else:
                self.a, self.b, self.c, self.d, self.e, self.f = map(float, args[0])
        elif len(args) == 2 or len(args) == 3 and args[2] == 0:
            # Zoom: args[0] goes to a, args[1] to d.
            self.a, self.b, self.c, self.d, self.e, self.f = float(args[0]), \
                0.0, 0.0, float(args[1]), 0.0, 0.0
        elif len(args) == 3 and args[2] == 1:
            # Shear: args[1] goes to b, args[0] to c.
            self.a, self.b, self.c, self.d, self.e, self.f = 1.0, \
                float(args[1]), float(args[0]), 1.0, 0.0, 0.0
        else:
            raise ValueError("Matrix: bad args")

        # Override with explicit args if specified.
if a is not None: self.a = a if b is not None: self.b = b if c is not None: self.c = c if d is not None: self.d = d if e is not None: self.e = e if f is not None: self.f = f def __invert__(self): """Calculate inverted matrix.""" m1 = Matrix() m1.invert(self) return m1 def __len__(self): return 6 def __mul__(self, m): if hasattr(m, "__float__"): return Matrix(self.a * m, self.b * m, self.c * m, self.d * m, self.e * m, self.f * m) m1 = Matrix(1,1) return m1.concat(self, m) def __neg__(self): return Matrix(-self.a, -self.b, -self.c, -self.d, -self.e, -self.f) def __nonzero__(self): return not (max(self) == min(self) == 0) def __pos__(self): return Matrix(self) def __repr__(self): return "Matrix" + str(tuple(self)) def __setitem__(self, i, v): v = float(v) if i == 0: self.a = v elif i == 1: self.b = v elif i == 2: self.c = v elif i == 3: self.d = v elif i == 4: self.e = v elif i == 5: self.f = v else: raise IndexError("index out of range") return def __sub__(self, m): if hasattr(m, "__float__"): return Matrix(self.a - m, self.b - m, self.c - m, self.d - m, self.e - m, self.f - m) if len(m) != 6: raise ValueError("Matrix: bad seq len") return Matrix(self.a - m[0], self.b - m[1], self.c - m[2], self.d - m[3], self.e - m[4], self.f - m[5]) def __truediv__(self, m): if hasattr(m, "__float__"): return Matrix(self.a * 1./m, self.b * 1./m, self.c * 1./m, self.d * 1./m, self.e * 1./m, self.f * 1./m) m1 = util_invert_matrix(m)[1] if not m1: raise ZeroDivisionError("matrix not invertible") m2 = Matrix(1,1) return m2.concat(self, m1) def concat(self, one, two): """Multiply two matrices and replace current one.""" if not len(one) == len(two) == 6: raise ValueError("Matrix: bad seq len") self.a, self.b, self.c, self.d, self.e, self.f = util_concat_matrix(one, two) return self def invert(self, src=None): """Calculate the inverted matrix. Return 0 if successful and replace current one. Else return 1 and do nothing. 
""" if src is None: dst = util_invert_matrix(self) else: dst = util_invert_matrix(src) if dst[0] == 1: return 1 self.a, self.b, self.c, self.d, self.e, self.f = dst[1] return 0 @property def is_rectilinear(self): """True if rectangles are mapped to rectangles.""" return (abs(self.b) < EPSILON and abs(self.c) < EPSILON) or \ (abs(self.a) < EPSILON and abs(self.d) < EPSILON) def prerotate(self, theta): """Calculate pre rotation and replace current matrix.""" theta = float(theta) while theta < 0: theta += 360 while theta >= 360: theta -= 360 if abs(0 - theta) < EPSILON: pass elif abs(90.0 - theta) < EPSILON: a = self.a b = self.b self.a = self.c self.b = self.d self.c = -a self.d = -b elif abs(180.0 - theta) < EPSILON: self.a = -self.a self.b = -self.b self.c = -self.c self.d = -self.d elif abs(270.0 - theta) < EPSILON: a = self.a b = self.b self.a = -self.c self.b = -self.d self.c = a self.d = b else: rad = math.radians(theta) s = math.sin(rad) c = math.cos(rad) a = self.a b = self.b self.a = c * a + s * self.c self.b = c * b + s * self.d self.c =-s * a + c * self.c self.d =-s * b + c * self.d return self def prescale(self, sx, sy): """Calculate pre scaling and replace current matrix.""" sx = float(sx) sy = float(sy) self.a *= sx self.b *= sx self.c *= sy self.d *= sy return self def preshear(self, h, v): """Calculate pre shearing and replace current matrix.""" h = float(h) v = float(v) a, b = self.a, self.b self.a += v * self.c self.b += v * self.d self.c += h * a self.d += h * b return self def pretranslate(self, tx, ty): """Calculate pre translation and replace current matrix.""" tx = float(tx) ty = float(ty) self.e += tx * self.a + ty * self.c self.f += tx * self.b + ty * self.d return self __inv__ = __invert__ __div__ = __truediv__ norm = __abs__ class IdentityMatrix(Matrix): """Identity matrix [1, 0, 0, 1, 0, 0]""" def __hash__(self): return hash((1,0,0,1,0,0)) def __init__(self): Matrix.__init__(self, 1.0, 1.0) def __repr__(self): return "IdentityMatrix(1.0, 
0.0, 0.0, 1.0, 0.0, 0.0)" def __setattr__(self, name, value): if name in "ad": self.__dict__[name] = 1.0 elif name in "bcef": self.__dict__[name] = 0.0 else: self.__dict__[name] = value def checkargs(*args): raise NotImplementedError("Identity is readonly") Identity = IdentityMatrix() class linkDest: """link or outline destination details""" def __init__(self, obj, rlink, document=None): isExt = obj.is_external isInt = not isExt self.dest = "" self.file_spec = "" self.flags = 0 self.is_map = False self.is_uri = False self.kind = LINK_NONE self.lt = Point(0, 0) self.named = dict() self.new_window = "" self.page = obj.page self.rb = Point(0, 0) self.uri = obj.uri def uri_to_dict(uri): items = self.uri[1:].split('&') ret = dict() for item in items: eq = item.find('=') if eq >= 0: ret[item[:eq]] = item[eq+1:] else: ret[item] = None return ret def unescape(name): """Unescape '%AB' substrings to chr(0xAB).""" split = name.replace("%%", "%25") # take care of escaped '%' split = split.split("%") newname = split[0] for item in split[1:]: piece = item[:2] newname += chr(int(piece, base=16)) newname += item[2:] return newname if rlink and not self.uri.startswith("#"): self.uri = f"#page={rlink[0] + 1}&zoom=0,{_format_g(rlink[1])},{_format_g(rlink[2])}" if obj.is_external: self.page = -1 self.kind = LINK_URI if not self.uri: self.page = -1 self.kind = LINK_NONE if isInt and self.uri: self.uri = self.uri.replace("&zoom=nan", "&zoom=0") if self.uri.startswith("#"): self.kind = LINK_GOTO m = re.match('^#page=([0-9]+)&zoom=([0-9.]+),(-?[0-9.]+),(-?[0-9.]+)$', self.uri) if m: self.page = int(m.group(1)) - 1 self.lt = Point(float((m.group(3))), float(m.group(4))) self.flags = self.flags | LINK_FLAG_L_VALID | LINK_FLAG_T_VALID else: m = re.match('^#page=([0-9]+)$', self.uri) if m: self.page = int(m.group(1)) - 1 else: self.kind = LINK_NAMED m = re.match('^#nameddest=(.*)', self.uri) assert document if document and m: named = unescape(m.group(1)) self.named = 
document.resolve_names().get(named) if self.named is None: # document.resolve_names() does not contain an # entry for `named` so use an empty dict. self.named = dict() self.named['nameddest'] = named else: self.named = uri_to_dict(self.uri[1:]) else: self.kind = LINK_NAMED self.named = uri_to_dict(self.uri) if obj.is_external: if not self.uri: pass elif self.uri.startswith("file:"): self.file_spec = self.uri[5:] if self.file_spec.startswith("//"): self.file_spec = self.file_spec[2:] self.is_uri = False self.uri = "" self.kind = LINK_LAUNCH ftab = self.file_spec.split("#") if len(ftab) == 2: if ftab[1].startswith("page="): self.kind = LINK_GOTOR self.file_spec = ftab[0] self.page = int(ftab[1].split("&")[0][5:]) - 1 elif ":" in self.uri: self.is_uri = True self.kind = LINK_URI else: self.is_uri = True self.kind = LINK_LAUNCH assert isinstance(self.named, dict) class Widget: ''' Class describing a PDF form field ("widget") ''' def __init__(self): self.border_color = None self.border_style = "S" self.border_width = 0 self.border_dashes = None self.choice_values = None # choice fields only self.rb_parent = None # radio buttons only: xref of owning parent self.field_name = None # field name self.field_label = None # field label self.field_value = None self.field_flags = 0 self.field_display = 0 self.field_type = 0 # valid range 1 through 7 self.field_type_string = None # field type as string self.fill_color = None self.button_caption = None # button caption self.is_signed = None # True / False if signature self.text_color = (0, 0, 0) self.text_font = "Helv" self.text_fontsize = 0 self.text_maxlen = 0 # text fields only self.text_format = 0 # text fields only self._text_da = "" # /DA = default appearance self.script = None # JavaScript (/A) self.script_stroke = None # JavaScript (/AA/K) self.script_format = None # JavaScript (/AA/F) self.script_change = None # JavaScript (/AA/V) self.script_calc = None # JavaScript (/AA/C) self.script_blur = None # JavaScript (/AA/Bl) 
        self.script_focus = None  # JavaScript (/AA/Fo) codespell:ignore

        self.rect = None  # annot value
        self.xref = 0  # annot value

    def __repr__(self):
        #return "'%s' widget on %s" % (self.field_type_string, str(self.parent))
        # No self.parent.
        return f'Widget:(field_type={self.field_type_string} script={self.script})'
        # Unreachable: the statement above always returns.
        return "'%s' widget" % (self.field_type_string)

    def _adjust_font(self):
        """Ensure text_font is from our list and correctly spelled.
        """
        if not self.text_font:
            self.text_font = "Helv"
            return
        valid_fonts = ("Cour", "TiRo", "Helv", "ZaDb")
        # Case-insensitive match; the canonical spelling is stored.
        for f in valid_fonts:
            if self.text_font.lower() == f.lower():
                self.text_font = f
                return
        # Anything else falls back to "Helv".
        self.text_font = "Helv"
        return

    def _checker(self):
        """Any widget type checks.
        """
        if self.field_type not in range(1, 8):
            raise ValueError("bad field type")

        # if setting a radio button to ON, first set Off all buttons
        # in the group - this is not done by MuPDF:
        if self.field_type == mupdf.PDF_WIDGET_TYPE_RADIOBUTTON and self.field_value not in (False, "Off") and hasattr(self, "parent"):
            # so we are about setting this button to ON/True
            # check other buttons in same group and set them to 'Off'
            doc = self.parent.parent
            kids_type, kids_value = doc.xref_get_key(self.xref, "Parent/Kids")
            if kids_type == "array":
                # Strip the enclosing brackets and the "0 R" parts of the
                # array string, leaving the xref numbers.
                xrefs = tuple(map(int, kids_value[1:-1].replace("0 R","").split()))
                for xref in xrefs:
                    if xref != self.xref:
                        doc.xref_set_key(xref, "AS", "/Off")
        # the calling method will now set the intended button to on and
        # will find everything prepared for correct functioning.

    def _parse_da(self):
        """Extract font name, size and color from default appearance string (/DA object).

        Equivalent to 'pdf_parse_default_appearance' function in MuPDF's 'pdf-annot.c'.
        """
        if not self._text_da:
            return
        font = "Helv"
        fsize = 0
        col = (0, 0, 0)
        dat = self._text_da.split()  # split on any whitespace
        for i, item in enumerate(dat):
            # Operands precede their operator; consumed tokens are blanked.
            if item == "Tf":
                font = dat[i - 2][1:]
                fsize = float(dat[i - 1])
                dat[i] = dat[i-1] = dat[i-2] = ""
                continue
            if item == "g":  # unicolor text
                col = [(float(dat[i - 1]))]
                dat[i] = dat[i-1] = ""
                continue
            if item == "rg":  # RGB colored text
                col = [float(f) for f in dat[i - 3:i]]
                dat[i] = dat[i-1] = dat[i-2] = dat[i-3] = ""
                continue
        self.text_font = font
        self.text_fontsize = fsize
        self.text_color = col
        self._text_da = ""
        return

    def _validate(self):
        """Validate the class entries.

        Raises:
            ValueError: for a bad rect, a missing field name, or a script
                entry that is not a string.
        """
        if (self.rect.is_infinite
            or self.rect.is_empty
           ):
            raise ValueError("bad rect")

        if not self.field_name:
            raise ValueError("field name missing")

        if self.field_label == "Unnamed":
            self.field_label = None
        CheckColor(self.border_color)
        CheckColor(self.fill_color)
        if not self.text_color:
            self.text_color = (0, 0, 0)
        CheckColor(self.text_color)

        if not self.border_width:
            self.border_width = 0

        if not self.text_fontsize:
            self.text_fontsize = 0

        # Keep only the first character of the style, upper-cased.
        self.border_style = self.border_style.upper()[0:1]

        # standardize content of JavaScript entries
        btn_type = self.field_type in (
            mupdf.PDF_WIDGET_TYPE_BUTTON,
            mupdf.PDF_WIDGET_TYPE_CHECKBOX,
            mupdf.PDF_WIDGET_TYPE_RADIOBUTTON,
        )
        if not self.script:
            self.script = None
        elif type(self.script) is not str:
            raise ValueError("script content must be a string")

        # buttons cannot have the following script actions
        if btn_type or not self.script_calc:
            self.script_calc = None
        elif type(self.script_calc) is not str:
            raise ValueError("script_calc content must be a string")

        if btn_type or not self.script_change:
            self.script_change = None
        elif type(self.script_change) is not str:
            raise ValueError("script_change content must be a string")

        if btn_type or not self.script_format:
            self.script_format = None
        elif type(self.script_format) is not str:
            raise ValueError("script_format content must be a string")

        if btn_type or not self.script_stroke:
            self.script_stroke = None
        elif type(self.script_stroke) is not str:
            raise ValueError("script_stroke content must be a string")

        if btn_type or not self.script_blur:
            self.script_blur = None
        elif type(self.script_blur) is not str:
            raise ValueError("script_blur content must be a string")

        if btn_type or not self.script_focus:
            self.script_focus = None
        elif type(self.script_focus) is not str:
            raise ValueError("script_focus content must be a string")

        self._checker()  # any field_type specific checks

    def _sync_flags(self):
        """Propagate the field flags.

        If this widget has a "/Parent", set its field flags and that of all
        its /Kids widgets to the value of the current widget.
        Only possible for widgets existing in the PDF.

        Returns True or False.
        """
        if not self.xref:
            return False  # no xref: widget not in the PDF
        doc = self.parent.parent  # the owning document
        assert doc
        pdf = _as_pdf_document(doc)
        # load underlying PDF object
        pdf_widget = mupdf.pdf_load_object(pdf, self.xref)
        Parent = mupdf.pdf_dict_get(pdf_widget, PDF_NAME("Parent"))
        if not Parent.pdf_is_dict():
            return False  # no /Parent: nothing to do

        # put the field flags value into the parent field flags:
        Parent.pdf_dict_put_int(PDF_NAME("Ff"), self.field_flags)

        # also put that value into all kids of the Parent
        kids = Parent.pdf_dict_get(PDF_NAME("Kids"))
        if not kids.pdf_is_array():
            message("warning: malformed PDF, Parent has no Kids array")
            return False  # no /Kids: should never happen!
for i in range(kids.pdf_array_len()): # walk through all kids # access kid widget, and do some precautionary checks kid = kids.pdf_array_get(i) if not kid.pdf_is_dict(): continue xref = kid.pdf_to_num() # get xref of the kid if xref == self.xref: # skip self widget continue subtype = kid.pdf_dict_get(PDF_NAME("Subtype")) if not subtype.pdf_to_name() == "Widget": continue # put the field flags value into the kid field flags: kid.pdf_dict_put_int(PDF_NAME("Ff"), self.field_flags) return True # all done def button_states(self): """Return the on/off state names for button widgets. A button may have 'normal' or 'pressed down' appearances. While the 'Off' state is usually called like this, the 'On' state is often given a name relating to the functional context. """ if self.field_type not in (2, 5): return None # no button type if hasattr(self, "parent"): # field already exists on page doc = self.parent.parent else: return xref = self.xref states = {"normal": None, "down": None} APN = doc.xref_get_key(xref, "AP/N") if APN[0] == "dict": nstates = [] APN = APN[1][2:-2] apnt = APN.split("/")[1:] for x in apnt: nstates.append(x.split()[0]) states["normal"] = nstates if APN[0] == "xref": nstates = [] nxref = int(APN[1].split(" ")[0]) APN = doc.xref_object(nxref) apnt = APN.split("/")[1:] for x in apnt: nstates.append(x.split()[0]) states["normal"] = nstates APD = doc.xref_get_key(xref, "AP/D") if APD[0] == "dict": dstates = [] APD = APD[1][2:-2] apdt = APD.split("/")[1:] for x in apdt: dstates.append(x.split()[0]) states["down"] = dstates if APD[0] == "xref": dstates = [] dxref = int(APD[1].split(" ")[0]) APD = doc.xref_object(dxref) apdt = APD.split("/")[1:] for x in apdt: dstates.append(x.split()[0]) states["down"] = dstates return states @property def next(self): return self._annot.next def on_state(self): """Return the "On" value for button widgets. This is useful for radio buttons mainly. Checkboxes will always return "Yes". 
Radio buttons will return the string that is unequal to "Off" as returned by method button_states(). If the radio button is new / being created, it does not yet have an "On" value. In this case, a warning is shown and True is returned. """ if self.field_type not in (2, 5): return None # no checkbox or radio button bstate = self.button_states() if bstate is None: bstate = dict() for k in bstate.keys(): for v in bstate[k]: if v != "Off": return v message("warning: radio button has no 'On' value.") return True def reset(self): """Reset the field value to its default. """ TOOLS._reset_widget(self._annot) def update(self, sync_flags=False): """Reflect Python object in the PDF.""" self._validate() self._adjust_font() # ensure valid text_font name # now create the /DA string self._text_da = "" if len(self.text_color) == 3: fmt = "{:g} {:g} {:g} rg /{f:s} {s:g} Tf" + self._text_da elif len(self.text_color) == 1: fmt = "{:g} g /{f:s} {s:g} Tf" + self._text_da elif len(self.text_color) == 4: fmt = "{:g} {:g} {:g} {:g} k /{f:s} {s:g} Tf" + self._text_da self._text_da = fmt.format(*self.text_color, f=self.text_font, s=self.text_fontsize) # finally update the widget # if widget has a '/AA/C' script, make sure it is in the '/CO' # array of the '/AcroForm' dictionary. if self.script_calc: # there is a "calculation" script: # make sure we are in the /CO array util_ensure_widget_calc(self._annot) # finally update the widget TOOLS._save_widget(self._annot, self) self._text_da = "" if sync_flags: self._sync_flags() # propagate field flags to parent and kids from . import _extra class Outline: def __init__(self, ol): self.this = ol @property def dest(self): '''outline destination details''' return linkDest(self, None, None) def destination(self, document): ''' Like `dest` property but uses `document` to resolve destinations for kind=LINK_NAMED. 
''' return linkDest(self, None, document) @property def down(self): ol = self.this down_ol = ol.down() if not down_ol.m_internal: return return Outline(down_ol) @property def is_external(self): if g_use_extra: # calling _extra.* here appears to save significant time in # test_toc.py:test_full_toc, 1.2s=>0.94s. # return _extra.Outline_is_external( self.this) ol = self.this if not ol.m_internal: return False uri = ol.m_internal.uri if 1 else ol.uri() if uri is None: return False return mupdf.fz_is_external_link(uri) @property def is_open(self): if 1: return self.this.m_internal.is_open return self.this.is_open() @property def next(self): ol = self.this next_ol = ol.next() if not next_ol.m_internal: return return Outline(next_ol) @property def page(self): if 1: return self.this.m_internal.page.page return self.this.page().page @property def title(self): return self.this.m_internal.title @property def uri(self): ol = self.this if not ol.m_internal: return None return ol.m_internal.uri @property def x(self): return self.this.m_internal.x @property def y(self): return self.this.m_internal.y __slots__ = [ 'this'] def _make_PdfFilterOptions( recurse=0, instance_forms=0, ascii=0, no_update=0, sanitize=0, sopts=None, ): ''' Returns a mupdf.PdfFilterOptions instance. ''' filter_ = mupdf.PdfFilterOptions() filter_.recurse = recurse filter_.instance_forms = instance_forms filter_.ascii = ascii filter_.no_update = no_update if sanitize: # We want to use a PdfFilterFactory whose `.filter` fn pointer is # set to MuPDF's `pdf_new_sanitize_filter()`. But not sure how to # get access to this raw fn in Python; and on Windows raw MuPDF # functions are not even available to C++. # # So we use SWIG Director to implement our own # PdfFilterFactory whose `filter()` method calls # `mupdf.ll_pdf_new_sanitize_filter()`. 
        if sopts:
            assert isinstance(sopts, mupdf.PdfSanitizeFilterOptions)
        else:
            sopts = mupdf.PdfSanitizeFilterOptions()
        class Factory(mupdf.PdfFilterFactory2):
            def __init__(self):
                super().__init__()
                self.use_virtual_filter()
                self.sopts = sopts
            def filter(self, ctx, doc, chain, struct_parents, transform, options):
                # Debug logging, permanently disabled by `if 0`.
                if 0:
                    log(f'sanitize filter.filter():')
                    log(f' {self=}')
                    log(f' {ctx=}')
                    log(f' {doc=}')
                    log(f' {chain=}')
                    log(f' {struct_parents=}')
                    log(f' {transform=}')
                    log(f' {options=}')
                    log(f' {self.sopts.internal()=}')
                return mupdf.ll_pdf_new_sanitize_filter(
                        doc,
                        chain,
                        struct_parents,
                        transform,
                        options,
                        self.sopts.internal(),
                        )

        factory = Factory()
        filter_.add_factory(factory.internal())
        # Store the factory on the options object.
        # NOTE(review): presumably to keep it alive as long as `filter_` - confirm.
        filter_._factory = factory
    return filter_


class Page:
    """Python wrapper around a `mupdf.FzPage` or `mupdf.PdfPage`."""

    def __init__(self, page, document):
        """Wrap `page` and record `document` as its parent."""
        assert isinstance(page, (mupdf.FzPage, mupdf.PdfPage)), f'page is: {page}'
        self.this = page
        self.thisown = True
        self.last_point = None
        self.draw_cont = ''
        self._annot_refs = dict()
        self.parent = document
        if page.m_internal:
            # A PdfPage keeps its number in the embedded `super` struct.
            if isinstance( page, mupdf.PdfPage):
                self.number = page.m_internal.super.number
            else:
                self.number = page.m_internal.number
        else:
            self.number = None

    def __repr__(self):
        # Delegates to __str__; the statements below are unreachable.
        return self.__str__()
        CheckParent(self)
        x = self.parent.name
        if self.parent.stream is not None:
            x = "<memory, doc# %i>" % (self.parent._graft_id,)
        if x == "":
            x = "<new PDF, doc# %i>" % self.parent._graft_id
        return "page %s of %s" % (self.number, x)

    def __str__(self):
        #CheckParent(self)
        # `parent` may be missing, hence getattr with a default.
        parent = getattr(self, 'parent', None)
        if isinstance(self.this.m_internal, mupdf.pdf_page):
            number = self.this.m_internal.super.number
        else:
            number = self.this.m_internal.number
        ret = f'page {number}'
        if parent:
            # Describe the document: its name, or a placeholder for
            # memory-based and new documents.
            x = self.parent.name
            if self.parent.stream is not None:
                x = "<memory, doc# %i>" % (self.parent._graft_id,)
            if x == "":
                x = "<new PDF, doc# %i>" % self.parent._graft_id
            ret += f' of {x}'
        return ret

    def _add_caret_annot(self, point):
        if g_use_extra:
            annot = extra._add_caret_annot( self.this, JM_point_from_py(point))
        else:
            page = self._pdf_page()
            annot = mupdf.pdf_create_annot(page, mupdf.PDF_ANNOT_CARET)
            if point:
                # Move the annotation to `point`, keeping its size.
                p = JM_point_from_py(point)
                r = mupdf.pdf_annot_rect(annot)
                r = mupdf.FzRect(p.x, p.y, p.x + r.x1 - r.x0, p.y + r.y1 - r.y0)
                mupdf.pdf_set_annot_rect(annot, r)
            mupdf.pdf_update_annot(annot)
            JM_add_annot_id(annot, "A")
        return annot

    def _add_file_annot(self, point, buffer_, filename, ufilename=None, desc=None, icon=None):
        """Create a file attachment annotation at `point`.

        `ufilename` and `desc` default to `filename` when falsy.

        Raises:
            TypeError: if `buffer_` cannot be converted to a MuPDF buffer.
        """
        page = self._pdf_page()
        uf = ufilename if ufilename else filename
        d = desc if desc else filename
        p = JM_point_from_py(point)
        filebuf = JM_BufferFromBytes(buffer_)
        if not filebuf.m_internal:
            raise TypeError( MSG_BAD_BUFFER)
        annot = mupdf.pdf_create_annot(page, mupdf.PDF_ANNOT_FILE_ATTACHMENT)
        # Keep the annotation's size, but place it at `point`.
        r = mupdf.pdf_annot_rect(annot)
        r = mupdf.fz_make_rect(p.x, p.y, p.x + r.x1 - r.x0, p.y + r.y1 - r.y0)
        mupdf.pdf_set_annot_rect(annot, r)
        flags = mupdf.PDF_ANNOT_IS_PRINT
        mupdf.pdf_set_annot_flags(annot, flags)

        if icon:
            mupdf.pdf_set_annot_icon_name(annot, icon)

        val = JM_embed_file(page.doc(), filebuf, filename, uf, d, 1)
        mupdf.pdf_dict_put(mupdf.pdf_annot_obj(annot), PDF_NAME('FS'), val)
        mupdf.pdf_dict_put_text_string(mupdf.pdf_annot_obj(annot), PDF_NAME('Contents'), filename)
        mupdf.pdf_update_annot(annot)
        # Rect and flags are set a second time after the update.
        mupdf.pdf_set_annot_rect(annot, r)
        mupdf.pdf_set_annot_flags(annot, flags)
        JM_add_annot_id(annot, "A")
        return Annot(annot)

    def _add_freetext_annot(
            self, rect, text, fontsize=11, fontname=None, text_color=None, fill_color=None, border_color=None, border_width=0, dashes=None, callout=None, line_end=mupdf.PDF_ANNOT_LE_OPEN_ARROW, opacity=1, align=0, rotate=0, richtext=False, style=None,
            ):
        # Rich-text wrapper around `text`; only written (as "RC") when
        # `richtext` is true.
        rc = f"""<?xml version="1.0"?> <body xmlns="http://www.w3.org/1999/xtml" xmlns:xfa="http://www.xfa.org/schema/xfa-data/1.0/" xfa:contentType="text/html" xfa:APIVersion="Acrobat:8.0.0" xfa:spec="2.4"> {text}"""
        page = self._pdf_page()
        if border_color and not richtext:
            raise ValueError("cannot set border_color if rich_text is False")
        # Without an explicit text color, the border color is used.
        if border_color and not text_color:
            text_color = border_color

        nfcol, fcol = JM_color_FromSequence(fill_color)
        ntcol, tcol = JM_color_FromSequence(text_color)
        r = JM_rect_from_py(rect)
        if mupdf.fz_is_infinite_rect(r) or mupdf.fz_is_empty_rect(r):
            raise ValueError( MSG_BAD_RECT)
        annot = mupdf.pdf_create_annot(page, mupdf.PDF_ANNOT_FREE_TEXT)
        annot_obj = mupdf.pdf_annot_obj(annot)

        #insert text as 'contents' or 'RC' depending on 'richtext'
        if not richtext:
            mupdf.pdf_set_annot_contents(annot, text)
        else:
            mupdf.pdf_dict_put_text_string(annot_obj,PDF_NAME("RC"), rc)
            if style:
                mupdf.pdf_dict_put_text_string(annot_obj,PDF_NAME("DS"), style)

        mupdf.pdf_set_annot_rect(annot, r)

        # Normalize the rotation into [0, 360).
        while rotate < 0:
            rotate += 360
        while rotate >= 360:
            rotate -= 360
        if rotate != 0:
            mupdf.pdf_dict_put_int(annot_obj, PDF_NAME('Rotate'), rotate)

        mupdf.pdf_set_annot_quadding(annot, align)

        if nfcol > 0:
            mupdf.pdf_set_annot_color(annot, fcol[:nfcol])

        mupdf.pdf_set_annot_border_width(annot, border_width)
        mupdf.pdf_set_annot_opacity(annot, opacity)
        if dashes:
            for d in dashes:
                mupdf.pdf_add_annot_border_dash_item(annot, float(d))

        # Insert callout information
        if callout:
            mupdf.pdf_dict_put(annot_obj, PDF_NAME("IT"), PDF_NAME("FreeTextCallout"))
            mupdf.pdf_set_annot_callout_style(annot, line_end)
            point_count = len(callout)
            extra.JM_set_annot_callout_line(annot, tuple(callout), point_count)

        # insert the default appearance string
        if not richtext:
            JM_make_annot_DA(annot, ntcol, tcol, fontname, fontsize)

        mupdf.pdf_update_annot(annot)
        JM_add_annot_id(annot, "A")
        val = Annot(annot)
        return val

    def _add_ink_annot(self, list):
        """Create an ink annotation from a sequence of point sequences.

        Raises:
            ValueError: if `list` is not a sequence or a point is not a
                sequence of length 2.
        """
        page = _as_pdf_page(self.this)
        if not PySequence_Check(list):
            raise ValueError( MSG_BAD_ARG_INK_ANNOT)
        # Points are transformed with the inverse of the page transform.
        ctm = mupdf.FzMatrix()
        mupdf.pdf_page_transform(page, mupdf.FzRect(0), ctm)
        inv_ctm = mupdf.fz_invert_matrix(ctm)
        annot = mupdf.pdf_create_annot(page, mupdf.PDF_ANNOT_INK)
        annot_obj = mupdf.pdf_annot_obj(annot)
        n0 = len(list)
        inklist = mupdf.pdf_new_array(page.doc(), n0)
        for j in range(n0):
            sublist = list[j]
            n1 = len(sublist)
            # One array per stroke, holding x, y pairs.
            stroke = mupdf.pdf_new_array(page.doc(), 2 * n1)
            for i in range(n1):
                p = sublist[i]
                if not PySequence_Check(p) or PySequence_Size(p) != 2:
                    raise ValueError( MSG_BAD_ARG_INK_ANNOT)
                point = mupdf.fz_transform_point(JM_point_from_py(p), inv_ctm)
                mupdf.pdf_array_push_real(stroke, point.x)
                mupdf.pdf_array_push_real(stroke, point.y)
            mupdf.pdf_array_push(inklist, stroke)
        mupdf.pdf_dict_put(annot_obj, PDF_NAME('InkList'), inklist)
        mupdf.pdf_update_annot(annot)
        JM_add_annot_id(annot, "A")
        return Annot(annot)

    def _add_line_annot(self, p1, p2):
        """Create a line annotation from `p1` to `p2`."""
        page = self._pdf_page()
        annot = mupdf.pdf_create_annot(page, mupdf.PDF_ANNOT_LINE)
        a = JM_point_from_py(p1)
        b = JM_point_from_py(p2)
        mupdf.pdf_set_annot_line(annot, a, b)
        mupdf.pdf_update_annot(annot)
        JM_add_annot_id(annot, "A")
        assert annot.m_internal
        return Annot(annot)

    def _add_multiline(self, points, annot_type):
        """Create an annotation of `annot_type` with one vertex per point.

        Raises:
            ValueError: if fewer than 2 points are given or a point does
                not have length 2.
        """
        page = self._pdf_page()
        if len(points) < 2:
            raise ValueError( MSG_BAD_ARG_POINTS)
        annot = mupdf.pdf_create_annot(page, annot_type)
        for p in points:
            if (PySequence_Size(p) != 2):
                raise ValueError( MSG_BAD_ARG_POINTS)
            point = JM_point_from_py(p)
            mupdf.pdf_add_annot_vertex(annot, point)

        mupdf.pdf_update_annot(annot)
        JM_add_annot_id(annot, "A")
        return Annot(annot)

    def _add_redact_annot(self, quad, text=None, da_str=None, align=0, fill=None, text_color=None):
        page = self._pdf_page()
        fcol = [ 1, 1, 1, 0]
        nfcol = 0
        annot = mupdf.pdf_create_annot(page, mupdf.PDF_ANNOT_REDACT)
        q = JM_quad_from_py(quad)
        r = mupdf.fz_rect_from_quad(q)
        # TODO calculate de-rotated rect
        mupdf.pdf_set_annot_rect(annot, r)
        if fill:
            nfcol, fcol = JM_color_FromSequence(fill)
            arr = mupdf.pdf_new_array(page.doc(), nfcol)
            for i in range(nfcol):
                mupdf.pdf_array_push_real(arr, fcol[i])
            mupdf.pdf_dict_put(mupdf.pdf_annot_obj(annot), PDF_NAME('IC'), arr)
        if text:
            assert da_str
            mupdf.pdf_dict_puts(
                    mupdf.pdf_annot_obj(annot),
                    "OverlayText",
                    mupdf.pdf_new_text_string(text),
                    )
            mupdf.pdf_dict_put_text_string(mupdf.pdf_annot_obj(annot), PDF_NAME('DA'), da_str)
def _add_square_or_circle(self, rect, annot_type):
    """Create an annotation of type `annot_type` occupying `rect`.

    Args:
        rect: rectangle, converted with JM_rect_from_py().
        annot_type: MuPDF annotation type passed to pdf_create_annot().

    Returns:
        An Annot wrapping the new annotation.

    Raises:
        ValueError: if the converted rectangle is infinite or empty.
    """
    pdf_page = self._pdf_page()
    bbox = JM_rect_from_py(rect)
    usable = not mupdf.fz_is_infinite_rect(bbox) and not mupdf.fz_is_empty_rect(bbox)
    if not usable:
        raise ValueError(MSG_BAD_RECT)

    new_annot = mupdf.pdf_create_annot(pdf_page, annot_type)
    mupdf.pdf_set_annot_rect(new_annot, bbox)
    mupdf.pdf_update_annot(new_annot)
    JM_add_annot_id(new_annot, "A")
    assert new_annot.m_internal
    return Annot(new_annot)
def _addAnnot_FromString(self, linklist):
    """Add links / annotations to the page from PDF object source strings.

    Args:
        linklist: tuple of object source strings. An empty argument is a
            no-op.

    Raises:
        ValueError: if `linklist` is non-empty but not a tuple.

    Items that are empty, or that cannot be turned into a PDF object, are
    skipped with a user message; the remaining items are still processed.
    """
    CheckParent(self)
    if g_use_extra:
        # Rebind the class attribute so that later calls are served by the
        # `extra` implementation.
        self.__class__._addAnnot_FromString = extra.Page_addAnnot_FromString
        return extra.Page_addAnnot_FromString(self.this, linklist)

    page = _as_pdf_page(self.this)
    lcount = len(linklist)  # link count
    if lcount < 1:
        return
    if not isinstance(linklist, tuple):
        raise ValueError("bad 'linklist' argument")

    # Make sure the page has an /Annots array to append to.
    if not mupdf.pdf_dict_get(page.obj(), PDF_NAME('Annots')).m_internal:
        mupdf.pdf_dict_put_array(page.obj(), PDF_NAME('Annots'), lcount)
    annots = mupdf.pdf_dict_get(page.obj(), PDF_NAME('Annots'))
    assert annots.m_internal, f'{lcount=} {annots.m_internal=}'

    # Insert links from the provided sources.
    for i, txtpy in enumerate(linklist):
        text = JM_StrAsChar(txtpy)
        if not text:
            # Format the index into the text, as the handler below does;
            # previously the index was passed as a second positional argument.
            message("skipping bad link / annot item %i." % i)
            continue
        try:
            annot = mupdf.pdf_add_object(
                page.doc(),
                JM_pdf_obj_from_str(page.doc(), text),
            )
            ind_obj = mupdf.pdf_new_indirect(page.doc(), mupdf.pdf_to_num(annot), 0)
            mupdf.pdf_array_push(annots, ind_obj)
        except Exception:
            # Best effort by design: one bad item must not abort the others.
            if g_exceptions_verbose:
                exception_info()
            message("skipping bad link / annot item %i.\n" % i)
""" page = _as_pdf_page(self) # need the underlying PDF page res = mupdf.pdf_dict_get( # access /Resources page.obj(), mupdf.PDF_ENUM_NAME_Resources, ) cont = mupdf.pdf_dict_get( # access /Contents page.obj(), mupdf.PDF_ENUM_NAME_Contents, ) pdf = _as_pdf_document(self.parent) # need underlying PDF document # return value of MuPDF function return mupdf.pdf_count_q_balance_outparams_fn(pdf, res, cont) def _get_optional_content(self, oc: OptInt) -> OptStr: if oc is None or oc == 0: return None doc = self.parent check = doc.xref_object(oc, compressed=True) if not ("/Type/OCG" in check or "/Type/OCMD" in check): #log( 'raising "bad optional content"') raise ValueError("bad optional content: 'oc'") #log( 'Looking at self._get_resource_properties()') props = {} for p, x in self._get_resource_properties(): props[x] = p if oc in props.keys(): return props[oc] i = 0 mc = "MC%i" % i while mc in props.values(): i += 1 mc = "MC%i" % i self._set_resource_property(mc, oc) #log( 'returning {mc=}') return mc def _get_resource_properties(self): ''' page list Resource/Properties ''' page = self._pdf_page() rc = JM_get_resource_properties(page.obj()) return rc def _get_textpage(self, clip=None, flags=0, matrix=None): if g_use_extra: ll_tpage = extra.page_get_textpage(self.this, clip, flags, matrix) tpage = mupdf.FzStextPage(ll_tpage) return tpage page = self.this options = mupdf.FzStextOptions(flags) rect = JM_rect_from_py(clip) # Default to page's rect if `clip` not specified, for #2048. 
def _insert_image(self,
        filename=None, pixmap=None, stream=None, imask=None, clip=None,
        overlay=1, rotate=0, keep_proportion=1, oc=0, width=0, height=0,
        xref=0, alpha=-1, _imgname=None, digests=None
        ):
    """Insert an image on the page and return its xref.

    The image source is chosen in this order: an existing `xref` (> 0),
    then `stream`, then `filename`, otherwise `pixmap`.

    The `do_*` flags below emulate the `goto` labels named in the
    `#goto ...` comments: each stage runs only while its flag is still 1, and
    earlier stages clear the flags of stages they want to jump over.

    Returns:
        `(img_xref, digests)` if a new image object was added to the PDF,
        else `(img_xref, None)`.

    Raises:
        ValueError: if `xref` refers to an object whose width and height are
            both 0, or if `imask` is given for an image without a compressed
            buffer.

    NOTE(review): `alpha` is not referenced in this body. The pixmap and
    stream stages call `digests.get(...)`, so `digests` must not be None
    there, and the pixmap stage dereferences `pixmap.this` - presumably the
    caller validates these; confirm.
    """
    maskbuf = mupdf.FzBuffer()
    page = self._pdf_page()
    # This will create an empty PdfDocument with a call to
    # pdf_new_document() then assign page.doc()'s return value to it (which
    # drop the original empty pdf_document).
    pdf = page.doc()
    w = width
    h = height
    img_xref = xref
    rc_digest = 0   # set to 1 once a new image object has been added

    # Stage flags; all stages are enabled initially.
    do_process_pixmap = 1
    do_process_stream = 1
    do_have_imask = 1
    do_have_image = 1
    do_have_xref = 1

    if xref > 0:
        # Re-use an existing image object: only its dimensions are needed.
        ref = mupdf.pdf_new_indirect(pdf, xref, 0)
        w = mupdf.pdf_to_int( mupdf.pdf_dict_geta( ref, PDF_NAME('Width'), PDF_NAME('W')))
        h = mupdf.pdf_to_int( mupdf.pdf_dict_geta( ref, PDF_NAME('Height'), PDF_NAME('H')))
        if w + h == 0:
            raise ValueError( MSG_IS_NO_IMAGE)
        #goto have_xref()
        do_process_pixmap = 0
        do_process_stream = 0
        do_have_imask = 0
        do_have_image = 0
    else:
        if stream:
            imgbuf = JM_BufferFromBytes(stream)
            do_process_pixmap = 0
        else:
            if filename:
                imgbuf = mupdf.fz_read_file(filename)
                #goto have_stream()
                do_process_pixmap = 0

    if do_process_pixmap:
        #log( 'do_process_pixmap')
        # process pixmap ---------------------------------
        arg_pix = pixmap.this
        w = arg_pix.w()
        h = arg_pix.h()
        digest = mupdf.fz_md5_pixmap2(arg_pix)
        md5_py = digest
        # A digest that is already known means the image is in the PDF.
        temp = digests.get(md5_py, None)
        if temp is not None:
            img_xref = temp
            ref = mupdf.pdf_new_indirect(page.doc(), img_xref, 0)
            #goto have_xref()
            do_process_stream = 0
            do_have_imask = 0
            do_have_image = 0
        else:
            if arg_pix.alpha() == 0:
                image = mupdf.fz_new_image_from_pixmap(arg_pix, mupdf.FzImage())
            else:
                # Pixmap has alpha: build a separate mask image from a
                # converted copy and attach it to the image.
                pm = mupdf.fz_convert_pixmap(
                        arg_pix,
                        mupdf.FzColorspace(),
                        mupdf.FzColorspace(),
                        mupdf.FzDefaultColorspaces(None),
                        mupdf.FzColorParams(),
                        1,
                        )
                pm.alpha = 0
                pm.colorspace = None
                mask = mupdf.fz_new_image_from_pixmap(pm, mupdf.FzImage())
                image = mupdf.fz_new_image_from_pixmap(arg_pix, mask)
            #goto have_image()
            do_process_stream = 0
            do_have_imask = 0

    if do_process_stream:
        #log( 'do_process_stream')
        # process stream ---------------------------------
        # The digest covers the image bytes and, if present, the mask bytes.
        state = mupdf.FzMd5()
        if mupdf_cppyy:
            mupdf.fz_md5_update_buffer( state, imgbuf)
        else:
            mupdf.fz_md5_update(state, imgbuf.m_internal.data, imgbuf.m_internal.len)
        if imask:
            maskbuf = JM_BufferFromBytes(imask)
            if mupdf_cppyy:
                mupdf.fz_md5_update_buffer( state, maskbuf)
            else:
                mupdf.fz_md5_update(state, maskbuf.m_internal.data, maskbuf.m_internal.len)
        digest = mupdf.fz_md5_final2(state)
        md5_py = bytes(digest)
        temp = digests.get(md5_py, None)
        if temp is not None:
            # Known image: take the dimensions from the existing object.
            img_xref = temp
            ref = mupdf.pdf_new_indirect(page.doc(), img_xref, 0)
            w = mupdf.pdf_to_int( mupdf.pdf_dict_geta( ref, PDF_NAME('Width'), PDF_NAME('W')))
            h = mupdf.pdf_to_int( mupdf.pdf_dict_geta( ref, PDF_NAME('Height'), PDF_NAME('H')))
            #goto have_xref()
            do_have_imask = 0
            do_have_image = 0
        else:
            image = mupdf.fz_new_image_from_buffer(imgbuf)
            w = image.w()
            h = image.h()
            if not imask:
                #goto have_image()
                do_have_imask = 0

    if do_have_imask:
        # `fz_compressed_buffer` is reference counted and
        # `mupdf.fz_new_image_from_compressed_buffer2()`
        # is provided as a Swig-friendly wrapper for
        # `fz_new_image_from_compressed_buffer()`, so we can do things
        # straightforwardly.
        #
        cbuf1 = mupdf.fz_compressed_image_buffer( image)
        if not cbuf1.m_internal:
            raise ValueError( "uncompressed image cannot have mask")
        bpc = image.bpc()
        colorspace = image.colorspace()
        xres, yres = mupdf.fz_image_resolution(image)
        mask = mupdf.fz_new_image_from_buffer(maskbuf)
        # Rebuild the image from its compressed buffer with the mask attached.
        image = mupdf.fz_new_image_from_compressed_buffer2(
                w,
                h,
                bpc,
                colorspace,
                xres,
                yres,
                1,  # interpolate
                0,  # imagemask,
                list(), # decode
                list(), # colorkey
                cbuf1,
                mask,
                )

    if do_have_image:
        #log( 'do_have_image')
        # Add the new image object and remember its digest for later calls.
        ref = mupdf.pdf_add_image(pdf, image)
        if oc:
            JM_add_oc_object(pdf, ref, oc)
        img_xref = mupdf.pdf_to_num(ref)
        digests[md5_py] = img_xref
        rc_digest = 1

    if do_have_xref:
        #log( 'do_have_xref')
        # Reference the image in /Resources/XObject (creating the
        # dictionaries if missing) and emit the drawing commands.
        resources = mupdf.pdf_dict_get_inheritable(page.obj(), PDF_NAME('Resources'))
        if not resources.m_internal:
            resources = mupdf.pdf_dict_put_dict(page.obj(), PDF_NAME('Resources'), 2)
        xobject = mupdf.pdf_dict_get(resources, PDF_NAME('XObject'))
        if not xobject.m_internal:
            xobject = mupdf.pdf_dict_put_dict(resources, PDF_NAME('XObject'), 2)
        mat = calc_image_matrix(w, h, clip, rotate, keep_proportion)
        mupdf.pdf_dict_puts(xobject, _imgname, ref)
        nres = mupdf.fz_new_buffer(50)
        s = f"\nq\n{_format_g((mat.a, mat.b, mat.c, mat.d, mat.e, mat.f))} cm\n/{_imgname} Do\nQ\n"
        #s = s.replace('\n', '\r\n')
        mupdf.fz_append_string(nres, s)
        JM_insert_contents(pdf, page.obj(), nres, overlay)

    if rc_digest:
        return img_xref, digests
    else:
        return img_xref, None
def _set_opacity(self, gstate=None, CA=1, ca=1, blendmode=None):
    """Make sure the page has an /ExtGState entry for the given opacities.

    Args:
        gstate: ignored - the entry name is always derived from `CA` and
            `ca`. Kept for interface compatibility.
        CA: value stored under /CA of the new entry.
        ca: value stored under /ca of the new entry.
        blendmode: only checked against None to decide whether anything needs
            to be done; it is not written to the entry.

    Returns:
        None if `CA >= 1`, `ca >= 1` and `blendmode` is None. Otherwise the
        entry name "fitzca<CC><cc>", where CC and cc are the opacities as
        percentages, clamped to 0..99. An existing entry with that name is
        re-used without being modified.
    """
    if CA >= 1 and ca >= 1 and blendmode is None:
        return

    # The name carries two digits per value, so 99 is the largest percentage.
    tCA = min(int(round(max(CA, 0) * 100)), 99)
    tca = min(int(round(max(ca, 0) * 100)), 99)
    gstate = "fitzca%02i%02i" % (tCA, tca)

    page = _as_pdf_page(self.this)
    resources = mupdf.pdf_dict_get(page.obj(), PDF_NAME('Resources'))
    if not resources.m_internal:
        resources = mupdf.pdf_dict_put_dict(page.obj(), PDF_NAME('Resources'), 2)
    extg = mupdf.pdf_dict_get(resources, PDF_NAME('ExtGState'))
    if not extg.m_internal:
        extg = mupdf.pdf_dict_put_dict(resources, PDF_NAME('ExtGState'), 2)

    # Nothing to add if an entry with this name exists already.
    for i in range(mupdf.pdf_dict_len(extg)):
        if mupdf.pdf_to_name(mupdf.pdf_dict_get_key(extg, i)) == gstate:
            return gstate

    # Note: the unclamped CA / ca values are what is stored in the entry.
    opa = mupdf.pdf_new_dict(page.doc(), 3)
    mupdf.pdf_dict_put_real(opa, PDF_NAME('CA'), CA)
    mupdf.pdf_dict_put_real(opa, PDF_NAME('ca'), ca)
    mupdf.pdf_dict_puts(extg, gstate, opa)
    return gstate
def add_caret_annot(self, point: point_like) -> Annot:
    """Add a 'Caret' annotation.

    Args:
        point: position passed on to _add_caret_annot().

    Returns:
        The new annotation, after annot_postprocess() has been applied.
    """
    saved_rotation = annot_preprocess(self)
    try:
        raw_annot = self._add_caret_annot(point)
    finally:
        # Re-apply the rotation reported by annot_preprocess(), also when
        # the creation failed.
        if saved_rotation != 0:
            self.set_rotation(saved_rotation)
    caret = Annot(raw_annot)
    annot_postprocess(self, caret)
    assert hasattr(caret, 'parent')
    return caret
def add_highlight_annot(self, quads=None, start=None, stop=None, clip=None) -> Annot:
    """Add a 'Highlight' annotation.

    Args:
        quads: the areas to mark, validated with CheckMarkerArg(). If None,
            the areas are determined by get_highlight_selection() from
            `start`, `stop` and `clip`.
        start: passed to get_highlight_selection() when `quads` is None.
        stop: passed to get_highlight_selection() when `quads` is None.
        clip: passed to get_highlight_selection() when `quads` is None.

    Returns:
        Whatever _add_text_marker() returns for the highlight type.
    """
    marker_quads = (
        get_highlight_selection(self, start=start, stop=stop, clip=clip)
        if quads is None
        else CheckMarkerArg(quads)
    )
    return self._add_text_marker(marker_quads, mupdf.PDF_ANNOT_HIGHLIGHT)
""" old_rotation = annot_preprocess(self) try: annot = self._add_ink_annot(handwriting) finally: if old_rotation != 0: self.set_rotation(old_rotation) annot_postprocess(self, annot) return annot def add_line_annot(self, p1: point_like, p2: point_like) -> Annot: """Add a 'Line' annotation.""" old_rotation = annot_preprocess(self) try: annot = self._add_line_annot(p1, p2) finally: if old_rotation != 0: self.set_rotation(old_rotation) annot_postprocess(self, annot) return annot def add_polygon_annot(self, points: list) -> Annot: """Add a 'Polygon' annotation.""" old_rotation = annot_preprocess(self) try: annot = self._add_multiline(points, mupdf.PDF_ANNOT_POLYGON) finally: if old_rotation != 0: self.set_rotation(old_rotation) annot_postprocess(self, annot) return annot def add_polyline_annot(self, points: list) -> Annot: """Add a 'PolyLine' annotation.""" old_rotation = annot_preprocess(self) try: annot = self._add_multiline(points, mupdf.PDF_ANNOT_POLY_LINE) finally: if old_rotation != 0: self.set_rotation(old_rotation) annot_postprocess(self, annot) return annot def add_rect_annot(self, rect: rect_like) -> Annot: """Add a 'Square' (rectangle) annotation.""" old_rotation = annot_preprocess(self) try: annot = self._add_square_or_circle(rect, mupdf.PDF_ANNOT_SQUARE) finally: if old_rotation != 0: self.set_rotation(old_rotation) annot_postprocess(self, annot) return annot def add_redact_annot( self, quad, text: OptStr =None, fontname: OptStr =None, fontsize: float =11, align: int =0, fill: OptSeq =None, text_color: OptSeq =None, cross_out: bool =True, ) -> Annot: """Add a 'Redact' annotation.""" da_str = None if text and not set(string.whitespace).issuperset(text): CheckColor(fill) CheckColor(text_color) if not fontname: fontname = "Helv" if not fontsize: fontsize = 11 if not text_color: text_color = (0, 0, 0) if hasattr(text_color, "__float__"): text_color = (text_color, text_color, text_color) if len(text_color) > 3: text_color = text_color[:3] fmt = "{:g} {:g} {:g} 
def add_stamp_annot(self, rect: rect_like, stamp=0) -> Annot:
    """Add a ('rubber') 'Stamp' annotation.

    Args:
        rect: rectangle passed on to _add_stamp_annot().
        stamp: stamp selector passed on to _add_stamp_annot().

    Returns:
        The new annotation, after annot_postprocess() has been applied.
    """
    saved_rotation = annot_preprocess(self)
    try:
        stamp_annot = self._add_stamp_annot(rect, stamp)
    finally:
        # Re-apply the rotation reported by annot_preprocess(), also when
        # the creation failed.
        if saved_rotation != 0:
            self.set_rotation(saved_rotation)
    annot_postprocess(self, stamp_annot)
    return stamp_annot
def annot_names(self):
    """List of names of annotations, fields and links.

    Returns:
        The result of JM_get_annot_id_list() for the underlying PDF page, or
        an empty list if the page has no underlying PDF page object.
    """
    CheckParent(self)
    page = self._pdf_page(required=False)
    if not page.m_internal:
        return []
    return JM_get_annot_id_list(page)
""" skip_types = (mupdf.PDF_ANNOT_LINK, mupdf.PDF_ANNOT_POPUP, mupdf.PDF_ANNOT_WIDGET) if not hasattr(types, "__getitem__"): annot_xrefs = [a[0] for a in self.annot_xrefs() if a[1] not in skip_types] else: annot_xrefs = [a[0] for a in self.annot_xrefs() if a[1] in types and a[1] not in skip_types] for xref in annot_xrefs: annot = self.load_annot(xref) annot._yielded=True yield annot def apply_redactions( page: 'Page', images: int = 2, graphics: int = 1, text: int = 0, ) -> bool: """Apply the redaction annotations of the page. Args: page: the PDF page. images: 0 - ignore images 1 - remove all overlapping images 2 - blank out overlapping image parts 3 - remove image unless invisible graphics: 0 - ignore graphics 1 - remove graphics if contained in rectangle 2 - remove all overlapping graphics text: 0 - remove text 1 - ignore text """ def center_rect(annot_rect, new_text, font, fsize): """Calculate minimal sub-rectangle for the overlay text. Notes: Because 'insert_textbox' supports no vertical text centering, we calculate an approximate number of lines here and return a sub-rect with smaller height, which should still be sufficient. Args: annot_rect: the annotation rectangle new_text: the text to insert. font: the fontname. Must be one of the CJK or Base-14 set, else the rectangle is returned unchanged. fsize: the fontsize Returns: A rectangle to use instead of the annot rectangle. 
""" if not new_text or annot_rect.width <= EPSILON: return annot_rect try: text_width = get_text_length(new_text, font, fsize) except (ValueError, mupdf.FzErrorBase): # unsupported font if g_exceptions_verbose: exception_info() return annot_rect line_height = fsize * 1.2 limit = annot_rect.width h = math.ceil(text_width / limit) * line_height # estimate rect height if h >= annot_rect.height: return annot_rect r = annot_rect y = (annot_rect.tl.y + annot_rect.bl.y - h) * 0.5 r.y0 = y return r CheckParent(page) doc = page.parent if doc.is_encrypted or doc.is_closed: raise ValueError("document closed or encrypted") if not doc.is_pdf: raise ValueError("is no PDF") redact_annots = [] # storage of annot values for annot in page.annots( types=(mupdf.PDF_ANNOT_REDACT,) # pylint: disable=no-member ): # loop redactions redact_annots.append(annot._get_redact_values()) # save annot values if redact_annots == []: # any redactions on this page? return False # no redactions rc = page._apply_redactions(text, images, graphics) # call MuPDF if not rc: # should not happen really raise ValueError("Error applying redactions.") # now write replacement text in old redact rectangles shape = page.new_shape() for redact in redact_annots: annot_rect = redact["rect"] fill = redact["fill"] if fill: shape.draw_rect(annot_rect) # colorize the rect background shape.finish(fill=fill, color=fill) if "text" in redact.keys(): # if we also have text new_text = redact["text"] align = redact.get("align", 0) fname = redact["fontname"] fsize = redact["fontsize"] color = redact["text_color"] # try finding vertical centered sub-rect trect = center_rect(annot_rect, new_text, fname, fsize) rc = -1 while rc < 0 and fsize >= 4: # while not enough room # (re-) try insertion rc = shape.insert_textbox( trect, new_text, fontname=fname, fontsize=fsize, color=color, align=align, ) fsize -= 0.5 # reduce font if unsuccessful shape.commit() # append new contents object return True def recolor(self, components=1): 
def bound(self):
    """Get page rectangle.

    Returns:
        The rectangle reported by fz_bound_page(). If that rectangle is
        infinite and the document is a PDF, a rectangle at the origin with
        the CropBox dimensions is returned instead (width and height swapped
        unless the rotation is 0 or 180), and the most recent MuPDF warning,
        if there is one, is forwarded to message().
    """
    CheckParent(self)
    page = _as_fz_page(self.this)
    val = Rect(mupdf.fz_bound_page(page))
    if val.is_infinite and self.parent.is_pdf:
        cb = self.cropbox
        w, h = cb.width, cb.height
        if self.rotation not in (0, 180):
            w, h = h, w
        val = Rect(0, 0, w, h)
        # Guard against an empty warnings text: indexing [-1] on an empty
        # list would raise IndexError and hide the fallback rectangle.
        pending = TOOLS.mupdf_warnings(reset=False).splitlines()
        if pending:
            message(pending[-1])
    return val
    @property
    def cropbox_position(self):
        """Top-left point of the cropbox."""
        return self.cropbox.tl

    def delete_annot(self, annot):
        """Delete annot and return next one.

        Annotations found by `JM_find_annot_irt` for `annot` are deleted
        first, then the annotation itself. The wrapper for the following
        annotation is registered in this page's `_annot_refs`.
        """
        CheckParent(self)
        CheckParent(annot)

        page = self._pdf_page()
        while 1:
            # first loop through all /IRT annots and remove them
            irt_annot = JM_find_annot_irt(annot.this)
            if not irt_annot:  # no more there
                break
            mupdf.pdf_delete_annot(page, irt_annot.this)
        nextannot = mupdf.pdf_next_annot(annot.this)  # store next
        mupdf.pdf_delete_annot(page, annot.this)
        val = Annot(nextannot)

        if val:
            val.thisown = True
            val.parent = weakref.proxy(self)  # owning page object
            val.parent._annot_refs[id(val)] = val
        annot._erase()
        return val

    def delete_image(page: 'Page', xref: int):
        """Delete the image referred to by xref.

        Actually replaces by a small transparent Pixmap using method Page.replace_image.

        Args:
            xref: xref of the image to delete.
        """
        # make a small 100% transparent pixmap (of just any dimension)
        pix = Pixmap(csGRAY, (0, 0, 1, 1), 1)
        pix.clear_with()  # clear all samples bytes to 0x00
        page.replace_image(xref, pixmap=pix)

    def delete_link(self, linkdict):
        """Delete a Link.

        Args:
            linkdict: a link dictionary; anything that is not a dict is
                ignored.
        """
        CheckParent(self)
        if not isinstance( linkdict, dict):
            return  # have no dictionary

        def finished():
            # Common exit path: erase the Python-side link object registered
            # under linkdict["id"], if any.
            if linkdict["xref"] == 0:
                return
            try:
                linkid = linkdict["id"]
                linkobj = self._annot_refs[linkid]
                linkobj._erase()
            except Exception:
                # Don't print this exception, to match classic. Issue #2841.
                if g_exceptions_verbose > 1:
                    exception_info()
                pass

        page = _as_pdf_page(self.this, required=False)
        if not page.m_internal:
            return finished()  # have no PDF
        xref = linkdict[dictkey_xref]
        if xref < 1:
            return finished()  # invalid xref
        annots = mupdf.pdf_dict_get( page.obj(), PDF_NAME('Annots'))
        if not annots.m_internal:
            return finished()  # have no annotations
        len_ = mupdf.pdf_array_len( annots)
        if len_ == 0:
            return finished()
        # Search the /Annots array for the entry with the link's xref.
        oxref = 0
        for i in range( len_):
            oxref = mupdf.pdf_to_num( mupdf.pdf_array_get( annots, i))
            if xref == oxref:
                break  # found xref in annotations

        if xref != oxref:
            return finished()  # xref not in annotations
        mupdf.pdf_array_delete( annots, i)  # delete entry in annotations
        mupdf.pdf_delete_object( page.doc(), xref)  # delete link object
        mupdf.pdf_dict_put( page.obj(), PDF_NAME('Annots'), annots)
        JM_refresh_links( page)

        return finished()

    def delete_widget(page: 'Page', widget: Widget) -> Widget:
        """Delete widget from page and return the next one.

        After deletion all attributes of `widget` are removed from its
        `__dict__`.

        Raises:
            ValueError: if `widget` has no `_annot` attribute.
        """
        CheckParent(page)
        annot = getattr(widget, "_annot", None)
        if annot is None:
            raise ValueError("bad type: widget")
        nextwidget = widget.next
        page.delete_annot(annot)
        widget._annot.parent = None
        # Copy the key list first: the dict is modified while we walk it.
        keylist = list(widget.__dict__.keys())
        for key in keylist:
            del widget.__dict__[key]
        return nextwidget

    @property
    def derotation_matrix(self) -> Matrix:
        """Reflects page de-rotation."""
        if g_use_extra:
            return Matrix(extra.Page_derotate_matrix( self.this))
        pdfpage = self._pdf_page(required=False)
        if not pdfpage.m_internal:
            return Matrix(mupdf.FzRect(mupdf.FzRect.UNIT))
        return Matrix(JM_derotate_page_matrix(pdfpage))

    def draw_bezier(
            page: 'Page',
            p1: point_like,
            p2: point_like,
            p3: point_like,
            p4: point_like,
            color: OptSeq = (0,),
            fill: OptSeq = None,
            dashes: OptStr = None,
            width: float = 1,
            morph: OptStr = None,
            closePath: bool = False,
            lineCap: int = 0,
            lineJoin: int = 0,
            overlay: bool = True,
            stroke_opacity: float = 1,
            fill_opacity: float = 1,
            oc: int = 0,
            ) -> Point:
        """Draw a general cubic Bezier curve from p1 to p4
        using control points p2 and p3."""
        # Draw on the object returned by page.new_shape(), apply the styling
        # arguments via finish(), then commit it to the page.
        img = page.new_shape()
        Q = img.draw_bezier(Point(p1), Point(p2), Point(p3), Point(p4))
        img.finish(
                color=color,
                fill=fill,
                dashes=dashes,
                width=width,
                lineCap=lineCap,
                lineJoin=lineJoin,
                morph=morph,
                closePath=closePath,
                stroke_opacity=stroke_opacity,
                fill_opacity=fill_opacity,
                oc=oc,
                )
        img.commit(overlay)

        return Q

    def draw_circle(
            page: 'Page',
            center: point_like,
            radius: float,
            color: OptSeq = (0,),
            fill: OptSeq = None,
            morph: OptSeq = None,
            dashes: OptStr = None,
            width: float = 1,
            lineCap: int = 0,
            lineJoin: int = 0,
            overlay: bool = True,
            stroke_opacity: float = 1,
            fill_opacity: float = 1,
            oc: int = 0,
            ) -> Point:
        """Draw a circle given its center and radius.

        Returns the value of the shape object's draw_circle() call.
        """
        img = page.new_shape()
        Q = img.draw_circle(Point(center), radius)
        img.finish(
                color=color,
                fill=fill,
                dashes=dashes,
                width=width,
                lineCap=lineCap,
                lineJoin=lineJoin,
                morph=morph,
                stroke_opacity=stroke_opacity,
                fill_opacity=fill_opacity,
                oc=oc,
                )
        img.commit(overlay)
        return Q

    def draw_curve(
            page: 'Page',
            p1: point_like,
            p2: point_like,
            p3: point_like,
            color: OptSeq = (0,),
            fill: OptSeq = None,
            dashes: OptStr = None,
            width: float = 1,
            morph: OptSeq = None,
            closePath: bool = False,
            lineCap: int = 0,
            lineJoin: int = 0,
            overlay: bool = True,
            stroke_opacity: float = 1,
            fill_opacity: float = 1,
            oc: int = 0,
            ) -> Point:
        """Draw a special Bezier curve from p1 to p3, generating control points on lines p1 to p2 and p2 to p3.

        Returns the value of the shape object's draw_curve() call.
        """
        img = page.new_shape()
        Q = img.draw_curve(Point(p1), Point(p2), Point(p3))
        img.finish(
                color=color,
                fill=fill,
                dashes=dashes,
                width=width,
                lineCap=lineCap,
                lineJoin=lineJoin,
                morph=morph,
                closePath=closePath,
                stroke_opacity=stroke_opacity,
                fill_opacity=fill_opacity,
                oc=oc,
                )
        img.commit(overlay)

        return Q

    def draw_line(
            page: 'Page',
            p1: point_like,
            p2: point_like,
            color: OptSeq = (0,),
            dashes: OptStr = None,
            width: float = 1,
            lineCap: int = 0,
            lineJoin: int = 0,
            overlay: bool = True,
            morph: OptSeq = None,
            stroke_opacity: float = 1,
            fill_opacity: float = 1,
            oc=0,
            ) -> Point:
        """Draw a line from point p1 to point p2.

        Returns the value of the shape object's draw_line() call.
        """
        img = page.new_shape()
        p = img.draw_line(Point(p1), Point(p2))
        # A line is never closed: closePath is passed as False unconditionally.
        img.finish(
                color=color,
                dashes=dashes,
                width=width,
                closePath=False,
                lineCap=lineCap,
                lineJoin=lineJoin,
                morph=morph,
                stroke_opacity=stroke_opacity,
                fill_opacity=fill_opacity,
                oc=oc,
                )
        img.commit(overlay)

        return p

    def draw_oval(
            page: 'Page',
            rect: typing.Union[rect_like, quad_like],
            color: OptSeq = (0,),
            fill: OptSeq = None,
            dashes: OptStr = None,
            morph: OptSeq = None,
            width: float = 1,
            lineCap: int = 0,
            lineJoin: int = 0,
            overlay: bool = True,
            stroke_opacity: float = 1,
            fill_opacity: float = 1,
            oc: int = 0,
            ) -> Point:
        """Draw an oval given its containing rectangle or quad.

        `rect` is handed to the shape object's draw_oval() unchanged.
        """
        img = page.new_shape()
        Q = img.draw_oval(rect)
        img.finish(
                color=color,
                fill=fill,
                dashes=dashes,
                width=width,
                lineCap=lineCap,
                lineJoin=lineJoin,
                morph=morph,
                stroke_opacity=stroke_opacity,
                fill_opacity=fill_opacity,
                oc=oc,
                )
        img.commit(overlay)

        return Q

    def draw_polyline(
            page: 'Page',
            points: list,
            color: OptSeq = (0,),
            fill: OptSeq = None,
            dashes: OptStr = None,
            width: float = 1,
            morph: OptSeq = None,
            lineCap: int = 0,
            lineJoin: int = 0,
            overlay: bool = True,
            closePath: bool = False,
            stroke_opacity: float = 1,
            fill_opacity: float = 1,
            oc: int = 0,
            ) -> Point:
        """Draw multiple connected line segments.

        `points` is handed to the shape object's draw_polyline() unchanged.
        """
        img = page.new_shape()
        Q = img.draw_polyline(points)
        img.finish(
                color=color,
                fill=fill,
                dashes=dashes,
                width=width,
                lineCap=lineCap,
                lineJoin=lineJoin,
                morph=morph,
                closePath=closePath,
                stroke_opacity=stroke_opacity,
                fill_opacity=fill_opacity,
                oc=oc,
                )
        img.commit(overlay)

        return Q

    def draw_quad(
            page: 'Page',
            quad: quad_like,
            color: OptSeq = (0,),
            fill: OptSeq = None,
            dashes: OptStr = None,
            width: float = 1,
            lineCap: int = 0,
            lineJoin: int = 0,
            morph: OptSeq = None,
            overlay: bool = True,
            stroke_opacity: float = 1,
            fill_opacity: float = 1,
            oc: int = 0,
            ) -> Point:
        """Draw a quadrilateral."""
        img = page.new_shape()
        Q = img.draw_quad(Quad(quad))
        img.finish(
                color=color,
                fill=fill,
                dashes=dashes,
                width=width,
                lineCap=lineCap,
                lineJoin=lineJoin,
                morph=morph,
                stroke_opacity=stroke_opacity,
                fill_opacity=fill_opacity,
                oc=oc,
                )
        img.commit(overlay)

        return Q

    def draw_rect(
            page: 'Page',
            rect: rect_like,
            color: OptSeq = (0,),
            fill: OptSeq = None,
            dashes: OptStr = None,
            width: float = 1,
            lineCap: int = 0,
            lineJoin: int = 0,
            morph: OptSeq = None,
            overlay: bool = True,
            stroke_opacity: float = 1,
            fill_opacity: float = 1,
            oc: int = 0,
            radius=None,
            ) -> Point:
        '''
        Draw a rectangle. See Shape class method for details.

        `radius` is forwarded to the shape object's draw_rect() call.
        '''
        img = page.new_shape()
        Q = img.draw_rect(Rect(rect), radius=radius)
        img.finish(
                color=color,
                fill=fill,
                dashes=dashes,
                width=width,
                lineCap=lineCap,
                lineJoin=lineJoin,
                morph=morph,
                stroke_opacity=stroke_opacity,
                fill_opacity=fill_opacity,
                oc=oc,
                )
        img.commit(overlay)

        return Q

    def draw_sector(
            page: 'Page',
            center: point_like,
            point: point_like,
            beta: float,
            color: OptSeq = (0,),
            fill: OptSeq = None,
            dashes: OptStr = None,
            fullSector: bool = True,
            morph: OptSeq = None,
            width: float = 1,
            closePath: bool = False,
            lineCap: int = 0,
            lineJoin: int = 0,
            overlay: bool = True,
            stroke_opacity: float = 1,
            fill_opacity: float = 1,
            oc: int = 0,
            ) -> Point:
        """Draw a circle sector given circle center, one arc end point and the angle of the arc.
Parameters: center -- center of circle point -- arc end point beta -- angle of arc (degrees) fullSector -- connect arc ends with center """ img = page.new_shape() Q = img.draw_sector(Point(center), Point(point), beta, fullSector=fullSector) img.finish( color=color, fill=fill, dashes=dashes, width=width, lineCap=lineCap, lineJoin=lineJoin, morph=morph, closePath=closePath, stroke_opacity=stroke_opacity, fill_opacity=fill_opacity, oc=oc, ) img.commit(overlay) return Q def draw_squiggle( page: 'Page', p1: point_like, p2: point_like, breadth: float = 2, color: OptSeq = (0,), dashes: OptStr = None, width: float = 1, lineCap: int = 0, lineJoin: int = 0, overlay: bool = True, morph: OptSeq = None, stroke_opacity: float = 1, fill_opacity: float = 1, oc: int = 0, ) -> Point: """Draw a squiggly line from point p1 to point p2.""" img = page.new_shape() p = img.draw_squiggle(Point(p1), Point(p2), breadth=breadth) img.finish( color=color, dashes=dashes, width=width, closePath=False, lineCap=lineCap, lineJoin=lineJoin, morph=morph, stroke_opacity=stroke_opacity, fill_opacity=fill_opacity, oc=oc, ) img.commit(overlay) return p def draw_zigzag( page: 'Page', p1: point_like, p2: point_like, breadth: float = 2, color: OptSeq = (0,), dashes: OptStr = None, width: float = 1, lineCap: int = 0, lineJoin: int = 0, overlay: bool = True, morph: OptSeq = None, stroke_opacity: float = 1, fill_opacity: float = 1, oc: int = 0, ) -> Point: """Draw a zigzag line from point p1 to point p2.""" img = page.new_shape() p = img.draw_zigzag(Point(p1), Point(p2), breadth=breadth) img.finish( color=color, dashes=dashes, width=width, closePath=False, lineCap=lineCap, lineJoin=lineJoin, morph=morph, stroke_opacity=stroke_opacity, fill_opacity=fill_opacity, oc=oc, ) img.commit(overlay) return p def extend_textpage(self, tpage, flags=0, matrix=None): page = self.this tp = tpage.this assert isinstance( tp, mupdf.FzStextPage) options = mupdf.FzStextOptions() options.flags = flags ctm = 
JM_matrix_from_py(matrix) dev = mupdf.FzDevice(tp, options) mupdf.fz_run_page( page, dev, ctm, mupdf.FzCookie()) mupdf.fz_close_device( dev) @property def first_annot(self): """First annotation.""" CheckParent(self) page = self._pdf_page(required=False) if not page.m_internal: return annot = mupdf.pdf_first_annot(page) if not annot.m_internal: return val = Annot(annot) val.thisown = True val.parent = weakref.proxy(self) # owning page object self._annot_refs[id(val)] = val return val @property def first_link(self): ''' First link on page ''' return self.load_links() @property def first_widget(self): """First widget/field.""" CheckParent(self) annot = 0 page = self._pdf_page(required=False) if not page.m_internal: return annot = mupdf.pdf_first_widget(page) if not annot.m_internal: return val = Annot(annot) val.thisown = True val.parent = weakref.proxy(self) # owning page object self._annot_refs[id(val)] = val widget = Widget() TOOLS._fill_widget(val, widget) val = widget return val def get_bboxlog(self, layers=None): CheckParent(self) old_rotation = self.rotation if old_rotation != 0: self.set_rotation(0) page = self.this rc = [] inc_layers = True if layers else False dev = JM_new_bbox_device( rc, inc_layers) mupdf.fz_run_page( page, dev, mupdf.FzMatrix(), mupdf.FzCookie()) mupdf.fz_close_device( dev) if old_rotation != 0: self.set_rotation(old_rotation) return rc def get_cdrawings(self, extended=None, callback=None, method=None): """Extract vector graphics ("line art") from the page.""" CheckParent(self) old_rotation = self.rotation if old_rotation != 0: self.set_rotation(0) page = self.this if isinstance(page, mupdf.PdfPage): # Downcast pdf_page to fz_page. 
page = mupdf.FzPage(page) assert isinstance(page, mupdf.FzPage), f'{self.this=}' clips = True if extended else False prect = mupdf.fz_bound_page(page) if g_use_extra: rc = extra.get_cdrawings(page, extended, callback, method) else: rc = list() if callable(callback) or method is not None: dev = JM_new_lineart_device_Device(callback, clips, method) else: dev = JM_new_lineart_device_Device(rc, clips, method) dev.ptm = mupdf.FzMatrix(1, 0, 0, -1, 0, prect.y1) mupdf.fz_run_page(page, dev, mupdf.FzMatrix(), mupdf.FzCookie()) mupdf.fz_close_device(dev) if old_rotation != 0: self.set_rotation(old_rotation) if callable(callback) or method is not None: return return rc def get_contents(self): """Get xrefs of /Contents objects.""" CheckParent(self) ret = [] page = _as_pdf_page(self.this) obj = page.obj() contents = mupdf.pdf_dict_get(obj, mupdf.PDF_ENUM_NAME_Contents) if mupdf.pdf_is_array(contents): n = mupdf.pdf_array_len(contents) for i in range(n): icont = mupdf.pdf_array_get(contents, i) xref = mupdf.pdf_to_num(icont) ret.append(xref) elif contents.m_internal: xref = mupdf.pdf_to_num(contents) ret.append( xref) return ret def get_displaylist(self, annots=1): ''' Make a DisplayList from the page for Pixmap generation. Include (default) or exclude annotations. ''' CheckParent(self) if annots: dl = mupdf.fz_new_display_list_from_page(self.this) else: dl = mupdf.fz_new_display_list_from_page_contents(self.this) return DisplayList(dl) def get_drawings(self, extended: bool=False) -> list: """Retrieve vector graphics. The extended version includes clips. Note: For greater comfort, this method converts point-likes, rect-likes, quad-likes of the C version to respective Point / Rect / Quad objects. It also adds default items that are missing in original path types. 
""" allkeys = ( 'closePath', 'fill', 'color', 'width', 'lineCap', 'lineJoin', 'dashes', 'stroke_opacity', 'fill_opacity', 'even_odd', ) val = self.get_cdrawings(extended=extended) for i in range(len(val)): npath = val[i] if not npath["type"].startswith("clip"): npath["rect"] = Rect(npath["rect"]) else: npath["scissor"] = Rect(npath["scissor"]) if npath["type"]!="group": items = npath["items"] newitems = [] for item in items: cmd = item[0] rest = item[1:] if cmd == "re": item = ("re", Rect(rest[0]).normalize(), rest[1]) elif cmd == "qu": item = ("qu", Quad(rest[0])) else: item = tuple([cmd] + [Point(i) for i in rest]) newitems.append(item) npath["items"] = newitems if npath['type'] in ('f', 's'): for k in allkeys: npath[k] = npath.get(k) val[i] = npath return val class Drawpath(object): """Reflects a path dictionary from get_cdrawings().""" def __init__(self, **args): self.__dict__.update(args) class Drawpathlist(object): """List of Path objects representing get_cdrawings() output.""" def __getitem__(self, item): return self.paths.__getitem__(item) def __init__(self): self.paths = [] self.path_count = 0 self.group_count = 0 self.clip_count = 0 self.fill_count = 0 self.stroke_count = 0 self.fillstroke_count = 0 def __len__(self): return self.paths.__len__() def append(self, path): self.paths.append(path) self.path_count += 1 if path.type == "clip": self.clip_count += 1 elif path.type == "group": self.group_count += 1 elif path.type == "f": self.fill_count += 1 elif path.type == "s": self.stroke_count += 1 elif path.type == "fs": self.fillstroke_count += 1 def clip_parents(self, i): """Return list of parent clip paths. Args: i: (int) return parents of this path. 
Returns: List of the clip parents.""" if i >= self.path_count: raise IndexError("bad path index") while i < 0: i += self.path_count lvl = self.paths[i].level clips = list( # clip paths before identified one reversed( [ p for p in self.paths[:i] if p.type == "clip" and p.level < lvl ] ) ) if clips == []: # none found: empty list return [] nclips = [clips[0]] # init return list for p in clips[1:]: if p.level >= nclips[-1].level: continue # only accept smaller clip levels nclips.append(p) return nclips def group_parents(self, i): """Return list of parent group paths. Args: i: (int) return parents of this path. Returns: List of the group parents.""" if i >= self.path_count: raise IndexError("bad path index") while i < 0: i += self.path_count lvl = self.paths[i].level groups = list( # group paths before identified one reversed( [ p for p in self.paths[:i] if p.type == "group" and p.level < lvl ] ) ) if groups == []: # none found: empty list return [] ngroups = [groups[0]] # init return list for p in groups[1:]: if p.level >= ngroups[-1].level: continue # only accept smaller group levels ngroups.append(p) return ngroups def get_lineart(self) -> object: """Get page drawings paths. Note: For greater comfort, this method converts point-like, rect-like, quad-like tuples of the C version to respective Point / Rect / Quad objects. Also adds default items that are missing in original path types. In contrast to get_drawings(), this output is an object. 
""" val = self.get_cdrawings(extended=True) paths = self.Drawpathlist() for path in val: npath = self.Drawpath(**path) if npath.type != "clip": npath.rect = Rect(path["rect"]) else: npath.scissor = Rect(path["scissor"]) if npath.type != "group": items = path["items"] newitems = [] for item in items: cmd = item[0] rest = item[1:] if cmd == "re": item = ("re", Rect(rest[0]).normalize(), rest[1]) elif cmd == "qu": item = ("qu", Quad(rest[0])) else: item = tuple([cmd] + [Point(i) for i in rest]) newitems.append(item) npath.items = newitems if npath.type == "f": npath.stroke_opacity = None npath.dashes = None npath.line_join = None npath.line_cap = None npath.color = None npath.width = None paths.append(npath) val = None return paths def get_image_info( page: 'Page', hashes: bool = False, xrefs: bool = False ) -> list: """Extract image information only from a pymupdf.TextPage. Args: hashes: (bool) include MD5 hash for each image. xrefs: (bool) try to find the xref for each image. Sets hashes to true. """ doc = page.parent if xrefs and doc.is_pdf: hashes = True if not doc.is_pdf: xrefs = False imginfo = getattr(page, "_image_info", None) if imginfo and not xrefs: return imginfo if not imginfo: tp = page.get_textpage(flags=TEXT_PRESERVE_IMAGES) imginfo = tp.extractIMGINFO(hashes=hashes) del tp if hashes: page._image_info = imginfo if not xrefs or not doc.is_pdf: return imginfo imglist = page.get_images() digests = {} for item in imglist: xref = item[0] pix = Pixmap(doc, xref) digests[pix.digest] = xref del pix for i in range(len(imginfo)): item = imginfo[i] xref = digests.get(item["digest"], 0) item["xref"] = xref imginfo[i] = item return imginfo def get_image_rects(page: 'Page', name, transform=False) -> list: """Return list of image positions on a page. Args: name: (str, list, int) image identification. May be reference name, an item of the page's image list or an xref. transform: (bool) whether to also return the transformation matrix. 
        Returns:
            A list of pymupdf.Rect objects or tuples of (pymupdf.Rect, pymupdf.Matrix)
            for all image locations on the page.
        """
        if type(name) in (list, tuple):
            xref = name[0]
        elif type(name) is int:
            xref = name
        else:
            imglist = [i for i in page.get_images() if i[7] == name]
            if imglist == []:
                raise ValueError("bad image name")
            elif len(imglist) != 1:
                raise ValueError("multiple image names found")
            xref = imglist[0][0]
        pix = Pixmap(page.parent, xref)  # make pixmap of the image to compute MD5
        digest = pix.digest
        del pix
        # Occurrences on the page are matched by comparing digests.
        infos = page.get_image_info(hashes=True)
        if not transform:
            bboxes = [Rect(im["bbox"]) for im in infos if im["digest"] == digest]
        else:
            bboxes = [
                (Rect(im["bbox"]), Matrix(im["transform"]))
                for im in infos
                if im["digest"] == digest
            ]
        return bboxes

    def get_label(page):
        """Return the label for this PDF page.

        Args:
            page: page object.
        Returns:
            The label (str) of the page. Errors return an empty string.
        """
        # Jorj McKie, 2021-01-06

        labels = page.parent._get_page_labels()
        if not labels:
            return ""
        labels.sort()
        return utils.get_label_pno(page.number, labels)

    def get_links(page: 'Page') -> list:
        """Create a list of all links contained in a PDF page.

        Notes:
            see PyMuPDF documentation for details.
        """

        CheckParent(page)
        ln = page.first_link
        links = []
        while ln:
            nl = utils.getLinkDict(ln, page.parent)
            links.append(nl)
            ln = ln.next
        if links != [] and page.parent.is_pdf:
            linkxrefs = [x for x in
                    #page.annot_xrefs()
                    JM_get_annot_xref_list2(page)
                    if x[1] == mupdf.PDF_ANNOT_LINK  # pylint: disable=no-member
                    ]
            # "xref" and "id" are only filled in when both lists have the
            # same length; entries are paired by position.
            if len(linkxrefs) == len(links):
                for i in range(len(linkxrefs)):
                    links[i]["xref"] = linkxrefs[i][0]
                    links[i]["id"] = linkxrefs[i][2]
        return links

    def get_pixmap(
                page: 'Page',
                *,
                matrix: matrix_like=Identity,
                dpi=None,
                colorspace: Colorspace=None,
                clip: rect_like=None,
                alpha: bool=False,
                annots: bool=True,
                ) -> 'Pixmap':
        """Create pixmap of page.

        Keyword args:
            matrix: Matrix for transformation (default: Identity).
            dpi: desired dots per inch. If given, matrix is ignored.
            colorspace: (str/Colorspace) cmyk, rgb, gray - case ignored, default csRGB.
            clip: (irect-like) restrict rendering to this area.
            alpha: (bool) whether to include alpha channel
            annots: (bool) whether to also render annotations

        Raises:
            ValueError: if the colorspace's component count is not 1, 3 or 4.
        """
        if colorspace is None:
            colorspace = csRGB
        if dpi:
            # A dpi value replaces the matrix by a uniform zoom of dpi / 72.
            zoom = dpi / 72
            matrix = Matrix(zoom, zoom)

        if type(colorspace) is str:
            # Any string other than "gray" / "cmyk" selects RGB.
            if colorspace.upper() == "GRAY":
                colorspace = csGRAY
            elif colorspace.upper() == "CMYK":
                colorspace = csCMYK
            else:
                colorspace = csRGB
        if colorspace.n not in (1, 3, 4):
            raise ValueError("unsupported colorspace")

        dl = page.get_displaylist(annots=annots)
        pix = dl.get_pixmap(matrix=matrix, colorspace=colorspace, alpha=alpha, clip=clip)
        dl = None
        if dpi:
            pix.set_dpi(dpi, dpi)
        return pix

    def remove_rotation(self):
        """Set page rotation to 0 while maintaining visual appearance.

        Returns Identity if the page is not rotated, otherwise the inverse of
        the matrix that was prepended to the page contents.
        """
        rot = self.rotation  # normalized rotation value
        if rot == 0:
            return Identity  # nothing to do

        # need to derotate the page's content
        mb = self.mediabox  # current mediabox

        if rot == 90:
            # before derotation, shift content horizontally
            mat0 = Matrix(1, 0, 0, 1, mb.y1 - mb.x1 - mb.x0 - mb.y0, 0)
        elif rot == 270:
            # before derotation, shift content vertically
            mat0 = Matrix(1, 0, 0, 1, 0, mb.x1 - mb.y1 - mb.y0 - mb.x0)
        else:  # rot = 180
            mat0 = Matrix(1, 0, 0, 1, -2 * mb.x0, -2 * mb.y0)

        # prefix with derotation matrix
        mat = mat0 * self.derotation_matrix
        cmd = _format_g(tuple(mat)) + ' cm '
        cmd = cmd.encode('utf8')
        _ = TOOLS._insert_contents(self, cmd, False)  # prepend to page contents

        # swap x- and y-coordinates
        if rot in (90, 270):
            x0, y0, x1, y1 = mb
            mb.x0 = y0
            mb.y0 = x0
            mb.x1 = y1
            mb.y1 = x1
            self.set_mediabox(mb)

        self.set_rotation(0)
        rot = ~mat  # inverse of the derotation matrix
        for annot in self.annots():  # modify rectangles of annotations
            r = annot.rect * rot
            # TODO: only try to set rectangle for applicable annot types
            annot.set_rect(r)
        for link in self.get_links():  # modify 'from' rectangles of links
            r = link["from"] * rot
            self.delete_link(link)
            link["from"] = r
            try:  # invalid links remain deleted
                self.insert_link(link)
            except Exception:
                pass
        for widget in self.widgets():  # modify field rectangles
            r = widget.rect * rot
            widget.rect = r
            widget.update()
        return rot  # the inverse of the generated derotation matrix

    def cluster_drawings(
        self,
        clip=None,
        drawings=None,
        x_tolerance: float = 3,
        y_tolerance: float = 3,
        final_filter: bool = True,
    ) -> list:
        """Join rectangles of neighboring vector graphic items.

        Args:
            clip: optional rect-like to restrict the page area to consider.
            drawings: (optional) output of a previous "get_drawings()".
            x_tolerance: horizontal neighborhood threshold.
            y_tolerance: vertical neighborhood threshold.
            final_filter: if false, all joined rectangles are returned, i.e.
                the size filter described under "Returns" is skipped.

        Notes:
            Vector graphics (also called line-art or drawings) usually consist
            of independent items like rectangles, lines or curves to jointly
            form table grid lines or bar, line, pie charts and similar.
            This method identifies rectangles wrapping these disparate items.

        Returns:
            A list of Rect items, each wrapping line-art items that are close
            enough to be considered forming a common vector graphic.
            Only "significant" rectangles will be returned, i.e. having both,
            width and height larger than the tolerance values.
        """
        CheckParent(self)
        parea = self.rect  # the default clipping area
        if clip is not None:
            parea = Rect(clip)
        delta_x = x_tolerance  # shorter local name
        delta_y = y_tolerance  # shorter local name
        if drawings is None:  # if we cannot re-use a previous output
            drawings = self.get_drawings()

        def are_neighbors(r1, r2):
            """Detect whether r1, r2 are "neighbors".

            Items r1, r2 are called neighbors if the minimum distance between
            their points is less-equal delta.

            Both parameters must be (potentially invalid) rectangles.
            """
            # normalize rectangles as needed
            rr1_x0, rr1_x1 = (r1.x0, r1.x1) if r1.x1 > r1.x0 else (r1.x1, r1.x0)
            rr1_y0, rr1_y1 = (r1.y0, r1.y1) if r1.y1 > r1.y0 else (r1.y1, r1.y0)
            rr2_x0, rr2_x1 = (r2.x0, r2.x1) if r2.x1 > r2.x0 else (r2.x1, r2.x0)
            rr2_y0, rr2_y1 = (r2.y0, r2.y1) if r2.y1 > r2.y0 else (r2.y1, r2.y0)
            # r2 is enlarged by the tolerances; r1 lying completely outside
            # that enlarged area on any side means "not neighbors".
            if (
                0
                or rr1_x1 < rr2_x0 - delta_x
                or rr1_x0 > rr2_x1 + delta_x
                or rr1_y1 < rr2_y0 - delta_y
                or rr1_y0 > rr2_y1 + delta_y
            ):
                # Rects do not overlap.
                return False
            else:
                # Rects overlap.
                return True

        # exclude graphics not contained in the clip
        paths = [
            p
            for p in drawings
            if 1
            and p["rect"].x0 >= parea.x0
            and p["rect"].x1 <= parea.x1
            and p["rect"].y0 >= parea.y0
            and p["rect"].y1 <= parea.y1
        ]

        # list of all vector graphic rectangles
        prects = sorted([p["rect"] for p in paths], key=lambda r: (r.y1, r.x0))

        new_rects = []  # the final list of the joined rectangles

        # -------------------------------------------------------------------------
        # The strategy is to identify and join all rects that are neighbors
        # -------------------------------------------------------------------------
        while prects:  # the algorithm will empty this list
            r = +prects[0]  # copy of first rectangle
            repeat = True
            # Keep scanning until a full pass joins nothing: growing r can
            # turn rectangles skipped earlier into neighbors.
            while repeat:
                repeat = False
                for i in range(len(prects) - 1, 0, -1):  # from back to front
                    if are_neighbors(prects[i], r):
                        r |= prects[i].tl  # include in first rect
                        r |= prects[i].br  # include in first rect
                        del prects[i]  # delete this rect
                        repeat = True

            new_rects.append(r)
            del prects[0]
            prects = sorted(set(prects), key=lambda r: (r.y1, r.x0))

        new_rects = sorted(set(new_rects), key=lambda r: (r.y1, r.x0))
        if not final_filter:
            return new_rects
        return [r for r in new_rects if r.width > delta_x and r.height > delta_y]

    def get_fonts(self, full=False):
        """List of fonts defined in the page object."""
        CheckParent(self)
        return self.parent.get_page_fonts(self.number, full=full)

    def get_image_bbox(self, name, transform=0):
        """Get rectangle occupied by image 'name'.
'name' is either an item of the image list, or the referencing name string - elem[7] of the resp. item. Option 'transform' also returns the image transformation matrix. """ CheckParent(self) doc = self.parent if doc.is_closed or doc.is_encrypted: raise ValueError('document closed or encrypted') inf_rect = Rect(1, 1, -1, -1) null_mat = Matrix() if transform: rc = (inf_rect, null_mat) else: rc = inf_rect if type(name) in (list, tuple): if not type(name[-1]) is int: raise ValueError('need item of full page image list') item = name else: imglist = [i for i in doc.get_page_images(self.number, True) if name == i[7]] if len(imglist) == 1: item = imglist[0] elif imglist == []: raise ValueError('bad image name') else: raise ValueError("found multiple images named '%s'." % name) xref = item[-1] if xref != 0 or transform: try: return self.get_image_rects(item, transform=transform)[0] except Exception: exception_info() return inf_rect pdf_page = self._pdf_page() val = JM_image_reporter(pdf_page) if not bool(val): return rc for v in val: if v[0] != item[-3]: continue q = Quad(v[1]) bbox = q.rect if transform == 0: rc = bbox break hm = Matrix(util_hor_matrix(q.ll, q.lr)) h = abs(q.ll - q.ul) w = abs(q.ur - q.ul) m0 = Matrix(1 / w, 0, 0, 1 / h, 0, 0) m = ~(hm * m0) rc = (bbox, m) break val = rc return val def get_images(self, full=False): """List of images defined in the page object.""" CheckParent(self) return self.parent.get_page_images(self.number, full=full) def get_oc_items(self) -> list: """Get OCGs and OCMDs used in the page's contents. Returns: List of items (name, xref, type), where type is one of "ocg" / "ocmd", and name is the property name. 
""" rc = [] for pname, xref in self._get_resource_properties(): text = self.parent.xref_object(xref, compressed=True) if "/Type/OCG" in text: octype = "ocg" elif "/Type/OCMD" in text: octype = "ocmd" else: continue rc.append((pname, xref, octype)) return rc def get_svg_image(self, matrix=None, text_as_path=1): """Make SVG image from page.""" CheckParent(self) mediabox = mupdf.fz_bound_page(self.this) ctm = JM_matrix_from_py(matrix) tbounds = mediabox text_option = mupdf.FZ_SVG_TEXT_AS_PATH if text_as_path == 1 else mupdf.FZ_SVG_TEXT_AS_TEXT tbounds = mupdf.fz_transform_rect(tbounds, ctm) res = mupdf.fz_new_buffer(1024) out = mupdf.FzOutput(res) dev = mupdf.fz_new_svg_device( out, tbounds.x1-tbounds.x0, # width tbounds.y1-tbounds.y0, # height text_option, 1, ) mupdf.fz_run_page(self.this, dev, ctm, mupdf.FzCookie()) mupdf.fz_close_device(dev) out.fz_close_output() text = JM_EscapeStrFromBuffer(res) return text def get_textbox( page: Page, rect: rect_like, textpage=None, #: TextPage = None, ) -> str: tp = textpage if tp is None: tp = page.get_textpage() elif getattr(tp, "parent") != page: raise ValueError("not a textpage of this page") rc = tp.extractTextbox(rect) if textpage is None: del tp return rc def get_text(self, *args, **kwargs): return utils.get_text(self, *args, **kwargs) def get_text_blocks(self, *args, **kwargs): return utils.get_text_blocks(self, *args, **kwargs) def get_text_selection(self, *args, **kwargs): return utils.get_text_selection(self, *args, **kwargs) def get_text_words(self, *args, **kwargs): return utils.get_text_words(self, *args, **kwargs) def get_textpage_ocr(self, *args, **kwargs): return utils.get_textpage_ocr(self, *args, **kwargs) def get_textpage(self, clip: rect_like = None, flags: int = 0, matrix=None) -> "TextPage": CheckParent(self) if matrix is None: matrix = Matrix(1, 1) old_rotation = self.rotation if old_rotation != 0: self.set_rotation(0) try: textpage = self._get_textpage(clip, flags=flags, matrix=matrix) finally: if 
old_rotation != 0: self.set_rotation(old_rotation) textpage = TextPage(textpage) textpage.parent = weakref.proxy(self) return textpage def get_texttrace(self): CheckParent(self) old_rotation = self.rotation if old_rotation != 0: self.set_rotation(0) page = self.this rc = [] if g_use_extra: dev = extra.JM_new_texttrace_device(rc) else: dev = JM_new_texttrace_device(rc) prect = mupdf.fz_bound_page(page) dev.ptm = mupdf.FzMatrix(1, 0, 0, -1, 0, prect.y1) mupdf.fz_run_page(page, dev, mupdf.FzMatrix(), mupdf.FzCookie()) mupdf.fz_close_device(dev) if old_rotation != 0: self.set_rotation(old_rotation) return rc def get_xobjects(self): """List of xobjects defined in the page object.""" CheckParent(self) return self.parent.get_page_xobjects(self.number) def insert_font(self, fontname="helv", fontfile=None, fontbuffer=None, set_simple=False, wmode=0, encoding=0): doc = self.parent if doc is None: raise ValueError("orphaned object: parent is None") idx = 0 if fontname.startswith("/"): fontname = fontname[1:] inv_chars = INVALID_NAME_CHARS.intersection(fontname) if inv_chars != set(): raise ValueError(f"bad fontname chars {inv_chars}") font = CheckFont(self, fontname) if font is not None: # font already in font list of page xref = font[0] # this is the xref if CheckFontInfo(doc, xref): # also in our document font list? return xref # yes: we are done # need to build the doc FontInfo entry - done via get_char_widths doc.get_char_widths(xref) return xref #-------------------------------------------------------------------------- # the font is not present for this page #-------------------------------------------------------------------------- bfname = Base14_fontdict.get(fontname.lower(), None) # BaseFont if Base-14 font serif = 0 CJK_number = -1 CJK_list_n = ["china-t", "china-s", "japan", "korea"] CJK_list_s = ["china-ts", "china-ss", "japan-s", "korea-s"] try: CJK_number = CJK_list_n.index(fontname) serif = 0 except Exception: # Verbose in PyMuPDF/tests. 
if g_exceptions_verbose > 1: exception_info() pass if CJK_number < 0: try: CJK_number = CJK_list_s.index(fontname) serif = 1 except Exception: # Verbose in PyMuPDF/tests. if g_exceptions_verbose > 1: exception_info() pass if fontname.lower() in fitz_fontdescriptors.keys(): import pymupdf_fonts fontbuffer = pymupdf_fonts.myfont(fontname) # make a copy del pymupdf_fonts # install the font for the page if fontfile is not None: if type(fontfile) is str: fontfile_str = fontfile elif hasattr(fontfile, "absolute"): fontfile_str = str(fontfile) elif hasattr(fontfile, "name"): fontfile_str = fontfile.name else: raise ValueError("bad fontfile") else: fontfile_str = None val = self._insertFont(fontname, bfname, fontfile_str, fontbuffer, set_simple, idx, wmode, serif, encoding, CJK_number) if not val: # did not work, error return return val xref = val[0] # xref of installed font fontdict = val[1] if CheckFontInfo(doc, xref): # check again: document already has this font return xref # we are done # need to create document font info doc.get_char_widths(xref, fontdict=fontdict) return xref def insert_htmlbox( page, rect, text, *, css=None, scale_low=0, archive=None, rotate=0, oc=0, opacity=1, overlay=True, _scale_word_width=True, _verbose=False, ) -> tuple: """Insert text with optional HTML tags and stylings into a rectangle. Args: rect: (rect-like) rectangle into which the text should be placed. text: (str) text with optional HTML tags and stylings. css: (str) CSS styling commands. scale_low: (float) force-fit content by scaling it down. Must be in range [0, 1]. If 1, no scaling will take place. If 0, arbitrary down-scaling is acceptable. A value of 0.1 would mean that content may be scaled down by at most 90%. archive: Archive object pointing to locations of used fonts or images rotate: (int) rotate the text in the box by a multiple of 90 degrees. oc: (int) the xref of an OCG / OCMD (Optional Content). opacity: (float) set opacity of inserted content. 
overlay: (bool) put text on top of page content. _scale_word_width: internal, for testing only. _verbose: internal, for testing only. Returns: A tuple of floats (spare_height, scale). spare_height: The height of the remaining space in <rect> below the text, or -1 if we failed to fit. scale: The scaling required; `0 < scale <= 1`. Will be less than `scale_low` if we failed to fit. """ # normalize rotation angle if not rotate % 90 == 0: raise ValueError("bad rotation angle") while rotate < 0: rotate += 360 while rotate >= 360: rotate -= 360 if not 0 <= scale_low <= 1: raise ValueError("'scale_low' must be in [0, 1]") if css is None: css = "" rect = Rect(rect) if rotate in (90, 270): temp_rect = Rect(0, 0, rect.height, rect.width) else: temp_rect = Rect(0, 0, rect.width, rect.height) # use a small border by default mycss = "body {margin:1px;}" + css # append user CSS # either make a story, or accept a given one if isinstance(text, str): # if a string, convert to a Story story = Story(html=text, user_css=mycss, archive=archive) elif isinstance(text, Story): story = text else: raise ValueError("'text' must be a string or a Story") # ---------------------------------------------------------------- # Find a scaling factor that lets our story fit in. Instead of scaling # the text smaller, we instead look at how much bigger the rect needs # to be to fit the text, then reverse the scaling to get how much we # need to scale down the text. # ---------------------------------------------------------------- rect_scale_max = None if scale_low == 0 else 1 / scale_low fit = story.fit_scale( temp_rect, scale_min=1, scale_max=rect_scale_max, flags=mupdf.FZ_PLACE_STORY_FLAG_NO_OVERFLOW if _scale_word_width else 0, verbose=_verbose, ) if not fit.big_enough: # there was no fit scale = 1 / fit.parameter return (-1, scale) # fit.filled is a tuple; we convert it in place to a Rect for # convenience. (fit.rect is already a Rect.) 
fit.filled = Rect(fit.filled) assert (fit.rect.x0, fit.rect.y0) == (0, 0) assert (fit.filled.x0, fit.filled.y0) == (0, 0) scale = 1 / fit.parameter assert scale >= scale_low, f'{scale_low=} {scale=}' spare_height = max((fit.rect.y1 - fit.filled.y1) * scale, 0) def rect_function(*args): return fit.rect, fit.rect, None # draw story on temp PDF page doc = story.write_with_links(rect_function) # Insert opacity if requested. # For this, we prepend a command to the /Contents. if 0 <= opacity < 1: tpage = doc[0] # load page # generate /ExtGstate for the page alp0 = tpage._set_opacity(CA=opacity, ca=opacity) s = f"/{alp0} gs\n" # generate graphic state command TOOLS._insert_contents(tpage, s.encode(), 0) # put result in target page page.show_pdf_page(rect, doc, 0, rotate=rotate, oc=oc, overlay=overlay) # ------------------------------------------------------------------------- # re-insert links in target rect (show_pdf_page cannot copy annotations) # ------------------------------------------------------------------------- # scaled center point of fit.rect mp1 = (fit.rect.tl + fit.rect.br) / 2 * scale # center point of target rect mp2 = (rect.tl + rect.br) / 2 # compute link positioning matrix: # - move center of scaled-down fit.rect to (0,0) # - rotate # - move (0,0) to center of target rect mat = ( Matrix(scale, 0, 0, scale, -mp1.x, -mp1.y) * Matrix(-rotate) * Matrix(1, 0, 0, 1, mp2.x, mp2.y) ) # copy over links for link in doc[0].get_links(): link["from"] *= mat page.insert_link(link) return spare_height, scale def insert_image( page, rect, *, alpha=-1, filename=None, height=0, keep_proportion=True, mask=None, oc=0, overlay=True, pixmap=None, rotate=0, stream=None, width=0, xref=0, ): """Insert an image for display in a rectangle. Args: rect: (rect_like) position of image on the page. alpha: (int, optional) set to 0 if image has no transparency. filename: (str, Path, file object) image filename. height: (int) keep_proportion: (bool) keep width / height ratio (default). 
mask: (bytes, optional) image consisting of alpha values to use. oc: (int) xref of OCG or OCMD to declare as Optional Content. overlay: (bool) put in foreground (default) or background. pixmap: (pymupdf.Pixmap) use this as image. rotate: (int) rotate by 0, 90, 180 or 270 degrees. stream: (bytes) use this as image. width: (int) xref: (int) use this as image. 'page' and 'rect' are positional, all other parameters are keywords. If 'xref' is given, that image is used. Other input options are ignored. Else, exactly one of pixmap, stream or filename must be given. 'alpha=0' for non-transparent images improves performance significantly. Affects stream and filename only. Optimum transparent insertions are possible by using filename / stream in conjunction with a 'mask' image of alpha values. Returns: xref (int) of inserted image. Re-use as argument for multiple insertions. """ CheckParent(page) doc = page.parent if not doc.is_pdf: raise ValueError("is no PDF") if xref == 0 and (bool(filename) + bool(stream) + bool(pixmap) != 1): raise ValueError("xref=0 needs exactly one of filename, pixmap, stream") if filename: if type(filename) is str: pass elif hasattr(filename, "absolute"): filename = str(filename) elif hasattr(filename, "name"): filename = filename.name else: raise ValueError("bad filename") if filename and not os.path.exists(filename): raise FileNotFoundError("No such file: '%s'" % filename) elif stream and type(stream) not in (bytes, bytearray, io.BytesIO): raise ValueError("stream must be bytes-like / BytesIO") elif pixmap and type(pixmap) is not Pixmap: raise ValueError("pixmap must be a Pixmap") if mask and not (stream or filename): raise ValueError("mask requires stream or filename") if mask and type(mask) not in (bytes, bytearray, io.BytesIO): raise ValueError("mask must be bytes-like / BytesIO") while rotate < 0: rotate += 360 while rotate >= 360: rotate -= 360 if rotate not in (0, 90, 180, 270): raise ValueError("bad rotate value") r = Rect(rect) if 
r.is_empty or r.is_infinite: raise ValueError("rect must be finite and not empty") clip = r * ~page.transformation_matrix # Create a unique image reference name. ilst = [i[7] for i in doc.get_page_images(page.number)] ilst += [i[1] for i in doc.get_page_xobjects(page.number)] ilst += [i[4] for i in doc.get_page_fonts(page.number)] n = "fzImg" # 'pymupdf image' i = 0 _imgname = n + "0" # first name candidate while _imgname in ilst: i += 1 _imgname = n + str(i) # try new name if overlay: page.wrap_contents() # ensure a balanced graphics state digests = doc.InsertedImages xref, digests = page._insert_image( filename=filename, pixmap=pixmap, stream=stream, imask=mask, clip=clip, overlay=overlay, oc=oc, xref=xref, rotate=rotate, keep_proportion=keep_proportion, width=width, height=height, alpha=alpha, _imgname=_imgname, digests=digests, ) if digests is not None: doc.InsertedImages = digests return xref def insert_link(page: 'Page', lnk: dict, mark: bool = True) -> None: """Insert a new link for the current page.""" CheckParent(page) annot = utils.getLinkText(page, lnk) if annot == "": raise ValueError("link kind not supported") page._addAnnot_FromString((annot,)) def insert_text( page: 'Page', point: point_like, text: typing.Union[str, list], *, fontsize: float = 11, lineheight: OptFloat = None, fontname: str = "helv", fontfile: OptStr = None, set_simple: int = 0, encoding: int = 0, color: OptSeq = None, fill: OptSeq = None, border_width: float = 0.05, miter_limit: float = 1, render_mode: int = 0, rotate: int = 0, morph: OptSeq = None, overlay: bool = True, stroke_opacity: float = 1, fill_opacity: float = 1, oc: int = 0, ): img = page.new_shape() rc = img.insert_text( point, text, fontsize=fontsize, lineheight=lineheight, fontname=fontname, fontfile=fontfile, set_simple=set_simple, encoding=encoding, color=color, fill=fill, border_width=border_width, render_mode=render_mode, miter_limit=miter_limit, rotate=rotate, morph=morph, stroke_opacity=stroke_opacity, 
fill_opacity=fill_opacity, oc=oc, ) if rc >= 0: img.commit(overlay) return rc def insert_textbox( page: 'Page', rect: rect_like, buffer: typing.Union[str, list], *, fontname: str = "helv", fontfile: OptStr = None, set_simple: int = 0, encoding: int = 0, fontsize: float = 11, lineheight: OptFloat = None, color: OptSeq = None, fill: OptSeq = None, expandtabs: int = 1, align: int = 0, rotate: int = 0, render_mode: int = 0, miter_limit: float = 1, border_width: float = 0.05, morph: OptSeq = None, overlay: bool = True, stroke_opacity: float = 1, fill_opacity: float = 1, oc: int = 0, ) -> float: """Insert text into a given rectangle. Notes: Creates a Shape object, uses its same-named method and commits it. Parameters: rect: (rect-like) area to use for text. buffer: text to be inserted fontname: a Base-14 font, font name or '/name' fontfile: name of a font file fontsize: font size lineheight: overwrite the font property color: RGB color triple expandtabs: handles tabulators with string function align: left, center, right, justified rotate: 0, 90, 180, or 270 degrees morph: morph box with a matrix and a fixpoint overlay: put text in foreground or background Returns: unused or deficit rectangle area (float) """ img = page.new_shape() rc = img.insert_textbox( rect, buffer, fontsize=fontsize, lineheight=lineheight, fontname=fontname, fontfile=fontfile, set_simple=set_simple, encoding=encoding, color=color, fill=fill, expandtabs=expandtabs, render_mode=render_mode, miter_limit=miter_limit, border_width=border_width, align=align, rotate=rotate, morph=morph, stroke_opacity=stroke_opacity, fill_opacity=fill_opacity, oc=oc, ) if rc >= 0: img.commit(overlay) return rc @property def is_wrapped(self): """Check if /Contents is in a balanced graphics state.""" return self._count_q_balance() == (0, 0) @property def language(self): """Page language.""" pdfpage = _as_pdf_page(self.this, required=False) if not pdfpage.m_internal: return lang = mupdf.pdf_dict_get_inheritable(pdfpage.obj(), 
PDF_NAME('Lang')) if not lang.m_internal: return return mupdf.pdf_to_str_buf(lang) def links(self, kinds=None): """ Generator over the links of a page. Args: kinds: (list) link kinds to subselect from. If none, all links are returned. E.g. kinds=[LINK_URI] will only yield URI links. """ all_links = self.get_links() for link in all_links: if kinds is None or link["kind"] in kinds: yield (link) def load_annot(self, ident: typing.Union[str, int]) -> Annot: """Load an annot by name (/NM key) or xref. Args: ident: identifier, either name (str) or xref (int). """ CheckParent(self) if type(ident) is str: xref = 0 name = ident elif type(ident) is int: xref = ident name = None else: raise ValueError("identifier must be a string or integer") val = self._load_annot(name, xref) if not val: return val val.thisown = True val.parent = weakref.proxy(self) self._annot_refs[id(val)] = val return val def load_links(self): """Get first Link.""" CheckParent(self) val = mupdf.fz_load_links( self.this) if not val.m_internal: return val = Link( val) val.thisown = True val.parent = weakref.proxy(self) # owning page object self._annot_refs[id(val)] = val val.xref = 0 val.id = "" if self.parent.is_pdf: xrefs = self.annot_xrefs() xrefs = [x for x in xrefs if x[1] == mupdf.PDF_ANNOT_LINK] if xrefs: link_id = xrefs[0] val.xref = link_id[0] val.id = link_id[2] else: val.xref = 0 val.id = "" return val #---------------------------------------------------------------- # page load widget by xref #---------------------------------------------------------------- def load_widget( self, xref): """Load a widget by its xref.""" CheckParent(self) page = _as_pdf_page(self.this) annot = JM_get_widget_by_xref( page, xref) #log( '{=type(annot)}') val = annot if not val: return val val.thisown = True val.parent = weakref.proxy(self) self._annot_refs[id(val)] = val widget = Widget() TOOLS._fill_widget(val, widget) val = widget return val @property def mediabox(self): """The MediaBox.""" CheckParent(self) page = 
self._pdf_page(required=False) if not page.m_internal: rect = mupdf.fz_bound_page( self.this) else: rect = JM_mediabox( page.obj()) return Rect(rect) @property def mediabox_size(self): return Point(self.mediabox.x1, self.mediabox.y1) def new_shape(self): return Shape(self) #@property #def parent( self): # assert self._parent # if self._parent: # return self._parent # return Document( self.this.document()) def read_contents(self): """All /Contents streams concatenated to one bytes object.""" return TOOLS._get_all_contents(self) def refresh(self): """Refresh page after link/annot/widget updates.""" CheckParent(self) doc = self.parent page = doc.reload_page(self) # fixme this looks wrong. self.this = page def replace_image( page: 'Page', xref: int, *, filename=None, pixmap=None, stream=None, ): """Replace the image referred to by xref. Replace the image by changing the object definition stored under xref. This will leave the pages appearance instructions intact, so the new image is being displayed with the same bbox, rotation etc. By providing a small fully transparent image, an effect as if the image had been deleted can be achieved. A typical use may include replacing large images by a smaller version, e.g. with a lower resolution or graylevel instead of colored. Args: xref: the xref of the image to replace. filename, pixmap, stream: exactly one of these must be provided. The meaning being the same as in Page.insert_image. 
""" doc = page.parent # the owning document if not doc.xref_is_image(xref): raise ValueError("xref not an image") # insert new image anywhere in page if bool(filename) + bool(stream) + bool(pixmap) != 1: raise ValueError("Exactly one of filename/stream/pixmap must be given") new_xref = page.insert_image( page.rect, filename=filename, stream=stream, pixmap=pixmap ) doc.xref_copy(new_xref, xref) # copy over new to old last_contents_xref = page.get_contents()[-1] # new image insertion has created a new /Contents source, # which we will set to spaces now doc.update_stream(last_contents_xref, b" ") page._image_info = None # clear cache of extracted image information @property def rotation(self): """Page rotation.""" CheckParent(self) page = _as_pdf_page(self.this, required=0) if not page.m_internal: return 0 return JM_page_rotation(page) @property def rotation_matrix(self) -> Matrix: """Reflects page rotation.""" return Matrix(TOOLS._rotate_matrix(self)) def run(self, dw, m): """Run page through a device. dw: DeviceWrapper """ CheckParent(self) mupdf.fz_run_page(self.this, dw.device, JM_matrix_from_py(m), mupdf.FzCookie()) def search_for( page, text, *, clip=None, quads=False, flags=None, textpage=None, ) -> list: """Search for a string on a page. Args: text: string to be searched for clip: restrict search to this rectangle quads: (bool) return quads instead of rectangles flags: bit switches, default: join hyphened words textpage: a pre-created pymupdf.TextPage Returns: a list of rectangles or quads, each containing one occurrence. 
""" if flags is None: flags=(0 | TEXT_DEHYPHENATE | TEXT_PRESERVE_WHITESPACE | TEXT_PRESERVE_LIGATURES | TEXT_MEDIABOX_CLIP ) if clip is not None: clip = Rect(clip) CheckParent(page) tp = textpage if tp is None: tp = page.get_textpage(clip=clip, flags=flags) # create pymupdf.TextPage elif getattr(tp, "parent") != page: raise ValueError("not a textpage of this page") rlist = tp.search(text, quads=quads) if textpage is None: del tp return rlist def set_artbox(self, rect): """Set the ArtBox.""" return self._set_pagebox("ArtBox", rect) def set_bleedbox(self, rect): """Set the BleedBox.""" return self._set_pagebox("BleedBox", rect) def set_contents(self, xref): """Set object at 'xref' as the page's /Contents.""" CheckParent(self) doc = self.parent if doc.is_closed: raise ValueError("document closed") if not doc.is_pdf: raise ValueError("is no PDF") if xref not in range(1, doc.xref_length()): raise ValueError("bad xref") if not doc.xref_is_stream(xref): raise ValueError("xref is no stream") doc.xref_set_key(self.xref, "Contents", "%i 0 R" % xref) def set_cropbox(self, rect): """Set the CropBox. 
Will also change Page.rect.""" return self._set_pagebox("CropBox", rect) def set_language(self, language=None): """Set PDF page default language.""" CheckParent(self) pdfpage = _as_pdf_page(self.this) if not language: mupdf.pdf_dict_del(pdfpage.obj(), PDF_NAME('Lang')) else: lang = mupdf.fz_text_language_from_string(language) assert hasattr(mupdf, 'fz_string_from_text_language2') mupdf.pdf_dict_put_text_string( pdfpage.obj, PDF_NAME('Lang'), mupdf.fz_string_from_text_language2(lang) ) def set_mediabox(self, rect): """Set the MediaBox.""" CheckParent(self) page = self._pdf_page() mediabox = JM_rect_from_py(rect) if (mupdf.fz_is_empty_rect(mediabox) or mupdf.fz_is_infinite_rect(mediabox) ): raise ValueError( MSG_BAD_RECT) mupdf.pdf_dict_put_rect( page.obj(), PDF_NAME('MediaBox'), mediabox) mupdf.pdf_dict_del( page.obj(), PDF_NAME('CropBox')) mupdf.pdf_dict_del( page.obj(), PDF_NAME('ArtBox')) mupdf.pdf_dict_del( page.obj(), PDF_NAME('BleedBox')) mupdf.pdf_dict_del( page.obj(), PDF_NAME('TrimBox')) def set_rotation(self, rotation): """Set page rotation.""" CheckParent(self) page = _as_pdf_page(self.this) rot = JM_norm_rotation(rotation) mupdf.pdf_dict_put_int( page.obj(), PDF_NAME('Rotate'), rot) def set_trimbox(self, rect): """Set the TrimBox.""" return self._set_pagebox("TrimBox", rect) def show_pdf_page( page, rect, docsrc, pno=0, keep_proportion=True, overlay=True, oc=0, rotate=0, clip=None, ) -> int: """Show page number 'pno' of PDF 'docsrc' in rectangle 'rect'. 
Args: rect: (rect-like) where to place the source image docsrc: (document) source PDF pno: (int) source page number keep_proportion: (bool) do not change width-height-ratio overlay: (bool) put in foreground oc: (xref) make visibility dependent on this OCG / OCMD (which must be defined in the target PDF) rotate: (int) degrees (multiple of 90) clip: (rect-like) part of source page rectangle Returns: xref of inserted object (for reuse) """ def calc_matrix(sr, tr, keep=True, rotate=0): """Calculate transformation matrix from source to target rect. Notes: The product of four matrices in this sequence: (1) translate correct source corner to origin, (2) rotate, (3) scale, (4) translate to target's top-left corner. Args: sr: source rect in PDF (!) coordinate system tr: target rect in PDF coordinate system keep: whether to keep source ratio of width to height rotate: rotation angle in degrees Returns: Transformation matrix. """ # calc center point of source rect smp = (sr.tl + sr.br) / 2.0 # calc center point of target rect tmp = (tr.tl + tr.br) / 2.0 # m moves to (0, 0), then rotates m = Matrix(1, 0, 0, 1, -smp.x, -smp.y) * Matrix(rotate) sr1 = sr * m # resulting source rect to calculate scale factors fw = tr.width / sr1.width # scale the width fh = tr.height / sr1.height # scale the height if keep: fw = fh = min(fw, fh) # take min if keeping aspect ratio m *= Matrix(fw, fh) # concat scale matrix m *= Matrix(1, 0, 0, 1, tmp.x, tmp.y) # concat move to target center return JM_TUPLE(m) CheckParent(page) doc = page.parent if not doc.is_pdf or not docsrc.is_pdf: raise ValueError("is no PDF") if rect.is_empty or rect.is_infinite: raise ValueError("rect must be finite and not empty") while pno < 0: # support negative page numbers pno += docsrc.page_count src_page = docsrc[pno] # load source page tar_rect = rect * ~page.transformation_matrix # target rect in PDF coordinates src_rect = src_page.rect if not clip else src_page.rect & clip # source rect if src_rect.is_empty or 
src_rect.is_infinite: raise ValueError("clip must be finite and not empty") src_rect = src_rect * ~src_page.transformation_matrix # ... in PDF coord matrix = calc_matrix(src_rect, tar_rect, keep=keep_proportion, rotate=rotate) # list of existing /Form /XObjects ilst = [i[1] for i in doc.get_page_xobjects(page.number)] ilst += [i[7] for i in doc.get_page_images(page.number)] ilst += [i[4] for i in doc.get_page_fonts(page.number)] # create a name not in that list n = "fzFrm" i = 0 _imgname = n + "0" while _imgname in ilst: i += 1 _imgname = n + str(i) isrc = docsrc._graft_id # used as key for graftmaps if doc._graft_id == isrc: raise ValueError("source document must not equal target") # retrieve / make Graftmap for source PDF gmap = doc.Graftmaps.get(isrc, None) if gmap is None: gmap = Graftmap(doc) doc.Graftmaps[isrc] = gmap # take note of generated xref for automatic reuse pno_id = (isrc, pno) # id of docsrc[pno] xref = doc.ShownPages.get(pno_id, 0) if overlay: page.wrap_contents() # ensure a balanced graphics state xref = page._show_pdf_page( src_page, overlay=overlay, matrix=matrix, xref=xref, oc=oc, clip=src_rect, graftmap=gmap, _imgname=_imgname, ) doc.ShownPages[pno_id] = xref return xref @property def transformation_matrix(self): """Page transformation matrix.""" CheckParent(self) ctm = mupdf.FzMatrix() page = self._pdf_page(required=False) if not page.m_internal: return JM_py_from_matrix(ctm) mediabox = mupdf.FzRect(mupdf.FzRect.Fixed_UNIT) # fixme: original code passed mediabox=NULL. 
mupdf.pdf_page_transform(page, mediabox, ctm) val = JM_py_from_matrix(ctm) if self.rotation % 360 == 0: val = Matrix(val) else: val = Matrix(1, 0, 0, -1, 0, self.cropbox.height) return val @property def trimbox(self): """The TrimBox""" rect = self._other_box("TrimBox") if rect is None: return self.cropbox mb = self.mediabox return Rect(rect[0], mb.y1 - rect[3], rect[2], mb.y1 - rect[1]) def update_link(page: 'Page', lnk: dict) -> None: """Update a link on the current page.""" CheckParent(page) annot = utils.getLinkText(page, lnk) if annot == "": raise ValueError("link kind not supported") page.parent.update_object(lnk["xref"], annot, page=page) def widgets(self, types=None): """ Generator over the widgets of a page. Args: types: (list) field types to subselect from. If none, all fields are returned. E.g. types=[PDF_WIDGET_TYPE_TEXT] will only yield text fields. """ #for a in self.annot_xrefs(): # log( '{a=}') widget_xrefs = [a[0] for a in self.annot_xrefs() if a[1] == mupdf.PDF_ANNOT_WIDGET] #log(f'widgets(): {widget_xrefs=}') for xref in widget_xrefs: widget = self.load_widget(xref) if types is None or widget.field_type in types: yield (widget) def wrap_contents(self): """Ensure page is in a balanced graphics state.""" push, pop = self._count_q_balance() # count missing "q"/"Q" commands if push > 0: # prepend required push commands prepend = b"q\n" * push TOOLS._insert_contents(self, prepend, False) if pop > 0: # append required pop commands append = b"\nQ" * pop + b"\n" TOOLS._insert_contents(self, append, True) def write_text( page: 'Page', rect=None, writers=None, overlay=True, color=None, opacity=None, keep_proportion=True, rotate=0, oc=0, ) -> None: """Write the text of one or more pymupdf.TextWriter objects. Args: rect: target rectangle. If None, the union of the text writers is used. writers: one or more pymupdf.TextWriter objects. overlay: put in foreground or background. keep_proportion: maintain aspect ratio of rectangle sides. 
rotate: arbitrary rotation angle. oc: the xref of an optional content object """ assert isinstance(page, Page) if not writers: raise ValueError("need at least one pymupdf.TextWriter") if type(writers) is TextWriter: if rotate == 0 and rect is None: writers.write_text(page, opacity=opacity, color=color, overlay=overlay) return None else: writers = (writers,) clip = writers[0].text_rect textdoc = Document() tpage = textdoc.new_page(width=page.rect.width, height=page.rect.height) for writer in writers: clip |= writer.text_rect writer.write_text(tpage, opacity=opacity, color=color) if rect is None: rect = clip page.show_pdf_page( rect, textdoc, 0, overlay=overlay, keep_proportion=keep_proportion, rotate=rotate, clip=clip, oc=oc, ) textdoc = None tpage = None @property def xref(self): """PDF xref number of page.""" CheckParent(self) return self.parent.page_xref(self.number) rect = property(bound, doc="page rectangle") class Pixmap: def __init__(self, *args): """ Pixmap(colorspace, irect, alpha) - empty pixmap. Pixmap(colorspace, src) - copy changing colorspace. Pixmap(src, width, height,[clip]) - scaled copy, float dimensions. Pixmap(src, alpha=1) - copy and add or drop alpha channel. Pixmap(filename) - from an image in a file. Pixmap(image) - from an image in memory (bytes). Pixmap(colorspace, width, height, samples, alpha) - from samples data. Pixmap(PDFdoc, xref) - from an image at xref in a PDF document. """ # Cache for property `self.samples_mv`. Set here so __del_() sees it if # we raise. # self._samples_mv = None # 2024-01-16: Experimental support for a memory-view of the underlying # data. Doesn't seem to make much difference to Pixmap.set_pixel() so # not currently used. 
self._memory_view = None if 0: pass elif args_match(args, (Colorspace, mupdf.FzColorspace), (mupdf.FzRect, mupdf.FzIrect, IRect, Rect, tuple) ): # create empty pixmap with colorspace and IRect cs, rect = args alpha = 0 pm = mupdf.fz_new_pixmap_with_bbox(cs, JM_irect_from_py(rect), mupdf.FzSeparations(0), alpha) self.this = pm elif args_match(args, (Colorspace, mupdf.FzColorspace), (mupdf.FzRect, mupdf.FzIrect, IRect, Rect, tuple), (int, bool) ): # create empty pixmap with colorspace and IRect cs, rect, alpha = args pm = mupdf.fz_new_pixmap_with_bbox(cs, JM_irect_from_py(rect), mupdf.FzSeparations(0), alpha) self.this = pm elif args_match(args, (Colorspace, mupdf.FzColorspace, type(None)), (Pixmap, mupdf.FzPixmap)): # copy pixmap, converting colorspace cs, spix = args if isinstance(cs, Colorspace): cs = cs.this elif cs is None: cs = mupdf.FzColorspace(None) if isinstance(spix, Pixmap): spix = spix.this if not mupdf.fz_pixmap_colorspace(spix).m_internal: raise ValueError( "source colorspace must not be None") if cs.m_internal: self.this = mupdf.fz_convert_pixmap( spix, cs, mupdf.FzColorspace(), mupdf.FzDefaultColorspaces(None), mupdf.FzColorParams(), 1 ) else: self.this = mupdf.fz_new_pixmap_from_alpha_channel( spix) if not self.this.m_internal: raise RuntimeError( MSG_PIX_NOALPHA) elif args_match(args, (Pixmap, mupdf.FzPixmap), (Pixmap, mupdf.FzPixmap)): # add mask to a pixmap w/o alpha channel spix, mpix = args if isinstance(spix, Pixmap): spix = spix.this if isinstance(mpix, Pixmap): mpix = mpix.this spm = spix mpm = mpix if not spix.m_internal: # intercept NULL for spix: make alpha only pix dst = mupdf.fz_new_pixmap_from_alpha_channel(mpm) if not dst.m_internal: raise RuntimeError( MSG_PIX_NOALPHA) else: dst = mupdf.fz_new_pixmap_from_color_and_mask(spm, mpm) self.this = dst elif (args_match(args, (Pixmap, mupdf.FzPixmap), (float, int), (float, int), None) or args_match(args, (Pixmap, mupdf.FzPixmap), (float, int), (float, int))): # create pixmap as scaled copy 
of another one if len(args) == 3: spix, w, h = args bbox = mupdf.FzIrect(mupdf.fz_infinite_irect) else: spix, w, h, clip = args bbox = JM_irect_from_py(clip) src_pix = spix.this if isinstance(spix, Pixmap) else spix if not mupdf.fz_is_infinite_irect(bbox): pm = mupdf.fz_scale_pixmap(src_pix, src_pix.x(), src_pix.y(), w, h, bbox) else: pm = mupdf.fz_scale_pixmap(src_pix, src_pix.x(), src_pix.y(), w, h, mupdf.FzIrect(mupdf.fz_infinite_irect)) self.this = pm elif args_match(args, str, (Pixmap, mupdf.FzPixmap)) and args[0] == 'raw': # Special raw construction where we set .this directly. _, pm = args if isinstance(pm, Pixmap): pm = pm.this self.this = pm elif args_match(args, (Pixmap, mupdf.FzPixmap), (int, None)): # Pixmap(struct Pixmap *spix, int alpha=1) # copy pixmap & add / drop the alpha channel spix = args[0] alpha = args[1] if len(args) == 2 else 1 src_pix = spix.this if isinstance(spix, Pixmap) else spix if not _INRANGE(alpha, 0, 1): raise ValueError( "bad alpha value") cs = mupdf.fz_pixmap_colorspace(src_pix) if not cs.m_internal and not alpha: raise ValueError( "cannot drop alpha for 'NULL' colorspace") seps = mupdf.FzSeparations() n = mupdf.fz_pixmap_colorants(src_pix) w = mupdf.fz_pixmap_width(src_pix) h = mupdf.fz_pixmap_height(src_pix) pm = mupdf.fz_new_pixmap(cs, w, h, seps, alpha) pm.m_internal.x = src_pix.m_internal.x pm.m_internal.y = src_pix.m_internal.y pm.m_internal.xres = src_pix.m_internal.xres pm.m_internal.yres = src_pix.m_internal.yres # copy samples data ------------------------------------------ if 1: # We use our pixmap_copy() to get best performance. # test_pixmap.py:test_setalpha(): 3.9s t=0.0062 extra.pixmap_copy( pm.m_internal, src_pix.m_internal, n) elif 1: # Use memoryview. 
# test_pixmap.py:test_setalpha(): 4.6 t=0.51 src_view = mupdf.fz_pixmap_samples_memoryview( src_pix) pm_view = mupdf.fz_pixmap_samples_memoryview( pm) if src_pix.alpha() == pm.alpha(): # identical samples #memcpy(tptr, sptr, w * h * (n + alpha)); size = w * h * (n + alpha) pm_view[ 0 : size] = src_view[ 0 : size] else: tptr = 0 sptr = 0 # This is a little faster than calling # pm.fz_samples_set(), but still quite slow. E.g. reduces # test_pixmap.py:test_setalpha() from 6.7s to 4.5s. # # t=0.53 pm_stride = pm.stride() pm_n = pm.n() pm_alpha = pm.alpha() src_stride = src_pix.stride() src_n = src_pix.n() #log( '{=pm_stride pm_n src_stride src_n}') for y in range( h): for x in range( w): pm_i = pm_stride * y + pm_n * x src_i = src_stride * y + src_n * x pm_view[ pm_i : pm_i + n] = src_view[ src_i : src_i + n] if pm_alpha: pm_view[ pm_i + n] = 255 else: # Copy individual bytes from Python. Very slow. # test_pixmap.py:test_setalpha(): 6.89 t=2.601 if src_pix.alpha() == pm.alpha(): # identical samples #memcpy(tptr, sptr, w * h * (n + alpha)); for i in range(w * h * (n + alpha)): mupdf.fz_samples_set(pm, i, mupdf.fz_samples_get(src_pix, i)) else: # t=2.56 tptr = 0 sptr = 0 src_pix_alpha = src_pix.alpha() for i in range(w * h): #memcpy(tptr, sptr, n); for j in range(n): mupdf.fz_samples_set(pm, tptr + j, mupdf.fz_samples_get(src_pix, sptr + j)) tptr += n if pm.alpha(): mupdf.fz_samples_set(pm, tptr, 255) tptr += 1 sptr += n + src_pix_alpha self.this = pm elif args_match(args, (mupdf.FzColorspace, Colorspace), int, int, None, (int, bool)): # create pixmap from samples data cs, w, h, samples, alpha = args if isinstance(cs, Colorspace): cs = cs.this assert isinstance(cs, mupdf.FzColorspace) n = mupdf.fz_colorspace_n(cs) stride = (n + alpha) * w seps = mupdf.FzSeparations() pm = mupdf.fz_new_pixmap(cs, w, h, seps, alpha) if isinstance( samples, (bytes, bytearray)): #log('using mupdf.python_buffer_data()') samples2 = mupdf.python_buffer_data(samples) size = len(samples) else: 
res = JM_BufferFromBytes(samples) if not res.m_internal: raise ValueError( "bad samples data") size, c = mupdf.fz_buffer_storage(res) samples2 = mupdf.python_buffer_data(samples) # raw swig proxy for `const unsigned char*`. if stride * h != size: raise ValueError( f"bad samples length {w=} {h=} {alpha=} {n=} {stride=} {size=}") mupdf.ll_fz_pixmap_copy_raw( pm.m_internal, samples2) self.this = pm elif args_match(args, None): # create pixmap from filename, file object, pathlib.Path or memory imagedata, = args name = 'name' if hasattr(imagedata, "resolve"): fname = imagedata.__str__() if fname: img = mupdf.fz_new_image_from_file(fname) elif hasattr(imagedata, name): fname = imagedata.name if fname: img = mupdf.fz_new_image_from_file(fname) elif isinstance(imagedata, str): img = mupdf.fz_new_image_from_file(imagedata) else: res = JM_BufferFromBytes(imagedata) if not res.m_internal or not res.m_internal.len: raise ValueError( "bad image data") img = mupdf.fz_new_image_from_buffer(res) # Original code passed null for subarea and ctm, but that's not # possible with MuPDF's python bindings. The equivalent is an # infinite rect and identify matrix scaled by img.w() and img.h(). 
pm, w, h = mupdf.fz_get_pixmap_from_image( img, mupdf.FzIrect(FZ_MIN_INF_RECT, FZ_MIN_INF_RECT, FZ_MAX_INF_RECT, FZ_MAX_INF_RECT), mupdf.FzMatrix( img.w(), 0, 0, img.h(), 0, 0), ) xres, yres = mupdf.fz_image_resolution(img) pm.m_internal.xres = xres pm.m_internal.yres = yres self.this = pm elif args_match(args, (Document, mupdf.FzDocument), int): # Create pixmap from PDF image identified by XREF number doc, xref = args pdf = _as_pdf_document(doc) xreflen = mupdf.pdf_xref_len(pdf) if not _INRANGE(xref, 1, xreflen-1): raise ValueError( MSG_BAD_XREF) ref = mupdf.pdf_new_indirect(pdf, xref, 0) type_ = mupdf.pdf_dict_get(ref, PDF_NAME('Subtype')) if (not mupdf.pdf_name_eq(type_, PDF_NAME('Image')) and not mupdf.pdf_name_eq(type_, PDF_NAME('Alpha')) and not mupdf.pdf_name_eq(type_, PDF_NAME('Luminosity')) ): raise ValueError( MSG_IS_NO_IMAGE) img = mupdf.pdf_load_image(pdf, ref) # Original code passed null for subarea and ctm, but that's not # possible with MuPDF's python bindings. The equivalent is an # infinite rect and identify matrix scaled by img.w() and img.h(). 
def __repr__(self):
    """Return a short description: class name, colorspace name, bbox, alpha.

    Always returns a string.  The previous implementation returned None
    for instances of subclasses, which makes `repr()` raise TypeError.
    For plain Pixmap objects the text is unchanged.
    """
    colorspace = self.colorspace
    # `self.colorspace` is falsy for pixmaps without a colorspace; the
    # literal text 'None' is shown in that case.
    cs_name = colorspace.this.m_internal.name if colorspace else 'None'
    return "%s(%s, %s, %s)" % (
            type(self).__name__,
            cs_name,
            self.irect,
            self.alpha,
            )
def color_topusage(self, clip=None):
    """Return most frequent color and its usage ratio.

    Args:
        clip: optional rect-like restricting the counted area.  If the
            pixmap's own `irect` lies inside `clip`, the pixmap's `irect`
            is used instead.

    Returns:
        Tuple `(ratio, pixel)`: the share of the most frequent color and
        that color as returned by `color_count()`.  If nothing was
        counted, `(1, bytes([255] * self.n))`.
    """
    if clip is not None and self.irect in Rect(clip):
        clip = self.irect
    histogram = self.color_count(colors=True, clip=clip)
    total = sum(histogram.values())
    if not total:
        return (1, bytes([255] * self.n))
    # max() keeps the first entry among equal counts, like a strict '>' scan.
    top_pixel, top_count = max(histogram.items(), key=lambda item: item[1])
    return (top_count / total, top_pixel)
@property
def is_unicolor(self):
    '''
    Check if pixmap has only one color.

    Reads the samples pixel by pixel (n samples each) and compares every
    pixel with the first one; stops at the first difference.
    '''
    pm = self.this
    n = pm.n()
    total = pm.w() * pm.h() * n

    def read_pixel(start):
        # The n consecutive sample values beginning at index `start`.
        return [mupdf.fz_samples_get(pm, start + k) for k in range(n)]

    reference = None
    for start in range(0, total, n):
        current = read_pixel(start)
        if reference is None:
            # First pixel: remember it as the comparison value.
            reference = current
        elif current != reference:
            return False
    return True
def pil_image(self):
    """Create a Pillow Image from the Pixmap.

    The Pillow mode is chosen from the colorspace component count and the
    alpha flag: "L"/"LA" for one component, "RGB"/"RGBA" for three, "CMYK"
    otherwise; "L" if there is no colorspace.

    Raises:
        ImportError: Pillow is not installed (a message is emitted first).
    """
    try:
        from PIL import Image
    except ImportError:
        message("PIL/Pillow not installed")
        raise

    colorspace = self.colorspace
    if not colorspace:
        mode = "L"
    else:
        components = colorspace.n
        if components == 1:
            mode = "LA" if self.alpha else "L"
        elif components == 3:
            mode = "RGBA" if self.alpha else "RGB"
        else:
            mode = "CMYK"

    dimensions = (self.width, self.height)
    return Image.frombytes(mode, dimensions, self.samples)
def save(self, filename, output=None, jpg_quality=95):
    """Output as image in format determined by filename extension.

    Args:
        filename: a str, an object with an `absolute` attribute (converted
            with str()), or an object with a `name` attribute.
        output: (str) only use to overrule filename extension. Default is PNG.
            Others are JPEG, JPG, PNM, PGM, PPM, PBM, PAM, PSD, PS.
        jpg_quality: (int) forwarded to the writer.

    Raises:
        ValueError: unknown format, alpha not allowed for the format, or
            colorspace not supported by the format.
    """
    format_codes = {
            "png": 1,
            "pnm": 2,
            "pgm": 2,
            "ppm": 2,
            "pbm": 2,
            "pam": 3,
            "psd": 5,
            "ps": 6,
            "jpg": 7,
            "jpeg": 7,
            }

    # Reduce path-like and file-like arguments to a plain file name.
    if type(filename) is not str:
        if hasattr(filename, "absolute"):
            filename = str(filename)
        elif hasattr(filename, "name"):
            filename = filename.name

    if output is None:
        # Take the format from the extension, without the leading dot.
        output = os.path.splitext(filename)[1][1:]

    code = format_codes.get(output.lower(), None)
    if code is None:
        raise ValueError(f"Image format {output} not in {tuple(format_codes.keys())}")

    if self.alpha and code in (2, 6, 7):
        raise ValueError("'%s' cannot have alpha" % output)
    if self.colorspace and self.colorspace.n > 3 and code in (1, 2, 4):
        raise ValueError(f"unsupported colorspace for '{output}'")

    if code == 7:
        self.set_dpi(self.xres, self.yres)
    return self._writeIMG(filename, code, jpg_quality)
""" pix = self.this alpha = 0 m = 0 if pix.alpha() == 0: raise ValueError( MSG_PIX_NOALPHA) n = mupdf.fz_pixmap_colorants(pix) w = mupdf.fz_pixmap_width(pix) h = mupdf.fz_pixmap_height(pix) balen = w * h * (n+1) colors = [0, 0, 0, 0] # make this color opaque bgcolor = [0, 0, 0, 0] # preblending background color zero_out = 0 bground = 0 if opaque and isinstance(opaque, (list, tuple)) and len(opaque) == n: for i in range(n): colors[i] = opaque[i] zero_out = 1 if matte and isinstance( matte, (tuple, list)) and len(matte) == n: for i in range(n): bgcolor[i] = matte[i] bground = 1 data = bytes() data_len = 0 if alphavalues: #res = JM_BufferFromBytes(alphavalues) #data_len, data = mupdf.fz_buffer_storage(res) #if data_len < w * h: # THROWMSG("bad alpha values") # fixme: don't seem to need to create an fz_buffer - can # use <alphavalues> directly? if isinstance(alphavalues, (bytes, bytearray)): data = alphavalues data_len = len(alphavalues) else: assert 0, f'unexpected type for alphavalues: {type(alphavalues)}' if data_len < w * h: raise ValueError( "bad alpha values") if 1: # Use C implementation for speed. 
def tobytes(self, output="png", jpg_quality=95):
    '''
    Convert to binary image stream of desired type.

    Args:
        output: (str) format name, case-insensitive; one of png, pnm, pgm,
            ppm, pbm, pam, tga, tpic, psd, ps, jpg, jpeg.
        jpg_quality: (int) forwarded to the encoder.

    Returns:
        The result of `self._tobytes()` for the selected format.

    Raises:
        ValueError: unknown format, alpha not allowed for the format, or
            colorspace not supported by the format.
    '''
    # NOTE(review): "tga"/"tpic" map to 4, but the `_tobytes()` visible in
    # this file has no branch for 4 and falls back to PNG - confirm intent.
    valid_formats = {
            "png": 1,
            "pnm": 2,
            "pgm": 2,
            "ppm": 2,
            "pbm": 2,
            "pam": 3,
            "tga": 4,
            "tpic": 4,
            "psd": 5,
            "ps": 6,
            'jpg': 7,
            'jpeg': 7,
            }
    idx = valid_formats.get(output.lower(), None)
    if idx is None:
        raise ValueError(f"Image format {output} not in {tuple(valid_formats.keys())}")
    if self.alpha and idx in (2, 6, 7):
        # Fixed: the f-prefix was missing, so the message showed the
        # literal text '{output}' instead of the format name.
        raise ValueError(f"'{output}' cannot have alpha")
    if self.colorspace and self.colorspace.n > 3 and idx in (1, 2, 4):
        raise ValueError(f"unsupported colorspace for '{output}'")
    if idx == 7:
        self.set_dpi(self.xres, self.yres)
    return self._tobytes(idx, jpg_quality)
def set_rect(self, bbox, color):
    """Set color of all pixels in bbox.

    Args:
        bbox: rect-like, converted with JM_irect_from_py().
        color: sequence with at least `n` values, each in range 0..255.

    Returns:
        bool() of the result of JM_fill_pixmap_rect_with_color().

    Raises:
        ValueError: a color component is outside 0..255.
    """
    pixmap = self.this
    components = []
    for index in range(pixmap.n()):
        value = color[index]
        if not _INRANGE(value, 0, 255):
            raise ValueError(MSG_BAD_COLOR_SEQ)
        components.append(value)
    irect = JM_irect_from_py(bbox)
    return bool(JM_fill_pixmap_rect_with_color(pixmap, components, irect))
class Point:
    """Mutable 2D point with coordinates `x` and `y`.

    Behaves like a sequence of length 2 and supports arithmetic with
    numbers, point-like sequences and (for `*` and `/`) matrix-like
    sequences.
    """

    def __abs__(self):
        """Return the Euclidean length of the vector (x, y)."""
        return math.sqrt(self.x * self.x + self.y * self.y)

    def __add__(self, p):
        """Add a number to both coordinates, or add a point-like."""
        if hasattr(p, "__float__"):
            return Point(self.x + p, self.y + p)
        if len(p) != 2:
            raise ValueError("Point: bad seq len")
        return Point(self.x + p[0], self.y + p[1])

    def __bool__(self):
        """False only if both coordinates are zero."""
        return not (max(self) == min(self) == 0)

    def __eq__(self, p):
        """Equal to any sequence of length 2 with the same coordinates."""
        if not hasattr(p, "__len__"):
            return False
        return len(p) == 2 and not (self - p)

    def __getitem__(self, i):
        return (self.x, self.y)[i]

    def __hash__(self):
        return hash(tuple(self))

    def __init__(self, *args, x=None, y=None):
        '''
        Point() - all zeros
        Point(x, y)
        Point(Point) - new copy
        Point(sequence) - from 'sequence'

        Explicit keyword args x, y override earlier settings if not None.
        Like the positional forms, they are converted to float.

        Raises:
            ValueError: wrong number of arguments or bad sequence.
        '''
        if not args:
            self.x = 0.0
            self.y = 0.0
        elif len(args) > 2:
            raise ValueError("Point: bad seq len")
        elif len(args) == 2:
            self.x = float(args[0])
            self.y = float(args[1])
        else:
            # Exactly one argument: a MuPDF point or a sequence of length 2.
            l = args[0]
            if isinstance(l, (mupdf.FzPoint, mupdf.fz_point)):
                self.x = l.x
                self.y = l.y
            else:
                if not hasattr(l, "__getitem__"):
                    raise ValueError("Point: bad args")
                if len(l) != 2:
                    raise ValueError("Point: bad seq len")
                self.x = float(l[0])
                self.y = float(l[1])
        # Fixed: keyword overrides used to be stored unconverted, so
        # Point(x=1) held an int while Point(1, 0) held a float.
        if x is not None:
            self.x = float(x)
        if y is not None:
            self.y = float(y)

    def __len__(self):
        return 2

    def __mul__(self, m):
        """Multiply with a number, a point-like (dot product) or a matrix-like."""
        if hasattr(m, "__float__"):
            return Point(self.x * m, self.y * m)
        if hasattr(m, "__getitem__") and len(m) == 2:
            # dot product
            return self.x * m[0] + self.y * m[1]
        p = Point(self)
        return p.transform(m)

    def __neg__(self):
        return Point(-self.x, -self.y)

    # Same truth test under the legacy name.
    __nonzero__ = __bool__

    def __pos__(self):
        return Point(self)

    def __repr__(self):
        return "Point" + str(tuple(self))

    def __setitem__(self, i, v):
        v = float(v)
        if i == 0:
            self.x = v
        elif i == 1:
            self.y = v
        else:
            raise IndexError("index out of range")
        return None

    def __sub__(self, p):
        """Subtract a number from both coordinates, or subtract a point-like."""
        if hasattr(p, "__float__"):
            return Point(self.x - p, self.y - p)
        if len(p) != 2:
            raise ValueError("Point: bad seq len")
        return Point(self.x - p[0], self.y - p[1])

    def __truediv__(self, m):
        """Divide by a number, or transform with the inverse of matrix-like m.

        Raises:
            ZeroDivisionError: the matrix is not invertible.
        """
        if hasattr(m, "__float__"):
            return Point(self.x * 1./m, self.y * 1./m)
        m1 = util_invert_matrix(m)[1]
        if not m1:
            raise ZeroDivisionError("matrix not invertible")
        p = Point(self)
        return p.transform(m1)

    @property
    def abs_unit(self):
        """Unit vector with positive coordinates."""
        s = self.x * self.x + self.y * self.y
        if s < EPSILON:
            return Point(0, 0)
        s = math.sqrt(s)
        return Point(abs(self.x) / s, abs(self.y) / s)

    def distance_to(self, *args):
        """Return distance to rectangle or another point.

        Args:
            args[0]: point-like (length 2) or rect-like (length 4).
            args[1]: optional unit, one of "px" (default), "in", "cm", "mm".

        Raises:
            ValueError: no argument given, or first argument has bad length.
        """
        if not len(args) > 0:
            raise ValueError("at least one parameter must be given")

        x = args[0]
        if len(x) == 2:
            x = Point(x)
        elif len(x) == 4:
            x = Rect(x)
        else:
            raise ValueError("arg1 must be point-like or rect-like")

        if len(args) > 1:
            unit = args[1]
        else:
            unit = "px"
        # (numerator, denominator) of the factor applied to the distance.
        u = {"px": (1., 1.), "in": (1., 72.), "cm": (2.54, 72.),
             "mm": (25.4, 72.)}
        f = u[unit][0] / u[unit][1]

        if type(x) is Point:
            return abs(self - x) * f

        # from here on, x is a rectangle
        # as a safeguard, make a finite copy of it
        r = Rect(x.top_left, x.top_left)
        r = r | x.bottom_right
        if self in r:
            return 0.0
        if self.x > r.x1:
            # right of the rectangle
            if self.y >= r.y1:
                return self.distance_to(r.bottom_right, unit)
            elif self.y <= r.y0:
                return self.distance_to(r.top_right, unit)
            else:
                return (self.x - r.x1) * f
        elif r.x0 <= self.x <= r.x1:
            # above or below the rectangle
            if self.y >= r.y1:
                return (self.y - r.y1) * f
            else:
                return (r.y0 - self.y) * f
        else:
            # left of the rectangle
            if self.y >= r.y1:
                return self.distance_to(r.bottom_left, unit)
            elif self.y <= r.y0:
                return self.distance_to(r.top_left, unit)
            else:
                return (r.x0 - self.x) * f

    def transform(self, m):
        """Replace point by its transformation with matrix-like m."""
        if len(m) != 6:
            raise ValueError("Matrix: bad seq len")
        self.x, self.y = util_transform_point(self, m)
        return self

    @property
    def unit(self):
        """Unit vector of the point."""
        s = self.x * self.x + self.y * self.y
        if s < EPSILON:
            return Point(0, 0)
        s = math.sqrt(s)
        return Point(self.x / s, self.y / s)

    __div__ = __truediv__
    norm = __abs__
def __init__(self, *args, ul=None, ur=None, ll=None, lr=None):
    '''
    Quad() - all zero points
    Quad(ul, ur, ll, lr)
    Quad(quad) - new copy
    Quad(sequence) - from 'sequence'

    Explicit keyword args ul, ur, ll, lr override earlier settings if not
    None.

    Raises:
        ValueError: wrong number of arguments or bad sequence.
    '''
    if not args:
        # Fixed: the corners used to share ONE Point object, so changing
        # e.g. `quad.ul.x` silently changed all four corners.
        self.ul, self.ur, self.ll, self.lr = Point(), Point(), Point(), Point()
    elif len(args) > 4:
        raise ValueError("Quad: bad seq len")
    elif len(args) == 4:
        self.ul, self.ur, self.ll, self.lr = map(Point, args)
    elif len(args) == 1:
        l = args[0]
        if isinstance(l, mupdf.FzQuad):
            # Keep the MuPDF object and copy its corners.
            self.this = l
            self.ul, self.ur, self.ll, self.lr = Point(l.ul), Point(l.ur), Point(l.ll), Point(l.lr)
        elif not hasattr(l, "__getitem__"):
            raise ValueError("Quad: bad args")
        elif len(l) != 4:
            raise ValueError("Quad: bad seq len")
        else:
            self.ul, self.ur, self.ll, self.lr = map(Point, l)
    else:
        # Two or three positional arguments.
        raise ValueError("Quad: bad args")
    if ul is not None:
        self.ul = Point(ul)
    if ur is not None:
        self.ur = Point(ur)
    if ll is not None:
        self.ll = Point(ll)
    if lr is not None:
        self.lr = Point(lr)
@property
def is_rectangular(self):
    """Check if quad is rectangular.

    Notes:
        Some rotation matrix can thus transform it into a rectangle.
        This is equivalent to three corners enclose 90 degrees.

    Returns:
        True or False.
    """
    corner_triples = (
            (self.ul, self.ur, self.lr),
            (self.ur, self.lr, self.ll),
            (self.lr, self.ll, self.ul),
            )
    for first, middle, last in corner_triples:
        # The sine of each checked angle must be 1 within EPSILON.
        sine = util_sine_between(first, middle, last)
        if abs(sine - 1) > EPSILON:
            return False
    return True
Return a new quad.""" if self.is_infinite: return INFINITE_QUAD() delta = Matrix(1, 1).pretranslate(p.x, p.y) q = self * ~delta * m * delta return q @property def rect(self): r = Rect() r.x0 = min(self.ul.x, self.ur.x, self.lr.x, self.ll.x) r.y0 = min(self.ul.y, self.ur.y, self.lr.y, self.ll.y) r.x1 = max(self.ul.x, self.ur.x, self.lr.x, self.ll.x) r.y1 = max(self.ul.y, self.ur.y, self.lr.y, self.ll.y) return r def transform(self, m): """Replace quad by its transformation with matrix m.""" if hasattr(m, "__float__"): pass elif len(m) != 6: raise ValueError("Matrix: bad seq len") self.ul *= m self.ur *= m self.ll *= m self.lr *= m return self __div__ = __truediv__ width = property(lambda self: max(abs(self.ul - self.ur), abs(self.ll - self.lr))) height = property(lambda self: max(abs(self.ul - self.ll), abs(self.ur - self.lr))) class Rect: def __abs__(self): if self.is_empty or self.is_infinite: return 0.0 return (self.x1 - self.x0) * (self.y1 - self.y0) def __add__(self, p): if hasattr(p, "__float__"): return Rect(self.x0 + p, self.y0 + p, self.x1 + p, self.y1 + p) if len(p) != 4: raise ValueError("Rect: bad seq len") return Rect(self.x0 + p[0], self.y0 + p[1], self.x1 + p[2], self.y1 + p[3]) def __and__(self, x): if not hasattr(x, "__len__"): raise ValueError("bad operand 2") r1 = Rect(x) r = Rect(self) return r.intersect(r1) def __bool__(self): return not (max(self) == min(self) == 0) def __contains__(self, x): if hasattr(x, "__float__"): return x in tuple(self) l = len(x) if l == 2: return util_is_point_in_rect(x, self) if l == 4: r = INFINITE_RECT() try: r = Rect(x) except Exception: if g_exceptions_verbose > 1: exception_info() r = Quad(x).rect return (self.x0 <= r.x0 <= r.x1 <= self.x1 and self.y0 <= r.y0 <= r.y1 <= self.y1) return False def __eq__(self, rect): if not hasattr(rect, "__len__"): return False return len(rect) == 4 and not (self - rect) def __getitem__(self, i): return (self.x0, self.y0, self.x1, self.y1)[i] def __hash__(self): return 
hash(tuple(self)) def __init__(self, *args, p0=None, p1=None, x0=None, y0=None, x1=None, y1=None): """ Rect() - all zeros Rect(x0, y0, x1, y1) Rect(top-left, x1, y1) Rect(x0, y0, bottom-right) Rect(top-left, bottom-right) Rect(Rect or IRect) - new copy Rect(sequence) - from 'sequence' Explicit keyword args p0, p1, x0, y0, x1, y1 override earlier settings if not None. """ x0, y0, x1, y1 = util_make_rect( *args, p0=p0, p1=p1, x0=x0, y0=y0, x1=x1, y1=y1) self.x0 = float( x0) self.y0 = float( y0) self.x1 = float( x1) self.y1 = float( y1) def __len__(self): return 4 def __mul__(self, m): if hasattr(m, "__float__"): return Rect(self.x0 * m, self.y0 * m, self.x1 * m, self.y1 * m) r = Rect(self) r = r.transform(m) return r def __neg__(self): return Rect(-self.x0, -self.y0, -self.x1, -self.y1) def __nonzero__(self): return not (max(self) == min(self) == 0) def __or__(self, x): if not hasattr(x, "__len__"): raise ValueError("bad operand 2") r = Rect(self) if len(x) == 2: return r.include_point(x) if len(x) == 4: return r.include_rect(x) raise ValueError("bad operand 2") def __pos__(self): return Rect(self) def __repr__(self): return "Rect" + str(tuple(self)) def __setitem__(self, i, v): v = float(v) if i == 0: self.x0 = v elif i == 1: self.y0 = v elif i == 2: self.x1 = v elif i == 3: self.y1 = v else: raise IndexError("index out of range") return None def __sub__(self, p): if hasattr(p, "__float__"): return Rect(self.x0 - p, self.y0 - p, self.x1 - p, self.y1 - p) if len(p) != 4: raise ValueError("Rect: bad seq len") return Rect(self.x0 - p[0], self.y0 - p[1], self.x1 - p[2], self.y1 - p[3]) def __truediv__(self, m): if hasattr(m, "__float__"): return Rect(self.x0 * 1./m, self.y0 * 1./m, self.x1 * 1./m, self.y1 * 1./m) im = util_invert_matrix(m)[1] if not im: raise ZeroDivisionError(f"Matrix not invertible: {m}") r = Rect(self) r = r.transform(im) return r @property def bottom_left(self): """Bottom-left corner.""" return Point(self.x0, self.y1) @property def 
bottom_right(self): """Bottom-right corner.""" return Point(self.x1, self.y1) def contains(self, x): """Check if containing point-like or rect-like x.""" return self.__contains__(x) @property def height(self): return max(0, self.y1 - self.y0) def include_point(self, p): """Extend to include point-like p.""" if len(p) != 2: raise ValueError("Point: bad seq len") self.x0, self.y0, self.x1, self.y1 = util_include_point_in_rect(self, p) return self def include_rect(self, r): """Extend to include rect-like r.""" if len(r) != 4: raise ValueError("Rect: bad seq len") r = Rect(r) if r.is_infinite or self.is_infinite: self.x0, self.y0, self.x1, self.y1 = FZ_MIN_INF_RECT, FZ_MIN_INF_RECT, FZ_MAX_INF_RECT, FZ_MAX_INF_RECT elif r.is_empty: return self elif self.is_empty: self.x0, self.y0, self.x1, self.y1 = r.x0, r.y0, r.x1, r.y1 else: self.x0, self.y0, self.x1, self.y1 = util_union_rect(self, r) return self def intersect(self, r): """Restrict to common rect with rect-like r.""" if not len(r) == 4: raise ValueError("Rect: bad seq len") r = Rect(r) if r.is_infinite: return self elif self.is_infinite: self.x0, self.y0, self.x1, self.y1 = r.x0, r.y0, r.x1, r.y1 elif r.is_empty: self.x0, self.y0, self.x1, self.y1 = r.x0, r.y0, r.x1, r.y1 elif self.is_empty: return self else: self.x0, self.y0, self.x1, self.y1 = util_intersect_rect(self, r) return self def intersects(self, x): """Check if intersection with rectangle x is not empty.""" rect2 = Rect(x) return (1 and not self.is_empty and not self.is_infinite and not rect2.is_empty and not rect2.is_infinite and self.x0 < rect2.x1 and rect2.x0 < self.x1 and self.y0 < rect2.y1 and rect2.y0 < self.y1 ) @property def is_empty(self): """True if rectangle area is empty.""" return self.x0 >= self.x1 or self.y0 >= self.y1 @property def is_infinite(self): """True if this is the infinite rectangle.""" return self.x0 == self.y0 == FZ_MIN_INF_RECT and self.x1 == self.y1 == FZ_MAX_INF_RECT @property def is_valid(self): """True if rectangle is 
@staticmethod
def horizontal_angle(C, P):
    """Return the signed angle (radians) between the horizontal and C -> P.

    The magnitude comes from asin(|y|) of the unit vector P - C, which only
    covers 0..pi/2. The signs of the vector's x and y components are then
    used to move the result into the correct quadrant.
    """
    direction = Point(P - C).unit  # unit vector 'C' -> 'P'
    angle = math.asin(abs(direction.y))  # absolute angle from horizontal
    if direction.x < 0:
        # left half-plane: measure from the negative x-axis instead
        angle = math.pi - angle
        if direction.y <= 0:
            angle = -angle
        return angle
    # right half-plane: only the sign may need flipping
    return angle if direction.y >= 0 else -angle
def draw_sector(
        self,
        center: point_like,
        point: point_like,
        beta: float,
        fullSector: bool = True,
    ) -> Point:
    """Draw a circle sector.

    The arc starts at `point`, runs around `center` and is appended to
    `self.draw_cont` as cubic Bezier segments ("c" operators): first as
    many 90 degree pieces as fit, then one piece for the remainder.

    Args:
        center: point-like, centre of the circle.
        point: point-like, start of the arc; its distance from `center`
            is the radius.
        beta: arc angle in degrees. It is negated and converted to radians
            before use; values beyond a full circle are reduced in steps of
            360 degrees.
        fullSector: if true, additionally emit a move to `point`, a line to
            `center` and a line to the arc's end point.

    Returns:
        The arc's end point, which is also stored in `self.last_point`.

    Raises:
        ValueError: if the radius is not greater than EPSILON.
    """
    center = Point(center)
    point = Point(point)
    # small formatters for the "m" (move), "c" (curve) and "l" (line) operators
    l3 = lambda a, b: _format_g((a, b)) + " m\n"
    l4 = lambda a, b, c, d, e, f: _format_g((a, b, c, d, e, f)) + " c\n"
    l5 = lambda a, b: _format_g((a, b)) + " l\n"
    betar = math.radians(-beta)
    # w360 has the opposite sign of betar, so adding it shrinks |betar|
    w360 = math.radians(math.copysign(360, betar)) * (-1)
    w90 = math.radians(math.copysign(90, betar))
    w45 = w90 / 2
    while abs(betar) > 2 * math.pi:
        betar += w360  # bring angle below 360 degrees
    # start a new subpath only if we are not already at the start point
    if not (self.last_point == point):
        self.draw_cont += l3(*JM_TUPLE(point * self.ipctm))
        self.last_point = point
    # NOTE(review): if neither the loop nor the remainder branch below
    # runs, Q keeps this value and becomes the returned end point.
    Q = Point(0, 0)  # just make sure it exists
    C = center
    P = point
    S = P - C  # vector 'center' -> 'point'
    rad = abs(S)  # circle radius

    if not rad > EPSILON:
        raise ValueError("radius must be positive")

    alfa = self.horizontal_angle(center, point)
    while abs(betar) > abs(w90):  # draw 90 degree arcs
        q1 = C.x + math.cos(alfa + w90) * rad
        q2 = C.y + math.sin(alfa + w90) * rad
        Q = Point(q1, q2)  # the arc's end point
        r1 = C.x + math.cos(alfa + w45) * rad / math.cos(w45)
        r2 = C.y + math.sin(alfa + w45) * rad / math.cos(w45)
        R = Point(r1, r2)  # crossing point of tangents
        kappah = (1 - math.cos(w45)) * 4 / 3 / abs(R - Q)
        kappa = kappah * abs(P - Q)
        cp1 = P + (R - P) * kappa  # control point 1
        cp2 = Q + (R - Q) * kappa  # control point 2
        self.draw_cont += l4(*JM_TUPLE(
            list(cp1 * self.ipctm) + list(cp2 * self.ipctm) + list(Q * self.ipctm)
        ))

        betar -= w90  # reduce param angle by 90 deg
        alfa += w90  # advance start angle by 90 deg
        P = Q  # advance to arc end point
    # draw (remaining) arc
    if abs(betar) > 1e-3:  # significant degrees left?
        beta2 = betar / 2
        q1 = C.x + math.cos(alfa + betar) * rad
        q2 = C.y + math.sin(alfa + betar) * rad
        Q = Point(q1, q2)  # the arc's end point
        r1 = C.x + math.cos(alfa + beta2) * rad / math.cos(beta2)
        r2 = C.y + math.sin(alfa + beta2) * rad / math.cos(beta2)
        R = Point(r1, r2)  # crossing point of tangents
        # kappa height is 4/3 of segment height
        kappah = (1 - math.cos(beta2)) * 4 / 3 / abs(R - Q)  # kappa height
        kappa = kappah * abs(P - Q) / (1 - math.cos(betar))
        cp1 = P + (R - P) * kappa  # control point 1
        cp2 = Q + (R - Q) * kappa  # control point 2
        self.draw_cont += l4(*JM_TUPLE(
            list(cp1 * self.ipctm) + list(cp2 * self.ipctm) + list(Q * self.ipctm)
        ))
    if fullSector:
        # add the two radii: start point -> centre -> arc end
        self.draw_cont += l3(*JM_TUPLE(point * self.ipctm))
        self.draw_cont += l5(*JM_TUPLE(center * self.ipctm))
        self.draw_cont += l5(*JM_TUPLE(Q * self.ipctm))
    self.last_point = Q
    return self.last_point
def draw_zigzag(
        self,
        p1: point_like,
        p2: point_like,
        breadth: float = 2,
    ) -> Point:
    """Draw a zig-zag polyline connecting p1 with p2.

    The distance is divided into a multiple of four equal steps, with a
    step length as close to `breadth` as whole phases allow. Corners are
    placed at every odd step, alternating between the two sides of the
    connecting line. Raises ValueError("points too close") when not even
    one full phase fits. Returns p2 as a Point.
    """
    start = Point(p1)
    end = Point(p2)
    distance = abs(end - start)  # length of the connection
    cnt = 4 * int(round(distance / (4 * breadth), 0))  # always take full phases
    if cnt < 4:
        raise ValueError("points too close")
    step = distance / cnt  # revised breadth
    # util_hor_matrix maps the line onto the x-axis; its inverse maps back
    back_to_page = ~Matrix(util_hor_matrix(start, end))
    corners = [
        Point(i, -1 if i % 4 == 1 else 1) * step * back_to_page
        for i in range(1, cnt, 2)
    ]
    self.draw_polyline([start] + corners + [end])  # add start and end points
    return end
# ==============================================================================
# Shape.insert_text
# ==============================================================================
def insert_text(
        self,
        point: point_like,
        buffer: typing.Union[str, list],
        *,
        fontsize: float = 11,
        lineheight: OptFloat = None,
        fontname: str = "helv",
        fontfile: OptStr = None,
        set_simple: bool = 0,
        encoding: int = 0,
        color: OptSeq = None,
        fill: OptSeq = None,
        render_mode: int = 0,
        border_width: float = 0.05,
        miter_limit: float = 1,
        rotate: int = 0,
        morph: OptSeq = None,
        stroke_opacity: float = 1,
        fill_opacity: float = 1,
        oc: int = 0,
    ) -> int:
    """Insert text lines starting at a point.

    The generated PDF text object is appended to `self.text_cont`.

    Args:
        point: point-like start position of the first line.
        buffer: a string (split with splitlines()) or a list / tuple of
            strings, one entry per line.
        fontsize: font size.
        lineheight: factor applied to `fontsize` to get the line height.
            If falsy, the font's ascender - descender is used, or 1.2 when
            that difference is not greater than 1.
        fontname: font reference name; a leading "/" is removed.
        fontfile, set_simple, encoding: passed on to page.insert_font().
        color: stroke color, converted with ColorCode(color, "c").
        fill: fill color. If falsy and `render_mode` is 0, `color` is used.
        render_mode: value for the "Tr" operator, emitted when > 0.
        border_width: multiplied by `fontsize` for the "w" operator.
        miter_limit: value for the "M" operator, skipped when None.
        rotate: multiple of 90 (degrees); negative values are normalized.
        morph: checked by CheckMorph(); if accepted, morph[0] is used as
            fixpoint and morph[1] as matrix.
        stroke_opacity, fill_opacity: passed to page._set_opacity().
        oc: passed to page._get_optional_content().

    Returns:
        Number of lines written. 0 if `buffer` is empty or its character
        codes cannot be determined.

    Raises:
        ValueError: if `rotate` is not a multiple of 90.
    """
    # ensure 'text' is a list of strings, worth dealing with
    if not bool(buffer):
        return 0

    if type(buffer) not in (list, tuple):
        text = buffer.splitlines()
    else:
        text = buffer

    if not len(text) > 0:
        return 0

    point = Point(point)
    try:
        # highest character code decides which glyph width table is needed
        maxcode = max([ord(c) for c in " ".join(text)])
    except Exception:
        exception_info()
        return 0

    # ensure valid 'fontname'
    fname = fontname
    if fname.startswith("/"):
        fname = fname[1:]

    xref = self.page.insert_font(
        fontname=fname, fontfile=fontfile, encoding=encoding, set_simple=set_simple
    )
    fontinfo = CheckFontInfo(self.doc, xref)

    fontdict = fontinfo[1]
    ordering = fontdict["ordering"]
    simple = fontdict["simple"]
    bfname = fontdict["name"]
    ascender = fontdict["ascender"]
    descender = fontdict["descender"]
    if lineheight:
        lheight = fontsize * lineheight
    elif ascender - descender <= 1:
        lheight = fontsize * 1.2
    else:
        lheight = fontsize * (ascender - descender)

    if maxcode > 255:
        glyphs = self.doc.get_char_widths(xref, maxcode + 1)
    else:
        glyphs = fontdict["glyphs"]

    # convert every line to its TJ operand string
    tab = []
    for t in text:
        if simple and bfname not in ("Symbol", "ZapfDingbats"):
            g = None
        else:
            g = glyphs
        tab.append(getTJstr(t, g, simple, ordering))
    text = tab

    color_str = ColorCode(color, "c")
    fill_str = ColorCode(fill, "f")
    if not fill and render_mode == 0:  # ensure fill color when 0 Tr
        fill = color
        fill_str = ColorCode(color, "f")

    morphing = CheckMorph(morph)
    rot = rotate
    if rot % 90 != 0:
        raise ValueError("bad rotate value")

    while rot < 0:
        rot += 360
    rot = rot % 360  # text rotate = 0, 90, 270, 180

    # templ1: opening of the text object, templ2: end of line 1 incl. leading
    templ1 = lambda a, b, c, d, e, f, g: f"\nq\n{a}{b}BT\n{c}1 0 0 1 {_format_g((d, e))} Tm\n/{f} {_format_g(g)} Tf "
    templ2 = lambda a: f"TJ\n0 -{_format_g(a)} TD\n"
    cmp90 = "0 1 -1 0 0 0 cm\n"  # rotates 90 deg counter-clockwise
    cmm90 = "0 -1 1 0 0 0 cm\n"  # rotates 90 deg clockwise
    cm180 = "-1 0 0 -1 0 0 cm\n"  # rotates by 180 deg.
    height = self.height
    width = self.width

    # setting up for standard rotation directions
    # case rotate = 0
    if morphing:
        m1 = Matrix(1, 0, 0, 1, morph[0].x + self.x, height - morph[0].y - self.y)
        mat = ~m1 * morph[1] * m1
        cm = _format_g(JM_TUPLE(mat)) + " cm\n"
    else:
        cm = ""
    top = height - point.y - self.y  # start of 1st char
    left = point.x + self.x  # start of 1. char
    space = top  # space available
    #headroom = point.y + self.y  # distance to page border
    if rot == 90:
        left = height - point.y - self.y
        top = -point.x - self.x
        cm += cmp90
        space = width - abs(top)
        #headroom = point.x + self.x

    elif rot == 270:
        left = -height + point.y + self.y
        top = point.x + self.x
        cm += cmm90
        space = abs(top)
        #headroom = width - point.x - self.x

    elif rot == 180:
        left = -point.x - self.x
        top = -height + point.y + self.y
        cm += cm180
        space = abs(point.y + self.y)
        #headroom = height - point.y - self.y

    optcont = self.page._get_optional_content(oc)
    if optcont is not None:
        bdc = "/OC /%s BDC\n" % optcont
        emc = "EMC\n"
    else:
        bdc = emc = ""

    alpha = self.page._set_opacity(CA=stroke_opacity, ca=fill_opacity)
    if alpha is None:
        alpha = ""
    else:
        alpha = "/%s gs\n" % alpha
    nres = templ1(bdc, alpha, cm, left, top, fname, fontsize)

    if render_mode > 0:
        nres += "%i Tr " % render_mode
        nres += _format_g(border_width * fontsize) + " w "
        if miter_limit is not None:
            nres += _format_g(miter_limit) + " M "
    if color is not None:
        nres += color_str
    if fill is not None:
        nres += fill_str

    # =========================================================================
    #   start text insertion
    # =========================================================================
    nres += text[0]
    nlines = 1  # set output line counter
    if len(text) > 1:
        nres += templ2(lheight)  # line 1
    else:
        nres += 'TJ'
    for i in range(1, len(text)):
        if space < lheight:
            break  # no space left on page
        if i > 1:
            nres += "\nT* "
        nres += text[i] + 'TJ'
        space -= lheight
        nlines += 1

    nres += "\nET\n%sQ\n" % emc

    # =========================================================================
    #   end of text insertion
    # =========================================================================
    # update the /Contents object
    self.text_cont += nres
    return nlines
# ==============================================================================
# Shape.insert_textbox
# ==============================================================================
def insert_textbox(
        self,
        rect: rect_like,
        buffer: typing.Union[str, list],
        *,
        fontname: OptStr = "helv",
        fontfile: OptStr = None,
        fontsize: float = 11,
        lineheight: OptFloat = None,
        set_simple: bool = 0,
        encoding: int = 0,
        color: OptSeq = None,
        fill: OptSeq = None,
        expandtabs: int = 1,
        border_width: float = 0.05,
        miter_limit: float = 1,
        align: int = 0,
        render_mode: int = 0,
        rotate: int = 0,
        morph: OptSeq = None,
        stroke_opacity: float = 1,
        fill_opacity: float = 1,
        oc: int = 0,
    ) -> float:
    """Insert text into a given rectangle.

    The text is wrapped at spaces to the available line width; a word
    longer than one line is split character by character. Output is
    appended to `self.text_cont` only if all lines fit.

    Args:
        rect -- the textbox to fill
        buffer -- text to be inserted (string, or list / tuple of strings
            which are joined with line breaks)
        fontname -- a Base-14 font, font name or '/name'
        fontfile -- name of a font file
        fontsize -- font size
        lineheight -- overwrite the font property
        set_simple, encoding -- passed on to page.insert_font()
        color -- RGB stroke color triple
        fill -- RGB fill color triple; if None and render_mode is 0,
            `color` is used
        render_mode -- text rendering control
        border_width -- thickness of glyph borders as percentage of fontsize
        miter_limit -- value of the "M" operator, skipped when None
        expandtabs -- handles tabulators with string function
        align -- 0: left, 1: center, 2: right, 3: justified
        rotate -- 0, 90, 180, or 270 degrees
        morph -- morph box with a matrix and a fixpoint
        stroke_opacity, fill_opacity -- passed to page._set_opacity()
        oc -- passed to page._get_optional_content()
    Returns:
        A float measured in the direction of line progress. A value >= 0
        is the unused space (text height minus available height, absolute
        value). A negative value is the deficit; nothing is output then.
        For an empty `buffer` this is the rect height (rotate 0 / 180) or
        the rect width.
    Raises:
        ValueError -- if `rect` is empty or infinite, or `rotate` is not
            a multiple of 90.
    """
    rect = Rect(rect)
    if rect.is_empty or rect.is_infinite:
        raise ValueError("text box must be finite and not empty")

    color_str = ColorCode(color, "c")
    fill_str = ColorCode(fill, "f")
    if fill is None and render_mode == 0:  # ensure fill color for 0 Tr
        fill = color
        fill_str = ColorCode(color, "f")

    optcont = self.page._get_optional_content(oc)
    if optcont is not None:
        bdc = "/OC /%s BDC\n" % optcont
        emc = "EMC\n"
    else:
        bdc = emc = ""

    # determine opacity / transparency
    alpha = self.page._set_opacity(CA=stroke_opacity, ca=fill_opacity)
    if alpha is None:
        alpha = ""
    else:
        alpha = "/%s gs\n" % alpha

    if rotate % 90 != 0:
        raise ValueError("rotate must be multiple of 90")

    rot = rotate
    while rot < 0:
        rot += 360
    rot = rot % 360

    # is buffer worth of dealing with?
    if not bool(buffer):
        return rect.height if rot in (0, 180) else rect.width

    cmp90 = "0 1 -1 0 0 0 cm\n"  # rotates counter-clockwise
    cmm90 = "0 -1 1 0 0 0 cm\n"  # rotates clockwise
    cm180 = "-1 0 0 -1 0 0 cm\n"  # rotates by 180 deg.
    height = self.height

    fname = fontname
    if fname.startswith("/"):
        fname = fname[1:]

    xref = self.page.insert_font(
        fontname=fname, fontfile=fontfile, encoding=encoding, set_simple=set_simple
    )
    fontinfo = CheckFontInfo(self.doc, xref)

    fontdict = fontinfo[1]
    ordering = fontdict["ordering"]
    simple = fontdict["simple"]
    glyphs = fontdict["glyphs"]
    bfname = fontdict["name"]
    ascender = fontdict["ascender"]
    descender = fontdict["descender"]

    if lineheight:
        lheight_factor = lineheight
    elif ascender - descender <= 1:
        lheight_factor = 1.2
    else:
        lheight_factor = ascender - descender
    lheight = fontsize * lheight_factor

    # create a list from buffer, split into its lines
    if type(buffer) in (list, tuple):
        t0 = "\n".join(buffer)
    else:
        t0 = buffer

    # NOTE(review): max() raises ValueError if t0 is an empty string,
    # e.g. for buffer == [""], which passes the bool(buffer) test above.
    maxcode = max([ord(c) for c in t0])
    # replace invalid char codes for simple fonts
    if simple and maxcode > 255:
        t0 = "".join([c if ord(c) < 256 else "?" for c in t0])

    t0 = t0.splitlines()

    glyphs = self.doc.get_char_widths(xref, maxcode + 1)
    if simple and bfname not in ("Symbol", "ZapfDingbats"):
        tj_glyphs = None
    else:
        tj_glyphs = glyphs

    # ----------------------------------------------------------------------
    # calculate pixel length of a string
    # ----------------------------------------------------------------------
    def pixlen(x):
        """Calculate pixel length of x."""
        if ordering < 0:
            return sum([glyphs[ord(c)][1] for c in x]) * fontsize
        else:
            return len(x) * fontsize

    # ---------------------------------------------------------------------

    if ordering < 0:
        blen = glyphs[32][1] * fontsize  # pixel size of space character
    else:
        blen = fontsize

    text = ""  # output buffer

    if CheckMorph(morph):
        m1 = Matrix(
            1, 0, 0, 1, morph[0].x + self.x, self.height - morph[0].y - self.y
        )
        mat = ~m1 * morph[1] * m1
        cm = _format_g(JM_TUPLE(mat)) + " cm\n"
    else:
        cm = ""

    # ---------------------------------------------------------------------
    # adjust for text orientation / rotation
    # ---------------------------------------------------------------------
    progr = 1  # direction of line progress
    c_pnt = Point(0, fontsize * ascender)  # used for line progress
    if rot == 0:  # normal orientation
        point = rect.tl + c_pnt  # line 1 is 'lheight' below top
        maxwidth = rect.width  # pixels available in one line
        maxheight = rect.height  # available text height

    elif rot == 90:  # rotate counter clockwise
        c_pnt = Point(fontsize * ascender, 0)  # progress in x-direction
        point = rect.bl + c_pnt  # line 1 'lheight' away from left
        maxwidth = rect.height  # pixels available in one line
        maxheight = rect.width  # available text height
        cm += cmp90

    elif rot == 180:  # text upside down
        # progress upwards in y direction
        c_pnt = -Point(0, fontsize * ascender)
        point = rect.br + c_pnt  # line 1 'lheight' above bottom
        maxwidth = rect.width  # pixels available in one line
        progr = -1  # subtract lheight for next line
        maxheight =rect.height  # available text height
        cm += cm180

    else:  # rotate clockwise (270 or -90)
        # progress from right to left
        c_pnt = -Point(fontsize * ascender, 0)
        point = rect.tr + c_pnt  # line 1 'lheight' left of right
        maxwidth = rect.height  # pixels available in one line
        progr = -1  # subtract lheight for next line
        maxheight = rect.width  # available text height
        cm += cmm90

    # =====================================================================
    # line loop
    # =====================================================================
    just_tab = []  # 'justify' indicators per line

    for i, line in enumerate(t0):
        line_t = line.expandtabs(expandtabs).split(" ")  # split into words
        num_words = len(line_t)
        lbuff = ""  # init line buffer
        rest = maxwidth  # available line pixels
        # =================================================================
        # word loop
        # =================================================================
        for j in range(num_words):
            word = line_t[j]
            pl_w = pixlen(word)  # pixel len of word
            if rest >= pl_w:  # does it fit on the line?
                lbuff += word + " "  # yes, append word
                rest -= pl_w + blen  # update available line space
                continue  # next word

            # word doesn't fit - output line (if not empty)
            if lbuff:
                lbuff = lbuff.rstrip() + "\n"  # line full, append line break
                text += lbuff  # append to total text
                just_tab.append(True)  # can align-justify

            lbuff = ""  # re-init line buffer
            rest = maxwidth  # re-init avail. space

            if pl_w <= maxwidth:  # word shorter than 1 line?
                lbuff = word + " "  # start the line with it
                rest = maxwidth - pl_w - blen  # update free space
                continue

            # long word: split across multiple lines - char by char ...
            if len(just_tab) > 0:
                just_tab[-1] = False  # cannot align-justify
            for c in word:
                if pixlen(lbuff) <= maxwidth - pixlen(c):
                    lbuff += c
                else:  # line full
                    lbuff += "\n"  # close line
                    text += lbuff  # append to text
                    just_tab.append(False)  # cannot align-justify
                    lbuff = c  # start new line with this char

            lbuff += " "  # finish long word
            rest = maxwidth - pixlen(lbuff)  # long word stored

        if lbuff:  # unprocessed line content?
            text += lbuff.rstrip()  # append to text
            just_tab.append(False)  # cannot align-justify

        if i < len(t0) - 1:  # not the last line?
            text += "\n"  # insert line break

    # compute used part of the textbox
    if text.endswith("\n"):
        text = text[:-1]
    lb_count = text.count("\n") + 1  # number of lines written

    # text height = line count * line height plus one descender value
    text_height = lheight * lb_count - descender * fontsize

    more = text_height - maxheight  # difference to height limit
    if more > EPSILON:  # landed too much outside rect
        return (-1) * more  # return deficit, don't output

    more = abs(more)
    if more < EPSILON:
        more = 0  # don't bother with epsilons
    nres = "\nq\n%s%sBT\n" % (bdc, alpha) + cm  # initialize output buffer
    # per-line prefix: text matrix (position) followed by font selection
    templ = lambda a, b, c, d: f"1 0 0 1 {_format_g((a, b))} Tm /{c} {_format_g(d)} Tf "
    # center, right, justify: output each line with its own specifics
    text_t = text.splitlines()  # split text in lines again
    just_tab[-1] = False  # never justify last line
    for i, t in enumerate(text_t):
        spacing = 0
        pl = maxwidth - pixlen(t)  # length of empty line part
        pnt = point + c_pnt * (i * lheight_factor)  # text start of line
        if align == 1:  # center: right shift by half width
            if rot in (0, 180):
                pnt = pnt + Point(pl / 2, 0) * progr
            else:
                pnt = pnt - Point(0, pl / 2) * progr
        elif align == 2:  # right: right shift by full width
            if rot in (0, 180):
                pnt = pnt + Point(pl, 0) * progr
            else:
                pnt = pnt - Point(0, pl) * progr
        elif align == 3:  # justify
            spaces = t.count(" ")  # number of spaces in line
            if spaces > 0 and just_tab[i]:  # if any, and we may justify
                spacing = pl / spaces  # make every space this much larger
            else:
                spacing = 0  # keep normal space length
        # translate the line start into the coordinates used by "Tm"
        top = height - pnt.y - self.y
        left = pnt.x + self.x
        if rot == 90:
            left = height - pnt.y - self.y
            top = -pnt.x - self.x
        elif rot == 270:
            left = -height + pnt.y + self.y
            top = pnt.x + self.x
        elif rot == 180:
            left = -pnt.x - self.x
            top = -height + pnt.y + self.y

        nres += templ(left, top, fname, fontsize)

        if render_mode > 0:
            nres += "%i Tr " % render_mode
            nres += _format_g(border_width * fontsize) + " w "
            if miter_limit is not None:
                nres += _format_g(miter_limit) + " M "
        if align == 3:
            nres += _format_g(spacing) + " Tw "

        if color is not None:
            nres += color_str
        if fill is not None:
            nres += fill_str
        nres += "%sTJ\n" % getTJstr(t, tj_glyphs, simple, ordering)

    nres += "ET\n%sQ\n" % emc

    self.text_cont += nres
    self.updateRect(rect)
    return more
def add_header_ids(self):
    '''
    Give every `<h1>`..`<h6>` element of the story body an `id` attribute.

    Elements that already have a non-empty `id` are left alone. The
    others receive `h_id_<n>`, where `<n>` counts the ids assigned so
    far, starting at 0.
    '''
    def _elements(first):
        # Yield nodes in find() / find_next() order until one is falsy.
        current = first
        while current:
            yield current
            current = current.find_next(None, None, None)

    assigned = 0
    for node in _elements(self.body.find(None, None, None)):
        tag = node.tagname
        is_heading = len(tag) == 2 and tag[0] == "h" and tag[1] in "123456"
        if not is_heading:
            continue
        if node.get_attribute_value("id"):
            continue  # keep the existing id
        node.set_attribute("id", f"h_id_{assigned}")
        assigned += 1
target_id = position_from.href[1:] try: position_to = id_to_position[ target_id] except Exception as e: if g_exceptions_verbose > 1: exception_info() raise RuntimeError(f"No destination with id={target_id}, required by position_from: {position_from}") from e # Make link from `position_from`'s rect to top-left of # `position_to`'s rect. if 0: log(f"add_pdf_links(): making link from:") log(f"add_pdf_links(): {position_from}") log(f"add_pdf_links(): to:") log(f"add_pdf_links(): {position_to}") link["kind"] = LINK_GOTO x0, y0, x1, y1 = position_to.rect # This appears to work well with viewers which scroll # to make destination point top-left of window. link["to"] = Point(x0, y0) link["page"] = position_to.page_num - 1 else: # `<a href="...">...</a>` external link. if position_from.href.startswith('name:'): link['kind'] = LINK_NAMED link['name'] = position_from.href[5:] else: link['kind'] = LINK_URI link['uri'] = position_from.href #log(f'Adding link: {position_from.page_num=} {link=}.') document[position_from.page_num - 1].insert_link(link) return document @property def body(self): dom = self.document() return dom.bodytag() def document( self): dom = mupdf.fz_story_document( self.this) return Xml( dom) def draw( self, device, matrix=None): ctm2 = JM_matrix_from_py( matrix) dev = device.this if device else mupdf.FzDevice( None) mupdf.fz_draw_story( self.this, dev, ctm2) def element_positions( self, function, args=None): ''' Trigger a callback function to record where items have been placed. 
def place(self, where, flags=0):
    '''
    Lay the story out into rectangle `where` using
    fz_place_story_flags().

    Returns `(more, filled)`: `more` is the value returned by MuPDF and
    `filled` is the rectangle MuPDF reported back, converted with
    JM_py_from_rect().
    '''
    target = JM_rect_from_py(where)
    # Output argument: receives the area reported by MuPDF.
    used = mupdf.FzRect()
    more = mupdf.fz_place_story_flags(self.this, target, used, flags)
    return more, JM_py_from_rect(used)
@staticmethod
def write_stabilized(writer, contentfn, rectfn, user_css=None, em=12, positionfn=None, pagefn=None, archive=None, add_header_ids=True):
    '''
    Repeatedly lays out a story until its content stops changing, then
    writes it.

    Each pass calls `contentfn(positions)` with the positions recorded
    in the previous pass (an empty list the first time), builds a new
    `Story` from the returned content and runs `Story.write()`. A pass
    is "stable" when the content equals that of the previous pass; only
    the stable pass is given `writer` and forwards positions to
    `positionfn`. Earlier passes run with a `None` writer.

    Args:
        writer: passed to `Story.write()` on the stable pass.
        contentfn: callable taking the list of positions and returning
            the story content.
        rectfn, pagefn: passed through to `Story.write()`.
        user_css, em, archive: passed through to `Story()`.
        positionfn: optional callable invoked for each position of the
            stable pass.
        add_header_ids: if true, `Story.add_header_ids()` is called on
            each story before writing.
    '''
    positions = []
    previous = None
    while True:
        content = contentfn(positions)
        stable = bool(content == previous)
        previous = content

        story = Story(content, user_css, em, archive)
        if add_header_ids:
            story.add_header_ids()

        # Start a fresh record for this pass; it is handed to
        # `contentfn` on the next one.
        positions = []

        def collect(position):
            positions.append(position)
            if stable and positionfn:
                positionfn(position)

        story.write(writer if stable else None, rectfn, collect, pagefn)
        if stable:
            return
class FitResult:
    '''
    The result from a `Story.fit*()` method.

    Members:

    `big_enough`:
        `True` if the fit succeeded.
    `filled`:
        Tuple (x0, y0, x1, y1) from the last call to `Story.place()`.
        This will be wider than .rect if any single word (which we
        never split) was too wide for .rect.
    `more`:
        `False` if the fit succeeded.
    `numcalls`:
        Number of calls made to `self.place()`.
    `parameter`:
        The successful parameter value, or the largest failing value.
    `rect`:
        The pymupdf.Rect created from `parameter`.
    '''
    def __init__(self, big_enough=None, filled=None, more=None, numcalls=None, parameter=None, rect=None):
        self.big_enough, self.filled, self.more = big_enough, filled, more
        self.numcalls, self.parameter, self.rect = numcalls, parameter, rect

    def __repr__(self):
        # One ' name=value' chunk per member, in declaration order.
        names = ('big_enough', 'filled', 'more', 'numcalls', 'parameter', 'rect')
        return ''.join(f' {name}={getattr(self, name)}' for name in names)
''' def log(text): assert verbose message(f'fit(): {text}') assert isinstance(pmin, (int, float)) or pmin is None assert isinstance(pmax, (int, float)) or pmax is None class State: def __init__(self): self.pmin = pmin self.pmax = pmax self.pmin_result = None self.pmax_result = None self.result = None self.numcalls = 0 if verbose: self.pmin0 = pmin self.pmax0 = pmax state = State() if verbose: log(f'starting. {state.pmin=} {state.pmax=}.') self.reset() def ret(): if state.pmax is not None: if state.last_p != state.pmax: if verbose: log(f'Calling update() with pmax, because was overwritten by later calls.') big_enough = update(state.pmax) assert big_enough result = state.pmax_result else: result = state.pmin_result if state.pmin_result else Story.FitResult(numcalls=state.numcalls) if verbose: log(f'finished. {state.pmin0=} {state.pmax0=} {state.pmax=}: returning {result=}') return result def update(parameter): ''' Evaluates `more, _ = self.place(fn(parameter))`. If `more` is false, then `rect` is big enough to contain `self` and we set `state.pmax=parameter` and return True. Otherwise we set `state.pmin=parameter` and return False. 
''' rect = fn(parameter) assert isinstance(rect, Rect), f'{type(rect)=} {rect=}' if rect.is_empty: big_enough = False result = Story.FitResult(parameter=parameter, numcalls=state.numcalls) if verbose: log(f'update(): not calling self.place() because rect is empty.') else: more, filled = self.place(rect, flags) state.numcalls += 1 big_enough = not more result = Story.FitResult( filled=filled, more=more, numcalls=state.numcalls, parameter=parameter, rect=rect, big_enough=big_enough, ) if verbose: log(f'update(): called self.place(): {state.numcalls:>2d}: {more=} {parameter=} {rect=}.') if big_enough: state.pmax = parameter state.pmax_result = result else: state.pmin = parameter state.pmin_result = result state.last_p = parameter return big_enough def opposite(p, direction): ''' Returns same sign as `direction`, larger or smaller than `p` if direction is positive or negative respectively. ''' if p is None or p==0: return direction if direction * p > 0: return 2 * p return -p if state.pmin is None: # Find an initial finite pmin value. if verbose: log(f'finding pmin.') parameter = opposite(state.pmax, -1) while 1: if not update(parameter): break parameter *= 2 else: if update(state.pmin): if verbose: log(f'{state.pmin=} is big enough.') return ret() if state.pmax is None: # Find an initial finite pmax value. if verbose: log(f'finding pmax.') parameter = opposite(state.pmin, +1) while 1: if update(parameter): break parameter *= 2 else: if not update(state.pmax): # No solution possible. state.pmax = None if verbose: log(f'No solution possible {state.pmax=}.') return ret() # Do binary search in pmin..pmax. 
def fit_scale(self, rect, scale_min=0, scale_max=None, delta=0.001, verbose=False, flags=0):
    '''
    Finds smallest value `scale` in range `scale_min..scale_max` where
    `scale * rect` is large enough to contain the story `self`.

    Returns a `Story.FitResult` instance with `.parameter` set to
    `scale`.

    :arg rect:
        `(x0, y0, x1, y1)`. The top-left corner stays fixed; width and
        height are both multiplied by the scale being tried.
    :arg scale_min:
        Minimum scale to consider; must be >= 0.
    :arg scale_max:
        Maximum scale to consider, must be >= scale_min or `None` for
        infinite.
    :arg delta:
        Maximum error in returned scale.
    :arg verbose:
        If true we output diagnostics.
    :arg flags:
        Passed to Story.place().
    '''
    left, top, right, bottom = rect
    base_width = right - left
    base_height = bottom - top
    return self.fit(
            lambda scale: Rect(left, top, left + scale*base_width, top + scale*base_height),
            scale_min,
            scale_max,
            delta,
            verbose,
            flags,
            )
def __init__(self, *args):
    '''
    Create a TextPage either from a `mupdf.FzRect` mediabox (a new
    `mupdf.FzStextPage` is created for it) or by wrapping an existing
    `mupdf.FzStextPage`.

    Raises `Exception` for any other argument combination.
    '''
    if args_match(args, mupdf.FzRect):
        stext_page = mupdf.FzStextPage(args[0])
    elif args_match(args, mupdf.FzStextPage):
        stext_page = args[0]
    else:
        raise Exception(f'Unrecognised args: {args}')
    self.this = stext_page
    self.thisown = True
    self.parent = None
def _textpage_dict(self, raw=False):
    """Return a dict seeded with the page 'width' and 'height' and then
    filled in by `_getNewBlockList()`; `raw` is passed through to it."""
    bounds = self.rect
    result = dict(width=bounds.width, height=bounds.height)
    self._getNewBlockList(result, raw)
    return result
def extractDICT(self, cb=None, sort=False) -> dict:
    """Return page content as a Python dict of images and text spans.

    Args:
        cb: optional object with `.width` and `.height`; when given,
            these replace the dict's "width" and "height" entries.
        sort: if true, sort the "blocks" list in place by the bbox
            bottom coordinate, then by the bbox left coordinate.
    """
    page = self._textpage_dict(raw=False)
    if cb is not None:
        page.update(width=cb.width, height=cb.height)
    if sort:
        page["blocks"].sort(key=lambda blk: (blk["bbox"][3], blk["bbox"][0]))
    return page
def extractText(self, sort=False) -> str:
    """Return simple, bare text on the page.

    With `sort` false the text comes straight from `_extractText(0)`.
    Otherwise the items of `extractBLOCKS()` are ordered by their
    bottom coordinate, then their left coordinate, and their text
    fields are concatenated.
    """
    if sort:
        ordered = sorted(self.extractBLOCKS(), key=lambda blk: (blk[3], blk[0]))
        return "".join(blk[4] for blk in ordered)
    return self._extractText(0)
def search(self, needle, hit_max=0, quads=1):
    """Locate 'needle' returning rects or quads.

    Args:
        needle: text to look for.
        hit_max: accepted but not used by this implementation.
        quads: if true return `Quad` objects, otherwise rectangles in
            which overlapping neighbours on the same line are merged.
    """
    hits = JM_search_stext_page(self.this, needle)
    if not hits:
        return hits

    # Convert the raw entries in place to quads or to their rects.
    for idx, raw in enumerate(hits):
        quad = Quad(raw)
        hits[idx] = quad if quads else quad.rect
    if quads:
        return hits

    # Join neighbouring rects which share a bottom edge and overlap.
    idx = 0
    while idx < len(hits) - 1:
        first, second = hits[idx], hits[idx + 1]
        if first.y1 == second.y1 and not (first & second).is_empty:
            hits[idx] = first | second  # join rectangles
            del hits[idx + 1]           # drop the absorbed neighbour
        else:
            idx += 1                    # no overlap on same line
    return hits
def clean_rtl(self, text):
    """Revert the sequence of Latin text parts.

    Text with right-to-left writing direction (Arabic, Hebrew) often
    contains Latin parts, which are written in left-to-right: numbers,
    names, etc. For output as PDF text we need *everything* in
    right-to-left.
    E.g. an input like "<arabic> ABCDE FG HIJ <arabic> KL <arabic>" will
    be converted to "<arabic> JIH GF EDCBA <arabic> LK <arabic>". The
    Arabic parts remain untouched.

    A word counts as Latin when it has at least two characters and all
    of its code points are <= 255. Each Latin word has its characters
    reversed, and every run of two or more adjacent Latin words also has
    its word order reversed - including a run that ends the text.

    Args:
        text: str
    Returns:
        Massaged string.
    """
    if not text:
        return text
    # split into words at space boundaries
    words = text.split(" ")

    # revert character sequence for Latin only words, remembering where
    # they are
    latin = []
    for i, w in enumerate(words):
        if len(w) >= 2 and max(ord(c) for c in w) <= 255:
            words[i] = w[::-1]
            latin.append(i)

    def flush(run):
        # adjacent Latin words must revert their sequence, too
        if len(run) > 1:
            lo, hi = run[0], run[-1] + 1
            words[lo:hi] = reversed(words[lo:hi])

    run = []  # indices of the current run of adjacent Latin words
    for i in latin:
        if run and i != run[-1] + 1:  # gap: the current run is complete
            flush(run)
            run = []
        run.append(i)
    # The final run has no following gap to trigger its reversal, so it
    # must be handled explicitly.
    flush(run)

    return " ".join(words)
lineheight: overwrite the font property align: (int) 0 = left, 1 = center, 2 = right, 3 = justify warn: (bool) text overflow action: none, warn, or exception right_to_left: (bool) indicate right-to-left language. """ rect = Rect(rect) if rect.is_empty: raise ValueError("fill rect must not empty.") if type(font) is not Font: font = Font("helv") def textlen(x): """Return length of a string.""" return font.text_length( x, fontsize=fontsize, small_caps=small_caps ) # abbreviation def char_lengths(x): """Return list of single character lengths for a string.""" return font.char_lengths(x, fontsize=fontsize, small_caps=small_caps) def append_this(pos, text): ret = writer.append( pos, text, font=font, fontsize=fontsize, small_caps=small_caps ) return ret tolerance = fontsize * 0.2 # extra distance to left border space_len = textlen(" ") std_width = rect.width - tolerance std_start = rect.x0 + tolerance def norm_words(width, words): """Cut any word in pieces no longer than 'width'.""" nwords = [] word_lengths = [] for w in words: wl_lst = char_lengths(w) wl = sum(wl_lst) if wl <= width: # nothing to do - copy over nwords.append(w) word_lengths.append(wl) continue # word longer than rect width - split it in parts n = len(wl_lst) while n > 0: wl = sum(wl_lst[:n]) if wl <= width: nwords.append(w[:n]) word_lengths.append(wl) w = w[n:] wl_lst = wl_lst[n:] n = len(wl_lst) else: n -= 1 return nwords, word_lengths def output_justify(start, line): """Justified output of a line.""" # ignore leading / trailing / multiple spaces words = [w for w in line.split(" ") if w != ""] nwords = len(words) if nwords == 0: return if nwords == 1: # single word cannot be justified append_this(start, words[0]) return tl = sum([textlen(w) for w in words]) # total word lengths gaps = nwords - 1 # number of word gaps gapl = (std_width - tl) / gaps # width of each gap for w in words: _, lp = append_this(start, w) # output one word start.x = lp.x + gapl # next start at word end plus gap return asc = 
font.ascender dsc = font.descender if not lineheight: if asc - dsc <= 1: lheight = 1.2 else: lheight = asc - dsc else: lheight = lineheight LINEHEIGHT = fontsize * lheight # effective line height width = std_width # available horizontal space # starting point of text if pos is not None: pos = Point(pos) else: # default is just below rect top-left pos = rect.tl + (tolerance, fontsize * asc) if pos not in rect: raise ValueError("Text must start in rectangle.") # calculate displacement factor for alignment if align == TEXT_ALIGN_CENTER: factor = 0.5 elif align == TEXT_ALIGN_RIGHT: factor = 1.0 else: factor = 0 # split in lines if just a string was given if type(text) is str: textlines = text.splitlines() else: textlines = [] for line in text: textlines.extend(line.splitlines()) max_lines = int((rect.y1 - pos.y) / LINEHEIGHT) + 1 new_lines = [] # the final list of textbox lines no_justify = [] # no justify for these line numbers for i, line in enumerate(textlines): if line in ("", " "): new_lines.append((line, space_len)) width = rect.width - tolerance no_justify.append((len(new_lines) - 1)) continue if i == 0: width = rect.x1 - pos.x else: width = rect.width - tolerance if right_to_left: # reverses Arabic / Hebrew text front to back line = writer.clean_rtl(line) tl = textlen(line) if tl <= width: # line short enough new_lines.append((line, tl)) no_justify.append((len(new_lines) - 1)) continue # we need to split the line in fitting parts words = line.split(" ") # the words in the line # cut in parts any words that are longer than rect width words, word_lengths = norm_words(width, words) n = len(words) while True: line0 = " ".join(words[:n]) wl = sum(word_lengths[:n]) + space_len * (n - 1) if wl <= width: new_lines.append((line0, wl)) words = words[n:] word_lengths = word_lengths[n:] n = len(words) line0 = None else: n -= 1 if len(words) == 0: break assert n # ------------------------------------------------------------------------- # List of lines created. 
Each item is (text, tl), where 'tl' is the PDF # output length (float) and 'text' is the text. Except for justified text, # this is output-ready. # ------------------------------------------------------------------------- nlines = len(new_lines) if nlines > max_lines: msg = "Only fitting %i of %i lines." % (max_lines, nlines) if warn is None: pass elif warn: message("Warning: " + msg) else: raise ValueError(msg) start = Point() no_justify += [len(new_lines) - 1] # no justifying of last line for i in range(max_lines): try: line, tl = new_lines.pop(0) except IndexError: if g_exceptions_verbose >= 2: exception_info() break if right_to_left: # Arabic, Hebrew line = "".join(reversed(line)) if i == 0: # may have different start for first line start = pos if align == TEXT_ALIGN_JUSTIFY and i not in no_justify and tl < std_width: output_justify(start, line) start.x = std_start start.y += LINEHEIGHT continue if i > 0 or pos.x == std_start: # left, center, right alignments start.x += (width - tl) * factor append_this(start, line) start.x = std_start start.y += LINEHEIGHT return new_lines # return non-written lines def write_text(self, page, color=None, opacity=-1, overlay=1, morph=None, matrix=None, render_mode=0, oc=0): """Write the text to a PDF page having the TextWriter's page size. Args: page: a PDF page having same size. color: override text color. opacity: override transparency. overlay: put in foreground or background. morph: tuple(Point, Matrix), apply a matrix with a fixpoint. matrix: Matrix to be used instead of 'morph' argument. render_mode: (int) PDF render mode operator 'Tr'. 
""" CheckParent(page) if abs(self.rect - page.rect) > 1e-3: raise ValueError("incompatible page rect") if morph is not None: if (type(morph) not in (tuple, list) or type(morph[0]) is not Point or type(morph[1]) is not Matrix ): raise ValueError("morph must be (Point, Matrix) or None") if matrix is not None and morph is not None: raise ValueError("only one of matrix, morph is allowed") if getattr(opacity, "__float__", None) is None or opacity == -1: opacity = self.opacity if color is None: color = self.color if 1: pdfpage = page._pdf_page() alpha = 1 if opacity >= 0 and opacity < 1: alpha = opacity ncol = 1 dev_color = [0, 0, 0, 0] if color: ncol, dev_color = JM_color_FromSequence(color) if ncol == 3: colorspace = mupdf.fz_device_rgb() elif ncol == 4: colorspace = mupdf.fz_device_cmyk() else: colorspace = mupdf.fz_device_gray() resources = mupdf.pdf_new_dict(pdfpage.doc(), 5) contents = mupdf.fz_new_buffer(1024) dev = mupdf.pdf_new_pdf_device( pdfpage.doc(), mupdf.FzMatrix(), resources, contents) #log( '=== {dev_color!r=}') mupdf.fz_fill_text( dev, self.this, mupdf.FzMatrix(), colorspace, dev_color, alpha, mupdf.FzColorParams(mupdf.fz_default_color_params), ) mupdf.fz_close_device( dev) # copy generated resources into the one of the page max_nums = JM_merge_resources( pdfpage, resources) cont_string = JM_EscapeStrFromBuffer( contents) result = (max_nums, cont_string) val = result max_nums = val[0] content = val[1] max_alp, max_font = max_nums old_cont_lines = content.splitlines() optcont = page._get_optional_content(oc) if optcont is not None: bdc = "/OC /%s BDC" % optcont emc = "EMC" else: bdc = emc = "" new_cont_lines = ["q"] if bdc: new_cont_lines.append(bdc) cb = page.cropbox_position if page.rotation in (90, 270): delta = page.rect.height - page.rect.width else: delta = 0 mb = page.mediabox if bool(cb) or mb.y0 != 0 or delta != 0: new_cont_lines.append(f"1 0 0 1 {_format_g((cb.x, cb.y + mb.y0 - delta))} cm") if morph: p = morph[0] * self.ictm delta = Matrix(1, 
class IRect:
    """
    Rectangle described by four coordinates x0, y0, x1, y1.

    Accepted constructor forms:

    IRect() - all zeros
    IRect(x0, y0, x1, y1) - 4 coordinates
    IRect(top-left, x1, y1) - point and 2 coordinates
    IRect(x0, y0, bottom-right) - 2 coordinates and point
    IRect(top-left, bottom-right) - 2 points
    IRect(sequ) - new from sequence or rect-like

    Arithmetic and set-like operators delegate to the corresponding
    Rect method and call .round() on the result.
    """

    # ------------------------------------------------------------------
    # construction and sequence protocol
    # ------------------------------------------------------------------
    def __init__(self, *args, p0=None, p1=None, x0=None, y0=None, x1=None, y1=None):
        # Argument parsing is done entirely by util_make_irect().
        corners = util_make_irect(
                *args, p0=p0, p1=p1, x0=x0, y0=y0, x1=x1, y1=y1)
        self.x0, self.y0, self.x1, self.y1 = corners

    def __len__(self):
        return 4

    def __getitem__(self, i):
        coords = (self.x0, self.y0, self.x1, self.y1)
        return coords[i]

    def __setitem__(self, i, v):
        # The value is converted first, so a bad value fails before a bad
        # index does.
        value = int(v)
        for position, attr in enumerate(("x0", "y0", "x1", "y1")):
            if i == position:
                setattr(self, attr, value)
                return None
        raise IndexError("index out of range")

    def __repr__(self):
        return "IRect" + str(tuple(self))

    def __hash__(self):
        return hash(tuple(self))

    def __eq__(self, r):
        # Anything without a length can never be equal to a rectangle.
        if not hasattr(r, "__len__"):
            return False
        if len(r) != 4:
            return False
        return self.x0 == r[0] and self.y0 == r[1] and self.x1 == r[2] and self.y1 == r[3]

    # ------------------------------------------------------------------
    # operators (delegating to Rect)
    # ------------------------------------------------------------------
    def __pos__(self):
        return IRect(self)

    def __neg__(self):
        return IRect(-self.x0, -self.y0, -self.x1, -self.y1)

    def __add__(self, p):
        result = Rect.__add__(self, p)
        return result.round()

    def __sub__(self, p):
        result = Rect.__sub__(self, p)
        return result.round()

    def __mul__(self, m):
        result = Rect.__mul__(self, m)
        return result.round()

    def __truediv__(self, m):
        result = Rect.__truediv__(self, m)
        return result.round()

    def __and__(self, x):
        result = Rect.__and__(self, x)
        return result.round()

    def __or__(self, x):
        result = Rect.__or__(self, x)
        return result.round()

    def __contains__(self, x):
        return Rect.__contains__(self, x)

    # ------------------------------------------------------------------
    # corners and dimensions
    # ------------------------------------------------------------------
    @property
    def top_left(self):
        """Top-left corner."""
        return Point(self.x0, self.y0)

    @property
    def top_right(self):
        """Top-right corner."""
        return Point(self.x1, self.y0)

    @property
    def bottom_left(self):
        """Bottom-left corner."""
        return Point(self.x0, self.y1)

    @property
    def bottom_right(self):
        """Bottom-right corner."""
        return Point(self.x1, self.y1)

    @property
    def width(self):
        """Horizontal extent, never negative."""
        return max(0, self.x1 - self.x0)

    @property
    def height(self):
        """Vertical extent, never negative."""
        return max(0, self.y1 - self.y0)

    # ------------------------------------------------------------------
    # state predicates
    # ------------------------------------------------------------------
    @property
    def is_empty(self):
        """True if rectangle area is empty."""
        return self.x0 >= self.x1 or self.y0 >= self.y1

    @property
    def is_infinite(self):
        """True if rectangle is infinite."""
        starts_at_min = self.x0 == self.y0 == FZ_MIN_INF_RECT
        return starts_at_min and self.x1 == self.y1 == FZ_MAX_INF_RECT

    @property
    def is_valid(self):
        """True if rectangle is valid."""
        return self.x0 <= self.x1 and self.y0 <= self.y1

    # ------------------------------------------------------------------
    # conversions
    # ------------------------------------------------------------------
    @property
    def rect(self):
        """Rect built from this rectangle."""
        return Rect(self)

    @property
    def quad(self):
        """Return Quad version of rectangle."""
        return Quad(self.tl, self.tr, self.bl, self.br)

    # ------------------------------------------------------------------
    # methods
    # ------------------------------------------------------------------
    def contains(self, x):
        """Check if x is in the rectangle."""
        return self.__contains__(x)

    def get_area(self, *args) -> float:
        """Calculate area of rectangle.

        The optional first positional argument is the unit: one of 'px'
        (default), 'in', 'cm', or 'mm'.
        """
        unit = args[0] if args else "px"
        # unit -> (numerator, denominator) of the per-axis conversion factor
        factors = {"px": (1, 1), "in": (1.0, 72.0), "cm": (2.54, 72.0), "mm": (25.4, 72.0)}
        numerator, denominator = factors[unit]
        f = (numerator / denominator) ** 2
        return f * self.width * self.height

    def include_point(self, p):
        """Return the IRect of this rectangle's Rect extended by point p."""
        return self.rect.include_point(p).irect

    def include_rect(self, r):
        """Return the IRect of this rectangle's Rect extended by rectangle r."""
        return self.rect.include_rect(r).irect

    def intersect(self, r):
        """Restrict rectangle to intersection with rectangle r."""
        result = Rect.intersect(self, r)
        return result.round()

    def intersects(self, x):
        """Delegate the intersection test to Rect.intersects()."""
        return Rect.intersects(self, x)

    def morph(self, p, m):
        """Morph with matrix-like m and point-like p.

        Returns a new quad.
        """
        if not self.is_infinite:
            return self.quad.morph(p, m)
        return INFINITE_QUAD()

    def norm(self):
        """Square root of the sum of the squared coordinates."""
        return math.sqrt(sum(c * c for c in self))

    def normalize(self):
        """Replace rectangle with its valid version."""
        if self.x1 < self.x0:
            self.x0, self.x1 = self.x1, self.x0
        if self.y1 < self.y0:
            self.y0, self.y1 = self.y1, self.y0
        return self

    def torect(self, r):
        """Return matrix that converts to target rect."""
        r = Rect(r)
        if self.is_infinite or self.is_empty or r.is_infinite or r.is_empty:
            raise ValueError("rectangles must be finite and not empty")
        # move to origin, scale to the target size, move to the target
        to_origin = Matrix(1, 0, 0, 1, -self.x0, -self.y0)
        scale = Matrix(r.width / self.width, r.height / self.height)
        to_target = Matrix(1, 0, 0, 1, r.x0, r.y0)
        return to_origin * scale * to_target

    def transform(self, m):
        """Apply Rect.transform() with m and round the result."""
        result = Rect.transform(self, m)
        return result.round()

    # short aliases for the corner properties
    br = bottom_right
    bl = bottom_left
    tl = top_left
    tr = top_right
for _name, _value in inspect.getmembers(mupdf): if _name.startswith(('PDF_', 'UCDN_SCRIPT_')): if _name.startswith('PDF_ENUM_NAME_'): # Not a simple enum. pass else: #assert not inspect.isroutine(value) #log(f'importing {name}') setattr(_self, _name, _value) #log(f'{getattr( self, name, None)=}') # This is a macro so not preserved in mupdf C++/Python bindings. # PDF_SIGNATURE_DEFAULT_APPEARANCE = (0 | mupdf.PDF_SIGNATURE_SHOW_LABELS | mupdf.PDF_SIGNATURE_SHOW_DN | mupdf.PDF_SIGNATURE_SHOW_DATE | mupdf.PDF_SIGNATURE_SHOW_TEXT_NAME | mupdf.PDF_SIGNATURE_SHOW_GRAPHIC_NAME | mupdf.PDF_SIGNATURE_SHOW_LOGO ) #UCDN_SCRIPT_ADLAM = mupdf.UCDN_SCRIPT_ADLAM #setattr(self, 'UCDN_SCRIPT_ADLAM', mupdf.UCDN_SCRIPT_ADLAM) assert mupdf.UCDN_EAST_ASIAN_H == 1 # Flake8 incorrectly fails next two lines because we've dynamically added # items to self. assert PDF_TX_FIELD_IS_MULTILINE == mupdf.PDF_TX_FIELD_IS_MULTILINE # noqa: F821 assert UCDN_SCRIPT_ADLAM == mupdf.UCDN_SCRIPT_ADLAM # noqa: F821 del _self, _name, _value AnyType = typing.Any Base14_fontnames = ( "Courier", "Courier-Oblique", "Courier-Bold", "Courier-BoldOblique", "Helvetica", "Helvetica-Oblique", "Helvetica-Bold", "Helvetica-BoldOblique", "Times-Roman", "Times-Italic", "Times-Bold", "Times-BoldItalic", "Symbol", "ZapfDingbats", ) Base14_fontdict = {} for f in Base14_fontnames: Base14_fontdict[f.lower()] = f Base14_fontdict["helv"] = "Helvetica" Base14_fontdict["heit"] = "Helvetica-Oblique" Base14_fontdict["hebo"] = "Helvetica-Bold" Base14_fontdict["hebi"] = "Helvetica-BoldOblique" Base14_fontdict["cour"] = "Courier" Base14_fontdict["coit"] = "Courier-Oblique" Base14_fontdict["cobo"] = "Courier-Bold" Base14_fontdict["cobi"] = "Courier-BoldOblique" Base14_fontdict["tiro"] = "Times-Roman" Base14_fontdict["tibo"] = "Times-Bold" Base14_fontdict["tiit"] = "Times-Italic" Base14_fontdict["tibi"] = "Times-BoldItalic" Base14_fontdict["symb"] = "Symbol" Base14_fontdict["zadb"] = "ZapfDingbats" EPSILON = 1e-5 FLT_EPSILON = 1e-5 # 
largest 32bit integers surviving C float conversion roundtrips # used by MuPDF to define infinite rectangles FZ_MIN_INF_RECT = -0x80000000 FZ_MAX_INF_RECT = 0x7fffff80 JM_annot_id_stem = "fitz" JM_mupdf_warnings_store = [] JM_mupdf_show_errors = 1 JM_mupdf_show_warnings = 0 # ------------------------------------------------------------------------------ # Image recompression constants # ------------------------------------------------------------------------------ FZ_RECOMPRESS_NEVER = mupdf.FZ_RECOMPRESS_NEVER FZ_RECOMPRESS_SAME = mupdf.FZ_RECOMPRESS_SAME FZ_RECOMPRESS_LOSSLESS = mupdf.FZ_RECOMPRESS_LOSSLESS FZ_RECOMPRESS_JPEG = mupdf.FZ_RECOMPRESS_JPEG FZ_RECOMPRESS_J2K = mupdf.FZ_RECOMPRESS_J2K FZ_RECOMPRESS_FAX = mupdf.FZ_RECOMPRESS_FAX FZ_SUBSAMPLE_AVERAGE = mupdf.FZ_SUBSAMPLE_AVERAGE FZ_SUBSAMPLE_BICUBIC = mupdf.FZ_SUBSAMPLE_BICUBIC # ------------------------------------------------------------------------------ # Various PDF Optional Content Flags # ------------------------------------------------------------------------------ PDF_OC_ON = 0 PDF_OC_TOGGLE = 1 PDF_OC_OFF = 2 # ------------------------------------------------------------------------------ # link kinds and link flags # ------------------------------------------------------------------------------ LINK_NONE = 0 LINK_GOTO = 1 LINK_URI = 2 LINK_LAUNCH = 3 LINK_NAMED = 4 LINK_GOTOR = 5 LINK_FLAG_L_VALID = 1 LINK_FLAG_T_VALID = 2 LINK_FLAG_R_VALID = 4 LINK_FLAG_B_VALID = 8 LINK_FLAG_FIT_H = 16 LINK_FLAG_FIT_V = 32 LINK_FLAG_R_IS_ZOOM = 64 SigFlag_SignaturesExist = 1 SigFlag_AppendOnly = 2 STAMP_Approved = 0 STAMP_AsIs = 1 STAMP_Confidential = 2 STAMP_Departmental = 3 STAMP_Experimental = 4 STAMP_Expired = 5 STAMP_Final = 6 STAMP_ForComment = 7 STAMP_ForPublicRelease = 8 STAMP_NotApproved = 9 STAMP_NotForPublicRelease = 10 STAMP_Sold = 11 STAMP_TopSecret = 12 STAMP_Draft = 13 TEXT_ALIGN_LEFT = 0 TEXT_ALIGN_CENTER = 1 TEXT_ALIGN_RIGHT = 2 TEXT_ALIGN_JUSTIFY = 3 TEXT_FONT_SUPERSCRIPT = 1 
TEXT_FONT_ITALIC = 2 TEXT_FONT_SERIFED = 4 TEXT_FONT_MONOSPACED = 8 TEXT_FONT_BOLD = 16 TEXT_OUTPUT_TEXT = 0 TEXT_OUTPUT_HTML = 1 TEXT_OUTPUT_JSON = 2 TEXT_OUTPUT_XML = 3 TEXT_OUTPUT_XHTML = 4 TEXT_PRESERVE_LIGATURES = mupdf.FZ_STEXT_PRESERVE_LIGATURES TEXT_PRESERVE_WHITESPACE = mupdf.FZ_STEXT_PRESERVE_WHITESPACE TEXT_PRESERVE_IMAGES = mupdf.FZ_STEXT_PRESERVE_IMAGES TEXT_INHIBIT_SPACES = mupdf.FZ_STEXT_INHIBIT_SPACES TEXT_DEHYPHENATE = mupdf.FZ_STEXT_DEHYPHENATE TEXT_PRESERVE_SPANS = mupdf.FZ_STEXT_PRESERVE_SPANS TEXT_MEDIABOX_CLIP = mupdf.FZ_STEXT_MEDIABOX_CLIP TEXT_USE_CID_FOR_UNKNOWN_UNICODE = mupdf.FZ_STEXT_USE_CID_FOR_UNKNOWN_UNICODE TEXT_COLLECT_STRUCTURE = mupdf.FZ_STEXT_COLLECT_STRUCTURE TEXT_ACCURATE_BBOXES = mupdf.FZ_STEXT_ACCURATE_BBOXES TEXT_COLLECT_VECTORS = mupdf.FZ_STEXT_COLLECT_VECTORS TEXT_IGNORE_ACTUALTEXT = mupdf.FZ_STEXT_IGNORE_ACTUALTEXT TEXT_SEGMENT = mupdf.FZ_STEXT_SEGMENT if mupdf_version_tuple >= (1, 26): TEXT_PARAGRAPH_BREAK = mupdf.FZ_STEXT_PARAGRAPH_BREAK TEXT_TABLE_HUNT = mupdf.FZ_STEXT_TABLE_HUNT TEXT_COLLECT_STYLES = mupdf.FZ_STEXT_COLLECT_STYLES TEXT_USE_GID_FOR_UNKNOWN_UNICODE = mupdf.FZ_STEXT_USE_GID_FOR_UNKNOWN_UNICODE TEXT_CLIP_RECT = mupdf.FZ_STEXT_CLIP_RECT TEXT_ACCURATE_ASCENDERS = mupdf.FZ_STEXT_ACCURATE_ASCENDERS TEXT_ACCURATE_SIDE_BEARINGS = mupdf.FZ_STEXT_ACCURATE_SIDE_BEARINGS # 2025-05-07: Non-standard names preserved for backwards compatibility. 
TEXT_STEXT_SEGMENT = TEXT_SEGMENT TEXT_CID_FOR_UNKNOWN_UNICODE = TEXT_USE_CID_FOR_UNKNOWN_UNICODE TEXTFLAGS_WORDS = (0 | TEXT_PRESERVE_LIGATURES | TEXT_PRESERVE_WHITESPACE | TEXT_MEDIABOX_CLIP | TEXT_USE_CID_FOR_UNKNOWN_UNICODE ) TEXTFLAGS_BLOCKS = (0 | TEXT_PRESERVE_LIGATURES | TEXT_PRESERVE_WHITESPACE | TEXT_MEDIABOX_CLIP | TEXT_USE_CID_FOR_UNKNOWN_UNICODE ) TEXTFLAGS_DICT = (0 | TEXT_PRESERVE_LIGATURES | TEXT_PRESERVE_WHITESPACE | TEXT_MEDIABOX_CLIP | TEXT_PRESERVE_IMAGES | TEXT_USE_CID_FOR_UNKNOWN_UNICODE ) TEXTFLAGS_RAWDICT = TEXTFLAGS_DICT TEXTFLAGS_SEARCH = (0 | TEXT_PRESERVE_WHITESPACE | TEXT_MEDIABOX_CLIP | TEXT_DEHYPHENATE | TEXT_USE_CID_FOR_UNKNOWN_UNICODE ) TEXTFLAGS_HTML = (0 | TEXT_PRESERVE_LIGATURES | TEXT_PRESERVE_WHITESPACE | TEXT_MEDIABOX_CLIP | TEXT_PRESERVE_IMAGES | TEXT_USE_CID_FOR_UNKNOWN_UNICODE ) TEXTFLAGS_XHTML = (0 | TEXT_PRESERVE_LIGATURES | TEXT_PRESERVE_WHITESPACE | TEXT_MEDIABOX_CLIP | TEXT_PRESERVE_IMAGES | TEXT_USE_CID_FOR_UNKNOWN_UNICODE ) TEXTFLAGS_XML = (0 | TEXT_PRESERVE_LIGATURES | TEXT_PRESERVE_WHITESPACE | TEXT_MEDIABOX_CLIP | TEXT_USE_CID_FOR_UNKNOWN_UNICODE ) TEXTFLAGS_TEXT = (0 | TEXT_PRESERVE_LIGATURES | TEXT_PRESERVE_WHITESPACE | TEXT_MEDIABOX_CLIP | TEXT_USE_CID_FOR_UNKNOWN_UNICODE ) # Simple text encoding options TEXT_ENCODING_LATIN = 0 TEXT_ENCODING_GREEK = 1 TEXT_ENCODING_CYRILLIC = 2 TOOLS_JM_UNIQUE_ID = 0 # colorspace identifiers CS_RGB = 1 CS_GRAY = 2 CS_CMYK = 3 # PDF Blend Modes PDF_BM_Color = "Color" PDF_BM_ColorBurn = "ColorBurn" PDF_BM_ColorDodge = "ColorDodge" PDF_BM_Darken = "Darken" PDF_BM_Difference = "Difference" PDF_BM_Exclusion = "Exclusion" PDF_BM_HardLight = "HardLight" PDF_BM_Hue = "Hue" PDF_BM_Lighten = "Lighten" PDF_BM_Luminosity = "Luminosity" PDF_BM_Multiply = "Multiply" PDF_BM_Normal = "Normal" PDF_BM_Overlay = "Overlay" PDF_BM_Saturation = "Saturation" PDF_BM_Screen = "Screen" PDF_BM_SoftLight = "Softlight" annot_skel = { "goto1": lambda a, b, c, d, e: f"<</A<</S/GoTo/D[{a} 0 R/XYZ 
{_format_g((b, c, d))}]>>/Rect[{e}]/BS<</W 0>>/Subtype/Link>>", "goto2": lambda a, b: f"<</A<</S/GoTo/D{a}>>/Rect[{b}]/BS<</W 0>>/Subtype/Link>>", "gotor1": lambda a, b, c, d, e, f, g: f"<</A<</S/GoToR/D[{a} /XYZ {_format_g((b, c, d))}]/F<</F({e})/UF({f})/Type/Filespec>>>>/Rect[{g}]/BS<</W 0>>/Subtype/Link>>", "gotor2": lambda a, b, c: f"<</A<</S/GoToR/D{a}/F({b})>>/Rect[{c}]/BS<</W 0>>/Subtype/Link>>", "launch": lambda a, b, c: f"<</A<</S/Launch/F<</F({a})/UF({b})/Type/Filespec>>>>/Rect[{c}]/BS<</W 0>>/Subtype/Link>>", "uri": lambda a, b: f"<</A<</S/URI/URI({a})>>/Rect[{b}]/BS<</W 0>>/Subtype/Link>>", "named": lambda a, b: f"<</A<</S/GoTo/D({a})/Type/Action>>/Rect[{b}]/BS<</W 0>>/Subtype/Link>>", } class FileDataError(RuntimeError): """Raised for documents with file structure issues.""" pass class FileNotFoundError(RuntimeError): """Raised if file does not exist.""" pass class EmptyFileError(FileDataError): """Raised when creating documents from zero-length data.""" pass # propagate exception class to C-level code #_set_FileDataError(FileDataError) csRGB = Colorspace(CS_RGB) csGRAY = Colorspace(CS_GRAY) csCMYK = Colorspace(CS_CMYK) # These don't appear to be visible in classic, but are used # internally. 
# dictkey_align = "align" dictkey_asc = "ascender" dictkey_bidi = "bidi" dictkey_bbox = "bbox" dictkey_blocks = "blocks" dictkey_bpc = "bpc" dictkey_c = "c" dictkey_chars = "chars" dictkey_color = "color" dictkey_colorspace = "colorspace" dictkey_content = "content" dictkey_creationDate = "creationDate" dictkey_cs_name = "cs-name" dictkey_da = "da" dictkey_dashes = "dashes" dictkey_descr = "description" dictkey_desc = "descender" dictkey_dir = "dir" dictkey_effect = "effect" dictkey_ext = "ext" dictkey_filename = "filename" dictkey_fill = "fill" dictkey_flags = "flags" dictkey_char_flags = "char_flags" dictkey_font = "font" dictkey_glyph = "glyph" dictkey_height = "height" dictkey_id = "id" dictkey_image = "image" dictkey_items = "items" dictkey_length = "length" dictkey_lines = "lines" dictkey_matrix = "transform" dictkey_modDate = "modDate" dictkey_name = "name" dictkey_number = "number" dictkey_origin = "origin" dictkey_rect = "rect" dictkey_size = "size" dictkey_smask = "smask" dictkey_spans = "spans" dictkey_stroke = "stroke" dictkey_style = "style" dictkey_subject = "subject" dictkey_text = "text" dictkey_title = "title" dictkey_type = "type" dictkey_ufilename = "ufilename" dictkey_width = "width" dictkey_wmode = "wmode" dictkey_xref = "xref" dictkey_xres = "xres" dictkey_yres = "yres" try: from pymupdf_fonts import fontdescriptors, fontbuffers fitz_fontdescriptors = fontdescriptors.copy() for k in fitz_fontdescriptors.keys(): fitz_fontdescriptors[k]["loader"] = fontbuffers[k] del fontdescriptors, fontbuffers except ImportError: fitz_fontdescriptors = {} symbol_glyphs = ( # Glyph list for the built-in font 'Symbol' (183, 0.46), (183, 0.46), (183, 0.46), (183, 0.46), (183, 0.46), (183, 0.46), (183, 0.46), (183, 0.46), (183, 0.46), (183, 0.46), (183, 0.46), (183, 0.46), (183, 0.46), (183, 0.46), (183, 0.46), (183, 0.46), (183, 0.46), (183, 0.46), (183, 0.46), (183, 0.46), (183, 0.46), (183, 0.46), (183, 0.46), (183, 0.46), (183, 0.46), (183, 0.46), (183, 0.46), 
(183, 0.46), (183, 0.46), (183, 0.46), (183, 0.46), (183, 0.46), (32, 0.25), (33, 0.333), (34, 0.713), (35, 0.5), (36, 0.549), (37, 0.833), (38, 0.778), (39, 0.439), (40, 0.333), (41, 0.333), (42, 0.5), (43, 0.549), (44, 0.25), (45, 0.549), (46, 0.25), (47, 0.278), (48, 0.5), (49, 0.5), (50, 0.5), (51, 0.5), (52, 0.5), (53, 0.5), (54, 0.5), (55, 0.5), (56, 0.5), (57, 0.5), (58, 0.278), (59, 0.278), (60, 0.549), (61, 0.549), (62, 0.549), (63, 0.444), (64, 0.549), (65, 0.722), (66, 0.667), (67, 0.722), (68, 0.612), (69, 0.611), (70, 0.763), (71, 0.603), (72, 0.722), (73, 0.333), (74, 0.631), (75, 0.722), (76, 0.686), (77, 0.889), (78, 0.722), (79, 0.722), (80, 0.768), (81, 0.741), (82, 0.556), (83, 0.592), (84, 0.611), (85, 0.69), (86, 0.439), (87, 0.768), (88, 0.645), (89, 0.795), (90, 0.611), (91, 0.333), (92, 0.863), (93, 0.333), (94, 0.658), (95, 0.5), (96, 0.5), (97, 0.631), (98, 0.549), (99, 0.549), (100, 0.494), (101, 0.439), (102, 0.521), (103, 0.411), (104, 0.603), (105, 0.329), (106, 0.603), (107, 0.549), (108, 0.549), (109, 0.576), (110, 0.521), (111, 0.549), (112, 0.549), (113, 0.521), (114, 0.549), (115, 0.603), (116, 0.439), (117, 0.576), (118, 0.713), (119, 0.686), (120, 0.493), (121, 0.686), (122, 0.494), (123, 0.48), (124, 0.2), (125, 0.48), (126, 0.549), (183, 0.46), (183, 0.46), (183, 0.46), (183, 0.46), (183, 0.46), (183, 0.46), (183, 0.46), (183, 0.46), (183, 0.46), (183, 0.46), (183, 0.46), (183, 0.46), (183, 0.46), (183, 0.46), (183, 0.46), (183, 0.46), (183, 0.46), (183, 0.46), (183, 0.46), (183, 0.46), (183, 0.46), (183, 0.46), (183, 0.46), (183, 0.46), (183, 0.46), (183, 0.46), (183, 0.46), (183, 0.46), (183, 0.46), (183, 0.46), (183, 0.46), (183, 0.46), (183, 0.46), (160, 0.25), (161, 0.62), (162, 0.247), (163, 0.549), (164, 0.167), (165, 0.713), (166, 0.5), (167, 0.753), (168, 0.753), (169, 0.753), (170, 0.753), (171, 1.042), (172, 0.713), (173, 0.603), (174, 0.987), (175, 0.603), (176, 0.4), (177, 0.549), (178, 0.411), (179, 0.549), (180, 
0.549), (181, 0.576), (182, 0.494), (183, 0.46), (184, 0.549), (185, 0.549), (186, 0.549), (187, 0.549), (188, 1), (189, 0.603), (190, 1), (191, 0.658), (192, 0.823), (193, 0.686), (194, 0.795), (195, 0.987), (196, 0.768), (197, 0.768), (198, 0.823), (199, 0.768), (200, 0.768), (201, 0.713), (202, 0.713), (203, 0.713), (204, 0.713), (205, 0.713), (206, 0.713), (207, 0.713), (208, 0.768), (209, 0.713), (210, 0.79), (211, 0.79), (212, 0.89), (213, 0.823), (214, 0.549), (215, 0.549), (216, 0.713), (217, 0.603), (218, 0.603), (219, 1.042), (220, 0.987), (221, 0.603), (222, 0.987), (223, 0.603), (224, 0.494), (225, 0.329), (226, 0.79), (227, 0.79), (228, 0.786), (229, 0.713), (230, 0.384), (231, 0.384), (232, 0.384), (233, 0.384), (234, 0.384), (235, 0.384), (236, 0.494), (237, 0.494), (238, 0.494), (239, 0.494), (183, 0.46), (241, 0.329), (242, 0.274), (243, 0.686), (244, 0.686), (245, 0.686), (246, 0.384), (247, 0.549), (248, 0.384), (249, 0.384), (250, 0.384), (251, 0.384), (252, 0.494), (253, 0.494), (254, 0.494), (183, 0.46), ) zapf_glyphs = ( # Glyph list for the built-in font 'ZapfDingbats' (183, 0.788), (183, 0.788), (183, 0.788), (183, 0.788), (183, 0.788), (183, 0.788), (183, 0.788), (183, 0.788), (183, 0.788), (183, 0.788), (183, 0.788), (183, 0.788), (183, 0.788), (183, 0.788), (183, 0.788), (183, 0.788), (183, 0.788), (183, 0.788), (183, 0.788), (183, 0.788), (183, 0.788), (183, 0.788), (183, 0.788), (183, 0.788), (183, 0.788), (183, 0.788), (183, 0.788), (183, 0.788), (183, 0.788), (183, 0.788), (183, 0.788), (183, 0.788), (32, 0.278), (33, 0.974), (34, 0.961), (35, 0.974), (36, 0.98), (37, 0.719), (38, 0.789), (39, 0.79), (40, 0.791), (41, 0.69), (42, 0.96), (43, 0.939), (44, 0.549), (45, 0.855), (46, 0.911), (47, 0.933), (48, 0.911), (49, 0.945), (50, 0.974), (51, 0.755), (52, 0.846), (53, 0.762), (54, 0.761), (55, 0.571), (56, 0.677), (57, 0.763), (58, 0.76), (59, 0.759), (60, 0.754), (61, 0.494), (62, 0.552), (63, 0.537), (64, 0.577), (65, 0.692), (66, 
0.786), (67, 0.788), (68, 0.788), (69, 0.79), (70, 0.793), (71, 0.794), (72, 0.816), (73, 0.823), (74, 0.789), (75, 0.841), (76, 0.823), (77, 0.833), (78, 0.816), (79, 0.831), (80, 0.923), (81, 0.744), (82, 0.723), (83, 0.749), (84, 0.79), (85, 0.792), (86, 0.695), (87, 0.776), (88, 0.768), (89, 0.792), (90, 0.759), (91, 0.707), (92, 0.708), (93, 0.682), (94, 0.701), (95, 0.826), (96, 0.815), (97, 0.789), (98, 0.789), (99, 0.707), (100, 0.687), (101, 0.696), (102, 0.689), (103, 0.786), (104, 0.787), (105, 0.713), (106, 0.791), (107, 0.785), (108, 0.791), (109, 0.873), (110, 0.761), (111, 0.762), (112, 0.762), (113, 0.759), (114, 0.759), (115, 0.892), (116, 0.892), (117, 0.788), (118, 0.784), (119, 0.438), (120, 0.138), (121, 0.277), (122, 0.415), (123, 0.392), (124, 0.392), (125, 0.668), (126, 0.668), (183, 0.788), (183, 0.788), (183, 0.788), (183, 0.788), (183, 0.788), (183, 0.788), (183, 0.788), (183, 0.788), (183, 0.788), (183, 0.788), (183, 0.788), (183, 0.788), (183, 0.788), (183, 0.788), (183, 0.788), (183, 0.788), (183, 0.788), (183, 0.788), (183, 0.788), (183, 0.788), (183, 0.788), (183, 0.788), (183, 0.788), (183, 0.788), (183, 0.788), (183, 0.788), (183, 0.788), (183, 0.788), (183, 0.788), (183, 0.788), (183, 0.788), (183, 0.788), (183, 0.788), (183, 0.788), (161, 0.732), (162, 0.544), (163, 0.544), (164, 0.91), (165, 0.667), (166, 0.76), (167, 0.76), (168, 0.776), (169, 0.595), (170, 0.694), (171, 0.626), (172, 0.788), (173, 0.788), (174, 0.788), (175, 0.788), (176, 0.788), (177, 0.788), (178, 0.788), (179, 0.788), (180, 0.788), (181, 0.788), (182, 0.788), (183, 0.788), (184, 0.788), (185, 0.788), (186, 0.788), (187, 0.788), (188, 0.788), (189, 0.788), (190, 0.788), (191, 0.788), (192, 0.788), (193, 0.788), (194, 0.788), (195, 0.788), (196, 0.788), (197, 0.788), (198, 0.788), (199, 0.788), (200, 0.788), (201, 0.788), (202, 0.788), (203, 0.788), (204, 0.788), (205, 0.788), (206, 0.788), (207, 0.788), (208, 0.788), (209, 0.788), (210, 0.788), (211, 0.788), 
def _read_samples(pixmap, offset, n):
    '''
    Return `n` consecutive sample bytes of `pixmap`, starting at `offset`.

    Always returns a `bytes` object; it is empty when the pixmap has no
    sample data.
    '''
    # fixme: need to be able to get a sample in one call, as a Python
    # bytes or similar.
    if not pixmap.samples():
        # mupdf.fz_samples_get() gives a segv if pixmap->samples is null.
        # Return empty bytes (not a list) so that the result type does not
        # depend on the pixmap's state.
        return b''
    return bytes(mupdf.fz_samples_get(pixmap, offset + i) for i in range(n))


def _INRANGE(v, low, high):
    '''
    Return whether low <= v <= high.
    '''
    return low <= v <= high


def _remove_dest_range(pdf, numbers):
    '''
    Delete link annotations that point to one of the pages in `numbers`.

    Pages whose own number is in `numbers` are skipped. On every other page
    each /Link annotation is inspected: its target is taken from the /D
    entry of a /GoTo action or, if there is no action, from /Dest. If the
    target resolves to a page number contained in `numbers`, the annotation
    is removed from the page's /Annots array.
    '''
    pagecount = mupdf.pdf_count_pages(pdf)
    for i in range(pagecount):
        if i in numbers:
            continue

        pageref = mupdf.pdf_lookup_page_obj(pdf, i)
        annots = mupdf.pdf_dict_get(pageref, PDF_NAME('Annots'))
        if not annots.m_internal:
            continue

        # Walk backwards so that deleting an entry does not shift the
        # indices still to be visited.
        for j in range(mupdf.pdf_array_len(annots) - 1, -1, -1):
            o = mupdf.pdf_array_get(annots, j)
            if not mupdf.pdf_name_eq(mupdf.pdf_dict_get(o, PDF_NAME('Subtype')), PDF_NAME('Link')):
                continue
            action = mupdf.pdf_dict_get(o, PDF_NAME('A'))
            dest = mupdf.pdf_dict_get(o, PDF_NAME('Dest'))
            if action.m_internal:
                # Only /GoTo actions are considered.
                if not mupdf.pdf_name_eq(mupdf.pdf_dict_get(action, PDF_NAME('S')), PDF_NAME('GoTo')):
                    continue
                dest = mupdf.pdf_dict_get(action, PDF_NAME('D'))

            pno = -1
            if mupdf.pdf_is_array(dest):
                target = mupdf.pdf_array_get(dest, 0)
                pno = mupdf.pdf_lookup_page_number(pdf, target)
            elif mupdf.pdf_is_string(dest):
                location, _, _ = mupdf.fz_resolve_link(pdf.super(), mupdf.pdf_to_text_string(dest))
                pno = location.page
            if pno < 0:
                # page number lookup did not work
                continue
            if pno in numbers:
                mupdf.pdf_array_delete(annots, j)
def ASSERT_PDF(cond):
    '''
    Check that `cond` is a mupdf.PdfPage or mupdf.PdfDocument wrapping a
    non-null object.

    Raises Exception(MSG_IS_NO_PDF) if `cond.m_internal` is falsy.
    '''
    assert isinstance(cond, (mupdf.PdfPage, mupdf.PdfDocument)), f'{type(cond)=} {cond=}'
    if not cond.m_internal:
        raise Exception(MSG_IS_NO_PDF)


def EMPTY_IRECT():
    '''
    Return an IRect with (x0, y0) at FZ_MAX_INF_RECT and (x1, y1) at
    FZ_MIN_INF_RECT.
    '''
    return IRect(FZ_MAX_INF_RECT, FZ_MAX_INF_RECT, FZ_MIN_INF_RECT, FZ_MIN_INF_RECT)


def EMPTY_QUAD():
    '''
    Return the quad of EMPTY_RECT().
    '''
    return EMPTY_RECT().quad


def EMPTY_RECT():
    '''
    Return a Rect with (x0, y0) at FZ_MAX_INF_RECT and (x1, y1) at
    FZ_MIN_INF_RECT.
    '''
    return Rect(FZ_MAX_INF_RECT, FZ_MAX_INF_RECT, FZ_MIN_INF_RECT, FZ_MIN_INF_RECT)


def ENSURE_OPERATION(pdf):
    '''
    Raise an Exception unless JM_have_operation(pdf) is true.
    '''
    if not JM_have_operation(pdf):
        raise Exception("No journalling operation started")


def INFINITE_IRECT():
    '''
    Return an IRect with (x0, y0) at FZ_MIN_INF_RECT and (x1, y1) at
    FZ_MAX_INF_RECT.
    '''
    return IRect(FZ_MIN_INF_RECT, FZ_MIN_INF_RECT, FZ_MAX_INF_RECT, FZ_MAX_INF_RECT)


def INFINITE_QUAD():
    '''
    Return the quad of INFINITE_RECT().
    '''
    return INFINITE_RECT().quad


def INFINITE_RECT():
    '''
    Return a Rect with (x0, y0) at FZ_MIN_INF_RECT and (x1, y1) at
    FZ_MAX_INF_RECT.
    '''
    return Rect(FZ_MIN_INF_RECT, FZ_MIN_INF_RECT, FZ_MAX_INF_RECT, FZ_MAX_INF_RECT)


def JM_BinFromBuffer(buffer_):
    '''
    Turn fz_buffer into a Python bytes object
    '''
    assert isinstance(buffer_, mupdf.FzBuffer)
    return mupdf.fz_buffer_extract_copy(buffer_)


def JM_EscapeStrFromStr(c):
    '''
    Return a `str` holding one character per byte of the UTF-8 encoding of
    `c`. Returns '' if `c` is None.
    '''
    # `c` is typically from SWIG which will have converted a `const char*` from
    # C into a Python `str` using `PyUnicode_DecodeUTF8(carray, static_cast<
    # Py_ssize_t >(size), "surrogateescape")`. This gives us a Python `str`
    # with some characters encoded as a \0xdcXY sequence, where `XY` are hex
    # digits for an invalid byte in the original `const char*`.
    #
    # This is actually a reasonable way of representing arbitrary
    # strings from C, but we want to mimic what PyMuPDF does. It uses
    # `PyUnicode_DecodeRawUnicodeEscape(c, (Py_ssize_t) strlen(c), "replace")`
    # which gives a string containing actual unicode characters for any invalid
    # bytes.
    #
    # We mimic this by converting the `str` to a `bytes` with 'surrogateescape'
    # to recognise \0xdcXY sequences, then map every byte to the character
    # with the same code point. Decoding as latin-1 does exactly that
    # (byte value N -> chr(N)) in a single pass instead of a Python-level
    # loop with repeated string concatenation.
    #
    if c is None:
        return ''
    assert isinstance(c, str), f'{type(c)=}'
    return c.encode('utf8', 'surrogateescape').decode('latin-1')
def JM_BufferFromBytes(stream):
    '''
    Make an fz_buffer from a bytes or bytearray object, or from any object
    offering `.getvalue()` (such as io.BytesIO). If `.getvalue()` returns
    text, it is encoded as utf-8 first.

    Any other kind of argument yields an empty mupdf.FzBuffer().
    '''
    if not isinstance(stream, (bytes, bytearray)):
        if not hasattr(stream, 'getvalue'):
            return mupdf.FzBuffer()
        data = stream.getvalue()
        if isinstance(data, str):
            data = data.encode('utf-8')
        if not isinstance(data, (bytes, bytearray)):
            raise Exception(f'.getvalue() returned unexpected type: {type(data)}')
    else:
        data = stream
    return mupdf.fz_new_buffer_from_copied_data(data)


def JM_FLOAT_ITEM(obj, idx):
    '''
    Return float(obj[idx]), or None if PySequence_Check(obj) fails.
    '''
    return float(obj[idx]) if PySequence_Check(obj) else None


def JM_INT_ITEM(obj, idx):
    '''
    Fetch a numeric item from a sequence.

    Returns (0, obj[idx]) if `idx` is below len(obj) and the item is an int
    or float, otherwise (1, None).
    '''
    failure = (1, None)
    if not (idx < len(obj)):
        return failure
    item = obj[idx]
    if not isinstance(item, (int, float)):
        return failure
    return 0, item


def JM_pixmap_from_page(doc, page, ctm, cs, alpha, annots, clip):
    '''
    Render a page into a new pixmap, with support for separations.

    The pixmap area is the page bound intersected with `clip` and
    transformed by `ctm`. If the document has an output intent with the
    same number of components as `cs`, that colorspace is used instead.
    The page is run with annotations if `annots` is true, otherwise only
    its contents are run.
    '''
    SPOTS_NONE = 0
    SPOTS_OVERPRINT_SIM = 1
    SPOTS_FULL = 2

    FZ_ENABLE_SPOT_RENDERING = True # fixme: this is a build-time setting in MuPDF's config.h.
    spots = SPOTS_OVERPRINT_SIM if FZ_ENABLE_SPOT_RENDERING else SPOTS_NONE

    seps = None
    colorspace = cs

    # Determine the target area in device space.
    matrix = JM_matrix_from_py(ctm)
    rclip = JM_rect_from_py(clip)
    area = mupdf.fz_bound_page(page)
    area = mupdf.fz_intersect_rect(area, rclip)  # no-op if clip is not given
    area = mupdf.fz_transform_rect(area, matrix)
    bbox = mupdf.fz_round_rect(area)

    # The document's /OutputIntents ("output intents"): if present and
    # compatible, use it instead of the parameter.
    oi = mupdf.fz_document_output_intent(doc)
    if oi.m_internal and mupdf.fz_colorspace_n(oi) == mupdf.fz_colorspace_n(cs):
        colorspace = mupdf.fz_keep_colorspace(oi)

    # check if spots rendering is available and if so use separations
    if spots != SPOTS_NONE:
        seps = mupdf.fz_page_separations(page)
        if seps.m_internal:
            count = mupdf.fz_count_separations(seps)
            for index in range(count):
                if spots == SPOTS_FULL:
                    mupdf.fz_set_separation_behavior(seps, index, mupdf.FZ_SEPARATION_SPOT)
                else:
                    mupdf.fz_set_separation_behavior(seps, index, mupdf.FZ_SEPARATION_COMPOSITE)
        elif mupdf.fz_page_uses_overprint(page):
            # This page uses overprint, so we need an empty
            # sep object to force the overprint simulation on.
            seps = mupdf.fz_new_separations(0)
        elif oi.m_internal and mupdf.fz_colorspace_n(oi) != mupdf.fz_colorspace_n(colorspace):
            # We have an output intent, and it's incompatible
            # with the colorspace our device needs. Force the
            # overprint simulation on, because this ensures that
            # we 'simulate' the output intent too.
            seps = mupdf.fz_new_separations(0)

    pix = mupdf.fz_new_pixmap_with_bbox(colorspace, bbox, seps, alpha)

    # With alpha the pixmap is cleared; without, every sample is set to 0xFF.
    if not alpha:
        mupdf.fz_clear_pixmap_with_value(pix, 0xFF)
    else:
        mupdf.fz_clear_pixmap(pix)

    dev = mupdf.fz_new_draw_device(matrix, pix)
    if not annots:
        mupdf.fz_run_page_contents(page, dev, mupdf.FzMatrix(), mupdf.FzCookie())
    else:
        mupdf.fz_run_page(page, dev, mupdf.FzMatrix(), mupdf.FzCookie())
    mupdf.fz_close_device(dev)
    return pix
def JM_StrAsChar(x):
    '''
    Return `x` unchanged.
    '''
    # fixme: should encode, but swig doesn't pass bytes to C as const char*.
    # (the encoding variant would be: x.encode('utf8'))
    return x


def JM_TUPLE(o: typing.Sequence) -> tuple:
    '''
    Return the items of `o` as a tuple, each rounded to 5 decimals; items
    with an absolute value below 1e-4 become 0.
    '''
    return tuple(round(value, 5) if abs(value) >= 1e-4 else 0 for value in o)


def JM_TUPLE3(o: typing.Sequence) -> tuple:
    '''
    Return the items of `o` as a tuple, each rounded to 3 decimals; items
    with an absolute value below 1e-3 become 0.
    '''
    return tuple(round(value, 3) if abs(value) >= 1e-3 else 0 for value in o)


def JM_UnicodeFromStr(s):
    '''
    Return `s` as a `str`: '' for None, utf8-decoded text for bytes.
    '''
    if s is None:
        return ''
    text = s.decode('utf8') if isinstance(s, bytes) else s
    s = text
    assert isinstance(s, str), f'{type(s)=} {s=}'
    return s


def JM_add_annot_id(annot, stem):
    '''
    Add a unique /NM key to an annotation or widget.

    The name has the form '<JM_annot_id_stem>-<stem><number>', using the
    lowest number (counting up from 0) that is not yet among the ids
    returned by JM_get_annot_id_list() for the annotation's page.
    '''
    assert isinstance(annot, mupdf.PdfAnnot)
    page = _pdf_annot_page(annot)
    annot_obj = mupdf.pdf_annot_obj(annot)
    taken = JM_get_annot_id_list(page)

    counter = 0
    stem_id = f'{JM_annot_id_stem}-{stem}{counter}'
    while stem_id in taken:
        counter += 1
        stem_id = f'{JM_annot_id_stem}-{stem}{counter}'

    response = JM_StrAsChar(stem_id)
    mupdf.pdf_dict_puts(annot_obj, "NM", mupdf.pdf_new_string(response, len(response)))
    page.doc().m_internal.resynth_required = 0
def JM_add_oc_object(pdf, ref, xref):
    '''
    Add OC object reference to a dictionary.

    `xref` must refer to a dictionary whose /Type is /OCG or /OCMD; it is
    then stored in `ref` under /OC. Otherwise
    RAISEPY(MSG_BAD_OC_REF, PyExc_ValueError) is called.
    '''
    indobj = mupdf.pdf_new_indirect(pdf, xref, 0)
    if not mupdf.pdf_is_dict(indobj):
        RAISEPY(MSG_BAD_OC_REF, PyExc_ValueError)
    type_ = mupdf.pdf_dict_get(indobj, PDF_NAME('Type'))
    is_ocg = mupdf.pdf_objcmp(type_, PDF_NAME('OCG')) == 0
    if is_ocg or mupdf.pdf_objcmp(type_, PDF_NAME('OCMD')) == 0:
        mupdf.pdf_dict_put(ref, PDF_NAME('OC'), indobj)
    else:
        RAISEPY(MSG_BAD_OC_REF, PyExc_ValueError)


def JM_annot_border(annot_obj):
    '''
    Collect the border properties of an annotation object.

    Returns a dict with the keys dictkey_width, dictkey_dashes,
    dictkey_style and 'clouds'. Width and clouds are -1 and style is None
    when the annotation provides no value. Values are read from /Border
    first; /BS then overrides width and style and appends its own dashes;
    /BE supplies the clouds value.
    '''
    dashes = []
    style = None
    width = -1
    clouds = -1

    # /Border array: item 2 is the width, an optional item 3 the dash array
    border = mupdf.pdf_dict_get(annot_obj, PDF_NAME('Border'))
    if mupdf.pdf_is_array(border):
        width = mupdf.pdf_to_real(mupdf.pdf_array_get(border, 2))
        if mupdf.pdf_array_len(border) == 4:
            dash = mupdf.pdf_array_get(border, 3)
            for i in range(mupdf.pdf_array_len(dash)):
                dashes.append(mupdf.pdf_to_int(mupdf.pdf_array_get(dash, i)))

    # /BS dictionary: /W width, /S style, /D dashes
    bs_o = mupdf.pdf_dict_get(annot_obj, PDF_NAME('BS'))
    if bs_o.m_internal:
        width = mupdf.pdf_to_real(mupdf.pdf_dict_get(bs_o, PDF_NAME('W')))
        style = mupdf.pdf_to_name(mupdf.pdf_dict_get(bs_o, PDF_NAME('S')))
        if style == '':
            style = None
        bs_dash = mupdf.pdf_dict_get(bs_o, PDF_NAME('D'))
        if bs_dash.m_internal:
            for i in range(mupdf.pdf_array_len(bs_dash)):
                dashes.append(mupdf.pdf_to_int(mupdf.pdf_array_get(bs_dash, i)))

    # /BE dictionary: /I gives the clouds value
    be_o = mupdf.pdf_dict_get(annot_obj, PDF_NAME('BE'))
    if be_o.m_internal:
        clouds = mupdf.pdf_to_int(mupdf.pdf_dict_get(be_o, PDF_NAME('I')))

    return {
        dictkey_width: width,
        dictkey_dashes: tuple(dashes),
        dictkey_style: style,
        'clouds': clouds,
    }
def JM_annot_colors(annot_obj):
    '''
    Return the stroke and fill color components of an annotation object.

    The result is a dict with two entries: `dictkey_stroke` holds the
    numbers of the /C array and `dictkey_fill` those of the /IC array.
    An entry that is missing or not an array gives an empty list.
    '''
    def components(arr):
        # Anything that is not a PDF array contributes no components.
        if not mupdf.pdf_is_array(arr):
            return []
        return [
                mupdf.pdf_to_real(mupdf.pdf_array_get(arr, k))
                for k in range(mupdf.pdf_array_len(arr))
                ]

    stroke = components(mupdf.pdf_dict_get(annot_obj, mupdf.PDF_ENUM_NAME_C))
    fill = components(mupdf.pdf_dict_gets(annot_obj, "IC"))
    return {dictkey_stroke: stroke, dictkey_fill: fill}
def make_escape(ch):
    '''
    Return the text used to represent code point `ch` (an int).

    * backslash (92) becomes the escape text for U+005C
    * 32..127 and line feed (10) are returned as the character itself
    * surrogate code points 0xd800..0xdfff become the escape text for
      U+FFFD
    * anything else up to 0xffff becomes a backslash-u escape with four
      hex digits, larger values a backslash-U escape with eight hex digits
    '''
    if ch == 92:
        # A literal backslash would be mistaken for the start of an escape.
        return "\\u005c"
    if ch == 10 or 32 <= ch <= 127:
        return chr(ch)
    if 0xd800 <= ch <= 0xdfff:
        # orphaned surrogate
        return "\\ufffd"
    template = "\\u%04x" if ch <= 0xffff else "\\U%08x"
    return template % ch
def JM_char_font_flags(font, line, ch):
    '''
    Compute the font flags value for a character.

    The result is the sum of:
    * detect_super_script(line, ch), but only if both `line` and `ch` are
      truthy
    * one TEXT_FONT_* constant for each of the italic, serif, monospaced
      and bold properties reported by MuPDF for `font`
    '''
    flags = 0
    if line and ch:
        flags += detect_super_script(line, ch)

    # Each probe's result is multiplied with its flag constant.
    probes = (
            (mupdf.fz_font_is_italic, TEXT_FONT_ITALIC),
            (mupdf.fz_font_is_serif, TEXT_FONT_SERIFED),
            (mupdf.fz_font_is_monospaced, TEXT_FONT_MONOSPACED),
            (mupdf.fz_font_is_bold, TEXT_FONT_BOLD),
            )
    for probe, bit in probes:
        flags += probe(font) * bit
    return flags
c = line.m_internal.dir.x # cosine s = line.m_internal.dir.y # sine trm1 = mupdf.fz_make_matrix(c, -s, s, c, 0, 0) # derotate trm2 = mupdf.fz_make_matrix(c, s, -s, c, 0, 0) # rotate if (c == -1): # left-right flip trm1.d = 1 trm2.d = 1 xlate1 = mupdf.fz_make_matrix(1, 0, 0, 1, -ch.m_internal.origin.x, -ch.m_internal.origin.y) xlate2 = mupdf.fz_make_matrix(1, 0, 0, 1, ch.m_internal.origin.x, ch.m_internal.origin.y) quad = mupdf.fz_transform_quad(mupdf.FzQuad(ch.m_internal.quad), xlate1) # move origin to (0,0) quad = mupdf.fz_transform_quad(quad, trm1) # de-rotate corners # adjust vertical coordinates if c == 1 and quad.ul.y > 0: # up-down flip quad.ul.y = asc quad.ur.y = asc quad.ll.y = dsc quad.lr.y = dsc else: quad.ul.y = -asc quad.ur.y = -asc quad.ll.y = -dsc quad.lr.y = -dsc # adjust horizontal coordinates that are too crazy: # (1) left x must be >= 0 # (2) if bbox width is 0, lookup char advance in font. if quad.ll.x < 0: quad.ll.x = 0 quad.ul.x = 0 cwidth = quad.lr.x - quad.ll.x if cwidth < FLT_EPSILON: glyph = mupdf.fz_encode_character( font, ch.m_internal.c) if glyph: fwidth = mupdf.fz_advance_glyph( font, glyph, line.m_internal.wmode) quad.lr.x = quad.ll.x + fwidth * fsize quad.ur.x = quad.lr.x quad = mupdf.fz_transform_quad(quad, trm2) # rotate back quad = mupdf.fz_transform_quad(quad, xlate2) # translate back return quad def JM_choice_options(annot): ''' return list of choices for list or combo boxes ''' annot_obj = mupdf.pdf_annot_obj( annot.this) opts = mupdf.pdf_choice_widget_options2( annot, 0) n = len( opts) if n == 0: return # wrong widget type optarr = mupdf.pdf_dict_get( annot_obj, PDF_NAME('Opt')) liste = [] for i in range( n): m = mupdf.pdf_array_len( mupdf.pdf_array_get( optarr, i)) if m == 2: val = ( mupdf.pdf_to_text_string( mupdf.pdf_array_get( mupdf.pdf_array_get( optarr, i), 0)), mupdf.pdf_to_text_string( mupdf.pdf_array_get( mupdf.pdf_array_get( optarr, i), 1)), ) liste.append( val) else: val = mupdf.pdf_to_text_string( 
def JM_color_FromSequence(color):
    '''
    Normalize a color specification.

    Args:
        color: a single int / float, or a list / tuple of 0, 1, 3 or 4
            numbers.

    Returns:
        A pair `(n, components)`. For valid input, `n` is the number of
        components and `components` is a new list (or a tuple if a tuple
        was passed) in which every value outside 0..1 is replaced by 1.
        For any other input `(-1, [])` is returned.

    The input sequence is never modified.
    '''
    if isinstance(color, (int, float)):    # maybe just a single float
        color = [color]

    if not isinstance(color, (list, tuple)):
        return -1, []

    if len(color) not in (0, 1, 3, 4):
        return -1, []

    # Build the result from scratch instead of assigning into a slice copy:
    # the slice copy of a tuple is itself a tuple, so item assignment raised
    # TypeError for any tuple containing an out-of-range component.
    clamped = [1 if (c < 0 or c > 1) else c for c in color]
    if isinstance(color, tuple):
        # Keep handing back a tuple for tuple input, as before.
        clamped = tuple(clamped)
    return len(clamped), clamped
def JM_copy_rectangle(page, area):
    '''
    Return the text of all characters whose bbox overlaps `area`.

    Only blocks of type mupdf.FZ_STEXT_BLOCK_TEXT are looked at. Every
    selected character is passed through make_escape(). Text from
    different lines is separated by a single "\\n"; lines without any
    selected character contribute nothing.
    '''
    pieces = []
    pending_break = False   # True once some earlier line delivered text
    for block in page:
        if block.m_internal.type != mupdf.FZ_STEXT_BLOCK_TEXT:
            continue
        for line in block:
            hits = [
                    make_escape(ch.m_internal.c)
                    for ch in line
                    if JM_rects_overlap(area, JM_char_bbox(line, ch))
                    ]
            if not hits:
                continue
            if pending_break:
                pieces.append("\n")
            pieces.extend(hits)
            pending_break = True
    return "".join(pieces)
range rot = JM_norm_rotation(rotate) i = fp while 1: # interpret & write document pages as PDF pages if not _INRANGE(i, s, e): break page = mupdf.fz_load_page(doc, i) mediabox = mupdf.fz_bound_page(page) dev, resources, contents = mupdf.pdf_page_write(pdfout, mediabox) mupdf.fz_run_page(page, dev, mupdf.FzMatrix(), mupdf.FzCookie()) mupdf.fz_close_device(dev) dev = None page_obj = mupdf.pdf_add_page(pdfout, mediabox, rot, resources, contents) mupdf.pdf_insert_page(pdfout, -1, page_obj) i += incr # PDF created - now write it to Python bytearray # prepare write options structure opts = mupdf.PdfWriteOptions() opts.do_garbage = 4 opts.do_compress = 1 opts.do_compress_images = 1 opts.do_compress_fonts = 1 opts.do_sanitize = 1 opts.do_incremental = 0 opts.do_ascii = 0 opts.do_decompress = 0 opts.do_linear = 0 opts.do_clean = 1 opts.do_pretty = 0 res = mupdf.fz_new_buffer(8192) out = mupdf.FzOutput(res) mupdf.pdf_write_document(pdfout, out, opts) out.fz_close_output() c = mupdf.fz_buffer_extract_copy(res) assert isinstance(c, bytes) return c # Copied from MuPDF v1.14 # Create widget def JM_create_widget(doc, page, type, fieldname): old_sigflags = mupdf.pdf_to_int(mupdf.pdf_dict_getp(mupdf.pdf_trailer(doc), "Root/AcroForm/SigFlags")) #log( '*** JM_create_widget()') #log( f'{mupdf.pdf_create_annot_raw=}') #log( f'{page=}') #log( f'{mupdf.PDF_ANNOT_WIDGET=}') annot = mupdf.pdf_create_annot_raw(page, mupdf.PDF_ANNOT_WIDGET) annot_obj = mupdf.pdf_annot_obj(annot) try: JM_set_field_type(doc, annot_obj, type) mupdf.pdf_dict_put_text_string(annot_obj, PDF_NAME('T'), fieldname) if type == mupdf.PDF_WIDGET_TYPE_SIGNATURE: sigflags = old_sigflags | (SigFlag_SignaturesExist | SigFlag_AppendOnly) mupdf.pdf_dict_putl( mupdf.pdf_trailer(doc), mupdf.pdf_new_int(sigflags), PDF_NAME('Root'), PDF_NAME('AcroForm'), PDF_NAME('SigFlags'), ) # pdf_create_annot will have linked the new widget into the page's # annot array. 
def JM_cropbox(page_obj):
    '''
    Return the CropBox rectangle of a PDF page object.

    If `g_use_extra` is set, the work is delegated to extra.JM_cropbox().
    Otherwise the (inheritable) /CropBox is read; if it is infinite or
    empty, the MediaBox returned by JM_mediabox() is used in its place.
    The y coordinates of the result are re-expressed as distances from the
    MediaBox's y1.
    '''
    if g_use_extra:
        return extra.JM_cropbox(page_obj)

    media = JM_mediabox(page_obj)
    raw = mupdf.pdf_dict_get_inheritable(page_obj, PDF_NAME('CropBox'))
    crop = mupdf.pdf_to_rect(raw)
    if mupdf.fz_is_infinite_rect(crop) or mupdf.fz_is_empty_rect(crop):
        # No usable CropBox: fall back to the MediaBox (same object).
        crop = media

    # Both new values are computed before either one is stored.
    top = media.y1
    crop.y0, crop.y1 = top - crop.y1, top - crop.y0
    return crop
def JM_embedded_clean(pdf):
    '''
    Tidy the PDF catalog with respect to embedded files:
    (1) delete /Collection if it is present but has no entries
    (2) set /PageMode to /UseAttachments if /Names/EmbeddedFiles/Names
        exists

    Note that no /Limits entries are touched here.
    '''
    catalog = mupdf.pdf_dict_get(mupdf.pdf_trailer(pdf), PDF_NAME('Root'))

    collection = mupdf.pdf_dict_get(catalog, PDF_NAME('Collection'))
    if collection.m_internal and mupdf.pdf_dict_len(collection) == 0:
        mupdf.pdf_dict_del(catalog, PDF_NAME('Collection'))

    names_path = (
            PDF_NAME('Names'),
            PDF_NAME('EmbeddedFiles'),
            PDF_NAME('Names'),
            )
    if mupdf.pdf_dict_getl(catalog, *names_path).m_internal:
        mupdf.pdf_dict_put_name(catalog, PDF_NAME('PageMode'), "UseAttachments")
def JM_expand_fname(name):
    '''
    Map a font name to one of the short names "Helv", "Cour", "TiRo",
    "Symb" or "ZaDb".

    The decision is based on the first two characters of `name`, accepting
    either a capitalized or an all-lowercase spelling ("Co" / "co" etc.).
    An empty / None name and every unrecognized name give "Helv".
    '''
    if name:
        known = (
                (("Co", "co"), "Cour"),
                (("Ti", "ti"), "TiRo"),
                (("Sy", "sy"), "Symb"),
                (("Za", "za"), "ZaDb"),
                )
        for prefixes, short in known:
            if name.startswith(prefixes):
                return short
    return "Helv"
def JM_is_word_delimiter(ch, delimiters):
    """Tell whether code point `ch` ends a word.

    Always treated as delimiters: every code point up to 32, the value 160
    and the range 0x202a..0x202e (whitespace plus the unicodes that switch
    between right-to-left and left-to-right languages).

    In addition, `ch` is a delimiter if its character equals one of the
    items of `delimiters`. `delimiters` may be None or empty if there are
    no extra delimiters.
    """
    always = ch <= 32 or ch == 160 or 0x202a <= ch <= 0x202e
    if always:
        return True

    if not delimiters:
        # no extra delimiters provided
        return False

    wanted = chr(ch)
    return any(d == wanted for d in delimiters)
i) if not mupdf.pdf_is_dict(imagedict): mupdf.fz_warn( f"'{mupdf.pdf_to_name(refname)}' is no form dict ({mupdf.pdf_to_num(imagedict)} 0 R)") continue type_ = mupdf.pdf_dict_get(imagedict, PDF_NAME('Subtype')) if not mupdf.pdf_name_eq(type_, PDF_NAME('Form')): continue o = mupdf.pdf_dict_get(imagedict, PDF_NAME('BBox')) m = mupdf.pdf_dict_get(imagedict, PDF_NAME('Matrix')) if m.m_internal: mat = mupdf.pdf_to_matrix(m) else: mat = mupdf.FzMatrix() if o.m_internal: bbox = mupdf.fz_transform_rect( mupdf.pdf_to_rect(o), mat) else: bbox = mupdf.FzRect(mupdf.FzRect.Fixed_INFINITE) xref = mupdf.pdf_to_num(imagedict) entry = ( xref, mupdf.pdf_to_name( refname), stream_xref, JM_py_from_rect(bbox), ) imagelist.append(entry) return rc def JM_gather_images(doc: mupdf.PdfDocument, dict_: mupdf.PdfObj, imagelist, stream_xref: int): ''' Store info of an image in Python list ''' rc = 1 n = mupdf.pdf_dict_len( dict_) for i in range(n): refname = mupdf.pdf_dict_get_key(dict_, i) imagedict = mupdf.pdf_dict_get_val(dict_, i) if not mupdf.pdf_is_dict(imagedict): mupdf.fz_warn(f"'{mupdf.pdf_to_name(refname)}' is no image dict ({mupdf.pdf_to_num(imagedict)} 0 R)") continue type_ = mupdf.pdf_dict_get(imagedict, PDF_NAME('Subtype')) if not mupdf.pdf_name_eq(type_, PDF_NAME('Image')): continue xref = mupdf.pdf_to_num(imagedict) gen = 0 smask = mupdf.pdf_dict_geta(imagedict, PDF_NAME('SMask'), PDF_NAME('Mask')) if smask.m_internal: gen = mupdf.pdf_to_num(smask) filter_ = mupdf.pdf_dict_geta(imagedict, PDF_NAME('Filter'), PDF_NAME('F')) if mupdf.pdf_is_array(filter_): filter_ = mupdf.pdf_array_get(filter_, 0) altcs = mupdf.PdfObj(0) cs = mupdf.pdf_dict_geta(imagedict, PDF_NAME('ColorSpace'), PDF_NAME('CS')) if mupdf.pdf_is_array(cs): cses = cs cs = mupdf.pdf_array_get(cses, 0) if (mupdf.pdf_name_eq(cs, PDF_NAME('DeviceN')) or mupdf.pdf_name_eq(cs, PDF_NAME('Separation')) ): altcs = mupdf.pdf_array_get(cses, 2) if mupdf.pdf_is_array(altcs): altcs = mupdf.pdf_array_get(altcs, 0) width = 
def JM_get_annot_by_xref(page, xref):
    '''
    Return the annotation of `page` whose object number equals `xref`.

    Walks the annotations delivered by mupdf.pdf_first_annot() /
    mupdf.pdf_next_annot(). Raises Exception if none of them matches.
    '''
    assert isinstance(page, mupdf.PdfPage)
    candidate = mupdf.pdf_first_annot(page)
    while candidate.m_internal:
        if xref == mupdf.pdf_to_num(mupdf.pdf_annot_obj(candidate)):
            return candidate
        candidate = mupdf.pdf_next_annot(candidate)
    # Ran off the end of the annotation chain without a match.
    raise Exception("xref %d is not an annot of this page" % xref)
def JM_get_border_style(style):
    '''
    Translate a border style given as Python str into one of the
    mupdf.PDF_ENUM_NAME_{B,D,I,U,S} values.

    Only the first character of `style` counts, in lower or upper case.
    None, an empty string and any other first character give
    mupdf.PDF_ENUM_NAME_S.
    '''
    default = mupdf.PDF_ENUM_NAME_S
    if style is None:
        return default

    by_initial = (
            (("b", "B"), mupdf.PDF_ENUM_NAME_B),
            (("d", "D"), mupdf.PDF_ENUM_NAME_D),
            (("i", "I"), mupdf.PDF_ENUM_NAME_I),
            (("u", "U"), mupdf.PDF_ENUM_NAME_U),
            )
    for initials, pdf_name in by_initial:
        if style.startswith(initials):
            return pdf_name
    # Includes "s" / "S", which map to the default anyway.
    return default
def JM_get_fontbuffer(doc, xref):
    '''
    Return the contents of a font file, identified by xref.

    Args:
        doc: the PDF document.
        xref: object number of the font.

    Returns:
        The result of mupdf.pdf_load_stream() for the font file stream
        found in the font's /FontDescriptor (/FontFile, /FontFile2 or
        /FontFile3 - the last one present wins).
        None if `xref` is less than 1, if there is no /FontDescriptor or
        if it contains none of the three font file entries. In the latter
        two cases a message is emitted via message().

    For fonts with /DescendantFonts, the /FontDescriptor of the first
    descendant font is used.
    '''
    if xref < 1:
        return None

    o = mupdf.pdf_load_object(doc, xref)
    desft = mupdf.pdf_dict_get(o, PDF_NAME('DescendantFonts'))
    if desft.m_internal:
        obj = mupdf.pdf_resolve_indirect(mupdf.pdf_array_get(desft, 0))
        obj = mupdf.pdf_dict_get(obj, PDF_NAME('FontDescriptor'))
    else:
        obj = mupdf.pdf_dict_get(o, PDF_NAME('FontDescriptor'))

    if not obj.m_internal:
        message("invalid font - FontDescriptor missing")
        return None

    o = obj     # from here on, `o` is the font descriptor
    stream = None

    obj = mupdf.pdf_dict_get(o, PDF_NAME('FontFile'))
    if obj.m_internal:
        stream = obj

    obj = mupdf.pdf_dict_get(o, PDF_NAME('FontFile2'))
    if obj.m_internal:
        stream = obj

    obj = mupdf.pdf_dict_get(o, PDF_NAME('FontFile3'))
    if obj.m_internal:
        stream = obj

        # /FontFile3 streams carry a /Subtype; it is only inspected to
        # report problems, the stream is returned regardless of a warning.
        obj = mupdf.pdf_dict_get(obj, PDF_NAME('Subtype'))
        if obj.m_internal and not mupdf.pdf_is_name(obj):
            message("invalid font descriptor subtype")
            return None

        known_subtypes = ('Type1C', 'CIDFontType0C', 'OpenType')
        if not any(mupdf.pdf_name_eq(obj, PDF_NAME(k)) for k in known_subtypes):
            # Was a plain string using the C call signature, so the
            # placeholder text was printed instead of the subtype name.
            message(f'warning: unhandled font type {mupdf.pdf_to_name(obj)!r}')

    if not stream:
        message('warning: unhandled font type')
        return None

    return mupdf.pdf_load_stream(stream)
setattr(mod, key, value) #log( '=== + mupdf.pdf_widget_type(tw)') field_type = mupdf.pdf_widget_type(tw.this) #log( '=== - mupdf.pdf_widget_type(tw)') Widget.field_type = field_type if field_type == mupdf.PDF_WIDGET_TYPE_SIGNATURE: if mupdf.pdf_signature_is_signed(pdf, annot_obj): SETATTR("is_signed", True) else: SETATTR("is_signed",False) else: SETATTR("is_signed", None) SETATTR_DROP(Widget, "border_style", JM_UnicodeFromStr(mupdf.pdf_field_border_style(annot_obj))) SETATTR_DROP(Widget, "field_type_string", JM_UnicodeFromStr(JM_field_type_text(field_type))) field_name = mupdf.pdf_load_field_name(annot_obj) SETATTR_DROP(Widget, "field_name", field_name) def pdf_dict_get_inheritable_nonempty_label(node, key): ''' This is a modified version of MuPDF's pdf_dict_get_inheritable(), with some changes: * Returns string from pdf_to_text_string() or None if not found. * Recurses to parent if current node exists but with empty string value. ''' slow = node halfbeat = 11 # Don't start moving slow pointer for a while. while 1: if not node.m_internal: return val = mupdf.pdf_dict_get(node, key) if val.m_internal: label = mupdf.pdf_to_text_string(val) if label: return label node = mupdf.pdf_dict_get(node, PDF_NAME('Parent')) if node.m_internal == slow.m_internal: raise Exception("cycle in resources") halfbeat -= 1 if halfbeat == 0: slow = mupdf.pdf_dict_get(slow, PDF_NAME('Parent')) halfbeat = 2 # In order to address #3950, we use our modified pdf_dict_get_inheritable() # to ignore empty-string child values. 
label = pdf_dict_get_inheritable_nonempty_label(annot_obj, PDF_NAME('TU')) if label is not None: SETATTR_DROP(Widget, "field_label", label) fvalue = None if field_type == mupdf.PDF_WIDGET_TYPE_RADIOBUTTON: obj = mupdf.pdf_dict_get( annot_obj, PDF_NAME('Parent')) # owning RB group if obj.m_internal: SETATTR_DROP(Widget, "rb_parent", mupdf.pdf_to_num( obj)) obj = mupdf.pdf_dict_get(annot_obj, PDF_NAME('AS')) if obj.m_internal: fvalue = mupdf.pdf_to_name(obj) if not fvalue: fvalue = mupdf.pdf_field_value(annot_obj) SETATTR_DROP(Widget, "field_value", JM_UnicodeFromStr(fvalue)) SETATTR_DROP(Widget, "field_display", mupdf.pdf_field_display(annot_obj)) border_width = mupdf.pdf_to_real(mupdf.pdf_dict_getl(annot_obj, PDF_NAME('BS'), PDF_NAME('W'))) if border_width == 0: border_width = 1 SETATTR_DROP(Widget, "border_width", border_width) obj = mupdf.pdf_dict_getl(annot_obj, PDF_NAME('BS'), PDF_NAME('D')) if mupdf.pdf_is_array(obj): n = mupdf.pdf_array_len(obj) d = [0] * n for i in range(n): d[i] = mupdf.pdf_to_int(mupdf.pdf_array_get(obj, i)) SETATTR_DROP(Widget, "border_dashes", d) SETATTR_DROP(Widget, "text_maxlen", mupdf.pdf_text_widget_max_len(tw.this)) SETATTR_DROP(Widget, "text_format", mupdf.pdf_text_widget_format(tw.this)) obj = mupdf.pdf_dict_getl(annot_obj, PDF_NAME('MK'), PDF_NAME('BG')) if mupdf.pdf_is_array(obj): n = mupdf.pdf_array_len(obj) col = [0] * n for i in range(n): col[i] = mupdf.pdf_to_real(mupdf.pdf_array_get(obj, i)) SETATTR_DROP(Widget, "fill_color", col) obj = mupdf.pdf_dict_getl(annot_obj, PDF_NAME('MK'), PDF_NAME('BC')) if mupdf.pdf_is_array(obj): n = mupdf.pdf_array_len(obj) col = [0] * n for i in range(n): col[i] = mupdf.pdf_to_real(mupdf.pdf_array_get(obj, i)) SETATTR_DROP(Widget, "border_color", col) SETATTR_DROP(Widget, "choice_values", JM_choice_options(annot)) da = mupdf.pdf_to_text_string(mupdf.pdf_dict_get_inheritable(annot_obj, PDF_NAME('DA'))) SETATTR_DROP(Widget, "_text_da", JM_UnicodeFromStr(da)) obj = mupdf.pdf_dict_getl(annot_obj, 
def JM_get_fontextension(doc, xref):
    '''
    Return the file extension of a font file, identified by xref.

    Args:
        doc: PDF document handed to mupdf.pdf_load_object().
        xref: xref number of the font object. Values < 1 are rejected.

    Returns:
        "pfa" for a FontFile entry, "ttf" for FontFile2, and "cff", "cid" or
        "otf" for the recognised FontFile3 subtypes. "n/a" in every other
        case: xref < 1, no FontDescriptor, no font file entry, or an
        invalid / unhandled FontFile3 subtype.
    '''
    if xref < 1:
        return "n/a"

    o = mupdf.pdf_load_object(doc, xref)
    desft = mupdf.pdf_dict_get(o, PDF_NAME('DescendantFonts'))
    if desft.m_internal:
        # Take the FontDescriptor of the first descendant font.
        obj = mupdf.pdf_resolve_indirect(mupdf.pdf_array_get(desft, 0))
        obj = mupdf.pdf_dict_get(obj, PDF_NAME('FontDescriptor'))
    else:
        obj = mupdf.pdf_dict_get(o, PDF_NAME('FontDescriptor'))
    if not obj.m_internal:
        return "n/a"    # this is a base-14 font

    o = obj     # we have the FontDescriptor

    obj = mupdf.pdf_dict_get(o, PDF_NAME('FontFile'))
    if obj.m_internal:
        return "pfa"

    obj = mupdf.pdf_dict_get(o, PDF_NAME('FontFile2'))
    if obj.m_internal:
        return "ttf"

    obj = mupdf.pdf_dict_get(o, PDF_NAME('FontFile3'))
    if obj.m_internal:
        obj = mupdf.pdf_dict_get(obj, PDF_NAME('Subtype'))
        if obj.m_internal and not mupdf.pdf_is_name(obj):
            message("invalid font descriptor subtype")
            return "n/a"
        if mupdf.pdf_name_eq(obj, PDF_NAME('Type1C')):
            return "cff"
        elif mupdf.pdf_name_eq(obj, PDF_NAME('CIDFontType0C')):
            return "cid"
        elif mupdf.pdf_name_eq(obj, PDF_NAME('OpenType')):
            return "otf"
        else:
            # Format the text here and pass a single string, like every
            # other message() call in this file, instead of printf-style
            # arguments.
            message(f"unhandled font type '{mupdf.pdf_to_name(obj)}'")

    return "n/a"
def JM_image_extension(type_):
    '''
    Return the file extension for a MuPDF image type.

    Args:
        type_: value compared against the mupdf.FZ_IMAGE_* constants.

    Returns:
        The extension string of the first matching constant, or "n/a" if
        none matches.
    '''
    # (constant name, extension) pairs. Each constant is resolved only when
    # its turn comes, in this order.
    candidates = (
        ('FZ_IMAGE_FAX', "fax"),
        ('FZ_IMAGE_RAW', "raw"),
        ('FZ_IMAGE_FLATE', "flate"),
        ('FZ_IMAGE_LZW', "lzw"),
        ('FZ_IMAGE_RLD', "rld"),
        ('FZ_IMAGE_BMP', "bmp"),
        ('FZ_IMAGE_GIF', "gif"),
        ('FZ_IMAGE_JBIG2', "jb2"),
        ('FZ_IMAGE_JPEG', "jpeg"),
        ('FZ_IMAGE_JPX', "jpx"),
        ('FZ_IMAGE_JXR', "jxr"),
        ('FZ_IMAGE_PNG', "png"),
        ('FZ_IMAGE_PNM', "pnm"),
        ('FZ_IMAGE_TIFF', "tiff"),
    )
    for constant_name, extension in candidates:
        if type_ == getattr(mupdf, constant_name):
            return extension
    #if type_ == mupdf.FZ_IMAGE_PSD: return "psd"
    return "n/a"
def JM_image_profile( imagedata, keep_image):
    '''
    Return basic properties of an image provided as bytes or bytearray.

    An fz_image is created from the data; if keep_image is true it is also
    stored in the returned dict.

    Returns:
        None if imagedata is empty, shorter than 8 bytes or of unknown image
        format. Otherwise a dict with width, height, orientation, matrix,
        resolution, colorspace, bpc, extension and colorspace name.
    '''
    if not imagedata:
        return None     # nothing given

    size = len( imagedata)
    if size < 8:
        message( "bad image data")
        return None

    image_type = mupdf.fz_recognize_image_format( imagedata)
    if image_type == mupdf.FZ_IMAGE_UNKNOWN:
        return None

    # Copy the data when the image is kept, otherwise share it.
    if keep_image:
        make_buffer = mupdf.fz_new_buffer_from_copied_data
    else:
        make_buffer = mupdf.fz_new_buffer_from_shared_data
    image = mupdf.fz_new_image_from_buffer( make_buffer( imagedata, size))

    ctm = mupdf.fz_image_orientation_matrix( image)
    xres, yres = mupdf.fz_image_resolution( image)
    orientation = mupdf.fz_image_orientation( image)
    cs_name = mupdf.fz_colorspace_name( image.colorspace())

    profile = {
        dictkey_width: image.w(),
        dictkey_height: image.h(),
        "orientation": orientation,
        dictkey_matrix: JM_py_from_matrix( ctm),
        dictkey_xres: xres,
        dictkey_yres: yres,
        dictkey_colorspace: image.n(),
        dictkey_bpc: image.bpc(),
        dictkey_ext: JM_image_extension( image_type),
        dictkey_cs_name: cs_name,
    }
    if keep_image:
        profile[ dictkey_image] = image
    return profile
def JM_insert_contents(pdf, pageref, newcont, overlay):
    '''
    Insert a buffer as a new separate /Contents object of a page.

    A stream object is created from 'newcont'. If /Contents already is an
    array, the stream is appended (overlay) or prepended to it. Otherwise a
    new array is created that holds the previous /Contents object (if there
    is one) and the new stream, and is stored as the page's /Contents.

    Returns:
        The xref number of the new stream object.
    '''
    old_contents = mupdf.pdf_dict_get(pageref, PDF_NAME('Contents'))
    stream_obj = mupdf.pdf_add_stream(pdf, newcont, mupdf.PdfObj(), 0)
    stream_xref = mupdf.pdf_to_num(stream_obj)

    if not mupdf.pdf_is_array(old_contents):
        # Build a fresh array; overlay puts the new stream last.
        new_array = mupdf.pdf_new_array(pdf, 5)
        if overlay:
            sequence = (old_contents, stream_obj)
        else:
            sequence = (stream_obj, old_contents)
        for item in sequence:
            # The old object is only pushed if it actually exists.
            if item is stream_obj or item.m_internal:
                mupdf.pdf_array_push(new_array, item)
        mupdf.pdf_dict_put(pageref, PDF_NAME('Contents'), new_array)
        return stream_xref

    if overlay:
        mupdf.pdf_array_push(old_contents, stream_obj)        # append
    else:
        mupdf.pdf_array_insert(old_contents, stream_obj, 0)   # prepend
    return stream_xref
def JM_irect_from_py(r):
    '''
    Convert a rectangle-like object to mupdf.FzIrect.

    Accepts mupdf.FzIrect (returned unchanged), IRect, Rect, mupdf.FzRect or
    a sequence of 4 numbers. Anything else, or a sequence containing None,
    yields the infinite irect. Sequence items are clamped to the range
    FZ_MIN_INF_RECT .. FZ_MAX_INF_RECT.
    '''
    if isinstance(r, mupdf.FzIrect):
        return r
    if isinstance(r, IRect):
        return mupdf.FzIrect( r.x0, r.y0, r.x1, r.y1)
    if isinstance(r, Rect):
        as_rect = mupdf.FzRect(r.x0, r.y0, r.x1, r.y1)
        return mupdf.FzIrect(as_rect)   # Uses fz_irect_from_rect().
    if isinstance(r, mupdf.FzRect):
        return mupdf.FzIrect(r)         # Uses fz_irect_from_rect().

    if not r or not PySequence_Check(r) or PySequence_Size(r) != 4:
        return mupdf.FzIrect(mupdf.fz_infinite_irect)

    coords = []
    for i in range(4):
        value = r[i]
        if value is None:
            return mupdf.FzIrect(mupdf.fz_infinite_irect)
        if value < FZ_MIN_INF_RECT:
            value = FZ_MIN_INF_RECT
        if value > FZ_MAX_INF_RECT:
            value = FZ_MAX_INF_RECT
        coords.append(value)
    return mupdf.fz_make_irect(*coords)
def JM_make_spanlist(line_dict, line, raw, buff, tp_rect):
    '''
    Build the list of span dicts for one text line.

    Consecutive characters are grouped into a span as long as size, flags,
    char flags (MuPDF >= 1.25.2), argb, font and bidi stay the same.
    Characters whose bbox does not overlap tp_rect are skipped, unless
    tp_rect is infinite.

    Args:
        line_dict: dict that receives the span list under dictkey_spans.
        line: iterable of characters of the line.
        raw: if true, each span gets a list of per-character dicts,
            otherwise the span text.
        buff: scratch buffer used to accumulate span text; it is cleared.
        tp_rect: clip rectangle of the text page.

    Returns:
        The union of the rectangles of the spans that were added.
    '''
    if g_use_extra:
        return extra.JM_make_spanlist(line_dict, line, raw, buff, tp_rect)
    char_list = None
    span_list = []
    mupdf.fz_clear_buffer(buff)
    span_rect = mupdf.FzRect(mupdf.FzRect.Fixed_EMPTY)
    line_rect = mupdf.FzRect(mupdf.FzRect.Fixed_EMPTY)

    class char_style:
        # Snapshot of the properties that define a span.
        def __init__(self, rhs=None):
            if rhs:
                self.size = rhs.size
                self.flags = rhs.flags
                if mupdf_version_tuple >= (1, 25, 2):
                    self.char_flags = rhs.char_flags
                self.font = rhs.font
                self.argb = rhs.argb
                self.asc = rhs.asc
                self.desc = rhs.desc
                self.bidi = rhs.bidi
            else:
                # size < 0 marks "no span started yet".
                self.size = -1
                self.flags = -1
                if mupdf_version_tuple >= (1, 25, 2):
                    self.char_flags = -1
                self.font = ''
                self.argb = -1
                self.asc = 0
                self.desc = 0
                self.bidi = 0

        def __str__(self):
            ret = f'{self.size} {self.flags}'
            if mupdf_version_tuple >= (1, 25, 2):
                ret += f' {self.char_flags}'
            # The colour attribute is called 'argb'; there is no 'color'.
            ret += f' {self.font} {self.argb} {self.asc} {self.desc}'
            return ret

    old_style = char_style()
    style = char_style()
    span = None
    span_origin = None

    for ch in line:
        # start-trace
        r = JM_char_bbox(line, ch)
        if (not JM_rects_overlap(tp_rect, r)
                and not mupdf.fz_is_infinite_rect(tp_rect)
                ):
            continue

        # Info from:
        # detect_super_script()
        # fz_font_is_italic()
        # fz_font_is_serif()
        # fz_font_is_monospaced()
        # fz_font_is_bold()

        flags = JM_char_font_flags(mupdf.FzFont(mupdf.ll_fz_keep_font(ch.m_internal.font)), line, ch)
        origin = mupdf.FzPoint(ch.m_internal.origin)
        style.size = ch.m_internal.size
        style.flags = flags
        if mupdf_version_tuple >= (1, 25, 2):
            # FZ_STEXT_SYNTHETIC is per-char, not per-span.
            style.char_flags = ch.m_internal.flags & ~mupdf.FZ_STEXT_SYNTHETIC
        style.font = JM_font_name(mupdf.FzFont(mupdf.ll_fz_keep_font(ch.m_internal.font)))
        style.argb = ch.m_internal.argb
        style.asc = JM_font_ascender(mupdf.FzFont(mupdf.ll_fz_keep_font(ch.m_internal.font)))
        style.desc = JM_font_descender(mupdf.FzFont(mupdf.ll_fz_keep_font(ch.m_internal.font)))
        style.bidi = ch.m_internal.bidi

        if (style.size != old_style.size
                or style.flags != old_style.flags
                or (mupdf_version_tuple >= (1, 25, 2)
                    and (style.char_flags != old_style.char_flags)
                    )
                or style.argb != old_style.argb
                or style.font != old_style.font
                or style.bidi != old_style.bidi
                ):
            if old_style.size >= 0:
                # not first one, output previous
                if raw:
                    # put character list in the span
                    span[dictkey_chars] = char_list
                    char_list = None
                else:
                    # put text string in the span
                    span[dictkey_text] = JM_EscapeStrFromBuffer( buff)
                    mupdf.fz_clear_buffer(buff)

                span[dictkey_origin] = JM_py_from_point(span_origin)
                span[dictkey_bbox] = JM_py_from_rect(span_rect)
                line_rect = mupdf.fz_union_rect(line_rect, span_rect)
                span_list.append( span)
                span = None

            span = dict()
            asc = style.asc
            desc = style.desc
            if style.asc < 1e-3:
                # Fall back to fixed values when the ascender is (nearly)
                # zero or negative.
                asc = 0.9
                desc = -0.1

            span[dictkey_size] = style.size
            span[dictkey_flags] = style.flags
            span[dictkey_bidi] = style.bidi
            if mupdf_version_tuple >= (1, 25, 2):
                span[dictkey_char_flags] = style.char_flags
            span[dictkey_font] = JM_EscapeStrFromStr(style.font)
            span[dictkey_color] = style.argb & 0xffffff
            if mupdf_version_tuple >= (1, 25, 0):
                span['alpha'] = style.argb >> 24
            span["ascender"] = asc
            span["descender"] = desc

            # Need to be careful here - doing 'old_style=style' does a shallow
            # copy, but we need to keep old_style as a distinct instance.
            old_style = char_style(style)
            span_rect = r
            span_origin = origin

        span_rect = mupdf.fz_union_rect(span_rect, r)

        if raw:
            # make and append a char dict
            char_dict = dict()
            char_dict[dictkey_origin] = JM_py_from_point( ch.m_internal.origin)
            char_dict[dictkey_bbox] = JM_py_from_rect(r)
            char_dict[dictkey_c] = chr(ch.m_internal.c)
            char_dict['synthetic'] = bool(ch.m_internal.flags & mupdf.FZ_STEXT_SYNTHETIC)

            if char_list is None:
                char_list = []
            char_list.append(char_dict)
        else:
            # add character byte to buffer
            JM_append_rune(buff, ch.m_internal.c)

    # all characters processed, now flush remaining span
    if span:
        if raw:
            span[dictkey_chars] = char_list
            char_list = None
        else:
            span[dictkey_text] = JM_EscapeStrFromBuffer(buff)
            mupdf.fz_clear_buffer(buff)
        span[dictkey_origin] = JM_py_from_point(span_origin)
        span[dictkey_bbox] = JM_py_from_rect(span_rect)

        if not mupdf.fz_is_empty_rect(span_rect):
            span_list.append(span)
            line_rect = mupdf.fz_union_rect(line_rect, span_rect)
        span = None

    # The span list is stored whether or not line_rect is empty; both
    # branches of the former if/else did the same assignment.
    line_dict[dictkey_spans] = span_list
    return line_rect
def JM_make_image_block(block, block_dict):
    '''
    Fill block_dict with the data of an image block.

    The image properties are added by _make_image_dict(). In addition,
    "mask" is set to the image mask as a PNG buffer (None if the image has
    no mask), and the block's transformation matrix is stored under
    dictkey_matrix.
    '''
    image = block.i_image()
    _make_image_dict(image, block_dict)

    mask_png = None
    mask = image.mask()
    if mask.m_internal:
        # the image has a mask: convert it to PNG
        color_params = mupdf.FzColorParams(mupdf.fz_default_color_params)
        mask_png = mask.fz_new_buffer_from_image_as_png(color_params).fz_buffer_extract()
    block_dict["mask"] = mask_png

    block_dict[dictkey_matrix] = JM_py_from_matrix(block.i_transform())
def JM_matrix_from_py(m):
    '''
    Convert a matrix-like object to mupdf.FzMatrix.

    Args:
        m: a mupdf.FzMatrix (returned unchanged), a Matrix, or a sequence of
            6 numbers.

    Returns:
        A mupdf.FzMatrix. A default-constructed mupdf.FzMatrix() is returned
        if m is empty, is not a sequence of length 6, or contains an item
        for which JM_FLOAT_ITEM() yields None.
    '''
    if isinstance(m, mupdf.FzMatrix):
        return m
    if isinstance(m, Matrix):
        return mupdf.FzMatrix(m.a, m.b, m.c, m.d, m.e, m.f)
    if not m or not PySequence_Check(m) or PySequence_Size(m) != 6:
        return mupdf.FzMatrix()
    a = []
    for i in range(6):
        value = JM_FLOAT_ITEM(m, i)
        if value is None:
            # Same fallback as for any other invalid input. This used to
            # return an FzRect, i.e. an object of the wrong type.
            return mupdf.FzMatrix()
        a.append(value)
    return mupdf.FzMatrix(a[0], a[1], a[2], a[3], a[4], a[5])
def JM_merge_range(
        doc_des,
        doc_src,
        spage,
        epage,
        apage,
        rotate,
        links,
        annots,
        show_progress,
        graft_map,
        ):
    '''
    Copy the page range (spage, epage) of a source PDF into the target PDF,
    starting at target location apage.

    Pages are copied in ascending order if spage < epage, otherwise in
    descending order, so the source sequence is reversed if spage > epage.
    If show_progress > 0, a message is emitted after every show_progress
    copied pages.
    '''
    if g_use_extra:
        return extra.JM_merge_range(
                doc_des,
                doc_src,
                spage,
                epage,
                apage,
                rotate,
                links,
                annots,
                show_progress,
                graft_map,
                )
    total = mupdf.fz_absi(epage - spage) + 1    # total pages to copy
    step = 1 if spage < epage else -1           # direction through the source
    source_pages = range(spage, epage + step, step)
    # 'counter' is the number of pages copied so far; the target position
    # always moves forward, whatever the source direction is.
    for counter, page in enumerate(source_pages, start=1):
        page_merge(doc_des, doc_src, page, apage + counter - 1, rotate, links, annots, graft_map)
        if show_progress > 0 and counter % show_progress == 0:
            message(f"Inserted {counter} of {total} pages.")
Returns the next available numbers n, m for objects /Alp<n>, /F<m>. ''' # page objects /Resources, /Resources/ExtGState, /Resources/Font resources = mupdf.pdf_dict_get(page.obj(), PDF_NAME('Resources')) if not resources.m_internal: resources = mupdf.pdf_dict_put_dict(page.obj(), PDF_NAME('Resources'), 5) main_extg = mupdf.pdf_dict_get(resources, PDF_NAME('ExtGState')) main_fonts = mupdf.pdf_dict_get(resources, PDF_NAME('Font')) # text pdf device objects /ExtGState, /Font temp_extg = mupdf.pdf_dict_get(temp_res, PDF_NAME('ExtGState')) temp_fonts = mupdf.pdf_dict_get(temp_res, PDF_NAME('Font')) max_alp = -1 max_fonts = -1 # Handle /Alp objects if mupdf.pdf_is_dict(temp_extg): # any created at all? n = mupdf.pdf_dict_len(temp_extg) if mupdf.pdf_is_dict(main_extg): # does page have /ExtGState yet? for i in range(mupdf.pdf_dict_len(main_extg)): # get highest number of objects named /Alpxxx alp = mupdf.pdf_to_name( mupdf.pdf_dict_get_key(main_extg, i)) if not alp.startswith('Alp'): continue j = mupdf.fz_atoi(alp[3:]) if j > max_alp: max_alp = j else: # create a /ExtGState for the page main_extg = mupdf.pdf_dict_put_dict(resources, PDF_NAME('ExtGState'), n) max_alp += 1 for i in range(n): # copy over renumbered /Alp objects alp = mupdf.pdf_to_name( mupdf.pdf_dict_get_key( temp_extg, i)) j = mupdf.fz_atoi(alp[3:]) + max_alp text = f'Alp{j}' val = mupdf.pdf_dict_get_val( temp_extg, i) mupdf.pdf_dict_puts(main_extg, text, val) if mupdf.pdf_is_dict(main_fonts): # has page any fonts yet? 
def JM_new_buffer_from_stext_page(page):
    '''
    Make a buffer from the text of an stext page.

    Only text blocks are used. A character is appended if its bbox overlaps
    the page's mediabox or if that rectangle is infinite. A newline byte is
    appended after every line and once more after every text block.
    '''
    assert isinstance(page, mupdf.FzStextPage)
    clip = mupdf.FzRect(page.m_internal.mediabox)
    out = mupdf.fz_new_buffer(256)
    for block in page:
        if block.m_internal.type != mupdf.FZ_STEXT_BLOCK_TEXT:
            continue
        for line in block:
            for ch in line:
                if (JM_rects_overlap(clip, JM_char_bbox(line, ch))
                        or mupdf.fz_is_infinite_rect(clip)
                        ):
                    mupdf.fz_append_rune(out, ch.m_internal.c)
            mupdf.fz_append_byte(out, ord('\n'))
        mupdf.fz_append_byte(out, ord('\n'))
    return out
def JM_norm_rotation(rotate):
    '''
    Return the normalized /Rotate value: one of 0, 90, 180, 270.

    Args:
        rotate: rotation angle in degrees; may be negative or >= 360.

    Returns:
        The angle reduced to the range 0 <= angle < 360 if that is a
        multiple of 90, else 0.
    '''
    # Reduce in one step. Adding / subtracting 360 in a loop takes time
    # proportional to the magnitude of the value and never ends for an
    # infinite one.
    rotate %= 360
    if rotate >= 360:
        # Float round-off can make x % 360 come out as exactly 360.0 for a
        # tiny negative x.
        rotate -= 360
    if rotate % 90 != 0:
        return 0
    return rotate
def JM_page_rotation(page):
    '''
    Return a PDF page's /Rotate value: one of (0, 90, 180, 270).

    The (inheritable) /Rotate entry of the page object is read as an int and
    passed through JM_norm_rotation().
    '''
    rotate_obj = mupdf.pdf_dict_get_inheritable(
            page.obj(),
            mupdf.PDF_ENUM_NAME_Rotate,
            )
    return JM_norm_rotation(mupdf.pdf_to_int(rotate_obj))
def JM_pixmap_from_display_list(
        list_,
        ctm,
        cs,
        alpha,
        clip,
        seps,
        ):
    '''
    Version of fz_new_pixmap_from_display_list (util.c) to also support
    rendering of only the 'clip' part of the displaylist rectangle.

    Args:
        list_: the `mupdf.FzDisplayList` to render.
        ctm: matrix-like, converted with JM_matrix_from_py().
        cs: the `mupdf.FzColorspace` of the new pixmap.
        alpha: passed to fz_new_pixmap_with_bbox(); if true the pixmap is
            cleared with fz_clear_pixmap(), otherwise filled with 0xFF.
        clip: rect-like, converted with JM_rect_from_py().
        seps: `mupdf.FzSeparations` or None (a fresh instance is used then).

    Returns:
        A `Pixmap` built with the 'raw' constructor around the rendered
        MuPDF pixmap.
    '''
    assert isinstance(list_, mupdf.FzDisplayList)
    if seps is None:
        seps = mupdf.FzSeparations()
    assert seps is None or isinstance(seps, mupdf.FzSeparations), f'{type(seps)=}: {seps}'

    transform = JM_matrix_from_py(ctm)
    clip_rect = JM_rect_from_py(clip)

    # Bounds of the display list, cut down to the clip (no-op if clip is not
    # given), then moved into device space and rounded to integer pixels.
    bounds = mupdf.fz_intersect_rect(mupdf.fz_bound_display_list(list_), clip_rect)
    bounds = mupdf.fz_transform_rect(bounds, transform)
    pixel_box = mupdf.fz_round_rect(bounds)

    assert isinstance( cs, mupdf.FzColorspace)
    pix = mupdf.fz_new_pixmap_with_bbox(cs, pixel_box, seps, alpha)
    if not alpha:
        mupdf.fz_clear_pixmap_with_value(pix, 0xFF)
    else:
        mupdf.fz_clear_pixmap(pix)

    if mupdf.fz_is_infinite_rect(clip_rect):
        # No clip: render everything through a plain draw device.
        dev = mupdf.fz_new_draw_device(transform, pix)
        scissor = mupdf.FzRect(mupdf.FzRect.Fixed_INFINITE)
    else:
        # Clip given: restrict the device to the pixmap's bbox.
        dev = mupdf.fz_new_draw_device_with_bbox(transform, pix, pixel_box)
        scissor = clip_rect
    mupdf.fz_run_display_list(list_, dev, mupdf.FzMatrix(), scissor, mupdf.FzCookie())
    mupdf.fz_close_device(dev)

    # Use special raw Pixmap constructor so we don't set alpha to true.
    return Pixmap( 'raw', pix)
def JM_print_stext_page_as_text(res, page):
    '''
    Plain text output of an stext page, appended to buffer `res`.

    Modelled on fz_print_stext_page_as_text. For every line of every text
    block, the characters whose bbox overlaps the page's mediabox (all
    characters if the mediabox is infinite) are appended to `res`. After each
    line a newline is appended, unless no character was emitted for that line
    or the last emitted character already was a newline (code point 10).

    When `g_use_extra` is set, the work is delegated to
    `extra.JM_print_stext_page_as_text()` and its result is returned.

    Args:
        res: a `mupdf.FzBuffer` receiving the text.
        page: a `mupdf.FzStextPage`.
    '''
    if 1 and g_use_extra:
        return extra.JM_print_stext_page_as_text(res, page)

    assert isinstance(res, mupdf.FzBuffer)
    assert isinstance(page, mupdf.FzStextPage)
    rect = mupdf.FzRect(page.m_internal.mediabox)
    # The former block/line/char counting pre-pass was removed: its counters
    # were never read, it iterated the whole page a second time, and it added
    # up enumerate() loop variables that are unbound for an empty line or
    # block.

    # `rect` is fixed for the whole page, so test it once.
    unbounded = mupdf.fz_is_infinite_rect(rect)

    for block in page:
        if block.m_internal.type != mupdf.FZ_STEXT_BLOCK_TEXT:
            continue
        for line in block:
            last_char = 0
            for ch in line:
                # The character bbox is only needed for the overlap test.
                if unbounded or JM_rects_overlap(rect, JM_char_bbox(line, ch)):
                    last_char = ch.m_internal.c
                    JM_append_rune(res, last_char)
            # Terminate the line unless nothing was emitted or it already
            # ends with a newline.
            if last_char != 10 and last_char > 0:
                mupdf.fz_append_string(res, "\n")
def JM_quad_from_py(r):
    '''
    Convert a Python object to a MuPDF quad.

    Accepted inputs, checked in this order:
      * `mupdf.FzQuad`: returned unchanged.
      * any indexable of length 4 whose first item has `__float__`: taken as
        rectangle coordinates.
      * `mupdf.FzRect`: converted with fz_quad_from_rect().
      * `Quad`: converted corner by corner.
      * tuple / list of four 2-item tuples / lists, in the order
        ul, ur, ll, lr; coordinates are clamped to
        [FZ_MIN_INF_RECT, FZ_MAX_INF_RECT].

    Anything else yields an all-zero quad.
    '''
    if isinstance(r, mupdf.FzQuad):
        return r
    # cover all cases of 4-float-sequences
    if hasattr(r, "__getitem__") and len(r) == 4 and hasattr(r[0], "__float__"):
        r = mupdf.FzRect(*tuple(r))
    if isinstance( r, mupdf.FzRect):
        return mupdf.fz_quad_from_rect( r)
    if isinstance( r, Quad):
        return mupdf.fz_make_quad(
                r.ul.x, r.ul.y,
                r.ur.x, r.ur.y,
                r.ll.x, r.ll.y,
                r.lr.x, r.lr.y,
                )
    q = mupdf.fz_make_quad(0, 0, 0, 0, 0, 0, 0, 0)   # result for invalid input
    if not r or not isinstance(r, (tuple, list)) or len(r) != 4:
        return q

    # NOTE(review): this test is kept exactly as it was. It looks like it may
    # be inverted relative to the rectangle fallback it guards - confirm
    # against the definition of JM_FLOAT_ITEM.
    if JM_FLOAT_ITEM(r, 0) is None:
        return mupdf.fz_quad_from_rect(JM_rect_from_py(r))

    # Collect the eight clamped coordinates and build the quad in one go.
    # (The previous code did `p = [0,0,0,0]` and then assigned `p[i].x`, i.e.
    # set attributes on plain ints, which raises AttributeError.)
    coords = []
    for obj in r:   # next point item
        if not PySequence_Check(obj) or PySequence_Size(obj) != 2:
            return q    # invalid: cancel the rest
        x = JM_FLOAT_ITEM(obj, 0)
        y = JM_FLOAT_ITEM(obj, 1)
        if x is None or y is None:
            return q
        coords.append(min(max(x, FZ_MIN_INF_RECT), FZ_MAX_INF_RECT))
        coords.append(min(max(y, FZ_MIN_INF_RECT), FZ_MAX_INF_RECT))
    # Same argument order as in the Quad branch above: ul, ur, ll, lr.
    return mupdf.fz_make_quad(*coords)
def JM_rotate_page_matrix(page):
    '''
    Calculate the matrix for a page's rotation.

    Returns a default-constructed `mupdf.FzMatrix()` when `page` has no
    internal object or its rotation is 0. Otherwise the matrix is built from
    the rotation (90, 180, anything else) and the cropbox size obtained via
    JM_cropbox_size().
    '''
    if not page.m_internal:
        return mupdf.FzMatrix()     # no valid pdf page given
    rotation = JM_page_rotation(page)
    if rotation == 0:
        return mupdf.FzMatrix()     # no rotation

    size = JM_cropbox_size(page.obj())
    width, height = size.x, size.y
    # Matrix coefficients (a, b, c, d, e, f) per rotation; the fallback entry
    # serves every rotation other than 90 and 180.
    coefficients = {
            90: (0, 1, -1, 0, height, 0),
            180: (-1, 0, 0, -1, width, height),
            }.get(rotation, (0, -1, 1, 0, 0, width))
    return mupdf.fz_make_matrix(*coefficients)
def JM_scan_resources(pdf, rsrc, liste, what, stream_xref, tracer):
    '''
    Step through /Resources, looking up image, xobject or font information.

    Args:
        pdf: the PDF document, handed on to the JM_gather_*() helpers.
        rsrc: the /Resources object to scan. It is marked while being
            scanned and always unmarked on exit.
        liste: passed to the JM_gather_*() helpers together with
            `stream_xref`.
        what: 1 = fonts, 2 = images, 3 = form xobjects; any other value
            makes the function return without gathering anything.
        stream_xref: xref passed on to the JM_gather_*() helpers.
        tracer: list of xrefs already descended into; extended here and used
            to stop on circular references.
    '''
    circular = 'Circular dependencies! Consider page cleaning.'
    if mupdf.pdf_mark_obj(rsrc):
        mupdf.fz_warn(circular)
        return  # Circular dependencies!
    try:
        xobjects = mupdf.pdf_dict_get(rsrc, mupdf.PDF_ENUM_NAME_XObject)

        if what == 1:       # lookup fonts
            fonts = mupdf.pdf_dict_get(rsrc, mupdf.PDF_ENUM_NAME_Font)
            JM_gather_fonts(pdf, fonts, liste, stream_xref)
        elif what == 2:     # look up images
            JM_gather_images(pdf, xobjects, liste, stream_xref)
        elif what == 3:     # look up form xobjects
            JM_gather_forms(pdf, xobjects, liste, stream_xref)
        else:               # should never happen
            return

        # check if we need to recurse into Form XObjects
        count = mupdf.pdf_dict_len(xobjects)
        for idx in range(count):
            child = mupdf.pdf_dict_get_val(xobjects, idx)
            child_xref = mupdf.pdf_to_num(child) if mupdf.pdf_is_stream(child) else 0
            child_rsrc = mupdf.pdf_dict_get(child, mupdf.PDF_ENUM_NAME_Resources)
            if not child_rsrc.m_internal:
                continue    # child has no resources of its own
            if child_xref in tracer:
                # this xref was descended into before
                mupdf.fz_warn(circular)
                return
            tracer.append(child_xref)
            JM_scan_resources( pdf, child_rsrc, liste, what, child_xref, tracer)
    finally:
        mupdf.pdf_unmark_obj(rsrc)
def JM_set_object_value(obj, key, value):
    '''
    Set a PDF dict key to some value.

    A placeholder string is stored under `key` (a '/'-separated key path),
    `obj` is printed to text, the first occurrence of the placeholder entry
    is replaced by `value`, and the text is parsed back into a PDF object.

    Args:
        obj: the PDF dictionary object to modify.
        key: key path inside `obj`, sub-keys separated by '/'.
        value: text inserted verbatim as the new value of the last sub-key.

    Returns:
        The PDF object parsed from the modified text.

    Raises:
        Exception: if `key` does not exist yet and one of its parent paths
            is an indirect reference, or if the placeholder cannot be
            stored / read back.
    '''
    eyecatcher = "fitz: replace me!"
    pdf = mupdf.pdf_get_bound_document(obj)
    # split PDF key at path seps and take last key part
    *parents, skey = key.split('/')

    testkey = mupdf.pdf_dict_getp(obj, key)     # check if key already exists
    if not testkey.m_internal:
        # No, it will be created here. But we cannot allow this happening if
        # indirect objects are referenced. So we check all higher level
        # sub-paths for indirect references, longest path first.
        for depth in range(len(parents), 0, -1):
            subpath = '/'.join(parents[:depth])
            if mupdf.pdf_is_indirect(mupdf.pdf_dict_getp(obj, JM_StrAsChar(subpath))):
                # The message is formatted here. It used to be passed as a
                # ("...%s...", arg) pair, which Exception never interpolates.
                raise Exception(f"path to '{JM_StrAsChar(skey)}' has indirects")

    # Insert our eyecatcher. Will create all sub-paths in the chain, or
    # respectively remove old value of key-path.
    mupdf.pdf_dict_putp(obj, key, mupdf.pdf_new_text_string(eyecatcher))
    testkey = mupdf.pdf_dict_getp(obj, key)
    if not mupdf.pdf_is_string(testkey):
        raise Exception(f"cannot insert value for '{key}'")
    if mupdf.pdf_to_text_string(testkey) != eyecatcher:
        raise Exception(f"cannot insert value for '{key}'")

    # read the result as a string
    objstr = JM_EscapeStrFromBuffer(JM_object_to_buffer(obj, 1, 0))

    # replace 'eyecatcher' by desired 'value'
    nullval = "/%s(%s)" % ( skey, eyecatcher)
    newval = "/%s %s" % (skey, value)
    newstr = objstr.replace(nullval, newval, 1)

    # make PDF object from resulting string
    return JM_pdf_obj_from_str(pdf, newstr)
def JM_set_widget_properties(annot, Widget):
    '''
    Update the PDF form field with the properties from a Python Widget object.
    Called by "Page.add_widget" and "Annot.update_widget".

    Args:
        annot: an `Annot` (its `.this` is used) or a `mupdf.PdfAnnot`. It
            must be bound to a page.
        Widget: object carrying the field properties. Every property is read
            with `getattr(Widget, name, None)`, so a missing attribute is
            treated like None.

    The properties are written in a fixed order: rectangle, colors and
    border, label and name, type-specific entries, /DA string, field flags,
    button caption, scripts and finally the field value. Afterwards the
    annotation is marked dirty and updated.
    '''
    if isinstance( annot, Annot):
        annot = annot.this
    assert isinstance( annot, mupdf.PdfAnnot), f'{type(annot)=} {type=}'
    page = _pdf_annot_page(annot)
    assert page.m_internal, 'Annot is not bound to a page'
    annot_obj = mupdf.pdf_annot_obj(annot)
    pdf = page.doc()

    # Read a Widget attribute; None if the attribute does not exist.
    def GETATTR(name):
        return getattr(Widget, name, None)

    value = GETATTR("field_type")
    field_type = value

    # rectangle --------------------------------------------------------------
    # The Widget rect is transformed with the page rotation matrix before it
    # is stored.
    value = GETATTR("rect")
    rect = JM_rect_from_py(value)
    rot_mat = JM_rotate_page_matrix(page)
    rect = mupdf.fz_transform_rect(rect, rot_mat)
    mupdf.pdf_set_annot_rect(annot, rect)

    # fill color -------------------------------------------------------------
    # Only written for a non-empty tuple / list.
    value = GETATTR("fill_color")
    if value and PySequence_Check(value):
        n = len(value)
        fill_col = mupdf.pdf_new_array(pdf, n)
        col = 0
        for i in range(n):
            col = value[i]
            mupdf.pdf_array_push_real(fill_col, col)
        mupdf.pdf_field_set_fill_color(annot_obj, fill_col)

    # dashes -----------------------------------------------------------------
    # Stored as an int array under /BS/D.
    value = GETATTR("border_dashes")
    if value and PySequence_Check(value):
        n = len(value)
        dashes = mupdf.pdf_new_array(pdf, n)
        for i in range(n):
            mupdf.pdf_array_push_int(dashes, value[i])
        mupdf.pdf_dict_putl(annot_obj, dashes, PDF_NAME('BS'), PDF_NAME('D'))

    # border color -----------------------------------------------------------
    # Stored as a real array under /MK/BC.
    value = GETATTR("border_color")
    if value and PySequence_Check(value):
        n = len(value)
        border_col = mupdf.pdf_new_array(pdf, n)
        col = 0
        for i in range(n):
            col = value[i]
            mupdf.pdf_array_push_real(border_col, col)
        mupdf.pdf_dict_putl(annot_obj, border_col, PDF_NAME('MK'), PDF_NAME('BC'))

    # entry ignored - may be used later
    #
    #int text_format = (int) PyInt_AsLong(GETATTR("text_format"));
    #

    # field label ------------------------------------------------------------
    value = GETATTR("field_label")
    if value is not None:
        label = JM_StrAsChar(value)
        mupdf.pdf_dict_put_text_string(annot_obj, PDF_NAME('TU'), label)

    # field name -------------------------------------------------------------
    # /T is only rewritten when the name differs from the loaded field name.
    value = GETATTR("field_name")
    if value is not None:
        name = JM_StrAsChar(value)
        old_name = mupdf.pdf_load_field_name(annot_obj)
        if name != old_name:
            mupdf.pdf_dict_put_text_string(annot_obj, PDF_NAME('T'), name)

    # max text len -----------------------------------------------------------
    # Text fields only; a falsy value (None, 0) writes nothing.
    if field_type == mupdf.PDF_WIDGET_TYPE_TEXT:
        value = GETATTR("text_maxlen")
        text_maxlen = value
        if text_maxlen:
            mupdf.pdf_dict_put_int(annot_obj, PDF_NAME('MaxLen'), text_maxlen)
    value = GETATTR("field_display")
    d = value
    mupdf.pdf_field_set_display(annot_obj, d)

    # choice values ----------------------------------------------------------
    # List boxes and combo boxes only.
    if field_type in (mupdf.PDF_WIDGET_TYPE_LISTBOX, mupdf.PDF_WIDGET_TYPE_COMBOBOX):
        value = GETATTR("choice_values")
        JM_set_choice_options(annot, value)

    # border style -----------------------------------------------------------
    # Stored under /BS/S.
    value = GETATTR("border_style")
    val = JM_get_border_style(value)
    mupdf.pdf_dict_putl(annot_obj, val, PDF_NAME('BS'), PDF_NAME('S'))

    # border width -----------------------------------------------------------
    # Stored under /BS/W.
    value = GETATTR("border_width")
    border_width = value
    mupdf.pdf_dict_putl(
            annot_obj,
            mupdf.pdf_new_real(border_width),
            PDF_NAME('BS'),
            PDF_NAME('W'),
            )

    # /DA string -------------------------------------------------------------
    value = GETATTR("_text_da")
    da = JM_StrAsChar(value)
    mupdf.pdf_dict_put_text_string(annot_obj, PDF_NAME('DA'), da)
    mupdf.pdf_dict_del(annot_obj, PDF_NAME('DS'))  # not supported by MuPDF
    mupdf.pdf_dict_del(annot_obj, PDF_NAME('RC'))  # not supported by MuPDF

    # field flags ------------------------------------------------------------
    # The flag bit implied by the field type is OR-ed in before /Ff is
    # written; nothing is written when the Widget has no field_flags.
    field_flags = GETATTR("field_flags")
    if field_flags is not None:
        if field_type == mupdf.PDF_WIDGET_TYPE_COMBOBOX:
            field_flags |= mupdf.PDF_CH_FIELD_IS_COMBO
        elif field_type == mupdf.PDF_WIDGET_TYPE_RADIOBUTTON:
            field_flags |= mupdf.PDF_BTN_FIELD_IS_RADIO
        elif field_type == mupdf.PDF_WIDGET_TYPE_BUTTON:
            field_flags |= mupdf.PDF_BTN_FIELD_IS_PUSHBUTTON
        mupdf.pdf_dict_put_int( annot_obj, PDF_NAME('Ff'), field_flags)

    # button caption ---------------------------------------------------------
    value = GETATTR("button_caption")
    ca = JM_StrAsChar(value)
    if ca:
        mupdf.pdf_field_set_button_caption(annot_obj, ca)

    # script (/A) ------------------------------------------------------------
    # An empty PdfObj as second key means: store directly under /A.
    value = GETATTR("script")
    JM_put_script(annot_obj, PDF_NAME('A'), mupdf.PdfObj(), value)

    # script (/AA/K) ---------------------------------------------------------
    value = GETATTR("script_stroke")
    JM_put_script(annot_obj, PDF_NAME('AA'), PDF_NAME('K'), value)

    # script (/AA/F) ---------------------------------------------------------
    value = GETATTR("script_format")
    JM_put_script(annot_obj, PDF_NAME('AA'), PDF_NAME('F'), value)

    # script (/AA/V) ---------------------------------------------------------
    value = GETATTR("script_change")
    JM_put_script(annot_obj, PDF_NAME('AA'), PDF_NAME('V'), value)

    # script (/AA/C) ---------------------------------------------------------
    value = GETATTR("script_calc")
    JM_put_script(annot_obj, PDF_NAME('AA'), PDF_NAME('C'), value)

    # script (/AA/Bl) --------------------------------------------------------
    # This key is built with pdf_new_name() instead of PDF_NAME().
    value = GETATTR("script_blur")
    JM_put_script(annot_obj, PDF_NAME('AA'), mupdf.pdf_new_name('Bl'), value)

    # script (/AA/Fo) codespell:ignore --------------------------------------
    value = GETATTR("script_focus")
    JM_put_script(annot_obj, PDF_NAME('AA'), mupdf.pdf_new_name('Fo'), value)

    # field value ------------------------------------------------------------
    value = GETATTR("field_value")  # field value
    text = JM_StrAsChar(value)  # convert to text (may fail!)
    if field_type == mupdf.PDF_WIDGET_TYPE_RADIOBUTTON:
        if not value:
            # falsy value: switch the button off
            mupdf.pdf_set_field_value(pdf, annot_obj, "Off", 1)
            mupdf.pdf_dict_put_name(annot_obj, PDF_NAME('AS'), "Off")
        else:
            # TODO check if another button in the group is ON and if so set it Off
            onstate = mupdf.pdf_button_field_on_state(annot_obj)
            if onstate.m_internal:
                on = mupdf.pdf_to_name(onstate)
                mupdf.pdf_set_field_value(pdf, annot_obj, on, 1)
                mupdf.pdf_dict_put_name(annot_obj, PDF_NAME('AS'), on)
            elif text:
                # no on-state available: use the given text as /AS name
                mupdf.pdf_dict_put_name(annot_obj, PDF_NAME('AS'), text)
    elif field_type == mupdf.PDF_WIDGET_TYPE_CHECKBOX:
        onstate = mupdf.pdf_button_field_on_state(annot_obj)
        on = onstate.pdf_to_name()
        # checked if value is True, equals the on-state name, or text is 'Yes'
        if value in (True, on) or text == 'Yes':
            mupdf.pdf_set_field_value(pdf, annot_obj, on, 1)
            mupdf.pdf_dict_put_name(annot_obj, PDF_NAME('AS'), on)
            mupdf.pdf_dict_put_name(annot_obj, PDF_NAME('V'), on)
        else:
            mupdf.pdf_dict_put_name( annot_obj, PDF_NAME('AS'), 'Off')
            mupdf.pdf_dict_put_name( annot_obj, PDF_NAME('V'), 'Off')
    else:
        # all other field types: store the text, if there is any
        if text:
            mupdf.pdf_set_field_value(pdf, annot_obj, text, 1)
            if field_type in (mupdf.PDF_WIDGET_TYPE_COMBOBOX, mupdf.PDF_WIDGET_TYPE_LISTBOX):
                mupdf.pdf_dict_del(annot_obj, PDF_NAME('I'))

    # Mark the annotation as changed and update it.
    mupdf.pdf_dirty_annot(annot)
    mupdf.pdf_set_annot_hot(annot, 1)
    mupdf.pdf_set_annot_active(annot, 1)
    mupdf.pdf_update_annot(annot)
def JM_update_stream(doc, obj, buffer_, compress):
    '''
    Update a stream object; compress the stream when beneficial.

    The compressed form is stored (with /Filter set to FlateDecode) only if
    `compress` is true, the buffer holds more than 30 bytes, compression
    delivered a buffer, and that buffer is smaller than the original.
    In every other case `buffer_` is stored unchanged.
    '''
    def store_uncompressed():
        mupdf.pdf_update_stream(doc, obj, buffer_, 0)

    if not compress:
        return store_uncompressed()

    raw_size, _ = mupdf.fz_buffer_storage(buffer_)
    if raw_size <= 30:  # ignore small stuff
        return store_uncompressed()

    packed = JM_compress_buffer(buffer_)
    assert isinstance(packed, mupdf.FzBuffer)
    if not packed.m_internal:
        return store_uncompressed()

    packed_size, _ = mupdf.fz_buffer_storage(packed)
    if packed_size >= raw_size:     # was not worth the effort
        return store_uncompressed()

    mupdf.pdf_dict_put(
            obj,
            mupdf.PDF_ENUM_NAME_Filter,
            mupdf.PDF_ENUM_NAME_FlateDecode,
            )
    mupdf.pdf_update_stream(doc, obj, packed, 1)
def PyUnicode_DecodeRawUnicodeEscape(s, errors='strict'):
    '''
    Decode `s` with Python's 'raw_unicode_escape' codec.

    Args:
        s: a `str` (first encoded to UTF-8 bytes) or a bytes-like object
            (`bytes`, `bytearray`, `memoryview`). Any falsy value yields "".
        errors: error handler used for both the encode and the decode step.

    Returns:
        The decoded string.

    Raises:
        TypeError: if `s` is truthy but neither `str` nor bytes-like.
    '''
    if not s:
        return ""
    if isinstance(s, str):
        raw = s.encode("utf8", errors=errors)
    elif isinstance(s, (bytes, bytearray, memoryview)):
        raw = bytes(s)
    else:
        # Previously this fell through to an unbound local variable.
        raise TypeError(
                f"expected str or bytes-like object, got {type(s).__name__}"
                )
    return raw.decode('raw_unicode_escape', errors=errors)
an entry in the page's font list if reference name matches. """ for f in page.get_fonts(): if f[4] == fontname: return f def CheckFontInfo(doc: Document, xref: int) -> list: """Return a font info if present in the document. """ for f in doc.FontInfos: if xref == f[0]: return f def CheckMarkerArg(quads: typing.Any) -> tuple: if CheckRect(quads): r = Rect(quads) return (r.quad,) if CheckQuad(quads): return (quads,) for q in quads: if not (CheckRect(q) or CheckQuad(q)): raise ValueError("bad quads entry") return quads def CheckMorph(o: typing.Any) -> bool: if not bool(o): return False if not (type(o) in (list, tuple) and len(o) == 2): raise ValueError("morph must be a sequence of length 2") if not (len(o[0]) == 2 and len(o[1]) == 6): raise ValueError("invalid morph param 0") if not o[1][4] == o[1][5] == 0: raise ValueError("invalid morph param 1") return True def CheckParent(o: typing.Any): return if not hasattr(o, "parent") or o.parent is None: raise ValueError(f"orphaned object {type(o)=}: parent is None") def CheckQuad(q: typing.Any) -> bool: """Check whether an object is convex, not empty quad-like. It must be a sequence of 4 number pairs. """ try: q0 = Quad(q) except Exception: if g_exceptions_verbose > 1: exception_info() return False return q0.is_convex def CheckRect(r: typing.Any) -> bool: """Check whether an object is non-degenerate rect-like. It must be a sequence of 4 numbers. 
""" try: r = Rect(r) except Exception: if g_exceptions_verbose > 1: exception_info() return False return not (r.is_empty or r.is_infinite) def ColorCode(c: typing.Union[list, tuple, float, None], f: str) -> str: if not c: return "" if hasattr(c, "__float__"): c = (c,) CheckColor(c) if len(c) == 1: s = _format_g(c[0]) + " " return s + "G " if f == "c" else s + "g " if len(c) == 3: s = _format_g(tuple(c)) + " " return s + "RG " if f == "c" else s + "rg " s = _format_g(tuple(c)) + " " return s + "K " if f == "c" else s + "k " def Page__add_text_marker(self, quads, annot_type): pdfpage = self._pdf_page() rotation = JM_page_rotation(pdfpage) def final(): if rotation != 0: mupdf.pdf_dict_put_int(pdfpage.obj(), PDF_NAME('Rotate'), rotation) try: if rotation != 0: mupdf.pdf_dict_put_int(pdfpage.obj(), PDF_NAME('Rotate'), 0) annot = mupdf.pdf_create_annot(pdfpage, annot_type) for item in quads: q = JM_quad_from_py(item) mupdf.pdf_add_annot_quad_point(annot, q) mupdf.pdf_update_annot(annot) JM_add_annot_id(annot, "A") final() except Exception: if g_exceptions_verbose: exception_info() final() return return Annot(annot) def PDF_NAME(x): assert isinstance(x, str) ret = getattr(mupdf, f'PDF_ENUM_NAME_{x}') # Note that we return a (swig proxy for) pdf_obj*, not a mupdf.PdfObj. In # the C++ API, the constructor PdfObj::PdfObj(pdf_obj*) is marked as # explicit, but this seems to be ignored by SWIG. If SWIG started to # generate code that respected `explicit`, we would need to do `return # mupdf.PdfObj(ret)`. # # [Compare with extra.i, where we define our own PDF_NAME2() macro that # returns a mupdf::PdfObj.] return ret def UpdateFontInfo(doc: Document, info: typing.Sequence): xref = info[0] found = False for i, fi in enumerate(doc.FontInfos): if fi[0] == xref: found = True break if found: doc.FontInfos[i] = info else: doc.FontInfos.append(info) def args_match(args, *types): ''' Returns true if <args> matches <types>. Each item in <types> is a type or tuple of types. 
def calc_image_matrix(width, height, tr, rotate, keep):
    '''
    Compute the image insertion matrix.

    Args:
        width, height: image dimensions.
        tr: target rectangle (rect-like).
        rotate: rotation angle passed to mupdf.fz_rotate().
        keep: if true, keep the width / height ratio of the image.
    Returns:
        The matrix built from: shift by (-0.5, -0.5), rotation, scaling and
        translation to the center of the target rectangle.
    '''
    target = JM_rect_from_py(tr)
    rotation = mupdf.fz_rotate(rotate)
    target_w = target.x1 - target.x0
    target_h = target.y1 - target.y0

    if keep:
        longest = max(width, height)
        fw = width / longest
        fh = height / longest
    else:
        fw = fh = 1

    small = min(fw, fh)
    if rotate not in (0, 180):
        # For the other rotations, width and height factors swap roles.
        fw, fh = fh, fw

    if fw < 1:
        if target_w / fw > target_h / fh:
            w, h = target_h * small, target_h
        else:
            w, h = target_w, target_w / small
    elif fw != fh:
        if target_w / fw > target_h / fh:
            w, h = target_h / small, target_h
        else:
            w, h = target_w, target_w * small
    else:
        w, h = target_w, target_h

    center = mupdf.fz_make_point(
            (target.x0 + target.x1) / 2,
            (target.y0 + target.y1) / 2,
            )
    steps = (
            rotation,
            mupdf.fz_scale(w, h),
            mupdf.fz_translate(center.x, center.y),
            )
    mat = mupdf.fz_make_matrix(1, 0, 0, 1, -0.5, -0.5)
    for step in steps:
        mat = mupdf.fz_concat(mat, step)
    return mat


def detect_super_script(line, ch):
    '''
    For a line with wmode 0 and direction (1, 0), return whether the
    character origin lies more than 10% of the character size above the
    origin of the line's first character. Returns 0 for any other line.
    '''
    info = line.m_internal
    if not (info.wmode == 0 and info.dir.x == 1 and info.dir.y == 0):
        return 0
    char = ch.m_internal
    return char.origin.y < info.first_char.origin.y - char.size * 0.1


def dir_str(x):
    '''
    Return a multi-line text with `x`, its type, the number of its
    attributes, and one line per attribute name.
    '''
    names = dir(x)
    header = f'{x} {type(x)} ({len(dir(x))}):\n'
    return header + ''.join([f' {name}\n' for name in names])


def getTJstr(text: str, glyphs: typing.Union[list, tuple, None], simple: bool, ordering: int) -> str:
    """ Return a PDF string enclosed in [] brackets, suitable for the PDF TJ
    operator.

    Notes:
        The input string is converted to either 2 or 4 hex digits per
        character. Text that already has the form "[<...>]" is returned
        unchanged, and empty text yields "[<>]".
    Args:
        simple: no glyphs: 2-chars, use char codes as the glyph
                glyphs: 2-chars, use glyphs instead of char codes (Symbol,
                ZapfDingbats). Codes above 255 are replaced by "b7".
        not simple: ordering < 0: 4-chars, use glyphs not char codes
                    ordering >=0: a CJK font! 4 chars, use char codes as glyphs
    """
    if text.startswith("[<") and text.endswith(">]"):  # already done
        return text
    if not bool(text):
        return "[<>]"

    codes = [ord(ch) for ch in text]
    if simple:
        # each char or its glyph is coded as a 2-byte hex
        if glyphs is None:
            # not Symbol, not ZapfDingbats: use char code
            parts = ["%02x" % code if code < 256 else "b7" for code in codes]
        else:
            # Symbol or ZapfDingbats: use glyphs
            parts = [
                "%02x" % glyphs[code][0] if code < 256 else "b7"
                for code in codes
            ]
    elif ordering < 0:
        # non-simple, not a CJK font: 4-byte hex of the glyphs
        parts = ["%04x" % glyphs[code][0] for code in codes]
    else:
        # non-simple CJK font: 4-byte hex of the char codes
        parts = ["%04x" % code for code in codes]
    return "[<" + "".join(parts) + ">]"
def get_pdf_str(s: str) -> str:
    """ Return a PDF string depending on its coding.

    Notes:
        Returns a string bracketed with either "()" or "<>" for hex values.
        If only ascii then "(original)" is returned, else if only 8 bit chars
        then "(original)" with interspersed octal strings \\nnn is returned,
        else a string "<FEFF[hexstring]>" is returned, where [hexstring] is
        the UTF-16BE encoding of the original.
    """
    if not bool(s):
        return "()"

    # Any character beyond the 8-bit range forces the UTF-16BE form
    # (prefixed with the BOM) for the complete string.
    if any(ord(ch) > 255 for ch in s):
        encoded = bytearray([254, 255]) + bytearray(s, "UTF-16BE")
        return "<" + encoded.hex() + ">"  # brackets indicate hex

    # The supported white space characters and their escape sequences.
    control_escapes = {
        8: "\\b",  # backspace
        9: "\\t",  # tab
        10: "\\n",  # line feed
        12: "\\f",  # form feed
        13: "\\r",  # carriage return
    }
    pieces = []
    for ch in s:
        code = ord(ch)
        if 31 < code < 127:  # in ASCII range
            if ch in ("(", ")", "\\"):  # these need to be escaped
                pieces.append("\\")
            pieces.append(ch)
        elif code > 127:  # beyond ASCII
            pieces.append("\\%03o" % code)
        else:
            # unsupported control characters are replaced by 0xB7
            pieces.append(control_escapes.get(code, "\\267"))
    return "(" + "".join(pieces) + ")"
def get_tessdata(tessdata=None):
    """Detect Tesseract language support folder.

    This function is used to enable OCR via Tesseract even if the language
    support folder is not specified directly or in environment variable
    TESSDATA_PREFIX.

    * If <tessdata> is set we return it directly.

    * Otherwise we return `os.environ['TESSDATA_PREFIX']` if set.

    * Otherwise we search for a Tesseract installation and return its
      language support folder.

    * Otherwise we raise an exception.

    Args:
        tessdata: optional folder name; any truthy value is returned as is.
    Returns:
        The name of the language support folder.
    Raises:
        RuntimeError: if no folder was given and none could be located.
    """
    if tessdata:
        return tessdata
    tessdata = os.getenv("TESSDATA_PREFIX")
    if tessdata:  # use environment variable if set
        return tessdata

    # Try to locate the tesseract-ocr installation.

    import subprocess

    def run(command):
        # All commands are fixed strings, so going through the shell is safe.
        return subprocess.run(
                command,
                shell=True,
                capture_output=True,
                check=False,
                text=True,
                )

    cp = run('tesseract --list-langs')
    if cp.returncode == 0:
        m = re.search('List of available languages in "(.+)"', cp.stdout)
        if m:
            return m.group(1)

    # Windows systems:
    if sys.platform == "win32":
        cp = run("where tesseract")
        response = cp.stdout.strip()
        if cp.returncode or not response:
            raise RuntimeError("No tessdata specified and Tesseract is not installed")
        # `where` prints one line per hit; only the first is a usable path.
        first_hit = response.splitlines()[0].strip()
        dirname = os.path.dirname(first_hit)  # path of tesseract.exe
        tessdata = os.path.join(dirname, "tessdata")  # language support
        if os.path.exists(tessdata):  # all ok?
            return tessdata
        # should not happen!
        raise RuntimeError(
                f"No tessdata specified and Tesseract installation has no {tessdata} folder"
                )

    # Unix-like systems:
    attempts = []
    for path in 'tesseract-ocr', 'tesseract':
        cp = run(f'whereis {path}')
        if cp.returncode != 0:
            continue
        response = cp.stdout.strip().split()
        if len(response) != 2:
            continue
        # search tessdata in folder structure
        dirname = response[1]  # contains tesseract-ocr installation folder
        pattern = f"{dirname}/*/tessdata"
        attempts.append(pattern)
        tessdatas = sorted(glob.glob(pattern))
        if tessdatas:
            return tessdatas[-1]

    if attempts:
        # One attempted pattern per line.
        text = 'No tessdata specified and no match for:\n'
        text += '\n'.join(f' {attempt}' for attempt in attempts)
        raise RuntimeError(text)
    raise RuntimeError('No tessdata specified and Tesseract is not installed')
def css_for_pymupdf_font(
    fontcode: str, *, CSS: OptStr = None, archive: AnyType = None, name: OptStr = None
) -> str:
    """Create @font-face items for the given fontcode of pymupdf-fonts.

    Every font of package pymupdf-fonts whose code starts with 'fontcode'
    gets a CSS @font-face definition, and all of them share one font-family
    name.

    Note:
        Font codes in pymupdf-fonts look like "fontcode<sf>", the suffix "sf"
        being empty or one of "it", "bo" or "bi" for the regular, italic,
        bold or bold-italic variant. E.g. font code "notos" refers to
            "notos" - "Noto Sans Regular"
            "notosit" - "Noto Sans Italic"
            "notosbo" - "Noto Sans Bold"
            "notosbi" - "Noto Sans Bold Italic"
        For these, four @font-face definitions are created which all use
        font-family "notos" (or the "name" value). The buffer of every
        matching font is added to the archive passed in.
        To use the font in pymupdf.Story, execute 'set_font(fontcode)'. The
        correct font weight (bold) or style (italic) will automatically be
        selected.

    Args:
        fontcode: (str) prefix selecting the font variants to include. At
            most 4 fonts may match.
        CSS: (str) CSS source to which the definitions are appended.
        archive: (Archive, mandatory) receives the font buffers.
        name: (str) font-family name to use instead of 'fontcode'.
    Returns:
        The CSS source with the new @font-face statements appended.
    Raises:
        ValueError: if 'archive' is not an Archive, if no font code matches,
            or if more than 4 font codes match.
    """
    # @font-face template string
    CSSFONT = "\n@font-face {font-family: %s; src: url(%s);%s%s}\n"

    if type(archive) is not Archive:
        raise ValueError("'archive' must be an Archive")
    css = "" if CSS is None else CSS

    # font codes starting with the passed-in string
    matching = [key for key in fitz_fontdescriptors.keys() if key.startswith(fontcode)]
    if not matching:
        raise ValueError(f"No font code '{fontcode}' found in pymupdf-fonts.")
    if len(matching) > 4:
        raise ValueError("fontcode too short")

    family = fontcode if name is None else name  # font-family name to use

    for key in matching:
        descriptor = fitz_fontdescriptors[key]
        is_bold = descriptor["bold"]
        is_italic = descriptor["italic"]
        archive.add(descriptor["loader"](), key)  # load buffer, update archive
        weight = "font-weight: bold;" if is_bold else ""
        style = "font-style: italic;" if is_italic else ""
        css += CSSFONT % (family, key, weight, style)
    return css
def get_text_length(text: str, fontname: str ="helv", fontsize: float =11, encoding: int =0) -> float:
    """Calculate length of a string for a built-in font.

    Args:
        text: the string to measure.
        fontname: name of the font (case-insensitive).
        fontsize: font size points.
        encoding: encoding to use, 0=Latin (default), 1=Greek, 2=Cyrillic.
    Returns:
        (float) length of text.
    Raises:
        ValueError: if the font is neither a Base-14 nor one of the
            supported CJK font names.
    """
    fontname = fontname.lower()
    basename = Base14_fontdict.get(fontname, None)

    # Symbol and ZapfDingbats have their own glyph width tables.
    if basename == "ZapfDingbats":
        glyphs = zapf_glyphs
    elif basename == "Symbol":
        glyphs = symbol_glyphs
    else:
        glyphs = None

    if glyphs is not None:
        # Characters beyond code 255 count with the width of glyph 183.
        total = sum(
            [glyphs[ord(ch)][1] if ord(ch) < 256 else glyphs[183][1] for ch in text]
        )
        return total * fontsize

    if fontname in Base14_fontdict.keys():
        return util_measure_string(
            text, Base14_fontdict[fontname], fontsize, encoding
        )

    cjk_names = (
        "china-t",
        "china-s",
        "china-ts",
        "china-ss",
        "japan",
        "japan-s",
        "korea",
        "korea-s",
    )
    if fontname in cjk_names:
        # every character is as wide as the font size
        return len(text) * fontsize

    raise ValueError("Font '%s' is unsupported" % fontname)
def image_profile(img: ByteString) -> dict:
    """ Return basic properties of an image.

    Args:
        img: bytes, bytearray, io.BytesIO object or an opened image file.
    Returns:
        A dictionary with keys width, height, colorspace.n, bpc, type, ext and size,
        where 'type' is the MuPDF image type (0 to 14) and 'ext' the suitable
        file extension.
    Raises:
        ValueError: if `img` is none of the accepted kinds.
    """
    if type(img) is io.BytesIO:
        stream = img.getvalue()
    elif hasattr(img, "read"):
        # Any other object with a read() method, e.g. an opened file.
        stream = img.read()
    elif type(img) in (bytes, bytearray):
        stream = img
    else:
        raise ValueError("bad argument 'img'")

    return TOOLS.image_profile(stream)


def jm_append_merge(dev):
    '''
    Append current path to list or merge into last path of the list.
    (1) Append if first path, different item lists or not a 'stroke' version
        of previous path
    (2) If new path has the same items, merge its content into previous path
        and change path["type"] to "fs".
    (3) If "out" is callable, skip the previous and pass dictionary to it.

    In the callback case (`dev.method` is callable or truthy), the method of
    `dev.out` named by `dev.method` is called with `dev.pathdict`, and
    `dev.pathdict` is set to None afterwards.
    '''
    #log(f'{getattr(dev, "pathdict", None)=}')
    assert isinstance(dev.out, list)
    #log( f'{dev.out=}')

    if callable(dev.method) or dev.method:  # function or method
        # callback.
        if dev.method is None:
            # fixme, this surely cannot happen?
            assert 0
            #resp = PyObject_CallFunctionObjArgs(out, dev.pathdict, NULL)
        else:
            #log(f'calling {dev.out=} {dev.method=} {dev.pathdict=}')
            resp = getattr(dev.out, dev.method)(dev.pathdict)
        if not resp:
            message("calling cdrawings callback function/method failed!")
        dev.pathdict = None
        return

    def append():
        # Store a copy of the current path dictionary, then empty the
        # original (the same dict object stays attached to `dev`).
        #log(f'jm_append_merge(): clearing dev.pathdict')
        dev.out.append(dev.pathdict.copy())
        dev.pathdict.clear()
    assert isinstance(dev.out, list)
    len_ = len(dev.out) # len of output list so far
    #log('{len_=}')
    if len_ == 0:   # always append first path
        return append()
    #log(f'{getattr(dev, "pathdict", None)=}')
    thistype = dev.pathdict[ dictkey_type]
    #log(f'{thistype=}')
    if thistype != 's': # if not stroke, then append
        return append()
    prev = dev.out[ len_-1] # get prev path
    #log( f'{prev=}')
    prevtype = prev[ dictkey_type]
    #log( f'{prevtype=}')
    if prevtype != 'f': # if previous not fill, append
        return append()
    # last check: there must be the same list of items for "f" and "s".
    previtems = prev[ dictkey_items]
    thisitems = dev.pathdict[ dictkey_items]
    if previtems != thisitems:
        return append()

    #rc = PyDict_Merge(prev, dev.pathdict, 0);  // merge with no override
    try:
        # Copy only the keys that the previous (fill) path does not have yet.
        for k, v in dev.pathdict.items():
            if k not in prev:
                prev[k] = v
        rc = 0
    except Exception:
        if g_exceptions_verbose:    exception_info()
        #raise
        rc = -1
    if rc == 0:
        prev[ dictkey_type] = 'fs'
        dev.pathdict.clear()
    else:
        # Merging failed: fall back to storing the stroke path separately.
        message("could not merge stroke and fill path")
        append()
def jm_bbox_add_rect( dev, ctx, rect, code):
    '''
    Append an entry for `rect` to dev.result: (code, rect), or
    (code, rect, layer name) if the device has `layers` set.
    '''
    if dev.layers:
        entry = (code, JM_py_from_rect(rect), dev.layer_name)
    else:
        entry = (code, JM_py_from_rect(rect))
    dev.result.append(entry)


def jm_bbox_fill_image( dev, ctx, image, ctm, alpha, color_params):
    '''
    Record the unit rectangle transformed by `ctm` as a "fill-image" entry.
    '''
    # Keep the wrapper object alive in a named variable while its internal
    # value is in use.
    unit = mupdf.FzRect(mupdf.FzRect.Fixed_UNIT)
    bounds = mupdf.ll_fz_transform_rect(unit.internal(), ctm)
    jm_bbox_add_rect(dev, ctx, bounds, "fill-image")


def jm_bbox_fill_image_mask( dev, ctx, image, ctm, colorspace, color, alpha, color_params):
    '''
    Record the unit rectangle transformed by `ctm` as a "fill-imgmask" entry.
    Exceptions are optionally reported and then re-raised.
    '''
    try:
        bounds = mupdf.ll_fz_transform_rect(mupdf.fz_unit_rect, ctm)
        jm_bbox_add_rect(dev, ctx, bounds, "fill-imgmask")
    except Exception:
        if g_exceptions_verbose:
            exception_info()
        raise
def jm_bbox_fill_path( dev, ctx, path, even_odd, ctm, colorspace, color, alpha, color_params):
    '''Record the bounds of a filled path as a "fill-path" entry.'''
    even_odd = bool(even_odd)
    try:
        bounds = mupdf.ll_fz_bound_path(path, None, ctm)
        jm_bbox_add_rect(dev, ctx, bounds, "fill-path")
    except Exception:
        if g_exceptions_verbose:
            exception_info()
        raise


def jm_bbox_fill_shade( dev, ctx, shade, ctm, alpha, color_params):
    '''Record the bounds of a shading as a "fill-shade" entry.'''
    try:
        bounds = mupdf.ll_fz_bound_shade(shade, ctm)
        jm_bbox_add_rect(dev, ctx, bounds, "fill-shade")
    except Exception:
        if g_exceptions_verbose:
            exception_info()
        raise


def jm_bbox_stroke_text( dev, ctx, text, stroke, ctm, *args):
    '''Record the bounds of stroked text as a "stroke-text" entry.'''
    try:
        bounds = mupdf.ll_fz_bound_text(text, stroke, ctm)
        jm_bbox_add_rect(dev, ctx, bounds, "stroke-text")
    except Exception:
        if g_exceptions_verbose:
            exception_info()
        raise


def jm_bbox_fill_text( dev, ctx, text, ctm, *args):
    '''Record the bounds of filled text as a "fill-text" entry.'''
    try:
        bounds = mupdf.ll_fz_bound_text(text, None, ctm)
        jm_bbox_add_rect(dev, ctx, bounds, "fill-text")
    except Exception:
        if g_exceptions_verbose:
            exception_info()
        raise


def jm_bbox_ignore_text( dev, ctx, text, ctm):
    '''Record the bounds of ignored text as an "ignore-text" entry.'''
    bounds = mupdf.ll_fz_bound_text(text, None, ctm)
    jm_bbox_add_rect(dev, ctx, bounds, "ignore-text")


def jm_bbox_stroke_path( dev, ctx, path, stroke, ctm, colorspace, color, alpha, color_params):
    '''Record the bounds of a stroked path as a "stroke-path" entry.'''
    try:
        bounds = mupdf.ll_fz_bound_path(path, stroke, ctm)
        jm_bbox_add_rect(dev, ctx, bounds, "stroke-path")
    except Exception:
        if g_exceptions_verbose:
            exception_info()
        raise


def jm_checkquad(dev):
    '''
    Check whether the last 4 lines represent a quad.

    Because of how we count, the lines are a polyline already, i.e. last point
    of a line equals 1st point of next line.
    So we check for a polygon (last line's end point equals start point).
    If not true we return 0. Otherwise the 4 line items are replaced by one
    "qu" item, the line count is reset and 1 is returned.
    '''
    items = dev.pathdict[dictkey_items]
    count = len(items)
    first = count - 4  # index of the first of the 4 lines

    coords = []  # x, y of the 4 line start points
    end_point = None
    for offset in range(4):
        line = items[first + offset]
        start = JM_point_from_py(line[1])
        coords.append(start.x)
        coords.append(start.y)
        end_point = JM_point_from_py(line[2])

    # The end of the last line must meet the start of the first line.
    if end_point.x != coords[0] or end_point.y != coords[1]:
        return 0  # not a polygon

    # we have detected a quad
    dev.linecount = 0  # reset this

    # a quad item is ("qu", (ul, ur, ll, lr)), where the tuple items
    # are pairs of floats representing a quad corner each.
    # relationship of float array to quad points:
    # (0, 1) = ul, (2, 3) = ll, (6, 7) = ur, (4, 5) = lr
    ul_x, ul_y, ll_x, ll_y, lr_x, lr_y, ur_x, ur_y = coords
    quad = mupdf.fz_make_quad(ul_x, ul_y, ur_x, ur_y, ll_x, ll_y, lr_x, lr_y)
    items[first] = ('qu', JM_py_from_quad(quad))  # replace item -4 by the quad
    del items[first + 1 : count]  # delete remaining 3 items
    return 1
def jm_checkrect(dev):
    '''
    Check whether the last 3 path items represent a rectangle.
    Returns 1 if we have modified the path, otherwise 0.

    The line count of the device is reset in any case.
    '''
    dev.linecount = 0  # reset line count
    items = dev.pathdict[dictkey_items]
    count = len(items)

    first_line = items[count - 3]
    ll = JM_point_from_py(first_line[1])
    lr = JM_point_from_py(first_line[2])
    # no need to look at the middle line!
    third_line = items[count - 1]
    ur = JM_point_from_py(third_line[1])
    ul = JM_point_from_py(third_line[2])

    # Assumption:
    # When decomposing rects, MuPDF always starts with a horizontal line,
    # followed by a vertical line, followed by a horizontal line.
    # First line: (ll, lr), third line: (ul, ur).
    # If 1st line is below 3rd line, we record anti-clockwise (+1), else
    # clockwise (-1) orientation.
    axis_parallel = (
            ll.y == lr.y
            and ll.x == ul.x
            and ur.y == ul.y
            and ur.x == lr.x
            )
    if not axis_parallel:
        return 0  # not a rectangle

    # we have a rect, replace last 3 "l" items by one "re" item.
    if ul.y < lr.y:
        orientation = 1
        r = mupdf.fz_make_rect(ul.x, ul.y, lr.x, lr.y)
    else:
        orientation = -1
        r = mupdf.fz_make_rect(ll.x, ll.y, ur.x, ur.y)

    items[count - 3] = ('re', JM_py_from_rect(r), orientation)  # replace item -3
    del items[count - 2 : count]  # delete remaining 2 items
    return 1
def jm_trace_text( dev, text, type_, ctm, colorspace, color, alpha, seqno):
    '''
    Call jm_trace_text_span() for every span of `text`, following the linked
    list that starts at `text.head`.
    '''
    span = text.head
    while 1:
        if not span:
            break
        jm_trace_text_span( dev, span, type_, ctm, colorspace, color, alpha, seqno)
        span = span.next


def jm_trace_text_span(dev, span, type_, ctm, colorspace, color, alpha, seqno):
    '''
    jm_trace_text_span(fz_context *ctx, PyObject *out, fz_text_span *span, int type, fz_matrix ctm, fz_colorspace *colorspace, const float *color, float alpha, size_t seqno)

    Build a dictionary describing one text span (font, flags, size, color,
    bbox, per-character tuples, ...) and append it to `dev.out`.
    '''
    out_font = None
    assert isinstance( span, mupdf.fz_text_span)
    span = mupdf.FzTextSpan( span)
    assert isinstance( ctm, mupdf.fz_matrix)
    ctm = mupdf.FzMatrix( ctm)
    fontname = JM_font_name( span.font())
    #float rgb[3];
    #PyObject *chars = PyTuple_New(span->len);

    mat = mupdf.fz_concat(span.trm(), ctm)  # text transformation matrix
    dir = mupdf.fz_transform_vector(mupdf.fz_make_point(1, 0), mat) # writing direction
    fsize = math.sqrt(dir.x * dir.x + dir.y * dir.y) # font size

    dir = mupdf.fz_normalize_vector(dir)

    space_adv = 0
    asc = JM_font_ascender( span.font())
    dsc = JM_font_descender( span.font())
    if asc < 1e-3:  # probably Tesseract font
        dsc = -0.1
        asc = 0.9

    # compute effective ascender / descender
    ascsize = asc * fsize / (asc - dsc)
    dscsize = dsc * fsize / (asc - dsc)
    fflags = 0  # font flags
    mono = mupdf.fz_font_is_monospaced( span.font())
    fflags += mono * TEXT_FONT_MONOSPACED
    fflags += mupdf.fz_font_is_italic( span.font()) * TEXT_FONT_ITALIC
    fflags += mupdf.fz_font_is_serif( span.font()) * TEXT_FONT_SERIFED
    fflags += mupdf.fz_font_is_bold( span.font()) * TEXT_FONT_BOLD

    last_adv = 0

    # walk through characters of span
    span_bbox = mupdf.FzRect()
    # Rotation matrix built from the normalized writing direction.
    rot = mupdf.fz_make_matrix(dir.x, dir.y, -dir.y, dir.x, 0, 0)
    if dir.x == -1: # left-right flip
        rot.d = 1

    chars = []
    for i in range( span.m_internal.len):
        adv = 0
        if span.items(i).gid >= 0:
            adv = mupdf.fz_advance_glyph( span.font(), span.items(i).gid, span.m_internal.wmode)
        adv *= fsize
        last_adv = adv
        if span.items(i).ucs == 32:
            # remember the advance of a space character
            space_adv = adv
        char_orig = mupdf.fz_make_point(span.items(i).x, span.items(i).y)
        char_orig = mupdf.fz_transform_point(char_orig, ctm)
        # m1: move char origin to (0, 0), apply `rot`, move back.
        m1 = mupdf.fz_make_matrix(1, 0, 0, 1, -char_orig.x, -char_orig.y)
        m1 = mupdf.fz_concat(m1, rot)
        m1 = mupdf.fz_concat(m1, mupdf.FzMatrix(1, 0, 0, 1, char_orig.x, char_orig.y))
        x0 = char_orig.x
        x1 = x0 + adv
        if (
                (mat.d > 0 and (dir.x == 1 or dir.x == -1))
                or
                (mat.b != 0 and mat.b == -mat.c)
                ):  # up-down flip
            y0 = char_orig.y + dscsize
            y1 = char_orig.y + ascsize
        else:
            y0 = char_orig.y - ascsize
            y1 = char_orig.y - dscsize
        char_bbox = mupdf.fz_make_rect(x0, y0, x1, y1)
        char_bbox = mupdf.fz_transform_rect(char_bbox, m1)
        # One tuple per character: (ucs, gid, origin, bbox).
        chars.append(
                (
                    span.items(i).ucs,
                    span.items(i).gid,
                    (
                        char_orig.x,
                        char_orig.y,
                    ),
                    (
                        char_bbox.x0,
                        char_bbox.y0,
                        char_bbox.x1,
                        char_bbox.y1,
                    ),
                )
                )
        if i > 0:
            span_bbox = mupdf.fz_union_rect(span_bbox, char_bbox)
        else:
            span_bbox = char_bbox
    chars = tuple(chars)

    if not space_adv:
        # The span contained no space (or its advance was 0).
        if not (fflags & TEXT_FONT_MONOSPACED):
            c, out_font = mupdf.fz_encode_character_with_fallback( span.font(), 32, 0, 0)
            space_adv = mupdf.fz_advance_glyph(
                    span.font(),
                    c,
                    span.m_internal.wmode,
                    )
            space_adv *= fsize
            if not space_adv:
                space_adv = last_adv
        else:
            space_adv = last_adv    # for mono, any char width suffices

    # make the span dictionary
    span_dict = dict()
    span_dict[ 'dir'] = JM_py_from_point(dir)
    span_dict[ 'font'] = JM_EscapeStrFromStr(fontname)
    span_dict[ 'wmode'] = span.m_internal.wmode
    span_dict[ 'flags'] =fflags
    span_dict[ "bidi_lvl"] =span.m_internal.bidi_level
    span_dict[ "bidi_dir"] = span.m_internal.markup_dir
    span_dict[ 'ascender'] = asc
    span_dict[ 'descender'] = dsc
    span_dict[ 'colorspace'] = 3

    if colorspace:
        rgb = mupdf.fz_convert_color(
                mupdf.FzColorspace( mupdf.ll_fz_keep_colorspace( colorspace)),
                color,
                mupdf.fz_device_rgb(),
                mupdf.FzColorspace(),
                mupdf.FzColorParams(),
                )
        rgb = rgb[:3]   # mupdf.fz_convert_color() always returns 4 items.
    else:
        rgb = (0, 0, 0)

    if dev.linewidth > 0:   # width of character border
        linewidth = dev.linewidth
    else:
        linewidth = fsize * 0.05    # default: 5% of font size
    #log(f'{dev.linewidth=:.4f} {fsize=:.4f} {linewidth=:.4f}')

    span_dict[ 'color'] = rgb
    span_dict[ 'size'] = fsize
    span_dict[ "opacity"] = alpha
    span_dict[ "linewidth"] = linewidth
    span_dict[ "spacewidth"] = space_adv
    span_dict[ 'type'] = type_
    span_dict[ 'bbox'] = JM_py_from_rect(span_bbox)
    span_dict[ 'layer'] = dev.layer_name
    span_dict[ "seqno"] = seqno
    span_dict[ 'chars'] = chars
    #log(f'{span_dict=}')
    dev.out.append( span_dict)
def jm_lineart_color(colorspace, color):
    '''
    Convert `color` of `colorspace` to RGB and return the first three items
    of the conversion result. Without a colorspace, return an empty tuple.
    '''
    if not colorspace:
        return ()
    try:
        # Need to be careful to use a named Python object to ensure
        # that the `params` we pass to mupdf.ll_fz_convert_color() is
        # valid. E.g. passing `mupdf.FzColorParams().internal()` directly
        # seems to end up with a corrupted `params`.
        target = mupdf.FzColorspace(mupdf.FzColorspace.Fixed_RGB)
        params = mupdf.FzColorParams()
        converted = mupdf.ll_fz_convert_color(
                colorspace,
                color,
                target.m_internal,
                None,
                params.internal(),
                )
    except Exception:
        if g_exceptions_verbose:
            exception_info()
        raise
    return converted[:3]
def jm_lineart_drop_device(dev, ctx):
    '''
    Reset the device: `out` is replaced by a new empty list if it is a list,
    and the scissors are emptied.
    '''
    if isinstance(dev.out, list):
        dev.out = []
    dev.scissors = []


def jm_lineart_fill_path( dev, ctx, path, even_odd, ctm, colorspace, color, alpha, color_params):
    '''
    Create the path dictionary of a filled path (type "f") and hand it to
    jm_append_merge(). Nothing is recorded if the path has no items.
    '''
    even_odd = bool(even_odd)
    try:
        assert isinstance(ctm, mupdf.fz_matrix)
        dev.ctm = mupdf.FzMatrix(ctm)  # fz_concat(ctm, dev_ptm);
        dev.path_type = trace_device_FILL_PATH
        jm_lineart_path(dev, ctx, path)
        if dev.pathdict is None:
            return
        entry = dev.pathdict
        entry[dictkey_type] = "f"
        entry["even_odd"] = even_odd
        entry["fill_opacity"] = alpha
        entry["fill"] = jm_lineart_color(colorspace, color)
        entry[dictkey_rect] = JM_py_from_rect(dev.pathrect)
        entry["seqno"] = dev.seqno
        entry['layer'] = dev.layer_name
        if dev.clips:
            entry['level'] = dev.depth
        jm_append_merge(dev)
        dev.seqno += 1
    except Exception:
        if g_exceptions_verbose:
            exception_info()
        raise


# There are 3 text trace types:
# 0 - fill text (PDF Tr 0)
# 1 - stroke text (PDF Tr 1)
# 3 - ignore text (PDF Tr 3)

def jm_lineart_fill_text( dev, ctx, text, ctm, colorspace, color, alpha, color_params):
    '''Trace `text` as fill text (type 0) and advance the sequence number.'''
    current = dev.seqno
    jm_trace_text(dev, text, 0, ctm, colorspace, color, alpha, current)
    dev.seqno += 1
def jm_lineart_ignore_text(dev, text, ctm):
    '''Trace `text` as ignored text (type 3) and advance the sequence number.'''
    #log(f'{getattr(dev, "pathdict", None)=}')
    jm_trace_text(dev, text, 3, ctm, None, None, 1, dev.seqno)
    dev.seqno += 1


class Walker(mupdf.FzPathWalker2):
    '''
    Path walker whose callbacks fill the "items" list of `dev.pathdict` and
    keep `dev.pathrect`, `dev.lastpoint`, `dev.firstpoint`, `dev.havemove`
    and `dev.linecount` up to date.
    '''

    def __init__(self, dev):
        super().__init__()
        self.use_virtual_moveto()
        self.use_virtual_lineto()
        self.use_virtual_curveto()
        self.use_virtual_closepath()
        self.dev = dev

    def closepath(self, ctx):    # trace_close().
        '''
        Close the current subpath. Three consecutive lines are first checked
        for being a rectangle; otherwise a closing line is added if needed
        and the "closePath" entry is set.
        '''
        #log(f'Walker(): {self.dev.pathdict=}')
        try:
            if self.dev.linecount == 3:
                if jm_checkrect(self.dev):
                    #log(f'end1: {self.dev.pathdict=}')
                    return
            self.dev.linecount = 0   # reset # of consec. lines

            if self.dev.havemove:
                if self.dev.lastpoint != self.dev.firstpoint:
                    # Add the line that leads back to the start point.
                    item = ("l", JM_py_from_point(self.dev.lastpoint),
                            JM_py_from_point(self.dev.firstpoint))
                    self.dev.pathdict[dictkey_items].append(item)
                    self.dev.lastpoint = self.dev.firstpoint
                self.dev.pathdict["closePath"] = False

            else:
                #log('setting self.dev.pathdict[ "closePath"] to true')
                self.dev.pathdict[ "closePath"] = True
                #log(f'end2: {self.dev.pathdict=}')

            self.dev.havemove = 0

        except Exception:
            if g_exceptions_verbose:    exception_info()
            raise

    def curveto(self, ctx, x1, y1, x2, y2, x3, y3):   # trace_curveto().
        '''
        Append a "c" item from the last point via the control points to the
        end point (all transformed by dev.ctm) and extend the path rectangle.
        '''
        #log(f'Walker(): {self.dev.pathdict=}')
        try:
            self.dev.linecount = 0  # reset # of consec. lines
            p1 = mupdf.fz_make_point(x1, y1)
            p2 = mupdf.fz_make_point(x2, y2)
            p3 = mupdf.fz_make_point(x3, y3)
            p1 = mupdf.fz_transform_point(p1, self.dev.ctm)
            p2 = mupdf.fz_transform_point(p2, self.dev.ctm)
            p3 = mupdf.fz_transform_point(p3, self.dev.ctm)
            self.dev.pathrect = mupdf.fz_include_point_in_rect(self.dev.pathrect, p1)
            self.dev.pathrect = mupdf.fz_include_point_in_rect(self.dev.pathrect, p2)
            self.dev.pathrect = mupdf.fz_include_point_in_rect(self.dev.pathrect, p3)

            list_ = (
                    "c",
                    JM_py_from_point(self.dev.lastpoint),
                    JM_py_from_point(p1),
                    JM_py_from_point(p2),
                    JM_py_from_point(p3),
                    )
            self.dev.lastpoint = p3
            self.dev.pathdict[ dictkey_items].append( list_)
        except Exception:
            if g_exceptions_verbose:    exception_info()
            raise

    def lineto(self, ctx, x, y):   # trace_lineto().
        '''
        Append an "l" item from the last point to (x, y) transformed by
        dev.ctm, extend the path rectangle and count consecutive lines.
        '''
        #log(f'Walker(): {self.dev.pathdict=}')
        try:
            p1 = mupdf.fz_transform_point( mupdf.fz_make_point(x, y), self.dev.ctm)
            self.dev.pathrect = mupdf.fz_include_point_in_rect( self.dev.pathrect, p1)
            list_ = (
                    'l',
                    JM_py_from_point( self.dev.lastpoint),
                    JM_py_from_point(p1),
                    )
            self.dev.lastpoint = p1
            items = self.dev.pathdict[ dictkey_items]
            items.append( list_)
            self.dev.linecount += 1 # counts consecutive lines
            if self.dev.linecount == 4 and self.dev.path_type != trace_device_FILL_PATH:
                # shrink to "re" or "qu" item
                jm_checkquad(self.dev)
        except Exception:
            if g_exceptions_verbose:    exception_info()
            raise

    def moveto(self, ctx, x, y):   # trace_moveto().
        '''
        Start a new subpath at (x, y) transformed by dev.ctm. The path
        rectangle is initialized with this point if it is still infinite.
        '''
        # Debug output, disabled by the constant 0.
        if 0 and isinstance(self.dev.pathdict, dict):
            log(f'self.dev.pathdict:')
            for n, v in self.dev.pathdict.items():
                log( ' {type(n)=} {len(n)=} {n!r} {n}: {v!r}: {v}')

        #log(f'Walker(): {type(self.dev.pathdict)=} {self.dev.pathdict=}')

        try:
            #log( '{=dev.ctm type(dev.ctm)}')
            self.dev.lastpoint = mupdf.fz_transform_point(
                    mupdf.fz_make_point(x, y),
                    self.dev.ctm,
                    )
            if mupdf.fz_is_infinite_rect( self.dev.pathrect):
                self.dev.pathrect = mupdf.fz_make_rect(
                        self.dev.lastpoint.x,
                        self.dev.lastpoint.y,
                        self.dev.lastpoint.x,
                        self.dev.lastpoint.y,
                        )
            self.dev.firstpoint = self.dev.lastpoint
            self.dev.havemove = 1
            self.dev.linecount = 0  # reset # of consec. lines
        except Exception:
            if g_exceptions_verbose:    exception_info()
            raise
def jm_lineart_path(dev, ctx, path):
    '''
    Create the "items" list of the path dictionary
    * reset the path rectangle, the count of consecutive lines and the end
      point of the path
    * create a new path dictionary with an empty "items" list
    * invoke fz_walk_path(), which creates the single items
    * if no items were detected, set the path dictionary to None
    '''
    try:
        dev.pathrect = mupdf.FzRect(mupdf.FzRect.Fixed_INFINITE)
        dev.linecount = 0
        dev.lastpoint = mupdf.FzPoint(0, 0)
        dev.pathdict = {dictkey_items: []}

        # First time we create a Walker instance is slow, e.g. 0.3s, then later
        # times run in around 0.01ms. If Walker is defined locally instead of
        # globally, each time takes 0.3s.
        #
        walker = Walker(dev)
        # Unlike fz_run_page(), fz_path_walker callbacks are not passed
        # a pointer to the struct, instead they get an arbitrary
        # void*. The underlying C++ Director callbacks use this void* to
        # identify the fz_path_walker instance so in turn we need to pass
        # arg=walker.m_internal.
        kept_path = mupdf.FzPath(mupdf.ll_fz_keep_path(path))
        mupdf.fz_walk_path(kept_path, walker, walker.m_internal)

        # An empty items list means there is nothing to record.
        if not dev.pathdict[dictkey_items]:
            dev.pathdict = None
    except Exception:
        if g_exceptions_verbose:
            exception_info()
        raise
def jm_lineart_stroke_path( dev, ctx, path, stroke, ctm, colorspace, color, alpha, color_params):
    '''
    Create the path dictionary of a stroked path (type "s") and hand it to
    jm_append_merge(). Nothing is recorded if the path has no items.

    Line width, line join, dash lengths and dash phase are multiplied by
    dev.pathfactor, which is derived from `ctm`.
    '''
    try:
        assert isinstance(ctm, mupdf.fz_matrix)
        factor = 1
        if ctm.a != 0 and abs(ctm.a) == abs(ctm.d):
            factor = abs(ctm.a)
        elif ctm.b != 0 and abs(ctm.b) == abs(ctm.c):
            factor = abs(ctm.b)
        dev.pathfactor = factor
        dev.ctm = mupdf.FzMatrix(ctm)  # fz_concat(ctm, dev_ptm);
        dev.path_type = trace_device_STROKE_PATH

        jm_lineart_path(dev, ctx, path)
        if dev.pathdict is None:
            return

        entry = dev.pathdict
        entry[dictkey_type] = 's'
        entry['stroke_opacity'] = alpha
        entry['color'] = jm_lineart_color(colorspace, color)
        entry[dictkey_width] = dev.pathfactor * stroke.linewidth
        entry['lineCap'] = (
                stroke.start_cap,
                stroke.dash_cap,
                stroke.end_cap,
                )
        entry['lineJoin'] = dev.pathfactor * stroke.linejoin
        entry.setdefault('closePath', False)

        # output the "dashes" string
        if stroke.dash_len:
            dashes = mupdf.fz_new_buffer(256)
            mupdf.fz_append_string(dashes, "[ ")  # left bracket
            for index in range(stroke.dash_len):
                # We use mupdf python's SWIG-generated floats_getitem() fn to
                # access float *stroke.dash_list[].
                length = mupdf.floats_getitem(stroke.dash_list, index)
                mupdf.fz_append_string(dashes, f'{_format_g(dev.pathfactor * length)} ')
            mupdf.fz_append_string(dashes, f'] {_format_g(dev.pathfactor * stroke.dash_phase)}')
            entry['dashes'] = dashes
        else:
            entry['dashes'] = '[] 0'

        entry[dictkey_rect] = JM_py_from_rect(dev.pathrect)
        entry['layer'] = dev.layer_name
        entry['seqno'] = dev.seqno
        if dev.clips:
            entry['level'] = dev.depth
        jm_append_merge(dev)
        dev.seqno += 1

    except Exception:
        if g_exceptions_verbose:
            exception_info()
        raise
def jm_lineart_clip_path(dev, ctx, path, even_odd, ctm, scissor):
    '''
    Record a clip path (type "clip") and increase the clip depth.
    Does nothing unless the device has `clips` set; nothing is recorded (and
    the depth stays unchanged) if the path has no items.
    '''
    if not dev.clips:
        return
    dev.ctm = mupdf.FzMatrix(ctm)    # fz_concat(ctm, trace_device_ptm);
    dev.path_type = trace_device_CLIP_PATH
    jm_lineart_path(dev, ctx, path)
    if dev.pathdict is None:
        return
    dev.pathdict[dictkey_type] = 'clip'
    dev.pathdict['even_odd'] = bool(even_odd)
    if 'closePath' not in dev.pathdict:
        dev.pathdict['closePath'] = False

    dev.pathdict['scissor'] = JM_py_from_rect(compute_scissor(dev))
    dev.pathdict['level'] = dev.depth
    dev.pathdict['layer'] = dev.layer_name
    jm_append_merge(dev)
    dev.depth += 1


def jm_lineart_clip_stroke_path(dev, ctx, path, stroke, ctm, scissor):
    '''
    Record a stroked clip path (type "clip", "even_odd" is None) and increase
    the clip depth.
    Does nothing unless the device has `clips` set; nothing is recorded (and
    the depth stays unchanged) if the path has no items.
    '''
    if not dev.clips:
        return
    dev.ctm = mupdf.FzMatrix(ctm)    # fz_concat(ctm, trace_device_ptm);
    dev.path_type = trace_device_CLIP_STROKE_PATH
    jm_lineart_path(dev, ctx, path)
    if dev.pathdict is None:
        return
    # The type must be stored under the `dictkey_type` constant, like in
    # jm_lineart_clip_path(): jm_append_merge() reads the type via that key,
    # so the literal string 'dictkey_type' left the entry without a type.
    dev.pathdict[dictkey_type] = 'clip'
    dev.pathdict['even_odd'] = None
    if 'closePath' not in dev.pathdict:
        dev.pathdict['closePath'] = False
    dev.pathdict['scissor'] = JM_py_from_rect(compute_scissor(dev))
    dev.pathdict['level'] = dev.depth
    dev.pathdict['layer'] = dev.layer_name
    jm_append_merge(dev)
    dev.depth += 1


def jm_lineart_clip_stroke_text(dev, ctx, text, stroke, ctm, scissor):
    '''
    If the device has `clips` set, call compute_scissor() and increase the
    clip depth. No path dictionary is recorded.
    '''
    if not dev.clips:
        return
    compute_scissor(dev)
    dev.depth += 1


def jm_lineart_clip_text(dev, ctx, text, ctm, scissor):
    '''
    If the device has `clips` set, call compute_scissor() and increase the
    clip depth. No path dictionary is recorded.
    '''
    if not dev.clips:
        return
    compute_scissor(dev)
    dev.depth += 1
def jm_lineart_pop_clip(dev, ctx):
    '''
    Device callback: leave the innermost clip.

    When clip tracking is on (`dev.clips`) and at least one scissor is
    stored, the most recent scissor is discarded and `dev.depth` is
    decreased by one. In every other case nothing changes.

    Args:
        dev: the lineart device holding `clips`, `scissors` and `depth`.
        ctx: not used.
    '''
    if dev.clips and dev.scissors:
        dev.scissors.pop()
        dev.depth -= 1
class JM_new_output_fileptr_Output(mupdf.FzOutput2):
    '''
    Output object that forwards write/seek/tell/truncate requests to a
    Python file-like object.

    Args:
        bio: object providing .write(bytes), .seek(offset, whence),
            .tell() and .truncate().
    '''

    def __init__(self, bio):
        super().__init__()
        self.bio = bio
        # Enable the four overrides implemented below.
        for enable in (
                self.use_virtual_write,
                self.use_virtual_seek,
                self.use_virtual_tell,
                self.use_virtual_truncate,
                ):
            enable()

    def write(self, ctx, data_raw, data_length):
        # Convert the raw buffer to Python bytes before handing it on.
        return self.bio.write(mupdf.raw_to_python_bytes(data_raw, data_length))

    def seek(self, ctx, offset, whence):
        return self.bio.seek(offset, whence)

    def tell(self, ctx):
        return self.bio.tell()

    def truncate(self, ctx):
        return self.bio.truncate()
class JM_new_lineart_device_Device(mupdf.FzDevice2):
    '''
    LINEART device for Python method Page.get_cdrawings()

    Args:
        out: stored as `self.out`.
        clips: stored as `self.clips`; the clip and group callbacks return
            immediately when it is false.
        method: stored as `self.method`.
    '''

    def __init__(self, out, clips, method):
        super().__init__()
        # fixme: this results in "Unexpected call of unimplemented virtual_fnptrs fn FzDevice2::drop_device().".
        #self.use_virtual_drop_device()
        self.use_virtual_fill_path()
        self.use_virtual_stroke_path()
        self.use_virtual_clip_path()
        self.use_virtual_clip_image_mask()
        self.use_virtual_clip_stroke_path()
        self.use_virtual_clip_stroke_text()
        self.use_virtual_clip_text()

        # These three were previously referenced without the call
        # parentheses, which is a no-op statement, so the text callbacks
        # assigned at class level below were never enabled.
        # NOTE(review): enabling them makes text operations advance
        # `seqno` through jm_increase_seqno -- confirm that this is the
        # numbering Page.get_cdrawings() is expected to report.
        self.use_virtual_fill_text()
        self.use_virtual_stroke_text()
        self.use_virtual_ignore_text()

        self.use_virtual_fill_shade()
        self.use_virtual_fill_image()
        self.use_virtual_fill_image_mask()

        self.use_virtual_pop_clip()

        self.use_virtual_begin_group()
        self.use_virtual_end_group()

        self.use_virtual_begin_layer()
        self.use_virtual_end_layer()

        self.out = out
        self.seqno = 0
        self.depth = 0
        self.clips = clips
        self.method = method

        self.scissors = None        # created lazily by compute_scissor()
        self.layer_name = ""        # optional content name

        self.linewidth = 0
        self.ptm = mupdf.FzMatrix()
        self.ctm = mupdf.FzMatrix()
        self.rot = mupdf.FzMatrix()
        self.lastpoint = mupdf.FzPoint()
        self.firstpoint = mupdf.FzPoint()
        self.havemove = 0
        # A preceding `self.pathrect = None` was dropped: it was
        # overwritten here before anything could read it.
        self.pathrect = mupdf.FzRect()
        self.pathfactor = 0
        self.linecount = 0
        self.path_type = 0

    #drop_device = jm_lineart_drop_device

    fill_path = jm_lineart_fill_path
    stroke_path = jm_lineart_stroke_path
    clip_image_mask = jm_lineart_clip_image_mask
    clip_path = jm_lineart_clip_path
    clip_stroke_path = jm_lineart_clip_stroke_path
    clip_text = jm_lineart_clip_text
    clip_stroke_text = jm_lineart_clip_stroke_text

    # Non-path content only advances the sequence number.
    fill_text = jm_increase_seqno
    stroke_text = jm_increase_seqno
    ignore_text = jm_increase_seqno

    fill_shade = jm_increase_seqno
    fill_image = jm_increase_seqno
    fill_image_mask = jm_increase_seqno

    pop_clip = jm_lineart_pop_clip

    begin_group = jm_lineart_begin_group
    end_group = jm_lineart_end_group

    begin_layer = jm_lineart_begin_layer
    end_layer = jm_lineart_end_layer
def ConversionTrailer(i: str):
    """Return the closing text for an output format.

    Args:
        i: format name, compared case-insensitively. Recognised values
            are "html", "json", "xml" and "xhtml".
    Returns:
        The trailer string for that format, or "" for any other name.
    """
    html_end = "</body>\n</html>\n"
    trailers = {
        "html": html_end,
        "json": "]\n}",
        "xml": "</document>\n",
        "xhtml": html_end,  # same closing tags as html
    }
    return trailers.get(i.lower(), "")
def get_pdf_now() -> str:
    '''
    "Now" timestamp in PDF Format

    Returns:
        The local time as "D:YYYYMMDDHHMMSS", followed by the UTC offset
        as +HH'MM' or -HH'MM'. The offset is omitted when it is zero.
    '''
    import time

    now = time.localtime()
    # tm_gmtoff is the offset (seconds east of UTC) that applies to `now`,
    # so DST is only counted while it is in effect. time.altzone is the
    # DST offset regardless of the current date.
    offset = now.tm_gmtoff
    if offset is None:
        offset = -(time.altzone if now.tm_isdst > 0 else time.timezone)

    tstamp = time.strftime("D:%Y%m%d%H%M%S", now)
    if offset == 0:
        return tstamp

    # Split the magnitude, not the signed value: floor division of a
    # negative number rounds away from zero, which turned UTC+05:30 into
    # +06'30'.
    hours, minutes = divmod(abs(offset) // 60, 60)
    sign = "+" if offset > 0 else "-"
    return f"{tstamp}{sign}{hours:02d}'{minutes:02d}'"
def get_highlight_selection(page, start: point_like =None, stop: point_like =None, clip: rect_like =None) -> list:
    """Return rectangles of text lines between two points.

    Notes:
        The default of 'start' is top-left of 'clip'. The default of 'stop'
        is bottom-right of 'clip'.

    Args:
        page: the page to extract text lines from.
        start: start point_like
        stop: end point_like, must be 'below' start
        clip: consider this rect_like only, default is page rectangle
    Returns:
        List of line bbox intersections with the area established by the
        parameters. The list is empty if the area is empty or infinite, or
        if it contains no text lines.
    """
    # validate and normalize arguments
    if clip is None:
        clip = page.rect
    clip = Rect(clip)  # a copy, so the caller's rectangle is not modified
    # Accept any point_like (e.g. a plain tuple), not only Point objects:
    # the code below needs the .x / .y attributes.
    start = clip.tl if start is None else Point(start)
    stop = clip.br if stop is None else Point(stop)
    clip.y0 = start.y
    clip.y1 = stop.y
    if clip.is_empty or clip.is_infinite:
        return []

    # extract text of page, clip only, no images, expand ligatures
    blocks = page.get_text("dict", flags=0, clip=clip)["blocks"]

    lines = []  # will return this list of rectangles
    for block in blocks:
        block_box = Rect(block["bbox"])
        if block_box.is_infinite or block_box.is_empty:
            continue
        for line in block["lines"]:
            line_box = Rect(line["bbox"])
            if line_box.is_infinite or line_box.is_empty:
                continue
            lines.append(line_box)

    if not lines:  # did not select anything
        return lines

    lines.sort(key=lambda bbox: bbox.y1)  # sort by vertical positions

    # cut off prefix from first line if start point is close to its top
    first = lines[0]
    if first.y0 - start.y <= 0.1 * first.height:  # close enough?
        trimmed = Rect(start.x, first.y0, first.br)  # intersection rectangle
        if trimmed.is_empty or trimmed.is_infinite:
            del lines[0]  # nothing of the first line remains
        else:
            lines[0] = trimmed

    if not lines:  # the list might have been emptied
        return lines

    # cut off suffix from last line if stop point is close to its bottom
    last = lines[-1]
    if stop.y - last.y1 <= 0.1 * last.height:  # close enough?
        trimmed = Rect(last.tl, stop.x, last.y1)  # intersection rectangle
        if trimmed.is_empty or trimmed.is_infinite:
            del lines[-1]  # nothing of the last line remains
        else:
            lines[-1] = trimmed

    return lines
def util_make_rect( *args, p0=None, p1=None, x0=None, y0=None, x1=None, y1=None):
    '''
    Helper for initialising rectangle classes.

    Returns (x0, y0, x1, y1) derived from the positional <args>; any of
    the keyword arguments that is not None then overrides the matching
    value(s).

    Accepted forms of <args>:
        ()                          all zeros
        (top-left, bottom-right)
        (top-left, x1, y1)
        (x0, y0, bottom-right)
        (x0, y0, x1, y1)
        (rect)                      also a 2- or 3-item list/tuple whose
                                    items flatten to four values

    top-left and bottom-right are (x, y) pairs or objects with .x and .y;
    rect is something with .x0, .y0, .x1 and .y1.

    Raises:
        Exception: if the positional arguments match none of the forms.
    '''
    def xy_of(obj):
        # (x, y) of a point-like object, or (None, None) if unrecognised.
        if isinstance(obj, (list, tuple)) and len(obj) == 2:
            return obj[0], obj[1]
        if isinstance(obj, (Point, mupdf.FzPoint, mupdf.fz_point)):
            return obj.x, obj.y
        return None, None

    def as_sequence(obj):
        # Coordinates of a point/rect, a sequence unchanged, or a scalar
        # wrapped in a 1-tuple.
        if isinstance(obj, tuple):
            return obj
        if isinstance(obj, Point):
            return obj.x, obj.y
        if isinstance(obj, (Rect, IRect, mupdf.FzRect, mupdf.fz_rect)):
            return obj.x0, obj.y0, obj.x1, obj.y1
        return obj if isinstance(obj, list) else (obj,)

    def from_positional():
        count = len(args)
        if count == 0:
            return 0, 0, 0, 0
        if count == 4:
            return args
        if count == 2:
            ret = xy_of(args[0]) + xy_of(args[1])
            assert len(ret) == 4
            return ret
        if count == 3:
            # Either the first or the last argument is the point.
            px, py = xy_of(args[0])
            if (px, py) != (None, None):
                return px, py, args[1], args[2]
            px, py = xy_of(args[2])
            if (px, py) != (None, None):
                return args[0], args[1], px, py
        if count == 1:
            arg = args[0]
            if isinstance(arg, (list, tuple)):
                if len(arg) == 2:
                    first, second = arg
                    ret = (*first, *second)
                    assert len(ret) == 4
                    return ret
                if len(arg) == 3:
                    ret = tuple(
                            value
                            for item in arg
                            for value in as_sequence(item)
                            )
                    assert len(ret) == 4
                    return ret
            ret = as_sequence(arg)
            assert len(ret) == 4, f'{arg=} {ret=}'
            return ret
        raise Exception( f'Unrecognised args: {args}')

    ret_x0, ret_y0, ret_x1, ret_y1 = from_positional()
    # Keyword overrides: points first, then single coordinates.
    if p0 is not None:
        ret_x0, ret_y0 = xy_of(p0)
    if p1 is not None:
        ret_x1, ret_y1 = xy_of(p1)
    if x0 is not None:
        ret_x0 = x0
    if y0 is not None:
        ret_y0 = y0
    if x1 is not None:
        ret_x1 = x1
    if y1 is not None:
        ret_y1 = y1
    return ret_x0, ret_y0, ret_x1, ret_y1
def util_invert_matrix(matrix):
    '''
    Invert a matrix in Python.

    Args:
        matrix: anything JM_matrix_from_py() accepts.
    Returns:
        (0, (a, b, c, d, e, f)) with the components of the inverse, or
        (1, ()) if the determinant is not outside
        [-sys.float_info.epsilon, sys.float_info.epsilon].
    '''
    src = JM_matrix_from_py(matrix)
    eps = sys.float_info.epsilon
    det = src.a * src.d - src.b * src.c
    if not (det < -eps or det > eps):
        # Not invertible.
        return 1, ()

    rdet = 1 / det
    # Results are stored in an FzMatrix and read back from it, so the
    # returned numbers are whatever that type holds after assignment.
    inv = mupdf.FzMatrix()
    inv.a = src.d * rdet
    inv.b = -src.b * rdet
    inv.c = -src.c * rdet
    inv.d = src.a * rdet
    # The translation part uses the stored linear part.
    inv.e = -src.e * inv.a - src.f * inv.c
    inv.f = -src.e * inv.b - src.f * inv.d
    return 0, (inv.a, inv.b, inv.c, inv.d, inv.e, inv.f)
def match_string(h0, n0):
    '''
    Test whether string `n0` matches at the start of string `h0`.

    Both strings are read one character at a time through chartocanon(),
    i.e. compared on their canon() values. A run of canonical spaces on
    either side is consumed as a whole once the run's first space has
    matched, so differing amounts of whitespace still match.

    Args:
        h0: the text to look in (find_string() passes a tail of its
            haystack).
        n0: the text to look for.
    Returns:
        An offset into `h0` recorded after the last matched character if
        all of `n0` was consumed, else None.
    '''
    h = 0   # read position in h0
    n = 0   # read position in n0
    e = h   # value of h when the current pair was found equal
    # chartocanon() returns (step, canonical code): the step advances the
    # read position, the code is what gets compared.
    delta_h, hc = chartocanon(h0[h:])
    h += delta_h
    delta_n, nc = chartocanon(n0[n:])
    n += delta_n
    # NOTE(review): the loop only ends when the two codes differ. If both
    # strings run out at the same time and chartocanon('') keeps yielding
    # the same code for both, this would not terminate -- confirm what
    # fz_chartorune returns for an empty string.
    while hc == nc:
        e = h
        if hc == ord(' '):
            # Skip the rest of a whitespace run in h0.
            while 1:
                delta_h, hc = chartocanon(h0[h:])
                h += delta_h
                if hc != ord(' '):
                    break
        else:
            delta_h, hc = chartocanon(h0[h:])
            h += delta_h
        if nc == ord(' '):
            # Skip the rest of a whitespace run in n0.
            while 1:
                delta_n, nc = chartocanon(n0[n:])
                n += delta_n
                if nc != ord(' '):
                    break
        else:
            delta_n, nc = chartocanon(n0[n:])
            n += delta_n
    # presumably code 0 marks the end of n0 -- TODO confirm
    return None if nc != 0 else e
''' if g_use_extra: #log( 'Calling C++ extra.page_merge()') return extra.page_merge( doc_des, doc_src, page_from, page_to, rotate, links, copy_annots, graft_map) # list of object types (per page) we want to copy known_page_objs = [ PDF_NAME('Contents'), PDF_NAME('Resources'), PDF_NAME('MediaBox'), PDF_NAME('CropBox'), PDF_NAME('BleedBox'), PDF_NAME('TrimBox'), PDF_NAME('ArtBox'), PDF_NAME('Rotate'), PDF_NAME('UserUnit'), ] page_ref = mupdf.pdf_lookup_page_obj(doc_src, page_from) # make new page dict in dest doc page_dict = mupdf.pdf_new_dict(doc_des, 4) mupdf.pdf_dict_put(page_dict, PDF_NAME('Type'), PDF_NAME('Page')) # copy objects of source page into it for i in range( len(known_page_objs)): obj = mupdf.pdf_dict_get_inheritable( page_ref, known_page_objs[i]) if obj.m_internal: #log( '{=type(graft_map) type(graft_map.this)}') mupdf.pdf_dict_put( page_dict, known_page_objs[i], mupdf.pdf_graft_mapped_object(graft_map.this, obj)) # Copy annotations, but skip Link, Popup, IRT, Widget types # If selected, remove dict keys P (parent) and Popup if copy_annots: old_annots = mupdf.pdf_dict_get( page_ref, PDF_NAME('Annots')) n = mupdf.pdf_array_len( old_annots) if n > 0: new_annots = mupdf.pdf_dict_put_array( page_dict, PDF_NAME('Annots'), n) for i in range(n): o = mupdf.pdf_array_get( old_annots, i) if not o.m_internal or not mupdf.pdf_is_dict(o): continue # skip non-dict items if mupdf.pdf_dict_gets( o, "IRT").m_internal: continue subtype = mupdf.pdf_dict_get( o, PDF_NAME('Subtype')) if mupdf.pdf_name_eq( subtype, PDF_NAME('Link')): continue if mupdf.pdf_name_eq( subtype, PDF_NAME('Popup')): continue if mupdf.pdf_name_eq(subtype, PDF_NAME('Widget')): continue mupdf.pdf_dict_del( o, PDF_NAME('Popup')) mupdf.pdf_dict_del( o, PDF_NAME('P')) copy_o = mupdf.pdf_graft_mapped_object( graft_map.this, o) annot = mupdf.pdf_new_indirect( doc_des, mupdf.pdf_to_num( copy_o), 0) mupdf.pdf_array_push( new_annots, annot) # rotate the page if rotate != -1: mupdf.pdf_dict_put_int( 
def paper_size(s: str) -> tuple:
    """Return a tuple (width, height) for a given paper format string.

    Notes:
        'A4-L' will return (842, 595), the values for A4 landscape.
        Suffix '-P' and no suffix return the portrait tuple.
        Unknown formats return (-1, -1).
    """
    key = s.lower()
    landscape = key.endswith("-l")
    if landscape:
        key = key[:-2]
    if key.endswith("-p"):
        key = key[:-2]
    width, height = paper_sizes().get(key, (-1, -1))
    return (height, width) if landscape else (width, height)


def paper_sizes():
    """Known paper formats @ 72 dpi as a dictionary.

    Key is the format string like "a4" for ISO-A4. Value is the tuple
    (width, height). A new dictionary is built on every call.

    Information taken from the following web sites:
    www.din-formate.de
    www.din-formate.info/amerikanische-formate.html
    www.directtools.de/wissen/normen/iso.htm
    """
    return {
        # ISO A series
        "a0": (2384, 3370), "a1": (1684, 2384), "a10": (74, 105),
        "a2": (1191, 1684), "a3": (842, 1191), "a4": (595, 842),
        "a5": (420, 595), "a6": (298, 420), "a7": (210, 298),
        "a8": (147, 210), "a9": (105, 147),
        # ISO B series
        "b0": (2835, 4008), "b1": (2004, 2835), "b10": (88, 125),
        "b2": (1417, 2004), "b3": (1001, 1417), "b4": (709, 1001),
        "b5": (499, 709), "b6": (354, 499), "b7": (249, 354),
        "b8": (176, 249), "b9": (125, 176),
        # ISO C series
        "c0": (2599, 3677), "c1": (1837, 2599), "c10": (79, 113),
        "c2": (1298, 1837), "c3": (918, 1298), "c4": (649, 918),
        "c5": (459, 649), "c6": (323, 459), "c7": (230, 323),
        "c8": (162, 230), "c9": (113, 162),
        # other formats
        "card-4x6": (288, 432),
        "card-5x7": (360, 504),
        "commercial": (297, 684),
        "executive": (522, 756),
        "invoice": (396, 612),
        "ledger": (792, 1224),
        "legal": (612, 1008),
        "legal-13": (612, 936),
        "letter": (612, 792),
        "monarch": (279, 540),
        "tabloid-extra": (864, 1296),
    }
ret = '' if mupdf.pdf_is_array(o): l = mupdf.pdf_array_len(o) ret += f'array {l}\n' for i in range(l): oo = mupdf.pdf_array_get(o, i) ret += pdfobj_string(oo, prefix + ' ') ret += '\n' elif mupdf.pdf_is_bool(o): ret += f'bool: {o.array_get_bool()}\n' elif mupdf.pdf_is_dict(o): l = mupdf.pdf_dict_len(o) ret += f'dict {l}\n' for i in range(l): key = mupdf.pdf_dict_get_key(o, i) value = mupdf.pdf_dict_get( o, key) ret += f'{prefix} {key}: ' ret += pdfobj_string( value, prefix + ' ') ret += '\n' elif mupdf.pdf_is_embedded_file(o): ret += f'embedded_file: {o.embedded_file_name()}\n' elif mupdf.pdf_is_indirect(o): ret += f'indirect: ...\n' elif mupdf.pdf_is_int(o): ret += f'int: {mupdf.pdf_to_int(o)}\n' elif mupdf.pdf_is_jpx_image(o): ret += f'jpx_image:\n' elif mupdf.pdf_is_name(o): ret += f'name: {mupdf.pdf_to_name(o)}\n' elif o.pdf_is_null: ret += f'null\n' #elif o.pdf_is_number: # ret += f'number\n' elif o.pdf_is_real: ret += f'real: {o.pdf_to_real()}\n' elif mupdf.pdf_is_stream(o): ret += f'stream\n' elif mupdf.pdf_is_string(o): ret += f'string: {mupdf.pdf_to_string(o)}\n' else: ret += '<>\n' return ret def repair_mono_font(page: "Page", font: "Font") -> None: """Repair character spacing for mono fonts. Notes: Some mono-spaced fonts are displayed with a too large character distance, e.g. "a b c" instead of "abc". This utility adds an entry "/W[0 65535 w]" to the descendent font(s) of font. The float w is taken to be the width of 0x20 (space). This should enforce viewers to use 'w' as the character width. Args: page: pymupdf.Page object. font: pymupdf.Font object. 
def strip_outline(doc, outlines, page_count, page_object_nums, names_list):
    '''
    Walk one level of outline items, starting at `outlines` and following
    the 'Next' entries, and strip items whose destination is invalid.

    The children of every item are processed first with strip_outlines().
    An item with an invalid destination and no children is skipped, with
    the 'Next'/'Prev' entries of its neighbours adjusted. An item with an
    invalid destination but with children keeps its place and only loses
    its 'Dest' and 'A' entries.

    Args:
        doc: passed through to strip_outlines().
        outlines: first outline item of the level.
        page_count, page_object_nums, names_list: passed through to
            dest_is_valid() and strip_outlines().
    Returns:
        (count, first, prev): the number of items with a valid
        destination, and the first and the last of those items (None if
        there is none).
    '''
    first = None
    count = 0
    current = outlines
    prev = None     # last kept item; None until one has been kept
    while current.m_internal:
        # Strip any children to start with. This takes care of
        # First / Last / Count for us.
        nc = strip_outlines(doc, current, page_count, page_object_nums, names_list)

        if dest_is_valid(current, page_count, page_object_nums, names_list):
            # Keep this one
            if not first or not first.m_internal:
                first = current
            prev = current
            current = mupdf.pdf_dict_get(current, PDF_NAME('Next'))
            count += 1
        elif nc == 0:
            # Outline with invalid dest and no children. Drop it by
            # pulling the next one in here.
            next_item = mupdf.pdf_dict_get(current, PDF_NAME('Next'))
            # `prev` is still None while nothing has been kept, so it must
            # be tested before its m_internal is read. Dropping the first
            # item(s) of a level previously raised AttributeError here.
            has_prev = prev is not None and prev.m_internal
            if not next_item.m_internal:
                # There is no next one to pull in
                if has_prev:
                    mupdf.pdf_dict_del(prev, PDF_NAME('Next'))
            elif has_prev:
                mupdf.pdf_dict_put(prev, PDF_NAME('Next'), next_item)
                mupdf.pdf_dict_put(next_item, PDF_NAME('Prev'), prev)
            else:
                mupdf.pdf_dict_del(next_item, PDF_NAME('Prev'))
            current = next_item
        else:
            # Outline with invalid dest, but children. Just drop the dest.
            mupdf.pdf_dict_del(current, PDF_NAME('Dest'))
            mupdf.pdf_dict_del(current, PDF_NAME('A'))
            current = mupdf.pdf_dict_get(current, PDF_NAME('Next'))
    return count, first, prev
""" import unicodedata try: name = unicodedata.name(chr(ch)) except ValueError: name = ".notdef" return name def vdist(dir, a, b): dx = b.x - a.x dy = b.y - a.y return mupdf.fz_abs(dx * dir.y + dy * dir.x) def apply_pages( path, pagefn, *, pagefn_args=(), pagefn_kwargs=dict(), initfn=None, initfn_args=(), initfn_kwargs=dict(), pages=None, method='single', concurrency=None, _stats=False, ): ''' Returns list of results from `pagefn()`, optionally using concurrency for speed. Args: path: Path of document. pagefn: Function to call for each page; is passed (page, *pagefn_args, **pagefn_kwargs). Return value is added to list that we return. If `method` is not 'single', must be a top-level function - nested functions don't work with concurrency. pagefn_args pagefn_kwargs: Additional args to pass to `pagefn`. Must be picklable. initfn: If true, called once in each worker process; is passed (*initfn_args, **initfn_kwargs). initfn_args initfn_kwargs: Args to pass to initfn. Must be picklable. pages: List of page numbers to process, or None to include all pages. method: 'single' Do not use concurrency. 'mp' Operate concurrently using Python's `multiprocessing` module. 'fork' Operate concurrently using custom implementation with `os.fork()`. Does not work on Windows. concurrency: Number of worker processes to use when operating concurrently. If None, we use the number of available CPUs. _stats: Internal, may change or be removed. If true, we output simple timing diagnostics. Note: We require a file path rather than a Document, because Document instances do not work properly after a fork - internal file descriptor offsets are shared between the parent and child processes. 
''' if _stats: t0 = time.time() if method == 'single': if initfn: initfn(*initfn_args, **initfn_kwargs) ret = list() document = Document(path) if pages is None: pages = range(len(document)) for pno in pages: page = document[pno] r = pagefn(page, *pagefn_args, **initfn_kwargs) ret.append(r) else: # Use concurrency. # from . import _apply_pages if pages is None: if _stats: t = time.time() with Document(path) as document: num_pages = len(document) pages = list(range(num_pages)) if _stats: t = time.time() - t log(f'{t:.2f}s: count pages.') if _stats: t = time.time() if method == 'mp': ret = _apply_pages._multiprocessing( path, pages, pagefn, pagefn_args, pagefn_kwargs, initfn, initfn_args, initfn_kwargs, concurrency, _stats, ) elif method == 'fork': ret = _apply_pages._fork( path, pages, pagefn, pagefn_args, pagefn_kwargs, initfn, initfn_args, initfn_kwargs, concurrency, _stats, ) else: assert 0, f'Unrecognised {method=}.' if _stats: t = time.time() - t log(f'{t:.2f}s: work.') if _stats: t = time.time() - t0 log(f'{t:.2f}s: total.') return ret def get_text( path, *, pages=None, method='single', concurrency=None, option='text', clip=None, flags=None, textpage=None, sort=False, delimiters=None, _stats=False, ): ''' Returns list of results from `Page.get_text()`, optionally using concurrency for speed. Args: path: Path of document. pages: List of page numbers to process, or None to include all pages. method: 'single' Do not use concurrency. 'mp' Operate concurrently using Python's `multiprocessing` module. 'fork' Operate concurrently using custom implementation with `os.fork`. Does not work on Windows. concurrency: Number of worker processes to use when operating concurrently. If None, we use the number of available CPUs. option clip flags textpage sort delimiters: Passed to internal calls to `Page.get_text()`. 
''' args_dict = dict( option=option, clip=clip, flags=flags, textpage=textpage, sort=sort, delimiters=delimiters, ) return apply_pages( path, Page.get_text, pagefn_kwargs=args_dict, pages=pages, method=method, concurrency=concurrency, _stats=_stats, ) class TOOLS: ''' We use @staticmethod to avoid the need to create an instance of this class. ''' def _derotate_matrix(page): if isinstance(page, mupdf.PdfPage): return JM_py_from_matrix(JM_derotate_page_matrix(page)) else: return JM_py_from_matrix(mupdf.FzMatrix()) @staticmethod def _fill_widget(annot, widget): val = JM_get_widget_properties(annot, widget) widget.rect = Rect(annot.rect) widget.xref = annot.xref widget.parent = annot.parent widget._annot = annot # backpointer to annot object if not widget.script: widget.script = None if not widget.script_stroke: widget.script_stroke = None if not widget.script_format: widget.script_format = None if not widget.script_change: widget.script_change = None if not widget.script_calc: widget.script_calc = None if not widget.script_blur: widget.script_blur = None if not widget.script_focus: widget.script_focus = None return val @staticmethod def _get_all_contents(page): page = _as_pdf_page(page.this) res = JM_read_contents(page.obj()) result = JM_BinFromBuffer( res) return result @staticmethod def _insert_contents(page, newcont, overlay=1): """Add bytes as a new /Contents object for a page, and return its xref.""" pdfpage = _as_pdf_page(page, required=1) contbuf = JM_BufferFromBytes(newcont) xref = JM_insert_contents(pdfpage.doc(), pdfpage.obj(), contbuf, overlay) #fixme: pdfpage->doc->dirty = 1; return xref @staticmethod def _le_annot_parms(annot, p1, p2, fill_color): """Get common parameters for making annot line end symbols. 
Returns: m: matrix that maps p1, p2 to points L, P on the x-axis im: its inverse L, P: transformed p1, p2 w: line width scol: stroke color string fcol: fill color store_shrink opacity: opacity string (gs command) """ w = annot.border["width"] # line width sc = annot.colors["stroke"] # stroke color if not sc: # black if missing sc = (0,0,0) scol = " ".join(map(str, sc)) + " RG\n" if fill_color: fc = fill_color else: fc = annot.colors["fill"] # fill color if not fc: fc = (1,1,1) # white if missing fcol = " ".join(map(str, fc)) + " rg\n" # nr = annot.rect np1 = p1 # point coord relative to annot rect np2 = p2 # point coord relative to annot rect m = Matrix(util_hor_matrix(np1, np2)) # matrix makes the line horizontal im = ~m # inverted matrix L = np1 * m # converted start (left) point R = np2 * m # converted end (right) point if 0 <= annot.opacity < 1: opacity = "/H gs\n" else: opacity = "" return m, im, L, R, w, scol, fcol, opacity @staticmethod def _le_butt(annot, p1, p2, lr, fill_color): """Make stream commands for butt line end symbol. "lr" denotes left (False) or right point. """ m, im, L, R, w, scol, fcol, opacity = TOOLS._le_annot_parms(annot, p1, p2, fill_color) shift = 3 d = shift * max(1, w) M = R if lr else L top = (M + (0, -d/2.)) * im bot = (M + (0, d/2.)) * im ap = "\nq\n%s%f %f m\n" % (opacity, top.x, top.y) ap += "%f %f l\n" % (bot.x, bot.y) ap += _format_g(w) + " w\n" ap += scol + "s\nQ\n" return ap @staticmethod def _le_circle(annot, p1, p2, lr, fill_color): """Make stream commands for circle line end symbol. "lr" denotes left (False) or right point. 
"""
        # Shared pattern of all `_le_*` builders: work in the frame where the
        # line is horizontal (points L, R), then map every point back with
        # `im` before writing it into the stream.
        m, im, L, R, w, scol, fcol, opacity = TOOLS._le_annot_parms(annot, p1, p2, fill_color)
        shift = 2.5  # 2*shift*width = length of square edge
        d = shift * max(1, w)
        M = R - (d/2., 0) if lr else L + (d/2., 0)
        r = Rect(M, M) + (-d, -d, d, d)  # the square
        # The circle is the oval inscribed in the back-transformed square.
        ap = "q\n" + opacity + TOOLS._oval_string(r.tl * im, r.tr * im, r.br * im, r.bl * im)
        ap += _format_g(w) + " w\n"
        ap += scol + fcol + "b\nQ\n"  # "b": close, fill and stroke the path
        return ap

    @staticmethod
    def _le_closedarrow(annot, p1, p2, lr, fill_color):
        """Make stream commands for closed arrow line end symbol.

        "lr" denotes left (False) or right point.

        Returns the commands as a string.
        """
        m, im, L, R, w, scol, fcol, opacity = TOOLS._le_annot_parms(annot, p1, p2, fill_color)
        shift = 2.5
        d = shift * max(1, w)
        # From here on p1, p2, p3 are the triangle corners (the parameters
        # p1, p2 are no longer needed): p2 is the tip, p1 / p3 the two ends of
        # the base, 2*d behind the tip.
        p2 = R + (d/2., 0) if lr else L - (d/2., 0)
        p1 = p2 + (-2*d, -d) if lr else p2 + (2*d, -d)
        p3 = p2 + (-2*d, d) if lr else p2 + (2*d, d)
        p1 *= im
        p2 *= im
        p3 *= im
        ap = "\nq\n%s%f %f m\n" % (opacity, p1.x, p1.y)
        ap += "%f %f l\n" % (p2.x, p2.y)
        ap += "%f %f l\n" % (p3.x, p3.y)
        ap += _format_g(w) + " w\n"
        ap += scol + fcol + "b\nQ\n"  # "b": close, fill and stroke the path
        return ap

    @staticmethod
    def _le_diamond(annot, p1, p2, lr, fill_color):
        """Make stream commands for diamond line end symbol.

        "lr" denotes left (False) or right point.

        Returns the commands as a string.
        """
        m, im, L, R, w, scol, fcol, opacity = TOOLS._le_annot_parms(annot, p1, p2, fill_color)
        shift = 2.5  # 2*shift*width = length of square edge
        d = shift * max(1, w)
        M = R - (d/2., 0) if lr else L + (d/2., 0)
        r = Rect(M, M) + (-d, -d, d, d)  # the square
        # the square makes line longer by (2*shift - 1)*width
        # The diamond connects the midpoints of the four sides of the square:
        # left, top, right, bottom.
        p = (r.tl + (r.bl - r.tl) * 0.5) * im
        ap = "q\n%s%f %f m\n" % (opacity, p.x, p.y)
        p = (r.tl + (r.tr - r.tl) * 0.5) * im
        ap += "%f %f l\n" % (p.x, p.y)
        p = (r.tr + (r.br - r.tr) * 0.5) * im
        ap += "%f %f l\n" % (p.x, p.y)
        p = (r.br + (r.bl - r.br) * 0.5) * im
        ap += "%f %f l\n" % (p.x, p.y)
        ap += _format_g(w) + " w\n"
        ap += scol + fcol + "b\nQ\n"  # "b": close, fill and stroke the path
        return ap

    @staticmethod
    def _le_openarrow(annot, p1, p2, lr, fill_color):
        """Make stream commands for open arrow line end symbol.

        "lr" denotes left (False) or right point.

        Returns the commands as a string.
        """
        m, im, L, R, w, scol, fcol, opacity = TOOLS._le_annot_parms(annot, p1, p2, fill_color)
        shift = 2.5
        d = shift * max(1, w)
        # Same three points as the closed arrow; only the painting operator
        # differs.
        p2 = R + (d/2., 0) if lr else L - (d/2., 0)
        p1 = p2 + (-2*d, -d) if lr else p2 + (2*d, -d)
        p3 = p2 + (-2*d, d) if lr else p2 + (2*d, d)
        p1 *= im
        p2 *= im
        p3 *= im
        ap = "\nq\n%s%f %f m\n" % (opacity, p1.x, p1.y)
        ap += "%f %f l\n" % (p2.x, p2.y)
        ap += "%f %f l\n" % (p3.x, p3.y)
        ap += _format_g(w) + " w\n"
        ap += scol + "S\nQ\n"  # "S": stroke only, path is left open
        return ap

    @staticmethod
    def _le_rclosedarrow(annot, p1, p2, lr, fill_color):
        """Make stream commands for right closed arrow line end symbol.

        "lr" denotes left (False) or right point.

        Returns the commands as a string.
        """
        m, im, L, R, w, scol, fcol, opacity = TOOLS._le_annot_parms(annot, p1, p2, fill_color)
        shift = 2.5
        d = shift * max(1, w)
        # Reversed arrow: the tip p2 lies 2*d inside the line, and the base
        # points p1 / p3 share the x-coordinate of the line end.
        p2 = R - (2*d, 0) if lr else L + (2*d, 0)
        p1 = p2 + (2*d, -d) if lr else p2 + (-2*d, -d)
        p3 = p2 + (2*d, d) if lr else p2 + (-2*d, d)
        p1 *= im
        p2 *= im
        p3 *= im
        ap = "\nq\n%s%f %f m\n" % (opacity, p1.x, p1.y)
        ap += "%f %f l\n" % (p2.x, p2.y)
        ap += "%f %f l\n" % (p3.x, p3.y)
        ap += _format_g(w) + " w\n"
        ap += scol + fcol + "b\nQ\n"  # "b": close, fill and stroke the path
        return ap

    @staticmethod
    def _le_ropenarrow(annot, p1, p2, lr, fill_color):
        """Make stream commands for right open arrow line end symbol.

        "lr" denotes left (False) or right point.

        Returns the commands as a string.
        """
        m, im, L, R, w, scol, fcol, opacity = TOOLS._le_annot_parms(annot, p1, p2, fill_color)
        shift = 2.5
        d = shift * max(1, w)
        # Reversed arrow with its tip p2 only d/3 inside the line end.
        p2 = R - (d/3., 0) if lr else L + (d/3., 0)
        p1 = p2 + (2*d, -d) if lr else p2 + (-2*d, -d)
        p3 = p2 + (2*d, d) if lr else p2 + (-2*d, d)
        p1 *= im
        p2 *= im
        p3 *= im
        ap = "\nq\n%s%f %f m\n" % (opacity, p1.x, p1.y)
        ap += "%f %f l\n" % (p2.x, p2.y)
        ap += "%f %f l\n" % (p3.x, p3.y)
        ap += _format_g(w) + " w\n"
        # NOTE(review): the fill color is emitted although "S" only strokes;
        # _le_openarrow omits it - confirm whether this is intended.
        ap += scol + fcol + "S\nQ\n"
        return ap

    @staticmethod
    def _le_slash(annot, p1, p2, lr, fill_color):
        """Make stream commands for slash line end symbol.

        "lr" denotes left (False) or right point.
""" m, im, L, R, w, scol, fcol, opacity = TOOLS._le_annot_parms(annot, p1, p2, fill_color) rw = 1.1547 * max(1, w) * 1.0 # makes rect diagonal a 30 deg inclination M = R if lr else L r = Rect(M.x - rw, M.y - 2 * w, M.x + rw, M.y + 2 * w) top = r.tl * im bot = r.br * im ap = "\nq\n%s%f %f m\n" % (opacity, top.x, top.y) ap += "%f %f l\n" % (bot.x, bot.y) ap += _format_g(w) + " w\n" ap += scol + "s\nQ\n" return ap @staticmethod def _le_square(annot, p1, p2, lr, fill_color): """Make stream commands for square line end symbol. "lr" denotes left (False) or right point. """ m, im, L, R, w, scol, fcol, opacity = TOOLS._le_annot_parms(annot, p1, p2, fill_color) shift = 2.5 # 2*shift*width = length of square edge d = shift * max(1, w) M = R - (d/2., 0) if lr else L + (d/2., 0) r = Rect(M, M) + (-d, -d, d, d) # the square # the square makes line longer by (2*shift - 1)*width p = r.tl * im ap = "q\n%s%f %f m\n" % (opacity, p.x, p.y) p = r.tr * im ap += "%f %f l\n" % (p.x, p.y) p = r.br * im ap += "%f %f l\n" % (p.x, p.y) p = r.bl * im ap += "%f %f l\n" % (p.x, p.y) ap += _format_g(w) + " w\n" ap += scol + fcol + "b\nQ\n" return ap @staticmethod def _oval_string(p1, p2, p3, p4): """Return /AP string defining an oval within a 4-polygon provided as points """ def bezier(p, q, r): f = "%f %f %f %f %f %f c\n" return f % (p.x, p.y, q.x, q.y, r.x, r.y) kappa = 0.55228474983 # magic number ml = p1 + (p4 - p1) * 0.5 # middle points ... mo = p1 + (p2 - p1) * 0.5 # for each ... mr = p2 + (p3 - p2) * 0.5 # polygon ... 
mu = p4 + (p3 - p4) * 0.5 # side ol1 = ml + (p1 - ml) * kappa # the 8 bezier ol2 = mo + (p1 - mo) * kappa # helper points or1 = mo + (p2 - mo) * kappa or2 = mr + (p2 - mr) * kappa ur1 = mr + (p3 - mr) * kappa ur2 = mu + (p3 - mu) * kappa ul1 = mu + (p4 - mu) * kappa ul2 = ml + (p4 - ml) * kappa # now draw, starting from middle point of left side ap = "%f %f m\n" % (ml.x, ml.y) ap += bezier(ol1, ol2, mo) ap += bezier(or1, or2, mr) ap += bezier(ur1, ur2, mu) ap += bezier(ul1, ul2, ml) return ap @staticmethod def _parse_da(annot): if g_use_extra: val = extra.Tools_parse_da( annot.this) else: def Tools__parse_da(annot): this_annot = annot.this assert isinstance(this_annot, mupdf.PdfAnnot) this_annot_obj = mupdf.pdf_annot_obj( this_annot) pdf = mupdf.pdf_get_bound_document( this_annot_obj) try: da = mupdf.pdf_dict_get_inheritable( this_annot_obj, PDF_NAME('DA')) if not da.m_internal: trailer = mupdf.pdf_trailer(pdf) da = mupdf.pdf_dict_getl(trailer, PDF_NAME('Root'), PDF_NAME('AcroForm'), PDF_NAME('DA'), ) da_str = mupdf.pdf_to_text_string(da) except Exception: if g_exceptions_verbose: exception_info() return return da_str val = Tools__parse_da(annot) if not val: return ((0,), "", 0) font = "Helv" fsize = 12 col = (0, 0, 0) dat = val.split() # split on any whitespace for i, item in enumerate(dat): if item == "Tf": font = dat[i - 2][1:] fsize = float(dat[i - 1]) dat[i] = dat[i-1] = dat[i-2] = "" continue if item == "g": # unicolor text col = [(float(dat[i - 1]))] dat[i] = dat[i-1] = "" continue if item == "rg": # RGB colored text col = [float(f) for f in dat[i - 3:i]] dat[i] = dat[i-1] = dat[i-2] = dat[i-3] = "" continue if item == "k": # CMYK colored text col = [float(f) for f in dat[i - 4:i]] dat[i] = dat[i-1] = dat[i-2] = dat[i-3] = dat[i-4] = "" continue val = (col, font, fsize) return val @staticmethod def _reset_widget(annot): this_annot = annot this_annot_obj = mupdf.pdf_annot_obj(this_annot) pdf = mupdf.pdf_get_bound_document(this_annot_obj) 
mupdf.pdf_field_reset(pdf, this_annot_obj) @staticmethod def _rotate_matrix(page): pdfpage = page._pdf_page(required=False) if not pdfpage.m_internal: return JM_py_from_matrix(mupdf.FzMatrix()) return JM_py_from_matrix(JM_rotate_page_matrix(pdfpage)) @staticmethod def _save_widget(annot, widget): JM_set_widget_properties(annot, widget) def _update_da(annot, da_str): if g_use_extra: extra.Tools_update_da( annot.this, da_str) else: try: this_annot = annot.this assert isinstance(this_annot, mupdf.PdfAnnot) mupdf.pdf_dict_put_text_string(mupdf.pdf_annot_obj(this_annot), PDF_NAME('DA'), da_str) mupdf.pdf_dict_del(mupdf.pdf_annot_obj(this_annot), PDF_NAME('DS')) # /* not supported */ mupdf.pdf_dict_del(mupdf.pdf_annot_obj(this_annot), PDF_NAME('RC')) # /* not supported */ except Exception: if g_exceptions_verbose: exception_info() return return @staticmethod def gen_id(): global TOOLS_JM_UNIQUE_ID TOOLS_JM_UNIQUE_ID += 1 return TOOLS_JM_UNIQUE_ID @staticmethod def glyph_cache_empty(): ''' Empty the glyph cache. ''' mupdf.fz_purge_glyph_cache() @staticmethod def image_profile(stream, keep_image=0): ''' Metadata of an image binary stream. ''' return JM_image_profile(stream, keep_image) @staticmethod def mupdf_display_errors(on=None): ''' Set MuPDF error display to True or False. ''' global JM_mupdf_show_errors if on is not None: JM_mupdf_show_errors = bool(on) return JM_mupdf_show_errors @staticmethod def mupdf_display_warnings(on=None): ''' Set MuPDF warnings display to True or False. ''' global JM_mupdf_show_warnings if on is not None: JM_mupdf_show_warnings = bool(on) return JM_mupdf_show_warnings @staticmethod def mupdf_version(): '''Get version of MuPDF binary build.''' return mupdf.FZ_VERSION @staticmethod def mupdf_warnings(reset=1): ''' Get the MuPDF warnings/errors with optional reset (default). ''' # Get any trailing `... repeated <N> times...` message. 
mupdf.fz_flush_warnings() ret = '\n'.join( JM_mupdf_warnings_store) if reset: TOOLS.reset_mupdf_warnings() return ret @staticmethod def reset_mupdf_warnings(): global JM_mupdf_warnings_store JM_mupdf_warnings_store = list() @staticmethod def set_aa_level(level): ''' Set anti-aliasing level. ''' mupdf.fz_set_aa_level(level) @staticmethod def set_annot_stem( stem=None): global JM_annot_id_stem if stem is None: return JM_annot_id_stem len_ = len(stem) + 1 if len_ > 50: len_ = 50 JM_annot_id_stem = stem[:50] return JM_annot_id_stem @staticmethod def set_font_width(doc, xref, width): pdf = _as_pdf_document(doc, required=0) if not pdf.m_internal: return False font = mupdf.pdf_load_object(pdf, xref) dfonts = mupdf.pdf_dict_get(font, PDF_NAME('DescendantFonts')) if mupdf.pdf_is_array(dfonts): n = mupdf.pdf_array_len(dfonts) for i in range(n): dfont = mupdf.pdf_array_get(dfonts, i) warray = mupdf.pdf_new_array(pdf, 3) mupdf.pdf_array_push(warray, mupdf.pdf_new_int(0)) mupdf.pdf_array_push(warray, mupdf.pdf_new_int(65535)) mupdf.pdf_array_push(warray, mupdf.pdf_new_int(width)) mupdf.pdf_dict_put(dfont, PDF_NAME('W'), warray) return True @staticmethod def set_graphics_min_line_width(min_line_width): ''' Set the graphics minimum line width. 
''' mupdf.fz_set_graphics_min_line_width(min_line_width) @staticmethod def set_icc( on=0): """Set ICC color handling on or off.""" if on: if mupdf.FZ_ENABLE_ICC: mupdf.fz_enable_icc() else: RAISEPY( "MuPDF built w/o ICC support",PyExc_ValueError) elif mupdf.FZ_ENABLE_ICC: mupdf.fz_disable_icc() @staticmethod def set_low_memory( on=None): """Set / unset MuPDF device caching.""" if on is not None: _globals.no_device_caching = bool(on) return _globals.no_device_caching @staticmethod def set_small_glyph_heights(on=None): """Set / unset small glyph heights.""" if on is not None: _globals.small_glyph_heights = bool(on) if g_use_extra: extra.set_small_glyph_heights(_globals.small_glyph_heights) return _globals.small_glyph_heights @staticmethod def set_subset_fontnames(on=None): ''' Set / unset returning fontnames with their subset prefix. ''' if on is not None: _globals.subset_fontnames = bool(on) if g_use_extra: extra.set_subset_fontnames(_globals.subset_fontnames) return _globals.subset_fontnames @staticmethod def show_aa_level(): ''' Show anti-aliasing values. ''' return dict( graphics = mupdf.fz_graphics_aa_level(), text = mupdf.fz_text_aa_level(), graphics_min_line_width = mupdf.fz_graphics_min_line_width(), ) @staticmethod def store_maxsize(): ''' MuPDF store size limit. ''' # fixme: return gctx->store->max. return None @staticmethod def store_shrink(percent): ''' Free 'percent' of current store size. ''' if percent >= 100: mupdf.fz_empty_store() return 0 if percent > 0: mupdf.fz_shrink_store( 100 - percent) # fixme: return gctx->store->size. @staticmethod def store_size(): ''' MuPDF current store size. ''' # fixme: return gctx->store->size. return None @staticmethod def unset_quad_corrections(on=None): ''' Set ascender / descender corrections on or off. 
''' if on is not None: _globals.skip_quad_corrections = bool(on) if g_use_extra: extra.set_skip_quad_corrections(_globals.skip_quad_corrections) return _globals.skip_quad_corrections # fixme: also defined at top-level. JM_annot_id_stem = 'fitz' fitz_config = JM_fitz_config() # Callbacks not yet supported with cppyy. if not mupdf_cppyy: mupdf.fz_set_warning_callback(JM_mupdf_warning) mupdf.fz_set_error_callback(JM_mupdf_error) # If there are pending warnings when we exit, we end up in this sequence: # # atexit() # -> mupdf::internal_thread_state::~internal_thread_state() # -> fz_drop_context() # -> fz_flush_warnings() # -> SWIG Director code # -> Python calling JM_mupdf_warning(). # # Unfortunately this causes a SEGV, seemingly because the SWIG Director code has # already been torn down. # # So we use a Python atexit handler to explicitly call fz_flush_warnings(); # this appears to happen early enough for the Director machinery to still # work. So in the sequence above, fz_flush_warnings() will find that there are # no pending warnings and will not attempt to call JM_mupdf_warning(). # def _atexit(): #log( 'PyMuPDF/src/__init__.py:_atexit() called') mupdf.fz_flush_warnings() mupdf.fz_set_warning_callback(None) mupdf.fz_set_error_callback(None) #log( '_atexit() returning') atexit.register( _atexit) # List of (name, red, green, blue) where: # name: upper-case name. # red, green, blue: integer in range 0..255. # from . import _wxcolors _wxcolors = _wxcolors._wxcolors # Dict mapping from name to (red, green, blue). # name: lower-case name. # red, green, blue: float in range 0..1. # pdfcolor = dict() for name, r, g, b in _wxcolors: pdfcolor[name.lower()] = (r/255, g/255, b/255) def colors_pdf_dict(): ''' Returns dict mapping from name to (red, green, blue). name: lower-case name. red, green, blue: float in range 0..1. ''' return pdfcolor def colors_wx_list(): ''' Returns list of (name, red, green, blue) tuples: name: upper-case name. 
red, green, blue: integers in range 0..255. ''' return _wxcolors def _mupdf_devel(make_links=True): ''' Allows PyMuPDF installation to be used to compile and link programmes that use the MuPDF C/C++ API. Args: make_links: If true, then on non-windows we also create softlinks to any shared libraries that are supplied with a version suffix; this allows them to be used in a link command. For example we create links such as: site-packages/pymupdf/ libmupdf.so -> libmupdf.so.26.7 libmupdfcpp.so -> libmupdfcpp.so.26.7 Returns: (mupdf_include, mupdf_lib). mupdf_include: Path of MuPDF include directory within PyMuPDF install. mupdf_lib Path of MuPDF library directory within PyMuPDF install. ''' import platform log(f'{mupdf_version=}') p = os.path.normpath(f'{__file__}/..') mupdf_include = f'{p}/mupdf-devel/include' if platform.system() == 'Windows': # Separate .lib files are used at build time. mupdf_lib = f'{p}/mupdf-devel/lib' else: # .so files are used for both buildtime and runtime linking. mupdf_lib = p log(f'Within installed PyMuPDF:') log(f' {mupdf_include=}') log(f' {mupdf_lib=}') assert os.path.isdir(mupdf_include), f'Not a directory: {mupdf_include=}.' assert os.path.isdir(mupdf_lib), f'Not a directory: {mupdf_lib=}.' if platform.system() != 'Windows' and make_links: # Make symbolic links within the installed pymupdf module so # that ld can find libmupdf.so etc. This is a bit of a hack, but # necessary because wheels cannot contain symbolic links. # # For example we create `libmupdf.so -> libmupdf.so.24.8`. # # We are careful to only create symlinks for the expected MuPDF # version, in case old .so files from a previous install are still # in place. 
# log(f'Creating symlinks in {mupdf_lib=} for MuPDF-{mupdf_version} .so files.') regex_suffix = mupdf_version.split('.')[1:3] regex_suffix = '[.]'.join(regex_suffix) mupdf_lib_regex = f'^(lib[^.]+[.]so)[.]{regex_suffix}$' log(f'{mupdf_lib_regex=}.') for leaf in os.listdir(mupdf_lib): m = re.match(mupdf_lib_regex, leaf) if m: pfrom = f'{mupdf_lib}/{m.group(1)}' # os.path.exists() can return false if softlink exists # but points to non-existent file, so we also use # `os.path.islink()`. if os.path.islink(pfrom) or os.path.exists(pfrom): log(f'Removing existing link {pfrom=}.') os.remove(pfrom) log(f'Creating symlink: {pfrom} -> {leaf}') os.symlink(leaf, pfrom) return mupdf_include, mupdf_lib # We cannot import utils earlier because it imports this .py file itself and # uses some pymupdf.* types in function typing. # from . import utils # Use utils.*() fns for some class methods. # recover_bbox_quad = utils.recover_bbox_quad recover_char_quad = utils.recover_char_quad recover_line_quad = utils.recover_line_quad recover_quad = utils.recover_quad recover_span_quad = utils.recover_span_quad from .table import find_tables Page.find_tables = find_tables class FitzDeprecation(DeprecationWarning): pass def restore_aliases(): warnings.filterwarnings( "once", category=FitzDeprecation) def showthis(msg, cat, filename, lineno, file=None, line=None): text = warnings.formatwarning(msg, cat, filename, lineno, line=line) s = text.find("FitzDeprecation") if s < 0: log(text) return text = text[s:].splitlines()[0][4:] log(text) warnings.showwarning = showthis def _alias(class_, new_name, legacy_name=None): ''' Adds an alias for a class_ or module item clled <class_>.<new>. class_: Class/module to modify; use None for the current module. new_name: String name of existing item, e.g. name of method. legacy_name: Name of legacy object to create in <class_>. If None, we generate from <item> by removing underscores and capitalising the next letter. 
''' if class_ is None: class_ = sys.modules[__name__] if not legacy_name: legacy_name = '' capitalise_next = False for c in new_name: if c == '_': capitalise_next = True elif capitalise_next: legacy_name += c.upper() capitalise_next = False else: legacy_name += c new_object = getattr( class_, new_name) assert not getattr( class_, legacy_name, None), f'class {class_} already has {legacy_name}' if callable( new_object): def deprecated_function( *args, **kwargs): warnings.warn( f'"{legacy_name=}" removed from {class_} after v1.19.0 - use "{new_name}".', category=FitzDeprecation, ) return new_object( *args, **kwargs) setattr( class_, legacy_name, deprecated_function) deprecated_function.__doc__ = ( f'*** Deprecated and removed in version after v1.19.0 - use "{new_name}". ***\n' f'{new_object.__doc__}' ) else: setattr( class_, legacy_name, new_object) _alias( Annot, 'get_file', 'fileGet') _alias( Annot, 'get_pixmap') _alias( Annot, 'get_sound', 'soundGet') _alias( Annot, 'get_text') _alias( Annot, 'get_textbox') _alias( Annot, 'get_textpage', 'getTextPage') _alias( Annot, 'line_ends') _alias( Annot, 'set_blendmode', 'setBlendMode') _alias( Annot, 'set_border') _alias( Annot, 'set_colors') _alias( Annot, 'set_flags') _alias( Annot, 'set_info') _alias( Annot, 'set_line_ends') _alias( Annot, 'set_name') _alias( Annot, 'set_oc', 'setOC') _alias( Annot, 'set_opacity') _alias( Annot, 'set_rect') _alias( Annot, 'update_file', 'fileUpd') _alias( DisplayList, 'get_pixmap') _alias( DisplayList, 'get_textpage', 'getTextPage') _alias( Document, 'chapter_count') _alias( Document, 'chapter_page_count') _alias( Document, 'convert_to_pdf', 'convertToPDF') _alias( Document, 'copy_page') _alias( Document, 'delete_page') _alias( Document, 'delete_pages', 'deletePageRange') _alias( Document, 'embfile_add', 'embeddedFileAdd') _alias( Document, 'embfile_count', 'embeddedFileCount') _alias( Document, 'embfile_del', 'embeddedFileDel') _alias( Document, 'embfile_get', 'embeddedFileGet') _alias( 
Document, 'embfile_info', 'embeddedFileInfo') _alias( Document, 'embfile_names', 'embeddedFileNames') _alias( Document, 'embfile_upd', 'embeddedFileUpd') _alias( Document, 'extract_font') _alias( Document, 'extract_image') _alias( Document, 'find_bookmark') _alias( Document, 'fullcopy_page') _alias( Document, 'get_char_widths') _alias( Document, 'get_ocgs', 'getOCGs') _alias( Document, 'get_page_fonts', 'getPageFontList') _alias( Document, 'get_page_images', 'getPageImageList') _alias( Document, 'get_page_pixmap') _alias( Document, 'get_page_text') _alias( Document, 'get_page_xobjects', 'getPageXObjectList') _alias( Document, 'get_sigflags', 'getSigFlags') _alias( Document, 'get_toc', 'getToC') _alias( Document, 'get_xml_metadata') _alias( Document, 'insert_page') _alias( Document, 'insert_pdf', 'insertPDF') _alias( Document, 'is_dirty') _alias( Document, 'is_form_pdf', 'isFormPDF') _alias( Document, 'is_pdf', 'isPDF') _alias( Document, 'is_reflowable') _alias( Document, 'is_repaired') _alias( Document, 'last_location') _alias( Document, 'load_page') _alias( Document, 'make_bookmark') _alias( Document, 'move_page') _alias( Document, 'needs_pass') _alias( Document, 'new_page') _alias( Document, 'next_location') _alias( Document, 'page_count') _alias( Document, 'page_cropbox', 'pageCropBox') _alias( Document, 'page_xref') _alias( Document, 'pdf_catalog', 'PDFCatalog') _alias( Document, 'pdf_trailer', 'PDFTrailer') _alias( Document, 'prev_location', 'previousLocation') _alias( Document, 'resolve_link') _alias( Document, 'search_page_for') _alias( Document, 'set_language') _alias( Document, 'set_metadata') _alias( Document, 'set_toc', 'setToC') _alias( Document, 'set_xml_metadata') _alias( Document, 'update_object') _alias( Document, 'update_stream') _alias( Document, 'xref_is_stream', 'isStream') _alias( Document, 'xref_length') _alias( Document, 'xref_object') _alias( Document, 'xref_stream') _alias( Document, 'xref_stream_raw') _alias( Document, 'xref_xml_metadata', 
'metadataXML') _alias( IRect, 'get_area') _alias( IRect, 'get_area', 'getRectArea') _alias( IRect, 'include_point') _alias( IRect, 'include_rect') _alias( IRect, 'is_empty') _alias( IRect, 'is_infinite') _alias( Link, 'is_external') _alias( Link, 'set_border') _alias( Link, 'set_colors') _alias( Matrix, 'is_rectilinear') _alias( Matrix, 'prerotate', 'preRotate') _alias( Matrix, 'prescale', 'preScale') _alias( Matrix, 'preshear', 'preShear') _alias( Matrix, 'pretranslate', 'preTranslate') _alias( None, 'get_pdf_now', 'getPDFnow') _alias( None, 'get_pdf_str', 'getPDFstr') _alias( None, 'get_text_length') _alias( None, 'get_text_length', 'getTextlength') _alias( None, 'image_profile', 'ImageProperties') _alias( None, 'paper_rect', 'PaperRect') _alias( None, 'paper_size', 'PaperSize') _alias( None, 'paper_sizes') _alias( None, 'planish_line') _alias( Outline, 'is_external') _alias( Outline, 'is_open') _alias( Page, 'add_caret_annot') _alias( Page, 'add_circle_annot') _alias( Page, 'add_file_annot') _alias( Page, 'add_freetext_annot') _alias( Page, 'add_highlight_annot') _alias( Page, 'add_ink_annot') _alias( Page, 'add_line_annot') _alias( Page, 'add_polygon_annot') _alias( Page, 'add_polyline_annot') _alias( Page, 'add_rect_annot') _alias( Page, 'add_redact_annot') _alias( Page, 'add_squiggly_annot') _alias( Page, 'add_stamp_annot') _alias( Page, 'add_strikeout_annot') _alias( Page, 'add_text_annot') _alias( Page, 'add_underline_annot') _alias( Page, 'add_widget') _alias( Page, 'clean_contents') _alias( Page, 'cropbox', 'CropBox') _alias( Page, 'cropbox_position', 'CropBoxPosition') _alias( Page, 'delete_annot') _alias( Page, 'delete_link') _alias( Page, 'delete_widget') _alias( Page, 'derotation_matrix') _alias( Page, 'draw_bezier') _alias( Page, 'draw_circle') _alias( Page, 'draw_curve') _alias( Page, 'draw_line') _alias( Page, 'draw_oval') _alias( Page, 'draw_polyline') _alias( Page, 'draw_quad') _alias( Page, 'draw_rect') _alias( Page, 'draw_sector') _alias( Page, 
'draw_squiggle') _alias( Page, 'draw_zigzag') _alias( Page, 'first_annot') _alias( Page, 'first_link') _alias( Page, 'first_widget') _alias( Page, 'get_contents') _alias( Page, 'get_displaylist', 'getDisplayList') _alias( Page, 'get_drawings') _alias( Page, 'get_fonts', 'getFontList') _alias( Page, 'get_image_bbox') _alias( Page, 'get_images', 'getImageList') _alias( Page, 'get_links') _alias( Page, 'get_pixmap') _alias( Page, 'get_svg_image', 'getSVGimage') _alias( Page, 'get_text') _alias( Page, 'get_text_blocks') _alias( Page, 'get_text_words') _alias( Page, 'get_textbox') _alias( Page, 'get_textpage', 'getTextPage') _alias( Page, 'insert_font') _alias( Page, 'insert_image') _alias( Page, 'insert_link') _alias( Page, 'insert_text') _alias( Page, 'insert_textbox') _alias( Page, 'is_wrapped', '_isWrapped') _alias( Page, 'load_annot') _alias( Page, 'load_links') _alias( Page, 'mediabox', 'MediaBox') _alias( Page, 'mediabox_size', 'MediaBoxSize') _alias( Page, 'new_shape') _alias( Page, 'read_contents') _alias( Page, 'rotation_matrix') _alias( Page, 'search_for') _alias( Page, 'set_cropbox', 'setCropBox') _alias( Page, 'set_mediabox', 'setMediaBox') _alias( Page, 'set_rotation') _alias( Page, 'show_pdf_page', 'showPDFpage') _alias( Page, 'transformation_matrix') _alias( Page, 'update_link') _alias( Page, 'wrap_contents') _alias( Page, 'write_text') _alias( Pixmap, 'clear_with') _alias( Pixmap, 'copy', 'copyPixmap') _alias( Pixmap, 'gamma_with') _alias( Pixmap, 'invert_irect', 'invertIRect') _alias( Pixmap, 'pil_save', 'pillowWrite') _alias( Pixmap, 'pil_tobytes', 'pillowData') _alias( Pixmap, 'save', 'writeImage') _alias( Pixmap, 'save', 'writePNG') _alias( Pixmap, 'set_alpha') _alias( Pixmap, 'set_dpi', 'setResolution') _alias( Pixmap, 'set_origin') _alias( Pixmap, 'set_pixel') _alias( Pixmap, 'set_rect') _alias( Pixmap, 'tint_with') _alias( Pixmap, 'tobytes', 'getImageData') _alias( Pixmap, 'tobytes', 'getPNGData') _alias( Pixmap, 'tobytes', 'getPNGdata') _alias( 
Quad, 'is_convex') _alias( Quad, 'is_empty') _alias( Quad, 'is_rectangular') _alias( Rect, 'get_area') _alias( Rect, 'get_area', 'getRectArea') _alias( Rect, 'include_point') _alias( Rect, 'include_rect') _alias( Rect, 'is_empty') _alias( Rect, 'is_infinite') _alias( TextWriter, 'fill_textbox') _alias( TextWriter, 'write_text') _alias( Shape, 'draw_bezier') _alias( Shape, 'draw_circle') _alias( Shape, 'draw_curve') _alias( Shape, 'draw_line') _alias( Shape, 'draw_oval') _alias( Shape, 'draw_polyline') _alias( Shape, 'draw_quad') _alias( Shape, 'draw_rect') _alias( Shape, 'draw_sector') _alias( Shape, 'draw_squiggle') _alias( Shape, 'draw_zigzag') _alias( Shape, 'insert_text') _alias( Shape, 'insert_textbox') if 0: restore_aliases() __version__ = VersionBind __doc__ = ( f'PyMuPDF {VersionBind}: Python bindings for the MuPDF {VersionFitz} library (rebased implementation).\n' f'Python {sys.version_info[0]}.{sys.version_info[1]} running on {sys.platform} ({64 if sys.maxsize > 2**32 else 32}-bit).\n' )
