Mercurial > hgrepos > Python2 > PyMuPDF
changeset 39:a6bc019ac0b2 upstream
ADD: PyMuPDF v1.26.5: the original sdist.
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Sat, 11 Oct 2025 11:19:58 +0200 |
| parents | b50eed0cc0ef |
| children | aa33339d6b8a |
| files | PKG-INFO changes.txt pipcl.py scripts/test.py setup.py src/__init__.py src/extra.i src/utils.py tests/conftest.py tests/gentle_compare.py tests/resources/test_1645_expected-after-1.27.0.pdf tests/resources/test_4613.png tests/resources/test_4699.pdf tests/resources/test_4699.png tests/resources/test_4712_a.pdf tests/resources/test_4712_b.pdf tests/resources/test_4716.pdf tests/test_4716.py tests/test_annots.py tests/test_codespell.py tests/test_flake8.py tests/test_font.py tests/test_general.py tests/test_import.py tests/test_memory.py tests/test_pixmap.py tests/test_pylint.py tests/test_release.py tests/test_tables.py tests/test_tesseract.py tests/test_textbox.py tests/test_textextract.py |
| diffstat | 32 files changed, 6066 insertions(+), 5056 deletions(-) [+] |
line wrap: on
line diff
--- a/PKG-INFO Mon Sep 15 11:43:07 2025 +0200 +++ b/PKG-INFO Sat Oct 11 11:19:58 2025 +0200 @@ -1,6 +1,6 @@ Metadata-Version: 2.1 Name: PyMuPDF -Version: 1.26.4 +Version: 1.26.5 Summary: A high performance Python library for data extraction, analysis, conversion & manipulation of PDF (and other) documents. Description-Content-Type: text/markdown Author: Artifex
--- a/changes.txt Mon Sep 15 11:43:07 2025 +0200 +++ b/changes.txt Sat Oct 11 11:19:58 2025 +0200 @@ -2,7 +2,31 @@ ========== -**Changes in version 1.26.4** +**Changes in version 1.26.5** + +* Use MuPDF-1.26.10. + +* Fixed issues: + + * **Fixed** `2883 <https://github.com/pymupdf/PyMuPDF/issues/2883>`_: Improve the Python type annotations for fitz_new + * **Fixed** `4507 <https://github.com/pymupdf/PyMuPDF/issues/4507>`_: Bugs in pyodide + * **Fixed** `4613 <https://github.com/pymupdf/PyMuPDF/issues/4613>`_: Thai and number blocks are not auto-scaled and get wrong hyphen when using in insert_htmlbox + * **Fixed** `4700 <https://github.com/pymupdf/PyMuPDF/issues/4700>`_: pymupdf.open() processes .zip file without raising + * **Fixed** `4716 <https://github.com/pymupdf/PyMuPDF/issues/4716>`_: Problems with unreadable characters + +* Other: + + * Supported Python versions are now 3.9-3.14. + * We now define all class methods explicitly instead of with dynamic assignment; this improves type hints. + * Removed `pymupdf.utils.Shape` class, was duplicate of `pymupdf.Shape`. + * Allow use of cibuildwheel to build and test on Pyodide. + * Fixed various Pyodide bugs. + * In documentation, added section about Linux wheels and glibc compatibility. + * Improved documentation of pymupdf.open()'s <filetype> arg. + * Retrospectively mark `4544 <https://github.com/pymupdf/PyMuPDF/issues/4544>`_ as fixed in 1.26.4. + + +**Changes in version 1.26.4 (2025-08-25)** * Use MuPDF-1.26.7. 
@@ -13,6 +37,7 @@ * **Fixed** `4457 <https://github.com/pymupdf/PyMuPDF/issues/4457>`_: Wrong characters displayed after font subsetting (w/ native method) * **Fixed** `4462 <https://github.com/pymupdf/PyMuPDF/issues/4462>`_: delete_pages() does not accept a single int * **Fixed** `4533 <https://github.com/pymupdf/PyMuPDF/issues/4533>`_: Open PDF error segmentation fault + * **Fixed** `4544 <https://github.com/pymupdf/PyMuPDF/issues/4544>`_: About pdf_clip_page * **Fixed** `4565 <https://github.com/pymupdf/PyMuPDF/issues/4565>`_: MacOS uses Tesseract and not Tesseract-OCR * **Fixed** `4571 <https://github.com/pymupdf/PyMuPDF/issues/4571>`_: Broken merged pdfs. * **Fixed** `4590 <https://github.com/pymupdf/PyMuPDF/issues/4590>`_: TypeError in utils.py scrub(): annot.update_file(buffer=...) is invalid
--- a/pipcl.py Mon Sep 15 11:43:07 2025 +0200 +++ b/pipcl.py Sat Oct 11 11:19:58 2025 +0200 @@ -2,23 +2,37 @@ Python packaging operations, including PEP-517 support, for use by a `setup.py` script. -The intention is to take care of as many packaging details as possible so that -setup.py contains only project-specific information, while also giving as much -flexibility as possible. - -For example we provide a function `build_extension()` that can be used to build -a SWIG extension, but we also give access to the located compiler/linker so -that a `setup.py` script can take over the details itself. - -Run doctests with: `python -m doctest pipcl.py` - -For Graal we require that PIPCL_GRAAL_PYTHON is set to non-graal Python (we -build for non-graal except with Graal Python's include paths and library -directory). +Overview: + + The intention is to take care of as many packaging details as possible so + that setup.py contains only project-specific information, while also giving + as much flexibility as possible. + + For example we provide a function `build_extension()` that can be used + to build a SWIG extension, but we also give access to the located + compiler/linker so that a `setup.py` script can take over the details + itself. + +Doctests: + Doctest strings are provided in some comments. + + Test in the usual way with: + python -m doctest pipcl.py + + Test specific functions/classes with: + python pipcl.py --doctest run_if ... + + If no functions or classes are specified, this tests everything. + +Graal: + For Graal we require that PIPCL_GRAAL_PYTHON is set to non-graal Python (we + build for non-graal except with Graal Python's include paths and library + directory). 
''' import base64 import codecs +import difflib import glob import hashlib import inspect @@ -55,6 +69,9 @@ by legacy distutils/setuptools and described in: https://pip.pypa.io/en/stable/reference/build-system/setup-py/ + The file pyproject.toml must exist; this is checked if/when fn_build() is + called. + Here is a `doctest` example of using pipcl to create a SWIG extension module. Requires `swig`. @@ -321,63 +338,86 @@ wheel_compresslevel = None, ): ''' - The initial args before `root` define the package - metadata and closely follow the definitions in: + The initial args before `entry_points` define the + package metadata and closely follow the definitions in: https://packaging.python.org/specifications/core-metadata/ Args: name: + Used for metadata `Name`. A string, the name of the Python package. version: + Used for metadata `Version`. A string, the version of the Python package. Also see PEP-440 `Version Identification and Dependency Specification`. platform: + Used for metadata `Platform`. A string or list of strings. supported_platform: + Used for metadata `Supported-Platform`. A string or list of strings. summary: + Used for metadata `Summary`. A string, short description of the package. description: + Used for metadata `Description`. A string. If contains newlines, a detailed description of the package. Otherwise the path of a file containing the detailed description of the package. description_content_type: + Used for metadata `Description-Content-Type`. A string describing markup of `description` arg. For example `text/markdown; variant=GFM`. keywords: + Used for metadata `Keywords`. A string containing comma-separated keywords. home_page: + Used for metadata `Home-page`. URL of home page. download_url: + Used for metadata `Download-URL`. Where this version can be downloaded from. author: + Used for metadata `Author`. Author. author_email: + Used for metadata `Author-email`. Author email. maintainer: + Used for metadata `Maintainer`. Maintainer. 
maintainer_email: + Used for metadata `Maintainer-email`. Maintainer email. license: + Used for metadata `License`. A string containing the license text. Written into metadata file `COPYING`. Is also written into metadata itself if not multi-line. classifier: + Used for metadata `Classifier`. A string or list of strings. Also see: * https://pypi.org/pypi?%3Aaction=list_classifiers * https://pypi.org/classifiers/ requires_dist: - A string or list of strings. None items are ignored. Also see PEP-508. + Used for metadata `Requires-Dist`. + A string or list of strings, Python packages required + at runtime. None items are ignored. requires_python: + Used for metadata `Requires-Python`. A string or list of strings. requires_external: + Used for metadata `Requires-External`. A string or list of strings. project_url: - A string or list of strings, each of the form: `{name}, {url}`. + Used for metadata `Project-URL`. + A string or list of strings, each of the form: `{name}, + {url}`. provides_extra: + Used for metadata `Provides-Extra`. A string or list of strings. entry_points: @@ -415,8 +455,11 @@ added. `to_` identifies what the file should be called within a wheel - or when installing. If `to_` ends with `/`, the leaf of `from_` - is appended to it (and `from_` must not be a `bytes`). + or when installing. If `to_` is empty or `/` we set it to the + leaf of `from_` (`from_` must not be a `bytes`) - i.e. we place + the file in the root directory of the wheel; otherwise if + `to_` ends with `/` the leaf of `from_` is appended to it (and + `from_` must not be a `bytes`). Initial `$dist-info/` in `_to` is replaced by `{name}-{version}.dist-info/`; this is useful for license files @@ -439,6 +482,11 @@ default being `sysconfig.get_path('platlib')` e.g. `myvenv/lib/python3.9/site-packages/`. + When calling this function, we assert that the file + pyproject.toml exists in the current directory. 
(We do this + here rather than in pipcl.Package's constructor, as otherwise + importing setup.py from non-package-related code could fail.) + fn_clean: A function taking a single arg `all_` that cleans generated files. `all_` is true iff `--all` is in argv. @@ -457,8 +505,7 @@ It can be convenient to use `pipcl.git_items()`. The specification for sdists requires that the list contains - `pyproject.toml`; we enforce this with a diagnostic rather than - raising an exception, to allow legacy command-line usage. + `pyproject.toml`; we enforce this with a Python assert. tag_python: First element of wheel tag defined in PEP-425. If None we use @@ -528,6 +575,12 @@ assert_str_or_multi( requires_external) assert_str_or_multi( project_url) assert_str_or_multi( provides_extra) + + assert re.match('^([A-Z0-9]|[A-Z0-9][A-Z0-9._-]*[A-Z0-9])\\Z', name, re.IGNORECASE), ( + f'Invalid package name' + f' (https://packaging.python.org/en/latest/specifications/name-normalization/)' + f': {name!r}' + ) # https://packaging.python.org/en/latest/specifications/core-metadata/. assert re.match('([A-Z0-9]|[A-Z0-9][A-Z0-9._-]*[A-Z0-9])$', name, re.IGNORECASE), \ @@ -602,7 +655,10 @@ f' metadata_directory={metadata_directory!r}' ) - if sys.implementation.name == 'graalpy': + if os.environ.get('CIBUILDWHEEL') == '1': + # Don't special-case graal builds when running under cibuildwheel. + pass + elif sys.implementation.name == 'graalpy': # We build for Graal by building a native Python wheel with Graal # Python's include paths and library directory. We then rename the # wheel to contain graal's tag etc. 
@@ -754,7 +810,7 @@ else: items = self.fn_sdist() - prefix = f'{_normalise(self.name)}-{self.version}' + prefix = f'{_normalise2(self.name)}-{self.version}' os.makedirs(sdist_directory, exist_ok=True) tarpath = f'{sdist_directory}/{prefix}.tar.gz' log2(f'Creating sdist: {tarpath}') @@ -796,12 +852,11 @@ assert 0, f'Path is inside sdist_directory={sdist_directory}: {from_!r}' assert os.path.exists(from_), f'Path does not exist: {from_!r}' assert os.path.isfile(from_), f'Path is not a file: {from_!r}' - if to_rel == 'pyproject.toml': - found_pyproject_toml = True add(from_, to_rel) - - if not found_pyproject_toml: - log0(f'Warning: no pyproject.toml specified.') + if to_rel == 'pyproject.toml': + found_pyproject_toml = True + + assert found_pyproject_toml, f'Cannot create sdist because file not specified: pyproject.toml' # Always add a PKG-INFO file. add_string(self._metainfo(), 'PKG-INFO') @@ -826,9 +881,11 @@ Get two-digit python version, e.g. 'cp3.8' for python-3.8.6. ''' if self.tag_python_: - return self.tag_python_ + ret = self.tag_python_ else: - return 'cp' + ''.join(platform.python_version().split('.')[:2]) + ret = 'cp' + ''.join(platform.python_version().split('.')[:2]) + assert '-' not in ret + return ret def tag_abi(self): ''' @@ -884,10 +941,13 @@ ret = ret2 log0( f'tag_platform(): returning {ret=}.') + assert '-' not in ret return ret def wheel_name(self): - return f'{_normalise(self.name)}-{self.version}-{self.tag_python()}-{self.tag_abi()}-{self.tag_platform()}.whl' + ret = f'{_normalise2(self.name)}-{self.version}-{self.tag_python()}-{self.tag_abi()}-{self.tag_platform()}.whl' + assert ret.count('-') == 4, f'Expected 4 dash characters in {ret=}.' 
+ return ret def wheel_name_match(self, wheel): ''' @@ -916,7 +976,7 @@ log2(f'py_limited_api; {tag_python=} compatible with {self.tag_python()=}.') py_limited_api_compatible = True - log2(f'{_normalise(self.name) == name=}') + log2(f'{_normalise2(self.name) == name=}') log2(f'{self.version == version=}') log2(f'{self.tag_python() == tag_python=} {self.tag_python()=} {tag_python=}') log2(f'{py_limited_api_compatible=}') @@ -925,7 +985,7 @@ log2(f'{self.tag_platform()=}') log2(f'{tag_platform.split(".")=}') ret = (1 - and _normalise(self.name) == name + and _normalise2(self.name) == name and self.version == version and (self.tag_python() == tag_python or py_limited_api_compatible) and self.tag_abi() == tag_abi @@ -947,6 +1007,9 @@ def _call_fn_build( self, config_settings=None): assert self.fn_build + assert os.path.isfile('pyproject.toml'), ( + 'Cannot create package because file does not exist: pyproject.toml' + ) log2(f'calling self.fn_build={self.fn_build}') if inspect.signature(self.fn_build).parameters: ret = self.fn_build(config_settings) @@ -954,6 +1017,28 @@ ret = self.fn_build() assert isinstance( ret, (list, tuple)), \ f'Expected list/tuple from {self.fn_build} but got: {ret!r}' + + # Check that any extensions that we have built, have same + # py_limited_api value. If package is marked with py_limited_api=True + # then non-py_limited_api extensions seem to fail at runtime on + # Windows. + # + # (We could possibly allow package py_limited_api=False and extensions + # py_limited_api=True, but haven't tested this, and it seems simpler to + # be strict.) 
+ for item in ret: + from_, (to_abs, to_rel) = self._fromto(item) + from_abs = os.path.abspath(from_) + is_py_limited_api = _extensions_to_py_limited_api.get(from_abs) + if is_py_limited_api is not None: + assert bool(self.py_limited_api) == bool(is_py_limited_api), ( + f'Extension was built with' + f' py_limited_api={is_py_limited_api} but pipcl.Package' + f' name={self.name!r} has' + f' py_limited_api={self.py_limited_api}:' + f' {from_abs!r}' + ) + return ret @@ -1052,7 +1137,7 @@ it writes to a slightly different directory. ''' if root is None: - root = f'{self.name}-{self.version}.dist-info' + root = f'{normalise2(self.name)}-{self.version}.dist-info' self._write_info(f'{root}/METADATA') if self.license: with open( f'{root}/COPYING', 'w') as f: @@ -1340,7 +1425,7 @@ ) def _dist_info_dir( self): - return f'{_normalise(self.name)}-{self.version}.dist-info' + return f'{_normalise2(self.name)}-{self.version}.dist-info' def _metainfo(self): ''' @@ -1446,8 +1531,12 @@ `p` is a tuple `(from_, to_)` where `from_` is str/bytes and `to_` is str. If `from_` is a bytes it is contents of file to add, otherwise the path of an existing file; non-absolute paths are assumed to be relative - to `self.root`. If `to_` is empty or ends with `/`, we append the leaf - of `from_` (which must be a str). + to `self.root`. + + If `to_` is empty or `/` we set it to the leaf of `from_` (which must + be a str) - i.e. we place the file in the root directory of the wheel; + otherwise if `to_` ends with `/` we append the leaf of `from_` (which + must be a str). If `to_` starts with `$dist-info/`, we replace this with `self._dist_info_dir()`. 
@@ -1467,14 +1556,16 @@ from_, to_ = p assert isinstance(from_, (str, bytes)) assert isinstance(to_, str) - if to_.endswith('/') or to_=='': + if to_ == '/' or to_ == '': + to_ = os.path.basename(from_) + elif to_.endswith('/'): to_ += os.path.basename(from_) prefix = '$dist-info/' if to_.startswith( prefix): to_ = f'{self._dist_info_dir()}/{to_[ len(prefix):]}' prefix = '$data/' if to_.startswith( prefix): - to_ = f'{self.name}-{self.version}.data/{to_[ len(prefix):]}' + to_ = f'{_normalise2(self.name)}-{self.version}.data/{to_[ len(prefix):]}' if isinstance(from_, str): from_, _ = self._path_relative_to_root( from_, assert_within_root=False) to_ = self._path_relative_to_root(to_) @@ -1482,11 +1573,13 @@ log2(f'returning {from_=} {to_=}') return from_, to_ +_extensions_to_py_limited_api = dict() def build_extension( name, path_i, outdir, + *, builddir=None, includes=None, defines=None, @@ -1498,6 +1591,7 @@ linker_extra='', swig=None, cpp=True, + source_extra=None, prerequisites_swig=None, prerequisites_compile=None, prerequisites_link=None, @@ -1539,7 +1633,7 @@ A string, or a sequence of library names. Each item is prefixed with `-l` on non-Windows. optimise: - Whether to use compiler optimisations. + Whether to use compiler optimisations and define NDEBUG. debug: Whether to build with debug symbols. compiler_extra: @@ -1550,6 +1644,8 @@ Swig command; if false we use 'swig'. cpp: If true we tell SWIG to generate C++ code instead of C. + source_extra: + Extra source files to build into the shared library, prerequisites_swig: prerequisites_compile: prerequisites_link: @@ -1584,10 +1680,15 @@ `compile_extra` (also `/I` on windows) and use them with swig so that it can see the same header files as C/C++. This is useful when using enviromment variables such as `CC` and `CXX` to set - `compile_extra. + `compile_extra`. py_limited_api: If true we build for current Python's limited API / stable ABI. 
+ Note that we will assert false if this extension is added to a + pipcl.Package that has a different <py_limited_api>, because + on Windows importing a non-py_limited_api extension inside a + py_limited=True package fails. + Returns the leafname of the generated library file within `outdir`, e.g. `_{name}.so` on Unix or `_{name}.cp311-win_amd64.pyd` on Windows. ''' @@ -1599,6 +1700,12 @@ builddir = outdir if not swig: swig = 'swig' + + if source_extra is None: + source_extra = list() + if isinstance(source_extra, str): + source_extra = [source_extra] + includes_text = _flags( includes, '-I') defines_text = _flags( defines, '-D') libpaths_text = _flags( libpaths, '/LIBPATH:', '"') if windows() else _flags( libpaths, '-L') @@ -1608,11 +1715,11 @@ os.makedirs( outdir, exist_ok=True) # Run SWIG. - + # if infer_swig_includes: # Extract include flags from `compiler_extra`. swig_includes_extra = '' - compiler_extra_items = compiler_extra.split() + compiler_extra_items = shlex.split(compiler_extra) i = 0 while i < len(compiler_extra_items): item = compiler_extra_items[i] @@ -1647,75 +1754,130 @@ prerequisites_swig2, ) - so_suffix = _so_suffix(use_so_versioning = not py_limited_api) + if pyodide(): + so_suffix = '.so' + log0(f'pyodide: PEP-3149 suffix untested, so omitting. 
{_so_suffix()=}.') + else: + so_suffix = _so_suffix(use_so_versioning = not py_limited_api) path_so_leaf = f'_{name}{so_suffix}' path_so = f'{outdir}/{path_so_leaf}' py_limited_api2 = current_py_limited_api() if py_limited_api else None + compiler_command, pythonflags = base_compiler(cpp=cpp) + linker_command, _ = base_linker(cpp=cpp) + # setuptools on Linux seems to use slightly different compile flags: + # + # -fwrapv -O3 -Wall -O2 -g0 -DPY_CALL_TRAMPOLINE + # + + general_flags = '' if windows(): - path_obj = f'{path_so}.obj' - permissive = '/permissive-' EHsc = '/EHsc' T = '/Tp' if cpp else '/Tc' optimise2 = '/DNDEBUG /O2' if optimise else '/D_DEBUG' - debug2 = '' - if debug: - debug2 = '/Zi' # Generate .pdb. - # debug2 = '/Z7' # Embed debug info in .obj files. - + debug2 = '/Zi' if debug else '' py_limited_api3 = f'/DPy_LIMITED_API={py_limited_api2}' if py_limited_api2 else '' - # As of 2023-08-23, it looks like VS tools create slightly - # .dll's each time, even with identical inputs. - # - # Some info about this is at: - # https://nikhilism.com/post/2020/windows-deterministic-builds/. - # E.g. an undocumented linker flag `/Brepro`. + else: + if debug: + general_flags += '/Zi' if windows() else ' -g' + if optimise: + general_flags += ' /DNDEBUG /O2' if windows() else ' -O2 -DNDEBUG' + + py_limited_api3 = f'-DPy_LIMITED_API={py_limited_api2}' if py_limited_api2 else '' + + if windows(): + pass + elif darwin(): + # MacOS's linker does not like `-z origin`. + rpath_flag = "-Wl,-rpath,@loader_path/" + # Avoid `Undefined symbols for ... "_PyArg_UnpackTuple" ...'. + general_flags += ' -undefined dynamic_lookup' + elif pyodide(): + # Setting `-Wl,-rpath,'$ORIGIN',-z,origin` gives: + # emcc: warning: ignoring unsupported linker flag: `-rpath` [-Wlinkflags] + # wasm-ld: error: unknown -z value: origin # - - command, pythonflags = base_compiler(cpp=cpp) - command = f''' - {command} - # General: - /c # Compiles without linking. 
- {EHsc} # Enable "Standard C++ exception handling". - - #/MD # Creates a multithreaded DLL using MSVCRT.lib. - {'/MDd' if debug else '/MD'} - - # Input/output files: - {T}{path_cpp} # /Tp specifies C++ source file. - /Fo{path_obj} # Output file. codespell:ignore - - # Include paths: - {includes_text} - {pythonflags.includes} # Include path for Python headers. - - # Code generation: - {optimise2} - {debug2} - {permissive} # Set standard-conformance mode. - - # Diagnostics: - #/FC # Display full path of source code files passed to cl.exe in diagnostic text. - /W3 # Sets which warning level to output. /W3 is IDE default. - /diagnostics:caret # Controls the format of diagnostic messages. - /nologo # - - {defines_text} - {compiler_extra} - - {py_limited_api3} - ''' - run_if( command, path_obj, path_cpp, prerequisites_compile) - - command, pythonflags = base_linker(cpp=cpp) + rpath_flag = "-Wl,-rpath,'$ORIGIN'" + else: + rpath_flag = "-Wl,-rpath,'$ORIGIN',-z,origin" + + # Fun fact - on Linux, if the -L and -l options are before '{path_cpp}' + # they seem to be ignored... + # + path_os = list() + + for path_source in [path_cpp] + source_extra: + path_o = f'{path_source}.obj' if windows() else f'{path_source}.o' + path_os.append(f' {path_o}') + + prerequisites_path = f'{path_o}.d' + + if windows(): + compiler_command2 = f''' + {compiler_command} + # General: + /c # Compiles without linking. + {EHsc} # Enable "Standard C++ exception handling". + + #/MD # Creates a multithreaded DLL using MSVCRT.lib. + {'/MDd' if debug else '/MD'} + + # Input/output files: + {T}{path_source} # /Tp specifies C++ source file. + /Fo{path_o} # Output file. codespell:ignore + + # Include paths: + {includes_text} + {pythonflags.includes} # Include path for Python headers. + + # Code generation: + {optimise2} + {debug2} + {permissive} # Set standard-conformance mode. + + # Diagnostics: + #/FC # Display full path of source code files passed to cl.exe in diagnostic text. 
+ /W3 # Sets which warning level to output. /W3 is IDE default. + /diagnostics:caret # Controls the format of diagnostic messages. + /nologo # + + {defines_text} + {compiler_extra} + + {py_limited_api3} + ''' + + else: + compiler_command2 = f''' + {compiler_command} + -fPIC + {general_flags.strip()} + {pythonflags.includes} + {includes_text} + {defines_text} + -MD -MF {prerequisites_path} + -c {path_source} + -o {path_o} + {compiler_extra} + {py_limited_api3} + ''' + run_if( + compiler_command2, + path_o, + path_source, + [path_source] + _get_prerequisites(prerequisites_path), + ) + + # Link + prerequisites_path = f'{path_so}.d' + if windows(): debug2 = '/DEBUG' if debug else '' base, _ = os.path.splitext(path_so_leaf) - command = f''' - {command} + command2 = f''' + {linker_command} /DLL # Builds a DLL. /EXPORT:PyInit__{name} # Exports a function. /IMPLIB:{base}.lib # Overrides the default import library name. @@ -1725,139 +1887,67 @@ {debug2} /nologo {libs_text} - {path_obj} + {' '.join(path_os)} {linker_extra} ''' - run_if( command, path_so, path_obj, prerequisites_link) - + elif pyodide(): + command2 = f''' + {linker_command} + -MD -MF {prerequisites_path} + -o {path_so} + {' '.join(path_os)} + {libpaths_text} + {libs_text} + {linker_extra} + {pythonflags.ldflags} + {rpath_flag} + ''' else: - - # Not Windows. - # - command, pythonflags = base_compiler(cpp=cpp) - - # setuptools on Linux seems to use slightly different compile flags: - # - # -fwrapv -O3 -Wall -O2 -g0 -DPY_CALL_TRAMPOLINE - # - - general_flags = '' - if debug: - general_flags += ' -g' - if optimise: - general_flags += ' -O2 -DNDEBUG' - - py_limited_api3 = f'-DPy_LIMITED_API={py_limited_api2}' if py_limited_api2 else '' - - if darwin(): - # MacOS's linker does not like `-z origin`. - rpath_flag = "-Wl,-rpath,@loader_path/" - - # Avoid `Undefined symbols for ... "_PyArg_UnpackTuple" ...'. 
- general_flags += ' -undefined dynamic_lookup' - elif pyodide(): - # Setting `-Wl,-rpath,'$ORIGIN',-z,origin` gives: - # emcc: warning: ignoring unsupported linker flag: `-rpath` [-Wlinkflags] - # wasm-ld: error: unknown -z value: origin - # - log0(f'pyodide: PEP-3149 suffix untested, so omitting. {_so_suffix()=}.') - path_so_leaf = f'_{name}.so' - path_so = f'{outdir}/{path_so_leaf}' - - rpath_flag = '' - else: - rpath_flag = "-Wl,-rpath,'$ORIGIN',-z,origin" - path_so = f'{outdir}/{path_so_leaf}' - # Fun fact - on Linux, if the -L and -l options are before '{path_cpp}' - # they seem to be ignored... - # - prerequisites = list() - - if pyodide(): - # Looks like pyodide's `cc` can't compile and link in one invocation. - prerequisites_compile_path = f'{path_cpp}.o.d' - prerequisites += _get_prerequisites( prerequisites_compile_path) - command = f''' - {command} - -fPIC - {general_flags.strip()} - {pythonflags.includes} - {includes_text} - {defines_text} - -MD -MF {prerequisites_compile_path} - -c {path_cpp} - -o {path_cpp}.o - {compiler_extra} - {py_limited_api3} - ''' - prerequisites_link_path = f'{path_cpp}.o.d' - prerequisites += _get_prerequisites( prerequisites_link_path) - ld, _ = base_linker(cpp=cpp) - command += f''' - && {ld} - {path_cpp}.o - -o {path_so} - -MD -MF {prerequisites_link_path} - {rpath_flag} - {libpaths_text} - {libs_text} - {linker_extra} - {pythonflags.ldflags} - ''' - else: - # We use compiler to compile and link in one command. 
- prerequisites_path = f'{path_so}.d' - prerequisites = _get_prerequisites(prerequisites_path) - - command = f''' - {command} - -fPIC - -shared - {general_flags.strip()} - {pythonflags.includes} - {includes_text} - {defines_text} - {path_cpp} - -MD -MF {prerequisites_path} - -o {path_so} - {compiler_extra} - {libpaths_text} - {linker_extra} - {pythonflags.ldflags} - {libs_text} - {rpath_flag} - {py_limited_api3} - ''' - command_was_run = run_if( - command, - path_so, - path_cpp, - prerequisites_compile, - prerequisites_link, - prerequisites, - ) - - if command_was_run and darwin(): - # We need to patch up references to shared libraries in `libs`. - sublibraries = list() - for lib in () if libs is None else libs: - for libpath in libpaths: - found = list() - for suffix in '.so', '.dylib': - path = f'{libpath}/lib{os.path.basename(lib)}{suffix}' - if os.path.exists( path): - found.append( path) - if found: - assert len(found) == 1, f'More than one file matches lib={lib!r}: {found}' - sublibraries.append( found[0]) - break - else: - log2(f'Warning: can not find path of lib={lib!r} in libpaths={libpaths}') - macos_patch( path_so, *sublibraries) + command2 = f''' + {linker_command} + -shared + {general_flags.strip()} + -MD -MF {prerequisites_path} + -o {path_so} + {' '.join(path_os)} + {libpaths_text} + {libs_text} + {linker_extra} + {pythonflags.ldflags} + {rpath_flag} + {py_limited_api3} + ''' + link_was_run = run_if( + command2, + path_so, + path_cpp, + *path_os, + *_get_prerequisites(f'{path_so}.d'), + ) + + if link_was_run and darwin(): + # We need to patch up references to shared libraries in `libs`. 
+ sublibraries = list() + for lib in () if libs is None else libs: + for libpath in libpaths: + found = list() + for suffix in '.so', '.dylib': + path = f'{libpath}/lib{os.path.basename(lib)}{suffix}' + if os.path.exists( path): + found.append( path) + if found: + assert len(found) == 1, f'More than one file matches lib={lib!r}: {found}' + sublibraries.append( found[0]) + break + else: + log2(f'Warning: can not find path of lib={lib!r} in libpaths={libpaths}') + macos_patch( path_so, *sublibraries) #run(f'ls -l {path_so}', check=0) #run(f'file {path_so}', check=0) + _extensions_to_py_limited_api[os.path.abspath(path_so)] = py_limited_api + return path_so_leaf @@ -1983,7 +2073,7 @@ ) if not e: branch = out.strip() - log(f'git_info(): directory={directory!r} returning branch={branch!r} sha={sha!r} comment={comment!r}') + log1(f'git_info(): directory={directory!r} returning branch={branch!r} sha={sha!r} comment={comment!r}') return sha, comment, diff, branch @@ -2027,88 +2117,96 @@ def git_get( - remote, local, *, + remote=None, branch=None, + tag=None, + text=None, depth=1, env_extra=None, - tag=None, update=True, submodules=True, - default_remote=None, ): ''' - Ensures that <local> is a git checkout (at either <tag>, or <branch> HEAD) - of a remote repository. - - Exactly one of <branch> and <tag> must be specified, or <remote> must start - with 'git:' and match the syntax described below. + Creates/updates local checkout <local> of remote repository and returns + absolute path of <local>. + + If <text> is set but does not start with 'git:', it is assumed to be an up + to date local checkout, and we return absolute path of <text> without doing + any git operations. Args: + local: + Local directory. Created and/or updated using `git clone` and `git + fetch` etc. remote: Remote git repostitory, for example - 'https://github.com/ArtifexSoftware/mupdf.git'. + 'https://github.com/ArtifexSoftware/mupdf.git'. Can be overridden + by <text>. 
+ branch: + Branch to use; can be overridden by <text>. + tag: + Tag to use; can be overridden by <text>. + text: + If None or empty: + Ignored. - If starts with 'git:', the remaining text should be a command-line - style string containing some or all of these args: - --branch <branch> - --tag <tag> - <remote> - These overrides <branch>, <tag> and <default_remote>. + If starts with 'git:': + The remaining text should be a command-line + style string containing some or all of these args: + --branch <branch> + --tag <tag> + <remote> + These overrides <branch>, <tag> and <remote>. + Otherwise: + <text> is assumed to be a local directory, and we simply return + it as an absolute path without doing any git operations. For example these all clone/update/branch master of https://foo.bar/qwerty.git to local checkout 'foo-local': - git_get('https://foo.bar/qwerty.git', 'foo-local', branch='master') - git_get('git:--branch master https://foo.bar/qwerty.git', 'foo-local') - git_get('git:--branch master', 'foo-local', default_remote='https://foo.bar/qwerty.git') - git_get('git:', 'foo-local', branch='master', default_remote='https://foo.bar/qwerty.git') - - local: - Local directory. If <local>/.git exists, we attempt to run `git - update` in it. - branch: - Branch to use. Is used as default if remote starts with 'git:'. + git_get('foo-local', remote='https://foo.bar/qwerty.git', branch='master') + git_get('foo-local', text='git:--branch master https://foo.bar/qwerty.git') + git_get('foo-local', text='git:--branch master', remote='https://foo.bar/qwerty.git') + git_get('foo-local', text='git:', branch='master', remote='https://foo.bar/qwerty.git') depth: Depth of local checkout when cloning and fetching, or None. env_extra: Dict of extra name=value environment variables to use whenever we run git. - tag: - Tag to use. Is used as default if remote starts with 'git:'. update: If false we do not update existing repository. Might be useful if testing without network access. 
submodules: If true, we clone with `--recursive --shallow-submodules` and run `git submodule update --init --recursive` before returning. - default_remote: - The remote URL if <remote> starts with 'git:' but does not specify - the remote URL. ''' log0(f'{remote=} {local=} {branch=} {tag=}') - if remote.startswith('git:'): - remote0 = remote - args = iter(shlex.split(remote0[len('git:'):])) - remote = default_remote - while 1: - try: - arg = next(args) - except StopIteration: - break - if arg == '--branch': - branch = next(args) - tag = None - elif arg == '--tag': - tag == next(args) - branch = None - else: - remote = arg - assert remote, f'{default_remote=} and no remote specified in remote={remote0!r}.' - assert branch or tag, f'{branch=} {tag=} and no branch/tag specified in remote={remote0!r}.' + + if text: + if text.startswith('git:'): + args = iter(shlex.split(text[len('git:'):])) + while 1: + try: + arg = next(args) + except StopIteration: + break + if arg == '--branch': + branch = next(args) + tag = None + elif arg == '--tag': + tag = next(args) + branch = None + else: + remote = arg + assert remote, f'<remote> unset and no remote specified in {text=}.' + assert branch or tag, f'<branch> and <tag> unset and no branch/tag specified in {text=}.' + else: + log0(f'Using local directory {text!r}.') + return os.path.abspath(text) - assert (branch and not tag) or (not branch and tag), f'Must specify exactly one of <branch> and <tag>.' + assert (branch and not tag) or (not branch and tag), f'Must specify exactly one of <branch> and <tag>; {branch=} {tag=}.' depth_arg = f' --depth {depth}' if depth else '' @@ -2116,7 +2214,7 @@ # This seems to pull in the entire repository. log0(f'do_update(): attempting to update {local=}.') # Remove any local changes. - run(f'cd {local} && git checkout .', env_extra=env_extra) + run(f'cd {local} && git reset --hard', env_extra=env_extra) if tag: # `-u` avoids `fatal: Refusing to fetch into current branch`. 
# Using '+' and `revs/tags/` prefix seems to avoid errors like: @@ -2164,6 +2262,7 @@ # Show sha of checkout. run( f'cd {local} && git show --pretty=oneline|head -n 1', check=False) + return os.path.abspath(local) def run( @@ -2452,10 +2551,11 @@ log2(f'### Have removed `-lcrypt` from ldflags: {self.ldflags!r} -> {ldflags2!r}') self.ldflags = ldflags2 - log1(f'{self.includes=}') - log1(f' {includes_=}') - log1(f'{self.ldflags=}') - log1(f' {ldflags_=}') + if 0: + log1(f'{self.includes=}') + log1(f' {includes_=}') + log1(f'{self.ldflags=}') + log1(f' {ldflags_=}') def macos_add_cross_flags(command): @@ -2555,7 +2655,7 @@ return f'x{32 if sys.maxsize == 2**31 - 1 else 64}' -def run_if( command, out, *prerequisites): +def run_if( command, out, *prerequisites, caller=1): ''' Runs a command only if the output file is not up to date. @@ -2585,21 +2685,26 @@ ... os.remove( out) >>> if os.path.exists( f'{out}.cmd'): ... os.remove( f'{out}.cmd') - >>> run_if( f'touch {out}', out) + >>> run_if( f'touch {out}', out, caller=0) pipcl.py:run_if(): Running command because: File does not exist: 'run_if_test_out' pipcl.py:run_if(): Running: touch run_if_test_out True If we repeat, the output file will be up to date so the command is not run: - >>> run_if( f'touch {out}', out) + >>> run_if( f'touch {out}', out, caller=0) pipcl.py:run_if(): Not running command because up to date: 'run_if_test_out' If we change the command, the command is run: - >>> run_if( f'touch {out}', out) - pipcl.py:run_if(): Running command because: Command has changed - pipcl.py:run_if(): Running: touch run_if_test_out + >>> run_if( f'touch {out};', out, caller=0) + pipcl.py:run_if(): Running command because: Command has changed: + pipcl.py:run_if(): @@ -1,2 +1,2 @@ + pipcl.py:run_if(): touch + pipcl.py:run_if(): -run_if_test_out + pipcl.py:run_if(): +run_if_test_out; + pipcl.py:run_if(): + pipcl.py:run_if(): Running: touch run_if_test_out; True If we add a prerequisite that is newer than the output, the 
command is run: @@ -2608,15 +2713,20 @@ >>> prerequisite = 'run_if_test_prerequisite' >>> run( f'touch {prerequisite}', caller=0) pipcl.py:run(): Running: touch run_if_test_prerequisite - >>> run_if( f'touch {out}', out, prerequisite) - pipcl.py:run_if(): Running command because: Prerequisite is new: 'run_if_test_prerequisite' + >>> run_if( f'touch {out}', out, prerequisite, caller=0) + pipcl.py:run_if(): Running command because: Command has changed: + pipcl.py:run_if(): @@ -1,2 +1,2 @@ + pipcl.py:run_if(): touch + pipcl.py:run_if(): -run_if_test_out; + pipcl.py:run_if(): +run_if_test_out + pipcl.py:run_if(): pipcl.py:run_if(): Running: touch run_if_test_out True If we repeat, the output will be newer than the prerequisite, so the command is not run: - >>> run_if( f'touch {out}', out, prerequisite) + >>> run_if( f'touch {out}', out, prerequisite, caller=0) pipcl.py:run_if(): Not running command because up to date: 'run_if_test_out' ''' doit = False @@ -2633,13 +2743,34 @@ cmd = f.read() else: cmd = None - if command != cmd: + cmd_args = shlex.split(cmd or '') + command_args = shlex.split(command or '') + if command_args != cmd_args: if cmd is None: doit = 'No previous command stored' else: doit = f'Command has changed' if 0: - doit += f': {cmd!r} => {command!r}' + doit += f':\n {cmd!r}\n {command!r}' + if 0: + doit += f'\nbefore:\n' + doit += textwrap.indent(cmd, ' ') + doit += f'\nafter:\n' + doit += textwrap.indent(command, ' ') + if 1: + # Show diff based on commands split into pseudo lines by + # shlex.split(). + doit += ':\n' + lines = difflib.unified_diff( + cmd.split(), + command.split(), + lineterm='', + ) + # Skip initial lines. + assert next(lines) == '--- ' + assert next(lines) == '+++ ' + for line in lines: + doit += f' {line}\n' if not doit: # See whether any prerequisites are newer than target. 
@@ -2652,9 +2783,9 @@ for p in prerequisites: prerequisites_all += _make_prerequisites( p) if 0: - log2( 'prerequisites_all:') + log2( 'prerequisites_all:', caller=caller+1) for i in prerequisites_all: - log2( f' {i!r}') + log2( f' {i!r}', caller=caller+1) pre_mtime = 0 pre_path = None for prerequisite in prerequisites_all: @@ -2670,7 +2801,7 @@ break if not doit: if pre_mtime > out_mtime: - doit = f'Prerequisite is new: {pre_path!r}' + doit = f'Prerequisite is new: {os.path.abspath(pre_path)!r}' if doit: # Remove `cmd_path` before we run the command, so any failure @@ -2680,16 +2811,16 @@ os.remove( cmd_path) except Exception: pass - log1( f'Running command because: {doit}') - - run( command) + log1( f'Running command because: {doit}', caller=caller+1) + + run( command, caller=caller+1) # Write the command we ran, into `cmd_path`. with open( cmd_path, 'w') as f: f.write( command) return True else: - log1( f'Not running command because up to date: {out!r}') + log1( f'Not running command because up to date: {out!r}', caller=caller+1) if 0: log2( f'out_mtime={time.ctime(out_mtime)} pre_mtime={time.ctime(pre_mtime)}.' 
@@ -2761,6 +2892,11 @@ return re.sub(r"[-_.]+", "-", name).lower() +def _normalise2(name): + # https://packaging.python.org/en/latest/specifications/binary-distribution-format/ + return _normalise(name).replace('-', '_') + + def _assert_version_pep_440(version): assert re.match( r'^([1-9][0-9]*!)?(0|[1-9][0-9]*)(\.(0|[1-9][0-9]*))*((a|b|rc)(0|[1-9][0-9]*))?(\.post(0|[1-9][0-9]*))?(\.dev(0|[1-9][0-9]*))?$', @@ -2790,6 +2926,9 @@ global g_log_line_numbers g_log_line_numbers = bool(yes) +def log(text='', caller=1): + _log(text, 0, caller+1) + def log0(text='', caller=1): _log(text, 0, caller+1) @@ -2813,19 +2952,30 @@ print(f'{filename}:{fr.function}(): {line}', file=sys.stdout, flush=1) -def relpath(path, start=None): +def relpath(path, start=None, allow_up=True): ''' A safe alternative to os.path.relpath(), avoiding an exception on Windows if the drive needs to change - in this case we use os.path.abspath(). + + Args: + path: + Path to be processed. + start: + Start directory or current directory if None. + allow_up: + If false we return absolute path if <path> is not within <start>. ''' if windows(): try: - return os.path.relpath(path, start) + ret = os.path.relpath(path, start) except ValueError: # os.path.relpath() fails if trying to change drives. 
- return os.path.abspath(path) + ret = os.path.abspath(path) else: - return os.path.relpath(path, start) + ret = os.path.relpath(path, start) + if not allow_up and ret.startswith('../') or ret.startswith('..\\'): + ret = os.path.abspath(path) + return ret def _so_suffix(use_so_versioning=True): @@ -2981,21 +3131,22 @@ for path, id_ in items.items(): id0 = self.items0.get(path) if id0 != id_: - #mtime0, hash0 = id0 - #mtime1, hash1 = id_ - #log0(f'New/modified file {path=}.') - #log0(f' {mtime0=} {"==" if mtime0==mtime1 else "!="} {mtime1=}.') - #log0(f' {hash0=} {"==" if hash0==hash1 else "!="} {hash1=}.') ret.append(path) return ret + def get_n(self, n): + ''' + Returns new files matching <glob_pattern>, asserting that there are + exactly <n>. + ''' + ret = self.get() + assert len(ret) == n, f'{len(ret)=}: {ret}' + return ret def get_one(self): ''' Returns new match of <glob_pattern>, asserting that there is exactly one. ''' - ret = self.get() - assert len(ret) == 1, f'{len(ret)=}' - return ret[0] + return self.get_n(1)[0] def _file_id(self, path): mtime = os.stat(path).st_mtime with open(path, 'rb') as f: @@ -3025,7 +3176,7 @@ Args: swig: - If starts with 'git:', passed as <remote> arg to git_remote(). + If starts with 'git:', passed as <text> arg to git_get(). quick: If true, we do not update/build local checkout if the binary is already present. @@ -3033,9 +3184,8 @@ path to use for checkout. ''' if swig and swig.startswith('git:'): - assert platform.system() != 'Windows' - swig_local = os.path.abspath(swig_local) - # Note that {swig_local}/install/bin/swig doesn't work on MacoS because + assert platform.system() != 'Windows', f'Cannot build swig on Windows.' + # Note that {swig_local}/install/bin/swig doesn't work on MacOS because # {swig_local}/INSTALL is a file and the fs is case-insensitive. swig_binary = f'{swig_local}/install-dir/bin/swig' if quick and os.path.isfile(swig_binary): @@ -3043,10 +3193,10 @@ else: # Clone swig. 
swig_env_extra = None - git_get( - swig, + swig_local = git_get( swig_local, - default_remote='https://github.com/swig/swig.git', + text=swig, + remote='https://github.com/swig/swig.git', branch='master', ) if darwin(): @@ -3061,10 +3211,10 @@ # > If you need to have bison first in your PATH, run: # > echo 'export PATH="/opt/homebrew/opt/bison/bin:$PATH"' >> ~/.zshrc # - run(f'brew install bison') - PATH = os.environ['PATH'] - PATH = f'/opt/homebrew/opt/bison/bin:{PATH}' - swig_env_extra = dict(PATH=PATH) + swig_env_extra = dict() + macos_add_brew_path('bison', swig_env_extra) + run(f'which bison') + run(f'which bison', env_extra=swig_env_extra) # Build swig. run(f'cd {swig_local} && ./autogen.sh', env_extra=swig_env_extra) run(f'cd {swig_local} && ./configure --prefix={swig_local}/install-dir', env_extra=swig_env_extra) @@ -3076,6 +3226,38 @@ return swig +def macos_add_brew_path(package, env=None, gnubin=True): + ''' + Adds path(s) for Brew <package>'s binaries to env['PATH']. + + Args: + package: + Name of package. We get <package_root> of installed package by + running `brew --prefix <package>`. + env: + The environment dict to modify. If None we use os.environ. If PATH + is not in <env>, we first copy os.environ['PATH'] into <env>. + gnubin: + If true, we also add path to gnu binaries if it exists, + <package_root>/libexec/gnubin. + ''' + if not darwin(): + return + if env is None: + env = os.environ + if 'PATH' not in env: + env['PATH'] = os.environ['PATH'] + package_root = run(f'brew --prefix {package}', capture=1).strip() + def add(path): + if os.path.isdir(path): + log1(f'Adding to $PATH: {path}') + PATH = env['PATH'] + env['PATH'] = f'{path}:{PATH}' + add(f'{package_root}/bin') + if gnubin: + add(f'{package_root}/libexec/gnubin') + + def _show_dict(d): ret = '' for n in sorted(d.keys()): @@ -3119,12 +3301,76 @@ return includes_, ldflags_ +def venv_in(path=None): + ''' + If path is None, returns true if we are in a venv. 
Otherwise returns true + only if we are in venv <path>. + ''' + if path: + return os.path.abspath(sys.prefix) == os.path.abspath(path) + else: + return sys.prefix != sys.base_prefix + + +def venv_run(args, path, recreate=True, clean=False): + ''' + Runs Python command inside venv and returns termination code. + + Args: + args: + List of args or string command. + path: + Path of venv directory. + recreate: + If false we do not run `<sys.executable> -m venv <path>` if <path> + already exists. This avoids a delay in the common case where <path> + is already set up, but fails if <path> exists but does not contain + a valid venv. + clean: + If true we first delete <path>. + ''' + if clean: + log(f'Removing any existing venv {path}.') + assert path.startswith('venv-') + shutil.rmtree(path, ignore_errors=1) + if recreate or not os.path.isdir(path): + run(f'{sys.executable} -m venv {path}') + + if isinstance(args, str): + args_string = args + elif platform.system() == 'Windows': + # shlex not reliable on Windows so we use crude quoting with "...". + args_string = '' + for i, arg in enumerate(args): + assert '"' not in arg + if i: + args_string += ' ' + args_string += f'"{arg}"' + else: + args_string = shlex.join(args) + + if platform.system() == 'Windows': + command = f'{path}\\Scripts\\activate && python {args_string}' + else: + command = f'. {path}/bin/activate && python {args_string}' + e = run(command, check=0) + return e + + if __name__ == '__main__': # Internal-only limited command line support, used if # graal_legacy_python_config is true. 
# includes, ldflags = sysconfig_python_flags() - if sys.argv[1:] == ['--graal-legacy-python-config', '--includes']: + if sys.argv[1] == '--doctest': + import doctest + if sys.argv[2:]: + for f in sys.argv[2:]: + ff = globals()[f] + doctest.run_docstring_examples(ff, globals()) + else: + doctest.testmod(None) + elif sys.argv[1:] == ['--graal-legacy-python-config', '--includes']: print(includes) elif sys.argv[1:] == ['--graal-legacy-python-config', '--ldflags']: print(ldflags)
--- a/scripts/test.py Mon Sep 15 11:43:07 2025 +0200 +++ b/scripts/test.py Sat Oct 11 11:19:58 2025 +0200 @@ -4,7 +4,7 @@ Examples: - ./PyMuPDF/scripts/test.py --m mupdf build test + ./PyMuPDF/scripts/test.py -m mupdf build test Build and test with pre-existing local mupdf/ checkout. ./PyMuPDF/scripts/test.py build test @@ -13,10 +13,13 @@ ./PyMuPDF/scripts/test.py -m 'git:https://git.ghostscript.com/mupdf.git' build test Build and test with internal checkout of MuPDF master. - ./PyMuPDF/scripts/test.py -m 'git:--branch 1.26.x https://github.com/ArtifexSoftware/mupdf.git' build test + ./PyMuPDF/scripts/test.py -m ':1.26.x' build test Build and test using internal checkout of mupdf 1.26.x branch from Github. + ./PyMuPDF/scripts/test.py install test -i 1.26.3 -k test_2596 + Install pymupdf-1.26.3 from pypi.org and test only test_2596. + Usage: * Command line arguments are called parameters if they start with `-`, @@ -31,7 +34,7 @@ Other: * If we are not already running inside a Python venv, we automatically create a - venv and re-run ourselves inside it. + venv and re-run ourselves inside it (also see the -v option). * Build/wheel/install commands always install into the venv. * Tests use whatever PyMuPDF/MuPDF is currently installed in the venv. * We run tests with pytest. @@ -55,6 +58,7 @@ `setup.py`.] --build-flavour <build_flavour> + [Obsolete.] Combination of 'p', 'b', 'd'. See ../setup.py's description of PYMUPDF_SETUP_FLAVOUR. Default is 'pbd', i.e. self-contained PyMuPDF wheels including MuPDF build-time files. @@ -71,10 +75,11 @@ --cibw-name <cibw_name> Name to use when installing cibuildwheel, e.g.: --cibw-name cibuildwheel==3.0.0b1 + --cibw-name git+https://github.com/pypa/cibuildwheel Default is `cibuildwheel`, i.e. the current release. --cibw-pyodide 0|1 - Experimental, make `cibuild` command build a pyodide wheel. + Experimental, make `cibw` command build a pyodide wheel. 
2025-05-27: this fails when building mupdf C API - `ld -r -b binary ...` fails with: emcc: error: binary: No such file or directory ("binary" was expected to be an input file, based on the commandline arguments provided) @@ -90,6 +95,56 @@ --cibw-release-2 Set up so that `cibw` builds only linux-aarch64 wheel. + --cibw-skip-add-defaults 0|1 + If 1 (the default) we add defaults to CIBW_SKIP such as `pp*` (to + exclude pypy) and `cp3??t-*` (to exclude free-threading). + + --cibw-test-project 0|1 + If 1, command `cibw` will use a minimal test project instead of the + PyMuPDF directory itself. + + The test project uses setjmp/longjmp and C++ throw/catch. + + The test checks for current behaviour, so with `--cibw-pyodide 1` it + succeeds if the cibw command fails with the expected error message. + + 2025-08-22: + Builds ok on Linux. + + Fails at runtime with --cibw-pyodide 1: + + With compile/link flags ``: + (+45.0s): remote.py:233:main: jules-devuan: Pyodide has suffered a fatal error. Please report this to the Pyodide maintainers. 
+ (+45.1s): remote.py:233:main: jules-devuan: Stack (most recent call first): + (+45.1s): remote.py:233:main: jules-devuan: File "/tmp/cibw-run-h_pfo0wf/cp312-pyodide_wasm32/venv-test/lib/python3.12/site-packages/foo/__init__.py", line 63 in bar + (+45.1s): remote.py:233:main: jules-devuan: File "<string>The cause of the fatal error was: + (+45.1s): remote.py:233:main: jules-devuan: CppException std::runtime_error: deliberate exception + (+45.1s): remote.py:233:main: jules-devuan: at convertCppException (/home/jules/.cache/cibuildwheel/pyodide-build-0.30.7/0.27.7/xbuildenv/pyodide-root/dist/pyodide.asm.js:10:48959) + (+45.1s): remote.py:233:main: jules-devuan: at API.fatal_error (/home/jules/.cache/cibuildwheel/pyodide-build-0.30.7/0.27.7/xbuildenv/pyodide-root/dist/pyodide.asm.js:10:49253) + (+45.1s): remote.py:233:main: jules-devuan: at main (file:///home/jules/.cache/cibuildwheel/pyodide-build-0.30.7/0.27.7/xbuildenv/pyodide-root/dist/python_cli_entry.mjs:149:13) { + (+45.1s): remote.py:233:main: jules-devuan: ty: 'std::runtime_error', + (+45.1s): remote.py:233:main: jules-devuan: pyodide_fatal_error: true + (+45.1s): remote.py:233:main: jules-devuan: } + (+45.1s): remote.py:233:main: jules-devuan: ", line 1 in <module> + (+45.1s): remote.py:233:main: jules-devuan: CppException std::runtime_error: deliberate exception + (+45.1s): remote.py:233:main: jules-devuan: at convertCppException (/home/jules/.cache/cibuildwheel/pyodide-build-0.30.7/0.27.7/xbuildenv/pyodide-root/dist/pyodide.asm.js:10:48959) + (+45.1s): remote.py:233:main: jules-devuan: at API.fatal_error (/home/jules/.cache/cibuildwheel/pyodide-build-0.30.7/0.27.7/xbuildenv/pyodide-root/dist/pyodide.asm.js:10:49253) + (+45.1s): remote.py:233:main: jules-devuan: at main (file:///home/jules/.cache/cibuildwheel/pyodide-build-0.30.7/0.27.7/xbuildenv/pyodide-root/dist/python_cli_entry.mjs:149:13) { + (+45.1s): remote.py:233:main: jules-devuan: ty: 'std::runtime_error', + (+45.1s): remote.py:233:main: 
jules-devuan: pyodide_fatal_error: true + (+45.1s): remote.py:233:main: jules-devuan: } + + With compile/link flags `-fwasm-exceptions`: + [LinkError: WebAssembly.instantiate(): Import #60 module="env" function="__c_longjmp": tag import requires a WebAssembly.Tag] + + With compile/link flags `-fwasm-exceptions -sSUPPORT_LONGJMP=wasm`: + [LinkError: WebAssembly.instantiate(): Import #60 module="env" function="__c_longjmp": tag import requires a WebAssembly.Tag] + + --cibw-test-project-setjmp 0|1 + If 1, --cibw-test-project builds a project that uses + setjmp/longjmp. Default is 0 (Windows builds fail when attempting to + compile the output from swig). + -d Equivalent to `-b debug`. @@ -104,9 +159,6 @@ -f 0|1 If 1 we also test alias `fitz` as well as `pymupdf`. Default is '0'. - --gdb 0|1 - Run tests under gdb. Requires user interaction. - --graal Use graal - run inside a Graal VM instead of a Python venv. @@ -130,15 +182,21 @@ Default is 'r'. Also see `PyMuPDF:tests/run_compound.py`. -i <install_version> - Set version installed by the 'install' command. + Controls behaviour of `install` command: + + * If <install_version> ends with `.whl` we use `pip install + <install_version>`. + * If <install_version> starts with == or >= or >, we use `pip install + pymupdf<install_version>`. + * Otherwise we use `pip install pymupdf==<install_version>`. -k <expression> Specify which test(s) to run; passed straight through to pytest's `-k`. For example `-k test_3354`. -m <location> | --mupdf <location> - Location of local mupdf/ directory or 'git:...' to be used - when building PyMuPDF. + Location of mupdf as local directory or remote git, to be used when + building PyMuPDF. This sets environment variable PYMUPDF_SETUP_MUPDF_BUILD, which is used by PyMuPDF/setup.py. If not specified PyMuPDF will download its default @@ -176,7 +234,7 @@ -P 0|1 If 1, automatically install required system packages such as - Valgrind. Default is 0. + Valgrind. 
Default is 1 if running as Github action, otherwise 0. --pybind 0|1 Experimental, for investigating @@ -197,9 +255,9 @@ --show-args: Show sys.argv and exit. For debugging. - --sync-paths + --sync-paths <path> Do not run anything, instead write required files/directories/checkouts - to stdout, one per line. This is to help with automated running on + to <path>, one per line. This is to help with automated running on remote machines. --system-site-packages 0|1 @@ -241,7 +299,7 @@ Use specified prefix when running pytest, must be one of: gdb helgrind - vagrind + valgrind -v <venv> venv is: @@ -332,6 +390,19 @@ run = pipcl.run +# We build and test Python 3.x for x in this range. +python_versions_minor = range(9, 14+1) + +def cibw_cp(*version_minors): + ''' + Returns <version_tuples> in 'cp39*' format, e.g. suitable for CIBW_BUILD. + ''' + ret = list() + for version_minor in version_minors: + ret.append(f'cp3{version_minor}*') + return ' '.join(ret) + + def main(argv): if github_workflow_unimportant(): @@ -341,6 +412,9 @@ cibw_name = None cibw_pyodide = None cibw_pyodide_version = None + cibw_skip_add_defaults = True + cibw_test_project = None + cibw_test_project_setjmp = False commands = list() env_extra = dict() graal = False @@ -348,7 +422,7 @@ install_version = None mupdf_sync = None os_names = list() - system_packages = False + system_packages = True if os.environ.get('GITHUB_ACTIONS') == 'true' else False pybind = False pyodide_build_version = None pytest_options = '' @@ -408,13 +482,16 @@ env_extra['CIBW_ARCHS_LINUX'] = 'auto64' env_extra['CIBW_ARCHS_MACOS'] = 'auto64' env_extra['CIBW_ARCHS_WINDOWS'] = 'auto' # win32 and win64. - env_extra['CIBW_SKIP'] = 'pp* *i686 cp36* cp37* *musllinux*aarch64*' + env_extra['CIBW_SKIP'] = '*i686 *musllinux*aarch64* cp3??t-*' + cibw_skip_add_defaults = 0 elif arg == '--cibw-release-2': - env_extra['CIBW_ARCHS_LINUX'] = 'aarch64' # Testing only first and last python versions because otherwise # Github times out after 6h. 
- env_extra['CIBW_BUILD'] = 'cp39* cp313*' + env_extra['CIBW_BUILD'] = cibw_cp(python_versions_minor[0], python_versions_minor[-1]) + env_extra['CIBW_ARCHS_LINUX'] = 'aarch64' + env_extra['CIBW_SKIP'] = '*i686 *musllinux*aarch64* cp3??t-*' + cibw_skip_add_defaults = 0 os_names = ['linux'] elif arg == '--cibw-archs-linux': @@ -424,7 +501,16 @@ cibw_name = next(args) elif arg == '--cibw-pyodide': - cibw_pyodide = next(args) + cibw_pyodide = int(next(args)) + + elif arg == '--cibw-skip-add-defaults': + cibw_skip_add_defaults = int(next(args)) + + elif arg == '--cibw-test-project': + cibw_test_project = int(next(args)) + + elif arg == '--cibw-test-project-setjmp': + cibw_test_project_setjmp = int(next(args)) elif arg == '-d': env_extra['PYMUPDF_SETUP_MUPDF_BUILD_TYPE'] = 'debug' @@ -463,13 +549,13 @@ _mupdf = None elif _mupdf.startswith(':'): _branch = _mupdf[1:] - _mupdf = 'git:--branch {_branch} https://github.com/ArtifexSoftware/mupdf.git' - os.environ['PYMUPDF_SETUP_MUPDF_BUILD'] = _mupdf + _mupdf = f'git:--branch {_branch} https://github.com/ArtifexSoftware/mupdf.git' + env_extra['PYMUPDF_SETUP_MUPDF_BUILD'] = _mupdf elif _mupdf.startswith('git:') or '://' in _mupdf: - os.environ['PYMUPDF_SETUP_MUPDF_BUILD'] = _mupdf + env_extra['PYMUPDF_SETUP_MUPDF_BUILD'] = _mupdf else: assert os.path.isdir(_mupdf), f'Not a directory: {_mupdf=}' - os.environ['PYMUPDF_SETUP_MUPDF_BUILD'] = os.path.abspath(_mupdf) + env_extra['PYMUPDF_SETUP_MUPDF_BUILD'] = os.path.abspath(_mupdf) mupdf_sync = _mupdf elif arg == '--mupdf-clean': @@ -501,7 +587,7 @@ elif arg == '--show-args': show_args = 1 elif arg == '--sync-paths': - sync_paths = True + sync_paths = next(args) elif arg == '--system-site-packages': system_site_packages = int(next(args)) @@ -539,10 +625,11 @@ # Handle special args --sync-paths, -h, -v, -o first. # if sync_paths: - # Just print required files, directories and checkouts. 
- print(pymupdf_dir) - if mupdf_sync: - print(mupdf_sync) + # Print required files, directories and checkouts. + with open(sync_paths, 'w') as f: + print(pymupdf_dir, file=f) + if mupdf_sync: + print(mupdf_sync, file=f) return if show_help: @@ -578,7 +665,7 @@ if venv == 1 and os.path.exists(pyenv_dir) and os.path.exists(venv_name): log(f'{venv=} and {venv_name=} already exists so not building pyenv or creating venv.') else: - pipcl.git_get('https://github.com/pyenv/pyenv.git', pyenv_dir, branch='master') + pipcl.git_get(pyenv_dir, remote='https://github.com/pyenv/pyenv.git', branch='master') run(f'cd {pyenv_dir} && src/configure && make -C src') run(f'which pyenv') run(f'pyenv install -v -s {graalpy}') @@ -622,27 +709,33 @@ elif command == 'cibw': # Build wheel(s) with cibuildwheel. - if cibw_pyodide and env_extra.get('CIBW_BUILD') is None: - assert 0, f'Need a Python version for Pyodide.' - CIBW_BUILD = 'cp312*' - env_extra['CIBW_BUILD'] = CIBW_BUILD - log(f'Defaulting to {CIBW_BUILD=} for Pyodide.') - #if cibw_pyodide_version == None: - # cibw_pyodide_version = '0.28.0' + + if platform.system() == 'Linux': + PYMUPDF_SETUP_MUPDF_BUILD = env_extra.get('PYMUPDF_SETUP_MUPDF_BUILD') + if PYMUPDF_SETUP_MUPDF_BUILD and not PYMUPDF_SETUP_MUPDF_BUILD.startswith('git:'): + assert PYMUPDF_SETUP_MUPDF_BUILD.startswith('/') + env_extra['PYMUPDF_SETUP_MUPDF_BUILD'] = f'/host/{PYMUPDF_SETUP_MUPDF_BUILD}' + cibuildwheel( env_extra, cibw_name or 'cibuildwheel', cibw_pyodide, cibw_pyodide_version, cibw_sdist, + cibw_test_project, + cibw_test_project_setjmp, + cibw_skip_add_defaults, ) elif command == 'install': p = 'pymupdf' if install_version: - if not install_version.startswith(('==', '>=', '>')): - p = f'{p}==' - p = f'{p}{install_version}' + if install_version.endswith('.whl'): + p = install_version + elif install_version.startswith(('==', '>=', '>')): + p = f'{p}{install_version}' + else: + p = f'{p}=={install_version}' run(f'pip install --force-reinstall {p}') 
have_installed = True @@ -739,7 +832,7 @@ venv, wheel, ): - print(f'{build_isolation=}') + log(f'{build_isolation=}') if build_isolation is None: # On OpenBSD libclang is not available on pypi.org, so we need to force @@ -775,7 +868,16 @@ run(f'pip install{build_isolation_text} -v --force-reinstall {pymupdf_dir_abs}', env_extra=env_extra) -def cibuildwheel(env_extra, cibw_name, cibw_pyodide, cibw_pyodide_version, cibw_sdist): +def cibuildwheel( + env_extra, + cibw_name, + cibw_pyodide, + cibw_pyodide_version, + cibw_sdist, + cibw_test_project, + cibw_test_project_setjmp, + cibw_skip_add_defaults, + ): if cibw_sdist and platform.system() == 'Linux': log(f'Building sdist.') @@ -789,9 +891,19 @@ # Some general flags. if 'CIBW_BUILD_VERBOSITY' not in env_extra: env_extra['CIBW_BUILD_VERBOSITY'] = '1' - if 'CIBW_SKIP' not in env_extra: - env_extra['CIBW_SKIP'] = 'pp* *i686 cp36* cp37* *musllinux* *-win32 *-aarch64' - + + # Add default flags to CIBW_SKIP. + # 2025-10-07: `cp3??t-*` excludes free-threading, which currently breaks + # some tests. + + if cibw_skip_add_defaults: + CIBW_SKIP = env_extra.get('CIBW_SKIP', '') + CIBW_SKIP += ' *i686 *musllinux* *-win32 *-aarch64 cp3??t-*' + CIBW_SKIP = CIBW_SKIP.split() + CIBW_SKIP = sorted(list(set(CIBW_SKIP))) + CIBW_SKIP = ' '.join(CIBW_SKIP) + env_extra['CIBW_SKIP'] = CIBW_SKIP + # Set what wheels to build, if not already specified. if 'CIBW_ARCHS' not in env_extra: if 'CIBW_ARCHS_WINDOWS' not in env_extra: @@ -823,14 +935,18 @@ CIBW_BUILD = env_extra.get('CIBW_BUILD') log(f'{CIBW_BUILD=}') if CIBW_BUILD is None: - if os.environ.get('GITHUB_ACTIONS') == 'true': + if cibw_pyodide: + # Using python-3.13 fixes problems with MuPDF's setjmp/longjmp. + CIBW_BUILD = 'cp313*' + elif os.environ.get('GITHUB_ACTIONS') == 'true': # Build/test all supported Python versions. - CIBW_BUILD = 'cp39* cp310* cp311* cp312* cp313*' + CIBW_BUILD = cibw_cp(*python_versions_minor) else: # Build/test current Python only. 
v = platform.python_version_tuple()[:2] log(f'{v=}') CIBW_BUILD = f'cp{"".join(v)}*' + log(f'Defaulting to {CIBW_BUILD=}.') cibw_pyodide_args = '' if cibw_pyodide: @@ -843,11 +959,25 @@ env_extra['CIBW_PYODIDE_VERSION'] = cibw_pyodide_version env_extra['CIBW_ENABLE'] = 'pyodide-prerelease' - # Pass all the environment variables we have set, to Linux - # docker. Note that this will miss any settings in the original - # environment. - env_extra['CIBW_ENVIRONMENT_PASS_LINUX'] = ' '.join(sorted(env_extra.keys())) - + # Pass all the environment variables we have set, to Linux docker. Note + # that this will miss any settings in the original environment. We have to + # add CIBW_BUILD explicitly because we haven't set it yet. + CIBW_ENVIRONMENT_PASS_LINUX = set(env_extra.keys()) + CIBW_ENVIRONMENT_PASS_LINUX.add('CIBW_BUILD') + CIBW_ENVIRONMENT_PASS_LINUX = sorted(list(CIBW_ENVIRONMENT_PASS_LINUX)) + CIBW_ENVIRONMENT_PASS_LINUX = ' '.join(CIBW_ENVIRONMENT_PASS_LINUX) + env_extra['CIBW_ENVIRONMENT_PASS_LINUX'] = CIBW_ENVIRONMENT_PASS_LINUX + + if cibw_test_project: + cibw_do_test_project( + env_extra, + CIBW_BUILD, + cibw_pyodide, + cibw_pyodide_args, + cibw_test_project_setjmp, + ) + return + # Build for lowest (assumed first) Python version. # CIBW_BUILD_0 = CIBW_BUILD.split()[0] @@ -859,9 +989,175 @@ # will notice that the wheel we built above supports all versions of # Python, so will not actually do any builds here. # + # We only do this if there are more than one Python versions. This still + # duplicates the testing of the first python version. 
+ if len(CIBW_BUILD.split()) > 1: + env_extra['CIBW_BUILD'] = CIBW_BUILD + run(f'cd {pymupdf_dir} && cibuildwheel{cibw_pyodide_args}', env_extra=env_extra) + run(f'ls -ld {pymupdf_dir}/wheelhouse/*') + + +def cibw_do_test_project( + env_extra, + CIBW_BUILD, + cibw_pyodide, + cibw_pyodide_args, + cibw_test_project_setjmp, + ): + testdir = f'{pymupdf_dir_abs}/cibw_test' + shutil.rmtree(testdir, ignore_errors=1) + os.mkdir(testdir) + with open(f'{testdir}/setup.py', 'w') as f: + f.write(textwrap.dedent(f''' + import shutil + import sys + import os + import pipcl + + def build(): + so_leaf = pipcl.build_extension( + name = 'foo', + path_i = 'foo.i', + outdir = 'build', + source_extra = 'qwerty.cpp', + py_limited_api = True, + ) + + return [ + ('build/foo.py', 'foo/__init__.py'), + (f'build/{{so_leaf}}', f'foo/'), + ] + + p = pipcl.Package( + name = 'pymupdf-test', + version = '1.2.3', + fn_build = build, + py_limited_api=True, + ) + + def get_requires_for_build_wheel(config_settings=None): + return ['swig'] + + build_wheel = p.build_wheel + build_sdist = p.build_sdist + + # Handle old-style setup.py command-line usage: + if __name__ == '__main__': + p.handle_argv(sys.argv) + ''')) + with open(f'{testdir}/foo.i', 'w') as f: + if cibw_test_project_setjmp: + f.write(textwrap.dedent(''' + %{ + #include <stdexcept> + + #include <assert.h> + #include <setjmp.h> + #include <stdio.h> + #include <string.h> + + int qwerty(void); + + static sigjmp_buf jmpbuf; + static int bar0(const char* text) + { + printf("bar0(): text: %s\\n", text); + + int q = qwerty(); + printf("bar0(): q=%i\\n", q); + + int len = (int) strlen(text); + printf("bar0(): len=%i\\n", len); + printf("bar0(): calling longjmp().\\n"); + fflush(stdout); + longjmp(jmpbuf, 1); + assert(0); + } + int bar1(const char* text) + { + int ret = 0; + if (setjmp(jmpbuf) == 0) + { + ret = bar0(text); + } + else + { + printf("bar1(): setjmp() returned non-zero.\\n"); + throw std::runtime_error("deliberate exception"); + } + 
assert(0); + } + int bar(const char* text) + { + int ret = 0; + try + { + ret = bar1(text); + } + catch(std::exception& e) + { + printf("bar1(): received exception: %s\\n", e.what()); + } + return ret; + } + %} + int bar(const char* text); + ''')) + else: + f.write(textwrap.dedent(''' + %{ + #include <stdexcept> + + #include <assert.h> + #include <stdio.h> + #include <string.h> + + int qwerty(void); + + int bar(const char* text) + { + qwerty(); + return strlen(text); + } + %} + int bar(const char* text); + ''')) + + with open(f'{testdir}/qwerty.cpp', 'w') as f: + f.write(textwrap.dedent(''' + #include <stdio.h> + int qwerty(void) + { + printf("qwerty()\\n"); + return 3; + } + ''')) + + with open(f'{testdir}/pyproject.toml', 'w') as f: + f.write(textwrap.dedent(''' + [build-system] + # We define required packages in setup.py:get_requires_for_build_wheel(). + requires = [] + + # See pep-517. + # + build-backend = "setup" + backend-path = ["."] + ''')) + + shutil.copy2(f'{pymupdf_dir_abs}/pipcl.py', f'{testdir}/pipcl.py') + shutil.copy2(f'{pymupdf_dir_abs}/wdev.py', f'{testdir}/wdev.py') + env_extra['CIBW_BUILD'] = CIBW_BUILD - run(f'cd {pymupdf_dir} && cibuildwheel{cibw_pyodide_args}', env_extra=env_extra) - run(f'ls -ld {pymupdf_dir}/wheelhouse/*') + CIBW_TEST_COMMAND = '' + if cibw_pyodide: + CIBW_TEST_COMMAND += 'pyodide xbuildenv search --all; ' + CIBW_TEST_COMMAND += 'python -c "import foo; foo.bar(\\"some text\\")"' + env_extra['CIBW_TEST_COMMAND'] = CIBW_TEST_COMMAND + #env_extra['CIBW_TEST_COMMAND'] = '' + + run(f'cd {testdir} && cibuildwheel --output-dir ../wheelhouse{cibw_pyodide_args}', env_extra=env_extra) + run(f'ls -ldt {pymupdf_dir_abs}/wheelhouse/*') def build_pyodide_wheel(pyodide_build_version=None): @@ -1088,16 +1384,21 @@ PYODIDE_ROOT = os.environ.get('PYODIDE_ROOT') if PYODIDE_ROOT is not None: + # We can't install packages with `pip install`; setup.py will have + # specified pytest in the wheels's <requires_dist>, so it will be + # already 
installed. + # log(f'Not installing test packages because {PYODIDE_ROOT=}.') - command = f'{pytest_options} {pytest_arg} -s' + command = f'{pytest_options} {pytest_arg}' args = shlex.split(command) - print(f'{PYODIDE_ROOT=} so calling pytest.main(args).') - print(f'{command=}') - print(f'args are ({len(args)}):') + log(f'{PYODIDE_ROOT=} so calling pytest.main(args).') + log(f'{command=}') + log(f'args are ({len(args)}):') for arg in args: - print(f' {arg!r}') + log(f' {arg!r}') import pytest - pytest.main(args) + e = pytest.main(args) + assert e == 0, f'pytest.main() failed: {e=}' return if venv >= 2: @@ -1163,7 +1464,7 @@ # Always start by removing any test_*_fitz.py files. for p in glob.glob(f'{pymupdf_dir_rel}/tests/test_*_fitz.py'): - print(f'Removing {p=}') + log(f'Removing {p=}') os.remove(p) if test_fitz: # Create copies of each test file, modified to use `pymupdf` @@ -1175,7 +1476,7 @@ continue branch, leaf = os.path.split(p) p2 = f'{branch}/{leaf[:5]}fitz_{leaf[5:]}' - print(f'Converting {p=} to {p2=}.') + log(f'Converting {p=} to {p2=}.') with open(p, encoding='utf8') as f: text = f.read() text2 = re.sub("([^\'])\\bpymupdf\\b", '\\1fitz', text)
--- a/setup.py Mon Sep 15 11:43:07 2025 +0200 +++ b/setup.py Sat Oct 11 11:19:58 2025 +0200 @@ -88,11 +88,12 @@ Empty string: Build PyMuPDF with the system MuPDF. A string starting with 'git:': - Use `git clone` to get a MuPDF checkout. We use the - string in the git clone command; it must contain the git - URL from which to clone, and can also contain other `git - clone` args, for example: - PYMUPDF_SETUP_MUPDF_BUILD="git:--branch master https://github.com/ArtifexSoftware/mupdf.git" + We use `git` commands to clone/update a local MuPDF checkout. + Should match `git:[--branch <branch>][--tag <tag>][<remote>]`. + If <remote> is omitted we use a default. + For example: + PYMUPDF_SETUP_MUPDF_BUILD="git:--branch master" + Passed as <text> arg to pipcl.git_get(). Otherwise: Location of mupdf directory. @@ -425,7 +426,7 @@ mupdf_tgz = os.path.abspath( f'{__file__}/../mupdf.tgz') -def get_mupdf_internal(out, location=None, sha=None, local_tgz=None): +def get_mupdf_internal(out, location=None, local_tgz=None): ''' Gets MuPDF as either a .tgz or a local directory. @@ -438,8 +439,6 @@ If starts with 'git:', should be remote git location. Otherwise if containing '://' should be URL for .tgz. Otherwise should path of local mupdf checkout. - sha: - If not None and we use git clone, we checkout this sha. local_tgz: If not None, must be local .tgz file. Returns: @@ -451,7 +450,7 @@ default location. ''' - log(f'get_mupdf_internal(): {out=} {location=} {sha=}') + log(f'get_mupdf_internal(): {out=} {location=}') assert out in ('dir', 'tgz') if location is None: location = f'https://mupdf.com/downloads/archive/mupdf-{version_mupdf}-source.tar.gz' @@ -465,21 +464,15 @@ if local_tgz: assert os.path.isfile(local_tgz) elif location.startswith( 'git:'): - location_git = location[4:] local_dir = 'mupdf-git' + pipcl.git_get(local_dir, text=location, remote='https://github.com/ArtifexSoftware/mupdf.git') - # Try to update existing checkout. 
- e = run(f'cd {local_dir} && git pull && git submodule update --init', check=False) - if e: - # No existing git checkout, so do a fresh clone. - _fs_remove(local_dir) - gitargs = location[4:] - run(f'git clone --recursive --depth 1 --shallow-submodules {gitargs} {local_dir}') - # Show sha of checkout. - run( f'cd {local_dir} && git show --pretty=oneline|head -n 1', check=False) - if sha: - run( f'cd {local_dir} && git checkout {sha}') + run( + f'cd {local_dir} && git show --pretty=oneline|head -n 1', + check = False, + prefix = 'mupdf git id: ', + ) elif '://' in location: # Download .tgz. local_tgz = os.path.basename( location) @@ -574,14 +567,10 @@ windows = platform.system() == 'Windows' or platform.system().startswith('CYGWIN') msys2 = platform.system().startswith('MSYS_NT-') -pyodide_flags = '-fwasm-exceptions' - if os.environ.get('PYODIDE') == '1': if os.environ.get('OS') != 'pyodide': log('PYODIDE=1, setting OS=pyodide.') os.environ['OS'] = 'pyodide' - os.environ['XCFLAGS'] = pyodide_flags - os.environ['XCXXFLAGS'] = pyodide_flags pyodide = os.environ.get('OS') == 'pyodide' @@ -704,8 +693,8 @@ add('d', f'{mupdf_build_dir}/libmupdf-threads.a', f'{to_dir_d}/lib/') elif pyodide: add('p', f'{mupdf_build_dir}/_mupdf.so', to_dir) - add('b', f'{mupdf_build_dir}/libmupdfcpp.so', 'PyMuPDF.libs/') - add('b', f'{mupdf_build_dir}/libmupdf.so', 'PyMuPDF.libs/') + add('b', f'{mupdf_build_dir}/libmupdfcpp.so', to_dir) + add('b', f'{mupdf_build_dir}/libmupdf.so', to_dir) else: add('p', f'{mupdf_build_dir}/_mupdf.so', to_dir) add('b', pipcl.get_soname(f'{mupdf_build_dir}/libmupdfcpp.so'), to_dir) @@ -748,10 +737,12 @@ except Exception: return 0 swig_version_tuple = tuple(int_or_0(i) for i in swig_version.split('.')) + version_p_tuple = tuple(int_or_0(i) for i in version_p.split('.')) log(f'{swig_version=}') text = '' text += f'mupdf_location = {mupdf_location!r}\n' text += f'pymupdf_version = {version_p!r}\n' + text += f'pymupdf_version_tuple = {version_p_tuple!r}\n' text 
+= f'pymupdf_git_sha = {sha!r}\n' text += f'pymupdf_git_diff = {diff!r}\n' text += f'pymupdf_git_branch = {branch!r}\n' @@ -1211,10 +1202,6 @@ if cxxflags: compiler_extra += f' {cxxflags}' - if pyodide: - compiler_extra += f' {pyodide_flags}' - linker_extra += f' {pyodide_flags}' - return compiler_extra, linker_extra, includes, defines, optimise, debug, libpaths, libs, libraries, @@ -1280,9 +1267,9 @@ # # PyMuPDF version. -version_p = '1.26.4' +version_p = '1.26.5' -version_mupdf = '1.26.7' +version_mupdf = '1.26.10' # PyMuPDFb version. This is the PyMuPDF version whose PyMuPDFb wheels we will # (re)use if generating separate PyMuPDFb wheels. Though as of PyMuPDF-1.24.11 @@ -1413,9 +1400,6 @@ ret.append(libclang) elif openbsd: print(f'OpenBSD: libclang not available via pip; assuming `pkg_add py3-llvm`.') - elif darwin and platform.machine() == 'arm64': - print(f'MacOS/arm64: forcing use of libclang 16.0.6 because 18.1.1 known to fail with `clang.cindex.TranslationUnitLoadError: Error parsing translation unit.`') - ret.append('libclang==16.0.6') elif darwin and platform_release_tuple() < (18,): # There are still of problems when building on old macos. ret.append('libclang==14.0.6')
--- a/src/__init__.py Mon Sep 15 11:43:07 2025 +0200 +++ b/src/__init__.py Sat Oct 11 11:19:58 2025 +0200 @@ -383,6 +383,7 @@ from ._build import pymupdf_git_diff # noqa F401 from ._build import pymupdf_git_sha # noqa F401 from ._build import pymupdf_version # noqa F401 +from ._build import pymupdf_version_tuple # noqa F401 from ._build import swig_version # noqa F401 from ._build import swig_version_tuple # noqa F401 @@ -393,7 +394,6 @@ # Versions as tuples; useful when comparing versions. # -pymupdf_version_tuple = tuple( [_int_rc(i) for i in pymupdf_version.split('.')]) mupdf_version_tuple = tuple( [_int_rc(i) for i in mupdf_version.split('.')]) assert mupdf_version_tuple == (mupdf.FZ_VERSION_MAJOR, mupdf.FZ_VERSION_MINOR, mupdf.FZ_VERSION_PATCH), \ @@ -1035,6 +1035,12 @@ stream = JM_BinFromBuffer(buf) res['stream'] = stream return res + + def get_text(self, *args, **kwargs): + return utils.get_text(self, *args, **kwargs) + + def get_textbox(self, *args, **kwargs): + return utils.get_textbox(self, *args, **kwargs) def get_textpage(self, clip=None, flags=0): """Make annotation TextPage.""" @@ -3058,6 +3064,14 @@ v = JM_pdf_obj_from_str( pdf, font) mupdf.pdf_dict_put( fonts, k, v) + def del_toc_item( + self, + idx: int, + ) -> None: + """Delete TOC / bookmark item by index.""" + xref = self.get_outline_xrefs()[idx] + self._remove_toc_item(xref) + def _delToC(self): """Delete the TOC.""" if self.is_closed or self.is_encrypted: @@ -3103,6 +3117,454 @@ raise ValueError( MSG_BAD_XREF) mupdf.pdf_delete_object(pdf, xref) + def _do_links( + doc1: 'Document', + doc2: 'Document', + from_page: int = -1, + to_page: int = -1, + start_at: int = -1, + ) -> None: + """Insert links contained in copied page range into destination PDF. + + Parameter values **must** equal those of method insert_pdf(), which must + have been previously executed. 
+ """ + #pymupdf.log( 'utils.do_links()') + # -------------------------------------------------------------------------- + # internal function to create the actual "/Annots" object string + # -------------------------------------------------------------------------- + def cre_annot(lnk, xref_dst, pno_src, ctm): + """Create annotation object string for a passed-in link.""" + + r = lnk["from"] * ctm # rect in PDF coordinates + rect = _format_g(tuple(r)) + if lnk["kind"] == LINK_GOTO: + txt = annot_skel["goto1"] # annot_goto + idx = pno_src.index(lnk["page"]) + p = lnk["to"] * ctm # target point in PDF coordinates + annot = txt(xref_dst[idx], p.x, p.y, lnk["zoom"], rect) + + elif lnk["kind"] == LINK_GOTOR: + if lnk["page"] >= 0: + txt = annot_skel["gotor1"] # annot_gotor + pnt = lnk.get("to", Point(0, 0)) # destination point + if type(pnt) is not Point: + pnt = Point(0, 0) + annot = txt( + lnk["page"], + pnt.x, + pnt.y, + lnk["zoom"], + lnk["file"], + lnk["file"], + rect, + ) + else: + txt = annot_skel["gotor2"] # annot_gotor_n + to = get_pdf_str(lnk["to"]) + to = to[1:-1] + f = lnk["file"] + annot = txt(to, f, rect) + + elif lnk["kind"] == LINK_LAUNCH: + txt = annot_skel["launch"] # annot_launch + annot = txt(lnk["file"], lnk["file"], rect) + + elif lnk["kind"] == LINK_URI: + txt = annot_skel["uri"] # annot_uri + annot = txt(lnk["uri"], rect) + + else: + annot = "" + + return annot + + # -------------------------------------------------------------------------- + + # validate & normalize parameters + if from_page < 0: + fp = 0 + elif from_page >= doc2.page_count: + fp = doc2.page_count - 1 + else: + fp = from_page + + if to_page < 0 or to_page >= doc2.page_count: + tp = doc2.page_count - 1 + else: + tp = to_page + + if start_at < 0: + raise ValueError("'start_at' must be >= 0") + sa = start_at + + incr = 1 if fp <= tp else -1 # page range could be reversed + + # lists of source / destination page numbers + pno_src = list(range(fp, tp + incr, incr)) + pno_dst = [sa + 
i for i in range(len(pno_src))] + + # lists of source / destination page xrefs + xref_src = [] + xref_dst = [] + for i in range(len(pno_src)): + p_src = pno_src[i] + p_dst = pno_dst[i] + old_xref = doc2.page_xref(p_src) + new_xref = doc1.page_xref(p_dst) + xref_src.append(old_xref) + xref_dst.append(new_xref) + + # create the links for each copied page in destination PDF + for i in range(len(xref_src)): + page_src = doc2[pno_src[i]] # load source page + links = page_src.get_links() # get all its links + #log( '{pno_src=}') + #log( '{type(page_src)=}') + #log( '{page_src=}') + #log( '{=i len(links)}') + if len(links) == 0: # no links there + page_src = None + continue + ctm = ~page_src.transformation_matrix # calc page transformation matrix + page_dst = doc1[pno_dst[i]] # load destination page + link_tab = [] # store all link definitions here + for l in links: + if l["kind"] == LINK_GOTO and (l["page"] not in pno_src): + continue # GOTO link target not in copied pages + annot_text = cre_annot(l, xref_dst, pno_src, ctm) + if annot_text: + link_tab.append(annot_text) + if link_tab != []: + page_dst._addAnnot_FromString( tuple(link_tab)) + #log( 'utils.do_links() returning.') + + def _do_widgets( + tar: 'Document', + src: 'Document', + graftmap, + from_page: int = -1, + to_page: int = -1, + start_at: int = -1, + join_duplicates=0, + ) -> None: + """Insert widgets of copied page range into target PDF. + + Parameter values **must** equal those of method insert_pdf() which + must have been previously executed. 
+ """ + if not src.is_form_pdf: # nothing to do: source PDF has no fields + return + + def clean_kid_parents(acro_fields): + """ Make sure all kids have correct "Parent" pointers.""" + for i in range(acro_fields.pdf_array_len()): + parent = acro_fields.pdf_array_get(i) + kids = parent.pdf_dict_get(PDF_NAME("Kids")) + for j in range(kids.pdf_array_len()): + kid = kids.pdf_array_get(j) + kid.pdf_dict_put(PDF_NAME("Parent"), parent) + + def join_widgets(pdf, acro_fields, xref1, xref2, name): + """Called for each pair of widgets having the same name. + + Args: + pdf: target MuPDF document + acro_fields: object Root/AcroForm/Fields + xref1, xref2: widget xrefs having same names + name: (str) the name + + Result: + Defined or updated widget parent that points to both widgets. + """ + + def re_target(pdf, acro_fields, xref1, kids1, xref2, kids2): + """Merge widget in xref2 into "Kids" list of widget xref1. + + Args: + xref1, kids1: target widget and its "Kids" array. + xref2, kids2: source wwidget and its "Kids" array (may be empty). + """ + # make indirect objects from widgets + w1_ind = mupdf.pdf_new_indirect(pdf, xref1, 0) + w2_ind = mupdf.pdf_new_indirect(pdf, xref2, 0) + # find source widget in "Fields" array + idx = acro_fields.pdf_array_find(w2_ind) + acro_fields.pdf_array_delete(idx) + + if not kids2.pdf_is_array(): # source widget has no kids + widget = mupdf.pdf_load_object(pdf, xref2) + + # delete name from widget and insert target as parent + widget.pdf_dict_del(PDF_NAME("T")) + widget.pdf_dict_put(PDF_NAME("Parent"), w1_ind) + + # put in target Kids + kids1.pdf_array_push(w2_ind) + else: # copy source kids to target kids + for i in range(kids2.pdf_array_len()): + kid = kids2.pdf_array_get(i) + kid.pdf_dict_put(PDF_NAME("Parent"), w1_ind) + kid_ind = mupdf.pdf_new_indirect(pdf, kid.pdf_to_num(), 0) + kids1.pdf_array_push(kid_ind) + + def new_target(pdf, acro_fields, xref1, w1, xref2, w2, name): + """Make new "Parent" for two widgets with same name. 
+ + Args: + xref1, w1: first widget + xref2, w2: second widget + name: field name + + Result: + Both widgets have no "Kids". We create a new object with the + name and a "Kids" array containing the widgets. + Original widgets must be removed from AcroForm/Fields. + """ + # make new "Parent" object + new = mupdf.pdf_new_dict(pdf, 5) + new.pdf_dict_put_text_string(PDF_NAME("T"), name) + kids = new.pdf_dict_put_array(PDF_NAME("Kids"), 2) + new_obj = mupdf.pdf_add_object(pdf, new) + new_obj_xref = new_obj.pdf_to_num() + new_ind = mupdf.pdf_new_indirect(pdf, new_obj_xref, 0) + + # copy over some required source widget properties + ft = w1.pdf_dict_get(PDF_NAME("FT")) + w1.pdf_dict_del(PDF_NAME("FT")) + new_obj.pdf_dict_put(PDF_NAME("FT"), ft) + + aa = w1.pdf_dict_get(PDF_NAME("AA")) + w1.pdf_dict_del(PDF_NAME("AA")) + new_obj.pdf_dict_put(PDF_NAME("AA"), aa) + + # remove name field, insert "Parent" field in source widgets + w1.pdf_dict_del(PDF_NAME("T")) + w1.pdf_dict_put(PDF_NAME("Parent"), new_ind) + w2.pdf_dict_del(PDF_NAME("T")) + w2.pdf_dict_put(PDF_NAME("Parent"), new_ind) + + # put source widgets in "kids" array + ind1 = mupdf.pdf_new_indirect(pdf, xref1, 0) + ind2 = mupdf.pdf_new_indirect(pdf, xref2, 0) + kids.pdf_array_push(ind1) + kids.pdf_array_push(ind2) + + # remove source widgets from "AcroForm/Fields" + idx = acro_fields.pdf_array_find(ind1) + acro_fields.pdf_array_delete(idx) + idx = acro_fields.pdf_array_find(ind2) + acro_fields.pdf_array_delete(idx) + + acro_fields.pdf_array_push(new_ind) + + w1 = mupdf.pdf_load_object(pdf, xref1) + w2 = mupdf.pdf_load_object(pdf, xref2) + kids1 = w1.pdf_dict_get(PDF_NAME("Kids")) + kids2 = w2.pdf_dict_get(PDF_NAME("Kids")) + + # check which widget has a suitable "Kids" array + if kids1.pdf_is_array(): + re_target(pdf, acro_fields, xref1, kids1, xref2, kids2) # pylint: disable=arguments-out-of-order + elif kids2.pdf_is_array(): + re_target(pdf, acro_fields, xref2, kids2, xref1, kids1) # pylint: 
disable=arguments-out-of-order + else: + new_target(pdf, acro_fields, xref1, w1, xref2, w2, name) # pylint: disable=arguments-out-of-order + + def get_kids(parent, kids_list): + """Return xref list of leaf kids for a parent. + + Call with an empty list. + """ + kids = mupdf.pdf_dict_get(parent, PDF_NAME("Kids")) + if not kids.pdf_is_array(): + return kids_list + for i in range(kids.pdf_array_len()): + kid = kids.pdf_array_get(i) + if mupdf.pdf_is_dict(mupdf.pdf_dict_get(kid, PDF_NAME("Kids"))): + kids_list = get_kids(kid, kids_list) + else: + kids_list.append(kid.pdf_to_num()) + return kids_list + + def kids_xrefs(widget): + """Get the xref of top "Parent" and the list of leaf widgets.""" + kids_list = [] + parent = mupdf.pdf_dict_get(widget, PDF_NAME("Parent")) + parent_xref = parent.pdf_to_num() + if parent_xref == 0: + return parent_xref, kids_list + kids_list = get_kids(parent, kids_list) + return parent_xref, kids_list + + def deduplicate_names(pdf, acro_fields, join_duplicates=False): + """Handle any widget name duplicates caused by the merge.""" + names = {} # key is a widget name, value a list of widgets having it. + + # extract all names and widgets in "AcroForm/Fields" + for i in range(mupdf.pdf_array_len(acro_fields)): + wobject = mupdf.pdf_array_get(acro_fields, i) + xref = wobject.pdf_to_num() + + # extract widget name and collect widget(s) using it + T = mupdf.pdf_dict_get_text_string(wobject, PDF_NAME("T")) + xrefs = names.get(T, []) + xrefs.append(xref) + names[T] = xrefs + + for name, xrefs in names.items(): + if len(xrefs) < 2: + continue + xref0, xref1 = xrefs[:2] # only exactly 2 should occur! 
+ if join_duplicates: # combine fields with equal names + join_widgets(pdf, acro_fields, xref0, xref1, name) + else: # make field names unique + newname = name + f" [{xref1}]" # append this to the name + wobject = mupdf.pdf_load_object(pdf, xref1) + wobject.pdf_dict_put_text_string(PDF_NAME("T"), newname) + + clean_kid_parents(acro_fields) + + def get_acroform(doc): + """Retrieve the AcroForm dictionary form a PDF.""" + pdf = mupdf.pdf_document_from_fz_document(doc) + # AcroForm (= central form field info) + return mupdf.pdf_dict_getp(mupdf.pdf_trailer(pdf), "Root/AcroForm") + + tarpdf = mupdf.pdf_document_from_fz_document(tar) + srcpdf = mupdf.pdf_document_from_fz_document(src) + + if tar.is_form_pdf: + # target is a Form PDF, so use it to include source fields + acro = get_acroform(tar) + # Important arrays in AcroForm + acro_fields = acro.pdf_dict_get(PDF_NAME("Fields")) + tar_co = acro.pdf_dict_get(PDF_NAME("CO")) + if not tar_co.pdf_is_array(): + tar_co = acro.pdf_dict_put_array(PDF_NAME("CO"), 5) + else: + # target is no Form PDF, so copy over source AcroForm + acro = mupdf.pdf_deep_copy_obj(get_acroform(src)) # make a copy + + # Clear "Fields" and "CO" arrays: will be populated by page fields. + # This is required to avoid copying unneeded objects. 
+ acro.pdf_dict_del(PDF_NAME("Fields")) + acro.pdf_dict_put_array(PDF_NAME("Fields"), 5) + acro.pdf_dict_del(PDF_NAME("CO")) + acro.pdf_dict_put_array(PDF_NAME("CO"), 5) + + # Enrich AcroForm for copying to target + acro_graft = mupdf.pdf_graft_mapped_object(graftmap, acro) + + # Insert AcroForm into target PDF + acro_tar = mupdf.pdf_add_object(tarpdf, acro_graft) + acro_fields = acro_tar.pdf_dict_get(PDF_NAME("Fields")) + tar_co = acro_tar.pdf_dict_get(PDF_NAME("CO")) + + # get its xref and insert it into target catalog + tar_xref = acro_tar.pdf_to_num() + acro_tar_ind = mupdf.pdf_new_indirect(tarpdf, tar_xref, 0) + root = mupdf.pdf_dict_get(mupdf.pdf_trailer(tarpdf), PDF_NAME("Root")) + root.pdf_dict_put(PDF_NAME("AcroForm"), acro_tar_ind) + + if from_page <= to_page: + src_range = range(from_page, to_page + 1) + else: + src_range = range(from_page, to_page - 1, -1) + + parents = {} # information about widget parents + + # remove "P" owning page reference from all widgets of all source pages + for i in src_range: + src_page = src[i] + for xref in [ + xref + for xref, wtype, _ in src_page.annot_xrefs() + if wtype == mupdf.PDF_ANNOT_WIDGET # pylint: disable=no-member + ]: + w_obj = mupdf.pdf_load_object(srcpdf, xref) + w_obj.pdf_dict_del(PDF_NAME("P")) + + # get the widget's parent structure + parent_xref, old_kids = kids_xrefs(w_obj) + if parent_xref: + parents[parent_xref] = { + "new_xref": 0, + "old_kids": old_kids, + "new_kids": [], + } + # Copy over Parent widgets first - they are not page-dependent + for xref in parents.keys(): # pylint: disable=consider-using-dict-items + parent = mupdf.pdf_load_object(srcpdf, xref) + parent_graft = mupdf.pdf_graft_mapped_object(graftmap, parent) + parent_tar = mupdf.pdf_add_object(tarpdf, parent_graft) + kids_xrefs_new = get_kids(parent_tar, []) + parent_xref_new = parent_tar.pdf_to_num() + parent_ind = mupdf.pdf_new_indirect(tarpdf, parent_xref_new, 0) + acro_fields.pdf_array_push(parent_ind) + parents[xref]["new_xref"] = 
parent_xref_new + parents[xref]["new_kids"] = kids_xrefs_new + + for i in range(len(src_range)): + # read first copied over page in target + tar_page = tar[start_at + i] + + # read the original page in the source PDF + src_page = src[src_range[i]] + + # now walk through source page widgets and copy over + w_xrefs = [ # widget xrefs of the source page + xref + for xref, wtype, _ in src_page.annot_xrefs() + if wtype == mupdf.PDF_ANNOT_WIDGET # pylint: disable=no-member + ] + if not w_xrefs: # no widgets on this source page + continue + + # convert to formal PDF page + tar_page_pdf = mupdf.pdf_page_from_fz_page(tar_page) + + # extract annotations array + tar_annots = mupdf.pdf_dict_get(tar_page_pdf.obj(), PDF_NAME("Annots")) + if not mupdf.pdf_is_array(tar_annots): + tar_annots = mupdf.pdf_dict_put_array( + tar_page_pdf.obj(), PDF_NAME("Annots"), 5 + ) + + for xref in w_xrefs: + w_obj = mupdf.pdf_load_object(srcpdf, xref) + + # check if field takes part in inter-field validations + is_aac = mupdf.pdf_is_dict(mupdf.pdf_dict_getp(w_obj, "AA/C")) + + # check if parent of widget already in target + parent_xref = mupdf.pdf_to_num( + w_obj.pdf_dict_get(PDF_NAME("Parent")) + ) + if parent_xref == 0: # parent not in target yet + try: + w_obj_graft = mupdf.pdf_graft_mapped_object(graftmap, w_obj) + except Exception as e: + message_warning(f"cannot copy widget at {xref=}: {e}") + continue + w_obj_tar = mupdf.pdf_add_object(tarpdf, w_obj_graft) + tar_xref = w_obj_tar.pdf_to_num() + w_obj_tar_ind = mupdf.pdf_new_indirect(tarpdf, tar_xref, 0) + mupdf.pdf_array_push(tar_annots, w_obj_tar_ind) + mupdf.pdf_array_push(acro_fields, w_obj_tar_ind) + else: + parent = parents[parent_xref] + idx = parent["old_kids"].index(xref) # search for xref in parent + tar_xref = parent["new_kids"][idx] + w_obj_tar_ind = mupdf.pdf_new_indirect(tarpdf, tar_xref, 0) + mupdf.pdf_array_push(tar_annots, w_obj_tar_ind) + + # Into "AcroForm/CO" if a computation field. 
+ if is_aac: + mupdf.pdf_array_push(tar_co, w_obj_tar_ind) + + deduplicate_names(tarpdf, acro_fields, join_duplicates=join_duplicates) + def _embeddedFileGet(self, idx): pdf = _as_pdf_document(self) names = mupdf.pdf_dict_getl( @@ -4266,6 +4728,107 @@ self._reset_page_refs() + def get_char_widths( + doc: 'Document', + xref: int, + limit: int = 256, + idx: int = 0, + fontdict: OptDict = None, + ) -> list: + """Get list of glyph information of a font. + + Notes: + Must be provided by its XREF number. If we already dealt with the + font, it will be recorded in doc.FontInfos. Otherwise we insert an + entry there. + Finally we return the glyphs for the font. This is a list of + (glyph, width) where glyph is an integer controlling the char + appearance, and width is a float controlling the char's spacing: + width * fontsize is the actual space. + For 'simple' fonts, glyph == ord(char) will usually be true. + Exceptions are 'Symbol' and 'ZapfDingbats'. We are providing data for these directly here. 
+ """ + fontinfo = CheckFontInfo(doc, xref) + if fontinfo is None: # not recorded yet: create it + if fontdict is None: + name, ext, stype, asc, dsc = utils._get_font_properties(doc, xref) + fontdict = { + "name": name, + "type": stype, + "ext": ext, + "ascender": asc, + "descender": dsc, + } + else: + name = fontdict["name"] + ext = fontdict["ext"] + stype = fontdict["type"] + ordering = fontdict["ordering"] + simple = fontdict["simple"] + + if ext == "": + raise ValueError("xref is not a font") + + # check for 'simple' fonts + if stype in ("Type1", "MMType1", "TrueType"): + simple = True + else: + simple = False + + # check for CJK fonts + if name in ("Fangti", "Ming"): + ordering = 0 + elif name in ("Heiti", "Song"): + ordering = 1 + elif name in ("Gothic", "Mincho"): + ordering = 2 + elif name in ("Dotum", "Batang"): + ordering = 3 + else: + ordering = -1 + + fontdict["simple"] = simple + + if name == "ZapfDingbats": + glyphs = zapf_glyphs + elif name == "Symbol": + glyphs = symbol_glyphs + else: + glyphs = None + + fontdict["glyphs"] = glyphs + fontdict["ordering"] = ordering + fontinfo = [xref, fontdict] + doc.FontInfos.append(fontinfo) + else: + fontdict = fontinfo[1] + glyphs = fontdict["glyphs"] + simple = fontdict["simple"] + ordering = fontdict["ordering"] + + if glyphs is None: + oldlimit = 0 + else: + oldlimit = len(glyphs) + + mylimit = max(256, limit) + + if mylimit <= oldlimit: + return glyphs + + if ordering < 0: # not a CJK font + glyphs = doc._get_char_widths( + xref, fontdict["name"], fontdict["ext"], fontdict["ordering"], mylimit, idx + ) + else: # CJK fonts use char codes and width = 1 + glyphs = None + + fontdict["glyphs"] = glyphs + fontinfo[1] = fontdict + UpdateFontInfo(doc, fontinfo) + + return glyphs + def get_layer(self, config=-1): """Content of ON, OFF, RBGroups of an OC layer.""" pdf = _as_pdf_document(self) @@ -4323,6 +4886,23 @@ xref = mupdf.pdf_create_object(pdf) return xref + def get_oc(doc: 'Document', xref: int) -> int: + 
"""Return optional content object xref for an image or form xobject. + + Args: + xref: (int) xref number of an image or form xobject. + """ + if doc.is_closed or doc.is_encrypted: + raise ValueError("document close or encrypted") + t, name = doc.xref_get_key(xref, "Subtype") + if t != "name" or name not in ("/Image", "/Form"): + raise ValueError("bad object type at xref %i" % xref) + t, oc = doc.xref_get_key(xref, "OC") + if t != "xref": + return 0 + rc = int(oc.replace("0 R", "")) + return rc + def get_ocgs(self): """Show existing optional content groups.""" ci = mupdf.pdf_new_name( "CreatorInfo") @@ -4355,7 +4935,11 @@ o = mupdf.pdf_array_get( intent, j) if mupdf.pdf_is_name( o): intents.append( mupdf.pdf_to_name( o)) - hidden = mupdf.pdf_is_ocg_hidden( pdf, mupdf.PdfObj(), usage, ocg) + if mupdf_version_tuple >= (1, 27): + resource_stack = mupdf.PdfResourceStack() + hidden = mupdf.pdf_is_ocg_hidden( pdf, resource_stack, usage, ocg) + else: + hidden = mupdf.pdf_is_ocg_hidden( pdf, mupdf.PdfObj(), usage, ocg) item = { "name": name, "intent": intents, @@ -4366,6 +4950,73 @@ rc[ temp] = item return rc + def get_ocmd(doc: 'Document', xref: int) -> dict: + """Return the definition of an OCMD (optional content membership dictionary). + + Recognizes PDF dict keys /OCGs (PDF array of OCGs), /P (policy string) and + /VE (visibility expression, PDF array). Via string manipulation, this + info is converted to a Python dictionary with keys "xref", "ocgs", "policy" + and "ve" - ready to recycle as input for 'set_ocmd()'. 
+ """ + + if xref not in range(doc.xref_length()): + raise ValueError("bad xref") + text = doc.xref_object(xref, compressed=True) + if "/Type/OCMD" not in text: + raise ValueError("bad object type") + textlen = len(text) + + p0 = text.find("/OCGs[") # look for /OCGs key + p1 = text.find("]", p0) + if p0 < 0 or p1 < 0: # no OCGs found + ocgs = None + else: + ocgs = text[p0 + 6 : p1].replace("0 R", " ").split() + ocgs = list(map(int, ocgs)) + + p0 = text.find("/P/") # look for /P policy key + if p0 < 0: + policy = None + else: + p1 = text.find("ff", p0) + if p1 < 0: + p1 = text.find("on", p0) + if p1 < 0: # some irregular syntax + raise ValueError("bad object at xref") + else: + policy = text[p0 + 3 : p1 + 2] + + p0 = text.find("/VE[") # look for /VE visibility expression key + if p0 < 0: # no visibility expression found + ve = None + else: + lp = rp = 0 # find end of /VE by finding last ']'. + p1 = p0 + while lp < 1 or lp != rp: + p1 += 1 + if not p1 < textlen: # some irregular syntax + raise ValueError("bad object at xref") + if text[p1] == "[": + lp += 1 + if text[p1] == "]": + rp += 1 + # p1 now positioned at the last "]" + ve = text[p0 + 3 : p1 + 1] # the PDF /VE array + ve = ( + ve.replace("/And", '"and",') + .replace("/Not", '"not",') + .replace("/Or", '"or",') + ) + ve = ve.replace(" 0 R]", "]").replace(" 0 R", ",").replace("][", "],[") + import json + try: + ve = json.loads(ve) + except Exception: + exception_info() + message(f"bad /VE key: {ve!r}") + raise + return {"xref": xref, "ocgs": ocgs, "policy": policy, "ve": ve} + def get_outline_xrefs(self): """Get list of outline xref numbers.""" xrefs = [] @@ -4414,6 +5065,98 @@ return [v[:-1] for v in val] return val + def get_page_labels(self): + """Return page label definitions in PDF document. + + Returns: + A list of dictionaries with the following format: + {'startpage': int, 'prefix': str, 'style': str, 'firstpagenum': int}. 
+ """ + # Jorj McKie, 2021-01-10 + return [utils.rule_dict(item) for item in self._get_page_labels()] + + def get_page_numbers(doc, label, only_one=False): + """Return a list of page numbers with the given label. + + Args: + doc: PDF document object (resp. 'self'). + label: (str) label. + only_one: (bool) stop searching after first hit. + Returns: + List of page numbers having this label. + """ + # Jorj McKie, 2021-01-06 + + numbers = [] + if not label: + return numbers + labels = doc._get_page_labels() + if labels == []: + return numbers + for i in range(doc.page_count): + plabel = utils.get_label_pno(i, labels) + if plabel == label: + numbers.append(i) + if only_one: + break + return numbers + + def get_page_pixmap( + doc: 'Document', + pno: int, + *, + matrix: matrix_like = None, + dpi=None, + colorspace: Colorspace = None, + clip: rect_like = None, + alpha: bool = False, + annots: bool = True, + ) -> 'Pixmap': + """Create pixmap of document page by page number. + + Notes: + Convenience function calling page.get_pixmap. + Args: + pno: (int) page number + matrix: pymupdf.Matrix for transformation (default: pymupdf.Identity). + colorspace: (str,pymupdf.Colorspace) rgb, rgb, gray - case ignored, default csRGB. + clip: (irect-like) restrict rendering to this area. + alpha: (bool) include alpha channel + annots: (bool) also render annotations + """ + if matrix is None: + matrix = Identity + if colorspace is None: + colorspace = csRGB + return doc[pno].get_pixmap( + matrix=matrix, + dpi=dpi, colorspace=colorspace, + clip=clip, + alpha=alpha, + annots=annots + ) + + def get_page_text( + doc: 'Document', + pno: int, + option: str = "text", + clip: rect_like = None, + flags: OptInt = None, + textpage: 'TextPage' = None, + sort: bool = False, + ) -> typing.Any: + """Extract a document page's text by page number. + + Notes: + Convenience function calling page.get_text(). 
+ Args: + pno: page number + option: (str) text, words, blocks, html, dict, json, rawdict, xhtml or xml. + Returns: + output from page.TextPage(). + """ + return doc[pno].get_text(option, clip=clip, flags=flags, sort=sort) + def get_page_xobjects(self, pno: int) -> list: """Retrieve a list of XObjects used on a page. """ @@ -4440,6 +5183,60 @@ sigflag = mupdf.pdf_to_int(sigflags) return sigflag + def get_toc( + doc: 'Document', + simple: bool = True, + ) -> list: + """Create a table of contents. + + Args: + simple: a bool to control output. Returns a list, where each entry consists of outline level, title, page number and link destination (if simple = False). For details see PyMuPDF's documentation. + """ + def recurse(olItem, liste, lvl): + """Recursively follow the outline item chain and record item information in a list.""" + while olItem and olItem.this.m_internal: + if olItem.title: + title = olItem.title + else: + title = " " + + if not olItem.is_external: + if olItem.uri: + if olItem.page == -1: + resolve = doc.resolve_link(olItem.uri) + page = resolve[0] + 1 + else: + page = olItem.page + 1 + else: + page = -1 + else: + page = -1 + + if not simple: + link = utils.getLinkDict(olItem, doc) + liste.append([lvl, title, page, link]) + else: + liste.append([lvl, title, page]) + + if olItem.down: + liste = recurse(olItem.down, liste, lvl + 1) + olItem = olItem.next + return liste + + # ensure document is open + if doc.is_closed: + raise ValueError("document closed") + doc.init_doc() + olItem = doc.outline + if not olItem: + return [] + lvl = 1 + liste = [] + toc = recurse(olItem, liste, lvl) + if doc.is_pdf and not simple: + doc._extend_toc_items(toc) + return toc + def get_xml_metadata(self): """Get document XML metadata.""" xml = None @@ -4457,6 +5254,31 @@ rc = '' return rc + def has_annots(doc: 'Document') -> bool: + """Check whether there are annotations on any page.""" + if doc.is_closed: + raise ValueError("document closed") + if not doc.is_pdf: + raise 
ValueError("is no PDF") + for i in range(doc.page_count): + for item in doc.page_annot_xrefs(i): + # pylint: disable=no-member + if not (item[1] == mupdf.PDF_ANNOT_LINK or item[1] == mupdf.PDF_ANNOT_WIDGET): # pylint: disable=no-member + return True + return False + + def has_links(doc: 'Document') -> bool: + """Check whether there are links on any page.""" + if doc.is_closed: + raise ValueError("document closed") + if not doc.is_pdf: + raise ValueError("is no PDF") + for i in range(doc.page_count): + for item in doc.page_annot_xrefs(i): + if item[1] == mupdf.PDF_ANNOT_LINK: # pylint: disable=no-member + return True + return False + def init_doc(self): if self.is_encrypted: raise ValueError("cannot initialize - document still encrypted") @@ -4522,6 +5344,36 @@ final=final, ) + def insert_page( + doc: 'Document', + pno: int, + text: typing.Union[str, list, None] = None, + fontsize: float = 11, + width: float = 595, + height: float = 842, + fontname: str = "helv", + fontfile: OptStr = None, + color: OptSeq = (0,), + ) -> int: + """Create a new PDF page and insert some text. + + Notes: + Function combining pymupdf.Document.new_page() and pymupdf.Page.insert_text(). + For parameter details see these methods. + """ + page = doc.new_page(pno=pno, width=width, height=height) + if not bool(text): + return 0 + rc = page.insert_text( + (50, 72), + text, + fontsize=fontsize, + fontname=fontname, + fontfile=fontfile, + color=color, + ) + return rc + def insert_pdf( self, docsrc, @@ -5022,6 +5874,24 @@ ret = mupdf.fz_needs_password( document) return ret + def new_page( + doc: 'Document', + pno: int = -1, + width: float = 595, + height: float = 842, + ) -> Page: + """Create and return a new page object. + + Args: + pno: (int) insert before this page. Default: after last page. + width: (float) page width in points. Default: 595 (ISO A4 width). + height: (float) page height in points. Default 842 (ISO A4 height). + Returns: + A pymupdf.Page object. 
+ """ + doc._newPage(pno, width=width, height=height) + return doc[pno] + def next_location(self, page_id): """Get (chapter, page) of next page.""" if self.is_closed or self.is_encrypted: @@ -5668,6 +6538,201 @@ """ Save PDF incrementally""" return self.save(self.name, incremental=True, encryption=mupdf.PDF_ENCRYPT_KEEP) + # ------------------------------------------------------------------------------ + # Remove potentially sensitive data from a PDF. Similar to the Adobe + # Acrobat 'sanitize' function + # ------------------------------------------------------------------------------ + def scrub( + doc: 'Document', + attached_files: bool = True, + clean_pages: bool = True, + embedded_files: bool = True, + hidden_text: bool = True, + javascript: bool = True, + metadata: bool = True, + redactions: bool = True, + redact_images: int = 0, + remove_links: bool = True, + reset_fields: bool = True, + reset_responses: bool = True, + thumbnails: bool = True, + xml_metadata: bool = True, + ) -> None: + + def remove_hidden(cont_lines): + """Remove hidden text from a PDF page. + + Args: + cont_lines: list of lines with /Contents content. Should have status + from after page.cleanContents(). + + Returns: + List of /Contents lines from which hidden text has been removed. + + Notes: + The input must have been created after the page's /Contents object(s) + have been cleaned with page.cleanContents(). This ensures a standard + formatting: one command per line, single spaces between operators. + This allows for drastic simplification of this code. 
+ """ + out_lines = [] # will return this + in_text = False # indicate if within BT/ET object + suppress = False # indicate text suppression active + make_return = False + for line in cont_lines: + if line == b"BT": # start of text object + in_text = True # switch on + out_lines.append(line) # output it + continue + if line == b"ET": # end of text object + in_text = False # switch off + out_lines.append(line) # output it + continue + if line == b"3 Tr": # text suppression operator + suppress = True # switch on + make_return = True + continue + if line[-2:] == b"Tr" and line[0] != b"3": + suppress = False # text rendering changed + out_lines.append(line) + continue + if line == b"Q": # unstack command also switches off + suppress = False + out_lines.append(line) + continue + if suppress and in_text: # suppress hidden lines + continue + out_lines.append(line) + if make_return: + return out_lines + else: + return None + + if not doc.is_pdf: # only works for PDF + raise ValueError("is no PDF") + if doc.is_encrypted or doc.is_closed: + raise ValueError("closed or encrypted doc") + + if not clean_pages: + hidden_text = False + redactions = False + + if metadata: + doc.set_metadata({}) # remove standard metadata + + for page in doc: + if reset_fields: + # reset form fields (widgets) + for widget in page.widgets(): + widget.reset() + + if remove_links: + links = page.get_links() # list of all links on page + for link in links: # remove all links + page.delete_link(link) + + found_redacts = False + for annot in page.annots(): + if annot.type[0] == mupdf.PDF_ANNOT_FILE_ATTACHMENT and attached_files: + annot.update_file(buffer_=b" ") # set file content to empty + if reset_responses: + annot.delete_responses() + if annot.type[0] == mupdf.PDF_ANNOT_REDACT: # pylint: disable=no-member + found_redacts = True + + if redactions and found_redacts: + page.apply_redactions(images=redact_images) + + if not (clean_pages or hidden_text): + continue # done with the page + + 
page.clean_contents() + if not page.get_contents(): + continue + if hidden_text: + xrefs = page.get_contents() + assert len(xrefs) == 1 # only one because of cleaning. + xref = xrefs[0] + cont = doc.xref_stream(xref) + cont_lines = remove_hidden(cont.splitlines()) # remove hidden text + if cont_lines: # something was actually removed + cont = b"\n".join(cont_lines) + doc.update_stream(xref, cont) # rewrite the page /Contents + + if thumbnails: # remove page thumbnails? + if doc.xref_get_key(page.xref, "Thumb")[0] != "null": + doc.xref_set_key(page.xref, "Thumb", "null") + + # pages are scrubbed, now perform document-wide scrubbing + # remove embedded files + if embedded_files: + for name in doc.embfile_names(): + doc.embfile_del(name) + + if xml_metadata: + doc.del_xml_metadata() + if not (xml_metadata or javascript): + xref_limit = 0 + else: + xref_limit = doc.xref_length() + for xref in range(1, xref_limit): + if not doc.xref_object(xref): + msg = "bad xref %i - clean PDF before scrubbing" % xref + raise ValueError(msg) + if javascript and doc.xref_get_key(xref, "S")[1] == "/JavaScript": + # a /JavaScript action object + obj = "<</S/JavaScript/JS()>>" # replace with a null JavaScript + doc.update_object(xref, obj) # update this object + continue # no further handling + + if not xml_metadata: + continue + + if doc.xref_get_key(xref, "Type")[1] == "/Metadata": + # delete any metadata object directly + doc.update_object(xref, "<<>>") + doc.update_stream(xref, b"deleted", new=True) + continue + + if doc.xref_get_key(xref, "Metadata")[0] != "null": + doc.xref_set_key(xref, "Metadata", "null") + + def search_page_for( + doc: 'Document', + pno: int, + text: str, + quads: bool = False, + clip: rect_like = None, + flags: int = None, + textpage: 'TextPage' = None, + ) -> list: + """Search for a string on a page. 
+ + Args: + pno: page number + text: string to be searched for + clip: restrict search to this rectangle + quads: (bool) return quads instead of rectangles + flags: bit switches, default: join hyphened words + textpage: reuse a prepared textpage + Returns: + a list of rectangles or quads, each containing an occurrence. + """ + if flags is None: + flags = (0 + | TEXT_DEHYPHENATE + | TEXT_PRESERVE_LIGATURES + | TEXT_PRESERVE_WHITESPACE + | TEXT_MEDIABOX_CLIP + ) + return doc[pno].search_for( + text, + quads=quads, + clip=clip, + flags=flags, + textpage=textpage, + ) + def select(self, pyliste): """Build sub-pdf with page numbers in the list.""" if self.is_closed or self.is_encrypted: @@ -5812,6 +6877,162 @@ self.xref_set_key(xref, "MarkInfo", pdfdict) return True + def set_metadata(doc: 'Document', m: dict = None) -> None: + """Update the PDF /Info object. + + Args: + m: a dictionary like doc.metadata. + """ + if not doc.is_pdf: + raise ValueError("is no PDF") + if doc.is_closed or doc.is_encrypted: + raise ValueError("document closed or encrypted") + if m is None: + m = {} + elif type(m) is not dict: + raise ValueError("bad metadata") + keymap = { + "author": "Author", + "producer": "Producer", + "creator": "Creator", + "title": "Title", + "format": None, + "encryption": None, + "creationDate": "CreationDate", + "modDate": "ModDate", + "subject": "Subject", + "keywords": "Keywords", + "trapped": "Trapped", + } + valid_keys = set(keymap.keys()) + diff_set = set(m.keys()).difference(valid_keys) + if diff_set != set(): + msg = "bad dict key(s): %s" % diff_set + raise ValueError(msg) + + t, temp = doc.xref_get_key(-1, "Info") + if t != "xref": + info_xref = 0 + else: + info_xref = int(temp.replace("0 R", "")) + + if m == {} and info_xref == 0: # nothing to do + return + + if info_xref == 0: # no prev metadata: get new xref + info_xref = doc.get_new_xref() + doc.update_object(info_xref, "<<>>") # fill it with empty object + doc.xref_set_key(-1, "Info", "%i 0 R" % 
info_xref) + elif m == {}: # remove existing metadata + doc.xref_set_key(-1, "Info", "null") + doc.init_doc() + return + + for key, val in [(k, v) for k, v in m.items() if keymap[k] is not None]: + pdf_key = keymap[key] + if not bool(val) or val in ("none", "null"): + val = "null" + else: + val = get_pdf_str(val) + doc.xref_set_key(info_xref, pdf_key, val) + doc.init_doc() + return + + def set_oc(doc: 'Document', xref: int, oc: int) -> None: + """Attach optional content object to image or form xobject. + + Args: + xref: (int) xref number of an image or form xobject + oc: (int) xref number of an OCG or OCMD + """ + if doc.is_closed or doc.is_encrypted: + raise ValueError("document close or encrypted") + t, name = doc.xref_get_key(xref, "Subtype") + if t != "name" or name not in ("/Image", "/Form"): + raise ValueError("bad object type at xref %i" % xref) + if oc > 0: + t, name = doc.xref_get_key(oc, "Type") + if t != "name" or name not in ("/OCG", "/OCMD"): + raise ValueError("bad object type at xref %i" % oc) + if oc == 0 and "OC" in doc.xref_get_keys(xref): + doc.xref_set_key(xref, "OC", "null") + return None + doc.xref_set_key(xref, "OC", "%i 0 R" % oc) + return None + + def set_ocmd( + doc: 'Document', + xref: int = 0, + ocgs: typing.Union[list, None] = None, + policy: OptStr = None, + ve: typing.Union[list, None] = None, + ) -> int: + """Create or update an OCMD object in a PDF document. + + Args: + xref: (int) 0 for creating a new object, otherwise update existing one. + ocgs: (list) OCG xref numbers, which shall be subject to 'policy'. + policy: one of 'AllOn', 'AllOff', 'AnyOn', 'AnyOff' (any casing). + ve: (list) visibility expression. Use instead of 'ocgs' with 'policy'. + + Returns: + Xref of the created or updated OCMD. 
+ """ + + all_ocgs = set(doc.get_ocgs().keys()) + + def ve_maker(ve): + if type(ve) not in (list, tuple) or len(ve) < 2: + raise ValueError("bad 've' format: %s" % ve) + if ve[0].lower() not in ("and", "or", "not"): + raise ValueError("bad operand: %s" % ve[0]) + if ve[0].lower() == "not" and len(ve) != 2: + raise ValueError("bad 've' format: %s" % ve) + item = "[/%s" % ve[0].title() + for x in ve[1:]: + if type(x) is int: + if x not in all_ocgs: + raise ValueError("bad OCG %i" % x) + item += " %i 0 R" % x + else: + item += " %s" % ve_maker(x) + item += "]" + return item + + text = "<</Type/OCMD" + + if ocgs and type(ocgs) in (list, tuple): # some OCGs are provided + s = set(ocgs).difference(all_ocgs) # contains illegal xrefs + if s != set(): + msg = "bad OCGs: %s" % s + raise ValueError(msg) + text += "/OCGs[" + " ".join(map(lambda x: "%i 0 R" % x, ocgs)) + "]" + + if policy: + policy = str(policy).lower() + pols = { + "anyon": "AnyOn", + "allon": "AllOn", + "anyoff": "AnyOff", + "alloff": "AllOff", + } + if policy not in ("anyon", "allon", "anyoff", "alloff"): + raise ValueError("bad policy: %s" % policy) + text += "/P/%s" % pols[policy] + + if ve: + text += "/VE%s" % ve_maker(ve) + + text += ">>" + + # make new object or replace old OCMD (check type first) + if xref == 0: + xref = doc.get_new_xref() + elif "/Type/OCMD" not in doc.xref_object(xref, compressed=True): + raise ValueError("bad xref or not an OCMD") + doc.update_object(xref, text) + return xref + def set_pagelayout(self, pagelayout: str): """Set the PDF PageLayout value.""" valid = ("SinglePage", "OneColumn", "TwoColumnLeft", "TwoColumnRight", "TwoPageLeft", "TwoPageRight") @@ -5844,6 +7065,349 @@ return True raise ValueError("bad PageMode value") + def set_page_labels(doc, labels): + """Add / replace page label definitions in PDF document. + + Args: + doc: PDF document (resp. 'self'). 
+ labels: list of label dictionaries like: + {'startpage': int, 'prefix': str, 'style': str, 'firstpagenum': int}, + as returned by get_page_labels(). + """ + # William Chapman, 2021-01-06 + + def create_label_str(label): + """Convert Python label dict to corresponding PDF rule string. + + Args: + label: (dict) build rule for the label. + Returns: + PDF label rule string wrapped in "<<", ">>". + """ + s = "%i<<" % label["startpage"] + if label.get("prefix", "") != "": + s += "/P(%s)" % label["prefix"] + if label.get("style", "") != "": + s += "/S/%s" % label["style"] + if label.get("firstpagenum", 1) > 1: + s += "/St %i" % label["firstpagenum"] + s += ">>" + return s + + def create_nums(labels): + """Return concatenated string of all labels rules. + + Args: + labels: (list) dictionaries as created by function 'rule_dict'. + Returns: + PDF compatible string for page label definitions, ready to be + enclosed in PDF array 'Nums[...]'. + """ + labels.sort(key=lambda x: x["startpage"]) + s = "".join([create_label_str(label) for label in labels]) + return s + + doc._set_page_labels(create_nums(labels)) + + def set_toc( + doc: 'Document', + toc: list, + collapse: int = 1, + ) -> int: + """Create new outline tree (table of contents, TOC). + + Args: + toc: (list, tuple) each entry must contain level, title, page and + optionally top margin on the page. None or '()' remove the TOC. + collapse: (int) collapses entries beyond this level. Zero or None + shows all entries unfolded. + Returns: + the number of inserted items, or the number of removed items respectively. 
+ """ + if doc.is_closed or doc.is_encrypted: + raise ValueError("document closed or encrypted") + if not doc.is_pdf: + raise ValueError("is no PDF") + if not toc: # remove all entries + return len(doc._delToC()) + + # validity checks -------------------------------------------------------- + if type(toc) not in (list, tuple): + raise ValueError("'toc' must be list or tuple") + toclen = len(toc) + page_count = doc.page_count + t0 = toc[0] + if type(t0) not in (list, tuple): + raise ValueError("items must be sequences of 3 or 4 items") + if t0[0] != 1: + raise ValueError("hierarchy level of item 0 must be 1") + for i in list(range(toclen - 1)): + t1 = toc[i] + t2 = toc[i + 1] + if not -1 <= t1[2] <= page_count: + raise ValueError("row %i: page number out of range" % i) + if (type(t2) not in (list, tuple)) or len(t2) not in (3, 4): + raise ValueError("bad row %i" % (i + 1)) + if (type(t2[0]) is not int) or t2[0] < 1: + raise ValueError("bad hierarchy level in row %i" % (i + 1)) + if t2[0] > t1[0] + 1: + raise ValueError("bad hierarchy level in row %i" % (i + 1)) + # no formal errors in toc -------------------------------------------------- + + # -------------------------------------------------------------------------- + # make a list of xref numbers, which we can use for our TOC entries + # -------------------------------------------------------------------------- + old_xrefs = doc._delToC() # del old outlines, get their xref numbers + + # prepare table of xrefs for new bookmarks + old_xrefs = [] + xref = [0] + old_xrefs + xref[0] = doc._getOLRootNumber() # entry zero is outline root xref number + if toclen > len(old_xrefs): # too few old xrefs? 
+ for i in range((toclen - len(old_xrefs))): + xref.append(doc.get_new_xref()) # acquire new ones + + lvltab = {0: 0} # to store last entry per hierarchy level + + # ------------------------------------------------------------------------------ + # contains new outline objects as strings - first one is the outline root + # ------------------------------------------------------------------------------ + olitems = [{"count": 0, "first": -1, "last": -1, "xref": xref[0]}] + # ------------------------------------------------------------------------------ + # build olitems as a list of PDF-like connected dictionaries + # ------------------------------------------------------------------------------ + for i in range(toclen): + o = toc[i] + lvl = o[0] # level + title = get_pdf_str(o[1]) # title + pno = min(doc.page_count - 1, max(0, o[2] - 1)) # page number + page_xref = doc.page_xref(pno) + page_height = doc.page_cropbox(pno).height + top = Point(72, page_height - 36) + dest_dict = {"to": top, "kind": LINK_GOTO} # fall back target + if o[2] < 0: + dest_dict["kind"] = LINK_NONE + if len(o) > 3: # some target is specified + if type(o[3]) in (int, float): # convert a number to a point + dest_dict["to"] = Point(72, page_height - o[3]) + else: # if something else, make sure we have a dict + # We make a copy of o[3] to avoid modifying our caller's data. + dest_dict = o[3].copy() if type(o[3]) is dict else dest_dict + if "to" not in dest_dict: # target point not in dict? 
+ dest_dict["to"] = top # put default in + else: # transform target to PDF coordinates + page = doc[pno] + point = Point(dest_dict["to"]) + point.y = page.cropbox.height - point.y + point = point * page.rotation_matrix + dest_dict["to"] = (point.x, point.y) + d = {} + d["first"] = -1 + d["count"] = 0 + d["last"] = -1 + d["prev"] = -1 + d["next"] = -1 + d["dest"] = utils.getDestStr(page_xref, dest_dict) + d["top"] = dest_dict["to"] + d["title"] = title + d["parent"] = lvltab[lvl - 1] + d["xref"] = xref[i + 1] + d["color"] = dest_dict.get("color") + d["flags"] = dest_dict.get("italic", 0) + 2 * dest_dict.get("bold", 0) + lvltab[lvl] = i + 1 + parent = olitems[lvltab[lvl - 1]] # the parent entry + + if ( + dest_dict.get("collapse") or collapse and lvl > collapse + ): # suppress expansion + parent["count"] -= 1 # make /Count negative + else: + parent["count"] += 1 # positive /Count + + if parent["first"] == -1: + parent["first"] = i + 1 + parent["last"] = i + 1 + else: + d["prev"] = parent["last"] + prev = olitems[parent["last"]] + prev["next"] = i + 1 + parent["last"] = i + 1 + olitems.append(d) + + # ------------------------------------------------------------------------------ + # now create each outline item as a string and insert it in the PDF + # ------------------------------------------------------------------------------ + for i, ol in enumerate(olitems): + txt = "<<" + if ol["count"] != 0: + txt += "/Count %i" % ol["count"] + try: + txt += ol["dest"] + except Exception: + # Verbose in PyMuPDF/tests. 
+ if g_exceptions_verbose >= 2: exception_info() + pass + try: + if ol["first"] > -1: + txt += "/First %i 0 R" % xref[ol["first"]] + except Exception: + if g_exceptions_verbose >= 2: exception_info() + pass + try: + if ol["last"] > -1: + txt += "/Last %i 0 R" % xref[ol["last"]] + except Exception: + if g_exceptions_verbose >= 2: exception_info() + pass + try: + if ol["next"] > -1: + txt += "/Next %i 0 R" % xref[ol["next"]] + except Exception: + # Verbose in PyMuPDF/tests. + if g_exceptions_verbose >= 2: exception_info() + pass + try: + if ol["parent"] > -1: + txt += "/Parent %i 0 R" % xref[ol["parent"]] + except Exception: + # Verbose in PyMuPDF/tests. + if g_exceptions_verbose >= 2: exception_info() + pass + try: + if ol["prev"] > -1: + txt += "/Prev %i 0 R" % xref[ol["prev"]] + except Exception: + # Verbose in PyMuPDF/tests. + if g_exceptions_verbose >= 2: exception_info() + pass + try: + txt += "/Title" + ol["title"] + except Exception: + # Verbose in PyMuPDF/tests. + if g_exceptions_verbose >= 2: exception_info() + pass + + if ol.get("color") and len(ol["color"]) == 3: + txt += f"/C[ {_format_g(tuple(ol['color']))}]" + if ol.get("flags", 0) > 0: + txt += "/F %i" % ol["flags"] + + if i == 0: # special: this is the outline root + txt += "/Type/Outlines" # so add the /Type entry + txt += ">>" + doc.update_object(xref[i], txt) # insert the PDF object + + doc.init_doc() + return toclen + + def set_toc_item( + doc: 'Document', + idx: int, + dest_dict: OptDict = None, + kind: OptInt = None, + pno: OptInt = None, + uri: OptStr = None, + title: OptStr = None, + to: point_like = None, + filename: OptStr = None, + zoom: float = 0, + ) -> None: + """Update TOC item by index. + + It allows changing the item's title and link destination. + + Args: + idx: + (int) desired index of the TOC list, as created by get_toc. + dest_dict: + (dict) destination dictionary as created by get_toc(False). + Outrules all other parameters. 
If None, the remaining parameters + are used to make a dest dictionary. + kind: + (int) kind of link (pymupdf.LINK_GOTO, etc.). If None, then only + the title will be updated. If pymupdf.LINK_NONE, the TOC item will + be deleted. + pno: + (int) page number (1-based like in get_toc). Required if + pymupdf.LINK_GOTO. + uri: + (str) the URL, required if pymupdf.LINK_URI. + title: + (str) the new title. No change if None. + to: + (point-like) destination on the target page. If omitted, (72, 36) + will be used as target coordinates. + filename: + (str) destination filename, required for pymupdf.LINK_GOTOR and + pymupdf.LINK_LAUNCH. + name: + (str) a destination name for pymupdf.LINK_NAMED. + zoom: + (float) a zoom factor for the target location (pymupdf.LINK_GOTO). + """ + xref = doc.get_outline_xrefs()[idx] + page_xref = 0 + if type(dest_dict) is dict: + if dest_dict["kind"] == LINK_GOTO: + pno = dest_dict["page"] + page_xref = doc.page_xref(pno) + page_height = doc.page_cropbox(pno).height + to = dest_dict.get('to', Point(72, 36)) + to.y = page_height - to.y + dest_dict["to"] = to + action = utils.getDestStr(page_xref, dest_dict) + if not action.startswith("/A"): + raise ValueError("bad bookmark dest") + color = dest_dict.get("color") + if color: + color = list(map(float, color)) + if len(color) != 3 or min(color) < 0 or max(color) > 1: + raise ValueError("bad color value") + bold = dest_dict.get("bold", False) + italic = dest_dict.get("italic", False) + flags = italic + 2 * bold + collapse = dest_dict.get("collapse") + return doc._update_toc_item( + xref, + action=action[2:], + title=title, + color=color, + flags=flags, + collapse=collapse, + ) + + if kind == LINK_NONE: # delete bookmark item + return doc.del_toc_item(idx) + if kind is None and title is None: # treat as no-op + return None + if kind is None: # only update title text + return doc._update_toc_item(xref, action=None, title=title) + + if kind == LINK_GOTO: + if pno is None or pno not in range(1, 
doc.page_count + 1): + raise ValueError("bad page number") + page_xref = doc.page_xref(pno - 1) + page_height = doc.page_cropbox(pno - 1).height + if to is None: + to = Point(72, page_height - 36) + else: + to = Point(to) + to.y = page_height - to.y + + ddict = { + "kind": kind, + "to": to, + "uri": uri, + "page": pno, + "file": filename, + "zoom": zoom, + } + action = utils.getDestStr(page_xref, ddict) + if action == "" or not action.startswith("/A"): + raise ValueError("bad bookmark dest") + + return doc._update_toc_item(xref, action=action[2:], title=title) + def set_xml_metadata(self, metadata): """Store XML document level metadata.""" if self.is_closed or self.is_encrypted: @@ -5862,6 +7426,318 @@ mupdf.pdf_dict_put( xml, PDF_NAME('Subtype'), PDF_NAME('XML')) mupdf.pdf_dict_put( root, PDF_NAME('Metadata'), xml) + def subset_fonts(doc: 'Document', verbose: bool = False, fallback: bool = False) -> OptInt: + """Build font subsets in a PDF. + + Eligible fonts are potentially replaced by smaller versions. Page text is + NOT rewritten and thus should retain properties like being hidden or + controlled by optional content. + + This method by default uses MuPDF's own internal feature to create subset + fonts. As this is a new function, errors may still occur. In this case, + please fall back to using the previous version by using "fallback=True". + Fallback mode requires the external package 'fontTools'. + + Args: + fallback: use the older deprecated implementation. + verbose: only used by fallback mode. + + Returns: + The new MuPDF-based code returns None. The deprecated fallback + mode returns 0 if there are no fonts to subset. Otherwise, it + returns the decrease in fontsize (the difference in fontsize), + measured in bytes. + """ + # Font binaries: - "buffer" -> (names, xrefs, (unicodes, glyphs)) + # An embedded font is uniquely defined by its fontbuffer only. It may have + # multiple names and xrefs. 
+ # Once the sets of used unicodes and glyphs are known, we compute a + # smaller version of the buffer user package fontTools. + + if not fallback: # by default use MuPDF function + pdf = mupdf.pdf_document_from_fz_document(doc) + mupdf.pdf_subset_fonts2(pdf, list(range(doc.page_count))) + return + + font_buffers = {} + + def get_old_widths(xref): + """Retrieve old font '/W' and '/DW' values.""" + df = doc.xref_get_key(xref, "DescendantFonts") + if df[0] != "array": # only handle xref specifications + return None, None + df_xref = int(df[1][1:-1].replace("0 R", "")) + widths = doc.xref_get_key(df_xref, "W") + if widths[0] != "array": # no widths key found + widths = None + else: + widths = widths[1] + dwidths = doc.xref_get_key(df_xref, "DW") + if dwidths[0] != "int": + dwidths = None + else: + dwidths = dwidths[1] + return widths, dwidths + + def set_old_widths(xref, widths, dwidths): + """Restore the old '/W' and '/DW' in subsetted font. + + If either parameter is None or evaluates to False, the corresponding + dictionary key will be set to null. + """ + df = doc.xref_get_key(xref, "DescendantFonts") + if df[0] != "array": # only handle xref specs + return None + df_xref = int(df[1][1:-1].replace("0 R", "")) + if (type(widths) is not str or not widths) and doc.xref_get_key(df_xref, "W")[ + 0 + ] != "null": + doc.xref_set_key(df_xref, "W", "null") + else: + doc.xref_set_key(df_xref, "W", widths) + if (type(dwidths) is not str or not dwidths) and doc.xref_get_key( + df_xref, "DW" + )[0] != "null": + doc.xref_set_key(df_xref, "DW", "null") + else: + doc.xref_set_key(df_xref, "DW", dwidths) + return None + + def set_subset_fontname(new_xref): + """Generate a name prefix to tag a font as subset. + + We use a random generator to select 6 upper case ASCII characters. + The prefixed name must be put in the font xref as the "/BaseFont" value + and in the FontDescriptor object as the '/FontName' value. 
+ """ + # The following generates a prefix like 'ABCDEF+' + import random + import string + prefix = "".join(random.choices(tuple(string.ascii_uppercase), k=6)) + "+" + font_str = doc.xref_object(new_xref, compressed=True) + font_str = font_str.replace("/BaseFont/", "/BaseFont/" + prefix) + df = doc.xref_get_key(new_xref, "DescendantFonts") + if df[0] == "array": + df_xref = int(df[1][1:-1].replace("0 R", "")) + fd = doc.xref_get_key(df_xref, "FontDescriptor") + if fd[0] == "xref": + fd_xref = int(fd[1].replace("0 R", "")) + fd_str = doc.xref_object(fd_xref, compressed=True) + fd_str = fd_str.replace("/FontName/", "/FontName/" + prefix) + doc.update_object(fd_xref, fd_str) + doc.update_object(new_xref, font_str) + + def build_subset(buffer, unc_set, gid_set): + """Build font subset using fontTools. + + Args: + buffer: (bytes) the font given as a binary buffer. + unc_set: (set) required glyph ids. + Returns: + Either None if subsetting is unsuccessful or the subset font buffer. + """ + try: + import fontTools.subset as fts + except ImportError: + if g_exceptions_verbose: exception_info() + message("This method requires fontTools to be installed.") + raise + import tempfile + with tempfile.TemporaryDirectory() as tmp_dir: + oldfont_path = f"{tmp_dir}/oldfont.ttf" + newfont_path = f"{tmp_dir}/newfont.ttf" + uncfile_path = f"{tmp_dir}/uncfile.txt" + args = [ + oldfont_path, + "--retain-gids", + f"--output-file={newfont_path}", + "--layout-features=*", + "--passthrough-tables", + "--ignore-missing-glyphs", + "--ignore-missing-unicodes", + "--symbol-cmap", + ] + + # store glyph ids or unicodes as file + with open(f"{tmp_dir}/uncfile.txt", "w", encoding='utf8') as unc_file: + if 0xFFFD in unc_set: # error unicode exists -> use glyphs + args.append(f"--gids-file={uncfile_path}") + gid_set.add(189) + unc_list = list(gid_set) + for unc in unc_list: + unc_file.write("%i\n" % unc) + else: + args.append(f"--unicodes-file={uncfile_path}") + unc_set.add(255) + unc_list = 
list(unc_set) + for unc in unc_list: + unc_file.write("%04x\n" % unc) + + # store fontbuffer as a file + with open(oldfont_path, "wb") as fontfile: + fontfile.write(buffer) + try: + os.remove(newfont_path) # remove old file + except Exception: + pass + try: # invoke fontTools subsetter + fts.main(args) + font = Font(fontfile=newfont_path) + new_buffer = font.buffer # subset font binary + if font.glyph_count == 0: # intercept empty font + new_buffer = None + except Exception: + exception_info() + new_buffer = None + return new_buffer + + def repl_fontnames(doc): + """Populate 'font_buffers'. + + For each font candidate, store its xref and the list of names + by which PDF text may refer to it (there may be multiple). + """ + + def norm_name(name): + """Recreate font name that contains PDF hex codes. + + E.g. #20 -> space, chr(32) + """ + while "#" in name: + p = name.find("#") + c = int(name[p + 1 : p + 3], 16) + name = name.replace(name[p : p + 3], chr(c)) + return name + + def get_fontnames(doc, item): + """Return a list of fontnames for an item of page.get_fonts(). + + There may be multiple names e.g. for Type0 fonts. 
+ """ + fontname = item[3] + names = [fontname] + fontname = doc.xref_get_key(item[0], "BaseFont")[1][1:] + fontname = norm_name(fontname) + if fontname not in names: + names.append(fontname) + descendents = doc.xref_get_key(item[0], "DescendantFonts") + if descendents[0] != "array": + return names + descendents = descendents[1][1:-1] + if descendents.endswith(" 0 R"): + xref = int(descendents[:-4]) + descendents = doc.xref_object(xref, compressed=True) + p1 = descendents.find("/BaseFont") + if p1 >= 0: + p2 = descendents.find("/", p1 + 1) + p1 = min(descendents.find("/", p2 + 1), descendents.find(">>", p2 + 1)) + fontname = descendents[p2 + 1 : p1] + fontname = norm_name(fontname) + if fontname not in names: + names.append(fontname) + return names + + for i in range(doc.page_count): + for f in doc.get_page_fonts(i, full=True): + font_xref = f[0] # font xref + font_ext = f[1] # font file extension + basename = f[3] # font basename + + if font_ext not in ( # skip if not supported by fontTools + "otf", + "ttf", + "woff", + "woff2", + ): + continue + # skip fonts which already are subsets + if len(basename) > 6 and basename[6] == "+": + continue + + extr = doc.extract_font(font_xref) + fontbuffer = extr[-1] + names = get_fontnames(doc, f) + name_set, xref_set, subsets = font_buffers.get( + fontbuffer, (set(), set(), (set(), set())) + ) + xref_set.add(font_xref) + for name in names: + name_set.add(name) + font = Font(fontbuffer=fontbuffer) + name_set.add(font.name) + del font + font_buffers[fontbuffer] = (name_set, xref_set, subsets) + + def find_buffer_by_name(name): + for buffer, (name_set, _, _) in font_buffers.items(): + if name in name_set: + return buffer + return None + + # ----------------- + # main function + # ----------------- + repl_fontnames(doc) # populate font information + if not font_buffers: # nothing found to do + if verbose: + message(f'No fonts to subset.') + return 0 + + old_fontsize = 0 + new_fontsize = 0 + for fontbuffer in font_buffers.keys(): 
+ old_fontsize += len(fontbuffer) + + # Scan page text for usage of subsettable fonts + for page in doc: + # go through the text and extend set of used glyphs by font + # we use a modified MuPDF trace device, which delivers us glyph ids. + for span in page.get_texttrace(): + if type(span) is not dict: # skip useless information + continue + fontname = span["font"][:33] # fontname for the span + buffer = find_buffer_by_name(fontname) + if buffer is None: + continue + name_set, xref_set, (set_ucs, set_gid) = font_buffers[buffer] + for c in span["chars"]: + set_ucs.add(c[0]) # unicode + set_gid.add(c[1]) # glyph id + font_buffers[buffer] = (name_set, xref_set, (set_ucs, set_gid)) + + # build the font subsets + for old_buffer, (name_set, xref_set, subsets) in font_buffers.items(): + new_buffer = build_subset(old_buffer, subsets[0], subsets[1]) + fontname = list(name_set)[0] + if new_buffer is None or len(new_buffer) >= len(old_buffer): + # subset was not created or did not get smaller + if verbose: + message(f'Cannot subset {fontname!r}.') + continue + if verbose: + message(f"Built subset of font {fontname!r}.") + val = doc._insert_font(fontbuffer=new_buffer) # store subset font in PDF + new_xref = val[0] # get its xref + set_subset_fontname(new_xref) # tag fontname as subset font + font_str = doc.xref_object( # get its object definition + new_xref, + compressed=True, + ) + # walk through the original font xrefs and replace each by the subset def + for font_xref in xref_set: + # we need the original '/W' and '/DW' width values + width_table, def_width = get_old_widths(font_xref) + # ... and replace original font definition at xref with it + doc.update_object(font_xref, font_str) + # now copy over old '/W' and '/DW' values + if width_table or def_width: + set_old_widths(font_xref, width_table, def_width) + # 'new_xref' remains unused in the PDF and must be removed + # by garbage collection. 
+ new_fontsize += len(new_buffer) + + return old_fontsize - new_fontsize + def switch_layer(self, config, as_default=0): """Activate an OC layer.""" pdf = _as_pdf_document(self) @@ -5972,6 +7848,9 @@ compression_effort=compression_effort, ) return bio.getvalue() + + def tobytes(self, *args, **kwargs): + return self.write(*args, **kwargs) @property def xref(self): @@ -5979,6 +7858,41 @@ CheckParent(self) return self.parent.page_xref(self.number) + def xref_copy(doc: 'Document', source: int, target: int, *, keep: list = None) -> None: + """Copy a PDF dictionary object to another one given their xref numbers. + + Args: + doc: PDF document object + source: source xref number + target: target xref number, the xref must already exist + keep: an optional list of 1st level keys in target that should not be + removed before copying. + Notes: + This works similar to the copy() method of dictionaries in Python. The + source may be a stream object. + """ + if doc.xref_is_stream(source): + # read new xref stream, maintaining compression + stream = doc.xref_stream_raw(source) + doc.update_stream( + target, + stream, + compress=False, # keeps source compression + new=True, # in case target is no stream + ) + + # empty the target completely, observe exceptions + if keep is None: + keep = [] + for key in doc.xref_get_keys(target): + if key in keep: + continue + doc.xref_set_key(target, key, "null") + # copy over all source dict items + for key in doc.xref_get_keys(source): + item = doc.xref_get_key(source, key) + doc.xref_set_key(target, key, item[1]) + def xref_get_key(self, xref, key): """Get PDF dict key value of object at 'xref'.""" pdf = _as_pdf_document(self) @@ -6195,7 +8109,6 @@ __slots__ = ('this', 'page_count2', 'this_is_pdf', '__dict__') outline = property(lambda self: self._outline) - tobytes = write is_stream = xref_is_stream open = Document @@ -8734,6 +10647,117 @@ annot._yielded=True yield annot + def apply_redactions( + page: 'Page', + images: int = 2, + graphics: 
int = 1, + text: int = 0, + ) -> bool: + """Apply the redaction annotations of the page. + + Args: + page: the PDF page. + images: + 0 - ignore images + 1 - remove all overlapping images + 2 - blank out overlapping image parts + 3 - remove image unless invisible + graphics: + 0 - ignore graphics + 1 - remove graphics if contained in rectangle + 2 - remove all overlapping graphics + text: + 0 - remove text + 1 - ignore text + """ + + def center_rect(annot_rect, new_text, font, fsize): + """Calculate minimal sub-rectangle for the overlay text. + + Notes: + Because 'insert_textbox' supports no vertical text centering, + we calculate an approximate number of lines here and return a + sub-rect with smaller height, which should still be sufficient. + Args: + annot_rect: the annotation rectangle + new_text: the text to insert. + font: the fontname. Must be one of the CJK or Base-14 set, else + the rectangle is returned unchanged. + fsize: the fontsize + Returns: + A rectangle to use instead of the annot rectangle. 
+ """ + if not new_text or annot_rect.width <= EPSILON: + return annot_rect + try: + text_width = get_text_length(new_text, font, fsize) + except (ValueError, mupdf.FzErrorBase): # unsupported font + if g_exceptions_verbose: + exception_info() + return annot_rect + line_height = fsize * 1.2 + limit = annot_rect.width + h = math.ceil(text_width / limit) * line_height # estimate rect height + if h >= annot_rect.height: + return annot_rect + r = annot_rect + y = (annot_rect.tl.y + annot_rect.bl.y - h) * 0.5 + r.y0 = y + return r + + CheckParent(page) + doc = page.parent + if doc.is_encrypted or doc.is_closed: + raise ValueError("document closed or encrypted") + if not doc.is_pdf: + raise ValueError("is no PDF") + + redact_annots = [] # storage of annot values + for annot in page.annots( + types=(mupdf.PDF_ANNOT_REDACT,) # pylint: disable=no-member + ): + # loop redactions + redact_annots.append(annot._get_redact_values()) # save annot values + + if redact_annots == []: # any redactions on this page? 
+ return False # no redactions + + rc = page._apply_redactions(text, images, graphics) # call MuPDF + if not rc: # should not happen really + raise ValueError("Error applying redactions.") + + # now write replacement text in old redact rectangles + shape = page.new_shape() + for redact in redact_annots: + annot_rect = redact["rect"] + fill = redact["fill"] + if fill: + shape.draw_rect(annot_rect) # colorize the rect background + shape.finish(fill=fill, color=fill) + if "text" in redact.keys(): # if we also have text + new_text = redact["text"] + align = redact.get("align", 0) + fname = redact["fontname"] + fsize = redact["fontsize"] + color = redact["text_color"] + # try finding vertical centered sub-rect + trect = center_rect(annot_rect, new_text, fname, fsize) + + rc = -1 + while rc < 0 and fsize >= 4: # while not enough room + # (re-) try insertion + rc = shape.insert_textbox( + trect, + new_text, + fontname=fname, + fontsize=fsize, + color=color, + align=align, + ) + fsize -= 0.5 # reduce font if unsuccessful + shape.commit() # append new contents object + return True + def recolor(self, components=1): """Convert colorspaces of objects on the page. @@ -8842,6 +10866,19 @@ annot._erase() return val + def delete_image(page: 'Page', xref: int): + """Delete the image referred to by xref. + + Actually replaces by a small transparent Pixmap using method Page.replace_image. + + Args: + xref: xref of the image to delete. 
+ """ + # make a small 100% transparent pixmap (of just any dimension) + pix = Pixmap(csGRAY, (0, 0, 1, 1), 1) + pix.clear_with() # clear all samples bytes to 0x00 + page.replace_image(xref, pixmap=pix) + def delete_link(self, linkdict): """Delete a Link.""" CheckParent(self) @@ -8886,6 +10923,20 @@ return finished() + def delete_widget(page: 'Page', widget: Widget) -> Widget: + """Delete widget from page and return the next one.""" + CheckParent(page) + annot = getattr(widget, "_annot", None) + if annot is None: + raise ValueError("bad type: widget") + nextwidget = widget.next + page.delete_annot(annot) + widget._annot.parent = None + keylist = list(widget.__dict__.keys()) + for key in keylist: + del widget.__dict__[key] + return nextwidget + @property def derotation_matrix(self) -> Matrix: """Reflects page de-rotation.""" @@ -8896,6 +10947,408 @@ return Matrix(mupdf.FzRect(mupdf.FzRect.UNIT)) return Matrix(JM_derotate_page_matrix(pdfpage)) + def draw_bezier( + page: 'Page', + p1: point_like, + p2: point_like, + p3: point_like, + p4: point_like, + color: OptSeq = (0,), + fill: OptSeq = None, + dashes: OptStr = None, + width: float = 1, + morph: OptStr = None, + closePath: bool = False, + lineCap: int = 0, + lineJoin: int = 0, + overlay: bool = True, + stroke_opacity: float = 1, + fill_opacity: float = 1, + oc: int = 0, + ) -> Point: + """Draw a general cubic Bezier curve from p1 to p4 using control points p2 and p3.""" + img = page.new_shape() + Q = img.draw_bezier(Point(p1), Point(p2), Point(p3), Point(p4)) + img.finish( + color=color, + fill=fill, + dashes=dashes, + width=width, + lineCap=lineCap, + lineJoin=lineJoin, + morph=morph, + closePath=closePath, + stroke_opacity=stroke_opacity, + fill_opacity=fill_opacity, + oc=oc, + ) + img.commit(overlay) + + return Q + + def draw_circle( + page: 'Page', + center: point_like, + radius: float, + color: OptSeq = (0,), + fill: OptSeq = None, + morph: OptSeq = None, + dashes: OptStr = None, + width: float = 1, + lineCap: 
int = 0, + lineJoin: int = 0, + overlay: bool = True, + stroke_opacity: float = 1, + fill_opacity: float = 1, + oc: int = 0, + ) -> Point: + """Draw a circle given its center and radius.""" + img = page.new_shape() + Q = img.draw_circle(Point(center), radius) + img.finish( + color=color, + fill=fill, + dashes=dashes, + width=width, + lineCap=lineCap, + lineJoin=lineJoin, + morph=morph, + stroke_opacity=stroke_opacity, + fill_opacity=fill_opacity, + oc=oc, + ) + img.commit(overlay) + return Q + + def draw_curve( + page: 'Page', + p1: point_like, + p2: point_like, + p3: point_like, + color: OptSeq = (0,), + fill: OptSeq = None, + dashes: OptStr = None, + width: float = 1, + morph: OptSeq = None, + closePath: bool = False, + lineCap: int = 0, + lineJoin: int = 0, + overlay: bool = True, + stroke_opacity: float = 1, + fill_opacity: float = 1, + oc: int = 0, + ) -> Point: + """Draw a special Bezier curve from p1 to p3, generating control points on lines p1 to p2 and p2 to p3.""" + img = page.new_shape() + Q = img.draw_curve(Point(p1), Point(p2), Point(p3)) + img.finish( + color=color, + fill=fill, + dashes=dashes, + width=width, + lineCap=lineCap, + lineJoin=lineJoin, + morph=morph, + closePath=closePath, + stroke_opacity=stroke_opacity, + fill_opacity=fill_opacity, + oc=oc, + ) + img.commit(overlay) + + return Q + + def draw_line( + page: 'Page', + p1: point_like, + p2: point_like, + color: OptSeq = (0,), + dashes: OptStr = None, + width: float = 1, + lineCap: int = 0, + lineJoin: int = 0, + overlay: bool = True, + morph: OptSeq = None, + stroke_opacity: float = 1, + fill_opacity: float = 1, + oc=0, + ) -> Point: + """Draw a line from point p1 to point p2.""" + img = page.new_shape() + p = img.draw_line(Point(p1), Point(p2)) + img.finish( + color=color, + dashes=dashes, + width=width, + closePath=False, + lineCap=lineCap, + lineJoin=lineJoin, + morph=morph, + stroke_opacity=stroke_opacity, + fill_opacity=fill_opacity, + oc=oc, + ) + img.commit(overlay) + + return p + + 
def draw_oval( + page: 'Page', + rect: typing.Union[rect_like, quad_like], + color: OptSeq = (0,), + fill: OptSeq = None, + dashes: OptStr = None, + morph: OptSeq = None, + width: float = 1, + lineCap: int = 0, + lineJoin: int = 0, + overlay: bool = True, + stroke_opacity: float = 1, + fill_opacity: float = 1, + oc: int = 0, + ) -> Point: + """Draw an oval given its containing rectangle or quad.""" + img = page.new_shape() + Q = img.draw_oval(rect) + img.finish( + color=color, + fill=fill, + dashes=dashes, + width=width, + lineCap=lineCap, + lineJoin=lineJoin, + morph=morph, + stroke_opacity=stroke_opacity, + fill_opacity=fill_opacity, + oc=oc, + ) + img.commit(overlay) + + return Q + + def draw_polyline( + page: 'Page', + points: list, + color: OptSeq = (0,), + fill: OptSeq = None, + dashes: OptStr = None, + width: float = 1, + morph: OptSeq = None, + lineCap: int = 0, + lineJoin: int = 0, + overlay: bool = True, + closePath: bool = False, + stroke_opacity: float = 1, + fill_opacity: float = 1, + oc: int = 0, + ) -> Point: + """Draw multiple connected line segments.""" + img = page.new_shape() + Q = img.draw_polyline(points) + img.finish( + color=color, + fill=fill, + dashes=dashes, + width=width, + lineCap=lineCap, + lineJoin=lineJoin, + morph=morph, + closePath=closePath, + stroke_opacity=stroke_opacity, + fill_opacity=fill_opacity, + oc=oc, + ) + img.commit(overlay) + + return Q + + def draw_quad( + page: 'Page', + quad: quad_like, + color: OptSeq = (0,), + fill: OptSeq = None, + dashes: OptStr = None, + width: float = 1, + lineCap: int = 0, + lineJoin: int = 0, + morph: OptSeq = None, + overlay: bool = True, + stroke_opacity: float = 1, + fill_opacity: float = 1, + oc: int = 0, + ) -> Point: + """Draw a quadrilateral.""" + img = page.new_shape() + Q = img.draw_quad(Quad(quad)) + img.finish( + color=color, + fill=fill, + dashes=dashes, + width=width, + lineCap=lineCap, + lineJoin=lineJoin, + morph=morph, + stroke_opacity=stroke_opacity, + 
fill_opacity=fill_opacity, + oc=oc, + ) + img.commit(overlay) + + return Q + + def draw_rect( + page: 'Page', + rect: rect_like, + color: OptSeq = (0,), + fill: OptSeq = None, + dashes: OptStr = None, + width: float = 1, + lineCap: int = 0, + lineJoin: int = 0, + morph: OptSeq = None, + overlay: bool = True, + stroke_opacity: float = 1, + fill_opacity: float = 1, + oc: int = 0, + radius=None, + ) -> Point: + ''' + Draw a rectangle. See Shape class method for details. + ''' + img = page.new_shape() + Q = img.draw_rect(Rect(rect), radius=radius) + img.finish( + color=color, + fill=fill, + dashes=dashes, + width=width, + lineCap=lineCap, + lineJoin=lineJoin, + morph=morph, + stroke_opacity=stroke_opacity, + fill_opacity=fill_opacity, + oc=oc, + ) + img.commit(overlay) + + return Q + + def draw_sector( + page: 'Page', + center: point_like, + point: point_like, + beta: float, + color: OptSeq = (0,), + fill: OptSeq = None, + dashes: OptStr = None, + fullSector: bool = True, + morph: OptSeq = None, + width: float = 1, + closePath: bool = False, + lineCap: int = 0, + lineJoin: int = 0, + overlay: bool = True, + stroke_opacity: float = 1, + fill_opacity: float = 1, + oc: int = 0, + ) -> Point: + """Draw a circle sector given circle center, one arc end point and the angle of the arc. 
+ + Parameters: + center -- center of circle + point -- arc end point + beta -- angle of arc (degrees) + fullSector -- connect arc ends with center + """ + img = page.new_shape() + Q = img.draw_sector(Point(center), Point(point), beta, fullSector=fullSector) + img.finish( + color=color, + fill=fill, + dashes=dashes, + width=width, + lineCap=lineCap, + lineJoin=lineJoin, + morph=morph, + closePath=closePath, + stroke_opacity=stroke_opacity, + fill_opacity=fill_opacity, + oc=oc, + ) + img.commit(overlay) + + return Q + + def draw_squiggle( + page: 'Page', + p1: point_like, + p2: point_like, + breadth: float = 2, + color: OptSeq = (0,), + dashes: OptStr = None, + width: float = 1, + lineCap: int = 0, + lineJoin: int = 0, + overlay: bool = True, + morph: OptSeq = None, + stroke_opacity: float = 1, + fill_opacity: float = 1, + oc: int = 0, + ) -> Point: + """Draw a squiggly line from point p1 to point p2.""" + img = page.new_shape() + p = img.draw_squiggle(Point(p1), Point(p2), breadth=breadth) + img.finish( + color=color, + dashes=dashes, + width=width, + closePath=False, + lineCap=lineCap, + lineJoin=lineJoin, + morph=morph, + stroke_opacity=stroke_opacity, + fill_opacity=fill_opacity, + oc=oc, + ) + img.commit(overlay) + + return p + + def draw_zigzag( + page: 'Page', + p1: point_like, + p2: point_like, + breadth: float = 2, + color: OptSeq = (0,), + dashes: OptStr = None, + width: float = 1, + lineCap: int = 0, + lineJoin: int = 0, + overlay: bool = True, + morph: OptSeq = None, + stroke_opacity: float = 1, + fill_opacity: float = 1, + oc: int = 0, + ) -> Point: + """Draw a zigzag line from point p1 to point p2.""" + img = page.new_shape() + p = img.draw_zigzag(Point(p1), Point(p2), breadth=breadth) + img.finish( + color=color, + dashes=dashes, + width=width, + closePath=False, + lineCap=lineCap, + lineJoin=lineJoin, + morph=morph, + stroke_opacity=stroke_opacity, + fill_opacity=fill_opacity, + oc=oc, + ) + img.commit(overlay) + + return p + def 
extend_textpage(self, tpage, flags=0, matrix=None): page = self.this tp = tpage.this @@ -9218,6 +11671,168 @@ val = None return paths + def get_image_info( + page: 'Page', + hashes: bool = False, + xrefs: bool = False + ) -> list: + """Extract image information only from a pymupdf.TextPage. + + Args: + hashes: (bool) include MD5 hash for each image. + xrefs: (bool) try to find the xref for each image. Sets hashes to true. + """ + doc = page.parent + if xrefs and doc.is_pdf: + hashes = True + if not doc.is_pdf: + xrefs = False + imginfo = getattr(page, "_image_info", None) + if imginfo and not xrefs: + return imginfo + if not imginfo: + tp = page.get_textpage(flags=TEXT_PRESERVE_IMAGES) + imginfo = tp.extractIMGINFO(hashes=hashes) + del tp + if hashes: + page._image_info = imginfo + if not xrefs or not doc.is_pdf: + return imginfo + imglist = page.get_images() + digests = {} + for item in imglist: + xref = item[0] + pix = Pixmap(doc, xref) + digests[pix.digest] = xref + del pix + for i in range(len(imginfo)): + item = imginfo[i] + xref = digests.get(item["digest"], 0) + item["xref"] = xref + imginfo[i] = item + return imginfo + + def get_image_rects(page: 'Page', name, transform=False) -> list: + """Return list of image positions on a page. + + Args: + name: (str, list, int) image identification. May be reference name, an + item of the page's image list or an xref. + transform: (bool) whether to also return the transformation matrix. + Returns: + A list of pymupdf.Rect objects or tuples of (pymupdf.Rect, pymupdf.Matrix) + for all image locations on the page. 
+ """ + if type(name) in (list, tuple): + xref = name[0] + elif type(name) is int: + xref = name + else: + imglist = [i for i in page.get_images() if i[7] == name] + if imglist == []: + raise ValueError("bad image name") + elif len(imglist) != 1: + raise ValueError("multiple image names found") + xref = imglist[0][0] + pix = Pixmap(page.parent, xref) # make pixmap of the image to compute MD5 + digest = pix.digest + del pix + infos = page.get_image_info(hashes=True) + if not transform: + bboxes = [Rect(im["bbox"]) for im in infos if im["digest"] == digest] + else: + bboxes = [ + (Rect(im["bbox"]), Matrix(im["transform"])) + for im in infos + if im["digest"] == digest + ] + return bboxes + + def get_label(page): + """Return the label for this PDF page. + + Args: + page: page object. + Returns: + The label (str) of the page. Errors return an empty string. + """ + # Jorj McKie, 2021-01-06 + + labels = page.parent._get_page_labels() + if not labels: + return "" + labels.sort() + return utils.get_label_pno(page.number, labels) + + def get_links(page: 'Page') -> list: + """Create a list of all links contained in a PDF page. + + Notes: + see PyMuPDF documentation for details. + """ + + CheckParent(page) + ln = page.first_link + links = [] + while ln: + nl = utils.getLinkDict(ln, page.parent) + links.append(nl) + ln = ln.next + if links != [] and page.parent.is_pdf: + linkxrefs = [x for x in + #page.annot_xrefs() + JM_get_annot_xref_list2(page) + if x[1] == mupdf.PDF_ANNOT_LINK # pylint: disable=no-member + ] + if len(linkxrefs) == len(links): + for i in range(len(linkxrefs)): + links[i]["xref"] = linkxrefs[i][0] + links[i]["id"] = linkxrefs[i][2] + return links + + def get_pixmap( + page: 'Page', + *, + matrix: matrix_like=Identity, + dpi=None, + colorspace: Colorspace=None, + clip: rect_like=None, + alpha: bool=False, + annots: bool=True, + ) -> 'Pixmap': + """Create pixmap of page. + + Keyword args: + matrix: Matrix for transformation (default: Identity). 
+ dpi: desired dots per inch. If given, matrix is ignored. + colorspace: (str/Colorspace) cmyk, rgb, gray - case ignored, default csRGB. + clip: (irect-like) restrict rendering to this area. + alpha: (bool) whether to include alpha channel + annots: (bool) whether to also render annotations + """ + if colorspace is None: + colorspace = csRGB + if dpi: + zoom = dpi / 72 + matrix = Matrix(zoom, zoom) + + if type(colorspace) is str: + if colorspace.upper() == "GRAY": + colorspace = csGRAY + elif colorspace.upper() == "CMYK": + colorspace = csCMYK + else: + colorspace = csRGB + if colorspace.n not in (1, 3, 4): + raise ValueError("unsupported colorspace") + + dl = page.get_displaylist(annots=annots) + pix = dl.get_pixmap(matrix=matrix, colorspace=colorspace, alpha=alpha, clip=clip) + dl = None + if dpi: + pix.set_dpi(dpi, dpi) + return pix + def remove_rotation(self): """Set page rotation to 0 while maintaining visual appearance.""" rot = self.rotation # normalized rotation value @@ -9503,6 +12118,21 @@ del tp return rc + def get_text(self, *args, **kwargs): + return utils.get_text(self, *args, **kwargs) + + def get_text_blocks(self, *args, **kwargs): + return utils.get_text_blocks(self, *args, **kwargs) + + def get_text_selection(self, *args, **kwargs): + return utils.get_text_selection(self, *args, **kwargs) + + def get_text_words(self, *args, **kwargs): + return utils.get_text_words(self, *args, **kwargs) + + def get_textpage_ocr(self, *args, **kwargs): + return utils.get_textpage_ocr(self, *args, **kwargs) + def get_textpage(self, clip: rect_like = None, flags: int = 0, matrix=None) -> "TextPage": CheckParent(self) if matrix is None: @@ -9628,6 +12258,406 @@ doc.get_char_widths(xref, fontdict=fontdict) return xref + def insert_htmlbox( + page, + rect, + text, + *, + css=None, + scale_low=0, + archive=None, + rotate=0, + oc=0, + opacity=1, + overlay=True, + _scale_word_width=True, + _verbose=False, + ) -> tuple: + """Insert text with optional HTML tags and stylings 
into a rectangle. + + Args: + rect: (rect-like) rectangle into which the text should be placed. + text: (str) text with optional HTML tags and stylings. + css: (str) CSS styling commands. + scale_low: (float) force-fit content by scaling it down. Must be in + range [0, 1]. If 1, no scaling will take place. If 0, arbitrary + down-scaling is acceptable. A value of 0.1 would mean that content + may be scaled down by at most 90%. + archive: Archive object pointing to locations of used fonts or images + rotate: (int) rotate the text in the box by a multiple of 90 degrees. + oc: (int) the xref of an OCG / OCMD (Optional Content). + opacity: (float) set opacity of inserted content. + overlay: (bool) put text on top of page content. + _scale_word_width: internal, for testing only. + _verbose: internal, for testing only. + Returns: + A tuple of floats (spare_height, scale). + spare_height: + The height of the remaining space in <rect> below the + text, or -1 if we failed to fit. + scale: + The scaling required; `0 < scale <= 1`. + Will be less than `scale_low` if we failed to fit. 
+ """ + # normalize rotation angle + if not rotate % 90 == 0: + raise ValueError("bad rotation angle") + while rotate < 0: + rotate += 360 + while rotate >= 360: + rotate -= 360 + + if not 0 <= scale_low <= 1: + raise ValueError("'scale_low' must be in [0, 1]") + + if css is None: + css = "" + + rect = Rect(rect) + if rotate in (90, 270): + temp_rect = Rect(0, 0, rect.height, rect.width) + else: + temp_rect = Rect(0, 0, rect.width, rect.height) + + # use a small border by default + mycss = "body {margin:1px;}" + css # append user CSS + + # either make a story, or accept a given one + if isinstance(text, str): # if a string, convert to a Story + story = Story(html=text, user_css=mycss, archive=archive) + elif isinstance(text, Story): + story = text + else: + raise ValueError("'text' must be a string or a Story") + + # ---------------------------------------------------------------- + # Find a scaling factor that lets our story fit in. Instead of scaling + # the text smaller, we instead look at how much bigger the rect needs + # to be to fit the text, then reverse the scaling to get how much we + # need to scale down the text. + # ---------------------------------------------------------------- + rect_scale_max = None if scale_low == 0 else 1 / scale_low + + fit = story.fit_scale( + temp_rect, + scale_min=1, + scale_max=rect_scale_max, + flags=mupdf.FZ_PLACE_STORY_FLAG_NO_OVERFLOW if _scale_word_width else 0, + verbose=_verbose, + ) + + if not fit.big_enough: # there was no fit + scale = 1 / fit.parameter + return (-1, scale) + + # fit.filled is a tuple; we convert it in place to a Rect for + # convenience. (fit.rect is already a Rect.) 
+ fit.filled = Rect(fit.filled) + assert (fit.rect.x0, fit.rect.y0) == (0, 0) + assert (fit.filled.x0, fit.filled.y0) == (0, 0) + + scale = 1 / fit.parameter + assert scale >= scale_low, f'{scale_low=} {scale=}' + + spare_height = max((fit.rect.y1 - fit.filled.y1) * scale, 0) + + def rect_function(*args): + return fit.rect, fit.rect, None + + # draw story on temp PDF page + doc = story.write_with_links(rect_function) + + # Insert opacity if requested. + # For this, we prepend a command to the /Contents. + if 0 <= opacity < 1: + tpage = doc[0] # load page + # generate /ExtGstate for the page + alp0 = tpage._set_opacity(CA=opacity, ca=opacity) + s = f"/{alp0} gs\n" # generate graphic state command + TOOLS._insert_contents(tpage, s.encode(), 0) + + # put result in target page + page.show_pdf_page(rect, doc, 0, rotate=rotate, oc=oc, overlay=overlay) + + # ------------------------------------------------------------------------- + # re-insert links in target rect (show_pdf_page cannot copy annotations) + # ------------------------------------------------------------------------- + # scaled center point of fit.rect + mp1 = (fit.rect.tl + fit.rect.br) / 2 * scale + + # center point of target rect + mp2 = (rect.tl + rect.br) / 2 + + # compute link positioning matrix: + # - move center of scaled-down fit.rect to (0,0) + # - rotate + # - move (0,0) to center of target rect + mat = ( + Matrix(scale, 0, 0, scale, -mp1.x, -mp1.y) + * Matrix(-rotate) + * Matrix(1, 0, 0, 1, mp2.x, mp2.y) + ) + + # copy over links + for link in doc[0].get_links(): + link["from"] *= mat + page.insert_link(link) + + return spare_height, scale + + def insert_image( + page, + rect, + *, + alpha=-1, + filename=None, + height=0, + keep_proportion=True, + mask=None, + oc=0, + overlay=True, + pixmap=None, + rotate=0, + stream=None, + width=0, + xref=0, + ): + """Insert an image for display in a rectangle. + + Args: + rect: (rect_like) position of image on the page. 
+ alpha: (int, optional) set to 0 if image has no transparency. + filename: (str, Path, file object) image filename. + height: (int) + keep_proportion: (bool) keep width / height ratio (default). + mask: (bytes, optional) image consisting of alpha values to use. + oc: (int) xref of OCG or OCMD to declare as Optional Content. + overlay: (bool) put in foreground (default) or background. + pixmap: (pymupdf.Pixmap) use this as image. + rotate: (int) rotate by 0, 90, 180 or 270 degrees. + stream: (bytes) use this as image. + width: (int) + xref: (int) use this as image. + + 'page' and 'rect' are positional, all other parameters are keywords. + + If 'xref' is given, that image is used. Other input options are ignored. + Else, exactly one of pixmap, stream or filename must be given. + + 'alpha=0' for non-transparent images improves performance significantly. + Affects stream and filename only. + + Optimum transparent insertions are possible by using filename / stream in + conjunction with a 'mask' image of alpha values. + + Returns: + xref (int) of inserted image. Re-use as argument for multiple insertions. 
+ """ + CheckParent(page) + doc = page.parent + if not doc.is_pdf: + raise ValueError("is no PDF") + + if xref == 0 and (bool(filename) + bool(stream) + bool(pixmap) != 1): + raise ValueError("xref=0 needs exactly one of filename, pixmap, stream") + + if filename: + if type(filename) is str: + pass + elif hasattr(filename, "absolute"): + filename = str(filename) + elif hasattr(filename, "name"): + filename = filename.name + else: + raise ValueError("bad filename") + + if filename and not os.path.exists(filename): + raise FileNotFoundError("No such file: '%s'" % filename) + elif stream and type(stream) not in (bytes, bytearray, io.BytesIO): + raise ValueError("stream must be bytes-like / BytesIO") + elif pixmap and type(pixmap) is not Pixmap: + raise ValueError("pixmap must be a Pixmap") + if mask and not (stream or filename): + raise ValueError("mask requires stream or filename") + if mask and type(mask) not in (bytes, bytearray, io.BytesIO): + raise ValueError("mask must be bytes-like / BytesIO") + while rotate < 0: + rotate += 360 + while rotate >= 360: + rotate -= 360 + if rotate not in (0, 90, 180, 270): + raise ValueError("bad rotate value") + + r = Rect(rect) + if r.is_empty or r.is_infinite: + raise ValueError("rect must be finite and not empty") + clip = r * ~page.transformation_matrix + + # Create a unique image reference name. 
+ ilst = [i[7] for i in doc.get_page_images(page.number)] + ilst += [i[1] for i in doc.get_page_xobjects(page.number)] + ilst += [i[4] for i in doc.get_page_fonts(page.number)] + n = "fzImg" # 'pymupdf image' + i = 0 + _imgname = n + "0" # first name candidate + while _imgname in ilst: + i += 1 + _imgname = n + str(i) # try new name + + if overlay: + page.wrap_contents() # ensure a balanced graphics state + digests = doc.InsertedImages + xref, digests = page._insert_image( + filename=filename, + pixmap=pixmap, + stream=stream, + imask=mask, + clip=clip, + overlay=overlay, + oc=oc, + xref=xref, + rotate=rotate, + keep_proportion=keep_proportion, + width=width, + height=height, + alpha=alpha, + _imgname=_imgname, + digests=digests, + ) + if digests is not None: + doc.InsertedImages = digests + + return xref + + def insert_link(page: 'Page', lnk: dict, mark: bool = True) -> None: + """Insert a new link for the current page.""" + CheckParent(page) + annot = utils.getLinkText(page, lnk) + if annot == "": + raise ValueError("link kind not supported") + page._addAnnot_FromString((annot,)) + + def insert_text( + page: 'Page', + point: point_like, + text: typing.Union[str, list], + *, + fontsize: float = 11, + lineheight: OptFloat = None, + fontname: str = "helv", + fontfile: OptStr = None, + set_simple: int = 0, + encoding: int = 0, + color: OptSeq = None, + fill: OptSeq = None, + border_width: float = 0.05, + miter_limit: float = 1, + render_mode: int = 0, + rotate: int = 0, + morph: OptSeq = None, + overlay: bool = True, + stroke_opacity: float = 1, + fill_opacity: float = 1, + oc: int = 0, + ): + + img = page.new_shape() + rc = img.insert_text( + point, + text, + fontsize=fontsize, + lineheight=lineheight, + fontname=fontname, + fontfile=fontfile, + set_simple=set_simple, + encoding=encoding, + color=color, + fill=fill, + border_width=border_width, + render_mode=render_mode, + miter_limit=miter_limit, + rotate=rotate, + morph=morph, + stroke_opacity=stroke_opacity, + 
fill_opacity=fill_opacity, + oc=oc, + ) + if rc >= 0: + img.commit(overlay) + return rc + + def insert_textbox( + page: 'Page', + rect: rect_like, + buffer: typing.Union[str, list], + *, + fontname: str = "helv", + fontfile: OptStr = None, + set_simple: int = 0, + encoding: int = 0, + fontsize: float = 11, + lineheight: OptFloat = None, + color: OptSeq = None, + fill: OptSeq = None, + expandtabs: int = 1, + align: int = 0, + rotate: int = 0, + render_mode: int = 0, + miter_limit: float = 1, + border_width: float = 0.05, + morph: OptSeq = None, + overlay: bool = True, + stroke_opacity: float = 1, + fill_opacity: float = 1, + oc: int = 0, + ) -> float: + """Insert text into a given rectangle. + + Notes: + Creates a Shape object, uses its same-named method and commits it. + Parameters: + rect: (rect-like) area to use for text. + buffer: text to be inserted + fontname: a Base-14 font, font name or '/name' + fontfile: name of a font file + fontsize: font size + lineheight: overwrite the font property + color: RGB color triple + expandtabs: handles tabulators with string function + align: left, center, right, justified + rotate: 0, 90, 180, or 270 degrees + morph: morph box with a matrix and a fixpoint + overlay: put text in foreground or background + Returns: + unused or deficit rectangle area (float) + """ + img = page.new_shape() + rc = img.insert_textbox( + rect, + buffer, + fontsize=fontsize, + lineheight=lineheight, + fontname=fontname, + fontfile=fontfile, + set_simple=set_simple, + encoding=encoding, + color=color, + fill=fill, + expandtabs=expandtabs, + render_mode=render_mode, + miter_limit=miter_limit, + border_width=border_width, + align=align, + rotate=rotate, + morph=morph, + stroke_opacity=stroke_opacity, + fill_opacity=fill_opacity, + oc=oc, + ) + if rc >= 0: + img.commit(overlay) + return rc + @property def is_wrapped(self): """Check if /Contents is in a balanced graphics state.""" @@ -9740,6 +12770,9 @@ def mediabox_size(self): return 
Point(self.mediabox.x1, self.mediabox.y1) + def new_shape(self): + return Shape(self) + #@property #def parent( self): # assert self._parent @@ -9759,6 +12792,44 @@ # fixme this looks wrong. self.this = page + def replace_image( + page: 'Page', + xref: int, + *, + filename=None, + pixmap=None, + stream=None, + ): + """Replace the image referred to by xref. + + Replace the image by changing the object definition stored under xref. This + will leave the pages appearance instructions intact, so the new image is + being displayed with the same bbox, rotation etc. + By providing a small fully transparent image, an effect as if the image had + been deleted can be achieved. + A typical use may include replacing large images by a smaller version, + e.g. with a lower resolution or graylevel instead of colored. + + Args: + xref: the xref of the image to replace. + filename, pixmap, stream: exactly one of these must be provided. The + meaning being the same as in Page.insert_image. + """ + doc = page.parent # the owning document + if not doc.xref_is_image(xref): + raise ValueError("xref not an image") # insert new image anywhere in page + if bool(filename) + bool(stream) + bool(pixmap) != 1: + raise ValueError("Exactly one of filename/stream/pixmap must be given") + new_xref = page.insert_image( + page.rect, filename=filename, stream=stream, pixmap=pixmap + ) + doc.xref_copy(new_xref, xref) # copy over new to old + last_contents_xref = page.get_contents()[-1] + # new image insertion has created a new /Contents source, + # which we will set to spaces now + doc.update_stream(last_contents_xref, b" ") + page._image_info = None # clear cache of extracted image information + @property def rotation(self): """Page rotation.""" @@ -9780,6 +12851,47 @@ CheckParent(self) mupdf.fz_run_page(self.this, dw.device, JM_matrix_from_py(m), mupdf.FzCookie()) + def search_for( + page, + text, + *, + clip=None, + quads=False, + flags=None, + textpage=None, + ) -> list: + """Search for a string on 
a page. + + Args: + text: string to be searched for + clip: restrict search to this rectangle + quads: (bool) return quads instead of rectangles + flags: bit switches, default: join hyphened words + textpage: a pre-created pymupdf.TextPage + Returns: + a list of rectangles or quads, each containing one occurrence. + """ + if flags is None: + flags=(0 + | TEXT_DEHYPHENATE + | TEXT_PRESERVE_WHITESPACE + | TEXT_PRESERVE_LIGATURES + | TEXT_MEDIABOX_CLIP + ) + if clip is not None: + clip = Rect(clip) + + CheckParent(page) + tp = textpage + if tp is None: + tp = page.get_textpage(clip=clip, flags=flags) # create pymupdf.TextPage + elif getattr(tp, "parent") != page: + raise ValueError("not a textpage of this page") + rlist = tp.search(text, quads=quads) + if textpage is None: + del tp + return rlist + def set_artbox(self, rect): """Set the ArtBox.""" return self._set_pagebox("ArtBox", rect) @@ -9847,6 +12959,130 @@ """Set the TrimBox.""" return self._set_pagebox("TrimBox", rect) + def show_pdf_page( + page, + rect, + docsrc, + pno=0, + keep_proportion=True, + overlay=True, + oc=0, + rotate=0, + clip=None, + ) -> int: + """Show page number 'pno' of PDF 'docsrc' in rectangle 'rect'. + + Args: + rect: (rect-like) where to place the source image + docsrc: (document) source PDF + pno: (int) source page number + keep_proportion: (bool) do not change width-height-ratio + overlay: (bool) put in foreground + oc: (xref) make visibility dependent on this OCG / OCMD (which must be defined in the target PDF) + rotate: (int) degrees (multiple of 90) + clip: (rect-like) part of source page rectangle + Returns: + xref of inserted object (for reuse) + """ + def calc_matrix(sr, tr, keep=True, rotate=0): + """Calculate transformation matrix from source to target rect. + + Notes: + The product of four matrices in this sequence: (1) translate correct + source corner to origin, (2) rotate, (3) scale, (4) translate to + target's top-left corner. + Args: + sr: source rect in PDF (!) 
coordinate system + tr: target rect in PDF coordinate system + keep: whether to keep source ratio of width to height + rotate: rotation angle in degrees + Returns: + Transformation matrix. + """ + # calc center point of source rect + smp = (sr.tl + sr.br) / 2.0 + # calc center point of target rect + tmp = (tr.tl + tr.br) / 2.0 + + # m moves to (0, 0), then rotates + m = Matrix(1, 0, 0, 1, -smp.x, -smp.y) * Matrix(rotate) + + sr1 = sr * m # resulting source rect to calculate scale factors + + fw = tr.width / sr1.width # scale the width + fh = tr.height / sr1.height # scale the height + if keep: + fw = fh = min(fw, fh) # take min if keeping aspect ratio + + m *= Matrix(fw, fh) # concat scale matrix + m *= Matrix(1, 0, 0, 1, tmp.x, tmp.y) # concat move to target center + return JM_TUPLE(m) + + CheckParent(page) + doc = page.parent + + if not doc.is_pdf or not docsrc.is_pdf: + raise ValueError("is no PDF") + + if rect.is_empty or rect.is_infinite: + raise ValueError("rect must be finite and not empty") + + while pno < 0: # support negative page numbers + pno += docsrc.page_count + src_page = docsrc[pno] # load source page + + tar_rect = rect * ~page.transformation_matrix # target rect in PDF coordinates + + src_rect = src_page.rect if not clip else src_page.rect & clip # source rect + if src_rect.is_empty or src_rect.is_infinite: + raise ValueError("clip must be finite and not empty") + src_rect = src_rect * ~src_page.transformation_matrix # ... 
in PDF coord + + matrix = calc_matrix(src_rect, tar_rect, keep=keep_proportion, rotate=rotate) + + # list of existing /Form /XObjects + ilst = [i[1] for i in doc.get_page_xobjects(page.number)] + ilst += [i[7] for i in doc.get_page_images(page.number)] + ilst += [i[4] for i in doc.get_page_fonts(page.number)] + + # create a name not in that list + n = "fzFrm" + i = 0 + _imgname = n + "0" + while _imgname in ilst: + i += 1 + _imgname = n + str(i) + + isrc = docsrc._graft_id # used as key for graftmaps + if doc._graft_id == isrc: + raise ValueError("source document must not equal target") + + # retrieve / make Graftmap for source PDF + gmap = doc.Graftmaps.get(isrc, None) + if gmap is None: + gmap = Graftmap(doc) + doc.Graftmaps[isrc] = gmap + + # take note of generated xref for automatic reuse + pno_id = (isrc, pno) # id of docsrc[pno] + xref = doc.ShownPages.get(pno_id, 0) + + if overlay: + page.wrap_contents() # ensure a balanced graphics state + xref = page._show_pdf_page( + src_page, + overlay=overlay, + matrix=matrix, + xref=xref, + oc=oc, + clip=src_rect, + graftmap=gmap, + _imgname=_imgname, + ) + doc.ShownPages[pno_id] = xref + + return xref + @property def transformation_matrix(self): """Page transformation matrix.""" @@ -9875,6 +13111,15 @@ mb = self.mediabox return Rect(rect[0], mb.y1 - rect[3], rect[2], mb.y1 - rect[1]) + def update_link(page: 'Page', lnk: dict) -> None: + """Update a link on the current page.""" + CheckParent(page) + annot = utils.getLinkText(page, lnk) + if annot == "": + raise ValueError("link kind not supported") + + page.parent.update_object(lnk["xref"], annot, page=page) + def widgets(self, types=None): """ Generator over the widgets of a page. 
@@ -9902,6 +13147,57 @@ append = b"\nQ" * pop + b"\n" TOOLS._insert_contents(self, append, True) + def write_text( + page: 'Page', + rect=None, + writers=None, + overlay=True, + color=None, + opacity=None, + keep_proportion=True, + rotate=0, + oc=0, + ) -> None: + """Write the text of one or more pymupdf.TextWriter objects. + + Args: + rect: target rectangle. If None, the union of the text writers is used. + writers: one or more pymupdf.TextWriter objects. + overlay: put in foreground or background. + keep_proportion: maintain aspect ratio of rectangle sides. + rotate: arbitrary rotation angle. + oc: the xref of an optional content object + """ + assert isinstance(page, Page) + if not writers: + raise ValueError("need at least one pymupdf.TextWriter") + if type(writers) is TextWriter: + if rotate == 0 and rect is None: + writers.write_text(page, opacity=opacity, color=color, overlay=overlay) + return None + else: + writers = (writers,) + clip = writers[0].text_rect + textdoc = Document() + tpage = textdoc.new_page(width=page.rect.width, height=page.rect.height) + for writer in writers: + clip |= writer.text_rect + writer.write_text(tpage, opacity=opacity, color=color) + if rect is None: + rect = clip + page.show_pdf_page( + rect, + textdoc, + 0, + overlay=overlay, + keep_proportion=keep_proportion, + rotate=rotate, + clip=clip, + oc=oc, + ) + textdoc = None + tpage = None + @property def xref(self): """PDF xref number of page.""" @@ -11502,6 +14798,996 @@ tr = top_right +class Shape: + """Create a new shape.""" + + @staticmethod + def horizontal_angle(C, P): + """Return the angle to the horizontal for the connection from C to P. + This uses the arcus sine function and resolves its inherent ambiguity by + looking up in which quadrant vector S = P - C is located. 
+ """ + S = Point(P - C).unit # unit vector 'C' -> 'P' + alfa = math.asin(abs(S.y)) # absolute angle from horizontal + if S.x < 0: # make arcsin result unique + if S.y <= 0: # bottom-left + alfa = -(math.pi - alfa) + else: # top-left + alfa = math.pi - alfa + else: + if S.y >= 0: # top-right + pass + else: # bottom-right + alfa = -alfa + return alfa + + def __init__(self, page: Page): + CheckParent(page) + self.page = page + self.doc = page.parent + if not self.doc.is_pdf: + raise ValueError("is no PDF") + self.height = page.mediabox_size.y + self.width = page.mediabox_size.x + self.x = page.cropbox_position.x + self.y = page.cropbox_position.y + + self.pctm = page.transformation_matrix # page transf. matrix + self.ipctm = ~self.pctm # inverted transf. matrix + + self.draw_cont = "" + self.text_cont = "" + self.totalcont = "" + self.last_point = None + self.rect = None + + def updateRect(self, x): + if self.rect is None: + if len(x) == 2: + self.rect = Rect(x, x) + else: + self.rect = Rect(x) + + else: + if len(x) == 2: + x = Point(x) + self.rect.x0 = min(self.rect.x0, x.x) + self.rect.y0 = min(self.rect.y0, x.y) + self.rect.x1 = max(self.rect.x1, x.x) + self.rect.y1 = max(self.rect.y1, x.y) + else: + x = Rect(x) + self.rect.x0 = min(self.rect.x0, x.x0) + self.rect.y0 = min(self.rect.y0, x.y0) + self.rect.x1 = max(self.rect.x1, x.x1) + self.rect.y1 = max(self.rect.y1, x.y1) + + def draw_line(self, p1: point_like, p2: point_like) -> Point: + """Draw a line between two points.""" + p1 = Point(p1) + p2 = Point(p2) + if not (self.last_point == p1): + self.draw_cont += _format_g(JM_TUPLE(p1 * self.ipctm)) + " m\n" + self.last_point = p1 + self.updateRect(p1) + + self.draw_cont += _format_g(JM_TUPLE(p2 * self.ipctm)) + " l\n" + self.updateRect(p2) + self.last_point = p2 + return self.last_point + + def draw_polyline(self, points: list) -> Point: + """Draw several connected line segments.""" + for i, p in enumerate(points): + if i == 0: + if not (self.last_point == 
Point(p)): + self.draw_cont += _format_g(JM_TUPLE(Point(p) * self.ipctm)) + " m\n" + self.last_point = Point(p) + else: + self.draw_cont += _format_g(JM_TUPLE(Point(p) * self.ipctm)) + " l\n" + self.updateRect(p) + + self.last_point = Point(points[-1]) + return self.last_point + + def draw_bezier( + self, + p1: point_like, + p2: point_like, + p3: point_like, + p4: point_like, + ) -> Point: + """Draw a standard cubic Bezier curve.""" + p1 = Point(p1) + p2 = Point(p2) + p3 = Point(p3) + p4 = Point(p4) + if not (self.last_point == p1): + self.draw_cont += _format_g(JM_TUPLE(p1 * self.ipctm)) + " m\n" + args = JM_TUPLE(list(p2 * self.ipctm) + list(p3 * self.ipctm) + list(p4 * self.ipctm)) + self.draw_cont += _format_g(args) + " c\n" + self.updateRect(p1) + self.updateRect(p2) + self.updateRect(p3) + self.updateRect(p4) + self.last_point = p4 + return self.last_point + + def draw_oval(self, tetra: typing.Union[quad_like, rect_like]) -> Point: + """Draw an ellipse inside a tetrapod.""" + if len(tetra) != 4: + raise ValueError("invalid arg length") + if hasattr(tetra[0], "__float__"): + q = Rect(tetra).quad + else: + q = Quad(tetra) + + mt = q.ul + (q.ur - q.ul) * 0.5 + mr = q.ur + (q.lr - q.ur) * 0.5 + mb = q.ll + (q.lr - q.ll) * 0.5 + ml = q.ul + (q.ll - q.ul) * 0.5 + if not (self.last_point == ml): + self.draw_cont += _format_g(JM_TUPLE(ml * self.ipctm)) + " m\n" + self.last_point = ml + self.draw_curve(ml, q.ll, mb) + self.draw_curve(mb, q.lr, mr) + self.draw_curve(mr, q.ur, mt) + self.draw_curve(mt, q.ul, ml) + self.updateRect(q.rect) + self.last_point = ml + return self.last_point + + def draw_circle(self, center: point_like, radius: float) -> Point: + """Draw a circle given its center and radius.""" + if not radius > EPSILON: + raise ValueError("radius must be positive") + center = Point(center) + p1 = center - (radius, 0) + return self.draw_sector(center, p1, 360, fullSector=False) + + def draw_curve( + self, + p1: point_like, + p2: point_like, + p3: point_like, + 
) -> Point: + """Draw a curve between points using one control point.""" + kappa = 0.55228474983 + p1 = Point(p1) + p2 = Point(p2) + p3 = Point(p3) + k1 = p1 + (p2 - p1) * kappa + k2 = p3 + (p2 - p3) * kappa + return self.draw_bezier(p1, k1, k2, p3) + + def draw_sector( + self, + center: point_like, + point: point_like, + beta: float, + fullSector: bool = True, + ) -> Point: + """Draw a circle sector.""" + center = Point(center) + point = Point(point) + l3 = lambda a, b: _format_g((a, b)) + " m\n" + l4 = lambda a, b, c, d, e, f: _format_g((a, b, c, d, e, f)) + " c\n" + l5 = lambda a, b: _format_g((a, b)) + " l\n" + betar = math.radians(-beta) + w360 = math.radians(math.copysign(360, betar)) * (-1) + w90 = math.radians(math.copysign(90, betar)) + w45 = w90 / 2 + while abs(betar) > 2 * math.pi: + betar += w360 # bring angle below 360 degrees + if not (self.last_point == point): + self.draw_cont += l3(*JM_TUPLE(point * self.ipctm)) + self.last_point = point + Q = Point(0, 0) # just make sure it exists + C = center + P = point + S = P - C # vector 'center' -> 'point' + rad = abs(S) # circle radius + + if not rad > EPSILON: + raise ValueError("radius must be positive") + + alfa = self.horizontal_angle(center, point) + while abs(betar) > abs(w90): # draw 90 degree arcs + q1 = C.x + math.cos(alfa + w90) * rad + q2 = C.y + math.sin(alfa + w90) * rad + Q = Point(q1, q2) # the arc's end point + r1 = C.x + math.cos(alfa + w45) * rad / math.cos(w45) + r2 = C.y + math.sin(alfa + w45) * rad / math.cos(w45) + R = Point(r1, r2) # crossing point of tangents + kappah = (1 - math.cos(w45)) * 4 / 3 / abs(R - Q) + kappa = kappah * abs(P - Q) + cp1 = P + (R - P) * kappa # control point 1 + cp2 = Q + (R - Q) * kappa # control point 2 + self.draw_cont += l4(*JM_TUPLE( + list(cp1 * self.ipctm) + list(cp2 * self.ipctm) + list(Q * self.ipctm) + )) + + betar -= w90 # reduce param angle by 90 deg + alfa += w90 # advance start angle by 90 deg + P = Q # advance to arc end point + # draw 
(remaining) arc + if abs(betar) > 1e-3: # significant degrees left? + beta2 = betar / 2 + q1 = C.x + math.cos(alfa + betar) * rad + q2 = C.y + math.sin(alfa + betar) * rad + Q = Point(q1, q2) # the arc's end point + r1 = C.x + math.cos(alfa + beta2) * rad / math.cos(beta2) + r2 = C.y + math.sin(alfa + beta2) * rad / math.cos(beta2) + R = Point(r1, r2) # crossing point of tangents + # kappa height is 4/3 of segment height + kappah = (1 - math.cos(beta2)) * 4 / 3 / abs(R - Q) # kappa height + kappa = kappah * abs(P - Q) / (1 - math.cos(betar)) + cp1 = P + (R - P) * kappa # control point 1 + cp2 = Q + (R - Q) * kappa # control point 2 + self.draw_cont += l4(*JM_TUPLE( + list(cp1 * self.ipctm) + list(cp2 * self.ipctm) + list(Q * self.ipctm) + )) + if fullSector: + self.draw_cont += l3(*JM_TUPLE(point * self.ipctm)) + self.draw_cont += l5(*JM_TUPLE(center * self.ipctm)) + self.draw_cont += l5(*JM_TUPLE(Q * self.ipctm)) + self.last_point = Q + return self.last_point + + def draw_rect(self, rect: rect_like, *, radius=None) -> Point: + """Draw a rectangle. + + Args: + radius: if not None, the rectangle will have rounded corners. + This is the radius of the curvature, given as percentage of + the rectangle width or height. Valid are values 0 < v <= 0.5. + For a sequence of two values, the corners will have different + radii. Otherwise, the percentage will be computed from the + shorter side. A value of (0.5, 0.5) will draw an ellipse. + """ + r = Rect(rect) + if radius is None: # standard rectangle + self.draw_cont += _format_g(JM_TUPLE( + list(r.bl * self.ipctm) + [r.width, r.height] + )) + " re\n" + self.updateRect(r) + self.last_point = r.tl + return self.last_point + # rounded corners requested. 
This requires 1 or 2 values, each + # with 0 < value <= 0.5 + if hasattr(radius, "__float__"): + if radius <= 0 or radius > 0.5: + raise ValueError(f"bad radius value {radius}.") + d = min(r.width, r.height) * radius + px = (d, 0) + py = (0, d) + elif hasattr(radius, "__len__") and len(radius) == 2: + rx, ry = radius + px = (rx * r.width, 0) + py = (0, ry * r.height) + if min(rx, ry) <= 0 or max(rx, ry) > 0.5: + raise ValueError(f"bad radius value {radius}.") + else: + raise ValueError(f"bad radius value {radius}.") + + lp = self.draw_line(r.tl + py, r.bl - py) + lp = self.draw_curve(lp, r.bl, r.bl + px) + + lp = self.draw_line(lp, r.br - px) + lp = self.draw_curve(lp, r.br, r.br - py) + + lp = self.draw_line(lp, r.tr + py) + lp = self.draw_curve(lp, r.tr, r.tr - px) + + lp = self.draw_line(lp, r.tl + px) + self.last_point = self.draw_curve(lp, r.tl, r.tl + py) + + self.updateRect(r) + return self.last_point + + def draw_quad(self, quad: quad_like) -> Point: + """Draw a Quad.""" + q = Quad(quad) + return self.draw_polyline([q.ul, q.ll, q.lr, q.ur, q.ul]) + + def draw_zigzag( + self, + p1: point_like, + p2: point_like, + breadth: float = 2, + ) -> Point: + """Draw a zig-zagged line from p1 to p2.""" + p1 = Point(p1) + p2 = Point(p2) + S = p2 - p1 # vector start - end + rad = abs(S) # distance of points + cnt = 4 * int(round(rad / (4 * breadth), 0)) # always take full phases + if cnt < 4: + raise ValueError("points too close") + mb = rad / cnt # revised breadth + matrix = Matrix(util_hor_matrix(p1, p2)) # normalize line to x-axis + i_mat = ~matrix # get original position + points = [] # stores edges + for i in range(1, cnt): + if i % 4 == 1: # point "above" connection + p = Point(i, -1) * mb + elif i % 4 == 3: # point "below" connection + p = Point(i, 1) * mb + else: # ignore others + continue + points.append(p * i_mat) + self.draw_polyline([p1] + points + [p2]) # add start and end points + return p2 + + def draw_squiggle( + self, + p1: point_like, + p2: point_like, 
+ breadth=2, + ) -> Point: + """Draw a squiggly line from p1 to p2.""" + p1 = Point(p1) + p2 = Point(p2) + S = p2 - p1 # vector start - end + rad = abs(S) # distance of points + cnt = 4 * int(round(rad / (4 * breadth), 0)) # always take full phases + if cnt < 4: + raise ValueError("points too close") + mb = rad / cnt # revised breadth + matrix = Matrix(util_hor_matrix(p1, p2)) # normalize line to x-axis + i_mat = ~matrix # get original position + k = 2.4142135623765633 # y of draw_curve helper point + + points = [] # stores edges + for i in range(1, cnt): + if i % 4 == 1: # point "above" connection + p = Point(i, -k) * mb + elif i % 4 == 3: # point "below" connection + p = Point(i, k) * mb + else: # else on connection line + p = Point(i, 0) * mb + points.append(p * i_mat) + + points = [p1] + points + [p2] + cnt = len(points) + i = 0 + while i + 2 < cnt: + self.draw_curve(points[i], points[i + 1], points[i + 2]) + i += 2 + return p2 + + # ============================================================================== + # Shape.insert_text + # ============================================================================== + def insert_text( + self, + point: point_like, + buffer: typing.Union[str, list], + *, + fontsize: float = 11, + lineheight: OptFloat = None, + fontname: str = "helv", + fontfile: OptStr = None, + set_simple: bool = 0, + encoding: int = 0, + color: OptSeq = None, + fill: OptSeq = None, + render_mode: int = 0, + border_width: float = 0.05, + miter_limit: float = 1, + rotate: int = 0, + morph: OptSeq = None, + stroke_opacity: float = 1, + fill_opacity: float = 1, + oc: int = 0, + ) -> int: + + # ensure 'text' is a list of strings, worth dealing with + if not bool(buffer): + return 0 + + if type(buffer) not in (list, tuple): + text = buffer.splitlines() + else: + text = buffer + + if not len(text) > 0: + return 0 + + point = Point(point) + try: + maxcode = max([ord(c) for c in " ".join(text)]) + except Exception: + exception_info() + return 0 + + # 
ensure valid 'fontname' + fname = fontname + if fname.startswith("/"): + fname = fname[1:] + + xref = self.page.insert_font( + fontname=fname, fontfile=fontfile, encoding=encoding, set_simple=set_simple + ) + fontinfo = CheckFontInfo(self.doc, xref) + + fontdict = fontinfo[1] + ordering = fontdict["ordering"] + simple = fontdict["simple"] + bfname = fontdict["name"] + ascender = fontdict["ascender"] + descender = fontdict["descender"] + if lineheight: + lheight = fontsize * lineheight + elif ascender - descender <= 1: + lheight = fontsize * 1.2 + else: + lheight = fontsize * (ascender - descender) + + if maxcode > 255: + glyphs = self.doc.get_char_widths(xref, maxcode + 1) + else: + glyphs = fontdict["glyphs"] + + tab = [] + for t in text: + if simple and bfname not in ("Symbol", "ZapfDingbats"): + g = None + else: + g = glyphs + tab.append(getTJstr(t, g, simple, ordering)) + text = tab + + color_str = ColorCode(color, "c") + fill_str = ColorCode(fill, "f") + if not fill and render_mode == 0: # ensure fill color when 0 Tr + fill = color + fill_str = ColorCode(color, "f") + + morphing = CheckMorph(morph) + rot = rotate + if rot % 90 != 0: + raise ValueError("bad rotate value") + + while rot < 0: + rot += 360 + rot = rot % 360 # text rotate = 0, 90, 270, 180 + + templ1 = lambda a, b, c, d, e, f, g: f"\nq\n{a}{b}BT\n{c}1 0 0 1 {_format_g((d, e))} Tm\n/{f} {_format_g(g)} Tf " + templ2 = lambda a: f"TJ\n0 -{_format_g(a)} TD\n" + cmp90 = "0 1 -1 0 0 0 cm\n" # rotates 90 deg counter-clockwise + cmm90 = "0 -1 1 0 0 0 cm\n" # rotates 90 deg clockwise + cm180 = "-1 0 0 -1 0 0 cm\n" # rotates by 180 deg. 
+ height = self.height + width = self.width + + # setting up for standard rotation directions + # case rotate = 0 + if morphing: + m1 = Matrix(1, 0, 0, 1, morph[0].x + self.x, height - morph[0].y - self.y) + mat = ~m1 * morph[1] * m1 + cm = _format_g(JM_TUPLE(mat)) + " cm\n" + else: + cm = "" + top = height - point.y - self.y # start of 1st char + left = point.x + self.x # start of 1. char + space = top # space available + #headroom = point.y + self.y # distance to page border + if rot == 90: + left = height - point.y - self.y + top = -point.x - self.x + cm += cmp90 + space = width - abs(top) + #headroom = point.x + self.x + + elif rot == 270: + left = -height + point.y + self.y + top = point.x + self.x + cm += cmm90 + space = abs(top) + #headroom = width - point.x - self.x + + elif rot == 180: + left = -point.x - self.x + top = -height + point.y + self.y + cm += cm180 + space = abs(point.y + self.y) + #headroom = height - point.y - self.y + + optcont = self.page._get_optional_content(oc) + if optcont is not None: + bdc = "/OC /%s BDC\n" % optcont + emc = "EMC\n" + else: + bdc = emc = "" + + alpha = self.page._set_opacity(CA=stroke_opacity, ca=fill_opacity) + if alpha is None: + alpha = "" + else: + alpha = "/%s gs\n" % alpha + nres = templ1(bdc, alpha, cm, left, top, fname, fontsize) + + if render_mode > 0: + nres += "%i Tr " % render_mode + nres += _format_g(border_width * fontsize) + " w " + if miter_limit is not None: + nres += _format_g(miter_limit) + " M " + if color is not None: + nres += color_str + if fill is not None: + nres += fill_str + + # ========================================================================= + # start text insertion + # ========================================================================= + nres += text[0] + nlines = 1 # set output line counter + if len(text) > 1: + nres += templ2(lheight) # line 1 + else: + nres += 'TJ' + for i in range(1, len(text)): + if space < lheight: + break # no space left on page + if i > 1: + nres += 
"\nT* " + nres += text[i] + 'TJ' + space -= lheight + nlines += 1 + + nres += "\nET\n%sQ\n" % emc + + # ========================================================================= + # end of text insertion + # ========================================================================= + # update the /Contents object + self.text_cont += nres + return nlines + + # ============================================================================== + # Shape.insert_textbox + # ============================================================================== + def insert_textbox( + self, + rect: rect_like, + buffer: typing.Union[str, list], + *, + fontname: OptStr = "helv", + fontfile: OptStr = None, + fontsize: float = 11, + lineheight: OptFloat = None, + set_simple: bool = 0, + encoding: int = 0, + color: OptSeq = None, + fill: OptSeq = None, + expandtabs: int = 1, + border_width: float = 0.05, + miter_limit: float = 1, + align: int = 0, + render_mode: int = 0, + rotate: int = 0, + morph: OptSeq = None, + stroke_opacity: float = 1, + fill_opacity: float = 1, + oc: int = 0, + ) -> float: + """Insert text into a given rectangle. 
+ + Args: + rect -- the textbox to fill + buffer -- text to be inserted + fontname -- a Base-14 font, font name or '/name' + fontfile -- name of a font file + fontsize -- font size + lineheight -- overwrite the font property + color -- RGB stroke color triple + fill -- RGB fill color triple + render_mode -- text rendering control + border_width -- thickness of glyph borders as percentage of fontsize + expandtabs -- handles tabulators with string function + align -- left, center, right, justified + rotate -- 0, 90, 180, or 270 degrees + morph -- morph box with a matrix and a fixpoint + Returns: + unused or deficit rectangle area (float) + """ + rect = Rect(rect) + if rect.is_empty or rect.is_infinite: + raise ValueError("text box must be finite and not empty") + + color_str = ColorCode(color, "c") + fill_str = ColorCode(fill, "f") + if fill is None and render_mode == 0: # ensure fill color for 0 Tr + fill = color + fill_str = ColorCode(color, "f") + + optcont = self.page._get_optional_content(oc) + if optcont is not None: + bdc = "/OC /%s BDC\n" % optcont + emc = "EMC\n" + else: + bdc = emc = "" + + # determine opacity / transparency + alpha = self.page._set_opacity(CA=stroke_opacity, ca=fill_opacity) + if alpha is None: + alpha = "" + else: + alpha = "/%s gs\n" % alpha + + if rotate % 90 != 0: + raise ValueError("rotate must be multiple of 90") + + rot = rotate + while rot < 0: + rot += 360 + rot = rot % 360 + + # is buffer worth of dealing with? + if not bool(buffer): + return rect.height if rot in (0, 180) else rect.width + + cmp90 = "0 1 -1 0 0 0 cm\n" # rotates counter-clockwise + cmm90 = "0 -1 1 0 0 0 cm\n" # rotates clockwise + cm180 = "-1 0 0 -1 0 0 cm\n" # rotates by 180 deg. 
+ height = self.height + + fname = fontname + if fname.startswith("/"): + fname = fname[1:] + + xref = self.page.insert_font( + fontname=fname, fontfile=fontfile, encoding=encoding, set_simple=set_simple + ) + fontinfo = CheckFontInfo(self.doc, xref) + + fontdict = fontinfo[1] + ordering = fontdict["ordering"] + simple = fontdict["simple"] + glyphs = fontdict["glyphs"] + bfname = fontdict["name"] + ascender = fontdict["ascender"] + descender = fontdict["descender"] + + if lineheight: + lheight_factor = lineheight + elif ascender - descender <= 1: + lheight_factor = 1.2 + else: + lheight_factor = ascender - descender + lheight = fontsize * lheight_factor + + # create a list from buffer, split into its lines + if type(buffer) in (list, tuple): + t0 = "\n".join(buffer) + else: + t0 = buffer + + maxcode = max([ord(c) for c in t0]) + # replace invalid char codes for simple fonts + if simple and maxcode > 255: + t0 = "".join([c if ord(c) < 256 else "?" for c in t0]) + + t0 = t0.splitlines() + + glyphs = self.doc.get_char_widths(xref, maxcode + 1) + if simple and bfname not in ("Symbol", "ZapfDingbats"): + tj_glyphs = None + else: + tj_glyphs = glyphs + + # ---------------------------------------------------------------------- + # calculate pixel length of a string + # ---------------------------------------------------------------------- + def pixlen(x): + """Calculate pixel length of x.""" + if ordering < 0: + return sum([glyphs[ord(c)][1] for c in x]) * fontsize + else: + return len(x) * fontsize + + # --------------------------------------------------------------------- + + if ordering < 0: + blen = glyphs[32][1] * fontsize # pixel size of space character + else: + blen = fontsize + + text = "" # output buffer + + if CheckMorph(morph): + m1 = Matrix( + 1, 0, 0, 1, morph[0].x + self.x, self.height - morph[0].y - self.y + ) + mat = ~m1 * morph[1] * m1 + cm = _format_g(JM_TUPLE(mat)) + " cm\n" + else: + cm = "" + + # 
--------------------------------------------------------------------- + # adjust for text orientation / rotation + # --------------------------------------------------------------------- + progr = 1 # direction of line progress + c_pnt = Point(0, fontsize * ascender) # used for line progress + if rot == 0: # normal orientation + point = rect.tl + c_pnt # line 1 is 'lheight' below top + maxwidth = rect.width # pixels available in one line + maxheight = rect.height # available text height + + elif rot == 90: # rotate counter clockwise + c_pnt = Point(fontsize * ascender, 0) # progress in x-direction + point = rect.bl + c_pnt # line 1 'lheight' away from left + maxwidth = rect.height # pixels available in one line + maxheight = rect.width # available text height + cm += cmp90 + + elif rot == 180: # text upside down + # progress upwards in y direction + c_pnt = -Point(0, fontsize * ascender) + point = rect.br + c_pnt # line 1 'lheight' above bottom + maxwidth = rect.width # pixels available in one line + progr = -1 # subtract lheight for next line + maxheight =rect.height # available text height + cm += cm180 + + else: # rotate clockwise (270 or -90) + # progress from right to left + c_pnt = -Point(fontsize * ascender, 0) + point = rect.tr + c_pnt # line 1 'lheight' left of right + maxwidth = rect.height # pixels available in one line + progr = -1 # subtract lheight for next line + maxheight = rect.width # available text height + cm += cmm90 + + # ===================================================================== + # line loop + # ===================================================================== + just_tab = [] # 'justify' indicators per line + + for i, line in enumerate(t0): + line_t = line.expandtabs(expandtabs).split(" ") # split into words + num_words = len(line_t) + lbuff = "" # init line buffer + rest = maxwidth # available line pixels + # ================================================================= + # word loop + # 
================================================================= + for j in range(num_words): + word = line_t[j] + pl_w = pixlen(word) # pixel len of word + if rest >= pl_w: # does it fit on the line? + lbuff += word + " " # yes, append word + rest -= pl_w + blen # update available line space + continue # next word + + # word doesn't fit - output line (if not empty) + if lbuff: + lbuff = lbuff.rstrip() + "\n" # line full, append line break + text += lbuff # append to total text + just_tab.append(True) # can align-justify + + lbuff = "" # re-init line buffer + rest = maxwidth # re-init avail. space + + if pl_w <= maxwidth: # word shorter than 1 line? + lbuff = word + " " # start the line with it + rest = maxwidth - pl_w - blen # update free space + continue + + # long word: split across multiple lines - char by char ... + if len(just_tab) > 0: + just_tab[-1] = False # cannot align-justify + for c in word: + if pixlen(lbuff) <= maxwidth - pixlen(c): + lbuff += c + else: # line full + lbuff += "\n" # close line + text += lbuff # append to text + just_tab.append(False) # cannot align-justify + lbuff = c # start new line with this char + + lbuff += " " # finish long word + rest = maxwidth - pixlen(lbuff) # long word stored + + if lbuff: # unprocessed line content? + text += lbuff.rstrip() # append to text + just_tab.append(False) # cannot align-justify + + if i < len(t0) - 1: # not the last line? 
+ text += "\n" # insert line break + + # compute used part of the textbox + if text.endswith("\n"): + text = text[:-1] + lb_count = text.count("\n") + 1 # number of lines written + + # text height = line count * line height plus one descender value + text_height = lheight * lb_count - descender * fontsize + + more = text_height - maxheight # difference to height limit + if more > EPSILON: # landed too much outside rect + return (-1) * more # return deficit, don't output + + more = abs(more) + if more < EPSILON: + more = 0 # don't bother with epsilons + nres = "\nq\n%s%sBT\n" % (bdc, alpha) + cm # initialize output buffer + templ = lambda a, b, c, d: f"1 0 0 1 {_format_g((a, b))} Tm /{c} {_format_g(d)} Tf " + # center, right, justify: output each line with its own specifics + text_t = text.splitlines() # split text in lines again + just_tab[-1] = False # never justify last line + for i, t in enumerate(text_t): + spacing = 0 + pl = maxwidth - pixlen(t) # length of empty line part + pnt = point + c_pnt * (i * lheight_factor) # text start of line + if align == 1: # center: right shift by half width + if rot in (0, 180): + pnt = pnt + Point(pl / 2, 0) * progr + else: + pnt = pnt - Point(0, pl / 2) * progr + elif align == 2: # right: right shift by full width + if rot in (0, 180): + pnt = pnt + Point(pl, 0) * progr + else: + pnt = pnt - Point(0, pl) * progr + elif align == 3: # justify + spaces = t.count(" ") # number of spaces in line + if spaces > 0 and just_tab[i]: # if any, and we may justify + spacing = pl / spaces # make every space this much larger + else: + spacing = 0 # keep normal space length + top = height - pnt.y - self.y + left = pnt.x + self.x + if rot == 90: + left = height - pnt.y - self.y + top = -pnt.x - self.x + elif rot == 270: + left = -height + pnt.y + self.y + top = pnt.x + self.x + elif rot == 180: + left = -pnt.x - self.x + top = -height + pnt.y + self.y + + nres += templ(left, top, fname, fontsize) + + if render_mode > 0: + nres += "%i Tr " % 
render_mode + nres += _format_g(border_width * fontsize) + " w " + if miter_limit is not None: + nres += _format_g(miter_limit) + " M " + + if align == 3: + nres += _format_g(spacing) + " Tw " + + if color is not None: + nres += color_str + if fill is not None: + nres += fill_str + nres += "%sTJ\n" % getTJstr(t, tj_glyphs, simple, ordering) + + nres += "ET\n%sQ\n" % emc + + self.text_cont += nres + self.updateRect(rect) + return more + + def finish( + self, + width: float = 1, + color: OptSeq = (0,), + fill: OptSeq = None, + lineCap: int = 0, + lineJoin: int = 0, + dashes: OptStr = None, + even_odd: bool = False, + morph: OptSeq = None, + closePath: bool = True, + fill_opacity: float = 1, + stroke_opacity: float = 1, + oc: int = 0, + ) -> None: + """Finish the current drawing segment. + + Notes: + Apply colors, opacity, dashes, line style and width, or + morphing. Also whether to close the path + by connecting last to first point. + """ + if self.draw_cont == "": # treat empty contents as no-op + return + + if width == 0: # border color makes no sense then + color = None + elif color is None: # vice versa + width = 0 + # if color == None and fill == None: + # raise ValueError("at least one of 'color' or 'fill' must be given") + color_str = ColorCode(color, "c") # ensure proper color string + fill_str = ColorCode(fill, "f") # ensure proper fill string + + optcont = self.page._get_optional_content(oc) + if optcont is not None: + self.draw_cont = "/OC /%s BDC\n" % optcont + self.draw_cont + emc = "EMC\n" + else: + emc = "" + + alpha = self.page._set_opacity(CA=stroke_opacity, ca=fill_opacity) + if alpha is not None: + self.draw_cont = "/%s gs\n" % alpha + self.draw_cont + + if width != 1 and width != 0: + self.draw_cont += _format_g(width) + " w\n" + + if lineCap != 0: + self.draw_cont = "%i J\n" % lineCap + self.draw_cont + if lineJoin != 0: + self.draw_cont = "%i j\n" % lineJoin + self.draw_cont + + if dashes not in (None, "", "[] 0"): + self.draw_cont = "%s d\n" % 
dashes + self.draw_cont + + if closePath: + self.draw_cont += "h\n" + self.last_point = None + + if color is not None: + self.draw_cont += color_str + + if fill is not None: + self.draw_cont += fill_str + if color is not None: + if not even_odd: + self.draw_cont += "B\n" + else: + self.draw_cont += "B*\n" + else: + if not even_odd: + self.draw_cont += "f\n" + else: + self.draw_cont += "f*\n" + else: + self.draw_cont += "S\n" + + self.draw_cont += emc + if CheckMorph(morph): + m1 = Matrix( + 1, 0, 0, 1, morph[0].x + self.x, self.height - morph[0].y - self.y + ) + mat = ~m1 * morph[1] * m1 + self.draw_cont = _format_g(JM_TUPLE(mat)) + " cm\n" + self.draw_cont + + self.totalcont += "\nq\n" + self.draw_cont + "Q\n" + self.draw_cont = "" + self.last_point = None + return + + def commit(self, overlay: bool = True) -> None: + """Update the page's /Contents object with Shape data. + + The argument controls whether data appear in foreground (default) + or background. + """ + CheckParent(self.page) # doc may have died meanwhile + self.totalcont += self.text_cont + self.totalcont = self.totalcont.encode() + + if self.totalcont: + if overlay: + self.page.wrap_contents() # ensure a balanced graphics state + # make /Contents object with dummy stream + xref = TOOLS._insert_contents(self.page, b" ", overlay) + # update it with potential compression + self.doc.update_stream(xref, self.totalcont) + + self.last_point = None # clean up ... + self.rect = None # + self.draw_cont = "" # for potential ... + self.text_cont = "" # ... + self.totalcont = "" # re-use + + class Story: def __init__( self, html='', user_css=None, em=12, archive=None): @@ -11663,10 +15949,13 @@ function( position2) mupdf.fz_story_positions( self.this, function2) - def place( self, where): + def place( self, where, flags=0): + ''' + Wrapper for fz_place_story_flags(). 
+ ''' where = JM_rect_from_py( where) filled = mupdf.FzRect() - more = mupdf.fz_place_story( self.this, where, filled) + more = mupdf.fz_place_story_flags( self.this, where, filled, flags) return more, JM_py_from_rect( filled) def reset( self): @@ -11783,7 +16072,9 @@ `big_enough`: `True` if the fit succeeded. `filled`: - From the last call to `Story.place()`. + Tuple (x0, y0, x1, y1) from the last call to `Story.place()`. This + will be wider than .rect if any single word (which we never split) + was too wide for .rect. `more`: `False` if the fit succeeded. `numcalls`: @@ -11791,7 +16082,7 @@ `parameter`: The successful parameter value, or the largest failing value. `rect`: - The rect created from `parameter`. + The pumupdf.Rect created from `parameter`. ''' def __init__(self, big_enough=None, filled=None, more=None, numcalls=None, parameter=None, rect=None): self.big_enough = big_enough @@ -11811,7 +16102,7 @@ f' rect={self.rect}' ) - def fit(self, fn, pmin=None, pmax=None, delta=0.001, verbose=False): + def fit(self, fn, pmin=None, pmax=None, delta=0.001, verbose=False, flags=0): ''' Finds optimal rect that contains the story `self`. @@ -11838,6 +16129,9 @@ Maximum error in returned `parameter`. :arg verbose: If true we output diagnostics. + :arg flags: + Passed to mupdf.fz_place_story_flags(). e.g. + zero or `mupdf.FZ_PLACE_STORY_FLAG_NO_OVERFLOW`. 
''' def log(text): assert verbose @@ -11893,7 +16187,7 @@ if verbose: log(f'update(): not calling self.place() because rect is empty.') else: - more, filled = self.place(rect) + more, filled = self.place(rect, flags) state.numcalls += 1 big_enough = not more result = Story.FitResult( @@ -11962,12 +16256,12 @@ parameter = (state.pmin + state.pmax) / 2 update(parameter) - def fit_scale(self, rect, scale_min=0, scale_max=None, delta=0.001, verbose=False): + def fit_scale(self, rect, scale_min=0, scale_max=None, delta=0.001, verbose=False, flags=0): ''' Finds smallest value `scale` in range `scale_min..scale_max` where `scale * rect` is large enough to contain the story `self`. - Returns a `Story.FitResult` instance. + Returns a `Story.FitResult` instance with `.parameter` set to `scale`. :arg width: width of rect. @@ -11982,13 +16276,15 @@ Maximum error in returned scale. :arg verbose: If true we output diagnostics. + :arg flags: + Passed to Story.place(). ''' x0, y0, x1, y1 = rect width = x1 - x0 height = y1 - y0 def fn(scale): return Rect(x0, y0, x0 + scale*width, y0 + scale*height) - return self.fit(fn, scale_min, scale_max, delta, verbose) + return self.fit(fn, scale_min, scale_max, delta, verbose, flags) def fit_height(self, width, height_min=0, height_max=None, origin=(0, 0), delta=0.001, verbose=False): ''' @@ -12315,6 +16611,10 @@ and not mupdf.fz_is_infinite_rect(tp_rect) ): continue + + if buflen == 0 and ch.m_internal.c == 0x200d: + # ZERO WIDTH JOINER cannot start a word + continue word_delimiter = JM_is_word_delimiter(ch.m_internal.c, delimiters) this_char_rtl = JM_is_rtl_char(ch.m_internal.c) if word_delimiter or this_char_rtl != last_char_rtl: @@ -12515,6 +16815,232 @@ text = " ".join(words) return text + def fill_textbox( + writer: 'TextWriter', + rect: rect_like, + text: typing.Union[str, list], + pos: point_like = None, + font: typing.Optional[Font] = None, + fontsize: float = 11, + lineheight: OptFloat = None, + align: int = 0, + warn: bool = None, 
+ right_to_left: bool = False, + small_caps: bool = False, + ) -> tuple: + """Fill a rectangle with text. + + Args: + writer: pymupdf.TextWriter object (= "self") + rect: rect-like to receive the text. + text: string or list/tuple of strings. + pos: point-like start position of first word. + font: pymupdf.Font object (default pymupdf.Font('helv')). + fontsize: the fontsize. + lineheight: overwrite the font property + align: (int) 0 = left, 1 = center, 2 = right, 3 = justify + warn: (bool) text overflow action: none, warn, or exception + right_to_left: (bool) indicate right-to-left language. + """ + rect = Rect(rect) + if rect.is_empty: + raise ValueError("fill rect must not empty.") + if type(font) is not Font: + font = Font("helv") + + def textlen(x): + """Return length of a string.""" + return font.text_length( + x, fontsize=fontsize, small_caps=small_caps + ) # abbreviation + + def char_lengths(x): + """Return list of single character lengths for a string.""" + return font.char_lengths(x, fontsize=fontsize, small_caps=small_caps) + + def append_this(pos, text): + ret = writer.append( + pos, text, font=font, fontsize=fontsize, small_caps=small_caps + ) + return ret + + tolerance = fontsize * 0.2 # extra distance to left border + space_len = textlen(" ") + std_width = rect.width - tolerance + std_start = rect.x0 + tolerance + + def norm_words(width, words): + """Cut any word in pieces no longer than 'width'.""" + nwords = [] + word_lengths = [] + for w in words: + wl_lst = char_lengths(w) + wl = sum(wl_lst) + if wl <= width: # nothing to do - copy over + nwords.append(w) + word_lengths.append(wl) + continue + + # word longer than rect width - split it in parts + n = len(wl_lst) + while n > 0: + wl = sum(wl_lst[:n]) + if wl <= width: + nwords.append(w[:n]) + word_lengths.append(wl) + w = w[n:] + wl_lst = wl_lst[n:] + n = len(wl_lst) + else: + n -= 1 + return nwords, word_lengths + + def output_justify(start, line): + """Justified output of a line.""" + # ignore 
leading / trailing / multiple spaces + words = [w for w in line.split(" ") if w != ""] + nwords = len(words) + if nwords == 0: + return + if nwords == 1: # single word cannot be justified + append_this(start, words[0]) + return + tl = sum([textlen(w) for w in words]) # total word lengths + gaps = nwords - 1 # number of word gaps + gapl = (std_width - tl) / gaps # width of each gap + for w in words: + _, lp = append_this(start, w) # output one word + start.x = lp.x + gapl # next start at word end plus gap + return + + asc = font.ascender + dsc = font.descender + if not lineheight: + if asc - dsc <= 1: + lheight = 1.2 + else: + lheight = asc - dsc + else: + lheight = lineheight + + LINEHEIGHT = fontsize * lheight # effective line height + width = std_width # available horizontal space + + # starting point of text + if pos is not None: + pos = Point(pos) + else: # default is just below rect top-left + pos = rect.tl + (tolerance, fontsize * asc) + if pos not in rect: + raise ValueError("Text must start in rectangle.") + + # calculate displacement factor for alignment + if align == TEXT_ALIGN_CENTER: + factor = 0.5 + elif align == TEXT_ALIGN_RIGHT: + factor = 1.0 + else: + factor = 0 + + # split in lines if just a string was given + if type(text) is str: + textlines = text.splitlines() + else: + textlines = [] + for line in text: + textlines.extend(line.splitlines()) + + max_lines = int((rect.y1 - pos.y) / LINEHEIGHT) + 1 + + new_lines = [] # the final list of textbox lines + no_justify = [] # no justify for these line numbers + for i, line in enumerate(textlines): + if line in ("", " "): + new_lines.append((line, space_len)) + width = rect.width - tolerance + no_justify.append((len(new_lines) - 1)) + continue + if i == 0: + width = rect.x1 - pos.x + else: + width = rect.width - tolerance + + if right_to_left: # reverses Arabic / Hebrew text front to back + line = writer.clean_rtl(line) + tl = textlen(line) + if tl <= width: # line short enough + new_lines.append((line, 
tl)) + no_justify.append((len(new_lines) - 1)) + continue + + # we need to split the line in fitting parts + words = line.split(" ") # the words in the line + + # cut in parts any words that are longer than rect width + words, word_lengths = norm_words(width, words) + + n = len(words) + while True: + line0 = " ".join(words[:n]) + wl = sum(word_lengths[:n]) + space_len * (n - 1) + if wl <= width: + new_lines.append((line0, wl)) + words = words[n:] + word_lengths = word_lengths[n:] + n = len(words) + line0 = None + else: + n -= 1 + + if len(words) == 0: + break + assert n + + # ------------------------------------------------------------------------- + # List of lines created. Each item is (text, tl), where 'tl' is the PDF + # output length (float) and 'text' is the text. Except for justified text, + # this is output-ready. + # ------------------------------------------------------------------------- + nlines = len(new_lines) + if nlines > max_lines: + msg = "Only fitting %i of %i lines." % (max_lines, nlines) + if warn is None: + pass + elif warn: + message("Warning: " + msg) + else: + raise ValueError(msg) + + start = Point() + no_justify += [len(new_lines) - 1] # no justifying of last line + for i in range(max_lines): + try: + line, tl = new_lines.pop(0) + except IndexError: + if g_exceptions_verbose >= 2: exception_info() + break + + if right_to_left: # Arabic, Hebrew + line = "".join(reversed(line)) + + if i == 0: # may have different start for first line + start = pos + + if align == TEXT_ALIGN_JUSTIFY and i not in no_justify and tl < std_width: + output_justify(start, line) + start.x = std_start + start.y += LINEHEIGHT + continue + + if i > 0 or pos.x == std_start: # left, center, right alignments + start.x += (width - tl) * factor + + append_this(start, line) + start.x = std_start + start.y += LINEHEIGHT + + return new_lines # return non-written lines + def write_text(self, page, color=None, opacity=-1, overlay=1, morph=None, matrix=None, render_mode=0, 
oc=0): """Write the text to a PDF page having the TextWriter's page size. @@ -12735,6 +17261,16 @@ """Check if x is in the rectangle.""" return self.__contains__(x) + def get_area(self, *args) -> float: + """Calculate area of rectangle.\nparameter is one of 'px' (default), 'in', 'cm', or 'mm'.""" + if args: + unit = args[0] + else: + unit = "px" + u = {"px": (1, 1), "in": (1.0, 72.0), "cm": (2.54, 72.0), "mm": (25.4, 72.0)} + f = (u[unit][0] / u[unit][1]) ** 2 + return f * self.width * self.height + def include_point(self, p): """Extend rectangle to include point p.""" rect = self.rect.include_point(p) @@ -20924,6 +25460,82 @@ return _wxcolors +def _mupdf_devel(make_links=True): + ''' + Allows PyMuPDF installation to be used to compile and link programmes that + use the MuPDF C/C++ API. + + Args: + make_links: + If true, then on non-windows we also create softlinks to any shared + libraries that are supplied with a version suffix; this allows them + to be used in a link command. + + For example we create links such as: + + site-packages/pymupdf/ + libmupdf.so -> libmupdf.so.26.7 + libmupdfcpp.so -> libmupdfcpp.so.26.7 + + Returns: (mupdf_include, mupdf_lib). + mupdf_include: + Path of MuPDF include directory within PyMuPDF install. + mupdf_lib + Path of MuPDF library directory within PyMuPDF install. + ''' + import platform + + log(f'{mupdf_version=}') + + p = os.path.normpath(f'{__file__}/..') + + mupdf_include = f'{p}/mupdf-devel/include' + + if platform.system() == 'Windows': + # Separate .lib files are used at build time. + mupdf_lib = f'{p}/mupdf-devel/lib' + else: + # .so files are used for both buildtime and runtime linking. + mupdf_lib = p + log(f'Within installed PyMuPDF:') + log(f' {mupdf_include=}') + log(f' {mupdf_lib=}') + + assert os.path.isdir(mupdf_include), f'Not a directory: {mupdf_include=}.' + assert os.path.isdir(mupdf_lib), f'Not a directory: {mupdf_lib=}.' 
+ + if platform.system() != 'Windows' and make_links: + # Make symbolic links within the installed pymupdf module so + # that ld can find libmupdf.so etc. This is a bit of a hack, but + # necessary because wheels cannot contain symbolic links. + # + # For example we create `libmupdf.so -> libmupdf.so.24.8`. + # + # We are careful to only create symlinks for the expected MuPDF + # version, in case old .so files from a previous install are still + # in place. + # + log(f'Creating symlinks in {mupdf_lib=} for MuPDF-{mupdf_version} .so files.') + regex_suffix = mupdf_version.split('.')[1:3] + regex_suffix = '[.]'.join(regex_suffix) + mupdf_lib_regex = f'^(lib[^.]+[.]so)[.]{regex_suffix}$' + log(f'{mupdf_lib_regex=}.') + for leaf in os.listdir(mupdf_lib): + m = re.match(mupdf_lib_regex, leaf) + if m: + pfrom = f'{mupdf_lib}/{m.group(1)}' + # os.path.exists() can return false if softlink exists + # but points to non-existent file, so we also use + # `os.path.islink()`. + if os.path.islink(pfrom) or os.path.exists(pfrom): + log(f'Removing existing link {pfrom=}.') + os.remove(pfrom) + log(f'Creating symlink: {pfrom} -> {leaf}') + os.symlink(leaf, pfrom) + + return mupdf_include, mupdf_lib + + # We cannot import utils earlier because it imports this .py file itself and # uses some pymupdf.* types in function typing. 
# @@ -20938,83 +25550,9 @@ recover_quad = utils.recover_quad recover_span_quad = utils.recover_span_quad -Annot.get_text = utils.get_text -Annot.get_textbox = utils.get_textbox - -Document._do_links = utils.do_links -Document._do_widgets = utils.do_widgets -Document.del_toc_item = utils.del_toc_item -Document.get_char_widths = utils.get_char_widths -Document.get_oc = utils.get_oc -Document.get_ocmd = utils.get_ocmd -Document.get_page_labels = utils.get_page_labels -Document.get_page_numbers = utils.get_page_numbers -Document.get_page_pixmap = utils.get_page_pixmap -Document.get_page_text = utils.get_page_text -Document.get_toc = utils.get_toc -Document.has_annots = utils.has_annots -Document.has_links = utils.has_links -Document.insert_page = utils.insert_page -Document.new_page = utils.new_page -Document.scrub = utils.scrub -Document.search_page_for = utils.search_page_for -Document.set_metadata = utils.set_metadata -Document.set_oc = utils.set_oc -Document.set_ocmd = utils.set_ocmd -Document.set_page_labels = utils.set_page_labels -Document.set_toc = utils.set_toc -Document.set_toc_item = utils.set_toc_item -Document.subset_fonts = utils.subset_fonts -Document.tobytes = Document.write -Document.xref_copy = utils.xref_copy - -IRect.get_area = utils.get_area - -Page.apply_redactions = utils.apply_redactions -Page.delete_image = utils.delete_image -Page.delete_widget = utils.delete_widget -Page.draw_bezier = utils.draw_bezier -Page.draw_circle = utils.draw_circle -Page.draw_curve = utils.draw_curve -Page.draw_line = utils.draw_line -Page.draw_oval = utils.draw_oval -Page.draw_polyline = utils.draw_polyline -Page.draw_quad = utils.draw_quad -Page.draw_rect = utils.draw_rect -Page.draw_sector = utils.draw_sector -Page.draw_squiggle = utils.draw_squiggle -Page.draw_zigzag = utils.draw_zigzag -Page.get_image_info = utils.get_image_info -Page.get_image_rects = utils.get_image_rects -Page.get_label = utils.get_label -Page.get_links = utils.get_links -Page.get_pixmap = 
utils.get_pixmap -Page.get_text = utils.get_text -Page.get_text_blocks = utils.get_text_blocks -Page.get_text_selection = utils.get_text_selection -Page.get_text_words = utils.get_text_words -Page.get_textbox = utils.get_textbox -Page.get_textpage_ocr = utils.get_textpage_ocr -Page.insert_image = utils.insert_image -Page.insert_link = utils.insert_link -Page.insert_text = utils.insert_text -Page.insert_textbox = utils.insert_textbox -Page.insert_htmlbox = utils.insert_htmlbox -Page.new_shape = lambda x: utils.Shape(x) -Page.replace_image = utils.replace_image -Page.search_for = utils.search_for -Page.show_pdf_page = utils.show_pdf_page -Page.update_link = utils.update_link -Page.write_text = utils.write_text -Shape = utils.Shape from .table import find_tables - Page.find_tables = find_tables -Rect.get_area = utils.get_area - -TextWriter.fill_textbox = utils.fill_textbox - class FitzDeprecation(DeprecationWarning): pass @@ -21284,19 +25822,19 @@ _alias( Rect, 'is_infinite') _alias( TextWriter, 'fill_textbox') _alias( TextWriter, 'write_text') - _alias( utils.Shape, 'draw_bezier') - _alias( utils.Shape, 'draw_circle') - _alias( utils.Shape, 'draw_curve') - _alias( utils.Shape, 'draw_line') - _alias( utils.Shape, 'draw_oval') - _alias( utils.Shape, 'draw_polyline') - _alias( utils.Shape, 'draw_quad') - _alias( utils.Shape, 'draw_rect') - _alias( utils.Shape, 'draw_sector') - _alias( utils.Shape, 'draw_squiggle') - _alias( utils.Shape, 'draw_zigzag') - _alias( utils.Shape, 'insert_text') - _alias( utils.Shape, 'insert_textbox') + _alias( Shape, 'draw_bezier') + _alias( Shape, 'draw_circle') + _alias( Shape, 'draw_curve') + _alias( Shape, 'draw_line') + _alias( Shape, 'draw_oval') + _alias( Shape, 'draw_polyline') + _alias( Shape, 'draw_quad') + _alias( Shape, 'draw_rect') + _alias( Shape, 'draw_sector') + _alias( Shape, 'draw_squiggle') + _alias( Shape, 'draw_zigzag') + _alias( Shape, 'insert_text') + _alias( Shape, 'insert_textbox') if 0: restore_aliases()
--- a/src/extra.i Mon Sep 15 11:43:07 2025 +0200 +++ b/src/extra.i Sat Oct 11 11:19:58 2025 +0200 @@ -1,5 +1,3 @@ -%module fitz_extra - %pythoncode %{ # pylint: disable=all %} @@ -3297,7 +3295,11 @@ { continue; } - + // prevent Unicode ZWJ 0x200d to start a word + if (buflen == 0 && ch.m_internal->c == 0x200d) + { + continue; + } int word_delimiter = JM_is_word_delimiter(ch.m_internal->c, delimiters); int this_char_rtl = JM_is_rtl_char(ch.m_internal->c); if (word_delimiter || this_char_rtl != last_char_rtl)
--- a/src/utils.py Mon Sep 15 11:43:07 2025 +0200 +++ b/src/utils.py Sat Oct 11 11:19:58 2025 +0200 @@ -6,9 +6,7 @@ # lightweight PDF, XPS, and E-book viewer, renderer and toolkit which is # maintained and developed by Artifex Software, Inc. https://artifex.com. # ------------------------------------------------------------------------ -import io import math -import os import typing import weakref @@ -51,428 +49,6 @@ """ -def write_text( - page: pymupdf.Page, - rect=None, - writers=None, - overlay=True, - color=None, - opacity=None, - keep_proportion=True, - rotate=0, - oc=0, - ) -> None: - """Write the text of one or more pymupdf.TextWriter objects. - - Args: - rect: target rectangle. If None, the union of the text writers is used. - writers: one or more pymupdf.TextWriter objects. - overlay: put in foreground or background. - keep_proportion: maintain aspect ratio of rectangle sides. - rotate: arbitrary rotation angle. - oc: the xref of an optional content object - """ - assert isinstance(page, pymupdf.Page) - if not writers: - raise ValueError("need at least one pymupdf.TextWriter") - if type(writers) is pymupdf.TextWriter: - if rotate == 0 and rect is None: - writers.write_text(page, opacity=opacity, color=color, overlay=overlay) - return None - else: - writers = (writers,) - clip = writers[0].text_rect - textdoc = pymupdf.Document() - tpage = textdoc.new_page(width=page.rect.width, height=page.rect.height) - for writer in writers: - clip |= writer.text_rect - writer.write_text(tpage, opacity=opacity, color=color) - if rect is None: - rect = clip - page.show_pdf_page( - rect, - textdoc, - 0, - overlay=overlay, - keep_proportion=keep_proportion, - rotate=rotate, - clip=clip, - oc=oc, - ) - textdoc = None - tpage = None - - -def show_pdf_page( - page, - rect, - docsrc, - pno=0, - keep_proportion=True, - overlay=True, - oc=0, - rotate=0, - clip=None, - ) -> int: - """Show page number 'pno' of PDF 'docsrc' in rectangle 'rect'. 
- - Args: - rect: (rect-like) where to place the source image - docsrc: (document) source PDF - pno: (int) source page number - keep_proportion: (bool) do not change width-height-ratio - overlay: (bool) put in foreground - oc: (xref) make visibility dependent on this OCG / OCMD (which must be defined in the target PDF) - rotate: (int) degrees (multiple of 90) - clip: (rect-like) part of source page rectangle - Returns: - xref of inserted object (for reuse) - """ - def calc_matrix(sr, tr, keep=True, rotate=0): - """Calculate transformation matrix from source to target rect. - - Notes: - The product of four matrices in this sequence: (1) translate correct - source corner to origin, (2) rotate, (3) scale, (4) translate to - target's top-left corner. - Args: - sr: source rect in PDF (!) coordinate system - tr: target rect in PDF coordinate system - keep: whether to keep source ratio of width to height - rotate: rotation angle in degrees - Returns: - Transformation matrix. - """ - # calc center point of source rect - smp = (sr.tl + sr.br) / 2.0 - # calc center point of target rect - tmp = (tr.tl + tr.br) / 2.0 - - # m moves to (0, 0), then rotates - m = pymupdf.Matrix(1, 0, 0, 1, -smp.x, -smp.y) * pymupdf.Matrix(rotate) - - sr1 = sr * m # resulting source rect to calculate scale factors - - fw = tr.width / sr1.width # scale the width - fh = tr.height / sr1.height # scale the height - if keep: - fw = fh = min(fw, fh) # take min if keeping aspect ratio - - m *= pymupdf.Matrix(fw, fh) # concat scale matrix - m *= pymupdf.Matrix(1, 0, 0, 1, tmp.x, tmp.y) # concat move to target center - return pymupdf.JM_TUPLE(m) - - pymupdf.CheckParent(page) - doc = page.parent - - if not doc.is_pdf or not docsrc.is_pdf: - raise ValueError("is no PDF") - - if rect.is_empty or rect.is_infinite: - raise ValueError("rect must be finite and not empty") - - while pno < 0: # support negative page numbers - pno += docsrc.page_count - src_page = docsrc[pno] # load source page - - tar_rect = rect * 
~page.transformation_matrix # target rect in PDF coordinates - - src_rect = src_page.rect if not clip else src_page.rect & clip # source rect - if src_rect.is_empty or src_rect.is_infinite: - raise ValueError("clip must be finite and not empty") - src_rect = src_rect * ~src_page.transformation_matrix # ... in PDF coord - - matrix = calc_matrix(src_rect, tar_rect, keep=keep_proportion, rotate=rotate) - - # list of existing /Form /XObjects - ilst = [i[1] for i in doc.get_page_xobjects(page.number)] - ilst += [i[7] for i in doc.get_page_images(page.number)] - ilst += [i[4] for i in doc.get_page_fonts(page.number)] - - # create a name not in that list - n = "fzFrm" - i = 0 - _imgname = n + "0" - while _imgname in ilst: - i += 1 - _imgname = n + str(i) - - isrc = docsrc._graft_id # used as key for graftmaps - if doc._graft_id == isrc: - raise ValueError("source document must not equal target") - - # retrieve / make pymupdf.Graftmap for source PDF - gmap = doc.Graftmaps.get(isrc, None) - if gmap is None: - gmap = pymupdf.Graftmap(doc) - doc.Graftmaps[isrc] = gmap - - # take note of generated xref for automatic reuse - pno_id = (isrc, pno) # id of docsrc[pno] - xref = doc.ShownPages.get(pno_id, 0) - - if overlay: - page.wrap_contents() # ensure a balanced graphics state - xref = page._show_pdf_page( - src_page, - overlay=overlay, - matrix=matrix, - xref=xref, - oc=oc, - clip=src_rect, - graftmap=gmap, - _imgname=_imgname, - ) - doc.ShownPages[pno_id] = xref - - return xref - - -def replace_image(page: pymupdf.Page, xref: int, *, filename=None, pixmap=None, stream=None): - """Replace the image referred to by xref. - - Replace the image by changing the object definition stored under xref. This - will leave the pages appearance instructions intact, so the new image is - being displayed with the same bbox, rotation etc. - By providing a small fully transparent image, an effect as if the image had - been deleted can be achieved. 
- A typical use may include replacing large images by a smaller version, - e.g. with a lower resolution or graylevel instead of colored. - - Args: - xref: the xref of the image to replace. - filename, pixmap, stream: exactly one of these must be provided. The - meaning being the same as in Page.insert_image. - """ - doc = page.parent # the owning document - if not doc.xref_is_image(xref): - raise ValueError("xref not an image") # insert new image anywhere in page - if bool(filename) + bool(stream) + bool(pixmap) != 1: - raise ValueError("Exactly one of filename/stream/pixmap must be given") - new_xref = page.insert_image( - page.rect, filename=filename, stream=stream, pixmap=pixmap - ) - doc.xref_copy(new_xref, xref) # copy over new to old - last_contents_xref = page.get_contents()[-1] - # new image insertion has created a new /Contents source, - # which we will set to spaces now - doc.update_stream(last_contents_xref, b" ") - page._image_info = None # clear cache of extracted image information - - -def delete_image(page: pymupdf.Page, xref: int): - """Delete the image referred to by xef. - - Actually replaces by a small transparent Pixmap using method Page.replace_image. - - Args: - xref: xref of the image to delete. - """ - # make a small 100% transparent pixmap (of just any dimension) - pix = pymupdf.Pixmap(pymupdf.csGRAY, (0, 0, 1, 1), 1) - pix.clear_with() # clear all samples bytes to 0x00 - page.replace_image(xref, pixmap=pix) - - -def insert_image( - page, - rect, - *, - alpha=-1, - filename=None, - height=0, - keep_proportion=True, - mask=None, - oc=0, - overlay=True, - pixmap=None, - rotate=0, - stream=None, - width=0, - xref=0, - ): - """Insert an image for display in a rectangle. - - Args: - rect: (rect_like) position of image on the page. - alpha: (int, optional) set to 0 if image has no transparency. - filename: (str, Path, file object) image filename. - height: (int) - keep_proportion: (bool) keep width / height ratio (default). 
- mask: (bytes, optional) image consisting of alpha values to use. - oc: (int) xref of OCG or OCMD to declare as Optional Content. - overlay: (bool) put in foreground (default) or background. - pixmap: (pymupdf.Pixmap) use this as image. - rotate: (int) rotate by 0, 90, 180 or 270 degrees. - stream: (bytes) use this as image. - width: (int) - xref: (int) use this as image. - - 'page' and 'rect' are positional, all other parameters are keywords. - - If 'xref' is given, that image is used. Other input options are ignored. - Else, exactly one of pixmap, stream or filename must be given. - - 'alpha=0' for non-transparent images improves performance significantly. - Affects stream and filename only. - - Optimum transparent insertions are possible by using filename / stream in - conjunction with a 'mask' image of alpha values. - - Returns: - xref (int) of inserted image. Re-use as argument for multiple insertions. - """ - pymupdf.CheckParent(page) - doc = page.parent - if not doc.is_pdf: - raise ValueError("is no PDF") - - if xref == 0 and (bool(filename) + bool(stream) + bool(pixmap) != 1): - raise ValueError("xref=0 needs exactly one of filename, pixmap, stream") - - if filename: - if type(filename) is str: - pass - elif hasattr(filename, "absolute"): - filename = str(filename) - elif hasattr(filename, "name"): - filename = filename.name - else: - raise ValueError("bad filename") - - if filename and not os.path.exists(filename): - raise FileNotFoundError("No such file: '%s'" % filename) - elif stream and type(stream) not in (bytes, bytearray, io.BytesIO): - raise ValueError("stream must be bytes-like / BytesIO") - elif pixmap and type(pixmap) is not pymupdf.Pixmap: - raise ValueError("pixmap must be a pymupdf.Pixmap") - if mask and not (stream or filename): - raise ValueError("mask requires stream or filename") - if mask and type(mask) not in (bytes, bytearray, io.BytesIO): - raise ValueError("mask must be bytes-like / BytesIO") - while rotate < 0: - rotate += 360 - 
while rotate >= 360: - rotate -= 360 - if rotate not in (0, 90, 180, 270): - raise ValueError("bad rotate value") - - r = pymupdf.Rect(rect) - if r.is_empty or r.is_infinite: - raise ValueError("rect must be finite and not empty") - clip = r * ~page.transformation_matrix - - # Create a unique image reference name. - ilst = [i[7] for i in doc.get_page_images(page.number)] - ilst += [i[1] for i in doc.get_page_xobjects(page.number)] - ilst += [i[4] for i in doc.get_page_fonts(page.number)] - n = "fzImg" # 'pymupdf image' - i = 0 - _imgname = n + "0" # first name candidate - while _imgname in ilst: - i += 1 - _imgname = n + str(i) # try new name - - if overlay: - page.wrap_contents() # ensure a balanced graphics state - digests = doc.InsertedImages - xref, digests = page._insert_image( - filename=filename, - pixmap=pixmap, - stream=stream, - imask=mask, - clip=clip, - overlay=overlay, - oc=oc, - xref=xref, - rotate=rotate, - keep_proportion=keep_proportion, - width=width, - height=height, - alpha=alpha, - _imgname=_imgname, - digests=digests, - ) - if digests is not None: - doc.InsertedImages = digests - - return xref - - -def search_for( - page, - text, - *, - clip=None, - quads=False, - flags=pymupdf.TEXT_DEHYPHENATE - | pymupdf.TEXT_PRESERVE_WHITESPACE - | pymupdf.TEXT_PRESERVE_LIGATURES - | pymupdf.TEXT_MEDIABOX_CLIP - , - textpage=None, - ) -> list: - """Search for a string on a page. - - Args: - text: string to be searched for - clip: restrict search to this rectangle - quads: (bool) return quads instead of rectangles - flags: bit switches, default: join hyphened words - textpage: a pre-created pymupdf.TextPage - Returns: - a list of rectangles or quads, each containing one occurrence. 
- """ - if clip is not None: - clip = pymupdf.Rect(clip) - - pymupdf.CheckParent(page) - tp = textpage - if tp is None: - tp = page.get_textpage(clip=clip, flags=flags) # create pymupdf.TextPage - elif getattr(tp, "parent") != page: - raise ValueError("not a textpage of this page") - rlist = tp.search(text, quads=quads) - if textpage is None: - del tp - return rlist - - -def search_page_for( - doc: pymupdf.Document, - pno: int, - text: str, - quads: bool = False, - clip: rect_like = None, - flags: int = pymupdf.TEXT_DEHYPHENATE - | pymupdf.TEXT_PRESERVE_LIGATURES - | pymupdf.TEXT_PRESERVE_WHITESPACE - | pymupdf.TEXT_MEDIABOX_CLIP - , - textpage: pymupdf.TextPage = None, -) -> list: - """Search for a string on a page. - - Args: - pno: page number - text: string to be searched for - clip: restrict search to this rectangle - quads: (bool) return quads instead of rectangles - flags: bit switches, default: join hyphened words - textpage: reuse a prepared textpage - Returns: - a list of rectangles or quads, each containing an occurrence. - """ - - return doc[pno].search_for( - text, - quads=quads, - clip=clip, - flags=flags, - textpage=textpage, - ) - - def get_text_blocks( page: pymupdf.Page, clip: rect_like = None, @@ -822,81 +398,6 @@ return tpage -def get_image_info(page: pymupdf.Page, hashes: bool = False, xrefs: bool = False) -> list: - """Extract image information only from a pymupdf.TextPage. - - Args: - hashes: (bool) include MD5 hash for each image. - xrefs: (bool) try to find the xref for each image. Sets hashes to true. 
- """ - doc = page.parent - if xrefs and doc.is_pdf: - hashes = True - if not doc.is_pdf: - xrefs = False - imginfo = getattr(page, "_image_info", None) - if imginfo and not xrefs: - return imginfo - if not imginfo: - tp = page.get_textpage(flags=pymupdf.TEXT_PRESERVE_IMAGES) - imginfo = tp.extractIMGINFO(hashes=hashes) - del tp - if hashes: - page._image_info = imginfo - if not xrefs or not doc.is_pdf: - return imginfo - imglist = page.get_images() - digests = {} - for item in imglist: - xref = item[0] - pix = pymupdf.Pixmap(doc, xref) - digests[pix.digest] = xref - del pix - for i in range(len(imginfo)): - item = imginfo[i] - xref = digests.get(item["digest"], 0) - item["xref"] = xref - imginfo[i] = item - return imginfo - - -def get_image_rects(page: pymupdf.Page, name, transform=False) -> list: - """Return list of image positions on a page. - - Args: - name: (str, list, int) image identification. May be reference name, an - item of the page's image list or an xref. - transform: (bool) whether to also return the transformation matrix. - Returns: - A list of pymupdf.Rect objects or tuples of (pymupdf.Rect, pymupdf.Matrix) - for all image locations on the page. 
- """ - if type(name) in (list, tuple): - xref = name[0] - elif type(name) is int: - xref = name - else: - imglist = [i for i in page.get_images() if i[7] == name] - if imglist == []: - raise ValueError("bad image name") - elif len(imglist) != 1: - raise ValueError("multiple image names found") - xref = imglist[0][0] - pix = pymupdf.Pixmap(page.parent, xref) # make pixmap of the image to compute MD5 - digest = pix.digest - del pix - infos = page.get_image_info(hashes=True) - if not transform: - bboxes = [pymupdf.Rect(im["bbox"]) for im in infos if im["digest"] == digest] - else: - bboxes = [ - (pymupdf.Rect(im["bbox"]), pymupdf.Matrix(im["transform"])) - for im in infos - if im["digest"] == digest - ] - return bboxes - - def get_text( page: pymupdf.Page, option: str = "text", @@ -1006,101 +507,6 @@ return t -def get_page_text( - doc: pymupdf.Document, - pno: int, - option: str = "text", - clip: rect_like = None, - flags: OptInt = None, - textpage: pymupdf.TextPage = None, - sort: bool = False, -) -> typing.Any: - """Extract a document page's text by page number. - - Notes: - Convenience function calling page.get_text(). - Args: - pno: page number - option: (str) text, words, blocks, html, dict, json, rawdict, xhtml or xml. - Returns: - output from page.TextPage(). - """ - return doc[pno].get_text(option, clip=clip, flags=flags, sort=sort) - -def get_pixmap( - page: pymupdf.Page, - *, - matrix: matrix_like=pymupdf.Identity, - dpi=None, - colorspace: pymupdf.Colorspace=pymupdf.csRGB, - clip: rect_like=None, - alpha: bool=False, - annots: bool=True, - ) -> pymupdf.Pixmap: - """Create pixmap of page. - - Keyword args: - matrix: Matrix for transformation (default: Identity). - dpi: desired dots per inch. If given, matrix is ignored. - colorspace: (str/Colorspace) cmyk, rgb, gray - case ignored, default csRGB. - clip: (irect-like) restrict rendering to this area. 
- alpha: (bool) whether to include alpha channel - annots: (bool) whether to also render annotations - """ - if dpi: - zoom = dpi / 72 - matrix = pymupdf.Matrix(zoom, zoom) - - if type(colorspace) is str: - if colorspace.upper() == "GRAY": - colorspace = pymupdf.csGRAY - elif colorspace.upper() == "CMYK": - colorspace = pymupdf.csCMYK - else: - colorspace = pymupdf.csRGB - if colorspace.n not in (1, 3, 4): - raise ValueError("unsupported colorspace") - - dl = page.get_displaylist(annots=annots) - pix = dl.get_pixmap(matrix=matrix, colorspace=colorspace, alpha=alpha, clip=clip) - dl = None - if dpi: - pix.set_dpi(dpi, dpi) - return pix - - -def get_page_pixmap( - doc: pymupdf.Document, - pno: int, - *, - matrix: matrix_like = pymupdf.Identity, - dpi=None, - colorspace: pymupdf.Colorspace = pymupdf.csRGB, - clip: rect_like = None, - alpha: bool = False, - annots: bool = True, -) -> pymupdf.Pixmap: - """Create pixmap of document page by page number. - - Notes: - Convenience function calling page.get_pixmap. - Args: - pno: (int) page number - matrix: pymupdf.Matrix for transformation (default: pymupdf.Identity). - colorspace: (str,pymupdf.Colorspace) rgb, rgb, gray - case ignored, default csRGB. - clip: (irect-like) restrict rendering to this area. - alpha: (bool) include alpha channel - annots: (bool) also render annotations - """ - return doc[pno].get_pixmap( - matrix=matrix, - dpi=dpi, colorspace=colorspace, - clip=clip, - alpha=alpha, - annots=annots - ) - - def getLinkDict(ln, document=None) -> dict: if isinstance(ln, pymupdf.Outline): dest = ln.destination(document) @@ -1160,280 +566,6 @@ return nl -def get_links(page: pymupdf.Page) -> list: - """Create a list of all links contained in a PDF page. - - Notes: - see PyMuPDF ducmentation for details. 
- """ - - pymupdf.CheckParent(page) - ln = page.first_link - links = [] - while ln: - nl = getLinkDict(ln, page.parent) - links.append(nl) - ln = ln.next - if links != [] and page.parent.is_pdf: - linkxrefs = [x for x in - #page.annot_xrefs() - pymupdf.JM_get_annot_xref_list2(page) - if x[1] == pymupdf.PDF_ANNOT_LINK # pylint: disable=no-member - ] - if len(linkxrefs) == len(links): - for i in range(len(linkxrefs)): - links[i]["xref"] = linkxrefs[i][0] - links[i]["id"] = linkxrefs[i][2] - return links - - -def get_toc( - doc: pymupdf.Document, - simple: bool = True, -) -> list: - """Create a table of contents. - - Args: - simple: a bool to control output. Returns a list, where each entry consists of outline level, title, page number and link destination (if simple = False). For details see PyMuPDF's documentation. - """ - def recurse(olItem, liste, lvl): - """Recursively follow the outline item chain and record item information in a list.""" - while olItem and olItem.this.m_internal: - if olItem.title: - title = olItem.title - else: - title = " " - - if not olItem.is_external: - if olItem.uri: - if olItem.page == -1: - resolve = doc.resolve_link(olItem.uri) - page = resolve[0] + 1 - else: - page = olItem.page + 1 - else: - page = -1 - else: - page = -1 - - if not simple: - link = getLinkDict(olItem, doc) - liste.append([lvl, title, page, link]) - else: - liste.append([lvl, title, page]) - - if olItem.down: - liste = recurse(olItem.down, liste, lvl + 1) - olItem = olItem.next - return liste - - # ensure document is open - if doc.is_closed: - raise ValueError("document closed") - doc.init_doc() - olItem = doc.outline - if not olItem: - return [] - lvl = 1 - liste = [] - toc = recurse(olItem, liste, lvl) - if doc.is_pdf and not simple: - doc._extend_toc_items(toc) - return toc - - -def del_toc_item( - doc: pymupdf.Document, - idx: int, -) -> None: - """Delete TOC / bookmark item by index.""" - xref = doc.get_outline_xrefs()[idx] - doc._remove_toc_item(xref) - - -def 
set_toc_item( - doc: pymupdf.Document, - idx: int, - dest_dict: OptDict = None, - kind: OptInt = None, - pno: OptInt = None, - uri: OptStr = None, - title: OptStr = None, - to: point_like = None, - filename: OptStr = None, - zoom: float = 0, -) -> None: - """Update TOC item by index. - - It allows changing the item's title and link destination. - - Args: - idx: - (int) desired index of the TOC list, as created by get_toc. - dest_dict: - (dict) destination dictionary as created by get_toc(False). - Outrules all other parameters. If None, the remaining parameters - are used to make a dest dictionary. - kind: - (int) kind of link (pymupdf.LINK_GOTO, etc.). If None, then only - the title will be updated. If pymupdf.LINK_NONE, the TOC item will - be deleted. - pno: - (int) page number (1-based like in get_toc). Required if - pymupdf.LINK_GOTO. - uri: - (str) the URL, required if pymupdf.LINK_URI. - title: - (str) the new title. No change if None. - to: - (point-like) destination on the target page. If omitted, (72, 36) - will be used as target coordinates. - filename: - (str) destination filename, required for pymupdf.LINK_GOTOR and - pymupdf.LINK_LAUNCH. - name: - (str) a destination name for pymupdf.LINK_NAMED. - zoom: - (float) a zoom factor for the target location (pymupdf.LINK_GOTO). 
- """ - xref = doc.get_outline_xrefs()[idx] - page_xref = 0 - if type(dest_dict) is dict: - if dest_dict["kind"] == pymupdf.LINK_GOTO: - pno = dest_dict["page"] - page_xref = doc.page_xref(pno) - page_height = doc.page_cropbox(pno).height - to = dest_dict.get('to', pymupdf.Point(72, 36)) - to.y = page_height - to.y - dest_dict["to"] = to - action = getDestStr(page_xref, dest_dict) - if not action.startswith("/A"): - raise ValueError("bad bookmark dest") - color = dest_dict.get("color") - if color: - color = list(map(float, color)) - if len(color) != 3 or min(color) < 0 or max(color) > 1: - raise ValueError("bad color value") - bold = dest_dict.get("bold", False) - italic = dest_dict.get("italic", False) - flags = italic + 2 * bold - collapse = dest_dict.get("collapse") - return doc._update_toc_item( - xref, - action=action[2:], - title=title, - color=color, - flags=flags, - collapse=collapse, - ) - - if kind == pymupdf.LINK_NONE: # delete bookmark item - return doc.del_toc_item(idx) - if kind is None and title is None: # treat as no-op - return None - if kind is None: # only update title text - return doc._update_toc_item(xref, action=None, title=title) - - if kind == pymupdf.LINK_GOTO: - if pno is None or pno not in range(1, doc.page_count + 1): - raise ValueError("bad page number") - page_xref = doc.page_xref(pno - 1) - page_height = doc.page_cropbox(pno - 1).height - if to is None: - to = pymupdf.Point(72, page_height - 36) - else: - to = pymupdf.Point(to) - to.y = page_height - to.y - - ddict = { - "kind": kind, - "to": to, - "uri": uri, - "page": pno, - "file": filename, - "zoom": zoom, - } - action = getDestStr(page_xref, ddict) - if action == "" or not action.startswith("/A"): - raise ValueError("bad bookmark dest") - - return doc._update_toc_item(xref, action=action[2:], title=title) - - -def get_area(*args) -> float: - """Calculate area of rectangle.\nparameter is one of 'px' (default), 'in', 'cm', or 'mm'.""" - rect = args[0] - if len(args) > 1: - unit = 
args[1] - else: - unit = "px" - u = {"px": (1, 1), "in": (1.0, 72.0), "cm": (2.54, 72.0), "mm": (25.4, 72.0)} - f = (u[unit][0] / u[unit][1]) ** 2 - return f * rect.width * rect.height - - -def set_metadata(doc: pymupdf.Document, m: dict = None) -> None: - """Update the PDF /Info object. - - Args: - m: a dictionary like doc.metadata. - """ - if not doc.is_pdf: - raise ValueError("is no PDF") - if doc.is_closed or doc.is_encrypted: - raise ValueError("document closed or encrypted") - if m is None: - m = {} - elif type(m) is not dict: - raise ValueError("bad metadata") - keymap = { - "author": "Author", - "producer": "Producer", - "creator": "Creator", - "title": "Title", - "format": None, - "encryption": None, - "creationDate": "CreationDate", - "modDate": "ModDate", - "subject": "Subject", - "keywords": "Keywords", - "trapped": "Trapped", - } - valid_keys = set(keymap.keys()) - diff_set = set(m.keys()).difference(valid_keys) - if diff_set != set(): - msg = "bad dict key(s): %s" % diff_set - raise ValueError(msg) - - t, temp = doc.xref_get_key(-1, "Info") - if t != "xref": - info_xref = 0 - else: - info_xref = int(temp.replace("0 R", "")) - - if m == {} and info_xref == 0: # nothing to do - return - - if info_xref == 0: # no prev metadata: get new xref - info_xref = doc.get_new_xref() - doc.update_object(info_xref, "<<>>") # fill it with empty object - doc.xref_set_key(-1, "Info", "%i 0 R" % info_xref) - elif m == {}: # remove existing metadata - doc.xref_set_key(-1, "Info", "null") - doc.init_doc() - return - - for key, val in [(k, v) for k, v in m.items() if keymap[k] is not None]: - pdf_key = keymap[key] - if not bool(val) or val in ("none", "null"): - val = "null" - else: - val = pymupdf.get_pdf_str(val) - doc.xref_set_key(info_xref, pdf_key, val) - doc.init_doc() - return - - def getDestStr(xref: int, ddict: dict) -> str: """Calculate the PDF action string. 
@@ -1492,647 +624,6 @@ return "" -def set_toc( - doc: pymupdf.Document, - toc: list, - collapse: int = 1, -) -> int: - """Create new outline tree (table of contents, TOC). - - Args: - toc: (list, tuple) each entry must contain level, title, page and - optionally top margin on the page. None or '()' remove the TOC. - collapse: (int) collapses entries beyond this level. Zero or None - shows all entries unfolded. - Returns: - the number of inserted items, or the number of removed items respectively. - """ - if doc.is_closed or doc.is_encrypted: - raise ValueError("document closed or encrypted") - if not doc.is_pdf: - raise ValueError("is no PDF") - if not toc: # remove all entries - return len(doc._delToC()) - - # validity checks -------------------------------------------------------- - if type(toc) not in (list, tuple): - raise ValueError("'toc' must be list or tuple") - toclen = len(toc) - page_count = doc.page_count - t0 = toc[0] - if type(t0) not in (list, tuple): - raise ValueError("items must be sequences of 3 or 4 items") - if t0[0] != 1: - raise ValueError("hierarchy level of item 0 must be 1") - for i in list(range(toclen - 1)): - t1 = toc[i] - t2 = toc[i + 1] - if not -1 <= t1[2] <= page_count: - raise ValueError("row %i: page number out of range" % i) - if (type(t2) not in (list, tuple)) or len(t2) not in (3, 4): - raise ValueError("bad row %i" % (i + 1)) - if (type(t2[0]) is not int) or t2[0] < 1: - raise ValueError("bad hierarchy level in row %i" % (i + 1)) - if t2[0] > t1[0] + 1: - raise ValueError("bad hierarchy level in row %i" % (i + 1)) - # no formal errors in toc -------------------------------------------------- - - # -------------------------------------------------------------------------- - # make a list of xref numbers, which we can use for our TOC entries - # -------------------------------------------------------------------------- - old_xrefs = doc._delToC() # del old outlines, get their xref numbers - - # prepare table of xrefs for new 
bookmarks - old_xrefs = [] - xref = [0] + old_xrefs - xref[0] = doc._getOLRootNumber() # entry zero is outline root xref number - if toclen > len(old_xrefs): # too few old xrefs? - for i in range((toclen - len(old_xrefs))): - xref.append(doc.get_new_xref()) # acquire new ones - - lvltab = {0: 0} # to store last entry per hierarchy level - - # ------------------------------------------------------------------------------ - # contains new outline objects as strings - first one is the outline root - # ------------------------------------------------------------------------------ - olitems = [{"count": 0, "first": -1, "last": -1, "xref": xref[0]}] - # ------------------------------------------------------------------------------ - # build olitems as a list of PDF-like connected dictionaries - # ------------------------------------------------------------------------------ - for i in range(toclen): - o = toc[i] - lvl = o[0] # level - title = pymupdf.get_pdf_str(o[1]) # title - pno = min(doc.page_count - 1, max(0, o[2] - 1)) # page number - page_xref = doc.page_xref(pno) - page_height = doc.page_cropbox(pno).height - top = pymupdf.Point(72, page_height - 36) - dest_dict = {"to": top, "kind": pymupdf.LINK_GOTO} # fall back target - if o[2] < 0: - dest_dict["kind"] = pymupdf.LINK_NONE - if len(o) > 3: # some target is specified - if type(o[3]) in (int, float): # convert a number to a point - dest_dict["to"] = pymupdf.Point(72, page_height - o[3]) - else: # if something else, make sure we have a dict - # We make a copy of o[3] to avoid modifying our caller's data. - dest_dict = o[3].copy() if type(o[3]) is dict else dest_dict - if "to" not in dest_dict: # target point not in dict? 
- dest_dict["to"] = top # put default in - else: # transform target to PDF coordinates - page = doc[pno] - point = pymupdf.Point(dest_dict["to"]) - point.y = page.cropbox.height - point.y - point = point * page.rotation_matrix - dest_dict["to"] = (point.x, point.y) - d = {} - d["first"] = -1 - d["count"] = 0 - d["last"] = -1 - d["prev"] = -1 - d["next"] = -1 - d["dest"] = getDestStr(page_xref, dest_dict) - d["top"] = dest_dict["to"] - d["title"] = title - d["parent"] = lvltab[lvl - 1] - d["xref"] = xref[i + 1] - d["color"] = dest_dict.get("color") - d["flags"] = dest_dict.get("italic", 0) + 2 * dest_dict.get("bold", 0) - lvltab[lvl] = i + 1 - parent = olitems[lvltab[lvl - 1]] # the parent entry - - if ( - dest_dict.get("collapse") or collapse and lvl > collapse - ): # suppress expansion - parent["count"] -= 1 # make /Count negative - else: - parent["count"] += 1 # positive /Count - - if parent["first"] == -1: - parent["first"] = i + 1 - parent["last"] = i + 1 - else: - d["prev"] = parent["last"] - prev = olitems[parent["last"]] - prev["next"] = i + 1 - parent["last"] = i + 1 - olitems.append(d) - - # ------------------------------------------------------------------------------ - # now create each outline item as a string and insert it in the PDF - # ------------------------------------------------------------------------------ - for i, ol in enumerate(olitems): - txt = "<<" - if ol["count"] != 0: - txt += "/Count %i" % ol["count"] - try: - txt += ol["dest"] - except Exception: - # Verbose in PyMuPDF/tests. 
- if g_exceptions_verbose >= 2: pymupdf.exception_info() - pass - try: - if ol["first"] > -1: - txt += "/First %i 0 R" % xref[ol["first"]] - except Exception: - if g_exceptions_verbose >= 2: pymupdf.exception_info() - pass - try: - if ol["last"] > -1: - txt += "/Last %i 0 R" % xref[ol["last"]] - except Exception: - if g_exceptions_verbose >= 2: pymupdf.exception_info() - pass - try: - if ol["next"] > -1: - txt += "/Next %i 0 R" % xref[ol["next"]] - except Exception: - # Verbose in PyMuPDF/tests. - if g_exceptions_verbose >= 2: pymupdf.exception_info() - pass - try: - if ol["parent"] > -1: - txt += "/Parent %i 0 R" % xref[ol["parent"]] - except Exception: - # Verbose in PyMuPDF/tests. - if g_exceptions_verbose >= 2: pymupdf.exception_info() - pass - try: - if ol["prev"] > -1: - txt += "/Prev %i 0 R" % xref[ol["prev"]] - except Exception: - # Verbose in PyMuPDF/tests. - if g_exceptions_verbose >= 2: pymupdf.exception_info() - pass - try: - txt += "/Title" + ol["title"] - except Exception: - # Verbose in PyMuPDF/tests. - if g_exceptions_verbose >= 2: pymupdf.exception_info() - pass - - if ol.get("color") and len(ol["color"]) == 3: - txt += f"/C[ {_format_g(tuple(ol['color']))}]" - if ol.get("flags", 0) > 0: - txt += "/F %i" % ol["flags"] - - if i == 0: # special: this is the outline root - txt += "/Type/Outlines" # so add the /Type entry - txt += ">>" - doc.update_object(xref[i], txt) # insert the PDF object - - doc.init_doc() - return toclen - - -def do_widgets( - tar: pymupdf.Document, - src: pymupdf.Document, - graftmap, - from_page: int = -1, - to_page: int = -1, - start_at: int = -1, - join_duplicates=0, -) -> None: - """Insert widgets of copied page range into target PDF. - - Parameter values **must** equal those of method insert_pdf() which - must have been previously executed. 
- """ - if not src.is_form_pdf: # nothing to do: source PDF has no fields - return - - def clean_kid_parents(acro_fields): - """ Make sure all kids have correct "Parent" pointers.""" - for i in range(acro_fields.pdf_array_len()): - parent = acro_fields.pdf_array_get(i) - kids = parent.pdf_dict_get(pymupdf.PDF_NAME("Kids")) - for j in range(kids.pdf_array_len()): - kid = kids.pdf_array_get(j) - kid.pdf_dict_put(pymupdf.PDF_NAME("Parent"), parent) - - def join_widgets(pdf, acro_fields, xref1, xref2, name): - """Called for each pair of widgets having the same name. - - Args: - pdf: target MuPDF document - acro_fields: object Root/AcroForm/Fields - xref1, xref2: widget xrefs having same names - name: (str) the name - - Result: - Defined or updated widget parent that points to both widgets. - """ - - def re_target(pdf, acro_fields, xref1, kids1, xref2, kids2): - """Merge widget in xref2 into "Kids" list of widget xref1. - - Args: - xref1, kids1: target widget and its "Kids" array. - xref2, kids2: source wwidget and its "Kids" array (may be empty). 
- """ - # make indirect objects from widgets - w1_ind = mupdf.pdf_new_indirect(pdf, xref1, 0) - w2_ind = mupdf.pdf_new_indirect(pdf, xref2, 0) - # find source widget in "Fields" array - idx = acro_fields.pdf_array_find(w2_ind) - acro_fields.pdf_array_delete(idx) - - if not kids2.pdf_is_array(): # source widget has no kids - widget = mupdf.pdf_load_object(pdf, xref2) - - # delete name from widget and insert target as parent - widget.pdf_dict_del(pymupdf.PDF_NAME("T")) - widget.pdf_dict_put(pymupdf.PDF_NAME("Parent"), w1_ind) - - # put in target Kids - kids1.pdf_array_push(w2_ind) - else: # copy source kids to target kids - for i in range(kids2.pdf_array_len()): - kid = kids2.pdf_array_get(i) - kid.pdf_dict_put(pymupdf.PDF_NAME("Parent"), w1_ind) - kid_ind = mupdf.pdf_new_indirect(pdf, kid.pdf_to_num(), 0) - kids1.pdf_array_push(kid_ind) - - def new_target(pdf, acro_fields, xref1, w1, xref2, w2, name): - """Make new "Parent" for two widgets with same name. - - Args: - xref1, w1: first widget - xref2, w2: second widget - name: field name - - Result: - Both widgets have no "Kids". We create a new object with the - name and a "Kids" array containing the widgets. - Original widgets must be removed from AcroForm/Fields. 
- """ - # make new "Parent" object - new = mupdf.pdf_new_dict(pdf, 5) - new.pdf_dict_put_text_string(pymupdf.PDF_NAME("T"), name) - kids = new.pdf_dict_put_array(pymupdf.PDF_NAME("Kids"), 2) - new_obj = mupdf.pdf_add_object(pdf, new) - new_obj_xref = new_obj.pdf_to_num() - new_ind = mupdf.pdf_new_indirect(pdf, new_obj_xref, 0) - - # copy over some required source widget properties - ft = w1.pdf_dict_get(pymupdf.PDF_NAME("FT")) - w1.pdf_dict_del(pymupdf.PDF_NAME("FT")) - new_obj.pdf_dict_put(pymupdf.PDF_NAME("FT"), ft) - - aa = w1.pdf_dict_get(pymupdf.PDF_NAME("AA")) - w1.pdf_dict_del(pymupdf.PDF_NAME("AA")) - new_obj.pdf_dict_put(pymupdf.PDF_NAME("AA"), aa) - - # remove name field, insert "Parent" field in source widgets - w1.pdf_dict_del(pymupdf.PDF_NAME("T")) - w1.pdf_dict_put(pymupdf.PDF_NAME("Parent"), new_ind) - w2.pdf_dict_del(pymupdf.PDF_NAME("T")) - w2.pdf_dict_put(pymupdf.PDF_NAME("Parent"), new_ind) - - # put source widgets in "kids" array - ind1 = mupdf.pdf_new_indirect(pdf, xref1, 0) - ind2 = mupdf.pdf_new_indirect(pdf, xref2, 0) - kids.pdf_array_push(ind1) - kids.pdf_array_push(ind2) - - # remove source widgets from "AcroForm/Fields" - idx = acro_fields.pdf_array_find(ind1) - acro_fields.pdf_array_delete(idx) - idx = acro_fields.pdf_array_find(ind2) - acro_fields.pdf_array_delete(idx) - - acro_fields.pdf_array_push(new_ind) - - w1 = mupdf.pdf_load_object(pdf, xref1) - w2 = mupdf.pdf_load_object(pdf, xref2) - kids1 = w1.pdf_dict_get(pymupdf.PDF_NAME("Kids")) - kids2 = w2.pdf_dict_get(pymupdf.PDF_NAME("Kids")) - - # check which widget has a suitable "Kids" array - if kids1.pdf_is_array(): - re_target(pdf, acro_fields, xref1, kids1, xref2, kids2) # pylint: disable=arguments-out-of-order - elif kids2.pdf_is_array(): - re_target(pdf, acro_fields, xref2, kids2, xref1, kids1) # pylint: disable=arguments-out-of-order - else: - new_target(pdf, acro_fields, xref1, w1, xref2, w2, name) # pylint: disable=arguments-out-of-order - - def get_kids(parent, kids_list): 
- """Return xref list of leaf kids for a parent. - - Call with an empty list. - """ - kids = mupdf.pdf_dict_get(parent, pymupdf.PDF_NAME("Kids")) - if not kids.pdf_is_array(): - return kids_list - for i in range(kids.pdf_array_len()): - kid = kids.pdf_array_get(i) - if mupdf.pdf_is_dict(mupdf.pdf_dict_get(kid, pymupdf.PDF_NAME("Kids"))): - kids_list = get_kids(kid, kids_list) - else: - kids_list.append(kid.pdf_to_num()) - return kids_list - - def kids_xrefs(widget): - """Get the xref of top "Parent" and the list of leaf widgets.""" - kids_list = [] - parent = mupdf.pdf_dict_get(widget, pymupdf.PDF_NAME("Parent")) - parent_xref = parent.pdf_to_num() - if parent_xref == 0: - return parent_xref, kids_list - kids_list = get_kids(parent, kids_list) - return parent_xref, kids_list - - def deduplicate_names(pdf, acro_fields, join_duplicates=False): - """Handle any widget name duplicates caused by the merge.""" - names = {} # key is a widget name, value a list of widgets having it. - - # extract all names and widgets in "AcroForm/Fields" - for i in range(mupdf.pdf_array_len(acro_fields)): - wobject = mupdf.pdf_array_get(acro_fields, i) - xref = wobject.pdf_to_num() - - # extract widget name and collect widget(s) using it - T = mupdf.pdf_dict_get_text_string(wobject, pymupdf.PDF_NAME("T")) - xrefs = names.get(T, []) - xrefs.append(xref) - names[T] = xrefs - - for name, xrefs in names.items(): - if len(xrefs) < 2: - continue - xref0, xref1 = xrefs[:2] # only exactly 2 should occur! 
- if join_duplicates: # combine fields with equal names - join_widgets(pdf, acro_fields, xref0, xref1, name) - else: # make field names unique - newname = name + f" [{xref1}]" # append this to the name - wobject = mupdf.pdf_load_object(pdf, xref1) - wobject.pdf_dict_put_text_string(pymupdf.PDF_NAME("T"), newname) - - clean_kid_parents(acro_fields) - - def get_acroform(doc): - """Retrieve the AcroForm dictionary form a PDF.""" - pdf = mupdf.pdf_document_from_fz_document(doc) - # AcroForm (= central form field info) - return mupdf.pdf_dict_getp(mupdf.pdf_trailer(pdf), "Root/AcroForm") - - tarpdf = mupdf.pdf_document_from_fz_document(tar) - srcpdf = mupdf.pdf_document_from_fz_document(src) - - if tar.is_form_pdf: - # target is a Form PDF, so use it to include source fields - acro = get_acroform(tar) - # Important arrays in AcroForm - acro_fields = acro.pdf_dict_get(pymupdf.PDF_NAME("Fields")) - tar_co = acro.pdf_dict_get(pymupdf.PDF_NAME("CO")) - if not tar_co.pdf_is_array(): - tar_co = acro.pdf_dict_put_array(pymupdf.PDF_NAME("CO"), 5) - else: - # target is no Form PDF, so copy over source AcroForm - acro = mupdf.pdf_deep_copy_obj(get_acroform(src)) # make a copy - - # Clear "Fields" and "CO" arrays: will be populated by page fields. - # This is required to avoid copying unneeded objects. 
- acro.pdf_dict_del(pymupdf.PDF_NAME("Fields")) - acro.pdf_dict_put_array(pymupdf.PDF_NAME("Fields"), 5) - acro.pdf_dict_del(pymupdf.PDF_NAME("CO")) - acro.pdf_dict_put_array(pymupdf.PDF_NAME("CO"), 5) - - # Enrich AcroForm for copying to target - acro_graft = mupdf.pdf_graft_mapped_object(graftmap, acro) - - # Insert AcroForm into target PDF - acro_tar = mupdf.pdf_add_object(tarpdf, acro_graft) - acro_fields = acro_tar.pdf_dict_get(pymupdf.PDF_NAME("Fields")) - tar_co = acro_tar.pdf_dict_get(pymupdf.PDF_NAME("CO")) - - # get its xref and insert it into target catalog - tar_xref = acro_tar.pdf_to_num() - acro_tar_ind = mupdf.pdf_new_indirect(tarpdf, tar_xref, 0) - root = mupdf.pdf_dict_get(mupdf.pdf_trailer(tarpdf), pymupdf.PDF_NAME("Root")) - root.pdf_dict_put(pymupdf.PDF_NAME("AcroForm"), acro_tar_ind) - - if from_page <= to_page: - src_range = range(from_page, to_page + 1) - else: - src_range = range(from_page, to_page - 1, -1) - - parents = {} # information about widget parents - - # remove "P" owning page reference from all widgets of all source pages - for i in src_range: - src_page = src[i] - for xref in [ - xref - for xref, wtype, _ in src_page.annot_xrefs() - if wtype == pymupdf.PDF_ANNOT_WIDGET # pylint: disable=no-member - ]: - w_obj = mupdf.pdf_load_object(srcpdf, xref) - w_obj.pdf_dict_del(pymupdf.PDF_NAME("P")) - - # get the widget's parent structure - parent_xref, old_kids = kids_xrefs(w_obj) - if parent_xref: - parents[parent_xref] = { - "new_xref": 0, - "old_kids": old_kids, - "new_kids": [], - } - # Copy over Parent widgets first - they are not page-dependent - for xref in parents.keys(): # pylint: disable=consider-using-dict-items - parent = mupdf.pdf_load_object(srcpdf, xref) - parent_graft = mupdf.pdf_graft_mapped_object(graftmap, parent) - parent_tar = mupdf.pdf_add_object(tarpdf, parent_graft) - kids_xrefs_new = get_kids(parent_tar, []) - parent_xref_new = parent_tar.pdf_to_num() - parent_ind = mupdf.pdf_new_indirect(tarpdf, parent_xref_new, 
0) - acro_fields.pdf_array_push(parent_ind) - parents[xref]["new_xref"] = parent_xref_new - parents[xref]["new_kids"] = kids_xrefs_new - - for i in range(len(src_range)): - # read first copied over page in target - tar_page = tar[start_at + i] - - # read the original page in the source PDF - src_page = src[src_range[i]] - - # now walk through source page widgets and copy over - w_xrefs = [ # widget xrefs of the source page - xref - for xref, wtype, _ in src_page.annot_xrefs() - if wtype == pymupdf.PDF_ANNOT_WIDGET # pylint: disable=no-member - ] - if not w_xrefs: # no widgets on this source page - continue - - # convert to formal PDF page - tar_page_pdf = mupdf.pdf_page_from_fz_page(tar_page) - - # extract annotations array - tar_annots = mupdf.pdf_dict_get(tar_page_pdf.obj(), pymupdf.PDF_NAME("Annots")) - if not mupdf.pdf_is_array(tar_annots): - tar_annots = mupdf.pdf_dict_put_array( - tar_page_pdf.obj(), pymupdf.PDF_NAME("Annots"), 5 - ) - - for xref in w_xrefs: - w_obj = mupdf.pdf_load_object(srcpdf, xref) - - # check if field takes part in inter-field validations - is_aac = mupdf.pdf_is_dict(mupdf.pdf_dict_getp(w_obj, "AA/C")) - - # check if parent of widget already in target - parent_xref = mupdf.pdf_to_num( - w_obj.pdf_dict_get(pymupdf.PDF_NAME("Parent")) - ) - if parent_xref == 0: # parent not in target yet - try: - w_obj_graft = mupdf.pdf_graft_mapped_object(graftmap, w_obj) - except Exception as e: - pymupdf.message_warning(f"cannot copy widget at {xref=}: {e}") - continue - w_obj_tar = mupdf.pdf_add_object(tarpdf, w_obj_graft) - tar_xref = w_obj_tar.pdf_to_num() - w_obj_tar_ind = mupdf.pdf_new_indirect(tarpdf, tar_xref, 0) - mupdf.pdf_array_push(tar_annots, w_obj_tar_ind) - mupdf.pdf_array_push(acro_fields, w_obj_tar_ind) - else: - parent = parents[parent_xref] - idx = parent["old_kids"].index(xref) # search for xref in parent - tar_xref = parent["new_kids"][idx] - w_obj_tar_ind = mupdf.pdf_new_indirect(tarpdf, tar_xref, 0) - 
mupdf.pdf_array_push(tar_annots, w_obj_tar_ind) - - # Into "AcroForm/CO" if a computation field. - if is_aac: - mupdf.pdf_array_push(tar_co, w_obj_tar_ind) - - deduplicate_names(tarpdf, acro_fields, join_duplicates=join_duplicates) - -def do_links( - doc1: pymupdf.Document, - doc2: pymupdf.Document, - from_page: int = -1, - to_page: int = -1, - start_at: int = -1, -) -> None: - """Insert links contained in copied page range into destination PDF. - - Parameter values **must** equal those of method insert_pdf(), which must - have been previously executed. - """ - #pymupdf.log( 'utils.do_links()') - # -------------------------------------------------------------------------- - # internal function to create the actual "/Annots" object string - # -------------------------------------------------------------------------- - def cre_annot(lnk, xref_dst, pno_src, ctm): - """Create annotation object string for a passed-in link.""" - - r = lnk["from"] * ctm # rect in PDF coordinates - rect = _format_g(tuple(r)) - if lnk["kind"] == pymupdf.LINK_GOTO: - txt = pymupdf.annot_skel["goto1"] # annot_goto - idx = pno_src.index(lnk["page"]) - p = lnk["to"] * ctm # target point in PDF coordinates - annot = txt(xref_dst[idx], p.x, p.y, lnk["zoom"], rect) - - elif lnk["kind"] == pymupdf.LINK_GOTOR: - if lnk["page"] >= 0: - txt = pymupdf.annot_skel["gotor1"] # annot_gotor - pnt = lnk.get("to", pymupdf.Point(0, 0)) # destination point - if type(pnt) is not pymupdf.Point: - pnt = pymupdf.Point(0, 0) - annot = txt( - lnk["page"], - pnt.x, - pnt.y, - lnk["zoom"], - lnk["file"], - lnk["file"], - rect, - ) - else: - txt = pymupdf.annot_skel["gotor2"] # annot_gotor_n - to = pymupdf.get_pdf_str(lnk["to"]) - to = to[1:-1] - f = lnk["file"] - annot = txt(to, f, rect) - - elif lnk["kind"] == pymupdf.LINK_LAUNCH: - txt = pymupdf.annot_skel["launch"] # annot_launch - annot = txt(lnk["file"], lnk["file"], rect) - - elif lnk["kind"] == pymupdf.LINK_URI: - txt = pymupdf.annot_skel["uri"] # annot_uri - 
annot = txt(lnk["uri"], rect) - - else: - annot = "" - - return annot - - # -------------------------------------------------------------------------- - - # validate & normalize parameters - if from_page < 0: - fp = 0 - elif from_page >= doc2.page_count: - fp = doc2.page_count - 1 - else: - fp = from_page - - if to_page < 0 or to_page >= doc2.page_count: - tp = doc2.page_count - 1 - else: - tp = to_page - - if start_at < 0: - raise ValueError("'start_at' must be >= 0") - sa = start_at - - incr = 1 if fp <= tp else -1 # page range could be reversed - - # lists of source / destination page numbers - pno_src = list(range(fp, tp + incr, incr)) - pno_dst = [sa + i for i in range(len(pno_src))] - - # lists of source / destination page xrefs - xref_src = [] - xref_dst = [] - for i in range(len(pno_src)): - p_src = pno_src[i] - p_dst = pno_dst[i] - old_xref = doc2.page_xref(p_src) - new_xref = doc1.page_xref(p_dst) - xref_src.append(old_xref) - xref_dst.append(new_xref) - - # create the links for each copied page in destination PDF - for i in range(len(xref_src)): - page_src = doc2[pno_src[i]] # load source page - links = page_src.get_links() # get all its links - #pymupdf.log( '{pno_src=}') - #pymupdf.log( '{type(page_src)=}') - #pymupdf.log( '{page_src=}') - #pymupdf.log( '{=i len(links)}') - if len(links) == 0: # no links there - page_src = None - continue - ctm = ~page_src.transformation_matrix # calc page transformation matrix - page_dst = doc1[pno_dst[i]] # load destination page - link_tab = [] # store all link definitions here - for l in links: - if l["kind"] == pymupdf.LINK_GOTO and (l["page"] not in pno_src): - continue # GOTO link target not in copied pages - annot_text = cre_annot(l, xref_dst, pno_src, ctm) - if annot_text: - link_tab.append(annot_text) - if link_tab != []: - page_dst._addAnnot_FromString( tuple(link_tab)) - #pymupdf.log( 'utils.do_links() returning.') - - def getLinkText(page: pymupdf.Page, lnk: dict) -> str: # 
-------------------------------------------------------------------------- # define skeletons for /Annots object texts @@ -2216,754 +707,6 @@ return annot -def delete_widget(page: pymupdf.Page, widget: pymupdf.Widget) -> pymupdf.Widget: - """Delete widget from page and return the next one.""" - pymupdf.CheckParent(page) - annot = getattr(widget, "_annot", None) - if annot is None: - raise ValueError("bad type: widget") - nextwidget = widget.next - page.delete_annot(annot) - widget._annot.parent = None - keylist = list(widget.__dict__.keys()) - for key in keylist: - del widget.__dict__[key] - return nextwidget - - -def update_link(page: pymupdf.Page, lnk: dict) -> None: - """Update a link on the current page.""" - pymupdf.CheckParent(page) - annot = getLinkText(page, lnk) - if annot == "": - raise ValueError("link kind not supported") - - page.parent.update_object(lnk["xref"], annot, page=page) - - -def insert_link(page: pymupdf.Page, lnk: dict, mark: bool = True) -> None: - """Insert a new link for the current page.""" - pymupdf.CheckParent(page) - annot = getLinkText(page, lnk) - if annot == "": - raise ValueError("link kind not supported") - page._addAnnot_FromString((annot,)) - - -def insert_textbox( - page: pymupdf.Page, - rect: rect_like, - buffer: typing.Union[str, list], - *, - fontname: str = "helv", - fontfile: OptStr = None, - set_simple: int = 0, - encoding: int = 0, - fontsize: float = 11, - lineheight: OptFloat = None, - color: OptSeq = None, - fill: OptSeq = None, - expandtabs: int = 1, - align: int = 0, - rotate: int = 0, - render_mode: int = 0, - miter_limit: float = 1, - border_width: float = 0.05, - morph: OptSeq = None, - overlay: bool = True, - stroke_opacity: float = 1, - fill_opacity: float = 1, - oc: int = 0, -) -> float: - """Insert text into a given rectangle. - - Notes: - Creates a Shape object, uses its same-named method and commits it. - Parameters: - rect: (rect-like) area to use for text. 
- buffer: text to be inserted - fontname: a Base-14 font, font name or '/name' - fontfile: name of a font file - fontsize: font size - lineheight: overwrite the font property - color: RGB color triple - expandtabs: handles tabulators with string function - align: left, center, right, justified - rotate: 0, 90, 180, or 270 degrees - morph: morph box with a matrix and a fixpoint - overlay: put text in foreground or background - Returns: - unused or deficit rectangle area (float) - """ - img = page.new_shape() - rc = img.insert_textbox( - rect, - buffer, - fontsize=fontsize, - lineheight=lineheight, - fontname=fontname, - fontfile=fontfile, - set_simple=set_simple, - encoding=encoding, - color=color, - fill=fill, - expandtabs=expandtabs, - render_mode=render_mode, - miter_limit=miter_limit, - border_width=border_width, - align=align, - rotate=rotate, - morph=morph, - stroke_opacity=stroke_opacity, - fill_opacity=fill_opacity, - oc=oc, - ) - if rc >= 0: - img.commit(overlay) - return rc - - -def insert_text( - page: pymupdf.Page, - point: point_like, - text: typing.Union[str, list], - *, - fontsize: float = 11, - lineheight: OptFloat = None, - fontname: str = "helv", - fontfile: OptStr = None, - set_simple: int = 0, - encoding: int = 0, - color: OptSeq = None, - fill: OptSeq = None, - border_width: float = 0.05, - miter_limit: float = 1, - render_mode: int = 0, - rotate: int = 0, - morph: OptSeq = None, - overlay: bool = True, - stroke_opacity: float = 1, - fill_opacity: float = 1, - oc: int = 0, -): - - img = page.new_shape() - rc = img.insert_text( - point, - text, - fontsize=fontsize, - lineheight=lineheight, - fontname=fontname, - fontfile=fontfile, - set_simple=set_simple, - encoding=encoding, - color=color, - fill=fill, - border_width=border_width, - render_mode=render_mode, - miter_limit=miter_limit, - rotate=rotate, - morph=morph, - stroke_opacity=stroke_opacity, - fill_opacity=fill_opacity, - oc=oc, - ) - if rc >= 0: - img.commit(overlay) - return rc - - -def 
insert_htmlbox( - page, - rect, - text, - *, - css=None, - scale_low=0, - archive=None, - rotate=0, - oc=0, - opacity=1, - overlay=True, -) -> float: - """Insert text with optional HTML tags and stylings into a rectangle. - - Args: - rect: (rect-like) rectangle into which the text should be placed. - text: (str) text with optional HTML tags and stylings. - css: (str) CSS styling commands. - scale_low: (float) force-fit content by scaling it down. Must be in - range [0, 1]. If 1, no scaling will take place. If 0, arbitrary - down-scaling is acceptable. A value of 0.1 would mean that content - may be scaled down by at most 90%. - archive: Archive object pointing to locations of used fonts or images - rotate: (int) rotate the text in the box by a multiple of 90 degrees. - oc: (int) the xref of an OCG / OCMD (Optional Content). - opacity: (float) set opacity of inserted content. - overlay: (bool) put text on top of page content. - Returns: - A tuple of floats (spare_height, scale). - spare_height: -1 if content did not fit, else >= 0. It is the height of the - unused (still available) rectangle stripe. Positive only if - scale_min = 1 (no down scaling). - scale: downscaling factor, 0 < scale <= 1. Set to 0 if spare_height = -1 (no fit). 
- """ - - # normalize rotation angle - if not rotate % 90 == 0: - raise ValueError("bad rotation angle") - while rotate < 0: - rotate += 360 - while rotate >= 360: - rotate -= 360 - - if not 0 <= scale_low <= 1: - raise ValueError("'scale_low' must be in [0, 1]") - - if css is None: - css = "" - - rect = pymupdf.Rect(rect) - if rotate in (90, 270): - temp_rect = pymupdf.Rect(0, 0, rect.height, rect.width) - else: - temp_rect = pymupdf.Rect(0, 0, rect.width, rect.height) - - # use a small border by default - mycss = "body {margin:1px;}" + css # append user CSS - - # either make a story, or accept a given one - if isinstance(text, str): # if a string, convert to a Story - story = pymupdf.Story(html=text, user_css=mycss, archive=archive) - elif isinstance(text, pymupdf.Story): - story = text - else: - raise ValueError("'text' must be a string or a Story") - # ---------------------------------------------------------------- - # Find a scaling factor that lets our story fit in - # ---------------------------------------------------------------- - scale_max = None if scale_low == 0 else 1 / scale_low - - fit = story.fit_scale(temp_rect, scale_min=1, scale_max=scale_max) - if not fit.big_enough: # there was no fit - return (-1, scale_low) - - filled = fit.filled - scale = 1 / fit.parameter # shrink factor - - spare_height = fit.rect.y1 - filled[3] # unused room at rectangle bottom - # Note: due to MuPDF's logic this may be negative even for successful fits. - if scale != 1 or spare_height < 0: # if scaling occurred, set spare_height to 0 - spare_height = 0 - - def rect_function(*args): - return fit.rect, fit.rect, pymupdf.Identity - - # draw story on temp PDF page - doc = story.write_with_links(rect_function) - - # Insert opacity if requested. - # For this, we prepend a command to the /Contents. 
- if 0 <= opacity < 1: - tpage = doc[0] # load page - # generate /ExtGstate for the page - alp0 = tpage._set_opacity(CA=opacity, ca=opacity) - s = f"/{alp0} gs\n" # generate graphic state command - pymupdf.TOOLS._insert_contents(tpage, s.encode(), 0) - - # put result in target page - page.show_pdf_page(rect, doc, 0, rotate=rotate, oc=oc, overlay=overlay) - - # ------------------------------------------------------------------------- - # re-insert links in target rect (show_pdf_page cannot copy annotations) - # ------------------------------------------------------------------------- - # scaled center point of fit.rect - mp1 = (fit.rect.tl + fit.rect.br) / 2 * scale - - # center point of target rect - mp2 = (rect.tl + rect.br) / 2 - - # compute link positioning matrix: - # - move center of scaled-down fit.rect to (0,0) - # - rotate - # - move (0,0) to center of target rect - mat = ( - pymupdf.Matrix(scale, 0, 0, scale, -mp1.x, -mp1.y) - * pymupdf.Matrix(-rotate) - * pymupdf.Matrix(1, 0, 0, 1, mp2.x, mp2.y) - ) - - # copy over links - for link in doc[0].get_links(): - link["from"] *= mat - page.insert_link(link) - - return spare_height, scale - - -def new_page( - doc: pymupdf.Document, - pno: int = -1, - width: float = 595, - height: float = 842, -) -> pymupdf.Page: - """Create and return a new page object. - - Args: - pno: (int) insert before this page. Default: after last page. - width: (float) page width in points. Default: 595 (ISO A4 width). - height: (float) page height in points. Default 842 (ISO A4 height). - Returns: - A pymupdf.Page object. - """ - doc._newPage(pno, width=width, height=height) - return doc[pno] - - -def insert_page( - doc: pymupdf.Document, - pno: int, - text: typing.Union[str, list, None] = None, - fontsize: float = 11, - width: float = 595, - height: float = 842, - fontname: str = "helv", - fontfile: OptStr = None, - color: OptSeq = (0,), -) -> int: - """Create a new PDF page and insert some text. 
- - Notes: - Function combining pymupdf.Document.new_page() and pymupdf.Page.insert_text(). - For parameter details see these methods. - """ - page = doc.new_page(pno=pno, width=width, height=height) - if not bool(text): - return 0 - rc = page.insert_text( - (50, 72), - text, - fontsize=fontsize, - fontname=fontname, - fontfile=fontfile, - color=color, - ) - return rc - - -def draw_line( - page: pymupdf.Page, - p1: point_like, - p2: point_like, - color: OptSeq = (0,), - dashes: OptStr = None, - width: float = 1, - lineCap: int = 0, - lineJoin: int = 0, - overlay: bool = True, - morph: OptSeq = None, - stroke_opacity: float = 1, - fill_opacity: float = 1, - oc=0, -) -> pymupdf.Point: - """Draw a line from point p1 to point p2.""" - img = page.new_shape() - p = img.draw_line(pymupdf.Point(p1), pymupdf.Point(p2)) - img.finish( - color=color, - dashes=dashes, - width=width, - closePath=False, - lineCap=lineCap, - lineJoin=lineJoin, - morph=morph, - stroke_opacity=stroke_opacity, - fill_opacity=fill_opacity, - oc=oc, - ) - img.commit(overlay) - - return p - - -def draw_squiggle( - page: pymupdf.Page, - p1: point_like, - p2: point_like, - breadth: float = 2, - color: OptSeq = (0,), - dashes: OptStr = None, - width: float = 1, - lineCap: int = 0, - lineJoin: int = 0, - overlay: bool = True, - morph: OptSeq = None, - stroke_opacity: float = 1, - fill_opacity: float = 1, - oc: int = 0, -) -> pymupdf.Point: - """Draw a squiggly line from point p1 to point p2.""" - img = page.new_shape() - p = img.draw_squiggle(pymupdf.Point(p1), pymupdf.Point(p2), breadth=breadth) - img.finish( - color=color, - dashes=dashes, - width=width, - closePath=False, - lineCap=lineCap, - lineJoin=lineJoin, - morph=morph, - stroke_opacity=stroke_opacity, - fill_opacity=fill_opacity, - oc=oc, - ) - img.commit(overlay) - - return p - - -def draw_zigzag( - page: pymupdf.Page, - p1: point_like, - p2: point_like, - breadth: float = 2, - color: OptSeq = (0,), - dashes: OptStr = None, - width: float = 1, - 
lineCap: int = 0, - lineJoin: int = 0, - overlay: bool = True, - morph: OptSeq = None, - stroke_opacity: float = 1, - fill_opacity: float = 1, - oc: int = 0, -) -> pymupdf.Point: - """Draw a zigzag line from point p1 to point p2.""" - img = page.new_shape() - p = img.draw_zigzag(pymupdf.Point(p1), pymupdf.Point(p2), breadth=breadth) - img.finish( - color=color, - dashes=dashes, - width=width, - closePath=False, - lineCap=lineCap, - lineJoin=lineJoin, - morph=morph, - stroke_opacity=stroke_opacity, - fill_opacity=fill_opacity, - oc=oc, - ) - img.commit(overlay) - - return p - - -def draw_rect( - page: pymupdf.Page, - rect: rect_like, - color: OptSeq = (0,), - fill: OptSeq = None, - dashes: OptStr = None, - width: float = 1, - lineCap: int = 0, - lineJoin: int = 0, - morph: OptSeq = None, - overlay: bool = True, - stroke_opacity: float = 1, - fill_opacity: float = 1, - oc: int = 0, - radius=None, - ) -> pymupdf.Point: - ''' - Draw a rectangle. See Shape class method for details. - ''' - img = page.new_shape() - Q = img.draw_rect(pymupdf.Rect(rect), radius=radius) - img.finish( - color=color, - fill=fill, - dashes=dashes, - width=width, - lineCap=lineCap, - lineJoin=lineJoin, - morph=morph, - stroke_opacity=stroke_opacity, - fill_opacity=fill_opacity, - oc=oc, - ) - img.commit(overlay) - - return Q - - -def draw_quad( - page: pymupdf.Page, - quad: quad_like, - color: OptSeq = (0,), - fill: OptSeq = None, - dashes: OptStr = None, - width: float = 1, - lineCap: int = 0, - lineJoin: int = 0, - morph: OptSeq = None, - overlay: bool = True, - stroke_opacity: float = 1, - fill_opacity: float = 1, - oc: int = 0, -) -> pymupdf.Point: - """Draw a quadrilateral.""" - img = page.new_shape() - Q = img.draw_quad(pymupdf.Quad(quad)) - img.finish( - color=color, - fill=fill, - dashes=dashes, - width=width, - lineCap=lineCap, - lineJoin=lineJoin, - morph=morph, - stroke_opacity=stroke_opacity, - fill_opacity=fill_opacity, - oc=oc, - ) - img.commit(overlay) - - return Q - - -def 
draw_polyline( - page: pymupdf.Page, - points: list, - color: OptSeq = (0,), - fill: OptSeq = None, - dashes: OptStr = None, - width: float = 1, - morph: OptSeq = None, - lineCap: int = 0, - lineJoin: int = 0, - overlay: bool = True, - closePath: bool = False, - stroke_opacity: float = 1, - fill_opacity: float = 1, - oc: int = 0, -) -> pymupdf.Point: - """Draw multiple connected line segments.""" - img = page.new_shape() - Q = img.draw_polyline(points) - img.finish( - color=color, - fill=fill, - dashes=dashes, - width=width, - lineCap=lineCap, - lineJoin=lineJoin, - morph=morph, - closePath=closePath, - stroke_opacity=stroke_opacity, - fill_opacity=fill_opacity, - oc=oc, - ) - img.commit(overlay) - - return Q - - -def draw_circle( - page: pymupdf.Page, - center: point_like, - radius: float, - color: OptSeq = (0,), - fill: OptSeq = None, - morph: OptSeq = None, - dashes: OptStr = None, - width: float = 1, - lineCap: int = 0, - lineJoin: int = 0, - overlay: bool = True, - stroke_opacity: float = 1, - fill_opacity: float = 1, - oc: int = 0, -) -> pymupdf.Point: - """Draw a circle given its center and radius.""" - img = page.new_shape() - Q = img.draw_circle(pymupdf.Point(center), radius) - img.finish( - color=color, - fill=fill, - dashes=dashes, - width=width, - lineCap=lineCap, - lineJoin=lineJoin, - morph=morph, - stroke_opacity=stroke_opacity, - fill_opacity=fill_opacity, - oc=oc, - ) - img.commit(overlay) - return Q - - -def draw_oval( - page: pymupdf.Page, - rect: typing.Union[rect_like, quad_like], - color: OptSeq = (0,), - fill: OptSeq = None, - dashes: OptStr = None, - morph: OptSeq = None, - width: float = 1, - lineCap: int = 0, - lineJoin: int = 0, - overlay: bool = True, - stroke_opacity: float = 1, - fill_opacity: float = 1, - oc: int = 0, -) -> pymupdf.Point: - """Draw an oval given its containing rectangle or quad.""" - img = page.new_shape() - Q = img.draw_oval(rect) - img.finish( - color=color, - fill=fill, - dashes=dashes, - width=width, - 
lineCap=lineCap, - lineJoin=lineJoin, - morph=morph, - stroke_opacity=stroke_opacity, - fill_opacity=fill_opacity, - oc=oc, - ) - img.commit(overlay) - - return Q - - -def draw_curve( - page: pymupdf.Page, - p1: point_like, - p2: point_like, - p3: point_like, - color: OptSeq = (0,), - fill: OptSeq = None, - dashes: OptStr = None, - width: float = 1, - morph: OptSeq = None, - closePath: bool = False, - lineCap: int = 0, - lineJoin: int = 0, - overlay: bool = True, - stroke_opacity: float = 1, - fill_opacity: float = 1, - oc: int = 0, -) -> pymupdf.Point: - """Draw a special Bezier curve from p1 to p3, generating control points on lines p1 to p2 and p2 to p3.""" - img = page.new_shape() - Q = img.draw_curve(pymupdf.Point(p1), pymupdf.Point(p2), pymupdf.Point(p3)) - img.finish( - color=color, - fill=fill, - dashes=dashes, - width=width, - lineCap=lineCap, - lineJoin=lineJoin, - morph=morph, - closePath=closePath, - stroke_opacity=stroke_opacity, - fill_opacity=fill_opacity, - oc=oc, - ) - img.commit(overlay) - - return Q - - -def draw_bezier( - page: pymupdf.Page, - p1: point_like, - p2: point_like, - p3: point_like, - p4: point_like, - color: OptSeq = (0,), - fill: OptSeq = None, - dashes: OptStr = None, - width: float = 1, - morph: OptStr = None, - closePath: bool = False, - lineCap: int = 0, - lineJoin: int = 0, - overlay: bool = True, - stroke_opacity: float = 1, - fill_opacity: float = 1, - oc: int = 0, -) -> pymupdf.Point: - """Draw a general cubic Bezier curve from p1 to p4 using control points p2 and p3.""" - img = page.new_shape() - Q = img.draw_bezier(pymupdf.Point(p1), pymupdf.Point(p2), pymupdf.Point(p3), pymupdf.Point(p4)) - img.finish( - color=color, - fill=fill, - dashes=dashes, - width=width, - lineCap=lineCap, - lineJoin=lineJoin, - morph=morph, - closePath=closePath, - stroke_opacity=stroke_opacity, - fill_opacity=fill_opacity, - oc=oc, - ) - img.commit(overlay) - - return Q - - -def draw_sector( - page: pymupdf.Page, - center: point_like, - point: 
point_like, - beta: float, - color: OptSeq = (0,), - fill: OptSeq = None, - dashes: OptStr = None, - fullSector: bool = True, - morph: OptSeq = None, - width: float = 1, - closePath: bool = False, - lineCap: int = 0, - lineJoin: int = 0, - overlay: bool = True, - stroke_opacity: float = 1, - fill_opacity: float = 1, - oc: int = 0, -) -> pymupdf.Point: - """Draw a circle sector given circle center, one arc end point and the angle of the arc. - - Parameters: - center -- center of circle - point -- arc end point - beta -- angle of arc (degrees) - fullSector -- connect arc ends with center - """ - img = page.new_shape() - Q = img.draw_sector(pymupdf.Point(center), pymupdf.Point(point), beta, fullSector=fullSector) - img.finish( - color=color, - fill=fill, - dashes=dashes, - width=width, - lineCap=lineCap, - lineJoin=lineJoin, - morph=morph, - closePath=closePath, - stroke_opacity=stroke_opacity, - fill_opacity=fill_opacity, - oc=oc, - ) - img.commit(overlay) - - return Q - - # ---------------------------------------------------------------------- # Name: wx.lib.colourdb.py # Purpose: Adds a bunch of colour names and RGB values to the @@ -3081,1360 +824,6 @@ return fontname, ext, stype, asc, dsc -def get_char_widths( - doc: pymupdf.Document, xref: int, limit: int = 256, idx: int = 0, fontdict: OptDict = None -) -> list: - """Get list of glyph information of a font. - - Notes: - Must be provided by its XREF number. If we already dealt with the - font, it will be recorded in doc.FontInfos. Otherwise we insert an - entry there. - Finally we return the glyphs for the font. This is a list of - (glyph, width) where glyph is an integer controlling the char - appearance, and width is a float controlling the char's spacing: - width * fontsize is the actual space. - For 'simple' fonts, glyph == ord(char) will usually be true. - Exceptions are 'Symbol' and 'ZapfDingbats'. We are providing data for these directly here. 
- """ - fontinfo = pymupdf.CheckFontInfo(doc, xref) - if fontinfo is None: # not recorded yet: create it - if fontdict is None: - name, ext, stype, asc, dsc = _get_font_properties(doc, xref) - fontdict = { - "name": name, - "type": stype, - "ext": ext, - "ascender": asc, - "descender": dsc, - } - else: - name = fontdict["name"] - ext = fontdict["ext"] - stype = fontdict["type"] - ordering = fontdict["ordering"] - simple = fontdict["simple"] - - if ext == "": - raise ValueError("xref is not a font") - - # check for 'simple' fonts - if stype in ("Type1", "MMType1", "TrueType"): - simple = True - else: - simple = False - - # check for CJK fonts - if name in ("Fangti", "Ming"): - ordering = 0 - elif name in ("Heiti", "Song"): - ordering = 1 - elif name in ("Gothic", "Mincho"): - ordering = 2 - elif name in ("Dotum", "Batang"): - ordering = 3 - else: - ordering = -1 - - fontdict["simple"] = simple - - if name == "ZapfDingbats": - glyphs = pymupdf.zapf_glyphs - elif name == "Symbol": - glyphs = pymupdf.symbol_glyphs - else: - glyphs = None - - fontdict["glyphs"] = glyphs - fontdict["ordering"] = ordering - fontinfo = [xref, fontdict] - doc.FontInfos.append(fontinfo) - else: - fontdict = fontinfo[1] - glyphs = fontdict["glyphs"] - simple = fontdict["simple"] - ordering = fontdict["ordering"] - - if glyphs is None: - oldlimit = 0 - else: - oldlimit = len(glyphs) - - mylimit = max(256, limit) - - if mylimit <= oldlimit: - return glyphs - - if ordering < 0: # not a CJK font - glyphs = doc._get_char_widths( - xref, fontdict["name"], fontdict["ext"], fontdict["ordering"], mylimit, idx - ) - else: # CJK fonts use char codes and width = 1 - glyphs = None - - fontdict["glyphs"] = glyphs - fontinfo[1] = fontdict - pymupdf.UpdateFontInfo(doc, fontinfo) - - return glyphs - - -class Shape: - """Create a new shape.""" - - @staticmethod - def horizontal_angle(C, P): - """Return the angle to the horizontal for the connection from C to P. 
- This uses the arcus sine function and resolves its inherent ambiguity by - looking up in which quadrant vector S = P - C is located. - """ - S = pymupdf.Point(P - C).unit # unit vector 'C' -> 'P' - alfa = math.asin(abs(S.y)) # absolute angle from horizontal - if S.x < 0: # make arcsin result unique - if S.y <= 0: # bottom-left - alfa = -(math.pi - alfa) - else: # top-left - alfa = math.pi - alfa - else: - if S.y >= 0: # top-right - pass - else: # bottom-right - alfa = -alfa - return alfa - - def __init__(self, page: pymupdf.Page): - pymupdf.CheckParent(page) - self.page = page - self.doc = page.parent - if not self.doc.is_pdf: - raise ValueError("is no PDF") - self.height = page.mediabox_size.y - self.width = page.mediabox_size.x - self.x = page.cropbox_position.x - self.y = page.cropbox_position.y - - self.pctm = page.transformation_matrix # page transf. matrix - self.ipctm = ~self.pctm # inverted transf. matrix - - self.draw_cont = "" - self.text_cont = "" - self.totalcont = "" - self.last_point = None - self.rect = None - - def updateRect(self, x): - if self.rect is None: - if len(x) == 2: - self.rect = pymupdf.Rect(x, x) - else: - self.rect = pymupdf.Rect(x) - - else: - if len(x) == 2: - x = pymupdf.Point(x) - self.rect.x0 = min(self.rect.x0, x.x) - self.rect.y0 = min(self.rect.y0, x.y) - self.rect.x1 = max(self.rect.x1, x.x) - self.rect.y1 = max(self.rect.y1, x.y) - else: - x = pymupdf.Rect(x) - self.rect.x0 = min(self.rect.x0, x.x0) - self.rect.y0 = min(self.rect.y0, x.y0) - self.rect.x1 = max(self.rect.x1, x.x1) - self.rect.y1 = max(self.rect.y1, x.y1) - - def draw_line(self, p1: point_like, p2: point_like) -> pymupdf.Point: - """Draw a line between two points.""" - p1 = pymupdf.Point(p1) - p2 = pymupdf.Point(p2) - if not (self.last_point == p1): - self.draw_cont += _format_g(pymupdf.JM_TUPLE(p1 * self.ipctm)) + " m\n" - self.last_point = p1 - self.updateRect(p1) - - self.draw_cont += _format_g(pymupdf.JM_TUPLE(p2 * self.ipctm)) + " l\n" - 
self.updateRect(p2) - self.last_point = p2 - return self.last_point - - def draw_polyline(self, points: list) -> pymupdf.Point: - """Draw several connected line segments.""" - for i, p in enumerate(points): - if i == 0: - if not (self.last_point == pymupdf.Point(p)): - self.draw_cont += _format_g(pymupdf.JM_TUPLE(pymupdf.Point(p) * self.ipctm)) + " m\n" - self.last_point = pymupdf.Point(p) - else: - self.draw_cont += _format_g(pymupdf.JM_TUPLE(pymupdf.Point(p) * self.ipctm)) + " l\n" - self.updateRect(p) - - self.last_point = pymupdf.Point(points[-1]) - return self.last_point - - def draw_bezier( - self, - p1: point_like, - p2: point_like, - p3: point_like, - p4: point_like, - ) -> pymupdf.Point: - """Draw a standard cubic Bezier curve.""" - p1 = pymupdf.Point(p1) - p2 = pymupdf.Point(p2) - p3 = pymupdf.Point(p3) - p4 = pymupdf.Point(p4) - if not (self.last_point == p1): - self.draw_cont += _format_g(pymupdf.JM_TUPLE(p1 * self.ipctm)) + " m\n" - args = pymupdf.JM_TUPLE(list(p2 * self.ipctm) + list(p3 * self.ipctm) + list(p4 * self.ipctm)) - self.draw_cont += _format_g(args) + " c\n" - self.updateRect(p1) - self.updateRect(p2) - self.updateRect(p3) - self.updateRect(p4) - self.last_point = p4 - return self.last_point - - def draw_oval(self, tetra: typing.Union[quad_like, rect_like]) -> pymupdf.Point: - """Draw an ellipse inside a tetrapod.""" - if len(tetra) != 4: - raise ValueError("invalid arg length") - if hasattr(tetra[0], "__float__"): - q = pymupdf.Rect(tetra).quad - else: - q = pymupdf.Quad(tetra) - - mt = q.ul + (q.ur - q.ul) * 0.5 - mr = q.ur + (q.lr - q.ur) * 0.5 - mb = q.ll + (q.lr - q.ll) * 0.5 - ml = q.ul + (q.ll - q.ul) * 0.5 - if not (self.last_point == ml): - self.draw_cont += _format_g(pymupdf.JM_TUPLE(ml * self.ipctm)) + " m\n" - self.last_point = ml - self.draw_curve(ml, q.ll, mb) - self.draw_curve(mb, q.lr, mr) - self.draw_curve(mr, q.ur, mt) - self.draw_curve(mt, q.ul, ml) - self.updateRect(q.rect) - self.last_point = ml - return self.last_point 
- - def draw_circle(self, center: point_like, radius: float) -> pymupdf.Point: - """Draw a circle given its center and radius.""" - if not radius > pymupdf.EPSILON: - raise ValueError("radius must be positive") - center = pymupdf.Point(center) - p1 = center - (radius, 0) - return self.draw_sector(center, p1, 360, fullSector=False) - - def draw_curve( - self, - p1: point_like, - p2: point_like, - p3: point_like, - ) -> pymupdf.Point: - """Draw a curve between points using one control point.""" - kappa = 0.55228474983 - p1 = pymupdf.Point(p1) - p2 = pymupdf.Point(p2) - p3 = pymupdf.Point(p3) - k1 = p1 + (p2 - p1) * kappa - k2 = p3 + (p2 - p3) * kappa - return self.draw_bezier(p1, k1, k2, p3) - - def draw_sector( - self, - center: point_like, - point: point_like, - beta: float, - fullSector: bool = True, - ) -> pymupdf.Point: - """Draw a circle sector.""" - center = pymupdf.Point(center) - point = pymupdf.Point(point) - l3 = lambda a, b: _format_g((a, b)) + " m\n" - l4 = lambda a, b, c, d, e, f: _format_g((a, b, c, d, e, f)) + " c\n" - l5 = lambda a, b: _format_g((a, b)) + " l\n" - betar = math.radians(-beta) - w360 = math.radians(math.copysign(360, betar)) * (-1) - w90 = math.radians(math.copysign(90, betar)) - w45 = w90 / 2 - while abs(betar) > 2 * math.pi: - betar += w360 # bring angle below 360 degrees - if not (self.last_point == point): - self.draw_cont += l3(*pymupdf.JM_TUPLE(point * self.ipctm)) - self.last_point = point - Q = pymupdf.Point(0, 0) # just make sure it exists - C = center - P = point - S = P - C # vector 'center' -> 'point' - rad = abs(S) # circle radius - - if not rad > pymupdf.EPSILON: - raise ValueError("radius must be positive") - - alfa = self.horizontal_angle(center, point) - while abs(betar) > abs(w90): # draw 90 degree arcs - q1 = C.x + math.cos(alfa + w90) * rad - q2 = C.y + math.sin(alfa + w90) * rad - Q = pymupdf.Point(q1, q2) # the arc's end point - r1 = C.x + math.cos(alfa + w45) * rad / math.cos(w45) - r2 = C.y + math.sin(alfa + 
w45) * rad / math.cos(w45) - R = pymupdf.Point(r1, r2) # crossing point of tangents - kappah = (1 - math.cos(w45)) * 4 / 3 / abs(R - Q) - kappa = kappah * abs(P - Q) - cp1 = P + (R - P) * kappa # control point 1 - cp2 = Q + (R - Q) * kappa # control point 2 - self.draw_cont += l4(*pymupdf.JM_TUPLE( - list(cp1 * self.ipctm) + list(cp2 * self.ipctm) + list(Q * self.ipctm) - )) - - betar -= w90 # reduce param angle by 90 deg - alfa += w90 # advance start angle by 90 deg - P = Q # advance to arc end point - # draw (remaining) arc - if abs(betar) > 1e-3: # significant degrees left? - beta2 = betar / 2 - q1 = C.x + math.cos(alfa + betar) * rad - q2 = C.y + math.sin(alfa + betar) * rad - Q = pymupdf.Point(q1, q2) # the arc's end point - r1 = C.x + math.cos(alfa + beta2) * rad / math.cos(beta2) - r2 = C.y + math.sin(alfa + beta2) * rad / math.cos(beta2) - R = pymupdf.Point(r1, r2) # crossing point of tangents - # kappa height is 4/3 of segment height - kappah = (1 - math.cos(beta2)) * 4 / 3 / abs(R - Q) # kappa height - kappa = kappah * abs(P - Q) / (1 - math.cos(betar)) - cp1 = P + (R - P) * kappa # control point 1 - cp2 = Q + (R - Q) * kappa # control point 2 - self.draw_cont += l4(*pymupdf.JM_TUPLE( - list(cp1 * self.ipctm) + list(cp2 * self.ipctm) + list(Q * self.ipctm) - )) - if fullSector: - self.draw_cont += l3(*pymupdf.JM_TUPLE(point * self.ipctm)) - self.draw_cont += l5(*pymupdf.JM_TUPLE(center * self.ipctm)) - self.draw_cont += l5(*pymupdf.JM_TUPLE(Q * self.ipctm)) - self.last_point = Q - return self.last_point - - def draw_rect(self, rect: rect_like, *, radius=None) -> pymupdf.Point: - """Draw a rectangle. - - Args: - radius: if not None, the rectangle will have rounded corners. - This is the radius of the curvature, given as percentage of - the rectangle width or height. Valid are values 0 < v <= 0.5. - For a sequence of two values, the corners will have different - radii. Otherwise, the percentage will be computed from the - shorter side. 
A value of (0.5, 0.5) will draw an ellipse. - """ - r = pymupdf.Rect(rect) - if radius is None: # standard rectangle - self.draw_cont += _format_g(pymupdf.JM_TUPLE( - list(r.bl * self.ipctm) + [r.width, r.height] - )) + " re\n" - self.updateRect(r) - self.last_point = r.tl - return self.last_point - # rounded corners requested. This requires 1 or 2 values, each - # with 0 < value <= 0.5 - if hasattr(radius, "__float__"): - if radius <= 0 or radius > 0.5: - raise ValueError(f"bad radius value {radius}.") - d = min(r.width, r.height) * radius - px = (d, 0) - py = (0, d) - elif hasattr(radius, "__len__") and len(radius) == 2: - rx, ry = radius - px = (rx * r.width, 0) - py = (0, ry * r.height) - if min(rx, ry) <= 0 or max(rx, ry) > 0.5: - raise ValueError(f"bad radius value {radius}.") - else: - raise ValueError(f"bad radius value {radius}.") - - lp = self.draw_line(r.tl + py, r.bl - py) - lp = self.draw_curve(lp, r.bl, r.bl + px) - - lp = self.draw_line(lp, r.br - px) - lp = self.draw_curve(lp, r.br, r.br - py) - - lp = self.draw_line(lp, r.tr + py) - lp = self.draw_curve(lp, r.tr, r.tr - px) - - lp = self.draw_line(lp, r.tl + px) - self.last_point = self.draw_curve(lp, r.tl, r.tl + py) - - self.updateRect(r) - return self.last_point - - def draw_quad(self, quad: quad_like) -> pymupdf.Point: - """Draw a Quad.""" - q = pymupdf.Quad(quad) - return self.draw_polyline([q.ul, q.ll, q.lr, q.ur, q.ul]) - - def draw_zigzag( - self, - p1: point_like, - p2: point_like, - breadth: float = 2, - ) -> pymupdf.Point: - """Draw a zig-zagged line from p1 to p2.""" - p1 = pymupdf.Point(p1) - p2 = pymupdf.Point(p2) - S = p2 - p1 # vector start - end - rad = abs(S) # distance of points - cnt = 4 * int(round(rad / (4 * breadth), 0)) # always take full phases - if cnt < 4: - raise ValueError("points too close") - mb = rad / cnt # revised breadth - matrix = pymupdf.Matrix(pymupdf.util_hor_matrix(p1, p2)) # normalize line to x-axis - i_mat = ~matrix # get original position - points = [] # 
stores edges - for i in range(1, cnt): - if i % 4 == 1: # point "above" connection - p = pymupdf.Point(i, -1) * mb - elif i % 4 == 3: # point "below" connection - p = pymupdf.Point(i, 1) * mb - else: # ignore others - continue - points.append(p * i_mat) - self.draw_polyline([p1] + points + [p2]) # add start and end points - return p2 - - def draw_squiggle( - self, - p1: point_like, - p2: point_like, - breadth=2, - ) -> pymupdf.Point: - """Draw a squiggly line from p1 to p2.""" - p1 = pymupdf.Point(p1) - p2 = pymupdf.Point(p2) - S = p2 - p1 # vector start - end - rad = abs(S) # distance of points - cnt = 4 * int(round(rad / (4 * breadth), 0)) # always take full phases - if cnt < 4: - raise ValueError("points too close") - mb = rad / cnt # revised breadth - matrix = pymupdf.Matrix(pymupdf.util_hor_matrix(p1, p2)) # normalize line to x-axis - i_mat = ~matrix # get original position - k = 2.4142135623765633 # y of draw_curve helper point - - points = [] # stores edges - for i in range(1, cnt): - if i % 4 == 1: # point "above" connection - p = pymupdf.Point(i, -k) * mb - elif i % 4 == 3: # point "below" connection - p = pymupdf.Point(i, k) * mb - else: # else on connection line - p = pymupdf.Point(i, 0) * mb - points.append(p * i_mat) - - points = [p1] + points + [p2] - cnt = len(points) - i = 0 - while i + 2 < cnt: - self.draw_curve(points[i], points[i + 1], points[i + 2]) - i += 2 - return p2 - - # ============================================================================== - # Shape.insert_text - # ============================================================================== - def insert_text( - self, - point: point_like, - buffer: typing.Union[str, list], - *, - fontsize: float = 11, - lineheight: OptFloat = None, - fontname: str = "helv", - fontfile: OptStr = None, - set_simple: bool = 0, - encoding: int = 0, - color: OptSeq = None, - fill: OptSeq = None, - render_mode: int = 0, - border_width: float = 0.05, - miter_limit: float = 1, - rotate: int = 0, - morph: 
OptSeq = None, - stroke_opacity: float = 1, - fill_opacity: float = 1, - oc: int = 0, - ) -> int: - - # ensure 'text' is a list of strings, worth dealing with - if not bool(buffer): - return 0 - - if type(buffer) not in (list, tuple): - text = buffer.splitlines() - else: - text = buffer - - if not len(text) > 0: - return 0 - - point = pymupdf.Point(point) - try: - maxcode = max([ord(c) for c in " ".join(text)]) - except Exception: - pymupdf.exception_info() - return 0 - - # ensure valid 'fontname' - fname = fontname - if fname.startswith("/"): - fname = fname[1:] - - xref = self.page.insert_font( - fontname=fname, fontfile=fontfile, encoding=encoding, set_simple=set_simple - ) - fontinfo = pymupdf.CheckFontInfo(self.doc, xref) - - fontdict = fontinfo[1] - ordering = fontdict["ordering"] - simple = fontdict["simple"] - bfname = fontdict["name"] - ascender = fontdict["ascender"] - descender = fontdict["descender"] - if lineheight: - lheight = fontsize * lineheight - elif ascender - descender <= 1: - lheight = fontsize * 1.2 - else: - lheight = fontsize * (ascender - descender) - - if maxcode > 255: - glyphs = self.doc.get_char_widths(xref, maxcode + 1) - else: - glyphs = fontdict["glyphs"] - - tab = [] - for t in text: - if simple and bfname not in ("Symbol", "ZapfDingbats"): - g = None - else: - g = glyphs - tab.append(pymupdf.getTJstr(t, g, simple, ordering)) - text = tab - - color_str = pymupdf.ColorCode(color, "c") - fill_str = pymupdf.ColorCode(fill, "f") - if not fill and render_mode == 0: # ensure fill color when 0 Tr - fill = color - fill_str = pymupdf.ColorCode(color, "f") - - morphing = pymupdf.CheckMorph(morph) - rot = rotate - if rot % 90 != 0: - raise ValueError("bad rotate value") - - while rot < 0: - rot += 360 - rot = rot % 360 # text rotate = 0, 90, 270, 180 - - templ1 = lambda a, b, c, d, e, f, g: f"\nq\n{a}{b}BT\n{c}1 0 0 1 {_format_g((d, e))} Tm\n/{f} {_format_g(g)} Tf " - templ2 = lambda a: f"TJ\n0 -{_format_g(a)} TD\n" - cmp90 = "0 1 -1 0 0 0 
cm\n" # rotates 90 deg counter-clockwise - cmm90 = "0 -1 1 0 0 0 cm\n" # rotates 90 deg clockwise - cm180 = "-1 0 0 -1 0 0 cm\n" # rotates by 180 deg. - height = self.height - width = self.width - - # setting up for standard rotation directions - # case rotate = 0 - if morphing: - m1 = pymupdf.Matrix(1, 0, 0, 1, morph[0].x + self.x, height - morph[0].y - self.y) - mat = ~m1 * morph[1] * m1 - cm = _format_g(pymupdf.JM_TUPLE(mat)) + " cm\n" - else: - cm = "" - top = height - point.y - self.y # start of 1st char - left = point.x + self.x # start of 1. char - space = top # space available - #headroom = point.y + self.y # distance to page border - if rot == 90: - left = height - point.y - self.y - top = -point.x - self.x - cm += cmp90 - space = width - abs(top) - #headroom = point.x + self.x - - elif rot == 270: - left = -height + point.y + self.y - top = point.x + self.x - cm += cmm90 - space = abs(top) - #headroom = width - point.x - self.x - - elif rot == 180: - left = -point.x - self.x - top = -height + point.y + self.y - cm += cm180 - space = abs(point.y + self.y) - #headroom = height - point.y - self.y - - optcont = self.page._get_optional_content(oc) - if optcont is not None: - bdc = "/OC /%s BDC\n" % optcont - emc = "EMC\n" - else: - bdc = emc = "" - - alpha = self.page._set_opacity(CA=stroke_opacity, ca=fill_opacity) - if alpha is None: - alpha = "" - else: - alpha = "/%s gs\n" % alpha - nres = templ1(bdc, alpha, cm, left, top, fname, fontsize) - - if render_mode > 0: - nres += "%i Tr " % render_mode - nres += _format_g(border_width * fontsize) + " w " - if miter_limit is not None: - nres += _format_g(miter_limit) + " M " - if color is not None: - nres += color_str - if fill is not None: - nres += fill_str - - # ========================================================================= - # start text insertion - # ========================================================================= - nres += text[0] - nlines = 1 # set output line counter - if len(text) > 1: 
- nres += templ2(lheight) # line 1 - else: - nres += 'TJ' - for i in range(1, len(text)): - if space < lheight: - break # no space left on page - if i > 1: - nres += "\nT* " - nres += text[i] + 'TJ' - space -= lheight - nlines += 1 - - nres += "\nET\n%sQ\n" % emc - - # ========================================================================= - # end of text insertion - # ========================================================================= - # update the /Contents object - self.text_cont += nres - return nlines - - # ============================================================================== - # Shape.insert_textbox - # ============================================================================== - def insert_textbox( - self, - rect: rect_like, - buffer: typing.Union[str, list], - *, - fontname: OptStr = "helv", - fontfile: OptStr = None, - fontsize: float = 11, - lineheight: OptFloat = None, - set_simple: bool = 0, - encoding: int = 0, - color: OptSeq = None, - fill: OptSeq = None, - expandtabs: int = 1, - border_width: float = 0.05, - miter_limit: float = 1, - align: int = 0, - render_mode: int = 0, - rotate: int = 0, - morph: OptSeq = None, - stroke_opacity: float = 1, - fill_opacity: float = 1, - oc: int = 0, - ) -> float: - """Insert text into a given rectangle. 
- - Args: - rect -- the textbox to fill - buffer -- text to be inserted - fontname -- a Base-14 font, font name or '/name' - fontfile -- name of a font file - fontsize -- font size - lineheight -- overwrite the font property - color -- RGB stroke color triple - fill -- RGB fill color triple - render_mode -- text rendering control - border_width -- thickness of glyph borders as percentage of fontsize - expandtabs -- handles tabulators with string function - align -- left, center, right, justified - rotate -- 0, 90, 180, or 270 degrees - morph -- morph box with a matrix and a fixpoint - Returns: - unused or deficit rectangle area (float) - """ - rect = pymupdf.Rect(rect) - if rect.is_empty or rect.is_infinite: - raise ValueError("text box must be finite and not empty") - - color_str = pymupdf.ColorCode(color, "c") - fill_str = pymupdf.ColorCode(fill, "f") - if fill is None and render_mode == 0: # ensure fill color for 0 Tr - fill = color - fill_str = pymupdf.ColorCode(color, "f") - - optcont = self.page._get_optional_content(oc) - if optcont is not None: - bdc = "/OC /%s BDC\n" % optcont - emc = "EMC\n" - else: - bdc = emc = "" - - # determine opacity / transparency - alpha = self.page._set_opacity(CA=stroke_opacity, ca=fill_opacity) - if alpha is None: - alpha = "" - else: - alpha = "/%s gs\n" % alpha - - if rotate % 90 != 0: - raise ValueError("rotate must be multiple of 90") - - rot = rotate - while rot < 0: - rot += 360 - rot = rot % 360 - - # is buffer worth of dealing with? - if not bool(buffer): - return rect.height if rot in (0, 180) else rect.width - - cmp90 = "0 1 -1 0 0 0 cm\n" # rotates counter-clockwise - cmm90 = "0 -1 1 0 0 0 cm\n" # rotates clockwise - cm180 = "-1 0 0 -1 0 0 cm\n" # rotates by 180 deg. 
- height = self.height - - fname = fontname - if fname.startswith("/"): - fname = fname[1:] - - xref = self.page.insert_font( - fontname=fname, fontfile=fontfile, encoding=encoding, set_simple=set_simple - ) - fontinfo = pymupdf.CheckFontInfo(self.doc, xref) - - fontdict = fontinfo[1] - ordering = fontdict["ordering"] - simple = fontdict["simple"] - glyphs = fontdict["glyphs"] - bfname = fontdict["name"] - ascender = fontdict["ascender"] - descender = fontdict["descender"] - - if lineheight: - lheight_factor = lineheight - elif ascender - descender <= 1: - lheight_factor = 1.2 - else: - lheight_factor = ascender - descender - lheight = fontsize * lheight_factor - - # create a list from buffer, split into its lines - if type(buffer) in (list, tuple): - t0 = "\n".join(buffer) - else: - t0 = buffer - - maxcode = max([ord(c) for c in t0]) - # replace invalid char codes for simple fonts - if simple and maxcode > 255: - t0 = "".join([c if ord(c) < 256 else "?" for c in t0]) - - t0 = t0.splitlines() - - glyphs = self.doc.get_char_widths(xref, maxcode + 1) - if simple and bfname not in ("Symbol", "ZapfDingbats"): - tj_glyphs = None - else: - tj_glyphs = glyphs - - # ---------------------------------------------------------------------- - # calculate pixel length of a string - # ---------------------------------------------------------------------- - def pixlen(x): - """Calculate pixel length of x.""" - if ordering < 0: - return sum([glyphs[ord(c)][1] for c in x]) * fontsize - else: - return len(x) * fontsize - - # --------------------------------------------------------------------- - - if ordering < 0: - blen = glyphs[32][1] * fontsize # pixel size of space character - else: - blen = fontsize - - text = "" # output buffer - - if pymupdf.CheckMorph(morph): - m1 = pymupdf.Matrix( - 1, 0, 0, 1, morph[0].x + self.x, self.height - morph[0].y - self.y - ) - mat = ~m1 * morph[1] * m1 - cm = _format_g(pymupdf.JM_TUPLE(mat)) + " cm\n" - else: - cm = "" - - # 
--------------------------------------------------------------------- - # adjust for text orientation / rotation - # --------------------------------------------------------------------- - progr = 1 # direction of line progress - c_pnt = pymupdf.Point(0, fontsize * ascender) # used for line progress - if rot == 0: # normal orientation - point = rect.tl + c_pnt # line 1 is 'lheight' below top - maxwidth = rect.width # pixels available in one line - maxheight = rect.height # available text height - - elif rot == 90: # rotate counter clockwise - c_pnt = pymupdf.Point(fontsize * ascender, 0) # progress in x-direction - point = rect.bl + c_pnt # line 1 'lheight' away from left - maxwidth = rect.height # pixels available in one line - maxheight = rect.width # available text height - cm += cmp90 - - elif rot == 180: # text upside down - # progress upwards in y direction - c_pnt = -pymupdf.Point(0, fontsize * ascender) - point = rect.br + c_pnt # line 1 'lheight' above bottom - maxwidth = rect.width # pixels available in one line - progr = -1 # subtract lheight for next line - maxheight =rect.height # available text height - cm += cm180 - - else: # rotate clockwise (270 or -90) - # progress from right to left - c_pnt = -pymupdf.Point(fontsize * ascender, 0) - point = rect.tr + c_pnt # line 1 'lheight' left of right - maxwidth = rect.height # pixels available in one line - progr = -1 # subtract lheight for next line - maxheight = rect.width # available text height - cm += cmm90 - - # ===================================================================== - # line loop - # ===================================================================== - just_tab = [] # 'justify' indicators per line - - for i, line in enumerate(t0): - line_t = line.expandtabs(expandtabs).split(" ") # split into words - num_words = len(line_t) - lbuff = "" # init line buffer - rest = maxwidth # available line pixels - # ================================================================= - # word loop - # 
================================================================= - for j in range(num_words): - word = line_t[j] - pl_w = pixlen(word) # pixel len of word - if rest >= pl_w: # does it fit on the line? - lbuff += word + " " # yes, append word - rest -= pl_w + blen # update available line space - continue # next word - - # word doesn't fit - output line (if not empty) - if lbuff: - lbuff = lbuff.rstrip() + "\n" # line full, append line break - text += lbuff # append to total text - just_tab.append(True) # can align-justify - - lbuff = "" # re-init line buffer - rest = maxwidth # re-init avail. space - - if pl_w <= maxwidth: # word shorter than 1 line? - lbuff = word + " " # start the line with it - rest = maxwidth - pl_w - blen # update free space - continue - - # long word: split across multiple lines - char by char ... - if len(just_tab) > 0: - just_tab[-1] = False # cannot align-justify - for c in word: - if pixlen(lbuff) <= maxwidth - pixlen(c): - lbuff += c - else: # line full - lbuff += "\n" # close line - text += lbuff # append to text - just_tab.append(False) # cannot align-justify - lbuff = c # start new line with this char - - lbuff += " " # finish long word - rest = maxwidth - pixlen(lbuff) # long word stored - - if lbuff: # unprocessed line content? - text += lbuff.rstrip() # append to text - just_tab.append(False) # cannot align-justify - - if i < len(t0) - 1: # not the last line? 
- text += "\n" # insert line break - - # compute used part of the textbox - if text.endswith("\n"): - text = text[:-1] - lb_count = text.count("\n") + 1 # number of lines written - - # text height = line count * line height plus one descender value - text_height = lheight * lb_count - descender * fontsize - - more = text_height - maxheight # difference to height limit - if more > pymupdf.EPSILON: # landed too much outside rect - return (-1) * more # return deficit, don't output - - more = abs(more) - if more < pymupdf.EPSILON: - more = 0 # don't bother with epsilons - nres = "\nq\n%s%sBT\n" % (bdc, alpha) + cm # initialize output buffer - templ = lambda a, b, c, d: f"1 0 0 1 {_format_g((a, b))} Tm /{c} {_format_g(d)} Tf " - # center, right, justify: output each line with its own specifics - text_t = text.splitlines() # split text in lines again - just_tab[-1] = False # never justify last line - for i, t in enumerate(text_t): - spacing = 0 - pl = maxwidth - pixlen(t) # length of empty line part - pnt = point + c_pnt * (i * lheight_factor) # text start of line - if align == 1: # center: right shift by half width - if rot in (0, 180): - pnt = pnt + pymupdf.Point(pl / 2, 0) * progr - else: - pnt = pnt - pymupdf.Point(0, pl / 2) * progr - elif align == 2: # right: right shift by full width - if rot in (0, 180): - pnt = pnt + pymupdf.Point(pl, 0) * progr - else: - pnt = pnt - pymupdf.Point(0, pl) * progr - elif align == 3: # justify - spaces = t.count(" ") # number of spaces in line - if spaces > 0 and just_tab[i]: # if any, and we may justify - spacing = pl / spaces # make every space this much larger - else: - spacing = 0 # keep normal space length - top = height - pnt.y - self.y - left = pnt.x + self.x - if rot == 90: - left = height - pnt.y - self.y - top = -pnt.x - self.x - elif rot == 270: - left = -height + pnt.y + self.y - top = pnt.x + self.x - elif rot == 180: - left = -pnt.x - self.x - top = -height + pnt.y + self.y - - nres += templ(left, top, fname, 
fontsize) - - if render_mode > 0: - nres += "%i Tr " % render_mode - nres += _format_g(border_width * fontsize) + " w " - if miter_limit is not None: - nres += _format_g(miter_limit) + " M " - - if align == 3: - nres += _format_g(spacing) + " Tw " - - if color is not None: - nres += color_str - if fill is not None: - nres += fill_str - nres += "%sTJ\n" % pymupdf.getTJstr(t, tj_glyphs, simple, ordering) - - nres += "ET\n%sQ\n" % emc - - self.text_cont += nres - self.updateRect(rect) - return more - - def finish( - self, - width: float = 1, - color: OptSeq = (0,), - fill: OptSeq = None, - lineCap: int = 0, - lineJoin: int = 0, - dashes: OptStr = None, - even_odd: bool = False, - morph: OptSeq = None, - closePath: bool = True, - fill_opacity: float = 1, - stroke_opacity: float = 1, - oc: int = 0, - ) -> None: - """Finish the current drawing segment. - - Notes: - Apply colors, opacity, dashes, line style and width, or - morphing. Also whether to close the path - by connecting last to first point. 
- """ - if self.draw_cont == "": # treat empty contents as no-op - return - - if width == 0: # border color makes no sense then - color = None - elif color is None: # vice versa - width = 0 - # if color == None and fill == None: - # raise ValueError("at least one of 'color' or 'fill' must be given") - color_str = pymupdf.ColorCode(color, "c") # ensure proper color string - fill_str = pymupdf.ColorCode(fill, "f") # ensure proper fill string - - optcont = self.page._get_optional_content(oc) - if optcont is not None: - self.draw_cont = "/OC /%s BDC\n" % optcont + self.draw_cont - emc = "EMC\n" - else: - emc = "" - - alpha = self.page._set_opacity(CA=stroke_opacity, ca=fill_opacity) - if alpha is not None: - self.draw_cont = "/%s gs\n" % alpha + self.draw_cont - - if width != 1 and width != 0: - self.draw_cont += _format_g(width) + " w\n" - - if lineCap != 0: - self.draw_cont = "%i J\n" % lineCap + self.draw_cont - if lineJoin != 0: - self.draw_cont = "%i j\n" % lineJoin + self.draw_cont - - if dashes not in (None, "", "[] 0"): - self.draw_cont = "%s d\n" % dashes + self.draw_cont - - if closePath: - self.draw_cont += "h\n" - self.last_point = None - - if color is not None: - self.draw_cont += color_str - - if fill is not None: - self.draw_cont += fill_str - if color is not None: - if not even_odd: - self.draw_cont += "B\n" - else: - self.draw_cont += "B*\n" - else: - if not even_odd: - self.draw_cont += "f\n" - else: - self.draw_cont += "f*\n" - else: - self.draw_cont += "S\n" - - self.draw_cont += emc - if pymupdf.CheckMorph(morph): - m1 = pymupdf.Matrix( - 1, 0, 0, 1, morph[0].x + self.x, self.height - morph[0].y - self.y - ) - mat = ~m1 * morph[1] * m1 - self.draw_cont = _format_g(pymupdf.JM_TUPLE(mat)) + " cm\n" + self.draw_cont - - self.totalcont += "\nq\n" + self.draw_cont + "Q\n" - self.draw_cont = "" - self.last_point = None - return - - def commit(self, overlay: bool = True) -> None: - """Update the page's /Contents object with Shape data. 
- - The argument controls whether data appear in foreground (default) - or background. - """ - pymupdf.CheckParent(self.page) # doc may have died meanwhile - self.totalcont += self.text_cont - self.totalcont = self.totalcont.encode() - - if self.totalcont: - if overlay: - self.page.wrap_contents() # ensure a balanced graphics state - # make /Contents object with dummy stream - xref = pymupdf.TOOLS._insert_contents(self.page, b" ", overlay) - # update it with potential compression - self.doc.update_stream(xref, self.totalcont) - - self.last_point = None # clean up ... - self.rect = None # - self.draw_cont = "" # for potential ... - self.text_cont = "" # ... - self.totalcont = "" # re-use - - -def apply_redactions( - page: pymupdf.Page, images: int = 2, graphics: int = 1, text: int = 0 -) -> bool: - """Apply the redaction annotations of the page. - - Args: - page: the PDF page. - images: - 0 - ignore images - 1 - remove all overlapping images - 2 - blank out overlapping image parts - 3 - remove image unless invisible - graphics: - 0 - ignore graphics - 1 - remove graphics if contained in rectangle - 2 - remove all overlapping graphics - text: - 0 - remove text - 1 - ignore text - """ - - def center_rect(annot_rect, new_text, font, fsize): - """Calculate minimal sub-rectangle for the overlay text. - - Notes: - Because 'insert_textbox' supports no vertical text centering, - we calculate an approximate number of lines here and return a - sub-rect with smaller height, which should still be sufficient. - Args: - annot_rect: the annotation rectangle - new_text: the text to insert. - font: the fontname. Must be one of the CJK or Base-14 set, else - the rectangle is returned unchanged. - fsize: the fontsize - Returns: - A rectangle to use instead of the annot rectangle. 
- """ - if not new_text or annot_rect.width <= pymupdf.EPSILON: - return annot_rect - try: - text_width = pymupdf.get_text_length(new_text, font, fsize) - except (ValueError, mupdf.FzErrorBase): # unsupported font - if g_exceptions_verbose: - pymupdf.exception_info() - return annot_rect - line_height = fsize * 1.2 - limit = annot_rect.width - h = math.ceil(text_width / limit) * line_height # estimate rect height - if h >= annot_rect.height: - return annot_rect - r = annot_rect - y = (annot_rect.tl.y + annot_rect.bl.y - h) * 0.5 - r.y0 = y - return r - - pymupdf.CheckParent(page) - doc = page.parent - if doc.is_encrypted or doc.is_closed: - raise ValueError("document closed or encrypted") - if not doc.is_pdf: - raise ValueError("is no PDF") - - redact_annots = [] # storage of annot values - for annot in page.annots( - types=(pymupdf.PDF_ANNOT_REDACT,) # pylint: disable=no-member - ): - # loop redactions - redact_annots.append(annot._get_redact_values()) # save annot values - - if redact_annots == []: # any redactions on this page? 
- return False # no redactions - - rc = page._apply_redactions(text, images, graphics) # call MuPDF - if not rc: # should not happen really - raise ValueError("Error applying redactions.") - - # now write replacement text in old redact rectangles - shape = page.new_shape() - for redact in redact_annots: - annot_rect = redact["rect"] - fill = redact["fill"] - if fill: - shape.draw_rect(annot_rect) # colorize the rect background - shape.finish(fill=fill, color=fill) - if "text" in redact.keys(): # if we also have text - new_text = redact["text"] - align = redact.get("align", 0) - fname = redact["fontname"] - fsize = redact["fontsize"] - color = redact["text_color"] - # try finding vertical centered sub-rect - trect = center_rect(annot_rect, new_text, fname, fsize) - - rc = -1 - while rc < 0 and fsize >= 4: # while not enough room - # (re-) try insertion - rc = shape.insert_textbox( - trect, - new_text, - fontname=fname, - fontsize=fsize, - color=color, - align=align, - ) - fsize -= 0.5 # reduce font if unsuccessful - shape.commit() # append new contents object - return True - - -# ------------------------------------------------------------------------------ -# Remove potentially sensitive data from a PDF. Similar to the Adobe -# Acrobat 'sanitize' function -# ------------------------------------------------------------------------------ -def scrub( - doc: pymupdf.Document, - attached_files: bool = True, - clean_pages: bool = True, - embedded_files: bool = True, - hidden_text: bool = True, - javascript: bool = True, - metadata: bool = True, - redactions: bool = True, - redact_images: int = 0, - remove_links: bool = True, - reset_fields: bool = True, - reset_responses: bool = True, - thumbnails: bool = True, - xml_metadata: bool = True, -) -> None: - def remove_hidden(cont_lines): - """Remove hidden text from a PDF page. - - Args: - cont_lines: list of lines with /Contents content. Should have status - from after page.cleanContents(). 
- - Returns: - List of /Contents lines from which hidden text has been removed. - - Notes: - The input must have been created after the page's /Contents object(s) - have been cleaned with page.cleanContents(). This ensures a standard - formatting: one command per line, single spaces between operators. - This allows for drastic simplification of this code. - """ - out_lines = [] # will return this - in_text = False # indicate if within BT/ET object - suppress = False # indicate text suppression active - make_return = False - for line in cont_lines: - if line == b"BT": # start of text object - in_text = True # switch on - out_lines.append(line) # output it - continue - if line == b"ET": # end of text object - in_text = False # switch off - out_lines.append(line) # output it - continue - if line == b"3 Tr": # text suppression operator - suppress = True # switch on - make_return = True - continue - if line[-2:] == b"Tr" and line[0] != b"3": - suppress = False # text rendering changed - out_lines.append(line) - continue - if line == b"Q": # unstack command also switches off - suppress = False - out_lines.append(line) - continue - if suppress and in_text: # suppress hidden lines - continue - out_lines.append(line) - if make_return: - return out_lines - else: - return None - - if not doc.is_pdf: # only works for PDF - raise ValueError("is no PDF") - if doc.is_encrypted or doc.is_closed: - raise ValueError("closed or encrypted doc") - - if not clean_pages: - hidden_text = False - redactions = False - - if metadata: - doc.set_metadata({}) # remove standard metadata - - for page in doc: - if reset_fields: - # reset form fields (widgets) - for widget in page.widgets(): - widget.reset() - - if remove_links: - links = page.get_links() # list of all links on page - for link in links: # remove all links - page.delete_link(link) - - found_redacts = False - for annot in page.annots(): - if annot.type[0] == mupdf.PDF_ANNOT_FILE_ATTACHMENT and attached_files: - 
annot.update_file(buffer_=b" ") # set file content to empty - if reset_responses: - annot.delete_responses() - if annot.type[0] == pymupdf.PDF_ANNOT_REDACT: # pylint: disable=no-member - found_redacts = True - - if redactions and found_redacts: - page.apply_redactions(images=redact_images) - - if not (clean_pages or hidden_text): - continue # done with the page - - page.clean_contents() - if not page.get_contents(): - continue - if hidden_text: - xref = page.get_contents()[0] # only one b/o cleaning! - cont = doc.xref_stream(xref) - cont_lines = remove_hidden(cont.splitlines()) # remove hidden text - if cont_lines: # something was actually removed - cont = b"\n".join(cont_lines) - doc.update_stream(xref, cont) # rewrite the page /Contents - - if thumbnails: # remove page thumbnails? - if doc.xref_get_key(page.xref, "Thumb")[0] != "null": - doc.xref_set_key(page.xref, "Thumb", "null") - - # pages are scrubbed, now perform document-wide scrubbing - # remove embedded files - if embedded_files: - for name in doc.embfile_names(): - doc.embfile_del(name) - - if xml_metadata: - doc.del_xml_metadata() - if not (xml_metadata or javascript): - xref_limit = 0 - else: - xref_limit = doc.xref_length() - for xref in range(1, xref_limit): - if not doc.xref_object(xref): - msg = "bad xref %i - clean PDF before scrubbing" % xref - raise ValueError(msg) - if javascript and doc.xref_get_key(xref, "S")[1] == "/JavaScript": - # a /JavaScript action object - obj = "<</S/JavaScript/JS()>>" # replace with a null JavaScript - doc.update_object(xref, obj) # update this object - continue # no further handling - - if not xml_metadata: - continue - - if doc.xref_get_key(xref, "Type")[1] == "/Metadata": - # delete any metadata object directly - doc.update_object(xref, "<<>>") - doc.update_stream(xref, b"deleted", new=True) - continue - - if doc.xref_get_key(xref, "Metadata")[0] != "null": - doc.xref_set_key(xref, "Metadata", "null") - - def _show_fz_text( text): #if mupdf_cppyy: # assert 
isinstance( text, cppyy.gbl.mupdf.Text) @@ -4451,418 +840,6 @@ span = span.next return f'num_spans={num_spans} num_chars={num_chars}' -def fill_textbox( - writer: pymupdf.TextWriter, - rect: rect_like, - text: typing.Union[str, list], - pos: point_like = None, - font: typing.Optional[pymupdf.Font] = None, - fontsize: float = 11, - lineheight: OptFloat = None, - align: int = 0, - warn: bool = None, - right_to_left: bool = False, - small_caps: bool = False, -) -> tuple: - """Fill a rectangle with text. - - Args: - writer: pymupdf.TextWriter object (= "self") - rect: rect-like to receive the text. - text: string or list/tuple of strings. - pos: point-like start position of first word. - font: pymupdf.Font object (default pymupdf.Font('helv')). - fontsize: the fontsize. - lineheight: overwrite the font property - align: (int) 0 = left, 1 = center, 2 = right, 3 = justify - warn: (bool) text overflow action: none, warn, or exception - right_to_left: (bool) indicate right-to-left language. - """ - rect = pymupdf.Rect(rect) - if rect.is_empty: - raise ValueError("fill rect must not empty.") - if type(font) is not pymupdf.Font: - font = pymupdf.Font("helv") - - def textlen(x): - """Return length of a string.""" - return font.text_length( - x, fontsize=fontsize, small_caps=small_caps - ) # abbreviation - - def char_lengths(x): - """Return list of single character lengths for a string.""" - return font.char_lengths(x, fontsize=fontsize, small_caps=small_caps) - - def append_this(pos, text): - ret = writer.append( - pos, text, font=font, fontsize=fontsize, small_caps=small_caps - ) - return ret - - tolerance = fontsize * 0.2 # extra distance to left border - space_len = textlen(" ") - std_width = rect.width - tolerance - std_start = rect.x0 + tolerance - - def norm_words(width, words): - """Cut any word in pieces no longer than 'width'.""" - nwords = [] - word_lengths = [] - for w in words: - wl_lst = char_lengths(w) - wl = sum(wl_lst) - if wl <= width: # nothing to do - copy 
over - nwords.append(w) - word_lengths.append(wl) - continue - - # word longer than rect width - split it in parts - n = len(wl_lst) - while n > 0: - wl = sum(wl_lst[:n]) - if wl <= width: - nwords.append(w[:n]) - word_lengths.append(wl) - w = w[n:] - wl_lst = wl_lst[n:] - n = len(wl_lst) - else: - n -= 1 - return nwords, word_lengths - - def output_justify(start, line): - """Justified output of a line.""" - # ignore leading / trailing / multiple spaces - words = [w for w in line.split(" ") if w != ""] - nwords = len(words) - if nwords == 0: - return - if nwords == 1: # single word cannot be justified - append_this(start, words[0]) - return - tl = sum([textlen(w) for w in words]) # total word lengths - gaps = nwords - 1 # number of word gaps - gapl = (std_width - tl) / gaps # width of each gap - for w in words: - _, lp = append_this(start, w) # output one word - start.x = lp.x + gapl # next start at word end plus gap - return - - asc = font.ascender - dsc = font.descender - if not lineheight: - if asc - dsc <= 1: - lheight = 1.2 - else: - lheight = asc - dsc - else: - lheight = lineheight - - LINEHEIGHT = fontsize * lheight # effective line height - width = std_width # available horizontal space - - # starting point of text - if pos is not None: - pos = pymupdf.Point(pos) - else: # default is just below rect top-left - pos = rect.tl + (tolerance, fontsize * asc) - if pos not in rect: - raise ValueError("Text must start in rectangle.") - - # calculate displacement factor for alignment - if align == pymupdf.TEXT_ALIGN_CENTER: - factor = 0.5 - elif align == pymupdf.TEXT_ALIGN_RIGHT: - factor = 1.0 - else: - factor = 0 - - # split in lines if just a string was given - if type(text) is str: - textlines = text.splitlines() - else: - textlines = [] - for line in text: - textlines.extend(line.splitlines()) - - max_lines = int((rect.y1 - pos.y) / LINEHEIGHT) + 1 - - new_lines = [] # the final list of textbox lines - no_justify = [] # no justify for these line numbers - for 
i, line in enumerate(textlines): - if line in ("", " "): - new_lines.append((line, space_len)) - width = rect.width - tolerance - no_justify.append((len(new_lines) - 1)) - continue - if i == 0: - width = rect.x1 - pos.x - else: - width = rect.width - tolerance - - if right_to_left: # reverses Arabic / Hebrew text front to back - line = writer.clean_rtl(line) - tl = textlen(line) - if tl <= width: # line short enough - new_lines.append((line, tl)) - no_justify.append((len(new_lines) - 1)) - continue - - # we need to split the line in fitting parts - words = line.split(" ") # the words in the line - - # cut in parts any words that are longer than rect width - words, word_lengths = norm_words(width, words) - - n = len(words) - while True: - line0 = " ".join(words[:n]) - wl = sum(word_lengths[:n]) + space_len * (n - 1) - if wl <= width: - new_lines.append((line0, wl)) - words = words[n:] - word_lengths = word_lengths[n:] - n = len(words) - line0 = None - else: - n -= 1 - - if len(words) == 0: - break - assert n - - # ------------------------------------------------------------------------- - # List of lines created. Each item is (text, tl), where 'tl' is the PDF - # output length (float) and 'text' is the text. Except for justified text, - # this is output-ready. - # ------------------------------------------------------------------------- - nlines = len(new_lines) - if nlines > max_lines: - msg = "Only fitting %i of %i lines." 
% (max_lines, nlines) - if warn is None: - pass - elif warn: - pymupdf.message("Warning: " + msg) - else: - raise ValueError(msg) - - start = pymupdf.Point() - no_justify += [len(new_lines) - 1] # no justifying of last line - for i in range(max_lines): - try: - line, tl = new_lines.pop(0) - except IndexError: - if g_exceptions_verbose >= 2: pymupdf.exception_info() - break - - if right_to_left: # Arabic, Hebrew - line = "".join(reversed(line)) - - if i == 0: # may have different start for first line - start = pos - - if align == pymupdf.TEXT_ALIGN_JUSTIFY and i not in no_justify and tl < std_width: - output_justify(start, line) - start.x = std_start - start.y += LINEHEIGHT - continue - - if i > 0 or pos.x == std_start: # left, center, right alignments - start.x += (width - tl) * factor - - append_this(start, line) - start.x = std_start - start.y += LINEHEIGHT - - return new_lines # return non-written lines - - -# ------------------------------------------------------------------------ -# Optional Content functions -# ------------------------------------------------------------------------ -def get_oc(doc: pymupdf.Document, xref: int) -> int: - """Return optional content object xref for an image or form xobject. - - Args: - xref: (int) xref number of an image or form xobject. - """ - if doc.is_closed or doc.is_encrypted: - raise ValueError("document close or encrypted") - t, name = doc.xref_get_key(xref, "Subtype") - if t != "name" or name not in ("/Image", "/Form"): - raise ValueError("bad object type at xref %i" % xref) - t, oc = doc.xref_get_key(xref, "OC") - if t != "xref": - return 0 - rc = int(oc.replace("0 R", "")) - return rc - - -def set_oc(doc: pymupdf.Document, xref: int, oc: int) -> None: - """Attach optional content object to image or form xobject. 
- - Args: - xref: (int) xref number of an image or form xobject - oc: (int) xref number of an OCG or OCMD - """ - if doc.is_closed or doc.is_encrypted: - raise ValueError("document close or encrypted") - t, name = doc.xref_get_key(xref, "Subtype") - if t != "name" or name not in ("/Image", "/Form"): - raise ValueError("bad object type at xref %i" % xref) - if oc > 0: - t, name = doc.xref_get_key(oc, "Type") - if t != "name" or name not in ("/OCG", "/OCMD"): - raise ValueError("bad object type at xref %i" % oc) - if oc == 0 and "OC" in doc.xref_get_keys(xref): - doc.xref_set_key(xref, "OC", "null") - return None - doc.xref_set_key(xref, "OC", "%i 0 R" % oc) - return None - - -def set_ocmd( - doc: pymupdf.Document, - xref: int = 0, - ocgs: typing.Union[list, None] = None, - policy: OptStr = None, - ve: typing.Union[list, None] = None, -) -> int: - """Create or update an OCMD object in a PDF document. - - Args: - xref: (int) 0 for creating a new object, otherwise update existing one. - ocgs: (list) OCG xref numbers, which shall be subject to 'policy'. - policy: one of 'AllOn', 'AllOff', 'AnyOn', 'AnyOff' (any casing). - ve: (list) visibility expression. Use instead of 'ocgs' with 'policy'. - - Returns: - Xref of the created or updated OCMD. 
- """ - - all_ocgs = set(doc.get_ocgs().keys()) - - def ve_maker(ve): - if type(ve) not in (list, tuple) or len(ve) < 2: - raise ValueError("bad 've' format: %s" % ve) - if ve[0].lower() not in ("and", "or", "not"): - raise ValueError("bad operand: %s" % ve[0]) - if ve[0].lower() == "not" and len(ve) != 2: - raise ValueError("bad 've' format: %s" % ve) - item = "[/%s" % ve[0].title() - for x in ve[1:]: - if type(x) is int: - if x not in all_ocgs: - raise ValueError("bad OCG %i" % x) - item += " %i 0 R" % x - else: - item += " %s" % ve_maker(x) - item += "]" - return item - - text = "<</Type/OCMD" - - if ocgs and type(ocgs) in (list, tuple): # some OCGs are provided - s = set(ocgs).difference(all_ocgs) # contains illegal xrefs - if s != set(): - msg = "bad OCGs: %s" % s - raise ValueError(msg) - text += "/OCGs[" + " ".join(map(lambda x: "%i 0 R" % x, ocgs)) + "]" - - if policy: - policy = str(policy).lower() - pols = { - "anyon": "AnyOn", - "allon": "AllOn", - "anyoff": "AnyOff", - "alloff": "AllOff", - } - if policy not in ("anyon", "allon", "anyoff", "alloff"): - raise ValueError("bad policy: %s" % policy) - text += "/P/%s" % pols[policy] - - if ve: - text += "/VE%s" % ve_maker(ve) - - text += ">>" - - # make new object or replace old OCMD (check type first) - if xref == 0: - xref = doc.get_new_xref() - elif "/Type/OCMD" not in doc.xref_object(xref, compressed=True): - raise ValueError("bad xref or not an OCMD") - doc.update_object(xref, text) - return xref - - -def get_ocmd(doc: pymupdf.Document, xref: int) -> dict: - """Return the definition of an OCMD (optional content membership dictionary). - - Recognizes PDF dict keys /OCGs (PDF array of OCGs), /P (policy string) and - /VE (visibility expression, PDF array). Via string manipulation, this - info is converted to a Python dictionary with keys "xref", "ocgs", "policy" - and "ve" - ready to recycle as input for 'set_ocmd()'. 
- """ - - if xref not in range(doc.xref_length()): - raise ValueError("bad xref") - text = doc.xref_object(xref, compressed=True) - if "/Type/OCMD" not in text: - raise ValueError("bad object type") - textlen = len(text) - - p0 = text.find("/OCGs[") # look for /OCGs key - p1 = text.find("]", p0) - if p0 < 0 or p1 < 0: # no OCGs found - ocgs = None - else: - ocgs = text[p0 + 6 : p1].replace("0 R", " ").split() - ocgs = list(map(int, ocgs)) - - p0 = text.find("/P/") # look for /P policy key - if p0 < 0: - policy = None - else: - p1 = text.find("ff", p0) - if p1 < 0: - p1 = text.find("on", p0) - if p1 < 0: # some irregular syntax - raise ValueError("bad object at xref") - else: - policy = text[p0 + 3 : p1 + 2] - - p0 = text.find("/VE[") # look for /VE visibility expression key - if p0 < 0: # no visibility expression found - ve = None - else: - lp = rp = 0 # find end of /VE by finding last ']'. - p1 = p0 - while lp < 1 or lp != rp: - p1 += 1 - if not p1 < textlen: # some irregular syntax - raise ValueError("bad object at xref") - if text[p1] == "[": - lp += 1 - if text[p1] == "]": - rp += 1 - # p1 now positioned at the last "]" - ve = text[p0 + 3 : p1 + 1] # the PDF /VE array - ve = ( - ve.replace("/And", '"and",') - .replace("/Not", '"not",') - .replace("/Or", '"or",') - ) - ve = ve.replace(" 0 R]", "]").replace(" 0 R", ",").replace("][", "],[") - import json - try: - ve = json.loads(ve) - except Exception: - pymupdf.exception_info() - pymupdf.message(f"bad /VE key: {ve!r}") - raise - return {"xref": xref, "ocgs": ocgs, "policy": policy, "ve": ve} - """ Handle page labels for PDF documents. @@ -4937,50 +914,6 @@ return construct_label(style, prefix, pagenumber) -def get_label(page): - """Return the label for this PDF page. - - Args: - page: page object. - Returns: - The label (str) of the page. Errors return an empty string. 
- """ - # Jorj McKie, 2021-01-06 - - labels = page.parent._get_page_labels() - if not labels: - return "" - labels.sort() - return get_label_pno(page.number, labels) - - -def get_page_numbers(doc, label, only_one=False): - """Return a list of page numbers with the given label. - - Args: - doc: PDF document object (resp. 'self'). - label: (str) label. - only_one: (bool) stop searching after first hit. - Returns: - List of page numbers having this label. - """ - # Jorj McKie, 2021-01-06 - - numbers = [] - if not label: - return numbers - labels = doc._get_page_labels() - if labels == []: - return numbers - for i in range(doc.page_count): - plabel = get_label_pno(i, labels) - if plabel == label: - numbers.append(i) - if only_one: - break - return numbers - - def construct_label(style, prefix, pno) -> str: """Construct a label based on style, prefix and page number.""" # William Chapman, 2021-01-06 @@ -5049,94 +982,6 @@ return "".join([a for a in roman_num(num)]) -def get_page_labels(doc): - """Return page label definitions in PDF document. - - Args: - doc: PDF document (resp. 'self'). - Returns: - A list of dictionaries with the following format: - {'startpage': int, 'prefix': str, 'style': str, 'firstpagenum': int}. - """ - # Jorj McKie, 2021-01-10 - return [rule_dict(item) for item in doc._get_page_labels()] - - -def set_page_labels(doc, labels): - """Add / replace page label definitions in PDF document. - - Args: - doc: PDF document (resp. 'self'). - labels: list of label dictionaries like: - {'startpage': int, 'prefix': str, 'style': str, 'firstpagenum': int}, - as returned by get_page_labels(). - """ - # William Chapman, 2021-01-06 - - def create_label_str(label): - """Convert Python label dict to corresponding PDF rule string. - - Args: - label: (dict) build rule for the label. - Returns: - PDF label rule string wrapped in "<<", ">>". 
- """ - s = "%i<<" % label["startpage"] - if label.get("prefix", "") != "": - s += "/P(%s)" % label["prefix"] - if label.get("style", "") != "": - s += "/S/%s" % label["style"] - if label.get("firstpagenum", 1) > 1: - s += "/St %i" % label["firstpagenum"] - s += ">>" - return s - - def create_nums(labels): - """Return concatenated string of all labels rules. - - Args: - labels: (list) dictionaries as created by function 'rule_dict'. - Returns: - PDF compatible string for page label definitions, ready to be - enclosed in PDF array 'Nums[...]'. - """ - labels.sort(key=lambda x: x["startpage"]) - s = "".join([create_label_str(label) for label in labels]) - return s - - doc._set_page_labels(create_nums(labels)) - - -# End of Page Label Code ------------------------------------------------- - - -def has_links(doc: pymupdf.Document) -> bool: - """Check whether there are links on any page.""" - if doc.is_closed: - raise ValueError("document closed") - if not doc.is_pdf: - raise ValueError("is no PDF") - for i in range(doc.page_count): - for item in doc.page_annot_xrefs(i): - if item[1] == pymupdf.PDF_ANNOT_LINK: # pylint: disable=no-member - return True - return False - - -def has_annots(doc: pymupdf.Document) -> bool: - """Check whether there are annotations on any page.""" - if doc.is_closed: - raise ValueError("document closed") - if not doc.is_pdf: - raise ValueError("is no PDF") - for i in range(doc.page_count): - for item in doc.page_annot_xrefs(i): - # pylint: disable=no-member - if not (item[1] == pymupdf.PDF_ANNOT_LINK or item[1] == pymupdf.PDF_ANNOT_WIDGET): # pylint: disable=no-member - return True - return False - - # ------------------------------------------------------------------- # Functions to recover the quad contained in a text extraction bbox # ------------------------------------------------------------------- @@ -5322,358 +1167,3 @@ raise ValueError("bad span argument") return recover_bbox_quad(line_dir, span, bbox) - - -# 
------------------------------------------------------------------- -# Building font subsets using fontTools -# ------------------------------------------------------------------- -def subset_fonts(doc: pymupdf.Document, verbose: bool = False, fallback: bool = False) -> OptInt: - """Build font subsets in a PDF. - - Eligible fonts are potentially replaced by smaller versions. Page text is - NOT rewritten and thus should retain properties like being hidden or - controlled by optional content. - - This method by default uses MuPDF's own internal feature to create subset - fonts. As this is a new function, errors may still occur. In this case, - please fall back to using the previous version by using "fallback=True". - Fallback mode requires the external package 'fontTools'. - - Args: - fallback: use the older deprecated implementation. - verbose: only used by fallback mode. - - Returns: - The new MuPDF-based code returns None. The deprecated fallback - mode returns 0 if there are no fonts to subset. Otherwise, it - returns the decrease in fontsize (the difference in fontsize), - measured in bytes. - """ - # Font binaries: - "buffer" -> (names, xrefs, (unicodes, glyphs)) - # An embedded font is uniquely defined by its fontbuffer only. It may have - # multiple names and xrefs. - # Once the sets of used unicodes and glyphs are known, we compute a - # smaller version of the buffer user package fontTools. 
- - if not fallback: # by default use MuPDF function - pdf = mupdf.pdf_document_from_fz_document(doc) - mupdf.pdf_subset_fonts2(pdf, list(range(doc.page_count))) - return - - font_buffers = {} - - def get_old_widths(xref): - """Retrieve old font '/W' and '/DW' values.""" - df = doc.xref_get_key(xref, "DescendantFonts") - if df[0] != "array": # only handle xref specifications - return None, None - df_xref = int(df[1][1:-1].replace("0 R", "")) - widths = doc.xref_get_key(df_xref, "W") - if widths[0] != "array": # no widths key found - widths = None - else: - widths = widths[1] - dwidths = doc.xref_get_key(df_xref, "DW") - if dwidths[0] != "int": - dwidths = None - else: - dwidths = dwidths[1] - return widths, dwidths - - def set_old_widths(xref, widths, dwidths): - """Restore the old '/W' and '/DW' in subsetted font. - - If either parameter is None or evaluates to False, the corresponding - dictionary key will be set to null. - """ - df = doc.xref_get_key(xref, "DescendantFonts") - if df[0] != "array": # only handle xref specs - return None - df_xref = int(df[1][1:-1].replace("0 R", "")) - if (type(widths) is not str or not widths) and doc.xref_get_key(df_xref, "W")[ - 0 - ] != "null": - doc.xref_set_key(df_xref, "W", "null") - else: - doc.xref_set_key(df_xref, "W", widths) - if (type(dwidths) is not str or not dwidths) and doc.xref_get_key( - df_xref, "DW" - )[0] != "null": - doc.xref_set_key(df_xref, "DW", "null") - else: - doc.xref_set_key(df_xref, "DW", dwidths) - return None - - def set_subset_fontname(new_xref): - """Generate a name prefix to tag a font as subset. - - We use a random generator to select 6 upper case ASCII characters. - The prefixed name must be put in the font xref as the "/BaseFont" value - and in the FontDescriptor object as the '/FontName' value. 
- """ - # The following generates a prefix like 'ABCDEF+' - import random - import string - prefix = "".join(random.choices(tuple(string.ascii_uppercase), k=6)) + "+" - font_str = doc.xref_object(new_xref, compressed=True) - font_str = font_str.replace("/BaseFont/", "/BaseFont/" + prefix) - df = doc.xref_get_key(new_xref, "DescendantFonts") - if df[0] == "array": - df_xref = int(df[1][1:-1].replace("0 R", "")) - fd = doc.xref_get_key(df_xref, "FontDescriptor") - if fd[0] == "xref": - fd_xref = int(fd[1].replace("0 R", "")) - fd_str = doc.xref_object(fd_xref, compressed=True) - fd_str = fd_str.replace("/FontName/", "/FontName/" + prefix) - doc.update_object(fd_xref, fd_str) - doc.update_object(new_xref, font_str) - - def build_subset(buffer, unc_set, gid_set): - """Build font subset using fontTools. - - Args: - buffer: (bytes) the font given as a binary buffer. - unc_set: (set) required glyph ids. - Returns: - Either None if subsetting is unsuccessful or the subset font buffer. - """ - try: - import fontTools.subset as fts - except ImportError: - if g_exceptions_verbose: pymupdf.exception_info() - pymupdf.message("This method requires fontTools to be installed.") - raise - import tempfile - with tempfile.TemporaryDirectory() as tmp_dir: - oldfont_path = f"{tmp_dir}/oldfont.ttf" - newfont_path = f"{tmp_dir}/newfont.ttf" - uncfile_path = f"{tmp_dir}/uncfile.txt" - args = [ - oldfont_path, - "--retain-gids", - f"--output-file={newfont_path}", - "--layout-features=*", - "--passthrough-tables", - "--ignore-missing-glyphs", - "--ignore-missing-unicodes", - "--symbol-cmap", - ] - - # store glyph ids or unicodes as file - with open(f"{tmp_dir}/uncfile.txt", "w", encoding='utf8') as unc_file: - if 0xFFFD in unc_set: # error unicode exists -> use glyphs - args.append(f"--gids-file={uncfile_path}") - gid_set.add(189) - unc_list = list(gid_set) - for unc in unc_list: - unc_file.write("%i\n" % unc) - else: - args.append(f"--unicodes-file={uncfile_path}") - unc_set.add(255) - 
unc_list = list(unc_set) - for unc in unc_list: - unc_file.write("%04x\n" % unc) - - # store fontbuffer as a file - with open(oldfont_path, "wb") as fontfile: - fontfile.write(buffer) - try: - os.remove(newfont_path) # remove old file - except Exception: - pass - try: # invoke fontTools subsetter - fts.main(args) - font = pymupdf.Font(fontfile=newfont_path) - new_buffer = font.buffer # subset font binary - if font.glyph_count == 0: # intercept empty font - new_buffer = None - except Exception: - pymupdf.exception_info() - new_buffer = None - return new_buffer - - def repl_fontnames(doc): - """Populate 'font_buffers'. - - For each font candidate, store its xref and the list of names - by which PDF text may refer to it (there may be multiple). - """ - - def norm_name(name): - """Recreate font name that contains PDF hex codes. - - E.g. #20 -> space, chr(32) - """ - while "#" in name: - p = name.find("#") - c = int(name[p + 1 : p + 3], 16) - name = name.replace(name[p : p + 3], chr(c)) - return name - - def get_fontnames(doc, item): - """Return a list of fontnames for an item of page.get_fonts(). - - There may be multiple names e.g. for Type0 fonts. 
- """ - fontname = item[3] - names = [fontname] - fontname = doc.xref_get_key(item[0], "BaseFont")[1][1:] - fontname = norm_name(fontname) - if fontname not in names: - names.append(fontname) - descendents = doc.xref_get_key(item[0], "DescendantFonts") - if descendents[0] != "array": - return names - descendents = descendents[1][1:-1] - if descendents.endswith(" 0 R"): - xref = int(descendents[:-4]) - descendents = doc.xref_object(xref, compressed=True) - p1 = descendents.find("/BaseFont") - if p1 >= 0: - p2 = descendents.find("/", p1 + 1) - p1 = min(descendents.find("/", p2 + 1), descendents.find(">>", p2 + 1)) - fontname = descendents[p2 + 1 : p1] - fontname = norm_name(fontname) - if fontname not in names: - names.append(fontname) - return names - - for i in range(doc.page_count): - for f in doc.get_page_fonts(i, full=True): - font_xref = f[0] # font xref - font_ext = f[1] # font file extension - basename = f[3] # font basename - - if font_ext not in ( # skip if not supported by fontTools - "otf", - "ttf", - "woff", - "woff2", - ): - continue - # skip fonts which already are subsets - if len(basename) > 6 and basename[6] == "+": - continue - - extr = doc.extract_font(font_xref) - fontbuffer = extr[-1] - names = get_fontnames(doc, f) - name_set, xref_set, subsets = font_buffers.get( - fontbuffer, (set(), set(), (set(), set())) - ) - xref_set.add(font_xref) - for name in names: - name_set.add(name) - font = pymupdf.Font(fontbuffer=fontbuffer) - name_set.add(font.name) - del font - font_buffers[fontbuffer] = (name_set, xref_set, subsets) - - def find_buffer_by_name(name): - for buffer, (name_set, _, _) in font_buffers.items(): - if name in name_set: - return buffer - return None - - # ----------------- - # main function - # ----------------- - repl_fontnames(doc) # populate font information - if not font_buffers: # nothing found to do - if verbose: - pymupdf.message(f'No fonts to subset.') - return 0 - - old_fontsize = 0 - new_fontsize = 0 - for fontbuffer in 
font_buffers.keys(): - old_fontsize += len(fontbuffer) - - # Scan page text for usage of subsettable fonts - for page in doc: - # go through the text and extend set of used glyphs by font - # we use a modified MuPDF trace device, which delivers us glyph ids. - for span in page.get_texttrace(): - if type(span) is not dict: # skip useless information - continue - fontname = span["font"][:33] # fontname for the span - buffer = find_buffer_by_name(fontname) - if buffer is None: - continue - name_set, xref_set, (set_ucs, set_gid) = font_buffers[buffer] - for c in span["chars"]: - set_ucs.add(c[0]) # unicode - set_gid.add(c[1]) # glyph id - font_buffers[buffer] = (name_set, xref_set, (set_ucs, set_gid)) - - # build the font subsets - for old_buffer, (name_set, xref_set, subsets) in font_buffers.items(): - new_buffer = build_subset(old_buffer, subsets[0], subsets[1]) - fontname = list(name_set)[0] - if new_buffer is None or len(new_buffer) >= len(old_buffer): - # subset was not created or did not get smaller - if verbose: - pymupdf.message(f'Cannot subset {fontname!r}.') - continue - if verbose: - pymupdf.message(f"Built subset of font {fontname!r}.") - val = doc._insert_font(fontbuffer=new_buffer) # store subset font in PDF - new_xref = val[0] # get its xref - set_subset_fontname(new_xref) # tag fontname as subset font - font_str = doc.xref_object( # get its object definition - new_xref, - compressed=True, - ) - # walk through the original font xrefs and replace each by the subset def - for font_xref in xref_set: - # we need the original '/W' and '/DW' width values - width_table, def_width = get_old_widths(font_xref) - # ... and replace original font definition at xref with it - doc.update_object(font_xref, font_str) - # now copy over old '/W' and '/DW' values - if width_table or def_width: - set_old_widths(font_xref, width_table, def_width) - # 'new_xref' remains unused in the PDF and must be removed - # by garbage collection. 
- new_fontsize += len(new_buffer) - - return old_fontsize - new_fontsize - - -# ------------------------------------------------------------------- -# Copy XREF object to another XREF -# ------------------------------------------------------------------- -def xref_copy(doc: pymupdf.Document, source: int, target: int, *, keep: list = None) -> None: - """Copy a PDF dictionary object to another one given their xref numbers. - - Args: - doc: PDF document object - source: source xref number - target: target xref number, the xref must already exist - keep: an optional list of 1st level keys in target that should not be - removed before copying. - Notes: - This works similar to the copy() method of dictionaries in Python. The - source may be a stream object. - """ - if doc.xref_is_stream(source): - # read new xref stream, maintaining compression - stream = doc.xref_stream_raw(source) - doc.update_stream( - target, - stream, - compress=False, # keeps source compression - new=True, # in case target is no stream - ) - - # empty the target completely, observe exceptions - if keep is None: - keep = [] - for key in doc.xref_get_keys(target): - if key in keep: - continue - doc.xref_set_key(target, key, "null") - # copy over all source dict items - for key in doc.xref_get_keys(source): - item = doc.xref_get_key(source, key) - doc.xref_set_key(target, key, item[1])
--- a/tests/conftest.py Mon Sep 15 11:43:07 2025 +0200 +++ b/tests/conftest.py Sat Oct 11 11:19:58 2025 +0200 @@ -77,11 +77,25 @@ # Allow post-test checking that pymupdf._globals has not changed. _globals_pre = get_members(pymupdf._globals) + testsfailed_before = request.session.testsfailed + # Run the test. rep = yield sys.stdout.flush() + # This seems the only way for us to tell that a test has failed. In + # particular, <rep> is always None. We're implicitly relying on tests not + # being run in parallel. + # + failed = request.session.testsfailed - testsfailed_before + assert failed in (0, 1) + + if failed: + # Do not check post-test conditions if the test as failed. This avoids + # additional confusing `ERROR` status for failed tests. + return + # Test has run; check it did not create any MuPDF warnings etc. wt = pymupdf.TOOLS.mupdf_warnings() if not hasattr(pymupdf, 'mupdf'):
--- a/tests/gentle_compare.py Mon Sep 15 11:43:07 2025 +0200 +++ b/tests/gentle_compare.py Sat Oct 11 11:19:58 2025 +0200 @@ -32,8 +32,6 @@ ''' Returns RMS diff of raw bytes of two sequences. ''' - if verbose is True: - verbose = 100000 assert len(a) == len(b) e = 0 for i, (aa, bb) in enumerate(zip(a, b)): @@ -62,7 +60,7 @@ a_mv = a.samples_mv b_mv = b.samples_mv assert len(a_mv) == len(b_mv) - ret = rms(a_mv, b_mv, verbose=True, out_prefix=out_prefix) + ret = rms(a_mv, b_mv, out_prefix=out_prefix) print(f'{out_prefix}pixmaps_rms(): {ret=}.') return ret
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tests/test_4716.py Sat Oct 11 11:19:58 2025 +0200 @@ -0,0 +1,15 @@ +import pymupdf +import os + +def test_4716(): + """Confirm that ZERO WIDTH JOINER will never start a word.""" + script_dir = os.path.dirname(__file__) + filename = os.path.join(script_dir, "resources", "test_4716.pdf") + doc = pymupdf.open(filename) + expected = set(["+25.00", "Любимый", "-10.00"]) + word_text = set() + for page in doc: + words = page.get_text("words") + for w in words: + word_text.add(w[4]) + assert word_text == expected
--- a/tests/test_annots.py Mon Sep 15 11:43:07 2025 +0200 +++ b/tests/test_annots.py Sat Oct 11 11:19:58 2025 +0200 @@ -236,7 +236,10 @@ pymupdf.TOOLS.set_annot_stem('jorj') try: path_in = os.path.abspath( f'{__file__}/../resources/symbol-list.pdf') - path_expected = os.path.abspath( f'{__file__}/../../tests/resources/test_1645_expected.pdf') + if pymupdf.mupdf_version_tuple >= (1, 27): + path_expected = os.path.abspath( f'{__file__}/../../tests/resources/test_1645_expected-after-1.27.0.pdf') + else: + path_expected = os.path.abspath( f'{__file__}/../../tests/resources/test_1645_expected.pdf') path_out = os.path.abspath( f'{__file__}/../test_1645_out.pdf') doc = pymupdf.open(path_in) page = doc[0] @@ -254,11 +257,13 @@ ) doc.save(path_out, garbage=1, deflate=True, no_new_id=True) print(f'Have created {path_out}. comparing with {path_expected}.') - with open( path_out, 'rb') as f: - out = f.read() - with open( path_expected, 'rb') as f: - expected = f.read() - assert out == expected, f'Files differ: {path_out} {path_expected}' + with pymupdf.open(path_expected) as doc_expected, pymupdf.open(path_out) as doc_out: + rms = gentle_compare.pixmaps_rms( + doc_expected[0].get_pixmap(), + doc_out[0].get_pixmap(), + ) + print(f'test_1645: {rms=}') + assert rms < 0.1, f'Pixmaps differ: {path_expected=} {path_out=}' finally: # Restore annot_stem. pymupdf.TOOLS.set_annot_stem(annot_stem)
--- a/tests/test_codespell.py Mon Sep 15 11:43:07 2025 +0200 +++ b/tests/test_codespell.py Sat Oct 11 11:19:58 2025 +0200 @@ -12,6 +12,10 @@ ''' Check rebased Python code with codespell. ''' + if os.environ.get('PYODIDE_ROOT'): + print('test_codespell(): not running on Pyodide - cannot run child processes.') + return + if not hasattr(pymupdf, 'mupdf'): print('Not running codespell with classic implementation.') return
--- a/tests/test_flake8.py Mon Sep 15 11:43:07 2025 +0200 +++ b/tests/test_flake8.py Sat Oct 11 11:19:58 2025 +0200 @@ -9,6 +9,10 @@ ''' Check rebased Python code with flake8. ''' + if os.environ.get('PYODIDE_ROOT'): + print('test_flake8(): not running on Pyodide - cannot run child processes.') + return + if not hasattr(pymupdf, 'mupdf'): print(f'Not running flake8 with classic implementation.') return
--- a/tests/test_font.py Mon Sep 15 11:43:07 2025 +0200 +++ b/tests/test_font.py Sat Oct 11 11:19:58 2025 +0200 @@ -83,6 +83,10 @@ assert text == expected def test_fontarchive(): + if os.environ.get('PYODIDE_ROOT'): + print('test_fontarchive(): not running on Pyodide - we get ValueError: No font code \'notos\' found in pymupdf-fonts..') + return + import subprocess arch = pymupdf.Archive() css = pymupdf.css_for_pymupdf_font("notos", archive=arch, name="sans-serif") @@ -234,6 +238,10 @@ def test_4457(): + if os.environ.get('PYODIDE_ROOT'): + print('test_4457(): not running on Pyodide - cannot run child processes.') + return + print() files = ( ('https://github.com/user-attachments/files/20862923/test_4457_a.pdf', 'test_4457_a.pdf', None, 4),
--- a/tests/test_general.py Mon Sep 15 11:43:07 2025 +0200 +++ b/tests/test_general.py Sat Oct 11 11:19:58 2025 +0200 @@ -785,6 +785,9 @@ def test_subset_fonts(): + if os.environ.get('PYODIDE_ROOT'): + print('test_subset_fonts(): not running on Pyodide - ValueError: No font code \'ubuntu\' found in pymupdf-fonts.') + return """Confirm subset_fonts is working.""" if not hasattr(pymupdf, "mupdf"): print("Not testing 'test_subset_fonts' in classic.") @@ -1026,6 +1029,10 @@ os.remove(oldfile) def test_cli(): + if os.environ.get('PYODIDE_ROOT'): + print('test_cli(): not running on Pyodide - cannot run child processes.') + return + if not hasattr(pymupdf, 'mupdf'): print('test_cli(): Not running on classic because of fitz_old.') return @@ -1063,6 +1070,10 @@ Check redirection of messages and log diagnostics with environment variables PYMUPDF_LOG and PYMUPDF_MESSAGE. ''' + if os.environ.get('PYODIDE_ROOT'): + print('test_cli_out(): not running on Pyodide - cannot run child processes.') + return + if not hasattr(pymupdf, 'mupdf'): print('test_cli(): Not running on classic because of fitz_old.') return @@ -1150,6 +1161,10 @@ ''' Checks pymupdf.use_python_logging(). ''' + if os.environ.get('PYODIDE_ROOT'): + print('test_cli(): not running on Pyodide - cannot run child processes.') + return + log_prefix = None if os.environ.get('PYMUPDF_USE_EXTRA') == '0': log_prefix = f'.+Using non-default setting from PYMUPDF_USE_EXTRA: \'0\'' @@ -1433,6 +1448,10 @@ Checks behaviour of fz_open_document() and fz_open_document_with_stream() with different filenames/magic values. 
''' + if os.environ.get('PYODIDE_ROOT'): + print('test_open2(): not running on Pyodide - cannot run child processes.') + return + if platform.system() == 'Windows': print(f'test_open2(): not running on Windows because `git ls-files` known fail on Github Windows runners.') return @@ -1789,6 +1808,10 @@ document.delete_page() def test_4263(): + if os.environ.get('PYODIDE_ROOT'): + print('test_4263(): not running on Pyodide - cannot run child processes.') + return + path = os.path.normpath(f'{__file__}/../../tests/resources/test_4263.pdf') path_out = f'{path}.linerarized.pdf' command = f'pymupdf clean -linear {path} {path_out}' @@ -1915,6 +1938,10 @@ def test_4533(): + if os.environ.get('PYODIDE_ROOT'): + print('test_4533(): not running on Pyodide - cannot run child processes.') + return + print() path = util.download( 'https://github.com/user-attachments/files/20497146/NineData_user_manual_V3.0.5.pdf', @@ -1962,10 +1989,16 @@ print(f'{pymupdf.pymupdf_git_branch=}') print(f'{pymupdf.pymupdf_git_sha=}') print(f'{pymupdf.pymupdf_version=}') - print(f'pymupdf.pymupdf_git_diff:\n{textwrap.indent(pymupdf.pymupdf_git_diff, " ")}') + print(f'{pymupdf.pymupdf_git_diff=}') + if pymupdf.pymupdf_git_diff: + print(f'pymupdf.pymupdf_git_diff:\n{textwrap.indent(pymupdf.pymupdf_git_diff, " ")}') def test_4392(): + if os.environ.get('PYODIDE_ROOT'): + print('test_4392(): not running on Pyodide - cannot run child processes.') + return + print() path = os.path.normpath(f'{__file__}/../../tests/test_4392.py') with open(path, 'w') as f: @@ -2051,3 +2084,99 @@ # Check pymupdf.Document.scrub() works. with pymupdf.open(path) as document: document.scrub() + + +def test_4702(): + if os.environ.get('PYODIDE_ROOT'): + # util.download() uses subprocess. 
+ print('test_4702(): not running on Pyodide - cannot run child processes.') + return + + path = util.download( + 'https://github.com/user-attachments/files/22403483/01995b6ca7837b52abaa24e38e8c076d.pdf', + 'test_4702.pdf', + ) + with pymupdf.open(path) as document: + for xref in range(1, document.xref_length()): + print(f'{xref=}') + try: + _ = document.xref_object(xref) + except Exception as e1: + print(f'{e1=}') + try: + document.update_object(xref, "<<>>") + except Exception as e2: + print(f'{e2=}') + raise + wt = pymupdf.TOOLS.mupdf_warnings() + assert wt == 'repairing PDF document' + + with pymupdf.open(path) as document: + for xref in range(1, document.xref_length()): + print(f'{xref=}') + _ = document.xref_object(xref) + wt = pymupdf.TOOLS.mupdf_warnings() + assert wt == 'repairing PDF document' + + +def test_4712(): + ''' + Crash with "corrupted double-linked list + ''' + if 1: + print(f'test_4712(): Not running because known to fail.') + return + path_a = os.path.normpath(f'{__file__}/../../tests/resources/test_4712_a.pdf') + path_b = os.path.normpath(f'{__file__}/../../tests/resources/test_4712_b.pdf') + doc1 = pymupdf.open(path_a) + for i in range(6): + doc1.load_page(i).get_pixmap() + doc2 = pymupdf.open(path_b) + for i in range(6): + doc2.load_page(i).get_pixmap() + + +def test_4712m(): + if 1: + print(f'test_4712b(): Not running because known to fail.') + return + + path_a = os.path.normpath(f'{__file__}/../../tests/resources/test_4712_a.pdf') + path_b = os.path.normpath(f'{__file__}/../../tests/resources/test_4712_b.pdf') + + mupdf = pymupdf.mupdf + def get_pixmap(page): + displaylist = mupdf.fz_new_display_list_from_page(page) + rect = mupdf.fz_bound_display_list(displaylist) + irect = mupdf.fz_round_rect(rect) + pixmap = mupdf.fz_new_pixmap_with_bbox( + mupdf.FzColorspace(mupdf.FzColorspace.Fixed_RGB), + irect, + mupdf.FzSeparations(), + 0, # alpha + ) + mupdf.fz_clear_pixmap_with_value(pixmap, 0xFF) + matrix = mupdf.FzMatrix() + device = 
mupdf.fz_new_draw_device(matrix, pixmap) + mupdf.fz_run_display_list( + displaylist, + device, + mupdf.FzMatrix(), + mupdf.FzRect(mupdf.FzRect.Fixed_INFINITE), + mupdf.FzCookie(), + ) + mupdf.fz_close_device(device) + + def process_document(document): + for i in range(6): + print(f' {i=}', flush=1) + page = mupdf.fz_load_page(document, i) + get_pixmap(page) + + print(f'Processing {path_a=}', flush=1) + document_a = mupdf.fz_open_document(path_a) + process_document(document_a) + + print(f'Processing {path_b=}', flush=1) + document_b = mupdf.fz_open_document(path_b) + process_document(document_b)
--- a/tests/test_import.py Mon Sep 15 11:43:07 2025 +0200 +++ b/tests/test_import.py Sat Oct 11 11:19:58 2025 +0200 @@ -5,6 +5,10 @@ def test_import(): + if os.environ.get('PYODIDE_ROOT'): + print('test_import(): not running on Pyodide - cannot run child processes.') + return + root = os.path.abspath(f'{__file__}/../../') p = f'{root}/tests/resources_test_import.py' with open(p, 'w') as f:
--- a/tests/test_memory.py Mon Sep 15 11:43:07 2025 +0200 +++ b/tests/test_memory.py Sat Oct 11 11:19:58 2025 +0200 @@ -17,6 +17,10 @@ ''' Check for memory leaks. ''' + if os.environ.get('PYODIDE_ROOT'): + print('test_2791(): not running on Pyodide - No module named \'psutil\'.') + return + if os.environ.get('PYMUPDF_RUNNING_ON_VALGRIND') == '1': print(f'test_2791(): not running because PYMUPDF_RUNNING_ON_VALGRIND=1.') return @@ -94,6 +98,10 @@ def test_4090(): + if os.environ.get('PYODIDE_ROOT'): + print('test_4090(): not running on Pyodide - No module named \'psutil\'.') + return + print(f'test_4090(): {os.environ.get("PYTHONMALLOC")=}.') import psutil process = psutil.Process() @@ -148,6 +156,10 @@ def test_4125(): + if os.environ.get('PYODIDE_ROOT'): + print('test_4125(): not running on Pyodide - No module named \'psutil\'.') + return + if os.environ.get('PYMUPDF_RUNNING_ON_VALGRIND') == '1': print(f'test_4125(): not running because PYMUPDF_RUNNING_ON_VALGRIND=1.') return
--- a/tests/test_pixmap.py Mon Sep 15 11:43:07 2025 +0200 +++ b/tests/test_pixmap.py Sat Oct 11 11:19:58 2025 +0200 @@ -70,7 +70,7 @@ pix2 = pymupdf.Pixmap(stream) assert repr(pix1) == repr(pix2) except ModuleNotFoundError: - assert platform.system() == 'Windows' and sys.maxsize == 2**31 - 1 + assert platform.system() in ('Windows', 'Emscripten') and sys.maxsize == 2**31 - 1 def test_save(tmpdir): @@ -556,6 +556,9 @@ def test_4445(): + if os.environ.get('PYODIDE_ROOT'): + print('test_4445(): not running on Pyodide - cannot run child processes.') + return print() # Test case is large so we download it instead of having it in PyMuPDF # git. We put it in `cache/` directory do it is not removed by `git clean` @@ -628,3 +631,22 @@ assert rms == 0 else: assert rms >= 10 + + +def test_4699(): + path = os.path.normpath(f'{__file__}/../../tests/resources/test_4699.pdf') + path_png_expected = os.path.normpath(f'{__file__}/../../tests/resources/test_4699.png') + path_png_actual = os.path.normpath(f'{__file__}/../../tests/test_4699.png') + with pymupdf.open(path) as document: + page = document[0] + pixmap = page.get_pixmap() + pixmap.save(path_png_actual) + print(f'Have saved to {path_png_actual=}.') + rms = gentle_compare.pixmaps_rms(path_png_expected, pixmap) + print(f'test_4699(): {rms=}') + if pymupdf.mupdf_version_tuple >= (1, 27): + assert rms == 0 + else: + wt = pymupdf.TOOLS.mupdf_warnings() + assert 'syntax error: cannot find ExtGState resource' in wt + assert rms > 20
--- a/tests/test_pylint.py Mon Sep 15 11:43:07 2025 +0200 +++ b/tests/test_pylint.py Sat Oct 11 11:19:58 2025 +0200 @@ -7,6 +7,10 @@ def test_pylint(): + if os.environ.get('PYODIDE_ROOT'): + print('test_pylint(): not running on Pyodide - cannot run child processes.') + return + if not hasattr(pymupdf, 'mupdf'): print(f'test_pylint(): Not running with classic implementation.') return
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tests/test_release.py Sat Oct 11 11:19:58 2025 +0200 @@ -0,0 +1,84 @@ +import pymupdf + +import os +import re +import sys + + +g_root_abs = os.path.normpath(f'{__file__}/../../') + +sys.path.insert(0, g_root_abs) +try: + import pipcl + import setup +finally: + del sys.path[0] + +g_root = pipcl.relpath(g_root_abs) + + +def _file_line(path, text, re_match, offset=+2): + ''' + Returns <file>:<line> for location of regex match. + + path: + filename. + text: + Contents of <filename>. + re_match: + A re.Match. + offset: + Added to line number of start of <re_match>. Default offset=2 is + because callers usually grep for leading newline, and line numbers are + generally 1-based. + ''' + text_before = text[:re_match.start()] + line = text_before.count('\n') + offset + return f'{path}:{line}' + + +def test_release_versions(): + ''' + PyMuPDF and default MuPDF must have same major.minor version. + ''' + version_p_tuple = [int(i) for i in setup.version_p.split('.')] + version_mupdf_tuple = [int(i) for i in setup.version_mupdf.split('.')] + assert version_p_tuple[:2] == version_mupdf_tuple[:2], \ + f'PyMuPDF and MuPDF major.minor versions do not match. {setup.version_p=} {setup.version_mupdf=}.' + + +def test_release_bug_template(): + ''' + Bug report template must list current PyMuPDF version. + ''' + p = f'{g_root}/.github/ISSUE_TEMPLATE/bug_report.yml' + expected = f'\n - {setup.version_p}\n' + with open(p) as f: + text = f.read() + assert expected in text, f'{p}:1: Failed to find line for {setup.version_p=}, {expected!r}.' + + +def test_release_changelog_version(): + ''' + In changes.txt, first item must match setup.version_p. + ''' + p = f'{g_root}/changes.txt' + with open(p) as f: + text = f.read() + m = re.search(f'\n[*][*]Changes in version ([0-9.]+)[*][*]\n', text) + assert m, f'Cannot parse {p}.' 
+ assert m[1] == setup.version_p, \ + f'{_file_line(p, text, m)}: Cannot find {setup.version_p=} in first changelog item: {m[0].strip()!r}.' + + +def test_release_changelog_mupdf_version(): + ''' + In changes.txt, first mentioned of MuPDF must match setup.version_mupdf. + ''' + p = f'{g_root}/changes.txt' + with open(p) as f: + text = f.read() + m = re.search(f'\n[*] Use MuPDF-([0-9.]+)[.]\n', text) + assert m, f'Cannot parse {p}.' + assert m[1] == setup.version_mupdf, \ + f'{_file_line(p, text, m)}: First mentioned MuPDF version does not match {setup.version_mupdf=}: {m[0].strip()!r}.'
--- a/tests/test_tables.py Mon Sep 15 11:43:07 2025 +0200 +++ b/tests/test_tables.py Sat Oct 11 11:19:58 2025 +0200 @@ -184,7 +184,12 @@ ), f"{pymupdf.TOOLS.set_small_glyph_heights()=}" wt = pymupdf.TOOLS.mupdf_warnings() - if pymupdf.mupdf_version_tuple >= (1, 26, 0): + if pymupdf.mupdf_version_tuple >= (1, 26, 8): + assert ( + wt + == "bogus font ascent/descent values (3117 / -2463)\n... repeated 2 times...\nActualtext with no position. Text may be lost or mispositioned.\n... repeated 96 times..." + ) + elif pymupdf.mupdf_version_tuple >= (1, 26, 0): assert ( wt == "bogus font ascent/descent values (3117 / -2463)\n... repeated 2 times..."
--- a/tests/test_tesseract.py Mon Sep 15 11:43:07 2025 +0200 +++ b/tests/test_tesseract.py Sat Oct 11 11:19:58 2025 +0200 @@ -24,14 +24,18 @@ tail = 'OCR initialisation failed' else: tail = 'Tesseract language initialisation failed' - e_expected = f'code=3: {tail}' - if platform.system() == 'OpenBSD': - # 2023-12-12: For some reason the SWIG catch code only catches - # the exception as FzErrorBase. - e_expected_type = pymupdf.mupdf.FzErrorBase - print(f'OpenBSD workaround - expecting FzErrorBase, not FzErrorLibrary.') + if os.environ.get('PYODIDE_ROOT'): + e_expected = 'code=6: No OCR support in this build' + e_expected_type = pymupdf.mupdf.FzErrorUnsupported else: - e_expected_type = pymupdf.mupdf.FzErrorLibrary + e_expected = f'code=3: {tail}' + if platform.system() == 'OpenBSD': + # 2023-12-12: For some reason the SWIG catch code only catches + # the exception as FzErrorBase. + e_expected_type = pymupdf.mupdf.FzErrorBase + print(f'OpenBSD workaround - expecting FzErrorBase, not FzErrorLibrary.') + else: + e_expected_type = pymupdf.mupdf.FzErrorLibrary else: # classic. e_expected = 'OCR initialisation failed' @@ -71,6 +75,10 @@ # # Note that Tesseract seems to output its own diagnostics. # + if os.environ.get('PYODIDE_ROOT'): + print('test_3842b(): not running on Pyodide - cannot run child processes.') + return + path = os.path.normpath(f'{__file__}/../../tests/resources/test_3842.pdf') with pymupdf.open(path) as document: page = document[6] @@ -91,6 +99,10 @@ def test_3842(): + if os.environ.get('PYODIDE_ROOT'): + print('test_3842(): not running on Pyodide - cannot run child processes.') + return + path = os.path.normpath(f'{__file__}/../../tests/resources/test_3842.pdf') with pymupdf.open(path) as document: page = document[6]
--- a/tests/test_textbox.py Mon Sep 15 11:43:07 2025 +0200 +++ b/tests/test_textbox.py Sat Oct 11 11:19:58 2025 +0200 @@ -7,6 +7,11 @@ """ import pymupdf +import gentle_compare + +import os +import textwrap + # codespell:ignore-begin text = """Der Kleine Schwertwal (Pseudorca crassidens), auch bekannt als Unechter oder Schwarzer Schwertwal, ist eine Art der Delfine (Delphinidae) und der einzige rezente Vertreter der Gattung Pseudorca. @@ -182,7 +187,9 @@ assert spare_height < 0 assert scale == 1 spare_height, scale = page.insert_htmlbox(rect, text, rotate=rot, scale_low=0) - assert spare_height == 0 + page.draw_rect(rect, (1, 0, 0)) + doc.save(os.path.normpath(f'{__file__}/../../tests/test_htmlbox1.pdf')) + assert abs(spare_height - 3.8507) < 0.001 assert 0 < scale < 1 page = doc.reload_page(page) link = page.get_links()[0] # extracts the links on the page @@ -286,3 +293,77 @@ text = '111111111' print(f'Calling writer.fill_textbox().', flush=1) writer.fill_textbox(rect=pymupdf.Rect(0, 0, 100, 20), pos=(80, 0), text=text, fontsize=8) + + +def test_4613(): + print() + text = 3 * 'abcdefghijklmnopqrstuvwxyz\nABCDEFGHIJKLMNOPQRSTUVWXYZ\n' + story = pymupdf.Story(text) + rect = pymupdf.Rect(10, 10, 100, 100) + + # Test default operation where we get additional scaling down because of + # the long words in our text. + print(f'test_4613(): ### Testing default operation.') + with pymupdf.open() as doc: + page = doc.new_page() + spare_height, scale = page.insert_htmlbox(rect, story) + print(f'test_4613(): {spare_height=} {scale=}') + # The additional down-scaling from the long word widths results in + # spare vertical space. 
+ page.draw_rect(rect, (1, 0, 0)) + path = os.path.normpath(f'{__file__}/../../tests/test_4613.pdf') + doc.save(path) + + path_pixmap = os.path.normpath(f'{__file__}/../../tests/test_4613.png') + path_pixmap_expected = os.path.normpath(f'{__file__}/../../tests/resources/test_4613.png') + pixmap = page.get_pixmap(dpi=300) + pixmap.save(path_pixmap) + + pixmap_diff = gentle_compare.pixmaps_diff(path_pixmap_expected, pixmap) + pixmap_diff.save(os.path.normpath(f'{__file__}/../../tests/test_4613-diff.png')) + + rms = gentle_compare.pixmaps_rms(pixmap, path_pixmap_expected) + print(f'{rms=}') + assert rms == 0, f'{rms=}' + + assert abs(spare_height - 45.7536) < 0.1 + assert abs(scale - 0.4009) < 0.01 + + new_text = page.get_text('text', clip=rect) + print(f'test_4613(): new_text:') + print(textwrap.indent(new_text, ' ')) + assert new_text == text + + # Check with _scale_word_width=False - ignore too-wide words. + print(f'test_4613(): ### Testing with _scale_word_width=False.') + with pymupdf.open() as doc: + page = doc.new_page() + spare_height, scale = page.insert_htmlbox(rect, story, _scale_word_width=False) + print(f'test_4613(): _scale_word_width=False: {spare_height=} {scale=}') + # With _scale_word_width=False we allow long words to extend beyond the + # rect, so we should have spare_height == 0 and only a small amount of + # down-scaling. + assert spare_height == 0 + assert abs(scale - 0.914) < 0.01 + new_text = page.get_text('text', clip=rect) + print(f'test_4613(): new_text:') + print(textwrap.indent(new_text, ' ')) + assert new_text == textwrap.dedent(''' + abcdefghijklmno + ABCDEFGHIJKLM + abcdefghijklmno + ABCDEFGHIJKLM + abcdefghijklmno + ABCDEFGHIJKLM + ''')[1:] + + + # Check that we get no fit if scale_low is not low enough. 
+ print(f'test_4613(): ### Testing with scale_low too high to allow a fit.') + with pymupdf.open() as doc: + page = doc.new_page() + scale_low=0.6 + spare_height, scale = page.insert_htmlbox(rect, story, scale_low=scale_low) + print(f'test_4613(): {scale_low=}: {spare_height=} {scale=}') + assert spare_height == -1 + assert scale == scale_low
--- a/tests/test_textextract.py Mon Sep 15 11:43:07 2025 +0200 +++ b/tests/test_textextract.py Sat Oct 11 11:19:58 2025 +0200 @@ -263,6 +263,10 @@ def test_document_text(): + if os.environ.get('PYODIDE_ROOT'): + print('test_document_text(): not running on Pyodide - multiprocessing not available.') + return + import platform import time @@ -310,6 +314,9 @@ def test_4524(): + if os.environ.get('PYODIDE_ROOT'): + print('test_4524(): not running on Pyodide - multiprocessing not available.') + return path = os.path.abspath(f'{__file__}/../../tests/resources/mupdf_explored.pdf') print('') document = pymupdf.Document(path) @@ -331,6 +338,11 @@ for line in text.split('\n'): print(f' {line!r}') print('='*40) + wt = pymupdf.TOOLS.mupdf_warnings() + if pymupdf.mupdf_version_tuple < (1, 26, 8): + assert not wt + else: + assert wt == 'Actualtext with no position. Text may be lost or mispositioned.\n... repeated 2 times...' def test_3687(): @@ -375,12 +387,14 @@ assert texts1 == texts0 wt = pymupdf.TOOLS.mupdf_warnings() - if pymupdf.mupdf_version_tuple < (1, 27): - assert wt == 'Actualtext with no position. Text may be lost or mispositioned.\n... repeated 434 times...' - else: + if pymupdf.mupdf_version_tuple >= (1, 27): expected = 'format error: No common ancestor in structure tree\nstructure tree broken, assume tree is missing' expected = '\n'.join([expected] * 56) assert wt == expected + elif pymupdf.mupdf_version_tuple >= (1, 26, 8): + assert wt == 'Actualtext with no position. Text may be lost or mispositioned.\n... repeated 7684 times...' + else: + assert wt == 'Actualtext with no position. Text may be lost or mispositioned.\n... repeated 434 times...' def test_3650(): path = os.path.normpath(f'{__file__}/../../tests/resources/test_3650.pdf') @@ -878,6 +892,9 @@ # This output is different from expected_1_23_5. 
expected_mupdf_1_26_1 = b'JOB No.: Shipper (complete name and address) \xe5\x8f\x91\xe8\xb4\xa7\xe4\xba\xba(\xe5\x90\x8d\xe7\xa7\xb0\xe5\x8f\x8a\xe5\x9c\xb0\xe5\x9d\x80) Tel: Fax: \n \nS/O No. \xe6\x89\x98\xe8\xbf\x90\xe5\x8d\x95\xe5\x8f\xb7\xe7\xa0\x81 \nSINORICH TRANSPORT LIMITED \nSHIPPING ORDER \n\xe6\x89\x98\xe8\xbf\x90\xe5\x8d\x95 \n \xe5\xb8\x82\xe5\x9c\xba\xe9\x83\xa8: \n88570009 \n88577019 \n88'.decode() + + # This output is different from either of the two expected strings. + expected_mupdf_1_27_0 = b'JOB No.: \n \nS/O No. \xe6\x89\x98\xe8\xbf\x90\xe5\x8d\x95\xe5\x8f\xb7\xe7\xa0\x81 \nSINORICH TRANSPORT LIMITED \nSHIPPING ORDER \n\xe6\x89\x98\xe8\xbf\x90\xe5\x8d\x95 \n \xe5\xb8\x82\xe5\x9c\xba\xe9\x83\xa8: \n88570009 \n88577019 \n88572702 \n \xe6\x93\x8d\xe4\xbd\x9c\xe9\x83\xa8: \n88570008 \n88570004 \n \xe6\x96\x87\xe4\xbb\xb6\xe9\x83\xa8: \n88570003\n \nNotify Party(complete name and address, '.decode() print(f'expected_1_23_5\n{textwrap.indent(expected_1_23_5, " ")}') print(f'expected_mupdf_1_26_1\n{textwrap.indent(expected_mupdf_1_26_1, " ")}') @@ -887,10 +904,16 @@ print(f'{text=}') print(f'{text.encode()=}') - if pymupdf.mupdf_version_tuple >= (1, 26, 1): + wt = pymupdf.TOOLS.mupdf_warnings() + if pymupdf.mupdf_version_tuple >= (1, 26, 8): + assert text == expected_mupdf_1_27_0 + assert wt == 'Actualtext with no position. Text may be lost or mispositioned.\n... repeated 120 times...' + elif pymupdf.mupdf_version_tuple >= (1, 26, 1): assert text == expected_mupdf_1_26_1 + assert not wt else: print(f'No expected output for {pymupdf.mupdf_version_tuple=}') + assert not wt def test_4503():
