Python2/PyMuPDF: src/utils.py comparison

comparison src/utils.py @ 1:1d09e1dec1d9 upstream

ADD: PyMuPDF v1.26.4: the original sdist. It does not yet contain MuPDF. This normally will be downloaded when building PyMuPDF.

author	Franz Glasner <fzglas.hg@dom66.de>
date	Mon, 15 Sep 2025 11:37:51 +0200
parents
children	a6bc019ac0b2

comparison

equal deleted inserted replaced

--1:000000000000
+:1d09e1dec1d9
+# ------------------------------------------------------------------------
+# Copyright 2020-2022, Harald Lieder, mailto:harald.lieder@outlook.com
+# License: GNU AFFERO GPL 3.0, https://www.gnu.org/licenses/agpl-3.0.html
+#
+# Part of "PyMuPDF", a Python binding for "MuPDF" (http://mupdf.com), a
+# lightweight PDF, XPS, and E-book viewer, renderer and toolkit which is
+# maintained and developed by Artifex Software, Inc. https://artifex.com.
+# ------------------------------------------------------------------------
+import io
+import math
+import os
+import typing
+import weakref
+try:
+from . import pymupdf
+except Exception:
+import pymupdf
+try:
+from . import mupdf
+except Exception:
+import mupdf
# Module-level shortcuts to attributes of the pymupdf module.
_format_g = pymupdf.format_g
g_exceptions_verbose = pymupdf.g_exceptions_verbose

# String placeholders used as annotations for "convertible to ..." arguments.
point_like = "point_like"
rect_like = "rect_like"
matrix_like = "matrix_like"
quad_like = "quad_like"

# typing.ByteString is gone from typing in 3.14, and collections.abc.Buffer
# is available from 3.12 only - so fall back to an explicit union.
try:
    ByteString = typing.ByteString
except AttributeError:
    # pylint: disable=unsupported-binary-operation
    ByteString = bytes | bytearray | memoryview

# Annotation aliases.
AnyType = typing.Any
OptInt = typing.Optional[int]
OptFloat = typing.Optional[float]
OptStr = typing.Optional[str]
OptDict = typing.Optional[dict]
OptBytes = typing.Optional[ByteString]
OptSeq = typing.Optional[typing.Sequence]
+"""
+This is a collection of functions to extend PyMuPDF.
+"""
def write_text(
    page: pymupdf.Page,
    rect=None,
    writers=None,
    overlay=True,
    color=None,
    opacity=None,
    keep_proportion=True,
    rotate=0,
    oc=0,
) -> None:
    """Write the text of one or more pymupdf.TextWriter objects to a page.

    A single writer with no rotation and no target rectangle is written
    directly. In every other case the writers are rendered onto a page of a
    temporary document, which is then shown on 'page' via show_pdf_page.

    Args:
        page: the target page.
        rect: target rectangle. If None, the union of the text writers is used.
        writers: one or more pymupdf.TextWriter objects.
        overlay: put in foreground or background.
        color: passed on to each writer's write_text.
        opacity: passed on to each writer's write_text.
        keep_proportion: maintain aspect ratio of rectangle sides.
        rotate: arbitrary rotation angle.
        oc: the xref of an optional content object
    Raises:
        ValueError: if no writer was given.
    """
    assert isinstance(page, pymupdf.Page)
    if not writers:
        raise ValueError("need at least one pymupdf.TextWriter")

    if type(writers) is pymupdf.TextWriter:
        if rotate == 0 and rect is None:
            # simple case: the writer can output to the page itself
            writers.write_text(page, opacity=opacity, color=color, overlay=overlay)
            return None
        writers = (writers,)  # treat the single writer like a sequence

    # Render all writers onto a scratch page of the same size as 'page'.
    text_area = writers[0].text_rect
    scratch_doc = pymupdf.Document()
    scratch_page = scratch_doc.new_page(
        width=page.rect.width,
        height=page.rect.height,
    )
    for tw in writers:
        text_area |= tw.text_rect  # accumulate the union of all text rects
        tw.write_text(scratch_page, opacity=opacity, color=color)

    if rect is None:
        rect = text_area

    page.show_pdf_page(
        rect,
        scratch_doc,
        0,
        overlay=overlay,
        keep_proportion=keep_proportion,
        rotate=rotate,
        clip=text_area,
        oc=oc,
    )
    # drop the references to the temporary objects
    scratch_doc = None
    scratch_page = None
def show_pdf_page(
    page,
    rect,
    docsrc,
    pno=0,
    keep_proportion=True,
    overlay=True,
    oc=0,
    rotate=0,
    clip=None,
) -> int:
    """Show page number 'pno' of PDF 'docsrc' in rectangle 'rect'.

    Args:
        page: the target page.
        rect: (rect-like) where to place the source image
        docsrc: (document) source PDF
        pno: (int) source page number. Negative numbers count from the end.
        keep_proportion: (bool) do not change width-height-ratio
        overlay: (bool) put in foreground
        oc: (xref) make visibility dependent on this OCG / OCMD (which must be
            defined in the target PDF)
        rotate: (int) degrees (multiple of 90)
        clip: (rect-like) part of source page rectangle
    Returns:
        xref of inserted object (for reuse)
    Raises:
        ValueError: if source or target is no PDF, if 'rect' or the resulting
            source rectangle is empty or infinite, if a negative 'pno' is
            given for a source document without pages, or if source and
            target are the same document.
    """

    def calc_matrix(sr, tr, keep=True, rotate=0):
        """Calculate transformation matrix from source to target rect.

        Notes:
            The product of four matrices in this sequence: (1) translate
            correct source corner to origin, (2) rotate, (3) scale, (4)
            translate to target's top-left corner.
        Args:
            sr: source rect in PDF (!) coordinate system
            tr: target rect in PDF coordinate system
            keep: whether to keep source ratio of width to height
            rotate: rotation angle in degrees
        Returns:
            Transformation matrix.
        """
        # calc center point of source rect
        smp = (sr.tl + sr.br) / 2.0
        # calc center point of target rect
        tmp = (tr.tl + tr.br) / 2.0

        # m moves to (0, 0), then rotates
        m = pymupdf.Matrix(1, 0, 0, 1, -smp.x, -smp.y) * pymupdf.Matrix(rotate)

        sr1 = sr * m  # resulting source rect to calculate scale factors

        fw = tr.width / sr1.width  # scale the width
        fh = tr.height / sr1.height  # scale the height
        if keep:
            fw = fh = min(fw, fh)  # take min if keeping aspect ratio

        m *= pymupdf.Matrix(fw, fh)  # concat scale matrix
        m *= pymupdf.Matrix(1, 0, 0, 1, tmp.x, tmp.y)  # concat move to target center
        return pymupdf.JM_TUPLE(m)

    pymupdf.CheckParent(page)
    doc = page.parent

    if not doc.is_pdf or not docsrc.is_pdf:
        raise ValueError("is no PDF")

    if rect.is_empty or rect.is_infinite:
        raise ValueError("rect must be finite and not empty")

    if pno < 0:  # support negative page numbers
        page_count = docsrc.page_count
        if page_count < 1:
            # Adding a page count of zero repeatedly would never terminate.
            raise ValueError("source document has no pages")
        pno %= page_count  # same result as adding page_count until >= 0

    src_page = docsrc[pno]  # load source page

    tar_rect = rect * ~page.transformation_matrix  # target rect in PDF coordinates

    src_rect = src_page.rect if not clip else src_page.rect & clip  # source rect
    if src_rect.is_empty or src_rect.is_infinite:
        raise ValueError("clip must be finite and not empty")
    src_rect = src_rect * ~src_page.transformation_matrix  # ... in PDF coord

    matrix = calc_matrix(src_rect, tar_rect, keep=keep_proportion, rotate=rotate)

    # list of existing /Form /XObjects, images and fonts names
    ilst = [i[1] for i in doc.get_page_xobjects(page.number)]
    ilst += [i[7] for i in doc.get_page_images(page.number)]
    ilst += [i[4] for i in doc.get_page_fonts(page.number)]

    # create a name not in that list
    n = "fzFrm"
    i = 0
    _imgname = n + "0"
    while _imgname in ilst:
        i += 1
        _imgname = n + str(i)

    isrc = docsrc._graft_id  # used as key for graftmaps
    if doc._graft_id == isrc:
        raise ValueError("source document must not equal target")

    # retrieve / make pymupdf.Graftmap for source PDF
    gmap = doc.Graftmaps.get(isrc, None)
    if gmap is None:
        gmap = pymupdf.Graftmap(doc)
        doc.Graftmaps[isrc] = gmap

    # take note of generated xref for automatic reuse
    pno_id = (isrc, pno)  # id of docsrc[pno]
    xref = doc.ShownPages.get(pno_id, 0)

    if overlay:
        page.wrap_contents()  # ensure a balanced graphics state
    xref = page._show_pdf_page(
        src_page,
        overlay=overlay,
        matrix=matrix,
        xref=xref,
        oc=oc,
        clip=src_rect,
        graftmap=gmap,
        _imgname=_imgname,
    )
    doc.ShownPages[pno_id] = xref

    return xref
def replace_image(page: pymupdf.Page, xref: int, *, filename=None, pixmap=None, stream=None):
    """Replace the image referred to by xref.

    The object definition stored under xref is overwritten with that of the
    new image. The page's appearance instructions stay intact, so the new
    image is displayed with the same bbox, rotation etc.

    By providing a small fully transparent image, an effect as if the image
    had been deleted can be achieved.
    A typical use may include replacing large images by a smaller version,
    e.g. with a lower resolution or graylevel instead of colored.

    Args:
        xref: the xref of the image to replace.
        filename, pixmap, stream: exactly one of these must be provided. The
            meaning being the same as in Page.insert_image.
    Raises:
        ValueError: if xref is no image, or if not exactly one image source
            was given.
    """
    owner = page.parent  # the owning document
    if not owner.xref_is_image(xref):
        raise ValueError("xref not an image")

    given = sum(map(bool, (filename, stream, pixmap)))
    if given != 1:
        raise ValueError("Exactly one of filename/stream/pixmap must be given")

    # insert the new image anywhere on the page ...
    inserted_xref = page.insert_image(
        page.rect,
        filename=filename,
        stream=stream,
        pixmap=pixmap,
    )
    # ... and copy its definition over the old one
    owner.xref_copy(inserted_xref, xref)

    # The insertion has created a new /Contents source as the last one:
    # set it to spaces.
    owner.update_stream(page.get_contents()[-1], b" ")
    page._image_info = None  # clear cache of extracted image information
def delete_image(page: pymupdf.Page, xref: int):
    """Delete the image referred to by xref.

    Actually replaces the image by a small transparent Pixmap using method
    Page.replace_image.

    Args:
        xref: xref of the image to delete.
    """
    # a 1x1 gray pixmap with alpha (the dimension is arbitrary) ...
    transparent = pymupdf.Pixmap(pymupdf.csGRAY, (0, 0, 1, 1), 1)
    # ... with all sample bytes cleared to 0x00
    transparent.clear_with()
    page.replace_image(xref, pixmap=transparent)
def insert_image(
    page,
    rect,
    *,
    alpha=-1,
    filename=None,
    height=0,
    keep_proportion=True,
    mask=None,
    oc=0,
    overlay=True,
    pixmap=None,
    rotate=0,
    stream=None,
    width=0,
    xref=0,
):
    """Insert an image for display in a rectangle.

    Args:
        page: the target page.
        rect: (rect_like) position of image on the page.
        alpha: (int, optional) set to 0 if image has no transparency.
        filename: (str, Path, file object) image filename.
        height: (int)
        keep_proportion: (bool) keep width / height ratio (default).
        mask: (bytes, optional) image consisting of alpha values to use.
        oc: (int) xref of OCG or OCMD to declare as Optional Content.
        overlay: (bool) put in foreground (default) or background.
        pixmap: (pymupdf.Pixmap) use this as image.
        rotate: (int) rotate by 0, 90, 180 or 270 degrees.
        stream: (bytes) use this as image.
        width: (int)
        xref: (int) use this as image.

    'page' and 'rect' are positional, all other parameters are keywords.

    If 'xref' is given, that image is used. Other input options are ignored.
    Else, exactly one of pixmap, stream or filename must be given.

    'alpha=0' for non-transparent images improves performance significantly.
    Affects stream and filename only.

    Optimum transparent insertions are possible by using filename / stream in
    conjunction with a 'mask' image of alpha values.

    Returns:
        xref (int) of inserted image. Re-use as argument for multiple insertions.
    Raises:
        ValueError: on a non-PDF document or invalid arguments.
        FileNotFoundError: if 'filename' does not exist.
    """
    pymupdf.CheckParent(page)
    doc = page.parent
    if not doc.is_pdf:
        raise ValueError("is no PDF")

    if xref == 0 and (bool(filename) + bool(stream) + bool(pixmap) != 1):
        raise ValueError("xref=0 needs exactly one of filename, pixmap, stream")

    # Reduce path-like / file-like objects to a plain string.
    if filename and type(filename) is not str:
        if hasattr(filename, "absolute"):
            filename = str(filename)
        elif hasattr(filename, "name"):
            filename = filename.name
        else:
            raise ValueError("bad filename")

    accepted_buffers = (bytes, bytearray, io.BytesIO)
    if filename and not os.path.exists(filename):
        raise FileNotFoundError("No such file: '%s'" % filename)
    elif stream and type(stream) not in accepted_buffers:
        raise ValueError("stream must be bytes-like / BytesIO")
    elif pixmap and type(pixmap) is not pymupdf.Pixmap:
        raise ValueError("pixmap must be a pymupdf.Pixmap")

    if mask:
        if not (stream or filename):
            raise ValueError("mask requires stream or filename")
        if type(mask) not in accepted_buffers:
            raise ValueError("mask must be bytes-like / BytesIO")

    # bring the rotation into the range 0 <= rotate < 360
    while rotate < 0:
        rotate += 360
    while rotate >= 360:
        rotate -= 360
    if rotate not in (0, 90, 180, 270):
        raise ValueError("bad rotate value")

    target = pymupdf.Rect(rect)
    if target.is_empty or target.is_infinite:
        raise ValueError("rect must be finite and not empty")
    clip = target * ~page.transformation_matrix

    # Create a unique image reference name: collect the names in use ...
    used_names = [item[7] for item in doc.get_page_images(page.number)]
    used_names += [item[1] for item in doc.get_page_xobjects(page.number)]
    used_names += [item[4] for item in doc.get_page_fonts(page.number)]
    # ... and take the first "fzImg<number>" which is not among them.
    counter = 0
    _imgname = f"fzImg{counter}"
    while _imgname in used_names:
        counter += 1
        _imgname = f"fzImg{counter}"

    if overlay:
        page.wrap_contents()  # ensure a balanced graphics state

    xref, digests = page._insert_image(
        filename=filename,
        pixmap=pixmap,
        stream=stream,
        imask=mask,
        clip=clip,
        overlay=overlay,
        oc=oc,
        xref=xref,
        rotate=rotate,
        keep_proportion=keep_proportion,
        width=width,
        height=height,
        alpha=alpha,
        _imgname=_imgname,
        digests=doc.InsertedImages,
    )
    if digests is not None:
        doc.InsertedImages = digests

    return xref
def search_for(
    page,
    text,
    *,
    clip=None,
    quads=False,
    flags=pymupdf.TEXT_DEHYPHENATE
    | pymupdf.TEXT_PRESERVE_WHITESPACE
    | pymupdf.TEXT_PRESERVE_LIGATURES
    | pymupdf.TEXT_MEDIABOX_CLIP,
    textpage=None,
) -> list:
    """Search for a string on a page.

    Args:
        page: the page to search.
        text: string to be searched for
        clip: restrict search to this rectangle
        quads: (bool) return quads instead of rectangles
        flags: bit switches, default: join hyphened words
        textpage: a pre-created pymupdf.TextPage
    Returns:
        a list of rectangles or quads, each containing one occurrence.
    Raises:
        ValueError: if 'textpage' does not belong to this page.
    """
    if clip is not None:
        clip = pymupdf.Rect(clip)

    pymupdf.CheckParent(page)
    make_textpage = textpage is None
    if make_textpage:
        tp = page.get_textpage(clip=clip, flags=flags)  # create pymupdf.TextPage
    else:
        tp = textpage
        if tp.parent != page:
            raise ValueError("not a textpage of this page")

    hits = tp.search(text, quads=quads)
    if make_textpage:
        del tp  # only discard a textpage created here
    return hits
def search_page_for(
    doc: pymupdf.Document,
    pno: int,
    text: str,
    quads: bool = False,
    clip: rect_like = None,
    flags: int = pymupdf.TEXT_DEHYPHENATE
    | pymupdf.TEXT_PRESERVE_LIGATURES
    | pymupdf.TEXT_PRESERVE_WHITESPACE
    | pymupdf.TEXT_MEDIABOX_CLIP,
    textpage: pymupdf.TextPage = None,
) -> list:
    """Search for a string on a page given by its number.

    Convenience function which loads page 'pno' and calls its search_for.

    Args:
        doc: the document.
        pno: page number
        text: string to be searched for
        quads: (bool) return quads instead of rectangles
        clip: restrict search to this rectangle
        flags: bit switches, default: join hyphened words
        textpage: reuse a prepared textpage
    Returns:
        a list of rectangles or quads, each containing an occurrence.
    """
    target_page = doc[pno]
    return target_page.search_for(
        text,
        quads=quads,
        clip=clip,
        flags=flags,
        textpage=textpage,
    )
def get_text_blocks(
    page: pymupdf.Page,
    clip: rect_like = None,
    flags: OptInt = None,
    textpage: pymupdf.TextPage = None,
    sort: bool = False,
) -> list:
    """Return the text blocks on a page.

    Notes:
        Lines in a block are concatenated with line breaks.
    Args:
        page: the page to extract from.
        clip: area handed to get_textpage if no textpage is passed in.
        flags: (int) control the amount of data parsed into the textpage.
            Defaults to pymupdf.TEXTFLAGS_BLOCKS.
        textpage: (pymupdf.TextPage) either passed-in or None.
        sort: (bool) sort the blocks by the items at index 3, then index 0.
    Returns:
        A list of the blocks. Each item contains the containing rectangle
        coordinates, text lines, running block number and block type.
    Raises:
        ValueError: if 'textpage' does not belong to this page.
    """
    pymupdf.CheckParent(page)
    if flags is None:
        flags = pymupdf.TEXTFLAGS_BLOCKS

    make_textpage = textpage is None
    if make_textpage:
        tp = page.get_textpage(clip=clip, flags=flags)
    else:
        tp = textpage
        if tp.parent != page:
            raise ValueError("not a textpage of this page")

    result = tp.extractBLOCKS()
    if make_textpage:
        del tp  # only discard a textpage created here

    if sort:
        result.sort(key=lambda blk: (blk[3], blk[0]))
    return result
def get_text_words(
    page: pymupdf.Page,
    clip: rect_like = None,
    flags: OptInt = None,
    textpage: pymupdf.TextPage = None,
    sort: bool = False,
    delimiters=None,
    tolerance=3,
) -> list:
    """Return the text words as a list with the bbox for each word.

    Args:
        page: pymupdf.Page
        clip: (rect-like) area on page to consider
        flags: (int) control the amount of data parsed into the textpage.
            Defaults to pymupdf.TEXTFLAGS_WORDS.
        textpage: (pymupdf.TextPage) either passed-in or None.
        sort: (bool) sort the words in reading sequence.
        delimiters: (str,list) characters to use as word delimiters.
        tolerance: (float) consider words to be part of the same line if
            top or bottom coordinate are not larger than this. Relevant
            only if sort=True.
    Returns:
        Word tuples (x0, y0, x1, y1, "word", bno, lno, wno).
    Raises:
        ValueError: if 'textpage' does not belong to this page.
    """

    def sort_words(words):
        """Sort words line-wise, forgiving small deviations."""
        words.sort(key=lambda item: (item[3], item[0]))
        ordered = []  # final word list
        current = [words[0]]  # collects words roughly in same line
        line_rect = pymupdf.Rect(words[0][:4])  # start the line rectangle
        for word in words[1:]:
            word_rect = pymupdf.Rect(word[:4])
            same_top = abs(word_rect.y0 - line_rect.y0) <= tolerance
            if same_top or abs(word_rect.y1 - line_rect.y1) <= tolerance:
                current.append(word)
                line_rect |= word_rect
                continue
            # word belongs to a new line: flush the current one left to right
            current.sort(key=lambda item: item[0])
            ordered.extend(current)
            current = [word]  # start next line
            line_rect = word_rect  # start next line rect
        # flush the last line
        current.sort(key=lambda item: item[0])
        ordered.extend(current)
        return ordered

    pymupdf.CheckParent(page)
    if flags is None:
        flags = pymupdf.TEXTFLAGS_WORDS

    make_textpage = textpage is None
    if make_textpage:
        tp = page.get_textpage(clip=clip, flags=flags)
    else:
        tp = textpage
        if tp.parent != page:
            raise ValueError("not a textpage of this page")

    words = tp.extractWORDS(delimiters)

    # If a textpage was given, sub-select the words of which at least half
    # of the bbox area is inside clip.
    if not make_textpage and clip is not None:
        clip = pymupdf.Rect(clip)
        words = [
            w for w in words if abs(clip & w[:4]) >= 0.5 * abs(pymupdf.Rect(w[:4]))
        ]

    if make_textpage:
        del tp  # only discard a textpage created here

    if words and sort:
        # advanced sort if any words found
        words = sort_words(words)

    return words
def get_sorted_text(
    page: pymupdf.Page,
    clip: rect_like = None,
    flags: OptInt = None,
    textpage: pymupdf.TextPage = None,
    tolerance=3,
) -> str:
    """Extract plain text avoiding unacceptable line breaks.

    Text contained in clip will be sorted in reading sequence. Some effort
    is also spent to simulate layout vertically and horizontally.

    Args:
        page: pymupdf.Page
        clip: (rect-like) only consider text inside
        flags: (int) text extraction flags
        textpage: pymupdf.TextPage
        tolerance: (float) consider words to be on the same line if their top
            or bottom coordinates do not differ more than this.

    Notes:
        If a TextPage is provided, all text is checked for being inside clip
        with at least 50% of its bbox.
        This allows to use some "global" TextPage in conjunction with sub-
        selecting words in parts of the defined TextPage rectangle.

    Returns:
        A text string in reading sequence. Left indentation of each line,
        inter-line and inter-word distances strive to reflect the layout.
        Consecutive lines are always separated by at least one line break.
        An empty string is returned if no words are found.
    """

    def line_text(clip, line):
        """Create the string of one text line.

        We are trying to simulate some horizontal layout here, too.

        Args:
            clip: (pymupdf.Rect) the area from which all text is being read.
            line: (list) word tuples (rect, text) contained in the line
        Returns:
            Text in this line. Generated from words in 'line'. Distance from
            predecessor is translated to multiple spaces, thus simulating
            text indentations and large horizontal distances.
        """
        line.sort(key=lambda w: w[0].x0)
        pieces = []  # the parts of the line text
        x1 = clip.x0  # end coordinate of the text so far
        for r, t in line:
            # convert distance to previous word to multiple spaces
            dist = max(
                int(round((r.x0 - x1) / r.width * len(t))),
                0 if (x1 == clip.x0 or r.x0 <= x1) else 1,
            )  # number of space characters
            pieces.append(" " * dist + t)  # append word string
            x1 = r.x1  # update new end position
        return "".join(pieces)

    # Extract words in correct sequence first.
    words = [
        (pymupdf.Rect(w[:4]), w[4])
        for w in get_text_words(
            page,
            clip=clip,
            flags=flags,
            textpage=textpage,
            sort=True,
            tolerance=tolerance,
        )
    ]

    if not words:  # no text present
        return ""

    totalbox = pymupdf.EMPTY_RECT()  # area covering all text
    for wr, text in words:
        totalbox |= wr

    lines = []  # list of reconstituted lines
    line = [words[0]]  # current line
    lrect = words[0][0]  # the line's rectangle

    # walk through the words
    for wr, text in words[1:]:  # start with second word
        # if this word matches top or bottom of the line, append it
        if abs(lrect.y0 - wr.y0) <= tolerance or abs(lrect.y1 - wr.y1) <= tolerance:
            line.append((wr, text))
            lrect |= wr
        else:
            # output current line and re-initialize
            lines.append((lrect, line_text(totalbox, line)))
            line = [(wr, text)]
            lrect = wr

    # also append unfinished last line
    lines.append((lrect, line_text(totalbox, line)))

    # sort all lines vertically
    lines.sort(key=lambda entry: entry[0].y1)

    chunks = [lines[0][1]]  # text of first line
    y1 = lines[0][0].y1  # its bottom coordinate
    for lrect, ltext in lines[1:]:
        # Number of empty lines to simulate: at most 5, and never negative.
        # A negative value (vertically overlapping lines) would otherwise
        # remove the line break and glue two lines together.
        distance = min(int(round((lrect.y0 - y1) / lrect.height)), 5)
        distance = max(distance, 0)
        chunks.append("\n" * (distance + 1) + ltext)
        y1 = lrect.y1

    # return text in clip
    return "".join(chunks)
def get_textbox(
    page: pymupdf.Page,
    rect: rect_like,
    textpage: pymupdf.TextPage = None,
) -> str:
    """Return the result of TextPage.extractTextbox for a rectangle.

    Args:
        page: the page to extract from.
        rect: the rectangle handed to extractTextbox.
        textpage: (pymupdf.TextPage) reuse this one. If None, a textpage is
            created with the defaults of page.get_textpage().
    Raises:
        ValueError: if 'textpage' does not belong to this page.
    """
    make_textpage = textpage is None
    if make_textpage:
        tp = page.get_textpage()
    else:
        tp = textpage
        if tp.parent != page:
            raise ValueError("not a textpage of this page")

    boxed = tp.extractTextbox(rect)
    if make_textpage:
        del tp  # only discard a textpage created here
    return boxed
def get_text_selection(
    page: pymupdf.Page,
    p1: point_like,
    p2: point_like,
    clip: rect_like = None,
    textpage: pymupdf.TextPage = None,
):
    """Return the result of TextPage.extractSelection for two points.

    Args:
        page: the page to extract from.
        p1: first point handed to extractSelection.
        p2: second point handed to extractSelection.
        clip: area handed to get_textpage if no textpage is passed in.
        textpage: (pymupdf.TextPage) reuse this one. If None, a textpage is
            created with flags pymupdf.TEXT_DEHYPHENATE.
    Raises:
        ValueError: if 'textpage' does not belong to this page.
    """
    pymupdf.CheckParent(page)
    make_textpage = textpage is None
    if make_textpage:
        tp = page.get_textpage(clip=clip, flags=pymupdf.TEXT_DEHYPHENATE)
    else:
        tp = textpage
        if tp.parent != page:
            raise ValueError("not a textpage of this page")

    selected = tp.extractSelection(p1, p2)
    if make_textpage:
        del tp  # only discard a textpage created here
    return selected
def get_textpage_ocr(
    page: pymupdf.Page,
    flags: int = 0,
    language: str = "eng",
    dpi: int = 72,
    full: bool = False,
    tessdata: str = None,
) -> pymupdf.TextPage:
    """Create a Textpage from combined results of normal and OCR text parsing.

    Args:
        page: the page to process.
        flags: (int) control content becoming part of the result.
        language: (str) specify expected language(s). Default is "eng" (English).
        dpi: (int) resolution in dpi, default 72.
        full: (bool) whether to OCR the full page image, or only its images (default)
        tessdata: (str) resolved via pymupdf.get_tessdata and handed to the
            OCR calls.
    Returns:
        The textpage. If OCR of one of the page's images raises RuntimeError
        or mupdf.FzErrorBase, the result of a full page OCR is returned
        instead.
    """
    pymupdf.CheckParent(page)
    tessdata = pymupdf.get_tessdata(tessdata)

    def full_ocr(page, dpi, language, flags):
        """OCR the pixmap of the whole page at the desired dpi."""
        scale = dpi / 72
        render_matrix = pymupdf.Matrix(scale, scale)
        rendered = page.get_pixmap(matrix=render_matrix)
        ocr_pdf = pymupdf.Document(
            "pdf",
            rendered.pdfocr_tobytes(
                compress=False,
                language=language,
                tessdata=tessdata,
            ),
        )
        ocr_page = ocr_pdf.load_page(0)
        # scale the OCR page back to the width of 'page'
        unzoom = page.rect.width / ocr_page.rect.width
        ctm = pymupdf.Matrix(unzoom, unzoom) * page.derotation_matrix
        result = ocr_page.get_textpage(flags=flags, matrix=ctm)
        ocr_pdf.close()
        rendered = None
        result.parent = weakref.proxy(page)
        return result

    # if OCR for the full page, OCR its pixmap @ desired dpi
    if full:
        return full_ocr(page, dpi, language, flags)

    # For partial OCR, make a normal textpage, then extend it with text that
    # is OCRed from each image.
    # Because of this, we need the images flag bit set ON.
    tpage = page.get_textpage(flags=flags)
    page_blocks = page.get_text("dict", flags=pymupdf.TEXT_PRESERVE_IMAGES)["blocks"]
    for block in page_blocks:
        if block["type"] != 1:  # only look at images
            continue
        bbox = pymupdf.Rect(block["bbox"])
        if bbox.width <= 3 or bbox.height <= 3:  # ignore tiny stuff
            continue
        try:
            img = pymupdf.Pixmap(block["image"])  # get image pixmap
            if img.n - img.alpha != 3:  # we need to convert this to RGB!
                img = pymupdf.Pixmap(pymupdf.csRGB, img)
            if img.alpha:  # must remove alpha channel
                img = pymupdf.Pixmap(img, 0)
            imgdoc = pymupdf.Document(
                "pdf",
                img.pdfocr_tobytes(language=language, tessdata=tessdata),
            )  # pdf with OCRed page
            imgpage = imgdoc.load_page(0)  # read image as a page
            img = None
            # compute matrix to transform coordinates back to that of 'page'
            imgrect = imgpage.rect  # page size of image PDF
            shrink = pymupdf.Matrix(1 / imgrect.width, 1 / imgrect.height)
            back_transform = shrink * block["transform"]
            imgpage.extend_textpage(tpage, flags=0, matrix=back_transform)
            imgdoc.close()
        except (RuntimeError, mupdf.FzErrorBase):
            if 0 and g_exceptions_verbose:
                # Don't show exception info here because it can happen in
                # normal operation (see test_3842b).
                pymupdf.exception_info()
            tpage = None
            pymupdf.message("Falling back to full page OCR")
            return full_ocr(page, dpi, language, flags)

    return tpage
def get_image_info(page: pymupdf.Page, hashes: bool = False, xrefs: bool = False) -> list:
    """Extract image information only from a pymupdf.TextPage.

    The result of an extraction which includes hashes is stored in
    page._image_info and reused by later calls.

    Args:
        page: the page to extract from.
        hashes: (bool) include MD5 hash for each image.
        xrefs: (bool) try to find the xref for each image. Sets hashes to true.
            Ignored for non-PDF documents.
    Returns:
        The list of image information items. With xrefs, each item has an
        "xref" key, which is 0 if no image with the same digest was found.
    """
    doc = page.parent
    is_pdf = doc.is_pdf
    if not is_pdf:
        xrefs = False
    elif xrefs:
        hashes = True  # xrefs are located via the image digests

    imginfo = getattr(page, "_image_info", None)
    if imginfo and not xrefs:
        return imginfo  # the cached information is sufficient

    if not imginfo:
        tp = page.get_textpage(flags=pymupdf.TEXT_PRESERVE_IMAGES)
        imginfo = tp.extractIMGINFO(hashes=hashes)
        del tp
        if hashes:
            page._image_info = imginfo  # store for reuse

    if not xrefs or not is_pdf:
        return imginfo

    # Map the digest of every image of the page to its xref ...
    xref_by_digest = {
        pymupdf.Pixmap(doc, entry[0]).digest: entry[0]
        for entry in page.get_images()
    }
    # ... and look up each extracted image via its digest.
    for info in imginfo:
        info["xref"] = xref_by_digest.get(info["digest"], 0)

    return imginfo
def get_image_rects(page: pymupdf.Page, name, transform=False) -> list:
    """Return list of image positions on a page.

    The image is located by comparing its digest with the digests in the
    page's image information.

    Args:
        page: the page to inspect.
        name: (str, list, int) image identification. May be reference name, an
            item of the page's image list or an xref.
        transform: (bool) whether to also return the transformation matrix.
    Returns:
        A list of pymupdf.Rect objects or tuples of (pymupdf.Rect, pymupdf.Matrix)
        for all image locations on the page.
    Raises:
        ValueError: if a reference name matches no image or several images.
    """
    if type(name) in (list, tuple):
        xref = name[0]  # item of the page's image list
    elif type(name) is int:
        xref = name
    else:
        # reference name: must identify exactly one image of the page
        matches = [img for img in page.get_images() if img[7] == name]
        if not matches:
            raise ValueError("bad image name")
        if len(matches) != 1:
            raise ValueError("multiple image names found")
        xref = matches[0][0]

    # make pixmap of the image to compute MD5
    probe = pymupdf.Pixmap(page.parent, xref)
    digest = probe.digest
    del probe

    hits = [
        info for info in page.get_image_info(hashes=True) if info["digest"] == digest
    ]
    if transform:
        return [
            (pymupdf.Rect(info["bbox"]), pymupdf.Matrix(info["transform"]))
            for info in hits
        ]
    return [pymupdf.Rect(info["bbox"]) for info in hits]
def get_text(
    page: pymupdf.Page,
    option: str = "text",
    *,
    clip: rect_like = None,
    flags: OptInt = None,
    textpage: pymupdf.TextPage = None,
    sort: bool = False,
    delimiters=None,
    tolerance=3,
):
    """Extract text from a page or an annotation.

    This is a unifying wrapper for various methods of the pymupdf.TextPage class.

    Args:
        page: the page to extract from.
        option: (str) text, words, blocks, html, dict, json, rawdict, rawjson,
            xhtml or xml. Case is ignored.
        clip: (rect-like) restrict output to this area.
        flags: bit switches to e.g. exclude images or decompose ligatures.
            Defaults to the TEXTFLAGS_* value belonging to 'option'.
        textpage: reuse this pymupdf.TextPage and make no new one. If specified,
            'flags' and 'clip' are ignored.
        sort: (bool) passed on to the extraction method.
        delimiters: (str,list) word delimiters, used for option "words" only.
        tolerance: (float) line detection tolerance, used for options "words"
            and "text" if sort is true.

    Returns:
        the output of methods get_text_words / get_text_blocks or pymupdf.TextPage
        methods extractText, extractHTML, extractDICT, extractJSON, extractRAWDICT,
        extractXHTML or extractXML respectively.
        Default choice is "text".
    Raises:
        AssertionError: on an unknown 'option' (unless assertions are disabled,
            in which case "text" is used).
        ValueError: if 'textpage' does not belong to this page.
    """
    formats = {
        "text": pymupdf.TEXTFLAGS_TEXT,
        "html": pymupdf.TEXTFLAGS_HTML,
        "json": pymupdf.TEXTFLAGS_DICT,
        "rawjson": pymupdf.TEXTFLAGS_RAWDICT,
        "xml": pymupdf.TEXTFLAGS_XML,
        "xhtml": pymupdf.TEXTFLAGS_XHTML,
        "dict": pymupdf.TEXTFLAGS_DICT,
        "rawdict": pymupdf.TEXTFLAGS_RAWDICT,
        "words": pymupdf.TEXTFLAGS_WORDS,
        "blocks": pymupdf.TEXTFLAGS_BLOCKS,
    }
    option = option.lower()
    assert option in formats
    if option not in formats:
        # only reachable when assertions are disabled
        option = "text"
    if flags is None:
        flags = formats[option]

    if option == "words":
        return get_text_words(
            page,
            clip=clip,
            flags=flags,
            textpage=textpage,
            sort=sort,
            delimiters=delimiters,
            tolerance=tolerance,  # honor the caller's tolerance when sorting
        )
    if option == "blocks":
        return get_text_blocks(
            page, clip=clip, flags=flags, textpage=textpage, sort=sort
        )

    if option == "text" and sort:
        return get_sorted_text(
            page,
            clip=clip,
            flags=flags,
            textpage=textpage,
            tolerance=tolerance,
        )

    pymupdf.CheckParent(page)
    cb = None
    if option in ("html", "xml", "xhtml"):  # no clipping for MuPDF functions
        clip = page.cropbox
    if clip is not None:
        clip = pymupdf.Rect(clip)
        cb = None
    elif type(page) is pymupdf.Page:
        cb = page.cropbox

    # pymupdf.TextPage with or without images
    tp = textpage
    if tp is None:
        tp = page.get_textpage(clip=clip, flags=flags)
    elif getattr(tp, "parent") != page:
        raise ValueError("not a textpage of this page")

    if option == "json":
        t = tp.extractJSON(cb=cb, sort=sort)
    elif option == "rawjson":
        t = tp.extractRAWJSON(cb=cb, sort=sort)
    elif option == "dict":
        t = tp.extractDICT(cb=cb, sort=sort)
    elif option == "rawdict":
        t = tp.extractRAWDICT(cb=cb, sort=sort)
    elif option == "html":
        t = tp.extractHTML()
    elif option == "xml":
        t = tp.extractXML()
    elif option == "xhtml":
        t = tp.extractXHTML()
    else:
        t = tp.extractText(sort=sort)

    if textpage is None:
        del tp  # only discard a textpage created here
    return t
def get_page_text(
    doc: pymupdf.Document,
    pno: int,
    option: str = "text",
    clip: rect_like = None,
    flags: OptInt = None,
    textpage: pymupdf.TextPage = None,
    sort: bool = False,
) -> typing.Any:
    """Extract a document page's text by page number.

    Notes:
        Convenience function calling page.get_text().
        The 'textpage' argument is accepted, but not passed on.
    Args:
        doc: the document.
        pno: page number
        option: (str) text, words, blocks, html, dict, json, rawdict, xhtml or xml.
        clip: passed on to page.get_text().
        flags: passed on to page.get_text().
        textpage: not used.
        sort: passed on to page.get_text().
    Returns:
        output from page.get_text().
    """
    target_page = doc[pno]
    return target_page.get_text(option, clip=clip, flags=flags, sort=sort)
def get_pixmap(
    page: pymupdf.Page,
    *,
    matrix: matrix_like = pymupdf.Identity,
    dpi=None,
    colorspace: pymupdf.Colorspace = pymupdf.csRGB,
    clip: rect_like = None,
    alpha: bool = False,
    annots: bool = True,
) -> pymupdf.Pixmap:
    """Create pixmap of page.

    Keyword args:
        matrix: Matrix for transformation (default: Identity).
        dpi: desired dots per inch. If given, matrix is ignored.
        colorspace: (str/Colorspace) cmyk, rgb, gray - case ignored, default csRGB.
            Any other string selects csRGB.
        clip: (irect-like) restrict rendering to this area.
        alpha: (bool) whether to include alpha channel
        annots: (bool) whether to also render annotations
    Raises:
        ValueError: if the colorspace does not have 1, 3 or 4 components.
    """
    if dpi:
        # the resolution overrides any matrix: zoom relative to 72 dpi
        factor = dpi / 72
        matrix = pymupdf.Matrix(factor, factor)

    if type(colorspace) is str:
        named_spaces = {"GRAY": pymupdf.csGRAY, "CMYK": pymupdf.csCMYK}
        colorspace = named_spaces.get(colorspace.upper(), pymupdf.csRGB)
    if colorspace.n not in (1, 3, 4):
        raise ValueError("unsupported colorspace")

    display_list = page.get_displaylist(annots=annots)
    pix = display_list.get_pixmap(
        matrix=matrix,
        colorspace=colorspace,
        alpha=alpha,
        clip=clip,
    )
    display_list = None
    if dpi:
        pix.set_dpi(dpi, dpi)
    return pix
def get_page_pixmap(
    doc: pymupdf.Document,
    pno: int,
    *,
    matrix: matrix_like = pymupdf.Identity,
    dpi=None,
    colorspace: pymupdf.Colorspace = pymupdf.csRGB,
    clip: rect_like = None,
    alpha: bool = False,
    annots: bool = True,
) -> pymupdf.Pixmap:
    """Create pixmap of document page by page number.

    Notes:
        Convenience function calling page.get_pixmap.
    Args:
        doc: the document.
        pno: (int) page number
        matrix: pymupdf.Matrix for transformation (default: pymupdf.Identity).
        dpi: passed on to page.get_pixmap.
        colorspace: (str,pymupdf.Colorspace) cmyk, rgb, gray - case ignored, default csRGB.
        clip: (irect-like) restrict rendering to this area.
        alpha: (bool) include alpha channel
        annots: (bool) also render annotations
    """
    target_page = doc[pno]
    return target_page.get_pixmap(
        matrix=matrix,
        dpi=dpi,
        colorspace=colorspace,
        clip=clip,
        alpha=alpha,
        annots=annots,
    )
def getLinkDict(ln, document=None) -> dict:
    """Convert a link or an outline item into a dictionary.

    Args:
        ln: a pymupdf.Link or a pymupdf.Outline.
        document: handed to Outline.destination(); not used for links.
    Returns:
        A dictionary with keys "kind" and "xref" (always 0 here), "from" if
        'ln' has a 'rect', plus further keys depending on the destination
        kind.
    """
    if isinstance(ln, pymupdf.Outline):
        dest = ln.destination(document)
    elif isinstance(ln, pymupdf.Link):
        dest = ln.dest
    else:
        assert 0, f'Unexpected {type(ln)=}.'

    kind = dest.kind
    nl = {"kind": kind, "xref": 0}
    try:
        if hasattr(ln, 'rect'):
            nl["from"] = ln.rect
    except Exception:
        # This seems to happen quite often in PyMuPDF/tests.
        if g_exceptions_verbose >= 2:
            pymupdf.exception_info()

    # Target point: only take over the coordinates flagged as valid.
    pnt = pymupdf.Point(0, 0)
    if dest.flags & pymupdf.LINK_FLAG_L_VALID:
        pnt.x = dest.lt.x
    if dest.flags & pymupdf.LINK_FLAG_T_VALID:
        pnt.y = dest.lt.y

    if kind == pymupdf.LINK_URI:
        nl["uri"] = dest.uri
        return nl

    if kind == pymupdf.LINK_GOTO:
        nl["page"] = dest.page
        nl["to"] = pnt
        nl["zoom"] = dest.rb.x if dest.flags & pymupdf.LINK_FLAG_R_IS_ZOOM else 0.0
        return nl

    if kind == pymupdf.LINK_GOTOR:
        nl["file"] = dest.file_spec.replace("\\", "/")
        nl["page"] = dest.page
        if dest.page < 0:
            nl["to"] = dest.dest
        else:
            nl["to"] = pnt
            nl["zoom"] = dest.rb.x if dest.flags & pymupdf.LINK_FLAG_R_IS_ZOOM else 0.0
        return nl

    if kind == pymupdf.LINK_LAUNCH:
        nl["file"] = dest.file_spec.replace("\\", "/")
        return nl

    if kind == pymupdf.LINK_NAMED:
        # The dicts should not have same key(s).
        assert not (dest.named.keys() & nl.keys())
        nl.update(dest.named)
        if 'to' in nl:
            nl['to'] = pymupdf.Point(nl['to'])
        return nl

    nl["page"] = dest.page
    return nl
def get_links(page: pymupdf.Page) -> list:
    """Return a list of dictionaries, one per link on the page.

    Notes:
        Each entry is built by getLinkDict. For PDF pages, "xref" and "id"
        are filled in from the page's link annotations, but only when the
        number of link annotations equals the number of links found.
        See the PyMuPDF documentation for details.
    """
    pymupdf.CheckParent(page)

    links = []
    current = page.first_link
    while current:
        links.append(getLinkDict(current, page.parent))
        current = current.next

    if links and page.parent.is_pdf:
        link_annots = [
            item
            for item in pymupdf.JM_get_annot_xref_list2(page)
            if item[1] == pymupdf.PDF_ANNOT_LINK  # pylint: disable=no-member
        ]
        if len(link_annots) == len(links):
            for link, item in zip(links, link_annots):
                link["xref"] = item[0]
                link["id"] = item[2]
    return links
def get_toc(
    doc: pymupdf.Document,
    simple: bool = True,
) -> list:
    """Create a table of contents.

    Args:
        simple: a bool to control output. If True, each entry is
            [level, title, page]; if False, a link destination dictionary
            is appended as a fourth item. For details see PyMuPDF's
            documentation.
    Returns:
        A list of entries, empty if the document has no outline.
    Raises:
        ValueError: if the document is closed.
    """

    def recurse(item, entries, level):
        """Walk the sibling chain of 'item', descending into children first."""
        while item and item.this.m_internal:
            title = item.title or " "

            # 1-based page number, or -1 if there is no internal target
            if item.is_external or not item.uri:
                page = -1
            elif item.page == -1:
                page = doc.resolve_link(item.uri)[0] + 1
            else:
                page = item.page + 1

            if simple:
                entries.append([level, title, page])
            else:
                entries.append([level, title, page, getLinkDict(item, doc)])

            if item.down:
                entries = recurse(item.down, entries, level + 1)
            item = item.next
        return entries

    # ensure document is open
    if doc.is_closed:
        raise ValueError("document closed")
    doc.init_doc()

    first_item = doc.outline
    if not first_item:
        return []

    toc = recurse(first_item, [], 1)
    if doc.is_pdf and not simple:
        doc._extend_toc_items(toc)
    return toc
def del_toc_item(
    doc: pymupdf.Document,
    idx: int,
) -> None:
    """Delete the TOC / bookmark item at position 'idx'.

    Args:
        idx: (int) index into the list returned by doc.get_outline_xrefs().
    """
    outline_xrefs = doc.get_outline_xrefs()
    doc._remove_toc_item(outline_xrefs[idx])
def set_toc_item(
    doc: pymupdf.Document,
    idx: int,
    dest_dict: OptDict = None,
    kind: OptInt = None,
    pno: OptInt = None,
    uri: OptStr = None,
    title: OptStr = None,
    to: point_like = None,
    filename: OptStr = None,
    zoom: float = 0,
) -> None:
    """Update TOC item by index.

    It allows changing the item's title and link destination.

    Args:
        idx:
            (int) desired index of the TOC list, as created by get_toc.
        dest_dict:
            (dict) destination dictionary as created by get_toc(False).
            Outrules all other parameters. If None, the remaining parameters
            are used to make a dest dictionary. The dictionary and the point
            stored under its "to" key are not modified.
        kind:
            (int) kind of link (pymupdf.LINK_GOTO, etc.). If None, then only
            the title will be updated. If pymupdf.LINK_NONE, the TOC item will
            be deleted.
        pno:
            (int) page number (1-based like in get_toc). Required if
            pymupdf.LINK_GOTO.
        uri:
            (str) the URL, required if pymupdf.LINK_URI.
        title:
            (str) the new title. No change if None.
        to:
            (point-like) destination on the target page. If omitted, (72, 36)
            will be used as target coordinates.
        filename:
            (str) destination filename, required for pymupdf.LINK_GOTOR and
            pymupdf.LINK_LAUNCH.
        zoom:
            (float) a zoom factor for the target location (pymupdf.LINK_GOTO).
    Raises:
        ValueError: for a bad page number, a bad color value, or if no valid
            action string could be built from the destination.
    """
    xref = doc.get_outline_xrefs()[idx]
    page_xref = 0

    if type(dest_dict) is dict:
        # Work on a shallow copy: the caller's dictionary must stay untouched,
        # otherwise reusing it would flip the y-coordinate a second time.
        ddict = dict(dest_dict)
        if ddict["kind"] == pymupdf.LINK_GOTO:
            pno = ddict["page"]
            page_xref = doc.page_xref(pno)
            page_height = doc.page_cropbox(pno).height
            # Point(...) makes a fresh point, so the caller's is not mutated.
            target = pymupdf.Point(ddict.get("to", pymupdf.Point(72, 36)))
            target.y = page_height - target.y  # measure y from the page bottom
            ddict["to"] = target
        action = getDestStr(page_xref, ddict)
        if not action.startswith("/A"):
            raise ValueError("bad bookmark dest")

        color = ddict.get("color")
        if color:
            color = list(map(float, color))
            if len(color) != 3 or min(color) < 0 or max(color) > 1:
                raise ValueError("bad color value")

        bold = ddict.get("bold", False)
        italic = ddict.get("italic", False)
        flags = italic + 2 * bold  # bit 0: italic, bit 1: bold
        collapse = ddict.get("collapse")
        return doc._update_toc_item(
            xref,
            action=action[2:],  # strip the leading "/A"
            title=title,
            color=color,
            flags=flags,
            collapse=collapse,
        )

    if kind == pymupdf.LINK_NONE:  # delete bookmark item
        return doc.del_toc_item(idx)
    if kind is None and title is None:  # treat as no-op
        return None
    if kind is None:  # only update title text
        return doc._update_toc_item(xref, action=None, title=title)

    if kind == pymupdf.LINK_GOTO:
        if pno is None or pno not in range(1, doc.page_count + 1):
            raise ValueError("bad page number")
        page_xref = doc.page_xref(pno - 1)
        page_height = doc.page_cropbox(pno - 1).height
        if to is None:
            to = pymupdf.Point(72, page_height - 36)
        else:
            to = pymupdf.Point(to)
            to.y = page_height - to.y

    ddict = {
        "kind": kind,
        "to": to,
        "uri": uri,
        "page": pno,
        "file": filename,
        "zoom": zoom,
    }
    action = getDestStr(page_xref, ddict)
    if action == "" or not action.startswith("/A"):
        raise ValueError("bad bookmark dest")

    return doc._update_toc_item(xref, action=action[2:], title=title)
def get_area(*args) -> float:
    """Calculate the area of a rectangle.

    Args:
        args[0]: an object with 'width' and 'height' attributes.
        args[1]: optional unit, one of 'px' (default), 'in', 'cm', or 'mm'.
    Returns:
        width * height, scaled by the square of the unit's conversion factor.
    """
    rect = args[0]
    unit = args[1] if len(args) > 1 else "px"

    # unit -> (numerator, denominator) of the linear conversion factor
    conversions = {
        "px": (1, 1),
        "in": (1.0, 72.0),
        "cm": (2.54, 72.0),
        "mm": (25.4, 72.0),
    }
    numerator, denominator = conversions[unit]
    factor = (numerator / denominator) ** 2
    return factor * rect.width * rect.height
def set_metadata(doc: pymupdf.Document, m: dict = None) -> None:
    """Update the PDF /Info object.

    Args:
        m: a dictionary like doc.metadata. None or an empty dict removes
            the reference to an existing /Info object.
    Raises:
        ValueError: if the document is no PDF, is closed or encrypted, if
            'm' is not a dict, or if it contains unknown keys.
    """
    if not doc.is_pdf:
        raise ValueError("is no PDF")
    if doc.is_closed or doc.is_encrypted:
        raise ValueError("document closed or encrypted")
    if m is None:
        m = {}
    elif type(m) is not dict:
        raise ValueError("bad metadata")

    # metadata key -> PDF /Info key; None marks accepted keys never written
    keymap = {
        "author": "Author",
        "producer": "Producer",
        "creator": "Creator",
        "title": "Title",
        "format": None,
        "encryption": None,
        "creationDate": "CreationDate",
        "modDate": "ModDate",
        "subject": "Subject",
        "keywords": "Keywords",
        "trapped": "Trapped",
    }
    diff_set = set(m.keys()).difference(set(keymap.keys()))
    if diff_set:
        msg = "bad dict key(s): %s" % diff_set
        raise ValueError(msg)

    # locate the current /Info object via the PDF trailer (xref -1)
    key_type, key_value = doc.xref_get_key(-1, "Info")
    info_xref = int(key_value.replace("0 R", "")) if key_type == "xref" else 0

    if not m and info_xref == 0:  # nothing to do
        return

    if info_xref == 0:  # no prev metadata: get new xref
        info_xref = doc.get_new_xref()
        doc.update_object(info_xref, "<<>>")  # fill it with empty object
        doc.xref_set_key(-1, "Info", "%i 0 R" % info_xref)
    elif not m:  # remove existing metadata
        doc.xref_set_key(-1, "Info", "null")
        doc.init_doc()
        return

    for key, val in m.items():
        pdf_key = keymap[key]
        if pdf_key is None:
            continue
        if not bool(val) or val in ("none", "null"):
            pdf_value = "null"
        else:
            pdf_value = pymupdf.get_pdf_str(val)
        doc.xref_set_key(info_xref, pdf_key, pdf_value)
    doc.init_doc()
    return
def getDestStr(xref: int, ddict: dict) -> str:
    """Calculate the PDF action string.

    Notes:
        Supports Link annotations and outline items (bookmarks).
    Args:
        xref: xref of the target page; used for LINK_GOTO and for the
            plain-number form of 'ddict'.
        ddict: destination dictionary with a "kind" key plus kind-specific
            keys ("to", "zoom", "uri", "file", "page"). An int or float is
            treated as the y-coordinate of a GoTo target on page 'xref'.
            "to" may be any 2-item point-like (pymupdf.Point, tuple, list).
    Returns:
        The action string starting with "/A", or "" if 'ddict' is empty,
        has kind LINK_NONE (or no kind), or has an unsupported kind.
    """
    if not ddict:
        return ""

    # a plain number is the y-coordinate of a GoTo destination
    if type(ddict) in (int, float):
        return f"/A<</S/GoTo/D[{xref} 0 R/XYZ {_format_g((0, ddict, 0))}]>>"

    d_kind = ddict.get("kind", pymupdf.LINK_NONE)
    if d_kind == pymupdf.LINK_NONE:
        return ""

    if d_kind == pymupdf.LINK_GOTO:
        d_zoom = ddict.get("zoom", 0)
        d_left, d_top = ddict.get("to", pymupdf.Point(0, 0))
        return f"/A<</S/GoTo/D[{xref} 0 R/XYZ {_format_g((d_left, d_top, d_zoom))}]>>"

    if d_kind == pymupdf.LINK_URI:
        uri = pymupdf.get_pdf_str(ddict["uri"])
        return f"/A<</S/URI/URI{uri}>>"

    if d_kind == pymupdf.LINK_LAUNCH:
        fspec = pymupdf.get_pdf_str(ddict["file"])
        return f"/A<</S/Launch/F<</F{fspec}/UF{fspec}/Type/Filespec>>>>"

    if d_kind == pymupdf.LINK_GOTOR:
        page = ddict["page"]
        fspec = pymupdf.get_pdf_str(ddict["file"])
        if page < 0:
            # no page number: "to" is written as a PDF string
            name = pymupdf.get_pdf_str(ddict["to"])
            return f"/A<</S/GoToR/D{name}/F<</F{fspec}/UF{fspec}/Type/Filespec>>>>"
        # Unpack instead of reading .x / .y: set_toc passes "to" as a tuple.
        d_left, d_top = ddict["to"]
        d_zoom = ddict.get("zoom", 0)
        return (
            f"/A<</S/GoToR/D[{page} /XYZ {_format_g((d_left, d_top, d_zoom))}]"
            f"/F<</F{fspec}/UF{fspec}/Type/Filespec>>>>"
        )

    return ""
def set_toc(
    doc: pymupdf.Document,
    toc: list,
    collapse: int = 1,
) -> int:
    """Create new outline tree (table of contents, TOC).

    Args:
        toc: (list, tuple) each entry must contain level, title, page and
            optionally top margin on the page (a number) or a destination
            dictionary. None or '()' remove the TOC.
        collapse: (int) collapses entries beyond this level. Zero or None
            shows all entries unfolded.
    Returns:
        the number of inserted items, or the number of removed items respectively.
    Raises:
        ValueError: if the document is closed, encrypted or no PDF, or if
            'toc' fails the formal checks.
    """
    if doc.is_closed or doc.is_encrypted:
        raise ValueError("document closed or encrypted")
    if not doc.is_pdf:
        raise ValueError("is no PDF")
    if not toc:  # remove all entries
        return len(doc._delToC())

    # validity checks --------------------------------------------------------
    if type(toc) not in (list, tuple):
        raise ValueError("'toc' must be list or tuple")
    toclen = len(toc)
    page_count = doc.page_count
    first_row = toc[0]
    if type(first_row) not in (list, tuple):
        raise ValueError("items must be sequences of 3 or 4 items")
    if first_row[0] != 1:
        raise ValueError("hierarchy level of item 0 must be 1")
    # each row is checked against its successor
    for row_no, (row, next_row) in enumerate(zip(toc, toc[1:])):
        if not -1 <= row[2] <= page_count:
            raise ValueError("row %i: page number out of range" % row_no)
        if (type(next_row) not in (list, tuple)) or len(next_row) not in (3, 4):
            raise ValueError("bad row %i" % (row_no + 1))
        if (type(next_row[0]) is not int) or next_row[0] < 1:
            raise ValueError("bad hierarchy level in row %i" % (row_no + 1))
        if next_row[0] > row[0] + 1:
            raise ValueError("bad hierarchy level in row %i" % (row_no + 1))
    # no formal errors in toc --------------------------------------------------

    # --------------------------------------------------------------------------
    # make a list of xref numbers, which we can use for our TOC entries
    # --------------------------------------------------------------------------
    doc._delToC()  # del old outlines; their xref numbers are not reused
    xref = [doc._getOLRootNumber()]  # entry zero is outline root xref number
    xref.extend(doc.get_new_xref() for _ in range(toclen))  # one per item

    lvltab = {0: 0}  # to store last entry per hierarchy level

    # ------------------------------------------------------------------------------
    # contains new outline objects as dicts - first one is the outline root
    # ------------------------------------------------------------------------------
    olitems = [{"count": 0, "first": -1, "last": -1, "xref": xref[0]}]

    # ------------------------------------------------------------------------------
    # build olitems as a list of PDF-like connected dictionaries
    # ------------------------------------------------------------------------------
    for item_no, row in enumerate(toc, start=1):
        lvl = row[0]  # level
        title = pymupdf.get_pdf_str(row[1])  # title
        pno = min(doc.page_count - 1, max(0, row[2] - 1))  # page number
        page_xref = doc.page_xref(pno)
        page_height = doc.page_cropbox(pno).height
        top = pymupdf.Point(72, page_height - 36)
        dest_dict = {"to": top, "kind": pymupdf.LINK_GOTO}  # fall back target
        if row[2] < 0:
            dest_dict["kind"] = pymupdf.LINK_NONE
        if len(row) > 3:  # some target is specified
            target = row[3]
            if type(target) in (int, float):  # convert a number to a point
                dest_dict["to"] = pymupdf.Point(72, page_height - target)
            else:  # if something else, make sure we have a dict
                # We make a copy to avoid modifying our caller's data.
                if type(target) is dict:
                    dest_dict = target.copy()
                if "to" not in dest_dict:  # target point not in dict?
                    dest_dict["to"] = top  # put default in
                else:  # transform target to PDF coordinates
                    page = doc[pno]
                    point = pymupdf.Point(dest_dict["to"])
                    point.y = page.cropbox.height - point.y
                    point = point * page.rotation_matrix
                    dest_dict["to"] = (point.x, point.y)

        entry = {
            "first": -1,
            "count": 0,
            "last": -1,
            "prev": -1,
            "next": -1,
            "dest": getDestStr(page_xref, dest_dict),
            "top": dest_dict["to"],
            "title": title,
            "parent": lvltab[lvl - 1],
            "xref": xref[item_no],
            "color": dest_dict.get("color"),
            "flags": dest_dict.get("italic", 0) + 2 * dest_dict.get("bold", 0),
        }

        lvltab[lvl] = item_no
        parent = olitems[lvltab[lvl - 1]]  # the parent entry

        if dest_dict.get("collapse") or collapse and lvl > collapse:
            parent["count"] -= 1  # suppress expansion: make /Count negative
        else:
            parent["count"] += 1  # positive /Count

        if parent["first"] == -1:
            parent["first"] = item_no
            parent["last"] = item_no
        else:
            entry["prev"] = parent["last"]
            olitems[parent["last"]]["next"] = item_no
            parent["last"] = item_no
        olitems.append(entry)

    # ------------------------------------------------------------------------------
    # now create each outline item as a string and insert it in the PDF
    # ------------------------------------------------------------------------------
    # Item keys holding an index into 'xref', in the order they are written.
    link_keys = (
        ("first", "/First"),
        ("last", "/Last"),
        ("next", "/Next"),
        ("parent", "/Parent"),
        ("prev", "/Prev"),
    )
    for i, ol in enumerate(olitems):
        parts = ["<<"]
        if ol["count"] != 0:
            parts.append("/Count %i" % ol["count"])

        # The outline root has no "dest", "next", "parent", "prev" or "title"
        # key; the resulting exceptions are swallowed on purpose.
        try:
            parts.append(ol["dest"])
        except Exception:
            # Verbose in PyMuPDF/tests.
            if g_exceptions_verbose >= 2:
                pymupdf.exception_info()

        for key, pdf_key in link_keys:
            try:
                if ol[key] > -1:
                    parts.append("%s %i 0 R" % (pdf_key, xref[ol[key]]))
            except Exception:
                # Verbose in PyMuPDF/tests.
                if g_exceptions_verbose >= 2:
                    pymupdf.exception_info()

        try:
            parts.append("/Title" + ol["title"])
        except Exception:
            # Verbose in PyMuPDF/tests.
            if g_exceptions_verbose >= 2:
                pymupdf.exception_info()

        if ol.get("color") and len(ol["color"]) == 3:
            parts.append(f"/C[ {_format_g(tuple(ol['color']))}]")
        if ol.get("flags", 0) > 0:
            parts.append("/F %i" % ol["flags"])

        if i == 0:  # special: this is the outline root
            parts.append("/Type/Outlines")  # so add the /Type entry
        parts.append(">>")

        doc.update_object(xref[i], "".join(parts))  # insert the PDF object

    doc.init_doc()
    return toclen
def do_widgets(
    tar: pymupdf.Document,
    src: pymupdf.Document,
    graftmap,
    from_page: int = -1,
    to_page: int = -1,
    start_at: int = -1,
    join_duplicates=0,
) -> None:
    """Insert widgets of copied page range into target PDF.

    Parameter values **must** equal those of method insert_pdf() which
    must have been previously executed.

    Args:
        tar: target document, already containing the copied pages.
        src: source document the pages were copied from.
        graftmap: graft map passed to mupdf.pdf_graft_mapped_object.
        from_page, to_page: first / last source page; the range is walked
            backwards if from_page > to_page.
        start_at: target page number of the first copied page.
        join_duplicates: if true, equally named fields are joined under one
            parent, otherwise the second one is renamed.
    """
    if not src.is_form_pdf:  # nothing to do: source PDF has no fields
        return

    def clean_kid_parents(acro_fields):
        """Make sure all kids have correct "Parent" pointers."""
        for field_no in range(acro_fields.pdf_array_len()):
            field = acro_fields.pdf_array_get(field_no)
            field_kids = field.pdf_dict_get(pymupdf.PDF_NAME("Kids"))
            for kid_no in range(field_kids.pdf_array_len()):
                kid = field_kids.pdf_array_get(kid_no)
                kid.pdf_dict_put(pymupdf.PDF_NAME("Parent"), field)

    def join_widgets(pdf, acro_fields, xref1, xref2, name):
        """Called for each pair of widgets having the same name.

        Args:
            pdf: target MuPDF document
            acro_fields: object Root/AcroForm/Fields
            xref1, xref2: widget xrefs having same names
            name: (str) the name

        Result:
            Defined or updated widget parent that points to both widgets.
        """

        def re_target(pdf, acro_fields, xref1, kids1, xref2, kids2):
            """Merge widget in xref2 into "Kids" list of widget xref1.

            Args:
                xref1, kids1: target widget and its "Kids" array.
                xref2, kids2: source widget and its "Kids" array (may be empty).
            """
            # make indirect objects from widgets
            target_ind = mupdf.pdf_new_indirect(pdf, xref1, 0)
            source_ind = mupdf.pdf_new_indirect(pdf, xref2, 0)
            # find source widget in "Fields" array and remove it there
            pos = acro_fields.pdf_array_find(source_ind)
            acro_fields.pdf_array_delete(pos)

            if not kids2.pdf_is_array():  # source widget has no kids
                source = mupdf.pdf_load_object(pdf, xref2)
                # delete name from widget and insert target as parent
                source.pdf_dict_del(pymupdf.PDF_NAME("T"))
                source.pdf_dict_put(pymupdf.PDF_NAME("Parent"), target_ind)
                # put in target Kids
                kids1.pdf_array_push(source_ind)
            else:  # copy source kids to target kids
                for kid_no in range(kids2.pdf_array_len()):
                    kid = kids2.pdf_array_get(kid_no)
                    kid.pdf_dict_put(pymupdf.PDF_NAME("Parent"), target_ind)
                    kid_ind = mupdf.pdf_new_indirect(pdf, kid.pdf_to_num(), 0)
                    kids1.pdf_array_push(kid_ind)

        def new_target(pdf, acro_fields, xref1, w1, xref2, w2, name):
            """Make new "Parent" for two widgets with same name.

            Args:
                xref1, w1: first widget
                xref2, w2: second widget
                name: field name

            Result:
                Both widgets have no "Kids". We create a new object with the
                name and a "Kids" array containing the widgets.
                Original widgets must be removed from AcroForm/Fields.
            """
            # make new "Parent" object
            new = mupdf.pdf_new_dict(pdf, 5)
            new.pdf_dict_put_text_string(pymupdf.PDF_NAME("T"), name)
            kids = new.pdf_dict_put_array(pymupdf.PDF_NAME("Kids"), 2)
            new_obj = mupdf.pdf_add_object(pdf, new)
            new_obj_xref = new_obj.pdf_to_num()
            new_ind = mupdf.pdf_new_indirect(pdf, new_obj_xref, 0)

            # copy over some required source widget properties
            field_type = w1.pdf_dict_get(pymupdf.PDF_NAME("FT"))
            w1.pdf_dict_del(pymupdf.PDF_NAME("FT"))
            new_obj.pdf_dict_put(pymupdf.PDF_NAME("FT"), field_type)

            add_actions = w1.pdf_dict_get(pymupdf.PDF_NAME("AA"))
            w1.pdf_dict_del(pymupdf.PDF_NAME("AA"))
            new_obj.pdf_dict_put(pymupdf.PDF_NAME("AA"), add_actions)

            # remove name field, insert "Parent" field in source widgets
            w1.pdf_dict_del(pymupdf.PDF_NAME("T"))
            w1.pdf_dict_put(pymupdf.PDF_NAME("Parent"), new_ind)
            w2.pdf_dict_del(pymupdf.PDF_NAME("T"))
            w2.pdf_dict_put(pymupdf.PDF_NAME("Parent"), new_ind)

            # put source widgets in "kids" array
            ind1 = mupdf.pdf_new_indirect(pdf, xref1, 0)
            ind2 = mupdf.pdf_new_indirect(pdf, xref2, 0)
            kids.pdf_array_push(ind1)
            kids.pdf_array_push(ind2)

            # remove source widgets from "AcroForm/Fields"
            pos = acro_fields.pdf_array_find(ind1)
            acro_fields.pdf_array_delete(pos)
            pos = acro_fields.pdf_array_find(ind2)
            acro_fields.pdf_array_delete(pos)

            acro_fields.pdf_array_push(new_ind)

        w1 = mupdf.pdf_load_object(pdf, xref1)
        w2 = mupdf.pdf_load_object(pdf, xref2)
        kids1 = w1.pdf_dict_get(pymupdf.PDF_NAME("Kids"))
        kids2 = w2.pdf_dict_get(pymupdf.PDF_NAME("Kids"))

        # check which widget has a suitable "Kids" array
        if kids1.pdf_is_array():
            re_target(pdf, acro_fields, xref1, kids1, xref2, kids2)  # pylint: disable=arguments-out-of-order
        elif kids2.pdf_is_array():
            re_target(pdf, acro_fields, xref2, kids2, xref1, kids1)  # pylint: disable=arguments-out-of-order
        else:
            new_target(pdf, acro_fields, xref1, w1, xref2, w2, name)  # pylint: disable=arguments-out-of-order

    def get_kids(parent, kids_list):
        """Return xref list of leaf kids for a parent.

        Call with an empty list.
        """
        kids = mupdf.pdf_dict_get(parent, pymupdf.PDF_NAME("Kids"))
        if not kids.pdf_is_array():
            return kids_list
        for kid_no in range(kids.pdf_array_len()):
            kid = kids.pdf_array_get(kid_no)
            # NOTE(review): the kid's "Kids" entry is tested with pdf_is_dict,
            # while everywhere else "Kids" is treated as an array - confirm.
            if mupdf.pdf_is_dict(mupdf.pdf_dict_get(kid, pymupdf.PDF_NAME("Kids"))):
                kids_list = get_kids(kid, kids_list)
            else:
                kids_list.append(kid.pdf_to_num())
        return kids_list

    def kids_xrefs(widget):
        """Get the xref of the widget's "Parent" and the list of its leaf kids."""
        parent = mupdf.pdf_dict_get(widget, pymupdf.PDF_NAME("Parent"))
        parent_xref = parent.pdf_to_num()
        if parent_xref == 0:  # widget has no parent
            return parent_xref, []
        return parent_xref, get_kids(parent, [])

    def deduplicate_names(pdf, acro_fields, join_duplicates=False):
        """Handle any widget name duplicates caused by the merge."""
        names = {}  # key is a widget name, value a list of widgets having it.

        # extract all names and widgets in "AcroForm/Fields"
        for field_no in range(mupdf.pdf_array_len(acro_fields)):
            wobject = mupdf.pdf_array_get(acro_fields, field_no)
            xref = wobject.pdf_to_num()

            # extract widget name and collect widget(s) using it
            T = mupdf.pdf_dict_get_text_string(wobject, pymupdf.PDF_NAME("T"))
            names.setdefault(T, []).append(xref)

        for name, xrefs in names.items():
            if len(xrefs) < 2:
                continue
            xref0, xref1 = xrefs[:2]  # only exactly 2 should occur!
            if join_duplicates:  # combine fields with equal names
                join_widgets(pdf, acro_fields, xref0, xref1, name)
            else:  # make field names unique
                newname = name + f" [{xref1}]"  # append this to the name
                wobject = mupdf.pdf_load_object(pdf, xref1)
                wobject.pdf_dict_put_text_string(pymupdf.PDF_NAME("T"), newname)

        clean_kid_parents(acro_fields)

    def get_acroform(doc):
        """Retrieve the AcroForm dictionary from a PDF."""
        pdf = mupdf.pdf_document_from_fz_document(doc)
        # AcroForm (= central form field info)
        return mupdf.pdf_dict_getp(mupdf.pdf_trailer(pdf), "Root/AcroForm")

    tarpdf = mupdf.pdf_document_from_fz_document(tar)
    srcpdf = mupdf.pdf_document_from_fz_document(src)

    if tar.is_form_pdf:
        # target is a Form PDF, so use it to include source fields
        acro = get_acroform(tar)
        # Important arrays in AcroForm
        acro_fields = acro.pdf_dict_get(pymupdf.PDF_NAME("Fields"))
        tar_co = acro.pdf_dict_get(pymupdf.PDF_NAME("CO"))
        if not tar_co.pdf_is_array():
            tar_co = acro.pdf_dict_put_array(pymupdf.PDF_NAME("CO"), 5)
    else:
        # target is no Form PDF, so copy over source AcroForm
        acro = mupdf.pdf_deep_copy_obj(get_acroform(src))  # make a copy

        # Clear "Fields" and "CO" arrays: will be populated by page fields.
        # This is required to avoid copying unneeded objects.
        acro.pdf_dict_del(pymupdf.PDF_NAME("Fields"))
        acro.pdf_dict_put_array(pymupdf.PDF_NAME("Fields"), 5)
        acro.pdf_dict_del(pymupdf.PDF_NAME("CO"))
        acro.pdf_dict_put_array(pymupdf.PDF_NAME("CO"), 5)

        # Enrich AcroForm for copying to target
        acro_graft = mupdf.pdf_graft_mapped_object(graftmap, acro)

        # Insert AcroForm into target PDF
        acro_tar = mupdf.pdf_add_object(tarpdf, acro_graft)
        acro_fields = acro_tar.pdf_dict_get(pymupdf.PDF_NAME("Fields"))
        tar_co = acro_tar.pdf_dict_get(pymupdf.PDF_NAME("CO"))

        # get its xref and insert it into target catalog
        tar_xref = acro_tar.pdf_to_num()
        acro_tar_ind = mupdf.pdf_new_indirect(tarpdf, tar_xref, 0)
        root = mupdf.pdf_dict_get(mupdf.pdf_trailer(tarpdf), pymupdf.PDF_NAME("Root"))
        root.pdf_dict_put(pymupdf.PDF_NAME("AcroForm"), acro_tar_ind)

    if from_page <= to_page:
        src_range = range(from_page, to_page + 1)
    else:  # reversed page range
        src_range = range(from_page, to_page - 1, -1)

    parents = {}  # information about widget parents

    # remove "P" owning page reference from all widgets of all source pages
    for src_pno in src_range:
        src_page = src[src_pno]
        page_widget_xrefs = [
            xref
            for xref, wtype, _ in src_page.annot_xrefs()
            if wtype == pymupdf.PDF_ANNOT_WIDGET  # pylint: disable=no-member
        ]
        for xref in page_widget_xrefs:
            w_obj = mupdf.pdf_load_object(srcpdf, xref)
            w_obj.pdf_dict_del(pymupdf.PDF_NAME("P"))

            # get the widget's parent structure
            parent_xref, old_kids = kids_xrefs(w_obj)
            if parent_xref:
                parents[parent_xref] = {
                    "new_xref": 0,
                    "old_kids": old_kids,
                    "new_kids": [],
                }

    # Copy over Parent widgets first - they are not page-dependent
    for xref, parent_info in parents.items():
        parent = mupdf.pdf_load_object(srcpdf, xref)
        parent_graft = mupdf.pdf_graft_mapped_object(graftmap, parent)
        parent_tar = mupdf.pdf_add_object(tarpdf, parent_graft)
        kids_xrefs_new = get_kids(parent_tar, [])
        parent_xref_new = parent_tar.pdf_to_num()
        parent_ind = mupdf.pdf_new_indirect(tarpdf, parent_xref_new, 0)
        acro_fields.pdf_array_push(parent_ind)
        parent_info["new_xref"] = parent_xref_new
        parent_info["new_kids"] = kids_xrefs_new

    for offset, src_pno in enumerate(src_range):
        # read the copied page in the target
        tar_page = tar[start_at + offset]

        # read the original page in the source PDF
        src_page = src[src_pno]

        # now walk through source page widgets and copy over
        w_xrefs = [  # widget xrefs of the source page
            xref
            for xref, wtype, _ in src_page.annot_xrefs()
            if wtype == pymupdf.PDF_ANNOT_WIDGET  # pylint: disable=no-member
        ]
        if not w_xrefs:  # no widgets on this source page
            continue

        # convert to formal PDF page
        tar_page_pdf = mupdf.pdf_page_from_fz_page(tar_page)

        # extract annotations array, creating it if missing
        tar_annots = mupdf.pdf_dict_get(tar_page_pdf.obj(), pymupdf.PDF_NAME("Annots"))
        if not mupdf.pdf_is_array(tar_annots):
            tar_annots = mupdf.pdf_dict_put_array(
                tar_page_pdf.obj(), pymupdf.PDF_NAME("Annots"), 5
            )

        for xref in w_xrefs:
            w_obj = mupdf.pdf_load_object(srcpdf, xref)

            # check if field takes part in inter-field validations
            is_aac = mupdf.pdf_is_dict(mupdf.pdf_dict_getp(w_obj, "AA/C"))

            # check if parent of widget already in target
            parent_xref = mupdf.pdf_to_num(
                w_obj.pdf_dict_get(pymupdf.PDF_NAME("Parent"))
            )
            if parent_xref == 0:  # parent not in target yet
                try:
                    w_obj_graft = mupdf.pdf_graft_mapped_object(graftmap, w_obj)
                except Exception as e:
                    pymupdf.message_warning(f"cannot copy widget at {xref=}: {e}")
                    continue
                w_obj_tar = mupdf.pdf_add_object(tarpdf, w_obj_graft)
                tar_xref = w_obj_tar.pdf_to_num()
                w_obj_tar_ind = mupdf.pdf_new_indirect(tarpdf, tar_xref, 0)
                mupdf.pdf_array_push(tar_annots, w_obj_tar_ind)
                mupdf.pdf_array_push(acro_fields, w_obj_tar_ind)
            else:
                parent_info = parents[parent_xref]
                # the kid's position in the old list selects its new xref
                pos = parent_info["old_kids"].index(xref)
                tar_xref = parent_info["new_kids"][pos]
                w_obj_tar_ind = mupdf.pdf_new_indirect(tarpdf, tar_xref, 0)
                mupdf.pdf_array_push(tar_annots, w_obj_tar_ind)

            # Into "AcroForm/CO" if a computation field.
            if is_aac:
                mupdf.pdf_array_push(tar_co, w_obj_tar_ind)

    deduplicate_names(tarpdf, acro_fields, join_duplicates=join_duplicates)
def do_links(
    doc1: pymupdf.Document,
    doc2: pymupdf.Document,
    from_page: int = -1,
    to_page: int = -1,
    start_at: int = -1,
) -> None:
    """Insert links contained in copied page range into destination PDF.

    Parameter values **must** equal those of method insert_pdf(), which must
    have been previously executed.

    Args:
        doc1: destination document.
        doc2: source document.
        from_page, to_page: source page range; out-of-range values are
            clamped, and the range may be reversed.
        start_at: destination page number of the first copied page.
    Raises:
        ValueError: if 'start_at' is negative.
    """

    # --------------------------------------------------------------------------
    # internal function to create the actual "/Annots" object string
    # --------------------------------------------------------------------------
    def cre_annot(lnk, xref_dst, pno_src, ctm):
        """Create annotation object string for a passed-in link."""
        rect = _format_g(tuple(lnk["from"] * ctm))  # rect in PDF coordinates
        kind = lnk["kind"]

        if kind == pymupdf.LINK_GOTO:
            make = pymupdf.annot_skel["goto1"]  # annot_goto
            pos = pno_src.index(lnk["page"])
            target = lnk["to"] * ctm  # target point in PDF coordinates
            return make(xref_dst[pos], target.x, target.y, lnk["zoom"], rect)

        if kind == pymupdf.LINK_GOTOR:
            if lnk["page"] >= 0:
                make = pymupdf.annot_skel["gotor1"]  # annot_gotor
                target = lnk.get("to", pymupdf.Point(0, 0))  # destination point
                if type(target) is not pymupdf.Point:
                    target = pymupdf.Point(0, 0)
                return make(
                    lnk["page"],
                    target.x,
                    target.y,
                    lnk["zoom"],
                    lnk["file"],
                    lnk["file"],
                    rect,
                )
            make = pymupdf.annot_skel["gotor2"]  # annot_gotor_n
            dest_name = pymupdf.get_pdf_str(lnk["to"])[1:-1]  # drop delimiters
            return make(dest_name, lnk["file"], rect)

        if kind == pymupdf.LINK_LAUNCH:
            make = pymupdf.annot_skel["launch"]  # annot_launch
            return make(lnk["file"], lnk["file"], rect)

        if kind == pymupdf.LINK_URI:
            make = pymupdf.annot_skel["uri"]  # annot_uri
            return make(lnk["uri"], rect)

        return ""

    # --------------------------------------------------------------------------
    # validate & normalize parameters
    if from_page < 0:
        fp = 0
    elif from_page >= doc2.page_count:
        fp = doc2.page_count - 1
    else:
        fp = from_page

    if to_page < 0 or to_page >= doc2.page_count:
        tp = doc2.page_count - 1
    else:
        tp = to_page

    if start_at < 0:
        raise ValueError("'start_at' must be >= 0")
    sa = start_at

    incr = 1 if fp <= tp else -1  # page range could be reversed

    # lists of source / destination page numbers
    pno_src = list(range(fp, tp + incr, incr))
    pno_dst = [sa + offset for offset in range(len(pno_src))]

    # lists of source / destination page xrefs
    xref_src = []
    xref_dst = []
    for p_src, p_dst in zip(pno_src, pno_dst):
        xref_src.append(doc2.page_xref(p_src))
        xref_dst.append(doc1.page_xref(p_dst))

    # create the links for each copied page in destination PDF
    for pos in range(len(xref_src)):
        page_src = doc2[pno_src[pos]]  # load source page
        links = page_src.get_links()  # get all its links
        if len(links) == 0:  # no links there
            page_src = None
            continue

        ctm = ~page_src.transformation_matrix  # calc page transformation matrix
        page_dst = doc1[pno_dst[pos]]  # load destination page
        link_tab = []  # store all link definitions here
        for link in links:
            if link["kind"] == pymupdf.LINK_GOTO and (link["page"] not in pno_src):
                continue  # GOTO link target not in copied pages
            annot_text = cre_annot(link, xref_dst, pno_src, ctm)
            if annot_text:
                link_tab.append(annot_text)
        if link_tab:
            page_dst._addAnnot_FromString(tuple(link_tab))
def getLinkText(page: pymupdf.Page, lnk: dict) -> str:
    """Build the PDF object source of a link annotation for 'page'.

    Args:
        page: the page that owns the link.
        lnk: link dictionary with "from" and "kind" plus kind-specific keys.
    Returns:
        The annotation object string including a /NM (name) entry, or ""
        if the link kind is not supported.
    """
    # link rectangle in PDF coordinates
    inverse_ctm = ~page.transformation_matrix
    rect = _format_g(tuple(lnk["from"] * inverse_ctm))

    annot = ""
    kind = lnk["kind"]
    if kind == pymupdf.LINK_GOTO:
        if lnk["page"] >= 0:
            make = pymupdf.annot_skel["goto1"]  # annot_goto
            pno = lnk["page"]
            xref = page.parent.page_xref(pno)
            target = lnk.get("to", pymupdf.Point(0, 0))  # destination point
            # the target point is transformed with the destination page's matrix
            dest_page = page.parent[pno]
            pdf_target = target * ~dest_page.transformation_matrix
            annot = make(xref, pdf_target.x, pdf_target.y, lnk.get("zoom", 0), rect)
        else:
            make = pymupdf.annot_skel["goto2"]  # annot_goto_n
            annot = make(pymupdf.get_pdf_str(lnk["to"]), rect)

    elif kind == pymupdf.LINK_GOTOR:
        if lnk["page"] >= 0:
            make = pymupdf.annot_skel["gotor1"]  # annot_gotor
            target = lnk.get("to", pymupdf.Point(0, 0))  # destination point
            if type(target) is not pymupdf.Point:
                target = pymupdf.Point(0, 0)
            annot = make(
                lnk["page"],
                target.x,
                target.y,
                lnk.get("zoom", 0),
                lnk["file"],
                lnk["file"],
                rect,
            )
        else:
            make = pymupdf.annot_skel["gotor2"]  # annot_gotor_n
            annot = make(pymupdf.get_pdf_str(lnk["to"]), lnk["file"], rect)

    elif kind == pymupdf.LINK_LAUNCH:
        make = pymupdf.annot_skel["launch"]  # annot_launch
        annot = make(lnk["file"], lnk["file"], rect)

    elif kind == pymupdf.LINK_URI:
        make = pymupdf.annot_skel["uri"]  # annot_uri
        annot = make(lnk["uri"], rect)

    elif kind == pymupdf.LINK_NAMED:
        make = pymupdf.annot_skel["named"]  # annot_named
        lname = lnk.get("name")  # check presence of key
        if lname is None:  # if missing, fall back to alternative
            lname = lnk["nameddest"]
        annot = make(lname, rect)

    if not annot:
        return annot

    # add a /NM PDF key to the object definition
    link_names = {  # xref -> id of the link annotations already on the page
        item[0]: item[2]
        for item in page.annot_xrefs()
        if item[1] == pymupdf.PDF_ANNOT_LINK  # pylint: disable=no-member
    }

    old_name = lnk.get("id", "")  # id value in the argument
    if old_name and (lnk["xref"], old_name) in link_names.items():
        name = old_name  # no new name if this is an update only
    else:
        # first "<stem>-L<n>" that is not used on the page yet
        stem = pymupdf.TOOLS.set_annot_stem() + "-L%i"
        counter = 0
        name = stem % counter
        while name in link_names.values():
            counter += 1
            name = stem % counter

    # add /NM key to object definition
    return annot.replace("/Link", "/Link/NM(%s)" % name)
def delete_widget(page: pymupdf.Page, widget: pymupdf.Widget) -> pymupdf.Widget:
    """Remove a widget from the page and hand back its successor.

    Args:
        page: the page owning the widget.
        widget: the widget to remove; it must carry an ``_annot`` attribute.

    Returns:
        The value of ``widget.next`` as read before the deletion.

    Raises:
        ValueError: if ``widget`` has no ``_annot`` attribute (or it is None).
    """
    pymupdf.CheckParent(page)
    widget_annot = getattr(widget, "_annot", None)
    if widget_annot is None:
        raise ValueError("bad type: widget")

    successor = widget.next  # read before the widget is stripped below
    page.delete_annot(widget_annot)
    widget._annot.parent = None

    # remove every instance attribute so the stale object holds no state
    for attr_name in tuple(widget.__dict__):
        del widget.__dict__[attr_name]
    return successor
def update_link(page: pymupdf.Page, lnk: dict) -> None:
    """Rewrite an existing link of the page from a link dictionary.

    The object with xref ``lnk["xref"]`` is replaced by the annotation text
    generated from ``lnk``.

    Raises:
        ValueError: if no annotation text could be generated for the link.
    """
    pymupdf.CheckParent(page)
    link_text = getLinkText(page, lnk)
    if link_text == "":
        raise ValueError("link kind not supported")
    document = page.parent
    document.update_object(lnk["xref"], link_text, page=page)
def insert_link(page: pymupdf.Page, lnk: dict, mark: bool = True) -> None:
    """Add a new link to the page from a link dictionary.

    Args:
        page: the page receiving the link.
        lnk: link dictionary describing the new link.
        mark: not used by this function.

    Raises:
        ValueError: if no annotation text could be generated for the link.
    """
    pymupdf.CheckParent(page)
    link_text = getLinkText(page, lnk)
    if link_text == "":
        raise ValueError("link kind not supported")
    new_annots = (link_text,)  # the page method expects a tuple of texts
    page._addAnnot_FromString(new_annots)
def insert_textbox(
    page: pymupdf.Page,
    rect: rect_like,
    buffer: typing.Union[str, list],
    *,
    fontname: str = "helv",
    fontfile: OptStr = None,
    set_simple: int = 0,
    encoding: int = 0,
    fontsize: float = 11,
    lineheight: OptFloat = None,
    color: OptSeq = None,
    fill: OptSeq = None,
    expandtabs: int = 1,
    align: int = 0,
    rotate: int = 0,
    render_mode: int = 0,
    miter_limit: float = 1,
    border_width: float = 0.05,
    morph: OptSeq = None,
    overlay: bool = True,
    stroke_opacity: float = 1,
    fill_opacity: float = 1,
    oc: int = 0,
) -> float:
    """Write text into a rectangle of the page.

    Notes:
        A temporary Shape is created, its method of the same name does the
        work, and the shape is committed only if that method returned a
        non-negative value.

    Parameters:
        rect: (rect-like) area to use for text.
        buffer: text to be inserted
        fontname: a Base-14 font, font name or '/name'
        fontfile: name of a font file
        fontsize: font size
        lineheight: overwrite the font property
        color: RGB color triple
        expandtabs: handles tabulators with string function
        align: left, center, right, justified
        rotate: 0, 90, 180, or 270 degrees
        morph: morph box with a matrix and a fixpoint
        overlay: put text in foreground or background

    Returns:
        unused or deficit rectangle area (float)
    """
    # everything except rect, buffer and overlay is passed straight through
    text_options = dict(
        fontsize=fontsize,
        lineheight=lineheight,
        fontname=fontname,
        fontfile=fontfile,
        set_simple=set_simple,
        encoding=encoding,
        color=color,
        fill=fill,
        expandtabs=expandtabs,
        render_mode=render_mode,
        miter_limit=miter_limit,
        border_width=border_width,
        align=align,
        rotate=rotate,
        morph=morph,
        stroke_opacity=stroke_opacity,
        fill_opacity=fill_opacity,
        oc=oc,
    )
    shape = page.new_shape()
    result = shape.insert_textbox(rect, buffer, **text_options)
    if result >= 0:  # only a non-negative result is written to the page
        shape.commit(overlay)
    return result
def insert_text(
    page: pymupdf.Page,
    point: point_like,
    text: typing.Union[str, list],
    *,
    fontsize: float = 11,
    lineheight: OptFloat = None,
    fontname: str = "helv",
    fontfile: OptStr = None,
    set_simple: int = 0,
    encoding: int = 0,
    color: OptSeq = None,
    fill: OptSeq = None,
    border_width: float = 0.05,
    miter_limit: float = 1,
    render_mode: int = 0,
    rotate: int = 0,
    morph: OptSeq = None,
    overlay: bool = True,
    stroke_opacity: float = 1,
    fill_opacity: float = 1,
    oc: int = 0,
):
    """Write text onto the page starting at a given point.

    Notes:
        A temporary Shape is created, its method of the same name does the
        work, and the shape is committed only if that method returned a
        non-negative value.

    Returns:
        Whatever Shape.insert_text returned.
    """
    # everything except point, text and overlay is passed straight through
    text_options = dict(
        fontsize=fontsize,
        lineheight=lineheight,
        fontname=fontname,
        fontfile=fontfile,
        set_simple=set_simple,
        encoding=encoding,
        color=color,
        fill=fill,
        border_width=border_width,
        render_mode=render_mode,
        miter_limit=miter_limit,
        rotate=rotate,
        morph=morph,
        stroke_opacity=stroke_opacity,
        fill_opacity=fill_opacity,
        oc=oc,
    )
    shape = page.new_shape()
    result = shape.insert_text(point, text, **text_options)
    if result >= 0:  # only a non-negative result is written to the page
        shape.commit(overlay)
    return result
def insert_htmlbox(
    page,
    rect,
    text,
    *,
    css=None,
    scale_low=0,
    archive=None,
    rotate=0,
    oc=0,
    opacity=1,
    overlay=True,
) -> tuple:
    """Insert text with optional HTML tags and stylings into a rectangle.

    Args:
        rect: (rect-like) rectangle into which the text should be placed.
        text: (str or Story) text with optional HTML tags and stylings, or a
            ready-made Story. For a Story, 'css' and 'archive' are not used.
        css: (str) CSS styling commands.
        scale_low: (float) force-fit content by scaling it down. Must be in
            range [0, 1]. If 1, no scaling will take place. If 0, arbitrary
            down-scaling is acceptable. A value of 0.1 would mean that content
            may be scaled down by at most 90%.
        archive: Archive object pointing to locations of used fonts or images
        rotate: (int) rotate the text in the box by a multiple of 90 degrees.
        oc: (int) the xref of an OCG / OCMD (Optional Content).
        opacity: (float) set opacity of inserted content.
        overlay: (bool) put text on top of page content.

    Returns:
        A tuple of floats (spare_height, scale).
        spare_height: -1 if content did not fit, else >= 0. It is the height
            of the unused (still available) rectangle stripe. Positive only
            if no down-scaling happened (scale is 1).
        scale: downscaling factor, 0 < scale <= 1. If the content did not fit
            (spare_height = -1), the given 'scale_low' is returned here.

    Raises:
        ValueError: if 'rotate' is no multiple of 90, 'scale_low' is outside
            [0, 1], or 'text' is neither a string nor a Story.
    """
    # normalize rotation angle to one of 0, 90, 180, 270
    if not rotate % 90 == 0:
        raise ValueError("bad rotation angle")
    rotate %= 360

    if not 0 <= scale_low <= 1:
        raise ValueError("'scale_low' must be in [0, 1]")

    if css is None:
        css = ""

    rect = pymupdf.Rect(rect)
    # the story is laid out unrotated: swap the sides for quarter turns
    if rotate in (90, 270):
        temp_rect = pymupdf.Rect(0, 0, rect.height, rect.width)
    else:
        temp_rect = pymupdf.Rect(0, 0, rect.width, rect.height)

    # use a small border by default
    mycss = "body {margin:1px;}" + css  # append user CSS

    # either make a story, or accept a given one
    if isinstance(text, str):  # if a string, convert to a Story
        story = pymupdf.Story(html=text, user_css=mycss, archive=archive)
    elif isinstance(text, pymupdf.Story):
        story = text
    else:
        raise ValueError("'text' must be a string or a Story")

    # ----------------------------------------------------------------
    # Find a scaling factor that lets our story fit in
    # ----------------------------------------------------------------
    scale_max = None if scale_low == 0 else 1 / scale_low
    fit = story.fit_scale(temp_rect, scale_min=1, scale_max=scale_max)
    if not fit.big_enough:  # there was no fit
        return (-1, scale_low)

    filled = fit.filled
    scale = 1 / fit.parameter  # shrink factor
    spare_height = fit.rect.y1 - filled[3]  # unused room at rectangle bottom

    # Note: due to MuPDF's logic this may be negative even for successful fits.
    if scale != 1 or spare_height < 0:  # if scaling occurred, set spare_height to 0
        spare_height = 0

    def rect_function(*args):
        # every position of the story is written into the fitted rectangle
        return fit.rect, fit.rect, pymupdf.Identity

    # draw story on temp PDF page
    doc = story.write_with_links(rect_function)

    # Insert opacity if requested.
    # For this, we prepend a command to the /Contents.
    if 0 <= opacity < 1:
        tpage = doc[0]  # load page
        # generate /ExtGstate for the page
        alp0 = tpage._set_opacity(CA=opacity, ca=opacity)
        s = f"/{alp0} gs\n"  # generate graphic state command
        pymupdf.TOOLS._insert_contents(tpage, s.encode(), 0)

    # put result in target page
    page.show_pdf_page(rect, doc, 0, rotate=rotate, oc=oc, overlay=overlay)

    # -------------------------------------------------------------------------
    # re-insert links in target rect (show_pdf_page cannot copy annotations)
    # -------------------------------------------------------------------------
    # scaled center point of fit.rect
    mp1 = (fit.rect.tl + fit.rect.br) / 2 * scale

    # center point of target rect
    mp2 = (rect.tl + rect.br) / 2

    # compute link positioning matrix:
    # - move center of scaled-down fit.rect to (0,0)
    # - rotate
    # - move (0,0) to center of target rect
    mat = (
        pymupdf.Matrix(scale, 0, 0, scale, -mp1.x, -mp1.y)
        * pymupdf.Matrix(-rotate)
        * pymupdf.Matrix(1, 0, 0, 1, mp2.x, mp2.y)
    )

    # copy over links
    for link in doc[0].get_links():
        link["from"] *= mat
        page.insert_link(link)

    return spare_height, scale
def new_page(
    doc: pymupdf.Document,
    pno: int = -1,
    width: float = 595,
    height: float = 842,
) -> pymupdf.Page:
    """Add a new page to the document and return it.

    Args:
        pno: (int) insert before this page. Default: after last page.
        width: (float) page width in points. Default: 595 (ISO A4 width).
        height: (float) page height in points. Default 842 (ISO A4 height).

    Returns:
        The page found at index ``pno`` after the insertion.
    """
    doc._newPage(pno, width=width, height=height)
    created_page = doc[pno]
    return created_page
def insert_page(
    doc: pymupdf.Document,
    pno: int,
    text: typing.Union[str, list, None] = None,
    fontsize: float = 11,
    width: float = 595,
    height: float = 842,
    fontname: str = "helv",
    fontfile: OptStr = None,
    color: OptSeq = (0,),
) -> int:
    """Add a new page and optionally put some text on it.

    Notes:
        Combines the document's new_page() with the page's insert_text();
        the text starts at point (50, 72). For parameter details see these
        methods.

    Returns:
        0 if no text was given, otherwise the result of insert_text().
    """
    page = doc.new_page(pno=pno, width=width, height=height)
    if not text:  # None or empty: the blank page is all that is wanted
        return 0
    start_point = (50, 72)
    return page.insert_text(
        start_point,
        text,
        fontsize=fontsize,
        fontname=fontname,
        fontfile=fontfile,
        color=color,
    )
def draw_line(
    page: pymupdf.Page,
    p1: point_like,
    p2: point_like,
    color: OptSeq = (0,),
    dashes: OptStr = None,
    width: float = 1,
    lineCap: int = 0,
    lineJoin: int = 0,
    overlay: bool = True,
    morph: OptSeq = None,
    stroke_opacity: float = 1,
    fill_opacity: float = 1,
    oc=0,
) -> pymupdf.Point:
    """Draw a straight line on the page from p1 to p2.

    A temporary Shape draws the line, receives the drawing properties via
    finish() and is committed to the page.

    Returns:
        The result of Shape.draw_line.
    """
    properties = dict(
        color=color,
        dashes=dashes,
        width=width,
        closePath=False,
        lineCap=lineCap,
        lineJoin=lineJoin,
        morph=morph,
        stroke_opacity=stroke_opacity,
        fill_opacity=fill_opacity,
        oc=oc,
    )
    shape = page.new_shape()
    end_point = shape.draw_line(pymupdf.Point(p1), pymupdf.Point(p2))
    shape.finish(**properties)
    shape.commit(overlay)
    return end_point
def draw_squiggle(
    page: pymupdf.Page,
    p1: point_like,
    p2: point_like,
    breadth: float = 2,
    color: OptSeq = (0,),
    dashes: OptStr = None,
    width: float = 1,
    lineCap: int = 0,
    lineJoin: int = 0,
    overlay: bool = True,
    morph: OptSeq = None,
    stroke_opacity: float = 1,
    fill_opacity: float = 1,
    oc: int = 0,
) -> pymupdf.Point:
    """Draw a squiggly (wavy) line on the page from p1 to p2.

    A temporary Shape draws the line, receives the drawing properties via
    finish() and is committed to the page.

    Returns:
        The result of Shape.draw_squiggle.
    """
    properties = dict(
        color=color,
        dashes=dashes,
        width=width,
        closePath=False,
        lineCap=lineCap,
        lineJoin=lineJoin,
        morph=morph,
        stroke_opacity=stroke_opacity,
        fill_opacity=fill_opacity,
        oc=oc,
    )
    shape = page.new_shape()
    end_point = shape.draw_squiggle(
        pymupdf.Point(p1), pymupdf.Point(p2), breadth=breadth
    )
    shape.finish(**properties)
    shape.commit(overlay)
    return end_point
def draw_zigzag(
    page: pymupdf.Page,
    p1: point_like,
    p2: point_like,
    breadth: float = 2,
    color: OptSeq = (0,),
    dashes: OptStr = None,
    width: float = 1,
    lineCap: int = 0,
    lineJoin: int = 0,
    overlay: bool = True,
    morph: OptSeq = None,
    stroke_opacity: float = 1,
    fill_opacity: float = 1,
    oc: int = 0,
) -> pymupdf.Point:
    """Draw a zig-zag line on the page from p1 to p2.

    A temporary Shape draws the line, receives the drawing properties via
    finish() and is committed to the page.

    Returns:
        The result of Shape.draw_zigzag.
    """
    properties = dict(
        color=color,
        dashes=dashes,
        width=width,
        closePath=False,
        lineCap=lineCap,
        lineJoin=lineJoin,
        morph=morph,
        stroke_opacity=stroke_opacity,
        fill_opacity=fill_opacity,
        oc=oc,
    )
    shape = page.new_shape()
    end_point = shape.draw_zigzag(
        pymupdf.Point(p1), pymupdf.Point(p2), breadth=breadth
    )
    shape.finish(**properties)
    shape.commit(overlay)
    return end_point
def draw_rect(
    page: pymupdf.Page,
    rect: rect_like,
    color: OptSeq = (0,),
    fill: OptSeq = None,
    dashes: OptStr = None,
    width: float = 1,
    lineCap: int = 0,
    lineJoin: int = 0,
    morph: OptSeq = None,
    overlay: bool = True,
    stroke_opacity: float = 1,
    fill_opacity: float = 1,
    oc: int = 0,
    radius=None,
) -> pymupdf.Point:
    """Draw a rectangle on the page, optionally with rounded corners.

    A temporary Shape draws the rectangle, receives the drawing properties
    via finish() and is committed to the page. See the Shape class method
    for details, including the meaning of 'radius'.

    Returns:
        The result of Shape.draw_rect.
    """
    properties = dict(
        color=color,
        fill=fill,
        dashes=dashes,
        width=width,
        lineCap=lineCap,
        lineJoin=lineJoin,
        morph=morph,
        stroke_opacity=stroke_opacity,
        fill_opacity=fill_opacity,
        oc=oc,
    )
    shape = page.new_shape()
    end_point = shape.draw_rect(pymupdf.Rect(rect), radius=radius)
    shape.finish(**properties)
    shape.commit(overlay)
    return end_point
def draw_quad(
    page: pymupdf.Page,
    quad: quad_like,
    color: OptSeq = (0,),
    fill: OptSeq = None,
    dashes: OptStr = None,
    width: float = 1,
    lineCap: int = 0,
    lineJoin: int = 0,
    morph: OptSeq = None,
    overlay: bool = True,
    stroke_opacity: float = 1,
    fill_opacity: float = 1,
    oc: int = 0,
) -> pymupdf.Point:
    """Draw a quadrilateral on the page.

    A temporary Shape draws the quad, receives the drawing properties via
    finish() and is committed to the page.

    Returns:
        The result of Shape.draw_quad.
    """
    properties = dict(
        color=color,
        fill=fill,
        dashes=dashes,
        width=width,
        lineCap=lineCap,
        lineJoin=lineJoin,
        morph=morph,
        stroke_opacity=stroke_opacity,
        fill_opacity=fill_opacity,
        oc=oc,
    )
    shape = page.new_shape()
    end_point = shape.draw_quad(pymupdf.Quad(quad))
    shape.finish(**properties)
    shape.commit(overlay)
    return end_point
def draw_polyline(
    page: pymupdf.Page,
    points: list,
    color: OptSeq = (0,),
    fill: OptSeq = None,
    dashes: OptStr = None,
    width: float = 1,
    morph: OptSeq = None,
    lineCap: int = 0,
    lineJoin: int = 0,
    overlay: bool = True,
    closePath: bool = False,
    stroke_opacity: float = 1,
    fill_opacity: float = 1,
    oc: int = 0,
) -> pymupdf.Point:
    """Draw a chain of connected line segments on the page.

    A temporary Shape draws the segments, receives the drawing properties
    via finish() and is committed to the page.

    Returns:
        The result of Shape.draw_polyline.
    """
    properties = dict(
        color=color,
        fill=fill,
        dashes=dashes,
        width=width,
        lineCap=lineCap,
        lineJoin=lineJoin,
        morph=morph,
        closePath=closePath,
        stroke_opacity=stroke_opacity,
        fill_opacity=fill_opacity,
        oc=oc,
    )
    shape = page.new_shape()
    end_point = shape.draw_polyline(points)
    shape.finish(**properties)
    shape.commit(overlay)
    return end_point
def draw_circle(
    page: pymupdf.Page,
    center: point_like,
    radius: float,
    color: OptSeq = (0,),
    fill: OptSeq = None,
    morph: OptSeq = None,
    dashes: OptStr = None,
    width: float = 1,
    lineCap: int = 0,
    lineJoin: int = 0,
    overlay: bool = True,
    stroke_opacity: float = 1,
    fill_opacity: float = 1,
    oc: int = 0,
) -> pymupdf.Point:
    """Draw a circle on the page from its center and radius.

    A temporary Shape draws the circle, receives the drawing properties via
    finish() and is committed to the page.

    Returns:
        The result of Shape.draw_circle.
    """
    properties = dict(
        color=color,
        fill=fill,
        dashes=dashes,
        width=width,
        lineCap=lineCap,
        lineJoin=lineJoin,
        morph=morph,
        stroke_opacity=stroke_opacity,
        fill_opacity=fill_opacity,
        oc=oc,
    )
    shape = page.new_shape()
    end_point = shape.draw_circle(pymupdf.Point(center), radius)
    shape.finish(**properties)
    shape.commit(overlay)
    return end_point
def draw_oval(
    page: pymupdf.Page,
    rect: typing.Union[rect_like, quad_like],
    color: OptSeq = (0,),
    fill: OptSeq = None,
    dashes: OptStr = None,
    morph: OptSeq = None,
    width: float = 1,
    lineCap: int = 0,
    lineJoin: int = 0,
    overlay: bool = True,
    stroke_opacity: float = 1,
    fill_opacity: float = 1,
    oc: int = 0,
) -> pymupdf.Point:
    """Draw an oval on the page inside the given rectangle or quad.

    A temporary Shape draws the oval, receives the drawing properties via
    finish() and is committed to the page. 'rect' is handed to the Shape
    method unchanged.

    Returns:
        The result of Shape.draw_oval.
    """
    properties = dict(
        color=color,
        fill=fill,
        dashes=dashes,
        width=width,
        lineCap=lineCap,
        lineJoin=lineJoin,
        morph=morph,
        stroke_opacity=stroke_opacity,
        fill_opacity=fill_opacity,
        oc=oc,
    )
    shape = page.new_shape()
    end_point = shape.draw_oval(rect)
    shape.finish(**properties)
    shape.commit(overlay)
    return end_point
def draw_curve(
    page: pymupdf.Page,
    p1: point_like,
    p2: point_like,
    p3: point_like,
    color: OptSeq = (0,),
    fill: OptSeq = None,
    dashes: OptStr = None,
    width: float = 1,
    morph: OptSeq = None,
    closePath: bool = False,
    lineCap: int = 0,
    lineJoin: int = 0,
    overlay: bool = True,
    stroke_opacity: float = 1,
    fill_opacity: float = 1,
    oc: int = 0,
) -> pymupdf.Point:
    """Draw a special Bezier curve on the page from p1 to p3.

    The control points are generated on the lines p1 to p2 and p2 to p3.
    A temporary Shape draws the curve, receives the drawing properties via
    finish() and is committed to the page.

    Returns:
        The result of Shape.draw_curve.
    """
    properties = dict(
        color=color,
        fill=fill,
        dashes=dashes,
        width=width,
        lineCap=lineCap,
        lineJoin=lineJoin,
        morph=morph,
        closePath=closePath,
        stroke_opacity=stroke_opacity,
        fill_opacity=fill_opacity,
        oc=oc,
    )
    shape = page.new_shape()
    end_point = shape.draw_curve(
        pymupdf.Point(p1), pymupdf.Point(p2), pymupdf.Point(p3)
    )
    shape.finish(**properties)
    shape.commit(overlay)
    return end_point
def draw_bezier(
    page: pymupdf.Page,
    p1: point_like,
    p2: point_like,
    p3: point_like,
    p4: point_like,
    color: OptSeq = (0,),
    fill: OptSeq = None,
    dashes: OptStr = None,
    width: float = 1,
    morph: OptSeq = None,
    closePath: bool = False,
    lineCap: int = 0,
    lineJoin: int = 0,
    overlay: bool = True,
    stroke_opacity: float = 1,
    fill_opacity: float = 1,
    oc: int = 0,
) -> pymupdf.Point:
    """Draw a general cubic Bezier curve from p1 to p4 using control points p2 and p3.

    A temporary Shape draws the curve, receives the drawing properties via
    finish() and is committed to the page.

    Args:
        p1, p4: start and end point of the curve.
        p2, p3: the two control points.
        morph: passed on to Shape.finish(); annotated as a sequence like in
            the other draw_* functions of this module.
        overlay: passed to Shape.commit().
        Remaining arguments are passed on to Shape.finish() unchanged.

    Returns:
        The result of Shape.draw_bezier.
    """
    img = page.new_shape()
    Q = img.draw_bezier(
        pymupdf.Point(p1), pymupdf.Point(p2), pymupdf.Point(p3), pymupdf.Point(p4)
    )
    img.finish(
        color=color,
        fill=fill,
        dashes=dashes,
        width=width,
        lineCap=lineCap,
        lineJoin=lineJoin,
        morph=morph,
        closePath=closePath,
        stroke_opacity=stroke_opacity,
        fill_opacity=fill_opacity,
        oc=oc,
    )
    img.commit(overlay)
    return Q
def draw_sector(
    page: pymupdf.Page,
    center: point_like,
    point: point_like,
    beta: float,
    color: OptSeq = (0,),
    fill: OptSeq = None,
    dashes: OptStr = None,
    fullSector: bool = True,
    morph: OptSeq = None,
    width: float = 1,
    closePath: bool = False,
    lineCap: int = 0,
    lineJoin: int = 0,
    overlay: bool = True,
    stroke_opacity: float = 1,
    fill_opacity: float = 1,
    oc: int = 0,
) -> pymupdf.Point:
    """Draw a circle sector on the page.

    The sector is defined by the circle center, one end point of the arc
    and the angle of the arc. A temporary Shape draws it, receives the
    drawing properties via finish() and is committed to the page.

    Parameters:
        center -- center of circle
        point -- arc end point
        beta -- angle of arc (degrees)
        fullSector -- connect arc ends with center

    Returns:
        The result of Shape.draw_sector.
    """
    properties = dict(
        color=color,
        fill=fill,
        dashes=dashes,
        width=width,
        lineCap=lineCap,
        lineJoin=lineJoin,
        morph=morph,
        closePath=closePath,
        stroke_opacity=stroke_opacity,
        fill_opacity=fill_opacity,
        oc=oc,
    )
    shape = page.new_shape()
    end_point = shape.draw_sector(
        pymupdf.Point(center), pymupdf.Point(point), beta, fullSector=fullSector
    )
    shape.finish(**properties)
    shape.commit(overlay)
    return end_point
+# ----------------------------------------------------------------------
+# Name:        wx.lib.colourdb.py
+# Purpose:     Adds a bunch of colour names and RGB values to the
+#              colour database so they can be found by name
+#
+# Author:      Robin Dunn
+#
+# Created:     13-March-2001
+# Copyright:   (c) 2001-2017 by Total Control Software
+# Licence:     wxWindows license
+# Tags:        phoenix-port, unittest, documented
+# ----------------------------------------------------------------------
def getColorList() -> list:
    """
    Return the names of all colours in the colour database.

    The names are the first items of the 4-tuples delivered by
    pymupdf.colors_wx_list().

    :rtype: list of strings
    """
    return [
        colour_name
        for colour_name, _red, _green, _blue in pymupdf.colors_wx_list()
    ]
def getColorInfoList() -> list:
    """
    Return the colour database as a list of (name, red, green, blue) tuples.

    name: upper-case color name.
    red, green, blue: integers in range 0..255.

    :rtype: list of tuples
    """
    colour_table = pymupdf.colors_wx_list()
    return colour_table
def getColor(name: str) -> tuple:
    """Look up a color by name and return it in PDF format.

    The lookup uses the lower-cased name.

    Returns:
        a triple of floats in range 0 to 1. In case of name-not-found, "white" is returned.
    """
    white = (1, 1, 1)  # fallback for unknown names
    pdf_colors = pymupdf.colors_pdf_dict()
    return pdf_colors.get(name.lower(), white)
def getColorHSV(name: str) -> tuple:
    """Look up a color by name and return it as hue, saturation, value.

    The lookup uses the upper-cased name.

    Returns:
        a triple (degree, percent, percent). If not found (-1, -1, -1) is returned.
    """
    try:
        position = getColorList().index(name.upper())
        entry = getColorInfoList()[position]
    except Exception:
        if g_exceptions_verbose:
            pymupdf.exception_info()
        return (-1, -1, -1)

    # scale the 0..255 components down to 0..1
    red = entry[1] / 255.0
    green = entry[2] / 255.0
    blue = entry[3] / 255.0

    strongest = max(red, green, blue)
    weakest = min(red, green, blue)
    spread = strongest - weakest

    # hue depends on which component dominates
    if spread == 0:
        hue = 0
    elif strongest == red:
        hue = 60.0 * (((green - blue) / spread) % 6)
    elif strongest == green:
        hue = 60.0 * (((blue - red) / spread) + 2)
    else:
        hue = 60.0 * (((red - green) / spread) + 4)

    saturation = 0 if strongest == 0 else spread / strongest

    H = int(round(hue))
    S = int(round(saturation * 100))
    V = round(strongest * 100, 1)
    return (H, S, V)
def _get_font_properties(doc: pymupdf.Document, xref: int) -> tuple:
    """Return (fontname, ext, stype, ascender, descender) for a font xref.

    Name, extension, type and font buffer come from doc.extract_font().
    Ascender / descender start at 0.8 / -0.2 and are replaced by the values
    of a pymupdf.Font if one can be created from the buffer or, without a
    buffer, from the font name. Wherever no font values are obtained the
    current values are multiplied by 1.2.
    """
    fontname, ext, stype, buffer = doc.extract_font(xref)
    ascender, descender = 0.8, -0.2

    if ext == "":  # empty extension: return the defaults untouched
        return fontname, ext, stype, ascender, descender

    if buffer:  # a font file is embedded: take the values from there
        try:
            font = pymupdf.Font(fontbuffer=buffer)
            ascender = font.ascender
            descender = font.descender
            bbox = font.bbox
            if ascender - descender < 1:
                # total height below 1: extend the descender to the bbox
                # bottom if that is lower, then derive the ascender
                if bbox.y0 < descender:
                    descender = bbox.y0
                ascender = 1 - descender
        except Exception:
            pymupdf.exception_info()
            ascender *= 1.2
            descender *= 1.2
        return fontname, ext, stype, ascender, descender

    if ext == "n/a":
        ascender *= 1.2
        descender *= 1.2
    else:  # no buffer: try to create the font from its name
        try:
            font = pymupdf.Font(fontname)
            ascender = font.ascender
            descender = font.descender
        except Exception:
            pymupdf.exception_info()
            ascender *= 1.2
            descender *= 1.2
    return fontname, ext, stype, ascender, descender
def get_char_widths(
    doc: pymupdf.Document, xref: int, limit: int = 256, idx: int = 0, fontdict: OptDict = None
) -> list:
    """Return the glyph information list of a font.

    Notes:
        The font is identified by its XREF number. A font seen before is
        found via pymupdf.CheckFontInfo(); otherwise an entry for it is
        appended to doc.FontInfos.
        The result is the list of glyphs of the font, items being
        (glyph, width): glyph is an integer controlling the char
        appearance, width is a float controlling the char's spacing:
        width * fontsize is the actual space.
        For 'simple' fonts, glyph == ord(char) will usually be true.
        Exceptions are 'Symbol' and 'ZapfDingbats'. We are providing data for these directly here.

    Raises:
        ValueError: if the font extension is the empty string.
    """
    fontinfo = pymupdf.CheckFontInfo(doc, xref)

    if fontinfo is not None:  # font already recorded
        fontdict = fontinfo[1]
        glyphs = fontdict["glyphs"]
        simple = fontdict["simple"]
        ordering = fontdict["ordering"]
    else:  # not recorded yet: create the entry
        if fontdict is None:
            name, ext, stype, asc, dsc = _get_font_properties(doc, xref)
            fontdict = {
                "name": name,
                "type": stype,
                "ext": ext,
                "ascender": asc,
                "descender": dsc,
            }
        else:
            name = fontdict["name"]
            ext = fontdict["ext"]
            stype = fontdict["type"]
            # NOTE(review): these two values are read (so the keys must
            # exist) but are overwritten a few lines below - confirm intent
            ordering = fontdict["ordering"]
            simple = fontdict["simple"]

        if ext == "":
            raise ValueError("xref is not a font")

        # check for 'simple' fonts
        simple = stype in ("Type1", "MMType1", "TrueType")

        # check for CJK fonts
        if name in ("Fangti", "Ming"):
            ordering = 0
        elif name in ("Heiti", "Song"):
            ordering = 1
        elif name in ("Gothic", "Mincho"):
            ordering = 2
        elif name in ("Dotum", "Batang"):
            ordering = 3
        else:
            ordering = -1

        fontdict["simple"] = simple

        # glyph tables for these two fonts are supplied directly
        if name == "ZapfDingbats":
            glyphs = pymupdf.zapf_glyphs
        elif name == "Symbol":
            glyphs = pymupdf.symbol_glyphs
        else:
            glyphs = None

        fontdict["glyphs"] = glyphs
        fontdict["ordering"] = ordering

        fontinfo = [xref, fontdict]
        doc.FontInfos.append(fontinfo)

    known_count = 0 if glyphs is None else len(glyphs)
    wanted_count = max(256, limit)  # never fewer than 256 entries

    if wanted_count <= known_count:  # enough glyphs are available already
        return glyphs

    if ordering < 0:  # not a CJK font
        glyphs = doc._get_char_widths(
            xref, fontdict["name"], fontdict["ext"], fontdict["ordering"], wanted_count, idx
        )
    else:  # CJK fonts use char codes and width = 1
        glyphs = None

    # store the result in the font's record
    fontdict["glyphs"] = glyphs
    fontinfo[1] = fontdict
    pymupdf.UpdateFontInfo(doc, fontinfo)

    return glyphs
+class Shape:
+"""Create a new shape."""
@staticmethod
def horizontal_angle(C, P):
    """Return the angle between the horizontal and the connection C -> P.

    The angle magnitude comes from the arcus sine of the unit vector's y
    component; the ambiguity of that function is resolved by looking at the
    signs of the components of the unit vector of P - C.
    """
    direction = pymupdf.Point(P - C).unit  # unit vector 'C' -> 'P'
    angle = math.asin(abs(direction.y))  # absolute angle from horizontal
    points_left = direction.x < 0

    if points_left and direction.y <= 0:  # bottom-left
        return -(math.pi - angle)
    if points_left:  # top-left
        return math.pi - angle
    if direction.y >= 0:  # top-right: arcsin result is already right
        return angle
    return -angle  # bottom-right
def __init__(self, page: pymupdf.Page):
    """Set up an empty shape for a page.

    Raises:
        ValueError: if the page's document is not a PDF.
    """
    pymupdf.CheckParent(page)
    self.page = page
    self.doc = page.parent
    if not self.doc.is_pdf:
        raise ValueError("is no PDF")

    # page geometry
    mediabox_size = page.mediabox_size
    self.height = mediabox_size.y
    self.width = mediabox_size.x
    cropbox_position = page.cropbox_position
    self.x = cropbox_position.x
    self.y = cropbox_position.y

    self.pctm = page.transformation_matrix  # page transf. matrix
    self.ipctm = ~self.pctm  # inverted transf. matrix

    # command buffers start out empty
    self.draw_cont = ""
    self.text_cont = ""
    self.totalcont = ""

    self.last_point = None  # nothing drawn yet
    self.rect = None  # no area covered yet
def updateRect(self, x):
    """Enlarge self.rect so that it also covers x.

    An argument of length 2 is treated as a point, anything else as a
    rectangle. If self.rect is still None, it is created from x.
    """
    is_point = len(x) == 2

    if self.rect is None:  # first item: it defines the rectangle
        self.rect = pymupdf.Rect(x, x) if is_point else pymupdf.Rect(x)
        return

    # determine the extent of the new item ...
    if is_point:
        pt = pymupdf.Point(x)
        left, top, right, bottom = pt.x, pt.y, pt.x, pt.y
    else:
        box = pymupdf.Rect(x)
        left, top, right, bottom = box.x0, box.y0, box.x1, box.y1

    # ... and stretch the rectangle where the item sticks out
    self.rect.x0 = min(self.rect.x0, left)
    self.rect.y0 = min(self.rect.y0, top)
    self.rect.x1 = max(self.rect.x1, right)
    self.rect.y1 = max(self.rect.y1, bottom)
def draw_line(self, p1: point_like, p2: point_like) -> pymupdf.Point:
    """Append a line from p1 to p2 to the drawing commands.

    A move command is only issued if p1 differs from the shape's last
    point. Returns p2, which becomes the new last point.
    """
    start = pymupdf.Point(p1)
    end = pymupdf.Point(p2)

    if not (self.last_point == start):  # pen is elsewhere: move it first
        move_cmd = _format_g(pymupdf.JM_TUPLE(start * self.ipctm)) + " m\n"
        self.draw_cont += move_cmd
        self.last_point = start
        self.updateRect(start)

    line_cmd = _format_g(pymupdf.JM_TUPLE(end * self.ipctm)) + " l\n"
    self.draw_cont += line_cmd
    self.updateRect(end)
    self.last_point = end
    return self.last_point
def draw_polyline(self, points: list) -> pymupdf.Point:
    """Append several connected line segments to the drawing commands.

    The first point is moved to (unless it equals the shape's last point),
    all others are connected by lines. Returns the last of the points,
    which becomes the new last point.
    """
    for position, raw_point in enumerate(points):
        vertex = pymupdf.Point(raw_point)
        if position > 0:
            self.draw_cont += _format_g(pymupdf.JM_TUPLE(vertex * self.ipctm)) + " l\n"
        elif not (self.last_point == vertex):  # start point: move if needed
            self.draw_cont += _format_g(pymupdf.JM_TUPLE(vertex * self.ipctm)) + " m\n"
            self.last_point = vertex
        self.updateRect(raw_point)

    self.last_point = pymupdf.Point(points[-1])
    return self.last_point
def draw_bezier(
    self,
    p1: point_like,
    p2: point_like,
    p3: point_like,
    p4: point_like,
) -> pymupdf.Point:
    """Append a cubic Bezier curve from p1 to p4 to the drawing commands.

    p2 and p3 are the control points. A move command is only issued if p1
    differs from the shape's last point. Returns p4, which becomes the new
    last point.
    """
    start, ctrl_a, ctrl_b, end = (
        pymupdf.Point(item) for item in (p1, p2, p3, p4)
    )

    if not (self.last_point == start):
        self.draw_cont += _format_g(pymupdf.JM_TUPLE(start * self.ipctm)) + " m\n"

    # the curve operator takes both control points and the end point
    operands = []
    for pt in (ctrl_a, ctrl_b, end):
        operands.extend(pt * self.ipctm)
    self.draw_cont += _format_g(pymupdf.JM_TUPLE(operands)) + " c\n"

    for pt in (start, ctrl_a, ctrl_b, end):
        self.updateRect(pt)

    self.last_point = end
    return self.last_point
def draw_oval(self, tetra: typing.Union[quad_like, rect_like]) -> pymupdf.Point:
    """Append an ellipse inscribed in a rectangle or quad.

    An argument whose first item is a number is treated as a rectangle,
    otherwise as a quad. The ellipse consists of four curves connecting
    the side middles. Returns the middle of the left side, which becomes
    the new last point.

    Raises:
        ValueError: if the argument does not have length 4.
    """
    if len(tetra) != 4:
        raise ValueError("invalid arg length")

    if hasattr(tetra[0], "__float__"):
        quad = pymupdf.Rect(tetra).quad
    else:
        quad = pymupdf.Quad(tetra)

    # middle points of the four sides
    mid_top = quad.ul + (quad.ur - quad.ul) * 0.5
    mid_right = quad.ur + (quad.lr - quad.ur) * 0.5
    mid_bottom = quad.ll + (quad.lr - quad.ll) * 0.5
    mid_left = quad.ul + (quad.ll - quad.ul) * 0.5

    if not (self.last_point == mid_left):
        self.draw_cont += _format_g(pymupdf.JM_TUPLE(mid_left * self.ipctm)) + " m\n"
        self.last_point = mid_left

    # one curve per corner, each using the corner as helper point
    quarter_arcs = (
        (mid_left, quad.ll, mid_bottom),
        (mid_bottom, quad.lr, mid_right),
        (mid_right, quad.ur, mid_top),
        (mid_top, quad.ul, mid_left),
    )
    for arc_start, corner, arc_end in quarter_arcs:
        self.draw_curve(arc_start, corner, arc_end)

    self.updateRect(quad.rect)
    self.last_point = mid_left
    return self.last_point
def draw_circle(self, center: point_like, radius: float) -> pymupdf.Point:
    """Append a circle, drawn as a 360 degree sector without center lines.

    The arc starts at the point 'radius' units left of the center.

    Raises:
        ValueError: if radius is not greater than pymupdf.EPSILON.
    """
    if not radius > pymupdf.EPSILON:
        raise ValueError("radius must be positive")
    middle = pymupdf.Point(center)
    arc_start = middle - (radius, 0)
    return self.draw_sector(middle, arc_start, 360, fullSector=False)
def draw_curve(
    self,
    p1: point_like,
    p2: point_like,
    p3: point_like,
) -> pymupdf.Point:
    """Append a curve from p1 to p3 that uses p2 as its helper point.

    The two control points of the resulting cubic Bezier curve lie on the
    lines p1 -> p2 and p3 -> p2, each at the fraction 'kappa' of the way.
    Returns the result of draw_bezier().
    """
    kappa = 0.55228474983
    start = pymupdf.Point(p1)
    helper = pymupdf.Point(p2)
    end = pymupdf.Point(p3)
    control_1 = start + (helper - start) * kappa
    control_2 = end + (helper - end) * kappa
    return self.draw_bezier(start, control_1, control_2, end)
def draw_sector(
    self,
    center: point_like,
    point: point_like,
    beta: float,
    fullSector: bool = True,
) -> pymupdf.Point:
    """Draw a circle sector.

    Args:
        center: center of the circle.
        point: start point of the arc on the circle.
        beta: angle of the arc in degrees; its sign selects the direction.
            Magnitudes above 360 are reduced in steps of 360 degrees.
        fullSector: if true, additionally connect 'point' and the arc's end
            point with the center.

    Returns:
        The end point of the arc, which also becomes the new last point.
        If the angle is too small for any arc to be drawn, this is the
        placeholder Point(0, 0).

    Raises:
        ValueError: if the distance between center and point is not greater
            than pymupdf.EPSILON. The check is made before anything is
            appended to the drawing commands, so a failed call leaves the
            shape unchanged.
    """
    center = pymupdf.Point(center)
    point = pymupdf.Point(point)

    # validate first: no drawing command must be emitted for a bad request
    rad = abs(point - center)  # circle radius
    if not rad > pymupdf.EPSILON:
        raise ValueError("radius must be positive")

    def move_to(a, b):
        return _format_g((a, b)) + " m\n"

    def curve_to(a, b, c, d, e, f):
        return _format_g((a, b, c, d, e, f)) + " c\n"

    def line_to(a, b):
        return _format_g((a, b)) + " l\n"

    betar = math.radians(-beta)
    w360 = math.radians(math.copysign(360, betar)) * (-1)
    w90 = math.radians(math.copysign(90, betar))
    w45 = w90 / 2
    while abs(betar) > 2 * math.pi:
        betar += w360  # bring angle below 360 degrees

    if not (self.last_point == point):
        self.draw_cont += move_to(*pymupdf.JM_TUPLE(point * self.ipctm))
        self.last_point = point

    Q = pymupdf.Point(0, 0)  # just make sure it exists
    C = center
    P = point

    alfa = self.horizontal_angle(center, point)
    while abs(betar) > abs(w90):  # draw 90 degree arcs
        q1 = C.x + math.cos(alfa + w90) * rad
        q2 = C.y + math.sin(alfa + w90) * rad
        Q = pymupdf.Point(q1, q2)  # the arc's end point
        r1 = C.x + math.cos(alfa + w45) * rad / math.cos(w45)
        r2 = C.y + math.sin(alfa + w45) * rad / math.cos(w45)
        R = pymupdf.Point(r1, r2)  # crossing point of tangents
        kappah = (1 - math.cos(w45)) * 4 / 3 / abs(R - Q)
        kappa = kappah * abs(P - Q)
        cp1 = P + (R - P) * kappa  # control point 1
        cp2 = Q + (R - Q) * kappa  # control point 2
        self.draw_cont += curve_to(*pymupdf.JM_TUPLE(
            list(cp1 * self.ipctm) + list(cp2 * self.ipctm) + list(Q * self.ipctm)
        ))

        betar -= w90  # reduce param angle by 90 deg
        alfa += w90  # advance start angle by 90 deg
        P = Q  # advance to arc end point

    # draw (remaining) arc
    if abs(betar) > 1e-3:  # significant degrees left?
        beta2 = betar / 2
        q1 = C.x + math.cos(alfa + betar) * rad
        q2 = C.y + math.sin(alfa + betar) * rad
        Q = pymupdf.Point(q1, q2)  # the arc's end point
        r1 = C.x + math.cos(alfa + beta2) * rad / math.cos(beta2)
        r2 = C.y + math.sin(alfa + beta2) * rad / math.cos(beta2)
        R = pymupdf.Point(r1, r2)  # crossing point of tangents
        # kappa height is 4/3 of segment height
        kappah = (1 - math.cos(beta2)) * 4 / 3 / abs(R - Q)  # kappa height
        kappa = kappah * abs(P - Q) / (1 - math.cos(betar))
        cp1 = P + (R - P) * kappa  # control point 1
        cp2 = Q + (R - Q) * kappa  # control point 2
        self.draw_cont += curve_to(*pymupdf.JM_TUPLE(
            list(cp1 * self.ipctm) + list(cp2 * self.ipctm) + list(Q * self.ipctm)
        ))

    if fullSector:  # connect arc start -> center -> arc end
        self.draw_cont += move_to(*pymupdf.JM_TUPLE(point * self.ipctm))
        self.draw_cont += line_to(*pymupdf.JM_TUPLE(center * self.ipctm))
        self.draw_cont += line_to(*pymupdf.JM_TUPLE(Q * self.ipctm))

    self.last_point = Q
    return self.last_point
def draw_rect(self, rect: rect_like, *, radius=None) -> pymupdf.Point:
    """Append a rectangle, optionally with rounded corners.

    Args:
        rect: the rectangle.
        radius: if not None, the rectangle will have rounded corners.
            This is the radius of the curvature, given as percentage of
            the rectangle width or height. Valid are values 0 < v <= 0.5.
            For a sequence of two values, the corners will have different
            radii. Otherwise, the percentage will be computed from the
            shorter side. A value of (0.5, 0.5) will draw an ellipse.

    Returns:
        The new last point: the top-left corner for a standard rectangle,
        else the end point of the last corner curve.

    Raises:
        ValueError: if radius is neither a valid number nor a valid pair.
    """
    box = pymupdf.Rect(rect)

    if radius is None:  # standard rectangle: one "re" command does it
        operands = list(box.bl * self.ipctm) + [box.width, box.height]
        self.draw_cont += _format_g(pymupdf.JM_TUPLE(operands)) + " re\n"
        self.updateRect(box)
        self.last_point = box.tl
        return self.last_point

    # rounded corners requested. This requires 1 or 2 values, each
    # with 0 < value <= 0.5
    if hasattr(radius, "__float__"):  # one number for both directions
        if radius <= 0 or radius > 0.5:
            raise ValueError(f"bad radius value {radius}.")
        offset = min(box.width, box.height) * radius
        dx, dy = (offset, 0), (0, offset)
    elif hasattr(radius, "__len__") and len(radius) == 2:
        x_share, y_share = radius
        dx = (x_share * box.width, 0)
        dy = (0, y_share * box.height)
        if min(x_share, y_share) <= 0 or max(x_share, y_share) > 0.5:
            raise ValueError(f"bad radius value {radius}.")
    else:
        raise ValueError(f"bad radius value {radius}.")

    # walk around the rectangle: straight side, then corner curve
    pen = self.draw_line(box.tl + dy, box.bl - dy)
    pen = self.draw_curve(pen, box.bl, box.bl + dx)

    pen = self.draw_line(pen, box.br - dx)
    pen = self.draw_curve(pen, box.br, box.br - dy)

    pen = self.draw_line(pen, box.tr + dy)
    pen = self.draw_curve(pen, box.tr, box.tr - dx)

    pen = self.draw_line(pen, box.tl + dx)
    self.last_point = self.draw_curve(pen, box.tl, box.tl + dy)

    self.updateRect(box)
    return self.last_point
def draw_quad(self, quad: quad_like) -> pymupdf.Point:
    """Append a quadrilateral as a closed polyline along its corners.

    The corners are visited in the order ul, ll, lr, ur and back to ul.
    Returns the result of draw_polyline().
    """
    q = pymupdf.Quad(quad)
    outline = [q.ul, q.ll, q.lr, q.ur, q.ul]
    return self.draw_polyline(outline)
def draw_zigzag(
    self,
    p1: point_like,
    p2: point_like,
    breadth: float = 2,
) -> pymupdf.Point:
    """Draw a zig-zag line from p1 to p2.

    Args:
        p1: point-like start of the line.
        p2: point-like end of the line.
        breadth: requested size of one zig-zag step. It is revised so
            that a whole number of 4-step phases fits the distance
            between the two points.

    Returns:
        p2 as a pymupdf.Point.

    Raises:
        ValueError: if the points are too close for one full phase.
    """
    start = pymupdf.Point(p1)
    stop = pymupdf.Point(p2)
    distance = abs(stop - start)  # length of the connection

    # only whole phases (4 steps each) are drawn
    steps = 4 * int(round(distance / (4 * breadth), 0))
    if steps < 4:
        raise ValueError("points too close")
    unit = distance / steps  # revised breadth

    # The edges are computed for a line normalized to the x-axis; the
    # inverted matrix maps them back to the original position.
    back = ~pymupdf.Matrix(pymupdf.util_hor_matrix(start, stop))

    # Only odd steps carry an edge: step % 4 == 1 lies "above" the
    # connection, step % 4 == 3 lies "below" it.
    edges = [
        pymupdf.Point(step, -1 if step % 4 == 1 else 1) * unit * back
        for step in range(1, steps, 2)
    ]

    self.draw_polyline([start] + edges + [stop])  # add start and end points
    return stop
def draw_squiggle(
    self,
    p1: point_like,
    p2: point_like,
    breadth=2,
) -> pymupdf.Point:
    """Draw a squiggly (wavy) line from p1 to p2.

    Args:
        p1: point-like start of the line.
        p2: point-like end of the line.
        breadth: requested size of one step. It is revised so that a
            whole number of 4-step phases fits the distance between the
            two points.

    Returns:
        p2 as a pymupdf.Point.

    Raises:
        ValueError: if the points are too close for one full phase.
    """
    start = pymupdf.Point(p1)
    stop = pymupdf.Point(p2)
    distance = abs(stop - start)  # length of the connection

    # only whole phases (4 steps each) are drawn
    steps = 4 * int(round(distance / (4 * breadth), 0))
    if steps < 4:
        raise ValueError("points too close")
    unit = distance / steps  # revised breadth

    # Nodes are computed for a line normalized to the x-axis; the
    # inverted matrix maps them back to the original position.
    back = ~pymupdf.Matrix(pymupdf.util_hor_matrix(start, stop))

    k = 2.4142135623765633  # y of draw_curve helper point
    # y-offset per position within a phase; all others lie on the line
    y_by_phase = {1: -k, 3: k}

    nodes = [start]
    for step in range(1, steps):
        y = y_by_phase.get(step % 4, 0)
        nodes.append(pymupdf.Point(step, y) * unit * back)
    nodes.append(stop)

    # every curve uses three consecutive nodes and shares its end node
    # with the start of the next curve
    for first in range(0, len(nodes) - 2, 2):
        self.draw_curve(nodes[first], nodes[first + 1], nodes[first + 2])
    return stop
+# ==============================================================================
+# Shape.insert_text
+# ==============================================================================
def insert_text(
    self,
    point: point_like,
    buffer: typing.Union[str, list],
    *,
    fontsize: float = 11,
    lineheight: OptFloat = None,
    fontname: str = "helv",
    fontfile: OptStr = None,
    set_simple: bool = 0,
    encoding: int = 0,
    color: OptSeq = None,
    fill: OptSeq = None,
    render_mode: int = 0,
    border_width: float = 0.05,
    miter_limit: float = 1,
    rotate: int = 0,
    morph: OptSeq = None,
    stroke_opacity: float = 1,
    fill_opacity: float = 1,
    oc: int = 0,
) -> int:
    """Insert text lines starting at a point.

    The generated PDF text commands are appended to 'self.text_cont'.

    Args:
        point: point-like start position of the first character.
        buffer: the text. A string is split into lines with
            str.splitlines(); a list or tuple is used as the list of
            lines directly.
        fontsize: the font size.
        lineheight: factor multiplied with 'fontsize' to give the line
            height. If not given, the font's 'ascender - descender' is
            used as the factor, or 1.2 if that value is <= 1.
        fontname: font reference name; a leading '/' is removed.
        fontfile: passed on to page.insert_font().
        set_simple: passed on to page.insert_font().
        encoding: passed on to page.insert_font().
        color: stroke color, converted with pymupdf.ColorCode().
        fill: fill color, converted with pymupdf.ColorCode(). If not
            given and 'render_mode' is 0, 'color' is used instead.
        render_mode: text rendering mode; written as a 'Tr' operator
            only if greater than 0.
        border_width: multiplied with 'fontsize' and written as line
            width ('w'); only used if 'render_mode' is greater than 0.
        miter_limit: written as 'M' operator if not None; only used if
            'render_mode' is greater than 0.
        rotate: text rotation, must be a multiple of 90.
        morph: validated by pymupdf.CheckMorph(). If accepted, morph[0]
            is used as fixpoint and morph[1] as matrix.
        stroke_opacity: passed to page._set_opacity() as 'CA'.
        fill_opacity: passed to page._set_opacity() as 'ca'.
        oc: passed to page._get_optional_content().

    Returns:
        The number of lines output. 0 is returned if there is no text or
        if the highest character code could not be determined.

    Raises:
        ValueError: if 'rotate' is not a multiple of 90.
    """
    # ensure 'text' is a list of strings, worth dealing with
    if not bool(buffer):
        return 0

    if type(buffer) not in (list, tuple):
        text = buffer.splitlines()
    else:
        text = buffer

    if not len(text) > 0:
        return 0

    point = pymupdf.Point(point)
    try:
        # highest character code decides which glyph table is needed
        maxcode = max([ord(c) for c in " ".join(text)])
    except Exception:
        pymupdf.exception_info()
        return 0

    # ensure valid 'fontname'
    fname = fontname
    if fname.startswith("/"):
        fname = fname[1:]

    xref = self.page.insert_font(
        fontname=fname, fontfile=fontfile, encoding=encoding, set_simple=set_simple
    )
    fontinfo = pymupdf.CheckFontInfo(self.doc, xref)

    fontdict = fontinfo[1]
    ordering = fontdict["ordering"]
    simple = fontdict["simple"]
    bfname = fontdict["name"]
    ascender = fontdict["ascender"]
    descender = fontdict["descender"]
    # effective line height: explicit factor, font metrics, or 1.2
    if lineheight:
        lheight = fontsize * lineheight
    elif ascender - descender <= 1:
        lheight = fontsize * 1.2
    else:
        lheight = fontsize * (ascender - descender)

    # the stored glyph table is only used while all codes are <= 255
    if maxcode > 255:
        glyphs = self.doc.get_char_widths(xref, maxcode + 1)
    else:
        glyphs = fontdict["glyphs"]

    # convert every line to its TJ operand string
    tab = []
    for t in text:
        # simple fonts other than Symbol / ZapfDingbats get no glyph table
        if simple and bfname not in ("Symbol", "ZapfDingbats"):
            g = None
        else:
            g = glyphs
        tab.append(pymupdf.getTJstr(t, g, simple, ordering))
    text = tab

    color_str = pymupdf.ColorCode(color, "c")
    fill_str = pymupdf.ColorCode(fill, "f")
    if not fill and render_mode == 0:  # ensure fill color when 0 Tr
        fill = color
        fill_str = pymupdf.ColorCode(color, "f")

    morphing = pymupdf.CheckMorph(morph)
    rot = rotate
    if rot % 90 != 0:
        raise ValueError("bad rotate value")

    # normalize the rotation to the range 0 <= rot < 360
    while rot < 0:
        rot += 360
    rot = rot % 360  # text rotate = 0, 90, 270, 180

    # templ1: opens q / BT, positions the text matrix and selects the font
    templ1 = lambda a, b, c, d, e, f, g: f"\nq\n{a}{b}BT\n{c}1 0 0 1 {_format_g((d, e))} Tm\n/{f} {_format_g(g)} Tf "
    # templ2: shows the first line and sets the line distance via TD
    templ2 = lambda a: f"TJ\n0 -{_format_g(a)} TD\n"
    cmp90 = "0 1 -1 0 0 0 cm\n"  # rotates 90 deg counter-clockwise
    cmm90 = "0 -1 1 0 0 0 cm\n"  # rotates 90 deg clockwise
    cm180 = "-1 0 0 -1 0 0 cm\n"  # rotates by 180 deg.
    height = self.height
    width = self.width

    # setting up for standard rotation directions
    # case rotate = 0
    if morphing:
        # shift to the fixpoint, apply the morph matrix, shift back
        m1 = pymupdf.Matrix(1, 0, 0, 1, morph[0].x + self.x, height - morph[0].y - self.y)
        mat = ~m1 * morph[1] * m1
        cm = _format_g(pymupdf.JM_TUPLE(mat)) + " cm\n"
    else:
        cm = ""
    top = height - point.y - self.y  # start of 1st char
    left = point.x + self.x  # start of 1. char
    space = top  # space available
    #headroom = point.y + self.y  # distance to page border
    if rot == 90:
        left = height - point.y - self.y
        top = -point.x - self.x
        cm += cmp90
        space = width - abs(top)
        #headroom = point.x + self.x

    elif rot == 270:
        left = -height + point.y + self.y
        top = point.x + self.x
        cm += cmm90
        space = abs(top)
        #headroom = width - point.x - self.x

    elif rot == 180:
        left = -point.x - self.x
        top = -height + point.y + self.y
        cm += cm180
        space = abs(point.y + self.y)
        #headroom = height - point.y - self.y

    # optional content: wrap the output in a BDC / EMC pair
    optcont = self.page._get_optional_content(oc)
    if optcont is not None:
        bdc = "/OC /%s BDC\n" % optcont
        emc = "EMC\n"
    else:
        bdc = emc = ""

    # opacity: reference the graphics state returned by the page, if any
    alpha = self.page._set_opacity(CA=stroke_opacity, ca=fill_opacity)
    if alpha is None:
        alpha = ""
    else:
        alpha = "/%s gs\n" % alpha
    nres = templ1(bdc, alpha, cm, left, top, fname, fontsize)

    # render mode, border width and miter limit only for non-zero Tr
    if render_mode > 0:
        nres += "%i Tr " % render_mode
        nres += _format_g(border_width * fontsize) + " w "
        if miter_limit is not None:
            nres += _format_g(miter_limit) + " M "
    if color is not None:
        nres += color_str
    if fill is not None:
        nres += fill_str

    # =========================================================================
    #   start text insertion
    # =========================================================================
    nres += text[0]
    nlines = 1  # set output line counter
    if len(text) > 1:
        nres += templ2(lheight)  # line 1
    else:
        nres += 'TJ'
    for i in range(1, len(text)):
        if space < lheight:
            break  # no space left on page
        if i > 1:
            nres += "\nT* "  # move to the start of the next line
        nres += text[i] + 'TJ'
        space -= lheight
        nlines += 1

    nres += "\nET\n%sQ\n" % emc

    # =========================================================================
    #   end of text insertion
    # =========================================================================
    # update the /Contents object
    self.text_cont += nres
    return nlines
+# ==============================================================================
+# Shape.insert_textbox
+# ==============================================================================
def insert_textbox(
    self,
    rect: rect_like,
    buffer: typing.Union[str, list],
    *,
    fontname: OptStr = "helv",
    fontfile: OptStr = None,
    fontsize: float = 11,
    lineheight: OptFloat = None,
    set_simple: bool = 0,
    encoding: int = 0,
    color: OptSeq = None,
    fill: OptSeq = None,
    expandtabs: int = 1,
    border_width: float = 0.05,
    miter_limit: float = 1,
    align: int = 0,
    render_mode: int = 0,
    rotate: int = 0,
    morph: OptSeq = None,
    stroke_opacity: float = 1,
    fill_opacity: float = 1,
    oc: int = 0,
) -> float:
    """Insert text into a given rectangle.

    The text is broken into lines that fit the rectangle. If the
    resulting lines need more height than available, nothing is output.
    Otherwise the text commands are appended to 'self.text_cont'.

    Args:
        rect -- the textbox to fill
        buffer -- text to be inserted (string, or list / tuple of strings
            which are joined with line breaks)
        fontname -- a Base-14 font, font name or '/name'
        fontfile -- name of a font file
        fontsize -- font size
        lineheight -- overwrite the font property (factor applied to
            the font size)
        set_simple -- passed on to page.insert_font()
        encoding -- passed on to page.insert_font()
        color -- RGB stroke color triple
        fill -- RGB fill color triple; defaults to 'color' if
            render_mode is 0
        render_mode -- text rendering control
        border_width -- thickness of glyph borders as fraction of
            fontsize (only used if render_mode > 0)
        miter_limit -- written as 'M' operator if not None (only used
            if render_mode > 0)
        expandtabs -- tab size passed to str.expandtabs() for each line
        align -- 0 = left, 1 = center, 2 = right, 3 = justified
        rotate -- 0, 90, 180, or 270 degrees (any multiple of 90)
        morph -- morph box with a matrix and a fixpoint
        stroke_opacity -- passed to page._set_opacity() as 'CA'
        fill_opacity -- passed to page._set_opacity() as 'ca'
        oc -- passed to page._get_optional_content()
    Returns:
        A float. If not negative, the text was output and the value is
        the unused height of the text area. If negative, nothing was
        output and the absolute value is the missing height.
    Raises:
        ValueError -- for an empty or infinite rectangle, or if 'rotate'
            is not a multiple of 90.
    """
    rect = pymupdf.Rect(rect)
    if rect.is_empty or rect.is_infinite:
        raise ValueError("text box must be finite and not empty")

    color_str = pymupdf.ColorCode(color, "c")
    fill_str = pymupdf.ColorCode(fill, "f")
    if fill is None and render_mode == 0:  # ensure fill color for 0 Tr
        fill = color
        fill_str = pymupdf.ColorCode(color, "f")

    # optional content: wrap the output in a BDC / EMC pair
    optcont = self.page._get_optional_content(oc)
    if optcont is not None:
        bdc = "/OC /%s BDC\n" % optcont
        emc = "EMC\n"
    else:
        bdc = emc = ""

    # determine opacity / transparency
    alpha = self.page._set_opacity(CA=stroke_opacity, ca=fill_opacity)
    if alpha is None:
        alpha = ""
    else:
        alpha = "/%s gs\n" % alpha

    if rotate % 90 != 0:
        raise ValueError("rotate must be multiple of 90")

    # normalize the rotation to the range 0 <= rot < 360
    rot = rotate
    while rot < 0:
        rot += 360
    rot = rot % 360

    # is buffer worth of dealing with?
    if not bool(buffer):
        # nothing to write: the full extent in line direction is unused
        return rect.height if rot in (0, 180) else rect.width

    cmp90 = "0 1 -1 0 0 0 cm\n"  # rotates counter-clockwise
    cmm90 = "0 -1 1 0 0 0 cm\n"  # rotates clockwise
    cm180 = "-1 0 0 -1 0 0 cm\n"  # rotates by 180 deg.
    height = self.height

    fname = fontname
    if fname.startswith("/"):
        fname = fname[1:]

    xref = self.page.insert_font(
        fontname=fname, fontfile=fontfile, encoding=encoding, set_simple=set_simple
    )
    fontinfo = pymupdf.CheckFontInfo(self.doc, xref)

    fontdict = fontinfo[1]
    ordering = fontdict["ordering"]
    simple = fontdict["simple"]
    glyphs = fontdict["glyphs"]
    bfname = fontdict["name"]
    ascender = fontdict["ascender"]
    descender = fontdict["descender"]

    # line height factor: explicit value, font metrics, or 1.2
    if lineheight:
        lheight_factor = lineheight
    elif ascender - descender <= 1:
        lheight_factor = 1.2
    else:
        lheight_factor = ascender - descender
    lheight = fontsize * lheight_factor

    # create a list from buffer, split into its lines
    if type(buffer) in (list, tuple):
        t0 = "\n".join(buffer)
    else:
        t0 = buffer

    maxcode = max([ord(c) for c in t0])
    # replace invalid char codes for simple fonts
    if simple and maxcode > 255:
        t0 = "".join([c if ord(c) < 256 else "?" for c in t0])

    t0 = t0.splitlines()

    # glyph widths for all codes up to the highest one occurring
    glyphs = self.doc.get_char_widths(xref, maxcode + 1)
    # simple fonts other than Symbol / ZapfDingbats get no glyph table
    if simple and bfname not in ("Symbol", "ZapfDingbats"):
        tj_glyphs = None
    else:
        tj_glyphs = glyphs

    # ----------------------------------------------------------------------
    # calculate pixel length of a string
    # ----------------------------------------------------------------------
    def pixlen(x):
        """Calculate pixel length of x."""
        if ordering < 0:
            # sum of the individual glyph widths
            return sum([glyphs[ord(c)][1] for c in x]) * fontsize
        else:
            # every character counts with the full font size
            return len(x) * fontsize

    # ---------------------------------------------------------------------

    if ordering < 0:
        blen = glyphs[32][1] * fontsize  # pixel size of space character
    else:
        blen = fontsize

    text = ""  # output buffer

    if pymupdf.CheckMorph(morph):
        # shift to the fixpoint, apply the morph matrix, shift back
        m1 = pymupdf.Matrix(
            1, 0, 0, 1, morph[0].x + self.x, self.height - morph[0].y - self.y
        )
        mat = ~m1 * morph[1] * m1
        cm = _format_g(pymupdf.JM_TUPLE(mat)) + " cm\n"
    else:
        cm = ""

    # ---------------------------------------------------------------------
    # adjust for text orientation / rotation
    # ---------------------------------------------------------------------
    progr = 1  # direction of line progress
    c_pnt = pymupdf.Point(0, fontsize * ascender)  # used for line progress
    if rot == 0:  # normal orientation
        point = rect.tl + c_pnt  # line 1 is 'lheight' below top
        maxwidth = rect.width  # pixels available in one line
        maxheight = rect.height  # available text height

    elif rot == 90:  # rotate counter clockwise
        c_pnt = pymupdf.Point(fontsize * ascender, 0)  # progress in x-direction
        point = rect.bl + c_pnt  # line 1 'lheight' away from left
        maxwidth = rect.height  # pixels available in one line
        maxheight = rect.width  # available text height
        cm += cmp90

    elif rot == 180:  # text upside down
        # progress upwards in y direction
        c_pnt = -pymupdf.Point(0, fontsize * ascender)
        point = rect.br + c_pnt  # line 1 'lheight' above bottom
        maxwidth = rect.width  # pixels available in one line
        progr = -1  # subtract lheight for next line
        maxheight =rect.height  # available text height
        cm += cm180

    else:  # rotate clockwise (270 or -90)
        # progress from right to left
        c_pnt = -pymupdf.Point(fontsize * ascender, 0)
        point = rect.tr + c_pnt  # line 1 'lheight' left of right
        maxwidth = rect.height  # pixels available in one line
        progr = -1  # subtract lheight for next line
        maxheight = rect.width  # available text height
        cm += cmm90

    # =====================================================================
    # line loop
    # =====================================================================
    just_tab = []  # 'justify' indicators per line

    for i, line in enumerate(t0):
        line_t = line.expandtabs(expandtabs).split(" ")  # split into words
        num_words = len(line_t)
        lbuff = ""  # init line buffer
        rest = maxwidth  # available line pixels
        # =================================================================
        # word loop
        # =================================================================
        for j in range(num_words):
            word = line_t[j]
            pl_w = pixlen(word)  # pixel len of word
            if rest >= pl_w:  # does it fit on the line?
                lbuff += word + " "  # yes, append word
                rest -= pl_w + blen  # update available line space
                continue  # next word

            # word doesn't fit - output line (if not empty)
            if lbuff:
                lbuff = lbuff.rstrip() + "\n"  # line full, append line break
                text += lbuff  # append to total text
                just_tab.append(True)  # can align-justify

            lbuff = ""  # re-init line buffer
            rest = maxwidth  # re-init avail. space

            if pl_w <= maxwidth:  # word shorter than 1 line?
                lbuff = word + " "  # start the line with it
                rest = maxwidth - pl_w - blen  # update free space
                continue

            # long word: split across multiple lines - char by char ...
            if len(just_tab) > 0:
                just_tab[-1] = False  # cannot align-justify
            for c in word:
                if pixlen(lbuff) <= maxwidth - pixlen(c):
                    lbuff += c
                else:  # line full
                    lbuff += "\n"  # close line
                    text += lbuff  # append to text
                    just_tab.append(False)  # cannot align-justify
                    lbuff = c  # start new line with this char

            lbuff += " "  # finish long word
            rest = maxwidth - pixlen(lbuff)  # long word stored

        if lbuff:  # unprocessed line content?
            text += lbuff.rstrip()  # append to text
            just_tab.append(False)  # cannot align-justify

        if i < len(t0) - 1:  # not the last line?
            text += "\n"  # insert line break

    # compute used part of the textbox
    if text.endswith("\n"):
        text = text[:-1]
    lb_count = text.count("\n") + 1  # number of lines written

    # text height = line count * line height plus one descender value
    text_height = lheight * lb_count - descender * fontsize

    more = text_height - maxheight  # difference to height limit
    if more > pymupdf.EPSILON:  # landed too much outside rect
        return (-1) * more  # return deficit, don't output

    more = abs(more)
    if more < pymupdf.EPSILON:
        more = 0  # don't bother with epsilons
    nres = "\nq\n%s%sBT\n" % (bdc, alpha) + cm  # initialize output buffer
    # templ: positions the text matrix and selects the font for one line
    templ = lambda a, b, c, d: f"1 0 0 1 {_format_g((a, b))} Tm /{c} {_format_g(d)} Tf "
    # center, right, justify: output each line with its own specifics
    text_t = text.splitlines()  # split text in lines again
    just_tab[-1] = False  # never justify last line
    for i, t in enumerate(text_t):
        spacing = 0
        pl = maxwidth - pixlen(t)  # length of empty line part
        pnt = point + c_pnt * (i * lheight_factor)  # text start of line
        if align == 1:  # center: right shift by half width
            if rot in (0, 180):
                pnt = pnt + pymupdf.Point(pl / 2, 0) * progr
            else:
                pnt = pnt - pymupdf.Point(0, pl / 2) * progr
        elif align == 2:  # right: right shift by full width
            if rot in (0, 180):
                pnt = pnt + pymupdf.Point(pl, 0) * progr
            else:
                pnt = pnt - pymupdf.Point(0, pl) * progr
        elif align == 3:  # justify
            spaces = t.count(" ")  # number of spaces in line
            if spaces > 0 and just_tab[i]:  # if any, and we may justify
                spacing = pl / spaces  # make every space this much larger
            else:
                spacing = 0  # keep normal space length
        # values for the text matrix, depending on the rotation
        top = height - pnt.y - self.y
        left = pnt.x + self.x
        if rot == 90:
            left = height - pnt.y - self.y
            top = -pnt.x - self.x
        elif rot == 270:
            left = -height + pnt.y + self.y
            top = pnt.x + self.x
        elif rot == 180:
            left = -pnt.x - self.x
            top = -height + pnt.y + self.y

        nres += templ(left, top, fname, fontsize)

        # render mode, border width and miter limit only for non-zero Tr
        if render_mode > 0:
            nres += "%i Tr " % render_mode
            nres += _format_g(border_width * fontsize) + " w "
            if miter_limit is not None:
                nres += _format_g(miter_limit) + " M "

        if align == 3:
            nres += _format_g(spacing) + " Tw "  # word spacing for justify

        if color is not None:
            nres += color_str
        if fill is not None:
            nres += fill_str
        nres += "%sTJ\n" % pymupdf.getTJstr(t, tj_glyphs, simple, ordering)

    nres += "ET\n%sQ\n" % emc

    self.text_cont += nres
    self.updateRect(rect)
    return more
def finish(
    self,
    width: float = 1,
    color: OptSeq = (0,),
    fill: OptSeq = None,
    lineCap: int = 0,
    lineJoin: int = 0,
    dashes: OptStr = None,
    even_odd: bool = False,
    morph: OptSeq = None,
    closePath: bool = True,
    fill_opacity: float = 1,
    stroke_opacity: float = 1,
    oc: int = 0,
) -> None:
    """Finish the current drawing segment.

    The pending commands in 'draw_cont' are completed with colors,
    opacity, dashes, line style and width, or morphing, then wrapped in
    a q / Q pair and appended to 'totalcont'. Nothing happens if no
    draw commands are pending.

    Args:
        width: line width, written as 'w' unless it is 0 or 1. A width
            of 0 also switches off the stroke color.
        color: stroke color. None sets the width to 0.
        fill: fill color. If None, the path is only stroked.
        lineCap: written as 'J' operator if not 0.
        lineJoin: written as 'j' operator if not 0.
        dashes: dash pattern, written as 'd' operator unless it is
            None, "" or "[] 0".
        even_odd: use the '*' variants of the fill operators.
        morph: if accepted by pymupdf.CheckMorph(), morph[0] is used as
            fixpoint and morph[1] as matrix of a 'cm' command.
        closePath: close the path by connecting last to first point.
        fill_opacity: passed to page._set_opacity() as 'ca'.
        stroke_opacity: passed to page._set_opacity() as 'CA'.
        oc: passed to page._get_optional_content().
    """
    if self.draw_cont == "":  # nothing drawn: treat as no-op
        return

    # a border color without a border makes no sense - and vice versa
    if width == 0:
        color = None
    elif color is None:
        width = 0

    stroke_cmd = pymupdf.ColorCode(color, "c")  # proper stroke color string
    fill_cmd = pymupdf.ColorCode(fill, "f")  # proper fill color string

    # optional content: open the BDC here, the EMC follows the painting
    oc_name = self.page._get_optional_content(oc)
    if oc_name is None:
        emc = ""
    else:
        self.draw_cont = "/OC /%s BDC\n" % oc_name + self.draw_cont
        emc = "EMC\n"

    gs_name = self.page._set_opacity(CA=stroke_opacity, ca=fill_opacity)
    if gs_name is not None:
        self.draw_cont = "/%s gs\n" % gs_name + self.draw_cont

    if not (width == 1 or width == 0):
        self.draw_cont += _format_g(width) + " w\n"

    # line style settings are put in front of the draw commands
    if lineCap != 0:
        self.draw_cont = "%i J\n" % lineCap + self.draw_cont
    if lineJoin != 0:
        self.draw_cont = "%i j\n" % lineJoin + self.draw_cont
    if dashes not in (None, "", "[] 0"):
        self.draw_cont = "%s d\n" % dashes + self.draw_cont

    if closePath:
        self.draw_cont += "h\n"
        self.last_point = None

    if color is not None:
        self.draw_cont += stroke_cmd

    # choose the painting operator: stroke only, fill only, or both
    if fill is None:
        paint = "S\n"
    else:
        self.draw_cont += fill_cmd
        paint = "B" if color is not None else "f"
        paint += "*\n" if even_odd else "\n"
    self.draw_cont += paint

    self.draw_cont += emc

    if pymupdf.CheckMorph(morph):
        # shift to the fixpoint, apply the morph matrix, shift back
        shift = pymupdf.Matrix(
            1, 0, 0, 1, morph[0].x + self.x, self.height - morph[0].y - self.y
        )
        mat = ~shift * morph[1] * shift
        self.draw_cont = _format_g(pymupdf.JM_TUPLE(mat)) + " cm\n" + self.draw_cont

    # store the finished segment and get ready for the next one
    self.totalcont += "\nq\n" + self.draw_cont + "Q\n"
    self.draw_cont = ""
    self.last_point = None
    return
def commit(self, overlay: bool = True) -> None:
    """Write the collected Shape data to the page's /Contents.

    Args:
        overlay: put the data in foreground (default) or background.

    After the call, the Shape's buffers are reset so that the object
    can be re-used.
    """
    pymupdf.CheckParent(self.page)  # doc may have died meanwhile

    # text commands follow the drawings; the stream must be bytes
    self.totalcont += self.text_cont
    self.totalcont = self.totalcont.encode()
    stream = self.totalcont

    if stream:
        if overlay:
            self.page.wrap_contents()  # ensure a balanced graphics state
        # create a /Contents object with a dummy stream first ...
        xref = pymupdf.TOOLS._insert_contents(self.page, b" ", overlay)
        # ... then update it with the real data (potentially compressed)
        self.doc.update_stream(xref, stream)

    # clean up for potential re-use
    self.last_point = self.rect = None
    self.draw_cont = self.text_cont = self.totalcont = ""
def apply_redactions(
    page: pymupdf.Page, images: int = 2, graphics: int = 1, text: int = 0
) -> bool:
    """Apply the redaction annotations of the page.

    Args:
        page: the PDF page.
        images:
            0 - ignore images
            1 - remove all overlapping images
            2 - blank out overlapping image parts
            3 - remove image unless invisible
        graphics:
            0 - ignore graphics
            1 - remove graphics if contained in rectangle
            2 - remove all overlapping graphics
        text:
            0 - remove text
            1 - ignore text

    Returns:
        False if the page has no redaction annotations, else True.

    Raises:
        ValueError: if the document is closed, encrypted or no PDF, or
            if applying the redactions failed.
    """

    def center_rect(annot_rect, new_text, font, fsize):
        """Calculate minimal sub-rectangle for the overlay text.

        Notes:
            'insert_textbox' offers no vertical centering. Therefore the
            number of lines is estimated here and the top of the
            rectangle is moved down accordingly. The passed-in rectangle
            object itself is modified in that case.
        Args:
            annot_rect: the annotation rectangle
            new_text: the text to insert.
            font: the fontname. Must be one of the CJK or Base-14 set,
                else the rectangle is returned unchanged.
            fsize: the fontsize
        Returns:
            A rectangle to use instead of the annot rectangle.
        """
        if not new_text or annot_rect.width <= pymupdf.EPSILON:
            return annot_rect

        try:
            text_width = pymupdf.get_text_length(new_text, font, fsize)
        except (ValueError, mupdf.FzErrorBase):  # unsupported font
            if g_exceptions_verbose:
                pymupdf.exception_info()
            return annot_rect

        # estimate the required height from the number of full lines
        line_count = math.ceil(text_width / annot_rect.width)
        needed = line_count * (fsize * 1.2)
        if needed >= annot_rect.height:
            return annot_rect

        # lower the top edge such that the text is vertically centered
        annot_rect.y0 = (annot_rect.tl.y + annot_rect.bl.y - needed) * 0.5
        return annot_rect

    pymupdf.CheckParent(page)
    doc = page.parent
    if doc.is_encrypted or doc.is_closed:
        raise ValueError("document closed or encrypted")
    if not doc.is_pdf:
        raise ValueError("is no PDF")

    # save the values of all redactions before they are applied
    redact_annots = [
        annot._get_redact_values()
        for annot in page.annots(
            types=(pymupdf.PDF_ANNOT_REDACT,)  # pylint: disable=no-member
        )
    ]
    if not redact_annots:  # no redactions on this page
        return False

    applied = page._apply_redactions(text, images, graphics)  # call MuPDF
    if not applied:  # should not happen really
        raise ValueError("Error applying redactions.")

    # now write replacement text in old redact rectangles
    shape = page.new_shape()
    for redact in redact_annots:
        annot_rect = redact["rect"]
        fill = redact["fill"]
        if fill:
            shape.draw_rect(annot_rect)  # colorize the rect background
            shape.finish(fill=fill, color=fill)

        if "text" not in redact:  # no overlay text for this one
            continue

        new_text = redact["text"]
        align = redact.get("align", 0)
        fname = redact["fontname"]
        fsize = redact["fontsize"]
        color = redact["text_color"]

        # try finding vertical centered sub-rect
        trect = center_rect(annot_rect, new_text, fname, fsize)

        # retry with smaller fonts while there is not enough room
        result = -1
        while result < 0 and fsize >= 4:
            result = shape.insert_textbox(
                trect,
                new_text,
                fontname=fname,
                fontsize=fsize,
                color=color,
                align=align,
            )
            fsize -= 0.5  # reduce font if unsuccessful

    shape.commit()  # append new contents object
    return True
+# ------------------------------------------------------------------------------
+# Remove potentially sensitive data from a PDF. Similar to the Adobe
+# Acrobat 'sanitize' function
+# ------------------------------------------------------------------------------
def scrub(
    doc: pymupdf.Document,
    attached_files: bool = True,
    clean_pages: bool = True,
    embedded_files: bool = True,
    hidden_text: bool = True,
    javascript: bool = True,
    metadata: bool = True,
    redactions: bool = True,
    redact_images: int = 0,
    remove_links: bool = True,
    reset_fields: bool = True,
    reset_responses: bool = True,
    thumbnails: bool = True,
    xml_metadata: bool = True,
) -> None:
    """Remove potentially sensitive data from a PDF.

    Args:
        doc: the PDF document.
        attached_files: replace the content of file attachment
            annotations with a single space.
        clean_pages: clean the page /Contents. If False, 'hidden_text'
            and 'redactions' are switched off, too.
        embedded_files: delete all embedded files.
        hidden_text: remove text written with render mode 3 ("3 Tr").
        javascript: replace JavaScript actions with an empty script.
        metadata: remove the standard metadata.
        redactions: apply redaction annotations found on a page.
        redact_images: the 'images' argument for applying redactions.
        remove_links: delete all links on every page.
        reset_fields: reset all form fields (widgets).
        reset_responses: delete the responses of every annotation.
        thumbnails: remove page thumbnails.
        xml_metadata: remove XML metadata.

    Raises:
        ValueError: if the document is no PDF, is closed or encrypted,
            or if an object without definition is encountered.
    """

    def remove_hidden(cont_lines):
        """Remove hidden text from a PDF page.

        Args:
            cont_lines: list of lines with /Contents content. Should have status
                from after page.cleanContents().

        Returns:
            List of /Contents lines from which hidden text has been removed,
            or None if no "3 Tr" command was found.

        Notes:
            The input must have been created after the page's /Contents object(s)
            have been cleaned with page.cleanContents(). This ensures a standard
            formatting: one command per line, single spaces between operators.
            This allows for drastic simplification of this code.
        """
        out_lines = []  # will return this
        in_text = False  # indicate if within BT/ET object
        suppress = False  # indicate text suppression active
        make_return = False
        for line in cont_lines:
            if line == b"BT":  # start of text object
                in_text = True  # switch on
                out_lines.append(line)  # output it
                continue
            if line == b"ET":  # end of text object
                in_text = False  # switch off
                out_lines.append(line)  # output it
                continue
            if line == b"3 Tr":  # text suppression operator
                suppress = True  # switch on
                make_return = True
                continue
            # Any other render mode ends the suppression. Indexing a bytes
            # object yields an int, so the first byte must be compared via
            # startswith(); 'line[0] != b"3"' would always be true.
            if line[-2:] == b"Tr" and not line.startswith(b"3"):
                suppress = False  # text rendering changed
                out_lines.append(line)
                continue
            if line == b"Q":  # unstack command also switches off
                suppress = False
                out_lines.append(line)
                continue
            if suppress and in_text:  # suppress hidden lines
                continue
            out_lines.append(line)
        if make_return:
            return out_lines
        else:
            return None

    if not doc.is_pdf:  # only works for PDF
        raise ValueError("is no PDF")
    if doc.is_encrypted or doc.is_closed:
        raise ValueError("closed or encrypted doc")

    # the following options depend on cleaned page contents
    if not clean_pages:
        hidden_text = False
        redactions = False

    if metadata:
        doc.set_metadata({})  # remove standard metadata

    for page in doc:
        if reset_fields:
            # reset form fields (widgets)
            for widget in page.widgets():
                widget.reset()

        if remove_links:
            links = page.get_links()  # list of all links on page
            for link in links:  # remove all links
                page.delete_link(link)

        found_redacts = False
        for annot in page.annots():
            if annot.type[0] == mupdf.PDF_ANNOT_FILE_ATTACHMENT and attached_files:
                annot.update_file(buffer_=b" ")  # set file content to empty
            if reset_responses:
                annot.delete_responses()
            if annot.type[0] == pymupdf.PDF_ANNOT_REDACT:  # pylint: disable=no-member
                found_redacts = True

        if redactions and found_redacts:
            page.apply_redactions(images=redact_images)

        if not (clean_pages or hidden_text):
            continue  # done with the page

        page.clean_contents()
        if not page.get_contents():
            continue
        if hidden_text:
            xref = page.get_contents()[0]  # only one b/o cleaning!
            cont = doc.xref_stream(xref)
            cont_lines = remove_hidden(cont.splitlines())  # remove hidden text
            if cont_lines:  # something was actually removed
                cont = b"\n".join(cont_lines)
                doc.update_stream(xref, cont)  # rewrite the page /Contents

        # NOTE: this is only reached for pages that were cleaned and have
        # contents, because of the 'continue' statements above.
        if thumbnails:  # remove page thumbnails?
            if doc.xref_get_key(page.xref, "Thumb")[0] != "null":
                doc.xref_set_key(page.xref, "Thumb", "null")

    # pages are scrubbed, now perform document-wide scrubbing
    # remove embedded files
    if embedded_files:
        for name in doc.embfile_names():
            doc.embfile_del(name)

    if xml_metadata:
        doc.del_xml_metadata()

    # the object scan is only needed for XML metadata or JavaScript
    if not (xml_metadata or javascript):
        xref_limit = 0
    else:
        xref_limit = doc.xref_length()
    for xref in range(1, xref_limit):
        if not doc.xref_object(xref):
            msg = "bad xref %i - clean PDF before scrubbing" % xref
            raise ValueError(msg)
        if javascript and doc.xref_get_key(xref, "S")[1] == "/JavaScript":
            # a /JavaScript action object
            obj = "<</S/JavaScript/JS()>>"  # replace with a null JavaScript
            doc.update_object(xref, obj)  # update this object
            continue  # no further handling

        if not xml_metadata:
            continue

        if doc.xref_get_key(xref, "Type")[1] == "/Metadata":
            # delete any metadata object directly
            doc.update_object(xref, "<<>>")
            doc.update_stream(xref, b"deleted", new=True)
            continue

        # remove references to metadata from all other objects
        if doc.xref_get_key(xref, "Metadata")[0] != "null":
            doc.xref_set_key(xref, "Metadata", "null")
def _show_fz_text(text):
    """Return a short summary string for a MuPDF text object.

    The linked list of spans starting at 'text.m_internal.head' is
    walked via the 'next' attribute. The result reports the number of
    spans and the sum of their 'len' values.
    """
    #if mupdf_cppyy:
    #    assert isinstance( text, cppyy.gbl.mupdf.Text)
    #else:
    #    assert isinstance( text, mupdf.Text)
    span_count = 0
    char_count = 0
    node = text.m_internal.head
    while node:
        span_count += 1
        char_count += node.len
        node = node.next
    return f'num_spans={span_count} num_chars={char_count}'
+def fill_textbox(
+writer: pymupdf.TextWriter,
+rect: rect_like,
+text: typing.Union[str, list],
+pos: point_like = None,
+font: typing.Optional[pymupdf.Font] = None,
+fontsize: float = 11,
+lineheight: OptFloat = None,
+align: int = 0,
+warn: bool = None,
+right_to_left: bool = False,
+small_caps: bool = False,
+) -> tuple:
+"""Fill a rectangle with text.
+Args:
+writer: pymupdf.TextWriter object (= "self")
+rect: rect-like to receive the text.
+text: string or list/tuple of strings.
+pos: point-like start position of first word.
+font: pymupdf.Font object (default pymupdf.Font('helv')).
+fontsize: the fontsize.
+lineheight: overwrite the font property
+align: (int) 0 = left, 1 = center, 2 = right, 3 = justify
+warn: (bool) text overflow action: none, warn, or exception
+right_to_left: (bool) indicate right-to-left language.
+"""
+rect = pymupdf.Rect(rect)
+if rect.is_empty:
+raise ValueError("fill rect must not empty.")
+if type(font) is not pymupdf.Font:
+font = pymupdf.Font("helv")
+def textlen(x):
+"""Return length of a string."""
+return font.text_length(
+x, fontsize=fontsize, small_caps=small_caps
+)  # abbreviation
+def char_lengths(x):
+"""Return list of single character lengths for a string."""
+return font.char_lengths(x, fontsize=fontsize, small_caps=small_caps)
+def append_this(pos, text):
+ret = writer.append(
+pos, text, font=font, fontsize=fontsize, small_caps=small_caps
+)
+return ret
+tolerance = fontsize * 0.2  # extra distance to left border
+space_len = textlen(" ")
+std_width = rect.width - tolerance
+std_start = rect.x0 + tolerance
+def norm_words(width, words):
+"""Cut any word in pieces no longer than 'width'."""
+nwords = []
+word_lengths = []
+for w in words:
+wl_lst = char_lengths(w)
+wl = sum(wl_lst)
+if wl <= width:  # nothing to do - copy over
+nwords.append(w)
+word_lengths.append(wl)
+continue
+# word longer than rect width - split it in parts
+n = len(wl_lst)
+while n > 0:
+wl = sum(wl_lst[:n])
+if wl <= width:
+nwords.append(w[:n])
+word_lengths.append(wl)
+w = w[n:]
+wl_lst = wl_lst[n:]
+n = len(wl_lst)
+else:
+n -= 1
+return nwords, word_lengths
+def output_justify(start, line):
+"""Justified output of a line."""
+# ignore leading / trailing / multiple spaces
+words = [w for w in line.split(" ") if w != ""]
+nwords = len(words)
+if nwords == 0:
+return
+if nwords == 1:  # single word cannot be justified
+append_this(start, words[0])
+return
+tl = sum([textlen(w) for w in words])  # total word lengths
+gaps = nwords - 1  # number of word gaps
+gapl = (std_width - tl) / gaps  # width of each gap
+for w in words:
+_, lp = append_this(start, w)  # output one word
+start.x = lp.x + gapl  # next start at word end plus gap
+return
+asc = font.ascender
+dsc = font.descender
+if not lineheight:
+if asc - dsc <= 1:
+lheight = 1.2
+else:
+lheight = asc - dsc
+else:
+lheight = lineheight
+LINEHEIGHT = fontsize * lheight  # effective line height
+width = std_width  # available horizontal space
+# starting point of text
+if pos is not None:
+pos = pymupdf.Point(pos)
+else:  # default is just below rect top-left
+pos = rect.tl + (tolerance, fontsize * asc)
+if pos not in rect:
+raise ValueError("Text must start in rectangle.")
+# calculate displacement factor for alignment
+if align == pymupdf.TEXT_ALIGN_CENTER:
+factor = 0.5
+elif align == pymupdf.TEXT_ALIGN_RIGHT:
+factor = 1.0
+else:
+factor = 0
+# split in lines if just a string was given
+if type(text) is str:
+textlines = text.splitlines()
+else:
+textlines = []
+for line in text:
+textlines.extend(line.splitlines())
+max_lines = int((rect.y1 - pos.y) / LINEHEIGHT) + 1
+new_lines = []  # the final list of textbox lines
+no_justify = []  # no justify for these line numbers
+for i, line in enumerate(textlines):
+if line in ("", " "):
+new_lines.append((line, space_len))
+width = rect.width - tolerance
+no_justify.append((len(new_lines) - 1))
+continue
+if i == 0:
+width = rect.x1 - pos.x
+else:
+width = rect.width - tolerance
+if right_to_left:  # reverses Arabic / Hebrew text front to back
+line = writer.clean_rtl(line)
+tl = textlen(line)
+if tl <= width:  # line short enough
+new_lines.append((line, tl))
+no_justify.append((len(new_lines) - 1))
+continue
+# we need to split the line in fitting parts
+words = line.split(" ")  # the words in the line
+# cut in parts any words that are longer than rect width
+words, word_lengths = norm_words(width, words)
+n = len(words)
+while True:
+line0 = " ".join(words[:n])
+wl = sum(word_lengths[:n]) + space_len * (n - 1)
+if wl <= width:
+new_lines.append((line0, wl))
+words = words[n:]
+word_lengths = word_lengths[n:]
+n = len(words)
+line0 = None
+else:
+n -= 1
+if len(words) == 0:
+break
+assert n
+# -------------------------------------------------------------------------
+# List of lines created. Each item is (text, tl), where 'tl' is the PDF
+# output length (float) and 'text' is the text. Except for justified text,
+# this is output-ready.
+# -------------------------------------------------------------------------
+nlines = len(new_lines)
+if nlines > max_lines:
+msg = "Only fitting %i of %i lines." % (max_lines, nlines)
+if warn is None:
+pass
+elif warn:
+pymupdf.message("Warning: " + msg)
+else:
+raise ValueError(msg)
+start = pymupdf.Point()
+no_justify += [len(new_lines) - 1]  # no justifying of last line
+for i in range(max_lines):
+try:
+line, tl = new_lines.pop(0)
+except IndexError:
+if g_exceptions_verbose >= 2:   pymupdf.exception_info()
+break
+if right_to_left:  # Arabic, Hebrew
+line = "".join(reversed(line))
+if i == 0:  # may have different start for first line
+start = pos
+if align == pymupdf.TEXT_ALIGN_JUSTIFY and i not in no_justify and tl < std_width:
+output_justify(start, line)
+start.x = std_start
+start.y += LINEHEIGHT
+continue
+if i > 0 or pos.x == std_start:  # left, center, right alignments
+start.x += (width - tl) * factor
+append_this(start, line)
+start.x = std_start
+start.y += LINEHEIGHT
+return new_lines  # return non-written lines
+# ------------------------------------------------------------------------
+# Optional Content functions
+# ------------------------------------------------------------------------
def get_oc(doc: pymupdf.Document, xref: int) -> int:
    """Look up the optional content object attached to an image or form xobject.

    Args:
        doc: the PDF document.
        xref: (int) xref number of an image or form xobject.

    Returns:
        The xref number stored under the object's "OC" key, or 0 if that key
        is not an indirect reference.

    Raises:
        ValueError: if the document is closed or encrypted, or if the object
            at 'xref' has no "Subtype" of /Image or /Form.
    """
    if doc.is_closed or doc.is_encrypted:
        raise ValueError("document close or encrypted")

    kind, subtype = doc.xref_get_key(xref, "Subtype")
    is_xobject = kind == "name" and subtype in ("/Image", "/Form")
    if not is_xobject:
        raise ValueError("bad object type at xref %i" % xref)

    kind, reference = doc.xref_get_key(xref, "OC")
    # an indirect reference reads "N 0 R" - keep only the number N
    return int(reference.replace("0 R", "")) if kind == "xref" else 0
def set_oc(doc: pymupdf.Document, xref: int, oc: int) -> None:
    """Attach optional content object to image or form xobject.

    Args:
        doc: the PDF document.
        xref: (int) xref number of an image or form xobject
        oc: (int) xref number of an OCG or OCMD. Use 0 to remove an
            existing "OC" entry.

    Raises:
        ValueError: if the document is closed or encrypted, if 'xref' is no
            image / form xobject, if 'oc' is negative, or if 'oc' does not
            point to an OCG / OCMD object.
    """
    if doc.is_closed or doc.is_encrypted:
        raise ValueError("document close or encrypted")
    t, name = doc.xref_get_key(xref, "Subtype")
    if t != "name" or name not in ("/Image", "/Form"):
        raise ValueError("bad object type at xref %i" % xref)

    if oc < 0:  # would otherwise be written as a bogus "-N 0 R" reference
        raise ValueError("bad 'oc' xref %i" % oc)

    if oc == 0:
        # Removal request: only touch the object if there is an entry at all.
        # Never fall through, because that would store the invalid
        # reference "0 0 R".
        if "OC" in doc.xref_get_keys(xref):
            doc.xref_set_key(xref, "OC", "null")
        return None

    t, name = doc.xref_get_key(oc, "Type")
    if t != "name" or name not in ("/OCG", "/OCMD"):
        raise ValueError("bad object type at xref %i" % oc)
    doc.xref_set_key(xref, "OC", "%i 0 R" % oc)
    return None
def set_ocmd(
    doc: pymupdf.Document,
    xref: int = 0,
    ocgs: typing.Union[list, None] = None,
    policy: OptStr = None,
    ve: typing.Union[list, None] = None,
) -> int:
    """Create or update an OCMD object in a PDF document.

    Args:
        xref: (int) 0 for creating a new object, otherwise update existing one.
        ocgs: (list) OCG xref numbers, which shall be subject to 'policy'.
        policy: one of 'AllOn', 'AllOff', 'AnyOn', 'AnyOff' (any casing).
        ve: (list) visibility expression. Use instead of 'ocgs' with 'policy'.

    Returns:
        Xref of the created or updated OCMD.

    Raises:
        ValueError: for unknown OCG xrefs, an unknown policy, a malformed
            visibility expression, or if 'xref' is not an existing OCMD.
    """
    all_ocgs = set(doc.get_ocgs().keys())

    def ve_maker(ve):
        """Recursively convert a visibility expression to a PDF array string."""
        # NOTE: values are wrapped in a 1-tuple for '%' formatting, because a
        # tuple on the right side would be taken as the format argument list.
        if type(ve) not in (list, tuple) or len(ve) < 2:
            raise ValueError("bad 've' format: %s" % (ve,))
        operator = ve[0]
        if not isinstance(operator, str) or operator.lower() not in (
            "and",
            "or",
            "not",
        ):
            raise ValueError("bad operand: %s" % (operator,))
        if operator.lower() == "not" and len(ve) != 2:
            raise ValueError("bad 've' format: %s" % (ve,))
        parts = ["[/%s" % operator.title()]
        for x in ve[1:]:
            if type(x) is int:  # must be the xref of a known OCG
                if x not in all_ocgs:
                    raise ValueError("bad OCG %i" % x)
                parts.append(" %i 0 R" % x)
            else:  # anything else must be a nested expression
                parts.append(" %s" % ve_maker(x))
        parts.append("]")
        return "".join(parts)

    text = "<</Type/OCMD"

    if ocgs and type(ocgs) in (list, tuple):  # some OCGs are provided
        s = set(ocgs).difference(all_ocgs)  # contains illegal xrefs
        if s:
            raise ValueError("bad OCGs: %s" % s)
        text += "/OCGs[" + " ".join("%i 0 R" % x for x in ocgs) + "]"

    if policy:
        policy = str(policy).lower()
        pols = {
            "anyon": "AnyOn",
            "allon": "AllOn",
            "anyoff": "AnyOff",
            "alloff": "AllOff",
        }
        if policy not in pols:
            raise ValueError("bad policy: %s" % policy)
        text += "/P/%s" % pols[policy]

    if ve:
        text += "/VE%s" % ve_maker(ve)

    text += ">>"

    # make new object or replace old OCMD (check type first)
    if xref == 0:
        xref = doc.get_new_xref()
    elif "/Type/OCMD" not in doc.xref_object(xref, compressed=True):
        raise ValueError("bad xref or not an OCMD")

    doc.update_object(xref, text)
    return xref
def get_ocmd(doc: pymupdf.Document, xref: int) -> dict:
    """Return the definition of an OCMD (optional content membership dictionary).

    Recognizes PDF dict keys /OCGs (PDF array of OCGs), /P (policy string) and
    /VE (visibility expression, PDF array). Via string manipulation, this
    info is converted to a Python dictionary with keys "xref", "ocgs", "policy"
    and "ve" - ready to recycle as input for 'set_ocmd()'.

    Args:
        doc: the PDF document.
        xref: (int) xref number of the OCMD.

    Returns:
        A dict {"xref": int, "ocgs": list or None, "policy": str or None,
        "ve": list or None}.

    Raises:
        ValueError: if 'xref' is out of range, the object is no OCMD, or the
            /P or /VE entries cannot be parsed.
    """
    import json
    import re

    if xref not in range(doc.xref_length()):
        raise ValueError("bad xref")
    text = doc.xref_object(xref, compressed=True)
    if "/Type/OCMD" not in text:
        raise ValueError("bad object type")
    textlen = len(text)

    p0 = text.find("/OCGs[")  # look for /OCGs key
    p1 = text.find("]", p0)
    if p0 < 0 or p1 < 0:  # no OCGs found
        ocgs = None
    else:
        # array content looks like "4 0 R 5 0 R" - keep the xref numbers only
        ocgs = text[p0 + 6 : p1].replace("0 R", " ").split()
        ocgs = list(map(int, ocgs))

    if "/P/" not in text:  # look for /P policy key
        policy = None
    else:
        # The policy names use a capital "O" ("AnyOn", "AllOff", ...), so they
        # must be matched case-sensitively and as complete names.
        match = re.search(r"/P/(AllOn|AllOff|AnyOn|AnyOff)(?![A-Za-z])", text)
        if match is None:  # some irregular syntax
            raise ValueError("bad object at xref")
        policy = match.group(1)

    p0 = text.find("/VE[")  # look for /VE visibility expression key
    if p0 < 0:  # no visibility expression found
        ve = None
    else:
        lp = rp = 0  # find end of /VE by finding last ']'.
        p1 = p0
        while lp < 1 or lp != rp:  # stop when all brackets are balanced
            p1 += 1
            if not p1 < textlen:  # some irregular syntax
                raise ValueError("bad object at xref")
            if text[p1] == "[":
                lp += 1
            if text[p1] == "]":
                rp += 1
        # p1 now positioned at the last "]"
        ve = text[p0 + 3 : p1 + 1]  # the PDF /VE array
        # turn the PDF array syntax into JSON: operators become strings,
        # references become plain integers, sibling arrays get commas
        ve = (
            ve.replace("/And", '"and",')
            .replace("/Not", '"not",')
            .replace("/Or", '"or",')
        )
        ve = ve.replace(" 0 R]", "]").replace(" 0 R", ",").replace("][", "],[")
        try:
            ve = json.loads(ve)
        except Exception:
            pymupdf.exception_info()
            pymupdf.message(f"bad /VE key: {ve!r}")
            raise
    return {"xref": xref, "ocgs": ocgs, "policy": policy, "ve": ve}
+"""
+Handle page labels for PDF documents.
+Reading
+-------
+* compute the label of a page
+* find page number(s) having the given label.
+Writing
+-------
+Supports setting (defining) page labels for PDF documents.
+A big Thank You goes to WILLIAM CHAPMAN who contributed the idea and
+significant parts of the following code during late December 2020
+through early January 2021.
+"""
def rule_dict(item):
    """Make a Python dict from a PDF page label rule.

    Args:
        item -- a tuple (pno, rule) with the start page number and the rule
                string like <</S/D...>>.

    Returns:
        A dict like
        {'startpage': int, 'prefix': str, 'style': str, 'firstpagenum': int}.
        Key 'style' is only present if the rule contains an /S entry.
    """
    # Jorj McKie, 2021-01-06
    pno, rule = item
    tokens = rule[2:-2].split("/")[1:]  # strip "<<" and ">>"
    d = {"startpage": pno, "prefix": "", "firstpagenum": 1}
    skip = False
    for i, token in enumerate(tokens):
        if skip:  # this token has already been processed
            skip = False  # deactivate skipping again
            continue
        if token == "S":  # style specification
            d["style"] = tokens[i + 1]  # next token has the style
            skip = True  # do not process next token again
            continue
        if token == "Type":  # optional "/Type/PageLabel" entry
            # its value ("PageLabel") must not be mistaken for a prefix
            skip = True
            continue
        if token.startswith("P"):  # prefix specification: extract the string
            d["prefix"] = token[1:].replace("(", "").replace(")", "")
            continue
        if token.startswith("St"):  # start page number specification
            d["firstpagenum"] = int(token[2:])
    return d
def get_label_pno(pgNo, labels):
    """Return the label for this page number.

    Args:
        pgNo: page number, 0-based.
        labels: result of doc._get_page_labels().

    Returns:
        The label (str) of the page number. An empty string is returned if
        no label rule starts at or before this page.
    """
    # Jorj McKie, 2021-01-06
    candidates = [x for x in labels if x[0] <= pgNo]
    if not candidates:  # no rule covers this page
        return ""
    # the applicable rule is the one with the highest start page - do not
    # rely on the caller having sorted the list
    item = max(candidates, key=lambda x: x[0])
    rule = rule_dict(item)
    prefix = rule.get("prefix", "")
    style = rule.get("style", "")
    # make sure we start at 0 when enumerating the alphabet
    delta = -1 if style in ("a", "A") else 0
    pagenumber = pgNo - rule["startpage"] + rule["firstpagenum"] + delta
    return construct_label(style, prefix, pagenumber)
def get_label(page):
    """Compute the page label of a PDF page.

    Args:
        page: page object.

    Returns:
        The label (str) of the page, or an empty string if the owning
        document reports no page label rules.
    """
    # Jorj McKie, 2021-01-06
    rules = page.parent._get_page_labels()
    if rules:
        rules.sort()  # the lookup expects ascending start pages
        return get_label_pno(page.number, rules)
    return ""
def get_page_numbers(doc, label, only_one=False):
    """Return a list of page numbers with the given label.

    Args:
        doc: PDF document object (resp. 'self').
        label: (str) label.
        only_one: (bool) stop searching after first hit.

    Returns:
        List of page numbers having this label. The list is empty if 'label'
        is empty or the document has no page label rules.
    """
    # Jorj McKie, 2021-01-06
    numbers = []
    if not label:
        return numbers
    labels = doc._get_page_labels()
    if not labels:
        return numbers
    # same precaution as in get_label(): the lookup expects the rules in
    # ascending start page order
    labels = sorted(labels)
    for i in range(doc.page_count):
        if get_label_pno(i, labels) == label:
            numbers.append(i)
            if only_one:
                break
    return numbers
def construct_label(style, prefix, pno) -> str:
    """Build a page label from numbering style, prefix and page number.

    Args:
        style: "D" (decimal), "r" / "R" (lower / upper case roman numerals),
            "a" / "A" (lower / upper case letters). Any other value yields
            no number part.
        prefix: (str) text put in front of the number part.
        pno: (int) the number to format.

    Returns:
        The prefix followed by the formatted number.
    """
    # William Chapman, 2021-01-06
    if style == "D":
        number = str(pno)
    elif style in ("r", "R"):
        numeral = integerToRoman(pno)
        number = numeral.lower() if style == "r" else numeral.upper()
    elif style in ("a", "A"):
        letters = integerToLetter(pno)
        number = letters.lower() if style == "a" else letters.upper()
    else:  # no or unknown style: label consists of the prefix only
        number = ""
    return prefix + number
def integerToLetter(i) -> str:
    """Returns letter sequence string for integer i.

    The sequence is A, B, ..., Z, AA, AB, ..., AZ, BA, ..., ZZ, AAA, ...
    with 0 mapping to "A".

    Args:
        i: (int) 0-based position in the sequence.

    Returns:
        The upper case letter string. Negative values have no letter
        representation and return an empty string.
    """
    # William Chapman, Jorj McKie, 2021-01-06
    # NOTE(review): ISO 32000 appears to describe the letter styles as
    # "A to Z, then AA to ZZ" (repeated letters) - confirm which sequence is
    # intended before changing the one produced here.
    import string

    if i < 0:  # would otherwise index the alphabet from its end
        return ""
    ls = string.ascii_uppercase
    # Determine the number of letters 'n' and the offset 'a' of 'i' among
    # all strings of that length. Integer arithmetic only: float powers
    # become inexact for large exponents.
    n, a = 1, i
    while 26 ** n <= a:
        a -= 26 ** n
        n += 1

    letters = []
    for j in reversed(range(n)):  # most significant letter first
        f, a = divmod(a, 26 ** j)
        letters.append(ls[f])
    return "".join(letters)
def integerToRoman(num: int) -> str:
    """Return roman numeral for an integer.

    Args:
        num: (int) the number to convert.

    Returns:
        The upper case roman numeral. Zero and negative numbers have no
        roman representation and return an empty string.
    """
    # William Chapman, Jorj McKie, 2021-01-06
    roman = (
        (1000, "M"),
        (900, "CM"),
        (500, "D"),
        (400, "CD"),
        (100, "C"),
        (90, "XC"),
        (50, "L"),
        (40, "XL"),
        (10, "X"),
        (9, "IX"),
        (5, "V"),
        (4, "IV"),
        (1, "I"),
    )
    if num <= 0:  # floor division would otherwise produce a bogus numeral
        return ""

    parts = []
    for value, numeral in roman:
        count, num = divmod(num, value)
        parts.append(numeral * count)
        if num == 0:  # nothing left to convert
            break
    return "".join(parts)
def get_page_labels(doc):
    """Return the page label rules of a PDF document as dictionaries.

    Args:
        doc: PDF document (resp. 'self').

    Returns:
        A list with one dictionary per label rule, each of the format
        {'startpage': int, 'prefix': str, 'style': str, 'firstpagenum': int}.
    """
    # Jorj McKie, 2021-01-10
    rules = []
    for entry in doc._get_page_labels():
        rules.append(rule_dict(entry))  # convert (pno, rule string) item
    return rules
def set_page_labels(doc, labels):
    """Add / replace page label definitions in PDF document.

    Args:
        doc: PDF document (resp. 'self').
        labels: list of label dictionaries like:
            {'startpage': int, 'prefix': str, 'style': str, 'firstpagenum': int},
            as returned by get_page_labels(). The list itself is not modified.
    """
    # William Chapman, 2021-01-06

    def escape_pdf_string(text):
        """Escape the characters that delimit or escape PDF literal strings."""
        return text.replace("\\", "\\\\").replace("(", "\\(").replace(")", "\\)")

    def create_label_str(label):
        """Convert Python label dict to corresponding PDF rule string.

        Args:
            label: (dict) build rule for the label.
        Returns:
            PDF label rule string wrapped in "<<", ">>" and preceded by the
            start page number.
        """
        s = "%i<<" % label["startpage"]
        if label.get("prefix", "") != "":
            # the prefix becomes a PDF literal string "(...)"
            s += "/P(%s)" % escape_pdf_string(label["prefix"])
        if label.get("style", "") != "":
            s += "/S/%s" % label["style"]
        if label.get("firstpagenum", 1) > 1:  # 1 is the default: omit it
            s += "/St %i" % label["firstpagenum"]
        s += ">>"
        return s

    def create_nums(labels):
        """Return concatenated string of all labels rules.

        Args:
            labels: (list) dictionaries as created by function 'rule_dict'.
        Returns:
            PDF compatible string for page label definitions, ready to be
            enclosed in PDF array 'Nums[...]'.
        """
        # sort a copy, so the caller's list keeps its order
        ordered = sorted(labels, key=lambda x: x["startpage"])
        return "".join([create_label_str(label) for label in ordered])

    doc._set_page_labels(create_nums(labels))
+# End of Page Label Code -------------------------------------------------
def has_links(doc: pymupdf.Document) -> bool:
    """Tell whether at least one page of the PDF contains a link.

    Raises:
        ValueError: if the document is closed or is no PDF.
    """
    if doc.is_closed:
        raise ValueError("document closed")
    if not doc.is_pdf:
        raise ValueError("is no PDF")
    # items of page_annot_xrefs() carry the annotation type at index 1;
    # the scan stops at the first link found
    return any(
        entry[1] == pymupdf.PDF_ANNOT_LINK  # pylint: disable=no-member
        for pno in range(doc.page_count)
        for entry in doc.page_annot_xrefs(pno)
    )
def has_annots(doc: pymupdf.Document) -> bool:
    """Tell whether at least one page of the PDF contains an annotation.

    Links and widgets are not counted.

    Raises:
        ValueError: if the document is closed or is no PDF.
    """
    if doc.is_closed:
        raise ValueError("document closed")
    if not doc.is_pdf:
        raise ValueError("is no PDF")
    # pylint: disable=no-member
    ignored_types = (pymupdf.PDF_ANNOT_LINK, pymupdf.PDF_ANNOT_WIDGET)
    # items of page_annot_xrefs() carry the annotation type at index 1;
    # the scan stops at the first hit
    return any(
        entry[1] not in ignored_types
        for pno in range(doc.page_count)
        for entry in doc.page_annot_xrefs(pno)
    )
+# -------------------------------------------------------------------
+# Functions to recover the quad contained in a text extraction bbox
+# -------------------------------------------------------------------
def recover_bbox_quad(line_dir: tuple, span: dict, bbox: tuple) -> pymupdf.Quad:
    """Compute the quad located inside the bbox.

    The bbox may be any of the resp. tuples occurring inside the given span.

    Args:
        line_dir: (tuple) 'line["dir"]' of the owning line or None. If None,
            'span["dir"]' is used instead.
        span: (dict) the span. May be from get_texttrace() method.
        bbox: (tuple) the bbox of the span or any of its characters.

    Returns:
        The quad which is wrapped by the bbox.
    """
    direction = span["dir"] if line_dir is None else line_dir
    cos, sin = direction
    rect = pymupdf.Rect(bbox)  # make it a rect

    # with small glyph heights active, the text height is just the fontsize
    small = pymupdf.TOOLS.set_small_glyph_heights()
    factor = 1 if small else span["ascender"] - span["descender"]
    height = factor * span["size"]  # the quad's rectangle height

    # Distances from the bbox corners, at which the quad points are found.
    # Which corners are used depends on the quadrant of the writing angle.
    hs = height * sin
    hc = height * cos
    dx = (hs, 0)
    dy = (0, hc)

    if hc >= 0 and hs <= 0:  # quadrant 1
        first, second, odd_quadrant = rect.bl, rect.tr, True
    elif hc <= 0 and hs <= 0:  # quadrant 2
        first, second, odd_quadrant = rect.br, rect.tl, False
    elif hc <= 0 and hs >= 0:  # quadrant 3
        first, second, odd_quadrant = rect.tr, rect.bl, True
    else:  # quadrant 4
        first, second, odd_quadrant = rect.tl, rect.br, False

    # quadrants 1 and 3 share one displacement pattern, 2 and 4 the other
    if odd_quadrant:
        ul = first - dy
        ur = second + dx
        ll = first - dx
        lr = second + dy
    else:
        ul = first + dx
        ur = second - dy
        ll = first + dy
        lr = second - dx
    return pymupdf.Quad(ul, ur, ll, lr)
def recover_quad(line_dir: tuple, span: dict) -> pymupdf.Quad:
    """Compute the quadrilateral that envelops the text of a span.

    Args:
        line_dir: (tuple) 'line["dir"]' of the owning line.
        span: the span.

    Returns:
        The quadrilateral enveloping the span's text.

    Raises:
        ValueError: if 'line_dir' is no tuple of length 2 or 'span' is no
            dict.
    """
    dir_ok = type(line_dir) is tuple and len(line_dir) == 2
    if not dir_ok:
        raise ValueError("bad line dir argument")
    if type(span) is not dict:
        raise ValueError("bad span argument")
    # the span quad is the quad inside the span's own bbox
    return recover_bbox_quad(line_dir, span, span["bbox"])
def recover_line_quad(line: dict, spans: list = None) -> pymupdf.Quad:
    """Calculate the line quad for 'dict' / 'rawdict' text extractions.

    The lower quad points are those of the first, resp. last span quad.
    The upper points are determined by the maximum span quad height.
    From this, compute a rect with bottom-left in (0, 0), convert this to a
    quad and rotate and shift back to cover the text of the spans.

    Args:
        line: (dict) the line. Keys "dir" and - if 'spans' is omitted -
            "spans" are used.
        spans: (list, optional) sub-list of spans to consider.

    Returns:
        pymupdf.Quad covering selected spans.

    Raises:
        ValueError: if the list of spans is empty.
    """
    selected = line["spans"] if spans is None else spans  # default: all spans
    if len(selected) == 0:
        raise ValueError("bad span list")

    line_dir = line["dir"]  # text direction
    cos, sin = line_dir  # values are unused, but 'dir' must unpack to a pair

    first_quad = recover_quad(line_dir, selected[0])
    # with one span only, the last quad is the first one
    last_quad = recover_quad(line_dir, selected[-1]) if len(selected) > 1 else first_quad

    lower_left = first_quad.ll  # lower-left of line quad
    lower_right = last_quad.lr  # lower-right of line quad

    # map base line to x-axis such that lower_left goes to (0, 0)
    mat0 = pymupdf.planish_line(lower_left, lower_right)
    x_lr = lower_right * mat0

    small = pymupdf.TOOLS.set_small_glyph_heights()  # small glyph heights?
    heights = []
    for s in selected:
        heights.append(s["size"] * (1 if small else (s["ascender"] - s["descender"])))
    h = max(heights)  # the tallest span determines the line height

    line_quad = pymupdf.Rect(0, -h, x_lr.x, 0).quad  # axis-parallel line rectangle
    line_quad *= ~mat0  # rotate back and shift back
    return line_quad
def recover_span_quad(line_dir: tuple, span: dict, chars: list = None) -> pymupdf.Quad:
    """Calculate the span quad for 'dict' / 'rawdict' text extractions.

    Notes:
        There are two execution paths:
        1. For the full span quad, the result of 'recover_quad' is returned.
        2. For the quad of a sub-list of characters, the char quads are
           computed and joined. This is only supported for the "rawdict"
           extraction option.

    Args:
        line_dir: (tuple) 'line["dir"]' of the owning line. If None,
            'span["dir"]' is used.
        span: (dict) the span.
        chars: (list, optional) sub-list of characters to consider.

    Returns:
        pymupdf.Quad covering selected characters.

    Raises:
        ValueError: if 'chars' is given, but the span has no "chars" key.
    """
    if line_dir is None:  # must be a span from get_texttrace()
        line_dir = span["dir"]

    if chars is None:  # no sub-selection: quad of the complete span
        return recover_quad(line_dir, span)
    if "chars" not in span.keys():
        raise ValueError("need 'rawdict' option to sub-select chars")

    first_quad = recover_char_quad(line_dir, span, chars[0])
    # with one character only, the last quad is the first one
    last_quad = (
        recover_char_quad(line_dir, span, chars[-1]) if len(chars) > 1 else first_quad
    )

    lower_left = first_quad.ll  # lower-left of span quad
    lower_right = last_quad.lr  # lower-right of span quad

    # map base line to x-axis such that lower_left goes to (0, 0)
    mat0 = pymupdf.planish_line(lower_left, lower_right)
    x_lr = lower_right * mat0

    small = pymupdf.TOOLS.set_small_glyph_heights()  # small glyph heights?
    h = span["size"] * (1 if small else (span["ascender"] - span["descender"]))

    span_quad = pymupdf.Rect(0, -h, x_lr.x, 0).quad  # axis-parallel rectangle
    span_quad *= ~mat0  # rotate back and shift back
    return span_quad
def recover_char_quad(line_dir: tuple, span: dict, char: dict) -> pymupdf.Quad:
    """Recover the quadrilateral of a text character.

    This requires the "rawdict" option of text extraction.

    Args:
        line_dir: (tuple) 'line["dir"]' of the span's line. If None,
            'span["dir"]' is used.
        span: (dict) the span dict.
        char: (dict) the character dict. A tuple is accepted too, in which
            case its item 3 is taken as the bbox.

    Returns:
        The quadrilateral enveloping the character.

    Raises:
        ValueError: if 'line_dir' is no tuple of length 2, 'span' is no dict
            or 'char' is neither a dict nor a tuple.
    """
    if line_dir is None:
        line_dir = span["dir"]
    if type(line_dir) is not tuple or len(line_dir) != 2:
        raise ValueError("bad line dir argument")
    if type(span) is not dict:
        raise ValueError("bad span argument")
    if type(char) is dict:
        bbox = pymupdf.Rect(char["bbox"])
    elif type(char) is tuple:
        bbox = pymupdf.Rect(char[3])
    else:
        # name the argument that actually is wrong
        raise ValueError("bad char argument")

    return recover_bbox_quad(line_dir, span, bbox)
+# -------------------------------------------------------------------
+# Building font subsets using fontTools
+# -------------------------------------------------------------------
def subset_fonts(doc: pymupdf.Document, verbose: bool = False, fallback: bool = False) -> OptInt:
    """Build font subsets in a PDF.

    Eligible fonts are potentially replaced by smaller versions. Page text is
    NOT rewritten and thus should retain properties like being hidden or
    controlled by optional content.

    This method by default uses MuPDF's own internal feature to create subset
    fonts. As this is a new function, errors may still occur. In this case,
    please fall back to using the previous version by using "fallback=True".
    Fallback mode requires the external package 'fontTools'.

    Args:
        fallback: use the older deprecated implementation.
        verbose: only used by fallback mode.

    Returns:
        The new MuPDF-based code returns None.  The deprecated fallback
        mode returns 0 if there are no fonts to subset.  Otherwise, it
        returns the decrease in fontsize (the difference in fontsize),
        measured in bytes.
    """
    # Font binaries: -  "buffer" -> (names, xrefs, (unicodes, glyphs))
    # An embedded font is uniquely defined by its fontbuffer only. It may have
    # multiple names and xrefs.
    # Once the sets of used unicodes and glyphs are known, we compute a
    # smaller version of the buffer using package fontTools.

    if not fallback:  # by default use MuPDF function
        pdf = mupdf.pdf_document_from_fz_document(doc)
        mupdf.pdf_subset_fonts2(pdf, list(range(doc.page_count)))
        return None

    import re

    font_buffers = {}

    def descendant_xref(xref):
        """Return the xref found in '/DescendantFonts' of a font, or None."""
        df = doc.xref_get_key(xref, "DescendantFonts")
        if df[0] != "array":  # only handle xref specifications
            return None
        # the array looks like "[N 0 R]"
        return int(df[1][1:-1].replace("0 R", ""))

    def get_old_widths(xref):
        """Retrieve old font '/W' and '/DW' values."""
        df_xref = descendant_xref(xref)
        if df_xref is None:
            return None, None
        widths = doc.xref_get_key(df_xref, "W")
        widths = widths[1] if widths[0] == "array" else None  # None: no widths
        dwidths = doc.xref_get_key(df_xref, "DW")
        dwidths = dwidths[1] if dwidths[0] == "int" else None
        return widths, dwidths

    def set_old_widths(xref, widths, dwidths):
        """Restore the old '/W' and '/DW' in subsetted font.

        If either parameter is None or evaluates to False, the corresponding
        dictionary key will be set to null.
        """
        df_xref = descendant_xref(xref)
        if df_xref is None:
            return None
        for key, value in (("W", widths), ("DW", dwidths)):
            if type(value) is str and value:  # a usable old value: restore it
                doc.xref_set_key(df_xref, key, value)
            elif doc.xref_get_key(df_xref, key)[0] != "null":
                # no old value: remove the key of the subset font
                doc.xref_set_key(df_xref, key, "null")
            # else: no old value and key not present - nothing to do. In
            # particular, never pass a non-string value to xref_set_key().
        return None

    def set_subset_fontname(new_xref):
        """Generate a name prefix to tag a font as subset.

        We use a random generator to select 6 upper case ASCII characters.
        The prefixed name must be put in the font xref as the "/BaseFont" value
        and in the FontDescriptor object as the '/FontName' value.
        """
        # The following generates a prefix like 'ABCDEF+'
        import random
        import string

        prefix = "".join(random.choices(tuple(string.ascii_uppercase), k=6)) + "+"
        font_str = doc.xref_object(new_xref, compressed=True)
        font_str = font_str.replace("/BaseFont/", "/BaseFont/" + prefix)
        df_xref = descendant_xref(new_xref)
        if df_xref is not None:
            fd = doc.xref_get_key(df_xref, "FontDescriptor")
            if fd[0] == "xref":
                fd_xref = int(fd[1].replace("0 R", ""))
                fd_str = doc.xref_object(fd_xref, compressed=True)
                fd_str = fd_str.replace("/FontName/", "/FontName/" + prefix)
                doc.update_object(fd_xref, fd_str)
        doc.update_object(new_xref, font_str)

    def build_subset(buffer, unc_set, gid_set):
        """Build font subset using fontTools.

        Args:
            buffer: (bytes) the font given as a binary buffer.
            unc_set: (set) required unicodes.
            gid_set: (set) required glyph ids. Used instead of 'unc_set' if
                that contains the error unicode 0xFFFD.
        Returns:
            Either None if subsetting is unsuccessful or the subset font buffer.
        """
        try:
            import fontTools.subset as fts
        except ImportError:
            if g_exceptions_verbose:
                pymupdf.exception_info()
            pymupdf.message("This method requires fontTools to be installed.")
            raise
        import tempfile

        with tempfile.TemporaryDirectory() as tmp_dir:
            oldfont_path = f"{tmp_dir}/oldfont.ttf"
            newfont_path = f"{tmp_dir}/newfont.ttf"
            uncfile_path = f"{tmp_dir}/uncfile.txt"
            args = [
                oldfont_path,
                "--retain-gids",
                f"--output-file={newfont_path}",
                "--layout-features=*",
                "--passthrough-tables",
                "--ignore-missing-glyphs",
                "--ignore-missing-unicodes",
                "--symbol-cmap",
            ]

            # store glyph ids or unicodes as file
            with open(uncfile_path, "w", encoding="utf8") as unc_file:
                if 0xFFFD in unc_set:  # error unicode exists -> use glyphs
                    args.append(f"--gids-file={uncfile_path}")
                    gid_set.add(189)  # always included (as in the original code)
                    for gid in list(gid_set):
                        unc_file.write("%i\n" % gid)
                else:
                    args.append(f"--unicodes-file={uncfile_path}")
                    unc_set.add(255)  # always included (as in the original code)
                    for unc in list(unc_set):
                        unc_file.write("%04x\n" % unc)

            # store fontbuffer as a file
            with open(oldfont_path, "wb") as fontfile:
                fontfile.write(buffer)
            try:
                os.remove(newfont_path)  # remove old file
            except Exception:  # best effort: normally there is no such file
                pass
            try:  # invoke fontTools subsetter
                fts.main(args)
                font = pymupdf.Font(fontfile=newfont_path)
                new_buffer = font.buffer  # subset font binary
                if font.glyph_count == 0:  # intercept empty font
                    new_buffer = None
            except Exception:
                pymupdf.exception_info()
                new_buffer = None
        return new_buffer

    def repl_fontnames(doc):
        """Populate 'font_buffers'.

        For each font candidate, store its xref and the list of names
        by which PDF text may refer to it (there may be multiple).
        """

        def norm_name(name):
            """Recreate font name that contains PDF hex codes.

            E.g. #20 -> space, chr(32)
            """
            # One pass over the name: a decoded character is never decoded
            # again, and a '#' without two hex digits is left unchanged.
            return re.sub(
                r"#([0-9A-Fa-f]{2})",
                lambda m: chr(int(m.group(1), 16)),
                name,
            )

        def get_fontnames(doc, item):
            """Return a list of fontnames for an item of page.get_fonts().

            There may be multiple names e.g. for Type0 fonts.
            """
            fontname = item[3]
            names = [fontname]
            fontname = doc.xref_get_key(item[0], "BaseFont")[1][1:]
            fontname = norm_name(fontname)
            if fontname not in names:
                names.append(fontname)
            descendents = doc.xref_get_key(item[0], "DescendantFonts")
            if descendents[0] != "array":
                return names
            descendents = descendents[1][1:-1]
            if descendents.endswith(" 0 R"):
                xref = int(descendents[:-4])
                descendents = doc.xref_object(xref, compressed=True)
            p1 = descendents.find("/BaseFont")
            if p1 >= 0:
                p2 = descendents.find("/", p1 + 1)  # start of the name value
                if p2 >= 0:
                    # the name ends at the next key or at the dictionary end,
                    # whichever comes first (ignore "not found" results)
                    ends = [
                        p
                        for p in (
                            descendents.find("/", p2 + 1),
                            descendents.find(">>", p2 + 1),
                        )
                        if p >= 0
                    ]
                    p1 = min(ends) if ends else len(descendents)
                    fontname = descendents[p2 + 1 : p1]
                    fontname = norm_name(fontname)
                    if fontname not in names:
                        names.append(fontname)
            return names

        for i in range(doc.page_count):
            for f in doc.get_page_fonts(i, full=True):
                font_xref = f[0]  # font xref
                font_ext = f[1]  # font file extension
                basename = f[3]  # font basename

                if font_ext not in (  # skip if not supported by fontTools
                    "otf",
                    "ttf",
                    "woff",
                    "woff2",
                ):
                    continue
                # skip fonts which already are subsets
                if len(basename) > 6 and basename[6] == "+":
                    continue

                extr = doc.extract_font(font_xref)
                fontbuffer = extr[-1]
                names = get_fontnames(doc, f)
                name_set, xref_set, subsets = font_buffers.get(
                    fontbuffer, (set(), set(), (set(), set()))
                )
                xref_set.add(font_xref)
                for name in names:
                    name_set.add(name)
                font = pymupdf.Font(fontbuffer=fontbuffer)
                name_set.add(font.name)
                del font
                font_buffers[fontbuffer] = (name_set, xref_set, subsets)

    def find_buffer_by_name(name):
        """Return the font buffer known under this name, or None."""
        for buffer, (name_set, _, _) in font_buffers.items():
            if name in name_set:
                return buffer
        return None

    # -----------------
    # main function
    # -----------------
    repl_fontnames(doc)  # populate font information
    if not font_buffers:  # nothing found to do
        if verbose:
            pymupdf.message("No fonts to subset.")
        return 0

    old_fontsize = sum(len(fontbuffer) for fontbuffer in font_buffers)
    new_fontsize = 0

    # Scan page text for usage of subsettable fonts
    for page in doc:
        # go through the text and extend set of used glyphs by font
        # we use a modified MuPDF trace device, which delivers us glyph ids.
        for span in page.get_texttrace():
            if type(span) is not dict:  # skip useless information
                continue
            fontname = span["font"][:33]  # fontname for the span
            buffer = find_buffer_by_name(fontname)
            if buffer is None:
                continue
            name_set, xref_set, (set_ucs, set_gid) = font_buffers[buffer]
            for c in span["chars"]:
                set_ucs.add(c[0])  # unicode
                set_gid.add(c[1])  # glyph id
            font_buffers[buffer] = (name_set, xref_set, (set_ucs, set_gid))

    # build the font subsets
    for old_buffer, (name_set, xref_set, subsets) in font_buffers.items():
        new_buffer = build_subset(old_buffer, subsets[0], subsets[1])
        fontname = list(name_set)[0]
        if new_buffer is None or len(new_buffer) >= len(old_buffer):
            # subset was not created or did not get smaller
            if verbose:
                pymupdf.message(f"Cannot subset {fontname!r}.")
            continue
        if verbose:
            pymupdf.message(f"Built subset of font {fontname!r}.")
        val = doc._insert_font(fontbuffer=new_buffer)  # store subset font in PDF
        new_xref = val[0]  # get its xref
        set_subset_fontname(new_xref)  # tag fontname as subset font
        font_str = doc.xref_object(  # get its object definition
            new_xref,
            compressed=True,
        )
        # walk through the original font xrefs and replace each by the subset def
        for font_xref in xref_set:
            # we need the original '/W' and '/DW' width values
            width_table, def_width = get_old_widths(font_xref)
            # ... and replace original font definition at xref with it
            doc.update_object(font_xref, font_str)
            # now copy over old '/W' and '/DW' values
            if width_table or def_width:
                set_old_widths(font_xref, width_table, def_width)
        # 'new_xref' remains unused in the PDF and must be removed
        # by garbage collection.
        new_fontsize += len(new_buffer)

    return old_fontsize - new_fontsize
+# -------------------------------------------------------------------
+# Copy XREF object to another XREF
+# -------------------------------------------------------------------
def xref_copy(doc: pymupdf.Document, source: int, target: int, *, keep: list = None) -> None:
    """Copy a PDF dictionary object to another one given their xref numbers.

    Args:
        doc: PDF document object
        source: source xref number
        target: target xref number, the xref must already exist
        keep: an optional list of 1st level keys in target that should not be
            removed before copying.

    Notes:
        This works similar to the copy() method of dictionaries in Python. The
        source may be a stream object.
    """
    if doc.xref_is_stream(source):
        # transfer the raw stream first, so the source compression is kept
        raw_stream = doc.xref_stream_raw(source)
        doc.update_stream(
            target,
            raw_stream,
            compress=False,  # keeps source compression
            new=True,  # in case target is no stream
        )

    # empty the target completely, except for the keys to be kept
    protected = [] if keep is None else keep
    for key in doc.xref_get_keys(target):
        if key not in protected:
            doc.xref_set_key(target, key, "null")

    # copy over all source dict items
    for key in doc.xref_get_keys(source):
        value = doc.xref_get_key(source, key)[1]
        doc.xref_set_key(target, key, value)

Mercurial > hgrepos > Python2 > PyMuPDF

comparison src/utils.py @ 1:1d09e1dec1d9 upstream