comparison src/utils.py @ 1:1d09e1dec1d9 upstream

ADD: PyMuPDF v1.26.4: the original sdist. It does not yet contain MuPDF. This normally will be downloaded when building PyMuPDF.
author Franz Glasner <fzglas.hg@dom66.de>
date Mon, 15 Sep 2025 11:37:51 +0200
parents
children a6bc019ac0b2
comparison
equal deleted inserted replaced
-1:000000000000 1:1d09e1dec1d9
1 # ------------------------------------------------------------------------
2 # Copyright 2020-2022, Harald Lieder, mailto:harald.lieder@outlook.com
3 # License: GNU AFFERO GPL 3.0, https://www.gnu.org/licenses/agpl-3.0.html
4 #
5 # Part of "PyMuPDF", a Python binding for "MuPDF" (http://mupdf.com), a
6 # lightweight PDF, XPS, and E-book viewer, renderer and toolkit which is
7 # maintained and developed by Artifex Software, Inc. https://artifex.com.
8 # ------------------------------------------------------------------------
9 import io
10 import math
11 import os
12 import typing
13 import weakref
14
15 try:
16 from . import pymupdf
17 except Exception:
18 import pymupdf
19 try:
20 from . import mupdf
21 except Exception:
22 import mupdf
23
# Shortcuts to objects provided by the main pymupdf module.
_format_g = pymupdf.format_g
g_exceptions_verbose = pymupdf.g_exceptions_verbose

# Plain strings used as annotations for "..._like" arguments in this module.
point_like, rect_like, matrix_like, quad_like = (
    "point_like",
    "rect_like",
    "matrix_like",
    "quad_like",
)
32
# ByteString is gone from typing in 3.14, and collections.abc.Buffer is
# available from 3.12 only - so fall back to an explicit union of the
# built-in binary types when typing no longer offers ByteString.
ByteString = getattr(typing, "ByteString", None)
if ByteString is None:
    # pylint: disable=unsupported-binary-operation
    ByteString = bytes | bytearray | memoryview

# Shorthand aliases for annotations used throughout this module.
AnyType = typing.Any
OptInt = typing.Optional[int]
OptFloat = typing.Optional[float]
OptStr = typing.Optional[str]
OptDict = typing.Optional[dict]
OptBytes = typing.Optional[ByteString]
OptSeq = typing.Optional[typing.Sequence]

# This is a collection of functions to extend PyMuPDF.
52
53
def write_text(
    page: pymupdf.Page,
    rect=None,
    writers=None,
    overlay=True,
    color=None,
    opacity=None,
    keep_proportion=True,
    rotate=0,
    oc=0,
) -> None:
    """Write the text of one or more pymupdf.TextWriter objects.

    A single writer with neither a target rectangle nor a rotation is written
    directly to the page. In all other cases the writers are first written to
    a page of a temporary PDF, which is then shown on 'page' via
    page.show_pdf_page().

    Args:
        page: the target pymupdf.Page.
        rect: target rectangle. If None, the union of the text writers is used.
        writers: one pymupdf.TextWriter or an indexable sequence of them.
        overlay: put in foreground or background.
        color: handed to the write_text() method of each writer.
        opacity: handed to the write_text() method of each writer.
        keep_proportion: maintain aspect ratio of rectangle sides.
        rotate: arbitrary rotation angle.
        oc: the xref of an optional content object. Not used when a single
            writer is written directly (no 'rect', no rotation).

    Raises:
        ValueError: if 'writers' is empty or None.
    """
    assert isinstance(page, pymupdf.Page)
    if not writers:
        raise ValueError("need at least one pymupdf.TextWriter")

    # isinstance (instead of an exact type comparison) so that one instance of
    # a TextWriter subclass is treated like a single TextWriter as well,
    # rather than failing on 'writers[0]' below.
    if isinstance(writers, pymupdf.TextWriter):
        if rotate == 0 and rect is None:
            # Simple case: no intermediate PDF page required.
            writers.write_text(page, opacity=opacity, color=color, overlay=overlay)
            return None
        writers = (writers,)

    # Write all writers to a temporary page of the same size as the target
    # and compute the union of their text rectangles.
    clip = writers[0].text_rect
    textdoc = pymupdf.Document()
    tpage = textdoc.new_page(width=page.rect.width, height=page.rect.height)
    for writer in writers:
        clip |= writer.text_rect
        writer.write_text(tpage, opacity=opacity, color=color)

    if rect is None:
        rect = clip

    page.show_pdf_page(
        rect,
        textdoc,
        0,
        overlay=overlay,
        keep_proportion=keep_proportion,
        rotate=rotate,
        clip=clip,
        oc=oc,
    )
    # drop the references to the temporary objects
    textdoc = None
    tpage = None
104
105
def show_pdf_page(
    page,
    rect,
    docsrc,
    pno=0,
    keep_proportion=True,
    overlay=True,
    oc=0,
    rotate=0,
    clip=None,
) -> int:
    """Show page number 'pno' of PDF 'docsrc' in rectangle 'rect'.

    Args:
        page: the target page. Its document must be a PDF and must differ
            from 'docsrc'.
        rect: (rect-like) where to place the source image
        docsrc: (document) source PDF
        pno: (int) source page number; negative values count from the end
        keep_proportion: (bool) do not change width-height-ratio
        overlay: (bool) put in foreground
        oc: (xref) make visibility dependent on this OCG / OCMD (which must be
            defined in the target PDF)
        rotate: rotation angle in degrees; handed unchanged to pymupdf.Matrix
        clip: (rect-like) part of source page rectangle
    Returns:
        xref of inserted object (for reuse)
    Raises:
        ValueError: if one of the documents is no PDF, if 'rect' or the
            resulting source rectangle is empty or infinite, if a negative
            'pno' is used with a source document that has no pages, or if
            source and target document are the same.
    """

    def calc_matrix(sr, tr, keep=True, rotate=0):
        """Calculate transformation matrix from source to target rect.

        Notes:
            The product of four matrices in this sequence: (1) translate the
            source center to the origin, (2) rotate, (3) scale, (4) translate
            to the target's center.
        Args:
            sr: source rect in PDF (!) coordinate system
            tr: target rect in PDF coordinate system
            keep: whether to keep source ratio of width to height
            rotate: rotation angle in degrees
        Returns:
            Transformation matrix as returned by pymupdf.JM_TUPLE.
        """
        # center points of source and target rectangle
        smp = (sr.tl + sr.br) / 2.0
        tmp = (tr.tl + tr.br) / 2.0

        # m moves to (0, 0), then rotates
        m = pymupdf.Matrix(1, 0, 0, 1, -smp.x, -smp.y) * pymupdf.Matrix(rotate)

        sr1 = sr * m  # resulting source rect to calculate scale factors

        fw = tr.width / sr1.width  # scale the width
        fh = tr.height / sr1.height  # scale the height
        if keep:
            fw = fh = min(fw, fh)  # take min if keeping aspect ratio

        m *= pymupdf.Matrix(fw, fh)  # concat scale matrix
        m *= pymupdf.Matrix(1, 0, 0, 1, tmp.x, tmp.y)  # concat move to target center
        return pymupdf.JM_TUPLE(m)

    pymupdf.CheckParent(page)
    doc = page.parent

    if not doc.is_pdf or not docsrc.is_pdf:
        raise ValueError("is no PDF")

    # 'rect' is documented as rect-like: convert, so that plain sequences
    # work too (insert_image treats its 'rect' argument the same way).
    rect = pymupdf.Rect(rect)
    if rect.is_empty or rect.is_infinite:
        raise ValueError("rect must be finite and not empty")

    if pno < 0:  # support negative page numbers
        page_count = docsrc.page_count
        if page_count < 1:
            # repeatedly adding a page count of 0 would never terminate
            raise ValueError("source document has no pages")
        pno %= page_count
    src_page = docsrc[pno]  # load source page

    tar_rect = rect * ~page.transformation_matrix  # target rect in PDF coordinates

    src_rect = src_page.rect if not clip else src_page.rect & clip  # source rect
    if src_rect.is_empty or src_rect.is_infinite:
        raise ValueError("clip must be finite and not empty")
    src_rect = src_rect * ~src_page.transformation_matrix  # ... in PDF coord

    matrix = calc_matrix(src_rect, tar_rect, keep=keep_proportion, rotate=rotate)

    # reference names already used by XObjects, images and fonts of the page
    used_names = {item[1] for item in doc.get_page_xobjects(page.number)}
    used_names.update(item[7] for item in doc.get_page_images(page.number))
    used_names.update(item[4] for item in doc.get_page_fonts(page.number))

    # create a name not in that collection
    prefix = "fzFrm"
    counter = 0
    _imgname = prefix + "0"
    while _imgname in used_names:
        counter += 1
        _imgname = prefix + str(counter)

    isrc = docsrc._graft_id  # used as key for graftmaps
    if doc._graft_id == isrc:
        raise ValueError("source document must not equal target")

    # retrieve / make pymupdf.Graftmap for source PDF
    gmap = doc.Graftmaps.get(isrc, None)
    if gmap is None:
        gmap = pymupdf.Graftmap(doc)
        doc.Graftmaps[isrc] = gmap

    # take note of generated xref for automatic reuse
    pno_id = (isrc, pno)  # id of docsrc[pno]
    xref = doc.ShownPages.get(pno_id, 0)

    if overlay:
        page.wrap_contents()  # ensure a balanced graphics state
    xref = page._show_pdf_page(
        src_page,
        overlay=overlay,
        matrix=matrix,
        xref=xref,
        oc=oc,
        clip=src_rect,
        graftmap=gmap,
        _imgname=_imgname,
    )
    doc.ShownPages[pno_id] = xref

    return xref
229
230
def replace_image(page: pymupdf.Page, xref: int, *, filename=None, pixmap=None, stream=None):
    """Replace the image referred to by xref.

    The object definition stored under 'xref' is overwritten with that of a
    new image. The page's appearance instructions are left untouched, so the
    new image shows up with the same bbox, rotation etc. as the old one.
    Supplying a small, fully transparent image gives the effect of a deleted
    image. Another typical use is replacing a large image by a smaller
    version, e.g. one with a lower resolution or in gray instead of color.

    Args:
        page: the page showing the image.
        xref: the xref of the image to replace.
        filename, pixmap, stream: exactly one of these must be provided. The
            meaning being the same as in Page.insert_image.

    Raises:
        ValueError: if 'xref' is no image, or if not exactly one of
            filename / stream / pixmap was given.
    """
    doc = page.parent  # the owning document
    if not doc.xref_is_image(xref):
        raise ValueError("xref not an image")

    sources_given = sum(map(bool, (filename, stream, pixmap)))
    if sources_given != 1:
        raise ValueError("Exactly one of filename/stream/pixmap must be given")

    # Insert the new image anywhere in the page - only its object matters.
    inserted_xref = page.insert_image(
        page.rect, filename=filename, stream=stream, pixmap=pixmap
    )
    doc.xref_copy(inserted_xref, xref)  # copy over new to old

    # The insertion has created a new /Contents source (the last one), which
    # is set to a single space here.
    contents_xrefs = page.get_contents()
    doc.update_stream(contents_xrefs[-1], b" ")

    page._image_info = None  # clear cache of extracted image information
261
262
def delete_image(page: pymupdf.Page, xref: int):
    """Delete the image referred to by xref.

    The image is not physically removed: it is replaced by a small
    transparent pixmap via method Page.replace_image.

    Args:
        page: the page showing the image.
        xref: xref of the image to delete.
    """
    # A tiny gray pixmap with alpha channel (its dimension does not matter) ...
    transparent = pymupdf.Pixmap(pymupdf.csGRAY, (0, 0, 1, 1), 1)
    # ... with all sample bytes cleared to 0x00.
    transparent.clear_with()
    page.replace_image(xref, pixmap=transparent)
275
276
def insert_image(
    page,
    rect,
    *,
    alpha=-1,
    filename=None,
    height=0,
    keep_proportion=True,
    mask=None,
    oc=0,
    overlay=True,
    pixmap=None,
    rotate=0,
    stream=None,
    width=0,
    xref=0,
):
    """Insert an image for display in a rectangle.

    Args:
        page: the target page (must belong to a PDF).
        rect: (rect_like) position of image on the page.
        alpha: (int, optional) set to 0 if image has no transparency.
        filename: (str, Path, file object) image filename.
        height: (int) handed on to the low-level insertion unchanged.
        keep_proportion: (bool) keep width / height ratio (default).
        mask: (bytes, optional) image consisting of alpha values to use.
        oc: (int) xref of OCG or OCMD to declare as Optional Content.
        overlay: (bool) put in foreground (default) or background.
        pixmap: (pymupdf.Pixmap) use this as image.
        rotate: (int) rotate by 0, 90, 180 or 270 degrees.
        stream: (bytes) use this as image.
        width: (int) handed on to the low-level insertion unchanged.
        xref: (int) use this as image.

    'page' and 'rect' are positional, all other parameters are keywords.

    If 'xref' is given, that image is used. Other input options are ignored.
    Else, exactly one of pixmap, stream or filename must be given.

    'alpha=0' for non-transparent images improves performance significantly.
    Affects stream and filename only.

    Optimum transparent insertions are possible by using filename / stream in
    conjunction with a 'mask' image of alpha values.

    Returns:
        xref (int) of inserted image. Re-use as argument for multiple insertions.

    Raises:
        ValueError: for a non-PDF document or invalid arguments.
        FileNotFoundError: if 'filename' does not exist.
    """
    pymupdf.CheckParent(page)
    doc = page.parent
    if not doc.is_pdf:
        raise ValueError("is no PDF")

    if xref == 0:
        sources_given = bool(filename) + bool(stream) + bool(pixmap)
        if sources_given != 1:
            raise ValueError("xref=0 needs exactly one of filename, pixmap, stream")

    # Reduce path-like objects and file objects to a plain string.
    if filename and type(filename) is not str:
        if hasattr(filename, "absolute"):
            filename = str(filename)
        elif hasattr(filename, "name"):
            filename = filename.name
        else:
            raise ValueError("bad filename")

    binary_types = (bytes, bytearray, io.BytesIO)
    if filename and not os.path.exists(filename):
        raise FileNotFoundError("No such file: '%s'" % filename)
    if stream and type(stream) not in binary_types:
        raise ValueError("stream must be bytes-like / BytesIO")
    if pixmap and type(pixmap) is not pymupdf.Pixmap:
        raise ValueError("pixmap must be a pymupdf.Pixmap")
    if mask:
        if not (stream or filename):
            raise ValueError("mask requires stream or filename")
        if type(mask) not in binary_types:
            raise ValueError("mask must be bytes-like / BytesIO")

    # Bring the rotation into the range 0 <= rotate < 360.
    while rotate < 0:
        rotate += 360
    while rotate >= 360:
        rotate -= 360
    if rotate not in (0, 90, 180, 270):
        raise ValueError("bad rotate value")

    target = pymupdf.Rect(rect)
    if target.is_empty or target.is_infinite:
        raise ValueError("rect must be finite and not empty")
    clip = target * ~page.transformation_matrix

    # Create a unique image reference name: it must differ from the names of
    # all images, XObjects and fonts of the page.
    taken = {item[7] for item in doc.get_page_images(page.number)}
    taken.update(item[1] for item in doc.get_page_xobjects(page.number))
    taken.update(item[4] for item in doc.get_page_fonts(page.number))
    prefix = "fzImg"  # 'pymupdf image'
    counter = 0
    _imgname = prefix + "0"  # first name candidate
    while _imgname in taken:
        counter += 1
        _imgname = prefix + str(counter)  # try new name

    if overlay:
        page.wrap_contents()  # ensure a balanced graphics state

    xref, new_digests = page._insert_image(
        filename=filename,
        pixmap=pixmap,
        stream=stream,
        imask=mask,
        clip=clip,
        overlay=overlay,
        oc=oc,
        xref=xref,
        rotate=rotate,
        keep_proportion=keep_proportion,
        width=width,
        height=height,
        alpha=alpha,
        _imgname=_imgname,
        digests=doc.InsertedImages,
    )
    if new_digests is not None:
        doc.InsertedImages = new_digests

    return xref
400
401
def search_for(
    page,
    text,
    *,
    clip=None,
    quads=False,
    flags=pymupdf.TEXT_DEHYPHENATE
    | pymupdf.TEXT_PRESERVE_WHITESPACE
    | pymupdf.TEXT_PRESERVE_LIGATURES
    | pymupdf.TEXT_MEDIABOX_CLIP,
    textpage=None,
) -> list:
    """Search for a string on a page.

    Args:
        page: the page to search.
        text: string to be searched for
        clip: restrict search to this rectangle. Only used when a new
            textpage is created.
        quads: (bool) return quads instead of rectangles
        flags: bit switches, default: join hyphened words. Only used when a
            new textpage is created.
        textpage: a pre-created pymupdf.TextPage
    Returns:
        a list of rectangles or quads, each containing one occurrence.
    Raises:
        ValueError: if 'textpage' was not made from this page.
    """
    if clip is not None:
        clip = pymupdf.Rect(clip)

    pymupdf.CheckParent(page)
    temporary = textpage is None
    if temporary:
        tp = page.get_textpage(clip=clip, flags=flags)  # create pymupdf.TextPage
    else:
        tp = textpage
        if tp.parent != page:
            raise ValueError("not a textpage of this page")

    hits = tp.search(text, quads=quads)
    if temporary:
        del tp  # release a textpage made here
    return hits
439
440
def search_page_for(
    doc: pymupdf.Document,
    pno: int,
    text: str,
    quads: bool = False,
    clip: rect_like = None,
    flags: int = pymupdf.TEXT_DEHYPHENATE
    | pymupdf.TEXT_PRESERVE_LIGATURES
    | pymupdf.TEXT_PRESERVE_WHITESPACE
    | pymupdf.TEXT_MEDIABOX_CLIP,
    textpage: pymupdf.TextPage = None,
) -> list:
    """Search for a string on a page given by its number.

    Convenience function: loads page 'pno' of 'doc' and calls its search_for
    method with the remaining arguments.

    Args:
        doc: the document
        pno: page number
        text: string to be searched for
        quads: (bool) return quads instead of rectangles
        clip: restrict search to this rectangle
        flags: bit switches, default: join hyphened words
        textpage: reuse a prepared textpage
    Returns:
        a list of rectangles or quads, each containing an occurrence.
    """
    target_page = doc[pno]
    return target_page.search_for(
        text,
        quads=quads,
        clip=clip,
        flags=flags,
        textpage=textpage,
    )
474
475
def get_text_blocks(
    page: pymupdf.Page,
    clip: rect_like = None,
    flags: OptInt = None,
    textpage: pymupdf.TextPage = None,
    sort: bool = False,
) -> list:
    """Return the text blocks on a page.

    Notes:
        Lines in a block are concatenated with line breaks.
    Args:
        page: the page to extract from.
        clip: (rect-like) only used when a new textpage is created.
        flags: (int) control the amount of data parsed into the textpage.
            Defaults to pymupdf.TEXTFLAGS_BLOCKS.
        textpage: a pre-created pymupdf.TextPage of this page.
        sort: (bool) sort the blocks by items [3] and [0] of each block
            (presumably the bottom and left coordinate of its rectangle).
    Returns:
        A list of the blocks. Each item contains the containing rectangle
        coordinates, text lines, running block number and block type.
    Raises:
        ValueError: if 'textpage' was not made from this page.
    """

    def block_order(block):
        """Sort key: item [3] first, then item [0]."""
        return block[3], block[0]

    pymupdf.CheckParent(page)
    if flags is None:
        flags = pymupdf.TEXTFLAGS_BLOCKS

    temporary = textpage is None
    if temporary:
        tp = page.get_textpage(clip=clip, flags=flags)
    else:
        tp = textpage
        if tp.parent != page:
            raise ValueError("not a textpage of this page")

    blocks = tp.extractBLOCKS()
    if temporary:
        del tp  # release a textpage made here
    if sort:
        blocks.sort(key=block_order)
    return blocks
508
509
def get_text_words(
    page: pymupdf.Page,
    clip: rect_like = None,
    flags: OptInt = None,
    textpage: pymupdf.TextPage = None,
    sort: bool = False,
    delimiters=None,
    tolerance=3,
) -> list:
    """Return the text words as a list with the bbox for each word.

    Args:
        page: pymupdf.Page
        clip: (rect-like) area on page to consider
        flags: (int) control the amount of data parsed into the textpage.
            Defaults to pymupdf.TEXTFLAGS_WORDS.
        textpage: (pymupdf.TextPage) either passed-in or None.
        sort: (bool) sort the words in reading sequence.
        delimiters: (str,list) characters to use as word delimiters.
        tolerance: (float) consider words to be part of the same line if
            top or bottom coordinate are not larger than this. Relevant
            only if sort=True.

    Returns:
        Word tuples (x0, y0, x1, y1, "word", bno, lno, wno).

    Raises:
        ValueError: if 'textpage' was not made from this page.
    """

    def left_edge(word):
        """Sort key: the x0 coordinate of a word tuple."""
        return word[0]

    def sort_words(words):
        """Sort words line-wise, forgiving small deviations."""
        words.sort(key=lambda w: (w[3], w[0]))  # by bottom, then left
        ordered = []  # final word list
        current = [words[0]]  # collects words roughly in same line
        line_box = pymupdf.Rect(words[0][:4])  # start the line rectangle
        for word in words[1:]:
            word_box = pymupdf.Rect(word[:4])
            same_line = (
                abs(word_box.y0 - line_box.y0) <= tolerance
                or abs(word_box.y1 - line_box.y1) <= tolerance
            )
            if same_line:
                current.append(word)
                line_box |= word_box
                continue
            # line is complete: sort its words left to right and output it
            current.sort(key=left_edge)
            ordered.extend(current)
            current = [word]  # start next line
            line_box = word_box  # start next line rect

        # also output the last line
        current.sort(key=left_edge)
        ordered.extend(current)
        return ordered

    pymupdf.CheckParent(page)
    if flags is None:
        flags = pymupdf.TEXTFLAGS_WORDS

    temporary = textpage is None
    if temporary:
        tp = page.get_textpage(clip=clip, flags=flags)
    else:
        tp = textpage
        if tp.parent != page:
            raise ValueError("not a textpage of this page")

    words = tp.extractWORDS(delimiters)

    if not temporary and clip is not None:
        # A textpage was given: sub-select the words of which at least half
        # of the bbox area is inside clip.
        clip = pymupdf.Rect(clip)
        words = [
            w for w in words if abs(clip & w[:4]) >= 0.5 * abs(pymupdf.Rect(w[:4]))
        ]

    if temporary:
        del tp  # release a textpage made here
    if words and sort:
        # advanced sort if any words found
        words = sort_words(words)

    return words
587
588
def get_sorted_text(
    page: pymupdf.Page,
    clip: rect_like = None,
    flags: OptInt = None,
    textpage: pymupdf.TextPage = None,
    tolerance=3,
) -> str:
    """Extract plain text avoiding unacceptable line breaks.

    Text contained in clip will be sorted in reading sequence. Some effort
    is also spent to simulate layout vertically and horizontally.

    Args:
        page: pymupdf.Page
        clip: (rect-like) only consider text inside
        flags: (int) text extraction flags
        textpage: pymupdf.TextPage
        tolerance: (float) consider words to be on the same line if their top
            or bottom coordinates do not differ more than this.

    Notes:
        If a TextPage is provided, all text is checked for being inside clip
        with at least 50% of its bbox.
        This allows to use some "global" TextPage in conjunction with sub-
        selecting words in parts of the defined TextPage rectangle.

    Returns:
        A text string in reading sequence. Left indentation of each line,
        inter-line and inter-word distances strive to reflect the layout.
        Consecutive lines are always separated by at least one and by at
        most six line breaks. An empty string if no words were found.
    """

    def line_text(clip, line):
        """Create the string of one text line.

        We are trying to simulate some horizontal layout here, too.

        Args:
            clip: (pymupdf.Rect) the area from which all text is being read.
            line: (list) word tuples (rect, text) contained in the line
        Returns:
            Text in this line. Generated from words in 'line'. Distance from
            predecessor is translated to multiple spaces, thus simulating
            text indentations and large horizontal distances.
        """
        line.sort(key=lambda w: w[0].x0)
        parts = []  # pieces of the line text
        x1 = clip.x0  # end coordinate of the text so far
        for r, t in line:
            # At least one space between two words, except for the first
            # word of the line or a word starting left of its predecessor's
            # end.
            min_spaces = 0 if (x1 == clip.x0 or r.x0 <= x1) else 1
            # Convert the distance to the previous word to a number of spaces
            # using the word's average character width. A word with a
            # zero-width bbox offers no such estimate.
            if r.width > 0:
                spaces = int(round((r.x0 - x1) / r.width * len(t)))
            else:
                spaces = 0
            parts.append(" " * max(spaces, min_spaces) + t)
            x1 = r.x1  # update new end position
        return "".join(parts)

    # Extract words in correct sequence first.
    words = [
        (pymupdf.Rect(w[:4]), w[4])
        for w in get_text_words(
            page,
            clip=clip,
            flags=flags,
            textpage=textpage,
            sort=True,
            tolerance=tolerance,
        )
    ]

    if not words:  # no text present
        return ""
    totalbox = pymupdf.EMPTY_RECT()  # area covering all text
    for wr, _ in words:
        totalbox |= wr

    lines = []  # list of reconstituted lines
    line = [words[0]]  # current line
    lrect = words[0][0]  # the line's rectangle

    # walk through the words
    for wr, text in words[1:]:  # start with second word
        # if this word matches top or bottom of the line, append it
        if abs(lrect.y0 - wr.y0) <= tolerance or abs(lrect.y1 - wr.y1) <= tolerance:
            line.append((wr, text))
            lrect |= wr
        else:
            # output current line and re-initialize
            lines.append((lrect, line_text(totalbox, line)))
            line = [(wr, text)]
            lrect = wr

    # also append unfinished last line
    lines.append((lrect, line_text(totalbox, line)))

    # sort all lines vertically
    lines.sort(key=lambda l: (l[0].y1))

    pieces = [lines[0][1]]  # text of first line
    y1 = lines[0][0].y1  # its bottom coordinate
    for lrect, ltext in lines[1:]:
        height = lrect.height
        # vertical gap to the previous line, measured in line heights
        gap = int(round((lrect.y0 - y1) / height)) if height > 0 else 0
        # Never more than 5 empty lines. Never a negative count either:
        # vertically overlapping lines would otherwise be joined without any
        # separator.
        distance = max(0, min(gap, 5))
        pieces.append("\n" * (distance + 1))
        pieces.append(ltext)
        y1 = lrect.y1

    # return text in clip
    return "".join(pieces)
704
705
def get_textbox(
    page: pymupdf.Page,
    rect: rect_like,
    textpage: pymupdf.TextPage = None,
) -> str:
    """Return the result of TextPage.extractTextbox for a rectangle.

    Args:
        page: the page to extract from.
        rect: (rect-like) handed to TextPage.extractTextbox.
        textpage: a pre-created pymupdf.TextPage of this page. If None, a
            textpage is made with default arguments and dropped afterwards.
    Returns:
        Whatever TextPage.extractTextbox(rect) returns.
    Raises:
        ValueError: if 'textpage' was not made from this page.
    """
    temporary = textpage is None
    if temporary:
        tp = page.get_textpage()
    else:
        tp = textpage
        if tp.parent != page:
            raise ValueError("not a textpage of this page")

    result = tp.extractTextbox(rect)
    if temporary:
        del tp  # release a textpage made here
    return result
720
721
def get_text_selection(
    page: pymupdf.Page,
    p1: point_like,
    p2: point_like,
    clip: rect_like = None,
    textpage: pymupdf.TextPage = None,
):
    """Return the result of TextPage.extractSelection for two points.

    Args:
        page: the page to extract from.
        p1: (point-like) first argument of TextPage.extractSelection.
        p2: (point-like) second argument of TextPage.extractSelection.
        clip: (rect-like) only used when a new textpage is created.
        textpage: a pre-created pymupdf.TextPage of this page. If None, a
            textpage is made with flag pymupdf.TEXT_DEHYPHENATE and dropped
            afterwards.
    Returns:
        Whatever TextPage.extractSelection(p1, p2) returns.
    Raises:
        ValueError: if 'textpage' was not made from this page.
    """
    pymupdf.CheckParent(page)
    temporary = textpage is None
    if temporary:
        tp = page.get_textpage(clip=clip, flags=pymupdf.TEXT_DEHYPHENATE)
    else:
        tp = textpage
        if tp.parent != page:
            raise ValueError("not a textpage of this page")

    selection = tp.extractSelection(p1, p2)
    if temporary:
        del tp  # release a textpage made here
    return selection
739
740
def get_textpage_ocr(
    page: pymupdf.Page,
    flags: int = 0,
    language: str = "eng",
    dpi: int = 72,
    full: bool = False,
    tessdata: str = None,
) -> pymupdf.TextPage:
    """Create a Textpage from combined results of normal and OCR text parsing.

    Args:
        page: the page to process.
        flags: (int) control content becoming part of the result.
        language: (str) specify expected language(s). Default is "eng" (English).
        dpi: (int) resolution in dpi, default 72. Used for full page OCR.
        full: (bool) whether to OCR the full page image, or only its images (default)
        tessdata: (str) resolved via pymupdf.get_tessdata() and handed to the
            OCR calls.
    Returns:
        A pymupdf.TextPage. If OCR of one of the page's images fails, the
        result of a full page OCR is returned instead.
    """
    pymupdf.CheckParent(page)
    tessdata = pymupdf.get_tessdata(tessdata)

    def full_ocr(page, dpi, language, flags):
        """OCR a pixmap of the whole page and return the resulting textpage."""
        zoom = dpi / 72
        pix = page.get_pixmap(matrix=pymupdf.Matrix(zoom, zoom))
        ocr_bytes = pix.pdfocr_tobytes(
            compress=False,
            language=language,
            tessdata=tessdata,
        )
        ocr_pdf = pymupdf.Document("pdf", ocr_bytes)
        ocr_page = ocr_pdf.load_page(0)
        # scale the OCR page back to the width of the original page
        unzoom = page.rect.width / ocr_page.rect.width
        ctm = pymupdf.Matrix(unzoom, unzoom) * page.derotation_matrix
        ocr_textpage = ocr_page.get_textpage(flags=flags, matrix=ctm)
        ocr_pdf.close()
        pix = None
        # make the textpage refer to the original page
        ocr_textpage.parent = weakref.proxy(page)
        return ocr_textpage

    # if OCR for the full page, OCR its pixmap @ desired dpi
    if full:
        return full_ocr(page, dpi, language, flags)

    # For partial OCR, make a normal textpage, then extend it with text that
    # is OCRed from each image.
    # Because of this, we need the images flag bit set ON.
    tpage = page.get_textpage(flags=flags)
    page_dict = page.get_text("dict", flags=pymupdf.TEXT_PRESERVE_IMAGES)
    for block in page_dict["blocks"]:
        if block["type"] != 1:  # only look at images
            continue
        bbox = pymupdf.Rect(block["bbox"])
        if bbox.width <= 3 or bbox.height <= 3:  # ignore tiny stuff
            continue
        try:
            pix = pymupdf.Pixmap(block["image"])  # get image pixmap
            if pix.n - pix.alpha != 3:  # we need to convert this to RGB!
                pix = pymupdf.Pixmap(pymupdf.csRGB, pix)
            if pix.alpha:  # must remove alpha channel
                pix = pymupdf.Pixmap(pix, 0)
            ocr_bytes = pix.pdfocr_tobytes(language=language, tessdata=tessdata)
            imgdoc = pymupdf.Document("pdf", ocr_bytes)  # pdf with OCRed page
            imgpage = imgdoc.load_page(0)  # read image as a page
            pix = None
            # compute matrix to transform coordinates back to that of 'page'
            imgrect = imgpage.rect  # page size of image PDF
            shrink = pymupdf.Matrix(1 / imgrect.width, 1 / imgrect.height)
            imgpage.extend_textpage(tpage, flags=0, matrix=shrink * block["transform"])
            imgdoc.close()
        except (RuntimeError, mupdf.FzErrorBase):
            # No exception info is shown here because this can happen in
            # normal operation (see test_3842b).
            tpage = None
            pymupdf.message("Falling back to full page OCR")
            return full_ocr(page, dpi, language, flags)

    return tpage
823
824
def get_image_info(page: pymupdf.Page, hashes: bool = False, xrefs: bool = False) -> list:
    """Extract image information only from a pymupdf.TextPage.

    The information extracted with hashes is cached in 'page._image_info'
    and reused by later calls as long as it is not empty.

    Args:
        page: the page to inspect.
        hashes: (bool) include MD5 hash for each image.
        xrefs: (bool) try to find the xref for each image. Sets hashes to true.
            Ignored if the document is no PDF.
    Returns:
        A list of image information items. If xrefs are requested, each item
        gets a key "xref", which is 0 if no image with the same digest is
        found in the page's image list.
    """
    doc = page.parent
    if not doc.is_pdf:
        xrefs = False  # xrefs exist in PDFs only
    elif xrefs:
        hashes = True  # digests are needed to look up the xrefs

    imginfo = getattr(page, "_image_info", None)
    if imginfo and not xrefs:
        return imginfo  # cached information is sufficient

    if not imginfo:
        tp = page.get_textpage(flags=pymupdf.TEXT_PRESERVE_IMAGES)
        imginfo = tp.extractIMGINFO(hashes=hashes)
        del tp
        if hashes:
            page._image_info = imginfo
    if not xrefs:
        return imginfo

    # map the digest of each image of the page to its xref
    digests = {}
    for image in page.get_images():
        image_xref = image[0]
        pix = pymupdf.Pixmap(doc, image_xref)
        digests[pix.digest] = image_xref
        del pix

    # store the xref (or 0) in each information item
    for info in imginfo:
        info["xref"] = digests.get(info["digest"], 0)
    return imginfo
861
862
def get_image_rects(page: pymupdf.Page, name, transform=False) -> list:
    """Return list of image positions on a page.

    The image is identified via its digest: all image information items of
    the page with the same digest are reported.

    Args:
        page: the page showing the image.
        name: (str, list, int) image identification. May be reference name, an
            item of the page's image list or an xref.
        transform: (bool) whether to also return the transformation matrix.
    Returns:
        A list of pymupdf.Rect objects or tuples of (pymupdf.Rect, pymupdf.Matrix)
        for all image locations on the page.
    Raises:
        ValueError: if a reference name matches none or more than one image
            of the page.
    """
    name_type = type(name)
    if name_type in (list, tuple):  # item of the page's image list
        xref = name[0]
    elif name_type is int:  # already an xref
        xref = name
    else:  # reference name: look it up in the page's image list
        candidates = [img for img in page.get_images() if img[7] == name]
        if not candidates:
            raise ValueError("bad image name")
        if len(candidates) != 1:
            raise ValueError("multiple image names found")
        xref = candidates[0][0]

    pix = pymupdf.Pixmap(page.parent, xref)  # make pixmap of the image to compute MD5
    digest = pix.digest
    del pix

    matches = [im for im in page.get_image_info(hashes=True) if im["digest"] == digest]
    if transform:
        return [
            (pymupdf.Rect(im["bbox"]), pymupdf.Matrix(im["transform"]))
            for im in matches
        ]
    return [pymupdf.Rect(im["bbox"]) for im in matches]
898
899
def get_text(
    page: pymupdf.Page,
    option: str = "text",
    *,
    clip: rect_like = None,
    flags: OptInt = None,
    textpage: pymupdf.TextPage = None,
    sort: bool = False,
    delimiters=None,
    tolerance=3,
):
    """Extract text from a page or an annotation.

    This is a unifying wrapper for various methods of the pymupdf.TextPage class.

    Args:
        page: the page to extract from.
        option: (str) text, words, blocks, html, dict, json, rawdict, rawjson,
            xhtml or xml. Case is ignored.
        clip: (rect-like) restrict output to this area. Replaced by the
            page's cropbox for html, xml and xhtml output.
        flags: bit switches to e.g. exclude images or decompose ligatures.
            If None, a default depending on 'option' is used.
        textpage: reuse this pymupdf.TextPage and make no new one. If specified,
            'flags' and 'clip' are not used for creating a textpage.
        sort: (bool) handed to the extraction method of the chosen option
            (not used for html, xml and xhtml).
        delimiters: (str,list) characters to use as word delimiters. Used for
            option "words" only.
        tolerance: (float) consider words to be on the same line if their top
            or bottom coordinates do not differ more than this. Used for
            options "words" and "text", and only if sort=True.

    Returns:
        the output of methods get_text_words / get_text_blocks or pymupdf.TextPage
        methods extractText, extractHTML, extractDICT, extractJSON, extractRAWDICT,
        extractRAWJSON, extractXHTML or extractXML respectively.

    Raises:
        AssertionError: for an unknown 'option'. When assertions are disabled
            (python -O), "text" is silently used instead.
        ValueError: if 'textpage' was not made from this page.
    """
    formats = {
        "text": pymupdf.TEXTFLAGS_TEXT,
        "html": pymupdf.TEXTFLAGS_HTML,
        "json": pymupdf.TEXTFLAGS_DICT,
        "rawjson": pymupdf.TEXTFLAGS_RAWDICT,
        "xml": pymupdf.TEXTFLAGS_XML,
        "xhtml": pymupdf.TEXTFLAGS_XHTML,
        "dict": pymupdf.TEXTFLAGS_DICT,
        "rawdict": pymupdf.TEXTFLAGS_RAWDICT,
        "words": pymupdf.TEXTFLAGS_WORDS,
        "blocks": pymupdf.TEXTFLAGS_BLOCKS,
    }
    option = option.lower()
    assert option in formats
    # Only reachable when assertions are disabled.
    if option not in formats:
        option = "text"
    if flags is None:
        flags = formats[option]

    if option == "words":
        return get_text_words(
            page,
            clip=clip,
            flags=flags,
            textpage=textpage,
            sort=sort,
            delimiters=delimiters,
            # forward the caller's value instead of silently using the default
            tolerance=tolerance,
        )
    if option == "blocks":
        return get_text_blocks(
            page, clip=clip, flags=flags, textpage=textpage, sort=sort
        )

    if option == "text" and sort:
        return get_sorted_text(
            page,
            clip=clip,
            flags=flags,
            textpage=textpage,
            tolerance=tolerance,
        )

    pymupdf.CheckParent(page)
    cb = None
    if option in ("html", "xml", "xhtml"):  # no clipping for MuPDF functions
        clip = page.cropbox
    if clip is not None:
        clip = pymupdf.Rect(clip)
        cb = None
    elif type(page) is pymupdf.Page:
        cb = page.cropbox

    # pymupdf.TextPage with or without images
    tp = textpage
    if tp is None:
        tp = page.get_textpage(clip=clip, flags=flags)
    elif getattr(tp, "parent") != page:
        raise ValueError("not a textpage of this page")

    if option == "json":
        t = tp.extractJSON(cb=cb, sort=sort)
    elif option == "rawjson":
        t = tp.extractRAWJSON(cb=cb, sort=sort)
    elif option == "dict":
        t = tp.extractDICT(cb=cb, sort=sort)
    elif option == "rawdict":
        t = tp.extractRAWDICT(cb=cb, sort=sort)
    elif option == "html":
        t = tp.extractHTML()
    elif option == "xml":
        t = tp.extractXML()
    elif option == "xhtml":
        t = tp.extractXHTML()
    else:
        t = tp.extractText(sort=sort)

    if textpage is None:
        del tp  # release a textpage made here
    return t
1007
1008
def get_page_text(
    doc: pymupdf.Document,
    pno: int,
    option: str = "text",
    clip: rect_like = None,
    flags: OptInt = None,
    textpage: pymupdf.TextPage = None,
    sort: bool = False,
) -> typing.Any:
    """Extract a document page's text by page number.

    Notes:
        Convenience function calling page.get_text().
        NOTE(review): 'textpage' is accepted but not handed on to
        page.get_text() - confirm whether this is intended.
    Args:
        doc: the document
        pno: page number
        option: (str) text, words, blocks, html, dict, json, rawdict, xhtml or xml.
        clip: (rect-like) handed on to page.get_text().
        flags: (int) handed on to page.get_text().
        textpage: currently unused.
        sort: (bool) handed on to page.get_text().
    Returns:
        output of page.get_text() for the given option.
    """
    target_page = doc[pno]
    return target_page.get_text(option, clip=clip, flags=flags, sort=sort)
1029
def get_pixmap(
    page: pymupdf.Page,
    *,
    matrix: matrix_like=pymupdf.Identity,
    dpi=None,
    colorspace: pymupdf.Colorspace=pymupdf.csRGB,
    clip: rect_like=None,
    alpha: bool=False,
    annots: bool=True,
) -> pymupdf.Pixmap:
    """Render a page into a pixmap.

    Keyword args:
        matrix: transformation matrix (default: Identity).
        dpi: desired dots per inch. A truthy value replaces 'matrix' by a
            uniform zoom of dpi / 72 and is also stored in the pixmap.
        colorspace: (str/Colorspace) "gray", "cmyk" or any other string
            (treated as RGB) - case ignored - or a Colorspace. Default csRGB.
        clip: (irect-like) restrict rendering to this area.
        alpha: (bool) whether to include an alpha channel.
        annots: (bool) whether to also render annotations.
    Raises:
        ValueError: if the colorspace does not have 1, 3 or 4 components.
    """
    use_dpi = bool(dpi)
    if use_dpi:
        scale = dpi / 72
        matrix = pymupdf.Matrix(scale, scale)

    if type(colorspace) is str:
        # Translate a colorspace name; unknown names fall back to RGB.
        cs_name = colorspace.upper()
        if cs_name == "GRAY":
            colorspace = pymupdf.csGRAY
        elif cs_name == "CMYK":
            colorspace = pymupdf.csCMYK
        else:
            colorspace = pymupdf.csRGB
    if colorspace.n not in (1, 3, 4):
        raise ValueError("unsupported colorspace")

    # The display list is only a temporary; it is dropped as soon as the
    # pixmap has been rendered from it.
    pix = page.get_displaylist(annots=annots).get_pixmap(
        matrix=matrix,
        colorspace=colorspace,
        alpha=alpha,
        clip=clip,
    )
    if use_dpi:
        pix.set_dpi(dpi, dpi)
    return pix
1070
1071
def get_page_pixmap(
    doc: pymupdf.Document,
    pno: int,
    *,
    matrix: matrix_like = pymupdf.Identity,
    dpi=None,
    colorspace: pymupdf.Colorspace = pymupdf.csRGB,
    clip: rect_like = None,
    alpha: bool = False,
    annots: bool = True,
) -> pymupdf.Pixmap:
    """Create pixmap of a document page selected by its page number.

    Notes:
        Convenience function: loads page 'pno' and calls its get_pixmap
        with all keyword arguments passed through unchanged.
    Args:
        pno: (int) page number
        matrix: pymupdf.Matrix for transformation (default: pymupdf.Identity).
        dpi: desired dots per inch, passed through to page.get_pixmap.
        colorspace: (str,pymupdf.Colorspace) cmyk, rgb, gray - case ignored, default csRGB.
        clip: (irect-like) restrict rendering to this area.
        alpha: (bool) include alpha channel
        annots: (bool) also render annotations
    """
    page = doc[pno]
    render_options = {
        "matrix": matrix,
        "dpi": dpi,
        "colorspace": colorspace,
        "clip": clip,
        "alpha": alpha,
        "annots": annots,
    }
    return page.get_pixmap(**render_options)
1102
1103
def getLinkDict(ln, document=None) -> dict:
    """Convert an outline item or a link into a link dictionary.

    Args:
        ln: a pymupdf.Outline or a pymupdf.Link.
        document: passed to ln.destination() when 'ln' is an Outline.
    Returns:
        A dict that always has the keys "kind" and "xref" (the latter set
        to 0 here), "from" if 'ln' has a 'rect' attribute, plus further
        keys that depend on the kind of the destination.
    """
    if isinstance(ln, pymupdf.Outline):
        dest = ln.destination(document)
    elif isinstance(ln, pymupdf.Link):
        dest = ln.dest
    else:
        assert 0, f'Unexpected {type(ln)=}.'

    link = {"kind": dest.kind, "xref": 0}
    try:
        if hasattr(ln, 'rect'):
            link["from"] = ln.rect
    except Exception:
        # This seems to happen quite often in PyMuPDF/tests.
        if g_exceptions_verbose >= 2:
            pymupdf.exception_info()

    # Target point: only coordinates flagged as valid are taken over.
    target = pymupdf.Point(0, 0)
    if dest.flags & pymupdf.LINK_FLAG_L_VALID:
        target.x = dest.lt.x
    if dest.flags & pymupdf.LINK_FLAG_T_VALID:
        target.y = dest.lt.y

    kind = dest.kind
    if kind == pymupdf.LINK_URI:
        link["uri"] = dest.uri
        return link

    if kind == pymupdf.LINK_GOTO:
        link["page"] = dest.page
        link["to"] = target
        link["zoom"] = dest.rb.x if dest.flags & pymupdf.LINK_FLAG_R_IS_ZOOM else 0.0
        return link

    if kind == pymupdf.LINK_GOTOR:
        link["file"] = dest.file_spec.replace("\\", "/")
        link["page"] = dest.page
        if dest.page < 0:
            # negative page number: use the destination's 'dest' value
            link["to"] = dest.dest
        else:
            link["to"] = target
            link["zoom"] = dest.rb.x if dest.flags & pymupdf.LINK_FLAG_R_IS_ZOOM else 0.0
        return link

    if kind == pymupdf.LINK_LAUNCH:
        link["file"] = dest.file_spec.replace("\\", "/")
        return link

    if kind == pymupdf.LINK_NAMED:
        # The dicts should not have same key(s).
        assert not (dest.named.keys() & link.keys())
        link.update(dest.named)
        if 'to' in link:
            link['to'] = pymupdf.Point(link['to'])
        return link

    link["page"] = dest.page
    return link
1161
1162
def get_links(page: pymupdf.Page) -> list:
    """Return the links of a page as a list of link dictionaries.

    Notes:
        see PyMuPDF documentation for details.
    """
    pymupdf.CheckParent(page)

    # walk the chain of link objects, converting each to a dictionary
    links = []
    link = page.first_link
    while link:
        links.append(getLinkDict(link, page.parent))
        link = link.next

    if not links or not page.parent.is_pdf:
        return links

    # PDF only: add xref and id, but only if the page's link annotations
    # and the links found above match in number
    link_annots = [
        entry
        for entry in pymupdf.JM_get_annot_xref_list2(page)
        if entry[1] == pymupdf.PDF_ANNOT_LINK  # pylint: disable=no-member
    ]
    if len(link_annots) == len(links):
        for link_dict, entry in zip(links, link_annots):
            link_dict["xref"] = entry[0]
            link_dict["id"] = entry[2]
    return links
1188
1189
def get_toc(
    doc: pymupdf.Document,
    simple: bool = True,
) -> list:
    """Create a table of contents.

    Args:
        simple: a bool to control output. Returns a list, where each entry
            consists of outline level, title, page number and link
            destination (if simple = False). For details see PyMuPDF's
            documentation.
    Raises:
        ValueError: if the document is closed.
    """
    # ensure document is open
    if doc.is_closed:
        raise ValueError("document closed")
    doc.init_doc()
    first_item = doc.outline
    if not first_item:
        return []

    toc = []

    def walk(item, level):
        """Record 'item' and its siblings, descending into sub-items first."""
        while item and item.this.m_internal:
            title = item.title or " "  # empty titles become one space

            if item.is_external or not item.uri:
                page = -1
            elif item.page == -1:
                # page not known on the item: resolve it via the URI
                page = doc.resolve_link(item.uri)[0] + 1
            else:
                page = item.page + 1

            entry = [level, title, page]
            if not simple:
                entry.append(getLinkDict(item, doc))
            toc.append(entry)

            if item.down:
                walk(item.down, level + 1)
            item = item.next

    walk(first_item, 1)
    if doc.is_pdf and not simple:
        doc._extend_toc_items(toc)
    return toc
1243
1244
def del_toc_item(
    doc: pymupdf.Document,
    idx: int,
) -> None:
    """Delete TOC / bookmark item by index.

    Args:
        idx: (int) index into the list returned by doc.get_outline_xrefs().
    """
    outline_xrefs = doc.get_outline_xrefs()
    doc._remove_toc_item(outline_xrefs[idx])
1252
1253
def set_toc_item(
    doc: pymupdf.Document,
    idx: int,
    dest_dict: OptDict = None,
    kind: OptInt = None,
    pno: OptInt = None,
    uri: OptStr = None,
    title: OptStr = None,
    to: point_like = None,
    filename: OptStr = None,
    zoom: float = 0,
) -> None:
    """Update TOC item by index.

    It allows changing the item's title and link destination.

    Args:
        idx:
            (int) desired index of the TOC list, as created by get_toc.
        dest_dict:
            (dict) destination dictionary as created by get_toc(False).
            Overrides all other parameters except 'title'. If None, the
            remaining parameters are used to make a dest dictionary.
            The dictionary is not modified.
        kind:
            (int) kind of link (pymupdf.LINK_GOTO, etc.). If None, then only
            the title will be updated. If pymupdf.LINK_NONE, the TOC item will
            be deleted.
        pno:
            (int) page number (1-based like in get_toc). Required if
            pymupdf.LINK_GOTO.
        uri:
            (str) the URL, required if pymupdf.LINK_URI.
        title:
            (str) the new title. No change if None.
        to:
            (point-like) destination on the target page. If omitted, (72, 36)
            will be used as target coordinates.
        filename:
            (str) destination filename, required for pymupdf.LINK_GOTOR and
            pymupdf.LINK_LAUNCH.
        zoom:
            (float) a zoom factor for the target location (pymupdf.LINK_GOTO).
    Raises:
        ValueError: for a bad page number, a bad color value, or if no valid
            PDF action can be built from the destination.
    """
    xref = doc.get_outline_xrefs()[idx]
    page_xref = 0
    if type(dest_dict) is dict:
        # Work on a shallow copy: the caller's dictionary must stay untouched,
        # otherwise passing the same dict twice would flip the target twice.
        dest_dict = dest_dict.copy()
        if dest_dict["kind"] == pymupdf.LINK_GOTO:
            pno = dest_dict["page"]
            page_xref = doc.page_xref(pno)
            page_height = doc.page_cropbox(pno).height
            # Point(...) makes a fresh object and accepts any point-like,
            # so neither the caller's Point nor a plain tuple is a problem.
            to = pymupdf.Point(dest_dict.get("to", (72, 36)))
            to.y = page_height - to.y  # flip the vertical coordinate
            dest_dict["to"] = to
        action = getDestStr(page_xref, dest_dict)
        if not action.startswith("/A"):
            raise ValueError("bad bookmark dest")
        color = dest_dict.get("color")
        if color:
            color = list(map(float, color))
            if len(color) != 3 or min(color) < 0 or max(color) > 1:
                raise ValueError("bad color value")
        bold = dest_dict.get("bold", False)
        italic = dest_dict.get("italic", False)
        flags = italic + 2 * bold
        collapse = dest_dict.get("collapse")
        return doc._update_toc_item(
            xref,
            action=action[2:],  # strip the leading "/A"
            title=title,
            color=color,
            flags=flags,
            collapse=collapse,
        )

    if kind == pymupdf.LINK_NONE:  # delete bookmark item
        return doc.del_toc_item(idx)
    if kind is None and title is None:  # treat as no-op
        return None
    if kind is None:  # only update title text
        return doc._update_toc_item(xref, action=None, title=title)

    if kind == pymupdf.LINK_GOTO:
        if pno is None or pno not in range(1, doc.page_count + 1):
            raise ValueError("bad page number")
        page_xref = doc.page_xref(pno - 1)
        page_height = doc.page_cropbox(pno - 1).height
        if to is None:
            to = pymupdf.Point(72, page_height - 36)
        else:
            to = pymupdf.Point(to)
            to.y = page_height - to.y  # flip the vertical coordinate

    ddict = {
        "kind": kind,
        "to": to,
        "uri": uri,
        "page": pno,
        "file": filename,
        "zoom": zoom,
    }
    action = getDestStr(page_xref, ddict)
    # an empty string (no action) fails this test as well
    if not action.startswith("/A"):
        raise ValueError("bad bookmark dest")

    return doc._update_toc_item(xref, action=action[2:], title=title)
1361
1362
def get_area(*args) -> float:
    """Calculate area of rectangle.

    The first argument is the rectangle (its 'width' and 'height' are used).
    The optional second argument is the unit: one of 'px' (default), 'in',
    'cm', or 'mm'.
    """
    rect = args[0]
    unit = args[1] if len(args) > 1 else "px"
    # per unit: (numerator, denominator) of the linear conversion factor
    factors = {
        "px": (1, 1),
        "in": (1.0, 72.0),
        "cm": (2.54, 72.0),
        "mm": (25.4, 72.0),
    }
    numerator, denominator = factors[unit]
    f = (numerator / denominator) ** 2  # squared, because this is an area
    return f * rect.width * rect.height
1373
1374
def set_metadata(doc: pymupdf.Document, m: dict = None) -> None:
    """Update the PDF /Info object.

    Args:
        m: a dictionary like doc.metadata. None or an empty dict removes
            the reference to an existing /Info object.
    Raises:
        ValueError: if the document is no PDF, is closed or encrypted, if
            'm' is not a dict, or if it contains unknown keys.
    """
    if not doc.is_pdf:
        raise ValueError("is no PDF")
    if doc.is_closed or doc.is_encrypted:
        raise ValueError("document closed or encrypted")
    if m is None:
        m = {}
    elif type(m) is not dict:
        raise ValueError("bad metadata")

    # metadata key -> key in the PDF /Info dictionary (None: not stored)
    keymap = {
        "author": "Author",
        "producer": "Producer",
        "creator": "Creator",
        "title": "Title",
        "format": None,
        "encryption": None,
        "creationDate": "CreationDate",
        "modDate": "ModDate",
        "subject": "Subject",
        "keywords": "Keywords",
        "trapped": "Trapped",
    }
    unknown = set(m.keys()).difference(set(keymap.keys()))
    if unknown:
        raise ValueError("bad dict key(s): %s" % unknown)

    # locate the /Info object via the PDF trailer
    key_type, key_value = doc.xref_get_key(-1, "Info")
    info_xref = int(key_value.replace("0 R", "")) if key_type == "xref" else 0

    if not m:
        if info_xref != 0:  # remove existing metadata
            doc.xref_set_key(-1, "Info", "null")
            doc.init_doc()
        return  # otherwise: nothing to do

    if info_xref == 0:  # no prev metadata: get new xref
        info_xref = doc.get_new_xref()
        doc.update_object(info_xref, "<<>>")  # fill it with empty object
        doc.xref_set_key(-1, "Info", "%i 0 R" % info_xref)

    storable = [(keymap[k], v) for k, v in m.items() if keymap[k] is not None]
    for pdf_key, val in storable:
        if not bool(val) or val in ("none", "null"):
            pdf_val = "null"
        else:
            pdf_val = pymupdf.get_pdf_str(val)
        doc.xref_set_key(info_xref, pdf_key, pdf_val)
    doc.init_doc()
    return
1435
1436
def getDestStr(xref: int, ddict: dict) -> str:
    """Calculate the PDF action string.

    Notes:
        Supports Link annotations and outline items (bookmarks).
    Args:
        xref: xref number used as the target of a GoTo action.
        ddict: destination dictionary, or a number. A number is used as
            the second coordinate of a GoTo target.
    Returns:
        The action string starting with "/A", or "" if 'ddict' is empty,
        its kind is missing or pymupdf.LINK_NONE, or no case below applies.
    """
    if not ddict:
        return ""

    if type(ddict) in (int, float):
        coords = _format_g((0, ddict, 0))
        return f"/A<</S/GoTo/D[{xref} 0 R/XYZ {coords}]>>"

    kind = ddict.get("kind", pymupdf.LINK_NONE)
    if kind == pymupdf.LINK_NONE:
        return ""

    if kind == pymupdf.LINK_GOTO:
        zoom = ddict.get("zoom", 0)
        left, top = ddict.get("to", pymupdf.Point(0, 0))
        coords = _format_g((left, top, zoom))
        return f"/A<</S/GoTo/D[{xref} 0 R/XYZ {coords}]>>"

    if kind == pymupdf.LINK_URI:
        uri = pymupdf.get_pdf_str(ddict["uri"])
        return f"/A<</S/URI/URI{uri}>>"

    if kind == pymupdf.LINK_LAUNCH:
        fspec = pymupdf.get_pdf_str(ddict["file"])
        return f"/A<</S/Launch/F<</F{fspec}/UF{fspec}/Type/Filespec>>>>"

    if kind == pymupdf.LINK_GOTOR:
        if ddict["page"] < 0:
            # negative page: "to" is written as a PDF string
            fspec = pymupdf.get_pdf_str(ddict["file"])
            target = pymupdf.get_pdf_str(ddict["to"])
            return f"/A<</S/GoToR/D{target}/F<</F{fspec}/UF{fspec}/Type/Filespec>>>>"
        if ddict["page"] >= 0:
            fspec = pymupdf.get_pdf_str(ddict["file"])
            page = ddict["page"]
            left = ddict["to"].x
            top = ddict["to"].y
            zoom = ddict["zoom"]
            coords = _format_g((left, top, zoom))
            return f"/A<</S/GoToR/D[{page} /XYZ {coords}]/F<</F{fspec}/UF{fspec}/Type/Filespec>>>>"

    return ""
1493
1494
def set_toc(
    doc: pymupdf.Document,
    toc: list,
    collapse: int = 1,
) -> int:
    """Create new outline tree (table of contents, TOC).

    Args:
        toc: (list, tuple) each entry must contain level, title, page and
            optionally top margin on the page. None or '()' remove the TOC.
        collapse: (int) collapses entries beyond this level. Zero or None
            shows all entries unfolded.
    Returns:
        the number of inserted items, or the number of removed items respectively.
    Raises:
        ValueError: if the document is closed, encrypted or no PDF, or if
            'toc' fails the validity checks. All checks are done before the
            existing TOC is deleted.
    """
    if doc.is_closed or doc.is_encrypted:
        raise ValueError("document closed or encrypted")
    if not doc.is_pdf:
        raise ValueError("is no PDF")
    if not toc:  # remove all entries
        return len(doc._delToC())

    # validity checks --------------------------------------------------------
    if type(toc) not in (list, tuple):
        raise ValueError("'toc' must be list or tuple")
    toclen = len(toc)
    page_count = doc.page_count
    t0 = toc[0]
    # Row 0 needs the same shape check as all other rows. Without it, a too
    # short first row would only fail after the old TOC has been deleted.
    if type(t0) not in (list, tuple) or len(t0) not in (3, 4):
        raise ValueError("items must be sequences of 3 or 4 items")
    if t0[0] != 1:
        raise ValueError("hierarchy level of item 0 must be 1")
    # NOTE: the page number of the last row is not range-checked here; page
    # numbers are clamped to the valid range when the items are built below.
    for i in range(toclen - 1):
        t1 = toc[i]
        t2 = toc[i + 1]
        if not -1 <= t1[2] <= page_count:
            raise ValueError("row %i: page number out of range" % i)
        if (type(t2) not in (list, tuple)) or len(t2) not in (3, 4):
            raise ValueError("bad row %i" % (i + 1))
        if (type(t2[0]) is not int) or t2[0] < 1:
            raise ValueError("bad hierarchy level in row %i" % (i + 1))
        if t2[0] > t1[0] + 1:
            raise ValueError("bad hierarchy level in row %i" % (i + 1))
    # no formal errors in toc --------------------------------------------------

    # --------------------------------------------------------------------------
    # make a list of xref numbers, which we can use for our TOC entries
    # --------------------------------------------------------------------------
    # Delete the old outlines. The xref numbers returned by _delToC() are not
    # reused: every new entry gets a fresh xref.
    doc._delToC()

    # entry zero is the outline root xref number, followed by one per item
    xref = [doc._getOLRootNumber()]
    for _ in range(toclen):
        xref.append(doc.get_new_xref())

    lvltab = {0: 0}  # to store last entry per hierarchy level

    # ------------------------------------------------------------------------------
    # contains new outline objects as strings - first one is the outline root
    # ------------------------------------------------------------------------------
    olitems = [{"count": 0, "first": -1, "last": -1, "xref": xref[0]}]
    # ------------------------------------------------------------------------------
    # build olitems as a list of PDF-like connected dictionaries
    # ------------------------------------------------------------------------------
    for i in range(toclen):
        o = toc[i]
        lvl = o[0]  # level
        title = pymupdf.get_pdf_str(o[1])  # title
        pno = min(doc.page_count - 1, max(0, o[2] - 1))  # page number
        page_xref = doc.page_xref(pno)
        page_height = doc.page_cropbox(pno).height
        top = pymupdf.Point(72, page_height - 36)
        dest_dict = {"to": top, "kind": pymupdf.LINK_GOTO}  # fall back target
        if o[2] < 0:
            dest_dict["kind"] = pymupdf.LINK_NONE
        if len(o) > 3:  # some target is specified
            if type(o[3]) in (int, float):  # convert a number to a point
                dest_dict["to"] = pymupdf.Point(72, page_height - o[3])
            else:  # if something else, make sure we have a dict
                # We make a copy of o[3] to avoid modifying our caller's data.
                dest_dict = o[3].copy() if type(o[3]) is dict else dest_dict
                if "to" not in dest_dict:  # target point not in dict?
                    dest_dict["to"] = top  # put default in
                else:  # transform target to PDF coordinates
                    page = doc[pno]
                    point = pymupdf.Point(dest_dict["to"])
                    point.y = page.cropbox.height - point.y
                    point = point * page.rotation_matrix
                    dest_dict["to"] = (point.x, point.y)
        d = {}
        d["first"] = -1
        d["count"] = 0
        d["last"] = -1
        d["prev"] = -1
        d["next"] = -1
        d["dest"] = getDestStr(page_xref, dest_dict)
        d["top"] = dest_dict["to"]
        d["title"] = title
        d["parent"] = lvltab[lvl - 1]
        d["xref"] = xref[i + 1]
        d["color"] = dest_dict.get("color")
        d["flags"] = dest_dict.get("italic", 0) + 2 * dest_dict.get("bold", 0)
        lvltab[lvl] = i + 1
        parent = olitems[lvltab[lvl - 1]]  # the parent entry

        if (
            dest_dict.get("collapse") or collapse and lvl > collapse
        ):  # suppress expansion
            parent["count"] -= 1  # make /Count negative
        else:
            parent["count"] += 1  # positive /Count

        if parent["first"] == -1:
            parent["first"] = i + 1
            parent["last"] = i + 1
        else:
            d["prev"] = parent["last"]
            prev = olitems[parent["last"]]
            prev["next"] = i + 1
            parent["last"] = i + 1
        olitems.append(d)

    # ------------------------------------------------------------------------------
    # now create each outline item as a string and insert it in the PDF
    # ------------------------------------------------------------------------------
    # PDF key / olitems key of the references to other outline items, in the
    # order in which they are written
    link_keys = (
        ("First", "first"),
        ("Last", "last"),
        ("Next", "next"),
        ("Parent", "parent"),
        ("Prev", "prev"),
    )
    for i, ol in enumerate(olitems):
        txt = "<<"
        if ol["count"] != 0:
            txt += "/Count %i" % ol["count"]
        # The outline root (entry 0) has no "dest", "next", "parent", "prev"
        # or "title": use defaults instead of catching the KeyError.
        txt += ol.get("dest", "")
        for pdf_key, key in link_keys:
            target = ol.get(key, -1)
            if target > -1:
                txt += "/%s %i 0 R" % (pdf_key, xref[target])
        if "title" in ol:
            txt += "/Title" + ol["title"]

        if ol.get("color") and len(ol["color"]) == 3:
            txt += f"/C[ {_format_g(tuple(ol['color']))}]"
        if ol.get("flags", 0) > 0:
            txt += "/F %i" % ol["flags"]

        if i == 0:  # special: this is the outline root
            txt += "/Type/Outlines"  # so add the /Type entry
        txt += ">>"
        doc.update_object(xref[i], txt)  # insert the PDF object

    doc.init_doc()
    return toclen
1685
1686
def do_widgets(
    tar: pymupdf.Document,
    src: pymupdf.Document,
    graftmap,
    from_page: int = -1,
    to_page: int = -1,
    start_at: int = -1,
    join_duplicates=0,
) -> None:
    """Insert widgets of copied page range into target PDF.

    Parameter values **must** equal those of method insert_pdf() which
    must have been previously executed.

    Args:
        tar: the target PDF.
        src: the source PDF. Nothing is done if it is no Form PDF.
        graftmap: passed to mupdf.pdf_graft_mapped_object() for every
            object copied from source to target.
        from_page, to_page: first and last source page. The range is walked
            backwards if from_page > to_page.
        start_at: target page number of the first copied page.
        join_duplicates: if true, fields with equal names are joined under
            a common parent, otherwise the name of the second field is made
            unique by appending its xref.
    """
    if not src.is_form_pdf:  # nothing to do: source PDF has no fields
        return

    def clean_kid_parents(acro_fields):
        """Make sure all kids have correct "Parent" pointers.

        Only the direct "Kids" of the entries of 'acro_fields' are visited.
        """
        for i in range(acro_fields.pdf_array_len()):
            parent = acro_fields.pdf_array_get(i)
            kids = parent.pdf_dict_get(pymupdf.PDF_NAME("Kids"))
            for j in range(kids.pdf_array_len()):
                kid = kids.pdf_array_get(j)
                kid.pdf_dict_put(pymupdf.PDF_NAME("Parent"), parent)

    def join_widgets(pdf, acro_fields, xref1, xref2, name):
        """Called for each pair of widgets having the same name.

        Args:
            pdf: target MuPDF document
            acro_fields: object Root/AcroForm/Fields
            xref1, xref2: widget xrefs having same names
            name: (str) the name

        Result:
            Defined or updated widget parent that points to both widgets.
        """

        def re_target(pdf, acro_fields, xref1, kids1, xref2, kids2):
            """Merge widget in xref2 into "Kids" list of widget xref1.

            Args:
                xref1, kids1: target widget and its "Kids" array.
                xref2, kids2: source widget and its "Kids" array (may be empty).
            """
            # make indirect objects from widgets
            w1_ind = mupdf.pdf_new_indirect(pdf, xref1, 0)
            w2_ind = mupdf.pdf_new_indirect(pdf, xref2, 0)
            # find source widget in "Fields" array
            idx = acro_fields.pdf_array_find(w2_ind)
            acro_fields.pdf_array_delete(idx)

            if not kids2.pdf_is_array():  # source widget has no kids
                widget = mupdf.pdf_load_object(pdf, xref2)

                # delete name from widget and insert target as parent
                widget.pdf_dict_del(pymupdf.PDF_NAME("T"))
                widget.pdf_dict_put(pymupdf.PDF_NAME("Parent"), w1_ind)

                # put in target Kids
                kids1.pdf_array_push(w2_ind)
            else:  # copy source kids to target kids
                for i in range(kids2.pdf_array_len()):
                    kid = kids2.pdf_array_get(i)
                    kid.pdf_dict_put(pymupdf.PDF_NAME("Parent"), w1_ind)
                    kid_ind = mupdf.pdf_new_indirect(pdf, kid.pdf_to_num(), 0)
                    kids1.pdf_array_push(kid_ind)

        def new_target(pdf, acro_fields, xref1, w1, xref2, w2, name):
            """Make new "Parent" for two widgets with same name.

            Args:
                xref1, w1: first widget
                xref2, w2: second widget
                name: field name

            Result:
                Both widgets have no "Kids". We create a new object with the
                name and a "Kids" array containing the widgets.
                Original widgets must be removed from AcroForm/Fields.
            """
            # make new "Parent" object
            new = mupdf.pdf_new_dict(pdf, 5)
            new.pdf_dict_put_text_string(pymupdf.PDF_NAME("T"), name)
            kids = new.pdf_dict_put_array(pymupdf.PDF_NAME("Kids"), 2)
            new_obj = mupdf.pdf_add_object(pdf, new)
            new_obj_xref = new_obj.pdf_to_num()
            new_ind = mupdf.pdf_new_indirect(pdf, new_obj_xref, 0)

            # copy over some required source widget properties
            # ("FT" and "AA" are moved from the first widget to the new parent)
            ft = w1.pdf_dict_get(pymupdf.PDF_NAME("FT"))
            w1.pdf_dict_del(pymupdf.PDF_NAME("FT"))
            new_obj.pdf_dict_put(pymupdf.PDF_NAME("FT"), ft)

            aa = w1.pdf_dict_get(pymupdf.PDF_NAME("AA"))
            w1.pdf_dict_del(pymupdf.PDF_NAME("AA"))
            new_obj.pdf_dict_put(pymupdf.PDF_NAME("AA"), aa)

            # remove name field, insert "Parent" field in source widgets
            w1.pdf_dict_del(pymupdf.PDF_NAME("T"))
            w1.pdf_dict_put(pymupdf.PDF_NAME("Parent"), new_ind)
            w2.pdf_dict_del(pymupdf.PDF_NAME("T"))
            w2.pdf_dict_put(pymupdf.PDF_NAME("Parent"), new_ind)

            # put source widgets in "kids" array
            ind1 = mupdf.pdf_new_indirect(pdf, xref1, 0)
            ind2 = mupdf.pdf_new_indirect(pdf, xref2, 0)
            kids.pdf_array_push(ind1)
            kids.pdf_array_push(ind2)

            # remove source widgets from "AcroForm/Fields"
            idx = acro_fields.pdf_array_find(ind1)
            acro_fields.pdf_array_delete(idx)
            idx = acro_fields.pdf_array_find(ind2)
            acro_fields.pdf_array_delete(idx)

            # the new parent replaces both widgets in "AcroForm/Fields"
            acro_fields.pdf_array_push(new_ind)

        w1 = mupdf.pdf_load_object(pdf, xref1)
        w2 = mupdf.pdf_load_object(pdf, xref2)
        kids1 = w1.pdf_dict_get(pymupdf.PDF_NAME("Kids"))
        kids2 = w2.pdf_dict_get(pymupdf.PDF_NAME("Kids"))

        # check which widget has a suitable "Kids" array
        if kids1.pdf_is_array():
            re_target(pdf, acro_fields, xref1, kids1, xref2, kids2)  # pylint: disable=arguments-out-of-order
        elif kids2.pdf_is_array():
            # roles swapped: widget 2 becomes the target, widget 1 the source
            re_target(pdf, acro_fields, xref2, kids2, xref1, kids1)  # pylint: disable=arguments-out-of-order
        else:
            new_target(pdf, acro_fields, xref1, w1, xref2, w2, name)  # pylint: disable=arguments-out-of-order

    def get_kids(parent, kids_list):
        """Return xref list of leaf kids for a parent.

        Call with an empty list. The list is extended in place and returned.
        """
        kids = mupdf.pdf_dict_get(parent, pymupdf.PDF_NAME("Kids"))
        if not kids.pdf_is_array():
            return kids_list
        for i in range(kids.pdf_array_len()):
            kid = kids.pdf_array_get(i)
            # NOTE(review): "Kids" is tested with pdf_is_array() above, but
            # with pdf_is_dict() here to decide about recursion - confirm
            # that this is intended.
            if mupdf.pdf_is_dict(mupdf.pdf_dict_get(kid, pymupdf.PDF_NAME("Kids"))):
                kids_list = get_kids(kid, kids_list)
            else:
                kids_list.append(kid.pdf_to_num())
        return kids_list

    def kids_xrefs(widget):
        """Get the xref of the widget's "Parent" and the list of leaf widgets.

        Returns (0, []) if the widget has no "Parent".
        """
        kids_list = []
        parent = mupdf.pdf_dict_get(widget, pymupdf.PDF_NAME("Parent"))
        parent_xref = parent.pdf_to_num()
        if parent_xref == 0:
            return parent_xref, kids_list
        kids_list = get_kids(parent, kids_list)
        return parent_xref, kids_list

    def deduplicate_names(pdf, acro_fields, join_duplicates=False):
        """Handle any widget name duplicates caused by the merge."""
        names = {}  # key is a widget name, value a list of widgets having it.

        # extract all names and widgets in "AcroForm/Fields"
        for i in range(mupdf.pdf_array_len(acro_fields)):
            wobject = mupdf.pdf_array_get(acro_fields, i)
            xref = wobject.pdf_to_num()

            # extract widget name and collect widget(s) using it
            T = mupdf.pdf_dict_get_text_string(wobject, pymupdf.PDF_NAME("T"))
            xrefs = names.get(T, [])
            xrefs.append(xref)
            names[T] = xrefs

        for name, xrefs in names.items():
            if len(xrefs) < 2:
                continue
            xref0, xref1 = xrefs[:2]  # only exactly 2 should occur!
            if join_duplicates:  # combine fields with equal names
                join_widgets(pdf, acro_fields, xref0, xref1, name)
            else:  # make field names unique
                newname = name + f" [{xref1}]"  # append this to the name
                wobject = mupdf.pdf_load_object(pdf, xref1)
                wobject.pdf_dict_put_text_string(pymupdf.PDF_NAME("T"), newname)

        clean_kid_parents(acro_fields)

    def get_acroform(doc):
        """Retrieve the AcroForm dictionary from a PDF."""
        pdf = mupdf.pdf_document_from_fz_document(doc)
        # AcroForm (= central form field info)
        return mupdf.pdf_dict_getp(mupdf.pdf_trailer(pdf), "Root/AcroForm")

    tarpdf = mupdf.pdf_document_from_fz_document(tar)
    srcpdf = mupdf.pdf_document_from_fz_document(src)

    if tar.is_form_pdf:
        # target is a Form PDF, so use it to include source fields
        acro = get_acroform(tar)
        # Important arrays in AcroForm
        acro_fields = acro.pdf_dict_get(pymupdf.PDF_NAME("Fields"))
        tar_co = acro.pdf_dict_get(pymupdf.PDF_NAME("CO"))
        if not tar_co.pdf_is_array():
            tar_co = acro.pdf_dict_put_array(pymupdf.PDF_NAME("CO"), 5)
    else:
        # target is no Form PDF, so copy over source AcroForm
        acro = mupdf.pdf_deep_copy_obj(get_acroform(src))  # make a copy

        # Clear "Fields" and "CO" arrays: will be populated by page fields.
        # This is required to avoid copying unneeded objects.
        acro.pdf_dict_del(pymupdf.PDF_NAME("Fields"))
        acro.pdf_dict_put_array(pymupdf.PDF_NAME("Fields"), 5)
        acro.pdf_dict_del(pymupdf.PDF_NAME("CO"))
        acro.pdf_dict_put_array(pymupdf.PDF_NAME("CO"), 5)

        # Enrich AcroForm for copying to target
        acro_graft = mupdf.pdf_graft_mapped_object(graftmap, acro)

        # Insert AcroForm into target PDF
        acro_tar = mupdf.pdf_add_object(tarpdf, acro_graft)
        acro_fields = acro_tar.pdf_dict_get(pymupdf.PDF_NAME("Fields"))
        tar_co = acro_tar.pdf_dict_get(pymupdf.PDF_NAME("CO"))

        # get its xref and insert it into target catalog
        tar_xref = acro_tar.pdf_to_num()
        acro_tar_ind = mupdf.pdf_new_indirect(tarpdf, tar_xref, 0)
        root = mupdf.pdf_dict_get(mupdf.pdf_trailer(tarpdf), pymupdf.PDF_NAME("Root"))
        root.pdf_dict_put(pymupdf.PDF_NAME("AcroForm"), acro_tar_ind)

    # source page numbers in copy order (ascending or descending)
    if from_page <= to_page:
        src_range = range(from_page, to_page + 1)
    else:
        src_range = range(from_page, to_page - 1, -1)

    # information about widget parents: key is the parent's xref in the
    # source, value a dict with the parent's xref in the target and the
    # leaf kid xrefs in source ("old_kids") and target ("new_kids")
    parents = {}

    # remove "P" owning page reference from all widgets of all source pages
    for i in src_range:
        src_page = src[i]
        for xref in [
            xref
            for xref, wtype, _ in src_page.annot_xrefs()
            if wtype == pymupdf.PDF_ANNOT_WIDGET  # pylint: disable=no-member
        ]:
            w_obj = mupdf.pdf_load_object(srcpdf, xref)
            w_obj.pdf_dict_del(pymupdf.PDF_NAME("P"))

            # get the widget's parent structure
            parent_xref, old_kids = kids_xrefs(w_obj)
            if parent_xref:
                parents[parent_xref] = {
                    "new_xref": 0,
                    "old_kids": old_kids,
                    "new_kids": [],
                }
    # Copy over Parent widgets first - they are not page-dependent
    for xref in parents.keys():  # pylint: disable=consider-using-dict-items
        parent = mupdf.pdf_load_object(srcpdf, xref)
        parent_graft = mupdf.pdf_graft_mapped_object(graftmap, parent)
        parent_tar = mupdf.pdf_add_object(tarpdf, parent_graft)
        kids_xrefs_new = get_kids(parent_tar, [])
        parent_xref_new = parent_tar.pdf_to_num()
        parent_ind = mupdf.pdf_new_indirect(tarpdf, parent_xref_new, 0)
        acro_fields.pdf_array_push(parent_ind)
        parents[xref]["new_xref"] = parent_xref_new
        parents[xref]["new_kids"] = kids_xrefs_new

    for i in range(len(src_range)):
        # read first copied over page in target
        tar_page = tar[start_at + i]

        # read the original page in the source PDF
        src_page = src[src_range[i]]

        # now walk through source page widgets and copy over
        w_xrefs = [  # widget xrefs of the source page
            xref
            for xref, wtype, _ in src_page.annot_xrefs()
            if wtype == pymupdf.PDF_ANNOT_WIDGET  # pylint: disable=no-member
        ]
        if not w_xrefs:  # no widgets on this source page
            continue

        # convert to formal PDF page
        tar_page_pdf = mupdf.pdf_page_from_fz_page(tar_page)

        # extract annotations array (create it if the page has none)
        tar_annots = mupdf.pdf_dict_get(tar_page_pdf.obj(), pymupdf.PDF_NAME("Annots"))
        if not mupdf.pdf_is_array(tar_annots):
            tar_annots = mupdf.pdf_dict_put_array(
                tar_page_pdf.obj(), pymupdf.PDF_NAME("Annots"), 5
            )

        for xref in w_xrefs:
            w_obj = mupdf.pdf_load_object(srcpdf, xref)

            # check if field takes part in inter-field validations
            is_aac = mupdf.pdf_is_dict(mupdf.pdf_dict_getp(w_obj, "AA/C"))

            # check if parent of widget already in target
            parent_xref = mupdf.pdf_to_num(
                w_obj.pdf_dict_get(pymupdf.PDF_NAME("Parent"))
            )
            if parent_xref == 0:  # parent not in target yet
                try:
                    w_obj_graft = mupdf.pdf_graft_mapped_object(graftmap, w_obj)
                except Exception as e:
                    # best effort: warn and skip a widget that cannot be copied
                    pymupdf.message_warning(f"cannot copy widget at {xref=}: {e}")
                    continue
                w_obj_tar = mupdf.pdf_add_object(tarpdf, w_obj_graft)
                tar_xref = w_obj_tar.pdf_to_num()
                w_obj_tar_ind = mupdf.pdf_new_indirect(tarpdf, tar_xref, 0)
                mupdf.pdf_array_push(tar_annots, w_obj_tar_ind)
                mupdf.pdf_array_push(acro_fields, w_obj_tar_ind)
            else:
                # the widget was copied together with its parent above: look
                # up its target xref via its position among the parent's kids
                parent = parents[parent_xref]
                idx = parent["old_kids"].index(xref)  # search for xref in parent
                tar_xref = parent["new_kids"][idx]
                w_obj_tar_ind = mupdf.pdf_new_indirect(tarpdf, tar_xref, 0)
                mupdf.pdf_array_push(tar_annots, w_obj_tar_ind)

            # Into "AcroForm/CO" if a computation field.
            if is_aac:
                mupdf.pdf_array_push(tar_co, w_obj_tar_ind)

    deduplicate_names(tarpdf, acro_fields, join_duplicates=join_duplicates)
2012
def do_links(
    doc1: pymupdf.Document,
    doc2: pymupdf.Document,
    from_page: int = -1,
    to_page: int = -1,
    start_at: int = -1,
) -> None:
    """Insert links contained in copied page range into destination PDF.

    Parameter values **must** equal those of method insert_pdf(), which must
    have been previously executed.

    Args:
        doc1: destination PDF; it must already contain the copied pages.
        doc2: source PDF the pages were copied from.
        from_page: first source page number. Negative values select page 0,
            too large values select the last source page.
        to_page: last source page number. Negative or too large values
            select the last source page. May be smaller than 'from_page'
            (reversed page range).
        start_at: number of the first copied page in the destination PDF.

    Raises:
        ValueError: if 'start_at' is negative.

    Notes:
        GOTO links are only re-created if their target page belongs to the
        copied range. GOTOR, LAUNCH and URI links are always re-created.
        Links of any other kind are skipped.
    """
    # validate & normalize parameters
    last_pno = doc2.page_count - 1
    if from_page < 0:
        fp = 0
    elif from_page > last_pno:
        fp = last_pno
    else:
        fp = from_page

    tp = to_page if 0 <= to_page <= last_pno else last_pno

    if start_at < 0:
        raise ValueError("'start_at' must be >= 0")

    incr = 1 if fp <= tp else -1  # page range could be reversed

    # source page numbers and their page numbers in the destination PDF
    pno_src = list(range(fp, tp + incr, incr))
    pno_dst = range(start_at, start_at + len(pno_src))

    # source page number -> xref of its copy in the destination PDF
    xref_dst = {
        p_src: doc1.page_xref(p_dst) for p_src, p_dst in zip(pno_src, pno_dst)
    }

    # source page number -> inverted transformation matrix (filled lazily)
    ictm_cache = {}

    def inverse_ctm(pno, page=None):
        """Return the matrix converting MuPDF to PDF coordinates on a source page."""
        if pno not in ictm_cache:
            if page is None:
                page = doc2[pno]
            ictm_cache[pno] = ~page.transformation_matrix
        return ictm_cache[pno]

    # --------------------------------------------------------------------------
    # internal function to create the actual "/Annots" object string
    # --------------------------------------------------------------------------
    def cre_annot(lnk, ctm):
        """Create annotation object string for a passed-in link.

        'ctm' converts coordinates of the page containing the link to PDF
        coordinates. An empty string is returned for unsupported link kinds.
        """
        rect = _format_g(tuple(lnk["from"] * ctm))  # rect in PDF coordinates
        kind = lnk["kind"]
        skel = pymupdf.annot_skel

        if kind == pymupdf.LINK_GOTO:
            dest_pno = lnk["page"]
            # The target point is located on the target page. It must be
            # converted with that page's matrix (as getLinkText() does), not
            # with the matrix of the page containing the link: both differ
            # if the two pages have different geometries.
            p = lnk["to"] * inverse_ctm(dest_pno)
            return skel["goto1"](
                xref_dst[dest_pno], p.x, p.y, lnk.get("zoom", 0), rect
            )

        if kind == pymupdf.LINK_GOTOR:
            if lnk["page"] >= 0:
                pnt = lnk.get("to", pymupdf.Point(0, 0))  # destination point
                if not isinstance(pnt, pymupdf.Point):
                    pnt = pymupdf.Point(0, 0)
                return skel["gotor1"](
                    lnk["page"],
                    pnt.x,
                    pnt.y,
                    lnk.get("zoom", 0),
                    lnk["file"],
                    lnk["file"],
                    rect,
                )
            # named destination: strip first and last character of the PDF string
            to = pymupdf.get_pdf_str(lnk["to"])[1:-1]
            return skel["gotor2"](to, lnk["file"], rect)

        if kind == pymupdf.LINK_LAUNCH:
            return skel["launch"](lnk["file"], lnk["file"], rect)

        if kind == pymupdf.LINK_URI:
            return skel["uri"](lnk["uri"], rect)

        return ""

    # --------------------------------------------------------------------------

    # create the links for each copied page in destination PDF
    for p_src, p_dst in zip(pno_src, pno_dst):
        page_src = doc2[p_src]  # load source page
        links = page_src.get_links()  # get all its links
        if not links:  # no links there
            continue
        ctm = inverse_ctm(p_src, page_src)
        link_tab = []  # store all link definitions here
        for lnk in links:
            if lnk["kind"] == pymupdf.LINK_GOTO and lnk["page"] not in xref_dst:
                continue  # GOTO link target not in copied pages
            annot_text = cre_annot(lnk, ctm)
            if annot_text:
                link_tab.append(annot_text)
        if link_tab:
            doc1[p_dst]._addAnnot_FromString(tuple(link_tab))
2134
2135
def getLinkText(page: pymupdf.Page, lnk: dict) -> str:
    """Build the PDF object source of a link annotation.

    Args:
        page: the page owning the link.
        lnk: link dictionary. Keys "from" and "kind" are always read, all
            other keys depend on the link kind.

    Returns:
        The annotation object source including a /NM entry, or the empty
        string if no source could be generated for the link kind.
    """
    inv_ctm = ~page.transformation_matrix
    rect = _format_g(tuple(lnk["from"] * inv_ctm))  # link rect, PDF coordinates

    kind = lnk["kind"]
    skeletons = pymupdf.annot_skel
    annot = ""

    if kind == pymupdf.LINK_GOTO:
        if lnk["page"] >= 0:
            make = skeletons["goto1"]  # annot_goto
            target_pno = lnk["page"]
            target_xref = page.parent.page_xref(target_pno)
            target_pnt = lnk.get("to", pymupdf.Point(0, 0))  # destination point
            # the destination point is converted with the target page's matrix
            target_page = page.parent[target_pno]
            target_pnt = target_pnt * ~target_page.transformation_matrix
            annot = make(
                target_xref, target_pnt.x, target_pnt.y, lnk.get("zoom", 0), rect
            )
        else:
            make = skeletons["goto2"]  # annot_goto_n
            annot = make(pymupdf.get_pdf_str(lnk["to"]), rect)

    elif kind == pymupdf.LINK_GOTOR:
        if lnk["page"] >= 0:
            make = skeletons["gotor1"]  # annot_gotor
            target_pnt = lnk.get("to", pymupdf.Point(0, 0))  # destination point
            if type(target_pnt) is not pymupdf.Point:
                target_pnt = pymupdf.Point(0, 0)
            annot = make(
                lnk["page"],
                target_pnt.x,
                target_pnt.y,
                lnk.get("zoom", 0),
                lnk["file"],
                lnk["file"],
                rect,
            )
        else:
            make = skeletons["gotor2"]  # annot_gotor_n
            annot = make(pymupdf.get_pdf_str(lnk["to"]), lnk["file"], rect)

    elif kind == pymupdf.LINK_LAUNCH:
        make = skeletons["launch"]  # annot_launch
        annot = make(lnk["file"], lnk["file"], rect)

    elif kind == pymupdf.LINK_URI:
        make = skeletons["uri"]  # annot_uri
        annot = make(lnk["uri"], rect)

    elif kind == pymupdf.LINK_NAMED:
        make = skeletons["named"]  # annot_named
        dest_name = lnk.get("name")
        if dest_name is None:  # key missing: fall back to alternative key
            dest_name = lnk["nameddest"]
        annot = make(dest_name, rect)

    if not annot:
        return annot

    # ----------------------------------------------------------------------
    # add a /NM PDF key to the object definition
    # ----------------------------------------------------------------------
    # xref -> id of the links already existing on the page
    link_names = {
        item[0]: item[2]
        for item in page.annot_xrefs()
        if item[1] == pymupdf.PDF_ANNOT_LINK  # pylint: disable=no-member
    }

    old_name = lnk.get("id", "")  # id value in the argument

    if old_name and (lnk["xref"], old_name) in link_names.items():
        name = old_name  # this is an update only: keep the name
    else:
        # find the first unused name of the form "<stem>-L<number>"
        used_names = set(link_names.values())
        stem = pymupdf.TOOLS.set_annot_stem() + "-L%i"
        number = 0
        name = stem % number
        while name in used_names:
            number += 1
            name = stem % number

    # NOTE(review): str.replace() substitutes every "/Link" occurrence, so a
    # URI or file name containing "/Link" would be modified, too -- confirm
    # whether this is intended.
    return annot.replace("/Link", "/Link/NM(%s)" % name)
2217
2218
def delete_widget(page: pymupdf.Page, widget: pymupdf.Widget) -> pymupdf.Widget:
    """Delete a widget from the page and invalidate the widget object.

    Args:
        page: page containing the widget.
        widget: the widget to delete. All attributes of this object are
            removed, so it cannot be used afterwards.

    Returns:
        The value of 'widget.next' as read before the deletion.

    Raises:
        ValueError: if 'widget' has no '_annot' attribute or it is None.
    """
    pymupdf.CheckParent(page)
    widget_annot = getattr(widget, "_annot", None)
    if widget_annot is None:
        raise ValueError("bad type: widget")

    successor = widget.next  # fetch the follower before deleting anything
    page.delete_annot(widget_annot)
    widget._annot.parent = None

    # wipe all attributes of the Python object
    attributes = widget.__dict__
    for attr_name in tuple(attributes):
        del attributes[attr_name]
    return successor
2232
2233
def update_link(page: pymupdf.Page, lnk: dict) -> None:
    """Overwrite the definition of an existing link of the page.

    Args:
        page: page containing the link.
        lnk: link dictionary. Its "xref" entry identifies the PDF object
            that receives the new definition.

    Raises:
        ValueError: if no annotation source can be generated for the link.
    """
    pymupdf.CheckParent(page)
    link_source = getLinkText(page, lnk)
    if not link_source:
        raise ValueError("link kind not supported")

    page.parent.update_object(lnk["xref"], link_source, page=page)
2242
2243
def insert_link(page: pymupdf.Page, lnk: dict, mark: bool = True) -> None:
    """Add a new link to the page.

    Args:
        page: page receiving the link.
        lnk: link dictionary describing the new link.
        mark: not used by this function.

    Raises:
        ValueError: if no annotation source can be generated for the link.
    """
    pymupdf.CheckParent(page)
    link_source = getLinkText(page, lnk)
    if not link_source:
        raise ValueError("link kind not supported")
    page._addAnnot_FromString((link_source,))
2251
2252
def insert_textbox(
    page: pymupdf.Page,
    rect: rect_like,
    buffer: typing.Union[str, list],
    *,
    fontname: str = "helv",
    fontfile: OptStr = None,
    set_simple: int = 0,
    encoding: int = 0,
    fontsize: float = 11,
    lineheight: OptFloat = None,
    color: OptSeq = None,
    fill: OptSeq = None,
    expandtabs: int = 1,
    align: int = 0,
    rotate: int = 0,
    render_mode: int = 0,
    miter_limit: float = 1,
    border_width: float = 0.05,
    morph: OptSeq = None,
    overlay: bool = True,
    stroke_opacity: float = 1,
    fill_opacity: float = 1,
    oc: int = 0,
) -> float:
    """Write text into a rectangle of the page.

    Notes:
        Convenience wrapper: a new Shape is created for the page, its
        insert_textbox() method is called with the given arguments and the
        Shape is committed if that call returned a non-negative value.
    Parameters:
        rect: (rect-like) area to use for text.
        buffer: text to be inserted
        fontname: a Base-14 font, font name or '/name'
        fontfile: name of a font file
        fontsize: font size
        lineheight: overwrite the font property
        color: RGB color triple
        expandtabs: handles tabulators with string function
        align: left, center, right, justified
        rotate: 0, 90, 180, or 270 degrees
        morph: morph box with a matrix and a fixpoint
        overlay: put text in foreground or background
        All remaining keyword arguments are passed on unchanged.
    Returns:
        unused or deficit rectangle area (float)
    """
    shape = page.new_shape()
    spare = shape.insert_textbox(
        rect,
        buffer,
        fontname=fontname, fontfile=fontfile,
        fontsize=fontsize, lineheight=lineheight,
        set_simple=set_simple, encoding=encoding,
        color=color, fill=fill,
        expandtabs=expandtabs, align=align, rotate=rotate, morph=morph,
        render_mode=render_mode, miter_limit=miter_limit,
        border_width=border_width,
        stroke_opacity=stroke_opacity, fill_opacity=fill_opacity,
        oc=oc,
    )
    if spare >= 0:  # only a non-negative result is committed to the page
        shape.commit(overlay)
    return spare
2324
2325
def insert_text(
    page: pymupdf.Page,
    point: point_like,
    text: typing.Union[str, list],
    *,
    fontsize: float = 11,
    lineheight: OptFloat = None,
    fontname: str = "helv",
    fontfile: OptStr = None,
    set_simple: int = 0,
    encoding: int = 0,
    color: OptSeq = None,
    fill: OptSeq = None,
    border_width: float = 0.05,
    miter_limit: float = 1,
    render_mode: int = 0,
    rotate: int = 0,
    morph: OptSeq = None,
    overlay: bool = True,
    stroke_opacity: float = 1,
    fill_opacity: float = 1,
    oc: int = 0,
):
    """Write text to the page using a Shape.

    Notes:
        Convenience wrapper: a new Shape is created for the page, its
        insert_text() method is called with the given arguments and the
        Shape is committed if that call returned a non-negative value.
    Parameters:
        point: (point-like) position argument of Shape.insert_text().
        text: (str or list) the text to insert.
        overlay: passed to Shape.commit().
        All remaining keyword arguments are passed on unchanged.
    Returns:
        The value returned by Shape.insert_text().
    """
    shape = page.new_shape()
    result = shape.insert_text(
        point,
        text,
        fontname=fontname, fontfile=fontfile,
        fontsize=fontsize, lineheight=lineheight,
        set_simple=set_simple, encoding=encoding,
        color=color, fill=fill,
        rotate=rotate, morph=morph,
        render_mode=render_mode, miter_limit=miter_limit,
        border_width=border_width,
        stroke_opacity=stroke_opacity, fill_opacity=fill_opacity,
        oc=oc,
    )
    if result >= 0:  # only a non-negative result is committed to the page
        shape.commit(overlay)
    return result
2374
2375
def insert_htmlbox(
    page,
    rect,
    text,
    *,
    css=None,
    scale_low=0,
    archive=None,
    rotate=0,
    oc=0,
    opacity=1,
    overlay=True,
) -> tuple:
    """Insert text with optional HTML tags and stylings into a rectangle.

    Args:
        rect: (rect-like) rectangle into which the text should be placed.
        text: (str or Story) text with optional HTML tags and stylings. A
            Story object is used as is; 'css' and 'archive' are ignored then.
        css: (str) CSS styling commands.
        scale_low: (float) force-fit content by scaling it down. Must be in
            range [0, 1]. If 1, no scaling will take place. If 0, arbitrary
            down-scaling is acceptable. A value of 0.1 would mean that content
            may be scaled down by at most 90%.
        archive: Archive object pointing to locations of used fonts or images
        rotate: (int) rotate the text in the box by a multiple of 90 degrees.
        oc: (int) the xref of an OCG / OCMD (Optional Content).
        opacity: (float) set opacity of inserted content. Only values in
            range [0, 1) have an effect.
        overlay: (bool) put text on top of page content.
    Returns:
        A tuple (spare_height, scale).
        If the content did not fit, (-1, scale_low) is returned and the page
        is not changed.
        Otherwise:
        spare_height: >= 0. It is the height of the unused (still available)
            rectangle stripe. Always 0 if down-scaling occurred, i.e. it can
            only be positive for scale = 1.
        scale: down-scaling factor, the reciprocal of the fit parameter.
    Raises:
        ValueError: if 'rotate' is no multiple of 90, if 'scale_low' is not
            in range [0, 1], or if 'text' is neither a string nor a Story.
    """

    # normalize rotation angle to one of 0, 90, 180, 270
    if not rotate % 90 == 0:
        raise ValueError("bad rotation angle")
    rotate %= 360

    if not 0 <= scale_low <= 1:
        raise ValueError("'scale_low' must be in [0, 1]")

    if css is None:
        css = ""

    rect = pymupdf.Rect(rect)
    # The story is laid out in an unrotated box. For quarter turns, width
    # and height of the target rectangle must be exchanged.
    if rotate in (90, 270):
        temp_rect = pymupdf.Rect(0, 0, rect.height, rect.width)
    else:
        temp_rect = pymupdf.Rect(0, 0, rect.width, rect.height)

    # use a small border by default
    mycss = "body {margin:1px;}" + css  # append user CSS

    # either make a story, or accept a given one
    if isinstance(text, str):  # if a string, convert to a Story
        story = pymupdf.Story(html=text, user_css=mycss, archive=archive)
    elif isinstance(text, pymupdf.Story):
        story = text
    else:
        raise ValueError("'text' must be a string or a Story")

    # ----------------------------------------------------------------
    # Find a scaling factor that lets our story fit in
    # ----------------------------------------------------------------
    # The layout rectangle is enlarged instead of shrinking the content:
    # the largest allowed enlargement is the reciprocal of 'scale_low'.
    scale_max = None if scale_low == 0 else 1 / scale_low

    fit = story.fit_scale(temp_rect, scale_min=1, scale_max=scale_max)
    if not fit.big_enough:  # there was no fit
        return (-1, scale_low)

    filled = fit.filled
    scale = 1 / fit.parameter  # shrink factor

    spare_height = fit.rect.y1 - filled[3]  # unused room at rectangle bottom
    # Note: due to MuPDF's logic this may be negative even for successful fits.
    if scale != 1 or spare_height < 0:  # if scaling occurred, set spare_height to 0
        spare_height = 0

    def rect_function(*args):
        # always place the content in the rectangle found by the fit
        return fit.rect, fit.rect, pymupdf.Identity

    # draw story on temp PDF page
    doc = story.write_with_links(rect_function)

    # Insert opacity if requested.
    # For this, we prepend a command to the /Contents.
    if 0 <= opacity < 1:
        tpage = doc[0]  # load page
        # generate /ExtGstate for the page
        alp0 = tpage._set_opacity(CA=opacity, ca=opacity)
        s = f"/{alp0} gs\n"  # generate graphic state command
        pymupdf.TOOLS._insert_contents(tpage, s.encode(), 0)

    # put result in target page
    page.show_pdf_page(rect, doc, 0, rotate=rotate, oc=oc, overlay=overlay)

    # -------------------------------------------------------------------------
    # re-insert links in target rect (show_pdf_page cannot copy annotations)
    # -------------------------------------------------------------------------
    # scaled center point of fit.rect
    mp1 = (fit.rect.tl + fit.rect.br) / 2 * scale

    # center point of target rect
    mp2 = (rect.tl + rect.br) / 2

    # compute link positioning matrix:
    # - move center of scaled-down fit.rect to (0,0)
    # - rotate
    # - move (0,0) to center of target rect
    mat = (
        pymupdf.Matrix(scale, 0, 0, scale, -mp1.x, -mp1.y)
        * pymupdf.Matrix(-rotate)
        * pymupdf.Matrix(1, 0, 0, 1, mp2.x, mp2.y)
    )

    # copy over links
    for link in doc[0].get_links():
        link["from"] *= mat
        page.insert_link(link)

    return spare_height, scale
2502
2503
def new_page(
    doc: pymupdf.Document,
    pno: int = -1,
    width: float = 595,
    height: float = 842,
) -> pymupdf.Page:
    """Add an empty page to the document and return it.

    Args:
        pno: (int) insert before this page. Default: after last page.
        width: (float) page width in points. Default: 595 (ISO A4 width).
        height: (float) page height in points. Default 842 (ISO A4 height).
    Returns:
        The pymupdf.Page object found at index 'pno' after the insertion.
    """
    doc._newPage(pno, width=width, height=height)
    created_page = doc[pno]
    return created_page
2521
2522
def insert_page(
    doc: pymupdf.Document,
    pno: int,
    text: typing.Union[str, list, None] = None,
    fontsize: float = 11,
    width: float = 595,
    height: float = 842,
    fontname: str = "helv",
    fontfile: OptStr = None,
    color: OptSeq = (0,),
) -> int:
    """Create a new PDF page and optionally put some text on it.

    Notes:
        Combines pymupdf.Document.new_page() and pymupdf.Page.insert_text().
        For parameter details see these methods. The text is inserted at
        position (50, 72).
    Returns:
        0 if 'text' is empty or None, else the result of insert_text().
    """
    page = doc.new_page(pno=pno, width=width, height=height)
    if not text:  # nothing to write: the empty page is all we need
        return 0
    return page.insert_text(
        (50, 72),
        text,
        fontname=fontname,
        fontfile=fontfile,
        fontsize=fontsize,
        color=color,
    )
2552
2553
def draw_line(
    page: pymupdf.Page,
    p1: point_like,
    p2: point_like,
    color: OptSeq = (0,),
    dashes: OptStr = None,
    width: float = 1,
    lineCap: int = 0,
    lineJoin: int = 0,
    overlay: bool = True,
    morph: OptSeq = None,
    stroke_opacity: float = 1,
    fill_opacity: float = 1,
    oc: int = 0,
) -> pymupdf.Point:
    """Draw a line from point p1 to point p2.

    Notes:
        Convenience wrapper: creates a Shape for the page, draws the line,
        finishes the path (never closing it) and commits the Shape.
    Parameters:
        p1, p2: (point-like) start and end point of the line.
        overlay: passed to Shape.commit().
        All other arguments are passed on to Shape.finish().
    Returns:
        The value returned by Shape.draw_line().
    """
    img = page.new_shape()
    p = img.draw_line(pymupdf.Point(p1), pymupdf.Point(p2))
    img.finish(
        color=color,
        dashes=dashes,
        width=width,
        closePath=False,
        lineCap=lineCap,
        lineJoin=lineJoin,
        morph=morph,
        stroke_opacity=stroke_opacity,
        fill_opacity=fill_opacity,
        oc=oc,
    )
    img.commit(overlay)

    return p
2587
2588
def draw_squiggle(
    page: pymupdf.Page,
    p1: point_like,
    p2: point_like,
    breadth: float = 2,
    color: OptSeq = (0,),
    dashes: OptStr = None,
    width: float = 1,
    lineCap: int = 0,
    lineJoin: int = 0,
    overlay: bool = True,
    morph: OptSeq = None,
    stroke_opacity: float = 1,
    fill_opacity: float = 1,
    oc: int = 0,
) -> pymupdf.Point:
    """Draw a squiggly line between the points p1 and p2.

    Notes:
        Convenience wrapper: creates a Shape for the page, draws the
        squiggle, finishes the path (never closing it) and commits the Shape.
    Parameters:
        p1, p2: (point-like) start and end point.
        breadth: passed to Shape.draw_squiggle().
        overlay: passed to Shape.commit().
        All other arguments are passed on to Shape.finish().
    Returns:
        The value returned by Shape.draw_squiggle().
    """
    shape = page.new_shape()
    start, stop = pymupdf.Point(p1), pymupdf.Point(p2)
    last_point = shape.draw_squiggle(start, stop, breadth=breadth)
    shape.finish(
        color=color, width=width, dashes=dashes,
        lineCap=lineCap, lineJoin=lineJoin, closePath=False,
        morph=morph, oc=oc,
        stroke_opacity=stroke_opacity, fill_opacity=fill_opacity,
    )
    shape.commit(overlay)
    return last_point
2623
2624
def draw_zigzag(
    page: pymupdf.Page,
    p1: point_like,
    p2: point_like,
    breadth: float = 2,
    color: OptSeq = (0,),
    dashes: OptStr = None,
    width: float = 1,
    lineCap: int = 0,
    lineJoin: int = 0,
    overlay: bool = True,
    morph: OptSeq = None,
    stroke_opacity: float = 1,
    fill_opacity: float = 1,
    oc: int = 0,
) -> pymupdf.Point:
    """Draw a zigzag line between the points p1 and p2.

    Notes:
        Convenience wrapper: creates a Shape for the page, draws the
        zigzag, finishes the path (never closing it) and commits the Shape.
    Parameters:
        p1, p2: (point-like) start and end point.
        breadth: passed to Shape.draw_zigzag().
        overlay: passed to Shape.commit().
        All other arguments are passed on to Shape.finish().
    Returns:
        The value returned by Shape.draw_zigzag().
    """
    shape = page.new_shape()
    start, stop = pymupdf.Point(p1), pymupdf.Point(p2)
    last_point = shape.draw_zigzag(start, stop, breadth=breadth)
    shape.finish(
        color=color, width=width, dashes=dashes,
        lineCap=lineCap, lineJoin=lineJoin, closePath=False,
        morph=morph, oc=oc,
        stroke_opacity=stroke_opacity, fill_opacity=fill_opacity,
    )
    shape.commit(overlay)
    return last_point
2659
2660
def draw_rect(
    page: pymupdf.Page,
    rect: rect_like,
    color: OptSeq = (0,),
    fill: OptSeq = None,
    dashes: OptStr = None,
    width: float = 1,
    lineCap: int = 0,
    lineJoin: int = 0,
    morph: OptSeq = None,
    overlay: bool = True,
    stroke_opacity: float = 1,
    fill_opacity: float = 1,
    oc: int = 0,
    radius=None,
) -> pymupdf.Point:
    """Draw a rectangle. See Shape class method for details.

    Notes:
        Convenience wrapper: creates a Shape for the page, draws the
        rectangle, finishes the path and commits the Shape.
    Parameters:
        rect: (rect-like) the rectangle.
        radius: passed to Shape.draw_rect().
        overlay: passed to Shape.commit().
        All other arguments are passed on to Shape.finish().
    Returns:
        The value returned by Shape.draw_rect().
    """
    shape = page.new_shape()
    result = shape.draw_rect(pymupdf.Rect(rect), radius=radius)
    shape.finish(
        color=color, fill=fill, width=width, dashes=dashes,
        lineCap=lineCap, lineJoin=lineJoin,
        morph=morph, oc=oc,
        stroke_opacity=stroke_opacity, fill_opacity=fill_opacity,
    )
    shape.commit(overlay)
    return result
2697
2698
def draw_quad(
    page: pymupdf.Page,
    quad: quad_like,
    color: OptSeq = (0,),
    fill: OptSeq = None,
    dashes: OptStr = None,
    width: float = 1,
    lineCap: int = 0,
    lineJoin: int = 0,
    morph: OptSeq = None,
    overlay: bool = True,
    stroke_opacity: float = 1,
    fill_opacity: float = 1,
    oc: int = 0,
) -> pymupdf.Point:
    """Draw a quadrilateral.

    Notes:
        Convenience wrapper: creates a Shape for the page, draws the quad,
        finishes the path and commits the Shape.
    Parameters:
        quad: (quad-like) the quadrilateral.
        overlay: passed to Shape.commit().
        All other arguments are passed on to Shape.finish().
    Returns:
        The value returned by Shape.draw_quad().
    """
    shape = page.new_shape()
    result = shape.draw_quad(pymupdf.Quad(quad))
    shape.finish(
        color=color, fill=fill, width=width, dashes=dashes,
        lineCap=lineCap, lineJoin=lineJoin,
        morph=morph, oc=oc,
        stroke_opacity=stroke_opacity, fill_opacity=fill_opacity,
    )
    shape.commit(overlay)
    return result
2732
2733
def draw_polyline(
    page: pymupdf.Page,
    points: list,
    color: OptSeq = (0,),
    fill: OptSeq = None,
    dashes: OptStr = None,
    width: float = 1,
    morph: OptSeq = None,
    lineCap: int = 0,
    lineJoin: int = 0,
    overlay: bool = True,
    closePath: bool = False,
    stroke_opacity: float = 1,
    fill_opacity: float = 1,
    oc: int = 0,
) -> pymupdf.Point:
    """Draw a sequence of connected line segments.

    Notes:
        Convenience wrapper: creates a Shape for the page, draws the
        polyline, finishes the path and commits the Shape.
    Parameters:
        points: (list) the points, passed unchanged to Shape.draw_polyline().
        overlay: passed to Shape.commit().
        All other arguments are passed on to Shape.finish().
    Returns:
        The value returned by Shape.draw_polyline().
    """
    shape = page.new_shape()
    result = shape.draw_polyline(points)
    shape.finish(
        color=color, fill=fill, width=width, dashes=dashes,
        lineCap=lineCap, lineJoin=lineJoin, closePath=closePath,
        morph=morph, oc=oc,
        stroke_opacity=stroke_opacity, fill_opacity=fill_opacity,
    )
    shape.commit(overlay)
    return result
2769
2770
def draw_circle(
    page: pymupdf.Page,
    center: point_like,
    radius: float,
    color: OptSeq = (0,),
    fill: OptSeq = None,
    morph: OptSeq = None,
    dashes: OptStr = None,
    width: float = 1,
    lineCap: int = 0,
    lineJoin: int = 0,
    overlay: bool = True,
    stroke_opacity: float = 1,
    fill_opacity: float = 1,
    oc: int = 0,
) -> pymupdf.Point:
    """Draw a circle defined by its center and its radius.

    Notes:
        Convenience wrapper: creates a Shape for the page, draws the
        circle, finishes the path and commits the Shape.
    Parameters:
        center: (point-like) center of the circle.
        radius: (float) radius of the circle.
        overlay: passed to Shape.commit().
        All other arguments are passed on to Shape.finish().
    Returns:
        The value returned by Shape.draw_circle().
    """
    shape = page.new_shape()
    result = shape.draw_circle(pymupdf.Point(center), radius)
    shape.finish(
        color=color, fill=fill, width=width, dashes=dashes,
        lineCap=lineCap, lineJoin=lineJoin,
        morph=morph, oc=oc,
        stroke_opacity=stroke_opacity, fill_opacity=fill_opacity,
    )
    shape.commit(overlay)
    return result
2804
2805
def draw_oval(
    page: pymupdf.Page,
    rect: typing.Union[rect_like, quad_like],
    color: OptSeq = (0,),
    fill: OptSeq = None,
    dashes: OptStr = None,
    morph: OptSeq = None,
    width: float = 1,
    lineCap: int = 0,
    lineJoin: int = 0,
    overlay: bool = True,
    stroke_opacity: float = 1,
    fill_opacity: float = 1,
    oc: int = 0,
) -> pymupdf.Point:
    """Draw an oval inside its containing rectangle or quad.

    Notes:
        Convenience wrapper: creates a Shape for the page, draws the oval,
        finishes the path and commits the Shape.
    Parameters:
        rect: (rect-like or quad-like) the containing area, passed
            unchanged to Shape.draw_oval().
        overlay: passed to Shape.commit().
        All other arguments are passed on to Shape.finish().
    Returns:
        The value returned by Shape.draw_oval().
    """
    shape = page.new_shape()
    result = shape.draw_oval(rect)
    shape.finish(
        color=color, fill=fill, width=width, dashes=dashes,
        lineCap=lineCap, lineJoin=lineJoin,
        morph=morph, oc=oc,
        stroke_opacity=stroke_opacity, fill_opacity=fill_opacity,
    )
    shape.commit(overlay)
    return result
2839
2840
def draw_curve(
    page: pymupdf.Page,
    p1: point_like,
    p2: point_like,
    p3: point_like,
    color: OptSeq = (0,),
    fill: OptSeq = None,
    dashes: OptStr = None,
    width: float = 1,
    morph: OptSeq = None,
    closePath: bool = False,
    lineCap: int = 0,
    lineJoin: int = 0,
    overlay: bool = True,
    stroke_opacity: float = 1,
    fill_opacity: float = 1,
    oc: int = 0,
) -> pymupdf.Point:
    """Draw a special Bezier curve from p1 to p3.

    The control points are generated on the lines p1 to p2 and p2 to p3.

    Notes:
        Convenience wrapper: creates a Shape for the page, draws the curve,
        finishes the path and commits the Shape.
    Parameters:
        p1, p2, p3: (point-like) the points defining the curve.
        overlay: passed to Shape.commit().
        All other arguments are passed on to Shape.finish().
    Returns:
        The value returned by Shape.draw_curve().
    """
    shape = page.new_shape()
    result = shape.draw_curve(
        pymupdf.Point(p1),
        pymupdf.Point(p2),
        pymupdf.Point(p3),
    )
    shape.finish(
        color=color, fill=fill, width=width, dashes=dashes,
        lineCap=lineCap, lineJoin=lineJoin, closePath=closePath,
        morph=morph, oc=oc,
        stroke_opacity=stroke_opacity, fill_opacity=fill_opacity,
    )
    shape.commit(overlay)
    return result
2878
2879
def draw_bezier(
    page: pymupdf.Page,
    p1: point_like,
    p2: point_like,
    p3: point_like,
    p4: point_like,
    color: OptSeq = (0,),
    fill: OptSeq = None,
    dashes: OptStr = None,
    width: float = 1,
    morph: OptSeq = None,
    closePath: bool = False,
    lineCap: int = 0,
    lineJoin: int = 0,
    overlay: bool = True,
    stroke_opacity: float = 1,
    fill_opacity: float = 1,
    oc: int = 0,
) -> pymupdf.Point:
    """Draw a general cubic Bezier curve from p1 to p4 using control points p2 and p3.

    Notes:
        Convenience wrapper: creates a Shape for the page, draws the curve,
        finishes the path and commits the Shape.
    Parameters:
        p1, p4: (point-like) start and end point of the curve.
        p2, p3: (point-like) the control points.
        overlay: passed to Shape.commit().
        All other arguments are passed on to Shape.finish().
    Returns:
        The value returned by Shape.draw_bezier().
    """
    img = page.new_shape()
    Q = img.draw_bezier(pymupdf.Point(p1), pymupdf.Point(p2), pymupdf.Point(p3), pymupdf.Point(p4))
    img.finish(
        color=color,
        fill=fill,
        dashes=dashes,
        width=width,
        lineCap=lineCap,
        lineJoin=lineJoin,
        morph=morph,
        closePath=closePath,
        stroke_opacity=stroke_opacity,
        fill_opacity=fill_opacity,
        oc=oc,
    )
    img.commit(overlay)

    return Q
2918
2919
def draw_sector(
    page: pymupdf.Page,
    center: point_like,
    point: point_like,
    beta: float,
    color: OptSeq = (0,),
    fill: OptSeq = None,
    dashes: OptStr = None,
    fullSector: bool = True,
    morph: OptSeq = None,
    width: float = 1,
    closePath: bool = False,
    lineCap: int = 0,
    lineJoin: int = 0,
    overlay: bool = True,
    stroke_opacity: float = 1,
    fill_opacity: float = 1,
    oc: int = 0,
) -> pymupdf.Point:
    """Draw a circle sector.

    The sector is given by the circle center, one arc end point and the
    angle of the arc.

    Notes:
        Convenience wrapper: creates a Shape for the page, draws the
        sector, finishes the path and commits the Shape.
    Parameters:
        center -- center of circle
        point -- arc end point
        beta -- angle of arc (degrees)
        fullSector -- connect arc ends with center
        overlay -- passed to Shape.commit()
        All other arguments are passed on to Shape.finish().
    Returns:
        The value returned by Shape.draw_sector().
    """
    shape = page.new_shape()
    result = shape.draw_sector(
        pymupdf.Point(center),
        pymupdf.Point(point),
        beta,
        fullSector=fullSector,
    )
    shape.finish(
        color=color, fill=fill, width=width, dashes=dashes,
        lineCap=lineCap, lineJoin=lineJoin, closePath=closePath,
        morph=morph, oc=oc,
        stroke_opacity=stroke_opacity, fill_opacity=fill_opacity,
    )
    shape.commit(overlay)
    return result
2965
2966
2967 # ----------------------------------------------------------------------
2968 # Name: wx.lib.colourdb.py
2969 # Purpose: Adds a bunch of colour names and RGB values to the
2970 # colour database so they can be found by name
2971 #
2972 # Author: Robin Dunn
2973 #
2974 # Created: 13-March-2001
2975 # Copyright: (c) 2001-2017 by Total Control Software
2976 # Licence: wxWindows license
2977 # Tags: phoenix-port, unittest, documented
2978 # ----------------------------------------------------------------------
2979
2980
def getColorList() -> list:
    """
    Return the names of all colours of the colour database.

    The names are the first items of the (name, red, green, blue) tuples
    delivered by pymupdf.colors_wx_list().
    :rtype: list of strings
    """
    return [colour_name for colour_name, _r, _g, _b in pymupdf.colors_wx_list()]
2987
2988
def getColorInfoList() -> list:
    """
    Return the colour database as delivered by pymupdf.colors_wx_list().

    This is a list of (name, red, green, blue) tuples, where:
        name: upper-case color name.
        red, green, blue: integers in range 0..255.
    :rtype: list of tuples
    """
    colour_infos = pymupdf.colors_wx_list()
    return colour_infos
2997
2998
def getColor(name: str) -> tuple:
    """Look up an RGB color in PDF format by its name.

    The name is converted to lower case before the lookup.

    Returns:
        a triple of floats in range 0 to 1. In case of name-not-found, "white" is returned.
    """
    white = (1, 1, 1)  # fallback for unknown names
    pdf_colors = pymupdf.colors_pdf_dict()
    return pdf_colors.get(name.lower(), white)
3006
3007
def getColorHSV(name: str) -> tuple:
    """Compute the hue, saturation, value triple of a named color.

    The name is converted to upper case and searched in getColorList().

    Returns:
        a triple (degree, percent, percent). If not found (-1, -1, -1) is returned.
        Hue and saturation are integers, value is rounded to one decimal.
    """
    try:
        infos = getColorInfoList()
        position = getColorList().index(name.upper())
        entry = infos[position]
    except Exception:
        if g_exceptions_verbose:
            pymupdf.exception_info()
        return (-1, -1, -1)

    # color components scaled to range 0..1
    red, green, blue = (entry[k] / 255.0 for k in (1, 2, 3))
    brightest = max(red, green, blue)
    darkest = min(red, green, blue)
    spread = brightest - darkest

    # hue: depends on which component is the largest one
    if spread == 0:
        hue = 0
    elif brightest == red:
        hue = 60.0 * (((green - blue) / spread) % 6)
    elif brightest == green:
        hue = 60.0 * (((blue - red) / spread) + 2)
    else:
        hue = 60.0 * (((red - green) / spread) + 4)

    # saturation: relative spread, 0 for black
    saturation = 0 if brightest == 0 else spread / brightest

    H = int(round(hue))
    S = int(round(saturation * 100))
    V = round(brightest * 100, 1)
    return (H, S, V)
3045
3046
def _get_font_properties(doc: pymupdf.Document, xref: int) -> tuple:
    """Determine name, type and ascender / descender values of a font.

    Args:
        doc: the document containing the font.
        xref: xref number passed to doc.extract_font().

    Returns:
        The tuple (fontname, ext, stype, asc, dsc). The first three items
        are taken from doc.extract_font(), 'asc' and 'dsc' are the ascender
        and descender values. They start out as 0.8 and -0.2 and are
        replaced by the values of a pymupdf.Font if one can be created.
    """
    fontname, ext, stype, buffer = doc.extract_font(xref)
    # start values, used whenever no font object can be consulted
    asc = 0.8
    dsc = -0.2
    if ext == "":  # empty extension: return the start values unchanged
        return fontname, ext, stype, asc, dsc

    if buffer:  # font binary is available: create the font from it
        try:
            font = pymupdf.Font(fontbuffer=buffer)
            asc = font.ascender
            dsc = font.descender
            bbox = font.bbox
            if asc - dsc < 1:
                # Total height is less than 1: lower the descender to the
                # bbox bottom if that is smaller, then set the ascender
                # such that asc - dsc equals 1.
                if bbox.y0 < dsc:
                    dsc = bbox.y0
                asc = 1 - dsc
        except Exception:
            pymupdf.exception_info()
            # Scale whatever values are current by 1.2. Depending on where
            # the exception happened, these are the start values or values
            # already taken from the font.
            asc *= 1.2
            dsc *= 1.2
        return fontname, ext, stype, asc, dsc
    if ext != "n/a":  # no font binary: try to create the font by its name
        try:
            font = pymupdf.Font(fontname)
            asc = font.ascender
            dsc = font.descender
        except Exception:
            pymupdf.exception_info()
            # same fallback as above: scale the current values by 1.2
            asc *= 1.2
            dsc *= 1.2
    else:
        # extension "n/a": use the start values scaled by 1.2
        asc *= 1.2
        dsc *= 1.2
    return fontname, ext, stype, asc, dsc
3082
3083
def get_char_widths(
    doc: pymupdf.Document, xref: int, limit: int = 256, idx: int = 0, fontdict: OptDict = None
) -> list:
    """Return the glyph / width list of a font given by its xref.

    Notes:
        Fonts already dealt with are recorded in doc.FontInfos; an unknown
        font gets a new entry appended there.
        The result is a list of (glyph, width) items: glyph is an integer
        controlling the character appearance, width a float controlling its
        spacing (width * fontsize is the actual space).
        For 'simple' fonts, glyph == ord(char) will usually be true.
        'Symbol' and 'ZapfDingbats' are exceptions: their data are provided
        directly. For CJK fonts (ordering >= 0) None is returned.

    Args:
        doc: the document containing the font.
        xref: the font's xref number.
        limit: number of glyphs wanted; at least 256 are always used.
        idx: passed through to doc._get_char_widths.
        fontdict: optional font properties used when the font has not been
            recorded yet.

    Raises:
        ValueError: if the font extension is empty ("xref is not a font").
    """
    # font names per CJK ordering; the tuple position is the ordering value
    cjk_families = (
        ("Fangti", "Ming"),
        ("Heiti", "Song"),
        ("Gothic", "Mincho"),
        ("Dotum", "Batang"),
    )

    fontinfo = pymupdf.CheckFontInfo(doc, xref)
    if fontinfo is not None:  # font already recorded
        fontdict = fontinfo[1]
        glyphs = fontdict["glyphs"]
        simple = fontdict["simple"]
        ordering = fontdict["ordering"]
    else:  # not recorded yet: create the entry
        if fontdict is None:
            name, ext, stype, asc, dsc = _get_font_properties(doc, xref)
            fontdict = {
                "name": name,
                "type": stype,
                "ext": ext,
                "ascender": asc,
                "descender": dsc,
            }
        else:
            name = fontdict["name"]
            ext = fontdict["ext"]
            stype = fontdict["type"]
            ordering = fontdict["ordering"]
            simple = fontdict["simple"]

        if ext == "":
            raise ValueError("xref is not a font")

        # 'simple' fonts are recognized by their type
        simple = stype in ("Type1", "MMType1", "TrueType")

        # CJK fonts are recognized by their name
        for cjk_code, family in enumerate(cjk_families):
            if name in family:
                ordering = cjk_code
                break
        else:
            ordering = -1

        fontdict["simple"] = simple

        # the two special fonts come with predefined glyph tables
        if name == "ZapfDingbats":
            glyphs = pymupdf.zapf_glyphs
        elif name == "Symbol":
            glyphs = pymupdf.symbol_glyphs
        else:
            glyphs = None

        fontdict["glyphs"] = glyphs
        fontdict["ordering"] = ordering
        fontinfo = [xref, fontdict]
        doc.FontInfos.append(fontinfo)

    known = 0 if glyphs is None else len(glyphs)
    wanted = max(256, limit)
    if wanted <= known:  # stored table is already large enough
        return glyphs

    if ordering < 0:  # not a CJK font
        glyphs = doc._get_char_widths(
            xref, fontdict["name"], fontdict["ext"], fontdict["ordering"], wanted, idx
        )
    else:  # CJK fonts use char codes and width = 1
        glyphs = None

    fontdict["glyphs"] = glyphs
    fontinfo[1] = fontdict
    pymupdf.UpdateFontInfo(doc, fontinfo)

    return glyphs
3180
3181
3182 class Shape:
3183 """Create a new shape."""
3184
@staticmethod
def horizontal_angle(C, P):
    """Return the angle between the horizontal and the line from C to P.

    The absolute angle is taken from the arcus sine of the unit vector's
    y component. Because arcsin alone is ambiguous, the signs of both
    components of that vector decide which of the four possible angles
    is returned.
    """
    direction = pymupdf.Point(P - C).unit  # unit vector 'C' -> 'P'
    angle = math.asin(abs(direction.y))  # absolute angle from horizontal

    if direction.x < 0:  # left half plane
        if direction.y <= 0:
            return -(math.pi - angle)
        return math.pi - angle

    # right half plane
    if direction.y >= 0:
        return angle
    return -angle
3204
def __init__(self, page: pymupdf.Page):
    """Set up an empty shape for the given page.

    Args:
        page: the page to draw on; its parent document must be a PDF.

    Raises:
        ValueError: if the page's document is not a PDF.
    """
    pymupdf.CheckParent(page)
    self.page = page
    self.doc = page.parent
    if not self.doc.is_pdf:
        raise ValueError("is no PDF")

    # page geometry
    mediabox_size = page.mediabox_size
    self.height = mediabox_size.y
    self.width = mediabox_size.x
    cropbox_position = page.cropbox_position
    self.x = cropbox_position.x
    self.y = cropbox_position.y

    # page transformation matrix and its inverse
    self.pctm = page.transformation_matrix
    self.ipctm = ~self.pctm

    # accumulated content strings and drawing state
    self.draw_cont = self.text_cont = self.totalcont = ""
    self.last_point = self.rect = None
3224
def updateRect(self, x):
    """Extend the shape's rectangle so that it also covers 'x'.

    Args:
        x: a sequence of length 2 is treated as a point, anything else is
            converted to a rectangle.
    """
    is_point = len(x) == 2

    if self.rect is None:  # first item: it defines the rectangle
        self.rect = pymupdf.Rect(x, x) if is_point else pymupdf.Rect(x)
        return

    # determine the extent of the new item ...
    if is_point:
        pt = pymupdf.Point(x)
        left, top, right, bottom = pt.x, pt.y, pt.x, pt.y
    else:
        box = pymupdf.Rect(x)
        left, top, right, bottom = box.x0, box.y0, box.x1, box.y1

    # ... and enlarge the current rectangle where necessary
    self.rect.x0 = min(self.rect.x0, left)
    self.rect.y0 = min(self.rect.y0, top)
    self.rect.x1 = max(self.rect.x1, right)
    self.rect.y1 = max(self.rect.y1, bottom)
3245
def draw_line(self, p1: point_like, p2: point_like) -> pymupdf.Point:
    """Draw a straight line from p1 to p2 and return p2 as a Point."""
    start = pymupdf.Point(p1)
    end = pymupdf.Point(p2)

    def operands(pt):
        # coordinates of 'pt' after applying the inverted page matrix
        return _format_g(pymupdf.JM_TUPLE(pt * self.ipctm))

    # a "move" is only required if the path does not already end at 'start'
    if not (self.last_point == start):
        self.draw_cont += operands(start) + " m\n"
        self.last_point = start
        self.updateRect(start)

    self.draw_cont += operands(end) + " l\n"
    self.updateRect(end)
    self.last_point = end
    return end
3259
def draw_polyline(self, points: list) -> pymupdf.Point:
    """Connect the given points by line segments; return the last point."""

    def operands(pt):
        # coordinates of 'pt' after applying the inverted page matrix
        return _format_g(pymupdf.JM_TUPLE(pt * self.ipctm))

    for index, raw in enumerate(points):
        vertex = pymupdf.Point(raw)
        if index > 0:
            self.draw_cont += operands(vertex) + " l\n"
        elif not (self.last_point == vertex):
            # first point: move there unless the path already ends on it
            self.draw_cont += operands(vertex) + " m\n"
            self.last_point = vertex
        self.updateRect(raw)

    self.last_point = pymupdf.Point(points[-1])
    return self.last_point
3273
def draw_bezier(
    self,
    p1: point_like,
    p2: point_like,
    p3: point_like,
    p4: point_like,
) -> pymupdf.Point:
    """Draw a cubic Bezier curve from p1 to p4 with control points p2, p3.

    Returns:
        p4 as a Point, which also becomes the shape's last point.
    """
    start, ctrl1, ctrl2, end = (pymupdf.Point(p) for p in (p1, p2, p3, p4))

    # a "move" is only required if the path does not already end at 'start'
    if not (self.last_point == start):
        self.draw_cont += _format_g(pymupdf.JM_TUPLE(start * self.ipctm)) + " m\n"

    # the curve operator takes the six coordinates of the remaining points
    coords = []
    for pt in (ctrl1, ctrl2, end):
        coords.extend(pt * self.ipctm)
    self.draw_cont += _format_g(pymupdf.JM_TUPLE(coords)) + " c\n"

    for pt in (start, ctrl1, ctrl2, end):
        self.updateRect(pt)

    self.last_point = end
    return end
3296
def draw_oval(self, tetra: typing.Union[quad_like, rect_like]) -> pymupdf.Point:
    """Draw an ellipse inside a quadrilateral or rectangle.

    Args:
        tetra: a sequence of length 4. If its first item is a number, the
            argument is taken as a rectangle, otherwise as a quad.

    Returns:
        The middle of the left edge, where the curve starts and ends.

    Raises:
        ValueError: if the argument does not have length 4.
    """
    if len(tetra) != 4:
        raise ValueError("invalid arg length")

    if hasattr(tetra[0], "__float__"):
        quad = pymupdf.Rect(tetra).quad
    else:
        quad = pymupdf.Quad(tetra)

    def midpoint(a, b):
        return a + (b - a) * 0.5

    # the four edge middles are the points where the ellipse touches
    top = midpoint(quad.ul, quad.ur)
    right = midpoint(quad.ur, quad.lr)
    bottom = midpoint(quad.ll, quad.lr)
    left = midpoint(quad.ul, quad.ll)

    if not (self.last_point == left):
        self.draw_cont += _format_g(pymupdf.JM_TUPLE(left * self.ipctm)) + " m\n"
        self.last_point = left

    # one curve per quadrant, each using a corner as its helper point
    quadrants = (
        (left, quad.ll, bottom),
        (bottom, quad.lr, right),
        (right, quad.ur, top),
        (top, quad.ul, left),
    )
    for begin, corner, finish in quadrants:
        self.draw_curve(begin, corner, finish)

    self.updateRect(quad.rect)
    self.last_point = left
    return self.last_point
3320
def draw_circle(self, center: point_like, radius: float) -> pymupdf.Point:
    """Draw a circle around 'center' with the given radius.

    The circle is drawn as a 360 degree sector without the connecting
    lines to the center.

    Raises:
        ValueError: if the radius does not exceed pymupdf.EPSILON.
    """
    if not radius > pymupdf.EPSILON:
        raise ValueError("radius must be positive")
    middle = pymupdf.Point(center)
    start = middle - (radius, 0)  # point on the circle left of the center
    return self.draw_sector(middle, start, 360, fullSector=False)
3328
def draw_curve(
    self,
    p1: point_like,
    p2: point_like,
    p3: point_like,
) -> pymupdf.Point:
    """Draw a curve from p1 to p3 using p2 as the single helper point.

    The two Bezier control points are placed on the lines p1 -> p2 and
    p3 -> p2, each at the fixed fraction 'kappa' of the way towards p2.
    """
    kappa = 0.55228474983
    start, helper, end = (pymupdf.Point(p) for p in (p1, p2, p3))
    ctrl1 = start + (helper - start) * kappa
    ctrl2 = end + (helper - end) * kappa
    return self.draw_bezier(start, ctrl1, ctrl2, end)
3343
def draw_sector(
    self,
    center: point_like,
    point: point_like,
    beta: float,
    fullSector: bool = True,
) -> pymupdf.Point:
    """Draw a circular arc, optionally closed to a circle sector.

    The arc starts at 'point', has its center at 'center' and spans
    'beta' degrees. It is written as a sequence of cubic Bezier curves:
    pieces of 90 degrees each, followed by one curve for the remainder.

    Args:
        center: the center of the circle.
        point: the start point of the arc; its distance from 'center' is
            the radius.
        beta: the angle in degrees. Its sign determines the direction in
            which the arc is drawn.
        fullSector: if true, lines from the start point to the center and
            from there to the end of the arc are added.

    Returns:
        The end point of the arc, which also becomes the last point of
        the shape. If the angle is too small for any arc to be drawn,
        this is the start point.

    Raises:
        ValueError: if 'center' and 'point' are not more than
            pymupdf.EPSILON apart. The shape is left unchanged then.
    """
    center = pymupdf.Point(center)
    point = pymupdf.Point(point)

    def move_to(x, y):
        return _format_g((x, y)) + " m\n"

    def curve_to(a, b, c, d, e, f):
        return _format_g((a, b, c, d, e, f)) + " c\n"

    def line_to(x, y):
        return _format_g((x, y)) + " l\n"

    betar = math.radians(-beta)
    w360 = math.radians(math.copysign(360, betar)) * (-1)
    w90 = math.radians(math.copysign(90, betar))
    w45 = w90 / 2
    while abs(betar) > 2 * math.pi:
        betar += w360  # bring angle below 360 degrees

    C = center
    P = point
    S = P - C  # vector 'center' -> 'point'
    rad = abs(S)  # circle radius

    # Check the radius before anything is written, so that a failing call
    # does not leave a stray "move" operator in the drawing commands.
    if not rad > pymupdf.EPSILON:
        raise ValueError("radius must be positive")

    if not (self.last_point == point):
        self.draw_cont += move_to(*pymupdf.JM_TUPLE(point * self.ipctm))
        self.last_point = point

    # End point of the arc. Starting out with 'point' ensures sensible
    # output and a sensible return value if no arc piece is drawn at all.
    Q = point

    alfa = self.horizontal_angle(center, point)
    while abs(betar) > abs(w90):  # draw 90 degree arcs
        q1 = C.x + math.cos(alfa + w90) * rad
        q2 = C.y + math.sin(alfa + w90) * rad
        Q = pymupdf.Point(q1, q2)  # the arc's end point
        r1 = C.x + math.cos(alfa + w45) * rad / math.cos(w45)
        r2 = C.y + math.sin(alfa + w45) * rad / math.cos(w45)
        R = pymupdf.Point(r1, r2)  # crossing point of tangents
        kappah = (1 - math.cos(w45)) * 4 / 3 / abs(R - Q)
        kappa = kappah * abs(P - Q)
        cp1 = P + (R - P) * kappa  # control point 1
        cp2 = Q + (R - Q) * kappa  # control point 2
        self.draw_cont += curve_to(*pymupdf.JM_TUPLE(
            list(cp1 * self.ipctm) + list(cp2 * self.ipctm) + list(Q * self.ipctm)
        ))

        betar -= w90  # reduce param angle by 90 deg
        alfa += w90  # advance start angle by 90 deg
        P = Q  # advance to arc end point

    # draw (remaining) arc
    if abs(betar) > 1e-3:  # significant degrees left?
        beta2 = betar / 2
        q1 = C.x + math.cos(alfa + betar) * rad
        q2 = C.y + math.sin(alfa + betar) * rad
        Q = pymupdf.Point(q1, q2)  # the arc's end point
        r1 = C.x + math.cos(alfa + beta2) * rad / math.cos(beta2)
        r2 = C.y + math.sin(alfa + beta2) * rad / math.cos(beta2)
        R = pymupdf.Point(r1, r2)  # crossing point of tangents
        # kappa height is 4/3 of segment height
        kappah = (1 - math.cos(beta2)) * 4 / 3 / abs(R - Q)  # kappa height
        kappa = kappah * abs(P - Q) / (1 - math.cos(betar))
        cp1 = P + (R - P) * kappa  # control point 1
        cp2 = Q + (R - Q) * kappa  # control point 2
        self.draw_cont += curve_to(*pymupdf.JM_TUPLE(
            list(cp1 * self.ipctm) + list(cp2 * self.ipctm) + list(Q * self.ipctm)
        ))

    if fullSector:
        # add the two straight edges: start point -> center -> arc end
        self.draw_cont += move_to(*pymupdf.JM_TUPLE(point * self.ipctm))
        self.draw_cont += line_to(*pymupdf.JM_TUPLE(center * self.ipctm))
        self.draw_cont += line_to(*pymupdf.JM_TUPLE(Q * self.ipctm))

    self.last_point = Q
    return self.last_point
3417
def draw_rect(self, rect: rect_like, *, radius=None) -> pymupdf.Point:
    """Draw a rectangle, optionally with rounded corners.

    Args:
        rect: the rectangle.
        radius: if not None, the rectangle will have rounded corners.
            This is the radius of the curvature, given as percentage of
            the rectangle width or height. Valid are values 0 < v <= 0.5.
            For a sequence of two values, the corners will have different
            radii. Otherwise, the percentage will be computed from the
            shorter side. A value of (0.5, 0.5) will draw an ellipse.

    Returns:
        The last point of the drawing: the top-left corner for a standard
        rectangle, otherwise the end point of the last corner curve.

    Raises:
        ValueError: if 'radius' is neither None, nor a valid number, nor a
            sequence of two valid numbers.
    """
    r = pymupdf.Rect(rect)

    if radius is None:  # standard rectangle
        operands = list(r.bl * self.ipctm) + [r.width, r.height]
        self.draw_cont += _format_g(pymupdf.JM_TUPLE(operands)) + " re\n"
        self.updateRect(r)
        self.last_point = r.tl
        return self.last_point

    # Rounded corners: determine the horizontal and vertical distances of
    # the curve end points from each corner.
    if hasattr(radius, "__float__"):
        if radius <= 0 or radius > 0.5:
            raise ValueError(f"bad radius value {radius}.")
        dist = min(r.width, r.height) * radius
        px = (dist, 0)
        py = (0, dist)
    elif hasattr(radius, "__len__") and len(radius) == 2:
        rx, ry = radius
        px = (rx * r.width, 0)
        py = (0, ry * r.height)
        if min(rx, ry) <= 0 or max(rx, ry) > 0.5:
            raise ValueError(f"bad radius value {radius}.")
    else:
        raise ValueError(f"bad radius value {radius}.")

    # Every side is a straight line followed by the curve around the next
    # corner: (end of line, corner, end of curve).
    sides = (
        (r.bl - py, r.bl, r.bl + px),
        (r.br - px, r.br, r.br - py),
        (r.tr + py, r.tr, r.tr - px),
        (r.tl + px, r.tl, r.tl + py),
    )
    cursor = r.tl + py
    for line_end, corner, curve_end in sides:
        cursor = self.draw_line(cursor, line_end)
        cursor = self.draw_curve(cursor, corner, curve_end)
    self.last_point = cursor

    self.updateRect(r)
    return self.last_point
3468
def draw_quad(self, quad: quad_like) -> pymupdf.Point:
    """Draw the outline of a quad as a closed polyline.

    The corners are visited in the order upper-left, lower-left,
    lower-right, upper-right and back to upper-left.
    """
    q = pymupdf.Quad(quad)
    corners = [q.ul, q.ll, q.lr, q.ur, q.ul]
    return self.draw_polyline(corners)
3473
def draw_zigzag(
    self,
    p1: point_like,
    p2: point_like,
    breadth: float = 2,
) -> pymupdf.Point:
    """Draw a zig-zag line from p1 to p2.

    Args:
        p1: start point.
        p2: end point.
        breadth: requested size of the zig-zag steps. It is revised such
            that a whole number of full phases fits between the points.

    Returns:
        p2 as a Point.

    Raises:
        ValueError: if not even one full phase fits between the points.
    """
    start = pymupdf.Point(p1)
    end = pymupdf.Point(p2)
    distance = abs(end - start)
    steps = 4 * int(round(distance / (4 * breadth), 0))  # always take full phases
    if steps < 4:
        raise ValueError("points too close")
    step = distance / steps  # revised breadth

    # The edges are computed for a line on the x-axis and then mapped back
    # to the original position.
    to_axis = pymupdf.Matrix(pymupdf.util_hor_matrix(start, end))
    from_axis = ~to_axis

    edges = []
    for i in range(1, steps, 2):  # only odd positions carry an edge
        side = -1 if i % 4 == 1 else 1  # alternate between both sides
        edges.append(pymupdf.Point(i, side) * step * from_axis)

    self.draw_polyline([start] + edges + [end])
    return end
3502
def draw_squiggle(
    self,
    p1: point_like,
    p2: point_like,
    breadth=2,
) -> pymupdf.Point:
    """Draw a wavy line from p1 to p2.

    Args:
        p1: start point.
        p2: end point.
        breadth: requested size of the wave steps. It is revised such that
            a whole number of full phases fits between the points.

    Returns:
        p2 as a Point.

    Raises:
        ValueError: if not even one full phase fits between the points.
    """
    start = pymupdf.Point(p1)
    end = pymupdf.Point(p2)
    distance = abs(end - start)
    steps = 4 * int(round(distance / (4 * breadth), 0))  # always take full phases
    if steps < 4:
        raise ValueError("points too close")
    step = distance / steps  # revised breadth

    # The points are computed for a line on the x-axis and then mapped
    # back to the original position.
    to_axis = pymupdf.Matrix(pymupdf.util_hor_matrix(start, end))
    from_axis = ~to_axis
    k = 2.4142135623765633  # y of draw_curve helper point

    # helper points alternate between both sides, all others are on the line
    y_by_phase = {1: -k, 3: k}
    nodes = [start]
    for i in range(1, steps):
        nodes.append(pymupdf.Point(i, y_by_phase.get(i % 4, 0)) * step * from_axis)
    nodes.append(end)

    # every curve uses three consecutive points and ends where the next starts
    for first in range(0, len(nodes) - 2, 2):
        self.draw_curve(nodes[first], nodes[first + 1], nodes[first + 2])
    return end
3539
3540 # ==============================================================================
3541 # Shape.insert_text
3542 # ==============================================================================
def insert_text(
    self,
    point: point_like,
    buffer: typing.Union[str, list],
    *,
    fontsize: float = 11,
    lineheight: OptFloat = None,
    fontname: str = "helv",
    fontfile: OptStr = None,
    set_simple: bool = 0,
    encoding: int = 0,
    color: OptSeq = None,
    fill: OptSeq = None,
    render_mode: int = 0,
    border_width: float = 0.05,
    miter_limit: float = 1,
    rotate: int = 0,
    morph: OptSeq = None,
    stroke_opacity: float = 1,
    fill_opacity: float = 1,
    oc: int = 0,
) -> int:
    """Insert text lines starting at a given point.

    The generated commands are appended to 'self.text_cont'. The first line
    is always written; following lines are written as long as the space
    computed for the chosen rotation is sufficient.

    Args:
        point: the position where the first character starts.
        buffer: the text. A string is split into its lines, a list or
            tuple is taken as the lines to write.
        fontsize: font size.
        lineheight: factor that is multiplied with the font size to give
            the line height. If omitted, the factor is computed from the
            font's ascender and descender values, or 1.2 is used if their
            difference does not exceed 1.
        fontname: font reference name; a leading "/" is removed.
        fontfile: passed to the page's 'insert_font' method.
        set_simple: passed to the page's 'insert_font' method.
        encoding: passed to the page's 'insert_font' method.
        color: stroke color.
        fill: fill color. If omitted and 'render_mode' is 0, 'color' is
            used.
        render_mode: text rendering mode; written only if greater than 0.
        border_width: multiplied with the font size to give the line
            width; written only if 'render_mode' is greater than 0.
        miter_limit: miter limit; written only if 'render_mode' is greater
            than 0 and the value is not None.
        rotate: text rotation, must be a multiple of 90.
        morph: if accepted by 'pymupdf.CheckMorph', item 0 is used as a
            point and item 1 as a matrix to build a transformation around
            that point.
        stroke_opacity: passed to the page's '_set_opacity' method.
        fill_opacity: passed to the page's '_set_opacity' method.
        oc: passed to the page's '_get_optional_content' method; if that
            returns a name, the text is enclosed in a marked content
            section referring to it.

    Returns:
        The number of lines written. This is 0 if the buffer is empty or
        if its maximum character code cannot be determined.

    Raises:
        ValueError: if 'rotate' is not a multiple of 90.
    """

    # ensure 'text' is a list of strings, worth dealing with
    if not bool(buffer):
        return 0

    if type(buffer) not in (list, tuple):
        text = buffer.splitlines()
    else:
        text = buffer

    if not len(text) > 0:
        return 0

    point = pymupdf.Point(point)
    try:
        # largest character code occurring in the text; this fails e.g. if
        # the text consists of empty lines only
        maxcode = max([ord(c) for c in " ".join(text)])
    except Exception:
        pymupdf.exception_info()
        return 0

    # ensure valid 'fontname'
    fname = fontname
    if fname.startswith("/"):
        fname = fname[1:]

    xref = self.page.insert_font(
        fontname=fname, fontfile=fontfile, encoding=encoding, set_simple=set_simple
    )
    fontinfo = pymupdf.CheckFontInfo(self.doc, xref)

    # properties recorded for the font
    fontdict = fontinfo[1]
    ordering = fontdict["ordering"]
    simple = fontdict["simple"]
    bfname = fontdict["name"]
    ascender = fontdict["ascender"]
    descender = fontdict["descender"]

    # line height: explicit factor, or derived from the font properties
    if lineheight:
        lheight = fontsize * lineheight
    elif ascender - descender <= 1:
        lheight = fontsize * 1.2
    else:
        lheight = fontsize * (ascender - descender)

    # character codes above 255 need a correspondingly large glyph table
    if maxcode > 255:
        glyphs = self.doc.get_char_widths(xref, maxcode + 1)
    else:
        glyphs = fontdict["glyphs"]

    # convert every line to the string used with the text showing operator
    tab = []
    for t in text:
        # simple fonts other than the two special ones need no glyph table
        if simple and bfname not in ("Symbol", "ZapfDingbats"):
            g = None
        else:
            g = glyphs
        tab.append(pymupdf.getTJstr(t, g, simple, ordering))
    text = tab

    color_str = pymupdf.ColorCode(color, "c")
    fill_str = pymupdf.ColorCode(fill, "f")
    if not fill and render_mode == 0:  # ensure fill color when 0 Tr
        fill = color
        fill_str = pymupdf.ColorCode(color, "f")

    morphing = pymupdf.CheckMorph(morph)
    rot = rotate
    if rot % 90 != 0:
        raise ValueError("bad rotate value")

    # normalize the rotation to one of 0, 90, 180, 270
    while rot < 0:
        rot += 360
    rot = rot % 360  # text rotate = 0, 90, 270, 180

    # templ1: start of the text object up to the font selection
    # templ2: shows the first line and sets the distance to the next line
    templ1 = lambda a, b, c, d, e, f, g: f"\nq\n{a}{b}BT\n{c}1 0 0 1 {_format_g((d, e))} Tm\n/{f} {_format_g(g)} Tf "
    templ2 = lambda a: f"TJ\n0 -{_format_g(a)} TD\n"
    cmp90 = "0 1 -1 0 0 0 cm\n"  # rotates 90 deg counter-clockwise
    cmm90 = "0 -1 1 0 0 0 cm\n"  # rotates 90 deg clockwise
    cm180 = "-1 0 0 -1 0 0 cm\n"  # rotates by 180 deg.
    height = self.height
    width = self.width

    # setting up for standard rotation directions
    # case rotate = 0
    if morphing:
        # apply the morph matrix relative to the morph point
        m1 = pymupdf.Matrix(1, 0, 0, 1, morph[0].x + self.x, height - morph[0].y - self.y)
        mat = ~m1 * morph[1] * m1
        cm = _format_g(pymupdf.JM_TUPLE(mat)) + " cm\n"
    else:
        cm = ""
    top = height - point.y - self.y  # start of 1st char
    left = point.x + self.x  # start of 1. char
    space = top  # space available
    #headroom = point.y + self.y  # distance to page border
    if rot == 90:
        left = height - point.y - self.y
        top = -point.x - self.x
        cm += cmp90
        space = width - abs(top)
        #headroom = point.x + self.x

    elif rot == 270:
        left = -height + point.y + self.y
        top = point.x + self.x
        cm += cmm90
        space = abs(top)
        #headroom = width - point.x - self.x

    elif rot == 180:
        left = -point.x - self.x
        top = -height + point.y + self.y
        cm += cm180
        space = abs(point.y + self.y)
        #headroom = height - point.y - self.y

    # optional content: enclose the text in a marked content section
    optcont = self.page._get_optional_content(oc)
    if optcont is not None:
        bdc = "/OC /%s BDC\n" % optcont
        emc = "EMC\n"
    else:
        bdc = emc = ""

    # opacity: refer to the graphics state returned by the page, if any
    alpha = self.page._set_opacity(CA=stroke_opacity, ca=fill_opacity)
    if alpha is None:
        alpha = ""
    else:
        alpha = "/%s gs\n" % alpha
    nres = templ1(bdc, alpha, cm, left, top, fname, fontsize)

    if render_mode > 0:
        nres += "%i Tr " % render_mode
        nres += _format_g(border_width * fontsize) + " w "
        if miter_limit is not None:
            nres += _format_g(miter_limit) + " M "
    if color is not None:
        nres += color_str
    if fill is not None:
        nres += fill_str

    # =========================================================================
    # start text insertion
    # =========================================================================
    nres += text[0]
    nlines = 1  # set output line counter
    if len(text) > 1:
        nres += templ2(lheight)  # line 1
    else:
        nres += 'TJ'
    for i in range(1, len(text)):
        if space < lheight:
            break  # no space left on page
        if i > 1:
            nres += "\nT* "  # move to the start of the next line
        nres += text[i] + 'TJ'
        space -= lheight
        nlines += 1

    nres += "\nET\n%sQ\n" % emc

    # =========================================================================
    # end of text insertion
    # =========================================================================
    # update the /Contents object
    self.text_cont += nres
    return nlines
3728
3729 # ==============================================================================
3730 # Shape.insert_textbox
3731 # ==============================================================================
def insert_textbox(
    self,
    rect: rect_like,
    buffer: typing.Union[str, list],
    *,
    fontname: OptStr = "helv",
    fontfile: OptStr = None,
    fontsize: float = 11,
    lineheight: OptFloat = None,
    set_simple: bool = 0,
    encoding: int = 0,
    color: OptSeq = None,
    fill: OptSeq = None,
    expandtabs: int = 1,
    border_width: float = 0.05,
    miter_limit: float = 1,
    align: int = 0,
    render_mode: int = 0,
    rotate: int = 0,
    morph: OptSeq = None,
    stroke_opacity: float = 1,
    fill_opacity: float = 1,
    oc: int = 0,
) -> float:
    """Insert text into a given rectangle.

    The text is broken into lines that fit the rectangle width. Words too
    long for a line are split character by character. Output is only
    generated (and appended to 'self.text_cont') if all lines fit into
    the rectangle.

    Args:
        rect -- the textbox to fill
        buffer -- text to be inserted: a string, or a list / tuple of
            strings which are joined by line breaks
        fontname -- a Base-14 font, font name or '/name'
        fontfile -- name of a font file
        fontsize -- font size
        lineheight -- overwrite the font property
        set_simple -- passed to the page's 'insert_font' method
        encoding -- passed to the page's 'insert_font' method
        color -- RGB stroke color triple
        fill -- RGB fill color triple; if None and render_mode is 0,
            'color' is used
        expandtabs -- handles tabulators with string function
        border_width -- thickness of glyph borders as percentage of fontsize
        miter_limit -- miter limit, written together with border_width if
            render_mode is greater than 0 and the value is not None
        align -- 0 = left, 1 = center, 2 = right, 3 = justified
        render_mode -- text rendering control
        rotate -- 0, 90, 180, or 270 degrees
        morph -- morph box with a matrix and a fixpoint
        stroke_opacity -- passed to the page's '_set_opacity' method
        fill_opacity -- passed to the page's '_set_opacity' method
        oc -- passed to the page's '_get_optional_content' method; if that
            returns a name, the text is enclosed in a marked content
            section referring to it
    Returns:
        A float. If not negative, the text has been written and the value
        is the unused space of the rectangle in the direction of line
        progress. If negative, the text did not fit, nothing has been
        written and the value is the missing space. If there is no text,
        the complete extent of the rectangle in that direction (height or
        width, depending on the rotation) is returned.
    Raises:
        ValueError -- if the rectangle is empty or infinite, or if
            'rotate' is not a multiple of 90
    """
    rect = pymupdf.Rect(rect)
    if rect.is_empty or rect.is_infinite:
        raise ValueError("text box must be finite and not empty")

    color_str = pymupdf.ColorCode(color, "c")
    fill_str = pymupdf.ColorCode(fill, "f")
    if fill is None and render_mode == 0:  # ensure fill color for 0 Tr
        fill = color
        fill_str = pymupdf.ColorCode(color, "f")

    # optional content: enclose the text in a marked content section
    optcont = self.page._get_optional_content(oc)
    if optcont is not None:
        bdc = "/OC /%s BDC\n" % optcont
        emc = "EMC\n"
    else:
        bdc = emc = ""

    # determine opacity / transparency
    alpha = self.page._set_opacity(CA=stroke_opacity, ca=fill_opacity)
    if alpha is None:
        alpha = ""
    else:
        alpha = "/%s gs\n" % alpha

    if rotate % 90 != 0:
        raise ValueError("rotate must be multiple of 90")

    # normalize the rotation to one of 0, 90, 180, 270
    rot = rotate
    while rot < 0:
        rot += 360
    rot = rot % 360

    # Bring the text into the form of one string. This happens before the
    # "worth dealing with" check, so that a list containing nothing but an
    # empty string is recognized as "no text" too. Otherwise it would let
    # the maximum character code computation below fail.
    if isinstance(buffer, (list, tuple)):
        t0 = "\n".join(buffer)
    else:
        t0 = buffer

    # is buffer worth of dealing with?
    if not t0:
        return rect.height if rot in (0, 180) else rect.width

    cmp90 = "0 1 -1 0 0 0 cm\n"  # rotates counter-clockwise
    cmm90 = "0 -1 1 0 0 0 cm\n"  # rotates clockwise
    cm180 = "-1 0 0 -1 0 0 cm\n"  # rotates by 180 deg.
    height = self.height

    # ensure valid 'fontname'
    fname = fontname
    if fname.startswith("/"):
        fname = fname[1:]

    xref = self.page.insert_font(
        fontname=fname, fontfile=fontfile, encoding=encoding, set_simple=set_simple
    )
    fontinfo = pymupdf.CheckFontInfo(self.doc, xref)

    # properties recorded for the font
    fontdict = fontinfo[1]
    ordering = fontdict["ordering"]
    simple = fontdict["simple"]
    glyphs = fontdict["glyphs"]
    bfname = fontdict["name"]
    ascender = fontdict["ascender"]
    descender = fontdict["descender"]

    # line height: explicit factor, or derived from the font properties
    if lineheight:
        lheight_factor = lineheight
    elif ascender - descender <= 1:
        lheight_factor = 1.2
    else:
        lheight_factor = ascender - descender
    lheight = fontsize * lheight_factor

    maxcode = max([ord(c) for c in t0])
    # replace invalid char codes for simple fonts
    if simple and maxcode > 255:
        t0 = "".join([c if ord(c) < 256 else "?" for c in t0])

    # create a list from the text, split into its lines
    t0 = t0.splitlines()

    glyphs = self.doc.get_char_widths(xref, maxcode + 1)
    # simple fonts other than the two special ones need no glyph table
    if simple and bfname not in ("Symbol", "ZapfDingbats"):
        tj_glyphs = None
    else:
        tj_glyphs = glyphs

    # ----------------------------------------------------------------------
    # calculate pixel length of a string
    # ----------------------------------------------------------------------
    def pixlen(x):
        """Calculate pixel length of x."""
        if ordering < 0:
            return sum([glyphs[ord(c)][1] for c in x]) * fontsize
        else:
            return len(x) * fontsize

    # ---------------------------------------------------------------------

    if ordering < 0:
        blen = glyphs[32][1] * fontsize  # pixel size of space character
    else:
        blen = fontsize

    text = ""  # output buffer

    if pymupdf.CheckMorph(morph):
        # apply the morph matrix relative to the morph point
        m1 = pymupdf.Matrix(
            1, 0, 0, 1, morph[0].x + self.x, self.height - morph[0].y - self.y
        )
        mat = ~m1 * morph[1] * m1
        cm = _format_g(pymupdf.JM_TUPLE(mat)) + " cm\n"
    else:
        cm = ""

    # ---------------------------------------------------------------------
    # adjust for text orientation / rotation
    # ---------------------------------------------------------------------
    progr = 1  # direction of line progress
    c_pnt = pymupdf.Point(0, fontsize * ascender)  # used for line progress
    if rot == 0:  # normal orientation
        point = rect.tl + c_pnt  # line 1 is 'lheight' below top
        maxwidth = rect.width  # pixels available in one line
        maxheight = rect.height  # available text height

    elif rot == 90:  # rotate counter clockwise
        c_pnt = pymupdf.Point(fontsize * ascender, 0)  # progress in x-direction
        point = rect.bl + c_pnt  # line 1 'lheight' away from left
        maxwidth = rect.height  # pixels available in one line
        maxheight = rect.width  # available text height
        cm += cmp90

    elif rot == 180:  # text upside down
        # progress upwards in y direction
        c_pnt = -pymupdf.Point(0, fontsize * ascender)
        point = rect.br + c_pnt  # line 1 'lheight' above bottom
        maxwidth = rect.width  # pixels available in one line
        progr = -1  # subtract lheight for next line
        maxheight = rect.height  # available text height
        cm += cm180

    else:  # rotate clockwise (270 or -90)
        # progress from right to left
        c_pnt = -pymupdf.Point(fontsize * ascender, 0)
        point = rect.tr + c_pnt  # line 1 'lheight' left of right
        maxwidth = rect.height  # pixels available in one line
        progr = -1  # subtract lheight for next line
        maxheight = rect.width  # available text height
        cm += cmm90

    # =====================================================================
    # line loop
    # =====================================================================
    just_tab = []  # 'justify' indicators per line

    for i, line in enumerate(t0):
        line_t = line.expandtabs(expandtabs).split(" ")  # split into words
        num_words = len(line_t)
        lbuff = ""  # init line buffer
        rest = maxwidth  # available line pixels
        # =================================================================
        # word loop
        # =================================================================
        for j in range(num_words):
            word = line_t[j]
            pl_w = pixlen(word)  # pixel len of word
            if rest >= pl_w:  # does it fit on the line?
                lbuff += word + " "  # yes, append word
                rest -= pl_w + blen  # update available line space
                continue  # next word

            # word doesn't fit - output line (if not empty)
            if lbuff:
                lbuff = lbuff.rstrip() + "\n"  # line full, append line break
                text += lbuff  # append to total text
                just_tab.append(True)  # can align-justify

            lbuff = ""  # re-init line buffer
            rest = maxwidth  # re-init avail. space

            if pl_w <= maxwidth:  # word shorter than 1 line?
                lbuff = word + " "  # start the line with it
                rest = maxwidth - pl_w - blen  # update free space
                continue

            # long word: split across multiple lines - char by char ...
            if len(just_tab) > 0:
                just_tab[-1] = False  # cannot align-justify
            for c in word:
                if pixlen(lbuff) <= maxwidth - pixlen(c):
                    lbuff += c
                else:  # line full
                    lbuff += "\n"  # close line
                    text += lbuff  # append to text
                    just_tab.append(False)  # cannot align-justify
                    lbuff = c  # start new line with this char

            lbuff += " "  # finish long word
            rest = maxwidth - pixlen(lbuff)  # long word stored

        if lbuff:  # unprocessed line content?
            text += lbuff.rstrip()  # append to text
            just_tab.append(False)  # cannot align-justify

        if i < len(t0) - 1:  # not the last line?
            text += "\n"  # insert line break

    # compute used part of the textbox
    if text.endswith("\n"):
        text = text[:-1]
    lb_count = text.count("\n") + 1  # number of lines written

    # text height = line count * line height plus one descender value
    text_height = lheight * lb_count - descender * fontsize

    more = text_height - maxheight  # difference to height limit
    if more > pymupdf.EPSILON:  # landed too much outside rect
        return (-1) * more  # return deficit, don't output

    more = abs(more)
    if more < pymupdf.EPSILON:
        more = 0  # don't bother with epsilons
    nres = "\nq\n%s%sBT\n" % (bdc, alpha) + cm  # initialize output buffer
    # positions the text line and selects the font
    templ = lambda a, b, c, d: f"1 0 0 1 {_format_g((a, b))} Tm /{c} {_format_g(d)} Tf "
    # center, right, justify: output each line with its own specifics
    text_t = text.splitlines()  # split text in lines again
    just_tab[-1] = False  # never justify last line
    for i, t in enumerate(text_t):
        spacing = 0
        pl = maxwidth - pixlen(t)  # length of empty line part
        pnt = point + c_pnt * (i * lheight_factor)  # text start of line
        if align == 1:  # center: right shift by half width
            if rot in (0, 180):
                pnt = pnt + pymupdf.Point(pl / 2, 0) * progr
            else:
                pnt = pnt - pymupdf.Point(0, pl / 2) * progr
        elif align == 2:  # right: right shift by full width
            if rot in (0, 180):
                pnt = pnt + pymupdf.Point(pl, 0) * progr
            else:
                pnt = pnt - pymupdf.Point(0, pl) * progr
        elif align == 3:  # justify
            spaces = t.count(" ")  # number of spaces in line
            if spaces > 0 and just_tab[i]:  # if any, and we may justify
                spacing = pl / spaces  # make every space this much larger
            else:
                spacing = 0  # keep normal space length

        # coordinates of the line start, depending on the rotation
        top = height - pnt.y - self.y
        left = pnt.x + self.x
        if rot == 90:
            left = height - pnt.y - self.y
            top = -pnt.x - self.x
        elif rot == 270:
            left = -height + pnt.y + self.y
            top = pnt.x + self.x
        elif rot == 180:
            left = -pnt.x - self.x
            top = -height + pnt.y + self.y

        nres += templ(left, top, fname, fontsize)

        if render_mode > 0:
            nres += "%i Tr " % render_mode
            nres += _format_g(border_width * fontsize) + " w "
            if miter_limit is not None:
                nres += _format_g(miter_limit) + " M "

        if align == 3:
            nres += _format_g(spacing) + " Tw "

        if color is not None:
            nres += color_str
        if fill is not None:
            nres += fill_str
        nres += "%sTJ\n" % pymupdf.getTJstr(t, tj_glyphs, simple, ordering)

    nres += "ET\n%sQ\n" % emc

    self.text_cont += nres
    self.updateRect(rect)
    return more
4055
def finish(
    self,
    width: float = 1,
    color: OptSeq = (0,),
    fill: OptSeq = None,
    lineCap: int = 0,
    lineJoin: int = 0,
    dashes: OptStr = None,
    even_odd: bool = False,
    morph: OptSeq = None,
    closePath: bool = True,
    fill_opacity: float = 1,
    stroke_opacity: float = 1,
    oc: int = 0,
) -> None:
    """Finish the current drawing segment.

    Wraps the pending draw commands with colors, opacity, dashes, line
    style and width, or morphing, then moves them to the shape's total
    content. Does nothing if there are no pending draw commands.

    Args:
        width: border line width. A width of 0 also drops the stroke color.
        color: stroke color. None also forces the width to 0.
        fill: fill color. None means the path is only stroked.
        lineCap: operand of the "J" operator, emitted when non-zero.
        lineJoin: operand of the "j" operator, emitted when non-zero.
        dashes: operand of the "d" operator; None, "" and "[] 0" are skipped.
        even_odd: select the "*" variants of the fill operators.
        morph: when accepted by pymupdf.CheckMorph, morph[0] is the fixed
            point and morph[1] the matrix of a "cm" transformation.
        closePath: append "h" to connect the last to the first point.
        fill_opacity: passed as 'ca' to the page's opacity setter.
        stroke_opacity: passed as 'CA' to the page's opacity setter.
        oc: passed to the page's optional content lookup.
    """
    if self.draw_cont == "":  # nothing pending: behave as a no-op
        return

    # a border without width has no color - and vice versa
    if width == 0:
        color = None
    elif color is None:
        width = 0

    stroke_cmd = pymupdf.ColorCode(color, "c")  # proper stroke color string
    fill_cmd = pymupdf.ColorCode(fill, "f")  # proper fill color string

    oc_name = self.page._get_optional_content(oc)
    if oc_name is None:
        emc = ""
    else:
        self.draw_cont = "/OC /%s BDC\n" % oc_name + self.draw_cont
        emc = "EMC\n"

    gstate = self.page._set_opacity(CA=stroke_opacity, ca=fill_opacity)
    if gstate is not None:
        self.draw_cont = "/%s gs\n" % gstate + self.draw_cont

    if width not in (0, 1):  # only non-default widths are written
        self.draw_cont += _format_g(width) + " w\n"

    # line style settings go in front of the drawing commands
    if lineCap != 0:
        self.draw_cont = "%i J\n" % lineCap + self.draw_cont
    if lineJoin != 0:
        self.draw_cont = "%i j\n" % lineJoin + self.draw_cont
    if dashes not in (None, "", "[] 0"):
        self.draw_cont = "%s d\n" % dashes + self.draw_cont

    if closePath:
        self.draw_cont += "h\n"
        self.last_point = None

    if color is not None:
        self.draw_cont += stroke_cmd

    # choose the painting operator: stroke, fill, or both
    if fill is None:
        paint_op = "S"
    else:
        self.draw_cont += fill_cmd
        paint_op = "B" if color is not None else "f"
        if even_odd:
            paint_op += "*"
    self.draw_cont += paint_op + "\n"

    self.draw_cont += emc

    if pymupdf.CheckMorph(morph):
        shift = pymupdf.Matrix(
            1, 0, 0, 1, morph[0].x + self.x, self.height - morph[0].y - self.y
        )
        transform = ~shift * morph[1] * shift
        self.draw_cont = (
            _format_g(pymupdf.JM_TUPLE(transform)) + " cm\n" + self.draw_cont
        )

    # wrap in its own graphics state and reset the segment buffer
    self.totalcont += "\nq\n" + self.draw_cont + "Q\n"
    self.draw_cont = ""
    self.last_point = None
4146
def commit(self, overlay: bool = True) -> None:
    """Update the page's /Contents object with Shape data.

    Args:
        overlay: whether the data appear in foreground (default) or
            background.

    Notes:
        All buffers of the shape are reset afterwards, so the object can
        be re-used.
    """
    pymupdf.CheckParent(self.page)  # doc may have died meanwhile

    # text always follows the drawings; the stream is stored as bytes
    self.totalcont += self.text_cont
    self.totalcont = self.totalcont.encode()

    if self.totalcont:
        if overlay:
            self.page.wrap_contents()  # ensure a balanced graphics state
        # create a /Contents object with a dummy stream first ...
        new_xref = pymupdf.TOOLS._insert_contents(self.page, b" ", overlay)
        # ... then replace it, with potential compression
        self.doc.update_stream(new_xref, self.totalcont)

    # clean up for potential re-use
    self.last_point = None
    self.rect = None
    self.draw_cont = ""
    self.text_cont = ""
    self.totalcont = ""
4170
4171
def apply_redactions(
    page: pymupdf.Page, images: int = 2, graphics: int = 1, text: int = 0
) -> bool:
    """Apply the redaction annotations of the page.

    Args:
        page: the PDF page.
        images:
            0 - ignore images
            1 - remove all overlapping images
            2 - blank out overlapping image parts
            3 - remove image unless invisible
        graphics:
            0 - ignore graphics
            1 - remove graphics if contained in rectangle
            2 - remove all overlapping graphics
        text:
            0 - remove text
            1 - ignore text
    Returns:
        False if the page has no redaction annotations, otherwise True.
    Raises:
        ValueError: if the document is closed, encrypted or no PDF, or if
            the underlying redaction call reports a failure.
    """

    def center_rect(annot_rect, new_text, font, fsize):
        """Calculate minimal sub-rectangle for the overlay text.

        Notes:
            Because 'insert_textbox' supports no vertical text centering,
            we calculate an approximate number of lines here and return a
            sub-rect with smaller height, which should still be sufficient.
        Args:
            annot_rect: the annotation rectangle
            new_text: the text to insert.
            font: the fontname. Must be one of the CJK or Base-14 set, else
                the rectangle is returned unchanged.
            fsize: the fontsize
        Returns:
            A rectangle to use instead of the annot rectangle. The input
            rectangle itself is never modified.
        """
        if not new_text or annot_rect.width <= pymupdf.EPSILON:
            return annot_rect
        try:
            text_width = pymupdf.get_text_length(new_text, font, fsize)
        except (ValueError, mupdf.FzErrorBase):  # unsupported font
            if g_exceptions_verbose:
                pymupdf.exception_info()
            return annot_rect
        line_height = fsize * 1.2
        limit = annot_rect.width
        h = math.ceil(text_width / limit) * line_height  # estimate rect height
        if h >= annot_rect.height:
            return annot_rect
        # work on a copy: assigning to the original would silently change
        # the rectangle stored with the redaction values
        r = pymupdf.Rect(annot_rect)
        r.y0 = (annot_rect.tl.y + annot_rect.bl.y - h) * 0.5
        return r

    pymupdf.CheckParent(page)
    doc = page.parent
    if doc.is_encrypted or doc.is_closed:
        raise ValueError("document closed or encrypted")
    if not doc.is_pdf:
        raise ValueError("is no PDF")

    # save the values of all redaction annotations before they are removed
    redact_annots = [
        annot._get_redact_values()
        for annot in page.annots(
            types=(pymupdf.PDF_ANNOT_REDACT,)  # pylint: disable=no-member
        )
    ]

    if not redact_annots:  # any redactions on this page?
        return False  # no redactions

    rc = page._apply_redactions(text, images, graphics)  # call MuPDF
    if not rc:  # should not happen really
        raise ValueError("Error applying redactions.")

    # now write replacement text in old redact rectangles
    shape = page.new_shape()
    for redact in redact_annots:
        annot_rect = redact["rect"]
        fill = redact["fill"]
        if fill:
            shape.draw_rect(annot_rect)  # colorize the rect background
            shape.finish(fill=fill, color=fill)
        if "text" in redact:  # if we also have text
            new_text = redact["text"]
            align = redact.get("align", 0)
            fname = redact["fontname"]
            fsize = redact["fontsize"]
            color = redact["text_color"]
            # try finding vertical centered sub-rect
            trect = center_rect(annot_rect, new_text, fname, fsize)

            rc = -1
            while rc < 0 and fsize >= 4:  # while not enough room
                # (re-) try insertion
                rc = shape.insert_textbox(
                    trect,
                    new_text,
                    fontname=fname,
                    fontsize=fsize,
                    color=color,
                    align=align,
                )
                fsize -= 0.5  # reduce font if unsuccessful
    shape.commit()  # append new contents object
    return True
4279
4280
4281 # ------------------------------------------------------------------------------
4282 # Remove potentially sensitive data from a PDF. Similar to the Adobe
4283 # Acrobat 'sanitize' function
4284 # ------------------------------------------------------------------------------
def scrub(
    doc: pymupdf.Document,
    attached_files: bool = True,
    clean_pages: bool = True,
    embedded_files: bool = True,
    hidden_text: bool = True,
    javascript: bool = True,
    metadata: bool = True,
    redactions: bool = True,
    redact_images: int = 0,
    remove_links: bool = True,
    reset_fields: bool = True,
    reset_responses: bool = True,
    thumbnails: bool = True,
    xml_metadata: bool = True,
) -> None:
    """Remove potentially sensitive data from a PDF.

    Args:
        doc: the PDF document.
        attached_files: replace the content of file attachment annotations
            by a single space.
        clean_pages: clean the page /Contents. If False, 'hidden_text' and
            'redactions' are switched off as well.
        embedded_files: delete all embedded files.
        hidden_text: remove text written with render mode 3 ("3 Tr").
        javascript: replace /JavaScript action objects by an empty script.
        metadata: remove the standard metadata.
        redactions: apply redaction annotations found on a page.
        redact_images: passed as 'images' to the page's apply_redactions.
        remove_links: delete all links on every page.
        reset_fields: reset all form fields (widgets).
        reset_responses: delete the responses of every annotation.
        thumbnails: set the /Thumb entry of every page to null.
        xml_metadata: delete XML metadata and null all /Metadata keys.
    Raises:
        ValueError: if the document is no PDF, closed or encrypted, or if
            an xref without object definition is encountered.
    """

    def remove_hidden(cont_lines):
        """Remove hidden text from a PDF page.

        Args:
            cont_lines: list of lines with /Contents content. Should have status
                from after page.cleanContents().

        Returns:
            List of /Contents lines from which hidden text has been removed,
            or None if no "3 Tr" operator was encountered.

        Notes:
            The input must have been created after the page's /Contents object(s)
            have been cleaned with page.cleanContents(). This ensures a standard
            formatting: one command per line, single spaces between operators.
            This allows for drastic simplification of this code.
        """
        out_lines = []  # will return this
        in_text = False  # indicate if within BT/ET object
        suppress = False  # indicate text suppression active
        make_return = False
        for line in cont_lines:
            if line == b"BT":  # start of text object
                in_text = True  # switch on
                out_lines.append(line)  # output it
                continue
            if line == b"ET":  # end of text object
                in_text = False  # switch off
                out_lines.append(line)  # output it
                continue
            if line == b"3 Tr":  # text suppression operator
                suppress = True  # switch on
                make_return = True
                continue
            # Any other render mode ends suppression. "3 Tr" itself never
            # reaches this point, so no further test of the operand is
            # needed (indexing bytes yields an int, which never equals b"3").
            if line[-2:] == b"Tr":
                suppress = False  # text rendering changed
                out_lines.append(line)
                continue
            if line == b"Q":  # unstack command also switches off
                suppress = False
                out_lines.append(line)
                continue
            if suppress and in_text:  # suppress hidden lines
                continue
            out_lines.append(line)
        return out_lines if make_return else None

    if not doc.is_pdf:  # only works for PDF
        raise ValueError("is no PDF")
    if doc.is_encrypted or doc.is_closed:
        raise ValueError("closed or encrypted doc")

    if not clean_pages:
        hidden_text = False
        redactions = False

    if metadata:
        doc.set_metadata({})  # remove standard metadata

    for page in doc:
        if reset_fields:
            # reset form fields (widgets)
            for widget in page.widgets():
                widget.reset()

        if remove_links:
            for link in page.get_links():  # remove all links
                page.delete_link(link)

        found_redacts = False
        for annot in page.annots():
            if annot.type[0] == mupdf.PDF_ANNOT_FILE_ATTACHMENT and attached_files:
                annot.update_file(buffer_=b" ")  # set file content to empty
            if reset_responses:
                annot.delete_responses()
            if annot.type[0] == pymupdf.PDF_ANNOT_REDACT:  # pylint: disable=no-member
                found_redacts = True

        if redactions and found_redacts:
            page.apply_redactions(images=redact_images)

        # Thumbnails are handled before the early exits below, so they are
        # also removed when page cleaning is off or the page has no contents.
        if thumbnails:  # remove page thumbnails?
            if doc.xref_get_key(page.xref, "Thumb")[0] != "null":
                doc.xref_set_key(page.xref, "Thumb", "null")

        if not (clean_pages or hidden_text):
            continue  # done with the page

        page.clean_contents()
        if not page.get_contents():
            continue
        if hidden_text:
            xref = page.get_contents()[0]  # only one b/o cleaning!
            cont = doc.xref_stream(xref)
            cont_lines = remove_hidden(cont.splitlines())  # remove hidden text
            if cont_lines:  # something was actually removed
                cont = b"\n".join(cont_lines)
                doc.update_stream(xref, cont)  # rewrite the page /Contents

    # pages are scrubbed, now perform document-wide scrubbing
    # remove embedded files
    if embedded_files:
        for name in doc.embfile_names():
            doc.embfile_del(name)

    if xml_metadata:
        doc.del_xml_metadata()

    # the xref table only needs scanning for JavaScript or XML metadata
    xref_limit = doc.xref_length() if (xml_metadata or javascript) else 0
    for xref in range(1, xref_limit):
        if not doc.xref_object(xref):
            msg = "bad xref %i - clean PDF before scrubbing" % xref
            raise ValueError(msg)
        if javascript and doc.xref_get_key(xref, "S")[1] == "/JavaScript":
            # a /JavaScript action object
            obj = "<</S/JavaScript/JS()>>"  # replace with a null JavaScript
            doc.update_object(xref, obj)  # update this object
            continue  # no further handling

        if not xml_metadata:
            continue

        if doc.xref_get_key(xref, "Type")[1] == "/Metadata":
            # delete any metadata object directly
            doc.update_object(xref, "<<>>")
            doc.update_stream(xref, b"deleted", new=True)
            continue

        if doc.xref_get_key(xref, "Metadata")[0] != "null":
            doc.xref_set_key(xref, "Metadata", "null")
4436
4437
4438 def _show_fz_text( text):
4439 #if mupdf_cppyy:
4440 # assert isinstance( text, cppyy.gbl.mupdf.Text)
4441 #else:
4442 # assert isinstance( text, mupdf.Text)
4443 num_spans = 0
4444 num_chars = 0
4445 span = text.m_internal.head
4446 while 1:
4447 if not span:
4448 break
4449 num_spans += 1
4450 num_chars += span.len
4451 span = span.next
4452 return f'num_spans={num_spans} num_chars={num_chars}'
4453
def fill_textbox(
    writer: pymupdf.TextWriter,
    rect: rect_like,
    text: typing.Union[str, list],
    pos: point_like = None,
    font: typing.Optional[pymupdf.Font] = None,
    fontsize: float = 11,
    lineheight: OptFloat = None,
    align: int = 0,
    warn: bool = None,
    right_to_left: bool = False,
    small_caps: bool = False,
) -> tuple:
    """Fill a rectangle with text.

    Args:
        writer: pymupdf.TextWriter object (= "self")
        rect: rect-like to receive the text.
        text: string or list/tuple of strings.
        pos: point-like start position of first word.
        font: pymupdf.Font object (default pymupdf.Font('helv')).
        fontsize: the fontsize.
        lineheight: overwrite the font property
        align: (int) 0 = left, 1 = center, 2 = right, 3 = justify
        warn: (bool) text overflow action: none, warn, or exception
        right_to_left: (bool) indicate right-to-left language.
        small_caps: (bool) passed on to the font's length computations and
            to the writer when appending text.
    Returns:
        The list of lines that were not written. Each item is a pair
        (text, length).
    Raises:
        ValueError: if the rectangle is empty, the start position is outside
            of it, or (with warn=False) not all lines fit.
    """
    rect = pymupdf.Rect(rect)
    if rect.is_empty:
        raise ValueError("fill rect must not empty.")
    if type(font) is not pymupdf.Font:
        font = pymupdf.Font("helv")

    def textlen(x):
        """Return length of a string."""
        return font.text_length(x, fontsize=fontsize, small_caps=small_caps)

    def char_lengths(x):
        """Return list of single character lengths for a string."""
        return font.char_lengths(x, fontsize=fontsize, small_caps=small_caps)

    def append_this(pos, text):
        """Append 'text' at 'pos' and return the writer's result."""
        return writer.append(
            pos, text, font=font, fontsize=fontsize, small_caps=small_caps
        )

    tolerance = fontsize * 0.2  # extra distance to left border
    space_len = textlen(" ")
    std_width = rect.width - tolerance
    std_start = rect.x0 + tolerance

    def norm_words(width, words):
        """Cut any word in pieces no longer than 'width'."""
        pieces = []
        piece_lengths = []
        for word in words:
            glyph_lengths = char_lengths(word)
            total = sum(glyph_lengths)
            if total <= width:  # nothing to do - copy over
                pieces.append(word)
                piece_lengths.append(total)
                continue

            # word longer than available width - split it in parts
            count = len(glyph_lengths)
            while count > 0:
                total = sum(glyph_lengths[:count])
                if total > width:
                    count -= 1  # try with one character less
                    continue
                pieces.append(word[:count])
                piece_lengths.append(total)
                word = word[count:]
                glyph_lengths = glyph_lengths[count:]
                count = len(glyph_lengths)
        return pieces, piece_lengths

    def output_justify(start, line):
        """Justified output of a line."""
        # ignore leading / trailing / multiple spaces
        words = [w for w in line.split(" ") if w != ""]
        if not words:
            return
        if len(words) == 1:  # single word cannot be justified
            append_this(start, words[0])
            return
        total = sum([textlen(w) for w in words])  # total word lengths
        gap = (std_width - total) / (len(words) - 1)  # width of each gap
        for w in words:
            _, last_point = append_this(start, w)  # output one word
            start.x = last_point.x + gap  # next start at word end plus gap

    asc = font.ascender
    dsc = font.descender
    if lineheight:
        lheight = lineheight
    elif asc - dsc <= 1:
        lheight = 1.2
    else:
        lheight = asc - dsc

    LINEHEIGHT = fontsize * lheight  # effective line height
    width = std_width  # available horizontal space

    # starting point of text
    if pos is None:  # default is just below rect top-left
        pos = rect.tl + (tolerance, fontsize * asc)
    else:
        pos = pymupdf.Point(pos)
    if pos not in rect:
        raise ValueError("Text must start in rectangle.")

    # calculate displacement factor for alignment
    if align == pymupdf.TEXT_ALIGN_CENTER:
        factor = 0.5
    elif align == pymupdf.TEXT_ALIGN_RIGHT:
        factor = 1.0
    else:
        factor = 0

    # split in lines if just a string was given
    if type(text) is str:
        textlines = text.splitlines()
    else:
        textlines = [part for item in text for part in item.splitlines()]

    max_lines = int((rect.y1 - pos.y) / LINEHEIGHT) + 1

    new_lines = []  # the final list of textbox lines
    no_justify = []  # no justify for these line numbers
    for i, line in enumerate(textlines):
        if line in ("", " "):
            new_lines.append((line, space_len))
            width = rect.width - tolerance
            no_justify.append(len(new_lines) - 1)
            continue

        # the first line may start right of the standard start position
        width = rect.x1 - pos.x if i == 0 else rect.width - tolerance

        if right_to_left:  # reverses Arabic / Hebrew text front to back
            line = writer.clean_rtl(line)
        tl = textlen(line)
        if tl <= width:  # line short enough
            new_lines.append((line, tl))
            no_justify.append(len(new_lines) - 1)
            continue

        # we need to split the line in fitting parts; any words longer
        # than the available width are cut in pieces first
        words, word_lengths = norm_words(width, line.split(" "))

        n = len(words)
        while True:
            candidate = " ".join(words[:n])
            wl = sum(word_lengths[:n]) + space_len * (n - 1)
            if wl <= width:  # first n words fit: make them a line
                new_lines.append((candidate, wl))
                words = words[n:]
                word_lengths = word_lengths[n:]
                n = len(words)
            else:  # too long: retry with one word less
                n -= 1

            if len(words) == 0:
                break
            assert n

    # -------------------------------------------------------------------------
    # List of lines created. Each item is (text, tl), where 'tl' is the PDF
    # output length (float) and 'text' is the text. Except for justified text,
    # this is output-ready.
    # -------------------------------------------------------------------------
    nlines = len(new_lines)
    if nlines > max_lines:
        msg = "Only fitting %i of %i lines." % (max_lines, nlines)
        if warn:
            pymupdf.message("Warning: " + msg)
        elif warn is not None:
            raise ValueError(msg)

    start = pymupdf.Point()
    no_justify += [len(new_lines) - 1]  # no justifying of last line
    for i in range(max_lines):
        try:
            line, tl = new_lines.pop(0)
        except IndexError:  # no more lines to write
            if g_exceptions_verbose >= 2:
                pymupdf.exception_info()
            break

        if right_to_left:  # Arabic, Hebrew
            line = line[::-1]

        if i == 0:  # may have different start for first line
            start = pos

        if (
            align == pymupdf.TEXT_ALIGN_JUSTIFY
            and i not in no_justify
            and tl < std_width
        ):
            output_justify(start, line)
        else:
            if i > 0 or pos.x == std_start:  # left, center, right alignments
                start.x += (width - tl) * factor
            append_this(start, line)

        # position at the standard start of the next line
        start.x = std_start
        start.y += LINEHEIGHT

    return new_lines  # return non-written lines
4679
4680
4681 # ------------------------------------------------------------------------
4682 # Optional Content functions
4683 # ------------------------------------------------------------------------
def get_oc(doc: pymupdf.Document, xref: int) -> int:
    """Return optional content object xref for an image or form xobject.

    Args:
        xref: (int) xref number of an image or form xobject.
    Returns:
        The xref number stored under the object's /OC key, or 0 if that
        key is not an indirect reference.
    Raises:
        ValueError: if the document is closed or encrypted, or if the object
            is neither an image nor a form xobject.
    """
    if doc.is_closed or doc.is_encrypted:
        raise ValueError("document close or encrypted")
    t, name = doc.xref_get_key(xref, "Subtype")
    if t != "name" or name not in ("/Image", "/Form"):
        raise ValueError("bad object type at xref %i" % xref)
    t, oc = doc.xref_get_key(xref, "OC")
    if t != "xref":
        return 0
    # The reference looks like "<number> <generation> R". Taking the first
    # token also works when the generation number is not 0.
    return int(oc.split()[0])
4700
4701
def set_oc(doc: pymupdf.Document, xref: int, oc: int) -> None:
    """Attach optional content object to image or form xobject.

    Args:
        xref: (int) xref number of an image or form xobject
        oc: (int) xref number of an OCG or OCMD. Use 0 to remove an
            existing /OC entry.
    Raises:
        ValueError: if the document is closed or encrypted, if 'xref' is
            neither an image nor a form xobject, or if 'oc' is positive but
            neither an OCG nor an OCMD.
    """
    if doc.is_closed or doc.is_encrypted:
        raise ValueError("document close or encrypted")
    t, name = doc.xref_get_key(xref, "Subtype")
    if t != "name" or name not in ("/Image", "/Form"):
        raise ValueError("bad object type at xref %i" % xref)
    if oc > 0:
        t, name = doc.xref_get_key(oc, "Type")
        if t != "name" or name not in ("/OCG", "/OCMD"):
            raise ValueError("bad object type at xref %i" % oc)
    if oc == 0:
        # Removal request: only touch the object if it has an /OC key.
        # Never write a "0 0 R" reference for a missing key.
        if "OC" in doc.xref_get_keys(xref):
            doc.xref_set_key(xref, "OC", "null")
        return None
    doc.xref_set_key(xref, "OC", "%i 0 R" % oc)
    return None
4723
4724
def set_ocmd(
    doc: pymupdf.Document,
    xref: int = 0,
    ocgs: typing.Union[list, None] = None,
    policy: OptStr = None,
    ve: typing.Union[list, None] = None,
) -> int:
    """Create or update an OCMD object in a PDF document.

    Args:
        xref: (int) 0 for creating a new object, otherwise update existing one.
        ocgs: (list) OCG xref numbers, which shall be subject to 'policy'.
        policy: one of 'AllOn', 'AllOff', 'AnyOn', 'AnyOff' (any casing).
        ve: (list) visibility expression. Use instead of 'ocgs' with 'policy'.

    Returns:
        Xref of the created or updated OCMD.

    Raises:
        ValueError: for unknown OCG numbers, a bad policy, a malformed
            visibility expression, or if 'xref' is not an OCMD.
    """
    known_ocgs = set(doc.get_ocgs().keys())

    def ve_maker(ve):
        """Convert a (nested) visibility expression to its PDF array string."""
        if type(ve) not in (list, tuple) or len(ve) < 2:
            raise ValueError("bad 've' format: %s" % ve)
        operator = ve[0].lower()
        if operator not in ("and", "or", "not"):
            raise ValueError("bad operand: %s" % ve[0])
        if operator == "not" and len(ve) != 2:
            raise ValueError("bad 've' format: %s" % ve)
        parts = ["[/%s" % ve[0].title()]
        for operand in ve[1:]:
            if type(operand) is int:  # must be the xref of an OCG
                if operand not in known_ocgs:
                    raise ValueError("bad OCG %i" % operand)
                parts.append(" %i 0 R" % operand)
            else:  # a nested expression
                parts.append(" %s" % ve_maker(operand))
        parts.append("]")
        return "".join(parts)

    pieces = ["<</Type/OCMD"]

    if ocgs and type(ocgs) in (list, tuple):  # some OCGs are provided
        unknown = set(ocgs).difference(known_ocgs)  # contains illegal xrefs
        if unknown != set():
            msg = "bad OCGs: %s" % unknown
            raise ValueError(msg)
        pieces.append("/OCGs[" + " ".join(["%i 0 R" % x for x in ocgs]) + "]")

    if policy:
        policy = str(policy).lower()
        pols = {
            "anyon": "AnyOn",
            "allon": "AllOn",
            "anyoff": "AnyOff",
            "alloff": "AllOff",
        }
        if policy not in pols:
            raise ValueError("bad policy: %s" % policy)
        pieces.append("/P/%s" % pols[policy])

    if ve:
        pieces.append("/VE%s" % ve_maker(ve))

    pieces.append(">>")
    text = "".join(pieces)

    # make new object or replace old OCMD (check type first)
    if xref == 0:
        xref = doc.get_new_xref()
    elif "/Type/OCMD" not in doc.xref_object(xref, compressed=True):
        raise ValueError("bad xref or not an OCMD")
    doc.update_object(xref, text)
    return xref
4797
4798
def get_ocmd(doc: pymupdf.Document, xref: int) -> dict:
    """Return the definition of an OCMD (optional content membership dictionary).

    Recognizes PDF dict keys /OCGs (PDF array of OCGs), /P (policy string) and
    /VE (visibility expression, PDF array). Via string manipulation, this
    info is converted to a Python dictionary with keys "xref", "ocgs", "policy"
    and "ve" - ready to recycle as input for 'set_ocmd()'.

    Raises:
        ValueError: if 'xref' is out of range, the object is no OCMD, or
            the /P or /VE entries cannot be parsed.
    """

    if xref not in range(doc.xref_length()):
        raise ValueError("bad xref")
    text = doc.xref_object(xref, compressed=True)
    if "/Type/OCMD" not in text:
        raise ValueError("bad object type")
    textlen = len(text)

    p0 = text.find("/OCGs[")  # look for /OCGs key
    p1 = text.find("]", p0)
    if p0 < 0 or p1 < 0:  # no OCGs found
        ocgs = None
    else:
        ocgs = text[p0 + 6 : p1].replace("0 R", " ").split()
        ocgs = list(map(int, ocgs))

    p0 = text.find("/P/")  # look for /P policy key
    if p0 < 0:
        policy = None
    else:
        # Match the policy name directly behind "/P/". The names are
        # capitalized ("AnyOn", "AllOn"), so a search for lowercase "on"
        # would not find them.
        for candidate in ("AllOff", "AnyOff", "AllOn", "AnyOn"):
            if text.startswith(candidate, p0 + 3):
                policy = candidate
                break
        else:  # some irregular syntax
            raise ValueError("bad object at xref")

    p0 = text.find("/VE[")  # look for /VE visibility expression key
    if p0 < 0:  # no visibility expression found
        ve = None
    else:
        lp = rp = 0  # find end of /VE by finding last ']'.
        p1 = p0
        while lp < 1 or lp != rp:
            p1 += 1
            if not p1 < textlen:  # some irregular syntax
                raise ValueError("bad object at xref")
            if text[p1] == "[":
                lp += 1
            if text[p1] == "]":
                rp += 1
        # p1 now positioned at the last "]"
        ve = text[p0 + 3 : p1 + 1]  # the PDF /VE array
        # turn the PDF array into JSON: operators become strings,
        # references become plain numbers
        ve = (
            ve.replace("/And", '"and",')
            .replace("/Not", '"not",')
            .replace("/Or", '"or",')
        )
        ve = ve.replace(" 0 R]", "]").replace(" 0 R", ",").replace("][", "],[")
        import json
        try:
            ve = json.loads(ve)
        except Exception:
            pymupdf.exception_info()
            pymupdf.message(f"bad /VE key: {ve!r}")
            raise
    return {"xref": xref, "ocgs": ocgs, "policy": policy, "ve": ve}
4865
4866
4867 """
4868 Handle page labels for PDF documents.
4869
4870 Reading
4871 -------
4872 * compute the label of a page
4873 * find page number(s) having the given label.
4874
4875 Writing
4876 -------
4877 Supports setting (defining) page labels for PDF documents.
4878
4879 A big Thank You goes to WILLIAM CHAPMAN who contributed the idea and
4880 significant parts of the following code during late December 2020
4881 through early January 2021.
4882 """
4883
4884
def rule_dict(item):
    """Make a Python dict from a PDF page label rule.

    Args:
        item -- a tuple (pno, rule) with the start page number and the rule
                string like <</S/D...>>.
    Returns:
        A dict like
        {'startpage': int, 'prefix': str, 'style': str, 'firstpagenum': int}.
        Key 'style' is only present if the rule contains an /S entry.
    """
    # Jorj McKie, 2021-01-06

    pno, rule = item
    tokens = rule[2:-2].split("/")[1:]  # strip "<<" and ">>"
    d = {"startpage": pno, "prefix": "", "firstpagenum": 1}
    skip = False
    for i, token in enumerate(tokens):
        if skip:  # this token has already been processed
            skip = False  # deactivate skipping again
            continue
        if token == "S":  # style specification
            d["style"] = tokens[i + 1]  # next token has the style
            skip = True  # do not process next token again
            continue
        if token == "Type":
            # optional "/Type/PageLabel" entry: ignore its value, which
            # would otherwise be mistaken for a prefix ("PageLabel" starts
            # with "P")
            skip = True
            continue
        if token.startswith("P"):  # prefix specification: extract the string
            d["prefix"] = token[1:].replace("(", "").replace(")", "")
            continue
        if token.startswith("St"):  # start page number specification
            d["firstpagenum"] = int(token[2:])
    return d
4917
4918
def get_label_pno(pgNo, labels):
    """Return the label for this page number.

    Args:
        pgNo: page number, 0-based.
        labels: result of doc._get_page_labels().
    Returns:
        The label (str) of the page number. An empty string is returned if
        no rule starts at or before this page.
    """
    # Jorj McKie, 2021-01-06

    applicable = [x for x in labels if x[0] <= pgNo]
    if not applicable:  # page lies before the first rule: no label
        return ""
    rule = rule_dict(applicable[-1])  # the last matching rule wins
    prefix = rule.get("prefix", "")
    style = rule.get("style", "")
    # make sure we start at 0 when enumerating the alphabet
    delta = -1 if style in ("a", "A") else 0
    pagenumber = pgNo - rule["startpage"] + rule["firstpagenum"] + delta
    return construct_label(style, prefix, pagenumber)
4938
4939
def get_label(page):
    """Return the label for this PDF page.

    Args:
        page: page object.
    Returns:
        The label (str) of the page. An empty string is returned if the
        document defines no page labels.
    """
    # Jorj McKie, 2021-01-06

    rules = page.parent._get_page_labels()
    if rules:
        rules.sort()  # bring the rules in start page order
        return get_label_pno(page.number, rules)
    return ""
4955
4956
def get_page_numbers(doc, label, only_one=False):
    """Return a list of page numbers with the given label.

    Args:
        doc: PDF document object (resp. 'self').
        label: (str) label.
        only_one: (bool) stop searching after first hit.
    Returns:
        List of page numbers having this label. The list is empty if no
        label is given or the document defines no page labels.
    """
    # Jorj McKie, 2021-01-06

    matches = []
    if label:
        rules = doc._get_page_labels()
        if rules != []:
            for pno in range(doc.page_count):
                if get_label_pno(pno, rules) != label:
                    continue
                matches.append(pno)
                if only_one:
                    break
    return matches
4982
4983
def construct_label(style, prefix, pno) -> str:
    """Construct a label based on style, prefix and page number.

    Styles "D" (decimal), "r" / "R" (roman numerals) and "a" / "A"
    (letters) are supported. Any other style yields the prefix only.
    """
    # William Chapman, 2021-01-06

    if style == "D":
        number = str(pno)
    elif style in ("r", "R"):
        roman = integerToRoman(pno)
        number = roman.lower() if style == "r" else roman.upper()
    elif style in ("a", "A"):
        letters = integerToLetter(pno)
        number = letters.lower() if style == "a" else letters.upper()
    else:
        number = ""
    return prefix + number
5001
5002
def integerToLetter(i) -> str:
    """Returns letter sequence string for integer i.

    The sequence is A, B, ..., Z, AA, AB, ..., ZZ, AAA, ... with 0 mapping
    to "A". Only exact integer arithmetic is used, so the result is also
    correct for very large values.
    """
    # William Chapman, Jorj McKie, 2021-01-06
    import string
    ls = string.ascii_uppercase

    # determine the number of letters (n) and the offset (a) within the
    # group of all strings having that length
    n, a = 1, i
    while 26 ** n <= a:
        a -= 26 ** n
        n += 1

    # write the offset as n base-26 digits, most significant first
    letters = []
    for j in reversed(range(n)):
        f, a = divmod(a, 26 ** j)
        letters.append(ls[f])
    return "".join(letters)
5019
5020
def integerToRoman(num: int) -> str:
    """Return roman numeral for an integer."""
    # William Chapman, Jorj McKie, 2021-01-06

    # values and their numerals in descending order, including the
    # subtractive forms
    numerals = (
        (1000, "M"),
        (900, "CM"),
        (500, "D"),
        (400, "CD"),
        (100, "C"),
        (90, "XC"),
        (50, "L"),
        (40, "XL"),
        (10, "X"),
        (9, "IX"),
        (5, "V"),
        (4, "IV"),
        (1, "I"),
    )

    parts = []
    remaining = num
    for value, numeral in numerals:
        count = remaining // value  # how often this value fits
        parts.append(numeral * count)
        remaining -= value * count
        if remaining <= 0:  # nothing left to convert
            break
    return "".join(parts)
5050
5051
def get_page_labels(doc):
    """Return page label definitions in PDF document.

    Args:
        doc: PDF document (resp. 'self').
    Returns:
        A list of dictionaries with the following format:
        {'startpage': int, 'prefix': str, 'style': str, 'firstpagenum': int}.
    """
    # Jorj McKie, 2021-01-10
    definitions = []
    for entry in doc._get_page_labels():  # one (pno, rule) pair per rule
        definitions.append(rule_dict(entry))
    return definitions
5063
5064
def set_page_labels(doc, labels):
    """Add / replace page label definitions in PDF document.

    Args:
        doc: PDF document (resp. 'self').
        labels: list of label dictionaries like:
        {'startpage': int, 'prefix': str, 'style': str, 'firstpagenum': int},
        as returned by get_page_labels(). The list itself is not modified.
    """
    # William Chapman, 2021-01-06

    def create_label_str(label):
        """Convert Python label dict to corresponding PDF rule string.

        Args:
            label: (dict) build rule for the label.
        Returns:
            PDF label rule string wrapped in "<<", ">>".
        """
        s = "%i<<" % label["startpage"]
        if label.get("prefix", "") != "":
            s += "/P(%s)" % label["prefix"]
        if label.get("style", "") != "":
            s += "/S/%s" % label["style"]
        if label.get("firstpagenum", 1) > 1:  # 1 is the default: omit it
            s += "/St %i" % label["firstpagenum"]
        s += ">>"
        return s

    def create_nums(labels):
        """Return concatenated string of all labels rules.

        Args:
            labels: (list) dictionaries as created by function 'rule_dict'.
        Returns:
            PDF compatible string for page label definitions, ready to be
            enclosed in PDF array 'Nums[...]'.
        """
        # sort a copy, so the caller's list keeps its order
        ordered = sorted(labels, key=lambda x: x["startpage"])
        return "".join([create_label_str(label) for label in ordered])

    doc._set_page_labels(create_nums(labels))
5108
5109
5110 # End of Page Label Code -------------------------------------------------
5111
5112
def has_links(doc: pymupdf.Document) -> bool:
    """Check whether there are links on any page.

    Args:
        doc: PDF document (resp. 'self').
    Returns:
        True as soon as an item of 'doc.page_annot_xrefs()' of some page has
        annotation type PDF_ANNOT_LINK, else False.
    Raises:
        ValueError: if the document is closed or is no PDF.
    """
    if doc.is_closed:
        raise ValueError("document closed")
    if not doc.is_pdf:
        raise ValueError("is no PDF")
    # the second entry of each item is compared with the link type
    return any(
        entry[1] == pymupdf.PDF_ANNOT_LINK  # pylint: disable=no-member
        for pno in range(doc.page_count)
        for entry in doc.page_annot_xrefs(pno)
    )
5124
5125
def has_annots(doc: pymupdf.Document) -> bool:
    """Check whether there are annotations on any page.

    Items of 'doc.page_annot_xrefs()' with type PDF_ANNOT_LINK or
    PDF_ANNOT_WIDGET are not counted as annotations here.

    Args:
        doc: PDF document (resp. 'self').
    Returns:
        True as soon as an item of any other type is found, else False.
    Raises:
        ValueError: if the document is closed or is no PDF.
    """
    if doc.is_closed:
        raise ValueError("document closed")
    if not doc.is_pdf:
        raise ValueError("is no PDF")
    for pno in range(doc.page_count):
        for entry in doc.page_annot_xrefs(pno):
            # pylint: disable=no-member
            if entry[1] not in (pymupdf.PDF_ANNOT_LINK, pymupdf.PDF_ANNOT_WIDGET):
                return True
    return False
5138
5139
5140 # -------------------------------------------------------------------
5141 # Functions to recover the quad contained in a text extraction bbox
5142 # -------------------------------------------------------------------
def recover_bbox_quad(line_dir: tuple, span: dict, bbox: tuple) -> pymupdf.Quad:
    """Compute the quad located inside the bbox.

    The bbox may be any of the resp. tuples occurring inside the given span.

    Args:
        line_dir: (tuple) 'line["dir"]' of the owning line or None, in which
            case 'span["dir"]' is used instead.
        span: (dict) the span. May be from get_texttrace() method.
        bbox: (tuple) the bbox of the span or any of its characters.
    Returns:
        The quad which is wrapped by the bbox.
    """
    cos, sin = span["dir"] if line_dir is None else line_dir
    rect = pymupdf.Rect(bbox)  # make it a rect

    # With small glyph heights active, the fontsize alone is the height.
    # Otherwise it is scaled by the difference of ascender and descender.
    if pymupdf.TOOLS.set_small_glyph_heights():
        factor = 1
    else:
        factor = span["ascender"] - span["descender"]
    height = factor * span["size"]  # the quad's rectangle height

    # The quad points are found at certain distances from two opposite bbox
    # corners. Which corners and which distances depends on the quadrant in
    # which the text writing angle is located.
    h_sin = height * sin
    h_cos = height * cos
    if h_cos >= 0 and h_sin <= 0:  # quadrant 1
        start, end, odd_quadrant = rect.bl, rect.tr, True
    elif h_cos <= 0 and h_sin <= 0:  # quadrant 2
        start, end, odd_quadrant = rect.br, rect.tl, False
    elif h_cos <= 0 and h_sin >= 0:  # quadrant 3
        start, end, odd_quadrant = rect.tr, rect.bl, True
    else:  # quadrant 4
        start, end, odd_quadrant = rect.tl, rect.br, False

    if odd_quadrant:  # quadrants 1 and 3 share the same offsets
        ul = start - (0, h_cos)
        ur = end + (h_sin, 0)
        ll = start - (h_sin, 0)
        lr = end + (0, h_cos)
    else:  # quadrants 2 and 4 share the same offsets
        ul = start + (h_sin, 0)
        ur = end - (0, h_cos)
        ll = start + (0, h_cos)
        lr = end - (h_sin, 0)
    return pymupdf.Quad(ul, ur, ll, lr)
5191
5192
def recover_quad(line_dir: tuple, span: dict) -> pymupdf.Quad:
    """Recover the quadrilateral of a text span.

    The arguments are checked and the work is delegated to
    'recover_bbox_quad' using the span's own bbox.

    Args:
        line_dir: (tuple) 'line["dir"]' of the owning line.
        span: the span.
    Returns:
        The quadrilateral enveloping the span's text.
    Raises:
        ValueError: if 'line_dir' is no tuple of length 2 or 'span' is no dict.
    """
    dir_ok = type(line_dir) is tuple and len(line_dir) == 2
    if not dir_ok:
        raise ValueError("bad line dir argument")
    span_ok = type(span) is dict
    if not span_ok:
        raise ValueError("bad span argument")
    span_bbox = span["bbox"]
    return recover_bbox_quad(line_dir, span, span_bbox)
5207
5208
def recover_line_quad(line: dict, spans: list = None) -> pymupdf.Quad:
    """Calculate the line quad for 'dict' / 'rawdict' text extractions.

    The lower quad points are those of the first, resp. last span quad.
    The upper points are determined by the maximum span quad height.
    From this, compute a rect with bottom-left in (0, 0), convert this to a
    quad and rotate and shift back to cover the text of the spans.

    Args:
        line: (dict) the line. Its 'dir' item is used and, if no sub-list
            is given, its 'spans' item.
        spans: (list, optional) sub-list of spans to consider.
    Returns:
        pymupdf.Quad covering selected spans.
    Raises:
        ValueError: if the list of spans is empty.
    """
    if spans is None:  # no sub-selection
        spans = line["spans"]  # all spans
    if len(spans) == 0:
        raise ValueError("bad span list")
    line_dir = line["dir"]  # text direction, checked by recover_quad
    q0 = recover_quad(line_dir, spans[0])  # quad of first span
    # quad of last span (same as the first if there is only one)
    q1 = recover_quad(line_dir, spans[-1]) if len(spans) > 1 else q0

    line_ll = q0.ll  # lower-left of line quad
    line_lr = q1.lr  # lower-right of line quad

    mat0 = pymupdf.planish_line(line_ll, line_lr)

    # map base line to x-axis such that line_ll goes to (0, 0)
    x_lr = line_lr * mat0

    small = pymupdf.TOOLS.set_small_glyph_heights()  # small glyph heights?

    # the line height is the maximum of the span heights
    h = max(
        s["size"] * (1 if small else (s["ascender"] - s["descender"])) for s in spans
    )

    line_rect = pymupdf.Rect(0, -h, x_lr.x, 0)  # line rectangle
    line_quad = line_rect.quad  # make it a quad and:
    line_quad *= ~mat0  # rotate back and shift back
    return line_quad
5252
5253
def recover_span_quad(line_dir: tuple, span: dict, chars: list = None) -> pymupdf.Quad:
    """Calculate the span quad for 'dict' / 'rawdict' text extractions.

    Notes:
        There are two execution paths:
        1. Without 'chars', the full span quad is wanted and the result of
           'recover_quad' is returned.
        2. With 'chars', the quads of the first and the last character are
           computed and joined. This is only supported for the "rawdict"
           extraction option.

    Args:
        line_dir: (tuple) 'line["dir"]' of the owning line. If None,
            'span["dir"]' is used.
        span: (dict) the span.
        chars: (list, optional) sub-list of characters to consider.
    Returns:
        pymupdf.Quad covering selected characters.
    Raises:
        ValueError: if 'chars' is given but the span has no 'chars' item.
    """
    if line_dir is None:  # must be a span from get_texttrace()
        line_dir = span["dir"]
    if chars is None:  # no sub-selection: take the complete span
        return recover_quad(line_dir, span)
    if "chars" not in span.keys():
        raise ValueError("need 'rawdict' option to sub-select chars")

    first_quad = recover_char_quad(line_dir, span, chars[0])
    # a single character is first and last at the same time
    last_quad = (
        recover_char_quad(line_dir, span, chars[-1]) if len(chars) > 1 else first_quad
    )

    lower_left = first_quad.ll  # lower-left of span quad
    lower_right = last_quad.lr  # lower-right of span quad
    to_axis = pymupdf.planish_line(lower_left, lower_right)
    # map base line to x-axis such that lower_left goes to (0, 0)
    mapped_lr = lower_right * to_axis

    if pymupdf.TOOLS.set_small_glyph_heights():  # small glyph heights?
        factor = 1
    else:
        factor = span["ascender"] - span["descender"]
    height = span["size"] * factor

    flat_rect = pymupdf.Rect(0, -height, mapped_lr.x, 0)  # span rectangle
    result = flat_rect.quad  # make it a quad and:
    result *= ~to_axis  # rotate back and shift back
    return result
5297
5298
def recover_char_quad(line_dir: tuple, span: dict, char: dict) -> pymupdf.Quad:
    """Recover the quadrilateral of a text character.

    This requires the "rawdict" option of text extraction.

    Args:
        line_dir: (tuple) 'line["dir"]' of the span's line. If None,
            'span["dir"]' is used.
        span: (dict) the span dict.
        char: (dict) the character dict. A tuple is accepted as well, in
            which case its item 3 is taken as the bbox.
    Returns:
        The quadrilateral enveloping the character.
    Raises:
        ValueError: if 'line_dir' is no tuple of length 2, 'span' is no dict
            or 'char' is neither a dict nor a tuple.
    """
    if line_dir is None:
        line_dir = span["dir"]
    if type(line_dir) is not tuple or len(line_dir) != 2:
        raise ValueError("bad line dir argument")
    if type(span) is not dict:
        raise ValueError("bad span argument")
    if type(char) is dict:
        bbox = pymupdf.Rect(char["bbox"])
    elif type(char) is tuple:
        bbox = pymupdf.Rect(char[3])
    else:
        # name the argument that actually is at fault
        raise ValueError("bad char argument")

    return recover_bbox_quad(line_dir, span, bbox)
5325
5326
5327 # -------------------------------------------------------------------
5328 # Building font subsets using fontTools
5329 # -------------------------------------------------------------------
def subset_fonts(doc: pymupdf.Document, verbose: bool = False, fallback: bool = False) -> OptInt:
    """Build font subsets in a PDF.

    Eligible fonts are potentially replaced by smaller versions. Page text is
    NOT rewritten and thus should retain properties like being hidden or
    controlled by optional content.

    This method by default uses MuPDF's own internal feature to create subset
    fonts. As this is a new function, errors may still occur. In this case,
    please fall back to using the previous version by using "fallback=True".
    Fallback mode requires the external package 'fontTools'.

    Args:
        doc: PDF document (resp. 'self').
        verbose: only used by fallback mode. If true, progress messages are
            issued via 'pymupdf.message()'.
        fallback: use the older deprecated implementation.

    Returns:
        The new MuPDF-based code returns None. The deprecated fallback
        mode returns 0 if there are no fonts to subset. Otherwise, it
        returns the decrease in fontsize (the difference in fontsize),
        measured in bytes. Only fonts that actually were replaced by a
        subset contribute to this number.

    Raises:
        ImportError: in fallback mode, if a subset must be built but package
            fontTools cannot be imported.
    """
    # Font binaries: - "buffer" -> (names, xrefs, (unicodes, glyphs))
    # An embedded font is uniquely defined by its fontbuffer only. It may have
    # multiple names and xrefs.
    # Once the sets of used unicodes and glyphs are known, we compute a
    # smaller version of the buffer using package fontTools.

    if not fallback:  # by default use MuPDF function
        pdf = mupdf.pdf_document_from_fz_document(doc)
        mupdf.pdf_subset_fonts2(pdf, list(range(doc.page_count)))
        return None

    import re

    font_buffers = {}
    # '#' followed by two hex digits: an encoded character in a PDF name
    hex_code = re.compile(r"#([0-9A-Fa-f]{2})")

    def descendant_xref(xref):
        """Return the xref found in '/DescendantFonts' of a font, or None.

        None is returned if the key is not given as an array.
        """
        df = doc.xref_get_key(xref, "DescendantFonts")
        if df[0] != "array":  # only handle xref specifications
            return None
        # strip the array brackets and the trailing '0 R' of the reference
        return int(df[1][1:-1].replace("0 R", ""))

    def get_old_widths(xref):
        """Retrieve old font '/W' and '/DW' values.

        Returns:
            Tuple (widths, dwidths) of strings. An item is None if the
            respective key is not present with the expected type.
        """
        df_xref = descendant_xref(xref)
        if df_xref is None:
            return None, None
        widths = doc.xref_get_key(df_xref, "W")
        widths = widths[1] if widths[0] == "array" else None  # no widths key?
        dwidths = doc.xref_get_key(df_xref, "DW")
        dwidths = dwidths[1] if dwidths[0] == "int" else None
        return widths, dwidths

    def set_old_widths(xref, widths, dwidths):
        """Restore the old '/W' and '/DW' in subsetted font.

        If either parameter is None or evaluates to False, the corresponding
        dictionary key will be set to null - if it currently is present.
        """
        df_xref = descendant_xref(xref)
        if df_xref is None:  # only handle xref specs
            return None
        for key, value in (("W", widths), ("DW", dwidths)):
            if isinstance(value, str) and value:
                doc.xref_set_key(df_xref, key, value)
            elif doc.xref_get_key(df_xref, key)[0] != "null":
                doc.xref_set_key(df_xref, key, "null")
            # else: no old value and key not present -> nothing to do.
            # (Passing a non-string value to xref_set_key is an error.)
        return None

    def set_subset_fontname(new_xref):
        """Generate a name prefix to tag a font as subset.

        We use a random generator to select 6 upper case ASCII characters.
        The prefixed name must be put in the font xref as the "/BaseFont" value
        and in the FontDescriptor object as the '/FontName' value.
        """
        # The following generates a prefix like 'ABCDEF+'
        import random
        import string

        prefix = "".join(random.choices(tuple(string.ascii_uppercase), k=6)) + "+"
        font_str = doc.xref_object(new_xref, compressed=True)
        font_str = font_str.replace("/BaseFont/", "/BaseFont/" + prefix)
        df_xref = descendant_xref(new_xref)
        if df_xref is not None:
            fd = doc.xref_get_key(df_xref, "FontDescriptor")
            if fd[0] == "xref":
                fd_xref = int(fd[1].replace("0 R", ""))
                fd_str = doc.xref_object(fd_xref, compressed=True)
                fd_str = fd_str.replace("/FontName/", "/FontName/" + prefix)
                doc.update_object(fd_xref, fd_str)
        doc.update_object(new_xref, font_str)

    def build_subset(buffer, unc_set, gid_set):
        """Build font subset using fontTools.

        Args:
            buffer: (bytes) the font given as a binary buffer.
            unc_set: (set) required unicodes.
            gid_set: (set) required glyph ids. Used instead of the unicodes
                if the error unicode 0xFFFD is among them.
        Returns:
            Either None if subsetting is unsuccessful or the subset font buffer.
        """
        try:
            import fontTools.subset as fts
        except ImportError:
            if g_exceptions_verbose: pymupdf.exception_info()
            pymupdf.message("This method requires fontTools to be installed.")
            raise
        import tempfile

        # All files live in a fresh temporary directory, which is removed
        # together with its content when leaving the 'with' block.
        with tempfile.TemporaryDirectory() as tmp_dir:
            oldfont_path = f"{tmp_dir}/oldfont.ttf"
            newfont_path = f"{tmp_dir}/newfont.ttf"
            uncfile_path = f"{tmp_dir}/uncfile.txt"
            args = [
                oldfont_path,
                "--retain-gids",
                f"--output-file={newfont_path}",
                "--layout-features=*",
                "--passthrough-tables",
                "--ignore-missing-glyphs",
                "--ignore-missing-unicodes",
                "--symbol-cmap",
            ]

            # store glyph ids or unicodes as file
            with open(uncfile_path, "w", encoding="utf8") as unc_file:
                if 0xFFFD in unc_set:  # error unicode exists -> use glyphs
                    args.append(f"--gids-file={uncfile_path}")
                    # NOTE(review): glyph 189 is always included - the reason
                    # is not evident from this code.
                    gid_set.add(189)
                    unc_file.writelines("%i\n" % gid for gid in gid_set)
                else:
                    args.append(f"--unicodes-file={uncfile_path}")
                    # NOTE(review): unicode 255 is always included - the
                    # reason is not evident from this code.
                    unc_set.add(255)
                    unc_file.writelines("%04x\n" % unc for unc in unc_set)

            # store fontbuffer as a file
            with open(oldfont_path, "wb") as fontfile:
                fontfile.write(buffer)
            try:  # invoke fontTools subsetter
                fts.main(args)
                font = pymupdf.Font(fontfile=newfont_path)
                new_buffer = font.buffer  # subset font binary
                if font.glyph_count == 0:  # intercept empty font
                    new_buffer = None
            except Exception:
                # best effort: report the problem and treat the font as
                # not subsettable
                pymupdf.exception_info()
                new_buffer = None
        return new_buffer

    def repl_fontnames(doc):
        """Populate 'font_buffers'.

        For each font candidate, store its xref and the list of names
        by which PDF text may refer to it (there may be multiple).
        """

        def norm_name(name):
            """Recreate font name that contains PDF hex codes.

            E.g. #20 -> space, chr(32)

            Decoding happens in one pass, so that a decoded '#' (from '#23')
            is never taken as the start of another hex code. A '#' not
            followed by two hex digits is left untouched.
            """
            return hex_code.sub(lambda m: chr(int(m.group(1), 16)), name)

        def get_fontnames(doc, item):
            """Return a list of fontnames for an item of page.get_fonts().

            There may be multiple names e.g. for Type0 fonts.
            """
            fontname = item[3]
            names = [fontname]
            fontname = doc.xref_get_key(item[0], "BaseFont")[1][1:]
            fontname = norm_name(fontname)
            if fontname not in names:
                names.append(fontname)
            descendents = doc.xref_get_key(item[0], "DescendantFonts")
            if descendents[0] != "array":
                return names
            descendents = descendents[1][1:-1]
            if descendents.endswith(" 0 R"):
                xref = int(descendents[:-4])
                descendents = doc.xref_object(xref, compressed=True)
            p1 = descendents.find("/BaseFont")
            if p1 >= 0:
                p2 = descendents.find("/", p1 + 1)  # start of the name value
                if p2 >= 0:
                    # The name ends at the next key or at the end of the
                    # dictionary, whichever comes first. Ignore "not found"
                    # results (-1), which would otherwise win the minimum.
                    ends = [
                        p
                        for p in (
                            descendents.find("/", p2 + 1),
                            descendents.find(">>", p2 + 1),
                        )
                        if p >= 0
                    ]
                    p1 = min(ends) if ends else len(descendents)
                    fontname = descendents[p2 + 1 : p1]
                    fontname = norm_name(fontname)
                    if fontname not in names:
                        names.append(fontname)
            return names

        for i in range(doc.page_count):
            for f in doc.get_page_fonts(i, full=True):
                font_xref = f[0]  # font xref
                font_ext = f[1]  # font file extension
                basename = f[3]  # font basename

                if font_ext not in (  # skip if not supported by fontTools
                    "otf",
                    "ttf",
                    "woff",
                    "woff2",
                ):
                    continue
                # skip fonts which already are subsets
                if len(basename) > 6 and basename[6] == "+":
                    continue

                extr = doc.extract_font(font_xref)
                fontbuffer = extr[-1]
                names = get_fontnames(doc, f)
                name_set, xref_set, subsets = font_buffers.get(
                    fontbuffer, (set(), set(), (set(), set()))
                )
                xref_set.add(font_xref)
                name_set.update(names)
                font = pymupdf.Font(fontbuffer=fontbuffer)
                name_set.add(font.name)
                del font
                font_buffers[fontbuffer] = (name_set, xref_set, subsets)

    def find_buffer_by_name(name):
        """Return the font buffer known under the given name, or None."""
        for buffer, (name_set, _, _) in font_buffers.items():
            if name in name_set:
                return buffer
        return None

    # -----------------
    # main function
    # -----------------
    repl_fontnames(doc)  # populate font information
    if not font_buffers:  # nothing found to do
        if verbose:
            pymupdf.message("No fonts to subset.")
        return 0

    # Scan page text for usage of subsettable fonts
    for page in doc:
        # go through the text and extend set of used glyphs by font
        # we use a modified MuPDF trace device, which delivers us glyph ids.
        for span in page.get_texttrace():
            if type(span) is not dict:  # skip useless information
                continue
            fontname = span["font"][:33]  # fontname for the span
            buffer = find_buffer_by_name(fontname)
            if buffer is None:
                continue
            # the sets are updated in place
            _, _, (set_ucs, set_gid) = font_buffers[buffer]
            for c in span["chars"]:
                set_ucs.add(c[0])  # unicode
                set_gid.add(c[1])  # glyph id

    # build the font subsets
    saved_bytes = 0  # size decrease over all replaced fonts
    for old_buffer, (name_set, xref_set, subsets) in font_buffers.items():
        new_buffer = build_subset(old_buffer, subsets[0], subsets[1])
        fontname = next(iter(name_set))
        if new_buffer is None or len(new_buffer) >= len(old_buffer):
            # subset was not created or did not get smaller
            if verbose:
                pymupdf.message(f'Cannot subset {fontname!r}.')
            continue
        if verbose:
            pymupdf.message(f"Built subset of font {fontname!r}.")
        val = doc._insert_font(fontbuffer=new_buffer)  # store subset font in PDF
        new_xref = val[0]  # get its xref
        set_subset_fontname(new_xref)  # tag fontname as subset font
        font_str = doc.xref_object(  # get its object definition
            new_xref,
            compressed=True,
        )
        # walk through the original font xrefs and replace each by the subset def
        for font_xref in xref_set:
            # we need the original '/W' and '/DW' width values
            width_table, def_width = get_old_widths(font_xref)
            # ... and replace original font definition at xref with it
            doc.update_object(font_xref, font_str)
            # now copy over old '/W' and '/DW' values
            if width_table or def_width:
                set_old_widths(font_xref, width_table, def_width)
        # 'new_xref' remains unused in the PDF and must be removed
        # by garbage collection.
        # Fonts that were skipped above stay in the PDF unchanged and
        # therefore must not count as savings.
        saved_bytes += len(old_buffer) - len(new_buffer)

    return saved_bytes
5641
5642
5643 # -------------------------------------------------------------------
5644 # Copy XREF object to another XREF
5645 # -------------------------------------------------------------------
def xref_copy(doc: pymupdf.Document, source: int, target: int, *, keep: list = None) -> None:
    """Copy a PDF dictionary object to another one given their xref numbers.

    Args:
        doc: PDF document object
        source: source xref number
        target: target xref number, the xref must already exist
        keep: an optional list of 1st level keys in target that should not be
            removed before copying.
    Notes:
        This works similar to the copy() method of dictionaries in Python. The
        source may be a stream object, in which case its raw stream is
        copied over first.
    """
    if doc.xref_is_stream(source):
        # transfer the raw stream, so the source compression is maintained
        raw_stream = doc.xref_stream_raw(source)
        doc.update_stream(
            target,
            raw_stream,
            compress=False,  # keeps source compression
            new=True,  # in case target is no stream
        )

    protected = [] if keep is None else keep

    # empty the target completely, except for the protected keys
    for name in doc.xref_get_keys(target):
        if name not in protected:
            doc.xref_set_key(target, name, "null")

    # copy over all source dict items
    for name in doc.xref_get_keys(source):
        value = doc.xref_get_key(source, name)[1]
        doc.xref_set_key(target, name, value)