Mercurial > hgrepos > Python2 > PyMuPDF
comparison src/utils.py @ 1:1d09e1dec1d9 upstream
ADD: PyMuPDF v1.26.4: the original sdist.
It does not yet contain MuPDF. This normally will be downloaded when
building PyMuPDF.
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Mon, 15 Sep 2025 11:37:51 +0200 |
| parents | |
| children | a6bc019ac0b2 |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 1:1d09e1dec1d9 |
|---|---|
| 1 # ------------------------------------------------------------------------ | |
| 2 # Copyright 2020-2022, Harald Lieder, mailto:harald.lieder@outlook.com | |
| 3 # License: GNU AFFERO GPL 3.0, https://www.gnu.org/licenses/agpl-3.0.html | |
| 4 # | |
| 5 # Part of "PyMuPDF", a Python binding for "MuPDF" (http://mupdf.com), a | |
| 6 # lightweight PDF, XPS, and E-book viewer, renderer and toolkit which is | |
| 7 # maintained and developed by Artifex Software, Inc. https://artifex.com. | |
| 8 # ------------------------------------------------------------------------ | |
| 9 import io | |
| 10 import math | |
| 11 import os | |
| 12 import typing | |
| 13 import weakref | |
| 14 | |
| 15 try: | |
| 16 from . import pymupdf | |
| 17 except Exception: | |
| 18 import pymupdf | |
| 19 try: | |
| 20 from . import mupdf | |
| 21 except Exception: | |
| 22 import mupdf | |
| 23 | |
| 24 _format_g = pymupdf.format_g | |
| 25 | |
| 26 g_exceptions_verbose = pymupdf.g_exceptions_verbose | |
| 27 | |
# Marker strings used as annotations for "anything convertible to" the
# respective geometry type.
point_like = "point_like"
rect_like = "rect_like"
matrix_like = "matrix_like"
quad_like = "quad_like"

# ByteString is gone from typing in 3.14, and collections.abc.Buffer is
# available from 3.12 only - so fall back to an explicit union if needed.
try:
    ByteString = typing.ByteString
except AttributeError:
    # pylint: disable=unsupported-binary-operation
    ByteString = bytes | bytearray | memoryview

# Shorthand aliases for frequently used annotations.
AnyType = typing.Any
OptInt = typing.Optional[int]
OptFloat = typing.Optional[float]
OptStr = typing.Optional[str]
OptDict = typing.Optional[dict]
OptBytes = typing.Optional[ByteString]
OptSeq = typing.Optional[typing.Sequence]

"""
This is a collection of functions to extend PyMupdf.
"""
| 52 | |
| 53 | |
def write_text(
    page: pymupdf.Page,
    rect=None,
    writers=None,
    overlay=True,
    color=None,
    opacity=None,
    keep_proportion=True,
    rotate=0,
    oc=0,
) -> None:
    """Write the text of one or more pymupdf.TextWriter objects.

    A single writer with no rotation and no target rectangle is written
    directly to the page. In all other cases the writers are rendered onto a
    page of a temporary document, which is then shown on 'page' via
    Page.show_pdf_page.

    Args:
        page: the target page.
        rect: target rectangle. If None, the union of the text writers is used.
        writers: one or more pymupdf.TextWriter objects.
        overlay: put in foreground or background.
        color: passed through to TextWriter.write_text.
        opacity: passed through to TextWriter.write_text.
        keep_proportion: maintain aspect ratio of rectangle sides.
        rotate: arbitrary rotation angle.
        oc: the xref of an optional content object
    Raises:
        ValueError: if no writer was given.
    """
    assert isinstance(page, pymupdf.Page)
    if not writers:
        raise ValueError("need at least one pymupdf.TextWriter")
    if type(writers) is pymupdf.TextWriter:
        if rotate == 0 and rect is None:
            # Direct write. 'oc' must be handed over here as well, otherwise
            # the optional content setting is honoured only on the indirect
            # path below.
            writers.write_text(
                page, opacity=opacity, color=color, overlay=overlay, oc=oc
            )
            return None
        writers = (writers,)

    # Render all writers onto a temporary page of the same size as the target.
    clip = writers[0].text_rect
    textdoc = pymupdf.Document()
    tpage = textdoc.new_page(width=page.rect.width, height=page.rect.height)
    for writer in writers:
        clip |= writer.text_rect  # grow to the union of all text rectangles
        writer.write_text(tpage, opacity=opacity, color=color)
    if rect is None:
        rect = clip

    page.show_pdf_page(
        rect,
        textdoc,
        0,
        overlay=overlay,
        keep_proportion=keep_proportion,
        rotate=rotate,
        clip=clip,
        oc=oc,
    )
    # drop the references to the temporary objects
    textdoc = None
    tpage = None
| 104 | |
| 105 | |
def show_pdf_page(
    page,
    rect,
    docsrc,
    pno=0,
    keep_proportion=True,
    overlay=True,
    oc=0,
    rotate=0,
    clip=None,
) -> int:
    """Show page number 'pno' of PDF 'docsrc' in rectangle 'rect'.

    Args:
        rect: (rect-like) where to place the source image
        docsrc: (document) source PDF
        pno: (int) source page number, negative values count from the end
        keep_proportion: (bool) do not change width-height-ratio
        overlay: (bool) put in foreground
        oc: (xref) make visibility dependent on this OCG / OCMD (which must be defined in the target PDF)
        rotate: (int) degrees (multiple of 90)
        clip: (rect-like) part of source page rectangle
    Returns:
        xref of inserted object (for reuse)
    Raises:
        ValueError: if one of the documents is no PDF, if 'rect' or the
            effective clip is empty or infinite, if a negative 'pno' is
            given for a source without pages, or if source and target are
            the same document.
    """

    def calc_matrix(sr, tr, keep=True, rotate=0):
        """Calculate transformation matrix from source to target rect.

        Notes:
            The product of four matrices in this sequence: (1) translate
            the source center to the origin, (2) rotate, (3) scale,
            (4) translate to the target center.
        Args:
            sr: source rect in PDF (!) coordinate system
            tr: target rect in PDF coordinate system
            keep: whether to keep source ratio of width to height
            rotate: rotation angle in degrees
        Returns:
            Transformation matrix.
        """
        # center points of source and target rect
        smp = (sr.tl + sr.br) / 2.0
        tmp = (tr.tl + tr.br) / 2.0

        # m moves to (0, 0), then rotates
        m = pymupdf.Matrix(1, 0, 0, 1, -smp.x, -smp.y) * pymupdf.Matrix(rotate)

        sr1 = sr * m  # resulting source rect to calculate scale factors

        fw = tr.width / sr1.width  # scale the width
        fh = tr.height / sr1.height  # scale the height
        if keep:
            fw = fh = min(fw, fh)  # take min if keeping aspect ratio

        m *= pymupdf.Matrix(fw, fh)  # concat scale matrix
        m *= pymupdf.Matrix(1, 0, 0, 1, tmp.x, tmp.y)  # concat move to target center
        return pymupdf.JM_TUPLE(m)

    pymupdf.CheckParent(page)
    doc = page.parent

    if not doc.is_pdf or not docsrc.is_pdf:
        raise ValueError("is no PDF")

    # 'rect' is documented as rect-like: convert, so that plain sequences
    # work the same way they do in insert_image.
    rect = pymupdf.Rect(rect)
    if rect.is_empty or rect.is_infinite:
        raise ValueError("rect must be finite and not empty")

    if pno < 0:  # support negative page numbers
        page_count = docsrc.page_count
        if page_count < 1:
            # repeatedly adding a zero page count would never terminate
            raise ValueError("source document has no pages")
        pno %= page_count
    src_page = docsrc[pno]  # load source page

    tar_rect = rect * ~page.transformation_matrix  # target rect in PDF coordinates

    src_rect = src_page.rect if not clip else src_page.rect & clip  # source rect
    if src_rect.is_empty or src_rect.is_infinite:
        raise ValueError("clip must be finite and not empty")
    src_rect = src_rect * ~src_page.transformation_matrix  # ... in PDF coord

    matrix = calc_matrix(src_rect, tar_rect, keep=keep_proportion, rotate=rotate)

    # resource names already in use by the page: XObjects, images, fonts
    used_names = {i[1] for i in doc.get_page_xobjects(page.number)}
    used_names.update(i[7] for i in doc.get_page_images(page.number))
    used_names.update(i[4] for i in doc.get_page_fonts(page.number))

    # create a name not in that set
    n = "fzFrm"
    i = 0
    _imgname = n + "0"
    while _imgname in used_names:
        i += 1
        _imgname = n + str(i)

    isrc = docsrc._graft_id  # used as key for graftmaps
    if doc._graft_id == isrc:
        raise ValueError("source document must not equal target")

    # retrieve / make pymupdf.Graftmap for source PDF
    gmap = doc.Graftmaps.get(isrc, None)
    if gmap is None:
        gmap = pymupdf.Graftmap(doc)
        doc.Graftmaps[isrc] = gmap

    # take note of generated xref for automatic reuse
    pno_id = (isrc, pno)  # id of docsrc[pno]
    xref = doc.ShownPages.get(pno_id, 0)

    if overlay:
        page.wrap_contents()  # ensure a balanced graphics state
    xref = page._show_pdf_page(
        src_page,
        overlay=overlay,
        matrix=matrix,
        xref=xref,
        oc=oc,
        clip=src_rect,
        graftmap=gmap,
        _imgname=_imgname,
    )
    doc.ShownPages[pno_id] = xref

    return xref
| 229 | |
| 230 | |
def replace_image(page: pymupdf.Page, xref: int, *, filename=None, pixmap=None, stream=None):
    """Replace the image referred to by xref.

    The object definition stored under xref is overwritten with a new image.
    The page's appearance instructions stay untouched, so the new image is
    displayed with the same bbox, rotation etc.
    Providing a small, fully transparent image gives the effect of a deleted
    image.
    A typical use is replacing a large image by a smaller version, e.g. one
    with a lower resolution or gray levels instead of colors.

    Args:
        xref: the xref of the image to replace.
        filename, pixmap, stream: exactly one of these must be provided. The
            meaning being the same as in Page.insert_image.
    Raises:
        ValueError: if xref is no image, or if not exactly one image source
            was given.
    """
    doc = page.parent  # the owning document
    if not doc.xref_is_image(xref):
        raise ValueError("xref not an image")
    sources_given = sum(map(bool, (filename, stream, pixmap)))
    if sources_given != 1:
        raise ValueError("Exactly one of filename/stream/pixmap must be given")

    # insert new image anywhere in page ...
    inserted_xref = page.insert_image(
        page.rect,
        filename=filename,
        stream=stream,
        pixmap=pixmap,
    )
    # ... and copy over new to old
    doc.xref_copy(inserted_xref, xref)

    # new image insertion has created a new /Contents source,
    # which we will set to spaces now
    contents_xrefs = page.get_contents()
    doc.update_stream(contents_xrefs[-1], b" ")

    page._image_info = None  # clear cache of extracted image information
| 261 | |
| 262 | |
def delete_image(page: pymupdf.Page, xref: int):
    """Delete the image referred to by xref.

    The image is not physically removed: it is replaced by a small
    transparent Pixmap using method Page.replace_image.

    Args:
        xref: xref of the image to delete.
    """
    # a small 100% transparent pixmap (of just any dimension) ...
    transparent = pymupdf.Pixmap(pymupdf.csGRAY, (0, 0, 1, 1), 1)
    transparent.clear_with()  # ... with all samples bytes cleared to 0x00
    page.replace_image(xref, pixmap=transparent)
| 275 | |
| 276 | |
def insert_image(
    page,
    rect,
    *,
    alpha=-1,
    filename=None,
    height=0,
    keep_proportion=True,
    mask=None,
    oc=0,
    overlay=True,
    pixmap=None,
    rotate=0,
    stream=None,
    width=0,
    xref=0,
):
    """Insert an image for display in a rectangle.

    Args:
        rect: (rect_like) position of image on the page.
        alpha: (int, optional) set to 0 if image has no transparency.
        filename: (str, Path, file object) image filename.
        height: (int)
        keep_proportion: (bool) keep width / height ratio (default).
        mask: (bytes, optional) image consisting of alpha values to use.
        oc: (int) xref of OCG or OCMD to declare as Optional Content.
        overlay: (bool) put in foreground (default) or background.
        pixmap: (pymupdf.Pixmap) use this as image.
        rotate: (int) rotate by 0, 90, 180 or 270 degrees.
        stream: (bytes) use this as image.
        width: (int)
        xref: (int) use this as image.

    'page' and 'rect' are positional, all other parameters are keywords.

    If 'xref' is given, that image is used. Other input options are ignored.
    Else, exactly one of pixmap, stream or filename must be given.

    'alpha=0' for non-transparent images improves performance significantly.
    Affects stream and filename only.

    Optimum transparent insertions are possible by using filename / stream in
    conjunction with a 'mask' image of alpha values.

    Returns:
        xref (int) of inserted image. Re-use as argument for multiple insertions.
    Raises:
        ValueError: for a non-PDF document or invalid arguments.
        FileNotFoundError: if 'filename' does not exist.
    """
    pymupdf.CheckParent(page)
    doc = page.parent
    if not doc.is_pdf:
        raise ValueError("is no PDF")

    if xref == 0 and (bool(filename) + bool(stream) + bool(pixmap) != 1):
        raise ValueError("xref=0 needs exactly one of filename, pixmap, stream")

    # Reduce the accepted filename flavours to a plain string.
    if filename and type(filename) is not str:
        if hasattr(filename, "absolute"):  # path-like object
            filename = str(filename)
        elif hasattr(filename, "name"):  # file-like object
            filename = filename.name
        else:
            raise ValueError("bad filename")

    # Validate the image source. Every failing check raises, so the
    # checks can simply follow each other.
    binary_types = (bytes, bytearray, io.BytesIO)
    if filename and not os.path.exists(filename):
        raise FileNotFoundError("No such file: '%s'" % filename)
    if stream and type(stream) not in binary_types:
        raise ValueError("stream must be bytes-like / BytesIO")
    if pixmap and type(pixmap) is not pymupdf.Pixmap:
        raise ValueError("pixmap must be a pymupdf.Pixmap")
    if mask:
        if not (stream or filename):
            raise ValueError("mask requires stream or filename")
        if type(mask) not in binary_types:
            raise ValueError("mask must be bytes-like / BytesIO")

    # Bring the rotation into the range 0 <= rotate < 360.
    while rotate < 0:
        rotate += 360
    while rotate >= 360:
        rotate -= 360
    if rotate not in (0, 90, 180, 270):
        raise ValueError("bad rotate value")

    target = pymupdf.Rect(rect)
    if target.is_empty or target.is_infinite:
        raise ValueError("rect must be finite and not empty")
    clip = target * ~page.transformation_matrix

    # Create a unique image reference name: collect the names used by the
    # page's images, XObjects and fonts, then count up until one is free.
    used_names = [item[7] for item in doc.get_page_images(page.number)]
    used_names.extend(item[1] for item in doc.get_page_xobjects(page.number))
    used_names.extend(item[4] for item in doc.get_page_fonts(page.number))
    prefix = "fzImg"  # 'pymupdf image'
    counter = 0
    _imgname = prefix + "0"  # first name candidate
    while _imgname in used_names:
        counter += 1
        _imgname = prefix + str(counter)  # try new name

    if overlay:
        page.wrap_contents()  # ensure a balanced graphics state

    xref, digests = page._insert_image(
        filename=filename,
        pixmap=pixmap,
        stream=stream,
        imask=mask,
        clip=clip,
        overlay=overlay,
        oc=oc,
        xref=xref,
        rotate=rotate,
        keep_proportion=keep_proportion,
        width=width,
        height=height,
        alpha=alpha,
        _imgname=_imgname,
        digests=doc.InsertedImages,
    )
    if digests is not None:
        doc.InsertedImages = digests

    return xref
| 400 | |
| 401 | |
def search_for(
    page,
    text,
    *,
    clip=None,
    quads=False,
    flags=pymupdf.TEXT_DEHYPHENATE
    | pymupdf.TEXT_PRESERVE_WHITESPACE
    | pymupdf.TEXT_PRESERVE_LIGATURES
    | pymupdf.TEXT_MEDIABOX_CLIP,
    textpage=None,
) -> list:
    """Search for a string on a page.

    Args:
        text: string to be searched for
        clip: restrict search to this rectangle
        quads: (bool) return quads instead of rectangles
        flags: bit switches, default: join hyphened words
        textpage: a pre-created pymupdf.TextPage
    Returns:
        a list of rectangles or quads, each containing one occurrence.
    Raises:
        ValueError: if 'textpage' does not belong to this page.
    """
    if clip is not None:
        clip = pymupdf.Rect(clip)

    pymupdf.CheckParent(page)

    if textpage is None:
        # no textpage passed in: search in a temporary one
        temp_tp = page.get_textpage(clip=clip, flags=flags)
        return temp_tp.search(text, quads=quads)

    if textpage.parent != page:
        raise ValueError("not a textpage of this page")
    return textpage.search(text, quads=quads)
| 439 | |
| 440 | |
def search_page_for(
    doc: pymupdf.Document,
    pno: int,
    text: str,
    quads: bool = False,
    clip: rect_like = None,
    flags: int = pymupdf.TEXT_DEHYPHENATE
    | pymupdf.TEXT_PRESERVE_LIGATURES
    | pymupdf.TEXT_PRESERVE_WHITESPACE
    | pymupdf.TEXT_MEDIABOX_CLIP,
    textpage: pymupdf.TextPage = None,
) -> list:
    """Search for a string on a page.

    Loads page 'pno' of the document and delegates to its 'search_for'
    method.

    Args:
        pno: page number
        text: string to be searched for
        clip: restrict search to this rectangle
        quads: (bool) return quads instead of rectangles
        flags: bit switches, default: join hyphened words
        textpage: reuse a prepared textpage
    Returns:
        a list of rectangles or quads, each containing an occurrence.
    """
    target_page = doc[pno]
    return target_page.search_for(
        text,
        quads=quads,
        clip=clip,
        flags=flags,
        textpage=textpage,
    )
| 474 | |
| 475 | |
def get_text_blocks(
    page: pymupdf.Page,
    clip: rect_like = None,
    flags: OptInt = None,
    textpage: pymupdf.TextPage = None,
    sort: bool = False,
) -> list:
    """Return the text blocks on a page.

    Notes:
        Lines in a block are concatenated with line breaks.
    Args:
        clip: (rect-like) area to consider, used if a textpage is created.
        flags: (int) control the amount of data parsed into the textpage.
        textpage: (pymupdf.TextPage) either passed-in or None.
        sort: (bool) sort the blocks by item 3, then item 0, of each block.
    Returns:
        A list of the blocks. Each item contains the containing rectangle
        coordinates, text lines, running block number and block type.
    Raises:
        ValueError: if 'textpage' does not belong to this page.
    """
    pymupdf.CheckParent(page)
    if flags is None:
        flags = pymupdf.TEXTFLAGS_BLOCKS

    if textpage is None:
        # extract from a temporary textpage
        blocks = page.get_textpage(clip=clip, flags=flags).extractBLOCKS()
    elif textpage.parent != page:
        raise ValueError("not a textpage of this page")
    else:
        blocks = textpage.extractBLOCKS()

    if sort:
        blocks.sort(key=lambda blk: (blk[3], blk[0]))
    return blocks
| 508 | |
| 509 | |
def get_text_words(
    page: pymupdf.Page,
    clip: rect_like = None,
    flags: OptInt = None,
    textpage: pymupdf.TextPage = None,
    sort: bool = False,
    delimiters=None,
    tolerance=3,
) -> list:
    """Return the text words as a list with the bbox for each word.

    Args:
        page: pymupdf.Page
        clip: (rect-like) area on page to consider
        flags: (int) control the amount of data parsed into the textpage.
        textpage: (pymupdf.TextPage) either passed-in or None.
        sort: (bool) sort the words in reading sequence.
        delimiters: (str,list) characters to use as word delimiters.
        tolerance: (float) consider words to be part of the same line if
            top or bottom coordinate are not larger than this. Relevant
            only if sort=True.

    Returns:
        Word tuples (x0, y0, x1, y1, "word", bno, lno, wno).
    Raises:
        ValueError: if 'textpage' does not belong to this page.
    """

    def left_edge(word):
        """Sort key: the x0 coordinate of a word tuple."""
        return word[0]

    def sort_words(words):
        """Sort words line-wise, forgiving small deviations."""
        words.sort(key=lambda w: (w[3], w[0]))
        ordered = []  # final word list
        current = [words[0]]  # collects words roughly in same line
        line_rect = pymupdf.Rect(words[0][:4])  # start the line rectangle
        for word in words[1:]:
            word_rect = pymupdf.Rect(word[:4])
            same_line = (
                abs(word_rect.y0 - line_rect.y0) <= tolerance
                or abs(word_rect.y1 - line_rect.y1) <= tolerance
            )
            if same_line:
                current.append(word)
                line_rect |= word_rect
                continue
            # line is complete: sort it left to right and start the next one
            current.sort(key=left_edge)
            ordered.extend(current)
            current = [word]
            line_rect = word_rect

        # flush the last line
        current.sort(key=left_edge)
        ordered.extend(current)
        return ordered

    pymupdf.CheckParent(page)
    if flags is None:
        flags = pymupdf.TEXTFLAGS_WORDS

    if textpage is None:
        # temporary textpage: the clip is already applied on creation
        words = page.get_textpage(clip=clip, flags=flags).extractWORDS(delimiters)
    elif textpage.parent != page:
        raise ValueError("not a textpage of this page")
    else:
        words = textpage.extractWORDS(delimiters)
        if clip is not None:
            # textpage was given: sub-select the words which have at least
            # half of their bbox area inside clip
            clip = pymupdf.Rect(clip)
            words = [
                w
                for w in words
                if abs(clip & w[:4]) >= 0.5 * abs(pymupdf.Rect(w[:4]))
            ]

    if sort and words:
        # advanced sort if any words found
        words = sort_words(words)

    return words
| 587 | |
| 588 | |
def get_sorted_text(
    page: pymupdf.Page,
    clip: rect_like = None,
    flags: OptInt = None,
    textpage: pymupdf.TextPage = None,
    tolerance=3,
) -> str:
    """Extract plain text avoiding unacceptable line breaks.

    Text contained in clip will be sorted in reading sequence. Some effort
    is also spent to simulate layout vertically and horizontally.

    Args:
        page: pymupdf.Page
        clip: (rect-like) only consider text inside
        flags: (int) text extraction flags
        textpage: pymupdf.TextPage
        tolerance: (float) consider words to be on the same line if their top
            or bottom coordinates do not differ more than this.

    Notes:
        If a TextPage is provided, all text is checked for being inside clip
        with at least 50% of its bbox.
        This allows to use some "global" TextPage in conjunction with sub-
        selecting words in parts of the defined TextPage rectangle.

    Returns:
        A text string in reading sequence. Left indentation of each line,
        inter-line and inter-word distances strive to reflect the layout.
        An empty string if no words were found.
    """

    def line_text(clip, line):
        """Create the string of one text line.

        We are trying to simulate some horizontal layout here, too.

        Args:
            clip: (pymupdf.Rect) the area from which all text is being read.
            line: (list) word tuples (rect, text) contained in the line
        Returns:
            Text in this line. Generated from words in 'line'. Distance from
            predecessor is translated to multiple spaces, thus simulating
            text indentations and large horizontal distances.
        """
        line.sort(key=lambda w: w[0].x0)
        parts = []  # pieces of the line text
        x1 = clip.x0  # end coordinate of the text so far
        for r, t in line:
            # At least one space between words, except at the line start
            # or when the word starts left of the previous word's end.
            min_spaces = 0 if (x1 == clip.x0 or r.x0 <= x1) else 1
            if r.width > 0:
                # convert distance to previous word to multiple spaces,
                # measured in this word's average character width
                spaces = int(round((r.x0 - x1) / r.width * len(t)))
            else:
                # a zero-width word has no character width to measure with
                spaces = 0
            parts.append(" " * max(spaces, min_spaces) + t)
            x1 = r.x1  # update new end position
        return "".join(parts)

    # Extract words in correct sequence first.
    words = [
        (pymupdf.Rect(w[:4]), w[4])
        for w in get_text_words(
            page,
            clip=clip,
            flags=flags,
            textpage=textpage,
            sort=True,
            tolerance=tolerance,
        )
    ]

    if not words:  # no text present
        return ""
    totalbox = pymupdf.EMPTY_RECT()  # area covering all text
    for wr, _ in words:
        totalbox |= wr

    lines = []  # list of reconstituted lines
    line = [words[0]]  # current line
    lrect = words[0][0]  # the line's rectangle

    # walk through the words, starting with the second one
    for wr, text in words[1:]:
        # if this word matches top or bottom of the line, append it
        if abs(lrect.y0 - wr.y0) <= tolerance or abs(lrect.y1 - wr.y1) <= tolerance:
            line.append((wr, text))
            lrect |= wr
        else:
            # output current line and re-initialize
            lines.append((lrect, line_text(totalbox, line)))
            line = [(wr, text)]
            lrect = wr

    # also append unfinished last line
    lines.append((lrect, line_text(totalbox, line)))

    # sort all lines vertically
    lines.sort(key=lambda l: l[0].y1)

    chunks = [lines[0][1]]  # text of first line
    y1 = lines[0][0].y1  # its bottom coordinate
    for lrect, ltext in lines[1:]:
        height = lrect.height
        # number of empty lines to simulate, measured in line heights
        gap = int(round((lrect.y0 - y1) / height)) if height > 0 else 0
        # Limit to 0..5: a negative value (vertically overlapping lines)
        # must not suppress the line break itself.
        distance = max(0, min(gap, 5))
        chunks.append("\n" * (distance + 1) + ltext)
        y1 = lrect.y1

    # return text in clip
    return "".join(chunks)
| 704 | |
| 705 | |
def get_textbox(
    page: pymupdf.Page,
    rect: rect_like,
    textpage: pymupdf.TextPage = None,
) -> str:
    """Return the result of TextPage.extractTextbox for a rectangle.

    Args:
        page: pymupdf.Page
        rect: (rect-like) handed over to 'extractTextbox'.
        textpage: (pymupdf.TextPage) either passed-in or None. If None, a
            textpage with default settings is created for this call.
    Returns:
        Whatever 'extractTextbox' returns for 'rect'.
    Raises:
        ValueError: if 'textpage' does not belong to this page.
    """
    if textpage is None:
        return page.get_textpage().extractTextbox(rect)
    if textpage.parent != page:
        raise ValueError("not a textpage of this page")
    return textpage.extractTextbox(rect)
| 720 | |
| 721 | |
def get_text_selection(
    page: pymupdf.Page,
    p1: point_like,
    p2: point_like,
    clip: rect_like = None,
    textpage: pymupdf.TextPage = None,
):
    """Return the result of TextPage.extractSelection for two points.

    Args:
        page: pymupdf.Page
        p1: (point-like) first argument for 'extractSelection'.
        p2: (point-like) second argument for 'extractSelection'.
        clip: (rect-like) only used if a textpage must be created.
        textpage: (pymupdf.TextPage) either passed-in or None. If None, a
            textpage is created with flag TEXT_DEHYPHENATE.
    Returns:
        Whatever 'extractSelection' returns for the two points.
    Raises:
        ValueError: if 'textpage' does not belong to this page.
    """
    pymupdf.CheckParent(page)
    if textpage is None:
        temp_tp = page.get_textpage(clip=clip, flags=pymupdf.TEXT_DEHYPHENATE)
        return temp_tp.extractSelection(p1, p2)
    if textpage.parent != page:
        raise ValueError("not a textpage of this page")
    return textpage.extractSelection(p1, p2)
| 739 | |
| 740 | |
def get_textpage_ocr(
    page: pymupdf.Page,
    flags: int = 0,
    language: str = "eng",
    dpi: int = 72,
    full: bool = False,
    tessdata: str = None,
) -> pymupdf.TextPage:
    """Create a Textpage from combined results of normal and OCR text parsing.

    Args:
        flags: (int) control content becoming part of the result.
        language: (str) specify expected language(s). Default is "eng" (English).
        dpi: (int) resolution in dpi, default 72.
        full: (bool) whether to OCR the full page image, or only its images (default)
        tessdata: (str) passed through pymupdf.get_tessdata() before use.
    Returns:
        The textpage. If OCR of one of the page's images fails, the result
        of a full page OCR is returned instead.
    """
    pymupdf.CheckParent(page)
    tessdata = pymupdf.get_tessdata(tessdata)

    def full_ocr(page, dpi, language, flags):
        """OCR the pixmap of the whole page and return the textpage."""
        zoom = dpi / 72
        page_pix = page.get_pixmap(matrix=pymupdf.Matrix(zoom, zoom))
        ocr_bytes = page_pix.pdfocr_tobytes(
            compress=False,
            language=language,
            tessdata=tessdata,
        )
        ocr_pdf = pymupdf.Document("pdf", ocr_bytes)
        ocr_page = ocr_pdf.load_page(0)
        # scale the OCR page back to the width of the original page
        unzoom = page.rect.width / ocr_page.rect.width
        ctm = pymupdf.Matrix(unzoom, unzoom) * page.derotation_matrix
        result = ocr_page.get_textpage(flags=flags, matrix=ctm)
        ocr_pdf.close()
        page_pix = None
        result.parent = weakref.proxy(page)
        return result

    # if OCR for the full page, OCR its pixmap @ desired dpi
    if full:
        return full_ocr(page, dpi, language, flags)

    # For partial OCR, make a normal textpage, then extend it with text that
    # is OCRed from each image.
    # Because of this, we need the images flag bit set ON.
    tpage = page.get_textpage(flags=flags)
    page_dict = page.get_text("dict", flags=pymupdf.TEXT_PRESERVE_IMAGES)
    for block in page_dict["blocks"]:
        if block["type"] != 1:  # only look at images
            continue
        bbox = pymupdf.Rect(block["bbox"])
        if bbox.width <= 3 or bbox.height <= 3:  # ignore tiny stuff
            continue
        try:
            img_pix = pymupdf.Pixmap(block["image"])  # get image pixmap
            if img_pix.n - img_pix.alpha != 3:  # we need to convert this to RGB!
                img_pix = pymupdf.Pixmap(pymupdf.csRGB, img_pix)
            if img_pix.alpha:  # must remove alpha channel
                img_pix = pymupdf.Pixmap(img_pix, 0)
            # pdf with OCRed page
            imgdoc = pymupdf.Document(
                "pdf",
                img_pix.pdfocr_tobytes(language=language, tessdata=tessdata),
            )
            imgpage = imgdoc.load_page(0)  # read image as a page
            img_pix = None
            # compute matrix to transform coordinates back to that of 'page'
            imgrect = imgpage.rect  # page size of image PDF
            shrink = pymupdf.Matrix(1 / imgrect.width, 1 / imgrect.height)
            imgpage.extend_textpage(
                tpage, flags=0, matrix=shrink * block["transform"]
            )
            imgdoc.close()
        except (RuntimeError, mupdf.FzErrorBase):
            # Exception info is deliberately not shown here because this
            # can happen in normal operation (see test_3842b).
            tpage = None
            pymupdf.message("Falling back to full page OCR")
            return full_ocr(page, dpi, language, flags)

    return tpage
| 823 | |
| 824 | |
def get_image_info(page: pymupdf.Page, hashes: bool = False, xrefs: bool = False) -> list:
    """Extract image information only from a pymupdf.TextPage.

    Information extracted with hashes is cached in attribute '_image_info'
    of the page and reused by later calls.

    Args:
        hashes: (bool) include MD5 hash for each image.
        xrefs: (bool) try to find the xref for each image. Sets hashes to true.
            Ignored if the document is no PDF.
    Returns:
        The list of image information items. With xrefs, each item has
        key "xref", which is 0 if no image with the same digest was found
        in the page's image list.
    """
    doc = page.parent
    if not doc.is_pdf:
        xrefs = False  # xrefs only exist in PDF documents
    elif xrefs:
        hashes = True  # digests are needed to match images with xrefs

    imginfo = getattr(page, "_image_info", None)
    if imginfo:
        if not xrefs:
            return imginfo  # the cached information is all we need
    else:
        imginfo = page.get_textpage(
            flags=pymupdf.TEXT_PRESERVE_IMAGES
        ).extractIMGINFO(hashes=hashes)
        if hashes:
            page._image_info = imginfo
    if not xrefs:
        return imginfo

    # map the digest of every image in the page's image list to its xref
    digest_to_xref = {}
    for img in page.get_images():
        img_xref = img[0]
        digest_to_xref[pymupdf.Pixmap(doc, img_xref).digest] = img_xref

    # add the xref to each item, 0 if the digest is unknown
    for info in imginfo:
        info["xref"] = digest_to_xref.get(info["digest"], 0)
    return imginfo
| 861 | |
| 862 | |
def get_image_rects(page: pymupdf.Page, name, transform=False) -> list:
    """Locate all occurrences of an image on a page.

    The image is identified by its digest: a pixmap of the image is made and
    its digest is compared with the digests reported by
    page.get_image_info(hashes=True).

    Args:
        name: (str, list, int) image identification. Either an xref (int),
            an item of the page's image list (its first entry is taken as
            the xref), or a reference name (compared with entry 7 of the
            image list items).
        transform: (bool) whether to also return the transformation matrix.
    Returns:
        A list of pymupdf.Rect objects, or of tuples
        (pymupdf.Rect, pymupdf.Matrix) if 'transform' is true - one entry per
        location of the image on the page.
    Raises:
        ValueError: the reference name is unknown or not unique on the page.
    """
    name_type = type(name)
    if name_type in (list, tuple):
        xref = name[0]
    elif name_type is int:
        xref = name
    else:
        candidates = [img for img in page.get_images() if img[7] == name]
        if candidates == []:
            raise ValueError("bad image name")
        if len(candidates) != 1:
            raise ValueError("multiple image names found")
        xref = candidates[0][0]

    # make pixmap of the image to compute its digest
    pix = pymupdf.Pixmap(page.parent, xref)
    digest = pix.digest
    del pix

    matches = [
        im for im in page.get_image_info(hashes=True) if im["digest"] == digest
    ]
    if transform:
        return [
            (pymupdf.Rect(im["bbox"]), pymupdf.Matrix(im["transform"]))
            for im in matches
        ]
    return [pymupdf.Rect(im["bbox"]) for im in matches]
| 898 | |
| 899 | |
def get_text(
    page: pymupdf.Page,
    option: str = "text",
    *,
    clip: rect_like = None,
    flags: OptInt = None,
    textpage: pymupdf.TextPage = None,
    sort: bool = False,
    delimiters=None,
    tolerance=3,
):
    """Extract text from a page or an annotation.

    Unifying front end for the extraction methods of pymupdf.TextPage and the
    helper functions get_text_words, get_text_blocks and get_sorted_text.

    Args:
        option: (str) text, words, blocks, html, dict, json, rawdict, rawjson,
            xhtml or xml - case is ignored.
        clip: (rect-like) restrict output to this area. Replaced by the
            page's cropbox for html, xml and xhtml.
        flags: bit switches to e.g. exclude images or decompose ligatures.
            If None, the default flags of the chosen option are used.
        textpage: reuse this pymupdf.TextPage and make no new one. It must
            belong to 'page'.
        sort: (bool) passed on to the extraction method. For option "text"
            a true value delegates to get_sorted_text.
        delimiters: passed on to get_text_words (option "words" only).
        tolerance: passed on to get_sorted_text (sorted "text" only).

    Returns:
        The output of get_text_words / get_text_blocks / get_sorted_text or of
        the pymupdf.TextPage methods extractText, extractHTML, extractDICT,
        extractJSON, extractRAWDICT, extractRAWJSON, extractXHTML or
        extractXML respectively.

    Raises:
        ValueError: 'textpage' was not created from 'page'.
    """
    default_flags = {
        "text": pymupdf.TEXTFLAGS_TEXT,
        "html": pymupdf.TEXTFLAGS_HTML,
        "json": pymupdf.TEXTFLAGS_DICT,
        "rawjson": pymupdf.TEXTFLAGS_RAWDICT,
        "xml": pymupdf.TEXTFLAGS_XML,
        "xhtml": pymupdf.TEXTFLAGS_XHTML,
        "dict": pymupdf.TEXTFLAGS_DICT,
        "rawdict": pymupdf.TEXTFLAGS_RAWDICT,
        "words": pymupdf.TEXTFLAGS_WORDS,
        "blocks": pymupdf.TEXTFLAGS_BLOCKS,
    }
    option = option.lower()
    assert option in default_flags
    # Only reachable when assertions are disabled: fall back to plain text.
    if option not in default_flags:
        option = "text"
    if flags is None:
        flags = default_flags[option]

    # options served by dedicated helper functions
    if option == "words":
        return get_text_words(
            page,
            clip=clip,
            flags=flags,
            textpage=textpage,
            sort=sort,
            delimiters=delimiters,
        )
    if option == "blocks":
        return get_text_blocks(
            page, clip=clip, flags=flags, textpage=textpage, sort=sort
        )
    if option == "text" and sort:
        return get_sorted_text(
            page,
            clip=clip,
            flags=flags,
            textpage=textpage,
            tolerance=tolerance,
        )

    pymupdf.CheckParent(page)
    markup_methods = {
        "html": "extractHTML",
        "xml": "extractXML",
        "xhtml": "extractXHTML",
    }
    structured_methods = {
        "json": "extractJSON",
        "rawjson": "extractRAWJSON",
        "dict": "extractDICT",
        "rawdict": "extractRAWDICT",
    }

    if option in markup_methods:  # no clipping for MuPDF functions
        clip = page.cropbox
    cropbox = None
    if clip is not None:
        clip = pymupdf.Rect(clip)
    elif type(page) is pymupdf.Page:
        cropbox = page.cropbox

    # pymupdf.TextPage with or without images
    if textpage is None:
        tp = page.get_textpage(clip=clip, flags=flags)
    else:
        tp = textpage
        if getattr(tp, "parent") != page:
            raise ValueError("not a textpage of this page")

    if option in structured_methods:
        result = getattr(tp, structured_methods[option])(cb=cropbox, sort=sort)
    elif option in markup_methods:
        result = getattr(tp, markup_methods[option])()
    else:
        result = tp.extractText(sort=sort)

    if textpage is None:  # drop the textpage we created ourselves
        del tp
    return result
| 1007 | |
| 1008 | |
def get_page_text(
    doc: pymupdf.Document,
    pno: int,
    option: str = "text",
    clip: rect_like = None,
    flags: OptInt = None,
    textpage: pymupdf.TextPage = None,
    sort: bool = False,
) -> typing.Any:
    """Extract the text of a document page selected by page number.

    Notes:
        Convenience function: loads page 'pno' and calls its get_text()
        method. Parameter 'textpage' is accepted but not forwarded to
        page.get_text().
    Args:
        pno: page number
        option: (str) text, words, blocks, html, dict, json, rawdict, xhtml or xml.
        clip: (rect-like) restrict output to this area.
        flags: bit switches passed on to page.get_text().
        sort: (bool) passed on to page.get_text().
    Returns:
        The output of page.get_text().
    """
    page = doc[pno]
    return page.get_text(option, clip=clip, flags=flags, sort=sort)
| 1029 | |
def get_pixmap(
    page: pymupdf.Page,
    *,
    matrix: matrix_like=pymupdf.Identity,
    dpi=None,
    colorspace: pymupdf.Colorspace=pymupdf.csRGB,
    clip: rect_like=None,
    alpha: bool=False,
    annots: bool=True,
) -> pymupdf.Pixmap:
    """Render a page to a pixmap.

    Keyword args:
        matrix: Matrix for transformation (default: Identity).
        dpi: desired dots per inch. If given, matrix is ignored and replaced
            by a zoom of dpi / 72 in both directions.
        colorspace: (str/Colorspace) cmyk, rgb, gray - case ignored, default
            csRGB. Any other string also selects csRGB.
        clip: (irect-like) restrict rendering to this area.
        alpha: (bool) whether to include alpha channel
        annots: (bool) whether to also render annotations
    Raises:
        ValueError: the colorspace does not have 1, 3 or 4 components.
    """
    if dpi:
        scale = dpi / 72
        matrix = pymupdf.Matrix(scale, scale)

    if type(colorspace) is str:
        named_colorspaces = {"GRAY": pymupdf.csGRAY, "CMYK": pymupdf.csCMYK}
        colorspace = named_colorspaces.get(colorspace.upper(), pymupdf.csRGB)
    if colorspace.n not in (1, 3, 4):
        raise ValueError("unsupported colorspace")

    displaylist = page.get_displaylist(annots=annots)
    pix = displaylist.get_pixmap(
        matrix=matrix, colorspace=colorspace, alpha=alpha, clip=clip
    )
    displaylist = None  # release the display list
    if dpi:
        pix.set_dpi(dpi, dpi)
    return pix
| 1070 | |
| 1071 | |
def get_page_pixmap(
    doc: pymupdf.Document,
    pno: int,
    *,
    matrix: matrix_like = pymupdf.Identity,
    dpi=None,
    colorspace: pymupdf.Colorspace = pymupdf.csRGB,
    clip: rect_like = None,
    alpha: bool = False,
    annots: bool = True,
) -> pymupdf.Pixmap:
    """Render a document page, selected by page number, to a pixmap.

    Notes:
        Convenience function: loads page 'pno' and calls its get_pixmap()
        method with all keyword arguments passed through unchanged.
    Args:
        pno: (int) page number
        matrix: pymupdf.Matrix for transformation (default: pymupdf.Identity).
        dpi: desired dots per inch, passed on to page.get_pixmap().
        colorspace: (str,pymupdf.Colorspace) cmyk, rgb, gray - case ignored, default csRGB.
        clip: (irect-like) restrict rendering to this area.
        alpha: (bool) include alpha channel
        annots: (bool) also render annotations
    """
    page = doc[pno]
    render_options = dict(
        matrix=matrix,
        dpi=dpi,
        colorspace=colorspace,
        clip=clip,
        alpha=alpha,
        annots=annots,
    )
    return page.get_pixmap(**render_options)
| 1102 | |
| 1103 | |
def getLinkDict(ln, document=None) -> dict:
    """Convert a link or an outline item into a link dictionary.

    Args:
        ln: a pymupdf.Link or a pymupdf.Outline object.
        document: passed to ln.destination() if 'ln' is an outline item.
    Returns:
        A dictionary with keys "kind" and "xref" (always 0 here), "from" if
        'ln' has a 'rect' attribute, plus the keys that depend on the kind
        of the destination ("uri", "page", "to", "zoom", "file" or the
        items of a named destination).
    """
    if isinstance(ln, pymupdf.Outline):
        dest = ln.destination(document)
    elif isinstance(ln, pymupdf.Link):
        dest = ln.dest
    else:
        assert 0, f'Unexpected {type(ln)=}.'

    link = {"kind": dest.kind, "xref": 0}
    try:
        if hasattr(ln, 'rect'):
            link["from"] = ln.rect
    except Exception:
        # This seems to happen quite often in PyMuPDF/tests.
        if g_exceptions_verbose >= 2: pymupdf.exception_info()

    # target point: only take over the coordinates flagged as valid
    target = pymupdf.Point(0, 0)
    if dest.flags & pymupdf.LINK_FLAG_L_VALID:
        target.x = dest.lt.x
    if dest.flags & pymupdf.LINK_FLAG_T_VALID:
        target.y = dest.lt.y

    kind = dest.kind
    if kind == pymupdf.LINK_URI:
        link["uri"] = dest.uri
        return link

    if kind == pymupdf.LINK_GOTO:
        link["page"] = dest.page
        link["to"] = target
        link["zoom"] = dest.rb.x if dest.flags & pymupdf.LINK_FLAG_R_IS_ZOOM else 0.0
        return link

    if kind == pymupdf.LINK_GOTOR:
        link["file"] = dest.file_spec.replace("\\", "/")
        link["page"] = dest.page
        # a negative page number means the target is given by dest.dest
        link["to"] = dest.dest if dest.page < 0 else target
        link["zoom"] = dest.rb.x if dest.flags & pymupdf.LINK_FLAG_R_IS_ZOOM else 0.0
        return link

    if kind == pymupdf.LINK_LAUNCH:
        link["file"] = dest.file_spec.replace("\\", "/")
        return link

    if kind == pymupdf.LINK_NAMED:
        # The dicts should not have same key(s).
        assert not (dest.named.keys() & link.keys())
        link.update(dest.named)
        if 'to' in link:
            link['to'] = pymupdf.Point(link['to'])
        return link

    link["page"] = dest.page
    return link
| 1161 | |
| 1162 | |
def get_links(page: pymupdf.Page) -> list:
    """Return the links of a page as a list of link dictionaries.

    Notes:
        Each link is converted by getLinkDict(). For PDF pages, "xref" and
        "id" of each link are filled in from the page's link annotations,
        provided their number equals the number of links found.
        See PyMuPDF documentation for details.
    """
    pymupdf.CheckParent(page)

    links = []
    ln = page.first_link
    while ln:
        links.append(getLinkDict(ln, page.parent))
        ln = ln.next

    if links == [] or not page.parent.is_pdf:
        return links

    link_annots = [
        entry
        for entry in pymupdf.JM_get_annot_xref_list2(page)
        if entry[1] == pymupdf.PDF_ANNOT_LINK  # pylint: disable=no-member
    ]
    if len(link_annots) == len(links):
        for link, entry in zip(links, link_annots):
            link["xref"] = entry[0]
            link["id"] = entry[2]
    return links
| 1188 | |
| 1189 | |
def get_toc(
    doc: pymupdf.Document,
    simple: bool = True,
) -> list:
    """Return the document's table of contents.

    Args:
        simple: a bool to control output. If true, each entry is
            [level, title, page]; otherwise the link destination dictionary
            is appended as a fourth item. Page numbers are 1-based; -1 is
            used for external items and items without a URI.
            For details see PyMuPDF's documentation.
    Returns:
        A list of entries, empty if the document has no outline.
    Raises:
        ValueError: the document is closed.
    """

    def collect(item, rows, level):
        """Walk the outline chain starting at 'item' and append one row per item."""
        while item and item.this.m_internal:
            title = item.title if item.title else " "

            if item.is_external or not item.uri:
                page = -1
            elif item.page == -1:  # page unknown: resolve it via the URI
                page = doc.resolve_link(item.uri)[0] + 1
            else:
                page = item.page + 1

            row = [level, title, page]
            if not simple:
                row.append(getLinkDict(item, doc))
            rows.append(row)

            if item.down:  # descend into the children first
                rows = collect(item.down, rows, level + 1)
            item = item.next
        return rows

    # ensure document is open
    if doc.is_closed:
        raise ValueError("document closed")
    doc.init_doc()
    first_item = doc.outline
    if not first_item:
        return []
    toc = collect(first_item, [], 1)
    if doc.is_pdf and not simple:
        doc._extend_toc_items(toc)
    return toc
| 1243 | |
| 1244 | |
def del_toc_item(
    doc: pymupdf.Document,
    idx: int,
) -> None:
    """Delete the TOC / bookmark item at index 'idx' of the outline xref list."""
    outline_xrefs = doc.get_outline_xrefs()
    doc._remove_toc_item(outline_xrefs[idx])
| 1252 | |
| 1253 | |
def set_toc_item(
    doc: pymupdf.Document,
    idx: int,
    dest_dict: OptDict = None,
    kind: OptInt = None,
    pno: OptInt = None,
    uri: OptStr = None,
    title: OptStr = None,
    to: point_like = None,
    filename: OptStr = None,
    zoom: float = 0,
) -> None:
    """Update TOC item by index.

    It allows changing the item's title and link destination.

    Args:
        idx:
            (int) desired index of the TOC list, as created by get_toc.
        dest_dict:
            (dict) destination dictionary as created by get_toc(False).
            Overrides all other parameters except 'title'. If None, the
            remaining parameters are used to make a dest dictionary.
            The dictionary passed in is not modified.
        kind:
            (int) kind of link (pymupdf.LINK_GOTO, etc.). If None, then only
            the title will be updated. If pymupdf.LINK_NONE, the TOC item will
            be deleted.
        pno:
            (int) page number (1-based like in get_toc). Required if
            pymupdf.LINK_GOTO.
        uri:
            (str) the URL, required if pymupdf.LINK_URI.
        title:
            (str) the new title. No change if None.
        to:
            (point-like) destination on the target page. If omitted, (72, 36)
            will be used as target coordinates.
        filename:
            (str) destination filename, required for pymupdf.LINK_GOTOR and
            pymupdf.LINK_LAUNCH.
        zoom:
            (float) a zoom factor for the target location (pymupdf.LINK_GOTO).

    Raises:
        ValueError: bad page number, bad color value, or no valid bookmark
            destination could be built from the arguments.
    """
    xref = doc.get_outline_xrefs()[idx]
    page_xref = 0
    if type(dest_dict) is dict:
        # Work on a shallow copy so the caller's dictionary (and the Point it
        # may contain) is left untouched: the target is converted to PDF
        # coordinates below, and doing that in place would flip it back and
        # forth whenever the same dictionary is used more than once.
        dest_dict = dest_dict.copy()
        if dest_dict["kind"] == pymupdf.LINK_GOTO:
            pno = dest_dict["page"]
            page_xref = doc.page_xref(pno)
            page_height = doc.page_cropbox(pno).height
            # make a new Point: accepts any point-like and avoids aliasing
            to = pymupdf.Point(dest_dict.get("to", (72, 36)))
            to.y = page_height - to.y  # y-axis points upwards in PDF
            dest_dict["to"] = to
        action = getDestStr(page_xref, dest_dict)
        if not action.startswith("/A"):
            raise ValueError("bad bookmark dest")
        color = dest_dict.get("color")
        if color:
            color = list(map(float, color))
            if len(color) != 3 or min(color) < 0 or max(color) > 1:
                raise ValueError("bad color value")
        bold = dest_dict.get("bold", False)
        italic = dest_dict.get("italic", False)
        flags = italic + 2 * bold  # bit 0: italic, bit 1: bold
        collapse = dest_dict.get("collapse")
        return doc._update_toc_item(
            xref,
            action=action[2:],  # strip the leading "/A"
            title=title,
            color=color,
            flags=flags,
            collapse=collapse,
        )

    if kind == pymupdf.LINK_NONE:  # delete bookmark item
        return doc.del_toc_item(idx)
    if kind is None and title is None:  # treat as no-op
        return None
    if kind is None:  # only update title text
        return doc._update_toc_item(xref, action=None, title=title)

    if kind == pymupdf.LINK_GOTO:
        if pno is None or pno not in range(1, doc.page_count + 1):
            raise ValueError("bad page number")
        page_xref = doc.page_xref(pno - 1)
        page_height = doc.page_cropbox(pno - 1).height
        if to is None:
            to = pymupdf.Point(72, page_height - 36)
        else:
            to = pymupdf.Point(to)
            to.y = page_height - to.y  # y-axis points upwards in PDF

    ddict = {
        "kind": kind,
        "to": to,
        "uri": uri,
        "page": pno,
        "file": filename,
        "zoom": zoom,
    }
    action = getDestStr(page_xref, ddict)
    if action == "" or not action.startswith("/A"):
        raise ValueError("bad bookmark dest")

    return doc._update_toc_item(xref, action=action[2:], title=title)
| 1361 | |
| 1362 | |
def get_area(*args) -> float:
    """Calculate the area of a rectangle.

    Args:
        args[0]: the rectangle - an object with attributes 'width' and
            'height'.
        args[1]: (str, optional) the unit, one of 'px' (default), 'in',
            'cm', or 'mm'.
    Returns:
        The area in square units of the chosen unit.
    """
    rect = args[0]
    unit = args[1] if len(args) > 1 else "px"
    # per unit: (numerator, denominator) of the length conversion factor
    factors = {"px": (1, 1), "in": (1.0, 72.0), "cm": (2.54, 72.0), "mm": (25.4, 72.0)}
    numerator, denominator = factors[unit]
    scale = (numerator / denominator) ** 2
    return scale * rect.width * rect.height
| 1373 | |
| 1374 | |
def set_metadata(doc: pymupdf.Document, m: dict = None) -> None:
    """Update the PDF /Info object.

    Args:
        m: a dictionary like doc.metadata. None or an empty dictionary
            removes existing metadata. Keys "format" and "encryption" are
            accepted but ignored. Empty values and the strings "none" /
            "null" set the respective entry to null.
    Raises:
        ValueError: the document is no PDF, is closed or encrypted, 'm' is
            not a dictionary, or 'm' contains unknown keys.
    """
    if not doc.is_pdf:
        raise ValueError("is no PDF")
    if doc.is_closed or doc.is_encrypted:
        raise ValueError("document closed or encrypted")
    if m is None:
        m = {}
    elif type(m) is not dict:
        raise ValueError("bad metadata")

    # metadata key -> key in the PDF /Info dictionary (None: not stored)
    keymap = {
        "author": "Author",
        "producer": "Producer",
        "creator": "Creator",
        "title": "Title",
        "format": None,
        "encryption": None,
        "creationDate": "CreationDate",
        "modDate": "ModDate",
        "subject": "Subject",
        "keywords": "Keywords",
        "trapped": "Trapped",
    }
    unknown_keys = set(m.keys()).difference(set(keymap.keys()))
    if unknown_keys != set():
        raise ValueError("bad dict key(s): %s" % unknown_keys)

    # locate the current /Info object via the PDF trailer
    ref_type, ref_value = doc.xref_get_key(-1, "Info")
    if ref_type == "xref":
        info_xref = int(ref_value.replace("0 R", ""))
    else:
        info_xref = 0

    if m == {}:
        if info_xref != 0:  # remove existing metadata
            doc.xref_set_key(-1, "Info", "null")
            doc.init_doc()
        return  # otherwise: nothing to do

    if info_xref == 0:  # no prev metadata: get new xref
        info_xref = doc.get_new_xref()
        doc.update_object(info_xref, "<<>>")  # fill it with empty object
        doc.xref_set_key(-1, "Info", "%i 0 R" % info_xref)

    for key, val in m.items():
        pdf_key = keymap[key]
        if pdf_key is None:
            continue
        if not bool(val) or val in ("none", "null"):
            val = "null"
        else:
            val = pymupdf.get_pdf_str(val)
        doc.xref_set_key(info_xref, pdf_key, val)
    doc.init_doc()
| 1435 | |
| 1436 | |
def getDestStr(xref: int, ddict: dict) -> str:
    """Calculate the PDF action string.

    Notes:
        Supports Link annotations and outline items (bookmarks).
    Args:
        xref: xref of the target page, used for "GoTo" actions.
        ddict: destination dictionary. A number is also accepted and then
            taken as the top coordinate of a "GoTo" action on page 'xref'.
    Returns:
        The "/A<<...>>" action string, or "" if 'ddict' is empty / falsy,
        has kind pymupdf.LINK_NONE (or no kind), or a kind not handled here.
    """
    if not ddict:
        return ""

    if type(ddict) in (int, float):
        return f"/A<</S/GoTo/D[{xref} 0 R/XYZ {_format_g((0, ddict, 0))}]>>"

    kind = ddict.get("kind", pymupdf.LINK_NONE)
    if kind == pymupdf.LINK_NONE:
        return ""

    if kind == pymupdf.LINK_GOTO:
        zoom = ddict.get("zoom", 0)
        left, top = ddict.get("to", pymupdf.Point(0, 0))
        return f"/A<</S/GoTo/D[{xref} 0 R/XYZ {_format_g((left, top, zoom))}]>>"

    if kind == pymupdf.LINK_URI:
        uri = pymupdf.get_pdf_str(ddict["uri"])
        return f"/A<</S/URI/URI{uri}>>"

    if kind == pymupdf.LINK_LAUNCH:
        fspec = pymupdf.get_pdf_str(ddict["file"])
        return f"/A<</S/Launch/F<</F{fspec}/UF{fspec}/Type/Filespec>>>>"

    if kind == pymupdf.LINK_GOTOR:
        page = ddict["page"]
        if page < 0:  # target is given by ddict["to"] as a PDF string
            fspec = pymupdf.get_pdf_str(ddict["file"])
            target = pymupdf.get_pdf_str(ddict["to"])
            return f"/A<</S/GoToR/D{target}/F<</F{fspec}/UF{fspec}/Type/Filespec>>>>"
        if page >= 0:  # target is a point on page number 'page'
            fspec = pymupdf.get_pdf_str(ddict["file"])
            point = ddict["to"]
            location = (point.x, point.y, ddict["zoom"])
            return f"/A<</S/GoToR/D[{page} /XYZ {_format_g(location)}]/F<</F{fspec}/UF{fspec}/Type/Filespec>>>>"

    return ""
| 1493 | |
| 1494 | |
def set_toc(
    doc: pymupdf.Document,
    toc: list,
    collapse: int = 1,
) -> int:
    """Create new outline tree (table of contents, TOC).

    Any existing outline is deleted first. The new outline objects are then
    built as connected dictionaries and written to the PDF one by one.

    Args:
        toc: (list, tuple) each entry must contain level, title, page and
            optionally top margin on the page. None or '()' remove the TOC.
            The optional fourth item may be a number (top margin) or a
            destination dictionary (which is copied, not modified).
        collapse: (int) collapses entries beyond this level. Zero or None
            shows all entries unfolded.
    Returns:
        the number of inserted items, or the number of removed items respectively.
    Raises:
        ValueError: the document is closed, encrypted or no PDF, or 'toc'
            fails one of the validity checks.
    """
    if doc.is_closed or doc.is_encrypted:
        raise ValueError("document closed or encrypted")
    if not doc.is_pdf:
        raise ValueError("is no PDF")
    if not toc:  # remove all entries
        return len(doc._delToC())

    # validity checks --------------------------------------------------------
    if type(toc) not in (list, tuple):
        raise ValueError("'toc' must be list or tuple")
    toclen = len(toc)
    page_count = doc.page_count
    t0 = toc[0]
    if type(t0) not in (list, tuple):
        raise ValueError("items must be sequences of 3 or 4 items")
    if t0[0] != 1:
        raise ValueError("hierarchy level of item 0 must be 1")
    # Rows are checked pairwise: the page number of row i and the shape and
    # hierarchy level of row i + 1. NOTE: the page number of the last row is
    # therefore not range-checked here; it is clamped when the items are built.
    for i in list(range(toclen - 1)):
        t1 = toc[i]
        t2 = toc[i + 1]
        if not -1 <= t1[2] <= page_count:
            raise ValueError("row %i: page number out of range" % i)
        if (type(t2) not in (list, tuple)) or len(t2) not in (3, 4):
            raise ValueError("bad row %i" % (i + 1))
        if (type(t2[0]) is not int) or t2[0] < 1:
            raise ValueError("bad hierarchy level in row %i" % (i + 1))
        if t2[0] > t1[0] + 1:  # a level may increase by at most 1 per row
            raise ValueError("bad hierarchy level in row %i" % (i + 1))
    # no formal errors in toc --------------------------------------------------

    # --------------------------------------------------------------------------
    # make a list of xref numbers, which we can use for our TOC entries
    # --------------------------------------------------------------------------
    old_xrefs = doc._delToC()  # del old outlines, get their xref numbers

    # prepare table of xrefs for new bookmarks
    # NOTE: the xrefs returned by _delToC() are discarded here, so every
    # bookmark gets a newly acquired xref below.
    old_xrefs = []
    xref = [0] + old_xrefs
    xref[0] = doc._getOLRootNumber()  # entry zero is outline root xref number
    if toclen > len(old_xrefs):  # too few old xrefs?
        for i in range((toclen - len(old_xrefs))):
            xref.append(doc.get_new_xref())  # acquire new ones

    # maps hierarchy level -> index in 'olitems' of the last item on that
    # level; level 0 is the outline root
    lvltab = {0: 0}  # to store last entry per hierarchy level

    # ------------------------------------------------------------------------------
    # contains new outline objects as strings - first one is the outline root
    # ------------------------------------------------------------------------------
    olitems = [{"count": 0, "first": -1, "last": -1, "xref": xref[0]}]
    # ------------------------------------------------------------------------------
    # build olitems as a list of PDF-like connected dictionaries
    # ------------------------------------------------------------------------------
    for i in range(toclen):
        o = toc[i]
        lvl = o[0]  # level
        title = pymupdf.get_pdf_str(o[1])  # title
        # 0-based page number, clamped to the valid page range
        pno = min(doc.page_count - 1, max(0, o[2] - 1))  # page number
        page_xref = doc.page_xref(pno)
        page_height = doc.page_cropbox(pno).height
        top = pymupdf.Point(72, page_height - 36)
        dest_dict = {"to": top, "kind": pymupdf.LINK_GOTO}  # fall back target
        if o[2] < 0:  # negative page number: item without a destination
            dest_dict["kind"] = pymupdf.LINK_NONE
        if len(o) > 3:  # some target is specified
            if type(o[3]) in (int, float):  # convert a number to a point
                dest_dict["to"] = pymupdf.Point(72, page_height - o[3])
            else:  # if something else, make sure we have a dict
                # We make a copy of o[3] to avoid modifying our caller's data.
                dest_dict = o[3].copy() if type(o[3]) is dict else dest_dict
                if "to" not in dest_dict:  # target point not in dict?
                    dest_dict["to"] = top  # put default in
                else:  # transform target to PDF coordinates
                    page = doc[pno]
                    point = pymupdf.Point(dest_dict["to"])
                    point.y = page.cropbox.height - point.y
                    point = point * page.rotation_matrix
                    dest_dict["to"] = (point.x, point.y)
        # The new outline item. "first", "last", "prev", "next" and "parent"
        # hold indices into 'olitems' (-1: not set), not xref numbers.
        d = {}
        d["first"] = -1
        d["count"] = 0
        d["last"] = -1
        d["prev"] = -1
        d["next"] = -1
        d["dest"] = getDestStr(page_xref, dest_dict)
        d["top"] = dest_dict["to"]
        d["title"] = title
        d["parent"] = lvltab[lvl - 1]
        d["xref"] = xref[i + 1]
        d["color"] = dest_dict.get("color")
        d["flags"] = dest_dict.get("italic", 0) + 2 * dest_dict.get("bold", 0)
        lvltab[lvl] = i + 1
        parent = olitems[lvltab[lvl - 1]]  # the parent entry

        # evaluates as: collapse flag of the item, OR (collapse AND lvl > collapse)
        if (
            dest_dict.get("collapse") or collapse and lvl > collapse
        ):  # suppress expansion
            parent["count"] -= 1  # make /Count negative
        else:
            parent["count"] += 1  # positive /Count

        if parent["first"] == -1:  # first child of this parent
            parent["first"] = i + 1
            parent["last"] = i + 1
        else:  # append after the parent's current last child
            d["prev"] = parent["last"]
            prev = olitems[parent["last"]]
            prev["next"] = i + 1
            parent["last"] = i + 1
        olitems.append(d)

    # ------------------------------------------------------------------------------
    # now create each outline item as a string and insert it in the PDF
    # ------------------------------------------------------------------------------
    # The outline root (index 0) has no "dest", "next", "parent", "prev" and
    # "title" keys: the resulting exceptions are caught and the respective
    # PDF key is simply left out.
    for i, ol in enumerate(olitems):
        txt = "<<"
        if ol["count"] != 0:
            txt += "/Count %i" % ol["count"]
        try:
            txt += ol["dest"]
        except Exception:
            # Verbose in PyMuPDF/tests.
            if g_exceptions_verbose >= 2: pymupdf.exception_info()
            pass
        try:
            if ol["first"] > -1:
                txt += "/First %i 0 R" % xref[ol["first"]]
        except Exception:
            if g_exceptions_verbose >= 2: pymupdf.exception_info()
            pass
        try:
            if ol["last"] > -1:
                txt += "/Last %i 0 R" % xref[ol["last"]]
        except Exception:
            if g_exceptions_verbose >= 2: pymupdf.exception_info()
            pass
        try:
            if ol["next"] > -1:
                txt += "/Next %i 0 R" % xref[ol["next"]]
        except Exception:
            # Verbose in PyMuPDF/tests.
            if g_exceptions_verbose >= 2: pymupdf.exception_info()
            pass
        try:
            if ol["parent"] > -1:
                txt += "/Parent %i 0 R" % xref[ol["parent"]]
        except Exception:
            # Verbose in PyMuPDF/tests.
            if g_exceptions_verbose >= 2: pymupdf.exception_info()
            pass
        try:
            if ol["prev"] > -1:
                txt += "/Prev %i 0 R" % xref[ol["prev"]]
        except Exception:
            # Verbose in PyMuPDF/tests.
            if g_exceptions_verbose >= 2: pymupdf.exception_info()
            pass
        try:
            txt += "/Title" + ol["title"]
        except Exception:
            # Verbose in PyMuPDF/tests.
            if g_exceptions_verbose >= 2: pymupdf.exception_info()
            pass

        # optional text color (exactly 3 components) and style flags
        if ol.get("color") and len(ol["color"]) == 3:
            txt += f"/C[ {_format_g(tuple(ol['color']))}]"
        if ol.get("flags", 0) > 0:
            txt += "/F %i" % ol["flags"]

        if i == 0:  # special: this is the outline root
            txt += "/Type/Outlines"  # so add the /Type entry
        txt += ">>"
        doc.update_object(xref[i], txt)  # insert the PDF object

    doc.init_doc()
    return toclen
| 1685 | |
| 1686 | |
def do_widgets(
    tar: pymupdf.Document,
    src: pymupdf.Document,
    graftmap,
    from_page: int = -1,
    to_page: int = -1,
    start_at: int = -1,
    join_duplicates=0,
) -> None:
    """Insert widgets of copied page range into target PDF.

    Parameter values **must** equal those of method insert_pdf() which
    must have been previously executed.

    Args:
        tar: target document which received the copied pages.
        src: source document the pages were copied from.
        graftmap: graft map passed to mupdf.pdf_graft_mapped_object() for
            every object copied from source to target.
        from_page: first source page number of the copied range.
        to_page: last source page number of the copied range. If smaller
            than 'from_page', the range is walked backwards.
        start_at: target page number of the first copied page.
        join_duplicates: if true, widgets with equal names are joined
            under a common parent, otherwise the duplicate is renamed.

    Notes:
        The page numbers are used as given - no normalization of the
        default value -1 happens in this function.
        Returns without any change if the source is no Form PDF.
    """
    if not src.is_form_pdf:  # nothing to do: source PDF has no fields
        return

    def clean_kid_parents(acro_fields):
        """Make sure all kids have correct "Parent" pointers.

        For every entry of "acro_fields", each item of its "Kids" array
        gets its "Parent" entry set to that entry.
        """
        for i in range(acro_fields.pdf_array_len()):
            parent = acro_fields.pdf_array_get(i)
            kids = parent.pdf_dict_get(pymupdf.PDF_NAME("Kids"))
            for j in range(kids.pdf_array_len()):
                kid = kids.pdf_array_get(j)
                kid.pdf_dict_put(pymupdf.PDF_NAME("Parent"), parent)

    def join_widgets(pdf, acro_fields, xref1, xref2, name):
        """Called for each pair of widgets having the same name.

        Args:
            pdf: target MuPDF document
            acro_fields: object Root/AcroForm/Fields
            xref1, xref2: widget xrefs having same names
            name: (str) the name

        Result:
            Defined or updated widget parent that points to both widgets.
        """

        def re_target(pdf, acro_fields, xref1, kids1, xref2, kids2):
            """Merge widget in xref2 into "Kids" list of widget xref1.

            Args:
                xref1, kids1: target widget and its "Kids" array.
                xref2, kids2: source widget and its "Kids" array (may be empty).
            """
            # make indirect objects from widgets
            w1_ind = mupdf.pdf_new_indirect(pdf, xref1, 0)
            w2_ind = mupdf.pdf_new_indirect(pdf, xref2, 0)
            # find source widget in "Fields" array and remove it there
            idx = acro_fields.pdf_array_find(w2_ind)
            acro_fields.pdf_array_delete(idx)

            if not kids2.pdf_is_array():  # source widget has no kids
                widget = mupdf.pdf_load_object(pdf, xref2)

                # delete name from widget and insert target as parent
                widget.pdf_dict_del(pymupdf.PDF_NAME("T"))
                widget.pdf_dict_put(pymupdf.PDF_NAME("Parent"), w1_ind)

                # put in target Kids
                kids1.pdf_array_push(w2_ind)
            else:  # copy source kids to target kids
                for i in range(kids2.pdf_array_len()):
                    kid = kids2.pdf_array_get(i)
                    kid.pdf_dict_put(pymupdf.PDF_NAME("Parent"), w1_ind)
                    kid_ind = mupdf.pdf_new_indirect(pdf, kid.pdf_to_num(), 0)
                    kids1.pdf_array_push(kid_ind)

        def new_target(pdf, acro_fields, xref1, w1, xref2, w2, name):
            """Make new "Parent" for two widgets with same name.

            Args:
                xref1, w1: first widget
                xref2, w2: second widget
                name: field name

            Result:
                Both widgets have no "Kids". We create a new object with the
                name and a "Kids" array containing the widgets.
                Original widgets must be removed from AcroForm/Fields.
            """
            # make new "Parent" object
            new = mupdf.pdf_new_dict(pdf, 5)
            new.pdf_dict_put_text_string(pymupdf.PDF_NAME("T"), name)
            kids = new.pdf_dict_put_array(pymupdf.PDF_NAME("Kids"), 2)
            new_obj = mupdf.pdf_add_object(pdf, new)
            new_obj_xref = new_obj.pdf_to_num()
            new_ind = mupdf.pdf_new_indirect(pdf, new_obj_xref, 0)

            # copy over some required source widget properties:
            # "FT" and "AA" are moved from the first widget to the new parent
            ft = w1.pdf_dict_get(pymupdf.PDF_NAME("FT"))
            w1.pdf_dict_del(pymupdf.PDF_NAME("FT"))
            new_obj.pdf_dict_put(pymupdf.PDF_NAME("FT"), ft)

            aa = w1.pdf_dict_get(pymupdf.PDF_NAME("AA"))
            w1.pdf_dict_del(pymupdf.PDF_NAME("AA"))
            new_obj.pdf_dict_put(pymupdf.PDF_NAME("AA"), aa)

            # remove name field, insert "Parent" field in source widgets
            w1.pdf_dict_del(pymupdf.PDF_NAME("T"))
            w1.pdf_dict_put(pymupdf.PDF_NAME("Parent"), new_ind)
            w2.pdf_dict_del(pymupdf.PDF_NAME("T"))
            w2.pdf_dict_put(pymupdf.PDF_NAME("Parent"), new_ind)

            # put source widgets in "kids" array
            ind1 = mupdf.pdf_new_indirect(pdf, xref1, 0)
            ind2 = mupdf.pdf_new_indirect(pdf, xref2, 0)
            kids.pdf_array_push(ind1)
            kids.pdf_array_push(ind2)

            # remove source widgets from "AcroForm/Fields"
            idx = acro_fields.pdf_array_find(ind1)
            acro_fields.pdf_array_delete(idx)
            idx = acro_fields.pdf_array_find(ind2)
            acro_fields.pdf_array_delete(idx)

            # the new parent replaces both widgets in "AcroForm/Fields"
            acro_fields.pdf_array_push(new_ind)

        w1 = mupdf.pdf_load_object(pdf, xref1)
        w2 = mupdf.pdf_load_object(pdf, xref2)
        kids1 = w1.pdf_dict_get(pymupdf.PDF_NAME("Kids"))
        kids2 = w2.pdf_dict_get(pymupdf.PDF_NAME("Kids"))

        # check which widget has a suitable "Kids" array:
        # the first one having it becomes the merge target
        if kids1.pdf_is_array():
            re_target(pdf, acro_fields, xref1, kids1, xref2, kids2)  # pylint: disable=arguments-out-of-order
        elif kids2.pdf_is_array():
            re_target(pdf, acro_fields, xref2, kids2, xref1, kids1)  # pylint: disable=arguments-out-of-order
        else:
            new_target(pdf, acro_fields, xref1, w1, xref2, w2, name)  # pylint: disable=arguments-out-of-order

    def get_kids(parent, kids_list):
        """Return xref list of leaf kids for a parent.

        Call with an empty list. The passed-in list is extended in place
        and also returned.
        """
        kids = mupdf.pdf_dict_get(parent, pymupdf.PDF_NAME("Kids"))
        if not kids.pdf_is_array():
            return kids_list
        for i in range(kids.pdf_array_len()):
            kid = kids.pdf_array_get(i)
            # NOTE(review): "Kids" is tested with pdf_is_array() everywhere
            # else in this function, but with pdf_is_dict() here. If "Kids"
            # is always an array, this recursion never happens - confirm.
            if mupdf.pdf_is_dict(mupdf.pdf_dict_get(kid, pymupdf.PDF_NAME("Kids"))):
                kids_list = get_kids(kid, kids_list)
            else:
                kids_list.append(kid.pdf_to_num())
        return kids_list

    def kids_xrefs(widget):
        """Get the xref of the widget's "Parent" and the list of its leaf kids.

        Returns (0, []) if the xref number of "Parent" is 0.
        """
        kids_list = []
        parent = mupdf.pdf_dict_get(widget, pymupdf.PDF_NAME("Parent"))
        parent_xref = parent.pdf_to_num()
        if parent_xref == 0:
            return parent_xref, kids_list
        kids_list = get_kids(parent, kids_list)
        return parent_xref, kids_list

    def deduplicate_names(pdf, acro_fields, join_duplicates=False):
        """Handle any widget name duplicates caused by the merge.

        Depending on 'join_duplicates', equally named entries of
        "AcroForm/Fields" are either joined, or the second one is renamed
        by appending its xref in brackets.
        """
        names = {}  # key is a widget name, value a list of widgets having it.

        # extract all names and widgets in "AcroForm/Fields"
        for i in range(mupdf.pdf_array_len(acro_fields)):
            wobject = mupdf.pdf_array_get(acro_fields, i)
            xref = wobject.pdf_to_num()

            # extract widget name and collect widget(s) using it
            T = mupdf.pdf_dict_get_text_string(wobject, pymupdf.PDF_NAME("T"))
            xrefs = names.get(T, [])
            xrefs.append(xref)
            names[T] = xrefs

        for name, xrefs in names.items():
            if len(xrefs) < 2:
                continue
            # NOTE(review): any third or later widget with this name is
            # not handled here.
            xref0, xref1 = xrefs[:2]  # only exactly 2 should occur!
            if join_duplicates:  # combine fields with equal names
                join_widgets(pdf, acro_fields, xref0, xref1, name)
            else:  # make field names unique
                newname = name + f" [{xref1}]"  # append this to the name
                wobject = mupdf.pdf_load_object(pdf, xref1)
                wobject.pdf_dict_put_text_string(pymupdf.PDF_NAME("T"), newname)

        clean_kid_parents(acro_fields)

    def get_acroform(doc):
        """Retrieve the AcroForm dictionary from a PDF."""
        pdf = mupdf.pdf_document_from_fz_document(doc)
        # AcroForm (= central form field info)
        return mupdf.pdf_dict_getp(mupdf.pdf_trailer(pdf), "Root/AcroForm")

    tarpdf = mupdf.pdf_document_from_fz_document(tar)
    srcpdf = mupdf.pdf_document_from_fz_document(src)

    if tar.is_form_pdf:
        # target is a Form PDF, so use it to include source fields
        acro = get_acroform(tar)
        # Important arrays in AcroForm
        acro_fields = acro.pdf_dict_get(pymupdf.PDF_NAME("Fields"))
        tar_co = acro.pdf_dict_get(pymupdf.PDF_NAME("CO"))
        if not tar_co.pdf_is_array():  # create "CO" if not yet present
            tar_co = acro.pdf_dict_put_array(pymupdf.PDF_NAME("CO"), 5)
    else:
        # target is no Form PDF, so copy over source AcroForm
        acro = mupdf.pdf_deep_copy_obj(get_acroform(src))  # make a copy

        # Clear "Fields" and "CO" arrays: will be populated by page fields.
        # This is required to avoid copying unneeded objects.
        acro.pdf_dict_del(pymupdf.PDF_NAME("Fields"))
        acro.pdf_dict_put_array(pymupdf.PDF_NAME("Fields"), 5)
        acro.pdf_dict_del(pymupdf.PDF_NAME("CO"))
        acro.pdf_dict_put_array(pymupdf.PDF_NAME("CO"), 5)

        # Enrich AcroForm for copying to target
        acro_graft = mupdf.pdf_graft_mapped_object(graftmap, acro)

        # Insert AcroForm into target PDF
        acro_tar = mupdf.pdf_add_object(tarpdf, acro_graft)
        acro_fields = acro_tar.pdf_dict_get(pymupdf.PDF_NAME("Fields"))
        tar_co = acro_tar.pdf_dict_get(pymupdf.PDF_NAME("CO"))

        # get its xref and insert it into target catalog
        tar_xref = acro_tar.pdf_to_num()
        acro_tar_ind = mupdf.pdf_new_indirect(tarpdf, tar_xref, 0)
        root = mupdf.pdf_dict_get(mupdf.pdf_trailer(tarpdf), pymupdf.PDF_NAME("Root"))
        root.pdf_dict_put(pymupdf.PDF_NAME("AcroForm"), acro_tar_ind)

    # source page numbers in copy sequence (the range may be reversed)
    if from_page <= to_page:
        src_range = range(from_page, to_page + 1)
    else:
        src_range = range(from_page, to_page - 1, -1)

    parents = {}  # information about widget parents

    # remove "P" owning page reference from all widgets of all source pages
    for i in src_range:
        src_page = src[i]
        for xref in [
            xref
            for xref, wtype, _ in src_page.annot_xrefs()
            if wtype == pymupdf.PDF_ANNOT_WIDGET  # pylint: disable=no-member
        ]:
            w_obj = mupdf.pdf_load_object(srcpdf, xref)
            w_obj.pdf_dict_del(pymupdf.PDF_NAME("P"))

            # get the widget's parent structure
            parent_xref, old_kids = kids_xrefs(w_obj)
            if parent_xref:
                # key is the parent xref in the source PDF; the "new_*"
                # items are filled in when the parent is copied below
                parents[parent_xref] = {
                    "new_xref": 0,
                    "old_kids": old_kids,
                    "new_kids": [],
                }
    # Copy over Parent widgets first - they are not page-dependent
    for xref in parents.keys():  # pylint: disable=consider-using-dict-items
        parent = mupdf.pdf_load_object(srcpdf, xref)
        parent_graft = mupdf.pdf_graft_mapped_object(graftmap, parent)
        parent_tar = mupdf.pdf_add_object(tarpdf, parent_graft)
        kids_xrefs_new = get_kids(parent_tar, [])
        parent_xref_new = parent_tar.pdf_to_num()
        parent_ind = mupdf.pdf_new_indirect(tarpdf, parent_xref_new, 0)
        acro_fields.pdf_array_push(parent_ind)
        parents[xref]["new_xref"] = parent_xref_new
        parents[xref]["new_kids"] = kids_xrefs_new

    for i in range(len(src_range)):
        # read the i-th copied over page in target
        tar_page = tar[start_at + i]

        # read the original page in the source PDF
        src_page = src[src_range[i]]

        # now walk through source page widgets and copy over
        w_xrefs = [  # widget xrefs of the source page
            xref
            for xref, wtype, _ in src_page.annot_xrefs()
            if wtype == pymupdf.PDF_ANNOT_WIDGET  # pylint: disable=no-member
        ]
        if not w_xrefs:  # no widgets on this source page
            continue

        # convert to formal PDF page
        tar_page_pdf = mupdf.pdf_page_from_fz_page(tar_page)

        # extract annotations array - create it if not present
        tar_annots = mupdf.pdf_dict_get(tar_page_pdf.obj(), pymupdf.PDF_NAME("Annots"))
        if not mupdf.pdf_is_array(tar_annots):
            tar_annots = mupdf.pdf_dict_put_array(
                tar_page_pdf.obj(), pymupdf.PDF_NAME("Annots"), 5
            )

        for xref in w_xrefs:
            w_obj = mupdf.pdf_load_object(srcpdf, xref)

            # check if field takes part in inter-field validations
            is_aac = mupdf.pdf_is_dict(mupdf.pdf_dict_getp(w_obj, "AA/C"))

            # check if parent of widget already in target
            parent_xref = mupdf.pdf_to_num(
                w_obj.pdf_dict_get(pymupdf.PDF_NAME("Parent"))
            )
            # NOTE(review): xref 0 presumably means that the widget has no
            # "Parent" at all - it is then copied as a top-level field.
            if parent_xref == 0:  # parent not in target yet
                try:
                    w_obj_graft = mupdf.pdf_graft_mapped_object(graftmap, w_obj)
                except Exception as e:
                    # skip this widget, but continue with the others
                    pymupdf.message_warning(f"cannot copy widget at {xref=}: {e}")
                    continue
                w_obj_tar = mupdf.pdf_add_object(tarpdf, w_obj_graft)
                tar_xref = w_obj_tar.pdf_to_num()
                w_obj_tar_ind = mupdf.pdf_new_indirect(tarpdf, tar_xref, 0)
                mupdf.pdf_array_push(tar_annots, w_obj_tar_ind)
                mupdf.pdf_array_push(acro_fields, w_obj_tar_ind)
            else:
                # Widget was copied together with its parent above: look up
                # its new xref via the position in the old kids list.
                parent = parents[parent_xref]
                idx = parent["old_kids"].index(xref)  # search for xref in parent
                tar_xref = parent["new_kids"][idx]
                w_obj_tar_ind = mupdf.pdf_new_indirect(tarpdf, tar_xref, 0)
                mupdf.pdf_array_push(tar_annots, w_obj_tar_ind)

            # Into "AcroForm/CO" if a computation field.
            if is_aac:
                mupdf.pdf_array_push(tar_co, w_obj_tar_ind)

    deduplicate_names(tarpdf, acro_fields, join_duplicates=join_duplicates)
| 2012 | |
def do_links(
    doc1: pymupdf.Document,
    doc2: pymupdf.Document,
    from_page: int = -1,
    to_page: int = -1,
    start_at: int = -1,
) -> None:
    """Re-create the links of a copied page range in the destination PDF.

    The arguments **must** be the ones that were given to the preceding
    insert_pdf() call.

    Args:
        doc1: destination PDF - the copied pages are looked up here.
        doc2: source PDF - the links are read from its pages.
        from_page: first source page, clamped to the valid page range.
        to_page: last source page; negative or too large means last page.
        start_at: destination page number of the first copied page.
    Raises:
        ValueError: if 'start_at' is negative.
    """

    def cre_annot(lnk, xref_dst, pno_src, ctm):
        """Return the annotation object source for a link ("" if unsupported)."""
        rect = _format_g(tuple(lnk["from"] * ctm))  # rect in PDF coordinates
        kind = lnk["kind"]

        if kind == pymupdf.LINK_GOTO:
            skeleton = pymupdf.annot_skel["goto1"]  # annot_goto
            position = pno_src.index(lnk["page"])
            target = lnk["to"] * ctm  # target point in PDF coordinates
            return skeleton(xref_dst[position], target.x, target.y, lnk["zoom"], rect)

        if kind == pymupdf.LINK_GOTOR:
            if lnk["page"] >= 0:
                skeleton = pymupdf.annot_skel["gotor1"]  # annot_gotor
                target = lnk.get("to", pymupdf.Point(0, 0))  # destination point
                if type(target) is not pymupdf.Point:
                    target = pymupdf.Point(0, 0)
                return skeleton(
                    lnk["page"],
                    target.x,
                    target.y,
                    lnk["zoom"],
                    lnk["file"],
                    lnk["file"],
                    rect,
                )
            skeleton = pymupdf.annot_skel["gotor2"]  # annot_gotor_n
            dest_name = pymupdf.get_pdf_str(lnk["to"])[1:-1]  # drop first / last char
            return skeleton(dest_name, lnk["file"], rect)

        if kind == pymupdf.LINK_LAUNCH:
            skeleton = pymupdf.annot_skel["launch"]  # annot_launch
            return skeleton(lnk["file"], lnk["file"], rect)

        if kind == pymupdf.LINK_URI:
            skeleton = pymupdf.annot_skel["uri"]  # annot_uri
            return skeleton(lnk["uri"], rect)

        return ""  # any other link kind is not copied

    # validate & normalize parameters
    page_count = doc2.page_count
    if from_page < 0:
        fp = 0
    else:
        fp = page_count - 1 if from_page >= page_count else from_page

    tp = page_count - 1 if (to_page < 0 or to_page >= page_count) else to_page

    if start_at < 0:
        raise ValueError("'start_at' must be >= 0")

    step = -1 if fp > tp else 1  # page range could be reversed

    # lists of source / destination page numbers
    pno_src = list(range(fp, tp + step, step))
    pno_dst = [start_at + offset for offset in range(len(pno_src))]

    # list of destination page xrefs (source xrefs are looked up as well)
    xref_dst = []
    for src_no, dst_no in zip(pno_src, pno_dst):
        doc2.page_xref(src_no)
        xref_dst.append(doc1.page_xref(dst_no))

    # create the links for each copied page in destination PDF
    for src_no, dst_no in zip(pno_src, pno_dst):
        page_src = doc2[src_no]  # load source page
        links = page_src.get_links()  # get all its links
        if not links:  # no links there
            page_src = None
            continue
        ctm = ~page_src.transformation_matrix  # calc page transformation matrix
        page_dst = doc1[dst_no]  # load destination page
        link_tab = []  # store all link definitions here
        for lnk in links:
            if lnk["kind"] == pymupdf.LINK_GOTO and lnk["page"] not in pno_src:
                continue  # GOTO link target not in copied pages
            annot_text = cre_annot(lnk, xref_dst, pno_src, ctm)
            if not annot_text:
                continue
            link_tab.append(annot_text)
        if link_tab:
            page_dst._addAnnot_FromString(tuple(link_tab))
| 2134 | |
| 2135 | |
def getLinkText(page: pymupdf.Page, lnk: dict) -> str:
    """Return the PDF object source of a link annotation for a link dict.

    Args:
        page: the page which owns (or will own) the link.
        lnk: link dictionary. Items "kind" and "from" are always read,
            other items depending on the link kind.
    Returns:
        The annotation object definition including a "/NM" entry, or an
        empty string if the link kind is none of the handled ones.
    """

    def skeleton_text():
        """Fill the skeleton of the link kind - "" for unhandled kinds."""
        kind = lnk["kind"]

        if kind == pymupdf.LINK_GOTO:
            if lnk["page"] >= 0:
                skel = pymupdf.annot_skel["goto1"]  # annot_goto
                pno = lnk["page"]
                xref = page.parent.page_xref(pno)
                point = lnk.get("to", pymupdf.Point(0, 0))  # destination point
                # the point must be transformed with the *destination* page
                inv_dest_ctm = ~page.parent[pno].transformation_matrix
                pdf_point = point * inv_dest_ctm
                return skel(xref, pdf_point.x, pdf_point.y, lnk.get("zoom", 0), rect)
            skel = pymupdf.annot_skel["goto2"]  # annot_goto_n
            return skel(pymupdf.get_pdf_str(lnk["to"]), rect)

        if kind == pymupdf.LINK_GOTOR:
            if lnk["page"] >= 0:
                skel = pymupdf.annot_skel["gotor1"]  # annot_gotor
                point = lnk.get("to", pymupdf.Point(0, 0))  # destination point
                if type(point) is not pymupdf.Point:
                    point = pymupdf.Point(0, 0)
                return skel(
                    lnk["page"],
                    point.x,
                    point.y,
                    lnk.get("zoom", 0),
                    lnk["file"],
                    lnk["file"],
                    rect,
                )
            skel = pymupdf.annot_skel["gotor2"]  # annot_gotor_n
            return skel(pymupdf.get_pdf_str(lnk["to"]), lnk["file"], rect)

        if kind == pymupdf.LINK_LAUNCH:
            skel = pymupdf.annot_skel["launch"]  # annot_launch
            return skel(lnk["file"], lnk["file"], rect)

        if kind == pymupdf.LINK_URI:
            skel = pymupdf.annot_skel["uri"]  # annot_uri
            return skel(lnk["uri"], rect)

        if kind == pymupdf.LINK_NAMED:
            skel = pymupdf.annot_skel["named"]  # annot_named
            lname = lnk.get("name")  # check presence of key
            if lname is None:  # if missing, fall back to alternative
                lname = lnk["nameddest"]
            return skel(lname, rect)

        return ""

    # link rectangle, transformed with the inverted page matrix
    inv_ctm = ~page.transformation_matrix
    rect = _format_g(tuple(lnk["from"] * inv_ctm))

    annot = skeleton_text()
    if not annot:
        return annot

    # add a /NM PDF key to the object definition
    link_names = {  # xrefs of existing links and their ids
        item[0]: item[2]
        for item in page.annot_xrefs()
        if item[1] == pymupdf.PDF_ANNOT_LINK  # pylint: disable=no-member
    }

    old_name = lnk.get("id", "")  # id value in the argument

    if old_name and (lnk["xref"], old_name) in link_names.items():
        name = old_name  # no new name if this is an update only
    else:
        # take the first "<stem>-L<n>" which is not yet used on the page
        stem = pymupdf.TOOLS.set_annot_stem() + "-L%i"
        number = 0
        name = stem % number
        while name in link_names.values():
            number += 1
            name = stem % number

    # add /NM key to object definition
    return annot.replace("/Link", "/Link/NM(%s)" % name)
| 2217 | |
| 2218 | |
def delete_widget(page: pymupdf.Page, widget: pymupdf.Widget) -> pymupdf.Widget:
    """Remove a widget from the page and hand back its successor.

    The widget object is emptied afterwards: all its instance attributes
    are deleted.

    Raises:
        ValueError: if 'widget' has no '_annot' attribute (or it is None).
    """
    pymupdf.CheckParent(page)
    annot = getattr(widget, "_annot", None)
    if annot is None:
        raise ValueError("bad type: widget")

    # must be read before the annotation is gone
    successor = widget.next

    page.delete_annot(annot)
    widget._annot.parent = None

    # wipe all instance attributes of the widget
    vars(widget).clear()
    return successor
| 2232 | |
| 2233 | |
def update_link(page: pymupdf.Page, lnk: dict) -> None:
    """Replace the definition of an existing link of the page.

    The object with xref lnk["xref"] is overwritten with the annotation
    source generated from the link dictionary.

    Raises:
        ValueError: if the link kind is not supported.
    """
    pymupdf.CheckParent(page)
    link_source = getLinkText(page, lnk)
    if link_source == "":
        raise ValueError("link kind not supported")

    document = page.parent
    document.update_object(lnk["xref"], link_source, page=page)
| 2242 | |
| 2243 | |
def insert_link(page: pymupdf.Page, lnk: dict, mark: bool = True) -> None:
    """Add a new link to the page.

    Args:
        lnk: link dictionary describing the new link.
        mark: not used by this function.
    Raises:
        ValueError: if the link kind is not supported.
    """
    pymupdf.CheckParent(page)
    link_source = getLinkText(page, lnk)
    if link_source == "":
        raise ValueError("link kind not supported")
    new_annots = (link_source,)
    page._addAnnot_FromString(new_annots)
| 2251 | |
| 2252 | |
def insert_textbox(
    page: pymupdf.Page,
    rect: rect_like,
    buffer: typing.Union[str, list],
    *,
    fontname: str = "helv",
    fontfile: OptStr = None,
    set_simple: int = 0,
    encoding: int = 0,
    fontsize: float = 11,
    lineheight: OptFloat = None,
    color: OptSeq = None,
    fill: OptSeq = None,
    expandtabs: int = 1,
    align: int = 0,
    rotate: int = 0,
    render_mode: int = 0,
    miter_limit: float = 1,
    border_width: float = 0.05,
    morph: OptSeq = None,
    overlay: bool = True,
    stroke_opacity: float = 1,
    fill_opacity: float = 1,
    oc: int = 0,
) -> float:
    """Fill a rectangle of the page with text.

    Notes:
        Convenience wrapper: a new Shape of the page is created, its
        method of the same name does the work, and the shape is committed
        if that method returned a non-negative value.
    Parameters:
        rect: (rect-like) area to use for text.
        buffer: text to be inserted
        fontname: a Base-14 font, font name or '/name'
        fontfile: name of a font file
        fontsize: font size
        lineheight: overwrite the font property
        color: RGB color triple
        expandtabs: handles tabulators with string function
        align: left, center, right, justified
        rotate: 0, 90, 180, or 270 degrees
        morph: morph box with a matrix and a fixpoint
        overlay: put text in foreground or background
    Returns:
        unused or deficit rectangle area (float)
    """
    # everything except 'overlay' is handed through to the Shape method
    options = {
        "fontsize": fontsize,
        "lineheight": lineheight,
        "fontname": fontname,
        "fontfile": fontfile,
        "set_simple": set_simple,
        "encoding": encoding,
        "color": color,
        "fill": fill,
        "expandtabs": expandtabs,
        "render_mode": render_mode,
        "miter_limit": miter_limit,
        "border_width": border_width,
        "align": align,
        "rotate": rotate,
        "morph": morph,
        "stroke_opacity": stroke_opacity,
        "fill_opacity": fill_opacity,
        "oc": oc,
    }
    shape = page.new_shape()
    result = shape.insert_textbox(rect, buffer, **options)
    if result >= 0:  # a negative value means: nothing to commit
        shape.commit(overlay)
    return result
| 2324 | |
| 2325 | |
def insert_text(
    page: pymupdf.Page,
    point: point_like,
    text: typing.Union[str, list],
    *,
    fontsize: float = 11,
    lineheight: OptFloat = None,
    fontname: str = "helv",
    fontfile: OptStr = None,
    set_simple: int = 0,
    encoding: int = 0,
    color: OptSeq = None,
    fill: OptSeq = None,
    border_width: float = 0.05,
    miter_limit: float = 1,
    render_mode: int = 0,
    rotate: int = 0,
    morph: OptSeq = None,
    overlay: bool = True,
    stroke_opacity: float = 1,
    fill_opacity: float = 1,
    oc: int = 0,
):
    """Insert text starting at a point of the page.

    Notes:
        Convenience wrapper: a new Shape of the page is created, its
        method of the same name does the work, and the shape is committed
        if that method returned a non-negative value.
    Parameters:
        point: (point-like) where the text starts.
        text: text to be inserted
        overlay: passed to the commit of the shape.
        All other parameters are handed through to Shape.insert_text().
    Returns:
        The return value of Shape.insert_text().
    """
    # everything except 'overlay' is handed through to the Shape method
    options = {
        "fontsize": fontsize,
        "lineheight": lineheight,
        "fontname": fontname,
        "fontfile": fontfile,
        "set_simple": set_simple,
        "encoding": encoding,
        "color": color,
        "fill": fill,
        "border_width": border_width,
        "render_mode": render_mode,
        "miter_limit": miter_limit,
        "rotate": rotate,
        "morph": morph,
        "stroke_opacity": stroke_opacity,
        "fill_opacity": fill_opacity,
        "oc": oc,
    }
    shape = page.new_shape()
    result = shape.insert_text(point, text, **options)
    if result >= 0:  # a negative value means: nothing to commit
        shape.commit(overlay)
    return result
| 2374 | |
| 2375 | |
def insert_htmlbox(
    page,
    rect,
    text,
    *,
    css=None,
    scale_low=0,
    archive=None,
    rotate=0,
    oc=0,
    opacity=1,
    overlay=True,
) -> typing.Tuple[float, float]:
    """Insert text with optional HTML tags and stylings into a rectangle.

    Args:
        rect: (rect-like) rectangle into which the text should be placed.
        text: (str or Story) text with optional HTML tags and stylings.
            If a Story is passed, it is used as is: 'css' and 'archive'
            are not applied to it here.
        css: (str) CSS styling commands.
        scale_low: (float) force-fit content by scaling it down. Must be in
            range [0, 1]. If 1, no scaling will take place. If 0, arbitrary
            down-scaling is acceptable. A value of 0.1 would mean that content
            may be scaled down by at most 90%.
        archive: Archive object pointing to locations of used fonts or images
        rotate: (int) rotate the text in the box by a multiple of 90 degrees.
        oc: (int) the xref of an OCG / OCMD (Optional Content).
        opacity: (float) set opacity of inserted content.
        overlay: (bool) put text on top of page content.
    Returns:
        A tuple of floats (spare_height, scale).
        spare_height: -1 if content did not fit, else >= 0. It is the height of the
               unused (still available) rectangle stripe. Positive only if
               no down scaling happened (scale = 1).
        scale: downscaling factor, 0 < scale <= 1. If there was no fit
               (spare_height = -1), the value of 'scale_low' is returned.
    Raises:
        ValueError: if 'rotate' is no multiple of 90, 'scale_low' is outside
            [0, 1], or 'text' is neither a string nor a Story.
    """

    # normalize rotation angle to one of 0, 90, 180, 270
    if rotate % 90 != 0:
        raise ValueError("bad rotation angle")
    rotate %= 360

    if not 0 <= scale_low <= 1:
        raise ValueError("'scale_low' must be in [0, 1]")

    if css is None:
        css = ""

    rect = pymupdf.Rect(rect)
    # The story is laid out unrotated: swap width and height of the
    # layout rectangle if the result will be turned by 90 or 270 degrees.
    if rotate in (90, 270):
        temp_rect = pymupdf.Rect(0, 0, rect.height, rect.width)
    else:
        temp_rect = pymupdf.Rect(0, 0, rect.width, rect.height)

    # use a small border by default
    mycss = "body {margin:1px;}" + css  # append user CSS

    # either make a story, or accept a given one
    if isinstance(text, str):  # if a string, convert to a Story
        story = pymupdf.Story(html=text, user_css=mycss, archive=archive)
    elif isinstance(text, pymupdf.Story):
        story = text
    else:
        raise ValueError("'text' must be a string or a Story")
    # ----------------------------------------------------------------
    # Find a scaling factor that lets our story fit in
    # ----------------------------------------------------------------
    # The layout rectangle is enlarged by the factor found here; the
    # reciprocal of that factor is the shrink factor of the content.
    scale_max = None if scale_low == 0 else 1 / scale_low

    fit = story.fit_scale(temp_rect, scale_min=1, scale_max=scale_max)
    if not fit.big_enough:  # there was no fit
        return (-1, scale_low)

    filled = fit.filled
    scale = 1 / fit.parameter  # shrink factor

    spare_height = fit.rect.y1 - filled[3]  # unused room at rectangle bottom
    # Note: due to MuPDF's logic this may be negative even for successful fits.
    if scale != 1 or spare_height < 0:  # if scaling occurred, set spare_height to 0
        spare_height = 0

    def rect_function(*args):
        """Always offer the fitted rectangle, with no transformation."""
        return fit.rect, fit.rect, pymupdf.Identity

    # draw story on temp PDF page
    doc = story.write_with_links(rect_function)

    # Insert opacity if requested.
    # For this, we prepend a command to the /Contents.
    if 0 <= opacity < 1:
        tpage = doc[0]  # load page
        # generate /ExtGstate for the page
        alp0 = tpage._set_opacity(CA=opacity, ca=opacity)
        s = f"/{alp0} gs\n"  # generate graphic state command
        pymupdf.TOOLS._insert_contents(tpage, s.encode(), 0)

    # put result in target page
    page.show_pdf_page(rect, doc, 0, rotate=rotate, oc=oc, overlay=overlay)

    # -------------------------------------------------------------------------
    # re-insert links in target rect (show_pdf_page cannot copy annotations)
    # -------------------------------------------------------------------------
    # scaled center point of fit.rect
    mp1 = (fit.rect.tl + fit.rect.br) / 2 * scale

    # center point of target rect
    mp2 = (rect.tl + rect.br) / 2

    # compute link positioning matrix:
    # - move center of scaled-down fit.rect to (0,0)
    # - rotate
    # - move (0,0) to center of target rect
    mat = (
        pymupdf.Matrix(scale, 0, 0, scale, -mp1.x, -mp1.y)
        * pymupdf.Matrix(-rotate)
        * pymupdf.Matrix(1, 0, 0, 1, mp2.x, mp2.y)
    )

    # copy over links
    for link in doc[0].get_links():
        link["from"] *= mat
        page.insert_link(link)

    return spare_height, scale
| 2502 | |
| 2503 | |
def new_page(
    doc: pymupdf.Document,
    pno: int = -1,
    width: float = 595,
    height: float = 842,
) -> pymupdf.Page:
    """Insert an empty page into the document and hand it back.

    Args:
        doc: the document that receives the new page.
        pno: (int) insert before this page. Default: after last page.
        width: (float) page width in points. Default: 595 (ISO A4 width).
        height: (float) page height in points. Default: 842 (ISO A4 height).
    Returns:
        The pymupdf.Page object found at index 'pno' after the insertion.
    """
    doc._newPage(pno, width=width, height=height)
    created = doc[pno]
    return created
| 2521 | |
| 2522 | |
def insert_page(
    doc: pymupdf.Document,
    pno: int,
    text: typing.Union[str, list, None] = None,
    fontsize: float = 11,
    width: float = 595,
    height: float = 842,
    fontname: str = "helv",
    fontfile: OptStr = None,
    color: OptSeq = (0,),
) -> int:
    """Add a new PDF page and optionally write text onto it.

    Notes:
        Combines pymupdf.Document.new_page() and pymupdf.Page.insert_text();
        see these methods for parameter details. The text is inserted at
        point (50, 72) of the new page.

    Returns:
        0 if 'text' is empty or None (the page is still created),
        otherwise the return value of pymupdf.Page.insert_text().
    """
    page = doc.new_page(pno=pno, width=width, height=height)
    if text:
        return page.insert_text(
            (50, 72),
            text,
            fontsize=fontsize,
            fontname=fontname,
            fontfile=fontfile,
            color=color,
        )
    # nothing to write: the empty page was created nevertheless
    return 0
| 2552 | |
| 2553 | |
def draw_line(
    page: pymupdf.Page,
    p1: point_like,
    p2: point_like,
    color: OptSeq = (0,),
    dashes: OptStr = None,
    width: float = 1,
    lineCap: int = 0,
    lineJoin: int = 0,
    overlay: bool = True,
    morph: OptSeq = None,
    stroke_opacity: float = 1,
    fill_opacity: float = 1,
    oc: int = 0,
) -> pymupdf.Point:
    """Draw a line from point p1 to point p2.

    Creates a new shape on the page, draws the line, finishes the path
    with the given properties and commits it.

    Args:
        page: the page to draw on.
        p1, p2: start and end point of the line.
        overlay: passed to Shape.commit().
        All other arguments are forwarded unchanged to Shape.finish().
        The path is never closed (closePath=False).
    Returns:
        The value returned by Shape.draw_line().
    """
    img = page.new_shape()
    p = img.draw_line(pymupdf.Point(p1), pymupdf.Point(p2))
    img.finish(
        color=color,
        dashes=dashes,
        width=width,
        closePath=False,
        lineCap=lineCap,
        lineJoin=lineJoin,
        morph=morph,
        stroke_opacity=stroke_opacity,
        fill_opacity=fill_opacity,
        oc=oc,
    )
    img.commit(overlay)

    return p
| 2587 | |
| 2588 | |
def draw_squiggle(
    page: pymupdf.Page,
    p1: point_like,
    p2: point_like,
    breadth: float = 2,
    color: OptSeq = (0,),
    dashes: OptStr = None,
    width: float = 1,
    lineCap: int = 0,
    lineJoin: int = 0,
    overlay: bool = True,
    morph: OptSeq = None,
    stroke_opacity: float = 1,
    fill_opacity: float = 1,
    oc: int = 0,
) -> pymupdf.Point:
    """Draw a squiggly line from point p1 to point p2.

    Convenience wrapper: new shape -> Shape.draw_squiggle() ->
    Shape.finish() -> Shape.commit(overlay). The path is not closed.

    Returns:
        The value returned by Shape.draw_squiggle().
    """
    shape = page.new_shape()
    end_point = shape.draw_squiggle(
        pymupdf.Point(p1), pymupdf.Point(p2), breadth=breadth
    )
    finish_args = dict(
        color=color,
        dashes=dashes,
        width=width,
        closePath=False,
        lineCap=lineCap,
        lineJoin=lineJoin,
        morph=morph,
        stroke_opacity=stroke_opacity,
        fill_opacity=fill_opacity,
        oc=oc,
    )
    shape.finish(**finish_args)
    shape.commit(overlay)
    return end_point
| 2623 | |
| 2624 | |
def draw_zigzag(
    page: pymupdf.Page,
    p1: point_like,
    p2: point_like,
    breadth: float = 2,
    color: OptSeq = (0,),
    dashes: OptStr = None,
    width: float = 1,
    lineCap: int = 0,
    lineJoin: int = 0,
    overlay: bool = True,
    morph: OptSeq = None,
    stroke_opacity: float = 1,
    fill_opacity: float = 1,
    oc: int = 0,
) -> pymupdf.Point:
    """Draw a zigzag line from point p1 to point p2.

    Convenience wrapper: new shape -> Shape.draw_zigzag() ->
    Shape.finish() -> Shape.commit(overlay). The path is not closed.

    Returns:
        The value returned by Shape.draw_zigzag().
    """
    shape = page.new_shape()
    end_point = shape.draw_zigzag(
        pymupdf.Point(p1), pymupdf.Point(p2), breadth=breadth
    )
    finish_args = dict(
        color=color,
        dashes=dashes,
        width=width,
        closePath=False,
        lineCap=lineCap,
        lineJoin=lineJoin,
        morph=morph,
        stroke_opacity=stroke_opacity,
        fill_opacity=fill_opacity,
        oc=oc,
    )
    shape.finish(**finish_args)
    shape.commit(overlay)
    return end_point
| 2659 | |
| 2660 | |
def draw_rect(
    page: pymupdf.Page,
    rect: rect_like,
    color: OptSeq = (0,),
    fill: OptSeq = None,
    dashes: OptStr = None,
    width: float = 1,
    lineCap: int = 0,
    lineJoin: int = 0,
    morph: OptSeq = None,
    overlay: bool = True,
    stroke_opacity: float = 1,
    fill_opacity: float = 1,
    oc: int = 0,
    radius=None,
) -> pymupdf.Point:
    """Draw a rectangle. See Shape class method for details.

    Convenience wrapper: new shape -> Shape.draw_rect(rect, radius=radius)
    -> Shape.finish() -> Shape.commit(overlay).

    Returns:
        The value returned by Shape.draw_rect().
    """
    shape = page.new_shape()
    result = shape.draw_rect(pymupdf.Rect(rect), radius=radius)
    finish_args = dict(
        color=color,
        fill=fill,
        dashes=dashes,
        width=width,
        lineCap=lineCap,
        lineJoin=lineJoin,
        morph=morph,
        stroke_opacity=stroke_opacity,
        fill_opacity=fill_opacity,
        oc=oc,
    )
    shape.finish(**finish_args)
    shape.commit(overlay)
    return result
| 2697 | |
| 2698 | |
def draw_quad(
    page: pymupdf.Page,
    quad: quad_like,
    color: OptSeq = (0,),
    fill: OptSeq = None,
    dashes: OptStr = None,
    width: float = 1,
    lineCap: int = 0,
    lineJoin: int = 0,
    morph: OptSeq = None,
    overlay: bool = True,
    stroke_opacity: float = 1,
    fill_opacity: float = 1,
    oc: int = 0,
) -> pymupdf.Point:
    """Draw a quadrilateral.

    Convenience wrapper: new shape -> Shape.draw_quad() ->
    Shape.finish() -> Shape.commit(overlay).

    Returns:
        The value returned by Shape.draw_quad().
    """
    shape = page.new_shape()
    result = shape.draw_quad(pymupdf.Quad(quad))
    finish_args = dict(
        color=color,
        fill=fill,
        dashes=dashes,
        width=width,
        lineCap=lineCap,
        lineJoin=lineJoin,
        morph=morph,
        stroke_opacity=stroke_opacity,
        fill_opacity=fill_opacity,
        oc=oc,
    )
    shape.finish(**finish_args)
    shape.commit(overlay)
    return result
| 2732 | |
| 2733 | |
def draw_polyline(
    page: pymupdf.Page,
    points: list,
    color: OptSeq = (0,),
    fill: OptSeq = None,
    dashes: OptStr = None,
    width: float = 1,
    morph: OptSeq = None,
    lineCap: int = 0,
    lineJoin: int = 0,
    overlay: bool = True,
    closePath: bool = False,
    stroke_opacity: float = 1,
    fill_opacity: float = 1,
    oc: int = 0,
) -> pymupdf.Point:
    """Draw multiple connected line segments.

    Convenience wrapper: new shape -> Shape.draw_polyline(points) ->
    Shape.finish() -> Shape.commit(overlay).

    Returns:
        The value returned by Shape.draw_polyline().
    """
    shape = page.new_shape()
    result = shape.draw_polyline(points)
    finish_args = dict(
        color=color,
        fill=fill,
        dashes=dashes,
        width=width,
        lineCap=lineCap,
        lineJoin=lineJoin,
        morph=morph,
        closePath=closePath,
        stroke_opacity=stroke_opacity,
        fill_opacity=fill_opacity,
        oc=oc,
    )
    shape.finish(**finish_args)
    shape.commit(overlay)
    return result
| 2769 | |
| 2770 | |
def draw_circle(
    page: pymupdf.Page,
    center: point_like,
    radius: float,
    color: OptSeq = (0,),
    fill: OptSeq = None,
    morph: OptSeq = None,
    dashes: OptStr = None,
    width: float = 1,
    lineCap: int = 0,
    lineJoin: int = 0,
    overlay: bool = True,
    stroke_opacity: float = 1,
    fill_opacity: float = 1,
    oc: int = 0,
) -> pymupdf.Point:
    """Draw a circle given its center and radius.

    Convenience wrapper: new shape -> Shape.draw_circle() ->
    Shape.finish() -> Shape.commit(overlay).

    Returns:
        The value returned by Shape.draw_circle().
    """
    shape = page.new_shape()
    result = shape.draw_circle(pymupdf.Point(center), radius)
    finish_args = dict(
        color=color,
        fill=fill,
        dashes=dashes,
        width=width,
        lineCap=lineCap,
        lineJoin=lineJoin,
        morph=morph,
        stroke_opacity=stroke_opacity,
        fill_opacity=fill_opacity,
        oc=oc,
    )
    shape.finish(**finish_args)
    shape.commit(overlay)
    return result
| 2804 | |
| 2805 | |
def draw_oval(
    page: pymupdf.Page,
    rect: typing.Union[rect_like, quad_like],
    color: OptSeq = (0,),
    fill: OptSeq = None,
    dashes: OptStr = None,
    morph: OptSeq = None,
    width: float = 1,
    lineCap: int = 0,
    lineJoin: int = 0,
    overlay: bool = True,
    stroke_opacity: float = 1,
    fill_opacity: float = 1,
    oc: int = 0,
) -> pymupdf.Point:
    """Draw an oval given its containing rectangle or quad.

    Convenience wrapper: new shape -> Shape.draw_oval(rect) ->
    Shape.finish() -> Shape.commit(overlay). 'rect' is handed to the
    shape method as is.

    Returns:
        The value returned by Shape.draw_oval().
    """
    shape = page.new_shape()
    result = shape.draw_oval(rect)
    finish_args = dict(
        color=color,
        fill=fill,
        dashes=dashes,
        width=width,
        lineCap=lineCap,
        lineJoin=lineJoin,
        morph=morph,
        stroke_opacity=stroke_opacity,
        fill_opacity=fill_opacity,
        oc=oc,
    )
    shape.finish(**finish_args)
    shape.commit(overlay)
    return result
| 2839 | |
| 2840 | |
def draw_curve(
    page: pymupdf.Page,
    p1: point_like,
    p2: point_like,
    p3: point_like,
    color: OptSeq = (0,),
    fill: OptSeq = None,
    dashes: OptStr = None,
    width: float = 1,
    morph: OptSeq = None,
    closePath: bool = False,
    lineCap: int = 0,
    lineJoin: int = 0,
    overlay: bool = True,
    stroke_opacity: float = 1,
    fill_opacity: float = 1,
    oc: int = 0,
) -> pymupdf.Point:
    """Draw a special Bezier curve from p1 to p3, generating control points on lines p1 to p2 and p2 to p3.

    Convenience wrapper: new shape -> Shape.draw_curve() ->
    Shape.finish() -> Shape.commit(overlay).

    Returns:
        The value returned by Shape.draw_curve().
    """
    shape = page.new_shape()
    result = shape.draw_curve(
        pymupdf.Point(p1), pymupdf.Point(p2), pymupdf.Point(p3)
    )
    finish_args = dict(
        color=color,
        fill=fill,
        dashes=dashes,
        width=width,
        lineCap=lineCap,
        lineJoin=lineJoin,
        morph=morph,
        closePath=closePath,
        stroke_opacity=stroke_opacity,
        fill_opacity=fill_opacity,
        oc=oc,
    )
    shape.finish(**finish_args)
    shape.commit(overlay)
    return result
| 2878 | |
| 2879 | |
def draw_bezier(
    page: pymupdf.Page,
    p1: point_like,
    p2: point_like,
    p3: point_like,
    p4: point_like,
    color: OptSeq = (0,),
    fill: OptSeq = None,
    dashes: OptStr = None,
    width: float = 1,
    morph: OptSeq = None,
    closePath: bool = False,
    lineCap: int = 0,
    lineJoin: int = 0,
    overlay: bool = True,
    stroke_opacity: float = 1,
    fill_opacity: float = 1,
    oc: int = 0,
) -> pymupdf.Point:
    """Draw a general cubic Bezier curve from p1 to p4 using control points p2 and p3.

    Creates a new shape on the page, draws the curve, finishes the path
    with the given properties and commits it.

    Args:
        page: the page to draw on.
        p1, p4: start and end point of the curve.
        p2, p3: the two control points.
        morph: forwarded to Shape.finish(); annotated as a sequence like
            in all sibling draw_* wrappers.
        overlay: passed to Shape.commit().
        All other arguments are forwarded unchanged to Shape.finish().
    Returns:
        The value returned by Shape.draw_bezier().
    """
    img = page.new_shape()
    Q = img.draw_bezier(pymupdf.Point(p1), pymupdf.Point(p2), pymupdf.Point(p3), pymupdf.Point(p4))
    img.finish(
        color=color,
        fill=fill,
        dashes=dashes,
        width=width,
        lineCap=lineCap,
        lineJoin=lineJoin,
        morph=morph,
        closePath=closePath,
        stroke_opacity=stroke_opacity,
        fill_opacity=fill_opacity,
        oc=oc,
    )
    img.commit(overlay)

    return Q
| 2918 | |
| 2919 | |
def draw_sector(
    page: pymupdf.Page,
    center: point_like,
    point: point_like,
    beta: float,
    color: OptSeq = (0,),
    fill: OptSeq = None,
    dashes: OptStr = None,
    fullSector: bool = True,
    morph: OptSeq = None,
    width: float = 1,
    closePath: bool = False,
    lineCap: int = 0,
    lineJoin: int = 0,
    overlay: bool = True,
    stroke_opacity: float = 1,
    fill_opacity: float = 1,
    oc: int = 0,
) -> pymupdf.Point:
    """Draw a circle sector given circle center, one arc end point and the angle of the arc.

    Convenience wrapper: new shape -> Shape.draw_sector() ->
    Shape.finish() -> Shape.commit(overlay).

    Parameters:
        center -- center of circle
        point -- arc end point
        beta -- angle of arc (degrees)
        fullSector -- connect arc ends with center
    Returns:
        The value returned by Shape.draw_sector().
    """
    shape = page.new_shape()
    result = shape.draw_sector(
        pymupdf.Point(center),
        pymupdf.Point(point),
        beta,
        fullSector=fullSector,
    )
    finish_args = dict(
        color=color,
        fill=fill,
        dashes=dashes,
        width=width,
        lineCap=lineCap,
        lineJoin=lineJoin,
        morph=morph,
        closePath=closePath,
        stroke_opacity=stroke_opacity,
        fill_opacity=fill_opacity,
        oc=oc,
    )
    shape.finish(**finish_args)
    shape.commit(overlay)
    return result
| 2965 | |
| 2966 | |
| 2967 # ---------------------------------------------------------------------- | |
| 2968 # Name: wx.lib.colourdb.py | |
| 2969 # Purpose: Adds a bunch of colour names and RGB values to the | |
| 2970 # colour database so they can be found by name | |
| 2971 # | |
| 2972 # Author: Robin Dunn | |
| 2973 # | |
| 2974 # Created: 13-March-2001 | |
| 2975 # Copyright: (c) 2001-2017 by Total Control Software | |
| 2976 # Licence: wxWindows license | |
| 2977 # Tags: phoenix-port, unittest, documented | |
| 2978 # ---------------------------------------------------------------------- | |
| 2979 | |
| 2980 | |
def getColorList() -> list:
    """
    Return the colour names of pymupdf.colors_wx_list(), in the same order.
    :rtype: list of strings
    """
    names = []
    # every entry is a (name, red, green, blue) 4-tuple; keep the name only
    for name, _red, _green, _blue in pymupdf.colors_wx_list():
        names.append(name)
    return names
| 2987 | |
| 2988 | |
def getColorInfoList() -> list:
    """
    Returns list of (name, red, green, blue) tuples, where:
    name: upper-case color name.
    red, green, blue: integers in range 0..255.
    :rtype: list of tuples
    """
    color_infos = pymupdf.colors_wx_list()
    return color_infos
| 2997 | |
| 2998 | |
def getColor(name: str) -> tuple:
    """Look up an RGB color in PDF format by its name.

    The name is converted to lower case before the lookup.

    Returns:
        a triple of floats in range 0 to 1. In case of name-not-found, "white" is returned.
    """
    pdf_colors = pymupdf.colors_pdf_dict()
    key = name.lower()
    white = (1, 1, 1)  # fallback for unknown names
    return pdf_colors.get(key, white)
| 3006 | |
| 3007 | |
def getColorHSV(name: str) -> tuple:
    """Retrieve the hue, saturation, value triple of a color name.

    The name is converted to upper case and located in getColorList();
    its RGB components are then taken from getColorInfoList().

    Returns:
        a triple (degree, percent, percent). If not found (-1, -1, -1) is returned.
        Hue and saturation are integers, the value is rounded to one decimal.
    """
    try:
        infos = getColorInfoList()
        position = getColorList().index(name.upper())
        entry = infos[position]
    except Exception:
        if g_exceptions_verbose: pymupdf.exception_info()
        return (-1, -1, -1)

    # scale the 0..255 components down to 0..1
    red, green, blue = (entry[i] / 255.0 for i in (1, 2, 3))
    strongest = max(red, green, blue)
    weakest = min(red, green, blue)
    spread = strongest - weakest

    if spread == 0:  # a shade of gray: hue is undefined, use 0
        hue = 0
    elif strongest == red:
        hue = 60.0 * (((green - blue) / spread) % 6)
    elif strongest == green:
        hue = 60.0 * (((blue - red) / spread) + 2)
    else:
        hue = 60.0 * (((red - green) / spread) + 4)

    saturation = 0 if strongest == 0 else spread / strongest

    return (
        int(round(hue)),
        int(round(saturation * 100)),
        round(strongest * 100, 1),
    )
| 3045 | |
| 3046 | |
def _get_font_properties(doc: pymupdf.Document, xref: int) -> tuple:
    """Return (fontname, ext, stype, ascender, descender) for a font xref.

    Name, extension, type and font buffer come from doc.extract_font().
    Ascender / descender start out as 0.8 / -0.2 and are
      - returned unchanged if the extension is empty,
      - taken from a pymupdf.Font built from the buffer (or, without a
        buffer and if the extension is not "n/a", built from the font
        name),
      - multiplied by 1.2 if no font can be built or building it fails.
    """
    fontname, ext, stype, buffer = doc.extract_font(xref)
    asc, dsc = 0.8, -0.2
    if ext == "":
        return fontname, ext, stype, asc, dsc

    if not buffer and ext == "n/a":
        # nothing to build a Font from: scale the defaults
        asc *= 1.2
        dsc *= 1.2
        return fontname, ext, stype, asc, dsc

    try:
        if buffer:
            font = pymupdf.Font(fontbuffer=buffer)
            asc = font.ascender
            dsc = font.descender
            bbox = font.bbox
            if asc - dsc < 1:
                # total height below 1: extend, using the bbox bottom
                # if that is lower than the descender
                if bbox.y0 < dsc:
                    dsc = bbox.y0
                asc = 1 - dsc
        else:
            font = pymupdf.Font(fontname)
            asc = font.ascender
            dsc = font.descender
    except Exception:
        pymupdf.exception_info()
        # scales whatever values are current at the time of the failure
        asc *= 1.2
        dsc *= 1.2
    return fontname, ext, stype, asc, dsc
| 3082 | |
| 3083 | |
def get_char_widths(
    doc: pymupdf.Document, xref: int, limit: int = 256, idx: int = 0, fontdict: OptDict = None
) -> list:
    """Get list of glyph information of a font.

    Notes:
        Must be provided by its XREF number. If we already dealt with the
        font, it will be recorded in doc.FontInfos. Otherwise we insert an
        entry there.
        Finally we return the glyphs for the font. This is a list of
        (glyph, width) where glyph is an integer controlling the char
        appearance, and width is a float controlling the char's spacing:
        width * fontsize is the actual space.
        For 'simple' fonts, glyph == ord(char) will usually be true.
        Exceptions are 'Symbol' and 'ZapfDingbats'. We are providing data for these directly here.

    Args:
        doc: the document owning the font.
        xref: xref number of the font.
        limit: requested minimum size of the glyph table. At least 256
            entries are always requested.
        idx: passed through to doc._get_char_widths().
        fontdict: optional font description with at least the keys
            "name", "ext" and "type". Only used if the font is not yet
            recorded; the keys "simple", "glyphs" and "ordering" are
            (re)computed here and stored in this dictionary.
    Returns:
        The glyph table, or None for the CJK fonts recognized by name.
    Raises:
        ValueError: if the font extension is the empty string.
    """
    fontinfo = pymupdf.CheckFontInfo(doc, xref)
    if fontinfo is None:  # not recorded yet: create it
        if fontdict is None:
            name, ext, stype, asc, dsc = _get_font_properties(doc, xref)
            fontdict = {
                "name": name,
                "type": stype,
                "ext": ext,
                "ascender": asc,
                "descender": dsc,
            }
        else:
            # "simple" and "ordering" are derived below in every case, so
            # they need not be present in a caller-supplied dictionary.
            name = fontdict["name"]
            ext = fontdict["ext"]
            stype = fontdict["type"]

        if ext == "":
            raise ValueError("xref is not a font")

        # check for 'simple' fonts
        fontdict["simple"] = stype in ("Type1", "MMType1", "TrueType")

        # check for CJK fonts: ordering >= 0 marks one, -1 means "no CJK"
        cjk_ordering = {
            "Fangti": 0,
            "Ming": 0,
            "Heiti": 1,
            "Song": 1,
            "Gothic": 2,
            "Mincho": 2,
            "Dotum": 3,
            "Batang": 3,
        }
        ordering = cjk_ordering.get(name, -1)

        # these two fonts come with predefined glyph tables
        if name == "ZapfDingbats":
            glyphs = pymupdf.zapf_glyphs
        elif name == "Symbol":
            glyphs = pymupdf.symbol_glyphs
        else:
            glyphs = None

        fontdict["glyphs"] = glyphs
        fontdict["ordering"] = ordering
        fontinfo = [xref, fontdict]
        doc.FontInfos.append(fontinfo)
    else:
        fontdict = fontinfo[1]
        glyphs = fontdict["glyphs"]
        ordering = fontdict["ordering"]

    oldlimit = 0 if glyphs is None else len(glyphs)
    mylimit = max(256, limit)

    if mylimit <= oldlimit:  # the recorded table is already large enough
        return glyphs

    if ordering < 0:  # not a CJK font
        glyphs = doc._get_char_widths(
            xref, fontdict["name"], fontdict["ext"], fontdict["ordering"], mylimit, idx
        )
    else:  # CJK fonts use char codes and width = 1
        glyphs = None

    fontdict["glyphs"] = glyphs
    fontinfo[1] = fontdict
    pymupdf.UpdateFontInfo(doc, fontinfo)

    return glyphs
| 3180 | |
| 3181 | |
| 3182 class Shape: | |
| 3183 """Create a new shape.""" | |
| 3184 | |
@staticmethod
def horizontal_angle(C, P):
    """Return the angle to the horizontal for the connection from C to P.

    The arcus sine of the unit vector's absolute y-component yields an
    angle between 0 and pi/2. The signs of the unit vector's x and y
    then decide in which quadrant the result must lie, giving a value
    in radians between -pi and pi.
    """
    direction = pymupdf.Point(P - C).unit  # unit vector 'C' -> 'P'
    base = math.asin(abs(direction.y))  # absolute angle from horizontal
    if direction.x < 0:  # left half plane
        if direction.y <= 0:
            return -(math.pi - base)
        return math.pi - base
    # right half plane: only the sign needs to be adjusted
    return base if direction.y >= 0 else -base
| 3204 | |
def __init__(self, page: pymupdf.Page):
    """Set up an empty shape for the given page.

    Args:
        page: page to draw on; checked with pymupdf.CheckParent().
    Raises:
        ValueError: if the page's parent document is not a PDF.
    """
    pymupdf.CheckParent(page)
    owner = page.parent
    self.page = page
    self.doc = owner
    if not owner.is_pdf:
        raise ValueError("is no PDF")

    mediabox_size = page.mediabox_size
    self.height = mediabox_size.y
    self.width = mediabox_size.x

    cropbox_pos = page.cropbox_position
    self.x = cropbox_pos.x
    self.y = cropbox_pos.y

    self.pctm = page.transformation_matrix  # page transf. matrix
    self.ipctm = ~self.pctm  # inverted transf. matrix

    # accumulated content strings, all empty initially
    self.draw_cont = self.text_cont = self.totalcont = ""
    self.last_point = None  # no current point yet
    self.rect = None  # no area covered yet
| 3224 | |
def updateRect(self, x):
    """Extend self.rect such that it also covers 'x'.

    Args:
        x: a point (length 2) or a rectangle (any other length). If no
            rectangle exists yet, it is created from 'x'.
    """
    is_point = len(x) == 2
    if self.rect is None:
        self.rect = pymupdf.Rect(x, x) if is_point else pymupdf.Rect(x)
        return

    # bring both cases to the same form: a pair of low / high corners
    if is_point:
        pt = pymupdf.Point(x)
        low_x, low_y, high_x, high_y = pt.x, pt.y, pt.x, pt.y
    else:
        box = pymupdf.Rect(x)
        low_x, low_y, high_x, high_y = box.x0, box.y0, box.x1, box.y1

    covered = self.rect
    covered.x0 = min(covered.x0, low_x)
    covered.y0 = min(covered.y0, low_y)
    covered.x1 = max(covered.x1, high_x)
    covered.y1 = max(covered.y1, high_y)
| 3245 | |
def draw_line(self, p1: point_like, p2: point_like) -> pymupdf.Point:
    """Draw a line between two points.

    Appends an "m" operator for p1 unless p1 equals the current last
    point, followed by an "l" operator for p2. Coordinates are
    multiplied with self.ipctm before they are written.

    Returns:
        The new last point (p2 as a pymupdf.Point).
    """
    start = pymupdf.Point(p1)
    end = pymupdf.Point(p2)
    if not (self.last_point == start):
        start_coords = pymupdf.JM_TUPLE(start * self.ipctm)
        self.draw_cont += _format_g(start_coords) + " m\n"
        self.last_point = start
        self.updateRect(start)

    end_coords = pymupdf.JM_TUPLE(end * self.ipctm)
    self.draw_cont += _format_g(end_coords) + " l\n"
    self.updateRect(end)
    self.last_point = end
    return self.last_point
| 3259 | |
def draw_polyline(self, points: list) -> pymupdf.Point:
    """Draw several connected line segments.

    The first point produces an "m" operator unless it equals the
    current last point; every following point produces an "l" operator.
    Each point also extends self.rect.

    Returns:
        The new last point (the final entry of 'points').
    """
    for index, raw in enumerate(points):
        vertex = pymupdf.Point(raw)
        if index > 0:
            self.draw_cont += _format_g(pymupdf.JM_TUPLE(vertex * self.ipctm)) + " l\n"
        elif not (self.last_point == vertex):
            self.draw_cont += _format_g(pymupdf.JM_TUPLE(vertex * self.ipctm)) + " m\n"
            self.last_point = vertex
        self.updateRect(raw)

    self.last_point = pymupdf.Point(points[-1])
    return self.last_point
| 3273 | |
def draw_bezier(
    self,
    p1: point_like,
    p2: point_like,
    p3: point_like,
    p4: point_like,
) -> pymupdf.Point:
    """Draw a standard cubic Bezier curve.

    Appends an "m" operator for p1 unless p1 equals the current last
    point, then one "c" operator with the coordinates of p2, p3 and p4.
    All four points extend self.rect.

    Returns:
        The new last point (p4 as a pymupdf.Point).
    """
    start, ctrl1, ctrl2, end = (pymupdf.Point(p) for p in (p1, p2, p3, p4))
    if not (self.last_point == start):
        self.draw_cont += _format_g(pymupdf.JM_TUPLE(start * self.ipctm)) + " m\n"

    # six numbers: both control points, then the end point
    coords = []
    for pt in (ctrl1, ctrl2, end):
        coords.extend(pt * self.ipctm)
    self.draw_cont += _format_g(pymupdf.JM_TUPLE(coords)) + " c\n"

    for pt in (start, ctrl1, ctrl2, end):
        self.updateRect(pt)
    self.last_point = end
    return self.last_point
| 3296 | |
def draw_oval(self, tetra: typing.Union[quad_like, rect_like]) -> pymupdf.Point:
    """Draw an ellipse inside a tetrapod.

    Args:
        tetra: sequence of length 4. If its first item has a __float__
            attribute it is treated as a rectangle, otherwise as a quad.
    Returns:
        The new last point: the middle of the quad's left side.
    Raises:
        ValueError: if 'tetra' does not have length 4.
    """
    if len(tetra) != 4:
        raise ValueError("invalid arg length")
    numeric = hasattr(tetra[0], "__float__")
    q = pymupdf.Rect(tetra).quad if numeric else pymupdf.Quad(tetra)

    def halfway(a, b):
        # point in the middle between a and b
        return a + (b - a) * 0.5

    mid_top = halfway(q.ul, q.ur)
    mid_right = halfway(q.ur, q.lr)
    mid_bottom = halfway(q.ll, q.lr)
    mid_left = halfway(q.ul, q.ll)

    if not (self.last_point == mid_left):
        self.draw_cont += _format_g(pymupdf.JM_TUPLE(mid_left * self.ipctm)) + " m\n"
        self.last_point = mid_left

    # four quarter curves, each bending around one corner of the quad
    quarters = (
        (mid_left, q.ll, mid_bottom),
        (mid_bottom, q.lr, mid_right),
        (mid_right, q.ur, mid_top),
        (mid_top, q.ul, mid_left),
    )
    for begin, corner, finish in quarters:
        self.draw_curve(begin, corner, finish)

    self.updateRect(q.rect)
    self.last_point = mid_left
    return self.last_point
| 3320 | |
def draw_circle(self, center: point_like, radius: float) -> pymupdf.Point:
    """Draw a circle given its center and radius.

    Implemented as a 360 degree sector (without connections to the
    center) that starts at the point 'radius' to the left of 'center'.

    Raises:
        ValueError: if radius is not greater than pymupdf.EPSILON.
    """
    if not radius > pymupdf.EPSILON:
        raise ValueError("radius must be positive")
    middle = pymupdf.Point(center)
    start = middle - (radius, 0)
    return self.draw_sector(middle, start, 360, fullSector=False)
| 3328 | |
def draw_curve(
    self,
    p1: point_like,
    p2: point_like,
    p3: point_like,
) -> pymupdf.Point:
    """Draw a curve between points using one control point.

    The curve runs from p1 to p3 and is drawn by self.draw_bezier().
    Its two Bezier control points lie on the lines p1 -> p2 and
    p3 -> p2, at the fraction 'kappa' of the respective distance.

    Returns:
        The value returned by self.draw_bezier().
    """
    kappa = 0.55228474983
    start = pymupdf.Point(p1)
    helper = pymupdf.Point(p2)
    end = pymupdf.Point(p3)
    ctrl_start = start + (helper - start) * kappa
    ctrl_end = end + (helper - end) * kappa
    return self.draw_bezier(start, ctrl_start, ctrl_end, end)
| 3343 | |
def draw_sector(
    self,
    center: point_like,
    point: point_like,
    beta: float,
    fullSector: bool = True,
) -> pymupdf.Point:
    """Draw a circle sector.

    The arc starts at 'point' and is built from Bezier curves: one per
    full 90 degree step plus one for the remaining angle.

    Args:
        center: center of the circle.
        point: start point of the arc; its distance from 'center' is
            the radius.
        beta: angle of the arc in degrees. Angles beyond 360 degrees are
            reduced first.
        fullSector: if true, additionally connect both arc ends with
            the center.
    Returns:
        The end point of the arc, which becomes the new last point. If
        the angle is too small for any arc to be drawn, this is 'point'.
    Raises:
        ValueError: if the radius is not greater than pymupdf.EPSILON.
            The shape is left unmodified in this case.
    """
    center = pymupdf.Point(center)
    point = pymupdf.Point(point)

    def move_to(a, b):
        return _format_g((a, b)) + " m\n"

    def curve_to(a, b, c, d, e, f):
        return _format_g((a, b, c, d, e, f)) + " c\n"

    def line_to(a, b):
        return _format_g((a, b)) + " l\n"

    C = center
    P = point
    S = P - C  # vector 'center' -> 'point'
    rad = abs(S)  # circle radius

    # Validate before anything is appended to the content string, so
    # that a rejected call does not leave a stray "m" operator behind.
    if not rad > pymupdf.EPSILON:
        raise ValueError("radius must be positive")

    betar = math.radians(-beta)  # note the inverted sign
    # w360 has the opposite sign of betar: adding it reduces abs(betar)
    w360 = math.radians(math.copysign(360, betar)) * (-1)
    w90 = math.radians(math.copysign(90, betar))
    w45 = w90 / 2
    while abs(betar) > 2 * math.pi:
        betar += w360  # bring angle below 360 degrees
    if not (self.last_point == point):
        self.draw_cont += move_to(*pymupdf.JM_TUPLE(point * self.ipctm))
        self.last_point = point

    # Arc end point. Starts out as the arc's start point, so that a
    # negligible angle yields a degenerate sector at 'point' instead of
    # a connection to the page origin.
    Q = point

    alfa = self.horizontal_angle(center, point)
    while abs(betar) > abs(w90):  # draw 90 degree arcs
        q1 = C.x + math.cos(alfa + w90) * rad
        q2 = C.y + math.sin(alfa + w90) * rad
        Q = pymupdf.Point(q1, q2)  # the arc's end point
        r1 = C.x + math.cos(alfa + w45) * rad / math.cos(w45)
        r2 = C.y + math.sin(alfa + w45) * rad / math.cos(w45)
        R = pymupdf.Point(r1, r2)  # crossing point of tangents
        kappah = (1 - math.cos(w45)) * 4 / 3 / abs(R - Q)
        kappa = kappah * abs(P - Q)
        cp1 = P + (R - P) * kappa  # control point 1
        cp2 = Q + (R - Q) * kappa  # control point 2
        self.draw_cont += curve_to(*pymupdf.JM_TUPLE(
            list(cp1 * self.ipctm) + list(cp2 * self.ipctm) + list(Q * self.ipctm)
        ))

        betar -= w90  # reduce param angle by 90 deg
        alfa += w90  # advance start angle by 90 deg
        P = Q  # advance to arc end point
    # draw (remaining) arc
    if abs(betar) > 1e-3:  # significant degrees left?
        beta2 = betar / 2
        q1 = C.x + math.cos(alfa + betar) * rad
        q2 = C.y + math.sin(alfa + betar) * rad
        Q = pymupdf.Point(q1, q2)  # the arc's end point
        r1 = C.x + math.cos(alfa + beta2) * rad / math.cos(beta2)
        r2 = C.y + math.sin(alfa + beta2) * rad / math.cos(beta2)
        R = pymupdf.Point(r1, r2)  # crossing point of tangents
        # kappa height is 4/3 of segment height
        kappah = (1 - math.cos(beta2)) * 4 / 3 / abs(R - Q)  # kappa height
        kappa = kappah * abs(P - Q) / (1 - math.cos(betar))
        cp1 = P + (R - P) * kappa  # control point 1
        cp2 = Q + (R - Q) * kappa  # control point 2
        self.draw_cont += curve_to(*pymupdf.JM_TUPLE(
            list(cp1 * self.ipctm) + list(cp2 * self.ipctm) + list(Q * self.ipctm)
        ))
    if fullSector:
        # arc start -> center -> arc end
        self.draw_cont += move_to(*pymupdf.JM_TUPLE(point * self.ipctm))
        self.draw_cont += line_to(*pymupdf.JM_TUPLE(center * self.ipctm))
        self.draw_cont += line_to(*pymupdf.JM_TUPLE(Q * self.ipctm))
    self.last_point = Q
    return self.last_point
| 3417 | |
def draw_rect(self, rect: rect_like, *, radius=None) -> pymupdf.Point:
    """Draw a rectangle.

    Args:
        rect: the rectangle to draw.
        radius: if not None, the rectangle will have rounded corners.
            This is the radius of the curvature, given as percentage of
            the rectangle width or height. Valid are values 0 < v <= 0.5.
            For a sequence of two values, the corners will have different
            radii. Otherwise, the percentage will be computed from the
            shorter side. A value of (0.5, 0.5) will draw an ellipse.
    Returns:
        The new last point: the top-left corner for a standard
        rectangle, otherwise the end point of the last corner curve.
    Raises:
        ValueError: if 'radius' is neither a number nor a sequence of
            two values, or if a value is outside 0 < v <= 0.5.
    """
    r = pymupdf.Rect(rect)

    if radius is None:  # standard rectangle: a single "re" operator
        corner = list(r.bl * self.ipctm)
        numbers = pymupdf.JM_TUPLE(corner + [r.width, r.height])
        self.draw_cont += _format_g(numbers) + " re\n"
        self.updateRect(r)
        self.last_point = r.tl
        return self.last_point

    # rounded corners requested. This requires 1 or 2 values, each
    # with 0 < value <= 0.5
    if hasattr(radius, "__float__"):
        if radius <= 0 or radius > 0.5:
            raise ValueError(f"bad radius value {radius}.")
        offset = min(r.width, r.height) * radius
        px = (offset, 0)
        py = (0, offset)
    elif hasattr(radius, "__len__") and len(radius) == 2:
        rx, ry = radius
        px = (rx * r.width, 0)
        py = (0, ry * r.height)
        if min(rx, ry) <= 0 or max(rx, ry) > 0.5:
            raise ValueError(f"bad radius value {radius}.")
    else:
        raise ValueError(f"bad radius value {radius}.")

    # Each side is a straight line followed by a curve around the next
    # corner: (end of line, corner, end of curve).
    sides = (
        (r.bl - py, r.bl, r.bl + px),
        (r.br - px, r.br, r.br - py),
        (r.tr + py, r.tr, r.tr - px),
        (r.tl + px, r.tl, r.tl + py),
    )
    lp = r.tl + py  # start below the top-left corner
    for line_end, corner, curve_end in sides:
        lp = self.draw_line(lp, line_end)
        lp = self.draw_curve(lp, corner, curve_end)
    self.last_point = lp

    self.updateRect(r)
    return self.last_point
| 3468 | |
def draw_quad(self, quad: quad_like) -> pymupdf.Point:
    """Draw a Quad as a closed polyline through its four corners.

    The outline runs upper-left, lower-left, lower-right, upper-right and
    back to upper-left. Returns whatever 'draw_polyline' returns.
    """
    corners = pymupdf.Quad(quad)
    outline = [corners.ul, corners.ll, corners.lr, corners.ur, corners.ul]
    return self.draw_polyline(outline)
| 3473 | |
def draw_zigzag(
    self,
    p1: point_like,
    p2: point_like,
    breadth: float = 2,
) -> pymupdf.Point:
    """Draw a zig-zagged line from p1 to p2.

    Args:
        p1: start point.
        p2: end point.
        breadth: requested zigzag step. It is revised so that a whole
            number of full phases (4 steps each) fits between the points.
    Returns:
        p2 as a Point.
    Raises:
        ValueError: if not even one full phase fits between the points.
    """
    p1 = pymupdf.Point(p1)
    p2 = pymupdf.Point(p2)
    distance = abs(p2 - p1)
    # number of steps, always a multiple of 4 (full phases only)
    steps = 4 * int(round(distance / (4 * breadth), 0))
    if steps < 4:
        raise ValueError("points too close")
    step = distance / steps  # revised breadth

    # util_hor_matrix maps the connection line onto the x-axis; its inverse
    # brings the edge points computed there back to their real position
    to_axis = pymupdf.Matrix(pymupdf.util_hor_matrix(p1, p2))
    from_axis = ~to_axis

    # only odd step numbers carry an edge: remainder 1 lies "above" the
    # connection (y = -1), remainder 3 lies "below" it (y = 1)
    edges = []
    for i in range(1, steps, 2):
        side = -1 if i % 4 == 1 else 1
        edges.append(pymupdf.Point(i, side) * step * from_axis)

    self.draw_polyline([p1] + edges + [p2])
    return p2
| 3502 | |
def draw_squiggle(
    self,
    p1: point_like,
    p2: point_like,
    breadth=2,
) -> pymupdf.Point:
    """Draw a squiggly line from p1 to p2.

    Args:
        p1: start point.
        p2: end point.
        breadth: requested wave step. It is revised so that a whole
            number of full phases (4 steps each) fits between the points.
    Returns:
        p2 as a Point.
    Raises:
        ValueError: if not even one full phase fits between the points.
    """
    p1 = pymupdf.Point(p1)
    p2 = pymupdf.Point(p2)
    distance = abs(p2 - p1)
    # number of steps, always a multiple of 4 (full phases only)
    steps = 4 * int(round(distance / (4 * breadth), 0))
    if steps < 4:
        raise ValueError("points too close")
    step = distance / steps  # revised breadth

    # util_hor_matrix maps the connection line onto the x-axis; its inverse
    # brings the helper points computed there back to their real position
    to_axis = pymupdf.Matrix(pymupdf.util_hor_matrix(p1, p2))
    from_axis = ~to_axis
    k = 2.4142135623765633  # y of draw_curve helper point

    # y offset per step remainder: 1 is "above", 3 is "below" the
    # connection; every other step lies on the connection line itself
    offsets = {1: -k, 3: k}
    waypoints = [p1]
    for i in range(1, steps):
        waypoints.append(
            pymupdf.Point(i, offsets.get(i % 4, 0)) * step * from_axis
        )
    waypoints.append(p2)

    # consecutive point triples (start, helper, end) form one curve each;
    # the end point of a curve is the start point of the next one
    for start in range(0, len(waypoints) - 2, 2):
        self.draw_curve(
            waypoints[start], waypoints[start + 1], waypoints[start + 2]
        )
    return p2
| 3539 | |
| 3540 # ============================================================================== | |
| 3541 # Shape.insert_text | |
| 3542 # ============================================================================== | |
def insert_text(
    self,
    point: point_like,
    buffer: typing.Union[str, list],
    *,
    fontsize: float = 11,
    lineheight: OptFloat = None,
    fontname: str = "helv",
    fontfile: OptStr = None,
    set_simple: bool = 0,
    encoding: int = 0,
    color: OptSeq = None,
    fill: OptSeq = None,
    render_mode: int = 0,
    border_width: float = 0.05,
    miter_limit: float = 1,
    rotate: int = 0,
    morph: OptSeq = None,
    stroke_opacity: float = 1,
    fill_opacity: float = 1,
    oc: int = 0,
) -> int:
    """Append text lines starting at a point to the shape's text content.

    Args:
        point: start position of the first character.
        buffer: a string (split into lines here) or a list / tuple of
            lines.
        fontsize: font size.
        lineheight: if given, line height is 'fontsize * lineheight';
            otherwise it is derived from the font's ascender / descender.
        fontname: font reference name, a leading '/' is removed.
        fontfile, set_simple, encoding: passed on to 'page.insert_font'.
        color: stroke color.
        fill: fill color. Replaced by 'color' if it is falsy and
            'render_mode' is 0.
        render_mode: PDF text rendering mode ('Tr' operator), only
            written if greater than 0.
        border_width: glyph border width as a fraction of 'fontsize',
            only written if 'render_mode' is greater than 0.
        miter_limit: value for the 'M' operator, only written if
            'render_mode' is greater than 0 and it is not None.
        rotate: text rotation, must be a multiple of 90.
        morph: checked with 'pymupdf.CheckMorph'; if accepted, item 0 is
            used as fixpoint and item 1 as matrix.
        stroke_opacity, fill_opacity: passed on to 'page._set_opacity'.
        oc: passed on to 'page._get_optional_content'.
    Returns:
        Number of lines written. 0 if there is nothing to write or the
        character codes of the text cannot be determined.
    Raises:
        ValueError: if 'rotate' is not a multiple of 90.
    """
    # anything worth dealing with?
    if not bool(buffer):
        return 0

    # a list / tuple is taken as is, everything else is split into lines
    text = buffer if type(buffer) in (list, tuple) else buffer.splitlines()
    if not len(text) > 0:
        return 0

    point = pymupdf.Point(point)
    try:
        maxcode = max(ord(c) for c in " ".join(text))
    except Exception:
        pymupdf.exception_info()
        return 0

    # ensure valid 'fontname'
    fname = fontname[1:] if fontname.startswith("/") else fontname

    xref = self.page.insert_font(
        fontname=fname, fontfile=fontfile, encoding=encoding, set_simple=set_simple
    )
    fontdict = pymupdf.CheckFontInfo(self.doc, xref)[1]
    ordering = fontdict["ordering"]
    simple = fontdict["simple"]
    bfname = fontdict["name"]
    ascender = fontdict["ascender"]
    descender = fontdict["descender"]

    if lineheight:
        lheight_factor = lineheight
    elif ascender - descender <= 1:
        lheight_factor = 1.2
    else:
        lheight_factor = ascender - descender
    lheight = fontsize * lheight_factor

    if maxcode > 255:
        glyphs = self.doc.get_char_widths(xref, maxcode + 1)
    else:
        glyphs = fontdict["glyphs"]

    # simple fonts other than Symbol / ZapfDingbats get no glyph table
    if simple and bfname not in ("Symbol", "ZapfDingbats"):
        tj_glyphs = None
    else:
        tj_glyphs = glyphs
    text = [pymupdf.getTJstr(t, tj_glyphs, simple, ordering) for t in text]

    color_str = pymupdf.ColorCode(color, "c")
    fill_str = pymupdf.ColorCode(fill, "f")
    if not fill and render_mode == 0:  # ensure fill color when 0 Tr
        fill = color
        fill_str = pymupdf.ColorCode(color, "f")

    morphing = pymupdf.CheckMorph(morph)
    rot = rotate
    if rot % 90 != 0:
        raise ValueError("bad rotate value")

    while rot < 0:
        rot += 360
    rot = rot % 360  # text rotate = 0, 90, 270, 180

    height = self.height
    width = self.width

    if morphing:
        m1 = pymupdf.Matrix(
            1, 0, 0, 1, morph[0].x + self.x, height - morph[0].y - self.y
        )
        mat = ~m1 * morph[1] * m1
        cm = _format_g(pymupdf.JM_TUPLE(mat)) + " cm\n"
    else:
        cm = ""

    # start position of the first character and the space available for
    # further lines, per rotation
    if rot == 90:
        left = height - point.y - self.y
        top = -point.x - self.x
        cm += "0 1 -1 0 0 0 cm\n"  # rotates 90 deg counter-clockwise
        space = width - abs(top)
    elif rot == 270:
        left = -height + point.y + self.y
        top = point.x + self.x
        cm += "0 -1 1 0 0 0 cm\n"  # rotates 90 deg clockwise
        space = abs(top)
    elif rot == 180:
        left = -point.x - self.x
        top = -height + point.y + self.y
        cm += "-1 0 0 -1 0 0 cm\n"  # rotates by 180 deg.
        space = abs(point.y + self.y)
    else:  # rotate = 0
        top = height - point.y - self.y
        left = point.x + self.x
        space = top

    optcont = self.page._get_optional_content(oc)
    if optcont is None:
        bdc = emc = ""
    else:
        bdc = "/OC /%s BDC\n" % optcont
        emc = "EMC\n"

    alpha = self.page._set_opacity(CA=stroke_opacity, ca=fill_opacity)
    alpha = "" if alpha is None else "/%s gs\n" % alpha

    # collect the output pieces and join them once at the end
    pieces = [
        f"\nq\n{bdc}{alpha}BT\n{cm}1 0 0 1 {_format_g((left, top))} Tm\n"
        f"/{fname} {_format_g(fontsize)} Tf "
    ]

    if render_mode > 0:
        pieces.append("%i Tr " % render_mode)
        pieces.append(_format_g(border_width * fontsize) + " w ")
        if miter_limit is not None:
            pieces.append(_format_g(miter_limit) + " M ")
    if color is not None:
        pieces.append(color_str)
    if fill is not None:
        pieces.append(fill_str)

    # first line; if more lines exist, the move to the next line ("TD")
    # also establishes the leading used by the subsequent "T*" operators
    pieces.append(text[0])
    nlines = 1
    if len(text) > 1:
        pieces.append(f"TJ\n0 -{_format_g(lheight)} TD\n")
    else:
        pieces.append("TJ")

    for i, line in enumerate(text[1:], start=1):
        if space < lheight:
            break  # no space left on page
        if i > 1:
            pieces.append("\nT* ")
        pieces.append(line + "TJ")
        space -= lheight
        nlines += 1

    pieces.append("\nET\n%sQ\n" % emc)

    # update the shape's text content
    self.text_cont += "".join(pieces)
    return nlines
| 3728 | |
| 3729 # ============================================================================== | |
| 3730 # Shape.insert_textbox | |
| 3731 # ============================================================================== | |
def insert_textbox(
    self,
    rect: rect_like,
    buffer: typing.Union[str, list],
    *,
    fontname: OptStr = "helv",
    fontfile: OptStr = None,
    fontsize: float = 11,
    lineheight: OptFloat = None,
    set_simple: bool = 0,
    encoding: int = 0,
    color: OptSeq = None,
    fill: OptSeq = None,
    expandtabs: int = 1,
    border_width: float = 0.05,
    miter_limit: float = 1,
    align: int = 0,
    render_mode: int = 0,
    rotate: int = 0,
    morph: OptSeq = None,
    stroke_opacity: float = 1,
    fill_opacity: float = 1,
    oc: int = 0,
) -> float:
    """Insert text into a given rectangle.

    Lines are broken at spaces so that they fit the available width;
    words longer than one line are split character by character. Nothing
    is written if the resulting text does not fit the rectangle.

    Args:
        rect -- the textbox to fill
        buffer -- text to be inserted: a string or a list / tuple of lines
        fontname -- a Base-14 font, font name or '/name'
        fontfile -- name of a font file
        fontsize -- font size
        lineheight -- overwrite the font property
        set_simple, encoding -- passed on to 'page.insert_font'
        color -- RGB stroke color triple
        fill -- RGB fill color triple, replaced by 'color' if None and
            'render_mode' is 0
        render_mode -- text rendering control
        border_width -- thickness of glyph borders as percentage of fontsize
        miter_limit -- value of the 'M' operator, written only if
            'render_mode' > 0 and it is not None
        expandtabs -- handles tabulators with string function
        align -- 0 = left, 1 = center, 2 = right, 3 = justified
        rotate -- 0, 90, 180, or 270 degrees
        morph -- morph box with a matrix and a fixpoint
        stroke_opacity, fill_opacity -- passed on to 'page._set_opacity'
        oc -- passed on to 'page._get_optional_content'
    Returns:
        The unused extent of the rectangle in direction of line progress
        (float, >= 0) if the text was written. A negative value is the
        deficit: in this case nothing was written. If there is no text
        at all, the full extent is returned.
    Raises:
        ValueError: if 'rect' is empty or infinite, or if 'rotate' is not
            a multiple of 90.
    """
    rect = pymupdf.Rect(rect)
    if rect.is_empty or rect.is_infinite:
        raise ValueError("text box must be finite and not empty")

    color_str = pymupdf.ColorCode(color, "c")
    fill_str = pymupdf.ColorCode(fill, "f")
    if fill is None and render_mode == 0:  # ensure fill color for 0 Tr
        fill = color
        fill_str = pymupdf.ColorCode(color, "f")

    optcont = self.page._get_optional_content(oc)
    if optcont is not None:
        bdc = "/OC /%s BDC\n" % optcont
        emc = "EMC\n"
    else:
        bdc = emc = ""

    # determine opacity / transparency
    alpha = self.page._set_opacity(CA=stroke_opacity, ca=fill_opacity)
    if alpha is None:
        alpha = ""
    else:
        alpha = "/%s gs\n" % alpha

    if rotate % 90 != 0:
        raise ValueError("rotate must be multiple of 90")

    # '%' with a positive divisor never yields a negative result, so this
    # maps any multiple of 90 to one of 0, 90, 180, 270
    rot = rotate % 360

    # is buffer worth of dealing with?
    if not bool(buffer):
        return rect.height if rot in (0, 180) else rect.width

    # make one string from buffer
    if type(buffer) in (list, tuple):
        t0 = "\n".join(buffer)
    else:
        t0 = buffer

    # A non-empty list of empty strings (e.g. [""]) joins to "": there is
    # no character to take the maximum code from, so treat it like an
    # empty buffer instead of failing in max() below.
    if not t0:
        return rect.height if rot in (0, 180) else rect.width

    cmp90 = "0 1 -1 0 0 0 cm\n"  # rotates counter-clockwise
    cmm90 = "0 -1 1 0 0 0 cm\n"  # rotates clockwise
    cm180 = "-1 0 0 -1 0 0 cm\n"  # rotates by 180 deg.
    height = self.height

    fname = fontname
    if fname.startswith("/"):
        fname = fname[1:]

    xref = self.page.insert_font(
        fontname=fname, fontfile=fontfile, encoding=encoding, set_simple=set_simple
    )
    fontinfo = pymupdf.CheckFontInfo(self.doc, xref)

    fontdict = fontinfo[1]
    ordering = fontdict["ordering"]
    simple = fontdict["simple"]
    bfname = fontdict["name"]
    ascender = fontdict["ascender"]
    descender = fontdict["descender"]

    if lineheight:
        lheight_factor = lineheight
    elif ascender - descender <= 1:
        lheight_factor = 1.2
    else:
        lheight_factor = ascender - descender
    lheight = fontsize * lheight_factor

    maxcode = max([ord(c) for c in t0])
    # replace invalid char codes for simple fonts
    if simple and maxcode > 255:
        t0 = "".join([c if ord(c) < 256 else "?" for c in t0])

    # split the text into its lines
    t0 = t0.splitlines()

    # glyph table covering all character codes of the text
    glyphs = self.doc.get_char_widths(xref, maxcode + 1)
    if simple and bfname not in ("Symbol", "ZapfDingbats"):
        tj_glyphs = None
    else:
        tj_glyphs = glyphs

    # ----------------------------------------------------------------------
    # calculate pixel length of a string
    # ----------------------------------------------------------------------
    def pixlen(x):
        """Calculate pixel length of x."""
        if ordering < 0:
            return sum([glyphs[ord(c)][1] for c in x]) * fontsize
        else:
            return len(x) * fontsize

    # ---------------------------------------------------------------------

    if ordering < 0:
        blen = glyphs[32][1] * fontsize  # pixel size of space character
    else:
        blen = fontsize

    text = ""  # output buffer

    if pymupdf.CheckMorph(morph):
        m1 = pymupdf.Matrix(
            1, 0, 0, 1, morph[0].x + self.x, self.height - morph[0].y - self.y
        )
        mat = ~m1 * morph[1] * m1
        cm = _format_g(pymupdf.JM_TUPLE(mat)) + " cm\n"
    else:
        cm = ""

    # ---------------------------------------------------------------------
    # adjust for text orientation / rotation
    # ---------------------------------------------------------------------
    progr = 1  # direction of line progress
    c_pnt = pymupdf.Point(0, fontsize * ascender)  # used for line progress
    if rot == 0:  # normal orientation
        point = rect.tl + c_pnt  # line 1 is 'lheight' below top
        maxwidth = rect.width  # pixels available in one line
        maxheight = rect.height  # available text height

    elif rot == 90:  # rotate counter clockwise
        c_pnt = pymupdf.Point(fontsize * ascender, 0)  # progress in x-direction
        point = rect.bl + c_pnt  # line 1 'lheight' away from left
        maxwidth = rect.height  # pixels available in one line
        maxheight = rect.width  # available text height
        cm += cmp90

    elif rot == 180:  # text upside down
        # progress upwards in y direction
        c_pnt = -pymupdf.Point(0, fontsize * ascender)
        point = rect.br + c_pnt  # line 1 'lheight' above bottom
        maxwidth = rect.width  # pixels available in one line
        progr = -1  # subtract lheight for next line
        maxheight = rect.height  # available text height
        cm += cm180

    else:  # rotate clockwise (270 or -90)
        # progress from right to left
        c_pnt = -pymupdf.Point(fontsize * ascender, 0)
        point = rect.tr + c_pnt  # line 1 'lheight' left of right
        maxwidth = rect.height  # pixels available in one line
        progr = -1  # subtract lheight for next line
        maxheight = rect.width  # available text height
        cm += cmm90

    # =====================================================================
    # line loop
    # =====================================================================
    just_tab = []  # 'justify' indicators per line

    for i, line in enumerate(t0):
        line_t = line.expandtabs(expandtabs).split(" ")  # split into words
        lbuff = ""  # init line buffer
        rest = maxwidth  # available line pixels
        # =================================================================
        # word loop
        # =================================================================
        for word in line_t:
            pl_w = pixlen(word)  # pixel len of word
            if rest >= pl_w:  # does it fit on the line?
                lbuff += word + " "  # yes, append word
                rest -= pl_w + blen  # update available line space
                continue  # next word

            # word doesn't fit - output line (if not empty)
            if lbuff:
                lbuff = lbuff.rstrip() + "\n"  # line full, append line break
                text += lbuff  # append to total text
                just_tab.append(True)  # can align-justify

            lbuff = ""  # re-init line buffer
            rest = maxwidth  # re-init avail. space

            if pl_w <= maxwidth:  # word shorter than 1 line?
                lbuff = word + " "  # start the line with it
                rest = maxwidth - pl_w - blen  # update free space
                continue

            # long word: split across multiple lines - char by char ...
            if len(just_tab) > 0:
                just_tab[-1] = False  # cannot align-justify
            for c in word:
                if pixlen(lbuff) <= maxwidth - pixlen(c):
                    lbuff += c
                else:  # line full
                    lbuff += "\n"  # close line
                    text += lbuff  # append to text
                    just_tab.append(False)  # cannot align-justify
                    lbuff = c  # start new line with this char

            lbuff += " "  # finish long word
            rest = maxwidth - pixlen(lbuff)  # long word stored

        if lbuff:  # unprocessed line content?
            text += lbuff.rstrip()  # append to text
            just_tab.append(False)  # cannot align-justify

        if i < len(t0) - 1:  # not the last line?
            text += "\n"  # insert line break

    # compute used part of the textbox
    if text.endswith("\n"):
        text = text[:-1]
    lb_count = text.count("\n") + 1  # number of lines written

    # text height = line count * line height plus one descender value
    text_height = lheight * lb_count - descender * fontsize

    more = text_height - maxheight  # difference to height limit
    if more > pymupdf.EPSILON:  # landed too much outside rect
        return (-1) * more  # return deficit, don't output

    more = abs(more)
    if more < pymupdf.EPSILON:
        more = 0  # don't bother with epsilons
    nres = "\nq\n%s%sBT\n" % (bdc, alpha) + cm  # initialize output buffer
    templ = lambda a, b, c, d: f"1 0 0 1 {_format_g((a, b))} Tm /{c} {_format_g(d)} Tf "
    # center, right, justify: output each line with its own specifics
    text_t = text.splitlines()  # split text in lines again
    just_tab[-1] = False  # never justify last line
    for i, t in enumerate(text_t):
        spacing = 0
        pl = maxwidth - pixlen(t)  # length of empty line part
        pnt = point + c_pnt * (i * lheight_factor)  # text start of line
        if align == 1:  # center: right shift by half width
            if rot in (0, 180):
                pnt = pnt + pymupdf.Point(pl / 2, 0) * progr
            else:
                pnt = pnt - pymupdf.Point(0, pl / 2) * progr
        elif align == 2:  # right: right shift by full width
            if rot in (0, 180):
                pnt = pnt + pymupdf.Point(pl, 0) * progr
            else:
                pnt = pnt - pymupdf.Point(0, pl) * progr
        elif align == 3:  # justify
            spaces = t.count(" ")  # number of spaces in line
            if spaces > 0 and just_tab[i]:  # if any, and we may justify
                spacing = pl / spaces  # make every space this much larger
            else:
                spacing = 0  # keep normal space length
        top = height - pnt.y - self.y
        left = pnt.x + self.x
        if rot == 90:
            left = height - pnt.y - self.y
            top = -pnt.x - self.x
        elif rot == 270:
            left = -height + pnt.y + self.y
            top = pnt.x + self.x
        elif rot == 180:
            left = -pnt.x - self.x
            top = -height + pnt.y + self.y

        nres += templ(left, top, fname, fontsize)

        if render_mode > 0:
            nres += "%i Tr " % render_mode
            nres += _format_g(border_width * fontsize) + " w "
            if miter_limit is not None:
                nres += _format_g(miter_limit) + " M "

        if align == 3:
            nres += _format_g(spacing) + " Tw "

        if color is not None:
            nres += color_str
        if fill is not None:
            nres += fill_str
        nres += "%sTJ\n" % pymupdf.getTJstr(t, tj_glyphs, simple, ordering)

    nres += "ET\n%sQ\n" % emc

    self.text_cont += nres
    self.updateRect(rect)
    return more
| 4055 | |
def finish(
    self,
    width: float = 1,
    color: OptSeq = (0,),
    fill: OptSeq = None,
    lineCap: int = 0,
    lineJoin: int = 0,
    dashes: OptStr = None,
    even_odd: bool = False,
    morph: OptSeq = None,
    closePath: bool = True,
    fill_opacity: float = 1,
    stroke_opacity: float = 1,
    oc: int = 0,
) -> None:
    """Finish the current drawing segment.

    Wraps the draw commands collected so far with the requested graphics
    properties and moves them to the shape's total content. Does nothing
    if there are no pending draw commands.

    Args:
        width: line width. 0 also switches off the stroke color.
        color: stroke color. None also sets the width to 0.
        fill: fill color, None for no fill.
        lineCap: value for the 'J' operator, written if not 0.
        lineJoin: value for the 'j' operator, written if not 0.
        dashes: operand string for the 'd' operator; None, "" and
            "[] 0" write nothing.
        even_odd: if true, use the 'B*' / 'f*' paint operators instead
            of 'B' / 'f' when filling.
        morph: checked with 'pymupdf.CheckMorph'; if accepted, item 0 is
            used as fixpoint and item 1 as matrix.
        closePath: whether to close the path by connecting last to
            first point ('h' operator).
        fill_opacity, stroke_opacity: passed on to 'page._set_opacity'.
        oc: passed on to 'page._get_optional_content'.
    """
    if self.draw_cont == "":  # nothing drawn: no-op
        return

    # stroke color and line width only make sense together
    if width == 0:
        color = None
    elif color is None:
        width = 0
    color_str = pymupdf.ColorCode(color, "c")  # ensure proper color string
    fill_str = pymupdf.ColorCode(fill, "f")  # ensure proper fill string

    optcont = self.page._get_optional_content(oc)
    if optcont is None:
        emc = ""
    else:
        self.draw_cont = "/OC /%s BDC\n" % optcont + self.draw_cont
        emc = "EMC\n"

    alpha = self.page._set_opacity(CA=stroke_opacity, ca=fill_opacity)
    if alpha is not None:
        self.draw_cont = "/%s gs\n" % alpha + self.draw_cont

    if width != 0 and width != 1:
        self.draw_cont += _format_g(width) + " w\n"

    if lineCap != 0:
        self.draw_cont = "%i J\n" % lineCap + self.draw_cont
    if lineJoin != 0:
        self.draw_cont = "%i j\n" % lineJoin + self.draw_cont

    if dashes not in (None, "", "[] 0"):
        self.draw_cont = "%s d\n" % dashes + self.draw_cont

    if closePath:
        self.draw_cont += "h\n"
        self.last_point = None

    if color is not None:
        self.draw_cont += color_str

    # choose the paint operator: stroke only ("S"), fill and stroke ("B")
    # or fill only ("f"); the filling ones get a "*" for even-odd
    if fill is None:
        paint_op = "S"
    else:
        self.draw_cont += fill_str
        paint_op = "B" if color is not None else "f"
        if even_odd:
            paint_op += "*"
    self.draw_cont += paint_op + "\n"

    self.draw_cont += emc
    if pymupdf.CheckMorph(morph):
        m1 = pymupdf.Matrix(
            1, 0, 0, 1, morph[0].x + self.x, self.height - morph[0].y - self.y
        )
        mat = ~m1 * morph[1] * m1
        self.draw_cont = _format_g(pymupdf.JM_TUPLE(mat)) + " cm\n" + self.draw_cont

    # isolate the segment in its own graphics state and reset for reuse
    self.totalcont += "\nq\n" + self.draw_cont + "Q\n"
    self.draw_cont = ""
    self.last_point = None
    return
| 4146 | |
def commit(self, overlay: bool = True) -> None:
    """Update the page's /Contents object with Shape data.

    The argument controls whether data appear in foreground (default)
    or background. Afterwards the shape is reset and can be re-used.

    Args:
        overlay: passed on to the contents insertion; if true, the
            page's existing contents are wrapped first.
    """
    pymupdf.CheckParent(self.page)  # doc may have died meanwhile

    # Encode into a local instead of rebinding 'self.totalcont' to bytes:
    # if one of the calls below raises, the shape's buffers are still
    # strings and unchanged, so the shape remains usable.
    stream = (self.totalcont + self.text_cont).encode()

    if stream:
        if overlay:
            self.page.wrap_contents()  # ensure a balanced graphics state
        # make /Contents object with dummy stream
        xref = pymupdf.TOOLS._insert_contents(self.page, b" ", overlay)
        # update it with potential compression
        self.doc.update_stream(xref, stream)

    # clean up for potential re-use
    self.last_point = None
    self.rect = None
    self.draw_cont = ""
    self.text_cont = ""
    self.totalcont = ""
| 4170 | |
| 4171 | |
def apply_redactions(
    page: pymupdf.Page, images: int = 2, graphics: int = 1, text: int = 0
) -> bool:
    """Apply the redaction annotations of the page.

    Args:
        page: the PDF page.
        images:
            0 - ignore images
            1 - remove all overlapping images
            2 - blank out overlapping image parts
            3 - remove image unless invisible
        graphics:
            0 - ignore graphics
            1 - remove graphics if contained in rectangle
            2 - remove all overlapping graphics
        text:
            0 - remove text
            1 - ignore text
    Returns:
        False if the page has no redaction annotations, else True.
    Raises:
        ValueError: if the document is closed or encrypted, is no PDF,
            or if applying the redactions reports a failure.
    """

    def center_rect(annot_rect, new_text, font, fsize):
        """Calculate minimal sub-rectangle for the overlay text.

        Notes:
            Because 'insert_textbox' supports no vertical text centering,
            we calculate an approximate number of lines here and return a
            sub-rect with smaller height, which should still be sufficient.
        Args:
            annot_rect: the annotation rectangle. It is never modified.
            new_text: the text to insert.
            font: the fontname. Must be one of the CJK or Base-14 set, else
                the rectangle is returned unchanged.
            fsize: the fontsize
        Returns:
            A rectangle to use instead of the annot rectangle.
        """
        if not new_text or annot_rect.width <= pymupdf.EPSILON:
            return annot_rect
        try:
            text_width = pymupdf.get_text_length(new_text, font, fsize)
        except (ValueError, mupdf.FzErrorBase):  # unsupported font
            if g_exceptions_verbose:
                pymupdf.exception_info()
            return annot_rect
        line_height = fsize * 1.2
        limit = annot_rect.width
        h = math.ceil(text_width / limit) * line_height  # estimate rect height
        if h >= annot_rect.height:
            return annot_rect
        # work on a copy: the caller's rectangle belongs to the stored
        # redaction values and must not be changed as a side effect
        r = pymupdf.Rect(annot_rect)
        r.y0 = (annot_rect.tl.y + annot_rect.bl.y - h) * 0.5
        return r

    pymupdf.CheckParent(page)
    doc = page.parent
    if doc.is_encrypted or doc.is_closed:
        raise ValueError("document closed or encrypted")
    if not doc.is_pdf:
        raise ValueError("is no PDF")

    # Save the values of all redaction annotations first: they are needed
    # for writing fill color and replacement text after MuPDF is done.
    redact_annots = [
        annot._get_redact_values()
        for annot in page.annots(
            types=(pymupdf.PDF_ANNOT_REDACT,)  # pylint: disable=no-member
        )
    ]

    if not redact_annots:  # any redactions on this page?
        return False  # no redactions

    rc = page._apply_redactions(text, images, graphics)  # call MuPDF
    if not rc:  # should not happen really
        raise ValueError("Error applying redactions.")

    # now write replacement text in old redact rectangles
    shape = page.new_shape()
    for redact in redact_annots:
        annot_rect = redact["rect"]
        fill = redact["fill"]
        if fill:
            shape.draw_rect(annot_rect)  # colorize the rect background
            shape.finish(fill=fill, color=fill)
        if "text" in redact:  # if we also have text
            new_text = redact["text"]
            align = redact.get("align", 0)
            fname = redact["fontname"]
            fsize = redact["fontsize"]
            color = redact["text_color"]
            # try finding vertical centered sub-rect
            trect = center_rect(annot_rect, new_text, fname, fsize)

            # a negative return value of insert_textbox means the text did
            # not fit: retry with smaller fonts down to a size of 4
            rc = -1
            while rc < 0 and fsize >= 4:  # while not enough room
                # (re-) try insertion
                rc = shape.insert_textbox(
                    trect,
                    new_text,
                    fontname=fname,
                    fontsize=fsize,
                    color=color,
                    align=align,
                )
                fsize -= 0.5  # reduce font if unsuccessful
    shape.commit()  # append new contents object
    return True
| 4279 | |
| 4280 | |
| 4281 # ------------------------------------------------------------------------------ | |
| 4282 # Remove potentially sensitive data from a PDF. Similar to the Adobe | |
| 4283 # Acrobat 'sanitize' function | |
| 4284 # ------------------------------------------------------------------------------ | |
def scrub(
    doc: pymupdf.Document,
    attached_files: bool = True,
    clean_pages: bool = True,
    embedded_files: bool = True,
    hidden_text: bool = True,
    javascript: bool = True,
    metadata: bool = True,
    redactions: bool = True,
    redact_images: int = 0,
    remove_links: bool = True,
    reset_fields: bool = True,
    reset_responses: bool = True,
    thumbnails: bool = True,
    xml_metadata: bool = True,
) -> None:
    """Remove potentially sensitive data from a PDF (in place).

    Similar to the Adobe Acrobat 'sanitize' function.

    Args:
        doc: the PDF document to scrub.
        attached_files: overwrite the content of file attachment annotations
            with a single space.
        clean_pages: clean each page's /Contents via page.clean_contents().
            If False, 'hidden_text' and 'redactions' are switched off, too.
        embedded_files: delete all embedded files of the document.
        hidden_text: remove text following a '3 Tr' (text suppression)
            operator inside text objects.
        javascript: replace each /JavaScript action object by an empty script.
        metadata: reset the standard metadata via doc.set_metadata({}).
        redactions: apply the redaction annotations found on a page.
        redact_images: passed on as 'images' to page.apply_redactions().
        remove_links: delete all links of each page.
        reset_fields: reset all form fields (widgets) of each page.
        reset_responses: call delete_responses() on every annotation.
        thumbnails: set the /Thumb key of each page to null.
        xml_metadata: delete XML metadata objects and /Metadata keys.

    Raises:
        ValueError: if the document is no PDF, is closed or encrypted, or if
            an xref without an object definition is met while scanning.
    """

    def remove_hidden(cont_lines):
        """Remove hidden text from a PDF page.

        Args:
            cont_lines: list of lines with /Contents content. Should have status
                from after page.cleanContents().

        Returns:
            List of /Contents lines from which hidden text has been removed,
            or None if no '3 Tr' operator was encountered.

        Notes:
            The input must have been created after the page's /Contents object(s)
            have been cleaned with page.cleanContents(). This ensures a standard
            formatting: one command per line, single spaces between operators.
            This allows for drastic simplification of this code.
        """
        out_lines = []  # will return this
        in_text = False  # indicate if within BT/ET object
        suppress = False  # indicate text suppression active
        make_return = False  # becomes True once a '3 Tr' was dropped
        for line in cont_lines:
            if line == b"BT":  # start of text object
                in_text = True
                out_lines.append(line)
                continue
            if line == b"ET":  # end of text object
                in_text = False
                out_lines.append(line)
                continue
            if line == b"3 Tr":  # text suppression operator
                suppress = True
                make_return = True
                continue
            # Any other 'Tr' line changes the text rendering mode. The exact
            # line b"3 Tr" was handled above, so no further test of the
            # operand is needed (the former 'line[0] != b"3"' compared an int
            # with bytes and therefore was always true).
            if line.endswith(b"Tr"):
                suppress = False
                out_lines.append(line)
                continue
            if line == b"Q":  # unstack command also switches off
                suppress = False
                out_lines.append(line)
                continue
            if suppress and in_text:  # suppress hidden lines
                continue
            out_lines.append(line)
        return out_lines if make_return else None

    if not doc.is_pdf:  # only works for PDF
        raise ValueError("is no PDF")
    if doc.is_encrypted or doc.is_closed:
        raise ValueError("closed or encrypted doc")

    if not clean_pages:  # these two options depend on cleaned pages
        hidden_text = False
        redactions = False

    if metadata:
        doc.set_metadata({})  # remove standard metadata

    for page in doc:
        if reset_fields:
            # reset form fields (widgets)
            for widget in page.widgets():
                widget.reset()

        if remove_links:
            for link in page.get_links():  # remove all links
                page.delete_link(link)

        found_redacts = False
        for annot in page.annots():
            if annot.type[0] == mupdf.PDF_ANNOT_FILE_ATTACHMENT and attached_files:
                annot.update_file(buffer_=b" ")  # set file content to empty
            if reset_responses:
                annot.delete_responses()
            if annot.type[0] == pymupdf.PDF_ANNOT_REDACT:  # pylint: disable=no-member
                found_redacts = True

        if redactions and found_redacts:
            page.apply_redactions(images=redact_images)

        if clean_pages or hidden_text:
            page.clean_contents()
            contents = page.get_contents()
            if hidden_text and contents:
                xref = contents[0]  # only one b/o cleaning!
                cont = doc.xref_stream(xref)
                cont_lines = remove_hidden(cont.splitlines())  # remove hidden text
                if cont_lines:  # something was actually removed
                    cont = b"\n".join(cont_lines)
                    doc.update_stream(xref, cont)  # rewrite the page /Contents

        # Thumbnail removal must not depend on page cleaning or on the page
        # having any /Contents: formerly it was skipped in both situations.
        if thumbnails:
            if doc.xref_get_key(page.xref, "Thumb")[0] != "null":
                doc.xref_set_key(page.xref, "Thumb", "null")

    # pages are scrubbed, now perform document-wide scrubbing
    # remove embedded files
    if embedded_files:
        for name in doc.embfile_names():
            doc.embfile_del(name)

    if xml_metadata:
        doc.del_xml_metadata()

    # the xref table is scanned only if one of these options needs it
    if not (xml_metadata or javascript):
        xref_limit = 0
    else:
        xref_limit = doc.xref_length()
    for xref in range(1, xref_limit):
        if not doc.xref_object(xref):
            msg = "bad xref %i - clean PDF before scrubbing" % xref
            raise ValueError(msg)
        if javascript and doc.xref_get_key(xref, "S")[1] == "/JavaScript":
            # a /JavaScript action object
            obj = "<</S/JavaScript/JS()>>"  # replace with a null JavaScript
            doc.update_object(xref, obj)  # update this object
            continue  # no further handling

        if not xml_metadata:
            continue

        if doc.xref_get_key(xref, "Type")[1] == "/Metadata":
            # delete any metadata object directly
            doc.update_object(xref, "<<>>")
            doc.update_stream(xref, b"deleted", new=True)
            continue

        if doc.xref_get_key(xref, "Metadata")[0] != "null":
            doc.xref_set_key(xref, "Metadata", "null")
| 4436 | |
| 4437 | |
def _show_fz_text(text):
    """Return a short summary string of a text object's span list.

    Walks the linked list starting at 'text.m_internal.head', following each
    span's 'next' member, and counts the spans and the sum of their 'len'
    values.

    Args:
        text: object exposing 'm_internal.head' (presumably a mupdf.Text
            wrapper - not checked here).
    Returns:
        A string like 'num_spans=2 num_chars=17'.
    """
    span_count = 0
    char_count = 0
    node = text.m_internal.head
    while node:  # a falsy link ends the list
        span_count += 1
        char_count += node.len
        node = node.next
    return f'num_spans={span_count} num_chars={char_count}'
| 4453 | |
def fill_textbox(
    writer: pymupdf.TextWriter,
    rect: rect_like,
    text: typing.Union[str, list],
    pos: point_like = None,
    font: typing.Optional[pymupdf.Font] = None,
    fontsize: float = 11,
    lineheight: OptFloat = None,
    align: int = 0,
    warn: typing.Optional[bool] = None,
    right_to_left: bool = False,
    small_caps: bool = False,
) -> list:
    """Fill a rectangle with text.

    The text is broken into lines that fit the rectangle width (words longer
    than the available width are cut in pieces). As many lines as fit between
    the start position and the rectangle bottom are appended to the writer.

    Args:
        writer: pymupdf.TextWriter object (= "self")
        rect: rect-like to receive the text.
        text: string or list/tuple of strings.
        pos: point-like start position of first word. Default is just below
            the top-left corner of rect.
        font: pymupdf.Font object (default pymupdf.Font('helv')).
        fontsize: the fontsize.
        lineheight: overwrite the font property (factor applied to fontsize).
        align: (int) 0 = left, 1 = center, 2 = right, 3 = justify
        warn: (bool) text overflow action: None = ignore, True = print a
            warning message, False = raise an exception.
        right_to_left: (bool) indicate right-to-left language.
        small_caps: (bool) passed on to the font's length computations and to
            the writer's append method.

    Returns:
        List of (text, length) tuples of the lines that were not written.

    Raises:
        ValueError: if rect is empty, if the start position is not inside
            rect, or if not all lines fit and 'warn' is False.
    """
    rect = pymupdf.Rect(rect)
    if rect.is_empty:
        raise ValueError("fill rect must not empty.")
    if type(font) is not pymupdf.Font:  # anything else: fall back to Helvetica
        font = pymupdf.Font("helv")

    def textlen(x):
        """Return length of a string for the current font and fontsize."""
        return font.text_length(
            x, fontsize=fontsize, small_caps=small_caps
        )  # abbreviation

    def char_lengths(x):
        """Return list of single character lengths for a string."""
        return font.char_lengths(x, fontsize=fontsize, small_caps=small_caps)

    def append_this(pos, text):
        """Append text at pos to the writer and return the writer's result."""
        ret = writer.append(
            pos, text, font=font, fontsize=fontsize, small_caps=small_caps
        )
        return ret

    tolerance = fontsize * 0.2  # extra distance to left border
    space_len = textlen(" ")
    std_width = rect.width - tolerance  # line width for all but the first line
    std_start = rect.x0 + tolerance  # x-coordinate where lines normally start

    def norm_words(width, words):
        """Cut any word in pieces no longer than 'width'.

        Returns:
            Tuple (list of words / word pieces, list of their lengths).
        """
        nwords = []
        word_lengths = []
        for w in words:
            wl_lst = char_lengths(w)
            wl = sum(wl_lst)
            if wl <= width:  # nothing to do - copy over
                nwords.append(w)
                word_lengths.append(wl)
                continue

            # word longer than rect width - split it in parts
            n = len(wl_lst)
            while n > 0:
                wl = sum(wl_lst[:n])
                if wl <= width:  # first n characters fit: emit them ...
                    nwords.append(w[:n])
                    word_lengths.append(wl)
                    w = w[n:]  # ... and continue with the remainder
                    wl_lst = wl_lst[n:]
                    n = len(wl_lst)
                else:
                    n -= 1  # try with one character less
        return nwords, word_lengths

    def output_justify(start, line):
        """Justified output of a line.

        Note: modifies 'start.x' while writing the words.
        """
        # ignore leading / trailing / multiple spaces
        words = [w for w in line.split(" ") if w != ""]
        nwords = len(words)
        if nwords == 0:
            return
        if nwords == 1:  # single word cannot be justified
            append_this(start, words[0])
            return
        tl = sum([textlen(w) for w in words])  # total word lengths
        gaps = nwords - 1  # number of word gaps
        gapl = (std_width - tl) / gaps  # width of each gap
        for w in words:
            _, lp = append_this(start, w)  # output one word
            start.x = lp.x + gapl  # next start at word end plus gap
        return

    asc = font.ascender
    dsc = font.descender
    if not lineheight:  # derive the line height factor from the font
        if asc - dsc <= 1:
            lheight = 1.2
        else:
            lheight = asc - dsc
    else:
        lheight = lineheight

    LINEHEIGHT = fontsize * lheight  # effective line height
    width = std_width  # available horizontal space

    # starting point of text
    if pos is not None:
        pos = pymupdf.Point(pos)
    else:  # default is just below rect top-left
        pos = rect.tl + (tolerance, fontsize * asc)
    if pos not in rect:
        raise ValueError("Text must start in rectangle.")

    # calculate displacement factor for alignment
    if align == pymupdf.TEXT_ALIGN_CENTER:
        factor = 0.5
    elif align == pymupdf.TEXT_ALIGN_RIGHT:
        factor = 1.0
    else:
        factor = 0

    # split in lines if just a string was given
    if type(text) is str:
        textlines = text.splitlines()
    else:
        textlines = []
        for line in text:
            textlines.extend(line.splitlines())

    # number of lines fitting between start position and rect bottom
    max_lines = int((rect.y1 - pos.y) / LINEHEIGHT) + 1

    new_lines = []  # the final list of textbox lines
    no_justify = []  # no justify for these line numbers
    for i, line in enumerate(textlines):
        if line in ("", " "):  # empty line: keep it, never justify it
            new_lines.append((line, space_len))
            width = rect.width - tolerance
            no_justify.append((len(new_lines) - 1))
            continue
        if i == 0:  # first line starts at pos, so it may be shorter
            width = rect.x1 - pos.x
        else:
            width = rect.width - tolerance

        if right_to_left:  # reverses Arabic / Hebrew text front to back
            line = writer.clean_rtl(line)
        tl = textlen(line)
        if tl <= width:  # line short enough
            new_lines.append((line, tl))
            no_justify.append((len(new_lines) - 1))
            continue

        # we need to split the line in fitting parts
        words = line.split(" ")  # the words in the line

        # cut in parts any words that are longer than rect width
        words, word_lengths = norm_words(width, words)

        # repeatedly take the longest run of leading words that fits
        n = len(words)
        while True:
            line0 = " ".join(words[:n])
            wl = sum(word_lengths[:n]) + space_len * (n - 1)
            if wl <= width:  # first n words fit: make them a line
                new_lines.append((line0, wl))
                words = words[n:]
                word_lengths = word_lengths[n:]
                n = len(words)
                line0 = None
            else:
                n -= 1  # try with one word less

            if len(words) == 0:
                break
            assert n

    # -------------------------------------------------------------------------
    # List of lines created. Each item is (text, tl), where 'tl' is the PDF
    # output length (float) and 'text' is the text. Except for justified text,
    # this is output-ready.
    # -------------------------------------------------------------------------
    nlines = len(new_lines)
    if nlines > max_lines:  # overflow: react as requested by 'warn'
        msg = "Only fitting %i of %i lines." % (max_lines, nlines)
        if warn is None:
            pass
        elif warn:
            pymupdf.message("Warning: " + msg)
        else:
            raise ValueError(msg)

    start = pymupdf.Point()
    no_justify += [len(new_lines) - 1]  # no justifying of last line
    for i in range(max_lines):
        try:
            line, tl = new_lines.pop(0)
        except IndexError:  # fewer lines than would fit: done
            if g_exceptions_verbose >= 2: pymupdf.exception_info()
            break

        if right_to_left:  # Arabic, Hebrew
            line = "".join(reversed(line))

        if i == 0:  # may have different start for first line
            start = pos  # note: same object, changes to 'start' affect 'pos'

        if align == pymupdf.TEXT_ALIGN_JUSTIFY and i not in no_justify and tl < std_width:
            output_justify(start, line)
            start.x = std_start
            start.y += LINEHEIGHT
            continue

        # 'width' still holds the value set for the last processed input line
        if i > 0 or pos.x == std_start:  # left, center, right alignments
            start.x += (width - tl) * factor

        append_this(start, line)
        start.x = std_start
        start.y += LINEHEIGHT

    return new_lines  # return non-written lines
| 4679 | |
| 4680 | |
| 4681 # ------------------------------------------------------------------------ | |
| 4682 # Optional Content functions | |
| 4683 # ------------------------------------------------------------------------ | |
def get_oc(doc: pymupdf.Document, xref: int) -> int:
    """Return the xref of the optional content object of an image or form xobject.

    Args:
        doc: the PDF document.
        xref: (int) xref number of an image or form xobject.
    Returns:
        The xref stored under the object's /OC key, or 0 if that key is not
        an indirect reference.
    Raises:
        ValueError: if the document is closed or encrypted, or if the object
            at xref is neither an image nor a form xobject.
    """
    if doc.is_closed or doc.is_encrypted:
        raise ValueError("document close or encrypted")
    kind, subtype = doc.xref_get_key(xref, "Subtype")
    if not (kind == "name" and subtype in ("/Image", "/Form")):
        raise ValueError("bad object type at xref %i" % xref)
    kind, value = doc.xref_get_key(xref, "OC")
    # the value looks like "nnn 0 R": strip the reference suffix
    return int(value.replace("0 R", "")) if kind == "xref" else 0
| 4700 | |
| 4701 | |
def set_oc(doc: pymupdf.Document, xref: int, oc: int) -> None:
    """Attach optional content object to image or form xobject.

    Args:
        doc: the PDF document.
        xref: (int) xref number of an image or form xobject
        oc: (int) xref number of an OCG or OCMD. Use 0 to remove an existing
            /OC entry.
    Raises:
        ValueError: if the document is closed or encrypted, if the object at
            xref is neither an image nor a form xobject, or if the object at
            'oc' is neither an OCG nor an OCMD.
    """
    if doc.is_closed or doc.is_encrypted:
        raise ValueError("document close or encrypted")
    t, name = doc.xref_get_key(xref, "Subtype")
    if t != "name" or name not in ("/Image", "/Form"):
        raise ValueError("bad object type at xref %i" % xref)
    if oc > 0:
        t, name = doc.xref_get_key(oc, "Type")
        if t != "name" or name not in ("/OCG", "/OCMD"):
            raise ValueError("bad object type at xref %i" % oc)
    if oc == 0:
        # Removal request. Only touch the object if it has an /OC key. If it
        # has none there is nothing to do (formerly the invalid reference
        # "0 0 R" was written in this case).
        if "OC" in doc.xref_get_keys(xref):
            doc.xref_set_key(xref, "OC", "null")
        return None
    doc.xref_set_key(xref, "OC", "%i 0 R" % oc)
    return None
| 4723 | |
| 4724 | |
def set_ocmd(
    doc: pymupdf.Document,
    xref: int = 0,
    ocgs: typing.Union[list, None] = None,
    policy: OptStr = None,
    ve: typing.Union[list, None] = None,
) -> int:
    """Create or update an OCMD object in a PDF document.

    Args:
        doc: the PDF document.
        xref: (int) 0 for creating a new object, otherwise update existing one.
        ocgs: (list) OCG xref numbers, which shall be subject to 'policy'.
        policy: one of 'AllOn', 'AllOff', 'AnyOn', 'AnyOff' (any casing).
        ve: (list) visibility expression. Use instead of 'ocgs' with 'policy'.

    Returns:
        Xref of the created or updated OCMD.

    Raises:
        ValueError: for unknown OCG xrefs, a bad policy, a malformed
            visibility expression, or if 'xref' is no OCMD.
    """

    all_ocgs = set(doc.get_ocgs().keys())

    def ve_maker(ve):
        """Convert a (nested) visibility expression to its PDF array string."""
        # Messages are formatted with a 1-tuple: with a tuple 've', a plain
        # '"%s" % ve' would treat its items as separate format arguments and
        # fail with TypeError instead of reporting the bad value.
        if type(ve) not in (list, tuple) or len(ve) < 2:
            raise ValueError("bad 've' format: %s" % (ve,))
        operator = ve[0].lower()
        if operator not in ("and", "or", "not"):
            raise ValueError("bad operand: %s" % (ve[0],))
        if operator == "not" and len(ve) != 2:
            raise ValueError("bad 've' format: %s" % (ve,))
        item = "[/%s" % operator.title()
        for x in ve[1:]:
            if type(x) is int:  # an OCG xref
                if x not in all_ocgs:
                    raise ValueError("bad OCG %i" % x)
                item += " %i 0 R" % x
            else:  # a nested expression
                item += " %s" % ve_maker(x)
        item += "]"
        return item

    text = "<</Type/OCMD"

    if ocgs and type(ocgs) in (list, tuple):  # some OCGs are provided
        s = set(ocgs).difference(all_ocgs)  # contains illegal xrefs
        if s != set():
            msg = "bad OCGs: %s" % s
            raise ValueError(msg)
        text += "/OCGs[" + " ".join("%i 0 R" % x for x in ocgs) + "]"

    if policy:
        policy = str(policy).lower()
        pols = {  # lower-case input -> PDF name
            "anyon": "AnyOn",
            "allon": "AllOn",
            "anyoff": "AnyOff",
            "alloff": "AllOff",
        }
        if policy not in pols:
            raise ValueError("bad policy: %s" % policy)
        text += "/P/%s" % pols[policy]

    if ve:
        text += "/VE%s" % ve_maker(ve)

    text += ">>"

    # make new object or replace old OCMD (check type first)
    if xref == 0:
        xref = doc.get_new_xref()
    elif "/Type/OCMD" not in doc.xref_object(xref, compressed=True):
        raise ValueError("bad xref or not an OCMD")
    doc.update_object(xref, text)
    return xref
| 4797 | |
| 4798 | |
def get_ocmd(doc: pymupdf.Document, xref: int) -> dict:
    """Return the definition of an OCMD (optional content membership dictionary).

    Recognizes PDF dict keys /OCGs (PDF array of OCGs), /P (policy string) and
    /VE (visibility expression, PDF array). Via string manipulation, this
    info is converted to a Python dictionary with keys "xref", "ocgs", "policy"
    and "ve" - ready to recycle as input for 'set_ocmd()'.

    Args:
        doc: the PDF document.
        xref: (int) xref number of the OCMD.

    Returns:
        A dict {"xref": int, "ocgs": list or None, "policy": str or None,
        "ve": list or None}.

    Raises:
        ValueError: if xref is out of range, the object is no OCMD, or the
            /P or /VE entries have an irregular syntax.
    """

    if xref not in range(doc.xref_length()):
        raise ValueError("bad xref")
    text = doc.xref_object(xref, compressed=True)
    if "/Type/OCMD" not in text:
        raise ValueError("bad object type")
    textlen = len(text)

    p0 = text.find("/OCGs[")  # look for /OCGs key
    p1 = text.find("]", p0)
    if p0 < 0 or p1 < 0:  # no OCGs found
        ocgs = None
    else:
        ocgs = text[p0 + 6 : p1].replace("0 R", " ").split()
        ocgs = list(map(int, ocgs))

    p0 = text.find("/P/")  # look for /P policy key
    if p0 < 0:
        policy = None
    else:
        # The policy is the PDF name following "/P/". Read its letters
        # directly: searching for the substrings "ff" / "on" misses the
        # capitalized names 'AnyOn' and 'AllOn'.
        p0 += 3
        p1 = p0
        while p1 < textlen and text[p1].isalpha():
            p1 += 1
        if p1 == p0:  # some irregular syntax
            raise ValueError("bad object at xref")
        policy = text[p0:p1]

    p0 = text.find("/VE[")  # look for /VE visibility expression key
    if p0 < 0:  # no visibility expression found
        ve = None
    else:
        lp = rp = 0  # find end of /VE by finding last ']'.
        p1 = p0
        while lp < 1 or lp != rp:  # until brackets are balanced
            p1 += 1
            if not p1 < textlen:  # some irregular syntax
                raise ValueError("bad object at xref")
            if text[p1] == "[":
                lp += 1
            if text[p1] == "]":
                rp += 1
        # p1 now positioned at the last "]"
        ve = text[p0 + 3 : p1 + 1]  # the PDF /VE array
        # turn the PDF array into JSON: operators become strings, indirect
        # references become plain integers, adjacent arrays get a comma
        ve = (
            ve.replace("/And", '"and",')
            .replace("/Not", '"not",')
            .replace("/Or", '"or",')
        )
        ve = ve.replace(" 0 R]", "]").replace(" 0 R", ",").replace("][", "],[")
        import json
        try:
            ve = json.loads(ve)
        except Exception:
            pymupdf.exception_info()
            pymupdf.message(f"bad /VE key: {ve!r}")
            raise
    return {"xref": xref, "ocgs": ocgs, "policy": policy, "ve": ve}
| 4865 | |
| 4866 | |
| 4867 """ | |
| 4868 Handle page labels for PDF documents. | |
| 4869 | |
| 4870 Reading | |
| 4871 ------- | |
| 4872 * compute the label of a page | |
| 4873 * find page number(s) having the given label. | |
| 4874 | |
| 4875 Writing | |
| 4876 ------- | |
| 4877 Supports setting (defining) page labels for PDF documents. | |
| 4878 | |
| 4879 A big Thank You goes to WILLIAM CHAPMAN who contributed the idea and | |
| 4880 significant parts of the following code during late December 2020 | |
| 4881 through early January 2021. | |
| 4882 """ | |
| 4883 | |
| 4884 | |
def rule_dict(item):
    """Make a Python dict from a PDF page label rule.

    Args:
        item -- a tuple (pno, rule) with the start page number and the rule
                string like <</S/D...>>.
    Returns:
        A dict like
        {'startpage': int, 'prefix': str, 'style': str, 'firstpagenum': int}.
        Key 'style' is only present if the rule contains an /S entry.
    """
    # Jorj McKie, 2021-01-06

    pno, rule = item
    tokens = rule[2:-2].split("/")[1:]  # strip "<<" and ">>"
    d = {"startpage": pno, "prefix": "", "firstpagenum": 1}
    skip = False
    for i, token in enumerate(tokens):
        if skip:  # this token has already been processed
            skip = False  # deactivate skipping again
            continue
        if token == "S":  # style specification
            d["style"] = tokens[i + 1]  # next token has the style
            skip = True  # do not process next token again
            continue
        if token == "Type":  # optional /Type/PageLabel entry
            # Skip its value: "PageLabel" starts with "P" and would
            # otherwise be mistaken for a prefix specification.
            skip = True
            continue
        if token.startswith("P"):  # prefix specification: extract the string
            d["prefix"] = token[1:].replace("(", "").replace(")", "")
            continue
        if token.startswith("St"):  # start page number specification
            d["firstpagenum"] = int(token[2:])
    return d
| 4917 | |
| 4918 | |
def get_label_pno(pgNo, labels):
    """Return the label for this page number.

    Args:
        pgNo: page number, 0-based.
        labels: result of doc._get_page_labels(). Need not be sorted.
    Returns:
        The label (str) of the page number. An empty string is returned if
        no label rule starts at or before this page.
    """
    # Jorj McKie, 2021-01-06

    applicable = [x for x in labels if x[0] <= pgNo]
    if not applicable:  # page precedes every rule (or there are no rules)
        return ""
    # The rule with the largest start page governs this page. Using max()
    # gives the same result as taking the last item of a sorted list, but
    # does not rely on the caller having sorted 'labels'.
    rule = rule_dict(max(applicable))
    prefix = rule.get("prefix", "")
    style = rule.get("style", "")
    # make sure we start at 0 when enumerating the alphabet
    delta = -1 if style in ("a", "A") else 0
    pagenumber = pgNo - rule["startpage"] + rule["firstpagenum"] + delta
    return construct_label(style, prefix, pagenumber)
| 4938 | |
| 4939 | |
def get_label(page):
    """Return the label for this PDF page.

    Args:
        page: page object.
    Returns:
        The label (str) of the page. An empty string is returned if the
        owning document defines no page labels.
    """
    # Jorj McKie, 2021-01-06

    rules = page.parent._get_page_labels()
    if rules:
        rules.sort()  # the lookup expects rules ordered by start page
        return get_label_pno(page.number, rules)
    return ""
| 4955 | |
| 4956 | |
def get_page_numbers(doc, label, only_one=False):
    """Return a list of page numbers with the given label.

    Args:
        doc: PDF document object (resp. 'self').
        label: (str) label.
        only_one: (bool) stop searching after first hit.
    Returns:
        List of page numbers having this label. The list is empty if the
        label is empty or the document has no page label definitions.
    """
    # Jorj McKie, 2021-01-06

    hits = []
    if not label:
        return hits
    rules = doc._get_page_labels()
    if rules == []:
        return hits
    for pno in range(doc.page_count):
        if get_label_pno(pno, rules) != label:
            continue
        hits.append(pno)
        if only_one:  # first hit is enough
            return hits
    return hits
| 4982 | |
| 4983 | |
def construct_label(style, prefix, pno) -> str:
    """Construct a label based on style, prefix and page number.

    Styles: "D" decimal, "r" / "R" lower / upper case roman numerals,
    "a" / "A" lower / upper case letters. Any other style yields the
    prefix only.
    """
    # William Chapman, 2021-01-06

    if style == "D":
        number = str(pno)
    elif style in ("r", "R"):
        roman = integerToRoman(pno)
        number = roman.lower() if style == "r" else roman.upper()
    elif style in ("a", "A"):
        letters = integerToLetter(pno)
        number = letters.lower() if style == "a" else letters.upper()
    else:  # no or unknown style: prefix only
        number = ""
    return prefix + number
| 5001 | |
| 5002 | |
def integerToLetter(i) -> str:
    """Returns letter sequence string for integer i.

    0 gives "A", 25 gives "Z", 26 gives "AA", 27 gives "AB", and so on.
    """
    # William Chapman, Jorj McKie, 2021-01-06
    # NOTE(review): this enumerates "AA", "AB", ... after "Z" - confirm that
    # this is the sequence intended for PDF page label styles "a" / "A".
    import string
    alphabet = string.ascii_uppercase

    # determine the number of letters and the offset within that length
    width = 1
    rest = i
    while rest >= pow(26, width):
        rest -= int(math.pow(26, width))
        width += 1

    # emit the letters, most significant position first
    letters = []
    for exponent in range(width - 1, -1, -1):
        digit, rest = divmod(rest, int(math.pow(26, exponent)))
        letters.append(alphabet[digit])
    return "".join(letters)
| 5019 | |
| 5020 | |
def integerToRoman(num: int) -> str:
    """Return roman numeral for an integer.

    Args:
        num: the number to convert.
    Returns:
        The upper case roman numeral. An empty string is returned for zero
        and for negative numbers, which have no roman representation.
    """
    # William Chapman, Jorj McKie, 2021-01-06

    # Guard first: with a negative number, divmod() would yield a negative
    # count and a positive remainder, producing an unrelated numeral.
    if num <= 0:
        return ""

    roman = (
        (1000, "M"),
        (900, "CM"),
        (500, "D"),
        (400, "CD"),
        (100, "C"),
        (90, "XC"),
        (50, "L"),
        (40, "XL"),
        (10, "X"),
        (9, "IX"),
        (5, "V"),
        (4, "IV"),
        (1, "I"),
    )

    parts = []
    for value, numeral in roman:
        count, num = divmod(num, value)
        parts.append(numeral * count)
        if num == 0:  # nothing left to convert
            break
    return "".join(parts)
| 5050 | |
| 5051 | |
def get_page_labels(doc):
    """Return page label definitions in PDF document.

    Args:
        doc: PDF document (resp. 'self').
    Returns:
        A list of dictionaries, one per label rule of the document, with the
        following format:
        {'startpage': int, 'prefix': str, 'style': str, 'firstpagenum': int}.
    """
    # Jorj McKie, 2021-01-10
    definitions = []
    for rule in doc._get_page_labels():
        definitions.append(rule_dict(rule))
    return definitions
| 5063 | |
| 5064 | |
def set_page_labels(doc, labels):
    """Add / replace page label definitions in PDF document.

    Args:
        doc: PDF document (resp. 'self').
        labels: list of label dictionaries like:
        {'startpage': int, 'prefix': str, 'style': str, 'firstpagenum': int},
        as returned by get_page_labels(). The list is not modified.
    """
    # William Chapman, 2021-01-06

    def create_label_str(label):
        """Convert Python label dict to corresponding PDF rule string.

        Args:
            label: (dict) build rule for the label.
        Returns:
            The start page number followed by the PDF label rule string
            wrapped in "<<", ">>".
        """
        s = "%i<<" % label["startpage"]
        if label.get("prefix", "") != "":
            s += "/P(%s)" % label["prefix"]
        if label.get("style", "") != "":
            s += "/S/%s" % label["style"]
        if label.get("firstpagenum", 1) > 1:  # 1 is the default: omit it
            s += "/St %i" % label["firstpagenum"]
        s += ">>"
        return s

    def create_nums(labels):
        """Return concatenated string of all labels rules.

        Args:
            labels: (list) dictionaries as created by function 'rule_dict'.
        Returns:
            PDF compatible string for page label definitions, ready to be
            enclosed in PDF array 'Nums[...]'.
        """
        # sort a copy, so the caller's list keeps its order
        ordered = sorted(labels, key=lambda x: x["startpage"])
        return "".join(create_label_str(label) for label in ordered)

    doc._set_page_labels(create_nums(labels))
| 5108 | |
| 5109 | |
| 5110 # End of Page Label Code ------------------------------------------------- | |
| 5111 | |
| 5112 | |
def has_links(doc: pymupdf.Document) -> bool:
    """Check whether there are links on any page.

    Raises:
        ValueError: if the document is closed or is no PDF.
    """
    if doc.is_closed:
        raise ValueError("document closed")
    if not doc.is_pdf:
        raise ValueError("is no PDF")
    # item[1] of a page_annot_xrefs() entry is compared with the link type;
    # the scan stops at the first link found
    return any(
        item[1] == pymupdf.PDF_ANNOT_LINK  # pylint: disable=no-member
        for pno in range(doc.page_count)
        for item in doc.page_annot_xrefs(pno)
    )
| 5124 | |
| 5125 | |
def has_annots(doc: pymupdf.Document) -> bool:
    """Tell whether at least one page of the PDF carries an annotation.

    Entries of type link or widget are not counted as annotations here.

    Args:
        doc: the PDF document to inspect.
    Returns:
        True as soon as an entry is found that is neither a link nor a
        widget, else False.
    Raises:
        ValueError: if the document is closed or is not a PDF.
    """
    if doc.is_closed:
        raise ValueError("document closed")
    if not doc.is_pdf:
        raise ValueError("is no PDF")
    # pylint: disable=no-member
    return any(
        entry[1] != pymupdf.PDF_ANNOT_LINK and entry[1] != pymupdf.PDF_ANNOT_WIDGET
        for pno in range(doc.page_count)
        for entry in doc.page_annot_xrefs(pno)
    )
| 5138 | |
| 5139 | |
| 5140 # ------------------------------------------------------------------- | |
| 5141 # Functions to recover the quad contained in a text extraction bbox | |
| 5142 # ------------------------------------------------------------------- | |
def recover_bbox_quad(line_dir: tuple, span: dict, bbox: tuple) -> pymupdf.Quad:
    """Return the quad that is wrapped by a text extraction bbox.

    The bbox may be the span's own bbox or that of any of its characters.

    Args:
        line_dir: (tuple) 'line["dir"]' of the owning line. If None,
            'span["dir"]' is used instead.
        span: (dict) the span. May be from get_texttrace() method.
        bbox: (tuple) the bbox of the span or any of its characters.
    Returns:
        The quad located inside the bbox.
    """
    if line_dir is None:
        line_dir = span["dir"]
    cos, sin = line_dir
    rect = pymupdf.Rect(bbox)  # work with a Rect from here on

    # With small glyph heights active, the fontsize alone is the height.
    if pymupdf.TOOLS.set_small_glyph_heights():
        factor = 1
    else:
        factor = span["ascender"] - span["descender"]
    height = factor * span["size"]  # height of the quad's rectangle

    # Distances from the bbox corners at which the quad points are found.
    shift_x = height * sin
    shift_y = height * cos
    horiz = (shift_x, 0)
    vert = (0, shift_y)

    # Choose the two bbox corners to start from, depending on the quadrant of
    # the text writing angle. Quadrants 1 and 3 share one shift pattern,
    # quadrants 2 and 4 share the other.
    if shift_y >= 0 and shift_x <= 0:  # quadrant 1
        first, second, odd_quadrant = rect.bl, rect.tr, True
    elif shift_y <= 0 and shift_x <= 0:  # quadrant 2
        first, second, odd_quadrant = rect.br, rect.tl, False
    elif shift_y <= 0 and shift_x >= 0:  # quadrant 3
        first, second, odd_quadrant = rect.tr, rect.bl, True
    else:  # quadrant 4
        first, second, odd_quadrant = rect.tl, rect.br, False

    if odd_quadrant:
        ul = first - vert
        ur = second + horiz
        ll = first - horiz
        lr = second + vert
    else:
        ul = first + horiz
        ur = second - vert
        ll = first + vert
        lr = second - horiz
    return pymupdf.Quad(ul, ur, ll, lr)
| 5191 | |
| 5192 | |
def recover_quad(line_dir: tuple, span: dict) -> pymupdf.Quad:
    """Return the quadrilateral that envelops the text of a span.

    Args:
        line_dir: (tuple) 'line["dir"]' of the owning line.
        span: the span.
    Returns:
        The quad computed from the span's "bbox" item.
    Raises:
        ValueError: if 'line_dir' is not a tuple of length 2 or 'span' is
            not a dict.
    """
    dir_is_pair = type(line_dir) is tuple and len(line_dir) == 2
    if not dir_is_pair:
        raise ValueError("bad line dir argument")
    if type(span) is not dict:
        raise ValueError("bad span argument")
    return recover_bbox_quad(line_dir, span, span["bbox"])
| 5207 | |
| 5208 | |
def recover_line_quad(line: dict, spans: list = None) -> pymupdf.Quad:
    """Compute the quad of a line of 'dict' / 'rawdict' text extractions.

    The base line runs from the lower-left point of the first span quad to
    the lower-right point of the last span quad. The height is the largest
    span height. A rectangle of that size is built on the x-axis with its
    bottom-left in (0, 0), converted to a quad and mapped back onto the text.

    Args:
        line: (dict) the line; its "dir" and (by default) "spans" are used.
        spans: (list, optional) sub-list of spans to consider.
    Returns:
        pymupdf.Quad covering selected spans.
    Raises:
        ValueError: if the span list is empty.
    """
    if spans is None:  # no sub-selection: take all spans of the line
        spans = line["spans"]
    if len(spans) == 0:
        raise ValueError("bad span list")

    line_dir = line["dir"]  # text direction
    # The values are not used below; the unpacking fails early unless
    # line_dir consists of exactly two items.
    _cos, _sin = line_dir

    first_quad = recover_quad(line_dir, spans[0])
    if len(spans) > 1:
        last_quad = recover_quad(line_dir, spans[-1])
    else:
        last_quad = first_quad  # single span: last = first

    base_start = first_quad.ll  # lower-left of line quad
    base_end = last_quad.lr  # lower-right of line quad

    # matrix mapping the base line to the x-axis, base_start going to (0, 0)
    flatten = pymupdf.planish_line(base_start, base_end)
    flat_end = base_end * flatten

    small = pymupdf.TOOLS.set_small_glyph_heights()  # small glyph heights?
    tallest = max(
        s["size"] * (1 if small else (s["ascender"] - s["descender"]))
        for s in spans
    )

    line_quad = pymupdf.Rect(0, -tallest, flat_end.x, 0).quad
    line_quad *= ~flatten  # apply the inverse matrix to get back to the text
    return line_quad
| 5252 | |
| 5253 | |
def recover_span_quad(line_dir: tuple, span: dict, chars: list = None) -> pymupdf.Quad:
    """Compute the quad of a span of 'dict' / 'rawdict' text extractions.

    Notes:
        Without 'chars', the result of 'recover_quad' for the whole span is
        returned. With 'chars', the quads of the first and the last given
        character define the base line of the result. This requires a span
        containing a "chars" item ("rawdict" extraction option).

    Args:
        line_dir: (tuple) 'line["dir"]' of the owning line. If None,
            'span["dir"]' is used instead.
        span: (dict) the span.
        chars: (list, optional) sub-list of characters to consider.
    Returns:
        pymupdf.Quad covering selected characters.
    Raises:
        ValueError: if 'chars' is given but the span has no "chars" item.
    """
    if line_dir is None:  # must be a span from get_texttrace()
        line_dir = span["dir"]
    if chars is None:  # no sub-selection: the full span quad
        return recover_quad(line_dir, span)
    if "chars" not in span.keys():
        raise ValueError("need 'rawdict' option to sub-select chars")

    first_quad = recover_char_quad(line_dir, span, chars[0])
    if len(chars) > 1:
        last_quad = recover_char_quad(line_dir, span, chars[-1])
    else:
        last_quad = first_quad  # single char: last = first

    base_start = first_quad.ll  # lower-left of span quad
    base_end = last_quad.lr  # lower-right of span quad

    # matrix mapping the base line to the x-axis, base_start going to (0, 0)
    flatten = pymupdf.planish_line(base_start, base_end)
    flat_end = base_end * flatten

    if pymupdf.TOOLS.set_small_glyph_heights():  # small glyph heights?
        factor = 1
    else:
        factor = span["ascender"] - span["descender"]
    height = span["size"] * factor

    span_quad = pymupdf.Rect(0, -height, flat_end.x, 0).quad
    span_quad *= ~flatten  # rotate back and shift back
    return span_quad
| 5297 | |
| 5298 | |
def recover_char_quad(line_dir: tuple, span: dict, char: dict) -> pymupdf.Quad:
    """Recover the quadrilateral of a text character.

    This requires the "rawdict" option of text extraction.

    Args:
        line_dir: (tuple) 'line["dir"]' of the span's line. If None,
            'span["dir"]' is used instead.
        span: (dict) the span dict.
        char: (dict) the character dict, whose "bbox" item is used. A tuple
            is accepted as well, in which case its item 3 is taken as the
            bbox.
    Returns:
        The quadrilateral enveloping the character.
    Raises:
        ValueError: if 'line_dir' is not a tuple of length 2, if 'span' is
            not a dict, or if 'char' is neither a dict nor a tuple.
    """
    if line_dir is None:
        line_dir = span["dir"]
    if type(line_dir) is not tuple or len(line_dir) != 2:
        raise ValueError("bad line dir argument")
    if type(span) is not dict:
        raise ValueError("bad span argument")
    if type(char) is dict:
        bbox = pymupdf.Rect(char["bbox"])
    elif type(char) is tuple:
        bbox = pymupdf.Rect(char[3])
    else:
        # The offending argument is 'char' - do not blame the span.
        raise ValueError("bad char argument")

    return recover_bbox_quad(line_dir, span, bbox)
| 5325 | |
| 5326 | |
| 5327 # ------------------------------------------------------------------- | |
| 5328 # Building font subsets using fontTools | |
| 5329 # ------------------------------------------------------------------- | |
def subset_fonts(doc: pymupdf.Document, verbose: bool = False, fallback: bool = False) -> OptInt:
    """Build font subsets in a PDF.

    Eligible fonts are potentially replaced by smaller versions. Page text is
    NOT rewritten and thus should retain properties like being hidden or
    controlled by optional content.

    This method by default uses MuPDF's own internal feature to create subset
    fonts. As this is a new function, errors may still occur. In this case,
    please fall back to using the previous version by using "fallback=True".
    Fallback mode requires the external package 'fontTools'.

    Args:
        doc: the PDF document.
        fallback: use the older deprecated implementation.
        verbose: only used by fallback mode.

    Returns:
        The new MuPDF-based code returns None. The deprecated fallback
        mode returns 0 if there are no fonts to subset. Otherwise, it
        returns the decrease in fontsize (the difference in fontsize),
        measured in bytes. Fonts for which no smaller subset could be built
        keep their size and do not contribute to the decrease.
    """
    # Font binaries: - "buffer" -> (names, xrefs, (unicodes, glyphs))
    # An embedded font is uniquely defined by its fontbuffer only. It may have
    # multiple names and xrefs.
    # Once the sets of used unicodes and glyphs are known, we compute a
    # smaller version of the buffer using package fontTools.

    if not fallback:  # by default use MuPDF function
        pdf = mupdf.pdf_document_from_fz_document(doc)
        mupdf.pdf_subset_fonts2(pdf, list(range(doc.page_count)))
        return None

    font_buffers = {}

    def get_old_widths(xref):
        """Retrieve old font '/W' and '/DW' values."""
        df = doc.xref_get_key(xref, "DescendantFonts")
        if df[0] != "array":  # only handle xref specifications
            return None, None
        df_xref = int(df[1][1:-1].replace("0 R", ""))
        widths = doc.xref_get_key(df_xref, "W")
        if widths[0] != "array":  # no widths key found
            widths = None
        else:
            widths = widths[1]
        dwidths = doc.xref_get_key(df_xref, "DW")
        if dwidths[0] != "int":
            dwidths = None
        else:
            dwidths = dwidths[1]
        return widths, dwidths

    def set_old_widths(xref, widths, dwidths):
        """Restore the old '/W' and '/DW' in subsetted font.

        If either parameter is None or evaluates to False, the corresponding
        dictionary key will be set to null.
        """
        df = doc.xref_get_key(xref, "DescendantFonts")
        if df[0] != "array":  # only handle xref specs
            return None
        df_xref = int(df[1][1:-1].replace("0 R", ""))
        for key, value in (("W", widths), ("DW", dwidths)):
            if type(value) is str and value:
                doc.xref_set_key(df_xref, key, value)
            elif doc.xref_get_key(df_xref, key)[0] != "null":
                # No usable old value: remove the key. If the key already is
                # null there is nothing to do - in particular, never hand
                # None or an empty value over to xref_set_key().
                doc.xref_set_key(df_xref, key, "null")
        return None

    def set_subset_fontname(new_xref):
        """Generate a name prefix to tag a font as subset.

        We use a random generator to select 6 upper case ASCII characters.
        The prefixed name must be put in the font xref as the "/BaseFont" value
        and in the FontDescriptor object as the '/FontName' value.
        """
        # The following generates a prefix like 'ABCDEF+'
        import random
        import string
        prefix = "".join(random.choices(tuple(string.ascii_uppercase), k=6)) + "+"
        font_str = doc.xref_object(new_xref, compressed=True)
        font_str = font_str.replace("/BaseFont/", "/BaseFont/" + prefix)
        df = doc.xref_get_key(new_xref, "DescendantFonts")
        if df[0] == "array":
            df_xref = int(df[1][1:-1].replace("0 R", ""))
            fd = doc.xref_get_key(df_xref, "FontDescriptor")
            if fd[0] == "xref":
                fd_xref = int(fd[1].replace("0 R", ""))
                fd_str = doc.xref_object(fd_xref, compressed=True)
                fd_str = fd_str.replace("/FontName/", "/FontName/" + prefix)
                doc.update_object(fd_xref, fd_str)
        doc.update_object(new_xref, font_str)

    def build_subset(buffer, unc_set, gid_set):
        """Build font subset using fontTools.

        Args:
            buffer: (bytes) the font given as a binary buffer.
            unc_set: (set) required unicodes. The value 255 is added to it
                when the unicodes are used for subsetting.
            gid_set: (set) required glyph ids. Used instead of the unicodes
                if 'unc_set' contains 0xFFFD; the value 189 is added to it
                in that case.
        Returns:
            Either None if subsetting is unsuccessful or the subset font buffer.
        Raises:
            ImportError: if fontTools is not installed.
        """
        try:
            import fontTools.subset as fts
        except ImportError:
            if g_exceptions_verbose:
                pymupdf.exception_info()
            pymupdf.message("This method requires fontTools to be installed.")
            raise
        import tempfile
        with tempfile.TemporaryDirectory() as tmp_dir:
            oldfont_path = f"{tmp_dir}/oldfont.ttf"
            newfont_path = f"{tmp_dir}/newfont.ttf"
            uncfile_path = f"{tmp_dir}/uncfile.txt"
            args = [
                oldfont_path,
                "--retain-gids",
                f"--output-file={newfont_path}",
                "--layout-features=*",
                "--passthrough-tables",
                "--ignore-missing-glyphs",
                "--ignore-missing-unicodes",
                "--symbol-cmap",
            ]

            # store glyph ids or unicodes as file
            with open(uncfile_path, "w", encoding='utf8') as unc_file:
                if 0xFFFD in unc_set:  # error unicode exists -> use glyphs
                    args.append(f"--gids-file={uncfile_path}")
                    gid_set.add(189)
                    unc_file.writelines("%i\n" % gid for gid in gid_set)
                else:
                    args.append(f"--unicodes-file={uncfile_path}")
                    unc_set.add(255)
                    unc_file.writelines("%04x\n" % unc for unc in unc_set)

            # store fontbuffer as a file
            with open(oldfont_path, "wb") as fontfile:
                fontfile.write(buffer)
            try:
                os.remove(newfont_path)  # remove old file
            except OSError:
                pass  # normally there is no such file yet
            try:  # invoke fontTools subsetter
                fts.main(args)
                font = pymupdf.Font(fontfile=newfont_path)
                new_buffer = font.buffer  # subset font binary
                if font.glyph_count == 0:  # intercept empty font
                    new_buffer = None
            except Exception:
                pymupdf.exception_info()
                new_buffer = None
        return new_buffer

    def repl_fontnames(doc):
        """Populate 'font_buffers'.

        For each font candidate, store its xref and the list of names
        by which PDF text may refer to it (there may be multiple).
        """

        def norm_name(name):
            """Recreate font name that contains PDF hex codes.

            E.g. #20 -> space, chr(32)
            """
            import re
            # Decode every '#xx' in one single pass, so that a decoded
            # character (notably '#23' -> '#') is never scanned again.
            return re.sub(
                r"#([0-9A-Fa-f]{2})",
                lambda m: chr(int(m.group(1), 16)),
                name,
            )

        def get_fontnames(doc, item):
            """Return a list of fontnames for an item of page.get_fonts().

            There may be multiple names e.g. for Type0 fonts.
            """
            fontname = item[3]
            names = [fontname]
            fontname = doc.xref_get_key(item[0], "BaseFont")[1][1:]
            fontname = norm_name(fontname)
            if fontname not in names:
                names.append(fontname)
            descendents = doc.xref_get_key(item[0], "DescendantFonts")
            if descendents[0] != "array":
                return names
            descendents = descendents[1][1:-1]
            if descendents.endswith(" 0 R"):
                xref = int(descendents[:-4])
                descendents = doc.xref_object(xref, compressed=True)
            p1 = descendents.find("/BaseFont")
            if p1 >= 0:
                p2 = descendents.find("/", p1 + 1)
                # The name ends at the next "/" or ">>", whichever comes
                # first. str.find() returns -1 for "not found", which must
                # not take part in the minimum computation.
                ends = [
                    p
                    for p in (
                        descendents.find("/", p2 + 1),
                        descendents.find(">>", p2 + 1),
                    )
                    if p >= 0
                ]
                p1 = min(ends) if ends else len(descendents)
                fontname = descendents[p2 + 1 : p1]
                fontname = norm_name(fontname)
                if fontname not in names:
                    names.append(fontname)
            return names

        for i in range(doc.page_count):
            for f in doc.get_page_fonts(i, full=True):
                font_xref = f[0]  # font xref
                font_ext = f[1]  # font file extension
                basename = f[3]  # font basename

                if font_ext not in (  # skip if not supported by fontTools
                    "otf",
                    "ttf",
                    "woff",
                    "woff2",
                ):
                    continue
                # skip fonts which already are subsets
                if len(basename) > 6 and basename[6] == "+":
                    continue

                extr = doc.extract_font(font_xref)
                fontbuffer = extr[-1]
                names = get_fontnames(doc, f)
                name_set, xref_set, subsets = font_buffers.get(
                    fontbuffer, (set(), set(), (set(), set()))
                )
                xref_set.add(font_xref)
                for name in names:
                    name_set.add(name)
                font = pymupdf.Font(fontbuffer=fontbuffer)
                name_set.add(font.name)
                del font
                font_buffers[fontbuffer] = (name_set, xref_set, subsets)

    def find_buffer_by_name(name):
        """Return the font buffer known under the given name, or None."""
        for buffer, (name_set, _, _) in font_buffers.items():
            if name in name_set:
                return buffer
        return None

    # -----------------
    # main function
    # -----------------
    repl_fontnames(doc)  # populate font information
    if not font_buffers:  # nothing found to do
        if verbose:
            pymupdf.message('No fonts to subset.')
        return 0

    old_fontsize = sum(len(fontbuffer) for fontbuffer in font_buffers)
    new_fontsize = 0

    # Scan page text for usage of subsettable fonts
    for page in doc:
        # go through the text and extend set of used glyphs by font
        # we use a modified MuPDF trace device, which delivers us glyph ids.
        for span in page.get_texttrace():
            if type(span) is not dict:  # skip useless information
                continue
            fontname = span["font"][:33]  # fontname for the span
            buffer = find_buffer_by_name(fontname)
            if buffer is None:
                continue
            name_set, xref_set, (set_ucs, set_gid) = font_buffers[buffer]
            for c in span["chars"]:
                set_ucs.add(c[0])  # unicode
                set_gid.add(c[1])  # glyph id
            font_buffers[buffer] = (name_set, xref_set, (set_ucs, set_gid))

    # build the font subsets
    for old_buffer, (name_set, xref_set, subsets) in font_buffers.items():
        new_buffer = build_subset(old_buffer, subsets[0], subsets[1])
        fontname = list(name_set)[0]
        if new_buffer is None or len(new_buffer) >= len(old_buffer):
            # subset was not created or did not get smaller
            if verbose:
                pymupdf.message(f'Cannot subset {fontname!r}.')
            # The font stays in the PDF unchanged: count its full size, so
            # that it does not show up as a size decrease in the result.
            new_fontsize += len(old_buffer)
            continue
        if verbose:
            pymupdf.message(f"Built subset of font {fontname!r}.")
        val = doc._insert_font(fontbuffer=new_buffer)  # store subset font in PDF
        new_xref = val[0]  # get its xref
        set_subset_fontname(new_xref)  # tag fontname as subset font
        font_str = doc.xref_object(  # get its object definition
            new_xref,
            compressed=True,
        )
        # walk through the original font xrefs and replace each by the subset def
        for font_xref in xref_set:
            # we need the original '/W' and '/DW' width values
            width_table, def_width = get_old_widths(font_xref)
            # ... and replace original font definition at xref with it
            doc.update_object(font_xref, font_str)
            # now copy over old '/W' and '/DW' values
            if width_table or def_width:
                set_old_widths(font_xref, width_table, def_width)
        # 'new_xref' remains unused in the PDF and must be removed
        # by garbage collection.
        new_fontsize += len(new_buffer)

    return old_fontsize - new_fontsize
| 5641 | |
| 5642 | |
| 5643 # ------------------------------------------------------------------- | |
| 5644 # Copy XREF object to another XREF | |
| 5645 # ------------------------------------------------------------------- | |
def xref_copy(doc: pymupdf.Document, source: int, target: int, *, keep: list = None) -> None:
    """Make the PDF object at 'target' a copy of the one at 'source'.

    Args:
        doc: PDF document object
        source: source xref number
        target: target xref number, the xref must already exist
        keep: an optional list of 1st level keys in target that should not be
            removed before copying.
    Notes:
        Comparable to the copy() method of Python dictionaries. If the
        source is a stream object, its raw stream is transferred as well.
    """
    if doc.xref_is_stream(source):
        # Transfer the raw stream. With compress=False it is stored as read,
        # so the source compression is kept; new=True covers a target that
        # is no stream yet.
        raw_stream = doc.xref_stream_raw(source)
        doc.update_stream(target, raw_stream, compress=False, new=True)

    # Empty the target: set all its keys to null, except the protected ones.
    protected = [] if keep is None else keep
    for name in doc.xref_get_keys(target):
        if name not in protected:
            doc.xref_set_key(target, name, "null")

    # Copy over all items of the source dictionary.
    for name in doc.xref_get_keys(source):
        doc.xref_set_key(target, name, doc.xref_get_key(source, name)[1])
