comparison src/__init__.py @ 41:71bcc18e306f

MERGE: New upstream PyMuPDF v1.26.5 including MuPDF v1.26.10 BUGS: Needs some additional changes yet. Not yet tested.
author Franz Glasner <fzglas.hg@dom66.de>
date Sat, 11 Oct 2025 15:24:40 +0200
parents 3b13504f9d89 a6bc019ac0b2
children 4621bd954a09
comparison
equal deleted inserted replaced
38:8934ac156ef5 41:71bcc18e306f
15 import io 15 import io
16 import math 16 import math
17 import os 17 import os
18 import pathlib 18 import pathlib
19 import glob 19 import glob
20 import packaging.version
21 import re 20 import re
22 import string 21 import string
23 import sys 22 import sys
24 import tarfile 23 import tarfile
25 import time 24 import time
382 from ._build import mupdf_location # noqa F401 381 from ._build import mupdf_location # noqa F401
383 from ._build import pymupdf_git_branch # noqa F401 382 from ._build import pymupdf_git_branch # noqa F401
384 from ._build import pymupdf_git_diff # noqa F401 383 from ._build import pymupdf_git_diff # noqa F401
385 from ._build import pymupdf_git_sha # noqa F401 384 from ._build import pymupdf_git_sha # noqa F401
386 from ._build import pymupdf_version # noqa F401 385 from ._build import pymupdf_version # noqa F401
386 from ._build import pymupdf_version_tuple # noqa F401
387 from ._build import swig_version # noqa F401 387 from ._build import swig_version # noqa F401
388 from ._build import swig_version_tuple # noqa F401 388 from ._build import swig_version_tuple # noqa F401
389 389
390 mupdf_version = mupdf.FZ_VERSION 390 mupdf_version = mupdf.FZ_VERSION
391 391
392 # Removed in PyMuPDF-1.26.1. 392 # Removed in PyMuPDF-1.26.1.
393 pymupdf_date = None 393 pymupdf_date = None
394 394
395 # Versions as tuples; useful when comparing versions. 395 # Versions as tuples; useful when comparing versions.
396 # 396 #
397 pymupdf_version_tuple = packaging.version.Version(pymupdf_version).release
398 mupdf_version_tuple = packaging.version.Version(mupdf_version).release 397 mupdf_version_tuple = packaging.version.Version(mupdf_version).release
399 398
400 assert mupdf_version_tuple == (mupdf.FZ_VERSION_MAJOR, mupdf.FZ_VERSION_MINOR, mupdf.FZ_VERSION_PATCH), \ 399 assert mupdf_version_tuple == (mupdf.FZ_VERSION_MAJOR, mupdf.FZ_VERSION_MINOR, mupdf.FZ_VERSION_PATCH), \
401 f'Inconsistent MuPDF version numbers: {mupdf_version_tuple=} != {(mupdf.FZ_VERSION_MAJOR, mupdf.FZ_VERSION_MINOR, mupdf.FZ_VERSION_PATCH)=}' 400 f'Inconsistent MuPDF version numbers: {mupdf_version_tuple=} != {(mupdf.FZ_VERSION_MAJOR, mupdf.FZ_VERSION_MINOR, mupdf.FZ_VERSION_PATCH)=}'
402 401
1034 res['compression'] = mupdf.pdf_to_name(obj) 1033 res['compression'] = mupdf.pdf_to_name(obj)
1035 buf = mupdf.pdf_load_stream(sound) 1034 buf = mupdf.pdf_load_stream(sound)
1036 stream = JM_BinFromBuffer(buf) 1035 stream = JM_BinFromBuffer(buf)
1037 res['stream'] = stream 1036 res['stream'] = stream
1038 return res 1037 return res
1038
def get_text(self, *args, **kwargs):
    """Delegate to utils.get_text(), passing this object as first argument.

    All positional and keyword arguments are forwarded unchanged and the
    result of utils.get_text() is returned as is.
    """
    return utils.get_text(self, *args, **kwargs)
1041
def get_textbox(self, *args, **kwargs):
    """Delegate to utils.get_textbox(), passing this object as first argument.

    All positional and keyword arguments are forwarded unchanged and the
    result of utils.get_textbox() is returned as is.
    """
    return utils.get_textbox(self, *args, **kwargs)
1039 1044
1040 def get_textpage(self, clip=None, flags=0): 1045 def get_textpage(self, clip=None, flags=0):
1041 """Make annotation TextPage.""" 1046 """Make annotation TextPage."""
1042 CheckParent(self) 1047 CheckParent(self)
1043 options = mupdf.FzStextOptions(flags) 1048 options = mupdf.FzStextOptions(flags)
3057 raise RuntimeError( "PDF has no form fonts yet") 3062 raise RuntimeError( "PDF has no form fonts yet")
3058 k = mupdf.pdf_new_name( name) 3063 k = mupdf.pdf_new_name( name)
3059 v = JM_pdf_obj_from_str( pdf, font) 3064 v = JM_pdf_obj_from_str( pdf, font)
3060 mupdf.pdf_dict_put( fonts, k, v) 3065 mupdf.pdf_dict_put( fonts, k, v)
3061 3066
def del_toc_item(
        self,
        idx: int,
        ) -> None:
    """Remove one TOC / bookmark entry, selected by its index.

    Args:
        idx: position of the entry in the list returned by
            get_outline_xrefs(). The xref found at that position is handed
            to _remove_toc_item().
    """
    outline_xrefs = self.get_outline_xrefs()
    self._remove_toc_item(outline_xrefs[idx])
3074
3062 def _delToC(self): 3075 def _delToC(self):
3063 """Delete the TOC.""" 3076 """Delete the TOC."""
3064 if self.is_closed or self.is_encrypted: 3077 if self.is_closed or self.is_encrypted:
3065 raise ValueError("document closed or encrypted") 3078 raise ValueError("document closed or encrypted")
3066 xrefs = [] # create Python list 3079 xrefs = [] # create Python list
3101 """Delete object.""" 3114 """Delete object."""
3102 pdf = _as_pdf_document(self) 3115 pdf = _as_pdf_document(self)
3103 if not _INRANGE(xref, 1, mupdf.pdf_xref_len(pdf)-1): 3116 if not _INRANGE(xref, 1, mupdf.pdf_xref_len(pdf)-1):
3104 raise ValueError( MSG_BAD_XREF) 3117 raise ValueError( MSG_BAD_XREF)
3105 mupdf.pdf_delete_object(pdf, xref) 3118 mupdf.pdf_delete_object(pdf, xref)
3119
3120 def _do_links(
3121 doc1: 'Document',
3122 doc2: 'Document',
3123 from_page: int = -1,
3124 to_page: int = -1,
3125 start_at: int = -1,
3126 ) -> None:
3127 """Insert links contained in copied page range into destination PDF.
3128
3129 Parameter values **must** equal those of method insert_pdf(), which must
3130 have been previously executed.
3131 """
3132 #pymupdf.log( 'utils.do_links()')
3133 # --------------------------------------------------------------------------
3134 # internal function to create the actual "/Annots" object string
3135 # --------------------------------------------------------------------------
3136 def cre_annot(lnk, xref_dst, pno_src, ctm):
3137 """Create annotation object string for a passed-in link."""
3138
3139 r = lnk["from"] * ctm # rect in PDF coordinates
3140 rect = _format_g(tuple(r))
3141 if lnk["kind"] == LINK_GOTO:
3142 txt = annot_skel["goto1"] # annot_goto
3143 idx = pno_src.index(lnk["page"])
3144 p = lnk["to"] * ctm # target point in PDF coordinates
3145 annot = txt(xref_dst[idx], p.x, p.y, lnk["zoom"], rect)
3146
3147 elif lnk["kind"] == LINK_GOTOR:
3148 if lnk["page"] >= 0:
3149 txt = annot_skel["gotor1"] # annot_gotor
3150 pnt = lnk.get("to", Point(0, 0)) # destination point
3151 if type(pnt) is not Point:
3152 pnt = Point(0, 0)
3153 annot = txt(
3154 lnk["page"],
3155 pnt.x,
3156 pnt.y,
3157 lnk["zoom"],
3158 lnk["file"],
3159 lnk["file"],
3160 rect,
3161 )
3162 else:
3163 txt = annot_skel["gotor2"] # annot_gotor_n
3164 to = get_pdf_str(lnk["to"])
3165 to = to[1:-1]
3166 f = lnk["file"]
3167 annot = txt(to, f, rect)
3168
3169 elif lnk["kind"] == LINK_LAUNCH:
3170 txt = annot_skel["launch"] # annot_launch
3171 annot = txt(lnk["file"], lnk["file"], rect)
3172
3173 elif lnk["kind"] == LINK_URI:
3174 txt = annot_skel["uri"] # annot_uri
3175 annot = txt(lnk["uri"], rect)
3176
3177 else:
3178 annot = ""
3179
3180 return annot
3181
3182 # --------------------------------------------------------------------------
3183
3184 # validate & normalize parameters
3185 if from_page < 0:
3186 fp = 0
3187 elif from_page >= doc2.page_count:
3188 fp = doc2.page_count - 1
3189 else:
3190 fp = from_page
3191
3192 if to_page < 0 or to_page >= doc2.page_count:
3193 tp = doc2.page_count - 1
3194 else:
3195 tp = to_page
3196
3197 if start_at < 0:
3198 raise ValueError("'start_at' must be >= 0")
3199 sa = start_at
3200
3201 incr = 1 if fp <= tp else -1 # page range could be reversed
3202
3203 # lists of source / destination page numbers
3204 pno_src = list(range(fp, tp + incr, incr))
3205 pno_dst = [sa + i for i in range(len(pno_src))]
3206
3207 # lists of source / destination page xrefs
3208 xref_src = []
3209 xref_dst = []
3210 for i in range(len(pno_src)):
3211 p_src = pno_src[i]
3212 p_dst = pno_dst[i]
3213 old_xref = doc2.page_xref(p_src)
3214 new_xref = doc1.page_xref(p_dst)
3215 xref_src.append(old_xref)
3216 xref_dst.append(new_xref)
3217
3218 # create the links for each copied page in destination PDF
3219 for i in range(len(xref_src)):
3220 page_src = doc2[pno_src[i]] # load source page
3221 links = page_src.get_links() # get all its links
3222 #log( '{pno_src=}')
3223 #log( '{type(page_src)=}')
3224 #log( '{page_src=}')
3225 #log( '{=i len(links)}')
3226 if len(links) == 0: # no links there
3227 page_src = None
3228 continue
3229 ctm = ~page_src.transformation_matrix # calc page transformation matrix
3230 page_dst = doc1[pno_dst[i]] # load destination page
3231 link_tab = [] # store all link definitions here
3232 for l in links:
3233 if l["kind"] == LINK_GOTO and (l["page"] not in pno_src):
3234 continue # GOTO link target not in copied pages
3235 annot_text = cre_annot(l, xref_dst, pno_src, ctm)
3236 if annot_text:
3237 link_tab.append(annot_text)
3238 if link_tab != []:
3239 page_dst._addAnnot_FromString( tuple(link_tab))
3240 #log( 'utils.do_links() returning.')
3241
def _do_widgets(
        tar: 'Document',
        src: 'Document',
        graftmap,
        from_page: int = -1,
        to_page: int = -1,
        start_at: int = -1,
        join_duplicates=0,
        ) -> None:
    """Insert widgets of copied page range into target PDF.

    Parameter values **must** equal those of method insert_pdf() which
    must have been previously executed.

    Args:
        tar: target document; receives the widgets and, if it has none yet,
            a copy of the source AcroForm.
        src: source document; nothing happens if it is not a Form PDF.
        graftmap: graft map passed to mupdf.pdf_graft_mapped_object() for
            every object copied from source to target.
        from_page, to_page: source page range (may be reversed).
        start_at: target page number corresponding to the first source page.
        join_duplicates: truthy to merge fields having equal names under a
            common parent, otherwise duplicate names are made unique.
    """
    if not src.is_form_pdf:  # nothing to do: source PDF has no fields
        return

    def clean_kid_parents(acro_fields):
        """Make sure all kids have correct "Parent" pointers."""
        for i in range(acro_fields.pdf_array_len()):
            parent = acro_fields.pdf_array_get(i)
            kids = parent.pdf_dict_get(PDF_NAME("Kids"))
            for j in range(kids.pdf_array_len()):
                kid = kids.pdf_array_get(j)
                kid.pdf_dict_put(PDF_NAME("Parent"), parent)

    def join_widgets(pdf, acro_fields, xref1, xref2, name):
        """Called for each pair of widgets having the same name.

        Args:
            pdf: target MuPDF document
            acro_fields: object Root/AcroForm/Fields
            xref1, xref2: widget xrefs having same names
            name: (str) the name

        Result:
            Defined or updated widget parent that points to both widgets.
        """

        def re_target(pdf, acro_fields, xref1, kids1, xref2, kids2):
            """Merge widget in xref2 into "Kids" list of widget xref1.

            Args:
                xref1, kids1: target widget and its "Kids" array.
                xref2, kids2: source widget and its "Kids" array (may be empty).
            """
            # make indirect objects from widgets
            w1_ind = mupdf.pdf_new_indirect(pdf, xref1, 0)
            w2_ind = mupdf.pdf_new_indirect(pdf, xref2, 0)
            # find source widget in "Fields" array
            idx = acro_fields.pdf_array_find(w2_ind)
            acro_fields.pdf_array_delete(idx)

            if not kids2.pdf_is_array():  # source widget has no kids
                widget = mupdf.pdf_load_object(pdf, xref2)

                # delete name from widget and insert target as parent
                widget.pdf_dict_del(PDF_NAME("T"))
                widget.pdf_dict_put(PDF_NAME("Parent"), w1_ind)

                # put in target Kids
                kids1.pdf_array_push(w2_ind)
            else:  # copy source kids to target kids
                for i in range(kids2.pdf_array_len()):
                    kid = kids2.pdf_array_get(i)
                    kid.pdf_dict_put(PDF_NAME("Parent"), w1_ind)
                    kid_ind = mupdf.pdf_new_indirect(pdf, kid.pdf_to_num(), 0)
                    kids1.pdf_array_push(kid_ind)

        def new_target(pdf, acro_fields, xref1, w1, xref2, w2, name):
            """Make new "Parent" for two widgets with same name.

            Args:
                xref1, w1: first widget
                xref2, w2: second widget
                name: field name

            Result:
                Both widgets have no "Kids". We create a new object with the
                name and a "Kids" array containing the widgets.
                Original widgets must be removed from AcroForm/Fields.
            """
            # make new "Parent" object
            new = mupdf.pdf_new_dict(pdf, 5)
            new.pdf_dict_put_text_string(PDF_NAME("T"), name)
            kids = new.pdf_dict_put_array(PDF_NAME("Kids"), 2)
            new_obj = mupdf.pdf_add_object(pdf, new)
            new_obj_xref = new_obj.pdf_to_num()
            new_ind = mupdf.pdf_new_indirect(pdf, new_obj_xref, 0)

            # copy over some required source widget properties
            # (only taken from the first widget, w1)
            ft = w1.pdf_dict_get(PDF_NAME("FT"))
            w1.pdf_dict_del(PDF_NAME("FT"))
            new_obj.pdf_dict_put(PDF_NAME("FT"), ft)

            aa = w1.pdf_dict_get(PDF_NAME("AA"))
            w1.pdf_dict_del(PDF_NAME("AA"))
            new_obj.pdf_dict_put(PDF_NAME("AA"), aa)

            # remove name field, insert "Parent" field in source widgets
            w1.pdf_dict_del(PDF_NAME("T"))
            w1.pdf_dict_put(PDF_NAME("Parent"), new_ind)
            w2.pdf_dict_del(PDF_NAME("T"))
            w2.pdf_dict_put(PDF_NAME("Parent"), new_ind)

            # put source widgets in "kids" array
            ind1 = mupdf.pdf_new_indirect(pdf, xref1, 0)
            ind2 = mupdf.pdf_new_indirect(pdf, xref2, 0)
            kids.pdf_array_push(ind1)
            kids.pdf_array_push(ind2)

            # remove source widgets from "AcroForm/Fields"
            idx = acro_fields.pdf_array_find(ind1)
            acro_fields.pdf_array_delete(idx)
            idx = acro_fields.pdf_array_find(ind2)
            acro_fields.pdf_array_delete(idx)

            acro_fields.pdf_array_push(new_ind)

        w1 = mupdf.pdf_load_object(pdf, xref1)
        w2 = mupdf.pdf_load_object(pdf, xref2)
        kids1 = w1.pdf_dict_get(PDF_NAME("Kids"))
        kids2 = w2.pdf_dict_get(PDF_NAME("Kids"))

        # check which widget has a suitable "Kids" array
        if kids1.pdf_is_array():
            re_target(pdf, acro_fields, xref1, kids1, xref2, kids2)  # pylint: disable=arguments-out-of-order
        elif kids2.pdf_is_array():
            # roles swapped: the second widget becomes the merge target
            re_target(pdf, acro_fields, xref2, kids2, xref1, kids1)  # pylint: disable=arguments-out-of-order
        else:
            new_target(pdf, acro_fields, xref1, w1, xref2, w2, name)  # pylint: disable=arguments-out-of-order

    def get_kids(parent, kids_list):
        """Return xref list of leaf kids for a parent.

        Call with an empty list.
        """
        kids = mupdf.pdf_dict_get(parent, PDF_NAME("Kids"))
        if not kids.pdf_is_array():
            return kids_list
        for i in range(kids.pdf_array_len()):
            kid = kids.pdf_array_get(i)
            # NOTE(review): "Kids" is tested with pdf_is_array() everywhere
            # else in this function; the pdf_is_dict() test below looks like
            # it can never succeed for a "Kids" array, so nested kids would
            # be recorded as leaves - confirm intended behavior.
            if mupdf.pdf_is_dict(mupdf.pdf_dict_get(kid, PDF_NAME("Kids"))):
                kids_list = get_kids(kid, kids_list)
            else:
                kids_list.append(kid.pdf_to_num())
        return kids_list

    def kids_xrefs(widget):
        """Get the xref of the widget's "Parent" and the list of its leaf widgets.

        Returns (0, []) if the widget has no "Parent".
        """
        kids_list = []
        parent = mupdf.pdf_dict_get(widget, PDF_NAME("Parent"))
        parent_xref = parent.pdf_to_num()
        if parent_xref == 0:
            return parent_xref, kids_list
        kids_list = get_kids(parent, kids_list)
        return parent_xref, kids_list

    def deduplicate_names(pdf, acro_fields, join_duplicates=False):
        """Handle any widget name duplicates caused by the merge."""
        names = {}  # key is a widget name, value a list of widgets having it.

        # extract all names and widgets in "AcroForm/Fields"
        for i in range(mupdf.pdf_array_len(acro_fields)):
            wobject = mupdf.pdf_array_get(acro_fields, i)
            xref = wobject.pdf_to_num()

            # extract widget name and collect widget(s) using it
            T = mupdf.pdf_dict_get_text_string(wobject, PDF_NAME("T"))
            xrefs = names.get(T, [])
            xrefs.append(xref)
            names[T] = xrefs

        for name, xrefs in names.items():
            if len(xrefs) < 2:
                continue
            # only the first two xrefs are handled; further duplicates of
            # the same name (if any) are left untouched
            xref0, xref1 = xrefs[:2]  # only exactly 2 should occur!
            if join_duplicates:  # combine fields with equal names
                join_widgets(pdf, acro_fields, xref0, xref1, name)
            else:  # make field names unique
                newname = name + f" [{xref1}]"  # append this to the name
                wobject = mupdf.pdf_load_object(pdf, xref1)
                wobject.pdf_dict_put_text_string(PDF_NAME("T"), newname)

        clean_kid_parents(acro_fields)

    def get_acroform(doc):
        """Retrieve the AcroForm dictionary from a PDF."""
        pdf = mupdf.pdf_document_from_fz_document(doc)
        # AcroForm (= central form field info)
        return mupdf.pdf_dict_getp(mupdf.pdf_trailer(pdf), "Root/AcroForm")

    tarpdf = mupdf.pdf_document_from_fz_document(tar)
    srcpdf = mupdf.pdf_document_from_fz_document(src)

    if tar.is_form_pdf:
        # target is a Form PDF, so use it to include source fields
        acro = get_acroform(tar)
        # Important arrays in AcroForm
        acro_fields = acro.pdf_dict_get(PDF_NAME("Fields"))
        tar_co = acro.pdf_dict_get(PDF_NAME("CO"))
        if not tar_co.pdf_is_array():
            tar_co = acro.pdf_dict_put_array(PDF_NAME("CO"), 5)
    else:
        # target is no Form PDF, so copy over source AcroForm
        acro = mupdf.pdf_deep_copy_obj(get_acroform(src))  # make a copy

        # Clear "Fields" and "CO" arrays: will be populated by page fields.
        # This is required to avoid copying unneeded objects.
        acro.pdf_dict_del(PDF_NAME("Fields"))
        acro.pdf_dict_put_array(PDF_NAME("Fields"), 5)
        acro.pdf_dict_del(PDF_NAME("CO"))
        acro.pdf_dict_put_array(PDF_NAME("CO"), 5)

        # Enrich AcroForm for copying to target
        acro_graft = mupdf.pdf_graft_mapped_object(graftmap, acro)

        # Insert AcroForm into target PDF
        acro_tar = mupdf.pdf_add_object(tarpdf, acro_graft)
        acro_fields = acro_tar.pdf_dict_get(PDF_NAME("Fields"))
        tar_co = acro_tar.pdf_dict_get(PDF_NAME("CO"))

        # get its xref and insert it into target catalog
        tar_xref = acro_tar.pdf_to_num()
        acro_tar_ind = mupdf.pdf_new_indirect(tarpdf, tar_xref, 0)
        root = mupdf.pdf_dict_get(mupdf.pdf_trailer(tarpdf), PDF_NAME("Root"))
        root.pdf_dict_put(PDF_NAME("AcroForm"), acro_tar_ind)

    # NOTE(review): from_page / to_page are used as passed in (no clamping
    # or default substitution happens here) - presumably the caller hands
    # over already normalized page numbers; confirm.
    if from_page <= to_page:
        src_range = range(from_page, to_page + 1)
    else:
        src_range = range(from_page, to_page - 1, -1)

    parents = {}  # information about widget parents

    # remove "P" owning page reference from all widgets of all source pages
    for i in src_range:
        src_page = src[i]
        for xref in [
            xref
            for xref, wtype, _ in src_page.annot_xrefs()
            if wtype == mupdf.PDF_ANNOT_WIDGET  # pylint: disable=no-member
        ]:
            w_obj = mupdf.pdf_load_object(srcpdf, xref)
            w_obj.pdf_dict_del(PDF_NAME("P"))

            # get the widget's parent structure
            parent_xref, old_kids = kids_xrefs(w_obj)
            if parent_xref:
                # keyed by source parent xref; target-side values are
                # filled in by the next loop
                parents[parent_xref] = {
                    "new_xref": 0,
                    "old_kids": old_kids,
                    "new_kids": [],
                }
    # Copy over Parent widgets first - they are not page-dependent
    for xref in parents.keys():  # pylint: disable=consider-using-dict-items
        parent = mupdf.pdf_load_object(srcpdf, xref)
        parent_graft = mupdf.pdf_graft_mapped_object(graftmap, parent)
        parent_tar = mupdf.pdf_add_object(tarpdf, parent_graft)
        kids_xrefs_new = get_kids(parent_tar, [])
        parent_xref_new = parent_tar.pdf_to_num()
        parent_ind = mupdf.pdf_new_indirect(tarpdf, parent_xref_new, 0)
        acro_fields.pdf_array_push(parent_ind)
        parents[xref]["new_xref"] = parent_xref_new
        parents[xref]["new_kids"] = kids_xrefs_new

    for i in range(len(src_range)):
        # read first copied over page in target
        tar_page = tar[start_at + i]

        # read the original page in the source PDF
        src_page = src[src_range[i]]

        # now walk through source page widgets and copy over
        w_xrefs = [  # widget xrefs of the source page
            xref
            for xref, wtype, _ in src_page.annot_xrefs()
            if wtype == mupdf.PDF_ANNOT_WIDGET  # pylint: disable=no-member
        ]
        if not w_xrefs:  # no widgets on this source page
            continue

        # convert to formal PDF page
        tar_page_pdf = mupdf.pdf_page_from_fz_page(tar_page)

        # extract annotations array
        tar_annots = mupdf.pdf_dict_get(tar_page_pdf.obj(), PDF_NAME("Annots"))
        if not mupdf.pdf_is_array(tar_annots):
            tar_annots = mupdf.pdf_dict_put_array(
                tar_page_pdf.obj(), PDF_NAME("Annots"), 5
            )

        for xref in w_xrefs:
            w_obj = mupdf.pdf_load_object(srcpdf, xref)

            # check if field takes part in inter-field validations
            is_aac = mupdf.pdf_is_dict(mupdf.pdf_dict_getp(w_obj, "AA/C"))

            # check if parent of widget already in target
            parent_xref = mupdf.pdf_to_num(
                w_obj.pdf_dict_get(PDF_NAME("Parent"))
            )
            if parent_xref == 0:  # widget has no parent: copy it individually
                try:
                    w_obj_graft = mupdf.pdf_graft_mapped_object(graftmap, w_obj)
                except Exception as e:
                    # best effort: warn and skip widgets that cannot be grafted
                    message_warning(f"cannot copy widget at {xref=}: {e}")
                    continue
                w_obj_tar = mupdf.pdf_add_object(tarpdf, w_obj_graft)
                tar_xref = w_obj_tar.pdf_to_num()
                w_obj_tar_ind = mupdf.pdf_new_indirect(tarpdf, tar_xref, 0)
                mupdf.pdf_array_push(tar_annots, w_obj_tar_ind)
                mupdf.pdf_array_push(acro_fields, w_obj_tar_ind)
            else:
                # parent was copied above: map the source kid position to
                # the kid at the same position of the copied parent
                parent = parents[parent_xref]
                idx = parent["old_kids"].index(xref)  # search for xref in parent
                tar_xref = parent["new_kids"][idx]
                w_obj_tar_ind = mupdf.pdf_new_indirect(tarpdf, tar_xref, 0)
                mupdf.pdf_array_push(tar_annots, w_obj_tar_ind)

            # Into "AcroForm/CO" if a computation field.
            if is_aac:
                mupdf.pdf_array_push(tar_co, w_obj_tar_ind)

    deduplicate_names(tarpdf, acro_fields, join_duplicates=join_duplicates)
3106 3567
3107 def _embeddedFileGet(self, idx): 3568 def _embeddedFileGet(self, idx):
3108 pdf = _as_pdf_document(self) 3569 pdf = _as_pdf_document(self)
3109 names = mupdf.pdf_dict_getl( 3570 names = mupdf.pdf_dict_getl(
3110 mupdf.pdf_trailer(pdf), 3571 mupdf.pdf_trailer(pdf),
4265 finally: 4726 finally:
4266 mupdf.ll_pdf_drop_page_tree( pdf.m_internal) 4727 mupdf.ll_pdf_drop_page_tree( pdf.m_internal)
4267 4728
4268 self._reset_page_refs() 4729 self._reset_page_refs()
4269 4730
def get_char_widths(
        doc: 'Document',
        xref: int,
        limit: int = 256,
        idx: int = 0,
        fontdict: OptDict = None,
        ) -> list:
    """Get list of glyph information of a font.

    Notes:
        Must be provided by its XREF number. If we already dealt with the
        font, it will be recorded in doc.FontInfos. Otherwise we insert an
        entry there.
        Finally we return the glyphs for the font. This is a list of
        (glyph, width) where glyph is an integer controlling the char
        appearance, and width is a float controlling the char's spacing:
        width * fontsize is the actual space.
        For 'simple' fonts, glyph == ord(char) will usually be true.
        Exceptions are 'Symbol' and 'ZapfDingbats'. We are providing data
        for these directly here.

    Args:
        xref: xref number of the font.
        limit: minimum number of glyphs wanted; values below 256 are
            raised to 256.
        idx: passed through to doc._get_char_widths().
        fontdict: optional font properties with keys "name", "ext" and
            "type". The dictionary is updated in place with the keys
            "simple", "glyphs" and "ordering", which are always derived
            here from name and type.

    Returns:
        The glyph list, or None for CJK fonts.

    Raises:
        ValueError: if the font's extension is the empty string.
    """
    fontinfo = CheckFontInfo(doc, xref)
    if fontinfo is None:  # not recorded yet: create it
        if fontdict is None:
            name, ext, stype, asc, dsc = utils._get_font_properties(doc, xref)
            fontdict = {
                "name": name,
                "type": stype,
                "ext": ext,
                "ascender": asc,
                "descender": dsc,
            }
        else:
            # "ordering" and "simple" are recomputed below, so they are
            # deliberately not required to be present in the given dict.
            name = fontdict["name"]
            ext = fontdict["ext"]
            stype = fontdict["type"]

        if ext == "":
            raise ValueError("xref is not a font")

        # 'simple' fonts are recognized by their type
        fontdict["simple"] = stype in ("Type1", "MMType1", "TrueType")

        # CJK fonts are recognized by name; -1 means "not a CJK font"
        cjk_ordering = {
            "Fangti": 0, "Ming": 0,
            "Heiti": 1, "Song": 1,
            "Gothic": 2, "Mincho": 2,
            "Dotum": 3, "Batang": 3,
        }
        ordering = cjk_ordering.get(name, -1)

        # glyph tables for these two fonts are provided directly
        if name == "ZapfDingbats":
            glyphs = zapf_glyphs
        elif name == "Symbol":
            glyphs = symbol_glyphs
        else:
            glyphs = None

        fontdict["glyphs"] = glyphs
        fontdict["ordering"] = ordering
        fontinfo = [xref, fontdict]
        doc.FontInfos.append(fontinfo)
    else:
        fontdict = fontinfo[1]
        glyphs = fontdict["glyphs"]
        ordering = fontdict["ordering"]

    oldlimit = 0 if glyphs is None else len(glyphs)
    mylimit = max(256, limit)

    if mylimit <= oldlimit:  # enough glyphs are known already
        return glyphs

    if ordering < 0:  # not a CJK font
        glyphs = doc._get_char_widths(
            xref, fontdict["name"], fontdict["ext"], fontdict["ordering"], mylimit, idx
        )
    else:  # CJK fonts use char codes and width = 1
        glyphs = None

    fontdict["glyphs"] = glyphs
    fontinfo[1] = fontdict
    UpdateFontInfo(doc, fontinfo)

    return glyphs
4831
4270 def get_layer(self, config=-1): 4832 def get_layer(self, config=-1):
4271 """Content of ON, OFF, RBGroups of an OC layer.""" 4833 """Content of ON, OFF, RBGroups of an OC layer."""
4272 pdf = _as_pdf_document(self) 4834 pdf = _as_pdf_document(self)
4273 ocp = mupdf.pdf_dict_getl( 4835 ocp = mupdf.pdf_dict_getl(
4274 mupdf.pdf_trailer( pdf), 4836 mupdf.pdf_trailer( pdf),
4322 xref = 0 4884 xref = 0
4323 ENSURE_OPERATION(pdf) 4885 ENSURE_OPERATION(pdf)
4324 xref = mupdf.pdf_create_object(pdf) 4886 xref = mupdf.pdf_create_object(pdf)
4325 return xref 4887 return xref
4326 4888
def get_oc(doc: 'Document', xref: int) -> int:
    """Return optional content object xref for an image or form xobject.

    Args:
        xref: (int) xref number of an image or form xobject.

    Returns:
        The xref number referenced by the object's "OC" key, or 0 if that
        key is not an indirect reference.

    Raises:
        ValueError: if the document is closed or encrypted, or if the
            object's "Subtype" is neither /Image nor /Form.
    """
    if doc.is_closed or doc.is_encrypted:
        raise ValueError("document closed or encrypted")
    t, name = doc.xref_get_key(xref, "Subtype")
    if t != "name" or name not in ("/Image", "/Form"):
        raise ValueError("bad object type at xref %i" % xref)
    t, oc = doc.xref_get_key(xref, "OC")
    if t != "xref":
        return 0
    # The value has the form "<number> <generation> R". Taking the first
    # token also works for references whose generation is not 0.
    return int(oc.split()[0])
4905
4327 def get_ocgs(self): 4906 def get_ocgs(self):
4328 """Show existing optional content groups.""" 4907 """Show existing optional content groups."""
4329 ci = mupdf.pdf_new_name( "CreatorInfo") 4908 ci = mupdf.pdf_new_name( "CreatorInfo")
4330 pdf = _as_pdf_document(self) 4909 pdf = _as_pdf_document(self)
4331 ocgs = mupdf.pdf_dict_getl( 4910 ocgs = mupdf.pdf_dict_getl(
4354 m = mupdf.pdf_array_len( intent) 4933 m = mupdf.pdf_array_len( intent)
4355 for j in range(m): 4934 for j in range(m):
4356 o = mupdf.pdf_array_get( intent, j) 4935 o = mupdf.pdf_array_get( intent, j)
4357 if mupdf.pdf_is_name( o): 4936 if mupdf.pdf_is_name( o):
4358 intents.append( mupdf.pdf_to_name( o)) 4937 intents.append( mupdf.pdf_to_name( o))
4359 hidden = mupdf.pdf_is_ocg_hidden( pdf, mupdf.PdfObj(), usage, ocg) 4938 if mupdf_version_tuple >= (1, 27):
4939 resource_stack = mupdf.PdfResourceStack()
4940 hidden = mupdf.pdf_is_ocg_hidden( pdf, resource_stack, usage, ocg)
4941 else:
4942 hidden = mupdf.pdf_is_ocg_hidden( pdf, mupdf.PdfObj(), usage, ocg)
4360 item = { 4943 item = {
4361 "name": name, 4944 "name": name,
4362 "intent": intents, 4945 "intent": intents,
4363 "on": not hidden, 4946 "on": not hidden,
4364 "usage": usage, 4947 "usage": usage,
4365 } 4948 }
4366 temp = xref 4949 temp = xref
4367 rc[ temp] = item 4950 rc[ temp] = item
4368 return rc 4951 return rc
4952
def get_ocmd(doc: 'Document', xref: int) -> dict:
    """Return the definition of an OCMD (optional content membership dictionary).

    Recognizes PDF dict keys /OCGs (PDF array of OCGs), /P (policy string) and
    /VE (visibility expression, PDF array). Via string manipulation, this
    info is converted to a Python dictionary with keys "xref", "ocgs", "policy"
    and "ve" - ready to recycle as input for 'set_ocmd()'.

    Args:
        xref: (int) xref number of the OCMD object.

    Returns:
        dict with "xref" (int), "ocgs" (list of int or None), "policy"
        (one of "AllOn", "AnyOn", "AnyOff", "AllOff", or None) and "ve"
        (nested list or None).

    Raises:
        ValueError: for an xref outside the document's xref range, an
            object that is not an OCMD, or irregular /P or /VE syntax.
    """
    if xref not in range(doc.xref_length()):
        raise ValueError("bad xref")
    text = doc.xref_object(xref, compressed=True)
    if "/Type/OCMD" not in text:
        raise ValueError("bad object type")
    textlen = len(text)

    p0 = text.find("/OCGs[")  # look for /OCGs key
    p1 = text.find("]", p0)
    if p0 < 0 or p1 < 0:  # no OCGs found
        ocgs = None
    else:
        # Every array item is an indirect reference "<num> <gen> R"; keep
        # the object numbers (also works for generations other than 0).
        ocgs = [int(num) for num in re.findall(r"(\d+) \d+ R", text[p0 + 6 : p1])]

    p0 = text.find("/P/")  # look for /P policy key
    if p0 < 0:
        policy = None
    else:
        # Take the name token that directly follows "/P/". Searching the
        # rest of the object for "ff" / "on" misses "AllOn" / "AnyOn"
        # (capital "O") and may hit text belonging to other keys.
        found = re.match(r"/P/(\w+)", text[p0:])
        if found is None or found.group(1) not in (
            "AllOn", "AnyOn", "AnyOff", "AllOff"
        ):  # some irregular syntax
            raise ValueError("bad object at xref")
        policy = found.group(1)

    p0 = text.find("/VE[")  # look for /VE visibility expression key
    if p0 < 0:  # no visibility expression found
        ve = None
    else:
        lp = rp = 0  # find end of /VE by matching its brackets
        p1 = p0
        while lp < 1 or lp != rp:
            p1 += 1
            if not p1 < textlen:  # some irregular syntax
                raise ValueError("bad object at xref")
            if text[p1] == "[":
                lp += 1
            if text[p1] == "]":
                rp += 1
        # p1 now positioned at the last "]"
        ve = text[p0 + 3 : p1 + 1]  # the PDF /VE array
        # turn the PDF array into JSON: operators become strings,
        # references become plain object numbers
        ve = (
            ve.replace("/And", '"and",')
            .replace("/Not", '"not",')
            .replace("/Or", '"or",')
        )
        ve = ve.replace(" 0 R]", "]").replace(" 0 R", ",").replace("][", "],[")
        import json
        try:
            ve = json.loads(ve)
        except Exception:
            # log details, then let the caller see the original exception
            exception_info()
            message(f"bad /VE key: {ve!r}")
            raise
    return {"xref": xref, "ocgs": ocgs, "policy": policy, "ve": ve}
4369 5019
4370 def get_outline_xrefs(self): 5020 def get_outline_xrefs(self):
4371 """Get list of outline xref numbers.""" 5021 """Get list of outline xref numbers."""
4372 xrefs = [] 5022 xrefs = []
4373 pdf = _as_pdf_document(self, required=0) 5023 pdf = _as_pdf_document(self, required=0)
4413 val = self._getPageInfo(pno, 2) 5063 val = self._getPageInfo(pno, 2)
4414 if not full: 5064 if not full:
4415 return [v[:-1] for v in val] 5065 return [v[:-1] for v in val]
4416 return val 5066 return val
4417 5067
def get_page_labels(self):
    """Return the page label definitions of the PDF document.

    Returns:
        A list with one entry per item delivered by _get_page_labels(),
        each converted by utils.rule_dict(). Entries are dictionaries of
        the format:
        {'startpage': int, 'prefix': str, 'style': str, 'firstpagenum': int}.
    """
    # Jorj McKie, 2021-01-10
    label_rules = self._get_page_labels()
    return [utils.rule_dict(rule) for rule in label_rules]
5077
def get_page_numbers(doc, label, only_one=False):
    """Find the pages that carry a given page label.

    Args:
        doc: PDF document object (resp. 'self').
        label: (str) label to look for.
        only_one: (bool) stop searching after the first hit.
    Returns:
        List of page numbers having this label. The list is empty if the
        label is empty or the document defines no page labels.
    """
    # Jorj McKie, 2021-01-06
    if not label:
        return []
    labels = doc._get_page_labels()
    if labels == []:
        return []

    hits = []
    for pno in range(doc.page_count):
        if utils.get_label_pno(pno, labels) != label:
            continue
        hits.append(pno)
        if only_one:
            break
    return hits
5103
def get_page_pixmap(
        doc: 'Document',
        pno: int,
        *,
        matrix: matrix_like = None,
        dpi=None,
        colorspace: Colorspace = None,
        clip: rect_like = None,
        alpha: bool = False,
        annots: bool = True,
        ) -> 'Pixmap':
    """Create the pixmap of a document page selected by page number.

    Notes:
        Convenience function calling get_pixmap() of page doc[pno].
    Args:
        pno: (int) page number
        matrix: transformation matrix; Identity is used if None.
        dpi: passed on to get_pixmap() unchanged.
        colorspace: colorspace of the pixmap; csRGB is used if None.
        clip: (irect-like) restrict rendering to this area.
        alpha: (bool) include alpha channel
        annots: (bool) also render annotations
    """
    render_args = {
        "matrix": Identity if matrix is None else matrix,
        "dpi": dpi,
        "colorspace": csRGB if colorspace is None else colorspace,
        "clip": clip,
        "alpha": alpha,
        "annots": annots,
    }
    return doc[pno].get_pixmap(**render_args)
5138
def get_page_text(
        doc: 'Document',
        pno: int,
        option: str = "text",
        clip: rect_like = None,
        flags: OptInt = None,
        textpage: 'TextPage' = None,
        sort: bool = False,
        ) -> typing.Any:
    """Extract a document page's text by page number.

    Notes:
        Convenience function calling get_text() of page doc[pno].
    Args:
        pno: page number
        option: (str) text, words, blocks, html, dict, json, rawdict, xhtml or xml.
        clip: passed on to get_text() unchanged.
        flags: passed on to get_text() unchanged.
        textpage: passed on to get_text() unchanged. This argument used to
            be accepted but silently ignored.
        sort: passed on to get_text() unchanged.
    Returns:
        Whatever get_text() of the page returns for the given option.
    """
    return doc[pno].get_text(
        option,
        clip=clip,
        flags=flags,
        textpage=textpage,
        sort=sort,
    )
5159
4418 def get_page_xobjects(self, pno: int) -> list: 5160 def get_page_xobjects(self, pno: int) -> list:
4419 """Retrieve a list of XObjects used on a page. 5161 """Retrieve a list of XObjects used on a page.
4420 """ 5162 """
4421 if self.is_closed or self.is_encrypted: 5163 if self.is_closed or self.is_encrypted:
4422 raise ValueError("document closed or encrypted") 5164 raise ValueError("document closed or encrypted")
4439 sigflag = -1 5181 sigflag = -1
4440 if sigflags.m_internal: 5182 if sigflags.m_internal:
4441 sigflag = mupdf.pdf_to_int(sigflags) 5183 sigflag = mupdf.pdf_to_int(sigflags)
4442 return sigflag 5184 return sigflag
4443 5185
def get_toc(
        doc: 'Document',
        simple: bool = True,
        ) -> list:
    """Build the table of contents from the document outline.

    Args:
        simple: (bool) when True each row is [level, title, page]; when
            False a link destination dictionary is appended as 4th item.
    Returns:
        A list of rows in outline order (children directly follow their
        parent). Empty list if the document has no outline.
    Raises:
        ValueError: if the document is closed.
    """
    def collect(node, rows, depth):
        """Walk one sibling chain, descending into children, and append a row per item."""
        while node and node.this.m_internal:
            heading = node.title or " "  # blank titles become a single space

            # 1-based page number, or -1 for external / URI-less items
            target = -1
            if not node.is_external and node.uri:
                if node.page == -1:
                    target = doc.resolve_link(node.uri)[0] + 1
                else:
                    target = node.page + 1

            row = [depth, heading, target]
            if not simple:
                row.append(utils.getLinkDict(node, doc))
            rows.append(row)

            if node.down:
                rows = collect(node.down, rows, depth + 1)
            node = node.next
        return rows

    # ensure document is open
    if doc.is_closed:
        raise ValueError("document closed")
    doc.init_doc()
    root = doc.outline
    if not root:
        return []
    toc = collect(root, [], 1)
    if doc.is_pdf and not simple:
        doc._extend_toc_items(toc)
    return toc
5239
4444 def get_xml_metadata(self): 5240 def get_xml_metadata(self):
4445 """Get document XML metadata.""" 5241 """Get document XML metadata."""
4446 xml = None 5242 xml = None
4447 pdf = _as_pdf_document(self, required=0) 5243 pdf = _as_pdf_document(self, required=0)
4448 if pdf.m_internal: 5244 if pdf.m_internal:
4456 rc = JM_UnicodeFromBuffer(buff) 5252 rc = JM_UnicodeFromBuffer(buff)
4457 else: 5253 else:
4458 rc = '' 5254 rc = ''
4459 return rc 5255 return rc
4460 5256
def has_annots(doc: 'Document') -> bool:
    """Tell whether any page carries an annotation that is neither a link nor a widget.

    Raises:
        ValueError: if the document is closed or is not a PDF.
    """
    if doc.is_closed:
        raise ValueError("document closed")
    if not doc.is_pdf:
        raise ValueError("is no PDF")
    # Each entry of page_annot_xrefs() is indexed at [1] for its annotation type.
    # pylint: disable=no-member
    return any(
        entry[1] not in (mupdf.PDF_ANNOT_LINK, mupdf.PDF_ANNOT_WIDGET)
        for pno in range(doc.page_count)
        for entry in doc.page_annot_xrefs(pno)
    )
5269
def has_links(doc: 'Document') -> bool:
    """Tell whether any page carries at least one link annotation.

    Raises:
        ValueError: if the document is closed or is not a PDF.
    """
    if doc.is_closed:
        raise ValueError("document closed")
    if not doc.is_pdf:
        raise ValueError("is no PDF")
    # Each entry of page_annot_xrefs() is indexed at [1] for its annotation type.
    return any(
        entry[1] == mupdf.PDF_ANNOT_LINK  # pylint: disable=no-member
        for pno in range(doc.page_count)
        for entry in doc.page_annot_xrefs(pno)
    )
5281
4461 def init_doc(self): 5282 def init_doc(self):
4462 if self.is_encrypted: 5283 if self.is_encrypted:
4463 raise ValueError("cannot initialize - document still encrypted") 5284 raise ValueError("cannot initialize - document still encrypted")
4464 self._outline = self._loadOutline() 5285 self._outline = self._loadOutline()
4465 self.metadata = dict( 5286 self.metadata = dict(
4521 annots=annots, 5342 annots=annots,
4522 show_progress=show_progress, 5343 show_progress=show_progress,
4523 final=final, 5344 final=final,
4524 ) 5345 )
4525 5346
def insert_page(
        doc: 'Document',
        pno: int,
        text: typing.Union[str, list, None] = None,
        fontsize: float = 11,
        width: float = 595,
        height: float = 842,
        fontname: str = "helv",
        fontfile: OptStr = None,
        color: OptSeq = (0,),
        ) -> int:
    """Add a new PDF page and optionally write text onto it.

    Notes:
        Combines pymupdf.Document.new_page() and pymupdf.Page.insert_text();
        see those methods for parameter details. The text is inserted at
        point (50, 72).
    Returns:
        0 if no text was given (the page is still created), otherwise the
        return value of page.insert_text().
    """
    new_pg = doc.new_page(pno=pno, width=width, height=height)
    if not text:  # page only, nothing to write
        return 0
    return new_pg.insert_text(
        (50, 72),
        text,
        fontsize=fontsize,
        fontname=fontname,
        fontfile=fontfile,
        color=color,
    )
5376
4526 def insert_pdf( 5377 def insert_pdf(
4527 self, 5378 self,
4528 docsrc, 5379 docsrc,
4529 *, 5380 *,
4530 from_page=-1, 5381 from_page=-1,
5021 raise ValueError("document closed") 5872 raise ValueError("document closed")
5022 document = self.this if isinstance(self.this, mupdf.FzDocument) else self.this.super() 5873 document = self.this if isinstance(self.this, mupdf.FzDocument) else self.this.super()
5023 ret = mupdf.fz_needs_password( document) 5874 ret = mupdf.fz_needs_password( document)
5024 return ret 5875 return ret
5025 5876
def new_page(
        doc: 'Document',
        pno: int = -1,
        width: float = 595,
        height: float = 842,
        ) -> 'Page':
    """Create and return a new page object.

    Args:
        pno: (int) insert before this page. Default: after last page.
        width: (float) page width in points. Default: 595 (ISO A4 width).
        height: (float) page height in points. Default 842 (ISO A4 height).
    Returns:
        A pymupdf.Page object, obtained as doc[pno] after the insertion.
    """
    # The return annotation is quoted like the other project types in this
    # file ('Document', 'Pixmap', 'TextPage') so it is not evaluated when the
    # function is defined.
    doc._newPage(pno, width=width, height=height)
    return doc[pno]
5894
5026 def next_location(self, page_id): 5895 def next_location(self, page_id):
5027 """Get (chapter, page) of next page.""" 5896 """Get (chapter, page) of next page."""
5028 if self.is_closed or self.is_encrypted: 5897 if self.is_closed or self.is_encrypted:
5029 raise ValueError("document closed or encrypted") 5898 raise ValueError("document closed or encrypted")
5030 if type(page_id) is int: 5899 if type(page_id) is int:
5667 6536
def saveIncr(self):
    """Save PDF incrementally.

    Delegates to self.save() with self.name as the target, incremental=True
    and encryption=mupdf.PDF_ENCRYPT_KEEP, and returns that call's result.
    """
    return self.save(self.name, incremental=True, encryption=mupdf.PDF_ENCRYPT_KEEP)
5671 6540
6541 # ------------------------------------------------------------------------------
6542 # Remove potentially sensitive data from a PDF. Similar to the Adobe
6543 # Acrobat 'sanitize' function
6544 # ------------------------------------------------------------------------------
def scrub(
        doc: 'Document',
        attached_files: bool = True,
        clean_pages: bool = True,
        embedded_files: bool = True,
        hidden_text: bool = True,
        javascript: bool = True,
        metadata: bool = True,
        redactions: bool = True,
        redact_images: int = 0,
        remove_links: bool = True,
        reset_fields: bool = True,
        reset_responses: bool = True,
        thumbnails: bool = True,
        xml_metadata: bool = True,
        ) -> None:
    """Remove potentially sensitive data from a PDF.

    Each boolean argument switches one scrubbing step on or off.

    Args:
        attached_files: overwrite the content of file attachment annotations.
        clean_pages: clean page /Contents. When False, 'hidden_text' and
            'redactions' are forced off as well.
        embedded_files: delete all embedded files.
        hidden_text: remove text drawn with render mode 3 ("3 Tr").
        javascript: replace /JavaScript actions by an empty script.
        metadata: clear standard metadata via doc.set_metadata({}).
        redactions: apply redaction annotations found on a page.
        redact_images: passed as 'images' to page.apply_redactions().
        remove_links: delete all links of every page.
        reset_fields: reset every form field (widget).
        reset_responses: delete responses of every annotation.
        thumbnails: remove the /Thumb entry of every page.
        xml_metadata: delete XML metadata and /Metadata references.
    Raises:
        ValueError: if the document is not a PDF, is closed or encrypted,
            or an xref without an object definition is encountered.
    """

    def remove_hidden(cont_lines):
        """Remove hidden text from a PDF page.

        Args:
            cont_lines: list of lines with /Contents content. Should have status
                from after page.cleanContents().

        Returns:
            List of /Contents lines from which hidden text has been removed,
            or None if no "3 Tr" operator was seen (nothing to rewrite).

        Notes:
            The input must have been created after the page's /Contents object(s)
            have been cleaned with page.cleanContents(). This ensures a standard
            formatting: one command per line, single spaces between operators.
            This allows for drastic simplification of this code.
        """
        out_lines = []  # will return this
        in_text = False  # indicate if within BT/ET object
        suppress = False  # indicate text suppression active
        make_return = False
        for line in cont_lines:
            if line == b"BT":  # start of text object
                in_text = True  # switch on
                out_lines.append(line)  # output it
                continue
            if line == b"ET":  # end of text object
                in_text = False  # switch off
                out_lines.append(line)  # output it
                continue
            if line == b"3 Tr":  # text suppression operator
                suppress = True  # switch on
                make_return = True
                continue
            # A render-mode change to anything but mode 3 ends suppression.
            # The old test 'line[0] != b"3"' compared an int with bytes and
            # was therefore always true; startswith() compares bytes to bytes.
            if line.endswith(b"Tr") and not line.startswith(b"3"):
                suppress = False  # text rendering changed
                out_lines.append(line)
                continue
            if line == b"Q":  # unstack command also switches off
                suppress = False
                out_lines.append(line)
                continue
            if suppress and in_text:  # suppress hidden lines
                continue
            out_lines.append(line)
        return out_lines if make_return else None

    if not doc.is_pdf:  # only works for PDF
        raise ValueError("is no PDF")
    if doc.is_encrypted or doc.is_closed:
        raise ValueError("closed or encrypted doc")

    if not clean_pages:
        hidden_text = False
        redactions = False

    if metadata:
        doc.set_metadata({})  # remove standard metadata

    for page in doc:
        if reset_fields:
            # reset form fields (widgets)
            for widget in page.widgets():
                widget.reset()

        if remove_links:
            links = page.get_links()  # list of all links on page
            for link in links:  # remove all links
                page.delete_link(link)

        found_redacts = False
        for annot in page.annots():
            if annot.type[0] == mupdf.PDF_ANNOT_FILE_ATTACHMENT and attached_files:
                annot.update_file(buffer_=b" ")  # set file content to empty
            if reset_responses:
                annot.delete_responses()
            if annot.type[0] == mupdf.PDF_ANNOT_REDACT:  # pylint: disable=no-member
                found_redacts = True

        if redactions and found_redacts:
            page.apply_redactions(images=redact_images)

        # Content cleaning is optional. It is nested instead of using
        # 'continue' so that the thumbnail step below runs for every page;
        # previously it was skipped when clean_pages was False or the page
        # had no contents.
        if clean_pages or hidden_text:
            page.clean_contents()
            xrefs = page.get_contents()
            if hidden_text and xrefs:
                assert len(xrefs) == 1  # only one because of cleaning.
                xref = xrefs[0]
                cont = doc.xref_stream(xref)
                cont_lines = remove_hidden(cont.splitlines())  # remove hidden text
                if cont_lines:  # something was actually removed
                    cont = b"\n".join(cont_lines)
                    doc.update_stream(xref, cont)  # rewrite the page /Contents

        if thumbnails:  # remove page thumbnails?
            if doc.xref_get_key(page.xref, "Thumb")[0] != "null":
                doc.xref_set_key(page.xref, "Thumb", "null")

    # pages are scrubbed, now perform document-wide scrubbing
    # remove embedded files
    if embedded_files:
        for name in doc.embfile_names():
            doc.embfile_del(name)

    if xml_metadata:
        doc.del_xml_metadata()
    if not (xml_metadata or javascript):
        xref_limit = 0  # nothing to look for: skip the xref scan entirely
    else:
        xref_limit = doc.xref_length()
    for xref in range(1, xref_limit):
        if not doc.xref_object(xref):
            msg = "bad xref %i - clean PDF before scrubbing" % xref
            raise ValueError(msg)
        if javascript and doc.xref_get_key(xref, "S")[1] == "/JavaScript":
            # a /JavaScript action object
            obj = "<</S/JavaScript/JS()>>"  # replace with a null JavaScript
            doc.update_object(xref, obj)  # update this object
            continue  # no further handling

        if not xml_metadata:
            continue

        if doc.xref_get_key(xref, "Type")[1] == "/Metadata":
            # delete any metadata object directly
            doc.update_object(xref, "<<>>")
            doc.update_stream(xref, b"deleted", new=True)
            continue

        if doc.xref_get_key(xref, "Metadata")[0] != "null":
            doc.xref_set_key(xref, "Metadata", "null")
6699
def search_page_for(
        doc: 'Document',
        pno: int,
        text: str,
        quads: bool = False,
        clip: rect_like = None,
        flags: int = None,
        textpage: 'TextPage' = None,
        ) -> list:
    """Look for a string on the page with the given number.

    Args:
        pno: page number
        text: string to be searched for
        quads: (bool) return quads instead of rectangles
        clip: restrict search to this rectangle
        flags: bit switches; when None, the combination of
            TEXT_DEHYPHENATE, TEXT_PRESERVE_LIGATURES,
            TEXT_PRESERVE_WHITESPACE and TEXT_MEDIABOX_CLIP is used.
        textpage: reuse a prepared textpage
    Returns:
        The result of page.search_for(): a list of rectangles or quads,
        each containing an occurrence.
    """
    if flags is None:
        default_flags = (
            TEXT_DEHYPHENATE,
            TEXT_PRESERVE_LIGATURES,
            TEXT_PRESERVE_WHITESPACE,
            TEXT_MEDIABOX_CLIP,
        )
        flags = 0
        for bit in default_flags:
            flags = flags | bit
    page = doc[pno]
    return page.search_for(
        text,
        quads=quads,
        clip=clip,
        flags=flags,
        textpage=textpage,
    )
6735
5672 def select(self, pyliste): 6736 def select(self, pyliste):
5673 """Build sub-pdf with page numbers in the list.""" 6737 """Build sub-pdf with page numbers in the list."""
5674 if self.is_closed or self.is_encrypted: 6738 if self.is_closed or self.is_encrypted:
5675 raise ValueError("document closed or encrypted") 6739 raise ValueError("document closed or encrypted")
5676 if not self.is_pdf: 6740 if not self.is_pdf:
5811 pdfdict += f"/{key} {value}" 6875 pdfdict += f"/{key} {value}"
5812 pdfdict += ">>" 6876 pdfdict += ">>"
5813 self.xref_set_key(xref, "MarkInfo", pdfdict) 6877 self.xref_set_key(xref, "MarkInfo", pdfdict)
5814 return True 6878 return True
5815 6879
def set_metadata(doc: 'Document', m: dict = None) -> None:
    """Write the given values into the PDF /Info object.

    Args:
        m: a dictionary like doc.metadata. None or {} removes the /Info
           reference. The keys 'format' and 'encryption' are accepted
           but not written.
    Raises:
        ValueError: document is no PDF, closed or encrypted; 'm' is not a
            dict; or 'm' contains unknown keys.
    """
    if not doc.is_pdf:
        raise ValueError("is no PDF")
    if doc.is_closed or doc.is_encrypted:
        raise ValueError("document closed or encrypted")
    if m is None:
        m = {}
    elif type(m) is not dict:
        raise ValueError("bad metadata")

    # metadata key -> name in the /Info dictionary (None: not stored there)
    pdf_names = {
        "author": "Author",
        "producer": "Producer",
        "creator": "Creator",
        "title": "Title",
        "format": None,
        "encryption": None,
        "creationDate": "CreationDate",
        "modDate": "ModDate",
        "subject": "Subject",
        "keywords": "Keywords",
        "trapped": "Trapped",
    }
    unknown = set(m.keys()) - set(pdf_names.keys())
    if unknown:
        raise ValueError("bad dict key(s): %s" % unknown)

    # locate the current /Info object via the trailer (xref -1)
    kind, value = doc.xref_get_key(-1, "Info")
    info_xref = int(value.replace("0 R", "")) if kind == "xref" else 0

    if not m:
        if info_xref != 0:  # remove existing metadata
            doc.xref_set_key(-1, "Info", "null")
            doc.init_doc()
        return  # otherwise nothing to do

    if info_xref == 0:  # no prev metadata: get new xref
        info_xref = doc.get_new_xref()
        doc.update_object(info_xref, "<<>>")  # fill it with empty object
        doc.xref_set_key(-1, "Info", "%i 0 R" % info_xref)

    for key, val in m.items():
        pdf_key = pdf_names[key]
        if pdf_key is None:
            continue
        if not bool(val) or val in ("none", "null"):
            pdf_val = "null"
        else:
            pdf_val = get_pdf_str(val)
        doc.xref_set_key(info_xref, pdf_key, pdf_val)
    doc.init_doc()
    return
6940
def set_oc(doc: 'Document', xref: int, oc: int) -> None:
    """Attach optional content object to image or form xobject.

    Args:
        xref: (int) xref number of an image or form xobject
        oc: (int) xref number of an OCG or OCMD. Zero removes an existing
            /OC entry and is a no-op if there is none.
    Raises:
        ValueError: document closed or encrypted, 'xref' is not an /Image
            or /Form object, or 'oc' > 0 is not an /OCG or /OCMD object.
    """
    if doc.is_closed or doc.is_encrypted:
        raise ValueError("document close or encrypted")
    t, name = doc.xref_get_key(xref, "Subtype")
    if t != "name" or name not in ("/Image", "/Form"):
        raise ValueError("bad object type at xref %i" % xref)
    if oc > 0:
        t, name = doc.xref_get_key(oc, "Type")
        if t != "name" or name not in ("/OCG", "/OCMD"):
            raise ValueError("bad object type at xref %i" % oc)
    if oc == 0:
        # Removal request. Previously, when no /OC key existed, control
        # fell through and wrote the invalid reference "0 0 R".
        if "OC" in doc.xref_get_keys(xref):
            doc.xref_set_key(xref, "OC", "null")
        return None
    doc.xref_set_key(xref, "OC", "%i 0 R" % oc)
    return None
6962
def set_ocmd(
        doc: 'Document',
        xref: int = 0,
        ocgs: typing.Union[list, None] = None,
        policy: OptStr = None,
        ve: typing.Union[list, None] = None,
        ) -> int:
    """Create or update an OCMD object in a PDF document.

    Args:
        xref: (int) 0 for creating a new object, otherwise update existing one.
        ocgs: (list) OCG xref numbers, which shall be subject to 'policy'.
        policy: one of 'AllOn', 'AllOff', 'AnyOn', 'AnyOff' (any casing).
        ve: (list) visibility expression. Use instead of 'ocgs' with 'policy'.

    Returns:
        Xref of the created or updated OCMD.

    Raises:
        ValueError: unknown OCG xrefs, bad policy, malformed visibility
            expression, or 'xref' does not point to an OCMD.
    """

    all_ocgs = set(doc.get_ocgs().keys())

    def ve_maker(ve):
        """Convert a nested [operator, operand, ...] sequence to a PDF array string."""
        # Messages are formatted with a 1-tuple: a bare '% ve' with a tuple
        # 've' raised TypeError (or printed only its single element) instead
        # of the intended ValueError.
        if type(ve) not in (list, tuple) or len(ve) < 2:
            raise ValueError("bad 've' format: %s" % (ve,))
        # A non-string operator used to fail with AttributeError on .lower().
        if not isinstance(ve[0], str) or ve[0].lower() not in ("and", "or", "not"):
            raise ValueError("bad operand: %s" % (ve[0],))
        if ve[0].lower() == "not" and len(ve) != 2:
            raise ValueError("bad 've' format: %s" % (ve,))
        item = "[/%s" % ve[0].title()
        for x in ve[1:]:
            if type(x) is int:
                if x not in all_ocgs:
                    raise ValueError("bad OCG %i" % x)
                item += " %i 0 R" % x
            else:  # anything else must be a nested expression
                item += " %s" % ve_maker(x)
        item += "]"
        return item

    text = "<</Type/OCMD"

    if ocgs and type(ocgs) in (list, tuple):  # some OCGs are provided
        s = set(ocgs).difference(all_ocgs)  # contains illegal xrefs
        if s != set():
            msg = "bad OCGs: %s" % s
            raise ValueError(msg)
        text += "/OCGs[" + " ".join(map(lambda x: "%i 0 R" % x, ocgs)) + "]"

    if policy:
        policy = str(policy).lower()
        pols = {
            "anyon": "AnyOn",
            "allon": "AllOn",
            "anyoff": "AnyOff",
            "alloff": "AllOff",
        }
        if policy not in pols:
            raise ValueError("bad policy: %s" % policy)
        text += "/P/%s" % pols[policy]

    if ve:
        text += "/VE%s" % ve_maker(ve)

    text += ">>"

    # make new object or replace old OCMD (check type first)
    if xref == 0:
        xref = doc.get_new_xref()
    elif "/Type/OCMD" not in doc.xref_object(xref, compressed=True):
        raise ValueError("bad xref or not an OCMD")
    doc.update_object(xref, text)
    return xref
7035
5816 def set_pagelayout(self, pagelayout: str): 7036 def set_pagelayout(self, pagelayout: str):
5817 """Set the PDF PageLayout value.""" 7037 """Set the PDF PageLayout value."""
5818 valid = ("SinglePage", "OneColumn", "TwoColumnLeft", "TwoColumnRight", "TwoPageLeft", "TwoPageRight") 7038 valid = ("SinglePage", "OneColumn", "TwoColumnLeft", "TwoColumnRight", "TwoPageLeft", "TwoPageRight")
5819 xref = self.pdf_catalog() 7039 xref = self.pdf_catalog()
5820 if xref == 0: 7040 if xref == 0:
5842 for v in valid: 7062 for v in valid:
5843 if pagemode.lower() == v.lower(): 7063 if pagemode.lower() == v.lower():
5844 self.xref_set_key(xref, "PageMode", f"/{v}") 7064 self.xref_set_key(xref, "PageMode", f"/{v}")
5845 return True 7065 return True
5846 raise ValueError("bad PageMode value") 7066 raise ValueError("bad PageMode value")
7067
def set_page_labels(doc, labels):
    """Add / replace page label definitions in PDF document.

    Args:
        doc: PDF document (resp. 'self').
        labels: sequence of label dictionaries like:
        {'startpage': int, 'prefix': str, 'style': str, 'firstpagenum': int},
        as returned by get_page_labels(). The sequence is not modified.
    """
    # William Chapman, 2021-01-06

    def create_label_str(label):
        """Convert Python label dict to corresponding PDF rule string.

        Args:
            label: (dict) build rule for the label.
        Returns:
            The start page number followed by the PDF label rule wrapped
            in "<<", ">>".
        """
        s = "%i<<" % label["startpage"]
        if label.get("prefix", "") != "":
            s += "/P(%s)" % label["prefix"]
        if label.get("style", "") != "":
            s += "/S/%s" % label["style"]
        if label.get("firstpagenum", 1) > 1:  # 1 is implied, so not written
            s += "/St %i" % label["firstpagenum"]
        s += ">>"
        return s

    def create_nums(labels):
        """Return concatenated string of all labels rules.

        Args:
            labels: (sequence) label dictionaries.
        Returns:
            PDF compatible string for page label definitions, ready to be
            enclosed in PDF array 'Nums[...]'.
        """
        # sorted() instead of list.sort(): the caller's list is no longer
        # reordered in place, and tuples are accepted as well.
        ordered = sorted(labels, key=lambda x: x["startpage"])
        return "".join(create_label_str(label) for label in ordered)

    doc._set_page_labels(create_nums(labels))
7111
def set_toc(
        doc: 'Document',
        toc: list,
        collapse: int = 1,
        ) -> int:
    """Create new outline tree (table of contents, TOC).

    Any existing outline is deleted first. Each TOC row then becomes one
    outline object, linked to its neighbours through the parent / first /
    last / prev / next entries.

    Args:
        toc: (list, tuple) each entry must contain level, title, page and
            optionally top margin on the page. None or '()' remove the TOC.
            The 4th item may also be a dictionary, in which case its keys
            "to", "kind", "color", "italic", "bold" and "collapse" are read here.
        collapse: (int) collapses entries beyond this level. Zero or None
            shows all entries unfolded.
    Returns:
        the number of inserted items, or the number of removed items respectively.
    Raises:
        ValueError: document closed, encrypted or no PDF; or 'toc' fails
            one of the validity checks below.
    """
    if doc.is_closed or doc.is_encrypted:
        raise ValueError("document closed or encrypted")
    if not doc.is_pdf:
        raise ValueError("is no PDF")
    if not toc:  # remove all entries
        return len(doc._delToC())

    # validity checks --------------------------------------------------------
    if type(toc) not in (list, tuple):
        raise ValueError("'toc' must be list or tuple")
    toclen = len(toc)
    page_count = doc.page_count
    t0 = toc[0]
    if type(t0) not in (list, tuple):
        raise ValueError("items must be sequences of 3 or 4 items")
    if t0[0] != 1:
        raise ValueError("hierarchy level of item 0 must be 1")
    # Rows are checked pairwise: row i (t1) for its page number, row i+1 (t2)
    # for its shape and for not jumping more than one level deeper than t1.
    # NOTE(review): the page number of the last row and the length of row 0
    # are never checked here; the page number is clamped further below.
    for i in list(range(toclen - 1)):
        t1 = toc[i]
        t2 = toc[i + 1]
        if not -1 <= t1[2] <= page_count:
            raise ValueError("row %i: page number out of range" % i)
        if (type(t2) not in (list, tuple)) or len(t2) not in (3, 4):
            raise ValueError("bad row %i" % (i + 1))
        if (type(t2[0]) is not int) or t2[0] < 1:
            raise ValueError("bad hierarchy level in row %i" % (i + 1))
        if t2[0] > t1[0] + 1:
            raise ValueError("bad hierarchy level in row %i" % (i + 1))
    # no formal errors in toc --------------------------------------------------

    # --------------------------------------------------------------------------
    # make a list of xref numbers, which we can use for our TOC entries
    # --------------------------------------------------------------------------
    old_xrefs = doc._delToC()  # del old outlines, get their xref numbers

    # prepare table of xrefs for new bookmarks
    # NOTE(review): the xrefs returned by _delToC() are discarded right here,
    # so every item receives a fresh xref below - presumably intentional,
    # confirm before changing.
    old_xrefs = []
    xref = [0] + old_xrefs
    xref[0] = doc._getOLRootNumber()  # entry zero is outline root xref number
    if toclen > len(old_xrefs):  # too few old xrefs?
        for i in range((toclen - len(old_xrefs))):
            xref.append(doc.get_new_xref())  # acquire new ones

    # maps a hierarchy level to the olitems index of the last entry seen on
    # that level; level 0 is the outline root at index 0
    lvltab = {0: 0}  # to store last entry per hierarchy level

    # ------------------------------------------------------------------------------
    # contains new outline objects as strings - first one is the outline root
    # ------------------------------------------------------------------------------
    olitems = [{"count": 0, "first": -1, "last": -1, "xref": xref[0]}]
    # ------------------------------------------------------------------------------
    # build olitems as a list of PDF-like connected dictionaries
    # ------------------------------------------------------------------------------
    for i in range(toclen):
        o = toc[i]
        lvl = o[0]  # level
        title = get_pdf_str(o[1])  # title
        # 1-based page number -> 0-based, clamped into the valid page range
        pno = min(doc.page_count - 1, max(0, o[2] - 1))  # page number
        page_xref = doc.page_xref(pno)
        page_height = doc.page_cropbox(pno).height
        top = Point(72, page_height - 36)
        dest_dict = {"to": top, "kind": LINK_GOTO}  # fall back target
        if o[2] < 0:
            dest_dict["kind"] = LINK_NONE
        if len(o) > 3:  # some target is specified
            if type(o[3]) in (int, float):  # convert a number to a point
                dest_dict["to"] = Point(72, page_height - o[3])
            else:  # if something else, make sure we have a dict
                # We make a copy of o[3] to avoid modifying our caller's data.
                dest_dict = o[3].copy() if type(o[3]) is dict else dest_dict
                if "to" not in dest_dict:  # target point not in dict?
                    dest_dict["to"] = top  # put default in
                else:  # transform target to PDF coordinates
                    page = doc[pno]
                    point = Point(dest_dict["to"])
                    point.y = page.cropbox.height - point.y
                    point = point * page.rotation_matrix
                    dest_dict["to"] = (point.x, point.y)
        # "first", "last", "prev", "next" and "parent" hold indices into
        # olitems (-1: none); they are translated to xrefs when the objects
        # are written further below.
        d = {}
        d["first"] = -1
        d["count"] = 0
        d["last"] = -1
        d["prev"] = -1
        d["next"] = -1
        d["dest"] = utils.getDestStr(page_xref, dest_dict)
        d["top"] = dest_dict["to"]
        d["title"] = title
        d["parent"] = lvltab[lvl - 1]
        d["xref"] = xref[i + 1]
        d["color"] = dest_dict.get("color")
        # flag bits: 1 for italic, 2 for bold
        d["flags"] = dest_dict.get("italic", 0) + 2 * dest_dict.get("bold", 0)
        lvltab[lvl] = i + 1
        parent = olitems[lvltab[lvl - 1]]  # the parent entry

        # 'and' binds tighter than 'or': suppress if the item asks for it,
        # or if a collapse level is set and this item lies below it
        if (
            dest_dict.get("collapse") or collapse and lvl > collapse
        ):  # suppress expansion
            parent["count"] -= 1  # make /Count negative
        else:
            parent["count"] += 1  # positive /Count

        if parent["first"] == -1:  # first child of this parent
            parent["first"] = i + 1
            parent["last"] = i + 1
        else:  # append behind the parent's current last child
            d["prev"] = parent["last"]
            prev = olitems[parent["last"]]
            prev["next"] = i + 1
            parent["last"] = i + 1
        olitems.append(d)

    # ------------------------------------------------------------------------------
    # now create each outline item as a string and insert it in the PDF
    # ------------------------------------------------------------------------------
    # The root dictionary (index 0) has no "dest", "next", "parent", "prev"
    # or "title" key; the resulting KeyError is swallowed by the try/except
    # blocks below, so those entries are simply left out for the root.
    for i, ol in enumerate(olitems):
        txt = "<<"
        if ol["count"] != 0:
            txt += "/Count %i" % ol["count"]
        try:
            txt += ol["dest"]
        except Exception:
            # Verbose in PyMuPDF/tests.
            if g_exceptions_verbose >= 2: exception_info()
            pass
        try:
            if ol["first"] > -1:
                txt += "/First %i 0 R" % xref[ol["first"]]
        except Exception:
            if g_exceptions_verbose >= 2: exception_info()
            pass
        try:
            if ol["last"] > -1:
                txt += "/Last %i 0 R" % xref[ol["last"]]
        except Exception:
            if g_exceptions_verbose >= 2: exception_info()
            pass
        try:
            if ol["next"] > -1:
                txt += "/Next %i 0 R" % xref[ol["next"]]
        except Exception:
            # Verbose in PyMuPDF/tests.
            if g_exceptions_verbose >= 2: exception_info()
            pass
        try:
            if ol["parent"] > -1:
                txt += "/Parent %i 0 R" % xref[ol["parent"]]
        except Exception:
            # Verbose in PyMuPDF/tests.
            if g_exceptions_verbose >= 2: exception_info()
            pass
        try:
            if ol["prev"] > -1:
                txt += "/Prev %i 0 R" % xref[ol["prev"]]
        except Exception:
            # Verbose in PyMuPDF/tests.
            if g_exceptions_verbose >= 2: exception_info()
            pass
        try:
            txt += "/Title" + ol["title"]
        except Exception:
            # Verbose in PyMuPDF/tests.
            if g_exceptions_verbose >= 2: exception_info()
            pass

        # a color is only written if it has exactly three components
        if ol.get("color") and len(ol["color"]) == 3:
            txt += f"/C[ {_format_g(tuple(ol['color']))}]"
        if ol.get("flags", 0) > 0:
            txt += "/F %i" % ol["flags"]

        if i == 0:  # special: this is the outline root
            txt += "/Type/Outlines"  # so add the /Type entry
        txt += ">>"
        doc.update_object(xref[i], txt)  # insert the PDF object

    doc.init_doc()
    return toclen
7302
def set_toc_item(
        doc: 'Document',
        idx: int,
        dest_dict: OptDict = None,
        kind: OptInt = None,
        pno: OptInt = None,
        uri: OptStr = None,
        title: OptStr = None,
        to: point_like = None,
        filename: OptStr = None,
        zoom: float = 0,
        ) -> None:
    """Update TOC item by index.

    It allows changing the item's title and link destination.

    Args:
        idx:
            (int) desired index of the TOC list, as created by get_toc.
        dest_dict:
            (dict) destination dictionary as created by get_toc(False).
            Outrules all other parameters. If None, the remaining parameters
            are used to make a dest dictionary. The dictionary passed in
            is not modified.
        kind:
            (int) kind of link (pymupdf.LINK_GOTO, etc.). If None, then only
            the title will be updated. If pymupdf.LINK_NONE, the TOC item will
            be deleted.
        pno:
            (int) page number (1-based like in get_toc). Required if
            pymupdf.LINK_GOTO.
        uri:
            (str) the URL, required if pymupdf.LINK_URI.
        title:
            (str) the new title. No change if None.
        to:
            (point-like) destination on the target page. If omitted, (72, 36)
            will be used as target coordinates.
        filename:
            (str) destination filename, required for pymupdf.LINK_GOTOR and
            pymupdf.LINK_LAUNCH.
        zoom:
            (float) a zoom factor for the target location (pymupdf.LINK_GOTO).

    Raises:
        ValueError: bad page number, bad color value, or the destination
            could not be converted to a bookmark action.
    """
    xref = doc.get_outline_xrefs()[idx]
    page_xref = 0
    if type(dest_dict) is dict:
        # Work on a shallow copy, as set_toc does for its rows: the code
        # below replaces the "to" entry, which used to alter the caller's
        # dictionary (and its Point) so that a second call with the same
        # dictionary flipped the y coordinate back again.
        dest_dict = dest_dict.copy()
        if dest_dict["kind"] == LINK_GOTO:
            pno = dest_dict["page"]
            page_xref = doc.page_xref(pno)
            page_height = doc.page_cropbox(pno).height
            # Point(...) makes a private copy and also accepts a plain
            # (x, y) sequence.
            to = Point(dest_dict.get('to', Point(72, 36)))
            to.y = page_height - to.y  # transform to PDF coordinates
            dest_dict["to"] = to
        action = utils.getDestStr(page_xref, dest_dict)
        if not action.startswith("/A"):
            raise ValueError("bad bookmark dest")
        color = dest_dict.get("color")
        if color:
            color = list(map(float, color))
            if len(color) != 3 or min(color) < 0 or max(color) > 1:
                raise ValueError("bad color value")
        bold = dest_dict.get("bold", False)
        italic = dest_dict.get("italic", False)
        flags = italic + 2 * bold  # flag bits: 1 for italic, 2 for bold
        collapse = dest_dict.get("collapse")
        return doc._update_toc_item(
            xref,
            action=action[2:],  # drop the leading "/A"
            title=title,
            color=color,
            flags=flags,
            collapse=collapse,
        )

    if kind == LINK_NONE:  # delete bookmark item
        return doc.del_toc_item(idx)
    if kind is None and title is None:  # treat as no-op
        return None
    if kind is None:  # only update title text
        return doc._update_toc_item(xref, action=None, title=title)

    if kind == LINK_GOTO:
        if pno is None or pno not in range(1, doc.page_count + 1):
            raise ValueError("bad page number")
        page_xref = doc.page_xref(pno - 1)
        page_height = doc.page_cropbox(pno - 1).height
        if to is None:
            to = Point(72, page_height - 36)
        else:
            to = Point(to)
            to.y = page_height - to.y  # transform to PDF coordinates

    ddict = {
        "kind": kind,
        "to": to,
        "uri": uri,
        "page": pno,
        "file": filename,
        "zoom": zoom,
    }
    action = utils.getDestStr(page_xref, ddict)
    if action == "" or not action.startswith("/A"):
        raise ValueError("bad bookmark dest")

    return doc._update_toc_item(xref, action=action[2:], title=title)
5847 7410
5848 def set_xml_metadata(self, metadata): 7411 def set_xml_metadata(self, metadata):
5849 """Store XML document level metadata.""" 7412 """Store XML document level metadata."""
5850 if self.is_closed or self.is_encrypted: 7413 if self.is_closed or self.is_encrypted:
5851 raise ValueError("document closed or encrypted") 7414 raise ValueError("document closed or encrypted")
5860 else: 7423 else:
5861 xml = mupdf.pdf_add_stream( pdf, res, mupdf.PdfObj(), 0) 7424 xml = mupdf.pdf_add_stream( pdf, res, mupdf.PdfObj(), 0)
5862 mupdf.pdf_dict_put( xml, PDF_NAME('Type'), PDF_NAME('Metadata')) 7425 mupdf.pdf_dict_put( xml, PDF_NAME('Type'), PDF_NAME('Metadata'))
5863 mupdf.pdf_dict_put( xml, PDF_NAME('Subtype'), PDF_NAME('XML')) 7426 mupdf.pdf_dict_put( xml, PDF_NAME('Subtype'), PDF_NAME('XML'))
5864 mupdf.pdf_dict_put( root, PDF_NAME('Metadata'), xml) 7427 mupdf.pdf_dict_put( root, PDF_NAME('Metadata'), xml)
7428
def subset_fonts(doc: 'Document', verbose: bool = False, fallback: bool = False) -> OptInt:
    """Build font subsets in a PDF.

    Eligible fonts are potentially replaced by smaller versions. Page text is
    NOT rewritten and thus should retain properties like being hidden or
    controlled by optional content.

    This method by default uses MuPDF's own internal feature to create subset
    fonts. As this is a new function, errors may still occur. In this case,
    please fall back to using the previous version by using "fallback=True".
    Fallback mode requires the external package 'fontTools'.

    Args:
        doc: the PDF document whose fonts should be subsetted.
        verbose: only used by fallback mode; emit progress messages.
        fallback: use the older deprecated implementation.

    Returns:
        The new MuPDF-based code returns None. The deprecated fallback
        mode returns 0 if there are no fonts to subset. Otherwise, it
        returns the decrease in fontsize (the difference in fontsize),
        measured in bytes.

    Raises:
        ImportError: in fallback mode if 'fontTools' is not installed.
    """
    # Font binaries: - "buffer" -> (names, xrefs, (unicodes, glyphs))
    # An embedded font is uniquely defined by its fontbuffer only. It may have
    # multiple names and xrefs.
    # Once the sets of used unicodes and glyphs are known, we compute a
    # smaller version of the buffer using package fontTools.

    if not fallback:  # by default use MuPDF function
        pdf = mupdf.pdf_document_from_fz_document(doc)
        mupdf.pdf_subset_fonts2(pdf, list(range(doc.page_count)))
        return

    font_buffers = {}

    def get_old_widths(xref):
        """Retrieve old font '/W' and '/DW' values.

        Returns a pair (widths, dwidths); an entry is None if the key is
        missing or has an unexpected type.
        """
        df = doc.xref_get_key(xref, "DescendantFonts")
        if df[0] != "array":  # only handle xref specifications
            return None, None
        df_xref = int(df[1][1:-1].replace("0 R", ""))
        widths = doc.xref_get_key(df_xref, "W")
        widths = widths[1] if widths[0] == "array" else None  # no widths key found
        dwidths = doc.xref_get_key(df_xref, "DW")
        dwidths = dwidths[1] if dwidths[0] == "int" else None
        return widths, dwidths

    def set_old_widths(xref, widths, dwidths):
        """Restore the old '/W' and '/DW' in subsetted font.

        If either parameter is None or evaluates to False, the corresponding
        dictionary key will be set to null (nothing is written if the key
        already is null).
        """
        df = doc.xref_get_key(xref, "DescendantFonts")
        if df[0] != "array":  # only handle xref specs
            return None
        df_xref = int(df[1][1:-1].replace("0 R", ""))
        for key, value in (("W", widths), ("DW", dwidths)):
            if type(value) is str and value:
                doc.xref_set_key(df_xref, key, value)
            elif doc.xref_get_key(df_xref, key)[0] != "null":
                doc.xref_set_key(df_xref, key, "null")
            # else: no old value and key already null. Never hand a
            # non-string value to xref_set_key.
        return None

    def set_subset_fontname(new_xref):
        """Generate a name prefix to tag a font as subset.

        We use a random generator to select 6 upper case ASCII characters.
        The prefixed name must be put in the font xref as the "/BaseFont" value
        and in the FontDescriptor object as the '/FontName' value.
        """
        # The following generates a prefix like 'ABCDEF+'
        import random
        import string
        prefix = "".join(random.choices(tuple(string.ascii_uppercase), k=6)) + "+"
        font_str = doc.xref_object(new_xref, compressed=True)
        font_str = font_str.replace("/BaseFont/", "/BaseFont/" + prefix)
        df = doc.xref_get_key(new_xref, "DescendantFonts")
        if df[0] == "array":
            df_xref = int(df[1][1:-1].replace("0 R", ""))
            fd = doc.xref_get_key(df_xref, "FontDescriptor")
            if fd[0] == "xref":
                fd_xref = int(fd[1].replace("0 R", ""))
                fd_str = doc.xref_object(fd_xref, compressed=True)
                fd_str = fd_str.replace("/FontName/", "/FontName/" + prefix)
                doc.update_object(fd_xref, fd_str)
        doc.update_object(new_xref, font_str)

    def build_subset(buffer, unc_set, gid_set):
        """Build font subset using fontTools.

        Args:
            buffer: (bytes) the font given as a binary buffer.
            unc_set: (set) required unicodes. Modified in place.
            gid_set: (set) required glyph ids. Used instead of 'unc_set' if
                the error unicode 0xFFFD occurs. Modified in place.
        Returns:
            Either None if subsetting is unsuccessful or the subset font buffer.
        """
        try:
            import fontTools.subset as fts
        except ImportError:
            if g_exceptions_verbose: exception_info()
            message("This method requires fontTools to be installed.")
            raise
        import tempfile
        with tempfile.TemporaryDirectory() as tmp_dir:
            oldfont_path = f"{tmp_dir}/oldfont.ttf"
            newfont_path = f"{tmp_dir}/newfont.ttf"
            uncfile_path = f"{tmp_dir}/uncfile.txt"
            args = [
                oldfont_path,
                "--retain-gids",
                f"--output-file={newfont_path}",
                "--layout-features=*",
                "--passthrough-tables",
                "--ignore-missing-glyphs",
                "--ignore-missing-unicodes",
                "--symbol-cmap",
            ]

            # store glyph ids or unicodes as file
            with open(uncfile_path, "w", encoding='utf8') as unc_file:
                if 0xFFFD in unc_set:  # error unicode exists -> use glyphs
                    args.append(f"--gids-file={uncfile_path}")
                    gid_set.add(189)
                    unc_file.writelines("%i\n" % gid for gid in list(gid_set))
                else:
                    args.append(f"--unicodes-file={uncfile_path}")
                    unc_set.add(255)
                    unc_file.writelines("%04x\n" % unc for unc in list(unc_set))

            # store fontbuffer as a file
            with open(oldfont_path, "wb") as fontfile:
                fontfile.write(buffer)
            try:
                os.remove(newfont_path)  # remove old file
            except OSError:
                pass  # normally nothing to remove
            try:  # invoke fontTools subsetter
                fts.main(args)
                font = Font(fontfile=newfont_path)
                new_buffer = font.buffer  # subset font binary
                if font.glyph_count == 0:  # intercept empty font
                    new_buffer = None
            except Exception:
                exception_info()
                new_buffer = None
        return new_buffer

    def repl_fontnames(doc):
        """Populate 'font_buffers'.

        For each font candidate, store its xref and the list of names
        by which PDF text may refer to it (there may be multiple).
        """

        def norm_name(name):
            """Recreate font name that contains PDF hex codes.

            E.g. #20 -> space, chr(32). Only '#' followed by two hex digits
            is decoded; each escape is decoded exactly once, so '#23' yields
            a literal '#' and a stray '#' is left unchanged.
            """
            return re.sub(
                r"#([0-9A-Fa-f]{2})",
                lambda m: chr(int(m.group(1), 16)),
                name,
            )

        def get_fontnames(doc, item):
            """Return a list of fontnames for an item of page.get_fonts().

            There may be multiple names e.g. for Type0 fonts.
            """
            fontname = item[3]
            names = [fontname]
            fontname = doc.xref_get_key(item[0], "BaseFont")[1][1:]
            fontname = norm_name(fontname)
            if fontname not in names:
                names.append(fontname)
            descendents = doc.xref_get_key(item[0], "DescendantFonts")
            if descendents[0] != "array":
                return names
            descendents = descendents[1][1:-1]
            if descendents.endswith(" 0 R"):
                xref = int(descendents[:-4])
                descendents = doc.xref_object(xref, compressed=True)
            p1 = descendents.find("/BaseFont")
            if p1 >= 0:
                p2 = descendents.find("/", p1 + 1)  # start of the name value
                if p2 >= 0:
                    # The name ends at the next key or at the dictionary end.
                    # Ignore "not found" (-1) results instead of slicing with them.
                    ends = [
                        p
                        for p in (
                            descendents.find("/", p2 + 1),
                            descendents.find(">>", p2 + 1),
                        )
                        if p >= 0
                    ]
                    p1 = min(ends) if ends else len(descendents)
                    fontname = norm_name(descendents[p2 + 1 : p1])
                    if fontname not in names:
                        names.append(fontname)
            return names

        for i in range(doc.page_count):
            for f in doc.get_page_fonts(i, full=True):
                font_xref = f[0]  # font xref
                font_ext = f[1]  # font file extension
                basename = f[3]  # font basename

                if font_ext not in (  # skip if not supported by fontTools
                    "otf",
                    "ttf",
                    "woff",
                    "woff2",
                ):
                    continue
                # skip fonts which already are subsets
                if len(basename) > 6 and basename[6] == "+":
                    continue

                extr = doc.extract_font(font_xref)
                fontbuffer = extr[-1]
                names = get_fontnames(doc, f)
                name_set, xref_set, subsets = font_buffers.get(
                    fontbuffer, (set(), set(), (set(), set()))
                )
                xref_set.add(font_xref)
                for name in names:
                    name_set.add(name)
                font = Font(fontbuffer=fontbuffer)
                name_set.add(font.name)
                del font
                font_buffers[fontbuffer] = (name_set, xref_set, subsets)

    def find_buffer_by_name(name):
        """Return the font buffer known under 'name', or None."""
        for buffer, (name_set, _, _) in font_buffers.items():
            if name in name_set:
                return buffer
        return None

    # -----------------
    # main function
    # -----------------
    repl_fontnames(doc)  # populate font information
    if not font_buffers:  # nothing found to do
        if verbose:
            message('No fonts to subset.')
        return 0

    old_fontsize = sum(len(fontbuffer) for fontbuffer in font_buffers)
    new_fontsize = 0

    # Scan page text for usage of subsettable fonts
    for page in doc:
        # go through the text and extend set of used glyphs by font
        # we use a modified MuPDF trace device, which delivers us glyph ids.
        for span in page.get_texttrace():
            if type(span) is not dict:  # skip useless information
                continue
            fontname = span["font"][:33]  # fontname for the span
            buffer = find_buffer_by_name(fontname)
            if buffer is None:
                continue
            name_set, xref_set, (set_ucs, set_gid) = font_buffers[buffer]
            for c in span["chars"]:
                set_ucs.add(c[0])  # unicode
                set_gid.add(c[1])  # glyph id
            font_buffers[buffer] = (name_set, xref_set, (set_ucs, set_gid))

    # build the font subsets
    for old_buffer, (name_set, xref_set, subsets) in font_buffers.items():
        new_buffer = build_subset(old_buffer, subsets[0], subsets[1])
        fontname = list(name_set)[0]
        if new_buffer is None or len(new_buffer) >= len(old_buffer):
            # subset was not created or did not get smaller
            if verbose:
                message(f'Cannot subset {fontname!r}.')
            continue
        if verbose:
            message(f"Built subset of font {fontname!r}.")
        val = doc._insert_font(fontbuffer=new_buffer)  # store subset font in PDF
        new_xref = val[0]  # get its xref
        set_subset_fontname(new_xref)  # tag fontname as subset font
        font_str = doc.xref_object(  # get its object definition
            new_xref,
            compressed=True,
        )
        # walk through the original font xrefs and replace each by the subset def
        for font_xref in xref_set:
            # we need the original '/W' and '/DW' width values
            width_table, def_width = get_old_widths(font_xref)
            # ... and replace original font definition at xref with it
            doc.update_object(font_xref, font_str)
            # now copy over old '/W' and '/DW' values
            if width_table or def_width:
                set_old_widths(font_xref, width_table, def_width)
        # 'new_xref' remains unused in the PDF and must be removed
        # by garbage collection.
        new_fontsize += len(new_buffer)

    return old_fontsize - new_fontsize
5865 7740
5866 def switch_layer(self, config, as_default=0): 7741 def switch_layer(self, config, as_default=0):
5867 """Activate an OC layer.""" 7742 """Activate an OC layer."""
5868 pdf = _as_pdf_document(self) 7743 pdf = _as_pdf_document(self)
5869 cfgs = mupdf.pdf_dict_getl( 7744 cfgs = mupdf.pdf_dict_getl(
5971 preserve_metadata=preserve_metadata, 7846 preserve_metadata=preserve_metadata,
5972 use_objstms=use_objstms, 7847 use_objstms=use_objstms,
5973 compression_effort=compression_effort, 7848 compression_effort=compression_effort,
5974 ) 7849 )
5975 return bio.getvalue() 7850 return bio.getvalue()
7851
def tobytes(self, *args, **kwargs):
    """Alias of write(): forward all arguments and return its result."""
    result = self.write(*args, **kwargs)
    return result
5976 7854
@property
def xref(self):
    """Return the PDF xref number of the page.

    The number is looked up via the parent's page_xref() using this
    object's 'number'; CheckParent() is called first.
    """
    CheckParent(self)
    owner = self.parent
    return owner.page_xref(self.number)
5982 7860
def xref_copy(doc: 'Document', source: int, target: int, *, keep: list = None) -> None:
    """Copy a PDF dictionary object to another one given their xref numbers.

    Args:
        doc: PDF document object
        source: source xref number
        target: target xref number, the xref must already exist
        keep: an optional list of 1st level keys in target that should not be
              removed before copying.
    Notes:
        This works similar to the copy() method of dictionaries in Python. The
        source may be a stream object.
        Copying an object onto itself (source == target) is a no-op.
    """
    if source == target:
        # Emptying the target first would wipe the source as well, and the
        # copy step would then only copy nulls: leave the object untouched.
        return

    if doc.xref_is_stream(source):
        # read new xref stream, maintaining compression
        stream = doc.xref_stream_raw(source)
        doc.update_stream(
            target,
            stream,
            compress=False,  # keeps source compression
            new=True,  # in case target is no stream
        )

    # empty the target completely, observe exceptions
    if keep is None:
        keep = []
    for key in doc.xref_get_keys(target):
        if key not in keep:
            doc.xref_set_key(target, key, "null")

    # copy over all source dict items
    for key in doc.xref_get_keys(source):
        value = doc.xref_get_key(source, key)[1]  # item is (type, value)
        doc.xref_set_key(target, key, value)
7895
5983 def xref_get_key(self, xref, key): 7896 def xref_get_key(self, xref, key):
5984 """Get PDF dict key value of object at 'xref'.""" 7897 """Get PDF dict key value of object at 'xref'."""
5985 pdf = _as_pdf_document(self) 7898 pdf = _as_pdf_document(self)
5986 xreflen = mupdf.pdf_xref_len(pdf) 7899 xreflen = mupdf.pdf_xref_len(pdf)
5987 if not _INRANGE(xref, 1, xreflen-1) and xref != -1: 7900 if not _INRANGE(xref, 1, xreflen-1) and xref != -1:
6194 return xref 8107 return xref
6195 8108
6196 __slots__ = ('this', 'page_count2', 'this_is_pdf', '__dict__') 8109 __slots__ = ('this', 'page_count2', 'this_is_pdf', '__dict__')
6197 8110
6198 outline = property(lambda self: self._outline) 8111 outline = property(lambda self: self._outline)
6199 tobytes = write
6200 is_stream = xref_is_stream 8112 is_stream = xref_is_stream
6201 8113
6202 open = Document 8114 open = Document
6203 8115
6204 8116
8733 for xref in annot_xrefs: 10645 for xref in annot_xrefs:
8734 annot = self.load_annot(xref) 10646 annot = self.load_annot(xref)
8735 annot._yielded=True 10647 annot._yielded=True
8736 yield annot 10648 yield annot
8737 10649
def apply_redactions(
    page: 'Page',
    images: int = 2,
    graphics: int = 1,
    text: int = 0,
) -> bool:
    """Apply the redaction annotations of the page.

    Args:
        page: the PDF page.
        images:
              0 - ignore images
              1 - remove all overlapping images
              2 - blank out overlapping image parts
              3 - remove image unless invisible
        graphics:
              0 - ignore graphics
              1 - remove graphics if contained in rectangle
              2 - remove all overlapping graphics
        text:
              0 - remove text
              1 - ignore text

    Returns:
        False if the page has no redaction annotations, else True.

    Raises:
        ValueError: document closed, encrypted or no PDF, or the underlying
            redaction call reported failure.
    """

    def center_rect(annot_rect, new_text, font, fsize):
        """Calculate minimal sub-rectangle for the overlay text.

        Notes:
            Because 'insert_textbox' supports no vertical text centering,
            we calculate an approximate number of lines here and return a
            sub-rect with smaller height, which should still be sufficient.
            The passed-in rectangle object itself is adjusted and returned.
        Args:
            annot_rect: the annotation rectangle
            new_text: the text to insert.
            font: the fontname. Must be one of the CJK or Base-14 set, else
                the rectangle is returned unchanged.
            fsize: the fontsize
        Returns:
            A rectangle to use instead of the annot rectangle.
        """
        if not new_text:
            return annot_rect
        if annot_rect.width <= EPSILON:
            return annot_rect
        try:
            text_width = get_text_length(new_text, font, fsize)
        except (ValueError, mupdf.FzErrorBase):  # unsupported font
            if g_exceptions_verbose:
                exception_info()
            return annot_rect
        line_height = fsize * 1.2
        limit = annot_rect.width
        # estimated height of the text block
        needed = math.ceil(text_width / limit) * line_height
        if needed >= annot_rect.height:
            return annot_rect
        # move the top edge down so that the text is roughly centered
        annot_rect.y0 = (annot_rect.tl.y + annot_rect.bl.y - needed) * 0.5
        return annot_rect

    CheckParent(page)
    doc = page.parent
    if doc.is_encrypted or doc.is_closed:
        raise ValueError("document closed or encrypted")
    if not doc.is_pdf:
        raise ValueError("is no PDF")

    # remember the values of all redactions before they are applied
    redact_annots = [
        annot._get_redact_values()
        for annot in page.annots(
            types=(mupdf.PDF_ANNOT_REDACT,)  # pylint: disable=no-member
        )
    ]
    if not redact_annots:  # no redactions on this page
        return False

    rc = page._apply_redactions(text, images, graphics)  # call MuPDF
    if not rc:  # should not happen really
        raise ValueError("Error applying redactions.")

    # now write replacement text in old redact rectangles
    shape = page.new_shape()
    for redact in redact_annots:
        annot_rect = redact["rect"]
        fill = redact["fill"]
        if fill:
            shape.draw_rect(annot_rect)  # colorize the rect background
            shape.finish(fill=fill, color=fill)
        if "text" not in redact:  # no overlay text for this redaction
            continue
        new_text = redact["text"]
        align = redact.get("align", 0)
        fname = redact["fontname"]
        fsize = redact["fontsize"]
        color = redact["text_color"]
        # try finding vertical centered sub-rect
        trect = center_rect(annot_rect, new_text, fname, fsize)

        rc = -1
        while rc < 0 and fsize >= 4:  # while not enough room
            # (re-) try insertion
            rc = shape.insert_textbox(
                trect,
                new_text,
                fontname=fname,
                fontsize=fsize,
                color=color,
                align=align,
            )
            fsize -= 0.5  # reduce font if unsuccessful
    shape.commit()  # append new contents object
    return True
10760
8738 def recolor(self, components=1): 10761 def recolor(self, components=1):
8739 """Convert colorspaces of objects on the page. 10762 """Convert colorspaces of objects on the page.
8740 10763
8741 Valid values are 1, 3 and 4. 10764 Valid values are 1, 3 and 4.
8742 """ 10765 """
8841 val.parent = weakref.proxy(self) # owning page object 10864 val.parent = weakref.proxy(self) # owning page object
8842 val.parent._annot_refs[id(val)] = val 10865 val.parent._annot_refs[id(val)] = val
8843 annot._erase() 10866 annot._erase()
8844 return val 10867 return val
8845 10868
def delete_image(page: 'Page', xref: int):
    """Delete the image referred to by xref.

    The image is not physically removed: it is replaced by a small
    transparent Pixmap using method Page.replace_image.

    Args:
        page: the page on whose behalf the replacement is done.
        xref: xref of the image to delete.
    """
    # a 1x1 gray pixmap with alpha channel; any dimension would do
    blank = Pixmap(csGRAY, (0, 0, 1, 1), 1)
    blank.clear_with()  # all sample bytes 0x00 -> fully transparent
    page.replace_image(xref, pixmap=blank)
10881
8846 def delete_link(self, linkdict): 10882 def delete_link(self, linkdict):
8847 """Delete a Link.""" 10883 """Delete a Link."""
8848 CheckParent(self) 10884 CheckParent(self)
8849 if not isinstance( linkdict, dict): 10885 if not isinstance( linkdict, dict):
8850 return # have no dictionary 10886 return # have no dictionary
8885 mupdf.pdf_dict_put( page.obj(), PDF_NAME('Annots'), annots) 10921 mupdf.pdf_dict_put( page.obj(), PDF_NAME('Annots'), annots)
8886 JM_refresh_links( page) 10922 JM_refresh_links( page)
8887 10923
8888 return finished() 10924 return finished()
8889 10925
def delete_widget(page: 'Page', widget: Widget) -> Widget:
    """Delete widget from page and return the next one.

    Raises:
        ValueError: if 'widget' has no '_annot' attribute (or it is None).
    """
    CheckParent(page)
    annot = getattr(widget, "_annot", None)
    if annot is None:
        raise ValueError("bad type: widget")
    successor = widget.next  # fetch before the widget is invalidated
    page.delete_annot(annot)
    widget._annot.parent = None
    # strip every attribute from the now deleted widget
    for attr_name in tuple(widget.__dict__.keys()):
        del widget.__dict__[attr_name]
    return successor
10939
@property
def derotation_matrix(self) -> Matrix:
    """Reflects page de-rotation."""
    if not g_use_extra:
        pdfpage = self._pdf_page(required=False)
        if pdfpage.m_internal:
            result = Matrix(JM_derotate_page_matrix(pdfpage))
        else:
            # no underlying PDF page available
            result = Matrix(mupdf.FzRect(mupdf.FzRect.UNIT))
        return result
    # optimized implementation from the 'extra' module
    return Matrix(extra.Page_derotate_matrix(self.this))
10949
def draw_bezier(
    page: 'Page',
    p1: point_like,
    p2: point_like,
    p3: point_like,
    p4: point_like,
    color: OptSeq = (0,),
    fill: OptSeq = None,
    dashes: OptStr = None,
    width: float = 1,
    morph: OptSeq = None,
    closePath: bool = False,
    lineCap: int = 0,
    lineJoin: int = 0,
    overlay: bool = True,
    stroke_opacity: float = 1,
    fill_opacity: float = 1,
    oc: int = 0,
) -> Point:
    """Draw a general cubic Bezier curve from p1 to p4 using control points p2 and p3.

    The curve is drawn on a new Shape of the page. All remaining arguments
    are passed on to Shape.finish(), 'overlay' to Shape.commit().

    Returns:
        The value returned by Shape.draw_bezier().
    """
    # NOTE: 'morph' is annotated as a sequence like in all other draw_*
    # functions (it was annotated as a string here).
    img = page.new_shape()
    Q = img.draw_bezier(Point(p1), Point(p2), Point(p3), Point(p4))
    img.finish(
        color=color,
        fill=fill,
        dashes=dashes,
        width=width,
        lineCap=lineCap,
        lineJoin=lineJoin,
        morph=morph,
        closePath=closePath,
        stroke_opacity=stroke_opacity,
        fill_opacity=fill_opacity,
        oc=oc,
    )
    img.commit(overlay)

    return Q
10988
def draw_circle(
    page: 'Page',
    center: point_like,
    radius: float,
    color: OptSeq = (0,),
    fill: OptSeq = None,
    morph: OptSeq = None,
    dashes: OptStr = None,
    width: float = 1,
    lineCap: int = 0,
    lineJoin: int = 0,
    overlay: bool = True,
    stroke_opacity: float = 1,
    fill_opacity: float = 1,
    oc: int = 0,
) -> Point:
    """Draw a circle given its center and radius.

    Uses a new Shape of the page; returns what Shape.draw_circle() returns.
    """
    finish_options = dict(
        color=color,
        fill=fill,
        dashes=dashes,
        width=width,
        lineCap=lineCap,
        lineJoin=lineJoin,
        morph=morph,
        stroke_opacity=stroke_opacity,
        fill_opacity=fill_opacity,
        oc=oc,
    )
    shape = page.new_shape()
    last_point = shape.draw_circle(Point(center), radius)
    shape.finish(**finish_options)
    shape.commit(overlay)
    return last_point
11022
def draw_curve(
    page: 'Page',
    p1: point_like,
    p2: point_like,
    p3: point_like,
    color: OptSeq = (0,),
    fill: OptSeq = None,
    dashes: OptStr = None,
    width: float = 1,
    morph: OptSeq = None,
    closePath: bool = False,
    lineCap: int = 0,
    lineJoin: int = 0,
    overlay: bool = True,
    stroke_opacity: float = 1,
    fill_opacity: float = 1,
    oc: int = 0,
) -> Point:
    """Draw a special Bezier curve from p1 to p3, generating control points on lines p1 to p2 and p2 to p3.

    Uses a new Shape of the page; returns what Shape.draw_curve() returns.
    """
    finish_options = dict(
        color=color,
        fill=fill,
        dashes=dashes,
        width=width,
        lineCap=lineCap,
        lineJoin=lineJoin,
        morph=morph,
        closePath=closePath,
        stroke_opacity=stroke_opacity,
        fill_opacity=fill_opacity,
        oc=oc,
    )
    shape = page.new_shape()
    last_point = shape.draw_curve(Point(p1), Point(p2), Point(p3))
    shape.finish(**finish_options)
    shape.commit(overlay)
    return last_point
11060
def draw_line(
    page: 'Page',
    p1: point_like,
    p2: point_like,
    color: OptSeq = (0,),
    dashes: OptStr = None,
    width: float = 1,
    lineCap: int = 0,
    lineJoin: int = 0,
    overlay: bool = True,
    morph: OptSeq = None,
    stroke_opacity: float = 1,
    fill_opacity: float = 1,
    oc=0,
) -> Point:
    """Draw a line from point p1 to point p2.

    Uses a new Shape of the page; returns what Shape.draw_line() returns.
    """
    finish_options = dict(
        color=color,
        dashes=dashes,
        width=width,
        closePath=False,  # a single line is never closed
        lineCap=lineCap,
        lineJoin=lineJoin,
        morph=morph,
        stroke_opacity=stroke_opacity,
        fill_opacity=fill_opacity,
        oc=oc,
    )
    shape = page.new_shape()
    end_point = shape.draw_line(Point(p1), Point(p2))
    shape.finish(**finish_options)
    shape.commit(overlay)
    return end_point
11094
def draw_oval(
    page: 'Page',
    rect: typing.Union[rect_like, quad_like],
    color: OptSeq = (0,),
    fill: OptSeq = None,
    dashes: OptStr = None,
    morph: OptSeq = None,
    width: float = 1,
    lineCap: int = 0,
    lineJoin: int = 0,
    overlay: bool = True,
    stroke_opacity: float = 1,
    fill_opacity: float = 1,
    oc: int = 0,
) -> Point:
    """Draw an oval given its containing rectangle or quad.

    Uses a new Shape of the page; returns what Shape.draw_oval() returns.
    """
    finish_options = dict(
        color=color,
        fill=fill,
        dashes=dashes,
        width=width,
        lineCap=lineCap,
        lineJoin=lineJoin,
        morph=morph,
        stroke_opacity=stroke_opacity,
        fill_opacity=fill_opacity,
        oc=oc,
    )
    shape = page.new_shape()
    last_point = shape.draw_oval(rect)  # 'rect' is passed through unconverted
    shape.finish(**finish_options)
    shape.commit(overlay)
    return last_point
11128
def draw_polyline(
    page: 'Page',
    points: list,
    color: OptSeq = (0,),
    fill: OptSeq = None,
    dashes: OptStr = None,
    width: float = 1,
    morph: OptSeq = None,
    lineCap: int = 0,
    lineJoin: int = 0,
    overlay: bool = True,
    closePath: bool = False,
    stroke_opacity: float = 1,
    fill_opacity: float = 1,
    oc: int = 0,
) -> Point:
    """Draw multiple connected line segments.

    Uses a new Shape of the page; returns what Shape.draw_polyline() returns.
    """
    finish_options = dict(
        color=color,
        fill=fill,
        dashes=dashes,
        width=width,
        lineCap=lineCap,
        lineJoin=lineJoin,
        morph=morph,
        closePath=closePath,
        stroke_opacity=stroke_opacity,
        fill_opacity=fill_opacity,
        oc=oc,
    )
    shape = page.new_shape()
    last_point = shape.draw_polyline(points)
    shape.finish(**finish_options)
    shape.commit(overlay)
    return last_point
11164
def draw_quad(
    page: 'Page',
    quad: quad_like,
    color: OptSeq = (0,),
    fill: OptSeq = None,
    dashes: OptStr = None,
    width: float = 1,
    lineCap: int = 0,
    lineJoin: int = 0,
    morph: OptSeq = None,
    overlay: bool = True,
    stroke_opacity: float = 1,
    fill_opacity: float = 1,
    oc: int = 0,
) -> Point:
    """Draw a quadrilateral.

    Uses a new Shape of the page; returns what Shape.draw_quad() returns.
    """
    finish_options = dict(
        color=color,
        fill=fill,
        dashes=dashes,
        width=width,
        lineCap=lineCap,
        lineJoin=lineJoin,
        morph=morph,
        stroke_opacity=stroke_opacity,
        fill_opacity=fill_opacity,
        oc=oc,
    )
    shape = page.new_shape()
    last_point = shape.draw_quad(Quad(quad))
    shape.finish(**finish_options)
    shape.commit(overlay)
    return last_point
11198
def draw_rect(
    page: 'Page',
    rect: rect_like,
    color: OptSeq = (0,),
    fill: OptSeq = None,
    dashes: OptStr = None,
    width: float = 1,
    lineCap: int = 0,
    lineJoin: int = 0,
    morph: OptSeq = None,
    overlay: bool = True,
    stroke_opacity: float = 1,
    fill_opacity: float = 1,
    oc: int = 0,
    radius=None,
) -> Point:
    '''
    Draw a rectangle. See Shape class method for details.

    Uses a new Shape of the page; 'radius' is handed to Shape.draw_rect(),
    whose return value is returned.
    '''
    finish_options = dict(
        color=color,
        fill=fill,
        dashes=dashes,
        width=width,
        lineCap=lineCap,
        lineJoin=lineJoin,
        morph=morph,
        stroke_opacity=stroke_opacity,
        fill_opacity=fill_opacity,
        oc=oc,
    )
    shape = page.new_shape()
    last_point = shape.draw_rect(Rect(rect), radius=radius)
    shape.finish(**finish_options)
    shape.commit(overlay)
    return last_point
11235
def draw_sector(
    page: 'Page',
    center: point_like,
    point: point_like,
    beta: float,
    color: OptSeq = (0,),
    fill: OptSeq = None,
    dashes: OptStr = None,
    fullSector: bool = True,
    morph: OptSeq = None,
    width: float = 1,
    closePath: bool = False,
    lineCap: int = 0,
    lineJoin: int = 0,
    overlay: bool = True,
    stroke_opacity: float = 1,
    fill_opacity: float = 1,
    oc: int = 0,
) -> Point:
    """Draw a circle sector given circle center, one arc end point and the angle of the arc.

    Parameters:
        center -- center of circle
        point -- arc end point
        beta -- angle of arc (degrees)
        fullSector -- connect arc ends with center

    Uses a new Shape of the page; returns what Shape.draw_sector() returns.
    """
    finish_options = dict(
        color=color,
        fill=fill,
        dashes=dashes,
        width=width,
        lineCap=lineCap,
        lineJoin=lineJoin,
        morph=morph,
        closePath=closePath,
        stroke_opacity=stroke_opacity,
        fill_opacity=fill_opacity,
        oc=oc,
    )
    shape = page.new_shape()
    last_point = shape.draw_sector(
        Point(center),
        Point(point),
        beta,
        fullSector=fullSector,
    )
    shape.finish(**finish_options)
    shape.commit(overlay)
    return last_point
11281
def draw_squiggle(
    page: 'Page',
    p1: point_like,
    p2: point_like,
    breadth: float = 2,
    color: OptSeq = (0,),
    dashes: OptStr = None,
    width: float = 1,
    lineCap: int = 0,
    lineJoin: int = 0,
    overlay: bool = True,
    morph: OptSeq = None,
    stroke_opacity: float = 1,
    fill_opacity: float = 1,
    oc: int = 0,
) -> Point:
    """Draw a squiggly line from point p1 to point p2.

    Uses a new Shape of the page; returns what Shape.draw_squiggle() returns.
    """
    finish_options = dict(
        color=color,
        dashes=dashes,
        width=width,
        closePath=False,  # an open line, never closed
        lineCap=lineCap,
        lineJoin=lineJoin,
        morph=morph,
        stroke_opacity=stroke_opacity,
        fill_opacity=fill_opacity,
        oc=oc,
    )
    shape = page.new_shape()
    end_point = shape.draw_squiggle(Point(p1), Point(p2), breadth=breadth)
    shape.finish(**finish_options)
    shape.commit(overlay)
    return end_point
11316
def draw_zigzag(
    page: 'Page',
    p1: point_like,
    p2: point_like,
    breadth: float = 2,
    color: OptSeq = (0,),
    dashes: OptStr = None,
    width: float = 1,
    lineCap: int = 0,
    lineJoin: int = 0,
    overlay: bool = True,
    morph: OptSeq = None,
    stroke_opacity: float = 1,
    fill_opacity: float = 1,
    oc: int = 0,
) -> Point:
    """Draw a zigzag line from point p1 to point p2.

    Uses a new Shape of the page; returns what Shape.draw_zigzag() returns.
    """
    finish_options = dict(
        color=color,
        dashes=dashes,
        width=width,
        closePath=False,  # an open line, never closed
        lineCap=lineCap,
        lineJoin=lineJoin,
        morph=morph,
        stroke_opacity=stroke_opacity,
        fill_opacity=fill_opacity,
        oc=oc,
    )
    shape = page.new_shape()
    end_point = shape.draw_zigzag(Point(p1), Point(p2), breadth=breadth)
    shape.finish(**finish_options)
    shape.commit(overlay)
    return end_point
8899 11351
8900 def extend_textpage(self, tpage, flags=0, matrix=None): 11352 def extend_textpage(self, tpage, flags=0, matrix=None):
8901 page = self.this 11353 page = self.this
8902 tp = tpage.this 11354 tp = tpage.this
8903 assert isinstance( tp, mupdf.FzStextPage) 11355 assert isinstance( tp, mupdf.FzStextPage)
9217 paths.append(npath) 11669 paths.append(npath)
9218 11670
9219 val = None 11671 val = None
9220 return paths 11672 return paths
9221 11673
def get_image_info(
    page: 'Page',
    hashes: bool = False,
    xrefs: bool = False
) -> list:
    """Extract image information only from a pymupdf.TextPage.

    A non-empty result computed with hashes is cached on the page as
    '_image_info' and reused by later calls.

    Args:
        hashes: (bool) include MD5 hash for each image.
        xrefs: (bool) try to find the xref for each image. Sets hashes to true.
            Ignored for non-PDF documents.
    Returns:
        The list of image information dictionaries. With 'xrefs', each item
        gets an "xref" entry (0 if no image with the same digest was found).
    """
    doc = page.parent
    if not doc.is_pdf:
        xrefs = False  # xrefs only exist in PDFs
    elif xrefs:
        hashes = True  # matching xrefs requires the digests

    imginfo = getattr(page, "_image_info", None)
    if imginfo:
        if not xrefs:
            return imginfo  # the cached information is sufficient
    else:
        textpage = page.get_textpage(flags=TEXT_PRESERVE_IMAGES)
        imginfo = textpage.extractIMGINFO(hashes=hashes)
        del textpage
        if hashes:
            page._image_info = imginfo
    if not xrefs:
        return imginfo

    # map the digest of each image of the page to its xref
    digest_to_xref = {}
    for entry in page.get_images():
        image_xref = entry[0]
        pix = Pixmap(doc, image_xref)
        digest_to_xref[pix.digest] = image_xref
        del pix
    for info in imginfo:
        info["xref"] = digest_to_xref.get(info["digest"], 0)
    return imginfo
11714
def get_image_rects(page: 'Page', name, transform=False) -> list:
    """Return list of image positions on a page.

    Args:
        name: (str, list, int) image identification. May be reference name, an
              item of the page's image list or an xref.
        transform: (bool) whether to also return the transformation matrix.
    Returns:
        A list of pymupdf.Rect objects or tuples of (pymupdf.Rect, pymupdf.Matrix)
        for all image locations on the page.
    Raises:
        ValueError: a reference name occurs not at all or more than once
            in the page's image list.
    """
    name_type = type(name)
    if name_type in (list, tuple):
        xref = name[0]  # item of the image list: xref is its first entry
    elif name_type is int:
        xref = name
    else:
        # reference name: must identify exactly one entry of the image list
        candidates = [img for img in page.get_images() if img[7] == name]
        if not candidates:
            raise ValueError("bad image name")
        if len(candidates) != 1:
            raise ValueError("multiple image names found")
        xref = candidates[0][0]

    # the image's pixmap digest is what identifies it in the image info
    pixmap = Pixmap(page.parent, xref)
    wanted = pixmap.digest
    del pixmap

    image_infos = page.get_image_info(hashes=True)
    if transform:
        return [
            (Rect(im["bbox"]), Matrix(im["transform"]))
            for im in image_infos
            if im["digest"] == wanted
        ]
    return [Rect(im["bbox"]) for im in image_infos if im["digest"] == wanted]
11750
def get_label(page):
    """Return the label for this PDF page.

    Args:
        page: page object.
    Returns:
        The label (str) of the page. An empty string is returned if the
        document provides no page labels.
    """
    # Jorj McKie, 2021-01-06

    page_labels = page.parent._get_page_labels()
    if page_labels:
        page_labels.sort()
        return utils.get_label_pno(page.number, page_labels)
    return ""
11766
def get_links(page: 'Page') -> list:
    """Create a list of all links contained in a PDF page.

    Notes:
        see PyMuPDF documentation for details.
    Returns:
        A list with one link dictionary per link of the page. For PDFs, the
        entries "xref" and "id" are added when the number of link
        annotations equals the number of links found.
    """

    CheckParent(page)
    links = []
    link = page.first_link
    while link:
        links.append(utils.getLinkDict(link, page.parent))
        link = link.next

    if links and page.parent.is_pdf:
        link_annots = [
            annot
            #page.annot_xrefs()
            for annot in JM_get_annot_xref_list2(page)
            if annot[1] == mupdf.PDF_ANNOT_LINK  # pylint: disable=no-member
        ]
        # only pair them up if both lists describe the same number of links
        if len(link_annots) == len(links):
            for entry, annot in zip(links, link_annots):
                entry["xref"] = annot[0]
                entry["id"] = annot[2]
    return links
11792
def get_pixmap(
    page: 'Page',
    *,
    matrix: matrix_like=Identity,
    dpi=None,
    colorspace: Colorspace=None,
    clip: rect_like=None,
    alpha: bool=False,
    annots: bool=True,
) -> 'Pixmap':
    """Create pixmap of page.

    Keyword args:
        matrix: Matrix for transformation (default: Identity).
        dpi: desired dots per inch. If given, matrix is ignored.
        colorspace: (str/Colorspace) cmyk, rgb, gray - case ignored, default csRGB.
        clip: (irect-like) restrict rendering to this area.
        alpha: (bool) whether to include alpha channel
        annots: (bool) whether to also render annotations
    Raises:
        ValueError: the colorspace's component count is not 1, 3 or 4.
    """
    if colorspace is None:
        colorspace = csRGB
    if dpi:
        # a dpi value replaces the matrix by a uniform zoom relative to 72
        zoom = dpi / 72
        matrix = Matrix(zoom, zoom)

    if type(colorspace) is str:
        # any string other than gray / cmyk selects RGB
        by_name = {"GRAY": csGRAY, "CMYK": csCMYK}
        colorspace = by_name.get(colorspace.upper(), csRGB)
    if colorspace.n not in (1, 3, 4):
        raise ValueError("unsupported colorspace")

    displaylist = page.get_displaylist(annots=annots)
    pix = displaylist.get_pixmap(
        matrix=matrix,
        colorspace=colorspace,
        alpha=alpha,
        clip=clip,
    )
    displaylist = None
    if dpi:
        pix.set_dpi(dpi, dpi)
    return pix
11835
9222 def remove_rotation(self): 11836 def remove_rotation(self):
9223 """Set page rotation to 0 while maintaining visual appearance.""" 11837 """Set page rotation to 0 while maintaining visual appearance."""
9224 rot = self.rotation # normalized rotation value 11838 rot = self.rotation # normalized rotation value
9225 if rot == 0: 11839 if rot == 0:
9226 return Identity # nothing to do 11840 return Identity # nothing to do
9502 rc = tp.extractTextbox(rect) 12116 rc = tp.extractTextbox(rect)
9503 if textpage is None: 12117 if textpage is None:
9504 del tp 12118 del tp
9505 return rc 12119 return rc
9506 12120
def get_text(self, *args, **kwargs):
    """Call utils.get_text with this page and the given arguments."""
    result = utils.get_text(self, *args, **kwargs)
    return result
12123
def get_text_blocks(self, *args, **kwargs):
    """Call utils.get_text_blocks with this page and the given arguments."""
    result = utils.get_text_blocks(self, *args, **kwargs)
    return result
12126
def get_text_selection(self, *args, **kwargs):
    """Call utils.get_text_selection with this page and the given arguments."""
    result = utils.get_text_selection(self, *args, **kwargs)
    return result
12129
def get_text_words(self, *args, **kwargs):
    """Call utils.get_text_words with this page and the given arguments."""
    result = utils.get_text_words(self, *args, **kwargs)
    return result
12132
def get_textpage_ocr(self, *args, **kwargs):
    """Call utils.get_textpage_ocr with this page and the given arguments."""
    result = utils.get_textpage_ocr(self, *args, **kwargs)
    return result
12135
9507 def get_textpage(self, clip: rect_like = None, flags: int = 0, matrix=None) -> "TextPage": 12136 def get_textpage(self, clip: rect_like = None, flags: int = 0, matrix=None) -> "TextPage":
9508 CheckParent(self) 12137 CheckParent(self)
9509 if matrix is None: 12138 if matrix is None:
9510 matrix = Matrix(1, 1) 12139 matrix = Matrix(1, 1)
9511 old_rotation = self.rotation 12140 old_rotation = self.rotation
9626 return xref # we are done 12255 return xref # we are done
9627 12256
9628 # need to create document font info 12257 # need to create document font info
9629 doc.get_char_widths(xref, fontdict=fontdict) 12258 doc.get_char_widths(xref, fontdict=fontdict)
9630 return xref 12259 return xref
12260
def insert_htmlbox(
    page,
    rect,
    text,
    *,
    css=None,
    scale_low=0,
    archive=None,
    rotate=0,
    oc=0,
    opacity=1,
    overlay=True,
    _scale_word_width=True,
    _verbose=False,
) -> tuple:
    """Insert text with optional HTML tags and stylings into a rectangle.

    Args:
        rect: (rect-like) rectangle into which the text should be placed.
        text: (str) text with optional HTML tags and stylings. A Story
            object is accepted as well and used as is.
        css: (str) CSS styling commands.
        scale_low: (float) force-fit content by scaling it down. Must be in
            range [0, 1]. If 1, no scaling will take place. If 0, arbitrary
            down-scaling is acceptable. A value of 0.1 would mean that content
            may be scaled down by at most 90%.
        archive: Archive object pointing to locations of used fonts or images
        rotate: (int) rotate the text in the box by a multiple of 90 degrees.
        oc: (int) the xref of an OCG / OCMD (Optional Content).
        opacity: (float) set opacity of inserted content.
        overlay: (bool) put text on top of page content.
        _scale_word_width: internal, for testing only.
        _verbose: internal, for testing only.
    Returns:
        A tuple of floats (spare_height, scale).
        spare_height:
            The height of the remaining space in <rect> below the
            text, or -1 if we failed to fit.
        scale:
            The scaling required; `0 < scale <= 1`.
            Will be less than `scale_low` if we failed to fit.
    Raises:
        ValueError: bad rotation angle, 'scale_low' outside [0, 1], or
            'text' neither a string nor a Story.
    """
    # the angle must be a multiple of 90; bring it into 0 <= rotate < 360
    if rotate % 90 != 0:
        raise ValueError("bad rotation angle")
    while rotate < 0:
        rotate += 360
    while rotate >= 360:
        rotate -= 360

    if not 0 <= scale_low <= 1:
        raise ValueError("'scale_low' must be in [0, 1]")

    if css is None:
        css = ""

    # The story is laid out in an unrotated box at the origin: for quarter
    # turns, width and height of the target rect change roles.
    rect = Rect(rect)
    box_width, box_height = rect.width, rect.height
    if rotate in (90, 270):
        box_width, box_height = box_height, box_width
    temp_rect = Rect(0, 0, box_width, box_height)

    # a small default border, followed by the user's CSS
    mycss = "body {margin:1px;}" + css

    # either make a story, or accept a given one
    if isinstance(text, str):
        story = Story(html=text, user_css=mycss, archive=archive)
    elif isinstance(text, Story):
        story = text
    else:
        raise ValueError("'text' must be a string or a Story")

    # ----------------------------------------------------------------
    # Find a scaling factor that lets our story fit in. Instead of scaling
    # the text smaller, we look at how much bigger the rect needs to be to
    # fit the text, then invert that factor to get the down-scaling of the
    # text.
    # ----------------------------------------------------------------
    if scale_low == 0:
        rect_scale_max = None
    else:
        rect_scale_max = 1 / scale_low

    if _scale_word_width:
        fit_flags = mupdf.FZ_PLACE_STORY_FLAG_NO_OVERFLOW
    else:
        fit_flags = 0

    fit = story.fit_scale(
        temp_rect,
        scale_min=1,
        scale_max=rect_scale_max,
        flags=fit_flags,
        verbose=_verbose,
    )

    if not fit.big_enough:  # there was no fit
        scale = 1 / fit.parameter
        return (-1, scale)

    # fit.filled is a tuple; we convert it in place to a Rect for
    # convenience. (fit.rect is already a Rect.)
    fit.filled = Rect(fit.filled)
    assert (fit.rect.x0, fit.rect.y0) == (0, 0)
    assert (fit.filled.x0, fit.filled.y0) == (0, 0)

    scale = 1 / fit.parameter
    assert scale >= scale_low, f'{scale_low=} {scale=}'

    unused_height = (fit.rect.y1 - fit.filled.y1) * scale
    spare_height = max(unused_height, 0)

    def place_in_fit_rect(*args):
        # every call answers with the fitted rectangle
        return fit.rect, fit.rect, None

    # draw story on temp PDF page
    doc = story.write_with_links(place_in_fit_rect)

    # Insert opacity if requested.
    # For this, we prepend a command to the /Contents.
    if 0 <= opacity < 1:
        tpage = doc[0]  # load page
        # generate /ExtGstate for the page
        gstate_name = tpage._set_opacity(CA=opacity, ca=opacity)
        command = f"/{gstate_name} gs\n"  # graphic state command
        TOOLS._insert_contents(tpage, command.encode(), 0)

    # put result in target page
    page.show_pdf_page(rect, doc, 0, rotate=rotate, oc=oc, overlay=overlay)

    # -------------------------------------------------------------------------
    # re-insert links in target rect (show_pdf_page cannot copy annotations)
    # -------------------------------------------------------------------------
    # scaled center point of fit.rect
    mp1 = (fit.rect.tl + fit.rect.br) / 2 * scale

    # center point of target rect
    mp2 = (rect.tl + rect.br) / 2

    # link positioning matrix, built from three steps:
    # - move center of scaled-down fit.rect to (0,0)
    # - rotate
    # - move (0,0) to center of target rect
    to_origin = Matrix(scale, 0, 0, scale, -mp1.x, -mp1.y)
    turn = Matrix(-rotate)
    to_target = Matrix(1, 0, 0, 1, mp2.x, mp2.y)
    mat = to_origin * turn * to_target

    # copy over links
    for link in doc[0].get_links():
        link["from"] *= mat
        page.insert_link(link)

    return spare_height, scale
12407
def insert_image(
    page,
    rect,
    *,
    alpha=-1,
    filename=None,
    height=0,
    keep_proportion=True,
    mask=None,
    oc=0,
    overlay=True,
    pixmap=None,
    rotate=0,
    stream=None,
    width=0,
    xref=0,
):
    """Insert an image for display in a rectangle.

    Args:
        rect: (rect_like) position of image on the page.
        alpha: (int, optional) set to 0 if image has no transparency.
        filename: (str, Path, file object) image filename.
        height: (int)
        keep_proportion: (bool) keep width / height ratio (default).
        mask: (bytes, optional) image consisting of alpha values to use.
        oc: (int) xref of OCG or OCMD to declare as Optional Content.
        overlay: (bool) put in foreground (default) or background.
        pixmap: (pymupdf.Pixmap) use this as image.
        rotate: (int) rotate by 0, 90, 180 or 270 degrees.
        stream: (bytes) use this as image.
        width: (int)
        xref: (int) use this as image.

    'page' and 'rect' are positional, all other parameters are keywords.

    If 'xref' is given, that image is used. Other input options are ignored.
    Else, exactly one of pixmap, stream or filename must be given.

    'alpha=0' for non-transparent images improves performance significantly.
    Affects stream and filename only.

    Optimum transparent insertions are possible by using filename / stream in
    conjunction with a 'mask' image of alpha values.

    Returns:
        xref (int) of inserted image. Re-use as argument for multiple insertions.
    Raises:
        ValueError: document is no PDF, or an argument fails validation.
        FileNotFoundError: 'filename' does not exist.
    """
    CheckParent(page)
    doc = page.parent
    if not doc.is_pdf:
        raise ValueError("is no PDF")

    given_sources = bool(filename) + bool(stream) + bool(pixmap)
    if xref == 0 and given_sources != 1:
        raise ValueError("xref=0 needs exactly one of filename, pixmap, stream")

    # reduce path-like objects and file objects to a plain file name
    if filename and type(filename) is not str:
        if hasattr(filename, "absolute"):
            filename = str(filename)
        elif hasattr(filename, "name"):
            filename = filename.name
        else:
            raise ValueError("bad filename")

    binary_types = (bytes, bytearray, io.BytesIO)
    if filename and not os.path.exists(filename):
        raise FileNotFoundError("No such file: '%s'" % filename)
    if stream and type(stream) not in binary_types:
        raise ValueError("stream must be bytes-like / BytesIO")
    if pixmap and type(pixmap) is not Pixmap:
        raise ValueError("pixmap must be a Pixmap")
    if mask:
        if not (stream or filename):
            raise ValueError("mask requires stream or filename")
        if type(mask) not in binary_types:
            raise ValueError("mask must be bytes-like / BytesIO")

    # bring the angle into 0 <= rotate < 360 before checking it
    while rotate < 0:
        rotate += 360
    while rotate >= 360:
        rotate -= 360
    if rotate not in (0, 90, 180, 270):
        raise ValueError("bad rotate value")

    target = Rect(rect)
    if target.is_empty or target.is_infinite:
        raise ValueError("rect must be finite and not empty")
    clip = target * ~page.transformation_matrix

    # Create a unique image reference name: it must differ from the names
    # of all images, xobjects and fonts of the page.
    used_names = [img[7] for img in doc.get_page_images(page.number)]
    used_names += [xobj[1] for xobj in doc.get_page_xobjects(page.number)]
    used_names += [font[4] for font in doc.get_page_fonts(page.number)]
    prefix = "fzImg"  # 'pymupdf image'
    suffix = 0
    while prefix + str(suffix) in used_names:
        suffix += 1
    _imgname = prefix + str(suffix)

    if overlay:
        page.wrap_contents()  # ensure a balanced graphics state

    xref, digests = page._insert_image(
        filename=filename,
        pixmap=pixmap,
        stream=stream,
        imask=mask,
        clip=clip,
        overlay=overlay,
        oc=oc,
        xref=xref,
        rotate=rotate,
        keep_proportion=keep_proportion,
        width=width,
        height=height,
        alpha=alpha,
        _imgname=_imgname,
        digests=doc.InsertedImages,
    )
    if digests is not None:
        doc.InsertedImages = digests

    return xref
12531
def insert_link(page: 'Page', lnk: dict, mark: bool = True) -> None:
    """Insert a new link for the current page.

    Args:
        lnk: (dict) the link description, converted with utils.getLinkText.
        mark: not used by this method.
    Raises:
        ValueError: utils.getLinkText returned an empty string for 'lnk'.
    """
    CheckParent(page)
    link_source = utils.getLinkText(page, lnk)
    if link_source == "":
        raise ValueError("link kind not supported")
    page._addAnnot_FromString((link_source,))
12539
def insert_text(
    page: 'Page',
    point: point_like,
    text: typing.Union[str, list],
    *,
    fontsize: float = 11,
    lineheight: OptFloat = None,
    fontname: str = "helv",
    fontfile: OptStr = None,
    set_simple: int = 0,
    encoding: int = 0,
    color: OptSeq = None,
    fill: OptSeq = None,
    border_width: float = 0.05,
    miter_limit: float = 1,
    render_mode: int = 0,
    rotate: int = 0,
    morph: OptSeq = None,
    overlay: bool = True,
    stroke_opacity: float = 1,
    fill_opacity: float = 1,
    oc: int = 0,
):
    """Insert text starting at a given point.

    Notes:
        Creates a Shape object, uses its same-named method and commits the
        shape if that method returned a non-negative value. All keyword
        arguments except 'overlay' are handed through to Shape.insert_text.
    Returns:
        The value returned by Shape.insert_text.
    """
    text_options = dict(
        fontsize=fontsize,
        lineheight=lineheight,
        fontname=fontname,
        fontfile=fontfile,
        set_simple=set_simple,
        encoding=encoding,
        color=color,
        fill=fill,
        border_width=border_width,
        render_mode=render_mode,
        miter_limit=miter_limit,
        rotate=rotate,
        morph=morph,
        stroke_opacity=stroke_opacity,
        fill_opacity=fill_opacity,
        oc=oc,
    )
    shape = page.new_shape()
    result = shape.insert_text(point, text, **text_options)
    if result >= 0:
        shape.commit(overlay)
    return result
12588
def insert_textbox(
    page: 'Page',
    rect: rect_like,
    buffer: typing.Union[str, list],
    *,
    fontname: str = "helv",
    fontfile: OptStr = None,
    set_simple: int = 0,
    encoding: int = 0,
    fontsize: float = 11,
    lineheight: OptFloat = None,
    color: OptSeq = None,
    fill: OptSeq = None,
    expandtabs: int = 1,
    align: int = 0,
    rotate: int = 0,
    render_mode: int = 0,
    miter_limit: float = 1,
    border_width: float = 0.05,
    morph: OptSeq = None,
    overlay: bool = True,
    stroke_opacity: float = 1,
    fill_opacity: float = 1,
    oc: int = 0,
) -> float:
    """Insert text into a given rectangle.

    Notes:
        Creates a Shape object, uses its same-named method and commits it
        if that method returned a non-negative value.
    Parameters:
        rect: (rect-like) area to use for text.
        buffer: text to be inserted
        fontname: a Base-14 font, font name or '/name'
        fontfile: name of a font file
        fontsize: font size
        lineheight: overwrite the font property
        color: RGB color triple
        expandtabs: handles tabulators with string function
        align: left, center, right, justified
        rotate: 0, 90, 180, or 270 degrees
        morph: morph box with a matrix and a fixpoint
        overlay: put text in foreground or background
    Returns:
        unused or deficit rectangle area (float)
    """
    textbox_options = dict(
        fontsize=fontsize,
        lineheight=lineheight,
        fontname=fontname,
        fontfile=fontfile,
        set_simple=set_simple,
        encoding=encoding,
        color=color,
        fill=fill,
        expandtabs=expandtabs,
        render_mode=render_mode,
        miter_limit=miter_limit,
        border_width=border_width,
        align=align,
        rotate=rotate,
        morph=morph,
        stroke_opacity=stroke_opacity,
        fill_opacity=fill_opacity,
        oc=oc,
    )
    shape = page.new_shape()
    result = shape.insert_textbox(rect, buffer, **textbox_options)
    if result >= 0:
        shape.commit(overlay)
    return result
9631 12660
@property
def is_wrapped(self):
    """Check if /Contents is in a balanced graphics state.

    True if self._count_q_balance() reports (0, 0).
    """
    balance = self._count_q_balance()
    return balance == (0, 0)
9739 12768
@property
def mediabox_size(self):
    """Point made of the x1 and y1 coordinates of the mediabox."""
    box = self.mediabox
    return Point(box.x1, box.y1)
9743 12772
def new_shape(self):
    """Return a new Shape object created for this page."""
    shape = Shape(self)
    return shape
12775
9744 #@property 12776 #@property
9745 #def parent( self): 12777 #def parent( self):
9746 # assert self._parent 12778 # assert self._parent
9747 # if self._parent: 12779 # if self._parent:
9748 # return self._parent 12780 # return self._parent
9757 CheckParent(self) 12789 CheckParent(self)
9758 doc = self.parent 12790 doc = self.parent
9759 page = doc.reload_page(self) 12791 page = doc.reload_page(self)
9760 # fixme this looks wrong. 12792 # fixme this looks wrong.
9761 self.this = page 12793 self.this = page
12794
def replace_image(
    page: 'Page',
    xref: int,
    *,
    filename=None,
    pixmap=None,
    stream=None,
):
    """Replace the image referred to by xref.

    Replace the image by changing the object definition stored under xref. This
    will leave the pages appearance instructions intact, so the new image is
    being displayed with the same bbox, rotation etc.
    By providing a small fully transparent image, an effect as if the image had
    been deleted can be achieved.
    A typical use may include replacing large images by a smaller version,
    e.g. with a lower resolution or graylevel instead of colored.

    Args:
        xref: the xref of the image to replace.
        filename, pixmap, stream: exactly one of these must be provided. The
            meaning being the same as in Page.insert_image.
    Raises:
        ValueError: xref is not an image, or not exactly one image source
            was given.
    """
    doc = page.parent  # the owning document
    if not doc.xref_is_image(xref):
        raise ValueError("xref not an image")
    source_count = bool(filename) + bool(stream) + bool(pixmap)
    if source_count != 1:
        raise ValueError("Exactly one of filename/stream/pixmap must be given")

    # insert new image anywhere in page, then copy its definition over
    # the old one
    new_xref = page.insert_image(
        page.rect,
        filename=filename,
        stream=stream,
        pixmap=pixmap,
    )
    doc.xref_copy(new_xref, xref)

    # new image insertion has created a new /Contents source,
    # which we will set to spaces now
    contents_xrefs = page.get_contents()
    doc.update_stream(contents_xrefs[-1], b" ")

    page._image_info = None  # clear cache of extracted image information
9762 12832
9763 @property 12833 @property
9764 def rotation(self): 12834 def rotation(self):
9765 """Page rotation.""" 12835 """Page rotation."""
9766 CheckParent(self) 12836 CheckParent(self)
9778 """Run page through a device. 12848 """Run page through a device.
9779 dw: DeviceWrapper 12849 dw: DeviceWrapper
9780 """ 12850 """
9781 CheckParent(self) 12851 CheckParent(self)
9782 mupdf.fz_run_page(self.this, dw.device, JM_matrix_from_py(m), mupdf.FzCookie()) 12852 mupdf.fz_run_page(self.this, dw.device, JM_matrix_from_py(m), mupdf.FzCookie())
12853
def search_for(
    page,
    text,
    *,
    clip=None,
    quads=False,
    flags=None,
    textpage=None,
) -> list:
    """Search for a string on a page.

    Args:
        text: string to be searched for
        clip: restrict search to this rectangle
        quads: (bool) return quads instead of rectangles
        flags: bit switches, default: join hyphened words
        textpage: a pre-created pymupdf.TextPage. If given, 'clip' and
            'flags' are not used, because no text page is created.
    Returns:
        a list of rectangles or quads, each containing one occurrence.
    Raises:
        ValueError: 'textpage' does not belong to this page.
    """
    if flags is None:
        flags = (
            0
            | TEXT_DEHYPHENATE
            | TEXT_PRESERVE_WHITESPACE
            | TEXT_PRESERVE_LIGATURES
            | TEXT_MEDIABOX_CLIP
        )
    if clip is not None:
        clip = Rect(clip)

    CheckParent(page)

    if textpage is not None:
        # a supplied text page must have been made from this page
        if textpage.parent != page:
            raise ValueError("not a textpage of this page")
        return textpage.search(text, quads=quads)

    # no text page given: make a temporary pymupdf.TextPage
    own_textpage = page.get_textpage(clip=clip, flags=flags)
    hits = own_textpage.search(text, quads=quads)
    del own_textpage
    return hits
9783 12894
def set_artbox(self, rect):
    """Set the ArtBox.

    Returns whatever self._set_pagebox returns for "ArtBox" and rect.
    """
    result = self._set_pagebox("ArtBox", rect)
    return result
9787 12898
9846 12957
def set_trimbox(self, rect):
    """Set the TrimBox.

    Returns whatever self._set_pagebox returns for "TrimBox" and rect.
    """
    result = self._set_pagebox("TrimBox", rect)
    return result
9850 12961
def show_pdf_page(
    page,
    rect,
    docsrc,
    pno=0,
    keep_proportion=True,
    overlay=True,
    oc=0,
    rotate=0,
    clip=None,
) -> int:
    """Show page number 'pno' of PDF 'docsrc' in rectangle 'rect'.

    Args:
        rect: (rect-like) where to place the source image
        docsrc: (document) source PDF
        pno: (int) source page number. Negative values count from the end.
        keep_proportion: (bool) do not change width-height-ratio
        overlay: (bool) put in foreground
        oc: (xref) make visibility dependent on this OCG / OCMD (which must be defined in the target PDF)
        rotate: (int) degrees (multiple of 90)
        clip: (rect-like) part of source page rectangle
    Returns:
        xref of inserted object (for reuse)
    Raises:
        ValueError: one of the documents is no PDF, 'rect' or the resulting
            source rectangle is empty or infinite, source and target are
            the same document, or a negative 'pno' is used with a source
            document that has no pages.
    """
    def calc_matrix(sr, tr, keep=True, rotate=0):
        """Calculate transformation matrix from source to target rect.

        Notes:
            The product of four matrices in this sequence: (1) translate
            the center of the source rect to the origin, (2) rotate,
            (3) scale, (4) translate to the center of the target rect.
        Args:
            sr: source rect in PDF (!) coordinate system
            tr: target rect in PDF coordinate system
            keep: whether to keep source ratio of width to height
            rotate: rotation angle in degrees
        Returns:
            Transformation matrix.
        """
        # calc center point of source rect
        smp = (sr.tl + sr.br) / 2.0
        # calc center point of target rect
        tmp = (tr.tl + tr.br) / 2.0

        # m moves to (0, 0), then rotates
        m = Matrix(1, 0, 0, 1, -smp.x, -smp.y) * Matrix(rotate)

        sr1 = sr * m  # resulting source rect to calculate scale factors

        fw = tr.width / sr1.width  # scale the width
        fh = tr.height / sr1.height  # scale the height
        if keep:
            fw = fh = min(fw, fh)  # take min if keeping aspect ratio

        m *= Matrix(fw, fh)  # concat scale matrix
        m *= Matrix(1, 0, 0, 1, tmp.x, tmp.y)  # concat move to target center
        return JM_TUPLE(m)

    CheckParent(page)
    doc = page.parent

    if not doc.is_pdf or not docsrc.is_pdf:
        raise ValueError("is no PDF")

    # 'rect' is documented as rect-like: convert it before using Rect
    # attributes, so that plain sequences are accepted as well.
    rect = Rect(rect)
    if rect.is_empty or rect.is_infinite:
        raise ValueError("rect must be finite and not empty")

    if pno < 0:  # support negative page numbers
        # adding a page count of 0 would never end the loop below
        if docsrc.page_count < 1:
            raise ValueError("source document has no pages")
        while pno < 0:
            pno += docsrc.page_count
    src_page = docsrc[pno]  # load source page

    tar_rect = rect * ~page.transformation_matrix  # target rect in PDF coordinates

    src_rect = src_page.rect if not clip else src_page.rect & clip  # source rect
    if src_rect.is_empty or src_rect.is_infinite:
        raise ValueError("clip must be finite and not empty")
    src_rect = src_rect * ~src_page.transformation_matrix  # ... in PDF coord

    matrix = calc_matrix(src_rect, tar_rect, keep=keep_proportion, rotate=rotate)

    # list of existing /Form /XObjects, images and fonts of the page
    ilst = [i[1] for i in doc.get_page_xobjects(page.number)]
    ilst += [i[7] for i in doc.get_page_images(page.number)]
    ilst += [i[4] for i in doc.get_page_fonts(page.number)]

    # create a name not in that list
    n = "fzFrm"
    i = 0
    _imgname = n + "0"
    while _imgname in ilst:
        i += 1
        _imgname = n + str(i)

    isrc = docsrc._graft_id  # used as key for graftmaps
    if doc._graft_id == isrc:
        raise ValueError("source document must not equal target")

    # retrieve / make Graftmap for source PDF
    gmap = doc.Graftmaps.get(isrc, None)
    if gmap is None:
        gmap = Graftmap(doc)
        doc.Graftmaps[isrc] = gmap

    # take note of generated xref for automatic reuse
    pno_id = (isrc, pno)  # id of docsrc[pno]
    xref = doc.ShownPages.get(pno_id, 0)

    if overlay:
        page.wrap_contents()  # ensure a balanced graphics state
    xref = page._show_pdf_page(
        src_page,
        overlay=overlay,
        matrix=matrix,
        xref=xref,
        oc=oc,
        clip=src_rect,
        graftmap=gmap,
        _imgname=_imgname,
    )
    doc.ShownPages[pno_id] = xref

    return xref
13085
9851 @property 13086 @property
9852 def transformation_matrix(self): 13087 def transformation_matrix(self):
9853 """Page transformation matrix.""" 13088 """Page transformation matrix."""
9854 CheckParent(self) 13089 CheckParent(self)
9855 13090
9873 rect = self._other_box("TrimBox") 13108 rect = self._other_box("TrimBox")
9874 if rect is None: 13109 if rect is None:
9875 return self.cropbox 13110 return self.cropbox
9876 mb = self.mediabox 13111 mb = self.mediabox
9877 return Rect(rect[0], mb.y1 - rect[3], rect[2], mb.y1 - rect[1]) 13112 return Rect(rect[0], mb.y1 - rect[3], rect[2], mb.y1 - rect[1])
13113
def update_link(page: 'Page', lnk: dict) -> None:
    """Update a link on the current page.

    Args:
        lnk: (dict) the link description. Its "xref" entry names the
            object that receives the new definition.
    Raises:
        ValueError: utils.getLinkText returned an empty string for 'lnk'.
    """
    CheckParent(page)
    link_source = utils.getLinkText(page, lnk)
    if link_source == "":
        raise ValueError("link kind not supported")

    document = page.parent
    document.update_object(lnk["xref"], link_source, page=page)
9878 13122
9879 def widgets(self, types=None): 13123 def widgets(self, types=None):
9880 """ Generator over the widgets of a page. 13124 """ Generator over the widgets of a page.
9881 13125
9882 Args: 13126 Args:
9900 prepend = b"q\n" * push 13144 prepend = b"q\n" * push
9901 TOOLS._insert_contents(self, prepend, False) 13145 TOOLS._insert_contents(self, prepend, False)
9902 if pop > 0: # append required pop commands 13146 if pop > 0: # append required pop commands
9903 append = b"\nQ" * pop + b"\n" 13147 append = b"\nQ" * pop + b"\n"
9904 TOOLS._insert_contents(self, append, True) 13148 TOOLS._insert_contents(self, append, True)
13149
def write_text(
    page: 'Page',
    rect=None,
    writers=None,
    overlay=True,
    color=None,
    opacity=None,
    keep_proportion=True,
    rotate=0,
    oc=0,
) -> None:
    """Write the text of one or more pymupdf.TextWriter objects.

    A single TextWriter without rotation and without target rectangle is
    written to the page directly. In all other cases the writers are
    written to a page of a temporary document, which is then shown on the
    target page via show_pdf_page.

    Args:
        rect: target rectangle. If None, the union of the text writers is used.
        writers: one or more pymupdf.TextWriter objects.
        overlay: put in foreground or background.
        color: handed to each writer's write_text.
        opacity: handed to each writer's write_text.
        keep_proportion: maintain aspect ratio of rectangle sides.
        rotate: arbitrary rotation angle.
        oc: the xref of an optional content object
    Raises:
        ValueError: no writer was given.
    """
    assert isinstance(page, Page)
    if not writers:
        raise ValueError("need at least one pymupdf.TextWriter")

    if type(writers) is TextWriter:
        if rotate == 0 and rect is None:
            # simple case: let the writer put its text on the page itself
            writers.write_text(page, opacity=opacity, color=color, overlay=overlay)
            return None
        writers = (writers,)

    # collect all text on a temporary page of the same size as the target
    clip = writers[0].text_rect
    textdoc = Document()
    tpage = textdoc.new_page(width=page.rect.width, height=page.rect.height)
    for writer in writers:
        clip |= writer.text_rect
        writer.write_text(tpage, opacity=opacity, color=color)

    if rect is None:
        rect = clip

    page.show_pdf_page(
        rect,
        textdoc,
        0,
        overlay=overlay,
        keep_proportion=keep_proportion,
        rotate=rotate,
        clip=clip,
        oc=oc,
    )
    textdoc = None
    tpage = None
9905 13200
9906 @property 13201 @property
9907 def xref(self): 13202 def xref(self):
9908 """PDF xref number of page.""" 13203 """PDF xref number of page."""
9909 CheckParent(self) 13204 CheckParent(self)
11501 irect = property(round) 14796 irect = property(round)
11502 tl = top_left 14797 tl = top_left
11503 tr = top_right 14798 tr = top_right
11504 14799
11505 14800
14801 class Shape:
14802 """Create a new shape."""
14803
@staticmethod
def horizontal_angle(C, P):
    """Return the angle to the horizontal for the connection from C to P.

    The arcus sine of the absolute y component of the unit vector
    S = P - C is computed first. Its inherent ambiguity is then resolved
    by looking at the signs of the components of S.
    """
    direction = Point(P - C).unit  # unit vector 'C' -> 'P'
    angle = math.asin(abs(direction.y))  # absolute angle from horizontal
    if direction.x < 0:
        # negative x component: measure against the opposite direction
        angle = math.pi - angle
        if direction.y <= 0:
            angle = -angle
    elif direction.y < 0:
        angle = -angle
    return angle
14823
def __init__(self, page: Page):
    """Bind the shape to a page of a PDF and reset all drawing buffers.

    Raises:
        ValueError: if the page's parent document is not a PDF.
    """
    CheckParent(page)
    self.page = page
    self.doc = page.parent
    if not self.doc.is_pdf:
        raise ValueError("is no PDF")

    # page geometry
    media_size = page.mediabox_size
    crop_origin = page.cropbox_position
    self.height, self.width = media_size.y, media_size.x
    self.x, self.y = crop_origin.x, crop_origin.y

    # page transformation matrix and its inverse
    self.pctm = page.transformation_matrix
    self.ipctm = ~self.pctm

    # nothing drawn or written yet
    self.draw_cont = self.text_cont = self.totalcont = ""
    self.last_point = None
    self.rect = None
14843
def updateRect(self, x):
    """Enlarge the shape's bounding rectangle so that it includes x.

    x is treated as point-like if its length is 2, otherwise as rect-like.
    The very first call simply adopts x as the rectangle.
    """
    is_point = len(x) == 2
    if self.rect is None:
        self.rect = Rect(x, x) if is_point else Rect(x)
        return

    if is_point:
        pt = Point(x)
        lo_x, lo_y, hi_x, hi_y = pt.x, pt.y, pt.x, pt.y
    else:
        box = Rect(x)
        lo_x, lo_y, hi_x, hi_y = box.x0, box.y0, box.x1, box.y1

    self.rect.x0 = min(self.rect.x0, lo_x)
    self.rect.y0 = min(self.rect.y0, lo_y)
    self.rect.x1 = max(self.rect.x1, hi_x)
    self.rect.y1 = max(self.rect.y1, hi_y)
14864
def draw_line(self, p1: point_like, p2: point_like) -> Point:
    """Append a straight segment from p1 to p2 and return p2.

    A "move to" is emitted only when p1 is not the current last point.
    """
    start, end = Point(p1), Point(p2)
    inverse = self.ipctm

    if not (self.last_point == start):
        self.draw_cont += _format_g(JM_TUPLE(start * inverse)) + " m\n"
        self.last_point = start
        self.updateRect(start)

    self.draw_cont += _format_g(JM_TUPLE(end * inverse)) + " l\n"
    self.updateRect(end)
    self.last_point = end
    return end
14878
def draw_polyline(self, points: list) -> Point:
    """Connect the given points by line segments and return the last one.

    The first point starts the path ("move to") unless it already is the
    current last point; each further point adds a "line to".
    """
    for index, raw in enumerate(points):
        vertex = Point(raw)
        if index:
            operator = " l\n"
        elif self.last_point == vertex:
            operator = None  # path already is at the first point
        else:
            operator = " m\n"

        if operator is not None:
            self.draw_cont += _format_g(JM_TUPLE(vertex * self.ipctm)) + operator
            if not index:
                self.last_point = vertex
        self.updateRect(raw)

    self.last_point = Point(points[-1])
    return self.last_point
14892
def draw_bezier(
    self,
    p1: point_like,
    p2: point_like,
    p3: point_like,
    p4: point_like,
) -> Point:
    """Append a cubic Bezier curve from p1 to p4 (controls p2, p3); return p4."""
    start, ctrl1, ctrl2, end = (Point(p) for p in (p1, p2, p3, p4))
    inverse = self.ipctm

    if not (self.last_point == start):
        self.draw_cont += _format_g(JM_TUPLE(start * inverse)) + " m\n"

    # the 'c' operator takes both control points followed by the end point
    coords = []
    for pt in (ctrl1, ctrl2, end):
        coords.extend(pt * inverse)
    self.draw_cont += _format_g(JM_TUPLE(coords)) + " c\n"

    for pt in (start, ctrl1, ctrl2, end):
        self.updateRect(pt)
    self.last_point = end
    return end
14915
def draw_oval(self, tetra: typing.Union[quad_like, rect_like]) -> Point:
    """Draw an ellipse touching the side midpoints of a rectangle or quad.

    Raises:
        ValueError: if tetra does not have exactly four items.
    """
    if len(tetra) != 4:
        raise ValueError("invalid arg length")
    # four numbers mean a rectangle, four points mean a quad
    q = Rect(tetra).quad if hasattr(tetra[0], "__float__") else Quad(tetra)

    def halfway(a, b):
        return a + (b - a) * 0.5

    mt = halfway(q.ul, q.ur)  # middle of top side
    mr = halfway(q.ur, q.lr)  # middle of right side
    mb = halfway(q.ll, q.lr)  # middle of bottom side
    ml = halfway(q.ul, q.ll)  # middle of left side

    if not (self.last_point == ml):
        self.draw_cont += _format_g(JM_TUPLE(ml * self.ipctm)) + " m\n"
        self.last_point = ml

    # one quarter curve per corner: left -> bottom -> right -> top -> left
    quarters = (
        (ml, q.ll, mb),
        (mb, q.lr, mr),
        (mr, q.ur, mt),
        (mt, q.ul, ml),
    )
    for begin, corner, finish in quarters:
        self.draw_curve(begin, corner, finish)

    self.updateRect(q.rect)
    self.last_point = ml
    return ml
14939
def draw_circle(self, center: point_like, radius: float) -> Point:
    """Draw a full circle around center as a 360 degree open sector.

    Raises:
        ValueError: if radius is not greater than EPSILON.
    """
    if not (radius > EPSILON):
        raise ValueError("radius must be positive")
    middle = Point(center)
    leftmost = middle - (radius, 0)  # start point of the arc
    return self.draw_sector(middle, leftmost, 360, fullSector=False)
14947
def draw_curve(
    self,
    p1: point_like,
    p2: point_like,
    p3: point_like,
) -> Point:
    """Draw a curve from p1 to p3 that is pulled towards p2.

    Both Bezier control points lie on the lines to p2, at the fixed
    fraction 'kappa' of the way from the respective end point.
    """
    kappa = 0.55228474983
    start, apex, end = Point(p1), Point(p2), Point(p3)
    ctrl_start = start + (apex - start) * kappa
    ctrl_end = end + (apex - end) * kappa
    return self.draw_bezier(start, ctrl_start, ctrl_end, end)
14962
def draw_sector(
    self,
    center: point_like,
    point: point_like,
    beta: float,
    fullSector: bool = True,
) -> Point:
    """Draw a circle sector.

    Args:
        center: center of the circle.
        point: start point of the arc on the circle.
        beta: angle of the sector in degrees. Its sign selects the
            direction of rotation; magnitudes above 360 are reduced.
        fullSector: if true, additionally connect the start point with
            the center and the center with the end of the arc.
    Returns:
        The end point of the arc. If the angle is too small for any arc
        to be drawn, this is the start point itself.
    Raises:
        ValueError: if the distance center -> point is not greater than
            EPSILON.
    """
    center = Point(center)
    point = Point(point)
    l3 = lambda a, b: _format_g((a, b)) + " m\n"
    l4 = lambda a, b, c, d, e, f: _format_g((a, b, c, d, e, f)) + " c\n"
    l5 = lambda a, b: _format_g((a, b)) + " l\n"
    betar = math.radians(-beta)
    w360 = math.radians(math.copysign(360, betar)) * (-1)
    w90 = math.radians(math.copysign(90, betar))
    w45 = w90 / 2
    while abs(betar) > 2 * math.pi:
        betar += w360  # bring angle below 360 degrees
    if not (self.last_point == point):
        self.draw_cont += l3(*JM_TUPLE(point * self.ipctm))
        self.last_point = point
    # End point of the arc. Start with the arc's start point: if no arc
    # segment is drawn below (angle too small), the sector must end where
    # it began instead of at the unrelated coordinate origin.
    Q = point
    C = center
    P = point
    S = P - C  # vector 'center' -> 'point'
    rad = abs(S)  # circle radius

    if not rad > EPSILON:
        raise ValueError("radius must be positive")

    alfa = self.horizontal_angle(center, point)
    while abs(betar) > abs(w90):  # draw 90 degree arcs
        q1 = C.x + math.cos(alfa + w90) * rad
        q2 = C.y + math.sin(alfa + w90) * rad
        Q = Point(q1, q2)  # the arc's end point
        r1 = C.x + math.cos(alfa + w45) * rad / math.cos(w45)
        r2 = C.y + math.sin(alfa + w45) * rad / math.cos(w45)
        R = Point(r1, r2)  # crossing point of tangents
        kappah = (1 - math.cos(w45)) * 4 / 3 / abs(R - Q)
        kappa = kappah * abs(P - Q)
        cp1 = P + (R - P) * kappa  # control point 1
        cp2 = Q + (R - Q) * kappa  # control point 2
        self.draw_cont += l4(*JM_TUPLE(
            list(cp1 * self.ipctm) + list(cp2 * self.ipctm) + list(Q * self.ipctm)
        ))

        betar -= w90  # reduce param angle by 90 deg
        alfa += w90  # advance start angle by 90 deg
        P = Q  # advance to arc end point
    # draw (remaining) arc
    if abs(betar) > 1e-3:  # significant degrees left?
        beta2 = betar / 2
        q1 = C.x + math.cos(alfa + betar) * rad
        q2 = C.y + math.sin(alfa + betar) * rad
        Q = Point(q1, q2)  # the arc's end point
        r1 = C.x + math.cos(alfa + beta2) * rad / math.cos(beta2)
        r2 = C.y + math.sin(alfa + beta2) * rad / math.cos(beta2)
        R = Point(r1, r2)  # crossing point of tangents
        # kappa height is 4/3 of segment height
        kappah = (1 - math.cos(beta2)) * 4 / 3 / abs(R - Q)  # kappa height
        kappa = kappah * abs(P - Q) / (1 - math.cos(betar))
        cp1 = P + (R - P) * kappa  # control point 1
        cp2 = Q + (R - Q) * kappa  # control point 2
        self.draw_cont += l4(*JM_TUPLE(
            list(cp1 * self.ipctm) + list(cp2 * self.ipctm) + list(Q * self.ipctm)
        ))
    if fullSector:
        # start point -> center -> end of arc
        self.draw_cont += l3(*JM_TUPLE(point * self.ipctm))
        self.draw_cont += l5(*JM_TUPLE(center * self.ipctm))
        self.draw_cont += l5(*JM_TUPLE(Q * self.ipctm))
    self.last_point = Q
    return self.last_point
15036
def draw_rect(self, rect: rect_like, *, radius=None) -> Point:
    """Draw a rectangle, optionally with rounded corners.

    Args:
        rect: the rectangle to draw.
        radius: None for a standard rectangle. Otherwise the corner
            curvature as a fraction of the side length, 0 < v <= 0.5.
            A single number is applied to the shorter side; a pair
            gives separate fractions for width and height. A value of
            (0.5, 0.5) will draw an ellipse.
    Raises:
        ValueError: if radius is neither None nor a valid number / pair.
    """
    r = Rect(rect)

    if radius is None:  # standard rectangle: one 're' operator
        operands = list(r.bl * self.ipctm) + [r.width, r.height]
        self.draw_cont += _format_g(JM_TUPLE(operands)) + " re\n"
        self.updateRect(r)
        self.last_point = r.tl
        return self.last_point

    # rounded corners: determine the horizontal / vertical corner offsets
    if hasattr(radius, "__float__"):
        if radius <= 0 or radius > 0.5:
            raise ValueError(f"bad radius value {radius}.")
        offset = min(r.width, r.height) * radius
        px, py = (offset, 0), (0, offset)
    elif hasattr(radius, "__len__") and len(radius) == 2:
        rx, ry = radius
        px, py = (rx * r.width, 0), (0, ry * r.height)
        if min(rx, ry) <= 0 or max(rx, ry) > 0.5:
            raise ValueError(f"bad radius value {radius}.")
    else:
        raise ValueError(f"bad radius value {radius}.")

    # per corner: (end of the straight side, the corner, end of its curve),
    # walking left side -> bottom -> right side -> top
    corners = (
        (r.bl - py, r.bl, r.bl + px),
        (r.br - px, r.br, r.br - py),
        (r.tr + py, r.tr, r.tr - px),
        (r.tl + px, r.tl, r.tl + py),
    )
    lp = r.tl + py
    for side_end, corner, curve_end in corners:
        lp = self.draw_line(lp, side_end)
        lp = self.draw_curve(lp, corner, curve_end)
    self.last_point = lp

    self.updateRect(r)
    return self.last_point
15087
def draw_quad(self, quad: quad_like) -> Point:
    """Draw the outline of a quad as a polyline returning to its start."""
    q = Quad(quad)
    outline = [q.ul, q.ll, q.lr, q.ur]
    outline.append(outline[0])  # back to the upper left corner
    return self.draw_polyline(outline)
15092
def draw_zigzag(
    self,
    p1: point_like,
    p2: point_like,
    breadth: float = 2,
) -> Point:
    """Draw a zig-zagged line from p1 to p2 and return p2.

    Raises:
        ValueError: if the points are too close for one full phase.
    """
    p1 = Point(p1)
    p2 = Point(p2)
    distance = abs(p2 - p1)
    # always take full phases, i.e. a multiple of 4 steps
    cnt = 4 * int(round(distance / (4 * breadth), 0))
    if cnt < 4:
        raise ValueError("points too close")
    step = distance / cnt  # revised breadth
    # the edges are computed for a line on the x-axis and mapped back
    to_original = ~Matrix(util_hor_matrix(p1, p2))
    # only odd steps are edges: alternately "above" and "below" the line
    edges = [
        Point(i, -1 if i % 4 == 1 else 1) * step * to_original
        for i in range(1, cnt, 2)
    ]
    self.draw_polyline([p1] + edges + [p2])
    return p2
15121
def draw_squiggle(
    self,
    p1: point_like,
    p2: point_like,
    breadth=2,
) -> Point:
    """Draw a squiggly (wavy) line from p1 to p2 and return p2.

    Raises:
        ValueError: if the points are too close for one full phase.
    """
    p1 = Point(p1)
    p2 = Point(p2)
    distance = abs(p2 - p1)
    # always take full phases, i.e. a multiple of 4 steps
    cnt = 4 * int(round(distance / (4 * breadth), 0))
    if cnt < 4:
        raise ValueError("points too close")
    step = distance / cnt  # revised breadth
    # the points are computed for a line on the x-axis and mapped back
    to_original = ~Matrix(util_hor_matrix(p1, p2))
    k = 2.4142135623765633  # y of draw_curve helper point
    # helper point "above" / "below"; all other points are on the line
    y_by_phase = {1: -k, 3: k}

    points = [p1]
    for i in range(1, cnt):
        points.append(Point(i, y_by_phase.get(i % 4, 0)) * step * to_original)
    points.append(p2)

    # every second point is the helper point of one curve
    for i in range(0, len(points) - 2, 2):
        self.draw_curve(points[i], points[i + 1], points[i + 2])
    return p2
15158
15159 # ==============================================================================
15160 # Shape.insert_text
15161 # ==============================================================================
def insert_text(
    self,
    point: point_like,
    buffer: typing.Union[str, list],
    *,
    fontsize: float = 11,
    lineheight: OptFloat = None,
    fontname: str = "helv",
    fontfile: OptStr = None,
    set_simple: bool = 0,
    encoding: int = 0,
    color: OptSeq = None,
    fill: OptSeq = None,
    render_mode: int = 0,
    border_width: float = 0.05,
    miter_limit: float = 1,
    rotate: int = 0,
    morph: OptSeq = None,
    stroke_opacity: float = 1,
    fill_opacity: float = 1,
    oc: int = 0,
) -> int:
    """Insert text lines starting at a point.

    The generated commands are appended to the shape's text buffer only;
    nothing is written to the page here.

    Args:
        point: start of the first character of the first line.
        buffer: a string (split into its lines) or a list / tuple of lines.
        fontsize: font size.
        lineheight: if given, line height is fontsize * lineheight.
            Otherwise it is derived from the font's ascender and
            descender, with 1.2 used when their difference is <= 1.
        fontname: font name; a leading "/" is removed.
        fontfile, set_simple, encoding: passed on to page.insert_font().
        color: stroke color.
        fill: fill color. If not given and render_mode is 0, 'color' is
            used as fill color.
        render_mode: text rendering mode; if > 0, border width and miter
            limit are set as well.
        border_width: line width as a factor of fontsize.
        miter_limit: miter limit; skipped if None.
        rotate: must be a multiple of 90.
        morph: sequence of a fixpoint and a matrix, checked by CheckMorph().
        stroke_opacity, fill_opacity: passed on to page._set_opacity().
        oc: passed on to page._get_optional_content().
    Returns:
        Number of lines put into the output; 0 if there is no text or its
        character codes could not be determined.
    Raises:
        ValueError: if rotate is not a multiple of 90.
    """

    # ensure 'text' is a list of strings, worth dealing with
    if not bool(buffer):
        return 0

    if type(buffer) not in (list, tuple):
        text = buffer.splitlines()
    else:
        text = buffer

    if not len(text) > 0:
        return 0

    point = Point(point)
    try:
        # largest character code decides which glyph table is needed
        maxcode = max([ord(c) for c in " ".join(text)])
    except Exception:
        # best effort: report the problem and treat as "nothing written"
        exception_info()
        return 0

    # ensure valid 'fontname'
    fname = fontname
    if fname.startswith("/"):
        fname = fname[1:]

    xref = self.page.insert_font(
        fontname=fname, fontfile=fontfile, encoding=encoding, set_simple=set_simple
    )
    fontinfo = CheckFontInfo(self.doc, xref)

    fontdict = fontinfo[1]
    ordering = fontdict["ordering"]
    simple = fontdict["simple"]
    bfname = fontdict["name"]
    ascender = fontdict["ascender"]
    descender = fontdict["descender"]
    if lineheight:
        lheight = fontsize * lineheight
    elif ascender - descender <= 1:
        lheight = fontsize * 1.2
    else:
        lheight = fontsize * (ascender - descender)

    if maxcode > 255:
        glyphs = self.doc.get_char_widths(xref, maxcode + 1)
    else:
        glyphs = fontdict["glyphs"]

    # convert every line to its TJ operand string
    tab = []
    for t in text:
        if simple and bfname not in ("Symbol", "ZapfDingbats"):
            g = None
        else:
            g = glyphs
        tab.append(getTJstr(t, g, simple, ordering))
    text = tab

    color_str = ColorCode(color, "c")
    fill_str = ColorCode(fill, "f")
    if not fill and render_mode == 0:  # ensure fill color when 0 Tr
        fill = color
        fill_str = ColorCode(color, "f")

    morphing = CheckMorph(morph)
    rot = rotate
    if rot % 90 != 0:
        raise ValueError("bad rotate value")

    while rot < 0:
        rot += 360
    rot = rot % 360  # text rotate = 0, 90, 270, 180

    # templ1: start of the text object, templ2: first line incl. line height
    templ1 = lambda a, b, c, d, e, f, g: f"\nq\n{a}{b}BT\n{c}1 0 0 1 {_format_g((d, e))} Tm\n/{f} {_format_g(g)} Tf "
    templ2 = lambda a: f"TJ\n0 -{_format_g(a)} TD\n"
    cmp90 = "0 1 -1 0 0 0 cm\n"  # rotates 90 deg counter-clockwise
    cmm90 = "0 -1 1 0 0 0 cm\n"  # rotates 90 deg clockwise
    cm180 = "-1 0 0 -1 0 0 cm\n"  # rotates by 180 deg.
    height = self.height
    width = self.width

    # setting up for standard rotation directions
    # case rotate = 0
    if morphing:
        m1 = Matrix(1, 0, 0, 1, morph[0].x + self.x, height - morph[0].y - self.y)
        mat = ~m1 * morph[1] * m1
        cm = _format_g(JM_TUPLE(mat)) + " cm\n"
    else:
        cm = ""
    top = height - point.y - self.y  # start of 1st char
    left = point.x + self.x  # start of 1. char
    space = top  # space available
    #headroom = point.y + self.y  # distance to page border
    if rot == 90:
        left = height - point.y - self.y
        top = -point.x - self.x
        cm += cmp90
        space = width - abs(top)
        #headroom = point.x + self.x

    elif rot == 270:
        left = -height + point.y + self.y
        top = point.x + self.x
        cm += cmm90
        space = abs(top)
        #headroom = width - point.x - self.x

    elif rot == 180:
        left = -point.x - self.x
        top = -height + point.y + self.y
        cm += cm180
        space = abs(point.y + self.y)
        #headroom = height - point.y - self.y

    # optional content: wrap the text in a marked content sequence
    optcont = self.page._get_optional_content(oc)
    if optcont is not None:
        bdc = "/OC /%s BDC\n" % optcont
        emc = "EMC\n"
    else:
        bdc = emc = ""

    alpha = self.page._set_opacity(CA=stroke_opacity, ca=fill_opacity)
    if alpha is None:
        alpha = ""
    else:
        alpha = "/%s gs\n" % alpha
    nres = templ1(bdc, alpha, cm, left, top, fname, fontsize)

    if render_mode > 0:
        nres += "%i Tr " % render_mode
        nres += _format_g(border_width * fontsize) + " w "
        if miter_limit is not None:
            nres += _format_g(miter_limit) + " M "
    if color is not None:
        nres += color_str
    if fill is not None:
        nres += fill_str

    # =========================================================================
    # start text insertion
    # =========================================================================
    # the first line is always output, regardless of the available space
    nres += text[0]
    nlines = 1  # set output line counter
    if len(text) > 1:
        nres += templ2(lheight)  # line 1
    else:
        nres += 'TJ'
    for i in range(1, len(text)):
        if space < lheight:
            break  # no space left on page
        if i > 1:
            nres += "\nT* "
        nres += text[i] + 'TJ'
        space -= lheight
        nlines += 1

    nres += "\nET\n%sQ\n" % emc

    # =========================================================================
    # end of text insertion
    # =========================================================================
    # append to the shape's text buffer
    self.text_cont += nres
    return nlines
15347
15348 # ==============================================================================
15349 # Shape.insert_textbox
15350 # ==============================================================================
def insert_textbox(
    self,
    rect: rect_like,
    buffer: typing.Union[str, list],
    *,
    fontname: OptStr = "helv",
    fontfile: OptStr = None,
    fontsize: float = 11,
    lineheight: OptFloat = None,
    set_simple: bool = 0,
    encoding: int = 0,
    color: OptSeq = None,
    fill: OptSeq = None,
    expandtabs: int = 1,
    border_width: float = 0.05,
    miter_limit: float = 1,
    align: int = 0,
    render_mode: int = 0,
    rotate: int = 0,
    morph: OptSeq = None,
    stroke_opacity: float = 1,
    fill_opacity: float = 1,
    oc: int = 0,
) -> float:
    """Insert text into a given rectangle.

    Args:
        rect -- the textbox to fill
        buffer -- text to be inserted (string, or list / tuple of lines)
        fontname -- a Base-14 font, font name or '/name'
        fontfile -- name of a font file
        fontsize -- font size
        lineheight -- overwrite the font property
        color -- RGB stroke color triple
        fill -- RGB fill color triple
        render_mode -- text rendering control
        border_width -- thickness of glyph borders as percentage of fontsize
        miter_limit -- miter limit, only used if render_mode > 0
        expandtabs -- handles tabulators with string function
        align -- left, center, right, justified
        rotate -- 0, 90, 180, or 270 degrees
        morph -- morph box with a matrix and a fixpoint
        stroke_opacity, fill_opacity -- passed on to page._set_opacity()
        oc -- passed on to page._get_optional_content()
    Returns:
        unused or deficit rectangle area (float). A negative value means
        the text did not fit and nothing was output. If there is no text
        at all, the complete height (or width, for 90 / 270 degrees
        rotation) of the rectangle is returned.
    Raises:
        ValueError: if the rectangle is empty or infinite, or if rotate
            is not a multiple of 90.
    """
    rect = Rect(rect)
    if rect.is_empty or rect.is_infinite:
        raise ValueError("text box must be finite and not empty")

    color_str = ColorCode(color, "c")
    fill_str = ColorCode(fill, "f")
    if fill is None and render_mode == 0:  # ensure fill color for 0 Tr
        fill = color
        fill_str = ColorCode(color, "f")

    optcont = self.page._get_optional_content(oc)
    if optcont is not None:
        bdc = "/OC /%s BDC\n" % optcont
        emc = "EMC\n"
    else:
        bdc = emc = ""

    # determine opacity / transparency
    alpha = self.page._set_opacity(CA=stroke_opacity, ca=fill_opacity)
    if alpha is None:
        alpha = ""
    else:
        alpha = "/%s gs\n" % alpha

    if rotate % 90 != 0:
        raise ValueError("rotate must be multiple of 90")

    rot = rotate
    while rot < 0:
        rot += 360
    rot = rot % 360

    # Flatten the buffer to one string before testing for emptiness:
    # a sequence like [""] is truthy, but contains no character at all
    # and would otherwise crash in max() further down.
    if type(buffer) in (list, tuple):
        t0 = "\n".join(buffer)
    else:
        t0 = buffer

    # is buffer worth of dealing with?
    if not bool(t0):
        return rect.height if rot in (0, 180) else rect.width

    cmp90 = "0 1 -1 0 0 0 cm\n"  # rotates counter-clockwise
    cmm90 = "0 -1 1 0 0 0 cm\n"  # rotates clockwise
    cm180 = "-1 0 0 -1 0 0 cm\n"  # rotates by 180 deg.
    height = self.height

    fname = fontname
    if fname.startswith("/"):
        fname = fname[1:]

    xref = self.page.insert_font(
        fontname=fname, fontfile=fontfile, encoding=encoding, set_simple=set_simple
    )
    fontinfo = CheckFontInfo(self.doc, xref)

    fontdict = fontinfo[1]
    ordering = fontdict["ordering"]
    simple = fontdict["simple"]
    glyphs = fontdict["glyphs"]
    bfname = fontdict["name"]
    ascender = fontdict["ascender"]
    descender = fontdict["descender"]

    if lineheight:
        lheight_factor = lineheight
    elif ascender - descender <= 1:
        lheight_factor = 1.2
    else:
        lheight_factor = ascender - descender
    lheight = fontsize * lheight_factor

    maxcode = max([ord(c) for c in t0])
    # replace invalid char codes for simple fonts
    if simple and maxcode > 255:
        t0 = "".join([c if ord(c) < 256 else "?" for c in t0])

    # create a list from the text, split into its lines
    t0 = t0.splitlines()

    glyphs = self.doc.get_char_widths(xref, maxcode + 1)
    if simple and bfname not in ("Symbol", "ZapfDingbats"):
        tj_glyphs = None
    else:
        tj_glyphs = glyphs

    # ----------------------------------------------------------------------
    # calculate pixel length of a string
    # ----------------------------------------------------------------------
    def pixlen(x):
        """Calculate pixel length of x."""
        if ordering < 0:
            return sum([glyphs[ord(c)][1] for c in x]) * fontsize
        else:
            return len(x) * fontsize

    # ---------------------------------------------------------------------

    if ordering < 0:
        blen = glyphs[32][1] * fontsize  # pixel size of space character
    else:
        blen = fontsize

    text = ""  # output buffer

    if CheckMorph(morph):
        m1 = Matrix(
            1, 0, 0, 1, morph[0].x + self.x, self.height - morph[0].y - self.y
        )
        mat = ~m1 * morph[1] * m1
        cm = _format_g(JM_TUPLE(mat)) + " cm\n"
    else:
        cm = ""

    # ---------------------------------------------------------------------
    # adjust for text orientation / rotation
    # ---------------------------------------------------------------------
    progr = 1  # direction of line progress
    c_pnt = Point(0, fontsize * ascender)  # used for line progress
    if rot == 0:  # normal orientation
        point = rect.tl + c_pnt  # line 1 is 'lheight' below top
        maxwidth = rect.width  # pixels available in one line
        maxheight = rect.height  # available text height

    elif rot == 90:  # rotate counter clockwise
        c_pnt = Point(fontsize * ascender, 0)  # progress in x-direction
        point = rect.bl + c_pnt  # line 1 'lheight' away from left
        maxwidth = rect.height  # pixels available in one line
        maxheight = rect.width  # available text height
        cm += cmp90

    elif rot == 180:  # text upside down
        # progress upwards in y direction
        c_pnt = -Point(0, fontsize * ascender)
        point = rect.br + c_pnt  # line 1 'lheight' above bottom
        maxwidth = rect.width  # pixels available in one line
        progr = -1  # subtract lheight for next line
        maxheight = rect.height  # available text height
        cm += cm180

    else:  # rotate clockwise (270 or -90)
        # progress from right to left
        c_pnt = -Point(fontsize * ascender, 0)
        point = rect.tr + c_pnt  # line 1 'lheight' left of right
        maxwidth = rect.height  # pixels available in one line
        progr = -1  # subtract lheight for next line
        maxheight = rect.width  # available text height
        cm += cmm90

    # =====================================================================
    # line loop
    # =====================================================================
    just_tab = []  # 'justify' indicators per line

    for i, line in enumerate(t0):
        line_t = line.expandtabs(expandtabs).split(" ")  # split into words
        num_words = len(line_t)
        lbuff = ""  # init line buffer
        rest = maxwidth  # available line pixels
        # =================================================================
        # word loop
        # =================================================================
        for j in range(num_words):
            word = line_t[j]
            pl_w = pixlen(word)  # pixel len of word
            if rest >= pl_w:  # does it fit on the line?
                lbuff += word + " "  # yes, append word
                rest -= pl_w + blen  # update available line space
                continue  # next word

            # word doesn't fit - output line (if not empty)
            if lbuff:
                lbuff = lbuff.rstrip() + "\n"  # line full, append line break
                text += lbuff  # append to total text
                just_tab.append(True)  # can align-justify

            lbuff = ""  # re-init line buffer
            rest = maxwidth  # re-init avail. space

            if pl_w <= maxwidth:  # word shorter than 1 line?
                lbuff = word + " "  # start the line with it
                rest = maxwidth - pl_w - blen  # update free space
                continue

            # long word: split across multiple lines - char by char ...
            if len(just_tab) > 0:
                just_tab[-1] = False  # cannot align-justify
            for c in word:
                if pixlen(lbuff) <= maxwidth - pixlen(c):
                    lbuff += c
                else:  # line full
                    lbuff += "\n"  # close line
                    text += lbuff  # append to text
                    just_tab.append(False)  # cannot align-justify
                    lbuff = c  # start new line with this char

            lbuff += " "  # finish long word
            rest = maxwidth - pixlen(lbuff)  # long word stored

        if lbuff:  # unprocessed line content?
            text += lbuff.rstrip()  # append to text
            just_tab.append(False)  # cannot align-justify

        if i < len(t0) - 1:  # not the last line?
            text += "\n"  # insert line break

    # compute used part of the textbox
    if text.endswith("\n"):
        text = text[:-1]
    lb_count = text.count("\n") + 1  # number of lines written

    # text height = line count * line height plus one descender value
    text_height = lheight * lb_count - descender * fontsize

    more = text_height - maxheight  # difference to height limit
    if more > EPSILON:  # landed too much outside rect
        return (-1) * more  # return deficit, don't output

    more = abs(more)
    if more < EPSILON:
        more = 0  # don't bother with epsilons
    nres = "\nq\n%s%sBT\n" % (bdc, alpha) + cm  # initialize output buffer
    templ = lambda a, b, c, d: f"1 0 0 1 {_format_g((a, b))} Tm /{c} {_format_g(d)} Tf "
    # center, right, justify: output each line with its own specifics
    text_t = text.splitlines()  # split text in lines again
    just_tab[-1] = False  # never justify last line
    for i, t in enumerate(text_t):
        spacing = 0
        pl = maxwidth - pixlen(t)  # length of empty line part
        pnt = point + c_pnt * (i * lheight_factor)  # text start of line
        if align == 1:  # center: right shift by half width
            if rot in (0, 180):
                pnt = pnt + Point(pl / 2, 0) * progr
            else:
                pnt = pnt - Point(0, pl / 2) * progr
        elif align == 2:  # right: right shift by full width
            if rot in (0, 180):
                pnt = pnt + Point(pl, 0) * progr
            else:
                pnt = pnt - Point(0, pl) * progr
        elif align == 3:  # justify
            spaces = t.count(" ")  # number of spaces in line
            if spaces > 0 and just_tab[i]:  # if any, and we may justify
                spacing = pl / spaces  # make every space this much larger
            else:
                spacing = 0  # keep normal space length
        top = height - pnt.y - self.y
        left = pnt.x + self.x
        if rot == 90:
            left = height - pnt.y - self.y
            top = -pnt.x - self.x
        elif rot == 270:
            left = -height + pnt.y + self.y
            top = pnt.x + self.x
        elif rot == 180:
            left = -pnt.x - self.x
            top = -height + pnt.y + self.y

        nres += templ(left, top, fname, fontsize)

        if render_mode > 0:
            nres += "%i Tr " % render_mode
            nres += _format_g(border_width * fontsize) + " w "
            if miter_limit is not None:
                nres += _format_g(miter_limit) + " M "

        if align == 3:
            nres += _format_g(spacing) + " Tw "

        if color is not None:
            nres += color_str
        if fill is not None:
            nres += fill_str
        nres += "%sTJ\n" % getTJstr(t, tj_glyphs, simple, ordering)

    nres += "ET\n%sQ\n" % emc

    self.text_cont += nres
    self.updateRect(rect)
    return more
15674
def finish(
    self,
    width: float = 1,
    color: OptSeq = (0,),
    fill: OptSeq = None,
    lineCap: int = 0,
    lineJoin: int = 0,
    dashes: OptStr = None,
    even_odd: bool = False,
    morph: OptSeq = None,
    closePath: bool = True,
    fill_opacity: float = 1,
    stroke_opacity: float = 1,
    oc: int = 0,
) -> None:
    """Finish the current drawing segment.

    Notes:
        Apply colors, opacity, dashes, line style and width, or
        morphing. Also whether to close the path
        by connecting last to first point.

        The finished segment is wrapped in "q" ... "Q", appended to the
        shape's total content, and the draw buffer is emptied. Nothing
        happens if the draw buffer is empty.

    Args:
        width: line width. A width of 0 disables the stroke color.
        color: stroke color. None sets the width to 0.
        fill: fill color, None for no filling.
        lineCap: emitted with the "J" operator if not 0.
        lineJoin: emitted with the "j" operator if not 0.
        dashes: dash pattern string for the "d" operator; None, "" and
            "[] 0" are skipped.
        even_odd: choose the "*" variant of the fill operators.
        morph: sequence of a fixpoint and a matrix, checked by
            CheckMorph().
        closePath: emit the "h" operator.
        fill_opacity, stroke_opacity: passed on to page._set_opacity().
        oc: passed on to page._get_optional_content().
    """
    if self.draw_cont == "":  # treat empty contents as no-op
        return

    if width == 0:  # border color makes no sense then
        color = None
    elif color is None:  # vice versa
        width = 0
    # if color == None and fill == None:
    #     raise ValueError("at least one of 'color' or 'fill' must be given")
    color_str = ColorCode(color, "c")  # ensure proper color string
    fill_str = ColorCode(fill, "f")  # ensure proper fill string

    # NOTE: from here on, commands are either prepended or appended to
    # the path commands - the order of the statements matters.
    optcont = self.page._get_optional_content(oc)
    if optcont is not None:
        self.draw_cont = "/OC /%s BDC\n" % optcont + self.draw_cont
        emc = "EMC\n"
    else:
        emc = ""

    alpha = self.page._set_opacity(CA=stroke_opacity, ca=fill_opacity)
    if alpha is not None:
        self.draw_cont = "/%s gs\n" % alpha + self.draw_cont

    if width != 1 and width != 0:
        self.draw_cont += _format_g(width) + " w\n"

    if lineCap != 0:
        self.draw_cont = "%i J\n" % lineCap + self.draw_cont
    if lineJoin != 0:
        self.draw_cont = "%i j\n" % lineJoin + self.draw_cont

    if dashes not in (None, "", "[] 0"):
        self.draw_cont = "%s d\n" % dashes + self.draw_cont

    if closePath:
        self.draw_cont += "h\n"
        self.last_point = None

    if color is not None:
        self.draw_cont += color_str

    # choose the painting operator: fill and stroke, fill only, or stroke
    if fill is not None:
        self.draw_cont += fill_str
        if color is not None:
            if not even_odd:
                self.draw_cont += "B\n"
            else:
                self.draw_cont += "B*\n"
        else:
            if not even_odd:
                self.draw_cont += "f\n"
            else:
                self.draw_cont += "f*\n"
    else:
        self.draw_cont += "S\n"

    self.draw_cont += emc
    if CheckMorph(morph):
        m1 = Matrix(
            1, 0, 0, 1, morph[0].x + self.x, self.height - morph[0].y - self.y
        )
        mat = ~m1 * morph[1] * m1
        self.draw_cont = _format_g(JM_TUPLE(mat)) + " cm\n" + self.draw_cont

    self.totalcont += "\nq\n" + self.draw_cont + "Q\n"
    self.draw_cont = ""
    self.last_point = None
    return
15765
def commit(self, overlay: bool = True) -> None:
    """Update the page's /Contents object with Shape data.

    The argument controls whether data appear in foreground (default)
    or background.

    Finished drawings and inserted text are written to a new contents
    object of the page; afterwards the shape is reset for re-use. If
    writing fails, the shape's buffers are left unchanged.
    """
    CheckParent(self.page)  # doc may have died meanwhile
    # Encode into a local: rebinding self.totalcont to bytes would leave
    # the shape unusable (str buffers mixed with bytes) if one of the
    # calls below raises.
    content = (self.totalcont + self.text_cont).encode()

    if content:
        if overlay:
            self.page.wrap_contents()  # ensure a balanced graphics state
        # make /Contents object with dummy stream
        xref = TOOLS._insert_contents(self.page, b" ", overlay)
        # update it with potential compression
        self.doc.update_stream(xref, content)

    # clean up for potential re-use
    self.last_point = None
    self.rect = None
    self.draw_cont = ""
    self.text_cont = ""
    self.totalcont = ""
15789
15790
11506 class Story: 15791 class Story:
11507 15792
11508 def __init__( self, html='', user_css=None, em=12, archive=None): 15793 def __init__( self, html='', user_css=None, em=12, archive=None):
11509 buffer_ = mupdf.fz_new_buffer_from_copied_data( html.encode('utf-8')) 15794 buffer_ = mupdf.fz_new_buffer_from_copied_data( html.encode('utf-8'))
11510 if archive and not isinstance(archive, Archive): 15795 if archive and not isinstance(archive, Archive):
11662 for k, v in args.items(): 15947 for k, v in args.items():
11663 setattr( position2, k, v) 15948 setattr( position2, k, v)
11664 function( position2) 15949 function( position2)
11665 mupdf.fz_story_positions( self.this, function2) 15950 mupdf.fz_story_positions( self.this, function2)
11666 15951
def place( self, where, flags=0):
    '''
    Wrapper for fz_place_story_flags().

    `where` is converted with `JM_rect_from_py()` and handed to
    `mupdf.fz_place_story_flags()` together with `flags`.

    Returns `(more, filled)`: the value returned by
    `mupdf.fz_place_story_flags()` and the rect it filled in, converted
    with `JM_py_from_rect()`.
    '''
    target = JM_rect_from_py( where)
    filled_rect = mupdf.FzRect()
    more = mupdf.fz_place_story_flags( self.this, target, filled_rect, flags)
    filled = JM_py_from_rect( filled_rect)
    return more, filled
11672 15960
def reset( self):
    '''
    Wrapper for fz_reset_story(), called on the underlying story object.
    '''
    mupdf.fz_reset_story( self.this)
11675 15963
11782 Members: 16070 Members:
11783 16071
11784 `big_enough`: 16072 `big_enough`:
11785 `True` if the fit succeeded. 16073 `True` if the fit succeeded.
11786 `filled`: 16074 `filled`:
11787 From the last call to `Story.place()`. 16075 Tuple (x0, y0, x1, y1) from the last call to `Story.place()`. This
16076 will be wider than .rect if any single word (which we never split)
16077 was too wide for .rect.
11788 `more`: 16078 `more`:
11789 `False` if the fit succeeded. 16079 `False` if the fit succeeded.
11790 `numcalls`: 16080 `numcalls`:
11791 Number of calls made to `self.place()`. 16081 Number of calls made to `self.place()`.
11792 `parameter`: 16082 `parameter`:
11793 The successful parameter value, or the largest failing value. 16083 The successful parameter value, or the largest failing value.
11794 `rect`: 16084 `rect`:
11795 The rect created from `parameter`. 16085 The pymupdf.Rect created from `parameter`.
11796 ''' 16086 '''
11797 def __init__(self, big_enough=None, filled=None, more=None, numcalls=None, parameter=None, rect=None): 16087 def __init__(self, big_enough=None, filled=None, more=None, numcalls=None, parameter=None, rect=None):
11798 self.big_enough = big_enough 16088 self.big_enough = big_enough
11799 self.filled = filled 16089 self.filled = filled
11800 self.more = more 16090 self.more = more
11810 f' numcalls={self.numcalls}' 16100 f' numcalls={self.numcalls}'
11811 f' parameter={self.parameter}' 16101 f' parameter={self.parameter}'
11812 f' rect={self.rect}' 16102 f' rect={self.rect}'
11813 ) 16103 )
11814 16104
11815 def fit(self, fn, pmin=None, pmax=None, delta=0.001, verbose=False): 16105 def fit(self, fn, pmin=None, pmax=None, delta=0.001, verbose=False, flags=0):
11816 ''' 16106 '''
11817 Finds optimal rect that contains the story `self`. 16107 Finds optimal rect that contains the story `self`.
11818 16108
11819 Returns a `Story.FitResult` instance. 16109 Returns a `Story.FitResult` instance.
11820 16110
11837 Maximum parameter to consider; `None` for +infinity. 16127 Maximum parameter to consider; `None` for +infinity.
11838 :arg delta: 16128 :arg delta:
11839 Maximum error in returned `parameter`. 16129 Maximum error in returned `parameter`.
11840 :arg verbose: 16130 :arg verbose:
11841 If true we output diagnostics. 16131 If true we output diagnostics.
16132 :arg flags:
16133 Passed to mupdf.fz_place_story_flags(). e.g.
16134 zero or `mupdf.FZ_PLACE_STORY_FLAG_NO_OVERFLOW`.
11842 ''' 16135 '''
11843 def log(text): 16136 def log(text):
11844 assert verbose 16137 assert verbose
11845 message(f'fit(): {text}') 16138 message(f'fit(): {text}')
11846 16139
11892 big_enough = False 16185 big_enough = False
11893 result = Story.FitResult(parameter=parameter, numcalls=state.numcalls) 16186 result = Story.FitResult(parameter=parameter, numcalls=state.numcalls)
11894 if verbose: 16187 if verbose:
11895 log(f'update(): not calling self.place() because rect is empty.') 16188 log(f'update(): not calling self.place() because rect is empty.')
11896 else: 16189 else:
11897 more, filled = self.place(rect) 16190 more, filled = self.place(rect, flags)
11898 state.numcalls += 1 16191 state.numcalls += 1
11899 big_enough = not more 16192 big_enough = not more
11900 result = Story.FitResult( 16193 result = Story.FitResult(
11901 filled=filled, 16194 filled=filled,
11902 more=more, 16195 more=more,
11961 if state.pmax - state.pmin < delta: 16254 if state.pmax - state.pmin < delta:
11962 return ret() 16255 return ret()
11963 parameter = (state.pmin + state.pmax) / 2 16256 parameter = (state.pmin + state.pmax) / 2
11964 update(parameter) 16257 update(parameter)
11965 16258
11966 def fit_scale(self, rect, scale_min=0, scale_max=None, delta=0.001, verbose=False): 16259 def fit_scale(self, rect, scale_min=0, scale_max=None, delta=0.001, verbose=False, flags=0):
11967 ''' 16260 '''
11968 Finds smallest value `scale` in range `scale_min..scale_max` where 16261 Finds smallest value `scale` in range `scale_min..scale_max` where
11969 `scale * rect` is large enough to contain the story `self`. 16262 `scale * rect` is large enough to contain the story `self`.
11970 16263
11971 Returns a `Story.FitResult` instance. 16264 Returns a `Story.FitResult` instance with `.parameter` set to `scale`.
11972 16265
11973 :arg width: 16266 :arg width:
11974 width of rect. 16267 width of rect.
11975 :arg height: 16268 :arg height:
11976 height of rect. 16269 height of rect.
11981 infinite. 16274 infinite.
11982 :arg delta: 16275 :arg delta:
11983 Maximum error in returned scale. 16276 Maximum error in returned scale.
11984 :arg verbose: 16277 :arg verbose:
11985 If true we output diagnostics. 16278 If true we output diagnostics.
16279 :arg flags:
16280 Passed to Story.place().
11986 ''' 16281 '''
11987 x0, y0, x1, y1 = rect 16282 x0, y0, x1, y1 = rect
11988 width = x1 - x0 16283 width = x1 - x0
11989 height = y1 - y0 16284 height = y1 - y0
11990 def fn(scale): 16285 def fn(scale):
11991 return Rect(x0, y0, x0 + scale*width, y0 + scale*height) 16286 return Rect(x0, y0, x0 + scale*width, y0 + scale*height)
11992 return self.fit(fn, scale_min, scale_max, delta, verbose) 16287 return self.fit(fn, scale_min, scale_max, delta, verbose, flags)
11993 16288
11994 def fit_height(self, width, height_min=0, height_max=None, origin=(0, 0), delta=0.001, verbose=False): 16289 def fit_height(self, width, height_min=0, height_max=None, origin=(0, 0), delta=0.001, verbose=False):
11995 ''' 16290 '''
11996 Finds smallest height in range `height_min..height_max` where a rect 16291 Finds smallest height in range `height_min..height_max` where a rect
11997 with size `(width, height)` is large enough to contain the story 16292 with size `(width, height)` is large enough to contain the story
12314 cbbox = JM_char_bbox(line, ch) 16609 cbbox = JM_char_bbox(line, ch)
12315 if (not JM_rects_overlap(tp_rect, cbbox) 16610 if (not JM_rects_overlap(tp_rect, cbbox)
12316 and not mupdf.fz_is_infinite_rect(tp_rect) 16611 and not mupdf.fz_is_infinite_rect(tp_rect)
12317 ): 16612 ):
12318 continue 16613 continue
16614
16615 if buflen == 0 and ch.m_internal.c == 0x200d:
16616 # ZERO WIDTH JOINER cannot start a word
16617 continue
12319 word_delimiter = JM_is_word_delimiter(ch.m_internal.c, delimiters) 16618 word_delimiter = JM_is_word_delimiter(ch.m_internal.c, delimiters)
12320 this_char_rtl = JM_is_rtl_char(ch.m_internal.c) 16619 this_char_rtl = JM_is_rtl_char(ch.m_internal.c)
12321 if word_delimiter or this_char_rtl != last_char_rtl: 16620 if word_delimiter or this_char_rtl != last_char_rtl:
12322 if buflen == 0 and word_delimiter: 16621 if buflen == 0 and word_delimiter:
12323 continue # skip delimiters at line start 16622 continue # skip delimiters at line start
12513 elif idx[i] == idx2[-1] + 1: # new adjacent Latin word 16812 elif idx[i] == idx2[-1] + 1: # new adjacent Latin word
12514 idx2.append(idx[i]) 16813 idx2.append(idx[i])
12515 16814
12516 text = " ".join(words) 16815 text = " ".join(words)
12517 return text 16816 return text
16817
def fill_textbox(
    writer: 'TextWriter',
    rect: rect_like,
    text: typing.Union[str, list],
    pos: point_like = None,
    font: typing.Optional[Font] = None,
    fontsize: float = 11,
    lineheight: OptFloat = None,
    align: int = 0,
    warn: bool = None,
    right_to_left: bool = False,
    small_caps: bool = False,
) -> list:
    """Fill a rectangle with text.

    The text is split into lines that fit the rectangle width, then as
    many lines as fit vertically are appended to the TextWriter.

    Args:
        writer: pymupdf.TextWriter object (= "self")
        rect: rect-like to receive the text.
        text: string or list/tuple of strings.
        pos: point-like start position of first word. Default is
            'tolerance' right of and 'fontsize * ascender' below the
            top-left corner of 'rect'.
        font: pymupdf.Font object (default pymupdf.Font('helv')).
        fontsize: the fontsize.
        lineheight: overwrite the font property. Factor multiplied with
            'fontsize' to give the line height. If omitted (or 0),
            'ascender - descender' of the font is used, or 1.2 if that
            value does not exceed 1.
        align: (int) 0 = left, 1 = center, 2 = right, 3 = justify
        warn: (bool) text overflow action: None = do nothing,
            True = output a warning message, False = raise ValueError.
        right_to_left: (bool) indicate right-to-left language.
        small_caps: (bool) passed on to the font's length computations
            and to 'writer.append()'.

    Returns:
        List of the lines that were not written, each an item
        (text, length). Empty if everything fitted.

    Raises:
        ValueError: if 'rect' is empty, if the start position is outside
            'rect', or if not all lines fit and 'warn' is False.
    """
    rect = Rect(rect)
    if rect.is_empty:
        raise ValueError("fill rect must not be empty.")
    if not isinstance(font, Font):  # also accepts subclasses of Font
        font = Font("helv")

    def textlen(x):
        """Return length of a string."""
        return font.text_length(
            x, fontsize=fontsize, small_caps=small_caps
        )  # abbreviation

    def char_lengths(x):
        """Return list of single character lengths for a string."""
        return font.char_lengths(x, fontsize=fontsize, small_caps=small_caps)

    def append_this(pos, text):
        """Append 'text' at 'pos' using the textbox font settings."""
        return writer.append(
            pos, text, font=font, fontsize=fontsize, small_caps=small_caps
        )

    tolerance = fontsize * 0.2  # extra distance to left border
    space_len = textlen(" ")
    std_width = rect.width - tolerance
    std_start = rect.x0 + tolerance

    def norm_words(width, words):
        """Cut any word in pieces no longer than 'width'."""
        nwords = []
        word_lengths = []
        for w in words:
            wl_lst = char_lengths(w)
            wl = sum(wl_lst)
            if wl <= width:  # nothing to do - copy over
                nwords.append(w)
                word_lengths.append(wl)
                continue

            # word longer than rect width - split it in parts
            # NOTE(review): if not even the first remaining character fits
            # into 'width', n reaches 0 and the rest of the word is dropped.
            n = len(wl_lst)
            while n > 0:
                wl = sum(wl_lst[:n])
                if wl <= width:
                    nwords.append(w[:n])
                    word_lengths.append(wl)
                    w = w[n:]
                    wl_lst = wl_lst[n:]
                    n = len(wl_lst)
                else:
                    n -= 1
        return nwords, word_lengths

    def output_justify(start, line):
        """Justified output of a line."""
        # ignore leading / trailing / multiple spaces
        words = [w for w in line.split(" ") if w != ""]
        nwords = len(words)
        if nwords == 0:
            return
        if nwords == 1:  # single word cannot be justified
            append_this(start, words[0])
            return
        tl = sum([textlen(w) for w in words])  # total word lengths
        gaps = nwords - 1  # number of word gaps
        gapl = (std_width - tl) / gaps  # width of each gap
        for w in words:
            _, lp = append_this(start, w)  # output one word
            start.x = lp.x + gapl  # next start at word end plus gap
        return

    asc = font.ascender
    dsc = font.descender
    if not lineheight:
        if asc - dsc <= 1:
            lheight = 1.2
        else:
            lheight = asc - dsc
    else:
        lheight = lineheight

    LINEHEIGHT = fontsize * lheight  # effective line height
    width = std_width  # available horizontal space

    # starting point of text
    if pos is not None:
        pos = Point(pos)
    else:  # default is just below rect top-left
        pos = rect.tl + (tolerance, fontsize * asc)
    if pos not in rect:
        raise ValueError("Text must start in rectangle.")

    # calculate displacement factor for alignment
    if align == TEXT_ALIGN_CENTER:
        factor = 0.5
    elif align == TEXT_ALIGN_RIGHT:
        factor = 1.0
    else:
        factor = 0

    # split in lines if just a string was given
    if type(text) is str:
        textlines = text.splitlines()
    else:
        textlines = []
        for line in text:
            textlines.extend(line.splitlines())

    max_lines = int((rect.y1 - pos.y) / LINEHEIGHT) + 1

    new_lines = []  # the final list of textbox lines
    no_justify = []  # no justify for these line numbers
    for i, line in enumerate(textlines):
        if line in ("", " "):
            new_lines.append((line, space_len))
            width = rect.width - tolerance
            no_justify.append((len(new_lines) - 1))
            continue
        if i == 0:  # first line may start right of the standard start
            width = rect.x1 - pos.x
        else:
            width = rect.width - tolerance

        if right_to_left:  # reverses Arabic / Hebrew text front to back
            line = writer.clean_rtl(line)
        tl = textlen(line)
        if tl <= width:  # line short enough
            new_lines.append((line, tl))
            no_justify.append((len(new_lines) - 1))
            continue

        # we need to split the line in fitting parts
        words = line.split(" ")  # the words in the line

        # cut in parts any words that are longer than rect width
        words, word_lengths = norm_words(width, words)

        # Repeatedly take the longest run of leading words that fits.
        n = len(words)
        while True:
            line0 = " ".join(words[:n])
            wl = sum(word_lengths[:n]) + space_len * (n - 1)
            if wl <= width:
                new_lines.append((line0, wl))
                words = words[n:]
                word_lengths = word_lengths[n:]
                n = len(words)
            else:
                n -= 1

            if len(words) == 0:
                break
            assert n

    # -------------------------------------------------------------------------
    # List of lines created. Each item is (text, tl), where 'tl' is the PDF
    # output length (float) and 'text' is the text. Except for justified text,
    # this is output-ready.
    # -------------------------------------------------------------------------
    nlines = len(new_lines)
    if nlines > max_lines:
        msg = "Only fitting %i of %i lines." % (max_lines, nlines)
        if warn is None:
            pass
        elif warn:
            message("Warning: " + msg)
        else:
            raise ValueError(msg)

    start = Point()
    no_justify += [len(new_lines) - 1]  # no justifying of last line
    for i in range(max_lines):
        if not new_lines:  # everything has been written
            break
        line, tl = new_lines.pop(0)

        if right_to_left:  # Arabic, Hebrew
            line = "".join(reversed(line))

        if i == 0:  # may have different start for first line
            # 'start' is the same object as 'pos' from here on, so the
            # updates of start.x / start.y below also change 'pos'.
            start = pos

        if align == TEXT_ALIGN_JUSTIFY and i not in no_justify and tl < std_width:
            output_justify(start, line)
            start.x = std_start
            start.y += LINEHEIGHT
            continue

        if i > 0 or pos.x == std_start:  # left, center, right alignments
            # 'width' is the value left over from the line-splitting loop
            start.x += (width - tl) * factor

        append_this(start, line)
        start.x = std_start
        start.y += LINEHEIGHT

    return new_lines  # return non-written lines
12518 17043
12519 def write_text(self, page, color=None, opacity=-1, overlay=1, morph=None, matrix=None, render_mode=0, oc=0): 17044 def write_text(self, page, color=None, opacity=-1, overlay=1, morph=None, matrix=None, render_mode=0, oc=0):
12520 """Write the text to a PDF page having the TextWriter's page size. 17045 """Write the text to a PDF page having the TextWriter's page size.
12521 17046
12522 Args: 17047 Args:
12733 return max(0, self.y1 - self.y0) 17258 return max(0, self.y1 - self.y0)
12734 17259
def contains(self, x):
    """Check if x is in the rectangle.

    Named-method form of the containment test: forwards to
    `self.__contains__(x)` and returns its result unchanged.
    """
    result = self.__contains__(x)
    return result
17263
def get_area(self, *args) -> float:
    """Calculate area of rectangle.

    An optional first positional argument selects the unit: one of
    'px' (default), 'in', 'cm', or 'mm'. Any other unit raises KeyError.
    """
    unit = args[0] if args else "px"
    # unit -> (numerator, denominator) of the linear conversion factor;
    # the factor is squared because an area is converted.
    ratios = {"px": (1, 1), "in": (1.0, 72.0), "cm": (2.54, 72.0), "mm": (25.4, 72.0)}
    numer, denom = ratios[unit]
    factor = (numer / denom) ** 2
    return factor * self.width * self.height
12738 17273
def include_point(self, p):
    """Extend rectangle to include point p.

    The work is done by `include_point()` of `self.rect`; the `irect`
    of that result is returned.
    """
    return self.rect.include_point(p).irect
20923 red, green, blue: integers in range 0..255. 25458 red, green, blue: integers in range 0..255.
20924 ''' 25459 '''
20925 return _wxcolors 25460 return _wxcolors
20926 25461
20927 25462
def _mupdf_devel(make_links=True):
    '''
    Allows PyMuPDF installation to be used to compile and link programmes that
    use the MuPDF C/C++ API.

    Args:
        make_links:
            If true, then on non-windows we also create softlinks to any shared
            libraries that are supplied with a version suffix; this allows them
            to be used in a link command.

            For example we create links such as:

                site-packages/pymupdf/
                    libmupdf.so -> libmupdf.so.26.7
                    libmupdfcpp.so -> libmupdfcpp.so.26.7

    Returns: (mupdf_include, mupdf_lib).
        mupdf_include:
            Path of MuPDF include directory within PyMuPDF install.
        mupdf_lib
            Path of MuPDF library directory within PyMuPDF install.

    Both directories are checked with `assert`, so a missing directory
    results in an AssertionError.
    '''
    import platform

    log(f'{mupdf_version=}')

    on_windows = platform.system() == 'Windows'
    install_dir = os.path.normpath(f'{__file__}/..')

    mupdf_include = f'{install_dir}/mupdf-devel/include'
    # Windows: separate .lib files are used at build time.
    # Elsewhere: .so files are used for both buildtime and runtime linking.
    mupdf_lib = f'{install_dir}/mupdf-devel/lib' if on_windows else install_dir

    log('Within installed PyMuPDF:')
    log(f'    {mupdf_include=}')
    log(f'    {mupdf_lib=}')

    assert os.path.isdir(mupdf_include), f'Not a directory: {mupdf_include=}.'
    assert os.path.isdir(mupdf_lib), f'Not a directory: {mupdf_lib=}.'

    if make_links and not on_windows:
        # Make symbolic links within the installed pymupdf module so
        # that ld can find libmupdf.so etc. This is a bit of a hack, but
        # necessary because wheels cannot contain symbolic links.
        #
        # For example we create `libmupdf.so -> libmupdf.so.26.7`.
        #
        # Only files whose suffix matches the second and third component
        # of the expected MuPDF version are linked, in case old .so files
        # from a previous install are still in place.
        #
        log(f'Creating symlinks in {mupdf_lib=} for MuPDF-{mupdf_version} .so files.')
        version_suffix = '[.]'.join(mupdf_version.split('.')[1:3])
        mupdf_lib_regex = f'^(lib[^.]+[.]so)[.]{version_suffix}$'
        log(f'{mupdf_lib_regex=}.')
        for entry in os.listdir(mupdf_lib):
            found = re.match(mupdf_lib_regex, entry)
            if not found:
                continue
            pfrom = f'{mupdf_lib}/{found.group(1)}'
            # os.path.exists() can return false if softlink exists
            # but points to non-existent file, so we also use
            # `os.path.islink()`.
            if os.path.islink(pfrom) or os.path.exists(pfrom):
                log(f'Removing existing link {pfrom=}.')
                os.remove(pfrom)
            log(f'Creating symlink: {pfrom} -> {entry}')
            os.symlink(entry, pfrom)

    return mupdf_include, mupdf_lib
25537
25538
20928 # We cannot import utils earlier because it imports this .py file itself and 25539 # We cannot import utils earlier because it imports this .py file itself and
20929 # uses some pymupdf.* types in function typing. 25540 # uses some pymupdf.* types in function typing.
20930 # 25541 #
20931 from . import utils 25542 from . import utils
20932 25543
20937 recover_char_quad = utils.recover_char_quad 25548 recover_char_quad = utils.recover_char_quad
20938 recover_line_quad = utils.recover_line_quad 25549 recover_line_quad = utils.recover_line_quad
20939 recover_quad = utils.recover_quad 25550 recover_quad = utils.recover_quad
20940 recover_span_quad = utils.recover_span_quad 25551 recover_span_quad = utils.recover_span_quad
20941 25552
20942 Annot.get_text = utils.get_text
20943 Annot.get_textbox = utils.get_textbox
20944
20945 Document._do_links = utils.do_links
20946 Document._do_widgets = utils.do_widgets
20947 Document.del_toc_item = utils.del_toc_item
20948 Document.get_char_widths = utils.get_char_widths
20949 Document.get_oc = utils.get_oc
20950 Document.get_ocmd = utils.get_ocmd
20951 Document.get_page_labels = utils.get_page_labels
20952 Document.get_page_numbers = utils.get_page_numbers
20953 Document.get_page_pixmap = utils.get_page_pixmap
20954 Document.get_page_text = utils.get_page_text
20955 Document.get_toc = utils.get_toc
20956 Document.has_annots = utils.has_annots
20957 Document.has_links = utils.has_links
20958 Document.insert_page = utils.insert_page
20959 Document.new_page = utils.new_page
20960 Document.scrub = utils.scrub
20961 Document.search_page_for = utils.search_page_for
20962 Document.set_metadata = utils.set_metadata
20963 Document.set_oc = utils.set_oc
20964 Document.set_ocmd = utils.set_ocmd
20965 Document.set_page_labels = utils.set_page_labels
20966 Document.set_toc = utils.set_toc
20967 Document.set_toc_item = utils.set_toc_item
20968 Document.subset_fonts = utils.subset_fonts
20969 Document.tobytes = Document.write
20970 Document.xref_copy = utils.xref_copy
20971
20972 IRect.get_area = utils.get_area
20973
20974 Page.apply_redactions = utils.apply_redactions
20975 Page.delete_image = utils.delete_image
20976 Page.delete_widget = utils.delete_widget
20977 Page.draw_bezier = utils.draw_bezier
20978 Page.draw_circle = utils.draw_circle
20979 Page.draw_curve = utils.draw_curve
20980 Page.draw_line = utils.draw_line
20981 Page.draw_oval = utils.draw_oval
20982 Page.draw_polyline = utils.draw_polyline
20983 Page.draw_quad = utils.draw_quad
20984 Page.draw_rect = utils.draw_rect
20985 Page.draw_sector = utils.draw_sector
20986 Page.draw_squiggle = utils.draw_squiggle
20987 Page.draw_zigzag = utils.draw_zigzag
20988 Page.get_image_info = utils.get_image_info
20989 Page.get_image_rects = utils.get_image_rects
20990 Page.get_label = utils.get_label
20991 Page.get_links = utils.get_links
20992 Page.get_pixmap = utils.get_pixmap
20993 Page.get_text = utils.get_text
20994 Page.get_text_blocks = utils.get_text_blocks
20995 Page.get_text_selection = utils.get_text_selection
20996 Page.get_text_words = utils.get_text_words
20997 Page.get_textbox = utils.get_textbox
20998 Page.get_textpage_ocr = utils.get_textpage_ocr
20999 Page.insert_image = utils.insert_image
21000 Page.insert_link = utils.insert_link
21001 Page.insert_text = utils.insert_text
21002 Page.insert_textbox = utils.insert_textbox
21003 Page.insert_htmlbox = utils.insert_htmlbox
21004 Page.new_shape = lambda x: utils.Shape(x)
21005 Page.replace_image = utils.replace_image
21006 Page.search_for = utils.search_for
21007 Page.show_pdf_page = utils.show_pdf_page
21008 Page.update_link = utils.update_link
21009 Page.write_text = utils.write_text
21010 Shape = utils.Shape
21011 from .table import find_tables 25553 from .table import find_tables
21012
21013 Page.find_tables = find_tables 25554 Page.find_tables = find_tables
21014
21015 Rect.get_area = utils.get_area
21016
21017 TextWriter.fill_textbox = utils.fill_textbox
21018 25555
21019 25556
class FitzDeprecation(DeprecationWarning):
    """Warning category derived from DeprecationWarning.

    Adds no behavior of its own; it only provides a distinct category.
    NOTE(review): presumably issued when deprecated names are used -
    confirm against the code that raises it.
    """
21022 25559
21283 _alias( Rect, 'include_rect') 25820 _alias( Rect, 'include_rect')
21284 _alias( Rect, 'is_empty') 25821 _alias( Rect, 'is_empty')
21285 _alias( Rect, 'is_infinite') 25822 _alias( Rect, 'is_infinite')
21286 _alias( TextWriter, 'fill_textbox') 25823 _alias( TextWriter, 'fill_textbox')
21287 _alias( TextWriter, 'write_text') 25824 _alias( TextWriter, 'write_text')
21288 _alias( utils.Shape, 'draw_bezier') 25825 _alias( Shape, 'draw_bezier')
21289 _alias( utils.Shape, 'draw_circle') 25826 _alias( Shape, 'draw_circle')
21290 _alias( utils.Shape, 'draw_curve') 25827 _alias( Shape, 'draw_curve')
21291 _alias( utils.Shape, 'draw_line') 25828 _alias( Shape, 'draw_line')
21292 _alias( utils.Shape, 'draw_oval') 25829 _alias( Shape, 'draw_oval')
21293 _alias( utils.Shape, 'draw_polyline') 25830 _alias( Shape, 'draw_polyline')
21294 _alias( utils.Shape, 'draw_quad') 25831 _alias( Shape, 'draw_quad')
21295 _alias( utils.Shape, 'draw_rect') 25832 _alias( Shape, 'draw_rect')
21296 _alias( utils.Shape, 'draw_sector') 25833 _alias( Shape, 'draw_sector')
21297 _alias( utils.Shape, 'draw_squiggle') 25834 _alias( Shape, 'draw_squiggle')
21298 _alias( utils.Shape, 'draw_zigzag') 25835 _alias( Shape, 'draw_zigzag')
21299 _alias( utils.Shape, 'insert_text') 25836 _alias( Shape, 'insert_text')
21300 _alias( utils.Shape, 'insert_textbox') 25837 _alias( Shape, 'insert_textbox')
21301 25838
21302 if 0: 25839 if 0:
21303 restore_aliases() 25840 restore_aliases()
21304 25841
21305 __version__ = VersionBind 25842 __version__ = VersionBind