Mercurial > hgrepos > Python2 > PyMuPDF
comparison src/__main__.py @ 1:1d09e1dec1d9 upstream
ADD: PyMuPDF v1.26.4: the original sdist.
It does not yet contain MuPDF. This normally will be downloaded when
building PyMuPDF.
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Mon, 15 Sep 2025 11:37:51 +0200 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 1:1d09e1dec1d9 |
|---|---|
| 1 # ----------------------------------------------------------------------------- | |
| 2 # Copyright 2020-2022, Harald Lieder, mailto:harald.lieder@outlook.com | |
| 3 # License: GNU AFFERO GPL 3.0, https://www.gnu.org/licenses/agpl-3.0.html | |
| 4 # Part of "PyMuPDF", Python bindings for "MuPDF" (http://mupdf.com), a | |
| 5 # lightweight PDF, XPS, and E-book viewer, renderer and toolkit which is | |
| 6 # maintained and developed by Artifex Software, Inc. https://artifex.com. | |
| 7 # ----------------------------------------------------------------------------- | |
| 8 import argparse | |
| 9 import bisect | |
| 10 import os | |
| 11 import sys | |
| 12 import statistics | |
| 13 from typing import Dict, List, Set | |
| 14 | |
| 15 from . import pymupdf | |
| 16 | |
def mycenter(x):
    """Return *x*, framed by single blanks, centered in a 75-column line of dashes."""
    label = " %s " % x
    return label.center(75, "-")
| 19 | |
| 20 | |
def recoverpix(doc, item):
    """Return the image belonging to a PDF image entry.

    Args:
        doc: the document holding the image.
        item: sequence whose first entry is the image xref and whose second
            entry is the xref of its /SMask (0 if there is none).
    Returns:
        The result of ``doc.extract_image()`` if there is no /SMask,
        otherwise a pixmap (with the /SMask applied as alpha channel when
        the mask is usable).
    """
    img_xref, smask_xref = item[0], item[1]

    # without an smask the image can be delivered directly
    if smask_xref == 0:
        return doc.extract_image(img_xref)

    def getimage(pix):
        # pixmaps with a 4-component colorspace are converted to RGB
        if pix.colorspace.n == 4:
            return pymupdf.Pixmap(pymupdf.csRGB, pix)
        return pix

    # the alpha channel must be rebuilt from the smask
    base = pymupdf.Pixmap(doc, img_xref)
    mask = pymupdf.Pixmap(doc, smask_xref)  # pixmap of the /SMask entry

    # Sanity check:
    # - both pixmaps must cover the same rectangle
    # - both pixmaps must have alpha=0
    # - the mask must consist of 1 byte per pixel
    mask_ok = (
        base.irect == mask.irect
        and base.alpha == mask.alpha == 0
        and mask.n == 1
    )
    if not mask_ok:
        pymupdf.message("Warning: unsupported /SMask %i for %i:" % (smask_xref, img_xref))
        pymupdf.message(mask)
        mask = None
        return getimage(base)  # deliver the image without the mask

    result = pymupdf.Pixmap(base)  # copy of the image with an alpha channel added
    result.set_alpha(mask.samples)  # mask samples become the alpha values
    base = mask = None  # release the temporary pixmaps

    # conversion may still be needed for CMYK pixmaps
    return getimage(result)
| 55 | |
| 56 | |
def open_file(filename, password, show=False, pdf=True):
    """Open and authenticate a document.

    Args:
        filename: path of the document to open.
        password: password to use if the document needs one (may be None).
        show: if True, report whether authentication happened as owner or user.
        pdf: if True, exit unless the document is a PDF.
    Returns:
        The opened (and, where required, authenticated) document.
    Exits:
        Via ``sys.exit`` if a PDF is required but the file is none, if a
        required password is missing, or if authentication fails.
    """
    doc = pymupdf.open(filename)
    if not doc.is_pdf and pdf is True:
        sys.exit("this command supports PDF files only")
    if not doc.needs_pass:
        return doc
    if not password:
        sys.exit("'%s' requires a password" % doc.name)

    rc = doc.authenticate(password)
    if not rc:
        sys.exit("authentication unsuccessful")
    if show is True:
        # The conditional must be parenthesized: '%' binds tighter than
        # 'if ... else', so without parentheses the message degenerated to
        # the bare string "user" for non-owner authentication.
        pymupdf.message("authenticated as %s" % ("owner" if rc > 2 else "user"))
    return doc
| 74 | |
| 75 | |
def print_dict(item):
    """Print a Python dictionary, one "key: value" line per entry.

    Keys are right-justified to a common width (longest key plus one), so the
    colons line up. An empty dictionary prints nothing.

    Args:
        item: (dict) dictionary with string keys.
    """
    # default=0 keeps an empty dictionary from raising ValueError in max()
    width = max((len(k) for k in item), default=0) + 1
    for k, v in item.items():
        pymupdf.message("%s: %s" % (k.rjust(width), v))
| 82 | |
| 83 | |
def print_xref(doc, xref):
    """Print an object given by XREF number.

    Simulate the PDF source in "pretty" format.
    For a stream also print its size. The size is taken from the token
    following "/Length" in the object definition; it is reported as
    "unknown" if that cannot be determined or if the length is given as an
    indirect reference ("n g R").

    Args:
        doc: the document containing the object.
        xref: (int) the object's xref number.
    """
    pymupdf.message("%i 0 obj" % xref)
    xref_str = doc.xref_object(xref)
    pymupdf.message(xref_str)
    if doc.xref_is_stream(xref):
        tokens = xref_str.split()
        try:
            idx = tokens.index("/Length") + 1
            size = tokens[idx]
            # After split() an indirect reference occupies three tokens
            # ("n", "g", "R"), so it must be recognized via the tokens that
            # follow - the first one alone is just the referenced xref.
            if tokens[idx + 2 : idx + 3] == ["R"] and tokens[idx + 1].isdecimal():
                size = "unknown"
        except Exception:
            size = "unknown"
        pymupdf.message("stream\n...%s bytes" % size)
        pymupdf.message("endstream")
    pymupdf.message("endobj")
| 105 | |
| 106 | |
def get_list(rlist, limit, what="page"):
    """Expand a page / xref specification into a list of integers.

    The specification is a comma-separated sequence of single numbers and
    ranges "a-b". The letter "N" stands for the largest valid number
    (limit - 1), blanks are ignored. A range with a > b is delivered in
    descending order.

    Args
    ----
    rlist: (str) the specification
    limit: every number must satisfy 1 <= number < limit
    what: a string to be used in error messages
    Returns
    -------
    A list of integers representing the specification. The program is
    terminated via sys.exit() if an item is invalid.
    """
    highest = str(limit - 1)
    spec = rlist.replace("N", highest).replace(" ", "")
    numbers = []
    for pos, token in enumerate(spec.split(","), start=1):
        if token.isdecimal():  # one single number
            value = int(token)
            if not 1 <= value < limit:
                sys.exit("bad %s specification at item %i" % (what, pos))
            numbers.append(value)
            continue

        # anything else must be a range of exactly two integers
        try:
            first_txt, second_txt = token.split("-")
            first = int(first_txt)
            second = int(second_txt)
        except Exception:
            sys.exit("bad %s range specification at item %i" % (what, pos))

        if not 1 <= first < limit or not 1 <= second < limit:
            sys.exit("bad %s range specification at item %i" % (what, pos))

        # walk up or down, both end points included
        step = 1 if first <= second else -1
        numbers.extend(range(first, second + step, step))

    return numbers
| 152 | |
| 153 | |
def show(args):
    """Display information about a PDF.

    A summary line (pages, objects, file size, format, encryption) is always
    printed, followed by form field and embedded file counts if there are
    any. Catalog, metadata, selected objects, selected pages and the trailer
    are printed when requested via the respective attributes of *args*.
    """
    doc = open_file(args.input, args.password, True)

    # file size in KB - or in MB once it exceeds 1000 KB
    size = os.path.getsize(args.input) / 1024
    unit = "KB"
    if size > 1000:
        size, unit = size / 1024, "MB"
    size = round(size, 1)

    meta = doc.metadata  # pylint: disable=no-member
    summary = (
        args.input,
        doc.page_count,
        doc.xref_length() - 1,
        size,
        unit,
        meta["format"],
        meta["encryption"],
    )
    pymupdf.message(
        "'%s', pages: %i, objects: %i, %g %s, %s, encryption: %s" % summary
    )

    field_count = doc.is_form_pdf
    if field_count > 0:
        sigflags = doc.get_sigflags()
        pymupdf.message(
            "document contains %i root form fields and is %ssigned"
            % (field_count, "" if sigflags == 3 else "not ")
        )

    file_count = doc.embfile_count()
    if file_count > 0:
        pymupdf.message("document contains %i embedded files" % file_count)
    pymupdf.message()

    if args.catalog:
        pymupdf.message(mycenter("PDF catalog"))
        print_xref(doc, doc.pdf_catalog())
        pymupdf.message()

    if args.metadata:
        pymupdf.message(mycenter("PDF metadata"))
        print_dict(doc.metadata)  # pylint: disable=no-member
        pymupdf.message()

    if args.xrefs:
        pymupdf.message(mycenter("object information"))
        for xref in get_list(args.xrefs, doc.xref_length(), what="xref"):
            print_xref(doc, xref)
            pymupdf.message()

    if args.pages:
        pymupdf.message(mycenter("page information"))
        for pno in get_list(args.pages, doc.page_count + 1):
            xref = doc.page_xref(pno - 1)  # page numbers are 1-based here
            pymupdf.message("Page %i:" % pno)
            print_xref(doc, xref)
            pymupdf.message()

    if args.trailer:
        pymupdf.message(mycenter("PDF trailer"))
        pymupdf.message(doc.pdf_trailer())
        pymupdf.message()

    doc.close()
| 215 | |
| 216 | |
def clean(args):
    """Optimize a PDF, or create a sub-PDF if pages are given.

    Without a page specification the input document itself is saved to
    ``args.output`` using the requested save options. Otherwise the selected
    pages are copied (in the specified order) to a new PDF, which is then
    saved with the same options.
    """
    doc = open_file(args.input, args.password, pdf=True)
    # the save option is the position of the method name in this tuple
    encrypt = ("keep", "none", "rc4-40", "rc4-128", "aes-128", "aes-256").index(
        args.encryption
    )
    # both variants below save with the same options
    save_options = dict(
        garbage=args.garbage,
        deflate=args.compress,
        pretty=args.pretty,
        clean=args.sanitize,
        ascii=args.ascii,
        linear=args.linear,
        encryption=encrypt,
        owner_pw=args.owner,
        user_pw=args.user,
        permissions=args.permission,
    )

    if not args.pages:  # simple cleaning
        doc.save(args.output, **save_options)
        doc.close()  # previously left open on this path
        return

    # create sub document from page numbers
    outdoc = pymupdf.open()
    for pno in get_list(args.pages, doc.page_count + 1):
        n = pno - 1  # page numbers are 1-based on the command line
        outdoc.insert_pdf(doc, from_page=n, to_page=n)
    outdoc.save(args.output, **save_options)
    doc.close()
    outdoc.close()
    return
| 262 | |
| 263 | |
def doc_join(args):
    """Join pages from several PDF documents.

    Every entry of ``args.input`` has the comma-separated format
    "filename[,password[,pages]]". All pages of a file are taken if no page
    specification is present. The result is saved to ``args.output``.
    """
    target = pymupdf.open()  # the output PDF
    for spec in args.input:  # one input PDF per entry
        parts = spec.split(",")
        pwd = parts[1] if len(parts) > 1 else None
        src = open_file(parts[0], pwd, pdf=True)

        page_spec = ",".join(parts[2:])  # whatever follows the password
        if page_spec:  # only the desired pages
            wanted = get_list(page_spec, src.page_count + 1)
        else:  # all pages
            wanted = range(1, src.page_count + 1)

        for pno in wanted:  # copy page by page
            target.insert_pdf(src, from_page=pno - 1, to_page=pno - 1)
        src.close()

    target.save(args.output, garbage=4, deflate=True)
    target.close()
| 283 | |
| 284 | |
def embedded_copy(args):
    """Copy embedded files between PDFs.

    Entries are copied from ``args.source`` to ``args.input``: those named in
    ``args.name``, or all of them if no names are given. The receiving PDF is
    saved incrementally unless a different output file is specified.
    """
    doc = open_file(args.input, args.password, pdf=True)
    in_place = not args.output or args.output == args.input
    if not doc.can_save_incrementally() and in_place:
        sys.exit("cannot save PDF incrementally")

    src = open_file(args.source, args.pwdsource)
    names = set(args.name) if args.name else set()
    available = set(src.embfile_names())
    if not names:  # nothing selected: take everything
        names = available
    elif not names <= available:
        sys.exit("not all names are contained in source")
    if not names:
        sys.exit("nothing to copy")

    # refuse to overwrite entries of the receiving PDF
    intersect = names & set(doc.embfile_names())
    if intersect:
        sys.exit("following names already exist in receiving PDF: %s" % str(intersect))

    for entry in names:
        info = src.embfile_info(entry)
        content = src.embfile_get(entry)
        doc.embfile_add(
            entry,
            content,
            filename=info["filename"],
            ufilename=info["ufilename"],
            desc=info["desc"],
        )
        pymupdf.message("copied entry '%s' from '%s'" % (entry, src.name))
    src.close()

    if in_place:
        doc.saveIncr()
    else:
        doc.save(args.output, garbage=3)
    doc.close()
| 323 | |
| 324 | |
def embedded_del(args):
    """Delete an embedded file entry.

    The PDF is saved incrementally unless ``args.output`` names a file
    different from the input.
    """
    doc = open_file(args.input, args.password, pdf=True)
    in_place = not args.output or args.output == args.input
    if not doc.can_save_incrementally() and in_place:
        sys.exit("cannot save PDF incrementally")

    try:
        doc.embfile_del(args.name)
    except (ValueError, pymupdf.mupdf.FzErrorBase) as e:
        sys.exit(f'no such embedded file {args.name!r}: {e}')

    if in_place:
        doc.saveIncr()
    else:
        doc.save(args.output, garbage=1)
    doc.close()
| 342 | |
| 343 | |
def embedded_get(args):
    """Retrieve contents of an embedded file.

    The content is written to ``args.output`` or, if that is not given, to
    the filename stored with the entry.
    """
    doc = open_file(args.input, args.password, pdf=True)
    try:
        content = doc.embfile_get(args.name)
        info = doc.embfile_info(args.name)
    except (ValueError, pymupdf.mupdf.FzErrorBase) as e:
        sys.exit(f'no such embedded file {args.name!r}: {e}')

    target = args.output or info["filename"]
    with open(target, "wb") as outfile:
        outfile.write(content)
    pymupdf.message("saved entry '%s' as '%s'" % (args.name, target))
    doc.close()
| 357 | |
| 358 | |
def embedded_add(args):
    """Insert a new embedded file.

    The content of the file ``args.path`` is stored under the entry name
    ``args.name``. The path also serves as the entry's filename and - unless
    ``args.desc`` is given - as its description. The PDF is saved
    incrementally unless ``args.output`` names a file different from the
    input.
    """
    doc = open_file(args.input, args.password, pdf=True)
    in_place = not args.output or args.output == args.input
    if not doc.can_save_incrementally() and in_place:
        sys.exit("cannot save PDF incrementally")

    # Check for an existing entry by name. The former probe deleted the entry
    # to find out and swallowed every exception, which could hide unrelated
    # failures.
    if args.name in doc.embfile_names():
        sys.exit("entry '%s' already exists" % args.name)

    if not os.path.isfile(args.path):  # also false if the path does not exist
        sys.exit("no such file '%s'" % args.path)
    with open(args.path, "rb") as f:
        stream = f.read()

    filename = args.path
    doc.embfile_add(
        args.name,
        stream,
        filename=filename,
        ufilename=filename,
        desc=args.desc or filename,
    )
    if in_place:
        doc.saveIncr()
    else:
        doc.save(args.output, garbage=3)
    doc.close()
| 391 | |
| 392 | |
def embedded_upd(args):
    """Update contents or metadata of an embedded file.

    New content is read from ``args.path`` if that is an existing file.
    Filename, unicode filename and description are passed on as None when
    not given; the unicode filename falls back to ``args.filename``. The PDF
    is saved incrementally unless ``args.output`` names a file different
    from the input.
    """
    doc = open_file(args.input, args.password, pdf=True)
    in_place = args.output is None or args.output == args.input
    if not doc.can_save_incrementally() and in_place:
        sys.exit("cannot save PDF incrementally")

    try:
        doc.embfile_info(args.name)
    except Exception:
        sys.exit("no such embedded file '%s'" % args.name)

    # new content only if the path designates an existing file
    stream = None
    if args.path is not None and os.path.isfile(args.path):
        with open(args.path, "rb") as f:
            stream = f.read()

    doc.embfile_upd(
        args.name,
        stream,
        filename=args.filename or None,
        ufilename=args.ufilename or args.filename or None,
        desc=args.desc or None,
    )
    if in_place:
        doc.saveIncr()
    else:
        doc.save(args.output, garbage=3)
    doc.close()
| 441 | |
| 442 | |
def embedded_list(args):
    """List embedded files.

    With ``args.name`` given, the information of that single entry is
    printed. Otherwise all entry names are listed - each one followed by its
    information if ``args.detail`` is set.
    """
    doc = open_file(args.input, args.password, pdf=True)
    try:  # makes sure the document is closed on every exit path
        names = doc.embfile_names()
        if args.name is not None:
            if args.name not in names:
                sys.exit("no such embedded file '%s'" % args.name)
            pymupdf.message()
            pymupdf.message(
                "printing 1 of %i embedded file%s:"
                % (len(names), "s" if len(names) > 1 else "")
            )
            pymupdf.message()
            print_dict(doc.embfile_info(args.name))
            pymupdf.message()
            return

        if not names:
            pymupdf.message("'%s' contains no embedded files" % doc.name)
            return

        if len(names) > 1:
            msg = "'%s' contains the following %i embedded files" % (doc.name, len(names))
        else:
            msg = "'%s' contains the following embedded file" % doc.name
        pymupdf.message(msg)
        pymupdf.message()
        for name in names:
            if not args.detail:
                pymupdf.message(name)
                continue
            # the entry information is retrieved once (was fetched twice)
            print_dict(doc.embfile_info(name))
            pymupdf.message()
    finally:
        doc.close()
| 477 | |
| 478 | |
def extract_objects(args):
    """Extract images and / or fonts from a PDF.

    The objects referenced by the selected pages (default: all pages) are
    written to the output directory (default: current directory). Every xref
    is handled only once, even if several pages refer to it.
    """
    if not (args.fonts or args.images):
        sys.exit("neither fonts nor images requested")
    doc = open_file(args.input, args.password, pdf=True)

    if args.pages:
        pages = get_list(args.pages, doc.page_count + 1)
    else:
        pages = range(1, doc.page_count + 1)

    if args.output:
        out_dir = args.output
        if not (os.path.exists(out_dir) and os.path.isdir(out_dir)):
            sys.exit("output directory %s does not exist" % out_dir)
    else:
        out_dir = os.path.abspath(os.curdir)

    font_xrefs = set()  # fonts already dealt with
    image_xrefs = set()  # images already dealt with

    for pno in pages:
        if args.fonts:
            for item in doc.get_page_fonts(pno - 1):
                xref = item[0]
                if xref in font_xrefs:
                    continue
                font_xrefs.add(xref)
                fontname, ext, _, buffer = doc.extract_font(xref)
                if ext == "n/a" or not buffer:  # nothing that could be saved
                    continue
                outname = os.path.join(
                    out_dir, f"{fontname.replace(' ', '-')}-{xref}.{ext}"
                )
                with open(outname, "wb") as outfile:
                    outfile.write(buffer)
                buffer = None

        if args.images:
            for item in doc.get_page_images(pno - 1):
                xref = item[0]
                if xref in image_xrefs:
                    continue
                image_xrefs.add(xref)
                pix = recoverpix(doc, item)
                if isinstance(pix, dict):  # image delivered in its own format
                    outname = os.path.join(
                        out_dir, "img-%i.%s" % (xref, pix["ext"])
                    )
                    with open(outname, "wb") as outfile:
                        outfile.write(pix["image"])
                    continue
                # a pixmap: save as PNG, converting to RGB where required
                outname = os.path.join(out_dir, "img-%i.png" % xref)
                if pix.colorspace.n < 4:
                    pix2 = pix
                else:
                    pix2 = pymupdf.Pixmap(pymupdf.csRGB, pix)
                pix2.save(outname)

    if args.fonts:
        pymupdf.message("saved %i fonts to '%s'" % (len(font_xrefs), out_dir))
    if args.images:
        pymupdf.message("saved %i images to '%s'" % (len(image_xrefs), out_dir))
    doc.close()
| 543 | |
| 544 | |
def page_simple(page, textout, GRID, fontsize, noformfeed, skip_empty, flags):
    """Write the plain text of a page to *textout*.

    The text is followed by an end-of-page marker: a line break if
    *noformfeed* is set, otherwise a form feed. For a page without text only
    the marker is written - or nothing at all if *skip_empty* is set.
    GRID and fontsize are not used by this mode.
    """
    end_of_page = bytes([12]) if not noformfeed else b"\n"
    content = page.get_text("text", flags=flags)
    if content:
        textout.write(content.encode("utf8", errors="surrogatepass"))
        textout.write(end_of_page)
    elif not skip_empty:
        textout.write(end_of_page)  # empty page: marker only
    return
| 555 | |
| 556 | |
def page_blocksort(page, textout, GRID, fontsize, noformfeed, skip_empty, flags):
    """Write the text blocks of a page to *textout* in sorted order.

    Blocks are ordered by item 3 of the block tuple, then by item 0, and
    their text (item 4) is written one after the other. An end-of-page
    marker follows: a line break if *noformfeed* is set, otherwise a form
    feed. For a page without blocks only the marker is written - or nothing
    at all if *skip_empty* is set. GRID and fontsize are not used by this
    mode.
    """
    end_of_page = bytes([12]) if not noformfeed else b"\n"
    blocks = page.get_text("blocks", flags=flags)
    if not blocks:
        if not skip_empty:
            textout.write(end_of_page)  # empty page: marker only
        return

    def position(block):
        return block[3], block[0]

    blocks.sort(key=position)
    for block in blocks:
        textout.write(block[4].encode("utf8", errors="surrogatepass"))
    textout.write(end_of_page)
    return
| 569 | |
| 570 | |
def page_layout(page, textout, GRID, fontsize, noformfeed, skip_empty, flags):
    """Write the text of a page to *textout*, imitating its physical layout.

    The page is extracted character by character ("rawdict"). Characters are
    assigned to output lines via their (rounded) y-origin and positioned
    within a line by translating x-coordinates into character slots.

    Args:
        page: the page to extract text from.
        textout: binary file-like object receiving the UTF-8 encoded output.
        GRID: y-coordinates closer than this to the previously kept one are
            treated as belonging to the same output line.
        fontsize: spans with a font size not larger than this are ignored.
        noformfeed: end the page with a line break instead of a form feed.
        skip_empty: write nothing at all for a page without characters.
        flags: text extraction flags passed on to page.get_text().
    """
    eop = b"\n" if noformfeed else bytes([12])

    # --------------------------------------------------------------------
    def find_line_index(values: List[int], value: int) -> int:
        """Find the right row coordinate.

        Args:
            values: (list) y-coordinates of rows, sorted ascending.
            value: (int) lookup for this value (y-origin of char).
        Returns:
            y-coordinate of appropriate line for value, i.e. the largest
            entry of 'values' not exceeding it.
        Raises:
            RuntimeError if all entries of 'values' are larger than value.
        """
        i = bisect.bisect_right(values, value)
        if i:
            return values[i - 1]
        raise RuntimeError("Line for %g not found in %s" % (value, values))

    # --------------------------------------------------------------------
    def curate_rows(rows: Set[int], GRID) -> List:
        """Return the sorted row coordinates, dropping those that are less
        than GRID away from the previously kept one."""
        rows = list(rows)
        rows.sort()  # sort ascending
        nrows = [rows[0]]
        for h in rows[1:]:
            if h >= nrows[-1] + GRID:  # only keep significant differences
                nrows.append(h)
        return nrows  # curated list of line bottom coordinates

    def process_blocks(blocks: List[Dict], page: pymupdf.Page):
        """Collect the characters of all horizontal text lines.

        Returns:
            chars: list of tuples (char, x-origin, y-origin, width)
            rows: set of the (rounded) y-origins encountered
            left: smallest x-origin of a non-space character
            right: largest right border of a character
            rowheight: smallest height of a text line
        """
        rows = set()
        page_width = page.rect.width
        page_height = page.rect.height
        rowheight = page_height  # start value, minimized below
        left = page_width  # start value, minimized below
        right = 0  # start value, maximized below
        chars = []
        for block in blocks:
            for line in block["lines"]:
                if line["dir"] != (1, 0):  # ignore non-horizontal text
                    continue
                x0, y0, x1, y1 = line["bbox"]
                if y1 < 0 or y0 > page.rect.height:  # ignore if outside CropBox
                    continue
                # upd row height
                height = y1 - y0

                if rowheight > height:
                    rowheight = height
                for span in line["spans"]:
                    if span["size"] <= fontsize:  # font too small: ignore span
                        continue
                    for c in span["chars"]:
                        x0, _, x1, _ = c["bbox"]
                        cwidth = x1 - x0
                        ox, oy = c["origin"]
                        oy = int(round(oy))
                        rows.add(oy)
                        ch = c["c"]
                        if left > ox and ch != " ":
                            left = ox  # update left coordinate
                        if right < x1:
                            right = x1  # update right coordinate
                        # handle ligatures:
                        # a zero-width char on the same line as its
                        # predecessor is merged into the predecessor
                        if cwidth == 0 and chars != []:  # potential ligature
                            old_ch, old_ox, old_oy, old_cwidth = chars[-1]
                            if old_oy == oy:  # ligature
                                if old_ch != chr(0xFB00):  # previous "ff" char lig?
                                    lig = joinligature(old_ch + ch)  # no
                                # convert to one of the 3-char ligatures:
                                elif ch == "i":
                                    lig = chr(0xFB03)  # "ffi"
                                elif ch == "l":
                                    lig = chr(0xFB04)  # "ffl"
                                else:  # something wrong, leave old char in place
                                    lig = old_ch
                                chars[-1] = (lig, old_ox, old_oy, old_cwidth)
                                continue
                        chars.append((ch, ox, oy, cwidth))  # all chars on page
        return chars, rows, left, right, rowheight

    def joinligature(lig: str) -> str:
        """Return ligature character for a given pair / triple of characters.

        Args:
            lig: (str) 2/3 characters, e.g. "ff"
        Returns:
            Ligature, e.g. "ff" -> chr(0xFB00). Unknown combinations are
            returned unchanged.
        """

        if lig == "ff":
            return chr(0xFB00)
        elif lig == "fi":
            return chr(0xFB01)
        elif lig == "fl":
            return chr(0xFB02)
        elif lig == "ffi":
            return chr(0xFB03)
        elif lig == "ffl":
            return chr(0xFB04)
        elif lig == "ft":
            return chr(0xFB05)
        elif lig == "st":
            return chr(0xFB06)
        return lig

    # --------------------------------------------------------------------
    def make_textline(left, slot, minslot, lchars):
        """Produce the text of one output line.

        Args:
            left: (float) left most coordinate used on page
            slot: (float) width of one character position on output.
            minslot: (float) min width for the characters in this line.
            lchars: (list[tuple]) characters of this line, sorted by
                x-origin.
        Returns:
            text: (str) text string for this line
        Raises:
            RuntimeError if minslot does not exceed pymupdf.EPSILON.
        """
        text = ""  # we output this
        old_char = ""
        old_x1 = 0  # end coordinate of last char
        old_ox = 0  # x-origin of last char
        if minslot <= pymupdf.EPSILON:
            raise RuntimeError("program error: minslot too small = %g" % minslot)

        for c in lchars:  # loop over characters
            char, ox, _, cwidth = c
            ox = ox - left  # its (relative) start coordinate
            x1 = ox + cwidth  # ending coordinate

            # eliminate overprint effect
            if old_char == char and ox - old_ox <= cwidth * 0.2:
                continue

            # omit spaces overlapping previous char
            if char == " " and (old_x1 - ox) / cwidth > 0.8:
                continue

            old_char = char
            # close enough to previous?
            if ox < old_x1 + minslot:  # assume char adjacent to previous
                text += char  # append to output
                old_x1 = x1  # new end coord
                old_ox = ox  # new origin.x
                continue

            # else next char starts after some gap:
            # fill in right number of spaces, so char is positioned
            # in the right slot of the line
            if char == " ":  # rest relevant for non-space only
                continue
            delta = int(ox / slot) - len(text)
            if ox > old_x1 and delta > 1:
                text += " " * delta
            # now append char
            text += char
            old_x1 = x1  # new end coordinate
            old_ox = ox  # new origin
        return text.rstrip()

    # extract page text by single characters ("rawdict")
    blocks = page.get_text("rawdict", flags=flags)["blocks"]
    chars, rows, left, right, rowheight = process_blocks(blocks, page)

    if chars == []:
        if not skip_empty:
            textout.write(eop)  # write formfeed
        return
    # compute list of line coordinates - ignoring small (GRID) differences
    rows = curate_rows(rows, GRID)

    # sort all chars by x-coordinates, so every line will receive char info,
    # sorted from left to right.
    chars.sort(key=lambda c: c[1])

    # populate the lines with their char info
    lines = {}  # key: y-coordinate of the line, value: char list
    for c in chars:
        _, _, oy, _ = c
        y = find_line_index(rows, oy)  # y-coord of the right line
        lchars = lines.get(y, [])  # read line chars so far
        lchars.append(c)  # append this char
        lines[y] = lchars  # write back to line

    # ensure line coordinates are ascending
    keys = list(lines.keys())
    keys.sort()

    # -------------------------------------------------------------------------
    # Compute "char resolution" for the page: the char width corresponding to
    # 1 text char position on output - call it 'slot'.
    # For each line, compute median of its char widths. The minimum across all
    # lines is 'slot'.
    # The minimum char width of each line is used to determine if spaces must
    # be inserted in between two characters.
    # Lines with fewer than 2 characters do not take part in the 'slot'
    # computation and receive a minimum char width of 1.
    # -------------------------------------------------------------------------
    slot = right - left
    minslots = {}
    for k in keys:
        lchars = lines[k]
        ccount = len(lchars)
        if ccount < 2:
            minslots[k] = 1
            continue
        widths = [c[3] for c in lchars]
        widths.sort()
        this_slot = statistics.median(widths)  # take median value
        if this_slot < slot:
            slot = this_slot
        minslots[k] = widths[0]

    # compute line advance in text output
    # NOTE(review): 'rowheight' cancels out of this expression, leaving the
    # average row distance times 1.2; a rowheight of 0 would raise
    # ZeroDivisionError here - confirm that this cannot occur.
    rowheight = rowheight * (rows[-1] - rows[0]) / (rowheight * len(rows)) * 1.2
    rowpos = rows[0]  # first line positioned here
    textout.write(b"\n")
    for k in keys:  # walk through the lines
        while rowpos < k:  # honor distance between lines
            textout.write(b"\n")
            rowpos += rowheight
        text = make_textline(left, slot, minslots[k], lines[k])
        textout.write((text + "\n").encode("utf8", errors="surrogatepass"))
        rowpos = k + rowheight

    textout.write(eop)  # write formfeed
| 794 | |
| 795 | |
def gettext(args):
    """Extract the text of selected pages to a file.

    The output file is ``args.output`` or, if that is None, the document
    name with its extension replaced by ".txt". ``args.mode`` selects the
    extraction function, which is invoked once per page.
    """
    doc = open_file(args.input, args.password, pdf=False)
    pagel = get_list(args.pages, doc.page_count + 1)

    output = args.output
    if output is None:
        stem = os.path.splitext(doc.name)[0]
        output = stem + ".txt"

    # extraction function per output mode
    extractors = {
        "simple": page_simple,
        "blocks": page_blocksort,
        "layout": page_layout,
    }
    with open(output, "wb") as textout:
        flags = pymupdf.TEXT_PRESERVE_LIGATURES | pymupdf.TEXT_PRESERVE_WHITESPACE
        # each option toggles its bit in the extraction flags
        if args.convert_white:
            flags ^= pymupdf.TEXT_PRESERVE_WHITESPACE
        if args.noligatures:
            flags ^= pymupdf.TEXT_PRESERVE_LIGATURES
        if args.extra_spaces:
            flags ^= pymupdf.TEXT_INHIBIT_SPACES
        for pno in pagel:
            page = doc[pno - 1]  # page numbers are 1-based here
            extractors[args.mode](
                page,
                textout,
                args.grid,
                args.fontsize,
                args.noformfeed,
                args.skip_empty,
                flags=flags,
            )
| 827 | |
| 828 | |
def _internal(args):
    """Emit one fixed text via pymupdf.message() and one via pymupdf.log()."""
    message_text = "This is from PyMuPDF message()."
    pymupdf.message(message_text)
    log_text = "This is from PyMuPDF log()."
    pymupdf.log(log_text)
| 832 | |
def main(argv=None):
    """Define command configurations and run the selected subcommand.

    One sub-parser is registered per subcommand: show, clean, join, extract,
    embed-info, embed-add, embed-del, embed-upd, embed-extract, embed-copy,
    gettext and internal. Each sub-parser records its handler function via
    ``set_defaults(func=...)``. After parsing, that handler is invoked with
    the parsed namespace. If no subcommand was given, the namespace has no
    ``func`` attribute and the top level help is printed instead.

    Args:
        argv: optional list of argument strings to parse. The default
            ``None`` makes argparse read the process command line, which is
            the behavior of the previous parameterless ``main()``.
    """
    parser = argparse.ArgumentParser(
        prog="pymupdf",
        description=mycenter("Basic PyMuPDF Functions"),
    )
    subps = parser.add_subparsers(
        title="Subcommands", help="Enter 'command -h' for subcommand specific help"
    )

    # -------------------------------------------------------------------------
    # 'show' command
    # -------------------------------------------------------------------------
    ps_show = subps.add_parser("show", description=mycenter("display PDF information"))
    ps_show.add_argument("input", type=str, help="PDF filename")
    ps_show.add_argument("-password", help="password")
    ps_show.add_argument("-catalog", action="store_true", help="show PDF catalog")
    ps_show.add_argument("-trailer", action="store_true", help="show PDF trailer")
    ps_show.add_argument("-metadata", action="store_true", help="show PDF metadata")
    ps_show.add_argument(
        "-xrefs", type=str, help="show selected objects, format: 1,5-7,N"
    )
    ps_show.add_argument(
        "-pages", type=str, help="show selected pages, format: 1,5-7,50-N"
    )
    ps_show.set_defaults(func=show)

    # -------------------------------------------------------------------------
    # 'clean' command
    # -------------------------------------------------------------------------
    ps_clean = subps.add_parser(
        "clean", description=mycenter("optimize PDF, or create sub-PDF if pages given")
    )
    ps_clean.add_argument("input", type=str, help="PDF filename")
    ps_clean.add_argument("output", type=str, help="output PDF filename")
    ps_clean.add_argument("-password", help="password")

    ps_clean.add_argument(
        "-encryption",
        help="encryption method",
        choices=("keep", "none", "rc4-40", "rc4-128", "aes-128", "aes-256"),
        default="none",
    )

    ps_clean.add_argument("-owner", type=str, help="owner password")
    ps_clean.add_argument("-user", type=str, help="user password")

    ps_clean.add_argument(
        "-garbage",
        type=int,
        help="garbage collection level",
        choices=range(5),
        default=0,
    )

    ps_clean.add_argument(
        "-compress",
        action="store_true",
        default=False,
        help="compress (deflate) output",
    )

    ps_clean.add_argument(
        "-ascii", action="store_true", default=False, help="ASCII encode binary data"
    )

    ps_clean.add_argument(
        "-linear",
        action="store_true",
        default=False,
        help="format for fast web display",
    )

    ps_clean.add_argument(
        "-permission", type=int, default=-1, help="integer with permission levels"
    )

    ps_clean.add_argument(
        "-sanitize",
        action="store_true",
        default=False,
        help="sanitize / clean contents",
    )
    ps_clean.add_argument(
        "-pretty", action="store_true", default=False, help="prettify PDF structure"
    )
    # Help text previously read "selected pages pages" (duplicated word).
    ps_clean.add_argument(
        "-pages", help="output selected pages, format: 1,5-7,50-N"
    )
    ps_clean.set_defaults(func=clean)

    # -------------------------------------------------------------------------
    # 'join' command
    # -------------------------------------------------------------------------
    ps_join = subps.add_parser(
        "join",
        description=mycenter("join PDF documents"),
        epilog="specify each input as 'filename[,password[,pages]]'",
    )
    ps_join.add_argument("input", nargs="*", help="input filenames")
    ps_join.add_argument("-output", required=True, help="output filename")
    ps_join.set_defaults(func=doc_join)

    # -------------------------------------------------------------------------
    # 'extract' command
    # -------------------------------------------------------------------------
    ps_extract = subps.add_parser(
        "extract", description=mycenter("extract images and fonts to disk")
    )
    ps_extract.add_argument("input", type=str, help="PDF filename")
    ps_extract.add_argument("-images", action="store_true", help="extract images")
    ps_extract.add_argument("-fonts", action="store_true", help="extract fonts")
    ps_extract.add_argument(
        "-output", help="folder to receive output, defaults to current"
    )
    ps_extract.add_argument("-password", help="password")
    ps_extract.add_argument(
        "-pages", type=str, help="consider these pages only, format: 1,5-7,50-N"
    )
    ps_extract.set_defaults(func=extract_objects)

    # -------------------------------------------------------------------------
    # 'embed-info' command
    # -------------------------------------------------------------------------
    # Uses its own variable instead of rebinding 'ps_show' from the 'show'
    # command above.
    ps_embed_info = subps.add_parser(
        "embed-info", description=mycenter("list embedded files")
    )
    ps_embed_info.add_argument("input", help="PDF filename")
    ps_embed_info.add_argument("-name", help="if given, report only this one")
    ps_embed_info.add_argument(
        "-detail", action="store_true", help="detail information"
    )
    ps_embed_info.add_argument("-password", help="password")
    ps_embed_info.set_defaults(func=embedded_list)

    # -------------------------------------------------------------------------
    # 'embed-add' command
    # -------------------------------------------------------------------------
    ps_embed_add = subps.add_parser(
        "embed-add", description=mycenter("add embedded file")
    )
    ps_embed_add.add_argument("input", help="PDF filename")
    ps_embed_add.add_argument("-password", help="password")
    ps_embed_add.add_argument(
        "-output", help="output PDF filename, incremental save if none"
    )
    ps_embed_add.add_argument("-name", required=True, help="name of new entry")
    ps_embed_add.add_argument("-path", required=True, help="path to data for new entry")
    ps_embed_add.add_argument("-desc", help="description of new entry")
    ps_embed_add.set_defaults(func=embedded_add)

    # -------------------------------------------------------------------------
    # 'embed-del' command
    # -------------------------------------------------------------------------
    ps_embed_del = subps.add_parser(
        "embed-del", description=mycenter("delete embedded file")
    )
    ps_embed_del.add_argument("input", help="PDF filename")
    ps_embed_del.add_argument("-password", help="password")
    ps_embed_del.add_argument(
        "-output", help="output PDF filename, incremental save if none"
    )
    ps_embed_del.add_argument("-name", required=True, help="name of entry to delete")
    ps_embed_del.set_defaults(func=embedded_del)

    # -------------------------------------------------------------------------
    # 'embed-upd' command
    # -------------------------------------------------------------------------
    ps_embed_upd = subps.add_parser(
        "embed-upd",
        description=mycenter("update embedded file"),
        epilog="except '-name' all parameters are optional",
    )
    ps_embed_upd.add_argument("input", help="PDF filename")
    ps_embed_upd.add_argument("-name", required=True, help="name of entry")
    ps_embed_upd.add_argument("-password", help="password")
    ps_embed_upd.add_argument(
        "-output", help="Output PDF filename, incremental save if none"
    )
    ps_embed_upd.add_argument("-path", help="path to new data for entry")
    ps_embed_upd.add_argument("-filename", help="new filename to store in entry")
    ps_embed_upd.add_argument(
        "-ufilename", help="new unicode filename to store in entry"
    )
    ps_embed_upd.add_argument("-desc", help="new description to store in entry")
    ps_embed_upd.set_defaults(func=embedded_upd)

    # -------------------------------------------------------------------------
    # 'embed-extract' command
    # -------------------------------------------------------------------------
    ps_embed_extract = subps.add_parser(
        "embed-extract", description=mycenter("extract embedded file to disk")
    )
    ps_embed_extract.add_argument("input", type=str, help="PDF filename")
    ps_embed_extract.add_argument("-name", required=True, help="name of entry")
    ps_embed_extract.add_argument("-password", help="password")
    ps_embed_extract.add_argument(
        "-output", help="output filename, default is stored name"
    )
    ps_embed_extract.set_defaults(func=embedded_get)

    # -------------------------------------------------------------------------
    # 'embed-copy' command
    # -------------------------------------------------------------------------
    ps_embed_copy = subps.add_parser(
        "embed-copy", description=mycenter("copy embedded files between PDFs")
    )
    ps_embed_copy.add_argument("input", type=str, help="PDF to receive embedded files")
    ps_embed_copy.add_argument("-password", help="password of input")
    ps_embed_copy.add_argument(
        "-output", help="output PDF, incremental save to 'input' if omitted"
    )
    ps_embed_copy.add_argument(
        "-source", required=True, help="copy embedded files from here"
    )
    ps_embed_copy.add_argument("-pwdsource", help="password of 'source' PDF")
    ps_embed_copy.add_argument(
        "-name", nargs="*", help="restrict copy to these entries"
    )
    ps_embed_copy.set_defaults(func=embedded_copy)

    # -------------------------------------------------------------------------
    # 'gettext' command
    # -------------------------------------------------------------------------
    ps_gettext = subps.add_parser(
        "gettext", description=mycenter("extract text in various formatting modes")
    )
    ps_gettext.add_argument("input", type=str, help="input document filename")
    ps_gettext.add_argument("-password", help="password for input document")
    ps_gettext.add_argument(
        "-mode",
        type=str,
        help="mode: simple, block sort, or layout (default)",
        choices=("simple", "blocks", "layout"),
        default="layout",
    )
    ps_gettext.add_argument(
        "-pages",
        type=str,
        help="select pages, format: 1,5-7,50-N",
        default="1-N",
    )
    ps_gettext.add_argument(
        "-noligatures",
        action="store_true",
        help="expand ligature characters (default False)",
        default=False,
    )
    ps_gettext.add_argument(
        "-convert-white",
        action="store_true",
        help="convert whitespace characters to white (default False)",
        default=False,
    )
    ps_gettext.add_argument(
        "-extra-spaces",
        action="store_true",
        help="fill gaps with spaces (default False)",
        default=False,
    )
    ps_gettext.add_argument(
        "-noformfeed",
        action="store_true",
        help="write linefeeds, no formfeeds (default False)",
        default=False,
    )
    ps_gettext.add_argument(
        "-skip-empty",
        action="store_true",
        help="suppress pages with no text (default False)",
        default=False,
    )
    ps_gettext.add_argument(
        "-output",
        help="store text in this file (default inputfilename.txt)",
    )
    ps_gettext.add_argument(
        "-grid",
        type=float,
        help="merge lines if closer than this (default 2)",
        default=2,
    )
    ps_gettext.add_argument(
        "-fontsize",
        type=float,
        help="only include text with a larger fontsize (default 3)",
        default=3,
    )
    ps_gettext.set_defaults(func=gettext)

    # -------------------------------------------------------------------------
    # 'internal' command
    # -------------------------------------------------------------------------
    ps_internal = subps.add_parser(
        "internal", description=mycenter("internal testing")
    )
    ps_internal.set_defaults(func=_internal)

    # -------------------------------------------------------------------------
    # start program
    # -------------------------------------------------------------------------
    args = parser.parse_args(argv)  # create parameter arguments class
    if not hasattr(args, "func"):  # no function selected
        parser.print_help()  # so print top level help
    else:
        args.func(args)  # execute requested command
| 1137 | |
| 1138 | |
# Run the command line interface only when this module is executed as the
# main program; importing it has no such side effect.
if __name__ == "__main__":
    main()
