comparison src/__main__.py @ 1:1d09e1dec1d9 upstream

ADD: PyMuPDF v1.26.4: the original sdist. It does not yet contain MuPDF. This normally will be downloaded when building PyMuPDF.
author Franz Glasner <fzglas.hg@dom66.de>
date Mon, 15 Sep 2025 11:37:51 +0200
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 1:1d09e1dec1d9
1 # -----------------------------------------------------------------------------
2 # Copyright 2020-2022, Harald Lieder, mailto:harald.lieder@outlook.com
3 # License: GNU AFFERO GPL 3.0, https://www.gnu.org/licenses/agpl-3.0.html
4 # Part of "PyMuPDF", Python bindings for "MuPDF" (http://mupdf.com), a
5 # lightweight PDF, XPS, and E-book viewer, renderer and toolkit which is
6 # maintained and developed by Artifex Software, Inc. https://artifex.com.
7 # -----------------------------------------------------------------------------
8 import argparse
9 import bisect
10 import os
11 import sys
12 import statistics
13 from typing import Dict, List, Set
14
15 from . import pymupdf
16
def mycenter(x):
    """Return *x* padded with one blank on each side, centered in 75 dashes."""
    padded = f" {x} "
    return padded.center(75, "-")
19
20
def recoverpix(doc, item):
    """Return the image for a PDF image item.

    Args:
        doc: the document owning the image.
        item: sequence whose first entry is the image xref and whose second
            entry is the xref of its /SMask (0 if there is none).
    Returns:
        The result of doc.extract_image() when there is no /SMask,
        otherwise a pixmap (with the mask applied as alpha when the mask
        passes the sanity check).
    """
    xref, smask = item[0], item[1]
    if smask == 0:
        # no soft mask: hand back the directly extracted image
        return doc.extract_image(xref)

    def getimage(pix):
        # pixmaps whose colorspace has 4 components are converted to RGB
        if pix.colorspace.n == 4:
            return pymupdf.Pixmap(pymupdf.csRGB, pix)
        return pix

    # the alpha channel must be rebuilt from the soft mask
    base = pymupdf.Pixmap(doc, xref)
    mask = pymupdf.Pixmap(doc, smask)  # pixmap of the /SMask entry

    # Sanity check:
    # - both pixmaps must have the same rectangle
    # - both pixmaps must have alpha=0
    # - the mask must consist of 1 byte per pixel
    usable = (
        base.irect == mask.irect
        and base.alpha == mask.alpha == 0
        and mask.n == 1
    )
    if not usable:
        pymupdf.message("Warning: unsupported /SMask %i for %i:" % (smask, xref))
        pymupdf.message(mask)
        mask = None
        return getimage(base)  # fall back to the unmasked pixmap

    combined = pymupdf.Pixmap(base)  # copy of the base pixmap
    combined.set_alpha(mask.samples)  # mask samples serve as alpha values
    base = mask = None  # release the temporary pixmaps

    # a 4-component result still needs conversion
    return getimage(combined)
55
56
def open_file(filename, password, show=False, pdf=True):
    """Open and authenticate a document.

    Args:
        filename: path of the document, passed to pymupdf.open().
        password: password used for authentication; may be None or empty.
        show: if True, report which access level authentication granted.
        pdf: if True, only PDF documents are accepted.
    Returns:
        The opened document (authenticated if it needed a password).

    The process is terminated via sys.exit() if the document is not a PDF
    although one is required, if a needed password is missing, or if
    authentication fails.
    """
    doc = pymupdf.open(filename)
    if not doc.is_pdf and pdf is True:
        sys.exit("this command supports PDF files only")
    if not doc.needs_pass:
        return doc
    if not password:
        sys.exit("'%s' requires a password" % doc.name)

    rc = doc.authenticate(password)
    if not rc:
        sys.exit("authentication unsuccessful")
    if show is True:
        # The conditional must be parenthesized: '%' binds tighter than
        # 'if ... else', so without the parentheses a non-owner login
        # printed just "user" instead of the full message.
        pymupdf.message("authenticated as %s" % ("owner" if rc > 2 else "user"))
    return doc
74
75
def print_dict(item):
    """Print a Python dictionary.

    Each entry is written as "key: value" with the keys right-aligned to a
    common width (longest key plus one). An empty dictionary prints nothing.
    """
    if not item:
        # max() over an empty sequence would raise ValueError
        return
    width = max(len(k) for k in item) + 1
    for k, v in item.items():
        pymupdf.message("%s: %s" % (k.rjust(width), v))
82
83
def print_xref(doc, xref):
    """Print an object given by XREF number.

    Simulate the PDF source in "pretty" format.
    For a stream also print its size, taken from the /Length entry of the
    object definition. The size is reported as "unknown" if there is no
    usable /Length entry or if it is an indirect reference ("n g R").
    """
    pymupdf.message("%i 0 obj" % xref)
    xref_str = doc.xref_object(xref)
    pymupdf.message(xref_str)
    if doc.xref_is_stream(xref):
        tokens = xref_str.split()
        size = "unknown"
        if "/Length" in tokens:
            idx = tokens.index("/Length") + 1
            value = tokens[idx : idx + 1]  # empty if /Length is the last token
            # The definition is split on whitespace, so an indirect reference
            # like "12 0 R" shows up as three separate tokens. Checking a
            # single token for the suffix "0 R" could never match.
            follow = tokens[idx + 1 : idx + 3]
            indirect = len(follow) == 2 and follow[0].isdigit() and follow[1] == "R"
            if value and not indirect:
                size = value[0]
        pymupdf.message("stream\n...%s bytes" % size)
        pymupdf.message("endstream")
    pymupdf.message("endobj")
105
106
def get_list(rlist, limit, what="page"):
    """Transform a page / xref specification into a list of integers.

    Args
    ----
    rlist: (str) the specification, a comma-separated mix of single numbers
        and "a-b" ranges; "N" stands for the largest valid number.
    limit: exclusive upper bound of valid numbers (valid: 1 .. limit - 1)
    what: a string to be used in error messages
    Returns
    -------
    A list of integers representing the specification. Ranges given in
    descending order are emitted in descending order.
    """
    spec = rlist.replace("N", str(limit - 1)).replace(" ", "")
    numbers = []
    for pos, token in enumerate(spec.split(","), start=1):
        if token.isdecimal():  # a single integer
            value = int(token)
            if not 1 <= value < limit:
                sys.exit("bad %s specification at item %i" % (what, pos))
            numbers.append(value)
            continue

        # anything else has to be a range "first-last" of two integers
        try:
            first, last = token.split("-")
            first = int(first)
            last = int(last)
        except Exception:
            sys.exit("bad %s range specification at item %i" % (what, pos))

        if not (1 <= first < limit and 1 <= last < limit):
            sys.exit("bad %s range specification at item %i" % (what, pos))

        # walk upwards or downwards, both ends inclusive
        step = 1 if first <= last else -1
        numbers.extend(range(first, last + step, step))

    return numbers
152
153
def show(args):
    """Print general information about a PDF and the details requested.

    A one-line summary is always printed. Depending on the options in
    *args*, the catalog, the metadata, selected objects, selected pages and
    the trailer follow, each under its own headline.
    """
    doc = open_file(args.input, args.password, True)

    # file size in KB, switching to MB above 1000 KB
    size, unit = os.path.getsize(args.input) / 1024, "KB"
    if size > 1000:
        size, unit = size / 1024, "MB"
    size = round(size, 1)

    meta = doc.metadata  # pylint: disable=no-member
    summary = "'%s', pages: %i, objects: %i, %g %s, %s, encryption: %s" % (
        args.input,
        doc.page_count,
        doc.xref_length() - 1,
        size,
        unit,
        meta["format"],
        meta["encryption"],
    )
    pymupdf.message(summary)

    field_count = doc.is_form_pdf
    if field_count > 0:
        sigflags = doc.get_sigflags()
        negation = "" if sigflags == 3 else "not "
        pymupdf.message(
            "document contains %i root form fields and is %ssigned"
            % (field_count, negation)
        )

    embedded_count = doc.embfile_count()
    if embedded_count > 0:
        pymupdf.message("document contains %i embedded files" % embedded_count)
    pymupdf.message()

    if args.catalog:
        pymupdf.message(mycenter("PDF catalog"))
        print_xref(doc, doc.pdf_catalog())
        pymupdf.message()

    if args.metadata:
        pymupdf.message(mycenter("PDF metadata"))
        print_dict(doc.metadata)  # pylint: disable=no-member
        pymupdf.message()

    if args.xrefs:
        pymupdf.message(mycenter("object information"))
        for xref in get_list(args.xrefs, doc.xref_length(), what="xref"):
            print_xref(doc, xref)
            pymupdf.message()

    if args.pages:
        pymupdf.message(mycenter("page information"))
        for pno in get_list(args.pages, doc.page_count + 1):
            page_xref = doc.page_xref(pno - 1)  # page numbers are 1-based here
            pymupdf.message("Page %i:" % pno)
            print_xref(doc, page_xref)
            pymupdf.message()

    if args.trailer:
        pymupdf.message(mycenter("PDF trailer"))
        pymupdf.message(doc.pdf_trailer())
        pymupdf.message()

    doc.close()
215
216
def clean(args):
    """Save an optimized copy of a PDF, or a sub-PDF of selected pages.

    Without a page selection the input document itself is saved to
    args.output using the requested save options. With a page selection,
    the chosen pages are first copied into a new document in the order
    given, and that document is saved instead.
    """
    doc = open_file(args.input, args.password, pdf=True)
    # the position of the method name is passed on as the encryption value
    encrypt = ("keep", "none", "rc4-40", "rc4-128", "aes-128", "aes-256").index(
        args.encryption
    )
    # both output variants are written with identical options
    save_options = dict(
        garbage=args.garbage,
        deflate=args.compress,
        pretty=args.pretty,
        clean=args.sanitize,
        ascii=args.ascii,
        linear=args.linear,
        encryption=encrypt,
        owner_pw=args.owner,
        user_pw=args.user,
        permissions=args.permission,
    )

    if not args.pages:  # simple cleaning
        doc.save(args.output, **save_options)
        doc.close()  # previously left open on this path
        return

    # create sub document from page numbers
    outdoc = pymupdf.open()
    for pno in get_list(args.pages, doc.page_count + 1):
        n = pno - 1  # insert_pdf is called with 0-based page numbers
        outdoc.insert_pdf(doc, from_page=n, to_page=n)
    outdoc.save(args.output, **save_options)
    doc.close()
    outdoc.close()
    return
262
263
def doc_join(args):
    """Join pages from several PDF documents.

    Every entry of args.input has the form 'filename[,password[,pages]]'.
    The selected pages (all pages if none are given) of each input are
    appended to a new PDF, which is finally saved as args.output.
    """
    sources = args.input  # a list of input specifications
    target = pymupdf.open()  # output PDF
    for spec in sources:
        filename, *rest = spec.split(",")
        password = rest[0] if rest else None
        src = open_file(filename, password, pdf=True)
        page_spec = ",".join(rest[1:])  # everything after the password
        if page_spec:
            wanted = get_list(page_spec, src.page_count + 1)
        else:  # no selection: take all pages
            wanted = range(1, src.page_count + 1)
        for pno in wanted:
            # copy one source page at a time to keep the requested order
            target.insert_pdf(src, from_page=pno - 1, to_page=pno - 1)
        src.close()

    target.save(args.output, garbage=4, deflate=True)
    target.close()
283
284
def embedded_copy(args):
    """Copy embedded files between PDFs.

    Entries are copied from args.source into args.input. If args.name is
    given, only those entries are copied, and all of them must exist in the
    source. Names already present in the receiving PDF are refused. The
    result is saved incrementally unless a different output file is given.
    """
    doc = open_file(args.input, args.password, pdf=True)
    in_place = not args.output or args.output == args.input
    if not doc.can_save_incrementally() and in_place:
        sys.exit("cannot save PDF incrementally")

    src = open_file(args.source, args.pwdsource)
    requested = set(args.name) if args.name else set()
    available = set(src.embfile_names())
    if requested and not requested <= available:
        sys.exit("not all names are contained in source")
    names = requested or available  # nothing requested means: copy all
    if not names:
        sys.exit("nothing to copy")

    clashes = names & set(doc.embfile_names())  # names already in target?
    if clashes:
        sys.exit("following names already exist in receiving PDF: %s" % str(clashes))

    for name in names:
        info = src.embfile_info(name)
        content = src.embfile_get(name)
        doc.embfile_add(
            name,
            content,
            filename=info["filename"],
            ufilename=info["ufilename"],
            desc=info["desc"],
        )
        pymupdf.message("copied entry '%s' from '%s'" % (name, src.name))
    src.close()

    if in_place:
        doc.saveIncr()
    else:
        doc.save(args.output, garbage=3)
    doc.close()
323
324
def embedded_del(args):
    """Delete an embedded file entry.

    The change is saved incrementally unless args.output names a file
    different from the input.
    """
    doc = open_file(args.input, args.password, pdf=True)
    in_place = not args.output or args.output == args.input
    if not doc.can_save_incrementally() and in_place:
        sys.exit("cannot save PDF incrementally")

    try:
        doc.embfile_del(args.name)
    except (ValueError, pymupdf.mupdf.FzErrorBase) as e:
        sys.exit(f'no such embedded file {args.name!r}: {e}')

    if in_place:
        doc.saveIncr()
    else:
        doc.save(args.output, garbage=1)
    doc.close()
342
343
def embedded_get(args):
    """Retrieve contents of an embedded file.

    The content is written to args.output, or to the filename stored in
    the entry if no output name is given.
    """
    doc = open_file(args.input, args.password, pdf=True)
    try:
        content = doc.embfile_get(args.name)
        info = doc.embfile_info(args.name)
    except (ValueError, pymupdf.mupdf.FzErrorBase) as e:
        sys.exit(f'no such embedded file {args.name!r}: {e}')

    target = args.output or info["filename"]
    with open(target, "wb") as outfile:
        outfile.write(content)
    pymupdf.message("saved entry '%s' as '%s'" % (args.name, target))
    doc.close()
357
358
def embedded_add(args):
    """Insert a new embedded file.

    The file at args.path is stored under the entry name args.name. Its
    path serves as filename and unicode filename of the entry, and also as
    description unless args.desc is given. The result is saved
    incrementally unless args.output names a file different from the input.

    The process is terminated via sys.exit() if incremental saving is needed
    but not possible, if the entry name already exists, or if args.path is
    not an existing file.
    """
    doc = open_file(args.input, args.password, pdf=True)
    # one definition of "in place", used for the check and for the save
    in_place = not args.output or args.output == args.input
    if not doc.can_save_incrementally() and in_place:
        sys.exit("cannot save PDF incrementally")

    # Look the name up instead of probing with a trial deletion: the probe
    # modified the document and hid every unrelated error behind a blanket
    # "except Exception: pass".
    if args.name in doc.embfile_names():
        sys.exit("entry '%s' already exists" % args.name)

    if not os.path.isfile(args.path):
        sys.exit("no such file '%s'" % args.path)
    with open(args.path, "rb") as f:
        stream = f.read()

    filename = args.path
    desc = args.desc if args.desc else filename
    doc.embfile_add(
        args.name, stream, filename=filename, ufilename=filename, desc=desc
    )
    if in_place:
        doc.saveIncr()
    else:
        doc.save(args.output, garbage=3)
    doc.close()
391
392
def embedded_upd(args):
    """Update contents or metadata of an embedded file.

    Only the items actually supplied are passed on as new values: content
    (read from args.path), filename, unicode filename (defaults to the new
    filename) and description. Everything else is passed as None. The
    result is saved incrementally unless args.output names a file different
    from the input.

    The process is terminated via sys.exit() if incremental saving is needed
    but not possible, if the entry does not exist, or if args.path is given
    but is not an existing file.
    """
    doc = open_file(args.input, args.password, pdf=True)
    in_place = not args.output or args.output == args.input
    if not doc.can_save_incrementally() and in_place:
        sys.exit("cannot save PDF incrementally")

    try:
        doc.embfile_info(args.name)
    except Exception:
        sys.exit("no such embedded file '%s'" % args.name)

    if not args.path:
        stream = None  # content is not part of this update
    elif os.path.isfile(args.path):
        with open(args.path, "rb") as f:
            stream = f.read()
    else:
        # A path that was given explicitly but cannot be read used to be
        # ignored silently; report it the way embedded_add does.
        sys.exit("no such file '%s'" % args.path)

    filename = args.filename or None
    ufilename = args.ufilename or args.filename or None
    desc = args.desc or None

    doc.embfile_upd(
        args.name, stream, filename=filename, ufilename=ufilename, desc=desc
    )
    if in_place:
        doc.saveIncr()
    else:
        doc.save(args.output, garbage=3)
    doc.close()
441
442
def embedded_list(args):
    """List embedded files.

    With args.name, only that entry is reported (with its details), and the
    process is terminated via sys.exit() if it does not exist. Otherwise all
    entry names are listed, followed by their details if args.detail is set.
    The document is closed on every path out of this function.
    """
    doc = open_file(args.input, args.password, pdf=True)
    try:
        names = doc.embfile_names()
        if args.name is not None:
            if args.name not in names:
                sys.exit("no such embedded file '%s'" % args.name)
            pymupdf.message()
            pymupdf.message(
                "printing 1 of %i embedded file%s:"
                % (len(names), "s" if len(names) > 1 else "")
            )
            pymupdf.message()
            print_dict(doc.embfile_info(args.name))
            pymupdf.message()
            return

        if not names:
            pymupdf.message("'%s' contains no embedded files" % doc.name)
            return

        if len(names) > 1:
            msg = "'%s' contains the following %i embedded files" % (
                doc.name,
                len(names),
            )
        else:
            msg = "'%s' contains the following embedded file" % doc.name
        pymupdf.message(msg)
        pymupdf.message()
        for name in names:
            if not args.detail:
                pymupdf.message(name)
                continue
            # the entry info is fetched once; a second, unused lookup was dropped
            print_dict(doc.embfile_info(name))
            pymupdf.message()
    finally:
        # also runs for the early returns and for sys.exit()
        doc.close()
477
478
def extract_objects(args):
    """Extract images and / or fonts from a PDF.

    Fonts and images referenced by the selected pages (all pages if none
    are selected) are written to args.output, or to the current directory
    if no output folder is given. Each xref is handled only once, even if
    several pages refer to it. The final messages report the number of
    files actually written.

    The process is terminated via sys.exit() if neither fonts nor images
    are requested or if the output directory does not exist.
    """
    if not args.fonts and not args.images:
        sys.exit("neither fonts nor images requested")
    doc = open_file(args.input, args.password, pdf=True)

    if args.pages:
        pages = get_list(args.pages, doc.page_count + 1)
    else:
        pages = range(1, doc.page_count + 1)

    if not args.output:
        out_dir = os.path.abspath(os.curdir)
    else:
        out_dir = args.output
        if not (os.path.exists(out_dir) and os.path.isdir(out_dir)):
            sys.exit("output directory %s does not exist" % out_dir)

    font_xrefs = set()  # fonts already looked at
    image_xrefs = set()  # images already looked at
    fonts_saved = 0  # fonts actually written to disk
    images_saved = 0  # images actually written to disk

    for pno in pages:
        if args.fonts:
            for item in doc.get_page_fonts(pno - 1):
                xref = item[0]
                if xref in font_xrefs:
                    continue
                font_xrefs.add(xref)
                fontname, ext, _, buffer = doc.extract_font(xref)
                if ext == "n/a" or not buffer:
                    # nothing to write for this font; it must not be
                    # reported as saved
                    continue
                outname = os.path.join(
                    out_dir, f"{fontname.replace(' ', '-')}-{xref}.{ext}"
                )
                with open(outname, "wb") as outfile:
                    outfile.write(buffer)
                fonts_saved += 1
                buffer = None
        if args.images:
            for item in doc.get_page_images(pno - 1):
                xref = item[0]
                if xref in image_xrefs:
                    continue
                image_xrefs.add(xref)
                pix = recoverpix(doc, item)
                if isinstance(pix, dict):
                    # directly extracted image: keep its own file format
                    ext = pix["ext"]
                    imgdata = pix["image"]
                    outname = os.path.join(out_dir, "img-%i.%s" % (xref, ext))
                    with open(outname, "wb") as outfile:
                        outfile.write(imgdata)
                else:
                    # a pixmap: save as PNG, converting to RGB when the
                    # colorspace has 4 or more components
                    outname = os.path.join(out_dir, "img-%i.png" % xref)
                    pix2 = (
                        pix
                        if pix.colorspace.n < 4
                        else pymupdf.Pixmap(pymupdf.csRGB, pix)
                    )
                    pix2.save(outname)
                images_saved += 1

    if args.fonts:
        pymupdf.message("saved %i fonts to '%s'" % (fonts_saved, out_dir))
    if args.images:
        pymupdf.message("saved %i images to '%s'" % (images_saved, out_dir))
    doc.close()
543
544
def page_simple(page, textout, GRID, fontsize, noformfeed, skip_empty, flags):
    """Write the plain text of *page* to *textout*, followed by a page end.

    The page end is a line feed if *noformfeed* is set, else a form feed.
    A page without text yields only the page end, or nothing at all if
    *skip_empty* is set. GRID and fontsize are not used by this mode.
    """
    page_end = b"\n" if noformfeed else b"\x0c"
    text = page.get_text("text", flags=flags)
    if text:
        textout.write(text.encode("utf8", errors="surrogatepass"))
        textout.write(page_end)
    elif not skip_empty:
        textout.write(page_end)  # empty page: page end only
555
556
def page_blocksort(page, textout, GRID, fontsize, noformfeed, skip_empty, flags):
    """Write the text blocks of *page* to *textout* in sorted order.

    Blocks are ordered by their items 3 and 0 (in that priority), and the
    text in item 4 of each block is written. A page end (line feed if
    *noformfeed* is set, else form feed) follows. A page without blocks
    yields only the page end, or nothing if *skip_empty* is set.
    GRID and fontsize are not used by this mode.
    """
    page_end = b"\n" if noformfeed else b"\x0c"
    blocks = page.get_text("blocks", flags=flags)
    if not blocks:
        if not skip_empty:
            textout.write(page_end)  # empty page: page end only
        return
    for block in sorted(blocks, key=lambda blk: (blk[3], blk[0])):
        textout.write(block[4].encode("utf8", errors="surrogatepass"))
    textout.write(page_end)
569
570
def page_layout(page, textout, GRID, fontsize, noformfeed, skip_empty, flags):
    """Write the text of *page* to *textout*, imitating the page layout.

    The page is read character by character ("rawdict"). Characters are
    assigned to output lines by the y-coordinate of their origin, where
    differences smaller than GRID are ignored, and are positioned inside a
    line according to their x-coordinate. Spans with a font size not
    above *fontsize* and non-horizontal lines are ignored. A page end
    (line feed if *noformfeed* is set, else form feed) is written last;
    a page without usable characters yields only the page end, or nothing
    if *skip_empty* is set.
    """
    eop = b"\n" if noformfeed else bytes([12])

    # ligature characters, keyed by the character sequence they replace
    ligatures = {
        "ff": chr(0xFB00),
        "fi": chr(0xFB01),
        "fl": chr(0xFB02),
        "ffi": chr(0xFB03),
        "ffl": chr(0xFB04),
        "ft": chr(0xFB05),
        "st": chr(0xFB06),
    }

    # --------------------------------------------------------------------
    def find_line_index(values: List[int], value: int) -> int:
        """Find the right row coordinate.

        Args:
            values: (list) ascending y-coordinates of rows.
            value: (int) lookup for this value (y-origin of char).
        Returns:
            The largest row coordinate not exceeding value.
        """
        pos = bisect.bisect_right(values, value)
        if not pos:
            raise RuntimeError("Line for %g not found in %s" % (value, values))
        return values[pos - 1]

    # --------------------------------------------------------------------
    def curate_rows(rows: Set[int], GRID) -> List:
        """Return the sorted row coordinates, dropping any that is less
        than GRID away from the previously kept one."""
        ordered = sorted(rows)
        kept = [ordered[0]]
        for y in ordered[1:]:
            if y >= kept[-1] + GRID:  # only keep significant differences
                kept.append(y)
        return kept

    def joinligature(lig: str) -> str:
        """Return ligature character for a given pair / triple of characters.

        Args:
            lig: (str) 2/3 characters, e.g. "ff"
        Returns:
            Ligature, e.g. "ff" -> chr(0xFB00); the input itself if it is
            not a known ligature.
        """
        return ligatures.get(lig, lig)

    def process_blocks(blocks: List[Dict], page: pymupdf.Page):
        """Collect the characters of the page.

        Returns:
            (chars, rows, left, right, rowheight): the list of character
            tuples (char, x-origin, rounded y-origin, width), the set of
            rounded y-origins, the smallest x-origin of a non-space
            character, the largest right character edge, and the smallest
            height among the accepted lines.
        """
        rows = set()
        page_height = page.rect.height
        rowheight = page_height
        left = page.rect.width
        right = 0
        chars = []
        for block in blocks:
            for line in block["lines"]:
                if line["dir"] != (1, 0):  # ignore non-horizontal text
                    continue
                _, line_y0, _, line_y1 = line["bbox"]
                if line_y1 < 0 or line_y0 > page_height:  # not on the page
                    continue
                # keep track of the smallest line height
                height = line_y1 - line_y0
                if rowheight > height:
                    rowheight = height
                for span in line["spans"]:
                    if span["size"] <= fontsize:
                        continue
                    for c in span["chars"]:
                        x0, _, x1, _ = c["bbox"]
                        cwidth = x1 - x0
                        ox, oy = c["origin"]
                        oy = int(round(oy))
                        rows.add(oy)
                        ch = c["c"]
                        if left > ox and ch != " ":
                            left = ox  # new leftmost coordinate
                        if right < x1:
                            right = x1  # new rightmost coordinate
                        # a zero-width char on the same row as its
                        # predecessor is merged with it into a ligature
                        if cwidth == 0 and chars:
                            prev_ch, prev_ox, prev_oy, prev_width = chars[-1]
                            if prev_oy == oy:
                                if prev_ch != chr(0xFB00):  # not after "ff"
                                    merged = joinligature(prev_ch + ch)
                                # after "ff": one of the 3-char ligatures
                                elif ch == "i":
                                    merged = chr(0xFB03)  # "ffi"
                                elif ch == "l":
                                    merged = chr(0xFB04)  # "ffl"
                                else:  # unexpected: leave old char in place
                                    merged = prev_ch
                                chars[-1] = (merged, prev_ox, prev_oy, prev_width)
                                continue
                        chars.append((ch, ox, oy, cwidth))  # all chars on page
        return chars, rows, left, right, rowheight

    # --------------------------------------------------------------------
    def make_textline(left, slot, minslot, lchars):
        """Produce the text of one output line.

        Args:
            left: (float) left most coordinate used on page
            slot: (float) width that corresponds to one output position.
            minslot: (float) min width for the characters in this line.
            lchars: (list[tuple]) characters of this line.
        Returns:
            text: (str) text string for this line
        """
        if minslot <= pymupdf.EPSILON:
            raise RuntimeError("program error: minslot too small = %g" % minslot)

        text = ""  # the line being built
        prev_char = ""
        prev_x1 = 0  # end coordinate of last char
        prev_ox = 0  # x-origin of last char

        for char, ox, _, cwidth in lchars:
            ox = ox - left  # start coordinate relative to the left border
            x1 = ox + cwidth  # end coordinate

            # eliminate overprint effect
            if prev_char == char and ox - prev_ox <= cwidth * 0.2:
                continue

            # omit spaces overlapping previous char
            if char == " " and (prev_x1 - ox) / cwidth > 0.8:
                continue

            prev_char = char
            if ox < prev_x1 + minslot:
                # close enough: treat as adjacent to the previous char
                text += char
                prev_x1 = x1
                prev_ox = ox
                continue

            # the char starts after a gap; spaces themselves are dropped
            if char == " ":
                continue
            # pad with blanks so the char lands in its slot of the line
            delta = int(ox / slot) - len(text)
            if ox > prev_x1 and delta > 1:
                text += " " * delta
            text += char
            prev_x1 = x1
            prev_ox = ox
        return text.rstrip()

    # extract page text by single characters ("rawdict")
    blocks = page.get_text("rawdict", flags=flags)["blocks"]
    chars, rows, left, right, rowheight = process_blocks(blocks, page)

    if not chars:
        if not skip_empty:
            textout.write(eop)  # empty page: page end only
        return

    # line coordinates, ignoring small (GRID) differences
    rows = curate_rows(rows, GRID)

    # sort all chars by x-coordinate, so every line receives its chars
    # ordered from left to right
    chars.sort(key=lambda c: c[1])

    # distribute the chars over the lines; key: line y-coordinate
    lines = {}
    for c in chars:
        lines.setdefault(find_line_index(rows, c[2]), []).append(c)

    # process the lines in ascending order of their coordinates
    keys = sorted(lines)

    # -------------------------------------------------------------------------
    # Compute "char resolution" for the page: the char width corresponding to
    # 1 text char position on output - call it 'slot'.
    # For each line with at least 2 chars, compute the median of its char
    # widths. The minimum across these lines (capped by the used page width)
    # is 'slot'.
    # The minimum char width of each line is used to determine if spaces must
    # be inserted in between two characters.
    # -------------------------------------------------------------------------
    slot = right - left
    minslots = {}
    for k in keys:
        line_chars = lines[k]
        if len(line_chars) < 2:
            minslots[k] = 1
            continue
        widths = sorted(c[3] for c in line_chars)
        median_width = statistics.median(widths)
        if median_width < slot:
            slot = median_width
        minslots[k] = widths[0]

    # compute line advance in text output
    rowheight = rowheight * (rows[-1] - rows[0]) / (rowheight * len(rows)) * 1.2
    rowpos = rows[0]  # first line positioned here
    textout.write(b"\n")
    for k in keys:  # walk through the lines
        while rowpos < k:  # honor distance between lines
            textout.write(b"\n")
            rowpos += rowheight
        text = make_textline(left, slot, minslots[k], lines[k])
        textout.write((text + "\n").encode("utf8", errors="surrogatepass"))
        rowpos = k + rowheight

    textout.write(eop)  # write page end
794
795
def gettext(args):
    """Extract the text of selected pages to a file.

    The text goes to args.output or, if that is None, to a file named like
    the document with its extension replaced by ".txt". args.mode selects
    the extraction function ("simple", "blocks" or "layout"). The remaining
    options adjust the text extraction flags and are passed on to that
    function.
    """
    doc = open_file(args.input, args.password, pdf=False)
    pagel = get_list(args.pages, doc.page_count + 1)
    output = args.output
    if output is None:
        filename, _ = os.path.splitext(doc.name)
        output = filename + ".txt"

    # Start with ligatures and whitespace preserved. Each option toggles
    # its flag: the first two switch a default off, the third switches
    # TEXT_INHIBIT_SPACES on.
    flags = pymupdf.TEXT_PRESERVE_LIGATURES | pymupdf.TEXT_PRESERVE_WHITESPACE
    if args.convert_white:
        flags ^= pymupdf.TEXT_PRESERVE_WHITESPACE
    if args.noligatures:
        flags ^= pymupdf.TEXT_PRESERVE_LIGATURES
    if args.extra_spaces:
        flags ^= pymupdf.TEXT_INHIBIT_SPACES

    func = {
        "simple": page_simple,
        "blocks": page_blocksort,
        "layout": page_layout,
    }
    try:
        with open(output, "wb") as textout:
            for pno in pagel:
                page = doc[pno - 1]  # page numbers in pagel are 1-based
                func[args.mode](
                    page,
                    textout,
                    args.grid,
                    args.fontsize,
                    args.noformfeed,
                    args.skip_empty,
                    flags=flags,
                )
    finally:
        # the document used to stay open, unlike in the other commands
        doc.close()
827
828
def _internal(args):
    """Emit one test line through pymupdf.message() and one through pymupdf.log().

    *args* is accepted for a uniform command signature but not used.
    """
    message_text = 'This is from PyMuPDF message().'
    log_text = 'This is from PyMuPDF log().'
    pymupdf.message(message_text)
    pymupdf.log(log_text)
832
833 def main():
834 """Define command configurations."""
835 parser = argparse.ArgumentParser(
836 prog="pymupdf",
837 description=mycenter("Basic PyMuPDF Functions"),
838 )
839 subps = parser.add_subparsers(
840 title="Subcommands", help="Enter 'command -h' for subcommand specific help"
841 )
842
843 # -------------------------------------------------------------------------
844 # 'show' command
845 # -------------------------------------------------------------------------
846 ps_show = subps.add_parser("show", description=mycenter("display PDF information"))
847 ps_show.add_argument("input", type=str, help="PDF filename")
848 ps_show.add_argument("-password", help="password")
849 ps_show.add_argument("-catalog", action="store_true", help="show PDF catalog")
850 ps_show.add_argument("-trailer", action="store_true", help="show PDF trailer")
851 ps_show.add_argument("-metadata", action="store_true", help="show PDF metadata")
852 ps_show.add_argument(
853 "-xrefs", type=str, help="show selected objects, format: 1,5-7,N"
854 )
855 ps_show.add_argument(
856 "-pages", type=str, help="show selected pages, format: 1,5-7,50-N"
857 )
858 ps_show.set_defaults(func=show)
859
860 # -------------------------------------------------------------------------
861 # 'clean' command
862 # -------------------------------------------------------------------------
863 ps_clean = subps.add_parser(
864 "clean", description=mycenter("optimize PDF, or create sub-PDF if pages given")
865 )
866 ps_clean.add_argument("input", type=str, help="PDF filename")
867 ps_clean.add_argument("output", type=str, help="output PDF filename")
868 ps_clean.add_argument("-password", help="password")
869
870 ps_clean.add_argument(
871 "-encryption",
872 help="encryption method",
873 choices=("keep", "none", "rc4-40", "rc4-128", "aes-128", "aes-256"),
874 default="none",
875 )
876
877 ps_clean.add_argument("-owner", type=str, help="owner password")
878 ps_clean.add_argument("-user", type=str, help="user password")
879
880 ps_clean.add_argument(
881 "-garbage",
882 type=int,
883 help="garbage collection level",
884 choices=range(5),
885 default=0,
886 )
887
888 ps_clean.add_argument(
889 "-compress",
890 action="store_true",
891 default=False,
892 help="compress (deflate) output",
893 )
894
895 ps_clean.add_argument(
896 "-ascii", action="store_true", default=False, help="ASCII encode binary data"
897 )
898
899 ps_clean.add_argument(
900 "-linear",
901 action="store_true",
902 default=False,
903 help="format for fast web display",
904 )
905
906 ps_clean.add_argument(
907 "-permission", type=int, default=-1, help="integer with permission levels"
908 )
909
910 ps_clean.add_argument(
911 "-sanitize",
912 action="store_true",
913 default=False,
914 help="sanitize / clean contents",
915 )
916 ps_clean.add_argument(
917 "-pretty", action="store_true", default=False, help="prettify PDF structure"
918 )
919 ps_clean.add_argument(
920 "-pages", help="output selected pages pages, format: 1,5-7,50-N"
921 )
922 ps_clean.set_defaults(func=clean)
923
924 # -------------------------------------------------------------------------
925 # 'join' command
926 # -------------------------------------------------------------------------
927 ps_join = subps.add_parser(
928 "join",
929 description=mycenter("join PDF documents"),
930 epilog="specify each input as 'filename[,password[,pages]]'",
931 )
932 ps_join.add_argument("input", nargs="*", help="input filenames")
933 ps_join.add_argument("-output", required=True, help="output filename")
934 ps_join.set_defaults(func=doc_join)
935
936 # -------------------------------------------------------------------------
937 # 'extract' command
938 # -------------------------------------------------------------------------
939 ps_extract = subps.add_parser(
940 "extract", description=mycenter("extract images and fonts to disk")
941 )
942 ps_extract.add_argument("input", type=str, help="PDF filename")
943 ps_extract.add_argument("-images", action="store_true", help="extract images")
944 ps_extract.add_argument("-fonts", action="store_true", help="extract fonts")
945 ps_extract.add_argument(
946 "-output", help="folder to receive output, defaults to current"
947 )
948 ps_extract.add_argument("-password", help="password")
949 ps_extract.add_argument(
950 "-pages", type=str, help="consider these pages only, format: 1,5-7,50-N"
951 )
952 ps_extract.set_defaults(func=extract_objects)
953
954 # -------------------------------------------------------------------------
955 # 'embed-info'
956 # -------------------------------------------------------------------------
957 ps_show = subps.add_parser(
958 "embed-info", description=mycenter("list embedded files")
959 )
960 ps_show.add_argument("input", help="PDF filename")
961 ps_show.add_argument("-name", help="if given, report only this one")
962 ps_show.add_argument("-detail", action="store_true", help="detail information")
963 ps_show.add_argument("-password", help="password")
964 ps_show.set_defaults(func=embedded_list)
965
966 # -------------------------------------------------------------------------
967 # 'embed-add' command
968 # -------------------------------------------------------------------------
969 ps_embed_add = subps.add_parser(
970 "embed-add", description=mycenter("add embedded file")
971 )
972 ps_embed_add.add_argument("input", help="PDF filename")
973 ps_embed_add.add_argument("-password", help="password")
974 ps_embed_add.add_argument(
975 "-output", help="output PDF filename, incremental save if none"
976 )
977 ps_embed_add.add_argument("-name", required=True, help="name of new entry")
978 ps_embed_add.add_argument("-path", required=True, help="path to data for new entry")
979 ps_embed_add.add_argument("-desc", help="description of new entry")
980 ps_embed_add.set_defaults(func=embedded_add)
981
982 # -------------------------------------------------------------------------
983 # 'embed-del' command
984 # -------------------------------------------------------------------------
985 ps_embed_del = subps.add_parser(
986 "embed-del", description=mycenter("delete embedded file")
987 )
988 ps_embed_del.add_argument("input", help="PDF filename")
989 ps_embed_del.add_argument("-password", help="password")
990 ps_embed_del.add_argument(
991 "-output", help="output PDF filename, incremental save if none"
992 )
993 ps_embed_del.add_argument("-name", required=True, help="name of entry to delete")
994 ps_embed_del.set_defaults(func=embedded_del)
995
996 # -------------------------------------------------------------------------
997 # 'embed-upd' command
998 # -------------------------------------------------------------------------
999 ps_embed_upd = subps.add_parser(
1000 "embed-upd",
1001 description=mycenter("update embedded file"),
1002 epilog="except '-name' all parameters are optional",
1003 )
1004 ps_embed_upd.add_argument("input", help="PDF filename")
1005 ps_embed_upd.add_argument("-name", required=True, help="name of entry")
1006 ps_embed_upd.add_argument("-password", help="password")
1007 ps_embed_upd.add_argument(
1008 "-output", help="Output PDF filename, incremental save if none"
1009 )
1010 ps_embed_upd.add_argument("-path", help="path to new data for entry")
1011 ps_embed_upd.add_argument("-filename", help="new filename to store in entry")
1012 ps_embed_upd.add_argument(
1013 "-ufilename", help="new unicode filename to store in entry"
1014 )
1015 ps_embed_upd.add_argument("-desc", help="new description to store in entry")
1016 ps_embed_upd.set_defaults(func=embedded_upd)
1017
1018 # -------------------------------------------------------------------------
1019 # 'embed-extract' command
1020 # -------------------------------------------------------------------------
1021 ps_embed_extract = subps.add_parser(
1022 "embed-extract", description=mycenter("extract embedded file to disk")
1023 )
1024 ps_embed_extract.add_argument("input", type=str, help="PDF filename")
1025 ps_embed_extract.add_argument("-name", required=True, help="name of entry")
1026 ps_embed_extract.add_argument("-password", help="password")
1027 ps_embed_extract.add_argument(
1028 "-output", help="output filename, default is stored name"
1029 )
1030 ps_embed_extract.set_defaults(func=embedded_get)
1031
1032 # -------------------------------------------------------------------------
1033 # 'embed-copy' command
1034 # -------------------------------------------------------------------------
1035 ps_embed_copy = subps.add_parser(
1036 "embed-copy", description=mycenter("copy embedded files between PDFs")
1037 )
1038 ps_embed_copy.add_argument("input", type=str, help="PDF to receive embedded files")
1039 ps_embed_copy.add_argument("-password", help="password of input")
1040 ps_embed_copy.add_argument(
1041 "-output", help="output PDF, incremental save to 'input' if omitted"
1042 )
1043 ps_embed_copy.add_argument(
1044 "-source", required=True, help="copy embedded files from here"
1045 )
1046 ps_embed_copy.add_argument("-pwdsource", help="password of 'source' PDF")
1047 ps_embed_copy.add_argument(
1048 "-name", nargs="*", help="restrict copy to these entries"
1049 )
1050 ps_embed_copy.set_defaults(func=embedded_copy)
1051
1052 # -------------------------------------------------------------------------
1053 # 'gettext' command
1054 # -------------------------------------------------------------------------
1055 ps_gettext = subps.add_parser(
1056 "gettext", description=mycenter("extract text in various formatting modes")
1057 )
1058 ps_gettext.add_argument("input", type=str, help="input document filename")
1059 ps_gettext.add_argument("-password", help="password for input document")
1060 ps_gettext.add_argument(
1061 "-mode",
1062 type=str,
1063 help="mode: simple, block sort, or layout (default)",
1064 choices=("simple", "blocks", "layout"),
1065 default="layout",
1066 )
1067 ps_gettext.add_argument(
1068 "-pages",
1069 type=str,
1070 help="select pages, format: 1,5-7,50-N",
1071 default="1-N",
1072 )
1073 ps_gettext.add_argument(
1074 "-noligatures",
1075 action="store_true",
1076 help="expand ligature characters (default False)",
1077 default=False,
1078 )
1079 ps_gettext.add_argument(
1080 "-convert-white",
1081 action="store_true",
1082 help="convert whitespace characters to white (default False)",
1083 default=False,
1084 )
1085 ps_gettext.add_argument(
1086 "-extra-spaces",
1087 action="store_true",
1088 help="fill gaps with spaces (default False)",
1089 default=False,
1090 )
1091 ps_gettext.add_argument(
1092 "-noformfeed",
1093 action="store_true",
1094 help="write linefeeds, no formfeeds (default False)",
1095 default=False,
1096 )
1097 ps_gettext.add_argument(
1098 "-skip-empty",
1099 action="store_true",
1100 help="suppress pages with no text (default False)",
1101 default=False,
1102 )
1103 ps_gettext.add_argument(
1104 "-output",
1105 help="store text in this file (default inputfilename.txt)",
1106 )
1107 ps_gettext.add_argument(
1108 "-grid",
1109 type=float,
1110 help="merge lines if closer than this (default 2)",
1111 default=2,
1112 )
1113 ps_gettext.add_argument(
1114 "-fontsize",
1115 type=float,
1116 help="only include text with a larger fontsize (default 3)",
1117 default=3,
1118 )
1119 ps_gettext.set_defaults(func=gettext)
1120
1121 # -------------------------------------------------------------------------
1122 # 'internal' command (handled by the _internal function)
1123 # -------------------------------------------------------------------------
1124 ps_internal = subps.add_parser(
1125 "internal", description=mycenter("internal testing")
1126 )
1127 ps_internal.set_defaults(func=_internal)
1128
1129 # -------------------------------------------------------------------------
1130 # start program
1131 # -------------------------------------------------------------------------
1132 args = parser.parse_args() # create parameter arguments class
1133 if not hasattr(args, "func"): # no function selected
1134 parser.print_help() # so print top level help
1135 else:
1136 args.func(args) # execute requested command
1137
1138
1139 if __name__ == "__main__":  # true only when this module is executed as the main program, not when imported
1140 main()  # build the argument parser and dispatch to the selected sub-command