comparison src_classic/__main__.py @ 1:1d09e1dec1d9 upstream

ADD: PyMuPDF v1.26.4: the original sdist. It does not yet contain MuPDF. This normally will be downloaded when building PyMuPDF.
author Franz Glasner <fzglas.hg@dom66.de>
date Mon, 15 Sep 2025 11:37:51 +0200
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 1:1d09e1dec1d9
1 # -----------------------------------------------------------------------------
2 # Copyright 2020-2022, Harald Lieder, mailto:harald.lieder@outlook.com
3 # License: GNU AFFERO GPL 3.0, https://www.gnu.org/licenses/agpl-3.0.html
4 # Part of "PyMuPDF", Python bindings for "MuPDF" (http://mupdf.com), a
5 # lightweight PDF, XPS, and E-book viewer, renderer and toolkit which is
6 # maintained and developed by Artifex Software, Inc. https://artifex.com.
7 # -----------------------------------------------------------------------------
8 import argparse
9 import bisect
10 import os
11 import sys
12 import statistics
13 from typing import Dict, List, Set, Tuple
14
15 import fitz
16 from fitz.fitz import (
17 TEXT_INHIBIT_SPACES,
18 TEXT_PRESERVE_LIGATURES,
19 TEXT_PRESERVE_WHITESPACE,
20 )
21
def mycenter(x):
    """Return *x* surrounded by single blanks and centered in 75 dashes."""
    padded = " %s " % x
    return padded.center(75, "-")
23
24
def recoverpix(doc, item):
    """Return the image for one entry of a page image list.

    Args:
        doc: the document containing the image.
        item: sequence whose first element is the image xref and whose
            second element is the xref of its /SMask (0 if there is none).
    Returns:
        The result of ``doc.extract_image`` when there is no /SMask,
        otherwise a pixmap (converted to RGB if its colorspace has 4
        components).
    """

    def rgb_if_four_components(pixmap):
        # pixmaps whose colorspace has 4 components are converted to RGB
        if pixmap.colorspace.n == 4:
            return fitz.Pixmap(fitz.csRGB, pixmap)
        return pixmap

    xref, smask = item[0], item[1]
    if smask == 0:
        # no soft mask: the image can be extracted directly
        return doc.extract_image(xref)

    # the alpha channel has to be rebuilt from the soft mask
    base = fitz.Pixmap(doc, xref)
    mask = fitz.Pixmap(doc, smask)

    # Sanity check: both pixmaps must cover the same rectangle, neither may
    # have alpha, and the mask must consist of 1 byte per pixel.
    usable = (
        base.irect == mask.irect
        and base.alpha == mask.alpha == 0
        and mask.n == 1
    )
    if not usable:
        print("Warning: unsupported /SMask %i for %i:" % (smask, xref))
        print(mask)
        del mask
        return rgb_if_four_components(base)  # image without the mask

    merged = fitz.Pixmap(base)  # copy of the base image
    merged.set_alpha(mask.samples)  # mask samples become the alpha values
    del base, mask  # drop the temporary pixmaps

    return rgb_if_four_components(merged)
59
60
def open_file(filename, password, show=False, pdf=True):
    """Open and authenticate a document.

    Args:
        filename: path of the document to open.
        password: password to try if the document needs one (may be None).
        show: if True, print which kind of authentication succeeded.
        pdf: if True, terminate unless the document is a PDF.
    Returns:
        The opened document.
    Raises:
        SystemExit: if a PDF is required but the file is not one, if a
            password is needed but not given, or if authentication fails.
    """
    doc = fitz.open(filename)
    if pdf is True and not doc.is_pdf:
        sys.exit("this command supports PDF files only")
    if not doc.needs_pass:
        return doc
    if not password:
        sys.exit("'%s' requires a password" % doc.name)

    rc = doc.authenticate(password)
    if not rc:
        sys.exit("authentication unsuccessful")
    if show is True:
        # The conditional must be parenthesized: "%" binds tighter than
        # "if/else", so without the parentheses a user login printed just
        # "user" instead of the full message.
        print("authenticated as %s" % ("owner" if rc > 2 else "user"))
    return doc
78
79
def print_dict(item):
    """Print a Python dictionary, one ``key: value`` line per entry.

    Keys are right-justified to the length of the longest key plus one, so
    the colons line up. Keys must be strings. An empty dictionary prints
    nothing.
    """
    if not item:  # max() below would raise ValueError on an empty dict
        return
    width = max(len(key) for key in item) + 1
    for key, value in item.items():
        print("%s: %s" % (key.rjust(width), value))
87
88
def print_xref(doc, xref):
    """Print an object given by XREF number.

    Simulate the PDF source in "pretty" format.
    For a stream also print its size, taken from the /Length entry of the
    object source. The size is shown as "unknown" if /Length is missing or
    is an indirect reference (e.g. "/Length 12 0 R").
    """
    print("%i 0 obj" % xref)
    xref_str = doc.xref_object(xref)
    print(xref_str)
    if doc.xref_is_stream(xref):
        tokens = xref_str.split()
        size = "unknown"
        try:
            idx = tokens.index("/Length") + 1  # ValueError if no /Length
            value = tokens[idx]  # IndexError if nothing follows /Length
        except (ValueError, IndexError):
            pass
        else:
            # The source is split on whitespace, so an indirect reference
            # "12 0 R" arrives as three tokens; the first one is an object
            # number, not a byte count.
            if tokens[idx + 2 : idx + 3] != ["R"]:
                size = value
        print("stream\n...%s bytes" % size)
        print("endstream")
    print("endobj")
110
111
def get_list(rlist, limit, what="page"):
    """Transform a page / xref specification into a list of integers.

    The specification is a comma-separated list of single numbers and
    ranges "a-b". "N" stands for the largest valid number (limit - 1) and
    blanks are ignored. A range with a > b yields descending numbers.

    Args
    ----
        rlist: (str) the specification
        limit: maximum number, i.e. number of pages, number of objects;
            valid numbers are 1 <= i < limit
        what: a string to be used in error messages
    Returns
    -------
        A list of integers representing the specification.
    Raises
    ------
        SystemExit: if an item is malformed or out of range.
    """
    spec = rlist.replace("N", str(limit - 1)).replace(" ", "")
    out_list = []
    for n, item in enumerate(spec.split(","), start=1):
        if item.isdecimal():  # a single integer
            i = int(item)
            if not 1 <= i < limit:
                sys.exit("bad %s specification at item %i" % (what, n))
            out_list.append(i)
            continue

        # this must be a range now, and all of the following must work:
        try:
            first, second = item.split("-")  # fails if not exactly 2 parts
            i1, i2 = int(first), int(second)  # fails on non-integers
        except ValueError:
            sys.exit("bad %s range specification at item %i" % (what, n))

        if not (1 <= i1 < limit and 1 <= i2 < limit):
            sys.exit("bad %s range specification at item %i" % (what, n))

        # ascending, descending or single-number range, both ends included
        step = 1 if i1 <= i2 else -1
        out_list.extend(range(i1, i2 + step, step))

    return out_list
157
158
def show(args):
    """Print general information about a PDF plus the requested details.

    Always prints one summary line (name, page and object counts, file
    size, format, encryption) and, if present, the number of root form
    fields and embedded files. Depending on ``args`` it then prints the
    catalog, the metadata, selected objects, selected page objects and the
    trailer.
    """
    doc = open_file(args.input, args.password, True)

    # file size in KB, switching to MB above 1000 KB
    size, flag = os.path.getsize(args.input) / 1024, "KB"
    if size > 1000:
        size, flag = size / 1024, "MB"
    size = round(size, 1)

    meta = doc.metadata
    summary = (
        args.input,
        doc.page_count,
        doc.xref_length() - 1,
        size,
        flag,
        meta["format"],
        meta["encryption"],
    )
    print("'%s', pages: %i, objects: %i, %g %s, %s, encryption: %s" % summary)

    field_count = doc.is_form_pdf
    if field_count > 0:
        sigflags = doc.get_sigflags()
        negation = "" if sigflags == 3 else "not "
        print(
            "document contains %i root form fields and is %ssigned"
            % (field_count, negation)
        )

    embedded_count = doc.embfile_count()
    if embedded_count > 0:
        print("document contains %i embedded files" % embedded_count)
    print()

    if args.catalog:
        print(mycenter("PDF catalog"))
        print_xref(doc, doc.pdf_catalog())
        print()

    if args.metadata:
        print(mycenter("PDF metadata"))
        print_dict(doc.metadata)
        print()

    if args.xrefs:
        print(mycenter("object information"))
        for xref in get_list(args.xrefs, doc.xref_length(), what="xref"):
            print_xref(doc, xref)
            print()

    if args.pages:
        print(mycenter("page information"))
        for pno in get_list(args.pages, doc.page_count + 1):
            page_xref = doc.page_xref(pno - 1)  # page numbers are 1-based
            print("Page %i:" % pno)
            print_xref(doc, page_xref)
            print()

    if args.trailer:
        print(mycenter("PDF trailer"))
        print(doc.pdf_trailer())
        print()

    doc.close()
220
221
def clean(args):
    """Optimize a PDF, or create a sub-PDF if pages are given.

    Without ``args.pages`` the input document is saved to ``args.output``
    with the requested options. Otherwise a new document containing only
    the selected pages (in the given order) is built and saved with the
    same options.
    """
    doc = open_file(args.input, args.password, pdf=True)
    # the save method expects the position of the method in this tuple
    encrypt = ("keep", "none", "rc4-40", "rc4-128", "aes-128", "aes-256").index(
        args.encryption
    )

    # one option set for both code paths, so they cannot drift apart
    save_options = dict(
        garbage=args.garbage,
        deflate=args.compress,
        pretty=args.pretty,
        clean=args.sanitize,
        ascii=args.ascii,
        linear=args.linear,
        encryption=encrypt,
        owner_pw=args.owner,
        user_pw=args.user,
        permissions=args.permission,
    )

    if not args.pages:  # simple cleaning
        doc.save(args.output, **save_options)
        doc.close()
        return

    # create sub document from page numbers
    outdoc = fitz.open()
    for pno in get_list(args.pages, doc.page_count + 1):
        n = pno - 1  # page numbers on the command line are 1-based
        outdoc.insert_pdf(doc, from_page=n, to_page=n)
    outdoc.save(args.output, **save_options)
    doc.close()
    outdoc.close()
267
268
def doc_join(args):
    """Join pages from several PDF documents.

    Each element of ``args.input`` has the form
    "filename[,password[,pages]]". The selected pages (all pages if none
    are specified) of every input are appended to a new PDF, which is
    saved to ``args.output``.
    """
    sources = args.input  # a list of input PDF specifications
    target = fitz.open()  # output PDF
    for spec in sources:
        parts = spec.split(",")
        password = None if len(parts) <= 1 else parts[1]
        src = open_file(parts[0], password, pdf=True)

        page_spec = ",".join(parts[2:])  # everything after the password
        if page_spec:
            page_numbers = get_list(page_spec, src.page_count + 1)
        else:  # no selection: take all pages
            page_numbers = range(1, src.page_count + 1)

        for pno in page_numbers:  # copy one source page at a time
            target.insert_pdf(src, from_page=pno - 1, to_page=pno - 1)
        src.close()

    target.save(args.output, garbage=4, deflate=True)
    target.close()
288
289
def embedded_copy(args):
    """Copy embedded files between PDFs.

    Copies the entries named in ``args.name`` (all entries if no names are
    given) from ``args.source`` into ``args.input``. The result is saved
    to ``args.output``, or incrementally to the input file if no different
    output is given.

    Raises:
        SystemExit: if an incremental save is needed but not possible, if
            a requested name is missing in the source, if there is nothing
            to copy, or if a name already exists in the receiving PDF.
    """
    doc = open_file(args.input, args.password, pdf=True)
    incremental = not args.output or args.output == args.input
    if incremental and not doc.can_save_incrementally():
        sys.exit("cannot save PDF incrementally")

    src = open_file(args.source, args.pwdsource)
    names = set(args.name) if args.name else set()
    src_names = set(src.embfile_names())
    if names:
        if not names <= src_names:
            sys.exit("not all names are contained in source")
    else:
        names = src_names
    if not names:
        sys.exit("nothing to copy")

    intersect = names & set(doc.embfile_names())  # any name already in target?
    if intersect:
        sys.exit("following names already exist in receiving PDF: %s" % str(intersect))

    for item in sorted(names):  # sorted: deterministic copy order
        info = src.embfile_info(item)
        buff = src.embfile_get(item)
        doc.embfile_add(
            item,
            buff,
            filename=info["filename"],
            ufilename=info["ufilename"],
            desc=info["desc"],
        )
        print("copied entry '%s' from '%s'" % (item, src.name))
    src.close()

    if incremental:
        # same method name as used by the 'embed-del' command
        doc.save_incr()
    else:
        doc.save(args.output, garbage=3)
    doc.close()
328
329
def embedded_del(args):
    """Delete an embedded file entry.

    The result is saved to ``args.output``, or incrementally to the input
    file if no different output is given. Terminates if the entry does
    not exist or if a needed incremental save is not possible.
    """
    doc = open_file(args.input, args.password, pdf=True)
    can_incr = doc.can_save_incrementally()
    in_place = not args.output or args.output == args.input
    if in_place and not can_incr:
        sys.exit("cannot save PDF incrementally")

    try:
        doc.embfile_del(args.name)
    except ValueError:
        sys.exit("no such embedded file '%s'" % args.name)

    if in_place:
        doc.save_incr()
    else:
        doc.save(args.output, garbage=1)
    doc.close()
347
348
def embedded_get(args):
    """Retrieve contents of an embedded file.

    The data is written to ``args.output`` or, if that is not given, to
    the filename stored in the entry.

    Raises:
        SystemExit: if there is no entry named ``args.name``.
    """
    doc = open_file(args.input, args.password, pdf=True)
    try:
        stream = doc.embfile_get(args.name)
        d = doc.embfile_info(args.name)
    except ValueError:
        sys.exit("no such embedded file '%s'" % args.name)

    # NOTE(review): without -output the name stored in the PDF is used as
    # the path unchanged - consider restricting it for untrusted files.
    filename = args.output if args.output else d["filename"]
    with open(filename, "wb") as output:  # closed even if the write fails
        output.write(stream)
    print("saved entry '%s' as '%s'" % (args.name, filename))
    doc.close()
363
364
def embedded_add(args):
    """Insert a new embedded file.

    The content of ``args.path`` is stored under the entry name
    ``args.name``. The path is used as the stored filename and, if no
    description is given, also as the description. The result is saved to
    ``args.output``, or incrementally to the input file if no different
    output is given.

    Raises:
        SystemExit: if a needed incremental save is not possible, if an
            entry with this name already exists, or if ``args.path`` is
            not a file.
    """
    doc = open_file(args.input, args.password, pdf=True)
    incremental = not args.output or args.output == args.input
    if incremental and not doc.can_save_incrementally():
        sys.exit("cannot save PDF incrementally")

    # Check for an existing entry without touching it. Probing with a
    # delete inside try / bare except removed the existing entry, and the
    # bare except also swallowed the SystemExit meant to report it.
    if args.name in doc.embfile_names():
        sys.exit("entry '%s' already exists" % args.name)

    if not os.path.isfile(args.path):
        sys.exit("no such file '%s'" % args.path)
    with open(args.path, "rb") as infile:
        stream = infile.read()

    filename = args.path
    desc = args.desc if args.desc else filename
    doc.embfile_add(
        args.name, stream, filename=filename, ufilename=filename, desc=desc
    )

    if incremental:
        # same method name as used by the 'embed-del' command
        doc.save_incr()
    else:
        doc.save(args.output, garbage=3)
    doc.close()
396
397
def embedded_upd(args):
    """Update contents or metadata of an embedded file.

    Only the items given on the command line are passed as new values;
    everything else is passed as None. If no unicode filename is given,
    a new filename is also used as the unicode filename. The result is
    saved to ``args.output``, or incrementally to the input file if no
    different output is given.

    Raises:
        SystemExit: if a needed incremental save is not possible, if the
            entry does not exist, or if ``args.path`` is given but is not
            a file.
    """
    doc = open_file(args.input, args.password, pdf=True)
    incremental = not args.output or args.output == args.input
    if incremental and not doc.can_save_incrementally():
        sys.exit("cannot save PDF incrementally")

    if args.name not in doc.embfile_names():
        sys.exit("no such embedded file '%s'" % args.name)

    stream = None
    if args.path is not None:
        # report a wrong path instead of silently skipping the content
        # update (same message as the 'embed-add' command)
        if not os.path.isfile(args.path):
            sys.exit("no such file '%s'" % args.path)
        with open(args.path, "rb") as infile:
            stream = infile.read()

    filename = args.filename if args.filename else None
    if args.ufilename:
        ufilename = args.ufilename
    else:
        ufilename = filename  # follow a changed filename, else stay None
    desc = args.desc if args.desc else None

    doc.embfile_upd(
        args.name, stream, filename=filename, ufilename=ufilename, desc=desc
    )

    if incremental:
        # same method name as used by the 'embed-del' command
        doc.save_incr()
    else:
        doc.save(args.output, garbage=3)
    doc.close()
445
446
def embedded_list(args):
    """List embedded files.

    With ``args.name`` only that entry is reported (always in detail).
    Otherwise all entry names are printed, each followed by its detail
    information if ``args.detail`` is set.

    Raises:
        SystemExit: if ``args.name`` is given but no such entry exists.
    """
    doc = open_file(args.input, args.password, pdf=True)
    names = doc.embfile_names()

    if args.name is not None:
        if args.name not in names:
            sys.exit("no such embedded file '%s'" % args.name)
        print()
        print(
            "printing 1 of %i embedded file%s:"
            % (len(names), "s" if len(names) > 1 else "")
        )
        print()
        print_dict(doc.embfile_info(args.name))
        print()
        doc.close()
        return

    if not names:
        print("'%s' contains no embedded files" % doc.name)
        doc.close()
        return

    if len(names) > 1:
        msg = "'%s' contains the following %i embedded files" % (doc.name, len(names))
    else:
        msg = "'%s' contains the following embedded file" % doc.name
    print(msg)
    print()
    for name in names:
        if not args.detail:
            print(name)
            continue
        print_dict(doc.embfile_info(name))  # one lookup per entry suffices
        print()
    doc.close()
481
482
def extract_objects(args):
    """Extract images and / or fonts from a PDF.

    Every font / image xref found on the selected pages is handled once.
    Fonts are written as "<fontname>-<xref>.<ext>" (blanks in the name
    replaced by "-"), images as "img-<xref>.<ext>". Files go to
    ``args.output`` or, if that is not given, to the current directory.

    Raises:
        SystemExit: if neither fonts nor images are requested, or if the
            given output directory does not exist.
    """
    if not args.fonts and not args.images:
        sys.exit("neither fonts nor images requested")
    doc = open_file(args.input, args.password, pdf=True)

    if args.pages:
        pages = get_list(args.pages, doc.page_count + 1)
    else:
        pages = range(1, doc.page_count + 1)

    if not args.output:
        out_dir = os.path.abspath(os.curdir)
    else:
        out_dir = args.output
        if not os.path.isdir(out_dir):
            sys.exit("output directory %s does not exist" % out_dir)

    font_xrefs = set()  # fonts already looked at
    image_xrefs = set()  # images already looked at
    saved_fonts = 0  # files actually written - fonts may be skipped
    saved_images = 0

    for pno in pages:
        if args.fonts:
            for item in doc.get_page_fonts(pno - 1):
                xref = item[0]
                if xref in font_xrefs:
                    continue
                font_xrefs.add(xref)
                fontname, ext, _, buffer = doc.extract_font(xref)
                if ext == "n/a" or not buffer:  # nothing to write
                    continue
                outname = os.path.join(
                    out_dir, f"{fontname.replace(' ', '-')}-{xref}.{ext}"
                )
                with open(outname, "wb") as outfile:
                    outfile.write(buffer)
                saved_fonts += 1
                buffer = None  # release the font data
        if args.images:
            for item in doc.get_page_images(pno - 1):
                xref = item[0]
                if xref in image_xrefs:
                    continue
                image_xrefs.add(xref)
                pix = recoverpix(doc, item)
                if isinstance(pix, dict):  # image data in its stored format
                    outname = os.path.join(
                        out_dir, "img-%i.%s" % (xref, pix["ext"])
                    )
                    with open(outname, "wb") as outfile:
                        outfile.write(pix["image"])
                else:  # a pixmap: save as PNG
                    outname = os.path.join(out_dir, "img-%i.png" % xref)
                    pix2 = (
                        pix
                        if pix.colorspace.n < 4
                        else fitz.Pixmap(fitz.csRGB, pix)
                    )
                    pix2.save(outname)
                saved_images += 1

    if args.fonts:
        print("saved %i fonts to '%s'" % (saved_fonts, out_dir))
    if args.images:
        print("saved %i images to '%s'" % (saved_images, out_dir))
    doc.close()
549
550
def page_simple(page, textout, GRID, fontsize, noformfeed, skip_empty, flags):
    """Write the plain text of a page, followed by an end-of-page mark.

    The mark is a line feed if ``noformfeed`` is set, else a form feed.
    A page without text produces only the mark, or nothing at all if
    ``skip_empty`` is set. ``GRID`` and ``fontsize`` are not used here.
    """
    page_end = bytes([12]) if not noformfeed else b"\n"
    content = page.get_text("text", flags=flags)
    if content:
        textout.write(content.encode("utf8", errors="surrogatepass"))
        textout.write(page_end)
    elif not skip_empty:
        textout.write(page_end)  # empty page: end-of-page mark only
561
562
def page_blocksort(page, textout, GRID, fontsize, noformfeed, skip_empty, flags):
    """Write the text blocks of a page, sorted by items 3 and 0 of each block.

    The text of a block is its item 4. The output ends with an
    end-of-page mark: a line feed if ``noformfeed`` is set, else a form
    feed. A page without blocks produces only the mark, or nothing at all
    if ``skip_empty`` is set. ``GRID`` and ``fontsize`` are not used here.
    """

    def sort_key(block):
        # presumably bottom and left coordinate of the block - verify
        # against the layout of "blocks" items
        return (block[3], block[0])

    page_end = bytes([12]) if not noformfeed else b"\n"
    blocks = page.get_text("blocks", flags=flags)
    if blocks == []:
        if skip_empty:
            return
        textout.write(page_end)  # empty page: end-of-page mark only
        return

    blocks.sort(key=sort_key)
    for block in blocks:
        textout.write(block[4].encode("utf8", errors="surrogatepass"))
    textout.write(page_end)
575
576
def page_layout(page, textout, GRID, fontsize, noformfeed, skip_empty, flags):
    """Write the text of a page, positioning characters with spaces and blank lines.

    The page is extracted character by character ("rawdict"). Characters
    are grouped into output lines by their rounded y-origin and placed
    horizontally according to their x-origin.

    Args:
        page: the page to extract.
        textout: binary file object receiving the UTF-8 encoded output.
        GRID: y-origins closer together than this are treated as one line.
        fontsize: spans with a font size less than or equal to this are
            ignored.
        noformfeed: end the page with a line feed instead of a form feed.
        skip_empty: write nothing at all for a page without characters.
        flags: text extraction flags passed on to ``page.get_text``.
    """
    eop = b"\n" if noformfeed else bytes([12])

    # --------------------------------------------------------------------
    def find_line_index(values: List[int], value: int) -> int:
        """Find the right row coordinate.

        Args:
            values: (list) y-coordinates of rows (ascending).
            value: (int) lookup for this value (y-origin of char).
        Returns:
            y-coordinate of appropriate line for value: the largest entry
            of values that is not greater than value.
        Raises:
            RuntimeError: if value is smaller than every entry of values.
        """
        i = bisect.bisect_right(values, value)
        if i:
            return values[i - 1]
        raise RuntimeError("Line for %g not found in %s" % (value, values))

    # --------------------------------------------------------------------
    def curate_rows(rows: Set[int], GRID) -> List:
        """Return the sorted row coordinates, dropping near-duplicates.

        A coordinate is kept only if it is at least GRID larger than the
        previously kept one. rows must not be empty.
        """
        rows = list(rows)
        rows.sort()  # sort ascending
        nrows = [rows[0]]
        for h in rows[1:]:
            if h >= nrows[-1] + GRID:  # only keep significant differences
                nrows.append(h)
        return nrows  # curated list of line bottom coordinates

    def process_blocks(blocks: List[Dict], page: fitz.Page):
        """Collect the characters of all horizontal lines of the page.

        Returns:
            chars: list of tuples (char, x-origin, rounded y-origin, width)
            rows: set of the rounded y-origins
            left: smallest x-origin of a non-space character
            right: largest right bbox coordinate of a character
            rowheight: smallest bbox height of a considered line
        """
        rows = set()
        page_width = page.rect.width
        page_height = page.rect.height
        rowheight = page_height  # start values, reduced / enlarged below
        left = page_width
        right = 0
        chars = []
        for block in blocks:
            for line in block["lines"]:
                if line["dir"] != (1, 0):  # ignore non-horizontal text
                    continue
                x0, y0, x1, y1 = line["bbox"]
                if y1 < 0 or y0 > page.rect.height:  # ignore if outside CropBox
                    continue
                # upd row height
                height = y1 - y0

                if rowheight > height:
                    rowheight = height
                for span in line["spans"]:
                    if span["size"] <= fontsize:  # ignore small fonts
                        continue
                    for c in span["chars"]:
                        x0, _, x1, _ = c["bbox"]
                        cwidth = x1 - x0
                        ox, oy = c["origin"]
                        oy = int(round(oy))
                        rows.add(oy)
                        ch = c["c"]
                        if left > ox and ch != " ":
                            left = ox  # update left coordinate
                        if right < x1:
                            right = x1  # update right coordinate
                        # handle ligatures:
                        # a zero-width char on the same row as its
                        # predecessor is merged into the predecessor
                        if cwidth == 0 and chars != []:  # potential ligature
                            old_ch, old_ox, old_oy, old_cwidth = chars[-1]
                            if old_oy == oy:  # ligature
                                if old_ch != chr(0xFB00):  # previous "ff" char lig?
                                    lig = joinligature(old_ch + ch)  # no
                                # convert to one of the 3-char ligatures:
                                elif ch == "i":
                                    lig = chr(0xFB03)  # "ffi"
                                elif ch == "l":
                                    lig = chr(0xFB04)  # "ffl"
                                else:  # something wrong, leave old char in place
                                    lig = old_ch
                                chars[-1] = (lig, old_ox, old_oy, old_cwidth)
                                continue
                        chars.append((ch, ox, oy, cwidth))  # all chars on page
        return chars, rows, left, right, rowheight

    def joinligature(lig: str) -> str:
        """Return ligature character for a given pair / triple of characters.

        Args:
            lig: (str) 2/3 characters, e.g. "ff"
        Returns:
            Ligature, e.g. "ff" -> chr(0xFB00). Any other string is
            returned unchanged.
        """

        if lig == "ff":
            return chr(0xFB00)
        elif lig == "fi":
            return chr(0xFB01)
        elif lig == "fl":
            return chr(0xFB02)
        elif lig == "ffi":
            return chr(0xFB03)
        elif lig == "ffl":
            return chr(0xFB04)
        elif lig == "ft":
            return chr(0xFB05)
        elif lig == "st":
            return chr(0xFB06)
        return lig

    # --------------------------------------------------------------------
    def make_textline(left, slot, minslot, lchars):
        """Produce the text of one output line.

        Args:
            left: (float) left most coordinate used on page
            slot: (float) avg width of one character in any font in use.
            minslot: (float) min width for the characters in this line.
            lchars: (list[tuple]) characters of this line.
        Returns:
            text: (str) text string for this line
        Raises:
            RuntimeError: if minslot is not greater than fitz.EPSILON.
        """
        text = ""  # we output this
        old_char = ""
        old_x1 = 0  # end coordinate of last char
        old_ox = 0  # x-origin of last char
        if minslot <= fitz.EPSILON:
            raise RuntimeError("program error: minslot too small = %g" % minslot)

        for c in lchars:  # loop over characters
            char, ox, _, cwidth = c
            ox = ox - left  # its (relative) start coordinate
            x1 = ox + cwidth  # ending coordinate

            # eliminate overprint effect
            if old_char == char and ox - old_ox <= cwidth * 0.2:
                continue

            # omit spaces overlapping previous char
            # NOTE(review): divides by cwidth - a space of width 0 that
            # reaches this point would raise ZeroDivisionError
            if char == " " and (old_x1 - ox) / cwidth > 0.8:
                continue

            old_char = char
            # close enough to previous?
            if ox < old_x1 + minslot:  # assume char adjacent to previous
                text += char  # append to output
                old_x1 = x1  # new end coord
                old_ox = ox  # new origin.x
                continue

            # else next char starts after some gap:
            # fill in right number of spaces, so char is positioned
            # in the right slot of the line
            if char == " ":  # rest relevant for non-space only
                continue
            delta = int(ox / slot) - len(text)
            if ox > old_x1 and delta > 1:
                text += " " * delta
            # now append char
            text += char
            old_x1 = x1  # new end coordinate
            old_ox = ox  # new origin
        return text.rstrip()

    # extract page text by single characters ("rawdict")
    blocks = page.get_text("rawdict", flags=flags)["blocks"]
    chars, rows, left, right, rowheight = process_blocks(blocks, page)

    if chars == []:
        if not skip_empty:
            textout.write(eop)  # write formfeed
        return
    # compute list of line coordinates - ignoring small (GRID) differences
    rows = curate_rows(rows, GRID)

    # sort all chars by x-coordinates, so every line will receive char info,
    # sorted from left to right.
    chars.sort(key=lambda c: c[1])

    # populate the lines with their char info
    lines = {}  # key: y1-coordinate, value: char list
    for c in chars:
        _, _, oy, _ = c
        y = find_line_index(rows, oy)  # y-coord of the right line
        lchars = lines.get(y, [])  # read line chars so far
        lchars.append(c)  # append this char
        lines[y] = lchars  # write back to line

    # ensure line coordinates are ascending
    keys = list(lines.keys())
    keys.sort()

    # -------------------------------------------------------------------------
    # Compute "char resolution" for the page: the char width corresponding to
    # 1 text char position on output - call it 'slot'.
    # For each line, compute median of its char widths. The minimum across all
    # lines is 'slot'.
    # The minimum char width of each line is used to determine if spaces must
    # be inserted in between two characters.
    # -------------------------------------------------------------------------
    slot = right - left
    minslots = {}
    for k in keys:
        lchars = lines[k]
        ccount = len(lchars)
        if ccount < 2:  # single char line: no statistics, use fixed value
            minslots[k] = 1
            continue
        widths = [c[3] for c in lchars]
        widths.sort()
        this_slot = statistics.median(widths)  # take median value
        if this_slot < slot:
            slot = this_slot
        minslots[k] = widths[0]

    # compute line advance in text output
    # NOTE(review): rowheight occurs in numerator and denominator, so the
    # result is (rows[-1] - rows[0]) / len(rows) * 1.2; a smallest line
    # height of 0 would raise ZeroDivisionError here
    rowheight = rowheight * (rows[-1] - rows[0]) / (rowheight * len(rows)) * 1.2
    rowpos = rows[0]  # first line positioned here
    textout.write(b"\n")
    for k in keys:  # walk through the lines
        while rowpos < k:  # honor distance between lines
            textout.write(b"\n")
            rowpos += rowheight
        text = make_textline(left, slot, minslots[k], lines[k])
        textout.write((text + "\n").encode("utf8", errors="surrogatepass"))
        rowpos = k + rowheight

    textout.write(eop)  # write formfeed
800
801
def gettext(args):
    """Extract the text of the selected pages of a document to a file.

    The output goes to ``args.output`` or, if that is None, to the
    document name with its extension replaced by ".txt". ``args.mode``
    selects the extraction function ("simple", "blocks" or "layout").
    """
    doc = open_file(args.input, args.password, pdf=False)
    pagel = get_list(args.pages, doc.page_count + 1)
    output = args.output
    if output is None:
        filename, _ = os.path.splitext(doc.name)
        output = filename + ".txt"

    # start with both "preserve" flags and toggle as requested
    flags = TEXT_PRESERVE_LIGATURES | TEXT_PRESERVE_WHITESPACE
    if args.convert_white:
        flags ^= TEXT_PRESERVE_WHITESPACE
    if args.noligatures:
        flags ^= TEXT_PRESERVE_LIGATURES
    if args.extra_spaces:
        flags ^= TEXT_INHIBIT_SPACES

    func = {
        "simple": page_simple,
        "blocks": page_blocksort,
        "layout": page_layout,
    }
    extract = func[args.mode]

    # the output file is closed even if an extraction step fails
    with open(output, "wb") as textout:
        for pno in pagel:
            extract(
                doc[pno - 1],  # page numbers are 1-based
                textout,
                args.grid,
                args.fontsize,
                args.noformfeed,
                args.skip_empty,
                flags=flags,
            )
    doc.close()
835
836
837 def main():
838 """Define command configurations."""
839 parser = argparse.ArgumentParser(
840 prog="fitz",
841 description=mycenter("Basic PyMuPDF Functions"),
842 )
843 subps = parser.add_subparsers(
844 title="Subcommands", help="Enter 'command -h' for subcommand specific help"
845 )
846
847 # -------------------------------------------------------------------------
848 # 'show' command
849 # -------------------------------------------------------------------------
850 ps_show = subps.add_parser("show", description=mycenter("display PDF information"))
851 ps_show.add_argument("input", type=str, help="PDF filename")
852 ps_show.add_argument("-password", help="password")
853 ps_show.add_argument("-catalog", action="store_true", help="show PDF catalog")
854 ps_show.add_argument("-trailer", action="store_true", help="show PDF trailer")
855 ps_show.add_argument("-metadata", action="store_true", help="show PDF metadata")
856 ps_show.add_argument(
857 "-xrefs", type=str, help="show selected objects, format: 1,5-7,N"
858 )
859 ps_show.add_argument(
860 "-pages", type=str, help="show selected pages, format: 1,5-7,50-N"
861 )
862 ps_show.set_defaults(func=show)
863
864 # -------------------------------------------------------------------------
865 # 'clean' command
866 # -------------------------------------------------------------------------
867 ps_clean = subps.add_parser(
868 "clean", description=mycenter("optimize PDF, or create sub-PDF if pages given")
869 )
870 ps_clean.add_argument("input", type=str, help="PDF filename")
871 ps_clean.add_argument("output", type=str, help="output PDF filename")
872 ps_clean.add_argument("-password", help="password")
873
874 ps_clean.add_argument(
875 "-encryption",
876 help="encryption method",
877 choices=("keep", "none", "rc4-40", "rc4-128", "aes-128", "aes-256"),
878 default="none",
879 )
880
881 ps_clean.add_argument("-owner", type=str, help="owner password")
882 ps_clean.add_argument("-user", type=str, help="user password")
883
884 ps_clean.add_argument(
885 "-garbage",
886 type=int,
887 help="garbage collection level",
888 choices=range(5),
889 default=0,
890 )
891
892 ps_clean.add_argument(
893 "-compress",
894 action="store_true",
895 default=False,
896 help="compress (deflate) output",
897 )
898
899 ps_clean.add_argument(
900 "-ascii", action="store_true", default=False, help="ASCII encode binary data"
901 )
902
903 ps_clean.add_argument(
904 "-linear",
905 action="store_true",
906 default=False,
907 help="format for fast web display",
908 )
909
910 ps_clean.add_argument(
911 "-permission", type=int, default=-1, help="integer with permission levels"
912 )
913
914 ps_clean.add_argument(
915 "-sanitize",
916 action="store_true",
917 default=False,
918 help="sanitize / clean contents",
919 )
920 ps_clean.add_argument(
921 "-pretty", action="store_true", default=False, help="prettify PDF structure"
922 )
923 ps_clean.add_argument(
924 "-pages", help="output selected pages pages, format: 1,5-7,50-N"
925 )
926 ps_clean.set_defaults(func=clean)
927
928 # -------------------------------------------------------------------------
929 # 'join' command
930 # -------------------------------------------------------------------------
931 ps_join = subps.add_parser(
932 "join",
933 description=mycenter("join PDF documents"),
934 epilog="specify each input as 'filename[,password[,pages]]'",
935 )
936 ps_join.add_argument("input", nargs="*", help="input filenames")
937 ps_join.add_argument("-output", required=True, help="output filename")
938 ps_join.set_defaults(func=doc_join)
939
940 # -------------------------------------------------------------------------
941 # 'extract' command
942 # -------------------------------------------------------------------------
943 ps_extract = subps.add_parser(
944 "extract", description=mycenter("extract images and fonts to disk")
945 )
946 ps_extract.add_argument("input", type=str, help="PDF filename")
947 ps_extract.add_argument("-images", action="store_true", help="extract images")
948 ps_extract.add_argument("-fonts", action="store_true", help="extract fonts")
949 ps_extract.add_argument(
950 "-output", help="folder to receive output, defaults to current"
951 )
952 ps_extract.add_argument("-password", help="password")
953 ps_extract.add_argument(
954 "-pages", type=str, help="consider these pages only, format: 1,5-7,50-N"
955 )
956 ps_extract.set_defaults(func=extract_objects)
957
958 # -------------------------------------------------------------------------
959 # 'embed-info'
960 # -------------------------------------------------------------------------
961 ps_show = subps.add_parser(
962 "embed-info", description=mycenter("list embedded files")
963 )
964 ps_show.add_argument("input", help="PDF filename")
965 ps_show.add_argument("-name", help="if given, report only this one")
966 ps_show.add_argument("-detail", action="store_true", help="detail information")
967 ps_show.add_argument("-password", help="password")
968 ps_show.set_defaults(func=embedded_list)
969
970 # -------------------------------------------------------------------------
971 # 'embed-add' command
972 # -------------------------------------------------------------------------
973 ps_embed_add = subps.add_parser(
974 "embed-add", description=mycenter("add embedded file")
975 )
976 ps_embed_add.add_argument("input", help="PDF filename")
977 ps_embed_add.add_argument("-password", help="password")
978 ps_embed_add.add_argument(
979 "-output", help="output PDF filename, incremental save if none"
980 )
981 ps_embed_add.add_argument("-name", required=True, help="name of new entry")
982 ps_embed_add.add_argument("-path", required=True, help="path to data for new entry")
983 ps_embed_add.add_argument("-desc", help="description of new entry")
984 ps_embed_add.set_defaults(func=embedded_add)
985
986 # -------------------------------------------------------------------------
987 # 'embed-del' command
988 # -------------------------------------------------------------------------
989 ps_embed_del = subps.add_parser(
990 "embed-del", description=mycenter("delete embedded file")
991 )
992 ps_embed_del.add_argument("input", help="PDF filename")
993 ps_embed_del.add_argument("-password", help="password")
994 ps_embed_del.add_argument(
995 "-output", help="output PDF filename, incremental save if none"
996 )
997 ps_embed_del.add_argument("-name", required=True, help="name of entry to delete")
998 ps_embed_del.set_defaults(func=embedded_del)
999
1000 # -------------------------------------------------------------------------
1001 # 'embed-upd' command
1002 # -------------------------------------------------------------------------
1003 ps_embed_upd = subps.add_parser(
1004 "embed-upd",
1005 description=mycenter("update embedded file"),
1006 epilog="except '-name' all parameters are optional",
1007 )
1008 ps_embed_upd.add_argument("input", help="PDF filename")
1009 ps_embed_upd.add_argument("-name", required=True, help="name of entry")
1010 ps_embed_upd.add_argument("-password", help="password")
1011 ps_embed_upd.add_argument(
1012 "-output", help="Output PDF filename, incremental save if none"
1013 )
1014 ps_embed_upd.add_argument("-path", help="path to new data for entry")
1015 ps_embed_upd.add_argument("-filename", help="new filename to store in entry")
1016 ps_embed_upd.add_argument(
1017 "-ufilename", help="new unicode filename to store in entry"
1018 )
1019 ps_embed_upd.add_argument("-desc", help="new description to store in entry")
1020 ps_embed_upd.set_defaults(func=embedded_upd)
1021
1022 # -------------------------------------------------------------------------
1023 # 'embed-extract' command
1024 # -------------------------------------------------------------------------
1025 ps_embed_extract = subps.add_parser(
1026 "embed-extract", description=mycenter("extract embedded file to disk")
1027 )
1028 ps_embed_extract.add_argument("input", type=str, help="PDF filename")
1029 ps_embed_extract.add_argument("-name", required=True, help="name of entry")
1030 ps_embed_extract.add_argument("-password", help="password")
1031 ps_embed_extract.add_argument(
1032 "-output", help="output filename, default is stored name"
1033 )
1034 ps_embed_extract.set_defaults(func=embedded_get)
1035
1036 # -------------------------------------------------------------------------
1037 # 'embed-copy' command
1038 # -------------------------------------------------------------------------
1039 ps_embed_copy = subps.add_parser(
1040 "embed-copy", description=mycenter("copy embedded files between PDFs")
1041 )
1042 ps_embed_copy.add_argument("input", type=str, help="PDF to receive embedded files")
1043 ps_embed_copy.add_argument("-password", help="password of input")
1044 ps_embed_copy.add_argument(
1045 "-output", help="output PDF, incremental save to 'input' if omitted"
1046 )
1047 ps_embed_copy.add_argument(
1048 "-source", required=True, help="copy embedded files from here"
1049 )
1050 ps_embed_copy.add_argument("-pwdsource", help="password of 'source' PDF")
1051 ps_embed_copy.add_argument(
1052 "-name", nargs="*", help="restrict copy to these entries"
1053 )
1054 ps_embed_copy.set_defaults(func=embedded_copy)
1055
1056 # -------------------------------------------------------------------------
1057 # 'gettext' command
1058 # -------------------------------------------------------------------------
1059 ps_gettext = subps.add_parser(
1060 "gettext", description=mycenter("extract text in various formatting modes")
1061 )
1062 ps_gettext.add_argument("input", type=str, help="input document filename")
1063 ps_gettext.add_argument("-password", help="password for input document")
1064 ps_gettext.add_argument(
1065 "-mode",
1066 type=str,
1067 help="mode: simple, block sort, or layout (default)",
1068 choices=("simple", "blocks", "layout"),
1069 default="layout",
1070 )
1071 ps_gettext.add_argument(
1072 "-pages",
1073 type=str,
1074 help="select pages, format: 1,5-7,50-N",
1075 default="1-N",
1076 )
1077 ps_gettext.add_argument(
1078 "-noligatures",
1079 action="store_true",
1080 help="expand ligature characters (default False)",
1081 default=False,
1082 )
1083 ps_gettext.add_argument(
1084 "-convert-white",
1085 action="store_true",
1086 help="convert whitespace characters to white (default False)",
1087 default=False,
1088 )
1089 ps_gettext.add_argument(
1090 "-extra-spaces",
1091 action="store_true",
1092 help="fill gaps with spaces (default False)",
1093 default=False,
1094 )
1095 ps_gettext.add_argument(
1096 "-noformfeed",
1097 action="store_true",
1098 help="write linefeeds, no formfeeds (default False)",
1099 default=False,
1100 )
1101 ps_gettext.add_argument(
1102 "-skip-empty",
1103 action="store_true",
1104 help="suppress pages with no text (default False)",
1105 default=False,
1106 )
1107 ps_gettext.add_argument(
1108 "-output",
1109 help="store text in this file (default inputfilename.txt)",
1110 )
1111 ps_gettext.add_argument(
1112 "-grid",
1113 type=float,
1114 help="merge lines if closer than this (default 2)",
1115 default=2,
1116 )
1117 ps_gettext.add_argument(
1118 "-fontsize",
1119 type=float,
1120 help="only include text with a larger fontsize (default 3)",
1121 default=3,
1122 )
1123 ps_gettext.set_defaults(func=gettext)
1124
1125 # -------------------------------------------------------------------------
1126 # start program
1127 # -------------------------------------------------------------------------
1128 args = parser.parse_args() # create parameter arguments class
1129 if not hasattr(args, "func"): # no function selected
1130 parser.print_help() # so print top level help
1131 else:
1132 args.func(args) # execute requested command
1133
1134
# Script entry point: start the command-line interface only when this file is
# executed directly, not when it is imported as a module.
if __name__ == "__main__":
    main()