Mercurial > hgrepos > Python2 > PyMuPDF

diff src_classic/main.py @ 3:2c135c81b16c
MERGE: upstream PyMuPDF 1.26.4 with MuPDF 1.26.7
author: Franz Glasner <fzglas.hg@dom66.de>
date: Mon, 15 Sep 2025 11:44:09 +0200
parents: 1d09e1dec1d9
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src_classic/__main__.py	Mon Sep 15 11:44:09 2025 +0200
@@ -0,0 +1,1136 @@
+# -----------------------------------------------------------------------------
+# Copyright 2020-2022, Harald Lieder, mailto:harald.lieder@outlook.com
+# License: GNU AFFERO GPL 3.0, https://www.gnu.org/licenses/agpl-3.0.html
+# Part of "PyMuPDF", Python bindings for "MuPDF" (http://mupdf.com), a
+# lightweight PDF, XPS, and E-book viewer, renderer and toolkit which is
+# maintained and developed by Artifex Software, Inc. https://artifex.com.
+# -----------------------------------------------------------------------------
+import argparse
+import bisect
+import os
+import sys
+import statistics
+from typing import Dict, List, Set, Tuple
+
+import fitz
+from fitz.fitz import (
+    TEXT_INHIBIT_SPACES,
+    TEXT_PRESERVE_LIGATURES,
+    TEXT_PRESERVE_WHITESPACE,
+)
+
+mycenter = lambda x: (" %s " % x).center(75, "-")
+
+
+def recoverpix(doc, item):
+    """Return image for a given XREF."""
+    x = item[0]  # xref of PDF image
+    s = item[1]  # xref of its /SMask
+    if s == 0:  # no smask: use direct image output
+        return doc.extract_image(x)
+
+    def getimage(pix):
+        if pix.colorspace.n != 4:
+            return pix
+        tpix = fitz.Pixmap(fitz.csRGB, pix)
+        return tpix
+
+    # we need to reconstruct the alpha channel with the smask
+    pix1 = fitz.Pixmap(doc, x)
+    pix2 = fitz.Pixmap(doc, s)  # create pixmap of the /SMask entry
+
+    """Sanity check:
+    - both pixmaps must have the same rectangle
+    - both pixmaps must have alpha=0
+    - pix2 must consist of 1 byte per pixel
+    """
+    if not (pix1.irect == pix2.irect and pix1.alpha == pix2.alpha == 0 and pix2.n == 1):
+        print("Warning: unsupported /SMask %i for %i:" % (s, x))
+        print(pix2)
+        pix2 = None
+        return getimage(pix1)  # return the pixmap as is
+
+    pix = fitz.Pixmap(pix1)  # copy of pix1, with an alpha channel added
+    pix.set_alpha(pix2.samples)  # treat pix2.samples as the alpha values
+    pix1 = pix2 = None  # free temp pixmaps
+
+    # we may need to adjust something for CMYK pixmaps here:
+    return getimage(pix)
+
+
+def open_file(filename, password, show=False, pdf=True):
+    """Open and authenticate a document."""
+    doc = fitz.open(filename)
+    if not doc.is_pdf and pdf is True:
+        sys.exit("this command supports PDF files only")
+    rc = -1
+    if not doc.needs_pass:
+        return doc
+    if password:
+        rc = doc.authenticate(password)
+        if not rc:
+            sys.exit("authentication unsuccessful")
+        if show is True:
+            print("authenticated as %s" % "owner" if rc > 2 else "user")
+    else:
+        sys.exit("'%s' requires a password" % doc.name)
+    return doc
+
+
+def print_dict(item):
+    """Print a Python dictionary."""
+    l = max([len(k) for k in item.keys()]) + 1
+    for k, v in item.items():
+        msg = "%s: %s" % (k.rjust(l), v)
+        print(msg)
+    return
+
+
+def print_xref(doc, xref):
+    """Print an object given by XREF number.
+
+    Simulate the PDF source in "pretty" format.
+    For a stream also print its size.
+    """
+    print("%i 0 obj" % xref)
+    xref_str = doc.xref_object(xref)
+    print(xref_str)
+    if doc.xref_is_stream(xref):
+        temp = xref_str.split()
+        try:
+            idx = temp.index("/Length") + 1
+            size = temp[idx]
+            if size.endswith("0 R"):
+                size = "unknown"
+        except:
+            size = "unknown"
+        print("stream\n...%s bytes" % size)
+        print("endstream")
+    print("endobj")
+
+
+def get_list(rlist, limit, what="page"):
+    """Transform a page / xref specification into a list of integers.
+
+    Args
+    ----
+        rlist: (str) the specification
+        limit: maximum number, i.e. number of pages, number of objects
+        what: a string to be used in error messages
+    Returns
+    -------
+        A list of integers representing the specification.
+    """
+    N = str(limit - 1)
+    rlist = rlist.replace("N", N).replace(" ", "")
+    rlist_arr = rlist.split(",")
+    out_list = []
+    for seq, item in enumerate(rlist_arr):
+        n = seq + 1
+        if item.isdecimal():  # a single integer
+            i = int(item)
+            if 1 <= i < limit:
+                out_list.append(int(item))
+            else:
+                sys.exit("bad %s specification at item %i" % (what, n))
+            continue
+        try:  # this must be a range now, and all of the following must work:
+            i1, i2 = item.split("-")  # will fail if not 2 items produced
+            i1 = int(i1)  # will fail on non-integers
+            i2 = int(i2)
+        except:
+            sys.exit("bad %s range specification at item %i" % (what, n))
+
+        if not (1 <= i1 < limit and 1 <= i2 < limit):
+            sys.exit("bad %s range specification at item %i" % (what, n))
+
+        if i1 == i2:  # just in case: a range of equal numbers
+            out_list.append(i1)
+            continue
+
+        if i1 < i2:  # first less than second
+            out_list += list(range(i1, i2 + 1))
+        else:  # first larger than second
+            out_list += list(range(i1, i2 - 1, -1))
+
+    return out_list
+
+
+def show(args):
+    doc = open_file(args.input, args.password, True)
+    size = os.path.getsize(args.input) / 1024
+    flag = "KB"
+    if size > 1000:
+        size /= 1024
+        flag = "MB"
+    size = round(size, 1)
+    meta = doc.metadata
+    print(
+        "'%s', pages: %i, objects: %i, %g %s, %s, encryption: %s"
+        % (
+            args.input,
+            doc.page_count,
+            doc.xref_length() - 1,
+            size,
+            flag,
+            meta["format"],
+            meta["encryption"],
+        )
+    )
+    n = doc.is_form_pdf
+    if n > 0:
+        s = doc.get_sigflags()
+        print(
+            "document contains %i root form fields and is %ssigned"
+            % (n, "not " if s != 3 else "")
+        )
+    n = doc.embfile_count()
+    if n > 0:
+        print("document contains %i embedded files" % n)
+    print()
+    if args.catalog:
+        print(mycenter("PDF catalog"))
+        xref = doc.pdf_catalog()
+        print_xref(doc, xref)
+        print()
+    if args.metadata:
+        print(mycenter("PDF metadata"))
+        print_dict(doc.metadata)
+        print()
+    if args.xrefs:
+        print(mycenter("object information"))
+        xrefl = get_list(args.xrefs, doc.xref_length(), what="xref")
+        for xref in xrefl:
+            print_xref(doc, xref)
+            print()
+    if args.pages:
+        print(mycenter("page information"))
+        pagel = get_list(args.pages, doc.page_count + 1)
+        for pno in pagel:
+            n = pno - 1
+            xref = doc.page_xref(n)
+            print("Page %i:" % pno)
+            print_xref(doc, xref)
+            print()
+    if args.trailer:
+        print(mycenter("PDF trailer"))
+        print(doc.pdf_trailer())
+        print()
+    doc.close()
+
+
+def clean(args):
+    doc = open_file(args.input, args.password, pdf=True)
+    encryption = args.encryption
+    encrypt = ("keep", "none", "rc4-40", "rc4-128", "aes-128", "aes-256").index(
+        encryption
+    )
+
+    if not args.pages:  # simple cleaning
+        doc.save(
+            args.output,
+            garbage=args.garbage,
+            deflate=args.compress,
+            pretty=args.pretty,
+            clean=args.sanitize,
+            ascii=args.ascii,
+            linear=args.linear,
+            encryption=encrypt,
+            owner_pw=args.owner,
+            user_pw=args.user,
+            permissions=args.permission,
+        )
+        return
+
+    # create sub document from page numbers
+    pages = get_list(args.pages, doc.page_count + 1)
+    outdoc = fitz.open()
+    for pno in pages:
+        n = pno - 1
+        outdoc.insert_pdf(doc, from_page=n, to_page=n)
+    outdoc.save(
+        args.output,
+        garbage=args.garbage,
+        deflate=args.compress,
+        pretty=args.pretty,
+        clean=args.sanitize,
+        ascii=args.ascii,
+        linear=args.linear,
+        encryption=encrypt,
+        owner_pw=args.owner,
+        user_pw=args.user,
+        permissions=args.permission,
+    )
+    doc.close()
+    outdoc.close()
+    return
+
+
+def doc_join(args):
+    """Join pages from several PDF documents."""
+    doc_list = args.input  # a list of input PDFs
+    doc = fitz.open()  # output PDF
+    for src_item in doc_list:  # process one input PDF
+        src_list = src_item.split(",")
+        password = src_list[1] if len(src_list) > 1 else None
+        src = open_file(src_list[0], password, pdf=True)
+        pages = ",".join(src_list[2:])  # get 'pages' specifications
+        if pages:  # if anything there, retrieve a list of desired pages
+            page_list = get_list(",".join(src_list[2:]), src.page_count + 1)
+        else:  # take all pages
+            page_list = range(1, src.page_count + 1)
+        for i in page_list:
+            doc.insert_pdf(src, from_page=i - 1, to_page=i - 1)  # copy each source page
+        src.close()
+
+    doc.save(args.output, garbage=4, deflate=True)
+    doc.close()
+
+
+def embedded_copy(args):
+    """Copy embedded files between PDFs."""
+    doc = open_file(args.input, args.password, pdf=True)
+    if not doc.can_save_incrementally() and (
+        not args.output or args.output == args.input
+    ):
+        sys.exit("cannot save PDF incrementally")
+    src = open_file(args.source, args.pwdsource)
+    names = set(args.name) if args.name else set()
+    src_names = set(src.embfile_names())
+    if names:
+        if not names <= src_names:
+            sys.exit("not all names are contained in source")
+    else:
+        names = src_names
+    if not names:
+        sys.exit("nothing to copy")
+    intersect = names & set(doc.embfile_names())  # any equal name already in target?
+    if intersect:
+        sys.exit("following names already exist in receiving PDF: %s" % str(intersect))
+
+    for item in names:
+        info = src.embfile_info(item)
+        buff = src.embfile_get(item)
+        doc.embfile_add(
+            item,
+            buff,
+            filename=info["filename"],
+            ufilename=info["ufilename"],
+            desc=info["desc"],
+        )
+        print("copied entry '%s' from '%s'" % (item, src.name))
+    src.close()
+    if args.output and args.output != args.input:
+        doc.save(args.output, garbage=3)
+    else:
+        doc.saveIncr()
+    doc.close()
+
+
+def embedded_del(args):
+    """Delete an embedded file entry."""
+    doc = open_file(args.input, args.password, pdf=True)
+    if not doc.can_save_incrementally() and (
+        not args.output or args.output == args.input
+    ):
+        sys.exit("cannot save PDF incrementally")
+
+    try:
+        doc.embfile_del(args.name)
+    except ValueError:
+        sys.exit("no such embedded file '%s'" % args.name)
+    if not args.output or args.output == args.input:
+        doc.save_incr()
+    else:
+        doc.save(args.output, garbage=1)
+    doc.close()
+
+
+def embedded_get(args):
+    """Retrieve contents of an embedded file."""
+    doc = open_file(args.input, args.password, pdf=True)
+    try:
+        stream = doc.embfile_get(args.name)
+        d = doc.embfile_info(args.name)
+    except ValueError:
+        sys.exit("no such embedded file '%s'" % args.name)
+    filename = args.output if args.output else d["filename"]
+    output = open(filename, "wb")
+    output.write(stream)
+    output.close()
+    print("saved entry '%s' as '%s'" % (args.name, filename))
+    doc.close()
+
+
+def embedded_add(args):
+    """Insert a new embedded file."""
+    doc = open_file(args.input, args.password, pdf=True)
+    if not doc.can_save_incrementally() and (
+        args.output is None or args.output == args.input
+    ):
+        sys.exit("cannot save PDF incrementally")
+
+    try:
+        doc.embfile_del(args.name)
+        sys.exit("entry '%s' already exists" % args.name)
+    except:
+        pass
+
+    if not os.path.exists(args.path) or not os.path.isfile(args.path):
+        sys.exit("no such file '%s'" % args.path)
+    stream = open(args.path, "rb").read()
+    filename = args.path
+    ufilename = filename
+    if not args.desc:
+        desc = filename
+    else:
+        desc = args.desc
+    doc.embfile_add(
+        args.name, stream, filename=filename, ufilename=ufilename, desc=desc
+    )
+    if not args.output or args.output == args.input:
+        doc.saveIncr()
+    else:
+        doc.save(args.output, garbage=3)
+    doc.close()
+
+
+def embedded_upd(args):
+    """Update contents or metadata of an embedded file."""
+    doc = open_file(args.input, args.password, pdf=True)
+    if not doc.can_save_incrementally() and (
+        args.output is None or args.output == args.input
+    ):
+        sys.exit("cannot save PDF incrementally")
+
+    try:
+        doc.embfile_info(args.name)
+    except:
+        sys.exit("no such embedded file '%s'" % args.name)
+
+    if (
+        args.path is not None
+        and os.path.exists(args.path)
+        and os.path.isfile(args.path)
+    ):
+        stream = open(args.path, "rb").read()
+    else:
+        stream = None
+
+    if args.filename:
+        filename = args.filename
+    else:
+        filename = None
+
+    if args.ufilename:
+        ufilename = args.ufilename
+    elif args.filename:
+        ufilename = args.filename
+    else:
+        ufilename = None
+
+    if args.desc:
+        desc = args.desc
+    else:
+        desc = None
+
+    doc.embfile_upd(
+        args.name, stream, filename=filename, ufilename=ufilename, desc=desc
+    )
+    if args.output is None or args.output == args.input:
+        doc.saveIncr()
+    else:
+        doc.save(args.output, garbage=3)
+    doc.close()
+
+
+def embedded_list(args):
+    """List embedded files."""
+    doc = open_file(args.input, args.password, pdf=True)
+    names = doc.embfile_names()
+    if args.name is not None:
+        if args.name not in names:
+            sys.exit("no such embedded file '%s'" % args.name)
+        else:
+            print()
+            print(
+                "printing 1 of %i embedded file%s:"
+                % (len(names), "s" if len(names) > 1 else "")
+            )
+            print()
+            print_dict(doc.embfile_info(args.name))
+            print()
+            return
+    if not names:
+        print("'%s' contains no embedded files" % doc.name)
+        return
+    if len(names) > 1:
+        msg = "'%s' contains the following %i embedded files" % (doc.name, len(names))
+    else:
+        msg = "'%s' contains the following embedded file" % doc.name
+    print(msg)
+    print()
+    for name in names:
+        if not args.detail:
+            print(name)
+            continue
+        _ = doc.embfile_info(name)
+        print_dict(doc.embfile_info(name))
+        print()
+    doc.close()
+
+
+def extract_objects(args):
+    """Extract images and / or fonts from a PDF."""
+    if not args.fonts and not args.images:
+        sys.exit("neither fonts nor images requested")
+    doc = open_file(args.input, args.password, pdf=True)
+
+    if args.pages:
+        pages = get_list(args.pages, doc.page_count + 1)
+    else:
+        pages = range(1, doc.page_count + 1)
+
+    if not args.output:
+        out_dir = os.path.abspath(os.curdir)
+    else:
+        out_dir = args.output
+        if not (os.path.exists(out_dir) and os.path.isdir(out_dir)):
+            sys.exit("output directory %s does not exist" % out_dir)
+
+    font_xrefs = set()  # already saved fonts
+    image_xrefs = set()  # already saved images
+
+    for pno in pages:
+        if args.fonts:
+            itemlist = doc.get_page_fonts(pno - 1)
+            for item in itemlist:
+                xref = item[0]
+                if xref not in font_xrefs:
+                    font_xrefs.add(xref)
+                    fontname, ext, _, buffer = doc.extract_font(xref)
+                    if ext == "n/a" or not buffer:
+                        continue
+                    outname = os.path.join(
+                        out_dir, f"{fontname.replace(' ', '-')}-{xref}.{ext}"
+                    )
+                    outfile = open(outname, "wb")
+                    outfile.write(buffer)
+                    outfile.close()
+                    buffer = None
+        if args.images:
+            itemlist = doc.get_page_images(pno - 1)
+            for item in itemlist:
+                xref = item[0]
+                if xref not in image_xrefs:
+                    image_xrefs.add(xref)
+                    pix = recoverpix(doc, item)
+                    if type(pix) is dict:
+                        ext = pix["ext"]
+                        imgdata = pix["image"]
+                        outname = os.path.join(out_dir, "img-%i.%s" % (xref, ext))
+                        outfile = open(outname, "wb")
+                        outfile.write(imgdata)
+                        outfile.close()
+                    else:
+                        outname = os.path.join(out_dir, "img-%i.png" % xref)
+                        pix2 = (
+                            pix
+                            if pix.colorspace.n < 4
+                            else fitz.Pixmap(fitz.csRGB, pix)
+                        )
+                        pix2.save(outname)
+
+    if args.fonts:
+        print("saved %i fonts to '%s'" % (len(font_xrefs), out_dir))
+    if args.images:
+        print("saved %i images to '%s'" % (len(image_xrefs), out_dir))
+    doc.close()
+
+
+def page_simple(page, textout, GRID, fontsize, noformfeed, skip_empty, flags):
+    eop = b"\n" if noformfeed else bytes([12])
+    text = page.get_text("text", flags=flags)
+    if not text:
+        if not skip_empty:
+            textout.write(eop)  # write formfeed
+        return
+    textout.write(text.encode("utf8", errors="surrogatepass"))
+    textout.write(eop)
+    return
+
+
+def page_blocksort(page, textout, GRID, fontsize, noformfeed, skip_empty, flags):
+    eop = b"\n" if noformfeed else bytes([12])
+    blocks = page.get_text("blocks", flags=flags)
+    if blocks == []:
+        if not skip_empty:
+            textout.write(eop)  # write formfeed
+        return
+    blocks.sort(key=lambda b: (b[3], b[0]))
+    for b in blocks:
+        textout.write(b[4].encode("utf8", errors="surrogatepass"))
+    textout.write(eop)
+    return
+
+
+def page_layout(page, textout, GRID, fontsize, noformfeed, skip_empty, flags):
+    eop = b"\n" if noformfeed else bytes([12])
+
+    # --------------------------------------------------------------------
+    def find_line_index(values: List[int], value: int) -> int:
+        """Find the right row coordinate.
+
+        Args:
+            values: (list) y-coordinates of rows.
+            value: (int) lookup for this value (y-origin of char).
+        Returns:
+            y-ccordinate of appropriate line for value.
+        """
+        i = bisect.bisect_right(values, value)
+        if i:
+            return values[i - 1]
+        raise RuntimeError("Line for %g not found in %s" % (value, values))
+
+    # --------------------------------------------------------------------
+    def curate_rows(rows: Set[int], GRID) -> List:
+        rows = list(rows)
+        rows.sort()  # sort ascending
+        nrows = [rows[0]]
+        for h in rows[1:]:
+            if h >= nrows[-1] + GRID:  # only keep significant differences
+                nrows.append(h)
+        return nrows  # curated list of line bottom coordinates
+
+    def process_blocks(blocks: List[Dict], page: fitz.Page):
+        rows = set()
+        page_width = page.rect.width
+        page_height = page.rect.height
+        rowheight = page_height
+        left = page_width
+        right = 0
+        chars = []
+        for block in blocks:
+            for line in block["lines"]:
+                if line["dir"] != (1, 0):  # ignore non-horizontal text
+                    continue
+                x0, y0, x1, y1 = line["bbox"]
+                if y1 < 0 or y0 > page.rect.height:  # ignore if outside CropBox
+                    continue
+                # upd row height
+                height = y1 - y0
+
+                if rowheight > height:
+                    rowheight = height
+                for span in line["spans"]:
+                    if span["size"] <= fontsize:
+                        continue
+                    for c in span["chars"]:
+                        x0, _, x1, _ = c["bbox"]
+                        cwidth = x1 - x0
+                        ox, oy = c["origin"]
+                        oy = int(round(oy))
+                        rows.add(oy)
+                        ch = c["c"]
+                        if left > ox and ch != " ":
+                            left = ox  # update left coordinate
+                        if right < x1:
+                            right = x1  # update right coordinate
+                        # handle ligatures:
+                        if cwidth == 0 and chars != []:  # potential ligature
+                            old_ch, old_ox, old_oy, old_cwidth = chars[-1]
+                            if old_oy == oy:  # ligature
+                                if old_ch != chr(0xFB00):  # previous "ff" char lig?
+                                    lig = joinligature(old_ch + ch)  # no
+                                # convert to one of the 3-char ligatures:
+                                elif ch == "i":
+                                    lig = chr(0xFB03)  # "ffi"
+                                elif ch == "l":
+                                    lig = chr(0xFB04)  # "ffl"
+                                else:  # something wrong, leave old char in place
+                                    lig = old_ch
+                                chars[-1] = (lig, old_ox, old_oy, old_cwidth)
+                                continue
+                        chars.append((ch, ox, oy, cwidth))  # all chars on page
+        return chars, rows, left, right, rowheight
+
+    def joinligature(lig: str) -> str:
+        """Return ligature character for a given pair / triple of characters.
+
+        Args:
+            lig: (str) 2/3 characters, e.g. "ff"
+        Returns:
+            Ligature, e.g. "ff" -> chr(0xFB00)
+        """
+
+        if lig == "ff":
+            return chr(0xFB00)
+        elif lig == "fi":
+            return chr(0xFB01)
+        elif lig == "fl":
+            return chr(0xFB02)
+        elif lig == "ffi":
+            return chr(0xFB03)
+        elif lig == "ffl":
+            return chr(0xFB04)
+        elif lig == "ft":
+            return chr(0xFB05)
+        elif lig == "st":
+            return chr(0xFB06)
+        return lig
+
+    # --------------------------------------------------------------------
+    def make_textline(left, slot, minslot, lchars):
+        """Produce the text of one output line.
+
+        Args:
+            left: (float) left most coordinate used on page
+            slot: (float) avg width of one character in any font in use.
+            minslot: (float) min width for the characters in this line.
+            chars: (list[tuple]) characters of this line.
+        Returns:
+            text: (str) text string for this line
+        """
+        text = ""  # we output this
+        old_char = ""
+        old_x1 = 0  # end coordinate of last char
+        old_ox = 0  # x-origin of last char
+        if minslot <= fitz.EPSILON:
+            raise RuntimeError("program error: minslot too small = %g" % minslot)
+
+        for c in lchars:  # loop over characters
+            char, ox, _, cwidth = c
+            ox = ox - left  # its (relative) start coordinate
+            x1 = ox + cwidth  # ending coordinate
+
+            # eliminate overprint effect
+            if old_char == char and ox - old_ox <= cwidth * 0.2:
+                continue
+
+            # omit spaces overlapping previous char
+            if char == " " and (old_x1 - ox) / cwidth > 0.8:
+                continue
+
+            old_char = char
+            # close enough to previous?
+            if ox < old_x1 + minslot:  # assume char adjacent to previous
+                text += char  # append to output
+                old_x1 = x1  # new end coord
+                old_ox = ox  # new origin.x
+                continue
+
+            # else next char starts after some gap:
+            # fill in right number of spaces, so char is positioned
+            # in the right slot of the line
+            if char == " ":  # rest relevant for non-space only
+                continue
+            delta = int(ox / slot) - len(text)
+            if ox > old_x1 and delta > 1:
+                text += " " * delta
+            # now append char
+            text += char
+            old_x1 = x1  # new end coordinate
+            old_ox = ox  # new origin
+        return text.rstrip()
+
+    # extract page text by single characters ("rawdict")
+    blocks = page.get_text("rawdict", flags=flags)["blocks"]
+    chars, rows, left, right, rowheight = process_blocks(blocks, page)
+
+    if chars == []:
+        if not skip_empty:
+            textout.write(eop)  # write formfeed
+        return
+    # compute list of line coordinates - ignoring small (GRID) differences
+    rows = curate_rows(rows, GRID)
+
+    # sort all chars by x-coordinates, so every line will receive char info,
+    # sorted from left to right.
+    chars.sort(key=lambda c: c[1])
+
+    # populate the lines with their char info
+    lines = {}  # key: y1-ccordinate, value: char list
+    for c in chars:
+        _, _, oy, _ = c
+        y = find_line_index(rows, oy)  # y-coord of the right line
+        lchars = lines.get(y, [])  # read line chars so far
+        lchars.append(c)  # append this char
+        lines[y] = lchars  # write back to line
+
+    # ensure line coordinates are ascending
+    keys = list(lines.keys())
+    keys.sort()
+
+    # -------------------------------------------------------------------------
+    # Compute "char resolution" for the page: the char width corresponding to
+    # 1 text char position on output - call it 'slot'.
+    # For each line, compute median of its char widths. The minimum across all
+    # lines is 'slot'.
+    # The minimum char width of each line is used to determine if spaces must
+    # be inserted in between two characters.
+    # -------------------------------------------------------------------------
+    slot = right - left
+    minslots = {}
+    for k in keys:
+        lchars = lines[k]
+        ccount = len(lchars)
+        if ccount < 2:
+            minslots[k] = 1
+            continue
+        widths = [c[3] for c in lchars]
+        widths.sort()
+        this_slot = statistics.median(widths)  # take median value
+        if this_slot < slot:
+            slot = this_slot
+        minslots[k] = widths[0]
+
+    # compute line advance in text output
+    rowheight = rowheight * (rows[-1] - rows[0]) / (rowheight * len(rows)) * 1.2
+    rowpos = rows[0]  # first line positioned here
+    textout.write(b"\n")
+    for k in keys:  # walk through the lines
+        while rowpos < k:  # honor distance between lines
+            textout.write(b"\n")
+            rowpos += rowheight
+        text = make_textline(left, slot, minslots[k], lines[k])
+        textout.write((text + "\n").encode("utf8", errors="surrogatepass"))
+        rowpos = k + rowheight
+
+    textout.write(eop)  # write formfeed
+
+
+def gettext(args):
+    doc = open_file(args.input, args.password, pdf=False)
+    pagel = get_list(args.pages, doc.page_count + 1)
+    output = args.output
+    if output == None:
+        filename, _ = os.path.splitext(doc.name)
+        output = filename + ".txt"
+    textout = open(output, "wb")
+    flags = TEXT_PRESERVE_LIGATURES | TEXT_PRESERVE_WHITESPACE
+    if args.convert_white:
+        flags ^= TEXT_PRESERVE_WHITESPACE
+    if args.noligatures:
+        flags ^= TEXT_PRESERVE_LIGATURES
+    if args.extra_spaces:
+        flags ^= TEXT_INHIBIT_SPACES
+    func = {
+        "simple": page_simple,
+        "blocks": page_blocksort,
+        "layout": page_layout,
+    }
+    for pno in pagel:
+        page = doc[pno - 1]
+        func[args.mode](
+            page,
+            textout,
+            args.grid,
+            args.fontsize,
+            args.noformfeed,
+            args.skip_empty,
+            flags=flags,
+        )
+
+    textout.close()
+
+
+def main():
+    """Define command configurations."""
+    parser = argparse.ArgumentParser(
+        prog="fitz",
+        description=mycenter("Basic PyMuPDF Functions"),
+    )
+    subps = parser.add_subparsers(
+        title="Subcommands", help="Enter 'command -h' for subcommand specific help"
+    )
+
+    # -------------------------------------------------------------------------
+    # 'show' command
+    # -------------------------------------------------------------------------
+    ps_show = subps.add_parser("show", description=mycenter("display PDF information"))
+    ps_show.add_argument("input", type=str, help="PDF filename")
+    ps_show.add_argument("-password", help="password")
+    ps_show.add_argument("-catalog", action="store_true", help="show PDF catalog")
+    ps_show.add_argument("-trailer", action="store_true", help="show PDF trailer")
+    ps_show.add_argument("-metadata", action="store_true", help="show PDF metadata")
+    ps_show.add_argument(
+        "-xrefs", type=str, help="show selected objects, format: 1,5-7,N"
+    )
+    ps_show.add_argument(
+        "-pages", type=str, help="show selected pages, format: 1,5-7,50-N"
+    )
+    ps_show.set_defaults(func=show)
+
+    # -------------------------------------------------------------------------
+    # 'clean' command
+    # -------------------------------------------------------------------------
+    ps_clean = subps.add_parser(
+        "clean", description=mycenter("optimize PDF, or create sub-PDF if pages given")
+    )
+    ps_clean.add_argument("input", type=str, help="PDF filename")
+    ps_clean.add_argument("output", type=str, help="output PDF filename")
+    ps_clean.add_argument("-password", help="password")
+
+    ps_clean.add_argument(
+        "-encryption",
+        help="encryption method",
+        choices=("keep", "none", "rc4-40", "rc4-128", "aes-128", "aes-256"),
+        default="none",
+    )
+
+    ps_clean.add_argument("-owner", type=str, help="owner password")
+    ps_clean.add_argument("-user", type=str, help="user password")
+
+    ps_clean.add_argument(
+        "-garbage",
+        type=int,
+        help="garbage collection level",
+        choices=range(5),
+        default=0,
+    )
+
+    ps_clean.add_argument(
+        "-compress",
+        action="store_true",
+        default=False,
+        help="compress (deflate) output",
+    )
+
+    ps_clean.add_argument(
+        "-ascii", action="store_true", default=False, help="ASCII encode binary data"
+    )
+
+    ps_clean.add_argument(
+        "-linear",
+        action="store_true",
+        default=False,
+        help="format for fast web display",
+    )
+
+    ps_clean.add_argument(
+        "-permission", type=int, default=-1, help="integer with permission levels"
+    )
+
+    ps_clean.add_argument(
+        "-sanitize",
+        action="store_true",
+        default=False,
+        help="sanitize / clean contents",
+    )
+    ps_clean.add_argument(
+        "-pretty", action="store_true", default=False, help="prettify PDF structure"
+    )
+    ps_clean.add_argument(
+        "-pages", help="output selected pages pages, format: 1,5-7,50-N"
+    )
+    ps_clean.set_defaults(func=clean)
+
+    # -------------------------------------------------------------------------
+    # 'join' command
+    # -------------------------------------------------------------------------
+    ps_join = subps.add_parser(
+        "join",
+        description=mycenter("join PDF documents"),
+        epilog="specify each input as 'filename[,password[,pages]]'",
+    )
+    ps_join.add_argument("input", nargs="*", help="input filenames")
+    ps_join.add_argument("-output", required=True, help="output filename")
+    ps_join.set_defaults(func=doc_join)
+
+    # -------------------------------------------------------------------------
+    # 'extract' command
+    # -------------------------------------------------------------------------
+    ps_extract = subps.add_parser(
+        "extract", description=mycenter("extract images and fonts to disk")
+    )
+    ps_extract.add_argument("input", type=str, help="PDF filename")
+    ps_extract.add_argument("-images", action="store_true", help="extract images")
+    ps_extract.add_argument("-fonts", action="store_true", help="extract fonts")
+    ps_extract.add_argument(
+        "-output", help="folder to receive output, defaults to current"
+    )
+    ps_extract.add_argument("-password", help="password")
+    ps_extract.add_argument(
+        "-pages", type=str, help="consider these pages only, format: 1,5-7,50-N"
+    )
+    ps_extract.set_defaults(func=extract_objects)
+
+    # -------------------------------------------------------------------------
+    # 'embed-info'
+    # -------------------------------------------------------------------------
+    ps_show = subps.add_parser(
+        "embed-info", description=mycenter("list embedded files")
+    )
+    ps_show.add_argument("input", help="PDF filename")
+    ps_show.add_argument("-name", help="if given, report only this one")
+    ps_show.add_argument("-detail", action="store_true", help="detail information")
+    ps_show.add_argument("-password", help="password")
+    ps_show.set_defaults(func=embedded_list)
+
+    # -------------------------------------------------------------------------
+    # 'embed-add' command
+    # -------------------------------------------------------------------------
+    ps_embed_add = subps.add_parser(
+        "embed-add", description=mycenter("add embedded file")
+    )
+    ps_embed_add.add_argument("input", help="PDF filename")
+    ps_embed_add.add_argument("-password", help="password")
+    ps_embed_add.add_argument(
+        "-output", help="output PDF filename, incremental save if none"
+    )
+    ps_embed_add.add_argument("-name", required=True, help="name of new entry")
+    ps_embed_add.add_argument("-path", required=True, help="path to data for new entry")
+    ps_embed_add.add_argument("-desc", help="description of new entry")
+    ps_embed_add.set_defaults(func=embedded_add)
+
+    # -------------------------------------------------------------------------
+    # 'embed-del' command
+    # -------------------------------------------------------------------------
+    ps_embed_del = subps.add_parser(
+        "embed-del", description=mycenter("delete embedded file")
+    )
+    ps_embed_del.add_argument("input", help="PDF filename")
+    ps_embed_del.add_argument("-password", help="password")
+    ps_embed_del.add_argument(
+        "-output", help="output PDF filename, incremental save if none"
+    )
+    ps_embed_del.add_argument("-name", required=True, help="name of entry to delete")
+    ps_embed_del.set_defaults(func=embedded_del)
+
+    # -------------------------------------------------------------------------
+    # 'embed-upd' command
+    # -------------------------------------------------------------------------
+    ps_embed_upd = subps.add_parser(
+        "embed-upd",
+        description=mycenter("update embedded file"),
+        epilog="except '-name' all parameters are optional",
+    )
+    ps_embed_upd.add_argument("input", help="PDF filename")
+    ps_embed_upd.add_argument("-name", required=True, help="name of entry")
+    ps_embed_upd.add_argument("-password", help="password")
+    ps_embed_upd.add_argument(
+        "-output", help="Output PDF filename, incremental save if none"
+    )
+    ps_embed_upd.add_argument("-path", help="path to new data for entry")
+    ps_embed_upd.add_argument("-filename", help="new filename to store in entry")
+    ps_embed_upd.add_argument(
+        "-ufilename", help="new unicode filename to store in entry"
+    )
+    ps_embed_upd.add_argument("-desc", help="new description to store in entry")
+    ps_embed_upd.set_defaults(func=embedded_upd)
+
+    # -------------------------------------------------------------------------
+    # 'embed-extract' command
+    # -------------------------------------------------------------------------
+    ps_embed_extract = subps.add_parser(
+        "embed-extract", description=mycenter("extract embedded file to disk")
+    )
+    ps_embed_extract.add_argument("input", type=str, help="PDF filename")
+    ps_embed_extract.add_argument("-name", required=True, help="name of entry")
+    ps_embed_extract.add_argument("-password", help="password")
+    ps_embed_extract.add_argument(
+        "-output", help="output filename, default is stored name"
+    )
+    ps_embed_extract.set_defaults(func=embedded_get)
+
+    # -------------------------------------------------------------------------
+    # 'embed-copy' command
+    # -------------------------------------------------------------------------
+    ps_embed_copy = subps.add_parser(
+        "embed-copy", description=mycenter("copy embedded files between PDFs")
+    )
+    ps_embed_copy.add_argument("input", type=str, help="PDF to receive embedded files")
+    ps_embed_copy.add_argument("-password", help="password of input")
+    ps_embed_copy.add_argument(
+        "-output", help="output PDF, incremental save to 'input' if omitted"
+    )
+    ps_embed_copy.add_argument(
+        "-source", required=True, help="copy embedded files from here"
+    )
+    ps_embed_copy.add_argument("-pwdsource", help="password of 'source' PDF")
+    ps_embed_copy.add_argument(
+        "-name", nargs="*", help="restrict copy to these entries"
+    )
+    ps_embed_copy.set_defaults(func=embedded_copy)
+
+    # -------------------------------------------------------------------------
+    # 'textlayout' command
+    # -------------------------------------------------------------------------
+    ps_gettext = subps.add_parser(
+        "gettext", description=mycenter("extract text in various formatting modes")
+    )
+    ps_gettext.add_argument("input", type=str, help="input document filename")
+    ps_gettext.add_argument("-password", help="password for input document")
+    ps_gettext.add_argument(
+        "-mode",
+        type=str,
+        help="mode: simple, block sort, or layout (default)",
+        choices=("simple", "blocks", "layout"),
+        default="layout",
+    )
+    ps_gettext.add_argument(
+        "-pages",
+        type=str,
+        help="select pages, format: 1,5-7,50-N",
+        default="1-N",
+    )
+    ps_gettext.add_argument(
+        "-noligatures",
+        action="store_true",
+        help="expand ligature characters (default False)",
+        default=False,
+    )
+    ps_gettext.add_argument(
+        "-convert-white",
+        action="store_true",
+        help="convert whitespace characters to white (default False)",
+        default=False,
+    )
+    ps_gettext.add_argument(
+        "-extra-spaces",
+        action="store_true",
+        help="fill gaps with spaces (default False)",
+        default=False,
+    )
+    ps_gettext.add_argument(
+        "-noformfeed",
+        action="store_true",
+        help="write linefeeds, no formfeeds (default False)",
+        default=False,
+    )
+    ps_gettext.add_argument(
+        "-skip-empty",
+        action="store_true",
+        help="suppress pages with no text (default False)",
+        default=False,
+    )
+    ps_gettext.add_argument(
+        "-output",
+        help="store text in this file (default inputfilename.txt)",
+    )
+    ps_gettext.add_argument(
+        "-grid",
+        type=float,
+        help="merge lines if closer than this (default 2)",
+        default=2,
+    )
+    ps_gettext.add_argument(
+        "-fontsize",
+        type=float,
+        help="only include text with a larger fontsize (default 3)",
+        default=3,
+    )
+    ps_gettext.set_defaults(func=gettext)
+
+    # -------------------------------------------------------------------------
+    # start program
+    # -------------------------------------------------------------------------
+    args = parser.parse_args()  # create parameter arguments class
+    if not hasattr(args, "func"):  # no function selected
+        parser.print_help()  # so print top level help
+    else:
+        args.func(args)  # execute requested command
+
+
+if __name__ == "__main__":
+    main()
author	Franz Glasner <fzglas.hg@dom66.de>
date	Mon, 15 Sep 2025 11:44:09 +0200
parents	1d09e1dec1d9
children