Mercurial > hgrepos > Python2 > PyMuPDF
view src/utils.py @ 45:b74429b0f5c4 v1.26.5+1
+++++ v1.26.5+1
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Sat, 11 Oct 2025 17:17:13 +0200 |
| parents | a6bc019ac0b2 |
| children |
line wrap: on
line source
# ------------------------------------------------------------------------ # Copyright 2020-2022, Harald Lieder, mailto:harald.lieder@outlook.com # License: GNU AFFERO GPL 3.0, https://www.gnu.org/licenses/agpl-3.0.html # # Part of "PyMuPDF", a Python binding for "MuPDF" (http://mupdf.com), a # lightweight PDF, XPS, and E-book viewer, renderer and toolkit which is # maintained and developed by Artifex Software, Inc. https://artifex.com. # ------------------------------------------------------------------------ import math import typing import weakref try: from . import pymupdf except Exception: import pymupdf try: from . import mupdf except Exception: import mupdf _format_g = pymupdf.format_g g_exceptions_verbose = pymupdf.g_exceptions_verbose point_like = "point_like" rect_like = "rect_like" matrix_like = "matrix_like" quad_like = "quad_like" # ByteString is gone from typing in 3.14. # collections.abc.Buffer available from 3.12 only try: ByteString = typing.ByteString except AttributeError: # pylint: disable=unsupported-binary-operation ByteString = bytes | bytearray | memoryview AnyType = typing.Any OptInt = typing.Union[int, None] OptFloat = typing.Optional[float] OptStr = typing.Optional[str] OptDict = typing.Optional[dict] OptBytes = typing.Optional[ByteString] OptSeq = typing.Optional[typing.Sequence] """ This is a collection of functions to extend PyMupdf. """ def get_text_blocks( page: pymupdf.Page, clip: rect_like = None, flags: OptInt = None, textpage: pymupdf.TextPage = None, sort: bool = False, ) -> list: """Return the text blocks on a page. Notes: Lines in a block are concatenated with line breaks. Args: flags: (int) control the amount of data parsed into the textpage. Returns: A list of the blocks. Each item contains the containing rectangle coordinates, text lines, running block number and block type. """ pymupdf.CheckParent(page) if flags is None: flags = pymupdf.TEXTFLAGS_BLOCKS tp = textpage if tp is None: tp = page.get_textpage(clip=clip, flags=flags) elif getattr(tp, "parent") != page: raise ValueError("not a textpage of this page") blocks = tp.extractBLOCKS() if textpage is None: del tp if sort: blocks.sort(key=lambda b: (b[3], b[0])) return blocks def get_text_words( page: pymupdf.Page, clip: rect_like = None, flags: OptInt = None, textpage: pymupdf.TextPage = None, sort: bool = False, delimiters=None, tolerance=3, ) -> list: """Return the text words as a list with the bbox for each word. Args: page: pymupdf.Page clip: (rect-like) area on page to consider flags: (int) control the amount of data parsed into the textpage. textpage: (pymupdf.TextPage) either passed-in or None. sort: (bool) sort the words in reading sequence. delimiters: (str,list) characters to use as word delimiters. tolerance: (float) consider words to be part of the same line if top or bottom coordinate are not larger than this. Relevant only if sort=True. Returns: Word tuples (x0, y0, x1, y1, "word", bno, lno, wno). """ def sort_words(words): """Sort words line-wise, forgiving small deviations.""" words.sort(key=lambda w: (w[3], w[0])) nwords = [] # final word list line = [words[0]] # collects words roughly in same line lrect = pymupdf.Rect(words[0][:4]) # start the line rectangle for w in words[1:]: wrect = pymupdf.Rect(w[:4]) if ( abs(wrect.y0 - lrect.y0) <= tolerance or abs(wrect.y1 - lrect.y1) <= tolerance ): line.append(w) lrect |= wrect else: line.sort(key=lambda w: w[0]) # sort words in line l-t-r nwords.extend(line) # append to final words list line = [w] # start next line lrect = wrect # start next line rect line.sort(key=lambda w: w[0]) # sort words in line l-t-r nwords.extend(line) # append to final words list return nwords pymupdf.CheckParent(page) if flags is None: flags = pymupdf.TEXTFLAGS_WORDS tp = textpage if tp is None: tp = page.get_textpage(clip=clip, flags=flags) elif getattr(tp, "parent") != page: raise ValueError("not a textpage of this page") words = tp.extractWORDS(delimiters) # if textpage was given, we subselect the words in clip if textpage is not None and clip is not None: # sub-select words contained in clip clip = pymupdf.Rect(clip) words = [ w for w in words if abs(clip & w[:4]) >= 0.5 * abs(pymupdf.Rect(w[:4])) ] if textpage is None: del tp if words and sort: # advanced sort if any words found words = sort_words(words) return words def get_sorted_text( page: pymupdf.Page, clip: rect_like = None, flags: OptInt = None, textpage: pymupdf.TextPage = None, tolerance=3, ) -> str: """Extract plain text avoiding unacceptable line breaks. Text contained in clip will be sorted in reading sequence. Some effort is also spent to simulate layout vertically and horizontally. Args: page: pymupdf.Page clip: (rect-like) only consider text inside flags: (int) text extraction flags textpage: pymupdf.TextPage tolerance: (float) consider words to be on the same line if their top or bottom coordinates do not differ more than this. Notes: If a TextPage is provided, all text is checked for being inside clip with at least 50% of its bbox. This allows to use some "global" TextPage in conjunction with sub- selecting words in parts of the defined TextPage rectangle. Returns: A text string in reading sequence. Left indentation of each line, inter-line and inter-word distances strive to reflect the layout. """ def line_text(clip, line): """Create the string of one text line. We are trying to simulate some horizontal layout here, too. Args: clip: (pymupdf.Rect) the area from which all text is being read. line: (list) word tuples (rect, text) contained in the line Returns: Text in this line. Generated from words in 'line'. Distance from predecessor is translated to multiple spaces, thus simulating text indentations and large horizontal distances. """ line.sort(key=lambda w: w[0].x0) ltext = "" # text in the line x1 = clip.x0 # end coordinate of ltext lrect = pymupdf.EMPTY_RECT() # bbox of this line for r, t in line: lrect |= r # update line bbox # convert distance to previous word to multiple spaces dist = max( int(round((r.x0 - x1) / r.width * len(t))), 0 if (x1 == clip.x0 or r.x0 <= x1) else 1, ) # number of space characters ltext += " " * dist + t # append word string x1 = r.x1 # update new end position return ltext # Extract words in correct sequence first. words = [ (pymupdf.Rect(w[:4]), w[4]) for w in get_text_words( page, clip=clip, flags=flags, textpage=textpage, sort=True, tolerance=tolerance, ) ] if not words: # no text present return "" totalbox = pymupdf.EMPTY_RECT() # area covering all text for wr, text in words: totalbox |= wr lines = [] # list of reconstituted lines line = [words[0]] # current line lrect = words[0][0] # the line's rectangle # walk through the words for wr, text in words[1:]: # start with second word w0r, _ = line[-1] # read previous word in current line # if this word matches top or bottom of the line, append it if abs(lrect.y0 - wr.y0) <= tolerance or abs(lrect.y1 - wr.y1) <= tolerance: line.append((wr, text)) lrect |= wr else: # output current line and re-initialize ltext = line_text(totalbox, line) lines.append((lrect, ltext)) line = [(wr, text)] lrect = wr # also append unfinished last line ltext = line_text(totalbox, line) lines.append((lrect, ltext)) # sort all lines vertically lines.sort(key=lambda l: (l[0].y1)) text = lines[0][1] # text of first line y1 = lines[0][0].y1 # its bottom coordinate for lrect, ltext in lines[1:]: distance = min(int(round((lrect.y0 - y1) / lrect.height)), 5) breaks = "\n" * (distance + 1) text += breaks + ltext y1 = lrect.y1 # return text in clip return text def get_textbox( page: pymupdf.Page, rect: rect_like, textpage: pymupdf.TextPage = None, ) -> str: tp = textpage if tp is None: tp = page.get_textpage() elif getattr(tp, "parent") != page: raise ValueError("not a textpage of this page") rc = tp.extractTextbox(rect) if textpage is None: del tp return rc def get_text_selection( page: pymupdf.Page, p1: point_like, p2: point_like, clip: rect_like = None, textpage: pymupdf.TextPage = None, ): pymupdf.CheckParent(page) tp = textpage if tp is None: tp = page.get_textpage(clip=clip, flags=pymupdf.TEXT_DEHYPHENATE) elif getattr(tp, "parent") != page: raise ValueError("not a textpage of this page") rc = tp.extractSelection(p1, p2) if textpage is None: del tp return rc def get_textpage_ocr( page: pymupdf.Page, flags: int = 0, language: str = "eng", dpi: int = 72, full: bool = False, tessdata: str = None, ) -> pymupdf.TextPage: """Create a Textpage from combined results of normal and OCR text parsing. Args: flags: (int) control content becoming part of the result. language: (str) specify expected language(s). Default is "eng" (English). dpi: (int) resolution in dpi, default 72. full: (bool) whether to OCR the full page image, or only its images (default) """ pymupdf.CheckParent(page) tessdata = pymupdf.get_tessdata(tessdata) def full_ocr(page, dpi, language, flags): zoom = dpi / 72 mat = pymupdf.Matrix(zoom, zoom) pix = page.get_pixmap(matrix=mat) ocr_pdf = pymupdf.Document( "pdf", pix.pdfocr_tobytes( compress=False, language=language, tessdata=tessdata, ), ) ocr_page = ocr_pdf.load_page(0) unzoom = page.rect.width / ocr_page.rect.width ctm = pymupdf.Matrix(unzoom, unzoom) * page.derotation_matrix tpage = ocr_page.get_textpage(flags=flags, matrix=ctm) ocr_pdf.close() pix = None tpage.parent = weakref.proxy(page) return tpage # if OCR for the full page, OCR its pixmap @ desired dpi if full: return full_ocr(page, dpi, language, flags) # For partial OCR, make a normal textpage, then extend it with text that # is OCRed from each image. # Because of this, we need the images flag bit set ON. tpage = page.get_textpage(flags=flags) for block in page.get_text("dict", flags=pymupdf.TEXT_PRESERVE_IMAGES)["blocks"]: if block["type"] != 1: # only look at images continue bbox = pymupdf.Rect(block["bbox"]) if bbox.width <= 3 or bbox.height <= 3: # ignore tiny stuff continue try: pix = pymupdf.Pixmap(block["image"]) # get image pixmap if pix.n - pix.alpha != 3: # we need to convert this to RGB! pix = pymupdf.Pixmap(pymupdf.csRGB, pix) if pix.alpha: # must remove alpha channel pix = pymupdf.Pixmap(pix, 0) imgdoc = pymupdf.Document( "pdf", pix.pdfocr_tobytes(language=language, tessdata=tessdata), ) # pdf with OCRed page imgpage = imgdoc.load_page(0) # read image as a page pix = None # compute matrix to transform coordinates back to that of 'page' imgrect = imgpage.rect # page size of image PDF shrink = pymupdf.Matrix(1 / imgrect.width, 1 / imgrect.height) mat = shrink * block["transform"] imgpage.extend_textpage(tpage, flags=0, matrix=mat) imgdoc.close() except (RuntimeError, mupdf.FzErrorBase): if 0 and g_exceptions_verbose: # Don't show exception info here because it can happen in # normal operation (see test_3842b). pymupdf.exception_info() tpage = None pymupdf.message("Falling back to full page OCR") return full_ocr(page, dpi, language, flags) return tpage def get_text( page: pymupdf.Page, option: str = "text", *, clip: rect_like = None, flags: OptInt = None, textpage: pymupdf.TextPage = None, sort: bool = False, delimiters=None, tolerance=3, ): """Extract text from a page or an annotation. This is a unifying wrapper for various methods of the pymupdf.TextPage class. Args: option: (str) text, words, blocks, html, dict, json, rawdict, xhtml or xml. clip: (rect-like) restrict output to this area. flags: bit switches to e.g. exclude images or decompose ligatures. textpage: reuse this pymupdf.TextPage and make no new one. If specified, 'flags' and 'clip' are ignored. Returns: the output of methods get_text_words / get_text_blocks or pymupdf.TextPage methods extractText, extractHTML, extractDICT, extractJSON, extractRAWDICT, extractXHTML or etractXML respectively. Default and misspelling choice is "text". """ formats = { "text": pymupdf.TEXTFLAGS_TEXT, "html": pymupdf.TEXTFLAGS_HTML, "json": pymupdf.TEXTFLAGS_DICT, "rawjson": pymupdf.TEXTFLAGS_RAWDICT, "xml": pymupdf.TEXTFLAGS_XML, "xhtml": pymupdf.TEXTFLAGS_XHTML, "dict": pymupdf.TEXTFLAGS_DICT, "rawdict": pymupdf.TEXTFLAGS_RAWDICT, "words": pymupdf.TEXTFLAGS_WORDS, "blocks": pymupdf.TEXTFLAGS_BLOCKS, } option = option.lower() assert option in formats if option not in formats: option = "text" if flags is None: flags = formats[option] if option == "words": return get_text_words( page, clip=clip, flags=flags, textpage=textpage, sort=sort, delimiters=delimiters, ) if option == "blocks": return get_text_blocks( page, clip=clip, flags=flags, textpage=textpage, sort=sort ) if option == "text" and sort: return get_sorted_text( page, clip=clip, flags=flags, textpage=textpage, tolerance=tolerance, ) pymupdf.CheckParent(page) cb = None if option in ("html", "xml", "xhtml"): # no clipping for MuPDF functions clip = page.cropbox if clip is not None: clip = pymupdf.Rect(clip) cb = None elif type(page) is pymupdf.Page: cb = page.cropbox # pymupdf.TextPage with or without images tp = textpage #pymupdf.exception_info() if tp is None: tp = page.get_textpage(clip=clip, flags=flags) elif getattr(tp, "parent") != page: raise ValueError("not a textpage of this page") #pymupdf.log( '{option=}') if option == "json": t = tp.extractJSON(cb=cb, sort=sort) elif option == "rawjson": t = tp.extractRAWJSON(cb=cb, sort=sort) elif option == "dict": t = tp.extractDICT(cb=cb, sort=sort) elif option == "rawdict": t = tp.extractRAWDICT(cb=cb, sort=sort) elif option == "html": t = tp.extractHTML() elif option == "xml": t = tp.extractXML() elif option == "xhtml": t = tp.extractXHTML() else: t = tp.extractText(sort=sort) if textpage is None: del tp return t def getLinkDict(ln, document=None) -> dict: if isinstance(ln, pymupdf.Outline): dest = ln.destination(document) elif isinstance(ln, pymupdf.Link): dest = ln.dest else: assert 0, f'Unexpected {type(ln)=}.' nl = {"kind": dest.kind, "xref": 0} try: if hasattr(ln, 'rect'): nl["from"] = ln.rect except Exception: # This seems to happen quite often in PyMuPDF/tests. if g_exceptions_verbose >= 2: pymupdf.exception_info() pass pnt = pymupdf.Point(0, 0) if dest.flags & pymupdf.LINK_FLAG_L_VALID: pnt.x = dest.lt.x if dest.flags & pymupdf.LINK_FLAG_T_VALID: pnt.y = dest.lt.y if dest.kind == pymupdf.LINK_URI: nl["uri"] = dest.uri elif dest.kind == pymupdf.LINK_GOTO: nl["page"] = dest.page nl["to"] = pnt if dest.flags & pymupdf.LINK_FLAG_R_IS_ZOOM: nl["zoom"] = dest.rb.x else: nl["zoom"] = 0.0 elif dest.kind == pymupdf.LINK_GOTOR: nl["file"] = dest.file_spec.replace("\\", "/") nl["page"] = dest.page if dest.page < 0: nl["to"] = dest.dest else: nl["to"] = pnt if dest.flags & pymupdf.LINK_FLAG_R_IS_ZOOM: nl["zoom"] = dest.rb.x else: nl["zoom"] = 0.0 elif dest.kind == pymupdf.LINK_LAUNCH: nl["file"] = dest.file_spec.replace("\\", "/") elif dest.kind == pymupdf.LINK_NAMED: # The dicts should not have same key(s). assert not (dest.named.keys() & nl.keys()) nl.update(dest.named) if 'to' in nl: nl['to'] = pymupdf.Point(nl['to']) else: nl["page"] = dest.page return nl def getDestStr(xref: int, ddict: dict) -> str: """Calculate the PDF action string. Notes: Supports Link annotations and outline items (bookmarks). """ if not ddict: return "" str_goto = lambda a, b, c, d: f"/A<</S/GoTo/D[{a} 0 R/XYZ {_format_g((b, c, d))}]>>" str_gotor1 = lambda a, b, c, d, e, f: f"/A<</S/GoToR/D[{a} /XYZ {_format_g((b, c, d))}]/F<</F{e}/UF{f}/Type/Filespec>>>>" str_gotor2 = lambda a, b, c: f"/A<</S/GoToR/D{a}/F<</F{b}/UF{c}/Type/Filespec>>>>" str_launch = lambda a, b: f"/A<</S/Launch/F<</F{a}/UF{b}/Type/Filespec>>>>" str_uri = lambda a: f"/A<</S/URI/URI{a}>>" if type(ddict) in (int, float): dest = str_goto(xref, 0, ddict, 0) return dest d_kind = ddict.get("kind", pymupdf.LINK_NONE) if d_kind == pymupdf.LINK_NONE: return "" if ddict["kind"] == pymupdf.LINK_GOTO: d_zoom = ddict.get("zoom", 0) to = ddict.get("to", pymupdf.Point(0, 0)) d_left, d_top = to dest = str_goto(xref, d_left, d_top, d_zoom) return dest if ddict["kind"] == pymupdf.LINK_URI: dest = str_uri(pymupdf.get_pdf_str(ddict["uri"]),) return dest if ddict["kind"] == pymupdf.LINK_LAUNCH: fspec = pymupdf.get_pdf_str(ddict["file"]) dest = str_launch(fspec, fspec) return dest if ddict["kind"] == pymupdf.LINK_GOTOR and ddict["page"] < 0: fspec = pymupdf.get_pdf_str(ddict["file"]) dest = str_gotor2(pymupdf.get_pdf_str(ddict["to"]), fspec, fspec) return dest if ddict["kind"] == pymupdf.LINK_GOTOR and ddict["page"] >= 0: fspec = pymupdf.get_pdf_str(ddict["file"]) dest = str_gotor1( ddict["page"], ddict["to"].x, ddict["to"].y, ddict["zoom"], fspec, fspec, ) return dest return "" def getLinkText(page: pymupdf.Page, lnk: dict) -> str: # -------------------------------------------------------------------------- # define skeletons for /Annots object texts # -------------------------------------------------------------------------- ctm = page.transformation_matrix ictm = ~ctm r = lnk["from"] rect = _format_g(tuple(r * ictm)) annot = "" if lnk["kind"] == pymupdf.LINK_GOTO: if lnk["page"] >= 0: txt = pymupdf.annot_skel["goto1"] # annot_goto pno = lnk["page"] xref = page.parent.page_xref(pno) pnt = lnk.get("to", pymupdf.Point(0, 0)) # destination point dest_page = page.parent[pno] dest_ctm = dest_page.transformation_matrix dest_ictm = ~dest_ctm ipnt = pnt * dest_ictm annot = txt(xref, ipnt.x, ipnt.y, lnk.get("zoom", 0), rect) else: txt = pymupdf.annot_skel["goto2"] # annot_goto_n annot = txt(pymupdf.get_pdf_str(lnk["to"]), rect) elif lnk["kind"] == pymupdf.LINK_GOTOR: if lnk["page"] >= 0: txt = pymupdf.annot_skel["gotor1"] # annot_gotor pnt = lnk.get("to", pymupdf.Point(0, 0)) # destination point if type(pnt) is not pymupdf.Point: pnt = pymupdf.Point(0, 0) annot = txt( lnk["page"], pnt.x, pnt.y, lnk.get("zoom", 0), lnk["file"], lnk["file"], rect, ) else: txt = pymupdf.annot_skel["gotor2"] # annot_gotor_n annot = txt(pymupdf.get_pdf_str(lnk["to"]), lnk["file"], rect) elif lnk["kind"] == pymupdf.LINK_LAUNCH: txt = pymupdf.annot_skel["launch"] # annot_launch annot = txt(lnk["file"], lnk["file"], rect) elif lnk["kind"] == pymupdf.LINK_URI: txt = pymupdf.annot_skel["uri"] # txt = annot_uri annot = txt(lnk["uri"], rect) elif lnk["kind"] == pymupdf.LINK_NAMED: txt = pymupdf.annot_skel["named"] # annot_named lname = lnk.get("name") # check presence of key if lname is None: # if missing, fall back to alternative lname = lnk["nameddest"] annot = txt(lname, rect) if not annot: return annot # add a /NM PDF key to the object definition link_names = dict( # existing ids and their xref [(x[0], x[2]) for x in page.annot_xrefs() if x[1] == pymupdf.PDF_ANNOT_LINK] # pylint: disable=no-member ) old_name = lnk.get("id", "") # id value in the argument if old_name and (lnk["xref"], old_name) in link_names.items(): name = old_name # no new name if this is an update only else: i = 0 stem = pymupdf.TOOLS.set_annot_stem() + "-L%i" while True: name = stem % i if name not in link_names.values(): break i += 1 # add /NM key to object definition annot = annot.replace("/Link", "/Link/NM(%s)" % name) return annot # ---------------------------------------------------------------------- # Name: wx.lib.colourdb.py # Purpose: Adds a bunch of colour names and RGB values to the # colour database so they can be found by name # # Author: Robin Dunn # # Created: 13-March-2001 # Copyright: (c) 2001-2017 by Total Control Software # Licence: wxWindows license # Tags: phoenix-port, unittest, documented # ---------------------------------------------------------------------- def getColorList() -> list: """ Returns a list of upper-case colour names. :rtype: list of strings """ return [name for name, r, g, b in pymupdf.colors_wx_list()] def getColorInfoList() -> list: """ Returns list of (name, red, gree, blue) tuples, where: name: upper-case color name. read, green, blue: integers in range 0..255. :rtype: list of tuples """ return pymupdf.colors_wx_list() def getColor(name: str) -> tuple: """Retrieve RGB color in PDF format by name. Returns: a triple of floats in range 0 to 1. In case of name-not-found, "white" is returned. """ return pymupdf.colors_pdf_dict().get(name.lower(), (1, 1, 1)) def getColorHSV(name: str) -> tuple: """Retrieve the hue, saturation, value triple of a color name. Returns: a triple (degree, percent, percent). If not found (-1, -1, -1) is returned. """ try: x = getColorInfoList()[getColorList().index(name.upper())] except Exception: if g_exceptions_verbose: pymupdf.exception_info() return (-1, -1, -1) r = x[1] / 255.0 g = x[2] / 255.0 b = x[3] / 255.0 cmax = max(r, g, b) V = round(cmax * 100, 1) cmin = min(r, g, b) delta = cmax - cmin if delta == 0: hue = 0 elif cmax == r: hue = 60.0 * (((g - b) / delta) % 6) elif cmax == g: hue = 60.0 * (((b - r) / delta) + 2) else: hue = 60.0 * (((r - g) / delta) + 4) H = int(round(hue)) if cmax == 0: sat = 0 else: sat = delta / cmax S = int(round(sat * 100)) return (H, S, V) def _get_font_properties(doc: pymupdf.Document, xref: int) -> tuple: fontname, ext, stype, buffer = doc.extract_font(xref) asc = 0.8 dsc = -0.2 if ext == "": return fontname, ext, stype, asc, dsc if buffer: try: font = pymupdf.Font(fontbuffer=buffer) asc = font.ascender dsc = font.descender bbox = font.bbox if asc - dsc < 1: if bbox.y0 < dsc: dsc = bbox.y0 asc = 1 - dsc except Exception: pymupdf.exception_info() asc *= 1.2 dsc *= 1.2 return fontname, ext, stype, asc, dsc if ext != "n/a": try: font = pymupdf.Font(fontname) asc = font.ascender dsc = font.descender except Exception: pymupdf.exception_info() asc *= 1.2 dsc *= 1.2 else: asc *= 1.2 dsc *= 1.2 return fontname, ext, stype, asc, dsc def _show_fz_text( text): #if mupdf_cppyy: # assert isinstance( text, cppyy.gbl.mupdf.Text) #else: # assert isinstance( text, mupdf.Text) num_spans = 0 num_chars = 0 span = text.m_internal.head while 1: if not span: break num_spans += 1 num_chars += span.len span = span.next return f'num_spans={num_spans} num_chars={num_chars}' """ Handle page labels for PDF documents. Reading ------- * compute the label of a page * find page number(s) having the given label. Writing ------- Supports setting (defining) page labels for PDF documents. A big Thank You goes to WILLIAM CHAPMAN who contributed the idea and significant parts of the following code during late December 2020 through early January 2021. """ def rule_dict(item): """Make a Python dict from a PDF page label rule. Args: item -- a tuple (pno, rule) with the start page number and the rule string like <</S/D...>>. Returns: A dict like {'startpage': int, 'prefix': str, 'style': str, 'firstpagenum': int}. """ # Jorj McKie, 2021-01-06 pno, rule = item rule = rule[2:-2].split("/")[1:] # strip "<<" and ">>" d = {"startpage": pno, "prefix": "", "firstpagenum": 1} skip = False for i, item in enumerate(rule): # pylint: disable=redefined-argument-from-local if skip: # this item has already been processed skip = False # deactivate skipping again continue if item == "S": # style specification d["style"] = rule[i + 1] # next item has the style skip = True # do not process next item again continue if item.startswith("P"): # prefix specification: extract the string x = item[1:].replace("(", "").replace(")", "") d["prefix"] = x continue if item.startswith("St"): # start page number specification x = int(item[2:]) d["firstpagenum"] = x return d def get_label_pno(pgNo, labels): """Return the label for this page number. Args: pgNo: page number, 0-based. labels: result of doc._get_page_labels(). Returns: The label (str) of the page number. Errors return an empty string. """ # Jorj McKie, 2021-01-06 item = [x for x in labels if x[0] <= pgNo][-1] rule = rule_dict(item) prefix = rule.get("prefix", "") style = rule.get("style", "") # make sure we start at 0 when enumerating the alphabet delta = -1 if style in ("a", "A") else 0 pagenumber = pgNo - rule["startpage"] + rule["firstpagenum"] + delta return construct_label(style, prefix, pagenumber) def construct_label(style, prefix, pno) -> str: """Construct a label based on style, prefix and page number.""" # William Chapman, 2021-01-06 n_str = "" if style == "D": n_str = str(pno) elif style == "r": n_str = integerToRoman(pno).lower() elif style == "R": n_str = integerToRoman(pno).upper() elif style == "a": n_str = integerToLetter(pno).lower() elif style == "A": n_str = integerToLetter(pno).upper() result = prefix + n_str return result def integerToLetter(i) -> str: """Returns letter sequence string for integer i.""" # William Chapman, Jorj McKie, 2021-01-06 import string ls = string.ascii_uppercase n, a = 1, i while pow(26, n) <= a: a -= int(math.pow(26, n)) n += 1 str_t = "" for j in reversed(range(n)): f, g = divmod(a, int(math.pow(26, j))) str_t += ls[f] a = g return str_t def integerToRoman(num: int) -> str: """Return roman numeral for an integer.""" # William Chapman, Jorj McKie, 2021-01-06 roman = ( (1000, "M"), (900, "CM"), (500, "D"), (400, "CD"), (100, "C"), (90, "XC"), (50, "L"), (40, "XL"), (10, "X"), (9, "IX"), (5, "V"), (4, "IV"), (1, "I"), ) def roman_num(num): for r, ltr in roman: x, _ = divmod(num, r) yield ltr * x num -= r * x if num <= 0: break return "".join([a for a in roman_num(num)]) # ------------------------------------------------------------------- # Functions to recover the quad contained in a text extraction bbox # ------------------------------------------------------------------- def recover_bbox_quad(line_dir: tuple, span: dict, bbox: tuple) -> pymupdf.Quad: """Compute the quad located inside the bbox. The bbox may be any of the resp. tuples occurring inside the given span. Args: line_dir: (tuple) 'line["dir"]' of the owning line or None. span: (dict) the span. May be from get_texttrace() method. bbox: (tuple) the bbox of the span or any of its characters. Returns: The quad which is wrapped by the bbox. """ if line_dir is None: line_dir = span["dir"] cos, sin = line_dir bbox = pymupdf.Rect(bbox) # make it a rect if pymupdf.TOOLS.set_small_glyph_heights(): # ==> just fontsize as height d = 1 else: d = span["ascender"] - span["descender"] height = d * span["size"] # the quad's rectangle height # The following are distances from the bbox corners, at which we find the # respective quad points. The computation depends on in which quadrant the # text writing angle is located. hs = height * sin hc = height * cos if hc >= 0 and hs <= 0: # quadrant 1 ul = bbox.bl - (0, hc) ur = bbox.tr + (hs, 0) ll = bbox.bl - (hs, 0) lr = bbox.tr + (0, hc) elif hc <= 0 and hs <= 0: # quadrant 2 ul = bbox.br + (hs, 0) ur = bbox.tl - (0, hc) ll = bbox.br + (0, hc) lr = bbox.tl - (hs, 0) elif hc <= 0 and hs >= 0: # quadrant 3 ul = bbox.tr - (0, hc) ur = bbox.bl + (hs, 0) ll = bbox.tr - (hs, 0) lr = bbox.bl + (0, hc) else: # quadrant 4 ul = bbox.tl + (hs, 0) ur = bbox.br - (0, hc) ll = bbox.tl + (0, hc) lr = bbox.br - (hs, 0) return pymupdf.Quad(ul, ur, ll, lr) def recover_quad(line_dir: tuple, span: dict) -> pymupdf.Quad: """Recover the quadrilateral of a text span. Args: line_dir: (tuple) 'line["dir"]' of the owning line. span: the span. Returns: The quadrilateral enveloping the span's text. """ if type(line_dir) is not tuple or len(line_dir) != 2: raise ValueError("bad line dir argument") if type(span) is not dict: raise ValueError("bad span argument") return recover_bbox_quad(line_dir, span, span["bbox"]) def recover_line_quad(line: dict, spans: list = None) -> pymupdf.Quad: """Calculate the line quad for 'dict' / 'rawdict' text extractions. The lower quad points are those of the first, resp. last span quad. The upper points are determined by the maximum span quad height. From this, compute a rect with bottom-left in (0, 0), convert this to a quad and rotate and shift back to cover the text of the spans. Args: spans: (list, optional) sub-list of spans to consider. Returns: pymupdf.Quad covering selected spans. """ if spans is None: # no sub-selection spans = line["spans"] # all spans if len(spans) == 0: raise ValueError("bad span list") line_dir = line["dir"] # text direction cos, sin = line_dir q0 = recover_quad(line_dir, spans[0]) # quad of first span if len(spans) > 1: # get quad of last span q1 = recover_quad(line_dir, spans[-1]) else: q1 = q0 # last = first line_ll = q0.ll # lower-left of line quad line_lr = q1.lr # lower-right of line quad mat0 = pymupdf.planish_line(line_ll, line_lr) # map base line to x-axis such that line_ll goes to (0, 0) x_lr = line_lr * mat0 small = pymupdf.TOOLS.set_small_glyph_heights() # small glyph heights? h = max( [s["size"] * (1 if small else (s["ascender"] - s["descender"])) for s in spans] ) line_rect = pymupdf.Rect(0, -h, x_lr.x, 0) # line rectangle line_quad = line_rect.quad # make it a quad and: line_quad *= ~mat0 return line_quad def recover_span_quad(line_dir: tuple, span: dict, chars: list = None) -> pymupdf.Quad: """Calculate the span quad for 'dict' / 'rawdict' text extractions. Notes: There are two execution paths: 1. For the full span quad, the result of 'recover_quad' is returned. 2. For the quad of a sub-list of characters, the char quads are computed and joined. This is only supported for the "rawdict" extraction option. Args: line_dir: (tuple) 'line["dir"]' of the owning line. span: (dict) the span. chars: (list, optional) sub-list of characters to consider. Returns: pymupdf.Quad covering selected characters. """ if line_dir is None: # must be a span from get_texttrace() line_dir = span["dir"] if chars is None: # no sub-selection return recover_quad(line_dir, span) if "chars" not in span.keys(): raise ValueError("need 'rawdict' option to sub-select chars") q0 = recover_char_quad(line_dir, span, chars[0]) # quad of first char if len(chars) > 1: # get quad of last char q1 = recover_char_quad(line_dir, span, chars[-1]) else: q1 = q0 # last = first span_ll = q0.ll # lower-left of span quad span_lr = q1.lr # lower-right of span quad mat0 = pymupdf.planish_line(span_ll, span_lr) # map base line to x-axis such that span_ll goes to (0, 0) x_lr = span_lr * mat0 small = pymupdf.TOOLS.set_small_glyph_heights() # small glyph heights? h = span["size"] * (1 if small else (span["ascender"] - span["descender"])) span_rect = pymupdf.Rect(0, -h, x_lr.x, 0) # line rectangle span_quad = span_rect.quad # make it a quad and: span_quad *= ~mat0 # rotate back and shift back return span_quad def recover_char_quad(line_dir: tuple, span: dict, char: dict) -> pymupdf.Quad: """Recover the quadrilateral of a text character. This requires the "rawdict" option of text extraction. Args: line_dir: (tuple) 'line["dir"]' of the span's line. span: (dict) the span dict. char: (dict) the character dict. Returns: The quadrilateral enveloping the character. """ if line_dir is None: line_dir = span["dir"] if type(line_dir) is not tuple or len(line_dir) != 2: raise ValueError("bad line dir argument") if type(span) is not dict: raise ValueError("bad span argument") if type(char) is dict: bbox = pymupdf.Rect(char["bbox"]) elif type(char) is tuple: bbox = pymupdf.Rect(char[3]) else: raise ValueError("bad span argument") return recover_bbox_quad(line_dir, span, bbox)
