Mercurial > hgrepos > Python2 > PyMuPDF
diff mupdf-source/thirdparty/gumbo-parser/python/gumbo/html5lib_adapter.py @ 2:b50eed0cc0ef upstream
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
The directory name has changed: no version number in the expanded directory now.
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Mon, 15 Sep 2025 11:43:07 +0200 |
| parents | |
| children |
line wrap: on
line diff
# Reconstructed from the diff view of
#   --- /dev/null Thu Jan 01 00:00:00 1970 +0000
#   +++ b/mupdf-source/thirdparty/gumbo-parser/python/gumbo/html5lib_adapter.py
#       Mon Sep 15 11:43:07 2025 +0200
#   @@ -0,0 +1,140 @@
#
# Copyright 2012 Google Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
"""Adapter between Gumbo and html5lib.

This exports one class, HTMLParser, which is constructed with an
html5lib-style TreeBuilder instance.  Its parse and parseFragment methods take
the text (or a file-like object) to parse, feed the Gumbo parse tree into the
TreeBuilder, and give back the document or fragment it built.  Example:

  parser = HTMLParser(treebuilder)
  doc = parser.parse(text)
"""

__author__ = 'jdtang@google.com (Jonathan Tang)'

import gumboc

# These should match html5lib.constants.namespaces, and be indexed by the enum
# values of gumboc.Namespace
_NAMESPACES = [
    'http://www.w3.org/1999/xhtml',
    'http://www.w3.org/2000/svg',
    'http://www.w3.org/1998/Math/MathML',
    ]


def _read_text(text_or_file):
  """Returns text_or_file.read() if it has a read method, else the object."""
  try:
    return text_or_file.read()
  except AttributeError:
    # Assume a string.
    return text_or_file


def _convert_doctype(treebuilder, source_node):
  """Inserts a doctype token into treebuilder if source_node carries one.

  Args:
    treebuilder: Object providing insertDoctype(token).
    source_node: Document node exposing has_doctype, name, public_identifier
        and system_identifier (the latter three as UTF-8 byte strings).
  """
  if not source_node.has_doctype:
    # Mimic html5lib behavior: if no doctype token, no doctype node.
    return
  treebuilder.insertDoctype({
      'name': source_node.name.decode('utf-8'),
      'publicId': source_node.public_identifier.decode('utf-8'),
      'systemId': source_node.system_identifier.decode('utf-8'),
      })


def _convert_attributes(source_node):
  """Builds the attribute dict for an element token.

  Args:
    source_node: Node whose `attributes` are iterated; each attribute exposes
        namespace, name and value (name/value as UTF-8 byte strings).

  Returns:
    A dict mapping attribute keys to decoded values.  Attributes without a
    namespace are keyed by their decoded name; namespaced attributes are keyed
    by a (prefix, name, namespace_url) tuple, where prefix is None for the
    attribute named 'xmlns'.
  """
  def maybe_namespace(attr):
    # Decode first: comparing the raw byte string against the text literal
    # 'xmlns' can never match on Python 3, which left the prefix set for
    # 'xmlns' attributes.
    name = attr.name.decode('utf-8')
    if attr.namespace != gumboc.AttributeNamespace.NONE:
      prefix = repr(attr.namespace).lower() if name != 'xmlns' else None
      return (prefix, name, attr.namespace.to_url())
    return name

  return dict((maybe_namespace(attr), attr.value.decode('utf-8'))
              for attr in source_node.attributes)


def _convert_element(source_node):
  """Converts an element/template node into an element token dict.

  Args:
    source_node: Node of type ELEMENT or TEMPLATE.

  Returns:
    A dict with 'name', 'namespace' and 'data' (the attribute dict) keys.

  Raises:
    AssertionError: If source_node is not an ELEMENT or TEMPLATE node.
  """
  if source_node.type not in (
      gumboc.NodeType.ELEMENT, gumboc.NodeType.TEMPLATE):
    # If-statement instead of assert so it runs with -O
    raise AssertionError(
        '_convert_element only works with elements; found %r' %
        source_node.type)
  return {
      'name': source_node.v.element.tag_name.decode('utf-8'),
      'namespace': _NAMESPACES[source_node.v.element.tag_namespace.value],
      'data': _convert_attributes(source_node),
      }


def _insert_root(treebuilder, source_node, pop_element=True):
  """Inserts source_node as the tree root, then all of its descendants.

  Args:
    treebuilder: Object providing insertRoot and an openElements stack.
    source_node: The root element node.
    pop_element: If true, pop the root from treebuilder.openElements once its
        children have been inserted.
  """
  treebuilder.insertRoot(_convert_element(source_node))
  for child_node in source_node.children:
    _insert_node(treebuilder, child_node)
  if pop_element:
    treebuilder.openElements.pop()


def _insert_node(treebuilder, source_node):
  """Recursively inserts a non-document node (and its subtree).

  Comments go through insertComment, text-like nodes (TEXT, WHITESPACE, CDATA)
  through insertText, and everything else is treated as an element: it is
  inserted with insertElementNormal, its children are inserted recursively,
  and it is then popped off treebuilder.openElements.
  """
  assert source_node.type != gumboc.NodeType.DOCUMENT
  if source_node.type == gumboc.NodeType.COMMENT:
    treebuilder.insertComment({'data': source_node.v.text.text.decode('utf-8')})
  elif source_node.type in (
      gumboc.NodeType.TEXT,
      gumboc.NodeType.WHITESPACE,
      gumboc.NodeType.CDATA):
    treebuilder.insertText(source_node.v.text.text.decode('utf-8'))
  else:
    treebuilder.insertElementNormal(_convert_element(source_node))
    for child_node in source_node.v.element.children:
      _insert_node(treebuilder, child_node)
    treebuilder.openElements.pop()


class HTMLParser(object):
  """Parser that drives a TreeBuilder from a Gumbo parse.

  Attributes are limited to `tree`, the TreeBuilder instance passed to the
  constructor; both parse methods feed nodes into it.
  """

  def __init__(self, tree):
    self.tree = tree

  def parse(self, text_or_file, **kwargs):
    """Parses a full document and returns self.tree.getDocument().

    Args:
      text_or_file: The text to parse, or an object with a read() method.
      **kwargs: Passed through to gumboc.parse.

    Raises:
      AssertionError: If a child of the document node is neither a comment
          nor an element/template node.
    """
    text = _read_text(text_or_file)

    with gumboc.parse(text, **kwargs) as output:
      document = output.contents.document.contents
      _convert_doctype(self.tree, document)
      for node in document.children:
        if node.type == gumboc.NodeType.COMMENT:
          self.tree.insertComment({'data': node.v.text.text.decode('utf-8')},
                                  self.tree.document)
        elif node.type in (gumboc.NodeType.ELEMENT, gumboc.NodeType.TEMPLATE):
          _insert_root(self.tree, output.contents.root.contents)
        else:
          # Raise explicitly: `assert '<message>'` asserts a non-empty string
          # and therefore never fires.  Also keeps the check alive under -O.
          raise AssertionError(
              'Only comments and <html> nodes allowed at the root')
      return self.tree.getDocument()

  def parseFragment(self, text_or_file, container, **kwargs):
    """Parses a fragment and returns self.tree.getFragment().

    Args:
      text_or_file: The text to parse, or an object with a read() method.
      container: Context tag name, optionally prefixed with a namespace name
          and a single space (e.g. 'svg path'); the namespace defaults to
          "html".
      **kwargs: Passed through to gumboc.parse.

    Raises:
      AssertionError: If a child of the document node is not an
          element/template node.
    """
    text = _read_text(text_or_file)
    if ' ' in container:
      container_ns, container = container.split(' ')
    else:
      container_ns = "html"

    with gumboc.parse(
        text,
        fragment_context=gumboc.Tag.from_str(container),
        fragment_namespace=getattr(gumboc.Namespace, container_ns.upper()),
        **kwargs) as output:
      for node in output.contents.document.contents.children:
        if node.type in (gumboc.NodeType.ELEMENT, gumboc.NodeType.TEMPLATE):
          # The root stays on openElements (pop_element=False) for fragments.
          _insert_root(self.tree, output.contents.root.contents, False)
        else:
          # Raise explicitly: `assert '<message>'` asserts a non-empty string
          # and therefore never fires.
          raise AssertionError('Malformed fragment parse (??)')
      return self.tree.getFragment()
