Mercurial > hgrepos > Python2 > PyMuPDF
comparison mupdf-source/thirdparty/gumbo-parser/python/gumbo/html5lib_adapter.py @ 2:b50eed0cc0ef upstream
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
The directory name has changed: no version number in the expanded directory now.
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Mon, 15 Sep 2025 11:43:07 +0200 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| 1:1d09e1dec1d9 | 2:b50eed0cc0ef |
|---|---|
| 1 # Copyright 2012 Google Inc. All Rights Reserved. | |
| 2 # | |
| 3 # Licensed under the Apache License, Version 2.0 (the "License"); | |
| 4 # you may not use this file except in compliance with the License. | |
| 5 # You may obtain a copy of the License at | |
| 6 # | |
| 7 # http://www.apache.org/licenses/LICENSE-2.0 | |
| 8 # | |
| 9 # Unless required by applicable law or agreed to in writing, software | |
| 10 # distributed under the License is distributed on an "AS IS" BASIS, | |
| 11 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
| 12 # See the License for the specific language governing permissions and | |
| 13 # limitations under the License. | |
| 14 # | |
| 15 """Adapter between Gumbo and html5lib. | |
| 16 | |
| 17 This exports one method, parse, with the same signature as html5lib.parse. It | |
| 18 takes the text to parse, and optionally an html5lib TreeBuilder to build the | |
| 19 tree, and gives back a DOM tree in that format. Example: | |
| 20 | |
| 21 doc = parse(text, treebuilder='lxml') | |
| 22 """ | |
| 23 | |
| 24 __author__ = 'jdtang@google.com (Jonathan Tang)' | |
| 25 | |
| 26 import gumboc | |
| 27 | |
# Namespace URLs listed in the order of the gumboc.Namespace enum values, so
# that an enum value can be used directly as an index into this list. The URLs
# should match html5lib.constants.namespaces.
_NAMESPACES = [
    'http://www.w3.org/1999/xhtml',
    'http://www.w3.org/2000/svg',
    'http://www.w3.org/1998/Math/MathML',
]
| 35 | |
| 36 | |
def _convert_doctype(treebuilder, source_node):
    """Pass the document's doctype, if it has one, to the tree builder.

    When source_node.has_doctype is false nothing is inserted, mimicking
    html5lib, which creates no doctype node when there is no doctype token.
    Otherwise treebuilder.insertDoctype is called with a dict holding the
    'name', 'publicId' and 'systemId' values decoded from UTF-8.
    """
    if source_node.has_doctype:
        raw_fields = (
            ('name', source_node.name),
            ('publicId', source_node.public_identifier),
            ('systemId', source_node.system_identifier),
        )
        doctype_token = {}
        for key, raw_value in raw_fields:
            doctype_token[key] = raw_value.decode('utf-8')
        treebuilder.insertDoctype(doctype_token)
| 46 | |
| 47 | |
def _convert_attributes(source_node):
    """Build the attribute mapping for a Gumbo element node.

    Returns a dict whose values are the attribute values decoded from UTF-8.
    The key is the decoded attribute name for attributes without a namespace.
    For namespaced attributes it is a (prefix, name, namespace_url) tuple,
    where prefix is the lower-cased repr of the attribute namespace, or None
    for the bare 'xmlns' attribute.
    """
    def maybe_namespace(attr):
        # Decode before comparing: the name is a byte string, and on Python 3
        # a bytes object never compares equal to the str literal 'xmlns', so
        # the xmlns special case would otherwise never apply.
        name = attr.name.decode('utf-8')
        if attr.namespace != gumboc.AttributeNamespace.NONE:
            prefix = repr(attr.namespace).lower() if name != 'xmlns' else None
            return (prefix, name, attr.namespace.to_url())
        return name

    return {
        maybe_namespace(attr): attr.value.decode('utf-8')
        for attr in source_node.attributes
    }
| 58 | |
| 59 | |
def _convert_element(source_node):
    """Describe a Gumbo element node as a token dict for the tree builder.

    Returns a dict with the element's 'name' (tag name decoded from UTF-8),
    its 'namespace' (URL looked up in _NAMESPACES by the tag namespace's enum
    value) and its 'data' (the mapping built by _convert_attributes).

    Raises AssertionError if the node is neither an ELEMENT nor a TEMPLATE.
    """
    element_types = (gumboc.NodeType.ELEMENT, gumboc.NodeType.TEMPLATE)
    if source_node.type not in element_types:
        # Raised explicitly rather than through an assert statement so that
        # the check still runs with -O.
        raise AssertionError(
            '_convert_element only works with elements; found %r' %
            source_node.type)
    element = source_node.v.element
    tag_name = element.tag_name.decode('utf-8')
    namespace_url = _NAMESPACES[element.tag_namespace.value]
    return {
        'name': tag_name,
        'namespace': namespace_url,
        'data': _convert_attributes(source_node),
    }
| 71 | |
| 72 | |
def _insert_root(treebuilder, source_node, pop_element=True):
    """Insert the root element and all of its descendants into the builder.

    The root is inserted with treebuilder.insertRoot, then each child of
    source_node is inserted through _insert_node. Unless pop_element is
    false, the last entry is finally popped off treebuilder.openElements.
    """
    root_token = _convert_element(source_node)
    treebuilder.insertRoot(root_token)
    for child in source_node.children:
        _insert_node(treebuilder, child)
    if not pop_element:
        return
    treebuilder.openElements.pop()
| 79 | |
def _insert_node(treebuilder, source_node):
    """Recursively insert a non-document Gumbo node into the tree builder.

    Comment nodes go through insertComment; TEXT, WHITESPACE and CDATA nodes
    go through insertText. Any other node is converted with _convert_element
    and inserted with insertElementNormal, its children are inserted
    recursively, and the last entry is then popped off
    treebuilder.openElements.
    """
    node_type = source_node.type
    assert node_type != gumboc.NodeType.DOCUMENT

    if node_type == gumboc.NodeType.COMMENT:
        comment_text = source_node.v.text.text.decode('utf-8')
        treebuilder.insertComment({'data': comment_text})
        return

    text_types = (
        gumboc.NodeType.TEXT,
        gumboc.NodeType.WHITESPACE,
        gumboc.NodeType.CDATA,
    )
    if node_type in text_types:
        treebuilder.insertText(source_node.v.text.text.decode('utf-8'))
        return

    treebuilder.insertElementNormal(_convert_element(source_node))
    for child in source_node.v.element.children:
        _insert_node(treebuilder, child)
    treebuilder.openElements.pop()
| 94 | |
| 95 | |
class HTMLParser(object):
    """Parser front end that parses with Gumbo and fills a tree builder.

    The text is parsed by gumboc.parse and the resulting nodes are replayed
    into the tree builder given to the constructor.
    """

    def __init__(self, tree):
        """Store the tree builder that parsed nodes are inserted into."""
        self.tree = tree

    @staticmethod
    def _read_text(text_or_file):
        """Return text_or_file.read(), or text_or_file if that is not possible.

        An AttributeError from the read call is taken to mean the argument
        is already a string.
        """
        try:
            return text_or_file.read()
        except AttributeError:
            # Assume a string.
            return text_or_file

    def parse(self, text_or_file, **kwargs):
        """Parse a whole document and return self.tree.getDocument().

        text_or_file is either an object with a read() method or the text
        itself; extra keyword arguments are passed on to gumboc.parse.

        Raises AssertionError if the parsed document has a top-level node
        that is neither a comment nor an element.
        """
        text = self._read_text(text_or_file)

        with gumboc.parse(text, **kwargs) as output:
            document = output.contents.document.contents
            _convert_doctype(self.tree, document)
            for node in document.children:
                if node.type == gumboc.NodeType.COMMENT:
                    self.tree.insertComment(
                        {'data': node.v.text.text.decode('utf-8')},
                        self.tree.document)
                elif node.type in (gumboc.NodeType.ELEMENT,
                                   gumboc.NodeType.TEMPLATE):
                    _insert_root(self.tree, output.contents.root.contents)
                else:
                    # Raised explicitly: asserting a non-empty string literal
                    # is always true and would never report the problem.
                    raise AssertionError(
                        'Only comments and <html> nodes allowed at the root')
            return self.tree.getDocument()

    def parseFragment(self, text_or_file, container, **kwargs):
        """Parse a fragment and return self.tree.getFragment().

        container names the context element, optionally prefixed with a
        namespace and a space (e.g. "svg title"); without a prefix the
        namespace is "html". Extra keyword arguments are passed on to
        gumboc.parse.

        Raises AssertionError if the parsed output has a top-level node that
        is not an element.
        """
        text = self._read_text(text_or_file)
        if ' ' in container:
            container_ns, container = container.split(' ')
        else:
            container_ns = "html"

        with gumboc.parse(
                text,
                fragment_context=gumboc.Tag.from_str(container),
                fragment_namespace=getattr(gumboc.Namespace,
                                           container_ns.upper()),
                **kwargs) as output:
            for node in output.contents.document.contents.children:
                if node.type in (gumboc.NodeType.ELEMENT,
                                 gumboc.NodeType.TEMPLATE):
                    # The root element is deliberately left on openElements.
                    _insert_root(
                        self.tree, output.contents.root.contents, False)
                else:
                    # Raised explicitly: asserting a non-empty string literal
                    # is always true and would never report the problem.
                    raise AssertionError('Malformed fragment parse (??)')
            return self.tree.getFragment()
