comparison mupdf-source/thirdparty/gumbo-parser/python/gumbo/html5lib_adapter.py @ 2:b50eed0cc0ef upstream

ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4. The directory name has changed: no version number in the expanded directory now.
author Franz Glasner <fzglas.hg@dom66.de>
date Mon, 15 Sep 2025 11:43:07 +0200
parents
children
comparison
equal deleted inserted replaced
1:1d09e1dec1d9 2:b50eed0cc0ef
1 # Copyright 2012 Google Inc. All Rights Reserved.
2 #
3 # Licensed under the Apache License, Version 2.0 (the "License");
4 # you may not use this file except in compliance with the License.
5 # You may obtain a copy of the License at
6 #
7 # http://www.apache.org/licenses/LICENSE-2.0
8 #
9 # Unless required by applicable law or agreed to in writing, software
10 # distributed under the License is distributed on an "AS IS" BASIS,
11 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 # See the License for the specific language governing permissions and
13 # limitations under the License.
14 #
15 """Adapter between Gumbo and html5lib.
16
17 This exports one class, HTMLParser, modelled on html5lib's HTMLParser. It is
18 constructed with an html5lib TreeBuilder instance; parse() and parseFragment()
19 take the text to parse and give back a tree in that builder's format. Example:
20
21 doc = HTMLParser(tree).parse(text)
22 """
23
24 __author__ = 'jdtang@google.com (Jonathan Tang)'
25
26 import gumboc
27
# Namespace URLs for element nodes. These should match
# html5lib.constants.namespaces. The list is indexed by the enum values of
# gumboc.Namespace (see the `tag_namespace.value` lookup in _convert_element),
# so the order of the entries matters.
# NOTE(review): presumably the enum order is HTML, SVG, MathML -- confirm
# against gumboc.Namespace before reordering or extending this list.
_NAMESPACES = [
    'http://www.w3.org/1999/xhtml',
    'http://www.w3.org/2000/svg',
    'http://www.w3.org/1998/Math/MathML',
]
35
36
def _convert_doctype(treebuilder, source_node):
    """Pass the document's doctype, if it has one, on to the tree builder.

    Args:
      treebuilder: Tree builder whose ``insertDoctype`` receives the token.
      source_node: Parsed document node exposing ``has_doctype``, ``name``,
        ``public_identifier`` and ``system_identifier``; the three string
        fields are decoded from UTF-8 here.

    Returns:
      None. When the document has no doctype nothing is inserted, which
      mimics html5lib: no doctype token, no doctype node.
    """
    if source_node.has_doctype:
        doctype_name = source_node.name.decode('utf-8')
        public_id = source_node.public_identifier.decode('utf-8')
        system_id = source_node.system_identifier.decode('utf-8')
        treebuilder.insertDoctype(
            dict(name=doctype_name, publicId=public_id, systemId=system_id))
46
47
def _convert_attributes(source_node):
    """Build the attribute mapping handed to the tree builder for an element.

    Args:
      source_node: Node whose ``attributes`` are iterated. Each attribute
        supplies ``name`` and ``value`` (decoded from UTF-8 here) and a
        ``namespace`` enum value.

    Returns:
      A dict mapping attribute keys to decoded attribute values. The key is
      the decoded attribute name when the attribute has no namespace, and
      otherwise a ``(prefix, name, namespace_url)`` tuple. ``prefix`` is the
      lower-cased ``repr`` of the namespace enum, or ``None`` for the
      attribute literally named ``xmlns``.
    """
    def maybe_namespace(attr):
        if attr.namespace != gumboc.AttributeNamespace.NONE:
            # attr.name is a byte string (it is decoded just below), so it
            # must be compared with a bytes literal. Comparing it with the
            # text literal 'xmlns' is always unequal on Python 3, which meant
            # the None prefix was never produced there. On Python 2 the two
            # literals are the same type, so behaviour is unchanged.
            prefix = (repr(attr.namespace).lower()
                      if attr.name != b'xmlns' else None)
            return (prefix,
                    attr.name.decode('utf-8'),
                    attr.namespace.to_url())
        else:
            return attr.name.decode('utf-8')

    return dict((maybe_namespace(attr), attr.value.decode('utf-8'))
                for attr in source_node.attributes)
58
59
def _convert_element(source_node):
    """Translate an element or template node into a tree builder token.

    Args:
      source_node: Node to convert; its ``type`` must be
        ``gumboc.NodeType.ELEMENT`` or ``gumboc.NodeType.TEMPLATE``.

    Returns:
      A dict with the decoded tag ``name``, the ``namespace`` URL looked up
      in ``_NAMESPACES`` and the converted attributes under ``data``.

    Raises:
      AssertionError: If the node is of any other type.
    """
    element_types = (gumboc.NodeType.ELEMENT, gumboc.NodeType.TEMPLATE)
    if source_node.type in element_types:
        element = source_node.v.element
        return {
            'name': element.tag_name.decode('utf-8'),
            'namespace': _NAMESPACES[element.tag_namespace.value],
            'data': _convert_attributes(source_node),
        }
    # Raised explicitly rather than through `assert`, so the check still
    # runs when Python is started with -O.
    raise AssertionError(
        '_convert_element only works with elements; found %r' %
        source_node.type)
71
72
def _insert_root(treebuilder, source_node, pop_element=True):
    """Insert the root element and then, recursively, everything below it.

    Args:
      treebuilder: Tree builder receiving the nodes.
      source_node: Root element node; it is converted with
        ``_convert_element`` and its ``children`` are inserted in order.
      pop_element: When true (the default), the root is popped from
        ``treebuilder.openElements`` once its children are in place.
    """
    root_token = _convert_element(source_node)
    treebuilder.insertRoot(root_token)
    for child in source_node.children:
        _insert_node(treebuilder, child)
    if not pop_element:
        return
    treebuilder.openElements.pop()
79
def _insert_node(treebuilder, source_node):
    """Insert one non-document node, and its subtree, into the tree builder.

    Comments go to ``insertComment``; text, whitespace and CDATA nodes go to
    ``insertText``. Every other node is treated as an element: it is
    inserted with ``insertElementNormal``, its children are inserted
    recursively, and it is then popped from ``treebuilder.openElements``.

    Args:
      treebuilder: Tree builder receiving the nodes.
      source_node: Node to insert; must not be a document node.
    """
    node_type = source_node.type
    assert node_type != gumboc.NodeType.DOCUMENT

    if node_type == gumboc.NodeType.COMMENT:
        comment = source_node.v.text.text.decode('utf-8')
        treebuilder.insertComment({'data': comment})
        return

    text_like = (
        gumboc.NodeType.TEXT,
        gumboc.NodeType.WHITESPACE,
        gumboc.NodeType.CDATA,
    )
    if node_type in text_like:
        treebuilder.insertText(source_node.v.text.text.decode('utf-8'))
        return

    # Anything left is an element; _convert_element rejects other types.
    treebuilder.insertElementNormal(_convert_element(source_node))
    for child in source_node.v.element.children:
        _insert_node(treebuilder, child)
    treebuilder.openElements.pop()
94
95
class HTMLParser(object):
    """Parser front end that parses with Gumbo and fills a tree builder.

    Args:
      tree: Tree builder that receives the converted nodes. It is stored as
        ``self.tree``; ``parse`` returns its ``getDocument()`` and
        ``parseFragment`` its ``getFragment()``.
    """

    def __init__(self, tree):
        self.tree = tree

    @staticmethod
    def _read_text(text_or_file):
        """Return the markup from a file-like object, or the string itself."""
        try:
            read = text_or_file.read
        except AttributeError:
            # No read() method: assume a string.
            return text_or_file
        # Called outside the try block so that an AttributeError raised
        # inside read() itself is not mistaken for "this is a string".
        return read()

    def parse(self, text_or_file, **kwargs):
        """Parse a complete document and return the tree builder's document.

        Args:
          text_or_file: Markup, or an object with a ``read()`` method that
            returns it.
          **kwargs: Passed through unchanged to ``gumboc.parse``.

        Returns:
          The result of ``self.tree.getDocument()``.

        Raises:
          AssertionError: If the document node has a child that is neither a
            comment nor an element/template node.
        """
        text = self._read_text(text_or_file)

        with gumboc.parse(text, **kwargs) as output:
            document = output.contents.document.contents
            _convert_doctype(self.tree, document)
            for node in document.children:
                if node.type == gumboc.NodeType.COMMENT:
                    self.tree.insertComment(
                        {'data': node.v.text.text.decode('utf-8')},
                        self.tree.document)
                elif node.type in (gumboc.NodeType.ELEMENT,
                                   gumboc.NodeType.TEMPLATE):
                    _insert_root(self.tree, output.contents.root.contents)
                else:
                    # Asserting a bare message string can never fail, so the
                    # check is raised explicitly; this also keeps it active
                    # when Python is started with -O.
                    raise AssertionError(
                        'Only comments and <html> nodes allowed at the root')
            return self.tree.getDocument()

    def parseFragment(self, text_or_file, container, **kwargs):
        """Parse a fragment in the context of ``container``.

        Args:
          text_or_file: Markup, or an object with a ``read()`` method that
            returns it.
          container: Context element name, optionally written as
            ``"<namespace> <tag>"``; without a namespace part, ``"html"`` is
            used.
          **kwargs: Passed through unchanged to ``gumboc.parse``.

        Returns:
          The result of ``self.tree.getFragment()``.
        """
        text = self._read_text(text_or_file)
        if ' ' in container:
            container_ns, container = container.split(' ')
        else:
            container_ns = "html"

        with gumboc.parse(
                text,
                fragment_context=gumboc.Tag.from_str(container),
                fragment_namespace=getattr(gumboc.Namespace,
                                           container_ns.upper()),
                **kwargs) as output:
            for node in output.contents.document.contents.children:
                if node.type in (gumboc.NodeType.ELEMENT,
                                 gumboc.NodeType.TEMPLATE):
                    # The root is left on openElements (pop_element=False).
                    _insert_root(
                        self.tree, output.contents.root.contents, False)
                # NOTE(review): any other child is skipped. A fragment parse
                # is not expected to produce one, but that expectation is
                # unverified, so it is not enforced with an exception here.
            return self.tree.getFragment()
131 text,
132 fragment_context=gumboc.Tag.from_str(container),
133 fragment_namespace=getattr(gumboc.Namespace, container_ns.upper()),
134 **kwargs) as output:
135 for node in output.contents.document.contents.children:
136 if node.type in (gumboc.NodeType.ELEMENT, gumboc.NodeType.TEMPLATE):
137 _insert_root(self.tree, output.contents.root.contents, False)
138 else:
139 assert 'Malformed fragment parse (??)'
140 return self.tree.getFragment()