comparison mupdf-source/thirdparty/gumbo-parser/python/gumbo/html5lib_adapter_test.py @ 2:b50eed0cc0ef upstream

ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4. The directory name has changed: no version number in the expanded directory now.
author Franz Glasner <fzglas.hg@dom66.de>
date Mon, 15 Sep 2025 11:43:07 +0200
parents
children
comparison
equal deleted inserted replaced
1:1d09e1dec1d9 2:b50eed0cc0ef
1 # Copyright 2012 Google Inc. All Rights Reserved.
2 #
3 # Licensed under the Apache License, Version 2.0 (the "License");
4 # you may not use this file except in compliance with the License.
5 # You may obtain a copy of the License at
6 #
7 # http://www.apache.org/licenses/LICENSE-2.0
8 #
9 # Unless required by applicable law or agreed to in writing, software
10 # distributed under the License is distributed on an "AS IS" BASIS,
11 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 # See the License for the specific language governing permissions and
13 # limitations under the License.
14 #
15 """Tests for the Gumbo => Html5lib adapter."""
16 import codecs
17 import collections
18 import glob
19 import os
20 import re
21 import StringIO
22 import unittest
23 import warnings
24
25 from html5lib import treebuilders
26
27 import html5lib_adapter
28
29
# Tree builder class passed to every parser the tests create.
TREEBUILDER = treebuilders.getTreeBuilder('dom')
# Directory two levels above this file; the 'testdata' tree is located
# relative to it.
TESTDATA_BASE_PATH = os.path.join(os.path.dirname(__file__), '..', '..')
33
34
# Copied from html5lib.tests/test_parser.py
def convertTreeDump(data):
    """Return *data* in test-case format with its first line dropped.

    Lines starting with "|" lose their first three characters (see
    convertExpected); the first line of the result is then discarded.
    """
    converted_lines = convertExpected(data, 3).split("\n")
    return "\n".join(converted_lines[1:])
38
39
# Copied/adapted/simplified from html5lib.tests/support.py
def html5lib_test_files():
    """Return the paths of all ``*.dat`` files in the tree-construction test data."""
    pattern = os.path.join(
        TESTDATA_BASE_PATH, 'testdata', 'tree-construction', '*.dat')
    return glob.glob(pattern)
44
45
class TestData(object):
    """Iterate over the test cases stored in one ``.dat`` test file.

    The file is a sequence of sections, each introduced by a line starting
    with "#" (for example "#data" or "#document").  A "#data" heading starts
    a new test case.  Each test case is yielded as a mapping from section
    name to section text; sections that are absent read as None.
    """

    def __init__(self, filename):
        """Open *filename* for reading as UTF-8 text."""
        self.f = codecs.open(filename, encoding="utf8")

    def __iter__(self):
        """Yield one section-name -> text mapping per test case.

        The underlying file is closed once iteration ends (normally, by an
        error, or when the generator is discarded), so the handle does not
        leak.  Iterating again afterwards yields nothing.
        """
        if self.f.closed:
            # A previous iteration already consumed and closed the file.
            return
        try:
            data = collections.defaultdict(lambda: None)
            key = None
            for line in self.f:
                heading = self.isSectionHeading(line)
                if heading:
                    if data and heading == 'data':
                        # A new "#data" heading ends the previous test case.
                        # Remove the trailing newline of its last section
                        # (normaliseOutput strips one more below).
                        data[key] = data[key][:-1]
                        yield self.normaliseOutput(data)
                        data = collections.defaultdict(lambda: None)
                    key = heading
                    data[key] = ''
                elif key is not None:
                    # Lines before the first heading are ignored.
                    data[key] += line
            if data:
                yield self.normaliseOutput(data)
        finally:
            self.f.close()

    def isSectionHeading(self, line):
        """Return the heading name if *line* is a section heading, else False.

        A heading is any line starting with "#"; the name is the rest of the
        line with surrounding whitespace stripped.  A bare "#" therefore
        yields an empty (falsy) name and is not treated as a heading by
        __iter__.
        """
        if line.startswith("#"):
            return line[1:].strip()
        else:
            return False

    def normaliseOutput(self, data):
        """Strip one trailing newline from every value of *data*, in place.

        Returns the same mapping object.
        """
        # items() exists on both Python 2 and 3 (iteritems() is Python 2
        # only); the list() snapshot keeps the loop independent of the
        # assignments made inside it.
        for key, value in list(data.items()):
            if value.endswith("\n"):
                data[key] = value[:-1]
        return data
82
def convertExpected(data, stripChars):
    """Convert a serialized tree dump to the format used in the test cases.

    Every line of *data* that starts with "|" has its first *stripChars*
    characters removed; all other lines are passed through unchanged.
    """
    return "\n".join(
        row[stripChars:] if row.startswith("|") else row
        for row in data.split("\n"))
93
def reformatTemplateContents(expected):
    """Remove the 'content' pseudo-nodes from an expected tree dump.

    Each line consisting solely of the word 'content' is dropped, and every
    following line that is indented more deeply than that 'content' line is
    shifted left by two characters per enclosing 'content' level.

    Args:
        expected: The expected tree dump, one node per line.

    Returns:
        The reformatted dump as a single newline-joined string.
    """
    retval = []
    # Indentation of each 'content' line we are currently nested inside.
    template_indents = []
    for line in expected.split('\n'):
        # Measure leading whitespace only: counting trailing whitespace too
        # (as len(line) - len(line.strip()) does) would make a line that ends
        # in spaces look more deeply indented than it is.
        indent = len(line) - len(line.lstrip())
        if line.strip() == 'content':
            template_indents.append(indent)
            continue
        # Leave every template whose contents this line is no longer part of.
        while template_indents and indent <= template_indents[-1]:
            template_indents.pop()
        if template_indents:
            line = line[2 * len(template_indents):]
        retval.append(line)
    return '\n'.join(retval)
110
class Html5libAdapterTest(unittest.TestCase):
    """Adapter between Gumbo and the html5lib tests.

    This works through a bit of magic.  The class has no test methods at
    first, but then BuildTestCases runs through the test files in html5lib
    and adds a method to this class for each test case.  Each such method
    calls impl(), which acts like test_parser.TestCase.runParserTest: it runs
    a parse, serializes the tree, and compares it to the expected output.

    The vague name 'impl' is so nosetests doesn't try to run it as a test.
    """

    def impl(self, inner_html, input, expected, errors):
        """Parse *input* and assert that the serialized tree matches *expected*.

        Args:
            inner_html: Fragment context taken from the test's
                'document-fragment' section.  When falsy, *input* is parsed
                as a complete document instead of a fragment.
            input: The markup to parse.
            expected: The expected tree dump from the test's 'document'
                section.
            errors: Expected parse errors; currently unused (see the TODO at
                the end of this method).
        """
        p = html5lib_adapter.HTMLParser(
            tree=TREEBUILDER(namespaceHTMLElements=True))

        if inner_html:
            document = p.parseFragment(
                StringIO.StringIO(input),
                inner_html.replace('math ', 'mathml '))
        else:
            document = p.parse(StringIO.StringIO(input))

        with warnings.catch_warnings():
            # Etree serializer in html5lib uses a deprecated getchildren() API.
            warnings.filterwarnings('ignore', category=DeprecationWarning)
            output = convertTreeDump(p.tree.testSerializer(document))

        # Prefix every element line of the expected dump with 'html ', e.g.
        # '<p>' becomes '<html p>'.
        expected = re.compile(r'^(\s*)<(\S+)>', re.M).sub(
            r'\1<html \2>', convertExpected(expected, 2))
        # html5lib doesn't yet support the template tag, but it appears in the
        # tests with the expectation that the template contents will be under
        # the word 'content', so we need to reformat that string a bit.
        expected = reformatTemplateContents(expected)

        error_msg = '\n'.join(['\n\nInput:', input, '\nExpected:', expected,
                               '\nReceived:', output])
        # assertEqual, not the deprecated alias assertEquals (which newer
        # unittest versions no longer provide).
        self.assertEqual(
            expected, output,
            error_msg.encode('ascii', 'xmlcharrefreplace') + '\n')
        # TODO(jdtang): Check error messages, when there's full error support.
149
150
def BuildTestCases(cls):
    """Add one ``test_<file>_<n>`` method to *cls* per usable test case.

    Test cases are read from every file returned by html5lib_test_files();
    *n* is the 1-based position of the case within its file.
    """
    # Test cases whose input contains any of these tags are skipped.
    #
    # <noscript>: html5lib parses <noscript> tags as if the scripting-enabled
    # flag is set, while we parse as if the scripting-disabled flag is set
    # (since we don't really support scripting and the resulting parse tree is
    # often more useful for toolsmiths).  That means our output will differ by
    # design from html5lib's, so we disable any of their tests that involve
    # <noscript>.
    #
    # <command>: it has been renamed to <menuitem> in recent versions of the
    # spec.  html5lib 0.95 does not include this yet, and so we disable tests
    # that include the old tag.
    skipped_tags = ('<noscript>', '<command>')

    for dat_path in html5lib_test_files():
        base_name = os.path.basename(dat_path).replace('.dat', '')
        for index, case in enumerate(TestData(dat_path)):
            markup = case['data']
            if any(tag in markup for tag in skipped_tags):
                continue

            # The test case's values are bound as default arguments so each
            # generated method keeps its own data rather than the loop's
            # final values.
            def test_func(
                    self,
                    inner_html=case['document-fragment'],
                    input=markup,
                    expected=case['document'],
                    errors=case.get('errors', '').split('\n')):
                return self.impl(inner_html, input, expected, errors)

            method_name = 'test_%s_%d' % (base_name, index + 1)
            test_func.__name__ = method_name
            setattr(cls, method_name, test_func)
179
180
if __name__ == '__main__':
    # The test methods are generated at run time, so they have to be attached
    # to the class before unittest looks for them.
    BuildTestCases(Html5libAdapterTest)
    unittest.main()