Mercurial > hgrepos > Python2 > PyMuPDF
comparison mupdf-source/thirdparty/gumbo-parser/python/gumbo/html5lib_adapter_test.py @ 2:b50eed0cc0ef upstream
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
The directory name has changed: no version number in the expanded directory now.
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Mon, 15 Sep 2025 11:43:07 +0200 |
| parents | |
| children | |
comparison
equal
deleted
inserted
replaced
| 1:1d09e1dec1d9 | 2:b50eed0cc0ef |
|---|---|
| 1 # Copyright 2012 Google Inc. All Rights Reserved. | |
| 2 # | |
| 3 # Licensed under the Apache License, Version 2.0 (the "License"); | |
| 4 # you may not use this file except in compliance with the License. | |
| 5 # You may obtain a copy of the License at | |
| 6 # | |
| 7 # http://www.apache.org/licenses/LICENSE-2.0 | |
| 8 # | |
| 9 # Unless required by applicable law or agreed to in writing, software | |
| 10 # distributed under the License is distributed on an "AS IS" BASIS, | |
| 11 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
| 12 # See the License for the specific language governing permissions and | |
| 13 # limitations under the License. | |
| 14 # | |
| 15 """Tests for the Gumbo => Html5lib adapter.""" | |
| 16 import codecs | |
| 17 import collections | |
| 18 import glob | |
| 19 import os | |
| 20 import re | |
| 21 import StringIO | |
| 22 import unittest | |
| 23 import warnings | |
| 24 | |
| 25 from html5lib import treebuilders | |
| 26 | |
| 27 import html5lib_adapter | |
| 28 | |
| 29 | |
# html5lib tree builder selected by the name 'dom'; instantiated once per
# parser in Html5libAdapterTest.impl.
TREEBUILDER = treebuilders.getTreeBuilder('dom')

# Two directory levels above the directory containing this file;
# html5lib_test_files() looks for 'testdata/tree-construction' beneath it.
TESTDATA_BASE_PATH = os.path.join(os.path.dirname(__file__), '..', '..')
| 33 | |
| 34 | |
# Copied from html5lib.tests/test_parser.py
def convertTreeDump(data):
    """Return convertExpected(data, 3) with its first line removed.

    If the converted text contains no newline the result is the empty string.
    """
    converted = convertExpected(data, 3)
    _first_line, _newline, remainder = converted.partition("\n")
    return remainder
| 38 | |
| 39 | |
# Copied/adapted/simplified from html5lib.tests/support.py
def html5lib_test_files():
    """Return the paths matching testdata/tree-construction/*.dat.

    The pattern is resolved relative to TESTDATA_BASE_PATH.
    """
    pattern = os.path.join(
        TESTDATA_BASE_PATH, 'testdata', 'tree-construction', '*.dat')
    return glob.glob(pattern)
| 44 | |
| 45 | |
class TestData(object):
    """Iterate over the test cases stored in one html5lib '.dat' file.

    A line starting with '#' opens a section named by the rest of the line;
    the following lines are the section's content.  A '#data' heading starts
    a new test case.  Each test case is yielded as a defaultdict mapping
    section names to text, with missing sections reading as None.
    """

    def __init__(self, filename):
        """Open *filename* as UTF-8; the handle is closed by __iter__."""
        self.f = codecs.open(filename, encoding="utf8")

    def __iter__(self):
        """Yield one section dict per test case, then close the file.

        Iterating again after the file has been consumed yields nothing.
        """
        if self.f.closed:
            # Already consumed: keep the "nothing more to yield" behavior
            # instead of raising on a closed file.
            return
        data = collections.defaultdict(lambda: None)
        key = None
        try:
            for line in self.f:
                heading = self.isSectionHeading(line)
                if heading:
                    if data and heading == 'data':
                        # A new '#data' heading ends the previous test case.
                        # Remove the trailing newline (the blank separator
                        # line) from its last section before yielding it.
                        data[key] = data[key][:-1]
                        yield self.normaliseOutput(data)
                        data = collections.defaultdict(lambda: None)
                    key = heading
                    data[key] = ''
                elif key is not None:
                    # Content before the first heading is ignored.
                    data[key] += line
        finally:
            # Release the handle even if the consumer stops iterating early.
            self.f.close()
        if data:
            yield self.normaliseOutput(data)

    def isSectionHeading(self, line):
        """Return the heading name if *line* is a section heading, else False.

        A heading is any line starting with '#'; the name is the rest of the
        line with surrounding whitespace stripped.  A bare '#' therefore
        returns the empty string, which callers treat as "not a heading".
        """
        if line.startswith("#"):
            return line[1:].strip()
        return False

    def normaliseOutput(self, data):
        """Strip one trailing newline from every value of *data* in place.

        Returns the same mapping object that was passed in.
        """
        # Iterate over a snapshot so that assigning into the dict is safe,
        # and so the code does not rely on the Python-2-only iteritems().
        for key, value in list(data.items()):
            if value.endswith("\n"):
                data[key] = value[:-1]
        return data
| 82 | |
def convertExpected(data, stripChars):
    """Convert the output of str(document) to the format used in the testcases.

    Args:
        data: Newline-separated tree dump.
        stripChars: Number of leading characters to drop from every line
            that starts with "|".

    Returns:
        The dump with the prefix removed from "|" lines; all other lines are
        passed through unchanged.
    """
    return "\n".join(
        line[stripChars:] if line.startswith("|") else line
        for line in data.split("\n"))
| 93 | |
def reformatTemplateContents(expected):
    """Remove 'content' marker lines and dedent what is nested under them.

    Every line whose stripped text is exactly 'content' is dropped.  Lines
    indented deeper than such a marker are shifted left by two characters per
    enclosing marker; a line indented at or above a marker's level ends that
    marker's scope.

    Args:
        expected: Newline-separated expected tree dump.

    Returns:
        The reformatted dump as a single newline-joined string.
    """
    retval = []
    template_indents = []
    for line in expected.split('\n'):
        # Only leading whitespace is indentation.  Measuring with strip()
        # would count trailing whitespace too and could keep a line wrongly
        # inside a template.
        indent = len(line) - len(line.lstrip())
        if line.strip() == 'content':
            template_indents.append(indent)
            continue
        # Leave every template scope this line is no longer nested in.
        while template_indents and indent <= template_indents[-1]:
            template_indents.pop()
        if template_indents:
            line = line[2 * len(template_indents):]
        retval.append(line)
    return '\n'.join(retval)
| 110 | |
class Html5libAdapterTest(unittest.TestCase):
    """Adapter between Gumbo and the html5lib tests.

    This works through a bit of magic.  It's an empty class at first, but then
    BuildTestCases runs through the test files in html5lib, and adds a
    method to this class for each one.  That method acts like
    test_parser.TestCase.runParserTest, running a parse, serializing the
    tree, and comparing it to the expected output.

    The vague name of `impl` is so nosetests doesn't try to run it as a test.
    """

    def impl(self, inner_html, input, expected, errors):
        """Parse *input*, serialize the tree and compare it with *expected*.

        Args:
            inner_html: Fragment container for a fragment parse; a false
                value parses *input* as a whole document.
            input: The markup to parse.
            expected: Expected tree dump in the test-file format.
            errors: Expected parse errors; currently unused (see TODO).
        """
        p = html5lib_adapter.HTMLParser(
            tree=TREEBUILDER(namespaceHTMLElements=True))

        if inner_html:
            document = p.parseFragment(
                StringIO.StringIO(input),
                inner_html.replace('math ', 'mathml '))
        else:
            document = p.parse(StringIO.StringIO(input))

        with warnings.catch_warnings():
            # Etree serializer in html5lib uses a deprecated getchildren() API.
            warnings.filterwarnings('ignore', category=DeprecationWarning)
            output = convertTreeDump(p.tree.testSerializer(document))

        # Rewrite each line-leading "<name>" as "<html name>".
        # NOTE(review): presumably this matches the namespaced serialization
        # requested via namespaceHTMLElements=True -- confirm.
        expected = re.compile(r'^(\s*)<(\S+)>', re.M).sub(
            r'\1<html \2>', convertExpected(expected, 2))
        # html5lib doesn't yet support the template tag, but it appears in the
        # tests with the expectation that the template contents will be under
        # the word 'content', so we need to reformat that string a bit.
        expected = reformatTemplateContents(expected)

        error_msg = '\n'.join(['\n\nInput:', input, '\nExpected:', expected,
                               '\nReceived:', output])
        # assertEqual rather than the deprecated assertEquals alias, which
        # newer unittest versions no longer provide.
        self.assertEqual(
            expected, output,
            error_msg.encode('ascii', 'xmlcharrefreplace') + '\n')
        # TODO(jdtang): Check error messages, when there's full error support.
| 149 | |
| 150 | |
def BuildTestCases(cls):
    """Attach one generated test method to *cls* per html5lib test case.

    Each method is named 'test_<datafile>_<n>' (n counts from 1 within the
    data file) and forwards the test case's sections to cls.impl.
    """
    for filename in html5lib_test_files():
        test_name = os.path.basename(filename).replace('.dat', '')
        for number, test in enumerate(TestData(filename), 1):
            markup = test['data']

            # html5lib parses <noscript> tags as if the scripting-enabled flag
            # is set, while we parse as if the scripting-disabled flag is set
            # (since we don't really support scripting and the resulting parse
            # tree is often more useful for toolsmiths).  That means our output
            # will differ by design from html5lib's, so we disable any of
            # their tests that involve <noscript>.
            #
            # <command> has been renamed to <menuitem> in recent versions of
            # the spec.  html5lib 0.95 does not include this yet, and so we
            # disable tests that include the old tag.
            if any(tag in markup for tag in ('<noscript>', '<command>')):
                continue

            # Default arguments bind this iteration's values at definition
            # time, so every generated method keeps its own test case.
            def test_func(
                    self,
                    inner_html=test['document-fragment'],
                    input=markup,
                    expected=test['document'],
                    errors=test.get('errors', '').split('\n')):
                return self.impl(inner_html, input, expected, errors)

            method_name = 'test_%s_%d' % (test_name, number)
            test_func.__name__ = method_name
            setattr(cls, method_name, test_func)
| 179 | |
| 180 | |
if __name__ == '__main__':
    # The test methods do not exist until they are generated from the
    # html5lib data files, so build them before handing over to unittest.
    BuildTestCases(Html5libAdapterTest)
    unittest.main()
