Mercurial > hgrepos > Python2 > PyMuPDF
diff mupdf-source/thirdparty/gumbo-parser/python/gumbo/html5lib_adapter_test.py @ 2:b50eed0cc0ef upstream
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
The directory name has changed: no version number in the expanded directory now.
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Mon, 15 Sep 2025 11:43:07 +0200 |
| parents | |
| children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/mupdf-source/thirdparty/gumbo-parser/python/gumbo/html5lib_adapter_test.py Mon Sep 15 11:43:07 2025 +0200 @@ -0,0 +1,183 @@ +# Copyright 2012 Google Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +"""Tests for the Gumbo => Html5lib adapter.""" +import codecs +import collections +import glob +import os +import re +import StringIO +import unittest +import warnings + +from html5lib import treebuilders + +import html5lib_adapter + + +TREEBUILDER = treebuilders.getTreeBuilder('dom') +TESTDATA_BASE_PATH = os.path.join( + os.path.split(__file__)[0], '..', '..') + + +# Copied from html5lib.tests/test_parser.py +def convertTreeDump(data): + return "\n".join(convertExpected(data, 3).split("\n")[1:]) + + +# Copied/adapted/simplified from html5lib.tests/support.py +def html5lib_test_files(): + return glob.glob(os.path.join( + TESTDATA_BASE_PATH, 'testdata', 'tree-construction', '*.dat')) + + +class TestData(object): + def __init__(self, filename): + self.f = codecs.open(filename, encoding="utf8") + + def __iter__(self): + data = collections.defaultdict(lambda: None) + key=None + for line in self.f: + heading = self.isSectionHeading(line) + if heading: + if data and heading == 'data': + #Remove trailing newline + data[key] = data[key][:-1] + yield self.normaliseOutput(data) + data = collections.defaultdict(lambda: None) + key = heading + data[key] = '' + elif key is not None: + data[key] += line + if data: + yield self.normaliseOutput(data) + + def isSectionHeading(self, line): + """If the current heading is a test section heading return the heading, + otherwise return False""" + if line.startswith("#"): + return line[1:].strip() + else: + return False + + def normaliseOutput(self, data): + # Remove trailing newlines + for key, value in data.iteritems(): + if value.endswith("\n"): + data[key] = value[:-1] + return data + +def convertExpected(data, stripChars): + """convert the output of str(document) to the format used in the testcases""" + data = data.split("\n") + rv = [] + for line in data: + if line.startswith("|"): + rv.append(line[stripChars:]) + else: + rv.append(line) + return "\n".join(rv) + +def reformatTemplateContents(expected): + lines = expected.split('\n') + retval = [] + template_indents = [] + for line in lines: + line_stripped = line.strip() + indent = len(line) - len(line_stripped) + if line_stripped == 'content': + template_indents.append(indent) + continue + while template_indents and indent <= template_indents[-1]: + template_indents.pop() + if template_indents: + line = line[2 * len(template_indents):] + retval.append(line) + return '\n'.join(retval) + +class Html5libAdapterTest(unittest.TestCase): + """Adapter between Gumbo and the html5lib tests. + + This works through a bit of magic. It's an empty class at first, but then + buildTestCases runs through the test files in html5lib, and adds a + method to this class for each one. That method acts like + test_parser.TestCase.runParserTest, running a parse, serializing the tree, and + comparing it to the expected output. + + The vague name is so nosetests doesn't try to run it as a test. + """ + def impl(self, inner_html, input, expected, errors): + p = html5lib_adapter.HTMLParser( + tree=TREEBUILDER(namespaceHTMLElements=True)) + + if inner_html: + document = p.parseFragment( + StringIO.StringIO(input), inner_html.replace('math ', 'mathml ')) + else: + document = p.parse(StringIO.StringIO(input)) + + with warnings.catch_warnings(): + # Etree serializer in html5lib uses a deprecated getchildren() API. + warnings.filterwarnings('ignore', category=DeprecationWarning) + output = convertTreeDump(p.tree.testSerializer(document)) + + expected = re.compile(r'^(\s*)<(\S+)>', re.M).sub( + r'\1<html \2>', convertExpected(expected, 2)) + # html5lib doesn't yet support the template tag, but it appears in the + # tests with the expectation that the template contents will be under the + # word 'contents', so we need to reformat that string a bit. + expected = reformatTemplateContents(expected) + + error_msg = '\n'.join(['\n\nInput:', input, '\nExpected:', expected, + '\nReceived:', output]) + self.assertEquals(expected, output, + error_msg.encode('ascii', 'xmlcharrefreplace') + '\n') + # TODO(jdtang): Check error messages, when there's full error support. + + +def BuildTestCases(cls): + for filename in html5lib_test_files(): + test_name = os.path.basename(filename).replace('.dat', '') + for i, test in enumerate(TestData(filename)): + # html5lib parses <noscript> tags as if the scripting-enabled flag is + # set, while we parse as if the scripting-disabled flag is set (since we + # don't really support scripting and the resulting parse tree is often + # more useful for toolsmiths). That means our output will differ by + # design from html5lib's, so we disable any of their tests that involve + # <noscript> + if '<noscript>' in test['data']: + continue + + # <command> has been renamed to <menuitem> in recent versions of the spec. + # html5lib 0.95 does not include this yet, and so we disable tests that + # include the old tag. + if '<command>' in test['data']: + continue + + def test_func( + self, + inner_html=test['document-fragment'], + input=test['data'], + expected=test['document'], + errors=test.get('errors', '').split('\n')): + return self.impl(inner_html, input, expected, errors) + test_func.__name__ = 'test_%s_%d' % (test_name, i + 1) + setattr(cls, test_func.__name__, test_func) + + +if __name__ == '__main__': + BuildTestCases(Html5libAdapterTest) + unittest.main()
