Python2/PyMuPDF: mupdf-source/thirdparty/gumbo-parser/python/gumbo/html5lib_adapter

comparison mupdf-source/thirdparty/gumbo-parser/python/gumbo/html5lib_adapter_test.py @ 2:b50eed0cc0ef upstream

ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4. The directory name has changed: no version number in the expanded directory now.

author	Franz Glasner <fzglas.hg@dom66.de>
date	Mon, 15 Sep 2025 11:43:07 +0200
parents
children

comparison

equal deleted inserted replaced

-:1d09e1dec1d9
+:b50eed0cc0ef
+# Copyright 2012 Google Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+"""Tests for the Gumbo => Html5lib adapter."""
+import codecs
+import collections
+import glob
+import os
+import re
+import StringIO
+import unittest
+import warnings
+from html5lib import treebuilders
+import html5lib_adapter
+TREEBUILDER = treebuilders.getTreeBuilder('dom')
+TESTDATA_BASE_PATH = os.path.join(
+os.path.split(__file__)[0], '..', '..')
+# Copied from html5lib.tests/test_parser.py
+def convertTreeDump(data):
+return "\n".join(convertExpected(data, 3).split("\n")[1:])
+# Copied/adapted/simplified from html5lib.tests/support.py
+def html5lib_test_files():
+return glob.glob(os.path.join(
+TESTDATA_BASE_PATH, 'testdata', 'tree-construction', '*.dat'))
+class TestData(object):
+def __init__(self, filename):
+self.f = codecs.open(filename, encoding="utf8")
+def __iter__(self):
+data = collections.defaultdict(lambda: None)
+key=None
+for line in self.f:
+heading = self.isSectionHeading(line)
+if heading:
+if data and heading == 'data':
+#Remove trailing newline
+data[key] = data[key][:-1]
+yield self.normaliseOutput(data)
+data = collections.defaultdict(lambda: None)
+key = heading
+data[key] = ''
+elif key is not None:
+data[key] += line
+if data:
+yield self.normaliseOutput(data)
+def isSectionHeading(self, line):
+"""If the current heading is a test section heading return the heading,
+otherwise return False"""
+if line.startswith("#"):
+return line[1:].strip()
+else:
+return False
+def normaliseOutput(self, data):
+# Remove trailing newlines
+for key, value in data.iteritems():
+if value.endswith("\n"):
+data[key] = value[:-1]
+return data
+def convertExpected(data, stripChars):
+"""convert the output of str(document) to the format used in the testcases"""
+data = data.split("\n")
+rv = []
+for line in data:
+if line.startswith("|"):
+rv.append(line[stripChars:])
+else:
+rv.append(line)
+return "\n".join(rv)
+def reformatTemplateContents(expected):
+lines = expected.split('\n')
+retval = []
+template_indents = []
+for line in lines:
+line_stripped = line.strip()
+indent = len(line) - len(line_stripped)
+if line_stripped == 'content':
+template_indents.append(indent)
+continue
+while template_indents and indent <= template_indents[-1]:
+template_indents.pop()
+if template_indents:
+line = line[2 * len(template_indents):]
+retval.append(line)
+return '\n'.join(retval)
+class Html5libAdapterTest(unittest.TestCase):
+"""Adapter between Gumbo and the html5lib tests.
+This works through a bit of magic.  It's an empty class at first, but then
+buildTestCases runs through the test files in html5lib, and adds a
+method to this class for each one.  That method acts like
+test_parser.TestCase.runParserTest, running a parse, serializing the tree, and
+comparing it to the expected output.
+The vague name is so nosetests doesn't try to run it as a test.
+"""
+def impl(self, inner_html, input, expected, errors):
+p = html5lib_adapter.HTMLParser(
+tree=TREEBUILDER(namespaceHTMLElements=True))
+if inner_html:
+document = p.parseFragment(
+StringIO.StringIO(input), inner_html.replace('math ', 'mathml '))
+else:
+document = p.parse(StringIO.StringIO(input))
+with warnings.catch_warnings():
+# Etree serializer in html5lib uses a deprecated getchildren() API.
+warnings.filterwarnings('ignore', category=DeprecationWarning)
+output = convertTreeDump(p.tree.testSerializer(document))
+expected = re.compile(r'^(\s*)<(\S+)>', re.M).sub(
+r'\1<html \2>', convertExpected(expected, 2))
+# html5lib doesn't yet support the template tag, but it appears in the
+# tests with the expectation that the template contents will be under the
+# word 'contents', so we need to reformat that string a bit.
+expected = reformatTemplateContents(expected)
+error_msg = '\n'.join(['\n\nInput:', input, '\nExpected:', expected,
+'\nReceived:', output])
+self.assertEquals(expected, output,
+error_msg.encode('ascii', 'xmlcharrefreplace') + '\n')
+# TODO(jdtang): Check error messages, when there's full error support.
+def BuildTestCases(cls):
+for filename in html5lib_test_files():
+test_name = os.path.basename(filename).replace('.dat', '')
+for i, test in enumerate(TestData(filename)):
+# html5lib parses <noscript> tags as if the scripting-enabled flag is
+# set, while we parse as if the scripting-disabled flag is set (since we
+# don't really support scripting and the resulting parse tree is often
+# more useful for toolsmiths).  That means our output will differ by
+# design from html5lib's, so we disable any of their tests that involve
+# <noscript>
+if '<noscript>' in test['data']:
+continue
+# <command> has been renamed to <menuitem> in recent versions of the spec.
+# html5lib 0.95 does not include this yet, and so we disable tests that
+# include the old tag.
+if '<command>' in test['data']:
+continue
+def test_func(
+self,
+inner_html=test['document-fragment'],
+input=test['data'],
+expected=test['document'],
+errors=test.get('errors', '').split('\n')):
+return self.impl(inner_html, input, expected, errors)
+test_func.__name__ = 'test_%s_%d' % (test_name, i + 1)
+setattr(cls, test_func.__name__, test_func)
+if __name__ == '__main__':
+BuildTestCases(Html5libAdapterTest)
+unittest.main()

Mercurial > hgrepos > Python2 > PyMuPDF

comparison mupdf-source/thirdparty/gumbo-parser/python/gumbo/html5lib_adapter_test.py @ 2:b50eed0cc0ef upstream