Mercurial > hgrepos > Python2 > PyMuPDF
diff mupdf-source/thirdparty/gumbo-parser/python/gumbo/gumboc.py @ 2:b50eed0cc0ef upstream
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
The directory name has changed: no version number in the expanded directory now.
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Mon, 15 Sep 2025 11:43:07 +0200 |
| parents | |
| children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/mupdf-source/thirdparty/gumbo-parser/python/gumbo/gumboc.py Mon Sep 15 11:43:07 2025 +0200 @@ -0,0 +1,423 @@ +# Copyright 2012 Google Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +"""CTypes bindings for the Gumbo HTML5 parser. + +This exports the raw interface of the library as a set of very thin ctypes +wrappers. It's intended to be wrapped by other libraries to provide a more +Pythonic API. +""" + +__author__ = 'jdtang@google.com (Jonathan Tang)' + +import sys +import contextlib +import ctypes +import os.path +import gumboc_tags + +_name_of_lib = 'libgumbo.so' +if sys.platform.startswith('darwin'): + _name_of_lib = 'libgumbo.dylib' +elif sys.platform.startswith('win'): + _name_of_lib = "gumbo.dll" + +try: + # First look for a freshly-built .so in the .libs directory, for development. + _dll = ctypes.cdll.LoadLibrary(os.path.join( + os.path.dirname(__file__), '..', '..', '.libs', _name_of_lib)) +except OSError: + # PyPI or setuptools install, look in the current directory. + _dll = ctypes.cdll.LoadLibrary(os.path.join( + os.path.dirname(__file__), _name_of_lib)) +except OSError: + # System library, on unix or mac osx + _dll = ctypes.cdll.LoadLibrary(_name_of_lib) + +# Some aliases for common types. +_bitvector = ctypes.c_uint +_Ptr = ctypes.POINTER + +class EnumMetaclass(type(ctypes.c_uint)): + def __new__(metaclass, name, bases, cls_dict): + cls = type(ctypes.c_uint).__new__(metaclass, name, bases, cls_dict) + if name == 'Enum': + return cls + try: + for i, value in enumerate(cls_dict['_values_']): + setattr(cls, value, cls.from_param(i)) + except KeyError: + raise ValueError('No _values_ list found inside enum type.') + except TypeError: + raise ValueError('_values_ must be a list of names of enum constants.') + return cls + +def with_metaclass(mcls): + def decorator(cls): + body = vars(cls).copy() + # clean out class body + body.pop('__dict__', None) + body.pop('__weakref__', None) + return mcls(cls.__name__, cls.__bases__, body) + return decorator + +@with_metaclass(EnumMetaclass) +class Enum(ctypes.c_uint): + @classmethod + def from_param(cls, param): + if isinstance(param, Enum): + if param.__class__ != cls: + raise ValueError("Can't mix enums of different types") + return param + if param < 0 or param > len(cls._values_): + raise ValueError('%d is out of range for enum type %s; max %d.' % + (param, cls.__name__, len(cls._values_))) + return cls(param) + + def __eq__(self, other): + return self.value == other.value + + def __ne__(self, other): + return self.value != other.value + + def __hash__(self): + return hash(self.value) + + def __repr__(self): + try: + return self._values_[self.value] + except IndexError: + raise IndexError('Value %d is out of range for %r' % + (self.value, self._values_)) + + +class StringPiece(ctypes.Structure): + _fields_ = [ + ('data', _Ptr(ctypes.c_char)), + ('length', ctypes.c_size_t), + ] + + def __len__(self): + return self.length + + def __str__(self): + return ctypes.string_at(self.data, self.length) + + +class SourcePosition(ctypes.Structure): + _fields_ = [ + ('line', ctypes.c_uint), + ('column', ctypes.c_uint), + ('offset', ctypes.c_uint) + ] +SourcePosition.EMPTY = SourcePosition.in_dll(_dll, 'kGumboEmptySourcePosition') + + +class AttributeNamespace(Enum): + URLS = [ + 'http://www.w3.org/1999/xhtml', + 'http://www.w3.org/1999/xlink', + 'http://www.w3.org/XML/1998/namespace', + 'http://www.w3.org/2000/xmlns', + ] + _values_ = ['NONE', 'XLINK', 'XML', 'XMLNS'] + + def to_url(self): + return self.URLS[self.value] + + +class Attribute(ctypes.Structure): + _fields_ = [ + ('namespace', AttributeNamespace), + ('name', ctypes.c_char_p), + ('original_name', StringPiece), + ('value', ctypes.c_char_p), + ('original_value', StringPiece), + ('name_start', SourcePosition), + ('name_end', SourcePosition), + ('value_start', SourcePosition), + ('value_end', SourcePosition) + ] + + +class Vector(ctypes.Structure): + _type_ = ctypes.c_void_p + _fields_ = [ + ('data', _Ptr(ctypes.c_void_p)), + ('length', ctypes.c_uint), + ('capacity', ctypes.c_uint) + ] + + class Iter(object): + def __init__(self, vector): + self.current = 0 + self.vector = vector + + def __iter__(self): + return self + + def __next__(self): + # Python 3 + if self.current >= self.vector.length: + raise StopIteration + obj = self.vector[self.current] + self.current += 1 + return obj + + def next(self): + # Python 2 + return self.__next__() + + def __len__(self): + return self.length + + def __getitem__(self, i): + try: + # Python 2 + numeric_types = (int, long) + except NameError: + # Python 3 + numeric_types = int + + if isinstance(i, numeric_types): + if i < 0: + i += self.length + if i > self.length: + raise IndexError + array_type = _Ptr(_Ptr(self._type_)) + return ctypes.cast(self.data, array_type)[i].contents + return list(self)[i] + + def __iter__(self): + return Vector.Iter(self) + + +Vector.EMPTY = Vector.in_dll(_dll, 'kGumboEmptyVector') + + +class AttributeVector(Vector): + _type_ = Attribute + + +class NodeVector(Vector): + # _type_ assigned later, to avoid circular references with Node + pass + + +class QuirksMode(Enum): + _values_ = ['NO_QUIRKS', 'QUIRKS', 'LIMITED_QUIRKS'] + + +class Document(ctypes.Structure): + _fields_ = [ + ('children', NodeVector), + ('has_doctype', ctypes.c_bool), + ('name', ctypes.c_char_p), + ('public_identifier', ctypes.c_char_p), + ('system_identifier', ctypes.c_char_p), + ('doc_type_quirks_mode', QuirksMode), + ] + + def __repr__(self): + return 'Document' + + +class Namespace(Enum): + URLS = [ + 'http://www.w3.org/1999/xhtml', + 'http://www.w3.org/2000/svg', + 'http://www.w3.org/1998/Math/MathML', + ] + _values_ = ['HTML', 'SVG', 'MATHML'] + + def to_url(self): + return self.URLS[self.value] + + +class Tag(Enum): + @staticmethod + def from_str(tagname): + text_ptr = ctypes.c_char_p(tagname.encode('utf-8')) + return _tag_enum(text_ptr) + + _values_ = gumboc_tags.TagNames + ['UNKNOWN', 'LAST'] + +class Element(ctypes.Structure): + _fields_ = [ + ('children', NodeVector), + ('tag', Tag), + ('tag_namespace', Namespace), + ('original_tag', StringPiece), + ('original_end_tag', StringPiece), + ('start_pos', SourcePosition), + ('end_pos', SourcePosition), + ('attributes', AttributeVector), + ] + + @property + def tag_name(self): + original_tag = StringPiece.from_buffer_copy(self.original_tag) + _tag_from_original_text(ctypes.byref(original_tag)) + if self.tag_namespace == Namespace.SVG: + svg_tagname = _normalize_svg_tagname(ctypes.byref(original_tag)) + if svg_tagname is not None: + return str(svg_tagname) + if self.tag == Tag.UNKNOWN: + if original_tag.data is None: + return '' + return str(original_tag).lower() + return _tagname(self.tag) + + def __repr__(self): + return ('<%r>\n' % self.tag + + '\n'.join(repr(child) for child in self.children) + + '</%r>' % self.tag) + + +class Text(ctypes.Structure): + _fields_ = [ + ('text', ctypes.c_char_p), + ('original_text', StringPiece), + ('start_pos', SourcePosition) + ] + + def __repr__(self): + return 'Text(%r)' % self.text + + +class NodeType(Enum): + _values_ = ['DOCUMENT', 'ELEMENT', 'TEXT', 'CDATA', + 'COMMENT', 'WHITESPACE', 'TEMPLATE'] + + +class NodeUnion(ctypes.Union): + _fields_ = [ + ('document', Document), + ('element', Element), + ('text', Text), + ] + + +class Node(ctypes.Structure): + # _fields_ set later to avoid a circular reference + + def _contents(self): + # Python3 enters an infinite loop if you use an @property within + # __getattr__, so we factor it out to a helper. + if self.type == NodeType.DOCUMENT: + return self.v.document + elif self.type in (NodeType.ELEMENT, NodeType.TEMPLATE): + return self.v.element + else: + return self.v.text + + @property + def contents(self): + return self._contents() + + def __getattr__(self, name): + return getattr(self._contents(), name) + + def __setattr__(self, name, value): + return setattr(self._contents(), name, value) + + def __repr__(self): + return repr(self.contents) + + +Node._fields_ = [ + ('type', NodeType), + # Set the type to Node later to avoid a circular dependency. + ('parent', _Ptr(Node)), + ('index_within_parent', ctypes.c_size_t), + # TODO(jdtang): Make a real list of enum constants for this. + ('parse_flags', _bitvector), + ('v', NodeUnion) + ] +NodeVector._type_ = Node + + +class Options(ctypes.Structure): + _fields_ = [ + # TODO(jdtang): Allow the Python API to set the allocator/deallocator + # function. Right now these are treated as opaque void pointers. + ('allocator', ctypes.c_void_p), + ('deallocator', ctypes.c_void_p), + ('userdata', ctypes.c_void_p), + ('tab_stop', ctypes.c_int), + ('stop_on_first_error', ctypes.c_bool), + ('max_errors', ctypes.c_int), + ('fragment_context', Tag), + ('fragment_namespace', Namespace), + ] + + +class Output(ctypes.Structure): + _fields_ = [ + ('document', _Ptr(Node)), + ('root', _Ptr(Node)), + # TODO(jdtang): Error type. + ('errors', Vector), + ] + +@contextlib.contextmanager +def parse(text, **kwargs): + options = Options() + for field_name, _ in Options._fields_: + try: + setattr(options, field_name, kwargs[field_name]) + except KeyError: + setattr(options, field_name, getattr(_DEFAULT_OPTIONS, field_name)) + # We have to manually take a reference to the input text here so that it + # outlives the parse output. If we let ctypes do it automatically on function + # call, it creates a temporary buffer which is destroyed when the call + # completes, and then the original_text pointers point into invalid memory. + text_ptr = ctypes.c_char_p(text.encode('utf-8')) + output = _parse_with_options(ctypes.byref(options), text_ptr, len(text)) + try: + yield output + finally: + _destroy_output(ctypes.byref(options), output) + +_DEFAULT_OPTIONS = Options.in_dll(_dll, 'kGumboDefaultOptions') + +_parse_with_options = _dll.gumbo_parse_with_options +_parse_with_options.argtypes = [_Ptr(Options), ctypes.c_char_p, ctypes.c_size_t] +_parse_with_options.restype = _Ptr(Output) + +_tag_from_original_text = _dll.gumbo_tag_from_original_text +_tag_from_original_text.argtypes = [_Ptr(StringPiece)] +_tag_from_original_text.restype = None + +_normalize_svg_tagname = _dll.gumbo_normalize_svg_tagname +_normalize_svg_tagname.argtypes = [_Ptr(StringPiece)] +_normalize_svg_tagname.restype = ctypes.c_char_p + +_destroy_output = _dll.gumbo_destroy_output +_destroy_output.argtypes = [_Ptr(Options), _Ptr(Output)] +_destroy_output.restype = None + +_tagname = _dll.gumbo_normalized_tagname +_tagname.argtypes = [Tag] +_tagname.restype = ctypes.c_char_p + +_tag_enum = _dll.gumbo_tag_enum +_tag_enum.argtypes = [ctypes.c_char_p] +_tag_enum.restype = Tag + +__all__ = ['StringPiece', 'SourcePosition', 'AttributeNamespace', 'Attribute', + 'Vector', 'AttributeVector', 'NodeVector', 'QuirksMode', 'Document', + 'Namespace', 'Tag', 'Element', 'Text', 'NodeType', 'Node', + 'Options', 'Output', 'parse']
