Mercurial > hgrepos > Python2 > PyMuPDF
comparison mupdf-source/thirdparty/gumbo-parser/python/gumbo/gumboc.py @ 2:b50eed0cc0ef upstream
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
The directory name has changed: no version number in the expanded directory now.
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Mon, 15 Sep 2025 11:43:07 +0200 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| 1:1d09e1dec1d9 | 2:b50eed0cc0ef |
|---|---|
| 1 # Copyright 2012 Google Inc. All Rights Reserved. | |
| 2 # | |
| 3 # Licensed under the Apache License, Version 2.0 (the "License"); | |
| 4 # you may not use this file except in compliance with the License. | |
| 5 # You may obtain a copy of the License at | |
| 6 # | |
| 7 # http://www.apache.org/licenses/LICENSE-2.0 | |
| 8 # | |
| 9 # Unless required by applicable law or agreed to in writing, software | |
| 10 # distributed under the License is distributed on an "AS IS" BASIS, | |
| 11 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
| 12 # See the License for the specific language governing permissions and | |
| 13 # limitations under the License. | |
| 14 # | |
| 15 | |
| 16 """CTypes bindings for the Gumbo HTML5 parser. | |
| 17 | |
| 18 This exports the raw interface of the library as a set of very thin ctypes | |
| 19 wrappers. It's intended to be wrapped by other libraries to provide a more | |
| 20 Pythonic API. | |
| 21 """ | |
| 22 | |
| 23 __author__ = 'jdtang@google.com (Jonathan Tang)' | |
| 24 | |
| 25 import sys | |
| 26 import contextlib | |
| 27 import ctypes | |
| 28 import os.path | |
| 29 import gumboc_tags | |
| 30 | |
# Choose the shared-library file name that matches the host platform; anything
# that is neither macOS nor Windows gets the 'lib*.so' name.
if sys.platform.startswith('darwin'):
    _name_of_lib = 'libgumbo.dylib'
elif sys.platform.startswith('win'):
    _name_of_lib = "gumbo.dll"
else:
    _name_of_lib = 'libgumbo.so'
| 36 | |
def _load_gumbo_library(lib_name):
    """Load the gumbo shared library, trying each known location in turn.

    The original code attached two ``except OSError`` clauses to a single
    ``try``; the second clause could never run, so the system-wide fallback
    was dead code and a failure of the second attempt escaped directly.

    Args:
      lib_name: platform-specific file name of the shared library.

    Returns:
      The ``ctypes.CDLL`` object of the first location that loads.

    Raises:
      OSError: the error of the last attempt when no location works.
    """
    module_dir = os.path.dirname(__file__)
    candidates = (
        # A freshly-built library in the .libs directory, for development.
        os.path.join(module_dir, '..', '..', '.libs', lib_name),
        # PyPI or setuptools install: next to this module.
        os.path.join(module_dir, lib_name),
        # System library, resolved by the platform loader.
        lib_name,
    )
    last_error = None
    for candidate in candidates:
        try:
            return ctypes.cdll.LoadLibrary(candidate)
        except OSError as err:
            # Keep a reference: Python 3 unbinds 'err' when the clause ends.
            last_error = err
    raise last_error


_dll = _load_gumbo_library(_name_of_lib)
| 48 | |
# Short module-private aliases for ctypes names that recur below.
_Ptr = ctypes.POINTER       # pointer-type factory
_bitvector = ctypes.c_uint  # unsigned int holding a set of flag bits
| 52 | |
class EnumMetaclass(type(ctypes.c_uint)):
    """Metaclass that publishes the names in ``_values_`` as class attributes.

    For every class except the base class named 'Enum', each name in the
    class body's ``_values_`` sequence becomes a class attribute whose value
    is ``cls.from_param(index)``.

    The work is done in ``__init__`` rather than ``__new__``: since Python
    3.13 the ctypes metaclasses finish setting a type up in ``__init__``, so
    the class cannot be instantiated from inside ``__new__``. Doing it here
    works on older interpreters as well.

    Raises:
      ValueError: the class body has no ``_values_`` entry, or ``_values_``
        is not an iterable of attribute names.
    """

    def __init__(cls, name, bases, cls_dict):
        type(ctypes.c_uint).__init__(cls, name, bases, cls_dict)
        if name == 'Enum':
            # The abstract base carries no constants of its own.
            return
        try:
            for i, value in enumerate(cls_dict['_values_']):
                setattr(cls, value, cls.from_param(i))
        except KeyError:
            raise ValueError('No _values_ list found inside enum type.')
        except TypeError:
            raise ValueError('_values_ must be a list of names of enum constants.')
| 66 | |
def with_metaclass(mcls):
    """Return a class decorator that rebuilds the class using *mcls*.

    This spelling of "use this metaclass" is valid syntax on both Python 2
    and Python 3. The decorated class is re-created from its name, its bases
    and a copy of its namespace.
    """
    def decorator(cls):
        namespace = dict(vars(cls))
        # These two entries belong to the class object being replaced and
        # must not be carried over into the rebuilt class.
        for stale_key in ('__dict__', '__weakref__'):
            namespace.pop(stale_key, None)
        return mcls(cls.__name__, cls.__bases__, namespace)
    return decorator
| 75 | |
@with_metaclass(EnumMetaclass)
class Enum(ctypes.c_uint):
    """Base class for C enums exposed as named ``c_uint`` values.

    Subclasses list their constant names, in C declaration order, in a
    ``_values_`` class attribute.
    """

    @classmethod
    def from_param(cls, param):
        """Convert *param* into an instance of this enum type.

        Args:
          param: an instance of this enum, or an integer index into
            ``_values_``.

        Returns:
          *param* itself when it is already an instance of *cls*, otherwise
          a new ``cls(param)``.

        Raises:
          ValueError: *param* is an enum of another type, or an integer
            outside ``0 .. len(_values_) - 1``.
        """
        if isinstance(param, Enum):
            if param.__class__ != cls:
                raise ValueError("Can't mix enums of different types")
            return param
        # Valid indices stop at len - 1; the previous '>' comparison let
        # len(_values_) itself through.
        if param < 0 or param >= len(cls._values_):
            raise ValueError('%d is out of range for enum type %s; max %d.' %
                             (param, cls.__name__, len(cls._values_) - 1))
        return cls(param)

    def __eq__(self, other):
        try:
            return self.value == other.value
        except AttributeError:
            # 'other' is not enum-like (e.g. None): let Python fall back to
            # its default comparison instead of raising.
            return NotImplemented

    def __ne__(self, other):
        try:
            return self.value != other.value
        except AttributeError:
            return NotImplemented

    def __hash__(self):
        return hash(self.value)

    def __repr__(self):
        """Return the constant's name as listed in ``_values_``."""
        try:
            return self._values_[self.value]
        except IndexError:
            raise IndexError('Value %d is out of range for %r' %
                             (self.value, self._values_))
| 104 | |
| 105 | |
class StringPiece(ctypes.Structure):
    """A pointer/length pair describing a run of bytes.

    ``len(piece)`` is the byte count and ``str(piece)`` is the text.
    """
    _fields_ = [
        ('data', _Ptr(ctypes.c_char)),
        ('length', ctypes.c_size_t),
    ]

    def __len__(self):
        return self.length

    def __str__(self):
        """Return the referenced bytes as the interpreter's native ``str``.

        ``ctypes.string_at`` returns ``bytes``; on Python 3 handing that
        back from ``__str__`` makes ``str(piece)`` raise ``TypeError``, so
        the bytes are decoded there. Undecodable bytes are replaced instead
        of raising. A NULL or zero-length piece yields ''.
        """
        if not self.data or not self.length:
            return ''
        raw = ctypes.string_at(self.data, self.length)
        if isinstance(raw, str):
            # Python 2: bytes already is the native string type.
            return raw
        return raw.decode('utf-8', 'replace')
| 117 | |
| 118 | |
class SourcePosition(ctypes.Structure):
    """A location expressed as line, column and offset."""
    _fields_ = [
        ('line', ctypes.c_uint),
        ('column', ctypes.c_uint),
        ('offset', ctypes.c_uint),
    ]


# The library's own 'empty position' constant, read from the loaded library.
SourcePosition.EMPTY = SourcePosition.in_dll(_dll, 'kGumboEmptySourcePosition')
| 126 | |
| 127 | |
class AttributeNamespace(Enum):
    """Enum of attribute namespaces, each paired with a namespace URL."""
    _values_ = ['NONE', 'XLINK', 'XML', 'XMLNS']

    # URLS[i] is the URL reported for the constant whose value is i.
    URLS = [
        'http://www.w3.org/1999/xhtml',
        'http://www.w3.org/1999/xlink',
        'http://www.w3.org/XML/1998/namespace',
        'http://www.w3.org/2000/xmlns',
    ]

    def to_url(self):
        """Return the namespace URL for this constant."""
        index = self.value
        return self.URLS[index]
| 139 | |
| 140 | |
class Attribute(ctypes.Structure):
    """ctypes layout of one attribute.

    Holds the namespace, the name and value as C strings, the original name
    and value as StringPiece views, and start/end positions for both.
    """
    _fields_ = [
        ('namespace', AttributeNamespace),
        ('name', ctypes.c_char_p),
        ('original_name', StringPiece),
        ('value', ctypes.c_char_p),
        ('original_value', StringPiece),
        ('name_start', SourcePosition),
        ('name_end', SourcePosition),
        ('value_start', SourcePosition),
        ('value_end', SourcePosition),
    ]
| 153 | |
| 154 | |
class Vector(ctypes.Structure):
    """A C array of pointers with a length and a capacity.

    Subclasses set ``_type_`` to the ctypes type the stored pointers refer
    to. Instances support ``len()``, iteration, integer indexing (negative
    indices count from the end) and any other index form that a ``list``
    accepts, such as slices.
    """
    _type_ = ctypes.c_void_p
    _fields_ = [
        ('data', _Ptr(ctypes.c_void_p)),
        ('length', ctypes.c_uint),
        ('capacity', ctypes.c_uint)
    ]

    class Iter(object):
        """Iterator over a Vector, usable on Python 2 and Python 3."""

        def __init__(self, vector):
            self.current = 0
            self.vector = vector

        def __iter__(self):
            return self

        def __next__(self):
            # Python 3
            if self.current >= self.vector.length:
                raise StopIteration
            obj = self.vector[self.current]
            self.current += 1
            return obj

        def next(self):
            # Python 2
            return self.__next__()

    def __len__(self):
        return self.length

    def __getitem__(self, i):
        """Return element *i*, or ``list(self)[i]`` for non-integer indices.

        Raises:
          IndexError: an integer index falls outside the vector.
        """
        try:
            # Python 2
            numeric_types = (int, long)
        except NameError:
            # Python 3
            numeric_types = int

        if isinstance(i, numeric_types):
            index = i
            if index < 0:
                index += self.length
            # Reject both ends: the old 'i > length' test accepted
            # i == length and any over-negative index, which read memory
            # outside the array.
            if not 0 <= index < self.length:
                raise IndexError('Vector index %d out of range' % i)
            array_type = _Ptr(_Ptr(self._type_))
            return ctypes.cast(self.data, array_type)[index].contents
        return list(self)[i]

    def __iter__(self):
        return Vector.Iter(self)
| 205 | |
| 206 | |
# The library's shared 'empty vector' constant, read from the loaded library.
Vector.EMPTY = Vector.in_dll(_dll, 'kGumboEmptyVector')
| 208 | |
| 209 | |
class AttributeVector(Vector):
    """Vector whose elements are Attribute structures."""
    _type_ = Attribute
| 212 | |
| 213 | |
class NodeVector(Vector):
    """Vector of nodes.

    ``_type_`` is assigned after Node is defined, because Node's own layout
    refers back to NodeVector.
    """
| 217 | |
| 218 | |
class QuirksMode(Enum):
    """Enum of document quirks modes."""
    _values_ = [
        'NO_QUIRKS',
        'QUIRKS',
        'LIMITED_QUIRKS',
    ]
| 221 | |
| 222 | |
class Document(ctypes.Structure):
    """ctypes layout of a document node's payload.

    Holds the child nodes plus the doctype details: whether one is present,
    its name, public and system identifiers, and the quirks mode.
    """
    _fields_ = [
        ('children', NodeVector),
        ('has_doctype', ctypes.c_bool),
        ('name', ctypes.c_char_p),
        ('public_identifier', ctypes.c_char_p),
        ('system_identifier', ctypes.c_char_p),
        ('doc_type_quirks_mode', QuirksMode),
    ]

    def __repr__(self):
        # Always the same fixed label; no field is included.
        return 'Document'
| 235 | |
| 236 | |
class Namespace(Enum):
    """Enum of element namespaces, each paired with a namespace URL."""
    _values_ = ['HTML', 'SVG', 'MATHML']

    # URLS[i] is the URL reported for the constant whose value is i.
    URLS = [
        'http://www.w3.org/1999/xhtml',
        'http://www.w3.org/2000/svg',
        'http://www.w3.org/1998/Math/MathML',
    ]

    def to_url(self):
        """Return the namespace URL for this constant."""
        index = self.value
        return self.URLS[index]
| 247 | |
| 248 | |
class Tag(Enum):
    """Enum of tags: the names from gumboc_tags, then UNKNOWN and LAST."""
    _values_ = gumboc_tags.TagNames + ['UNKNOWN', 'LAST']

    @staticmethod
    def from_str(tagname):
        """Look up the Tag for the text *tagname* via the C library."""
        encoded_name = tagname.encode('utf-8')
        return _tag_enum(ctypes.c_char_p(encoded_name))
| 256 | |
def _element_text(raw):
    """Return *raw* (``bytes`` or native ``str``) as a native ``str``."""
    if isinstance(raw, str):
        # Python 2 bytes, or text that is already native.
        return raw
    return raw.decode('utf-8', 'replace')


class Element(ctypes.Structure):
    """ctypes layout of an element node's payload.

    Holds the children, the tag and its namespace, the original start and
    end tag text, start and end positions, and the attributes.
    """
    _fields_ = [
        ('children', NodeVector),
        ('tag', Tag),
        ('tag_namespace', Namespace),
        ('original_tag', StringPiece),
        ('original_end_tag', StringPiece),
        ('start_pos', SourcePosition),
        ('end_pos', SourcePosition),
        ('attributes', AttributeVector),
    ]

    @property
    def tag_name(self):
        """Return the element's tag name.

        SVG elements use the library's normalized SVG name when it has one.
        Unknown tags use the lower-cased original tag text, or '' when no
        original text is recorded. All other tags return whatever
        ``gumbo_normalized_tagname`` yields.
        """
        # Work on a copy: the C helper rewrites the piece it is handed.
        original_tag = StringPiece.from_buffer_copy(self.original_tag)
        _tag_from_original_text(ctypes.byref(original_tag))
        if self.tag_namespace == Namespace.SVG:
            svg_tagname = _normalize_svg_tagname(ctypes.byref(original_tag))
            if svg_tagname is not None:
                # str() of bytes gives "b'...'" on Python 3; decode instead.
                return _element_text(svg_tagname)
        if self.tag == Tag.UNKNOWN:
            # A ctypes pointer field is never None; a NULL pointer is falsy.
            if not original_tag.data:
                return ''
            raw = ctypes.string_at(original_tag.data, original_tag.length)
            return _element_text(raw.lower())
        # NOTE(review): on Python 3 this path returns bytes (c_char_p
        # restype); left as-is because callers may rely on it - confirm.
        return _tagname(self.tag)

    def __repr__(self):
        return ('<%r>\n' % self.tag +
                '\n'.join(repr(child) for child in self.children) +
                '</%r>' % self.tag)
| 287 | |
| 288 | |
class Text(ctypes.Structure):
    """ctypes layout of a text node's payload.

    Holds the text as a C string, the original text as a StringPiece, and
    the start position.
    """
    _fields_ = [
        ('text', ctypes.c_char_p),
        ('original_text', StringPiece),
        ('start_pos', SourcePosition),
    ]

    def __repr__(self):
        content = self.text
        return 'Text(%r)' % content
| 298 | |
| 299 | |
class NodeType(Enum):
    """Enum of the node kinds a Node can be."""
    _values_ = [
        'DOCUMENT',
        'ELEMENT',
        'TEXT',
        'CDATA',
        'COMMENT',
        'WHITESPACE',
        'TEMPLATE',
    ]
| 303 | |
| 304 | |
class NodeUnion(ctypes.Union):
    """Union of the per-kind node payloads; Node.type says which one applies."""
    _fields_ = [
        ('document', Document),
        ('element', Element),
        ('text', Text),
    ]
| 311 | |
| 312 | |
class Node(ctypes.Structure):
    """A parse-tree node that forwards attribute access to its payload.

    ``_fields_`` is assigned after the class statement to avoid a circular
    reference. Attribute reads that normal lookup cannot resolve, and all
    attribute writes on an instance, go to the Document, Element or Text
    selected by the node's type.
    """

    def _contents(self):
        # Kept as a plain method: Python 3 enters an infinite loop if
        # __getattr__ goes through an @property, so the property and
        # __getattr__ both call this helper instead.
        node_type = self.type
        if node_type == NodeType.DOCUMENT:
            return self.v.document
        if node_type in (NodeType.ELEMENT, NodeType.TEMPLATE):
            return self.v.element
        return self.v.text

    @property
    def contents(self):
        """The payload structure that matches this node's type."""
        return self._contents()

    def __getattr__(self, name):
        payload = self._contents()
        return getattr(payload, name)

    def __setattr__(self, name, value):
        payload = self._contents()
        return setattr(payload, name, value)

    def __repr__(self):
        return repr(self.contents)
| 338 | |
| 339 | |
# Node's layout is attached after the class statement because its 'parent'
# field is a pointer back to Node itself.
Node._fields_ = [
    ('type', NodeType),
    ('parent', _Ptr(Node)),
    ('index_within_parent', ctypes.c_size_t),
    # TODO(jdtang): Make a real list of enum constants for this.
    ('parse_flags', _bitvector),
    ('v', NodeUnion),
]

# NodeVector could not name Node when it was declared; fill it in now.
NodeVector._type_ = Node
| 350 | |
| 351 | |
class Options(ctypes.Structure):
    """ctypes layout of the parser options.

    parse() fills every field, from its keyword arguments where given and
    from the library defaults otherwise.
    """
    _fields_ = [
        # TODO(jdtang): Allow the Python API to set the allocator/deallocator
        # function. Right now these are treated as opaque void pointers.
        ('allocator', ctypes.c_void_p),
        ('deallocator', ctypes.c_void_p),
        ('userdata', ctypes.c_void_p),
        ('tab_stop', ctypes.c_int),
        ('stop_on_first_error', ctypes.c_bool),
        ('max_errors', ctypes.c_int),
        ('fragment_context', Tag),
        ('fragment_namespace', Namespace),
    ]
| 365 | |
| 366 | |
class Output(ctypes.Structure):
    """ctypes layout of a parse result.

    Holds pointers to the document node and the root node, plus the vector
    of errors.
    """
    _fields_ = [
        ('document', _Ptr(Node)),
        ('root', _Ptr(Node)),
        # TODO(jdtang): Error type.
        ('errors', Vector),
    ]
| 374 | |
@contextlib.contextmanager
def parse(text, **kwargs):
    """Parse *text* and yield the library's output as a context manager.

    Args:
      text: the document, as text (encoded to UTF-8 here) or as
        already-encoded bytes.
      **kwargs: values for Options fields, keyed by field name. Fields not
        given take the library defaults; other keys are ignored.

    Yields:
      The pointer to the Output structure returned by the C parser. It is
      destroyed when the ``with`` block exits and must not be used
      afterwards.
    """
    options = Options()
    for field_name, _ in Options._fields_:
        if field_name in kwargs:
            value = kwargs[field_name]
        else:
            value = getattr(_DEFAULT_OPTIONS, field_name)
        setattr(options, field_name, value)
    encoded = text if isinstance(text, bytes) else text.encode('utf-8')
    # We have to manually take a reference to the input text here so that it
    # outlives the parse output. If we let ctypes do it automatically on
    # function call, it creates a temporary buffer which is destroyed when the
    # call completes, and then the original_text pointers point into invalid
    # memory.
    text_ptr = ctypes.c_char_p(encoded)
    # The length must be the byte count of the buffer that is passed;
    # len(text) counts characters and truncated non-ASCII input.
    output = _parse_with_options(ctypes.byref(options), text_ptr, len(encoded))
    try:
        yield output
    finally:
        _destroy_output(ctypes.byref(options), output)
| 393 | |
# The library's default option values, read from the loaded library.
_DEFAULT_OPTIONS = Options.in_dll(_dll, 'kGumboDefaultOptions')


def _bind(symbol, argtypes, restype):
    """Fetch *symbol* from the library and declare its C prototype."""
    func = getattr(_dll, symbol)
    func.argtypes = argtypes
    func.restype = restype
    return func


_parse_with_options = _bind(
    'gumbo_parse_with_options',
    [_Ptr(Options), ctypes.c_char_p, ctypes.c_size_t],
    _Ptr(Output))

_tag_from_original_text = _bind(
    'gumbo_tag_from_original_text', [_Ptr(StringPiece)], None)

_normalize_svg_tagname = _bind(
    'gumbo_normalize_svg_tagname', [_Ptr(StringPiece)], ctypes.c_char_p)

_destroy_output = _bind(
    'gumbo_destroy_output', [_Ptr(Options), _Ptr(Output)], None)

_tagname = _bind('gumbo_normalized_tagname', [Tag], ctypes.c_char_p)

_tag_enum = _bind('gumbo_tag_enum', [ctypes.c_char_p], Tag)
| 419 | |
# Public names, in the order they are defined in this module.
__all__ = [
    'StringPiece',
    'SourcePosition',
    'AttributeNamespace',
    'Attribute',
    'Vector',
    'AttributeVector',
    'NodeVector',
    'QuirksMode',
    'Document',
    'Namespace',
    'Tag',
    'Element',
    'Text',
    'NodeType',
    'Node',
    'Options',
    'Output',
    'parse',
]
