comparison mupdf-source/thirdparty/gumbo-parser/python/gumbo/gumboc.py @ 2:b50eed0cc0ef upstream

ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4. The directory name has changed: no version number in the expanded directory now.
author Franz Glasner <fzglas.hg@dom66.de>
date Mon, 15 Sep 2025 11:43:07 +0200
parents
children
comparison
equal deleted inserted replaced
1:1d09e1dec1d9 2:b50eed0cc0ef
1 # Copyright 2012 Google Inc. All Rights Reserved.
2 #
3 # Licensed under the Apache License, Version 2.0 (the "License");
4 # you may not use this file except in compliance with the License.
5 # You may obtain a copy of the License at
6 #
7 # http://www.apache.org/licenses/LICENSE-2.0
8 #
9 # Unless required by applicable law or agreed to in writing, software
10 # distributed under the License is distributed on an "AS IS" BASIS,
11 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 # See the License for the specific language governing permissions and
13 # limitations under the License.
14 #
15
16 """CTypes bindings for the Gumbo HTML5 parser.
17
18 This exports the raw interface of the library as a set of very thin ctypes
19 wrappers. It's intended to be wrapped by other libraries to provide a more
20 Pythonic API.
21 """
22
23 __author__ = 'jdtang@google.com (Jonathan Tang)'
24
25 import sys
26 import contextlib
27 import ctypes
28 import os.path
29 import gumboc_tags
30
# File name of the Gumbo shared library on the current platform.
_name_of_lib = 'libgumbo.so'
if sys.platform.startswith('darwin'):
    _name_of_lib = 'libgumbo.dylib'
elif sys.platform.startswith('win'):
    _name_of_lib = "gumbo.dll"

# Try the three known locations in order.  The attempts are nested because a
# second ``except OSError`` clause on the same ``try`` never sees an error
# raised inside the first handler, which would make the last fallback
# unreachable.  If every attempt fails, the final OSError propagates.
try:
    # First look for a freshly-built .so in the .libs directory, for
    # development.
    _dll = ctypes.cdll.LoadLibrary(os.path.join(
        os.path.dirname(__file__), '..', '..', '.libs', _name_of_lib))
except OSError:
    try:
        # PyPI or setuptools install, look in the current directory.
        _dll = ctypes.cdll.LoadLibrary(os.path.join(
            os.path.dirname(__file__), _name_of_lib))
    except OSError:
        # System library, on unix or mac osx
        _dll = ctypes.cdll.LoadLibrary(_name_of_lib)

# Some aliases for common types.
_bitvector = ctypes.c_uint
_Ptr = ctypes.POINTER
52
class EnumMetaclass(type(ctypes.c_uint)):
    """Metaclass that turns a ``_values_`` name list into enum constants.

    For every class created with this metaclass, except the one named
    ``Enum``, each name in the class body's ``_values_`` list becomes a class
    attribute holding ``cls.from_param(index)``, where ``index`` is the
    position of the name in the list.

    Raises:
      ValueError: if the class body has no ``_values_`` entry, or if
        populating the constants raises TypeError (for example because
        ``_values_`` is not iterable).
    """

    def __init__(cls, name, bases, cls_dict):
        # The constants are created here rather than in __new__: since
        # Python 3.13 the ctypes metaclasses finish setting up the type in
        # __init__, so the class is not usable inside __new__.
        super(EnumMetaclass, cls).__init__(name, bases, cls_dict)
        if name == 'Enum':
            # The base class itself declares no constants.
            return
        try:
            for i, value in enumerate(cls_dict['_values_']):
                setattr(cls, value, cls.from_param(i))
        except KeyError:
            raise ValueError('No _values_ list found inside enum type.')
        except TypeError:
            raise ValueError(
                '_values_ must be a list of names of enum constants.')
66
def with_metaclass(mcls):
    """Return a class decorator that rebuilds a class using *mcls*.

    The decorated class is re-created by calling
    ``mcls(name, bases, namespace)`` with a copy of its namespace from which
    the ``__dict__`` and ``__weakref__`` entries have been removed.
    """
    def decorator(cls):
        # Drop the entries created for the throw-away class so that the
        # rebuilt class gets its own.
        namespace = dict(
            (key, member) for key, member in vars(cls).items()
            if key not in ('__dict__', '__weakref__'))
        return mcls(cls.__name__, cls.__bases__, namespace)
    return decorator
75
@with_metaclass(EnumMetaclass)
class Enum(ctypes.c_uint):
    """Base class for enums stored as ``unsigned int`` with named constants.

    Subclasses list their constant names in ``_values_``; the numeric value
    of a constant is its index in that list.
    """

    @classmethod
    def from_param(cls, param):
        """Convert *param* into an instance of this enum type.

        Args:
          param: an instance of this exact enum class, or an integer index
            into ``_values_``.

        Returns:
          *param* itself if it is already an enum of this class, otherwise
          ``cls(param)``.

        Raises:
          ValueError: if *param* is an enum of a different class, or an
            integer outside ``0 .. len(_values_) - 1``.
        """
        if isinstance(param, Enum):
            if param.__class__ != cls:
                raise ValueError("Can't mix enums of different types")
            return param
        # The last valid index is len - 1; accepting len itself would create
        # a value that __repr__ cannot name.
        highest = len(cls._values_) - 1
        if param < 0 or param > highest:
            raise ValueError('%d is out of range for enum type %s; max %d.' %
                             (param, cls.__name__, highest))
        return cls(param)

    def __eq__(self, other):
        """Compare by numeric value; objects without ``.value`` never match."""
        try:
            return self.value == other.value
        except AttributeError:
            return NotImplemented

    def __ne__(self, other):
        """Inverse of ``__eq__`` (defined explicitly for Python 2)."""
        try:
            return self.value != other.value
        except AttributeError:
            return NotImplemented

    def __hash__(self):
        # Kept consistent with __eq__, which compares numeric values.
        return hash(self.value)

    def __repr__(self):
        """Return the constant's name; IndexError if the value has no name."""
        try:
            return self._values_[self.value]
        except IndexError:
            raise IndexError('Value %d is out of range for %r' %
                             (self.value, self._values_))
104
105
class StringPiece(ctypes.Structure):
    """A ``data`` pointer plus ``length`` describing a run of bytes."""

    _fields_ = [
        ('data', ctypes.POINTER(ctypes.c_char)),
        ('length', ctypes.c_size_t),
    ]

    def __len__(self):
        """Return the number of bytes in the piece."""
        return self.length

    def __bytes__(self):
        """Return a copy of exactly ``length`` bytes starting at ``data``."""
        return ctypes.string_at(self.data, self.length)

    def __str__(self):
        """Return the piece as a native ``str``.

        ``ctypes.string_at`` yields a byte string.  On Python 2 that already
        is ``str``; on Python 3 ``__str__`` must return text, so the bytes
        are decoded as UTF-8.  Undecodable bytes are replaced rather than
        raising.
        """
        raw = ctypes.string_at(self.data, self.length)
        if isinstance(raw, str):
            return raw
        return raw.decode('utf-8', 'replace')
117
118
class SourcePosition(ctypes.Structure):
    """Structure holding ``line``, ``column`` and ``offset`` (unsigned ints)."""

    # All three members share the same C type; the order fixes the layout.
    _fields_ = [
        (member, ctypes.c_uint) for member in ('line', 'column', 'offset')
    ]


# Bind the library's exported kGumboEmptySourcePosition value.
SourcePosition.EMPTY = SourcePosition.in_dll(_dll, 'kGumboEmptySourcePosition')
126
127
class AttributeNamespace(Enum):
    """Enum of attribute namespaces, each paired with a URL in ``URLS``."""

    _values_ = ['NONE', 'XLINK', 'XML', 'XMLNS']

    # Indexed by enum value, in the same order as _values_.
    URLS = [
        'http://www.w3.org/1999/xhtml',
        'http://www.w3.org/1999/xlink',
        'http://www.w3.org/XML/1998/namespace',
        'http://www.w3.org/2000/xmlns',
    ]

    def to_url(self):
        """Return the ``URLS`` entry at this constant's numeric value."""
        position = self.value
        return self.URLS[position]
139
140
class Attribute(ctypes.Structure):
    """Structure describing one attribute: namespace, name, value, positions.

    ``name`` and ``value`` are C strings; ``original_name`` and
    ``original_value`` are string pieces; the remaining members are source
    positions.
    """

    _fields_ = [
        ('namespace', AttributeNamespace),
        ('name', ctypes.c_char_p),
        ('original_name', StringPiece),
        ('value', ctypes.c_char_p),
        ('original_value', StringPiece),
    ] + [
        # Four consecutive SourcePosition members, in layout order.
        (member, SourcePosition)
        for member in ('name_start', 'name_end', 'value_start', 'value_end')
    ]
153
154
class Vector(ctypes.Structure):
    """A C array of element pointers exposed as a Python sequence.

    ``data`` points at ``length`` pointers; each is read as a pointer to
    ``_type_``.  Subclasses override ``_type_`` to give the element type.
    """

    _type_ = ctypes.c_void_p
    _fields_ = [
        ('data', ctypes.POINTER(ctypes.c_void_p)),
        ('length', ctypes.c_uint),
        ('capacity', ctypes.c_uint),
    ]

    class Iter(object):
        """Iterator yielding ``vector[0]`` .. ``vector[length - 1]``."""

        def __init__(self, vector):
            self.current = 0
            self.vector = vector

        def __iter__(self):
            return self

        def __next__(self):
            # Python 3
            if self.current >= self.vector.length:
                raise StopIteration
            obj = self.vector[self.current]
            self.current += 1
            return obj

        def next(self):
            # Python 2
            return self.__next__()

    def __len__(self):
        """Return the number of elements."""
        return self.length

    def __getitem__(self, i):
        """Return the element at integer index *i*, or a list for other keys.

        Negative integers count from the end.  Any non-integer key (such as
        a slice) is applied to ``list(self)``.

        Raises:
          IndexError: if an integer index falls outside
            ``-length .. length - 1``.
        """
        try:
            # Python 2
            numeric_types = (int, long)
        except NameError:
            # Python 3
            numeric_types = int

        if isinstance(i, numeric_types):
            length = self.length
            if i < 0:
                i += length
            # Reject both ends: index == length and a still-negative index
            # would read outside the C array.
            if i < 0 or i >= length:
                raise IndexError('vector index out of range')
            array_type = ctypes.POINTER(ctypes.POINTER(self._type_))
            return ctypes.cast(self.data, array_type)[i].contents
        return list(self)[i]

    def __iter__(self):
        return Vector.Iter(self)
205
206
# Bind the library's exported kGumboEmptyVector symbol as Vector.EMPTY.
Vector.EMPTY = Vector.in_dll(_dll, 'kGumboEmptyVector')
208
209
class AttributeVector(Vector):
    """Vector whose elements are read as ``Attribute`` structures."""

    _type_ = Attribute
212
213
class NodeVector(Vector):
    """Vector of nodes.

    ``_type_`` is assigned later, to avoid circular references with Node.
    """
217
218
class QuirksMode(Enum):
    """Enum with the constants NO_QUIRKS, QUIRKS and LIMITED_QUIRKS."""

    _values_ = [
        'NO_QUIRKS',
        'QUIRKS',
        'LIMITED_QUIRKS',
    ]
221
222
class Document(ctypes.Structure):
    """Structure for a document node: children plus doctype information."""

    _fields_ = [
        ('children', NodeVector),
        ('has_doctype', ctypes.c_bool),
    ] + [
        # Three consecutive C-string members, in layout order.
        (member, ctypes.c_char_p)
        for member in ('name', 'public_identifier', 'system_identifier')
    ] + [
        ('doc_type_quirks_mode', QuirksMode),
    ]

    def __repr__(self):
        """Return the fixed label ``'Document'``."""
        label = 'Document'
        return label
235
236
class Namespace(Enum):
    """Enum of tag namespaces, each paired with a URL in ``URLS``."""

    _values_ = ['HTML', 'SVG', 'MATHML']

    # Indexed by enum value, in the same order as _values_.
    URLS = [
        'http://www.w3.org/1999/xhtml',
        'http://www.w3.org/2000/svg',
        'http://www.w3.org/1998/Math/MathML',
    ]

    def to_url(self):
        """Return the ``URLS`` entry at this constant's numeric value."""
        position = self.value
        return self.URLS[position]
247
248
class Tag(Enum):
    """Enum of tags: the generated tag names followed by UNKNOWN and LAST."""

    _values_ = gumboc_tags.TagNames + ['UNKNOWN', 'LAST']

    @staticmethod
    def from_str(tagname):
        """Look up the tag for *tagname* via the library's tag-enum function.

        *tagname* is UTF-8 encoded before being handed to the library.
        """
        encoded_name = tagname.encode('utf-8')
        return _tag_enum(ctypes.c_char_p(encoded_name))
256
class Element(ctypes.Structure):
    """Structure for an element node: children, tag, source text, attributes."""

    _fields_ = [
        ('children', NodeVector),
        ('tag', Tag),
        ('tag_namespace', Namespace),
        ('original_tag', StringPiece),
        ('original_end_tag', StringPiece),
        ('start_pos', SourcePosition),
        ('end_pos', SourcePosition),
        ('attributes', AttributeVector),
    ]

    @property
    def tag_name(self):
        """Return the element's tag name.

        SVG elements use the library's normalized SVG name when it provides
        one.  Unknown tags use the lower-cased original tag text, or ``''``
        when there is none.  All other tags use the library's normalized
        tag name.
        """
        # Operate on a copy so the structure's own original_tag member is
        # left untouched by the by-reference call below.
        original_tag = StringPiece.from_buffer_copy(self.original_tag)
        _tag_from_original_text(ctypes.byref(original_tag))
        if self.tag_namespace == Namespace.SVG:
            svg_tagname = _normalize_svg_tagname(ctypes.byref(original_tag))
            if svg_tagname is not None:
                # NOTE(review): a c_char_p result is bytes on Python 3, where
                # str() gives its repr ("b'...'") -- confirm what callers
                # expect before changing the return type.
                return str(svg_tagname)
        if self.tag == Tag.UNKNOWN:
            # A POINTER-typed field is never None: a NULL pointer is a
            # pointer object that tests false, so check truthiness.
            if not original_tag.data:
                return ''
            return str(original_tag).lower()
        return _tagname(self.tag)

    def __repr__(self):
        """Return an opening tag line, the children's reprs, a closing tag."""
        return ('<%r>\n' % self.tag +
                '\n'.join(repr(child) for child in self.children) +
                '</%r>' % self.tag)
287
288
class Text(ctypes.Structure):
    """Structure for a text-like node: its text, source slice and position."""

    _fields_ = [
        ('text', ctypes.c_char_p),
        ('original_text', StringPiece),
        ('start_pos', SourcePosition),
    ]

    def __repr__(self):
        """Return ``Text(<repr of the text member>)``."""
        return 'Text(%r)' % (self.text,)
298
299
class NodeType(Enum):
    """Enum naming the kinds of node; values follow the list order."""

    _values_ = [
        'DOCUMENT',
        'ELEMENT',
        'TEXT',
        'CDATA',
        'COMMENT',
        'WHITESPACE',
        'TEMPLATE',
    ]
303
304
class NodeUnion(ctypes.Union):
    """Union of the three node payload structures."""

    _fields_ = list(zip(
        ('document', 'element', 'text'),
        (Document, Element, Text),
    ))
311
312
class Node(ctypes.Structure):
    """A parse-tree node that forwards attribute access to its payload.

    ``_fields_`` is assigned after the class statement because the
    ``parent`` member is a pointer to ``Node`` itself.
    """

    def _contents(self):
        """Return the union member selected by ``self.type``."""
        # Python3 enters an infinite loop if you use an @property within
        # __getattr__, so this lookup lives in a plain helper method.
        kind = self.type
        payload = self.v
        if kind == NodeType.DOCUMENT:
            return payload.document
        if kind == NodeType.ELEMENT or kind == NodeType.TEMPLATE:
            return payload.element
        return payload.text

    @property
    def contents(self):
        """The payload structure for this node (see ``_contents``)."""
        return self._contents()

    def __getattr__(self, name):
        # Attributes not found on the node are looked up on its payload.
        target = self._contents()
        return getattr(target, name)

    def __setattr__(self, name, value):
        # Assignments are applied to the payload structure.
        target = self._contents()
        return setattr(target, name, value)

    def __repr__(self):
        return repr(self.contents)


Node._fields_ = [
    ('type', NodeType),
    # Pointer to the enclosing node type, usable now that Node exists.
    ('parent', _Ptr(Node)),
    ('index_within_parent', ctypes.c_size_t),
    # TODO(jdtang): Make a real list of enum constants for this.
    ('parse_flags', _bitvector),
    ('v', NodeUnion),
]
# Deferred from the NodeVector class body to break the Node <-> NodeVector
# circular reference.
NodeVector._type_ = Node
350
351
class Options(ctypes.Structure):
    """Structure of parser options handed to the library by reference."""

    _fields_ = [
        # TODO(jdtang): Allow the Python API to set the allocator/deallocator
        # function.  Right now these are treated as opaque void pointers.
        (member, ctypes.c_void_p)
        for member in ('allocator', 'deallocator', 'userdata')
    ] + [
        ('tab_stop', ctypes.c_int),
        ('stop_on_first_error', ctypes.c_bool),
        ('max_errors', ctypes.c_int),
        ('fragment_context', Tag),
        ('fragment_namespace', Namespace),
    ]
365
366
class Output(ctypes.Structure):
    """Structure for a parse result: document and root nodes, plus errors."""

    _fields_ = [
        # Both members are pointers to Node.
        (member, _Ptr(Node)) for member in ('document', 'root')
    ] + [
        # TODO(jdtang): Error type.
        ('errors', Vector),
    ]
374
@contextlib.contextmanager
def parse(text, **kwargs):
    """Parse *text* and yield the library's output for use in a ``with`` block.

    Args:
      text: the document to parse.  Text is encoded as UTF-8; a byte string
        is passed through unchanged.
      **kwargs: values for ``Options`` fields, keyed by field name.  Fields
        not given are copied from the library's default options.  Keys that
        are not ``Options`` fields are ignored.

    Yields:
      The pointer returned by the library's parse function.  It is destroyed
      when the ``with`` block exits, so nothing reachable from it may be
      used afterwards.
    """
    options = Options()
    for field_name, _ in Options._fields_:
        if field_name in kwargs:
            field_value = kwargs[field_name]
        else:
            field_value = getattr(_DEFAULT_OPTIONS, field_name)
        setattr(options, field_name, field_value)
    encoded = text if isinstance(text, bytes) else text.encode('utf-8')
    # We have to manually take a reference to the input text here so that it
    # outlives the parse output.  If we let ctypes do it automatically on
    # function call, it creates a temporary buffer which is destroyed when
    # the call completes, and then the original_text pointers point into
    # invalid memory.
    text_ptr = ctypes.c_char_p(encoded)
    # The length must count bytes of the encoded buffer, not characters of
    # the original text, or non-ASCII input is cut short.
    output = _parse_with_options(
        ctypes.byref(options), text_ptr, len(encoded))
    try:
        yield output
    finally:
        _destroy_output(ctypes.byref(options), output)
393
# The library's exported default option values.
_DEFAULT_OPTIONS = Options.in_dll(_dll, 'kGumboDefaultOptions')


def _bind(symbol, argtypes, restype):
    """Fetch *symbol* from the loaded library and declare its C signature."""
    function = getattr(_dll, symbol)
    function.argtypes = argtypes
    function.restype = restype
    return function


_parse_with_options = _bind(
    'gumbo_parse_with_options',
    [_Ptr(Options), ctypes.c_char_p, ctypes.c_size_t],
    _Ptr(Output))

_tag_from_original_text = _bind(
    'gumbo_tag_from_original_text', [_Ptr(StringPiece)], None)

_normalize_svg_tagname = _bind(
    'gumbo_normalize_svg_tagname', [_Ptr(StringPiece)], ctypes.c_char_p)

_destroy_output = _bind(
    'gumbo_destroy_output', [_Ptr(Options), _Ptr(Output)], None)

_tagname = _bind('gumbo_normalized_tagname', [Tag], ctypes.c_char_p)

_tag_enum = _bind('gumbo_tag_enum', [ctypes.c_char_p], Tag)

# Names exported by ``from gumboc import *``.
__all__ = [
    'StringPiece', 'SourcePosition', 'AttributeNamespace', 'Attribute',
    'Vector', 'AttributeVector', 'NodeVector', 'QuirksMode', 'Document',
    'Namespace', 'Tag', 'Element', 'Text', 'NodeType', 'Node',
    'Options', 'Output', 'parse',
]