comparison mupdf-source/thirdparty/gumbo-parser/python/gumbo/soup_adapter.py @ 2:b50eed0cc0ef upstream

ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4. The directory name has changed: no version number in the expanded directory now.
author Franz Glasner <fzglas.hg@dom66.de>
date Mon, 15 Sep 2025 11:43:07 +0200
parents
children
comparison
equal deleted inserted replaced
1:1d09e1dec1d9 2:b50eed0cc0ef
1 # Copyright 2012 Google Inc. All Rights Reserved.
2 #
3 # Licensed under the Apache License, Version 2.0 (the "License");
4 # you may not use this file except in compliance with the License.
5 # You may obtain a copy of the License at
6 #
7 # http://www.apache.org/licenses/LICENSE-2.0
8 #
9 # Unless required by applicable law or agreed to in writing, software
10 # distributed under the License is distributed on an "AS IS" BASIS,
11 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 # See the License for the specific language governing permissions and
13 # limitations under the License.
14 #
15
16 """Adapter between Gumbo and BeautifulSoup.
17
18 This parses an HTML document and gives back a BeautifulSoup object, which you
19 can then manipulate like a normal BeautifulSoup parse tree.
20 """
21
22 __author__ = 'jdtang@google.com (Jonathan Tang)'
23
24 import BeautifulSoup
25
26 import gumboc
27
28
def _utf8(text):
    """Decode *text* as UTF-8 and return the decoded result.

    Byte sequences that are not valid UTF-8 are substituted rather than
    raising, because the 'replace' error handler is used.
    """
    decoded = text.decode('utf-8', 'replace')
    return decoded
31
32
def _add_source_info(obj, original_text, start_pos, end_pos):
    """Record source-position metadata as attributes on *obj*.

    Args:
      obj: The object to annotate; attributes are assigned on it directly.
      original_text: The original source text; stored as ``str(original_text)``
        in ``obj.original``.
      start_pos: Object with ``line``, ``column`` and ``offset`` attributes,
        copied to ``obj.line``, ``obj.col`` and ``obj.offset``.
      end_pos: Optional object with the same three attributes, copied to
        ``obj.end_line``, ``obj.end_col`` and ``obj.end_offset``. When it is
        falsy (e.g. None) no end attributes are set.
    """
    obj.original = str(original_text)
    obj.line = start_pos.line
    obj.col = start_pos.column
    obj.offset = start_pos.offset
    if not end_pos:
        # No end position available: leave the end_* attributes unset.
        return
    obj.end_line = end_pos.line
    obj.end_col = end_pos.column
    obj.end_offset = end_pos.offset
42
43
def _convert_attrs(attrs):
    """Return a list of decoded ``(name, value)`` pairs, one per attribute.

    Each item of *attrs* must expose ``name`` and ``value``; both are passed
    through ``_utf8``. The input order is preserved.
    """
    # TODO(jdtang): Ideally attributes would pass along their positions as
    # well, but I can't extend the built-in str objects with new attributes.
    # Maybe work around this with a subclass in some way...
    pairs = []
    for attr in attrs:
        pairs.append((_utf8(attr.name), _utf8(attr.value)))
    return pairs
49
50
def _add_document(soup, element):
    """Handler for document nodes; intentionally does nothing.

    Document nodes are currently ignored, since there's no real place for
    them in the BeautifulSoup API. Both arguments are unused and the
    function returns None.
    """
55
56
def _add_element(soup, element):
    """Convert a Gumbo element into a ``BeautifulSoup.Tag``.

    The tag is created from the element's decoded tag name and attributes,
    every child of the element is converted with ``_add_node`` and appended,
    and the element's original tag text and start/end positions are recorded
    on the tag. The original end tag is stored, as a str, in
    ``tag.original_end_tag``.

    Args:
      soup: The soup object handed to ``BeautifulSoup.Tag`` and to the
        converters of the child nodes.
      element: The Gumbo element to convert.

    Returns:
      The newly built tag.
    """
    # TODO(jdtang): Expose next/previous in gumbo so they can be passed along
    # to BeautifulSoup.
    name = _utf8(element.tag_name)
    attributes = _convert_attrs(element.attributes)
    tag = BeautifulSoup.Tag(soup, name, attributes)

    for child in element.children:
        converted = _add_node(soup, child)
        tag.append(converted)

    _add_source_info(
        tag, element.original_tag, element.start_pos, element.end_pos)
    tag.original_end_tag = str(element.original_end_tag)
    return tag
68
69
def _add_text(cls):
    """Return a converter that wraps a text-like Gumbo node in *cls*.

    The returned callable takes ``(soup, element)``, decodes ``element.text``
    with ``_utf8``, instantiates *cls* with the decoded text, and records the
    node's original text and start position on the new object (no end
    position is recorded). The ``soup`` argument is accepted but not used.
    """
    def add_text_internal(soup, element):
        decoded = _utf8(element.text)
        node = cls(decoded)
        _add_source_info(node, element.original_text, element.start_pos, None)
        return node

    return add_text_internal
76
77
# Converter dispatch table, indexed by ``node.type.value`` (see _add_node).
# The position of every entry is significant.
# NOTE(review): the order presumably mirrors gumbo's node-type enum
# (document, element, text, CDATA, comment, whitespace, template) -- confirm
# against gumboc before reordering or extending.
_HANDLERS = [_add_document, _add_element]  # indices 0 and 1
_HANDLERS.extend(  # indices 2 through 5: one text converter per class
    _add_text(text_cls)
    for text_cls in (
        BeautifulSoup.NavigableString,
        BeautifulSoup.CData,
        BeautifulSoup.Comment,
        BeautifulSoup.NavigableString,
    )
)
_HANDLERS.append(_add_element)  # index 6
87
88
def _add_node(soup, node):
    """Convert *node* with the handler registered for its type.

    The handler is looked up in ``_HANDLERS`` by ``node.type.value`` and is
    called with *soup* and ``node.contents``; its result is returned.
    """
    handler = _HANDLERS[node.type.value]
    return handler(soup, node.contents)
91
92
def _add_next_prev_pointers(soup):
    """Thread ``next``/``previous`` pointers through every node under *soup*.

    All nodes reachable from *soup* through ``contents`` (including *soup*
    itself) are collected, ordered by their ``offset`` attribute, and linked
    into a doubly linked chain: each node's ``next`` is the following node
    and each node's ``previous`` is the preceding one. The first node gets
    ``previous = None`` and the last node gets ``next = None``.

    Every consecutive pair is linked. An earlier version iterated over
    ``nodes[1:-1]`` and therefore never set ``next`` on the second-to-last
    node nor ``previous`` on the last node.

    Args:
      soup: Root of the tree to process. Nodes without a ``contents``
        attribute are treated as leaves.
    """
    def _traverse(node):
        # .findAll requires the .next pointer, which is what we're trying to
        # add when we call this, and so we manually supply a generator to
        # yield the nodes in DOM order.
        yield node
        try:
            children = node.contents
        except AttributeError:
            # Not an element.
            return
        for child in children:
            for descendant in _traverse(child):
                yield descendant

    nodes = sorted(_traverse(soup), key=lambda node: node.offset)
    if not nodes:
        return

    nodes[0].previous = None
    nodes[-1].next = None
    # Link each node to its successor, including the final pair.
    for before, after in zip(nodes, nodes[1:]):
        before.next = after
        after.previous = before
114
115
def parse(text, **kwargs):
    """Parse *text* with Gumbo and return the result as a BeautifulSoup tree.

    Args:
      text: The input to parse; handed to ``gumboc.parse`` unchanged.
      **kwargs: Extra keyword arguments forwarded to ``gumboc.parse``.

    Returns:
      A ``BeautifulSoup.BeautifulSoup`` object to which the converted root
      node has been appended, with next/previous pointers added by
      ``_add_next_prev_pointers``.
    """
    with gumboc.parse(text, **kwargs) as parsed:
        document = BeautifulSoup.BeautifulSoup()
        root = parsed.contents.root.contents
        document.append(_add_node(document, root))
        _add_next_prev_pointers(document)
        # The tree is fully built inside the context manager, so it is
        # returned from within the ``with`` block.
        return document