Mercurial > hgrepos > Python2 > PyMuPDF
comparison mupdf-source/thirdparty/gumbo-parser/python/gumbo/soup_adapter.py @ 2:b50eed0cc0ef upstream
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
The directory name has changed: no version number in the expanded directory now.
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Mon, 15 Sep 2025 11:43:07 +0200 |
| parents | |
| children | |
comparison
equal
deleted
inserted
replaced
| 1:1d09e1dec1d9 | 2:b50eed0cc0ef |
|---|---|
| 1 # Copyright 2012 Google Inc. All Rights Reserved. | |
| 2 # | |
| 3 # Licensed under the Apache License, Version 2.0 (the "License"); | |
| 4 # you may not use this file except in compliance with the License. | |
| 5 # You may obtain a copy of the License at | |
| 6 # | |
| 7 # http://www.apache.org/licenses/LICENSE-2.0 | |
| 8 # | |
| 9 # Unless required by applicable law or agreed to in writing, software | |
| 10 # distributed under the License is distributed on an "AS IS" BASIS, | |
| 11 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
| 12 # See the License for the specific language governing permissions and | |
| 13 # limitations under the License. | |
| 14 # | |
| 15 | |
| 16 """Adapter between Gumbo and BeautifulSoup. | |
| 17 | |
| 18 This parses an HTML document and gives back a BeautifulSoup object, which you | |
| 19 can then manipulate like a normal BeautifulSoup parse tree. | |
| 20 """ | |
| 21 | |
| 22 __author__ = 'jdtang@google.com (Jonathan Tang)' | |
| 23 | |
| 24 import BeautifulSoup | |
| 25 | |
| 26 import gumboc | |
| 27 | |
| 28 | |
def _utf8(text):
    """Decode a UTF-8 encoded byte string into unicode text.

    Byte sequences that are not valid UTF-8 are handled by the codec's
    'replace' error handler instead of raising an exception.
    """
    decoded = text.decode('utf-8', 'replace')
    return decoded
| 31 | |
| 32 | |
def _add_source_info(obj, original_text, start_pos, end_pos):
    """Annotate obj with the source text and position it was parsed from.

    Sets obj.original (the str() of original_text) and obj.line, obj.col and
    obj.offset from start_pos.  When end_pos is truthy, obj.end_line,
    obj.end_col and obj.end_offset are set from it as well; otherwise no
    end_* attributes are written.
    """
    obj.original = str(original_text)

    # Start position is always recorded.
    obj.line = start_pos.line
    obj.col = start_pos.column
    obj.offset = start_pos.offset

    # Callers pass a falsy end_pos (e.g. None) when there is no end position.
    if not end_pos:
        return

    obj.end_line = end_pos.line
    obj.end_col = end_pos.column
    obj.end_offset = end_pos.offset
| 42 | |
| 43 | |
def _convert_attrs(attrs):
    """Return a list of (name, value) unicode pairs, one per attribute.

    Each item of attrs must expose .name and .value; both are decoded with
    _utf8.  The input order is preserved.
    """
    # TODO(jdtang): Ideally attributes would pass along their positions as
    # well, but I can't extend the built in str objects with new attributes.
    # Maybe work around this with a subclass in some way...
    pairs = []
    for attr in attrs:
        pairs.append((_utf8(attr.name), _utf8(attr.value)))
    return pairs
| 49 | |
| 50 | |
def _add_document(soup, element):
    """Handler for document nodes; intentionally does nothing.

    Currently ignored, since there's no real place for this in the
    BeautifulSoup API.  Always returns None.
    """
    return None
| 55 | |
| 56 | |
def _add_element(soup, element):
    """Convert an element node into a BeautifulSoup.Tag and return it.

    The tag is created from the element's decoded tag name and attributes,
    each child of the element is converted through _add_node and appended,
    and source information (original tag text, start/end positions and the
    original end tag text) is attached to the resulting tag.
    """
    # TODO(jdtang): Expose next/previous in gumbo so they can be passed along
    # to BeautifulSoup.
    name = _utf8(element.tag_name)
    attrs = _convert_attrs(element.attributes)
    tag = BeautifulSoup.Tag(soup, name, attrs)

    # Children are converted recursively, in document order.
    for child in element.children:
        converted = _add_node(soup, child)
        tag.append(converted)

    _add_source_info(
        tag, element.original_tag, element.start_pos, element.end_pos)
    tag.original_end_tag = str(element.original_end_tag)
    return tag
| 68 | |
| 69 | |
def _add_text(cls):
    """Build a node handler that wraps text content in an instance of cls.

    The returned handler takes (soup, element), constructs cls from the
    decoded element.text, attaches the original text and start position via
    _add_source_info (no end position is recorded), and returns the new
    object.  The soup argument is accepted only to match the handler
    signature and is not used.
    """
    def add_text_internal(soup, element):
        node = cls(_utf8(element.text))
        _add_source_info(node, element.original_text, element.start_pos, None)
        return node

    return add_text_internal
| 76 | |
| 77 | |
# Dispatch table used by _add_node: the handler at index i converts nodes
# whose node.type.value == i.
# NOTE(review): the order presumably mirrors gumbo's node type enum
# (document, element, text, CDATA, comment, whitespace, template) -- confirm
# against gumboc before reordering.
_HANDLERS = [
    _add_document,                             # 0
    _add_element,                              # 1
    _add_text(BeautifulSoup.NavigableString),  # 2
    _add_text(BeautifulSoup.CData),            # 3
    _add_text(BeautifulSoup.Comment),          # 4
    _add_text(BeautifulSoup.NavigableString),  # 5
    _add_element,                              # 6
]
| 87 | |
| 88 | |
def _add_node(soup, node):
    """Convert a single node by dispatching on its type.

    Looks up the handler in _HANDLERS by node.type.value and calls it with
    the soup and node.contents, returning whatever the handler returns.
    """
    handler = _HANDLERS[node.type.value]
    return handler(soup, node.contents)
| 91 | |
| 92 | |
def _add_next_prev_pointers(soup):
    """Set .next / .previous on every node reachable from soup.

    All nodes (soup itself and its descendants via .contents) are collected,
    ordered by their .offset attribute, and chained together: each node's
    .next is the following node and each node's .previous is the preceding
    one.  The first node gets previous = None and the last gets next = None.

    The earlier implementation iterated over nodes[1:-1] only, which left
    the final link (second-to-last .next / last .previous) unset, and linked
    nothing at all when there were exactly two nodes.
    """
    def _traverse(node):
        # .findAll requires the .next pointer, which is what we're trying to
        # add when we call this, and so we manually supply a generator to
        # yield the nodes in DOM order.
        yield node
        try:
            children = node.contents
        except AttributeError:
            # Not an element.
            return
        for child in children:
            for descendant in _traverse(child):
                yield descendant

    nodes = sorted(_traverse(soup), key=lambda node: node.offset)
    if not nodes:
        return

    nodes[0].previous = None
    nodes[-1].next = None
    # Link every adjacent pair, including the last one.
    for before, after in zip(nodes, nodes[1:]):
        before.next = after
        after.previous = before
| 114 | |
| 115 | |
def parse(text, **kwargs):
    """Parse text with gumbo and return the result as a BeautifulSoup object.

    text and any keyword arguments are forwarded unchanged to gumboc.parse.
    The root node of the parse output is converted through _add_node and
    appended to a fresh BeautifulSoup object, after which next/previous
    pointers are filled in.
    """
    with gumboc.parse(text, **kwargs) as output:
        soup = BeautifulSoup.BeautifulSoup()
        root_node = output.contents.root.contents
        soup.append(_add_node(soup, root_node))
        _add_next_prev_pointers(soup)
        return soup
