Mercurial > hgrepos > Python2 > PyMuPDF
comparison mupdf-source/thirdparty/gumbo-parser/examples/serialize.cc @ 2:b50eed0cc0ef upstream
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
The directory name has changed: no version number in the expanded directory now.
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Mon, 15 Sep 2025 11:43:07 +0200 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| 1:1d09e1dec1d9 | 2:b50eed0cc0ef |
|---|---|
| 1 // Copyright 2015 Kevin B. Hendricks, Stratford, Ontario, All Rights Reserved. | |
| 2 // loosely based on a greatly simplified version of BeautifulSoup4 decode() routine | |
| 3 // | |
| 4 // Licensed under the Apache License, Version 2.0 (the "License"); | |
| 5 // you may not use this file except in compliance with the License. | |
| 6 // You may obtain a copy of the License at | |
| 7 // | |
| 8 // http://www.apache.org/licenses/LICENSE-2.0 | |
| 9 // | |
| 10 // Unless required by applicable law or agreed to in writing, software | |
| 11 // distributed under the License is distributed on an "AS IS" BASIS, | |
| 12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
| 13 // See the License for the specific language governing permissions and | |
| 14 // limitations under the License. | |
| 15 // | |
| 16 // Author: Kevin Hendricks | |
| 17 // | |
| 18 // Serialize back to html / xhtml making as few changes as possible (even in whitespace) | |
| 19 | |
| 20 #include <fstream> | |
| 21 #include <iostream> | |
| 22 #include <stdlib.h> | |
| 23 #include <string> | |
| 24 | |
| 25 #include "gumbo.h" | |
| 26 | |
// Tag-name lookup tables.  Each table is one string in which every tag name
// is wrapped in '|' delimiters; membership is tested by searching the table
// for "|" + tagname + "|".

// inline-level tags
static std::string nonbreaking_inline(
    "|a|abbr|acronym|b|bdo|big|cite|code|dfn|em|font|i|img|kbd|nobr|s|small|span|strike|strong|sub|sup|tt|");

// tags serialized in self-closing form, without a closing tag
static std::string empty_tags(
    "|area|base|basefont|bgsound|br|command|col|embed|event-source|frame|hr|image|img|input|keygen|link|menuitem|meta|param|source|spacer|track|wbr|");

// tags whose whitespace is significant
static std::string preserve_whitespace("|pre|textarea|script|style|");

// tags whose contents are trimmed and placed on separate lines when serialized
static std::string special_handling("|html|body|");

// tags whose text and attribute values are emitted without entity substitution
static std::string no_entity_sub("|script|style|");
| 32 | |
| 33 | |
// Strip trailing spaces, newlines, carriage returns and tabs from s in place.
// A string made up only of those characters becomes empty.
static inline void rtrim(std::string &s)
{
  const std::string::size_type last_kept = s.find_last_not_of(" \n\r\t");
  if (last_kept == std::string::npos) {
    s.clear();
  } else {
    s.resize(last_kept + 1);
  }
}
| 38 | |
| 39 | |
// Strip leading spaces, newlines, carriage returns and tabs from s in place.
// A string made up only of those characters becomes empty.
static inline void ltrim(std::string &s)
{
  const std::string::size_type first_kept = s.find_first_not_of(" \n\r\t");
  if (first_kept == std::string::npos) {
    s.clear();
    return;
  }
  s.erase(0, first_kept);
}
| 44 | |
| 45 | |
// Replace every non-overlapping occurrence of s1 in s with s2, in place,
// scanning left to right.
//
// s1 and s2 must be valid NUL-terminated strings.  An empty s1 leaves s
// unchanged.  Text inserted by a replacement is never rescanned, so s2 may
// itself contain s1.
static void replace_all(std::string &s, const char * s1, const char * s2)
{
  const std::string needle(s1);
  if (needle.empty()) {
    // an empty needle matches at every position and would never terminate
    return;
  }
  const std::string replacement(s2);
  std::string::size_type pos = s.find(needle);
  while (pos != std::string::npos) {
    s.replace(pos, needle.length(), replacement);
    // resume right after the inserted text: skipping by the needle length
    // instead would either rescan the replacement or jump over input
    pos = s.find(needle, pos + replacement.length());
  }
}
| 56 | |
| 57 | |
// Return a copy of text with the characters that are special in XML/HTML
// text content replaced by entity references:
//   '&' -> "&amp;",  '<' -> "&lt;",  '>' -> "&gt;"
// The input is walked once, so the '&' introduced by an entity reference is
// never escaped a second time.
static std::string substitute_xml_entities_into_text(const std::string &text)
{
  std::string result;
  result.reserve(text.size());
  for (std::string::size_type i = 0; i < text.size(); ++i) {
    const char c = text[i];
    switch (c) {
      case '&':
        result.append("&amp;");
        break;
      case '<':
        result.append("&lt;");
        break;
      case '>':
        result.append("&gt;");
        break;
      default:
        result.push_back(c);
        break;
    }
  }
  return result;
}
| 67 | |
| 68 | |
| 69 static std::string substitute_xml_entities_into_attributes(char quote, const std::string &text) | |
| 70 { | |
| 71 std::string result = substitute_xml_entities_into_text(text); | |
| 72 if (quote == '"') { | |
| 73 replace_all(result,"\"","""); | |
| 74 } | |
| 75 else if (quote == '\'') { | |
| 76 replace_all(result,"'","'"); | |
| 77 } | |
| 78 return result; | |
| 79 } | |
| 80 | |
| 81 | |
| 82 static std::string handle_unknown_tag(GumboStringPiece *text) | |
| 83 { | |
| 84 std::string tagname = ""; | |
| 85 if (text->data == NULL) { | |
| 86 return tagname; | |
| 87 } | |
| 88 // work with copy GumboStringPiece to prevent asserts | |
| 89 // if try to read same unknown tag name more than once | |
| 90 GumboStringPiece gsp = *text; | |
| 91 gumbo_tag_from_original_text(&gsp); | |
| 92 tagname = std::string(gsp.data, gsp.length); | |
| 93 return tagname; | |
| 94 } | |
| 95 | |
| 96 | |
| 97 static std::string get_tag_name(GumboNode *node) | |
| 98 { | |
| 99 std::string tagname; | |
| 100 // work around lack of proper name for document node | |
| 101 if (node->type == GUMBO_NODE_DOCUMENT) { | |
| 102 tagname = "document"; | |
| 103 } else { | |
| 104 tagname = gumbo_normalized_tagname(node->v.element.tag); | |
| 105 } | |
| 106 if (tagname.empty()) { | |
| 107 tagname = handle_unknown_tag(&node->v.element.original_tag); | |
| 108 } | |
| 109 return tagname; | |
| 110 } | |
| 111 | |
| 112 | |
| 113 static std::string build_doctype(GumboNode *node) | |
| 114 { | |
| 115 std::string results = ""; | |
| 116 if (node->v.document.has_doctype) { | |
| 117 results.append("<!DOCTYPE "); | |
| 118 results.append(node->v.document.name); | |
| 119 std::string pi(node->v.document.public_identifier); | |
| 120 if ((node->v.document.public_identifier != NULL) && !pi.empty() ) { | |
| 121 results.append(" PUBLIC \""); | |
| 122 results.append(node->v.document.public_identifier); | |
| 123 results.append("\" \""); | |
| 124 results.append(node->v.document.system_identifier); | |
| 125 results.append("\""); | |
| 126 } | |
| 127 results.append(">\n"); | |
| 128 } | |
| 129 return results; | |
| 130 } | |
| 131 | |
| 132 | |
| 133 static std::string build_attributes(GumboAttribute * at, bool no_entities) | |
| 134 { | |
| 135 std::string atts = " "; | |
| 136 atts.append(at->name); | |
| 137 | |
| 138 // how do we want to handle attributes with empty values | |
| 139 // <input type="checkbox" checked /> or <input type="checkbox" checked="" /> | |
| 140 | |
| 141 if ( (!std::string(at->value).empty()) || | |
| 142 (at->original_value.data[0] == '"') || | |
| 143 (at->original_value.data[0] == '\'') ) { | |
| 144 | |
| 145 // determine original quote character used if it exists | |
| 146 char quote = at->original_value.data[0]; | |
| 147 std::string qs = ""; | |
| 148 if (quote == '\'') qs = std::string("'"); | |
| 149 if (quote == '"') qs = std::string("\""); | |
| 150 atts.append("="); | |
| 151 atts.append(qs); | |
| 152 if (no_entities) { | |
| 153 atts.append(at->value); | |
| 154 } else { | |
| 155 atts.append(substitute_xml_entities_into_attributes(quote, std::string(at->value))); | |
| 156 } | |
| 157 atts.append(qs); | |
| 158 } | |
| 159 return atts; | |
| 160 } | |
| 161 | |
| 162 | |
| 163 // forward declaration | |
| 164 static std::string serialize(GumboNode*); | |
| 165 | |
| 166 | |
| 167 // serialize children of a node | |
| 168 // may be invoked recursively | |
| 169 | |
| 170 static std::string serialize_contents(GumboNode* node) { | |
| 171 std::string contents = ""; | |
| 172 std::string tagname = get_tag_name(node); | |
| 173 std::string key = "|" + tagname + "|"; | |
| 174 bool no_entity_substitution = no_entity_sub.find(key) != std::string::npos; | |
| 175 bool keep_whitespace = preserve_whitespace.find(key) != std::string::npos; | |
| 176 bool is_inline = nonbreaking_inline.find(key) != std::string::npos; | |
| 177 | |
| 178 // build up result for each child, recursively if need be | |
| 179 GumboVector* children = &node->v.element.children; | |
| 180 | |
| 181 for (unsigned int i = 0; i < children->length; ++i) { | |
| 182 GumboNode* child = static_cast<GumboNode*> (children->data[i]); | |
| 183 | |
| 184 if (child->type == GUMBO_NODE_TEXT) { | |
| 185 if (no_entity_substitution) { | |
| 186 contents.append(std::string(child->v.text.text)); | |
| 187 } else { | |
| 188 contents.append(substitute_xml_entities_into_text(std::string(child->v.text.text))); | |
| 189 } | |
| 190 | |
| 191 } else if (child->type == GUMBO_NODE_ELEMENT || child->type == GUMBO_NODE_TEMPLATE) { | |
| 192 contents.append(serialize(child)); | |
| 193 | |
| 194 } else if (child->type == GUMBO_NODE_WHITESPACE) { | |
| 195 // keep all whitespace to keep as close to original as possible | |
| 196 contents.append(std::string(child->v.text.text)); | |
| 197 | |
| 198 } else if (child->type != GUMBO_NODE_COMMENT) { | |
| 199 // Does this actually exist: (child->type == GUMBO_NODE_CDATA) | |
| 200 fprintf(stderr, "unknown element of type: %d\n", child->type); | |
| 201 } | |
| 202 } | |
| 203 return contents; | |
| 204 } | |
| 205 | |
| 206 | |
| 207 // serialize a GumboNode back to html/xhtml | |
| 208 // may be invoked recursively | |
| 209 | |
| 210 static std::string serialize(GumboNode* node) { | |
| 211 // special case the document node | |
| 212 if (node->type == GUMBO_NODE_DOCUMENT) { | |
| 213 std::string results = build_doctype(node); | |
| 214 results.append(serialize_contents(node)); | |
| 215 return results; | |
| 216 } | |
| 217 | |
| 218 std::string close = ""; | |
| 219 std::string closeTag = ""; | |
| 220 std::string atts = ""; | |
| 221 std::string tagname = get_tag_name(node); | |
| 222 std::string key = "|" + tagname + "|"; | |
| 223 bool need_special_handling = special_handling.find(key) != std::string::npos; | |
| 224 bool is_empty_tag = empty_tags.find(key) != std::string::npos; | |
| 225 bool no_entity_substitution = no_entity_sub.find(key) != std::string::npos; | |
| 226 bool is_inline = nonbreaking_inline.find(key) != std::string::npos; | |
| 227 | |
| 228 // build attr string | |
| 229 const GumboVector * attribs = &node->v.element.attributes; | |
| 230 for (int i=0; i< attribs->length; ++i) { | |
| 231 GumboAttribute* at = static_cast<GumboAttribute*>(attribs->data[i]); | |
| 232 atts.append(build_attributes(at, no_entity_substitution)); | |
| 233 } | |
| 234 | |
| 235 // determine closing tag type | |
| 236 if (is_empty_tag) { | |
| 237 close = "/"; | |
| 238 } else { | |
| 239 closeTag = "</" + tagname + ">"; | |
| 240 } | |
| 241 | |
| 242 // serialize your contents | |
| 243 std::string contents = serialize_contents(node); | |
| 244 | |
| 245 if (need_special_handling) { | |
| 246 ltrim(contents); | |
| 247 rtrim(contents); | |
| 248 contents.append("\n"); | |
| 249 } | |
| 250 | |
| 251 // build results | |
| 252 std::string results; | |
| 253 results.append("<"+tagname+atts+close+">"); | |
| 254 if (need_special_handling) results.append("\n"); | |
| 255 results.append(contents); | |
| 256 results.append(closeTag); | |
| 257 if (need_special_handling) results.append("\n"); | |
| 258 return results; | |
| 259 } | |
| 260 | |
| 261 | |
| 262 int main(int argc, char** argv) { | |
| 263 if (argc != 2) { | |
| 264 std::cout << "serialize <html filename>\n"; | |
| 265 exit(EXIT_FAILURE); | |
| 266 } | |
| 267 const char* filename = argv[1]; | |
| 268 | |
| 269 std::ifstream in(filename, std::ios::in | std::ios::binary); | |
| 270 if (!in) { | |
| 271 std::cout << "File " << filename << " not found!\n"; | |
| 272 exit(EXIT_FAILURE); | |
| 273 } | |
| 274 | |
| 275 std::string contents; | |
| 276 in.seekg(0, std::ios::end); | |
| 277 contents.resize(in.tellg()); | |
| 278 in.seekg(0, std::ios::beg); | |
| 279 in.read(&contents[0], contents.size()); | |
| 280 in.close(); | |
| 281 | |
| 282 GumboOptions options = kGumboDefaultOptions; | |
| 283 | |
| 284 GumboOutput* output = gumbo_parse_with_options(&options, contents.data(), contents.length()); | |
| 285 std::cout << serialize(output->document) << std::endl; | |
| 286 gumbo_destroy_output(&kGumboDefaultOptions, output); | |
| 287 } |
