comparison mupdf-source/thirdparty/gumbo-parser/examples/prettyprint.cc @ 2:b50eed0cc0ef upstream

ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4. The directory name has changed: no version number in the expanded directory now.
author Franz Glasner <fzglas.hg@dom66.de>
date Mon, 15 Sep 2025 11:43:07 +0200
parents
children
comparison
equal deleted inserted replaced
1:1d09e1dec1d9 2:b50eed0cc0ef
1 // Copyright 2015 Kevin B. Hendricks, Stratford, Ontario, All Rights Reserved.
2 // loosely based on a greatly simplified version of BeautifulSoup4 decode() routine
3 //
4 // Licensed under the Apache License, Version 2.0 (the "License");
5 // you may not use this file except in compliance with the License.
6 // You may obtain a copy of the License at
7 //
8 // http://www.apache.org/licenses/LICENSE-2.0
9 //
10 // Unless required by applicable law or agreed to in writing, software
11 // distributed under the License is distributed on an "AS IS" BASIS,
12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 // See the License for the specific language governing permissions and
14 // limitations under the License.
15 //
16 // Author: Kevin Hendricks
17 //
18 // Prettyprint back to html / xhtml
19
20 #include <fstream>
21 #include <iostream>
22 #include <stdlib.h>
23 #include <string>
24
25 #include "gumbo.h"
26
// Tag-name lookup tables.  Each table is a '|'-delimited list so membership
// can be tested with table.find("|" + tagname + "|").  They are read-only.

// elements that are emitted without indentation or surrounding newlines
static const std::string nonbreaking_inline = "|a|abbr|acronym|b|bdo|big|cite|code|dfn|em|font|i|img|kbd|nobr|s|small|span|strike|strong|sub|sup|tt|";
// elements serialized as <tag/> with no closing tag
static const std::string empty_tags = "|area|base|basefont|bgsound|br|command|col|embed|event-source|frame|hr|image|img|input|keygen|link|menuitem|meta|param|source|spacer|track|wbr|";
// elements whose text is not trimmed and whose whitespace-only children are kept
static const std::string preserve_whitespace = "|pre|textarea|script|style|";
// elements whose contents are right-trimmed and terminated by exactly one newline
static const std::string special_handling = "|html|body|";
// elements whose text and attribute values are emitted without entity substitution
static const std::string no_entity_sub = "|script|style|";
// block elements whose open tag, contents and close tag are kept on one line
static const std::string treat_like_inline = "|p|";
33
// Strip trailing spaces, tabs, CRs and LFs from s, in place.
// A string consisting only of such characters becomes empty.
static inline void rtrim(std::string &s)
{
  const std::string::size_type last_kept = s.find_last_not_of(" \n\r\t");
  if (last_kept == std::string::npos) {
    s.clear();
    return;
  }
  s.resize(last_kept + 1);
}
38
39
// Strip leading spaces, tabs, CRs and LFs from s, in place.
// A string consisting only of such characters becomes empty.
static inline void ltrim(std::string &s)
{
  const std::string::size_type first_kept = s.find_first_not_of(" \n\r\t");
  if (first_kept == std::string::npos) {
    s.clear();
    return;
  }
  s.erase(s.begin(), s.begin() + first_kept);
}
44
45
// Replace every non-overlapping occurrence of s1 in s with s2, in place.
// Text inserted by a replacement is never rescanned, so s2 may itself
// contain s1.  An empty s1 leaves s unchanged.
static void replace_all(std::string &s, const char * s1, const char * s2)
{
  const std::string needle(s1);
  if (needle.empty()) {
    // an empty pattern matches at every position and would never terminate
    return;
  }
  const std::string replacement(s2);
  size_t pos = s.find(needle);
  while (pos != std::string::npos) {
    s.replace(pos, needle.length(), replacement);
    // resume right after the inserted text: skipping by the pattern length
    // instead would miss matches (shorter replacement) or rescan the
    // replacement itself (longer replacement)
    pos = s.find(needle, pos + replacement.length());
  }
}
56
57
// Return a copy of text with the characters '&', '<' and '>' replaced by
// the entities "&amp;", "&lt;" and "&gt;".
static std::string substitute_xml_entities_into_text(const std::string &text)
{
  std::string escaped;
  escaped.reserve(text.length());
  for (std::string::size_type i = 0; i < text.length(); ++i) {
    const char ch = text[i];
    switch (ch) {
      case '&':
        escaped.append("&amp;");
        break;
      case '<':
        escaped.append("&lt;");
        break;
      case '>':
        escaped.append("&gt;");
        break;
      default:
        escaped.push_back(ch);
        break;
    }
  }
  return escaped;
}
67
68
69 static std::string substitute_xml_entities_into_attributes(char quote, const std::string &text)
70 {
71 std::string result = substitute_xml_entities_into_text(text);
72 if (quote == '"') {
73 replace_all(result,"\"","&quot;");
74 }
75 else if (quote == '\'') {
76 replace_all(result,"'","&apos;");
77 }
78 return result;
79 }
80
81
82 static std::string handle_unknown_tag(GumboStringPiece *text)
83 {
84 std::string tagname = "";
85 if (text->data == NULL) {
86 return tagname;
87 }
88 // work with copy GumboStringPiece to prevent asserts
89 // if try to read same unknown tag name more than once
90 GumboStringPiece gsp = *text;
91 gumbo_tag_from_original_text(&gsp);
92 tagname = std::string(gsp.data, gsp.length);
93 return tagname;
94 }
95
96
97 static std::string get_tag_name(GumboNode *node)
98 {
99 std::string tagname;
100 // work around lack of proper name for document node
101 if (node->type == GUMBO_NODE_DOCUMENT) {
102 tagname = "document";
103 } else {
104 tagname = gumbo_normalized_tagname(node->v.element.tag);
105 }
106 if (tagname.empty()) {
107 tagname = handle_unknown_tag(&node->v.element.original_tag);
108 }
109 return tagname;
110 }
111
112
113 static std::string build_doctype(GumboNode *node)
114 {
115 std::string results = "";
116 if (node->v.document.has_doctype) {
117 results.append("<!DOCTYPE ");
118 results.append(node->v.document.name);
119 std::string pi(node->v.document.public_identifier);
120 if ((node->v.document.public_identifier != NULL) && !pi.empty() ) {
121 results.append(" PUBLIC \"");
122 results.append(node->v.document.public_identifier);
123 results.append("\" \"");
124 results.append(node->v.document.system_identifier);
125 results.append("\"");
126 }
127 results.append(">\n");
128 }
129 return results;
130 }
131
132
133 static std::string build_attributes(GumboAttribute * at, bool no_entities)
134 {
135 std::string atts = "";
136 atts.append(" ");
137 atts.append(at->name);
138
139 // how do we want to handle attributes with empty values
140 // <input type="checkbox" checked /> or <input type="checkbox" checked="" />
141
142 if ( (!std::string(at->value).empty()) ||
143 (at->original_value.data[0] == '"') ||
144 (at->original_value.data[0] == '\'') ) {
145
146 // determine original quote character used if it exists
147 char quote = at->original_value.data[0];
148 std::string qs = "";
149 if (quote == '\'') qs = std::string("'");
150 if (quote == '"') qs = std::string("\"");
151
152 atts.append("=");
153
154 atts.append(qs);
155
156 if (no_entities) {
157 atts.append(at->value);
158 } else {
159 atts.append(substitute_xml_entities_into_attributes(quote, std::string(at->value)));
160 }
161
162 atts.append(qs);
163 }
164 return atts;
165 }
166
167
168 // forward declaration
169
170 static std::string prettyprint(GumboNode*, int lvl, const std::string indent_chars);
171
172
173 // prettyprint children of a node
174 // may be invoked recursively
175
176 static std::string prettyprint_contents(GumboNode* node, int lvl, const std::string indent_chars) {
177
178 std::string contents = "";
179 std::string tagname = get_tag_name(node);
180 std::string key = "|" + tagname + "|";
181 bool no_entity_substitution = no_entity_sub.find(key) != std::string::npos;
182 bool keep_whitespace = preserve_whitespace.find(key) != std::string::npos;
183 bool is_inline = nonbreaking_inline.find(key) != std::string::npos;
184 bool pp_okay = !is_inline && !keep_whitespace;
185
186 GumboVector* children = &node->v.element.children;
187
188 for (unsigned int i = 0; i < children->length; ++i) {
189 GumboNode* child = static_cast<GumboNode*> (children->data[i]);
190
191 if (child->type == GUMBO_NODE_TEXT) {
192
193 std::string val;
194
195 if (no_entity_substitution) {
196 val = std::string(child->v.text.text);
197 } else {
198 val = substitute_xml_entities_into_text(std::string(child->v.text.text));
199 }
200
201 if (pp_okay) rtrim(val);
202
203 if (pp_okay && (contents.length() == 0)) {
204 // add required indentation
205 char c = indent_chars.at(0);
206 int n = indent_chars.length();
207 contents.append(std::string((lvl-1)*n,c));
208 }
209
210 contents.append(val);
211
212
213 } else if ((child->type == GUMBO_NODE_ELEMENT) || (child->type == GUMBO_NODE_TEMPLATE)) {
214
215 std::string val = prettyprint(child, lvl, indent_chars);
216
217 // remove any indentation if this child is inline and not first child
218 std::string childname = get_tag_name(child);
219 std::string childkey = "|" + childname + "|";
220 if ((nonbreaking_inline.find(childkey) != std::string::npos) && (contents.length() > 0)) {
221 ltrim(val);
222 }
223
224 contents.append(val);
225
226 } else if (child->type == GUMBO_NODE_WHITESPACE) {
227
228 if (keep_whitespace || is_inline) {
229 contents.append(std::string(child->v.text.text));
230 }
231
232 } else if (child->type != GUMBO_NODE_COMMENT) {
233
234 // Does this actually exist: (child->type == GUMBO_NODE_CDATA)
235 fprintf(stderr, "unknown element of type: %d\n", child->type);
236
237 }
238
239 }
240
241 return contents;
242 }
243
244
245 // prettyprint a GumboNode back to html/xhtml
246 // may be invoked recursively
247
248 static std::string prettyprint(GumboNode* node, int lvl, const std::string indent_chars) {
249
250 // special case the document node
251 if (node->type == GUMBO_NODE_DOCUMENT) {
252 std::string results = build_doctype(node);
253 results.append(prettyprint_contents(node,lvl+1,indent_chars));
254 return results;
255 }
256
257 std::string close = "";
258 std::string closeTag = "";
259 std::string atts = "";
260 std::string tagname = get_tag_name(node);
261 std::string key = "|" + tagname + "|";
262 bool need_special_handling = special_handling.find(key) != std::string::npos;
263 bool is_empty_tag = empty_tags.find(key) != std::string::npos;
264 bool no_entity_substitution = no_entity_sub.find(key) != std::string::npos;
265 bool keep_whitespace = preserve_whitespace.find(key) != std::string::npos;
266 bool is_inline = nonbreaking_inline.find(key) != std::string::npos;
267 bool inline_like = treat_like_inline.find(key) != std::string::npos;
268 bool pp_okay = !is_inline && !keep_whitespace;
269 char c = indent_chars.at(0);
270 int n = indent_chars.length();
271
272 // build attr string
273 const GumboVector * attribs = &node->v.element.attributes;
274 for (int i=0; i< attribs->length; ++i) {
275 GumboAttribute* at = static_cast<GumboAttribute*>(attribs->data[i]);
276 atts.append(build_attributes(at, no_entity_substitution));
277 }
278
279 // determine closing tag type
280 if (is_empty_tag) {
281 close = "/";
282 } else {
283 closeTag = "</" + tagname + ">";
284 }
285
286 std::string indent_space = std::string((lvl-1)*n,c);
287
288 // prettyprint your contents
289 std::string contents = prettyprint_contents(node, lvl+1, indent_chars);
290
291 if (need_special_handling) {
292 rtrim(contents);
293 contents.append("\n");
294 }
295
296 char last_char = ' ';
297 if (!contents.empty()) {
298 last_char = contents.at(contents.length()-1);
299 }
300
301 // build results
302 std::string results;
303 if (pp_okay) {
304 results.append(indent_space);
305 }
306 results.append("<"+tagname+atts+close+">");
307 if (pp_okay && !inline_like) {
308 results.append("\n");
309 }
310 if (inline_like) {
311 ltrim(contents);
312 }
313 results.append(contents);
314 if (pp_okay && !contents.empty() && (last_char != '\n') && (!inline_like)) {
315 results.append("\n");
316 }
317 if (pp_okay && !inline_like && !closeTag.empty()) {
318 results.append(indent_space);
319 }
320 results.append(closeTag);
321 if (pp_okay && !closeTag.empty()) {
322 results.append("\n");
323 }
324
325 return results;
326 }
327
328
329 int main(int argc, char** argv) {
330 if (argc != 2) {
331 std::cout << "prettyprint <html filename>\n";
332 exit(EXIT_FAILURE);
333 }
334 const char* filename = argv[1];
335
336 std::ifstream in(filename, std::ios::in | std::ios::binary);
337 if (!in) {
338 std::cout << "File " << filename << " not found!\n";
339 exit(EXIT_FAILURE);
340 }
341
342 std::string contents;
343 in.seekg(0, std::ios::end);
344 contents.resize(in.tellg());
345 in.seekg(0, std::ios::beg);
346 in.read(&contents[0], contents.size());
347 in.close();
348
349 GumboOptions options = kGumboDefaultOptions;
350
351 GumboOutput* output = gumbo_parse_with_options(&options, contents.data(), contents.length());
352 std::string indent_chars = " ";
353 std::cout << prettyprint(output->document, 0, indent_chars) << std::endl;
354 gumbo_destroy_output(&kGumboDefaultOptions, output);
355 }