Mercurial > hgrepos > Python2 > PyMuPDF
comparison mupdf-source/thirdparty/zint/docs/zint_org_uk.py @ 2:b50eed0cc0ef upstream
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
The directory name has changed: no version number in the expanded directory now.
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Mon, 15 Sep 2025 11:43:07 +0200 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| 1:1d09e1dec1d9 | 2:b50eed0cc0ef |
|---|---|
| 1 # This script takes the output from pandoc and converts it into the format needed by | |
| 2 # the website at Zint.org.uk | |
| 3 # | |
| 4 # Warning: This code is ugly... but it saves days of manual effort updating the website. | |
| 5 # | |
| 6 # Copyright (C) 2022 <rstuart114@gmail.com> | |
| 7 | |
| 8 # Works out which tags should influence indentation and puts them on their own line | |
def isolate_tag(tag):
    """Append *tag* to the global ``stage`` buffer, isolating it if indentable.

    A tag is "indentable" unless it contains one of the substrings listed in
    the module-level ``indent_skip``.  Indentable tags are written on a line
    of their own (a newline before and after); every other tag is appended
    unchanged so that it stays inline with the surrounding text.

    Opening and closing tags are handled in exactly the same way.
    """
    global stage

    indentable_tag = not any(keyword in tag for keyword in indent_skip)

    if indentable_tag:
        stage += "\n" + tag + "\n"
    else:
        stage += tag
| 33 | |
| 34 # Add the right amount of indentation (indentation level x 4 spaces) | |
def add_indent():
    """Return the leading whitespace for the current nesting depth.

    Reads the module-level ``indentation`` counter and returns four spaces
    per level.  A depth of zero or less yields an empty string.
    """
    # Four spaces per level, as stated by the comment that accompanies this
    # function; repeating a string by a non-positive count gives "".
    return "    " * indentation
| 43 | |
| 44 # Apply indentation to text | |
def with_indent(text):
    """Return *text* with the current indent inserted after each newline.

    The indent returned by ``add_indent()`` is placed after every newline
    except one that is the very last character of *text*; that trailing
    newline is left bare.  Text without an inner newline (including the
    empty string) is returned unchanged.
    """
    # Keep the final character out of the replacement so that a trailing
    # newline never receives an indent.
    body, last = text[:-1], text[-1:]
    if "\n" not in body:
        return text

    return body.replace("\n", "\n" + add_indent()) + last
| 61 | |
| 62 # Read file and pull some tags onto their own lines for later processing | |
# Pass 1: read manual.html and copy it into `stage`, letting isolate_tag()
# decide for every tag whether it goes on a line of its own.
manual = ""
tag = False        # True while the characters being read belong to a tag
tag_buffer = ""    # Characters of the tag currently being read
text_buffer = ""   # Text collected since the last completed tag
stage = ""         # Output of the pass currently running
indent_skip = ['img', 'code', 'pre', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'span', '<a', '</a', 'sup', '<col', '</col', '<hr', 'div']

print("Reading... manual.html")
# The with-statement closes the file; no explicit close() is required.
with open('manual.html') as f:
    manual = f.read()

for c in manual:
    if c == '<':
        # A tag starts: flush the text collected so far
        stage += text_buffer
        tag = True
        tag_buffer = ""

    if tag:
        tag_buffer += c
    else:
        text_buffer += c

    if c == '>':
        # Flatten tags that are spread over several lines before storing them
        tag_buffer = tag_buffer.replace("\n", " ")
        isolate_tag(tag_buffer)
        tag = False
        text_buffer = ""

# NOTE: text following the final '>' is not flushed here; it stays in
# text_buffer, which the next pass continues to use.
manual = stage
stage = ""
| 94 | |
print("Adjusting HTML")
# Pass 2: change the guts of the HTML tags - drop the ones the website does
# not need, strip attributes from others and rename headers and classes.

# A tag containing any of these substrings is dropped
unwanted_markers = ('span', 'div', '<col', '</col')

# Tags that lose their attributes: (substring to look for, bare replacement)
bare_tags = (
    ('<pre', '<pre>'),
    ('<table', '<table>'),
    ('<tr', '<tr>'),
    ('<td', '<td>'),
    ('<th ', '<th>'),
)

# Bump all headers up one level; deepest level first so that no header is
# renamed twice
header_bumps = (
    ('<h6', '<h7'),
    ('</h6', '</h7'),
    ('<h5', '<h6'),
    ('</h5', '</h6'),
    ('<h4', '<h5'),
    ('</h4', '</h5'),
    ('<h3', '<h4'),
    ('</h3', '</h4'),
    ('<h2', '<h3'),
    ('</h2', '</h3'),
    ('<h1', '<h2'),
    ('</h1', '</h2'),
)

# Class names for code snippets, then the location of images
attribute_rewrites = (
    ('class="sourceCode bash"', 'class="language-bash"'),
    ('class="sourceCode c"', 'class="language-cpp"'),
    ('src="images/', 'src="/images/manual/'),
)

in_dd = False          # Between <dd> and </dd>
to_remove = False      # The tag being read will not be copied to the output
remove_next = False    # The tag after the current one is dropped as well
span_literal = False   # A <code> was turned into <span>; its </code> must follow suit
for c in manual:
    if c == '<':
        # Remove "{#tbl:" table identifiers: keep only the slice that starts
        # 7 characters after 'tag=' and stops 3 characters before the end,
        # flattened to a single line and placed on a line of its own
        if '{#tbl:' in text_buffer:
            caption_start = text_buffer.index('tag=') + 7
            text_buffer = text_buffer[caption_start:-3]
            text_buffer = '\n' + text_buffer.replace('\n', ' ') + '\n'

        # Remove "{@tbl:" table references
        if 'tbl:' in text_buffer:
            text_buffer = ''

        stage += text_buffer
        tag = True
        tag_buffer = ""
        to_remove = False

    if tag:
        tag_buffer += c
    else:
        text_buffer += c

    if c == '>':
        # Remove some tags which aren't needed on website
        if any(marker in tag_buffer for marker in unwanted_markers):
            to_remove = True

        if remove_next:
            to_remove = True
            remove_next = False

        # Hidden anchors and in-page links are dropped together with the tag
        # that follows them
        hidden_anchor = ('a href' in tag_buffer) and ('aria-hidden="true"' in tag_buffer)
        if hidden_anchor or ('<a href="#' in tag_buffer):
            to_remove = True
            remove_next = True

        # Don't allow <p> and </p> between <dd> and </dd>
        if tag_buffer == "<dd>":
            in_dd = True
        elif tag_buffer == "</dd>":
            in_dd = False

        if in_dd and tag_buffer in ('<p>', '</p>'):
            to_remove = True

        # Remove attributes for some tags
        for marker, bare in bare_tags:
            if marker in tag_buffer:
                tag_buffer = bare

        for old, new in header_bumps:
            tag_buffer = tag_buffer.replace(old, new)

        for old, new in attribute_rewrites:
            tag_buffer = tag_buffer.replace(old, new)

        # Change <code> without language to <span>
        if tag_buffer == '<code>':
            tag_buffer = '<span class="literal">'
            span_literal = True

        if span_literal and tag_buffer == '</code>':
            tag_buffer = '</span>'
            span_literal = False

        if not to_remove:
            stage += tag_buffer
        tag = False
        text_buffer = ""

manual = stage
stage = ""
| 214 | |
print("Removing empty lines")
# Pass 3: collapse runs of newlines into one, except between <pre> and </pre>
last_char = ''
in_pre = False
for c in manual:
    if c == '<':
        tag = True
        tag_buffer = ""

    if tag:
        tag_buffer += c
    else:
        text_buffer += c

    if c == '>':
        # Track whether we are inside a <pre> segment
        if "<pre" in tag_buffer:
            in_pre = True
        if "</pre" in tag_buffer:
            in_pre = False
        tag = False
        text_buffer = ""

    # Every character is copied, apart from a newline that directly follows
    # another newline outside of <pre>
    if c != '\n' or last_char != '\n' or in_pre:
        stage += c
    last_char = c

manual = stage
stage = ""
| 246 | |
print("Applying indentation")
# Pass 4: indent the code to make it easier to read, wrap the content in
# <section> elements and mark with <page> where the output is to be split.
indentation = 1
in_pre = False
paragraph_block = False   # The section currently open is a plain "container"
document_start = True     # No section has been opened yet
chapter_six = False
last_char = ''
for c in manual:
    if c == '<':
        # Fix 'floating' full stops
        text_buffer = text_buffer.replace(' . ', '. ')

        # Apply indentation to text, but leave <pre> content untouched
        stage += text_buffer if in_pre else with_indent(text_buffer)
        tag = True
        tag_buffer = ""

    if tag:
        tag_buffer += c
    elif c == '}' and last_char == '{':
        # Strip '{}' from already removed table references
        text_buffer = text_buffer[:-1]
    else:
        text_buffer += c
    # NOTE(review): the nesting level of this assignment was reconstructed
    # from a listing without indentation - confirm against the upstream file.
    last_char = c

    if c == '>':
        indentable_tag = not any(keyword in tag_buffer for keyword in indent_skip)

        # Protect the indentation in <pre> segments
        if '<pre' in tag_buffer:
            in_pre = True
        if '</pre' in tag_buffer:
            in_pre = False

        # Chapter 6 requires special treatment - detect beginning and end
        if 'id="types-of-symbology"' in tag_buffer:
            chapter_six = True
        if 'id="legal-and-version-information"' in tag_buffer:
            chapter_six = False

        if '</' in tag_buffer:
            # Close tag: step back out before the tag itself is written
            if indentable_tag:
                indentation -= 1
            if indentable_tag or text_buffer.endswith('\n'):
                stage += add_indent()
            stage += tag_buffer
        else:
            # The depth does not change until the tag has been written
            top_level = (indentation == 1)

            # Split into sections
            if top_level and ('<p' in tag_buffer) and not paragraph_block:
                if document_start:
                    document_start = False
                else:
                    stage += '</section>\n'
                stage += '<section class="container">\n'
                paragraph_block = True

            # Handle headers but also decide where to split into multiple
            # HTML files and mark with <page>
            if top_level:
                if '<h2' in tag_buffer:
                    if document_start:
                        document_start = False
                    else:
                        stage += '</section>\n'
                        stage += '<page>\n'
                    stage += '<section class="container">\n'
                    paragraph_block = True
                elif ('<h3' in tag_buffer) and chapter_six:
                    stage += '</section>\n'
                    stage += '<page>\n'
                    stage += '<section class="container">\n'
                    paragraph_block = True
                elif ('<h' in tag_buffer) and not paragraph_block:
                    stage += '</section>\n'
                    stage += '<section class="container">\n'
                    paragraph_block = True

            # <dl> section has its own class
            if top_level and ('<dl' in tag_buffer):
                stage += '</section>\n'
                stage += '<section class="definition-list container">\n'
                paragraph_block = False

            # <table> section has its own class
            if top_level and ('<table' in tag_buffer):
                stage += '</section>\n'
                stage += '<section class="table">\n'
                paragraph_block = False

            # Open tag: write it at the current depth, then step in
            if indentable_tag or text_buffer.endswith('\n'):
                stage += add_indent()
            stage += tag_buffer
            if indentable_tag:
                indentation += 1
        tag = False
        text_buffer = ""

stage += '\n</section>\n'
manual = stage
stage = ""
| 367 | |
| 368 # Remove <h2> data and split into output files | |
# Pass 5: drop the <h2> headers together with their text and write the
# result to one file per <page> marker.
out_filenames = [
    'chapter1.html', 'chapter2.html', 'chapter3.html', 'chapter4.html', 'chapter5.html',
    'chapter6.0.html', 'chapter6.1.html', 'chapter6.2.html', 'chapter6.3.html', 'chapter6.4.html',
    'chapter6.5.html', 'chapter6.6.html', 'chapter6.7.html', 'chapter7.html', 'appendixa.html', 'appendixb.html',
]


def _write_page(filename, content):
    """Write *content* to *filename*; the file is closed even if writing fails."""
    with open(filename, "w") as out:
        out.write(content)


page = 0
print("Writing... ", out_filenames[page])
h2_tag = False   # True between <h2 ...> and </h2>
for c in manual:
    if c == '<':
        # Text inside an <h2> header is discarded
        if not h2_tag:
            stage += text_buffer
        tag = True
        tag_buffer = ""

    if tag:
        tag_buffer += c
    else:
        text_buffer += c

    if c == '>':
        if '<h2' in tag_buffer:
            h2_tag = True
        elif '</h2' in tag_buffer:
            h2_tag = False
        elif tag_buffer == '<page>':
            # Page break: store what has been collected and start the next file
            _write_page(out_filenames[page], stage)
            stage = ""
            page += 1
            if page >= len(out_filenames):
                raise IndexError(
                    "found more <page> markers than entries in out_filenames")
            print("Writing... ", out_filenames[page])
        else:
            stage += tag_buffer
        tag = False
        text_buffer = ""

# Whatever follows the last <page> marker goes into the final file
_write_page(out_filenames[page], stage)
