Mercurial > hgrepos > Python2 > PyMuPDF
diff mupdf-source/thirdparty/zint/docs/zint_org_uk.py @ 2:b50eed0cc0ef upstream
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
The directory name has changed: no version number in the expanded directory now.
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Mon, 15 Sep 2025 11:43:07 +0200 |
| parents | |
| children |
line wrap: on
line diff
# This script takes the output from pandoc and converts it into the format needed by
# the website at Zint.org.uk
#
# Warning: This code is ugly... but it saves days of manual effort updating the website.
#
# Copyright (C) 2022 <rstuart114@gmail.com>

# Substrings which mark a tag as "inline": a tag containing any of them is neither
# moved onto its own line nor allowed to change the indentation level.
indent_skip = ['img', 'code', 'pre', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'span', '<a', '</a', 'sup', '<col', '</col', '<hr', 'div']

# Output files, consumed in order: one per <page> marker plus one for the first page.
out_filenames = ['chapter1.html', 'chapter2.html', 'chapter3.html', 'chapter4.html', 'chapter5.html',
                 'chapter6.0.html', 'chapter6.1.html', 'chapter6.2.html', 'chapter6.3.html', 'chapter6.4.html',
                 'chapter6.5.html', 'chapter6.6.html', 'chapter6.7.html', 'chapter7.html', 'appendixa.html', 'appendixb.html']

# Module-level state used by the public helpers below.
stage = ""          # text accumulated by isolate_tag()
indentation = 1     # current nesting depth read by add_indent()


def _is_indentable(tag):
    """Return True unless `tag` contains one of the `indent_skip` substrings."""
    return not any(keyword in tag for keyword in indent_skip)


def _isolated(tag):
    """Return `tag` surrounded by newlines if it is indentable, else unchanged."""
    if _is_indentable(tag):
        return "\n" + tag + "\n"
    return tag


# Works out which tags should influence indentation and puts them on their own line
def isolate_tag(tag):
    """Append `tag` to the module-level `stage` string.

    Indentable tags (open or close alike) are wrapped in newlines so that they
    end up on a line of their own; tags matching `indent_skip` are appended
    as they are.
    """
    global stage
    stage += _isolated(tag)


# Add the right amount of indentation (indentation X 4 spaces)
def add_indent():
    """Return the whitespace prefix for the current module-level `indentation`.

    A zero or negative level yields an empty string.
    """
    # Four spaces per level, as stated in the comment above.
    return "    " * indentation


# Apply indentation to text
def with_indent(text):
    """Return `text` with the current indent inserted after each newline.

    The final character is never followed by an indent, so a trailing newline
    leaves the next line's indentation to whatever is written afterwards.
    """
    return text[:-1].replace("\n", "\n" + add_indent()) + text[-1:]


def main():
    """Convert 'manual.html' in the current directory into the website pages.

    The document is run through five character-by-character passes: isolate
    tags, adjust tags, drop blank lines, indent and split into sections, and
    finally write one file per page. `text_buffer`, `tag_buffer` and `tag`
    are deliberately shared between the passes, exactly as in the original
    flat script, so any text left over at the end of one pass is still in
    `text_buffer` when the next one starts.

    Raises IndexError if the document produces more pages than there are
    entries in `out_filenames`.
    """
    global indentation

    # Read file and pull some tags onto their own lines for later processing
    tag = False
    tag_buffer = ""
    text_buffer = ""
    stage = []      # output pieces of the current pass, joined at the end of it

    print("Reading... manual.html")
    # pandoc writes UTF-8; do not depend on the locale's default encoding
    with open('manual.html', encoding='utf-8') as f:
        manual = f.read()

    for c in manual:
        if c == '<':
            stage.append(text_buffer)
            tag = True
            tag_buffer = ""

        if tag:
            tag_buffer += c
        else:
            text_buffer += c

        if c == '>':
            # A tag may have been wrapped over several lines
            tag_buffer = tag_buffer.replace("\n", " ")
            stage.append(_isolated(tag_buffer))
            tag = False
            text_buffer = ""

    manual = "".join(stage)
    stage = []

    print("Adjusting HTML")
    # Change the guts of the HTML tags
    in_dd = False
    to_remove = False
    remove_next = False
    span_literal = False
    for c in manual:
        if c == '<':
            # Remove "{#tbl:" table identifiers
            if '{#tbl:' in text_buffer:
                text_buffer = text_buffer[text_buffer.index('tag=') + 7:-3]
                text_buffer = text_buffer.replace('\n', ' ')
                text_buffer = '\n' + text_buffer + '\n'

            # Remove "{@tbl:" table references
            if 'tbl:' in text_buffer:
                text_buffer = ''

            stage.append(text_buffer)
            tag = True
            tag_buffer = ""
            to_remove = False

        if tag:
            tag_buffer += c
        else:
            text_buffer += c

        if c == '>':
            # Remove some tags which aren't needed on website
            if any(needle in tag_buffer for needle in ('span', 'div', '<col', '</col')):
                to_remove = True

            # The tag following a removed anchor is removed as well
            if remove_next:
                to_remove = True
                remove_next = False

            if ('a href' in tag_buffer) and ('aria-hidden="true"' in tag_buffer):
                to_remove = True
                remove_next = True

            if '<a href="#' in tag_buffer:
                to_remove = True
                remove_next = True

            # Don't allow <p> and </p> between <dd> and </dd>
            if tag_buffer == "<dd>":
                in_dd = True
            if tag_buffer == "</dd>":
                in_dd = False

            if in_dd and tag_buffer in ('<p>', '</p>'):
                to_remove = True

            # Remove attributes for some tags
            for needle, bare_tag in (('<pre', '<pre>'), ('<table', '<table>'), ('<tr', '<tr>'),
                                     ('<td', '<td>'), ('<th ', '<th>')):
                if needle in tag_buffer:
                    tag_buffer = bare_tag

            # Bump all headers up one level, deepest first so that no header is bumped twice
            # NOTE(review): <h6> becomes <h7>, which is not an HTML element - presumably
            # the manual never nests that deep; confirm.
            for level in range(6, 0, -1):
                tag_buffer = tag_buffer.replace('<h%d' % level, '<h%d' % (level + 1))
                tag_buffer = tag_buffer.replace('</h%d' % level, '</h%d' % (level + 1))

            # Change class names for code snippets
            tag_buffer = tag_buffer.replace('class="sourceCode bash"', 'class="language-bash"')
            tag_buffer = tag_buffer.replace('class="sourceCode c"', 'class="language-cpp"')

            # Change location of images
            tag_buffer = tag_buffer.replace('src="images/', 'src="/images/manual/')

            # Change <code> without language to <span>
            if tag_buffer == '<code>':
                tag_buffer = '<span class="literal">'
                span_literal = True

            if tag_buffer == '</code>' and span_literal:
                tag_buffer = '</span>'
                span_literal = False

            if not to_remove:
                stage.append(tag_buffer)
            tag = False
            text_buffer = ""

    manual = "".join(stage)
    stage = []

    print("Removing empty lines")
    # Remove blank lines unless in between <pre> and </pre>
    last_char = ''
    in_pre = False
    for c in manual:
        if c == '<':
            tag = True
            tag_buffer = ""

        if tag:
            tag_buffer += c
        else:
            text_buffer += c

        if c == '>':
            if "<pre" in tag_buffer:
                in_pre = True
            if "</pre" in tag_buffer:
                in_pre = False
            tag = False
            text_buffer = ""

        # Only the second of two consecutive newlines outside <pre> is dropped
        if c != '\n' or last_char != '\n' or in_pre:
            stage.append(c)
        last_char = c

    manual = "".join(stage)
    stage = []

    print("Applying indentation")
    # Indent the code to make it easier to read
    indentation = 1
    in_pre = False
    paragraph_block = False
    document_start = True
    chapter_six = False
    last_char = ''
    for c in manual:
        if c == '<':
            # Fix 'floating' full stops
            text_buffer = text_buffer.replace(' . ', '. ')

            # Apply indentation to text
            if in_pre:
                stage.append(text_buffer)
            else:
                stage.append(with_indent(text_buffer))
            tag = True
            tag_buffer = ""

        if tag:
            tag_buffer += c
        else:
            # Strip '{}' from already removed table references
            if c == '}' and last_char == '{':
                text_buffer = text_buffer[:-1]
            else:
                text_buffer += c
            # NOTE(review): nesting of this assignment was reconstructed from a
            # whitespace-collapsed listing - confirm against the upstream file.
            last_char = c

        if c == '>':
            indentable_tag = _is_indentable(tag_buffer)

            # Protect the indentation in <pre> segments
            if '<pre' in tag_buffer:
                in_pre = True
            if '</pre' in tag_buffer:
                in_pre = False

            # Chapter 6 requires special treatment - detect beginning and end
            if 'id="types-of-symbology"' in tag_buffer:
                chapter_six = True
            if 'id="legal-and-version-information"' in tag_buffer:
                chapter_six = False

            if '</' in tag_buffer:
                # Close tag
                if indentable_tag:
                    indentation -= 1
                    stage.append(add_indent())
                elif text_buffer.endswith('\n'):
                    stage.append(add_indent())
                stage.append(tag_buffer)
            else:
                # Split into sections
                if (indentation == 1) and ('<p' in tag_buffer):
                    if not paragraph_block:
                        # NOTE(review): nesting reconstructed - the very first section
                        # has no predecessor to close; confirm against upstream.
                        if document_start:
                            document_start = False
                        else:
                            stage.append('</section>\n')
                        stage.append('<section class="container">\n')
                        paragraph_block = True

                # Handle headers but also decide where to split into multiple HTML files and mark with <page>
                if indentation == 1:
                    if '<h2' in tag_buffer:
                        if document_start:
                            document_start = False
                        else:
                            stage.append('</section>\n')
                            stage.append('<page>\n')
                        stage.append('<section class="container">\n')
                        paragraph_block = True
                    elif ('<h3' in tag_buffer) and chapter_six:
                        stage.append('</section>\n')
                        stage.append('<page>\n')
                        stage.append('<section class="container">\n')
                        paragraph_block = True
                    elif '<h' in tag_buffer:
                        if not paragraph_block:
                            stage.append('</section>\n')
                            stage.append('<section class="container">\n')
                            paragraph_block = True

                # <dl> section has its own class
                if (indentation == 1) and ('<dl' in tag_buffer):
                    stage.append('</section>\n')
                    stage.append('<section class="definition-list container">\n')
                    paragraph_block = False

                # <table> section has its own class
                if (indentation == 1) and ('<table' in tag_buffer):
                    stage.append('</section>\n')
                    stage.append('<section class="table">\n')
                    paragraph_block = False

                # Open tag
                if indentable_tag:
                    stage.append(add_indent())
                    stage.append(tag_buffer)
                    indentation += 1
                else:
                    if text_buffer.endswith('\n'):
                        stage.append(add_indent())
                    stage.append(tag_buffer)
            tag = False
            text_buffer = ""

    stage.append('\n</section>\n')
    manual = "".join(stage)
    stage = []

    # Remove <h2> data and split into output files
    page = 0
    print("Writing... ", out_filenames[page])
    h2_tag = False
    for c in manual:
        if c == '<':
            # Text between <h2> and </h2> is dropped
            if not h2_tag:
                stage.append(text_buffer)
            tag = True
            tag_buffer = ""

        if tag:
            tag_buffer += c
        else:
            text_buffer += c

        if c == '>':
            if '<h2' in tag_buffer:
                h2_tag = True
            elif '</h2' in tag_buffer:
                h2_tag = False
            elif tag_buffer == '<page>':
                # The marker itself is not written; it only ends the current file
                with open(out_filenames[page], "w", encoding='utf-8') as f:
                    f.write("".join(stage))
                stage = []
                page += 1
                if page >= len(out_filenames):
                    raise IndexError("manual has more pages than the {0} names in out_filenames"
                                     .format(len(out_filenames)))
                print("Writing... ", out_filenames[page])
            else:
                stage.append(tag_buffer)
            tag = False
            text_buffer = ""

    with open(out_filenames[page], "w", encoding='utf-8') as f:
        f.write("".join(stage))


if __name__ == "__main__":
    main()
