Mercurial > hgrepos > Python2 > PyMuPDF
diff mupdf-source/thirdparty/extract/src/docx_template_build.py @ 2:b50eed0cc0ef upstream
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
The directory name has changed: no version number in the expanded directory now.
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Mon, 15 Sep 2025 11:43:07 +0200 |
| parents | |
| children |
line wrap: on
line diff
#! /usr/bin/env python3
#
# Recovered from the added-file diff:
#   --- /dev/null Thu Jan 01 00:00:00 1970 +0000
#   +++ b/mupdf-source/thirdparty/extract/src/docx_template_build.py Mon Sep 15 11:43:07 2025 +0200
#   @@ -0,0 +1,260 @@

'''
Creates C code for creating docx/odt files using internal template docx/odt
content.

Args:

    --pretty <directory>
        Prettyfies all .xml files within <directory> using 'xmllint --format'.

    -f
        Force touch of output file, even if unchanged.

    -i <in-path>
        Set template docx/odt file to extract from.

    -n docx | odt
        Infix to use in generated identifier names.

    -o <out-path>
        Set name of output files.

        We write to <out-path>.c and <out-path>.h.
'''

import io
import os
import re
import shlex
import shutil
import sys
import textwrap
import zipfile


# NOTE(review): the exact amount of whitespace inside the emitted strings
# could not be recovered from the collapsed source this file was rebuilt from.
# The values below are a conventional reconstruction; they only affect layout
# between C tokens / adjacent string literals, not the compiled result.
_ITEM_INDENT = '    '
_FIELD_INDENT = '        '

# Closes the current C string literal and opens a new one on the next line.
_LITERAL_BREAK = f'"\n{_FIELD_INDENT}"'

_HEX_DIGITS = frozenset('0123456789abcdefABCDEF')


def system(command):
    '''
    Like os.system() but raises exception if command fails.

    Raises an explicit exception rather than using assert, so that failures
    are still detected when running with 'python -O'.
    '''
    status = os.system(command)
    if status:
        raise Exception(f'command failed (status={status}): {command}')


def read(path, encoding):
    '''
    Returns contents of file.

    If <encoding> is true the raw bytes are decoded with it and a str is
    returned, otherwise the raw bytes are returned.
    '''
    with open(path, 'rb') as f:
        raw = f.read()
    if encoding:
        return raw.decode(encoding)
    return raw


def write(text, path, encoding):
    '''
    Writes text to file, encoded with <encoding>. Creates the parent directory
    if necessary.
    '''
    parent = os.path.dirname(path)
    if parent:
        os.makedirs(parent, exist_ok=True)
    with open(path, 'wb') as f:
        f.write(text.encode(encoding))


def write_if_diff(text, path, encoding, force):
    '''
    Does nothing if <force> is false and file named <path> already contains
    <text>. Otherwise writes <text> to file named <path>.

    The comparison is done on encoded bytes, so an existing file that is not
    valid <encoding> is simply treated as different instead of raising.
    '''
    if not force and os.path.isfile(path):
        if read(path, None) == text.encode(encoding):
            return
    print(f'Updating path={path} because contents have changed')
    write(text, path, encoding)


def check_path_safe(path):
    '''
    Raises exception unless path consists only of characters and sequences that
    are known to be safe for shell commands.
    '''
    if '..' in path:
        raise Exception(f'Path is unsafe because contains "..": {path!r}')
    for c in path:
        if not c.isalnum() and c not in '/._-':
            raise Exception(f'Path is unsafe because contains "{c}": {path!r}')


def path_safe(path):
    '''
    Returns True if path is safe else False.
    '''
    try:
        check_path_safe(path)
    except Exception:
        return False
    else:
        return True


# Import-time self-test of the path checks.
assert not path_safe('foo;rm -rf *')
assert not path_safe('..')
assert path_safe('foo/bar.x')


def _option_value(args, option):
    '''
    Returns the next item from iterator <args>, or raises a descriptive
    exception if <option> was the last argument.
    '''
    try:
        return next(args)
    except StopIteration:
        raise Exception(f'Option {option} requires a value') from None


def _prettify_xml(directory):
    '''
    Reformats every .xml file below <directory> in place using
    'xmllint --format'.
    '''
    for dirpath, _dirnames, filenames in os.walk(directory):
        for filename in filenames:
            if not filename.endswith('.xml'):
                continue
            path = os.path.join(dirpath, filename)
            path_new = f'{path}-'
            # These paths come from the filesystem and are not restricted by
            # check_path_safe(), so they must be quoted for the shell.
            system(f'xmllint --format {shlex.quote(path)} > {shlex.quote(path_new)}')
            os.replace(path_new, path)


def _xml_as_c_literals(text):
    '''
    Returns <text> converted to the inside of a sequence of C string literals;
    the caller supplies the very first opening and very last closing quote.
    '''
    # Escape backslashes first so that the escapes added below are not
    # themselves doubled.
    text = text.replace('\\', '\\\\')
    text = text.replace('"', '\\"')

    # Looks like .docx template files use \r\n when we interpret them as
    # utf-8, so we preserve this in the generated strings.
    #
    # .odt seems to have first line ending with '\n', not '\r\n'.
    #
    text = text.replace('\r', '\\r')
    text = text.replace('\n', '\\n' + _LITERAL_BREAK)

    # Split on '<' to avoid overly-long lines, which break windows
    # compiler.
    #
    text = re.sub('([<][^/])', _LITERAL_BREAK + '\\1', text)

    # Remove name of document creator.
    #
    for tag in 'dc:creator', 'cp:lastModifiedBy':
        text = re.sub(f'[<]{tag}[>][^<]*[<]/{tag}[>]', f'<{tag}></{tag}>', text)

    # Represent non-ascii utf-8 bytes as C escape sequences.
    #
    # A C hex escape consumes every following hex digit, so if the next
    # character is a literal hex digit we close and re-open the string literal
    # ("") to terminate the escape.
    #
    pieces = []
    after_hex_escape = False
    for c in text:
        if ord(c) <= 127:
            if after_hex_escape and c in _HEX_DIGITS:
                pieces.append('""')
            pieces.append(c)
            after_hex_escape = False
        else:
            pieces.extend(f'\\x{b:02x}' for b in c.encode('utf-8'))
            after_hex_escape = True
    return ''.join(pieces)


def _bytes_as_c_literals(data):
    '''
    Returns bytes <data> as hex escapes forming the inside of a sequence of C
    string literals. A new literal is started before every 16th byte.
    '''
    pieces = []
    for i, byte in enumerate(data, 1):
        if i % 16 == 0:
            pieces.append(_LITERAL_BREAK)
        pieces.append(f'\\x{byte:02x}')
    return ''.join(pieces)


def _template_c(path_temp, path_out, infix):
    '''
    Returns text of the generated .c file, with one table entry for each file
    below directory <path_temp>.
    '''
    out = io.StringIO()
    out.write('/* THIS IS AUTO-GENERATED CODE, DO NOT EDIT. */\n')
    out.write('\n')
    out.write(f'#include "{os.path.basename(path_out)}.h"\n')
    out.write('\n')

    out.write(f'const {infix}_template_item_t {infix}_template_items[] =\n')
    out.write('{\n')

    # Only files are emitted. Writing explicit directory items was tried
    # originally but isn't recognised by zipinfo, and doesn't make Word like
    # the file.
    num_items = 0
    for dirpath, dirnames, filenames in os.walk(path_temp):
        # Sorting in place makes os.walk() visit directories in a fixed order.
        dirnames.sort()
        for filename in sorted(filenames):
            num_items += 1
            path = os.path.join(dirpath, filename)
            # Item name is the path relative to <path_temp>, always using '/'.
            name = path[len(path_temp) + 1:].replace(os.sep, '/')
            out.write(f'{_ITEM_INDENT}{{\n')
            out.write(f'{_FIELD_INDENT}"{name}",\n')
            if filename.endswith(('.xml', '.rels')):
                body = _xml_as_c_literals(read(path, 'utf-8'))
            else:
                body = _bytes_as_c_literals(read(path, encoding=None))
            out.write(f'{_FIELD_INDENT}"{body}"\n')
            out.write(f'{_ITEM_INDENT}}},\n')
            out.write('\n')

    out.write('};\n')
    out.write('\n')
    out.write(f'int {infix}_template_items_num = {num_items};\n')
    return out.getvalue()


def _template_h(infix):
    '''
    Returns text of the generated .h file declaring the item table.
    '''
    guard = f'EXTRACT_{infix.upper()}_TEMPLATE_H'
    return ''.join([
        f'#ifndef {guard}\n',
        f'#define {guard}\n',
        '\n',
        '/* THIS IS AUTO-GENERATED CODE, DO NOT EDIT. */\n',
        '\n',
        '\n',
        'typedef struct\n',
        '{\n',
        f'    const char* name; /* Name of item in {infix} archive. */\n',
        f'    const char* text; /* Contents of item in {infix} archive. */\n',
        f'}} {infix}_template_item_t;\n',
        '\n',
        f'extern const {infix}_template_item_t {infix}_template_items[];\n',
        f'extern int {infix}_template_items_num;\n',
        '\n',
        '\n',
        '#endif\n',
    ])


def main():
    '''
    Command-line entry point; see the module docstring for the arguments.

    Raises an exception for an unrecognised argument, an option without its
    value, a missing -i/-n/-o when generating, or an unsafe -i/-o path.
    '''
    path_in = None
    path_out = None
    infix = None
    force = False

    args = iter(sys.argv[1:])
    for arg in args:
        if arg == '-h' or arg == '--help':
            print(__doc__)
            return
        elif arg == '--pretty':
            _prettify_xml(_option_value(args, arg))
        elif arg == '-f':
            force = True
        elif arg == '-i':
            path_in = _option_value(args, arg)
        elif arg == '-n':
            infix = _option_value(args, arg)
        elif arg == '-o':
            path_out = _option_value(args, arg)
        else:
            raise Exception(f'unrecognised arg: {arg}')

    if not path_in:
        if infix or path_out:
            raise Exception('Need to specify -i <in-path>')
        # Nothing to generate, e.g. only --pretty was requested.
        return
    if not infix:
        raise Exception('Need to specify -n <name>')
    if not path_out:
        raise Exception('Need to specify -o <out-path>')

    check_path_safe(path_in)
    check_path_safe(path_out)

    # Extract the template into a fresh '<in-path>.dir' directory. The
    # directory is left in place afterwards.
    path_temp = f'{path_in}.dir'
    shutil.rmtree(path_temp, ignore_errors=True)
    with zipfile.ZipFile(path_in) as z:
        z.extractall(path_temp)

    write_if_diff(_template_c(path_temp, path_out, infix), f'{path_out}.c', 'utf-8', force)
    write_if_diff(_template_h(infix), f'{path_out}.h', 'utf-8', force)


if __name__ == '__main__':
    main()
