Mercurial > hgrepos > Python2 > PyMuPDF

diff mupdf-source/thirdparty/extract/src/docx_template_build.py @ 2:b50eed0cc0ef upstream
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4. The directory name has changed: no version number in the expanded directory now.
author: Franz Glasner <fzglas.hg@dom66.de>
date: Mon, 15 Sep 2025 11:43:07 +0200
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/mupdf-source/thirdparty/extract/src/docx_template_build.py	Mon Sep 15 11:43:07 2025 +0200
@@ -0,0 +1,260 @@
+#! /usr/bin/env python3
+
+'''
+Creates C code for creating docx/odt files using internal template docx/odt
+content.
+
+Args:
+
+    --pretty <directory>
+        Prettifies all .xml files within <directory> using 'xmllint --format'.
+
+    -f
+        Force touch of output file, even if unchanged.
+
+    -i <in-path>
+        Set template docx/odt file to extract from.
+
+    -n docx | odt
+        Infix to use in generated identifier names.
+
+    -o <out-path>
+        Set name of output files.
+
+        We write to <out-path>.c and <out-path>.h.
+'''
+
+import io
+import os
+import re
+import sys
+import textwrap
+
+
+def system(command):
+    '''
+    Like os.system() but raises exception if command fails.
+
+    Args:
+        command: Shell command line, passed unchanged to os.system().
+
+    Raises:
+        AssertionError: if os.system() returns a non-zero status.
+    '''
+    e = os.system(command)
+    if e:
+        print(f'command failed: {command}')
+        # Raise explicitly instead of using 'assert 0': assert statements are
+        # removed under 'python -O', which would let failures pass silently.
+        raise AssertionError(f'command failed: {command}')
+
+def read(path, encoding):
+    '''
+    Returns contents of file <path>.
+
+    The file is always opened in binary mode. If <encoding> is true the bytes
+    are decoded with it and a str is returned; otherwise the raw bytes are
+    returned unchanged.
+    '''
+    with open(path, 'rb') as stream:
+        data = stream.read()
+    return data.decode(encoding) if encoding else data
+
+def write(text, path, encoding):
+    '''
+    Writes <text>, encoded with <encoding>, to file <path>.
+
+    Any missing parent directories of <path> are created first. The file is
+    opened in binary mode, so line endings in <text> are written unchanged.
+    '''
+    directory = os.path.dirname(path)
+    if directory:
+        os.makedirs(directory, exist_ok=True)
+    with open(path, 'wb') as stream:
+        stream.write(text.encode(encoding))
+
+def write_if_diff(text, path, encoding, force):
+    '''
+    Writes <text> to file named <path>, unless <force> is false and the file
+    already exists with exactly this content (in which case nothing is done,
+    so the file's timestamp is left alone).
+
+    A diagnostic is printed when <force> is false and the file is (re)written.
+    '''
+    if not force:
+        unchanged = os.path.isfile(path) and read(path, encoding) == text
+        if unchanged:
+            return
+        print(f'Updating path={path} because contents have changed')
+    write(text, path, encoding)
+
+def check_path_safe(path):
+    '''
+    Raises exception unless path consists only of characters and sequences that
+    are known to be safe for shell commands.
+
+    A path is rejected if it contains '..' anywhere, or any character that is
+    neither alphanumeric nor one of '/', '.', '_', '-'.
+    '''
+    if '..' in path:
+        raise Exception(f'Path is unsafe because contains "..": {path!r}')
+    for char in path:
+        if char.isalnum() or char in '/._-':
+            continue
+        raise Exception(f'Path is unsafe because contains "{char}": {path!r}')
+
+def path_safe(path):
+    '''
+    Returns True if check_path_safe() accepts <path>, else False.
+    '''
+    try:
+        check_path_safe(path)
+        return True
+    except Exception:
+        return False
+
+# Import-time self-checks of path_safe(): shell metacharacters and '..' must be
+# rejected, while an ordinary relative path must be accepted.
+assert not path_safe('foo;rm -rf *')
+assert not path_safe('..')
+assert path_safe('foo/bar.x')
+
+
+def main():
+    '''
+    Command-line entry point; see the module docstring for the arguments.
+
+    Handles '--pretty <directory>' immediately while parsing. If none of -i,
+    -n and -o is given, returns after parsing. Otherwise all three are
+    required: the template given by -i is unzipped into '<in-path>.dir', and
+    '<out-path>.c' and '<out-path>.h' are generated, containing one
+    {name, text} item per file in the template (directories and files are
+    visited in sorted order so that the output is reproducible).
+
+    Raises:
+        Exception: on an unrecognised argument, a missing -i/-n/-o, or an
+            unsafe -i/-o path.
+        AssertionError: (from system()) if an external command fails.
+    '''
+    # Local import: only needed for quoting paths in the --pretty commands.
+    import shlex
+
+    path_in = None
+    path_out = None
+    infix = None
+    force = False
+
+    args = iter(sys.argv[1:])
+    for arg in args:
+        if arg == '-h' or arg == '--help':
+            print(__doc__)
+            return
+        elif arg == '--pretty':
+            d = next(args)
+            for dirpath, dirnames, filenames in os.walk(d):
+                for filename in filenames:
+                    if not filename.endswith('.xml'):
+                        continue
+                    path = os.path.join(dirpath, filename)
+                    # Quote the paths so that spaces or shell metacharacters
+                    # in file names cannot break or subvert the command; for
+                    # plain names the quoted form equals the name itself.
+                    src = shlex.quote(path)
+                    tmp = shlex.quote(f'{path}-')
+                    system(f'xmllint --format {src} > {tmp}')
+                    system(f'mv {tmp} {src}')
+        elif arg == '-f':
+            force = True
+        elif arg == '-i':
+            path_in = next(args)
+        elif arg == '-n':
+            infix = next(args)
+        elif arg == '-o':
+            path_out = next(args)
+        else:
+            # Not an assert: asserts are removed under 'python -O'.
+            raise Exception(f'unrecognised arg: {arg}')
+
+    # Nothing to generate, e.g. only --pretty and/or -f were given.
+    if not (path_in or infix or path_out):
+        return
+
+    if not path_in:
+        raise Exception('Need to specify -i <in-path>')
+    if not infix:
+        raise Exception('Need to specify -n <name>')
+    if not path_out:
+        raise Exception('Need to specify -o <out-path>')
+
+    # Both paths are interpolated into shell commands and/or file names below.
+    check_path_safe(path_in)
+    check_path_safe(path_out)
+    path_temp = f'{path_in}.dir'
+    # Best-effort removal of any previous unpacked copy; failure is ignored.
+    os.system(f'rm -r "{path_temp}" 2>/dev/null')
+    system(f'unzip -q -d {path_temp} {path_in}')
+
+    out_c = io.StringIO()
+    out_c.write(f'/* THIS IS AUTO-GENERATED CODE, DO NOT EDIT. */\n')
+    out_c.write(f'\n')
+    out_c.write(f'#include "{os.path.basename(path_out)}.h"\n')
+    out_c.write(f'\n')
+
+
+    out_c.write(f'const {infix}_template_item_t {infix}_template_items[] =\n')
+    out_c.write(f'{{\n')
+
+    num_items = 0
+    for dirpath, dirnames, filenames in os.walk(path_temp):
+        # Sorting in place makes os.walk() descend in a deterministic order.
+        dirnames.sort()
+
+        for filename in sorted(filenames):
+            num_items += 1
+            path = os.path.join(dirpath, filename)
+            # Item name is the path relative to the unpacked directory.
+            name = path[ len(path_temp)+1: ]
+            out_c.write(f'    {{\n')
+            out_c.write(f'        "{name}",\n')
+            if filename.endswith(('.xml', '.rels')):
+                text = read(path, 'utf-8')
+
+                # Escape for a C string literal. Backslashes must be done
+                # first so that the escapes added afterwards are not doubled.
+                text = text.replace('\\', '\\\\')
+                text = text.replace('"', '\\"')
+
+                # Looks like .docx template files use \r\n when we interpret them as
+                # utf-8, so we preserve this in the generated strings.
+                #
+                # .odt seems to have first line ending with '\n', not '\r\n'.
+                #
+                text = text.replace('\r', '\\r')
+                text = text.replace('\n', '\\n"\n                "')
+
+                # Split on '<' to avoid overly-long lines, which break windows
+                # compiler.
+                #
+                text = re.sub('([<][^/])', '"\n                "\\1', text)
+
+                # Remove name of document creator.
+                #
+                for tag in 'dc:creator', 'cp:lastModifiedBy':
+                    text = re.sub(f'[<]{tag}[>][^<]*[<]/{tag}[>]', f'<{tag}></{tag}>', text)
+
+                out_c.write(f'        "')
+                # Represent non-ascii utf-8 bytes as C escape sequences.
+                after_hex_escape = False
+                for c in text:
+                    if ord( c) <= 127:
+                        # A C hex escape absorbs every following hex digit, so
+                        # split the literal ("" is joined again by the
+                        # compiler) when a literal hex digit follows one.
+                        if after_hex_escape and c in '0123456789abcdefABCDEF':
+                            out_c.write( '""')
+                        out_c.write( c)
+                        after_hex_escape = False
+                    else:
+                        for cc in c.encode( 'utf-8'):
+                            out_c.write( f'\\x{cc:02x}')
+                        after_hex_escape = True
+                out_c.write(f'"\n')
+            else:
+                # Any other file is emitted as raw bytes, all hex-escaped.
+                data = read(path, encoding=None)
+                out_c.write(f'        "')
+                for i, byte in enumerate(data, 1):
+                    # Start a new literal line before every 16th byte.
+                    if i % 16 == 0:
+                        out_c.write(f'"\n        "')
+                    out_c.write(f'\\x{byte:02x}')
+                out_c.write(f'"\n')
+
+            out_c.write(f'    }},\n')
+            out_c.write(f'\n')
+
+    out_c.write(f'}};\n')
+    out_c.write(f'\n')
+    out_c.write(f'int {infix}_template_items_num = {num_items};\n')
+
+    write_if_diff(out_c.getvalue(), f'{path_out}.c', 'utf-8', force)
+
+    out_h = io.StringIO()
+    out_h.write(f'#ifndef EXTRACT_{infix.upper()}_TEMPLATE_H\n')
+    out_h.write(f'#define EXTRACT_{infix.upper()}_TEMPLATE_H\n')
+    out_h.write(f'\n')
+    out_h.write(f'/* THIS IS AUTO-GENERATED CODE, DO NOT EDIT. */\n')
+    out_h.write(f'\n')
+    out_h.write(f'\n')
+    out_h.write(f'typedef struct\n')
+    out_h.write(f'{{\n')
+    out_h.write(f'    const char* name; /* Name of item in {infix} archive. */\n')
+    out_h.write(f'    const char* text; /* Contents of item in {infix} archive. */\n')
+    out_h.write(f'}} {infix}_template_item_t;\n')
+    out_h.write(f'\n')
+    out_h.write(f'extern const {infix}_template_item_t {infix}_template_items[];\n')
+    out_h.write(f'extern int {infix}_template_items_num;\n')
+    out_h.write(f'\n')
+    out_h.write(f'\n')
+    out_h.write(f'#endif\n')
+    write_if_diff(out_h.getvalue(), f'{path_out}.h', 'utf-8', force)
+    # Note: the unpacked directory <path_temp> is not removed here.
+
+# Run main() only when this file is executed as a script, not when imported.
+if __name__ == '__main__':
+    main()
author	Franz Glasner <fzglas.hg@dom66.de>
date	Mon, 15 Sep 2025 11:43:07 +0200
parents
children