Mercurial > hgrepos > Python2 > PyMuPDF
comparison mupdf-source/thirdparty/extract/src/docx_template_build.py @ 2:b50eed0cc0ef upstream
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
The directory name has changed: no version number in the expanded directory now.
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Mon, 15 Sep 2025 11:43:07 +0200 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| 1:1d09e1dec1d9 | 2:b50eed0cc0ef |
|---|---|
| 1 #! /usr/bin/env python3 | |
| 2 | |
| 3 ''' | |
| 4 Creates C code for creating docx/odt files using internal template docx/odt | |
| 5 content. | |
| 6 | |
| 7 Args: | |
| 8 | |
| 9 --pretty <directory> | |
| 10 Prettifies all .xml files within <directory> using 'xmllint --format'. | |
| 11 | |
| 12 -f | |
| 13 Force touch of output file, even if unchanged. | |
| 14 | |
| 15 -i <in-path> | |
| 16 Set template docx/odt file to extract from. | |
| 17 | |
| 18 -n docx | odt | |
| 19 Infix to use in generated identifier names. | |
| 20 | |
| 21 -o <out-path> | |
| 22 Set name of output files. | |
| 23 | |
| 24 We write to <out-path>.c and <out-path>.h. | |
| 25 ''' | |
| 26 | |
| 27 import io | |
| 28 import os | |
| 29 import re | |
| 30 import sys | |
| 31 import textwrap | |
| 32 | |
| 33 | |
def system(command):
    '''
    Like os.system() but raises exception if command fails.

    Args:
        command:
            Command line passed unchanged to os.system().

    Raises:
        AssertionError:
            If os.system() returns a non-zero status.
    '''
    status = os.system(command)
    if status:
        print(f'command failed: {command}')
        # Raise explicitly instead of using 'assert 0': assert statements are
        # removed under 'python -O', which would silently ignore failures.
        # AssertionError is kept so the exception type is unchanged.
        raise AssertionError(f'command failed: {command}')
| 42 | |
def read(path, encoding):
    '''
    Returns the entire contents of the file named <path>.

    The file is always opened in binary mode. If <encoding> is truthy the
    bytes are decoded with it and a str is returned; otherwise the raw bytes
    are returned.
    '''
    with open(path, 'rb') as stream:
        data = stream.read()
    return data.decode(encoding) if encoding else data
| 52 | |
def write(text, path, encoding):
    '''
    Writes <text>, encoded with <encoding>, to the file named <path>.

    Any missing parent directories of <path> are created first. An existing
    file is overwritten.
    '''
    directory = os.path.split(path)[0]
    if directory:
        os.makedirs(directory, exist_ok=True)
    with open(path, 'wb') as stream:
        stream.write(text.encode(encoding))
| 62 | |
def write_if_diff(text, path, encoding, force):
    '''
    Writes <text> to the file named <path> unless that would be a no-op.

    When <force> is false and <path> is an existing file whose decoded
    contents already equal <text>, nothing is written. In every other case a
    diagnostic is printed and the file is (re)written.
    '''
    already_current = (
            not force
            and os.path.isfile(path)
            and read(path, encoding) == text
            )
    if already_current:
        return
    print(f'Updating path={path} because contents have changed')
    write(text, path, encoding)
| 75 | |
def check_path_safe(path):
    '''
    Verifies that <path> can be placed in a shell command as-is.

    A path is accepted only if it does not contain '..' and every character
    is alphanumeric or one of '/', '.', '_', '-'. Raises Exception otherwise;
    returns None on success.
    '''
    if '..' in path:
        raise Exception(f'Path is unsafe because contains "..": {path!r}')
    # Find the first character outside the allowed set, if any.
    offender = next(
            (ch for ch in path if not (ch.isalnum() or ch in '/._-')),
            None,
            )
    if offender is not None:
        raise Exception(f'Path is unsafe because contains "{offender}": {path!r}')
| 87 | |
def path_safe(path):
    '''
    Boolean form of check_path_safe(): returns True if <path> passes the
    check and False if the check raises.
    '''
    try:
        check_path_safe(path)
        return True
    except Exception:
        return False

# Import-time sanity checks of the path validation.
assert path_safe('foo/bar.x')
assert not path_safe('..')
assert not path_safe('foo;rm -rf *')
| 102 | |
| 103 | |
def _option_value(args, option):
    '''
    Returns the next item of iterator <args>, i.e. the value belonging to
    command-line option <option>. Raises Exception if the command line ends
    before a value is found.
    '''
    try:
        return next(args)
    except StopIteration:
        raise Exception(f'Missing value after {option}') from None


def _prettify_xml_tree(directory):
    '''
    Reformats every .xml file below <directory> in place with
    'xmllint --format'.
    '''
    import shlex
    for dirpath, _dirnames, filenames in os.walk(directory):
        for filename in filenames:
            if not filename.endswith('.xml'):
                continue
            path = os.path.join(dirpath, filename)
            # Quote the paths so that spaces or shell metacharacters in file
            # names cannot change the command. Paths that need no quoting are
            # passed through unchanged.
            src = shlex.quote(path)
            tmp = shlex.quote(f'{path}-')
            system(f'xmllint --format {src} > {tmp}')
            system(f'mv {tmp} {src}')


def _text_to_c_literal(text):
    '''
    Returns str <text> converted to the inside of a C string literal (without
    the opening and closing quotes). The result contains embedded '"\\n "'
    sequences that split it into several adjacent C string literals.
    '''
    # Backslashes must be escaped first, otherwise they would combine with
    # the following character into a C escape sequence.
    text = text.replace('\\', '\\\\')
    text = text.replace('"', '\\"')

    # Looks like .docx template files use \r\n when we interpret them as
    # utf-8, so we preserve this in the generated strings.
    #
    # .odt seems to have first line ending with '\n', not '\r\n'.
    #
    text = text.replace('\r', '\\r')
    text = text.replace('\n', '\\n"\n "')

    # Split on '<' to avoid overly-long lines, which break windows
    # compiler.
    #
    text = re.sub('([<][^/])', '"\n "\\1', text)

    # Remove name of document creator.
    #
    for tag in 'dc:creator', 'cp:lastModifiedBy':
        text = re.sub(f'[<]{tag}[>][^<]*[<]/{tag}[>]', f'<{tag}></{tag}>', text)

    # Represent non-ascii utf-8 bytes as C escape sequences.
    pieces = []
    after_hex_escape = False
    for c in text:
        if ord(c) <= 127:
            if after_hex_escape and c in '0123456789abcdefABCDEF':
                # A C hex escape consumes every following hex digit, so
                # '\xa9' followed by 'a' would be read as '\xa9a'. Close and
                # reopen the string literal to terminate the escape.
                pieces.append('""')
            pieces.append(c)
            after_hex_escape = False
        else:
            pieces.extend(f'\\x{cc:02x}' for cc in c.encode('utf-8'))
            after_hex_escape = True
    return ''.join(pieces)


def _bytes_to_c_literal(data):
    '''
    Returns bytes <data> as the inside of a C string literal in which every
    byte is a two-digit hex escape, with a literal break inserted before
    every 16th byte.
    '''
    pieces = []
    for i, byte in enumerate(data, 1):
        if i % 16 == 0:
            pieces.append('"\n "')
        pieces.append(f'\\x{byte:02x}')
    return ''.join(pieces)


def _template_c_source(path_temp, path_out, infix):
    '''
    Returns the text of the generated .c file: one <infix>_template_items[]
    entry per file found below directory <path_temp>, followed by the item
    count. <path_out> is only used for the name of the included header.
    '''
    out = io.StringIO()
    out.write('/* THIS IS AUTO-GENERATED CODE, DO NOT EDIT. */\n')
    out.write('\n')
    out.write(f'#include "{os.path.basename(path_out)}.h"\n')
    out.write('\n')
    out.write(f'const {infix}_template_item_t {infix}_template_items[] =\n')
    out.write('{\n')

    num_items = 0
    for dirpath, dirnames, filenames in os.walk(path_temp):
        # Sorting in place makes os.walk() descend in a stable order, so the
        # generated file does not depend on directory listing order.
        dirnames.sort()

        # No items are written for the directories themselves; the original
        # author noted that such items are not recognised by zipinfo and
        # don't make Word like the file.
        for filename in sorted(filenames):
            num_items += 1
            path = os.path.join(dirpath, filename)
            # Item name is the path relative to <path_temp>.
            name = path[len(path_temp)+1:]
            if filename.endswith(('.xml', '.rels')):
                literal = _text_to_c_literal(read(path, 'utf-8'))
            else:
                literal = _bytes_to_c_literal(read(path, encoding=None))
            out.write(' {\n')
            out.write(f' "{name}",\n')
            out.write(f' "{literal}"\n')
            out.write(' },\n')
            out.write('\n')

    out.write('};\n')
    out.write('\n')
    out.write(f'int {infix}_template_items_num = {num_items};\n')
    return out.getvalue()


def _template_c_header(infix):
    '''
    Returns the text of the generated .h file declaring the item type, the
    item array and the item count for <infix>.
    '''
    guard = f'EXTRACT_{infix.upper()}_TEMPLATE_H'
    lines = [
            f'#ifndef {guard}',
            f'#define {guard}',
            '',
            '/* THIS IS AUTO-GENERATED CODE, DO NOT EDIT. */',
            '',
            '',
            'typedef struct',
            '{',
            f' const char* name; /* Name of item in {infix} archive. */',
            f' const char* text; /* Contents of item in {infix} archive. */',
            f'}} {infix}_template_item_t;',
            '',
            f'extern const {infix}_template_item_t {infix}_template_items[];',
            f'extern int {infix}_template_items_num;',
            '',
            '',
            '#endif',
            ]
    return '\n'.join(lines) + '\n'


def main():
    '''
    Command-line entry point; see the module docstring for the arguments.

    Arguments are processed in order, and --pretty is carried out as soon as
    it is encountered. If no -i <in-path> was given, nothing is generated and
    the function returns. Otherwise the template is unzipped into
    '<in-path>.dir' and <out-path>.c and <out-path>.h are written.

    Raises:
        AssertionError:
            For an unrecognised argument, or (from system()) if an external
            command fails.
        Exception:
            If an option value, -n or -o is missing, or if <in-path> or
            <out-path> is rejected by check_path_safe().
    '''
    path_in = None
    path_out = None
    infix = None
    force = False

    args = iter(sys.argv[1:])
    for arg in args:
        if arg in ('-h', '--help'):
            print(__doc__)
            return
        elif arg == '--pretty':
            _prettify_xml_tree(_option_value(args, arg))
        elif arg == '-f':
            force = True
        elif arg == '-i':
            path_in = _option_value(args, arg)
        elif arg == '-n':
            infix = _option_value(args, arg)
        elif arg == '-o':
            path_out = _option_value(args, arg)
        else:
            # Raised explicitly (not via 'assert') so that bad arguments are
            # still rejected under 'python -O'.
            raise AssertionError(f'unrecognised arg: {arg}')

    if not path_in:
        # Nothing to generate, e.g. only --pretty was requested.
        return
    if not infix:
        raise Exception('Need to specify -n <name>')
    if not path_out:
        raise Exception('Need to specify -o <out-path>')

    # Both paths are interpolated into shell commands / used to derive file
    # names below, so restrict them to known-safe characters.
    check_path_safe(path_in)
    check_path_safe(path_out)
    path_temp = f'{path_in}.dir'
    # Best-effort removal of a previous extraction; failure is ignored.
    os.system(f'rm -r "{path_temp}" 2>/dev/null')
    system(f'unzip -q -d {path_temp} {path_in}')

    out_c = _template_c_source(path_temp, path_out, infix)
    write_if_diff(out_c, f'{path_out}.c', 'utf-8', force)

    out_h = _template_c_header(infix)
    write_if_diff(out_h, f'{path_out}.h', 'utf-8', force)
    # The extraction directory <path_temp> is deliberately left in place.

if __name__ == '__main__':
    main()
