comparison mupdf-source/thirdparty/extract/src/docx_template_build.py @ 2:b50eed0cc0ef upstream

ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4. The directory name has changed: no version number in the expanded directory now.
author Franz Glasner <fzglas.hg@dom66.de>
date Mon, 15 Sep 2025 11:43:07 +0200
parents
children
comparison
equal deleted inserted replaced
1:1d09e1dec1d9 2:b50eed0cc0ef
1 #! /usr/bin/env python3
2
3 '''
4 Creates C code for creating docx/odt files using internal template docx/odt
5 content.
6
7 Args:
8
9 --pretty <directory>
10 Prettifies all .xml files within <directory> using 'xmllint --format'.
11
12 -f
13 Force touch of output file, even if unchanged.
14
15 -i <in-path>
16 Set template docx/odt file to extract from.
17
18 -n docx | odt
19 Infix to use in generated identifier names.
20
21 -o <out-path>
22 Set name of output files.
23
24 We write to <out-path>.c and <out-path>.h.
25 '''
26
27 import io
28 import os
29 import re
30 import sys
31 import textwrap
32
33
def system(command):
    '''
    Like os.system() but raises exception if command fails.

    Args:
        command: Shell command line; passed unchanged to os.system().

    Raises:
        AssertionError: if os.system() returns a non-zero status.
    '''
    status = os.system(command)
    if status:
        print(f'command failed: {command}')
        # Raise explicitly instead of using `assert 0`: assert statements are
        # removed when Python runs with -O, which would make a failed command
        # pass silently.
        raise AssertionError(f'command failed: {command}')
42
def read(path, encoding):
    '''
    Returns contents of file.

    The file is always opened in binary mode. If <encoding> is true the bytes
    are decoded with it and a str is returned; otherwise the raw bytes are
    returned.
    '''
    with open(path, 'rb') as stream:
        contents = stream.read()
    return contents.decode(encoding) if encoding else contents
52
def write(text, path, encoding):
    '''
    Writes text to file.

    <text> is encoded with <encoding> and written in binary mode. The
    directory part of <path>, if any, is created first when missing.
    '''
    directory = os.path.split(path)[0]
    if directory:
        os.makedirs(directory, exist_ok=True)
    with open(path, 'wb') as stream:
        stream.write(text.encode(encoding))
62
def write_if_diff(text, path, encoding, force):
    '''
    Writes <text> to file named <path>, unless <force> is false and the file
    already exists with exactly <text> as its (decoded) contents, in which
    case nothing is done.
    '''
    up_to_date = (
            not force
            and os.path.isfile(path)
            and read(path, encoding) == text
            )
    if up_to_date:
        return
    print(f'Updating path={path} because contents have changed')
    write(text, path, encoding)
75
def check_path_safe(path):
    '''
    Raises exception unless path consists only of characters and sequences that
    are known to be safe for shell commands.

    A path is rejected if it contains '..' anywhere, or any character that is
    neither alphanumeric nor one of '/', '.', '_', '-'.
    '''
    if '..' in path:
        raise Exception(f'Path is unsafe because contains "..": {path!r}')
    permitted_punctuation = '/._-'
    for ch in path:
        if ch.isalnum() or ch in permitted_punctuation:
            continue
        raise Exception(f'Path is unsafe because contains "{ch}": {path!r}')
87
def path_safe(path):
    '''
    Returns True if path is safe else False.

    A path is safe when check_path_safe() accepts it without raising.
    '''
    try:
        check_path_safe(path)
    except Exception:
        return False
    return True
98
# Import-time sanity checks for path_safe().
assert not any(path_safe(p) for p in ('foo;rm -rf *', '..'))
assert path_safe('foo/bar.x')
102
103
def _next_value(args, option):
    '''
    Returns the next item from iterator <args>, which is the value belonging
    to command-line option <option>.

    Raises:
        Exception: if <args> is exhausted, i.e. the option's value is missing.
    '''
    try:
        return next(args)
    except StopIteration:
        raise Exception(f'Missing value after {option}') from None


def _prettify_xml(directory):
    '''
    Reformats every .xml file below <directory> in place using
    'xmllint --format'.
    '''
    import shlex
    for dirpath, dirnames, filenames in os.walk(directory):
        for filename in filenames:
            if not filename.endswith('.xml'):
                continue
            path = os.path.join(dirpath, filename)
            # File names come from the filesystem and are not validated, so
            # quote them before handing them to the shell.
            target = shlex.quote(path)
            scratch = shlex.quote(f'{path}-')
            system(f'xmllint --format {target} > {scratch}')
            system(f'mv {scratch} {target}')


def _xml_to_c_string(text):
    '''
    Returns <text> converted into the inside of a C string literal (the
    caller writes the outermost double quotes). The result may span several
    adjacent C string literals on separate lines.
    '''
    # Escape backslashes first, so that the backslashes introduced by the
    # replacements below are not themselves escaped.
    text = text.replace('\\', '\\\\')
    text = text.replace('"', '\\"')

    # Looks like .docx template files use \r\n when we interpret them as
    # utf-8, so we preserve this in the generated strings.
    #
    # .odt seems to have first line ending with '\n', not '\r\n'.
    #
    text = text.replace('\r', '\\r')
    text = text.replace('\n', '\\n"\n "')

    # Split on '<' to avoid overly-long lines, which break windows
    # compiler.
    #
    text = re.sub('([<][^/])', '"\n "\\1', text)

    # Remove name of document creator.
    #
    for tag in 'dc:creator', 'cp:lastModifiedBy':
        text = re.sub(f'[<]{tag}[>][^<]*[<]/{tag}[>]', f'<{tag}></{tag}>', text)

    # Represent non-ascii utf-8 bytes as C escape sequences. Three-digit octal
    # escapes are used because a C hex escape has no length limit: '\xNN'
    # followed by a character that is a hex digit would be read as one longer
    # escape.
    pieces = []
    for c in text:
        if ord(c) <= 127:
            pieces.append(c)
        else:
            pieces.extend(f'\\{byte:03o}' for byte in c.encode('utf-8'))
    return ''.join(pieces)


def main():
    '''
    Command-line entry point; options are described in the module docstring.

    Parses sys.argv. '--pretty <directory>' is acted upon immediately while
    parsing. If '-i' was not given, returns after parsing without generating
    anything. Otherwise unzips the template into '<in-path>.dir' and writes
    '<out-path>.c' and '<out-path>.h' via write_if_diff().

    Raises:
        AssertionError: on an unrecognised argument.
        Exception: if an option's value is missing, if -n or -o is missing
            when -i is given, or if a path is rejected by check_path_safe().
    '''
    path_in = None
    path_out = None
    infix = None
    force = False

    args = iter(sys.argv[1:])
    for arg in args:
        if arg in ('-h', '--help'):
            print(__doc__)
            return
        elif arg == '--pretty':
            _prettify_xml(_next_value(args, arg))
        elif arg == '-f':
            force = True
        elif arg == '-i':
            path_in = _next_value(args, arg)
        elif arg == '-n':
            infix = _next_value(args, arg)
        elif arg == '-o':
            path_out = _next_value(args, arg)
        else:
            # Raised explicitly rather than via `assert`, which is removed
            # when Python runs with -O.
            raise AssertionError(f'unrecognised arg: {arg}')

    if not path_in:
        # Nothing to generate, e.g. only --pretty was requested.
        return
    if not infix:
        raise Exception('Need to specify -n <name>')
    if not path_out:
        raise Exception('Need to specify -o <out-path>')

    # The paths are interpolated into shell commands below.
    check_path_safe(path_in)
    check_path_safe(path_out)
    path_temp = f'{path_in}.dir'
    # Best-effort removal of a previous extraction; failure is ignored.
    os.system(f'rm -r "{path_temp}" 2>/dev/null')
    system(f'unzip -q -d {path_temp} {path_in}')

    source = io.StringIO()
    source.write('/* THIS IS AUTO-GENERATED CODE, DO NOT EDIT. */\n')
    source.write('\n')
    source.write(f'#include "{os.path.basename(path_out)}.h"\n')
    source.write('\n')
    source.write(f'const {infix}_template_item_t {infix}_template_items[] =\n')
    source.write('{\n')

    num_items = 0
    for dirpath, dirnames, filenames in os.walk(path_temp):
        # Sorting in place makes os.walk() descend in a deterministic order.
        dirnames.sort()

        # No item is generated for directories themselves: a directory item
        # in the zip isn't recognised by zipinfo, and doesn't make Word like
        # the file.

        for filename in sorted(filenames):
            num_items += 1
            path = os.path.join(dirpath, filename)
            # Item name is the path relative to the extraction directory.
            name = path[len(path_temp) + 1:]
            source.write(' {\n')
            source.write(f' "{name}",\n')
            if filename.endswith(('.xml', '.rels')):
                text = read(path, 'utf-8')
                source.write(' "')
                source.write(_xml_to_c_string(text))
                source.write('"\n')
            else:
                # Any other file is emitted byte-by-byte as hex escapes. Every
                # byte is escaped, so each escape is followed by '\' or '"'
                # and cannot absorb a following character.
                data = read(path, encoding=None)
                source.write(' "')
                for i, byte in enumerate(data, 1):
                    if i % 16 == 0:
                        source.write('"\n "')
                    source.write(f'\\x{byte:02x}')
                source.write('"\n')

            source.write(' },\n')
            source.write('\n')

    source.write('};\n')
    source.write('\n')
    source.write(f'int {infix}_template_items_num = {num_items};\n')

    write_if_diff(source.getvalue(), f'{path_out}.c', 'utf-8', force)

    guard = f'EXTRACT_{infix.upper()}_TEMPLATE_H'
    header = io.StringIO()
    header.write(f'#ifndef {guard}\n')
    header.write(f'#define {guard}\n')
    header.write('\n')
    header.write('/* THIS IS AUTO-GENERATED CODE, DO NOT EDIT. */\n')
    header.write('\n')
    header.write('\n')
    header.write('typedef struct\n')
    header.write('{\n')
    header.write(f' const char* name; /* Name of item in {infix} archive. */\n')
    header.write(f' const char* text; /* Contents of item in {infix} archive. */\n')
    header.write(f'}} {infix}_template_item_t;\n')
    header.write('\n')
    header.write(f'extern const {infix}_template_item_t {infix}_template_items[];\n')
    header.write(f'extern int {infix}_template_items_num;\n')
    header.write('\n')
    header.write('\n')
    header.write('#endif\n')
    write_if_diff(header.getvalue(), f'{path_out}.h', 'utf-8', force)
    # Note: the extraction directory <in-path>.dir is not removed here.
258
# Run main() only when this file is executed as a script, not when imported.
if __name__ == '__main__':
    main()