comparison mupdf-source/thirdparty/extract/src/docx.c @ 3:2c135c81b16c

MERGE: upstream PyMuPDF 1.26.4 with MuPDF 1.26.7
author Franz Glasner <fzglas.hg@dom66.de>
date Mon, 15 Sep 2025 11:44:09 +0200
parents b50eed0cc0ef
children
comparison
equal deleted inserted replaced
0:6015a75abc2d 3:2c135c81b16c
1 /* These extract_docx_*() functions generate docx content and docx zip archive
2 data.
3
4 Caller must call things in a sensible order to create valid content -
5 e.g. don't call docx_paragraph_start() twice without intervening call to
6 docx_paragraph_finish(). */
7
8 #include "extract/extract.h"
9
10 #include "docx_template.h"
11
12 #include "astring.h"
13 #include "document.h"
14 #include "docx.h"
15 #include "mem.h"
16 #include "memento.h"
17 #include "outf.h"
18 #include "sys.h"
19 #include "text.h"
20 #include "zip.h"
21
22 #include <assert.h>
23 #include <errno.h>
24 #include <float.h>
25 #include <math.h>
26 #include <stdlib.h>
27 #include <stdio.h>
28 #include <string.h>
29
30 #include <sys/stat.h>
31
32
33 static int
34 docx_paragraph_start(extract_alloc_t *alloc, extract_astring_t *output)
35 {
36 return extract_astring_cat(alloc, output, "\n\n<w:p>");
37 }
38
39 static int
40 docx_paragraph_finish(extract_alloc_t *alloc, extract_astring_t *output)
41 {
42 return extract_astring_cat(alloc, output, "\n</w:p>");
43 }
44
45 /* Starts a new run. Caller must ensure that docx_run_finish() was
46 called to terminate any previous run. */
47 static int
48 docx_run_start( extract_alloc_t *alloc,
49 extract_astring_t *output,
50 content_state_t *content_state)
51 {
52 int e = 0;
53
54 if (!e) e = extract_astring_cat(alloc, output, "\n<w:r><w:rPr><w:rFonts w:ascii=\"");
55 if (!e) e = extract_astring_cat(alloc, output, content_state->font.name);
56 if (!e) e = extract_astring_cat(alloc, output, "\" w:hAnsi=\"");
57 if (!e) e = extract_astring_cat(alloc, output, content_state->font.name);
58 if (!e) e = extract_astring_cat(alloc, output, "\"/>");
59 if (!e && content_state->font.bold) e = extract_astring_cat(alloc, output, "<w:b/>");
60 if (!e && content_state->font.italic) e = extract_astring_cat(alloc, output, "<w:i/>");
61 {
62 char font_size_text[32];
63
64 if (!e) e = extract_astring_cat(alloc, output, "<w:sz w:val=\"");
65 snprintf(font_size_text, sizeof(font_size_text), "%f", content_state->font.size * 2);
66 extract_astring_cat(alloc, output, font_size_text);
67 extract_astring_cat(alloc, output, "\"/>");
68
69 if (!e) e = extract_astring_cat(alloc, output, "<w:szCs w:val=\"");
70 snprintf(font_size_text, sizeof(font_size_text), "%f", content_state->font.size * 2);
71 extract_astring_cat(alloc, output, font_size_text);
72 extract_astring_cat(alloc, output, "\"/>");
73 }
74 if (!e) e = extract_astring_cat(alloc, output, "</w:rPr><w:t xml:space=\"preserve\">");
75
76 return e;
77 }
78
79 static int
80 docx_run_finish(extract_alloc_t *alloc,
81 content_state_t *state,
82 extract_astring_t *output)
83 {
84 if (state) state->font.name = NULL;
85
86 return extract_astring_cat(alloc, output, "</w:t></w:r>");
87 }
88
89 /* Append an empty paragraph to *content. */
90 static int
91 docx_paragraph_empty(
92 extract_alloc_t *alloc,
93 extract_astring_t *output)
94 {
95 int e = -1;
96 static char fontname[] = "OpenSans";
97 content_state_t content_state = {0};
98
99 if (docx_paragraph_start(alloc, output)) goto end;
100 /* It seems like our choice of font size here doesn't make any difference
101 * to the ammount of vertical space, unless we include a non-space
102 * character. Presumably something to do with the styles in the template
103 * document. */
104 content_state.font.name = fontname;
105 content_state.font.size = 10;
106 content_state.font.bold = 0;
107 content_state.font.italic = 0;
108
109 if (docx_run_start(alloc, output, &content_state)) goto end;
110 //docx_char_append_string(output, "&#160;"); /* &#160; is non-break space. */
111 if (docx_run_finish(alloc, NULL /*state*/, output)) goto end;
112 if (docx_paragraph_finish(alloc, output)) goto end;
113
114 e = 0;
115 end:
116
117 return e;
118 }
119
120
121 /* Removes last char if it is <c>. */
122 static int
123 docx_char_truncate_if(extract_astring_t *output, char c)
124 {
125 if (output->chars_num && output->chars[output->chars_num-1] == c)
126 extract_astring_truncate(output, 1);
127
128 return 0;
129 }
130
131
132 /* Append docx xml for <paragraph> to <content>. Updates *state if we change
133 font. */
134 static int
135 document_to_docx_content_paragraph(
136 extract_alloc_t *alloc,
137 content_state_t *content_state,
138 paragraph_t *paragraph,
139 extract_astring_t *content)
140 {
141 int e = -1;
142 content_line_iterator lit;
143 line_t *line;
144
145 if (docx_paragraph_start(alloc, content)) goto end;
146
147 if ((paragraph->line_flags & paragraph_not_fully_justified) == 0)
148 {
149 if (extract_astring_cat(alloc, content, "<w:pPr><w:jc w:val=\"both\"/></w:pPr>"))
150 goto end;
151 }
152 else if ((paragraph->line_flags & paragraph_not_centred) == 0)
153 {
154 if (extract_astring_cat(alloc, content, "<w:pPr><w:jc w:val=\"center\"/></w:pPr>"))
155 goto end;
156 }
157 else if ((paragraph->line_flags & (paragraph_not_aligned_left | paragraph_not_aligned_right)) == paragraph_not_aligned_left)
158 {
159 if (extract_astring_cat(alloc, content, "<w:pPr><w:jc w:val=\"right\"/></w:pPr>"))
160 goto end;
161 }
162 else if ((paragraph->line_flags & (paragraph_not_aligned_left | paragraph_not_aligned_right)) == paragraph_not_aligned_right)
163 {
164 if (extract_astring_cat(alloc, content, "<w:pPr><w:jc w:val=\"left\"/></w:pPr>"))
165 goto end;
166 }
167
168 for (line = content_line_iterator_init(&lit, &paragraph->content); line != NULL; line = content_line_iterator_next(&lit))
169 {
170 content_span_iterator sit;
171 span_t *span;
172
173 for (span = content_span_iterator_init(&sit, &line->content); span != NULL; span = content_span_iterator_next(&sit))
174 {
175 int si;
176 double font_size_new;
177
178 content_state->ctm_prev = &span->ctm;
179 font_size_new = extract_font_size(&span->ctm);
180 if (!content_state->font.name
181 || strcmp(span->font_name, content_state->font.name)
182 || span->flags.font_bold != content_state->font.bold
183 || span->flags.font_italic != content_state->font.italic
184 || font_size_new != content_state->font.size)
185 {
186 if (content_state->font.name)
187 if (docx_run_finish(alloc, content_state, content))
188 goto end;
189
190 content_state->font.name = span->font_name;
191 content_state->font.bold = span->flags.font_bold;
192 content_state->font.italic = span->flags.font_italic;
193 content_state->font.size = font_size_new;
194 if (docx_run_start(alloc, content, content_state))
195 goto end;
196 }
197
198 for (si=0; si<span->chars_num; ++si)
199 {
200 char_t* char_ = &span->chars[si];
201 int c = char_->ucs;
202 if (extract_astring_catc_unicode_xml(alloc, content, c))
203 goto end;
204 }
205 /* Remove any trailing '-' at end of line. */
206 if (docx_char_truncate_if(content, '-'))
207 goto end;
208 }
209 if (paragraph->line_flags & paragraph_breaks_strangely)
210 {
211 if (extract_astring_cat(alloc, content, "<w:br/>"))
212 goto end;
213 }
214 }
215 if (content_state->font.name)
216 {
217 if (docx_run_finish(alloc, content_state, content)) goto
218 end;
219 }
220 if (docx_paragraph_finish(alloc, content))
221 goto end;
222
223 e = 0;
224 end:
225
226 return e;
227 }
228
229 /* Write reference to image into docx content. */
230 static int
231 docx_append_image(
232 extract_alloc_t *alloc,
233 extract_astring_t *output,
234 image_t *image)
235 {
236 extract_astring_cat(alloc, output, "\n");
237 extract_astring_cat(alloc, output, " <w:p>\n");
238 extract_astring_cat(alloc, output, " <w:r>\n");
239 extract_astring_cat(alloc, output, " <w:rPr>\n");
240 extract_astring_cat(alloc, output, " <w:noProof/>\n");
241 extract_astring_cat(alloc, output, " </w:rPr>\n");
242 extract_astring_cat(alloc, output, " <w:drawing>\n");
243 extract_astring_cat(alloc, output, " <wp:inline distT=\"0\" distB=\"0\" distL=\"0\" distR=\"0\" wp14:anchorId=\"7057A832\" wp14:editId=\"466EB3FB\">\n");
244 //extract_astring_cat(alloc, output, " <wp:extent cx=\"2933700\" cy=\"2200275\"/>\n");
245 //extract_astring_cat(alloc, output, " <wp:effectExtent l=\"0\" t=\"0\" r=\"0\" b=\"9525\"/>\n");
246 extract_astring_cat(alloc, output, " <wp:docPr id=\"1\" name=\"Picture 1\"/>\n");
247 extract_astring_cat(alloc, output, " <wp:cNvGraphicFramePr>\n");
248 extract_astring_cat(alloc, output, " <a:graphicFrameLocks xmlns:a=\"http://schemas.openxmlformats.org/drawingml/2006/main\" noChangeAspect=\"1\"/>\n");
249 extract_astring_cat(alloc, output, " </wp:cNvGraphicFramePr>\n");
250 extract_astring_cat(alloc, output, " <a:graphic xmlns:a=\"http://schemas.openxmlformats.org/drawingml/2006/main\">\n");
251 extract_astring_cat(alloc, output, " <a:graphicData uri=\"http://schemas.openxmlformats.org/drawingml/2006/picture\">\n");
252 extract_astring_cat(alloc, output, " <pic:pic xmlns:pic=\"http://schemas.openxmlformats.org/drawingml/2006/picture\">\n");
253 extract_astring_cat(alloc, output, " <pic:nvPicPr>\n");
254 extract_astring_cat(alloc, output, " <pic:cNvPr id=\"1\" name=\"Picture 1\"/>\n");
255 extract_astring_cat(alloc, output, " <pic:cNvPicPr>\n");
256 extract_astring_cat(alloc, output, " <a:picLocks noChangeAspect=\"1\" noChangeArrowheads=\"1\"/>\n");
257 extract_astring_cat(alloc, output, " </pic:cNvPicPr>\n");
258 extract_astring_cat(alloc, output, " </pic:nvPicPr>\n");
259 extract_astring_cat(alloc, output, " <pic:blipFill>\n");
260 extract_astring_catf(alloc, output," <a:blip r:embed=\"%s\">\n", image->id);
261 extract_astring_cat(alloc, output, " <a:extLst>\n");
262 extract_astring_cat(alloc, output, " <a:ext uri=\"{28A0092B-C50C-407E-A947-70E740481C1C}\">\n");
263 extract_astring_cat(alloc, output, " <a14:useLocalDpi xmlns:a14=\"http://schemas.microsoft.com/office/drawing/2010/main\" val=\"0\"/>\n");
264 extract_astring_cat(alloc, output, " </a:ext>\n");
265 extract_astring_cat(alloc, output, " </a:extLst>\n");
266 extract_astring_cat(alloc, output, " </a:blip>\n");
267 //extract_astring_cat(alloc, output, " <a:srcRect/>\n");
268 extract_astring_cat(alloc, output, " <a:stretch>\n");
269 extract_astring_cat(alloc, output, " <a:fillRect/>\n");
270 extract_astring_cat(alloc, output, " </a:stretch>\n");
271 extract_astring_cat(alloc, output, " </pic:blipFill>\n");
272 extract_astring_cat(alloc, output, " <pic:spPr bwMode=\"auto\">\n");
273 extract_astring_cat(alloc, output, " <a:xfrm>\n");
274 extract_astring_cat(alloc, output, " <a:off x=\"0\" y=\"0\"/>\n");
275 //extract_astring_cat(alloc, output, " <a:ext cx=\"2933700\" cy=\"2200275\"/>\n");
276 extract_astring_cat(alloc, output, " </a:xfrm>\n");
277 extract_astring_cat(alloc, output, " <a:prstGeom prst=\"rect\">\n");
278 extract_astring_cat(alloc, output, " <a:avLst/>\n");
279 extract_astring_cat(alloc, output, " </a:prstGeom>\n");
280 extract_astring_cat(alloc, output, " <a:noFill/>\n");
281 extract_astring_cat(alloc, output, " <a:ln>\n");
282 extract_astring_cat(alloc, output, " <a:noFill/>\n");
283 extract_astring_cat(alloc, output, " </a:ln>\n");
284 extract_astring_cat(alloc, output, " </pic:spPr>\n");
285 extract_astring_cat(alloc, output, " </pic:pic>\n");
286 extract_astring_cat(alloc, output, " </a:graphicData>\n");
287 extract_astring_cat(alloc, output, " </a:graphic>\n");
288 extract_astring_cat(alloc, output, " </wp:inline>\n");
289 extract_astring_cat(alloc, output, " </w:drawing>\n");
290 extract_astring_cat(alloc, output, " </w:r>\n");
291 extract_astring_cat(alloc, output, " </w:p>\n");
292 extract_astring_cat(alloc, output, "\n");
293
294 return 0;
295 }
296
297
298 /* Writes paragraph to content inside rotated text box. */
299 static int
300 docx_output_rotated_paragraphs(
301 extract_alloc_t *alloc,
302 block_t *block,
303 int rot,
304 int x,
305 int y,
306 int w,
307 int h,
308 int text_box_id,
309 extract_astring_t *output,
310 content_state_t *state)
311 {
312 int e = -1;
313 paragraph_t *paragraph;
314 content_paragraph_iterator pit;
315
316 outf("x,y=%ik,%ik = %i,%i", x/1000, y/1000, x, y);
317 extract_astring_cat(alloc, output, "\n");
318 extract_astring_cat(alloc, output, "\n");
319 extract_astring_cat(alloc, output, "<w:p>\n");
320 extract_astring_cat(alloc, output, " <w:r>\n");
321 extract_astring_cat(alloc, output, " <mc:AlternateContent>\n");
322 extract_astring_cat(alloc, output, " <mc:Choice Requires=\"wps\">\n");
323 extract_astring_cat(alloc, output, " <w:drawing>\n");
324 extract_astring_cat(alloc, output, " <wp:anchor distT=\"0\" distB=\"0\" distL=\"0\" distR=\"0\" simplePos=\"0\" relativeHeight=\"0\" behindDoc=\"0\" locked=\"0\" layoutInCell=\"1\" allowOverlap=\"1\" wp14:anchorId=\"53A210D1\" wp14:editId=\"2B7E8016\">\n");
325 extract_astring_cat(alloc, output, " <wp:simplePos x=\"0\" y=\"0\"/>\n");
326 extract_astring_cat(alloc, output, " <wp:positionH relativeFrom=\"page\">\n");
327 extract_astring_catf(alloc, output," <wp:posOffset>%i</wp:posOffset>\n", x);
328 extract_astring_cat(alloc, output, " </wp:positionH>\n");
329 extract_astring_cat(alloc, output, " <wp:positionV relativeFrom=\"page\">\n");
330 extract_astring_catf(alloc, output," <wp:posOffset>%i</wp:posOffset>\n", y);
331 extract_astring_cat(alloc, output, " </wp:positionV>\n");
332 extract_astring_catf(alloc, output," <wp:extent cx=\"%i\" cy=\"%i\"/>\n", w, h);
333 //extract_astring_cat(alloc, output, " <wp:effectExtent l=\"381000\" t=\"723900\" r=\"371475\" b=\"723900\"/>\n");
334 extract_astring_cat(alloc, output, " <wp:wrapNone/>\n");
335 extract_astring_catf(alloc, output," <wp:docPr id=\"%i\" name=\"Text Box %i\"/>\n", text_box_id, text_box_id);
336 extract_astring_cat(alloc, output, " <wp:cNvGraphicFramePr/>\n");
337 extract_astring_cat(alloc, output, " <a:graphic xmlns:a=\"http://schemas.openxmlformats.org/drawingml/2006/main\">\n");
338 extract_astring_cat(alloc, output, " <a:graphicData uri=\"http://schemas.microsoft.com/office/word/2010/wordprocessingShape\">\n");
339 extract_astring_cat(alloc, output, " <wps:wsp>\n");
340 extract_astring_cat(alloc, output, " <wps:cNvSpPr txBox=\"1\"/>\n");
341 extract_astring_cat(alloc, output, " <wps:spPr>\n");
342 extract_astring_catf(alloc, output," <a:xfrm rot=\"%i\">\n", rot);
343 extract_astring_cat(alloc, output, " <a:off x=\"0\" y=\"0\"/>\n");
344 //extract_astring_cat(alloc, output, " <a:ext cx=\"3228975\" cy=\"2286000\"/>\n");
345 extract_astring_cat(alloc, output, " </a:xfrm>\n");
346 extract_astring_cat(alloc, output, " <a:prstGeom prst=\"rect\">\n");
347 extract_astring_cat(alloc, output, " <a:avLst/>\n");
348 extract_astring_cat(alloc, output, " </a:prstGeom>\n");
349
350 /* Give box a solid background. */
351 if (0) {
352 extract_astring_cat(alloc, output, " <a:solidFill>\n");
353 extract_astring_cat(alloc, output, " <a:schemeClr val=\"lt1\"/>\n");
354 extract_astring_cat(alloc, output, " </a:solidFill>\n");
355 }
356
357 /* Draw line around box. */
358 if (0) {
359 extract_astring_cat(alloc, output, " <a:ln w=\"175\">\n");
360 extract_astring_cat(alloc, output, " <a:solidFill>\n");
361 extract_astring_cat(alloc, output, " <a:prstClr val=\"black\"/>\n");
362 extract_astring_cat(alloc, output, " </a:solidFill>\n");
363 extract_astring_cat(alloc, output, " </a:ln>\n");
364 }
365
366 extract_astring_cat(alloc, output, " </wps:spPr>\n");
367 extract_astring_cat(alloc, output, " <wps:txbx>\n");
368 extract_astring_cat(alloc, output, " <w:txbxContent>");
369
370 #if 0
371 if (0) {
372 /* Output inline text describing the rotation. */
373 extract_astring_catf(content, "<w:p>\n"
374 "<w:r><w:rPr><w:rFonts w:ascii=\"OpenSans\" w:hAnsi=\"OpenSans\"/><w:sz w:val=\"20.000000\"/><w:szCs w:val=\"15.000000\"/></w:rPr><w:t xml:space=\"preserve\">*** rotate: %f rad, %f deg. rot=%i</w:t></w:r>\n"
375 "</w:p>\n",
376 rotate,
377 rotate * 180 / pi,
378 rot
379 );
380 }
381 #endif
382
383 /* Output paragraphs p0..p2-1. */
384 for (paragraph = content_paragraph_iterator_init(&pit, &block->content); paragraph != NULL; paragraph = content_paragraph_iterator_next(&pit))
385 if (document_to_docx_content_paragraph(alloc, state, paragraph, output)) goto end;
386
387 extract_astring_cat(alloc, output, "\n");
388 extract_astring_cat(alloc, output, " </w:txbxContent>\n");
389 extract_astring_cat(alloc, output, " </wps:txbx>\n");
390 extract_astring_cat(alloc, output, " <wps:bodyPr rot=\"0\" spcFirstLastPara=\"0\" vertOverflow=\"overflow\" horzOverflow=\"overflow\" vert=\"horz\" wrap=\"square\" lIns=\"91440\" tIns=\"45720\" rIns=\"91440\" bIns=\"45720\" numCol=\"1\" spcCol=\"0\" rtlCol=\"0\" fromWordArt=\"0\" anchor=\"t\" anchorCtr=\"0\" forceAA=\"0\" compatLnSpc=\"1\">\n");
391 extract_astring_cat(alloc, output, " <a:prstTxWarp prst=\"textNoShape\">\n");
392 extract_astring_cat(alloc, output, " <a:avLst/>\n");
393 extract_astring_cat(alloc, output, " </a:prstTxWarp>\n");
394 extract_astring_cat(alloc, output, " <a:noAutofit/>\n");
395 extract_astring_cat(alloc, output, " </wps:bodyPr>\n");
396 extract_astring_cat(alloc, output, " </wps:wsp>\n");
397 extract_astring_cat(alloc, output, " </a:graphicData>\n");
398 extract_astring_cat(alloc, output, " </a:graphic>\n");
399 extract_astring_cat(alloc, output, " </wp:anchor>\n");
400 extract_astring_cat(alloc, output, " </w:drawing>\n");
401 extract_astring_cat(alloc, output, " </mc:Choice>\n");
402
403 #if 0
404 /* This fallback is copied from a real Word document. Not sure
405 whether it works - both Libreoffice and Word use the above
406 choice. */
407 extract_astring_cat(alloc, output, " <mc:Fallback>\n");
408 extract_astring_cat(alloc, output, " <w:pict>\n");
409 extract_astring_cat(alloc, output, " <v:shapetype w14:anchorId=\"53A210D1\" id=\"_x0000_t202\" coordsize=\"21600,21600\" o:spt=\"202\" path=\"m,l,21600r21600,l21600,xe\">\n");
410 extract_astring_cat(alloc, output, " <v:stroke joinstyle=\"miter\"/>\n");
411 extract_astring_cat(alloc, output, " <v:path gradientshapeok=\"t\" o:connecttype=\"rect\"/>\n");
412 extract_astring_cat(alloc, output, " </v:shapetype>\n");
413 extract_astring_catf(alloc, output," <v:shape id=\"Text Box %i\" o:spid=\"_x0000_s1026\" type=\"#_x0000_t202\" style=\"position:absolute;margin-left:71.25pt;margin-top:48.75pt;width:254.25pt;height:180pt;rotation:-2241476fd;z-index:251659264;visibility:visible;mso-wrap-style:square;mso-wrap-distance-left:9pt;mso-wrap-distance-top:0;mso-wrap-distance-right:9pt;mso-wrap-distance-bottom:0;mso-position-horizontal:absolute;mso-position-horizontal-relative:text;mso-position-vertical:absolute;mso-position-vertical-relative:text;v-text-anchor:top\" o:gfxdata=\"UEsDBBQABgAIAAAAIQC2gziS/gAAAOEBAAATAAAAW0NvbnRlbnRfVHlwZXNdLnhtbJSRQU7DMBBF&#10;90jcwfIWJU67QAgl6YK0S0CoHGBkTxKLZGx5TGhvj5O2G0SRWNoz/78nu9wcxkFMGNg6quQqL6RA&#10;0s5Y6ir5vt9lD1JwBDIwOMJKHpHlpr69KfdHjyxSmriSfYz+USnWPY7AufNIadK6MEJMx9ApD/oD&#10;OlTrorhX2lFEilmcO2RdNtjC5xDF9pCuTyYBB5bi6bQ4syoJ3g9WQ0ymaiLzg5KdCXlKLjvcW893&#10;SUOqXwnz5DrgnHtJTxOsQfEKIT7DmDSUCaxw7Rqn8787ZsmRM9e2VmPeBN4uqYvTtW7jvijg9N/y&#10;JsXecLq0q+WD6m8AAAD//wMAUEsDBBQABgAIAAAAIQA4/SH/1gAAAJQBAAALAAAAX3JlbHMvLnJl&#10;bHOkkMFqwzAMhu+DvYPRfXGawxijTi+j0GvpHsDYimMaW0Yy2fr2M4PBMnrbUb/Q94l/f/hMi1qR&#10;JVI2sOt6UJgd+ZiDgffL8ekFlFSbvV0oo4EbChzGx4f9GRdb25HMsYhqlCwG5lrLq9biZkxWOiqY&#10;22YiTra2kYMu1l1tQD30/bPm3wwYN0x18gb45AdQl1tp5j/sFB2T0FQ7R0nTNEV3j6o9feQzro1i&#10;OWA14Fm+Q8a1a8+Bvu/d/dMb2JY5uiPbhG/ktn4cqGU/er3pcvwCAAD//wMAUEsDBBQABgAIAAAA&#10;IQDQg5pQVgIAALEEAAAOAAAAZHJzL2Uyb0RvYy54bWysVE1v2zAMvQ/YfxB0X+2k+WiDOEXWosOA&#10;oi3QDj0rstwYk0VNUmJ3v35PipMl3U7DLgJFPj+Rj6TnV12j2VY5X5Mp+OAs50wZSWVtXgv+7fn2&#10;0wVnPghTCk1GFfxNeX61+Phh3tqZGtKadKkcA4nxs9YWfB2CnWWZl2vVCH9GVhkEK3KNCLi616x0&#10;ogV7o7Nhnk+yllxpHUnlPbw3uyBfJP6qUjI8VJVXgemCI7eQTpfOVTyzxVzMXp2w61r2aYh/yKIR&#10;tcGjB6obEQTbuPoPqqaWjjxV4UxSk1FV1VKlGlDNIH9XzdNaWJVqgTjeHmTy/49W3m8fHatL9I4z&#10;Ixq06Fl1gX2mjg2iOq31M4CeLGChgzsie7+HMxbdVa5hjiDu4HI8ml5MpkkLVMcAh+xvB6kjt4Tz&#10;fDi8uJyOOZOIwZ7keWpGtmOLrNb58EVRw6JRcIdeJlqxvfMBGQC6h0S4J12Xt7XW6RLnR11rx7YC&#10;ndch5YwvTlDasLbgk/NxnohPYpH68P1KC/k9Vn3KgJs2cEaNdlpEK3Srrhdo
ReUbdEvSQAZv5W0N&#10;3jvhw6NwGDQ4sTzhAUelCclQb3G2Jvfzb/6IR/8R5azF4Bbc/9gIpzjTXw0m43IwGsVJT5fReDrE&#10;xR1HVscRs2muCQqh+8gumREf9N6sHDUv2LFlfBUhYSTeLnjYm9dht07YUamWywTCbFsR7syTlZF6&#10;383n7kU42/czYBTuaT/iYvaurTts/NLQchOoqlPPo8A7VXvdsRepLf0Ox8U7vifU7z/N4hcAAAD/&#10;/wMAUEsDBBQABgAIAAAAIQBh17L63wAAAAoBAAAPAAAAZHJzL2Rvd25yZXYueG1sTI9BT4NAEIXv&#10;Jv6HzZh4s0ubgpayNIboSW3Syg9Y2BGI7CyyS0v99Y4nPU3ezMub72W72fbihKPvHClYLiIQSLUz&#10;HTUKyvfnuwcQPmgyuneECi7oYZdfX2U6Ne5MBzwdQyM4hHyqFbQhDKmUvm7Rar9wAxLfPtxodWA5&#10;NtKM+szhtperKEqk1R3xh1YPWLRYfx4nq8APVfz9VQxPb+WUNC+vZbGPDhelbm/mxy2IgHP4M8Mv&#10;PqNDzkyVm8h40bNer2K2Ktjc82RDEi+5XKVgHfNG5pn8XyH/AQAA//8DAFBLAQItABQABgAIAAAA&#10;IQC2gziS/gAAAOEBAAATAAAAAAAAAAAAAAAAAAAAAABbQ29udGVudF9UeXBlc10ueG1sUEsBAi0A&#10;FAAGAAgAAAAhADj9If/WAAAAlAEAAAsAAAAAAAAAAAAAAAAALwEAAF9yZWxzLy5yZWxzUEsBAi0A&#10;FAAGAAgAAAAhANCDmlBWAgAAsQQAAA4AAAAAAAAAAAAAAAAALgIAAGRycy9lMm9Eb2MueG1sUEsB&#10;Ai0AFAAGAAgAAAAhAGHXsvrfAAAACgEAAA8AAAAAAAAAAAAAAAAAsAQAAGRycy9kb3ducmV2Lnht&#10;bFBLBQYAAAAABAAEAPMAAAC8BQAAAAA=&#10;\" fillcolor=\"white [3201]\" strokeweight=\".5pt\">\n", text_box_id);
414 extract_astring_cat(alloc, output, " <v:textbox>\n");
415 extract_astring_cat(alloc, output, " <w:txbxContent>");
416
417 for (paragraph = content_paragraph_iterator_init(&pit, &block->content); paragraph != NULL; paragraph = content_paragraph_iterator_next(&pit))
418 if (document_to_docx_content_paragraph(alloc, state, paragraph, output)) goto end;
419
420 extract_astring_cat(alloc, output, "\n");
421 extract_astring_cat(alloc, output, "\n");
422 extract_astring_cat(alloc, output, " </w:txbxContent>\n");
423 extract_astring_cat(alloc, output, " </v:textbox>\n");
424 extract_astring_cat(alloc, output, " </v:shape>\n");
425 extract_astring_cat(alloc, output, " </w:pict>\n");
426 extract_astring_cat(alloc, output, " </mc:Fallback>\n");
427 #endif
428 extract_astring_cat(alloc, output, " </mc:AlternateContent>\n");
429 extract_astring_cat(alloc, output, " </w:r>\n");
430 extract_astring_cat(alloc, output, "</w:p>");
431
432 e = 0;
433 end:
434
435 return e;
436 }
437
438
439 /* Appends table to content.
440
441 We do not fix the size of the table or its columns and rows, but instead leave layout up
442 to the application. */
443 static int
444 docx_append_table(
445 extract_alloc_t *alloc,
446 table_t *table,
447 extract_astring_t *output)
448 {
449 int e = -1;
450 int y;
451
452 if (extract_astring_cat(alloc, output,
453 "\n"
454 " <w:tbl>\n"
455 " <w:tblLayout w:type=\"autofit\"/>\n"))
456 goto end;
457
458 for (y=0; y<table->cells_num_y; ++y)
459 {
460 int x;
461 if (extract_astring_cat(alloc, output,
462 " <w:tr>\n"
463 " <w:trPr/>\n")) goto end;
464
465 for (x=0; x<table->cells_num_x; ++x)
466 {
467 cell_t* cell = table->cells[y*table->cells_num_x + x];
468 if (!cell->left) continue;
469
470 if (extract_astring_cat(alloc, output, " <w:tc>\n"))
471 goto end;
472
473 /* Write cell properties. */
474 {
475 if (extract_astring_cat(alloc, output,
476 " <w:tcPr>\n"
477 " <w:tcBorders>\n"
478 " <w:top w:val=\"double\" w:sz=\"2\" w:space=\"0\" w:color=\"808080\"/>\n"
479 " <w:start w:val=\"double\" w:sz=\"2\" w:space=\"0\" w:color=\"808080\"/>\n"
480 " <w:bottom w:val=\"double\" w:sz=\"2\" w:space=\"0\" w:color=\"808080\"/>\n"
481 " <w:end w:val=\"double\" w:sz=\"2\" w:space=\"0\" w:color=\"808080\"/>\n"
482 " </w:tcBorders>\n"))
483 goto end;
484 if (cell->extend_right > 1)
485 {
486 if (extract_astring_catf(alloc, output, " <w:gridSpan w:val=\"%i\"/>\n", cell->extend_right))
487 goto end;
488 }
489 if (cell->above)
490 {
491 if (cell->extend_down > 1)
492 {
493 if (extract_astring_catf(alloc, output, " <w:vMerge w:val=\"restart\"/>\n", cell->extend_down))
494 goto end;
495 }
496 }
497 else
498 {
499 if (extract_astring_catf(alloc, output, " <w:vMerge w:val=\"continue\"/>\n"))
500 goto end;
501 }
502 if (extract_astring_cat(alloc, output, " </w:tcPr>\n"))
503 goto end;
504 }
505
506 /* Write contents of this cell. */
507 {
508 content_paragraph_iterator pit;
509 paragraph_t *paragraph;
510 size_t chars_num_old = output->chars_num;
511 content_state_t content_state = {0};
512
513 content_state.font.name = NULL;
514 content_state.ctm_prev = NULL;
515 for (paragraph = content_paragraph_iterator_init(&pit, &cell->content); paragraph != NULL; paragraph = content_paragraph_iterator_next(&pit))
516 if (document_to_docx_content_paragraph(alloc, &content_state, paragraph, output))
517 goto end;
518
519 if (content_state.font.name)
520 if (docx_run_finish(alloc, &content_state, output)) goto end;
521
522 /* Need to write out at least an empty paragraph in each
523 * cell, otherwise Word/Libreoffice fail to show table at
524 * all; the OOXML spec says "If a table cell does not
525 * include at least one block-level element, then this
526 * document shall be considered corrupt." */
527 if (output->chars_num == chars_num_old)
528 if (extract_astring_catf(alloc, output, "<w:p/>\n"))
529 goto end;
530 }
531 if (extract_astring_cat(alloc, output, " </w:tc>\n"))
532 goto end;
533 }
534 if (extract_astring_cat(alloc, output, " </w:tr>\n"))
535 goto end;
536 }
537 if (extract_astring_cat(alloc, output, " </w:tbl>\n"))
538 goto end;
539
540 e = 0;
541 end:
542
543 return e;
544 }
545
546 /* Appends a block of content with same rotation. */
547 static int
548 docx_append_rotated_paragraphs(
549 extract_alloc_t *alloc,
550 content_state_t *state,
551 block_t *block,
552 int *text_box_id,
553 double angle,
554 extract_astring_t *output)
555 {
556 /* Find extent of paragraphs with this same rotation. extent
557 will contain max width and max height of paragraphs, in units
558 before application of ctm, i.e. before rotation. */
559 int e = -1;
560 rect_t bounds;
561
562 bounds = extract_block_pre_rotation_bounds(block, angle);
563
564 outf("angle=%f pre-transform box is: (%f %f) to (%f %f)",
565 angle, bounds.min.x, bounds.min.y, bounds.max.x, bounds.max.y);
566
567 /* All the paragraphs have same rotation. We output them into
568 * a single rotated text box. */
569
570 /* We need unique id for text box. */
571 *text_box_id += 1;
572
573 {
574 /* Angles are in units of 1/60,000 degree. */
575 int rot = (int) (angle * 180 / pi * 60000);
576
577 /* <wp:anchor distT=\.. etc are in EMU - 1/360,000 of a cm.
578 * relativeHeight is z-ordering. (wp:positionV:wp:posOffset,
579 * wp:positionV:wp:posOffset) is position of origin of box in
580 * EMU. */
581 double point_to_emu = 12700; /* https://en.wikipedia.org/wiki/Office_Open_XML_file_formats#DrawingML */
582 int x = (int) (bounds.min.x * point_to_emu);
583 int y = (int) (bounds.min.y * point_to_emu);
584 int w = (int) ((bounds.max.x - bounds.min.x) * point_to_emu);
585 int h = (int) ((bounds.max.y - bounds.min.y) * point_to_emu);
586
587 if (0) outf("rotate: %f rad, %f deg. rot=%i", angle, angle*180/pi, rot);
588
589 if (docx_output_rotated_paragraphs(alloc, block, rot, x, y, w, h, *text_box_id, output, state))
590 goto end;
591 }
592
593 e = 0;
594 end:
595
596 return e;
597 }
598
599 int
600 extract_document_to_docx_content(
601 extract_alloc_t *alloc,
602 document_t *document,
603 int spacing,
604 int rotation,
605 int images,
606 extract_astring_t *output)
607 {
608 int e = -1;
609 int text_box_id = 0;
610 int p;
611
612 /* Write paragraphs into <content>. */
613 for (p=0; p<document->pages_num; ++p)
614 {
615 extract_page_t *page = document->pages[p];
616 int c;
617
618 for (c=0; c<page->subpages_num; ++c)
619 {
620 subpage_t *subpage = page->subpages[c];
621 content_iterator cit;
622 content_t *content;
623 content_table_iterator tit;
624 table_t *table;
625
626 content_state_t content_state;
627 content_state.font.name = NULL;
628 content_state.font.size = 0;
629 content_state.font.bold = 0;
630 content_state.font.italic = 0;
631 content_state.ctm_prev = NULL;
632
633 /* Output paragraphs and tables in order of y coordinate. */
634 content = content_iterator_init(&cit, &subpage->content);
635 table = content_table_iterator_init(&tit, &subpage->tables);
636 while (1)
637 {
638 double y_paragraph;
639 double y_table;
640 /* Next block or NULL if none. */
641 block_t *block = (content && content->type == content_block) ? (block_t *)content : NULL;
642 /* Next paragraph or NULL if none. */
643 paragraph_t *paragraph = (content && content->type == content_paragraph) ? (paragraph_t *)content : (block ? content_first_paragraph(&block->content) : NULL);
644 line_t *first_line = paragraph ? content_first_line(&paragraph->content) : NULL;
645 span_t *first_span = first_line ? content_head_as_span(&first_line->content) : NULL;
646
647 if (!paragraph && !table) break;
648
649 y_paragraph = (first_span) ? first_span->chars[0].y : DBL_MAX;
650 y_table = (table) ? table->pos.y : DBL_MAX;
651
652 if (first_span && y_paragraph < y_table)
653 {
654 const matrix4_t *ctm = &first_span->ctm;
655 double angle = extract_baseline_angle(ctm);
656
657 if (spacing
658 && content_state.ctm_prev
659 && first_line
660 && first_span
661 && extract_matrix4_cmp(content_state.ctm_prev,
662 &first_span->ctm))
663 {
664 /* Extra vertical space between paragraphs that
665 * were at different angles in the original
666 * document. */
667 if (docx_paragraph_empty(alloc, output))
668 goto end;
669 }
670
671 /* Extra vertical space between paragraphs. */
672 if (spacing)
673 if (docx_paragraph_empty(alloc, output))
674 goto end;
675
676 if (rotation && angle != 0)
677 {
678 assert(block);
679 if (docx_append_rotated_paragraphs(alloc, &content_state, block, &text_box_id, angle, output))
680 goto end;
681 }
682 else if (block)
683 {
684 content_paragraph_iterator pit;
685 int first = 1;
686
687 for (paragraph = content_paragraph_iterator_init(&pit, &block->content); paragraph != NULL; paragraph = content_paragraph_iterator_next(&pit))
688 {
689 if (spacing && !first)
690 {
691 /* Extra vertical space between paragraphs. */
692 if (docx_paragraph_empty(alloc, output))
693 goto end;
694 }
695 first = 0;
696
697 if (document_to_docx_content_paragraph(alloc, &content_state, paragraph, output)) goto end;
698 }
699 }
700 else
701 {
702 if (document_to_docx_content_paragraph(alloc, &content_state, paragraph, output))
703 goto end;
704 }
705 content = content_iterator_next(&cit);
706 }
707 else if (table)
708 {
709 if (docx_append_table(alloc, table, output))
710 goto end;
711 table = content_table_iterator_next(&tit);
712 }
713 }
714
715 if (images)
716 {
717 content_image_iterator iit;
718 image_t *image;
719
720 for (image = content_image_iterator_init(&iit, &subpage->content); image != NULL; image = content_image_iterator_next(&iit))
721 docx_append_image(alloc, output, image);
722 }
723 }
724 }
725
726 e = 0;
727 end:
728
729 return e;
730 }
731
732
/* Sets *o_begin to end of first occurrence of <begin> in <text>, and *o_end to
 * beginning of first occurrence of <end> after that point.
 *
 * Returns 0 if both were found. Otherwise sets errno to ESRCH and returns -1;
 * in that case *o_begin is NULL if <begin> was not found, and *o_end is NULL
 * if <begin> was found but <end> was not. */
static int
find_mid(
        const char *text,
        const char *begin,
        const char *end,
        const char **o_begin,
        const char **o_end)
{
    const char *start = strstr(text, begin);

    *o_begin = start;
    if (start != NULL)
    {
        /* Step over <begin> itself, then look for <end> from there. */
        start += strlen(begin);
        *o_begin = start;
        *o_end = strstr(start, end);
        if (*o_end != NULL)
            return 0;
    }

    errno = ESRCH;
    return -1;
}
757
758
759 int
760 extract_docx_content_item(
761 extract_alloc_t *alloc,
762 extract_astring_t *contentss,
763 int contentss_num,
764 images_t *images,
765 const char *name,
766 const char *text,
767 char **text2)
768 {
769 int e = -1;
770 extract_astring_t temp = { 0 };
771
772 *text2 = NULL;
773
774 if (0)
775 {}
776 else if (!strcmp(name, "[Content_Types].xml"))
777 {
778 /* Add information about all image types that we are going to use. */
779 const char *begin;
780 const char *end;
781 const char *insert;
782 int it;
783
784 extract_astring_free(alloc, &temp);
785 outf("text: %s", text);
786 if (find_mid(text, "<Types ", "</Types>", &begin, &end)) goto end;
787
788 insert = begin;
789 insert = strchr(insert, '>');
790 assert(insert);
791 insert += 1;
792
793 if (extract_astring_catl(alloc, &temp, text, insert - text)) goto end;
794 outf("images->imagetypes_num=%i", images->imagetypes_num);
795 for (it=0; it<images->imagetypes_num; ++it) {
796 const char *imagetype = images->imagetypes[it];
797 if (extract_astring_cat(alloc, &temp, "<Default Extension=\"")) goto end;
798 if (extract_astring_cat(alloc, &temp, imagetype)) goto end;
799 if (extract_astring_cat(alloc, &temp, "\" ContentType=\"image/")) goto end;
800 if (extract_astring_cat(alloc, &temp, imagetype)) goto end;
801 if (extract_astring_cat(alloc, &temp, "\"/>")) goto end;
802 }
803 if (extract_astring_cat(alloc, &temp, insert)) goto end;
804 *text2 = temp.chars;
805 extract_astring_init(&temp);
806 }
807 else if (!strcmp(name, "word/_rels/document.xml.rels"))
808 {
809 /* Add relationships between image ids and image names within docx
810 * archive. */
811 const char *begin;
812 const char *end;
813 int j;
814
815 extract_astring_free(alloc, &temp);
816 if (find_mid(text, "<Relationships", "</Relationships>", &begin, &end)) goto end;
817 if (extract_astring_catl(alloc, &temp, text, end - text)) goto end;
818 outf("images.images_num=%i", images->images_num);
819 for (j=0; j<images->images_num; ++j) {
820 image_t* image = images->images[j];
821 if (extract_astring_cat(alloc, &temp, "<Relationship Id=\"")) goto end;
822 if (extract_astring_cat(alloc, &temp, image->id)) goto end;
823 if (extract_astring_cat(alloc, &temp, "\" Type=\"http://schemas.openxmlformats.org/officeDocument/2006/relationships/image\" Target=\"media/")) goto end;
824 if (extract_astring_cat(alloc, &temp, image->name)) goto end;
825 if (extract_astring_cat(alloc, &temp, "\"/>")) goto end;
826 }
827 if (extract_astring_cat(alloc, &temp, end)) goto end;
828 *text2 = temp.chars;
829 extract_astring_init(&temp);
830 }
831 else if (!strcmp(name, "word/document.xml"))
832 {
833 /* Insert paragraphs content. */
834 if (extract_content_insert(alloc,
835 text,
836 NULL /*single*/,
837 "<w:body>",
838 "</w:body>",
839 contentss,
840 contentss_num,
841 text2)) goto end;
842 }
843 else
844 {
845 *text2 = NULL;
846 }
847
848 e = 0;
849 end:
850
851 if (e)
852 {
853 /* We might have set <text2> to new content. */
854 extract_free(alloc, text2);
855 /* We might have used <temp> as a temporary buffer. */
856 extract_astring_free(alloc, &temp);
857 }
858 extract_astring_init(&temp);
859
860 return e;
861 }
862
863
864
865 int
866 extract_docx_write_template(
867 extract_alloc_t *alloc,
868 extract_astring_t *contentss,
869 int contentss_num,
870 images_t *images,
871 const char *path_template,
872 const char *path_out,
873 int preserve_dir)
874 {
875 int e = -1;
876 int i;
877 char *path_tempdir = NULL;
878 char *path = NULL;
879 char *text = NULL;
880 char *text2 = NULL;
881
882 assert(path_out);
883 assert(path_template);
884
885 if (extract_check_path_shell_safe(path_out))
886 {
887 outf("path_out is unsafe: %s", path_out);
888 goto end;
889 }
890
891 outf("images->images_num=%i", images->images_num);
892 if (extract_asprintf(alloc, &path_tempdir, "%s.dir", path_out) < 0) goto end;
893 if (extract_systemf(alloc, "rm -r '%s' 2>/dev/null", path_tempdir) < 0) goto end;
894
895 if (extract_mkdir(path_tempdir, 0777)) {
896 outf("Failed to create directory: %s", path_tempdir);
897 goto end;
898 }
899
900 outf("Unzipping template document '%s' to tempdir: %s",
901 path_template, path_tempdir);
902 if (extract_systemf(alloc, "unzip -q -d '%s' '%s'", path_tempdir, path_template))
903 {
904 outf("Failed to unzip %s into %s",
905 path_template, path_tempdir);
906 goto end;
907 }
908
909 /* Might be nice to iterate through all items in path_tempdir, but for now
910 * we look at just the items that we know extract_docx_content_item() will
911 * modify. */
912
913 {
914 const char *names[] = {
915 "word/document.xml",
916 "[Content_Types].xml",
917 "word/_rels/document.xml.rels",
918 };
919 int names_num = sizeof(names) / sizeof(names[0]);
920 for (i=0; i<names_num; ++i) {
921 const char* name = names[i];
922 extract_free(alloc, &path);
923 extract_free(alloc, &text);
924 extract_free(alloc, &text2);
925 if (extract_asprintf(alloc, &path, "%s/%s", path_tempdir, name) < 0) goto end;
926 if (extract_read_all_path(alloc, path, &text)) goto end;
927
928 if (extract_docx_content_item(alloc,
929 contentss,
930 contentss_num,
931 images,
932 name,
933 text,
934 &text2)) goto end;
935
936 {
937 const char *text3 = (text2) ? text2 : text;
938 if (extract_write_all(text3, strlen(text3), path)) goto end;
939 }
940 }
941 }
942
943 /* Copy images into <path_tempdir>/media/. */
944 extract_free(alloc, &path);
945 if (extract_asprintf(alloc, &path, "%s/word/media", path_tempdir) < 0) goto end;
946 if (extract_mkdir(path, 0777)) goto end;
947
948 for (i=0; i<images->images_num; ++i) {
949 image_t* image = images->images[i];
950 extract_free(alloc, &path);
951 if (extract_asprintf(alloc, &path, "%s/word/media/%s", path_tempdir, image->name) < 0) goto end;
952 if (extract_write_all(image->data, image->data_size, path)) goto end;
953 }
954
955 outf("Zipping tempdir to create %s", path_out);
956 {
957 const char *path_out_leaf = strrchr(path_out, '/');
958 if (!path_out_leaf) path_out_leaf = path_out;
959 if (extract_systemf(alloc, "cd '%s' && zip -q -r -D '../%s' .", path_tempdir, path_out_leaf))
960 {
961 outf("Zip command failed to convert '%s' directory into output file: %s",
962 path_tempdir, path_out);
963 goto end;
964 }
965 }
966
967 if (!preserve_dir) {
968 if (extract_remove_directory(alloc, path_tempdir)) goto end;
969 }
970
971 e = 0;
972 end:
973
974 outf("e=%i", e);
975 extract_free(alloc, &path_tempdir);
976 extract_free(alloc, &path);
977 extract_free(alloc, &text);
978 extract_free(alloc, &text2);
979
980 if (e)
981 {
982 outf("Failed to create %s", path_out);
983 }
984
985 return e;
986 }