Mercurial > hgrepos > Python2 > PyMuPDF
comparison mupdf-source/thirdparty/extract/src/docx.c @ 3:2c135c81b16c
MERGE: upstream PyMuPDF 1.26.4 with MuPDF 1.26.7
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Mon, 15 Sep 2025 11:44:09 +0200 |
| parents | b50eed0cc0ef |
| children |
comparison
equal
deleted
inserted
replaced
| 0:6015a75abc2d | 3:2c135c81b16c |
|---|---|
| 1 /* These extract_docx_*() functions generate docx content and docx zip archive | |
| 2 data. | |
| 3 | |
| 4 Caller must call things in a sensible order to create valid content - | |
| 5 e.g. don't call docx_paragraph_start() twice without intervening call to | |
| 6 docx_paragraph_finish(). */ | |
| 7 | |
| 8 #include "extract/extract.h" | |
| 9 | |
| 10 #include "docx_template.h" | |
| 11 | |
| 12 #include "astring.h" | |
| 13 #include "document.h" | |
| 14 #include "docx.h" | |
| 15 #include "mem.h" | |
| 16 #include "memento.h" | |
| 17 #include "outf.h" | |
| 18 #include "sys.h" | |
| 19 #include "text.h" | |
| 20 #include "zip.h" | |
| 21 | |
| 22 #include <assert.h> | |
| 23 #include <errno.h> | |
| 24 #include <float.h> | |
| 25 #include <math.h> | |
| 26 #include <stdlib.h> | |
| 27 #include <stdio.h> | |
| 28 #include <string.h> | |
| 29 | |
| 30 #include <sys/stat.h> | |
| 31 | |
| 32 | |
| 33 static int | |
| 34 docx_paragraph_start(extract_alloc_t *alloc, extract_astring_t *output) | |
| 35 { | |
| 36 return extract_astring_cat(alloc, output, "\n\n<w:p>"); | |
| 37 } | |
| 38 | |
| 39 static int | |
| 40 docx_paragraph_finish(extract_alloc_t *alloc, extract_astring_t *output) | |
| 41 { | |
| 42 return extract_astring_cat(alloc, output, "\n</w:p>"); | |
| 43 } | |
| 44 | |
| 45 /* Starts a new run. Caller must ensure that docx_run_finish() was | |
| 46 called to terminate any previous run. */ | |
| 47 static int | |
| 48 docx_run_start( extract_alloc_t *alloc, | |
| 49 extract_astring_t *output, | |
| 50 content_state_t *content_state) | |
| 51 { | |
| 52 int e = 0; | |
| 53 | |
| 54 if (!e) e = extract_astring_cat(alloc, output, "\n<w:r><w:rPr><w:rFonts w:ascii=\""); | |
| 55 if (!e) e = extract_astring_cat(alloc, output, content_state->font.name); | |
| 56 if (!e) e = extract_astring_cat(alloc, output, "\" w:hAnsi=\""); | |
| 57 if (!e) e = extract_astring_cat(alloc, output, content_state->font.name); | |
| 58 if (!e) e = extract_astring_cat(alloc, output, "\"/>"); | |
| 59 if (!e && content_state->font.bold) e = extract_astring_cat(alloc, output, "<w:b/>"); | |
| 60 if (!e && content_state->font.italic) e = extract_astring_cat(alloc, output, "<w:i/>"); | |
| 61 { | |
| 62 char font_size_text[32]; | |
| 63 | |
| 64 if (!e) e = extract_astring_cat(alloc, output, "<w:sz w:val=\""); | |
| 65 snprintf(font_size_text, sizeof(font_size_text), "%f", content_state->font.size * 2); | |
| 66 extract_astring_cat(alloc, output, font_size_text); | |
| 67 extract_astring_cat(alloc, output, "\"/>"); | |
| 68 | |
| 69 if (!e) e = extract_astring_cat(alloc, output, "<w:szCs w:val=\""); | |
| 70 snprintf(font_size_text, sizeof(font_size_text), "%f", content_state->font.size * 2); | |
| 71 extract_astring_cat(alloc, output, font_size_text); | |
| 72 extract_astring_cat(alloc, output, "\"/>"); | |
| 73 } | |
| 74 if (!e) e = extract_astring_cat(alloc, output, "</w:rPr><w:t xml:space=\"preserve\">"); | |
| 75 | |
| 76 return e; | |
| 77 } | |
| 78 | |
| 79 static int | |
| 80 docx_run_finish(extract_alloc_t *alloc, | |
| 81 content_state_t *state, | |
| 82 extract_astring_t *output) | |
| 83 { | |
| 84 if (state) state->font.name = NULL; | |
| 85 | |
| 86 return extract_astring_cat(alloc, output, "</w:t></w:r>"); | |
| 87 } | |
| 88 | |
| 89 /* Append an empty paragraph to *content. */ | |
| 90 static int | |
| 91 docx_paragraph_empty( | |
| 92 extract_alloc_t *alloc, | |
| 93 extract_astring_t *output) | |
| 94 { | |
| 95 int e = -1; | |
| 96 static char fontname[] = "OpenSans"; | |
| 97 content_state_t content_state = {0}; | |
| 98 | |
| 99 if (docx_paragraph_start(alloc, output)) goto end; | |
| 100 /* It seems like our choice of font size here doesn't make any difference | |
| 101 * to the ammount of vertical space, unless we include a non-space | |
| 102 * character. Presumably something to do with the styles in the template | |
| 103 * document. */ | |
| 104 content_state.font.name = fontname; | |
| 105 content_state.font.size = 10; | |
| 106 content_state.font.bold = 0; | |
| 107 content_state.font.italic = 0; | |
| 108 | |
| 109 if (docx_run_start(alloc, output, &content_state)) goto end; | |
| 110 //docx_char_append_string(output, " "); /*   is non-break space. */ | |
| 111 if (docx_run_finish(alloc, NULL /*state*/, output)) goto end; | |
| 112 if (docx_paragraph_finish(alloc, output)) goto end; | |
| 113 | |
| 114 e = 0; | |
| 115 end: | |
| 116 | |
| 117 return e; | |
| 118 } | |
| 119 | |
| 120 | |
| 121 /* Removes last char if it is <c>. */ | |
| 122 static int | |
| 123 docx_char_truncate_if(extract_astring_t *output, char c) | |
| 124 { | |
| 125 if (output->chars_num && output->chars[output->chars_num-1] == c) | |
| 126 extract_astring_truncate(output, 1); | |
| 127 | |
| 128 return 0; | |
| 129 } | |
| 130 | |
| 131 | |
| 132 /* Append docx xml for <paragraph> to <content>. Updates *state if we change | |
| 133 font. */ | |
| 134 static int | |
| 135 document_to_docx_content_paragraph( | |
| 136 extract_alloc_t *alloc, | |
| 137 content_state_t *content_state, | |
| 138 paragraph_t *paragraph, | |
| 139 extract_astring_t *content) | |
| 140 { | |
| 141 int e = -1; | |
| 142 content_line_iterator lit; | |
| 143 line_t *line; | |
| 144 | |
| 145 if (docx_paragraph_start(alloc, content)) goto end; | |
| 146 | |
| 147 if ((paragraph->line_flags & paragraph_not_fully_justified) == 0) | |
| 148 { | |
| 149 if (extract_astring_cat(alloc, content, "<w:pPr><w:jc w:val=\"both\"/></w:pPr>")) | |
| 150 goto end; | |
| 151 } | |
| 152 else if ((paragraph->line_flags & paragraph_not_centred) == 0) | |
| 153 { | |
| 154 if (extract_astring_cat(alloc, content, "<w:pPr><w:jc w:val=\"center\"/></w:pPr>")) | |
| 155 goto end; | |
| 156 } | |
| 157 else if ((paragraph->line_flags & (paragraph_not_aligned_left | paragraph_not_aligned_right)) == paragraph_not_aligned_left) | |
| 158 { | |
| 159 if (extract_astring_cat(alloc, content, "<w:pPr><w:jc w:val=\"right\"/></w:pPr>")) | |
| 160 goto end; | |
| 161 } | |
| 162 else if ((paragraph->line_flags & (paragraph_not_aligned_left | paragraph_not_aligned_right)) == paragraph_not_aligned_right) | |
| 163 { | |
| 164 if (extract_astring_cat(alloc, content, "<w:pPr><w:jc w:val=\"left\"/></w:pPr>")) | |
| 165 goto end; | |
| 166 } | |
| 167 | |
| 168 for (line = content_line_iterator_init(&lit, ¶graph->content); line != NULL; line = content_line_iterator_next(&lit)) | |
| 169 { | |
| 170 content_span_iterator sit; | |
| 171 span_t *span; | |
| 172 | |
| 173 for (span = content_span_iterator_init(&sit, &line->content); span != NULL; span = content_span_iterator_next(&sit)) | |
| 174 { | |
| 175 int si; | |
| 176 double font_size_new; | |
| 177 | |
| 178 content_state->ctm_prev = &span->ctm; | |
| 179 font_size_new = extract_font_size(&span->ctm); | |
| 180 if (!content_state->font.name | |
| 181 || strcmp(span->font_name, content_state->font.name) | |
| 182 || span->flags.font_bold != content_state->font.bold | |
| 183 || span->flags.font_italic != content_state->font.italic | |
| 184 || font_size_new != content_state->font.size) | |
| 185 { | |
| 186 if (content_state->font.name) | |
| 187 if (docx_run_finish(alloc, content_state, content)) | |
| 188 goto end; | |
| 189 | |
| 190 content_state->font.name = span->font_name; | |
| 191 content_state->font.bold = span->flags.font_bold; | |
| 192 content_state->font.italic = span->flags.font_italic; | |
| 193 content_state->font.size = font_size_new; | |
| 194 if (docx_run_start(alloc, content, content_state)) | |
| 195 goto end; | |
| 196 } | |
| 197 | |
| 198 for (si=0; si<span->chars_num; ++si) | |
| 199 { | |
| 200 char_t* char_ = &span->chars[si]; | |
| 201 int c = char_->ucs; | |
| 202 if (extract_astring_catc_unicode_xml(alloc, content, c)) | |
| 203 goto end; | |
| 204 } | |
| 205 /* Remove any trailing '-' at end of line. */ | |
| 206 if (docx_char_truncate_if(content, '-')) | |
| 207 goto end; | |
| 208 } | |
| 209 if (paragraph->line_flags & paragraph_breaks_strangely) | |
| 210 { | |
| 211 if (extract_astring_cat(alloc, content, "<w:br/>")) | |
| 212 goto end; | |
| 213 } | |
| 214 } | |
| 215 if (content_state->font.name) | |
| 216 { | |
| 217 if (docx_run_finish(alloc, content_state, content)) goto | |
| 218 end; | |
| 219 } | |
| 220 if (docx_paragraph_finish(alloc, content)) | |
| 221 goto end; | |
| 222 | |
| 223 e = 0; | |
| 224 end: | |
| 225 | |
| 226 return e; | |
| 227 } | |
| 228 | |
| 229 /* Write reference to image into docx content. */ | |
| 230 static int | |
| 231 docx_append_image( | |
| 232 extract_alloc_t *alloc, | |
| 233 extract_astring_t *output, | |
| 234 image_t *image) | |
| 235 { | |
| 236 extract_astring_cat(alloc, output, "\n"); | |
| 237 extract_astring_cat(alloc, output, " <w:p>\n"); | |
| 238 extract_astring_cat(alloc, output, " <w:r>\n"); | |
| 239 extract_astring_cat(alloc, output, " <w:rPr>\n"); | |
| 240 extract_astring_cat(alloc, output, " <w:noProof/>\n"); | |
| 241 extract_astring_cat(alloc, output, " </w:rPr>\n"); | |
| 242 extract_astring_cat(alloc, output, " <w:drawing>\n"); | |
| 243 extract_astring_cat(alloc, output, " <wp:inline distT=\"0\" distB=\"0\" distL=\"0\" distR=\"0\" wp14:anchorId=\"7057A832\" wp14:editId=\"466EB3FB\">\n"); | |
| 244 //extract_astring_cat(alloc, output, " <wp:extent cx=\"2933700\" cy=\"2200275\"/>\n"); | |
| 245 //extract_astring_cat(alloc, output, " <wp:effectExtent l=\"0\" t=\"0\" r=\"0\" b=\"9525\"/>\n"); | |
| 246 extract_astring_cat(alloc, output, " <wp:docPr id=\"1\" name=\"Picture 1\"/>\n"); | |
| 247 extract_astring_cat(alloc, output, " <wp:cNvGraphicFramePr>\n"); | |
| 248 extract_astring_cat(alloc, output, " <a:graphicFrameLocks xmlns:a=\"http://schemas.openxmlformats.org/drawingml/2006/main\" noChangeAspect=\"1\"/>\n"); | |
| 249 extract_astring_cat(alloc, output, " </wp:cNvGraphicFramePr>\n"); | |
| 250 extract_astring_cat(alloc, output, " <a:graphic xmlns:a=\"http://schemas.openxmlformats.org/drawingml/2006/main\">\n"); | |
| 251 extract_astring_cat(alloc, output, " <a:graphicData uri=\"http://schemas.openxmlformats.org/drawingml/2006/picture\">\n"); | |
| 252 extract_astring_cat(alloc, output, " <pic:pic xmlns:pic=\"http://schemas.openxmlformats.org/drawingml/2006/picture\">\n"); | |
| 253 extract_astring_cat(alloc, output, " <pic:nvPicPr>\n"); | |
| 254 extract_astring_cat(alloc, output, " <pic:cNvPr id=\"1\" name=\"Picture 1\"/>\n"); | |
| 255 extract_astring_cat(alloc, output, " <pic:cNvPicPr>\n"); | |
| 256 extract_astring_cat(alloc, output, " <a:picLocks noChangeAspect=\"1\" noChangeArrowheads=\"1\"/>\n"); | |
| 257 extract_astring_cat(alloc, output, " </pic:cNvPicPr>\n"); | |
| 258 extract_astring_cat(alloc, output, " </pic:nvPicPr>\n"); | |
| 259 extract_astring_cat(alloc, output, " <pic:blipFill>\n"); | |
| 260 extract_astring_catf(alloc, output," <a:blip r:embed=\"%s\">\n", image->id); | |
| 261 extract_astring_cat(alloc, output, " <a:extLst>\n"); | |
| 262 extract_astring_cat(alloc, output, " <a:ext uri=\"{28A0092B-C50C-407E-A947-70E740481C1C}\">\n"); | |
| 263 extract_astring_cat(alloc, output, " <a14:useLocalDpi xmlns:a14=\"http://schemas.microsoft.com/office/drawing/2010/main\" val=\"0\"/>\n"); | |
| 264 extract_astring_cat(alloc, output, " </a:ext>\n"); | |
| 265 extract_astring_cat(alloc, output, " </a:extLst>\n"); | |
| 266 extract_astring_cat(alloc, output, " </a:blip>\n"); | |
| 267 //extract_astring_cat(alloc, output, " <a:srcRect/>\n"); | |
| 268 extract_astring_cat(alloc, output, " <a:stretch>\n"); | |
| 269 extract_astring_cat(alloc, output, " <a:fillRect/>\n"); | |
| 270 extract_astring_cat(alloc, output, " </a:stretch>\n"); | |
| 271 extract_astring_cat(alloc, output, " </pic:blipFill>\n"); | |
| 272 extract_astring_cat(alloc, output, " <pic:spPr bwMode=\"auto\">\n"); | |
| 273 extract_astring_cat(alloc, output, " <a:xfrm>\n"); | |
| 274 extract_astring_cat(alloc, output, " <a:off x=\"0\" y=\"0\"/>\n"); | |
| 275 //extract_astring_cat(alloc, output, " <a:ext cx=\"2933700\" cy=\"2200275\"/>\n"); | |
| 276 extract_astring_cat(alloc, output, " </a:xfrm>\n"); | |
| 277 extract_astring_cat(alloc, output, " <a:prstGeom prst=\"rect\">\n"); | |
| 278 extract_astring_cat(alloc, output, " <a:avLst/>\n"); | |
| 279 extract_astring_cat(alloc, output, " </a:prstGeom>\n"); | |
| 280 extract_astring_cat(alloc, output, " <a:noFill/>\n"); | |
| 281 extract_astring_cat(alloc, output, " <a:ln>\n"); | |
| 282 extract_astring_cat(alloc, output, " <a:noFill/>\n"); | |
| 283 extract_astring_cat(alloc, output, " </a:ln>\n"); | |
| 284 extract_astring_cat(alloc, output, " </pic:spPr>\n"); | |
| 285 extract_astring_cat(alloc, output, " </pic:pic>\n"); | |
| 286 extract_astring_cat(alloc, output, " </a:graphicData>\n"); | |
| 287 extract_astring_cat(alloc, output, " </a:graphic>\n"); | |
| 288 extract_astring_cat(alloc, output, " </wp:inline>\n"); | |
| 289 extract_astring_cat(alloc, output, " </w:drawing>\n"); | |
| 290 extract_astring_cat(alloc, output, " </w:r>\n"); | |
| 291 extract_astring_cat(alloc, output, " </w:p>\n"); | |
| 292 extract_astring_cat(alloc, output, "\n"); | |
| 293 | |
| 294 return 0; | |
| 295 } | |
| 296 | |
| 297 | |
| 298 /* Writes paragraph to content inside rotated text box. */ | |
| 299 static int | |
| 300 docx_output_rotated_paragraphs( | |
| 301 extract_alloc_t *alloc, | |
| 302 block_t *block, | |
| 303 int rot, | |
| 304 int x, | |
| 305 int y, | |
| 306 int w, | |
| 307 int h, | |
| 308 int text_box_id, | |
| 309 extract_astring_t *output, | |
| 310 content_state_t *state) | |
| 311 { | |
| 312 int e = -1; | |
| 313 paragraph_t *paragraph; | |
| 314 content_paragraph_iterator pit; | |
| 315 | |
| 316 outf("x,y=%ik,%ik = %i,%i", x/1000, y/1000, x, y); | |
| 317 extract_astring_cat(alloc, output, "\n"); | |
| 318 extract_astring_cat(alloc, output, "\n"); | |
| 319 extract_astring_cat(alloc, output, "<w:p>\n"); | |
| 320 extract_astring_cat(alloc, output, " <w:r>\n"); | |
| 321 extract_astring_cat(alloc, output, " <mc:AlternateContent>\n"); | |
| 322 extract_astring_cat(alloc, output, " <mc:Choice Requires=\"wps\">\n"); | |
| 323 extract_astring_cat(alloc, output, " <w:drawing>\n"); | |
| 324 extract_astring_cat(alloc, output, " <wp:anchor distT=\"0\" distB=\"0\" distL=\"0\" distR=\"0\" simplePos=\"0\" relativeHeight=\"0\" behindDoc=\"0\" locked=\"0\" layoutInCell=\"1\" allowOverlap=\"1\" wp14:anchorId=\"53A210D1\" wp14:editId=\"2B7E8016\">\n"); | |
| 325 extract_astring_cat(alloc, output, " <wp:simplePos x=\"0\" y=\"0\"/>\n"); | |
| 326 extract_astring_cat(alloc, output, " <wp:positionH relativeFrom=\"page\">\n"); | |
| 327 extract_astring_catf(alloc, output," <wp:posOffset>%i</wp:posOffset>\n", x); | |
| 328 extract_astring_cat(alloc, output, " </wp:positionH>\n"); | |
| 329 extract_astring_cat(alloc, output, " <wp:positionV relativeFrom=\"page\">\n"); | |
| 330 extract_astring_catf(alloc, output," <wp:posOffset>%i</wp:posOffset>\n", y); | |
| 331 extract_astring_cat(alloc, output, " </wp:positionV>\n"); | |
| 332 extract_astring_catf(alloc, output," <wp:extent cx=\"%i\" cy=\"%i\"/>\n", w, h); | |
| 333 //extract_astring_cat(alloc, output, " <wp:effectExtent l=\"381000\" t=\"723900\" r=\"371475\" b=\"723900\"/>\n"); | |
| 334 extract_astring_cat(alloc, output, " <wp:wrapNone/>\n"); | |
| 335 extract_astring_catf(alloc, output," <wp:docPr id=\"%i\" name=\"Text Box %i\"/>\n", text_box_id, text_box_id); | |
| 336 extract_astring_cat(alloc, output, " <wp:cNvGraphicFramePr/>\n"); | |
| 337 extract_astring_cat(alloc, output, " <a:graphic xmlns:a=\"http://schemas.openxmlformats.org/drawingml/2006/main\">\n"); | |
| 338 extract_astring_cat(alloc, output, " <a:graphicData uri=\"http://schemas.microsoft.com/office/word/2010/wordprocessingShape\">\n"); | |
| 339 extract_astring_cat(alloc, output, " <wps:wsp>\n"); | |
| 340 extract_astring_cat(alloc, output, " <wps:cNvSpPr txBox=\"1\"/>\n"); | |
| 341 extract_astring_cat(alloc, output, " <wps:spPr>\n"); | |
| 342 extract_astring_catf(alloc, output," <a:xfrm rot=\"%i\">\n", rot); | |
| 343 extract_astring_cat(alloc, output, " <a:off x=\"0\" y=\"0\"/>\n"); | |
| 344 //extract_astring_cat(alloc, output, " <a:ext cx=\"3228975\" cy=\"2286000\"/>\n"); | |
| 345 extract_astring_cat(alloc, output, " </a:xfrm>\n"); | |
| 346 extract_astring_cat(alloc, output, " <a:prstGeom prst=\"rect\">\n"); | |
| 347 extract_astring_cat(alloc, output, " <a:avLst/>\n"); | |
| 348 extract_astring_cat(alloc, output, " </a:prstGeom>\n"); | |
| 349 | |
| 350 /* Give box a solid background. */ | |
| 351 if (0) { | |
| 352 extract_astring_cat(alloc, output, " <a:solidFill>\n"); | |
| 353 extract_astring_cat(alloc, output, " <a:schemeClr val=\"lt1\"/>\n"); | |
| 354 extract_astring_cat(alloc, output, " </a:solidFill>\n"); | |
| 355 } | |
| 356 | |
| 357 /* Draw line around box. */ | |
| 358 if (0) { | |
| 359 extract_astring_cat(alloc, output, " <a:ln w=\"175\">\n"); | |
| 360 extract_astring_cat(alloc, output, " <a:solidFill>\n"); | |
| 361 extract_astring_cat(alloc, output, " <a:prstClr val=\"black\"/>\n"); | |
| 362 extract_astring_cat(alloc, output, " </a:solidFill>\n"); | |
| 363 extract_astring_cat(alloc, output, " </a:ln>\n"); | |
| 364 } | |
| 365 | |
| 366 extract_astring_cat(alloc, output, " </wps:spPr>\n"); | |
| 367 extract_astring_cat(alloc, output, " <wps:txbx>\n"); | |
| 368 extract_astring_cat(alloc, output, " <w:txbxContent>"); | |
| 369 | |
| 370 #if 0 | |
| 371 if (0) { | |
| 372 /* Output inline text describing the rotation. */ | |
| 373 extract_astring_catf(content, "<w:p>\n" | |
| 374 "<w:r><w:rPr><w:rFonts w:ascii=\"OpenSans\" w:hAnsi=\"OpenSans\"/><w:sz w:val=\"20.000000\"/><w:szCs w:val=\"15.000000\"/></w:rPr><w:t xml:space=\"preserve\">*** rotate: %f rad, %f deg. rot=%i</w:t></w:r>\n" | |
| 375 "</w:p>\n", | |
| 376 rotate, | |
| 377 rotate * 180 / pi, | |
| 378 rot | |
| 379 ); | |
| 380 } | |
| 381 #endif | |
| 382 | |
| 383 /* Output paragraphs p0..p2-1. */ | |
| 384 for (paragraph = content_paragraph_iterator_init(&pit, &block->content); paragraph != NULL; paragraph = content_paragraph_iterator_next(&pit)) | |
| 385 if (document_to_docx_content_paragraph(alloc, state, paragraph, output)) goto end; | |
| 386 | |
| 387 extract_astring_cat(alloc, output, "\n"); | |
| 388 extract_astring_cat(alloc, output, " </w:txbxContent>\n"); | |
| 389 extract_astring_cat(alloc, output, " </wps:txbx>\n"); | |
| 390 extract_astring_cat(alloc, output, " <wps:bodyPr rot=\"0\" spcFirstLastPara=\"0\" vertOverflow=\"overflow\" horzOverflow=\"overflow\" vert=\"horz\" wrap=\"square\" lIns=\"91440\" tIns=\"45720\" rIns=\"91440\" bIns=\"45720\" numCol=\"1\" spcCol=\"0\" rtlCol=\"0\" fromWordArt=\"0\" anchor=\"t\" anchorCtr=\"0\" forceAA=\"0\" compatLnSpc=\"1\">\n"); | |
| 391 extract_astring_cat(alloc, output, " <a:prstTxWarp prst=\"textNoShape\">\n"); | |
| 392 extract_astring_cat(alloc, output, " <a:avLst/>\n"); | |
| 393 extract_astring_cat(alloc, output, " </a:prstTxWarp>\n"); | |
| 394 extract_astring_cat(alloc, output, " <a:noAutofit/>\n"); | |
| 395 extract_astring_cat(alloc, output, " </wps:bodyPr>\n"); | |
| 396 extract_astring_cat(alloc, output, " </wps:wsp>\n"); | |
| 397 extract_astring_cat(alloc, output, " </a:graphicData>\n"); | |
| 398 extract_astring_cat(alloc, output, " </a:graphic>\n"); | |
| 399 extract_astring_cat(alloc, output, " </wp:anchor>\n"); | |
| 400 extract_astring_cat(alloc, output, " </w:drawing>\n"); | |
| 401 extract_astring_cat(alloc, output, " </mc:Choice>\n"); | |
| 402 | |
| 403 #if 0 | |
| 404 /* This fallback is copied from a real Word document. Not sure | |
| 405 whether it works - both Libreoffice and Word use the above | |
| 406 choice. */ | |
| 407 extract_astring_cat(alloc, output, " <mc:Fallback>\n"); | |
| 408 extract_astring_cat(alloc, output, " <w:pict>\n"); | |
| 409 extract_astring_cat(alloc, output, " <v:shapetype w14:anchorId=\"53A210D1\" id=\"_x0000_t202\" coordsize=\"21600,21600\" o:spt=\"202\" path=\"m,l,21600r21600,l21600,xe\">\n"); | |
| 410 extract_astring_cat(alloc, output, " <v:stroke joinstyle=\"miter\"/>\n"); | |
| 411 extract_astring_cat(alloc, output, " <v:path gradientshapeok=\"t\" o:connecttype=\"rect\"/>\n"); | |
| 412 extract_astring_cat(alloc, output, " </v:shapetype>\n"); | |
| 413 extract_astring_catf(alloc, output," <v:shape id=\"Text Box %i\" o:spid=\"_x0000_s1026\" type=\"#_x0000_t202\" style=\"position:absolute;margin-left:71.25pt;margin-top:48.75pt;width:254.25pt;height:180pt;rotation:-2241476fd;z-index:251659264;visibility:visible;mso-wrap-style:square;mso-wrap-distance-left:9pt;mso-wrap-distance-top:0;mso-wrap-distance-right:9pt;mso-wrap-distance-bottom:0;mso-position-horizontal:absolute;mso-position-horizontal-relative:text;mso-position-vertical:absolute;mso-position-vertical-relative:text;v-text-anchor:top\" o:gfxdata=\"UEsDBBQABgAIAAAAIQC2gziS/gAAAOEBAAATAAAAW0NvbnRlbnRfVHlwZXNdLnhtbJSRQU7DMBBF 90jcwfIWJU67QAgl6YK0S0CoHGBkTxKLZGx5TGhvj5O2G0SRWNoz/78nu9wcxkFMGNg6quQqL6RA 0s5Y6ir5vt9lD1JwBDIwOMJKHpHlpr69KfdHjyxSmriSfYz+USnWPY7AufNIadK6MEJMx9ApD/oD OlTrorhX2lFEilmcO2RdNtjC5xDF9pCuTyYBB5bi6bQ4syoJ3g9WQ0ymaiLzg5KdCXlKLjvcW893 SUOqXwnz5DrgnHtJTxOsQfEKIT7DmDSUCaxw7Rqn8787ZsmRM9e2VmPeBN4uqYvTtW7jvijg9N/y JsXecLq0q+WD6m8AAAD//wMAUEsDBBQABgAIAAAAIQA4/SH/1gAAAJQBAAALAAAAX3JlbHMvLnJl bHOkkMFqwzAMhu+DvYPRfXGawxijTi+j0GvpHsDYimMaW0Yy2fr2M4PBMnrbUb/Q94l/f/hMi1qR JVI2sOt6UJgd+ZiDgffL8ekFlFSbvV0oo4EbChzGx4f9GRdb25HMsYhqlCwG5lrLq9biZkxWOiqY 22YiTra2kYMu1l1tQD30/bPm3wwYN0x18gb45AdQl1tp5j/sFB2T0FQ7R0nTNEV3j6o9feQzro1i OWA14Fm+Q8a1a8+Bvu/d/dMb2JY5uiPbhG/ktn4cqGU/er3pcvwCAAD//wMAUEsDBBQABgAIAAAA IQDQg5pQVgIAALEEAAAOAAAAZHJzL2Uyb0RvYy54bWysVE1v2zAMvQ/YfxB0X+2k+WiDOEXWosOA oi3QDj0rstwYk0VNUmJ3v35PipMl3U7DLgJFPj+Rj6TnV12j2VY5X5Mp+OAs50wZSWVtXgv+7fn2 0wVnPghTCk1GFfxNeX61+Phh3tqZGtKadKkcA4nxs9YWfB2CnWWZl2vVCH9GVhkEK3KNCLi616x0 ogV7o7Nhnk+yllxpHUnlPbw3uyBfJP6qUjI8VJVXgemCI7eQTpfOVTyzxVzMXp2w61r2aYh/yKIR tcGjB6obEQTbuPoPqqaWjjxV4UxSk1FV1VKlGlDNIH9XzdNaWJVqgTjeHmTy/49W3m8fHatL9I4z Ixq06Fl1gX2mjg2iOq31M4CeLGChgzsie7+HMxbdVa5hjiDu4HI8ml5MpkkLVMcAh+xvB6kjt4Tz fDi8uJyOOZOIwZ7keWpGtmOLrNb58EVRw6JRcIdeJlqxvfMBGQC6h0S4J12Xt7XW6RLnR11rx7YC ndch5YwvTlDasLbgk/NxnohPYpH68P1KC/k9Vn3KgJs2cEaNdlpEK3SrrhdoReUbdEvSQAZv5W0N 
3jvhw6NwGDQ4sTzhAUelCclQb3G2Jvfzb/6IR/8R5azF4Bbc/9gIpzjTXw0m43IwGsVJT5fReDrE xR1HVscRs2muCQqh+8gumREf9N6sHDUv2LFlfBUhYSTeLnjYm9dht07YUamWywTCbFsR7syTlZF6 383n7kU42/czYBTuaT/iYvaurTts/NLQchOoqlPPo8A7VXvdsRepLf0Ox8U7vifU7z/N4hcAAAD/ /wMAUEsDBBQABgAIAAAAIQBh17L63wAAAAoBAAAPAAAAZHJzL2Rvd25yZXYueG1sTI9BT4NAEIXv Jv6HzZh4s0ubgpayNIboSW3Syg9Y2BGI7CyyS0v99Y4nPU3ezMub72W72fbihKPvHClYLiIQSLUz HTUKyvfnuwcQPmgyuneECi7oYZdfX2U6Ne5MBzwdQyM4hHyqFbQhDKmUvm7Rar9wAxLfPtxodWA5 NtKM+szhtperKEqk1R3xh1YPWLRYfx4nq8APVfz9VQxPb+WUNC+vZbGPDhelbm/mxy2IgHP4M8Mv PqNDzkyVm8h40bNer2K2Ktjc82RDEi+5XKVgHfNG5pn8XyH/AQAA//8DAFBLAQItABQABgAIAAAA IQC2gziS/gAAAOEBAAATAAAAAAAAAAAAAAAAAAAAAABbQ29udGVudF9UeXBlc10ueG1sUEsBAi0A FAAGAAgAAAAhADj9If/WAAAAlAEAAAsAAAAAAAAAAAAAAAAALwEAAF9yZWxzLy5yZWxzUEsBAi0A FAAGAAgAAAAhANCDmlBWAgAAsQQAAA4AAAAAAAAAAAAAAAAALgIAAGRycy9lMm9Eb2MueG1sUEsB Ai0AFAAGAAgAAAAhAGHXsvrfAAAACgEAAA8AAAAAAAAAAAAAAAAAsAQAAGRycy9kb3ducmV2Lnht bFBLBQYAAAAABAAEAPMAAAC8BQAAAAA= \" fillcolor=\"white [3201]\" strokeweight=\".5pt\">\n", text_box_id); | |
| 414 extract_astring_cat(alloc, output, " <v:textbox>\n"); | |
| 415 extract_astring_cat(alloc, output, " <w:txbxContent>"); | |
| 416 | |
| 417 for (paragraph = content_paragraph_iterator_init(&pit, &block->content); paragraph != NULL; paragraph = content_paragraph_iterator_next(&pit)) | |
| 418 if (document_to_docx_content_paragraph(alloc, state, paragraph, output)) goto end; | |
| 419 | |
| 420 extract_astring_cat(alloc, output, "\n"); | |
| 421 extract_astring_cat(alloc, output, "\n"); | |
| 422 extract_astring_cat(alloc, output, " </w:txbxContent>\n"); | |
| 423 extract_astring_cat(alloc, output, " </v:textbox>\n"); | |
| 424 extract_astring_cat(alloc, output, " </v:shape>\n"); | |
| 425 extract_astring_cat(alloc, output, " </w:pict>\n"); | |
| 426 extract_astring_cat(alloc, output, " </mc:Fallback>\n"); | |
| 427 #endif | |
| 428 extract_astring_cat(alloc, output, " </mc:AlternateContent>\n"); | |
| 429 extract_astring_cat(alloc, output, " </w:r>\n"); | |
| 430 extract_astring_cat(alloc, output, "</w:p>"); | |
| 431 | |
| 432 e = 0; | |
| 433 end: | |
| 434 | |
| 435 return e; | |
| 436 } | |
| 437 | |
| 438 | |
| 439 /* Appends table to content. | |
| 440 | |
| 441 We do not fix the size of the table or its columns and rows, but instead leave layout up | |
| 442 to the application. */ | |
| 443 static int | |
| 444 docx_append_table( | |
| 445 extract_alloc_t *alloc, | |
| 446 table_t *table, | |
| 447 extract_astring_t *output) | |
| 448 { | |
| 449 int e = -1; | |
| 450 int y; | |
| 451 | |
| 452 if (extract_astring_cat(alloc, output, | |
| 453 "\n" | |
| 454 " <w:tbl>\n" | |
| 455 " <w:tblLayout w:type=\"autofit\"/>\n")) | |
| 456 goto end; | |
| 457 | |
| 458 for (y=0; y<table->cells_num_y; ++y) | |
| 459 { | |
| 460 int x; | |
| 461 if (extract_astring_cat(alloc, output, | |
| 462 " <w:tr>\n" | |
| 463 " <w:trPr/>\n")) goto end; | |
| 464 | |
| 465 for (x=0; x<table->cells_num_x; ++x) | |
| 466 { | |
| 467 cell_t* cell = table->cells[y*table->cells_num_x + x]; | |
| 468 if (!cell->left) continue; | |
| 469 | |
| 470 if (extract_astring_cat(alloc, output, " <w:tc>\n")) | |
| 471 goto end; | |
| 472 | |
| 473 /* Write cell properties. */ | |
| 474 { | |
| 475 if (extract_astring_cat(alloc, output, | |
| 476 " <w:tcPr>\n" | |
| 477 " <w:tcBorders>\n" | |
| 478 " <w:top w:val=\"double\" w:sz=\"2\" w:space=\"0\" w:color=\"808080\"/>\n" | |
| 479 " <w:start w:val=\"double\" w:sz=\"2\" w:space=\"0\" w:color=\"808080\"/>\n" | |
| 480 " <w:bottom w:val=\"double\" w:sz=\"2\" w:space=\"0\" w:color=\"808080\"/>\n" | |
| 481 " <w:end w:val=\"double\" w:sz=\"2\" w:space=\"0\" w:color=\"808080\"/>\n" | |
| 482 " </w:tcBorders>\n")) | |
| 483 goto end; | |
| 484 if (cell->extend_right > 1) | |
| 485 { | |
| 486 if (extract_astring_catf(alloc, output, " <w:gridSpan w:val=\"%i\"/>\n", cell->extend_right)) | |
| 487 goto end; | |
| 488 } | |
| 489 if (cell->above) | |
| 490 { | |
| 491 if (cell->extend_down > 1) | |
| 492 { | |
| 493 if (extract_astring_catf(alloc, output, " <w:vMerge w:val=\"restart\"/>\n", cell->extend_down)) | |
| 494 goto end; | |
| 495 } | |
| 496 } | |
| 497 else | |
| 498 { | |
| 499 if (extract_astring_catf(alloc, output, " <w:vMerge w:val=\"continue\"/>\n")) | |
| 500 goto end; | |
| 501 } | |
| 502 if (extract_astring_cat(alloc, output, " </w:tcPr>\n")) | |
| 503 goto end; | |
| 504 } | |
| 505 | |
| 506 /* Write contents of this cell. */ | |
| 507 { | |
| 508 content_paragraph_iterator pit; | |
| 509 paragraph_t *paragraph; | |
| 510 size_t chars_num_old = output->chars_num; | |
| 511 content_state_t content_state = {0}; | |
| 512 | |
| 513 content_state.font.name = NULL; | |
| 514 content_state.ctm_prev = NULL; | |
| 515 for (paragraph = content_paragraph_iterator_init(&pit, &cell->content); paragraph != NULL; paragraph = content_paragraph_iterator_next(&pit)) | |
| 516 if (document_to_docx_content_paragraph(alloc, &content_state, paragraph, output)) | |
| 517 goto end; | |
| 518 | |
| 519 if (content_state.font.name) | |
| 520 if (docx_run_finish(alloc, &content_state, output)) goto end; | |
| 521 | |
| 522 /* Need to write out at least an empty paragraph in each | |
| 523 * cell, otherwise Word/Libreoffice fail to show table at | |
| 524 * all; the OOXML spec says "If a table cell does not | |
| 525 * include at least one block-level element, then this | |
| 526 * document shall be considered corrupt." */ | |
| 527 if (output->chars_num == chars_num_old) | |
| 528 if (extract_astring_catf(alloc, output, "<w:p/>\n")) | |
| 529 goto end; | |
| 530 } | |
| 531 if (extract_astring_cat(alloc, output, " </w:tc>\n")) | |
| 532 goto end; | |
| 533 } | |
| 534 if (extract_astring_cat(alloc, output, " </w:tr>\n")) | |
| 535 goto end; | |
| 536 } | |
| 537 if (extract_astring_cat(alloc, output, " </w:tbl>\n")) | |
| 538 goto end; | |
| 539 | |
| 540 e = 0; | |
| 541 end: | |
| 542 | |
| 543 return e; | |
| 544 } | |
| 545 | |
| 546 /* Appends a block of content with same rotation. */ | |
| 547 static int | |
| 548 docx_append_rotated_paragraphs( | |
| 549 extract_alloc_t *alloc, | |
| 550 content_state_t *state, | |
| 551 block_t *block, | |
| 552 int *text_box_id, | |
| 553 double angle, | |
| 554 extract_astring_t *output) | |
| 555 { | |
| 556 /* Find extent of paragraphs with this same rotation. extent | |
| 557 will contain max width and max height of paragraphs, in units | |
| 558 before application of ctm, i.e. before rotation. */ | |
| 559 int e = -1; | |
| 560 rect_t bounds; | |
| 561 | |
| 562 bounds = extract_block_pre_rotation_bounds(block, angle); | |
| 563 | |
| 564 outf("angle=%f pre-transform box is: (%f %f) to (%f %f)", | |
| 565 angle, bounds.min.x, bounds.min.y, bounds.max.x, bounds.max.y); | |
| 566 | |
| 567 /* All the paragraphs have same rotation. We output them into | |
| 568 * a single rotated text box. */ | |
| 569 | |
| 570 /* We need unique id for text box. */ | |
| 571 *text_box_id += 1; | |
| 572 | |
| 573 { | |
| 574 /* Angles are in units of 1/60,000 degree. */ | |
| 575 int rot = (int) (angle * 180 / pi * 60000); | |
| 576 | |
| 577 /* <wp:anchor distT=\.. etc are in EMU - 1/360,000 of a cm. | |
| 578 * relativeHeight is z-ordering. (wp:positionV:wp:posOffset, | |
| 579 * wp:positionV:wp:posOffset) is position of origin of box in | |
| 580 * EMU. */ | |
| 581 double point_to_emu = 12700; /* https://en.wikipedia.org/wiki/Office_Open_XML_file_formats#DrawingML */ | |
| 582 int x = (int) (bounds.min.x * point_to_emu); | |
| 583 int y = (int) (bounds.min.y * point_to_emu); | |
| 584 int w = (int) ((bounds.max.x - bounds.min.x) * point_to_emu); | |
| 585 int h = (int) ((bounds.max.y - bounds.min.y) * point_to_emu); | |
| 586 | |
| 587 if (0) outf("rotate: %f rad, %f deg. rot=%i", angle, angle*180/pi, rot); | |
| 588 | |
| 589 if (docx_output_rotated_paragraphs(alloc, block, rot, x, y, w, h, *text_box_id, output, state)) | |
| 590 goto end; | |
| 591 } | |
| 592 | |
| 593 e = 0; | |
| 594 end: | |
| 595 | |
| 596 return e; | |
| 597 } | |
| 598 | |
| 599 int | |
| 600 extract_document_to_docx_content( | |
| 601 extract_alloc_t *alloc, | |
| 602 document_t *document, | |
| 603 int spacing, | |
| 604 int rotation, | |
| 605 int images, | |
| 606 extract_astring_t *output) | |
| 607 { | |
| 608 int e = -1; | |
| 609 int text_box_id = 0; | |
| 610 int p; | |
| 611 | |
| 612 /* Write paragraphs into <content>. */ | |
| 613 for (p=0; p<document->pages_num; ++p) | |
| 614 { | |
| 615 extract_page_t *page = document->pages[p]; | |
| 616 int c; | |
| 617 | |
| 618 for (c=0; c<page->subpages_num; ++c) | |
| 619 { | |
| 620 subpage_t *subpage = page->subpages[c]; | |
| 621 content_iterator cit; | |
| 622 content_t *content; | |
| 623 content_table_iterator tit; | |
| 624 table_t *table; | |
| 625 | |
| 626 content_state_t content_state; | |
| 627 content_state.font.name = NULL; | |
| 628 content_state.font.size = 0; | |
| 629 content_state.font.bold = 0; | |
| 630 content_state.font.italic = 0; | |
| 631 content_state.ctm_prev = NULL; | |
| 632 | |
| 633 /* Output paragraphs and tables in order of y coordinate. */ | |
| 634 content = content_iterator_init(&cit, &subpage->content); | |
| 635 table = content_table_iterator_init(&tit, &subpage->tables); | |
| 636 while (1) | |
| 637 { | |
| 638 double y_paragraph; | |
| 639 double y_table; | |
| 640 /* Next block or NULL if none. */ | |
| 641 block_t *block = (content && content->type == content_block) ? (block_t *)content : NULL; | |
| 642 /* Next paragraph or NULL if none. */ | |
| 643 paragraph_t *paragraph = (content && content->type == content_paragraph) ? (paragraph_t *)content : (block ? content_first_paragraph(&block->content) : NULL); | |
| 644 line_t *first_line = paragraph ? content_first_line(¶graph->content) : NULL; | |
| 645 span_t *first_span = first_line ? content_head_as_span(&first_line->content) : NULL; | |
| 646 | |
| 647 if (!paragraph && !table) break; | |
| 648 | |
| 649 y_paragraph = (first_span) ? first_span->chars[0].y : DBL_MAX; | |
| 650 y_table = (table) ? table->pos.y : DBL_MAX; | |
| 651 | |
| 652 if (first_span && y_paragraph < y_table) | |
| 653 { | |
| 654 const matrix4_t *ctm = &first_span->ctm; | |
| 655 double angle = extract_baseline_angle(ctm); | |
| 656 | |
| 657 if (spacing | |
| 658 && content_state.ctm_prev | |
| 659 && first_line | |
| 660 && first_span | |
| 661 && extract_matrix4_cmp(content_state.ctm_prev, | |
| 662 &first_span->ctm)) | |
| 663 { | |
| 664 /* Extra vertical space between paragraphs that | |
| 665 * were at different angles in the original | |
| 666 * document. */ | |
| 667 if (docx_paragraph_empty(alloc, output)) | |
| 668 goto end; | |
| 669 } | |
| 670 | |
| 671 /* Extra vertical space between paragraphs. */ | |
| 672 if (spacing) | |
| 673 if (docx_paragraph_empty(alloc, output)) | |
| 674 goto end; | |
| 675 | |
| 676 if (rotation && angle != 0) | |
| 677 { | |
| 678 assert(block); | |
| 679 if (docx_append_rotated_paragraphs(alloc, &content_state, block, &text_box_id, angle, output)) | |
| 680 goto end; | |
| 681 } | |
| 682 else if (block) | |
| 683 { | |
| 684 content_paragraph_iterator pit; | |
| 685 int first = 1; | |
| 686 | |
| 687 for (paragraph = content_paragraph_iterator_init(&pit, &block->content); paragraph != NULL; paragraph = content_paragraph_iterator_next(&pit)) | |
| 688 { | |
| 689 if (spacing && !first) | |
| 690 { | |
| 691 /* Extra vertical space between paragraphs. */ | |
| 692 if (docx_paragraph_empty(alloc, output)) | |
| 693 goto end; | |
| 694 } | |
| 695 first = 0; | |
| 696 | |
| 697 if (document_to_docx_content_paragraph(alloc, &content_state, paragraph, output)) goto end; | |
| 698 } | |
| 699 } | |
| 700 else | |
| 701 { | |
| 702 if (document_to_docx_content_paragraph(alloc, &content_state, paragraph, output)) | |
| 703 goto end; | |
| 704 } | |
| 705 content = content_iterator_next(&cit); | |
| 706 } | |
| 707 else if (table) | |
| 708 { | |
| 709 if (docx_append_table(alloc, table, output)) | |
| 710 goto end; | |
| 711 table = content_table_iterator_next(&tit); | |
| 712 } | |
| 713 } | |
| 714 | |
| 715 if (images) | |
| 716 { | |
| 717 content_image_iterator iit; | |
| 718 image_t *image; | |
| 719 | |
| 720 for (image = content_image_iterator_init(&iit, &subpage->content); image != NULL; image = content_image_iterator_next(&iit)) | |
| 721 docx_append_image(alloc, output, image); | |
| 722 } | |
| 723 } | |
| 724 } | |
| 725 | |
| 726 e = 0; | |
| 727 end: | |
| 728 | |
| 729 return e; | |
| 730 } | |
| 731 | |
| 732 | |
/* Locates the region of <text> that lies between the first occurrence of
 * <begin> and the first occurrence of <end> that follows it.
 *
 * On success returns 0, with *o_begin pointing just past <begin> and *o_end
 * pointing at the start of <end>. Note that <end> is searched for only after
 * <begin>, so an earlier occurrence of <end> is ignored.
 *
 * On failure (either marker not found) returns -1 with errno set to ESRCH and
 * both *o_begin and *o_end set to NULL. */
static int
find_mid(
        const char  *text,
        const char  *begin,
        const char  *end,
        const char **o_begin,
        const char **o_end)
{
    const char *mid_begin;
    const char *mid_end;

    /* Give both outputs a defined value so that a failed search never leaves
     * the caller with an indeterminate or half-updated pair. */
    *o_begin = NULL;
    *o_end = NULL;

    mid_begin = strstr(text, begin);
    if (mid_begin == NULL)
        goto fail;
    mid_begin += strlen(begin);

    mid_end = strstr(mid_begin, end);
    if (mid_end == NULL)
        goto fail;

    *o_begin = mid_begin;
    *o_end = mid_end;
    return 0;

fail:
    errno = ESRCH;
    return -1;
}
| 757 | |
| 758 | |
| 759 int | |
| 760 extract_docx_content_item( | |
| 761 extract_alloc_t *alloc, | |
| 762 extract_astring_t *contentss, | |
| 763 int contentss_num, | |
| 764 images_t *images, | |
| 765 const char *name, | |
| 766 const char *text, | |
| 767 char **text2) | |
| 768 { | |
| 769 int e = -1; | |
| 770 extract_astring_t temp = { 0 }; | |
| 771 | |
| 772 *text2 = NULL; | |
| 773 | |
| 774 if (0) | |
| 775 {} | |
| 776 else if (!strcmp(name, "[Content_Types].xml")) | |
| 777 { | |
| 778 /* Add information about all image types that we are going to use. */ | |
| 779 const char *begin; | |
| 780 const char *end; | |
| 781 const char *insert; | |
| 782 int it; | |
| 783 | |
| 784 extract_astring_free(alloc, &temp); | |
| 785 outf("text: %s", text); | |
| 786 if (find_mid(text, "<Types ", "</Types>", &begin, &end)) goto end; | |
| 787 | |
| 788 insert = begin; | |
| 789 insert = strchr(insert, '>'); | |
| 790 assert(insert); | |
| 791 insert += 1; | |
| 792 | |
| 793 if (extract_astring_catl(alloc, &temp, text, insert - text)) goto end; | |
| 794 outf("images->imagetypes_num=%i", images->imagetypes_num); | |
| 795 for (it=0; it<images->imagetypes_num; ++it) { | |
| 796 const char *imagetype = images->imagetypes[it]; | |
| 797 if (extract_astring_cat(alloc, &temp, "<Default Extension=\"")) goto end; | |
| 798 if (extract_astring_cat(alloc, &temp, imagetype)) goto end; | |
| 799 if (extract_astring_cat(alloc, &temp, "\" ContentType=\"image/")) goto end; | |
| 800 if (extract_astring_cat(alloc, &temp, imagetype)) goto end; | |
| 801 if (extract_astring_cat(alloc, &temp, "\"/>")) goto end; | |
| 802 } | |
| 803 if (extract_astring_cat(alloc, &temp, insert)) goto end; | |
| 804 *text2 = temp.chars; | |
| 805 extract_astring_init(&temp); | |
| 806 } | |
| 807 else if (!strcmp(name, "word/_rels/document.xml.rels")) | |
| 808 { | |
| 809 /* Add relationships between image ids and image names within docx | |
| 810 * archive. */ | |
| 811 const char *begin; | |
| 812 const char *end; | |
| 813 int j; | |
| 814 | |
| 815 extract_astring_free(alloc, &temp); | |
| 816 if (find_mid(text, "<Relationships", "</Relationships>", &begin, &end)) goto end; | |
| 817 if (extract_astring_catl(alloc, &temp, text, end - text)) goto end; | |
| 818 outf("images.images_num=%i", images->images_num); | |
| 819 for (j=0; j<images->images_num; ++j) { | |
| 820 image_t* image = images->images[j]; | |
| 821 if (extract_astring_cat(alloc, &temp, "<Relationship Id=\"")) goto end; | |
| 822 if (extract_astring_cat(alloc, &temp, image->id)) goto end; | |
| 823 if (extract_astring_cat(alloc, &temp, "\" Type=\"http://schemas.openxmlformats.org/officeDocument/2006/relationships/image\" Target=\"media/")) goto end; | |
| 824 if (extract_astring_cat(alloc, &temp, image->name)) goto end; | |
| 825 if (extract_astring_cat(alloc, &temp, "\"/>")) goto end; | |
| 826 } | |
| 827 if (extract_astring_cat(alloc, &temp, end)) goto end; | |
| 828 *text2 = temp.chars; | |
| 829 extract_astring_init(&temp); | |
| 830 } | |
| 831 else if (!strcmp(name, "word/document.xml")) | |
| 832 { | |
| 833 /* Insert paragraphs content. */ | |
| 834 if (extract_content_insert(alloc, | |
| 835 text, | |
| 836 NULL /*single*/, | |
| 837 "<w:body>", | |
| 838 "</w:body>", | |
| 839 contentss, | |
| 840 contentss_num, | |
| 841 text2)) goto end; | |
| 842 } | |
| 843 else | |
| 844 { | |
| 845 *text2 = NULL; | |
| 846 } | |
| 847 | |
| 848 e = 0; | |
| 849 end: | |
| 850 | |
| 851 if (e) | |
| 852 { | |
| 853 /* We might have set <text2> to new content. */ | |
| 854 extract_free(alloc, text2); | |
| 855 /* We might have used <temp> as a temporary buffer. */ | |
| 856 extract_astring_free(alloc, &temp); | |
| 857 } | |
| 858 extract_astring_init(&temp); | |
| 859 | |
| 860 return e; | |
| 861 } | |
| 862 | |
| 863 | |
| 864 | |
| 865 int | |
| 866 extract_docx_write_template( | |
| 867 extract_alloc_t *alloc, | |
| 868 extract_astring_t *contentss, | |
| 869 int contentss_num, | |
| 870 images_t *images, | |
| 871 const char *path_template, | |
| 872 const char *path_out, | |
| 873 int preserve_dir) | |
| 874 { | |
| 875 int e = -1; | |
| 876 int i; | |
| 877 char *path_tempdir = NULL; | |
| 878 char *path = NULL; | |
| 879 char *text = NULL; | |
| 880 char *text2 = NULL; | |
| 881 | |
| 882 assert(path_out); | |
| 883 assert(path_template); | |
| 884 | |
| 885 if (extract_check_path_shell_safe(path_out)) | |
| 886 { | |
| 887 outf("path_out is unsafe: %s", path_out); | |
| 888 goto end; | |
| 889 } | |
| 890 | |
| 891 outf("images->images_num=%i", images->images_num); | |
| 892 if (extract_asprintf(alloc, &path_tempdir, "%s.dir", path_out) < 0) goto end; | |
| 893 if (extract_systemf(alloc, "rm -r '%s' 2>/dev/null", path_tempdir) < 0) goto end; | |
| 894 | |
| 895 if (extract_mkdir(path_tempdir, 0777)) { | |
| 896 outf("Failed to create directory: %s", path_tempdir); | |
| 897 goto end; | |
| 898 } | |
| 899 | |
| 900 outf("Unzipping template document '%s' to tempdir: %s", | |
| 901 path_template, path_tempdir); | |
| 902 if (extract_systemf(alloc, "unzip -q -d '%s' '%s'", path_tempdir, path_template)) | |
| 903 { | |
| 904 outf("Failed to unzip %s into %s", | |
| 905 path_template, path_tempdir); | |
| 906 goto end; | |
| 907 } | |
| 908 | |
| 909 /* Might be nice to iterate through all items in path_tempdir, but for now | |
| 910 * we look at just the items that we know extract_docx_content_item() will | |
| 911 * modify. */ | |
| 912 | |
| 913 { | |
| 914 const char *names[] = { | |
| 915 "word/document.xml", | |
| 916 "[Content_Types].xml", | |
| 917 "word/_rels/document.xml.rels", | |
| 918 }; | |
| 919 int names_num = sizeof(names) / sizeof(names[0]); | |
| 920 for (i=0; i<names_num; ++i) { | |
| 921 const char* name = names[i]; | |
| 922 extract_free(alloc, &path); | |
| 923 extract_free(alloc, &text); | |
| 924 extract_free(alloc, &text2); | |
| 925 if (extract_asprintf(alloc, &path, "%s/%s", path_tempdir, name) < 0) goto end; | |
| 926 if (extract_read_all_path(alloc, path, &text)) goto end; | |
| 927 | |
| 928 if (extract_docx_content_item(alloc, | |
| 929 contentss, | |
| 930 contentss_num, | |
| 931 images, | |
| 932 name, | |
| 933 text, | |
| 934 &text2)) goto end; | |
| 935 | |
| 936 { | |
| 937 const char *text3 = (text2) ? text2 : text; | |
| 938 if (extract_write_all(text3, strlen(text3), path)) goto end; | |
| 939 } | |
| 940 } | |
| 941 } | |
| 942 | |
| 943 /* Copy images into <path_tempdir>/media/. */ | |
| 944 extract_free(alloc, &path); | |
| 945 if (extract_asprintf(alloc, &path, "%s/word/media", path_tempdir) < 0) goto end; | |
| 946 if (extract_mkdir(path, 0777)) goto end; | |
| 947 | |
| 948 for (i=0; i<images->images_num; ++i) { | |
| 949 image_t* image = images->images[i]; | |
| 950 extract_free(alloc, &path); | |
| 951 if (extract_asprintf(alloc, &path, "%s/word/media/%s", path_tempdir, image->name) < 0) goto end; | |
| 952 if (extract_write_all(image->data, image->data_size, path)) goto end; | |
| 953 } | |
| 954 | |
| 955 outf("Zipping tempdir to create %s", path_out); | |
| 956 { | |
| 957 const char *path_out_leaf = strrchr(path_out, '/'); | |
| 958 if (!path_out_leaf) path_out_leaf = path_out; | |
| 959 if (extract_systemf(alloc, "cd '%s' && zip -q -r -D '../%s' .", path_tempdir, path_out_leaf)) | |
| 960 { | |
| 961 outf("Zip command failed to convert '%s' directory into output file: %s", | |
| 962 path_tempdir, path_out); | |
| 963 goto end; | |
| 964 } | |
| 965 } | |
| 966 | |
| 967 if (!preserve_dir) { | |
| 968 if (extract_remove_directory(alloc, path_tempdir)) goto end; | |
| 969 } | |
| 970 | |
| 971 e = 0; | |
| 972 end: | |
| 973 | |
| 974 outf("e=%i", e); | |
| 975 extract_free(alloc, &path_tempdir); | |
| 976 extract_free(alloc, &path); | |
| 977 extract_free(alloc, &text); | |
| 978 extract_free(alloc, &text2); | |
| 979 | |
| 980 if (e) | |
| 981 { | |
| 982 outf("Failed to create %s", path_out); | |
| 983 } | |
| 984 | |
| 985 return e; | |
| 986 } |
