Mercurial > hgrepos > Python2 > PyMuPDF
view mupdf-source/thirdparty/extract/src/docx.c @ 46:7ee69f120f19 default tip
>>>>> tag v1.26.5+1 for changeset b74429b0f5c4
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Sat, 11 Oct 2025 17:17:30 +0200 |
| parents | b50eed0cc0ef |
| children |
line wrap: on
line source
/* These extract_docx_*() functions generate docx content and docx zip archive data. Caller must call things in a sensible order to create valid content - e.g. don't call docx_paragraph_start() twice without intervening call to docx_paragraph_finish(). */ #include "extract/extract.h" #include "docx_template.h" #include "astring.h" #include "document.h" #include "docx.h" #include "mem.h" #include "memento.h" #include "outf.h" #include "sys.h" #include "text.h" #include "zip.h" #include <assert.h> #include <errno.h> #include <float.h> #include <math.h> #include <stdlib.h> #include <stdio.h> #include <string.h> #include <sys/stat.h> static int docx_paragraph_start(extract_alloc_t *alloc, extract_astring_t *output) { return extract_astring_cat(alloc, output, "\n\n<w:p>"); } static int docx_paragraph_finish(extract_alloc_t *alloc, extract_astring_t *output) { return extract_astring_cat(alloc, output, "\n</w:p>"); } /* Starts a new run. Caller must ensure that docx_run_finish() was called to terminate any previous run. */ static int docx_run_start( extract_alloc_t *alloc, extract_astring_t *output, content_state_t *content_state) { int e = 0; if (!e) e = extract_astring_cat(alloc, output, "\n<w:r><w:rPr><w:rFonts w:ascii=\""); if (!e) e = extract_astring_cat(alloc, output, content_state->font.name); if (!e) e = extract_astring_cat(alloc, output, "\" w:hAnsi=\""); if (!e) e = extract_astring_cat(alloc, output, content_state->font.name); if (!e) e = extract_astring_cat(alloc, output, "\"/>"); if (!e && content_state->font.bold) e = extract_astring_cat(alloc, output, "<w:b/>"); if (!e && content_state->font.italic) e = extract_astring_cat(alloc, output, "<w:i/>"); { char font_size_text[32]; if (!e) e = extract_astring_cat(alloc, output, "<w:sz w:val=\""); snprintf(font_size_text, sizeof(font_size_text), "%f", content_state->font.size * 2); extract_astring_cat(alloc, output, font_size_text); extract_astring_cat(alloc, output, "\"/>"); if (!e) e = extract_astring_cat(alloc, output, "<w:szCs w:val=\""); snprintf(font_size_text, sizeof(font_size_text), "%f", content_state->font.size * 2); extract_astring_cat(alloc, output, font_size_text); extract_astring_cat(alloc, output, "\"/>"); } if (!e) e = extract_astring_cat(alloc, output, "</w:rPr><w:t xml:space=\"preserve\">"); return e; } static int docx_run_finish(extract_alloc_t *alloc, content_state_t *state, extract_astring_t *output) { if (state) state->font.name = NULL; return extract_astring_cat(alloc, output, "</w:t></w:r>"); } /* Append an empty paragraph to *content. */ static int docx_paragraph_empty( extract_alloc_t *alloc, extract_astring_t *output) { int e = -1; static char fontname[] = "OpenSans"; content_state_t content_state = {0}; if (docx_paragraph_start(alloc, output)) goto end; /* It seems like our choice of font size here doesn't make any difference * to the ammount of vertical space, unless we include a non-space * character. Presumably something to do with the styles in the template * document. */ content_state.font.name = fontname; content_state.font.size = 10; content_state.font.bold = 0; content_state.font.italic = 0; if (docx_run_start(alloc, output, &content_state)) goto end; //docx_char_append_string(output, " "); /*   is non-break space. */ if (docx_run_finish(alloc, NULL /*state*/, output)) goto end; if (docx_paragraph_finish(alloc, output)) goto end; e = 0; end: return e; } /* Removes last char if it is <c>. */ static int docx_char_truncate_if(extract_astring_t *output, char c) { if (output->chars_num && output->chars[output->chars_num-1] == c) extract_astring_truncate(output, 1); return 0; } /* Append docx xml for <paragraph> to <content>. Updates *state if we change font. */ static int document_to_docx_content_paragraph( extract_alloc_t *alloc, content_state_t *content_state, paragraph_t *paragraph, extract_astring_t *content) { int e = -1; content_line_iterator lit; line_t *line; if (docx_paragraph_start(alloc, content)) goto end; if ((paragraph->line_flags & paragraph_not_fully_justified) == 0) { if (extract_astring_cat(alloc, content, "<w:pPr><w:jc w:val=\"both\"/></w:pPr>")) goto end; } else if ((paragraph->line_flags & paragraph_not_centred) == 0) { if (extract_astring_cat(alloc, content, "<w:pPr><w:jc w:val=\"center\"/></w:pPr>")) goto end; } else if ((paragraph->line_flags & (paragraph_not_aligned_left | paragraph_not_aligned_right)) == paragraph_not_aligned_left) { if (extract_astring_cat(alloc, content, "<w:pPr><w:jc w:val=\"right\"/></w:pPr>")) goto end; } else if ((paragraph->line_flags & (paragraph_not_aligned_left | paragraph_not_aligned_right)) == paragraph_not_aligned_right) { if (extract_astring_cat(alloc, content, "<w:pPr><w:jc w:val=\"left\"/></w:pPr>")) goto end; } for (line = content_line_iterator_init(&lit, ¶graph->content); line != NULL; line = content_line_iterator_next(&lit)) { content_span_iterator sit; span_t *span; for (span = content_span_iterator_init(&sit, &line->content); span != NULL; span = content_span_iterator_next(&sit)) { int si; double font_size_new; content_state->ctm_prev = &span->ctm; font_size_new = extract_font_size(&span->ctm); if (!content_state->font.name || strcmp(span->font_name, content_state->font.name) || span->flags.font_bold != content_state->font.bold || span->flags.font_italic != content_state->font.italic || font_size_new != content_state->font.size) { if (content_state->font.name) if (docx_run_finish(alloc, content_state, content)) goto end; content_state->font.name = span->font_name; content_state->font.bold = span->flags.font_bold; content_state->font.italic = span->flags.font_italic; content_state->font.size = font_size_new; if (docx_run_start(alloc, content, content_state)) goto end; } for (si=0; si<span->chars_num; ++si) { char_t* char_ = &span->chars[si]; int c = char_->ucs; if (extract_astring_catc_unicode_xml(alloc, content, c)) goto end; } /* Remove any trailing '-' at end of line. */ if (docx_char_truncate_if(content, '-')) goto end; } if (paragraph->line_flags & paragraph_breaks_strangely) { if (extract_astring_cat(alloc, content, "<w:br/>")) goto end; } } if (content_state->font.name) { if (docx_run_finish(alloc, content_state, content)) goto end; } if (docx_paragraph_finish(alloc, content)) goto end; e = 0; end: return e; } /* Write reference to image into docx content. */ static int docx_append_image( extract_alloc_t *alloc, extract_astring_t *output, image_t *image) { extract_astring_cat(alloc, output, "\n"); extract_astring_cat(alloc, output, " <w:p>\n"); extract_astring_cat(alloc, output, " <w:r>\n"); extract_astring_cat(alloc, output, " <w:rPr>\n"); extract_astring_cat(alloc, output, " <w:noProof/>\n"); extract_astring_cat(alloc, output, " </w:rPr>\n"); extract_astring_cat(alloc, output, " <w:drawing>\n"); extract_astring_cat(alloc, output, " <wp:inline distT=\"0\" distB=\"0\" distL=\"0\" distR=\"0\" wp14:anchorId=\"7057A832\" wp14:editId=\"466EB3FB\">\n"); //extract_astring_cat(alloc, output, " <wp:extent cx=\"2933700\" cy=\"2200275\"/>\n"); //extract_astring_cat(alloc, output, " <wp:effectExtent l=\"0\" t=\"0\" r=\"0\" b=\"9525\"/>\n"); extract_astring_cat(alloc, output, " <wp:docPr id=\"1\" name=\"Picture 1\"/>\n"); extract_astring_cat(alloc, output, " <wp:cNvGraphicFramePr>\n"); extract_astring_cat(alloc, output, " <a:graphicFrameLocks xmlns:a=\"http://schemas.openxmlformats.org/drawingml/2006/main\" noChangeAspect=\"1\"/>\n"); extract_astring_cat(alloc, output, " </wp:cNvGraphicFramePr>\n"); extract_astring_cat(alloc, output, " <a:graphic xmlns:a=\"http://schemas.openxmlformats.org/drawingml/2006/main\">\n"); extract_astring_cat(alloc, output, " <a:graphicData uri=\"http://schemas.openxmlformats.org/drawingml/2006/picture\">\n"); extract_astring_cat(alloc, output, " <pic:pic xmlns:pic=\"http://schemas.openxmlformats.org/drawingml/2006/picture\">\n"); extract_astring_cat(alloc, output, " <pic:nvPicPr>\n"); extract_astring_cat(alloc, output, " <pic:cNvPr id=\"1\" name=\"Picture 1\"/>\n"); extract_astring_cat(alloc, output, " <pic:cNvPicPr>\n"); extract_astring_cat(alloc, output, " <a:picLocks noChangeAspect=\"1\" noChangeArrowheads=\"1\"/>\n"); extract_astring_cat(alloc, output, " </pic:cNvPicPr>\n"); extract_astring_cat(alloc, output, " </pic:nvPicPr>\n"); extract_astring_cat(alloc, output, " <pic:blipFill>\n"); extract_astring_catf(alloc, output," <a:blip r:embed=\"%s\">\n", image->id); extract_astring_cat(alloc, output, " <a:extLst>\n"); extract_astring_cat(alloc, output, " <a:ext uri=\"{28A0092B-C50C-407E-A947-70E740481C1C}\">\n"); extract_astring_cat(alloc, output, " <a14:useLocalDpi xmlns:a14=\"http://schemas.microsoft.com/office/drawing/2010/main\" val=\"0\"/>\n"); extract_astring_cat(alloc, output, " </a:ext>\n"); extract_astring_cat(alloc, output, " </a:extLst>\n"); extract_astring_cat(alloc, output, " </a:blip>\n"); //extract_astring_cat(alloc, output, " <a:srcRect/>\n"); extract_astring_cat(alloc, output, " <a:stretch>\n"); extract_astring_cat(alloc, output, " <a:fillRect/>\n"); extract_astring_cat(alloc, output, " </a:stretch>\n"); extract_astring_cat(alloc, output, " </pic:blipFill>\n"); extract_astring_cat(alloc, output, " <pic:spPr bwMode=\"auto\">\n"); extract_astring_cat(alloc, output, " <a:xfrm>\n"); extract_astring_cat(alloc, output, " <a:off x=\"0\" y=\"0\"/>\n"); //extract_astring_cat(alloc, output, " <a:ext cx=\"2933700\" cy=\"2200275\"/>\n"); extract_astring_cat(alloc, output, " </a:xfrm>\n"); extract_astring_cat(alloc, output, " <a:prstGeom prst=\"rect\">\n"); extract_astring_cat(alloc, output, " <a:avLst/>\n"); extract_astring_cat(alloc, output, " </a:prstGeom>\n"); extract_astring_cat(alloc, output, " <a:noFill/>\n"); extract_astring_cat(alloc, output, " <a:ln>\n"); extract_astring_cat(alloc, output, " <a:noFill/>\n"); extract_astring_cat(alloc, output, " </a:ln>\n"); extract_astring_cat(alloc, output, " </pic:spPr>\n"); extract_astring_cat(alloc, output, " </pic:pic>\n"); extract_astring_cat(alloc, output, " </a:graphicData>\n"); extract_astring_cat(alloc, output, " </a:graphic>\n"); extract_astring_cat(alloc, output, " </wp:inline>\n"); extract_astring_cat(alloc, output, " </w:drawing>\n"); extract_astring_cat(alloc, output, " </w:r>\n"); extract_astring_cat(alloc, output, " </w:p>\n"); extract_astring_cat(alloc, output, "\n"); return 0; } /* Writes paragraph to content inside rotated text box. */ static int docx_output_rotated_paragraphs( extract_alloc_t *alloc, block_t *block, int rot, int x, int y, int w, int h, int text_box_id, extract_astring_t *output, content_state_t *state) { int e = -1; paragraph_t *paragraph; content_paragraph_iterator pit; outf("x,y=%ik,%ik = %i,%i", x/1000, y/1000, x, y); extract_astring_cat(alloc, output, "\n"); extract_astring_cat(alloc, output, "\n"); extract_astring_cat(alloc, output, "<w:p>\n"); extract_astring_cat(alloc, output, " <w:r>\n"); extract_astring_cat(alloc, output, " <mc:AlternateContent>\n"); extract_astring_cat(alloc, output, " <mc:Choice Requires=\"wps\">\n"); extract_astring_cat(alloc, output, " <w:drawing>\n"); extract_astring_cat(alloc, output, " <wp:anchor distT=\"0\" distB=\"0\" distL=\"0\" distR=\"0\" simplePos=\"0\" relativeHeight=\"0\" behindDoc=\"0\" locked=\"0\" layoutInCell=\"1\" allowOverlap=\"1\" wp14:anchorId=\"53A210D1\" wp14:editId=\"2B7E8016\">\n"); extract_astring_cat(alloc, output, " <wp:simplePos x=\"0\" y=\"0\"/>\n"); extract_astring_cat(alloc, output, " <wp:positionH relativeFrom=\"page\">\n"); extract_astring_catf(alloc, output," <wp:posOffset>%i</wp:posOffset>\n", x); extract_astring_cat(alloc, output, " </wp:positionH>\n"); extract_astring_cat(alloc, output, " <wp:positionV relativeFrom=\"page\">\n"); extract_astring_catf(alloc, output," <wp:posOffset>%i</wp:posOffset>\n", y); extract_astring_cat(alloc, output, " </wp:positionV>\n"); extract_astring_catf(alloc, output," <wp:extent cx=\"%i\" cy=\"%i\"/>\n", w, h); //extract_astring_cat(alloc, output, " <wp:effectExtent l=\"381000\" t=\"723900\" r=\"371475\" b=\"723900\"/>\n"); extract_astring_cat(alloc, output, " <wp:wrapNone/>\n"); extract_astring_catf(alloc, output," <wp:docPr id=\"%i\" name=\"Text Box %i\"/>\n", text_box_id, text_box_id); extract_astring_cat(alloc, output, " <wp:cNvGraphicFramePr/>\n"); extract_astring_cat(alloc, output, " <a:graphic xmlns:a=\"http://schemas.openxmlformats.org/drawingml/2006/main\">\n"); extract_astring_cat(alloc, output, " <a:graphicData uri=\"http://schemas.microsoft.com/office/word/2010/wordprocessingShape\">\n"); extract_astring_cat(alloc, output, " <wps:wsp>\n"); extract_astring_cat(alloc, output, " <wps:cNvSpPr txBox=\"1\"/>\n"); extract_astring_cat(alloc, output, " <wps:spPr>\n"); extract_astring_catf(alloc, output," <a:xfrm rot=\"%i\">\n", rot); extract_astring_cat(alloc, output, " <a:off x=\"0\" y=\"0\"/>\n"); //extract_astring_cat(alloc, output, " <a:ext cx=\"3228975\" cy=\"2286000\"/>\n"); extract_astring_cat(alloc, output, " </a:xfrm>\n"); extract_astring_cat(alloc, output, " <a:prstGeom prst=\"rect\">\n"); extract_astring_cat(alloc, output, " <a:avLst/>\n"); extract_astring_cat(alloc, output, " </a:prstGeom>\n"); /* Give box a solid background. */ if (0) { extract_astring_cat(alloc, output, " <a:solidFill>\n"); extract_astring_cat(alloc, output, " <a:schemeClr val=\"lt1\"/>\n"); extract_astring_cat(alloc, output, " </a:solidFill>\n"); } /* Draw line around box. */ if (0) { extract_astring_cat(alloc, output, " <a:ln w=\"175\">\n"); extract_astring_cat(alloc, output, " <a:solidFill>\n"); extract_astring_cat(alloc, output, " <a:prstClr val=\"black\"/>\n"); extract_astring_cat(alloc, output, " </a:solidFill>\n"); extract_astring_cat(alloc, output, " </a:ln>\n"); } extract_astring_cat(alloc, output, " </wps:spPr>\n"); extract_astring_cat(alloc, output, " <wps:txbx>\n"); extract_astring_cat(alloc, output, " <w:txbxContent>"); #if 0 if (0) { /* Output inline text describing the rotation. */ extract_astring_catf(content, "<w:p>\n" "<w:r><w:rPr><w:rFonts w:ascii=\"OpenSans\" w:hAnsi=\"OpenSans\"/><w:sz w:val=\"20.000000\"/><w:szCs w:val=\"15.000000\"/></w:rPr><w:t xml:space=\"preserve\">*** rotate: %f rad, %f deg. rot=%i</w:t></w:r>\n" "</w:p>\n", rotate, rotate * 180 / pi, rot ); } #endif /* Output paragraphs p0..p2-1. */ for (paragraph = content_paragraph_iterator_init(&pit, &block->content); paragraph != NULL; paragraph = content_paragraph_iterator_next(&pit)) if (document_to_docx_content_paragraph(alloc, state, paragraph, output)) goto end; extract_astring_cat(alloc, output, "\n"); extract_astring_cat(alloc, output, " </w:txbxContent>\n"); extract_astring_cat(alloc, output, " </wps:txbx>\n"); extract_astring_cat(alloc, output, " <wps:bodyPr rot=\"0\" spcFirstLastPara=\"0\" vertOverflow=\"overflow\" horzOverflow=\"overflow\" vert=\"horz\" wrap=\"square\" lIns=\"91440\" tIns=\"45720\" rIns=\"91440\" bIns=\"45720\" numCol=\"1\" spcCol=\"0\" rtlCol=\"0\" fromWordArt=\"0\" anchor=\"t\" anchorCtr=\"0\" forceAA=\"0\" compatLnSpc=\"1\">\n"); extract_astring_cat(alloc, output, " <a:prstTxWarp prst=\"textNoShape\">\n"); extract_astring_cat(alloc, output, " <a:avLst/>\n"); extract_astring_cat(alloc, output, " </a:prstTxWarp>\n"); extract_astring_cat(alloc, output, " <a:noAutofit/>\n"); extract_astring_cat(alloc, output, " </wps:bodyPr>\n"); extract_astring_cat(alloc, output, " </wps:wsp>\n"); extract_astring_cat(alloc, output, " </a:graphicData>\n"); extract_astring_cat(alloc, output, " </a:graphic>\n"); extract_astring_cat(alloc, output, " </wp:anchor>\n"); extract_astring_cat(alloc, output, " </w:drawing>\n"); extract_astring_cat(alloc, output, " </mc:Choice>\n"); #if 0 /* This fallback is copied from a real Word document. Not sure whether it works - both Libreoffice and Word use the above choice. */ extract_astring_cat(alloc, output, " <mc:Fallback>\n"); extract_astring_cat(alloc, output, " <w:pict>\n"); extract_astring_cat(alloc, output, " <v:shapetype w14:anchorId=\"53A210D1\" id=\"_x0000_t202\" coordsize=\"21600,21600\" o:spt=\"202\" path=\"m,l,21600r21600,l21600,xe\">\n"); extract_astring_cat(alloc, output, " <v:stroke joinstyle=\"miter\"/>\n"); extract_astring_cat(alloc, output, " <v:path gradientshapeok=\"t\" o:connecttype=\"rect\"/>\n"); extract_astring_cat(alloc, output, " </v:shapetype>\n"); extract_astring_catf(alloc, output," <v:shape id=\"Text Box %i\" o:spid=\"_x0000_s1026\" type=\"#_x0000_t202\" style=\"position:absolute;margin-left:71.25pt;margin-top:48.75pt;width:254.25pt;height:180pt;rotation:-2241476fd;z-index:251659264;visibility:visible;mso-wrap-style:square;mso-wrap-distance-left:9pt;mso-wrap-distance-top:0;mso-wrap-distance-right:9pt;mso-wrap-distance-bottom:0;mso-position-horizontal:absolute;mso-position-horizontal-relative:text;mso-position-vertical:absolute;mso-position-vertical-relative:text;v-text-anchor:top\" o:gfxdata=\"UEsDBBQABgAIAAAAIQC2gziS/gAAAOEBAAATAAAAW0NvbnRlbnRfVHlwZXNdLnhtbJSRQU7DMBBF 90jcwfIWJU67QAgl6YK0S0CoHGBkTxKLZGx5TGhvj5O2G0SRWNoz/78nu9wcxkFMGNg6quQqL6RA 0s5Y6ir5vt9lD1JwBDIwOMJKHpHlpr69KfdHjyxSmriSfYz+USnWPY7AufNIadK6MEJMx9ApD/oD OlTrorhX2lFEilmcO2RdNtjC5xDF9pCuTyYBB5bi6bQ4syoJ3g9WQ0ymaiLzg5KdCXlKLjvcW893 SUOqXwnz5DrgnHtJTxOsQfEKIT7DmDSUCaxw7Rqn8787ZsmRM9e2VmPeBN4uqYvTtW7jvijg9N/y JsXecLq0q+WD6m8AAAD//wMAUEsDBBQABgAIAAAAIQA4/SH/1gAAAJQBAAALAAAAX3JlbHMvLnJl bHOkkMFqwzAMhu+DvYPRfXGawxijTi+j0GvpHsDYimMaW0Yy2fr2M4PBMnrbUb/Q94l/f/hMi1qR JVI2sOt6UJgd+ZiDgffL8ekFlFSbvV0oo4EbChzGx4f9GRdb25HMsYhqlCwG5lrLq9biZkxWOiqY 22YiTra2kYMu1l1tQD30/bPm3wwYN0x18gb45AdQl1tp5j/sFB2T0FQ7R0nTNEV3j6o9feQzro1i OWA14Fm+Q8a1a8+Bvu/d/dMb2JY5uiPbhG/ktn4cqGU/er3pcvwCAAD//wMAUEsDBBQABgAIAAAA IQDQg5pQVgIAALEEAAAOAAAAZHJzL2Uyb0RvYy54bWysVE1v2zAMvQ/YfxB0X+2k+WiDOEXWosOA oi3QDj0rstwYk0VNUmJ3v35PipMl3U7DLgJFPj+Rj6TnV12j2VY5X5Mp+OAs50wZSWVtXgv+7fn2 0wVnPghTCk1GFfxNeX61+Phh3tqZGtKadKkcA4nxs9YWfB2CnWWZl2vVCH9GVhkEK3KNCLi616x0 ogV7o7Nhnk+yllxpHUnlPbw3uyBfJP6qUjI8VJVXgemCI7eQTpfOVTyzxVzMXp2w61r2aYh/yKIR tcGjB6obEQTbuPoPqqaWjjxV4UxSk1FV1VKlGlDNIH9XzdNaWJVqgTjeHmTy/49W3m8fHatL9I4z Ixq06Fl1gX2mjg2iOq31M4CeLGChgzsie7+HMxbdVa5hjiDu4HI8ml5MpkkLVMcAh+xvB6kjt4Tz fDi8uJyOOZOIwZ7keWpGtmOLrNb58EVRw6JRcIdeJlqxvfMBGQC6h0S4J12Xt7XW6RLnR11rx7YC ndch5YwvTlDasLbgk/NxnohPYpH68P1KC/k9Vn3KgJs2cEaNdlpEK3SrrhdoReUbdEvSQAZv5W0N 3jvhw6NwGDQ4sTzhAUelCclQb3G2Jvfzb/6IR/8R5azF4Bbc/9gIpzjTXw0m43IwGsVJT5fReDrE xR1HVscRs2muCQqh+8gumREf9N6sHDUv2LFlfBUhYSTeLnjYm9dht07YUamWywTCbFsR7syTlZF6 383n7kU42/czYBTuaT/iYvaurTts/NLQchOoqlPPo8A7VXvdsRepLf0Ox8U7vifU7z/N4hcAAAD/ /wMAUEsDBBQABgAIAAAAIQBh17L63wAAAAoBAAAPAAAAZHJzL2Rvd25yZXYueG1sTI9BT4NAEIXv Jv6HzZh4s0ubgpayNIboSW3Syg9Y2BGI7CyyS0v99Y4nPU3ezMub72W72fbihKPvHClYLiIQSLUz HTUKyvfnuwcQPmgyuneECi7oYZdfX2U6Ne5MBzwdQyM4hHyqFbQhDKmUvm7Rar9wAxLfPtxodWA5 NtKM+szhtperKEqk1R3xh1YPWLRYfx4nq8APVfz9VQxPb+WUNC+vZbGPDhelbm/mxy2IgHP4M8Mv PqNDzkyVm8h40bNer2K2Ktjc82RDEi+5XKVgHfNG5pn8XyH/AQAA//8DAFBLAQItABQABgAIAAAA IQC2gziS/gAAAOEBAAATAAAAAAAAAAAAAAAAAAAAAABbQ29udGVudF9UeXBlc10ueG1sUEsBAi0A FAAGAAgAAAAhADj9If/WAAAAlAEAAAsAAAAAAAAAAAAAAAAALwEAAF9yZWxzLy5yZWxzUEsBAi0A FAAGAAgAAAAhANCDmlBWAgAAsQQAAA4AAAAAAAAAAAAAAAAALgIAAGRycy9lMm9Eb2MueG1sUEsB Ai0AFAAGAAgAAAAhAGHXsvrfAAAACgEAAA8AAAAAAAAAAAAAAAAAsAQAAGRycy9kb3ducmV2Lnht bFBLBQYAAAAABAAEAPMAAAC8BQAAAAA= \" fillcolor=\"white [3201]\" strokeweight=\".5pt\">\n", text_box_id); extract_astring_cat(alloc, output, " <v:textbox>\n"); extract_astring_cat(alloc, output, " <w:txbxContent>"); for (paragraph = content_paragraph_iterator_init(&pit, &block->content); paragraph != NULL; paragraph = content_paragraph_iterator_next(&pit)) if (document_to_docx_content_paragraph(alloc, state, paragraph, output)) goto end; extract_astring_cat(alloc, output, "\n"); extract_astring_cat(alloc, output, "\n"); extract_astring_cat(alloc, output, " </w:txbxContent>\n"); extract_astring_cat(alloc, output, " </v:textbox>\n"); extract_astring_cat(alloc, output, " </v:shape>\n"); extract_astring_cat(alloc, output, " </w:pict>\n"); extract_astring_cat(alloc, output, " </mc:Fallback>\n"); #endif extract_astring_cat(alloc, output, " </mc:AlternateContent>\n"); extract_astring_cat(alloc, output, " </w:r>\n"); extract_astring_cat(alloc, output, "</w:p>"); e = 0; end: return e; } /* Appends table to content. We do not fix the size of the table or its columns and rows, but instead leave layout up to the application. */ static int docx_append_table( extract_alloc_t *alloc, table_t *table, extract_astring_t *output) { int e = -1; int y; if (extract_astring_cat(alloc, output, "\n" " <w:tbl>\n" " <w:tblLayout w:type=\"autofit\"/>\n")) goto end; for (y=0; y<table->cells_num_y; ++y) { int x; if (extract_astring_cat(alloc, output, " <w:tr>\n" " <w:trPr/>\n")) goto end; for (x=0; x<table->cells_num_x; ++x) { cell_t* cell = table->cells[y*table->cells_num_x + x]; if (!cell->left) continue; if (extract_astring_cat(alloc, output, " <w:tc>\n")) goto end; /* Write cell properties. */ { if (extract_astring_cat(alloc, output, " <w:tcPr>\n" " <w:tcBorders>\n" " <w:top w:val=\"double\" w:sz=\"2\" w:space=\"0\" w:color=\"808080\"/>\n" " <w:start w:val=\"double\" w:sz=\"2\" w:space=\"0\" w:color=\"808080\"/>\n" " <w:bottom w:val=\"double\" w:sz=\"2\" w:space=\"0\" w:color=\"808080\"/>\n" " <w:end w:val=\"double\" w:sz=\"2\" w:space=\"0\" w:color=\"808080\"/>\n" " </w:tcBorders>\n")) goto end; if (cell->extend_right > 1) { if (extract_astring_catf(alloc, output, " <w:gridSpan w:val=\"%i\"/>\n", cell->extend_right)) goto end; } if (cell->above) { if (cell->extend_down > 1) { if (extract_astring_catf(alloc, output, " <w:vMerge w:val=\"restart\"/>\n", cell->extend_down)) goto end; } } else { if (extract_astring_catf(alloc, output, " <w:vMerge w:val=\"continue\"/>\n")) goto end; } if (extract_astring_cat(alloc, output, " </w:tcPr>\n")) goto end; } /* Write contents of this cell. */ { content_paragraph_iterator pit; paragraph_t *paragraph; size_t chars_num_old = output->chars_num; content_state_t content_state = {0}; content_state.font.name = NULL; content_state.ctm_prev = NULL; for (paragraph = content_paragraph_iterator_init(&pit, &cell->content); paragraph != NULL; paragraph = content_paragraph_iterator_next(&pit)) if (document_to_docx_content_paragraph(alloc, &content_state, paragraph, output)) goto end; if (content_state.font.name) if (docx_run_finish(alloc, &content_state, output)) goto end; /* Need to write out at least an empty paragraph in each * cell, otherwise Word/Libreoffice fail to show table at * all; the OOXML spec says "If a table cell does not * include at least one block-level element, then this * document shall be considered corrupt." */ if (output->chars_num == chars_num_old) if (extract_astring_catf(alloc, output, "<w:p/>\n")) goto end; } if (extract_astring_cat(alloc, output, " </w:tc>\n")) goto end; } if (extract_astring_cat(alloc, output, " </w:tr>\n")) goto end; } if (extract_astring_cat(alloc, output, " </w:tbl>\n")) goto end; e = 0; end: return e; } /* Appends a block of content with same rotation. */ static int docx_append_rotated_paragraphs( extract_alloc_t *alloc, content_state_t *state, block_t *block, int *text_box_id, double angle, extract_astring_t *output) { /* Find extent of paragraphs with this same rotation. extent will contain max width and max height of paragraphs, in units before application of ctm, i.e. before rotation. */ int e = -1; rect_t bounds; bounds = extract_block_pre_rotation_bounds(block, angle); outf("angle=%f pre-transform box is: (%f %f) to (%f %f)", angle, bounds.min.x, bounds.min.y, bounds.max.x, bounds.max.y); /* All the paragraphs have same rotation. We output them into * a single rotated text box. */ /* We need unique id for text box. */ *text_box_id += 1; { /* Angles are in units of 1/60,000 degree. */ int rot = (int) (angle * 180 / pi * 60000); /* <wp:anchor distT=\.. etc are in EMU - 1/360,000 of a cm. * relativeHeight is z-ordering. (wp:positionV:wp:posOffset, * wp:positionV:wp:posOffset) is position of origin of box in * EMU. */ double point_to_emu = 12700; /* https://en.wikipedia.org/wiki/Office_Open_XML_file_formats#DrawingML */ int x = (int) (bounds.min.x * point_to_emu); int y = (int) (bounds.min.y * point_to_emu); int w = (int) ((bounds.max.x - bounds.min.x) * point_to_emu); int h = (int) ((bounds.max.y - bounds.min.y) * point_to_emu); if (0) outf("rotate: %f rad, %f deg. rot=%i", angle, angle*180/pi, rot); if (docx_output_rotated_paragraphs(alloc, block, rot, x, y, w, h, *text_box_id, output, state)) goto end; } e = 0; end: return e; } int extract_document_to_docx_content( extract_alloc_t *alloc, document_t *document, int spacing, int rotation, int images, extract_astring_t *output) { int e = -1; int text_box_id = 0; int p; /* Write paragraphs into <content>. */ for (p=0; p<document->pages_num; ++p) { extract_page_t *page = document->pages[p]; int c; for (c=0; c<page->subpages_num; ++c) { subpage_t *subpage = page->subpages[c]; content_iterator cit; content_t *content; content_table_iterator tit; table_t *table; content_state_t content_state; content_state.font.name = NULL; content_state.font.size = 0; content_state.font.bold = 0; content_state.font.italic = 0; content_state.ctm_prev = NULL; /* Output paragraphs and tables in order of y coordinate. */ content = content_iterator_init(&cit, &subpage->content); table = content_table_iterator_init(&tit, &subpage->tables); while (1) { double y_paragraph; double y_table; /* Next block or NULL if none. */ block_t *block = (content && content->type == content_block) ? (block_t *)content : NULL; /* Next paragraph or NULL if none. */ paragraph_t *paragraph = (content && content->type == content_paragraph) ? (paragraph_t *)content : (block ? content_first_paragraph(&block->content) : NULL); line_t *first_line = paragraph ? content_first_line(¶graph->content) : NULL; span_t *first_span = first_line ? content_head_as_span(&first_line->content) : NULL; if (!paragraph && !table) break; y_paragraph = (first_span) ? first_span->chars[0].y : DBL_MAX; y_table = (table) ? table->pos.y : DBL_MAX; if (first_span && y_paragraph < y_table) { const matrix4_t *ctm = &first_span->ctm; double angle = extract_baseline_angle(ctm); if (spacing && content_state.ctm_prev && first_line && first_span && extract_matrix4_cmp(content_state.ctm_prev, &first_span->ctm)) { /* Extra vertical space between paragraphs that * were at different angles in the original * document. */ if (docx_paragraph_empty(alloc, output)) goto end; } /* Extra vertical space between paragraphs. */ if (spacing) if (docx_paragraph_empty(alloc, output)) goto end; if (rotation && angle != 0) { assert(block); if (docx_append_rotated_paragraphs(alloc, &content_state, block, &text_box_id, angle, output)) goto end; } else if (block) { content_paragraph_iterator pit; int first = 1; for (paragraph = content_paragraph_iterator_init(&pit, &block->content); paragraph != NULL; paragraph = content_paragraph_iterator_next(&pit)) { if (spacing && !first) { /* Extra vertical space between paragraphs. */ if (docx_paragraph_empty(alloc, output)) goto end; } first = 0; if (document_to_docx_content_paragraph(alloc, &content_state, paragraph, output)) goto end; } } else { if (document_to_docx_content_paragraph(alloc, &content_state, paragraph, output)) goto end; } content = content_iterator_next(&cit); } else if (table) { if (docx_append_table(alloc, table, output)) goto end; table = content_table_iterator_next(&tit); } } if (images) { content_image_iterator iit; image_t *image; for (image = content_image_iterator_init(&iit, &subpage->content); image != NULL; image = content_image_iterator_next(&iit)) docx_append_image(alloc, output, image); } } } e = 0; end: return e; } /* Sets *o_begin to end of first occurrence of <begin> in <text>, and *o_end to * beginning of first occurtence of <end> in <text>. */ static int find_mid( const char *text, const char *begin, const char *end, const char **o_begin, const char **o_end) { *o_begin = strstr(text, begin); if (*o_begin == NULL) goto fail; *o_begin += strlen(begin); *o_end = strstr(*o_begin, end); if (*o_end == NULL) goto fail; return 0; fail: errno = ESRCH; return -1; } int extract_docx_content_item( extract_alloc_t *alloc, extract_astring_t *contentss, int contentss_num, images_t *images, const char *name, const char *text, char **text2) { int e = -1; extract_astring_t temp = { 0 }; *text2 = NULL; if (0) {} else if (!strcmp(name, "[Content_Types].xml")) { /* Add information about all image types that we are going to use. */ const char *begin; const char *end; const char *insert; int it; extract_astring_free(alloc, &temp); outf("text: %s", text); if (find_mid(text, "<Types ", "</Types>", &begin, &end)) goto end; insert = begin; insert = strchr(insert, '>'); assert(insert); insert += 1; if (extract_astring_catl(alloc, &temp, text, insert - text)) goto end; outf("images->imagetypes_num=%i", images->imagetypes_num); for (it=0; it<images->imagetypes_num; ++it) { const char *imagetype = images->imagetypes[it]; if (extract_astring_cat(alloc, &temp, "<Default Extension=\"")) goto end; if (extract_astring_cat(alloc, &temp, imagetype)) goto end; if (extract_astring_cat(alloc, &temp, "\" ContentType=\"image/")) goto end; if (extract_astring_cat(alloc, &temp, imagetype)) goto end; if (extract_astring_cat(alloc, &temp, "\"/>")) goto end; } if (extract_astring_cat(alloc, &temp, insert)) goto end; *text2 = temp.chars; extract_astring_init(&temp); } else if (!strcmp(name, "word/_rels/document.xml.rels")) { /* Add relationships between image ids and image names within docx * archive. */ const char *begin; const char *end; int j; extract_astring_free(alloc, &temp); if (find_mid(text, "<Relationships", "</Relationships>", &begin, &end)) goto end; if (extract_astring_catl(alloc, &temp, text, end - text)) goto end; outf("images.images_num=%i", images->images_num); for (j=0; j<images->images_num; ++j) { image_t* image = images->images[j]; if (extract_astring_cat(alloc, &temp, "<Relationship Id=\"")) goto end; if (extract_astring_cat(alloc, &temp, image->id)) goto end; if (extract_astring_cat(alloc, &temp, "\" Type=\"http://schemas.openxmlformats.org/officeDocument/2006/relationships/image\" Target=\"media/")) goto end; if (extract_astring_cat(alloc, &temp, image->name)) goto end; if (extract_astring_cat(alloc, &temp, "\"/>")) goto end; } if (extract_astring_cat(alloc, &temp, end)) goto end; *text2 = temp.chars; extract_astring_init(&temp); } else if (!strcmp(name, "word/document.xml")) { /* Insert paragraphs content. */ if (extract_content_insert(alloc, text, NULL /*single*/, "<w:body>", "</w:body>", contentss, contentss_num, text2)) goto end; } else { *text2 = NULL; } e = 0; end: if (e) { /* We might have set <text2> to new content. */ extract_free(alloc, text2); /* We might have used <temp> as a temporary buffer. */ extract_astring_free(alloc, &temp); } extract_astring_init(&temp); return e; } int extract_docx_write_template( extract_alloc_t *alloc, extract_astring_t *contentss, int contentss_num, images_t *images, const char *path_template, const char *path_out, int preserve_dir) { int e = -1; int i; char *path_tempdir = NULL; char *path = NULL; char *text = NULL; char *text2 = NULL; assert(path_out); assert(path_template); if (extract_check_path_shell_safe(path_out)) { outf("path_out is unsafe: %s", path_out); goto end; } outf("images->images_num=%i", images->images_num); if (extract_asprintf(alloc, &path_tempdir, "%s.dir", path_out) < 0) goto end; if (extract_systemf(alloc, "rm -r '%s' 2>/dev/null", path_tempdir) < 0) goto end; if (extract_mkdir(path_tempdir, 0777)) { outf("Failed to create directory: %s", path_tempdir); goto end; } outf("Unzipping template document '%s' to tempdir: %s", path_template, path_tempdir); if (extract_systemf(alloc, "unzip -q -d '%s' '%s'", path_tempdir, path_template)) { outf("Failed to unzip %s into %s", path_template, path_tempdir); goto end; } /* Might be nice to iterate through all items in path_tempdir, but for now * we look at just the items that we know extract_docx_content_item() will * modify. */ { const char *names[] = { "word/document.xml", "[Content_Types].xml", "word/_rels/document.xml.rels", }; int names_num = sizeof(names) / sizeof(names[0]); for (i=0; i<names_num; ++i) { const char* name = names[i]; extract_free(alloc, &path); extract_free(alloc, &text); extract_free(alloc, &text2); if (extract_asprintf(alloc, &path, "%s/%s", path_tempdir, name) < 0) goto end; if (extract_read_all_path(alloc, path, &text)) goto end; if (extract_docx_content_item(alloc, contentss, contentss_num, images, name, text, &text2)) goto end; { const char *text3 = (text2) ? text2 : text; if (extract_write_all(text3, strlen(text3), path)) goto end; } } } /* Copy images into <path_tempdir>/media/. */ extract_free(alloc, &path); if (extract_asprintf(alloc, &path, "%s/word/media", path_tempdir) < 0) goto end; if (extract_mkdir(path, 0777)) goto end; for (i=0; i<images->images_num; ++i) { image_t* image = images->images[i]; extract_free(alloc, &path); if (extract_asprintf(alloc, &path, "%s/word/media/%s", path_tempdir, image->name) < 0) goto end; if (extract_write_all(image->data, image->data_size, path)) goto end; } outf("Zipping tempdir to create %s", path_out); { const char *path_out_leaf = strrchr(path_out, '/'); if (!path_out_leaf) path_out_leaf = path_out; if (extract_systemf(alloc, "cd '%s' && zip -q -r -D '../%s' .", path_tempdir, path_out_leaf)) { outf("Zip command failed to convert '%s' directory into output file: %s", path_tempdir, path_out); goto end; } } if (!preserve_dir) { if (extract_remove_directory(alloc, path_tempdir)) goto end; } e = 0; end: outf("e=%i", e); extract_free(alloc, &path_tempdir); extract_free(alloc, &path); extract_free(alloc, &text); extract_free(alloc, &text2); if (e) { outf("Failed to create %s", path_out); } return e; }
