comparison mupdf-source/thirdparty/extract/src/odt.c @ 2:b50eed0cc0ef upstream

ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4. The directory name has changed: no version number in the expanded directory now.
author Franz Glasner <fzglas.hg@dom66.de>
date Mon, 15 Sep 2025 11:43:07 +0200
parents
children
comparison
equal deleted inserted replaced
1:1d09e1dec1d9 2:b50eed0cc0ef
1 /* These extract_odt_*() functions generate odt content and odt zip archive
2 data.
3
4 Caller must call things in a sensible order to create valid content -
5 e.g. don't call odt_paragraph_start() twice without intervening call to
6 odt_paragraph_finish(). */
7
8 #include "extract/extract.h"
9
10 #include "odt_template.h"
11
12 #include "astring.h"
13 #include "document.h"
14 #include "odt.h"
15 #include "mem.h"
16 #include "memento.h"
17 #include "outf.h"
18 #include "sys.h"
19 #include "text.h"
20 #include "zip.h"
21
22 #include <assert.h>
23 #include <errno.h>
24 #include <float.h>
25 #include <math.h>
26 #include <stdlib.h>
27 #include <stdio.h>
28 #include <string.h>
29
30 #include <sys/stat.h>
31
32
33 static int
34 odt_paragraph_start(
35 extract_alloc_t *alloc,
36 extract_astring_t *output)
37 {
38 return extract_astring_cat(alloc, output, "\n\n<text:p>");
39 }
40
41 static int
42 odt_paragraph_finish(
43 extract_alloc_t *alloc,
44 extract_astring_t *output)
45 {
46 return extract_astring_cat(alloc, output, "</text:p>");
47 }
48
49 /* ODT doesn't seem to support ad-hoc inline font specifications; instead
50 we have to define a style at the start of the content.xml file. So when
51 writing content we insert a style name and add the required styles to a
52 extract_odt_styles_t struct. */
53
54 struct extract_odt_style_t
55 {
56 int id; /* A unique id for this style. */
57 font_t font;
58 };
59
60 struct extract_odt_styles_t
61 {
62 /* Styles are stored sorted. */
63 extract_odt_style_t *styles;
64 int styles_num;
65 };
66
67 static int
68 odt_style_compare(
69 extract_odt_style_t *a,
70 extract_odt_style_t *b)
71 {
72 int d;
73 double dd;
74
75 if ((d = strcmp(a->font.name, b->font.name))) return d;
76 if ((dd = a->font.size - b->font.size) != 0.0) return (dd > 0.0) ? 1 : -1;
77 if ((d = a->font.bold - b->font.bold)) return d;
78 if ((d = a->font.italic - b->font.italic)) return d;
79
80 return 0;
81 }
82
83 static int
84 odt_style_append_definition(
85 extract_alloc_t *alloc,
86 extract_odt_style_t *style,
87 extract_astring_t *text)
88 {
89 const char* font_name = style->font.name;
90
91 /* This improves output e.g. for zlib.3.pdf, but clearly a hack. */
92 if (0 && strstr(font_name, "Helvetica"))
93 {
94 font_name = "Liberation Sans";
95 }
96 outf("style->font_name=%s font_name=%s", style->font.name, font_name);
97 if (extract_astring_catf(alloc, text, "<style:style style:name=\"T%i\" style:family=\"text\">", style->id)) return -1;
98 if (extract_astring_catf(alloc, text, "<style:text-properties style:font-name=\"%s\"", font_name)) return -1;
99 if (extract_astring_catf(alloc, text, " fo:font-size=\"%.2fpt\"", style->font.size)) return -1;
100 if (extract_astring_catf(alloc, text, " fo:font-weight=\"%s\"", style->font.bold ? "bold" : "normal")) return -1;
101 if (extract_astring_catf(alloc, text, " fo:font-style=\"%s\"", style->font.italic ? "italic" : "normal")) return -1;
102 if (extract_astring_cat(alloc, text, " /></style:style>")) return -1;
103
104 return 0;
105 }
106
107 void
108 extract_odt_styles_free(
109 extract_alloc_t *alloc,
110 extract_odt_styles_t *styles)
111 {
112 int i;
113
114 for (i=0; i<styles->styles_num; ++i)
115 {
116 extract_odt_style_t* style = &styles->styles[i];
117 extract_free(alloc, &style->font.name);
118 }
119 extract_free(alloc, &styles->styles);
120 }
121
122 static int
123 odt_styles_definitions(
124 extract_alloc_t *alloc,
125 extract_odt_styles_t *styles,
126 extract_astring_t *out)
127 {
128 int i;
129
130 if (extract_astring_cat(alloc, out, "<office:automatic-styles>")) return -1;
131 for (i=0; i<styles->styles_num; ++i)
132 {
133 if (odt_style_append_definition(alloc, &styles->styles[i], out)) return -1;
134 }
135 extract_astring_cat(alloc, out, "<style:style style:name=\"gr1\" style:family=\"graphic\">\n");
136 extract_astring_cat(alloc, out, "<style:graphic-properties"
137 " draw:stroke=\"none\""
138 " svg:stroke-color=\"#000000\""
139 " draw:fill=\"none\""
140 " draw:fill-color=\"#ffffff\""
141 " fo:min-height=\"1.9898in\""
142 " style:run-through=\"foreground\""
143 " style:wrap=\"run-through\""
144 " style:number-wrapped-paragraphs=\"no-limit\""
145 " style:vertical-pos=\"from-top\""
146 " style:vertical-rel=\"paragraph\""
147 " style:horizontal-pos=\"from-left\""
148 " style:horizontal-rel=\"paragraph\""
149 " />\n"
150 );
151 extract_astring_cat(alloc, out, "<style:paragraph-properties style:writing-mode=\"lr-tb\"/>\n");
152 extract_astring_cat(alloc, out, "</style:style>\n");
153
154 /* Style for images. */
155 extract_astring_cat(alloc, out, "<style:style style:name=\"fr1\" style:family=\"graphic\" style:parent-style-name=\"Graphics\">\n");
156 extract_astring_cat(alloc, out, "<style:graphic-properties"
157 " fo:margin-left=\"0in\""
158 " fo:margin-right=\"0in\""
159 " fo:margin-top=\"0in\""
160 " fo:margin-bottom=\"0in\""
161 " style:vertical-pos=\"top\""
162 " style:vertical-rel=\"baseline\""
163 " fo:background-color=\"transparent\""
164 " draw:fill=\"none\""
165 " draw:fill-color=\"#ffffff\""
166 " fo:padding=\"0in\""
167 " fo:border=\"none\""
168 " style:mirror=\"none\""
169 " fo:clip=\"rect(0in, 0in, 0in, 0in)\""
170 " draw:luminance=\"0%\""
171 " draw:contrast=\"0%\""
172 " draw:red=\"0%\""
173 " draw:green=\"0%\""
174 " draw:blue=\"0%\""
175 " draw:gamma=\"100%\""
176 " draw:color-inversion=\"false\""
177 " draw:image-opacity=\"100%\""
178 " draw:color-mode=\"standard\""
179 "/>\n");
180 extract_astring_cat(alloc, out, "</style:style>\n");
181
182 if (extract_astring_cat(alloc, out, "</office:automatic-styles>")) return -1;
183
184 return 0;
185 }
186
187 /* Adds specified style to <styles> if not already present. Sets *o_style to
188 point to the style_t within <styles>. */
189 static int
190 odt_styles_add(
191 extract_alloc_t *alloc,
192 extract_odt_styles_t *styles,
193 font_t *font,
194 extract_odt_style_t **o_style)
195 {
196 extract_odt_style_t style = {0 /*id*/, *font};
197 int i;
198
199 /* We keep styles->styles[] sorted; todo: use bsearch or similar when
200 searching. */
201 for (i=0; i<styles->styles_num; ++i)
202 {
203 int d = odt_style_compare(&style, &styles->styles[i]);
204 if (d == 0)
205 {
206 *o_style = &styles->styles[i];
207 return 0;
208 }
209 if (d > 0) break;
210 }
211 /* Insert at position <i>. */
212 if (extract_realloc(alloc, &styles->styles, sizeof(styles->styles[0]) * (styles->styles_num+1))) return -1;
213 memmove(&styles->styles[i+1], &styles->styles[i], sizeof(styles->styles[0]) * (styles->styles_num - i));
214 styles->styles_num += 1;
215 styles->styles[i].id = styles->styles_num + 10; /* Leave space for template's built-in styles. */
216 if (extract_strdup(alloc, font->name, &styles->styles[i].font.name)) return -1;
217 styles->styles[i].font.size = font->size;
218 styles->styles[i].font.bold = font->bold;
219 styles->styles[i].font.italic = font->italic;
220 *o_style = &styles->styles[i];
221
222 return 0;
223 }
224
225 /* Starts a new run. Caller must ensure that s_odt_run_finish() was
226 called to terminate any previous run. */
227 static int
228 extract_odt_run_start(
229 extract_alloc_t *alloc,
230 extract_astring_t *content,
231 extract_odt_styles_t *styles,
232 content_state_t *content_state)
233 {
234 extract_odt_style_t* style;
235
236 if (odt_styles_add(alloc,
237 styles,
238 &content_state->font,
239 &style)) return -1;
240 if (extract_astring_catf(alloc, content, "<text:span text:style-name=\"T%i\">", style->id)) return -1;
241
242 return 0;
243 }
244
245 static int
246 odt_run_finish(
247 extract_alloc_t *alloc,
248 content_state_t *content_state,
249 extract_astring_t *content)
250 {
251 if (content_state)
252 content_state->font.name = NULL;
253 return extract_astring_cat(alloc, content, "</text:span>");
254 }
255
256 /* Append an empty paragraph to *content. */
257 static int
258 odt_append_empty_paragraph(
259 extract_alloc_t *alloc,
260 extract_astring_t *content,
261 extract_odt_styles_t *styles)
262 {
263 int e = -1;
264 static char fontname[] = "OpenSans";
265 content_state_t content_state = {0};
266
267 if (odt_paragraph_start(alloc, content)) goto end;
268 /* [This comment is from docx, haven't checked odt.] It seems like our
269 choice of font size here doesn't make any difference to the amount of
270 vertical space, unless we include a non-space character. Presumably
271 something to do with the styles in the template document. */
272 content_state.font.name = fontname;
273 content_state.font.size = 10;
274 content_state.font.bold = 0;
275 content_state.font.italic = 0;
276 if (extract_odt_run_start(alloc, content, styles, &content_state)) goto end;
277 //docx_char_append_string(content, "&#160;"); /* &#160; is non-break space. */
278 if (odt_run_finish(alloc, NULL /*content_state*/, content)) goto end;
279 if (odt_paragraph_finish(alloc, content)) goto end;
280 e = 0;
281
282 end:
283 return e;
284 }
285
286
287 /* Append odt xml for <paragraph> to <content>. Updates *content_state if we
288 change font. */
289 static int
290 document_to_odt_content_paragraph(
291 extract_alloc_t *alloc,
292 content_state_t *content_state,
293 paragraph_t *paragraph,
294 extract_astring_t *content,
295 extract_odt_styles_t *styles)
296 {
297 int e = -1;
298 content_line_iterator lit;
299 line_t *line;
300
301 if (odt_paragraph_start(alloc, content)) goto end;
302
303 /* Output justification. */
304 if ((paragraph->line_flags & paragraph_not_fully_justified) == 0)
305 {
306 if (extract_astring_cat(alloc, content, "<w:pPr><w:jc w:val=\"both\"/></w:pPr>")) goto end;
307 }
308 else if ((paragraph->line_flags & paragraph_not_centred) == 0)
309 {
310 if (extract_astring_cat(alloc, content, "<w:pPr><w:jc w:val=\"center\"/></w:pPr>")) goto end;
311 }
312 else if ((paragraph->line_flags & (paragraph_not_aligned_left | paragraph_not_aligned_right)) == paragraph_not_aligned_left)
313 {
314 if (extract_astring_cat(alloc, content, "<w:pPr><w:jc w:val=\"right\"/></w:pPr>")) goto end;
315 }
316 else if ((paragraph->line_flags & (paragraph_not_aligned_left | paragraph_not_aligned_right)) == paragraph_not_aligned_right)
317 {
318 if (extract_astring_cat(alloc, content, "<w:pPr><w:jc w:val=\"left\"/></w:pPr>")) goto end;
319 }
320
321
322 for (line = content_line_iterator_init(&lit, &paragraph->content); line != NULL; line = content_line_iterator_next(&lit))
323 {
324 content_span_iterator sit;
325 span_t *span;
326
327 for (span = content_span_iterator_init(&sit, &line->content); span != NULL; span = content_span_iterator_next(&sit))
328 {
329 int si;
330 double font_size_new;
331
332 content_state->ctm_prev = &span->ctm;
333 font_size_new = extract_font_size(&span->ctm);
334 if (!content_state->font.name
335 || strcmp(span->font_name, content_state->font.name)
336 || span->flags.font_bold != content_state->font.bold
337 || span->flags.font_italic != content_state->font.italic
338 || font_size_new != content_state->font.size
339 )
340 {
341 if (content_state->font.name)
342 {
343 if (odt_run_finish(alloc, content_state, content)) goto end;
344 }
345 content_state->font.name = span->font_name;
346 content_state->font.bold = span->flags.font_bold;
347 content_state->font.italic = span->flags.font_italic;
348 content_state->font.size = font_size_new;
349 if (extract_odt_run_start( alloc, content, styles, content_state)) goto end;
350 }
351
352 for (si=0; si<span->chars_num; ++si)
353 {
354 char_t* char_ = &span->chars[si];
355 int c = char_->ucs;
356 if (extract_astring_catc_unicode_xml(alloc, content, c)) goto end;
357 }
358 /* Remove any trailing '-' at end of line. */
359 if (extract_astring_char_truncate_if(content, '-')) goto end;
360 }
361 if (paragraph->line_flags & paragraph_breaks_strangely)
362 {
363 if (extract_astring_cat(alloc, content, "<w:br/>")) goto end;
364 }
365 }
366 if (content_state->font.name)
367 {
368 if (odt_run_finish(alloc, content_state, content)) goto end;
369 }
370 if (odt_paragraph_finish(alloc, content)) goto end;
371
372 e = 0;
373
374 end:
375 return e;
376 }
377
378 /* Write reference to image into odt content. */
379 static int
380 odt_append_image(
381 extract_alloc_t *alloc,
382 extract_astring_t *output,
383 image_t *image)
384 {
385 extract_astring_cat(alloc, output, "\n");
386 extract_astring_cat(alloc, output, "<text:p text:style-name=\"Standard\">\n");
387 extract_astring_catf(alloc, output, "<draw:frame draw:style-name=\"fr1\" draw:name=\"Picture %s\" text:anchor-type=\"as-char\" svg:width=\"%fin\" svg:height=\"%fin\" draw:z-index=\"0\">\n",
388 image->id,
389 image->w / 72.0,
390 image->h / 72.0);
391 extract_astring_catf(alloc, output, "<draw:image xlink:href=\"Pictures/%s\" xlink:type=\"simple\" xlink:show=\"embed\" xlink:actuate=\"onLoad\" draw:mime-type=\"image/%s\"/>\n",
392 image->name,
393 image->type);
394 extract_astring_cat(alloc, output, "</draw:frame>\n");
395 extract_astring_cat(alloc, output, "</text:p>\n");
396
397 return 0;
398 }
399
400
401 /* Writes paragraph to content inside rotated text box. */
402 static int
403 odt_output_rotated_paragraphs(
404 extract_alloc_t *alloc,
405 block_t *block,
406 double rotation_rad,
407 double x_pt,
408 double y_pt,
409 double w_pt,
410 double h_pt,
411 int text_box_id,
412 extract_astring_t *content,
413 extract_odt_styles_t *styles,
414 content_state_t *content_state)
415 {
416 int e = 0;
417 paragraph_t *paragraph;
418 content_paragraph_iterator pit;
419 double pt_to_inch = 1/72.0;
420
421 outf("rotated paragraphs: rotation_rad=%f (x y)=(%f %f) (w h)=(%f %f)", rotation_rad, x_pt, y_pt, w_pt, h_pt);
422
423 // https://docs.oasis-open.org/office/OpenDocument/v1.3/cs02/part3-schema/OpenDocument-v1.3-cs02-part3-schema.html#attribute-draw_transform
424 // says rotation is in degrees, but we seem to require -radians.
425 //
426
427 if (!e) e = extract_astring_cat(alloc, content, "\n");
428
429 if (!e) e = extract_astring_cat(alloc, content, "<text:p text:style-name=\"Standard\">\n");
430 if (!e) e = extract_astring_catf(alloc, content, "<draw:frame"
431 " text:anchor-type=\"paragraph\""
432 " draw:z-index=\"5\""
433 " draw:name=\"Shape%i\""
434 " draw:style-name=\"gr1\""
435 " draw:text-style-name=\"Standard\""
436 " svg:width=\"%fin\""
437 " svg:height=\"%fin\""
438 " draw:transform=\"rotate (%f) translate (%fin %fin)\""
439 ">\n"
440 ,
441 text_box_id,
442 w_pt * pt_to_inch,
443 h_pt * pt_to_inch,
444 -rotation_rad,
445 x_pt * pt_to_inch,
446 y_pt * pt_to_inch
447 );
448 if (!e) e = extract_astring_cat(alloc, content, "<draw:text-box>\n");
449
450 for (paragraph = content_paragraph_iterator_init(&pit, &block->content); paragraph != NULL; paragraph = content_paragraph_iterator_next(&pit))
451 if (!e) e = document_to_odt_content_paragraph(alloc, content_state, paragraph, content, styles);
452
453 if (!e) e = extract_astring_cat(alloc, content, "\n");
454 if (!e) e = extract_astring_cat(alloc, content, "</draw:text-box>\n");
455 if (!e) e = extract_astring_cat(alloc, content, "</draw:frame>\n");
456
457 if (!e) e = extract_astring_cat(alloc, content, "</text:p>\n");
458
459 return e;
460 }
461
462
463 static int
464 odt_append_table(
465 extract_alloc_t *alloc,
466 table_t *table,
467 extract_astring_t *output,
468 extract_odt_styles_t *styles)
469 {
470 int e = -1;
471 int y;
472
473 {
474 int x;
475 static int table_number = 0;
476 table_number += 1;
477 if (extract_astring_catf(alloc, output,
478 "\n"
479 " <table:table text:style-name=\"extract.table\" table:name=\"extract.table.%i\">\n"
480 " <table:table-columns>\n"
481 ,
482 table_number
483 )) goto end;
484
485 for (x=0; x<table->cells_num_x; ++x)
486 {
487 if (extract_astring_cat(alloc, output,
488 " <table:table-column table:style-name=\"extract.table.column\"/>\n"
489 )) goto end;
490 }
491 if (extract_astring_cat(alloc, output,
492 " </table:table-columns>\n"
493 )) goto end;
494 }
495 for (y=0; y<table->cells_num_y; ++y)
496 {
497 int x;
498 if (extract_astring_cat(alloc, output,
499 " <table:table-row>\n"
500 )) goto end;
501
502 for (x=0; x<table->cells_num_x; ++x)
503 {
504 cell_t *cell = table->cells[y*table->cells_num_x + x];
505 content_paragraph_iterator pit;
506 paragraph_t *paragraph;
507 content_state_t content_state;
508
509 if (!cell->above || !cell->left)
510 {
511 if (extract_astring_cat(alloc, output, " <table:covered-table-cell/>\n")) goto end;
512 continue;
513 }
514
515 if (extract_astring_cat(alloc, output, " <table:table-cell")) goto end;
516 if (cell->extend_right > 1)
517 {
518 if (extract_astring_catf(alloc, output, " table:number-columns-spanned=\"%i\"", cell->extend_right)) goto end;
519 }
520 if (cell->extend_down > 1)
521 {
522 if (extract_astring_catf(alloc, output, " table:number-rows-spanned=\"%i\"", cell->extend_down)) goto end;
523 }
524 if (extract_astring_catf(alloc, output, ">\n")) goto end;
525
526 /* Write contents of this cell. */
527 content_state.font.name = NULL;
528 content_state.ctm_prev = NULL;
529 for (paragraph = content_paragraph_iterator_init(&pit, &cell->content); paragraph != NULL; paragraph = content_paragraph_iterator_next(&pit))
530 if (document_to_odt_content_paragraph(alloc, &content_state, paragraph, output, styles)) goto end;
531 if (content_state.font.name)
532 if (odt_run_finish(alloc, &content_state, output)) goto end;
533 if (extract_astring_cat(alloc, output, "\n")) goto end;
534 if (extract_astring_cat(alloc, output, " </table:table-cell>\n")) goto end;
535 }
536 if (extract_astring_cat(alloc, output, " </table:table-row>\n")) goto end;
537 }
538 if (extract_astring_cat(alloc, output, " </table:table>\n")) goto end;
539 e = 0;
540
541 end:
542 return e;
543 }
544
545
546 /* Appends paragraphs with same rotation, starting with subpage->paragraphs[*p]
547 and updates *p. */
548 static int
549 odt_append_rotated_paragraphs(
550 extract_alloc_t *alloc,
551 content_state_t *content_state,
552 block_t *block,
553 int *text_box_id,
554 const matrix4_t *ctm,
555 double rotate,
556 extract_astring_t *output,
557 extract_odt_styles_t *styles)
558 {
559 /* Find extent of paragraphs with this same rotation. extent
560 will contain max width and max height of paragraphs, in units
561 before application of ctm, i.e. before rotation. */
562 int e = -1;
563 point_t extent = {0, 0};
564 content_iterator cit;
565 content_t *content;
566 paragraph_t *paragraph = content_first_paragraph(&block->content);
567
568 /* We assume that first span is at origin of text
569 * block. This assumes left-to-right text. */
570 span_t *first_span = content_first_span(&content_first_line(&paragraph->content)->content);
571 point_t origin = { first_span->chars[0].x,
572 first_span->chars[0].y };
573 matrix_t ctm_inverse = {1, 0, 0, 1, 0, 0};
574 double ctm_det = ctm->a*ctm->d - ctm->b*ctm->c;
575
576 outf("rotate=%.2frad=%.1fdeg ctm: origin=(%f %f) abcd=(%f %f %f %f)",
577 rotate, rotate * 180 / pi,
578 origin.x,
579 origin.y,
580 ctm->a,
581 ctm->b,
582 ctm->c,
583 ctm->d
584 );
585
586 if (ctm_det != 0)
587 {
588 ctm_inverse.a = +ctm->d / ctm_det;
589 ctm_inverse.b = -ctm->b / ctm_det;
590 ctm_inverse.c = -ctm->c / ctm_det;
591 ctm_inverse.d = +ctm->a / ctm_det;
592 }
593 else
594 {
595 outf("cannot invert ctm=(%f %f %f %f)",
596 ctm->a, ctm->b, ctm->c, ctm->d);
597 }
598
599 for (content = content_iterator_init(&cit, &block->content); content != NULL; content = content_iterator_next(&cit))
600 {
601 content_line_iterator lit;
602 line_t *line;
603 paragraph_t *paragraph;
604
605 assert(content->type == content_paragraph);
606 if (content->type != content_paragraph)
607 continue; /* This shouldn't happen for now! */
608
609 paragraph = (paragraph_t *)content;
610
611 /* Update <extent>. */
612 for (line = content_line_iterator_init(&lit, &paragraph->content); line != NULL; line = content_line_iterator_next(&lit))
613 {
614 span_t *span = extract_line_span_last(line);
615 char_t *char_ = extract_span_char_last(span);
616 double adv = char_->adv * extract_font_size(&span->ctm);
617 double x = char_->x + adv * cos(rotate);
618 double y = char_->y + adv * sin(rotate);
619
620 double dx = x - origin.x;
621 double dy = y - origin.y;
622
623 /* Position relative to origin and before box rotation. */
624 double xx = ctm_inverse.a * dx + ctm_inverse.b * dy;
625 double yy = ctm_inverse.c * dx + ctm_inverse.d * dy;
626 yy = -yy;
627 if (xx > extent.x) extent.x = xx;
628 if (yy > extent.y) extent.y = yy;
629 if (0) outf("rotate=%f origin=(%f %f) xy=(%f %f) dxy=(%f %f) xxyy=(%f %f) span: %s",
630 rotate, origin.x, origin.y, x, y, dx, dy, xx, yy, extract_span_string(alloc, span));
631 }
632 }
633 outf("rotate=%f extent is: (%f %f)",
634 rotate, extent.x, extent.y);
635
636 /* All the paragraphs have same rotation. We output them into
637 * a single rotated text box. */
638
639 /* We need unique id for text box. */
640 *text_box_id += 1;
641
642 if (odt_output_rotated_paragraphs(
643 alloc,
644 block,
645 rotate,
646 origin.x,
647 origin.y,
648 extent.x,
649 extent.y,
650 *text_box_id,
651 output,
652 styles,
653 content_state)) goto end;
654
655 e = 0;
656 end:
657
658 return e;
659 }
660
661
662 static int
663 extract_page_to_odt_content(
664 extract_alloc_t *alloc,
665 extract_page_t *page,
666 int spacing,
667 int rotation,
668 int images,
669 extract_astring_t *output,
670 extract_odt_styles_t *styles)
671 {
672 int ret = -1;
673 int text_box_id = 0;
674 int c;
675
676 /* Write paragraphs into <content>. */
677 for (c=0; c<page->subpages_num; ++c)
678 {
679 subpage_t *subpage = page->subpages[c];
680 content_iterator cit;
681 content_t *content;
682 content_table_iterator tit;
683 table_t *table;
684 content_state_t content_state;
685 content_state.font.name = NULL;
686 content_state.font.size = 0;
687 content_state.font.bold = 0;
688 content_state.font.italic = 0;
689 content_state.ctm_prev = NULL;
690
691 content = content_iterator_init(&cit, &subpage->content);
692 table = content_table_iterator_init(&tit, &subpage->tables);
693 while (1)
694 {
695 double y_paragraph;
696 double y_table;
697 block_t *block = (content && content->type == content_block) ? (block_t *)content : NULL;
698 paragraph_t *paragraph = (content && content->type == content_paragraph) ? (paragraph_t *)content : (block ? content_first_paragraph(&block->content) : NULL);
699 line_t *first_line = paragraph ? content_first_line(&paragraph->content) : NULL;
700 span_t *first_span = first_line ? content_first_span(&first_line->content) : NULL;
701 if (!paragraph && !table) break;
702 y_paragraph = (first_span) ? first_span->chars[0].y : DBL_MAX;
703 y_table = (table) ? table->pos.y : DBL_MAX;
704
705 if (first_span && y_paragraph < y_table)
706 {
707 const matrix4_t *ctm = &first_span->ctm;
708 double rotate = atan2(ctm->b, ctm->a);
709
710 if (spacing
711 && content_state.ctm_prev
712 && first_span
713 && extract_matrix4_cmp(content_state.ctm_prev,
714 &first_span->ctm)
715 )
716 {
717 /* Extra vertical space between paragraphs that were at
718 different angles in the original document. */
719 if (odt_append_empty_paragraph(alloc, output, styles)) goto end;
720 }
721
722 if (spacing)
723 {
724 /* Extra vertical space between paragraphs. */
725 if (odt_append_empty_paragraph(alloc, output, styles)) goto end;
726 }
727
728 if (rotation && rotate != 0)
729 {
730 assert(block);
731 if (odt_append_rotated_paragraphs(alloc, &content_state, block, &text_box_id, ctm, rotate, output, styles)) goto end;
732 }
733 else if (block)
734 {
735 content_paragraph_iterator pit;
736 int first = 1;
737
738 for (paragraph = content_paragraph_iterator_init(&pit, &block->content); paragraph != NULL; paragraph = content_paragraph_iterator_next(&pit))
739 {
740 if (spacing && !first)
741 {
742 /* Extra vertical space between paragraphs. */
743 if (odt_append_empty_paragraph(alloc, output, styles)) goto end;
744 }
745 first = 0;
746
747 if (document_to_odt_content_paragraph(alloc, &content_state, paragraph, output, styles)) goto end;
748 }
749 }
750 else
751 {
752 if (document_to_odt_content_paragraph(alloc, &content_state, paragraph, output, styles)) goto end;
753 }
754 content = content_iterator_next(&cit);
755 }
756 else if (table)
757 {
758 if (odt_append_table(alloc, table, output, styles)) goto end;
759 table = content_table_iterator_next(&tit);
760 }
761 }
762
763 outf("images=%i", images);
764 if (images)
765 {
766 content_t *images, *next;
767 outf("subpage->images_num=%i", content_count_images(&subpage->content));
768 for (images = subpage->content.base.next; images != &subpage->content.base; images = next)
769 {
770 image_t *image = (image_t *)images;
771 next = images->next;
772 if (images->type != content_image)
773 continue;
774 odt_append_image(alloc, output, image);
775 }
776 }
777 }
778 ret = 0;
779
780 end:
781
782 return ret;
783 }
784
785 int
786 extract_document_to_odt_content(
787 extract_alloc_t *alloc,
788 document_t *document,
789 int spacing,
790 int rotation,
791 int images,
792 extract_astring_t *content,
793 extract_odt_styles_t *styles)
794 {
795 int p;
796 int ret = 0;
797
798 /* Write paragraphs into <content>. */
799 for (p=0; p<document->pages_num; ++p)
800 {
801 extract_page_t *page = document->pages[p];
802
803 ret = extract_page_to_odt_content(
804 alloc,
805 page,
806 spacing,
807 rotation,
808 images,
809 content,
810 styles);
811 if (ret) break;
812 };
813
814 return ret;
815 }
816
817 int
818 extract_odt_content_item(
819 extract_alloc_t *alloc,
820 extract_astring_t *contentss,
821 int contentss_num,
822 extract_odt_styles_t *styles,
823 images_t *images,
824 const char *name,
825 const char *text,
826 char **text2)
827 {
828 int e = -1;
829 extract_astring_t temp;
830 extract_astring_init(&temp);
831 *text2 = NULL;
832
833 (void) images;
834 if (0)
835 {}
836 else if (!strcmp(name, "content.xml"))
837 {
838 /* Insert paragraphs content. */
839 char* text_intermediate = NULL;
840 extract_astring_t styles_definitions = {0};
841
842 /* Insert content before '</office:text>'. */
843 if (extract_content_insert(
844 alloc,
845 text,
846 NULL /*single*/,
847 NULL /*mid_begin_name*/,
848 "</office:text>" /*mid_end_name*/,
849 contentss,
850 contentss_num,
851 &text_intermediate
852 )) goto end;
853 outf("text_intermediate: %s", text_intermediate);
854
855 /* Convert <styles> to text. */
856 if (odt_styles_definitions(alloc, styles, &styles_definitions)) goto end;
857
858 /* To make tables work, we seem to need to specify table and column
859 styles, and these can be empty. todo: maybe specify exact sizes based
860 on the pdf table and cell dimensions. */
861 if (extract_astring_cat(alloc, &styles_definitions,
862 "\n"
863 "<style:style style:name=\"extract.table\" style:family=\"table\"/>\n"
864 "<style:style style:name=\"extract.table.column\" style:family=\"table-column\"/>\n"
865 )) goto end;
866
867 /* Replace '<office:automatic-styles/>' with text from
868 <styles_definitions>. */
869 e = extract_content_insert(
870 alloc,
871 text_intermediate,
872 "<office:automatic-styles/>" /*single*/,
873 NULL /*mid_begin_name*/,
874 NULL /*mid_end_name*/,
875 &styles_definitions,
876 1,
877 text2
878 );
879 outf("e=%i errno=%i", e, errno);
880 extract_free(alloc, &text_intermediate);
881 extract_astring_free(alloc, &styles_definitions);
882 outf("e=%i errno=%i", e, errno);
883 if (e) goto end;
884 }
885 else if (!strcmp(name, "META-INF/manifest.xml"))
886 {
887 /* Add images. */
888 int e = 0;
889 int i;
890 for (i=0; i<images->images_num; ++i)
891 {
892 image_t* image = images->images[i];
893 if (!e) e = extract_astring_catf(alloc, &temp, "<manifest:file-entry manifest:full-path=\"Pictures/%s\" manifest:media-type=\"image/%s\"/>\n",
894 image->name,
895 image->type
896 );
897 }
898 if (!e) e = extract_content_insert(
899 alloc,
900 text,
901 NULL /*single*/,
902 NULL /*mid_begin_name*/,
903 "</manifest:manifest>" /*mid_end_name*/,
904 &temp,
905 1,
906 text2
907 );
908 if (e) goto end;
909 }
910 else
911 {
912 *text2 = NULL;
913 }
914 e = 0;
915 end:
916 outf("e=%i errno=%i text2=%s", e, errno, text2 ? *text2 : "");
917 if (e)
918 {
919 /* We might have set <text2> to new content. */
920 extract_free(alloc, text2);
921 /* We might have used <temp> as a temporary buffer. */
922 }
923 extract_astring_free(alloc, &temp);
924 extract_astring_init(&temp);
925 return e;
926 }
927
928
929
930 int
931 extract_odt_write_template(
932 extract_alloc_t *alloc,
933 extract_astring_t *contentss,
934 int contentss_num,
935 extract_odt_styles_t *styles,
936 images_t *images,
937 const char *path_template,
938 const char *path_out,
939 int preserve_dir)
940 {
941 int e = -1;
942 int i;
943 char* path_tempdir = NULL;
944 char* path = NULL;
945 char* text = NULL;
946 char* text2 = NULL;
947
948 assert(path_out);
949 assert(path_template);
950
951 if (extract_check_path_shell_safe(path_out))
952 {
953 outf("path_out is unsafe: %s", path_out);
954 goto end;
955 }
956
957 outf("images->images_num=%i", images->images_num);
958 if (extract_asprintf(alloc, &path_tempdir, "%s.dir", path_out) < 0) goto end;
959 if (extract_systemf(alloc, "rm -r '%s' 2>/dev/null", path_tempdir) < 0) goto end;
960
961 if (extract_mkdir(path_tempdir, 0777))
962 {
963 outf("Failed to create directory: %s", path_tempdir);
964 goto end;
965 }
966
967 outf("Unzipping template document '%s' to tempdir: %s",
968 path_template, path_tempdir);
969 if (extract_systemf(alloc, "unzip -q -d '%s' '%s'", path_tempdir, path_template))
970 {
971 outf("Failed to unzip %s into %s",
972 path_template, path_tempdir);
973 goto end;
974 }
975
976 /* Might be nice to iterate through all items in path_tempdir, but for now
977 we look at just the items that we know extract_odt_content_item() will
978 modify. */
979
980 {
981 const char *names[] =
982 {
983 "content.xml",
984 "META-INF/manifest.xml",
985 };
986 int names_num = sizeof(names) / sizeof(names[0]);
987 for (i=0; i<names_num; ++i)
988 {
989 const char* name = names[i];
990 extract_free(alloc, &path);
991 extract_free(alloc, &text);
992 extract_free(alloc, &text2);
993 if (extract_asprintf(alloc, &path, "%s/%s", path_tempdir, name) < 0) goto end;
994 if (extract_read_all_path(alloc, path, &text)) goto end;
995
996 outf("before extract_odt_content_item() styles->styles_num=%i", styles->styles_num);
997 if (extract_odt_content_item(
998 alloc,
999 contentss,
1000 contentss_num,
1001 styles,
1002 images,
1003 name,
1004 text,
1005 &text2
1006 ))
1007 {
1008 outf("extract_odt_content_item() failed");
1009 goto end;
1010 }
1011
1012 outf("after extract_odt_content_item styles->styles_num=%i", styles->styles_num);
1013
1014 {
1015 const char* text3 = (text2) ? text2 : text;
1016 if (extract_write_all(text3, strlen(text3), path)) goto end;
1017 outf("have written to path=%s", path);
1018 }
1019 }
1020 }
1021
1022 /* Copy images into <path_tempdir>/Pictures/. */
1023 extract_free(alloc, &path);
1024 if (extract_asprintf(alloc, &path, "%s/Pictures", path_tempdir) < 0) goto end;
1025 if (extract_mkdir(path, 0777))
1026 {
1027 outf("Failed to mkdir %s", path);
1028 goto end;
1029 }
1030 for (i=0; i<images->images_num; ++i)
1031 {
1032 image_t* image = images->images[i];
1033 extract_free(alloc, &path);
1034 if (extract_asprintf(alloc, &path, "%s/Pictures/%s", path_tempdir, image->name) < 0) goto end;
1035 if (extract_write_all(image->data, image->data_size, path)) goto end;
1036 }
1037
1038 outf("Zipping tempdir to create %s", path_out);
1039 {
1040 const char* path_out_leaf = strrchr(path_out, '/');
1041 if (!path_out_leaf) path_out_leaf = path_out;
1042 if (extract_systemf(alloc, "cd '%s' && zip -q -r -D '../%s' .", path_tempdir, path_out_leaf))
1043 {
1044 outf("Zip command failed to convert '%s' directory into output file: %s",
1045 path_tempdir, path_out);
1046 goto end;
1047 }
1048 }
1049
1050 if (!preserve_dir)
1051 {
1052 if (extract_remove_directory(alloc, path_tempdir)) goto end;
1053 }
1054
1055 e = 0;
1056
1057 end:
1058 outf("e=%i", e);
1059 extract_free(alloc, &path_tempdir);
1060 extract_free(alloc, &path);
1061 extract_free(alloc, &text);
1062 extract_free(alloc, &text2);
1063
1064 if (e)
1065 {
1066 outf("Failed to create %s", path_out);
1067 }
1068 return e;
1069 }