comparison mupdf-source/source/html/office.c @ 2:b50eed0cc0ef upstream

ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4. The directory name has changed: no version number in the expanded directory now.
author Franz Glasner <fzglas.hg@dom66.de>
date Mon, 15 Sep 2025 11:43:07 +0200
parents
children
comparison
equal deleted inserted replaced
1:1d09e1dec1d9 2:b50eed0cc0ef
1 // Copyright (C) 2023-2025 Artifex Software, Inc.
2 //
3 // This file is part of MuPDF.
4 //
5 // MuPDF is free software: you can redistribute it and/or modify it under the
6 // terms of the GNU Affero General Public License as published by the Free
7 // Software Foundation, either version 3 of the License, or (at your option)
8 // any later version.
9 //
10 // MuPDF is distributed in the hope that it will be useful, but WITHOUT ANY
11 // WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
12 // FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
13 // details.
14 //
15 // You should have received a copy of the GNU Affero General Public License
16 // along with MuPDF. If not, see <https://www.gnu.org/licenses/agpl-3.0.en.html>
17 //
18 // Alternative licensing terms are available from the licensor.
19 // For commercial licensing, see <https://www.artifex.com/> or contact
20 // Artifex Software, Inc., 39 Mesa Street, Suite 108A, San Francisco,
21 // CA 94129, USA, for further information.
22
23 #include "mupdf/fitz.h"
24 #include "html-imp.h"
25
26 #undef DEBUG_OFFICE_TO_HTML
27
/*
 * Option switches for the office-to-HTML conversion.
 *
 * A zero-filled structure represents the defaults.
 * FIXME: Very subject to change. Possibly might be removed entirely.
 */
typedef struct
{
	int output_page_numbers;
	int output_sheet_names;
	int output_cell_markers;
	int output_cell_row_markers;
	int output_cell_names;
	int output_formatting;
	int output_filenames;
	int output_errors;
} fz_office_to_html_opts;
41
42 typedef struct
43 {
44 fz_office_to_html_opts opts;
45
46 fz_output *out;
47
48 int page;
49
50 /* State for if we are parsing a sheet. */
51 /* The last column label we have to send. */
52 char *label;
53 /* Columns are numbered from 1. */
54 /* The column we are at. */
55 int col_at;
56 /* The column we last signalled. If this is 0, then we haven't
57 * even started a row yet. */
58 int col_signalled;
59
60 /* If we are currently processing a spreadsheet, store the current
61 * sheets name here. */
62 const char *sheet_name;
63
64 int shared_string_max;
65 int shared_string_len;
66 char **shared_strings;
67
68 int footnotes_max;
69 char **footnotes;
70
71 char *title;
72 } doc_info;
73
74 static void
75 doc_escape(fz_context *ctx, fz_output *output, const char *str_)
76 {
77 const unsigned char *str = (const unsigned char *)str_;
78 int c;
79
80 if (!str)
81 return;
82
83 while ((c = *str++) != 0)
84 {
85 if (c == '&')
86 {
87 fz_write_string(ctx, output, "&amp;");
88 }
89 else if (c == '<')
90 {
91 fz_write_string(ctx, output, "&lt;");
92 }
93 else if (c == '>')
94 {
95 fz_write_string(ctx, output, "&gt;");
96 }
97 else
98 {
99 /* We get utf-8 in, just parrot it out again. */
100 fz_write_byte(ctx, output, c);
101 }
102 }
103 }
104
105 static void
106 show_text(fz_context *ctx, fz_xml *top, doc_info *info)
107 {
108 fz_xml *pos = top;
109 fz_xml *next;
110
111 while (pos)
112 {
113 doc_escape(ctx, info->out, fz_xml_text(pos));
114
115 if (fz_xml_is_tag(pos, "lineBreak"))
116 {
117 fz_write_string(ctx, info->out, "\n");
118 }
119 else if (fz_xml_is_tag(pos, "tab"))
120 {
121 fz_write_string(ctx, info->out, "\t");
122 }
123 else if (fz_xml_is_tag(pos, "lastRenderedPageBreak"))
124 {
125 info->page++;
126 }
127
128 /* Always try to move down. */
129 next = fz_xml_down(pos);
130 if (next)
131 {
132 /* We can move down, easy! */
133 pos = next;
134 continue;
135 }
136
137 if (pos == top)
138 break;
139
140 /* We can't move down, try moving to next. */
141 next = fz_xml_next(pos);
142 if (next)
143 {
144 /* We can move to next, easy! */
145 pos = next;
146 continue;
147 }
148
149 /* If we can't go down, or next, pop up until we
150 * find somewhere we can go next from. */
151 while (1)
152 {
153 /* OK. So move up. */
154 pos = fz_xml_up(pos);
155 /* Check for hitting the top. */
156 if (pos == top)
157 pos = NULL;
158 if (pos == NULL)
159 break;
160 /* We've returned to a node. See if it's a 'p'. */
161 if (fz_xml_is_tag(pos, "p"))
162 {
163 fz_write_string(ctx, info->out, "\n");
164 }
165 next = fz_xml_next(pos);
166 if (next)
167 {
168 pos = next;
169 break;
170 }
171 }
172 }
173 }
174
175 static void
176 show_footnote(fz_context *ctx, fz_xml *v, doc_info *info)
177 {
178 int n = fz_atoi(fz_xml_att(v, "w:id"));
179
180 if (n < 0 || n >= info->footnotes_max)
181 return;
182
183 if (info->footnotes[n] == NULL ||
184 info->footnotes[n][0] == 0)
185 return;
186
187 /* Then send the strings. */
188 doc_escape(ctx, info->out, info->footnotes[n]);
189 }
190
191 static void
192 process_doc_stream(fz_context *ctx, fz_xml *xml, doc_info *info, int do_pages)
193 {
194 fz_xml *pos;
195 fz_xml *next;
196 const char *paragraph_style = NULL;
197 const char *inline_style = NULL;
198
199 #ifdef DEBUG_OFFICE_TO_HTML
200 fz_write_printf(ctx, fz_stddbg(ctx), "process_doc_stream:\n");
201 fz_output_xml(ctx, fz_stddbg(ctx), xml, 0);
202 #endif
203
204 /* First off, see if we can do page numbers. */
205 if (do_pages)
206 {
207 pos = fz_xml_find_dfs(xml, "lastRenderedPageBreak", NULL, NULL);
208 if (pos)
209 {
210 /* We *can* do page numbers, so start here. */
211 fz_write_string(ctx, info->out, "<div id=\"page1\">\n");
212 info->page = 1;
213 }
214 }
215
216 /* Now walk the tree for real. */
217 pos = xml;
218 while (pos)
219 {
220 /* When we arrive on a node, check if it's a 't'. */
221 if (fz_xml_is_tag(pos, "t"))
222 {
223 show_text(ctx, pos, info);
224 /* Do NOT go down, we've already dealt with that. */
225 }
226 else if (fz_xml_is_tag(pos, "br"))
227 {
228 if (paragraph_style && strcmp(paragraph_style, "pre"))
229 {
230 fz_write_printf(ctx, info->out, "<br/>\n");
231 }
232 else
233 {
234 fz_write_printf(ctx, info->out, "\n");
235 }
236 }
237 else if (fz_xml_is_tag(pos, "footnoteReference"))
238 {
239 show_footnote(ctx, pos, info);
240 /* Do NOT go down, we've already dealt with that. */
241 }
242 else if (fz_xml_is_tag(pos, "tabs"))
243 {
244 /* Don't walk through tabs, or we will hit lots of 'tab' entries and
245 * output incorrect information. */
246 }
247 else if (fz_xml_is_tag(pos, "pStyle"))
248 {
249 /* Should prob fix fz_xml_*() to strip namespace prefix
250 from attributes, to match what it does for tag names.
251 */
252 paragraph_style = fz_xml_att(pos, "w:val");
253 if (paragraph_style)
254 {
255 if (!strcmp(paragraph_style, "BodyText"))
256 paragraph_style = NULL;
257 else if (!strcmp(paragraph_style, "Heading1"))
258 paragraph_style = "h1";
259 else if (!strcmp(paragraph_style, "Heading2"))
260 paragraph_style = "h2";
261 else if (!strcmp(paragraph_style, "Heading3"))
262 paragraph_style = "h3";
263 else if (!strcmp(paragraph_style, "Heading4"))
264 paragraph_style = "h4";
265 else if (!strcmp(paragraph_style, "Heading5"))
266 paragraph_style = "h5";
267 else if (!strcmp(paragraph_style, "Heading6"))
268 paragraph_style = "h6";
269 else if (!strcmp(paragraph_style, "SourceCode"))
270 paragraph_style = "pre";
271 else
272 paragraph_style = NULL;
273
274 if (paragraph_style)
275 fz_write_printf(ctx, info->out, "<%s>", paragraph_style);
276 }
277 }
278 else if (fz_xml_is_tag(pos, "rStyle"))
279 {
280 inline_style = fz_xml_att(pos, "w:val");
281 if (inline_style)
282 {
283 if (!strcmp(inline_style, "VerbatimChar"))
284 inline_style = "tt";
285 else
286 {
287 if (0)
288 fz_write_printf(ctx, info->out, "<!-- %s -->", inline_style);
289 inline_style = NULL;
290 }
291 if (inline_style)
292 fz_write_printf(ctx, info->out, "<%s>", inline_style);
293 }
294 }
295 else
296 {
297 fz_xml *down;
298 if (fz_xml_is_tag(pos, "lineBreak"))
299 {
300 fz_write_string(ctx, info->out, "\n");
301 }
302 else if (fz_xml_is_tag(pos, "p"))
303 {
304 fz_write_string(ctx, info->out, "<p>");
305 }
306 else if (fz_xml_is_tag(pos, "tab"))
307 {
308 fz_write_string(ctx, info->out, "\t");
309 }
310 else if (do_pages && fz_xml_is_tag(pos, "lastRenderedPageBreak"))
311 {
312 if (info->page)
313 fz_write_string(ctx, info->out, "\n</div>\n");
314 info->page++;
315 fz_write_printf(ctx, info->out, "<div id=\"page%d\">\n", info->page);
316 }
317 /* Try to move down. */
318 down = fz_xml_down(pos);
319 if (down)
320 {
321 /* We can move down, easy! */
322 pos = down;
323 continue;
324 }
325 }
326 /* Try moving to next. */
327 next = fz_xml_next(pos);
328 if (next)
329 {
330 /* We can move to next, easy! */
331 pos = next;
332 continue;
333 }
334
335 /* If we can't go down, or next, pop up until we
336 * find somewhere we can go next from. */
337 while (1)
338 {
339 /* OK. So move up. */
340 pos = fz_xml_up(pos);
341 /* Check for hitting the top. */
342 if (pos == NULL)
343 break;
344 /* We've returned to a node. See if it's a 'p'. */
345 if (fz_xml_is_tag(pos, "p"))
346 {
347 if (paragraph_style)
348 {
349 fz_write_printf(ctx, info->out, "</%s>", paragraph_style);
350 paragraph_style = NULL;
351 }
352 fz_write_string(ctx, info->out, "</p>\n");
353 }
354 else if (fz_xml_is_tag(pos, "r"))
355 {
356 /* Seems to be pseudo-close for rStyle. */
357 if (inline_style)
358 {
359 fz_write_printf(ctx, info->out, "</%s>", inline_style);
360 inline_style = NULL;
361 }
362 }
363 next = fz_xml_next(pos);
364 if (next)
365 {
366 pos = next;
367 break;
368 }
369 }
370 }
371
372 if (do_pages && info->page)
373 fz_write_string(ctx, info->out, "\n</div>\n");
374 }
375
376 static void
377 process_item(fz_context *ctx, fz_archive *arch, const char *file, doc_info *info, int do_pages)
378 {
379 fz_xml *xml = fz_parse_xml_archive_entry(ctx, arch, file, 1);
380
381 fz_try(ctx)
382 process_doc_stream(ctx, xml, info, do_pages);
383 fz_always(ctx)
384 fz_drop_xml(ctx, xml);
385 fz_catch(ctx)
386 fz_rethrow(ctx);
387 }
388
389 static void
390 process_rootfile(fz_context *ctx, fz_archive *arch, const char *file, doc_info *info)
391 {
392 fz_xml *xml = fz_parse_xml_archive_entry(ctx, arch, file, 0);
393
394 fz_try(ctx)
395 {
396 /* FIXME: Should really search for these just inside 'spine'. */
397 fz_xml *pos = fz_xml_find_dfs(xml, "itemref", NULL, NULL);
398 while (pos)
399 {
400 char *idref = fz_xml_att(pos, "idref");
401 fz_xml *item = fz_xml_find_dfs(xml, "item", "id", idref);
402 while (item)
403 {
404 char *type = fz_xml_att(item, "media-type");
405 char *href = fz_xml_att(item, "href");
406 if (type && href && !strcmp(type, "application/xml"))
407 {
408 process_item(ctx, arch, href, info, 1);
409 }
410 item = fz_xml_find_next_dfs(pos, "item", "id", idref);
411 }
412 pos = fz_xml_find_next_dfs(pos, "itemref", NULL, NULL);
413 }
414 }
415 fz_always(ctx)
416 fz_drop_xml(ctx, xml);
417 fz_catch(ctx)
418 fz_rethrow(ctx);
419 }
420
421 /* XLSX support */
422 static char *
423 make_rel_name(fz_context *ctx, const char *file)
424 {
425 size_t z = strlen(file);
426 char *s = fz_malloc(ctx, z + 12);
427 char *t;
428 const char *p;
429 const char *slash = file;
430
431 for (p = file; *p != 0; p++)
432 if (*p == '/')
433 slash = p+1;
434
435 t = s;
436 if (slash != file)
437 {
438 memcpy(t, file, slash - file);
439 t += slash - file;
440 }
441 memcpy(t, "_rels/", 6);
442 t += 6;
443 memcpy(t, file + (slash - file), z - (slash - file));
444 t += z - (slash - file);
445 memcpy(t, ".rels", 6);
446
447 return s;
448 }
449
450 static char *lookup_rel(fz_context *ctx, fz_xml *rels, const char *id)
451 {
452 fz_xml *pos;
453
454 if (id == NULL)
455 return NULL;
456
457 pos = fz_xml_find_dfs(rels, "Relationship", NULL, NULL);
458 while (pos)
459 {
460 char *id2 = fz_xml_att(pos, "Id");
461
462 if (id2 && !strcmp(id, id2))
463 return fz_xml_att(pos, "Target");
464
465 pos = fz_xml_find_next_dfs(pos, "Relationship", NULL, NULL);
466 }
467
468 return NULL;
469 }
470
471 static void
472 send_cell_formatting(fz_context *ctx, doc_info *info)
473 {
474 if (info->col_signalled == 0)
475 {
476 fz_write_string(ctx, info->out, "<tr>\n");
477 info->col_signalled = 1;
478 if (info->col_at > 1)
479 fz_write_string(ctx, info->out, "<td>");
480 }
481
482 /* Send the label */
483 while (info->col_signalled < info->col_at)
484 {
485 fz_write_string(ctx, info->out, "</td>");
486 info->col_signalled++;
487 if (info->col_signalled < info->col_at)
488 fz_write_string(ctx, info->out, "<td>");
489 }
490 if (info->sheet_name && info->sheet_name[0])
491 fz_write_printf(ctx, info->out, "<td id=\"%s!%s\">", info->sheet_name, info->label);
492 else
493 fz_write_printf(ctx, info->out, "<td id=\"%s\">", info->label);
494 }
495
496 static void
497 show_shared_string(fz_context *ctx, fz_xml *v, doc_info *info)
498 {
499 const char *t = fz_xml_text(fz_xml_down(v));
500 int n = fz_atoi(t);
501
502 if (n < 0 || n >= info->shared_string_len)
503 return;
504
505 if (info->shared_strings[n] == NULL ||
506 info->shared_strings[n][0] == 0)
507 return;
508
509 send_cell_formatting(ctx, info);
510 /* Then send the strings. */
511 doc_escape(ctx, info->out, info->shared_strings[n]);
512 }
513
/*
 * Convert the column letters at the start of a cell label (for example
 * the "AB" of "AB12") into a column number, counting from 1.
 *
 * The letters form a bijective base-26 number: A=1 ... Z=26, AA=27 ...
 * ZZ=702, AAA=703 and so on.
 *
 * Returns 0 if the column cannot be read: label is NULL, it does not
 * start with an upper case letter, or it has more than 6 letters (6
 * letters is the most that is guaranteed not to overflow a 32 bit int).
 */
static int
col_from_label(const char *label)
{
	enum { MAX_LETTERS = 6 };
	int col = 0;
	int letters = 0;

	if (label == NULL)
		return 0;

	while (*label >= 'A' && *label <= 'Z')
	{
		if (++letters > MAX_LETTERS)
			return 0;
		col = col * 26 + (*label - 'A' + 1);
		label++;
	}

	return col;
}
543
544 static void
545 show_cell_text(fz_context *ctx, fz_xml *top, doc_info *info)
546 {
547 fz_xml *pos = top;
548 fz_xml *next;
549
550 while (pos)
551 {
552 char *text = fz_xml_text(pos);
553
554 if (text)
555 {
556 send_cell_formatting(ctx, info);
557 doc_escape(ctx, info->out, text);
558 }
559
560 /* Always try to move down. */
561 next = fz_xml_down(pos);
562 if (next)
563 {
564 /* We can move down, easy! */
565 pos = next;
566 continue;
567 }
568
569 if (pos == top)
570 break;
571
572 /* We can't move down, try moving to next. */
573 next = fz_xml_next(pos);
574 if (next)
575 {
576 /* We can move to next, easy! */
577 pos = next;
578 continue;
579 }
580
581 /* If we can't go down, or next, pop up until we
582 * find somewhere we can go next from. */
583 while (1)
584 {
585 /* OK. So move up. */
586 pos = fz_xml_up(pos);
587 /* Check for hitting the top. */
588 if (pos == top)
589 pos = NULL;
590 if (pos == NULL)
591 break;
592 next = fz_xml_next(pos);
593 if (next)
594 {
595 pos = next;
596 break;
597 }
598 }
599 }
600 }
601
602 static void
603 arrived_at_cell(fz_context *ctx, doc_info *info, const char *label)
604 {
605 int col;
606
607 /* If we have a label queued, and no label is given here, then we're
608 * processing a 'cell' callback after having had a 'cellname'
609 * callback. So don't signal it twice! */
610 if (label == NULL && info->label)
611 return;
612
613 col = label ? col_from_label(label) : 0;
614
615 fz_free(ctx, info->label);
616 info->label = NULL;
617 info->label = label ? fz_strdup(ctx, label) : NULL;
618 info->col_at = col;
619 }
620
621 static void
622 show_cell(fz_context *ctx, fz_xml *cell, doc_info *info)
623 {
624 char *t = fz_xml_att(cell, "t");
625 fz_xml *v = fz_xml_find_down(cell, "v");
626 const char *r = fz_xml_att(cell, "r");
627
628 arrived_at_cell(ctx, info, r);
629 if (t && t[0] == 's' && t[1] == 0)
630 show_shared_string(ctx, v, info);
631 else
632 show_cell_text(ctx, v, info);
633 }
634
635 static void
636 new_row(fz_context *ctx, doc_info *info)
637 {
638 if (info->col_signalled)
639 {
640 /* We've sent at least one cell. So need to close the
641 * td and tr */
642 fz_write_string(ctx, info->out, "</td>\n</tr>\n");
643 }
644 else
645 {
646 /* We've not sent anything for this row. Keep the counts
647 * correct. */
648 fz_write_string(ctx, info->out, "<tr></tr>\n");
649 }
650 info->col_at = 1;
651 info->col_signalled = 0;
652 fz_free(ctx, info->label);
653 info->label = NULL;
654 }
655
656 static void
657 process_sheet(fz_context *ctx, fz_archive *arch, const char *name, const char *file, doc_info *info)
658 {
659 fz_xml *xml = fz_parse_xml_archive_entry(ctx, arch, file, 1);
660
661 #ifdef DEBUG_OFFICE_TO_HTML
662 fz_write_printf(ctx, fz_stddbg(ctx), "process_sheet:\n");
663 fz_output_xml(ctx, fz_stddbg(ctx), xml, 0);
664 #endif
665
666 fz_write_printf(ctx, info->out, "<table id=\"%s\">\n", name);
667
668 info->sheet_name = name;
669 info->col_at = 0;
670 info->col_signalled = 0;
671
672 fz_try(ctx)
673 {
674 fz_xml *pos = xml;
675 fz_xml *next;
676
677 while (pos)
678 {
679 /* When we arrive on a node, check if it's a cell. */
680 if (fz_xml_is_tag(pos, "c"))
681 {
682 show_cell(ctx, pos, info);
683 /* Do NOT go down, we've already dealt with that. */
684 }
685 else
686 {
687 /* Try to move down. */
688 next = fz_xml_down(pos);
689 if (next)
690 {
691 /* We can move down, easy! */
692 pos = next;
693 continue;
694 }
695 }
696 /* Try moving to next. */
697 next = fz_xml_next(pos);
698 if (next)
699 {
700 /* We can move to next, easy! */
701 pos = next;
702 continue;
703 }
704
705 /* If we can't go down, or next, pop up until we
706 * find somewhere we can go next from. */
707 while (1)
708 {
709 /* OK. So move up. */
710 pos = fz_xml_up(pos);
711 /* Check for hitting the top. */
712 if (pos == NULL)
713 break;
714
715 /* We've returned to a node. See if it's a 'row'. */
716 if (fz_xml_is_tag(pos, "row"))
717 new_row(ctx, info);
718
719 next = fz_xml_next(pos);
720 if (next)
721 {
722 pos = next;
723 break;
724 }
725 }
726 }
727 if (info->col_signalled)
728 fz_write_printf(ctx, info->out, "</td>\n</tr>\n");
729 fz_write_printf(ctx, info->out, "</table>\n");
730 }
731 fz_always(ctx)
732 fz_drop_xml(ctx, xml);
733 fz_catch(ctx)
734 fz_rethrow(ctx);
735 }
736
737 static void
738 process_slide(fz_context *ctx, fz_archive *arch, const char *file, doc_info *info)
739 {
740 fz_write_printf(ctx, info->out, "<div id=\"slide%d\">\n", info->page++);
741 process_item(ctx, arch, file, info, 0);
742 fz_write_printf(ctx, info->out, "</div>\n");
743 }
744
745 static char *
746 make_absolute_path(fz_context *ctx, const char *abs, const char *rel)
747 {
748 const char *a = abs;
749 const char *aslash = a;
750 int up = 0;
751 size_t z1, z2;
752 char *s;
753
754 if (rel == NULL)
755 return NULL;
756 if (abs == NULL || *rel == '/')
757 return fz_strdup(ctx, rel);
758
759 for (a = abs; *a != 0; a++)
760 if (*a == '/')
761 aslash = a+1;
762
763 while (rel[0] == '.')
764 {
765 if (rel[1] == '/')
766 rel += 2;
767 else if (rel[1] == '.' && rel[2] == '/')
768 rel += 3, up++;
769 else
770 fz_throw(ctx, FZ_ERROR_FORMAT, "Unresolvable path");
771 }
772 if (rel[0] == 0)
773 fz_throw(ctx, FZ_ERROR_FORMAT, "Unresolvable path");
774
775 while (up)
776 {
777 while (aslash != abs && aslash[-1] != '/')
778 aslash--;
779
780 up--;
781 }
782
783 z1 = aslash - abs;
784 z2 = strlen(rel);
785 s = fz_malloc(ctx, z1 + z2 + 1);
786 if (z1)
787 memcpy(s, abs, z1);
788 memcpy(s+z1, rel, z2+1);
789
790 return s;
791 }
792
793 static char *
794 collate_t_content(fz_context *ctx, fz_xml *top)
795 {
796 char *val = NULL;
797 fz_xml *next;
798 fz_xml *pos = fz_xml_down(top);
799
800 while (pos != top)
801 {
802 /* Capture all the 't' content. */
803 if (fz_xml_is_tag(pos, "t"))
804 {
805 /* Remember the content. */
806 char *s = fz_xml_text(fz_xml_down(pos));
807
808 if (s == NULL)
809 {
810 /* Do nothing */
811 }
812 else if (val == NULL)
813 val = fz_strdup(ctx, s);
814 else
815 {
816 char *val2;
817 size_t z1 = strlen(val);
818 size_t z2 = strlen(s) + 1;
819 fz_try(ctx)
820 {
821 val2 = fz_malloc(ctx, z1 + z2);
822 }
823 fz_catch(ctx)
824 {
825 fz_free(ctx, val);
826 fz_rethrow(ctx);
827 }
828 memcpy(val2, val, z1);
829 memcpy(val2 + z1, s, z2);
830 fz_free(ctx, val);
831 val = val2;
832 }
833 /* Do NOT go down, we've already dealt with that. */
834 }
835 else if (fz_xml_is_tag(pos, "rPr") || fz_xml_is_tag(pos, "rPh"))
836 {
837 /* We do not want the 't' content from within these. */
838 }
839 else
840 {
841 /* Try to move down. */
842 next = fz_xml_down(pos);
843 if (next)
844 {
845 /* We can move down, easy! */
846 pos = next;
847 continue;
848 }
849 }
850 /* Try moving to next. */
851 next = fz_xml_next(pos);
852 if (next)
853 {
854 /* We can move to next, easy! */
855 pos = next;
856 continue;
857 }
858
859 /* If we can't go down, or next, pop up until we
860 * find somewhere we can go next from. */
861 while (1)
862 {
863 /* OK. So move up. */
864 pos = fz_xml_up(pos);
865 /* Check for hitting the top. */
866 if (pos == top)
867 break;
868 next = fz_xml_next(pos);
869 if (next)
870 {
871 pos = next;
872 break;
873 }
874 }
875 }
876
877 return val;
878 }
879
880 static fz_xml *
881 try_parse_xml_archive_entry(fz_context *ctx, fz_archive *arch, const char *filename, int preserve_white)
882 {
883 if (!fz_has_archive_entry(ctx, arch, filename))
884 return NULL;
885
886 return fz_parse_xml_archive_entry(ctx, arch, filename, preserve_white);
887 }
888
889 static void
890 load_shared_strings(fz_context *ctx, fz_archive *arch, fz_xml *rels, doc_info *info, const char *file)
891 {
892 fz_xml *pos = fz_xml_find_dfs(rels, "Relationship", "Type", "http://schemas.openxmlformats.org/officeDocument/2006/relationships/sharedStrings");
893 const char *ss_file = fz_xml_att(pos, "Target");
894 char *resolved = NULL;
895 fz_xml *xml = NULL;
896 char *str = NULL;
897
898 if (ss_file == NULL)
899 return;
900
901 fz_var(xml);
902 fz_var(str);
903 fz_var(resolved);
904
905 fz_try(ctx)
906 {
907 resolved = make_absolute_path(ctx, file, ss_file);
908 xml = fz_parse_xml_archive_entry(ctx, arch, resolved, 1);
909
910 pos = fz_xml_find_dfs(xml, "si", NULL, NULL);
911 while (pos)
912 {
913 int n = info->shared_string_len;
914 str = collate_t_content(ctx, pos);
915
916 if (n == info->shared_string_max)
917 {
918 int max = info->shared_string_max;
919 int newmax = max ? max * 2 : 1024;
920 char **arr = fz_realloc(ctx, info->shared_strings, sizeof(*arr) * newmax);
921 memset(&arr[max], 0, sizeof(*arr) * (newmax - max));
922 info->shared_strings = arr;
923 info->shared_string_max = newmax;
924 }
925
926 info->shared_strings[n] = str;
927 str = NULL;
928 info->shared_string_len++;
929 pos = fz_xml_find_next_dfs(pos, "si", NULL, NULL);
930 }
931 }
932 fz_always(ctx)
933 {
934 fz_drop_xml(ctx, xml);
935 fz_free(ctx, resolved);
936 fz_free(ctx, str);
937 }
938 fz_catch(ctx)
939 fz_rethrow(ctx);
940 }
941
942 static void
943 load_footnotes(fz_context *ctx, fz_archive *arch, fz_xml *rels, doc_info *info, const char *file)
944 {
945 char *resolved = NULL;
946 fz_xml *xml = NULL;
947 char *str = NULL;
948
949 fz_var(xml);
950 fz_var(str);
951 fz_var(resolved);
952
953 fz_try(ctx)
954 {
955 fz_xml *pos;
956
957 resolved = make_absolute_path(ctx, file, "footnotes.xml");
958 xml = try_parse_xml_archive_entry(ctx, arch, resolved, 1);
959 if (xml == NULL)
960 break;
961
962 pos = fz_xml_find_dfs(xml, "footnote", NULL, NULL);
963 while (pos)
964 {
965 int n = fz_atoi(fz_xml_att(pos, "w:id"));
966
967 str = collate_t_content(ctx, pos);
968
969 if (str && n >= 0)
970 {
971 if (n >= info->footnotes_max)
972 {
973 int max = info->footnotes_max;
974 int newmax = max ? max * 2 : 1024;
975 char **arr;
976 if (newmax < n)
977 newmax = n+1;
978 arr = fz_realloc(ctx, info->footnotes, sizeof(*arr) * newmax);
979 memset(&arr[max], 0, sizeof(*arr) * (newmax - max));
980 info->footnotes = arr;
981 info->footnotes_max = newmax;
982 }
983
984 info->footnotes[n] = str;
985 str = NULL;
986 }
987 pos = fz_xml_find_next_dfs(pos, "footnote", NULL, NULL);
988 }
989 }
990 fz_always(ctx)
991 {
992 fz_drop_xml(ctx, xml);
993 fz_free(ctx, resolved);
994 fz_free(ctx, str);
995 }
996 fz_catch(ctx)
997 fz_rethrow(ctx);
998 }
999
1000 static void
1001 process_office_document(fz_context *ctx, fz_archive *arch, const char *file, doc_info *info)
1002 {
1003 char *file_rels;
1004 fz_xml *xml = NULL;
1005 fz_xml *rels = NULL;
1006 char *resolved_rel = NULL;
1007
1008 if (file == NULL)
1009 return;
1010
1011 file_rels = make_rel_name(ctx, file);
1012
1013 fz_var(resolved_rel);
1014
1015 fz_var(rels);
1016 fz_var(xml);
1017
1018 fz_try(ctx)
1019 {
1020 fz_xml *pos;
1021
1022 rels = fz_parse_xml_archive_entry(ctx, arch, file_rels, 0);
1023 xml = fz_parse_xml_archive_entry(ctx, arch, file, 1);
1024
1025 /* XLSX */
1026 pos = fz_xml_find_dfs(xml, "sheet", NULL, NULL);
1027 if (pos)
1028 {
1029 load_shared_strings(ctx, arch, rels, info, file);
1030 while (pos)
1031 {
1032 char *name = fz_xml_att(pos, "name");
1033 char *id = fz_xml_att(pos, "r:id");
1034 char *sheet = lookup_rel(ctx, rels, id);
1035
1036 if (sheet)
1037 {
1038 resolved_rel = make_absolute_path(ctx, file, sheet);
1039 process_sheet(ctx, arch, name, resolved_rel, info);
1040 fz_free(ctx, resolved_rel);
1041 resolved_rel = NULL;
1042 }
1043 pos = fz_xml_find_next_dfs(pos, "sheet", NULL, NULL);
1044 }
1045 break;
1046 }
1047
1048 /* Let's try it as a powerpoint */
1049 pos = fz_xml_find_dfs(xml, "sldId", NULL, NULL);
1050 if (pos)
1051 {
1052 while (pos)
1053 {
1054 char *id = fz_xml_att(pos, "r:id");
1055 char *sheet = lookup_rel(ctx, rels, id);
1056
1057 if (sheet)
1058 {
1059 resolved_rel = make_absolute_path(ctx, file, sheet);
1060 process_slide(ctx, arch, resolved_rel, info);
1061 fz_free(ctx, resolved_rel);
1062 resolved_rel = NULL;
1063 }
1064 pos = fz_xml_find_next_dfs(pos, "sldId", NULL, NULL);
1065 }
1066 break;
1067 }
1068
1069 /* Let's try it as word. */
1070 {
1071 load_footnotes(ctx, arch, rels, info, file);
1072 process_doc_stream(ctx, xml, info, 1);
1073 }
1074 }
1075 fz_always(ctx)
1076 {
1077 fz_drop_xml(ctx, xml);
1078 fz_drop_xml(ctx, rels);
1079 fz_free(ctx, resolved_rel);
1080 fz_free(ctx, file_rels);
1081 }
1082 fz_catch(ctx)
1083 fz_rethrow(ctx);
1084 }
1085
1086 static void
1087 process_office_document_properties(fz_context *ctx, fz_archive *arch, const char *file, doc_info *info)
1088 {
1089 fz_xml *xml = NULL;
1090 char *title;
1091
1092 fz_var(xml);
1093
1094 fz_try(ctx)
1095 {
1096 fz_xml *pos;
1097
1098 xml = fz_parse_xml_archive_entry(ctx, arch, file, 1);
1099
1100 pos = fz_xml_find_dfs(xml, "title", NULL, NULL);
1101 title = fz_xml_text(fz_xml_down(pos));
1102 if (title)
1103 {
1104 fz_write_string(ctx, info->out, "<title>");
1105 doc_escape(ctx, info->out, title);
1106 fz_write_string(ctx, info->out, "</title>");
1107 }
1108 }
1109 fz_always(ctx)
1110 {
1111 fz_drop_xml(ctx, xml);
1112 }
1113 fz_catch(ctx)
1114 fz_rethrow(ctx);
1115 }
1116
1117 static fz_buffer *
1118 fz_office_to_html(fz_context *ctx, fz_html_font_set *set, fz_buffer *buffer_in, fz_archive *dir, const char *user_css, fz_office_to_html_opts *opts)
1119 {
1120 fz_stream *stream = NULL;
1121 fz_archive *archive = NULL;
1122 fz_buffer *buffer_out = NULL;
1123 fz_xml *xml = NULL;
1124 fz_xml *pos = NULL;
1125 fz_xml *rels = NULL;
1126 const char *schema = "http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument";
1127 const char *schema_props = "http://schemas.openxmlformats.org/package/2006/relationships/metadata/core-properties";
1128 doc_info info = { 0 };
1129 int i;
1130
1131 fz_var(archive);
1132 fz_var(stream);
1133 fz_var(buffer_out);
1134 fz_var(xml);
1135 fz_var(rels);
1136
1137 if (opts)
1138 info.opts = *opts;
1139
1140 fz_try(ctx)
1141 {
1142 if (buffer_in)
1143 {
1144 stream = fz_open_buffer(ctx, buffer_in);
1145 archive = fz_open_archive_with_stream(ctx, stream);
1146 }
1147 else
1148 archive = fz_keep_archive(ctx, dir);
1149 buffer_out = fz_new_buffer(ctx, 1024);
1150 info.out = fz_new_output_with_buffer(ctx, buffer_out);
1151
1152 /* Is it an HWPX ?*/
1153 xml = try_parse_xml_archive_entry(ctx, archive, "META-INF/container.xml", 0);
1154 if (xml)
1155 {
1156 pos = fz_xml_find_dfs(xml, "rootfile", "media-type", "application/hwpml-package+xml");
1157 if (!pos)
1158 fz_throw(ctx, FZ_ERROR_FORMAT, "Archive not hwpx.");
1159
1160 while (pos)
1161 {
1162 const char *file = fz_xml_att(pos, "full-path");
1163 process_rootfile(ctx, archive, file, &info);
1164 pos = fz_xml_find_next_dfs(pos, "rootfile", "media-type", "application/hwpml-package+xml");
1165 }
1166 fz_close_output(ctx, info.out);
1167 break;
1168 }
1169
1170 /* Try other types */
1171 {
1172 xml = try_parse_xml_archive_entry(ctx, archive, "_rels/.rels", 0);
1173
1174 fz_write_string(ctx, info.out, "<html>\n");
1175
1176 pos = fz_xml_find_dfs(xml, "Relationship", "Type", schema_props);
1177 if (pos)
1178 {
1179 const char *file = fz_xml_att(pos, "Target");
1180 fz_write_string(ctx, info.out, "<head>\n");
1181 process_office_document_properties(ctx, archive, file, &info);
1182 fz_write_string(ctx, info.out, "</head>\n");
1183 }
1184
1185 fz_write_string(ctx, info.out, "<body>\n");
1186 pos = fz_xml_find_dfs(xml, "Relationship", "Type", schema);
1187 if (!pos)
1188 fz_throw(ctx, FZ_ERROR_FORMAT, "Archive not docx.");
1189
1190 while (pos)
1191 {
1192 const char *file = fz_xml_att(pos, "Target");
1193 if (file)
1194 process_office_document(ctx, archive, file, &info);
1195 pos = fz_xml_find_next_dfs(pos, "Relationship", "Type", schema);
1196 }
1197 }
1198
1199 fz_close_output(ctx, info.out);
1200 }
1201 fz_always(ctx)
1202 {
1203 fz_drop_xml(ctx, rels);
1204 fz_drop_xml(ctx, xml);
1205 for (i = 0; i < info.shared_string_len; ++i)
1206 fz_free(ctx, info.shared_strings[i]);
1207 fz_free(ctx, info.shared_strings);
1208 for (i = 0; i < info.footnotes_max; ++i)
1209 fz_free(ctx, info.footnotes[i]);
1210 fz_free(ctx, info.footnotes);
1211 fz_drop_output(ctx, info.out);
1212 fz_drop_archive(ctx, archive);
1213 fz_drop_stream(ctx, stream);
1214 }
1215 fz_catch(ctx)
1216 {
1217 fz_drop_buffer(ctx, buffer_out);
1218 fz_rethrow(ctx);
1219 }
1220
1221 #ifdef DEBUG_OFFICE_TO_HTML
1222 {
1223 unsigned char *storage;
1224 size_t len = fz_buffer_storage(ctx, buffer_out, &storage);
1225 fz_write_printf(ctx, fz_stddbg(ctx), "fz_office_to_html: Output buffer, len=%zd:\n", len);
1226 fz_write_buffer(ctx, fz_stddbg(ctx), buffer_out);
1227 }
1228 #endif
1229
1230 return buffer_out;
1231 }
1232
1233 /* Office document handler */
1234
1235 static fz_buffer *
1236 office_to_html(fz_context *ctx, fz_html_font_set *set, fz_buffer *buf, fz_archive *zip, const char *user_css)
1237 {
1238 fz_office_to_html_opts opts = { 0 };
1239
1240 return fz_office_to_html(ctx, set, buf, zip, user_css, &opts);
1241 }
1242
1243 static const fz_htdoc_format_t fz_htdoc_office =
1244 {
1245 "Office document",
1246 office_to_html,
1247 0, 1, 0
1248 };
1249
1250 static fz_document *
1251 office_open_document(fz_context *ctx, const fz_document_handler *handler, fz_stream *file, fz_stream *accel, fz_archive *zip, void *state)
1252 {
1253 return fz_htdoc_open_document_with_stream_and_dir(ctx, file, zip, &fz_htdoc_office);
1254 }
1255
/* File extensions handled by this handler; the list ends with NULL. */
static const char *office_extensions[] = {
	"docx",
	"xlsx",
	"pptx",
	"hwpx",
	NULL
};
1264
/* MIME types handled by this handler; the list ends with NULL. */
static const char *office_mimetypes[] = {
	/* DOCX */
	"application/vnd.openxmlformats-officedocument.wordprocessingml.document",
	/* XLSX */
	"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
	/* PPTX */
	"application/vnd.openxmlformats-officedocument.presentationml.presentation",
	/* HWPX */
	"application/haansofthwpx",
	"application/vnd.hancom.hwpx",
	NULL
};
1278
1279 /* We are only ever 75% sure here, to allow a 'better' handler, such as sodochandler
1280 * to override us by returning 100. */
1281 static int
1282 office_recognize_doc_content(fz_context *ctx, const fz_document_handler *handler, fz_stream *stream, fz_archive *zip, void **state, fz_document_recognize_state_free_fn **free_state)
1283 {
1284 fz_archive *arch = NULL;
1285 int ret = 0;
1286 fz_xml *xml = NULL;
1287
1288 if (state)
1289 *state = NULL;
1290 if (free_state)
1291 *free_state = NULL;
1292
1293 fz_var(arch);
1294 fz_var(ret);
1295 fz_var(xml);
1296
1297 fz_try(ctx)
1298 {
1299 if (stream)
1300 {
1301 arch = fz_try_open_archive_with_stream(ctx, stream);
1302 if (arch == NULL)
1303 break;
1304 }
1305 else
1306 arch = fz_keep_archive(ctx, zip);
1307
1308 xml = fz_try_parse_xml_archive_entry(ctx, arch, "META-INF/container.xml", 0);
1309 if (xml)
1310 {
1311 if (fz_xml_find_dfs(xml, "rootfile", "media-type", "application/hwpml-package+xml"))
1312 ret = 75; /* HWPX */
1313 break;
1314 }
1315 xml = fz_try_parse_xml_archive_entry(ctx, arch, "_rels/.rels", 0);
1316 if (xml)
1317 {
1318 if (fz_xml_find_dfs(xml, "Relationship", "Type", "http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument"))
1319 {
1320 ret = 75; /* DOCX | PPTX | XLSX */
1321 }
1322 break;
1323 }
1324 }
1325 fz_always(ctx)
1326 {
1327 fz_drop_xml(ctx, xml);
1328 fz_drop_archive(ctx, arch);
1329 }
1330 fz_catch(ctx)
1331 fz_rethrow(ctx);
1332
1333 return ret;
1334 }
1335
1336 fz_document_handler office_document_handler =
1337 {
1338 NULL,
1339 office_open_document,
1340 office_extensions,
1341 office_mimetypes,
1342 office_recognize_doc_content
1343 };