Mercurial > hgrepos > Python2 > PyMuPDF
comparison mupdf-source/source/html/office.c @ 2:b50eed0cc0ef upstream
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
The directory name has changed: no version number in the expanded directory now.
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Mon, 15 Sep 2025 11:43:07 +0200 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| 1:1d09e1dec1d9 | 2:b50eed0cc0ef |
|---|---|
| 1 // Copyright (C) 2023-2025 Artifex Software, Inc. | |
| 2 // | |
| 3 // This file is part of MuPDF. | |
| 4 // | |
| 5 // MuPDF is free software: you can redistribute it and/or modify it under the | |
| 6 // terms of the GNU Affero General Public License as published by the Free | |
| 7 // Software Foundation, either version 3 of the License, or (at your option) | |
| 8 // any later version. | |
| 9 // | |
| 10 // MuPDF is distributed in the hope that it will be useful, but WITHOUT ANY | |
| 11 // WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS | |
| 12 // FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more | |
| 13 // details. | |
| 14 // | |
| 15 // You should have received a copy of the GNU Affero General Public License | |
| 16 // along with MuPDF. If not, see <https://www.gnu.org/licenses/agpl-3.0.en.html> | |
| 17 // | |
| 18 // Alternative licensing terms are available from the licensor. | |
| 19 // For commercial licensing, see <https://www.artifex.com/> or contact | |
| 20 // Artifex Software, Inc., 39 Mesa Street, Suite 108A, San Francisco, | |
| 21 // CA 94129, USA, for further information. | |
| 22 | |
| 23 #include "mupdf/fitz.h" | |
| 24 #include "html-imp.h" | |
| 25 | |
| 26 #undef DEBUG_OFFICE_TO_HTML | |
| 27 | |
/*
 * Output options for the office-to-HTML conversion.
 *
 * Every field is an int flag; an all-zero structure is the default
 * configuration.
 *
 * FIXME: Very subject to change. Possibly might be removed entirely.
 */
typedef struct
{
	int output_page_numbers;
	int output_sheet_names;
	int output_cell_markers;
	int output_cell_row_markers;
	int output_cell_names;
	int output_formatting;
	int output_filenames;
	int output_errors;
} fz_office_to_html_opts;
| 41 | |
| 42 typedef struct | |
| 43 { | |
| 44 fz_office_to_html_opts opts; | |
| 45 | |
| 46 fz_output *out; | |
| 47 | |
| 48 int page; | |
| 49 | |
| 50 /* State for if we are parsing a sheet. */ | |
| 51 /* The last column label we have to send. */ | |
| 52 char *label; | |
| 53 /* Columns are numbered from 1. */ | |
| 54 /* The column we are at. */ | |
| 55 int col_at; | |
| 56 /* The column we last signalled. If this is 0, then we haven't | |
| 57 * even started a row yet. */ | |
| 58 int col_signalled; | |
| 59 | |
| 60 /* If we are currently processing a spreadsheet, store the current | |
| 61 * sheets name here. */ | |
| 62 const char *sheet_name; | |
| 63 | |
| 64 int shared_string_max; | |
| 65 int shared_string_len; | |
| 66 char **shared_strings; | |
| 67 | |
| 68 int footnotes_max; | |
| 69 char **footnotes; | |
| 70 | |
| 71 char *title; | |
| 72 } doc_info; | |
| 73 | |
| 74 static void | |
| 75 doc_escape(fz_context *ctx, fz_output *output, const char *str_) | |
| 76 { | |
| 77 const unsigned char *str = (const unsigned char *)str_; | |
| 78 int c; | |
| 79 | |
| 80 if (!str) | |
| 81 return; | |
| 82 | |
| 83 while ((c = *str++) != 0) | |
| 84 { | |
| 85 if (c == '&') | |
| 86 { | |
| 87 fz_write_string(ctx, output, "&"); | |
| 88 } | |
| 89 else if (c == '<') | |
| 90 { | |
| 91 fz_write_string(ctx, output, "<"); | |
| 92 } | |
| 93 else if (c == '>') | |
| 94 { | |
| 95 fz_write_string(ctx, output, ">"); | |
| 96 } | |
| 97 else | |
| 98 { | |
| 99 /* We get utf-8 in, just parrot it out again. */ | |
| 100 fz_write_byte(ctx, output, c); | |
| 101 } | |
| 102 } | |
| 103 } | |
| 104 | |
| 105 static void | |
| 106 show_text(fz_context *ctx, fz_xml *top, doc_info *info) | |
| 107 { | |
| 108 fz_xml *pos = top; | |
| 109 fz_xml *next; | |
| 110 | |
| 111 while (pos) | |
| 112 { | |
| 113 doc_escape(ctx, info->out, fz_xml_text(pos)); | |
| 114 | |
| 115 if (fz_xml_is_tag(pos, "lineBreak")) | |
| 116 { | |
| 117 fz_write_string(ctx, info->out, "\n"); | |
| 118 } | |
| 119 else if (fz_xml_is_tag(pos, "tab")) | |
| 120 { | |
| 121 fz_write_string(ctx, info->out, "\t"); | |
| 122 } | |
| 123 else if (fz_xml_is_tag(pos, "lastRenderedPageBreak")) | |
| 124 { | |
| 125 info->page++; | |
| 126 } | |
| 127 | |
| 128 /* Always try to move down. */ | |
| 129 next = fz_xml_down(pos); | |
| 130 if (next) | |
| 131 { | |
| 132 /* We can move down, easy! */ | |
| 133 pos = next; | |
| 134 continue; | |
| 135 } | |
| 136 | |
| 137 if (pos == top) | |
| 138 break; | |
| 139 | |
| 140 /* We can't move down, try moving to next. */ | |
| 141 next = fz_xml_next(pos); | |
| 142 if (next) | |
| 143 { | |
| 144 /* We can move to next, easy! */ | |
| 145 pos = next; | |
| 146 continue; | |
| 147 } | |
| 148 | |
| 149 /* If we can't go down, or next, pop up until we | |
| 150 * find somewhere we can go next from. */ | |
| 151 while (1) | |
| 152 { | |
| 153 /* OK. So move up. */ | |
| 154 pos = fz_xml_up(pos); | |
| 155 /* Check for hitting the top. */ | |
| 156 if (pos == top) | |
| 157 pos = NULL; | |
| 158 if (pos == NULL) | |
| 159 break; | |
| 160 /* We've returned to a node. See if it's a 'p'. */ | |
| 161 if (fz_xml_is_tag(pos, "p")) | |
| 162 { | |
| 163 fz_write_string(ctx, info->out, "\n"); | |
| 164 } | |
| 165 next = fz_xml_next(pos); | |
| 166 if (next) | |
| 167 { | |
| 168 pos = next; | |
| 169 break; | |
| 170 } | |
| 171 } | |
| 172 } | |
| 173 } | |
| 174 | |
| 175 static void | |
| 176 show_footnote(fz_context *ctx, fz_xml *v, doc_info *info) | |
| 177 { | |
| 178 int n = fz_atoi(fz_xml_att(v, "w:id")); | |
| 179 | |
| 180 if (n < 0 || n >= info->footnotes_max) | |
| 181 return; | |
| 182 | |
| 183 if (info->footnotes[n] == NULL || | |
| 184 info->footnotes[n][0] == 0) | |
| 185 return; | |
| 186 | |
| 187 /* Then send the strings. */ | |
| 188 doc_escape(ctx, info->out, info->footnotes[n]); | |
| 189 } | |
| 190 | |
| 191 static void | |
| 192 process_doc_stream(fz_context *ctx, fz_xml *xml, doc_info *info, int do_pages) | |
| 193 { | |
| 194 fz_xml *pos; | |
| 195 fz_xml *next; | |
| 196 const char *paragraph_style = NULL; | |
| 197 const char *inline_style = NULL; | |
| 198 | |
| 199 #ifdef DEBUG_OFFICE_TO_HTML | |
| 200 fz_write_printf(ctx, fz_stddbg(ctx), "process_doc_stream:\n"); | |
| 201 fz_output_xml(ctx, fz_stddbg(ctx), xml, 0); | |
| 202 #endif | |
| 203 | |
| 204 /* First off, see if we can do page numbers. */ | |
| 205 if (do_pages) | |
| 206 { | |
| 207 pos = fz_xml_find_dfs(xml, "lastRenderedPageBreak", NULL, NULL); | |
| 208 if (pos) | |
| 209 { | |
| 210 /* We *can* do page numbers, so start here. */ | |
| 211 fz_write_string(ctx, info->out, "<div id=\"page1\">\n"); | |
| 212 info->page = 1; | |
| 213 } | |
| 214 } | |
| 215 | |
| 216 /* Now walk the tree for real. */ | |
| 217 pos = xml; | |
| 218 while (pos) | |
| 219 { | |
| 220 /* When we arrive on a node, check if it's a 't'. */ | |
| 221 if (fz_xml_is_tag(pos, "t")) | |
| 222 { | |
| 223 show_text(ctx, pos, info); | |
| 224 /* Do NOT go down, we've already dealt with that. */ | |
| 225 } | |
| 226 else if (fz_xml_is_tag(pos, "br")) | |
| 227 { | |
| 228 if (paragraph_style && strcmp(paragraph_style, "pre")) | |
| 229 { | |
| 230 fz_write_printf(ctx, info->out, "<br/>\n"); | |
| 231 } | |
| 232 else | |
| 233 { | |
| 234 fz_write_printf(ctx, info->out, "\n"); | |
| 235 } | |
| 236 } | |
| 237 else if (fz_xml_is_tag(pos, "footnoteReference")) | |
| 238 { | |
| 239 show_footnote(ctx, pos, info); | |
| 240 /* Do NOT go down, we've already dealt with that. */ | |
| 241 } | |
| 242 else if (fz_xml_is_tag(pos, "tabs")) | |
| 243 { | |
| 244 /* Don't walk through tabs, or we will hit lots of 'tab' entries and | |
| 245 * output incorrect information. */ | |
| 246 } | |
| 247 else if (fz_xml_is_tag(pos, "pStyle")) | |
| 248 { | |
| 249 /* Should prob fix fz_xml_*() to strip namespace prefix | |
| 250 from attributes, to match what it does for tag names. | |
| 251 */ | |
| 252 paragraph_style = fz_xml_att(pos, "w:val"); | |
| 253 if (paragraph_style) | |
| 254 { | |
| 255 if (!strcmp(paragraph_style, "BodyText")) | |
| 256 paragraph_style = NULL; | |
| 257 else if (!strcmp(paragraph_style, "Heading1")) | |
| 258 paragraph_style = "h1"; | |
| 259 else if (!strcmp(paragraph_style, "Heading2")) | |
| 260 paragraph_style = "h2"; | |
| 261 else if (!strcmp(paragraph_style, "Heading3")) | |
| 262 paragraph_style = "h3"; | |
| 263 else if (!strcmp(paragraph_style, "Heading4")) | |
| 264 paragraph_style = "h4"; | |
| 265 else if (!strcmp(paragraph_style, "Heading5")) | |
| 266 paragraph_style = "h5"; | |
| 267 else if (!strcmp(paragraph_style, "Heading6")) | |
| 268 paragraph_style = "h6"; | |
| 269 else if (!strcmp(paragraph_style, "SourceCode")) | |
| 270 paragraph_style = "pre"; | |
| 271 else | |
| 272 paragraph_style = NULL; | |
| 273 | |
| 274 if (paragraph_style) | |
| 275 fz_write_printf(ctx, info->out, "<%s>", paragraph_style); | |
| 276 } | |
| 277 } | |
| 278 else if (fz_xml_is_tag(pos, "rStyle")) | |
| 279 { | |
| 280 inline_style = fz_xml_att(pos, "w:val"); | |
| 281 if (inline_style) | |
| 282 { | |
| 283 if (!strcmp(inline_style, "VerbatimChar")) | |
| 284 inline_style = "tt"; | |
| 285 else | |
| 286 { | |
| 287 if (0) | |
| 288 fz_write_printf(ctx, info->out, "<!-- %s -->", inline_style); | |
| 289 inline_style = NULL; | |
| 290 } | |
| 291 if (inline_style) | |
| 292 fz_write_printf(ctx, info->out, "<%s>", inline_style); | |
| 293 } | |
| 294 } | |
| 295 else | |
| 296 { | |
| 297 fz_xml *down; | |
| 298 if (fz_xml_is_tag(pos, "lineBreak")) | |
| 299 { | |
| 300 fz_write_string(ctx, info->out, "\n"); | |
| 301 } | |
| 302 else if (fz_xml_is_tag(pos, "p")) | |
| 303 { | |
| 304 fz_write_string(ctx, info->out, "<p>"); | |
| 305 } | |
| 306 else if (fz_xml_is_tag(pos, "tab")) | |
| 307 { | |
| 308 fz_write_string(ctx, info->out, "\t"); | |
| 309 } | |
| 310 else if (do_pages && fz_xml_is_tag(pos, "lastRenderedPageBreak")) | |
| 311 { | |
| 312 if (info->page) | |
| 313 fz_write_string(ctx, info->out, "\n</div>\n"); | |
| 314 info->page++; | |
| 315 fz_write_printf(ctx, info->out, "<div id=\"page%d\">\n", info->page); | |
| 316 } | |
| 317 /* Try to move down. */ | |
| 318 down = fz_xml_down(pos); | |
| 319 if (down) | |
| 320 { | |
| 321 /* We can move down, easy! */ | |
| 322 pos = down; | |
| 323 continue; | |
| 324 } | |
| 325 } | |
| 326 /* Try moving to next. */ | |
| 327 next = fz_xml_next(pos); | |
| 328 if (next) | |
| 329 { | |
| 330 /* We can move to next, easy! */ | |
| 331 pos = next; | |
| 332 continue; | |
| 333 } | |
| 334 | |
| 335 /* If we can't go down, or next, pop up until we | |
| 336 * find somewhere we can go next from. */ | |
| 337 while (1) | |
| 338 { | |
| 339 /* OK. So move up. */ | |
| 340 pos = fz_xml_up(pos); | |
| 341 /* Check for hitting the top. */ | |
| 342 if (pos == NULL) | |
| 343 break; | |
| 344 /* We've returned to a node. See if it's a 'p'. */ | |
| 345 if (fz_xml_is_tag(pos, "p")) | |
| 346 { | |
| 347 if (paragraph_style) | |
| 348 { | |
| 349 fz_write_printf(ctx, info->out, "</%s>", paragraph_style); | |
| 350 paragraph_style = NULL; | |
| 351 } | |
| 352 fz_write_string(ctx, info->out, "</p>\n"); | |
| 353 } | |
| 354 else if (fz_xml_is_tag(pos, "r")) | |
| 355 { | |
| 356 /* Seems to be pseudo-close for rStyle. */ | |
| 357 if (inline_style) | |
| 358 { | |
| 359 fz_write_printf(ctx, info->out, "</%s>", inline_style); | |
| 360 inline_style = NULL; | |
| 361 } | |
| 362 } | |
| 363 next = fz_xml_next(pos); | |
| 364 if (next) | |
| 365 { | |
| 366 pos = next; | |
| 367 break; | |
| 368 } | |
| 369 } | |
| 370 } | |
| 371 | |
| 372 if (do_pages && info->page) | |
| 373 fz_write_string(ctx, info->out, "\n</div>\n"); | |
| 374 } | |
| 375 | |
| 376 static void | |
| 377 process_item(fz_context *ctx, fz_archive *arch, const char *file, doc_info *info, int do_pages) | |
| 378 { | |
| 379 fz_xml *xml = fz_parse_xml_archive_entry(ctx, arch, file, 1); | |
| 380 | |
| 381 fz_try(ctx) | |
| 382 process_doc_stream(ctx, xml, info, do_pages); | |
| 383 fz_always(ctx) | |
| 384 fz_drop_xml(ctx, xml); | |
| 385 fz_catch(ctx) | |
| 386 fz_rethrow(ctx); | |
| 387 } | |
| 388 | |
| 389 static void | |
| 390 process_rootfile(fz_context *ctx, fz_archive *arch, const char *file, doc_info *info) | |
| 391 { | |
| 392 fz_xml *xml = fz_parse_xml_archive_entry(ctx, arch, file, 0); | |
| 393 | |
| 394 fz_try(ctx) | |
| 395 { | |
| 396 /* FIXME: Should really search for these just inside 'spine'. */ | |
| 397 fz_xml *pos = fz_xml_find_dfs(xml, "itemref", NULL, NULL); | |
| 398 while (pos) | |
| 399 { | |
| 400 char *idref = fz_xml_att(pos, "idref"); | |
| 401 fz_xml *item = fz_xml_find_dfs(xml, "item", "id", idref); | |
| 402 while (item) | |
| 403 { | |
| 404 char *type = fz_xml_att(item, "media-type"); | |
| 405 char *href = fz_xml_att(item, "href"); | |
| 406 if (type && href && !strcmp(type, "application/xml")) | |
| 407 { | |
| 408 process_item(ctx, arch, href, info, 1); | |
| 409 } | |
| 410 item = fz_xml_find_next_dfs(pos, "item", "id", idref); | |
| 411 } | |
| 412 pos = fz_xml_find_next_dfs(pos, "itemref", NULL, NULL); | |
| 413 } | |
| 414 } | |
| 415 fz_always(ctx) | |
| 416 fz_drop_xml(ctx, xml); | |
| 417 fz_catch(ctx) | |
| 418 fz_rethrow(ctx); | |
| 419 } | |
| 420 | |
| 421 /* XLSX support */ | |
| 422 static char * | |
| 423 make_rel_name(fz_context *ctx, const char *file) | |
| 424 { | |
| 425 size_t z = strlen(file); | |
| 426 char *s = fz_malloc(ctx, z + 12); | |
| 427 char *t; | |
| 428 const char *p; | |
| 429 const char *slash = file; | |
| 430 | |
| 431 for (p = file; *p != 0; p++) | |
| 432 if (*p == '/') | |
| 433 slash = p+1; | |
| 434 | |
| 435 t = s; | |
| 436 if (slash != file) | |
| 437 { | |
| 438 memcpy(t, file, slash - file); | |
| 439 t += slash - file; | |
| 440 } | |
| 441 memcpy(t, "_rels/", 6); | |
| 442 t += 6; | |
| 443 memcpy(t, file + (slash - file), z - (slash - file)); | |
| 444 t += z - (slash - file); | |
| 445 memcpy(t, ".rels", 6); | |
| 446 | |
| 447 return s; | |
| 448 } | |
| 449 | |
| 450 static char *lookup_rel(fz_context *ctx, fz_xml *rels, const char *id) | |
| 451 { | |
| 452 fz_xml *pos; | |
| 453 | |
| 454 if (id == NULL) | |
| 455 return NULL; | |
| 456 | |
| 457 pos = fz_xml_find_dfs(rels, "Relationship", NULL, NULL); | |
| 458 while (pos) | |
| 459 { | |
| 460 char *id2 = fz_xml_att(pos, "Id"); | |
| 461 | |
| 462 if (id2 && !strcmp(id, id2)) | |
| 463 return fz_xml_att(pos, "Target"); | |
| 464 | |
| 465 pos = fz_xml_find_next_dfs(pos, "Relationship", NULL, NULL); | |
| 466 } | |
| 467 | |
| 468 return NULL; | |
| 469 } | |
| 470 | |
| 471 static void | |
| 472 send_cell_formatting(fz_context *ctx, doc_info *info) | |
| 473 { | |
| 474 if (info->col_signalled == 0) | |
| 475 { | |
| 476 fz_write_string(ctx, info->out, "<tr>\n"); | |
| 477 info->col_signalled = 1; | |
| 478 if (info->col_at > 1) | |
| 479 fz_write_string(ctx, info->out, "<td>"); | |
| 480 } | |
| 481 | |
| 482 /* Send the label */ | |
| 483 while (info->col_signalled < info->col_at) | |
| 484 { | |
| 485 fz_write_string(ctx, info->out, "</td>"); | |
| 486 info->col_signalled++; | |
| 487 if (info->col_signalled < info->col_at) | |
| 488 fz_write_string(ctx, info->out, "<td>"); | |
| 489 } | |
| 490 if (info->sheet_name && info->sheet_name[0]) | |
| 491 fz_write_printf(ctx, info->out, "<td id=\"%s!%s\">", info->sheet_name, info->label); | |
| 492 else | |
| 493 fz_write_printf(ctx, info->out, "<td id=\"%s\">", info->label); | |
| 494 } | |
| 495 | |
| 496 static void | |
| 497 show_shared_string(fz_context *ctx, fz_xml *v, doc_info *info) | |
| 498 { | |
| 499 const char *t = fz_xml_text(fz_xml_down(v)); | |
| 500 int n = fz_atoi(t); | |
| 501 | |
| 502 if (n < 0 || n >= info->shared_string_len) | |
| 503 return; | |
| 504 | |
| 505 if (info->shared_strings[n] == NULL || | |
| 506 info->shared_strings[n][0] == 0) | |
| 507 return; | |
| 508 | |
| 509 send_cell_formatting(ctx, info); | |
| 510 /* Then send the strings. */ | |
| 511 doc_escape(ctx, info->out, info->shared_strings[n]); | |
| 512 } | |
| 513 | |
/*
 * Convert the column letters at the start of a cell label (e.g. the
 * "AB" of "AB12") into a column number, counting from 1.
 *
 * The letters form a bijective base-26 number: A..Z are 1..26,
 * AA..ZZ are 27..702, AAA is 703, and so on.
 *
 * Returns 0 if 'label' is NULL, does not start with an upper case
 * letter, or has so many letters that the result would not fit in
 * an int.
 */
static int
col_from_label(const char *label)
{
	/* Largest 'col' for which 26 * col + 26 still fits in an int. */
	const int limit = ((int)(~0U >> 1) - 26) / 26;
	int col = 0;

	if (label == NULL)
		return 0;

	while (*label >= 'A' && *label <= 'Z')
	{
		if (col > limit)
			return 0;
		col = 26 * col + (*label++ - 'A' + 1);
	}

	return col;
}
| 543 | |
| 544 static void | |
| 545 show_cell_text(fz_context *ctx, fz_xml *top, doc_info *info) | |
| 546 { | |
| 547 fz_xml *pos = top; | |
| 548 fz_xml *next; | |
| 549 | |
| 550 while (pos) | |
| 551 { | |
| 552 char *text = fz_xml_text(pos); | |
| 553 | |
| 554 if (text) | |
| 555 { | |
| 556 send_cell_formatting(ctx, info); | |
| 557 doc_escape(ctx, info->out, text); | |
| 558 } | |
| 559 | |
| 560 /* Always try to move down. */ | |
| 561 next = fz_xml_down(pos); | |
| 562 if (next) | |
| 563 { | |
| 564 /* We can move down, easy! */ | |
| 565 pos = next; | |
| 566 continue; | |
| 567 } | |
| 568 | |
| 569 if (pos == top) | |
| 570 break; | |
| 571 | |
| 572 /* We can't move down, try moving to next. */ | |
| 573 next = fz_xml_next(pos); | |
| 574 if (next) | |
| 575 { | |
| 576 /* We can move to next, easy! */ | |
| 577 pos = next; | |
| 578 continue; | |
| 579 } | |
| 580 | |
| 581 /* If we can't go down, or next, pop up until we | |
| 582 * find somewhere we can go next from. */ | |
| 583 while (1) | |
| 584 { | |
| 585 /* OK. So move up. */ | |
| 586 pos = fz_xml_up(pos); | |
| 587 /* Check for hitting the top. */ | |
| 588 if (pos == top) | |
| 589 pos = NULL; | |
| 590 if (pos == NULL) | |
| 591 break; | |
| 592 next = fz_xml_next(pos); | |
| 593 if (next) | |
| 594 { | |
| 595 pos = next; | |
| 596 break; | |
| 597 } | |
| 598 } | |
| 599 } | |
| 600 } | |
| 601 | |
| 602 static void | |
| 603 arrived_at_cell(fz_context *ctx, doc_info *info, const char *label) | |
| 604 { | |
| 605 int col; | |
| 606 | |
| 607 /* If we have a label queued, and no label is given here, then we're | |
| 608 * processing a 'cell' callback after having had a 'cellname' | |
| 609 * callback. So don't signal it twice! */ | |
| 610 if (label == NULL && info->label) | |
| 611 return; | |
| 612 | |
| 613 col = label ? col_from_label(label) : 0; | |
| 614 | |
| 615 fz_free(ctx, info->label); | |
| 616 info->label = NULL; | |
| 617 info->label = label ? fz_strdup(ctx, label) : NULL; | |
| 618 info->col_at = col; | |
| 619 } | |
| 620 | |
| 621 static void | |
| 622 show_cell(fz_context *ctx, fz_xml *cell, doc_info *info) | |
| 623 { | |
| 624 char *t = fz_xml_att(cell, "t"); | |
| 625 fz_xml *v = fz_xml_find_down(cell, "v"); | |
| 626 const char *r = fz_xml_att(cell, "r"); | |
| 627 | |
| 628 arrived_at_cell(ctx, info, r); | |
| 629 if (t && t[0] == 's' && t[1] == 0) | |
| 630 show_shared_string(ctx, v, info); | |
| 631 else | |
| 632 show_cell_text(ctx, v, info); | |
| 633 } | |
| 634 | |
| 635 static void | |
| 636 new_row(fz_context *ctx, doc_info *info) | |
| 637 { | |
| 638 if (info->col_signalled) | |
| 639 { | |
| 640 /* We've sent at least one cell. So need to close the | |
| 641 * td and tr */ | |
| 642 fz_write_string(ctx, info->out, "</td>\n</tr>\n"); | |
| 643 } | |
| 644 else | |
| 645 { | |
| 646 /* We've not sent anything for this row. Keep the counts | |
| 647 * correct. */ | |
| 648 fz_write_string(ctx, info->out, "<tr></tr>\n"); | |
| 649 } | |
| 650 info->col_at = 1; | |
| 651 info->col_signalled = 0; | |
| 652 fz_free(ctx, info->label); | |
| 653 info->label = NULL; | |
| 654 } | |
| 655 | |
| 656 static void | |
| 657 process_sheet(fz_context *ctx, fz_archive *arch, const char *name, const char *file, doc_info *info) | |
| 658 { | |
| 659 fz_xml *xml = fz_parse_xml_archive_entry(ctx, arch, file, 1); | |
| 660 | |
| 661 #ifdef DEBUG_OFFICE_TO_HTML | |
| 662 fz_write_printf(ctx, fz_stddbg(ctx), "process_sheet:\n"); | |
| 663 fz_output_xml(ctx, fz_stddbg(ctx), xml, 0); | |
| 664 #endif | |
| 665 | |
| 666 fz_write_printf(ctx, info->out, "<table id=\"%s\">\n", name); | |
| 667 | |
| 668 info->sheet_name = name; | |
| 669 info->col_at = 0; | |
| 670 info->col_signalled = 0; | |
| 671 | |
| 672 fz_try(ctx) | |
| 673 { | |
| 674 fz_xml *pos = xml; | |
| 675 fz_xml *next; | |
| 676 | |
| 677 while (pos) | |
| 678 { | |
| 679 /* When we arrive on a node, check if it's a cell. */ | |
| 680 if (fz_xml_is_tag(pos, "c")) | |
| 681 { | |
| 682 show_cell(ctx, pos, info); | |
| 683 /* Do NOT go down, we've already dealt with that. */ | |
| 684 } | |
| 685 else | |
| 686 { | |
| 687 /* Try to move down. */ | |
| 688 next = fz_xml_down(pos); | |
| 689 if (next) | |
| 690 { | |
| 691 /* We can move down, easy! */ | |
| 692 pos = next; | |
| 693 continue; | |
| 694 } | |
| 695 } | |
| 696 /* Try moving to next. */ | |
| 697 next = fz_xml_next(pos); | |
| 698 if (next) | |
| 699 { | |
| 700 /* We can move to next, easy! */ | |
| 701 pos = next; | |
| 702 continue; | |
| 703 } | |
| 704 | |
| 705 /* If we can't go down, or next, pop up until we | |
| 706 * find somewhere we can go next from. */ | |
| 707 while (1) | |
| 708 { | |
| 709 /* OK. So move up. */ | |
| 710 pos = fz_xml_up(pos); | |
| 711 /* Check for hitting the top. */ | |
| 712 if (pos == NULL) | |
| 713 break; | |
| 714 | |
| 715 /* We've returned to a node. See if it's a 'row'. */ | |
| 716 if (fz_xml_is_tag(pos, "row")) | |
| 717 new_row(ctx, info); | |
| 718 | |
| 719 next = fz_xml_next(pos); | |
| 720 if (next) | |
| 721 { | |
| 722 pos = next; | |
| 723 break; | |
| 724 } | |
| 725 } | |
| 726 } | |
| 727 if (info->col_signalled) | |
| 728 fz_write_printf(ctx, info->out, "</td>\n</tr>\n"); | |
| 729 fz_write_printf(ctx, info->out, "</table>\n"); | |
| 730 } | |
| 731 fz_always(ctx) | |
| 732 fz_drop_xml(ctx, xml); | |
| 733 fz_catch(ctx) | |
| 734 fz_rethrow(ctx); | |
| 735 } | |
| 736 | |
| 737 static void | |
| 738 process_slide(fz_context *ctx, fz_archive *arch, const char *file, doc_info *info) | |
| 739 { | |
| 740 fz_write_printf(ctx, info->out, "<div id=\"slide%d\">\n", info->page++); | |
| 741 process_item(ctx, arch, file, info, 0); | |
| 742 fz_write_printf(ctx, info->out, "</div>\n"); | |
| 743 } | |
| 744 | |
| 745 static char * | |
| 746 make_absolute_path(fz_context *ctx, const char *abs, const char *rel) | |
| 747 { | |
| 748 const char *a = abs; | |
| 749 const char *aslash = a; | |
| 750 int up = 0; | |
| 751 size_t z1, z2; | |
| 752 char *s; | |
| 753 | |
| 754 if (rel == NULL) | |
| 755 return NULL; | |
| 756 if (abs == NULL || *rel == '/') | |
| 757 return fz_strdup(ctx, rel); | |
| 758 | |
| 759 for (a = abs; *a != 0; a++) | |
| 760 if (*a == '/') | |
| 761 aslash = a+1; | |
| 762 | |
| 763 while (rel[0] == '.') | |
| 764 { | |
| 765 if (rel[1] == '/') | |
| 766 rel += 2; | |
| 767 else if (rel[1] == '.' && rel[2] == '/') | |
| 768 rel += 3, up++; | |
| 769 else | |
| 770 fz_throw(ctx, FZ_ERROR_FORMAT, "Unresolvable path"); | |
| 771 } | |
| 772 if (rel[0] == 0) | |
| 773 fz_throw(ctx, FZ_ERROR_FORMAT, "Unresolvable path"); | |
| 774 | |
| 775 while (up) | |
| 776 { | |
| 777 while (aslash != abs && aslash[-1] != '/') | |
| 778 aslash--; | |
| 779 | |
| 780 up--; | |
| 781 } | |
| 782 | |
| 783 z1 = aslash - abs; | |
| 784 z2 = strlen(rel); | |
| 785 s = fz_malloc(ctx, z1 + z2 + 1); | |
| 786 if (z1) | |
| 787 memcpy(s, abs, z1); | |
| 788 memcpy(s+z1, rel, z2+1); | |
| 789 | |
| 790 return s; | |
| 791 } | |
| 792 | |
| 793 static char * | |
| 794 collate_t_content(fz_context *ctx, fz_xml *top) | |
| 795 { | |
| 796 char *val = NULL; | |
| 797 fz_xml *next; | |
| 798 fz_xml *pos = fz_xml_down(top); | |
| 799 | |
| 800 while (pos != top) | |
| 801 { | |
| 802 /* Capture all the 't' content. */ | |
| 803 if (fz_xml_is_tag(pos, "t")) | |
| 804 { | |
| 805 /* Remember the content. */ | |
| 806 char *s = fz_xml_text(fz_xml_down(pos)); | |
| 807 | |
| 808 if (s == NULL) | |
| 809 { | |
| 810 /* Do nothing */ | |
| 811 } | |
| 812 else if (val == NULL) | |
| 813 val = fz_strdup(ctx, s); | |
| 814 else | |
| 815 { | |
| 816 char *val2; | |
| 817 size_t z1 = strlen(val); | |
| 818 size_t z2 = strlen(s) + 1; | |
| 819 fz_try(ctx) | |
| 820 { | |
| 821 val2 = fz_malloc(ctx, z1 + z2); | |
| 822 } | |
| 823 fz_catch(ctx) | |
| 824 { | |
| 825 fz_free(ctx, val); | |
| 826 fz_rethrow(ctx); | |
| 827 } | |
| 828 memcpy(val2, val, z1); | |
| 829 memcpy(val2 + z1, s, z2); | |
| 830 fz_free(ctx, val); | |
| 831 val = val2; | |
| 832 } | |
| 833 /* Do NOT go down, we've already dealt with that. */ | |
| 834 } | |
| 835 else if (fz_xml_is_tag(pos, "rPr") || fz_xml_is_tag(pos, "rPh")) | |
| 836 { | |
| 837 /* We do not want the 't' content from within these. */ | |
| 838 } | |
| 839 else | |
| 840 { | |
| 841 /* Try to move down. */ | |
| 842 next = fz_xml_down(pos); | |
| 843 if (next) | |
| 844 { | |
| 845 /* We can move down, easy! */ | |
| 846 pos = next; | |
| 847 continue; | |
| 848 } | |
| 849 } | |
| 850 /* Try moving to next. */ | |
| 851 next = fz_xml_next(pos); | |
| 852 if (next) | |
| 853 { | |
| 854 /* We can move to next, easy! */ | |
| 855 pos = next; | |
| 856 continue; | |
| 857 } | |
| 858 | |
| 859 /* If we can't go down, or next, pop up until we | |
| 860 * find somewhere we can go next from. */ | |
| 861 while (1) | |
| 862 { | |
| 863 /* OK. So move up. */ | |
| 864 pos = fz_xml_up(pos); | |
| 865 /* Check for hitting the top. */ | |
| 866 if (pos == top) | |
| 867 break; | |
| 868 next = fz_xml_next(pos); | |
| 869 if (next) | |
| 870 { | |
| 871 pos = next; | |
| 872 break; | |
| 873 } | |
| 874 } | |
| 875 } | |
| 876 | |
| 877 return val; | |
| 878 } | |
| 879 | |
| 880 static fz_xml * | |
| 881 try_parse_xml_archive_entry(fz_context *ctx, fz_archive *arch, const char *filename, int preserve_white) | |
| 882 { | |
| 883 if (!fz_has_archive_entry(ctx, arch, filename)) | |
| 884 return NULL; | |
| 885 | |
| 886 return fz_parse_xml_archive_entry(ctx, arch, filename, preserve_white); | |
| 887 } | |
| 888 | |
| 889 static void | |
| 890 load_shared_strings(fz_context *ctx, fz_archive *arch, fz_xml *rels, doc_info *info, const char *file) | |
| 891 { | |
| 892 fz_xml *pos = fz_xml_find_dfs(rels, "Relationship", "Type", "http://schemas.openxmlformats.org/officeDocument/2006/relationships/sharedStrings"); | |
| 893 const char *ss_file = fz_xml_att(pos, "Target"); | |
| 894 char *resolved = NULL; | |
| 895 fz_xml *xml = NULL; | |
| 896 char *str = NULL; | |
| 897 | |
| 898 if (ss_file == NULL) | |
| 899 return; | |
| 900 | |
| 901 fz_var(xml); | |
| 902 fz_var(str); | |
| 903 fz_var(resolved); | |
| 904 | |
| 905 fz_try(ctx) | |
| 906 { | |
| 907 resolved = make_absolute_path(ctx, file, ss_file); | |
| 908 xml = fz_parse_xml_archive_entry(ctx, arch, resolved, 1); | |
| 909 | |
| 910 pos = fz_xml_find_dfs(xml, "si", NULL, NULL); | |
| 911 while (pos) | |
| 912 { | |
| 913 int n = info->shared_string_len; | |
| 914 str = collate_t_content(ctx, pos); | |
| 915 | |
| 916 if (n == info->shared_string_max) | |
| 917 { | |
| 918 int max = info->shared_string_max; | |
| 919 int newmax = max ? max * 2 : 1024; | |
| 920 char **arr = fz_realloc(ctx, info->shared_strings, sizeof(*arr) * newmax); | |
| 921 memset(&arr[max], 0, sizeof(*arr) * (newmax - max)); | |
| 922 info->shared_strings = arr; | |
| 923 info->shared_string_max = newmax; | |
| 924 } | |
| 925 | |
| 926 info->shared_strings[n] = str; | |
| 927 str = NULL; | |
| 928 info->shared_string_len++; | |
| 929 pos = fz_xml_find_next_dfs(pos, "si", NULL, NULL); | |
| 930 } | |
| 931 } | |
| 932 fz_always(ctx) | |
| 933 { | |
| 934 fz_drop_xml(ctx, xml); | |
| 935 fz_free(ctx, resolved); | |
| 936 fz_free(ctx, str); | |
| 937 } | |
| 938 fz_catch(ctx) | |
| 939 fz_rethrow(ctx); | |
| 940 } | |
| 941 | |
| 942 static void | |
| 943 load_footnotes(fz_context *ctx, fz_archive *arch, fz_xml *rels, doc_info *info, const char *file) | |
| 944 { | |
| 945 char *resolved = NULL; | |
| 946 fz_xml *xml = NULL; | |
| 947 char *str = NULL; | |
| 948 | |
| 949 fz_var(xml); | |
| 950 fz_var(str); | |
| 951 fz_var(resolved); | |
| 952 | |
| 953 fz_try(ctx) | |
| 954 { | |
| 955 fz_xml *pos; | |
| 956 | |
| 957 resolved = make_absolute_path(ctx, file, "footnotes.xml"); | |
| 958 xml = try_parse_xml_archive_entry(ctx, arch, resolved, 1); | |
| 959 if (xml == NULL) | |
| 960 break; | |
| 961 | |
| 962 pos = fz_xml_find_dfs(xml, "footnote", NULL, NULL); | |
| 963 while (pos) | |
| 964 { | |
| 965 int n = fz_atoi(fz_xml_att(pos, "w:id")); | |
| 966 | |
| 967 str = collate_t_content(ctx, pos); | |
| 968 | |
| 969 if (str && n >= 0) | |
| 970 { | |
| 971 if (n >= info->footnotes_max) | |
| 972 { | |
| 973 int max = info->footnotes_max; | |
| 974 int newmax = max ? max * 2 : 1024; | |
| 975 char **arr; | |
| 976 if (newmax < n) | |
| 977 newmax = n+1; | |
| 978 arr = fz_realloc(ctx, info->footnotes, sizeof(*arr) * newmax); | |
| 979 memset(&arr[max], 0, sizeof(*arr) * (newmax - max)); | |
| 980 info->footnotes = arr; | |
| 981 info->footnotes_max = newmax; | |
| 982 } | |
| 983 | |
| 984 info->footnotes[n] = str; | |
| 985 str = NULL; | |
| 986 } | |
| 987 pos = fz_xml_find_next_dfs(pos, "footnote", NULL, NULL); | |
| 988 } | |
| 989 } | |
| 990 fz_always(ctx) | |
| 991 { | |
| 992 fz_drop_xml(ctx, xml); | |
| 993 fz_free(ctx, resolved); | |
| 994 fz_free(ctx, str); | |
| 995 } | |
| 996 fz_catch(ctx) | |
| 997 fz_rethrow(ctx); | |
| 998 } | |
| 999 | |
| 1000 static void | |
| 1001 process_office_document(fz_context *ctx, fz_archive *arch, const char *file, doc_info *info) | |
| 1002 { | |
| 1003 char *file_rels; | |
| 1004 fz_xml *xml = NULL; | |
| 1005 fz_xml *rels = NULL; | |
| 1006 char *resolved_rel = NULL; | |
| 1007 | |
| 1008 if (file == NULL) | |
| 1009 return; | |
| 1010 | |
| 1011 file_rels = make_rel_name(ctx, file); | |
| 1012 | |
| 1013 fz_var(resolved_rel); | |
| 1014 | |
| 1015 fz_var(rels); | |
| 1016 fz_var(xml); | |
| 1017 | |
| 1018 fz_try(ctx) | |
| 1019 { | |
| 1020 fz_xml *pos; | |
| 1021 | |
| 1022 rels = fz_parse_xml_archive_entry(ctx, arch, file_rels, 0); | |
| 1023 xml = fz_parse_xml_archive_entry(ctx, arch, file, 1); | |
| 1024 | |
| 1025 /* XLSX */ | |
| 1026 pos = fz_xml_find_dfs(xml, "sheet", NULL, NULL); | |
| 1027 if (pos) | |
| 1028 { | |
| 1029 load_shared_strings(ctx, arch, rels, info, file); | |
| 1030 while (pos) | |
| 1031 { | |
| 1032 char *name = fz_xml_att(pos, "name"); | |
| 1033 char *id = fz_xml_att(pos, "r:id"); | |
| 1034 char *sheet = lookup_rel(ctx, rels, id); | |
| 1035 | |
| 1036 if (sheet) | |
| 1037 { | |
| 1038 resolved_rel = make_absolute_path(ctx, file, sheet); | |
| 1039 process_sheet(ctx, arch, name, resolved_rel, info); | |
| 1040 fz_free(ctx, resolved_rel); | |
| 1041 resolved_rel = NULL; | |
| 1042 } | |
| 1043 pos = fz_xml_find_next_dfs(pos, "sheet", NULL, NULL); | |
| 1044 } | |
| 1045 break; | |
| 1046 } | |
| 1047 | |
| 1048 /* Let's try it as a powerpoint */ | |
| 1049 pos = fz_xml_find_dfs(xml, "sldId", NULL, NULL); | |
| 1050 if (pos) | |
| 1051 { | |
| 1052 while (pos) | |
| 1053 { | |
| 1054 char *id = fz_xml_att(pos, "r:id"); | |
| 1055 char *sheet = lookup_rel(ctx, rels, id); | |
| 1056 | |
| 1057 if (sheet) | |
| 1058 { | |
| 1059 resolved_rel = make_absolute_path(ctx, file, sheet); | |
| 1060 process_slide(ctx, arch, resolved_rel, info); | |
| 1061 fz_free(ctx, resolved_rel); | |
| 1062 resolved_rel = NULL; | |
| 1063 } | |
| 1064 pos = fz_xml_find_next_dfs(pos, "sldId", NULL, NULL); | |
| 1065 } | |
| 1066 break; | |
| 1067 } | |
| 1068 | |
| 1069 /* Let's try it as word. */ | |
| 1070 { | |
| 1071 load_footnotes(ctx, arch, rels, info, file); | |
| 1072 process_doc_stream(ctx, xml, info, 1); | |
| 1073 } | |
| 1074 } | |
| 1075 fz_always(ctx) | |
| 1076 { | |
| 1077 fz_drop_xml(ctx, xml); | |
| 1078 fz_drop_xml(ctx, rels); | |
| 1079 fz_free(ctx, resolved_rel); | |
| 1080 fz_free(ctx, file_rels); | |
| 1081 } | |
| 1082 fz_catch(ctx) | |
| 1083 fz_rethrow(ctx); | |
| 1084 } | |
| 1085 | |
| 1086 static void | |
| 1087 process_office_document_properties(fz_context *ctx, fz_archive *arch, const char *file, doc_info *info) | |
| 1088 { | |
| 1089 fz_xml *xml = NULL; | |
| 1090 char *title; | |
| 1091 | |
| 1092 fz_var(xml); | |
| 1093 | |
| 1094 fz_try(ctx) | |
| 1095 { | |
| 1096 fz_xml *pos; | |
| 1097 | |
| 1098 xml = fz_parse_xml_archive_entry(ctx, arch, file, 1); | |
| 1099 | |
| 1100 pos = fz_xml_find_dfs(xml, "title", NULL, NULL); | |
| 1101 title = fz_xml_text(fz_xml_down(pos)); | |
| 1102 if (title) | |
| 1103 { | |
| 1104 fz_write_string(ctx, info->out, "<title>"); | |
| 1105 doc_escape(ctx, info->out, title); | |
| 1106 fz_write_string(ctx, info->out, "</title>"); | |
| 1107 } | |
| 1108 } | |
| 1109 fz_always(ctx) | |
| 1110 { | |
| 1111 fz_drop_xml(ctx, xml); | |
| 1112 } | |
| 1113 fz_catch(ctx) | |
| 1114 fz_rethrow(ctx); | |
| 1115 } | |
| 1116 | |
| 1117 static fz_buffer * | |
| 1118 fz_office_to_html(fz_context *ctx, fz_html_font_set *set, fz_buffer *buffer_in, fz_archive *dir, const char *user_css, fz_office_to_html_opts *opts) | |
| 1119 { | |
| 1120 fz_stream *stream = NULL; | |
| 1121 fz_archive *archive = NULL; | |
| 1122 fz_buffer *buffer_out = NULL; | |
| 1123 fz_xml *xml = NULL; | |
| 1124 fz_xml *pos = NULL; | |
| 1125 fz_xml *rels = NULL; | |
| 1126 const char *schema = "http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument"; | |
| 1127 const char *schema_props = "http://schemas.openxmlformats.org/package/2006/relationships/metadata/core-properties"; | |
| 1128 doc_info info = { 0 }; | |
| 1129 int i; | |
| 1130 | |
| 1131 fz_var(archive); | |
| 1132 fz_var(stream); | |
| 1133 fz_var(buffer_out); | |
| 1134 fz_var(xml); | |
| 1135 fz_var(rels); | |
| 1136 | |
| 1137 if (opts) | |
| 1138 info.opts = *opts; | |
| 1139 | |
| 1140 fz_try(ctx) | |
| 1141 { | |
| 1142 if (buffer_in) | |
| 1143 { | |
| 1144 stream = fz_open_buffer(ctx, buffer_in); | |
| 1145 archive = fz_open_archive_with_stream(ctx, stream); | |
| 1146 } | |
| 1147 else | |
| 1148 archive = fz_keep_archive(ctx, dir); | |
| 1149 buffer_out = fz_new_buffer(ctx, 1024); | |
| 1150 info.out = fz_new_output_with_buffer(ctx, buffer_out); | |
| 1151 | |
| 1152 /* Is it an HWPX ?*/ | |
| 1153 xml = try_parse_xml_archive_entry(ctx, archive, "META-INF/container.xml", 0); | |
| 1154 if (xml) | |
| 1155 { | |
| 1156 pos = fz_xml_find_dfs(xml, "rootfile", "media-type", "application/hwpml-package+xml"); | |
| 1157 if (!pos) | |
| 1158 fz_throw(ctx, FZ_ERROR_FORMAT, "Archive not hwpx."); | |
| 1159 | |
| 1160 while (pos) | |
| 1161 { | |
| 1162 const char *file = fz_xml_att(pos, "full-path"); | |
| 1163 process_rootfile(ctx, archive, file, &info); | |
| 1164 pos = fz_xml_find_next_dfs(pos, "rootfile", "media-type", "application/hwpml-package+xml"); | |
| 1165 } | |
| 1166 fz_close_output(ctx, info.out); | |
| 1167 break; | |
| 1168 } | |
| 1169 | |
| 1170 /* Try other types */ | |
| 1171 { | |
| 1172 xml = try_parse_xml_archive_entry(ctx, archive, "_rels/.rels", 0); | |
| 1173 | |
| 1174 fz_write_string(ctx, info.out, "<html>\n"); | |
| 1175 | |
| 1176 pos = fz_xml_find_dfs(xml, "Relationship", "Type", schema_props); | |
| 1177 if (pos) | |
| 1178 { | |
| 1179 const char *file = fz_xml_att(pos, "Target"); | |
| 1180 fz_write_string(ctx, info.out, "<head>\n"); | |
| 1181 process_office_document_properties(ctx, archive, file, &info); | |
| 1182 fz_write_string(ctx, info.out, "</head>\n"); | |
| 1183 } | |
| 1184 | |
| 1185 fz_write_string(ctx, info.out, "<body>\n"); | |
| 1186 pos = fz_xml_find_dfs(xml, "Relationship", "Type", schema); | |
| 1187 if (!pos) | |
| 1188 fz_throw(ctx, FZ_ERROR_FORMAT, "Archive not docx."); | |
| 1189 | |
| 1190 while (pos) | |
| 1191 { | |
| 1192 const char *file = fz_xml_att(pos, "Target"); | |
| 1193 if (file) | |
| 1194 process_office_document(ctx, archive, file, &info); | |
| 1195 pos = fz_xml_find_next_dfs(pos, "Relationship", "Type", schema); | |
| 1196 } | |
| 1197 } | |
| 1198 | |
| 1199 fz_close_output(ctx, info.out); | |
| 1200 } | |
| 1201 fz_always(ctx) | |
| 1202 { | |
| 1203 fz_drop_xml(ctx, rels); | |
| 1204 fz_drop_xml(ctx, xml); | |
| 1205 for (i = 0; i < info.shared_string_len; ++i) | |
| 1206 fz_free(ctx, info.shared_strings[i]); | |
| 1207 fz_free(ctx, info.shared_strings); | |
| 1208 for (i = 0; i < info.footnotes_max; ++i) | |
| 1209 fz_free(ctx, info.footnotes[i]); | |
| 1210 fz_free(ctx, info.footnotes); | |
| 1211 fz_drop_output(ctx, info.out); | |
| 1212 fz_drop_archive(ctx, archive); | |
| 1213 fz_drop_stream(ctx, stream); | |
| 1214 } | |
| 1215 fz_catch(ctx) | |
| 1216 { | |
| 1217 fz_drop_buffer(ctx, buffer_out); | |
| 1218 fz_rethrow(ctx); | |
| 1219 } | |
| 1220 | |
| 1221 #ifdef DEBUG_OFFICE_TO_HTML | |
| 1222 { | |
| 1223 unsigned char *storage; | |
| 1224 size_t len = fz_buffer_storage(ctx, buffer_out, &storage); | |
| 1225 fz_write_printf(ctx, fz_stddbg(ctx), "fz_office_to_html: Output buffer, len=%zd:\n", len); | |
| 1226 fz_write_buffer(ctx, fz_stddbg(ctx), buffer_out); | |
| 1227 } | |
| 1228 #endif | |
| 1229 | |
| 1230 return buffer_out; | |
| 1231 } | |
| 1232 | |
| 1233 /* Office document handler */ | |
| 1234 | |
| 1235 static fz_buffer * | |
| 1236 office_to_html(fz_context *ctx, fz_html_font_set *set, fz_buffer *buf, fz_archive *zip, const char *user_css) | |
| 1237 { | |
| 1238 fz_office_to_html_opts opts = { 0 }; | |
| 1239 | |
| 1240 return fz_office_to_html(ctx, set, buf, zip, user_css, &opts); | |
| 1241 } | |
| 1242 | |
| 1243 static const fz_htdoc_format_t fz_htdoc_office = | |
| 1244 { | |
| 1245 "Office document", | |
| 1246 office_to_html, | |
| 1247 0, 1, 0 | |
| 1248 }; | |
| 1249 | |
| 1250 static fz_document * | |
| 1251 office_open_document(fz_context *ctx, const fz_document_handler *handler, fz_stream *file, fz_stream *accel, fz_archive *zip, void *state) | |
| 1252 { | |
| 1253 return fz_htdoc_open_document_with_stream_and_dir(ctx, file, zip, &fz_htdoc_office); | |
| 1254 } | |
| 1255 | |
/* NULL-terminated list of file extensions registered for the office handler. */
static const char *office_extensions[] =
{
	"docx", "xlsx", "pptx",
	"hwpx",
	NULL
};
| 1264 | |
/* NULL-terminated list of MIME types registered for the office handler. */
static const char *office_mimetypes[] =
{
	/* DOCX */
	"application/vnd.openxmlformats-officedocument.wordprocessingml.document",

	/* XLSX */
	"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",

	/* PPTX */
	"application/vnd.openxmlformats-officedocument.presentationml.presentation",

	/* HWPX (two spellings) */
	"application/haansofthwpx",
	"application/vnd.hancom.hwpx",

	NULL
};
| 1278 | |
| 1279 /* We are only ever 75% sure here, to allow a 'better' handler, such as sodochandler | |
| 1280 * to override us by returning 100. */ | |
| 1281 static int | |
| 1282 office_recognize_doc_content(fz_context *ctx, const fz_document_handler *handler, fz_stream *stream, fz_archive *zip, void **state, fz_document_recognize_state_free_fn **free_state) | |
| 1283 { | |
| 1284 fz_archive *arch = NULL; | |
| 1285 int ret = 0; | |
| 1286 fz_xml *xml = NULL; | |
| 1287 | |
| 1288 if (state) | |
| 1289 *state = NULL; | |
| 1290 if (free_state) | |
| 1291 *free_state = NULL; | |
| 1292 | |
| 1293 fz_var(arch); | |
| 1294 fz_var(ret); | |
| 1295 fz_var(xml); | |
| 1296 | |
| 1297 fz_try(ctx) | |
| 1298 { | |
| 1299 if (stream) | |
| 1300 { | |
| 1301 arch = fz_try_open_archive_with_stream(ctx, stream); | |
| 1302 if (arch == NULL) | |
| 1303 break; | |
| 1304 } | |
| 1305 else | |
| 1306 arch = fz_keep_archive(ctx, zip); | |
| 1307 | |
| 1308 xml = fz_try_parse_xml_archive_entry(ctx, arch, "META-INF/container.xml", 0); | |
| 1309 if (xml) | |
| 1310 { | |
| 1311 if (fz_xml_find_dfs(xml, "rootfile", "media-type", "application/hwpml-package+xml")) | |
| 1312 ret = 75; /* HWPX */ | |
| 1313 break; | |
| 1314 } | |
| 1315 xml = fz_try_parse_xml_archive_entry(ctx, arch, "_rels/.rels", 0); | |
| 1316 if (xml) | |
| 1317 { | |
| 1318 if (fz_xml_find_dfs(xml, "Relationship", "Type", "http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument")) | |
| 1319 { | |
| 1320 ret = 75; /* DOCX | PPTX | XLSX */ | |
| 1321 } | |
| 1322 break; | |
| 1323 } | |
| 1324 } | |
| 1325 fz_always(ctx) | |
| 1326 { | |
| 1327 fz_drop_xml(ctx, xml); | |
| 1328 fz_drop_archive(ctx, arch); | |
| 1329 } | |
| 1330 fz_catch(ctx) | |
| 1331 fz_rethrow(ctx); | |
| 1332 | |
| 1333 return ret; | |
| 1334 } | |
| 1335 | |
| 1336 fz_document_handler office_document_handler = | |
| 1337 { | |
| 1338 NULL, | |
| 1339 office_open_document, | |
| 1340 office_extensions, | |
| 1341 office_mimetypes, | |
| 1342 office_recognize_doc_content | |
| 1343 }; |
