Mercurial > hgrepos > Python2 > PyMuPDF
comparison mupdf-source/source/fitz/output-docx.c @ 3:2c135c81b16c
MERGE: upstream PyMuPDF 1.26.4 with MuPDF 1.26.7
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Mon, 15 Sep 2025 11:44:09 +0200 |
| parents | b50eed0cc0ef |
| children |
comparison
equal
deleted
inserted
replaced
| 0:6015a75abc2d | 3:2c135c81b16c |
|---|---|
| 1 // Copyright (C) 2004-2025 Artifex Software, Inc. | |
| 2 // | |
| 3 // This file is part of MuPDF. | |
| 4 // | |
| 5 // MuPDF is free software: you can redistribute it and/or modify it under the | |
| 6 // terms of the GNU Affero General Public License as published by the Free | |
| 7 // Software Foundation, either version 3 of the License, or (at your option) | |
| 8 // any later version. | |
| 9 // | |
| 10 // MuPDF is distributed in the hope that it will be useful, but WITHOUT ANY | |
| 11 // WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS | |
| 12 // FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more | |
| 13 // details. | |
| 14 // | |
| 15 // You should have received a copy of the GNU Affero General Public License | |
| 16 // along with MuPDF. If not, see <https://www.gnu.org/licenses/agpl-3.0.en.html> | |
| 17 // | |
| 18 // Alternative licensing terms are available from the licensor. | |
| 19 // For commercial licensing, see <https://www.artifex.com/> or contact | |
| 20 // Artifex Software, Inc., 39 Mesa Street, Suite 108A, San Francisco, | |
| 21 // CA 94129, USA, for further information. | |
| 22 | |
| 23 #include "mupdf/fitz.h" | |
| 24 | |
| 25 #if FZ_ENABLE_DOCX_OUTPUT | |
| 26 | |
| 27 #include "glyphbox.h" | |
| 28 #include "extract/extract.h" | |
| 29 #include "extract/buffer.h" | |
| 30 | |
| 31 #include <assert.h> | |
| 32 #include <errno.h> | |
| 33 #include <string.h> | |
| 34 | |
| 35 | |
/*
 * Document writer state shared by the writer callbacks and the per-page
 * device. The callbacks cast between fz_document_writer* and this type.
 */
typedef struct
{
	fz_document_writer super;
	/* Allocator handed to Extract; created in the constructor and destroyed in writer_drop(). */
	extract_alloc_t *alloc;

	/*
	 * .ctx is needed for the callbacks we get from the Extract library, for
	 * example s_realloc_fn(). Each of our main device callbacks sets .ctx on
	 * entry, and resets back to NULL before returning.
	 */
	fz_context *ctx;

	/* Destination that buffer_write() sends the generated document to. */
	fz_output *output;
	extract_t *extract;
	/* spacing, rotation and images are passed unchanged to extract_process(). */
	int spacing;
	int rotation;
	int images;
	/* When non-zero, dev_text() skips glyphs that lie entirely outside .mediabox. */
	int mediabox_clip;
	fz_rect mediabox; /* As passed to writer_begin_page(). */
	/* Scratch buffer returned to Extract by buffer_cache(). */
	char output_cache[1024];
} fz_docx_writer;
| 57 | |
| 58 | |
/*
 * Per-page device created by writer_begin_page(); it only adds a pointer back
 * to the writer that owns the Extract state.
 */
typedef struct
{
	fz_device super;
	fz_docx_writer *writer;
} fz_docx_device;
| 64 | |
| 65 | |
| 66 static void dev_text(fz_context *ctx, fz_device *dev_, const fz_text *text, fz_matrix ctm, | |
| 67 fz_colorspace *colorspace, const float *color, float alpha, fz_color_params color_params) | |
| 68 { | |
| 69 fz_docx_device *dev = (fz_docx_device*) dev_; | |
| 70 fz_text_span *span; | |
| 71 assert(!dev->writer->ctx); | |
| 72 dev->writer->ctx = ctx; | |
| 73 fz_try(ctx) | |
| 74 { | |
| 75 for (span = text->head; span; span = span->next) | |
| 76 { | |
| 77 int i; | |
| 78 fz_matrix combined, trm; | |
| 79 fz_rect bbox; | |
| 80 | |
| 81 combined = fz_concat(span->trm, ctm); | |
| 82 | |
| 83 bbox = span->font->bbox; | |
| 84 if (extract_span_begin( | |
| 85 dev->writer->extract, | |
| 86 span->font->name, | |
| 87 span->font->flags.is_bold, | |
| 88 span->font->flags.is_italic, | |
| 89 span->wmode, | |
| 90 combined.a, | |
| 91 combined.b, | |
| 92 combined.c, | |
| 93 combined.d, | |
| 94 bbox.x0, | |
| 95 bbox.y0, | |
| 96 bbox.x1, | |
| 97 bbox.y1)) | |
| 98 { | |
| 99 fz_throw(ctx, FZ_ERROR_LIBRARY, "Failed to begin span"); | |
| 100 } | |
| 101 | |
| 102 trm = span->trm; | |
| 103 for (i=0; i<span->len; ++i) | |
| 104 { | |
| 105 fz_text_item *item = &span->items[i]; | |
| 106 float adv = 0; | |
| 107 fz_rect bounds; | |
| 108 | |
| 109 trm.e = item->x; | |
| 110 trm.f = item->y; | |
| 111 combined = fz_concat(trm, ctm); | |
| 112 | |
| 113 if (dev->writer->mediabox_clip) | |
| 114 if (fz_glyph_entirely_outside_box(ctx, &ctm, span, item, &dev->writer->mediabox)) | |
| 115 continue; | |
| 116 | |
| 117 if (span->items[i].gid >= 0) | |
| 118 adv = span->items[i].adv; | |
| 119 | |
| 120 bounds = fz_bound_glyph(ctx, span->font, span->items[i].gid, combined); | |
| 121 if (extract_add_char(dev->writer->extract, combined.e, combined.f, item->ucs, adv, | |
| 122 bounds.x0, bounds.y0, bounds.x1, bounds.y1)) | |
| 123 fz_throw(ctx, FZ_ERROR_LIBRARY, "Failed to add char"); | |
| 124 } | |
| 125 | |
| 126 if (extract_span_end(dev->writer->extract)) | |
| 127 fz_throw(ctx, FZ_ERROR_LIBRARY, "Failed to end span"); | |
| 128 } | |
| 129 } | |
| 130 fz_always(ctx) | |
| 131 { | |
| 132 dev->writer->ctx = NULL; | |
| 133 } | |
| 134 fz_catch(ctx) | |
| 135 { | |
| 136 fz_rethrow(ctx); | |
| 137 } | |
| 138 } | |
| 139 | |
| 140 static void dev_fill_text(fz_context *ctx, fz_device *dev_, const fz_text *text, fz_matrix ctm, | |
| 141 fz_colorspace *colorspace, const float *color, float alpha, fz_color_params color_params) | |
| 142 { | |
| 143 dev_text(ctx, dev_, text, ctm, colorspace, color, alpha, color_params); | |
| 144 } | |
| 145 | |
| 146 static void dev_stroke_text(fz_context *ctx, fz_device *dev_, const fz_text *text, const fz_stroke_state *stroke, fz_matrix ctm, | |
| 147 fz_colorspace *colorspace, const float *color, float alpha, fz_color_params color_params) | |
| 148 { | |
| 149 dev_text(ctx, dev_, text, ctm, colorspace, color, alpha, color_params); | |
| 150 } | |
| 151 | |
| 152 static void dev_clip_text(fz_context *ctx, fz_device *dev_, const fz_text *text, fz_matrix ctm, fz_rect scissor) | |
| 153 { | |
| 154 dev_text(ctx, dev_, text, ctm, NULL, NULL, 0 /*alpha*/, fz_default_color_params); | |
| 155 } | |
| 156 | |
| 157 static void dev_clip_stroke_text(fz_context *ctx, fz_device *dev_, const fz_text *text, const fz_stroke_state *stroke, fz_matrix ctm, fz_rect scissor) | |
| 158 { | |
| 159 dev_text(ctx, dev_, text, ctm, NULL, 0, 0, fz_default_color_params); | |
| 160 } | |
| 161 | |
| 162 static void | |
| 163 dev_ignore_text(fz_context *ctx, fz_device *dev_, const fz_text *text, fz_matrix ctm) | |
| 164 { | |
| 165 } | |
| 166 | |
| 167 static void writer_image_free(void *handle, void *image_data) | |
| 168 { | |
| 169 fz_docx_writer *writer = handle; | |
| 170 fz_free(writer->ctx, image_data); | |
| 171 } | |
| 172 | |
| 173 static void dev_fill_image(fz_context *ctx, fz_device *dev_, fz_image *img, fz_matrix ctm, float alpha, fz_color_params color_params) | |
| 174 { | |
| 175 fz_docx_device *dev = (fz_docx_device*) dev_; | |
| 176 const char *type = NULL; | |
| 177 fz_compressed_buffer *compressed = fz_compressed_image_buffer(ctx, img); | |
| 178 | |
| 179 assert(!dev->writer->ctx); | |
| 180 dev->writer->ctx = ctx; | |
| 181 fz_try(ctx) | |
| 182 { | |
| 183 if (compressed) | |
| 184 { | |
| 185 if (0) { /* For alignment */ } | |
| 186 else if (compressed->params.type == FZ_IMAGE_RAW) type = "raw"; | |
| 187 else if (compressed->params.type == FZ_IMAGE_FAX) type = "fax"; | |
| 188 else if (compressed->params.type == FZ_IMAGE_FLATE) type = "flate"; | |
| 189 else if (compressed->params.type == FZ_IMAGE_LZW) type = "lzw"; | |
| 190 else if (compressed->params.type == FZ_IMAGE_BROTLI) type = "brotli"; | |
| 191 else if (compressed->params.type == FZ_IMAGE_BMP) type = "bmp"; | |
| 192 else if (compressed->params.type == FZ_IMAGE_GIF) type = "gif"; | |
| 193 else if (compressed->params.type == FZ_IMAGE_JBIG2) type = "jbig2"; | |
| 194 else if (compressed->params.type == FZ_IMAGE_JPEG) type = "jpeg"; | |
| 195 else if (compressed->params.type == FZ_IMAGE_JPX) type = "jpx"; | |
| 196 else if (compressed->params.type == FZ_IMAGE_JXR) type = "jxr"; | |
| 197 else if (compressed->params.type == FZ_IMAGE_PNG) type = "png"; | |
| 198 else if (compressed->params.type == FZ_IMAGE_PNM) type = "pnm"; | |
| 199 else if (compressed->params.type == FZ_IMAGE_TIFF) type = "tiff"; | |
| 200 | |
| 201 if (type) | |
| 202 { | |
| 203 /* Write out raw data. */ | |
| 204 unsigned char *data; | |
| 205 size_t datasize = fz_buffer_extract(ctx, compressed->buffer, &data); | |
| 206 if (extract_add_image( | |
| 207 dev->writer->extract, | |
| 208 type, | |
| 209 ctm.e /*x*/, | |
| 210 ctm.f /*y*/, | |
| 211 img->w /*w*/, | |
| 212 img->h /*h*/, | |
| 213 data, | |
| 214 datasize, | |
| 215 writer_image_free, | |
| 216 dev->writer | |
| 217 )) | |
| 218 { | |
| 219 fz_throw(ctx, FZ_ERROR_LIBRARY, "Failed to add image type=%s", type); | |
| 220 } | |
| 221 } | |
| 222 else | |
| 223 { | |
| 224 /* We don't recognise this image type, so ignore. */ | |
| 225 } | |
| 226 } | |
| 227 else | |
| 228 { | |
| 229 /* | |
| 230 * Compressed data not available, so we could write out | |
| 231 * raw pixel values. But for now we ignore. | |
| 232 */ | |
| 233 } | |
| 234 } | |
| 235 fz_always(ctx) | |
| 236 { | |
| 237 dev->writer->ctx = NULL; | |
| 238 } | |
| 239 fz_catch(ctx) | |
| 240 { | |
| 241 fz_rethrow(ctx); | |
| 242 } | |
| 243 } | |
| 244 | |
/*
 * Support for sending information to Extract when walking stroke/fill path
 * with fz_walk_path().
 *
 * NOTE(review): nothing in the visible source refers to this type;
 * s_walk_path() passes the extract_t pointer directly as the walker argument.
 */
typedef struct
{
	fz_path_walker walker;
	extract_t *extract;
} walker_info_t;
| 254 | |
| 255 static void s_moveto(fz_context *ctx, void *arg, float x, float y) | |
| 256 { | |
| 257 extract_t* extract = arg; | |
| 258 if (extract_moveto(extract, x, y)) | |
| 259 fz_throw(ctx, FZ_ERROR_LIBRARY, "extract_moveto() failed"); | |
| 260 } | |
| 261 | |
| 262 static void s_lineto(fz_context *ctx, void *arg, float x, float y) | |
| 263 { | |
| 264 extract_t* extract = arg; | |
| 265 if (extract_lineto(extract, x, y)) | |
| 266 fz_throw(ctx, FZ_ERROR_LIBRARY, "extract_lineto() failed"); | |
| 267 } | |
| 268 | |
| 269 static void s_curveto(fz_context *ctx, void *arg, float x1, float y1, | |
| 270 float x2, float y2, float x3, float y3) | |
| 271 { | |
| 272 /* We simply move to the end point of the curve so that subsequent | |
| 273 (straight) lines will be handled correctly. */ | |
| 274 extract_t* extract = arg; | |
| 275 if (extract_moveto(extract, x3, y3)) | |
| 276 fz_throw(ctx, FZ_ERROR_LIBRARY, "extract_moveto() failed"); | |
| 277 } | |
| 278 | |
| 279 static void s_closepath(fz_context *ctx, void *arg) | |
| 280 { | |
| 281 extract_t* extract = arg; | |
| 282 if (extract_closepath(extract)) | |
| 283 fz_throw(ctx, FZ_ERROR_LIBRARY, "extract_closepath() failed"); | |
| 284 } | |
| 285 | |
| 286 /* | |
| 287 * Calls extract_*() path functions on <path> using fz_walk_path() and the | |
| 288 * above callbacks. | |
| 289 */ | |
| 290 static void s_walk_path(fz_context *ctx, fz_docx_device *dev, extract_t *extract, const fz_path *path) | |
| 291 { | |
| 292 fz_path_walker walker; | |
| 293 walker.moveto = s_moveto; | |
| 294 walker.lineto = s_lineto; | |
| 295 walker.curveto = s_curveto; | |
| 296 walker.closepath = s_closepath; | |
| 297 walker.quadto = NULL; | |
| 298 walker.curvetov = NULL; | |
| 299 walker.curvetoy = NULL; | |
| 300 walker.rectto = NULL; | |
| 301 | |
| 302 assert(dev->writer->ctx == ctx); | |
| 303 fz_walk_path(ctx, path, &walker, extract /*arg*/); | |
| 304 } | |
| 305 | |
| 306 void dev_fill_path(fz_context *ctx, fz_device *dev_, const fz_path *path, int even_odd, | |
| 307 fz_matrix matrix, fz_colorspace * colorspace, const float *color, float alpha, | |
| 308 fz_color_params color_params) | |
| 309 { | |
| 310 fz_docx_device *dev = (fz_docx_device*) dev_; | |
| 311 extract_t *extract = dev->writer->extract; | |
| 312 | |
| 313 assert(!dev->writer->ctx); | |
| 314 dev->writer->ctx = ctx; | |
| 315 | |
| 316 fz_try(ctx) | |
| 317 { | |
| 318 if (extract_fill_begin( | |
| 319 extract, | |
| 320 matrix.a, | |
| 321 matrix.b, | |
| 322 matrix.c, | |
| 323 matrix.d, | |
| 324 matrix.e, | |
| 325 matrix.f, | |
| 326 color[0] | |
| 327 )) | |
| 328 fz_throw(ctx, FZ_ERROR_LIBRARY, "Failed to begin fill"); | |
| 329 s_walk_path(ctx, dev, extract, path); | |
| 330 if (extract_fill_end(extract)) | |
| 331 fz_throw(ctx, FZ_ERROR_LIBRARY, "extract_fill_end() failed"); | |
| 332 } | |
| 333 fz_always(ctx) | |
| 334 { | |
| 335 dev->writer->ctx = NULL; | |
| 336 } | |
| 337 fz_catch(ctx) | |
| 338 { | |
| 339 fz_rethrow(ctx); | |
| 340 } | |
| 341 } | |
| 342 | |
| 343 | |
| 344 static void | |
| 345 dev_stroke_path(fz_context *ctx, fz_device *dev_, const fz_path *path, | |
| 346 const fz_stroke_state *stroke, fz_matrix in_ctm, | |
| 347 fz_colorspace *colorspace_in, const float *color, float alpha, | |
| 348 fz_color_params color_params) | |
| 349 { | |
| 350 fz_docx_device *dev = (fz_docx_device*) dev_; | |
| 351 extract_t *extract = dev->writer->extract; | |
| 352 | |
| 353 assert(!dev->writer->ctx); | |
| 354 dev->writer->ctx = ctx; | |
| 355 fz_try(ctx) | |
| 356 { | |
| 357 if (extract_stroke_begin( | |
| 358 extract, | |
| 359 in_ctm.a, | |
| 360 in_ctm.b, | |
| 361 in_ctm.c, | |
| 362 in_ctm.d, | |
| 363 in_ctm.e, | |
| 364 in_ctm.f, | |
| 365 stroke->linewidth, | |
| 366 color[0] | |
| 367 )) | |
| 368 fz_throw(ctx, FZ_ERROR_LIBRARY, "Failed to begin stroke"); | |
| 369 s_walk_path(ctx, dev, extract, path); | |
| 370 if (extract_stroke_end(extract)) | |
| 371 fz_throw(ctx, FZ_ERROR_LIBRARY, "extract_stroke_end() failed"); | |
| 372 } | |
| 373 fz_always(ctx) | |
| 374 { | |
| 375 dev->writer->ctx = NULL; | |
| 376 } | |
| 377 fz_catch(ctx) | |
| 378 { | |
| 379 fz_rethrow(ctx); | |
| 380 } | |
| 381 } | |
| 382 | |
| 383 static extract_struct_t | |
| 384 fz_struct_to_extract(fz_structure type) | |
| 385 { | |
| 386 switch (type) | |
| 387 { | |
| 388 default: | |
| 389 return extract_struct_INVALID; | |
| 390 | |
| 391 case FZ_STRUCTURE_DOCUMENT: | |
| 392 return extract_struct_DOCUMENT; | |
| 393 case FZ_STRUCTURE_PART: | |
| 394 return extract_struct_PART; | |
| 395 case FZ_STRUCTURE_ART: | |
| 396 return extract_struct_ART; | |
| 397 case FZ_STRUCTURE_SECT: | |
| 398 return extract_struct_SECT; | |
| 399 case FZ_STRUCTURE_DIV: | |
| 400 return extract_struct_DIV; | |
| 401 case FZ_STRUCTURE_BLOCKQUOTE: | |
| 402 return extract_struct_BLOCKQUOTE; | |
| 403 case FZ_STRUCTURE_CAPTION: | |
| 404 return extract_struct_CAPTION; | |
| 405 case FZ_STRUCTURE_TOC: | |
| 406 return extract_struct_TOC; | |
| 407 case FZ_STRUCTURE_TOCI: | |
| 408 return extract_struct_TOCI; | |
| 409 case FZ_STRUCTURE_INDEX: | |
| 410 return extract_struct_INDEX; | |
| 411 case FZ_STRUCTURE_NONSTRUCT: | |
| 412 return extract_struct_NONSTRUCT; | |
| 413 case FZ_STRUCTURE_PRIVATE: | |
| 414 return extract_struct_PRIVATE; | |
| 415 /* Grouping elements (PDF 2.0 - Table 364) */ | |
| 416 case FZ_STRUCTURE_DOCUMENTFRAGMENT: | |
| 417 return extract_struct_DOCUMENTFRAGMENT; | |
| 418 /* Grouping elements (PDF 2.0 - Table 365) */ | |
| 419 case FZ_STRUCTURE_ASIDE: | |
| 420 return extract_struct_ASIDE; | |
| 421 /* Grouping elements (PDF 2.0 - Table 366) */ | |
| 422 case FZ_STRUCTURE_TITLE: | |
| 423 return extract_struct_TITLE; | |
| 424 case FZ_STRUCTURE_FENOTE: | |
| 425 return extract_struct_FENOTE; | |
| 426 /* Grouping elements (PDF 2.0 - Table 367) */ | |
| 427 case FZ_STRUCTURE_SUB: | |
| 428 return extract_struct_SUB; | |
| 429 | |
| 430 /* Paragraphlike elements (PDF 1.7 - Table 10.21) */ | |
| 431 case FZ_STRUCTURE_P: | |
| 432 return extract_struct_P; | |
| 433 case FZ_STRUCTURE_H: | |
| 434 return extract_struct_H; | |
| 435 case FZ_STRUCTURE_H1: | |
| 436 return extract_struct_H1; | |
| 437 case FZ_STRUCTURE_H2: | |
| 438 return extract_struct_H2; | |
| 439 case FZ_STRUCTURE_H3: | |
| 440 return extract_struct_H3; | |
| 441 case FZ_STRUCTURE_H4: | |
| 442 return extract_struct_H4; | |
| 443 case FZ_STRUCTURE_H5: | |
| 444 return extract_struct_H5; | |
| 445 case FZ_STRUCTURE_H6: | |
| 446 return extract_struct_H6; | |
| 447 | |
| 448 /* List elements (PDF 1.7 - Table 10.23) */ | |
| 449 case FZ_STRUCTURE_LIST: | |
| 450 return extract_struct_LIST; | |
| 451 case FZ_STRUCTURE_LISTITEM: | |
| 452 return extract_struct_LISTITEM; | |
| 453 case FZ_STRUCTURE_LABEL: | |
| 454 return extract_struct_LABEL; | |
| 455 case FZ_STRUCTURE_LISTBODY: | |
| 456 return extract_struct_LISTBODY; | |
| 457 | |
| 458 /* Table elements (PDF 1.7 - Table 10.24) */ | |
| 459 case FZ_STRUCTURE_TABLE: | |
| 460 return extract_struct_TABLE; | |
| 461 case FZ_STRUCTURE_TR: | |
| 462 return extract_struct_TR; | |
| 463 case FZ_STRUCTURE_TH: | |
| 464 return extract_struct_TH; | |
| 465 case FZ_STRUCTURE_TD: | |
| 466 return extract_struct_TD; | |
| 467 case FZ_STRUCTURE_THEAD: | |
| 468 return extract_struct_THEAD; | |
| 469 case FZ_STRUCTURE_TBODY: | |
| 470 return extract_struct_TBODY; | |
| 471 case FZ_STRUCTURE_TFOOT: | |
| 472 return extract_struct_TFOOT; | |
| 473 | |
| 474 /* Inline elements (PDF 1.7 - Table 10.25) */ | |
| 475 case FZ_STRUCTURE_SPAN: | |
| 476 return extract_struct_SPAN; | |
| 477 case FZ_STRUCTURE_QUOTE: | |
| 478 return extract_struct_QUOTE; | |
| 479 case FZ_STRUCTURE_NOTE: | |
| 480 return extract_struct_NOTE; | |
| 481 case FZ_STRUCTURE_REFERENCE: | |
| 482 return extract_struct_REFERENCE; | |
| 483 case FZ_STRUCTURE_BIBENTRY: | |
| 484 return extract_struct_BIBENTRY; | |
| 485 case FZ_STRUCTURE_CODE: | |
| 486 return extract_struct_CODE; | |
| 487 case FZ_STRUCTURE_LINK: | |
| 488 return extract_struct_LINK; | |
| 489 case FZ_STRUCTURE_ANNOT: | |
| 490 return extract_struct_ANNOT; | |
| 491 /* Inline elements (PDF 2.0 - Table 368) */ | |
| 492 case FZ_STRUCTURE_EM: | |
| 493 return extract_struct_EM; | |
| 494 case FZ_STRUCTURE_STRONG: | |
| 495 return extract_struct_STRONG; | |
| 496 | |
| 497 /* Ruby inline element (PDF 1.7 - Table 10.26) */ | |
| 498 case FZ_STRUCTURE_RUBY: | |
| 499 return extract_struct_RUBY; | |
| 500 case FZ_STRUCTURE_RB: | |
| 501 return extract_struct_RB; | |
| 502 case FZ_STRUCTURE_RT: | |
| 503 return extract_struct_RT; | |
| 504 case FZ_STRUCTURE_RP: | |
| 505 return extract_struct_RP; | |
| 506 | |
| 507 /* Warichu inline element (PDF 1.7 - Table 10.26) */ | |
| 508 case FZ_STRUCTURE_WARICHU: | |
| 509 return extract_struct_WARICHU; | |
| 510 case FZ_STRUCTURE_WT: | |
| 511 return extract_struct_WT; | |
| 512 case FZ_STRUCTURE_WP: | |
| 513 return extract_struct_WP; | |
| 514 | |
| 515 /* Illustration elements (PDF 1.7 - Table 10.27) */ | |
| 516 case FZ_STRUCTURE_FIGURE: | |
| 517 return extract_struct_FIGURE; | |
| 518 case FZ_STRUCTURE_FORMULA: | |
| 519 return extract_struct_FORMULA; | |
| 520 case FZ_STRUCTURE_FORM: | |
| 521 return extract_struct_FORM; | |
| 522 | |
| 523 /* Artifact structure type (PDF 2.0 - Table 375) */ | |
| 524 case FZ_STRUCTURE_ARTIFACT: | |
| 525 return extract_struct_ARTIFACT; | |
| 526 } | |
| 527 } | |
| 528 | |
| 529 static void | |
| 530 dev_begin_structure(fz_context *ctx, fz_device *dev_, fz_structure standard, const char *raw, int idx) | |
| 531 { | |
| 532 fz_docx_device *dev = (fz_docx_device *)dev_; | |
| 533 extract_t *extract = dev->writer->extract; | |
| 534 | |
| 535 assert(!dev->writer->ctx); | |
| 536 dev->writer->ctx = ctx; | |
| 537 fz_try(ctx) | |
| 538 { | |
| 539 if (extract_begin_struct(extract, fz_struct_to_extract(standard), idx, -1)) | |
| 540 fz_throw(ctx, FZ_ERROR_LIBRARY, "Failed to begin struct"); | |
| 541 } | |
| 542 fz_always(ctx) | |
| 543 dev->writer->ctx = NULL; | |
| 544 fz_catch(ctx) | |
| 545 fz_rethrow(ctx); | |
| 546 } | |
| 547 | |
| 548 static void | |
| 549 dev_end_structure(fz_context *ctx, fz_device *dev_) | |
| 550 { | |
| 551 fz_docx_device *dev = (fz_docx_device *)dev_; | |
| 552 extract_t *extract = dev->writer->extract; | |
| 553 | |
| 554 assert(!dev->writer->ctx); | |
| 555 dev->writer->ctx = ctx; | |
| 556 fz_try(ctx) | |
| 557 { | |
| 558 if (extract_end_struct(extract)) | |
| 559 fz_throw(ctx, FZ_ERROR_LIBRARY, "Failed to end struct"); | |
| 560 } | |
| 561 fz_always(ctx) | |
| 562 dev->writer->ctx = NULL; | |
| 563 fz_catch(ctx) | |
| 564 fz_rethrow(ctx); | |
| 565 } | |
| 566 | |
| 567 | |
| 568 static fz_device *writer_begin_page(fz_context *ctx, fz_document_writer *writer_, fz_rect mediabox) | |
| 569 { | |
| 570 fz_docx_writer *writer = (fz_docx_writer*) writer_; | |
| 571 fz_docx_device *dev; | |
| 572 assert(!writer->ctx); | |
| 573 writer->ctx = ctx; | |
| 574 writer->mediabox = mediabox; | |
| 575 fz_var(dev); | |
| 576 fz_try(ctx) | |
| 577 { | |
| 578 if (extract_page_begin(writer->extract, mediabox.x0, mediabox.y0, mediabox.x1, mediabox.y1)) | |
| 579 fz_throw(ctx, FZ_ERROR_LIBRARY, "Failed to begin page"); | |
| 580 dev = fz_new_derived_device(ctx, fz_docx_device); | |
| 581 dev->super.fill_text = dev_fill_text; | |
| 582 dev->super.stroke_text = dev_stroke_text; | |
| 583 dev->super.clip_text = dev_clip_text; | |
| 584 dev->super.clip_stroke_text = dev_clip_stroke_text; | |
| 585 dev->super.ignore_text = dev_ignore_text; | |
| 586 dev->super.fill_image = dev_fill_image; | |
| 587 dev->super.fill_path = dev_fill_path; | |
| 588 dev->super.stroke_path = dev_stroke_path; | |
| 589 dev->super.begin_structure = dev_begin_structure; | |
| 590 dev->super.end_structure = dev_end_structure; | |
| 591 dev->writer = writer; | |
| 592 } | |
| 593 fz_always(ctx) | |
| 594 { | |
| 595 writer->ctx = NULL; | |
| 596 } | |
| 597 fz_catch(ctx) | |
| 598 { | |
| 599 fz_rethrow(ctx); | |
| 600 } | |
| 601 return &dev->super; | |
| 602 } | |
| 603 | |
| 604 static void writer_end_page(fz_context *ctx, fz_document_writer *writer_, fz_device *dev) | |
| 605 { | |
| 606 fz_docx_writer *writer = (fz_docx_writer*) writer_; | |
| 607 assert(!writer->ctx); | |
| 608 writer->ctx = ctx; | |
| 609 fz_try(ctx) | |
| 610 { | |
| 611 fz_close_device(ctx, dev); | |
| 612 if (extract_page_end(writer->extract)) | |
| 613 fz_throw(ctx, FZ_ERROR_LIBRARY, "Failed to end page"); | |
| 614 | |
| 615 if (extract_process(writer->extract, writer->spacing, writer->rotation, writer->images)) | |
| 616 fz_throw(ctx, FZ_ERROR_LIBRARY, "Failed to process page"); | |
| 617 } | |
| 618 fz_always(ctx) | |
| 619 { | |
| 620 writer->ctx = NULL; | |
| 621 fz_drop_device(ctx, dev); | |
| 622 } | |
| 623 fz_catch(ctx) | |
| 624 { | |
| 625 fz_rethrow(ctx); | |
| 626 } | |
| 627 } | |
| 628 | |
| 629 static int buffer_write(void *handle, const void *source, size_t numbytes, size_t *o_actual) | |
| 630 /* | |
| 631 * extract_buffer_t callback that calls fz_write_data(). <source> will be docx | |
| 632 * archive data. | |
| 633 */ | |
| 634 { | |
| 635 int e = 0; | |
| 636 fz_docx_writer *writer = handle; | |
| 637 fz_var(e); | |
| 638 fz_try(writer->ctx) | |
| 639 { | |
| 640 fz_write_data(writer->ctx, writer->output, source, numbytes); | |
| 641 *o_actual = numbytes; | |
| 642 } | |
| 643 fz_catch(writer->ctx) | |
| 644 { | |
| 645 errno = EIO; | |
| 646 e = -1; | |
| 647 } | |
| 648 return e; | |
| 649 } | |
| 650 | |
| 651 static int buffer_cache(void *handle, void **o_cache, size_t *o_numbytes) | |
| 652 /* | |
| 653 * extract_buffer_t cache function. We simply return writer->output_cache. | |
| 654 */ | |
| 655 { | |
| 656 fz_docx_writer *writer = handle; | |
| 657 *o_cache = writer->output_cache; | |
| 658 *o_numbytes = sizeof(writer->output_cache); | |
| 659 return 0; | |
| 660 } | |
| 661 | |
| 662 static void writer_close(fz_context *ctx, fz_document_writer *writer_) | |
| 663 { | |
| 664 fz_docx_writer *writer = (fz_docx_writer*) writer_; | |
| 665 extract_buffer_t *extract_buffer_output = NULL; | |
| 666 | |
| 667 fz_var(extract_buffer_output); | |
| 668 fz_var(writer); | |
| 669 assert(!writer->ctx); | |
| 670 writer->ctx = ctx; | |
| 671 fz_try(ctx) | |
| 672 { | |
| 673 /* | |
| 674 * Write docx to writer->output. Need to create an | |
| 675 * extract_buffer_t that writes to writer->output, for use by | |
| 676 * extract_write(). | |
| 677 */ | |
| 678 if (extract_buffer_open( | |
| 679 writer->alloc, | |
| 680 writer, | |
| 681 NULL /*fn_read*/, | |
| 682 buffer_write, | |
| 683 buffer_cache, | |
| 684 NULL /*fn_close*/, | |
| 685 &extract_buffer_output | |
| 686 )) | |
| 687 { | |
| 688 fz_throw(ctx, FZ_ERROR_LIBRARY, "Failed to create extract_buffer_output: %s", strerror(errno)); | |
| 689 } | |
| 690 if (extract_write(writer->extract, extract_buffer_output)) | |
| 691 fz_throw(ctx, FZ_ERROR_LIBRARY, "Failed to generate docx content: %s", strerror(errno)); | |
| 692 if (extract_buffer_close(&extract_buffer_output)) | |
| 693 fz_throw(ctx, FZ_ERROR_LIBRARY, "Failed to close extract_buffer: %s", strerror(errno)); | |
| 694 | |
| 695 extract_end(&writer->extract); | |
| 696 fz_close_output(ctx, writer->output); | |
| 697 writer->ctx = NULL; | |
| 698 } | |
| 699 fz_catch(ctx) | |
| 700 { | |
| 701 /* | |
| 702 * We don't call fz_close_output() because it can throw and in | |
| 703 * this error case we can safely leave cleanup to our s_drop() | |
| 704 * function's calls to fz_drop_output(). | |
| 705 */ | |
| 706 extract_buffer_close(&extract_buffer_output); | |
| 707 extract_end(&writer->extract); | |
| 708 writer->ctx = NULL; | |
| 709 fz_rethrow(ctx); | |
| 710 } | |
| 711 } | |
| 712 | |
| 713 static void writer_drop(fz_context *ctx, fz_document_writer *writer_) | |
| 714 { | |
| 715 fz_docx_writer *writer = (fz_docx_writer*) writer_; | |
| 716 fz_drop_output(ctx, writer->output); | |
| 717 writer->output = NULL; | |
| 718 assert(!writer->ctx); | |
| 719 writer->ctx = ctx; | |
| 720 extract_end(&writer->extract); | |
| 721 extract_alloc_destroy(&writer->alloc); | |
| 722 writer->ctx = NULL; | |
| 723 } | |
| 724 | |
| 725 | |
| 726 static int get_bool_option(fz_context *ctx, const char *options, const char *name, int default_) | |
| 727 { | |
| 728 const char *value; | |
| 729 if (fz_has_option(ctx, options, name, &value)) | |
| 730 { | |
| 731 if (fz_option_eq(value, "yes")) return 1; | |
| 732 if (fz_option_eq(value, "no")) return 0; | |
| 733 else fz_throw(ctx, FZ_ERROR_SYNTAX, "option '%s' should be yes or no in options='%s'", name, options); | |
| 734 } | |
| 735 else | |
| 736 return default_; | |
| 737 } | |
| 738 | |
| 739 static double get_double_option(fz_context *ctx, const char *options, const char *name, double default_) | |
| 740 { | |
| 741 const char *value; | |
| 742 if (fz_has_option(ctx, options, name, &value)) | |
| 743 { | |
| 744 double ret = atof(value); | |
| 745 return ret; | |
| 746 } | |
| 747 else | |
| 748 return default_; | |
| 749 } | |
| 750 | |
| 751 static void *s_realloc_fn(void *state, void *prev, size_t size) | |
| 752 { | |
| 753 fz_docx_writer *writer = state; | |
| 754 assert(writer); | |
| 755 assert(writer->ctx); | |
| 756 return fz_realloc_no_throw(writer->ctx, prev, size); | |
| 757 } | |
| 758 | |
| 759 /* Will drop <out> if an error occurs. */ | |
| 760 static fz_document_writer *fz_new_docx_writer_internal(fz_context *ctx, fz_output *out, | |
| 761 const char *options, extract_format_t format) | |
| 762 { | |
| 763 fz_docx_writer *writer = NULL; | |
| 764 | |
| 765 fz_var(writer); | |
| 766 | |
| 767 fz_try(ctx) | |
| 768 { | |
| 769 double space_guess = get_double_option(ctx, options, "space-guess", 0); | |
| 770 writer = fz_new_derived_document_writer( | |
| 771 ctx, | |
| 772 fz_docx_writer, | |
| 773 writer_begin_page, | |
| 774 writer_end_page, | |
| 775 writer_close, | |
| 776 writer_drop | |
| 777 ); | |
| 778 writer->ctx = ctx; | |
| 779 writer->output = out; | |
| 780 if (get_bool_option(ctx, options, "html", 0)) format = extract_format_HTML; | |
| 781 if (get_bool_option(ctx, options, "text", 0)) format = extract_format_TEXT; | |
| 782 if (get_bool_option(ctx, options, "json", 0)) format = extract_format_JSON; | |
| 783 if (extract_alloc_create(s_realloc_fn, writer, &writer->alloc)) | |
| 784 fz_throw(ctx, FZ_ERROR_LIBRARY, "Failed to create extract_alloc instance"); | |
| 785 if (extract_begin(writer->alloc, format, &writer->extract)) | |
| 786 fz_throw(ctx, FZ_ERROR_LIBRARY, "Failed to create extract instance"); | |
| 787 if (space_guess) | |
| 788 extract_set_space_guess(writer->extract, space_guess); | |
| 789 writer->spacing = get_bool_option(ctx, options, "spacing", 0); | |
| 790 writer->rotation = get_bool_option(ctx, options, "rotation", 1); | |
| 791 writer->images = get_bool_option(ctx, options, "images", 1); | |
| 792 writer->mediabox_clip = get_bool_option(ctx, options, "mediabox-clip", 1); | |
| 793 if (extract_set_layout_analysis(writer->extract, get_bool_option(ctx, options, "analyse", 0))) | |
| 794 fz_throw(ctx, FZ_ERROR_LIBRARY, "extract_enable_analysis failed."); | |
| 795 { | |
| 796 const char* v; | |
| 797 if (fz_has_option(ctx, options, "tables-csv-format", &v)) | |
| 798 { | |
| 799 size_t len = strlen(v) + 1; /* Might include trailing options. */ | |
| 800 char* formatbuf = fz_malloc(ctx, len); | |
| 801 fz_copy_option(ctx, v, formatbuf, len); | |
| 802 fprintf(stderr, "tables-csv-format: %s\n", formatbuf); | |
| 803 if (extract_tables_csv_format(writer->extract, formatbuf)) | |
| 804 { | |
| 805 fz_free(ctx, formatbuf); | |
| 806 fz_throw(ctx, FZ_ERROR_LIBRARY, "extract_tables_csv_format() failed."); | |
| 807 } | |
| 808 fz_free(ctx, formatbuf); | |
| 809 } | |
| 810 } | |
| 811 writer->ctx = NULL; | |
| 812 } | |
| 813 fz_catch(ctx) | |
| 814 { | |
| 815 /* fz_drop_document_writer() drops its output so we only need to call | |
| 816 fz_drop_output() if we failed before creating the writer. */ | |
| 817 if (writer) | |
| 818 { | |
| 819 writer->ctx = ctx; | |
| 820 fz_drop_document_writer(ctx, &writer->super); | |
| 821 writer->ctx = NULL; | |
| 822 } | |
| 823 else | |
| 824 fz_drop_output(ctx, out); | |
| 825 fz_rethrow(ctx); | |
| 826 } | |
| 827 return &writer->super; | |
| 828 } | |
| 829 | |
| 830 fz_document_writer *fz_new_docx_writer_with_output(fz_context *ctx, fz_output *out, const char *options) | |
| 831 { | |
| 832 return fz_new_docx_writer_internal(ctx, out, options, extract_format_DOCX); | |
| 833 } | |
| 834 | |
| 835 fz_document_writer *fz_new_docx_writer(fz_context *ctx, const char *path, const char *options) | |
| 836 { | |
| 837 /* No need to drop <out> if fz_new_docx_writer_internal() throws, because | |
| 838 it always drops <out> if it fails. */ | |
| 839 fz_output *out = fz_new_output_with_path(ctx, path, 0 /*append*/); | |
| 840 return fz_new_docx_writer_internal(ctx, out, options, extract_format_DOCX); | |
| 841 } | |
| 842 | |
| 843 #if FZ_ENABLE_ODT_OUTPUT | |
| 844 | |
| 845 fz_document_writer *fz_new_odt_writer_with_output(fz_context *ctx, fz_output *out, const char *options) | |
| 846 { | |
| 847 return fz_new_docx_writer_internal(ctx, out, options, extract_format_ODT); | |
| 848 } | |
| 849 | |
| 850 fz_document_writer *fz_new_odt_writer(fz_context *ctx, const char *path, const char *options) | |
| 851 { | |
| 852 /* No need to drop <out> if fz_new_docx_writer_internal() throws, because | |
| 853 it always drops <out> if it fails. */ | |
| 854 fz_output *out = fz_new_output_with_path(ctx, path, 0 /*append*/); | |
| 855 return fz_new_docx_writer_internal(ctx, out, options, extract_format_ODT); | |
| 856 } | |
| 857 | |
| 858 #else | |
| 859 | |
| 860 fz_document_writer *fz_new_odt_writer_with_output(fz_context *ctx, fz_output *out, const char *options) | |
| 861 { | |
| 862 fz_throw(ctx, FZ_ERROR_UNSUPPORTED, "ODT writer not enabled"); | |
| 863 return NULL; | |
| 864 } | |
| 865 | |
| 866 fz_document_writer *fz_new_odt_writer(fz_context *ctx, const char *path, const char *options) | |
| 867 { | |
| 868 fz_throw(ctx, FZ_ERROR_UNSUPPORTED, "ODT writer not enabled"); | |
| 869 return NULL; | |
| 870 } | |
| 871 | |
| 872 #endif | |
| 873 | |
| 874 #else | |
| 875 | |
| 876 fz_document_writer *fz_new_odt_writer_with_output(fz_context *ctx, fz_output *out, const char *options) | |
| 877 { | |
| 878 fz_throw(ctx, FZ_ERROR_UNSUPPORTED, "DOCX/ODT writer not enabled"); | |
| 879 return NULL; | |
| 880 } | |
| 881 | |
| 882 fz_document_writer *fz_new_odt_writer(fz_context *ctx, const char *path, const char *options) | |
| 883 { | |
| 884 fz_throw(ctx, FZ_ERROR_UNSUPPORTED, "DOCX/ODT writer not enabled"); | |
| 885 return NULL; | |
| 886 } | |
| 887 | |
| 888 fz_document_writer *fz_new_docx_writer_with_output(fz_context *ctx, fz_output *out, const char *options) | |
| 889 { | |
| 890 fz_throw(ctx, FZ_ERROR_UNSUPPORTED, "DOCX writer not enabled"); | |
| 891 return NULL; | |
| 892 } | |
| 893 | |
| 894 fz_document_writer *fz_new_docx_writer(fz_context *ctx, const char *path, const char *options) | |
| 895 { | |
| 896 fz_throw(ctx, FZ_ERROR_UNSUPPORTED, "DOCX writer not enabled"); | |
| 897 return NULL; | |
| 898 } | |
| 899 | |
| 900 #endif |
