Mercurial > hgrepos > Python2 > PyMuPDF
comparison mupdf-source/thirdparty/extract/src/extract.c @ 2:b50eed0cc0ef upstream
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
The directory name has changed: no version number in the expanded directory now.
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Mon, 15 Sep 2025 11:43:07 +0200 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| 1:1d09e1dec1d9 | 2:b50eed0cc0ef |
|---|---|
| 1 #include "extract/extract.h" | |
| 2 #include "extract/alloc.h" | |
| 3 | |
| 4 #include "astring.h" | |
| 5 #include "document.h" | |
| 6 #include "docx.h" | |
| 7 #include "docx_template.h" | |
| 8 #include "html.h" | |
| 9 #include "json.h" | |
| 10 #include "mem.h" | |
| 11 #include "odt.h" | |
| 12 #include "odt_template.h" | |
| 13 #include "outf.h" | |
| 14 #include "xml.h" | |
| 15 #include "zip.h" | |
| 16 | |
| 17 | |
| 18 #include <assert.h> | |
| 19 #include <errno.h> | |
| 20 #include <math.h> | |
| 21 #include <stdarg.h> | |
| 22 #include <stdio.h> | |
| 23 #include <stdlib.h> | |
| 24 #include <string.h> | |
| 25 | |
| 26 | |
| 27 | |
| 28 const rect_t extract_rect_infinite = { { -DBL_MAX, -DBL_MAX }, { DBL_MAX, DBL_MAX } }; | |
| 29 const rect_t extract_rect_empty = { { DBL_MAX, DBL_MAX }, { -DBL_MAX, -DBL_MAX } }; | |
| 30 | |
| 31 | |
| 32 double extract_matrix_expansion(matrix4_t m) | |
| 33 { | |
| 34 return sqrt(fabs(m.a * m.d - m.b * m.c)); | |
| 35 } | |
| 36 | |
| 37 matrix4_t extract_matrix4_invert(const matrix4_t *ctm) | |
| 38 { | |
| 39 matrix4_t ctm_inverse = {1, 0, 0, 1}; | |
| 40 double ctm_det = ctm->a*ctm->d - ctm->b*ctm->c; | |
| 41 | |
| 42 if (ctm_det == 0) { | |
| 43 outf("cannot invert ctm=(%f %f %f %f)", | |
| 44 ctm->a, ctm->b, ctm->c, ctm->d); | |
| 45 } | |
| 46 else | |
| 47 { | |
| 48 ctm_inverse.a = +ctm->d / ctm_det; | |
| 49 ctm_inverse.b = -ctm->b / ctm_det; | |
| 50 ctm_inverse.c = -ctm->c / ctm_det; | |
| 51 ctm_inverse.d = +ctm->a / ctm_det; | |
| 52 } | |
| 53 | |
| 54 return ctm_inverse; | |
| 55 } | |
| 56 | |
| 57 static void char_init(char_t* item) | |
| 58 { | |
| 59 item->x = 0; | |
| 60 item->y = 0; | |
| 61 item->ucs = 0; | |
| 62 item->adv = 0; | |
| 63 item->bbox = extract_rect_empty; | |
| 64 } | |
| 65 | |
| 66 const char *extract_point_string(const point_t *point) | |
| 67 { | |
| 68 static char buffer[128]; | |
| 69 | |
| 70 snprintf(buffer, sizeof(buffer), "(%f %f)", point->x, point->y); | |
| 71 | |
| 72 return buffer; | |
| 73 } | |
| 74 | |
| 75 const char *extract_rect_string(const rect_t *rect) | |
| 76 { | |
| 77 static char buffer[2][256]; | |
| 78 static int i = 0; | |
| 79 | |
| 80 i = (i + 1) % 2; | |
| 81 snprintf(buffer[i], sizeof(buffer[i]), "((%f %f) (%f %f))", rect->min.x, rect->min.y, rect->max.x, rect->max.y); | |
| 82 | |
| 83 return buffer[i]; | |
| 84 } | |
| 85 | |
| 86 const char *extract_span_string(extract_alloc_t *alloc, span_t *span) | |
| 87 { | |
| 88 static extract_astring_t ret = {0}; | |
| 89 double x0 = 0; | |
| 90 double y0 = 0; | |
| 91 double x1 = 0; | |
| 92 double y1 = 0; | |
| 93 int c0 = 0; | |
| 94 int c1 = 0; | |
| 95 int i; | |
| 96 | |
| 97 extract_astring_free(alloc, &ret); | |
| 98 if (span == NULL) | |
| 99 { | |
| 100 /* This frees our internal data, and is used by extract_internal_end(). */ | |
| 101 return NULL; | |
| 102 } | |
| 103 | |
| 104 if (span->chars_num) { | |
| 105 c0 = span->chars[0].ucs; | |
| 106 x0 = span->chars[0].x; | |
| 107 y0 = span->chars[0].y; | |
| 108 c1 = span->chars[span->chars_num-1].ucs; | |
| 109 x1 = span->chars[span->chars_num-1].x; | |
| 110 y1 = span->chars[span->chars_num-1].y; | |
| 111 } | |
| 112 { | |
| 113 char buffer[400]; | |
| 114 snprintf(buffer, sizeof(buffer), | |
| 115 "span ctm=%s chars_num=%i (%c:%f,%f)..(%c:%f,%f) font=%s:(%f) wmode=%i chars_num=%i: ", | |
| 116 extract_matrix4_string(&span->ctm), | |
| 117 span->chars_num, | |
| 118 c0, x0, y0, | |
| 119 c1, x1, y1, | |
| 120 span->font_name, | |
| 121 extract_font_size(&span->ctm), | |
| 122 span->flags.wmode, | |
| 123 span->chars_num | |
| 124 ); | |
| 125 extract_astring_cat(alloc, &ret, buffer); | |
| 126 for (i=0; i<span->chars_num; ++i) { | |
| 127 snprintf( | |
| 128 buffer, | |
| 129 sizeof(buffer), | |
| 130 " i=%i {x=%f y=%f ucs=%i adv=%f}", | |
| 131 i, | |
| 132 span->chars[i].x, | |
| 133 span->chars[i].y, | |
| 134 span->chars[i].ucs, | |
| 135 span->chars[i].adv | |
| 136 ); | |
| 137 extract_astring_cat(alloc, &ret, buffer); | |
| 138 } | |
| 139 } | |
| 140 extract_astring_cat(alloc, &ret, ": "); | |
| 141 extract_astring_catc(alloc, &ret, '"'); | |
| 142 for (i=0; i<span->chars_num; ++i) | |
| 143 extract_astring_catc(alloc, &ret, (char) span->chars[i].ucs); | |
| 144 extract_astring_catc(alloc, &ret, '"'); | |
| 145 return ret.chars; | |
| 146 } | |
| 147 | |
| 148 char_t *extract_span_append_c(extract_alloc_t *alloc, span_t *span, int c) | |
| 149 { | |
| 150 char_t *item; | |
| 151 | |
| 152 if (extract_realloc2(alloc, | |
| 153 &span->chars, | |
| 154 sizeof(*span->chars) * span->chars_num, | |
| 155 sizeof(*span->chars) * (span->chars_num + 1))) | |
| 156 { | |
| 157 return NULL; | |
| 158 } | |
| 159 item = &span->chars[span->chars_num]; | |
| 160 span->chars_num += 1; | |
| 161 char_init(item); | |
| 162 item->ucs = c; | |
| 163 | |
| 164 return item; | |
| 165 } | |
| 166 | |
| 167 char_t *extract_span_char_last(span_t *span) | |
| 168 { | |
| 169 assert(span->chars_num > 0); | |
| 170 return &span->chars[span->chars_num-1]; | |
| 171 } | |
| 172 | |
| 173 /* Returns first span in a line. */ | |
| 174 span_t *extract_line_span_last(line_t *line) | |
| 175 { | |
| 176 assert(line->content.base.prev != &line->content.base && line->content.base.prev->type == content_span); | |
| 177 return (span_t *)line->content.base.prev; | |
| 178 } | |
| 179 | |
| 180 span_t *extract_line_span_first(line_t *line) | |
| 181 { | |
| 182 assert(line->content.base.next != &line->content.base && line->content.base.next->type == content_span); | |
| 183 return (span_t *)line->content.base.next; | |
| 184 } | |
| 185 | |
| 186 void extract_paragraph_free(extract_alloc_t *alloc, paragraph_t **pparagraph) | |
| 187 { | |
| 188 paragraph_t *paragraph = *pparagraph; | |
| 189 | |
| 190 if (paragraph == NULL) | |
| 191 return; | |
| 192 | |
| 193 content_unlink(¶graph->base); | |
| 194 content_clear(alloc, ¶graph->content); | |
| 195 extract_free(alloc, pparagraph); | |
| 196 } | |
| 197 | |
| 198 void extract_block_free(extract_alloc_t *alloc, block_t **pblock) | |
| 199 { | |
| 200 block_t *block = *pblock; | |
| 201 | |
| 202 if (block == NULL) | |
| 203 return; | |
| 204 | |
| 205 content_unlink(&block->base); | |
| 206 content_clear(alloc, &block->content); | |
| 207 extract_free(alloc, pblock); | |
| 208 } | |
| 209 | |
| 210 void extract_table_free(extract_alloc_t *alloc, table_t **ptable) | |
| 211 { | |
| 212 int c; | |
| 213 table_t *table = *ptable; | |
| 214 | |
| 215 content_unlink(&table->base); | |
| 216 for (c = 0; c< table->cells_num_x * table->cells_num_y; ++c) | |
| 217 { | |
| 218 extract_cell_free(alloc, &table->cells[c]); | |
| 219 } | |
| 220 extract_free(alloc, &table->cells); | |
| 221 extract_free(alloc, ptable); | |
| 222 } | |
| 223 | |
| 224 static void | |
| 225 structure_clear(extract_alloc_t *alloc, structure_t *structure) | |
| 226 { | |
| 227 while (structure != NULL) | |
| 228 { | |
| 229 structure_t *next = structure->sibling_next; | |
| 230 structure_clear(alloc, structure->kids_first); | |
| 231 extract_free(alloc, &structure); | |
| 232 structure = next; | |
| 233 } | |
| 234 } | |
| 235 | |
| 236 void extract_subpage_free(extract_alloc_t *alloc, subpage_t **psubpage) | |
| 237 { | |
| 238 subpage_t *subpage = *psubpage; | |
| 239 | |
| 240 if (!subpage) return; | |
| 241 | |
| 242 content_clear(alloc, &subpage->content); | |
| 243 content_clear(alloc, &subpage->tables); | |
| 244 | |
| 245 extract_free(alloc, &subpage->tablelines_horizontal.tablelines); | |
| 246 extract_free(alloc, &subpage->tablelines_vertical.tablelines); | |
| 247 | |
| 248 extract_free(alloc, psubpage); | |
| 249 } | |
| 250 | |
| 251 static void page_free(extract_alloc_t *alloc, extract_page_t **ppage) | |
| 252 { | |
| 253 int c; | |
| 254 extract_page_t *page = *ppage; | |
| 255 | |
| 256 if (!page) return; | |
| 257 | |
| 258 for (c=0; c<page->subpages_num; ++c) | |
| 259 { | |
| 260 subpage_t *subpage = page->subpages[c]; | |
| 261 extract_subpage_free(alloc, &subpage); | |
| 262 } | |
| 263 extract_split_free(alloc, &page->split); | |
| 264 extract_free(alloc, &page->subpages); | |
| 265 extract_free(alloc, ppage); | |
| 266 } | |
| 267 | |
| 268 void content_append(content_root_t *root, content_t *content) | |
| 269 { | |
| 270 assert(root && root->base.type == content_root); | |
| 271 | |
| 272 /* Unlink content from anywhere it might be. */ | |
| 273 content_unlink(content); | |
| 274 | |
| 275 /* Sanity check root. */ | |
| 276 if (root->base.next == &root->base) | |
| 277 { | |
| 278 assert(root->base.prev == &root->base); | |
| 279 } | |
| 280 | |
| 281 /* And append content */ | |
| 282 content->next = &root->base; | |
| 283 content->prev = root->base.prev; | |
| 284 content->prev->next = content; | |
| 285 root->base.prev = content; | |
| 286 } | |
| 287 | |
| 288 void content_append_span(content_root_t *root, span_t *span) | |
| 289 { | |
| 290 content_append(root, &span->base); | |
| 291 } | |
| 292 | |
| 293 void content_append_line(content_root_t *root, line_t *line) | |
| 294 { | |
| 295 content_append(root, &line->base); | |
| 296 } | |
| 297 | |
| 298 void content_append_paragraph(content_root_t *root, paragraph_t *paragraph) | |
| 299 { | |
| 300 content_append(root, ¶graph->base); | |
| 301 } | |
| 302 | |
| 303 void content_append_block(content_root_t *root, block_t *block) | |
| 304 { | |
| 305 content_append(root, &block->base); | |
| 306 } | |
| 307 | |
| 308 int content_new_root(extract_alloc_t *alloc, content_root_t **proot) | |
| 309 { | |
| 310 if (extract_malloc(alloc, proot, sizeof(**proot))) return -1; | |
| 311 content_init_root(*proot, NULL); | |
| 312 | |
| 313 return 0; | |
| 314 } | |
| 315 | |
| 316 int content_new_span(extract_alloc_t *alloc, span_t **pspan, structure_t *structure) | |
| 317 { | |
| 318 if (extract_malloc(alloc, pspan, sizeof(**pspan))) return -1; | |
| 319 extract_span_init(*pspan, structure); | |
| 320 | |
| 321 return 0; | |
| 322 } | |
| 323 | |
| 324 int content_new_line(extract_alloc_t *alloc, line_t **pline) | |
| 325 { | |
| 326 if (extract_malloc(alloc, pline, sizeof(**pline))) return -1; | |
| 327 extract_line_init(*pline); | |
| 328 | |
| 329 return 0; | |
| 330 } | |
| 331 | |
| 332 int content_new_paragraph(extract_alloc_t *alloc, paragraph_t **pparagraph) | |
| 333 { | |
| 334 if (extract_malloc(alloc, pparagraph, sizeof(**pparagraph))) return -1; | |
| 335 extract_paragraph_init(*pparagraph); | |
| 336 | |
| 337 return 0; | |
| 338 } | |
| 339 | |
| 340 int content_new_block(extract_alloc_t *alloc, block_t **pblock) | |
| 341 { | |
| 342 if (extract_malloc(alloc, pblock, sizeof(**pblock))) return -1; | |
| 343 extract_block_init(*pblock); | |
| 344 | |
| 345 return 0; | |
| 346 } | |
| 347 | |
| 348 int content_new_table(extract_alloc_t *alloc, table_t **ptable) | |
| 349 { | |
| 350 if (extract_malloc(alloc, ptable, sizeof(**ptable))) return -1; | |
| 351 extract_table_init(*ptable); | |
| 352 | |
| 353 return 0; | |
| 354 } | |
| 355 | |
| 356 /* Appends new empty span content to a content_list_t; returns -1 with errno set on error. */ | |
| 357 int content_append_new_span(extract_alloc_t *alloc, content_root_t *root, span_t **pspan, structure_t *structure) | |
| 358 { | |
| 359 if (content_new_span(alloc, pspan, structure)) return -1; | |
| 360 content_append(root, &(*pspan)->base); | |
| 361 | |
| 362 return 0; | |
| 363 } | |
| 364 | |
| 365 /* Appends new empty line content to a content_list_t; returns -1 with errno set on error. */ | |
| 366 int content_append_new_line(extract_alloc_t *alloc, content_root_t *root, line_t **pline) | |
| 367 { | |
| 368 if (content_new_line(alloc, pline)) return -1; | |
| 369 content_append(root, &(*pline)->base); | |
| 370 | |
| 371 return 0; | |
| 372 } | |
| 373 | |
| 374 /* Appends new empty paragraph content to a content_list_t; returns -1 with errno set on error. */ | |
| 375 int content_append_new_paragraph(extract_alloc_t *alloc, content_root_t *root, paragraph_t **pparagraph) | |
| 376 { | |
| 377 if (content_new_paragraph(alloc, pparagraph)) return -1; | |
| 378 content_append(root, &(*pparagraph)->base); | |
| 379 | |
| 380 return 0; | |
| 381 } | |
| 382 | |
| 383 /* Appends new empty block content to a content_list_t; returns -1 with errno set on error. */ | |
| 384 int content_append_new_block(extract_alloc_t *alloc, content_root_t *root, block_t **pblock) | |
| 385 { | |
| 386 if (content_new_block(alloc, pblock)) return -1; | |
| 387 content_append(root, &(*pblock)->base); | |
| 388 | |
| 389 return 0; | |
| 390 } | |
| 391 | |
| 392 /* Appends new empty table content to a content_list_t; returns -1 with errno set on error. */ | |
| 393 int content_append_new_table(extract_alloc_t *alloc, content_root_t *root, table_t **ptable) | |
| 394 { | |
| 395 if (content_new_table(alloc, ptable)) return -1; | |
| 396 content_append(root, &(*ptable)->base); | |
| 397 | |
| 398 return 0; | |
| 399 } | |
| 400 | |
| 401 /* Appends new empty image content to a content_list_t; returns -1 with errno set on error. */ | |
| 402 int content_append_new_image(extract_alloc_t *alloc, content_root_t *root, image_t **pimage) | |
| 403 { | |
| 404 if (extract_malloc(alloc, pimage, sizeof(**pimage))) return -1; | |
| 405 extract_image_init(*pimage); | |
| 406 content_append(root, &(*pimage)->base); | |
| 407 | |
| 408 return 0; | |
| 409 } | |
| 410 | |
| 411 void content_replace(content_t *current, content_t *replacement) | |
| 412 { | |
| 413 assert(current->type != content_root && replacement->type != content_root); | |
| 414 /* Unlink replacement. */ | |
| 415 if (replacement->prev) | |
| 416 { | |
| 417 replacement->prev->next = replacement->next; | |
| 418 replacement->next->prev = replacement->prev; | |
| 419 } | |
| 420 /* Insert replacement */ | |
| 421 replacement->prev = current->prev; | |
| 422 current->prev->next = replacement; | |
| 423 replacement->next = current->next; | |
| 424 current->next->prev = replacement; | |
| 425 /* Unlink current */ | |
| 426 current->prev = NULL; | |
| 427 current->next = NULL; | |
| 428 } | |
| 429 | |
| 430 /* Replaces current element with a new empty paragraph content; returns -1 with errno set on error. */ | |
| 431 int content_replace_new_paragraph(extract_alloc_t *alloc, content_t *current, paragraph_t **pparagraph) | |
| 432 { | |
| 433 if (content_new_paragraph(alloc, pparagraph)) return -1; | |
| 434 content_replace(current, &(*pparagraph)->base); | |
| 435 | |
| 436 return 0; | |
| 437 } | |
| 438 | |
| 439 /* Replaces current element with a new empty block content; returns -1 with errno set on error. */ | |
| 440 int content_replace_new_block(extract_alloc_t *alloc, content_t *current, block_t **pblock) | |
| 441 { | |
| 442 if (content_new_block(alloc, pblock)) return -1; | |
| 443 content_replace(current, &(*pblock)->base); | |
| 444 | |
| 445 return 0; | |
| 446 } | |
| 447 | |
| 448 /* Replaces current element with a new empty line content; returns -1 with errno set on error. */ | |
| 449 int content_replace_new_line(extract_alloc_t *alloc, content_t *current, line_t **pline) | |
| 450 { | |
| 451 if (content_new_line(alloc, pline)) return -1; | |
| 452 content_replace(current, &(*pline)->base); | |
| 453 | |
| 454 return 0; | |
| 455 } | |
| 456 | |
| 457 static void extract_images_free(extract_alloc_t *alloc, images_t *images) | |
| 458 { | |
| 459 int i; | |
| 460 for (i=0; i<images->images_num; ++i) { | |
| 461 extract_image_clear(alloc, images->images[i]); | |
| 462 extract_free(alloc, &images->images[i]); | |
| 463 } | |
| 464 extract_free(alloc, &images->images); | |
| 465 extract_free(alloc, &images->imagetypes); | |
| 466 images->images_num = 0; | |
| 467 images->imagetypes_num = 0; | |
| 468 } | |
| 469 | |
| 470 | |
| 471 /* Move image_t's from document->subpage[] to *o_images. | |
| 472 | |
| 473 On return document->subpage[].images* will be NULL etc. | |
| 474 */ | |
| 475 static int | |
| 476 extract_document_images(extract_alloc_t *alloc, document_t *document, images_t *o_images) | |
| 477 { | |
| 478 int e = -1; | |
| 479 int p; | |
| 480 images_t images = {0}; | |
| 481 | |
| 482 outf("extract_document_images(): images.images_num=%i", images.images_num); | |
| 483 for (p=0; p<document->pages_num; ++p) | |
| 484 { | |
| 485 extract_page_t *page = document->pages[p]; | |
| 486 int c; | |
| 487 for (c=0; c<page->subpages_num; ++c) | |
| 488 { | |
| 489 subpage_t *subpage = page->subpages[c]; | |
| 490 content_image_iterator iit; | |
| 491 image_t *image; | |
| 492 int i; | |
| 493 | |
| 494 for (i = 0, image = content_image_iterator_init(&iit, &subpage->content); image != NULL; i++, image = content_image_iterator_next(&iit)) | |
| 495 { | |
| 496 if (extract_realloc2(alloc, | |
| 497 &images.images, | |
| 498 sizeof(image_t) * images.images_num, | |
| 499 sizeof(image_t) * (images.images_num + 1))) goto end; | |
| 500 outf("p=%i i=%i image->name=%s image->id=%s", p, i, image->name, image->id); | |
| 501 assert(image->name); | |
| 502 content_unlink(&image->base); | |
| 503 images.images[images.images_num] = image; | |
| 504 images.images_num += 1; | |
| 505 | |
| 506 /* Add image type if we haven't seen it before. */ | |
| 507 { | |
| 508 int it; | |
| 509 for (it=0; it<images.imagetypes_num; ++it) | |
| 510 { | |
| 511 outf("it=%i images.imagetypes[it]=%s image->type=%s", | |
| 512 it, images.imagetypes[it], image->type); | |
| 513 if (!strcmp(images.imagetypes[it], image->type)) | |
| 514 { | |
| 515 break; | |
| 516 } | |
| 517 } | |
| 518 if (it == images.imagetypes_num) | |
| 519 { | |
| 520 /* We haven't seen this image type before. */ | |
| 521 if (extract_realloc2( | |
| 522 alloc, | |
| 523 &images.imagetypes, | |
| 524 sizeof(char*) * images.imagetypes_num, | |
| 525 sizeof(char*) * (images.imagetypes_num + 1) | |
| 526 )) goto end; | |
| 527 assert(image->type); | |
| 528 images.imagetypes[images.imagetypes_num] = image->type; | |
| 529 images.imagetypes_num += 1; | |
| 530 outf("have added images.imagetypes_num=%i", images.imagetypes_num); | |
| 531 } | |
| 532 } | |
| 533 } | |
| 534 } | |
| 535 } | |
| 536 | |
| 537 e = 0; | |
| 538 end: | |
| 539 | |
| 540 if (e) | |
| 541 { | |
| 542 extract_free(alloc, &images.images); | |
| 543 } | |
| 544 else | |
| 545 { | |
| 546 *o_images = images; | |
| 547 } | |
| 548 | |
| 549 return e; | |
| 550 } | |
| 551 | |
| 552 static void extract_document_free(extract_alloc_t *alloc, document_t *document) | |
| 553 { | |
| 554 int p; | |
| 555 | |
| 556 if (!document) return; | |
| 557 | |
| 558 for (p=0; p<document->pages_num; ++p) | |
| 559 { | |
| 560 page_free(alloc, &document->pages[p]); | |
| 561 } | |
| 562 extract_free(alloc, &document->pages); | |
| 563 document->pages = NULL; | |
| 564 document->pages_num = 0; | |
| 565 | |
| 566 structure_clear(alloc, document->structure); | |
| 567 } | |
| 568 | |
| 569 | |
/* Returns +1, 0 or -1 depending on sign of x. Zero (of either sign) and NaN
 * give 0, because neither comparison below is true for them. */
static int s_sign(double x)
{
    return (x > 0) - (x < 0);
}
| 578 | |
| 579 int extract_matrix4_cmp(const matrix4_t *lhs, const matrix4_t *rhs) | |
| 580 { | |
| 581 int ret; | |
| 582 | |
| 583 ret = s_sign(lhs->a - rhs->a); if (ret) return ret; | |
| 584 ret = s_sign(lhs->b - rhs->b); if (ret) return ret; | |
| 585 ret = s_sign(lhs->c - rhs->c); if (ret) return ret; | |
| 586 ret = s_sign(lhs->d - rhs->d); if (ret) return ret; | |
| 587 | |
| 588 return 0; | |
| 589 } | |
| 590 | |
| 591 point_t extract_matrix4_transform_point(matrix4_t m, point_t p) | |
| 592 { | |
| 593 double x = p.x; | |
| 594 | |
| 595 p.x = m.a * x + m.c * p.y; | |
| 596 p.y = m.b * x + m.d * p.y; | |
| 597 | |
| 598 return p; | |
| 599 } | |
| 600 | |
| 601 point_t extract_matrix4_transform_xy(matrix4_t m, double x, double y) | |
| 602 { | |
| 603 point_t p; | |
| 604 | |
| 605 p.x = m.a * x + m.c * y; | |
| 606 p.y = m.b * x + m.d * y; | |
| 607 | |
| 608 return p; | |
| 609 } | |
| 610 | |
| 611 matrix_t extract_multiply_matrix_matrix(matrix_t m1, matrix_t m2) | |
| 612 { | |
| 613 matrix_t ret; | |
| 614 | |
| 615 ret.a = m1.a * m2.a + m1.b * m2.c; | |
| 616 ret.b = m1.a * m2.b + m1.b * m2.d; | |
| 617 ret.c = m1.c * m2.a + m1.d * m2.c; | |
| 618 ret.d = m1.c * m2.b + m1.d * m2.d; | |
| 619 ret.e = m1.e * m2.a + m1.f * m2.c + m2.e; | |
| 620 ret.f = m1.e * m2.b + m1.f * m2.d + m2.f; | |
| 621 | |
| 622 return ret; | |
| 623 } | |
| 624 | |
| 625 matrix4_t extract_multiply_matrix4_matrix4(matrix4_t m1, matrix4_t m2) | |
| 626 { | |
| 627 matrix4_t ret; | |
| 628 | |
| 629 ret.a = m1.a * m2.a + m1.b * m2.c; | |
| 630 ret.b = m1.a * m2.b + m1.b * m2.d; | |
| 631 ret.c = m1.c * m2.a + m1.d * m2.c; | |
| 632 ret.d = m1.c * m2.b + m1.d * m2.d; | |
| 633 | |
| 634 return ret; | |
| 635 } | |
| 636 | |
| 637 static int s_matrix_read(const char *text, matrix_t *matrix) | |
| 638 { | |
| 639 int n; | |
| 640 | |
| 641 if (!text) { | |
| 642 outf("text is NULL in s_matrix_read()"); | |
| 643 errno = EINVAL; | |
| 644 return -1; | |
| 645 } | |
| 646 n = sscanf(text, | |
| 647 "%lf %lf %lf %lf %lf %lf", | |
| 648 &matrix->a, | |
| 649 &matrix->b, | |
| 650 &matrix->c, | |
| 651 &matrix->d, | |
| 652 &matrix->e, | |
| 653 &matrix->f); | |
| 654 if (n != 6) { | |
| 655 errno = EINVAL; | |
| 656 return -1; | |
| 657 } | |
| 658 | |
| 659 return 0; | |
| 660 } | |
| 661 | |
| 662 | |
| 663 static void document_init(document_t *document) | |
| 664 { | |
| 665 document->pages = NULL; | |
| 666 document->pages_num = 0; | |
| 667 | |
| 668 document->structure = NULL; | |
| 669 document->current = NULL; | |
| 670 } | |
| 671 | |
| 672 /* If we exceed MAX_STRUCT_NEST then this probably indicates that | |
| 673 * structure nesting is not to be trusted. */ | |
| 674 #define MAX_STRUCT_NEST 64 | |
| 675 | |
| 676 struct extract_t | |
| 677 { | |
| 678 extract_alloc_t *alloc; | |
| 679 int layout_analysis; | |
| 680 double master_space_guess; | |
| 681 document_t document; | |
| 682 | |
| 683 /* Number of extra spans from subpage_span_end_clean(). */ | |
| 684 int num_spans_split; | |
| 685 | |
| 686 /* Number of extra spans from autosplit=1. */ | |
| 687 int num_spans_autosplit; | |
| 688 | |
| 689 /* Only used if autosplit is non-zero. */ | |
| 690 double span_offset_x; | |
| 691 double span_offset_y; | |
| 692 | |
| 693 /* Used to generate unique ids for images. */ | |
| 694 int image_n; | |
| 695 | |
| 696 /* List of strings that are the generated docx content for each page. When | |
| 697 * zip_* can handle appending of data, we will be able to remove this list. */ | |
| 698 extract_astring_t *contentss; | |
| 699 int contentss_num; | |
| 700 | |
| 701 images_t images; | |
| 702 | |
| 703 extract_format_t format; | |
| 704 extract_odt_styles_t odt_styles; | |
| 705 | |
| 706 char *tables_csv_format; | |
| 707 int tables_csv_i; | |
| 708 | |
| 709 enum | |
| 710 { | |
| 711 path_type_NONE, | |
| 712 path_type_FILL, | |
| 713 path_type_STROKE, | |
| 714 } path_type; | |
| 715 | |
| 716 union | |
| 717 { | |
| 718 struct | |
| 719 { | |
| 720 matrix_t ctm; | |
| 721 double color; | |
| 722 point_t points[4]; | |
| 723 int n; | |
| 724 } fill; | |
| 725 | |
| 726 struct | |
| 727 { | |
| 728 matrix_t ctm; | |
| 729 double color; | |
| 730 double width; | |
| 731 point_t point0; | |
| 732 int point0_set; | |
| 733 point_t point; | |
| 734 int point_set; | |
| 735 } stroke; | |
| 736 } path; | |
| 737 | |
| 738 int next_uid; | |
| 739 }; | |
| 740 | |
| 741 int extract_begin(extract_alloc_t *alloc, | |
| 742 extract_format_t format, | |
| 743 extract_t **pextract) | |
| 744 { | |
| 745 extract_t *extract; | |
| 746 | |
| 747 *pextract = NULL; | |
| 748 if (1 | |
| 749 && format != extract_format_ODT | |
| 750 && format != extract_format_DOCX | |
| 751 && format != extract_format_HTML | |
| 752 && format != extract_format_TEXT | |
| 753 && format != extract_format_JSON | |
| 754 ) | |
| 755 { | |
| 756 outf0("Invalid format=%i\n", format); | |
| 757 errno = EINVAL; | |
| 758 return -1; | |
| 759 } | |
| 760 | |
| 761 /* Create the extract structure. */ | |
| 762 if (extract_malloc(alloc, &extract, sizeof(*extract))) | |
| 763 return -1; | |
| 764 | |
| 765 extract_bzero(extract, sizeof(*extract)); | |
| 766 extract->alloc = alloc; | |
| 767 extract->master_space_guess = 0.5; | |
| 768 document_init(&extract->document); | |
| 769 | |
| 770 /* FIXME: Start at 10 because template document might use some low-numbered IDs. | |
| 771 */ | |
| 772 extract->image_n = 10; | |
| 773 | |
| 774 extract->format = format; | |
| 775 extract->tables_csv_format = NULL; | |
| 776 extract->tables_csv_i = 0; | |
| 777 | |
| 778 extract->next_uid = 1; | |
| 779 | |
| 780 *pextract = extract; | |
| 781 | |
| 782 return 0; | |
| 783 } | |
| 784 | |
| 785 void extract_set_space_guess(extract_t *extract, double space_guess) | |
| 786 { | |
| 787 extract->master_space_guess = space_guess; | |
| 788 } | |
| 789 | |
| 790 int extract_set_layout_analysis(extract_t *extract, int enable) | |
| 791 { | |
| 792 extract->layout_analysis = enable; | |
| 793 return 0; | |
| 794 } | |
| 795 | |
| 796 int extract_tables_csv_format(extract_t *extract, const char *path_format) | |
| 797 { | |
| 798 return extract_strdup(extract->alloc, path_format, &extract->tables_csv_format); | |
| 799 } | |
| 800 | |
| 801 | |
/* Image-data release callback passed to extract_add_image(): frees
 * <image_data> with free(). <handle> is unused. */
static void image_free_fn(void *handle, void *image_data)
{
    free(image_data);
    (void) handle;
}
| 807 | |
| 808 int extract_read_intermediate(extract_t *extract, extract_buffer_t *buffer) | |
| 809 { | |
| 810 int ret = -1; | |
| 811 document_t *document = &extract->document; | |
| 812 char *image_data = NULL; | |
| 813 int num_spans = 0; | |
| 814 extract_xml_tag_t tag; | |
| 815 | |
| 816 extract_xml_tag_init(&tag); | |
| 817 | |
| 818 if (extract_xml_pparse_init(extract->alloc, buffer, NULL /*first_line*/)) { | |
| 819 outf("Failed to read start of intermediate data: %s", strerror(errno)); | |
| 820 goto end; | |
| 821 } | |
| 822 /* Data read from <path> is expected to be XML looking like: | |
| 823 | |
| 824 <page> | |
| 825 <span> | |
| 826 <char ...> | |
| 827 <char ...> | |
| 828 ... | |
| 829 </span> | |
| 830 <span> | |
| 831 ... | |
| 832 </span> | |
| 833 ... | |
| 834 </page> | |
| 835 <page> | |
| 836 ... | |
| 837 </page> | |
| 838 ... | |
| 839 | |
| 840 We convert this into a list of subpage_t's, each containing a list of | |
| 841 span_t's, each containing a list of char_t's. | |
| 842 | |
| 843 While doing this, we do some within-span processing by calling | |
| 844 subpage_span_end_clean(): | |
| 845 Remove spurious spaces. | |
| 846 Split spans in two where there seem to be large gaps between glyphs. | |
| 847 */ | |
| 848 for(;;) { | |
| 849 extract_page_t *page; | |
| 850 subpage_t *subpage; | |
| 851 rect_t mediabox = extract_rect_infinite; /* Fake mediabox */ | |
| 852 int e = extract_xml_pparse_next(buffer, &tag); | |
| 853 | |
| 854 if (e == 1) break; /* EOF. */ | |
| 855 if (e) goto end; | |
| 856 if (!strcmp(tag.name, "?xml")) { | |
| 857 /* We simply skip this if we find it. As of 2020-07-31, mutool adds | |
| 858 this header to mupdf raw output, but gs txtwrite does not include | |
| 859 it. */ | |
| 860 continue; | |
| 861 } | |
| 862 if (strcmp(tag.name, "page")) { | |
| 863 outf("Expected <page> but tag.name='%s'", tag.name); | |
| 864 errno = ESRCH; | |
| 865 goto end; | |
| 866 } | |
| 867 outfx("loading spans for page %i...", document->pages_num); | |
| 868 if (extract_page_begin(extract, mediabox.min.x, mediabox.min.y, mediabox.max.x, mediabox.max.y)) goto end; | |
| 869 page = extract->document.pages[extract->document.pages_num-1]; | |
| 870 if (!page) goto end; | |
| 871 subpage = page->subpages[page->subpages_num-1]; | |
| 872 if (!subpage) goto end; | |
| 873 | |
| 874 for(;;) { | |
| 875 if (extract_xml_pparse_next(buffer, &tag)) goto end; | |
| 876 if (!strcmp(tag.name, "/page")) { | |
| 877 num_spans += content_count_spans(&subpage->content); | |
| 878 break; | |
| 879 } | |
| 880 if (!strcmp(tag.name, "image")) { | |
| 881 const char* type = extract_xml_tag_attributes_find(&tag, "type"); | |
| 882 if (!type) { | |
| 883 errno = EINVAL; | |
| 884 goto end; | |
| 885 } | |
| 886 outf("image type=%s", type); | |
| 887 if (!strcmp(type, "pixmap")) { | |
| 888 int w; | |
| 889 int h; | |
| 890 int y; | |
| 891 if (extract_xml_tag_attributes_find_int(&tag, "w", &w)) goto end; | |
| 892 if (extract_xml_tag_attributes_find_int(&tag, "h", &h)) goto end; | |
| 893 for (y=0; y<h; ++y) { | |
| 894 int yy; | |
| 895 if (extract_xml_pparse_next(buffer, &tag)) goto end; | |
| 896 if (strcmp(tag.name, "line")) { | |
| 897 outf("Expected <line> but tag.name='%s'", tag.name); | |
| 898 errno = ESRCH; | |
| 899 goto end; | |
| 900 } | |
| 901 if (extract_xml_tag_attributes_find_int(&tag, "y", &yy)) goto end; | |
| 902 if (yy != y) { | |
| 903 outf("Expected <line y=%i> but found <line y=%i>", y, yy); | |
| 904 errno = ESRCH; | |
| 905 goto end; | |
| 906 } | |
| 907 if (extract_xml_pparse_next(buffer, &tag)) goto end; | |
| 908 if (strcmp(tag.name, "/line")) { | |
| 909 outf("Expected </line> but tag.name='%s'", tag.name); | |
| 910 errno = ESRCH; | |
| 911 goto end; | |
| 912 } | |
| 913 } | |
| 914 } | |
| 915 else { | |
| 916 /* Compressed. */ | |
| 917 size_t image_data_size; | |
| 918 const char *c; | |
| 919 size_t i; | |
| 920 if (extract_xml_tag_attributes_find_size(&tag, "datasize", &image_data_size)) goto end; | |
| 921 if (extract_malloc(extract->alloc, &image_data, image_data_size)) goto end; | |
| 922 c = tag.text.chars; | |
| 923 for(i=0;;) { | |
| 924 int byte = 0; | |
| 925 int cc; | |
| 926 cc = *c; | |
| 927 c += 1; | |
| 928 if (cc == ' ' || cc == '\n') continue; | |
| 929 if (cc >= '0' && cc <= '9') byte += cc-'0'; | |
| 930 else if (cc >= 'a' && cc <= 'f') byte += 10 + cc - 'a'; | |
| 931 else goto compressed_error; | |
| 932 byte *= 16; | |
| 933 | |
| 934 cc = *c; | |
| 935 c += 1; | |
| 936 if (cc >= '0' && cc <= '9') byte += cc-'0'; | |
| 937 else if (cc >= 'a' && cc <= 'f') byte += 10 + cc - 'a'; | |
| 938 else goto compressed_error; | |
| 939 | |
| 940 image_data[i] = (char) byte; | |
| 941 i += 1; | |
| 942 if (i == image_data_size) { | |
| 943 break; | |
| 944 } | |
| 945 continue; | |
| 946 | |
| 947 compressed_error: | |
| 948 outf("Unrecognised hex character '%x' at offset %lli in image data", cc, (long long) (c-tag.text.chars)); | |
| 949 errno = EINVAL; | |
| 950 goto end; | |
| 951 } | |
| 952 if (extract_add_image( | |
| 953 extract, | |
| 954 type, | |
| 955 0 /*x*/, | |
| 956 0 /*y*/, | |
| 957 0 /*w*/, | |
| 958 0 /*h*/, | |
| 959 image_data, | |
| 960 image_data_size, | |
| 961 image_free_fn, | |
| 962 NULL | |
| 963 )) | |
| 964 { | |
| 965 goto end; | |
| 966 } | |
| 967 image_data = NULL; | |
| 968 } | |
| 969 if (extract_xml_pparse_next(buffer, &tag)) goto end; | |
| 970 if (strcmp(tag.name, "/image")) { | |
| 971 outf("Expected </image> but tag.name='%s'", tag.name); | |
| 972 errno = ESRCH; | |
| 973 goto end; | |
| 974 } | |
| 975 continue; | |
| 976 } | |
| 977 if (strcmp(tag.name, "span")) { | |
| 978 outf("Expected <span> but tag.name='%s'", tag.name); | |
| 979 errno = ESRCH; | |
| 980 goto end; | |
| 981 } | |
| 982 | |
| 983 { | |
| 984 matrix_t ctm; | |
| 985 matrix_t trm; | |
| 986 char *font_name; | |
| 987 char *font_name2; | |
| 988 int font_bold; | |
| 989 int font_italic; | |
| 990 int wmode; | |
| 991 if (s_matrix_read(extract_xml_tag_attributes_find(&tag, "ctm"), &ctm)) goto end; | |
| 992 if (s_matrix_read(extract_xml_tag_attributes_find(&tag, "trm"), &trm)) goto end; | |
| 993 font_name = extract_xml_tag_attributes_find(&tag, "font_name"); | |
| 994 if (!font_name) { | |
| 995 outf("Failed to find attribute 'font_name'"); | |
| 996 goto end; | |
| 997 } | |
| 998 font_name2 = strchr(font_name, '+'); | |
| 999 if (font_name2) font_name = font_name2 + 1; | |
| 1000 font_bold = strstr(font_name, "-Bold") ? 1 : 0; | |
| 1001 font_italic = strstr(font_name, "-Oblique") ? 1 : 0; | |
| 1002 if (extract_xml_tag_attributes_find_int(&tag, "wmode", &wmode)) goto end; | |
| 1003 if (extract_span_begin(extract, | |
| 1004 font_name, | |
| 1005 font_bold, | |
| 1006 font_italic, | |
| 1007 wmode, | |
| 1008 ctm.a, | |
| 1009 ctm.b, | |
| 1010 ctm.c, | |
| 1011 ctm.d, | |
| 1012 0,0,0,0)) goto end; | |
| 1013 | |
| 1014 for(;;) { | |
| 1015 double x; | |
| 1016 double y; | |
| 1017 double adv; | |
| 1018 unsigned int ucs; | |
| 1019 | |
| 1020 if (extract_xml_pparse_next(buffer, &tag)) { | |
| 1021 outf("Failed to find <char or </span"); | |
| 1022 goto end; | |
| 1023 } | |
| 1024 if (!strcmp(tag.name, "/span")) { | |
| 1025 break; | |
| 1026 } | |
| 1027 if (strcmp(tag.name, "char")) { | |
| 1028 errno = ESRCH; | |
| 1029 outf("Expected <char> but tag.name='%s'", tag.name); | |
| 1030 goto end; | |
| 1031 } | |
| 1032 | |
| 1033 if (extract_xml_tag_attributes_find_double(&tag, "x", &x)) goto end; | |
| 1034 if (extract_xml_tag_attributes_find_double(&tag, "y", &y)) goto end; | |
| 1035 if (extract_xml_tag_attributes_find_double(&tag, "adv", &adv)) goto end; | |
| 1036 if (extract_xml_tag_attributes_find_uint(&tag, "ucs", &ucs)) goto end; | |
| 1037 | |
| 1038 /* BBox is bogus here. Analysis will fail. */ | |
| 1039 if (extract_add_char(extract, x, y, ucs, adv, x, y, x + adv, y + adv)) goto end; | |
| 1040 } | |
| 1041 | |
| 1042 extract_xml_tag_free(extract->alloc, &tag); | |
| 1043 } | |
| 1044 } | |
| 1045 if (extract_page_end(extract)) goto end; | |
| 1046 outf("page=%i subpage->num_spans=%i", | |
| 1047 document->pages_num, content_count_spans(&subpage->content)); | |
| 1048 } | |
| 1049 | |
| 1050 outf("num_spans=%i num_spans_split=%i num_spans_autosplit=%i", | |
| 1051 num_spans, | |
| 1052 extract->num_spans_split, | |
| 1053 extract->num_spans_autosplit | |
| 1054 ); | |
| 1055 | |
| 1056 ret = 0; | |
| 1057 end: | |
| 1058 | |
| 1059 extract_xml_tag_free(extract->alloc, &tag); | |
| 1060 extract_free(extract->alloc, &image_data); | |
| 1061 | |
| 1062 return ret; | |
| 1063 } | |
| 1064 | |
| 1065 int | |
| 1066 extract_span_begin( | |
| 1067 extract_t *extract, | |
| 1068 const char *font_name, | |
| 1069 int font_bold, | |
| 1070 int font_italic, | |
| 1071 int wmode, | |
| 1072 double ctm_a, | |
| 1073 double ctm_b, | |
| 1074 double ctm_c, | |
| 1075 double ctm_d, | |
| 1076 double bbox_x0, | |
| 1077 double bbox_y0, | |
| 1078 double bbox_x1, | |
| 1079 double bbox_y1) | |
| 1080 { | |
| 1081 int e = -1; | |
| 1082 extract_page_t *page; | |
| 1083 subpage_t *subpage; | |
| 1084 span_t *span; | |
| 1085 document_t *document = &extract->document; | |
| 1086 | |
| 1087 /* FIXME: RJW: Should continue the last span if everything is the same. */ | |
| 1088 | |
| 1089 assert(document->pages_num > 0); | |
| 1090 page = document->pages[document->pages_num-1]; | |
| 1091 subpage = page->subpages[page->subpages_num-1]; | |
| 1092 outf("extract_span_begin(): ctm=(%f %f %f %f) font_name=%s, wmode=%i", | |
| 1093 ctm_a, | |
| 1094 ctm_b, | |
| 1095 ctm_c, | |
| 1096 ctm_d, | |
| 1097 font_name, | |
| 1098 wmode); | |
| 1099 if (content_append_new_span(extract->alloc, &subpage->content, &span, document->current)) goto end; | |
| 1100 span->ctm.a = ctm_a; | |
| 1101 span->ctm.b = ctm_b; | |
| 1102 span->ctm.c = ctm_c; | |
| 1103 span->ctm.d = ctm_d; | |
| 1104 span->font_bbox.min.x = bbox_x0; | |
| 1105 span->font_bbox.min.y = bbox_y0; | |
| 1106 span->font_bbox.max.x = bbox_x1; | |
| 1107 span->font_bbox.max.y = bbox_y1; | |
| 1108 | |
| 1109 { | |
| 1110 const char *ff = strchr(font_name, '+'); | |
| 1111 const char *f = (ff) ? ff+1 : font_name; | |
| 1112 if (extract_strdup(extract->alloc, f, &span->font_name)) goto end; | |
| 1113 span->flags.font_bold = font_bold ? 1 : 0; | |
| 1114 span->flags.font_italic = font_italic ? 1 : 0; | |
| 1115 span->flags.wmode = wmode ? 1 : 0; | |
| 1116 extract->span_offset_x = 0; | |
| 1117 extract->span_offset_y = 0; | |
| 1118 } | |
| 1119 | |
| 1120 e = 0; | |
| 1121 end: | |
| 1122 | |
| 1123 return e; | |
| 1124 } | |
| 1125 | |
| 1126 /* Create a new empty span, based on the current one. */ | |
| 1127 static span_t * | |
| 1128 split_to_new_span(extract_alloc_t *alloc, content_root_t *content, span_t *span0) | |
| 1129 { | |
| 1130 content_t save; | |
| 1131 span_t *span; | |
| 1132 char *name; | |
| 1133 | |
| 1134 if (extract_strdup(alloc, span0->font_name, &name)) | |
| 1135 return NULL; | |
| 1136 | |
| 1137 if (content_append_new_span(alloc, content, &span, span0->structure)) | |
| 1138 { | |
| 1139 extract_free(alloc, &name); | |
| 1140 return NULL; | |
| 1141 } | |
| 1142 | |
| 1143 save = span->base; /* Avoid overwriting linked list. */ | |
| 1144 *span = *span0; | |
| 1145 span->base = save; | |
| 1146 span->font_name = name; | |
| 1147 span->chars = NULL; | |
| 1148 span->chars_num = 0; | |
| 1149 | |
| 1150 return span; | |
| 1151 } | |
| 1152 | |
| 1153 /* | |
| 1154 This routine returns the previous non-space-char, UNLESS the span | |
| 1155 starts with a space, in which case we accept that one. | |
| 1156 */ | |
| 1157 static span_t * | |
| 1158 find_previous_non_space_char_ish(content_root_t *content, int *char_num, int *intervening_space) | |
| 1159 { | |
| 1160 content_t *s; | |
| 1161 int i; | |
| 1162 | |
| 1163 *intervening_space = 0; | |
| 1164 for (s = content->base.prev; s != &content->base; s = s->prev) | |
| 1165 { | |
| 1166 span_t *span = (span_t *)s; | |
| 1167 | |
| 1168 if (s->type != content_span) | |
| 1169 continue; | |
| 1170 | |
| 1171 for (i = span->chars_num-1; i >= 0; i--) | |
| 1172 { | |
| 1173 if (span->chars[i].ucs != 32 || i == 0) | |
| 1174 { | |
| 1175 *char_num = i; | |
| 1176 return span; | |
| 1177 } | |
| 1178 *intervening_space = 1; | |
| 1179 } | |
| 1180 } | |
| 1181 | |
| 1182 return NULL; | |
| 1183 } | |
| 1184 | |
| 1185 point_t | |
| 1186 extract_predicted_end_of_char(char_t *char_, const span_t *span) | |
| 1187 { | |
| 1188 double adv = char_->adv; | |
| 1189 point_t dir = { adv * (1 - span->flags.wmode), adv * span->flags.wmode }; | |
| 1190 | |
| 1191 dir = extract_matrix4_transform_point(span->ctm, dir); | |
| 1192 dir.x += char_->x; | |
| 1193 dir.y += char_->y; | |
| 1194 | |
| 1195 return dir; | |
| 1196 } | |
| 1197 | |
| 1198 point_t | |
| 1199 extract_end_of_span(const span_t *span) | |
| 1200 { | |
| 1201 assert(span && span->chars_num > 0); | |
| 1202 return extract_predicted_end_of_char(&span->chars[span->chars_num-1], span); | |
| 1203 } | |
| 1204 | |
| 1205 int extract_add_char( | |
| 1206 extract_t *extract, | |
| 1207 double x, | |
| 1208 double y, | |
| 1209 unsigned int ucs, | |
| 1210 double adv, | |
| 1211 double x0, | |
| 1212 double y0, | |
| 1213 double x1, | |
| 1214 double y1) | |
| 1215 { | |
| 1216 int e = -1; | |
| 1217 char_t *char_; | |
| 1218 extract_page_t *page = extract->document.pages[extract->document.pages_num-1]; | |
| 1219 subpage_t *subpage = page->subpages[page->subpages_num-1]; | |
| 1220 span_t *span = content_last_span(&subpage->content); | |
| 1221 span_t *span0; | |
| 1222 int char_num0; | |
| 1223 double dist, perp, scale_squared; | |
| 1224 point_t dir; | |
| 1225 int intervening_space; | |
| 1226 | |
| 1227 if (span->flags.wmode) | |
| 1228 { | |
| 1229 dir.x = 0; | |
| 1230 dir.y = 1; | |
| 1231 scale_squared = span->ctm.c * span->ctm.c + span->ctm.d * span->ctm.d; | |
| 1232 } | |
| 1233 else | |
| 1234 { | |
| 1235 dir.x = 1; | |
| 1236 dir.y = 0; | |
| 1237 scale_squared = span->ctm.a * span->ctm.a + span->ctm.b * span->ctm.b; | |
| 1238 } | |
| 1239 dir = extract_matrix4_transform_point(span->ctm, dir); | |
| 1240 | |
| 1241 outf("(%f %f) ucs=% 5i=%c adv=%f", x, y, ucs, (ucs >=32 && ucs< 127) ? ucs : ' ', adv); | |
| 1242 | |
| 1243 /* Is there a previous span to which we should consider attaching this char. */ | |
| 1244 span0 = find_previous_non_space_char_ish(&subpage->content, &char_num0, &intervening_space); | |
| 1245 | |
| 1246 /* Spans can't continue over different structure elements. */ | |
| 1247 if (span0 && span0->structure != extract->document.current) | |
| 1248 span0 = NULL; | |
| 1249 | |
| 1250 if (span0 == NULL) | |
| 1251 { | |
| 1252 /* No previous continuable span. */ | |
| 1253 outf("%c x=%g y=%g adv=%g\n", ucs, x, y, adv); | |
| 1254 } | |
| 1255 else | |
| 1256 { | |
| 1257 /* We have a span. Check whether we need to break to a new line, or add (or subtract) a space. */ | |
| 1258 char_t *char_prev = &span0->chars[char_num0]; | |
| 1259 double adv0 = char_prev->adv; | |
| 1260 point_t predicted_end_of_char0 = extract_predicted_end_of_char(char_prev, span0); | |
| 1261 /* We don't currently have access to the size of the advance for a space. | |
| 1262 * Typically it's around 1 to 1/2 that of a real char. So guess at that | |
| 1263 * using the 2 advances we have available to us. */ | |
| 1264 double space_guess = (adv0 + adv)/2 * extract->master_space_guess; | |
| 1265 | |
| 1266 /* Use dot product to calculate the distance that we have moved along the direction vector. */ | |
| 1267 dist = (x - predicted_end_of_char0.x) * dir.x + (y - predicted_end_of_char0.y) * dir.y; | |
| 1268 /* Use dot product to calculate the distance that we have moved perpendicular to the direction vector. */ | |
| 1269 perp = (x - predicted_end_of_char0.x) * dir.y - (y - predicted_end_of_char0.y) * dir.x; | |
| 1270 /* Both dist and perp are multiplied by scale_squared. */ | |
| 1271 dist /= scale_squared; | |
| 1272 perp /= scale_squared; | |
| 1273 /* So now, dist, perp, adv, adv0 and space_guess are all in pre-transform space. */ | |
| 1274 | |
| 1275 /* So fabs(dist) is expected to be 0, and perp is expected to be 0 for characters | |
| 1276 * "naturally placed" on a line. */ | |
| 1277 outf("%c x=%g y=%g adv=%g dist=%g perp=%g\n", ucs, x, y, adv, dist, perp); | |
| 1278 | |
| 1279 /* Arbitrary fractions here; ideally we should consult the font bbox, but we don't currently | |
| 1280 * have that. */ | |
| 1281 if (fabs(perp) > 3*space_guess/2 || fabs(dist) > space_guess * 8) | |
| 1282 { | |
| 1283 /* Create new span. */ | |
| 1284 if (span->chars_num > 0) | |
| 1285 { | |
| 1286 extract->num_spans_autosplit += 1; | |
| 1287 span = split_to_new_span(extract->alloc, &subpage->content, span); | |
| 1288 if (span == NULL) goto end; | |
| 1289 } | |
| 1290 } | |
| 1291 else if (intervening_space) | |
| 1292 { | |
| 1293 /* Some files, notably zlib.3.pdf appear to contain stray extra spaces within the PDF | |
| 1294 * content themselves. e.g. "suppor ts". We therefore spot when the | |
| 1295 * space allocated for a space isn't used, and remove the space. */ | |
| 1296 /* MAGIC NUMBER WARNING. zlib.pdf says that /4 is not sensitive enough. /3 is OK. */ | |
| 1297 if (dist < space_guess/3) | |
| 1298 { | |
| 1299 if (span->chars_num > 0) | |
| 1300 { | |
| 1301 span->chars_num--; | |
| 1302 /* Don't need to worry about it being empty, as we're about to add another char! */ | |
| 1303 } | |
| 1304 else | |
| 1305 { | |
| 1306 span_t *space_span = content_prev_span(&span->base); | |
| 1307 assert(space_span->chars_num > 0); | |
| 1308 space_span->chars_num--; | |
| 1309 if (space_span->chars_num == 0) | |
| 1310 extract_span_free(extract->alloc, &space_span); | |
| 1311 } | |
| 1312 } | |
| 1313 } | |
| 1314 /* MAGIC NUMBER WARNING: We expect the space char to be about 1/2 as wide of a standard char. | |
| 1315 * zlib3.pdf shows that sometimes we need to insert a space when it's *just* smaller than | |
| 1316 * this. (e.g. 'eveninthe'). */ | |
| 1317 else if (!intervening_space && dist > 2*space_guess/3) | |
| 1318 { | |
| 1319 /* Larger gap than expected. Add an extra space. */ | |
| 1320 /* Where should the space go? At the predicted position where the previous char | |
| 1321 * ended. */ | |
| 1322 char_ = extract_span_append_c(extract->alloc, span, ' '); | |
| 1323 if (char_ == NULL) goto end; | |
| 1324 | |
| 1325 char_->x = predicted_end_of_char0.x; | |
| 1326 char_->y = predicted_end_of_char0.y; | |
| 1327 } | |
| 1328 } | |
| 1329 | |
| 1330 char_ = extract_span_append_c(extract->alloc, span, ucs); | |
| 1331 if (char_ == NULL) goto end; | |
| 1332 | |
| 1333 char_->x = x; | |
| 1334 char_->y = y; | |
| 1335 | |
| 1336 char_->adv = adv; | |
| 1337 char_->bbox.min.x = x0; | |
| 1338 char_->bbox.min.y = y0; | |
| 1339 char_->bbox.max.x = x1; | |
| 1340 char_->bbox.max.y = y1; | |
| 1341 | |
| 1342 e = 0; | |
| 1343 end: | |
| 1344 | |
| 1345 if (span && span->chars_num == 0) | |
| 1346 { | |
| 1347 extract_span_free(extract->alloc, &span); | |
| 1348 } | |
| 1349 | |
| 1350 return e; | |
| 1351 } | |
| 1352 | |
| 1353 | |
| 1354 int extract_span_end(extract_t *extract) | |
| 1355 { | |
| 1356 extract_page_t *page = extract->document.pages[extract->document.pages_num-1]; | |
| 1357 subpage_t *subpage = page->subpages[page->subpages_num-1]; | |
| 1358 span_t *span = content_last_span(&subpage->content); | |
| 1359 | |
| 1360 if (span->chars_num == 0) { | |
| 1361 /* Calling code called extract_span_begin() then extract_span_end() | |
| 1362 without any call to extract_add_char(). Our joining code assumes that | |
| 1363 all spans are non-empty, so we need to delete this span. */ | |
| 1364 extract_span_free(extract->alloc, &span); | |
| 1365 } | |
| 1366 | |
| 1367 return 0; | |
| 1368 } | |
| 1369 | |
| 1370 | |
| 1371 int extract_add_image( | |
| 1372 extract_t *extract, | |
| 1373 const char *type, | |
| 1374 double x, | |
| 1375 double y, | |
| 1376 double w, | |
| 1377 double h, | |
| 1378 void *data, | |
| 1379 size_t data_size, | |
| 1380 extract_image_data_free data_free, | |
| 1381 void *data_free_handle) | |
| 1382 { | |
| 1383 int e = -1; | |
| 1384 extract_page_t *page = extract->document.pages[extract->document.pages_num-1]; | |
| 1385 subpage_t *subpage = page->subpages[page->subpages_num-1]; | |
| 1386 image_t *image; | |
| 1387 | |
| 1388 extract->image_n += 1; | |
| 1389 if (content_append_new_image(extract->alloc, &subpage->content, &image)) goto end; | |
| 1390 image->x = x; | |
| 1391 image->y = y; | |
| 1392 image->w = w; | |
| 1393 image->h = h; | |
| 1394 image->data = data; | |
| 1395 image->data_size = data_size; | |
| 1396 image->data_free = data_free; | |
| 1397 image->data_free_handle = data_free_handle; | |
| 1398 if (extract_strdup(extract->alloc, type, &image->type)) goto end; | |
| 1399 if (extract_asprintf(extract->alloc, &image->id, "rId%i", extract->image_n) < 0) goto end; | |
| 1400 if (extract_asprintf(extract->alloc, &image->name, "image%i.%s", extract->image_n, image->type) < 0) goto end; | |
| 1401 | |
| 1402 subpage->images_num += 1; | |
| 1403 outf("subpage->images_num=%i", subpage->images_num); | |
| 1404 | |
| 1405 e = 0; | |
| 1406 end: | |
| 1407 | |
| 1408 if (e) { | |
| 1409 extract_image_free(extract->alloc, &image); | |
| 1410 } | |
| 1411 | |
| 1412 return e; | |
| 1413 } | |
| 1414 | |
| 1415 | |
| 1416 static int tablelines_append(extract_alloc_t *alloc, tablelines_t *tablelines, rect_t *rect, double color) | |
| 1417 { | |
| 1418 if (extract_realloc( | |
| 1419 alloc, | |
| 1420 &tablelines->tablelines, | |
| 1421 sizeof(*tablelines->tablelines) * (tablelines->tablelines_num + 1) | |
| 1422 )) return -1; | |
| 1423 tablelines->tablelines[ tablelines->tablelines_num].rect = *rect; | |
| 1424 tablelines->tablelines[ tablelines->tablelines_num].color = (float) color; | |
| 1425 tablelines->tablelines_num += 1; | |
| 1426 | |
| 1427 return 0; | |
| 1428 } | |
| 1429 | |
| 1430 static point_t transform( | |
| 1431 double x, | |
| 1432 double y, | |
| 1433 double ctm_a, | |
| 1434 double ctm_b, | |
| 1435 double ctm_c, | |
| 1436 double ctm_d, | |
| 1437 double ctm_e, | |
| 1438 double ctm_f) | |
| 1439 { | |
| 1440 point_t ret; | |
| 1441 | |
| 1442 ret.x = ctm_a * x + ctm_b * y + ctm_e; | |
| 1443 ret.y = ctm_c * x + ctm_d * y + ctm_f; | |
| 1444 | |
| 1445 return ret; | |
| 1446 } | |
| 1447 | |
/* Returns <a> if it is strictly less than <b>, otherwise <b>. */
static double s_min(double a, double b)
{
    if (a < b)
    {
        return a;
    }
    return b;
}
| 1452 | |
/* Returns <a> if it is strictly greater than <b>, otherwise <b>. */
static double s_max(double a, double b)
{
    if (a > b)
    {
        return a;
    }
    return b;
}
| 1457 | |
| 1458 int extract_add_path4( | |
| 1459 extract_t *extract, | |
| 1460 double ctm_a, | |
| 1461 double ctm_b, | |
| 1462 double ctm_c, | |
| 1463 double ctm_d, | |
| 1464 double ctm_e, | |
| 1465 double ctm_f, | |
| 1466 double x0, | |
| 1467 double y0, | |
| 1468 double x1, | |
| 1469 double y1, | |
| 1470 double x2, | |
| 1471 double y2, | |
| 1472 double x3, | |
| 1473 double y3, | |
| 1474 double color) | |
| 1475 { | |
| 1476 extract_page_t *page = extract->document.pages[extract->document.pages_num-1]; | |
| 1477 subpage_t *subpage = page->subpages[page->subpages_num-1]; | |
| 1478 point_t points[4] = { | |
| 1479 transform(x0, y0, ctm_a, ctm_b, ctm_c, ctm_d, ctm_e, ctm_f), | |
| 1480 transform(x1, y1, ctm_a, ctm_b, ctm_c, ctm_d, ctm_e, ctm_f), | |
| 1481 transform(x2, y2, ctm_a, ctm_b, ctm_c, ctm_d, ctm_e, ctm_f), | |
| 1482 transform(x3, y3, ctm_a, ctm_b, ctm_c, ctm_d, ctm_e, ctm_f) | |
| 1483 }; | |
| 1484 rect_t rect; | |
| 1485 int i; | |
| 1486 double dx, dy; | |
| 1487 | |
| 1488 outf("cmt=(%f %f %f %f %f %f) points=[(%f %f) (%f %f) (%f %f) (%f %f)]", | |
| 1489 ctm_a, ctm_b, ctm_c, ctm_d, ctm_e, ctm_f, | |
| 1490 x0, y0, x1, y1, x2, y2, x3, y3 | |
| 1491 ); | |
| 1492 outf("extract_add_path4(): [(%f %f) (%f %f) (%f %f) (%f %f)]", | |
| 1493 x0, y0, x1, y1, x2, y2, x3, y3); | |
| 1494 /* Find first step with dx > 0. */ | |
| 1495 for (i=0; i<4; ++i) | |
| 1496 { | |
| 1497 if (points[(i+1) % 4].x > points[(i+0) % 4].x) break; | |
| 1498 } | |
| 1499 outf("i=%i", i); | |
| 1500 if (i == 4) return 0; | |
| 1501 rect.min.x = points[(i+0) % 4].x; | |
| 1502 rect.max.x = points[(i+1) % 4].x; | |
| 1503 if (points[(i+2) % 4].x != rect.max.x) return 0; | |
| 1504 if (points[(i+3) % 4].x != rect.min.x) return 0; | |
| 1505 y0 = points[(i+1) % 4].y; | |
| 1506 y1 = points[(i+2) % 4].y; | |
| 1507 if (y0 == y1) return 0; | |
| 1508 if (points[(i+3) % 4].y != y1) return 0; | |
| 1509 if (points[(i+4) % 4].y != y0) return 0; | |
| 1510 rect.min.y = (y1 > y0) ? y0 : y1; | |
| 1511 rect.max.y = (y1 > y0) ? y1 : y0; | |
| 1512 | |
| 1513 dx = rect.max.x - rect.min.x; | |
| 1514 dy = rect.max.y - rect.min.y; | |
| 1515 if (dx / dy > 5) | |
| 1516 { | |
| 1517 /* Horizontal line. */ | |
| 1518 outf("have found horizontal line: %s", extract_rect_string(&rect)); | |
| 1519 if (tablelines_append(extract->alloc, &subpage->tablelines_horizontal, &rect, color)) return -1; | |
| 1520 } | |
| 1521 else if (dy / dx > 5) | |
| 1522 { | |
| 1523 /* Vertical line. */ | |
| 1524 outf("have found vertical line: %s", extract_rect_string(&rect)); | |
| 1525 if (tablelines_append(extract->alloc, &subpage->tablelines_vertical, &rect, color)) return -1; | |
| 1526 } | |
| 1527 | |
| 1528 return 0; | |
| 1529 } | |
| 1530 | |
| 1531 | |
| 1532 int extract_add_line( | |
| 1533 extract_t *extract, | |
| 1534 double ctm_a, | |
| 1535 double ctm_b, | |
| 1536 double ctm_c, | |
| 1537 double ctm_d, | |
| 1538 double ctm_e, | |
| 1539 double ctm_f, | |
| 1540 double width, | |
| 1541 double x0, | |
| 1542 double y0, | |
| 1543 double x1, | |
| 1544 double y1, | |
| 1545 double color) | |
| 1546 { | |
| 1547 extract_page_t *page = extract->document.pages[extract->document.pages_num-1]; | |
| 1548 subpage_t *subpage = page->subpages[page->subpages_num-1]; | |
| 1549 point_t p0 = transform(x0, y0, ctm_a, ctm_b, ctm_c, ctm_d, ctm_e, ctm_f); | |
| 1550 point_t p1 = transform(x1, y1, ctm_a, ctm_b, ctm_c, ctm_d, ctm_e, ctm_f); | |
| 1551 double width2 = width * sqrt( fabs( ctm_a * ctm_d - ctm_b * ctm_c)); | |
| 1552 rect_t rect; | |
| 1553 | |
| 1554 (void)color; | |
| 1555 rect.min.x = s_min(p0.x, p1.x); | |
| 1556 rect.min.y = s_min(p0.y, p1.y); | |
| 1557 rect.max.x = s_max(p0.x, p1.x); | |
| 1558 rect.max.y = s_max(p0.y, p1.y); | |
| 1559 | |
| 1560 outf("%s: width=%f ((%f %f)(%f %f)) rect=%s", | |
| 1561 extract_FUNCTION, | |
| 1562 width, | |
| 1563 x0, y0, x1, y1, | |
| 1564 extract_rect_string(&rect) | |
| 1565 ); | |
| 1566 if (rect.min.x == rect.max.x && rect.min.y == rect.max.y) | |
| 1567 { | |
| 1568 } | |
| 1569 else if (rect.min.x == rect.max.x) | |
| 1570 { | |
| 1571 rect.min.x -= width2 / 2; | |
| 1572 rect.max.x += width2 / 2; | |
| 1573 return tablelines_append(extract->alloc, &subpage->tablelines_vertical, &rect, color); | |
| 1574 } | |
| 1575 else if (rect.min.y == rect.max.y) | |
| 1576 { | |
| 1577 rect.min.y -= width2 / 2; | |
| 1578 rect.max.y += width2 / 2; | |
| 1579 return tablelines_append(extract->alloc, &subpage->tablelines_horizontal, &rect, color); | |
| 1580 } | |
| 1581 | |
| 1582 return 0; | |
| 1583 } | |
| 1584 | |
| 1585 int extract_subpage_alloc(extract_alloc_t *alloc, rect_t mediabox, extract_page_t *page, subpage_t **psubpage) | |
| 1586 { | |
| 1587 subpage_t *subpage; | |
| 1588 | |
| 1589 if (extract_malloc(alloc, psubpage, sizeof(subpage_t))) | |
| 1590 { | |
| 1591 return -1; | |
| 1592 } | |
| 1593 subpage = *psubpage; | |
| 1594 subpage->mediabox = mediabox; | |
| 1595 content_init_root(&subpage->content, NULL); | |
| 1596 subpage->images_num = 0; | |
| 1597 subpage->tablelines_horizontal.tablelines = NULL; | |
| 1598 subpage->tablelines_horizontal.tablelines_num = 0; | |
| 1599 subpage->tablelines_vertical.tablelines = NULL; | |
| 1600 subpage->tablelines_vertical.tablelines_num = 0; | |
| 1601 content_init_root(&subpage->tables, NULL); | |
| 1602 | |
| 1603 if (extract_realloc2(alloc, | |
| 1604 &page->subpages, | |
| 1605 sizeof(subpage_t*) * page->subpages_num, | |
| 1606 sizeof(subpage_t*) * (page->subpages_num + 1))) | |
| 1607 { | |
| 1608 extract_free(alloc, psubpage); | |
| 1609 return -1; | |
| 1610 } | |
| 1611 page->subpages[page->subpages_num] = subpage; | |
| 1612 page->subpages_num += 1; | |
| 1613 | |
| 1614 return 0; | |
| 1615 } | |
| 1616 | |
| 1617 /* Appends new empty subpage_t to the last page of an extract->document. */ | |
| 1618 static int extract_subpage_begin(extract_t *extract, double x0, double y0, double x1, double y1) | |
| 1619 { | |
| 1620 extract_page_t *page = extract->document.pages[extract->document.pages_num - 1]; | |
| 1621 subpage_t *subpage; | |
| 1622 rect_t mediabox = { { x0, y0 }, { x1, y1 } }; | |
| 1623 int e; | |
| 1624 | |
| 1625 e = extract_subpage_alloc(extract->alloc, mediabox, page, &subpage); | |
| 1626 | |
| 1627 if (e == 0) | |
| 1628 { | |
| 1629 } | |
| 1630 | |
| 1631 return e; | |
| 1632 } | |
| 1633 | |
| 1634 /* Appends new empty page_t to an extract->document. */ | |
| 1635 int extract_page_begin(extract_t *extract, double x0, double y0, double x1, double y1) | |
| 1636 { | |
| 1637 extract_page_t *page; | |
| 1638 | |
| 1639 if (extract_malloc(extract->alloc, &page, sizeof(*page))) return -1; | |
| 1640 page->mediabox.min.x = x0; | |
| 1641 page->mediabox.min.y = y0; | |
| 1642 page->mediabox.max.x = x1; | |
| 1643 page->mediabox.max.y = y1; | |
| 1644 page->subpages = NULL; | |
| 1645 page->subpages_num = 0; | |
| 1646 page->split = NULL; | |
| 1647 | |
| 1648 if (extract_realloc2( | |
| 1649 extract->alloc, | |
| 1650 &extract->document.pages, | |
| 1651 sizeof(subpage_t*) * extract->document.pages_num, | |
| 1652 sizeof(subpage_t*) * (extract->document.pages_num + 1) | |
| 1653 )) { | |
| 1654 extract_free(extract->alloc, &page); | |
| 1655 return -1; | |
| 1656 } | |
| 1657 | |
| 1658 extract->document.pages[extract->document.pages_num] = page; | |
| 1659 extract->document.pages_num += 1; | |
| 1660 | |
| 1661 if (extract_subpage_begin(extract, x0, y0, x1, y1)) { | |
| 1662 extract->document.pages_num--; | |
| 1663 page_free(extract->alloc, &extract->document.pages[extract->document.pages_num]); | |
| 1664 return -1; | |
| 1665 } | |
| 1666 | |
| 1667 return 0; | |
| 1668 } | |
| 1669 | |
| 1670 int extract_fill_begin( | |
| 1671 extract_t *extract, | |
| 1672 double ctm_a, | |
| 1673 double ctm_b, | |
| 1674 double ctm_c, | |
| 1675 double ctm_d, | |
| 1676 double ctm_e, | |
| 1677 double ctm_f, | |
| 1678 double color) | |
| 1679 { | |
| 1680 assert(extract->path_type == path_type_NONE); | |
| 1681 | |
| 1682 extract->path_type = path_type_FILL; | |
| 1683 extract->path.fill.color = color; | |
| 1684 extract->path.fill.n = 0; | |
| 1685 extract->path.fill.ctm.a = ctm_a; | |
| 1686 extract->path.fill.ctm.b = ctm_b; | |
| 1687 extract->path.fill.ctm.c = ctm_c; | |
| 1688 extract->path.fill.ctm.d = ctm_d; | |
| 1689 extract->path.fill.ctm.e = ctm_e; | |
| 1690 extract->path.fill.ctm.f = ctm_f; | |
| 1691 | |
| 1692 return 0; | |
| 1693 } | |
| 1694 | |
| 1695 int extract_stroke_begin( | |
| 1696 extract_t *extract, | |
| 1697 double ctm_a, | |
| 1698 double ctm_b, | |
| 1699 double ctm_c, | |
| 1700 double ctm_d, | |
| 1701 double ctm_e, | |
| 1702 double ctm_f, | |
| 1703 double line_width, | |
| 1704 double color) | |
| 1705 { | |
| 1706 assert(extract->path_type == path_type_NONE); | |
| 1707 | |
| 1708 extract->path_type = path_type_STROKE; | |
| 1709 extract->path.stroke.ctm.a = ctm_a; | |
| 1710 extract->path.stroke.ctm.b = ctm_b; | |
| 1711 extract->path.stroke.ctm.c = ctm_c; | |
| 1712 extract->path.stroke.ctm.d = ctm_d; | |
| 1713 extract->path.stroke.ctm.e = ctm_e; | |
| 1714 extract->path.stroke.ctm.f = ctm_f; | |
| 1715 extract->path.stroke.width = line_width; | |
| 1716 extract->path.stroke.color = color; | |
| 1717 extract->path.stroke.point0_set = 0; | |
| 1718 extract->path.stroke.point_set = 0; | |
| 1719 | |
| 1720 return 0; | |
| 1721 } | |
| 1722 | |
| 1723 int extract_moveto(extract_t *extract, double x, double y) | |
| 1724 { | |
| 1725 if (extract->path_type == path_type_FILL) | |
| 1726 { | |
| 1727 if (extract->path.fill.n == -1) return 0; | |
| 1728 if (extract->path.fill.n != 0) | |
| 1729 { | |
| 1730 outf0("returning error. extract->path.fill.n=%i", extract->path.fill.n); | |
| 1731 extract->path.fill.n = -1; | |
| 1732 return 0; | |
| 1733 } | |
| 1734 extract->path.fill.points[extract->path.fill.n].x = x; | |
| 1735 extract->path.fill.points[extract->path.fill.n].y = y; | |
| 1736 extract->path.fill.n += 1; | |
| 1737 return 0; | |
| 1738 } | |
| 1739 else if (extract->path_type == path_type_STROKE) | |
| 1740 { | |
| 1741 extract->path.stroke.point.x = x; | |
| 1742 extract->path.stroke.point.y = y; | |
| 1743 extract->path.stroke.point_set = 1; | |
| 1744 if (!extract->path.stroke.point0_set) | |
| 1745 { | |
| 1746 extract->path.stroke.point0 = extract->path.stroke.point; | |
| 1747 extract->path.stroke.point0_set = 1; | |
| 1748 } | |
| 1749 return 0; | |
| 1750 } | |
| 1751 else | |
| 1752 { | |
| 1753 assert(0); | |
| 1754 return -1; | |
| 1755 } | |
| 1756 } | |
| 1757 | |
| 1758 int extract_lineto(extract_t *extract, double x, double y) | |
| 1759 { | |
| 1760 if (extract->path_type == path_type_FILL) | |
| 1761 { | |
| 1762 if (extract->path.fill.n == -1) return 0; | |
| 1763 if (extract->path.fill.n == 0 || extract->path.fill.n >= 4) | |
| 1764 { | |
| 1765 outf0("returning error. extract->path.fill.n=%i", extract->path.fill.n); | |
| 1766 extract->path.fill.n = -1; | |
| 1767 return 0; | |
| 1768 } | |
| 1769 extract->path.fill.points[extract->path.fill.n].x = x; | |
| 1770 extract->path.fill.points[extract->path.fill.n].y = y; | |
| 1771 extract->path.fill.n += 1; | |
| 1772 return 0; | |
| 1773 } | |
| 1774 else if (extract->path_type == path_type_STROKE) | |
| 1775 { | |
| 1776 if (extract->path.stroke.point_set) | |
| 1777 { | |
| 1778 if (extract_add_line( | |
| 1779 extract, | |
| 1780 extract->path.stroke.ctm.a, | |
| 1781 extract->path.stroke.ctm.b, | |
| 1782 extract->path.stroke.ctm.c, | |
| 1783 extract->path.stroke.ctm.d, | |
| 1784 extract->path.stroke.ctm.e, | |
| 1785 extract->path.stroke.ctm.f, | |
| 1786 extract->path.stroke.width, | |
| 1787 extract->path.stroke.point.x, | |
| 1788 extract->path.stroke.point.y, | |
| 1789 x, | |
| 1790 y, | |
| 1791 extract->path.stroke.color)) | |
| 1792 { | |
| 1793 return -1; | |
| 1794 } | |
| 1795 } | |
| 1796 extract->path.stroke.point.x = x; | |
| 1797 extract->path.stroke.point.y = y; | |
| 1798 extract->path.stroke.point_set = 1; | |
| 1799 if (!extract->path.stroke.point0_set) | |
| 1800 { | |
| 1801 extract->path.stroke.point0 = extract->path.stroke.point; | |
| 1802 extract->path.stroke.point0_set = 1; | |
| 1803 } | |
| 1804 return 0; | |
| 1805 } | |
| 1806 else | |
| 1807 { | |
| 1808 assert(0); | |
| 1809 return -1; | |
| 1810 } | |
| 1811 } | |
| 1812 | |
| 1813 int extract_closepath(extract_t *extract) | |
| 1814 { | |
| 1815 if (extract->path_type == path_type_FILL) | |
| 1816 { | |
| 1817 if (extract->path.fill.n == 4) | |
| 1818 { | |
| 1819 /* We are closing a four-element path, so this could be a thin | |
| 1820 rectangle that defines a line in a table. */ | |
| 1821 int e; | |
| 1822 e = extract_add_path4( | |
| 1823 extract, | |
| 1824 extract->path.fill.ctm.a, | |
| 1825 extract->path.fill.ctm.b, | |
| 1826 extract->path.fill.ctm.c, | |
| 1827 extract->path.fill.ctm.d, | |
| 1828 extract->path.fill.ctm.e, | |
| 1829 extract->path.fill.ctm.f, | |
| 1830 extract->path.fill.points[0].x, | |
| 1831 extract->path.fill.points[0].y, | |
| 1832 extract->path.fill.points[1].x, | |
| 1833 extract->path.fill.points[1].y, | |
| 1834 extract->path.fill.points[2].x, | |
| 1835 extract->path.fill.points[2].y, | |
| 1836 extract->path.fill.points[3].x, | |
| 1837 extract->path.fill.points[3].y, | |
| 1838 extract->path.fill.color); | |
| 1839 if (e) return e; | |
| 1840 } | |
| 1841 extract->path.fill.n = 0; | |
| 1842 return 0; | |
| 1843 } | |
| 1844 else if (extract->path_type == path_type_STROKE) | |
| 1845 { | |
| 1846 if (extract->path.stroke.point0_set && extract->path.stroke.point_set) | |
| 1847 { | |
| 1848 if (extract_add_line( | |
| 1849 extract, | |
| 1850 extract->path.stroke.ctm.a, | |
| 1851 extract->path.stroke.ctm.b, | |
| 1852 extract->path.stroke.ctm.c, | |
| 1853 extract->path.stroke.ctm.d, | |
| 1854 extract->path.stroke.ctm.e, | |
| 1855 extract->path.stroke.ctm.f, | |
| 1856 extract->path.stroke.width, | |
| 1857 extract->path.stroke.point.x, | |
| 1858 extract->path.stroke.point.y, | |
| 1859 extract->path.stroke.point0.x, | |
| 1860 extract->path.stroke.point0.y, | |
| 1861 extract->path.stroke.color)) | |
| 1862 { | |
| 1863 return -1; | |
| 1864 } | |
| 1865 return 0; | |
| 1866 } | |
| 1867 extract->path.stroke.point = extract->path.stroke.point0; | |
| 1868 return 0; | |
| 1869 } | |
| 1870 else | |
| 1871 { | |
| 1872 assert(0); | |
| 1873 return -1; | |
| 1874 } | |
| 1875 } | |
| 1876 | |
| 1877 | |
| 1878 int extract_fill_end(extract_t *extract) | |
| 1879 { | |
| 1880 assert(extract->path_type == path_type_FILL); | |
| 1881 extract->path_type = path_type_NONE; | |
| 1882 | |
| 1883 return 0; | |
| 1884 } | |
| 1885 | |
| 1886 | |
| 1887 int extract_stroke_end(extract_t *extract) | |
| 1888 { | |
| 1889 assert(extract->path_type == path_type_STROKE); | |
| 1890 extract->path_type = path_type_NONE; | |
| 1891 | |
| 1892 return 0; | |
| 1893 } | |
| 1894 | |
| 1895 | |
| 1896 | |
| 1897 static int extract_subpage_end(extract_t *extract) | |
| 1898 { | |
| 1899 (void) extract; | |
| 1900 return 0; | |
| 1901 } | |
| 1902 | |
| 1903 | |
| 1904 int extract_page_end(extract_t *extract) | |
| 1905 { | |
| 1906 if (extract_subpage_end(extract)) | |
| 1907 return -1; | |
| 1908 | |
| 1909 return 0; | |
| 1910 } | |
| 1911 | |
| 1912 int extract_begin_struct(extract_t *extract, extract_struct_t type, int uid, int score) | |
| 1913 { | |
| 1914 document_t *document = &extract->document; | |
| 1915 structure_t *structure; | |
| 1916 | |
| 1917 if (extract_malloc(extract->alloc, &structure, sizeof(*structure))) | |
| 1918 return -1; | |
| 1919 | |
| 1920 structure->parent = document->current; | |
| 1921 structure->sibling_next = NULL; | |
| 1922 structure->sibling_prev = NULL; | |
| 1923 structure->kids_first = NULL; | |
| 1924 structure->kids_tail = &structure->kids_first; | |
| 1925 structure->type = type; | |
| 1926 structure->score = score; | |
| 1927 structure->uid = uid; | |
| 1928 | |
| 1929 if (document->current == NULL) | |
| 1930 { | |
| 1931 /* New topmost entry. */ | |
| 1932 document->current = structure; | |
| 1933 document->structure = structure; | |
| 1934 } | |
| 1935 else | |
| 1936 { | |
| 1937 /* Add a child */ | |
| 1938 *document->current->kids_tail = structure; | |
| 1939 document->current->kids_tail = &structure->sibling_next; | |
| 1940 document->current = structure; | |
| 1941 } | |
| 1942 | |
| 1943 return 0; | |
| 1944 } | |
| 1945 | |
| 1946 int extract_end_struct(extract_t *extract) | |
| 1947 { | |
| 1948 document_t *document = &extract->document; | |
| 1949 | |
| 1950 assert(document->current != NULL); | |
| 1951 | |
| 1952 document->current = document->current->parent; | |
| 1953 | |
| 1954 return 0; | |
| 1955 } | |
| 1956 | |
| 1957 const char *extract_struct_string(extract_struct_t type) | |
| 1958 { | |
| 1959 switch (type) | |
| 1960 { | |
| 1961 default: | |
| 1962 return "UNKNOWN"; | |
| 1963 case extract_struct_INVALID: | |
| 1964 return "INVALID"; | |
| 1965 case extract_struct_UNDEFINED: | |
| 1966 return "UNDEFINED"; | |
| 1967 case extract_struct_DOCUMENT: | |
| 1968 return "DOCUMENT"; | |
| 1969 case extract_struct_PART: | |
| 1970 return "PART"; | |
| 1971 case extract_struct_ART: | |
| 1972 return "ART"; | |
| 1973 case extract_struct_SECT: | |
| 1974 return "SECT"; | |
| 1975 case extract_struct_DIV: | |
| 1976 return "DIV"; | |
| 1977 case extract_struct_BLOCKQUOTE: | |
| 1978 return "BLOCKQUOTE"; | |
| 1979 case extract_struct_CAPTION: | |
| 1980 return "CAPTION"; | |
| 1981 case extract_struct_TOC: | |
| 1982 return "TOC"; | |
| 1983 case extract_struct_TOCI: | |
| 1984 return "TOCI"; | |
| 1985 case extract_struct_INDEX: | |
| 1986 return "INDEX"; | |
| 1987 case extract_struct_NONSTRUCT: | |
| 1988 return "NONSTRUCT"; | |
| 1989 case extract_struct_PRIVATE: | |
| 1990 return "PRIVATE"; | |
| 1991 case extract_struct_DOCUMENTFRAGMENT: | |
| 1992 return "DOCUMENTFRAGMENT"; | |
| 1993 case extract_struct_ASIDE: | |
| 1994 return "ASIDE"; | |
| 1995 case extract_struct_TITLE: | |
| 1996 return "TITLE"; | |
| 1997 case extract_struct_FENOTE: | |
| 1998 return "FENOTE"; | |
| 1999 case extract_struct_SUB: | |
| 2000 return "SUB"; | |
| 2001 case extract_struct_P: | |
| 2002 return "P"; | |
| 2003 case extract_struct_H: | |
| 2004 return "H"; | |
| 2005 case extract_struct_H1: | |
| 2006 return "H1"; | |
| 2007 case extract_struct_H2: | |
| 2008 return "H2"; | |
| 2009 case extract_struct_H3: | |
| 2010 return "H3"; | |
| 2011 case extract_struct_H4: | |
| 2012 return "H4"; | |
| 2013 case extract_struct_H5: | |
| 2014 return "H5"; | |
| 2015 case extract_struct_H6: | |
| 2016 return "H6"; | |
| 2017 case extract_struct_LIST: | |
| 2018 return "LIST"; | |
| 2019 case extract_struct_LISTITEM: | |
| 2020 return "LISTITEM"; | |
| 2021 case extract_struct_LABEL: | |
| 2022 return "LABEL"; | |
| 2023 case extract_struct_LISTBODY: | |
| 2024 return "LISTBODY"; | |
| 2025 case extract_struct_TABLE: | |
| 2026 return "TABLE"; | |
| 2027 case extract_struct_TR: | |
| 2028 return "TR"; | |
| 2029 case extract_struct_TH: | |
| 2030 return "TH"; | |
| 2031 case extract_struct_TD: | |
| 2032 return "TD"; | |
| 2033 case extract_struct_THEAD: | |
| 2034 return "THEAD"; | |
| 2035 case extract_struct_TBODY: | |
| 2036 return "TBODY"; | |
| 2037 case extract_struct_TFOOT: | |
| 2038 return "TFOOT"; | |
| 2039 case extract_struct_SPAN: | |
| 2040 return "SPAN"; | |
| 2041 case extract_struct_QUOTE: | |
| 2042 return "QUOTE"; | |
| 2043 case extract_struct_NOTE: | |
| 2044 return "NOTE"; | |
| 2045 case extract_struct_REFERENCE: | |
| 2046 return "REFERENCE"; | |
| 2047 case extract_struct_BIBENTRY: | |
| 2048 return "BIBENTRY"; | |
| 2049 case extract_struct_CODE: | |
| 2050 return "CODE"; | |
| 2051 case extract_struct_LINK: | |
| 2052 return "LINK"; | |
| 2053 case extract_struct_ANNOT: | |
| 2054 return "ANNOT"; | |
| 2055 case extract_struct_EM: | |
| 2056 return "EM"; | |
| 2057 case extract_struct_STRONG: | |
| 2058 return "STRONG"; | |
| 2059 case extract_struct_RUBY: | |
| 2060 return "RUBY"; | |
| 2061 case extract_struct_RB: | |
| 2062 return "RB"; | |
| 2063 case extract_struct_RT: | |
| 2064 return "RT"; | |
| 2065 case extract_struct_RP: | |
| 2066 return "RP"; | |
| 2067 case extract_struct_WARICHU: | |
| 2068 return "WARICHU"; | |
| 2069 case extract_struct_WT: | |
| 2070 return "WT"; | |
| 2071 case extract_struct_WP: | |
| 2072 return "WP"; | |
| 2073 case extract_struct_FIGURE: | |
| 2074 return "FIGURE"; | |
| 2075 case extract_struct_FORMULA: | |
| 2076 return "FORMULA"; | |
| 2077 case extract_struct_FORM: | |
| 2078 return "FORM"; | |
| 2079 case extract_struct_ARTIFACT: | |
| 2080 return "ARTIFACT"; | |
| 2081 } | |
| 2082 } | |
| 2083 | |
| 2084 static int | |
| 2085 paragraph_to_text( | |
| 2086 extract_alloc_t *alloc, | |
| 2087 paragraph_t *paragraph, | |
| 2088 extract_astring_t *text) | |
| 2089 { | |
| 2090 content_line_iterator lit; | |
| 2091 line_t *line; | |
| 2092 | |
| 2093 for (line = content_line_iterator_init(&lit, ¶graph->content); line != NULL; line = content_line_iterator_next(&lit)) | |
| 2094 { | |
| 2095 content_span_iterator sit; | |
| 2096 span_t *span; | |
| 2097 | |
| 2098 for (span = content_span_iterator_init(&sit, &line->content); span != NULL; span = content_span_iterator_next(&sit)) | |
| 2099 { | |
| 2100 int c; | |
| 2101 | |
| 2102 for (c=0; c<span->chars_num; ++c) | |
| 2103 { | |
| 2104 /* We encode each character as utf8. */ | |
| 2105 char_t* char_ = &span->chars[c]; | |
| 2106 unsigned cc = char_->ucs; | |
| 2107 if (extract_astring_catc_unicode( | |
| 2108 alloc, | |
| 2109 text, | |
| 2110 cc, | |
| 2111 0 /*xml*/, | |
| 2112 1 /*ascii_ligatures*/, | |
| 2113 1 /*ascii_dash*/, | |
| 2114 1 /*ascii_apostrophe*/ | |
| 2115 )) return -1; | |
| 2116 } | |
| 2117 } | |
| 2118 } | |
| 2119 if (extract_astring_catc(alloc, text, '\n')) return -1; | |
| 2120 | |
| 2121 return 0; | |
| 2122 } | |
| 2123 | |
| 2124 static int | |
| 2125 paragraphs_to_text_content( | |
| 2126 extract_alloc_t *alloc, | |
| 2127 content_root_t *paragraphs, | |
| 2128 extract_astring_t *text) | |
| 2129 { | |
| 2130 content_iterator cit; | |
| 2131 content_t *content; | |
| 2132 | |
| 2133 for (content = content_iterator_init(&cit, paragraphs); content != NULL; content = content_iterator_next(&cit)) | |
| 2134 { | |
| 2135 if (content->type == content_paragraph) | |
| 2136 { | |
| 2137 if (paragraph_to_text(alloc, (paragraph_t *)content, text)) return -1; | |
| 2138 } | |
| 2139 else if (content->type == content_block) | |
| 2140 { | |
| 2141 block_t *block = (block_t *)content; | |
| 2142 content_paragraph_iterator pit; | |
| 2143 paragraph_t *paragraph; | |
| 2144 | |
| 2145 for (paragraph = content_paragraph_iterator_init(&pit, &block->content); paragraph != NULL; paragraph = content_paragraph_iterator_next(&pit)) | |
| 2146 { | |
| 2147 if (paragraph_to_text(alloc, paragraph, text)) return -1; | |
| 2148 } | |
| 2149 } | |
| 2150 } | |
| 2151 return 0; | |
| 2152 } | |
| 2153 | |
| 2154 | |
| 2155 static int extract_write_tables_csv(extract_t *extract) | |
| 2156 { | |
| 2157 int ret = -1; | |
| 2158 int p; | |
| 2159 char *path = NULL; | |
| 2160 FILE *f = NULL; | |
| 2161 extract_astring_t text = {NULL, 0}; | |
| 2162 | |
| 2163 if (!extract->tables_csv_format) return 0; | |
| 2164 | |
| 2165 outf("extract_write_tables_csv(): path_format=%s", extract->tables_csv_format); | |
| 2166 outf("extract->document.pages_num=%i", extract->document.pages_num); | |
| 2167 for (p=0; p<extract->document.pages_num; ++p) | |
| 2168 { | |
| 2169 int c; | |
| 2170 extract_page_t *page = extract->document.pages[p]; | |
| 2171 for (c=0; c<page->subpages_num; ++c) | |
| 2172 { | |
| 2173 content_table_iterator tit; | |
| 2174 table_t *table; | |
| 2175 subpage_t *subpage = page->subpages[c]; | |
| 2176 | |
| 2177 outf("p=%i subpage->tables_num=%i", p, content_count_tables(&subpage->tables)); | |
| 2178 for (table = content_table_iterator_init(&tit, &subpage->tables); table != NULL; table = content_table_iterator_next(&tit)) | |
| 2179 { | |
| 2180 int y; | |
| 2181 extract_free(extract->alloc, &path); | |
| 2182 if (extract_asprintf(extract->alloc, &path, extract->tables_csv_format, extract->tables_csv_i) < 0) goto end; | |
| 2183 extract->tables_csv_i += 1; | |
| 2184 outf("Writing table to: %s", path); | |
| 2185 outf("table->cells_num_x=%i", table->cells_num_x); | |
| 2186 outf("table->cells_num_y=%i", table->cells_num_y); | |
| 2187 f = fopen(path, "w"); | |
| 2188 if (!f) goto end; | |
| 2189 for (y=0; y<table->cells_num_y; ++y) | |
| 2190 { | |
| 2191 int x; | |
| 2192 int have_output = 0; | |
| 2193 for (x=0; x<table->cells_num_x; ++x) | |
| 2194 { | |
| 2195 cell_t* cell = table->cells[table->cells_num_x * y + x]; | |
| 2196 extract_astring_free(extract->alloc, &text); | |
| 2197 if (y==0) | |
| 2198 { | |
| 2199 outf("y=0 x=%i cell->rect=%s", x, extract_rect_string(&cell->rect)); | |
| 2200 } | |
| 2201 if (have_output) fprintf(f, ","); | |
| 2202 have_output = 1; | |
| 2203 if (paragraphs_to_text_content( | |
| 2204 extract->alloc, | |
| 2205 &cell->content, | |
| 2206 &text | |
| 2207 )) goto end; | |
| 2208 /* Reference cvs output trims trailing spaces. */ | |
| 2209 extract_astring_char_truncate_if(&text, ' '); | |
| 2210 fprintf(f, "\"%s\"", text.chars ? text.chars : ""); | |
| 2211 } | |
| 2212 fprintf(f, "\n"); | |
| 2213 } | |
| 2214 fclose(f); | |
| 2215 f = NULL; | |
| 2216 } | |
| 2217 } | |
| 2218 } | |
| 2219 | |
| 2220 ret = 0; | |
| 2221 end: | |
| 2222 | |
| 2223 if (f) fclose(f); | |
| 2224 extract_free(extract->alloc, &path); | |
| 2225 extract_astring_free(extract->alloc, &text); | |
| 2226 | |
| 2227 return ret; | |
| 2228 } | |
| 2229 | |
| 2230 | |
| 2231 int extract_process( | |
| 2232 extract_t *extract, | |
| 2233 int spacing, | |
| 2234 int rotation, | |
| 2235 int images) | |
| 2236 { | |
| 2237 int e = -1; | |
| 2238 | |
| 2239 if (extract_realloc2( | |
| 2240 extract->alloc, | |
| 2241 &extract->contentss, | |
| 2242 sizeof(*extract->contentss) * extract->contentss_num, | |
| 2243 sizeof(*extract->contentss) * (extract->contentss_num + 1) | |
| 2244 )) goto end; | |
| 2245 extract_astring_init(&extract->contentss[extract->contentss_num]); | |
| 2246 extract->contentss_num += 1; | |
| 2247 | |
| 2248 if (extract_document_join(extract->alloc, &extract->document, extract->layout_analysis, extract->master_space_guess)) goto end; | |
| 2249 | |
| 2250 switch (extract->format) | |
| 2251 { | |
| 2252 case extract_format_ODT: | |
| 2253 if (extract_document_to_odt_content( | |
| 2254 extract->alloc, | |
| 2255 &extract->document, | |
| 2256 spacing, | |
| 2257 rotation, | |
| 2258 images, | |
| 2259 &extract->contentss[extract->contentss_num - 1], | |
| 2260 &extract->odt_styles | |
| 2261 )) goto end; | |
| 2262 break; | |
| 2263 case extract_format_DOCX: | |
| 2264 if (extract_document_to_docx_content( | |
| 2265 extract->alloc, | |
| 2266 &extract->document, | |
| 2267 spacing, | |
| 2268 rotation, | |
| 2269 images, | |
| 2270 &extract->contentss[extract->contentss_num - 1] | |
| 2271 )) goto end; | |
| 2272 break; | |
| 2273 case extract_format_HTML: | |
| 2274 if (extract_document_to_html_content( | |
| 2275 extract->alloc, | |
| 2276 &extract->document, | |
| 2277 rotation, | |
| 2278 images, | |
| 2279 &extract->contentss[extract->contentss_num - 1] | |
| 2280 )) goto end; | |
| 2281 break; | |
| 2282 case extract_format_JSON: | |
| 2283 if (extract_document_to_json_content( | |
| 2284 extract->alloc, | |
| 2285 &extract->document, | |
| 2286 rotation, | |
| 2287 images, | |
| 2288 &extract->contentss[extract->contentss_num - 1] | |
| 2289 )) goto end; | |
| 2290 break; | |
| 2291 case extract_format_TEXT: | |
| 2292 { | |
| 2293 int p; | |
| 2294 for (p=0; p<extract->document.pages_num; ++p) | |
| 2295 { | |
| 2296 extract_page_t* page = extract->document.pages[p]; | |
| 2297 int c; | |
| 2298 for (c=0; c<page->subpages_num; ++c) | |
| 2299 { | |
| 2300 subpage_t* subpage = page->subpages[c]; | |
| 2301 if (paragraphs_to_text_content( | |
| 2302 extract->alloc, | |
| 2303 &subpage->content, | |
| 2304 &extract->contentss[extract->contentss_num - 1] | |
| 2305 )) goto end; | |
| 2306 } | |
| 2307 } | |
| 2308 break; | |
| 2309 } | |
| 2310 default: | |
| 2311 outf0("Invalid format=%i", extract->format); | |
| 2312 assert(0); | |
| 2313 errno = EINVAL; | |
| 2314 return 1; | |
| 2315 } | |
| 2316 | |
| 2317 if (extract_document_images(extract->alloc, &extract->document, &extract->images)) goto end; | |
| 2318 | |
| 2319 if (extract->tables_csv_format) | |
| 2320 { | |
| 2321 extract_write_tables_csv(extract); | |
| 2322 } | |
| 2323 | |
| 2324 { | |
| 2325 int p; | |
| 2326 for (p=0; p<extract->document.pages_num; ++p) { | |
| 2327 page_free(extract->alloc, &extract->document.pages[p]); | |
| 2328 } | |
| 2329 extract_free(extract->alloc, &extract->document.pages); | |
| 2330 extract->document.pages_num = 0; | |
| 2331 } | |
| 2332 | |
| 2333 e = 0; | |
| 2334 end: | |
| 2335 | |
| 2336 return e; | |
| 2337 } | |
| 2338 | |
| 2339 int extract_write(extract_t *extract, extract_buffer_t *buffer) | |
| 2340 { | |
| 2341 int e = -1; | |
| 2342 extract_zip_t *zip = NULL; | |
| 2343 char *text2 = NULL; | |
| 2344 int i; | |
| 2345 | |
| 2346 switch (extract->format) | |
| 2347 { | |
| 2348 case extract_format_ODT: | |
| 2349 { | |
| 2350 if (extract_zip_open(buffer, &zip)) goto end; | |
| 2351 for (i=0; i<odt_template_items_num; ++i) { | |
| 2352 const odt_template_item_t* item = &odt_template_items[i]; | |
| 2353 extract_free(extract->alloc, &text2); | |
| 2354 outf("i=%i item->name=%s", i, item->name); | |
| 2355 if (extract_odt_content_item( | |
| 2356 extract->alloc, | |
| 2357 extract->contentss, | |
| 2358 extract->contentss_num, | |
| 2359 &extract->odt_styles, | |
| 2360 &extract->images, | |
| 2361 item->name, | |
| 2362 item->text, | |
| 2363 &text2 | |
| 2364 )) | |
| 2365 { | |
| 2366 goto end; | |
| 2367 } | |
| 2368 { | |
| 2369 const char* text3 = (text2) ? text2 : item->text; | |
| 2370 if (extract_zip_write_file(zip, text3, strlen(text3), item->name)) goto end; | |
| 2371 } | |
| 2372 } | |
| 2373 outf0("extract->images.images_num=%i", extract->images.images_num); | |
| 2374 for (i=0; i<extract->images.images_num; ++i) { | |
| 2375 image_t* image = extract->images.images[i]; | |
| 2376 extract_free(extract->alloc, &text2); | |
| 2377 if (extract_asprintf(extract->alloc, &text2, "Pictures/%s", image->name) < 0) goto end; | |
| 2378 if (extract_zip_write_file(zip, image->data, image->data_size, text2)) goto end; | |
| 2379 } | |
| 2380 if (extract_zip_close(&zip)) goto end; | |
| 2381 break; | |
| 2382 } | |
| 2383 case extract_format_DOCX: | |
| 2384 { | |
| 2385 if (extract_zip_open(buffer, &zip)) goto end; | |
| 2386 for (i=0; i<docx_template_items_num; ++i) { | |
| 2387 const docx_template_item_t* item = &docx_template_items[i]; | |
| 2388 extract_free(extract->alloc, &text2); | |
| 2389 outf("i=%i item->name=%s", i, item->name); | |
| 2390 if (extract_docx_content_item( | |
| 2391 extract->alloc, | |
| 2392 extract->contentss, | |
| 2393 extract->contentss_num, | |
| 2394 &extract->images, | |
| 2395 item->name, | |
| 2396 item->text, | |
| 2397 &text2 | |
| 2398 )) | |
| 2399 { | |
| 2400 goto end; | |
| 2401 } | |
| 2402 | |
| 2403 { | |
| 2404 const char* text3 = (text2) ? text2 : item->text; | |
| 2405 if (extract_zip_write_file(zip, text3, strlen(text3), item->name)) goto end; | |
| 2406 } | |
| 2407 } | |
| 2408 for (i=0; i<extract->images.images_num; ++i) { | |
| 2409 image_t* image = extract->images.images[i]; | |
| 2410 extract_free(extract->alloc, &text2); | |
| 2411 if (extract_asprintf(extract->alloc, &text2, "word/media/%s", image->name) < 0) goto end; | |
| 2412 if (extract_zip_write_file(zip, image->data, image->data_size, text2)) goto end; | |
| 2413 } | |
| 2414 if (extract_zip_close(&zip)) goto end; | |
| 2415 break; | |
| 2416 } | |
| 2417 case extract_format_HTML: | |
| 2418 case extract_format_TEXT: | |
| 2419 for (i=0; i<extract->contentss_num; ++i) | |
| 2420 { | |
| 2421 if (extract_buffer_write(buffer, extract->contentss[i].chars, extract->contentss[i].chars_num, NULL)) goto end; | |
| 2422 } | |
| 2423 break; | |
| 2424 case extract_format_JSON: | |
| 2425 { | |
| 2426 int first = 1; | |
| 2427 if (extract_buffer_cat(buffer, "{\n\"elements\" : ")) | |
| 2428 goto end; | |
| 2429 for (i=0; i<extract->contentss_num; ++i) | |
| 2430 { | |
| 2431 if (!first && extract_buffer_cat(buffer, ",\n")) | |
| 2432 goto end; | |
| 2433 if (extract->contentss[i].chars_num > 0) | |
| 2434 first = 0; | |
| 2435 if (extract_buffer_write(buffer, extract->contentss[i].chars, extract->contentss[i].chars_num, NULL)) goto end; | |
| 2436 } | |
| 2437 if (extract_buffer_cat(buffer, "\n}\n")) | |
| 2438 goto end; | |
| 2439 break; | |
| 2440 } | |
| 2441 default: | |
| 2442 outf0("Invalid format=%i", extract->format); | |
| 2443 assert(0); | |
| 2444 errno = EINVAL; | |
| 2445 return 1; | |
| 2446 } | |
| 2447 | |
| 2448 e = 0; | |
| 2449 end: | |
| 2450 | |
| 2451 if (e) | |
| 2452 { | |
| 2453 outf("failed: %s", strerror(errno)); | |
| 2454 extract_zip_close(&zip); | |
| 2455 } | |
| 2456 extract_free(extract->alloc, &text2); | |
| 2457 | |
| 2458 return e; | |
| 2459 } | |
| 2460 | |
| 2461 int extract_write_content(extract_t *extract, extract_buffer_t *buffer) | |
| 2462 { | |
| 2463 int i; | |
| 2464 | |
| 2465 for (i=0; i<extract->contentss_num; ++i) { | |
| 2466 if (extract_buffer_write( | |
| 2467 buffer, | |
| 2468 extract->contentss[i].chars, | |
| 2469 extract->contentss[i].chars_num, | |
| 2470 NULL /*o_actual*/ | |
| 2471 )) return -1; | |
| 2472 } | |
| 2473 | |
| 2474 return 0; | |
| 2475 } | |
| 2476 | |
/* Returns 1 if <string> ends with the suffix <end>, otherwise 0.
 *
 * Both arguments must be NUL-terminated strings. An empty suffix matches
 * every string. */
static int string_ends_with(const char *string, const char *end)
{
    const size_t full_len = strlen(string);
    const size_t suffix_len = strlen(end);

    /* A suffix longer than the string can never match. */
    if (full_len < suffix_len)
        return 0;

    /* The tail and the suffix have the same length, so a plain string
     * comparison is enough. */
    return strcmp(string + (full_len - suffix_len), end) == 0;
}
| 2486 | |
| 2487 int extract_write_template( | |
| 2488 extract_t *extract, | |
| 2489 const char *path_template, | |
| 2490 const char *path_out, | |
| 2491 int preserve_dir) | |
| 2492 { | |
| 2493 if (string_ends_with(path_out, ".odt")) | |
| 2494 { | |
| 2495 return extract_odt_write_template( | |
| 2496 extract->alloc, | |
| 2497 extract->contentss, | |
| 2498 extract->contentss_num, | |
| 2499 &extract->odt_styles, | |
| 2500 &extract->images, | |
| 2501 path_template, | |
| 2502 path_out, | |
| 2503 preserve_dir); | |
| 2504 } | |
| 2505 else | |
| 2506 { | |
| 2507 return extract_docx_write_template( | |
| 2508 extract->alloc, | |
| 2509 extract->contentss, | |
| 2510 extract->contentss_num, | |
| 2511 &extract->images, | |
| 2512 path_template, | |
| 2513 path_out, | |
| 2514 preserve_dir); | |
| 2515 } | |
| 2516 } | |
| 2517 | |
| 2518 | |
| 2519 void extract_end(extract_t **pextract) | |
| 2520 { | |
| 2521 int i; | |
| 2522 extract_t *extract = *pextract; | |
| 2523 | |
| 2524 if (!extract) return; | |
| 2525 | |
| 2526 extract_document_free(extract->alloc, &extract->document); | |
| 2527 for (i=0; i<extract->contentss_num; ++i) { | |
| 2528 extract_astring_free(extract->alloc, &extract->contentss[i]); | |
| 2529 } | |
| 2530 extract_free(extract->alloc, &extract->contentss); | |
| 2531 extract_images_free(extract->alloc, &extract->images); | |
| 2532 extract_odt_styles_free(extract->alloc, &extract->odt_styles); | |
| 2533 | |
| 2534 extract_free(extract->alloc, pextract); | |
| 2535 } | |
| 2536 | |
/* Library-wide cleanup hook: calls extract_span_string() with NULL
 * arguments.
 *
 * NOTE(review): presumably that call releases a buffer kept internally by
 * extract_span_string() - confirm against its definition. */
void extract_internal_end(void)
{
    extract_span_string(NULL, NULL);
}
| 2541 | |
| 2542 void extract_exp_min(extract_t *extract, size_t size) | |
| 2543 { | |
| 2544 extract_alloc_exp_min(extract->alloc, size); | |
| 2545 } | |
| 2546 | |
| 2547 double extract_font_size(matrix4_t *ctm) | |
| 2548 { | |
| 2549 double font_size = extract_matrix_expansion(*ctm); | |
| 2550 | |
| 2551 /* Round font_size to nearest 0.01. */ | |
| 2552 font_size = (double) (int) (font_size * 100.0f + 0.5f) / 100.0f; | |
| 2553 | |
| 2554 return font_size; | |
| 2555 } | |
| 2556 | |
| 2557 rect_t extract_block_pre_rotation_bounds(block_t *block, double angle) | |
| 2558 { | |
| 2559 content_paragraph_iterator pit; | |
| 2560 paragraph_t *paragraph; | |
| 2561 rect_t pre_box = extract_rect_empty; | |
| 2562 matrix4_t unrotate, rotate; | |
| 2563 point_t centre, trans_centre; | |
| 2564 | |
| 2565 /* Construct a matrix to undo the rotation that we are about to put into | |
| 2566 * the file. i.e. get us a matrix that maps us from where the chars are | |
| 2567 * positioned back to the pre-rotated position. These pre-rotated positions | |
| 2568 * can then be used to calculate the origin/extent of the area that we | |
| 2569 * need to put into the file. */ | |
| 2570 | |
| 2571 /* The well know rotation matrixes: | |
| 2572 * | |
| 2573 * CW: [ cos(theta) sin(theta) ] CCW: [ cos(theta) -sin(theta) ] | |
| 2574 * [ -sin(theta) cos(theta) ] [ sin(theta) cos(theta) ] | |
| 2575 */ | |
| 2576 | |
| 2577 /* Word gives us an angle to rotate by clockwise. So the inverse is the | |
| 2578 * CCW matrix: */ | |
| 2579 unrotate.a = cos(angle); | |
| 2580 unrotate.b = -sin(angle); | |
| 2581 unrotate.c = -unrotate.b; | |
| 2582 unrotate.d = unrotate.a; | |
| 2583 /* And the forward rotation is the CW matrix: */ | |
| 2584 rotate.a = unrotate.a; /* cos(theta) = cos(-theta) */ | |
| 2585 rotate.b = -unrotate.b; /* sin(theta) = -sin(-theta) */ | |
| 2586 rotate.c = -rotate.b; | |
| 2587 rotate.d = rotate.a; | |
| 2588 | |
| 2589 /* So ctm.unrotate.rotate = ctm, by construction. ctm.unrotate should | |
| 2590 * (in the common cases where the ctm is just a scale + rotation) map | |
| 2591 * all our character locations back to a rectangular region. We now | |
| 2592 * calculate that region as pre_box. */ | |
| 2593 | |
| 2594 for (paragraph = content_paragraph_iterator_init(&pit, &block->content); paragraph != NULL; paragraph = content_paragraph_iterator_next(&pit)) | |
| 2595 { | |
| 2596 content_line_iterator lit; | |
| 2597 line_t *line; | |
| 2598 | |
| 2599 for (line = content_line_iterator_init(&lit, ¶graph->content); line != NULL; line = content_line_iterator_next(&lit)) | |
| 2600 { | |
| 2601 span_t *span0 = content_first_span(&line->content); | |
| 2602 span_t *span1 = content_last_span(&line->content); | |
| 2603 point_t start = { span0->chars[0].x, span0->chars[0].y}; | |
| 2604 point_t end = extract_end_of_span(span1); | |
| 2605 double hoff = span0->font_bbox.max.y - (span0->font_bbox.min.y < 0 ? span0->font_bbox.min.y : 0); | |
| 2606 | |
| 2607 outf("%f %f -> %f %f\n", start.x, start.y, end.x, end.y); | |
| 2608 start = extract_matrix4_transform_point(unrotate, start); | |
| 2609 end = extract_matrix4_transform_point(unrotate, end); | |
| 2610 outf(" ---------> %f %f -> %f %f\n", start.x, start.y, end.x, end.y); | |
| 2611 | |
| 2612 /* Allow for the height of the span here. */ | |
| 2613 hoff *= sqrt(span0->ctm.c * span0->ctm.c + span0->ctm.d * span0->ctm.d); | |
| 2614 | |
| 2615 if (start.y < end.y) | |
| 2616 start.y -= hoff; | |
| 2617 else | |
| 2618 end.y -= hoff; | |
| 2619 pre_box = extract_rect_union_point(pre_box, start); | |
| 2620 pre_box = extract_rect_union_point(pre_box, end); | |
| 2621 } | |
| 2622 } | |
| 2623 | |
| 2624 /* So pre_box rotated around the origin by angle should give us the region we want. */ | |
| 2625 /* BUT word etc rotate around the centre of the box. So we need to offset the region to | |
| 2626 * allow for this. */ | |
| 2627 /* So word, takes the declared box, and subtracts the centre vector from it. Then it | |
| 2628 * does the rotation (around the origin - now the centre of the box). Then it adds the | |
| 2629 * centre vector to it again. So the centre of the box does not change. Unfortunately, | |
| 2630 * we haven't easily got the centre vector of the transformed box to hand, so calculate | |
| 2631 * it by rerotating the centre vector of the pre_box.*/ | |
| 2632 centre.x = (pre_box.min.x + pre_box.max.x)/2; | |
| 2633 centre.y = (pre_box.min.y + pre_box.max.y)/2; | |
| 2634 trans_centre = extract_matrix4_transform_point(rotate, centre); | |
| 2635 #if 0 | |
| 2636 { | |
| 2637 point_t centre2 = extract_matrix4_transform_point(unrotate, trans_centre); | |
| 2638 centre2 = centre2; | |
| 2639 } | |
| 2640 #endif | |
| 2641 #if 0 | |
| 2642 printf("Centre of this paragraph should be %f %f\n", trans_centre.x, trans_centre.y); | |
| 2643 #endif | |
| 2644 | |
| 2645 /* So the centre of our pre_box should be trans_centre not centre. */ | |
| 2646 centre.x -= trans_centre.x; | |
| 2647 centre.y -= trans_centre.y; | |
| 2648 pre_box.min.x -= centre.x; | |
| 2649 pre_box.min.y -= centre.y; | |
| 2650 pre_box.max.x -= centre.x; | |
| 2651 pre_box.max.y -= centre.y; | |
| 2652 | |
| 2653 #if 0 | |
| 2654 /* So, as a sanity check, convert the 4 corners back to a quad. */ | |
| 2655 { | |
| 2656 rect_t centred_box = { pre_box.min.x - trans_centre.x, | |
| 2657 pre_box.min.y - trans_centre.y, | |
| 2658 pre_box.max.x - trans_centre.x, | |
| 2659 pre_box.max.y - trans_centre.y }; | |
| 2660 point_t corner; | |
| 2661 | |
| 2662 corner = extract_matrix4_transform_xy(rotate, centred_box.min.x, centred_box.min.y); | |
| 2663 corner.x += trans_centre.x; | |
| 2664 corner.y += trans_centre.y; | |
| 2665 printf("TL: %f %f\n", corner.x, corner.y); | |
| 2666 corner = extract_matrix4_transform_xy(rotate, centred_box.max.x, centred_box.min.y); | |
| 2667 corner.x += trans_centre.x; | |
| 2668 corner.y += trans_centre.y; | |
| 2669 printf("TR: %f %f\n", corner.x, corner.y); | |
| 2670 corner = extract_matrix4_transform_xy(rotate, centred_box.max.x, centred_box.max.y); | |
| 2671 corner.x += trans_centre.x; | |
| 2672 corner.y += trans_centre.y; | |
| 2673 printf("BR: %f %f\n", corner.x, corner.y); | |
| 2674 corner = extract_matrix4_transform_xy(rotate, centred_box.min.x, centred_box.max.y); | |
| 2675 corner.x += trans_centre.x; | |
| 2676 corner.y += trans_centre.y; | |
| 2677 printf("BL: %f %f\n", corner.x, corner.y); | |
| 2678 } | |
| 2679 #endif | |
| 2680 | |
| 2681 /* And a further adjustment. If we mess up line widths, text can wrap too early, | |
| 2682 * resulting in content extending too far down the page, and truncating at the | |
| 2683 * bottom of the text frame. Similarly, line spacing. We can't tell word 'make | |
| 2684 * the box large enough', so we have to add a fudge factor and extend the bottom | |
| 2685 * of the box ourselves. As long as we aren't filling the background, or drawing | |
| 2686 * a bounding box, this should be fine. | |
| 2687 * | |
| 2688 * Unfortunately, we can't just extend pre_box downwards, because we rotate from | |
| 2689 * the centre of the box, so we need to adjust for that. | |
| 2690 */ | |
| 2691 /* Double the height of the box. */ | |
| 2692 { | |
| 2693 /* extra = how much to extend the box downwards. */ | |
| 2694 double extra = pre_box.max.y - pre_box.min.y; | |
| 2695 /* So we are offsetting the centre of the box by offset. */ | |
| 2696 point_t offset = { 0, extra/2 }; | |
| 2697 point_t toffset; | |
| 2698 pre_box.max.y += extra; | |
| 2699 toffset = extract_matrix4_transform_point(rotate, offset); | |
| 2700 pre_box.min.x += toffset.x - offset.x; | |
| 2701 pre_box.min.y += toffset.y - offset.y; | |
| 2702 pre_box.max.x += toffset.x - offset.x; | |
| 2703 pre_box.max.y += toffset.y - offset.y; | |
| 2704 } | |
| 2705 | |
| 2706 return pre_box; | |
| 2707 } | |
| 2708 | |
| 2709 double extract_baseline_angle(const matrix4_t *ctm) | |
| 2710 { | |
| 2711 return atan2(ctm->b, ctm->a); | |
| 2712 } |
