comparison mupdf-source/thirdparty/extract/src/extract.c @ 2:b50eed0cc0ef upstream

ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4. The directory name has changed: no version number in the expanded directory now.
author Franz Glasner <fzglas.hg@dom66.de>
date Mon, 15 Sep 2025 11:43:07 +0200
parents
children
comparison
equal deleted inserted replaced
1:1d09e1dec1d9 2:b50eed0cc0ef
1 #include "extract/extract.h"
2 #include "extract/alloc.h"
3
4 #include "astring.h"
5 #include "document.h"
6 #include "docx.h"
7 #include "docx_template.h"
8 #include "html.h"
9 #include "json.h"
10 #include "mem.h"
11 #include "odt.h"
12 #include "odt_template.h"
13 #include "outf.h"
14 #include "xml.h"
15 #include "zip.h"
16
17
18 #include <assert.h>
19 #include <errno.h>
20 #include <math.h>
21 #include <stdarg.h>
22 #include <stdio.h>
23 #include <stdlib.h>
24 #include <string.h>
25
26
27
/* Rectangle spanning the whole representable plane: every coordinate of the
 * first point is -DBL_MAX and of the second +DBL_MAX (assumes rect_t is laid
 * out as {min, max} - TODO confirm against the rect_t declaration). */
const rect_t extract_rect_infinite = { { -DBL_MAX, -DBL_MAX }, { DBL_MAX, DBL_MAX } };
/* Inverted rectangle (first point at +DBL_MAX, second at -DBL_MAX); used as
 * the initial bbox of a freshly initialised char_t in char_init(). */
const rect_t extract_rect_empty = { { DBL_MAX, DBL_MAX }, { -DBL_MAX, -DBL_MAX } };
30
31
32 double extract_matrix_expansion(matrix4_t m)
33 {
34 return sqrt(fabs(m.a * m.d - m.b * m.c));
35 }
36
37 matrix4_t extract_matrix4_invert(const matrix4_t *ctm)
38 {
39 matrix4_t ctm_inverse = {1, 0, 0, 1};
40 double ctm_det = ctm->a*ctm->d - ctm->b*ctm->c;
41
42 if (ctm_det == 0) {
43 outf("cannot invert ctm=(%f %f %f %f)",
44 ctm->a, ctm->b, ctm->c, ctm->d);
45 }
46 else
47 {
48 ctm_inverse.a = +ctm->d / ctm_det;
49 ctm_inverse.b = -ctm->b / ctm_det;
50 ctm_inverse.c = -ctm->c / ctm_det;
51 ctm_inverse.d = +ctm->a / ctm_det;
52 }
53
54 return ctm_inverse;
55 }
56
57 static void char_init(char_t* item)
58 {
59 item->x = 0;
60 item->y = 0;
61 item->ucs = 0;
62 item->adv = 0;
63 item->bbox = extract_rect_empty;
64 }
65
66 const char *extract_point_string(const point_t *point)
67 {
68 static char buffer[128];
69
70 snprintf(buffer, sizeof(buffer), "(%f %f)", point->x, point->y);
71
72 return buffer;
73 }
74
75 const char *extract_rect_string(const rect_t *rect)
76 {
77 static char buffer[2][256];
78 static int i = 0;
79
80 i = (i + 1) % 2;
81 snprintf(buffer[i], sizeof(buffer[i]), "((%f %f) (%f %f))", rect->min.x, rect->min.y, rect->max.x, rect->max.y);
82
83 return buffer[i];
84 }
85
86 const char *extract_span_string(extract_alloc_t *alloc, span_t *span)
87 {
88 static extract_astring_t ret = {0};
89 double x0 = 0;
90 double y0 = 0;
91 double x1 = 0;
92 double y1 = 0;
93 int c0 = 0;
94 int c1 = 0;
95 int i;
96
97 extract_astring_free(alloc, &ret);
98 if (span == NULL)
99 {
100 /* This frees our internal data, and is used by extract_internal_end(). */
101 return NULL;
102 }
103
104 if (span->chars_num) {
105 c0 = span->chars[0].ucs;
106 x0 = span->chars[0].x;
107 y0 = span->chars[0].y;
108 c1 = span->chars[span->chars_num-1].ucs;
109 x1 = span->chars[span->chars_num-1].x;
110 y1 = span->chars[span->chars_num-1].y;
111 }
112 {
113 char buffer[400];
114 snprintf(buffer, sizeof(buffer),
115 "span ctm=%s chars_num=%i (%c:%f,%f)..(%c:%f,%f) font=%s:(%f) wmode=%i chars_num=%i: ",
116 extract_matrix4_string(&span->ctm),
117 span->chars_num,
118 c0, x0, y0,
119 c1, x1, y1,
120 span->font_name,
121 extract_font_size(&span->ctm),
122 span->flags.wmode,
123 span->chars_num
124 );
125 extract_astring_cat(alloc, &ret, buffer);
126 for (i=0; i<span->chars_num; ++i) {
127 snprintf(
128 buffer,
129 sizeof(buffer),
130 " i=%i {x=%f y=%f ucs=%i adv=%f}",
131 i,
132 span->chars[i].x,
133 span->chars[i].y,
134 span->chars[i].ucs,
135 span->chars[i].adv
136 );
137 extract_astring_cat(alloc, &ret, buffer);
138 }
139 }
140 extract_astring_cat(alloc, &ret, ": ");
141 extract_astring_catc(alloc, &ret, '"');
142 for (i=0; i<span->chars_num; ++i)
143 extract_astring_catc(alloc, &ret, (char) span->chars[i].ucs);
144 extract_astring_catc(alloc, &ret, '"');
145 return ret.chars;
146 }
147
148 char_t *extract_span_append_c(extract_alloc_t *alloc, span_t *span, int c)
149 {
150 char_t *item;
151
152 if (extract_realloc2(alloc,
153 &span->chars,
154 sizeof(*span->chars) * span->chars_num,
155 sizeof(*span->chars) * (span->chars_num + 1)))
156 {
157 return NULL;
158 }
159 item = &span->chars[span->chars_num];
160 span->chars_num += 1;
161 char_init(item);
162 item->ucs = c;
163
164 return item;
165 }
166
167 char_t *extract_span_char_last(span_t *span)
168 {
169 assert(span->chars_num > 0);
170 return &span->chars[span->chars_num-1];
171 }
172
173 /* Returns first span in a line. */
174 span_t *extract_line_span_last(line_t *line)
175 {
176 assert(line->content.base.prev != &line->content.base && line->content.base.prev->type == content_span);
177 return (span_t *)line->content.base.prev;
178 }
179
180 span_t *extract_line_span_first(line_t *line)
181 {
182 assert(line->content.base.next != &line->content.base && line->content.base.next->type == content_span);
183 return (span_t *)line->content.base.next;
184 }
185
186 void extract_paragraph_free(extract_alloc_t *alloc, paragraph_t **pparagraph)
187 {
188 paragraph_t *paragraph = *pparagraph;
189
190 if (paragraph == NULL)
191 return;
192
193 content_unlink(&paragraph->base);
194 content_clear(alloc, &paragraph->content);
195 extract_free(alloc, pparagraph);
196 }
197
198 void extract_block_free(extract_alloc_t *alloc, block_t **pblock)
199 {
200 block_t *block = *pblock;
201
202 if (block == NULL)
203 return;
204
205 content_unlink(&block->base);
206 content_clear(alloc, &block->content);
207 extract_free(alloc, pblock);
208 }
209
210 void extract_table_free(extract_alloc_t *alloc, table_t **ptable)
211 {
212 int c;
213 table_t *table = *ptable;
214
215 content_unlink(&table->base);
216 for (c = 0; c< table->cells_num_x * table->cells_num_y; ++c)
217 {
218 extract_cell_free(alloc, &table->cells[c]);
219 }
220 extract_free(alloc, &table->cells);
221 extract_free(alloc, ptable);
222 }
223
224 static void
225 structure_clear(extract_alloc_t *alloc, structure_t *structure)
226 {
227 while (structure != NULL)
228 {
229 structure_t *next = structure->sibling_next;
230 structure_clear(alloc, structure->kids_first);
231 extract_free(alloc, &structure);
232 structure = next;
233 }
234 }
235
236 void extract_subpage_free(extract_alloc_t *alloc, subpage_t **psubpage)
237 {
238 subpage_t *subpage = *psubpage;
239
240 if (!subpage) return;
241
242 content_clear(alloc, &subpage->content);
243 content_clear(alloc, &subpage->tables);
244
245 extract_free(alloc, &subpage->tablelines_horizontal.tablelines);
246 extract_free(alloc, &subpage->tablelines_vertical.tablelines);
247
248 extract_free(alloc, psubpage);
249 }
250
251 static void page_free(extract_alloc_t *alloc, extract_page_t **ppage)
252 {
253 int c;
254 extract_page_t *page = *ppage;
255
256 if (!page) return;
257
258 for (c=0; c<page->subpages_num; ++c)
259 {
260 subpage_t *subpage = page->subpages[c];
261 extract_subpage_free(alloc, &subpage);
262 }
263 extract_split_free(alloc, &page->split);
264 extract_free(alloc, &page->subpages);
265 extract_free(alloc, ppage);
266 }
267
268 void content_append(content_root_t *root, content_t *content)
269 {
270 assert(root && root->base.type == content_root);
271
272 /* Unlink content from anywhere it might be. */
273 content_unlink(content);
274
275 /* Sanity check root. */
276 if (root->base.next == &root->base)
277 {
278 assert(root->base.prev == &root->base);
279 }
280
281 /* And append content */
282 content->next = &root->base;
283 content->prev = root->base.prev;
284 content->prev->next = content;
285 root->base.prev = content;
286 }
287
288 void content_append_span(content_root_t *root, span_t *span)
289 {
290 content_append(root, &span->base);
291 }
292
293 void content_append_line(content_root_t *root, line_t *line)
294 {
295 content_append(root, &line->base);
296 }
297
298 void content_append_paragraph(content_root_t *root, paragraph_t *paragraph)
299 {
300 content_append(root, &paragraph->base);
301 }
302
303 void content_append_block(content_root_t *root, block_t *block)
304 {
305 content_append(root, &block->base);
306 }
307
308 int content_new_root(extract_alloc_t *alloc, content_root_t **proot)
309 {
310 if (extract_malloc(alloc, proot, sizeof(**proot))) return -1;
311 content_init_root(*proot, NULL);
312
313 return 0;
314 }
315
316 int content_new_span(extract_alloc_t *alloc, span_t **pspan, structure_t *structure)
317 {
318 if (extract_malloc(alloc, pspan, sizeof(**pspan))) return -1;
319 extract_span_init(*pspan, structure);
320
321 return 0;
322 }
323
324 int content_new_line(extract_alloc_t *alloc, line_t **pline)
325 {
326 if (extract_malloc(alloc, pline, sizeof(**pline))) return -1;
327 extract_line_init(*pline);
328
329 return 0;
330 }
331
332 int content_new_paragraph(extract_alloc_t *alloc, paragraph_t **pparagraph)
333 {
334 if (extract_malloc(alloc, pparagraph, sizeof(**pparagraph))) return -1;
335 extract_paragraph_init(*pparagraph);
336
337 return 0;
338 }
339
340 int content_new_block(extract_alloc_t *alloc, block_t **pblock)
341 {
342 if (extract_malloc(alloc, pblock, sizeof(**pblock))) return -1;
343 extract_block_init(*pblock);
344
345 return 0;
346 }
347
348 int content_new_table(extract_alloc_t *alloc, table_t **ptable)
349 {
350 if (extract_malloc(alloc, ptable, sizeof(**ptable))) return -1;
351 extract_table_init(*ptable);
352
353 return 0;
354 }
355
356 /* Appends new empty span content to a content_list_t; returns -1 with errno set on error. */
357 int content_append_new_span(extract_alloc_t *alloc, content_root_t *root, span_t **pspan, structure_t *structure)
358 {
359 if (content_new_span(alloc, pspan, structure)) return -1;
360 content_append(root, &(*pspan)->base);
361
362 return 0;
363 }
364
365 /* Appends new empty line content to a content_list_t; returns -1 with errno set on error. */
366 int content_append_new_line(extract_alloc_t *alloc, content_root_t *root, line_t **pline)
367 {
368 if (content_new_line(alloc, pline)) return -1;
369 content_append(root, &(*pline)->base);
370
371 return 0;
372 }
373
374 /* Appends new empty paragraph content to a content_list_t; returns -1 with errno set on error. */
375 int content_append_new_paragraph(extract_alloc_t *alloc, content_root_t *root, paragraph_t **pparagraph)
376 {
377 if (content_new_paragraph(alloc, pparagraph)) return -1;
378 content_append(root, &(*pparagraph)->base);
379
380 return 0;
381 }
382
383 /* Appends new empty block content to a content_list_t; returns -1 with errno set on error. */
384 int content_append_new_block(extract_alloc_t *alloc, content_root_t *root, block_t **pblock)
385 {
386 if (content_new_block(alloc, pblock)) return -1;
387 content_append(root, &(*pblock)->base);
388
389 return 0;
390 }
391
392 /* Appends new empty table content to a content_list_t; returns -1 with errno set on error. */
393 int content_append_new_table(extract_alloc_t *alloc, content_root_t *root, table_t **ptable)
394 {
395 if (content_new_table(alloc, ptable)) return -1;
396 content_append(root, &(*ptable)->base);
397
398 return 0;
399 }
400
401 /* Appends new empty image content to a content_list_t; returns -1 with errno set on error. */
402 int content_append_new_image(extract_alloc_t *alloc, content_root_t *root, image_t **pimage)
403 {
404 if (extract_malloc(alloc, pimage, sizeof(**pimage))) return -1;
405 extract_image_init(*pimage);
406 content_append(root, &(*pimage)->base);
407
408 return 0;
409 }
410
411 void content_replace(content_t *current, content_t *replacement)
412 {
413 assert(current->type != content_root && replacement->type != content_root);
414 /* Unlink replacement. */
415 if (replacement->prev)
416 {
417 replacement->prev->next = replacement->next;
418 replacement->next->prev = replacement->prev;
419 }
420 /* Insert replacement */
421 replacement->prev = current->prev;
422 current->prev->next = replacement;
423 replacement->next = current->next;
424 current->next->prev = replacement;
425 /* Unlink current */
426 current->prev = NULL;
427 current->next = NULL;
428 }
429
430 /* Replaces current element with a new empty paragraph content; returns -1 with errno set on error. */
431 int content_replace_new_paragraph(extract_alloc_t *alloc, content_t *current, paragraph_t **pparagraph)
432 {
433 if (content_new_paragraph(alloc, pparagraph)) return -1;
434 content_replace(current, &(*pparagraph)->base);
435
436 return 0;
437 }
438
439 /* Replaces current element with a new empty block content; returns -1 with errno set on error. */
440 int content_replace_new_block(extract_alloc_t *alloc, content_t *current, block_t **pblock)
441 {
442 if (content_new_block(alloc, pblock)) return -1;
443 content_replace(current, &(*pblock)->base);
444
445 return 0;
446 }
447
448 /* Replaces current element with a new empty line content; returns -1 with errno set on error. */
449 int content_replace_new_line(extract_alloc_t *alloc, content_t *current, line_t **pline)
450 {
451 if (content_new_line(alloc, pline)) return -1;
452 content_replace(current, &(*pline)->base);
453
454 return 0;
455 }
456
457 static void extract_images_free(extract_alloc_t *alloc, images_t *images)
458 {
459 int i;
460 for (i=0; i<images->images_num; ++i) {
461 extract_image_clear(alloc, images->images[i]);
462 extract_free(alloc, &images->images[i]);
463 }
464 extract_free(alloc, &images->images);
465 extract_free(alloc, &images->imagetypes);
466 images->images_num = 0;
467 images->imagetypes_num = 0;
468 }
469
470
471 /* Move image_t's from document->subpage[] to *o_images.
472
473 On return document->subpage[].images* will be NULL etc.
474 */
475 static int
476 extract_document_images(extract_alloc_t *alloc, document_t *document, images_t *o_images)
477 {
478 int e = -1;
479 int p;
480 images_t images = {0};
481
482 outf("extract_document_images(): images.images_num=%i", images.images_num);
483 for (p=0; p<document->pages_num; ++p)
484 {
485 extract_page_t *page = document->pages[p];
486 int c;
487 for (c=0; c<page->subpages_num; ++c)
488 {
489 subpage_t *subpage = page->subpages[c];
490 content_image_iterator iit;
491 image_t *image;
492 int i;
493
494 for (i = 0, image = content_image_iterator_init(&iit, &subpage->content); image != NULL; i++, image = content_image_iterator_next(&iit))
495 {
496 if (extract_realloc2(alloc,
497 &images.images,
498 sizeof(image_t) * images.images_num,
499 sizeof(image_t) * (images.images_num + 1))) goto end;
500 outf("p=%i i=%i image->name=%s image->id=%s", p, i, image->name, image->id);
501 assert(image->name);
502 content_unlink(&image->base);
503 images.images[images.images_num] = image;
504 images.images_num += 1;
505
506 /* Add image type if we haven't seen it before. */
507 {
508 int it;
509 for (it=0; it<images.imagetypes_num; ++it)
510 {
511 outf("it=%i images.imagetypes[it]=%s image->type=%s",
512 it, images.imagetypes[it], image->type);
513 if (!strcmp(images.imagetypes[it], image->type))
514 {
515 break;
516 }
517 }
518 if (it == images.imagetypes_num)
519 {
520 /* We haven't seen this image type before. */
521 if (extract_realloc2(
522 alloc,
523 &images.imagetypes,
524 sizeof(char*) * images.imagetypes_num,
525 sizeof(char*) * (images.imagetypes_num + 1)
526 )) goto end;
527 assert(image->type);
528 images.imagetypes[images.imagetypes_num] = image->type;
529 images.imagetypes_num += 1;
530 outf("have added images.imagetypes_num=%i", images.imagetypes_num);
531 }
532 }
533 }
534 }
535 }
536
537 e = 0;
538 end:
539
540 if (e)
541 {
542 extract_free(alloc, &images.images);
543 }
544 else
545 {
546 *o_images = images;
547 }
548
549 return e;
550 }
551
552 static void extract_document_free(extract_alloc_t *alloc, document_t *document)
553 {
554 int p;
555
556 if (!document) return;
557
558 for (p=0; p<document->pages_num; ++p)
559 {
560 page_free(alloc, &document->pages[p]);
561 }
562 extract_free(alloc, &document->pages);
563 document->pages = NULL;
564 document->pages_num = 0;
565
566 structure_clear(alloc, document->structure);
567 }
568
569
/* Returns +1, 0 or -1 depending on sign of x (0 for NaN, since both
 * comparisons are then false). */
static int s_sign(double x)
{
    return (x > 0) - (x < 0);
}
578
579 int extract_matrix4_cmp(const matrix4_t *lhs, const matrix4_t *rhs)
580 {
581 int ret;
582
583 ret = s_sign(lhs->a - rhs->a); if (ret) return ret;
584 ret = s_sign(lhs->b - rhs->b); if (ret) return ret;
585 ret = s_sign(lhs->c - rhs->c); if (ret) return ret;
586 ret = s_sign(lhs->d - rhs->d); if (ret) return ret;
587
588 return 0;
589 }
590
591 point_t extract_matrix4_transform_point(matrix4_t m, point_t p)
592 {
593 double x = p.x;
594
595 p.x = m.a * x + m.c * p.y;
596 p.y = m.b * x + m.d * p.y;
597
598 return p;
599 }
600
601 point_t extract_matrix4_transform_xy(matrix4_t m, double x, double y)
602 {
603 point_t p;
604
605 p.x = m.a * x + m.c * y;
606 p.y = m.b * x + m.d * y;
607
608 return p;
609 }
610
611 matrix_t extract_multiply_matrix_matrix(matrix_t m1, matrix_t m2)
612 {
613 matrix_t ret;
614
615 ret.a = m1.a * m2.a + m1.b * m2.c;
616 ret.b = m1.a * m2.b + m1.b * m2.d;
617 ret.c = m1.c * m2.a + m1.d * m2.c;
618 ret.d = m1.c * m2.b + m1.d * m2.d;
619 ret.e = m1.e * m2.a + m1.f * m2.c + m2.e;
620 ret.f = m1.e * m2.b + m1.f * m2.d + m2.f;
621
622 return ret;
623 }
624
625 matrix4_t extract_multiply_matrix4_matrix4(matrix4_t m1, matrix4_t m2)
626 {
627 matrix4_t ret;
628
629 ret.a = m1.a * m2.a + m1.b * m2.c;
630 ret.b = m1.a * m2.b + m1.b * m2.d;
631 ret.c = m1.c * m2.a + m1.d * m2.c;
632 ret.d = m1.c * m2.b + m1.d * m2.d;
633
634 return ret;
635 }
636
637 static int s_matrix_read(const char *text, matrix_t *matrix)
638 {
639 int n;
640
641 if (!text) {
642 outf("text is NULL in s_matrix_read()");
643 errno = EINVAL;
644 return -1;
645 }
646 n = sscanf(text,
647 "%lf %lf %lf %lf %lf %lf",
648 &matrix->a,
649 &matrix->b,
650 &matrix->c,
651 &matrix->d,
652 &matrix->e,
653 &matrix->f);
654 if (n != 6) {
655 errno = EINVAL;
656 return -1;
657 }
658
659 return 0;
660 }
661
662
663 static void document_init(document_t *document)
664 {
665 document->pages = NULL;
666 document->pages_num = 0;
667
668 document->structure = NULL;
669 document->current = NULL;
670 }
671
/* Upper bound on how deeply structure elements may be nested. If we exceed
 * MAX_STRUCT_NEST then this probably indicates that structure nesting is
 * not to be trusted.
 * NOTE(review): the code that enforces this limit is not in the part of the
 * file reviewed here. */
#define MAX_STRUCT_NEST 64
675
/* State of one extraction session, created by extract_begin(). */
struct extract_t
{
    /* Allocator handed to extract_begin(). */
    extract_alloc_t *alloc;
    /* Set by extract_set_layout_analysis(); zero after extract_begin(). */
    int layout_analysis;
    /* Scale factor applied when guessing the width of a space; set to 0.5
     * by extract_begin() and changed by extract_set_space_guess(). */
    double master_space_guess;
    /* Pages and structure tree accumulated so far. */
    document_t document;

    /* Number of extra spans from subpage_span_end_clean(). */
    int num_spans_split;

    /* Number of extra spans from autosplit=1. */
    int num_spans_autosplit;

    /* Only used if autosplit is non-zero. Both are reset to 0 by
     * extract_span_begin(). */
    double span_offset_x;
    double span_offset_y;

    /* Used to generate unique ids for images. Starts at 10 (see
     * extract_begin()). */
    int image_n;

    /* List of strings that are the generated docx content for each page. When
     * zip_* can handle appending of data, we will be able to remove this list. */
    extract_astring_t *contentss;
    int contentss_num;

    images_t images;

    /* Output format chosen in extract_begin(). */
    extract_format_t format;
    extract_odt_styles_t odt_styles;

    /* Copy of the string passed to extract_tables_csv_format(), or NULL. */
    char *tables_csv_format;
    int tables_csv_i;

    /* Selects which member of the 'path' union below is in use
     * (presumably; the code using it is not in the part reviewed here). */
    enum
    {
        path_type_NONE,
        path_type_FILL,
        path_type_STROKE,
    } path_type;

    union
    {
        struct
        {
            matrix_t ctm;
            double color;
            point_t points[4];
            int n;
        } fill;

        struct
        {
            matrix_t ctm;
            double color;
            double width;
            point_t point0;
            int point0_set;
            point_t point;
            int point_set;
        } stroke;
    } path;

    /* Initialised to 1 by extract_begin(). */
    int next_uid;
};
740
741 int extract_begin(extract_alloc_t *alloc,
742 extract_format_t format,
743 extract_t **pextract)
744 {
745 extract_t *extract;
746
747 *pextract = NULL;
748 if (1
749 && format != extract_format_ODT
750 && format != extract_format_DOCX
751 && format != extract_format_HTML
752 && format != extract_format_TEXT
753 && format != extract_format_JSON
754 )
755 {
756 outf0("Invalid format=%i\n", format);
757 errno = EINVAL;
758 return -1;
759 }
760
761 /* Create the extract structure. */
762 if (extract_malloc(alloc, &extract, sizeof(*extract)))
763 return -1;
764
765 extract_bzero(extract, sizeof(*extract));
766 extract->alloc = alloc;
767 extract->master_space_guess = 0.5;
768 document_init(&extract->document);
769
770 /* FIXME: Start at 10 because template document might use some low-numbered IDs.
771 */
772 extract->image_n = 10;
773
774 extract->format = format;
775 extract->tables_csv_format = NULL;
776 extract->tables_csv_i = 0;
777
778 extract->next_uid = 1;
779
780 *pextract = extract;
781
782 return 0;
783 }
784
785 void extract_set_space_guess(extract_t *extract, double space_guess)
786 {
787 extract->master_space_guess = space_guess;
788 }
789
790 int extract_set_layout_analysis(extract_t *extract, int enable)
791 {
792 extract->layout_analysis = enable;
793 return 0;
794 }
795
796 int extract_tables_csv_format(extract_t *extract, const char *path_format)
797 {
798 return extract_strdup(extract->alloc, path_format, &extract->tables_csv_format);
799 }
800
801
/* Image-data release callback passed to extract_add_image() by
 * extract_read_intermediate(): frees image_data with free(); handle is
 * unused. */
static void image_free_fn(void *handle, void *image_data)
{
    free(image_data);
    (void) handle;
}
807
808 int extract_read_intermediate(extract_t *extract, extract_buffer_t *buffer)
809 {
810 int ret = -1;
811 document_t *document = &extract->document;
812 char *image_data = NULL;
813 int num_spans = 0;
814 extract_xml_tag_t tag;
815
816 extract_xml_tag_init(&tag);
817
818 if (extract_xml_pparse_init(extract->alloc, buffer, NULL /*first_line*/)) {
819 outf("Failed to read start of intermediate data: %s", strerror(errno));
820 goto end;
821 }
822 /* Data read from <path> is expected to be XML looking like:
823
824 <page>
825 <span>
826 <char ...>
827 <char ...>
828 ...
829 </span>
830 <span>
831 ...
832 </span>
833 ...
834 </page>
835 <page>
836 ...
837 </page>
838 ...
839
840 We convert this into a list of subpage_t's, each containing a list of
841 span_t's, each containing a list of char_t's.
842
843 While doing this, we do some within-span processing by calling
844 subpage_span_end_clean():
845 Remove spurious spaces.
846 Split spans in two where there seem to be large gaps between glyphs.
847 */
848 for(;;) {
849 extract_page_t *page;
850 subpage_t *subpage;
851 rect_t mediabox = extract_rect_infinite; /* Fake mediabox */
852 int e = extract_xml_pparse_next(buffer, &tag);
853
854 if (e == 1) break; /* EOF. */
855 if (e) goto end;
856 if (!strcmp(tag.name, "?xml")) {
857 /* We simply skip this if we find it. As of 2020-07-31, mutool adds
858 this header to mupdf raw output, but gs txtwrite does not include
859 it. */
860 continue;
861 }
862 if (strcmp(tag.name, "page")) {
863 outf("Expected <page> but tag.name='%s'", tag.name);
864 errno = ESRCH;
865 goto end;
866 }
867 outfx("loading spans for page %i...", document->pages_num);
868 if (extract_page_begin(extract, mediabox.min.x, mediabox.min.y, mediabox.max.x, mediabox.max.y)) goto end;
869 page = extract->document.pages[extract->document.pages_num-1];
870 if (!page) goto end;
871 subpage = page->subpages[page->subpages_num-1];
872 if (!subpage) goto end;
873
874 for(;;) {
875 if (extract_xml_pparse_next(buffer, &tag)) goto end;
876 if (!strcmp(tag.name, "/page")) {
877 num_spans += content_count_spans(&subpage->content);
878 break;
879 }
880 if (!strcmp(tag.name, "image")) {
881 const char* type = extract_xml_tag_attributes_find(&tag, "type");
882 if (!type) {
883 errno = EINVAL;
884 goto end;
885 }
886 outf("image type=%s", type);
887 if (!strcmp(type, "pixmap")) {
888 int w;
889 int h;
890 int y;
891 if (extract_xml_tag_attributes_find_int(&tag, "w", &w)) goto end;
892 if (extract_xml_tag_attributes_find_int(&tag, "h", &h)) goto end;
893 for (y=0; y<h; ++y) {
894 int yy;
895 if (extract_xml_pparse_next(buffer, &tag)) goto end;
896 if (strcmp(tag.name, "line")) {
897 outf("Expected <line> but tag.name='%s'", tag.name);
898 errno = ESRCH;
899 goto end;
900 }
901 if (extract_xml_tag_attributes_find_int(&tag, "y", &yy)) goto end;
902 if (yy != y) {
903 outf("Expected <line y=%i> but found <line y=%i>", y, yy);
904 errno = ESRCH;
905 goto end;
906 }
907 if (extract_xml_pparse_next(buffer, &tag)) goto end;
908 if (strcmp(tag.name, "/line")) {
909 outf("Expected </line> but tag.name='%s'", tag.name);
910 errno = ESRCH;
911 goto end;
912 }
913 }
914 }
915 else {
916 /* Compressed. */
917 size_t image_data_size;
918 const char *c;
919 size_t i;
920 if (extract_xml_tag_attributes_find_size(&tag, "datasize", &image_data_size)) goto end;
921 if (extract_malloc(extract->alloc, &image_data, image_data_size)) goto end;
922 c = tag.text.chars;
923 for(i=0;;) {
924 int byte = 0;
925 int cc;
926 cc = *c;
927 c += 1;
928 if (cc == ' ' || cc == '\n') continue;
929 if (cc >= '0' && cc <= '9') byte += cc-'0';
930 else if (cc >= 'a' && cc <= 'f') byte += 10 + cc - 'a';
931 else goto compressed_error;
932 byte *= 16;
933
934 cc = *c;
935 c += 1;
936 if (cc >= '0' && cc <= '9') byte += cc-'0';
937 else if (cc >= 'a' && cc <= 'f') byte += 10 + cc - 'a';
938 else goto compressed_error;
939
940 image_data[i] = (char) byte;
941 i += 1;
942 if (i == image_data_size) {
943 break;
944 }
945 continue;
946
947 compressed_error:
948 outf("Unrecognised hex character '%x' at offset %lli in image data", cc, (long long) (c-tag.text.chars));
949 errno = EINVAL;
950 goto end;
951 }
952 if (extract_add_image(
953 extract,
954 type,
955 0 /*x*/,
956 0 /*y*/,
957 0 /*w*/,
958 0 /*h*/,
959 image_data,
960 image_data_size,
961 image_free_fn,
962 NULL
963 ))
964 {
965 goto end;
966 }
967 image_data = NULL;
968 }
969 if (extract_xml_pparse_next(buffer, &tag)) goto end;
970 if (strcmp(tag.name, "/image")) {
971 outf("Expected </image> but tag.name='%s'", tag.name);
972 errno = ESRCH;
973 goto end;
974 }
975 continue;
976 }
977 if (strcmp(tag.name, "span")) {
978 outf("Expected <span> but tag.name='%s'", tag.name);
979 errno = ESRCH;
980 goto end;
981 }
982
983 {
984 matrix_t ctm;
985 matrix_t trm;
986 char *font_name;
987 char *font_name2;
988 int font_bold;
989 int font_italic;
990 int wmode;
991 if (s_matrix_read(extract_xml_tag_attributes_find(&tag, "ctm"), &ctm)) goto end;
992 if (s_matrix_read(extract_xml_tag_attributes_find(&tag, "trm"), &trm)) goto end;
993 font_name = extract_xml_tag_attributes_find(&tag, "font_name");
994 if (!font_name) {
995 outf("Failed to find attribute 'font_name'");
996 goto end;
997 }
998 font_name2 = strchr(font_name, '+');
999 if (font_name2) font_name = font_name2 + 1;
1000 font_bold = strstr(font_name, "-Bold") ? 1 : 0;
1001 font_italic = strstr(font_name, "-Oblique") ? 1 : 0;
1002 if (extract_xml_tag_attributes_find_int(&tag, "wmode", &wmode)) goto end;
1003 if (extract_span_begin(extract,
1004 font_name,
1005 font_bold,
1006 font_italic,
1007 wmode,
1008 ctm.a,
1009 ctm.b,
1010 ctm.c,
1011 ctm.d,
1012 0,0,0,0)) goto end;
1013
1014 for(;;) {
1015 double x;
1016 double y;
1017 double adv;
1018 unsigned int ucs;
1019
1020 if (extract_xml_pparse_next(buffer, &tag)) {
1021 outf("Failed to find <char or </span");
1022 goto end;
1023 }
1024 if (!strcmp(tag.name, "/span")) {
1025 break;
1026 }
1027 if (strcmp(tag.name, "char")) {
1028 errno = ESRCH;
1029 outf("Expected <char> but tag.name='%s'", tag.name);
1030 goto end;
1031 }
1032
1033 if (extract_xml_tag_attributes_find_double(&tag, "x", &x)) goto end;
1034 if (extract_xml_tag_attributes_find_double(&tag, "y", &y)) goto end;
1035 if (extract_xml_tag_attributes_find_double(&tag, "adv", &adv)) goto end;
1036 if (extract_xml_tag_attributes_find_uint(&tag, "ucs", &ucs)) goto end;
1037
1038 /* BBox is bogus here. Analysis will fail. */
1039 if (extract_add_char(extract, x, y, ucs, adv, x, y, x + adv, y + adv)) goto end;
1040 }
1041
1042 extract_xml_tag_free(extract->alloc, &tag);
1043 }
1044 }
1045 if (extract_page_end(extract)) goto end;
1046 outf("page=%i subpage->num_spans=%i",
1047 document->pages_num, content_count_spans(&subpage->content));
1048 }
1049
1050 outf("num_spans=%i num_spans_split=%i num_spans_autosplit=%i",
1051 num_spans,
1052 extract->num_spans_split,
1053 extract->num_spans_autosplit
1054 );
1055
1056 ret = 0;
1057 end:
1058
1059 extract_xml_tag_free(extract->alloc, &tag);
1060 extract_free(extract->alloc, &image_data);
1061
1062 return ret;
1063 }
1064
1065 int
1066 extract_span_begin(
1067 extract_t *extract,
1068 const char *font_name,
1069 int font_bold,
1070 int font_italic,
1071 int wmode,
1072 double ctm_a,
1073 double ctm_b,
1074 double ctm_c,
1075 double ctm_d,
1076 double bbox_x0,
1077 double bbox_y0,
1078 double bbox_x1,
1079 double bbox_y1)
1080 {
1081 int e = -1;
1082 extract_page_t *page;
1083 subpage_t *subpage;
1084 span_t *span;
1085 document_t *document = &extract->document;
1086
1087 /* FIXME: RJW: Should continue the last span if everything is the same. */
1088
1089 assert(document->pages_num > 0);
1090 page = document->pages[document->pages_num-1];
1091 subpage = page->subpages[page->subpages_num-1];
1092 outf("extract_span_begin(): ctm=(%f %f %f %f) font_name=%s, wmode=%i",
1093 ctm_a,
1094 ctm_b,
1095 ctm_c,
1096 ctm_d,
1097 font_name,
1098 wmode);
1099 if (content_append_new_span(extract->alloc, &subpage->content, &span, document->current)) goto end;
1100 span->ctm.a = ctm_a;
1101 span->ctm.b = ctm_b;
1102 span->ctm.c = ctm_c;
1103 span->ctm.d = ctm_d;
1104 span->font_bbox.min.x = bbox_x0;
1105 span->font_bbox.min.y = bbox_y0;
1106 span->font_bbox.max.x = bbox_x1;
1107 span->font_bbox.max.y = bbox_y1;
1108
1109 {
1110 const char *ff = strchr(font_name, '+');
1111 const char *f = (ff) ? ff+1 : font_name;
1112 if (extract_strdup(extract->alloc, f, &span->font_name)) goto end;
1113 span->flags.font_bold = font_bold ? 1 : 0;
1114 span->flags.font_italic = font_italic ? 1 : 0;
1115 span->flags.wmode = wmode ? 1 : 0;
1116 extract->span_offset_x = 0;
1117 extract->span_offset_y = 0;
1118 }
1119
1120 e = 0;
1121 end:
1122
1123 return e;
1124 }
1125
1126 /* Create a new empty span, based on the current one. */
1127 static span_t *
1128 split_to_new_span(extract_alloc_t *alloc, content_root_t *content, span_t *span0)
1129 {
1130 content_t save;
1131 span_t *span;
1132 char *name;
1133
1134 if (extract_strdup(alloc, span0->font_name, &name))
1135 return NULL;
1136
1137 if (content_append_new_span(alloc, content, &span, span0->structure))
1138 {
1139 extract_free(alloc, &name);
1140 return NULL;
1141 }
1142
1143 save = span->base; /* Avoid overwriting linked list. */
1144 *span = *span0;
1145 span->base = save;
1146 span->font_name = name;
1147 span->chars = NULL;
1148 span->chars_num = 0;
1149
1150 return span;
1151 }
1152
1153 /*
1154 This routine returns the previous non-space-char, UNLESS the span
1155 starts with a space, in which case we accept that one.
1156 */
1157 static span_t *
1158 find_previous_non_space_char_ish(content_root_t *content, int *char_num, int *intervening_space)
1159 {
1160 content_t *s;
1161 int i;
1162
1163 *intervening_space = 0;
1164 for (s = content->base.prev; s != &content->base; s = s->prev)
1165 {
1166 span_t *span = (span_t *)s;
1167
1168 if (s->type != content_span)
1169 continue;
1170
1171 for (i = span->chars_num-1; i >= 0; i--)
1172 {
1173 if (span->chars[i].ucs != 32 || i == 0)
1174 {
1175 *char_num = i;
1176 return span;
1177 }
1178 *intervening_space = 1;
1179 }
1180 }
1181
1182 return NULL;
1183 }
1184
1185 point_t
1186 extract_predicted_end_of_char(char_t *char_, const span_t *span)
1187 {
1188 double adv = char_->adv;
1189 point_t dir = { adv * (1 - span->flags.wmode), adv * span->flags.wmode };
1190
1191 dir = extract_matrix4_transform_point(span->ctm, dir);
1192 dir.x += char_->x;
1193 dir.y += char_->y;
1194
1195 return dir;
1196 }
1197
1198 point_t
1199 extract_end_of_span(const span_t *span)
1200 {
1201 assert(span && span->chars_num > 0);
1202 return extract_predicted_end_of_char(&span->chars[span->chars_num-1], span);
1203 }
1204
/*
Appends one character to the most recent span of the last subpage of the
last page in extract->document.

(x, y) is stored as the character position, <ucs> as its code point, <adv>
as its advance and (x0, y0)-(x1, y1) as its bounding box.

Before appending, the character is compared against the previous
non-space-ish character (see find_previous_non_space_char_ish()) provided
that character's span belongs to the current structure element. Depending
on how far the new character lies from where the previous one was
predicted to end, this function may:
  - start a new span (large gap along or perpendicular to the writing
    direction),
  - remove a previously stored space (the gap is too small for one), or
  - insert an extra space (the gap is larger than expected and no space
    was stored).

Returns 0 on success, -1 if creating a span or appending a character
fails. On exit, if the span being worked on is non-NULL and holds no
characters, it is freed.
*/
int extract_add_char(
        extract_t *extract,
        double x,
        double y,
        unsigned int ucs,
        double adv,
        double x0,
        double y0,
        double x1,
        double y1)
{
    int e = -1;
    char_t *char_;
    extract_page_t *page = extract->document.pages[extract->document.pages_num-1];
    subpage_t *subpage = page->subpages[page->subpages_num-1];
    span_t *span = content_last_span(&subpage->content);
    span_t *span0;
    int char_num0;
    double dist, perp, scale_squared;
    point_t dir;
    int intervening_space;

    /* Build the writing direction: a unit vector along y when wmode is set,
     * along x otherwise, pushed through the span's ctm below. scale_squared
     * is the squared length used later to normalise dist and perp. */
    if (span->flags.wmode)
    {
        dir.x = 0;
        dir.y = 1;
        scale_squared = span->ctm.c * span->ctm.c + span->ctm.d * span->ctm.d;
    }
    else
    {
        dir.x = 1;
        dir.y = 0;
        scale_squared = span->ctm.a * span->ctm.a + span->ctm.b * span->ctm.b;
    }
    dir = extract_matrix4_transform_point(span->ctm, dir);

    outf("(%f %f) ucs=% 5i=%c adv=%f", x, y, ucs, (ucs >=32 && ucs< 127) ? ucs : ' ', adv);

    /* Is there a previous span to which we should consider attaching this char. */
    span0 = find_previous_non_space_char_ish(&subpage->content, &char_num0, &intervening_space);

    /* Spans can't continue over different structure elements. */
    if (span0 && span0->structure != extract->document.current)
        span0 = NULL;

    if (span0 == NULL)
    {
        /* No previous continuable span. */
        outf("%c x=%g y=%g adv=%g\n", ucs, x, y, adv);
    }
    else
    {
        /* We have a span. Check whether we need to break to a new line, or add (or subtract) a space. */
        char_t *char_prev = &span0->chars[char_num0];
        double adv0 = char_prev->adv;
        point_t predicted_end_of_char0 = extract_predicted_end_of_char(char_prev, span0);
        /* We don't currently have access to the size of the advance for a space.
         * Typically it's around 1 to 1/2 that of a real char. So guess at that
         * using the 2 advances we have available to us. */
        double space_guess = (adv0 + adv)/2 * extract->master_space_guess;

        /* Use dot product to calculate the distance that we have moved along the direction vector. */
        dist = (x - predicted_end_of_char0.x) * dir.x + (y - predicted_end_of_char0.y) * dir.y;
        /* Use dot product to calculate the distance that we have moved perpendicular to the direction vector. */
        perp = (x - predicted_end_of_char0.x) * dir.y - (y - predicted_end_of_char0.y) * dir.x;
        /* Both dist and perp are multiplied by scale_squared. */
        dist /= scale_squared;
        perp /= scale_squared;
        /* So now, dist, perp, adv, adv0 and space_guess are all in pre-transform space. */

        /* So fabs(dist) is expected to be 0, and perp is expected to be 0 for characters
         * "naturally placed" on a line. */
        outf("%c x=%g y=%g adv=%g dist=%g perp=%g\n", ucs, x, y, adv, dist, perp);

        /* Arbitrary fractions here; ideally we should consult the font bbox, but we don't currently
         * have that. */
        if (fabs(perp) > 3*space_guess/2 || fabs(dist) > space_guess * 8)
        {
            /* Create new span. Only needed if the current span already has
             * content; an empty current span is simply reused. */
            if (span->chars_num > 0)
            {
                extract->num_spans_autosplit += 1;
                /* From here on <span> refers to the new span (or NULL on failure). */
                span = split_to_new_span(extract->alloc, &subpage->content, span);
                if (span == NULL) goto end;
            }
        }
        else if (intervening_space)
        {
            /* Some files, notably zlib.3.pdf appear to contain stray extra spaces within the PDF
             * content themselves. e.g. "suppor ts". We therefore spot when the
             * space allocated for a space isn't used, and remove the space. */
            /* MAGIC NUMBER WARNING. zlib.pdf says that /4 is not sensitive enough. /3 is OK. */
            if (dist < space_guess/3)
            {
                if (span->chars_num > 0)
                {
                    /* Drop the last char of the current span. */
                    span->chars_num--;
                    /* Don't need to worry about it being empty, as we're about to add another char! */
                }
                else
                {
                    /* The current span is empty, so drop the last char of the
                     * previous span instead, freeing that span if it becomes empty. */
                    span_t *space_span = content_prev_span(&span->base);
                    assert(space_span->chars_num > 0);
                    space_span->chars_num--;
                    if (space_span->chars_num == 0)
                        extract_span_free(extract->alloc, &space_span);
                }
            }
        }
        /* MAGIC NUMBER WARNING: We expect the space char to be about 1/2 as wide of a standard char.
         * zlib3.pdf shows that sometimes we need to insert a space when it's *just* smaller than
         * this. (e.g. 'eveninthe'). */
        else if (!intervening_space && dist > 2*space_guess/3)
        {
            /* Larger gap than expected. Add an extra space. */
            /* Where should the space go? At the predicted position where the previous char
             * ended. */
            char_ = extract_span_append_c(extract->alloc, span, ' ');
            if (char_ == NULL) goto end;

            char_->x = predicted_end_of_char0.x;
            char_->y = predicted_end_of_char0.y;
        }
    }

    /* Finally append the character itself and record its geometry. */
    char_ = extract_span_append_c(extract->alloc, span, ucs);
    if (char_ == NULL) goto end;

    char_->x = x;
    char_->y = y;

    char_->adv = adv;
    char_->bbox.min.x = x0;
    char_->bbox.min.y = y0;
    char_->bbox.max.x = x1;
    char_->bbox.max.y = y1;

    e = 0;
end:

    /* Never leave an empty span behind (span may be NULL if the split failed). */
    if (span && span->chars_num == 0)
    {
        extract_span_free(extract->alloc, &span);
    }

    return e;
}
1352
1353
1354 int extract_span_end(extract_t *extract)
1355 {
1356 extract_page_t *page = extract->document.pages[extract->document.pages_num-1];
1357 subpage_t *subpage = page->subpages[page->subpages_num-1];
1358 span_t *span = content_last_span(&subpage->content);
1359
1360 if (span->chars_num == 0) {
1361 /* Calling code called extract_span_begin() then extract_span_end()
1362 without any call to extract_add_char(). Our joining code assumes that
1363 all spans are non-empty, so we need to delete this span. */
1364 extract_span_free(extract->alloc, &span);
1365 }
1366
1367 return 0;
1368 }
1369
1370
1371 int extract_add_image(
1372 extract_t *extract,
1373 const char *type,
1374 double x,
1375 double y,
1376 double w,
1377 double h,
1378 void *data,
1379 size_t data_size,
1380 extract_image_data_free data_free,
1381 void *data_free_handle)
1382 {
1383 int e = -1;
1384 extract_page_t *page = extract->document.pages[extract->document.pages_num-1];
1385 subpage_t *subpage = page->subpages[page->subpages_num-1];
1386 image_t *image;
1387
1388 extract->image_n += 1;
1389 if (content_append_new_image(extract->alloc, &subpage->content, &image)) goto end;
1390 image->x = x;
1391 image->y = y;
1392 image->w = w;
1393 image->h = h;
1394 image->data = data;
1395 image->data_size = data_size;
1396 image->data_free = data_free;
1397 image->data_free_handle = data_free_handle;
1398 if (extract_strdup(extract->alloc, type, &image->type)) goto end;
1399 if (extract_asprintf(extract->alloc, &image->id, "rId%i", extract->image_n) < 0) goto end;
1400 if (extract_asprintf(extract->alloc, &image->name, "image%i.%s", extract->image_n, image->type) < 0) goto end;
1401
1402 subpage->images_num += 1;
1403 outf("subpage->images_num=%i", subpage->images_num);
1404
1405 e = 0;
1406 end:
1407
1408 if (e) {
1409 extract_image_free(extract->alloc, &image);
1410 }
1411
1412 return e;
1413 }
1414
1415
1416 static int tablelines_append(extract_alloc_t *alloc, tablelines_t *tablelines, rect_t *rect, double color)
1417 {
1418 if (extract_realloc(
1419 alloc,
1420 &tablelines->tablelines,
1421 sizeof(*tablelines->tablelines) * (tablelines->tablelines_num + 1)
1422 )) return -1;
1423 tablelines->tablelines[ tablelines->tablelines_num].rect = *rect;
1424 tablelines->tablelines[ tablelines->tablelines_num].color = (float) color;
1425 tablelines->tablelines_num += 1;
1426
1427 return 0;
1428 }
1429
1430 static point_t transform(
1431 double x,
1432 double y,
1433 double ctm_a,
1434 double ctm_b,
1435 double ctm_c,
1436 double ctm_d,
1437 double ctm_e,
1438 double ctm_f)
1439 {
1440 point_t ret;
1441
1442 ret.x = ctm_a * x + ctm_b * y + ctm_e;
1443 ret.y = ctm_c * x + ctm_d * y + ctm_f;
1444
1445 return ret;
1446 }
1447
/* Returns <a> if it compares less than <b>, otherwise <b>. */
static double s_min(double a, double b)
{
    if (a < b)
        return a;
    return b;
}
1452
/* Returns <a> if it compares greater than <b>, otherwise <b>. */
static double s_max(double a, double b)
{
    if (a > b)
        return a;
    return b;
}
1457
1458 int extract_add_path4(
1459 extract_t *extract,
1460 double ctm_a,
1461 double ctm_b,
1462 double ctm_c,
1463 double ctm_d,
1464 double ctm_e,
1465 double ctm_f,
1466 double x0,
1467 double y0,
1468 double x1,
1469 double y1,
1470 double x2,
1471 double y2,
1472 double x3,
1473 double y3,
1474 double color)
1475 {
1476 extract_page_t *page = extract->document.pages[extract->document.pages_num-1];
1477 subpage_t *subpage = page->subpages[page->subpages_num-1];
1478 point_t points[4] = {
1479 transform(x0, y0, ctm_a, ctm_b, ctm_c, ctm_d, ctm_e, ctm_f),
1480 transform(x1, y1, ctm_a, ctm_b, ctm_c, ctm_d, ctm_e, ctm_f),
1481 transform(x2, y2, ctm_a, ctm_b, ctm_c, ctm_d, ctm_e, ctm_f),
1482 transform(x3, y3, ctm_a, ctm_b, ctm_c, ctm_d, ctm_e, ctm_f)
1483 };
1484 rect_t rect;
1485 int i;
1486 double dx, dy;
1487
1488 outf("cmt=(%f %f %f %f %f %f) points=[(%f %f) (%f %f) (%f %f) (%f %f)]",
1489 ctm_a, ctm_b, ctm_c, ctm_d, ctm_e, ctm_f,
1490 x0, y0, x1, y1, x2, y2, x3, y3
1491 );
1492 outf("extract_add_path4(): [(%f %f) (%f %f) (%f %f) (%f %f)]",
1493 x0, y0, x1, y1, x2, y2, x3, y3);
1494 /* Find first step with dx > 0. */
1495 for (i=0; i<4; ++i)
1496 {
1497 if (points[(i+1) % 4].x > points[(i+0) % 4].x) break;
1498 }
1499 outf("i=%i", i);
1500 if (i == 4) return 0;
1501 rect.min.x = points[(i+0) % 4].x;
1502 rect.max.x = points[(i+1) % 4].x;
1503 if (points[(i+2) % 4].x != rect.max.x) return 0;
1504 if (points[(i+3) % 4].x != rect.min.x) return 0;
1505 y0 = points[(i+1) % 4].y;
1506 y1 = points[(i+2) % 4].y;
1507 if (y0 == y1) return 0;
1508 if (points[(i+3) % 4].y != y1) return 0;
1509 if (points[(i+4) % 4].y != y0) return 0;
1510 rect.min.y = (y1 > y0) ? y0 : y1;
1511 rect.max.y = (y1 > y0) ? y1 : y0;
1512
1513 dx = rect.max.x - rect.min.x;
1514 dy = rect.max.y - rect.min.y;
1515 if (dx / dy > 5)
1516 {
1517 /* Horizontal line. */
1518 outf("have found horizontal line: %s", extract_rect_string(&rect));
1519 if (tablelines_append(extract->alloc, &subpage->tablelines_horizontal, &rect, color)) return -1;
1520 }
1521 else if (dy / dx > 5)
1522 {
1523 /* Vertical line. */
1524 outf("have found vertical line: %s", extract_rect_string(&rect));
1525 if (tablelines_append(extract->alloc, &subpage->tablelines_vertical, &rect, color)) return -1;
1526 }
1527
1528 return 0;
1529 }
1530
1531
1532 int extract_add_line(
1533 extract_t *extract,
1534 double ctm_a,
1535 double ctm_b,
1536 double ctm_c,
1537 double ctm_d,
1538 double ctm_e,
1539 double ctm_f,
1540 double width,
1541 double x0,
1542 double y0,
1543 double x1,
1544 double y1,
1545 double color)
1546 {
1547 extract_page_t *page = extract->document.pages[extract->document.pages_num-1];
1548 subpage_t *subpage = page->subpages[page->subpages_num-1];
1549 point_t p0 = transform(x0, y0, ctm_a, ctm_b, ctm_c, ctm_d, ctm_e, ctm_f);
1550 point_t p1 = transform(x1, y1, ctm_a, ctm_b, ctm_c, ctm_d, ctm_e, ctm_f);
1551 double width2 = width * sqrt( fabs( ctm_a * ctm_d - ctm_b * ctm_c));
1552 rect_t rect;
1553
1554 (void)color;
1555 rect.min.x = s_min(p0.x, p1.x);
1556 rect.min.y = s_min(p0.y, p1.y);
1557 rect.max.x = s_max(p0.x, p1.x);
1558 rect.max.y = s_max(p0.y, p1.y);
1559
1560 outf("%s: width=%f ((%f %f)(%f %f)) rect=%s",
1561 extract_FUNCTION,
1562 width,
1563 x0, y0, x1, y1,
1564 extract_rect_string(&rect)
1565 );
1566 if (rect.min.x == rect.max.x && rect.min.y == rect.max.y)
1567 {
1568 }
1569 else if (rect.min.x == rect.max.x)
1570 {
1571 rect.min.x -= width2 / 2;
1572 rect.max.x += width2 / 2;
1573 return tablelines_append(extract->alloc, &subpage->tablelines_vertical, &rect, color);
1574 }
1575 else if (rect.min.y == rect.max.y)
1576 {
1577 rect.min.y -= width2 / 2;
1578 rect.max.y += width2 / 2;
1579 return tablelines_append(extract->alloc, &subpage->tablelines_horizontal, &rect, color);
1580 }
1581
1582 return 0;
1583 }
1584
1585 int extract_subpage_alloc(extract_alloc_t *alloc, rect_t mediabox, extract_page_t *page, subpage_t **psubpage)
1586 {
1587 subpage_t *subpage;
1588
1589 if (extract_malloc(alloc, psubpage, sizeof(subpage_t)))
1590 {
1591 return -1;
1592 }
1593 subpage = *psubpage;
1594 subpage->mediabox = mediabox;
1595 content_init_root(&subpage->content, NULL);
1596 subpage->images_num = 0;
1597 subpage->tablelines_horizontal.tablelines = NULL;
1598 subpage->tablelines_horizontal.tablelines_num = 0;
1599 subpage->tablelines_vertical.tablelines = NULL;
1600 subpage->tablelines_vertical.tablelines_num = 0;
1601 content_init_root(&subpage->tables, NULL);
1602
1603 if (extract_realloc2(alloc,
1604 &page->subpages,
1605 sizeof(subpage_t*) * page->subpages_num,
1606 sizeof(subpage_t*) * (page->subpages_num + 1)))
1607 {
1608 extract_free(alloc, psubpage);
1609 return -1;
1610 }
1611 page->subpages[page->subpages_num] = subpage;
1612 page->subpages_num += 1;
1613
1614 return 0;
1615 }
1616
1617 /* Appends new empty subpage_t to the last page of an extract->document. */
1618 static int extract_subpage_begin(extract_t *extract, double x0, double y0, double x1, double y1)
1619 {
1620 extract_page_t *page = extract->document.pages[extract->document.pages_num - 1];
1621 subpage_t *subpage;
1622 rect_t mediabox = { { x0, y0 }, { x1, y1 } };
1623 int e;
1624
1625 e = extract_subpage_alloc(extract->alloc, mediabox, page, &subpage);
1626
1627 if (e == 0)
1628 {
1629 }
1630
1631 return e;
1632 }
1633
1634 /* Appends new empty page_t to an extract->document. */
1635 int extract_page_begin(extract_t *extract, double x0, double y0, double x1, double y1)
1636 {
1637 extract_page_t *page;
1638
1639 if (extract_malloc(extract->alloc, &page, sizeof(*page))) return -1;
1640 page->mediabox.min.x = x0;
1641 page->mediabox.min.y = y0;
1642 page->mediabox.max.x = x1;
1643 page->mediabox.max.y = y1;
1644 page->subpages = NULL;
1645 page->subpages_num = 0;
1646 page->split = NULL;
1647
1648 if (extract_realloc2(
1649 extract->alloc,
1650 &extract->document.pages,
1651 sizeof(subpage_t*) * extract->document.pages_num,
1652 sizeof(subpage_t*) * (extract->document.pages_num + 1)
1653 )) {
1654 extract_free(extract->alloc, &page);
1655 return -1;
1656 }
1657
1658 extract->document.pages[extract->document.pages_num] = page;
1659 extract->document.pages_num += 1;
1660
1661 if (extract_subpage_begin(extract, x0, y0, x1, y1)) {
1662 extract->document.pages_num--;
1663 page_free(extract->alloc, &extract->document.pages[extract->document.pages_num]);
1664 return -1;
1665 }
1666
1667 return 0;
1668 }
1669
1670 int extract_fill_begin(
1671 extract_t *extract,
1672 double ctm_a,
1673 double ctm_b,
1674 double ctm_c,
1675 double ctm_d,
1676 double ctm_e,
1677 double ctm_f,
1678 double color)
1679 {
1680 assert(extract->path_type == path_type_NONE);
1681
1682 extract->path_type = path_type_FILL;
1683 extract->path.fill.color = color;
1684 extract->path.fill.n = 0;
1685 extract->path.fill.ctm.a = ctm_a;
1686 extract->path.fill.ctm.b = ctm_b;
1687 extract->path.fill.ctm.c = ctm_c;
1688 extract->path.fill.ctm.d = ctm_d;
1689 extract->path.fill.ctm.e = ctm_e;
1690 extract->path.fill.ctm.f = ctm_f;
1691
1692 return 0;
1693 }
1694
1695 int extract_stroke_begin(
1696 extract_t *extract,
1697 double ctm_a,
1698 double ctm_b,
1699 double ctm_c,
1700 double ctm_d,
1701 double ctm_e,
1702 double ctm_f,
1703 double line_width,
1704 double color)
1705 {
1706 assert(extract->path_type == path_type_NONE);
1707
1708 extract->path_type = path_type_STROKE;
1709 extract->path.stroke.ctm.a = ctm_a;
1710 extract->path.stroke.ctm.b = ctm_b;
1711 extract->path.stroke.ctm.c = ctm_c;
1712 extract->path.stroke.ctm.d = ctm_d;
1713 extract->path.stroke.ctm.e = ctm_e;
1714 extract->path.stroke.ctm.f = ctm_f;
1715 extract->path.stroke.width = line_width;
1716 extract->path.stroke.color = color;
1717 extract->path.stroke.point0_set = 0;
1718 extract->path.stroke.point_set = 0;
1719
1720 return 0;
1721 }
1722
1723 int extract_moveto(extract_t *extract, double x, double y)
1724 {
1725 if (extract->path_type == path_type_FILL)
1726 {
1727 if (extract->path.fill.n == -1) return 0;
1728 if (extract->path.fill.n != 0)
1729 {
1730 outf0("returning error. extract->path.fill.n=%i", extract->path.fill.n);
1731 extract->path.fill.n = -1;
1732 return 0;
1733 }
1734 extract->path.fill.points[extract->path.fill.n].x = x;
1735 extract->path.fill.points[extract->path.fill.n].y = y;
1736 extract->path.fill.n += 1;
1737 return 0;
1738 }
1739 else if (extract->path_type == path_type_STROKE)
1740 {
1741 extract->path.stroke.point.x = x;
1742 extract->path.stroke.point.y = y;
1743 extract->path.stroke.point_set = 1;
1744 if (!extract->path.stroke.point0_set)
1745 {
1746 extract->path.stroke.point0 = extract->path.stroke.point;
1747 extract->path.stroke.point0_set = 1;
1748 }
1749 return 0;
1750 }
1751 else
1752 {
1753 assert(0);
1754 return -1;
1755 }
1756 }
1757
1758 int extract_lineto(extract_t *extract, double x, double y)
1759 {
1760 if (extract->path_type == path_type_FILL)
1761 {
1762 if (extract->path.fill.n == -1) return 0;
1763 if (extract->path.fill.n == 0 || extract->path.fill.n >= 4)
1764 {
1765 outf0("returning error. extract->path.fill.n=%i", extract->path.fill.n);
1766 extract->path.fill.n = -1;
1767 return 0;
1768 }
1769 extract->path.fill.points[extract->path.fill.n].x = x;
1770 extract->path.fill.points[extract->path.fill.n].y = y;
1771 extract->path.fill.n += 1;
1772 return 0;
1773 }
1774 else if (extract->path_type == path_type_STROKE)
1775 {
1776 if (extract->path.stroke.point_set)
1777 {
1778 if (extract_add_line(
1779 extract,
1780 extract->path.stroke.ctm.a,
1781 extract->path.stroke.ctm.b,
1782 extract->path.stroke.ctm.c,
1783 extract->path.stroke.ctm.d,
1784 extract->path.stroke.ctm.e,
1785 extract->path.stroke.ctm.f,
1786 extract->path.stroke.width,
1787 extract->path.stroke.point.x,
1788 extract->path.stroke.point.y,
1789 x,
1790 y,
1791 extract->path.stroke.color))
1792 {
1793 return -1;
1794 }
1795 }
1796 extract->path.stroke.point.x = x;
1797 extract->path.stroke.point.y = y;
1798 extract->path.stroke.point_set = 1;
1799 if (!extract->path.stroke.point0_set)
1800 {
1801 extract->path.stroke.point0 = extract->path.stroke.point;
1802 extract->path.stroke.point0_set = 1;
1803 }
1804 return 0;
1805 }
1806 else
1807 {
1808 assert(0);
1809 return -1;
1810 }
1811 }
1812
1813 int extract_closepath(extract_t *extract)
1814 {
1815 if (extract->path_type == path_type_FILL)
1816 {
1817 if (extract->path.fill.n == 4)
1818 {
1819 /* We are closing a four-element path, so this could be a thin
1820 rectangle that defines a line in a table. */
1821 int e;
1822 e = extract_add_path4(
1823 extract,
1824 extract->path.fill.ctm.a,
1825 extract->path.fill.ctm.b,
1826 extract->path.fill.ctm.c,
1827 extract->path.fill.ctm.d,
1828 extract->path.fill.ctm.e,
1829 extract->path.fill.ctm.f,
1830 extract->path.fill.points[0].x,
1831 extract->path.fill.points[0].y,
1832 extract->path.fill.points[1].x,
1833 extract->path.fill.points[1].y,
1834 extract->path.fill.points[2].x,
1835 extract->path.fill.points[2].y,
1836 extract->path.fill.points[3].x,
1837 extract->path.fill.points[3].y,
1838 extract->path.fill.color);
1839 if (e) return e;
1840 }
1841 extract->path.fill.n = 0;
1842 return 0;
1843 }
1844 else if (extract->path_type == path_type_STROKE)
1845 {
1846 if (extract->path.stroke.point0_set && extract->path.stroke.point_set)
1847 {
1848 if (extract_add_line(
1849 extract,
1850 extract->path.stroke.ctm.a,
1851 extract->path.stroke.ctm.b,
1852 extract->path.stroke.ctm.c,
1853 extract->path.stroke.ctm.d,
1854 extract->path.stroke.ctm.e,
1855 extract->path.stroke.ctm.f,
1856 extract->path.stroke.width,
1857 extract->path.stroke.point.x,
1858 extract->path.stroke.point.y,
1859 extract->path.stroke.point0.x,
1860 extract->path.stroke.point0.y,
1861 extract->path.stroke.color))
1862 {
1863 return -1;
1864 }
1865 return 0;
1866 }
1867 extract->path.stroke.point = extract->path.stroke.point0;
1868 return 0;
1869 }
1870 else
1871 {
1872 assert(0);
1873 return -1;
1874 }
1875 }
1876
1877
1878 int extract_fill_end(extract_t *extract)
1879 {
1880 assert(extract->path_type == path_type_FILL);
1881 extract->path_type = path_type_NONE;
1882
1883 return 0;
1884 }
1885
1886
1887 int extract_stroke_end(extract_t *extract)
1888 {
1889 assert(extract->path_type == path_type_STROKE);
1890 extract->path_type = path_type_NONE;
1891
1892 return 0;
1893 }
1894
1895
1896
1897 static int extract_subpage_end(extract_t *extract)
1898 {
1899 (void) extract;
1900 return 0;
1901 }
1902
1903
1904 int extract_page_end(extract_t *extract)
1905 {
1906 if (extract_subpage_end(extract))
1907 return -1;
1908
1909 return 0;
1910 }
1911
1912 int extract_begin_struct(extract_t *extract, extract_struct_t type, int uid, int score)
1913 {
1914 document_t *document = &extract->document;
1915 structure_t *structure;
1916
1917 if (extract_malloc(extract->alloc, &structure, sizeof(*structure)))
1918 return -1;
1919
1920 structure->parent = document->current;
1921 structure->sibling_next = NULL;
1922 structure->sibling_prev = NULL;
1923 structure->kids_first = NULL;
1924 structure->kids_tail = &structure->kids_first;
1925 structure->type = type;
1926 structure->score = score;
1927 structure->uid = uid;
1928
1929 if (document->current == NULL)
1930 {
1931 /* New topmost entry. */
1932 document->current = structure;
1933 document->structure = structure;
1934 }
1935 else
1936 {
1937 /* Add a child */
1938 *document->current->kids_tail = structure;
1939 document->current->kids_tail = &structure->sibling_next;
1940 document->current = structure;
1941 }
1942
1943 return 0;
1944 }
1945
1946 int extract_end_struct(extract_t *extract)
1947 {
1948 document_t *document = &extract->document;
1949
1950 assert(document->current != NULL);
1951
1952 document->current = document->current->parent;
1953
1954 return 0;
1955 }
1956
1957 const char *extract_struct_string(extract_struct_t type)
1958 {
1959 switch (type)
1960 {
1961 default:
1962 return "UNKNOWN";
1963 case extract_struct_INVALID:
1964 return "INVALID";
1965 case extract_struct_UNDEFINED:
1966 return "UNDEFINED";
1967 case extract_struct_DOCUMENT:
1968 return "DOCUMENT";
1969 case extract_struct_PART:
1970 return "PART";
1971 case extract_struct_ART:
1972 return "ART";
1973 case extract_struct_SECT:
1974 return "SECT";
1975 case extract_struct_DIV:
1976 return "DIV";
1977 case extract_struct_BLOCKQUOTE:
1978 return "BLOCKQUOTE";
1979 case extract_struct_CAPTION:
1980 return "CAPTION";
1981 case extract_struct_TOC:
1982 return "TOC";
1983 case extract_struct_TOCI:
1984 return "TOCI";
1985 case extract_struct_INDEX:
1986 return "INDEX";
1987 case extract_struct_NONSTRUCT:
1988 return "NONSTRUCT";
1989 case extract_struct_PRIVATE:
1990 return "PRIVATE";
1991 case extract_struct_DOCUMENTFRAGMENT:
1992 return "DOCUMENTFRAGMENT";
1993 case extract_struct_ASIDE:
1994 return "ASIDE";
1995 case extract_struct_TITLE:
1996 return "TITLE";
1997 case extract_struct_FENOTE:
1998 return "FENOTE";
1999 case extract_struct_SUB:
2000 return "SUB";
2001 case extract_struct_P:
2002 return "P";
2003 case extract_struct_H:
2004 return "H";
2005 case extract_struct_H1:
2006 return "H1";
2007 case extract_struct_H2:
2008 return "H2";
2009 case extract_struct_H3:
2010 return "H3";
2011 case extract_struct_H4:
2012 return "H4";
2013 case extract_struct_H5:
2014 return "H5";
2015 case extract_struct_H6:
2016 return "H6";
2017 case extract_struct_LIST:
2018 return "LIST";
2019 case extract_struct_LISTITEM:
2020 return "LISTITEM";
2021 case extract_struct_LABEL:
2022 return "LABEL";
2023 case extract_struct_LISTBODY:
2024 return "LISTBODY";
2025 case extract_struct_TABLE:
2026 return "TABLE";
2027 case extract_struct_TR:
2028 return "TR";
2029 case extract_struct_TH:
2030 return "TH";
2031 case extract_struct_TD:
2032 return "TD";
2033 case extract_struct_THEAD:
2034 return "THEAD";
2035 case extract_struct_TBODY:
2036 return "TBODY";
2037 case extract_struct_TFOOT:
2038 return "TFOOT";
2039 case extract_struct_SPAN:
2040 return "SPAN";
2041 case extract_struct_QUOTE:
2042 return "QUOTE";
2043 case extract_struct_NOTE:
2044 return "NOTE";
2045 case extract_struct_REFERENCE:
2046 return "REFERENCE";
2047 case extract_struct_BIBENTRY:
2048 return "BIBENTRY";
2049 case extract_struct_CODE:
2050 return "CODE";
2051 case extract_struct_LINK:
2052 return "LINK";
2053 case extract_struct_ANNOT:
2054 return "ANNOT";
2055 case extract_struct_EM:
2056 return "EM";
2057 case extract_struct_STRONG:
2058 return "STRONG";
2059 case extract_struct_RUBY:
2060 return "RUBY";
2061 case extract_struct_RB:
2062 return "RB";
2063 case extract_struct_RT:
2064 return "RT";
2065 case extract_struct_RP:
2066 return "RP";
2067 case extract_struct_WARICHU:
2068 return "WARICHU";
2069 case extract_struct_WT:
2070 return "WT";
2071 case extract_struct_WP:
2072 return "WP";
2073 case extract_struct_FIGURE:
2074 return "FIGURE";
2075 case extract_struct_FORMULA:
2076 return "FORMULA";
2077 case extract_struct_FORM:
2078 return "FORM";
2079 case extract_struct_ARTIFACT:
2080 return "ARTIFACT";
2081 }
2082 }
2083
2084 static int
2085 paragraph_to_text(
2086 extract_alloc_t *alloc,
2087 paragraph_t *paragraph,
2088 extract_astring_t *text)
2089 {
2090 content_line_iterator lit;
2091 line_t *line;
2092
2093 for (line = content_line_iterator_init(&lit, &paragraph->content); line != NULL; line = content_line_iterator_next(&lit))
2094 {
2095 content_span_iterator sit;
2096 span_t *span;
2097
2098 for (span = content_span_iterator_init(&sit, &line->content); span != NULL; span = content_span_iterator_next(&sit))
2099 {
2100 int c;
2101
2102 for (c=0; c<span->chars_num; ++c)
2103 {
2104 /* We encode each character as utf8. */
2105 char_t* char_ = &span->chars[c];
2106 unsigned cc = char_->ucs;
2107 if (extract_astring_catc_unicode(
2108 alloc,
2109 text,
2110 cc,
2111 0 /*xml*/,
2112 1 /*ascii_ligatures*/,
2113 1 /*ascii_dash*/,
2114 1 /*ascii_apostrophe*/
2115 )) return -1;
2116 }
2117 }
2118 }
2119 if (extract_astring_catc(alloc, text, '\n')) return -1;
2120
2121 return 0;
2122 }
2123
2124 static int
2125 paragraphs_to_text_content(
2126 extract_alloc_t *alloc,
2127 content_root_t *paragraphs,
2128 extract_astring_t *text)
2129 {
2130 content_iterator cit;
2131 content_t *content;
2132
2133 for (content = content_iterator_init(&cit, paragraphs); content != NULL; content = content_iterator_next(&cit))
2134 {
2135 if (content->type == content_paragraph)
2136 {
2137 if (paragraph_to_text(alloc, (paragraph_t *)content, text)) return -1;
2138 }
2139 else if (content->type == content_block)
2140 {
2141 block_t *block = (block_t *)content;
2142 content_paragraph_iterator pit;
2143 paragraph_t *paragraph;
2144
2145 for (paragraph = content_paragraph_iterator_init(&pit, &block->content); paragraph != NULL; paragraph = content_paragraph_iterator_next(&pit))
2146 {
2147 if (paragraph_to_text(alloc, paragraph, text)) return -1;
2148 }
2149 }
2150 }
2151 return 0;
2152 }
2153
2154
2155 static int extract_write_tables_csv(extract_t *extract)
2156 {
2157 int ret = -1;
2158 int p;
2159 char *path = NULL;
2160 FILE *f = NULL;
2161 extract_astring_t text = {NULL, 0};
2162
2163 if (!extract->tables_csv_format) return 0;
2164
2165 outf("extract_write_tables_csv(): path_format=%s", extract->tables_csv_format);
2166 outf("extract->document.pages_num=%i", extract->document.pages_num);
2167 for (p=0; p<extract->document.pages_num; ++p)
2168 {
2169 int c;
2170 extract_page_t *page = extract->document.pages[p];
2171 for (c=0; c<page->subpages_num; ++c)
2172 {
2173 content_table_iterator tit;
2174 table_t *table;
2175 subpage_t *subpage = page->subpages[c];
2176
2177 outf("p=%i subpage->tables_num=%i", p, content_count_tables(&subpage->tables));
2178 for (table = content_table_iterator_init(&tit, &subpage->tables); table != NULL; table = content_table_iterator_next(&tit))
2179 {
2180 int y;
2181 extract_free(extract->alloc, &path);
2182 if (extract_asprintf(extract->alloc, &path, extract->tables_csv_format, extract->tables_csv_i) < 0) goto end;
2183 extract->tables_csv_i += 1;
2184 outf("Writing table to: %s", path);
2185 outf("table->cells_num_x=%i", table->cells_num_x);
2186 outf("table->cells_num_y=%i", table->cells_num_y);
2187 f = fopen(path, "w");
2188 if (!f) goto end;
2189 for (y=0; y<table->cells_num_y; ++y)
2190 {
2191 int x;
2192 int have_output = 0;
2193 for (x=0; x<table->cells_num_x; ++x)
2194 {
2195 cell_t* cell = table->cells[table->cells_num_x * y + x];
2196 extract_astring_free(extract->alloc, &text);
2197 if (y==0)
2198 {
2199 outf("y=0 x=%i cell->rect=%s", x, extract_rect_string(&cell->rect));
2200 }
2201 if (have_output) fprintf(f, ",");
2202 have_output = 1;
2203 if (paragraphs_to_text_content(
2204 extract->alloc,
2205 &cell->content,
2206 &text
2207 )) goto end;
2208 /* Reference cvs output trims trailing spaces. */
2209 extract_astring_char_truncate_if(&text, ' ');
2210 fprintf(f, "\"%s\"", text.chars ? text.chars : "");
2211 }
2212 fprintf(f, "\n");
2213 }
2214 fclose(f);
2215 f = NULL;
2216 }
2217 }
2218 }
2219
2220 ret = 0;
2221 end:
2222
2223 if (f) fclose(f);
2224 extract_free(extract->alloc, &path);
2225 extract_astring_free(extract->alloc, &text);
2226
2227 return ret;
2228 }
2229
2230
2231 int extract_process(
2232 extract_t *extract,
2233 int spacing,
2234 int rotation,
2235 int images)
2236 {
2237 int e = -1;
2238
2239 if (extract_realloc2(
2240 extract->alloc,
2241 &extract->contentss,
2242 sizeof(*extract->contentss) * extract->contentss_num,
2243 sizeof(*extract->contentss) * (extract->contentss_num + 1)
2244 )) goto end;
2245 extract_astring_init(&extract->contentss[extract->contentss_num]);
2246 extract->contentss_num += 1;
2247
2248 if (extract_document_join(extract->alloc, &extract->document, extract->layout_analysis, extract->master_space_guess)) goto end;
2249
2250 switch (extract->format)
2251 {
2252 case extract_format_ODT:
2253 if (extract_document_to_odt_content(
2254 extract->alloc,
2255 &extract->document,
2256 spacing,
2257 rotation,
2258 images,
2259 &extract->contentss[extract->contentss_num - 1],
2260 &extract->odt_styles
2261 )) goto end;
2262 break;
2263 case extract_format_DOCX:
2264 if (extract_document_to_docx_content(
2265 extract->alloc,
2266 &extract->document,
2267 spacing,
2268 rotation,
2269 images,
2270 &extract->contentss[extract->contentss_num - 1]
2271 )) goto end;
2272 break;
2273 case extract_format_HTML:
2274 if (extract_document_to_html_content(
2275 extract->alloc,
2276 &extract->document,
2277 rotation,
2278 images,
2279 &extract->contentss[extract->contentss_num - 1]
2280 )) goto end;
2281 break;
2282 case extract_format_JSON:
2283 if (extract_document_to_json_content(
2284 extract->alloc,
2285 &extract->document,
2286 rotation,
2287 images,
2288 &extract->contentss[extract->contentss_num - 1]
2289 )) goto end;
2290 break;
2291 case extract_format_TEXT:
2292 {
2293 int p;
2294 for (p=0; p<extract->document.pages_num; ++p)
2295 {
2296 extract_page_t* page = extract->document.pages[p];
2297 int c;
2298 for (c=0; c<page->subpages_num; ++c)
2299 {
2300 subpage_t* subpage = page->subpages[c];
2301 if (paragraphs_to_text_content(
2302 extract->alloc,
2303 &subpage->content,
2304 &extract->contentss[extract->contentss_num - 1]
2305 )) goto end;
2306 }
2307 }
2308 break;
2309 }
2310 default:
2311 outf0("Invalid format=%i", extract->format);
2312 assert(0);
2313 errno = EINVAL;
2314 return 1;
2315 }
2316
2317 if (extract_document_images(extract->alloc, &extract->document, &extract->images)) goto end;
2318
2319 if (extract->tables_csv_format)
2320 {
2321 extract_write_tables_csv(extract);
2322 }
2323
2324 {
2325 int p;
2326 for (p=0; p<extract->document.pages_num; ++p) {
2327 page_free(extract->alloc, &extract->document.pages[p]);
2328 }
2329 extract_free(extract->alloc, &extract->document.pages);
2330 extract->document.pages_num = 0;
2331 }
2332
2333 e = 0;
2334 end:
2335
2336 return e;
2337 }
2338
2339 int extract_write(extract_t *extract, extract_buffer_t *buffer)
2340 {
2341 int e = -1;
2342 extract_zip_t *zip = NULL;
2343 char *text2 = NULL;
2344 int i;
2345
2346 switch (extract->format)
2347 {
2348 case extract_format_ODT:
2349 {
2350 if (extract_zip_open(buffer, &zip)) goto end;
2351 for (i=0; i<odt_template_items_num; ++i) {
2352 const odt_template_item_t* item = &odt_template_items[i];
2353 extract_free(extract->alloc, &text2);
2354 outf("i=%i item->name=%s", i, item->name);
2355 if (extract_odt_content_item(
2356 extract->alloc,
2357 extract->contentss,
2358 extract->contentss_num,
2359 &extract->odt_styles,
2360 &extract->images,
2361 item->name,
2362 item->text,
2363 &text2
2364 ))
2365 {
2366 goto end;
2367 }
2368 {
2369 const char* text3 = (text2) ? text2 : item->text;
2370 if (extract_zip_write_file(zip, text3, strlen(text3), item->name)) goto end;
2371 }
2372 }
2373 outf0("extract->images.images_num=%i", extract->images.images_num);
2374 for (i=0; i<extract->images.images_num; ++i) {
2375 image_t* image = extract->images.images[i];
2376 extract_free(extract->alloc, &text2);
2377 if (extract_asprintf(extract->alloc, &text2, "Pictures/%s", image->name) < 0) goto end;
2378 if (extract_zip_write_file(zip, image->data, image->data_size, text2)) goto end;
2379 }
2380 if (extract_zip_close(&zip)) goto end;
2381 break;
2382 }
2383 case extract_format_DOCX:
2384 {
2385 if (extract_zip_open(buffer, &zip)) goto end;
2386 for (i=0; i<docx_template_items_num; ++i) {
2387 const docx_template_item_t* item = &docx_template_items[i];
2388 extract_free(extract->alloc, &text2);
2389 outf("i=%i item->name=%s", i, item->name);
2390 if (extract_docx_content_item(
2391 extract->alloc,
2392 extract->contentss,
2393 extract->contentss_num,
2394 &extract->images,
2395 item->name,
2396 item->text,
2397 &text2
2398 ))
2399 {
2400 goto end;
2401 }
2402
2403 {
2404 const char* text3 = (text2) ? text2 : item->text;
2405 if (extract_zip_write_file(zip, text3, strlen(text3), item->name)) goto end;
2406 }
2407 }
2408 for (i=0; i<extract->images.images_num; ++i) {
2409 image_t* image = extract->images.images[i];
2410 extract_free(extract->alloc, &text2);
2411 if (extract_asprintf(extract->alloc, &text2, "word/media/%s", image->name) < 0) goto end;
2412 if (extract_zip_write_file(zip, image->data, image->data_size, text2)) goto end;
2413 }
2414 if (extract_zip_close(&zip)) goto end;
2415 break;
2416 }
2417 case extract_format_HTML:
2418 case extract_format_TEXT:
2419 for (i=0; i<extract->contentss_num; ++i)
2420 {
2421 if (extract_buffer_write(buffer, extract->contentss[i].chars, extract->contentss[i].chars_num, NULL)) goto end;
2422 }
2423 break;
2424 case extract_format_JSON:
2425 {
2426 int first = 1;
2427 if (extract_buffer_cat(buffer, "{\n\"elements\" : "))
2428 goto end;
2429 for (i=0; i<extract->contentss_num; ++i)
2430 {
2431 if (!first && extract_buffer_cat(buffer, ",\n"))
2432 goto end;
2433 if (extract->contentss[i].chars_num > 0)
2434 first = 0;
2435 if (extract_buffer_write(buffer, extract->contentss[i].chars, extract->contentss[i].chars_num, NULL)) goto end;
2436 }
2437 if (extract_buffer_cat(buffer, "\n}\n"))
2438 goto end;
2439 break;
2440 }
2441 default:
2442 outf0("Invalid format=%i", extract->format);
2443 assert(0);
2444 errno = EINVAL;
2445 return 1;
2446 }
2447
2448 e = 0;
2449 end:
2450
2451 if (e)
2452 {
2453 outf("failed: %s", strerror(errno));
2454 extract_zip_close(&zip);
2455 }
2456 extract_free(extract->alloc, &text2);
2457
2458 return e;
2459 }
2460
2461 int extract_write_content(extract_t *extract, extract_buffer_t *buffer)
2462 {
2463 int i;
2464
2465 for (i=0; i<extract->contentss_num; ++i) {
2466 if (extract_buffer_write(
2467 buffer,
2468 extract->contentss[i].chars,
2469 extract->contentss[i].chars_num,
2470 NULL /*o_actual*/
2471 )) return -1;
2472 }
2473
2474 return 0;
2475 }
2476
/* Returns 1 if <string> finishes with exactly the characters of <end>, and 0
 * otherwise. An empty <end> matches every string. Both arguments must be
 * NUL-terminated strings. */
static int string_ends_with(const char *string, const char *end)
{
    const size_t total = strlen(string);
    const size_t suffix = strlen(end);
    const char  *tail;

    if (total < suffix)
    {
        /* <string> is too short to contain <end>. */
        return 0;
    }

    /* The tail is exactly as long as <end>, so a full string comparison is
     * equivalent to comparing <suffix> bytes. */
    tail = string + (total - suffix);
    return (strcmp(tail, end) == 0) ? 1 : 0;
}
2486
2487 int extract_write_template(
2488 extract_t *extract,
2489 const char *path_template,
2490 const char *path_out,
2491 int preserve_dir)
2492 {
2493 if (string_ends_with(path_out, ".odt"))
2494 {
2495 return extract_odt_write_template(
2496 extract->alloc,
2497 extract->contentss,
2498 extract->contentss_num,
2499 &extract->odt_styles,
2500 &extract->images,
2501 path_template,
2502 path_out,
2503 preserve_dir);
2504 }
2505 else
2506 {
2507 return extract_docx_write_template(
2508 extract->alloc,
2509 extract->contentss,
2510 extract->contentss_num,
2511 &extract->images,
2512 path_template,
2513 path_out,
2514 preserve_dir);
2515 }
2516 }
2517
2518
2519 void extract_end(extract_t **pextract)
2520 {
2521 int i;
2522 extract_t *extract = *pextract;
2523
2524 if (!extract) return;
2525
2526 extract_document_free(extract->alloc, &extract->document);
2527 for (i=0; i<extract->contentss_num; ++i) {
2528 extract_astring_free(extract->alloc, &extract->contentss[i]);
2529 }
2530 extract_free(extract->alloc, &extract->contentss);
2531 extract_images_free(extract->alloc, &extract->images);
2532 extract_odt_styles_free(extract->alloc, &extract->odt_styles);
2533
2534 extract_free(extract->alloc, pextract);
2535 }
2536
/* Calls extract_span_string() with NULL for both arguments and ignores the
 * result.
 * NOTE(review): presumably this makes extract_span_string() release internal
 * state it keeps between calls - confirm against its definition. */
void extract_internal_end(void)
{
    (void) extract_span_string(NULL, NULL);
}
2541
2542 void extract_exp_min(extract_t *extract, size_t size)
2543 {
2544 extract_alloc_exp_min(extract->alloc, size);
2545 }
2546
2547 double extract_font_size(matrix4_t *ctm)
2548 {
2549 double font_size = extract_matrix_expansion(*ctm);
2550
2551 /* Round font_size to nearest 0.01. */
2552 font_size = (double) (int) (font_size * 100.0f + 0.5f) / 100.0f;
2553
2554 return font_size;
2555 }
2556
2557 rect_t extract_block_pre_rotation_bounds(block_t *block, double angle)
2558 {
2559 content_paragraph_iterator pit;
2560 paragraph_t *paragraph;
2561 rect_t pre_box = extract_rect_empty;
2562 matrix4_t unrotate, rotate;
2563 point_t centre, trans_centre;
2564
2565 /* Construct a matrix to undo the rotation that we are about to put into
2566 * the file. i.e. get us a matrix that maps us from where the chars are
2567 * positioned back to the pre-rotated position. These pre-rotated positions
2568 * can then be used to calculate the origin/extent of the area that we
2569 * need to put into the file. */
2570
2571 /* The well know rotation matrixes:
2572 *
2573 * CW: [ cos(theta) sin(theta) ] CCW: [ cos(theta) -sin(theta) ]
2574 * [ -sin(theta) cos(theta) ] [ sin(theta) cos(theta) ]
2575 */
2576
2577 /* Word gives us an angle to rotate by clockwise. So the inverse is the
2578 * CCW matrix: */
2579 unrotate.a = cos(angle);
2580 unrotate.b = -sin(angle);
2581 unrotate.c = -unrotate.b;
2582 unrotate.d = unrotate.a;
2583 /* And the forward rotation is the CW matrix: */
2584 rotate.a = unrotate.a; /* cos(theta) = cos(-theta) */
2585 rotate.b = -unrotate.b; /* sin(theta) = -sin(-theta) */
2586 rotate.c = -rotate.b;
2587 rotate.d = rotate.a;
2588
2589 /* So ctm.unrotate.rotate = ctm, by construction. ctm.unrotate should
2590 * (in the common cases where the ctm is just a scale + rotation) map
2591 * all our character locations back to a rectangular region. We now
2592 * calculate that region as pre_box. */
2593
2594 for (paragraph = content_paragraph_iterator_init(&pit, &block->content); paragraph != NULL; paragraph = content_paragraph_iterator_next(&pit))
2595 {
2596 content_line_iterator lit;
2597 line_t *line;
2598
2599 for (line = content_line_iterator_init(&lit, &paragraph->content); line != NULL; line = content_line_iterator_next(&lit))
2600 {
2601 span_t *span0 = content_first_span(&line->content);
2602 span_t *span1 = content_last_span(&line->content);
2603 point_t start = { span0->chars[0].x, span0->chars[0].y};
2604 point_t end = extract_end_of_span(span1);
2605 double hoff = span0->font_bbox.max.y - (span0->font_bbox.min.y < 0 ? span0->font_bbox.min.y : 0);
2606
2607 outf("%f %f -> %f %f\n", start.x, start.y, end.x, end.y);
2608 start = extract_matrix4_transform_point(unrotate, start);
2609 end = extract_matrix4_transform_point(unrotate, end);
2610 outf(" ---------> %f %f -> %f %f\n", start.x, start.y, end.x, end.y);
2611
2612 /* Allow for the height of the span here. */
2613 hoff *= sqrt(span0->ctm.c * span0->ctm.c + span0->ctm.d * span0->ctm.d);
2614
2615 if (start.y < end.y)
2616 start.y -= hoff;
2617 else
2618 end.y -= hoff;
2619 pre_box = extract_rect_union_point(pre_box, start);
2620 pre_box = extract_rect_union_point(pre_box, end);
2621 }
2622 }
2623
2624 /* So pre_box rotated around the origin by angle should give us the region we want. */
2625 /* BUT word etc rotate around the centre of the box. So we need to offset the region to
2626 * allow for this. */
2627 /* So word, takes the declared box, and subtracts the centre vector from it. Then it
2628 * does the rotation (around the origin - now the centre of the box). Then it adds the
2629 * centre vector to it again. So the centre of the box does not change. Unfortunately,
2630 * we haven't easily got the centre vector of the transformed box to hand, so calculate
2631 * it by rerotating the centre vector of the pre_box.*/
2632 centre.x = (pre_box.min.x + pre_box.max.x)/2;
2633 centre.y = (pre_box.min.y + pre_box.max.y)/2;
2634 trans_centre = extract_matrix4_transform_point(rotate, centre);
2635 #if 0
2636 {
2637 point_t centre2 = extract_matrix4_transform_point(unrotate, trans_centre);
2638 centre2 = centre2;
2639 }
2640 #endif
2641 #if 0
2642 printf("Centre of this paragraph should be %f %f\n", trans_centre.x, trans_centre.y);
2643 #endif
2644
2645 /* So the centre of our pre_box should be trans_centre not centre. */
2646 centre.x -= trans_centre.x;
2647 centre.y -= trans_centre.y;
2648 pre_box.min.x -= centre.x;
2649 pre_box.min.y -= centre.y;
2650 pre_box.max.x -= centre.x;
2651 pre_box.max.y -= centre.y;
2652
2653 #if 0
2654 /* So, as a sanity check, convert the 4 corners back to a quad. */
2655 {
2656 rect_t centred_box = { pre_box.min.x - trans_centre.x,
2657 pre_box.min.y - trans_centre.y,
2658 pre_box.max.x - trans_centre.x,
2659 pre_box.max.y - trans_centre.y };
2660 point_t corner;
2661
2662 corner = extract_matrix4_transform_xy(rotate, centred_box.min.x, centred_box.min.y);
2663 corner.x += trans_centre.x;
2664 corner.y += trans_centre.y;
2665 printf("TL: %f %f\n", corner.x, corner.y);
2666 corner = extract_matrix4_transform_xy(rotate, centred_box.max.x, centred_box.min.y);
2667 corner.x += trans_centre.x;
2668 corner.y += trans_centre.y;
2669 printf("TR: %f %f\n", corner.x, corner.y);
2670 corner = extract_matrix4_transform_xy(rotate, centred_box.max.x, centred_box.max.y);
2671 corner.x += trans_centre.x;
2672 corner.y += trans_centre.y;
2673 printf("BR: %f %f\n", corner.x, corner.y);
2674 corner = extract_matrix4_transform_xy(rotate, centred_box.min.x, centred_box.max.y);
2675 corner.x += trans_centre.x;
2676 corner.y += trans_centre.y;
2677 printf("BL: %f %f\n", corner.x, corner.y);
2678 }
2679 #endif
2680
2681 /* And a further adjustment. If we mess up line widths, text can wrap too early,
2682 * resulting in content extending too far down the page, and truncating at the
2683 * bottom of the text frame. Similarly, line spacing. We can't tell word 'make
2684 * the box large enough', so we have to add a fudge factor and extend the bottom
2685 * of the box ourselves. As long as we aren't filling the background, or drawing
2686 * a bounding box, this should be fine.
2687 *
2688 * Unfortunately, we can't just extend pre_box downwards, because we rotate from
2689 * the centre of the box, so we need to adjust for that.
2690 */
2691 /* Double the height of the box. */
2692 {
2693 /* extra = how much to extend the box downwards. */
2694 double extra = pre_box.max.y - pre_box.min.y;
2695 /* So we are offsetting the centre of the box by offset. */
2696 point_t offset = { 0, extra/2 };
2697 point_t toffset;
2698 pre_box.max.y += extra;
2699 toffset = extract_matrix4_transform_point(rotate, offset);
2700 pre_box.min.x += toffset.x - offset.x;
2701 pre_box.min.y += toffset.y - offset.y;
2702 pre_box.max.x += toffset.x - offset.x;
2703 pre_box.max.y += toffset.y - offset.y;
2704 }
2705
2706 return pre_box;
2707 }
2708
2709 double extract_baseline_angle(const matrix4_t *ctm)
2710 {
2711 return atan2(ctm->b, ctm->a);
2712 }