Mercurial > hgrepos > Python2 > PyMuPDF
view mupdf-source/thirdparty/extract/src/extract.c @ 42:4621bd954a09
FIX: Need packaging at runtime because of the parsing of mupdf_version into a tuple.
In the merge it was removed erroneously.
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Sat, 11 Oct 2025 17:14:12 +0200 |
| parents | b50eed0cc0ef |
| children |
line wrap: on
line source
#include "extract/extract.h"
#include "extract/alloc.h"
#include "astring.h"
#include "document.h"
#include "docx.h"
#include "docx_template.h"
#include "html.h"
#include "json.h"
#include "mem.h"
#include "odt.h"
#include "odt_template.h"
#include "outf.h"
#include "xml.h"
#include "zip.h"
#include <assert.h>
#include <errno.h>
#include <math.h>
#include <stdarg.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Rectangle running from -DBL_MAX to +DBL_MAX on both axes. */
const rect_t extract_rect_infinite = { { -DBL_MAX, -DBL_MAX }, { DBL_MAX, DBL_MAX } };

/* Inverted rectangle: min is +DBL_MAX and max is -DBL_MAX on both axes. */
const rect_t extract_rect_empty = { { DBL_MAX, DBL_MAX }, { -DBL_MAX, -DBL_MAX } };

/* Returns the square root of the absolute value of the determinant of m. */
double extract_matrix_expansion(matrix4_t m)
{
    double determinant = m.a * m.d - m.b * m.c;

    return sqrt(fabs(determinant));
}

/* Returns the inverse of *ctm. If *ctm is singular (determinant exactly zero)
a diagnostic is emitted and the identity matrix is returned instead. */
matrix4_t extract_matrix4_invert(const matrix4_t *ctm)
{
    matrix4_t inverse = {1, 0, 0, 1};
    double det = ctm->a*ctm->d - ctm->b*ctm->c;

    if (det == 0)
    {
        outf("cannot invert ctm=(%f %f %f %f)", ctm->a, ctm->b, ctm->c, ctm->d);
        return inverse;
    }

    inverse.a = +ctm->d / det;
    inverse.b = -ctm->b / det;
    inverse.c = -ctm->c / det;
    inverse.d = +ctm->a / det;
    return inverse;
}

/* Resets a char_t: zero position, code point and advance, empty bbox. */
static void char_init(char_t* item)
{
    item->bbox = extract_rect_empty;
    item->adv = 0;
    item->ucs = 0;
    item->y = 0;
    item->x = 0;
}

/* Formats *point as "(x y)". The result lives in a single static buffer, so
it is overwritten by the next call. */
const char *extract_point_string(const point_t *point)
{
    static char text[128];

    snprintf(text, sizeof(text), "(%f %f)", point->x, point->y);
    return text;
}

/* Formats *rect as "((minx miny) (maxx maxy))". Two static buffers are used
alternately, so the results of two consecutive calls can be used together. */
const char *extract_rect_string(const rect_t *rect)
{
    static char text[2][256];
    static int slot = 0;

    /* Flip between buffer 0 and buffer 1. */
    slot ^= 1;
    snprintf(text[slot], sizeof(text[slot]), "((%f %f) (%f %f))",
            rect->min.x, rect->min.y, rect->max.x, rect->max.y);
    return text[slot];
}

/* Returns a description of *span: a summary line, one entry per character,
then the characters themselves in double quotes. The text is held in a static
extract_astring_t that is freed and rebuilt on every call. Passing span=NULL
only frees that internal string and returns NULL. */
const char *extract_span_string(extract_alloc_t *alloc, span_t *span)
{
    static extract_astring_t ret = {0};
    char buffer[400];
    double x0 = 0;
    double y0 = 0;
    double x1 = 0;
    double y1 = 0;
    int c0 = 0;
    int c1 = 0;
    int i;

    extract_astring_free(alloc, &ret);
    if (span == NULL)
    {
        /* This frees our internal data, and is used by extract_internal_end(). */
        return NULL;
    }

    /* Pick out the first and last characters for the summary, if any. */
    if (span->chars_num)
    {
        const char_t *first = &span->chars[0];
        const char_t *last = &span->chars[span->chars_num-1];

        c0 = first->ucs;
        x0 = first->x;
        y0 = first->y;
        c1 = last->ucs;
        x1 = last->x;
        y1 = last->y;
    }

    snprintf(buffer, sizeof(buffer),
            "span ctm=%s chars_num=%i (%c:%f,%f)..(%c:%f,%f) font=%s:(%f) wmode=%i chars_num=%i: ",
            extract_matrix4_string(&span->ctm),
            span->chars_num,
            c0, x0, y0,
            c1, x1, y1,
            span->font_name,
            extract_font_size(&span->ctm),
            span->flags.wmode,
            span->chars_num
            );
    extract_astring_cat(alloc, &ret, buffer);

    /* One entry per character. */
    for (i=0; i<span->chars_num; ++i)
    {
        snprintf(
                buffer,
                sizeof(buffer),
                " i=%i {x=%f y=%f ucs=%i adv=%f}",
                i,
                span->chars[i].x,
                span->chars[i].y,
                span->chars[i].ucs,
                span->chars[i].adv
                );
        extract_astring_cat(alloc, &ret, buffer);
    }

    /* Finally the raw characters, each truncated to a char. */
    extract_astring_cat(alloc, &ret, ": ");
    extract_astring_catc(alloc, &ret, '"');
    for (i=0; i<span->chars_num; ++i)
    {
        extract_astring_catc(alloc, &ret, (char) span->chars[i].ucs);
    }
    extract_astring_catc(alloc, &ret, '"');
    return ret.chars;
}

/* Grows span->chars by one element, initialises the new element and sets its
code point to c. Returns the new element, or NULL if the reallocation fails. */
char_t *extract_span_append_c(extract_alloc_t *alloc, span_t *span, int c)
{
    char_t *appended;

    if (extract_realloc2(alloc,
            &span->chars,
            sizeof(*span->chars) * span->chars_num,
            sizeof(*span->chars) * (span->chars_num + 1)))
    {
        return NULL;
    }

    appended = &span->chars[span->chars_num++];
    char_init(appended);
    appended->ucs = c;
    return appended;
}

/* Returns the final character of a span; the span must not be empty. */
char_t *extract_span_char_last(span_t *span)
{
    assert(span->chars_num > 0);
    return &span->chars[span->chars_num-1];
}

/* Returns last span in a line.
*/ span_t *extract_line_span_last(line_t *line) { assert(line->content.base.prev != &line->content.base && line->content.base.prev->type == content_span); return (span_t *)line->content.base.prev; } span_t *extract_line_span_first(line_t *line) { assert(line->content.base.next != &line->content.base && line->content.base.next->type == content_span); return (span_t *)line->content.base.next; } void extract_paragraph_free(extract_alloc_t *alloc, paragraph_t **pparagraph) { paragraph_t *paragraph = *pparagraph; if (paragraph == NULL) return; content_unlink(¶graph->base); content_clear(alloc, ¶graph->content); extract_free(alloc, pparagraph); } void extract_block_free(extract_alloc_t *alloc, block_t **pblock) { block_t *block = *pblock; if (block == NULL) return; content_unlink(&block->base); content_clear(alloc, &block->content); extract_free(alloc, pblock); } void extract_table_free(extract_alloc_t *alloc, table_t **ptable) { int c; table_t *table = *ptable; content_unlink(&table->base); for (c = 0; c< table->cells_num_x * table->cells_num_y; ++c) { extract_cell_free(alloc, &table->cells[c]); } extract_free(alloc, &table->cells); extract_free(alloc, ptable); } static void structure_clear(extract_alloc_t *alloc, structure_t *structure) { while (structure != NULL) { structure_t *next = structure->sibling_next; structure_clear(alloc, structure->kids_first); extract_free(alloc, &structure); structure = next; } } void extract_subpage_free(extract_alloc_t *alloc, subpage_t **psubpage) { subpage_t *subpage = *psubpage; if (!subpage) return; content_clear(alloc, &subpage->content); content_clear(alloc, &subpage->tables); extract_free(alloc, &subpage->tablelines_horizontal.tablelines); extract_free(alloc, &subpage->tablelines_vertical.tablelines); extract_free(alloc, psubpage); } static void page_free(extract_alloc_t *alloc, extract_page_t **ppage) { int c; extract_page_t *page = *ppage; if (!page) return; for (c=0; c<page->subpages_num; ++c) { subpage_t *subpage = 
page->subpages[c]; extract_subpage_free(alloc, &subpage); } extract_split_free(alloc, &page->split); extract_free(alloc, &page->subpages); extract_free(alloc, ppage); } void content_append(content_root_t *root, content_t *content) { assert(root && root->base.type == content_root); /* Unlink content from anywhere it might be. */ content_unlink(content); /* Sanity check root. */ if (root->base.next == &root->base) { assert(root->base.prev == &root->base); } /* And append content */ content->next = &root->base; content->prev = root->base.prev; content->prev->next = content; root->base.prev = content; } void content_append_span(content_root_t *root, span_t *span) { content_append(root, &span->base); } void content_append_line(content_root_t *root, line_t *line) { content_append(root, &line->base); } void content_append_paragraph(content_root_t *root, paragraph_t *paragraph) { content_append(root, ¶graph->base); } void content_append_block(content_root_t *root, block_t *block) { content_append(root, &block->base); } int content_new_root(extract_alloc_t *alloc, content_root_t **proot) { if (extract_malloc(alloc, proot, sizeof(**proot))) return -1; content_init_root(*proot, NULL); return 0; } int content_new_span(extract_alloc_t *alloc, span_t **pspan, structure_t *structure) { if (extract_malloc(alloc, pspan, sizeof(**pspan))) return -1; extract_span_init(*pspan, structure); return 0; } int content_new_line(extract_alloc_t *alloc, line_t **pline) { if (extract_malloc(alloc, pline, sizeof(**pline))) return -1; extract_line_init(*pline); return 0; } int content_new_paragraph(extract_alloc_t *alloc, paragraph_t **pparagraph) { if (extract_malloc(alloc, pparagraph, sizeof(**pparagraph))) return -1; extract_paragraph_init(*pparagraph); return 0; } int content_new_block(extract_alloc_t *alloc, block_t **pblock) { if (extract_malloc(alloc, pblock, sizeof(**pblock))) return -1; extract_block_init(*pblock); return 0; } int content_new_table(extract_alloc_t *alloc, table_t 
**ptable) { if (extract_malloc(alloc, ptable, sizeof(**ptable))) return -1; extract_table_init(*ptable); return 0; } /* Appends new empty span content to a content_list_t; returns -1 with errno set on error. */ int content_append_new_span(extract_alloc_t *alloc, content_root_t *root, span_t **pspan, structure_t *structure) { if (content_new_span(alloc, pspan, structure)) return -1; content_append(root, &(*pspan)->base); return 0; } /* Appends new empty line content to a content_list_t; returns -1 with errno set on error. */ int content_append_new_line(extract_alloc_t *alloc, content_root_t *root, line_t **pline) { if (content_new_line(alloc, pline)) return -1; content_append(root, &(*pline)->base); return 0; } /* Appends new empty paragraph content to a content_list_t; returns -1 with errno set on error. */ int content_append_new_paragraph(extract_alloc_t *alloc, content_root_t *root, paragraph_t **pparagraph) { if (content_new_paragraph(alloc, pparagraph)) return -1; content_append(root, &(*pparagraph)->base); return 0; } /* Appends new empty block content to a content_list_t; returns -1 with errno set on error. */ int content_append_new_block(extract_alloc_t *alloc, content_root_t *root, block_t **pblock) { if (content_new_block(alloc, pblock)) return -1; content_append(root, &(*pblock)->base); return 0; } /* Appends new empty table content to a content_list_t; returns -1 with errno set on error. */ int content_append_new_table(extract_alloc_t *alloc, content_root_t *root, table_t **ptable) { if (content_new_table(alloc, ptable)) return -1; content_append(root, &(*ptable)->base); return 0; } /* Appends new empty image content to a content_list_t; returns -1 with errno set on error. 
*/ int content_append_new_image(extract_alloc_t *alloc, content_root_t *root, image_t **pimage) { if (extract_malloc(alloc, pimage, sizeof(**pimage))) return -1; extract_image_init(*pimage); content_append(root, &(*pimage)->base); return 0; } void content_replace(content_t *current, content_t *replacement) { assert(current->type != content_root && replacement->type != content_root); /* Unlink replacement. */ if (replacement->prev) { replacement->prev->next = replacement->next; replacement->next->prev = replacement->prev; } /* Insert replacement */ replacement->prev = current->prev; current->prev->next = replacement; replacement->next = current->next; current->next->prev = replacement; /* Unlink current */ current->prev = NULL; current->next = NULL; } /* Replaces current element with a new empty paragraph content; returns -1 with errno set on error. */ int content_replace_new_paragraph(extract_alloc_t *alloc, content_t *current, paragraph_t **pparagraph) { if (content_new_paragraph(alloc, pparagraph)) return -1; content_replace(current, &(*pparagraph)->base); return 0; } /* Replaces current element with a new empty block content; returns -1 with errno set on error. */ int content_replace_new_block(extract_alloc_t *alloc, content_t *current, block_t **pblock) { if (content_new_block(alloc, pblock)) return -1; content_replace(current, &(*pblock)->base); return 0; } /* Replaces current element with a new empty line content; returns -1 with errno set on error. 
*/ int content_replace_new_line(extract_alloc_t *alloc, content_t *current, line_t **pline) { if (content_new_line(alloc, pline)) return -1; content_replace(current, &(*pline)->base); return 0; } static void extract_images_free(extract_alloc_t *alloc, images_t *images) { int i; for (i=0; i<images->images_num; ++i) { extract_image_clear(alloc, images->images[i]); extract_free(alloc, &images->images[i]); } extract_free(alloc, &images->images); extract_free(alloc, &images->imagetypes); images->images_num = 0; images->imagetypes_num = 0; } /* Move image_t's from document->subpage[] to *o_images. On return document->subpage[].images* will be NULL etc. */ static int extract_document_images(extract_alloc_t *alloc, document_t *document, images_t *o_images) { int e = -1; int p; images_t images = {0}; outf("extract_document_images(): images.images_num=%i", images.images_num); for (p=0; p<document->pages_num; ++p) { extract_page_t *page = document->pages[p]; int c; for (c=0; c<page->subpages_num; ++c) { subpage_t *subpage = page->subpages[c]; content_image_iterator iit; image_t *image; int i; for (i = 0, image = content_image_iterator_init(&iit, &subpage->content); image != NULL; i++, image = content_image_iterator_next(&iit)) { if (extract_realloc2(alloc, &images.images, sizeof(image_t) * images.images_num, sizeof(image_t) * (images.images_num + 1))) goto end; outf("p=%i i=%i image->name=%s image->id=%s", p, i, image->name, image->id); assert(image->name); content_unlink(&image->base); images.images[images.images_num] = image; images.images_num += 1; /* Add image type if we haven't seen it before. */ { int it; for (it=0; it<images.imagetypes_num; ++it) { outf("it=%i images.imagetypes[it]=%s image->type=%s", it, images.imagetypes[it], image->type); if (!strcmp(images.imagetypes[it], image->type)) { break; } } if (it == images.imagetypes_num) { /* We haven't seen this image type before. 
*/ if (extract_realloc2( alloc, &images.imagetypes, sizeof(char*) * images.imagetypes_num, sizeof(char*) * (images.imagetypes_num + 1) )) goto end; assert(image->type); images.imagetypes[images.imagetypes_num] = image->type; images.imagetypes_num += 1; outf("have added images.imagetypes_num=%i", images.imagetypes_num); } } } } } e = 0; end: if (e) { extract_free(alloc, &images.images); } else { *o_images = images; } return e; } static void extract_document_free(extract_alloc_t *alloc, document_t *document) { int p; if (!document) return; for (p=0; p<document->pages_num; ++p) { page_free(alloc, &document->pages[p]); } extract_free(alloc, &document->pages); document->pages = NULL; document->pages_num = 0; structure_clear(alloc, document->structure); } /* Returns +1, 0 or -1 depending on sign of x. */ static int s_sign(double x) { if (x < 0) return -1; if (x > 0) return +1; return 0; } int extract_matrix4_cmp(const matrix4_t *lhs, const matrix4_t *rhs) { int ret; ret = s_sign(lhs->a - rhs->a); if (ret) return ret; ret = s_sign(lhs->b - rhs->b); if (ret) return ret; ret = s_sign(lhs->c - rhs->c); if (ret) return ret; ret = s_sign(lhs->d - rhs->d); if (ret) return ret; return 0; } point_t extract_matrix4_transform_point(matrix4_t m, point_t p) { double x = p.x; p.x = m.a * x + m.c * p.y; p.y = m.b * x + m.d * p.y; return p; } point_t extract_matrix4_transform_xy(matrix4_t m, double x, double y) { point_t p; p.x = m.a * x + m.c * y; p.y = m.b * x + m.d * y; return p; } matrix_t extract_multiply_matrix_matrix(matrix_t m1, matrix_t m2) { matrix_t ret; ret.a = m1.a * m2.a + m1.b * m2.c; ret.b = m1.a * m2.b + m1.b * m2.d; ret.c = m1.c * m2.a + m1.d * m2.c; ret.d = m1.c * m2.b + m1.d * m2.d; ret.e = m1.e * m2.a + m1.f * m2.c + m2.e; ret.f = m1.e * m2.b + m1.f * m2.d + m2.f; return ret; } matrix4_t extract_multiply_matrix4_matrix4(matrix4_t m1, matrix4_t m2) { matrix4_t ret; ret.a = m1.a * m2.a + m1.b * m2.c; ret.b = m1.a * m2.b + m1.b * m2.d; ret.c = m1.c * m2.a + m1.d * 
m2.c; ret.d = m1.c * m2.b + m1.d * m2.d; return ret; } static int s_matrix_read(const char *text, matrix_t *matrix) { int n; if (!text) { outf("text is NULL in s_matrix_read()"); errno = EINVAL; return -1; } n = sscanf(text, "%lf %lf %lf %lf %lf %lf", &matrix->a, &matrix->b, &matrix->c, &matrix->d, &matrix->e, &matrix->f); if (n != 6) { errno = EINVAL; return -1; } return 0; } static void document_init(document_t *document) { document->pages = NULL; document->pages_num = 0; document->structure = NULL; document->current = NULL; } /* If we exceed MAX_STRUCT_NEST then this probably indicates that * structure nesting is not to be trusted. */ #define MAX_STRUCT_NEST 64 struct extract_t { extract_alloc_t *alloc; int layout_analysis; double master_space_guess; document_t document; /* Number of extra spans from subpage_span_end_clean(). */ int num_spans_split; /* Number of extra spans from autosplit=1. */ int num_spans_autosplit; /* Only used if autosplit is non-zero. */ double span_offset_x; double span_offset_y; /* Used to generate unique ids for images. */ int image_n; /* List of strings that are the generated docx content for each page. When * zip_* can handle appending of data, we will be able to remove this list. 
*/ extract_astring_t *contentss; int contentss_num; images_t images; extract_format_t format; extract_odt_styles_t odt_styles; char *tables_csv_format; int tables_csv_i; enum { path_type_NONE, path_type_FILL, path_type_STROKE, } path_type; union { struct { matrix_t ctm; double color; point_t points[4]; int n; } fill; struct { matrix_t ctm; double color; double width; point_t point0; int point0_set; point_t point; int point_set; } stroke; } path; int next_uid; }; int extract_begin(extract_alloc_t *alloc, extract_format_t format, extract_t **pextract) { extract_t *extract; *pextract = NULL; if (1 && format != extract_format_ODT && format != extract_format_DOCX && format != extract_format_HTML && format != extract_format_TEXT && format != extract_format_JSON ) { outf0("Invalid format=%i\n", format); errno = EINVAL; return -1; } /* Create the extract structure. */ if (extract_malloc(alloc, &extract, sizeof(*extract))) return -1; extract_bzero(extract, sizeof(*extract)); extract->alloc = alloc; extract->master_space_guess = 0.5; document_init(&extract->document); /* FIXME: Start at 10 because template document might use some low-numbered IDs. 
*/ extract->image_n = 10; extract->format = format; extract->tables_csv_format = NULL; extract->tables_csv_i = 0; extract->next_uid = 1; *pextract = extract; return 0; } void extract_set_space_guess(extract_t *extract, double space_guess) { extract->master_space_guess = space_guess; } int extract_set_layout_analysis(extract_t *extract, int enable) { extract->layout_analysis = enable; return 0; } int extract_tables_csv_format(extract_t *extract, const char *path_format) { return extract_strdup(extract->alloc, path_format, &extract->tables_csv_format); } static void image_free_fn(void *handle, void *image_data) { (void) handle; free(image_data); } int extract_read_intermediate(extract_t *extract, extract_buffer_t *buffer) { int ret = -1; document_t *document = &extract->document; char *image_data = NULL; int num_spans = 0; extract_xml_tag_t tag; extract_xml_tag_init(&tag); if (extract_xml_pparse_init(extract->alloc, buffer, NULL /*first_line*/)) { outf("Failed to read start of intermediate data: %s", strerror(errno)); goto end; } /* Data read from <path> is expected to be XML looking like: <page> <span> <char ...> <char ...> ... </span> <span> ... </span> ... </page> <page> ... </page> ... We convert this into a list of subpage_t's, each containing a list of span_t's, each containing a list of char_t's. While doing this, we do some within-span processing by calling subpage_span_end_clean(): Remove spurious spaces. Split spans in two where there seem to be large gaps between glyphs. */ for(;;) { extract_page_t *page; subpage_t *subpage; rect_t mediabox = extract_rect_infinite; /* Fake mediabox */ int e = extract_xml_pparse_next(buffer, &tag); if (e == 1) break; /* EOF. */ if (e) goto end; if (!strcmp(tag.name, "?xml")) { /* We simply skip this if we find it. As of 2020-07-31, mutool adds this header to mupdf raw output, but gs txtwrite does not include it. 
*/ continue; } if (strcmp(tag.name, "page")) { outf("Expected <page> but tag.name='%s'", tag.name); errno = ESRCH; goto end; } outfx("loading spans for page %i...", document->pages_num); if (extract_page_begin(extract, mediabox.min.x, mediabox.min.y, mediabox.max.x, mediabox.max.y)) goto end; page = extract->document.pages[extract->document.pages_num-1]; if (!page) goto end; subpage = page->subpages[page->subpages_num-1]; if (!subpage) goto end; for(;;) { if (extract_xml_pparse_next(buffer, &tag)) goto end; if (!strcmp(tag.name, "/page")) { num_spans += content_count_spans(&subpage->content); break; } if (!strcmp(tag.name, "image")) { const char* type = extract_xml_tag_attributes_find(&tag, "type"); if (!type) { errno = EINVAL; goto end; } outf("image type=%s", type); if (!strcmp(type, "pixmap")) { int w; int h; int y; if (extract_xml_tag_attributes_find_int(&tag, "w", &w)) goto end; if (extract_xml_tag_attributes_find_int(&tag, "h", &h)) goto end; for (y=0; y<h; ++y) { int yy; if (extract_xml_pparse_next(buffer, &tag)) goto end; if (strcmp(tag.name, "line")) { outf("Expected <line> but tag.name='%s'", tag.name); errno = ESRCH; goto end; } if (extract_xml_tag_attributes_find_int(&tag, "y", &yy)) goto end; if (yy != y) { outf("Expected <line y=%i> but found <line y=%i>", y, yy); errno = ESRCH; goto end; } if (extract_xml_pparse_next(buffer, &tag)) goto end; if (strcmp(tag.name, "/line")) { outf("Expected </line> but tag.name='%s'", tag.name); errno = ESRCH; goto end; } } } else { /* Compressed. 
*/ size_t image_data_size; const char *c; size_t i; if (extract_xml_tag_attributes_find_size(&tag, "datasize", &image_data_size)) goto end; if (extract_malloc(extract->alloc, &image_data, image_data_size)) goto end; c = tag.text.chars; for(i=0;;) { int byte = 0; int cc; cc = *c; c += 1; if (cc == ' ' || cc == '\n') continue; if (cc >= '0' && cc <= '9') byte += cc-'0'; else if (cc >= 'a' && cc <= 'f') byte += 10 + cc - 'a'; else goto compressed_error; byte *= 16; cc = *c; c += 1; if (cc >= '0' && cc <= '9') byte += cc-'0'; else if (cc >= 'a' && cc <= 'f') byte += 10 + cc - 'a'; else goto compressed_error; image_data[i] = (char) byte; i += 1; if (i == image_data_size) { break; } continue; compressed_error: outf("Unrecognised hex character '%x' at offset %lli in image data", cc, (long long) (c-tag.text.chars)); errno = EINVAL; goto end; } if (extract_add_image( extract, type, 0 /*x*/, 0 /*y*/, 0 /*w*/, 0 /*h*/, image_data, image_data_size, image_free_fn, NULL )) { goto end; } image_data = NULL; } if (extract_xml_pparse_next(buffer, &tag)) goto end; if (strcmp(tag.name, "/image")) { outf("Expected </image> but tag.name='%s'", tag.name); errno = ESRCH; goto end; } continue; } if (strcmp(tag.name, "span")) { outf("Expected <span> but tag.name='%s'", tag.name); errno = ESRCH; goto end; } { matrix_t ctm; matrix_t trm; char *font_name; char *font_name2; int font_bold; int font_italic; int wmode; if (s_matrix_read(extract_xml_tag_attributes_find(&tag, "ctm"), &ctm)) goto end; if (s_matrix_read(extract_xml_tag_attributes_find(&tag, "trm"), &trm)) goto end; font_name = extract_xml_tag_attributes_find(&tag, "font_name"); if (!font_name) { outf("Failed to find attribute 'font_name'"); goto end; } font_name2 = strchr(font_name, '+'); if (font_name2) font_name = font_name2 + 1; font_bold = strstr(font_name, "-Bold") ? 1 : 0; font_italic = strstr(font_name, "-Oblique") ? 
1 : 0; if (extract_xml_tag_attributes_find_int(&tag, "wmode", &wmode)) goto end; if (extract_span_begin(extract, font_name, font_bold, font_italic, wmode, ctm.a, ctm.b, ctm.c, ctm.d, 0,0,0,0)) goto end; for(;;) { double x; double y; double adv; unsigned int ucs; if (extract_xml_pparse_next(buffer, &tag)) { outf("Failed to find <char or </span"); goto end; } if (!strcmp(tag.name, "/span")) { break; } if (strcmp(tag.name, "char")) { errno = ESRCH; outf("Expected <char> but tag.name='%s'", tag.name); goto end; } if (extract_xml_tag_attributes_find_double(&tag, "x", &x)) goto end; if (extract_xml_tag_attributes_find_double(&tag, "y", &y)) goto end; if (extract_xml_tag_attributes_find_double(&tag, "adv", &adv)) goto end; if (extract_xml_tag_attributes_find_uint(&tag, "ucs", &ucs)) goto end; /* BBox is bogus here. Analysis will fail. */ if (extract_add_char(extract, x, y, ucs, adv, x, y, x + adv, y + adv)) goto end; } extract_xml_tag_free(extract->alloc, &tag); } } if (extract_page_end(extract)) goto end; outf("page=%i subpage->num_spans=%i", document->pages_num, content_count_spans(&subpage->content)); } outf("num_spans=%i num_spans_split=%i num_spans_autosplit=%i", num_spans, extract->num_spans_split, extract->num_spans_autosplit ); ret = 0; end: extract_xml_tag_free(extract->alloc, &tag); extract_free(extract->alloc, &image_data); return ret; } int extract_span_begin( extract_t *extract, const char *font_name, int font_bold, int font_italic, int wmode, double ctm_a, double ctm_b, double ctm_c, double ctm_d, double bbox_x0, double bbox_y0, double bbox_x1, double bbox_y1) { int e = -1; extract_page_t *page; subpage_t *subpage; span_t *span; document_t *document = &extract->document; /* FIXME: RJW: Should continue the last span if everything is the same. 
*/ assert(document->pages_num > 0); page = document->pages[document->pages_num-1]; subpage = page->subpages[page->subpages_num-1]; outf("extract_span_begin(): ctm=(%f %f %f %f) font_name=%s, wmode=%i", ctm_a, ctm_b, ctm_c, ctm_d, font_name, wmode); if (content_append_new_span(extract->alloc, &subpage->content, &span, document->current)) goto end; span->ctm.a = ctm_a; span->ctm.b = ctm_b; span->ctm.c = ctm_c; span->ctm.d = ctm_d; span->font_bbox.min.x = bbox_x0; span->font_bbox.min.y = bbox_y0; span->font_bbox.max.x = bbox_x1; span->font_bbox.max.y = bbox_y1; { const char *ff = strchr(font_name, '+'); const char *f = (ff) ? ff+1 : font_name; if (extract_strdup(extract->alloc, f, &span->font_name)) goto end; span->flags.font_bold = font_bold ? 1 : 0; span->flags.font_italic = font_italic ? 1 : 0; span->flags.wmode = wmode ? 1 : 0; extract->span_offset_x = 0; extract->span_offset_y = 0; } e = 0; end: return e; } /* Create a new empty span, based on the current one. */ static span_t * split_to_new_span(extract_alloc_t *alloc, content_root_t *content, span_t *span0) { content_t save; span_t *span; char *name; if (extract_strdup(alloc, span0->font_name, &name)) return NULL; if (content_append_new_span(alloc, content, &span, span0->structure)) { extract_free(alloc, &name); return NULL; } save = span->base; /* Avoid overwriting linked list. */ *span = *span0; span->base = save; span->font_name = name; span->chars = NULL; span->chars_num = 0; return span; } /* This routine returns the previous non-space-char, UNLESS the span starts with a space, in which case we accept that one. 
*/ static span_t * find_previous_non_space_char_ish(content_root_t *content, int *char_num, int *intervening_space) { content_t *s; int i; *intervening_space = 0; for (s = content->base.prev; s != &content->base; s = s->prev) { span_t *span = (span_t *)s; if (s->type != content_span) continue; for (i = span->chars_num-1; i >= 0; i--) { if (span->chars[i].ucs != 32 || i == 0) { *char_num = i; return span; } *intervening_space = 1; } } return NULL; } point_t extract_predicted_end_of_char(char_t *char_, const span_t *span) { double adv = char_->adv; point_t dir = { adv * (1 - span->flags.wmode), adv * span->flags.wmode }; dir = extract_matrix4_transform_point(span->ctm, dir); dir.x += char_->x; dir.y += char_->y; return dir; } point_t extract_end_of_span(const span_t *span) { assert(span && span->chars_num > 0); return extract_predicted_end_of_char(&span->chars[span->chars_num-1], span); } int extract_add_char( extract_t *extract, double x, double y, unsigned int ucs, double adv, double x0, double y0, double x1, double y1) { int e = -1; char_t *char_; extract_page_t *page = extract->document.pages[extract->document.pages_num-1]; subpage_t *subpage = page->subpages[page->subpages_num-1]; span_t *span = content_last_span(&subpage->content); span_t *span0; int char_num0; double dist, perp, scale_squared; point_t dir; int intervening_space; if (span->flags.wmode) { dir.x = 0; dir.y = 1; scale_squared = span->ctm.c * span->ctm.c + span->ctm.d * span->ctm.d; } else { dir.x = 1; dir.y = 0; scale_squared = span->ctm.a * span->ctm.a + span->ctm.b * span->ctm.b; } dir = extract_matrix4_transform_point(span->ctm, dir); outf("(%f %f) ucs=% 5i=%c adv=%f", x, y, ucs, (ucs >=32 && ucs< 127) ? ucs : ' ', adv); /* Is there a previous span to which we should consider attaching this char. */ span0 = find_previous_non_space_char_ish(&subpage->content, &char_num0, &intervening_space); /* Spans can't continue over different structure elements. 
*/ if (span0 && span0->structure != extract->document.current) span0 = NULL; if (span0 == NULL) { /* No previous continuable span. */ outf("%c x=%g y=%g adv=%g\n", ucs, x, y, adv); } else { /* We have a span. Check whether we need to break to a new line, or add (or subtract) a space. */ char_t *char_prev = &span0->chars[char_num0]; double adv0 = char_prev->adv; point_t predicted_end_of_char0 = extract_predicted_end_of_char(char_prev, span0); /* We don't currently have access to the size of the advance for a space. * Typically it's around 1 to 1/2 that of a real char. So guess at that * using the 2 advances we have available to us. */ double space_guess = (adv0 + adv)/2 * extract->master_space_guess; /* Use dot product to calculate the distance that we have moved along the direction vector. */ dist = (x - predicted_end_of_char0.x) * dir.x + (y - predicted_end_of_char0.y) * dir.y; /* Use dot product to calculate the distance that we have moved perpendicular to the direction vector. */ perp = (x - predicted_end_of_char0.x) * dir.y - (y - predicted_end_of_char0.y) * dir.x; /* Both dist and perp are multiplied by scale_squared. */ dist /= scale_squared; perp /= scale_squared; /* So now, dist, perp, adv, adv0 and space_guess are all in pre-transform space. */ /* So fabs(dist) is expected to be 0, and perp is expected to be 0 for characters * "naturally placed" on a line. */ outf("%c x=%g y=%g adv=%g dist=%g perp=%g\n", ucs, x, y, adv, dist, perp); /* Arbitrary fractions here; ideally we should consult the font bbox, but we don't currently * have that. */ if (fabs(perp) > 3*space_guess/2 || fabs(dist) > space_guess * 8) { /* Create new span. */ if (span->chars_num > 0) { extract->num_spans_autosplit += 1; span = split_to_new_span(extract->alloc, &subpage->content, span); if (span == NULL) goto end; } } else if (intervening_space) { /* Some files, notably zlib.3.pdf appear to contain stray extra spaces within the PDF * content themselves. e.g. "suppor ts". 
We therefore spot when the * space allocated for a space isn't used, and remove the space. */ /* MAGIC NUMBER WARNING. zlib.pdf says that /4 is not sensitive enough. /3 is OK. */ if (dist < space_guess/3) { if (span->chars_num > 0) { span->chars_num--; /* Don't need to worry about it being empty, as we're about to add another char! */ } else { span_t *space_span = content_prev_span(&span->base); assert(space_span->chars_num > 0); space_span->chars_num--; if (space_span->chars_num == 0) extract_span_free(extract->alloc, &space_span); } } } /* MAGIC NUMBER WARNING: We expect the space char to be about 1/2 as wide of a standard char. * zlib3.pdf shows that sometimes we need to insert a space when it's *just* smaller than * this. (e.g. 'eveninthe'). */ else if (!intervening_space && dist > 2*space_guess/3) { /* Larger gap than expected. Add an extra space. */ /* Where should the space go? At the predicted position where the previous char * ended. */ char_ = extract_span_append_c(extract->alloc, span, ' '); if (char_ == NULL) goto end; char_->x = predicted_end_of_char0.x; char_->y = predicted_end_of_char0.y; } } char_ = extract_span_append_c(extract->alloc, span, ucs); if (char_ == NULL) goto end; char_->x = x; char_->y = y; char_->adv = adv; char_->bbox.min.x = x0; char_->bbox.min.y = y0; char_->bbox.max.x = x1; char_->bbox.max.y = y1; e = 0; end: if (span && span->chars_num == 0) { extract_span_free(extract->alloc, &span); } return e; } int extract_span_end(extract_t *extract) { extract_page_t *page = extract->document.pages[extract->document.pages_num-1]; subpage_t *subpage = page->subpages[page->subpages_num-1]; span_t *span = content_last_span(&subpage->content); if (span->chars_num == 0) { /* Calling code called extract_span_begin() then extract_span_end() without any call to extract_add_char(). Our joining code assumes that all spans are non-empty, so we need to delete this span. 
*/ extract_span_free(extract->alloc, &span); } return 0; } int extract_add_image( extract_t *extract, const char *type, double x, double y, double w, double h, void *data, size_t data_size, extract_image_data_free data_free, void *data_free_handle) { int e = -1; extract_page_t *page = extract->document.pages[extract->document.pages_num-1]; subpage_t *subpage = page->subpages[page->subpages_num-1]; image_t *image; extract->image_n += 1; if (content_append_new_image(extract->alloc, &subpage->content, &image)) goto end; image->x = x; image->y = y; image->w = w; image->h = h; image->data = data; image->data_size = data_size; image->data_free = data_free; image->data_free_handle = data_free_handle; if (extract_strdup(extract->alloc, type, &image->type)) goto end; if (extract_asprintf(extract->alloc, &image->id, "rId%i", extract->image_n) < 0) goto end; if (extract_asprintf(extract->alloc, &image->name, "image%i.%s", extract->image_n, image->type) < 0) goto end; subpage->images_num += 1; outf("subpage->images_num=%i", subpage->images_num); e = 0; end: if (e) { extract_image_free(extract->alloc, &image); } return e; } static int tablelines_append(extract_alloc_t *alloc, tablelines_t *tablelines, rect_t *rect, double color) { if (extract_realloc( alloc, &tablelines->tablelines, sizeof(*tablelines->tablelines) * (tablelines->tablelines_num + 1) )) return -1; tablelines->tablelines[ tablelines->tablelines_num].rect = *rect; tablelines->tablelines[ tablelines->tablelines_num].color = (float) color; tablelines->tablelines_num += 1; return 0; } static point_t transform( double x, double y, double ctm_a, double ctm_b, double ctm_c, double ctm_d, double ctm_e, double ctm_f) { point_t ret; ret.x = ctm_a * x + ctm_b * y + ctm_e; ret.y = ctm_c * x + ctm_d * y + ctm_f; return ret; } static double s_min(double a, double b) { return (a < b) ? a : b; } static double s_max(double a, double b) { return (a > b) ? 
a : b; } int extract_add_path4( extract_t *extract, double ctm_a, double ctm_b, double ctm_c, double ctm_d, double ctm_e, double ctm_f, double x0, double y0, double x1, double y1, double x2, double y2, double x3, double y3, double color) { extract_page_t *page = extract->document.pages[extract->document.pages_num-1]; subpage_t *subpage = page->subpages[page->subpages_num-1]; point_t points[4] = { transform(x0, y0, ctm_a, ctm_b, ctm_c, ctm_d, ctm_e, ctm_f), transform(x1, y1, ctm_a, ctm_b, ctm_c, ctm_d, ctm_e, ctm_f), transform(x2, y2, ctm_a, ctm_b, ctm_c, ctm_d, ctm_e, ctm_f), transform(x3, y3, ctm_a, ctm_b, ctm_c, ctm_d, ctm_e, ctm_f) }; rect_t rect; int i; double dx, dy; outf("cmt=(%f %f %f %f %f %f) points=[(%f %f) (%f %f) (%f %f) (%f %f)]", ctm_a, ctm_b, ctm_c, ctm_d, ctm_e, ctm_f, x0, y0, x1, y1, x2, y2, x3, y3 ); outf("extract_add_path4(): [(%f %f) (%f %f) (%f %f) (%f %f)]", x0, y0, x1, y1, x2, y2, x3, y3); /* Find first step with dx > 0. */ for (i=0; i<4; ++i) { if (points[(i+1) % 4].x > points[(i+0) % 4].x) break; } outf("i=%i", i); if (i == 4) return 0; rect.min.x = points[(i+0) % 4].x; rect.max.x = points[(i+1) % 4].x; if (points[(i+2) % 4].x != rect.max.x) return 0; if (points[(i+3) % 4].x != rect.min.x) return 0; y0 = points[(i+1) % 4].y; y1 = points[(i+2) % 4].y; if (y0 == y1) return 0; if (points[(i+3) % 4].y != y1) return 0; if (points[(i+4) % 4].y != y0) return 0; rect.min.y = (y1 > y0) ? y0 : y1; rect.max.y = (y1 > y0) ? y1 : y0; dx = rect.max.x - rect.min.x; dy = rect.max.y - rect.min.y; if (dx / dy > 5) { /* Horizontal line. */ outf("have found horizontal line: %s", extract_rect_string(&rect)); if (tablelines_append(extract->alloc, &subpage->tablelines_horizontal, &rect, color)) return -1; } else if (dy / dx > 5) { /* Vertical line. 
*/ outf("have found vertical line: %s", extract_rect_string(&rect)); if (tablelines_append(extract->alloc, &subpage->tablelines_vertical, &rect, color)) return -1; } return 0; } int extract_add_line( extract_t *extract, double ctm_a, double ctm_b, double ctm_c, double ctm_d, double ctm_e, double ctm_f, double width, double x0, double y0, double x1, double y1, double color) { extract_page_t *page = extract->document.pages[extract->document.pages_num-1]; subpage_t *subpage = page->subpages[page->subpages_num-1]; point_t p0 = transform(x0, y0, ctm_a, ctm_b, ctm_c, ctm_d, ctm_e, ctm_f); point_t p1 = transform(x1, y1, ctm_a, ctm_b, ctm_c, ctm_d, ctm_e, ctm_f); double width2 = width * sqrt( fabs( ctm_a * ctm_d - ctm_b * ctm_c)); rect_t rect; (void)color; rect.min.x = s_min(p0.x, p1.x); rect.min.y = s_min(p0.y, p1.y); rect.max.x = s_max(p0.x, p1.x); rect.max.y = s_max(p0.y, p1.y); outf("%s: width=%f ((%f %f)(%f %f)) rect=%s", extract_FUNCTION, width, x0, y0, x1, y1, extract_rect_string(&rect) ); if (rect.min.x == rect.max.x && rect.min.y == rect.max.y) { } else if (rect.min.x == rect.max.x) { rect.min.x -= width2 / 2; rect.max.x += width2 / 2; return tablelines_append(extract->alloc, &subpage->tablelines_vertical, &rect, color); } else if (rect.min.y == rect.max.y) { rect.min.y -= width2 / 2; rect.max.y += width2 / 2; return tablelines_append(extract->alloc, &subpage->tablelines_horizontal, &rect, color); } return 0; } int extract_subpage_alloc(extract_alloc_t *alloc, rect_t mediabox, extract_page_t *page, subpage_t **psubpage) { subpage_t *subpage; if (extract_malloc(alloc, psubpage, sizeof(subpage_t))) { return -1; } subpage = *psubpage; subpage->mediabox = mediabox; content_init_root(&subpage->content, NULL); subpage->images_num = 0; subpage->tablelines_horizontal.tablelines = NULL; subpage->tablelines_horizontal.tablelines_num = 0; subpage->tablelines_vertical.tablelines = NULL; subpage->tablelines_vertical.tablelines_num = 0; content_init_root(&subpage->tables, 
NULL); if (extract_realloc2(alloc, &page->subpages, sizeof(subpage_t*) * page->subpages_num, sizeof(subpage_t*) * (page->subpages_num + 1))) { extract_free(alloc, psubpage); return -1; } page->subpages[page->subpages_num] = subpage; page->subpages_num += 1; return 0; } /* Appends new empty subpage_t to the last page of an extract->document. */ static int extract_subpage_begin(extract_t *extract, double x0, double y0, double x1, double y1) { extract_page_t *page = extract->document.pages[extract->document.pages_num - 1]; subpage_t *subpage; rect_t mediabox = { { x0, y0 }, { x1, y1 } }; int e; e = extract_subpage_alloc(extract->alloc, mediabox, page, &subpage); if (e == 0) { } return e; } /* Appends new empty page_t to an extract->document. */ int extract_page_begin(extract_t *extract, double x0, double y0, double x1, double y1) { extract_page_t *page; if (extract_malloc(extract->alloc, &page, sizeof(*page))) return -1; page->mediabox.min.x = x0; page->mediabox.min.y = y0; page->mediabox.max.x = x1; page->mediabox.max.y = y1; page->subpages = NULL; page->subpages_num = 0; page->split = NULL; if (extract_realloc2( extract->alloc, &extract->document.pages, sizeof(subpage_t*) * extract->document.pages_num, sizeof(subpage_t*) * (extract->document.pages_num + 1) )) { extract_free(extract->alloc, &page); return -1; } extract->document.pages[extract->document.pages_num] = page; extract->document.pages_num += 1; if (extract_subpage_begin(extract, x0, y0, x1, y1)) { extract->document.pages_num--; page_free(extract->alloc, &extract->document.pages[extract->document.pages_num]); return -1; } return 0; } int extract_fill_begin( extract_t *extract, double ctm_a, double ctm_b, double ctm_c, double ctm_d, double ctm_e, double ctm_f, double color) { assert(extract->path_type == path_type_NONE); extract->path_type = path_type_FILL; extract->path.fill.color = color; extract->path.fill.n = 0; extract->path.fill.ctm.a = ctm_a; extract->path.fill.ctm.b = ctm_b; extract->path.fill.ctm.c = 
ctm_c; extract->path.fill.ctm.d = ctm_d; extract->path.fill.ctm.e = ctm_e; extract->path.fill.ctm.f = ctm_f; return 0; } int extract_stroke_begin( extract_t *extract, double ctm_a, double ctm_b, double ctm_c, double ctm_d, double ctm_e, double ctm_f, double line_width, double color) { assert(extract->path_type == path_type_NONE); extract->path_type = path_type_STROKE; extract->path.stroke.ctm.a = ctm_a; extract->path.stroke.ctm.b = ctm_b; extract->path.stroke.ctm.c = ctm_c; extract->path.stroke.ctm.d = ctm_d; extract->path.stroke.ctm.e = ctm_e; extract->path.stroke.ctm.f = ctm_f; extract->path.stroke.width = line_width; extract->path.stroke.color = color; extract->path.stroke.point0_set = 0; extract->path.stroke.point_set = 0; return 0; } int extract_moveto(extract_t *extract, double x, double y) { if (extract->path_type == path_type_FILL) { if (extract->path.fill.n == -1) return 0; if (extract->path.fill.n != 0) { outf0("returning error. extract->path.fill.n=%i", extract->path.fill.n); extract->path.fill.n = -1; return 0; } extract->path.fill.points[extract->path.fill.n].x = x; extract->path.fill.points[extract->path.fill.n].y = y; extract->path.fill.n += 1; return 0; } else if (extract->path_type == path_type_STROKE) { extract->path.stroke.point.x = x; extract->path.stroke.point.y = y; extract->path.stroke.point_set = 1; if (!extract->path.stroke.point0_set) { extract->path.stroke.point0 = extract->path.stroke.point; extract->path.stroke.point0_set = 1; } return 0; } else { assert(0); return -1; } } int extract_lineto(extract_t *extract, double x, double y) { if (extract->path_type == path_type_FILL) { if (extract->path.fill.n == -1) return 0; if (extract->path.fill.n == 0 || extract->path.fill.n >= 4) { outf0("returning error. 
extract->path.fill.n=%i", extract->path.fill.n); extract->path.fill.n = -1; return 0; } extract->path.fill.points[extract->path.fill.n].x = x; extract->path.fill.points[extract->path.fill.n].y = y; extract->path.fill.n += 1; return 0; } else if (extract->path_type == path_type_STROKE) { if (extract->path.stroke.point_set) { if (extract_add_line( extract, extract->path.stroke.ctm.a, extract->path.stroke.ctm.b, extract->path.stroke.ctm.c, extract->path.stroke.ctm.d, extract->path.stroke.ctm.e, extract->path.stroke.ctm.f, extract->path.stroke.width, extract->path.stroke.point.x, extract->path.stroke.point.y, x, y, extract->path.stroke.color)) { return -1; } } extract->path.stroke.point.x = x; extract->path.stroke.point.y = y; extract->path.stroke.point_set = 1; if (!extract->path.stroke.point0_set) { extract->path.stroke.point0 = extract->path.stroke.point; extract->path.stroke.point0_set = 1; } return 0; } else { assert(0); return -1; } } int extract_closepath(extract_t *extract) { if (extract->path_type == path_type_FILL) { if (extract->path.fill.n == 4) { /* We are closing a four-element path, so this could be a thin rectangle that defines a line in a table. 
*/ int e; e = extract_add_path4( extract, extract->path.fill.ctm.a, extract->path.fill.ctm.b, extract->path.fill.ctm.c, extract->path.fill.ctm.d, extract->path.fill.ctm.e, extract->path.fill.ctm.f, extract->path.fill.points[0].x, extract->path.fill.points[0].y, extract->path.fill.points[1].x, extract->path.fill.points[1].y, extract->path.fill.points[2].x, extract->path.fill.points[2].y, extract->path.fill.points[3].x, extract->path.fill.points[3].y, extract->path.fill.color); if (e) return e; } extract->path.fill.n = 0; return 0; } else if (extract->path_type == path_type_STROKE) { if (extract->path.stroke.point0_set && extract->path.stroke.point_set) { if (extract_add_line( extract, extract->path.stroke.ctm.a, extract->path.stroke.ctm.b, extract->path.stroke.ctm.c, extract->path.stroke.ctm.d, extract->path.stroke.ctm.e, extract->path.stroke.ctm.f, extract->path.stroke.width, extract->path.stroke.point.x, extract->path.stroke.point.y, extract->path.stroke.point0.x, extract->path.stroke.point0.y, extract->path.stroke.color)) { return -1; } return 0; } extract->path.stroke.point = extract->path.stroke.point0; return 0; } else { assert(0); return -1; } } int extract_fill_end(extract_t *extract) { assert(extract->path_type == path_type_FILL); extract->path_type = path_type_NONE; return 0; } int extract_stroke_end(extract_t *extract) { assert(extract->path_type == path_type_STROKE); extract->path_type = path_type_NONE; return 0; } static int extract_subpage_end(extract_t *extract) { (void) extract; return 0; } int extract_page_end(extract_t *extract) { if (extract_subpage_end(extract)) return -1; return 0; } int extract_begin_struct(extract_t *extract, extract_struct_t type, int uid, int score) { document_t *document = &extract->document; structure_t *structure; if (extract_malloc(extract->alloc, &structure, sizeof(*structure))) return -1; structure->parent = document->current; structure->sibling_next = NULL; structure->sibling_prev = NULL; structure->kids_first = NULL; 
structure->kids_tail = &structure->kids_first; structure->type = type; structure->score = score; structure->uid = uid; if (document->current == NULL) { /* New topmost entry. */ document->current = structure; document->structure = structure; } else { /* Add a child */ *document->current->kids_tail = structure; document->current->kids_tail = &structure->sibling_next; document->current = structure; } return 0; } int extract_end_struct(extract_t *extract) { document_t *document = &extract->document; assert(document->current != NULL); document->current = document->current->parent; return 0; } const char *extract_struct_string(extract_struct_t type) { switch (type) { default: return "UNKNOWN"; case extract_struct_INVALID: return "INVALID"; case extract_struct_UNDEFINED: return "UNDEFINED"; case extract_struct_DOCUMENT: return "DOCUMENT"; case extract_struct_PART: return "PART"; case extract_struct_ART: return "ART"; case extract_struct_SECT: return "SECT"; case extract_struct_DIV: return "DIV"; case extract_struct_BLOCKQUOTE: return "BLOCKQUOTE"; case extract_struct_CAPTION: return "CAPTION"; case extract_struct_TOC: return "TOC"; case extract_struct_TOCI: return "TOCI"; case extract_struct_INDEX: return "INDEX"; case extract_struct_NONSTRUCT: return "NONSTRUCT"; case extract_struct_PRIVATE: return "PRIVATE"; case extract_struct_DOCUMENTFRAGMENT: return "DOCUMENTFRAGMENT"; case extract_struct_ASIDE: return "ASIDE"; case extract_struct_TITLE: return "TITLE"; case extract_struct_FENOTE: return "FENOTE"; case extract_struct_SUB: return "SUB"; case extract_struct_P: return "P"; case extract_struct_H: return "H"; case extract_struct_H1: return "H1"; case extract_struct_H2: return "H2"; case extract_struct_H3: return "H3"; case extract_struct_H4: return "H4"; case extract_struct_H5: return "H5"; case extract_struct_H6: return "H6"; case extract_struct_LIST: return "LIST"; case extract_struct_LISTITEM: return "LISTITEM"; case extract_struct_LABEL: return "LABEL"; case 
extract_struct_LISTBODY: return "LISTBODY"; case extract_struct_TABLE: return "TABLE"; case extract_struct_TR: return "TR"; case extract_struct_TH: return "TH"; case extract_struct_TD: return "TD"; case extract_struct_THEAD: return "THEAD"; case extract_struct_TBODY: return "TBODY"; case extract_struct_TFOOT: return "TFOOT"; case extract_struct_SPAN: return "SPAN"; case extract_struct_QUOTE: return "QUOTE"; case extract_struct_NOTE: return "NOTE"; case extract_struct_REFERENCE: return "REFERENCE"; case extract_struct_BIBENTRY: return "BIBENTRY"; case extract_struct_CODE: return "CODE"; case extract_struct_LINK: return "LINK"; case extract_struct_ANNOT: return "ANNOT"; case extract_struct_EM: return "EM"; case extract_struct_STRONG: return "STRONG"; case extract_struct_RUBY: return "RUBY"; case extract_struct_RB: return "RB"; case extract_struct_RT: return "RT"; case extract_struct_RP: return "RP"; case extract_struct_WARICHU: return "WARICHU"; case extract_struct_WT: return "WT"; case extract_struct_WP: return "WP"; case extract_struct_FIGURE: return "FIGURE"; case extract_struct_FORMULA: return "FORMULA"; case extract_struct_FORM: return "FORM"; case extract_struct_ARTIFACT: return "ARTIFACT"; } } static int paragraph_to_text( extract_alloc_t *alloc, paragraph_t *paragraph, extract_astring_t *text) { content_line_iterator lit; line_t *line; for (line = content_line_iterator_init(&lit, ¶graph->content); line != NULL; line = content_line_iterator_next(&lit)) { content_span_iterator sit; span_t *span; for (span = content_span_iterator_init(&sit, &line->content); span != NULL; span = content_span_iterator_next(&sit)) { int c; for (c=0; c<span->chars_num; ++c) { /* We encode each character as utf8. 
*/ char_t* char_ = &span->chars[c]; unsigned cc = char_->ucs; if (extract_astring_catc_unicode( alloc, text, cc, 0 /*xml*/, 1 /*ascii_ligatures*/, 1 /*ascii_dash*/, 1 /*ascii_apostrophe*/ )) return -1; } } } if (extract_astring_catc(alloc, text, '\n')) return -1; return 0; } static int paragraphs_to_text_content( extract_alloc_t *alloc, content_root_t *paragraphs, extract_astring_t *text) { content_iterator cit; content_t *content; for (content = content_iterator_init(&cit, paragraphs); content != NULL; content = content_iterator_next(&cit)) { if (content->type == content_paragraph) { if (paragraph_to_text(alloc, (paragraph_t *)content, text)) return -1; } else if (content->type == content_block) { block_t *block = (block_t *)content; content_paragraph_iterator pit; paragraph_t *paragraph; for (paragraph = content_paragraph_iterator_init(&pit, &block->content); paragraph != NULL; paragraph = content_paragraph_iterator_next(&pit)) { if (paragraph_to_text(alloc, paragraph, text)) return -1; } } } return 0; } static int extract_write_tables_csv(extract_t *extract) { int ret = -1; int p; char *path = NULL; FILE *f = NULL; extract_astring_t text = {NULL, 0}; if (!extract->tables_csv_format) return 0; outf("extract_write_tables_csv(): path_format=%s", extract->tables_csv_format); outf("extract->document.pages_num=%i", extract->document.pages_num); for (p=0; p<extract->document.pages_num; ++p) { int c; extract_page_t *page = extract->document.pages[p]; for (c=0; c<page->subpages_num; ++c) { content_table_iterator tit; table_t *table; subpage_t *subpage = page->subpages[c]; outf("p=%i subpage->tables_num=%i", p, content_count_tables(&subpage->tables)); for (table = content_table_iterator_init(&tit, &subpage->tables); table != NULL; table = content_table_iterator_next(&tit)) { int y; extract_free(extract->alloc, &path); if (extract_asprintf(extract->alloc, &path, extract->tables_csv_format, extract->tables_csv_i) < 0) goto end; extract->tables_csv_i += 1; outf("Writing 
table to: %s", path); outf("table->cells_num_x=%i", table->cells_num_x); outf("table->cells_num_y=%i", table->cells_num_y); f = fopen(path, "w"); if (!f) goto end; for (y=0; y<table->cells_num_y; ++y) { int x; int have_output = 0; for (x=0; x<table->cells_num_x; ++x) { cell_t* cell = table->cells[table->cells_num_x * y + x]; extract_astring_free(extract->alloc, &text); if (y==0) { outf("y=0 x=%i cell->rect=%s", x, extract_rect_string(&cell->rect)); } if (have_output) fprintf(f, ","); have_output = 1; if (paragraphs_to_text_content( extract->alloc, &cell->content, &text )) goto end; /* Reference cvs output trims trailing spaces. */ extract_astring_char_truncate_if(&text, ' '); fprintf(f, "\"%s\"", text.chars ? text.chars : ""); } fprintf(f, "\n"); } fclose(f); f = NULL; } } } ret = 0; end: if (f) fclose(f); extract_free(extract->alloc, &path); extract_astring_free(extract->alloc, &text); return ret; } int extract_process( extract_t *extract, int spacing, int rotation, int images) { int e = -1; if (extract_realloc2( extract->alloc, &extract->contentss, sizeof(*extract->contentss) * extract->contentss_num, sizeof(*extract->contentss) * (extract->contentss_num + 1) )) goto end; extract_astring_init(&extract->contentss[extract->contentss_num]); extract->contentss_num += 1; if (extract_document_join(extract->alloc, &extract->document, extract->layout_analysis, extract->master_space_guess)) goto end; switch (extract->format) { case extract_format_ODT: if (extract_document_to_odt_content( extract->alloc, &extract->document, spacing, rotation, images, &extract->contentss[extract->contentss_num - 1], &extract->odt_styles )) goto end; break; case extract_format_DOCX: if (extract_document_to_docx_content( extract->alloc, &extract->document, spacing, rotation, images, &extract->contentss[extract->contentss_num - 1] )) goto end; break; case extract_format_HTML: if (extract_document_to_html_content( extract->alloc, &extract->document, rotation, images, 
&extract->contentss[extract->contentss_num - 1] )) goto end; break; case extract_format_JSON: if (extract_document_to_json_content( extract->alloc, &extract->document, rotation, images, &extract->contentss[extract->contentss_num - 1] )) goto end; break; case extract_format_TEXT: { int p; for (p=0; p<extract->document.pages_num; ++p) { extract_page_t* page = extract->document.pages[p]; int c; for (c=0; c<page->subpages_num; ++c) { subpage_t* subpage = page->subpages[c]; if (paragraphs_to_text_content( extract->alloc, &subpage->content, &extract->contentss[extract->contentss_num - 1] )) goto end; } } break; } default: outf0("Invalid format=%i", extract->format); assert(0); errno = EINVAL; return 1; } if (extract_document_images(extract->alloc, &extract->document, &extract->images)) goto end; if (extract->tables_csv_format) { extract_write_tables_csv(extract); } { int p; for (p=0; p<extract->document.pages_num; ++p) { page_free(extract->alloc, &extract->document.pages[p]); } extract_free(extract->alloc, &extract->document.pages); extract->document.pages_num = 0; } e = 0; end: return e; } int extract_write(extract_t *extract, extract_buffer_t *buffer) { int e = -1; extract_zip_t *zip = NULL; char *text2 = NULL; int i; switch (extract->format) { case extract_format_ODT: { if (extract_zip_open(buffer, &zip)) goto end; for (i=0; i<odt_template_items_num; ++i) { const odt_template_item_t* item = &odt_template_items[i]; extract_free(extract->alloc, &text2); outf("i=%i item->name=%s", i, item->name); if (extract_odt_content_item( extract->alloc, extract->contentss, extract->contentss_num, &extract->odt_styles, &extract->images, item->name, item->text, &text2 )) { goto end; } { const char* text3 = (text2) ? 
text2 : item->text; if (extract_zip_write_file(zip, text3, strlen(text3), item->name)) goto end; } } outf0("extract->images.images_num=%i", extract->images.images_num); for (i=0; i<extract->images.images_num; ++i) { image_t* image = extract->images.images[i]; extract_free(extract->alloc, &text2); if (extract_asprintf(extract->alloc, &text2, "Pictures/%s", image->name) < 0) goto end; if (extract_zip_write_file(zip, image->data, image->data_size, text2)) goto end; } if (extract_zip_close(&zip)) goto end; break; } case extract_format_DOCX: { if (extract_zip_open(buffer, &zip)) goto end; for (i=0; i<docx_template_items_num; ++i) { const docx_template_item_t* item = &docx_template_items[i]; extract_free(extract->alloc, &text2); outf("i=%i item->name=%s", i, item->name); if (extract_docx_content_item( extract->alloc, extract->contentss, extract->contentss_num, &extract->images, item->name, item->text, &text2 )) { goto end; } { const char* text3 = (text2) ? text2 : item->text; if (extract_zip_write_file(zip, text3, strlen(text3), item->name)) goto end; } } for (i=0; i<extract->images.images_num; ++i) { image_t* image = extract->images.images[i]; extract_free(extract->alloc, &text2); if (extract_asprintf(extract->alloc, &text2, "word/media/%s", image->name) < 0) goto end; if (extract_zip_write_file(zip, image->data, image->data_size, text2)) goto end; } if (extract_zip_close(&zip)) goto end; break; } case extract_format_HTML: case extract_format_TEXT: for (i=0; i<extract->contentss_num; ++i) { if (extract_buffer_write(buffer, extract->contentss[i].chars, extract->contentss[i].chars_num, NULL)) goto end; } break; case extract_format_JSON: { int first = 1; if (extract_buffer_cat(buffer, "{\n\"elements\" : ")) goto end; for (i=0; i<extract->contentss_num; ++i) { if (!first && extract_buffer_cat(buffer, ",\n")) goto end; if (extract->contentss[i].chars_num > 0) first = 0; if (extract_buffer_write(buffer, extract->contentss[i].chars, extract->contentss[i].chars_num, NULL)) goto 
end; } if (extract_buffer_cat(buffer, "\n}\n")) goto end; break; } default: outf0("Invalid format=%i", extract->format); assert(0); errno = EINVAL; return 1; } e = 0; end: if (e) { outf("failed: %s", strerror(errno)); extract_zip_close(&zip); } extract_free(extract->alloc, &text2); return e; } int extract_write_content(extract_t *extract, extract_buffer_t *buffer) { int i; for (i=0; i<extract->contentss_num; ++i) { if (extract_buffer_write( buffer, extract->contentss[i].chars, extract->contentss[i].chars_num, NULL /*o_actual*/ )) return -1; } return 0; } static int string_ends_with(const char *string, const char *end) { size_t string_len = strlen(string); size_t end_len = strlen(end); if (end_len > string_len) return 0; return memcmp(string + string_len - end_len, end, end_len) == 0; } int extract_write_template( extract_t *extract, const char *path_template, const char *path_out, int preserve_dir) { if (string_ends_with(path_out, ".odt")) { return extract_odt_write_template( extract->alloc, extract->contentss, extract->contentss_num, &extract->odt_styles, &extract->images, path_template, path_out, preserve_dir); } else { return extract_docx_write_template( extract->alloc, extract->contentss, extract->contentss_num, &extract->images, path_template, path_out, preserve_dir); } } void extract_end(extract_t **pextract) { int i; extract_t *extract = *pextract; if (!extract) return; extract_document_free(extract->alloc, &extract->document); for (i=0; i<extract->contentss_num; ++i) { extract_astring_free(extract->alloc, &extract->contentss[i]); } extract_free(extract->alloc, &extract->contentss); extract_images_free(extract->alloc, &extract->images); extract_odt_styles_free(extract->alloc, &extract->odt_styles); extract_free(extract->alloc, pextract); } void extract_internal_end(void) { extract_span_string(NULL, NULL); } void extract_exp_min(extract_t *extract, size_t size) { extract_alloc_exp_min(extract->alloc, size); } double extract_font_size(matrix4_t *ctm) { double 
font_size = extract_matrix_expansion(*ctm); /* Round font_size to nearest 0.01. */ font_size = (double) (int) (font_size * 100.0f + 0.5f) / 100.0f; return font_size; } rect_t extract_block_pre_rotation_bounds(block_t *block, double angle) { content_paragraph_iterator pit; paragraph_t *paragraph; rect_t pre_box = extract_rect_empty; matrix4_t unrotate, rotate; point_t centre, trans_centre; /* Construct a matrix to undo the rotation that we are about to put into * the file. i.e. get us a matrix that maps us from where the chars are * positioned back to the pre-rotated position. These pre-rotated positions * can then be used to calculate the origin/extent of the area that we * need to put into the file. */ /* The well know rotation matrixes: * * CW: [ cos(theta) sin(theta) ] CCW: [ cos(theta) -sin(theta) ] * [ -sin(theta) cos(theta) ] [ sin(theta) cos(theta) ] */ /* Word gives us an angle to rotate by clockwise. So the inverse is the * CCW matrix: */ unrotate.a = cos(angle); unrotate.b = -sin(angle); unrotate.c = -unrotate.b; unrotate.d = unrotate.a; /* And the forward rotation is the CW matrix: */ rotate.a = unrotate.a; /* cos(theta) = cos(-theta) */ rotate.b = -unrotate.b; /* sin(theta) = -sin(-theta) */ rotate.c = -rotate.b; rotate.d = rotate.a; /* So ctm.unrotate.rotate = ctm, by construction. ctm.unrotate should * (in the common cases where the ctm is just a scale + rotation) map * all our character locations back to a rectangular region. We now * calculate that region as pre_box. 
*/ for (paragraph = content_paragraph_iterator_init(&pit, &block->content); paragraph != NULL; paragraph = content_paragraph_iterator_next(&pit)) { content_line_iterator lit; line_t *line; for (line = content_line_iterator_init(&lit, ¶graph->content); line != NULL; line = content_line_iterator_next(&lit)) { span_t *span0 = content_first_span(&line->content); span_t *span1 = content_last_span(&line->content); point_t start = { span0->chars[0].x, span0->chars[0].y}; point_t end = extract_end_of_span(span1); double hoff = span0->font_bbox.max.y - (span0->font_bbox.min.y < 0 ? span0->font_bbox.min.y : 0); outf("%f %f -> %f %f\n", start.x, start.y, end.x, end.y); start = extract_matrix4_transform_point(unrotate, start); end = extract_matrix4_transform_point(unrotate, end); outf(" ---------> %f %f -> %f %f\n", start.x, start.y, end.x, end.y); /* Allow for the height of the span here. */ hoff *= sqrt(span0->ctm.c * span0->ctm.c + span0->ctm.d * span0->ctm.d); if (start.y < end.y) start.y -= hoff; else end.y -= hoff; pre_box = extract_rect_union_point(pre_box, start); pre_box = extract_rect_union_point(pre_box, end); } } /* So pre_box rotated around the origin by angle should give us the region we want. */ /* BUT word etc rotate around the centre of the box. So we need to offset the region to * allow for this. */ /* So word, takes the declared box, and subtracts the centre vector from it. Then it * does the rotation (around the origin - now the centre of the box). Then it adds the * centre vector to it again. So the centre of the box does not change. 
Unfortunately, * we haven't easily got the centre vector of the transformed box to hand, so calculate * it by rerotating the centre vector of the pre_box.*/ centre.x = (pre_box.min.x + pre_box.max.x)/2; centre.y = (pre_box.min.y + pre_box.max.y)/2; trans_centre = extract_matrix4_transform_point(rotate, centre); #if 0 { point_t centre2 = extract_matrix4_transform_point(unrotate, trans_centre); centre2 = centre2; } #endif #if 0 printf("Centre of this paragraph should be %f %f\n", trans_centre.x, trans_centre.y); #endif /* So the centre of our pre_box should be trans_centre not centre. */ centre.x -= trans_centre.x; centre.y -= trans_centre.y; pre_box.min.x -= centre.x; pre_box.min.y -= centre.y; pre_box.max.x -= centre.x; pre_box.max.y -= centre.y; #if 0 /* So, as a sanity check, convert the 4 corners back to a quad. */ { rect_t centred_box = { pre_box.min.x - trans_centre.x, pre_box.min.y - trans_centre.y, pre_box.max.x - trans_centre.x, pre_box.max.y - trans_centre.y }; point_t corner; corner = extract_matrix4_transform_xy(rotate, centred_box.min.x, centred_box.min.y); corner.x += trans_centre.x; corner.y += trans_centre.y; printf("TL: %f %f\n", corner.x, corner.y); corner = extract_matrix4_transform_xy(rotate, centred_box.max.x, centred_box.min.y); corner.x += trans_centre.x; corner.y += trans_centre.y; printf("TR: %f %f\n", corner.x, corner.y); corner = extract_matrix4_transform_xy(rotate, centred_box.max.x, centred_box.max.y); corner.x += trans_centre.x; corner.y += trans_centre.y; printf("BR: %f %f\n", corner.x, corner.y); corner = extract_matrix4_transform_xy(rotate, centred_box.min.x, centred_box.max.y); corner.x += trans_centre.x; corner.y += trans_centre.y; printf("BL: %f %f\n", corner.x, corner.y); } #endif /* And a further adjustment. If we mess up line widths, text can wrap too early, * resulting in content extending too far down the page, and truncating at the * bottom of the text frame. Similarly, line spacing. 
We can't tell word 'make * the box large enough', so we have to add a fudge factor and extend the bottom * of the box ourselves. As long as we aren't filling the background, or drawing * a bounding box, this should be fine. * * Unfortunately, we can't just extend pre_box downwards, because we rotate from * the centre of the box, so we need to adjust for that. */ /* Double the height of the box. */ { /* extra = how much to extend the box downwards. */ double extra = pre_box.max.y - pre_box.min.y; /* So we are offsetting the centre of the box by offset. */ point_t offset = { 0, extra/2 }; point_t toffset; pre_box.max.y += extra; toffset = extract_matrix4_transform_point(rotate, offset); pre_box.min.x += toffset.x - offset.x; pre_box.min.y += toffset.y - offset.y; pre_box.max.x += toffset.x - offset.x; pre_box.max.y += toffset.y - offset.y; } return pre_box; } double extract_baseline_angle(const matrix4_t *ctm) { return atan2(ctm->b, ctm->a); }
