Mercurial > hgrepos > Python2 > PyMuPDF
view mupdf-source/thirdparty/extract/src/html.c @ 32:72c1b70d4f5c
Also apply -Werror=implicit-function-declaration
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Sun, 21 Sep 2025 15:10:12 +0200 |
| parents | b50eed0cc0ef |
| children |
line wrap: on
line source
/* These functions generate html content for a document: paragraphs are
written as <p>...</p> (with <b>/<i> for bold/italic runs), tables as
<table>...</table>, and page splits as nested <div> elements. */

#include "extract/extract.h"

#include "astring.h"
#include "document.h"
#include "html.h"
#include "mem.h"
#include "memento.h"
#include "outf.h"
#include "sys.h"
#include "text.h"
#include "zip.h"

#include <assert.h>
#include <errno.h>
#include <float.h>
#include <math.h>
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <sys/stat.h>


/* Sets *content_state to its initial state: no font name, zero size, bold
and italic off, and no previous ctm. */
static void content_state_init(content_state_t *content_state)
{
    content_state->font.name = NULL;
    content_state->font.size = 0;
    content_state->font.bold = 0;
    content_state->font.italic = 0;
    content_state->ctm_prev = NULL;
}

/* Appends closing </b> and/or </i> to <content> if bold/italic are currently
active in *content_state, and clears the corresponding flags.

Returns 0 on success, -1 if appending to <content> failed. */
static int content_state_reset(extract_alloc_t *alloc, content_state_t *content_state, extract_astring_t *content)
{
    int e = -1;

    if (content_state->font.bold)
    {
        if (extract_astring_cat(alloc, content, "</b>")) goto end;
        content_state->font.bold = 0;
    }
    if (content_state->font.italic)
    {
        if (extract_astring_cat(alloc, content, "</i>")) goto end;
        content_state->font.italic = 0;
    }

    e = 0;
end:

    return e;
}

/* Appends html for a single paragraph to <content>.

The paragraph is wrapped in <p>...</p>. If <single_line> is zero, newlines are
written before <p> and before </p>; otherwise none are. <b>/<i> tags are
opened or closed whenever a span's bold/italic flags differ from
*content_state, which is updated to match. Characters are appended with
extract_astring_catc_unicode_xml().

Returns 0 on success, -1 if appending to <content> failed. */
static int paragraph_to_html_content(
        extract_alloc_t   *alloc,
        content_state_t   *content_state,
        paragraph_t       *paragraph,
        int                single_line,
        extract_astring_t *content)
{
    int e = -1;
    const char *endl = (single_line) ? "" : "\n";
    content_line_iterator lit;
    line_t *line;

    if (extract_astring_catf(alloc, content, "%s%s<p>", endl, endl)) goto end;

    for (line = content_line_iterator_init(&lit, &paragraph->content); line != NULL; line = content_line_iterator_next(&lit))
    {
        content_span_iterator sit;
        span_t *span;

        for (span = content_span_iterator_init(&sit, &line->content); span != NULL; span = content_span_iterator_next(&sit))
        {
            int c;

            content_state->ctm_prev = &span->ctm;
            if (span->flags.font_bold != content_state->font.bold)
            {
                if (extract_astring_cat(alloc, content,
                        span->flags.font_bold ? "<b>" : "</b>"
                        )) goto end;
                content_state->font.bold = span->flags.font_bold;
            }
            if (span->flags.font_italic != content_state->font.italic)
            {
                if ( extract_astring_cat(alloc, content,
                        span->flags.font_italic ? "<i>" : "</i>"
                        )) goto end;
                content_state->font.italic = span->flags.font_italic;
            }

            for (c=0; c<span->chars_num; ++c)
            {
                char_t* char_ = &span->chars[c];
                if (extract_astring_catc_unicode_xml(alloc, content, char_->ucs)) goto end;
            }
        }

        /* Join this line to the following content: drop a trailing '-',
        otherwise make sure the output ends with a space. The
        lit.next->type test presumably detects that this is not the last
        item in the paragraph - confirm against the iterator definition. */
        if (content->chars_num && lit.next->type != content_root)
        {
            if (content->chars[content->chars_num-1] == '-')
                content->chars_num -= 1;
            else if (content->chars[content->chars_num-1] != ' ')
            {
                if (extract_astring_catc(alloc, content, ' ')) goto end;
            }
        }
    }
    if (extract_astring_catf(alloc, content, "%s</p>", endl)) goto end;

    e = 0;

end:
    return e;
}


/* Append html for paragraphs[] to <content>. Updates *state if we change font
etc. Any open <b>/<i> is closed after the last paragraph.

Returns 0 on success, -1 on failure. */
static int paragraphs_to_html_content(
        extract_alloc_t   *alloc,
        content_state_t   *state,
        content_root_t    *paragraphs,
        int                single_line,
        extract_astring_t *content)
{
    content_paragraph_iterator pit;
    paragraph_t *paragraph;
    int e = -1;

    for (paragraph = content_paragraph_iterator_init(&pit, paragraphs); paragraph != NULL; paragraph = content_paragraph_iterator_next(&pit))
    {
        if (paragraph_to_html_content(alloc, state, paragraph, single_line, content)) goto end;
    }

    if (content_state_reset(alloc, state, content)) goto end;
    e = 0;

end:
    return e;
}

/* Appends html for <table> to <content> as a <table> element with one <tr>
per row and one <td> per emitted cell; colspan/rowspan attributes are written
when a cell's extend_right/extend_down is greater than 1.

Returns 0 on success, -1 on failure. */
static int append_table(extract_alloc_t *alloc, content_state_t *state, table_t *table, extract_astring_t *content)
{
    int e = -1;
    int y;

    if (extract_astring_cat(alloc, content, "\n\n<table border=\"1\" style=\"border-collapse:collapse\">\n")) goto end;

    for (y=0; y<table->cells_num_y; ++y)
    {
        int x;
        if (extract_astring_cat(alloc, content, " <tr>\n")) goto end;
        for (x=0; x<table->cells_num_x; ++x)
        {
            cell_t* cell = table->cells[y*table->cells_num_x + x];
            if (!cell->above || !cell->left)
            {
                /* HTML does not require anything for cells that are
                subsumed by other cells that extend horizontally and
                vertically. */
                continue;
            }
            if (extract_astring_cat(alloc, content, " ")) goto end;
            if (extract_astring_cat(alloc, content, "<td")) goto end;

            if (cell->extend_right > 1)
            {
                if (extract_astring_catf(alloc, content, " colspan=\"%i\"", cell->extend_right)) goto end;
            }
            if (cell->extend_down > 1)
            {
                if (extract_astring_catf(alloc, content, " rowspan=\"%i\"", cell->extend_down)) goto end;
            }

            if (extract_astring_cat(alloc, content, ">")) goto end;

            if (paragraphs_to_html_content(alloc, state, &cell->content, 1 /* single_line*/, content)) goto end;
            if (extract_astring_cat(alloc, content, "</td>")) goto end;
            if (extract_astring_cat(alloc, content, "\n")) goto end;

            if (content_state_reset(alloc, state, content)) goto end;
        }
        if (extract_astring_cat(alloc, content, " </tr>\n")) goto end;
    }
    if (extract_astring_cat(alloc, content, "</table>\n\n")) goto end;
    e = 0;

end:
    return e;
}


/* FIXME: Badly named! first_char_of_last_span_of_paragraph!

Returns a pointer to the first char of the last span of the last line of
<paragraph>. */
static char_t *
paragraph_first_char(const paragraph_t *paragraph)
{
    line_t *line = content_last_line(&paragraph->content);
    span_t *span = content_last_span(&line->content);
    return &span->chars[0];
}

/* qsort() comparison for an array of paragraph_t*: orders by increasing y of
the char returned by paragraph_first_char(). */
static int compare_paragraph_y(const void *a, const void *b)
{
    const paragraph_t *const *a_paragraph = a;
    const paragraph_t *const *b_paragraph = b;
    double a_y = paragraph_first_char(*a_paragraph)->y;
    double b_y = paragraph_first_char(*b_paragraph)->y;

    if (a_y > b_y)  return +1;
    if (a_y < b_y)  return -1;

    return 0;
}

/* Appends html for <split> to <output>, consuming subpages from *ppsubpage.

A horizontal split recurses into each child; when it has more than one child
each is wrapped in a <div> inside a flex container <div>, with a percentage
width derived from the child's weight (plain <div> if the weights sum to
zero). A vertical split recurses into each child in turn. Otherwise (including
split == NULL) the next subpage is taken from **ppsubpage, *ppsubpage is
advanced, and the subpage's paragraphs and tables are written.

Returns 0 on success, non-zero on failure. */
static int
split_to_html(extract_alloc_t *alloc, split_t *split, subpage_t ***ppsubpage, extract_astring_t *output)
{
    int p;
    int s;
    subpage_t *subpage;
    int paragraphs_num;
    paragraph_t **paragraphs = NULL;
    content_paragraph_iterator pit;
    paragraph_t *paragraph;
    content_table_iterator tit;
    table_t *table;
    content_state_t state;
    content_state_init(&state);

    if (split == NULL)
    {
        /* fall through to below - SPLIT_NONE */
    }
    else if (split->type == SPLIT_HORIZONTAL)
    {
        int ret = 0;
        double total = 0;
        for (s = 0; s < split->count; s++)
        {
            total += split->split[s]->weight;
        }
        if (split->count > 1)
        {
            if (extract_astring_cat(alloc, output, "<div style=\"display:flex;\">\n")) return -1;
        }
        for (s = 0; s < split->count; s++)
        {
            if (split->count > 1)
            {
                int e;
                if (total == 0)
                {
                    e = extract_astring_catf(alloc, output, "<div>\n");
                }
                else
                {
                    e = extract_astring_catf(alloc, output, "<div style=\"width:%g%%;\">\n", 100.0*split->split[s]->weight/total);
                }
                if (e)
                {
                    ret = -1;
                    break;
                }
            }
            ret = split_to_html(alloc, split->split[s], ppsubpage, output);
            if (ret) break;
            if (split->count > 1)
            {
                if (extract_astring_cat(alloc, output, "</div>\n"))
                {
                    ret = -1;
                    break;
                }
            }
        }
        /* The closing tag is attempted even after an error, as before; a
        failure here is now reported. */
        if (split->count > 1)
        {
            if (extract_astring_cat(alloc, output, "</div>\n")) ret = -1;
        }
        return ret;
    }
    else if (split->type == SPLIT_VERTICAL)
    {
        int ret = 0;
        for (s = 0; s < split->count; s++)
        {
            ret = split_to_html(alloc, split->split[s], ppsubpage, output);
            if (ret) break;
        }
        return ret;
    }

    /* We'll deal with the next subpage entry. Increment the pointer for the
     * next caller. */
    subpage = **ppsubpage;
    *ppsubpage = (*ppsubpage)+1;

    /* Output paragraphs and tables in order of increasing <y> coordinate.

    Unfortunately the paragraph ordering we do in page->paragraphs[] isn't
    quite right and results in bad ordering if ctm/trm matrices are
    inconsistent. So we create our own list of paragraphs sorted strictly
    by y coordinate of the first char of each paragraph. */
    paragraphs_num = content_count_paragraphs(&subpage->content);
    if (extract_malloc(alloc, &paragraphs, sizeof(*paragraphs) * paragraphs_num)) goto end;
    for (p = 0, paragraph = content_paragraph_iterator_init(&pit, &subpage->content); paragraph != NULL; p++, paragraph = content_paragraph_iterator_next(&pit))
        paragraphs[p] = paragraph;

    /* Nothing to sort for 0 or 1 entries; this also avoids handing qsort() a
    NULL array when there are no paragraphs. */
    if (paragraphs_num > 1)
        qsort(paragraphs, paragraphs_num, sizeof(*paragraphs), compare_paragraph_y);

    if (0)
    {
        int i;
        outf0("paragraphs are:");
        for (i=0; i<paragraphs_num; ++i)
        {
            paragraph_t* debug_paragraph = paragraphs[i];
            line_t *line = content_first_line(&debug_paragraph->content);
            span_t *span = content_first_span(&line->content);
            outf0(" p=%i: %s", i, extract_span_string(NULL, span));
        }
    }

    p = 0;
    table = content_table_iterator_init(&tit, &subpage->tables);
    for(;;)
    {
        double y_paragraph;
        double y_table;
        paragraph_t* next_paragraph = (p == paragraphs_num) ? NULL : paragraphs[p];
        if (!next_paragraph && !table) break;
        y_paragraph = (next_paragraph) ? content_first_span(&content_first_line(&next_paragraph->content)->content)->chars[0].y : DBL_MAX;
        y_table = (table) ? table->pos.y : DBL_MAX;
        outf("p=%i y_paragraph=%f", p, y_paragraph);
        outf("y_table=%f", y_table);
        /* If no table remains the paragraph is always emitted, so the loop
        advances even if y_paragraph does not compare less than DBL_MAX
        (e.g. NaN or infinity). */
        if (next_paragraph && (!table || y_paragraph < y_table))
        {
            //extract_astring_catf(alloc, output, "<p>@@@ paragraph %i y=%f @@@)</p>\n", p, y_paragraph);
            if (paragraph_to_html_content(alloc, &state, next_paragraph, 0 /*single_line*/, output)) goto end;
            if (content_state_reset(alloc, &state, output)) goto end;
            p += 1;
        }
        else if (table)
        {
            //extract_astring_catf(alloc, output, "<p>@@@ table %t y=%f @@@)</p>\n", p, y_table);
            if (append_table(alloc, &state, table, output)) goto end;
            table = content_table_iterator_next(&tit);
        }
    }
    extract_free(alloc, &paragraphs);
    return 0;

end:
    extract_free(alloc, &paragraphs);
    return -1;
}

/* Appends a complete html document for <document> to <content>: <html> and
<body> wrappers, with one <div> per page containing the output of
split_to_html() for that page. <rotation> and <images> are unused.

Returns 0 on success, -1 on failure. */
int extract_document_to_html_content(
        extract_alloc_t   *alloc,
        document_t        *document,
        int                rotation,
        int                images,
        extract_astring_t *content)
{
    int ret = -1;
    int n;

    (void) rotation;
    (void) images;

    if (extract_astring_cat(alloc, content, "<html>\n")) goto end;
    if (extract_astring_cat(alloc, content, "<body>\n")) goto end;

    /* Write paragraphs into <content>. */
    for (n=0; n<document->pages_num; ++n)
    {
        extract_page_t *page = document->pages[n];
        subpage_t **psubpage = page->subpages;

        /* Every page gets its own div. */
        if (extract_astring_cat(alloc, content, "<div>\n")) goto end;

        if (split_to_html(alloc, page->split, &psubpage, content)) goto end;

        if (extract_astring_cat(alloc, content, "</div>\n")) goto end;
    }
    if (extract_astring_cat(alloc, content, "</body>\n")) goto end;
    if (extract_astring_cat(alloc, content, "</html>\n")) goto end;
    ret = 0;

end:
    return ret;
}
