Mercurial > hgrepos > Python2 > PyMuPDF
comparison mupdf-source/thirdparty/extract/src/html.c @ 3:2c135c81b16c
MERGE: upstream PyMuPDF 1.26.4 with MuPDF 1.26.7
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Mon, 15 Sep 2025 11:44:09 +0200 |
| parents | b50eed0cc0ef |
| children |
comparison
equal
deleted
inserted
replaced
| 0:6015a75abc2d | 3:2c135c81b16c |
|---|---|
| 1 /* These extract_html_*() functions generate docx content and docx zip archive | |
| 2 data. | |
| 3 | |
| 4 Caller must call things in a sensible order to create valid content - | |
| 5 e.g. don't call docx_paragraph_start() twice without intervening call to | |
| 6 docx_paragraph_finish(). */ | |
| 7 | |
| 8 #include "extract/extract.h" | |
| 9 | |
| 10 #include "astring.h" | |
| 11 #include "document.h" | |
| 12 #include "html.h" | |
| 13 #include "mem.h" | |
| 14 #include "memento.h" | |
| 15 #include "outf.h" | |
| 16 #include "sys.h" | |
| 17 #include "text.h" | |
| 18 #include "zip.h" | |
| 19 | |
| 20 #include <assert.h> | |
| 21 #include <errno.h> | |
| 22 #include <float.h> | |
| 23 #include <math.h> | |
| 24 #include <stdlib.h> | |
| 25 #include <stdio.h> | |
| 26 #include <string.h> | |
| 27 | |
| 28 #include <sys/stat.h> | |
| 29 | |
| 30 | |
| 31 static void content_state_init(content_state_t *content_state) | |
| 32 { | |
| 33 content_state->font.name = NULL; | |
| 34 content_state->font.size = 0; | |
| 35 content_state->font.bold = 0; | |
| 36 content_state->font.italic = 0; | |
| 37 content_state->ctm_prev = NULL; | |
| 38 } | |
| 39 | |
| 40 static int | |
| 41 content_state_reset(extract_alloc_t *alloc, content_state_t *content_state, extract_astring_t *content) | |
| 42 { | |
| 43 int e = -1; | |
| 44 | |
| 45 if (content_state->font.bold) | |
| 46 { | |
| 47 if (extract_astring_cat(alloc, content, "</b>")) goto end; | |
| 48 content_state->font.bold = 0; | |
| 49 } | |
| 50 if (content_state->font.italic) | |
| 51 { | |
| 52 if (extract_astring_cat(alloc, content, "</i>")) goto end; | |
| 53 content_state->font.italic = 0; | |
| 54 } | |
| 55 | |
| 56 e = 0; | |
| 57 end: | |
| 58 | |
| 59 return e; | |
| 60 } | |
| 61 | |
| 62 static int | |
| 63 paragraph_to_html_content( | |
| 64 extract_alloc_t *alloc, | |
| 65 content_state_t *content_state, | |
| 66 paragraph_t *paragraph, | |
| 67 int single_line, | |
| 68 extract_astring_t *content) | |
| 69 { | |
| 70 int e = -1; | |
| 71 const char *endl = (single_line) ? "" : "\n"; | |
| 72 content_line_iterator lit; | |
| 73 line_t *line; | |
| 74 | |
| 75 if (extract_astring_catf(alloc, content, "%s%s<p>", endl, endl)) goto end; | |
| 76 | |
| 77 for (line = content_line_iterator_init(&lit, ¶graph->content); line != NULL; line = content_line_iterator_next(&lit)) | |
| 78 { | |
| 79 content_span_iterator sit; | |
| 80 span_t *span; | |
| 81 | |
| 82 for (span = content_span_iterator_init(&sit, &line->content); span != NULL; span = content_span_iterator_next(&sit)) | |
| 83 { | |
| 84 int c; | |
| 85 | |
| 86 content_state->ctm_prev = &span->ctm; | |
| 87 if (span->flags.font_bold != content_state->font.bold) | |
| 88 { | |
| 89 if (extract_astring_cat(alloc, content, | |
| 90 span->flags.font_bold ? "<b>" : "</b>" | |
| 91 )) goto end; | |
| 92 content_state->font.bold = span->flags.font_bold; | |
| 93 } | |
| 94 if (span->flags.font_italic != content_state->font.italic) | |
| 95 { | |
| 96 if ( extract_astring_cat(alloc, content, | |
| 97 span->flags.font_italic ? "<i>" : "</i>" | |
| 98 )) goto end; | |
| 99 content_state->font.italic = span->flags.font_italic; | |
| 100 } | |
| 101 | |
| 102 for (c=0; c<span->chars_num; ++c) | |
| 103 { | |
| 104 char_t* char_ = &span->chars[c]; | |
| 105 if (extract_astring_catc_unicode_xml(alloc, content, char_->ucs)) goto end; | |
| 106 } | |
| 107 } | |
| 108 | |
| 109 if (content->chars_num && lit.next->type != content_root) | |
| 110 { | |
| 111 if (content->chars[content->chars_num-1] == '-') content->chars_num -= 1; | |
| 112 else if (content->chars[content->chars_num-1] != ' ') | |
| 113 { | |
| 114 extract_astring_catc(alloc, content, ' '); | |
| 115 } | |
| 116 } | |
| 117 } | |
| 118 if (extract_astring_catf(alloc, content, "%s</p>", endl)) goto end; | |
| 119 | |
| 120 e = 0; | |
| 121 | |
| 122 end: | |
| 123 return e; | |
| 124 } | |
| 125 | |
| 126 | |
| 127 /* Append html for paragraphs[] to <content>. Updates *state if we change font | |
| 128 etc. */ | |
| 129 static int | |
| 130 paragraphs_to_html_content( | |
| 131 extract_alloc_t *alloc, | |
| 132 content_state_t *state, | |
| 133 content_root_t *paragraphs, | |
| 134 int single_line, | |
| 135 extract_astring_t *content) | |
| 136 { | |
| 137 content_paragraph_iterator pit; | |
| 138 paragraph_t *paragraph; | |
| 139 int e = -1; | |
| 140 | |
| 141 for (paragraph = content_paragraph_iterator_init(&pit, paragraphs); paragraph != NULL; paragraph = content_paragraph_iterator_next(&pit)) | |
| 142 if (paragraph_to_html_content(alloc, state, paragraph, single_line, content)) goto end; | |
| 143 | |
| 144 if (content_state_reset(alloc, state, content)) goto end; | |
| 145 e = 0; | |
| 146 | |
| 147 end: | |
| 148 return e; | |
| 149 } | |
| 150 | |
| 151 static int | |
| 152 append_table(extract_alloc_t *alloc, content_state_t *state, table_t *table, extract_astring_t *content) | |
| 153 { | |
| 154 int e = -1; | |
| 155 int y; | |
| 156 | |
| 157 if (extract_astring_cat(alloc, content, "\n\n<table border=\"1\" style=\"border-collapse:collapse\">\n")) goto end; | |
| 158 | |
| 159 for (y=0; y<table->cells_num_y; ++y) | |
| 160 { | |
| 161 /* If 1, we put each <td>...</td> on a separate line. */ | |
| 162 int x; | |
| 163 if (extract_astring_cat(alloc, content, " <tr>\n")) goto end; | |
| 164 for (x=0; x<table->cells_num_x; ++x) | |
| 165 { | |
| 166 cell_t* cell = table->cells[y*table->cells_num_x + x]; | |
| 167 if (!cell->above || !cell->left) | |
| 168 { | |
| 169 /* HTML does not require anything for cells that are subsumed | |
| 170 by other cells that extend horizontally and vertically. */ | |
| 171 continue; | |
| 172 } | |
| 173 if (extract_astring_cat(alloc, content, " ")) goto end; | |
| 174 if (extract_astring_cat(alloc, content, "<td")) goto end; | |
| 175 | |
| 176 if (cell->extend_right > 1) | |
| 177 { | |
| 178 if (extract_astring_catf(alloc, content, " colspan=\"%i\"", cell->extend_right)) goto end; | |
| 179 } | |
| 180 if (cell->extend_down > 1) | |
| 181 { | |
| 182 if (extract_astring_catf(alloc, content, " rowspan=\"%i\"", cell->extend_down)) goto end; | |
| 183 } | |
| 184 | |
| 185 if (extract_astring_cat(alloc, content, ">")) goto end; | |
| 186 | |
| 187 if (paragraphs_to_html_content(alloc, state, &cell->content, 1 /* single_line*/, content)) goto end; | |
| 188 if (extract_astring_cat(alloc, content, "</td>")) goto end; | |
| 189 if (extract_astring_cat(alloc, content, "\n")) goto end; | |
| 190 | |
| 191 if (content_state_reset(alloc, state, content)) goto end; | |
| 192 } | |
| 193 if (extract_astring_cat(alloc, content, " </tr>\n")) goto end; | |
| 194 } | |
| 195 if (extract_astring_cat(alloc, content, "</table>\n\n")) goto end; | |
| 196 e = 0; | |
| 197 | |
| 198 end: | |
| 199 return e; | |
| 200 } | |
| 201 | |
| 202 /* FIXME: Badly named! first_char_of_last_span_of_paragraph! */ | |
| 203 static char_t * | |
| 204 paragraph_first_char(const paragraph_t *paragraph) | |
| 205 { | |
| 206 line_t *line = content_last_line(¶graph->content); | |
| 207 span_t *span = content_last_span(&line->content); | |
| 208 return &span->chars[0]; | |
| 209 } | |
| 210 | |
| 211 static int compare_paragraph_y(const void *a, const void *b) | |
| 212 { | |
| 213 const paragraph_t *const *a_paragraph = a; | |
| 214 const paragraph_t *const *b_paragraph = b; | |
| 215 double a_y = paragraph_first_char(*a_paragraph)->y; | |
| 216 double b_y = paragraph_first_char(*b_paragraph)->y; | |
| 217 | |
| 218 if (a_y > b_y) return +1; | |
| 219 if (a_y < b_y) return -1; | |
| 220 | |
| 221 return 0; | |
| 222 } | |
| 223 | |
| 224 /* | |
| 225 */ | |
| 226 static int | |
| 227 split_to_html(extract_alloc_t *alloc, split_t *split, subpage_t ***ppsubpage, extract_astring_t *output) | |
| 228 { | |
| 229 int p; | |
| 230 int s; | |
| 231 subpage_t *subpage; | |
| 232 int paragraphs_num; | |
| 233 paragraph_t **paragraphs = NULL; | |
| 234 content_paragraph_iterator pit; | |
| 235 paragraph_t *paragraph; | |
| 236 content_table_iterator tit; | |
| 237 table_t *table; | |
| 238 content_state_t state; | |
| 239 content_state_init(&state); | |
| 240 | |
| 241 if (split == NULL) { | |
| 242 /* fall through to below - SPLIT_NONE */ | |
| 243 } else if (split->type == SPLIT_HORIZONTAL) { | |
| 244 int ret = 0; | |
| 245 double total = 0; | |
| 246 for (s = 0; s < split->count; s++) { | |
| 247 total += split->split[s]->weight; | |
| 248 } | |
| 249 if (split->count > 1) | |
| 250 extract_astring_cat(alloc, output, "<div style=\"display:flex;\">\n"); | |
| 251 for (s = 0; s < split->count; s++) { | |
| 252 if (split->count > 1) | |
| 253 { | |
| 254 if (total == 0) | |
| 255 { | |
| 256 extract_astring_catf(alloc, output, "<div>\n"); | |
| 257 } | |
| 258 else | |
| 259 { | |
| 260 extract_astring_catf(alloc, output, "<div style=\"width:%g%%;\">\n", 100.0*split->split[s]->weight/total); | |
| 261 } | |
| 262 } | |
| 263 ret = split_to_html(alloc, split->split[s], ppsubpage, output); | |
| 264 if (ret) | |
| 265 break; | |
| 266 if (split->count > 1) | |
| 267 extract_astring_cat(alloc, output, "</div>\n"); | |
| 268 } | |
| 269 if (split->count > 1) | |
| 270 extract_astring_cat(alloc, output, "</div>\n"); | |
| 271 return ret; | |
| 272 } else if (split->type == SPLIT_VERTICAL) { | |
| 273 int ret = 0; | |
| 274 for (s = 0; s < split->count; s++) { | |
| 275 ret = split_to_html(alloc, split->split[s], ppsubpage, output); | |
| 276 if (ret) | |
| 277 break; | |
| 278 } | |
| 279 return ret; | |
| 280 } | |
| 281 | |
| 282 /* We'll deal with the next subpage entry. Increment the pointer for the | |
| 283 * next caller. */ | |
| 284 subpage = **ppsubpage; | |
| 285 *ppsubpage = (*ppsubpage)+1; | |
| 286 | |
| 287 /* Output paragraphs and tables in order of increasing <y> coordinate. | |
| 288 | |
| 289 Unfortunately the paragraph ordering we do in page->paragraphs[] | |
| 290 isn't quite right and results in bad ordering if ctm/trm matrices are | |
| 291 inconsistent. So we create our own list of paragraphs sorted strictly | |
| 292 by y coordinate of the first char of each paragraph. */ | |
| 293 paragraphs_num = content_count_paragraphs(&subpage->content); | |
| 294 if (extract_malloc(alloc, ¶graphs, sizeof(*paragraphs) * paragraphs_num)) goto end; | |
| 295 for (p = 0, paragraph = content_paragraph_iterator_init(&pit, &subpage->content); paragraph != NULL; p++, paragraph = content_paragraph_iterator_next(&pit)) | |
| 296 paragraphs[p] = paragraph; | |
| 297 qsort(paragraphs, paragraphs_num, sizeof(*paragraphs), compare_paragraph_y); | |
| 298 | |
| 299 if (0) | |
| 300 { | |
| 301 int p; | |
| 302 outf0("paragraphs are:"); | |
| 303 for (p=0; p<paragraphs_num; ++p) | |
| 304 { | |
| 305 paragraph_t* paragraph = paragraphs[p]; | |
| 306 line_t *line = content_first_line(¶graph->content); | |
| 307 span_t *span = content_first_span(&line->content); | |
| 308 outf0(" p=%i: %s", p, extract_span_string(NULL, span)); | |
| 309 } | |
| 310 } | |
| 311 | |
| 312 p = 0; | |
| 313 table = content_table_iterator_init(&tit, &subpage->tables); | |
| 314 for(;;) | |
| 315 { | |
| 316 double y_paragraph; | |
| 317 double y_table; | |
| 318 paragraph_t* paragraph = (p == paragraphs_num) ? NULL : paragraphs[p]; | |
| 319 if (!paragraph && !table) break; | |
| 320 y_paragraph = (paragraph) ? content_first_span(&content_first_line(¶graph->content)->content)->chars[0].y : DBL_MAX; | |
| 321 y_table = (table) ? table->pos.y : DBL_MAX; | |
| 322 outf("p=%i y_paragraph=%f", p, y_paragraph); | |
| 323 outf("y_table=%f", y_table); | |
| 324 if (paragraph && y_paragraph < y_table) | |
| 325 { | |
| 326 //extract_astring_catf(alloc, output, "<p>@@@ paragraph %i y=%f @@@)</p>\n", p, y_paragraph); | |
| 327 if (paragraph_to_html_content(alloc, &state, paragraph, 0 /*single_line*/, output)) goto end; | |
| 328 if (content_state_reset(alloc, &state, output)) goto end; | |
| 329 p += 1; | |
| 330 } | |
| 331 else if (table) | |
| 332 { | |
| 333 //extract_astring_catf(alloc, output, "<p>@@@ table %t y=%f @@@)</p>\n", p, y_table); | |
| 334 if (append_table(alloc, &state, table, output)) goto end; | |
| 335 table = content_table_iterator_next(&tit); | |
| 336 } | |
| 337 } | |
| 338 extract_free(alloc, ¶graphs); | |
| 339 return 0; | |
| 340 | |
| 341 end: | |
| 342 extract_free(alloc, ¶graphs); | |
| 343 return -1; | |
| 344 } | |
| 345 | |
| 346 int extract_document_to_html_content( | |
| 347 extract_alloc_t *alloc, | |
| 348 document_t *document, | |
| 349 int rotation, | |
| 350 int images, | |
| 351 extract_astring_t *content) | |
| 352 { | |
| 353 int ret = -1; | |
| 354 int n; | |
| 355 paragraph_t **paragraphs = NULL; | |
| 356 | |
| 357 (void) rotation; | |
| 358 (void) images; | |
| 359 | |
| 360 extract_astring_cat(alloc, content, "<html>\n"); | |
| 361 extract_astring_cat(alloc, content, "<body>\n"); | |
| 362 | |
| 363 /* Write paragraphs into <content>. */ | |
| 364 for (n=0; n<document->pages_num; ++n) | |
| 365 { | |
| 366 extract_page_t *page = document->pages[n]; | |
| 367 subpage_t **psubpage = page->subpages; | |
| 368 | |
| 369 /* Every page gets its own div. */ | |
| 370 extract_astring_cat(alloc, content, "<div>\n"); | |
| 371 | |
| 372 ret = split_to_html(alloc, page->split, &psubpage, content); | |
| 373 if (ret) | |
| 374 goto end; | |
| 375 | |
| 376 extract_astring_cat(alloc, content, "</div>\n"); | |
| 377 } | |
| 378 extract_astring_cat(alloc, content, "</body>\n"); | |
| 379 extract_astring_cat(alloc, content, "</html>\n"); | |
| 380 | |
| 381 ret = 0; | |
| 382 end: | |
| 383 | |
| 384 extract_free(alloc, ¶graphs); | |
| 385 | |
| 386 return ret; | |
| 387 } |
