comparison mupdf-source/thirdparty/extract/src/html.c @ 2:b50eed0cc0ef upstream

ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4. The directory name has changed: no version number in the expanded directory now.
author Franz Glasner <fzglas.hg@dom66.de>
date Mon, 15 Sep 2025 11:43:07 +0200
parents
children
comparison
equal deleted inserted replaced
1:1d09e1dec1d9 2:b50eed0cc0ef
/* Functions that generate HTML content from an extract document_t.

The entry point is extract_document_to_html_content(); everything else in
this file is a static helper for it. */
7
8 #include "extract/extract.h"
9
10 #include "astring.h"
11 #include "document.h"
12 #include "html.h"
13 #include "mem.h"
14 #include "memento.h"
15 #include "outf.h"
16 #include "sys.h"
17 #include "text.h"
18 #include "zip.h"
19
20 #include <assert.h>
21 #include <errno.h>
22 #include <float.h>
23 #include <math.h>
24 #include <stdlib.h>
25 #include <stdio.h>
26 #include <string.h>
27
28 #include <sys/stat.h>
29
30
31 static void content_state_init(content_state_t *content_state)
32 {
33 content_state->font.name = NULL;
34 content_state->font.size = 0;
35 content_state->font.bold = 0;
36 content_state->font.italic = 0;
37 content_state->ctm_prev = NULL;
38 }
39
40 static int
41 content_state_reset(extract_alloc_t *alloc, content_state_t *content_state, extract_astring_t *content)
42 {
43 int e = -1;
44
45 if (content_state->font.bold)
46 {
47 if (extract_astring_cat(alloc, content, "</b>")) goto end;
48 content_state->font.bold = 0;
49 }
50 if (content_state->font.italic)
51 {
52 if (extract_astring_cat(alloc, content, "</i>")) goto end;
53 content_state->font.italic = 0;
54 }
55
56 e = 0;
57 end:
58
59 return e;
60 }
61
62 static int
63 paragraph_to_html_content(
64 extract_alloc_t *alloc,
65 content_state_t *content_state,
66 paragraph_t *paragraph,
67 int single_line,
68 extract_astring_t *content)
69 {
70 int e = -1;
71 const char *endl = (single_line) ? "" : "\n";
72 content_line_iterator lit;
73 line_t *line;
74
75 if (extract_astring_catf(alloc, content, "%s%s<p>", endl, endl)) goto end;
76
77 for (line = content_line_iterator_init(&lit, &paragraph->content); line != NULL; line = content_line_iterator_next(&lit))
78 {
79 content_span_iterator sit;
80 span_t *span;
81
82 for (span = content_span_iterator_init(&sit, &line->content); span != NULL; span = content_span_iterator_next(&sit))
83 {
84 int c;
85
86 content_state->ctm_prev = &span->ctm;
87 if (span->flags.font_bold != content_state->font.bold)
88 {
89 if (extract_astring_cat(alloc, content,
90 span->flags.font_bold ? "<b>" : "</b>"
91 )) goto end;
92 content_state->font.bold = span->flags.font_bold;
93 }
94 if (span->flags.font_italic != content_state->font.italic)
95 {
96 if ( extract_astring_cat(alloc, content,
97 span->flags.font_italic ? "<i>" : "</i>"
98 )) goto end;
99 content_state->font.italic = span->flags.font_italic;
100 }
101
102 for (c=0; c<span->chars_num; ++c)
103 {
104 char_t* char_ = &span->chars[c];
105 if (extract_astring_catc_unicode_xml(alloc, content, char_->ucs)) goto end;
106 }
107 }
108
109 if (content->chars_num && lit.next->type != content_root)
110 {
111 if (content->chars[content->chars_num-1] == '-') content->chars_num -= 1;
112 else if (content->chars[content->chars_num-1] != ' ')
113 {
114 extract_astring_catc(alloc, content, ' ');
115 }
116 }
117 }
118 if (extract_astring_catf(alloc, content, "%s</p>", endl)) goto end;
119
120 e = 0;
121
122 end:
123 return e;
124 }
125
126
127 /* Append html for paragraphs[] to <content>. Updates *state if we change font
128 etc. */
129 static int
130 paragraphs_to_html_content(
131 extract_alloc_t *alloc,
132 content_state_t *state,
133 content_root_t *paragraphs,
134 int single_line,
135 extract_astring_t *content)
136 {
137 content_paragraph_iterator pit;
138 paragraph_t *paragraph;
139 int e = -1;
140
141 for (paragraph = content_paragraph_iterator_init(&pit, paragraphs); paragraph != NULL; paragraph = content_paragraph_iterator_next(&pit))
142 if (paragraph_to_html_content(alloc, state, paragraph, single_line, content)) goto end;
143
144 if (content_state_reset(alloc, state, content)) goto end;
145 e = 0;
146
147 end:
148 return e;
149 }
150
151 static int
152 append_table(extract_alloc_t *alloc, content_state_t *state, table_t *table, extract_astring_t *content)
153 {
154 int e = -1;
155 int y;
156
157 if (extract_astring_cat(alloc, content, "\n\n<table border=\"1\" style=\"border-collapse:collapse\">\n")) goto end;
158
159 for (y=0; y<table->cells_num_y; ++y)
160 {
161 /* If 1, we put each <td>...</td> on a separate line. */
162 int x;
163 if (extract_astring_cat(alloc, content, " <tr>\n")) goto end;
164 for (x=0; x<table->cells_num_x; ++x)
165 {
166 cell_t* cell = table->cells[y*table->cells_num_x + x];
167 if (!cell->above || !cell->left)
168 {
169 /* HTML does not require anything for cells that are subsumed
170 by other cells that extend horizontally and vertically. */
171 continue;
172 }
173 if (extract_astring_cat(alloc, content, " ")) goto end;
174 if (extract_astring_cat(alloc, content, "<td")) goto end;
175
176 if (cell->extend_right > 1)
177 {
178 if (extract_astring_catf(alloc, content, " colspan=\"%i\"", cell->extend_right)) goto end;
179 }
180 if (cell->extend_down > 1)
181 {
182 if (extract_astring_catf(alloc, content, " rowspan=\"%i\"", cell->extend_down)) goto end;
183 }
184
185 if (extract_astring_cat(alloc, content, ">")) goto end;
186
187 if (paragraphs_to_html_content(alloc, state, &cell->content, 1 /* single_line*/, content)) goto end;
188 if (extract_astring_cat(alloc, content, "</td>")) goto end;
189 if (extract_astring_cat(alloc, content, "\n")) goto end;
190
191 if (content_state_reset(alloc, state, content)) goto end;
192 }
193 if (extract_astring_cat(alloc, content, " </tr>\n")) goto end;
194 }
195 if (extract_astring_cat(alloc, content, "</table>\n\n")) goto end;
196 e = 0;
197
198 end:
199 return e;
200 }
201
202 /* FIXME: Badly named! first_char_of_last_span_of_paragraph! */
203 static char_t *
204 paragraph_first_char(const paragraph_t *paragraph)
205 {
206 line_t *line = content_last_line(&paragraph->content);
207 span_t *span = content_last_span(&line->content);
208 return &span->chars[0];
209 }
210
211 static int compare_paragraph_y(const void *a, const void *b)
212 {
213 const paragraph_t *const *a_paragraph = a;
214 const paragraph_t *const *b_paragraph = b;
215 double a_y = paragraph_first_char(*a_paragraph)->y;
216 double b_y = paragraph_first_char(*b_paragraph)->y;
217
218 if (a_y > b_y) return +1;
219 if (a_y < b_y) return -1;
220
221 return 0;
222 }
223
224 /*
225 */
226 static int
227 split_to_html(extract_alloc_t *alloc, split_t *split, subpage_t ***ppsubpage, extract_astring_t *output)
228 {
229 int p;
230 int s;
231 subpage_t *subpage;
232 int paragraphs_num;
233 paragraph_t **paragraphs = NULL;
234 content_paragraph_iterator pit;
235 paragraph_t *paragraph;
236 content_table_iterator tit;
237 table_t *table;
238 content_state_t state;
239 content_state_init(&state);
240
241 if (split == NULL) {
242 /* fall through to below - SPLIT_NONE */
243 } else if (split->type == SPLIT_HORIZONTAL) {
244 int ret = 0;
245 double total = 0;
246 for (s = 0; s < split->count; s++) {
247 total += split->split[s]->weight;
248 }
249 if (split->count > 1)
250 extract_astring_cat(alloc, output, "<div style=\"display:flex;\">\n");
251 for (s = 0; s < split->count; s++) {
252 if (split->count > 1)
253 {
254 if (total == 0)
255 {
256 extract_astring_catf(alloc, output, "<div>\n");
257 }
258 else
259 {
260 extract_astring_catf(alloc, output, "<div style=\"width:%g%%;\">\n", 100.0*split->split[s]->weight/total);
261 }
262 }
263 ret = split_to_html(alloc, split->split[s], ppsubpage, output);
264 if (ret)
265 break;
266 if (split->count > 1)
267 extract_astring_cat(alloc, output, "</div>\n");
268 }
269 if (split->count > 1)
270 extract_astring_cat(alloc, output, "</div>\n");
271 return ret;
272 } else if (split->type == SPLIT_VERTICAL) {
273 int ret = 0;
274 for (s = 0; s < split->count; s++) {
275 ret = split_to_html(alloc, split->split[s], ppsubpage, output);
276 if (ret)
277 break;
278 }
279 return ret;
280 }
281
282 /* We'll deal with the next subpage entry. Increment the pointer for the
283 * next caller. */
284 subpage = **ppsubpage;
285 *ppsubpage = (*ppsubpage)+1;
286
287 /* Output paragraphs and tables in order of increasing <y> coordinate.
288
289 Unfortunately the paragraph ordering we do in page->paragraphs[]
290 isn't quite right and results in bad ordering if ctm/trm matrices are
291 inconsistent. So we create our own list of paragraphs sorted strictly
292 by y coordinate of the first char of each paragraph. */
293 paragraphs_num = content_count_paragraphs(&subpage->content);
294 if (extract_malloc(alloc, &paragraphs, sizeof(*paragraphs) * paragraphs_num)) goto end;
295 for (p = 0, paragraph = content_paragraph_iterator_init(&pit, &subpage->content); paragraph != NULL; p++, paragraph = content_paragraph_iterator_next(&pit))
296 paragraphs[p] = paragraph;
297 qsort(paragraphs, paragraphs_num, sizeof(*paragraphs), compare_paragraph_y);
298
299 if (0)
300 {
301 int p;
302 outf0("paragraphs are:");
303 for (p=0; p<paragraphs_num; ++p)
304 {
305 paragraph_t* paragraph = paragraphs[p];
306 line_t *line = content_first_line(&paragraph->content);
307 span_t *span = content_first_span(&line->content);
308 outf0(" p=%i: %s", p, extract_span_string(NULL, span));
309 }
310 }
311
312 p = 0;
313 table = content_table_iterator_init(&tit, &subpage->tables);
314 for(;;)
315 {
316 double y_paragraph;
317 double y_table;
318 paragraph_t* paragraph = (p == paragraphs_num) ? NULL : paragraphs[p];
319 if (!paragraph && !table) break;
320 y_paragraph = (paragraph) ? content_first_span(&content_first_line(&paragraph->content)->content)->chars[0].y : DBL_MAX;
321 y_table = (table) ? table->pos.y : DBL_MAX;
322 outf("p=%i y_paragraph=%f", p, y_paragraph);
323 outf("y_table=%f", y_table);
324 if (paragraph && y_paragraph < y_table)
325 {
326 //extract_astring_catf(alloc, output, "<p>@@@ paragraph %i y=%f @@@)</p>\n", p, y_paragraph);
327 if (paragraph_to_html_content(alloc, &state, paragraph, 0 /*single_line*/, output)) goto end;
328 if (content_state_reset(alloc, &state, output)) goto end;
329 p += 1;
330 }
331 else if (table)
332 {
333 //extract_astring_catf(alloc, output, "<p>@@@ table %t y=%f @@@)</p>\n", p, y_table);
334 if (append_table(alloc, &state, table, output)) goto end;
335 table = content_table_iterator_next(&tit);
336 }
337 }
338 extract_free(alloc, &paragraphs);
339 return 0;
340
341 end:
342 extract_free(alloc, &paragraphs);
343 return -1;
344 }
345
346 int extract_document_to_html_content(
347 extract_alloc_t *alloc,
348 document_t *document,
349 int rotation,
350 int images,
351 extract_astring_t *content)
352 {
353 int ret = -1;
354 int n;
355 paragraph_t **paragraphs = NULL;
356
357 (void) rotation;
358 (void) images;
359
360 extract_astring_cat(alloc, content, "<html>\n");
361 extract_astring_cat(alloc, content, "<body>\n");
362
363 /* Write paragraphs into <content>. */
364 for (n=0; n<document->pages_num; ++n)
365 {
366 extract_page_t *page = document->pages[n];
367 subpage_t **psubpage = page->subpages;
368
369 /* Every page gets its own div. */
370 extract_astring_cat(alloc, content, "<div>\n");
371
372 ret = split_to_html(alloc, page->split, &psubpage, content);
373 if (ret)
374 goto end;
375
376 extract_astring_cat(alloc, content, "</div>\n");
377 }
378 extract_astring_cat(alloc, content, "</body>\n");
379 extract_astring_cat(alloc, content, "</html>\n");
380
381 ret = 0;
382 end:
383
384 extract_free(alloc, &paragraphs);
385
386 return ret;
387 }