comparison mupdf-source/source/html/html-doc.c @ 2:b50eed0cc0ef upstream

ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4. The directory name has changed: no version number in the expanded directory now.
author Franz Glasner <fzglas.hg@dom66.de>
date Mon, 15 Sep 2025 11:43:07 +0200
parents
children
comparison
equal deleted inserted replaced
1:1d09e1dec1d9 2:b50eed0cc0ef
1 // Copyright (C) 2004-2024 Artifex Software, Inc.
2 //
3 // This file is part of MuPDF.
4 //
5 // MuPDF is free software: you can redistribute it and/or modify it under the
6 // terms of the GNU Affero General Public License as published by the Free
7 // Software Foundation, either version 3 of the License, or (at your option)
8 // any later version.
9 //
10 // MuPDF is distributed in the hope that it will be useful, but WITHOUT ANY
11 // WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
12 // FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
13 // details.
14 //
15 // You should have received a copy of the GNU Affero General Public License
16 // along with MuPDF. If not, see <https://www.gnu.org/licenses/agpl-3.0.en.html>
17 //
18 // Alternative licensing terms are available from the licensor.
19 // For commercial licensing, see <https://www.artifex.com/> or contact
20 // Artifex Software, Inc., 39 Mesa Street, Suite 108A, San Francisco,
21 // CA 94129, USA, for further information.
22
23 #include "mupdf/fitz.h"
24 #include "html-imp.h"
25
26 #include <string.h>
27 #include <math.h>
28
/* Indices into the four-entry page_margin[] array used by htdoc_bound_page. */
enum { T = 0, R = 1, B = 2, L = 3 };
30
31 typedef struct
32 {
33 fz_document super;
34 fz_archive *zip;
35 fz_html_font_set *set;
36 fz_html *html;
37 fz_outline *outline;
38 const fz_htdoc_format_t *format;
39 } html_document;
40
41 typedef struct
42 {
43 fz_page super;
44 html_document *doc;
45 int number;
46 } html_page;
47
48 static void
49 htdoc_drop_document(fz_context *ctx, fz_document *doc_)
50 {
51 html_document *doc = (html_document*)doc_;
52 fz_drop_archive(ctx, doc->zip);
53 fz_drop_html(ctx, doc->html);
54 fz_drop_html_font_set(ctx, doc->set);
55 fz_drop_outline(ctx, doc->outline);
56 }
57
58 static fz_link_dest
59 htdoc_resolve_link(fz_context *ctx, fz_document *doc_, const char *dest)
60 {
61 html_document *doc = (html_document*)doc_;
62 const char *s = strchr(dest, '#');
63 if (s && s[1] != 0)
64 {
65 float y = fz_find_html_target(ctx, doc->html, s+1);
66 if (y >= 0)
67 {
68 int page = y / doc->html->page_h;
69 return fz_make_link_dest_xyz(0, page, 0, y - page * doc->html->page_h, 0);
70 }
71 }
72
73 return fz_make_link_dest_none();
74 }
75
76 static int
77 htdoc_count_pages(fz_context *ctx, fz_document *doc_, int chapter)
78 {
79 html_document *doc = (html_document*)doc_;
80 if (doc->html->tree.root->s.layout.b > 0)
81 return ceilf(doc->html->tree.root->s.layout.b / doc->html->page_h);
82 return 1;
83 }
84
85 static void
86 htdoc_update_outline(fz_context *ctx, fz_document *doc, fz_outline *node)
87 {
88 while (node)
89 {
90 fz_link_dest dest = htdoc_resolve_link(ctx, doc, node->uri);
91 node->page = dest.loc;
92 node->x = dest.x;
93 node->y = dest.y;
94 htdoc_update_outline(ctx, doc, node->down);
95 node = node->next;
96 }
97 }
98
99 static void
100 htdoc_layout(fz_context *ctx, fz_document *doc_, float w, float h, float em)
101 {
102 html_document *doc = (html_document*)doc_;
103
104 fz_layout_html(ctx, doc->html, w, h, em);
105
106 htdoc_update_outline(ctx, doc_, doc->outline);
107 }
108
109 static void
110 htdoc_drop_page(fz_context *ctx, fz_page *page_)
111 {
112 }
113
114 static fz_rect
115 htdoc_bound_page(fz_context *ctx, fz_page *page_, fz_box_type box)
116 {
117 html_page *page = (html_page*)page_;
118 html_document *doc = page->doc;
119 fz_rect bbox;
120 bbox.x0 = 0;
121 bbox.y0 = 0;
122 bbox.x1 = doc->html->page_w + doc->html->page_margin[L] + doc->html->page_margin[R];
123 bbox.y1 = doc->html->page_h + doc->html->page_margin[T] + doc->html->page_margin[B];
124 return bbox;
125 }
126
127 static void
128 htdoc_run_page(fz_context *ctx, fz_page *page_, fz_device *dev, fz_matrix ctm, fz_cookie *cookie)
129 {
130 html_page *page = (html_page*)page_;
131 html_document *doc = page->doc;
132 fz_draw_html(ctx, dev, ctm, doc->html, page->number);
133 }
134
135 static fz_link *
136 htdoc_load_links(fz_context *ctx, fz_page *page_)
137 {
138 html_page *page = (html_page*)page_;
139 html_document *doc = page->doc;
140 return fz_load_html_links(ctx, doc->html, page->number, "");
141 }
142
143 static fz_bookmark
144 htdoc_make_bookmark(fz_context *ctx, fz_document *doc_, fz_location loc)
145 {
146 html_document *doc = (html_document*)doc_;
147 return fz_make_html_bookmark(ctx, doc->html, loc.page);
148 }
149
150 static fz_location
151 htdoc_lookup_bookmark(fz_context *ctx, fz_document *doc_, fz_bookmark mark)
152 {
153 html_document *doc = (html_document*)doc_;
154 return fz_make_location(0, fz_lookup_html_bookmark(ctx, doc->html, mark));
155 }
156
157 static fz_page *
158 htdoc_load_page(fz_context *ctx, fz_document *doc_, int chapter, int number)
159 {
160 html_document *doc = (html_document*)doc_;
161 html_page *page = fz_new_derived_page(ctx, html_page, doc_);
162 page->super.bound_page = htdoc_bound_page;
163 page->super.run_page_contents = htdoc_run_page;
164 page->super.load_links = htdoc_load_links;
165 page->super.drop_page = htdoc_drop_page;
166 page->doc = doc;
167 page->number = number;
168 return (fz_page*)page;
169 }
170
171 static fz_outline *
172 htdoc_load_outline(fz_context *ctx, fz_document *doc_)
173 {
174 html_document *doc = (html_document*)doc_;
175 return fz_keep_outline(ctx, doc->outline);
176 }
177
178 static int
179 htdoc_lookup_metadata(fz_context *ctx, fz_document *doc_, const char *key, char *buf, size_t size)
180 {
181 html_document *doc = (html_document *)doc_;
182 if (!strcmp(key, FZ_META_FORMAT))
183 return 1 + (int)fz_strlcpy(buf, doc->format->format_name, size);
184 if (!strcmp(key, FZ_META_INFO_TITLE) && doc->html->title)
185 return 1 + (int)fz_strlcpy(buf, doc->html->title, size);
186 return -1;
187 }
188
189 static fz_html *
190 generic_parse(fz_context *ctx, fz_html_font_set *set, fz_archive *zip, const char *base_uri, fz_buffer *buffer_in, const char *user_css, const fz_htdoc_format_t *format)
191 {
192 fz_buffer *buffer_html = NULL;
193 fz_html *html = NULL;
194
195 fz_try(ctx)
196 {
197 if (format->convert_to_html)
198 buffer_html = format->convert_to_html(ctx, set, buffer_in, zip, user_css);
199 else
200 buffer_html = fz_keep_buffer(ctx, buffer_in);
201 html = fz_parse_html(ctx, set, zip, base_uri, buffer_html, user_css, format->try_xml, format->try_html5, format->patch_mobi);
202 }
203 fz_always(ctx)
204 {
205 fz_drop_buffer(ctx, buffer_html);
206 }
207 fz_catch(ctx)
208 {
209 fz_drop_html(ctx, html);
210 fz_rethrow(ctx);
211 }
212 return html;
213 }
214
215 fz_document *
216 fz_htdoc_open_document_with_buffer(fz_context *ctx, fz_archive *dir, fz_buffer *buf, const fz_htdoc_format_t *format)
217 {
218 html_document *doc = NULL;
219
220 fz_var(doc);
221 fz_var(dir);
222
223 fz_try(ctx)
224 {
225 doc = fz_new_derived_document(ctx, html_document);
226 doc->super.drop_document = htdoc_drop_document;
227 doc->super.layout = htdoc_layout;
228 doc->super.load_outline = htdoc_load_outline;
229 doc->super.resolve_link_dest = htdoc_resolve_link;
230 doc->super.make_bookmark = htdoc_make_bookmark;
231 doc->super.lookup_bookmark = htdoc_lookup_bookmark;
232 doc->super.count_pages = htdoc_count_pages;
233 doc->super.load_page = htdoc_load_page;
234 doc->super.lookup_metadata = htdoc_lookup_metadata;
235 doc->super.is_reflowable = 1;
236
237 doc->zip = fz_keep_archive(ctx, dir);
238 doc->format = format;
239 doc->set = fz_new_html_font_set(ctx);
240 doc->html = generic_parse(ctx, doc->set, doc->zip, ".", buf, fz_user_css(ctx), format);
241 doc->outline = fz_load_html_outline(ctx, doc->html);
242 }
243 fz_always(ctx)
244 fz_drop_buffer(ctx, buf);
245 fz_catch(ctx)
246 {
247 fz_drop_document(ctx, &doc->super);
248 fz_rethrow(ctx);
249 }
250
251 return (fz_document*)doc;
252 }
253
254 fz_document *
255 fz_htdoc_open_document_with_stream_and_dir(fz_context *ctx, fz_stream *stm, fz_archive *dir, const fz_htdoc_format_t *format)
256 {
257 fz_buffer *buf = NULL;
258
259 if (stm)
260 buf = fz_read_all(ctx, stm, 0);
261
262 return fz_htdoc_open_document_with_buffer(ctx, dir, buf, format);
263 }
264
265 /* Variant specific functions */
266
267 /* Generic HTML document handler */
268
/* Return 1 if c is one of the five whitespace codes recognised here, else 0. */
static int isws(int c)
{
	switch (c)
	{
	case 32: /* space */
	case 9:  /* horizontal tab */
	case 10: /* line feed */
	case 13: /* carriage return */
	case 12: /* form feed */
		return 1;
	default:
		return 0;
	}
}
273
274 static int recognize_html_content(fz_context *ctx, const fz_document_handler *handler, fz_stream *stream, fz_archive *dir, void **hstate, fz_document_recognize_state_free_fn **free_state, int xhtml)
275 {
276 uint8_t buffer[4096];
277 size_t i, n, m;
278 enum {
279 state_top,
280 state_open,
281 state_pling,
282 state_query,
283 state_maybe_doctype,
284 state_maybe_doctype_ws,
285 state_maybe_doctype_html,
286 state_maybe_doctype_html_xhtml,
287 state_maybe_comment,
288 state_maybe_html,
289 state_maybe_html_xhtml,
290 state_comment
291 };
292 int state = state_top;
293 int type = 0;
294
295 if (hstate)
296 *hstate = NULL;
297 if (free_state)
298 *free_state = NULL;
299
300 if (stream == NULL)
301 return 0;
302
303 /* Simple state machine. Search for "<!doctype html" or "<html" in the first
304 * 4K of the file, allowing for comments and whitespace and case insensitivity. */
305
306 n = fz_read(ctx, stream, buffer, sizeof(buffer));
307 fz_seek(ctx, stream, 0, SEEK_SET);
308 if (n == 0)
309 return 0;
310
311 i = 0;
312 if (n >= 3 && buffer[0] == 0xEF && buffer[1] == 0xBB && buffer[2] == 0xBF)
313 {
314 /* UTF-8 encoded BOM. Just skip it. */
315 i = 3;
316 }
317 else if (n >= 2 && buffer[0] == 0xFE && buffer[1] == 0xFF)
318 {
319 /* UTF-16, big endian. */
320 type = 1;
321 i = 2;
322 n &= ~1;
323 }
324 else if (n >= 2 && buffer[0] == 0xFF && buffer[1] == 0xFE)
325 {
326 /* UTF-16, little endian. */
327 i = 2;
328 type = 2;
329 n &= ~1;
330 }
331
332 while (i < n)
333 {
334 int c;
335
336 switch (type)
337 {
338 case 0: /* UTF-8 */
339 c = buffer[i++];
340 break;
341 case 1: /* UTF-16 - big endian */
342 c = buffer[i++] << 8;
343 c |= buffer[i++];
344 break;
345 case 2: /* UTF-16 - little endian */
346 c = buffer[i++];
347 c |= buffer[i++] << 8;
348 break;
349 }
350
351 switch (state)
352 {
353 case state_top:
354 if (isws(c))
355 continue; /* whitespace */
356 if (c == '<')
357 state = state_open;
358 else
359 return 0; /* Non whitespace found at the top level prior to a known tag. Fail. */
360 break;
361 case state_open:
362 if (isws(c))
363 continue; /* whitespace */
364 if (c == '!')
365 state = state_pling;
366 else if (c == '?')
367 state = state_query;
368 else if (c == 'h' || c == 'H')
369 state = state_maybe_html;
370 else
371 return 0; /* Not an acceptable opening tag. */
372 m = 0;
373 break;
374 case state_query:
375 if (c == '>')
376 state = state_top;
377 break;
378 case state_pling:
379 if (isws(c))
380 continue; /* whitespace */
381 else if (c == '-')
382 state = state_maybe_comment;
383 else if (c == 'd' || c == 'D')
384 state = state_maybe_doctype;
385 else
386 return 0; /* Not an acceptable opening tag. */
387 break;
388 case state_maybe_comment:
389 if (c == '-')
390 state = state_comment;
391 else
392 return 0; /* Not an acceptable opening tag. */
393 break;
394 case state_comment:
395 if (c == '-')
396 {
397 m++;
398 }
399 else if (c == '>' && m >= 2)
400 {
401 state = state_top;
402 }
403 else
404 m = 0;
405 break;
406 case state_maybe_doctype:
407 if (c == "octype"[m] || c == "OCTYPE"[m])
408 {
409 m++;
410 if (m == 6)
411 {
412 state = state_maybe_doctype_ws;
413 m = 0;
414 }
415 }
416 else
417 return 0; /* Not an acceptable opening tag. */
418 break;
419 case state_maybe_doctype_ws:
420 if (isws(c))
421 m++;
422 else if (m > 0 && (c == 'h' || c == 'H'))
423 {
424 state = state_maybe_doctype_html;
425 m = 0;
426 }
427 else
428 return 0; /* Not an acceptable opening tag. */
429 break;
430 case state_maybe_doctype_html:
431 if (c == "tml"[m] || c == "TML"[m])
432 {
433 m++;
434 if (m == 3)
435 {
436 state = state_maybe_doctype_html_xhtml;
437 m = 0;
438 }
439 }
440 else
441 return 0; /* Not an acceptable opening tag. */
442 break;
443 case state_maybe_doctype_html_xhtml:
444 if (c == '>')
445 {
446 /* Not xhtml - the xhtml agent can handle this at a pinch (so 25),
447 * but we'd rather the html one did (75). */
448 return xhtml ? 25 : 75;
449 }
450 if (c >= 'A' && c <= 'Z')
451 c += 'a'-'A';
452 if (c == "xhtml"[m])
453 {
454 m++;
455 if (m == 5)
456 {
457 /* xhtml - the xhtml agent would be better (75) than the html
458 * agent (25). */
459 return xhtml ? 75 : 25;
460 }
461 }
462 else
463 m = 0;
464 break;
465 case state_maybe_html:
466 if (c == "tml"[m] || c == "TML"[m])
467 {
468 m++;
469 if (m == 3)
470 {
471 state = state_maybe_html_xhtml;
472 m = 0;
473 }
474 }
475 else
476 return 0; /* Not an acceptable opening tag. */
477 break;
478 case state_maybe_html_xhtml:
479 if (c == '>')
480 {
481 /* Not xhtml - the xhtml agent can handle this at a pinch (so 25),
482 * but we'd rather the html one did (75). */
483 return xhtml ? 25 : 75;
484 }
485 if (c >= 'A' && c <= 'Z')
486 c += 'a'-'A';
487 if (c == "xhtml"[m])
488 {
489 m++;
490 if (m == 5)
491 {
492 /* xhtml - the xhtml agent would be better (75) than the html
493 * agent (25). */
494 return xhtml ? 75 : 25;
495 }
496 }
497 else
498 m = 0;
499 break;
500 }
501 }
502
503 return 0;
504 }
505
506 int htdoc_recognize_html_content(fz_context *ctx, const fz_document_handler *handler, fz_stream *stream, fz_archive *dir, void **hstate, fz_document_recognize_state_free_fn **free_state)
507 {
508 return recognize_html_content(ctx, handler, stream, dir, hstate, free_state, 0);
509 }
510
511 static const fz_htdoc_format_t fz_htdoc_html5 =
512 {
513 "HTML5",
514 NULL,
515 0, 1, 0
516 };
517
518 static fz_document *
519 htdoc_open_document(fz_context *ctx, const fz_document_handler *handler, fz_stream *file, fz_stream *accel, fz_archive *dir, void *state)
520 {
521 return fz_htdoc_open_document_with_stream_and_dir(ctx, file, dir, &fz_htdoc_html5);
522 }
523
524 static const char *htdoc_extensions[] =
525 {
526 "htm",
527 "html",
528 NULL
529 };
530
531 static const char *htdoc_mimetypes[] =
532 {
533 "text/html",
534 NULL
535 };
536
537 fz_document_handler html_document_handler =
538 {
539 NULL,
540 htdoc_open_document,
541 htdoc_extensions,
542 htdoc_mimetypes,
543 htdoc_recognize_html_content,
544 1
545 };
546
547 /* XHTML document handler */
548
549 static const fz_htdoc_format_t fz_htdoc_xhtml =
550 {
551 "XHTML",
552 NULL,
553 1, 1, 0
554 };
555
556 static fz_document *
557 xhtdoc_open_document(fz_context *ctx, const fz_document_handler *handler, fz_stream *file, fz_stream *accel, fz_archive *dir, void *state)
558 {
559 return fz_htdoc_open_document_with_stream_and_dir(ctx, file, dir, &fz_htdoc_xhtml);
560 }
561
562 int xhtdoc_recognize_xhtml_content(fz_context *ctx, const fz_document_handler *handler, fz_stream *stream, fz_archive *dir, void **hstate, fz_document_recognize_state_free_fn **free_state)
563 {
564 return recognize_html_content(ctx, handler, stream, dir, hstate, free_state, 1);
565 }
566
567 static const char *xhtdoc_extensions[] =
568 {
569 "xhtml",
570 NULL
571 };
572
573 static const char *xhtdoc_mimetypes[] =
574 {
575 "application/xhtml+xml",
576 NULL
577 };
578
579 fz_document_handler xhtml_document_handler =
580 {
581 NULL,
582 xhtdoc_open_document,
583 xhtdoc_extensions,
584 xhtdoc_mimetypes,
585 xhtdoc_recognize_xhtml_content,
586 1
587 };
588
589 /* FB2 document handler */
590
591 static const fz_htdoc_format_t fz_htdoc_fb2 =
592 {
593 "FictionBook2",
594 NULL,
595 1, 0, 0
596 };
597
598 static fz_document *
599 fb2doc_open_document(fz_context *ctx, const fz_document_handler *handler, fz_stream *file, fz_stream *accel, fz_archive *dir, void *state)
600 {
601 return fz_htdoc_open_document_with_stream_and_dir(ctx, file, dir, &fz_htdoc_fb2);
602 }
603
604 static int
605 fb2doc_recognize_content(fz_context *ctx, const fz_document_handler *handler, fz_stream *stream, fz_archive *dir, void **state, fz_document_recognize_state_free_fn **free_state)
606 {
607 const char *match = "<FictionBook";
608 int pos = 0;
609 int n = 4096;
610 int c;
611
612 if (state)
613 *state = NULL;
614 if (free_state)
615 *free_state = NULL;
616
617 if (stream == NULL)
618 return 0;
619
620 do
621 {
622 c = fz_read_byte(ctx, stream);
623 if (c == EOF)
624 return 0;
625 if (c == match[pos])
626 {
627 pos++;
628 if (pos == 12)
629 return 100;
630 }
631 else
632 {
633 /* Restart matching, but recheck c against the start. */
634 pos = (c == match[0]);
635 }
636 }
637 while (--n > 0);
638
639 return 0;
640 }
641
642 static const char *fb2doc_extensions[] =
643 {
644 "fb2",
645 "xml",
646 NULL
647 };
648
649 static const char *fb2doc_mimetypes[] =
650 {
651 "application/x-fictionbook",
652 "application/xml",
653 "text/xml",
654 NULL
655 };
656
657 fz_document_handler fb2_document_handler =
658 {
659 NULL,
660 fb2doc_open_document,
661 fb2doc_extensions,
662 fb2doc_mimetypes,
663 fb2doc_recognize_content
664 };
665
666 /* Mobi document handler */
667
668 static const fz_htdoc_format_t fz_htdoc_mobi =
669 {
670 "MOBI",
671 NULL,
672 1, 1, 1
673 };
674
675 static fz_document *
676 mobi_open_document_with_buffer(fz_context *ctx, fz_buffer *mobi)
677 {
678 fz_archive *dir = NULL;
679 fz_buffer *html;
680 fz_document *doc;
681 fz_var(dir);
682 fz_try(ctx)
683 {
684 dir = fz_extract_html_from_mobi(ctx, mobi);
685 html = fz_read_archive_entry(ctx, dir, "index.html");
686 doc = fz_htdoc_open_document_with_buffer(ctx, dir, html, &fz_htdoc_mobi);
687 }
688 fz_always(ctx)
689 {
690 fz_drop_buffer(ctx, mobi);
691 fz_drop_archive(ctx, dir);
692 }
693 fz_catch(ctx)
694 {
695 fz_rethrow(ctx);
696 }
697 return doc;
698 }
699
700 static int
701 mobi_recognize_content(fz_context *ctx, const fz_document_handler *handler, fz_stream *stream, fz_archive *dir, void **state, fz_document_recognize_state_free_fn **free_state)
702 {
703 char text[8];
704
705 if (state)
706 *state = NULL;
707 if (free_state)
708 *free_state = NULL;
709
710 if (stream == NULL)
711 return 0;
712
713 fz_seek(ctx, stream, 32 + 28, SEEK_SET);
714 if (fz_read(ctx, stream, (unsigned char *)text, 8) != 8)
715 return 0;
716 if (memcmp(text, "BOOKMOBI", 8) == 0)
717 return 100;
718 if (memcmp(text, "TEXtREAd", 8) == 0)
719 return 100;
720
721 return 0;
722 }
723
724 static fz_document *
725 mobi_open_document(fz_context *ctx, const fz_document_handler *handler, fz_stream *file, fz_stream *accel, fz_archive *dir, void *state)
726 {
727 return mobi_open_document_with_buffer(ctx, fz_read_all(ctx, file, 0));
728 }
729
730 static const char *mobi_extensions[] =
731 {
732 "mobi",
733 "prc",
734 "pdb",
735 NULL
736 };
737
738 static const char *mobi_mimetypes[] =
739 {
740 "application/x-mobipocket-ebook",
741 NULL
742 };
743
744 fz_document_handler mobi_document_handler =
745 {
746 NULL,
747 mobi_open_document,
748 mobi_extensions,
749 mobi_mimetypes,
750 mobi_recognize_content
751 };