Mercurial > hgrepos > Python2 > PyMuPDF
comparison mupdf-source/source/html/html-doc.c @ 2:b50eed0cc0ef upstream
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
The directory name has changed: no version number in the expanded directory now.
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Mon, 15 Sep 2025 11:43:07 +0200 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| 1:1d09e1dec1d9 | 2:b50eed0cc0ef |
|---|---|
| 1 // Copyright (C) 2004-2024 Artifex Software, Inc. | |
| 2 // | |
| 3 // This file is part of MuPDF. | |
| 4 // | |
| 5 // MuPDF is free software: you can redistribute it and/or modify it under the | |
| 6 // terms of the GNU Affero General Public License as published by the Free | |
| 7 // Software Foundation, either version 3 of the License, or (at your option) | |
| 8 // any later version. | |
| 9 // | |
| 10 // MuPDF is distributed in the hope that it will be useful, but WITHOUT ANY | |
| 11 // WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS | |
| 12 // FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more | |
| 13 // details. | |
| 14 // | |
| 15 // You should have received a copy of the GNU Affero General Public License | |
| 16 // along with MuPDF. If not, see <https://www.gnu.org/licenses/agpl-3.0.en.html> | |
| 17 // | |
| 18 // Alternative licensing terms are available from the licensor. | |
| 19 // For commercial licensing, see <https://www.artifex.com/> or contact | |
| 20 // Artifex Software, Inc., 39 Mesa Street, Suite 108A, San Francisco, | |
| 21 // CA 94129, USA, for further information. | |
| 22 | |
| 23 #include "mupdf/fitz.h" | |
| 24 #include "html-imp.h" | |
| 25 | |
| 26 #include <string.h> | |
| 27 #include <math.h> | |
| 28 | |
/* Side indices used to address html->page_margin[] (see htdoc_bound_page). */
enum
{
	T,
	R,
	B,
	L
};
| 30 | |
| 31 typedef struct | |
| 32 { | |
| 33 fz_document super; | |
| 34 fz_archive *zip; | |
| 35 fz_html_font_set *set; | |
| 36 fz_html *html; | |
| 37 fz_outline *outline; | |
| 38 const fz_htdoc_format_t *format; | |
| 39 } html_document; | |
| 40 | |
| 41 typedef struct | |
| 42 { | |
| 43 fz_page super; | |
| 44 html_document *doc; | |
| 45 int number; | |
| 46 } html_page; | |
| 47 | |
| 48 static void | |
| 49 htdoc_drop_document(fz_context *ctx, fz_document *doc_) | |
| 50 { | |
| 51 html_document *doc = (html_document*)doc_; | |
| 52 fz_drop_archive(ctx, doc->zip); | |
| 53 fz_drop_html(ctx, doc->html); | |
| 54 fz_drop_html_font_set(ctx, doc->set); | |
| 55 fz_drop_outline(ctx, doc->outline); | |
| 56 } | |
| 57 | |
| 58 static fz_link_dest | |
| 59 htdoc_resolve_link(fz_context *ctx, fz_document *doc_, const char *dest) | |
| 60 { | |
| 61 html_document *doc = (html_document*)doc_; | |
| 62 const char *s = strchr(dest, '#'); | |
| 63 if (s && s[1] != 0) | |
| 64 { | |
| 65 float y = fz_find_html_target(ctx, doc->html, s+1); | |
| 66 if (y >= 0) | |
| 67 { | |
| 68 int page = y / doc->html->page_h; | |
| 69 return fz_make_link_dest_xyz(0, page, 0, y - page * doc->html->page_h, 0); | |
| 70 } | |
| 71 } | |
| 72 | |
| 73 return fz_make_link_dest_none(); | |
| 74 } | |
| 75 | |
| 76 static int | |
| 77 htdoc_count_pages(fz_context *ctx, fz_document *doc_, int chapter) | |
| 78 { | |
| 79 html_document *doc = (html_document*)doc_; | |
| 80 if (doc->html->tree.root->s.layout.b > 0) | |
| 81 return ceilf(doc->html->tree.root->s.layout.b / doc->html->page_h); | |
| 82 return 1; | |
| 83 } | |
| 84 | |
| 85 static void | |
| 86 htdoc_update_outline(fz_context *ctx, fz_document *doc, fz_outline *node) | |
| 87 { | |
| 88 while (node) | |
| 89 { | |
| 90 fz_link_dest dest = htdoc_resolve_link(ctx, doc, node->uri); | |
| 91 node->page = dest.loc; | |
| 92 node->x = dest.x; | |
| 93 node->y = dest.y; | |
| 94 htdoc_update_outline(ctx, doc, node->down); | |
| 95 node = node->next; | |
| 96 } | |
| 97 } | |
| 98 | |
| 99 static void | |
| 100 htdoc_layout(fz_context *ctx, fz_document *doc_, float w, float h, float em) | |
| 101 { | |
| 102 html_document *doc = (html_document*)doc_; | |
| 103 | |
| 104 fz_layout_html(ctx, doc->html, w, h, em); | |
| 105 | |
| 106 htdoc_update_outline(ctx, doc_, doc->outline); | |
| 107 } | |
| 108 | |
| 109 static void | |
| 110 htdoc_drop_page(fz_context *ctx, fz_page *page_) | |
| 111 { | |
| 112 } | |
| 113 | |
| 114 static fz_rect | |
| 115 htdoc_bound_page(fz_context *ctx, fz_page *page_, fz_box_type box) | |
| 116 { | |
| 117 html_page *page = (html_page*)page_; | |
| 118 html_document *doc = page->doc; | |
| 119 fz_rect bbox; | |
| 120 bbox.x0 = 0; | |
| 121 bbox.y0 = 0; | |
| 122 bbox.x1 = doc->html->page_w + doc->html->page_margin[L] + doc->html->page_margin[R]; | |
| 123 bbox.y1 = doc->html->page_h + doc->html->page_margin[T] + doc->html->page_margin[B]; | |
| 124 return bbox; | |
| 125 } | |
| 126 | |
| 127 static void | |
| 128 htdoc_run_page(fz_context *ctx, fz_page *page_, fz_device *dev, fz_matrix ctm, fz_cookie *cookie) | |
| 129 { | |
| 130 html_page *page = (html_page*)page_; | |
| 131 html_document *doc = page->doc; | |
| 132 fz_draw_html(ctx, dev, ctm, doc->html, page->number); | |
| 133 } | |
| 134 | |
| 135 static fz_link * | |
| 136 htdoc_load_links(fz_context *ctx, fz_page *page_) | |
| 137 { | |
| 138 html_page *page = (html_page*)page_; | |
| 139 html_document *doc = page->doc; | |
| 140 return fz_load_html_links(ctx, doc->html, page->number, ""); | |
| 141 } | |
| 142 | |
| 143 static fz_bookmark | |
| 144 htdoc_make_bookmark(fz_context *ctx, fz_document *doc_, fz_location loc) | |
| 145 { | |
| 146 html_document *doc = (html_document*)doc_; | |
| 147 return fz_make_html_bookmark(ctx, doc->html, loc.page); | |
| 148 } | |
| 149 | |
| 150 static fz_location | |
| 151 htdoc_lookup_bookmark(fz_context *ctx, fz_document *doc_, fz_bookmark mark) | |
| 152 { | |
| 153 html_document *doc = (html_document*)doc_; | |
| 154 return fz_make_location(0, fz_lookup_html_bookmark(ctx, doc->html, mark)); | |
| 155 } | |
| 156 | |
| 157 static fz_page * | |
| 158 htdoc_load_page(fz_context *ctx, fz_document *doc_, int chapter, int number) | |
| 159 { | |
| 160 html_document *doc = (html_document*)doc_; | |
| 161 html_page *page = fz_new_derived_page(ctx, html_page, doc_); | |
| 162 page->super.bound_page = htdoc_bound_page; | |
| 163 page->super.run_page_contents = htdoc_run_page; | |
| 164 page->super.load_links = htdoc_load_links; | |
| 165 page->super.drop_page = htdoc_drop_page; | |
| 166 page->doc = doc; | |
| 167 page->number = number; | |
| 168 return (fz_page*)page; | |
| 169 } | |
| 170 | |
| 171 static fz_outline * | |
| 172 htdoc_load_outline(fz_context *ctx, fz_document *doc_) | |
| 173 { | |
| 174 html_document *doc = (html_document*)doc_; | |
| 175 return fz_keep_outline(ctx, doc->outline); | |
| 176 } | |
| 177 | |
| 178 static int | |
| 179 htdoc_lookup_metadata(fz_context *ctx, fz_document *doc_, const char *key, char *buf, size_t size) | |
| 180 { | |
| 181 html_document *doc = (html_document *)doc_; | |
| 182 if (!strcmp(key, FZ_META_FORMAT)) | |
| 183 return 1 + (int)fz_strlcpy(buf, doc->format->format_name, size); | |
| 184 if (!strcmp(key, FZ_META_INFO_TITLE) && doc->html->title) | |
| 185 return 1 + (int)fz_strlcpy(buf, doc->html->title, size); | |
| 186 return -1; | |
| 187 } | |
| 188 | |
| 189 static fz_html * | |
| 190 generic_parse(fz_context *ctx, fz_html_font_set *set, fz_archive *zip, const char *base_uri, fz_buffer *buffer_in, const char *user_css, const fz_htdoc_format_t *format) | |
| 191 { | |
| 192 fz_buffer *buffer_html = NULL; | |
| 193 fz_html *html = NULL; | |
| 194 | |
| 195 fz_try(ctx) | |
| 196 { | |
| 197 if (format->convert_to_html) | |
| 198 buffer_html = format->convert_to_html(ctx, set, buffer_in, zip, user_css); | |
| 199 else | |
| 200 buffer_html = fz_keep_buffer(ctx, buffer_in); | |
| 201 html = fz_parse_html(ctx, set, zip, base_uri, buffer_html, user_css, format->try_xml, format->try_html5, format->patch_mobi); | |
| 202 } | |
| 203 fz_always(ctx) | |
| 204 { | |
| 205 fz_drop_buffer(ctx, buffer_html); | |
| 206 } | |
| 207 fz_catch(ctx) | |
| 208 { | |
| 209 fz_drop_html(ctx, html); | |
| 210 fz_rethrow(ctx); | |
| 211 } | |
| 212 return html; | |
| 213 } | |
| 214 | |
| 215 fz_document * | |
| 216 fz_htdoc_open_document_with_buffer(fz_context *ctx, fz_archive *dir, fz_buffer *buf, const fz_htdoc_format_t *format) | |
| 217 { | |
| 218 html_document *doc = NULL; | |
| 219 | |
| 220 fz_var(doc); | |
| 221 fz_var(dir); | |
| 222 | |
| 223 fz_try(ctx) | |
| 224 { | |
| 225 doc = fz_new_derived_document(ctx, html_document); | |
| 226 doc->super.drop_document = htdoc_drop_document; | |
| 227 doc->super.layout = htdoc_layout; | |
| 228 doc->super.load_outline = htdoc_load_outline; | |
| 229 doc->super.resolve_link_dest = htdoc_resolve_link; | |
| 230 doc->super.make_bookmark = htdoc_make_bookmark; | |
| 231 doc->super.lookup_bookmark = htdoc_lookup_bookmark; | |
| 232 doc->super.count_pages = htdoc_count_pages; | |
| 233 doc->super.load_page = htdoc_load_page; | |
| 234 doc->super.lookup_metadata = htdoc_lookup_metadata; | |
| 235 doc->super.is_reflowable = 1; | |
| 236 | |
| 237 doc->zip = fz_keep_archive(ctx, dir); | |
| 238 doc->format = format; | |
| 239 doc->set = fz_new_html_font_set(ctx); | |
| 240 doc->html = generic_parse(ctx, doc->set, doc->zip, ".", buf, fz_user_css(ctx), format); | |
| 241 doc->outline = fz_load_html_outline(ctx, doc->html); | |
| 242 } | |
| 243 fz_always(ctx) | |
| 244 fz_drop_buffer(ctx, buf); | |
| 245 fz_catch(ctx) | |
| 246 { | |
| 247 fz_drop_document(ctx, &doc->super); | |
| 248 fz_rethrow(ctx); | |
| 249 } | |
| 250 | |
| 251 return (fz_document*)doc; | |
| 252 } | |
| 253 | |
| 254 fz_document * | |
| 255 fz_htdoc_open_document_with_stream_and_dir(fz_context *ctx, fz_stream *stm, fz_archive *dir, const fz_htdoc_format_t *format) | |
| 256 { | |
| 257 fz_buffer *buf = NULL; | |
| 258 | |
| 259 if (stm) | |
| 260 buf = fz_read_all(ctx, stm, 0); | |
| 261 | |
| 262 return fz_htdoc_open_document_with_buffer(ctx, dir, buf, format); | |
| 263 } | |
| 264 | |
| 265 /* Variant specific functions */ | |
| 266 | |
| 267 /* Generic HTML document handler */ | |
| 268 | |
/* Return 1 for space, tab, line feed, carriage return or form feed; 0 otherwise. */
static int isws(int c)
{
	switch (c)
	{
	case 32: /* space */
	case 9:  /* tab */
	case 10: /* line feed */
	case 13: /* carriage return */
	case 12: /* form feed */
		return 1;
	default:
		return 0;
	}
}
| 273 | |
| 274 static int recognize_html_content(fz_context *ctx, const fz_document_handler *handler, fz_stream *stream, fz_archive *dir, void **hstate, fz_document_recognize_state_free_fn **free_state, int xhtml) | |
| 275 { | |
| 276 uint8_t buffer[4096]; | |
| 277 size_t i, n, m; | |
| 278 enum { | |
| 279 state_top, | |
| 280 state_open, | |
| 281 state_pling, | |
| 282 state_query, | |
| 283 state_maybe_doctype, | |
| 284 state_maybe_doctype_ws, | |
| 285 state_maybe_doctype_html, | |
| 286 state_maybe_doctype_html_xhtml, | |
| 287 state_maybe_comment, | |
| 288 state_maybe_html, | |
| 289 state_maybe_html_xhtml, | |
| 290 state_comment | |
| 291 }; | |
| 292 int state = state_top; | |
| 293 int type = 0; | |
| 294 | |
| 295 if (hstate) | |
| 296 *hstate = NULL; | |
| 297 if (free_state) | |
| 298 *free_state = NULL; | |
| 299 | |
| 300 if (stream == NULL) | |
| 301 return 0; | |
| 302 | |
| 303 /* Simple state machine. Search for "<!doctype html" or "<html" in the first | |
| 304 * 4K of the file, allowing for comments and whitespace and case insensitivity. */ | |
| 305 | |
| 306 n = fz_read(ctx, stream, buffer, sizeof(buffer)); | |
| 307 fz_seek(ctx, stream, 0, SEEK_SET); | |
| 308 if (n == 0) | |
| 309 return 0; | |
| 310 | |
| 311 i = 0; | |
| 312 if (n >= 3 && buffer[0] == 0xEF && buffer[1] == 0xBB && buffer[2] == 0xBF) | |
| 313 { | |
| 314 /* UTF-8 encoded BOM. Just skip it. */ | |
| 315 i = 3; | |
| 316 } | |
| 317 else if (n >= 2 && buffer[0] == 0xFE && buffer[1] == 0xFF) | |
| 318 { | |
| 319 /* UTF-16, big endian. */ | |
| 320 type = 1; | |
| 321 i = 2; | |
| 322 n &= ~1; | |
| 323 } | |
| 324 else if (n >= 2 && buffer[0] == 0xFF && buffer[1] == 0xFE) | |
| 325 { | |
| 326 /* UTF-16, little endian. */ | |
| 327 i = 2; | |
| 328 type = 2; | |
| 329 n &= ~1; | |
| 330 } | |
| 331 | |
| 332 while (i < n) | |
| 333 { | |
| 334 int c; | |
| 335 | |
| 336 switch (type) | |
| 337 { | |
| 338 case 0: /* UTF-8 */ | |
| 339 c = buffer[i++]; | |
| 340 break; | |
| 341 case 1: /* UTF-16 - big endian */ | |
| 342 c = buffer[i++] << 8; | |
| 343 c |= buffer[i++]; | |
| 344 break; | |
| 345 case 2: /* UTF-16 - little endian */ | |
| 346 c = buffer[i++]; | |
| 347 c |= buffer[i++] << 8; | |
| 348 break; | |
| 349 } | |
| 350 | |
| 351 switch (state) | |
| 352 { | |
| 353 case state_top: | |
| 354 if (isws(c)) | |
| 355 continue; /* whitespace */ | |
| 356 if (c == '<') | |
| 357 state = state_open; | |
| 358 else | |
| 359 return 0; /* Non whitespace found at the top level prior to a known tag. Fail. */ | |
| 360 break; | |
| 361 case state_open: | |
| 362 if (isws(c)) | |
| 363 continue; /* whitespace */ | |
| 364 if (c == '!') | |
| 365 state = state_pling; | |
| 366 else if (c == '?') | |
| 367 state = state_query; | |
| 368 else if (c == 'h' || c == 'H') | |
| 369 state = state_maybe_html; | |
| 370 else | |
| 371 return 0; /* Not an acceptable opening tag. */ | |
| 372 m = 0; | |
| 373 break; | |
| 374 case state_query: | |
| 375 if (c == '>') | |
| 376 state = state_top; | |
| 377 break; | |
| 378 case state_pling: | |
| 379 if (isws(c)) | |
| 380 continue; /* whitespace */ | |
| 381 else if (c == '-') | |
| 382 state = state_maybe_comment; | |
| 383 else if (c == 'd' || c == 'D') | |
| 384 state = state_maybe_doctype; | |
| 385 else | |
| 386 return 0; /* Not an acceptable opening tag. */ | |
| 387 break; | |
| 388 case state_maybe_comment: | |
| 389 if (c == '-') | |
| 390 state = state_comment; | |
| 391 else | |
| 392 return 0; /* Not an acceptable opening tag. */ | |
| 393 break; | |
| 394 case state_comment: | |
| 395 if (c == '-') | |
| 396 { | |
| 397 m++; | |
| 398 } | |
| 399 else if (c == '>' && m >= 2) | |
| 400 { | |
| 401 state = state_top; | |
| 402 } | |
| 403 else | |
| 404 m = 0; | |
| 405 break; | |
| 406 case state_maybe_doctype: | |
| 407 if (c == "octype"[m] || c == "OCTYPE"[m]) | |
| 408 { | |
| 409 m++; | |
| 410 if (m == 6) | |
| 411 { | |
| 412 state = state_maybe_doctype_ws; | |
| 413 m = 0; | |
| 414 } | |
| 415 } | |
| 416 else | |
| 417 return 0; /* Not an acceptable opening tag. */ | |
| 418 break; | |
| 419 case state_maybe_doctype_ws: | |
| 420 if (isws(c)) | |
| 421 m++; | |
| 422 else if (m > 0 && (c == 'h' || c == 'H')) | |
| 423 { | |
| 424 state = state_maybe_doctype_html; | |
| 425 m = 0; | |
| 426 } | |
| 427 else | |
| 428 return 0; /* Not an acceptable opening tag. */ | |
| 429 break; | |
| 430 case state_maybe_doctype_html: | |
| 431 if (c == "tml"[m] || c == "TML"[m]) | |
| 432 { | |
| 433 m++; | |
| 434 if (m == 3) | |
| 435 { | |
| 436 state = state_maybe_doctype_html_xhtml; | |
| 437 m = 0; | |
| 438 } | |
| 439 } | |
| 440 else | |
| 441 return 0; /* Not an acceptable opening tag. */ | |
| 442 break; | |
| 443 case state_maybe_doctype_html_xhtml: | |
| 444 if (c == '>') | |
| 445 { | |
| 446 /* Not xhtml - the xhtml agent can handle this at a pinch (so 25), | |
| 447 * but we'd rather the html one did (75). */ | |
| 448 return xhtml ? 25 : 75; | |
| 449 } | |
| 450 if (c >= 'A' && c <= 'Z') | |
| 451 c += 'a'-'A'; | |
| 452 if (c == "xhtml"[m]) | |
| 453 { | |
| 454 m++; | |
| 455 if (m == 5) | |
| 456 { | |
| 457 /* xhtml - the xhtml agent would be better (75) than the html | |
| 458 * agent (25). */ | |
| 459 return xhtml ? 75 : 25; | |
| 460 } | |
| 461 } | |
| 462 else | |
| 463 m = 0; | |
| 464 break; | |
| 465 case state_maybe_html: | |
| 466 if (c == "tml"[m] || c == "TML"[m]) | |
| 467 { | |
| 468 m++; | |
| 469 if (m == 3) | |
| 470 { | |
| 471 state = state_maybe_html_xhtml; | |
| 472 m = 0; | |
| 473 } | |
| 474 } | |
| 475 else | |
| 476 return 0; /* Not an acceptable opening tag. */ | |
| 477 break; | |
| 478 case state_maybe_html_xhtml: | |
| 479 if (c == '>') | |
| 480 { | |
| 481 /* Not xhtml - the xhtml agent can handle this at a pinch (so 25), | |
| 482 * but we'd rather the html one did (75). */ | |
| 483 return xhtml ? 25 : 75; | |
| 484 } | |
| 485 if (c >= 'A' && c <= 'Z') | |
| 486 c += 'a'-'A'; | |
| 487 if (c == "xhtml"[m]) | |
| 488 { | |
| 489 m++; | |
| 490 if (m == 5) | |
| 491 { | |
| 492 /* xhtml - the xhtml agent would be better (75) than the html | |
| 493 * agent (25). */ | |
| 494 return xhtml ? 75 : 25; | |
| 495 } | |
| 496 } | |
| 497 else | |
| 498 m = 0; | |
| 499 break; | |
| 500 } | |
| 501 } | |
| 502 | |
| 503 return 0; | |
| 504 } | |
| 505 | |
| 506 int htdoc_recognize_html_content(fz_context *ctx, const fz_document_handler *handler, fz_stream *stream, fz_archive *dir, void **hstate, fz_document_recognize_state_free_fn **free_state) | |
| 507 { | |
| 508 return recognize_html_content(ctx, handler, stream, dir, hstate, free_state, 0); | |
| 509 } | |
| 510 | |
| 511 static const fz_htdoc_format_t fz_htdoc_html5 = | |
| 512 { | |
| 513 "HTML5", | |
| 514 NULL, | |
| 515 0, 1, 0 | |
| 516 }; | |
| 517 | |
| 518 static fz_document * | |
| 519 htdoc_open_document(fz_context *ctx, const fz_document_handler *handler, fz_stream *file, fz_stream *accel, fz_archive *dir, void *state) | |
| 520 { | |
| 521 return fz_htdoc_open_document_with_stream_and_dir(ctx, file, dir, &fz_htdoc_html5); | |
| 522 } | |
| 523 | |
| 524 static const char *htdoc_extensions[] = | |
| 525 { | |
| 526 "htm", | |
| 527 "html", | |
| 528 NULL | |
| 529 }; | |
| 530 | |
| 531 static const char *htdoc_mimetypes[] = | |
| 532 { | |
| 533 "text/html", | |
| 534 NULL | |
| 535 }; | |
| 536 | |
| 537 fz_document_handler html_document_handler = | |
| 538 { | |
| 539 NULL, | |
| 540 htdoc_open_document, | |
| 541 htdoc_extensions, | |
| 542 htdoc_mimetypes, | |
| 543 htdoc_recognize_html_content, | |
| 544 1 | |
| 545 }; | |
| 546 | |
| 547 /* XHTML document handler */ | |
| 548 | |
| 549 static const fz_htdoc_format_t fz_htdoc_xhtml = | |
| 550 { | |
| 551 "XHTML", | |
| 552 NULL, | |
| 553 1, 1, 0 | |
| 554 }; | |
| 555 | |
| 556 static fz_document * | |
| 557 xhtdoc_open_document(fz_context *ctx, const fz_document_handler *handler, fz_stream *file, fz_stream *accel, fz_archive *dir, void *state) | |
| 558 { | |
| 559 return fz_htdoc_open_document_with_stream_and_dir(ctx, file, dir, &fz_htdoc_xhtml); | |
| 560 } | |
| 561 | |
| 562 int xhtdoc_recognize_xhtml_content(fz_context *ctx, const fz_document_handler *handler, fz_stream *stream, fz_archive *dir, void **hstate, fz_document_recognize_state_free_fn **free_state) | |
| 563 { | |
| 564 return recognize_html_content(ctx, handler, stream, dir, hstate, free_state, 1); | |
| 565 } | |
| 566 | |
| 567 static const char *xhtdoc_extensions[] = | |
| 568 { | |
| 569 "xhtml", | |
| 570 NULL | |
| 571 }; | |
| 572 | |
| 573 static const char *xhtdoc_mimetypes[] = | |
| 574 { | |
| 575 "application/xhtml+xml", | |
| 576 NULL | |
| 577 }; | |
| 578 | |
| 579 fz_document_handler xhtml_document_handler = | |
| 580 { | |
| 581 NULL, | |
| 582 xhtdoc_open_document, | |
| 583 xhtdoc_extensions, | |
| 584 xhtdoc_mimetypes, | |
| 585 xhtdoc_recognize_xhtml_content, | |
| 586 1 | |
| 587 }; | |
| 588 | |
| 589 /* FB2 document handler */ | |
| 590 | |
| 591 static const fz_htdoc_format_t fz_htdoc_fb2 = | |
| 592 { | |
| 593 "FictionBook2", | |
| 594 NULL, | |
| 595 1, 0, 0 | |
| 596 }; | |
| 597 | |
| 598 static fz_document * | |
| 599 fb2doc_open_document(fz_context *ctx, const fz_document_handler *handler, fz_stream *file, fz_stream *accel, fz_archive *dir, void *state) | |
| 600 { | |
| 601 return fz_htdoc_open_document_with_stream_and_dir(ctx, file, dir, &fz_htdoc_fb2); | |
| 602 } | |
| 603 | |
| 604 static int | |
| 605 fb2doc_recognize_content(fz_context *ctx, const fz_document_handler *handler, fz_stream *stream, fz_archive *dir, void **state, fz_document_recognize_state_free_fn **free_state) | |
| 606 { | |
| 607 const char *match = "<FictionBook"; | |
| 608 int pos = 0; | |
| 609 int n = 4096; | |
| 610 int c; | |
| 611 | |
| 612 if (state) | |
| 613 *state = NULL; | |
| 614 if (free_state) | |
| 615 *free_state = NULL; | |
| 616 | |
| 617 if (stream == NULL) | |
| 618 return 0; | |
| 619 | |
| 620 do | |
| 621 { | |
| 622 c = fz_read_byte(ctx, stream); | |
| 623 if (c == EOF) | |
| 624 return 0; | |
| 625 if (c == match[pos]) | |
| 626 { | |
| 627 pos++; | |
| 628 if (pos == 12) | |
| 629 return 100; | |
| 630 } | |
| 631 else | |
| 632 { | |
| 633 /* Restart matching, but recheck c against the start. */ | |
| 634 pos = (c == match[0]); | |
| 635 } | |
| 636 } | |
| 637 while (--n > 0); | |
| 638 | |
| 639 return 0; | |
| 640 } | |
| 641 | |
| 642 static const char *fb2doc_extensions[] = | |
| 643 { | |
| 644 "fb2", | |
| 645 "xml", | |
| 646 NULL | |
| 647 }; | |
| 648 | |
| 649 static const char *fb2doc_mimetypes[] = | |
| 650 { | |
| 651 "application/x-fictionbook", | |
| 652 "application/xml", | |
| 653 "text/xml", | |
| 654 NULL | |
| 655 }; | |
| 656 | |
| 657 fz_document_handler fb2_document_handler = | |
| 658 { | |
| 659 NULL, | |
| 660 fb2doc_open_document, | |
| 661 fb2doc_extensions, | |
| 662 fb2doc_mimetypes, | |
| 663 fb2doc_recognize_content | |
| 664 }; | |
| 665 | |
| 666 /* Mobi document handler */ | |
| 667 | |
| 668 static const fz_htdoc_format_t fz_htdoc_mobi = | |
| 669 { | |
| 670 "MOBI", | |
| 671 NULL, | |
| 672 1, 1, 1 | |
| 673 }; | |
| 674 | |
| 675 static fz_document * | |
| 676 mobi_open_document_with_buffer(fz_context *ctx, fz_buffer *mobi) | |
| 677 { | |
| 678 fz_archive *dir = NULL; | |
| 679 fz_buffer *html; | |
| 680 fz_document *doc; | |
| 681 fz_var(dir); | |
| 682 fz_try(ctx) | |
| 683 { | |
| 684 dir = fz_extract_html_from_mobi(ctx, mobi); | |
| 685 html = fz_read_archive_entry(ctx, dir, "index.html"); | |
| 686 doc = fz_htdoc_open_document_with_buffer(ctx, dir, html, &fz_htdoc_mobi); | |
| 687 } | |
| 688 fz_always(ctx) | |
| 689 { | |
| 690 fz_drop_buffer(ctx, mobi); | |
| 691 fz_drop_archive(ctx, dir); | |
| 692 } | |
| 693 fz_catch(ctx) | |
| 694 { | |
| 695 fz_rethrow(ctx); | |
| 696 } | |
| 697 return doc; | |
| 698 } | |
| 699 | |
| 700 static int | |
| 701 mobi_recognize_content(fz_context *ctx, const fz_document_handler *handler, fz_stream *stream, fz_archive *dir, void **state, fz_document_recognize_state_free_fn **free_state) | |
| 702 { | |
| 703 char text[8]; | |
| 704 | |
| 705 if (state) | |
| 706 *state = NULL; | |
| 707 if (free_state) | |
| 708 *free_state = NULL; | |
| 709 | |
| 710 if (stream == NULL) | |
| 711 return 0; | |
| 712 | |
| 713 fz_seek(ctx, stream, 32 + 28, SEEK_SET); | |
| 714 if (fz_read(ctx, stream, (unsigned char *)text, 8) != 8) | |
| 715 return 0; | |
| 716 if (memcmp(text, "BOOKMOBI", 8) == 0) | |
| 717 return 100; | |
| 718 if (memcmp(text, "TEXtREAd", 8) == 0) | |
| 719 return 100; | |
| 720 | |
| 721 return 0; | |
| 722 } | |
| 723 | |
| 724 static fz_document * | |
| 725 mobi_open_document(fz_context *ctx, const fz_document_handler *handler, fz_stream *file, fz_stream *accel, fz_archive *dir, void *state) | |
| 726 { | |
| 727 return mobi_open_document_with_buffer(ctx, fz_read_all(ctx, file, 0)); | |
| 728 } | |
| 729 | |
| 730 static const char *mobi_extensions[] = | |
| 731 { | |
| 732 "mobi", | |
| 733 "prc", | |
| 734 "pdb", | |
| 735 NULL | |
| 736 }; | |
| 737 | |
| 738 static const char *mobi_mimetypes[] = | |
| 739 { | |
| 740 "application/x-mobipocket-ebook", | |
| 741 NULL | |
| 742 }; | |
| 743 | |
| 744 fz_document_handler mobi_document_handler = | |
| 745 { | |
| 746 NULL, | |
| 747 mobi_open_document, | |
| 748 mobi_extensions, | |
| 749 mobi_mimetypes, | |
| 750 mobi_recognize_content | |
| 751 }; |
