diff mupdf-source/source/html/mobi.c @ 2:b50eed0cc0ef upstream

ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4. The directory name has changed: no version number in the expanded directory now.
author Franz Glasner <fzglas.hg@dom66.de>
date Mon, 15 Sep 2025 11:43:07 +0200
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/mupdf-source/source/html/mobi.c	Mon Sep 15 11:43:07 2025 +0200
@@ -0,0 +1,347 @@
+// Copyright (C) 2004-2025 Artifex Software, Inc.
+//
+// This file is part of MuPDF.
+//
+// MuPDF is free software: you can redistribute it and/or modify it under the
+// terms of the GNU Affero General Public License as published by the Free
+// Software Foundation, either version 3 of the License, or (at your option)
+// any later version.
+//
+// MuPDF is distributed in the hope that it will be useful, but WITHOUT ANY
+// WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+// FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
+// details.
+//
+// You should have received a copy of the GNU Affero General Public License
+// along with MuPDF. If not, see <https://www.gnu.org/licenses/agpl-3.0.en.html>
+//
+// Alternative licensing terms are available from the licensor.
+// For commercial licensing, see <https://www.artifex.com/> or contact
+// Artifex Software, Inc., 39 Mesa Street, Suite 108A, San Francisco,
+// CA 94129, USA, for further information.
+
+#include "mupdf/fitz.h"
+#include "html-imp.h"
+
+#include <string.h>
+
+// How the decoded text is to be treated: FORMAT_HTML data is used as markup,
+// FORMAT_TEXT data is escaped and wrapped in a minimal HTML document.
+#define FORMAT_HTML 1
+#define FORMAT_TEXT 2
+
+// Values of the 16-bit compression field at the start of the PalmDOC header.
+// mobi_read_data in this file only accepts NONE and PALMDOC; any other value
+// (including HUFF_CDIC) is rejected as an unknown compression method.
+#define COMPRESSION_NONE 1
+#define COMPRESSION_PALMDOC 2
+#define COMPRESSION_HUFF_CDIC 17480
+
+// Values of the text encoding field of the optional MOBI header.
+// LATIN_1 is the value assumed when no MOBI header is present.
+#define TEXT_ENCODING_LATIN_1 0
+#define TEXT_ENCODING_1252 1252
+#define TEXT_ENCODING_UTF8 65001
+
+// Advance stm by exactly len bytes.
+// Throws FZ_ERROR_FORMAT when fz_skip reports that fewer than len bytes
+// could be skipped.
+static void
+skip_bytes(fz_context *ctx, fz_stream *stm, size_t len)
+{
+	if (fz_skip(ctx, stm, len) < len)
+		fz_throw(ctx, FZ_ERROR_FORMAT, "premature end in data");
+}
+
+// Append up to size bytes of uncompressed record text from stm to out.
+// size may not exceed the 4096 byte staging array; larger requests throw
+// FZ_ERROR_FORMAT. A short read only triggers a warning, and the bytes that
+// were actually read are still appended.
+static void
+mobi_read_text_none(fz_context *ctx, fz_buffer *out, fz_stream *stm, uint32_t size)
+{
+	unsigned char chunk[4096];
+	size_t got;
+
+	if (size > sizeof chunk)
+		fz_throw(ctx, FZ_ERROR_FORMAT, "text block too large");
+
+	got = fz_read(ctx, stm, chunk, size);
+	if (got < size)
+		fz_warn(ctx, "premature end in mobi uncompressed text data");
+
+	fz_append_data(ctx, out, chunk, got);
+}
+
+// Decompress one PalmDOC-compressed record from stm, appending to out until
+// out has grown by at least size bytes or the record runs out of input.
+// The final token may push out past the target length. A warning is issued
+// when the input ends before the target length is reached.
+static void
+mobi_read_text_palmdoc(fz_context *ctx, fz_buffer *out, fz_stream *stm, uint32_t size)
+{
+	// https://wiki.mobileread.com/wiki/PalmDOC
+	size_t target = out->len + size;
+
+	while (out->len < target)
+	{
+		int op = fz_read_byte(ctx, stm);
+		if (op == EOF)
+			break;
+
+		if (op >= 0x01 && op <= 0x08)
+		{
+			// Literal run: the next op bytes are copied through unchanged.
+			unsigned char lit[8];
+			size_t got = fz_read(ctx, stm, lit, op);
+			fz_append_data(ctx, out, lit, got);
+			if (got < (size_t) op)
+				break;
+			continue;
+		}
+
+		if (op <= 0x7f)
+		{
+			// 0x00 and 0x09..0x7f stand for themselves.
+			fz_append_byte(ctx, out, op);
+			continue;
+		}
+
+		if (op <= 0xbf)
+		{
+			// Two byte back-reference: 11 bits of distance, 3 bits of length.
+			int lo, pair, back, count;
+
+			lo = fz_read_byte(ctx, stm);
+			if (lo == EOF)
+				break;
+
+			pair = (op << 8) | lo;
+			back = (pair >> 3) & 0x7ff;
+			count = (pair & 7) + 3;
+
+			// References reaching before the start of the output are ignored.
+			if (back > 0 && (size_t)back <= out->len)
+			{
+				int src = (int)(out->len - back);
+				int stop = src + count;
+				// Copy byte by byte and re-read out->data each time: the
+				// source range may overlap the bytes being appended.
+				while (src < stop)
+				{
+					fz_append_byte(ctx, out, out->data[src]);
+					++src;
+				}
+			}
+			continue;
+		}
+
+		if (op <= 0xff)
+		{
+			// 0xc0..0xff: a space followed by the byte with its top bit flipped.
+			fz_append_byte(ctx, out, ' ');
+			fz_append_byte(ctx, out, op ^ 0x80);
+		}
+	}
+
+	if (out->len < target)
+		fz_warn(ctx, "premature end in mobi palmdoc data");
+}
+
+// Decode the text records of a PalmDOC/MOBI database into out.
+//
+// out:         receives the decoded text; on return it holds UTF-8 HTML
+//              (plain text is escaped and wrapped in a minimal HTML document,
+//              Latin-1 and Windows-1252 text is converted to UTF-8).
+// stm:         stream over the whole database.
+// offset:      start offsets of the records; offset[i + 1] is read as the end
+//              of record i, so offset[total_count] must be readable.
+// total_count: number of records described by offset.
+// format:      FORMAT_HTML or FORMAT_TEXT; FORMAT_TEXT is upgraded to
+//              FORMAT_HTML when the decoded data starts with "<html>"/"<HTML>".
+//
+// Returns the record count stored in the PalmDOC header (record 0).
+// Throws FZ_ERROR_FORMAT for truncated headers, unknown compression methods
+// and unknown text encodings.
+static uint32_t
+mobi_read_data(fz_context *ctx, fz_buffer *out, fz_stream *stm, uint32_t *offset, uint32_t total_count, int format)
+{
+	// https://wiki.mobileread.com/wiki/MOBI
+	uint32_t compression, text_length, record_count, text_encoding, i;
+	unsigned char buf[4];
+	fz_range range = { 0 };
+	fz_stream *rec = NULL;
+	size_t n;
+
+	fz_var(rec);
+
+	fz_try(ctx)
+	{
+		range.offset = offset[0];
+		range.length = offset[1] - offset[0];
+		rec = fz_open_range_filter(ctx, stm, &range, 1);
+
+		// PalmDOC header
+		compression = fz_read_uint16(ctx, rec);
+		skip_bytes(ctx, rec, 2);
+		text_length = fz_read_uint32(ctx, rec);
+		record_count = fz_read_uint16(ctx, rec);
+		skip_bytes(ctx, rec, 2);
+		skip_bytes(ctx, rec, 2); // encryption
+		skip_bytes(ctx, rec, 2);
+
+		// Optional MOBI header
+		text_encoding = TEXT_ENCODING_LATIN_1;
+		n = fz_read(ctx, rec, buf, 4);
+		if (n == 4 && !memcmp(buf, "MOBI", 4))
+		{
+			skip_bytes(ctx, rec, 4);
+			skip_bytes(ctx, rec, 4);
+			text_encoding = fz_read_uint32(ctx, rec);
+		}
+	}
+	fz_always(ctx)
+		fz_drop_stream(ctx, rec);
+	fz_catch(ctx)
+		fz_rethrow(ctx);
+
+	if (compression != COMPRESSION_NONE && compression != COMPRESSION_PALMDOC)
+		fz_throw(ctx, FZ_ERROR_FORMAT, "unknown compression method");
+	if (text_encoding != TEXT_ENCODING_LATIN_1 &&
+		text_encoding != TEXT_ENCODING_1252 &&
+		text_encoding != TEXT_ENCODING_UTF8)
+		fz_throw(ctx, FZ_ERROR_FORMAT, "unknown text encoding");
+
+	for (i = 1; i <= record_count && i < total_count; ++i)
+	{
+		// Clamp at zero: PalmDOC decoding may overshoot text_length by a few
+		// bytes, and an unsigned subtraction would then wrap to a huge value
+		// and keep decoding past the declared end of the text.
+		uint32_t remain = out->len < text_length ? text_length - (uint32_t)out->len : 0;
+		uint32_t size = remain < 4096 ? remain : 4096;
+
+		// Forget the stream released by the previous fz_always. Otherwise a
+		// throw from fz_open_range_filter (before rec is reassigned) would
+		// make fz_always drop the already released stream a second time.
+		rec = NULL;
+
+		fz_try(ctx)
+		{
+			range.offset = offset[i];
+			range.length = offset[i + 1] - offset[i];
+			rec = fz_open_range_filter(ctx, stm, &range, 1);
+
+			if (compression == COMPRESSION_NONE)
+				mobi_read_text_none(ctx, out, rec, size);
+			else
+				mobi_read_text_palmdoc(ctx, out, rec, size);
+		}
+		fz_always(ctx)
+			fz_drop_stream(ctx, rec);
+		fz_catch(ctx)
+			fz_rethrow(ctx);
+	}
+
+	// Text databases that actually contain HTML are passed through as HTML.
+	if (format == FORMAT_TEXT && out->len > 6)
+	{
+		if (!memcmp(out->data, "<html>", 6) || !memcmp(out->data, "<HTML>", 6))
+			format = FORMAT_HTML;
+	}
+
+	// Re-encode when the text is not UTF-8 already, or needs HTML escaping.
+	if (text_encoding != TEXT_ENCODING_UTF8 || format == FORMAT_TEXT)
+	{
+		unsigned char *p;
+		size_t j, z = fz_buffer_extract(ctx, out, &p);
+
+		// p is owned by this block from here on; the appends below can throw,
+		// so it is released in fz_always rather than only on the success path.
+		fz_try(ctx)
+		{
+			fz_resize_buffer(ctx, out, 0);
+			if (format == FORMAT_TEXT)
+				fz_append_string(ctx, out, "<html><head><style>body{white-space:pre-wrap}</style></head><body>");
+			for (j = 0; j < z; ++j)
+			{
+				int c = p[j];
+				if (format == FORMAT_TEXT && (c == '<' || c == '>' || c == '&'))
+				{
+					if (c == '<')
+						fz_append_string(ctx, out, "&lt;");
+					else if (c == '>')
+						fz_append_string(ctx, out, "&gt;");
+					else if (c == '&')
+						fz_append_string(ctx, out, "&amp;");
+				}
+				else
+				{
+					switch (text_encoding)
+					{
+					case TEXT_ENCODING_UTF8:
+						fz_append_byte(ctx, out, c);
+						break;
+					case TEXT_ENCODING_LATIN_1:
+						fz_append_rune(ctx, out, c);
+						break;
+					case TEXT_ENCODING_1252:
+						fz_append_rune(ctx, out, fz_unicode_from_windows_1252[c]);
+						break;
+					}
+				}
+			}
+			if (format == FORMAT_TEXT)
+				fz_append_string(ctx, out, "</body></html>");
+		}
+		fz_always(ctx)
+			fz_free(ctx, p);
+		fz_catch(ctx)
+			fz_rethrow(ctx);
+	}
+
+	return record_count;
+}
+
+// Value destructor handed to fz_drop_tree: every value stored in the tree
+// built by this file is an fz_buffer, so release it as one.
+static void drop_tree_entry(fz_context *ctx, void *ent)
+{
+	fz_buffer *entry = ent;
+
+	fz_drop_buffer(ctx, entry);
+}
+
+// Build an archive from a MOBI/PRC/PalmDOC database held in mobi.
+//
+// The decoded text is stored under the name "index.html". Every later record
+// larger than 8 bytes that fz_recognize_image_format accepts is copied into
+// the archive under a five digit, 1-based running number ("00001", ...).
+//
+// Throws FZ_ERROR_FORMAT when the data is truncated or no usable record
+// offsets are found; errors from the text decoder are passed on.
+fz_archive *
+fz_extract_html_from_mobi(fz_context *ctx, fz_buffer *mobi)
+{
+	fz_stream *stm = NULL;
+	fz_buffer *buffer = NULL;
+	fz_tree *tree = NULL;
+	uint32_t *offsets = NULL;
+	char buf[32];
+	uint32_t i, k, extra;
+	uint32_t recindex;
+	uint32_t minoffset, maxoffset;
+	int format = FORMAT_TEXT;
+	size_t n;
+
+	// https://wiki.mobileread.com/wiki/PalmDOC
+
+	fz_var(stm);
+	fz_var(buffer);
+	fz_var(offsets);
+	fz_var(tree);
+
+	fz_try(ctx)
+	{
+		stm = fz_open_buffer(ctx, mobi);
+
+		skip_bytes(ctx, stm, 32); // database name
+		skip_bytes(ctx, stm, 28); // database attributes, version, dates, etc
+
+		n = fz_read(ctx, stm, (unsigned char *)buf, 8); // database type and creator
+		buf[8] = 0;
+
+		if (n == 8 && !memcmp(buf, "BOOKMOBI", 8))
+			format = FORMAT_HTML;
+		else if (n == 8 && !memcmp(buf, "TEXtREAd", 8))
+			format = FORMAT_TEXT;
+		else if (n != 8)
+			fz_warn(ctx, "premature end in data");
+		else
+			fz_warn(ctx, "Unknown MOBI/PRC format: %s.", buf);
+
+		skip_bytes(ctx, stm, 8); // database internal fields
+
+		// record info list count
+		n = fz_read_uint16(ctx, stm);
+
+		// Valid record offsets lie after the record info list (8 bytes per
+		// entry) and before the end of the data.
+		minoffset = (uint32_t)(fz_tell(ctx, stm) + n * 2 * sizeof (uint32_t) - 1);
+		maxoffset = (uint32_t)mobi->len;
+
+		// record info list
+		offsets = fz_malloc_array(ctx, n + 1, uint32_t);
+		for (i = 0, k = 0; i < n; ++i)
+		{
+			uint32_t offset = fz_read_uint32(ctx, stm);
+			// Always consume the second half of the 8 byte entry, also for
+			// entries that get rejected below. Otherwise those 4 bytes would
+			// be parsed as the next offset and the tail of the list would
+			// never be read.
+			skip_bytes(ctx, stm, 4);
+			// Keep only offsets that are in bounds and strictly increasing.
+			if (offset <= minoffset)
+				continue;
+			if (offset >= maxoffset)
+				continue;
+			minoffset = offsets[k++] = offset;
+		}
+		// Sentinel: the last record ends where the data ends.
+		offsets[k] = (uint32_t)mobi->len;
+
+		// adjust n in case some out of bound offsets were skipped
+		n = k;
+		if (n == 0)
+			fz_throw(ctx, FZ_ERROR_FORMAT, "no mobi records to read");
+
+		// decompress text data
+		buffer = fz_new_buffer(ctx, 128 << 10);
+		extra = mobi_read_data(ctx, buffer, stm, offsets, (uint32_t)n, format);
+		fz_terminate_buffer(ctx, buffer);
+
+#ifndef NDEBUG
+		if (fz_atoi(getenv("FZ_DEBUG_MOBI")))
+			fz_save_buffer(ctx, buffer, "mobi.xhtml");
+#endif
+
+		// The tree takes over the buffer; clear the local so the catch block
+		// does not drop it a second time.
+		tree = fz_tree_insert(ctx, tree, "index.html", buffer);
+		buffer = NULL;
+
+		// copy image data records into tree
+		recindex = 1;
+		for (i = extra; i < n; ++i)
+		{
+			uint32_t size = offsets[i+1] - offsets[i];
+			if (size > 8)
+			{
+				unsigned char *data = mobi->data + offsets[i];
+				if (fz_recognize_image_format(ctx, data))
+				{
+					buffer = fz_new_buffer_from_copied_data(ctx, data, size);
+					fz_snprintf(buf, sizeof buf, "%05d", recindex);
+					tree = fz_tree_insert(ctx, tree, buf, buffer);
+					buffer = NULL;
+					recindex++;
+				}
+			}
+		}
+	}
+	fz_always(ctx)
+	{
+		fz_drop_stream(ctx, stm);
+		fz_free(ctx, offsets);
+	}
+	fz_catch(ctx)
+	{
+		fz_drop_buffer(ctx, buffer);
+		fz_drop_tree(ctx, tree, drop_tree_entry);
+		fz_rethrow(ctx);
+	}
+
+	// NOTE(review): if fz_new_tree_archive can throw before taking ownership
+	// of tree, the tree would leak here - confirm against its implementation.
+	return fz_new_tree_archive(ctx, tree);
+}