Mercurial > hgrepos > Python2 > PyMuPDF
diff mupdf-source/source/pdf/pdf-unicode.c @ 2:b50eed0cc0ef upstream
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
The directory name has changed: no version number in the expanded directory now.
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Mon, 15 Sep 2025 11:43:07 +0200 |
| parents | |
| children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/mupdf-source/source/pdf/pdf-unicode.c Mon Sep 15 11:43:07 2025 +0200 @@ -0,0 +1,157 @@ +// Copyright (C) 2004-2021 Artifex Software, Inc. +// +// This file is part of MuPDF. +// +// MuPDF is free software: you can redistribute it and/or modify it under the +// terms of the GNU Affero General Public License as published by the Free +// Software Foundation, either version 3 of the License, or (at your option) +// any later version. +// +// MuPDF is distributed in the hope that it will be useful, but WITHOUT ANY +// WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +// FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more +// details. +// +// You should have received a copy of the GNU Affero General Public License +// along with MuPDF. If not, see <https://www.gnu.org/licenses/agpl-3.0.en.html> +// +// Alternative licensing terms are available from the licensor. +// For commercial licensing, see <https://www.artifex.com/> or contact +// Artifex Software, Inc., 39 Mesa Street, Suite 108A, San Francisco, +// CA 94129, USA, for further information. + +#include "mupdf/fitz.h" +#include "mupdf/pdf.h" + +#include <string.h> + +/* Load or synthesize ToUnicode map for fonts */ + +static void +pdf_remap_cmap_range(fz_context *ctx, pdf_cmap *ucs_from_gid, + unsigned int cpt, unsigned int gid, unsigned int n, pdf_cmap *ucs_from_cpt) +{ + unsigned int k; + int ucsbuf[PDF_MRANGE_CAP]; + int ucslen; + + for (k = 0; k <= n; ++k) + { + ucslen = pdf_lookup_cmap_full(ucs_from_cpt, cpt + k, ucsbuf); + if (ucslen == 1) + pdf_map_range_to_range(ctx, ucs_from_gid, gid + k, gid + k, ucsbuf[0]); + else if (ucslen > 1) + pdf_map_one_to_many(ctx, ucs_from_gid, gid + k, ucsbuf, ucslen); + } +} + +static pdf_cmap * +pdf_remap_cmap(fz_context *ctx, pdf_cmap *gid_from_cpt, pdf_cmap *ucs_from_cpt) +{ + pdf_cmap *ucs_from_gid; + unsigned int a, b, x; + int i; + + ucs_from_gid = pdf_new_cmap(ctx); + + fz_try(ctx) + { + if (gid_from_cpt->usecmap) + ucs_from_gid->usecmap = pdf_remap_cmap(ctx, gid_from_cpt->usecmap, ucs_from_cpt); + + pdf_add_codespace(ctx, ucs_from_gid, 0, 0x7fffffff, 4); + + for (i = 0; i < gid_from_cpt->rlen; ++i) + { + a = gid_from_cpt->ranges[i].low; + b = gid_from_cpt->ranges[i].high; + x = gid_from_cpt->ranges[i].out; + pdf_remap_cmap_range(ctx, ucs_from_gid, a, x, b - a, ucs_from_cpt); + } + + for (i = 0; i < gid_from_cpt->xlen; ++i) + { + a = gid_from_cpt->xranges[i].low; + b = gid_from_cpt->xranges[i].high; + x = gid_from_cpt->xranges[i].out; + pdf_remap_cmap_range(ctx, ucs_from_gid, a, x, b - a, ucs_from_cpt); + } + + /* Font encoding CMaps don't have one-to-many mappings, so we can ignore the mranges. */ + + pdf_sort_cmap(ctx, ucs_from_gid); + } + fz_catch(ctx) + { + pdf_drop_cmap(ctx, ucs_from_gid); + fz_rethrow(ctx); + } + + return ucs_from_gid; +} + +void +pdf_load_to_unicode(fz_context *ctx, pdf_document *doc, pdf_font_desc *font, + const char **strings, char *collection, pdf_obj *cmapstm) +{ + unsigned int cpt; + + if (pdf_is_stream(ctx, cmapstm)) + { + pdf_cmap *ucs_from_cpt = pdf_load_embedded_cmap(ctx, doc, cmapstm); + fz_try(ctx) + font->to_unicode = pdf_remap_cmap(ctx, font->encoding, ucs_from_cpt); + fz_always(ctx) + pdf_drop_cmap(ctx, ucs_from_cpt); + fz_catch(ctx) + fz_rethrow(ctx); + font->size += pdf_cmap_size(ctx, font->to_unicode); + } + + else if (pdf_is_name(ctx, cmapstm)) + { + pdf_cmap *ucs_from_cpt = pdf_load_system_cmap(ctx, pdf_to_name(ctx, cmapstm)); + fz_try(ctx) + font->to_unicode = pdf_remap_cmap(ctx, font->encoding, ucs_from_cpt); + fz_always(ctx) + pdf_drop_cmap(ctx, ucs_from_cpt); + fz_catch(ctx) + fz_rethrow(ctx); + font->size += pdf_cmap_size(ctx, font->to_unicode); + } + + else if (collection) + { + if (!strcmp(collection, "Adobe-CNS1")) + font->to_unicode = pdf_load_system_cmap(ctx, "Adobe-CNS1-UCS2"); + else if (!strcmp(collection, "Adobe-GB1")) + font->to_unicode = pdf_load_system_cmap(ctx, "Adobe-GB1-UCS2"); + else if (!strcmp(collection, "Adobe-Japan1")) + font->to_unicode = pdf_load_system_cmap(ctx, "Adobe-Japan1-UCS2"); + else if (!strcmp(collection, "Adobe-Korea1")) + font->to_unicode = pdf_load_system_cmap(ctx, "Adobe-Korea1-UCS2"); + } + + if (strings) + { + /* TODO one-to-many mappings */ + + font->cid_to_ucs = Memento_label(fz_malloc_array(ctx, 256, unsigned short), "cid_to_ucs"); + font->cid_to_ucs_len = 256; + font->size += 256 * sizeof *font->cid_to_ucs; + + for (cpt = 0; cpt < 256; cpt++) + { + if (strings[cpt]) + font->cid_to_ucs[cpt] = fz_unicode_from_glyph_name(strings[cpt]); + else + font->cid_to_ucs[cpt] = FZ_REPLACEMENT_CHARACTER; + } + } + + if (!font->to_unicode && !font->cid_to_ucs) + { + /* TODO: synthesize a ToUnicode if it's a freetype font with + * cmap and/or post tables or if it has glyph names. */ + } +}
