comparison mupdf-source/source/pdf/pdf-unicode.c @ 2:b50eed0cc0ef upstream

ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4. The directory name has changed: no version number in the expanded directory now.
author Franz Glasner <fzglas.hg@dom66.de>
date Mon, 15 Sep 2025 11:43:07 +0200
parents
children
comparison
equal deleted inserted replaced
1:1d09e1dec1d9 2:b50eed0cc0ef
1 // Copyright (C) 2004-2021 Artifex Software, Inc.
2 //
3 // This file is part of MuPDF.
4 //
5 // MuPDF is free software: you can redistribute it and/or modify it under the
6 // terms of the GNU Affero General Public License as published by the Free
7 // Software Foundation, either version 3 of the License, or (at your option)
8 // any later version.
9 //
10 // MuPDF is distributed in the hope that it will be useful, but WITHOUT ANY
11 // WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
12 // FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
13 // details.
14 //
15 // You should have received a copy of the GNU Affero General Public License
16 // along with MuPDF. If not, see <https://www.gnu.org/licenses/agpl-3.0.en.html>
17 //
18 // Alternative licensing terms are available from the licensor.
19 // For commercial licensing, see <https://www.artifex.com/> or contact
20 // Artifex Software, Inc., 39 Mesa Street, Suite 108A, San Francisco,
21 // CA 94129, USA, for further information.
22
23 #include "mupdf/fitz.h"
24 #include "mupdf/pdf.h"
25
26 #include <string.h>
27
28 /* Load or synthesize ToUnicode map for fonts */
29
30 static void
31 pdf_remap_cmap_range(fz_context *ctx, pdf_cmap *ucs_from_gid,
32 unsigned int cpt, unsigned int gid, unsigned int n, pdf_cmap *ucs_from_cpt)
33 {
34 unsigned int k;
35 int ucsbuf[PDF_MRANGE_CAP];
36 int ucslen;
37
38 for (k = 0; k <= n; ++k)
39 {
40 ucslen = pdf_lookup_cmap_full(ucs_from_cpt, cpt + k, ucsbuf);
41 if (ucslen == 1)
42 pdf_map_range_to_range(ctx, ucs_from_gid, gid + k, gid + k, ucsbuf[0]);
43 else if (ucslen > 1)
44 pdf_map_one_to_many(ctx, ucs_from_gid, gid + k, ucsbuf, ucslen);
45 }
46 }
47
48 static pdf_cmap *
49 pdf_remap_cmap(fz_context *ctx, pdf_cmap *gid_from_cpt, pdf_cmap *ucs_from_cpt)
50 {
51 pdf_cmap *ucs_from_gid;
52 unsigned int a, b, x;
53 int i;
54
55 ucs_from_gid = pdf_new_cmap(ctx);
56
57 fz_try(ctx)
58 {
59 if (gid_from_cpt->usecmap)
60 ucs_from_gid->usecmap = pdf_remap_cmap(ctx, gid_from_cpt->usecmap, ucs_from_cpt);
61
62 pdf_add_codespace(ctx, ucs_from_gid, 0, 0x7fffffff, 4);
63
64 for (i = 0; i < gid_from_cpt->rlen; ++i)
65 {
66 a = gid_from_cpt->ranges[i].low;
67 b = gid_from_cpt->ranges[i].high;
68 x = gid_from_cpt->ranges[i].out;
69 pdf_remap_cmap_range(ctx, ucs_from_gid, a, x, b - a, ucs_from_cpt);
70 }
71
72 for (i = 0; i < gid_from_cpt->xlen; ++i)
73 {
74 a = gid_from_cpt->xranges[i].low;
75 b = gid_from_cpt->xranges[i].high;
76 x = gid_from_cpt->xranges[i].out;
77 pdf_remap_cmap_range(ctx, ucs_from_gid, a, x, b - a, ucs_from_cpt);
78 }
79
80 /* Font encoding CMaps don't have one-to-many mappings, so we can ignore the mranges. */
81
82 pdf_sort_cmap(ctx, ucs_from_gid);
83 }
84 fz_catch(ctx)
85 {
86 pdf_drop_cmap(ctx, ucs_from_gid);
87 fz_rethrow(ctx);
88 }
89
90 return ucs_from_gid;
91 }
92
93 void
94 pdf_load_to_unicode(fz_context *ctx, pdf_document *doc, pdf_font_desc *font,
95 const char **strings, char *collection, pdf_obj *cmapstm)
96 {
97 unsigned int cpt;
98
99 if (pdf_is_stream(ctx, cmapstm))
100 {
101 pdf_cmap *ucs_from_cpt = pdf_load_embedded_cmap(ctx, doc, cmapstm);
102 fz_try(ctx)
103 font->to_unicode = pdf_remap_cmap(ctx, font->encoding, ucs_from_cpt);
104 fz_always(ctx)
105 pdf_drop_cmap(ctx, ucs_from_cpt);
106 fz_catch(ctx)
107 fz_rethrow(ctx);
108 font->size += pdf_cmap_size(ctx, font->to_unicode);
109 }
110
111 else if (pdf_is_name(ctx, cmapstm))
112 {
113 pdf_cmap *ucs_from_cpt = pdf_load_system_cmap(ctx, pdf_to_name(ctx, cmapstm));
114 fz_try(ctx)
115 font->to_unicode = pdf_remap_cmap(ctx, font->encoding, ucs_from_cpt);
116 fz_always(ctx)
117 pdf_drop_cmap(ctx, ucs_from_cpt);
118 fz_catch(ctx)
119 fz_rethrow(ctx);
120 font->size += pdf_cmap_size(ctx, font->to_unicode);
121 }
122
123 else if (collection)
124 {
125 if (!strcmp(collection, "Adobe-CNS1"))
126 font->to_unicode = pdf_load_system_cmap(ctx, "Adobe-CNS1-UCS2");
127 else if (!strcmp(collection, "Adobe-GB1"))
128 font->to_unicode = pdf_load_system_cmap(ctx, "Adobe-GB1-UCS2");
129 else if (!strcmp(collection, "Adobe-Japan1"))
130 font->to_unicode = pdf_load_system_cmap(ctx, "Adobe-Japan1-UCS2");
131 else if (!strcmp(collection, "Adobe-Korea1"))
132 font->to_unicode = pdf_load_system_cmap(ctx, "Adobe-Korea1-UCS2");
133 }
134
135 if (strings)
136 {
137 /* TODO one-to-many mappings */
138
139 font->cid_to_ucs = Memento_label(fz_malloc_array(ctx, 256, unsigned short), "cid_to_ucs");
140 font->cid_to_ucs_len = 256;
141 font->size += 256 * sizeof *font->cid_to_ucs;
142
143 for (cpt = 0; cpt < 256; cpt++)
144 {
145 if (strings[cpt])
146 font->cid_to_ucs[cpt] = fz_unicode_from_glyph_name(strings[cpt]);
147 else
148 font->cid_to_ucs[cpt] = FZ_REPLACEMENT_CHARACTER;
149 }
150 }
151
152 if (!font->to_unicode && !font->cid_to_ucs)
153 {
154 /* TODO: synthesize a ToUnicode if it's a freetype font with
155 * cmap and/or post tables or if it has glyph names. */
156 }
157 }