comparison mupdf-source/thirdparty/extract/src/astring.c @ 2:b50eed0cc0ef upstream

ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4. The directory name has changed: no version number in the expanded directory now.
author Franz Glasner <fzglas.hg@dom66.de>
date Mon, 15 Sep 2025 11:43:07 +0200
parents
children
comparison
equal deleted inserted replaced
1:1d09e1dec1d9 2:b50eed0cc0ef
1 #include "astring.h"
2 #include "mem.h"
3 #include "memento.h"
4
5 #include <assert.h>
6 #include <stdarg.h>
7 #include <stdio.h>
8 #include <stdlib.h>
9 #include <string.h>
10
11
12 void extract_astring_init(extract_astring_t *string)
13 {
14 string->chars = NULL;
15 string->chars_num = 0;
16 }
17
18 void extract_astring_free(extract_alloc_t *alloc, extract_astring_t *string)
19 {
20 extract_free(alloc, &string->chars);
21 extract_astring_init(string);
22 }
23
24
25 int extract_astring_catl(extract_alloc_t *alloc, extract_astring_t *string, const char *s, size_t s_len)
26 {
27 if (extract_realloc2(alloc, &string->chars, string->chars_num+1, string->chars_num + s_len + 1))
28 return -1;
29 /* Coverity doesn't seem to realise that extract_realloc2() modifies
30 string->chars. */
31 /* coverity[deref_parm_field_in_call] */
32 memcpy(string->chars + string->chars_num, s, s_len);
33 string->chars[string->chars_num + s_len] = 0;
34 string->chars_num += s_len;
35 return 0;
36 }
37
38 int extract_astring_catc(extract_alloc_t *alloc, extract_astring_t *string, char c)
39 {
40 return extract_astring_catl(alloc, string, &c, 1);
41 }
42
43 int extract_astring_cat(extract_alloc_t *alloc, extract_astring_t *string, const char *s)
44 {
45 return extract_astring_catl(alloc, string, s, strlen(s));
46 }
47
48 int extract_astring_catf(extract_alloc_t *alloc, extract_astring_t *string, const char *format, ...)
49 {
50 char *buffer = NULL;
51 int e;
52 va_list va;
53
54 va_start(va, format);
55 e = extract_vasprintf(alloc, &buffer, format, va);
56 va_end(va);
57 if (e < 0) return e;
58 e = extract_astring_cat(alloc, string, buffer);
59 extract_free(alloc, &buffer);
60
61 return e;
62 }
63
64 int extract_astring_truncate(extract_astring_t *content, int len)
65 {
66 assert((size_t) len <= content->chars_num);
67
68 content->chars_num -= len;
69 content->chars[content->chars_num] = 0;
70
71 return 0;
72 }
73
74 int extract_astring_char_truncate_if(extract_astring_t *content, char c)
75 {
76 if (content->chars_num && content->chars[content->chars_num-1] == c)
77 extract_astring_truncate(content, 1);
78
79 return 0;
80 }
81
82 int extract_astring_catc_unicode(extract_alloc_t *alloc,
83 extract_astring_t *string,
84 int c,
85 int xml,
86 int ascii_ligatures,
87 int ascii_dash,
88 int ascii_apostrophe)
89 {
90 int ret = -1;
91
92 if (0) {}
93
94 /* Escape XML special characters. */
95 else if (xml && c == '<') extract_astring_cat(alloc, string, "&lt;");
96 else if (xml && c == '>') extract_astring_cat(alloc, string, "&gt;");
97 else if (xml && c == '&') extract_astring_cat(alloc, string, "&amp;");
98 else if (xml && c == '"') extract_astring_cat(alloc, string, "&quot;");
99 else if (xml && c == '\'') extract_astring_cat(alloc, string, "&apos;");
100
101 /* Expand ligatures. */
102 else if (ascii_ligatures && c == 0xFB00)
103 {
104 if (extract_astring_cat(alloc, string, "ff")) goto end;
105 }
106 else if (ascii_ligatures && c == 0xFB01)
107 {
108 if (extract_astring_cat(alloc, string, "fi")) goto end;
109 }
110 else if (ascii_ligatures && c == 0xFB02)
111 {
112 if (extract_astring_cat(alloc, string, "fl")) goto end;
113 }
114 else if (ascii_ligatures && c == 0xFB03)
115 {
116 if (extract_astring_cat(alloc, string, "ffi")) goto end;
117 }
118 else if (ascii_ligatures && c == 0xFB04)
119 {
120 if (extract_astring_cat(alloc, string, "ffl")) goto end;
121 }
122
123 /* Convert some special characters to ascii. */
124 else if (ascii_dash && c == 0x2212)
125 {
126 if (extract_astring_catc(alloc, string, '-')) goto end;
127 }
128 else if (ascii_apostrophe && c == 0x2019)
129 {
130 if (extract_astring_catc(alloc, string, '\'')) goto end;
131 }
132
133 /* Output ASCII verbatim. */
134 else if (c >= 32 && c <= 127)
135 {
136 if (extract_astring_catc(alloc, string, (char) c)) goto end;
137 }
138
139 /* Escape all other characters. */
140 else if (xml)
141 {
142 char buffer[32];
143 if (c < 32 && (c != 0x9 && c != 0xa && c != 0xd))
144 {
145 /* Illegal xml character; see
146 https://www.w3.org/TR/xml/#charsets. We replace with
147 0xfffd, the unicode replacement character. */
148 c = 0xfffd;
149 }
150 snprintf(buffer, sizeof(buffer), "&#x%x;", c);
151 if (extract_astring_cat(alloc, string, buffer)) goto end;
152 }
153 else
154 {
155 /* Use utf8. */
156 if (c < 0x80)
157 {
158 if (extract_astring_catc(alloc, string, (char) c)) return -1;
159 }
160 else if (c < 0x0800)
161 {
162 char cc[2] = { (char) (((c >> 6) & 0x1f) | 0xc0),
163 (char) (((c >> 0) & 0x3f) | 0x80) };
164 if (extract_astring_catl(alloc, string, cc, sizeof(cc))) return -1;
165 }
166 else if (c < 0x10000)
167 {
168 char cc[3] = { (char) (((c >> 12) & 0x0f) | 0xe0),
169 (char) (((c >> 6) & 0x3f) | 0x80),
170 (char) (((c >> 0) & 0x3f) | 0x80) };
171 if (extract_astring_catl(alloc, string, cc, sizeof(cc))) return -1;
172 }
173 else if (c < 0x110000)
174 {
175 char cc[4] = { (char) (((c >> 18) & 0x07) | 0xf0),
176 (char) (((c >> 12) & 0x3f) | 0x80),
177 (char) (((c >> 6) & 0x3f) | 0x80),
178 (char) (((c >> 0) & 0x3f) | 0x80) };
179 if (extract_astring_catl(alloc, string, cc, sizeof(cc))) return -1;
180 }
181 else
182 {
183 /* Use replacement character. */
184 char cc[4] = { (char) 0xef, (char) 0xbf, (char) 0xbd, 0};
185 if (extract_astring_catl(alloc, string, cc, sizeof(cc))) return -1;
186 }
187 }
188
189 ret = 0;
190
191 end:
192 return ret;
193 }
194
195 int extract_astring_catc_unicode_xml(extract_alloc_t *alloc, extract_astring_t *string, int c)
196 {
197 /* FIXME: better to use ascii_ligatures=0, but that requires updates to
198 expected output files. */
199 return extract_astring_catc_unicode(
200 alloc,
201 string,
202 c,
203 1 /*xml*/,
204 1 /*ascii_ligatures*/,
205 0 /*ascii_dash*/,
206 0 /*ascii_apostrophe*/
207 );
208 }