Mercurial > hgrepos > Python2 > PyMuPDF
comparison mupdf-source/thirdparty/extract/src/astring.c @ 2:b50eed0cc0ef upstream
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
The directory name has changed: no version number in the expanded directory now.
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Mon, 15 Sep 2025 11:43:07 +0200 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| 1:1d09e1dec1d9 | 2:b50eed0cc0ef |
|---|---|
| 1 #include "astring.h" | |
| 2 #include "mem.h" | |
| 3 #include "memento.h" | |
| 4 | |
| 5 #include <assert.h> | |
| 6 #include <stdarg.h> | |
| 7 #include <stdio.h> | |
| 8 #include <stdlib.h> | |
| 9 #include <string.h> | |
| 10 | |
| 11 | |
| 12 void extract_astring_init(extract_astring_t *string) | |
| 13 { | |
| 14 string->chars = NULL; | |
| 15 string->chars_num = 0; | |
| 16 } | |
| 17 | |
| 18 void extract_astring_free(extract_alloc_t *alloc, extract_astring_t *string) | |
| 19 { | |
| 20 extract_free(alloc, &string->chars); | |
| 21 extract_astring_init(string); | |
| 22 } | |
| 23 | |
| 24 | |
| 25 int extract_astring_catl(extract_alloc_t *alloc, extract_astring_t *string, const char *s, size_t s_len) | |
| 26 { | |
| 27 if (extract_realloc2(alloc, &string->chars, string->chars_num+1, string->chars_num + s_len + 1)) | |
| 28 return -1; | |
| 29 /* Coverity doesn't seem to realise that extract_realloc2() modifies | |
| 30 string->chars. */ | |
| 31 /* coverity[deref_parm_field_in_call] */ | |
| 32 memcpy(string->chars + string->chars_num, s, s_len); | |
| 33 string->chars[string->chars_num + s_len] = 0; | |
| 34 string->chars_num += s_len; | |
| 35 return 0; | |
| 36 } | |
| 37 | |
| 38 int extract_astring_catc(extract_alloc_t *alloc, extract_astring_t *string, char c) | |
| 39 { | |
| 40 return extract_astring_catl(alloc, string, &c, 1); | |
| 41 } | |
| 42 | |
| 43 int extract_astring_cat(extract_alloc_t *alloc, extract_astring_t *string, const char *s) | |
| 44 { | |
| 45 return extract_astring_catl(alloc, string, s, strlen(s)); | |
| 46 } | |
| 47 | |
| 48 int extract_astring_catf(extract_alloc_t *alloc, extract_astring_t *string, const char *format, ...) | |
| 49 { | |
| 50 char *buffer = NULL; | |
| 51 int e; | |
| 52 va_list va; | |
| 53 | |
| 54 va_start(va, format); | |
| 55 e = extract_vasprintf(alloc, &buffer, format, va); | |
| 56 va_end(va); | |
| 57 if (e < 0) return e; | |
| 58 e = extract_astring_cat(alloc, string, buffer); | |
| 59 extract_free(alloc, &buffer); | |
| 60 | |
| 61 return e; | |
| 62 } | |
| 63 | |
| 64 int extract_astring_truncate(extract_astring_t *content, int len) | |
| 65 { | |
| 66 assert((size_t) len <= content->chars_num); | |
| 67 | |
| 68 content->chars_num -= len; | |
| 69 content->chars[content->chars_num] = 0; | |
| 70 | |
| 71 return 0; | |
| 72 } | |
| 73 | |
| 74 int extract_astring_char_truncate_if(extract_astring_t *content, char c) | |
| 75 { | |
| 76 if (content->chars_num && content->chars[content->chars_num-1] == c) | |
| 77 extract_astring_truncate(content, 1); | |
| 78 | |
| 79 return 0; | |
| 80 } | |
| 81 | |
| 82 int extract_astring_catc_unicode(extract_alloc_t *alloc, | |
| 83 extract_astring_t *string, | |
| 84 int c, | |
| 85 int xml, | |
| 86 int ascii_ligatures, | |
| 87 int ascii_dash, | |
| 88 int ascii_apostrophe) | |
| 89 { | |
| 90 int ret = -1; | |
| 91 | |
| 92 if (0) {} | |
| 93 | |
| 94 /* Escape XML special characters. */ | |
| 95 else if (xml && c == '<') extract_astring_cat(alloc, string, "<"); | |
| 96 else if (xml && c == '>') extract_astring_cat(alloc, string, ">"); | |
| 97 else if (xml && c == '&') extract_astring_cat(alloc, string, "&"); | |
| 98 else if (xml && c == '"') extract_astring_cat(alloc, string, """); | |
| 99 else if (xml && c == '\'') extract_astring_cat(alloc, string, "'"); | |
| 100 | |
| 101 /* Expand ligatures. */ | |
| 102 else if (ascii_ligatures && c == 0xFB00) | |
| 103 { | |
| 104 if (extract_astring_cat(alloc, string, "ff")) goto end; | |
| 105 } | |
| 106 else if (ascii_ligatures && c == 0xFB01) | |
| 107 { | |
| 108 if (extract_astring_cat(alloc, string, "fi")) goto end; | |
| 109 } | |
| 110 else if (ascii_ligatures && c == 0xFB02) | |
| 111 { | |
| 112 if (extract_astring_cat(alloc, string, "fl")) goto end; | |
| 113 } | |
| 114 else if (ascii_ligatures && c == 0xFB03) | |
| 115 { | |
| 116 if (extract_astring_cat(alloc, string, "ffi")) goto end; | |
| 117 } | |
| 118 else if (ascii_ligatures && c == 0xFB04) | |
| 119 { | |
| 120 if (extract_astring_cat(alloc, string, "ffl")) goto end; | |
| 121 } | |
| 122 | |
| 123 /* Convert some special characters to ascii. */ | |
| 124 else if (ascii_dash && c == 0x2212) | |
| 125 { | |
| 126 if (extract_astring_catc(alloc, string, '-')) goto end; | |
| 127 } | |
| 128 else if (ascii_apostrophe && c == 0x2019) | |
| 129 { | |
| 130 if (extract_astring_catc(alloc, string, '\'')) goto end; | |
| 131 } | |
| 132 | |
| 133 /* Output ASCII verbatim. */ | |
| 134 else if (c >= 32 && c <= 127) | |
| 135 { | |
| 136 if (extract_astring_catc(alloc, string, (char) c)) goto end; | |
| 137 } | |
| 138 | |
| 139 /* Escape all other characters. */ | |
| 140 else if (xml) | |
| 141 { | |
| 142 char buffer[32]; | |
| 143 if (c < 32 && (c != 0x9 && c != 0xa && c != 0xd)) | |
| 144 { | |
| 145 /* Illegal xml character; see | |
| 146 https://www.w3.org/TR/xml/#charsets. We replace with | |
| 147 0xfffd, the unicode replacement character. */ | |
| 148 c = 0xfffd; | |
| 149 } | |
| 150 snprintf(buffer, sizeof(buffer), "&#x%x;", c); | |
| 151 if (extract_astring_cat(alloc, string, buffer)) goto end; | |
| 152 } | |
| 153 else | |
| 154 { | |
| 155 /* Use utf8. */ | |
| 156 if (c < 0x80) | |
| 157 { | |
| 158 if (extract_astring_catc(alloc, string, (char) c)) return -1; | |
| 159 } | |
| 160 else if (c < 0x0800) | |
| 161 { | |
| 162 char cc[2] = { (char) (((c >> 6) & 0x1f) | 0xc0), | |
| 163 (char) (((c >> 0) & 0x3f) | 0x80) }; | |
| 164 if (extract_astring_catl(alloc, string, cc, sizeof(cc))) return -1; | |
| 165 } | |
| 166 else if (c < 0x10000) | |
| 167 { | |
| 168 char cc[3] = { (char) (((c >> 12) & 0x0f) | 0xe0), | |
| 169 (char) (((c >> 6) & 0x3f) | 0x80), | |
| 170 (char) (((c >> 0) & 0x3f) | 0x80) }; | |
| 171 if (extract_astring_catl(alloc, string, cc, sizeof(cc))) return -1; | |
| 172 } | |
| 173 else if (c < 0x110000) | |
| 174 { | |
| 175 char cc[4] = { (char) (((c >> 18) & 0x07) | 0xf0), | |
| 176 (char) (((c >> 12) & 0x3f) | 0x80), | |
| 177 (char) (((c >> 6) & 0x3f) | 0x80), | |
| 178 (char) (((c >> 0) & 0x3f) | 0x80) }; | |
| 179 if (extract_astring_catl(alloc, string, cc, sizeof(cc))) return -1; | |
| 180 } | |
| 181 else | |
| 182 { | |
| 183 /* Use replacement character. */ | |
| 184 char cc[4] = { (char) 0xef, (char) 0xbf, (char) 0xbd, 0}; | |
| 185 if (extract_astring_catl(alloc, string, cc, sizeof(cc))) return -1; | |
| 186 } | |
| 187 } | |
| 188 | |
| 189 ret = 0; | |
| 190 | |
| 191 end: | |
| 192 return ret; | |
| 193 } | |
| 194 | |
| 195 int extract_astring_catc_unicode_xml(extract_alloc_t *alloc, extract_astring_t *string, int c) | |
| 196 { | |
| 197 /* FIXME: better to use ascii_ligatures=0, but that requires updates to | |
| 198 expected output files. */ | |
| 199 return extract_astring_catc_unicode( | |
| 200 alloc, | |
| 201 string, | |
| 202 c, | |
| 203 1 /*xml*/, | |
| 204 1 /*ascii_ligatures*/, | |
| 205 0 /*ascii_dash*/, | |
| 206 0 /*ascii_apostrophe*/ | |
| 207 ); | |
| 208 } |
