comparison mupdf-source/thirdparty/extract/src/xml.c @ 2:b50eed0cc0ef upstream

ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4. The directory name has changed: no version number in the expanded directory now.
author Franz Glasner <fzglas.hg@dom66.de>
date Mon, 15 Sep 2025 11:43:07 +0200
parents
children
comparison
equal deleted inserted replaced
1:1d09e1dec1d9 2:b50eed0cc0ef
1 #include "extract/alloc.h"
2
3 #include "mem.h"
4 #include "outf.h"
5 #include "xml.h"
6
7 #include <assert.h>
8 #include <errno.h>
9 #include <float.h>
10 #include <limits.h>
11
12 #include "compat_stdint.h"
13
14 #include <stdlib.h>
15 #include <string.h>
16
17
18 /* These str_*() functions realloc buffer as required. All return 0 or -1 with
19 errno set. */
20
21 /* Appends first <s_len> chars of string <s> to *p. */
22 static int str_catl(extract_alloc_t *alloc, char **p, const char *s, int s_len)
23 {
24 size_t p_len = (*p) ? strlen(*p) : 0;
25
26 if (extract_realloc2(alloc,
27 p,
28 p_len + 1,
29 p_len + s_len + 1)) return -1;
30 memcpy(*p + p_len, s, s_len);
31 (*p)[p_len + s_len] = 0;
32
33 return 0;
34 }
35
36 /* Appends a char. */
37 static int str_catc(extract_alloc_t *alloc, char **p, char c)
38 {
39 return str_catl(alloc, p, &c, 1);
40 }
41
/* Not compiled at present; kept here because it may be useful later. */
#if 0
/* Appends the whole of the zero-terminated string <s> to *p. */
static int str_cat(extract_alloc_t *alloc, char **p, const char *s)
{
    size_t s_len = strlen(s);

    return str_catl(alloc, p, s, s_len);
}
#endif
50
51 char *extract_xml_tag_attributes_find(extract_xml_tag_t *tag, const char *name)
52 {
53 int i;
54
55 for (i=0; i<tag->attributes_num; ++i) {
56 if (!strcmp(tag->attributes[i].name, name)) {
57 char* ret = tag->attributes[i].value;
58 return ret;
59 }
60 }
61 outf("Failed to find attribute '%s'",name);
62
63 return NULL;
64 }
65
66 int extract_xml_tag_attributes_find_float(
67 extract_xml_tag_t *tag,
68 const char *name,
69 float *o_out)
70 {
71 const char *value = extract_xml_tag_attributes_find(tag, name);
72
73 if (!value) {
74 errno = ESRCH;
75 return -1;
76 }
77 if (extract_xml_str_to_float(value, o_out)) return -1;
78
79 return 0;
80 }
81
82 int extract_xml_tag_attributes_find_double(
83 extract_xml_tag_t *tag,
84 const char *name,
85 double *o_out)
86 {
87 const char *value = extract_xml_tag_attributes_find(tag, name);
88
89 if (!value) {
90 errno = ESRCH;
91 return -1;
92 }
93 if (extract_xml_str_to_double(value, o_out)) return -1;
94
95 return 0;
96 }
97
98 int extract_xml_tag_attributes_find_int(
99 extract_xml_tag_t *tag,
100 const char *name,
101 int *o_out)
102 {
103 const char *text = extract_xml_tag_attributes_find(tag, name);
104
105 return extract_xml_str_to_int(text, o_out);
106 }
107
108 int extract_xml_tag_attributes_find_uint(
109 extract_xml_tag_t *tag,
110 const char *name,
111 unsigned *o_out)
112 {
113 const char *text = extract_xml_tag_attributes_find(tag, name);
114
115 return extract_xml_str_to_uint(text, o_out);
116 }
117
118 int extract_xml_tag_attributes_find_size(
119 extract_xml_tag_t *tag,
120 const char *name,
121 size_t *o_out)
122 {
123 const char *text = extract_xml_tag_attributes_find(tag, name);
124
125 return extract_xml_str_to_size(text, o_out);
126 }
127
/* Parses <text> as a base-10 long long and stores it in *o_out.

Returns 0 on success. Returns -1 with errno set on failure, leaving *o_out
untouched: ESRCH if <text> is NULL, EINVAL if <text> is empty or has characters
left over after the number, otherwise whatever errno strtoll() reported (for
example ERANGE on overflow). */
int extract_xml_str_to_llint(const char *text, long long*o_out)
{
    char *tail = NULL;
    long long value;

    if (text == NULL) {
        errno = ESRCH;
        return -1;
    }
    if (*text == '\0') {
        errno = EINVAL;
        return -1;
    }

    /* strtoll() only reports problems through errno, so clear it first. */
    errno = 0;
    value = strtoll(text, &tail, 10);
    if (errno != 0) return -1;

    /* Anything left after the number means <text> was not purely numeric. */
    if (*tail != '\0') {
        errno = EINVAL;
        return -1;
    }

    *o_out = value;
    return 0;
}
154
/* Parses <text> as a base-10 unsigned long long and stores it in *o_out.

Returns 0 on success. Returns -1 with errno set on failure, leaving *o_out
untouched: ESRCH if <text> is NULL, EINVAL if <text> is empty or has characters
left over after the number, ERANGE if the number is negative and non-zero,
otherwise whatever errno strtoull() reported (for example ERANGE on
overflow). */
int extract_xml_str_to_ullint(const char *text, unsigned long long *o_out)
{
    char *endptr;
    unsigned long long x;

    if (!text) {
        errno = ESRCH;
        return -1;
    }
    if (text[0] == 0) {
        errno = EINVAL;
        return -1;
    }
    errno = 0;
    x = strtoull(text, &endptr, 10 /*base*/);
    if (errno) {
        return -1;
    }
    if (*endptr) {
        errno = EINVAL;
        return -1;
    }
    /* strtoull() accepts a leading '-' and returns the negated value wrapped
    into the unsigned range without reporting an error, so "-1" would silently
    become ULLONG_MAX. The text consumed by strtoull() is whitespace, an
    optional sign and digits, so any '-' inside it is that sign. "-0" is still
    accepted because its value is representable. */
    if (x != 0 && memchr(text, '-', (size_t) (endptr - text))) {
        errno = ERANGE;
        return -1;
    }
    *o_out = x;

    return 0;
}
181
/* Parses <text> as a base-10 int and stores it in *o_out. Parsing is done by
extract_xml_str_to_llint(); if that fails, -1 is returned. If the parsed value
lies outside [INT_MIN, INT_MAX], returns -1 with errno=ERANGE. Returns 0 on
success. */
int extract_xml_str_to_int(const char *text, int *o_out)
{
    long long wide;

    if (extract_xml_str_to_llint(text, &wide) != 0) return -1;

    if (wide < INT_MIN || wide > INT_MAX) {
        errno = ERANGE;
        return -1;
    }

    *o_out = (int) wide;
    return 0;
}
195
/* Parses <text> as a base-10 unsigned and stores it in *o_out. Parsing is done
by extract_xml_str_to_ullint(); if that fails, -1 is returned. If the parsed
value exceeds UINT_MAX, returns -1 with errno=ERANGE. Returns 0 on success. */
int extract_xml_str_to_uint(const char *text, unsigned *o_out)
{
    unsigned long long wide;

    if (extract_xml_str_to_ullint(text, &wide) != 0) return -1;

    if (wide > UINT_MAX) {
        errno = ERANGE;
        return -1;
    }

    *o_out = (unsigned) wide;
    return 0;
}
209
/* Parses <text> as a base-10 size_t and stores it in *o_out. Parsing is done
by extract_xml_str_to_ullint(); if that fails, -1 is returned. If the parsed
value exceeds SIZE_MAX, returns -1 with errno=ERANGE. Returns 0 on success. */
int extract_xml_str_to_size(const char *text, size_t *o_out)
{
    unsigned long long wide;

    if (extract_xml_str_to_ullint(text, &wide) != 0) return -1;

    if (wide > SIZE_MAX) {
        errno = ERANGE;
        return -1;
    }

    *o_out = (size_t) wide;
    return 0;
}
223
/* Parses <text> with strtod() and stores the result in *o_out.

Returns 0 on success. Returns -1 with errno set on failure, leaving *o_out
untouched: ESRCH if <text> is NULL, EINVAL if <text> is empty or has characters
left over after the number, otherwise whatever errno strtod() reported. */
int extract_xml_str_to_double(const char *text, double *o_out)
{
    char *tail = NULL;
    double value;

    if (text == NULL) {
        errno = ESRCH;
        return -1;
    }
    if (*text == '\0') {
        errno = EINVAL;
        return -1;
    }

    /* strtod() only reports problems through errno, so clear it first. */
    errno = 0;
    value = strtod(text, &tail);
    if (errno != 0) return -1;

    /* Anything left after the number means <text> was not purely numeric. */
    if (*tail != '\0') {
        errno = EINVAL;
        return -1;
    }

    *o_out = value;
    return 0;
}
250
/* Parses <text> as a float and stores it in *o_out. Parsing is done by
extract_xml_str_to_double(); if that fails, -1 is returned. If the parsed value
is greater than FLT_MAX or less than -FLT_MAX, returns -1 with errno=ERANGE.
Returns 0 on success. */
int extract_xml_str_to_float(const char *text, float *o_out)
{
    double wide;

    if (extract_xml_str_to_double(text, &wide) != 0) return -1;

    if (wide < -FLT_MAX || wide > FLT_MAX) {
        errno = ERANGE;
        return -1;
    }

    *o_out = (float) wide;
    return 0;
}
266
267 static int
268 extract_xml_tag_attributes_append(
269 extract_alloc_t *alloc,
270 extract_xml_tag_t *tag,
271 char *name,
272 char *value)
273 {
274 if (extract_realloc2(alloc,
275 &tag->attributes,
276 sizeof(extract_xml_attribute_t) * tag->attributes_num,
277 sizeof(extract_xml_attribute_t) * (tag->attributes_num+1)))
278 {
279 return -1;
280 }
281 tag->attributes[tag->attributes_num].name = name;
282 tag->attributes[tag->attributes_num].value = value;
283 tag->attributes_num += 1;
284
285 return 0;
286 }
287
288 void extract_xml_tag_init(extract_xml_tag_t *tag)
289 {
290 tag->name = NULL;
291 tag->attributes = NULL;
292 tag->attributes_num = 0;
293 extract_astring_init(&tag->text);
294 }
295
296 void extract_xml_tag_free(extract_alloc_t *alloc, extract_xml_tag_t *tag)
297 {
298 int i;
299
300 if (tag == NULL)
301 return;
302
303 extract_free(alloc, &tag->name);
304 for (i=0; i<tag->attributes_num; ++i) {
305 extract_xml_attribute_t* attribute = &tag->attributes[i];
306 extract_free(alloc, &attribute->name);
307 extract_free(alloc, &attribute->value);
308 }
309 extract_free(alloc, &tag->attributes);
310 extract_astring_free(alloc, &tag->text);
311 extract_xml_tag_init(tag);
312 }
313
/* Not compiled at present; kept here because it may be useful later. */
#if 0
/* strcmp() variant that tolerates NULL arguments: two NULLs compare equal and
a NULL sorts before any non-NULL string. */
static int extract_xml_strcmp_null(const char *a, const char *b)
{
    if (a && b) return strcmp(a, b);
    if (!a && !b) return 0;
    return a ? 1 : -1;
}
#endif
325
/* Not compiled at present; kept here because it may be useful later. */
#if 0
/* Orders two tags: first by name, then attribute by attribute (name, then
value) over the attributes both tags have, and finally by attribute count.
Returns a negative value, 0 or a positive value. The text members of the tags
are not compared. */
int extract_xml_compare_tags(const extract_xml_tag_t *lhs, const extract_xml_tag_t *rhs)
{
    int i;
    int d = extract_xml_strcmp_null(lhs->name, rhs->name);

    if (d) return d;

    for (i = 0; i < lhs->attributes_num && i < rhs->attributes_num; ++i) {
        const extract_xml_attribute_t *l = &lhs->attributes[i];
        const extract_xml_attribute_t *r = &rhs->attributes[i];

        d = extract_xml_strcmp_null(l->name, r->name);
        if (d) return d;
        d = extract_xml_strcmp_null(l->value, r->value);
        if (d) return d;
    }

    if (lhs->attributes_num > rhs->attributes_num) return +1;
    if (lhs->attributes_num < rhs->attributes_num) return -1;
    return 0;
}
#endif
352
353
354 int extract_xml_pparse_init(extract_alloc_t *alloc, extract_buffer_t *buffer, const char *first_line)
355 {
356 char *first_line_buffer = NULL;
357 int e = -1;
358
359 if (first_line) {
360 size_t first_line_len = strlen(first_line);
361 size_t actual;
362 if (extract_malloc(alloc, &first_line_buffer, first_line_len + 1)) goto end;
363
364 if (extract_buffer_read(buffer, first_line_buffer, first_line_len, &actual)) {
365 outf("error: failed to read first line.");
366 goto end;
367 }
368 first_line_buffer[actual] = 0;
369 if (strcmp(first_line, first_line_buffer)) {
370 outf("Unrecognised prefix: %s", first_line_buffer);
371 errno = ESRCH;
372 goto end;
373 }
374 }
375
376 for(;;) {
377 char c;
378 int ee = extract_buffer_read(buffer, &c, 1, NULL);
379 if (ee) {
380 if (ee==1) errno = ESRCH; /* EOF. */
381 goto end;
382 }
383 if (c == '<') {
384 break;
385 }
386 else if (c == ' ' || c == '\n') {}
387 else {
388 outf("Expected '<' but found c=%i", c);
389 goto end;
390 }
391 }
392
393 e = 0;
394 end:
395
396 extract_free(alloc, &first_line_buffer);
397
398 return e;
399 }
400
401 static int s_next(extract_buffer_t *buffer, int *ret, char *o_c)
402 /* Reads next char, but if EOF sets *ret=+1, errno=ESRCH and returns +1. */
403 {
404 int e = extract_buffer_read(buffer, o_c, 1, NULL);
405
406 if (e == +1) {
407 *ret = +1;
408 errno = ESRCH;
409 }
410
411 return e;
412 }
413
414 static const char *
415 extract_xml_tag_string(extract_alloc_t *alloc, extract_xml_tag_t *tag)
416 {
417 static char *buffer = NULL;
418
419 extract_free(alloc, &buffer);
420 if (extract_asprintf(alloc, &buffer, "<name=%s>", tag->name ? tag->name : ""))
421 {
422 return "";
423 }
424
425 return buffer;
426 }
427
428 int extract_xml_pparse_next(extract_buffer_t *buffer, extract_xml_tag_t *out)
429 {
430 int ret = -1;
431 char *attribute_name = NULL;
432 char *attribute_value = NULL;
433 char c;
434 extract_alloc_t *alloc = extract_buffer_alloc(buffer);
435
436 if (0) outf("out is: %s", extract_xml_tag_string(extract_buffer_alloc(buffer), out));
437 assert(buffer);
438 extract_xml_tag_free(alloc, out);
439
440 /* Read tag name. Initialise it to empty string so we never return
441 out->name==null on success. */
442 if (str_catl( alloc, &out->name, NULL, 0)) goto end;
443 for(;;) {
444 int e = extract_buffer_read(buffer, &c, 1, NULL);
445 if (e) {
446 if (e == +1) ret = 1; /* EOF is not an error here. */
447 goto end;
448 }
449 if (c == '>' || c == ' ') break;
450 if (str_catc(alloc, &out->name, c)) goto end;
451 }
452 if (c == ' ') {
453
454 /* Read attributes. */
455 for(;;) {
456
457 /* Read attribute name. */
458 for(;;) {
459 if (s_next(buffer, &ret, &c)) goto end;
460 if (c == '=' || c == '>' || c == ' ') break;
461 if (str_catc(alloc, &attribute_name, c)) goto end;
462 }
463 if (c == '>') break;
464
465 if (c == '=') {
466 /* Read attribute value. */
467 int quote_single = 0;
468 int quote_double = 0;
469 size_t l;
470 if (str_catl( alloc, &attribute_value, NULL, 0)) goto end;
471 for(;;) {
472 if (s_next(buffer, &ret, &c)) goto end;
473 if (c == '\'') quote_single = !quote_single;
474 else if (c == '"') quote_double = !quote_double;
475 else if (!quote_single && !quote_double
476 && (c == ' ' || c == '/' || c == '>')
477 ) {
478 /* We are at end of attribute value. */
479 break;
480 }
481 else if (c == '\\') {
482 // Escape next character.
483 if (s_next(buffer, &ret, &c)) goto end;
484 }
485 if (str_catc(alloc, &attribute_value, c)) goto end;
486 }
487
488 /* Remove any enclosing quotes. */
489 l = strlen(attribute_value);
490 if (l >= 2) {
491 if (
492 (attribute_value[0] == '"' && attribute_value[l-1] == '"')
493 ||
494 (attribute_value[0] == '\'' && attribute_value[l-1] == '\'')
495 ) {
496 memmove(attribute_value, attribute_value+1, l-2);
497 attribute_value[l-2] = 0;
498 }
499 }
500 }
501
502 /* Ensure name and value are not NULL. */
503 if (str_catl( alloc, &attribute_name, NULL, 0)) goto end;
504 if (str_catl( alloc, &attribute_value, NULL, 0)) goto end;
505
506 if (extract_xml_tag_attributes_append(alloc, out, attribute_name, attribute_value)) goto end;
507 attribute_name = NULL;
508 attribute_value = NULL;
509 if (c == '/') {
510 if (s_next(buffer, &ret, &c)) goto end;
511 }
512 if (c == '>') break;
513 }
514 }
515
516 /* Read plain text until next '<'. */
517 for(;;) {
518 /* We don't use s_next() here because EOF is not an error. */
519 int e = extract_buffer_read(buffer, &c, 1, NULL);
520 if (e == +1) {
521 break; /* EOF is not an error here. */
522 }
523 if (e) goto end;
524 if (c == '<') break;
525 if (extract_astring_catc(alloc, &out->text, c)) goto end;
526 }
527
528 ret = 0;
529 end:
530
531 extract_free(alloc, &attribute_name);
532 extract_free(alloc, &attribute_value);
533 if (ret) {
534 extract_xml_tag_free(alloc, out);
535 }
536
537 return ret;
538 }