Mercurial > hgrepos > Python2 > PyMuPDF
comparison mupdf-source/thirdparty/extract/src/xml.c @ 2:b50eed0cc0ef upstream
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
The directory name has changed: no version number in the expanded directory now.
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Mon, 15 Sep 2025 11:43:07 +0200 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| 1:1d09e1dec1d9 | 2:b50eed0cc0ef |
|---|---|
| 1 #include "extract/alloc.h" | |
| 2 | |
| 3 #include "mem.h" | |
| 4 #include "outf.h" | |
| 5 #include "xml.h" | |
| 6 | |
| 7 #include <assert.h> | |
| 8 #include <errno.h> | |
| 9 #include <float.h> | |
| 10 #include <limits.h> | |
| 11 | |
| 12 #include "compat_stdint.h" | |
| 13 | |
| 14 #include <stdlib.h> | |
| 15 #include <string.h> | |
| 16 | |
| 17 | |
| 18 /* These str_*() functions realloc buffer as required. All return 0 or -1 with | |
| 19 errno set. */ | |
| 20 | |
| 21 /* Appends first <s_len> chars of string <s> to *p. */ | |
| 22 static int str_catl(extract_alloc_t *alloc, char **p, const char *s, int s_len) | |
| 23 { | |
| 24 size_t p_len = (*p) ? strlen(*p) : 0; | |
| 25 | |
| 26 if (extract_realloc2(alloc, | |
| 27 p, | |
| 28 p_len + 1, | |
| 29 p_len + s_len + 1)) return -1; | |
| 30 memcpy(*p + p_len, s, s_len); | |
| 31 (*p)[p_len + s_len] = 0; | |
| 32 | |
| 33 return 0; | |
| 34 } | |
| 35 | |
| 36 /* Appends a char. */ | |
| 37 static int str_catc(extract_alloc_t *alloc, char **p, char c) | |
| 38 { | |
| 39 return str_catl(alloc, p, &c, 1); | |
| 40 } | |
| 41 | |
/* Not currently compiled; retained because it may be useful later. */
#if 0
/* Appends the whole of the nul-terminated string <s> to *p. */
static int str_cat(extract_alloc_t *alloc, char **p, const char *s)
{
    size_t s_len = strlen(s);

    return str_catl(alloc, p, s, s_len);
}
#endif
| 50 | |
| 51 char *extract_xml_tag_attributes_find(extract_xml_tag_t *tag, const char *name) | |
| 52 { | |
| 53 int i; | |
| 54 | |
| 55 for (i=0; i<tag->attributes_num; ++i) { | |
| 56 if (!strcmp(tag->attributes[i].name, name)) { | |
| 57 char* ret = tag->attributes[i].value; | |
| 58 return ret; | |
| 59 } | |
| 60 } | |
| 61 outf("Failed to find attribute '%s'",name); | |
| 62 | |
| 63 return NULL; | |
| 64 } | |
| 65 | |
| 66 int extract_xml_tag_attributes_find_float( | |
| 67 extract_xml_tag_t *tag, | |
| 68 const char *name, | |
| 69 float *o_out) | |
| 70 { | |
| 71 const char *value = extract_xml_tag_attributes_find(tag, name); | |
| 72 | |
| 73 if (!value) { | |
| 74 errno = ESRCH; | |
| 75 return -1; | |
| 76 } | |
| 77 if (extract_xml_str_to_float(value, o_out)) return -1; | |
| 78 | |
| 79 return 0; | |
| 80 } | |
| 81 | |
| 82 int extract_xml_tag_attributes_find_double( | |
| 83 extract_xml_tag_t *tag, | |
| 84 const char *name, | |
| 85 double *o_out) | |
| 86 { | |
| 87 const char *value = extract_xml_tag_attributes_find(tag, name); | |
| 88 | |
| 89 if (!value) { | |
| 90 errno = ESRCH; | |
| 91 return -1; | |
| 92 } | |
| 93 if (extract_xml_str_to_double(value, o_out)) return -1; | |
| 94 | |
| 95 return 0; | |
| 96 } | |
| 97 | |
| 98 int extract_xml_tag_attributes_find_int( | |
| 99 extract_xml_tag_t *tag, | |
| 100 const char *name, | |
| 101 int *o_out) | |
| 102 { | |
| 103 const char *text = extract_xml_tag_attributes_find(tag, name); | |
| 104 | |
| 105 return extract_xml_str_to_int(text, o_out); | |
| 106 } | |
| 107 | |
| 108 int extract_xml_tag_attributes_find_uint( | |
| 109 extract_xml_tag_t *tag, | |
| 110 const char *name, | |
| 111 unsigned *o_out) | |
| 112 { | |
| 113 const char *text = extract_xml_tag_attributes_find(tag, name); | |
| 114 | |
| 115 return extract_xml_str_to_uint(text, o_out); | |
| 116 } | |
| 117 | |
| 118 int extract_xml_tag_attributes_find_size( | |
| 119 extract_xml_tag_t *tag, | |
| 120 const char *name, | |
| 121 size_t *o_out) | |
| 122 { | |
| 123 const char *text = extract_xml_tag_attributes_find(tag, name); | |
| 124 | |
| 125 return extract_xml_str_to_size(text, o_out); | |
| 126 } | |
| 127 | |
/* Converts the base-10 string <text> to a long long stored in *o_out.

Returns 0 on success. Otherwise returns -1 with errno set and *o_out untouched:
ESRCH if <text> is NULL, EINVAL if it is empty or has trailing characters that
are not part of the number, or whatever strtoll() set (e.g. ERANGE). */
int extract_xml_str_to_llint(const char *text, long long*o_out)
{
    char *tail = NULL;
    long long parsed = 0;
    int err = 0;

    if (text == NULL) {
        err = ESRCH;
    }
    else if (*text == '\0') {
        err = EINVAL;
    }
    else {
        errno = 0;
        parsed = strtoll(text, &tail, 10);
        if (errno != 0) {
            err = errno;
        }
        else if (*tail != '\0') {
            /* The number was followed by something else. */
            err = EINVAL;
        }
    }

    if (err != 0) {
        errno = err;
        return -1;
    }

    *o_out = parsed;
    return 0;
}
| 154 | |
/* Converts the base-10 string <text> to an unsigned long long stored in
*o_out.

Returns 0 on success. Otherwise returns -1 with errno set and *o_out untouched:
ESRCH if <text> is NULL, EINVAL if it is empty or has trailing characters that
are not part of the number, ERANGE if the value is negative (and non-zero) or
too large, or whatever else strtoull() set. */
int extract_xml_str_to_ullint(const char *text, unsigned long long *o_out)
{
    char *endptr;
    unsigned long long x;

    if (!text) {
        errno = ESRCH;
        return -1;
    }
    if (text[0] == 0) {
        errno = EINVAL;
        return -1;
    }
    errno = 0;
    x = strtoull(text, &endptr, 10 /*base*/);
    if (errno) {
        return -1;
    }
    if (*endptr) {
        errno = EINVAL;
        return -1;
    }
    /* strtoull() accepts a leading '-' and returns the negated value wrapped
    into the unsigned range without reporting an error, so "-1" would come back
    as ULLONG_MAX. At this point the whole string has been consumed as
    whitespace, sign and digits, so any '-' in it is the sign. "-0" is still
    accepted as zero. */
    if (x != 0 && strchr(text, '-')) {
        errno = ERANGE;
        return -1;
    }
    *o_out = x;

    return 0;
}
| 181 | |
/* Converts <text> to an int stored in *o_out, using
extract_xml_str_to_llint() for the parsing. Returns 0 on success; returns -1 if
parsing fails, or -1 with errno=ERANGE if the value does not fit in an int. */
int extract_xml_str_to_int(const char *text, int *o_out)
{
    long long wide;

    if (extract_xml_str_to_llint(text, &wide) != 0) {
        return -1;
    }
    if (wide < INT_MIN || wide > INT_MAX) {
        errno = ERANGE;
        return -1;
    }

    *o_out = (int) wide;
    return 0;
}
| 195 | |
/* Converts <text> to an unsigned stored in *o_out, using
extract_xml_str_to_ullint() for the parsing. Returns 0 on success; returns -1
if parsing fails, or -1 with errno=ERANGE if the value exceeds UINT_MAX. */
int extract_xml_str_to_uint(const char *text, unsigned *o_out)
{
    unsigned long long wide;

    if (extract_xml_str_to_ullint(text, &wide) != 0) {
        return -1;
    }
    if (wide > UINT_MAX) {
        errno = ERANGE;
        return -1;
    }

    *o_out = (unsigned) wide;
    return 0;
}
| 209 | |
/* Converts <text> to a size_t stored in *o_out, using
extract_xml_str_to_ullint() for the parsing. Returns 0 on success; returns -1
if parsing fails, or -1 with errno=ERANGE if the value exceeds SIZE_MAX. */
int extract_xml_str_to_size(const char *text, size_t *o_out)
{
    unsigned long long wide;

    if (extract_xml_str_to_ullint(text, &wide) != 0) {
        return -1;
    }
    if (wide > SIZE_MAX) {
        errno = ERANGE;
        return -1;
    }

    *o_out = (size_t) wide;
    return 0;
}
| 223 | |
/* Converts the string <text> to a double stored in *o_out using strtod().

Returns 0 on success. Otherwise returns -1 with errno set and *o_out untouched:
ESRCH if <text> is NULL, EINVAL if it is empty or has trailing characters that
are not part of the number, or whatever strtod() set (e.g. ERANGE). */
int extract_xml_str_to_double(const char *text, double *o_out)
{
    char *tail = NULL;
    double parsed = 0;
    int err = 0;

    if (text == NULL) {
        err = ESRCH;
    }
    else if (*text == '\0') {
        err = EINVAL;
    }
    else {
        errno = 0;
        parsed = strtod(text, &tail);
        if (errno != 0) {
            err = errno;
        }
        else if (*tail != '\0') {
            /* The number was followed by something else. */
            err = EINVAL;
        }
    }

    if (err != 0) {
        errno = err;
        return -1;
    }

    *o_out = parsed;
    return 0;
}
| 250 | |
/* Converts <text> to a float stored in *o_out, using
extract_xml_str_to_double() for the parsing. Returns 0 on success; returns -1
if parsing fails, or -1 with errno=ERANGE if the value lies outside
[-FLT_MAX, FLT_MAX]. */
int extract_xml_str_to_float(const char *text, float *o_out)
{
    double wide;

    if (extract_xml_str_to_double(text, &wide) != 0) return -1;

    if (wide < -FLT_MAX || wide > FLT_MAX) {
        errno = ERANGE;
        return -1;
    }

    *o_out = (float) wide;
    return 0;
}
| 266 | |
| 267 static int | |
| 268 extract_xml_tag_attributes_append( | |
| 269 extract_alloc_t *alloc, | |
| 270 extract_xml_tag_t *tag, | |
| 271 char *name, | |
| 272 char *value) | |
| 273 { | |
| 274 if (extract_realloc2(alloc, | |
| 275 &tag->attributes, | |
| 276 sizeof(extract_xml_attribute_t) * tag->attributes_num, | |
| 277 sizeof(extract_xml_attribute_t) * (tag->attributes_num+1))) | |
| 278 { | |
| 279 return -1; | |
| 280 } | |
| 281 tag->attributes[tag->attributes_num].name = name; | |
| 282 tag->attributes[tag->attributes_num].value = value; | |
| 283 tag->attributes_num += 1; | |
| 284 | |
| 285 return 0; | |
| 286 } | |
| 287 | |
| 288 void extract_xml_tag_init(extract_xml_tag_t *tag) | |
| 289 { | |
| 290 tag->name = NULL; | |
| 291 tag->attributes = NULL; | |
| 292 tag->attributes_num = 0; | |
| 293 extract_astring_init(&tag->text); | |
| 294 } | |
| 295 | |
| 296 void extract_xml_tag_free(extract_alloc_t *alloc, extract_xml_tag_t *tag) | |
| 297 { | |
| 298 int i; | |
| 299 | |
| 300 if (tag == NULL) | |
| 301 return; | |
| 302 | |
| 303 extract_free(alloc, &tag->name); | |
| 304 for (i=0; i<tag->attributes_num; ++i) { | |
| 305 extract_xml_attribute_t* attribute = &tag->attributes[i]; | |
| 306 extract_free(alloc, &attribute->name); | |
| 307 extract_free(alloc, &attribute->value); | |
| 308 } | |
| 309 extract_free(alloc, &tag->attributes); | |
| 310 extract_astring_free(alloc, &tag->text); | |
| 311 extract_xml_tag_init(tag); | |
| 312 } | |
| 313 | |
/* Not currently compiled; retained because it may be useful later. */
#if 0
/* strcmp() variant that accepts NULL: two NULLs compare equal, and a NULL
compares less than any non-NULL string. */
static int extract_xml_strcmp_null(const char *a, const char *b)
{
    if (a && b) return strcmp(a, b);
    if (a) return 1;    /* Only <b> is NULL. */
    if (b) return -1;   /* Only <a> is NULL. */
    return 0;           /* Both are NULL. */
}
#endif
| 325 | |
/* Not currently compiled; retained because it may be useful later. */
#if 0
/* Orders two tags by name, then by each attribute's name and value in turn,
then by attribute count. Returns the first non-zero comparison result, or 0 if
the tags match. extract_xml_tag_t::text members are not compared. */
int extract_xml_compare_tags(const extract_xml_tag_t *lhs, const extract_xml_tag_t *rhs)
{
    int common;
    int i;
    int d = extract_xml_strcmp_null(lhs->name, rhs->name);

    if (d) return d;

    /* Only the attributes present in both tags can be compared pairwise. */
    common = (lhs->attributes_num < rhs->attributes_num)
            ? lhs->attributes_num
            : rhs->attributes_num;
    for (i = 0; i < common; ++i) {
        d = extract_xml_strcmp_null(lhs->attributes[i].name, rhs->attributes[i].name);
        if (d) return d;
        d = extract_xml_strcmp_null(lhs->attributes[i].value, rhs->attributes[i].value);
        if (d) return d;
    }

    if (lhs->attributes_num > rhs->attributes_num) return +1;
    if (lhs->attributes_num < rhs->attributes_num) return -1;
    return 0;
}
#endif
| 352 | |
| 353 | |
| 354 int extract_xml_pparse_init(extract_alloc_t *alloc, extract_buffer_t *buffer, const char *first_line) | |
| 355 { | |
| 356 char *first_line_buffer = NULL; | |
| 357 int e = -1; | |
| 358 | |
| 359 if (first_line) { | |
| 360 size_t first_line_len = strlen(first_line); | |
| 361 size_t actual; | |
| 362 if (extract_malloc(alloc, &first_line_buffer, first_line_len + 1)) goto end; | |
| 363 | |
| 364 if (extract_buffer_read(buffer, first_line_buffer, first_line_len, &actual)) { | |
| 365 outf("error: failed to read first line."); | |
| 366 goto end; | |
| 367 } | |
| 368 first_line_buffer[actual] = 0; | |
| 369 if (strcmp(first_line, first_line_buffer)) { | |
| 370 outf("Unrecognised prefix: %s", first_line_buffer); | |
| 371 errno = ESRCH; | |
| 372 goto end; | |
| 373 } | |
| 374 } | |
| 375 | |
| 376 for(;;) { | |
| 377 char c; | |
| 378 int ee = extract_buffer_read(buffer, &c, 1, NULL); | |
| 379 if (ee) { | |
| 380 if (ee==1) errno = ESRCH; /* EOF. */ | |
| 381 goto end; | |
| 382 } | |
| 383 if (c == '<') { | |
| 384 break; | |
| 385 } | |
| 386 else if (c == ' ' || c == '\n') {} | |
| 387 else { | |
| 388 outf("Expected '<' but found c=%i", c); | |
| 389 goto end; | |
| 390 } | |
| 391 } | |
| 392 | |
| 393 e = 0; | |
| 394 end: | |
| 395 | |
| 396 extract_free(alloc, &first_line_buffer); | |
| 397 | |
| 398 return e; | |
| 399 } | |
| 400 | |
| 401 static int s_next(extract_buffer_t *buffer, int *ret, char *o_c) | |
| 402 /* Reads next char, but if EOF sets *ret=+1, errno=ESRCH and returns +1. */ | |
| 403 { | |
| 404 int e = extract_buffer_read(buffer, o_c, 1, NULL); | |
| 405 | |
| 406 if (e == +1) { | |
| 407 *ret = +1; | |
| 408 errno = ESRCH; | |
| 409 } | |
| 410 | |
| 411 return e; | |
| 412 } | |
| 413 | |
| 414 static const char * | |
| 415 extract_xml_tag_string(extract_alloc_t *alloc, extract_xml_tag_t *tag) | |
| 416 { | |
| 417 static char *buffer = NULL; | |
| 418 | |
| 419 extract_free(alloc, &buffer); | |
| 420 if (extract_asprintf(alloc, &buffer, "<name=%s>", tag->name ? tag->name : "")) | |
| 421 { | |
| 422 return ""; | |
| 423 } | |
| 424 | |
| 425 return buffer; | |
| 426 } | |
| 427 | |
| 428 int extract_xml_pparse_next(extract_buffer_t *buffer, extract_xml_tag_t *out) | |
| 429 { | |
| 430 int ret = -1; | |
| 431 char *attribute_name = NULL; | |
| 432 char *attribute_value = NULL; | |
| 433 char c; | |
| 434 extract_alloc_t *alloc = extract_buffer_alloc(buffer); | |
| 435 | |
| 436 if (0) outf("out is: %s", extract_xml_tag_string(extract_buffer_alloc(buffer), out)); | |
| 437 assert(buffer); | |
| 438 extract_xml_tag_free(alloc, out); | |
| 439 | |
| 440 /* Read tag name. Initialise it to empty string so we never return | |
| 441 out->name==null on success. */ | |
| 442 if (str_catl( alloc, &out->name, NULL, 0)) goto end; | |
| 443 for(;;) { | |
| 444 int e = extract_buffer_read(buffer, &c, 1, NULL); | |
| 445 if (e) { | |
| 446 if (e == +1) ret = 1; /* EOF is not an error here. */ | |
| 447 goto end; | |
| 448 } | |
| 449 if (c == '>' || c == ' ') break; | |
| 450 if (str_catc(alloc, &out->name, c)) goto end; | |
| 451 } | |
| 452 if (c == ' ') { | |
| 453 | |
| 454 /* Read attributes. */ | |
| 455 for(;;) { | |
| 456 | |
| 457 /* Read attribute name. */ | |
| 458 for(;;) { | |
| 459 if (s_next(buffer, &ret, &c)) goto end; | |
| 460 if (c == '=' || c == '>' || c == ' ') break; | |
| 461 if (str_catc(alloc, &attribute_name, c)) goto end; | |
| 462 } | |
| 463 if (c == '>') break; | |
| 464 | |
| 465 if (c == '=') { | |
| 466 /* Read attribute value. */ | |
| 467 int quote_single = 0; | |
| 468 int quote_double = 0; | |
| 469 size_t l; | |
| 470 if (str_catl( alloc, &attribute_value, NULL, 0)) goto end; | |
| 471 for(;;) { | |
| 472 if (s_next(buffer, &ret, &c)) goto end; | |
| 473 if (c == '\'') quote_single = !quote_single; | |
| 474 else if (c == '"') quote_double = !quote_double; | |
| 475 else if (!quote_single && !quote_double | |
| 476 && (c == ' ' || c == '/' || c == '>') | |
| 477 ) { | |
| 478 /* We are at end of attribute value. */ | |
| 479 break; | |
| 480 } | |
| 481 else if (c == '\\') { | |
| 482 // Escape next character. | |
| 483 if (s_next(buffer, &ret, &c)) goto end; | |
| 484 } | |
| 485 if (str_catc(alloc, &attribute_value, c)) goto end; | |
| 486 } | |
| 487 | |
| 488 /* Remove any enclosing quotes. */ | |
| 489 l = strlen(attribute_value); | |
| 490 if (l >= 2) { | |
| 491 if ( | |
| 492 (attribute_value[0] == '"' && attribute_value[l-1] == '"') | |
| 493 || | |
| 494 (attribute_value[0] == '\'' && attribute_value[l-1] == '\'') | |
| 495 ) { | |
| 496 memmove(attribute_value, attribute_value+1, l-2); | |
| 497 attribute_value[l-2] = 0; | |
| 498 } | |
| 499 } | |
| 500 } | |
| 501 | |
| 502 /* Ensure name and value are not NULL. */ | |
| 503 if (str_catl( alloc, &attribute_name, NULL, 0)) goto end; | |
| 504 if (str_catl( alloc, &attribute_value, NULL, 0)) goto end; | |
| 505 | |
| 506 if (extract_xml_tag_attributes_append(alloc, out, attribute_name, attribute_value)) goto end; | |
| 507 attribute_name = NULL; | |
| 508 attribute_value = NULL; | |
| 509 if (c == '/') { | |
| 510 if (s_next(buffer, &ret, &c)) goto end; | |
| 511 } | |
| 512 if (c == '>') break; | |
| 513 } | |
| 514 } | |
| 515 | |
| 516 /* Read plain text until next '<'. */ | |
| 517 for(;;) { | |
| 518 /* We don't use s_next() here because EOF is not an error. */ | |
| 519 int e = extract_buffer_read(buffer, &c, 1, NULL); | |
| 520 if (e == +1) { | |
| 521 break; /* EOF is not an error here. */ | |
| 522 } | |
| 523 if (e) goto end; | |
| 524 if (c == '<') break; | |
| 525 if (extract_astring_catc(alloc, &out->text, c)) goto end; | |
| 526 } | |
| 527 | |
| 528 ret = 0; | |
| 529 end: | |
| 530 | |
| 531 extract_free(alloc, &attribute_name); | |
| 532 extract_free(alloc, &attribute_value); | |
| 533 if (ret) { | |
| 534 extract_xml_tag_free(alloc, out); | |
| 535 } | |
| 536 | |
| 537 return ret; | |
| 538 } |
