Mercurial > hgrepos > Python2 > PyMuPDF
comparison mupdf-source/thirdparty/mujs/regexp.c @ 2:b50eed0cc0ef upstream
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
The directory name has changed: no version number in the expanded directory now.
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Mon, 15 Sep 2025 11:43:07 +0200 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| 1:1d09e1dec1d9 | 2:b50eed0cc0ef |
|---|---|
| 1 #include <stdlib.h> | |
| 2 #include <stdio.h> | |
| 3 #include <string.h> | |
| 4 #include <setjmp.h> | |
| 5 #include <limits.h> | |
| 6 | |
| 7 #include "regexp.h" | |
| 8 #include "utf.h" | |
| 9 | |
| 10 #define emit regemit | |
| 11 #define next regnext | |
| 12 #define accept regaccept | |
| 13 | |
| 14 #define nelem(a) (int)(sizeof (a) / sizeof (a)[0]) | |
| 15 | |
| 16 #define REPINF 255 | |
| 17 #ifndef REG_MAXPROG | |
| 18 #define REG_MAXPROG (32 << 10) | |
| 19 #endif | |
| 20 #ifndef REG_MAXREC | |
| 21 #define REG_MAXREC 1024 | |
| 22 #endif | |
| 23 #ifndef REG_MAXSPAN | |
| 24 #define REG_MAXSPAN 64 | |
| 25 #endif | |
| 26 #ifndef REG_MAXCLASS | |
| 27 #define REG_MAXCLASS 128 | |
| 28 #endif | |
| 29 | |
| 30 typedef struct Reclass Reclass; | |
| 31 typedef struct Renode Renode; | |
| 32 typedef struct Reinst Reinst; | |
| 33 typedef struct Rethread Rethread; | |
| 34 | |
| 35 struct Reclass { | |
| 36 Rune *end; | |
| 37 Rune spans[REG_MAXSPAN]; | |
| 38 }; | |
| 39 | |
| 40 struct Reprog { | |
| 41 Reinst *start, *end; | |
| 42 Reclass *cclass; | |
| 43 int flags; | |
| 44 int nsub; | |
| 45 }; | |
| 46 | |
| 47 struct cstate { | |
| 48 Reprog *prog; | |
| 49 Renode *pstart, *pend; | |
| 50 | |
| 51 const char *source; | |
| 52 int ncclass; | |
| 53 int nsub; | |
| 54 Renode *sub[REG_MAXSUB]; | |
| 55 | |
| 56 int lookahead; | |
| 57 Rune yychar; | |
| 58 Reclass *yycc; | |
| 59 int yymin, yymax; | |
| 60 | |
| 61 const char *error; | |
| 62 jmp_buf kaboom; | |
| 63 | |
| 64 Reclass cclass[REG_MAXCLASS]; | |
| 65 }; | |
| 66 | |
| 67 static void die(struct cstate *g, const char *message) | |
| 68 { | |
| 69 g->error = message; | |
| 70 longjmp(g->kaboom, 1); | |
| 71 } | |
| 72 | |
| 73 static int canon(Rune c) | |
| 74 { | |
| 75 Rune u = toupperrune(c); | |
| 76 if (c >= 128 && u < 128) | |
| 77 return c; | |
| 78 return u; | |
| 79 } | |
| 80 | |
| 81 /* Scan */ | |
| 82 | |
| 83 enum { | |
| 84 L_CHAR = 256, | |
| 85 L_CCLASS, /* character class */ | |
| 86 L_NCCLASS, /* negative character class */ | |
| 87 L_NC, /* "(?:" no capture */ | |
| 88 L_PLA, /* "(?=" positive lookahead */ | |
| 89 L_NLA, /* "(?!" negative lookahead */ | |
| 90 L_WORD, /* "\b" word boundary */ | |
| 91 L_NWORD, /* "\B" non-word boundary */ | |
| 92 L_REF, /* "\1" back-reference */ | |
| 93 L_COUNT, /* {M,N} */ | |
| 94 }; | |
| 95 | |
| 96 static int hex(struct cstate *g, int c) | |
| 97 { | |
| 98 if (c >= '0' && c <= '9') return c - '0'; | |
| 99 if (c >= 'a' && c <= 'f') return c - 'a' + 0xA; | |
| 100 if (c >= 'A' && c <= 'F') return c - 'A' + 0xA; | |
| 101 die(g, "invalid escape sequence"); | |
| 102 return 0; | |
| 103 } | |
| 104 | |
| 105 static int dec(struct cstate *g, int c) | |
| 106 { | |
| 107 if (c >= '0' && c <= '9') return c - '0'; | |
| 108 die(g, "invalid quantifier"); | |
| 109 return 0; | |
| 110 } | |
| 111 | |
| 112 #define ESCAPES "BbDdSsWw^$\\.*+?()[]{}|-0123456789" | |
| 113 | |
| 114 static int isunicodeletter(int c) | |
| 115 { | |
| 116 return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || isalpharune(c); | |
| 117 } | |
| 118 | |
| 119 static int nextrune(struct cstate *g) | |
| 120 { | |
| 121 if (!*g->source) { | |
| 122 g->yychar = EOF; | |
| 123 return 0; | |
| 124 } | |
| 125 g->source += chartorune(&g->yychar, g->source); | |
| 126 if (g->yychar == '\\') { | |
| 127 if (!*g->source) | |
| 128 die(g, "unterminated escape sequence"); | |
| 129 g->source += chartorune(&g->yychar, g->source); | |
| 130 switch (g->yychar) { | |
| 131 case 'f': g->yychar = '\f'; return 0; | |
| 132 case 'n': g->yychar = '\n'; return 0; | |
| 133 case 'r': g->yychar = '\r'; return 0; | |
| 134 case 't': g->yychar = '\t'; return 0; | |
| 135 case 'v': g->yychar = '\v'; return 0; | |
| 136 case 'c': | |
| 137 if (!g->source[0]) | |
| 138 die(g, "unterminated escape sequence"); | |
| 139 g->yychar = (*g->source++) & 31; | |
| 140 return 0; | |
| 141 case 'x': | |
| 142 if (!g->source[0] || !g->source[1]) | |
| 143 die(g, "unterminated escape sequence"); | |
| 144 g->yychar = hex(g, *g->source++) << 4; | |
| 145 g->yychar += hex(g, *g->source++); | |
| 146 if (g->yychar == 0) { | |
| 147 g->yychar = '0'; | |
| 148 return 1; | |
| 149 } | |
| 150 return 0; | |
| 151 case 'u': | |
| 152 if (!g->source[0] || !g->source[1] || !g->source[2] || !g->source[3]) | |
| 153 die(g, "unterminated escape sequence"); | |
| 154 g->yychar = hex(g, *g->source++) << 12; | |
| 155 g->yychar += hex(g, *g->source++) << 8; | |
| 156 g->yychar += hex(g, *g->source++) << 4; | |
| 157 g->yychar += hex(g, *g->source++); | |
| 158 if (g->yychar == 0) { | |
| 159 g->yychar = '0'; | |
| 160 return 1; | |
| 161 } | |
| 162 return 0; | |
| 163 case 0: | |
| 164 g->yychar = '0'; | |
| 165 return 1; | |
| 166 } | |
| 167 if (strchr(ESCAPES, g->yychar)) | |
| 168 return 1; | |
| 169 if (isunicodeletter(g->yychar) || g->yychar == '_') /* check identity escape */ | |
| 170 die(g, "invalid escape character"); | |
| 171 return 0; | |
| 172 } | |
| 173 return 0; | |
| 174 } | |
| 175 | |
| 176 static int lexcount(struct cstate *g) | |
| 177 { | |
| 178 g->yychar = *g->source++; | |
| 179 | |
| 180 g->yymin = dec(g, g->yychar); | |
| 181 g->yychar = *g->source++; | |
| 182 while (g->yychar != ',' && g->yychar != '}') { | |
| 183 g->yymin = g->yymin * 10 + dec(g, g->yychar); | |
| 184 g->yychar = *g->source++; | |
| 185 if (g->yymin >= REPINF) | |
| 186 die(g, "numeric overflow"); | |
| 187 } | |
| 188 | |
| 189 if (g->yychar == ',') { | |
| 190 g->yychar = *g->source++; | |
| 191 if (g->yychar == '}') { | |
| 192 g->yymax = REPINF; | |
| 193 } else { | |
| 194 g->yymax = dec(g, g->yychar); | |
| 195 g->yychar = *g->source++; | |
| 196 while (g->yychar != '}') { | |
| 197 g->yymax = g->yymax * 10 + dec(g, g->yychar); | |
| 198 g->yychar = *g->source++; | |
| 199 if (g->yymax >= REPINF) | |
| 200 die(g, "numeric overflow"); | |
| 201 } | |
| 202 } | |
| 203 } else { | |
| 204 g->yymax = g->yymin; | |
| 205 } | |
| 206 | |
| 207 return L_COUNT; | |
| 208 } | |
| 209 | |
| 210 static void newcclass(struct cstate *g) | |
| 211 { | |
| 212 if (g->ncclass >= REG_MAXCLASS) | |
| 213 die(g, "too many character classes"); | |
| 214 g->yycc = g->cclass + g->ncclass++; | |
| 215 g->yycc->end = g->yycc->spans; | |
| 216 } | |
| 217 | |
| 218 static void addrange(struct cstate *g, Rune a, Rune b) | |
| 219 { | |
| 220 Reclass *cc = g->yycc; | |
| 221 Rune *p; | |
| 222 | |
| 223 if (a > b) | |
| 224 die(g, "invalid character class range"); | |
| 225 | |
| 226 /* extend existing ranges if they overlap */ | |
| 227 for (p = cc->spans; p < cc->end; p += 2) { | |
| 228 /* completely inside old range */ | |
| 229 if (a >= p[0] && b <= p[1]) | |
| 230 return; | |
| 231 | |
| 232 /* completely swallows old range */ | |
| 233 if (a < p[0] && b >= p[1]) { | |
| 234 p[0] = a; | |
| 235 p[1] = b; | |
| 236 return; | |
| 237 } | |
| 238 | |
| 239 /* extend at start */ | |
| 240 if (b >= p[0] - 1 && b <= p[1] && a < p[0]) { | |
| 241 p[0] = a; | |
| 242 return; | |
| 243 } | |
| 244 | |
| 245 /* extend at end */ | |
| 246 if (a >= p[0] && a <= p[1] + 1 && b > p[1]) { | |
| 247 p[1] = b; | |
| 248 return; | |
| 249 } | |
| 250 } | |
| 251 | |
| 252 if (cc->end + 2 >= cc->spans + nelem(cc->spans)) | |
| 253 die(g, "too many character class ranges"); | |
| 254 *cc->end++ = a; | |
| 255 *cc->end++ = b; | |
| 256 } | |
| 257 | |
| 258 static void addranges_d(struct cstate *g) | |
| 259 { | |
| 260 addrange(g, '0', '9'); | |
| 261 } | |
| 262 | |
| 263 static void addranges_D(struct cstate *g) | |
| 264 { | |
| 265 addrange(g, 0, '0'-1); | |
| 266 addrange(g, '9'+1, 0xFFFF); | |
| 267 } | |
| 268 | |
| 269 static void addranges_s(struct cstate *g) | |
| 270 { | |
| 271 addrange(g, 0x9, 0xD); | |
| 272 addrange(g, 0x20, 0x20); | |
| 273 addrange(g, 0xA0, 0xA0); | |
| 274 addrange(g, 0x2028, 0x2029); | |
| 275 addrange(g, 0xFEFF, 0xFEFF); | |
| 276 } | |
| 277 | |
| 278 static void addranges_S(struct cstate *g) | |
| 279 { | |
| 280 addrange(g, 0, 0x9-1); | |
| 281 addrange(g, 0xD+1, 0x20-1); | |
| 282 addrange(g, 0x20+1, 0xA0-1); | |
| 283 addrange(g, 0xA0+1, 0x2028-1); | |
| 284 addrange(g, 0x2029+1, 0xFEFF-1); | |
| 285 addrange(g, 0xFEFF+1, 0xFFFF); | |
| 286 } | |
| 287 | |
| 288 static void addranges_w(struct cstate *g) | |
| 289 { | |
| 290 addrange(g, '0', '9'); | |
| 291 addrange(g, 'A', 'Z'); | |
| 292 addrange(g, '_', '_'); | |
| 293 addrange(g, 'a', 'z'); | |
| 294 } | |
| 295 | |
| 296 static void addranges_W(struct cstate *g) | |
| 297 { | |
| 298 addrange(g, 0, '0'-1); | |
| 299 addrange(g, '9'+1, 'A'-1); | |
| 300 addrange(g, 'Z'+1, '_'-1); | |
| 301 addrange(g, '_'+1, 'a'-1); | |
| 302 addrange(g, 'z'+1, 0xFFFF); | |
| 303 } | |
| 304 | |
| 305 static int lexclass(struct cstate *g) | |
| 306 { | |
| 307 int type = L_CCLASS; | |
| 308 int quoted, havesave, havedash; | |
| 309 Rune save = 0; | |
| 310 | |
| 311 newcclass(g); | |
| 312 | |
| 313 quoted = nextrune(g); | |
| 314 if (!quoted && g->yychar == '^') { | |
| 315 type = L_NCCLASS; | |
| 316 quoted = nextrune(g); | |
| 317 } | |
| 318 | |
| 319 havesave = havedash = 0; | |
| 320 for (;;) { | |
| 321 if (g->yychar == EOF) | |
| 322 die(g, "unterminated character class"); | |
| 323 if (!quoted && g->yychar == ']') | |
| 324 break; | |
| 325 | |
| 326 if (!quoted && g->yychar == '-') { | |
| 327 if (havesave) { | |
| 328 if (havedash) { | |
| 329 addrange(g, save, '-'); | |
| 330 havesave = havedash = 0; | |
| 331 } else { | |
| 332 havedash = 1; | |
| 333 } | |
| 334 } else { | |
| 335 save = '-'; | |
| 336 havesave = 1; | |
| 337 } | |
| 338 } else if (quoted && strchr("DSWdsw", g->yychar)) { | |
| 339 if (havesave) { | |
| 340 addrange(g, save, save); | |
| 341 if (havedash) | |
| 342 addrange(g, '-', '-'); | |
| 343 } | |
| 344 switch (g->yychar) { | |
| 345 case 'd': addranges_d(g); break; | |
| 346 case 's': addranges_s(g); break; | |
| 347 case 'w': addranges_w(g); break; | |
| 348 case 'D': addranges_D(g); break; | |
| 349 case 'S': addranges_S(g); break; | |
| 350 case 'W': addranges_W(g); break; | |
| 351 } | |
| 352 havesave = havedash = 0; | |
| 353 } else { | |
| 354 if (quoted) { | |
| 355 if (g->yychar == 'b') | |
| 356 g->yychar = '\b'; | |
| 357 else if (g->yychar == '0') | |
| 358 g->yychar = 0; | |
| 359 /* else identity escape */ | |
| 360 } | |
| 361 if (havesave) { | |
| 362 if (havedash) { | |
| 363 addrange(g, save, g->yychar); | |
| 364 havesave = havedash = 0; | |
| 365 } else { | |
| 366 addrange(g, save, save); | |
| 367 save = g->yychar; | |
| 368 } | |
| 369 } else { | |
| 370 save = g->yychar; | |
| 371 havesave = 1; | |
| 372 } | |
| 373 } | |
| 374 | |
| 375 quoted = nextrune(g); | |
| 376 } | |
| 377 | |
| 378 if (havesave) { | |
| 379 addrange(g, save, save); | |
| 380 if (havedash) | |
| 381 addrange(g, '-', '-'); | |
| 382 } | |
| 383 | |
| 384 return type; | |
| 385 } | |
| 386 | |
| 387 static int lex(struct cstate *g) | |
| 388 { | |
| 389 int quoted = nextrune(g); | |
| 390 if (quoted) { | |
| 391 switch (g->yychar) { | |
| 392 case 'b': return L_WORD; | |
| 393 case 'B': return L_NWORD; | |
| 394 case 'd': newcclass(g); addranges_d(g); return L_CCLASS; | |
| 395 case 's': newcclass(g); addranges_s(g); return L_CCLASS; | |
| 396 case 'w': newcclass(g); addranges_w(g); return L_CCLASS; | |
| 397 case 'D': newcclass(g); addranges_d(g); return L_NCCLASS; | |
| 398 case 'S': newcclass(g); addranges_s(g); return L_NCCLASS; | |
| 399 case 'W': newcclass(g); addranges_w(g); return L_NCCLASS; | |
| 400 case '0': g->yychar = 0; return L_CHAR; | |
| 401 } | |
| 402 if (g->yychar >= '0' && g->yychar <= '9') { | |
| 403 g->yychar -= '0'; | |
| 404 if (*g->source >= '0' && *g->source <= '9') | |
| 405 g->yychar = g->yychar * 10 + *g->source++ - '0'; | |
| 406 return L_REF; | |
| 407 } | |
| 408 return L_CHAR; | |
| 409 } | |
| 410 | |
| 411 switch (g->yychar) { | |
| 412 case EOF: | |
| 413 case '$': case ')': case '*': case '+': | |
| 414 case '.': case '?': case '^': case '|': | |
| 415 return g->yychar; | |
| 416 } | |
| 417 | |
| 418 if (g->yychar == '{') | |
| 419 return lexcount(g); | |
| 420 if (g->yychar == '[') | |
| 421 return lexclass(g); | |
| 422 if (g->yychar == '(') { | |
| 423 if (g->source[0] == '?') { | |
| 424 if (g->source[1] == ':') { | |
| 425 g->source += 2; | |
| 426 return L_NC; | |
| 427 } | |
| 428 if (g->source[1] == '=') { | |
| 429 g->source += 2; | |
| 430 return L_PLA; | |
| 431 } | |
| 432 if (g->source[1] == '!') { | |
| 433 g->source += 2; | |
| 434 return L_NLA; | |
| 435 } | |
| 436 } | |
| 437 return '('; | |
| 438 } | |
| 439 | |
| 440 return L_CHAR; | |
| 441 } | |
| 442 | |
| 443 /* Parse */ | |
| 444 | |
| 445 enum { | |
| 446 P_CAT, P_ALT, P_REP, | |
| 447 P_BOL, P_EOL, P_WORD, P_NWORD, | |
| 448 P_PAR, P_PLA, P_NLA, | |
| 449 P_ANY, P_CHAR, P_CCLASS, P_NCCLASS, | |
| 450 P_REF, | |
| 451 }; | |
| 452 | |
| 453 struct Renode { | |
| 454 unsigned char type; | |
| 455 unsigned char ng, m, n; | |
| 456 Rune c; | |
| 457 int cc; | |
| 458 Renode *x; | |
| 459 Renode *y; | |
| 460 }; | |
| 461 | |
| 462 static Renode *newnode(struct cstate *g, int type) | |
| 463 { | |
| 464 Renode *node = g->pend++; | |
| 465 node->type = type; | |
| 466 node->cc = -1; | |
| 467 node->c = 0; | |
| 468 node->ng = 0; | |
| 469 node->m = 0; | |
| 470 node->n = 0; | |
| 471 node->x = node->y = NULL; | |
| 472 return node; | |
| 473 } | |
| 474 | |
| 475 static int empty(Renode *node) | |
| 476 { | |
| 477 if (!node) return 1; | |
| 478 switch (node->type) { | |
| 479 default: return 1; | |
| 480 case P_CAT: return empty(node->x) && empty(node->y); | |
| 481 case P_ALT: return empty(node->x) || empty(node->y); | |
| 482 case P_REP: return empty(node->x) || node->m == 0; | |
| 483 case P_PAR: return empty(node->x); | |
| 484 case P_REF: return empty(node->x); | |
| 485 case P_ANY: case P_CHAR: case P_CCLASS: case P_NCCLASS: return 0; | |
| 486 } | |
| 487 } | |
| 488 | |
| 489 static Renode *newrep(struct cstate *g, Renode *atom, int ng, int min, int max) | |
| 490 { | |
| 491 Renode *rep = newnode(g, P_REP); | |
| 492 if (max == REPINF && empty(atom)) | |
| 493 die(g, "infinite loop matching the empty string"); | |
| 494 rep->ng = ng; | |
| 495 rep->m = min; | |
| 496 rep->n = max; | |
| 497 rep->x = atom; | |
| 498 return rep; | |
| 499 } | |
| 500 | |
| 501 static void next(struct cstate *g) | |
| 502 { | |
| 503 g->lookahead = lex(g); | |
| 504 } | |
| 505 | |
| 506 static int accept(struct cstate *g, int t) | |
| 507 { | |
| 508 if (g->lookahead == t) { | |
| 509 next(g); | |
| 510 return 1; | |
| 511 } | |
| 512 return 0; | |
| 513 } | |
| 514 | |
| 515 static Renode *parsealt(struct cstate *g); | |
| 516 | |
| 517 static Renode *parseatom(struct cstate *g) | |
| 518 { | |
| 519 Renode *atom; | |
| 520 if (g->lookahead == L_CHAR) { | |
| 521 atom = newnode(g, P_CHAR); | |
| 522 atom->c = g->yychar; | |
| 523 next(g); | |
| 524 return atom; | |
| 525 } | |
| 526 if (g->lookahead == L_CCLASS) { | |
| 527 atom = newnode(g, P_CCLASS); | |
| 528 atom->cc = g->yycc - g->cclass; | |
| 529 next(g); | |
| 530 return atom; | |
| 531 } | |
| 532 if (g->lookahead == L_NCCLASS) { | |
| 533 atom = newnode(g, P_NCCLASS); | |
| 534 atom->cc = g->yycc - g->cclass; | |
| 535 next(g); | |
| 536 return atom; | |
| 537 } | |
| 538 if (g->lookahead == L_REF) { | |
| 539 atom = newnode(g, P_REF); | |
| 540 if (g->yychar == 0 || g->yychar >= g->nsub || !g->sub[g->yychar]) | |
| 541 die(g, "invalid back-reference"); | |
| 542 atom->n = g->yychar; | |
| 543 atom->x = g->sub[g->yychar]; | |
| 544 next(g); | |
| 545 return atom; | |
| 546 } | |
| 547 if (accept(g, '.')) | |
| 548 return newnode(g, P_ANY); | |
| 549 if (accept(g, '(')) { | |
| 550 atom = newnode(g, P_PAR); | |
| 551 if (g->nsub == REG_MAXSUB) | |
| 552 die(g, "too many captures"); | |
| 553 atom->n = g->nsub++; | |
| 554 atom->x = parsealt(g); | |
| 555 g->sub[atom->n] = atom; | |
| 556 if (!accept(g, ')')) | |
| 557 die(g, "unmatched '('"); | |
| 558 return atom; | |
| 559 } | |
| 560 if (accept(g, L_NC)) { | |
| 561 atom = parsealt(g); | |
| 562 if (!accept(g, ')')) | |
| 563 die(g, "unmatched '('"); | |
| 564 return atom; | |
| 565 } | |
| 566 if (accept(g, L_PLA)) { | |
| 567 atom = newnode(g, P_PLA); | |
| 568 atom->x = parsealt(g); | |
| 569 if (!accept(g, ')')) | |
| 570 die(g, "unmatched '('"); | |
| 571 return atom; | |
| 572 } | |
| 573 if (accept(g, L_NLA)) { | |
| 574 atom = newnode(g, P_NLA); | |
| 575 atom->x = parsealt(g); | |
| 576 if (!accept(g, ')')) | |
| 577 die(g, "unmatched '('"); | |
| 578 return atom; | |
| 579 } | |
| 580 die(g, "syntax error"); | |
| 581 return NULL; | |
| 582 } | |
| 583 | |
| 584 static Renode *parserep(struct cstate *g) | |
| 585 { | |
| 586 Renode *atom; | |
| 587 | |
| 588 if (accept(g, '^')) return newnode(g, P_BOL); | |
| 589 if (accept(g, '$')) return newnode(g, P_EOL); | |
| 590 if (accept(g, L_WORD)) return newnode(g, P_WORD); | |
| 591 if (accept(g, L_NWORD)) return newnode(g, P_NWORD); | |
| 592 | |
| 593 atom = parseatom(g); | |
| 594 if (g->lookahead == L_COUNT) { | |
| 595 int min = g->yymin, max = g->yymax; | |
| 596 next(g); | |
| 597 if (max < min) | |
| 598 die(g, "invalid quantifier"); | |
| 599 return newrep(g, atom, accept(g, '?'), min, max); | |
| 600 } | |
| 601 if (accept(g, '*')) return newrep(g, atom, accept(g, '?'), 0, REPINF); | |
| 602 if (accept(g, '+')) return newrep(g, atom, accept(g, '?'), 1, REPINF); | |
| 603 if (accept(g, '?')) return newrep(g, atom, accept(g, '?'), 0, 1); | |
| 604 return atom; | |
| 605 } | |
| 606 | |
| 607 static Renode *parsecat(struct cstate *g) | |
| 608 { | |
| 609 Renode *cat, *head, **tail; | |
| 610 if (g->lookahead != EOF && g->lookahead != '|' && g->lookahead != ')') { | |
| 611 /* Build a right-leaning tree by splicing in new 'cat' at the tail. */ | |
| 612 head = parserep(g); | |
| 613 tail = &head; | |
| 614 while (g->lookahead != EOF && g->lookahead != '|' && g->lookahead != ')') { | |
| 615 cat = newnode(g, P_CAT); | |
| 616 cat->x = *tail; | |
| 617 cat->y = parserep(g); | |
| 618 *tail = cat; | |
| 619 tail = &cat->y; | |
| 620 } | |
| 621 return head; | |
| 622 } | |
| 623 return NULL; | |
| 624 } | |
| 625 | |
| 626 static Renode *parsealt(struct cstate *g) | |
| 627 { | |
| 628 Renode *alt, *x; | |
| 629 alt = parsecat(g); | |
| 630 while (accept(g, '|')) { | |
| 631 x = alt; | |
| 632 alt = newnode(g, P_ALT); | |
| 633 alt->x = x; | |
| 634 alt->y = parsecat(g); | |
| 635 } | |
| 636 return alt; | |
| 637 } | |
| 638 | |
| 639 /* Compile */ | |
| 640 | |
| 641 enum { | |
| 642 I_END, I_JUMP, I_SPLIT, I_PLA, I_NLA, | |
| 643 I_ANYNL, I_ANY, I_CHAR, I_CCLASS, I_NCCLASS, I_REF, | |
| 644 I_BOL, I_EOL, I_WORD, I_NWORD, | |
| 645 I_LPAR, I_RPAR | |
| 646 }; | |
| 647 | |
| 648 struct Reinst { | |
| 649 unsigned char opcode; | |
| 650 unsigned char n; | |
| 651 Rune c; | |
| 652 Reclass *cc; | |
| 653 Reinst *x; | |
| 654 Reinst *y; | |
| 655 }; | |
| 656 | |
| 657 static int count(struct cstate *g, Renode *node, int depth) | |
| 658 { | |
| 659 int min, max, n; | |
| 660 if (!node) return 0; | |
| 661 if (++depth > REG_MAXREC) die(g, "stack overflow"); | |
| 662 switch (node->type) { | |
| 663 default: return 1; | |
| 664 case P_CAT: return count(g, node->x, depth) + count(g, node->y, depth); | |
| 665 case P_ALT: return count(g, node->x, depth) + count(g, node->y, depth) + 2; | |
| 666 case P_REP: | |
| 667 min = node->m; | |
| 668 max = node->n; | |
| 669 if (min == max) n = count(g, node->x, depth) * min; | |
| 670 else if (max < REPINF) n = count(g, node->x, depth) * max + (max - min); | |
| 671 else n = count(g, node->x, depth) * (min + 1) + 2; | |
| 672 if (n < 0 || n > REG_MAXPROG) die(g, "program too large"); | |
| 673 return n; | |
| 674 case P_PAR: return count(g, node->x, depth) + 2; | |
| 675 case P_PLA: return count(g, node->x, depth) + 2; | |
| 676 case P_NLA: return count(g, node->x, depth) + 2; | |
| 677 } | |
| 678 } | |
| 679 | |
| 680 static Reinst *emit(Reprog *prog, int opcode) | |
| 681 { | |
| 682 Reinst *inst = prog->end++; | |
| 683 inst->opcode = opcode; | |
| 684 inst->n = 0; | |
| 685 inst->c = 0; | |
| 686 inst->cc = NULL; | |
| 687 inst->x = inst->y = NULL; | |
| 688 return inst; | |
| 689 } | |
| 690 | |
| 691 static void compile(Reprog *prog, Renode *node) | |
| 692 { | |
| 693 Reinst *inst, *split, *jump; | |
| 694 int i; | |
| 695 | |
| 696 loop: | |
| 697 if (!node) | |
| 698 return; | |
| 699 | |
| 700 switch (node->type) { | |
| 701 case P_CAT: | |
| 702 compile(prog, node->x); | |
| 703 node = node->y; | |
| 704 goto loop; | |
| 705 | |
| 706 case P_ALT: | |
| 707 split = emit(prog, I_SPLIT); | |
| 708 compile(prog, node->x); | |
| 709 jump = emit(prog, I_JUMP); | |
| 710 compile(prog, node->y); | |
| 711 split->x = split + 1; | |
| 712 split->y = jump + 1; | |
| 713 jump->x = prog->end; | |
| 714 break; | |
| 715 | |
| 716 case P_REP: | |
| 717 inst = NULL; /* silence compiler warning. assert(node->m > 0). */ | |
| 718 for (i = 0; i < node->m; ++i) { | |
| 719 inst = prog->end; | |
| 720 compile(prog, node->x); | |
| 721 } | |
| 722 if (node->m == node->n) | |
| 723 break; | |
| 724 if (node->n < REPINF) { | |
| 725 for (i = node->m; i < node->n; ++i) { | |
| 726 split = emit(prog, I_SPLIT); | |
| 727 compile(prog, node->x); | |
| 728 if (node->ng) { | |
| 729 split->y = split + 1; | |
| 730 split->x = prog->end; | |
| 731 } else { | |
| 732 split->x = split + 1; | |
| 733 split->y = prog->end; | |
| 734 } | |
| 735 } | |
| 736 } else if (node->m == 0) { | |
| 737 split = emit(prog, I_SPLIT); | |
| 738 compile(prog, node->x); | |
| 739 jump = emit(prog, I_JUMP); | |
| 740 if (node->ng) { | |
| 741 split->y = split + 1; | |
| 742 split->x = prog->end; | |
| 743 } else { | |
| 744 split->x = split + 1; | |
| 745 split->y = prog->end; | |
| 746 } | |
| 747 jump->x = split; | |
| 748 } else { | |
| 749 split = emit(prog, I_SPLIT); | |
| 750 if (node->ng) { | |
| 751 split->y = inst; | |
| 752 split->x = prog->end; | |
| 753 } else { | |
| 754 split->x = inst; | |
| 755 split->y = prog->end; | |
| 756 } | |
| 757 } | |
| 758 break; | |
| 759 | |
| 760 case P_BOL: emit(prog, I_BOL); break; | |
| 761 case P_EOL: emit(prog, I_EOL); break; | |
| 762 case P_WORD: emit(prog, I_WORD); break; | |
| 763 case P_NWORD: emit(prog, I_NWORD); break; | |
| 764 | |
| 765 case P_PAR: | |
| 766 inst = emit(prog, I_LPAR); | |
| 767 inst->n = node->n; | |
| 768 compile(prog, node->x); | |
| 769 inst = emit(prog, I_RPAR); | |
| 770 inst->n = node->n; | |
| 771 break; | |
| 772 case P_PLA: | |
| 773 split = emit(prog, I_PLA); | |
| 774 compile(prog, node->x); | |
| 775 emit(prog, I_END); | |
| 776 split->x = split + 1; | |
| 777 split->y = prog->end; | |
| 778 break; | |
| 779 case P_NLA: | |
| 780 split = emit(prog, I_NLA); | |
| 781 compile(prog, node->x); | |
| 782 emit(prog, I_END); | |
| 783 split->x = split + 1; | |
| 784 split->y = prog->end; | |
| 785 break; | |
| 786 | |
| 787 case P_ANY: | |
| 788 emit(prog, I_ANY); | |
| 789 break; | |
| 790 case P_CHAR: | |
| 791 inst = emit(prog, I_CHAR); | |
| 792 inst->c = (prog->flags & REG_ICASE) ? canon(node->c) : node->c; | |
| 793 break; | |
| 794 case P_CCLASS: | |
| 795 inst = emit(prog, I_CCLASS); | |
| 796 inst->cc = prog->cclass + node->cc; | |
| 797 break; | |
| 798 case P_NCCLASS: | |
| 799 inst = emit(prog, I_NCCLASS); | |
| 800 inst->cc = prog->cclass + node->cc; | |
| 801 break; | |
| 802 case P_REF: | |
| 803 inst = emit(prog, I_REF); | |
| 804 inst->n = node->n; | |
| 805 break; | |
| 806 } | |
| 807 } | |
| 808 | |
| 809 #ifdef TEST | |
| 810 static void dumpnode(struct cstate *g, Renode *node) | |
| 811 { | |
| 812 Rune *p; | |
| 813 Reclass *cc; | |
| 814 if (!node) { printf("Empty"); return; } | |
| 815 switch (node->type) { | |
| 816 case P_CAT: printf("Cat("); dumpnode(g, node->x); printf(", "); dumpnode(g, node->y); printf(")"); break; | |
| 817 case P_ALT: printf("Alt("); dumpnode(g, node->x); printf(", "); dumpnode(g, node->y); printf(")"); break; | |
| 818 case P_REP: | |
| 819 printf(node->ng ? "NgRep(%d,%d," : "Rep(%d,%d,", node->m, node->n); | |
| 820 dumpnode(g, node->x); | |
| 821 printf(")"); | |
| 822 break; | |
| 823 case P_BOL: printf("Bol"); break; | |
| 824 case P_EOL: printf("Eol"); break; | |
| 825 case P_WORD: printf("Word"); break; | |
| 826 case P_NWORD: printf("NotWord"); break; | |
| 827 case P_PAR: printf("Par(%d,", node->n); dumpnode(g, node->x); printf(")"); break; | |
| 828 case P_PLA: printf("PLA("); dumpnode(g, node->x); printf(")"); break; | |
| 829 case P_NLA: printf("NLA("); dumpnode(g, node->x); printf(")"); break; | |
| 830 case P_ANY: printf("Any"); break; | |
| 831 case P_CHAR: printf("Char(%c)", node->c); break; | |
| 832 case P_CCLASS: | |
| 833 printf("Class("); | |
| 834 cc = g->cclass + node->cc; | |
| 835 for (p = cc->spans; p < cc->end; p += 2) printf("%02X-%02X,", p[0], p[1]); | |
| 836 printf(")"); | |
| 837 break; | |
| 838 case P_NCCLASS: | |
| 839 printf("NotClass("); | |
| 840 cc = g->cclass + node->cc; | |
| 841 for (p = cc->spans; p < cc->end; p += 2) printf("%02X-%02X,", p[0], p[1]); | |
| 842 printf(")"); | |
| 843 break; | |
| 844 case P_REF: printf("Ref(%d)", node->n); break; | |
| 845 } | |
| 846 } | |
| 847 | |
| 848 static void dumpcclass(Reclass *cc) { | |
| 849 Rune *p; | |
| 850 for (p = cc->spans; p < cc->end; p += 2) { | |
| 851 if (p[0] > 32 && p[0] < 127) | |
| 852 printf(" %c", p[0]); | |
| 853 else | |
| 854 printf(" \\x%02x", p[0]); | |
| 855 if (p[1] > 32 && p[1] < 127) | |
| 856 printf("-%c", p[1]); | |
| 857 else | |
| 858 printf("-\\x%02x", p[1]); | |
| 859 } | |
| 860 putchar('\n'); | |
| 861 } | |
| 862 | |
| 863 static void dumpprog(Reprog *prog) | |
| 864 { | |
| 865 Reinst *inst; | |
| 866 int i; | |
| 867 for (i = 0, inst = prog->start; inst < prog->end; ++i, ++inst) { | |
| 868 printf("% 5d: ", i); | |
| 869 switch (inst->opcode) { | |
| 870 case I_END: puts("end"); break; | |
| 871 case I_JUMP: printf("jump %d\n", (int)(inst->x - prog->start)); break; | |
| 872 case I_SPLIT: printf("split %d %d\n", (int)(inst->x - prog->start), (int)(inst->y - prog->start)); break; | |
| 873 case I_PLA: printf("pla %d %d\n", (int)(inst->x - prog->start), (int)(inst->y - prog->start)); break; | |
| 874 case I_NLA: printf("nla %d %d\n", (int)(inst->x - prog->start), (int)(inst->y - prog->start)); break; | |
| 875 case I_ANY: puts("any"); break; | |
| 876 case I_ANYNL: puts("anynl"); break; | |
| 877 case I_CHAR: printf(inst->c >= 32 && inst->c < 127 ? "char '%c'\n" : "char U+%04X\n", inst->c); break; | |
| 878 case I_CCLASS: printf("cclass"); dumpcclass(inst->cc); break; | |
| 879 case I_NCCLASS: printf("ncclass"); dumpcclass(inst->cc); break; | |
| 880 case I_REF: printf("ref %d\n", inst->n); break; | |
| 881 case I_BOL: puts("bol"); break; | |
| 882 case I_EOL: puts("eol"); break; | |
| 883 case I_WORD: puts("word"); break; | |
| 884 case I_NWORD: puts("nword"); break; | |
| 885 case I_LPAR: printf("lpar %d\n", inst->n); break; | |
| 886 case I_RPAR: printf("rpar %d\n", inst->n); break; | |
| 887 } | |
| 888 } | |
| 889 } | |
| 890 #endif | |
| 891 | |
| 892 Reprog *regcompx(void *(*alloc)(void *ctx, void *p, int n), void *ctx, | |
| 893 const char *pattern, int cflags, const char **errorp) | |
| 894 { | |
| 895 struct cstate g; | |
| 896 Renode *node; | |
| 897 Reinst *split, *jump; | |
| 898 int i, n; | |
| 899 | |
| 900 g.pstart = NULL; | |
| 901 g.prog = NULL; | |
| 902 | |
| 903 if (setjmp(g.kaboom)) { | |
| 904 if (errorp) *errorp = g.error; | |
| 905 alloc(ctx, g.pstart, 0); | |
| 906 if (g.prog) { | |
| 907 alloc(ctx, g.prog->cclass, 0); | |
| 908 alloc(ctx, g.prog->start, 0); | |
| 909 alloc(ctx, g.prog, 0); | |
| 910 } | |
| 911 return NULL; | |
| 912 } | |
| 913 | |
| 914 g.prog = alloc(ctx, NULL, sizeof (Reprog)); | |
| 915 if (!g.prog) | |
| 916 die(&g, "cannot allocate regular expression"); | |
| 917 g.prog->start = NULL; | |
| 918 g.prog->cclass = NULL; | |
| 919 | |
| 920 n = strlen(pattern) * 2; | |
| 921 if (n > REG_MAXPROG) | |
| 922 die(&g, "program too large"); | |
| 923 if (n > 0) { | |
| 924 g.pstart = g.pend = alloc(ctx, NULL, sizeof (Renode) * n); | |
| 925 if (!g.pstart) | |
| 926 die(&g, "cannot allocate regular expression parse list"); | |
| 927 } | |
| 928 | |
| 929 g.source = pattern; | |
| 930 g.ncclass = 0; | |
| 931 g.nsub = 1; | |
| 932 for (i = 0; i < REG_MAXSUB; ++i) | |
| 933 g.sub[i] = 0; | |
| 934 | |
| 935 g.prog->flags = cflags; | |
| 936 | |
| 937 next(&g); | |
| 938 node = parsealt(&g); | |
| 939 if (g.lookahead == ')') | |
| 940 die(&g, "unmatched ')'"); | |
| 941 if (g.lookahead != EOF) | |
| 942 die(&g, "syntax error"); | |
| 943 | |
| 944 #ifdef TEST | |
| 945 dumpnode(&g, node); | |
| 946 putchar('\n'); | |
| 947 #endif | |
| 948 | |
| 949 n = 6 + count(&g, node, 0); | |
| 950 if (n < 0 || n > REG_MAXPROG) | |
| 951 die(&g, "program too large"); | |
| 952 | |
| 953 g.prog->nsub = g.nsub; | |
| 954 g.prog->start = g.prog->end = alloc(ctx, NULL, n * sizeof (Reinst)); | |
| 955 if (!g.prog->start) | |
| 956 die(&g, "cannot allocate regular expression instruction list"); | |
| 957 | |
| 958 if (g.ncclass > 0) { | |
| 959 g.prog->cclass = alloc(ctx, NULL, g.ncclass * sizeof (Reclass)); | |
| 960 if (!g.prog->cclass) | |
| 961 die(&g, "cannot allocate regular expression character class list"); | |
| 962 memcpy(g.prog->cclass, g.cclass, g.ncclass * sizeof (Reclass)); | |
| 963 for (i = 0; i < g.ncclass; ++i) | |
| 964 g.prog->cclass[i].end = g.prog->cclass[i].spans + (g.cclass[i].end - g.cclass[i].spans); | |
| 965 } | |
| 966 | |
| 967 split = emit(g.prog, I_SPLIT); | |
| 968 split->x = split + 3; | |
| 969 split->y = split + 1; | |
| 970 emit(g.prog, I_ANYNL); | |
| 971 jump = emit(g.prog, I_JUMP); | |
| 972 jump->x = split; | |
| 973 emit(g.prog, I_LPAR); | |
| 974 compile(g.prog, node); | |
| 975 emit(g.prog, I_RPAR); | |
| 976 emit(g.prog, I_END); | |
| 977 | |
| 978 #ifdef TEST | |
| 979 dumpprog(g.prog); | |
| 980 #endif | |
| 981 | |
| 982 alloc(ctx, g.pstart, 0); | |
| 983 | |
| 984 if (errorp) *errorp = NULL; | |
| 985 return g.prog; | |
| 986 } | |
| 987 | |
| 988 void regfreex(void *(*alloc)(void *ctx, void *p, int n), void *ctx, Reprog *prog) | |
| 989 { | |
| 990 if (prog) { | |
| 991 if (prog->cclass) | |
| 992 alloc(ctx, prog->cclass, 0); | |
| 993 alloc(ctx, prog->start, 0); | |
| 994 alloc(ctx, prog, 0); | |
| 995 } | |
| 996 } | |
| 997 | |
| 998 static void *default_alloc(void *ctx, void *p, int n) | |
| 999 { | |
| 1000 if (n == 0) { | |
| 1001 free(p); | |
| 1002 return NULL; | |
| 1003 } | |
| 1004 return realloc(p, (size_t)n); | |
| 1005 } | |
| 1006 | |
| 1007 Reprog *regcomp(const char *pattern, int cflags, const char **errorp) | |
| 1008 { | |
| 1009 return regcompx(default_alloc, NULL, pattern, cflags, errorp); | |
| 1010 } | |
| 1011 | |
| 1012 void regfree(Reprog *prog) | |
| 1013 { | |
| 1014 regfreex(default_alloc, NULL, prog); | |
| 1015 } | |
| 1016 | |
| 1017 /* Match */ | |
| 1018 | |
| 1019 static int isnewline(int c) | |
| 1020 { | |
| 1021 return c == 0xA || c == 0xD || c == 0x2028 || c == 0x2029; | |
| 1022 } | |
| 1023 | |
| 1024 static int iswordchar(int c) | |
| 1025 { | |
| 1026 return c == '_' || | |
| 1027 (c >= 'a' && c <= 'z') || | |
| 1028 (c >= 'A' && c <= 'Z') || | |
| 1029 (c >= '0' && c <= '9'); | |
| 1030 } | |
| 1031 | |
| 1032 static int incclass(Reclass *cc, Rune c) | |
| 1033 { | |
| 1034 Rune *p; | |
| 1035 for (p = cc->spans; p < cc->end; p += 2) | |
| 1036 if (p[0] <= c && c <= p[1]) | |
| 1037 return 1; | |
| 1038 return 0; | |
| 1039 } | |
| 1040 | |
| 1041 static int incclasscanon(Reclass *cc, Rune c) | |
| 1042 { | |
| 1043 Rune *p, r; | |
| 1044 for (p = cc->spans; p < cc->end; p += 2) | |
| 1045 for (r = p[0]; r <= p[1]; ++r) | |
| 1046 if (c == canon(r)) | |
| 1047 return 1; | |
| 1048 return 0; | |
| 1049 } | |
| 1050 | |
| 1051 static int strncmpcanon(const char *a, const char *b, int n) | |
| 1052 { | |
| 1053 Rune ra, rb; | |
| 1054 int c; | |
| 1055 while (n--) { | |
| 1056 if (!*a) return -1; | |
| 1057 if (!*b) return 1; | |
| 1058 a += chartorune(&ra, a); | |
| 1059 b += chartorune(&rb, b); | |
| 1060 c = canon(ra) - canon(rb); | |
| 1061 if (c) | |
| 1062 return c; | |
| 1063 } | |
| 1064 return 0; | |
| 1065 } | |
| 1066 | |
| 1067 static int match(Reinst *pc, const char *sp, const char *bol, int flags, Resub *out, int depth) | |
| 1068 { | |
| 1069 Resub scratch; | |
| 1070 int result; | |
| 1071 int i; | |
| 1072 Rune c; | |
| 1073 | |
| 1074 /* stack overflow */ | |
| 1075 if (depth > REG_MAXREC) | |
| 1076 return -1; | |
| 1077 | |
| 1078 for (;;) { | |
| 1079 switch (pc->opcode) { | |
| 1080 case I_END: | |
| 1081 return 0; | |
| 1082 case I_JUMP: | |
| 1083 pc = pc->x; | |
| 1084 break; | |
| 1085 case I_SPLIT: | |
| 1086 scratch = *out; | |
| 1087 result = match(pc->x, sp, bol, flags, &scratch, depth+1); | |
| 1088 if (result == -1) | |
| 1089 return -1; | |
| 1090 if (result == 0) { | |
| 1091 *out = scratch; | |
| 1092 return 0; | |
| 1093 } | |
| 1094 pc = pc->y; | |
| 1095 break; | |
| 1096 | |
| 1097 case I_PLA: | |
| 1098 result = match(pc->x, sp, bol, flags, out, depth+1); | |
| 1099 if (result == -1) | |
| 1100 return -1; | |
| 1101 if (result == 1) | |
| 1102 return 1; | |
| 1103 pc = pc->y; | |
| 1104 break; | |
| 1105 case I_NLA: | |
| 1106 scratch = *out; | |
| 1107 result = match(pc->x, sp, bol, flags, &scratch, depth+1); | |
| 1108 if (result == -1) | |
| 1109 return -1; | |
| 1110 if (result == 0) | |
| 1111 return 1; | |
| 1112 pc = pc->y; | |
| 1113 break; | |
| 1114 | |
| 1115 case I_ANYNL: | |
| 1116 if (!*sp) return 1; | |
| 1117 sp += chartorune(&c, sp); | |
| 1118 pc = pc + 1; | |
| 1119 break; | |
| 1120 case I_ANY: | |
| 1121 if (!*sp) return 1; | |
| 1122 sp += chartorune(&c, sp); | |
| 1123 if (isnewline(c)) | |
| 1124 return 1; | |
| 1125 pc = pc + 1; | |
| 1126 break; | |
| 1127 case I_CHAR: | |
| 1128 if (!*sp) return 1; | |
| 1129 sp += chartorune(&c, sp); | |
| 1130 if (flags & REG_ICASE) | |
| 1131 c = canon(c); | |
| 1132 if (c != pc->c) | |
| 1133 return 1; | |
| 1134 pc = pc + 1; | |
| 1135 break; | |
| 1136 case I_CCLASS: | |
| 1137 if (!*sp) return 1; | |
| 1138 sp += chartorune(&c, sp); | |
| 1139 if (flags & REG_ICASE) { | |
| 1140 if (!incclasscanon(pc->cc, canon(c))) | |
| 1141 return 1; | |
| 1142 } else { | |
| 1143 if (!incclass(pc->cc, c)) | |
| 1144 return 1; | |
| 1145 } | |
| 1146 pc = pc + 1; | |
| 1147 break; | |
| 1148 case I_NCCLASS: | |
| 1149 if (!*sp) return 1; | |
| 1150 sp += chartorune(&c, sp); | |
| 1151 if (flags & REG_ICASE) { | |
| 1152 if (incclasscanon(pc->cc, canon(c))) | |
| 1153 return 1; | |
| 1154 } else { | |
| 1155 if (incclass(pc->cc, c)) | |
| 1156 return 1; | |
| 1157 } | |
| 1158 pc = pc + 1; | |
| 1159 break; | |
| 1160 case I_REF: | |
| 1161 i = out->sub[pc->n].ep - out->sub[pc->n].sp; | |
| 1162 if (flags & REG_ICASE) { | |
| 1163 if (strncmpcanon(sp, out->sub[pc->n].sp, i)) | |
| 1164 return 1; | |
| 1165 } else { | |
| 1166 if (strncmp(sp, out->sub[pc->n].sp, i)) | |
| 1167 return 1; | |
| 1168 } | |
| 1169 if (i > 0) | |
| 1170 sp += i; | |
| 1171 pc = pc + 1; | |
| 1172 break; | |
| 1173 | |
| 1174 case I_BOL: | |
| 1175 if (sp == bol && !(flags & REG_NOTBOL)) { | |
| 1176 pc = pc + 1; | |
| 1177 break; | |
| 1178 } | |
| 1179 if (flags & REG_NEWLINE) { | |
| 1180 if (sp > bol && isnewline(sp[-1])) { | |
| 1181 pc = pc + 1; | |
| 1182 break; | |
| 1183 } | |
| 1184 } | |
| 1185 return 1; | |
| 1186 case I_EOL: | |
| 1187 if (*sp == 0) { | |
| 1188 pc = pc + 1; | |
| 1189 break; | |
| 1190 } | |
| 1191 if (flags & REG_NEWLINE) { | |
| 1192 if (isnewline(*sp)) { | |
| 1193 pc = pc + 1; | |
| 1194 break; | |
| 1195 } | |
| 1196 } | |
| 1197 return 1; | |
| 1198 case I_WORD: | |
| 1199 i = sp > bol && iswordchar(sp[-1]); | |
| 1200 i ^= iswordchar(sp[0]); | |
| 1201 if (!i) | |
| 1202 return 1; | |
| 1203 pc = pc + 1; | |
| 1204 break; | |
| 1205 case I_NWORD: | |
| 1206 i = sp > bol && iswordchar(sp[-1]); | |
| 1207 i ^= iswordchar(sp[0]); | |
| 1208 if (i) | |
| 1209 return 1; | |
| 1210 pc = pc + 1; | |
| 1211 break; | |
| 1212 | |
| 1213 case I_LPAR: | |
| 1214 out->sub[pc->n].sp = sp; | |
| 1215 pc = pc + 1; | |
| 1216 break; | |
| 1217 case I_RPAR: | |
| 1218 out->sub[pc->n].ep = sp; | |
| 1219 pc = pc + 1; | |
| 1220 break; | |
| 1221 default: | |
| 1222 return 1; | |
| 1223 } | |
| 1224 } | |
| 1225 } | |
| 1226 | |
| 1227 int regexec(Reprog *prog, const char *sp, Resub *sub, int eflags) | |
| 1228 { | |
| 1229 Resub scratch; | |
| 1230 int i; | |
| 1231 | |
| 1232 if (!sub) | |
| 1233 sub = &scratch; | |
| 1234 | |
| 1235 sub->nsub = prog->nsub; | |
| 1236 for (i = 0; i < REG_MAXSUB; ++i) | |
| 1237 sub->sub[i].sp = sub->sub[i].ep = NULL; | |
| 1238 | |
| 1239 return match(prog->start, sp, sp, prog->flags | eflags, sub, 0); | |
| 1240 } | |
| 1241 | |
| 1242 #ifdef TEST | |
| 1243 int main(int argc, char **argv) | |
| 1244 { | |
| 1245 const char *error; | |
| 1246 const char *s; | |
| 1247 Reprog *p; | |
| 1248 Resub m; | |
| 1249 int i; | |
| 1250 | |
| 1251 if (argc > 1) { | |
| 1252 p = regcomp(argv[1], 0, &error); | |
| 1253 if (!p) { | |
| 1254 fprintf(stderr, "regcomp: %s\n", error); | |
| 1255 return 1; | |
| 1256 } | |
| 1257 | |
| 1258 if (argc > 2) { | |
| 1259 s = argv[2]; | |
| 1260 printf("nsub = %d\n", p->nsub); | |
| 1261 if (!regexec(p, s, &m, 0)) { | |
| 1262 for (i = 0; i < m.nsub; ++i) { | |
| 1263 int n = m.sub[i].ep - m.sub[i].sp; | |
| 1264 if (n > 0) | |
| 1265 printf("match %d: s=%d e=%d n=%d '%.*s'\n", i, (int)(m.sub[i].sp - s), (int)(m.sub[i].ep - s), n, n, m.sub[i].sp); | |
| 1266 else | |
| 1267 printf("match %d: n=0 ''\n", i); | |
| 1268 } | |
| 1269 } else { | |
| 1270 printf("no match\n"); | |
| 1271 } | |
| 1272 } | |
| 1273 } | |
| 1274 | |
| 1275 return 0; | |
| 1276 } | |
| 1277 #endif |
