Mercurial > hgrepos > Python2 > PyMuPDF
diff mupdf-source/thirdparty/mujs/regexp.c @ 2:b50eed0cc0ef upstream
ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4.
The directory name has changed: no version number in the expanded directory now.
| author | Franz Glasner <fzglas.hg@dom66.de> |
|---|---|
| date | Mon, 15 Sep 2025 11:43:07 +0200 |
| parents | |
| children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/mupdf-source/thirdparty/mujs/regexp.c Mon Sep 15 11:43:07 2025 +0200 @@ -0,0 +1,1277 @@ +#include <stdlib.h> +#include <stdio.h> +#include <string.h> +#include <setjmp.h> +#include <limits.h> + +#include "regexp.h" +#include "utf.h" + +#define emit regemit +#define next regnext +#define accept regaccept + +#define nelem(a) (int)(sizeof (a) / sizeof (a)[0]) + +#define REPINF 255 +#ifndef REG_MAXPROG +#define REG_MAXPROG (32 << 10) +#endif +#ifndef REG_MAXREC +#define REG_MAXREC 1024 +#endif +#ifndef REG_MAXSPAN +#define REG_MAXSPAN 64 +#endif +#ifndef REG_MAXCLASS +#define REG_MAXCLASS 128 +#endif + +typedef struct Reclass Reclass; +typedef struct Renode Renode; +typedef struct Reinst Reinst; +typedef struct Rethread Rethread; + +struct Reclass { + Rune *end; + Rune spans[REG_MAXSPAN]; +}; + +struct Reprog { + Reinst *start, *end; + Reclass *cclass; + int flags; + int nsub; +}; + +struct cstate { + Reprog *prog; + Renode *pstart, *pend; + + const char *source; + int ncclass; + int nsub; + Renode *sub[REG_MAXSUB]; + + int lookahead; + Rune yychar; + Reclass *yycc; + int yymin, yymax; + + const char *error; + jmp_buf kaboom; + + Reclass cclass[REG_MAXCLASS]; +}; + +static void die(struct cstate *g, const char *message) +{ + g->error = message; + longjmp(g->kaboom, 1); +} + +static int canon(Rune c) +{ + Rune u = toupperrune(c); + if (c >= 128 && u < 128) + return c; + return u; +} + +/* Scan */ + +enum { + L_CHAR = 256, + L_CCLASS, /* character class */ + L_NCCLASS, /* negative character class */ + L_NC, /* "(?:" no capture */ + L_PLA, /* "(?=" positive lookahead */ + L_NLA, /* "(?!" negative lookahead */ + L_WORD, /* "\b" word boundary */ + L_NWORD, /* "\B" non-word boundary */ + L_REF, /* "\1" back-reference */ + L_COUNT, /* {M,N} */ +}; + +static int hex(struct cstate *g, int c) +{ + if (c >= '0' && c <= '9') return c - '0'; + if (c >= 'a' && c <= 'f') return c - 'a' + 0xA; + if (c >= 'A' && c <= 'F') return c - 'A' + 0xA; + die(g, "invalid escape sequence"); + return 0; +} + +static int dec(struct cstate *g, int c) +{ + if (c >= '0' && c <= '9') return c - '0'; + die(g, "invalid quantifier"); + return 0; +} + +#define ESCAPES "BbDdSsWw^$\\.*+?()[]{}|-0123456789" + +static int isunicodeletter(int c) +{ + return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || isalpharune(c); +} + +static int nextrune(struct cstate *g) +{ + if (!*g->source) { + g->yychar = EOF; + return 0; + } + g->source += chartorune(&g->yychar, g->source); + if (g->yychar == '\\') { + if (!*g->source) + die(g, "unterminated escape sequence"); + g->source += chartorune(&g->yychar, g->source); + switch (g->yychar) { + case 'f': g->yychar = '\f'; return 0; + case 'n': g->yychar = '\n'; return 0; + case 'r': g->yychar = '\r'; return 0; + case 't': g->yychar = '\t'; return 0; + case 'v': g->yychar = '\v'; return 0; + case 'c': + if (!g->source[0]) + die(g, "unterminated escape sequence"); + g->yychar = (*g->source++) & 31; + return 0; + case 'x': + if (!g->source[0] || !g->source[1]) + die(g, "unterminated escape sequence"); + g->yychar = hex(g, *g->source++) << 4; + g->yychar += hex(g, *g->source++); + if (g->yychar == 0) { + g->yychar = '0'; + return 1; + } + return 0; + case 'u': + if (!g->source[0] || !g->source[1] || !g->source[2] || !g->source[3]) + die(g, "unterminated escape sequence"); + g->yychar = hex(g, *g->source++) << 12; + g->yychar += hex(g, *g->source++) << 8; + g->yychar += hex(g, *g->source++) << 4; + g->yychar += hex(g, *g->source++); + if (g->yychar == 0) { + g->yychar = '0'; + return 1; + } + return 0; + case 0: + g->yychar = '0'; + return 1; + } + if (strchr(ESCAPES, g->yychar)) + return 1; + if (isunicodeletter(g->yychar) || g->yychar == '_') /* check identity escape */ + die(g, "invalid escape character"); + return 0; + } + return 0; +} + +static int lexcount(struct cstate *g) +{ + g->yychar = *g->source++; + + g->yymin = dec(g, g->yychar); + g->yychar = *g->source++; + while (g->yychar != ',' && g->yychar != '}') { + g->yymin = g->yymin * 10 + dec(g, g->yychar); + g->yychar = *g->source++; + if (g->yymin >= REPINF) + die(g, "numeric overflow"); + } + + if (g->yychar == ',') { + g->yychar = *g->source++; + if (g->yychar == '}') { + g->yymax = REPINF; + } else { + g->yymax = dec(g, g->yychar); + g->yychar = *g->source++; + while (g->yychar != '}') { + g->yymax = g->yymax * 10 + dec(g, g->yychar); + g->yychar = *g->source++; + if (g->yymax >= REPINF) + die(g, "numeric overflow"); + } + } + } else { + g->yymax = g->yymin; + } + + return L_COUNT; +} + +static void newcclass(struct cstate *g) +{ + if (g->ncclass >= REG_MAXCLASS) + die(g, "too many character classes"); + g->yycc = g->cclass + g->ncclass++; + g->yycc->end = g->yycc->spans; +} + +static void addrange(struct cstate *g, Rune a, Rune b) +{ + Reclass *cc = g->yycc; + Rune *p; + + if (a > b) + die(g, "invalid character class range"); + + /* extend existing ranges if they overlap */ + for (p = cc->spans; p < cc->end; p += 2) { + /* completely inside old range */ + if (a >= p[0] && b <= p[1]) + return; + + /* completely swallows old range */ + if (a < p[0] && b >= p[1]) { + p[0] = a; + p[1] = b; + return; + } + + /* extend at start */ + if (b >= p[0] - 1 && b <= p[1] && a < p[0]) { + p[0] = a; + return; + } + + /* extend at end */ + if (a >= p[0] && a <= p[1] + 1 && b > p[1]) { + p[1] = b; + return; + } + } + + if (cc->end + 2 >= cc->spans + nelem(cc->spans)) + die(g, "too many character class ranges"); + *cc->end++ = a; + *cc->end++ = b; +} + +static void addranges_d(struct cstate *g) +{ + addrange(g, '0', '9'); +} + +static void addranges_D(struct cstate *g) +{ + addrange(g, 0, '0'-1); + addrange(g, '9'+1, 0xFFFF); +} + +static void addranges_s(struct cstate *g) +{ + addrange(g, 0x9, 0xD); + addrange(g, 0x20, 0x20); + addrange(g, 0xA0, 0xA0); + addrange(g, 0x2028, 0x2029); + addrange(g, 0xFEFF, 0xFEFF); +} + +static void addranges_S(struct cstate *g) +{ + addrange(g, 0, 0x9-1); + addrange(g, 0xD+1, 0x20-1); + addrange(g, 0x20+1, 0xA0-1); + addrange(g, 0xA0+1, 0x2028-1); + addrange(g, 0x2029+1, 0xFEFF-1); + addrange(g, 0xFEFF+1, 0xFFFF); +} + +static void addranges_w(struct cstate *g) +{ + addrange(g, '0', '9'); + addrange(g, 'A', 'Z'); + addrange(g, '_', '_'); + addrange(g, 'a', 'z'); +} + +static void addranges_W(struct cstate *g) +{ + addrange(g, 0, '0'-1); + addrange(g, '9'+1, 'A'-1); + addrange(g, 'Z'+1, '_'-1); + addrange(g, '_'+1, 'a'-1); + addrange(g, 'z'+1, 0xFFFF); +} + +static int lexclass(struct cstate *g) +{ + int type = L_CCLASS; + int quoted, havesave, havedash; + Rune save = 0; + + newcclass(g); + + quoted = nextrune(g); + if (!quoted && g->yychar == '^') { + type = L_NCCLASS; + quoted = nextrune(g); + } + + havesave = havedash = 0; + for (;;) { + if (g->yychar == EOF) + die(g, "unterminated character class"); + if (!quoted && g->yychar == ']') + break; + + if (!quoted && g->yychar == '-') { + if (havesave) { + if (havedash) { + addrange(g, save, '-'); + havesave = havedash = 0; + } else { + havedash = 1; + } + } else { + save = '-'; + havesave = 1; + } + } else if (quoted && strchr("DSWdsw", g->yychar)) { + if (havesave) { + addrange(g, save, save); + if (havedash) + addrange(g, '-', '-'); + } + switch (g->yychar) { + case 'd': addranges_d(g); break; + case 's': addranges_s(g); break; + case 'w': addranges_w(g); break; + case 'D': addranges_D(g); break; + case 'S': addranges_S(g); break; + case 'W': addranges_W(g); break; + } + havesave = havedash = 0; + } else { + if (quoted) { + if (g->yychar == 'b') + g->yychar = '\b'; + else if (g->yychar == '0') + g->yychar = 0; + /* else identity escape */ + } + if (havesave) { + if (havedash) { + addrange(g, save, g->yychar); + havesave = havedash = 0; + } else { + addrange(g, save, save); + save = g->yychar; + } + } else { + save = g->yychar; + havesave = 1; + } + } + + quoted = nextrune(g); + } + + if (havesave) { + addrange(g, save, save); + if (havedash) + addrange(g, '-', '-'); + } + + return type; +} + +static int lex(struct cstate *g) +{ + int quoted = nextrune(g); + if (quoted) { + switch (g->yychar) { + case 'b': return L_WORD; + case 'B': return L_NWORD; + case 'd': newcclass(g); addranges_d(g); return L_CCLASS; + case 's': newcclass(g); addranges_s(g); return L_CCLASS; + case 'w': newcclass(g); addranges_w(g); return L_CCLASS; + case 'D': newcclass(g); addranges_d(g); return L_NCCLASS; + case 'S': newcclass(g); addranges_s(g); return L_NCCLASS; + case 'W': newcclass(g); addranges_w(g); return L_NCCLASS; + case '0': g->yychar = 0; return L_CHAR; + } + if (g->yychar >= '0' && g->yychar <= '9') { + g->yychar -= '0'; + if (*g->source >= '0' && *g->source <= '9') + g->yychar = g->yychar * 10 + *g->source++ - '0'; + return L_REF; + } + return L_CHAR; + } + + switch (g->yychar) { + case EOF: + case '$': case ')': case '*': case '+': + case '.': case '?': case '^': case '|': + return g->yychar; + } + + if (g->yychar == '{') + return lexcount(g); + if (g->yychar == '[') + return lexclass(g); + if (g->yychar == '(') { + if (g->source[0] == '?') { + if (g->source[1] == ':') { + g->source += 2; + return L_NC; + } + if (g->source[1] == '=') { + g->source += 2; + return L_PLA; + } + if (g->source[1] == '!') { + g->source += 2; + return L_NLA; + } + } + return '('; + } + + return L_CHAR; +} + +/* Parse */ + +enum { + P_CAT, P_ALT, P_REP, + P_BOL, P_EOL, P_WORD, P_NWORD, + P_PAR, P_PLA, P_NLA, + P_ANY, P_CHAR, P_CCLASS, P_NCCLASS, + P_REF, +}; + +struct Renode { + unsigned char type; + unsigned char ng, m, n; + Rune c; + int cc; + Renode *x; + Renode *y; +}; + +static Renode *newnode(struct cstate *g, int type) +{ + Renode *node = g->pend++; + node->type = type; + node->cc = -1; + node->c = 0; + node->ng = 0; + node->m = 0; + node->n = 0; + node->x = node->y = NULL; + return node; +} + +static int empty(Renode *node) +{ + if (!node) return 1; + switch (node->type) { + default: return 1; + case P_CAT: return empty(node->x) && empty(node->y); + case P_ALT: return empty(node->x) || empty(node->y); + case P_REP: return empty(node->x) || node->m == 0; + case P_PAR: return empty(node->x); + case P_REF: return empty(node->x); + case P_ANY: case P_CHAR: case P_CCLASS: case P_NCCLASS: return 0; + } +} + +static Renode *newrep(struct cstate *g, Renode *atom, int ng, int min, int max) +{ + Renode *rep = newnode(g, P_REP); + if (max == REPINF && empty(atom)) + die(g, "infinite loop matching the empty string"); + rep->ng = ng; + rep->m = min; + rep->n = max; + rep->x = atom; + return rep; +} + +static void next(struct cstate *g) +{ + g->lookahead = lex(g); +} + +static int accept(struct cstate *g, int t) +{ + if (g->lookahead == t) { + next(g); + return 1; + } + return 0; +} + +static Renode *parsealt(struct cstate *g); + +static Renode *parseatom(struct cstate *g) +{ + Renode *atom; + if (g->lookahead == L_CHAR) { + atom = newnode(g, P_CHAR); + atom->c = g->yychar; + next(g); + return atom; + } + if (g->lookahead == L_CCLASS) { + atom = newnode(g, P_CCLASS); + atom->cc = g->yycc - g->cclass; + next(g); + return atom; + } + if (g->lookahead == L_NCCLASS) { + atom = newnode(g, P_NCCLASS); + atom->cc = g->yycc - g->cclass; + next(g); + return atom; + } + if (g->lookahead == L_REF) { + atom = newnode(g, P_REF); + if (g->yychar == 0 || g->yychar >= g->nsub || !g->sub[g->yychar]) + die(g, "invalid back-reference"); + atom->n = g->yychar; + atom->x = g->sub[g->yychar]; + next(g); + return atom; + } + if (accept(g, '.')) + return newnode(g, P_ANY); + if (accept(g, '(')) { + atom = newnode(g, P_PAR); + if (g->nsub == REG_MAXSUB) + die(g, "too many captures"); + atom->n = g->nsub++; + atom->x = parsealt(g); + g->sub[atom->n] = atom; + if (!accept(g, ')')) + die(g, "unmatched '('"); + return atom; + } + if (accept(g, L_NC)) { + atom = parsealt(g); + if (!accept(g, ')')) + die(g, "unmatched '('"); + return atom; + } + if (accept(g, L_PLA)) { + atom = newnode(g, P_PLA); + atom->x = parsealt(g); + if (!accept(g, ')')) + die(g, "unmatched '('"); + return atom; + } + if (accept(g, L_NLA)) { + atom = newnode(g, P_NLA); + atom->x = parsealt(g); + if (!accept(g, ')')) + die(g, "unmatched '('"); + return atom; + } + die(g, "syntax error"); + return NULL; +} + +static Renode *parserep(struct cstate *g) +{ + Renode *atom; + + if (accept(g, '^')) return newnode(g, P_BOL); + if (accept(g, '$')) return newnode(g, P_EOL); + if (accept(g, L_WORD)) return newnode(g, P_WORD); + if (accept(g, L_NWORD)) return newnode(g, P_NWORD); + + atom = parseatom(g); + if (g->lookahead == L_COUNT) { + int min = g->yymin, max = g->yymax; + next(g); + if (max < min) + die(g, "invalid quantifier"); + return newrep(g, atom, accept(g, '?'), min, max); + } + if (accept(g, '*')) return newrep(g, atom, accept(g, '?'), 0, REPINF); + if (accept(g, '+')) return newrep(g, atom, accept(g, '?'), 1, REPINF); + if (accept(g, '?')) return newrep(g, atom, accept(g, '?'), 0, 1); + return atom; +} + +static Renode *parsecat(struct cstate *g) +{ + Renode *cat, *head, **tail; + if (g->lookahead != EOF && g->lookahead != '|' && g->lookahead != ')') { + /* Build a right-leaning tree by splicing in new 'cat' at the tail. */ + head = parserep(g); + tail = &head; + while (g->lookahead != EOF && g->lookahead != '|' && g->lookahead != ')') { + cat = newnode(g, P_CAT); + cat->x = *tail; + cat->y = parserep(g); + *tail = cat; + tail = &cat->y; + } + return head; + } + return NULL; +} + +static Renode *parsealt(struct cstate *g) +{ + Renode *alt, *x; + alt = parsecat(g); + while (accept(g, '|')) { + x = alt; + alt = newnode(g, P_ALT); + alt->x = x; + alt->y = parsecat(g); + } + return alt; +} + +/* Compile */ + +enum { + I_END, I_JUMP, I_SPLIT, I_PLA, I_NLA, + I_ANYNL, I_ANY, I_CHAR, I_CCLASS, I_NCCLASS, I_REF, + I_BOL, I_EOL, I_WORD, I_NWORD, + I_LPAR, I_RPAR +}; + +struct Reinst { + unsigned char opcode; + unsigned char n; + Rune c; + Reclass *cc; + Reinst *x; + Reinst *y; +}; + +static int count(struct cstate *g, Renode *node, int depth) +{ + int min, max, n; + if (!node) return 0; + if (++depth > REG_MAXREC) die(g, "stack overflow"); + switch (node->type) { + default: return 1; + case P_CAT: return count(g, node->x, depth) + count(g, node->y, depth); + case P_ALT: return count(g, node->x, depth) + count(g, node->y, depth) + 2; + case P_REP: + min = node->m; + max = node->n; + if (min == max) n = count(g, node->x, depth) * min; + else if (max < REPINF) n = count(g, node->x, depth) * max + (max - min); + else n = count(g, node->x, depth) * (min + 1) + 2; + if (n < 0 || n > REG_MAXPROG) die(g, "program too large"); + return n; + case P_PAR: return count(g, node->x, depth) + 2; + case P_PLA: return count(g, node->x, depth) + 2; + case P_NLA: return count(g, node->x, depth) + 2; + } +} + +static Reinst *emit(Reprog *prog, int opcode) +{ + Reinst *inst = prog->end++; + inst->opcode = opcode; + inst->n = 0; + inst->c = 0; + inst->cc = NULL; + inst->x = inst->y = NULL; + return inst; +} + +static void compile(Reprog *prog, Renode *node) +{ + Reinst *inst, *split, *jump; + int i; + +loop: + if (!node) + return; + + switch (node->type) { + case P_CAT: + compile(prog, node->x); + node = node->y; + goto loop; + + case P_ALT: + split = emit(prog, I_SPLIT); + compile(prog, node->x); + jump = emit(prog, I_JUMP); + compile(prog, node->y); + split->x = split + 1; + split->y = jump + 1; + jump->x = prog->end; + break; + + case P_REP: + inst = NULL; /* silence compiler warning. assert(node->m > 0). */ + for (i = 0; i < node->m; ++i) { + inst = prog->end; + compile(prog, node->x); + } + if (node->m == node->n) + break; + if (node->n < REPINF) { + for (i = node->m; i < node->n; ++i) { + split = emit(prog, I_SPLIT); + compile(prog, node->x); + if (node->ng) { + split->y = split + 1; + split->x = prog->end; + } else { + split->x = split + 1; + split->y = prog->end; + } + } + } else if (node->m == 0) { + split = emit(prog, I_SPLIT); + compile(prog, node->x); + jump = emit(prog, I_JUMP); + if (node->ng) { + split->y = split + 1; + split->x = prog->end; + } else { + split->x = split + 1; + split->y = prog->end; + } + jump->x = split; + } else { + split = emit(prog, I_SPLIT); + if (node->ng) { + split->y = inst; + split->x = prog->end; + } else { + split->x = inst; + split->y = prog->end; + } + } + break; + + case P_BOL: emit(prog, I_BOL); break; + case P_EOL: emit(prog, I_EOL); break; + case P_WORD: emit(prog, I_WORD); break; + case P_NWORD: emit(prog, I_NWORD); break; + + case P_PAR: + inst = emit(prog, I_LPAR); + inst->n = node->n; + compile(prog, node->x); + inst = emit(prog, I_RPAR); + inst->n = node->n; + break; + case P_PLA: + split = emit(prog, I_PLA); + compile(prog, node->x); + emit(prog, I_END); + split->x = split + 1; + split->y = prog->end; + break; + case P_NLA: + split = emit(prog, I_NLA); + compile(prog, node->x); + emit(prog, I_END); + split->x = split + 1; + split->y = prog->end; + break; + + case P_ANY: + emit(prog, I_ANY); + break; + case P_CHAR: + inst = emit(prog, I_CHAR); + inst->c = (prog->flags & REG_ICASE) ? canon(node->c) : node->c; + break; + case P_CCLASS: + inst = emit(prog, I_CCLASS); + inst->cc = prog->cclass + node->cc; + break; + case P_NCCLASS: + inst = emit(prog, I_NCCLASS); + inst->cc = prog->cclass + node->cc; + break; + case P_REF: + inst = emit(prog, I_REF); + inst->n = node->n; + break; + } +} + +#ifdef TEST +static void dumpnode(struct cstate *g, Renode *node) +{ + Rune *p; + Reclass *cc; + if (!node) { printf("Empty"); return; } + switch (node->type) { + case P_CAT: printf("Cat("); dumpnode(g, node->x); printf(", "); dumpnode(g, node->y); printf(")"); break; + case P_ALT: printf("Alt("); dumpnode(g, node->x); printf(", "); dumpnode(g, node->y); printf(")"); break; + case P_REP: + printf(node->ng ? "NgRep(%d,%d," : "Rep(%d,%d,", node->m, node->n); + dumpnode(g, node->x); + printf(")"); + break; + case P_BOL: printf("Bol"); break; + case P_EOL: printf("Eol"); break; + case P_WORD: printf("Word"); break; + case P_NWORD: printf("NotWord"); break; + case P_PAR: printf("Par(%d,", node->n); dumpnode(g, node->x); printf(")"); break; + case P_PLA: printf("PLA("); dumpnode(g, node->x); printf(")"); break; + case P_NLA: printf("NLA("); dumpnode(g, node->x); printf(")"); break; + case P_ANY: printf("Any"); break; + case P_CHAR: printf("Char(%c)", node->c); break; + case P_CCLASS: + printf("Class("); + cc = g->cclass + node->cc; + for (p = cc->spans; p < cc->end; p += 2) printf("%02X-%02X,", p[0], p[1]); + printf(")"); + break; + case P_NCCLASS: + printf("NotClass("); + cc = g->cclass + node->cc; + for (p = cc->spans; p < cc->end; p += 2) printf("%02X-%02X,", p[0], p[1]); + printf(")"); + break; + case P_REF: printf("Ref(%d)", node->n); break; + } +} + +static void dumpcclass(Reclass *cc) { + Rune *p; + for (p = cc->spans; p < cc->end; p += 2) { + if (p[0] > 32 && p[0] < 127) + printf(" %c", p[0]); + else + printf(" \\x%02x", p[0]); + if (p[1] > 32 && p[1] < 127) + printf("-%c", p[1]); + else + printf("-\\x%02x", p[1]); + } + putchar('\n'); +} + +static void dumpprog(Reprog *prog) +{ + Reinst *inst; + int i; + for (i = 0, inst = prog->start; inst < prog->end; ++i, ++inst) { + printf("% 5d: ", i); + switch (inst->opcode) { + case I_END: puts("end"); break; + case I_JUMP: printf("jump %d\n", (int)(inst->x - prog->start)); break; + case I_SPLIT: printf("split %d %d\n", (int)(inst->x - prog->start), (int)(inst->y - prog->start)); break; + case I_PLA: printf("pla %d %d\n", (int)(inst->x - prog->start), (int)(inst->y - prog->start)); break; + case I_NLA: printf("nla %d %d\n", (int)(inst->x - prog->start), (int)(inst->y - prog->start)); break; + case I_ANY: puts("any"); break; + case I_ANYNL: puts("anynl"); break; + case I_CHAR: printf(inst->c >= 32 && inst->c < 127 ? "char '%c'\n" : "char U+%04X\n", inst->c); break; + case I_CCLASS: printf("cclass"); dumpcclass(inst->cc); break; + case I_NCCLASS: printf("ncclass"); dumpcclass(inst->cc); break; + case I_REF: printf("ref %d\n", inst->n); break; + case I_BOL: puts("bol"); break; + case I_EOL: puts("eol"); break; + case I_WORD: puts("word"); break; + case I_NWORD: puts("nword"); break; + case I_LPAR: printf("lpar %d\n", inst->n); break; + case I_RPAR: printf("rpar %d\n", inst->n); break; + } + } +} +#endif + +Reprog *regcompx(void *(*alloc)(void *ctx, void *p, int n), void *ctx, + const char *pattern, int cflags, const char **errorp) +{ + struct cstate g; + Renode *node; + Reinst *split, *jump; + int i, n; + + g.pstart = NULL; + g.prog = NULL; + + if (setjmp(g.kaboom)) { + if (errorp) *errorp = g.error; + alloc(ctx, g.pstart, 0); + if (g.prog) { + alloc(ctx, g.prog->cclass, 0); + alloc(ctx, g.prog->start, 0); + alloc(ctx, g.prog, 0); + } + return NULL; + } + + g.prog = alloc(ctx, NULL, sizeof (Reprog)); + if (!g.prog) + die(&g, "cannot allocate regular expression"); + g.prog->start = NULL; + g.prog->cclass = NULL; + + n = strlen(pattern) * 2; + if (n > REG_MAXPROG) + die(&g, "program too large"); + if (n > 0) { + g.pstart = g.pend = alloc(ctx, NULL, sizeof (Renode) * n); + if (!g.pstart) + die(&g, "cannot allocate regular expression parse list"); + } + + g.source = pattern; + g.ncclass = 0; + g.nsub = 1; + for (i = 0; i < REG_MAXSUB; ++i) + g.sub[i] = 0; + + g.prog->flags = cflags; + + next(&g); + node = parsealt(&g); + if (g.lookahead == ')') + die(&g, "unmatched ')'"); + if (g.lookahead != EOF) + die(&g, "syntax error"); + +#ifdef TEST + dumpnode(&g, node); + putchar('\n'); +#endif + + n = 6 + count(&g, node, 0); + if (n < 0 || n > REG_MAXPROG) + die(&g, "program too large"); + + g.prog->nsub = g.nsub; + g.prog->start = g.prog->end = alloc(ctx, NULL, n * sizeof (Reinst)); + if (!g.prog->start) + die(&g, "cannot allocate regular expression instruction list"); + + if (g.ncclass > 0) { + g.prog->cclass = alloc(ctx, NULL, g.ncclass * sizeof (Reclass)); + if (!g.prog->cclass) + die(&g, "cannot allocate regular expression character class list"); + memcpy(g.prog->cclass, g.cclass, g.ncclass * sizeof (Reclass)); + for (i = 0; i < g.ncclass; ++i) + g.prog->cclass[i].end = g.prog->cclass[i].spans + (g.cclass[i].end - g.cclass[i].spans); + } + + split = emit(g.prog, I_SPLIT); + split->x = split + 3; + split->y = split + 1; + emit(g.prog, I_ANYNL); + jump = emit(g.prog, I_JUMP); + jump->x = split; + emit(g.prog, I_LPAR); + compile(g.prog, node); + emit(g.prog, I_RPAR); + emit(g.prog, I_END); + +#ifdef TEST + dumpprog(g.prog); +#endif + + alloc(ctx, g.pstart, 0); + + if (errorp) *errorp = NULL; + return g.prog; +} + +void regfreex(void *(*alloc)(void *ctx, void *p, int n), void *ctx, Reprog *prog) +{ + if (prog) { + if (prog->cclass) + alloc(ctx, prog->cclass, 0); + alloc(ctx, prog->start, 0); + alloc(ctx, prog, 0); + } +} + +static void *default_alloc(void *ctx, void *p, int n) +{ + if (n == 0) { + free(p); + return NULL; + } + return realloc(p, (size_t)n); +} + +Reprog *regcomp(const char *pattern, int cflags, const char **errorp) +{ + return regcompx(default_alloc, NULL, pattern, cflags, errorp); +} + +void regfree(Reprog *prog) +{ + regfreex(default_alloc, NULL, prog); +} + +/* Match */ + +static int isnewline(int c) +{ + return c == 0xA || c == 0xD || c == 0x2028 || c == 0x2029; +} + +static int iswordchar(int c) +{ + return c == '_' || + (c >= 'a' && c <= 'z') || + (c >= 'A' && c <= 'Z') || + (c >= '0' && c <= '9'); +} + +static int incclass(Reclass *cc, Rune c) +{ + Rune *p; + for (p = cc->spans; p < cc->end; p += 2) + if (p[0] <= c && c <= p[1]) + return 1; + return 0; +} + +static int incclasscanon(Reclass *cc, Rune c) +{ + Rune *p, r; + for (p = cc->spans; p < cc->end; p += 2) + for (r = p[0]; r <= p[1]; ++r) + if (c == canon(r)) + return 1; + return 0; +} + +static int strncmpcanon(const char *a, const char *b, int n) +{ + Rune ra, rb; + int c; + while (n--) { + if (!*a) return -1; + if (!*b) return 1; + a += chartorune(&ra, a); + b += chartorune(&rb, b); + c = canon(ra) - canon(rb); + if (c) + return c; + } + return 0; +} + +static int match(Reinst *pc, const char *sp, const char *bol, int flags, Resub *out, int depth) +{ + Resub scratch; + int result; + int i; + Rune c; + + /* stack overflow */ + if (depth > REG_MAXREC) + return -1; + + for (;;) { + switch (pc->opcode) { + case I_END: + return 0; + case I_JUMP: + pc = pc->x; + break; + case I_SPLIT: + scratch = *out; + result = match(pc->x, sp, bol, flags, &scratch, depth+1); + if (result == -1) + return -1; + if (result == 0) { + *out = scratch; + return 0; + } + pc = pc->y; + break; + + case I_PLA: + result = match(pc->x, sp, bol, flags, out, depth+1); + if (result == -1) + return -1; + if (result == 1) + return 1; + pc = pc->y; + break; + case I_NLA: + scratch = *out; + result = match(pc->x, sp, bol, flags, &scratch, depth+1); + if (result == -1) + return -1; + if (result == 0) + return 1; + pc = pc->y; + break; + + case I_ANYNL: + if (!*sp) return 1; + sp += chartorune(&c, sp); + pc = pc + 1; + break; + case I_ANY: + if (!*sp) return 1; + sp += chartorune(&c, sp); + if (isnewline(c)) + return 1; + pc = pc + 1; + break; + case I_CHAR: + if (!*sp) return 1; + sp += chartorune(&c, sp); + if (flags & REG_ICASE) + c = canon(c); + if (c != pc->c) + return 1; + pc = pc + 1; + break; + case I_CCLASS: + if (!*sp) return 1; + sp += chartorune(&c, sp); + if (flags & REG_ICASE) { + if (!incclasscanon(pc->cc, canon(c))) + return 1; + } else { + if (!incclass(pc->cc, c)) + return 1; + } + pc = pc + 1; + break; + case I_NCCLASS: + if (!*sp) return 1; + sp += chartorune(&c, sp); + if (flags & REG_ICASE) { + if (incclasscanon(pc->cc, canon(c))) + return 1; + } else { + if (incclass(pc->cc, c)) + return 1; + } + pc = pc + 1; + break; + case I_REF: + i = out->sub[pc->n].ep - out->sub[pc->n].sp; + if (flags & REG_ICASE) { + if (strncmpcanon(sp, out->sub[pc->n].sp, i)) + return 1; + } else { + if (strncmp(sp, out->sub[pc->n].sp, i)) + return 1; + } + if (i > 0) + sp += i; + pc = pc + 1; + break; + + case I_BOL: + if (sp == bol && !(flags & REG_NOTBOL)) { + pc = pc + 1; + break; + } + if (flags & REG_NEWLINE) { + if (sp > bol && isnewline(sp[-1])) { + pc = pc + 1; + break; + } + } + return 1; + case I_EOL: + if (*sp == 0) { + pc = pc + 1; + break; + } + if (flags & REG_NEWLINE) { + if (isnewline(*sp)) { + pc = pc + 1; + break; + } + } + return 1; + case I_WORD: + i = sp > bol && iswordchar(sp[-1]); + i ^= iswordchar(sp[0]); + if (!i) + return 1; + pc = pc + 1; + break; + case I_NWORD: + i = sp > bol && iswordchar(sp[-1]); + i ^= iswordchar(sp[0]); + if (i) + return 1; + pc = pc + 1; + break; + + case I_LPAR: + out->sub[pc->n].sp = sp; + pc = pc + 1; + break; + case I_RPAR: + out->sub[pc->n].ep = sp; + pc = pc + 1; + break; + default: + return 1; + } + } +} + +int regexec(Reprog *prog, const char *sp, Resub *sub, int eflags) +{ + Resub scratch; + int i; + + if (!sub) + sub = &scratch; + + sub->nsub = prog->nsub; + for (i = 0; i < REG_MAXSUB; ++i) + sub->sub[i].sp = sub->sub[i].ep = NULL; + + return match(prog->start, sp, sp, prog->flags | eflags, sub, 0); +} + +#ifdef TEST +int main(int argc, char **argv) +{ + const char *error; + const char *s; + Reprog *p; + Resub m; + int i; + + if (argc > 1) { + p = regcomp(argv[1], 0, &error); + if (!p) { + fprintf(stderr, "regcomp: %s\n", error); + return 1; + } + + if (argc > 2) { + s = argv[2]; + printf("nsub = %d\n", p->nsub); + if (!regexec(p, s, &m, 0)) { + for (i = 0; i < m.nsub; ++i) { + int n = m.sub[i].ep - m.sub[i].sp; + if (n > 0) + printf("match %d: s=%d e=%d n=%d '%.*s'\n", i, (int)(m.sub[i].sp - s), (int)(m.sub[i].ep - s), n, n, m.sub[i].sp); + else + printf("match %d: n=0 ''\n", i); + } + } else { + printf("no match\n"); + } + } + } + + return 0; +} +#endif
