comparison mupdf-source/thirdparty/mujs/regexp.c @ 2:b50eed0cc0ef upstream

ADD: MuPDF v1.26.7: the MuPDF source as downloaded by a default build of PyMuPDF 1.26.4. The directory name has changed: no version number in the expanded directory now.
author Franz Glasner <fzglas.hg@dom66.de>
date Mon, 15 Sep 2025 11:43:07 +0200
parents
children
comparison
equal deleted inserted replaced
1:1d09e1dec1d9 2:b50eed0cc0ef
1 #include <stdlib.h>
2 #include <stdio.h>
3 #include <string.h>
4 #include <setjmp.h>
5 #include <limits.h>
6
7 #include "regexp.h"
8 #include "utf.h"
9
10 #define emit regemit
11 #define next regnext
12 #define accept regaccept
13
14 #define nelem(a) (int)(sizeof (a) / sizeof (a)[0])
15
16 #define REPINF 255
17 #ifndef REG_MAXPROG
18 #define REG_MAXPROG (32 << 10)
19 #endif
20 #ifndef REG_MAXREC
21 #define REG_MAXREC 1024
22 #endif
23 #ifndef REG_MAXSPAN
24 #define REG_MAXSPAN 64
25 #endif
26 #ifndef REG_MAXCLASS
27 #define REG_MAXCLASS 128
28 #endif
29
30 typedef struct Reclass Reclass;
31 typedef struct Renode Renode;
32 typedef struct Reinst Reinst;
33 typedef struct Rethread Rethread;
34
35 struct Reclass {
36 Rune *end;
37 Rune spans[REG_MAXSPAN];
38 };
39
40 struct Reprog {
41 Reinst *start, *end;
42 Reclass *cclass;
43 int flags;
44 int nsub;
45 };
46
47 struct cstate {
48 Reprog *prog;
49 Renode *pstart, *pend;
50
51 const char *source;
52 int ncclass;
53 int nsub;
54 Renode *sub[REG_MAXSUB];
55
56 int lookahead;
57 Rune yychar;
58 Reclass *yycc;
59 int yymin, yymax;
60
61 const char *error;
62 jmp_buf kaboom;
63
64 Reclass cclass[REG_MAXCLASS];
65 };
66
67 static void die(struct cstate *g, const char *message)
68 {
69 g->error = message;
70 longjmp(g->kaboom, 1);
71 }
72
73 static int canon(Rune c)
74 {
75 Rune u = toupperrune(c);
76 if (c >= 128 && u < 128)
77 return c;
78 return u;
79 }
80
81 /* Scan */
82
83 enum {
84 L_CHAR = 256,
85 L_CCLASS, /* character class */
86 L_NCCLASS, /* negative character class */
87 L_NC, /* "(?:" no capture */
88 L_PLA, /* "(?=" positive lookahead */
89 L_NLA, /* "(?!" negative lookahead */
90 L_WORD, /* "\b" word boundary */
91 L_NWORD, /* "\B" non-word boundary */
92 L_REF, /* "\1" back-reference */
93 L_COUNT, /* {M,N} */
94 };
95
96 static int hex(struct cstate *g, int c)
97 {
98 if (c >= '0' && c <= '9') return c - '0';
99 if (c >= 'a' && c <= 'f') return c - 'a' + 0xA;
100 if (c >= 'A' && c <= 'F') return c - 'A' + 0xA;
101 die(g, "invalid escape sequence");
102 return 0;
103 }
104
105 static int dec(struct cstate *g, int c)
106 {
107 if (c >= '0' && c <= '9') return c - '0';
108 die(g, "invalid quantifier");
109 return 0;
110 }
111
112 #define ESCAPES "BbDdSsWw^$\\.*+?()[]{}|-0123456789"
113
114 static int isunicodeletter(int c)
115 {
116 return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || isalpharune(c);
117 }
118
119 static int nextrune(struct cstate *g)
120 {
121 if (!*g->source) {
122 g->yychar = EOF;
123 return 0;
124 }
125 g->source += chartorune(&g->yychar, g->source);
126 if (g->yychar == '\\') {
127 if (!*g->source)
128 die(g, "unterminated escape sequence");
129 g->source += chartorune(&g->yychar, g->source);
130 switch (g->yychar) {
131 case 'f': g->yychar = '\f'; return 0;
132 case 'n': g->yychar = '\n'; return 0;
133 case 'r': g->yychar = '\r'; return 0;
134 case 't': g->yychar = '\t'; return 0;
135 case 'v': g->yychar = '\v'; return 0;
136 case 'c':
137 if (!g->source[0])
138 die(g, "unterminated escape sequence");
139 g->yychar = (*g->source++) & 31;
140 return 0;
141 case 'x':
142 if (!g->source[0] || !g->source[1])
143 die(g, "unterminated escape sequence");
144 g->yychar = hex(g, *g->source++) << 4;
145 g->yychar += hex(g, *g->source++);
146 if (g->yychar == 0) {
147 g->yychar = '0';
148 return 1;
149 }
150 return 0;
151 case 'u':
152 if (!g->source[0] || !g->source[1] || !g->source[2] || !g->source[3])
153 die(g, "unterminated escape sequence");
154 g->yychar = hex(g, *g->source++) << 12;
155 g->yychar += hex(g, *g->source++) << 8;
156 g->yychar += hex(g, *g->source++) << 4;
157 g->yychar += hex(g, *g->source++);
158 if (g->yychar == 0) {
159 g->yychar = '0';
160 return 1;
161 }
162 return 0;
163 case 0:
164 g->yychar = '0';
165 return 1;
166 }
167 if (strchr(ESCAPES, g->yychar))
168 return 1;
169 if (isunicodeletter(g->yychar) || g->yychar == '_') /* check identity escape */
170 die(g, "invalid escape character");
171 return 0;
172 }
173 return 0;
174 }
175
176 static int lexcount(struct cstate *g)
177 {
178 g->yychar = *g->source++;
179
180 g->yymin = dec(g, g->yychar);
181 g->yychar = *g->source++;
182 while (g->yychar != ',' && g->yychar != '}') {
183 g->yymin = g->yymin * 10 + dec(g, g->yychar);
184 g->yychar = *g->source++;
185 if (g->yymin >= REPINF)
186 die(g, "numeric overflow");
187 }
188
189 if (g->yychar == ',') {
190 g->yychar = *g->source++;
191 if (g->yychar == '}') {
192 g->yymax = REPINF;
193 } else {
194 g->yymax = dec(g, g->yychar);
195 g->yychar = *g->source++;
196 while (g->yychar != '}') {
197 g->yymax = g->yymax * 10 + dec(g, g->yychar);
198 g->yychar = *g->source++;
199 if (g->yymax >= REPINF)
200 die(g, "numeric overflow");
201 }
202 }
203 } else {
204 g->yymax = g->yymin;
205 }
206
207 return L_COUNT;
208 }
209
210 static void newcclass(struct cstate *g)
211 {
212 if (g->ncclass >= REG_MAXCLASS)
213 die(g, "too many character classes");
214 g->yycc = g->cclass + g->ncclass++;
215 g->yycc->end = g->yycc->spans;
216 }
217
218 static void addrange(struct cstate *g, Rune a, Rune b)
219 {
220 Reclass *cc = g->yycc;
221 Rune *p;
222
223 if (a > b)
224 die(g, "invalid character class range");
225
226 /* extend existing ranges if they overlap */
227 for (p = cc->spans; p < cc->end; p += 2) {
228 /* completely inside old range */
229 if (a >= p[0] && b <= p[1])
230 return;
231
232 /* completely swallows old range */
233 if (a < p[0] && b >= p[1]) {
234 p[0] = a;
235 p[1] = b;
236 return;
237 }
238
239 /* extend at start */
240 if (b >= p[0] - 1 && b <= p[1] && a < p[0]) {
241 p[0] = a;
242 return;
243 }
244
245 /* extend at end */
246 if (a >= p[0] && a <= p[1] + 1 && b > p[1]) {
247 p[1] = b;
248 return;
249 }
250 }
251
252 if (cc->end + 2 >= cc->spans + nelem(cc->spans))
253 die(g, "too many character class ranges");
254 *cc->end++ = a;
255 *cc->end++ = b;
256 }
257
258 static void addranges_d(struct cstate *g)
259 {
260 addrange(g, '0', '9');
261 }
262
263 static void addranges_D(struct cstate *g)
264 {
265 addrange(g, 0, '0'-1);
266 addrange(g, '9'+1, 0xFFFF);
267 }
268
269 static void addranges_s(struct cstate *g)
270 {
271 addrange(g, 0x9, 0xD);
272 addrange(g, 0x20, 0x20);
273 addrange(g, 0xA0, 0xA0);
274 addrange(g, 0x2028, 0x2029);
275 addrange(g, 0xFEFF, 0xFEFF);
276 }
277
278 static void addranges_S(struct cstate *g)
279 {
280 addrange(g, 0, 0x9-1);
281 addrange(g, 0xD+1, 0x20-1);
282 addrange(g, 0x20+1, 0xA0-1);
283 addrange(g, 0xA0+1, 0x2028-1);
284 addrange(g, 0x2029+1, 0xFEFF-1);
285 addrange(g, 0xFEFF+1, 0xFFFF);
286 }
287
288 static void addranges_w(struct cstate *g)
289 {
290 addrange(g, '0', '9');
291 addrange(g, 'A', 'Z');
292 addrange(g, '_', '_');
293 addrange(g, 'a', 'z');
294 }
295
296 static void addranges_W(struct cstate *g)
297 {
298 addrange(g, 0, '0'-1);
299 addrange(g, '9'+1, 'A'-1);
300 addrange(g, 'Z'+1, '_'-1);
301 addrange(g, '_'+1, 'a'-1);
302 addrange(g, 'z'+1, 0xFFFF);
303 }
304
305 static int lexclass(struct cstate *g)
306 {
307 int type = L_CCLASS;
308 int quoted, havesave, havedash;
309 Rune save = 0;
310
311 newcclass(g);
312
313 quoted = nextrune(g);
314 if (!quoted && g->yychar == '^') {
315 type = L_NCCLASS;
316 quoted = nextrune(g);
317 }
318
319 havesave = havedash = 0;
320 for (;;) {
321 if (g->yychar == EOF)
322 die(g, "unterminated character class");
323 if (!quoted && g->yychar == ']')
324 break;
325
326 if (!quoted && g->yychar == '-') {
327 if (havesave) {
328 if (havedash) {
329 addrange(g, save, '-');
330 havesave = havedash = 0;
331 } else {
332 havedash = 1;
333 }
334 } else {
335 save = '-';
336 havesave = 1;
337 }
338 } else if (quoted && strchr("DSWdsw", g->yychar)) {
339 if (havesave) {
340 addrange(g, save, save);
341 if (havedash)
342 addrange(g, '-', '-');
343 }
344 switch (g->yychar) {
345 case 'd': addranges_d(g); break;
346 case 's': addranges_s(g); break;
347 case 'w': addranges_w(g); break;
348 case 'D': addranges_D(g); break;
349 case 'S': addranges_S(g); break;
350 case 'W': addranges_W(g); break;
351 }
352 havesave = havedash = 0;
353 } else {
354 if (quoted) {
355 if (g->yychar == 'b')
356 g->yychar = '\b';
357 else if (g->yychar == '0')
358 g->yychar = 0;
359 /* else identity escape */
360 }
361 if (havesave) {
362 if (havedash) {
363 addrange(g, save, g->yychar);
364 havesave = havedash = 0;
365 } else {
366 addrange(g, save, save);
367 save = g->yychar;
368 }
369 } else {
370 save = g->yychar;
371 havesave = 1;
372 }
373 }
374
375 quoted = nextrune(g);
376 }
377
378 if (havesave) {
379 addrange(g, save, save);
380 if (havedash)
381 addrange(g, '-', '-');
382 }
383
384 return type;
385 }
386
387 static int lex(struct cstate *g)
388 {
389 int quoted = nextrune(g);
390 if (quoted) {
391 switch (g->yychar) {
392 case 'b': return L_WORD;
393 case 'B': return L_NWORD;
394 case 'd': newcclass(g); addranges_d(g); return L_CCLASS;
395 case 's': newcclass(g); addranges_s(g); return L_CCLASS;
396 case 'w': newcclass(g); addranges_w(g); return L_CCLASS;
397 case 'D': newcclass(g); addranges_d(g); return L_NCCLASS;
398 case 'S': newcclass(g); addranges_s(g); return L_NCCLASS;
399 case 'W': newcclass(g); addranges_w(g); return L_NCCLASS;
400 case '0': g->yychar = 0; return L_CHAR;
401 }
402 if (g->yychar >= '0' && g->yychar <= '9') {
403 g->yychar -= '0';
404 if (*g->source >= '0' && *g->source <= '9')
405 g->yychar = g->yychar * 10 + *g->source++ - '0';
406 return L_REF;
407 }
408 return L_CHAR;
409 }
410
411 switch (g->yychar) {
412 case EOF:
413 case '$': case ')': case '*': case '+':
414 case '.': case '?': case '^': case '|':
415 return g->yychar;
416 }
417
418 if (g->yychar == '{')
419 return lexcount(g);
420 if (g->yychar == '[')
421 return lexclass(g);
422 if (g->yychar == '(') {
423 if (g->source[0] == '?') {
424 if (g->source[1] == ':') {
425 g->source += 2;
426 return L_NC;
427 }
428 if (g->source[1] == '=') {
429 g->source += 2;
430 return L_PLA;
431 }
432 if (g->source[1] == '!') {
433 g->source += 2;
434 return L_NLA;
435 }
436 }
437 return '(';
438 }
439
440 return L_CHAR;
441 }
442
443 /* Parse */
444
445 enum {
446 P_CAT, P_ALT, P_REP,
447 P_BOL, P_EOL, P_WORD, P_NWORD,
448 P_PAR, P_PLA, P_NLA,
449 P_ANY, P_CHAR, P_CCLASS, P_NCCLASS,
450 P_REF,
451 };
452
453 struct Renode {
454 unsigned char type;
455 unsigned char ng, m, n;
456 Rune c;
457 int cc;
458 Renode *x;
459 Renode *y;
460 };
461
462 static Renode *newnode(struct cstate *g, int type)
463 {
464 Renode *node = g->pend++;
465 node->type = type;
466 node->cc = -1;
467 node->c = 0;
468 node->ng = 0;
469 node->m = 0;
470 node->n = 0;
471 node->x = node->y = NULL;
472 return node;
473 }
474
475 static int empty(Renode *node)
476 {
477 if (!node) return 1;
478 switch (node->type) {
479 default: return 1;
480 case P_CAT: return empty(node->x) && empty(node->y);
481 case P_ALT: return empty(node->x) || empty(node->y);
482 case P_REP: return empty(node->x) || node->m == 0;
483 case P_PAR: return empty(node->x);
484 case P_REF: return empty(node->x);
485 case P_ANY: case P_CHAR: case P_CCLASS: case P_NCCLASS: return 0;
486 }
487 }
488
489 static Renode *newrep(struct cstate *g, Renode *atom, int ng, int min, int max)
490 {
491 Renode *rep = newnode(g, P_REP);
492 if (max == REPINF && empty(atom))
493 die(g, "infinite loop matching the empty string");
494 rep->ng = ng;
495 rep->m = min;
496 rep->n = max;
497 rep->x = atom;
498 return rep;
499 }
500
501 static void next(struct cstate *g)
502 {
503 g->lookahead = lex(g);
504 }
505
506 static int accept(struct cstate *g, int t)
507 {
508 if (g->lookahead == t) {
509 next(g);
510 return 1;
511 }
512 return 0;
513 }
514
515 static Renode *parsealt(struct cstate *g);
516
517 static Renode *parseatom(struct cstate *g)
518 {
519 Renode *atom;
520 if (g->lookahead == L_CHAR) {
521 atom = newnode(g, P_CHAR);
522 atom->c = g->yychar;
523 next(g);
524 return atom;
525 }
526 if (g->lookahead == L_CCLASS) {
527 atom = newnode(g, P_CCLASS);
528 atom->cc = g->yycc - g->cclass;
529 next(g);
530 return atom;
531 }
532 if (g->lookahead == L_NCCLASS) {
533 atom = newnode(g, P_NCCLASS);
534 atom->cc = g->yycc - g->cclass;
535 next(g);
536 return atom;
537 }
538 if (g->lookahead == L_REF) {
539 atom = newnode(g, P_REF);
540 if (g->yychar == 0 || g->yychar >= g->nsub || !g->sub[g->yychar])
541 die(g, "invalid back-reference");
542 atom->n = g->yychar;
543 atom->x = g->sub[g->yychar];
544 next(g);
545 return atom;
546 }
547 if (accept(g, '.'))
548 return newnode(g, P_ANY);
549 if (accept(g, '(')) {
550 atom = newnode(g, P_PAR);
551 if (g->nsub == REG_MAXSUB)
552 die(g, "too many captures");
553 atom->n = g->nsub++;
554 atom->x = parsealt(g);
555 g->sub[atom->n] = atom;
556 if (!accept(g, ')'))
557 die(g, "unmatched '('");
558 return atom;
559 }
560 if (accept(g, L_NC)) {
561 atom = parsealt(g);
562 if (!accept(g, ')'))
563 die(g, "unmatched '('");
564 return atom;
565 }
566 if (accept(g, L_PLA)) {
567 atom = newnode(g, P_PLA);
568 atom->x = parsealt(g);
569 if (!accept(g, ')'))
570 die(g, "unmatched '('");
571 return atom;
572 }
573 if (accept(g, L_NLA)) {
574 atom = newnode(g, P_NLA);
575 atom->x = parsealt(g);
576 if (!accept(g, ')'))
577 die(g, "unmatched '('");
578 return atom;
579 }
580 die(g, "syntax error");
581 return NULL;
582 }
583
584 static Renode *parserep(struct cstate *g)
585 {
586 Renode *atom;
587
588 if (accept(g, '^')) return newnode(g, P_BOL);
589 if (accept(g, '$')) return newnode(g, P_EOL);
590 if (accept(g, L_WORD)) return newnode(g, P_WORD);
591 if (accept(g, L_NWORD)) return newnode(g, P_NWORD);
592
593 atom = parseatom(g);
594 if (g->lookahead == L_COUNT) {
595 int min = g->yymin, max = g->yymax;
596 next(g);
597 if (max < min)
598 die(g, "invalid quantifier");
599 return newrep(g, atom, accept(g, '?'), min, max);
600 }
601 if (accept(g, '*')) return newrep(g, atom, accept(g, '?'), 0, REPINF);
602 if (accept(g, '+')) return newrep(g, atom, accept(g, '?'), 1, REPINF);
603 if (accept(g, '?')) return newrep(g, atom, accept(g, '?'), 0, 1);
604 return atom;
605 }
606
607 static Renode *parsecat(struct cstate *g)
608 {
609 Renode *cat, *head, **tail;
610 if (g->lookahead != EOF && g->lookahead != '|' && g->lookahead != ')') {
611 /* Build a right-leaning tree by splicing in new 'cat' at the tail. */
612 head = parserep(g);
613 tail = &head;
614 while (g->lookahead != EOF && g->lookahead != '|' && g->lookahead != ')') {
615 cat = newnode(g, P_CAT);
616 cat->x = *tail;
617 cat->y = parserep(g);
618 *tail = cat;
619 tail = &cat->y;
620 }
621 return head;
622 }
623 return NULL;
624 }
625
626 static Renode *parsealt(struct cstate *g)
627 {
628 Renode *alt, *x;
629 alt = parsecat(g);
630 while (accept(g, '|')) {
631 x = alt;
632 alt = newnode(g, P_ALT);
633 alt->x = x;
634 alt->y = parsecat(g);
635 }
636 return alt;
637 }
638
639 /* Compile */
640
641 enum {
642 I_END, I_JUMP, I_SPLIT, I_PLA, I_NLA,
643 I_ANYNL, I_ANY, I_CHAR, I_CCLASS, I_NCCLASS, I_REF,
644 I_BOL, I_EOL, I_WORD, I_NWORD,
645 I_LPAR, I_RPAR
646 };
647
648 struct Reinst {
649 unsigned char opcode;
650 unsigned char n;
651 Rune c;
652 Reclass *cc;
653 Reinst *x;
654 Reinst *y;
655 };
656
657 static int count(struct cstate *g, Renode *node, int depth)
658 {
659 int min, max, n;
660 if (!node) return 0;
661 if (++depth > REG_MAXREC) die(g, "stack overflow");
662 switch (node->type) {
663 default: return 1;
664 case P_CAT: return count(g, node->x, depth) + count(g, node->y, depth);
665 case P_ALT: return count(g, node->x, depth) + count(g, node->y, depth) + 2;
666 case P_REP:
667 min = node->m;
668 max = node->n;
669 if (min == max) n = count(g, node->x, depth) * min;
670 else if (max < REPINF) n = count(g, node->x, depth) * max + (max - min);
671 else n = count(g, node->x, depth) * (min + 1) + 2;
672 if (n < 0 || n > REG_MAXPROG) die(g, "program too large");
673 return n;
674 case P_PAR: return count(g, node->x, depth) + 2;
675 case P_PLA: return count(g, node->x, depth) + 2;
676 case P_NLA: return count(g, node->x, depth) + 2;
677 }
678 }
679
680 static Reinst *emit(Reprog *prog, int opcode)
681 {
682 Reinst *inst = prog->end++;
683 inst->opcode = opcode;
684 inst->n = 0;
685 inst->c = 0;
686 inst->cc = NULL;
687 inst->x = inst->y = NULL;
688 return inst;
689 }
690
691 static void compile(Reprog *prog, Renode *node)
692 {
693 Reinst *inst, *split, *jump;
694 int i;
695
696 loop:
697 if (!node)
698 return;
699
700 switch (node->type) {
701 case P_CAT:
702 compile(prog, node->x);
703 node = node->y;
704 goto loop;
705
706 case P_ALT:
707 split = emit(prog, I_SPLIT);
708 compile(prog, node->x);
709 jump = emit(prog, I_JUMP);
710 compile(prog, node->y);
711 split->x = split + 1;
712 split->y = jump + 1;
713 jump->x = prog->end;
714 break;
715
716 case P_REP:
717 inst = NULL; /* silence compiler warning. assert(node->m > 0). */
718 for (i = 0; i < node->m; ++i) {
719 inst = prog->end;
720 compile(prog, node->x);
721 }
722 if (node->m == node->n)
723 break;
724 if (node->n < REPINF) {
725 for (i = node->m; i < node->n; ++i) {
726 split = emit(prog, I_SPLIT);
727 compile(prog, node->x);
728 if (node->ng) {
729 split->y = split + 1;
730 split->x = prog->end;
731 } else {
732 split->x = split + 1;
733 split->y = prog->end;
734 }
735 }
736 } else if (node->m == 0) {
737 split = emit(prog, I_SPLIT);
738 compile(prog, node->x);
739 jump = emit(prog, I_JUMP);
740 if (node->ng) {
741 split->y = split + 1;
742 split->x = prog->end;
743 } else {
744 split->x = split + 1;
745 split->y = prog->end;
746 }
747 jump->x = split;
748 } else {
749 split = emit(prog, I_SPLIT);
750 if (node->ng) {
751 split->y = inst;
752 split->x = prog->end;
753 } else {
754 split->x = inst;
755 split->y = prog->end;
756 }
757 }
758 break;
759
760 case P_BOL: emit(prog, I_BOL); break;
761 case P_EOL: emit(prog, I_EOL); break;
762 case P_WORD: emit(prog, I_WORD); break;
763 case P_NWORD: emit(prog, I_NWORD); break;
764
765 case P_PAR:
766 inst = emit(prog, I_LPAR);
767 inst->n = node->n;
768 compile(prog, node->x);
769 inst = emit(prog, I_RPAR);
770 inst->n = node->n;
771 break;
772 case P_PLA:
773 split = emit(prog, I_PLA);
774 compile(prog, node->x);
775 emit(prog, I_END);
776 split->x = split + 1;
777 split->y = prog->end;
778 break;
779 case P_NLA:
780 split = emit(prog, I_NLA);
781 compile(prog, node->x);
782 emit(prog, I_END);
783 split->x = split + 1;
784 split->y = prog->end;
785 break;
786
787 case P_ANY:
788 emit(prog, I_ANY);
789 break;
790 case P_CHAR:
791 inst = emit(prog, I_CHAR);
792 inst->c = (prog->flags & REG_ICASE) ? canon(node->c) : node->c;
793 break;
794 case P_CCLASS:
795 inst = emit(prog, I_CCLASS);
796 inst->cc = prog->cclass + node->cc;
797 break;
798 case P_NCCLASS:
799 inst = emit(prog, I_NCCLASS);
800 inst->cc = prog->cclass + node->cc;
801 break;
802 case P_REF:
803 inst = emit(prog, I_REF);
804 inst->n = node->n;
805 break;
806 }
807 }
808
809 #ifdef TEST
810 static void dumpnode(struct cstate *g, Renode *node)
811 {
812 Rune *p;
813 Reclass *cc;
814 if (!node) { printf("Empty"); return; }
815 switch (node->type) {
816 case P_CAT: printf("Cat("); dumpnode(g, node->x); printf(", "); dumpnode(g, node->y); printf(")"); break;
817 case P_ALT: printf("Alt("); dumpnode(g, node->x); printf(", "); dumpnode(g, node->y); printf(")"); break;
818 case P_REP:
819 printf(node->ng ? "NgRep(%d,%d," : "Rep(%d,%d,", node->m, node->n);
820 dumpnode(g, node->x);
821 printf(")");
822 break;
823 case P_BOL: printf("Bol"); break;
824 case P_EOL: printf("Eol"); break;
825 case P_WORD: printf("Word"); break;
826 case P_NWORD: printf("NotWord"); break;
827 case P_PAR: printf("Par(%d,", node->n); dumpnode(g, node->x); printf(")"); break;
828 case P_PLA: printf("PLA("); dumpnode(g, node->x); printf(")"); break;
829 case P_NLA: printf("NLA("); dumpnode(g, node->x); printf(")"); break;
830 case P_ANY: printf("Any"); break;
831 case P_CHAR: printf("Char(%c)", node->c); break;
832 case P_CCLASS:
833 printf("Class(");
834 cc = g->cclass + node->cc;
835 for (p = cc->spans; p < cc->end; p += 2) printf("%02X-%02X,", p[0], p[1]);
836 printf(")");
837 break;
838 case P_NCCLASS:
839 printf("NotClass(");
840 cc = g->cclass + node->cc;
841 for (p = cc->spans; p < cc->end; p += 2) printf("%02X-%02X,", p[0], p[1]);
842 printf(")");
843 break;
844 case P_REF: printf("Ref(%d)", node->n); break;
845 }
846 }
847
848 static void dumpcclass(Reclass *cc) {
849 Rune *p;
850 for (p = cc->spans; p < cc->end; p += 2) {
851 if (p[0] > 32 && p[0] < 127)
852 printf(" %c", p[0]);
853 else
854 printf(" \\x%02x", p[0]);
855 if (p[1] > 32 && p[1] < 127)
856 printf("-%c", p[1]);
857 else
858 printf("-\\x%02x", p[1]);
859 }
860 putchar('\n');
861 }
862
863 static void dumpprog(Reprog *prog)
864 {
865 Reinst *inst;
866 int i;
867 for (i = 0, inst = prog->start; inst < prog->end; ++i, ++inst) {
868 printf("% 5d: ", i);
869 switch (inst->opcode) {
870 case I_END: puts("end"); break;
871 case I_JUMP: printf("jump %d\n", (int)(inst->x - prog->start)); break;
872 case I_SPLIT: printf("split %d %d\n", (int)(inst->x - prog->start), (int)(inst->y - prog->start)); break;
873 case I_PLA: printf("pla %d %d\n", (int)(inst->x - prog->start), (int)(inst->y - prog->start)); break;
874 case I_NLA: printf("nla %d %d\n", (int)(inst->x - prog->start), (int)(inst->y - prog->start)); break;
875 case I_ANY: puts("any"); break;
876 case I_ANYNL: puts("anynl"); break;
877 case I_CHAR: printf(inst->c >= 32 && inst->c < 127 ? "char '%c'\n" : "char U+%04X\n", inst->c); break;
878 case I_CCLASS: printf("cclass"); dumpcclass(inst->cc); break;
879 case I_NCCLASS: printf("ncclass"); dumpcclass(inst->cc); break;
880 case I_REF: printf("ref %d\n", inst->n); break;
881 case I_BOL: puts("bol"); break;
882 case I_EOL: puts("eol"); break;
883 case I_WORD: puts("word"); break;
884 case I_NWORD: puts("nword"); break;
885 case I_LPAR: printf("lpar %d\n", inst->n); break;
886 case I_RPAR: printf("rpar %d\n", inst->n); break;
887 }
888 }
889 }
890 #endif
891
892 Reprog *regcompx(void *(*alloc)(void *ctx, void *p, int n), void *ctx,
893 const char *pattern, int cflags, const char **errorp)
894 {
895 struct cstate g;
896 Renode *node;
897 Reinst *split, *jump;
898 int i, n;
899
900 g.pstart = NULL;
901 g.prog = NULL;
902
903 if (setjmp(g.kaboom)) {
904 if (errorp) *errorp = g.error;
905 alloc(ctx, g.pstart, 0);
906 if (g.prog) {
907 alloc(ctx, g.prog->cclass, 0);
908 alloc(ctx, g.prog->start, 0);
909 alloc(ctx, g.prog, 0);
910 }
911 return NULL;
912 }
913
914 g.prog = alloc(ctx, NULL, sizeof (Reprog));
915 if (!g.prog)
916 die(&g, "cannot allocate regular expression");
917 g.prog->start = NULL;
918 g.prog->cclass = NULL;
919
920 n = strlen(pattern) * 2;
921 if (n > REG_MAXPROG)
922 die(&g, "program too large");
923 if (n > 0) {
924 g.pstart = g.pend = alloc(ctx, NULL, sizeof (Renode) * n);
925 if (!g.pstart)
926 die(&g, "cannot allocate regular expression parse list");
927 }
928
929 g.source = pattern;
930 g.ncclass = 0;
931 g.nsub = 1;
932 for (i = 0; i < REG_MAXSUB; ++i)
933 g.sub[i] = 0;
934
935 g.prog->flags = cflags;
936
937 next(&g);
938 node = parsealt(&g);
939 if (g.lookahead == ')')
940 die(&g, "unmatched ')'");
941 if (g.lookahead != EOF)
942 die(&g, "syntax error");
943
944 #ifdef TEST
945 dumpnode(&g, node);
946 putchar('\n');
947 #endif
948
949 n = 6 + count(&g, node, 0);
950 if (n < 0 || n > REG_MAXPROG)
951 die(&g, "program too large");
952
953 g.prog->nsub = g.nsub;
954 g.prog->start = g.prog->end = alloc(ctx, NULL, n * sizeof (Reinst));
955 if (!g.prog->start)
956 die(&g, "cannot allocate regular expression instruction list");
957
958 if (g.ncclass > 0) {
959 g.prog->cclass = alloc(ctx, NULL, g.ncclass * sizeof (Reclass));
960 if (!g.prog->cclass)
961 die(&g, "cannot allocate regular expression character class list");
962 memcpy(g.prog->cclass, g.cclass, g.ncclass * sizeof (Reclass));
963 for (i = 0; i < g.ncclass; ++i)
964 g.prog->cclass[i].end = g.prog->cclass[i].spans + (g.cclass[i].end - g.cclass[i].spans);
965 }
966
967 split = emit(g.prog, I_SPLIT);
968 split->x = split + 3;
969 split->y = split + 1;
970 emit(g.prog, I_ANYNL);
971 jump = emit(g.prog, I_JUMP);
972 jump->x = split;
973 emit(g.prog, I_LPAR);
974 compile(g.prog, node);
975 emit(g.prog, I_RPAR);
976 emit(g.prog, I_END);
977
978 #ifdef TEST
979 dumpprog(g.prog);
980 #endif
981
982 alloc(ctx, g.pstart, 0);
983
984 if (errorp) *errorp = NULL;
985 return g.prog;
986 }
987
988 void regfreex(void *(*alloc)(void *ctx, void *p, int n), void *ctx, Reprog *prog)
989 {
990 if (prog) {
991 if (prog->cclass)
992 alloc(ctx, prog->cclass, 0);
993 alloc(ctx, prog->start, 0);
994 alloc(ctx, prog, 0);
995 }
996 }
997
998 static void *default_alloc(void *ctx, void *p, int n)
999 {
1000 if (n == 0) {
1001 free(p);
1002 return NULL;
1003 }
1004 return realloc(p, (size_t)n);
1005 }
1006
1007 Reprog *regcomp(const char *pattern, int cflags, const char **errorp)
1008 {
1009 return regcompx(default_alloc, NULL, pattern, cflags, errorp);
1010 }
1011
1012 void regfree(Reprog *prog)
1013 {
1014 regfreex(default_alloc, NULL, prog);
1015 }
1016
1017 /* Match */
1018
1019 static int isnewline(int c)
1020 {
1021 return c == 0xA || c == 0xD || c == 0x2028 || c == 0x2029;
1022 }
1023
1024 static int iswordchar(int c)
1025 {
1026 return c == '_' ||
1027 (c >= 'a' && c <= 'z') ||
1028 (c >= 'A' && c <= 'Z') ||
1029 (c >= '0' && c <= '9');
1030 }
1031
1032 static int incclass(Reclass *cc, Rune c)
1033 {
1034 Rune *p;
1035 for (p = cc->spans; p < cc->end; p += 2)
1036 if (p[0] <= c && c <= p[1])
1037 return 1;
1038 return 0;
1039 }
1040
1041 static int incclasscanon(Reclass *cc, Rune c)
1042 {
1043 Rune *p, r;
1044 for (p = cc->spans; p < cc->end; p += 2)
1045 for (r = p[0]; r <= p[1]; ++r)
1046 if (c == canon(r))
1047 return 1;
1048 return 0;
1049 }
1050
1051 static int strncmpcanon(const char *a, const char *b, int n)
1052 {
1053 Rune ra, rb;
1054 int c;
1055 while (n--) {
1056 if (!*a) return -1;
1057 if (!*b) return 1;
1058 a += chartorune(&ra, a);
1059 b += chartorune(&rb, b);
1060 c = canon(ra) - canon(rb);
1061 if (c)
1062 return c;
1063 }
1064 return 0;
1065 }
1066
1067 static int match(Reinst *pc, const char *sp, const char *bol, int flags, Resub *out, int depth)
1068 {
1069 Resub scratch;
1070 int result;
1071 int i;
1072 Rune c;
1073
1074 /* stack overflow */
1075 if (depth > REG_MAXREC)
1076 return -1;
1077
1078 for (;;) {
1079 switch (pc->opcode) {
1080 case I_END:
1081 return 0;
1082 case I_JUMP:
1083 pc = pc->x;
1084 break;
1085 case I_SPLIT:
1086 scratch = *out;
1087 result = match(pc->x, sp, bol, flags, &scratch, depth+1);
1088 if (result == -1)
1089 return -1;
1090 if (result == 0) {
1091 *out = scratch;
1092 return 0;
1093 }
1094 pc = pc->y;
1095 break;
1096
1097 case I_PLA:
1098 result = match(pc->x, sp, bol, flags, out, depth+1);
1099 if (result == -1)
1100 return -1;
1101 if (result == 1)
1102 return 1;
1103 pc = pc->y;
1104 break;
1105 case I_NLA:
1106 scratch = *out;
1107 result = match(pc->x, sp, bol, flags, &scratch, depth+1);
1108 if (result == -1)
1109 return -1;
1110 if (result == 0)
1111 return 1;
1112 pc = pc->y;
1113 break;
1114
1115 case I_ANYNL:
1116 if (!*sp) return 1;
1117 sp += chartorune(&c, sp);
1118 pc = pc + 1;
1119 break;
1120 case I_ANY:
1121 if (!*sp) return 1;
1122 sp += chartorune(&c, sp);
1123 if (isnewline(c))
1124 return 1;
1125 pc = pc + 1;
1126 break;
1127 case I_CHAR:
1128 if (!*sp) return 1;
1129 sp += chartorune(&c, sp);
1130 if (flags & REG_ICASE)
1131 c = canon(c);
1132 if (c != pc->c)
1133 return 1;
1134 pc = pc + 1;
1135 break;
1136 case I_CCLASS:
1137 if (!*sp) return 1;
1138 sp += chartorune(&c, sp);
1139 if (flags & REG_ICASE) {
1140 if (!incclasscanon(pc->cc, canon(c)))
1141 return 1;
1142 } else {
1143 if (!incclass(pc->cc, c))
1144 return 1;
1145 }
1146 pc = pc + 1;
1147 break;
1148 case I_NCCLASS:
1149 if (!*sp) return 1;
1150 sp += chartorune(&c, sp);
1151 if (flags & REG_ICASE) {
1152 if (incclasscanon(pc->cc, canon(c)))
1153 return 1;
1154 } else {
1155 if (incclass(pc->cc, c))
1156 return 1;
1157 }
1158 pc = pc + 1;
1159 break;
1160 case I_REF:
1161 i = out->sub[pc->n].ep - out->sub[pc->n].sp;
1162 if (flags & REG_ICASE) {
1163 if (strncmpcanon(sp, out->sub[pc->n].sp, i))
1164 return 1;
1165 } else {
1166 if (strncmp(sp, out->sub[pc->n].sp, i))
1167 return 1;
1168 }
1169 if (i > 0)
1170 sp += i;
1171 pc = pc + 1;
1172 break;
1173
1174 case I_BOL:
1175 if (sp == bol && !(flags & REG_NOTBOL)) {
1176 pc = pc + 1;
1177 break;
1178 }
1179 if (flags & REG_NEWLINE) {
1180 if (sp > bol && isnewline(sp[-1])) {
1181 pc = pc + 1;
1182 break;
1183 }
1184 }
1185 return 1;
1186 case I_EOL:
1187 if (*sp == 0) {
1188 pc = pc + 1;
1189 break;
1190 }
1191 if (flags & REG_NEWLINE) {
1192 if (isnewline(*sp)) {
1193 pc = pc + 1;
1194 break;
1195 }
1196 }
1197 return 1;
1198 case I_WORD:
1199 i = sp > bol && iswordchar(sp[-1]);
1200 i ^= iswordchar(sp[0]);
1201 if (!i)
1202 return 1;
1203 pc = pc + 1;
1204 break;
1205 case I_NWORD:
1206 i = sp > bol && iswordchar(sp[-1]);
1207 i ^= iswordchar(sp[0]);
1208 if (i)
1209 return 1;
1210 pc = pc + 1;
1211 break;
1212
1213 case I_LPAR:
1214 out->sub[pc->n].sp = sp;
1215 pc = pc + 1;
1216 break;
1217 case I_RPAR:
1218 out->sub[pc->n].ep = sp;
1219 pc = pc + 1;
1220 break;
1221 default:
1222 return 1;
1223 }
1224 }
1225 }
1226
1227 int regexec(Reprog *prog, const char *sp, Resub *sub, int eflags)
1228 {
1229 Resub scratch;
1230 int i;
1231
1232 if (!sub)
1233 sub = &scratch;
1234
1235 sub->nsub = prog->nsub;
1236 for (i = 0; i < REG_MAXSUB; ++i)
1237 sub->sub[i].sp = sub->sub[i].ep = NULL;
1238
1239 return match(prog->start, sp, sp, prog->flags | eflags, sub, 0);
1240 }
1241
1242 #ifdef TEST
1243 int main(int argc, char **argv)
1244 {
1245 const char *error;
1246 const char *s;
1247 Reprog *p;
1248 Resub m;
1249 int i;
1250
1251 if (argc > 1) {
1252 p = regcomp(argv[1], 0, &error);
1253 if (!p) {
1254 fprintf(stderr, "regcomp: %s\n", error);
1255 return 1;
1256 }
1257
1258 if (argc > 2) {
1259 s = argv[2];
1260 printf("nsub = %d\n", p->nsub);
1261 if (!regexec(p, s, &m, 0)) {
1262 for (i = 0; i < m.nsub; ++i) {
1263 int n = m.sub[i].ep - m.sub[i].sp;
1264 if (n > 0)
1265 printf("match %d: s=%d e=%d n=%d '%.*s'\n", i, (int)(m.sub[i].sp - s), (int)(m.sub[i].ep - s), n, n, m.sub[i].sp);
1266 else
1267 printf("match %d: n=0 ''\n", i);
1268 }
1269 } else {
1270 printf("no match\n");
1271 }
1272 }
1273 }
1274
1275 return 0;
1276 }
1277 #endif