Logo

index : blog

---

  • summary
  • about
  • tree
  • log
  • branches
<< path: root/public/blog.git/html/modules/uniform/parse.jai blob: 932f67caa9ee45ea01a48982a184a68634ce33cc [raw] [clear marker]

        
0// This parser is based on RE2’s parser
1// https://github.com/google/re2
2
3ParseFlags :: enum_flags u16 {
4 NoParseFlags :: 0;
5 FoldCase :: 1<<0; // Fold case during matching (case-insensitive).
6 // Literal :: 1<<1; // Treat s as literal string instead of a regexp.
7 ClassNL :: 1<<2; // Allow char classes like [^a-z] and \D and \s
8 // and [[:space:]] to match newline.
9 DotNL :: 1<<3; // Allow . to match newline.
10 MatchNL :: ClassNL | DotNL;
11 OneLine :: 1<<4; // Treat ^ and $ as only matching at beginning and
12 // end of text, not around embedded newlines.
13 // (Perl's default)
14 // Latin1 :: 1<<5; // Regexp_Node and text are in Latin1, not UTF-8.
15 NonGreedy :: 1<<6; // Repetition operators are non-greedy by default.
16 PerlClasses :: 1<<7; // Allow Perl character classes like \d.
17 PerlB :: 1<<8; // Allow Perl's \b and \B.
18 PerlX :: 1<<9; // Perl extensions:
19 // non-capturing parens - (?: )
20 // non-greedy operators - *? +? ?? {}?
21 // flag edits - (?i) (?-i) (?i: )
22 // i - FoldCase
23 // m - !OneLine
24 // s - DotNL
25 // U - NonGreedy
26 // line ends: \A \z
27 // \Q and \E to disable/enable metacharacters
28 // (?P<name>expr) for named captures
29 // \C to match any single byte
30 UnicodeGroups :: 1<<10; // Allow \p{Han} for Unicode Han group
31 // and \P{Han} for its negation.
32 NeverNL :: 1<<11; // Never match NL, even if the regexp mentions
33 // it explicitly.
34 NeverCapture :: 1<<12; // Parse all parens as non-capturing.
35
36 // As close to Perl as we can get.
37 LikePerl :: ClassNL | OneLine | PerlClasses | PerlB | PerlX |
38 UnicodeGroups;
39
40 // Internal use only.
41 WasDollar :: 1<<13; // on EndText: was $ in regexp text
42 AllParseFlags :: (1<<14)-1;
43};
44
45
46Regexp_Node :: struct {
47 // Operator.
48 op: RegexpOp;
49 // Is this regexp structure already simple
50 // (has it been returned by Simplify)?
51 simple: bool;
52
53 // Flags saved from parsing and used during execution.
54 // (Only FoldCase is used.)
55 parse_flags: ParseFlags;
56
57 // Subexpressions.
58 // Concat and Alternate handle larger numbers of subexpressions
59 // by building concatenation or alternation trees.
60 // Other routines should call Concat or Alternate instead of
61 // filling in sub() by hand.
62 MAX_SUBS :: 0xFFFF;
63 subs: [] *Regexp_Node;
64 // Extra space for parse and teardown stacks.
65 down: *Regexp_Node;
66
67 // Arguments to operator.
68 using args: union {
69 struct { // Repeat
70 max: s16;
71 min: s16;
72 }
73 struct { // Capture
74 cap: s32;
75 name: string;
76 }
77 runes: [..] Rune; // LiteralString
78 struct { // CharClass
79 char_class: *CharClass;
80 char_class_finished: bool;
81 }
82 rune: Rune; // Literal
83 // @ToDo Verify size is ok
84 match_id: int; // HaveMatch
85 // void *the_union_[2]; // as big as any other element, for memset
86 };
87}
88
89RegexpOp :: enum {
90 // @ToDo: Rename all options
91
92 // Matches no strings.
93 NoMatch :: 1;
94
95 // Matches empty string.
96 EmptyMatch;
97
98 // Matches rune_.
99 Literal;
100
101 // Matches runes_.
102 LiteralString;
103
104 // Matches concatenation of sub_[0..nsub-1].
105 Concat;
106 // Matches union of sub_[0..nsub-1].
107 Alternate;
108
109 // Matches sub_[0] zero or more times.
110 Star;
111 // Matches sub_[0] one or more times.
112 Plus;
113 // Matches sub_[0] zero or one times.
114 Quest;
115
116 // Matches sub_[0] at least min_ times, at most max_ times.
117 // max_ == -1 means no upper limit.
118 Repeat;
119
120 // Parenthesized (capturing) subexpression. Index is cap_.
121 // Optionally, capturing name is name_.
122 Capture;
123
124 // Matches any character.
125 AnyChar;
126
127 // Matches any byte [sic].
128 AnyByte;
129
130 // Matches empty string at beginning of line.
131 BeginLine;
132 // Matches empty string at end of line.
133 EndLine;
134
135 // Matches word boundary "\b".
136 WordBoundary;
137 // Matches not-a-word boundary "\B".
138 NoWordBoundary;
139
140 // Matches empty string at beginning of text.
141 BeginText;
142 // Matches empty string at end of text.
143 EndText;
144
145 // Matches character class given by cc_.
146 CharClass;
147
148 // Forces match of entire expression right now,
149 // with match ID match_id_ (used by RE2::Set).
150 HaveMatch;
151
152 // The ones below are only used internally by the parser and don’t occur in finshed RegExps
153 kLeftParen;
154 kVerticalBar;
155};
156
157Regexp_Pool :: struct {
158 pool: *Pool;
159 bucket: Bucket_Array(Regexp_Node, 64);
160}
161
162init :: (pool: *Regexp_Pool) {
163 pool.pool = New(Pool);
164 set_allocators(pool.pool);
165 #if REGEXP_DEBUG pool.pool.overwrite_memory = true;
166 pool.bucket.allocator.proc = pool_allocator_proc;
167 pool.bucket.allocator.data = pool.pool;
168}
169
170uninit :: (pool: *Regexp_Pool) {
171 release(pool.pool);
172 free(pool.pool);
173}
174
175#scope_module
176
177new_regexp :: (pool: *Regexp_Pool, op: RegexpOp, flags: ParseFlags) -> *Regexp_Node {
178 re := find_and_occupy_empty_slot(*pool.bucket);
179 // Need to clear, in case we re-use a bucket slot
180 // @ToDo, @Speed: This is unnecessary when its a fresh slot. Find a better way
181 memset(re, 0, size_of(Regexp_Node));
182 re.op = op;
183 re.parse_flags = flags;
184 return re;
185}
186
187remove_regexp :: (pool: *Regexp_Pool, re: *Regexp_Node) {
188 // @ToDo: Delete subs!
189 // @ToDO: Delete char_class!
190 for * pool.bucket {
191 if it == re {
192 remove it;
193 break;
194 }
195 }
196}
197
198// Tests equality of all top-level structure but not subregexps.
199regexp_top_equal :: (a: *Regexp_Node, b: *Regexp_Node) -> bool {
200 if a.op != b.op return false;
201
202 if a.op == {
203 case .NoMatch; #through;
204 case .EmptyMatch; #through;
205 case .AnyChar; #through;
206 case .AnyByte; #through;
207 case .BeginLine; #through;
208 case .EndLine; #through;
209 case .WordBoundary; #through;
210 case .NoWordBoundary; #through;
211 case .BeginText;
212 return true;
213
214 case .EndText;
215 // The parse flags remember whether it's \z or (?-m:$),
216 // which matters when testing against PCRE.
217 return ((a.parse_flags ^ b.parse_flags) & .WasDollar) == 0;
218
219 case .Literal;
220 return a.rune == b.rune && ((a.parse_flags ^ b.parse_flags) & .FoldCase) == 0;
221
222 case .LiteralString;
223 return a.runes.count == b.runes.count && ((a.parse_flags ^ b.parse_flags) & .FoldCase) == 0 &&
224 memcmp(a.runes.data, b.runes.data, a.runes.count * size_of(Rune)) == 0;
225
226 case .Alternate; #through;
227 case .Concat;
228 return a.subs.count == b.subs.count;
229
230 case .Star; #through;
231 case .Plus; #through;
232 case .Quest;
233 return ((a.parse_flags ^ b.parse_flags) & .NonGreedy) == 0;
234
235 case .Repeat;
236 return ((a.parse_flags ^ b.parse_flags) & .NonGreedy) == 0 && a.min == b.min && a.max == b.max;
237
238 case .Capture;
239 return a.cap == b.cap && a.name == b.name;
240
241 case .HaveMatch;
242 return a.match_id == b.match_id;
243
244 case .CharClass;
245 return a.char_class.nrunes == b.char_class.nrunes &&
246 a.char_class.ranges.count == b.char_class.ranges.count &&
247 memcmp(a.char_class.ranges.data, b.char_class.ranges.data, a.char_class.ranges.count * size_of(RuneRange)) == 0;
248 case;
249 assert(false, "Unexpected op: %", a.op);
250 return false;
251 }
252}
253
254regexp_equal :: (a: *Regexp_Node, b: *Regexp_Node) -> bool {
255 if a == null || b == null return a == b;
256
257 if !regexp_top_equal(a, b) return false;
258
259 // The stack has pairs of regexps waiting to
260 // be compared. The regexps are only equal if
261 // all the pairs end up being equal.
262 stack: [..] *Regexp_Node;
263 stack.allocator = temp;
264
265 a1 := a;
266 b1 := b;
267 while true {
268 // Invariant: TopEqual(a1, b1) == true.
269 a2: *Regexp_Node;
270 b2: *Regexp_Node;
271 if a1.op == {
272 case .Alternate; #through;
273 case .Concat;
274 for i: 0..a1.subs.count-1 {
275 a2 = a1.subs[i];
276 b2 = b1.subs[i];
277 if !regexp_top_equal(a2, b2) return false;
278
279 array_add(*stack, a2);
280 array_add(*stack, b2);
281 }
282
283 case .Star; #through;
284 case .Plus; #through;
285 case .Quest; #through;
286 case .Repeat; #through;
287 case .Capture;
288 a2 = a1.subs[0];
289 b2 = b1.subs[0];
290 if !regexp_top_equal(a2, b2) return false;
291 // Really:
292 // stack.push_back(a2);
293 // stack.push_back(b2);
294 // break;
295 // but faster to assign directly and loop.
296 a1 = a2;
297 b1 = b2;
298 continue;
299 case;
300 return true;
301 }
302
303 if !stack.count return true;
304
305 a1 = stack[stack.count-2];
306 b1 = stack[stack.count-1];
307 stack.count -= 2;
308 }
309
310 return true;
311}
312
313// Determines whether regexp matches must be anchored
314// with a fixed string prefix. If so, returns the prefix and
315// the regexp that remains after the prefix. The prefix might
316// be ASCII case-insensitive.
317required_prefix :: (re: *Regexp_Node, pool: *Regexp_Pool) -> prefix: string, foldcase: bool, suffix: *Regexp_Node {
318 // No need for a walker: the regexp must be of the form
319 // 1. some number of ^ anchors
320 // 2. a literal char or string
321 // 3. the rest
322 suffix := re;
323 if re.op != .Concat return "", false, suffix;
324 i := 0;
325 while i < re.subs.count && re.subs[i].op == .BeginText {
326 i += 1;
327 }
328 if i == 0 || i >= re.subs.count return "", false, suffix;
329
330 to_analyze := re.subs[i];
331 if to_analyze.op != .Literal && to_analyze.op != .LiteralString return "", false, suffix;
332
333 i += 1;
334 if (i < re.subs.count) {
335 suffix = new_concat(pool, array_view(re.subs, i, re.subs.count - i), re.parse_flags);
336 } else {
337 suffix = new_regexp(pool, .EmptyMatch, re.parse_flags);
338 }
339
340 runes: [] Rune;
341 if to_analyze.op == .Literal {
342 runes.data = *to_analyze.rune;
343 runes.count = 1;
344 } else {
345 runes = to_analyze.runes;
346 }
347
348 {
349 push_allocator(pool_allocator_proc, pool.pool);
350 prefix := string_from_runes(runes);
351 return prefix, (to_analyze.parse_flags & .FoldCase) != 0, suffix;
352 }
353}
354
355required_prefix_for_accel :: (re: Regexp_Node) -> prefix: string, foldcase: bool {
356 // No need for a walker: the regexp must either begin with or be
357 // a literal char or string.
358 to_analyze := *re;
359 if re.op == .Concat && re.subs.count > 0 {
360 to_analyze = re.subs[0];
361 }
362
363 if to_analyze.op != .Literal && to_analyze.op != .LiteralString return "", false;
364
365 runes: [] Rune;
366 if to_analyze.op == .Literal {
367 runes.data = *to_analyze.rune;
368 runes.count = 1;
369 } else {
370 runes = to_analyze.runes;
371 }
372 prefix := string_from_runes(runes);
373
374 return prefix, (to_analyze.parse_flags & .FoldCase) != 0;
375}
376
377// @ToDo: rename these
378StatusCode :: enum {
379 // No error
380 Success :: 0;
381
382 // Unexpected error
383 InternalError;
384
385 // Parse errors
386 BadEscape; // bad escape sequence
387 BadCharClass; // bad character class
388 BadCharRange; // bad character class range
389 MissingBracket; // missing closing ]
390 MissingParen; // missing closing )
391 UnexpectedParen; // unexpected closing )
392 TrailingBackslash; // at end of regexp
393 RepeatArgument; // repeat argument missing, e.g. "*"
394 RepeatSize; // bad repetition argument
395 RepeatOp; // bad repetition operator
396 BadPerlOp; // bad perl operator
397 BadUTF8; // invalid UTF-8 in regexp
398 BadNamedCapture; // bad named capture
399};
400
401// Regular expression parse state.
402// The list of parsed regexps so far is maintained as a vector of
403// Regexp_Node pointers called the stack. Left parenthesis and vertical
404// bar markers are also placed on the stack, as Regexps with
405// non-standard opcodes.
406// Scanning a left parenthesis causes the parser to push a left parenthesis
407// marker on the stack.
408// Scanning a vertical bar causes the parser to pop the stack until it finds a
409// vertical bar or left parenthesis marker (not popping the marker),
410// concatenate all the popped results, and push them back on
411// the stack (DoConcatenation).
412// Scanning a right parenthesis causes the parser to act as though it
413// has seen a vertical bar, which then leaves the top of the stack in the
414// form LeftParen regexp VerticalBar regexp VerticalBar ... regexp VerticalBar.
415// The parser pops all this off the stack and creates an alternation of the
416// regexps (DoAlternation).
417ParseState :: struct {
418 flags: ParseFlags;
419 whole_regexp: string;
420 status: Status;
421 stacktop: *Regexp_Node;
422 ncap: s32; // number of capturing parens seen
423 rune_max := Runemax; // maximum char value for this encoding
424 pool: Regexp_Pool;
425}
426
427Status :: struct {
428 code: StatusCode;
429 error_arg: string;
430}
431
432make_status :: (code := StatusCode.Success, error_arg := "") -> Status {
433 status: Status;
434 status.code = code;
435 status.error_arg = error_arg;
436 return status;
437}
438
439rep :: (ps: *ParseState, op: RegexpOp, t: *string, lastunary: string) -> isunary: string, success: bool {
440 opstr := t.*;
441 nongreedy := false;
442 advance(t); // '*' or '+' or '?'
443 if (ps.flags & .PerlX) {
444 if (t.count && (t.*)[0] == #char "?") {
445 nongreedy = true;
446 advance(t); // '?'
447 }
448 if (lastunary) {
449 // In Perl it is not allowed to stack repetition operators:
450 // a** is a syntax error, not a double-star.
451 // (and a++ means something else entirely, which we don't support!)
452 ps.status.code = .RepeatOp;
453 ps.status.error_arg.data = lastunary.data;
454 ps.status.error_arg.count = t.data - lastunary.data;
455 return "", false;
456 }
457 }
458
459 opstr.count -= t.count;
460 if !push_repeat_op(ps, op, opstr, nongreedy)
461 return "", false;
462 return opstr, true;
463}
464
465// Parses the regular expression given by s,
466// returning the corresponding Regexp_Node tree.
467// Returns null on error.
468parse :: (s: string, global_flags: ParseFlags = .NoParseFlags) -> *Regexp_Node, status: Status, pool: Regexp_Pool {
469 ps: ParseState;
470 ps.whole_regexp = s;
471 ps.flags = global_flags;
472
473 init(*ps.pool);
474 push_allocator(pool_allocator_proc, ps.pool.pool);
475
476 lastunary: string;
477 t := s;
478 while t.count {
479 isunary: string;
480 if t[0] == {
481 case; {
482 r, len, status := consume_rune(*t);
483 if !len return null, status, ps.pool;
484 push_literal(*ps, r);
485 }
486
487 case #char "(";
488 // "(?" introduces Perl escape.
489 if ps.flags & .PerlX && t.count >= 2 && t[1] == #char "?" {
490 // Flag changes and non-capturing groups.
491 success: bool;
492 ps.status, success = parse_perl_flags(*ps, *t);
493 if !success return null, ps.status, ps.pool;
494 } else if ps.flags & .NeverCapture {
495 do_left_paren_no_capture(*ps);
496 advance(*t); // '('
497 } else {
498 do_left_paren(*ps);
499 advance(*t); // '('
500 }
501
502 case #char "|";
503 do_vertical_bar(*ps);
504 advance(*t); // '|'
505
506 case #char ")";
507 if !do_right_paren(*ps) return null, ps.status, ps.pool;
508 advance(*t); // ')'
509
510 case #char "^"; // Beginning of line.
511 push_caret(*ps);
512 advance(*t); // '^'
513
514 case #char "$"; // End of line.
515 push_dollar(*ps);
516 advance(*t); // '$'
517
518 case #char "."; // Any character (possibly except newline).
519 push_dot(*ps);
520 advance(*t); // '.'
521
522 case #char "["; // Character class.
523 re, status, success := parse_char_class(*ps, *t);
524 if !success return null, status, ps.pool;
525 push_regexp(*ps, re);
526
527 case #char "*"; // Zero or more.
528 success: bool;
529 isunary, success = rep(*ps, .Star, *t, lastunary);
530 if !success return null, ps.status, ps.pool;
531 case #char "+"; // One or more.
532 success: bool;
533 isunary, success = rep(*ps, .Plus, *t, lastunary);
534 if !success return null, ps.status, ps.pool;
535 case #char "?"; // Zero or one.
536 success: bool;
537 isunary, success = rep(*ps, .Quest, *t, lastunary);
538 if !success return null, ps.status, ps.pool;
539
540 case #char "{"; // Counted repetition.
541 opstr := t;
542 lo, hi, success := maybe_parse_repetition(*t);
543 if !success {
544 // Treat like a literal.
545 push_literal(*ps, #char "{");
546 advance(*t); // '{'
547 } else {
548 nongreedy := false;
549 if ps.flags & .PerlX {
550 if t && t[0] == #char "?" {
551 nongreedy = true;
552 advance(*t); // '?'
553 }
554 if lastunary {
555 // Not allowed to stack repetition operators.
556 ps.status.code = .RepeatOp;
557 ps.status.error_arg = slice(lastunary, 0, t.data - lastunary.data);
558 return null, ps.status, ps.pool;
559 }
560 }
561 opstr.count -= t.count;
562 if !push_repetition(*ps, lo, hi, opstr, nongreedy) {
563 return null, ps.status, ps.pool;
564 }
565 isunary = opstr;
566 }
567
568 case #char "\\"; // Escaped character or Perl sequence.
569 // \b and \B: word boundary or not
570 parsed := false;
571 if ps.flags & .PerlB &&
572 t.count >= 2 && (t[1] == #char "b" || t[1] == #char "B") {
573 if t[1] == #char "b" {
574 push_simple_op(*ps, .WordBoundary);
575 } else {
576 push_simple_op(*ps, .NoWordBoundary);
577 }
578 advance(*t, 2); // '\\', 'b'
579 parsed = true;
580 } else if (ps.flags & .PerlX) && t.count >= 2 {
581 if (t[1] == #char "A") {
582 push_simple_op(*ps, .BeginText);
583 advance(*t, 2); // '\\', 'A'
584 parsed = true;
585 } else if (t[1] == #char "z") {
586 push_simple_op(*ps, .EndText);
587 advance(*t, 2); // '\\', 'z'
588 parsed = true;
589 // Do not recognize \Z, because this library can't
590 // implement the exact Perl/PCRE semantics.
591 // (This library treats "(?-m)$" as \z, even though
592 // in Perl and PCRE it is equivalent to \Z.)
593 } else if (t[1] == #char "C") { // \C: any byte [sic]
594 push_simple_op(*ps, .AnyByte);
595 advance(*t, 2); // '\\', 'C'
596 parsed = true;
597 } else if (t[1] == #char "Q") { // \Q ... \E: the ... is always literals
598 advance(*t, 2); // '\\', 'Q'
599 while t.count {
600 if (t.count >= 2 && t[0] == #char "\\" && t[1] == #char "E") {
601 advance(*t, 2); // '\\', 'E'
602 parsed = true;
603 }
604 r, len, status := consume_rune(*t);
605 if !len return null, status, ps.pool;
606 push_literal(*ps, r);
607 }
608 parsed = true;
609 }
610 }
611
612 if (!parsed && t.count >= 2 && (t[1] == #char "p" || t[1] == #char "P")) {
613 re := new_regexp(*ps.pool, .CharClass, ps.flags & ~.FoldCase);
614 re.char_class = New(CharClass);
615 re.char_class_finished = false;
616 parseStatus, parseResult := maybe_parse_unicode_group(*t, ps.flags, re.char_class);
617 if parseResult == {
618 case .kParseOk;
619 push_regexp(*ps, re);
620 lastunary = isunary;
621 parsed = true;
622 case .kParseError;
623 remove_regexp(*ps.pool, re);
624 ps.status = parseStatus;
625 return null, ps.status, ps.pool;
626 case .kParseNothing;
627 remove_regexp(*ps.pool, re);
628 }
629 }
630
631 if !parsed {
632 g := maybe_parse_perl_cc_escape(*t, ps.flags);
633 if g != null {
634 re := new_regexp(*ps.pool, .CharClass, ps.flags & ~.FoldCase);
635 re.char_class = New(CharClass);
636 re.char_class_finished = false;
637 add_unicode_group(re.char_class, g, g.sign, ps.flags);
638 push_regexp(*ps, re);
639 parsed = true;
640 }
641 }
642
643 if !parsed {
644 r, status, success := parse_escape(*t, ps.rune_max);
645 if !success return null, status, ps.pool;
646 push_literal(*ps, r);
647 }
648 }
649 }
650 return do_finish(*ps), ps.status, ps.pool;
651}
652
653new_literal_string :: (pool: *Regexp_Pool, runes: [] Rune, flags: ParseFlags) -> *Regexp_Node {
654 if runes.count == {
655 case 0;
656 return new_regexp(pool, .EmptyMatch, flags);
657 case 1;
658 return new_literal(pool, runes[0], flags);
659 case;
660 re := new_regexp(pool, .LiteralString, flags);
661 for runes {
662 add_rune_to_string(re, it);
663 }
664 return re;
665 }
666}
667
668new_literal :: (pool: *Regexp_Pool, rune: Rune, flags: ParseFlags) -> *Regexp_Node {
669 re := new_regexp(pool, .Literal, flags);
670 re.rune = rune;
671 return re;
672}
673
674new_star_plus_quest :: (pool: *Regexp_Pool, op: RegexpOp, sub: *Regexp_Node, flags: ParseFlags) -> *Regexp_Node {
675 // Squash **, ++ and ??.
676 if (op == sub.op && flags == sub.parse_flags) return sub;
677
678 // Squash *+, *?, +*, +?, ?* and ?+. They all squash to *, so because
679 // op is Star/Plus/Quest, we just have to check that sub.op() is too.
680 if ((sub.op == .Star ||
681 sub.op == .Plus ||
682 sub.op == .Quest) &&
683 flags == sub.parse_flags) {
684 // If sub is Star, no need to rewrite it.
685 if (sub.op == .Star) return sub;
686
687 re := new_regexp(pool, .Star, flags);
688 array_resize(*re.subs, 1, false);
689 re.subs[0] = sub.subs[0];
690 return re;
691 }
692
693 re := new_regexp(pool, op, flags);
694 array_resize(*re.subs, 1, false);
695 re.subs[0] = sub;
696 return re;
697}
698
699// @ToDo: Why does #bake_arguments not work here? (Compiler bug I can’t find a simple repro for)
700// new_plus :: #bake_arguments new_star_plus_quest(op = .Plus);
701// new_star :: #bake_arguments new_star_plus_quest(op = RegexpOp.Star);
702// new_quest :: #bake_arguments new_star_plus_quest(op = .Quest);
703new_plus:: (pool: *Regexp_Pool, sub: *Regexp_Node, flags: ParseFlags) -> *Regexp_Node {
704 return new_star_plus_quest(pool, .Plus, sub, flags);
705}
706new_star :: (pool: *Regexp_Pool, sub: *Regexp_Node, flags: ParseFlags) -> *Regexp_Node {
707 return new_star_plus_quest(pool, .Star, sub, flags);
708}
709new_quest :: (pool: *Regexp_Pool, sub: *Regexp_Node, flags: ParseFlags) -> *Regexp_Node {
710 return new_star_plus_quest(pool, .Quest, sub, flags);
711}
712
713new_concat_or_alternate :: (pool: *Regexp_Pool, op: RegexpOp, subs: [] *Regexp_Node, flags: ParseFlags, can_factor: bool) -> *Regexp_Node {
714 if subs.count == 1 {
715 sub := subs[0];
716 return sub;
717 }
718
719 if subs.count == 0 {
720 if op == .Alternate {
721 return new_regexp(pool, .NoMatch, flags);
722 } else {
723 return new_regexp(pool, .EmptyMatch, flags);
724 }
725 }
726
727 // @ToDo, @Incomplete: We need this factorization for selecting the right
728 // execution engine later. But does it matter if we implement only NDA for now?
729
730 // PODArray<Regexp_Node*> subcopy;
731 // if op == .Alternate && can_factor {
732 // // Going to edit sub; make a copy so we don't step on caller.
733 // subcopy = PODArray<Regexp_Node*>(nsub);
734 // memmove(subcopy.data(), sub, nsub * sizeof sub[0]);
735 // sub = subcopy.data();
736 // nsub = FactorAlternation(sub, nsub, flags);
737 // if (nsub == 1) {
738 // Regexp_Node* re = sub[0];
739 // return re;
740 // }
741 // }
742
743 // @ToDo, @Incomplete: Is this necessary for us?
744 // We use a normal array at the moment, so we waste 6 more bytes on the size and won’t overflow
745
746 // if (nsub > MAX_SUBS) {
747 // // Too many subexpressions to fit in a single Regexp_Node.
748 // // Make a two-level tree. Two levels gets us to 65535^2.
749 // int nbigsub = (nsub+MAX_SUBS-1)/MAX_SUBS;
750 // Regexp_Node* re = new Regexp_Node(op, flags);
751 // re->AllocSub(nbigsub);
752 // Regexp_Node** subs = re->sub();
753 // for (int i = 0; i < nbigsub - 1; i++)
754 // subs[i] = ConcatOrAlternate(op, sub+i*MAX_SUBS, MAX_SUBS, flags, false);
755 // subs[nbigsub - 1] = ConcatOrAlternate(op, sub+(nbigsub-1)*MAX_SUBS,
756 // nsub - (nbigsub-1)*MAX_SUBS, flags,
757 // false);
758 // return re;
759 // }
760
761 re := new_regexp(pool, op, flags);
762 re.subs = subs;
763 return re;
764}
765
766// concat :: #bake_arguments concat_or_alternate(op = .Concat, can_factor = false);
767// alternate :: #bake_arguments concat_or_alternate(op = .Alternate, can_factor = true);
768// alternate_no_factor :: #bake_arguments concat_or_alternate(op = .Alternate, can_factor = false);
769new_concat :: (pool: *Regexp_Pool, subs: [] *Regexp_Node, flags: ParseFlags) -> *Regexp_Node {
770 return new_concat_or_alternate(pool, .Concat, subs, flags, false);
771}
772new_alternate :: (pool: *Regexp_Pool, subs: [] *Regexp_Node, flags: ParseFlags) -> *Regexp_Node {
773 return new_concat_or_alternate(pool, .Alternate, subs, flags, true);
774}
775new_alternate_no_factor :: (pool: *Regexp_Pool, subs: [] *Regexp_Node, flags: ParseFlags) -> *Regexp_Node {
776 return new_concat_or_alternate(pool, .Alternate, subs, flags, false);
777}
778
779new_capture :: (pool: *Regexp_Pool, sub: *Regexp_Node, flags: ParseFlags, cap: s32) -> *Regexp_Node {
780 re := new_regexp(pool, .Capture, flags);
781 array_resize(*re.subs, 1, false);
782 re.subs[0] = sub;
783 re.cap = cap;
784 return re;
785}
786
787new_repeat :: (pool: *Regexp_Pool, sub: *Regexp_Node, flags: ParseFlags, min: s16, max: s16) -> *Regexp_Node {
788 re := new_regexp(pool, .Repeat, flags);
789 array_resize(*re.subs, 1, false);
790 re.subs[0] = sub;
791 re.min = min;
792 re.max = max;
793 return re;
794}
795
796to_string :: (re: *Regexp_Node) -> string {
797 // Appends a rune for use in a character class to the string t.
798 append_cc_char :: (builder: *String_Builder, r: Rune) {
799 if 0x20 <= r && r <= 0x7E {
800 if find_index_from_left("[]^-\\", cast(u8)r) != -1 {
801 append(builder, "\\");
802 }
803 append(builder, cast(u8)r);
804 } else {
805 if r == {
806 case #char "\r";
807 append(builder, "\\r");
808 return;
809
810 case #char "\t";
811 append(builder, "\\t");
812 return;
813
814 case #char "\n";
815 append(builder, "\\n");
816 return;
817
818 case 0x0c;
819 append(builder, "\\f");
820 return;
821 }
822
823 if (r < 0x100) {
824 print_to_builder(builder, "\\x%", formatInt(r, base = 16, minimum_digits = 2));
825 } else {
826 print_to_builder(builder, "\\x{%}", formatInt(r, base = 16));
827 }
828 }
829 }
830
831 append_cc_range :: (builder: *String_Builder, lo: Rune, hi: Rune) {
832 append_cc_char(builder, lo);
833 // < would be enough here, but we want to print invalid ranges as well, just in case we have a bug somewhere
834 if lo != hi {
835 append(builder, #char "-");
836 append_cc_char(builder, hi);
837 }
838 }
839
840 append_literal :: (builder: *String_Builder, r: Rune, foldcase: bool) {
841 if r != 0 && r < 0x80 && find_index_from_left("(){}[]*+?|.^$\\", cast(u8)r) != -1 {
842 append(builder, #char "\\");
843 append(builder, cast(u8)r);
844 } else if (foldcase && #char "a" <= r && r <= #char "z") {
845 r -= #char "a" - #char "A";
846 append(builder, #char "[");
847 append(builder, cast(u8)r);
848 append(builder, cast(u8)r + #char "a" - #char "A");
849 append(builder, #char "]");
850 } else {
851 append_cc_range(builder, r, r);
852 }
853 }
854
855 // Appends ( if needed and passes new precedence to children.
856 to_string_pre :: (builder: *String_Builder, re: *Regexp_Node, prec: Precedence) -> Precedence, bool {
857 nprec := Precedence.Atom;
858
859 if re.op == {
860 case .NoMatch; #through;
861 case .EmptyMatch; #through;
862 case .Literal; #through;
863 case .AnyChar; #through;
864 case .AnyByte; #through;
865 case .BeginLine; #through;
866 case .EndLine; #through;
867 case .BeginText; #through;
868 case .EndText; #through;
869 case .WordBoundary; #through;
870 case .NoWordBoundary; #through;
871 case .CharClass; #through;
872 case .HaveMatch;
873 // Atom is fine
874
875 case .Concat; #through;
876 case .LiteralString;
877 if prec < .Concat {
878 append(builder, "(?:");
879 }
880 nprec = .Concat;
881
882 case .Alternate;
883 if prec < .Alternate {
884 append(builder, "(?:");
885 }
886 nprec = .Alternate;
887
888 case .Capture;
889 append(builder, #char "(");
890 assert (re.cap != 0, "Capture 0");
891 if re.name {
892 append(builder, "?P<");
893 append(builder, re.name);
894 append(builder, #char ">");
895 }
896 nprec = .Paren;
897
898 case .Star; #through;
899 case .Plus; #through;
900 case .Quest; #through;
901 case .Repeat;
902 if (prec < .Unary) {
903 append(builder, "(?:");
904 }
905 // The subprecedence here is Atom instead of Unary
906 // because PCRE treats two unary ops in a row as a parse error.
907 nprec = .Atom;
908 }
909
910 return nprec, false;
911 }
912
913 to_string_post :: (builder: *String_Builder, re: *Regexp_Node, prec: Precedence, pre_arg: Precedence, child_args: [..] Precedence) -> Precedence {
914 if re.op == {
915 case .NoMatch;
916 // There's no simple symbol for "no match", but
917 // [^0-Runemax] excludes everything.
918 append(builder, "[^\\x00-\\x{10ffff}]");
919
920 case .EmptyMatch;
921 // Append (?:) to make empty string visible,
922 // unless this is already being parenthesized.
923 if prec < .Empty {
924 append(builder, "(?:)");
925 }
926
927 case .Literal;
928 append_literal(builder, re.rune, (re.parse_flags & .FoldCase) != 0);
929
930 case .LiteralString;
931 for re.runes {
932 append_literal(builder, it, (re.parse_flags & .FoldCase) != 0);
933 }
934 if prec < .Concat {
935 append(builder, #char ")");
936 }
937
938 case .Concat;
939 if prec < .Concat {
940 append(builder, #char ")");
941 }
942
943 case .Alternate;
944 // Clumsy but workable: the children all appended |
945 // at the end of their strings, so just remove the last one.
946 // @ToDo: we can’t do that with string_builder, at least not without very clumsy or very slow code
947 // if ((*t_)[t_->size()-1] == '|')
948 // t_->erase(t_->size()-1);
949 // else
950 // LOG(DFATAL) << "Bad final char: " << t_;
951 if prec < .Alternate {
952 append(builder, #char ")");
953 }
954
955 case .Star;
956 append(builder, #char "*");
957 if re.parse_flags & .NonGreedy {
958 append(builder, #char "?");
959 }
960
961 if (prec < .Unary) {
962 append(builder, #char ")");
963 }
964
965 case .Plus;
966 append(builder, #char "+");
967 if re.parse_flags & .NonGreedy {
968 append(builder, #char "?");
969 }
970 if prec < .Unary {
971 append(builder, #char ")");
972 }
973
974 case .Quest;
975 append(builder, #char "?");
976 if re.parse_flags & .NonGreedy {
977 append(builder, #char "?");
978 }
979 if (prec < .Unary) {
980 append(builder, #char ")");
981 }
982
983 case .Repeat;
984 if (re.max == -1) {
985 print_to_builder(builder, "{%,}", re.min);
986 } else if re.min == re.max {
987 print_to_builder(builder, "{%}", re.min);
988 } else {
989 print_to_builder(builder, "{%,%}", re.min, re.max);
990 }
991 if (re.parse_flags & .NonGreedy) {
992 append(builder, #char "?");
993 }
994 if (prec < .Unary) {
995 append(builder, #char ")");
996 }
997
998 case .AnyChar;
999 append(builder, #char ".");
1000
1001 case .AnyByte;
1002 append(builder, "\\C");
1003
1004 case .BeginLine;
1005 append(builder, #char "^");
1006
1007 case .EndLine;
1008 append(builder, #char "$");
1009
1010 case .BeginText;
1011 append(builder, "(?-m:^)");
1012
1013 case .EndText;
1014 if re.parse_flags & .WasDollar {
1015 append(builder, "(?-m:$)");
1016 } else {
1017 append(builder, "\\z");
1018 }
1019
1020 case .WordBoundary;
1021 append(builder, "\\b");
1022
1023 case .NoWordBoundary;
1024 append(builder, "\\B");
1025
1026 case .CharClass;
1027 if (re.char_class.ranges.count == 0) {
1028 append(builder, "[^\\x00-\\x{10ffff}]");
1029 } else {
1030 append(builder, "[");
1031 // Heuristic: show class as negated if it contains the
1032 // non-character 0xFFFE and yet somehow isn't full.
1033 cc := re.char_class;
1034 if contains(cc, 0xFFFE) && cc.nrunes != Runemax + 1 {
1035 cc = copy(cc);
1036 negate(cc);
1037 append(builder, "^");
1038 }
1039 for cc.ranges {
1040 append_cc_range(builder, it.lo, it.hi);
1041 }
1042 if cc != re.char_class {
1043 uninit(cc);
1044 free(cc);
1045 }
1046 append(builder, #char "]");
1047 }
1048
1049 case .Capture;
1050 append(builder, #char ")");
1051
1052 case .HaveMatch;
1053 // There's no syntax accepted by the parser to generate
1054 // this node (it is generated by RE2::Set) so make something
1055 // up that is readable but won't compile.
1056 print_to_builder(builder, "(?HaveMatch:%)", re.match_id);
1057 }
1058
1059 // If the parent is an alternation, append the | for it.
1060 if (prec == .Alternate) {
1061 append(builder, #char "|");
1062 }
1063
1064 return .Atom;
1065 }
1066
1067 builder: String_Builder;
1068 defer free_buffers(*builder);
1069 w := Walk(*String_Builder, Precedence).{pre_visit = to_string_pre, post_visit = to_string_post, copy = null};
1070 // @ToDO @Cleanup This is a workaround for nulls not being respected in struct literals yet.
1071 // Remove once the struct literal bug is fixed in jai
1072 w.copy = null;
1073 walk(*builder, re, .Toplevel, w);
1074 return builder_to_string(*builder);
1075}
1076
1077#scope_file
1078
1079MAX_REPEAT :: 1000;
1080
1081// Finishes the regexp if necessary, preparing it for use in
1082// a more complex expression.
1083finish_regexp:: (re: *Regexp_Node) -> *Regexp_Node {
1084 if re == null return null;
1085
1086 re.down = null;
1087
1088 if re.op == .CharClass && re.char_class != null && !re.char_class_finished {
1089 finish_char_class(re.char_class);
1090 re.char_class_finished = true;
1091 }
1092
1093 return re;
1094}
1095
1096// Pushes the given regular expression onto the stack.
1097// Could check for too much memory used here.
1098push_regexp :: (using ps: *ParseState, re: *Regexp_Node) {
1099 maybe_concat_string(ps, -1, .NoParseFlags);
1100
1101 // Special case: a character class of one character is just
1102 // a literal. This is a common idiom for escaping
1103 // single characters (e.g., [.] instead of \.), and some
1104 // analysis does better with fewer character classes.
1105 // Similarly, [Aa] can be rewritten as a literal A with ASCII case folding.
1106 if re.op == .CharClass && re.char_class != null {
1107 remove_above(re.char_class, rune_max);
1108 if re.char_class.nrunes == 1 {
1109 r := re.char_class.ranges[0].lo;
1110 // @ToDo, @Speed: No need to remove and create a new one, just clean up and repurpose the old one
1111 remove_regexp(*ps.pool, re);
1112 re = new_literal(*ps.pool, r, flags);
1113 } else if re.char_class.nrunes == 2 {
1114 r := re.char_class.ranges[0].lo;
1115 if #char "A" <= r && r <= #char "Z" && contains(re.char_class, r + #char "a" - #char "A") {
1116 // @ToDo, @Speed: No need to remove and create a new one, just clean up and repurpose the old one
1117 remove_regexp(*ps.pool, re);
1118 re = new_literal(*ps.pool, r + #char "a" - #char "A", flags | .FoldCase);
1119 }
1120 }
1121 }
1122
1123 if !is_marker(re.op) {
1124 re.simple = compute_simple(re);
1125 }
1126
1127 re.down = stacktop;
1128 stacktop = re;
1129}
1130
1131
1132// Add lo-hi to the class, along with their fold-equivalent characters.
1133// If lo-hi is already in the class, assume that the fold-equivalent
1134// chars are there too, so there's no work to do.
1135add_folded_range :: (cc: *CharClass, lo: Rune, hi: Rune, depth: int) {
1136 // AddFoldedRange calls itself recursively for each rune in the fold cycle.
1137 // Most folding cycles are small: there aren't any bigger than four in the
1138 // current Unicode tables. make_unicode_casefold.py checks that
1139 // the cycles are not too long, and we double-check here using depth.
1140 assert(depth <= 10, "AddFoldedRange recurses too much.");
1141
1142 if !add_range(cc, lo, hi) return; // lo-hi was already there? we're done
1143
1144 while lo <= hi {
1145 f := lookup_casefold(unicode_casefold, lo);
1146 if f == null break; // lo has no fold, nor does anything above lo
1147
1148 if lo < f.lo { // lo has no fold; next rune with a fold is f.lo
1149 lo = f.lo;
1150 } else {
1151 // Add in the result of folding the range lo - f.hi
1152 // and that range's fold, recursively.
1153 lo1 := lo;
1154 hi1 := min(hi, f.hi);
1155 if f.delta == {
1156 case;
1157 lo1 += f.delta;
1158 hi1 += f.delta;
1159 case EvenOdd;
1160 if lo1%2 == 1 lo1 -= 1;
1161 if hi1%2 == 0 hi1 += 1;
1162 case OddEven;
1163 if lo1%2 == 0 lo1 -= 1;
1164 if hi1%2 == 1 hi1 += 1;
1165 }
1166 add_folded_range(cc, lo1, hi1, depth + 1);
1167
1168 // Pick up where this fold left off.
1169 lo = f.hi + 1;
1170 }
1171 }
1172}
1173
1174// Pushes the literal rune r onto the stack.
1175push_literal :: (using ps: *ParseState, r: Rune) {
1176 // Do case folding if needed.
1177 if (flags & .FoldCase) && cycle_fold_rune(r) != r {
1178 re := new_regexp(*ps.pool, .CharClass, flags & ~.FoldCase);
1179 re.char_class = New(CharClass);
1180 re.char_class_finished = false;
1181 r1 := r;
1182
1183 if !(flags & .NeverNL) || r != #char "\n" {
1184 add_range(re.char_class, r, r);
1185 }
1186 r = cycle_fold_rune(r);
1187 while (r != r1) {
1188 if !(flags & .NeverNL) || r != #char "\n" {
1189 add_range(re.char_class, r, r);
1190 }
1191 r = cycle_fold_rune(r);
1192 }
1193
1194 push_regexp(ps, re);
1195 return;
1196 }
1197
1198 // Exclude newline if applicable.
1199 if (flags & .NeverNL) && r == #char "\n" {
1200 push_regexp(ps, new_regexp(*ps.pool, .NoMatch, flags));
1201 return;
1202 }
1203
1204 // No fancy stuff worked. Ordinary literal.
1205 if maybe_concat_string(ps, r, flags) return;
1206
1207 re := new_regexp(*ps.pool, .Literal, flags);
1208 re.rune = r;
1209 push_regexp(ps, re);
1210}
1211
1212// Pushes a ^ onto the stack.
1213push_caret :: (using ps: *ParseState) {
1214 if flags & .OneLine {
1215 push_simple_op(ps, .BeginText);
1216 } else {
1217 push_simple_op(ps, .BeginLine);
1218 }
1219}
1220
1221// Pushes a $ onto the stack.
1222push_dollar :: (using ps: *ParseState) {
1223 if flags & .OneLine {
1224 // Clumsy marker so that MimicsPCRE() can tell whether
1225 // this EndText was a $ and not a \z.
1226 // @ToDo, @Cleanup pass flags as optional argument
1227 oflags := flags;
1228 flags = flags | .WasDollar;
1229 push_simple_op(ps, .EndText);
1230 flags = oflags;
1231 } else {
1232 push_simple_op(ps, .EndLine);
1233 }
1234}
1235
1236// Pushes a . onto the stack.
1237push_dot :: (using ps: *ParseState) {
1238 if flags & .DotNL && !(flags & .NeverNL) {
1239 push_simple_op(ps, .AnyChar);
1240 } else {
1241 // Rewrite . into [^\n]
1242 re := new_regexp(*ps.pool, .CharClass, flags & ~.FoldCase);
1243 re.char_class = New(CharClass);
1244 re.char_class_finished = false;
1245 add_range(re.char_class, 0, #char "\n" - 1);
1246 add_range(re.char_class, #char "\n" + 1, rune_max);
1247 push_regexp(ps, re);
1248 }
1249}
1250
1251// Pushes a regexp with the given op (and no args) onto the stack.
1252push_simple_op :: (using ps: *ParseState, op: RegexpOp) {
1253 push_regexp(ps, new_regexp(*ps.pool, op, flags));
1254}
1255
1256// Pushes a repeat operator regexp onto the stack.
1257// A valid argument for the operator must already be on the stack.
1258// The char c is the name of the operator, for use in error messages.
1259push_repeat_op :: (using ps: *ParseState, op: RegexpOp, s: string, nongreedy: bool) -> bool {
1260 if stacktop == null || is_marker(stacktop.op) {
1261 status.code = .RepeatArgument;
1262 status.error_arg = s;
1263 return false;
1264 }
1265
1266 fl := ifx nongreedy then flags ^ .NonGreedy else flags;
1267
1268 // Squash **, ++ and ??. Regexp_Node::Star() et al. handle this too, but
1269 // they're mostly for use during simplification, not during parsing.
1270 if op == stacktop.op && fl == stacktop.parse_flags return true;
1271
1272 // Squash *+, *?, +*, +?, ?* and ?+. They all squash to *, so because
1273 // op is a repeat, we just have to check that stacktop_.op() is too,
1274 // then adjust stacktop_.
1275 if (stacktop.op == .Star || stacktop.op == .Plus || stacktop.op == .Quest) && fl == stacktop.parse_flags {
1276 stacktop.op = .Star;
1277 return true;
1278 }
1279
1280 re := new_regexp(*ps.pool, op, fl);
1281 re.down = stacktop.down;
1282 re.subs = NewArray(1, *Regexp_Node, initialized = false);
1283 re.subs[0] = finish_regexp(stacktop);
1284 re.simple = compute_simple(re);
1285 stacktop = re;
1286 return true;
1287}
1288
1289// // RepetitionWalker reports whether the repetition regexp is valid.
1290// // Valid means that the combination of the top-level repetition
1291// // and any inner repetitions does not exceed n copies of the
1292// // innermost thing.
1293// // This rewalks the regexp tree and is called for every repetition,
1294// // so we have to worry about inducing quadratic behavior in the parser.
1295// // We avoid this by only using RepetitionWalker when min or max >= 2.
1296// // In that case the depth of any >= 2 nesting can only get to 9 without
1297// // triggering a parse error, so each subtree can only be rewalked 9 times.
1298// class RepetitionWalker : public Regexp_Node::Walker<int> {
1299// public:
1300// RepetitionWalker() {}
1301// virtual int PreVisit(Regexp_Node* re, int parent_arg, bool* stop);
1302// virtual int PostVisit(Regexp_Node* re, int parent_arg, int pre_arg,
1303// int* child_args, int nchild_args);
1304// virtual int ShortVisit(Regexp_Node* re, int parent_arg);
1305
1306// private:
1307// RepetitionWalker(const RepetitionWalker&) = delete;
1308// RepetitionWalker& operator=(const RepetitionWalker&) = delete;
1309// };
1310
1311// int RepetitionWalker::PreVisit(Regexp_Node* re, int parent_arg, bool* stop) {
1312// int arg = parent_arg;
1313// if (re.op() == .Repeat) {
1314// int m = re.max();
1315// if (m < 0) {
1316// m = re.min();
1317// }
1318// if (m > 0) {
1319// arg /= m;
1320// }
1321// }
1322// return arg;
1323// }
1324
1325// int RepetitionWalker::PostVisit(Regexp_Node* re, int parent_arg, int pre_arg,
1326// int* child_args, int nchild_args) {
1327// int arg = pre_arg;
1328// for (int i = 0; i < nchild_args; i++) {
1329// if (child_args[i] < arg) {
1330// arg = child_args[i];
1331// }
1332// }
1333// return arg;
1334// }
1335
1336// int RepetitionWalker::ShortVisit(Regexp_Node* re, int parent_arg) {
1337// // Should never be called: we use Walk(), not WalkExponential().
1338// #ifndef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
1339// LOG(DFATAL) << "RepetitionWalker::ShortVisit called";
1340// #endif
1341// return 0;
1342// }
1343
1344// Pushes a repetition regexp onto the stack.
1345// A valid argument for the operator must already be on the stack.
1346push_repetition :: (using ps: *ParseState, min: int, max: int, s: string, nongreedy: bool) -> bool {
1347 if (max != -1 && max < min) || min > MAX_REPEAT || max > MAX_REPEAT {
1348 status.code = .RepeatSize;
1349 status.error_arg = s;
1350 return false;
1351 }
1352 if stacktop == null || is_marker(stacktop.op) {
1353 status.code = .RepeatArgument;
1354 status.error_arg = s;
1355 return false;
1356 }
1357
1358 fl := ifx nongreedy then flags ^ .NonGreedy else flags;
1359
1360 down := stacktop.down;
1361 re := new_repeat(*ps.pool, finish_regexp(stacktop), fl, cast(s16)min, cast(s16)max);
1362 re.down = down;
1363 re.simple = compute_simple(re);
1364 stacktop = re;
1365 if min >= 2 || max >= 2 {
1366 // @ToDo: Check for repetition limits (see RepetitionWalker in the original source)
1367 // if repetition_walk(stacktop, kMaxRepeat) == 0 {
1368 // status.code = .RepeatSize;
1369 // status.error_arg = s;
1370 // return false;
1371 // }
1372 }
1373
1374 return true;
1375}
1376
1377// Checks whether a particular regexp op is a marker.
1378is_marker :: (op: RegexpOp) -> bool {
1379 return op >= .kLeftParen;
1380}
1381
1382// Processes a left parenthesis in the input.
1383// Pushes a marker onto the stack.
1384do_left_paren :: (using ps: *ParseState, name: string = "") {
1385 re := new_regexp(*ps.pool, .kLeftParen, flags);
1386 ncap += 1;
1387 re.cap = ncap;
1388 if name {
1389 re.name = name;
1390 }
1391 push_regexp(ps, re);
1392}
1393
1394// Pushes a non-capturing marker onto the stack.
1395do_left_paren_no_capture :: (using ps: *ParseState) {
1396 re := new_regexp(*ps.pool, .kLeftParen, flags);
1397 re.cap = -1;
1398 push_regexp(ps, re);
1399}
1400
1401// Processes a vertical bar in the input.
1402do_vertical_bar ::(using ps: *ParseState) {
1403 maybe_concat_string(ps, -1, .NoParseFlags);
1404 do_concatenation(ps);
1405
1406 // Below the vertical bar is a list to alternate.
1407 // Above the vertical bar is a list to concatenate.
1408 // We just did the concatenation, so either swap
1409 // the result below the vertical bar or push a new
1410 // vertical bar on the stack.
1411 r1 := stacktop;
1412 if r1 {
1413 r2 := r1.down;
1414 if r2 && r2.op == .kVerticalBar {
1415 r3 := r2.down;
1416 if r3 && (r1.op == .AnyChar || r3.op == .AnyChar) {
1417 // AnyChar is above or below the vertical bar. Let it subsume
1418 // the other when the other is Literal, CharClass or AnyChar.
1419 if r3.op == .AnyChar && (r1.op == .Literal || r1.op == .CharClass || r1.op == .AnyChar) {
1420 // Discard r1.
1421 stacktop = r2;
1422 remove_regexp(*ps.pool, r1);
1423 return;
1424 }
1425 if r1.op == .AnyChar && (r3.op == .Literal || r3.op == .CharClass || r3.op == .AnyChar) {
1426 // Rearrange the stack and discard r3.
1427 r1.down = r3.down;
1428 r2.down = r1;
1429 stacktop = r2;
1430 remove_regexp(*ps.pool, r3);
1431 return;
1432 }
1433 }
1434 // Swap r1 below vertical bar (r2).
1435 r1.down = r2.down;
1436 r2.down = r1;
1437 stacktop = r2;
1438 return;
1439 }
1440 }
1441 push_simple_op(ps, .kVerticalBar);
1442}
1443
1444// Processes a right parenthesis in the input.
1445do_right_paren :: (using ps: *ParseState) -> bool {
1446 // Finish the current concatenation and alternation.
1447 do_alternation(ps);
1448
1449 // The stack should be: LeftParen regexp
1450 // Remove the LeftParen, leaving the regexp,
1451 // parenthesized.
1452 r1 := stacktop;
1453 if r1 == null || r1.down == null || r1.down.op != .kLeftParen {
1454 status.code = .UnexpectedParen;
1455 status.error_arg = whole_regexp;
1456 return false;
1457 }
1458
1459 r2 := r1.down;
1460
1461 // Pop off r1, r2. Will Decref or reuse below.
1462 stacktop = r2.down;
1463
1464 // Restore flags from when paren opened.
1465 re := r2;
1466 flags = re.parse_flags;
1467
1468 // Rewrite LeftParen as capture if needed.
1469 if re.cap > 0 {
1470 re.op = .Capture;
1471 // re.cap is already set
1472 re.subs = NewArray(1, *Regexp_Node, initialized = false);
1473 re.subs[0] = finish_regexp(r1);
1474 re.simple = compute_simple(re);
1475 } else {
1476 remove_regexp(*ps.pool, re);
1477 re = r1;
1478 }
1479
1480 push_regexp(ps, re);
1481 return true;
1482}
1483
1484// Processes the end of input, returning the final regexp.
1485do_finish :: (using ps: *ParseState) -> *Regexp_Node {
1486 do_alternation(ps);
1487 re := stacktop;
1488 if re != null && re.down != null {
1489 status.code = .MissingParen;
1490 status.error_arg = whole_regexp;
1491 return null;
1492 }
1493 stacktop = null;
1494 return finish_regexp(re);
1495}
1496
1497// Returns the leading regexp that re starts with.
1498// The returned Regexp_Node* points into a piece of re,
1499// so it must not be used after the caller calls re.Decref().
1500leading_regexp :: (re: *Regexp_Node) -> *Regexp_Node {
1501 if re.op == .EmptyMatch return null;
1502 if re.op == .Concat && re.subs.count >= 2 {
1503 sub := re.subs[0];
1504 if sub.op == .EmptyMatch return null;
1505 return sub;
1506 }
1507 return re;
1508}
1509
1510// Removes LeadingRegexp(re) from re and returns what's left.
1511// Consumes the reference to re and may edit it in place.
1512// If caller wants to hold on to LeadingRegexp(re),
1513// must have already Incref'ed it.
1514// @ToDO: Check if this is possible / correct in all usages without using reference counting!
1515remove_leading_regexp :: (using ps: *ParseState, re: *Regexp_Node) -> *Regexp_Node {
1516 if re.op == .EmptyMatch return re;
1517
1518 if re.op == .Concat && re.subs.count >= 2 {
1519 if re.subs[0].op == .EmptyMatch return re;
1520
1521 remove_regexp(*ps.pool, re.subs[0]);
1522 re.subs[0] = null;
1523
1524 if re.subs.count == 2 {
1525 // Collapse concatenation to single regexp.
1526 nre := re.subs[1];
1527 re.subs[1] = null;
1528 remove_regexp(*ps.pool, re);
1529 return nre;
1530 }
1531 // 3 or more -> 2 or more.
1532 // @Stability: We won’t be able to free this array ever again
1533 // because we modify it’s base pointer.
1534 // But it doesn’t matter because we’re working within a Pool anyway.
1535 print("Before: %\n", re.subs);
1536 re.subs.data += 1;
1537 re.subs.count -= 1;
1538 print("After: %\n", re.subs);
1539 return re;
1540 } else {
1541 pf := re.parse_flags;
1542 // @ToDo, @Speed: Reuse regexp instead of removing and creating a new one?
1543 remove_regexp(*ps.pool, re);
1544 return new_regexp(*ps.pool, .EmptyMatch, pf);
1545 }
1546}
1547
1548// Returns the leading string that re starts with.
1549// The returned Rune* points into a piece of re,
1550// so it must not be used after the caller calls re.Decref().
1551// @ToDo
1552// Rune* Regexp_Node::LeadingString(Regexp_Node* re, int *nrune,
1553// Regexp_Node::ParseFlags *flags) {
1554// while (re.op() == .Concat && re.nsub() > 0)
1555// re = re.sub()[0];
1556
1557// *flags = static_cast<Regexp_Node::ParseFlags>(re.parse_flags_ & Regexp_Node::FoldCase);
1558
1559// if (re.op() == .Literal) {
1560// *nrune = 1;
1561// return &re.rune_;
1562// }
1563
1564// if (re.op() == .LiteralString) {
1565// *nrune = re.nrunes_;
1566// return re.runes_;
1567// }
1568
1569// *nrune = 0;
1570// return null;
1571// }
1572
1573// Removes the first n leading runes from the beginning of re.
1574// Edits re in place.
1575// @ToDO
1576// void Regexp_Node::RemoveLeadingString(Regexp_Node* re, int n) {
1577// // Chase down concats to find first string.
1578// // For regexps generated by parser, nested concats are
1579// // flattened except when doing so would overflow the 16-bit
1580// // limit on the size of a concatenation, so we should never
1581// // see more than two here.
1582// Regexp_Node* stk[4];
1583// size_t d = 0;
1584// while (re.op() == .Concat) {
1585// if (d < arraysize(stk))
1586// stk[d++] = re;
1587// re = re.sub()[0];
1588// }
1589
1590// // Remove leading string from re.
1591// if (re.op() == .Literal) {
1592// re.rune_ = 0;
1593// re.op_ = .EmptyMatch;
1594// } else if (re.op() == .LiteralString) {
1595// if (n >= re.nrunes_) {
1596// delete[] re.runes_;
1597// re.runes_ = null;
1598// re.nrunes_ = 0;
1599// re.op_ = .EmptyMatch;
1600// } else if (n == re.nrunes_ - 1) {
1601// Rune rune = re.runes_[re.nrunes_ - 1];
1602// delete[] re.runes_;
1603// re.runes_ = null;
1604// re.nrunes_ = 0;
1605// re.rune_ = rune;
1606// re.op_ = .Literal;
1607// } else {
1608// re.nrunes_ -= n;
1609// memmove(re.runes_, re.runes_ + n, re.nrunes_ * sizeof re.runes_[0]);
1610// }
1611// }
1612
1613// // If re is now empty, concatenations might simplify too.
1614// while (d > 0) {
1615// re = stk[--d];
1616// Regexp_Node** sub = re.sub();
1617// if (sub[0].op() == .EmptyMatch) {
1618// sub[0].Decref();
1619// sub[0] = null;
1620// // Delete first element of concat.
1621// switch (re.nsub()) {
1622// case 0:
1623// case 1:
1624// // Impossible.
1625// LOG(DFATAL) << "Concat of " << re.nsub();
1626// re.submany_ = null;
1627// re.op_ = .EmptyMatch;
1628// break;
1629
1630// case 2: {
1631// // Replace re with sub[1].
1632// Regexp_Node* old = sub[1];
1633// sub[1] = null;
1634// re.Swap(old);
1635// old.Decref();
1636// break;
1637// }
1638
1639// default:
1640// // Slide down.
1641// re.nsub_--;
1642// memmove(sub, sub + 1, re.nsub_ * sizeof sub[0]);
1643// break;
1644// }
1645// }
1646// }
1647// }
1648
1649// @ToDo: All of that
1650//// In the context of factoring alternations, a Splice is: a factored prefix or
1651//// merged character class computed by one iteration of one round of factoring;
1652//// the span of subexpressions of the alternation to be "spliced" (i.e. removed
1653//// and replaced); and, for a factored prefix, the number of suffixes after any
1654//// factoring that might have subsequently been performed on them. For a merged
1655//// character class, there are no suffixes, of course, so the field is ignored.
1656//struct Splice {
1657// Splice(Regexp_Node* prefix, Regexp_Node** sub, int nsub)
1658// : prefix(prefix),
1659// sub(sub),
1660// nsub(nsub),
1661// nsuffix(-1) {}
1662
1663// Regexp_Node* prefix;
1664// Regexp_Node** sub;
1665// int nsub;
1666// int nsuffix;
1667//};
1668
1669//// Named so because it is used to implement an explicit stack, a Frame is: the
1670//// span of subexpressions of the alternation to be factored; the current round
1671//// of factoring; any Splices computed; and, for a factored prefix, an iterator
1672//// to the next Splice to be factored (i.e. in another Frame) because suffixes.
1673//struct Frame {
1674// Frame(Regexp_Node** sub, int nsub)
1675// : sub(sub),
1676// nsub(nsub),
1677// round(0) {}
1678
1679// Regexp_Node** sub;
1680// int nsub;
1681// int round;
1682// std::vector<Splice> splices;
1683// int spliceidx;
1684//};
1685
1686//// Bundled into a class for friend access to Regexp_Node without needing to declare
1687//// (or define) Splice in regexp.h.
1688//class FactorAlternationImpl {
1689// public:
1690// static void Round1(Regexp_Node** sub, int nsub,
1691// Regexp_Node::ParseFlags flags,
1692// std::vector<Splice>* splices);
1693// static void Round2(Regexp_Node** sub, int nsub,
1694// Regexp_Node::ParseFlags flags,
1695// std::vector<Splice>* splices);
1696// static void Round3(Regexp_Node** sub, int nsub,
1697// Regexp_Node::ParseFlags flags,
1698// std::vector<Splice>* splices);
1699//};
1700
1701//// Factors common prefixes from alternation.
1702//// For example,
1703//// ABC|ABD|AEF|BCX|BCY
1704//// simplifies to
1705//// A(B(C|D)|EF)|BC(X|Y)
1706//// and thence to
1707//// A(B[CD]|EF)|BC[XY]
1708////
1709//// Rewrites sub to contain simplified list to alternate and returns
1710//// the new length of sub. Adjusts reference counts accordingly
1711//// (incoming sub[i] decremented, outgoing sub[i] incremented).
1712//int Regexp_Node::FactorAlternation(Regexp_Node** sub, int nsub, ParseFlags flags) {
1713// std::vector<Frame> stk;
1714// stk.emplace_back(sub, nsub);
1715
1716// for (;;) {
1717// auto& sub = stk.back().sub;
1718// auto& nsub = stk.back().nsub;
1719// auto& round = stk.back().round;
1720// auto& splices = stk.back().splices;
1721// auto& spliceidx = stk.back().spliceidx;
1722
1723// if (splices.empty()) {
1724// // Advance to the next round of factoring. Note that this covers
1725// // the initialised state: when splices is empty and round is 0.
1726// round++;
1727// } else if (spliceidx < static_cast<int>(splices.size())) {
1728// // We have at least one more Splice to factor. Recurse logically.
1729// stk.emplace_back(splices[spliceidx].sub, splices[spliceidx].nsub);
1730// continue;
1731// } else {
1732// // We have no more Splices to factor. Apply them.
1733// auto iter = splices.begin();
1734// int out = 0;
1735// for (int i = 0; i < nsub; ) {
1736// // Copy until we reach where the next Splice begins.
1737// while (sub + i < iter.sub)
1738// sub[out++] = sub[i++];
1739// switch (round) {
1740// case 1:
1741// case 2: {
1742// // Assemble the Splice prefix and the suffixes.
1743// Regexp_Node* re[2];
1744// re[0] = iter.prefix;
1745// re[1] = Regexp_Node::AlternateNoFactor(iter.sub, iter.nsuffix, flags);
1746// sub[out++] = Regexp_Node::Concat(re, 2, flags);
1747// i += iter.nsub;
1748// break;
1749// }
1750// case 3:
1751// // Just use the Splice prefix.
1752// sub[out++] = iter.prefix;
1753// i += iter.nsub;
1754// break;
1755// default:
1756// LOG(DFATAL) << "unknown round: " << round;
1757// break;
1758// }
1759// // If we are done, copy until the end of sub.
1760// if (++iter == splices.end()) {
1761// while (i < nsub)
1762// sub[out++] = sub[i++];
1763// }
1764// }
1765// splices.clear();
1766// nsub = out;
1767// // Advance to the next round of factoring.
1768// round++;
1769// }
1770
1771// switch (round) {
1772// case 1:
1773// FactorAlternationImpl::Round1(sub, nsub, flags, &splices);
1774// break;
1775// case 2:
1776// FactorAlternationImpl::Round2(sub, nsub, flags, &splices);
1777// break;
1778// case 3:
1779// FactorAlternationImpl::Round3(sub, nsub, flags, &splices);
1780// break;
1781// case 4:
1782// if (stk.size() == 1) {
1783// // We are at the top of the stack. Just return.
1784// return nsub;
1785// } else {
1786// // Pop the stack and set the number of suffixes.
1787// // (Note that references will be invalidated!)
1788// int nsuffix = nsub;
1789// stk.pop_back();
1790// stk.back().splices[stk.back().spliceidx].nsuffix = nsuffix;
1791// ++stk.back().spliceidx;
1792// continue;
1793// }
1794// default:
1795// LOG(DFATAL) << "unknown round: " << round;
1796// break;
1797// }
1798
1799// // Set spliceidx depending on whether we have Splices to factor.
1800// if (splices.empty() || round == 3) {
1801// spliceidx = static_cast<int>(splices.size());
1802// } else {
1803// spliceidx = 0;
1804// }
1805// }
1806//}
1807
1808//void FactorAlternationImpl::Round1(Regexp_Node** sub, int nsub,
1809// Regexp_Node::ParseFlags flags,
1810// std::vector<Splice>* splices) {
1811// // Round 1: Factor out common literal prefixes.
1812// int start = 0;
1813// Rune* rune = null;
1814// int nrune = 0;
1815// Regexp_Node::ParseFlags runeflags = Regexp_Node::NoParseFlags;
1816// for (int i = 0; i <= nsub; i++) {
1817// // Invariant: sub[start:i] consists of regexps that all
1818// // begin with rune[0:nrune].
1819// Rune* rune_i = null;
1820// int nrune_i = 0;
1821// Regexp_Node::ParseFlags runeflags_i = Regexp_Node::NoParseFlags;
1822// if (i < nsub) {
1823// rune_i = Regexp_Node::LeadingString(sub[i], &nrune_i, &runeflags_i);
1824// if (runeflags_i == runeflags) {
1825// int same = 0;
1826// while (same < nrune && same < nrune_i && rune[same] == rune_i[same])
1827// same++;
1828// if (same > 0) {
1829// // Matches at least one rune in current range. Keep going around.
1830// nrune = same;
1831// continue;
1832// }
1833// }
1834// }
1835
1836// // Found end of a run with common leading literal string:
1837// // sub[start:i] all begin with rune[0:nrune],
1838// // but sub[i] does not even begin with rune[0].
1839// if (i == start) {
1840// // Nothing to do - first iteration.
1841// } else if (i == start+1) {
1842// // Just one: don't bother factoring.
1843// } else {
1844// Regexp_Node* prefix = Regexp_Node::LiteralString(rune, nrune, runeflags);
1845// for (int j = start; j < i; j++)
1846// Regexp_Node::RemoveLeadingString(sub[j], nrune);
1847// splices.emplace_back(prefix, sub + start, i - start);
1848// }
1849
1850// // Prepare for next iteration (if there is one).
1851// if (i < nsub) {
1852// start = i;
1853// rune = rune_i;
1854// nrune = nrune_i;
1855// runeflags = runeflags_i;
1856// }
1857// }
1858//}
1859
1860//void FactorAlternationImpl::Round2(Regexp_Node** sub, int nsub,
1861// Regexp_Node::ParseFlags flags,
1862// std::vector<Splice>* splices) {
1863// // Round 2: Factor out common simple prefixes,
1864// // just the first piece of each concatenation.
1865// // This will be good enough a lot of the time.
1866// //
1867// // Complex subexpressions (e.g. involving quantifiers)
1868// // are not safe to factor because that collapses their
1869// // distinct paths through the automaton, which affects
1870// // correctness in some cases.
1871// int start = 0;
1872// Regexp_Node* first = null;
1873// for (int i = 0; i <= nsub; i++) {
1874// // Invariant: sub[start:i] consists of regexps that all
1875// // begin with first.
1876// Regexp_Node* first_i = null;
1877// if (i < nsub) {
1878// first_i = Regexp_Node::LeadingRegexp(sub[i]);
1879// if (first != null &&
1880// // first must be an empty-width op
1881// // OR a char class, any char or any byte
1882// // OR a fixed repeat of a literal, char class, any char or any byte.
1883// (first.op() == .BeginLine ||
1884// first.op() == .EndLine ||
1885// first.op() == .WordBoundary ||
1886// first.op() == .NoWordBoundary ||
1887// first.op() == .BeginText ||
1888// first.op() == .EndText ||
1889// first.op() == .CharClass ||
1890// first.op() == .AnyChar ||
1891// first.op() == .AnyByte ||
1892// (first.op() == .Repeat &&
1893// first.min() == first.max() &&
1894// (first.sub()[0].op() == .Literal ||
1895// first.sub()[0].op() == .CharClass ||
1896// first.sub()[0].op() == .AnyChar ||
1897// first.sub()[0].op() == .AnyByte))) &&
1898// Regexp_Node::Equal(first, first_i))
1899// continue;
1900// }
1901
1902// // Found end of a run with common leading regexp:
1903// // sub[start:i] all begin with first,
1904// // but sub[i] does not.
1905// if (i == start) {
1906// // Nothing to do - first iteration.
1907// } else if (i == start+1) {
1908// // Just one: don't bother factoring.
1909// } else {
1910// Regexp_Node* prefix = first.Incref();
1911// for (int j = start; j < i; j++)
1912// sub[j] = Regexp_Node::RemoveLeadingRegexp(sub[j]);
1913// splices.emplace_back(prefix, sub + start, i - start);
1914// }
1915
1916// // Prepare for next iteration (if there is one).
1917// if (i < nsub) {
1918// start = i;
1919// first = first_i;
1920// }
1921// }
1922//}
1923
1924//void FactorAlternationImpl::Round3(Regexp_Node** sub, int nsub,
1925// Regexp_Node::ParseFlags flags,
1926// std::vector<Splice>* splices) {
1927// // Round 3: Merge runs of literals and/or character classes.
1928// int start = 0;
1929// Regexp_Node* first = null;
1930// for (int i = 0; i <= nsub; i++) {
1931// // Invariant: sub[start:i] consists of regexps that all
1932// // are either literals (i.e. runes) or character classes.
1933// Regexp_Node* first_i = null;
1934// if (i < nsub) {
1935// first_i = sub[i];
1936// if (first != null &&
1937// (first.op() == .Literal ||
1938// first.op() == .CharClass) &&
1939// (first_i.op() == .Literal ||
1940// first_i.op() == .CharClass))
1941// continue;
1942// }
1943
1944// // Found end of a run of Literal/CharClass:
1945// // sub[start:i] all are either one or the other,
1946// // but sub[i] is not.
1947// if (i == start) {
1948// // Nothing to do - first iteration.
1949// } else if (i == start+1) {
1950// // Just one: don't bother factoring.
1951// } else {
1952// CharClassBuilder ccb;
1953// for (int j = start; j < i; j++) {
1954// Regexp_Node* re = sub[j];
1955// if (re.op() == .CharClass) {
1956// CharClass* cc = re.cc();
1957// for (CharClass::iterator it = cc.begin(); it != cc.end(); ++it)
1958// ccb.AddRange(it.lo, it.hi);
1959// } else if (re.op() == .Literal) {
1960// ccb.AddRangeFlags(re.rune(), re.rune(), re.parse_flags());
1961// } else {
1962// LOG(DFATAL) << "RE2: unexpected op: " << re.op() << " "
1963// << re.ToString();
1964// }
1965// re.Decref();
1966// }
1967// Regexp_Node* re = Regexp_Node::NewCharClass(ccb.GetCharClass(), flags);
1968// splices.emplace_back(re, sub + start, i - start);
1969// }
1970
1971// // Prepare for next iteration (if there is one).
1972// if (i < nsub) {
1973// start = i;
1974// first = first_i;
1975// }
1976// }
1977//}
1978
1979// Collapse the regexps on top of the stack, down to the
1980// first marker, into a new op node (op == .Alternate
1981// or op == .Concat).
1982do_collapse :: (using ps: *ParseState, op: RegexpOp) {
1983 // Scan backward to marker, counting children of composite.
1984 n := 0;
1985 next: *Regexp_Node;
1986 sub := stacktop;
1987 while sub != null && !is_marker(sub.op) {
1988 next = sub.down;
1989 if sub.op == op {
1990 n += sub.subs.count;
1991 }
1992 else {
1993 n += 1;
1994 }
1995 sub = next;
1996 }
1997
1998 // If there's just one child, leave it alone.
1999 // (Concat of one thing is that one thing; alternate of one thing is same.)
2000 if stacktop != null && stacktop.down == next return;
2001
2002 // Construct op (alternation or concatenation), flattening op of op.
2003 subs: [..] *Regexp_Node;
2004 array_reserve(*subs, n);
2005 subs.count = n;
2006 next = null;
2007 i := n;
2008 sub = stacktop;
2009 while sub != null && !is_marker(sub.op) {
2010 next = sub.down;
2011 if sub.op == op {
2012 for < sub.subs {
2013 i -= 1;
2014 subs[i] = it;
2015 }
2016 sub.subs.count = 0;
2017 remove_regexp(*ps.pool, sub);
2018 } else {
2019 i -= 1;
2020 subs[i] = finish_regexp(sub);
2021 }
2022 sub = next;
2023 }
2024 assert(i == 0, "Sub expression count mismatch: %", i);
2025
2026 // Takes ownership of subs
2027 re := new_concat_or_alternate(*ps.pool, op, subs, flags, true);
2028 re.simple = compute_simple(re);
2029 re.down = next;
2030 stacktop = re;
2031}
2032
2033// Finishes the current concatenation,
2034// collapsing it into a single regexp on the stack.
2035do_concatenation :: (using ps: *ParseState) {
2036 r1 := stacktop;
2037 if r1 == null || is_marker(r1.op) {
2038 // empty concatenation is special case
2039 re := new_regexp(*ps.pool, .EmptyMatch, flags);
2040 push_regexp(ps, re);
2041 }
2042 do_collapse(ps, .Concat);
2043}
2044
2045// Finishes the current alternation,
2046// collapsing it to a single regexp on the stack.
2047do_alternation :: (using ps: *ParseState) {
2048 do_vertical_bar(ps);
2049 // Now stack top is kVerticalBar.
2050 r1 := stacktop;
2051 stacktop = r1.down;
2052 remove_regexp(*ps.pool, r1);
2053 do_collapse(ps, .Alternate);
2054}
2055
2056// Incremental conversion of concatenated literals into strings.
2057// If top two elements on stack are both literal or string,
2058// collapse into single string.
2059// Don't walk down the stack -- the parser calls this frequently
2060// enough that below the bottom two is known to be collapsed.
2061// Only called when another regexp is about to be pushed
2062// on the stack, so that the topmost literal is not being considered.
2063// (Otherwise ab* would turn into (ab)*.)
2064// If r >= 0, consider pushing a literal r on the stack.
2065// Return whether that happened.
2066maybe_concat_string :: (ps: *ParseState, r: Rune, flags: ParseFlags) -> bool {
2067 re1 := ps.stacktop;
2068 if re1 == null return false;
2069 re2 := re1.down;
2070 if re2 == null return false;
2071
2072 if re1.op != .Literal && re1.op != .LiteralString return false;
2073 if re2.op != .Literal && re2.op != .LiteralString return false;
2074 if re1.parse_flags & .FoldCase != re2.parse_flags & .FoldCase return false;
2075
2076 if re2.op == .Literal {
2077 // convert into string
2078 rune := re2.rune;
2079 re2.op = .LiteralString;
2080 re2.runes.count = 0;
2081 re2.runes.data = null;
2082 add_rune_to_string(re2, rune);
2083 }
2084
2085 // push re1 into re2.
2086 if (re1.op == .Literal) {
2087 add_rune_to_string(re2, re1.rune);
2088 } else {
2089 for re1.runes {
2090 add_rune_to_string(re2, it);
2091 }
2092 array_free(re1.runes);
2093 }
2094
2095 // reuse re1 if possible
2096 if r >= 0 {
2097 re1.op = .Literal;
2098 re1.rune = r;
2099 re1.parse_flags = flags;
2100 return true;
2101 }
2102
2103 ps.stacktop = re2;
2104 remove_regexp(*ps.pool, re1);
2105 return false;
2106}
2107
2108add_rune_to_string :: (re: *Regexp_Node, r: Rune) {
2109 assert(re.op == .LiteralString);
2110 if re.runes.count == 0 {
2111 array_reserve(*re.runes, 8);
2112 }
2113 array_add(*re.runes, r);
2114}
2115
2116// Lexing routines.
2117
2118// Parses a decimal integer, storing it in *np.
2119// Sets *s to span the remainder of the string.
2120parse_integer :: (s: *string) -> int, success: bool {
2121 if !s.count || !isdigit((s.*)[0]) return 0, false;
2122
2123 // Disallow leading zeros.
2124 if s.count >= 2 && (s.*)[0] == #char "0" && isdigit((s.*)[1]) return 0, false;
2125
2126 n := 0;
2127 while s.count && isdigit((s.*)[0]) {
2128 c := (s.*)[0];
2129 // Avoid overflow.
2130 if n >= 100000000 return 0, false;
2131 n = n*10 + c - #char "0";
2132 advance(s);
2133 }
2134 return n, true;
2135}
2136
2137// Parses a repetition suffix like {1,2} or {2} or {2,}.
2138// Sets *s to span the remainder of the string on success.
2139// Sets *lo and *hi to the given range.
2140// In the case of {2,}, the high number is unbounded;
2141// sets *hi to -1 to signify this.
2142// {,2} is NOT a valid suffix.
2143// The Maybe in the name signifies that the regexp parse
2144// doesn't fail even if ParseRepetition does, so the StringPiece
2145// s must NOT be edited unless MaybeParseRepetition returns true.
2146maybe_parse_repetition :: (sp: *string) -> lo: int, hi: int, success: bool {
2147 s := sp.*;
2148 if !s.count || s[0] != #char "{" return 0, 0, false;
2149
2150 advance(*s);
2151
2152 lo, success, remainder := to_integer(s);
2153 if !success return 0, 0, false;
2154 if !remainder return 0, 0, false;
2155 s = remainder;
2156
2157 hi: int;
2158 if s[0] == #char "," {
2159 advance(*s); // ','
2160 if !s return 0, 0, false;
2161
2162 if s[0] == #char "}" {
2163 // {2,} means at least 2
2164 hi = -1;
2165 } else {
2166 // {2,4} means 2, 3, or 4.
2167 hi, success, remainder = to_integer(s);
2168 if !success return 0, 0, false;
2169 s = remainder;
2170 }
2171 } else {
2172 // {2} means exactly two
2173 hi = lo;
2174 }
2175 if !s.count || s[0] != #char "}" return 0, 0, false;
2176 advance(*s);
2177 sp.* = s;
2178
2179 return lo, hi, true;
2180}
2181
2182// @ToDo: Move to unicode module?
2183consume_rune :: (str: *string) -> Rune, len: int, status: Status {
2184 result, len := rune_from_string(str.*);
2185 if !len return 0, 0, make_status(.BadUTF8);
2186
2187 advance(str, len);
2188 return result, len, .{};
2189}
2190
2191// @Speed:: Using my own unicode#is_valid_utf8 would probably be faster
2192is_valid_utf8 :: (str: string) -> bool, status: Status {
2193 t := str;
2194 while t {
2195 result, len, status := consume_rune(*t);
2196 if !len return false, status;
2197 }
2198 return true, .{};
2199}
2200
2201// Convert hex digit to value.
2202hex_value :: (c: Rune) -> int, success: bool {
2203 if #char "0" <= c && c <= #char "9" return c - #char "0", true;
2204 if #char "A" <= c && c <= #char "F" return c - #char "A" + 10, true;
2205 if #char "a" <= c && c <= #char "f" return c - #char "a" + 10, true;
2206
2207 return 0, false;
2208}
2209
2210// Parse an escape sequence (e.g., \n, \{).
2211// Sets *s to span the remainder of the string.
2212// Sets *rp to the named character.
2213parse_escape :: (s: *string, rune_max: s32) -> r: Rune, status: Status, success: bool {
2214 bad_escape :: (begin: string, s: *string) -> Status {
2215 status: Status;
2216 status.code = .BadEscape;
2217 status.error_arg = slice(begin, 0, begin.count - s.count);
2218 return status;
2219 }
2220
2221 assert(s.count && (s.*)[0] == #char "\\");
2222
2223 if s.count == 1 {
2224 return 0, make_status(.TrailingBackslash), false;
2225 }
2226
2227 begin := s.*;
2228
2229 advance(s); // backslash
2230
2231 c, len, status := consume_rune(s);
2232 if !len return 0, status, false;
2233
2234 if c == {
2235 case;
2236 if c < Runeself && !isalpha(c) && !isdigit(c) {
2237 // Escaped non-word characters are always themselves.
2238 // PCRE is not quite so rigorous: it accepts things like
2239 // \q, but we don't. We once rejected \_, but too many
2240 // programs and people insist on using it, so allow \_.
2241 return c, status, true;
2242 }
2243 return 0, bad_escape(begin, s), false;
2244
2245 // Octal escapes.
2246 case #char "1"; #through;
2247 case #char "2"; #through;
2248 case #char "3"; #through;
2249 case #char "4"; #through;
2250 case #char "5"; #through;
2251 case #char "6"; #through;
2252 case #char "7";
2253 // Single non-zero octal digit is a backreference; not supported.
2254 if !s.count || (s.*)[0] < #char "0" || (s.*)[0] > #char "7" return 0, bad_escape(begin, s), false;
2255 #through;
2256 case #char "0";
2257 // consume up to three octal digits; already have one.
2258 code := c - #char "0";
2259 if !s.count return 0, bad_escape(begin, s), false;
2260 c = (s.*)[0];
2261 if #char "0" <= c && c <= #char "7" {
2262 code = code * 8 + c - #char "0";
2263 advance(s); // digit
2264 if s {
2265 c = (s.*)[0];
2266 if #char "0" <= c && c <= #char "7" {
2267 code = code * 8 + c - #char "0";
2268 advance(s); // digit
2269 }
2270 }
2271 }
2272 if code > rune_max return 0, bad_escape(begin, s), false;
2273
2274 return code, status, true;
2275
2276 // Hexadecimal escapes
2277 case #char "x";
2278 if !s.count return 0, bad_escape(begin, s), false;
2279 len: int;
2280 c, len, status = consume_rune(s);
2281 if !len return 0, status, false;
2282 if c == #char "{" {
2283 // Any number of digits in braces.
2284 // Update n as we consume the string, so that
2285 // the whole thing gets shown in the error message.
2286 // Perl accepts any text at all; it ignores all text
2287 // after the first non-hex digit. We require only hex digits,
2288 // and at least one.
2289 c, len, status = consume_rune(s);
2290 if !len return 0, status, false;
2291
2292 nhex := 0;
2293 code: Rune;
2294 hval, success := hex_value(c);
2295 while success {
2296 nhex += 1;
2297 code = cast(Rune)(code * 16 + hval);
2298 if code > rune_max return 0, bad_escape(begin, s), false;
2299 if !s.count return 0, bad_escape(begin, s), false;
2300 c, len, status = consume_rune(s);
2301 if !len return 0, status, false;
2302 hval, success = hex_value(c);
2303 }
2304 if c != #char "}" || nhex == 0 return 0, bad_escape(begin, s), false;
2305 return code, status, true;
2306 } else {
2307 // Easy case: two hex digits.
2308 if !s.count return 0, bad_escape(begin, s), false;
2309 c1, len, status := consume_rune(s);
2310 if !len return 0, status, false;
2311 hc0, success0 := hex_value(c);
2312 hc1, success1 := hex_value(c);
2313 if !success0 || success1 return 0, bad_escape(begin, s), false;
2314 return cast(Rune)(hc0 * 16 + hc1), status, true;
2315 }
2316
2317 // C escapes.
2318 case #char "n";
2319 return #char "\n", status, true;
2320 case #char "r";
2321 return #char "\r", status, true;
2322 case #char "t";
2323 return #char "\t", status, true;
2324
2325 // Less common C escapes.
2326 // @ToDO: Implement?
2327 case #char "a";
2328 // return #char "\a", status, true;
2329 return 0, bad_escape(begin, s), false;
2330 case #char "f";
2331 // return #char "\f", status, true;
2332 return 0, bad_escape(begin, s), false;
2333 case #char "v";
2334 // return #char "\v", status, true;
2335 return 0, bad_escape(begin, s), false;
2336
2337 // This code is disabled to avoid misparsing
2338 // the Perl word-boundary \b as a backspace
2339 // when in POSIX regexp mode. Surprisingly,
2340 // in Perl, \b means word-boundary but [\b]
2341 // means backspace. We don't support that:
2342 // if you want a backspace embed a literal
2343 // backspace character or use \x08.
2344 //
2345 // case #char "b";
2346 // *rp = #char "\b";
2347 // return true;
2348 }
2349}
2350
2351// Add a range to the character class, but exclude newline if asked.
2352// Also handle case folding.
2353add_range_flags :: (using char_class: *CharClass, lo: Rune, hi: Rune, parse_flags: ParseFlags) {
2354 // Take out \n if the flags say so.
2355 cutnl := !(parse_flags & .ClassNL) || (parse_flags & .NeverNL);
2356 if cutnl && lo <= #char "\n" && #char "\n" <= hi {
2357 if lo < #char "\n" {
2358 add_range_flags(char_class, lo, #char "\n" - 1, parse_flags);
2359 }
2360 if hi > #char "\n" {
2361 add_range_flags(char_class, #char "\n" + 1, hi, parse_flags);
2362 }
2363 return;
2364 }
2365
2366 // If folding case, add fold-equivalent characters too.
2367 if parse_flags & .FoldCase {
2368 add_folded_range(char_class, lo, hi, 0);
2369 } else {
2370 add_range(char_class, lo, hi);
2371 }
2372}
2373
2374// Look for a group with the given name.
2375lookup_group :: (name: string, groups: [] UGroup) -> *UGroup {
2376 for * groups {
2377 if it.name == name return it;
2378 }
2379 return null;
2380}
2381
2382// Look for a POSIX group with the given name (e.g., "[:^alpha:]")
2383lookup_posix_group :: (name: string) -> *UGroup {
2384 return lookup_group(name, posix_groups);
2385}
2386lookup_perl_group :: (name: string) -> *UGroup {
2387 return lookup_group(name, perl_groups);
2388}
2389
2390#if true {
2391 // Fake UGroup containing all Runes
2392 any16 :: URange16.{0, 65535};
2393 any32 :: URange32.{65535, Runemax};
2394 any16array :: URange16.[any16];
2395 any32array :: URange32.[any32];
2396 anygroup :: UGroup.{"Any", 1, any16array, any32array};
2397
2398 // Look for a Unicode group with the given name (e.g., "Han")
2399 lookup_unicode_group :: (name: string) -> *UGroup {
2400 if name == "any" {
2401 return *anygroup;
2402 }
2403 return lookup_group(name, unicode_groups);
2404 }
2405}
2406
2407// Add a UGroup or its negation to the character class.
2408// @ToDo: Why is sign not a boolean?
2409add_unicode_group :: (cc: *CharClass, g: *UGroup, sign: int, parse_flags: ParseFlags) {
2410 if (sign == +1) {
2411 for g.r16 add_range_flags(cc, it.lo, it.hi, parse_flags);
2412 for g.r32 add_range_flags(cc, it.lo, it.hi, parse_flags);
2413 } else {
2414 if parse_flags & .FoldCase {
2415 // Normally adding a case-folded group means
2416 // adding all the extra fold-equivalent runes too.
2417 // But if we're adding the negation of the group,
2418 // we have to exclude all the runes that are fold-equivalent
2419 // to what's already missing. Too hard, so do in two steps.
2420 char_class1: CharClass;
2421 // @ToDo: Cleanup char_class1
2422 add_unicode_group(*char_class1, g, 1, parse_flags);
2423 // If the flags say to take out \n, put it in, so that negating will take it out.
2424 // Normally AddRangeFlags does this, but we're bypassing AddRangeFlags.
2425 cutnl := !(parse_flags & .ClassNL) || (parse_flags & .NeverNL);
2426 if cutnl {
2427 add_range(*char_class1, #char "\n", #char "\n");
2428 }
2429 negate(*char_class1);
2430 add_char_class(cc, char_class1);
2431 return;
2432 } else {
2433 next: Rune;
2434 for g.r16 {
2435 if next < it.lo {
2436 add_range_flags(cc, next, it.lo - 1, parse_flags);
2437 }
2438 next = it.hi + 1;
2439 }
2440 for g.r32 {
2441 if next < it.lo {
2442 add_range_flags(cc, next, it.lo - 1, parse_flags);
2443 }
2444 next = it.hi + 1;
2445 }
2446 if next <= Runemax {
2447 add_range_flags(cc, next, Runemax, parse_flags);
2448 }
2449 }
2450 }
2451}
2452
2453// Maybe parse a Perl character class escape sequence.
2454// Only recognizes the Perl character classes (\d \s \w \D \S \W),
2455// not the Perl empty-string classes (\b \B \A \Z \z).
2456// On success, sets *s to span the remainder of the string
2457// and returns the corresponding UGroup.
2458// The StringPiece must *NOT* be edited unless the call succeeds.
2459maybe_parse_perl_cc_escape :: (s: *string, parse_flags: ParseFlags) -> *UGroup {
2460 if !(parse_flags & .PerlClasses) return null;
2461 if s.count < 2 || (s.*)[0] != #char "\\" return null;
2462
2463 // Could use StringPieceToRune, but there aren't
2464 // any non-ASCII Perl group names.
2465 name := slice(s.*, 0, 2);
2466 g := lookup_perl_group(name);
2467 if g == null return null;
2468 advance(s, name.count);
2469 return g;
2470}
2471
2472ParseResult :: enum {
2473 // @ToDo: Rename those
2474 kParseOk; // Did some parsing.
2475 kParseError; // Found an error.
2476 kParseNothing; // Decided not to parse.
2477};
2478
2479// Maybe parses a Unicode character group like \p{Han} or \P{Han}
2480// (the latter is a negated group).
2481maybe_parse_unicode_group :: (s: *string, parse_flags: ParseFlags, cc: *CharClass) -> status: Status, result: ParseResult {
2482 status: Status;
2483 // Decide whether to parse.
2484 if !(parse_flags & .UnicodeGroups) return status, .kParseNothing;
2485 if s.count < 2 || (s.*)[0] != #char "\\" return status, .kParseNothing;
2486
2487 c: Rune = (s.*)[1];
2488 if c != #char "p" && c != #char "P" return status, .kParseNothing;
2489
2490 // Committed to parse. Results:
2491 sign: int = 1; // -1 = negated char class
2492 if (c == #char "P") {
2493 sign = -sign;
2494 }
2495 seq := s.*; // \p{Han} or \pL
2496 name: string; // Han or L
2497 advance(s, 2); // '\\', 'p'
2498
2499
2500 if (s.*)[0] != #char "{" {
2501 status: Status;
2502 len: int;
2503 c, len, status = consume_rune(s);
2504 if !len return status, .kParseError;
2505
2506 // Name is just the single rune we just parsed
2507 name = slice(seq, 2, len);
2508 } else {
2509 // Name is in braces. Look for closing }
2510 end := find_index_from_left(s.*, #char "}");
2511 if end != -1 {
2512 valid: bool;
2513 valid, status = is_valid_utf8(seq);
2514 if !valid return status, .kParseError;
2515
2516 return make_status(.BadCharRange, seq), .kParseError;
2517 }
2518 name = slice(s.*, 1, end - 1); // without '}'
2519 advance(s, end + 1);
2520 valid: bool;
2521 valid, status = is_valid_utf8(name);
2522 if !valid return status, .kParseError;
2523 }
2524
2525 // Chop seq where s now begins.
2526 seq.count = seq.count - s.count;
2527
2528 if name && name[0] == #char "^" {
2529 sign = -sign;
2530 advance(*name); // '^'
2531 }
2532
2533 #if true {
2534 // Look up the group in the RE2 Unicode data.
2535 g := lookup_unicode_group(name);
2536 if g == null {
2537 status.code = .BadCharRange;
2538 status.error_arg = seq;
2539 return status, .kParseError;
2540 }
2541
2542 add_unicode_group(cc, g, sign, parse_flags);
2543 } else {
2544 // // Look up the group in the ICU Unicode data. Because ICU provides full
2545 // // Unicode properties support, this could be more than a lookup by name.
2546 // ::icu::UnicodeString ustr = ::icu::UnicodeString::fromUTF8(
2547 // std::string("\\p{") + std::string(name) + std::string("}"));
2548 // UErrorCode uerr = U_ZERO_ERROR;
2549 // ::icu::UnicodeSet uset(ustr, uerr);
2550 // if (U_FAILURE(uerr)) {
2551 // status.set_code(.BadCharRange);
2552 // status.set_error_arg(seq);
2553 // return .kParseError;
2554 // }
2555
2556 // // Convert the UnicodeSet to a URange32 and UGroup that we can add.
2557 // int nr = uset.getRangeCount();
2558 // PODArray<URange32> r(nr);
2559 // for (int i = 0; i < nr; i++) {
2560 // r[i].lo = uset.getRangeStart(i);
2561 // r[i].hi = uset.getRangeEnd(i);
2562 // }
2563 // UGroup g = {"", +1, 0, 0, r.data(), nr};
2564 // AddUGroup(cc, &g, sign, parse_flags);
2565 }
2566
2567 return status, .kParseOk;
2568}
2569
2570// Parses a character class name like [:alnum:].
2571// Sets *s to span the remainder of the string.
2572// Adds the ranges corresponding to the class to ranges.
2573parse_cc_name :: (s: *string, parse_flags: ParseFlags, cc: *CharClass) -> status: Status, result: ParseResult {
2574 // Check begins with [:
2575 if s.count < 2 || (s.*)[0] != #char "[" || (s.*)[1] != #char ":" return Status.{}, .kParseNothing;
2576
2577 // Look for closing :].
2578 end := find_index_from_left(s.*, ":]");
2579
2580 // If no closing :], then ignore.
2581 if !end return Status.{}, .kParseNothing;
2582
2583 // Got it. Check that it's valid.
2584 name := slice(s.*, 2, end - 2);
2585 g := lookup_posix_group(name);
2586 if (g == null) {
2587 return make_status(.BadCharRange, name), .kParseError;
2588 }
2589
2590 advance(s, name.count + 4);
2591 add_unicode_group(cc, g, g.sign, parse_flags);
2592 return Status.{}, .kParseOk;
2593}
2594
2595// Parses a character inside a character class.
2596// There are fewer special characters here than in the rest of the regexp.
2597// Sets *s to span the remainder of the string.
2598parse_cc_character :: (using ps: *ParseState, s: *string, whole_class: string) -> rune: Rune, status: Status, success: bool {
2599 if !s return 0, make_status(.MissingBracket, whole_class), false;
2600
2601 // Allow regular escape sequences even though
2602 // many need not be escaped in this context.
2603 if (s.*)[0] == #char "\\" {
2604 rune, status, success := parse_escape(s, rune_max);
2605 return rune, status, success;
2606 } else {
2607 // Otherwise take the next rune.
2608 rune, len, status := consume_rune(s);
2609 return rune, status, len > 0;
2610 }
2611}
2612
2613// Parses a character class character, or, if the character
2614// is followed by a hyphen, parses a character class range.
2615// For single characters, rr.lo == rr.hi.
2616// Sets *s to span the remainder of the string.
2617parse_cc_range :: (using ps: *ParseState, s: *string, whole_class: string) -> RuneRange, status: Status, success: bool {
2618 os := s.*;
2619 rr: RuneRange;
2620 success: bool;
2621 rr.lo, status, success = parse_cc_character(ps, s, whole_class);
2622 if !success return rr, status, false;
2623
2624 // [a-] means (a|-), so check for final ].
2625 if s.count >= 2 && (s.*)[0] == #char "-" && (s.*)[1] != #char "]" {
2626 advance(s); // #char "-"
2627
2628 rr.hi, status, success = parse_cc_character(ps, s, whole_class);
2629 if !success return rr, status, false;
2630
2631 if rr.hi < rr.lo {
2632 return rr, make_status(.BadCharRange, slice(os, 0, os.count - s.count)), false;
2633 }
2634 } else {
2635 rr.hi = rr.lo;
2636 }
2637 return rr, status, true;
2638}
2639
2640// Parses a possibly-negated character class expression like [^abx-z[:digit:]].
2641// Sets *s to span the remainder of the string.
2642// Sets *out_re to the regexp for the class.
2643parse_char_class :: (using ps: *ParseState, s: *string) -> *Regexp_Node, status: Status, success: bool {
2644 whole_class := s.*;
2645 // Caller checked this.
2646 assert(s.count && (s.*)[0] == #char "[");
2647
2648 negated: bool;
2649 re := new_regexp(*ps.pool, .CharClass, flags & ~.FoldCase);
2650 re.char_class = New(CharClass);
2651 re.char_class_finished = false;
2652 advance(s); // '['
2653 if s.count && (s.*)[0] == #char "^" {
2654 advance(s); // '^'
2655 negated = true;
2656 if !(flags & .ClassNL) || flags & .NeverNL {
2657 // If NL can't match implicitly, then pretend
2658 // negated classes include a leading \n.
2659 add_range(re.char_class, #char "\n", #char "\n");
2660 }
2661 }
2662 first := true; // ] is okay as first char in class
2663 while s.count && (first || (s.*)[0] != #char "]") {
2664 // - is only okay unescaped as first or last in class.
2665 // Except that Perl allows - anywhere.
2666 if !first && (s.*)[0] == #char "-" && !(flags & .PerlX) && (s.count == 1 || (s.*)[1] != #char "]") {
2667 remove_regexp(*ps.pool, re);
2668
2669 t := s.*;
2670 advance(*t); // '-'
2671 r, n, status := consume_rune(*t);
2672 if !n {
2673 return null, status, false;
2674 }
2675
2676 return null, make_status(.BadCharRange, slice(s.*, 0, 1 + n)), false;
2677 }
2678 first = false;
2679
2680 // Look for [:alnum:] etc.
2681 if s.count > 2 && (s.*)[0] == #char "[" && (s.*)[1] == #char ":" {
2682 status, result := parse_cc_name(s, flags, re.char_class);
2683 if #complete result == {
2684 case .kParseOk;
2685 continue;
2686 case .kParseError;
2687 remove_regexp(*ps.pool, re);
2688 return null, status, false;
2689 case .kParseNothing;
2690 }
2691 }
2692
2693 // Look for Unicode character group like \p{Han}
2694 if s.count > 2 && (s.*)[0] == #char "\\" && ((s.*)[1] == #char "p" || (s.*)[1] == #char "P") {
2695 status, result := maybe_parse_unicode_group(s, flags, re.char_class);
2696 if #complete result == {
2697 case .kParseOk;
2698 continue;
2699 case .kParseError;
2700 remove_regexp(*ps.pool, re);
2701 return null, status, false;
2702 case .kParseNothing;
2703 }
2704 }
2705
2706 // Look for Perl character class symbols (extension).
2707 g := maybe_parse_perl_cc_escape(s, flags);
2708 if g != null {
2709 add_unicode_group(re.char_class, g, g.sign, flags);
2710 continue;
2711 }
2712
2713 // Otherwise assume single character or simple range.
2714 rr, status, success := parse_cc_range(ps, s, whole_class);
2715 if !success {
2716 remove_regexp(*ps.pool, re);
2717 return null, status, false;
2718 }
2719
2720 // AddRangeFlags is usually called in response to a class like
2721 // \p{Foo} or [[:foo:]]; for those, it filters \n out unless
2722 // Regexp_Node::ClassNL is set. In an explicit range or singleton
2723 // like we just parsed, we do not filter \n out, so set ClassNL
2724 // in the flags.
2725 add_range_flags(re.char_class, rr.lo, rr.hi, flags | .ClassNL);
2726 }
2727
2728 if !s.count {
2729 remove_regexp(*ps.pool, re);
2730 return null, make_status(.MissingBracket, whole_class), false;
2731 }
2732 advance(s); // ']'
2733
2734 if negated {
2735 negate(re.char_class);
2736 }
2737
2738 return re, .{}, true;
2739}
2740
2741// Is this a valid capture name? [A-Za-z0-9_]+
2742// PCRE limits names to 32 bytes.
2743// Python rejects names starting with digits.
2744// We don't enforce either of those.
2745is_valid_capture_name :: (name: string) -> bool {
2746 if !name return false;
2747 for 0..name.count - 1 {
2748 c := name[it];
2749 if !((#char "0" <= c && c <= #char "9") ||
2750 (#char "a" <= c && c <= #char "z") ||
2751 (#char "A" <= c && c <= #char "Z") ||
2752 c == #char "_")
2753 return false;
2754 }
2755 return true;
2756}
2757
2758// Parses a Perl flag setting or non-capturing group or both,
2759// like (?i) or (?: or (?i:. Removes from s, updates parse state.
2760// The caller must check that s begins with "(?".
2761// Returns true on success. If the Perl flag is not
2762// well-formed or not supported, sets status and returns false.
2763parse_perl_flags :: (using ps: *ParseState, s: *string) -> Status, success: bool {
2764 // Caller is supposed to check this.
2765 assert(flags & .PerlX && s.count >= 2 && (s.*)[0] == #char "(" && (s.*)[1] == #char "?");
2766
2767 t := s.*;
2768 advance(*t, 2); // "(?"
2769
2770 // Check for named captures, first introduced in Python's regexp library.
2771 // As usual, there are three slightly different syntaxes:
2772 //
2773 // (?P<name>expr) the original, introduced by Python
2774 // (?<name>expr) the .NET alteration, adopted by Perl 5.10
2775 // (?'name'expr) another .NET alteration, adopted by Perl 5.10
2776 //
2777 // Perl 5.10 gave in and implemented the Python version too,
2778 // but they claim that the last two are the preferred forms.
2779 // PCRE and languages based on it (specifically, PHP and Ruby)
2780 // support all three as well. EcmaScript 4 uses only the Python form.
2781 //
2782 // In both the open source world (via Code Search) and the
2783 // Google source tree, (?P<expr>name) is the dominant form,
2784 // so that's the one we implement. One is enough.
2785 if (t.count > 2 && t[0] == #char "P" && t[1] == #char "<") {
2786 advance(*t, 2);
2787 // Pull out name.
2788 end := find_index_from_left(t, #char ">");
2789 if end == -1 {
2790 valid, status := is_valid_utf8(s.*);
2791 if !valid return status, false;
2792 return make_status(.BadNamedCapture, s.*), false;
2793 }
2794
2795 // t is "name>...", t[end] == '>'
2796 capture := slice(s.*, 2, end + 3);
2797 name := slice(t, 0, end);
2798 valid, status := is_valid_utf8(name);
2799 if !valid return status, false;
2800
2801 if !is_valid_capture_name(name) return make_status(.BadNamedCapture, capture), false;
2802
2803 do_left_paren(ps, name);
2804
2805 advance(s, capture.count + 2);
2806 return status, true;
2807 }
2808
2809 negated: bool;
2810 sawflags: bool;
2811 nflags := flags;
2812 done: bool;
2813 while !done {
2814 if !t.count return make_status(.BadPerlOp, s.*), false;
2815 c, len, status := consume_rune(*t);
2816 if !len return status, false;
2817 if c == {
2818 case;
2819 return make_status(.BadPerlOp, slice(s.*, 0, s.count - t.count)), false;
2820 // Parse flags.
2821 case #char "i";
2822 sawflags = true;
2823 if negated {
2824 nflags &= ~.FoldCase;
2825 } else {
2826 nflags |= .FoldCase;
2827 }
2828
2829 case #char "m"; // opposite of our OneLine
2830 sawflags = true;
2831 if negated {
2832 nflags |= .OneLine;
2833 } else {
2834 nflags &= ~.OneLine;
2835 }
2836
2837 case #char "s";
2838 sawflags = true;
2839 if negated {
2840 nflags &= ~.DotNL;
2841 } else {
2842 nflags |= .DotNL;
2843 }
2844
2845 case #char "U";
2846 sawflags = true;
2847 if negated {
2848 nflags &= ~.NonGreedy;
2849 } else {
2850 nflags |= .NonGreedy;
2851 }
2852
2853 // Negation
2854 case #char "-";
2855 if negated return make_status(.BadPerlOp, slice(s.*, 0, s.count - t.count)), false;
2856
2857 negated = true;
2858 sawflags = false;
2859
2860 // Open new group.
2861 case #char ":";
2862 do_left_paren_no_capture(ps);
2863 done = true;
2864
2865 // Finish flags.
2866 case #char ")";
2867 done = true;
2868 }
2869 }
2870
2871 if negated && !sawflags return make_status(.BadPerlOp, slice(s.*, 0, s.count - t.count)), false;
2872
2873 flags = nflags;
2874 s.* = t;
2875 return .{}, true;
2876}
2877
2878// // Converts latin1 (assumed to be encoded as Latin1 bytes)
2879// // into UTF8 encoding in string.
2880// // Can't use EncodingUtils::EncodeLatin1AsUTF8 because it is
2881// // deprecated and because it rejects code points 0x80-0x9F.
2882// void ConvertLatin1ToUTF8(const StringPiece& latin1, std::string* utf) {
2883// char buf[UTFmax];
2884
2885// utf.clear();
2886// for (size_t i = 0; i < latin1.size(); i++) {
2887// Rune r = latin1[i] & 0xFF;
2888// int n = runetochar(buf, &r);
2889// utf.append(buf, n);
2890// }
2891// }
2892
2893// Actually C library functions, but it feels stupid to call out to libc for that
2894isalpha :: (c: s32) -> bool {
2895 return (c >= #char "A" && c <= #char "Z") || (c >= #char "a" && c <= #char "z");
2896}
2897isdigit :: (c: s32) -> bool {
2898 return c >= #char "0" && c <= #char "9";
2899}
2900
2901
2902// For to_string
2903Precedence :: enum {
2904 Atom;
2905 Unary;
2906 Concat;
2907 Alternate;
2908 Empty;
2909 Paren;
2910 Toplevel;
2911}
2912
2913#import "Basic";
2914#import "String";
2915#import "Bucket_Array";
2916#import "Pool";
2917
Copyright 2026  E766CB298A6D1E64 | Git-Thing heavily inspired by cgit