<<
path:
root/public/blog.git/html/modules/uniform/parse.jai
blob: 932f67caa9ee45ea01a48982a184a68634ce33cc
[raw]
[clear marker]
0// This parser is based on RE2’s parser
1// https://github.com/google/re2
3ParseFlags :: enum_flags u16 {
5 FoldCase :: 1<<0; // Fold case during matching (case-insensitive).
6 // Literal :: 1<<1; // Treat s as literal string instead of a regexp.
7 ClassNL :: 1<<2; // Allow char classes like [^a-z] and \D and \s
8 // and [[:space:]] to match newline.
9 DotNL :: 1<<3; // Allow . to match newline.
10 MatchNL :: ClassNL | DotNL;
11 OneLine :: 1<<4; // Treat ^ and $ as only matching at beginning and
12 // end of text, not around embedded newlines.
14 // Latin1 :: 1<<5; // Regexp_Node and text are in Latin1, not UTF-8.
15 NonGreedy :: 1<<6; // Repetition operators are non-greedy by default.
16 PerlClasses :: 1<<7; // Allow Perl character classes like \d.
17 PerlB :: 1<<8; // Allow Perl's \b and \B.
18 PerlX :: 1<<9; // Perl extensions:
19 // non-capturing parens - (?: )
20 // non-greedy operators - *? +? ?? {}?
21 // flag edits - (?i) (?-i) (?i: )
27 // \Q and \E to disable/enable metacharacters
28 // (?P<name>expr) for named captures
29 // \C to match any single byte
30 UnicodeGroups :: 1<<10; // Allow \p{Han} for Unicode Han group
31 // and \P{Han} for its negation.
32 NeverNL :: 1<<11; // Never match NL, even if the regexp mentions
34 NeverCapture :: 1<<12; // Parse all parens as non-capturing.
36 // As close to Perl as we can get.
37 LikePerl :: ClassNL | OneLine | PerlClasses | PerlB | PerlX |
41 WasDollar :: 1<<13; // on EndText: was $ in regexp text
42 AllParseFlags :: (1<<14)-1;
46Regexp_Node :: struct {
49 // Is this regexp structure already simple
50 // (has it been returned by Simplify)?
53 // Flags saved from parsing and used during execution.
54 // (Only FoldCase is used.)
55 parse_flags: ParseFlags;
58 // Concat and Alternate handle larger numbers of subexpressions
59 // by building concatenation or alternation trees.
60 // Other routines should call Concat or Alternate instead of
61 // filling in sub() by hand.
63 subs: [] *Regexp_Node;
64 // Extra space for parse and teardown stacks.
67 // Arguments to operator.
77 runes: [..] Rune; // LiteralString
79 char_class: *CharClass;
80 char_class_finished: bool;
82 rune: Rune; // Literal
83 // @ToDo Verify size is ok
84 match_id: int; // HaveMatch
85 // void *the_union_[2]; // as big as any other element, for memset
90 // @ToDo: Rename all options
92 // Matches no strings.
95 // Matches empty string.
104 // Matches concatenation of sub_[0..nsub-1].
106 // Matches union of sub_[0..nsub-1].
109 // Matches sub_[0] zero or more times.
111 // Matches sub_[0] one or more times.
113 // Matches sub_[0] zero or one times.
116 // Matches sub_[0] at least min_ times, at most max_ times.
117 // max_ == -1 means no upper limit.
120 // Parenthesized (capturing) subexpression. Index is cap_.
121 // Optionally, capturing name is name_.
124 // Matches any character.
127 // Matches any byte [sic].
130 // Matches empty string at beginning of line.
132 // Matches empty string at end of line.
135 // Matches word boundary "\b".
137 // Matches not-a-word boundary "\B".
140 // Matches empty string at beginning of text.
142 // Matches empty string at end of text.
145 // Matches character class given by cc_.
148 // Forces match of entire expression right now,
149 // with match ID match_id_ (used by RE2::Set).
152 // The ones below are only used internally by the parser and don’t occur in finshed RegExps
157Regexp_Pool :: struct {
159 bucket: Bucket_Array(Regexp_Node, 64);
162init :: (pool: *Regexp_Pool) {
163 pool.pool = New(Pool);
164 set_allocators(pool.pool);
165 #if REGEXP_DEBUG pool.pool.overwrite_memory = true;
166 pool.bucket.allocator.proc = pool_allocator_proc;
167 pool.bucket.allocator.data = pool.pool;
170uninit :: (pool: *Regexp_Pool) {
177new_regexp :: (pool: *Regexp_Pool, op: RegexpOp, flags: ParseFlags) -> *Regexp_Node {
178 re := find_and_occupy_empty_slot(*pool.bucket);
179 // Need to clear, in case we re-use a bucket slot
180 // @ToDo, @Speed: This is unnecessary when its a fresh slot. Find a better way
181 memset(re, 0, size_of(Regexp_Node));
183 re.parse_flags = flags;
187remove_regexp :: (pool: *Regexp_Pool, re: *Regexp_Node) {
188 // @ToDo: Delete subs!
189 // @ToDO: Delete char_class!
198// Tests equality of all top-level structure but not subregexps.
199regexp_top_equal :: (a: *Regexp_Node, b: *Regexp_Node) -> bool {
200 if a.op != b.op return false;
203 case .NoMatch; #through;
204 case .EmptyMatch; #through;
205 case .AnyChar; #through;
206 case .AnyByte; #through;
207 case .BeginLine; #through;
208 case .EndLine; #through;
209 case .WordBoundary; #through;
210 case .NoWordBoundary; #through;
215 // The parse flags remember whether it's \z or (?-m:$),
216 // which matters when testing against PCRE.
217 return ((a.parse_flags ^ b.parse_flags) & .WasDollar) == 0;
220 return a.rune == b.rune && ((a.parse_flags ^ b.parse_flags) & .FoldCase) == 0;
223 return a.runes.count == b.runes.count && ((a.parse_flags ^ b.parse_flags) & .FoldCase) == 0 &&
224 memcmp(a.runes.data, b.runes.data, a.runes.count * size_of(Rune)) == 0;
226 case .Alternate; #through;
228 return a.subs.count == b.subs.count;
230 case .Star; #through;
231 case .Plus; #through;
233 return ((a.parse_flags ^ b.parse_flags) & .NonGreedy) == 0;
236 return ((a.parse_flags ^ b.parse_flags) & .NonGreedy) == 0 && a.min == b.min && a.max == b.max;
239 return a.cap == b.cap && a.name == b.name;
242 return a.match_id == b.match_id;
245 return a.char_class.nrunes == b.char_class.nrunes &&
246 a.char_class.ranges.count == b.char_class.ranges.count &&
247 memcmp(a.char_class.ranges.data, b.char_class.ranges.data, a.char_class.ranges.count * size_of(RuneRange)) == 0;
249 assert(false, "Unexpected op: %", a.op);
254regexp_equal :: (a: *Regexp_Node, b: *Regexp_Node) -> bool {
255 if a == null || b == null return a == b;
257 if !regexp_top_equal(a, b) return false;
259 // The stack has pairs of regexps waiting to
260 // be compared. The regexps are only equal if
261 // all the pairs end up being equal.
262 stack: [..] *Regexp_Node;
263 stack.allocator = temp;
268 // Invariant: TopEqual(a1, b1) == true.
272 case .Alternate; #through;
274 for i: 0..a1.subs.count-1 {
277 if !regexp_top_equal(a2, b2) return false;
279 array_add(*stack, a2);
280 array_add(*stack, b2);
283 case .Star; #through;
284 case .Plus; #through;
285 case .Quest; #through;
286 case .Repeat; #through;
290 if !regexp_top_equal(a2, b2) return false;
292 // stack.push_back(a2);
293 // stack.push_back(b2);
295 // but faster to assign directly and loop.
303 if !stack.count return true;
305 a1 = stack[stack.count-2];
306 b1 = stack[stack.count-1];
313// Determines whether regexp matches must be anchored
314// with a fixed string prefix. If so, returns the prefix and
315// the regexp that remains after the prefix. The prefix might
316// be ASCII case-insensitive.
317required_prefix :: (re: *Regexp_Node, pool: *Regexp_Pool) -> prefix: string, foldcase: bool, suffix: *Regexp_Node {
318 // No need for a walker: the regexp must be of the form
319 // 1. some number of ^ anchors
320 // 2. a literal char or string
323 if re.op != .Concat return "", false, suffix;
325 while i < re.subs.count && re.subs[i].op == .BeginText {
328 if i == 0 || i >= re.subs.count return "", false, suffix;
330 to_analyze := re.subs[i];
331 if to_analyze.op != .Literal && to_analyze.op != .LiteralString return "", false, suffix;
334 if (i < re.subs.count) {
335 suffix = new_concat(pool, array_view(re.subs, i, re.subs.count - i), re.parse_flags);
337 suffix = new_regexp(pool, .EmptyMatch, re.parse_flags);
341 if to_analyze.op == .Literal {
342 runes.data = *to_analyze.rune;
345 runes = to_analyze.runes;
349 push_allocator(pool_allocator_proc, pool.pool);
350 prefix := string_from_runes(runes);
351 return prefix, (to_analyze.parse_flags & .FoldCase) != 0, suffix;
355required_prefix_for_accel :: (re: Regexp_Node) -> prefix: string, foldcase: bool {
356 // No need for a walker: the regexp must either begin with or be
357 // a literal char or string.
359 if re.op == .Concat && re.subs.count > 0 {
360 to_analyze = re.subs[0];
363 if to_analyze.op != .Literal && to_analyze.op != .LiteralString return "", false;
366 if to_analyze.op == .Literal {
367 runes.data = *to_analyze.rune;
370 runes = to_analyze.runes;
372 prefix := string_from_runes(runes);
374 return prefix, (to_analyze.parse_flags & .FoldCase) != 0;
377// @ToDo: rename these
386 BadEscape; // bad escape sequence
387 BadCharClass; // bad character class
388 BadCharRange; // bad character class range
389 MissingBracket; // missing closing ]
390 MissingParen; // missing closing )
391 UnexpectedParen; // unexpected closing )
392 TrailingBackslash; // at end of regexp
393 RepeatArgument; // repeat argument missing, e.g. "*"
394 RepeatSize; // bad repetition argument
395 RepeatOp; // bad repetition operator
396 BadPerlOp; // bad perl operator
397 BadUTF8; // invalid UTF-8 in regexp
398 BadNamedCapture; // bad named capture
401// Regular expression parse state.
402// The list of parsed regexps so far is maintained as a vector of
403// Regexp_Node pointers called the stack. Left parenthesis and vertical
404// bar markers are also placed on the stack, as Regexps with
405// non-standard opcodes.
406// Scanning a left parenthesis causes the parser to push a left parenthesis
407// marker on the stack.
408// Scanning a vertical bar causes the parser to pop the stack until it finds a
409// vertical bar or left parenthesis marker (not popping the marker),
410// concatenate all the popped results, and push them back on
411// the stack (DoConcatenation).
412// Scanning a right parenthesis causes the parser to act as though it
413// has seen a vertical bar, which then leaves the top of the stack in the
414// form LeftParen regexp VerticalBar regexp VerticalBar ... regexp VerticalBar.
415// The parser pops all this off the stack and creates an alternation of the
416// regexps (DoAlternation).
417ParseState :: struct {
419 whole_regexp: string;
421 stacktop: *Regexp_Node;
422 ncap: s32; // number of capturing parens seen
423 rune_max := Runemax; // maximum char value for this encoding
432make_status :: (code := StatusCode.Success, error_arg := "") -> Status {
435 status.error_arg = error_arg;
439rep :: (ps: *ParseState, op: RegexpOp, t: *string, lastunary: string) -> isunary: string, success: bool {
442 advance(t); // '*' or '+' or '?'
443 if (ps.flags & .PerlX) {
444 if (t.count && (t.*)[0] == #char "?") {
449 // In Perl it is not allowed to stack repetition operators:
450 // a** is a syntax error, not a double-star.
451 // (and a++ means something else entirely, which we don't support!)
452 ps.status.code = .RepeatOp;
453 ps.status.error_arg.data = lastunary.data;
454 ps.status.error_arg.count = t.data - lastunary.data;
459 opstr.count -= t.count;
460 if !push_repeat_op(ps, op, opstr, nongreedy)
465// Parses the regular expression given by s,
466// returning the corresponding Regexp_Node tree.
467// Returns null on error.
468parse :: (s: string, global_flags: ParseFlags = .NoParseFlags) -> *Regexp_Node, status: Status, pool: Regexp_Pool {
471 ps.flags = global_flags;
474 push_allocator(pool_allocator_proc, ps.pool.pool);
482 r, len, status := consume_rune(*t);
483 if !len return null, status, ps.pool;
484 push_literal(*ps, r);
488 // "(?" introduces Perl escape.
489 if ps.flags & .PerlX && t.count >= 2 && t[1] == #char "?" {
490 // Flag changes and non-capturing groups.
492 ps.status, success = parse_perl_flags(*ps, *t);
493 if !success return null, ps.status, ps.pool;
494 } else if ps.flags & .NeverCapture {
495 do_left_paren_no_capture(*ps);
503 do_vertical_bar(*ps);
507 if !do_right_paren(*ps) return null, ps.status, ps.pool;
510 case #char "^"; // Beginning of line.
514 case #char "$"; // End of line.
518 case #char "."; // Any character (possibly except newline).
522 case #char "["; // Character class.
523 re, status, success := parse_char_class(*ps, *t);
524 if !success return null, status, ps.pool;
525 push_regexp(*ps, re);
527 case #char "*"; // Zero or more.
529 isunary, success = rep(*ps, .Star, *t, lastunary);
530 if !success return null, ps.status, ps.pool;
531 case #char "+"; // One or more.
533 isunary, success = rep(*ps, .Plus, *t, lastunary);
534 if !success return null, ps.status, ps.pool;
535 case #char "?"; // Zero or one.
537 isunary, success = rep(*ps, .Quest, *t, lastunary);
538 if !success return null, ps.status, ps.pool;
540 case #char "{"; // Counted repetition.
542 lo, hi, success := maybe_parse_repetition(*t);
544 // Treat like a literal.
545 push_literal(*ps, #char "{");
549 if ps.flags & .PerlX {
550 if t && t[0] == #char "?" {
555 // Not allowed to stack repetition operators.
556 ps.status.code = .RepeatOp;
557 ps.status.error_arg = slice(lastunary, 0, t.data - lastunary.data);
558 return null, ps.status, ps.pool;
561 opstr.count -= t.count;
562 if !push_repetition(*ps, lo, hi, opstr, nongreedy) {
563 return null, ps.status, ps.pool;
568 case #char "\\"; // Escaped character or Perl sequence.
569 // \b and \B: word boundary or not
571 if ps.flags & .PerlB &&
572 t.count >= 2 && (t[1] == #char "b" || t[1] == #char "B") {
573 if t[1] == #char "b" {
574 push_simple_op(*ps, .WordBoundary);
576 push_simple_op(*ps, .NoWordBoundary);
578 advance(*t, 2); // '\\', 'b'
580 } else if (ps.flags & .PerlX) && t.count >= 2 {
581 if (t[1] == #char "A") {
582 push_simple_op(*ps, .BeginText);
583 advance(*t, 2); // '\\', 'A'
585 } else if (t[1] == #char "z") {
586 push_simple_op(*ps, .EndText);
587 advance(*t, 2); // '\\', 'z'
589 // Do not recognize \Z, because this library can't
590 // implement the exact Perl/PCRE semantics.
591 // (This library treats "(?-m)$" as \z, even though
592 // in Perl and PCRE it is equivalent to \Z.)
593 } else if (t[1] == #char "C") { // \C: any byte [sic]
594 push_simple_op(*ps, .AnyByte);
595 advance(*t, 2); // '\\', 'C'
597 } else if (t[1] == #char "Q") { // \Q ... \E: the ... is always literals
598 advance(*t, 2); // '\\', 'Q'
600 if (t.count >= 2 && t[0] == #char "\\" && t[1] == #char "E") {
601 advance(*t, 2); // '\\', 'E'
604 r, len, status := consume_rune(*t);
605 if !len return null, status, ps.pool;
606 push_literal(*ps, r);
612 if (!parsed && t.count >= 2 && (t[1] == #char "p" || t[1] == #char "P")) {
613 re := new_regexp(*ps.pool, .CharClass, ps.flags & ~.FoldCase);
614 re.char_class = New(CharClass);
615 re.char_class_finished = false;
616 parseStatus, parseResult := maybe_parse_unicode_group(*t, ps.flags, re.char_class);
619 push_regexp(*ps, re);
623 remove_regexp(*ps.pool, re);
624 ps.status = parseStatus;
625 return null, ps.status, ps.pool;
627 remove_regexp(*ps.pool, re);
632 g := maybe_parse_perl_cc_escape(*t, ps.flags);
634 re := new_regexp(*ps.pool, .CharClass, ps.flags & ~.FoldCase);
635 re.char_class = New(CharClass);
636 re.char_class_finished = false;
637 add_unicode_group(re.char_class, g, g.sign, ps.flags);
638 push_regexp(*ps, re);
644 r, status, success := parse_escape(*t, ps.rune_max);
645 if !success return null, status, ps.pool;
646 push_literal(*ps, r);
650 return do_finish(*ps), ps.status, ps.pool;
653new_literal_string :: (pool: *Regexp_Pool, runes: [] Rune, flags: ParseFlags) -> *Regexp_Node {
656 return new_regexp(pool, .EmptyMatch, flags);
658 return new_literal(pool, runes[0], flags);
660 re := new_regexp(pool, .LiteralString, flags);
662 add_rune_to_string(re, it);
668new_literal :: (pool: *Regexp_Pool, rune: Rune, flags: ParseFlags) -> *Regexp_Node {
669 re := new_regexp(pool, .Literal, flags);
674new_star_plus_quest :: (pool: *Regexp_Pool, op: RegexpOp, sub: *Regexp_Node, flags: ParseFlags) -> *Regexp_Node {
675 // Squash **, ++ and ??.
676 if (op == sub.op && flags == sub.parse_flags) return sub;
678 // Squash *+, *?, +*, +?, ?* and ?+. They all squash to *, so because
679 // op is Star/Plus/Quest, we just have to check that sub.op() is too.
680 if ((sub.op == .Star ||
683 flags == sub.parse_flags) {
684 // If sub is Star, no need to rewrite it.
685 if (sub.op == .Star) return sub;
687 re := new_regexp(pool, .Star, flags);
688 array_resize(*re.subs, 1, false);
689 re.subs[0] = sub.subs[0];
693 re := new_regexp(pool, op, flags);
694 array_resize(*re.subs, 1, false);
699// @ToDo: Why does #bake_arguments not work here? (Compiler bug I can’t find a simple repro for)
700// new_plus :: #bake_arguments new_star_plus_quest(op = .Plus);
701// new_star :: #bake_arguments new_star_plus_quest(op = RegexpOp.Star);
702// new_quest :: #bake_arguments new_star_plus_quest(op = .Quest);
703new_plus:: (pool: *Regexp_Pool, sub: *Regexp_Node, flags: ParseFlags) -> *Regexp_Node {
704 return new_star_plus_quest(pool, .Plus, sub, flags);
706new_star :: (pool: *Regexp_Pool, sub: *Regexp_Node, flags: ParseFlags) -> *Regexp_Node {
707 return new_star_plus_quest(pool, .Star, sub, flags);
709new_quest :: (pool: *Regexp_Pool, sub: *Regexp_Node, flags: ParseFlags) -> *Regexp_Node {
710 return new_star_plus_quest(pool, .Quest, sub, flags);
713new_concat_or_alternate :: (pool: *Regexp_Pool, op: RegexpOp, subs: [] *Regexp_Node, flags: ParseFlags, can_factor: bool) -> *Regexp_Node {
720 if op == .Alternate {
721 return new_regexp(pool, .NoMatch, flags);
723 return new_regexp(pool, .EmptyMatch, flags);
727 // @ToDo, @Incomplete: We need this factorization for selecting the right
728 // execution engine later. But does it matter if we implement only NDA for now?
730 // PODArray<Regexp_Node*> subcopy;
731 // if op == .Alternate && can_factor {
732 // // Going to edit sub; make a copy so we don't step on caller.
733 // subcopy = PODArray<Regexp_Node*>(nsub);
734 // memmove(subcopy.data(), sub, nsub * sizeof sub[0]);
735 // sub = subcopy.data();
736 // nsub = FactorAlternation(sub, nsub, flags);
738 // Regexp_Node* re = sub[0];
743 // @ToDo, @Incomplete: Is this necessary for us?
744 // We use a normal array at the moment, so we waste 6 more bytes on the size and won’t overflow
746 // if (nsub > MAX_SUBS) {
747 // // Too many subexpressions to fit in a single Regexp_Node.
748 // // Make a two-level tree. Two levels gets us to 65535^2.
749 // int nbigsub = (nsub+MAX_SUBS-1)/MAX_SUBS;
750 // Regexp_Node* re = new Regexp_Node(op, flags);
751 // re->AllocSub(nbigsub);
752 // Regexp_Node** subs = re->sub();
753 // for (int i = 0; i < nbigsub - 1; i++)
754 // subs[i] = ConcatOrAlternate(op, sub+i*MAX_SUBS, MAX_SUBS, flags, false);
755 // subs[nbigsub - 1] = ConcatOrAlternate(op, sub+(nbigsub-1)*MAX_SUBS,
756 // nsub - (nbigsub-1)*MAX_SUBS, flags,
761 re := new_regexp(pool, op, flags);
766// concat :: #bake_arguments concat_or_alternate(op = .Concat, can_factor = false);
767// alternate :: #bake_arguments concat_or_alternate(op = .Alternate, can_factor = true);
768// alternate_no_factor :: #bake_arguments concat_or_alternate(op = .Alternate, can_factor = false);
769new_concat :: (pool: *Regexp_Pool, subs: [] *Regexp_Node, flags: ParseFlags) -> *Regexp_Node {
770 return new_concat_or_alternate(pool, .Concat, subs, flags, false);
772new_alternate :: (pool: *Regexp_Pool, subs: [] *Regexp_Node, flags: ParseFlags) -> *Regexp_Node {
773 return new_concat_or_alternate(pool, .Alternate, subs, flags, true);
775new_alternate_no_factor :: (pool: *Regexp_Pool, subs: [] *Regexp_Node, flags: ParseFlags) -> *Regexp_Node {
776 return new_concat_or_alternate(pool, .Alternate, subs, flags, false);
779new_capture :: (pool: *Regexp_Pool, sub: *Regexp_Node, flags: ParseFlags, cap: s32) -> *Regexp_Node {
780 re := new_regexp(pool, .Capture, flags);
781 array_resize(*re.subs, 1, false);
787new_repeat :: (pool: *Regexp_Pool, sub: *Regexp_Node, flags: ParseFlags, min: s16, max: s16) -> *Regexp_Node {
788 re := new_regexp(pool, .Repeat, flags);
789 array_resize(*re.subs, 1, false);
796to_string :: (re: *Regexp_Node) -> string {
797 // Appends a rune for use in a character class to the string t.
798 append_cc_char :: (builder: *String_Builder, r: Rune) {
799 if 0x20 <= r && r <= 0x7E {
800 if find_index_from_left("[]^-\\", cast(u8)r) != -1 {
801 append(builder, "\\");
803 append(builder, cast(u8)r);
807 append(builder, "\\r");
811 append(builder, "\\t");
815 append(builder, "\\n");
819 append(builder, "\\f");
824 print_to_builder(builder, "\\x%", formatInt(r, base = 16, minimum_digits = 2));
826 print_to_builder(builder, "\\x{%}", formatInt(r, base = 16));
831 append_cc_range :: (builder: *String_Builder, lo: Rune, hi: Rune) {
832 append_cc_char(builder, lo);
833 // < would be enough here, but we want to print invalid ranges as well, just in case we have a bug somewhere
835 append(builder, #char "-");
836 append_cc_char(builder, hi);
840 append_literal :: (builder: *String_Builder, r: Rune, foldcase: bool) {
841 if r != 0 && r < 0x80 && find_index_from_left("(){}[]*+?|.^$\\", cast(u8)r) != -1 {
842 append(builder, #char "\\");
843 append(builder, cast(u8)r);
844 } else if (foldcase && #char "a" <= r && r <= #char "z") {
845 r -= #char "a" - #char "A";
846 append(builder, #char "[");
847 append(builder, cast(u8)r);
848 append(builder, cast(u8)r + #char "a" - #char "A");
849 append(builder, #char "]");
851 append_cc_range(builder, r, r);
855 // Appends ( if needed and passes new precedence to children.
856 to_string_pre :: (builder: *String_Builder, re: *Regexp_Node, prec: Precedence) -> Precedence, bool {
857 nprec := Precedence.Atom;
860 case .NoMatch; #through;
861 case .EmptyMatch; #through;
862 case .Literal; #through;
863 case .AnyChar; #through;
864 case .AnyByte; #through;
865 case .BeginLine; #through;
866 case .EndLine; #through;
867 case .BeginText; #through;
868 case .EndText; #through;
869 case .WordBoundary; #through;
870 case .NoWordBoundary; #through;
871 case .CharClass; #through;
875 case .Concat; #through;
878 append(builder, "(?:");
883 if prec < .Alternate {
884 append(builder, "(?:");
889 append(builder, #char "(");
890 assert (re.cap != 0, "Capture 0");
892 append(builder, "?P<");
893 append(builder, re.name);
894 append(builder, #char ">");
898 case .Star; #through;
899 case .Plus; #through;
900 case .Quest; #through;
903 append(builder, "(?:");
905 // The subprecedence here is Atom instead of Unary
906 // because PCRE treats two unary ops in a row as a parse error.
913 to_string_post :: (builder: *String_Builder, re: *Regexp_Node, prec: Precedence, pre_arg: Precedence, child_args: [..] Precedence) -> Precedence {
916 // There's no simple symbol for "no match", but
917 // [^0-Runemax] excludes everything.
918 append(builder, "[^\\x00-\\x{10ffff}]");
921 // Append (?:) to make empty string visible,
922 // unless this is already being parenthesized.
924 append(builder, "(?:)");
928 append_literal(builder, re.rune, (re.parse_flags & .FoldCase) != 0);
932 append_literal(builder, it, (re.parse_flags & .FoldCase) != 0);
935 append(builder, #char ")");
940 append(builder, #char ")");
944 // Clumsy but workable: the children all appended |
945 // at the end of their strings, so just remove the last one.
946 // @ToDo: we can’t do that with string_builder, at least not without very clumsy or very slow code
947 // if ((*t_)[t_->size()-1] == '|')
948 // t_->erase(t_->size()-1);
950 // LOG(DFATAL) << "Bad final char: " << t_;
951 if prec < .Alternate {
952 append(builder, #char ")");
956 append(builder, #char "*");
957 if re.parse_flags & .NonGreedy {
958 append(builder, #char "?");
962 append(builder, #char ")");
966 append(builder, #char "+");
967 if re.parse_flags & .NonGreedy {
968 append(builder, #char "?");
971 append(builder, #char ")");
975 append(builder, #char "?");
976 if re.parse_flags & .NonGreedy {
977 append(builder, #char "?");
980 append(builder, #char ")");
985 print_to_builder(builder, "{%,}", re.min);
986 } else if re.min == re.max {
987 print_to_builder(builder, "{%}", re.min);
989 print_to_builder(builder, "{%,%}", re.min, re.max);
991 if (re.parse_flags & .NonGreedy) {
992 append(builder, #char "?");
995 append(builder, #char ")");
999 append(builder, #char ".");
1002 append(builder, "\\C");
1005 append(builder, #char "^");
1008 append(builder, #char "$");
1011 append(builder, "(?-m:^)");
1014 if re.parse_flags & .WasDollar {
1015 append(builder, "(?-m:$)");
1017 append(builder, "\\z");
1021 append(builder, "\\b");
1023 case .NoWordBoundary;
1024 append(builder, "\\B");
1027 if (re.char_class.ranges.count == 0) {
1028 append(builder, "[^\\x00-\\x{10ffff}]");
1030 append(builder, "[");
1031 // Heuristic: show class as negated if it contains the
1032 // non-character 0xFFFE and yet somehow isn't full.
1033 cc := re.char_class;
1034 if contains(cc, 0xFFFE) && cc.nrunes != Runemax + 1 {
1037 append(builder, "^");
1040 append_cc_range(builder, it.lo, it.hi);
1042 if cc != re.char_class {
1046 append(builder, #char "]");
1050 append(builder, #char ")");
1053 // There's no syntax accepted by the parser to generate
1054 // this node (it is generated by RE2::Set) so make something
1055 // up that is readable but won't compile.
1056 print_to_builder(builder, "(?HaveMatch:%)", re.match_id);
1059 // If the parent is an alternation, append the | for it.
1060 if (prec == .Alternate) {
1061 append(builder, #char "|");
1067 builder: String_Builder;
1068 defer free_buffers(*builder);
1069 w := Walk(*String_Builder, Precedence).{pre_visit = to_string_pre, post_visit = to_string_post, copy = null};
1070 // @ToDO @Cleanup This is a workaround for nulls not being respected in struct literals yet.
1071 // Remove once the struct literal bug is fixed in jai
1073 walk(*builder, re, .Toplevel, w);
1074 return builder_to_string(*builder);
1081// Finishes the regexp if necessary, preparing it for use in
1082// a more complex expression.
1083finish_regexp:: (re: *Regexp_Node) -> *Regexp_Node {
1084 if re == null return null;
1088 if re.op == .CharClass && re.char_class != null && !re.char_class_finished {
1089 finish_char_class(re.char_class);
1090 re.char_class_finished = true;
1096// Pushes the given regular expression onto the stack.
1097// Could check for too much memory used here.
1098push_regexp :: (using ps: *ParseState, re: *Regexp_Node) {
1099 maybe_concat_string(ps, -1, .NoParseFlags);
1101 // Special case: a character class of one character is just
1102 // a literal. This is a common idiom for escaping
1103 // single characters (e.g., [.] instead of \.), and some
1104 // analysis does better with fewer character classes.
1105 // Similarly, [Aa] can be rewritten as a literal A with ASCII case folding.
1106 if re.op == .CharClass && re.char_class != null {
1107 remove_above(re.char_class, rune_max);
1108 if re.char_class.nrunes == 1 {
1109 r := re.char_class.ranges[0].lo;
1110 // @ToDo, @Speed: No need to remove and create a new one, just clean up and repurpose the old one
1111 remove_regexp(*ps.pool, re);
1112 re = new_literal(*ps.pool, r, flags);
1113 } else if re.char_class.nrunes == 2 {
1114 r := re.char_class.ranges[0].lo;
1115 if #char "A" <= r && r <= #char "Z" && contains(re.char_class, r + #char "a" - #char "A") {
1116 // @ToDo, @Speed: No need to remove and create a new one, just clean up and repurpose the old one
1117 remove_regexp(*ps.pool, re);
1118 re = new_literal(*ps.pool, r + #char "a" - #char "A", flags | .FoldCase);
1123 if !is_marker(re.op) {
1124 re.simple = compute_simple(re);
1132// Add lo-hi to the class, along with their fold-equivalent characters.
1133// If lo-hi is already in the class, assume that the fold-equivalent
1134// chars are there too, so there's no work to do.
1135add_folded_range :: (cc: *CharClass, lo: Rune, hi: Rune, depth: int) {
1136 // AddFoldedRange calls itself recursively for each rune in the fold cycle.
1137 // Most folding cycles are small: there aren't any bigger than four in the
1138 // current Unicode tables. make_unicode_casefold.py checks that
1139 // the cycles are not too long, and we double-check here using depth.
1140 assert(depth <= 10, "AddFoldedRange recurses too much.");
1142 if !add_range(cc, lo, hi) return; // lo-hi was already there? we're done
1145 f := lookup_casefold(unicode_casefold, lo);
1146 if f == null break; // lo has no fold, nor does anything above lo
1148 if lo < f.lo { // lo has no fold; next rune with a fold is f.lo
1151 // Add in the result of folding the range lo - f.hi
1152 // and that range's fold, recursively.
1154 hi1 := min(hi, f.hi);
1160 if lo1%2 == 1 lo1 -= 1;
1161 if hi1%2 == 0 hi1 += 1;
1163 if lo1%2 == 0 lo1 -= 1;
1164 if hi1%2 == 1 hi1 += 1;
1166 add_folded_range(cc, lo1, hi1, depth + 1);
1168 // Pick up where this fold left off.
1174// Pushes the literal rune r onto the stack.
1175push_literal :: (using ps: *ParseState, r: Rune) {
1176 // Do case folding if needed.
1177 if (flags & .FoldCase) && cycle_fold_rune(r) != r {
1178 re := new_regexp(*ps.pool, .CharClass, flags & ~.FoldCase);
1179 re.char_class = New(CharClass);
1180 re.char_class_finished = false;
1183 if !(flags & .NeverNL) || r != #char "\n" {
1184 add_range(re.char_class, r, r);
1186 r = cycle_fold_rune(r);
1188 if !(flags & .NeverNL) || r != #char "\n" {
1189 add_range(re.char_class, r, r);
1191 r = cycle_fold_rune(r);
1194 push_regexp(ps, re);
1198 // Exclude newline if applicable.
1199 if (flags & .NeverNL) && r == #char "\n" {
1200 push_regexp(ps, new_regexp(*ps.pool, .NoMatch, flags));
1204 // No fancy stuff worked. Ordinary literal.
1205 if maybe_concat_string(ps, r, flags) return;
1207 re := new_regexp(*ps.pool, .Literal, flags);
1209 push_regexp(ps, re);
1212// Pushes a ^ onto the stack.
1213push_caret :: (using ps: *ParseState) {
1214 if flags & .OneLine {
1215 push_simple_op(ps, .BeginText);
1217 push_simple_op(ps, .BeginLine);
1221// Pushes a $ onto the stack.
1222push_dollar :: (using ps: *ParseState) {
1223 if flags & .OneLine {
1224 // Clumsy marker so that MimicsPCRE() can tell whether
1225 // this EndText was a $ and not a \z.
1226 // @ToDo, @Cleanup pass flags as optional argument
1228 flags = flags | .WasDollar;
1229 push_simple_op(ps, .EndText);
1232 push_simple_op(ps, .EndLine);
1236// Pushes a . onto the stack.
1237push_dot :: (using ps: *ParseState) {
1238 if flags & .DotNL && !(flags & .NeverNL) {
1239 push_simple_op(ps, .AnyChar);
1241 // Rewrite . into [^\n]
1242 re := new_regexp(*ps.pool, .CharClass, flags & ~.FoldCase);
1243 re.char_class = New(CharClass);
1244 re.char_class_finished = false;
1245 add_range(re.char_class, 0, #char "\n" - 1);
1246 add_range(re.char_class, #char "\n" + 1, rune_max);
1247 push_regexp(ps, re);
1251// Pushes a regexp with the given op (and no args) onto the stack.
1252push_simple_op :: (using ps: *ParseState, op: RegexpOp) {
1253 push_regexp(ps, new_regexp(*ps.pool, op, flags));
1256// Pushes a repeat operator regexp onto the stack.
1257// A valid argument for the operator must already be on the stack.
1258// The char c is the name of the operator, for use in error messages.
1259push_repeat_op :: (using ps: *ParseState, op: RegexpOp, s: string, nongreedy: bool) -> bool {
1260 if stacktop == null || is_marker(stacktop.op) {
1261 status.code = .RepeatArgument;
1262 status.error_arg = s;
1266 fl := ifx nongreedy then flags ^ .NonGreedy else flags;
1268 // Squash **, ++ and ??. Regexp_Node::Star() et al. handle this too, but
1269 // they're mostly for use during simplification, not during parsing.
1270 if op == stacktop.op && fl == stacktop.parse_flags return true;
1272 // Squash *+, *?, +*, +?, ?* and ?+. They all squash to *, so because
1273 // op is a repeat, we just have to check that stacktop_.op() is too,
1274 // then adjust stacktop_.
1275 if (stacktop.op == .Star || stacktop.op == .Plus || stacktop.op == .Quest) && fl == stacktop.parse_flags {
1276 stacktop.op = .Star;
1280 re := new_regexp(*ps.pool, op, fl);
1281 re.down = stacktop.down;
1282 re.subs = NewArray(1, *Regexp_Node, initialized = false);
1283 re.subs[0] = finish_regexp(stacktop);
1284 re.simple = compute_simple(re);
1289// // RepetitionWalker reports whether the repetition regexp is valid.
1290// // Valid means that the combination of the top-level repetition
1291// // and any inner repetitions does not exceed n copies of the
1292// // innermost thing.
1293// // This rewalks the regexp tree and is called for every repetition,
1294// // so we have to worry about inducing quadratic behavior in the parser.
1295// // We avoid this by only using RepetitionWalker when min or max >= 2.
1296// // In that case the depth of any >= 2 nesting can only get to 9 without
1297// // triggering a parse error, so each subtree can only be rewalked 9 times.
1298// class RepetitionWalker : public Regexp_Node::Walker<int> {
1300// RepetitionWalker() {}
1301// virtual int PreVisit(Regexp_Node* re, int parent_arg, bool* stop);
1302// virtual int PostVisit(Regexp_Node* re, int parent_arg, int pre_arg,
1303// int* child_args, int nchild_args);
1304// virtual int ShortVisit(Regexp_Node* re, int parent_arg);
1307// RepetitionWalker(const RepetitionWalker&) = delete;
1308// RepetitionWalker& operator=(const RepetitionWalker&) = delete;
1311// int RepetitionWalker::PreVisit(Regexp_Node* re, int parent_arg, bool* stop) {
1312// int arg = parent_arg;
1313// if (re.op() == .Repeat) {
1325// int RepetitionWalker::PostVisit(Regexp_Node* re, int parent_arg, int pre_arg,
1326// int* child_args, int nchild_args) {
1327// int arg = pre_arg;
1328// for (int i = 0; i < nchild_args; i++) {
1329// if (child_args[i] < arg) {
1330// arg = child_args[i];
1336// int RepetitionWalker::ShortVisit(Regexp_Node* re, int parent_arg) {
1337// // Should never be called: we use Walk(), not WalkExponential().
1338// #ifndef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
1339// LOG(DFATAL) << "RepetitionWalker::ShortVisit called";
1344// Pushes a repetition regexp onto the stack.
1345// A valid argument for the operator must already be on the stack.
1346push_repetition :: (using ps: *ParseState, min: int, max: int, s: string, nongreedy: bool) -> bool {
1347 if (max != -1 && max < min) || min > MAX_REPEAT || max > MAX_REPEAT {
1348 status.code = .RepeatSize;
1349 status.error_arg = s;
1352 if stacktop == null || is_marker(stacktop.op) {
1353 status.code = .RepeatArgument;
1354 status.error_arg = s;
1358 fl := ifx nongreedy then flags ^ .NonGreedy else flags;
1360 down := stacktop.down;
1361 re := new_repeat(*ps.pool, finish_regexp(stacktop), fl, cast(s16)min, cast(s16)max);
1363 re.simple = compute_simple(re);
1365 if min >= 2 || max >= 2 {
1366 // @ToDo: Check for repetition limits (see RepetitionWalker in the original source)
1367 // if repetition_walk(stacktop, kMaxRepeat) == 0 {
1368 // status.code = .RepeatSize;
1369 // status.error_arg = s;
1377// Checks whether a particular regexp op is a marker.
1378is_marker :: (op: RegexpOp) -> bool {
1379 return op >= .kLeftParen;
1382// Processes a left parenthesis in the input.
1383// Pushes a marker onto the stack.
1384do_left_paren :: (using ps: *ParseState, name: string = "") {
1385 re := new_regexp(*ps.pool, .kLeftParen, flags);
1391 push_regexp(ps, re);
1394// Pushes a non-capturing marker onto the stack.
1395do_left_paren_no_capture :: (using ps: *ParseState) {
1396 re := new_regexp(*ps.pool, .kLeftParen, flags);
1398 push_regexp(ps, re);
1401// Processes a vertical bar in the input.
1402do_vertical_bar ::(using ps: *ParseState) {
1403 maybe_concat_string(ps, -1, .NoParseFlags);
1404 do_concatenation(ps);
1406 // Below the vertical bar is a list to alternate.
1407 // Above the vertical bar is a list to concatenate.
1408 // We just did the concatenation, so either swap
1409 // the result below the vertical bar or push a new
1410 // vertical bar on the stack.
1414 if r2 && r2.op == .kVerticalBar {
1416 if r3 && (r1.op == .AnyChar || r3.op == .AnyChar) {
1417 // AnyChar is above or below the vertical bar. Let it subsume
1418 // the other when the other is Literal, CharClass or AnyChar.
1419 if r3.op == .AnyChar && (r1.op == .Literal || r1.op == .CharClass || r1.op == .AnyChar) {
1422 remove_regexp(*ps.pool, r1);
1425 if r1.op == .AnyChar && (r3.op == .Literal || r3.op == .CharClass || r3.op == .AnyChar) {
1426 // Rearrange the stack and discard r3.
1430 remove_regexp(*ps.pool, r3);
1434 // Swap r1 below vertical bar (r2).
1441 push_simple_op(ps, .kVerticalBar);
1444// Processes a right parenthesis in the input.
1445do_right_paren :: (using ps: *ParseState) -> bool {
1446 // Finish the current concatenation and alternation.
1449 // The stack should be: LeftParen regexp
1450 // Remove the LeftParen, leaving the regexp,
1453 if r1 == null || r1.down == null || r1.down.op != .kLeftParen {
1454 status.code = .UnexpectedParen;
1455 status.error_arg = whole_regexp;
1461 // Pop off r1, r2. Will Decref or reuse below.
1464 // Restore flags from when paren opened.
1466 flags = re.parse_flags;
1468 // Rewrite LeftParen as capture if needed.
1471 // re.cap is already set
1472 re.subs = NewArray(1, *Regexp_Node, initialized = false);
1473 re.subs[0] = finish_regexp(r1);
1474 re.simple = compute_simple(re);
1476 remove_regexp(*ps.pool, re);
1480 push_regexp(ps, re);
1484// Processes the end of input, returning the final regexp.
1485do_finish :: (using ps: *ParseState) -> *Regexp_Node {
1488 if re != null && re.down != null {
1489 status.code = .MissingParen;
1490 status.error_arg = whole_regexp;
1494 return finish_regexp(re);
1497// Returns the leading regexp that re starts with.
1498// The returned Regexp_Node* points into a piece of re,
1499// so it must not be used after the caller calls re.Decref().
1500leading_regexp :: (re: *Regexp_Node) -> *Regexp_Node {
1501 if re.op == .EmptyMatch return null;
1502 if re.op == .Concat && re.subs.count >= 2 {
1504 if sub.op == .EmptyMatch return null;
1510// Removes LeadingRegexp(re) from re and returns what's left.
1511// Consumes the reference to re and may edit it in place.
1512// If caller wants to hold on to LeadingRegexp(re),
1513// must have already Incref'ed it.
1514// @ToDO: Check if this is possible / correct in all usages without using reference counting!
1515remove_leading_regexp :: (using ps: *ParseState, re: *Regexp_Node) -> *Regexp_Node {
1516 if re.op == .EmptyMatch return re;
1518 if re.op == .Concat && re.subs.count >= 2 {
1519 if re.subs[0].op == .EmptyMatch return re;
1521 remove_regexp(*ps.pool, re.subs[0]);
1524 if re.subs.count == 2 {
1525 // Collapse concatenation to single regexp.
1528 remove_regexp(*ps.pool, re);
1531 // 3 or more -> 2 or more.
1532 // @Stability: We won’t be able to free this array ever again
1533 // because we modify it’s base pointer.
1534 // But it doesn’t matter because we’re working within a Pool anyway.
1535 print("Before: %\n", re.subs);
1538 print("After: %\n", re.subs);
1541 pf := re.parse_flags;
1542 // @ToDo, @Speed: Reuse regexp instead of removing and creating a new one?
1543 remove_regexp(*ps.pool, re);
1544 return new_regexp(*ps.pool, .EmptyMatch, pf);
1548// Returns the leading string that re starts with.
1549// The returned Rune* points into a piece of re,
1550// so it must not be used after the caller calls re.Decref().
1552// Rune* Regexp_Node::LeadingString(Regexp_Node* re, int *nrune,
1553// Regexp_Node::ParseFlags *flags) {
1554// while (re.op() == .Concat && re.nsub() > 0)
1557// *flags = static_cast<Regexp_Node::ParseFlags>(re.parse_flags_ & Regexp_Node::FoldCase);
1559// if (re.op() == .Literal) {
1564// if (re.op() == .LiteralString) {
1565// *nrune = re.nrunes_;
1573// Removes the first n leading runes from the beginning of re.
1574// Edits re in place.
1576// void Regexp_Node::RemoveLeadingString(Regexp_Node* re, int n) {
1577// // Chase down concats to find first string.
1578// // For regexps generated by parser, nested concats are
1579// // flattened except when doing so would overflow the 16-bit
1580// // limit on the size of a concatenation, so we should never
1581// // see more than two here.
1582// Regexp_Node* stk[4];
1584// while (re.op() == .Concat) {
1585// if (d < arraysize(stk))
1590// // Remove leading string from re.
1591// if (re.op() == .Literal) {
1593// re.op_ = .EmptyMatch;
1594// } else if (re.op() == .LiteralString) {
1595// if (n >= re.nrunes_) {
1596// delete[] re.runes_;
1599// re.op_ = .EmptyMatch;
1600// } else if (n == re.nrunes_ - 1) {
1601// Rune rune = re.runes_[re.nrunes_ - 1];
1602// delete[] re.runes_;
1606// re.op_ = .Literal;
1609// memmove(re.runes_, re.runes_ + n, re.nrunes_ * sizeof re.runes_[0]);
1613// // If re is now empty, concatenations might simplify too.
1616// Regexp_Node** sub = re.sub();
1617// if (sub[0].op() == .EmptyMatch) {
1620// // Delete first element of concat.
1621// switch (re.nsub()) {
1625// LOG(DFATAL) << "Concat of " << re.nsub();
1626// re.submany_ = null;
1627// re.op_ = .EmptyMatch;
1631// // Replace re with sub[1].
1632// Regexp_Node* old = sub[1];
1642// memmove(sub, sub + 1, re.nsub_ * sizeof sub[0]);
1649// @ToDo: All of that
1650//// In the context of factoring alternations, a Splice is: a factored prefix or
1651//// merged character class computed by one iteration of one round of factoring;
1652//// the span of subexpressions of the alternation to be "spliced" (i.e. removed
1653//// and replaced); and, for a factored prefix, the number of suffixes after any
1654//// factoring that might have subsequently been performed on them. For a merged
1655//// character class, there are no suffixes, of course, so the field is ignored.
1657// Splice(Regexp_Node* prefix, Regexp_Node** sub, int nsub)
1663// Regexp_Node* prefix;
1664// Regexp_Node** sub;
1669//// Named so because it is used to implement an explicit stack, a Frame is: the
1670//// span of subexpressions of the alternation to be factored; the current round
1671//// of factoring; any Splices computed; and, for a factored prefix, an iterator
1672//// to the next Splice to be factored (i.e. in another Frame) because suffixes.
1674// Frame(Regexp_Node** sub, int nsub)
1679// Regexp_Node** sub;
1682// std::vector<Splice> splices;
1686//// Bundled into a class for friend access to Regexp_Node without needing to declare
1687//// (or define) Splice in regexp.h.
1688//class FactorAlternationImpl {
1690// static void Round1(Regexp_Node** sub, int nsub,
1691// Regexp_Node::ParseFlags flags,
1692// std::vector<Splice>* splices);
1693// static void Round2(Regexp_Node** sub, int nsub,
1694// Regexp_Node::ParseFlags flags,
1695// std::vector<Splice>* splices);
1696// static void Round3(Regexp_Node** sub, int nsub,
1697// Regexp_Node::ParseFlags flags,
1698// std::vector<Splice>* splices);
1701//// Factors common prefixes from alternation.
1703//// ABC|ABD|AEF|BCX|BCY
1705//// A(B(C|D)|EF)|BC(X|Y)
1707//// A(B[CD]|EF)|BC[XY]
1709//// Rewrites sub to contain simplified list to alternate and returns
1710//// the new length of sub. Adjusts reference counts accordingly
1711//// (incoming sub[i] decremented, outgoing sub[i] incremented).
1712//int Regexp_Node::FactorAlternation(Regexp_Node** sub, int nsub, ParseFlags flags) {
1713// std::vector<Frame> stk;
1714// stk.emplace_back(sub, nsub);
1717// auto& sub = stk.back().sub;
1718// auto& nsub = stk.back().nsub;
1719// auto& round = stk.back().round;
1720// auto& splices = stk.back().splices;
1721// auto& spliceidx = stk.back().spliceidx;
1723// if (splices.empty()) {
1724// // Advance to the next round of factoring. Note that this covers
1725// // the initialised state: when splices is empty and round is 0.
1727// } else if (spliceidx < static_cast<int>(splices.size())) {
1728// // We have at least one more Splice to factor. Recurse logically.
1729// stk.emplace_back(splices[spliceidx].sub, splices[spliceidx].nsub);
1732// // We have no more Splices to factor. Apply them.
1733// auto iter = splices.begin();
1735// for (int i = 0; i < nsub; ) {
1736// // Copy until we reach where the next Splice begins.
1737// while (sub + i < iter.sub)
1738// sub[out++] = sub[i++];
1742// // Assemble the Splice prefix and the suffixes.
1743// Regexp_Node* re[2];
1744// re[0] = iter.prefix;
1745// re[1] = Regexp_Node::AlternateNoFactor(iter.sub, iter.nsuffix, flags);
1746// sub[out++] = Regexp_Node::Concat(re, 2, flags);
1751// // Just use the Splice prefix.
1752// sub[out++] = iter.prefix;
1756// LOG(DFATAL) << "unknown round: " << round;
1759// // If we are done, copy until the end of sub.
1760// if (++iter == splices.end()) {
1762// sub[out++] = sub[i++];
1767// // Advance to the next round of factoring.
1773// FactorAlternationImpl::Round1(sub, nsub, flags, &splices);
1776// FactorAlternationImpl::Round2(sub, nsub, flags, &splices);
1779// FactorAlternationImpl::Round3(sub, nsub, flags, &splices);
1782// if (stk.size() == 1) {
1783// // We are at the top of the stack. Just return.
1786// // Pop the stack and set the number of suffixes.
1787// // (Note that references will be invalidated!)
1788// int nsuffix = nsub;
1790// stk.back().splices[stk.back().spliceidx].nsuffix = nsuffix;
1791// ++stk.back().spliceidx;
1795// LOG(DFATAL) << "unknown round: " << round;
1799// // Set spliceidx depending on whether we have Splices to factor.
1800// if (splices.empty() || round == 3) {
1801// spliceidx = static_cast<int>(splices.size());
1808//void FactorAlternationImpl::Round1(Regexp_Node** sub, int nsub,
1809// Regexp_Node::ParseFlags flags,
1810// std::vector<Splice>* splices) {
1811// // Round 1: Factor out common literal prefixes.
1813// Rune* rune = null;
1815// Regexp_Node::ParseFlags runeflags = Regexp_Node::NoParseFlags;
1816// for (int i = 0; i <= nsub; i++) {
1817// // Invariant: sub[start:i] consists of regexps that all
1818// // begin with rune[0:nrune].
1819// Rune* rune_i = null;
1821// Regexp_Node::ParseFlags runeflags_i = Regexp_Node::NoParseFlags;
1823// rune_i = Regexp_Node::LeadingString(sub[i], &nrune_i, &runeflags_i);
1824// if (runeflags_i == runeflags) {
1826// while (same < nrune && same < nrune_i && rune[same] == rune_i[same])
1829// // Matches at least one rune in current range. Keep going around.
1836// // Found end of a run with common leading literal string:
1837// // sub[start:i] all begin with rune[0:nrune],
1838// // but sub[i] does not even begin with rune[0].
1840// // Nothing to do - first iteration.
1841// } else if (i == start+1) {
1842// // Just one: don't bother factoring.
1844// Regexp_Node* prefix = Regexp_Node::LiteralString(rune, nrune, runeflags);
1845// for (int j = start; j < i; j++)
1846// Regexp_Node::RemoveLeadingString(sub[j], nrune);
1847// splices.emplace_back(prefix, sub + start, i - start);
1850// // Prepare for next iteration (if there is one).
1855// runeflags = runeflags_i;
1860//void FactorAlternationImpl::Round2(Regexp_Node** sub, int nsub,
1861// Regexp_Node::ParseFlags flags,
1862// std::vector<Splice>* splices) {
1863// // Round 2: Factor out common simple prefixes,
1864// // just the first piece of each concatenation.
1865// // This will be good enough a lot of the time.
1867// // Complex subexpressions (e.g. involving quantifiers)
1868// // are not safe to factor because that collapses their
1869// // distinct paths through the automaton, which affects
1870// // correctness in some cases.
1872// Regexp_Node* first = null;
1873// for (int i = 0; i <= nsub; i++) {
1874// // Invariant: sub[start:i] consists of regexps that all
1875// // begin with first.
1876// Regexp_Node* first_i = null;
1878// first_i = Regexp_Node::LeadingRegexp(sub[i]);
1879// if (first != null &&
1880// // first must be an empty-width op
1881// // OR a char class, any char or any byte
1882// // OR a fixed repeat of a literal, char class, any char or any byte.
1883// (first.op() == .BeginLine ||
1884// first.op() == .EndLine ||
1885// first.op() == .WordBoundary ||
1886// first.op() == .NoWordBoundary ||
1887// first.op() == .BeginText ||
1888// first.op() == .EndText ||
1889// first.op() == .CharClass ||
1890// first.op() == .AnyChar ||
1891// first.op() == .AnyByte ||
1892// (first.op() == .Repeat &&
1893// first.min() == first.max() &&
1894// (first.sub()[0].op() == .Literal ||
1895// first.sub()[0].op() == .CharClass ||
1896// first.sub()[0].op() == .AnyChar ||
1897// first.sub()[0].op() == .AnyByte))) &&
1898// Regexp_Node::Equal(first, first_i))
1902// // Found end of a run with common leading regexp:
1903// // sub[start:i] all begin with first,
1904// // but sub[i] does not.
1906// // Nothing to do - first iteration.
1907// } else if (i == start+1) {
1908// // Just one: don't bother factoring.
1910// Regexp_Node* prefix = first.Incref();
1911// for (int j = start; j < i; j++)
1912// sub[j] = Regexp_Node::RemoveLeadingRegexp(sub[j]);
1913// splices.emplace_back(prefix, sub + start, i - start);
1916// // Prepare for next iteration (if there is one).
1924//void FactorAlternationImpl::Round3(Regexp_Node** sub, int nsub,
1925// Regexp_Node::ParseFlags flags,
1926// std::vector<Splice>* splices) {
1927// // Round 3: Merge runs of literals and/or character classes.
1929// Regexp_Node* first = null;
1930// for (int i = 0; i <= nsub; i++) {
1931// // Invariant: sub[start:i] consists of regexps that all
1932// // are either literals (i.e. runes) or character classes.
1933// Regexp_Node* first_i = null;
1936// if (first != null &&
1937// (first.op() == .Literal ||
1938// first.op() == .CharClass) &&
1939// (first_i.op() == .Literal ||
1940// first_i.op() == .CharClass))
1944// // Found end of a run of Literal/CharClass:
1945// // sub[start:i] all are either one or the other,
1946// // but sub[i] is not.
1948// // Nothing to do - first iteration.
1949// } else if (i == start+1) {
1950// // Just one: don't bother factoring.
1952// CharClassBuilder ccb;
1953// for (int j = start; j < i; j++) {
1954// Regexp_Node* re = sub[j];
1955// if (re.op() == .CharClass) {
1956// CharClass* cc = re.cc();
1957// for (CharClass::iterator it = cc.begin(); it != cc.end(); ++it)
1958// ccb.AddRange(it.lo, it.hi);
1959// } else if (re.op() == .Literal) {
1960// ccb.AddRangeFlags(re.rune(), re.rune(), re.parse_flags());
1962// LOG(DFATAL) << "RE2: unexpected op: " << re.op() << " "
1967// Regexp_Node* re = Regexp_Node::NewCharClass(ccb.GetCharClass(), flags);
1968// splices.emplace_back(re, sub + start, i - start);
1971// // Prepare for next iteration (if there is one).
1979// Collapse the regexps on top of the stack, down to the
1980// first marker, into a new op node (op == .Alternate
1981// or op == .Concat).
1982do_collapse :: (using ps: *ParseState, op: RegexpOp) {
1983 // Scan backward to marker, counting children of composite.
1987 while sub != null && !is_marker(sub.op) {
1990 n += sub.subs.count;
1998 // If there's just one child, leave it alone.
1999 // (Concat of one thing is that one thing; alternate of one thing is same.)
2000 if stacktop != null && stacktop.down == next return;
2002 // Construct op (alternation or concatenation), flattening op of op.
2003 subs: [..] *Regexp_Node;
2004 array_reserve(*subs, n);
2009 while sub != null && !is_marker(sub.op) {
2017 remove_regexp(*ps.pool, sub);
2020 subs[i] = finish_regexp(sub);
2024 assert(i == 0, "Sub expression count mismatch: %", i);
2026 // Takes ownership of subs
2027 re := new_concat_or_alternate(*ps.pool, op, subs, flags, true);
2028 re.simple = compute_simple(re);
2033// Finishes the current concatenation,
2034// collapsing it into a single regexp on the stack.
2035do_concatenation :: (using ps: *ParseState) {
2037 if r1 == null || is_marker(r1.op) {
2038 // empty concatenation is special case
2039 re := new_regexp(*ps.pool, .EmptyMatch, flags);
2040 push_regexp(ps, re);
2042 do_collapse(ps, .Concat);
2045// Finishes the current alternation,
2046// collapsing it to a single regexp on the stack.
2047do_alternation :: (using ps: *ParseState) {
2048 do_vertical_bar(ps);
2049 // Now stack top is kVerticalBar.
2052 remove_regexp(*ps.pool, r1);
2053 do_collapse(ps, .Alternate);
2056// Incremental conversion of concatenated literals into strings.
2057// If top two elements on stack are both literal or string,
2058// collapse into single string.
2059// Don't walk down the stack -- the parser calls this frequently
2060// enough that below the bottom two is known to be collapsed.
2061// Only called when another regexp is about to be pushed
2062// on the stack, so that the topmost literal is not being considered.
2063// (Otherwise ab* would turn into (ab)*.)
2064// If r >= 0, consider pushing a literal r on the stack.
2065// Return whether that happened.
2066maybe_concat_string :: (ps: *ParseState, r: Rune, flags: ParseFlags) -> bool {
2068 if re1 == null return false;
2070 if re2 == null return false;
2072 if re1.op != .Literal && re1.op != .LiteralString return false;
2073 if re2.op != .Literal && re2.op != .LiteralString return false;
2074 if re1.parse_flags & .FoldCase != re2.parse_flags & .FoldCase return false;
2076 if re2.op == .Literal {
2077 // convert into string
2079 re2.op = .LiteralString;
2080 re2.runes.count = 0;
2081 re2.runes.data = null;
2082 add_rune_to_string(re2, rune);
2085 // push re1 into re2.
2086 if (re1.op == .Literal) {
2087 add_rune_to_string(re2, re1.rune);
2090 add_rune_to_string(re2, it);
2092 array_free(re1.runes);
2095 // reuse re1 if possible
2099 re1.parse_flags = flags;
2104 remove_regexp(*ps.pool, re1);
2108add_rune_to_string :: (re: *Regexp_Node, r: Rune) {
2109 assert(re.op == .LiteralString);
2110 if re.runes.count == 0 {
2111 array_reserve(*re.runes, 8);
2113 array_add(*re.runes, r);
2118// Parses a decimal integer, storing it in *np.
2119// Sets *s to span the remainder of the string.
2120parse_integer :: (s: *string) -> int, success: bool {
2121 if !s.count || !isdigit((s.*)[0]) return 0, false;
2123 // Disallow leading zeros.
2124 if s.count >= 2 && (s.*)[0] == #char "0" && isdigit((s.*)[1]) return 0, false;
2127 while s.count && isdigit((s.*)[0]) {
2130 if n >= 100000000 return 0, false;
2131 n = n*10 + c - #char "0";
2137// Parses a repetition suffix like {1,2} or {2} or {2,}.
2138// Sets *s to span the remainder of the string on success.
2139// Sets *lo and *hi to the given range.
2140// In the case of {2,}, the high number is unbounded;
2141// sets *hi to -1 to signify this.
2142// {,2} is NOT a valid suffix.
2143// The Maybe in the name signifies that the regexp parse
2144// doesn't fail even if ParseRepetition does, so the StringPiece
2145// s must NOT be edited unless MaybeParseRepetition returns true.
2146maybe_parse_repetition :: (sp: *string) -> lo: int, hi: int, success: bool {
2148 if !s.count || s[0] != #char "{" return 0, 0, false;
2152 lo, success, remainder := to_integer(s);
2153 if !success return 0, 0, false;
2154 if !remainder return 0, 0, false;
2158 if s[0] == #char "," {
2160 if !s return 0, 0, false;
2162 if s[0] == #char "}" {
2163 // {2,} means at least 2
2166 // {2,4} means 2, 3, or 4.
2167 hi, success, remainder = to_integer(s);
2168 if !success return 0, 0, false;
2172 // {2} means exactly two
2175 if !s.count || s[0] != #char "}" return 0, 0, false;
2179 return lo, hi, true;
2182// @ToDo: Move to unicode module?
2183consume_rune :: (str: *string) -> Rune, len: int, status: Status {
2184 result, len := rune_from_string(str.*);
2185 if !len return 0, 0, make_status(.BadUTF8);
2188 return result, len, .{};
2191// @Speed:: Using my own unicode#is_valid_utf8 would probably be faster
2192is_valid_utf8 :: (str: string) -> bool, status: Status {
2195 result, len, status := consume_rune(*t);
2196 if !len return false, status;
2201// Convert hex digit to value.
2202hex_value :: (c: Rune) -> int, success: bool {
2203 if #char "0" <= c && c <= #char "9" return c - #char "0", true;
2204 if #char "A" <= c && c <= #char "F" return c - #char "A" + 10, true;
2205 if #char "a" <= c && c <= #char "f" return c - #char "a" + 10, true;
2210// Parse an escape sequence (e.g., \n, \{).
2211// Sets *s to span the remainder of the string.
2212// Sets *rp to the named character.
2213parse_escape :: (s: *string, rune_max: s32) -> r: Rune, status: Status, success: bool {
2214 bad_escape :: (begin: string, s: *string) -> Status {
2216 status.code = .BadEscape;
2217 status.error_arg = slice(begin, 0, begin.count - s.count);
2221 assert(s.count && (s.*)[0] == #char "\\");
2224 return 0, make_status(.TrailingBackslash), false;
2229 advance(s); // backslash
2231 c, len, status := consume_rune(s);
2232 if !len return 0, status, false;
2236 if c < Runeself && !isalpha(c) && !isdigit(c) {
2237 // Escaped non-word characters are always themselves.
2238 // PCRE is not quite so rigorous: it accepts things like
2239 // \q, but we don't. We once rejected \_, but too many
2240 // programs and people insist on using it, so allow \_.
2241 return c, status, true;
2243 return 0, bad_escape(begin, s), false;
2246 case #char "1"; #through;
2247 case #char "2"; #through;
2248 case #char "3"; #through;
2249 case #char "4"; #through;
2250 case #char "5"; #through;
2251 case #char "6"; #through;
2253 // Single non-zero octal digit is a backreference; not supported.
2254 if !s.count || (s.*)[0] < #char "0" || (s.*)[0] > #char "7" return 0, bad_escape(begin, s), false;
2257 // consume up to three octal digits; already have one.
2258 code := c - #char "0";
2259 if !s.count return 0, bad_escape(begin, s), false;
2261 if #char "0" <= c && c <= #char "7" {
2262 code = code * 8 + c - #char "0";
2263 advance(s); // digit
2266 if #char "0" <= c && c <= #char "7" {
2267 code = code * 8 + c - #char "0";
2268 advance(s); // digit
2272 if code > rune_max return 0, bad_escape(begin, s), false;
2274 return code, status, true;
2276 // Hexadecimal escapes
2278 if !s.count return 0, bad_escape(begin, s), false;
2280 c, len, status = consume_rune(s);
2281 if !len return 0, status, false;
2283 // Any number of digits in braces.
2284 // Update n as we consume the string, so that
2285 // the whole thing gets shown in the error message.
2286 // Perl accepts any text at all; it ignores all text
2287 // after the first non-hex digit. We require only hex digits,
2288 // and at least one.
2289 c, len, status = consume_rune(s);
2290 if !len return 0, status, false;
2294 hval, success := hex_value(c);
2297 code = cast(Rune)(code * 16 + hval);
2298 if code > rune_max return 0, bad_escape(begin, s), false;
2299 if !s.count return 0, bad_escape(begin, s), false;
2300 c, len, status = consume_rune(s);
2301 if !len return 0, status, false;
2302 hval, success = hex_value(c);
2304 if c != #char "}" || nhex == 0 return 0, bad_escape(begin, s), false;
2305 return code, status, true;
2307 // Easy case: two hex digits.
2308 if !s.count return 0, bad_escape(begin, s), false;
2309 c1, len, status := consume_rune(s);
2310 if !len return 0, status, false;
2311 hc0, success0 := hex_value(c);
2312 hc1, success1 := hex_value(c);
2313 if !success0 || success1 return 0, bad_escape(begin, s), false;
2314 return cast(Rune)(hc0 * 16 + hc1), status, true;
2319 return #char "\n", status, true;
2321 return #char "\r", status, true;
2323 return #char "\t", status, true;
2325 // Less common C escapes.
2326 // @ToDO: Implement?
2328 // return #char "\a", status, true;
2329 return 0, bad_escape(begin, s), false;
2331 // return #char "\f", status, true;
2332 return 0, bad_escape(begin, s), false;
2334 // return #char "\v", status, true;
2335 return 0, bad_escape(begin, s), false;
2337 // This code is disabled to avoid misparsing
2338 // the Perl word-boundary \b as a backspace
2339 // when in POSIX regexp mode. Surprisingly,
2340 // in Perl, \b means word-boundary but [\b]
2341 // means backspace. We don't support that:
2342 // if you want a backspace embed a literal
2343 // backspace character or use \x08.
2346 // *rp = #char "\b";
2351// Add a range to the character class, but exclude newline if asked.
2352// Also handle case folding.
2353add_range_flags :: (using char_class: *CharClass, lo: Rune, hi: Rune, parse_flags: ParseFlags) {
2354 // Take out \n if the flags say so.
2355 cutnl := !(parse_flags & .ClassNL) || (parse_flags & .NeverNL);
2356 if cutnl && lo <= #char "\n" && #char "\n" <= hi {
2357 if lo < #char "\n" {
2358 add_range_flags(char_class, lo, #char "\n" - 1, parse_flags);
2360 if hi > #char "\n" {
2361 add_range_flags(char_class, #char "\n" + 1, hi, parse_flags);
2366 // If folding case, add fold-equivalent characters too.
2367 if parse_flags & .FoldCase {
2368 add_folded_range(char_class, lo, hi, 0);
2370 add_range(char_class, lo, hi);
2374// Look for a group with the given name.
2375lookup_group :: (name: string, groups: [] UGroup) -> *UGroup {
2377 if it.name == name return it;
2382// Look for a POSIX group with the given name (e.g., "[:^alpha:]")
2383lookup_posix_group :: (name: string) -> *UGroup {
2384 return lookup_group(name, posix_groups);
2386lookup_perl_group :: (name: string) -> *UGroup {
2387 return lookup_group(name, perl_groups);
2391 // Fake UGroup containing all Runes
2392 any16 :: URange16.{0, 65535};
2393 any32 :: URange32.{65535, Runemax};
2394 any16array :: URange16.[any16];
2395 any32array :: URange32.[any32];
2396 anygroup :: UGroup.{"Any", 1, any16array, any32array};
2398 // Look for a Unicode group with the given name (e.g., "Han")
2399 lookup_unicode_group :: (name: string) -> *UGroup {
2403 return lookup_group(name, unicode_groups);
2407// Add a UGroup or its negation to the character class.
2408// @ToDo: Why is sign not a boolean?
2409add_unicode_group :: (cc: *CharClass, g: *UGroup, sign: int, parse_flags: ParseFlags) {
2411 for g.r16 add_range_flags(cc, it.lo, it.hi, parse_flags);
2412 for g.r32 add_range_flags(cc, it.lo, it.hi, parse_flags);
2414 if parse_flags & .FoldCase {
2415 // Normally adding a case-folded group means
2416 // adding all the extra fold-equivalent runes too.
2417 // But if we're adding the negation of the group,
2418 // we have to exclude all the runes that are fold-equivalent
2419 // to what's already missing. Too hard, so do in two steps.
2420 char_class1: CharClass;
2421 // @ToDo: Cleanup char_class1
2422 add_unicode_group(*char_class1, g, 1, parse_flags);
2423 // If the flags say to take out \n, put it in, so that negating will take it out.
2424 // Normally AddRangeFlags does this, but we're bypassing AddRangeFlags.
2425 cutnl := !(parse_flags & .ClassNL) || (parse_flags & .NeverNL);
2427 add_range(*char_class1, #char "\n", #char "\n");
2429 negate(*char_class1);
2430 add_char_class(cc, char_class1);
2436 add_range_flags(cc, next, it.lo - 1, parse_flags);
2442 add_range_flags(cc, next, it.lo - 1, parse_flags);
2446 if next <= Runemax {
2447 add_range_flags(cc, next, Runemax, parse_flags);
2453// Maybe parse a Perl character class escape sequence.
2454// Only recognizes the Perl character classes (\d \s \w \D \S \W),
2455// not the Perl empty-string classes (\b \B \A \Z \z).
2456// On success, sets *s to span the remainder of the string
2457// and returns the corresponding UGroup.
2458// The StringPiece must *NOT* be edited unless the call succeeds.
2459maybe_parse_perl_cc_escape :: (s: *string, parse_flags: ParseFlags) -> *UGroup {
2460 if !(parse_flags & .PerlClasses) return null;
2461 if s.count < 2 || (s.*)[0] != #char "\\" return null;
2463 // Could use StringPieceToRune, but there aren't
2464 // any non-ASCII Perl group names.
2465 name := slice(s.*, 0, 2);
2466 g := lookup_perl_group(name);
2467 if g == null return null;
2468 advance(s, name.count);
2472ParseResult :: enum {
2473 // @ToDo: Rename those
2474 kParseOk; // Did some parsing.
2475 kParseError; // Found an error.
2476 kParseNothing; // Decided not to parse.
2479// Maybe parses a Unicode character group like \p{Han} or \P{Han}
2480// (the latter is a negated group).
2481maybe_parse_unicode_group :: (s: *string, parse_flags: ParseFlags, cc: *CharClass) -> status: Status, result: ParseResult {
2483 // Decide whether to parse.
2484 if !(parse_flags & .UnicodeGroups) return status, .kParseNothing;
2485 if s.count < 2 || (s.*)[0] != #char "\\" return status, .kParseNothing;
2488 if c != #char "p" && c != #char "P" return status, .kParseNothing;
2490 // Committed to parse. Results:
2491 sign: int = 1; // -1 = negated char class
2492 if (c == #char "P") {
2495 seq := s.*; // \p{Han} or \pL
2496 name: string; // Han or L
2497 advance(s, 2); // '\\', 'p'
2500 if (s.*)[0] != #char "{" {
2503 c, len, status = consume_rune(s);
2504 if !len return status, .kParseError;
2506 // Name is just the single rune we just parsed
2507 name = slice(seq, 2, len);
2509 // Name is in braces. Look for closing }
2510 end := find_index_from_left(s.*, #char "}");
2513 valid, status = is_valid_utf8(seq);
2514 if !valid return status, .kParseError;
2516 return make_status(.BadCharRange, seq), .kParseError;
2518 name = slice(s.*, 1, end - 1); // without '}'
2519 advance(s, end + 1);
2521 valid, status = is_valid_utf8(name);
2522 if !valid return status, .kParseError;
2525 // Chop seq where s now begins.
2526 seq.count = seq.count - s.count;
2528 if name && name[0] == #char "^" {
2530 advance(*name); // '^'
2534 // Look up the group in the RE2 Unicode data.
2535 g := lookup_unicode_group(name);
2537 status.code = .BadCharRange;
2538 status.error_arg = seq;
2539 return status, .kParseError;
2542 add_unicode_group(cc, g, sign, parse_flags);
2544 // // Look up the group in the ICU Unicode data. Because ICU provides full
2545 // // Unicode properties support, this could be more than a lookup by name.
2546 // ::icu::UnicodeString ustr = ::icu::UnicodeString::fromUTF8(
2547 // std::string("\\p{") + std::string(name) + std::string("}"));
2548 // UErrorCode uerr = U_ZERO_ERROR;
2549 // ::icu::UnicodeSet uset(ustr, uerr);
2550 // if (U_FAILURE(uerr)) {
2551 // status.set_code(.BadCharRange);
2552 // status.set_error_arg(seq);
2553 // return .kParseError;
2556 // // Convert the UnicodeSet to a URange32 and UGroup that we can add.
2557 // int nr = uset.getRangeCount();
2558 // PODArray<URange32> r(nr);
2559 // for (int i = 0; i < nr; i++) {
2560 // r[i].lo = uset.getRangeStart(i);
2561 // r[i].hi = uset.getRangeEnd(i);
2563 // UGroup g = {"", +1, 0, 0, r.data(), nr};
2564 // AddUGroup(cc, &g, sign, parse_flags);
2567 return status, .kParseOk;
2570// Parses a character class name like [:alnum:].
2571// Sets *s to span the remainder of the string.
2572// Adds the ranges corresponding to the class to ranges.
2573parse_cc_name :: (s: *string, parse_flags: ParseFlags, cc: *CharClass) -> status: Status, result: ParseResult {
2574 // Check begins with [:
2575 if s.count < 2 || (s.*)[0] != #char "[" || (s.*)[1] != #char ":" return Status.{}, .kParseNothing;
2577 // Look for closing :].
2578 end := find_index_from_left(s.*, ":]");
2580 // If no closing :], then ignore.
2581 if !end return Status.{}, .kParseNothing;
2583 // Got it. Check that it's valid.
2584 name := slice(s.*, 2, end - 2);
2585 g := lookup_posix_group(name);
2587 return make_status(.BadCharRange, name), .kParseError;
2590 advance(s, name.count + 4);
2591 add_unicode_group(cc, g, g.sign, parse_flags);
2592 return Status.{}, .kParseOk;
2595// Parses a character inside a character class.
2596// There are fewer special characters here than in the rest of the regexp.
2597// Sets *s to span the remainder of the string.
2598parse_cc_character :: (using ps: *ParseState, s: *string, whole_class: string) -> rune: Rune, status: Status, success: bool {
2599 if !s return 0, make_status(.MissingBracket, whole_class), false;
2601 // Allow regular escape sequences even though
2602 // many need not be escaped in this context.
2603 if (s.*)[0] == #char "\\" {
2604 rune, status, success := parse_escape(s, rune_max);
2605 return rune, status, success;
2607 // Otherwise take the next rune.
2608 rune, len, status := consume_rune(s);
2609 return rune, status, len > 0;
2613// Parses a character class character, or, if the character
2614// is followed by a hyphen, parses a character class range.
2615// For single characters, rr.lo == rr.hi.
2616// Sets *s to span the remainder of the string.
2617parse_cc_range :: (using ps: *ParseState, s: *string, whole_class: string) -> RuneRange, status: Status, success: bool {
2621 rr.lo, status, success = parse_cc_character(ps, s, whole_class);
2622 if !success return rr, status, false;
2624 // [a-] means (a|-), so check for final ].
2625 if s.count >= 2 && (s.*)[0] == #char "-" && (s.*)[1] != #char "]" {
2626 advance(s); // #char "-"
2628 rr.hi, status, success = parse_cc_character(ps, s, whole_class);
2629 if !success return rr, status, false;
2632 return rr, make_status(.BadCharRange, slice(os, 0, os.count - s.count)), false;
2637 return rr, status, true;
2640// Parses a possibly-negated character class expression like [^abx-z[:digit:]].
2641// Sets *s to span the remainder of the string.
2642// Sets *out_re to the regexp for the class.
2643parse_char_class :: (using ps: *ParseState, s: *string) -> *Regexp_Node, status: Status, success: bool {
2645 // Caller checked this.
2646 assert(s.count && (s.*)[0] == #char "[");
2649 re := new_regexp(*ps.pool, .CharClass, flags & ~.FoldCase);
2650 re.char_class = New(CharClass);
2651 re.char_class_finished = false;
2653 if s.count && (s.*)[0] == #char "^" {
2656 if !(flags & .ClassNL) || flags & .NeverNL {
2657 // If NL can't match implicitly, then pretend
2658 // negated classes include a leading \n.
2659 add_range(re.char_class, #char "\n", #char "\n");
2662 first := true; // ] is okay as first char in class
2663 while s.count && (first || (s.*)[0] != #char "]") {
2664 // - is only okay unescaped as first or last in class.
2665 // Except that Perl allows - anywhere.
2666 if !first && (s.*)[0] == #char "-" && !(flags & .PerlX) && (s.count == 1 || (s.*)[1] != #char "]") {
2667 remove_regexp(*ps.pool, re);
2671 r, n, status := consume_rune(*t);
2673 return null, status, false;
2676 return null, make_status(.BadCharRange, slice(s.*, 0, 1 + n)), false;
2680 // Look for [:alnum:] etc.
2681 if s.count > 2 && (s.*)[0] == #char "[" && (s.*)[1] == #char ":" {
2682 status, result := parse_cc_name(s, flags, re.char_class);
2683 if #complete result == {
2687 remove_regexp(*ps.pool, re);
2688 return null, status, false;
2689 case .kParseNothing;
2693 // Look for Unicode character group like \p{Han}
2694 if s.count > 2 && (s.*)[0] == #char "\\" && ((s.*)[1] == #char "p" || (s.*)[1] == #char "P") {
2695 status, result := maybe_parse_unicode_group(s, flags, re.char_class);
2696 if #complete result == {
2700 remove_regexp(*ps.pool, re);
2701 return null, status, false;
2702 case .kParseNothing;
2706 // Look for Perl character class symbols (extension).
2707 g := maybe_parse_perl_cc_escape(s, flags);
2709 add_unicode_group(re.char_class, g, g.sign, flags);
2713 // Otherwise assume single character or simple range.
2714 rr, status, success := parse_cc_range(ps, s, whole_class);
2716 remove_regexp(*ps.pool, re);
2717 return null, status, false;
2720 // AddRangeFlags is usually called in response to a class like
2721 // \p{Foo} or [[:foo:]]; for those, it filters \n out unless
2722 // Regexp_Node::ClassNL is set. In an explicit range or singleton
2723 // like we just parsed, we do not filter \n out, so set ClassNL
2725 add_range_flags(re.char_class, rr.lo, rr.hi, flags | .ClassNL);
2729 remove_regexp(*ps.pool, re);
2730 return null, make_status(.MissingBracket, whole_class), false;
2735 negate(re.char_class);
2738 return re, .{}, true;
2741// Is this a valid capture name? [A-Za-z0-9_]+
2742// PCRE limits names to 32 bytes.
2743// Python rejects names starting with digits.
2744// We don't enforce either of those.
2745is_valid_capture_name :: (name: string) -> bool {
2746 if !name return false;
2747 for 0..name.count - 1 {
2749 if !((#char "0" <= c && c <= #char "9") ||
2750 (#char "a" <= c && c <= #char "z") ||
2751 (#char "A" <= c && c <= #char "Z") ||
2758// Parses a Perl flag setting or non-capturing group or both,
2759// like (?i) or (?: or (?i:. Removes from s, updates parse state.
2760// The caller must check that s begins with "(?".
2761// Returns true on success. If the Perl flag is not
2762// well-formed or not supported, sets status and returns false.
2763parse_perl_flags :: (using ps: *ParseState, s: *string) -> Status, success: bool {
2764 // Caller is supposed to check this.
2765 assert(flags & .PerlX && s.count >= 2 && (s.*)[0] == #char "(" && (s.*)[1] == #char "?");
2768 advance(*t, 2); // "(?"
2770 // Check for named captures, first introduced in Python's regexp library.
2771 // As usual, there are three slightly different syntaxes:
2773 // (?P<name>expr) the original, introduced by Python
2774 // (?<name>expr) the .NET alteration, adopted by Perl 5.10
2775 // (?'name'expr) another .NET alteration, adopted by Perl 5.10
2777 // Perl 5.10 gave in and implemented the Python version too,
2778 // but they claim that the last two are the preferred forms.
2779 // PCRE and languages based on it (specifically, PHP and Ruby)
2780 // support all three as well. EcmaScript 4 uses only the Python form.
2782 // In both the open source world (via Code Search) and the
2783 // Google source tree, (?P<expr>name) is the dominant form,
2784 // so that's the one we implement. One is enough.
2785 if (t.count > 2 && t[0] == #char "P" && t[1] == #char "<") {
2788 end := find_index_from_left(t, #char ">");
2790 valid, status := is_valid_utf8(s.*);
2791 if !valid return status, false;
2792 return make_status(.BadNamedCapture, s.*), false;
2795 // t is "name>...", t[end] == '>'
2796 capture := slice(s.*, 2, end + 3);
2797 name := slice(t, 0, end);
2798 valid, status := is_valid_utf8(name);
2799 if !valid return status, false;
2801 if !is_valid_capture_name(name) return make_status(.BadNamedCapture, capture), false;
2803 do_left_paren(ps, name);
2805 advance(s, capture.count + 2);
2806 return status, true;
2814 if !t.count return make_status(.BadPerlOp, s.*), false;
2815 c, len, status := consume_rune(*t);
2816 if !len return status, false;
2819 return make_status(.BadPerlOp, slice(s.*, 0, s.count - t.count)), false;
2824 nflags &= ~.FoldCase;
2826 nflags |= .FoldCase;
2829 case #char "m"; // opposite of our OneLine
2834 nflags &= ~.OneLine;
2848 nflags &= ~.NonGreedy;
2850 nflags |= .NonGreedy;
2855 if negated return make_status(.BadPerlOp, slice(s.*, 0, s.count - t.count)), false;
2862 do_left_paren_no_capture(ps);
2871 if negated && !sawflags return make_status(.BadPerlOp, slice(s.*, 0, s.count - t.count)), false;
2878// // Converts latin1 (assumed to be encoded as Latin1 bytes)
2879// // into UTF8 encoding in string.
2880// // Can't use EncodingUtils::EncodeLatin1AsUTF8 because it is
2881// // deprecated and because it rejects code points 0x80-0x9F.
2882// void ConvertLatin1ToUTF8(const StringPiece& latin1, std::string* utf) {
2886// for (size_t i = 0; i < latin1.size(); i++) {
2887// Rune r = latin1[i] & 0xFF;
2888// int n = runetochar(buf, &r);
2889// utf.append(buf, n);
2893// Actually C library functions, but it feels stupid to call out to libc for that
2894isalpha :: (c: s32) -> bool {
2895 return (c >= #char "A" && c <= #char "Z") || (c >= #char "a" && c <= #char "z");
2897isdigit :: (c: s32) -> bool {
2898 return c >= #char "0" && c <= #char "9";
2915#import "Bucket_Array";