git ptrace.dev


        
            
            
        
        0// This parser is based on RE2’s parser
1// https://github.com/google/re2
2
3ParseFlags :: enum_flags u16 {
4	NoParseFlags :: 0;
5	FoldCase     :: 1<<0;   // Fold case during matching (case-insensitive).
6	// Literal      :: 1<<1;   // Treat s as literal string instead of a regexp.
7	ClassNL      :: 1<<2;   // Allow char classes like [^a-z] and \D and \s
8	// and [[:space:]] to match newline.
9	DotNL        :: 1<<3;   // Allow . to match newline.
10	MatchNL      :: ClassNL | DotNL;
11	OneLine      :: 1<<4;   // Treat ^ and $ as only matching at beginning and
12	// end of text, not around embedded newlines.
13	// (Perl's default)
14	// Latin1       :: 1<<5;   // Regexp_Node and text are in Latin1, not UTF-8.
15	NonGreedy    :: 1<<6;   // Repetition operators are non-greedy by default.
16	PerlClasses  :: 1<<7;   // Allow Perl character classes like \d.
17	PerlB        :: 1<<8;   // Allow Perl's \b and \B.
18	PerlX        :: 1<<9;   // Perl extensions:
19	//   non-capturing parens - (?: )
20	//   non-greedy operators - *? +? ?? {}?
21	//   flag edits - (?i) (?-i) (?i: )
22	//     i - FoldCase
23	//     m - !OneLine
24	//     s - DotNL
25	//     U - NonGreedy
26	//   line ends: \A \z
27	//   \Q and \E to disable/enable metacharacters
28	//   (?P<name>expr) for named captures
29	//   \C to match any single byte
30	UnicodeGroups :: 1<<10;  // Allow \p{Han} for Unicode Han group
31	//   and \P{Han} for its negation.
32	NeverNL      :: 1<<11;  // Never match NL, even if the regexp mentions
33	//   it explicitly.
34	NeverCapture :: 1<<12;  // Parse all parens as non-capturing.
35
36	// As close to Perl as we can get.
37	LikePerl      :: ClassNL | OneLine | PerlClasses | PerlB | PerlX |
38	UnicodeGroups;
39
40	// Internal use only.
41	WasDollar     :: 1<<13;  // on EndText: was $ in regexp text
42	AllParseFlags :: (1<<14)-1;
43};
44
45
46Regexp_Node :: struct {
47	// Operator.
48	op: RegexpOp;
49	// Is this regexp structure already simple
50	// (has it been returned by Simplify)?
51	simple: bool;
52
53	// Flags saved from parsing and used during execution.
54	// (Only FoldCase is used.)
55	parse_flags: ParseFlags;
56
57	// Subexpressions.
58	// Concat and Alternate handle larger numbers of subexpressions
59	// by building concatenation or alternation trees.
60	// Other routines should call Concat or Alternate instead of
61	// filling in sub() by hand.
62	MAX_SUBS :: 0xFFFF;
63	subs: [] *Regexp_Node;
64	// Extra space for parse and teardown stacks.
65	down: *Regexp_Node;
66
67	// Arguments to operator.
68	using args: union {
69		struct {  // Repeat
70			max: s16;
71			min: s16;
72		}
73		struct {  // Capture
74			cap: s32;
75			name: string;
76		}
77		runes: [..] Rune; // LiteralString
78		struct { // CharClass
79			char_class: *CharClass;
80			char_class_finished: bool;
81		}
82		rune: Rune;  // Literal
83		// @ToDo Verify size is ok
84		match_id: int;  // HaveMatch
85		// void *the_union_[2];  // as big as any other element, for memset
86	};
87}
88
89RegexpOp :: enum {
90	// @ToDo: Rename all options
91
92	// Matches no strings.
93	NoMatch :: 1;
94
95	// Matches empty string.
96	EmptyMatch;
97
98	// Matches rune_.
99	Literal;
100
101	// Matches runes_.
102	LiteralString;
103
104	// Matches concatenation of sub_[0..nsub-1].
105	Concat;
106	// Matches union of sub_[0..nsub-1].
107	Alternate;
108
109	// Matches sub_[0] zero or more times.
110	Star;
111	// Matches sub_[0] one or more times.
112	Plus;
113	// Matches sub_[0] zero or one times.
114	Quest;
115
116	// Matches sub_[0] at least min_ times, at most max_ times.
117	// max_ == -1 means no upper limit.
118	Repeat;
119
120	// Parenthesized (capturing) subexpression.  Index is cap_.
121	// Optionally, capturing name is name_.
122	Capture;
123
124	// Matches any character.
125	AnyChar;
126
127	// Matches any byte [sic].
128	AnyByte;
129
130	// Matches empty string at beginning of line.
131	BeginLine;
132	// Matches empty string at end of line.
133	EndLine;
134
135	// Matches word boundary "\b".
136	WordBoundary;
137	// Matches not-a-word boundary "\B".
138	NoWordBoundary;
139
140	// Matches empty string at beginning of text.
141	BeginText;
142	// Matches empty string at end of text.
143	EndText;
144
145	// Matches character class given by cc_.
146	CharClass;
147
148	// Forces match of entire expression right now,
149	// with match ID match_id_ (used by RE2::Set).
150	HaveMatch;
151
152	// The ones below are only used internally by the parser and don’t occur in finshed RegExps
153	kLeftParen;
154	kVerticalBar;
155};
156
157Regexp_Pool :: struct {
158	pool: *Pool;
159	bucket: Bucket_Array(Regexp_Node, 64);
160}
161
162init :: (pool: *Regexp_Pool) {
163	pool.pool = New(Pool);
164	set_allocators(pool.pool);
165	#if REGEXP_DEBUG	pool.pool.overwrite_memory = true;
166	pool.bucket.allocator.proc = pool_allocator_proc;
167	pool.bucket.allocator.data = pool.pool;
168}
169
170uninit :: (pool: *Regexp_Pool) {
171	release(pool.pool);
172	free(pool.pool);
173}
174
175#scope_module
176
177new_regexp :: (pool: *Regexp_Pool, op: RegexpOp, flags: ParseFlags) -> *Regexp_Node {
178	re := find_and_occupy_empty_slot(*pool.bucket);
179	// Need to clear, in case we re-use a bucket slot
180	// @ToDo, @Speed: This is unnecessary when its a fresh slot. Find a better way
181	memset(re, 0, size_of(Regexp_Node));
182	re.op = op;
183	re.parse_flags = flags;
184	return re;
185}
186
187remove_regexp :: (pool: *Regexp_Pool, re: *Regexp_Node) {
188	// @ToDo: Delete subs!
189	// @ToDO: Delete char_class!
190	for * pool.bucket {
191		if it == re {
192			remove it;
193			break;
194		}
195	}
196}
197
198// Tests equality of all top-level structure but not subregexps.
199regexp_top_equal :: (a: *Regexp_Node, b: *Regexp_Node) -> bool {
200	if a.op != b.op	 return false;
201
202	if a.op == {
203		case .NoMatch; #through;
204		case .EmptyMatch; #through;
205		case .AnyChar; #through;
206		case .AnyByte; #through;
207		case .BeginLine; #through;
208		case .EndLine; #through;
209		case .WordBoundary; #through;
210		case .NoWordBoundary; #through;
211		case .BeginText;
212			return true;
213
214		case .EndText;
215			// The parse flags remember whether it's \z or (?-m:$),
216			// which matters when testing against PCRE.
217			return ((a.parse_flags ^ b.parse_flags) & .WasDollar) == 0;
218
219		case .Literal;
220			return a.rune == b.rune && ((a.parse_flags ^ b.parse_flags) & .FoldCase) == 0;
221
222		case .LiteralString;
223			return a.runes.count == b.runes.count && ((a.parse_flags ^ b.parse_flags) & .FoldCase) == 0 &&
224				memcmp(a.runes.data, b.runes.data, a.runes.count * size_of(Rune)) == 0;
225
226		case .Alternate; #through;
227		case .Concat;
228			return a.subs.count == b.subs.count;
229
230		case .Star; #through;
231		case .Plus; #through;
232		case .Quest;
233			return ((a.parse_flags ^ b.parse_flags) & .NonGreedy) == 0;
234
235		case .Repeat;
236			return ((a.parse_flags ^ b.parse_flags) & .NonGreedy) == 0 && a.min == b.min && a.max == b.max;
237
238		case .Capture;
239			return a.cap == b.cap && a.name == b.name;
240
241		case .HaveMatch;
242			return a.match_id == b.match_id;
243
244		case .CharClass;
245			return a.char_class.nrunes == b.char_class.nrunes &&
246				a.char_class.ranges.count == b.char_class.ranges.count &&
247				memcmp(a.char_class.ranges.data, b.char_class.ranges.data, a.char_class.ranges.count * size_of(RuneRange)) == 0;
248		case;
249			assert(false, "Unexpected op: %", a.op);
250			return false;
251	}
252}
253
254regexp_equal :: (a: *Regexp_Node, b: *Regexp_Node) -> bool {
255	if a == null || b == null		return a == b;
256
257	if !regexp_top_equal(a, b)	return false;
258
259	// The stack has pairs of regexps waiting to
260	// be compared.  The regexps are only equal if
261	// all the pairs end up being equal.
262	stack: [..] *Regexp_Node;
263    stack.allocator = temp;
264
265	a1 := a;
266	b1 := b;
267	while true {
268		// Invariant: TopEqual(a1, b1) == true.
269		a2: *Regexp_Node;
270		b2: *Regexp_Node;
271		if a1.op == {
272			case .Alternate; #through;
273			case .Concat;
274				for i: 0..a1.subs.count-1 {
275					a2 = a1.subs[i];
276					b2 = b1.subs[i];
277					if !regexp_top_equal(a2, b2)	return false;
278
279					array_add(*stack, a2);
280					array_add(*stack, b2);
281				}
282
283			case .Star; #through;
284			case .Plus; #through;
285			case .Quest; #through;
286			case .Repeat; #through;
287			case .Capture;
288				a2 = a1.subs[0];
289				b2 = b1.subs[0];
290				if !regexp_top_equal(a2, b2)	return false;
291				// Really:
292				//   stack.push_back(a2);
293				//   stack.push_back(b2);
294				//   break;
295				// but faster to assign directly and loop.
296				a1 = a2;
297				b1 = b2;
298				continue;
299			case;
300				return true;
301		}
302
303		if !stack.count		return true;
304
305		a1 = stack[stack.count-2];
306		b1 = stack[stack.count-1];
307		stack.count -= 2;
308	}
309
310	return true;
311}
312
313// Determines whether regexp matches must be anchored
314// with a fixed string prefix.  If so, returns the prefix and
315// the regexp that remains after the prefix.  The prefix might
316// be ASCII case-insensitive.
317required_prefix :: (re: *Regexp_Node, pool: *Regexp_Pool) -> prefix: string, foldcase: bool, suffix: *Regexp_Node {
318	// No need for a walker: the regexp must be of the form
319	// 1. some number of ^ anchors
320	// 2. a literal char or string
321	// 3. the rest
322	suffix := re;
323	if re.op != .Concat		return "", false, suffix;
324	i := 0;
325	while i < re.subs.count && re.subs[i].op == .BeginText {
326		i += 1;
327	}
328	if i == 0 || i >= re.subs.count		return "", false, suffix;
329
330	to_analyze := re.subs[i];
331	if to_analyze.op != .Literal && to_analyze.op != .LiteralString		return "", false, suffix;
332
333	i += 1;
334	if (i < re.subs.count) {
335		suffix = new_concat(pool, array_view(re.subs, i, re.subs.count - i), re.parse_flags);
336	} else {
337		suffix = new_regexp(pool, .EmptyMatch, re.parse_flags);
338	}
339
340	runes: [] Rune;
341	if to_analyze.op == .Literal {
342		runes.data = *to_analyze.rune;
343		runes.count = 1;
344	} else {
345		runes = to_analyze.runes;
346	}
347
348	{
349		push_allocator(pool_allocator_proc, pool.pool);
350		prefix := string_from_runes(runes);
351		return prefix, (to_analyze.parse_flags & .FoldCase) != 0, suffix;
352	}
353}
354
355required_prefix_for_accel :: (re: Regexp_Node) -> prefix: string, foldcase: bool {
356	// No need for a walker: the regexp must either begin with or be
357	// a literal char or string.
358	to_analyze := *re;
359	if re.op == .Concat && re.subs.count > 0 {
360		to_analyze = re.subs[0];
361	}
362
363	if to_analyze.op != .Literal && to_analyze.op != .LiteralString	return "", false;
364
365	runes: [] Rune;
366	if to_analyze.op == .Literal {
367		runes.data = *to_analyze.rune;
368		runes.count = 1;
369	} else {
370		runes = to_analyze.runes;
371	}
372	prefix := string_from_runes(runes);
373
374	return prefix, (to_analyze.parse_flags & .FoldCase) != 0;
375}
376
377// @ToDo: rename these
378StatusCode :: enum {
379	// No error
380	Success :: 0;
381
382	// Unexpected error
383	InternalError;
384
385	// Parse errors
386	BadEscape;          // bad escape sequence
387	BadCharClass;       // bad character class
388	BadCharRange;       // bad character class range
389	MissingBracket;     // missing closing ]
390	MissingParen;       // missing closing )
391	UnexpectedParen;    // unexpected closing )
392	TrailingBackslash;  // at end of regexp
393	RepeatArgument;     // repeat argument missing, e.g. "*"
394	RepeatSize;         // bad repetition argument
395	RepeatOp;           // bad repetition operator
396	BadPerlOp;          // bad perl operator
397	BadUTF8;            // invalid UTF-8 in regexp
398	BadNamedCapture;    // bad named capture
399};
400
401// Regular expression parse state.
402// The list of parsed regexps so far is maintained as a vector of
403// Regexp_Node pointers called the stack.  Left parenthesis and vertical
404// bar markers are also placed on the stack, as Regexps with
405// non-standard opcodes.
406// Scanning a left parenthesis causes the parser to push a left parenthesis
407// marker on the stack.
408// Scanning a vertical bar causes the parser to pop the stack until it finds a
409// vertical bar or left parenthesis marker (not popping the marker),
410// concatenate all the popped results, and push them back on
411// the stack (DoConcatenation).
412// Scanning a right parenthesis causes the parser to act as though it
413// has seen a vertical bar, which then leaves the top of the stack in the
414// form LeftParen regexp VerticalBar regexp VerticalBar ... regexp VerticalBar.
415// The parser pops all this off the stack and creates an alternation of the
416// regexps (DoAlternation).
417ParseState :: struct {
418	flags: ParseFlags;
419	whole_regexp: string;
420	status: Status;
421	stacktop: *Regexp_Node;
422	ncap: s32;  // number of capturing parens seen
423	rune_max := Runemax;  // maximum char value for this encoding
424	pool: Regexp_Pool;
425}
426
427Status :: struct {
428	code: StatusCode;
429	error_arg: string;
430}
431
432make_status :: (code := StatusCode.Success, error_arg := "") -> Status {
433	status: Status;
434	status.code = code;
435	status.error_arg = error_arg;
436	return status;
437}
438
439rep :: (ps: *ParseState, op: RegexpOp, t: *string, lastunary: string) -> isunary: string, success: bool  {
440	opstr := t.*;
441	nongreedy := false;
442	advance(t);  // '*' or '+' or '?'
443	if (ps.flags & .PerlX) {
444		if (t.count && (t.*)[0] == #char "?") {
445			nongreedy = true;
446			advance(t);  // '?'
447		}
448		if (lastunary) {
449			// In Perl it is not allowed to stack repetition operators:
450			//   a** is a syntax error, not a double-star.
451			// (and a++ means something else entirely, which we don't support!)
452			ps.status.code = .RepeatOp;
453			ps.status.error_arg.data = lastunary.data;
454			ps.status.error_arg.count = t.data - lastunary.data;
455			return "", false;
456		}
457	}
458
459	opstr.count -= t.count;
460	if !push_repeat_op(ps, op, opstr, nongreedy)
461		return "", false;
462	return opstr, true;
463}
464
465// Parses the regular expression given by s,
466// returning the corresponding Regexp_Node tree.
467// Returns null on error.
468parse :: (s: string, global_flags: ParseFlags = .NoParseFlags) -> *Regexp_Node, status: Status, pool: Regexp_Pool {
469	ps: ParseState;
470	ps.whole_regexp = s;
471	ps.flags = global_flags;
472
473	init(*ps.pool);
474	push_allocator(pool_allocator_proc, ps.pool.pool);
475
476	lastunary: string;
477	t := s;
478	while t.count {
479		isunary: string;
480		if t[0] == {
481			case; {
482				r, len, status := consume_rune(*t);
483				if !len		return null, status, ps.pool;
484				push_literal(*ps, r);
485			}
486
487			case #char "(";
488				// "(?" introduces Perl escape.
489				if ps.flags & .PerlX && t.count >= 2 && t[1] == #char "?" {
490					// Flag changes and non-capturing groups.
491					success: bool;
492					ps.status, success = parse_perl_flags(*ps, *t);
493					if !success		return null, ps.status, ps.pool;
494				} else if ps.flags & .NeverCapture {
495					do_left_paren_no_capture(*ps);
496					advance(*t); // '('
497				} else {
498					do_left_paren(*ps);
499					advance(*t); // '('
500				}
501
502			case #char "|";
503				do_vertical_bar(*ps);
504				advance(*t);  // '|'
505
506			case #char ")";
507				if !do_right_paren(*ps)		return null, ps.status, ps.pool;
508				advance(*t);  // ')'
509
510			case #char "^";  // Beginning of line.
511				push_caret(*ps);
512				advance(*t);  // '^'
513
514			case #char "$";  // End of line.
515				push_dollar(*ps);
516				advance(*t);  // '$'
517
518			case #char ".";  // Any character (possibly except newline).
519				push_dot(*ps);
520				advance(*t);  // '.'
521
522			case #char "[";   // Character class.
523				re, status, success := parse_char_class(*ps, *t);
524				if !success					return null, status, ps.pool;
525				push_regexp(*ps, re);
526
527			case #char "*";  // Zero or more.
528				success: bool;
529				isunary, success = rep(*ps, .Star, *t, lastunary);
530				if !success		return null, ps.status, ps.pool;
531			case #char "+";  // One or more.
532				success: bool;
533				isunary, success = rep(*ps, .Plus, *t, lastunary);
534				if !success		return null, ps.status, ps.pool;
535			case #char "?";  // Zero or one.
536				success: bool;
537				isunary, success = rep(*ps, .Quest, *t, lastunary);
538				if !success		return null, ps.status, ps.pool;
539
540			case #char "{";   // Counted repetition.
541				opstr := t;
542				lo, hi, success := maybe_parse_repetition(*t);
543				if !success {
544					// Treat like a literal.
545					push_literal(*ps, #char "{");
546					advance(*t);  // '{'
547				} else {
548					nongreedy := false;
549					if ps.flags & .PerlX {
550						if t && t[0] == #char "?" {
551							nongreedy = true;
552							advance(*t);  // '?'
553						}
554						if lastunary {
555							// Not allowed to stack repetition operators.
556							ps.status.code = .RepeatOp;
557							ps.status.error_arg = slice(lastunary, 0, t.data - lastunary.data);
558							return null, ps.status, ps.pool;
559						}
560					}
561					opstr.count -= t.count;
562					if !push_repetition(*ps, lo, hi, opstr, nongreedy) {
563						return null, ps.status, ps.pool;
564					}
565					isunary = opstr;
566				}
567
568			case #char "\\";    // Escaped character or Perl sequence.
569								// \b and \B: word boundary or not
570				parsed := false;
571				if ps.flags & .PerlB &&
572						t.count >= 2 && (t[1] == #char "b" || t[1] == #char "B") {
573					if t[1] == #char "b" {
574						push_simple_op(*ps, .WordBoundary);
575					} else {
576						push_simple_op(*ps, .NoWordBoundary);
577					}
578					advance(*t, 2);  // '\\', 'b'
579					parsed = true;
580				} else if (ps.flags & .PerlX) && t.count >= 2 {
581					if (t[1] == #char "A") {
582						push_simple_op(*ps, .BeginText);
583						advance(*t, 2);  // '\\', 'A'
584						parsed = true;
585					} else if (t[1] == #char "z") {
586						push_simple_op(*ps, .EndText);
587						advance(*t, 2);  // '\\', 'z'
588						parsed = true;
589						// Do not recognize \Z, because this library can't
590						// implement the exact Perl/PCRE semantics.
591						// (This library treats "(?-m)$" as \z, even though
592						// in Perl and PCRE it is equivalent to \Z.)
593					} else if (t[1] == #char "C") {  // \C: any byte [sic]
594						push_simple_op(*ps, .AnyByte);
595						advance(*t, 2);  // '\\', 'C'
596						parsed = true;
597					} else if (t[1] == #char "Q") {  // \Q ... \E: the ... is always literals
598						advance(*t, 2);  // '\\', 'Q'
599						while t.count {
600							if (t.count >= 2 && t[0] == #char "\\" && t[1] == #char "E") {
601								advance(*t, 2);  // '\\', 'E'
602								parsed = true;
603							}
604							r, len, status := consume_rune(*t);
605							if !len		return null, status, ps.pool;
606							push_literal(*ps, r);
607						}
608						parsed = true;
609					}
610				}
611
612				if (!parsed && t.count >= 2 && (t[1] == #char "p" || t[1] == #char "P")) {
613					re := new_regexp(*ps.pool, .CharClass, ps.flags & ~.FoldCase);
614					re.char_class = New(CharClass);
615					re.char_class_finished = false;
616					parseStatus, parseResult := maybe_parse_unicode_group(*t, ps.flags, re.char_class);
617					if parseResult == {
618						case .kParseOk;
619							push_regexp(*ps, re);
620							lastunary = isunary;
621							parsed = true;
622						case .kParseError;
623							remove_regexp(*ps.pool, re);
624							ps.status = parseStatus;
625							return null, ps.status, ps.pool;
626						case .kParseNothing;
627							remove_regexp(*ps.pool, re);
628					}
629				}
630
631				if !parsed {
632					g := maybe_parse_perl_cc_escape(*t, ps.flags);
633					if g != null {
634						re := new_regexp(*ps.pool, .CharClass, ps.flags & ~.FoldCase);
635						re.char_class = New(CharClass);
636						re.char_class_finished = false;
637						add_unicode_group(re.char_class, g, g.sign, ps.flags);
638						push_regexp(*ps, re);
639						parsed = true;
640					}
641				}
642
643				if !parsed {
644					r, status, success := parse_escape(*t, ps.rune_max);
645					if !success		return null, status, ps.pool;
646					push_literal(*ps, r);
647				}
648		}
649	}
650	return do_finish(*ps), ps.status, ps.pool;
651}
652
653new_literal_string :: (pool: *Regexp_Pool, runes: [] Rune, flags: ParseFlags) -> *Regexp_Node {
654	if runes.count == {
655		case 0;
656			return new_regexp(pool, .EmptyMatch, flags);
657		case 1;
658			return new_literal(pool, runes[0], flags);
659		case;
660			re := new_regexp(pool, .LiteralString, flags);
661			for runes {
662				add_rune_to_string(re, it);
663			}
664			return re;
665	}
666}
667
668new_literal :: (pool: *Regexp_Pool, rune: Rune, flags: ParseFlags) -> *Regexp_Node {
669	re := new_regexp(pool, .Literal, flags);
670	re.rune = rune;
671	return re;
672}
673
674new_star_plus_quest :: (pool: *Regexp_Pool, op: RegexpOp, sub: *Regexp_Node, flags: ParseFlags) -> *Regexp_Node {
675	// Squash **, ++ and ??.
676	if (op == sub.op && flags == sub.parse_flags)		return sub;
677
678	// Squash *+, *?, +*, +?, ?* and ?+. They all squash to *, so because
679	// op is Star/Plus/Quest, we just have to check that sub.op() is too.
680	if ((sub.op == .Star ||
681			sub.op == .Plus ||
682			sub.op == .Quest) &&
683			flags == sub.parse_flags) {
684		// If sub is Star, no need to rewrite it.
685		if (sub.op == .Star)	 return sub;
686
687		re := new_regexp(pool, .Star, flags);
688		array_resize(*re.subs, 1, false);
689		re.subs[0] = sub.subs[0];
690		return re;
691	}
692
693	re := new_regexp(pool, op, flags);
694	array_resize(*re.subs, 1, false);
695	re.subs[0] = sub;
696	return re;
697}
698
699// @ToDo: Why does #bake_arguments not work here? (Compiler bug I can’t find a simple repro for)
700// new_plus :: #bake_arguments new_star_plus_quest(op = .Plus);
701// new_star :: #bake_arguments new_star_plus_quest(op = RegexpOp.Star);
702// new_quest :: #bake_arguments new_star_plus_quest(op = .Quest);
703new_plus:: (pool: *Regexp_Pool, sub: *Regexp_Node, flags: ParseFlags) -> *Regexp_Node {
704	return new_star_plus_quest(pool, .Plus, sub, flags);
705}
706new_star :: (pool: *Regexp_Pool, sub: *Regexp_Node, flags: ParseFlags) -> *Regexp_Node {
707	return new_star_plus_quest(pool, .Star, sub, flags);
708}
709new_quest :: (pool: *Regexp_Pool, sub: *Regexp_Node, flags: ParseFlags) -> *Regexp_Node {
710	return new_star_plus_quest(pool, .Quest, sub, flags);
711}
712
713new_concat_or_alternate :: (pool: *Regexp_Pool, op: RegexpOp, subs: [] *Regexp_Node, flags: ParseFlags, can_factor: bool) -> *Regexp_Node {
714	if subs.count == 1 {
715		sub := subs[0];
716		return sub;
717	}
718
719	if subs.count == 0 {
720		if op == .Alternate {
721			return new_regexp(pool, .NoMatch, flags);
722		} else {
723			return new_regexp(pool, .EmptyMatch, flags);
724		}
725	}
726
727	// @ToDo, @Incomplete: We need this factorization for selecting the right
728	// execution engine later. But does it matter if we implement only NDA for now?
729
730	// PODArray<Regexp_Node*> subcopy;
731	// if op == .Alternate && can_factor {
732	// 	// Going to edit sub; make a copy so we don't step on caller.
733	// 	subcopy = PODArray<Regexp_Node*>(nsub);
734	// 	memmove(subcopy.data(), sub, nsub * sizeof sub[0]);
735	// 	sub = subcopy.data();
736	// 	nsub = FactorAlternation(sub, nsub, flags);
737	// 	if (nsub == 1) {
738	// 		Regexp_Node* re = sub[0];
739	// 		return re;
740	// 	}
741	// }
742
743	// @ToDo, @Incomplete: Is this necessary for us?
744	// We use a normal array at the moment, so we waste 6 more bytes on the size and won’t overflow
745
746	// if (nsub > MAX_SUBS) {
747	// 	// Too many subexpressions to fit in a single Regexp_Node.
748	// 	// Make a two-level tree.  Two levels gets us to 65535^2.
749	// 	int nbigsub = (nsub+MAX_SUBS-1)/MAX_SUBS;
750	// 	Regexp_Node* re = new Regexp_Node(op, flags);
751	// 	re->AllocSub(nbigsub);
752	// 	Regexp_Node** subs = re->sub();
753	// 	for (int i = 0; i < nbigsub - 1; i++)
754	// 	subs[i] = ConcatOrAlternate(op, sub+i*MAX_SUBS, MAX_SUBS, flags, false);
755	// 	subs[nbigsub - 1] = ConcatOrAlternate(op, sub+(nbigsub-1)*MAX_SUBS,
756	// 	nsub - (nbigsub-1)*MAX_SUBS, flags,
757	// 	false);
758	// 	return re;
759	// }
760
761	re := new_regexp(pool, op, flags);
762	re.subs = subs;
763	return re;
764}
765
766// concat :: #bake_arguments concat_or_alternate(op = .Concat, can_factor = false);
767// alternate :: #bake_arguments concat_or_alternate(op = .Alternate, can_factor = true);
768// alternate_no_factor :: #bake_arguments concat_or_alternate(op = .Alternate, can_factor = false);
769new_concat :: (pool: *Regexp_Pool, subs: [] *Regexp_Node, flags: ParseFlags) -> *Regexp_Node {
770	return new_concat_or_alternate(pool, .Concat, subs, flags, false);
771}
772new_alternate :: (pool: *Regexp_Pool, subs: [] *Regexp_Node, flags: ParseFlags) -> *Regexp_Node {
773	return new_concat_or_alternate(pool, .Alternate, subs, flags, true);
774}
775new_alternate_no_factor :: (pool: *Regexp_Pool, subs: [] *Regexp_Node, flags: ParseFlags) -> *Regexp_Node {
776	return new_concat_or_alternate(pool, .Alternate, subs, flags, false);
777}
778
779new_capture :: (pool: *Regexp_Pool, sub: *Regexp_Node, flags: ParseFlags, cap: s32) -> *Regexp_Node {
780	re := new_regexp(pool, .Capture, flags);
781	array_resize(*re.subs, 1, false);
782	re.subs[0] = sub;
783	re.cap = cap;
784	return re;
785}
786
787new_repeat :: (pool: *Regexp_Pool, sub: *Regexp_Node, flags: ParseFlags, min: s16, max: s16) -> *Regexp_Node {
788	re := new_regexp(pool, .Repeat, flags);
789	array_resize(*re.subs, 1, false);
790	re.subs[0] = sub;
791	re.min = min;
792	re.max = max;
793	return re;
794}
795
796to_string :: (re: *Regexp_Node) -> string {
797	// Appends a rune for use in a character class to the string t.
798	append_cc_char :: (builder: *String_Builder, r: Rune) {
799		if 0x20 <= r && r <= 0x7E {
800			if find_index_from_left("[]^-\\", cast(u8)r) != -1 {
801				append(builder, "\\");
802			}
803			append(builder, cast(u8)r);
804		} else {
805			if r == {
806				case #char "\r";
807					append(builder, "\\r");
808					return;
809
810				case #char "\t";
811					append(builder, "\\t");
812					return;
813
814				case #char "\n";
815					append(builder, "\\n");
816					return;
817
818				case 0x0c;
819					append(builder, "\\f");
820					return;
821			}
822
823			if (r < 0x100) {
824				print_to_builder(builder, "\\x%", formatInt(r, base = 16, minimum_digits = 2));
825			} else {
826				print_to_builder(builder, "\\x{%}", formatInt(r, base = 16));
827			}
828		}
829	}
830
831	append_cc_range :: (builder: *String_Builder, lo: Rune, hi: Rune) {
832		append_cc_char(builder, lo);
833		// < would be enough here, but we want to print invalid ranges as well, just in case we have a bug somewhere
834		if lo != hi {
835			append(builder, #char "-");
836			append_cc_char(builder, hi);
837		}
838	}
839
840	append_literal :: (builder: *String_Builder, r: Rune, foldcase: bool) {
841		if r != 0 && r < 0x80 && find_index_from_left("(){}[]*+?|.^$\\", cast(u8)r) != -1 {
842			append(builder, #char "\\");
843			append(builder, cast(u8)r);
844		} else if (foldcase && #char "a" <= r && r <= #char "z") {
845			r -= #char "a" - #char "A";
846			append(builder, #char "[");
847			append(builder, cast(u8)r);
848			append(builder, cast(u8)r + #char "a" - #char "A");
849			append(builder, #char "]");
850		} else {
851			append_cc_range(builder, r, r);
852		}
853	}
854
855	// Appends ( if needed and passes new precedence to children.
856	to_string_pre :: (builder: *String_Builder, re: *Regexp_Node, prec: Precedence) -> Precedence, bool {
857		nprec := Precedence.Atom;
858
859		if re.op == {
860			case .NoMatch; #through;
861			case .EmptyMatch; #through;
862			case .Literal; #through;
863			case .AnyChar; #through;
864			case .AnyByte; #through;
865			case .BeginLine; #through;
866			case .EndLine; #through;
867			case .BeginText; #through;
868			case .EndText; #through;
869			case .WordBoundary; #through;
870			case .NoWordBoundary; #through;
871			case .CharClass; #through;
872			case .HaveMatch;
873				// Atom is fine
874
875			case .Concat; #through;
876			case .LiteralString;
877				if prec < .Concat {
878					append(builder, "(?:");
879				}
880				nprec = .Concat;
881
882			case .Alternate;
883				if prec < .Alternate {
884					append(builder, "(?:");
885				}
886				nprec = .Alternate;
887
888			case .Capture;
889				append(builder, #char "(");
890				assert (re.cap != 0, "Capture 0");
891				if re.name {
892					append(builder, "?P<");
893					append(builder, re.name);
894					append(builder, #char ">");
895				}
896				nprec = .Paren;
897
898			case .Star; #through;
899			case .Plus; #through;
900			case .Quest; #through;
901			case .Repeat;
902				if (prec < .Unary) {
903					append(builder, "(?:");
904				}
905				// The subprecedence here is Atom instead of Unary
906				// because PCRE treats two unary ops in a row as a parse error.
907				nprec = .Atom;
908		}
909
910		return nprec, false;
911	}
912
913	to_string_post :: (builder: *String_Builder, re: *Regexp_Node, prec: Precedence, pre_arg: Precedence, child_args: [..] Precedence) -> Precedence {
914		if re.op == {
915			case .NoMatch;
916				// There's no simple symbol for "no match", but
917				// [^0-Runemax] excludes everything.
918				append(builder, "[^\\x00-\\x{10ffff}]");
919
920			case .EmptyMatch;
921				// Append (?:) to make empty string visible,
922				// unless this is already being parenthesized.
923				if prec < .Empty {
924					append(builder, "(?:)");
925				}
926
927			case .Literal;
928				append_literal(builder, re.rune, (re.parse_flags & .FoldCase) != 0);
929
930			case .LiteralString;
931				for re.runes {
932					append_literal(builder, it, (re.parse_flags & .FoldCase) != 0);
933				}
934				if prec < .Concat {
935					append(builder, #char ")");
936				}
937
938			case .Concat;
939				if prec < .Concat {
940					append(builder, #char ")");
941				}
942
943			case .Alternate;
944				// Clumsy but workable: the children all appended |
945				// at the end of their strings, so just remove the last one.
946				// @ToDo: we can’t do that with string_builder, at least not without very clumsy or very slow code
947				// if ((*t_)[t_->size()-1] == '|')
948				// t_->erase(t_->size()-1);
949				// else
950				// LOG(DFATAL) << "Bad final char: " << t_;
951				if prec < .Alternate {
952					append(builder, #char ")");
953				}
954
955			case .Star;
956				append(builder, #char "*");
957				if re.parse_flags & .NonGreedy {
958					append(builder, #char "?");
959				}
960
961				if (prec < .Unary) {
962					append(builder, #char ")");
963				}
964
965			case .Plus;
966				append(builder, #char "+");
967				if re.parse_flags & .NonGreedy {
968					append(builder, #char "?");
969				}
970				if prec < .Unary {
971					append(builder, #char ")");
972				}
973
974			case .Quest;
975				append(builder, #char "?");
976				if re.parse_flags & .NonGreedy {
977					append(builder, #char "?");
978				}
979				if (prec < .Unary) {
980					append(builder, #char ")");
981				}
982
983			case .Repeat;
984				if (re.max == -1) {
985					print_to_builder(builder, "{%,}", re.min);
986				} else if re.min == re.max {
987					print_to_builder(builder, "{%}", re.min);
988				} else {
989					print_to_builder(builder, "{%,%}", re.min, re.max);
990				}
991				if (re.parse_flags & .NonGreedy) {
992					append(builder, #char "?");
993				}
994				if (prec < .Unary) {
995					append(builder, #char ")");
996				}
997
998			case .AnyChar;
999				append(builder, #char ".");
1000
1001			case .AnyByte;
1002				append(builder, "\\C");
1003
1004			case .BeginLine;
1005				append(builder, #char "^");
1006
1007			case .EndLine;
1008				append(builder, #char "$");
1009
1010			case .BeginText;
1011				append(builder, "(?-m:^)");
1012
1013			case .EndText;
1014				if re.parse_flags & .WasDollar {
1015					append(builder, "(?-m:$)");
1016				} else {
1017					append(builder, "\\z");
1018				}
1019
1020			case .WordBoundary;
1021				append(builder, "\\b");
1022
1023			case .NoWordBoundary;
1024				append(builder, "\\B");
1025
1026			case .CharClass;
1027				if (re.char_class.ranges.count == 0) {
1028					append(builder, "[^\\x00-\\x{10ffff}]");
1029				} else {
1030					append(builder, "[");
1031					// Heuristic: show class as negated if it contains the
1032					// non-character 0xFFFE and yet somehow isn't full.
1033					cc := re.char_class;
1034					if contains(cc, 0xFFFE) && cc.nrunes != Runemax + 1 {
1035						cc = copy(cc);
1036						negate(cc);
1037						append(builder, "^");
1038					}
1039					for cc.ranges {
1040						append_cc_range(builder, it.lo, it.hi);
1041					}
1042					if cc != re.char_class {
1043						uninit(cc);
1044						free(cc);
1045					}
1046					append(builder, #char "]");
1047				}
1048
1049			case .Capture;
1050				append(builder, #char ")");
1051
1052			case .HaveMatch;
1053				// There's no syntax accepted by the parser to generate
1054				// this node (it is generated by RE2::Set) so make something
1055				// up that is readable but won't compile.
1056				print_to_builder(builder, "(?HaveMatch:%)", re.match_id);
1057		}
1058
1059		// If the parent is an alternation, append the | for it.
1060		if (prec == .Alternate) {
1061			append(builder, #char "|");
1062		}
1063
1064		return .Atom;
1065	}
1066
1067	builder: String_Builder;
1068	defer free_buffers(*builder);
1069	w := Walk(*String_Builder, Precedence).{pre_visit = to_string_pre, post_visit = to_string_post, copy = null};
1070	// @ToDO @Cleanup This is a workaround for nulls not being respected in struct literals yet.
1071	// Remove once the struct literal bug is fixed in jai
1072	w.copy = null;
1073	walk(*builder, re, .Toplevel, w);
1074	return builder_to_string(*builder);
1075}
1076
1077#scope_file
1078
1079MAX_REPEAT :: 1000;
1080
1081// Finishes the regexp if necessary, preparing it for use in
1082// a more complex expression.
1083finish_regexp:: (re: *Regexp_Node) -> *Regexp_Node {
1084	if re == null		return null;
1085
1086	re.down = null;
1087
1088	if re.op == .CharClass && re.char_class != null && !re.char_class_finished {
1089		finish_char_class(re.char_class);
1090		re.char_class_finished = true;
1091	}
1092
1093	return re;
1094}
1095
1096// Pushes the given regular expression onto the stack.
1097// Could check for too much memory used here.
1098push_regexp :: (using ps: *ParseState, re: *Regexp_Node) {
1099	maybe_concat_string(ps, -1, .NoParseFlags);
1100
1101	// Special case: a character class of one character is just
1102	// a literal.  This is a common idiom for escaping
1103	// single characters (e.g., [.] instead of \.), and some
1104	// analysis does better with fewer character classes.
1105	// Similarly, [Aa] can be rewritten as a literal A with ASCII case folding.
1106	if re.op == .CharClass && re.char_class != null {
1107		remove_above(re.char_class, rune_max);
1108		if re.char_class.nrunes == 1 {
1109			r := re.char_class.ranges[0].lo;
1110			// @ToDo, @Speed: No need to remove and create a new one, just clean up and repurpose the old one
1111			remove_regexp(*ps.pool, re);
1112			re = new_literal(*ps.pool, r, flags);
1113		} else if re.char_class.nrunes == 2 {
1114			r := re.char_class.ranges[0].lo;
1115			if #char "A" <= r && r <= #char "Z" && contains(re.char_class, r + #char "a" - #char "A") {
1116				// @ToDo, @Speed: No need to remove and create a new one, just clean up and repurpose the old one
1117				remove_regexp(*ps.pool, re);
1118				re = new_literal(*ps.pool, r + #char "a" - #char "A", flags | .FoldCase);
1119			}
1120		}
1121	}
1122
1123	if !is_marker(re.op) {
1124		re.simple = compute_simple(re);
1125	}
1126
1127	re.down = stacktop;
1128	stacktop = re;
1129}
1130
1131
1132// Add lo-hi to the class, along with their fold-equivalent characters.
1133// If lo-hi is already in the class, assume that the fold-equivalent
1134// chars are there too, so there's no work to do.
1135add_folded_range :: (cc: *CharClass, lo: Rune, hi: Rune, depth: int) {
1136	// AddFoldedRange calls itself recursively for each rune in the fold cycle.
1137	// Most folding cycles are small: there aren't any bigger than four in the
1138	// current Unicode tables.  make_unicode_casefold.py checks that
1139	// the cycles are not too long, and we double-check here using depth.
1140	assert(depth <= 10, "AddFoldedRange recurses too much.");
1141
1142	if !add_range(cc, lo, hi)	return;  // lo-hi was already there? we're done
1143
1144	while lo <= hi {
1145		f := lookup_casefold(unicode_casefold, lo);
1146		if f == null	break;  // lo has no fold, nor does anything above lo
1147
1148		if lo < f.lo {  // lo has no fold; next rune with a fold is f.lo
1149			lo = f.lo;
1150		} else {
1151			// Add in the result of folding the range lo - f.hi
1152			// and that range's fold, recursively.
1153			lo1 := lo;
1154			hi1 := min(hi, f.hi);
1155			if f.delta ==  {
1156				case;
1157					lo1 += f.delta;
1158					hi1 += f.delta;
1159				case EvenOdd;
1160					if lo1%2 == 1	 lo1 -= 1;
1161					if hi1%2 == 0	 hi1 += 1;
1162				case OddEven;
1163					if lo1%2 == 0	 lo1 -= 1;
1164					if hi1%2 == 1	 hi1 += 1;
1165			}
1166			add_folded_range(cc, lo1, hi1, depth + 1);
1167
1168			// Pick up where this fold left off.
1169			lo = f.hi + 1;
1170		}
1171	}
1172}
1173
1174// Pushes the literal rune r onto the stack.
1175push_literal :: (using ps: *ParseState, r: Rune) {
1176	// Do case folding if needed.
1177	if (flags & .FoldCase) && cycle_fold_rune(r) != r {
1178		re := new_regexp(*ps.pool, .CharClass, flags & ~.FoldCase);
1179		re.char_class = New(CharClass);
1180		re.char_class_finished = false;
1181		r1 := r;
1182
1183		if !(flags & .NeverNL) || r != #char "\n" {
1184			add_range(re.char_class, r, r);
1185		}
1186		r = cycle_fold_rune(r);
1187		while (r != r1) {
1188			if !(flags & .NeverNL) || r != #char "\n" {
1189				add_range(re.char_class, r, r);
1190			}
1191			r = cycle_fold_rune(r);
1192		}
1193
1194		push_regexp(ps, re);
1195		return;
1196	}
1197
1198	// Exclude newline if applicable.
1199	if (flags & .NeverNL) && r == #char "\n" {
1200		push_regexp(ps, new_regexp(*ps.pool, .NoMatch, flags));
1201		return;
1202	}
1203
1204	// No fancy stuff worked.  Ordinary literal.
1205	if maybe_concat_string(ps, r, flags)	return;
1206
1207	re := new_regexp(*ps.pool, .Literal, flags);
1208	re.rune = r;
1209	push_regexp(ps, re);
1210}
1211
1212// Pushes a ^ onto the stack.
1213push_caret :: (using ps: *ParseState) {
1214	if flags & .OneLine {
1215		push_simple_op(ps, .BeginText);
1216	} else {
1217		push_simple_op(ps, .BeginLine);
1218	}
1219}
1220
1221// Pushes a $ onto the stack.
1222push_dollar :: (using ps: *ParseState) {
1223	if flags & .OneLine {
1224		// Clumsy marker so that MimicsPCRE() can tell whether
1225		// this EndText was a $ and not a \z.
1226		// @ToDo, @Cleanup pass flags as optional argument
1227		oflags := flags;
1228		flags = flags | .WasDollar;
1229		push_simple_op(ps, .EndText);
1230		flags = oflags;
1231	} else {
1232		push_simple_op(ps, .EndLine);
1233	}
1234}
1235
1236// Pushes a . onto the stack.
1237push_dot :: (using ps: *ParseState) {
1238	if flags & .DotNL && !(flags & .NeverNL) {
1239		push_simple_op(ps, .AnyChar);
1240	} else {
1241		// Rewrite . into [^\n]
1242		re := new_regexp(*ps.pool, .CharClass, flags & ~.FoldCase);
1243		re.char_class = New(CharClass);
1244		re.char_class_finished = false;
1245		add_range(re.char_class, 0, #char "\n" - 1);
1246		add_range(re.char_class, #char "\n" + 1, rune_max);
1247		push_regexp(ps, re);
1248	}
1249}
1250
1251// Pushes a regexp with the given op (and no args) onto the stack.
1252push_simple_op :: (using ps: *ParseState, op: RegexpOp) {
1253	push_regexp(ps, new_regexp(*ps.pool, op, flags));
1254}
1255
1256// Pushes a repeat operator regexp onto the stack.
1257// A valid argument for the operator must already be on the stack.
1258// The char c is the name of the operator, for use in error messages.
1259push_repeat_op :: (using ps: *ParseState, op: RegexpOp, s: string, nongreedy: bool) -> bool {
1260	if stacktop == null || is_marker(stacktop.op) {
1261		status.code = .RepeatArgument;
1262		status.error_arg = s;
1263		return false;
1264	}
1265
1266	fl := ifx nongreedy then flags ^ .NonGreedy else flags;
1267
1268	// Squash **, ++ and ??. Regexp_Node::Star() et al. handle this too, but
1269	// they're mostly for use during simplification, not during parsing.
1270	if op == stacktop.op && fl == stacktop.parse_flags	return true;
1271
1272	// Squash *+, *?, +*, +?, ?* and ?+. They all squash to *, so because
1273	// op is a repeat, we just have to check that stacktop_.op() is too,
1274	// then adjust stacktop_.
1275	if (stacktop.op == .Star || stacktop.op == .Plus || stacktop.op == .Quest) && fl == stacktop.parse_flags {
1276		stacktop.op = .Star;
1277		return true;
1278	}
1279
1280	re := new_regexp(*ps.pool, op, fl);
1281	re.down = stacktop.down;
1282	re.subs = NewArray(1, *Regexp_Node, initialized = false);
1283	re.subs[0] = finish_regexp(stacktop);
1284	re.simple = compute_simple(re);
1285	stacktop = re;
1286	return true;
1287}
1288
1289// // RepetitionWalker reports whether the repetition regexp is valid.
1290// // Valid means that the combination of the top-level repetition
1291// // and any inner repetitions does not exceed n copies of the
1292// // innermost thing.
1293// // This rewalks the regexp tree and is called for every repetition,
1294// // so we have to worry about inducing quadratic behavior in the parser.
1295// // We avoid this by only using RepetitionWalker when min or max >= 2.
1296// // In that case the depth of any >= 2 nesting can only get to 9 without
1297// // triggering a parse error, so each subtree can only be rewalked 9 times.
1298// class RepetitionWalker : public Regexp_Node::Walker<int> {
1299//  public:
1300//   RepetitionWalker() {}
1301//   virtual int PreVisit(Regexp_Node* re, int parent_arg, bool* stop);
1302//   virtual int PostVisit(Regexp_Node* re, int parent_arg, int pre_arg,
1303//                         int* child_args, int nchild_args);
1304//   virtual int ShortVisit(Regexp_Node* re, int parent_arg);
1305
1306//  private:
1307//   RepetitionWalker(const RepetitionWalker&) = delete;
1308//   RepetitionWalker& operator=(const RepetitionWalker&) = delete;
1309// };
1310
1311// int RepetitionWalker::PreVisit(Regexp_Node* re, int parent_arg, bool* stop) {
1312//   int arg = parent_arg;
1313//   if (re.op() == .Repeat) {
1314//     int m = re.max();
1315//     if (m < 0) {
1316//       m = re.min();
1317//     }
1318//     if (m > 0) {
1319//       arg /= m;
1320//     }
1321//   }
1322//   return arg;
1323// }
1324
1325// int RepetitionWalker::PostVisit(Regexp_Node* re, int parent_arg, int pre_arg,
1326//                                 int* child_args, int nchild_args) {
1327//   int arg = pre_arg;
1328//   for (int i = 0; i < nchild_args; i++) {
1329//     if (child_args[i] < arg) {
1330//       arg = child_args[i];
1331//     }
1332//   }
1333//   return arg;
1334// }
1335
1336// int RepetitionWalker::ShortVisit(Regexp_Node* re, int parent_arg) {
1337//   // Should never be called: we use Walk(), not WalkExponential().
1338// #ifndef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
1339//   LOG(DFATAL) << "RepetitionWalker::ShortVisit called";
1340// #endif
1341//   return 0;
1342// }
1343
1344// Pushes a repetition regexp onto the stack.
1345// A valid argument for the operator must already be on the stack.
1346push_repetition :: (using ps: *ParseState, min: int, max: int, s: string, nongreedy: bool) -> bool {
1347	if (max != -1 && max < min) || min > MAX_REPEAT || max > MAX_REPEAT {
1348		status.code = .RepeatSize;
1349		status.error_arg = s;
1350		return false;
1351	}
1352	if stacktop == null || is_marker(stacktop.op) {
1353		status.code = .RepeatArgument;
1354		status.error_arg = s;
1355		return false;
1356	}
1357
1358	fl := ifx nongreedy then flags ^ .NonGreedy else flags;
1359
1360	down := stacktop.down;
1361	re := new_repeat(*ps.pool, finish_regexp(stacktop), fl, cast(s16)min, cast(s16)max);
1362	re.down = down;
1363	re.simple = compute_simple(re);
1364	stacktop = re;
1365	if min >= 2 || max >= 2 {
1366		// @ToDo: Check for repetition limits (see RepetitionWalker in the original source)
1367		// if repetition_walk(stacktop, kMaxRepeat) == 0 {
1368		// 	status.code = .RepeatSize;
1369		// 	status.error_arg = s;
1370		// 	return false;
1371		// }
1372	}
1373
1374	return true;
1375}
1376
1377// Checks whether a particular regexp op is a marker.
1378is_marker :: (op: RegexpOp) -> bool {
1379	return op >= .kLeftParen;
1380}
1381
1382// Processes a left parenthesis in the input.
1383// Pushes a marker onto the stack.
1384do_left_paren :: (using ps: *ParseState, name: string = "") {
1385	re := new_regexp(*ps.pool, .kLeftParen, flags);
1386	ncap += 1;
1387	re.cap = ncap;
1388	if name {
1389		re.name = name;
1390	}
1391	push_regexp(ps, re);
1392}
1393
1394// Pushes a non-capturing marker onto the stack.
1395do_left_paren_no_capture :: (using ps: *ParseState) {
1396	re := new_regexp(*ps.pool, .kLeftParen, flags);
1397	re.cap = -1;
1398	push_regexp(ps, re);
1399}
1400
1401// Processes a vertical bar in the input.
1402do_vertical_bar ::(using ps: *ParseState) {
1403	maybe_concat_string(ps, -1, .NoParseFlags);
1404	do_concatenation(ps);
1405
1406	// Below the vertical bar is a list to alternate.
1407	// Above the vertical bar is a list to concatenate.
1408	// We just did the concatenation, so either swap
1409	// the result below the vertical bar or push a new
1410	// vertical bar on the stack.
1411	r1 := stacktop;
1412	if r1 {
1413		r2 := r1.down;
1414		if r2 && r2.op == .kVerticalBar {
1415			r3 := r2.down;
1416			if r3 && (r1.op == .AnyChar || r3.op == .AnyChar) {
1417				// AnyChar is above or below the vertical bar. Let it subsume
1418				// the other when the other is Literal, CharClass or AnyChar.
1419				if r3.op == .AnyChar && (r1.op == .Literal || r1.op == .CharClass || r1.op == .AnyChar) {
1420					// Discard r1.
1421					stacktop = r2;
1422					remove_regexp(*ps.pool, r1);
1423					return;
1424				}
1425				if r1.op == .AnyChar && (r3.op == .Literal || r3.op == .CharClass || r3.op == .AnyChar) {
1426					// Rearrange the stack and discard r3.
1427					r1.down = r3.down;
1428					r2.down = r1;
1429					stacktop = r2;
1430					remove_regexp(*ps.pool, r3);
1431					return;
1432				}
1433			}
1434			// Swap r1 below vertical bar (r2).
1435			r1.down = r2.down;
1436			r2.down = r1;
1437			stacktop = r2;
1438			return;
1439		}
1440	}
1441	push_simple_op(ps, .kVerticalBar);
1442}
1443
1444// Processes a right parenthesis in the input.
1445do_right_paren :: (using ps: *ParseState) -> bool {
1446	// Finish the current concatenation and alternation.
1447	do_alternation(ps);
1448
1449	// The stack should be: LeftParen regexp
1450	// Remove the LeftParen, leaving the regexp,
1451	// parenthesized.
1452	r1 := stacktop;
1453	if r1 == null || r1.down == null || r1.down.op != .kLeftParen {
1454		status.code = .UnexpectedParen;
1455		status.error_arg = whole_regexp;
1456		return false;
1457	}
1458
1459	r2 := r1.down;
1460
1461	// Pop off r1, r2.  Will Decref or reuse below.
1462	stacktop = r2.down;
1463
1464	// Restore flags from when paren opened.
1465	re := r2;
1466	flags = re.parse_flags;
1467
1468	// Rewrite LeftParen as capture if needed.
1469	if re.cap > 0 {
1470		re.op = .Capture;
1471		// re.cap is already set
1472		re.subs = NewArray(1, *Regexp_Node, initialized = false);
1473		re.subs[0] = finish_regexp(r1);
1474		re.simple = compute_simple(re);
1475	} else {
1476		remove_regexp(*ps.pool, re);
1477		re = r1;
1478	}
1479
1480	push_regexp(ps, re);
1481	return true;
1482}
1483
1484// Processes the end of input, returning the final regexp.
1485do_finish :: (using ps: *ParseState) -> *Regexp_Node {
1486	do_alternation(ps);
1487	re := stacktop;
1488	if re != null && re.down != null {
1489		status.code = .MissingParen;
1490		status.error_arg = whole_regexp;
1491		return null;
1492	}
1493	stacktop = null;
1494	return finish_regexp(re);
1495}
1496
1497// Returns the leading regexp that re starts with.
1498// The returned Regexp_Node* points into a piece of re,
1499// so it must not be used after the caller calls re.Decref().
1500leading_regexp :: (re: *Regexp_Node) -> *Regexp_Node {
1501	if re.op == .EmptyMatch		return null;
1502	if re.op == .Concat && re.subs.count >= 2 {
1503		sub := re.subs[0];
1504		if sub.op == .EmptyMatch	 return null;
1505		return sub;
1506	}
1507	return re;
1508}
1509
1510// Removes LeadingRegexp(re) from re and returns what's left.
1511// Consumes the reference to re and may edit it in place.
1512// If caller wants to hold on to LeadingRegexp(re),
1513// must have already Incref'ed it.
1514// @ToDO: Check if this is possible / correct in all usages without using reference counting!
1515remove_leading_regexp :: (using ps: *ParseState, re: *Regexp_Node) -> *Regexp_Node {
1516	if re.op == .EmptyMatch	return re;
1517
1518	if re.op == .Concat && re.subs.count >= 2 {
1519		if re.subs[0].op == .EmptyMatch		return re;
1520
1521		remove_regexp(*ps.pool, re.subs[0]);
1522		re.subs[0] = null;
1523
1524		if re.subs.count == 2 {
1525			// Collapse concatenation to single regexp.
1526			nre := re.subs[1];
1527			re.subs[1] = null;
1528			remove_regexp(*ps.pool, re);
1529			return nre;
1530		}
1531		// 3 or more -> 2 or more.
1532		// @Stability: We won’t be able to free this array ever again
1533		// because we modify it’s base pointer.
1534		// But it doesn’t matter because we’re working within a Pool anyway.
1535		print("Before: %\n", re.subs);
1536		re.subs.data += 1;
1537		re.subs.count -= 1;
1538		print("After: %\n", re.subs);
1539		return re;
1540	} else {
1541		pf := re.parse_flags;
1542		// @ToDo, @Speed: Reuse regexp instead of removing and creating a new one?
1543		remove_regexp(*ps.pool, re);
1544		return new_regexp(*ps.pool, .EmptyMatch, pf);
1545	}
1546}
1547
1548// Returns the leading string that re starts with.
1549// The returned Rune* points into a piece of re,
1550// so it must not be used after the caller calls re.Decref().
1551// @ToDo
1552// Rune* Regexp_Node::LeadingString(Regexp_Node* re, int *nrune,
1553//                             Regexp_Node::ParseFlags *flags) {
1554//   while (re.op() == .Concat && re.nsub() > 0)
1555//     re = re.sub()[0];
1556
1557//   *flags = static_cast<Regexp_Node::ParseFlags>(re.parse_flags_ & Regexp_Node::FoldCase);
1558
1559//   if (re.op() == .Literal) {
1560//     *nrune = 1;
1561//     return &re.rune_;
1562//   }
1563
1564//   if (re.op() == .LiteralString) {
1565//     *nrune = re.nrunes_;
1566//     return re.runes_;
1567//   }
1568
1569//   *nrune = 0;
1570//   return null;
1571// }
1572
1573// Removes the first n leading runes from the beginning of re.
1574// Edits re in place.
1575// @ToDO
1576// void Regexp_Node::RemoveLeadingString(Regexp_Node* re, int n) {
1577//   // Chase down concats to find first string.
1578//   // For regexps generated by parser, nested concats are
1579//   // flattened except when doing so would overflow the 16-bit
1580//   // limit on the size of a concatenation, so we should never
1581//   // see more than two here.
1582//   Regexp_Node* stk[4];
1583//   size_t d = 0;
1584//   while (re.op() == .Concat) {
1585//     if (d < arraysize(stk))
1586//       stk[d++] = re;
1587//     re = re.sub()[0];
1588//   }
1589
1590//   // Remove leading string from re.
1591//   if (re.op() == .Literal) {
1592//     re.rune_ = 0;
1593//     re.op_ = .EmptyMatch;
1594//   } else if (re.op() == .LiteralString) {
1595//     if (n >= re.nrunes_) {
1596//       delete[] re.runes_;
1597//       re.runes_ = null;
1598//       re.nrunes_ = 0;
1599//       re.op_ = .EmptyMatch;
1600//     } else if (n == re.nrunes_ - 1) {
1601//       Rune rune = re.runes_[re.nrunes_ - 1];
1602//       delete[] re.runes_;
1603//       re.runes_ = null;
1604//       re.nrunes_ = 0;
1605//       re.rune_ = rune;
1606//       re.op_ = .Literal;
1607//     } else {
1608//       re.nrunes_ -= n;
1609//       memmove(re.runes_, re.runes_ + n, re.nrunes_ * sizeof re.runes_[0]);
1610//     }
1611//   }
1612
1613//   // If re is now empty, concatenations might simplify too.
1614//   while (d > 0) {
1615//     re = stk[--d];
1616//     Regexp_Node** sub = re.sub();
1617//     if (sub[0].op() == .EmptyMatch) {
1618//       sub[0].Decref();
1619//       sub[0] = null;
1620//       // Delete first element of concat.
1621//       switch (re.nsub()) {
1622//         case 0:
1623//         case 1:
1624//           // Impossible.
1625//           LOG(DFATAL) << "Concat of " << re.nsub();
1626//           re.submany_ = null;
1627//           re.op_ = .EmptyMatch;
1628//           break;
1629
1630//         case 2: {
1631//           // Replace re with sub[1].
1632//           Regexp_Node* old = sub[1];
1633//           sub[1] = null;
1634//           re.Swap(old);
1635//           old.Decref();
1636//           break;
1637//         }
1638
1639//         default:
1640//           // Slide down.
1641//           re.nsub_--;
1642//           memmove(sub, sub + 1, re.nsub_ * sizeof sub[0]);
1643//           break;
1644//       }
1645//     }
1646//   }
1647// }
1648
1649// @ToDo: All of that
1650//// In the context of factoring alternations, a Splice is: a factored prefix or
1651//// merged character class computed by one iteration of one round of factoring;
1652//// the span of subexpressions of the alternation to be "spliced" (i.e. removed
1653//// and replaced); and, for a factored prefix, the number of suffixes after any
1654//// factoring that might have subsequently been performed on them. For a merged
1655//// character class, there are no suffixes, of course, so the field is ignored.
1656//struct Splice {
1657//  Splice(Regexp_Node* prefix, Regexp_Node** sub, int nsub)
1658//      : prefix(prefix),
1659//        sub(sub),
1660//        nsub(nsub),
1661//        nsuffix(-1) {}
1662
1663//  Regexp_Node* prefix;
1664//  Regexp_Node** sub;
1665//  int nsub;
1666//  int nsuffix;
1667//};
1668
1669//// Named so because it is used to implement an explicit stack, a Frame is: the
1670//// span of subexpressions of the alternation to be factored; the current round
1671//// of factoring; any Splices computed; and, for a factored prefix, an iterator
1672//// to the next Splice to be factored (i.e. in another Frame) because suffixes.
1673//struct Frame {
1674//  Frame(Regexp_Node** sub, int nsub)
1675//      : sub(sub),
1676//        nsub(nsub),
1677//        round(0) {}
1678
1679//  Regexp_Node** sub;
1680//  int nsub;
1681//  int round;
1682//  std::vector<Splice> splices;
1683//  int spliceidx;
1684//};
1685
1686//// Bundled into a class for friend access to Regexp_Node without needing to declare
1687//// (or define) Splice in regexp.h.
1688//class FactorAlternationImpl {
1689// public:
1690//  static void Round1(Regexp_Node** sub, int nsub,
1691//                     Regexp_Node::ParseFlags flags,
1692//                     std::vector<Splice>* splices);
1693//  static void Round2(Regexp_Node** sub, int nsub,
1694//                     Regexp_Node::ParseFlags flags,
1695//                     std::vector<Splice>* splices);
1696//  static void Round3(Regexp_Node** sub, int nsub,
1697//                     Regexp_Node::ParseFlags flags,
1698//                     std::vector<Splice>* splices);
1699//};
1700
1701//// Factors common prefixes from alternation.
1702//// For example,
1703////     ABC|ABD|AEF|BCX|BCY
1704//// simplifies to
1705////     A(B(C|D)|EF)|BC(X|Y)
1706//// and thence to
1707////     A(B[CD]|EF)|BC[XY]
1708////
1709//// Rewrites sub to contain simplified list to alternate and returns
1710//// the new length of sub.  Adjusts reference counts accordingly
1711//// (incoming sub[i] decremented, outgoing sub[i] incremented).
1712//int Regexp_Node::FactorAlternation(Regexp_Node** sub, int nsub, ParseFlags flags) {
1713//  std::vector<Frame> stk;
1714//  stk.emplace_back(sub, nsub);
1715
1716//  for (;;) {
1717//    auto& sub = stk.back().sub;
1718//    auto& nsub = stk.back().nsub;
1719//    auto& round = stk.back().round;
1720//    auto& splices = stk.back().splices;
1721//    auto& spliceidx = stk.back().spliceidx;
1722
1723//    if (splices.empty()) {
1724//      // Advance to the next round of factoring. Note that this covers
1725//      // the initialised state: when splices is empty and round is 0.
1726//      round++;
1727//    } else if (spliceidx < static_cast<int>(splices.size())) {
1728//      // We have at least one more Splice to factor. Recurse logically.
1729//      stk.emplace_back(splices[spliceidx].sub, splices[spliceidx].nsub);
1730//      continue;
1731//    } else {
1732//      // We have no more Splices to factor. Apply them.
1733//      auto iter = splices.begin();
1734//      int out = 0;
1735//      for (int i = 0; i < nsub; ) {
1736//        // Copy until we reach where the next Splice begins.
1737//        while (sub + i < iter.sub)
1738//          sub[out++] = sub[i++];
1739//        switch (round) {
1740//          case 1:
1741//          case 2: {
1742//            // Assemble the Splice prefix and the suffixes.
1743//            Regexp_Node* re[2];
1744//            re[0] = iter.prefix;
1745//            re[1] = Regexp_Node::AlternateNoFactor(iter.sub, iter.nsuffix, flags);
1746//            sub[out++] = Regexp_Node::Concat(re, 2, flags);
1747//            i += iter.nsub;
1748//            break;
1749//          }
1750//          case 3:
1751//            // Just use the Splice prefix.
1752//            sub[out++] = iter.prefix;
1753//            i += iter.nsub;
1754//            break;
1755//          default:
1756//            LOG(DFATAL) << "unknown round: " << round;
1757//            break;
1758//        }
1759//        // If we are done, copy until the end of sub.
1760//        if (++iter == splices.end()) {
1761//          while (i < nsub)
1762//            sub[out++] = sub[i++];
1763//        }
1764//      }
1765//      splices.clear();
1766//      nsub = out;
1767//      // Advance to the next round of factoring.
1768//      round++;
1769//    }
1770
1771//    switch (round) {
1772//      case 1:
1773//        FactorAlternationImpl::Round1(sub, nsub, flags, &splices);
1774//        break;
1775//      case 2:
1776//        FactorAlternationImpl::Round2(sub, nsub, flags, &splices);
1777//        break;
1778//      case 3:
1779//        FactorAlternationImpl::Round3(sub, nsub, flags, &splices);
1780//        break;
1781//      case 4:
1782//        if (stk.size() == 1) {
1783//          // We are at the top of the stack. Just return.
1784//          return nsub;
1785//        } else {
1786//          // Pop the stack and set the number of suffixes.
1787//          // (Note that references will be invalidated!)
1788//          int nsuffix = nsub;
1789//          stk.pop_back();
1790//          stk.back().splices[stk.back().spliceidx].nsuffix = nsuffix;
1791//          ++stk.back().spliceidx;
1792//          continue;
1793//        }
1794//      default:
1795//        LOG(DFATAL) << "unknown round: " << round;
1796//        break;
1797//    }
1798
1799//    // Set spliceidx depending on whether we have Splices to factor.
1800//    if (splices.empty() || round == 3) {
1801//      spliceidx = static_cast<int>(splices.size());
1802//    } else {
1803//      spliceidx = 0;
1804//    }
1805//  }
1806//}
1807
1808//void FactorAlternationImpl::Round1(Regexp_Node** sub, int nsub,
1809//                                   Regexp_Node::ParseFlags flags,
1810//                                   std::vector<Splice>* splices) {
1811//  // Round 1: Factor out common literal prefixes.
1812//  int start = 0;
1813//  Rune* rune = null;
1814//  int nrune = 0;
1815//  Regexp_Node::ParseFlags runeflags = Regexp_Node::NoParseFlags;
1816//  for (int i = 0; i <= nsub; i++) {
1817//    // Invariant: sub[start:i] consists of regexps that all
1818//    // begin with rune[0:nrune].
1819//    Rune* rune_i = null;
1820//    int nrune_i = 0;
1821//    Regexp_Node::ParseFlags runeflags_i = Regexp_Node::NoParseFlags;
1822//    if (i < nsub) {
1823//      rune_i = Regexp_Node::LeadingString(sub[i], &nrune_i, &runeflags_i);
1824//      if (runeflags_i == runeflags) {
1825//        int same = 0;
1826//        while (same < nrune && same < nrune_i && rune[same] == rune_i[same])
1827//          same++;
1828//        if (same > 0) {
1829//          // Matches at least one rune in current range.  Keep going around.
1830//          nrune = same;
1831//          continue;
1832//        }
1833//      }
1834//    }
1835
1836//    // Found end of a run with common leading literal string:
1837//    // sub[start:i] all begin with rune[0:nrune],
1838//    // but sub[i] does not even begin with rune[0].
1839//    if (i == start) {
1840//      // Nothing to do - first iteration.
1841//    } else if (i == start+1) {
1842//      // Just one: don't bother factoring.
1843//    } else {
1844//      Regexp_Node* prefix = Regexp_Node::LiteralString(rune, nrune, runeflags);
1845//      for (int j = start; j < i; j++)
1846//        Regexp_Node::RemoveLeadingString(sub[j], nrune);
1847//      splices.emplace_back(prefix, sub + start, i - start);
1848//    }
1849
1850//    // Prepare for next iteration (if there is one).
1851//    if (i < nsub) {
1852//      start = i;
1853//      rune = rune_i;
1854//      nrune = nrune_i;
1855//      runeflags = runeflags_i;
1856//    }
1857//  }
1858//}
1859
1860//void FactorAlternationImpl::Round2(Regexp_Node** sub, int nsub,
1861//                                   Regexp_Node::ParseFlags flags,
1862//                                   std::vector<Splice>* splices) {
1863//  // Round 2: Factor out common simple prefixes,
1864//  // just the first piece of each concatenation.
1865//  // This will be good enough a lot of the time.
1866//  //
1867//  // Complex subexpressions (e.g. involving quantifiers)
1868//  // are not safe to factor because that collapses their
1869//  // distinct paths through the automaton, which affects
1870//  // correctness in some cases.
1871//  int start = 0;
1872//  Regexp_Node* first = null;
1873//  for (int i = 0; i <= nsub; i++) {
1874//    // Invariant: sub[start:i] consists of regexps that all
1875//    // begin with first.
1876//    Regexp_Node* first_i = null;
1877//    if (i < nsub) {
1878//      first_i = Regexp_Node::LeadingRegexp(sub[i]);
1879//      if (first != null &&
1880//          // first must be an empty-width op
1881//          // OR a char class, any char or any byte
1882//          // OR a fixed repeat of a literal, char class, any char or any byte.
1883//          (first.op() == .BeginLine ||
1884//           first.op() == .EndLine ||
1885//           first.op() == .WordBoundary ||
1886//           first.op() == .NoWordBoundary ||
1887//           first.op() == .BeginText ||
1888//           first.op() == .EndText ||
1889//           first.op() == .CharClass ||
1890//           first.op() == .AnyChar ||
1891//           first.op() == .AnyByte ||
1892//           (first.op() == .Repeat &&
1893//            first.min() == first.max() &&
1894//            (first.sub()[0].op() == .Literal ||
1895//             first.sub()[0].op() == .CharClass ||
1896//             first.sub()[0].op() == .AnyChar ||
1897//             first.sub()[0].op() == .AnyByte))) &&
1898//          Regexp_Node::Equal(first, first_i))
1899//        continue;
1900//    }
1901
1902//    // Found end of a run with common leading regexp:
1903//    // sub[start:i] all begin with first,
1904//    // but sub[i] does not.
1905//    if (i == start) {
1906//      // Nothing to do - first iteration.
1907//    } else if (i == start+1) {
1908//      // Just one: don't bother factoring.
1909//    } else {
1910//      Regexp_Node* prefix = first.Incref();
1911//      for (int j = start; j < i; j++)
1912//        sub[j] = Regexp_Node::RemoveLeadingRegexp(sub[j]);
1913//      splices.emplace_back(prefix, sub + start, i - start);
1914//    }
1915
1916//    // Prepare for next iteration (if there is one).
1917//    if (i < nsub) {
1918//      start = i;
1919//      first = first_i;
1920//    }
1921//  }
1922//}
1923
1924//void FactorAlternationImpl::Round3(Regexp_Node** sub, int nsub,
1925//                                   Regexp_Node::ParseFlags flags,
1926//                                   std::vector<Splice>* splices) {
1927//  // Round 3: Merge runs of literals and/or character classes.
1928//  int start = 0;
1929//  Regexp_Node* first = null;
1930//  for (int i = 0; i <= nsub; i++) {
1931//    // Invariant: sub[start:i] consists of regexps that all
1932//    // are either literals (i.e. runes) or character classes.
1933//    Regexp_Node* first_i = null;
1934//    if (i < nsub) {
1935//      first_i = sub[i];
1936//      if (first != null &&
1937//          (first.op() == .Literal ||
1938//           first.op() == .CharClass) &&
1939//          (first_i.op() == .Literal ||
1940//           first_i.op() == .CharClass))
1941//        continue;
1942//    }
1943
1944//    // Found end of a run of Literal/CharClass:
1945//    // sub[start:i] all are either one or the other,
1946//    // but sub[i] is not.
1947//    if (i == start) {
1948//      // Nothing to do - first iteration.
1949//    } else if (i == start+1) {
1950//      // Just one: don't bother factoring.
1951//    } else {
1952//      CharClassBuilder ccb;
1953//      for (int j = start; j < i; j++) {
1954//        Regexp_Node* re = sub[j];
1955//        if (re.op() == .CharClass) {
1956//          CharClass* cc = re.cc();
1957//          for (CharClass::iterator it = cc.begin(); it != cc.end(); ++it)
1958//            ccb.AddRange(it.lo, it.hi);
1959//        } else if (re.op() == .Literal) {
1960//          ccb.AddRangeFlags(re.rune(), re.rune(), re.parse_flags());
1961//        } else {
1962//          LOG(DFATAL) << "RE2: unexpected op: " << re.op() << " "
1963//                      << re.ToString();
1964//        }
1965//        re.Decref();
1966//      }
1967//      Regexp_Node* re = Regexp_Node::NewCharClass(ccb.GetCharClass(), flags);
1968//      splices.emplace_back(re, sub + start, i - start);
1969//    }
1970
1971//    // Prepare for next iteration (if there is one).
1972//    if (i < nsub) {
1973//      start = i;
1974//      first = first_i;
1975//    }
1976//  }
1977//}
1978
1979// Collapse the regexps on top of the stack, down to the
1980// first marker, into a new op node (op == .Alternate
1981// or op == .Concat).
1982do_collapse :: (using ps: *ParseState, op: RegexpOp) {
1983	// Scan backward to marker, counting children of composite.
1984	n := 0;
1985	next: *Regexp_Node;
1986	sub := stacktop;
1987	while sub != null && !is_marker(sub.op) {
1988		next = sub.down;
1989		if sub.op == op {
1990			n += sub.subs.count;
1991		}
1992		else {
1993			n += 1;
1994		}
1995		sub = next;
1996	}
1997
1998	// If there's just one child, leave it alone.
1999	// (Concat of one thing is that one thing; alternate of one thing is same.)
2000	if stacktop != null && stacktop.down == next		return;
2001
2002	// Construct op (alternation or concatenation), flattening op of op.
2003	subs: [..] *Regexp_Node;
2004	array_reserve(*subs, n);
2005	subs.count = n;
2006	next = null;
2007	i := n;
2008	sub = stacktop;
2009	while sub != null && !is_marker(sub.op) {
2010		next = sub.down;
2011		if sub.op == op {
2012			for < sub.subs {
2013				i -= 1;
2014				subs[i] = it;
2015			}
2016			sub.subs.count = 0;
2017			remove_regexp(*ps.pool, sub);
2018		} else {
2019			i -= 1;
2020			subs[i] = finish_regexp(sub);
2021		}
2022		sub = next;
2023	}
2024	assert(i == 0, "Sub expression count mismatch: %", i);
2025
2026	// Takes ownership of subs
2027	re := new_concat_or_alternate(*ps.pool, op, subs, flags, true);
2028	re.simple = compute_simple(re);
2029	re.down = next;
2030	stacktop = re;
2031}
2032
2033// Finishes the current concatenation,
2034// collapsing it into a single regexp on the stack.
2035do_concatenation :: (using ps: *ParseState) {
2036	r1 := stacktop;
2037	if r1 == null || is_marker(r1.op) {
2038		// empty concatenation is special case
2039		re := new_regexp(*ps.pool, .EmptyMatch, flags);
2040		push_regexp(ps, re);
2041	}
2042	do_collapse(ps, .Concat);
2043}
2044
2045// Finishes the current alternation,
2046// collapsing it to a single regexp on the stack.
2047do_alternation :: (using ps: *ParseState) {
2048	do_vertical_bar(ps);
2049	// Now stack top is kVerticalBar.
2050	r1 := stacktop;
2051	stacktop = r1.down;
2052	remove_regexp(*ps.pool, r1);
2053	do_collapse(ps, .Alternate);
2054}
2055
2056// Incremental conversion of concatenated literals into strings.
2057// If top two elements on stack are both literal or string,
2058// collapse into single string.
2059// Don't walk down the stack -- the parser calls this frequently
2060// enough that below the bottom two is known to be collapsed.
2061// Only called when another regexp is about to be pushed
2062// on the stack, so that the topmost literal is not being considered.
2063// (Otherwise ab* would turn into (ab)*.)
2064// If r >= 0, consider pushing a literal r on the stack.
2065// Return whether that happened.
2066maybe_concat_string :: (ps: *ParseState, r: Rune, flags: ParseFlags) -> bool {
2067	re1 := ps.stacktop;
2068	if re1 == null		 return false;
2069	re2 := re1.down;
2070	if re2 == null		 return false;
2071
2072	if re1.op != .Literal && re1.op != .LiteralString	return false;
2073	if re2.op != .Literal && re2.op != .LiteralString	return false;
2074	if re1.parse_flags & .FoldCase != re2.parse_flags & .FoldCase		return false;
2075
2076	if re2.op == .Literal {
2077		// convert into string
2078		rune := re2.rune;
2079		re2.op = .LiteralString;
2080		re2.runes.count = 0;
2081		re2.runes.data = null;
2082		add_rune_to_string(re2, rune);
2083	}
2084
2085	// push re1 into re2.
2086	if (re1.op == .Literal) {
2087		add_rune_to_string(re2, re1.rune);
2088	} else {
2089		for re1.runes {
2090			add_rune_to_string(re2, it);
2091		}
2092		array_free(re1.runes);
2093	}
2094
2095	// reuse re1 if possible
2096	if r >= 0 {
2097		re1.op = .Literal;
2098		re1.rune = r;
2099		re1.parse_flags = flags;
2100		return true;
2101	}
2102
2103	ps.stacktop = re2;
2104	remove_regexp(*ps.pool, re1);
2105	return false;
2106}
2107
2108add_rune_to_string :: (re: *Regexp_Node, r: Rune) {
2109	assert(re.op == .LiteralString);
2110	if re.runes.count == 0 {
2111		array_reserve(*re.runes, 8);
2112	}
2113	array_add(*re.runes, r);
2114}
2115
2116// Lexing routines.
2117
2118// Parses a decimal integer, storing it in *np.
2119// Sets *s to span the remainder of the string.
2120parse_integer :: (s: *string) -> int, success: bool {
2121	if !s.count || !isdigit((s.*)[0])		return 0, false;
2122
2123	// Disallow leading zeros.
2124	if s.count >= 2 && (s.*)[0] == #char "0" && isdigit((s.*)[1])	return 0, false;
2125
2126	n := 0;
2127	while s.count && isdigit((s.*)[0]) {
2128		c := (s.*)[0];
2129		// Avoid overflow.
2130		if n >= 100000000	return 0, false;
2131		n = n*10 + c - #char "0";
2132		advance(s);
2133	}
2134	return n, true;
2135}
2136
2137// Parses a repetition suffix like {1,2} or {2} or {2,}.
2138// Sets *s to span the remainder of the string on success.
2139// Sets *lo and *hi to the given range.
2140// In the case of {2,}, the high number is unbounded;
2141// sets *hi to -1 to signify this.
2142// {,2} is NOT a valid suffix.
2143// The Maybe in the name signifies that the regexp parse
2144// doesn't fail even if ParseRepetition does, so the StringPiece
2145// s must NOT be edited unless MaybeParseRepetition returns true.
2146maybe_parse_repetition :: (sp: *string) -> lo: int, hi: int, success: bool {
2147	s := sp.*;
2148	if !s.count || s[0] != #char "{"	 return 0, 0, false;
2149
2150	advance(*s);
2151
2152	lo, success, remainder := to_integer(s);
2153	if !success		return 0, 0, false;
2154	if !remainder	return 0, 0, false;
2155	s = remainder;
2156
2157	hi: int;
2158	if s[0] == #char "," {
2159		advance(*s); // ','
2160		if !s	return 0, 0, false;
2161
2162		if s[0] == #char "}" {
2163			// {2,} means at least 2
2164			hi = -1;
2165		} else {
2166			// {2,4} means 2, 3, or 4.
2167			hi, success, remainder = to_integer(s);
2168			if !success		return 0, 0, false;
2169			s = remainder;
2170		}
2171	} else {
2172		// {2} means exactly two
2173		hi = lo;
2174	}
2175	if !s.count || s[0] != #char "}"	return 0, 0, false;
2176	advance(*s);
2177	sp.* = s;
2178
2179	return lo, hi, true;
2180}
2181
2182// @ToDo: Move to unicode module?
2183consume_rune :: (str: *string) -> Rune, len: int, status: Status {
2184	result, len := rune_from_string(str.*);
2185	if !len		return 0, 0, make_status(.BadUTF8);
2186
2187	advance(str, len);
2188	return result, len, .{};
2189}
2190
2191// @Speed:: Using my own unicode#is_valid_utf8 would probably be faster
2192is_valid_utf8 :: (str: string) -> bool, status: Status {
2193	t := str;
2194	while t {
2195		result, len, status := consume_rune(*t);
2196		if !len		return false, status;
2197	}
2198	return true, .{};
2199}
2200
2201// Convert hex digit to value.
2202hex_value :: (c: Rune) -> int, success: bool {
2203	if #char "0" <= c && c <= #char "9"		return c - #char "0", true;
2204	if #char "A" <= c && c <= #char "F"		return c - #char "A" + 10, true;
2205	if #char "a" <= c && c <= #char "f"		return c - #char "a" + 10, true;
2206
2207	return 0, false;
2208}
2209
2210// Parse an escape sequence (e.g., \n, \{).
2211// Sets *s to span the remainder of the string.
2212// Sets *rp to the named character.
2213parse_escape :: (s: *string, rune_max: s32) -> r: Rune, status: Status, success: bool {
2214	bad_escape :: (begin: string, s: *string) -> Status {
2215		status: Status;
2216		status.code = .BadEscape;
2217		status.error_arg = slice(begin, 0, begin.count - s.count);
2218		return status;
2219	}
2220
2221	assert(s.count && (s.*)[0] == #char "\\");
2222
2223	if s.count == 1 {
2224		return 0, make_status(.TrailingBackslash), false;
2225	}
2226
2227	begin := s.*;
2228
2229	advance(s); // backslash
2230
2231	c, len, status := consume_rune(s);
2232	if !len		return 0, status, false;
2233
2234	if c == {
2235		case;
2236			if c < Runeself && !isalpha(c) && !isdigit(c) {
2237				// Escaped non-word characters are always themselves.
2238				// PCRE is not quite so rigorous: it accepts things like
2239				// \q, but we don't.  We once rejected \_, but too many
2240				// programs and people insist on using it, so allow \_.
2241				return c, status, true;
2242			}
2243			return 0, bad_escape(begin, s), false;
2244
2245		// Octal escapes.
2246		case #char "1"; #through;
2247		case #char "2"; #through;
2248		case #char "3"; #through;
2249		case #char "4"; #through;
2250		case #char "5"; #through;
2251		case #char "6"; #through;
2252		case #char "7";
2253			// Single non-zero octal digit is a backreference; not supported.
2254			if !s.count || (s.*)[0] < #char "0" || (s.*)[0] > #char "7"	 return 0, bad_escape(begin, s), false;
2255			#through;
2256		case #char "0";
2257			// consume up to three octal digits; already have one.
2258			code := c - #char "0";
2259			if !s.count		return 0, bad_escape(begin, s), false;
2260			c = (s.*)[0];
2261			if #char "0" <= c && c <= #char "7" {
2262				code = code * 8 + c - #char "0";
2263				advance(s); // digit
2264				if s {
2265					c = (s.*)[0];
2266					if #char "0" <= c && c <= #char "7" {
2267						code = code * 8 + c - #char "0";
2268						advance(s); // digit
2269					}
2270				}
2271			}
2272			if code > rune_max		return 0, bad_escape(begin, s), false;
2273
2274			return code, status, true;
2275
2276		// Hexadecimal escapes
2277		case #char "x";
2278			if !s.count	 return 0, bad_escape(begin, s), false;
2279			len: int;
2280			c, len, status = consume_rune(s);
2281			if !len		return 0, status, false;
2282			if c == #char "{" {
2283				// Any number of digits in braces.
2284				// Update n as we consume the string, so that
2285				// the whole thing gets shown in the error message.
2286				// Perl accepts any text at all; it ignores all text
2287				// after the first non-hex digit.  We require only hex digits,
2288				// and at least one.
2289				c, len, status = consume_rune(s);
2290				if !len		return 0, status, false;
2291
2292				nhex := 0;
2293				code: Rune;
2294				hval, success := hex_value(c);
2295				while success {
2296					nhex += 1;
2297					code = cast(Rune)(code * 16 + hval);
2298					if code > rune_max		return 0, bad_escape(begin, s), false;
2299					if !s.count				return 0, bad_escape(begin, s), false;
2300					c, len, status = consume_rune(s);
2301					if !len		return 0, status, false;
2302					hval, success = hex_value(c);
2303				}
2304				if c != #char "}" || nhex == 0		return 0, bad_escape(begin, s), false;
2305				return code, status, true;
2306			} else {
2307				// Easy case: two hex digits.
2308				if !s.count		return 0, bad_escape(begin, s), false;
2309				c1, len, status := consume_rune(s);
2310				if !len			return 0, status, false;
2311				hc0, success0 := hex_value(c);
2312				hc1, success1 := hex_value(c);
2313				if !success0 || success1	return 0, bad_escape(begin, s), false;
2314				return cast(Rune)(hc0 * 16 + hc1), status, true;
2315			}
2316
2317		// C escapes.
2318		case #char "n";
2319			return #char "\n", status, true;
2320		case #char "r";
2321			return #char "\r", status, true;
2322		case #char "t";
2323			return #char "\t", status, true;
2324
2325		// Less common C escapes.
2326		// @ToDO: Implement?
2327		case #char "a";
2328			// return #char "\a", status, true;
2329			return 0, bad_escape(begin, s), false;
2330		case #char "f";
2331			// return #char "\f", status, true;
2332			return 0, bad_escape(begin, s), false;
2333		case #char "v";
2334			// return #char "\v", status, true;
2335			return 0, bad_escape(begin, s), false;
2336
2337		// This code is disabled to avoid misparsing
2338		// the Perl word-boundary \b as a backspace
2339		// when in POSIX regexp mode.  Surprisingly,
2340		// in Perl, \b means word-boundary but [\b]
2341		// means backspace.  We don't support that:
2342		// if you want a backspace embed a literal
2343		// backspace character or use \x08.
2344		//
2345		// case #char "b";
2346		//   *rp = #char "\b";
2347		//   return true;
2348	}
2349}
2350
2351// Add a range to the character class, but exclude newline if asked.
2352// Also handle case folding.
2353add_range_flags :: (using char_class: *CharClass, lo: Rune, hi: Rune, parse_flags: ParseFlags) {
2354	// Take out \n if the flags say so.
2355	cutnl := !(parse_flags & .ClassNL) || (parse_flags & .NeverNL);
2356	if cutnl && lo <= #char "\n" && #char "\n" <= hi {
2357		if lo < #char "\n" {
2358			add_range_flags(char_class, lo, #char "\n" - 1, parse_flags);
2359		}
2360		if hi > #char "\n" {
2361			add_range_flags(char_class, #char "\n" + 1, hi, parse_flags);
2362		}
2363		return;
2364	}
2365
2366	// If folding case, add fold-equivalent characters too.
2367	if parse_flags & .FoldCase {
2368		add_folded_range(char_class, lo, hi, 0);
2369	} else {
2370		add_range(char_class, lo, hi);
2371	}
2372}
2373
2374// Look for a group with the given name.
2375lookup_group :: (name: string, groups: [] UGroup) -> *UGroup {
2376	for * groups {
2377		if it.name == name	return it;
2378	}
2379	return null;
2380}
2381
2382// Look for a POSIX group with the given name (e.g., "[:^alpha:]")
2383lookup_posix_group :: (name: string) -> *UGroup {
2384	return lookup_group(name, posix_groups);
2385}
2386lookup_perl_group :: (name: string) -> *UGroup {
2387	return lookup_group(name, perl_groups);
2388}
2389
2390#if true {
2391	// Fake UGroup containing all Runes
2392	any16 :: URange16.{0, 65535};
2393	any32 :: URange32.{65535, Runemax};
2394	any16array :: URange16.[any16];
2395	any32array :: URange32.[any32];
2396	anygroup :: UGroup.{"Any", 1, any16array, any32array};
2397
2398	// Look for a Unicode group with the given name (e.g., "Han")
2399	lookup_unicode_group :: (name: string) -> *UGroup {
2400		if name == "any" {
2401			return *anygroup;
2402		}
2403		return lookup_group(name, unicode_groups);
2404	}
2405}
2406
2407// Add a UGroup or its negation to the character class.
2408// @ToDo: Why is sign not a boolean?
2409add_unicode_group :: (cc: *CharClass, g: *UGroup, sign: int, parse_flags: ParseFlags) {
2410	if (sign == +1) {
2411		for g.r16	add_range_flags(cc, it.lo, it.hi, parse_flags);
2412		for g.r32	add_range_flags(cc, it.lo, it.hi, parse_flags);
2413	} else {
2414		if parse_flags & .FoldCase {
2415			// Normally adding a case-folded group means
2416			// adding all the extra fold-equivalent runes too.
2417			// But if we're adding the negation of the group,
2418			// we have to exclude all the runes that are fold-equivalent
2419			// to what's already missing.  Too hard, so do in two steps.
2420			char_class1: CharClass;
2421			// @ToDo: Cleanup char_class1
2422			add_unicode_group(*char_class1, g, 1, parse_flags);
2423			// If the flags say to take out \n, put it in, so that negating will take it out.
2424			// Normally AddRangeFlags does this, but we're bypassing AddRangeFlags.
2425			cutnl := !(parse_flags & .ClassNL) || (parse_flags & .NeverNL);
2426			if cutnl {
2427				add_range(*char_class1, #char "\n", #char "\n");
2428			}
2429			negate(*char_class1);
2430			add_char_class(cc, char_class1);
2431			return;
2432		} else {
2433			next: Rune;
2434			for g.r16 {
2435				if next < it.lo {
2436					add_range_flags(cc, next, it.lo - 1, parse_flags);
2437				}
2438				next = it.hi + 1;
2439			}
2440			for g.r32 {
2441				if next < it.lo {
2442					add_range_flags(cc, next, it.lo - 1, parse_flags);
2443				}
2444				next = it.hi + 1;
2445			}
2446			if next <= Runemax {
2447				add_range_flags(cc, next, Runemax, parse_flags);
2448			}
2449		}
2450	}
2451}
2452
2453// Maybe parse a Perl character class escape sequence.
2454// Only recognizes the Perl character classes (\d \s \w \D \S \W),
2455// not the Perl empty-string classes (\b \B \A \Z \z).
2456// On success, sets *s to span the remainder of the string
2457// and returns the corresponding UGroup.
2458// The StringPiece must *NOT* be edited unless the call succeeds.
2459maybe_parse_perl_cc_escape :: (s: *string, parse_flags: ParseFlags) -> *UGroup {
2460	if !(parse_flags & .PerlClasses)	return null;
2461	if s.count < 2 || (s.*)[0] != #char "\\"		return null;
2462
2463	// Could use StringPieceToRune, but there aren't
2464	// any non-ASCII Perl group names.
2465	name := slice(s.*, 0, 2);
2466	g := lookup_perl_group(name);
2467	if g == null		return null;
2468	advance(s, name.count);
2469	return g;
2470}
2471
2472ParseResult :: enum {
2473	// @ToDo: Rename those
2474	kParseOk;  // Did some parsing.
2475	kParseError;  // Found an error.
2476	kParseNothing;  // Decided not to parse.
2477};
2478
2479// Maybe parses a Unicode character group like \p{Han} or \P{Han}
2480// (the latter is a negated group).
2481maybe_parse_unicode_group :: (s: *string, parse_flags: ParseFlags, cc: *CharClass) -> status: Status, result: ParseResult {
2482	status: Status;
2483	// Decide whether to parse.
2484	if !(parse_flags & .UnicodeGroups)	return status, .kParseNothing;
2485	if s.count < 2 || (s.*)[0] != #char "\\"  return status, .kParseNothing;
2486
2487	c: Rune = (s.*)[1];
2488	if c != #char "p" && c != #char "P"		return status, .kParseNothing;
2489
2490	// Committed to parse.  Results:
2491	sign: int = 1;  // -1 = negated char class
2492	if (c == #char "P")	{
2493		sign = -sign;
2494	}
2495	seq := s.*; // \p{Han} or \pL
2496	name: string;  // Han or L
2497	advance(s, 2);  // '\\', 'p'
2498
2499
2500	if (s.*)[0] != #char "{" {
2501		status: Status;
2502		len: int;
2503		c, len, status = consume_rune(s);
2504		if !len		return status, .kParseError;
2505
2506		// Name is just the single rune we just parsed
2507		name = slice(seq, 2, len);
2508	} else {
2509		// Name is in braces. Look for closing }
2510		end := find_index_from_left(s.*, #char "}");
2511		if end != -1 {
2512			valid: bool;
2513			valid, status = is_valid_utf8(seq);
2514			if !valid		return status, .kParseError;
2515
2516			return make_status(.BadCharRange, seq), .kParseError;
2517		}
2518		name = slice(s.*, 1, end - 1); // without '}'
2519		advance(s, end + 1);
2520		valid: bool;
2521		valid, status = is_valid_utf8(name);
2522		if !valid		return status, .kParseError;
2523	}
2524
2525	// Chop seq where s now begins.
2526	seq.count = seq.count - s.count;
2527
2528	if name && name[0] == #char "^" {
2529		sign = -sign;
2530		advance(*name);  // '^'
2531	}
2532
2533	#if true {
2534		// Look up the group in the RE2 Unicode data.
2535		g := lookup_unicode_group(name);
2536		if g == null {
2537			status.code = .BadCharRange;
2538			status.error_arg = seq;
2539			return status, .kParseError;
2540		}
2541
2542		add_unicode_group(cc, g, sign, parse_flags);
2543	} else {
2544		// // Look up the group in the ICU Unicode data. Because ICU provides full
2545		// // Unicode properties support, this could be more than a lookup by name.
2546		// ::icu::UnicodeString ustr = ::icu::UnicodeString::fromUTF8(
2547		// 	std::string("\\p{") + std::string(name) + std::string("}"));
2548		// 	UErrorCode uerr = U_ZERO_ERROR;
2549		// 	::icu::UnicodeSet uset(ustr, uerr);
2550		// 	if (U_FAILURE(uerr)) {
2551		// 		status.set_code(.BadCharRange);
2552		// 		status.set_error_arg(seq);
2553		// 		return .kParseError;
2554		// 	}
2555
2556		// 	// Convert the UnicodeSet to a URange32 and UGroup that we can add.
2557		// 	int nr = uset.getRangeCount();
2558		// 	PODArray<URange32> r(nr);
2559		// 	for (int i = 0; i < nr; i++) {
2560		// 		r[i].lo = uset.getRangeStart(i);
2561		// 		r[i].hi = uset.getRangeEnd(i);
2562		// 	}
2563		// 	UGroup g = {"", +1, 0, 0, r.data(), nr};
2564		// 	AddUGroup(cc, &g, sign, parse_flags);
2565	}
2566
2567	return status, .kParseOk;
2568}
2569
2570// Parses a character class name like [:alnum:].
2571// Sets *s to span the remainder of the string.
2572// Adds the ranges corresponding to the class to ranges.
2573parse_cc_name :: (s: *string, parse_flags: ParseFlags, cc: *CharClass) -> status: Status, result: ParseResult {
2574	// Check begins with [:
2575	if s.count < 2	|| (s.*)[0] != #char "[" || (s.*)[1] != #char ":"		return Status.{}, .kParseNothing;
2576
2577	// Look for closing :].
2578	end := find_index_from_left(s.*, ":]");
2579
2580	// If no closing :], then ignore.
2581	if !end		return Status.{}, .kParseNothing;
2582
2583	// Got it.  Check that it's valid.
2584	name := slice(s.*, 2, end - 2);
2585	g := lookup_posix_group(name);
2586	if (g == null) {
2587		return make_status(.BadCharRange, name), .kParseError;
2588	}
2589
2590	advance(s, name.count + 4);
2591	add_unicode_group(cc, g, g.sign, parse_flags);
2592	return Status.{}, .kParseOk;
2593}
2594
2595// Parses a character inside a character class.
2596// There are fewer special characters here than in the rest of the regexp.
2597// Sets *s to span the remainder of the string.
2598parse_cc_character :: (using ps: *ParseState, s: *string, whole_class: string) -> rune: Rune, status: Status, success: bool {
2599	if !s 	return 0, make_status(.MissingBracket, whole_class), false;
2600
2601	// Allow regular escape sequences even though
2602	// many need not be escaped in this context.
2603	if (s.*)[0] == #char "\\" {
2604		rune, status, success := parse_escape(s, rune_max);
2605		return rune, status, success;
2606	} else {
2607		// Otherwise take the next rune.
2608		rune, len, status := consume_rune(s);
2609		return rune, status, len > 0;
2610	}
2611}
2612
2613// Parses a character class character, or, if the character
2614// is followed by a hyphen, parses a character class range.
2615// For single characters, rr.lo == rr.hi.
2616// Sets *s to span the remainder of the string.
2617parse_cc_range :: (using ps: *ParseState, s: *string, whole_class: string) -> RuneRange, status: Status, success: bool {
2618	os := s.*;
2619	rr: RuneRange;
2620	success: bool;
2621	rr.lo, status, success = parse_cc_character(ps, s, whole_class);
2622	if !success		return rr, status, false;
2623
2624	// [a-] means (a|-), so check for final ].
2625	if s.count >= 2 && (s.*)[0] == #char "-" && (s.*)[1] != #char "]" {
2626		advance(s);  // #char "-"
2627
2628		rr.hi, status, success = parse_cc_character(ps, s, whole_class);
2629		if !success		return rr, status, false;
2630
2631		if rr.hi < rr.lo {
2632			return rr, make_status(.BadCharRange, slice(os, 0, os.count - s.count)), false;
2633		}
2634	} else {
2635		rr.hi = rr.lo;
2636	}
2637	return rr, status, true;
2638}
2639
2640// Parses a possibly-negated character class expression like [^abx-z[:digit:]].
2641// Sets *s to span the remainder of the string.
2642// Sets *out_re to the regexp for the class.
2643parse_char_class :: (using ps: *ParseState, s: *string) -> *Regexp_Node, status: Status, success: bool {
2644	whole_class := s.*;
2645	// Caller checked this.
2646	assert(s.count && (s.*)[0] == #char "[");
2647
2648	negated: bool;
2649	re := new_regexp(*ps.pool, .CharClass, flags & ~.FoldCase);
2650	re.char_class = New(CharClass);
2651	re.char_class_finished = false;
2652	advance(s);  // '['
2653	if s.count && (s.*)[0] == #char "^" {
2654		advance(s); // '^'
2655		negated = true;
2656		if !(flags & .ClassNL) || flags & .NeverNL {
2657			// If NL can't match implicitly, then pretend
2658			// negated classes include a leading \n.
2659			add_range(re.char_class, #char "\n", #char "\n");
2660		}
2661	}
2662	first := true;  // ] is okay as first char in class
2663	while s.count && (first || (s.*)[0] != #char "]") {
2664		// - is only okay unescaped as first or last in class.
2665		// Except that Perl allows - anywhere.
2666		if !first && (s.*)[0] == #char "-" && !(flags & .PerlX) && (s.count == 1 || (s.*)[1] != #char "]") {
2667			remove_regexp(*ps.pool, re);
2668
2669			t := s.*;
2670			advance(*t);  // '-'
2671			r, n, status := consume_rune(*t);
2672			if !n {
2673				return null, status, false;
2674			}
2675
2676			return null, make_status(.BadCharRange, slice(s.*, 0, 1 + n)), false;
2677		}
2678		first = false;
2679
2680		// Look for [:alnum:] etc.
2681		if s.count > 2 && (s.*)[0] == #char "[" && (s.*)[1] == #char ":" {
2682			status, result := parse_cc_name(s, flags, re.char_class);
2683			if #complete result == {
2684				case .kParseOk;
2685					continue;
2686				case .kParseError;
2687					remove_regexp(*ps.pool, re);
2688					return null, status, false;
2689				case .kParseNothing;
2690			}
2691		}
2692
2693		// Look for Unicode character group like \p{Han}
2694		if s.count > 2 && (s.*)[0] == #char "\\" && ((s.*)[1] == #char "p" || (s.*)[1] == #char "P") {
2695			status, result := maybe_parse_unicode_group(s, flags, re.char_class);
2696			if #complete result == {
2697				case .kParseOk;
2698					continue;
2699				case .kParseError;
2700					remove_regexp(*ps.pool, re);
2701					return null, status, false;
2702				case .kParseNothing;
2703			}
2704		}
2705
2706		// Look for Perl character class symbols (extension).
2707		g := maybe_parse_perl_cc_escape(s, flags);
2708		if g != null {
2709			add_unicode_group(re.char_class, g, g.sign, flags);
2710			continue;
2711		}
2712
2713		// Otherwise assume single character or simple range.
2714		rr, status, success := parse_cc_range(ps, s, whole_class);
2715		if !success {
2716			remove_regexp(*ps.pool, re);
2717			return null, status, false;
2718		}
2719
2720		// AddRangeFlags is usually called in response to a class like
2721		// \p{Foo} or [[:foo:]]; for those, it filters \n out unless
2722		// Regexp_Node::ClassNL is set.  In an explicit range or singleton
2723		// like we just parsed, we do not filter \n out, so set ClassNL
2724		// in the flags.
2725		add_range_flags(re.char_class, rr.lo, rr.hi, flags | .ClassNL);
2726	}
2727
2728	if !s.count {
2729		remove_regexp(*ps.pool, re);
2730		return null, make_status(.MissingBracket, whole_class), false;
2731	}
2732	advance(s);  // ']'
2733
2734	if negated {
2735		negate(re.char_class);
2736	}
2737
2738	return re, .{}, true;
2739}
2740
2741// Is this a valid capture name?  [A-Za-z0-9_]+
2742// PCRE limits names to 32 bytes.
2743// Python rejects names starting with digits.
2744// We don't enforce either of those.
2745is_valid_capture_name :: (name: string) -> bool {
2746  if !name	return false;
2747  for 0..name.count - 1 {
2748	  c := name[it];
2749	  if !((#char "0" <= c && c <= #char "9") ||
2750			  (#char "a" <= c && c <= #char "z") ||
2751			  (#char "A" <= c && c <= #char "Z") ||
2752			  c == #char "_")
2753		  return false;
2754  }
2755  return true;
2756}
2757
2758// Parses a Perl flag setting or non-capturing group or both,
2759// like (?i) or (?: or (?i:.  Removes from s, updates parse state.
2760// The caller must check that s begins with "(?".
2761// Returns true on success.  If the Perl flag is not
2762// well-formed or not supported, sets status and returns false.
2763parse_perl_flags :: (using ps: *ParseState, s: *string) -> Status, success: bool {
2764	// Caller is supposed to check this.
2765	assert(flags & .PerlX && s.count >= 2 && (s.*)[0] == #char "(" && (s.*)[1] == #char "?");
2766
2767	t := s.*;
2768	advance(*t, 2); // "(?"
2769
2770	// Check for named captures, first introduced in Python's regexp library.
2771	// As usual, there are three slightly different syntaxes:
2772	//
2773	//   (?P<name>expr)   the original, introduced by Python
2774	//   (?<name>expr)    the .NET alteration, adopted by Perl 5.10
2775	//   (?'name'expr)    another .NET alteration, adopted by Perl 5.10
2776	//
2777	// Perl 5.10 gave in and implemented the Python version too,
2778	// but they claim that the last two are the preferred forms.
2779	// PCRE and languages based on it (specifically, PHP and Ruby)
2780	// support all three as well.  EcmaScript 4 uses only the Python form.
2781	//
2782	// In both the open source world (via Code Search) and the
2783	// Google source tree, (?P<expr>name) is the dominant form,
2784	// so that's the one we implement.  One is enough.
2785	if (t.count > 2 && t[0] == #char "P" && t[1] == #char "<") {
2786		advance(*t, 2);
2787		// Pull out name.
2788		end := find_index_from_left(t, #char ">");
2789		if end == -1 {
2790			valid, status := is_valid_utf8(s.*);
2791			if !valid		return status, false;
2792			return make_status(.BadNamedCapture, s.*), false;
2793		}
2794
2795		// t is "name>...", t[end] == '>'
2796		capture := slice(s.*, 2, end + 3);
2797		name := slice(t, 0, end);
2798		valid, status := is_valid_utf8(name);
2799		if !valid		return status, false;
2800
2801		if !is_valid_capture_name(name)		return make_status(.BadNamedCapture, capture), false;
2802
2803		do_left_paren(ps, name);
2804
2805		advance(s, capture.count + 2);
2806		return status, true;
2807	}
2808
2809	negated: bool;
2810	sawflags: bool;
2811	nflags := flags;
2812	done: bool;
2813	while !done {
2814		if !t.count		return make_status(.BadPerlOp, s.*), false;
2815		c, len, status :=  consume_rune(*t);
2816		if !len		return status, false;
2817		if c == {
2818			case;
2819				return make_status(.BadPerlOp, slice(s.*, 0, s.count - t.count)), false;
2820			// Parse flags.
2821			case #char "i";
2822				sawflags = true;
2823				if negated {
2824					nflags &= ~.FoldCase;
2825				} else {
2826					nflags |= .FoldCase;
2827				}
2828
2829			case #char "m";  // opposite of our OneLine
2830				sawflags = true;
2831				if negated {
2832					nflags |= .OneLine;
2833				} else {
2834					nflags &= ~.OneLine;
2835				}
2836
2837			case #char "s";
2838				sawflags = true;
2839				if negated {
2840					nflags &= ~.DotNL;
2841				} else {
2842					nflags |= .DotNL;
2843				}
2844
2845			case #char "U";
2846				sawflags = true;
2847				if negated {
2848					nflags &= ~.NonGreedy;
2849				} else {
2850					nflags |= .NonGreedy;
2851				}
2852
2853			// Negation
2854			case #char "-";
2855				if negated		return make_status(.BadPerlOp, slice(s.*, 0, s.count - t.count)), false;
2856
2857				negated = true;
2858				sawflags = false;
2859
2860			// Open new group.
2861			case #char ":";
2862				do_left_paren_no_capture(ps);
2863				done = true;
2864
2865			// Finish flags.
2866			case #char ")";
2867				done = true;
2868		}
2869	}
2870
2871	if negated && !sawflags		return make_status(.BadPerlOp, slice(s.*, 0, s.count - t.count)), false;
2872
2873	flags = nflags;
2874	s.* = t;
2875	return .{}, true;
2876}
2877
2878// // Converts latin1 (assumed to be encoded as Latin1 bytes)
2879// // into UTF8 encoding in string.
2880// // Can't use EncodingUtils::EncodeLatin1AsUTF8 because it is
2881// // deprecated and because it rejects code points 0x80-0x9F.
2882// void ConvertLatin1ToUTF8(const StringPiece& latin1, std::string* utf) {
2883//   char buf[UTFmax];
2884
2885//   utf.clear();
2886//   for (size_t i = 0; i < latin1.size(); i++) {
2887//     Rune r = latin1[i] & 0xFF;
2888//     int n = runetochar(buf, &r);
2889//     utf.append(buf, n);
2890//   }
2891// }
2892
2893// Actually C library functions, but it feels stupid to call out to libc for that
2894isalpha :: (c: s32) -> bool {
2895    return (c >= #char "A" && c <= #char "Z") || (c >= #char "a" && c <= #char "z");
2896}
2897isdigit :: (c: s32) -> bool {
2898    return c >= #char "0" && c <= #char "9";
2899}
2900
2901
2902// For to_string
2903Precedence :: enum {
2904	Atom;
2905	Unary;
2906	Concat;
2907	Alternate;
2908	Empty;
2909	Paren;
2910	Toplevel;
2911}
2912
2913#import "Basic";
2914#import "String";
2915#import "Bucket_Array";
2916#import "Pool";
2917
index : blog