git ptrace.dev


        
            
            
        
        0Regexp :: struct {
1	prefix: string;
2	prefix_foldcase: bool;
3	suffix_regexp: *Regexp_Node;
4	prog: *Prog;
5	num_capture_groups: int;
6	pool: Regexp_Pool;
7}
8
9uninit :: (re: *Regexp) {
10	uninit(*re.pool);
11}
12
13compile :: (pattern: string, parse_flags := ParseFlags.LikePerl) -> Regexp, success: bool {
14	regexp: Regexp;
15	entire_regexp, status, pool := parse(pattern, parse_flags);
16
17	if status.code != .Success {
18		log_error("Could not parse regexp \"%\": % (%)", pattern, status.code, status.error_arg);
19		return regexp, false;
20	}
21	regexp.pool = pool;
22
23
24	#if REGEXP_DEBUG	print("Parsed: \"%\"\n", to_string(entire_regexp));
25
26	regexp.prefix, regexp.prefix_foldcase, regexp.suffix_regexp = required_prefix(entire_regexp, *regexp.pool);
27	#if REGEXP_DEBUG	print("Split: \"%\" (foldcase: %) + \"%\"\n", regexp.prefix, regexp.prefix_foldcase, to_string(regexp.suffix_regexp));
28
29	regexp.prog = compile(regexp.suffix_regexp, *regexp.pool);
30	if !regexp.prog {
31		log_error("Could not compile regexp \"%\"", pattern);
32        uninit(*regexp);
33		return .{}, false;
34	}
35	#if REGEXP_DEBUG	print("Compiled suffix:\n%\n", to_string(regexp.prog));
36
37	regexp.num_capture_groups = compute_num_captures(regexp.suffix_regexp);
38// 	// @ToDo: For one-pass NFA
39// 	// is_one_pass = compute_is_one_pass(prog);
40
41	return regexp, true;
42}
43
44#scope_module
45
46// List of pointers to Inst* that need to be filled in (patched).
47// Because the Inst* haven't been filled in yet,
48// we can use the Inst* word to hold the list's "next" pointer.
49// It's kind of sleazy, but it works well in practice.
50// See http://swtch.com/~rsc/regexp/regexp1.html for inspiration.
51//
52// Because the out and out1 fields in Inst are no longer pointers,
53// we can't use pointers directly here either.  Instead, head refers
54// to inst_[head>>1].out (head&1 == 0) or inst_[head>>1].out1 (head&1 == 1).
55// head == 0 represents the null list.  This is okay because instruction #0
56// is always the fail instruction, which never appears on a list.
57PatchList :: struct {
58	head: u32;
59	tail: u32;  // for constant-time append
60}
61
62// Returns patch list containing just p.
63make_patch_list :: (p: int) -> PatchList {
64	l: PatchList;
65	l.head = cast(u32)p;
66	l.tail = cast(u32)p;
67	return l;
68}
69
70// Patches all the entries on l to have value p.
71patch  :: (inst: *[..] Inst, l: PatchList, p: int) {
72	head := l.head;
73	while head != 0 {
74		ip := *(inst.*)[head>>1];
75		if head & 1 {
76			head = ip.out1;
77			ip.out1 = cast(u32)p;
78		} else {
79			head = get_out(ip.*);
80			set_out(ip, p);
81		}
82	}
83}
84
85// Appends two patch lists and returns result.
86append :: (inst: *[..] Inst, l1: PatchList, l2: PatchList) -> PatchList {
87	if l1.head == 0		return l2;
88	if l2.head == 0		return l1;
89
90	ip := *(inst.*)[l1.tail>>1];
91	if l1.tail & 1 {
92		ip.out1 = l2.head;
93	} else {
94		set_out(ip, l2.head);
95	}
96
97	l: PatchList;
98	l.head = l1.head;
99	l.tail = l2.tail;
100	return l;
101}
102
103NULL_PATCH_LIST :: PatchList.{0, 0};
104
105// Compiled program fragment.
106Frag :: struct {
107	begin: u32;
108	end: PatchList;
109}
110
111make_frag :: (begin: int, end: PatchList) -> Frag {
112	f: Frag;
113	f.begin = cast(u32)begin;
114	f.end = end;
115	return f;
116}
117
118// Input encodings.
119// Encoding :: enum {
120// 	kEncodingUTF8 = 1,  // UTF-8 (0-10FFFF)
121// 	kEncodingLatin1,    // Latin-1 (0-FF)
122// }
123
124
125Compiler :: struct {
126	pool: *Regexp_Pool;
127	prog: *Prog;         // Program being built.
128	failed: bool;        // Did we give up compiling?
129	// Encoding encoding_;  // Input encoding
130	reversed: bool;      // Should program run backward over text?
131
132	inst: [..] Inst;
133	max_ninst: int;      // Maximum number of instructions.
134
135	max_mem: s64;    // Total memory budget.
136
137	// std::unordered_map<uint64_t, int> rune_cache_;
138	rune_cache: Table(u64, int);
139	rune_range: Frag;
140	anchor: Anchor;  // anchor mode for RE2::Set
141}
142
143// Compiles re, returning program.
144// Caller is responsible for deleting prog_.
145// If reversed is true, compiles a program that expects
146// to run over the input string backward (reverses all concatenations).
147// The reversed flag is also recorded in the returned program.
148compile :: (re: *Regexp_Node, pool: *Regexp_Pool, reversed := false, max_mem := 0) -> *Prog {
149
150	push_allocator(pool_allocator_proc, pool.pool);
151	// Init compiler
152	c: Compiler;
153	c.pool = pool;
154
155	c.prog = New(Prog);
156	c.reversed = reversed;
157
158	array_reserve(*c.inst, 8);
159
160	inst: Inst;
161	set_opcode(*inst, .kInstFail);
162	array_add(*c.inst, inst);
163
164	c.max_mem = max_mem;
165	if max_mem <= 0 {
166		c.max_ninst = 100000;  // more than enough
167	} else if (max_mem <= size_of(Prog)) {
168		// No room for anything.
169		c.max_ninst = 0;
170	} else {
171		m := (max_mem - size_of(Prog)) / size_of(Inst);
172		// Limit instruction count so that inst->id() fits nicely in an int.
173		// SparseArray also assumes that the indices (inst->id()) are ints.
174		// The call to WalkExponential uses 2*max_ninst_ below,
175		// and other places in the code use 2 or 3 * prog->size().
176		// Limiting to 2^24 should avoid overflow in those places.
177		// (The point of allowing more than 32 bits of memory is to
178		// have plenty of room for the DFA states, not to use it up
179		// on the program.)
180		if (m >= 1<<24) {
181			m = 1<<24;
182		}
183		// Inst imposes its own limit (currently bigger than 2^24 but be safe).
184		if (m > Inst.MAX_INST) {
185			m = Inst.MAX_INST;
186		}
187		c.max_ninst = m;
188	}
189	c.anchor = .UNANCHORED;
190
191	// Simplify to remove things like counted repetitions
192	// and character classes like \d.
193	sre := simplify(re, pool);
194	if sre == null		return null;
195	#if REGEXP_DEBUG	print("Simplified: \"%\" -> \"%\"\n", to_string(re), to_string(sre));
196
197	// Record whether prog is anchored, removing the anchors.
198	// (They get in the way of other optimizations.)
199	anchor_start: bool;
200	anchor_end: bool;
201	sre, anchor_start = is_anchor_start(sre, pool, 0);
202	sre, anchor_end = is_anchor_end(sre, pool, 0);
203
204	// Generate fragment for entire regexp.
205	w := Walk(*Compiler, Frag).{pre_visit = compiler_pre_visit, post_visit = compiler_post_visit, short_visit = compiler_short_visit, copy = null};
206	// @ToDO @Cleanup This is a workaround for nulls not being respected in struct literals yet.
207	// Remove once the struct literal bug is fixed in jai
208	w.copy = null;
209	w.max_visits = 2 * c.max_ninst;
210	all := walk(*c, sre, Frag.{}, w);
211	if c.failed		return null;
212
213	// Success!  Finish by putting Match node at end, and record start.
214	// Turn off c.reversed (if it is set) to force the remaining concatenations
215	// to behave normally.
216	c.reversed = false;
217	all = cat(*c, all, match(*c, 0));
218
219	c.prog.reversed = reversed;
220	if (c.prog.reversed) {
221		c.prog.anchor_start = anchor_end;
222		c.prog.anchor_end = anchor_start;
223	} else {
224		c.prog.anchor_start = anchor_start;
225		c.prog.anchor_end = anchor_end;
226	}
227
228	c.prog.start = all.begin;
229	if (!c.prog.anchor_start) {
230		// Also create unanchored version, which starts with a .*? loop.
231		all = cat(*c, dot_star(*c), all);
232	}
233	c.prog.start_unanchored = all.begin;
234
235	// Hand ownership of prog to caller.
236	return finish(*c, re);
237}
238
239to_string :: (using comp: Compiler) -> string {
240	builder: String_Builder;
241	defer free_buffers(*builder);
242	for inst {
243		print_to_builder(*builder, "%. ", it_index);
244		append(*builder, it);
245		append(*builder, #char "\n");
246	}
247	return builder_to_string(*builder);
248}
249
250finish :: (using comp: *Compiler, re: *Regexp_Node) -> *Prog {
251	if failed		 return null;
252
253	if (prog.start == 0 && prog.start_unanchored == 0) {
254		// No possible matches; keep Fail instruction only.
255		inst.count = 1;
256	}
257
258	// Hand off the array to Prog.
259	prog.inst = inst;
260
261	optimize(prog);
262	#if REGEXP_DEBUG	print("Optimized:\n%\n", to_string(prog.*));
263	flatten(prog);
264	#if REGEXP_DEBUG	print("Flattened:\n%\n", to_string(prog.*));
265	compute_byte_map(prog);
266
267	if (!prog.reversed) {
268		prefix, prefix_foldcase := required_prefix_for_accel(re.*);
269		if prefix.count && !prefix_foldcase {
270			prog.prefix_size = prefix.count;
271			prog.prefix_front = prefix[0];
272			prog.prefix_back = prefix[prefix.count - 1];
273		}
274	}
275
276	// Record remaining memory for DFA.
277	// @ToDo
278	if (max_mem <= 0) {
279		prog.dfa_mem = 1 << 20;
280	} else {
281		m := max_mem - size_of(Prog);
282		m -= prog.inst.count * size_of(Inst);  // account for inst_
283		if can_bit_state(prog) {
284			m -= prog.inst.count * size_of(u16);  // account for list_heads_
285		}
286		if (m < 0) {
287			m = 0;
288		}
289		prog.dfa_mem = m;
290	}
291
292	p := prog;
293	prog = null;
294	return p;
295}
296
297
298alloc_inst :: (using comp: *Compiler, n: int) -> int {
299	if failed || inst.count + n > max_ninst {
300		failed = true;
301		return -1;
302	}
303
304	array_reserve(*inst, inst.count + n);
305	memset(inst.data + inst.count, 0, n * size_of(type_of(inst[0])));
306	id := inst.count;
307	inst.count += n;
308	return id;
309}
310
311
312// These routines are somewhat hard to visualize in text --
313// see http://swtch.com/~rsc/regexp/regexp1.html for
314// pictures explaining what is going on here.
315
316// Returns an unmatchable fragment.
317no_match :: () -> Frag {
318	return .{0, NULL_PATCH_LIST};
319}
320
321// Is a an unmatchable fragment?
322is_no_match :: (a: Frag) -> bool {
323	return a.begin == 0;
324}
325
326// Given fragments a and b, returns fragment for ab.
327cat :: (using comp: *Compiler, a: Frag, b: Frag) -> Frag {
328	if is_no_match(a) || is_no_match(b)	return no_match();
329
330	// Elide no-op.
331	begin := *inst[a.begin];
332	if get_opcode(begin.*) == .kInstNop && a.end.head == (a.begin << 1) && get_out(begin.*) == 0 {
333		// in case refs to a somewhere
334		patch(*inst, a.end, b.begin);
335		return b;
336	}
337
338	// To run backward over string, reverse all concatenations.
339	if reversed {
340		patch(*inst, b.end, a.begin);
341		return make_frag(b.begin, a.end);
342	}
343
344	patch(*inst, a.end, b.begin);
345	return make_frag(a.begin, b.end);
346}
347
348// Given fragments for a and b, returns fragment for a|b.
349alt :: (using comp: *Compiler, a: Frag, b: Frag) -> Frag {
350	// Special case for convenience in loops.
351	if is_no_match(a)	return b;
352	if is_no_match(b)	return a;
353
354	array_reserve(*inst, inst.count + 1);
355	id := alloc_inst(comp, 1);
356	if id <	0	return no_match();
357	init_alt(*inst[id], a.begin, b.begin);
358	return make_frag(id, append(*inst, a.end, b.end));
359}
360
361// When capturing submatches in like-Perl mode, a kOpAlt Inst
362// treats out_ as the first choice, out1_ as the second.
363//
364// For *, +, and ?, if out_ causes another repetition,
365// then the operator is greedy.  If out1_ is the repetition
366// (and out_ moves forward), then the operator is non-greedy.
367
368// Given a fragment a, returns a fragment for a* or a*? (if nongreedy)
369star :: (using comp: *Compiler, a: Frag, nongreedy: bool) -> Frag {
370	id := alloc_inst(comp, 1);
371	if id <	0	return no_match();
372	init_alt(*inst[id], 0, 0);
373	patch(*inst, a.end, id);
374	if nongreedy {
375		inst[id].out1 = a.begin;
376		return make_frag(id, make_patch_list(id << 1));
377	} else {
378		set_out(*inst[id], a.begin);
379		return make_frag(id, make_patch_list(id << 1 | 1));
380	}
381}
382
383// Given a fragment for a, returns a fragment for a+ or a+? (if nongreedy)
384plus :: (using comp: *Compiler, a: Frag, nongreedy: bool) -> Frag {
385	// a+ is just a* with a different entry point.
386	f := star(comp, a, nongreedy);
387	return make_frag(a.begin, f.end);
388}
389
390// Given a fragment for a, returns a fragment for a? or a?? (if nongreedy)
391quest :: (using comp: *Compiler, a: Frag, nongreedy: bool) -> Frag {
392	if is_no_match(a)	return nop(comp);
393	id := alloc_inst(comp, 1);
394	if id <	0	return no_match();
395	pl: PatchList;
396	if nongreedy {
397		init_alt(*inst[id], 0, a.begin);
398		pl = make_patch_list(id << 1);
399	} else {
400		init_alt(*inst[id], a.begin, 0);
401		pl = make_patch_list((id << 1) | 1);
402	}
403	return make_frag(id, append(*inst, pl, a.end));
404}
405
406dot_star :: (using comp: *Compiler) -> Frag {
407	return star(comp, byte_range(comp, 0x00, 0xff, false), true);
408}
409
410// Returns a fragment for the byte range lo-hi.
411byte_range :: (using comp: *Compiler, lo: u8, hi: u8, foldcase: bool) -> Frag {
412	id := alloc_inst(comp, 1);
413	if id <	0	return no_match();
414	init_byte_range(*inst[id], lo, hi, foldcase, 0);
415	return make_frag(id, make_patch_list(id << 1));
416}
417
418// Returns a no-op fragment.  Sometimes unavoidable.
419nop :: (using comp: *Compiler) -> Frag {
420	id := alloc_inst(comp, 1);
421	if id <	0	return no_match();
422	init_nop(*inst[id]);
423	return make_frag(id, make_patch_list(id << 1));
424}
425
426// Returns a fragment that signals a match.
427match :: (using comp: *Compiler, match_id: int) -> Frag {
428	id := alloc_inst(comp, 1);
429	if id <	0	return no_match();
430	init_match(*inst[id], cast(s32)match_id);
431	return make_frag(id, NULL_PATCH_LIST);
432}
433
434// Returns a fragment matching a particular empty-width op (like ^ or $)
435empty_width :: (using comp: *Compiler, empty: EmptyOp) -> Frag {
436	id := alloc_inst(comp, 1);
437	if id <	0	return no_match();
438	init_empty_width(*inst[id], empty, 0);
439	return make_frag(id, make_patch_list(id << 1));
440}
441
442// Given a fragment a, returns a fragment with capturing parens around a.
443capture :: (using comp: *Compiler, a: Frag, n: s32) -> Frag {
444	if is_no_match(a)	return no_match();
445	id := alloc_inst(comp, 2);
446	if id <	0	return no_match();
447	init_capture(*inst[id], 2 * n, a.begin);
448	init_capture(*inst[id + 1], 2 * n + 1, 0);
449	patch(*inst, a.end, id + 1);
450	return make_frag(id, make_patch_list((id + 1) << 1));
451}
452
453// A Rune is a name for a Unicode code point.
454// Returns maximum rune encoded by UTF-8 sequence of length len.
455// ToDo: This should be an array of constants!
456max_rune :: (len: int) -> Rune {
457	b: int;  // number of Rune bits in len-byte UTF-8 sequence (len < UTFmax)
458	if len == 1 {
459		b = 7;
460	} else {
461		b = 8-(len+1) + 6*(len-1);
462	}
463	return cast(Rune)(1<<b) - 1;   // maximum Rune for b bits.
464}
465
466// The rune range compiler caches common suffix fragments,
467// which are very common in UTF-8 (e.g., [80-bf]).
468// The fragment suffixes are identified by their start
469// instructions.  null denotes the eventual end match.
470// The Frag accumulates in rune_range_.  Caching common
471// suffixes reduces the UTF-8 "." from 32 to 24 instructions,
472// and it reduces the corresponding one-pass NFA from 16 nodes to 8.
473begin_range :: (using comp: *Compiler) {
474	table_reset(*rune_cache);
475	rune_range.begin = 0;
476	rune_range.end = NULL_PATCH_LIST;
477}
478
479uncached_rune_byte_suffix :: (using comp: *Compiler, lo: u8, hi: u8, foldcase: bool, next: int) -> int {
480	f := byte_range(comp, lo, hi, foldcase);
481	if (next != 0) {
482		patch(*inst, f.end, next);
483	} else {
484		rune_range.end = append(*inst, rune_range.end, f.end);
485	}
486	return f.begin;
487}
488
489make_rune_cache_key :: (lo: u8, hi: u8, foldcase: bool, next: int) -> u64 {
490  return (cast(u64)next << 17) |
491         (cast(u64)lo   <<  9) |
492         (cast(u64)hi   <<  1) |
493         (cast(u64)foldcase);
494}
495
496cached_rune_byte_suffix :: (using comp: *Compiler, lo: u8, hi: u8, foldcase: bool, next: int) -> int {
497	key := make_rune_cache_key(lo, hi, foldcase, next);
498	ptr := table_find_pointer(*rune_cache, key);
499	if ptr	return ptr.*;
500	id := uncached_rune_byte_suffix(comp, lo, hi, foldcase, next);
501	table_add(*rune_cache, key, id);
502	return id;
503}
504
505is_cached_rune_byte_suffix :: (using comp: *Compiler, id: int) -> bool {
506	lo := inst[id].lo;
507	hi := inst[id].hi;
508	foldcase := get_foldcase(inst[id]);
509	next := get_out(inst[id]);
510	key := make_rune_cache_key(lo, hi, foldcase, next);
511	return table_find_pointer(*rune_cache, key) != null;
512}
513
514add_suffix :: (using comp: *Compiler, id: int) {
515	if failed		return;
516
517	if rune_range.begin == 0 {
518		rune_range.begin = cast(u32)id;
519		return;
520	}
521
522	rune_range.begin = add_suffix_recursive(comp, rune_range.begin, id);
523}
524
525add_suffix_recursive :: (using comp: *Compiler, root: u32, id: int) -> u32 {
526	assert(get_opcode(inst[root]) == .kInstAlt || get_opcode(inst[root]) == .kInstByteRange);
527
528	f := find_byte_range(comp, root, id);
529	if is_no_match(f) {
530		alt := alloc_inst(comp, 1);
531		if alt < 0	return 0;
532		init_alt(*inst[alt], cast(u32)root, cast(u32)id);
533		return cast(u32)alt;
534	}
535
536	br: u32;
537	if f.end.head == 0 {
538		br = root;
539	} else if (f.end.head&1) {
540		br = inst[f.begin].out1;
541	} else {
542		br = get_out(inst[f.begin]);
543	}
544
545	if is_cached_rune_byte_suffix(comp, br) {
546		// We can't fiddle with cached suffixes, so make a clone of the head.
547		byterange := alloc_inst(comp, 1);
548		if byterange < 0	return 0;
549		init_byte_range(*inst[byterange], inst[br].lo, inst[br].hi, get_foldcase(inst[br]), get_out(inst[br]));
550
551		// Ensure that the parent points to the clone, not to the original.
552		// Note that this could leave the head unreachable except via the cache.
553		br = cast(u32)byterange;
554		if (f.end.head == 0) {
555			root = br;
556		} else if (f.end.head&1) {
557			inst[f.begin].out1 = cast(u32)br;
558		} else {
559			set_out(*inst[f.begin], br);
560		}
561	}
562
563	out := get_out(inst[id]);
564	if !is_cached_rune_byte_suffix(comp, id) {
565		// The head should be the instruction most recently allocated, so free it
566		// instead of leaving it unreachable.
567		assert(id == inst.count - 1);
568		inst[id].out_opcode = 0;
569		inst[id].out1 = 0;
570		inst.count -= 1;
571	}
572
573	out = add_suffix_recursive(comp, get_out(inst[br]), out);
574	if out == 0		return 0;
575
576	set_out(*inst[br], out);
577	return root;
578}
579
580byte_range_equal :: (using comp: *Compiler, id1: int, id2: int) -> bool {
581  return inst[id1].lo == inst[id2].lo &&
582         inst[id1].hi == inst[id2].hi &&
583         get_foldcase(inst[id1]) == get_foldcase(inst[id2]);
584}
585
586find_byte_range :: (using comp: *Compiler, root: int, id: int) -> Frag {
587	if get_opcode(inst[root]) == .kInstByteRange {
588		if byte_range_equal(comp, root, id) {
589			return make_frag(root, NULL_PATCH_LIST);
590		} else {
591			return no_match();
592		}
593	}
594
595	while get_opcode(inst[root]) == .kInstAlt {
596		out1 := inst[root].out1;
597		if byte_range_equal(comp, out1, id) {
598			return make_frag(root, make_patch_list((root << 1) | 1));
599		}
600
601		// CharClass is a sorted list of ranges, so if out1 of the root Alt wasn't
602		// what we're looking for, then we can stop immediately. Unfortunately, we
603		// can't short-circuit the search in reverse mode.
604		if (!reversed)	 return no_match();
605
606		out := get_out(inst[root]);
607		if get_opcode(inst[out]) == .kInstAlt {
608			root = out;
609		} else if byte_range_equal(comp, out, id) {
610			return make_frag(root, make_patch_list(root << 1));
611		} else {
612			return no_match();
613		}
614	}
615
616	assert(false, "Should never be reached");
617	return no_match();
618}
619
620end_range :: (using comp: *Compiler) -> Frag {
621	return rune_range;
622}
623
624// Converts rune range lo-hi into a fragment that recognizes
625// the bytes that would make up those runes in the current
626// encoding (always utf-8 for now).
627// This lets the machine work byte-by-byte even when
628// using multibyte encodings.
629add_rune_range :: (using comp: *Compiler, lo: Rune, hi: Rune, foldcase: bool) {
630	if (lo > hi)	return;
631
632	// Pick off 80-10FFFF as a common special case.
633	if (lo == 0x80 && hi == 0x10ffff) {
634		add_80_10ffff(comp);
635		return;
636	}
637
638	// Split range into same-length sized ranges.
639	for i: 1..UTFmax - 1 {
640		max := max_rune(i);
641		if (lo <= max && max < hi) {
642			add_rune_range(comp, lo, max, foldcase);
643			add_rune_range(comp, max+1, hi, foldcase);
644			return;
645		}
646	}
647
648	// ASCII range is always a special case.
649	if hi < Runeself {
650		add_suffix(comp, uncached_rune_byte_suffix(comp, cast(u8)lo, cast(u8)hi, foldcase, 0));
651		return;
652	}
653
654	// Split range into sections that agree on leading bytes.
655	for i: 1..UTFmax - 1 {
656		m := cast(s32)(1<<(6*i)) - 1;  // last i bytes of a UTF-8 sequence
657		if ((lo & ~m) != (hi & ~m)) {
658			if ((lo & m) != 0) {
659				add_rune_range(comp, lo, lo|m, foldcase);
660				add_rune_range(comp, (lo|m)+1, hi, foldcase);
661				return;
662			}
663			if ((hi & m) != m) {
664				add_rune_range(comp, lo, (hi&~m)-1, foldcase);
665				add_rune_range(comp, hi&~m, hi, foldcase);
666				return;
667			}
668		}
669	}
670
671	// Finally.  Generate byte matching equivalent for lo-hi.
672	ulo, n := bytes_from_rune(lo);
673	uhi, m := bytes_from_rune(hi);
674	assert(n == m);
675
676	// The logic below encodes this thinking:
677	//
678	// 1. When we have built the whole suffix, we know that it cannot
679	// possibly be a suffix of anything longer: in forward mode, nothing
680	// else can occur before the leading byte; in reverse mode, nothing
681	// else can occur after the last continuation byte or else the leading
682	// byte would have to change. Thus, there is no benefit to caching
683	// the first byte of the suffix whereas there is a cost involved in
684	// cloning it if it begins a common prefix, which is fairly likely.
685	//
686	// 2. Conversely, the last byte of the suffix cannot possibly be a
687	// prefix of anything because next == 0, so we will never want to
688	// clone it, but it is fairly likely to be a common suffix. Perhaps
689	// more so in reverse mode than in forward mode because the former is
690	// "converging" towards lower entropy, but caching is still worthwhile
691	// for the latter in cases such as 80-BF.
692	//
693	// 3. Handling the bytes between the first and the last is less
694	// straightforward and, again, the approach depends on whether we are
695	// "converging" towards lower entropy: in forward mode, a single byte
696	// is unlikely to be part of a common suffix whereas a byte range
697	// is more likely so; in reverse mode, a byte range is unlikely to
698	// be part of a common suffix whereas a single byte is more likely
699	// so. The same benefit versus cost argument applies here.
700	id := 0;
701	if (reversed) {
702		for i: 0..n-1 {
703			// In reverse UTF-8 mode: cache the leading byte; don't cache the last
704			// continuation byte; cache anything else iff it's a single byte (XX-XX).
705			if (i == 0 || (ulo[i] == uhi[i] && i != n-1)) {
706				id = cached_rune_byte_suffix(comp, ulo[i], uhi[i], false, id);
707			} else {
708				id = uncached_rune_byte_suffix(comp, ulo[i], uhi[i], false, id);
709			}
710		}
711	} else {
712		for < i: 0..n-1 {
713			// In forward UTF-8 mode: don't cache the leading byte; cache the last
714			// continuation byte; cache anything else iff it's a byte range (XX-YY).
715			if (i == n-1 || (ulo[i] < uhi[i] && i != 0)) {
716				id = cached_rune_byte_suffix(comp, ulo[i], uhi[i], false, id);
717			} else {
718				id = uncached_rune_byte_suffix(comp, ulo[i], uhi[i], false, id);
719			}
720		}
721	}
722	add_suffix(comp, id);
723}
724
725add_80_10ffff :: (using comp: *Compiler) {
726  // The 80-10FFFF (Runeself-Runemax) rune range occurs frequently enough
727  // (for example, for /./ and /[^a-z]/) that it is worth simplifying: by
728  // permitting overlong encodings in E0 and F0 sequences and code points
729  // over 10FFFF in F4 sequences, the size of the bytecode and the number
730  // of equivalence classes are reduced significantly.
731  id: int;
732  if reversed {
733    // Prefix factoring matters, but we don't have to handle it here
734    // because the rune range trie logic takes care of that already.
735    id = uncached_rune_byte_suffix(comp, 0xC2, 0xDF, false, 0);
736    id = uncached_rune_byte_suffix(comp, 0x80, 0xBF, false, id);
737    add_suffix(comp, id);
738
739    id = uncached_rune_byte_suffix(comp, 0xE0, 0xEF, false, 0);
740    id = uncached_rune_byte_suffix(comp, 0x80, 0xBF, false, id);
741    id = uncached_rune_byte_suffix(comp, 0x80, 0xBF, false, id);
742    add_suffix(comp, id);
743
744    id = uncached_rune_byte_suffix(comp, 0xF0, 0xF4, false, 0);
745    id = uncached_rune_byte_suffix(comp, 0x80, 0xBF, false, id);
746    id = uncached_rune_byte_suffix(comp, 0x80, 0xBF, false, id);
747    id = uncached_rune_byte_suffix(comp, 0x80, 0xBF, false, id);
748    add_suffix(comp, id);
749  } else {
750    // Suffix factoring matters - and we do have to handle it here.
751	cont1 := uncached_rune_byte_suffix(comp, 0x80, 0xBF, false, 0);
752    id = uncached_rune_byte_suffix(comp, 0xC2, 0xDF, false, cont1);
753    add_suffix(comp, id);
754
755    cont2 := uncached_rune_byte_suffix(comp, 0x80, 0xBF, false, cont1);
756    id = uncached_rune_byte_suffix(comp, 0xE0, 0xEF, false, cont2);
757    add_suffix(comp, id);
758
759    cont3 := uncached_rune_byte_suffix(comp, 0x80, 0xBF, false, cont2);
760    id = uncached_rune_byte_suffix(comp, 0xF0, 0xF4, false, cont3);
761    add_suffix(comp, id);
762  }
763}
764
765
766compiler_short_visit :: (using comp: *Compiler, re: *Regexp_Node, f: Frag) -> Frag {
767	failed = true;
768	return no_match();
769}
770
771compiler_pre_visit :: (using comp: *Compiler, re: *Regexp_Node, f: Frag) -> Frag, stop: bool {
772	return .{}, failed;
773}
774
775literal :: (using comp: *Compiler, r: Rune, foldcase: bool) -> Frag {
776	if (r < Runeself) { // Make common case fast.
777		return byte_range(comp, cast(u8)r, cast(u8)r, foldcase);
778	}
779
780	buf, n := bytes_from_rune(r);
781	f := byte_range(comp, buf[0], buf[0], false);
782	for i: 1..n-1 {
783		f = cat(comp, f, byte_range(comp, buf[i], buf[i], false));
784	}
785	return f;
786}
787
788// Called after traversing the node's children during the walk.
789// Given their frags, build and return the frag for this re.
790// @ToDo @Cleanup: Compiler never uses any input args. Use something more specific than "walk"
791compiler_post_visit :: (using comp: *Compiler, re: *Regexp_Node, parent_arg: Frag, pre_arg: Frag, child_frags: [..] Frag) -> Frag {
792	// If a child failed, don't bother going forward, especially
793	// since the child_frags might contain Frags with NULLs in them.
794	if failed	return no_match();
795
796	// Given the child fragments, return the fragment for this node.
797	if re.op == {
798		case;
799			assert(false, "Unexpected op: %", re.op);
800			return no_match();
801
802		case .NoMatch;
803			return no_match();
804
805		case .EmptyMatch;
806			return nop(comp);
807
808		case .HaveMatch;
809			f := match(comp, re.match_id);
810			if anchor == .ANCHOR_BOTH {
811				// Append \z or else the subexpression will effectively be unanchored.
812				// Complemented by the Unanchored case in CompileSet().
813				f = cat(comp, empty_width(comp, .kEmptyEndText), f);
814			}
815			return f;
816
817		case .Concat;
818			f := child_frags[0];
819			for i: 1..child_frags.count - 1 {
820				f = cat(comp, f, child_frags[i]);
821			}
822			return f;
823
824		case .Alternate;
825			f := child_frags[0];
826			for i: 1..child_frags.count - 1 {
827				f = alt(comp, f, child_frags[i]);
828			}
829			return f;
830
831		case .Star;
832			return star(comp, child_frags[0], (re.parse_flags & .NonGreedy) != 0);
833
834		case .Plus;
835			return plus(comp, child_frags[0], (re.parse_flags & .NonGreedy) != 0);
836
837		case .Quest;
838			return quest(comp, child_frags[0], (re.parse_flags & .NonGreedy) != 0);
839
840		case .Literal;
841			return literal(comp, re.rune, (re.parse_flags & .FoldCase) != 0);
842
843		case .LiteralString;
844			// Concatenation of literals.
845			if re.runes.count == 0	return nop(comp);
846
847			f: Frag;
848			for re.runes {
849				f1 := literal(comp, it, (re.parse_flags & .FoldCase) != 0);
850				if (it_index == 0) {
851					f = f1;
852				} else {
853					f = cat(comp, f, f1);
854				}
855			}
856			return f;
857
858		case .AnyChar;
859			begin_range(comp);
860			add_rune_range(comp, 0, Runemax, false);
861			return end_range(comp);
862
863		case .AnyByte;
864			return byte_range(comp, 0x00, 0xFF, false);
865
866		case .CharClass;
867			assert(re.char_class.ranges.count > 0, "No ranges in char class");
868
869			// ASCII case-folding optimization: if the char class
870			// behaves the same on A-Z as it does on a-z,
871			// discard any ranges wholly contained in A-Z
872			// and mark the other ranges as foldascii.
873			// This reduces the size of a program for
874			// (?i)abc from 3 insts per letter to 1 per letter.
875			foldascii := re.char_class.folds_ascii;
876
877			// Character class is just a big OR of the different
878			// character ranges in the class.
879			begin_range(comp);
880			for re.char_class.ranges {
881				// ASCII case-folding optimization (see above).
882				if foldascii && #char "A" <= it.lo && it.hi <= #char "Z"	continue;
883
884				// If this range contains all of A-Za-z or none of it,
885				// the fold flag is unnecessary; don't bother.
886				fold := foldascii;
887				if it.lo <= #char "A" && #char "z" <= it.hi || it.hi < #char "A" || #char "z" < it.lo || #char "Z" < it.lo && it.hi < #char "a" {
888					fold = false;
889				}
890
891				add_rune_range(comp, it.lo, it.hi, fold);
892			}
893			return end_range(comp);
894
895		case .Capture;
896			// If this is a non-capturing parenthesis -- (?:foo) --
897			// just use the inner expression.
898			if re.cap < 0 {
899				return child_frags[0];
900			} else {
901				return capture(comp, child_frags[0], re.cap);
902			}
903
904		case .BeginLine;
905			return empty_width(comp, ifx reversed then EmptyOp.kEmptyEndLine else EmptyOp.kEmptyBeginLine);
906
907		case .EndLine;
908			return empty_width(comp, ifx reversed then EmptyOp.kEmptyBeginLine else EmptyOp.kEmptyEndLine);
909
910		case .BeginText;
911			return empty_width(comp, ifx reversed then EmptyOp.kEmptyEndText else EmptyOp.kEmptyBeginText);
912
913		case .EndText;
914			return empty_width(comp, ifx reversed then EmptyOp.kEmptyBeginText else EmptyOp.kEmptyEndText);
915
916		case .WordBoundary;
917			return empty_width(comp, .kEmptyWordBoundary);
918
919		case .NoWordBoundary;
920			return empty_width(comp, .kEmptyNonWordBoundary);
921	}
922}
923
924// Is this regexp required to start at the beginning of the text?
925// Only approximate; can return false for complicated regexps like (\Aa|\Ab),
926// but handles (\A(a|b)).  Could use the Walker to write a more exact one.
927is_anchor_start :: (re: *Regexp_Node, pool: *Regexp_Pool, depth: int) -> *Regexp_Node, bool {
928	// The depth limit makes sure that we don't overflow
929	// the stack on a deeply nested regexp.  As the comment
930	// above says, IsAnchorStart is conservative, so returning
931	// a false negative is okay.  The exact limit is somewhat arbitrary.
932	if re == null || depth >= 4		return re, false;
933
934	if re.op == {
935		case .Concat;
936			if re.subs.count > 0 {
937				sub, sub_start := is_anchor_start(re.subs[0], pool, depth + 1);
938				if sub_start {
939					return new_concat(pool, re.subs, re.parse_flags), true;
940				}
941			}
942		case .Capture;
943			sub, sub_start := is_anchor_start(re.subs[0], pool, depth + 1);
944			if sub_start {
945				return new_capture(pool, sub, re.parse_flags, re.cap), true;
946			}
947		case .BeginText;
948			return new_literal_string(pool, .[], re.parse_flags), true;
949	}
950	return re, false;
951}
952
953// Is this regexp required to start at the end of the text?
954// Only approximate; can return false for complicated regexps like (a\z|b\z),
955// but handles ((a|b)\z).  Could use the Walker to write a more exact one.
956is_anchor_end :: (re: *Regexp_Node, pool: *Regexp_Pool, depth: int) -> *Regexp_Node, bool {
957	// The depth limit makes sure that we don't overflow
958	// the stack on a deeply nested regexp.  As the comment
959	// above says, IsAnchorEnd is conservative, so returning
960	// a false negative is okay.  The exact limit is somewhat arbitrary.
961	if (re == null || depth >= 4)		 return re, false;
962
963	if re.op == {
964		case .Concat;
965			if re.subs.count > 0 {
966				sub, sub_end := is_anchor_end(re.subs[re.subs.count - 1], pool, depth + 1);
967				if sub_end {
968					return new_concat(pool, re.subs, re.parse_flags), true;
969				}
970			}
971		case .Capture;
972			sub, sub_end := is_anchor_end(re.subs[0], pool, depth + 1);
973			if sub_end {
974				return new_capture(pool, sub, re.parse_flags, re.cap), true;
975			}
976
977		case .EndText;
978			return new_literal_string(pool, .[], re.parse_flags), true;
979	}
980
981	return re, false;
982}
983
984#scope_file
985
986#import "Hash_Table";
987
index : blog