git ptrace.dev


        
            
            
        
        0#scope_module
1
2// Opcodes for Inst
3InstOp :: enum u32 {
4	kInstAlt :: 0;      // choose between out_ and out1_
5	kInstAltMatch;     // Alt: out_ is [00-FF] and back, out1_ is match; or vice versa.
6	kInstByteRange;    // next (possible case-folded) byte must be in [lo_, hi_]
7	kInstCapture;      // capturing parenthesis number cap_
8	kInstEmptyWidth;   // empty-width special (^ $ ...); bit(s) set in empty_
9	kInstMatch;        // found a match!
10	kInstNop;          // no-op; occasionally unavoidable
11	kInstFail;         // never match; occasionally unavoidable
12}
13
14// Bit flags for empty-width specials
15EmptyOp :: enum_flags {
16	kEmptyBeginLine        :: 1<<0;      // ^ - beginning of line
17	kEmptyEndLine          :: 1<<1;      // $ - end of line
18	kEmptyBeginText        :: 1<<2;      // \A - beginning of text
19	kEmptyEndText          :: 1<<3;      // \z - end of text
20	kEmptyWordBoundary     :: 1<<4;      // \b - word boundary
21	kEmptyNonWordBoundary  :: 1<<5;      // \B - not \b
22	kEmptyAllFlags         :: (1<<6)-1;
23}
24
25// Single instruction in regexp program.
26Inst :: struct  {
27	//     // Getters
28	//     int id(Prog* p) { return static_cast<int>(this - p->inst_.data()); }
29	//     int out1()      { DCHECK(opcode() == kInstAlt || opcode() == kInstAltMatch); return out1_; }
30	//     int cap()       { DCHECK_EQ(opcode(), kInstCapture); return cap_; }
31	//     int lo()        { DCHECK_EQ(opcode(), kInstByteRange); return lo_; }
32	//     int hi()        { DCHECK_EQ(opcode(), kInstByteRange); return hi_; }
33	//     int foldcase()  { DCHECK_EQ(opcode(), kInstByteRange); return hint_foldcase_&1; }
34	//     int hint()      { DCHECK_EQ(opcode(), kInstByteRange); return hint_foldcase_>>1; }
35	//     int match_id()  { DCHECK_EQ(opcode(), kInstMatch); return match_id_; }
36	//     EmptyOp empty() { DCHECK_EQ(opcode(), kInstEmptyWidth); return empty_; }
37
38	// Maximum instruction id.
39	// (Must fit in out_opcode_. PatchList/last steal another bit.)
40	MAX_INST :: (1<<28) - 1;
41
42	out_opcode: u32;  // 28 bits: out, 1 bit: last, 3 (low) bits: opcode
43	union {                // additional instruction arguments:
44		out1: u32;      // opcode == kInstAlt
45						//   alternate next instruction
46		cap: s32;       // opcode == kInstCapture
47						//   Index of capture register (holds text
48						//   position recorded by capturing parentheses).
49						//   For \n (the submatch for the nth parentheses),
50						//   the left parenthesis captures into register 2*n
51						//   and the right one captures into register 2*n+1.
52
53		match_id: s32;  // opcode == kInstMatch
54						//   Match ID to identify this match (for re2::Set).
55
56		struct {        // opcode == kInstByteRange
57			lo: u8;			//   byte range is lo_-hi_ inclusive
58			hi: u8;       //
59			hint_foldcase: u16;  // 15 bits: hint, 1 (low) bit: foldcase
60							//   hint to execution engines: the delta to the
61							//   next instruction (in the current list) worth
62							//   exploring iff this instruction matched; 0
63							//   means there are no remaining possibilities,
64							//   which is most likely for character classes.
65							//   foldcase: A-Z -> a-z before checking range.
66		}
67
68		empty: EmptyOp;       // opcode == kInstEmptyWidth
69							//   empty_ is bitwise OR of kEmpty* flags above.
70	};
71}
72
73get_opcode :: (using inst: Inst) -> InstOp {
74	return cast(InstOp) (out_opcode & 7);
75}
76
77get_last :: (using inst: Inst) -> bool {
78	return cast(bool)((out_opcode >> 3) & 1);
79}
80
81get_out :: (using inst: Inst) -> u32 {
82	return out_opcode >> 4;
83}
84
85set_opcode :: (using inst: *Inst, opcode: InstOp) {
86	out_opcode = out_opcode & 0xFFFF_FFF8 | cast(u32)opcode;
87}
88
89set_last :: (using inst: *Inst) {
90	out_opcode = out_opcode & 0xFFFF_FFF7 | cast(u32)(1 << 3);
91}
92
93set_out :: (using inst: *Inst, out: int) {
94	out_opcode = out_opcode & 0x0F | cast(u32)(out << 4);
95}
96
97set_out_opcode :: (using inst: *Inst, out: int, opcode: InstOp) {
98	out_opcode = out_opcode & 0x08 | cast(u32)(out << 4) | cast(u32)opcode;
99}
100
101
102get_foldcase :: (using inst: Inst) -> bool {
103	assert(get_opcode(inst) == .kInstByteRange);
104	return cast(bool) (hint_foldcase & 1);
105}
106
107get_hint :: (using inst: Inst) -> int {
108	assert(get_opcode(inst) == .kInstByteRange);
109	return hint_foldcase >> 1;
110}
111
112greedy :: (using inst: Inst, p: *Prog) -> bool {
113	assert(get_opcode(inst) == .kInstAltMatch);
114	out_inst := p.inst[get_out(inst)];
115	return get_opcode(out_inst) == .kInstByteRange ||
116		(get_opcode(out_inst) == .kInstNop && get_opcode(p.inst[get_out(out_inst)]) == .kInstByteRange);
117}
118
119// Does this inst (an kInstByteRange) match c?
120// @ToDo: should this int be Rune?
121matches :: (using inst: Inst, c: int) -> bool {
122	assert(get_opcode(inst) == .kInstByteRange);
123	if (get_foldcase(inst) && #char "A" <= c && c <= #char "Z") {
124		c += #char "a" - #char "A";
125	}
126	return lo <= c && c <= hi;
127}
128
129
130// Returns whether byte c is a word character: ASCII only.
131// Used by the implementation of \b and \B.
132// This is not right for Unicode, but:
133//   - it's hard to get right in a byte-at-a-time matching world
134//     (the DFA has only one-byte lookahead).
135//   - even if the lookahead were possible, the Progs would be huge.
136// This crude approximation is the same one PCRE uses.
137is_word_char :: (c: u8) -> bool {
138	return (#char "A" <= c && c <= #char "Z") ||
139		(#char "a" <= c && c <= #char "z") ||
140		(#char "0" <= c && c <= #char "9") ||
141		c == #char "_";
142}
143
144// Compiled form of regexp program.
145Prog :: struct {
146	// Whether to anchor the search.
147	Anchor :: enum {
148		UNANCHORED;  // match anywhere
149		ANCHORED;    // match only starting at beginning of text
150	}
151
152	// Kind of match to look for (for anchor != kFullMatch)
153	//
154	// kLongestMatch mode finds the overall longest
155	// match but still makes its submatch choices the way
156	// Perl would, not in the way prescribed by POSIX.
157	// The POSIX rules are much more expensive to implement,
158	// and no one has needed them.
159	//
160	// kFullMatch is not strictly necessary -- we could use
161	// kLongestMatch and then check the length of the match -- but
162	// the matching code can run faster if it knows to consider only
163	// full matches.
164	MatchKind :: enum {
165		kFirstMatch;     // like Perl, PCRE
166		kLongestMatch;   // like egrep or POSIX
167		kFullMatch;      // match only entire text; implies anchor==ANCHORED
168		kManyMatch;      // for SearchDFA, records set of matches
169	};
170
171	anchor_start: bool;       // regexp has explicit start anchor
172	anchor_end: bool;         // regexp has explicit end anchor
173	reversed: bool;           // whether program runs backward over input
174	did_flatten: bool;        // has Flatten been called?
175	did_onepass: bool;        // has IsOnePass been called?
176
177	start: int;               // entry point for program
178	start_unanchored: int;    // unanchored entry point for program
179	bytemap_range: int;       // bytemap_[x] < bytemap_range_
180	prefix_size: int;		// size of prefix (0 if no prefix)
181	prefix_front: s16 = -1;        // first byte of prefix (-1 if no prefix)
182	prefix_back: s16 = -1;         // last byte of prefix (-1 if no prefix)
183
184	list_count: int;                 // count of lists (see above)
185	inst_count: [#run enum_highest_value(InstOp) + 1] int;       // count of instructions by opcode
186	list_heads: [..] u16;  // sparse array enumerating list heads
187	// not populated if size_ is overly large
188
189	inst: [..] Inst;              // pointer to instruction array
190	onepass_nodes: [..] u8;  // data for OnePass nodes
191
192	dfa_mem: s64;         // Maximum memory for DFAs.
193	// dfa_first: *DFA;          // DFA cached for kFirstMatch/kManyMatch
194	// dfa_longest: *DFA;        // DFA cached for kLongestMatch/kFullMatch
195
196	bytemap: [256] u8;    // map from input bytes to byte classes
197
198	// std::once_flag dfa_first_once_;
199	// std::once_flag dfa_longest_once_;
200}
201
202  // Inst *inst(int id) { return &inst_[id]; }
203  // int start() { return start_; }
204  // void set_start(int start) { start_ = start; }
205  // int start_unanchored() { return start_unanchored_; }
206  // void set_start_unanchored(int start) { start_unanchored_ = start; }
207  // int size() { return size_; }
208  // bool reversed() { return reversed_; }
209  // void set_reversed(bool reversed) { reversed_ = reversed; }
210  // int list_count() { return list_count_; }
211  // int inst_count(InstOp op) { return inst_count_[op]; }
212  // uint16_t* list_heads() { return list_heads_.data(); }
213  // int64_t dfa_mem() { return dfa_mem_; }
214  // void set_dfa_mem(int64_t dfa_mem) { dfa_mem_ = dfa_mem; }
215  // bool anchor_start() { return anchor_start_; }
216  // void set_anchor_start(bool b) { anchor_start_ = b; }
217  // bool anchor_end() { return anchor_end_; }
218  // void set_anchor_end(bool b) { anchor_end_ = b; }
219  // int bytemap_range() { return bytemap_range_; }
220  // const uint8_t* bytemap() { return bytemap_; }
221  // bool can_prefix_accel() { return prefix_size_ != 0; }
222
223  // // Accelerates to the first likely occurrence of the prefix.
224  // // Returns a pointer to the first byte or NULL if not found.
225  // const void* PrefixAccel(const void* data, size_t size) {
226  //   DCHECK_GE(prefix_size_, 1);
227  //   return prefix_size_ == 1 ? memchr(data, prefix_front_, size)
228  //                            : PrefixAccel_FrontAndBack(data, size);
229  // }
230
231
232
233can_bit_state :: (using prog: *Prog) -> bool {
234	return list_heads.count > 0;
235}
236
237init_alt :: (inst: *Inst, out: u32, out1: u32) {
238	assert(inst.out_opcode == 0);
239	set_out_opcode(inst, out, .kInstAlt);
240	inst.out1 = out1;
241}
242
243init_byte_range :: (inst: *Inst, lo: u8, hi: u8, foldcase: bool, out: int) {
244	assert(inst.out_opcode == 0);
245	set_out_opcode(inst, out, .kInstByteRange);
246	inst.lo = lo;
247	inst.hi = hi;
248	inst.hint_foldcase = (cast(u16)foldcase)&1;
249}
250
251init_capture :: (inst: *Inst, cap: s32, out: u32) {
252	assert(inst.out_opcode == 0);
253	set_out_opcode(inst, out, .kInstCapture);
254	inst.cap = cap;
255}
256
257init_empty_width :: (inst: *Inst, empty: EmptyOp, out: u32) {
258	assert(inst.out_opcode == 0);
259	set_out_opcode(inst, out, .kInstEmptyWidth);
260	inst.empty = empty;
261}
262
263init_match :: (inst: *Inst, id: s32) {
264	assert(inst.out_opcode == 0);
265	set_opcode(inst, .kInstMatch);
266	inst.match_id = id;
267}
268
269init_nop :: (inst: *Inst) {
270	assert(inst.out_opcode == 0);
271	set_opcode(inst, .kInstNop);
272}
273
274init_fail :: (inst: *Inst) {
275	assert(inst.out_opcode == 0);
276	set_opcode(inst, .kInstFail);
277}
278
279// Peep-hole optimizer.
280optimize :: (using prog: *Prog) {
281	q: Sparse_Set;
282	init(*q, inst.count);
283
284	// Eliminate nops.  Most are taken out during compilation
285	// but a few are hard to avoid.
286	q.count = 0;
287	add_if_nonzero(*q, start);
288	for id: q {
289		ip := *inst[id];
290		j := get_out(ip.*);
291		// ToDo: Why is this a pointer????
292		jp: *Inst;
293		while true {
294			if j == 0	break;
295			jp = *inst[j];
296			if get_opcode(jp.*) != .kInstNop	break;
297			j = get_out(jp.*);
298		}
299		set_out(ip, j);
300		add_if_nonzero(*q, j);
301		if get_opcode(ip.*) == .kInstAlt {
302			j = ip.out1;
303			while true {
304				if j == 0	break;
305				jp = *inst[j];
306				if get_opcode(jp.*) != .kInstNop	break;
307				j = get_out(jp.*);
308			}
309			ip.out1 = j;
310			add_if_nonzero(*q, j);
311		}
312	}
313
314	// Insert kInstAltMatch instructions
315	// Look for
316	//   ip: Alt -> j | k
317	//	  j: ByteRange [00-FF] -> ip
318	//    k: Match
319	// or the reverse (the above is the greedy one).
320	// Rewrite Alt to AltMatch.
321	q.count = 0;
322	add_if_nonzero(*q, start);
323	for id: q {
324		ip := *inst[id];
325		add_if_nonzero(*q, get_out(ip.*));
326		if get_opcode(ip.*) == .kInstAlt {
327			add_if_nonzero(*q, ip.out1);
328
329			// ToDo: Why are these pointers????
330			j := inst[get_out(ip.*)];
331			k := inst[ip.out1];
332			if get_opcode(j) == .kInstByteRange && get_out(j) == id && j.lo == 0x00 && j.hi == 0xFF && is_match(prog, k) {
333				set_opcode(ip, .kInstAltMatch);
334				continue;
335			}
336			if is_match(prog, j) && get_opcode(k) == .kInstByteRange && get_out(k) == id && k.lo == 0x00 && k.hi == 0xFF {
337				set_opcode(ip, .kInstAltMatch);
338			}
339		}
340	}
341}
342
343add_if_nonzero :: (set: *Sparse_Set, i: int) {
344	if i != 0 {
345		add(set, i);
346	}
347}
348
349// Is ip a guaranteed match at end of text, perhaps after some capturing?
350is_match :: (prog: *Prog, i_in: Inst) -> bool {
351	i := i_in;
352	while true {
353		if #complete get_opcode(i) == {
354			case .kInstAlt; #through;
355			case .kInstAltMatch; #through;
356			case .kInstByteRange; #through;
357			case .kInstFail; #through;
358			case .kInstEmptyWidth;
359				return false;
360
361			case .kInstCapture; #through;
362			case .kInstNop;
363				i = prog.inst[get_out(i)];
364
365			case .kInstMatch;
366				return true;
367		}
368	}
369	// Should not be needed
370	assert(false);
371	return false;
372}
373
374empty_flags :: (text: string, pos: int) -> EmptyOp {
375	flags: EmptyOp;
376
377	// ^ and \A
378	if pos == 0 {
379		flags |= EmptyOp.kEmptyBeginText | .kEmptyBeginLine;
380	} else if text[pos - 1] == #char "\n" {
381		flags |= .kEmptyBeginLine;
382	}
383
384	// $ and \z
385	if pos == text.count {
386		flags |= EmptyOp.kEmptyEndText | .kEmptyEndLine;
387	} else if text[pos] == #char "\n" { // @ToDo: I removed a pos < count check here. Can pos ever be greater than count??
388		flags |= .kEmptyEndLine;
389	}
390
391	// \b and \B
392	if pos == 0 && text.count == 0 {
393		// no word boundary here
394	} else if pos == 0 {
395		if is_word_char(text[pos]) {
396			flags |= .kEmptyWordBoundary;
397		}
398	} else if pos == text.count {
399		if is_word_char(text[pos-1]) {
400			flags |= .kEmptyWordBoundary;
401		}
402	} else {
403		if (is_word_char(text[pos-1]) != is_word_char(text[pos])) {
404			flags |= .kEmptyWordBoundary;
405		}
406	}
407	if !(flags & .kEmptyWordBoundary) {
408		flags |= .kEmptyNonWordBoundary;
409	}
410
411	return flags;
412}
413
414compute_byte_map :: (using prog: *Prog) {
415	// Fill in bytemap with byte classes for the program.
416	// Ranges of bytes that are treated indistinguishably
417	// will be mapped to a single byte class.
418	builder: ByteMapBuilder;
419	init(*builder);
420
421	// Don't repeat the work for ^ and $.
422	marked_line_boundaries := false;
423	// Don't repeat the work for \b and \B.
424	marked_word_boundaries := false;
425
426	for id: 0..inst.count - 1 {
427		ip := *inst[id];
428		if get_opcode(ip.*) == .kInstByteRange {
429            lo: s32 = ip.lo;
430            hi: s32 = ip.hi;
431			mark(*builder, lo, hi);
432			if get_foldcase(ip.*) && lo <= #char "z" && hi >= #char "a" {
433				foldlo := lo;
434				foldhi := hi;
435				if foldlo < #char "a" {
436					foldlo = #char "a";
437				}
438				if foldhi > #char "z" {
439					foldhi = #char "z";
440				}
441				if (foldlo <= foldhi) {
442					foldlo += #char "A" - #char "a";
443					foldhi += #char "A" - #char "a";
444					mark(*builder, foldlo, foldhi);
445				}
446			}
447			// If this Inst is not the last Inst in its list AND the next Inst is
448			// also a ByteRange AND the Insts have the same out, defer the merge.
449			if !get_last(ip.*) && get_opcode(inst[id + 1]) == .kInstByteRange && get_out(ip.*) == get_out(inst[id+1]) {
450				continue;
451			}
452			merge(*builder);
453		} else if get_opcode(ip.*) == .kInstEmptyWidth {
454			if ip.empty & (EmptyOp.kEmptyBeginLine|.kEmptyEndLine) && !marked_line_boundaries {
455				mark(*builder, #char "\n", #char "\n");
456				merge(*builder);
457				marked_line_boundaries = true;
458			}
459			if ip.empty & (EmptyOp.kEmptyWordBoundary|.kEmptyNonWordBoundary) && !marked_word_boundaries {
460				// We require two batches here: the first for ranges that are word
461				// characters, the second for ranges that are not word characters.
462				for isword: bool.[true, false] {
463					i := 0;
464					j: int;
465					while i < 256 {
466						j = i + 1;
467						while j < 256 && is_word_char(cast(u8)i) == is_word_char(cast(u8)j) {
468							j += 1;
469						}
470						if is_word_char(cast(u8)i) == isword {
471							mark(*builder, cast(u8)i, cast(u8)(j - 1));
472						}
473						i = j;
474					}
475					merge(*builder);
476				}
477				marked_word_boundaries = true;
478			}
479		}
480	}
481
482	bytemap, bytemap_range = build(*builder);
483}
484
485// ByteMapBuilder implements a coloring algorithm.
486//
487// The first phase is a series of "mark and merge" batches: we mark one or more
488// [lo-hi] ranges, then merge them into our internal state. Batching is not for
489// performance; rather, it means that the ranges are treated indistinguishably.
490//
491// Internally, the ranges are represented using a bitmap that stores the splits
492// and a vector that stores the colors; both of them are indexed by the ranges'
493// last bytes. Thus, in order to merge a [lo-hi] range, we split at lo-1 and at
494// hi (if not already split), then recolor each range in between. The color map
495// (i.e. from the old color to the new color) is maintained for the lifetime of
496// the batch and so underpins this somewhat obscure approach to set operations.
497//
498// The second phase builds the bytemap from our internal state: we recolor each
499// range, then store the new color (which is now the byte class) in each of the
500// corresponding array elements. Finally, we output the number of byte classes.
501ByteMapBuilder :: struct {
502	splits: Bit_Array;
503	colors: [256] int;
504	nextcolor: int;
505	colormap: [..] Color_Pair;
506	ranges: [..] Range;
507
508	Range :: struct {
509		lo: u8;
510		hi: u8;
511	}
512	Color_Pair :: struct {
513		a: int;
514		b: int;
515	}
516}
517
518// Initial state: the [0-255] range has color 256.
519// This will avoid problems during the second phase,
520// in which we assign byte classes numbered from 0.
521init :: (using b: *ByteMapBuilder) {
522	init_bit_array(*splits, 256);
523	set_bit(*splits, 255);
524	colors[255] = 256;
525	nextcolor = 257;
526}
527
528mark :: (using builder: *ByteMapBuilder, lo: s32, hi: s32) {
529	assert(lo <= hi);
530
531	// Ignore any [0-255] ranges. They cause us to recolor every range, which
532	// has no effect on the eventual result and is therefore a waste of time.
533	if lo == 0 && hi == 255		return;
534
535	r: Range;
536	r.lo = cast(u8) lo;
537	r.hi = cast(u8) hi;
538	array_add(*ranges, r);
539}
540
541find_next_set_bit :: (using set: Bit_Array, start: int) -> s16 {
542	i := start >> 6;
543	// Mask out everything below start
544	word := slots[i] & (0xFFFF_FFFF_FFFF_FFFF << (start & 63));
545	if word {
546		return cast(u8)((i << 6) + bit_scan_forward(word) - 1);
547	}
548
549	i += 1;
550	while i < slots.count {
551		if slots[i]	{
552			return cast(u8)((i << 6) + bit_scan_forward(slots[i]) - 1);
553		}
554		i += 1;
555	}
556	return -1;
557}
558
559merge :: (using builder: *ByteMapBuilder) {
560	for r: ranges {
561		lo := (cast(s16)r.lo) - 1;
562		hi: s16 = r.hi;
563
564		if lo >= 0 && !splits[lo] {
565			set_bit(*splits, lo);
566			next := find_next_set_bit(splits, lo + 1);
567			colors[lo] = colors[next];
568		}
569
570		if !splits[hi] {
571			set_bit(*splits, hi);
572			next := find_next_set_bit(splits, hi + 1);
573			colors[hi] = colors[next];
574		}
575
576		c := lo + 1;
577		while c < 256 {
578			next := find_next_set_bit(splits, c);
579			colors[next] = recolor(builder, colors[next]);
580			if next == hi	break;
581
582			c = next + 1;
583		}
584	}
585	colormap.count = 0;
586	ranges.count = 0;
587}
588
589build :: (using builder: *ByteMapBuilder) -> [256] u8, bytemap_range: int {
590	// Assign byte classes numbered from 0.
591	nextcolor = 0;
592
593	bytemap: [256] u8;
594	c := 0;
595	while c < 256 {
596		next := find_next_set_bit(splits, c);
597		b := cast(u8) recolor(builder, colors[next]);
598		while c <= next {
599			bytemap[c] = b;
600			c += 1;
601		}
602	}
603
604	return bytemap, nextcolor;
605}
606
607recolor :: (using builder: *ByteMapBuilder, oldcolor: int) -> int {
608	// Yes, this is a linear search. There can be at most 256
609	// colors and there will typically be far fewer than that.
610	// Also, we need to consider keys *and* values in order to
611	// avoid recoloring a given range more than once per batch.
612	for colormap {
613		if it.a == oldcolor || it.b == oldcolor	return it.b;
614	}
615
616	newcolor := nextcolor;
617	nextcolor += 1;
618	p: Color_Pair;
619	p.a = oldcolor;
620	p.b = newcolor;
621	array_add(*colormap, p);
622	return newcolor;
623}
624
625
626// Prog::Flatten() implements a graph rewriting algorithm.
627//
628// The overall process is similar to epsilon removal, but retains some epsilon
629// transitions: those from Capture and EmptyWidth instructions; and those from
630// nullable subexpressions. (The latter avoids quadratic blowup in transitions
631// in the worst case.) It might be best thought of as Alt instruction elision.
632//
633// In conceptual terms, it divides the Prog into "trees" of instructions, then
634// traverses the "trees" in order to produce "lists" of instructions. A "tree"
635// is one or more instructions that grow from one "root" instruction to one or
636// more "leaf" instructions; if a "tree" has exactly one instruction, then the
637// "root" is also the "leaf". In most cases, a "root" is the successor of some
638// "leaf" (i.e. the "leaf" instruction's out() returns the "root" instruction)
639// and is considered a "successor root". A "leaf" can be a ByteRange, Capture,
640// EmptyWidth or Match instruction. However, this is insufficient for handling
641// nested nullable subexpressions correctly, so in some cases, a "root" is the
642// dominator of the instructions reachable from some "successor root" (i.e. it
643// has an unreachable predecessor) and is considered a "dominator root". Since
644// only Alt instructions can be "dominator roots" (other instructions would be
645// "leaves"), only Alt instructions are required to be marked as predecessors.
646//
647// Dividing the Prog into "trees" comprises two passes: marking the "successor
648// roots" and the predecessors; and marking the "dominator roots". Sorting the
649// "successor roots" by their bytecode offsets enables iteration in order from
650// greatest to least during the second pass; by working backwards in this case
651// and flooding the graph no further than "leaves" and already marked "roots",
652// it becomes possible to mark "dominator roots" without doing excessive work.
653//
654// Traversing the "trees" is just iterating over the "roots" in order of their
655// marking and flooding the graph no further than "leaves" and "roots". When a
656// "leaf" is reached, the instruction is copied with its successor remapped to
657// its "root" number. When a "root" is reached, a Nop instruction is generated
658// with its successor remapped similarly. As each "list" is produced, its last
659// instruction is marked as such. After all of the "lists" have been produced,
660// a pass over their instructions remaps their successors to bytecode offsets.
661flatten :: (using prog: *Prog) {
662	if did_flatten {
663		return;
664	}
665	did_flatten = true;
666
667	// Scratch structures. It's important that these are reused by functions
668	// that we call in loops because they would thrash the heap otherwise.
669	reachable: Sparse_Set;
670	stk: [..] int;
671	init(*reachable, inst.count);
672	array_reserve(*stk, inst.count);
673
674	// First pass: Marks "successor roots" and predecessors.
675	// Builds the mapping from inst-ids to root-ids.
676	rootmap, predmap, preds_list := mark_successors(prog, *reachable, *stk);
677
678	// Second pass: Marks "dominator roots".
679	sorted := array_copy(rootmap.dense);
680	sorted.count = rootmap.count;
681	intro_sort(sorted, (a: $T, b: T) -> int {
682		if a.index < b.index {
683			return -1;
684		} else {
685			return 1;
686		}
687	});
688
689	for < sorted {
690		if it.index != start_unanchored && it.index != start {
691			mark_dominator(prog, it.index, *rootmap, predmap, preds_list, *reachable, *stk);
692		}
693	}
694
695	// Third pass: Emits "lists". Remaps outs to root-ids.
696	// Builds the mapping from root-ids to flat-ids.
697	flatmap: [..] int;
698	flat: [..] Inst;
699	array_resize(*flatmap, rootmap.count, false);
700	array_reserve(*flat, inst.count);
701
702	for rootmap {
703		flatmap[it.value] = flat.count;
704		emit_list(prog, it.index, *rootmap, *flat, *reachable, *stk);
705		set_last(*flat[flat.count - 1]);
706		// We have the bounds of the "list", so this is the
707		// most convenient point at which to compute hints.
708		compute_hints(prog, *flat, flatmap[it.value], flat.count);
709	}
710
711	list_count = flatmap.count;
712	memset(inst_count.data, 0, inst_count.count * size_of(type_of(inst_count[0])));
713	// Fourth pass: Remaps outs to flat-ids.
714	// Counts instructions by opcode.
715	for * flat {
716		if (get_opcode(it.*) != .kInstAltMatch) { // handled in EmitList()
717			set_out(it, flatmap[get_out(it.*)]);
718		}
719		inst_count[get_opcode(it.*)] += 1;
720	}
721
722	total := 0;
723	for inst_count {
724		total += it;
725	}
726	assert(total == flat.count);
727
728	// Remap start_unanchored and start.
729	if start_unanchored == 0 {
730		assert(start == 0);
731	} else if (start_unanchored == start) {
732		start_unanchored = flatmap[1];
733		start = flatmap[1];
734	} else {
735		start_unanchored = flatmap[1];
736		start = flatmap[2];
737	}
738
739	// Finally, replace the old instructions with the new instructions.
740	inst = flat;
741
742	// Populate the list heads for BitState.
743	// 512 instructions limits the memory footprint to 1KiB.
744	if inst.count <= 512 {
745		array_resize(*list_heads, inst.count, false);
746		// 0xFF makes it more obvious if we try to look up a non-head.
747		memset(list_heads.data, 0xFF, inst.count * size_of(type_of(list_heads[0])));
748		for 0..list_count - 1 {
749			list_heads[flatmap[it]] = cast(u16)it;
750		}
751	}
752}
753
754mark_successors :: (using prog: *Prog, reachable: *Sparse_Set, stk: *[..] int) -> rootmap: Sparse_Array(int), predmap: Sparse_Array(int), preds_list: [..] [..] int {
755	get_or_add_preds :: (preds_list: *[..] [..] int, predmap: *Sparse_Array(int), id: int) -> *[..] int {
756		preds: * [..] int;
757		if !contains(predmap.*, id) {
758			add_unchecked(predmap, id, preds_list.count);
759			array_reserve(preds_list, preds_list.count + 1);
760			memset(preds_list.data + preds_list.count, 0, size_of(type_of(preds_list[0])));
761			preds_list.count += 1;
762			preds = *(preds_list.*)[preds_list.count - 1];
763			preds.data = null;
764			preds.count = 0;
765		} else {
766			preds = *(preds_list.*)[get(predmap.*, id)];
767		}
768		return preds;
769	}
770
771	rootmap: Sparse_Array(int);
772	predmap: Sparse_Array(int);
773	init(*rootmap, inst.count);
774	init(*predmap, inst.count);
775	preds_list: [..] [..] int;
776
777	// Mark the kInstFail instruction.
778	add_unchecked(*rootmap, 0, rootmap.count);
779
780	// Mark the start_unanchored and start instructions.
781	if !contains(rootmap, start_unanchored) {
782		add_unchecked(*rootmap, start_unanchored, rootmap.count);
783	}
784	if !contains(rootmap, start) {
785		add_unchecked(*rootmap, start, rootmap.count);
786	}
787
788	reachable.count = 0;
789	stk.count = 0;
790	array_add(stk, start_unanchored);
791	while stk.count {
792		id := pop(stk);
793		if contains(reachable.*, id)	 continue;
794		add_unchecked(reachable, id);
795
796		i := inst[id];
797		if get_opcode(i) == {
798			case .kInstAltMatch; #through;
799			case .kInstAlt;
800				// Mark this instruction as a predecessor of each out.
801				preds := get_or_add_preds(*preds_list, *predmap, get_out(i));
802				array_add(preds, id);
803				preds = get_or_add_preds(*preds_list, *predmap, i.out1);
804				array_add(preds, id);
805				array_add(stk, i.out1);
806				array_add(stk, get_out(i));
807
808			case .kInstByteRange; #through;
809			case .kInstCapture; #through;
810			case .kInstEmptyWidth;
811				// Mark the out of this instruction as a "root".
812				if !contains(rootmap, get_out(i)) {
813					add_unchecked(*rootmap, get_out(i), rootmap.count);
814				}
815				array_add(stk, get_out(i));
816
817			case .kInstNop;
818				array_add(stk, get_out(i));
819
820			case .kInstMatch; #through;
821			case .kInstFail;
822
823			case;
824				assert(false, "Unhandled opcode: %", get_opcode(i));
825		}
826	}
827
828	return rootmap, predmap, preds_list;
829}
830
831mark_dominator :: (using prog: *Prog, root: int, rootmap: *Sparse_Array(int), predmap: Sparse_Array(int), preds_list: [..] [..] int, reachable: *Sparse_Set, stk: *[..] int) {
832	reachable.count = 0;
833	stk.count = 0;
834	array_add(stk, root);
835
836	while stk.count {
837		id := pop(stk);
838		if contains(reachable.*, id) continue;
839		add_unchecked(reachable, id);
840
841		if id != root && contains(rootmap.*, id) {
842			// We reached another "tree" via epsilon transition.
843			continue;
844		}
845
846		i := inst[id];
847		if #complete get_opcode(i) == {
848			case .kInstAltMatch; #through;
849			case .kInstAlt;
850				array_add(stk, i.out1);
851				array_add(stk, get_out(i));
852
853			case .kInstByteRange; #through;
854			case .kInstCapture; #through;
855			case .kInstEmptyWidth;
856
857			case .kInstNop;
858				array_add(stk, get_out(i));
859
860			case .kInstMatch; #through;
861			case .kInstFail;
862		}
863	}
864
865	for id: reachable {
866		if contains(predmap, id) {
867			for pred: preds_list[get(predmap, id)] {
868				if !contains(reachable.*, pred) {
869					// id has a predecessor that cannot be reached from root!
870					// Therefore, id must be a "root" too - mark it as such.
871					if !contains(rootmap.*, id) {
872						add_unchecked(rootmap, id, rootmap.count);
873					}
874				}
875			}
876		}
877	}
878}
879
880emit_list :: (using prog: *Prog, root: int, rootmap: *Sparse_Array(int), flat: *[..] Inst, reachable: *Sparse_Set, stk: *[..] int) {
881	reachable.count = 0;
882	stk.count = 0;
883	array_add(stk, root);
884	while stk.count {
885		id := pop(stk);
886		if contains(reachable.*, id) continue;
887		add_unchecked(reachable, id);
888
889		if id != root && contains(rootmap.*, id) {
890			// We reached another "tree" via epsilon transition. Emit a kInstNop
891			// instruction so that the Prog does not become quadratically larger.
892			i: Inst;
893			set_opcode(*i, .kInstNop);
894			set_out(*i, get(rootmap.*, id));
895			array_add(flat, i);
896			continue;
897		}
898
899		i := inst[id];
900		if get_opcode(i) == {
901			case .kInstAltMatch;
902				newi: Inst;
903				set_opcode(*newi, .kInstAltMatch);
904				set_out(*newi, flat.count + 1);
905				newi.out1 = cast(u32)flat.count + 2;
906				array_add(flat, newi);
907				array_add(stk, i.out1);
908				array_add(stk, get_out(i));
909
910			case .kInstAlt;
911				array_add(stk, i.out1);
912				array_add(stk, get_out(i));
913
914			case .kInstByteRange; #through;
915			case .kInstCapture; #through;
916			case .kInstEmptyWidth;
917				newi := i;
918				set_out(*newi, get(rootmap.*, get_out(i)));
919				array_add(flat, newi);
920
921			case .kInstNop;
922				array_add(stk, get_out(i));
923
924			case .kInstMatch; #through;
925			case .kInstFail;
926				array_add(flat, i);
927
928			case;
929				assert(false, "Unhandled opcode: %", get_opcode(i));
930		}
931	}
932}
933
934// For each ByteRange instruction in [begin, end), computes a hint to execution
935// engines: the delta to the next instruction (in flat) worth exploring iff the
936// current instruction matched.
937//
938// Implements a coloring algorithm related to ByteMapBuilder, but in this case,
939// colors are instructions and recoloring ranges precisely identifies conflicts
940// between instructions. Iterating backwards over [begin, end) is guaranteed to
941// identify the nearest conflict (if any) with only linear complexity.
942compute_hints :: (using prog: *Prog, flat: *[..] Inst, begin: int, end: int) {
943	recolor :: (splits: *Bit_Array, colors: *[256] int, lo_in: int, hi: int, first: *int, id: int) {
944		// Like ByteMapBuilder, we split at lo-1 and at hi.
945		lo := lo_in - 1;
946
947		if lo >= 0 && !(splits.*)[lo] {
948			set_bit(splits, lo);
949			next := find_next_set_bit(splits.*, lo + 1);
950			(colors.*)[lo] = (colors.*)[next];
951		}
952		if !(splits.*)[hi] {
953			set_bit(splits, hi);
954			next := find_next_set_bit(splits.*, hi + 1);
955			(colors.*)[hi] = (colors.*)[next];
956		}
957
958		c := lo + 1;
959		while c < 256 {
960			next := find_next_set_bit(splits.*, c);
961			// Ratchet backwards...
962			(first.*) = min(first.*, (colors.*)[next]);
963			// Recolor with id - because it's the new nearest conflict!
964			(colors.*)[next] = id;
965			if next == hi	break;
966			c = next+1;
967		}
968	}
969
970	splits: Bit_Array;
971	init_bit_array(*splits, 256);
972	colors: [256] int;
973
974	dirty := false;
975	for < id: begin..end {
976		if (id == end || get_opcode((flat.*)[id]) != .kInstByteRange) {
977			if (dirty) {
978				dirty = false;
979				clear_all_bits(*splits);
980			}
981			set_bit(*splits, 255);
982			colors[255] = id;
983			// At this point, the [0-255] range is colored with id.
984			// Thus, hints cannot point beyond id; and if id == end,
985			// hints that would have pointed to id will be 0 instead.
986			continue;
987		}
988		dirty = true;
989
990		// We recolor the [lo-hi] range with id. Note that first ratchets backwards
991		// from end to the nearest conflict (if any) during recoloring.
992		first := end;
993
994		ip := *(flat.*)[id];
995		lo: s32 = ip.lo;
996		hi: s32 = ip.hi;
997		recolor(*splits, *colors, lo, hi, *first, id);
998		if get_foldcase(ip.*) && lo <= #char "z" && hi >= #char "a" {
999			foldlo := lo;
1000			foldhi := hi;
1001			if (foldlo < #char "a") {
1002				foldlo = #char "a";
1003			}
1004			if (foldhi > #char "z") {
1005				foldhi = #char "z";
1006			}
1007			if (foldlo <= foldhi) {
1008				foldlo += #char "A" - #char "a";
1009				foldhi += #char "A" - #char "a";
1010				recolor(*splits, *colors, foldlo, foldhi, *first, id);
1011			}
1012		}
1013
1014		if first != end {
1015			hint := cast(u16) min(first - id, 32767);
1016			ip.hint_foldcase |= hint << 1;
1017		}
1018	}
1019}
1020
1021// Accelerates to the first likely occurrence of the prefix.
1022// Returns position of the first byte or -1 if not found.
1023prefix_accel :: (using prog: *Prog, text: string, pos: int) -> int {
1024	assert(prefix_size >= 1);
1025	if prefix_size == 1 {
1026		result, found := index_of_char_2(text, cast(u8)prefix_front, pos);
1027		if !found	return -1;
1028
1029		return result;
1030	} else {
1031		return prefix_accel_front_and_back(prog, text, pos);
1032	}
1033}
1034
1035prefix_accel_front_and_back :: (using prog: *Prog, text: string, pos: int) -> int {
1036	assert(prefix_size >= 2);
1037	if text.count - pos < prefix_size {
1038		return -1;
1039	}
1040
1041	// Don't bother searching the last prefix_size_-1 bytes for prefix_front_.
1042	// This also means that probing for prefix_back_ doesn't go out of bounds.
1043	size := text.count - prefix_size + 1;
1044
1045	// @ToDo Port AVX2 fast path once jai has intrinsics/assembler
1046
1047	for i: pos..size-1 {
1048		if text[i] == prefix_front && text[i + prefix_size - 1] == prefix_back {
1049			return i;
1050		}
1051	}
1052
1053	return -1;
1054}
1055
1056append :: (builder: *String_Builder, inst: Inst) {
1057	if get_opcode(inst) == {
1058		case;
1059			print_to_builder(builder, "opcode %", get_opcode(inst));
1060
1061		case .kInstAlt;
1062			print_to_builder(builder, "alt -> % | %", get_out(inst), inst.out1);
1063
1064		case .kInstAltMatch;
1065			print_to_builder(builder, "altmatch -> % | %", get_out(inst), inst.out1);
1066
1067		case .kInstByteRange;
1068			append(builder, "byte");
1069			if get_foldcase(inst) {
1070				append(builder, "/i");
1071			}
1072			print_to_builder(builder, " [%-%] % -> %", format_byte(inst.lo), format_byte(inst.hi), get_hint(inst), get_out(inst));
1073
1074		case .kInstCapture;
1075			print_to_builder(builder, "capture % -> %", inst.cap, get_out(inst));
1076
1077		case .kInstEmptyWidth;
1078			print_to_builder(builder, "emptywidth % -> %", inst.empty, get_out(inst));
1079
1080		case .kInstMatch;
1081			print_to_builder(builder, "match! %", inst.match_id);
1082
1083		case .kInstNop;
1084			print_to_builder(builder, "nop -> %", get_out(inst));
1085
1086		case .kInstFail;
1087			append(builder, "fail");
1088	}
1089}
1090
1091format_byte :: #bake_arguments formatInt(base = 16, minimum_digits = 2);
1092
1093to_string :: (using prog: Prog) -> string {
1094	if did_flatten {
1095		return to_string_flattened(prog);
1096	} else {
1097		return to_string_unflattened(prog);
1098	}
1099}
1100
1101to_string_flattened :: (using prog: Prog) -> string {
1102	builder: String_Builder;
1103	defer free_buffers(*builder);
1104	for inst {
1105		if get_last(it) {
1106			print_to_builder(*builder, "%. ", it_index);
1107		} else {
1108			print_to_builder(*builder, "%1+ ", it_index);
1109		}
1110		append(*builder, it);
1111		append(*builder, #char "\n");
1112	}
1113	return builder_to_string(*builder);
1114}
1115
1116to_string_unflattened :: (using prog: Prog) -> string {
1117	builder: String_Builder;
1118	defer free_buffers(*builder);
1119
1120	q: Sparse_Set;
1121	init(*q, prog.inst.count);
1122	add_if_nonzero(*q, start);
1123	for q {
1124		i := inst[it];
1125		print_to_builder(*builder, "%. ", it);
1126		append(*builder, i);
1127		append(*builder, #char "\n");
1128		add_if_nonzero(*q, get_out(i));
1129		if get_opcode(i) == .kInstAlt || get_opcode(i) == .kInstAltMatch {
1130			add_if_nonzero(*q, i.out1);
1131		}
1132	}
1133	return builder_to_string(*builder);
1134}
1135
1136#scope_file
1137
1138#import "Bit_Array";
1139#import "IntroSort";
1140#import "Bit_Operations";
1141
1142// @ToDo: Use assembler replacement for memchr instead, once jai supports it
1143index_of_char_2 :: (str: string, char: u8, start := 0) -> int, found:bool {
1144    for start..str.count-1 {
1145        if str[it] == char return it, true;
1146    }
1147    return -1, false;
1148}
1149
index : blog