git ptrace.dev


        
            
            
        
        0#scope_module
1
2// Search using NFA: can find submatches but kind of slow.
3match_nfa :: (prog: *Prog, text: string, ctx: string, ctx_offset: int, anchor_in: Prog.Anchor, kind: Prog.MatchKind, num_matches: int) -> bool, matches: [] string {
4	assert(num_matches > 0, "We need to capture at least the full match");
5
6	nfa: Nfa;
7	// We don’t want the matches in the pool, but everything else
8	matches: [] string;
9
10	pool: Pool;
11	defer release(*pool);
12	set_allocators(*pool);
13	#if REGEXP_DEBUG	pool.overwrite_memory = true;
14
15	{
16		push_allocator(pool_allocator_proc, *pool);
17		nfa.prog = prog;
18
19		init(*nfa.q0, prog.inst.count);
20		init(*nfa.q1, prog.inst.count);
21		array_reserve(*nfa.stack,
22			2 * prog.inst_count[cast(u32)InstOp.kInstCapture] +
23			prog.inst_count[cast(u32)InstOp.kInstEmptyWidth] +
24			prog.inst_count[cast(u32)InstOp.kInstNop] + 1); // + 1 for start inst
25
26		nfa.start = prog.start;
27		anchor := anchor_in;
28		if kind == .kFullMatch {
29			anchor = .ANCHORED;
30		}
31
32		if nfa.start == 0	return false, matches;
33		if prog.anchor_start && ctx.count != text.count		return false, matches;
34		if prog.anchor_end && ctx.count != ctx_offset + text.count	return false, matches;
35
36		anchored := (anchor == .ANCHORED) || prog.anchor_start;
37		nfa.longest = (kind != .kFirstMatch);
38		nfa.endmatch = false;
39		if prog.anchor_end {
40			nfa.longest = true;
41			nfa.endmatch = true;
42		}
43
44		nfa.ncaptures = 2 * num_matches;
45		nfa.match = NewArray(nfa.ncaptures, int, false);
46		memset(nfa.match.data, 0, nfa.match.count * size_of(type_of(nfa.match[0])));
47
48		runq := *nfa.q0;
49		nextq := *nfa.q1;
50
51		// Loop over the text, stepping the machine.
52		for pos: 0..text.count+1 {
53			c := ifx pos < text.count then cast(int) text[pos] else -1;
54
55			// This is a no-op the first time around the loop because runq is empty.
56			id := step(*nfa, runq, nextq, c, text, ctx, ctx_offset, pos);
57			assert(runq.count == 0);
58			nextq, runq = runq, nextq;
59
60			if id != 0 {
61				// We're done: full match ahead.
62				pos = text.count;
63				while true {
64					i := prog.inst[id];
65					if get_opcode(i) == {
66						case .kInstCapture;
67							if i.cap < nfa.ncaptures {
68								nfa.match[i.cap] = pos;
69							}
70							id = get_out(i);
71							continue;
72
73						case .kInstNop;
74							id = get_out(i);
75							continue;
76
77						case .kInstMatch;
78							nfa.match[1] = pos;
79							nfa.matched = true;
80
81						case;
82							assert(false, "Unexpected opcode in short circuit: %", get_opcode(i));
83					}
84					break;
85				}
86				break;
87			}
88
89			if pos > text.count		break;
90
91			// Start a new thread if there have not been any matches.
92			// (No point in starting a new thread if there have been
93			// matches, since it would be to the right of the match
94			// we already found.)
95			if !nfa.matched && (!anchored || pos == 0) {
96				// Try to use prefix accel (e.g. memchr) to skip ahead.
97				// The search must be unanchored and there must be zero
98				// possible matches already.
99				if !anchored && !runq.count && pos < text.count && prog.prefix_size {
100					pos = prefix_accel(prog, text, pos);
101					if pos < 0 {
102						pos = text.count;
103					}
104				}
105
106				t := new_thread(*nfa);
107				copy_capture(*t.capture, nfa.match);
108				t.capture[0] = pos;
109				c = ifx pos < text.count then cast(s16)text[pos] else -1;
110				add_to_queue(*nfa, runq, prog.start, c, ctx, ctx_offset, t, pos);
111				decref(*nfa, t);
112			}
113
114			// If all the threads have died, stop early.
115			if runq.count == 0		break;
116
117			// @ToDo There was a special case here.
118			// Not sure if it is needed after our transformations.
119
120		}
121	}
122
123	if nfa.matched {
124		matches = NewArray(num_matches, string, false);
125		for i: 0..num_matches - 1 {
126			matches[i] = slice(text, nfa.match[2 * i], nfa.match[2 * i + 1] - nfa.match[2 * i]);
127		}
128
129		if kind == .kFullMatch && matches[0].count != text.count {
130			return false, matches;
131		}
132	}
133
134	return nfa.matched, matches;
135}
136
137#scope_file
138
139Nfa :: struct {
140	prog: *Prog;
141	start: int; // start instruction in program
142	ncaptures: int; // number of submatches to track
143	longest: bool;              // whether searching for longest match
144	endmatch: bool;             // whether match must end at text.end()
145	q0: Thread_Queue;
146	q1: Thread_Queue;
147	stack: [..] AddState;
148	threads: Bucket_Array(Thread, 16);
149	free_list: *Thread;
150	match: [] int;  // best match so far
151	matched: bool;         // any match so far?
152}
153
154Thread :: struct {
155	// @ToDo @Cleanup I don’t want to ref-count here, but the original algorithm
156	// is too complicated to safely remove the ref-counting while porting it to jai.
157	// Needs to happend in a separat step once we have tests in place.
158	union {
159		ref: int;
160		next: *Thread;
161	}
162	capture: [] int;
163}
164
165new_thread :: (using nfa: *Nfa) -> *Thread {
166	t: *Thread;
167	if free_list {
168		t = free_list;
169		free_list = t.next;
170	} else {
171		t = find_and_occupy_empty_slot(*threads);
172		t.capture = NewArray(ncaptures, int, false);
173	}
174	t.ref = 1;
175	return t;
176}
177
178incref :: (t: *Thread) -> *Thread {
179	t.ref += 1;
180	return t;
181}
182
183decref :: (using nfa: *Nfa, t: *Thread) {
184	t.ref -= 1;
185	if !t.ref {
186		t.next = free_list;
187		free_list = t;
188	}
189}
190
191Thread_Queue :: Sparse_Array(*Thread);
192
193AddState :: struct {
194	id: int;
195	t: *Thread;
196}
197
198make_state :: (id: int, t: *Thread) -> AddState {
199	s: AddState;
200	s.id = id;
201	s.t = t;
202	return s;
203}
204
205copy_capture :: (dst: *[] int, src: [] int) {
206	memcpy(dst.data, src.data, dst.count * size_of(type_of(src[0])));
207}
208
209// Run runq on byte c, appending new states to nextq.
210// Updates matched_ and match_ as new, better matches are found.
211// context is used (with p) for evaluating empty-width specials.
212// p is the position of byte c in the input string for AddToThreadq;
213// p-1 will be used when processing Match instructions.
214// Frees all the threads on runq.
215// If there is a shortcut to the end, returns that shortcut.
216step :: (using nfa: *Nfa, runq: *Thread_Queue, nextq: *Thread_Queue, c: int, text: string, ctx: string, ctx_offset: int, pos: int) -> int {
217	for runq {
218		t := it.value;
219		if t == null		continue;
220
221		if longest {
222			// Can skip any threads started after our current best match.
223			if (matched && match[0] < t.capture[0]) {
224				decref(nfa, t);
225				continue;
226			}
227		}
228
229		id := it.index;
230		i := prog.inst[id];
231
232		if get_opcode(i) == {
233			case;
234				assert(false, "Unexpected opcode: %", get_opcode(i));
235
236			case .kInstByteRange;
237				add_to_queue(nfa, nextq, get_out(i), c, ctx, ctx_offset, t, pos);
238
239			case .kInstAltMatch;
240				if it_index == 0 {
241					// The match is ours if we want it.
242					if greedy(i, prog) || longest {
243						copy_capture(*match, t.capture);
244						matched = true;
245
246						decref(nfa, t);
247						for j: it_index+1..cast(int)runq.count-1 {
248							if runq.dense[j].value != null {
249								decref(nfa, runq.dense[j].value);
250							}
251						}
252						runq.count = 0;
253						if greedy(i, prog) {
254							return i.out1;
255						} else {
256							return get_out(i);
257						}
258					}
259				}
260
261			case .kInstMatch;
262				// @ToDo There was a special case here.
263				// Not sure if it is needed after our transformations.
264
265				if !endmatch || pos - 1 == text.count {
266					if longest {
267						// Leftmost-longest mode: save this match only if
268						// it is either farther to the left or at the same
269						// point but longer than an existing match.
270						if !matched || t.capture[0] < match[0] || t.capture[0] == match[0] && pos - 1 > match[1] {
271							copy_capture(*match, t.capture);
272							match[1] = pos - 1;
273							matched = true;
274						}
275					} else {
276						// Leftmost-biased mode: this match is by definition
277						// better than what we've already found (see next line).
278						copy_capture(*match, t.capture);
279						match[1] = pos - 1;
280						matched = true;
281
282						// Cut off the threads that can only find matches
283						// worse than the one we just found: don't run the
284						// rest of the current Threadq.
285						decref(nfa, t);
286						for j: it_index+1..cast(int)runq.count-1 {
287							if runq.dense[j].value != null {
288								decref(nfa, runq.dense[j].value);
289							}
290						}
291						runq.count = 0;
292						return 0;
293					}
294				}
295		}
296		decref(nfa, t);
297	}
298
299	runq.count = 0;
300	return 0;
301}
302
303// Follows all empty arrows from id0 and enqueues all the states reached.
304// Enqueues only the ByteRange instructions that match byte c.
305// context is used (with p) for evaluating empty-width specials.
306// pos is the current input position, and t0 is the current thread.
307add_to_queue :: (using nfa: *Nfa, q: *Thread_Queue, id0: int, c: int, ctx: string, ctx_offset: int, t0_in: *Thread, pos: int) {
308	if id0 == 0		return;
309
310	// Use stack_ to hold our stack of instructions yet to process.
311	// It was preallocated as follows:
312	//   two entries per Capture;
313	//   one entry per EmptyWidth; and
314	//   one entry per Nop.
315	// This reflects the maximum number of stack pushes that each can
316	// perform. (Each instruction can be processed at most once.)
317	stack_size := stack.allocated;
318	stack.count = 0;
319	array_add(*stack, make_state(id0, null));
320	t0 := t0_in;
321	while stack.count > 0 {
322		assert(stack.count < stack_size);
323		a := pop(*stack);
324
325		if (a.t != null) {
326			// t0 was a thread that we allocated and copied in order to
327			// record the capture, so we must now decref it.
328			decref(nfa, t0);
329			t0 = a.t;
330		}
331
332		id := a.id;
333		if id == 0	continue;
334		if contains(q.*, id)		continue;
335
336		// Create entry in q no matter what.  We might fill it in below,
337		// or we might not.  Even if not, it is necessary to have it,
338		// so that we don't revisit id0 during the recursion.
339		add_unchecked(q, id, null);
340
341		i := prog.inst[id];
342		if get_opcode(i) == {
343			case .kInstFail;
344
345			case .kInstAltMatch;
346				// Save state; will pick up at next byte.
347				t := incref(t0);
348				set(q, id, t);
349
350				assert(!get_last(i));
351				array_add(*stack, make_state(id+1, null));
352
353			case .kInstNop;
354				if !get_last(i) {
355					array_add(*stack, make_state(id+1, null));
356				}
357				array_add(*stack, make_state(get_out(i), null));
358
359			case .kInstCapture;
360				if !get_last(i) {
361					array_add(*stack, make_state(id+1, null));
362				}
363				j := i.cap;
364
365				if j < ncaptures {
366					// Push a dummy whose only job is to restore t0
367					// once we finish exploring this possibility.
368					array_add(*stack, make_state(0, t0));
369
370					// Record capture.
371					t := new_thread(nfa);
372					copy_capture(*t.capture, t0.capture);
373					t.capture[j] = pos;
374					t0 = t;
375				}
376				array_add(*stack, make_state(get_out(i), null));
377
378			case .kInstByteRange;
379				if !matches(i, c) {
380					if !get_last(i) {
381						array_add(*stack, make_state(id + 1, null));
382					}
383				} else {
384					// Save state; will pick up at next byte.
385					t: = incref(t0);
386					set(q, id, t);
387					hint := get_hint(i);
388					if hint != 0 {
389						array_add(*stack, make_state(id + hint, null));
390					}
391				}
392
393			case .kInstMatch;
394				// Save state; will pick up at next byte.
395				t: = incref(t0);
396				set(q, id, t);
397
398				if !get_last(i) {
399					array_add(*stack, make_state(id + 1, null));
400				}
401
402			case .kInstEmptyWidth;
403				if !get_last(i) {
404					array_add(*stack, make_state(id + 1, null));
405				}
406
407				// Continue on if we have all the right flag bits.
408				if !(i.empty & ~empty_flags(ctx, ctx_offset + pos)) {
409					array_add(*stack, make_state(get_out(i), null));
410				}
411
412			case;
413				assert(false, "Unexpected op: %", get_opcode(i));
414		}
415	}
416}
417
index : blog