<<
path:
root/public/blog.git/html/modules/uniform/nfa.jai
blob: 42ad3841948fd5e2b7fc49b5f9e5298138c97935
[raw]
[clear marker]
2// Search using NFA: can find submatches but kind of slow.
3match_nfa :: (prog: *Prog, text: string, ctx: string, ctx_offset: int, anchor_in: Prog.Anchor, kind: Prog.MatchKind, num_matches: int) -> bool, matches: [] string {
4 assert(num_matches > 0, "We need to capture at least the full match");
7 // We don’t want the matches in the pool, but everything else
12 set_allocators(*pool);
13 #if REGEXP_DEBUG pool.overwrite_memory = true;
16 push_allocator(pool_allocator_proc, *pool);
19 init(*nfa.q0, prog.inst.count);
20 init(*nfa.q1, prog.inst.count);
21 array_reserve(*nfa.stack,
22 2 * prog.inst_count[cast(u32)InstOp.kInstCapture] +
23 prog.inst_count[cast(u32)InstOp.kInstEmptyWidth] +
24 prog.inst_count[cast(u32)InstOp.kInstNop] + 1); // + 1 for start inst
26 nfa.start = prog.start;
28 if kind == .kFullMatch {
32 if nfa.start == 0 return false, matches;
33 if prog.anchor_start && ctx.count != text.count return false, matches;
34 if prog.anchor_end && ctx.count != ctx_offset + text.count return false, matches;
36 anchored := (anchor == .ANCHORED) || prog.anchor_start;
37 nfa.longest = (kind != .kFirstMatch);
44 nfa.ncaptures = 2 * num_matches;
45 nfa.match = NewArray(nfa.ncaptures, int, false);
46 memset(nfa.match.data, 0, nfa.match.count * size_of(type_of(nfa.match[0])));
51 // Loop over the text, stepping the machine.
52 for pos: 0..text.count+1 {
53 c := ifx pos < text.count then cast(int) text[pos] else -1;
55 // This is a no-op the first time around the loop because runq is empty.
56 id := step(*nfa, runq, nextq, c, text, ctx, ctx_offset, pos);
57 assert(runq.count == 0);
58 nextq, runq = runq, nextq;
61 // We're done: full match ahead.
67 if i.cap < nfa.ncaptures {
68 nfa.match[i.cap] = pos;
82 assert(false, "Unexpected opcode in short circuit: %", get_opcode(i));
89 if pos > text.count break;
91 // Start a new thread if there have not been any matches.
92 // (No point in starting a new thread if there have been
93 // matches, since it would be to the right of the match
95 if !nfa.matched && (!anchored || pos == 0) {
96 // Try to use prefix accel (e.g. memchr) to skip ahead.
97 // The search must be unanchored and there must be zero
98 // possible matches already.
99 if !anchored && !runq.count && pos < text.count && prog.prefix_size {
100 pos = prefix_accel(prog, text, pos);
106 t := new_thread(*nfa);
107 copy_capture(*t.capture, nfa.match);
109 c = ifx pos < text.count then cast(s16)text[pos] else -1;
110 add_to_queue(*nfa, runq, prog.start, c, ctx, ctx_offset, t, pos);
114 // If all the threads have died, stop early.
115 if runq.count == 0 break;
117 // @ToDo There was a special case here.
118 // Not sure if it is needed after our transformations.
124 matches = NewArray(num_matches, string, false);
125 for i: 0..num_matches - 1 {
126 matches[i] = slice(text, nfa.match[2 * i], nfa.match[2 * i + 1] - nfa.match[2 * i]);
129 if kind == .kFullMatch && matches[0].count != text.count {
130 return false, matches;
134 return nfa.matched, matches;
141 start: int; // start instruction in program
142 ncaptures: int; // number of submatches to track
143 longest: bool; // whether searching for longest match
144 endmatch: bool; // whether match must end at text.end()
147 stack: [..] AddState;
148 threads: Bucket_Array(Thread, 16);
150 match: [] int; // best match so far
151 matched: bool; // any match so far?
155 // @ToDo @Cleanup I don’t want to ref-count here, but the original algorithm
156 // is too complicated to safely remove the ref-counting while porting it to jai.
157 // Needs to happend in a separat step once we have tests in place.
165new_thread :: (using nfa: *Nfa) -> *Thread {
171 t = find_and_occupy_empty_slot(*threads);
172 t.capture = NewArray(ncaptures, int, false);
178incref :: (t: *Thread) -> *Thread {
183decref :: (using nfa: *Nfa, t: *Thread) {
191Thread_Queue :: Sparse_Array(*Thread);
198make_state :: (id: int, t: *Thread) -> AddState {
205copy_capture :: (dst: *[] int, src: [] int) {
206 memcpy(dst.data, src.data, dst.count * size_of(type_of(src[0])));
209// Run runq on byte c, appending new states to nextq.
210// Updates matched_ and match_ as new, better matches are found.
211// context is used (with p) for evaluating empty-width specials.
212// p is the position of byte c in the input string for AddToThreadq;
213// p-1 will be used when processing Match instructions.
214// Frees all the threads on runq.
215// If there is a shortcut to the end, returns that shortcut.
216step :: (using nfa: *Nfa, runq: *Thread_Queue, nextq: *Thread_Queue, c: int, text: string, ctx: string, ctx_offset: int, pos: int) -> int {
219 if t == null continue;
222 // Can skip any threads started after our current best match.
223 if (matched && match[0] < t.capture[0]) {
232 if get_opcode(i) == {
234 assert(false, "Unexpected opcode: %", get_opcode(i));
236 case .kInstByteRange;
237 add_to_queue(nfa, nextq, get_out(i), c, ctx, ctx_offset, t, pos);
241 // The match is ours if we want it.
242 if greedy(i, prog) || longest {
243 copy_capture(*match, t.capture);
247 for j: it_index+1..cast(int)runq.count-1 {
248 if runq.dense[j].value != null {
249 decref(nfa, runq.dense[j].value);
262 // @ToDo There was a special case here.
263 // Not sure if it is needed after our transformations.
265 if !endmatch || pos - 1 == text.count {
267 // Leftmost-longest mode: save this match only if
268 // it is either farther to the left or at the same
269 // point but longer than an existing match.
270 if !matched || t.capture[0] < match[0] || t.capture[0] == match[0] && pos - 1 > match[1] {
271 copy_capture(*match, t.capture);
276 // Leftmost-biased mode: this match is by definition
277 // better than what we've already found (see next line).
278 copy_capture(*match, t.capture);
282 // Cut off the threads that can only find matches
283 // worse than the one we just found: don't run the
284 // rest of the current Threadq.
286 for j: it_index+1..cast(int)runq.count-1 {
287 if runq.dense[j].value != null {
288 decref(nfa, runq.dense[j].value);
303// Follows all empty arrows from id0 and enqueues all the states reached.
304// Enqueues only the ByteRange instructions that match byte c.
305// context is used (with p) for evaluating empty-width specials.
306// pos is the current input position, and t0 is the current thread.
307add_to_queue :: (using nfa: *Nfa, q: *Thread_Queue, id0: int, c: int, ctx: string, ctx_offset: int, t0_in: *Thread, pos: int) {
310 // Use stack_ to hold our stack of instructions yet to process.
311 // It was preallocated as follows:
312 // two entries per Capture;
313 // one entry per EmptyWidth; and
314 // one entry per Nop.
315 // This reflects the maximum number of stack pushes that each can
316 // perform. (Each instruction can be processed at most once.)
317 stack_size := stack.allocated;
319 array_add(*stack, make_state(id0, null));
321 while stack.count > 0 {
322 assert(stack.count < stack_size);
326 // t0 was a thread that we allocated and copied in order to
327 // record the capture, so we must now decref it.
334 if contains(q.*, id) continue;
336 // Create entry in q no matter what. We might fill it in below,
337 // or we might not. Even if not, it is necessary to have it,
338 // so that we don't revisit id0 during the recursion.
339 add_unchecked(q, id, null);
342 if get_opcode(i) == {
346 // Save state; will pick up at next byte.
350 assert(!get_last(i));
351 array_add(*stack, make_state(id+1, null));
355 array_add(*stack, make_state(id+1, null));
357 array_add(*stack, make_state(get_out(i), null));
361 array_add(*stack, make_state(id+1, null));
366 // Push a dummy whose only job is to restore t0
367 // once we finish exploring this possibility.
368 array_add(*stack, make_state(0, t0));
371 t := new_thread(nfa);
372 copy_capture(*t.capture, t0.capture);
376 array_add(*stack, make_state(get_out(i), null));
378 case .kInstByteRange;
381 array_add(*stack, make_state(id + 1, null));
384 // Save state; will pick up at next byte.
389 array_add(*stack, make_state(id + hint, null));
394 // Save state; will pick up at next byte.
399 array_add(*stack, make_state(id + 1, null));
402 case .kInstEmptyWidth;
404 array_add(*stack, make_state(id + 1, null));
407 // Continue on if we have all the right flag bits.
408 if !(i.empty & ~empty_flags(ctx, ctx_offset + pos)) {
409 array_add(*stack, make_state(get_out(i), null));
413 assert(false, "Unexpected op: %", get_opcode(i));