Logo

index : blog

---

  • summary
  • about
  • tree
  • log
  • branches
<< path: root/public/blog.git/html/modules/uniform/nfa.jai blob: 42ad3841948fd5e2b7fc49b5f9e5298138c97935 [raw] [clear marker]

        
0#scope_module
1
2// Search using NFA: can find submatches but kind of slow.
3match_nfa :: (prog: *Prog, text: string, ctx: string, ctx_offset: int, anchor_in: Prog.Anchor, kind: Prog.MatchKind, num_matches: int) -> bool, matches: [] string {
4 assert(num_matches > 0, "We need to capture at least the full match");
5
6 nfa: Nfa;
7 // We don’t want the matches in the pool, but everything else
8 matches: [] string;
9
10 pool: Pool;
11 defer release(*pool);
12 set_allocators(*pool);
13 #if REGEXP_DEBUG pool.overwrite_memory = true;
14
15 {
16 push_allocator(pool_allocator_proc, *pool);
17 nfa.prog = prog;
18
19 init(*nfa.q0, prog.inst.count);
20 init(*nfa.q1, prog.inst.count);
21 array_reserve(*nfa.stack,
22 2 * prog.inst_count[cast(u32)InstOp.kInstCapture] +
23 prog.inst_count[cast(u32)InstOp.kInstEmptyWidth] +
24 prog.inst_count[cast(u32)InstOp.kInstNop] + 1); // + 1 for start inst
25
26 nfa.start = prog.start;
27 anchor := anchor_in;
28 if kind == .kFullMatch {
29 anchor = .ANCHORED;
30 }
31
32 if nfa.start == 0 return false, matches;
33 if prog.anchor_start && ctx.count != text.count return false, matches;
34 if prog.anchor_end && ctx.count != ctx_offset + text.count return false, matches;
35
36 anchored := (anchor == .ANCHORED) || prog.anchor_start;
37 nfa.longest = (kind != .kFirstMatch);
38 nfa.endmatch = false;
39 if prog.anchor_end {
40 nfa.longest = true;
41 nfa.endmatch = true;
42 }
43
44 nfa.ncaptures = 2 * num_matches;
45 nfa.match = NewArray(nfa.ncaptures, int, false);
46 memset(nfa.match.data, 0, nfa.match.count * size_of(type_of(nfa.match[0])));
47
48 runq := *nfa.q0;
49 nextq := *nfa.q1;
50
51 // Loop over the text, stepping the machine.
52 for pos: 0..text.count+1 {
53 c := ifx pos < text.count then cast(int) text[pos] else -1;
54
55 // This is a no-op the first time around the loop because runq is empty.
56 id := step(*nfa, runq, nextq, c, text, ctx, ctx_offset, pos);
57 assert(runq.count == 0);
58 nextq, runq = runq, nextq;
59
60 if id != 0 {
61 // We're done: full match ahead.
62 pos = text.count;
63 while true {
64 i := prog.inst[id];
65 if get_opcode(i) == {
66 case .kInstCapture;
67 if i.cap < nfa.ncaptures {
68 nfa.match[i.cap] = pos;
69 }
70 id = get_out(i);
71 continue;
72
73 case .kInstNop;
74 id = get_out(i);
75 continue;
76
77 case .kInstMatch;
78 nfa.match[1] = pos;
79 nfa.matched = true;
80
81 case;
82 assert(false, "Unexpected opcode in short circuit: %", get_opcode(i));
83 }
84 break;
85 }
86 break;
87 }
88
89 if pos > text.count break;
90
91 // Start a new thread if there have not been any matches.
92 // (No point in starting a new thread if there have been
93 // matches, since it would be to the right of the match
94 // we already found.)
95 if !nfa.matched && (!anchored || pos == 0) {
96 // Try to use prefix accel (e.g. memchr) to skip ahead.
97 // The search must be unanchored and there must be zero
98 // possible matches already.
99 if !anchored && !runq.count && pos < text.count && prog.prefix_size {
100 pos = prefix_accel(prog, text, pos);
101 if pos < 0 {
102 pos = text.count;
103 }
104 }
105
106 t := new_thread(*nfa);
107 copy_capture(*t.capture, nfa.match);
108 t.capture[0] = pos;
109 c = ifx pos < text.count then cast(s16)text[pos] else -1;
110 add_to_queue(*nfa, runq, prog.start, c, ctx, ctx_offset, t, pos);
111 decref(*nfa, t);
112 }
113
114 // If all the threads have died, stop early.
115 if runq.count == 0 break;
116
117 // @ToDo There was a special case here.
118 // Not sure if it is needed after our transformations.
119
120 }
121 }
122
123 if nfa.matched {
124 matches = NewArray(num_matches, string, false);
125 for i: 0..num_matches - 1 {
126 matches[i] = slice(text, nfa.match[2 * i], nfa.match[2 * i + 1] - nfa.match[2 * i]);
127 }
128
129 if kind == .kFullMatch && matches[0].count != text.count {
130 return false, matches;
131 }
132 }
133
134 return nfa.matched, matches;
135}
136
137#scope_file
138
139Nfa :: struct {
140 prog: *Prog;
141 start: int; // start instruction in program
142 ncaptures: int; // number of submatches to track
143 longest: bool; // whether searching for longest match
144 endmatch: bool; // whether match must end at text.end()
145 q0: Thread_Queue;
146 q1: Thread_Queue;
147 stack: [..] AddState;
148 threads: Bucket_Array(Thread, 16);
149 free_list: *Thread;
150 match: [] int; // best match so far
151 matched: bool; // any match so far?
152}
153
154Thread :: struct {
155 // @ToDo @Cleanup I don’t want to ref-count here, but the original algorithm
156 // is too complicated to safely remove the ref-counting while porting it to jai.
157 // Needs to happend in a separat step once we have tests in place.
158 union {
159 ref: int;
160 next: *Thread;
161 }
162 capture: [] int;
163}
164
165new_thread :: (using nfa: *Nfa) -> *Thread {
166 t: *Thread;
167 if free_list {
168 t = free_list;
169 free_list = t.next;
170 } else {
171 t = find_and_occupy_empty_slot(*threads);
172 t.capture = NewArray(ncaptures, int, false);
173 }
174 t.ref = 1;
175 return t;
176}
177
178incref :: (t: *Thread) -> *Thread {
179 t.ref += 1;
180 return t;
181}
182
183decref :: (using nfa: *Nfa, t: *Thread) {
184 t.ref -= 1;
185 if !t.ref {
186 t.next = free_list;
187 free_list = t;
188 }
189}
190
191Thread_Queue :: Sparse_Array(*Thread);
192
193AddState :: struct {
194 id: int;
195 t: *Thread;
196}
197
198make_state :: (id: int, t: *Thread) -> AddState {
199 s: AddState;
200 s.id = id;
201 s.t = t;
202 return s;
203}
204
205copy_capture :: (dst: *[] int, src: [] int) {
206 memcpy(dst.data, src.data, dst.count * size_of(type_of(src[0])));
207}
208
209// Run runq on byte c, appending new states to nextq.
210// Updates matched_ and match_ as new, better matches are found.
211// context is used (with p) for evaluating empty-width specials.
212// p is the position of byte c in the input string for AddToThreadq;
213// p-1 will be used when processing Match instructions.
214// Frees all the threads on runq.
215// If there is a shortcut to the end, returns that shortcut.
216step :: (using nfa: *Nfa, runq: *Thread_Queue, nextq: *Thread_Queue, c: int, text: string, ctx: string, ctx_offset: int, pos: int) -> int {
217 for runq {
218 t := it.value;
219 if t == null continue;
220
221 if longest {
222 // Can skip any threads started after our current best match.
223 if (matched && match[0] < t.capture[0]) {
224 decref(nfa, t);
225 continue;
226 }
227 }
228
229 id := it.index;
230 i := prog.inst[id];
231
232 if get_opcode(i) == {
233 case;
234 assert(false, "Unexpected opcode: %", get_opcode(i));
235
236 case .kInstByteRange;
237 add_to_queue(nfa, nextq, get_out(i), c, ctx, ctx_offset, t, pos);
238
239 case .kInstAltMatch;
240 if it_index == 0 {
241 // The match is ours if we want it.
242 if greedy(i, prog) || longest {
243 copy_capture(*match, t.capture);
244 matched = true;
245
246 decref(nfa, t);
247 for j: it_index+1..cast(int)runq.count-1 {
248 if runq.dense[j].value != null {
249 decref(nfa, runq.dense[j].value);
250 }
251 }
252 runq.count = 0;
253 if greedy(i, prog) {
254 return i.out1;
255 } else {
256 return get_out(i);
257 }
258 }
259 }
260
261 case .kInstMatch;
262 // @ToDo There was a special case here.
263 // Not sure if it is needed after our transformations.
264
265 if !endmatch || pos - 1 == text.count {
266 if longest {
267 // Leftmost-longest mode: save this match only if
268 // it is either farther to the left or at the same
269 // point but longer than an existing match.
270 if !matched || t.capture[0] < match[0] || t.capture[0] == match[0] && pos - 1 > match[1] {
271 copy_capture(*match, t.capture);
272 match[1] = pos - 1;
273 matched = true;
274 }
275 } else {
276 // Leftmost-biased mode: this match is by definition
277 // better than what we've already found (see next line).
278 copy_capture(*match, t.capture);
279 match[1] = pos - 1;
280 matched = true;
281
282 // Cut off the threads that can only find matches
283 // worse than the one we just found: don't run the
284 // rest of the current Threadq.
285 decref(nfa, t);
286 for j: it_index+1..cast(int)runq.count-1 {
287 if runq.dense[j].value != null {
288 decref(nfa, runq.dense[j].value);
289 }
290 }
291 runq.count = 0;
292 return 0;
293 }
294 }
295 }
296 decref(nfa, t);
297 }
298
299 runq.count = 0;
300 return 0;
301}
302
303// Follows all empty arrows from id0 and enqueues all the states reached.
304// Enqueues only the ByteRange instructions that match byte c.
305// context is used (with p) for evaluating empty-width specials.
306// pos is the current input position, and t0 is the current thread.
307add_to_queue :: (using nfa: *Nfa, q: *Thread_Queue, id0: int, c: int, ctx: string, ctx_offset: int, t0_in: *Thread, pos: int) {
308 if id0 == 0 return;
309
310 // Use stack_ to hold our stack of instructions yet to process.
311 // It was preallocated as follows:
312 // two entries per Capture;
313 // one entry per EmptyWidth; and
314 // one entry per Nop.
315 // This reflects the maximum number of stack pushes that each can
316 // perform. (Each instruction can be processed at most once.)
317 stack_size := stack.allocated;
318 stack.count = 0;
319 array_add(*stack, make_state(id0, null));
320 t0 := t0_in;
321 while stack.count > 0 {
322 assert(stack.count < stack_size);
323 a := pop(*stack);
324
325 if (a.t != null) {
326 // t0 was a thread that we allocated and copied in order to
327 // record the capture, so we must now decref it.
328 decref(nfa, t0);
329 t0 = a.t;
330 }
331
332 id := a.id;
333 if id == 0 continue;
334 if contains(q.*, id) continue;
335
336 // Create entry in q no matter what. We might fill it in below,
337 // or we might not. Even if not, it is necessary to have it,
338 // so that we don't revisit id0 during the recursion.
339 add_unchecked(q, id, null);
340
341 i := prog.inst[id];
342 if get_opcode(i) == {
343 case .kInstFail;
344
345 case .kInstAltMatch;
346 // Save state; will pick up at next byte.
347 t := incref(t0);
348 set(q, id, t);
349
350 assert(!get_last(i));
351 array_add(*stack, make_state(id+1, null));
352
353 case .kInstNop;
354 if !get_last(i) {
355 array_add(*stack, make_state(id+1, null));
356 }
357 array_add(*stack, make_state(get_out(i), null));
358
359 case .kInstCapture;
360 if !get_last(i) {
361 array_add(*stack, make_state(id+1, null));
362 }
363 j := i.cap;
364
365 if j < ncaptures {
366 // Push a dummy whose only job is to restore t0
367 // once we finish exploring this possibility.
368 array_add(*stack, make_state(0, t0));
369
370 // Record capture.
371 t := new_thread(nfa);
372 copy_capture(*t.capture, t0.capture);
373 t.capture[j] = pos;
374 t0 = t;
375 }
376 array_add(*stack, make_state(get_out(i), null));
377
378 case .kInstByteRange;
379 if !matches(i, c) {
380 if !get_last(i) {
381 array_add(*stack, make_state(id + 1, null));
382 }
383 } else {
384 // Save state; will pick up at next byte.
385 t: = incref(t0);
386 set(q, id, t);
387 hint := get_hint(i);
388 if hint != 0 {
389 array_add(*stack, make_state(id + hint, null));
390 }
391 }
392
393 case .kInstMatch;
394 // Save state; will pick up at next byte.
395 t: = incref(t0);
396 set(q, id, t);
397
398 if !get_last(i) {
399 array_add(*stack, make_state(id + 1, null));
400 }
401
402 case .kInstEmptyWidth;
403 if !get_last(i) {
404 array_add(*stack, make_state(id + 1, null));
405 }
406
407 // Continue on if we have all the right flag bits.
408 if !(i.empty & ~empty_flags(ctx, ctx_offset + pos)) {
409 array_add(*stack, make_state(get_out(i), null));
410 }
411
412 case;
413 assert(false, "Unexpected op: %", get_opcode(i));
414 }
415 }
416}
417
Copyright 2026  E766CB298A6D1E64 | Git-Thing heavily inspired by cgit