Logo

index : blog

---

  • summary
  • about
  • tree
  • log
  • branches
<< path: root/public/blog.git/html/modules/uniform/compile.jai blob: 0c1ac86a41e5162e8ce50330bfcc231a97313f03 [raw] [clear marker]

        
0Regexp :: struct {
1 prefix: string;
2 prefix_foldcase: bool;
3 suffix_regexp: *Regexp_Node;
4 prog: *Prog;
5 num_capture_groups: int;
6 pool: Regexp_Pool;
7}
8
9uninit :: (re: *Regexp) {
10 uninit(*re.pool);
11}
12
13compile :: (pattern: string, parse_flags := ParseFlags.LikePerl) -> Regexp, success: bool {
14 regexp: Regexp;
15 entire_regexp, status, pool := parse(pattern, parse_flags);
16
17 if status.code != .Success {
18 log_error("Could not parse regexp \"%\": % (%)", pattern, status.code, status.error_arg);
19 return regexp, false;
20 }
21 regexp.pool = pool;
22
23
24 #if REGEXP_DEBUG print("Parsed: \"%\"\n", to_string(entire_regexp));
25
26 regexp.prefix, regexp.prefix_foldcase, regexp.suffix_regexp = required_prefix(entire_regexp, *regexp.pool);
27 #if REGEXP_DEBUG print("Split: \"%\" (foldcase: %) + \"%\"\n", regexp.prefix, regexp.prefix_foldcase, to_string(regexp.suffix_regexp));
28
29 regexp.prog = compile(regexp.suffix_regexp, *regexp.pool);
30 if !regexp.prog {
31 log_error("Could not compile regexp \"%\"", pattern);
32 uninit(*regexp);
33 return .{}, false;
34 }
35 #if REGEXP_DEBUG print("Compiled suffix:\n%\n", to_string(regexp.prog));
36
37 regexp.num_capture_groups = compute_num_captures(regexp.suffix_regexp);
38// // @ToDo: For one-pass NFA
39// // is_one_pass = compute_is_one_pass(prog);
40
41 return regexp, true;
42}
43
44#scope_module
45
46// List of pointers to Inst* that need to be filled in (patched).
47// Because the Inst* haven't been filled in yet,
48// we can use the Inst* word to hold the list's "next" pointer.
49// It's kind of sleazy, but it works well in practice.
50// See http://swtch.com/~rsc/regexp/regexp1.html for inspiration.
51//
52// Because the out and out1 fields in Inst are no longer pointers,
53// we can't use pointers directly here either. Instead, head refers
54// to inst_[head>>1].out (head&1 == 0) or inst_[head>>1].out1 (head&1 == 1).
55// head == 0 represents the null list. This is okay because instruction #0
56// is always the fail instruction, which never appears on a list.
57PatchList :: struct {
58 head: u32;
59 tail: u32; // for constant-time append
60}
61
62// Returns patch list containing just p.
63make_patch_list :: (p: int) -> PatchList {
64 l: PatchList;
65 l.head = cast(u32)p;
66 l.tail = cast(u32)p;
67 return l;
68}
69
70// Patches all the entries on l to have value p.
71patch :: (inst: *[..] Inst, l: PatchList, p: int) {
72 head := l.head;
73 while head != 0 {
74 ip := *(inst.*)[head>>1];
75 if head & 1 {
76 head = ip.out1;
77 ip.out1 = cast(u32)p;
78 } else {
79 head = get_out(ip.*);
80 set_out(ip, p);
81 }
82 }
83}
84
85// Appends two patch lists and returns result.
86append :: (inst: *[..] Inst, l1: PatchList, l2: PatchList) -> PatchList {
87 if l1.head == 0 return l2;
88 if l2.head == 0 return l1;
89
90 ip := *(inst.*)[l1.tail>>1];
91 if l1.tail & 1 {
92 ip.out1 = l2.head;
93 } else {
94 set_out(ip, l2.head);
95 }
96
97 l: PatchList;
98 l.head = l1.head;
99 l.tail = l2.tail;
100 return l;
101}
102
103NULL_PATCH_LIST :: PatchList.{0, 0};
104
105// Compiled program fragment.
106Frag :: struct {
107 begin: u32;
108 end: PatchList;
109}
110
111make_frag :: (begin: int, end: PatchList) -> Frag {
112 f: Frag;
113 f.begin = cast(u32)begin;
114 f.end = end;
115 return f;
116}
117
118// Input encodings.
119// Encoding :: enum {
120// kEncodingUTF8 = 1, // UTF-8 (0-10FFFF)
121// kEncodingLatin1, // Latin-1 (0-FF)
122// }
123
124
125Compiler :: struct {
126 pool: *Regexp_Pool;
127 prog: *Prog; // Program being built.
128 failed: bool; // Did we give up compiling?
129 // Encoding encoding_; // Input encoding
130 reversed: bool; // Should program run backward over text?
131
132 inst: [..] Inst;
133 max_ninst: int; // Maximum number of instructions.
134
135 max_mem: s64; // Total memory budget.
136
137 // std::unordered_map<uint64_t, int> rune_cache_;
138 rune_cache: Table(u64, int);
139 rune_range: Frag;
140 anchor: Anchor; // anchor mode for RE2::Set
141}
142
143// Compiles re, returning program.
144// Caller is responsible for deleting prog_.
145// If reversed is true, compiles a program that expects
146// to run over the input string backward (reverses all concatenations).
147// The reversed flag is also recorded in the returned program.
148compile :: (re: *Regexp_Node, pool: *Regexp_Pool, reversed := false, max_mem := 0) -> *Prog {
149
150 push_allocator(pool_allocator_proc, pool.pool);
151 // Init compiler
152 c: Compiler;
153 c.pool = pool;
154
155 c.prog = New(Prog);
156 c.reversed = reversed;
157
158 array_reserve(*c.inst, 8);
159
160 inst: Inst;
161 set_opcode(*inst, .kInstFail);
162 array_add(*c.inst, inst);
163
164 c.max_mem = max_mem;
165 if max_mem <= 0 {
166 c.max_ninst = 100000; // more than enough
167 } else if (max_mem <= size_of(Prog)) {
168 // No room for anything.
169 c.max_ninst = 0;
170 } else {
171 m := (max_mem - size_of(Prog)) / size_of(Inst);
172 // Limit instruction count so that inst->id() fits nicely in an int.
173 // SparseArray also assumes that the indices (inst->id()) are ints.
174 // The call to WalkExponential uses 2*max_ninst_ below,
175 // and other places in the code use 2 or 3 * prog->size().
176 // Limiting to 2^24 should avoid overflow in those places.
177 // (The point of allowing more than 32 bits of memory is to
178 // have plenty of room for the DFA states, not to use it up
179 // on the program.)
180 if (m >= 1<<24) {
181 m = 1<<24;
182 }
183 // Inst imposes its own limit (currently bigger than 2^24 but be safe).
184 if (m > Inst.MAX_INST) {
185 m = Inst.MAX_INST;
186 }
187 c.max_ninst = m;
188 }
189 c.anchor = .UNANCHORED;
190
191 // Simplify to remove things like counted repetitions
192 // and character classes like \d.
193 sre := simplify(re, pool);
194 if sre == null return null;
195 #if REGEXP_DEBUG print("Simplified: \"%\" -> \"%\"\n", to_string(re), to_string(sre));
196
197 // Record whether prog is anchored, removing the anchors.
198 // (They get in the way of other optimizations.)
199 anchor_start: bool;
200 anchor_end: bool;
201 sre, anchor_start = is_anchor_start(sre, pool, 0);
202 sre, anchor_end = is_anchor_end(sre, pool, 0);
203
204 // Generate fragment for entire regexp.
205 w := Walk(*Compiler, Frag).{pre_visit = compiler_pre_visit, post_visit = compiler_post_visit, short_visit = compiler_short_visit, copy = null};
206 // @ToDO @Cleanup This is a workaround for nulls not being respected in struct literals yet.
207 // Remove once the struct literal bug is fixed in jai
208 w.copy = null;
209 w.max_visits = 2 * c.max_ninst;
210 all := walk(*c, sre, Frag.{}, w);
211 if c.failed return null;
212
213 // Success! Finish by putting Match node at end, and record start.
214 // Turn off c.reversed (if it is set) to force the remaining concatenations
215 // to behave normally.
216 c.reversed = false;
217 all = cat(*c, all, match(*c, 0));
218
219 c.prog.reversed = reversed;
220 if (c.prog.reversed) {
221 c.prog.anchor_start = anchor_end;
222 c.prog.anchor_end = anchor_start;
223 } else {
224 c.prog.anchor_start = anchor_start;
225 c.prog.anchor_end = anchor_end;
226 }
227
228 c.prog.start = all.begin;
229 if (!c.prog.anchor_start) {
230 // Also create unanchored version, which starts with a .*? loop.
231 all = cat(*c, dot_star(*c), all);
232 }
233 c.prog.start_unanchored = all.begin;
234
235 // Hand ownership of prog to caller.
236 return finish(*c, re);
237}
238
239to_string :: (using comp: Compiler) -> string {
240 builder: String_Builder;
241 defer free_buffers(*builder);
242 for inst {
243 print_to_builder(*builder, "%. ", it_index);
244 append(*builder, it);
245 append(*builder, #char "\n");
246 }
247 return builder_to_string(*builder);
248}
249
250finish :: (using comp: *Compiler, re: *Regexp_Node) -> *Prog {
251 if failed return null;
252
253 if (prog.start == 0 && prog.start_unanchored == 0) {
254 // No possible matches; keep Fail instruction only.
255 inst.count = 1;
256 }
257
258 // Hand off the array to Prog.
259 prog.inst = inst;
260
261 optimize(prog);
262 #if REGEXP_DEBUG print("Optimized:\n%\n", to_string(prog.*));
263 flatten(prog);
264 #if REGEXP_DEBUG print("Flattened:\n%\n", to_string(prog.*));
265 compute_byte_map(prog);
266
267 if (!prog.reversed) {
268 prefix, prefix_foldcase := required_prefix_for_accel(re.*);
269 if prefix.count && !prefix_foldcase {
270 prog.prefix_size = prefix.count;
271 prog.prefix_front = prefix[0];
272 prog.prefix_back = prefix[prefix.count - 1];
273 }
274 }
275
276 // Record remaining memory for DFA.
277 // @ToDo
278 if (max_mem <= 0) {
279 prog.dfa_mem = 1 << 20;
280 } else {
281 m := max_mem - size_of(Prog);
282 m -= prog.inst.count * size_of(Inst); // account for inst_
283 if can_bit_state(prog) {
284 m -= prog.inst.count * size_of(u16); // account for list_heads_
285 }
286 if (m < 0) {
287 m = 0;
288 }
289 prog.dfa_mem = m;
290 }
291
292 p := prog;
293 prog = null;
294 return p;
295}
296
297
298alloc_inst :: (using comp: *Compiler, n: int) -> int {
299 if failed || inst.count + n > max_ninst {
300 failed = true;
301 return -1;
302 }
303
304 array_reserve(*inst, inst.count + n);
305 memset(inst.data + inst.count, 0, n * size_of(type_of(inst[0])));
306 id := inst.count;
307 inst.count += n;
308 return id;
309}
310
311
312// These routines are somewhat hard to visualize in text --
313// see http://swtch.com/~rsc/regexp/regexp1.html for
314// pictures explaining what is going on here.
315
316// Returns an unmatchable fragment.
317no_match :: () -> Frag {
318 return .{0, NULL_PATCH_LIST};
319}
320
321// Is a an unmatchable fragment?
322is_no_match :: (a: Frag) -> bool {
323 return a.begin == 0;
324}
325
326// Given fragments a and b, returns fragment for ab.
327cat :: (using comp: *Compiler, a: Frag, b: Frag) -> Frag {
328 if is_no_match(a) || is_no_match(b) return no_match();
329
330 // Elide no-op.
331 begin := *inst[a.begin];
332 if get_opcode(begin.*) == .kInstNop && a.end.head == (a.begin << 1) && get_out(begin.*) == 0 {
333 // in case refs to a somewhere
334 patch(*inst, a.end, b.begin);
335 return b;
336 }
337
338 // To run backward over string, reverse all concatenations.
339 if reversed {
340 patch(*inst, b.end, a.begin);
341 return make_frag(b.begin, a.end);
342 }
343
344 patch(*inst, a.end, b.begin);
345 return make_frag(a.begin, b.end);
346}
347
348// Given fragments for a and b, returns fragment for a|b.
349alt :: (using comp: *Compiler, a: Frag, b: Frag) -> Frag {
350 // Special case for convenience in loops.
351 if is_no_match(a) return b;
352 if is_no_match(b) return a;
353
354 array_reserve(*inst, inst.count + 1);
355 id := alloc_inst(comp, 1);
356 if id < 0 return no_match();
357 init_alt(*inst[id], a.begin, b.begin);
358 return make_frag(id, append(*inst, a.end, b.end));
359}
360
361// When capturing submatches in like-Perl mode, a kOpAlt Inst
362// treats out_ as the first choice, out1_ as the second.
363//
364// For *, +, and ?, if out_ causes another repetition,
365// then the operator is greedy. If out1_ is the repetition
366// (and out_ moves forward), then the operator is non-greedy.
367
368// Given a fragment a, returns a fragment for a* or a*? (if nongreedy)
369star :: (using comp: *Compiler, a: Frag, nongreedy: bool) -> Frag {
370 id := alloc_inst(comp, 1);
371 if id < 0 return no_match();
372 init_alt(*inst[id], 0, 0);
373 patch(*inst, a.end, id);
374 if nongreedy {
375 inst[id].out1 = a.begin;
376 return make_frag(id, make_patch_list(id << 1));
377 } else {
378 set_out(*inst[id], a.begin);
379 return make_frag(id, make_patch_list(id << 1 | 1));
380 }
381}
382
383// Given a fragment for a, returns a fragment for a+ or a+? (if nongreedy)
384plus :: (using comp: *Compiler, a: Frag, nongreedy: bool) -> Frag {
385 // a+ is just a* with a different entry point.
386 f := star(comp, a, nongreedy);
387 return make_frag(a.begin, f.end);
388}
389
390// Given a fragment for a, returns a fragment for a? or a?? (if nongreedy)
391quest :: (using comp: *Compiler, a: Frag, nongreedy: bool) -> Frag {
392 if is_no_match(a) return nop(comp);
393 id := alloc_inst(comp, 1);
394 if id < 0 return no_match();
395 pl: PatchList;
396 if nongreedy {
397 init_alt(*inst[id], 0, a.begin);
398 pl = make_patch_list(id << 1);
399 } else {
400 init_alt(*inst[id], a.begin, 0);
401 pl = make_patch_list((id << 1) | 1);
402 }
403 return make_frag(id, append(*inst, pl, a.end));
404}
405
406dot_star :: (using comp: *Compiler) -> Frag {
407 return star(comp, byte_range(comp, 0x00, 0xff, false), true);
408}
409
410// Returns a fragment for the byte range lo-hi.
411byte_range :: (using comp: *Compiler, lo: u8, hi: u8, foldcase: bool) -> Frag {
412 id := alloc_inst(comp, 1);
413 if id < 0 return no_match();
414 init_byte_range(*inst[id], lo, hi, foldcase, 0);
415 return make_frag(id, make_patch_list(id << 1));
416}
417
418// Returns a no-op fragment. Sometimes unavoidable.
419nop :: (using comp: *Compiler) -> Frag {
420 id := alloc_inst(comp, 1);
421 if id < 0 return no_match();
422 init_nop(*inst[id]);
423 return make_frag(id, make_patch_list(id << 1));
424}
425
426// Returns a fragment that signals a match.
427match :: (using comp: *Compiler, match_id: int) -> Frag {
428 id := alloc_inst(comp, 1);
429 if id < 0 return no_match();
430 init_match(*inst[id], cast(s32)match_id);
431 return make_frag(id, NULL_PATCH_LIST);
432}
433
434// Returns a fragment matching a particular empty-width op (like ^ or $)
435empty_width :: (using comp: *Compiler, empty: EmptyOp) -> Frag {
436 id := alloc_inst(comp, 1);
437 if id < 0 return no_match();
438 init_empty_width(*inst[id], empty, 0);
439 return make_frag(id, make_patch_list(id << 1));
440}
441
442// Given a fragment a, returns a fragment with capturing parens around a.
443capture :: (using comp: *Compiler, a: Frag, n: s32) -> Frag {
444 if is_no_match(a) return no_match();
445 id := alloc_inst(comp, 2);
446 if id < 0 return no_match();
447 init_capture(*inst[id], 2 * n, a.begin);
448 init_capture(*inst[id + 1], 2 * n + 1, 0);
449 patch(*inst, a.end, id + 1);
450 return make_frag(id, make_patch_list((id + 1) << 1));
451}
452
453// A Rune is a name for a Unicode code point.
454// Returns maximum rune encoded by UTF-8 sequence of length len.
455// ToDo: This should be an array of constants!
456max_rune :: (len: int) -> Rune {
457 b: int; // number of Rune bits in len-byte UTF-8 sequence (len < UTFmax)
458 if len == 1 {
459 b = 7;
460 } else {
461 b = 8-(len+1) + 6*(len-1);
462 }
463 return cast(Rune)(1<<b) - 1; // maximum Rune for b bits.
464}
465
466// The rune range compiler caches common suffix fragments,
467// which are very common in UTF-8 (e.g., [80-bf]).
468// The fragment suffixes are identified by their start
469// instructions. null denotes the eventual end match.
470// The Frag accumulates in rune_range_. Caching common
471// suffixes reduces the UTF-8 "." from 32 to 24 instructions,
472// and it reduces the corresponding one-pass NFA from 16 nodes to 8.
473begin_range :: (using comp: *Compiler) {
474 table_reset(*rune_cache);
475 rune_range.begin = 0;
476 rune_range.end = NULL_PATCH_LIST;
477}
478
479uncached_rune_byte_suffix :: (using comp: *Compiler, lo: u8, hi: u8, foldcase: bool, next: int) -> int {
480 f := byte_range(comp, lo, hi, foldcase);
481 if (next != 0) {
482 patch(*inst, f.end, next);
483 } else {
484 rune_range.end = append(*inst, rune_range.end, f.end);
485 }
486 return f.begin;
487}
488
489make_rune_cache_key :: (lo: u8, hi: u8, foldcase: bool, next: int) -> u64 {
490 return (cast(u64)next << 17) |
491 (cast(u64)lo << 9) |
492 (cast(u64)hi << 1) |
493 (cast(u64)foldcase);
494}
495
496cached_rune_byte_suffix :: (using comp: *Compiler, lo: u8, hi: u8, foldcase: bool, next: int) -> int {
497 key := make_rune_cache_key(lo, hi, foldcase, next);
498 ptr := table_find_pointer(*rune_cache, key);
499 if ptr return ptr.*;
500 id := uncached_rune_byte_suffix(comp, lo, hi, foldcase, next);
501 table_add(*rune_cache, key, id);
502 return id;
503}
504
505is_cached_rune_byte_suffix :: (using comp: *Compiler, id: int) -> bool {
506 lo := inst[id].lo;
507 hi := inst[id].hi;
508 foldcase := get_foldcase(inst[id]);
509 next := get_out(inst[id]);
510 key := make_rune_cache_key(lo, hi, foldcase, next);
511 return table_find_pointer(*rune_cache, key) != null;
512}
513
514add_suffix :: (using comp: *Compiler, id: int) {
515 if failed return;
516
517 if rune_range.begin == 0 {
518 rune_range.begin = cast(u32)id;
519 return;
520 }
521
522 rune_range.begin = add_suffix_recursive(comp, rune_range.begin, id);
523}
524
525add_suffix_recursive :: (using comp: *Compiler, root: u32, id: int) -> u32 {
526 assert(get_opcode(inst[root]) == .kInstAlt || get_opcode(inst[root]) == .kInstByteRange);
527
528 f := find_byte_range(comp, root, id);
529 if is_no_match(f) {
530 alt := alloc_inst(comp, 1);
531 if alt < 0 return 0;
532 init_alt(*inst[alt], cast(u32)root, cast(u32)id);
533 return cast(u32)alt;
534 }
535
536 br: u32;
537 if f.end.head == 0 {
538 br = root;
539 } else if (f.end.head&1) {
540 br = inst[f.begin].out1;
541 } else {
542 br = get_out(inst[f.begin]);
543 }
544
545 if is_cached_rune_byte_suffix(comp, br) {
546 // We can't fiddle with cached suffixes, so make a clone of the head.
547 byterange := alloc_inst(comp, 1);
548 if byterange < 0 return 0;
549 init_byte_range(*inst[byterange], inst[br].lo, inst[br].hi, get_foldcase(inst[br]), get_out(inst[br]));
550
551 // Ensure that the parent points to the clone, not to the original.
552 // Note that this could leave the head unreachable except via the cache.
553 br = cast(u32)byterange;
554 if (f.end.head == 0) {
555 root = br;
556 } else if (f.end.head&1) {
557 inst[f.begin].out1 = cast(u32)br;
558 } else {
559 set_out(*inst[f.begin], br);
560 }
561 }
562
563 out := get_out(inst[id]);
564 if !is_cached_rune_byte_suffix(comp, id) {
565 // The head should be the instruction most recently allocated, so free it
566 // instead of leaving it unreachable.
567 assert(id == inst.count - 1);
568 inst[id].out_opcode = 0;
569 inst[id].out1 = 0;
570 inst.count -= 1;
571 }
572
573 out = add_suffix_recursive(comp, get_out(inst[br]), out);
574 if out == 0 return 0;
575
576 set_out(*inst[br], out);
577 return root;
578}
579
580byte_range_equal :: (using comp: *Compiler, id1: int, id2: int) -> bool {
581 return inst[id1].lo == inst[id2].lo &&
582 inst[id1].hi == inst[id2].hi &&
583 get_foldcase(inst[id1]) == get_foldcase(inst[id2]);
584}
585
586find_byte_range :: (using comp: *Compiler, root: int, id: int) -> Frag {
587 if get_opcode(inst[root]) == .kInstByteRange {
588 if byte_range_equal(comp, root, id) {
589 return make_frag(root, NULL_PATCH_LIST);
590 } else {
591 return no_match();
592 }
593 }
594
595 while get_opcode(inst[root]) == .kInstAlt {
596 out1 := inst[root].out1;
597 if byte_range_equal(comp, out1, id) {
598 return make_frag(root, make_patch_list((root << 1) | 1));
599 }
600
601 // CharClass is a sorted list of ranges, so if out1 of the root Alt wasn't
602 // what we're looking for, then we can stop immediately. Unfortunately, we
603 // can't short-circuit the search in reverse mode.
604 if (!reversed) return no_match();
605
606 out := get_out(inst[root]);
607 if get_opcode(inst[out]) == .kInstAlt {
608 root = out;
609 } else if byte_range_equal(comp, out, id) {
610 return make_frag(root, make_patch_list(root << 1));
611 } else {
612 return no_match();
613 }
614 }
615
616 assert(false, "Should never be reached");
617 return no_match();
618}
619
620end_range :: (using comp: *Compiler) -> Frag {
621 return rune_range;
622}
623
624// Converts rune range lo-hi into a fragment that recognizes
625// the bytes that would make up those runes in the current
626// encoding (always utf-8 for now).
627// This lets the machine work byte-by-byte even when
628// using multibyte encodings.
629add_rune_range :: (using comp: *Compiler, lo: Rune, hi: Rune, foldcase: bool) {
630 if (lo > hi) return;
631
632 // Pick off 80-10FFFF as a common special case.
633 if (lo == 0x80 && hi == 0x10ffff) {
634 add_80_10ffff(comp);
635 return;
636 }
637
638 // Split range into same-length sized ranges.
639 for i: 1..UTFmax - 1 {
640 max := max_rune(i);
641 if (lo <= max && max < hi) {
642 add_rune_range(comp, lo, max, foldcase);
643 add_rune_range(comp, max+1, hi, foldcase);
644 return;
645 }
646 }
647
648 // ASCII range is always a special case.
649 if hi < Runeself {
650 add_suffix(comp, uncached_rune_byte_suffix(comp, cast(u8)lo, cast(u8)hi, foldcase, 0));
651 return;
652 }
653
654 // Split range into sections that agree on leading bytes.
655 for i: 1..UTFmax - 1 {
656 m := cast(s32)(1<<(6*i)) - 1; // last i bytes of a UTF-8 sequence
657 if ((lo & ~m) != (hi & ~m)) {
658 if ((lo & m) != 0) {
659 add_rune_range(comp, lo, lo|m, foldcase);
660 add_rune_range(comp, (lo|m)+1, hi, foldcase);
661 return;
662 }
663 if ((hi & m) != m) {
664 add_rune_range(comp, lo, (hi&~m)-1, foldcase);
665 add_rune_range(comp, hi&~m, hi, foldcase);
666 return;
667 }
668 }
669 }
670
671 // Finally. Generate byte matching equivalent for lo-hi.
672 ulo, n := bytes_from_rune(lo);
673 uhi, m := bytes_from_rune(hi);
674 assert(n == m);
675
676 // The logic below encodes this thinking:
677 //
678 // 1. When we have built the whole suffix, we know that it cannot
679 // possibly be a suffix of anything longer: in forward mode, nothing
680 // else can occur before the leading byte; in reverse mode, nothing
681 // else can occur after the last continuation byte or else the leading
682 // byte would have to change. Thus, there is no benefit to caching
683 // the first byte of the suffix whereas there is a cost involved in
684 // cloning it if it begins a common prefix, which is fairly likely.
685 //
686 // 2. Conversely, the last byte of the suffix cannot possibly be a
687 // prefix of anything because next == 0, so we will never want to
688 // clone it, but it is fairly likely to be a common suffix. Perhaps
689 // more so in reverse mode than in forward mode because the former is
690 // "converging" towards lower entropy, but caching is still worthwhile
691 // for the latter in cases such as 80-BF.
692 //
693 // 3. Handling the bytes between the first and the last is less
694 // straightforward and, again, the approach depends on whether we are
695 // "converging" towards lower entropy: in forward mode, a single byte
696 // is unlikely to be part of a common suffix whereas a byte range
697 // is more likely so; in reverse mode, a byte range is unlikely to
698 // be part of a common suffix whereas a single byte is more likely
699 // so. The same benefit versus cost argument applies here.
700 id := 0;
701 if (reversed) {
702 for i: 0..n-1 {
703 // In reverse UTF-8 mode: cache the leading byte; don't cache the last
704 // continuation byte; cache anything else iff it's a single byte (XX-XX).
705 if (i == 0 || (ulo[i] == uhi[i] && i != n-1)) {
706 id = cached_rune_byte_suffix(comp, ulo[i], uhi[i], false, id);
707 } else {
708 id = uncached_rune_byte_suffix(comp, ulo[i], uhi[i], false, id);
709 }
710 }
711 } else {
712 for < i: 0..n-1 {
713 // In forward UTF-8 mode: don't cache the leading byte; cache the last
714 // continuation byte; cache anything else iff it's a byte range (XX-YY).
715 if (i == n-1 || (ulo[i] < uhi[i] && i != 0)) {
716 id = cached_rune_byte_suffix(comp, ulo[i], uhi[i], false, id);
717 } else {
718 id = uncached_rune_byte_suffix(comp, ulo[i], uhi[i], false, id);
719 }
720 }
721 }
722 add_suffix(comp, id);
723}
724
725add_80_10ffff :: (using comp: *Compiler) {
726 // The 80-10FFFF (Runeself-Runemax) rune range occurs frequently enough
727 // (for example, for /./ and /[^a-z]/) that it is worth simplifying: by
728 // permitting overlong encodings in E0 and F0 sequences and code points
729 // over 10FFFF in F4 sequences, the size of the bytecode and the number
730 // of equivalence classes are reduced significantly.
731 id: int;
732 if reversed {
733 // Prefix factoring matters, but we don't have to handle it here
734 // because the rune range trie logic takes care of that already.
735 id = uncached_rune_byte_suffix(comp, 0xC2, 0xDF, false, 0);
736 id = uncached_rune_byte_suffix(comp, 0x80, 0xBF, false, id);
737 add_suffix(comp, id);
738
739 id = uncached_rune_byte_suffix(comp, 0xE0, 0xEF, false, 0);
740 id = uncached_rune_byte_suffix(comp, 0x80, 0xBF, false, id);
741 id = uncached_rune_byte_suffix(comp, 0x80, 0xBF, false, id);
742 add_suffix(comp, id);
743
744 id = uncached_rune_byte_suffix(comp, 0xF0, 0xF4, false, 0);
745 id = uncached_rune_byte_suffix(comp, 0x80, 0xBF, false, id);
746 id = uncached_rune_byte_suffix(comp, 0x80, 0xBF, false, id);
747 id = uncached_rune_byte_suffix(comp, 0x80, 0xBF, false, id);
748 add_suffix(comp, id);
749 } else {
750 // Suffix factoring matters - and we do have to handle it here.
751 cont1 := uncached_rune_byte_suffix(comp, 0x80, 0xBF, false, 0);
752 id = uncached_rune_byte_suffix(comp, 0xC2, 0xDF, false, cont1);
753 add_suffix(comp, id);
754
755 cont2 := uncached_rune_byte_suffix(comp, 0x80, 0xBF, false, cont1);
756 id = uncached_rune_byte_suffix(comp, 0xE0, 0xEF, false, cont2);
757 add_suffix(comp, id);
758
759 cont3 := uncached_rune_byte_suffix(comp, 0x80, 0xBF, false, cont2);
760 id = uncached_rune_byte_suffix(comp, 0xF0, 0xF4, false, cont3);
761 add_suffix(comp, id);
762 }
763}
764
765
766compiler_short_visit :: (using comp: *Compiler, re: *Regexp_Node, f: Frag) -> Frag {
767 failed = true;
768 return no_match();
769}
770
771compiler_pre_visit :: (using comp: *Compiler, re: *Regexp_Node, f: Frag) -> Frag, stop: bool {
772 return .{}, failed;
773}
774
775literal :: (using comp: *Compiler, r: Rune, foldcase: bool) -> Frag {
776 if (r < Runeself) { // Make common case fast.
777 return byte_range(comp, cast(u8)r, cast(u8)r, foldcase);
778 }
779
780 buf, n := bytes_from_rune(r);
781 f := byte_range(comp, buf[0], buf[0], false);
782 for i: 1..n-1 {
783 f = cat(comp, f, byte_range(comp, buf[i], buf[i], false));
784 }
785 return f;
786}
787
788// Called after traversing the node's children during the walk.
789// Given their frags, build and return the frag for this re.
790// @ToDo @Cleanup: Compiler never uses any input args. Use something more specific than "walk"
791compiler_post_visit :: (using comp: *Compiler, re: *Regexp_Node, parent_arg: Frag, pre_arg: Frag, child_frags: [..] Frag) -> Frag {
792 // If a child failed, don't bother going forward, especially
793 // since the child_frags might contain Frags with NULLs in them.
794 if failed return no_match();
795
796 // Given the child fragments, return the fragment for this node.
797 if re.op == {
798 case;
799 assert(false, "Unexpected op: %", re.op);
800 return no_match();
801
802 case .NoMatch;
803 return no_match();
804
805 case .EmptyMatch;
806 return nop(comp);
807
808 case .HaveMatch;
809 f := match(comp, re.match_id);
810 if anchor == .ANCHOR_BOTH {
811 // Append \z or else the subexpression will effectively be unanchored.
812 // Complemented by the Unanchored case in CompileSet().
813 f = cat(comp, empty_width(comp, .kEmptyEndText), f);
814 }
815 return f;
816
817 case .Concat;
818 f := child_frags[0];
819 for i: 1..child_frags.count - 1 {
820 f = cat(comp, f, child_frags[i]);
821 }
822 return f;
823
824 case .Alternate;
825 f := child_frags[0];
826 for i: 1..child_frags.count - 1 {
827 f = alt(comp, f, child_frags[i]);
828 }
829 return f;
830
831 case .Star;
832 return star(comp, child_frags[0], (re.parse_flags & .NonGreedy) != 0);
833
834 case .Plus;
835 return plus(comp, child_frags[0], (re.parse_flags & .NonGreedy) != 0);
836
837 case .Quest;
838 return quest(comp, child_frags[0], (re.parse_flags & .NonGreedy) != 0);
839
840 case .Literal;
841 return literal(comp, re.rune, (re.parse_flags & .FoldCase) != 0);
842
843 case .LiteralString;
844 // Concatenation of literals.
845 if re.runes.count == 0 return nop(comp);
846
847 f: Frag;
848 for re.runes {
849 f1 := literal(comp, it, (re.parse_flags & .FoldCase) != 0);
850 if (it_index == 0) {
851 f = f1;
852 } else {
853 f = cat(comp, f, f1);
854 }
855 }
856 return f;
857
858 case .AnyChar;
859 begin_range(comp);
860 add_rune_range(comp, 0, Runemax, false);
861 return end_range(comp);
862
863 case .AnyByte;
864 return byte_range(comp, 0x00, 0xFF, false);
865
866 case .CharClass;
867 assert(re.char_class.ranges.count > 0, "No ranges in char class");
868
869 // ASCII case-folding optimization: if the char class
870 // behaves the same on A-Z as it does on a-z,
871 // discard any ranges wholly contained in A-Z
872 // and mark the other ranges as foldascii.
873 // This reduces the size of a program for
874 // (?i)abc from 3 insts per letter to 1 per letter.
875 foldascii := re.char_class.folds_ascii;
876
877 // Character class is just a big OR of the different
878 // character ranges in the class.
879 begin_range(comp);
880 for re.char_class.ranges {
881 // ASCII case-folding optimization (see above).
882 if foldascii && #char "A" <= it.lo && it.hi <= #char "Z" continue;
883
884 // If this range contains all of A-Za-z or none of it,
885 // the fold flag is unnecessary; don't bother.
886 fold := foldascii;
887 if it.lo <= #char "A" && #char "z" <= it.hi || it.hi < #char "A" || #char "z" < it.lo || #char "Z" < it.lo && it.hi < #char "a" {
888 fold = false;
889 }
890
891 add_rune_range(comp, it.lo, it.hi, fold);
892 }
893 return end_range(comp);
894
895 case .Capture;
896 // If this is a non-capturing parenthesis -- (?:foo) --
897 // just use the inner expression.
898 if re.cap < 0 {
899 return child_frags[0];
900 } else {
901 return capture(comp, child_frags[0], re.cap);
902 }
903
904 case .BeginLine;
905 return empty_width(comp, ifx reversed then EmptyOp.kEmptyEndLine else EmptyOp.kEmptyBeginLine);
906
907 case .EndLine;
908 return empty_width(comp, ifx reversed then EmptyOp.kEmptyBeginLine else EmptyOp.kEmptyEndLine);
909
910 case .BeginText;
911 return empty_width(comp, ifx reversed then EmptyOp.kEmptyEndText else EmptyOp.kEmptyBeginText);
912
913 case .EndText;
914 return empty_width(comp, ifx reversed then EmptyOp.kEmptyBeginText else EmptyOp.kEmptyEndText);
915
916 case .WordBoundary;
917 return empty_width(comp, .kEmptyWordBoundary);
918
919 case .NoWordBoundary;
920 return empty_width(comp, .kEmptyNonWordBoundary);
921 }
922}
923
924// Is this regexp required to start at the beginning of the text?
925// Only approximate; can return false for complicated regexps like (\Aa|\Ab),
926// but handles (\A(a|b)). Could use the Walker to write a more exact one.
927is_anchor_start :: (re: *Regexp_Node, pool: *Regexp_Pool, depth: int) -> *Regexp_Node, bool {
928 // The depth limit makes sure that we don't overflow
929 // the stack on a deeply nested regexp. As the comment
930 // above says, IsAnchorStart is conservative, so returning
931 // a false negative is okay. The exact limit is somewhat arbitrary.
932 if re == null || depth >= 4 return re, false;
933
934 if re.op == {
935 case .Concat;
936 if re.subs.count > 0 {
937 sub, sub_start := is_anchor_start(re.subs[0], pool, depth + 1);
938 if sub_start {
939 return new_concat(pool, re.subs, re.parse_flags), true;
940 }
941 }
942 case .Capture;
943 sub, sub_start := is_anchor_start(re.subs[0], pool, depth + 1);
944 if sub_start {
945 return new_capture(pool, sub, re.parse_flags, re.cap), true;
946 }
947 case .BeginText;
948 return new_literal_string(pool, .[], re.parse_flags), true;
949 }
950 return re, false;
951}
952
953// Is this regexp required to start at the end of the text?
954// Only approximate; can return false for complicated regexps like (a\z|b\z),
955// but handles ((a|b)\z). Could use the Walker to write a more exact one.
956is_anchor_end :: (re: *Regexp_Node, pool: *Regexp_Pool, depth: int) -> *Regexp_Node, bool {
957 // The depth limit makes sure that we don't overflow
958 // the stack on a deeply nested regexp. As the comment
959 // above says, IsAnchorEnd is conservative, so returning
960 // a false negative is okay. The exact limit is somewhat arbitrary.
961 if (re == null || depth >= 4) return re, false;
962
963 if re.op == {
964 case .Concat;
965 if re.subs.count > 0 {
966 sub, sub_end := is_anchor_end(re.subs[re.subs.count - 1], pool, depth + 1);
967 if sub_end {
968 return new_concat(pool, re.subs, re.parse_flags), true;
969 }
970 }
971 case .Capture;
972 sub, sub_end := is_anchor_end(re.subs[0], pool, depth + 1);
973 if sub_end {
974 return new_capture(pool, sub, re.parse_flags, re.cap), true;
975 }
976
977 case .EndText;
978 return new_literal_string(pool, .[], re.parse_flags), true;
979 }
980
981 return re, false;
982}
983
984#scope_file
985
986#import "Hash_Table";
987
Copyright 2026  E766CB298A6D1E64 | Git-Thing heavily inspired by cgit