<<
path:
root/public/blog.git/html/modules/uniform/compile.jai
blob: 0c1ac86a41e5162e8ce50330bfcc231a97313f03
[raw]
[clear marker]
3 suffix_regexp: *Regexp_Node;
5 num_capture_groups: int;
9uninit :: (re: *Regexp) {
13compile :: (pattern: string, parse_flags := ParseFlags.LikePerl) -> Regexp, success: bool {
15 entire_regexp, status, pool := parse(pattern, parse_flags);
17 if status.code != .Success {
18 log_error("Could not parse regexp \"%\": % (%)", pattern, status.code, status.error_arg);
24 #if REGEXP_DEBUG print("Parsed: \"%\"\n", to_string(entire_regexp));
26 regexp.prefix, regexp.prefix_foldcase, regexp.suffix_regexp = required_prefix(entire_regexp, *regexp.pool);
27 #if REGEXP_DEBUG print("Split: \"%\" (foldcase: %) + \"%\"\n", regexp.prefix, regexp.prefix_foldcase, to_string(regexp.suffix_regexp));
29 regexp.prog = compile(regexp.suffix_regexp, *regexp.pool);
31 log_error("Could not compile regexp \"%\"", pattern);
35 #if REGEXP_DEBUG print("Compiled suffix:\n%\n", to_string(regexp.prog));
37 regexp.num_capture_groups = compute_num_captures(regexp.suffix_regexp);
38// // @ToDo: For one-pass NFA
39// // is_one_pass = compute_is_one_pass(prog);
46// List of pointers to Inst* that need to be filled in (patched).
47// Because the Inst* haven't been filled in yet,
48// we can use the Inst* word to hold the list's "next" pointer.
49// It's kind of sleazy, but it works well in practice.
50// See http://swtch.com/~rsc/regexp/regexp1.html for inspiration.
52// Because the out and out1 fields in Inst are no longer pointers,
53// we can't use pointers directly here either. Instead, head refers
54// to inst_[head>>1].out (head&1 == 0) or inst_[head>>1].out1 (head&1 == 1).
55// head == 0 represents the null list. This is okay because instruction #0
56// is always the fail instruction, which never appears on a list.
59 tail: u32; // for constant-time append
62// Returns patch list containing just p.
63make_patch_list :: (p: int) -> PatchList {
70// Patches all the entries on l to have value p.
71patch :: (inst: *[..] Inst, l: PatchList, p: int) {
74 ip := *(inst.*)[head>>1];
85// Appends two patch lists and returns result.
86append :: (inst: *[..] Inst, l1: PatchList, l2: PatchList) -> PatchList {
87 if l1.head == 0 return l2;
88 if l2.head == 0 return l1;
90 ip := *(inst.*)[l1.tail>>1];
103NULL_PATCH_LIST :: PatchList.{0, 0};
105// Compiled program fragment.
111make_frag :: (begin: int, end: PatchList) -> Frag {
113 f.begin = cast(u32)begin;
120// kEncodingUTF8 = 1, // UTF-8 (0-10FFFF)
121// kEncodingLatin1, // Latin-1 (0-FF)
127 prog: *Prog; // Program being built.
128 failed: bool; // Did we give up compiling?
129 // Encoding encoding_; // Input encoding
130 reversed: bool; // Should program run backward over text?
133 max_ninst: int; // Maximum number of instructions.
135 max_mem: s64; // Total memory budget.
137 // std::unordered_map<uint64_t, int> rune_cache_;
138 rune_cache: Table(u64, int);
140 anchor: Anchor; // anchor mode for RE2::Set
143// Compiles re, returning program.
144// Caller is responsible for deleting prog_.
145// If reversed is true, compiles a program that expects
146// to run over the input string backward (reverses all concatenations).
147// The reversed flag is also recorded in the returned program.
148compile :: (re: *Regexp_Node, pool: *Regexp_Pool, reversed := false, max_mem := 0) -> *Prog {
150 push_allocator(pool_allocator_proc, pool.pool);
156 c.reversed = reversed;
158 array_reserve(*c.inst, 8);
161 set_opcode(*inst, .kInstFail);
162 array_add(*c.inst, inst);
166 c.max_ninst = 100000; // more than enough
167 } else if (max_mem <= size_of(Prog)) {
168 // No room for anything.
171 m := (max_mem - size_of(Prog)) / size_of(Inst);
172 // Limit instruction count so that inst->id() fits nicely in an int.
173 // SparseArray also assumes that the indices (inst->id()) are ints.
174 // The call to WalkExponential uses 2*max_ninst_ below,
175 // and other places in the code use 2 or 3 * prog->size().
176 // Limiting to 2^24 should avoid overflow in those places.
177 // (The point of allowing more than 32 bits of memory is to
178 // have plenty of room for the DFA states, not to use it up
183 // Inst imposes its own limit (currently bigger than 2^24 but be safe).
184 if (m > Inst.MAX_INST) {
189 c.anchor = .UNANCHORED;
191 // Simplify to remove things like counted repetitions
192 // and character classes like \d.
193 sre := simplify(re, pool);
194 if sre == null return null;
195 #if REGEXP_DEBUG print("Simplified: \"%\" -> \"%\"\n", to_string(re), to_string(sre));
197 // Record whether prog is anchored, removing the anchors.
198 // (They get in the way of other optimizations.)
201 sre, anchor_start = is_anchor_start(sre, pool, 0);
202 sre, anchor_end = is_anchor_end(sre, pool, 0);
204 // Generate fragment for entire regexp.
205 w := Walk(*Compiler, Frag).{pre_visit = compiler_pre_visit, post_visit = compiler_post_visit, short_visit = compiler_short_visit, copy = null};
206 // @ToDO @Cleanup This is a workaround for nulls not being respected in struct literals yet.
207 // Remove once the struct literal bug is fixed in jai
209 w.max_visits = 2 * c.max_ninst;
210 all := walk(*c, sre, Frag.{}, w);
211 if c.failed return null;
213 // Success! Finish by putting Match node at end, and record start.
214 // Turn off c.reversed (if it is set) to force the remaining concatenations
215 // to behave normally.
217 all = cat(*c, all, match(*c, 0));
219 c.prog.reversed = reversed;
220 if (c.prog.reversed) {
221 c.prog.anchor_start = anchor_end;
222 c.prog.anchor_end = anchor_start;
224 c.prog.anchor_start = anchor_start;
225 c.prog.anchor_end = anchor_end;
228 c.prog.start = all.begin;
229 if (!c.prog.anchor_start) {
230 // Also create unanchored version, which starts with a .*? loop.
231 all = cat(*c, dot_star(*c), all);
233 c.prog.start_unanchored = all.begin;
235 // Hand ownership of prog to caller.
236 return finish(*c, re);
239to_string :: (using comp: Compiler) -> string {
240 builder: String_Builder;
241 defer free_buffers(*builder);
243 print_to_builder(*builder, "%. ", it_index);
244 append(*builder, it);
245 append(*builder, #char "\n");
247 return builder_to_string(*builder);
250finish :: (using comp: *Compiler, re: *Regexp_Node) -> *Prog {
251 if failed return null;
253 if (prog.start == 0 && prog.start_unanchored == 0) {
254 // No possible matches; keep Fail instruction only.
258 // Hand off the array to Prog.
262 #if REGEXP_DEBUG print("Optimized:\n%\n", to_string(prog.*));
264 #if REGEXP_DEBUG print("Flattened:\n%\n", to_string(prog.*));
265 compute_byte_map(prog);
267 if (!prog.reversed) {
268 prefix, prefix_foldcase := required_prefix_for_accel(re.*);
269 if prefix.count && !prefix_foldcase {
270 prog.prefix_size = prefix.count;
271 prog.prefix_front = prefix[0];
272 prog.prefix_back = prefix[prefix.count - 1];
276 // Record remaining memory for DFA.
279 prog.dfa_mem = 1 << 20;
281 m := max_mem - size_of(Prog);
282 m -= prog.inst.count * size_of(Inst); // account for inst_
283 if can_bit_state(prog) {
284 m -= prog.inst.count * size_of(u16); // account for list_heads_
298alloc_inst :: (using comp: *Compiler, n: int) -> int {
299 if failed || inst.count + n > max_ninst {
304 array_reserve(*inst, inst.count + n);
305 memset(inst.data + inst.count, 0, n * size_of(type_of(inst[0])));
312// These routines are somewhat hard to visualize in text --
313// see http://swtch.com/~rsc/regexp/regexp1.html for
314// pictures explaining what is going on here.
316// Returns an unmatchable fragment.
317no_match :: () -> Frag {
318 return .{0, NULL_PATCH_LIST};
321// Is a an unmatchable fragment?
322is_no_match :: (a: Frag) -> bool {
326// Given fragments a and b, returns fragment for ab.
327cat :: (using comp: *Compiler, a: Frag, b: Frag) -> Frag {
328 if is_no_match(a) || is_no_match(b) return no_match();
331 begin := *inst[a.begin];
332 if get_opcode(begin.*) == .kInstNop && a.end.head == (a.begin << 1) && get_out(begin.*) == 0 {
333 // in case refs to a somewhere
334 patch(*inst, a.end, b.begin);
338 // To run backward over string, reverse all concatenations.
340 patch(*inst, b.end, a.begin);
341 return make_frag(b.begin, a.end);
344 patch(*inst, a.end, b.begin);
345 return make_frag(a.begin, b.end);
348// Given fragments for a and b, returns fragment for a|b.
349alt :: (using comp: *Compiler, a: Frag, b: Frag) -> Frag {
350 // Special case for convenience in loops.
351 if is_no_match(a) return b;
352 if is_no_match(b) return a;
354 array_reserve(*inst, inst.count + 1);
355 id := alloc_inst(comp, 1);
356 if id < 0 return no_match();
357 init_alt(*inst[id], a.begin, b.begin);
358 return make_frag(id, append(*inst, a.end, b.end));
361// When capturing submatches in like-Perl mode, a kOpAlt Inst
362// treats out_ as the first choice, out1_ as the second.
364// For *, +, and ?, if out_ causes another repetition,
365// then the operator is greedy. If out1_ is the repetition
366// (and out_ moves forward), then the operator is non-greedy.
368// Given a fragment a, returns a fragment for a* or a*? (if nongreedy)
369star :: (using comp: *Compiler, a: Frag, nongreedy: bool) -> Frag {
370 id := alloc_inst(comp, 1);
371 if id < 0 return no_match();
372 init_alt(*inst[id], 0, 0);
373 patch(*inst, a.end, id);
375 inst[id].out1 = a.begin;
376 return make_frag(id, make_patch_list(id << 1));
378 set_out(*inst[id], a.begin);
379 return make_frag(id, make_patch_list(id << 1 | 1));
383// Given a fragment for a, returns a fragment for a+ or a+? (if nongreedy)
384plus :: (using comp: *Compiler, a: Frag, nongreedy: bool) -> Frag {
385 // a+ is just a* with a different entry point.
386 f := star(comp, a, nongreedy);
387 return make_frag(a.begin, f.end);
390// Given a fragment for a, returns a fragment for a? or a?? (if nongreedy)
391quest :: (using comp: *Compiler, a: Frag, nongreedy: bool) -> Frag {
392 if is_no_match(a) return nop(comp);
393 id := alloc_inst(comp, 1);
394 if id < 0 return no_match();
397 init_alt(*inst[id], 0, a.begin);
398 pl = make_patch_list(id << 1);
400 init_alt(*inst[id], a.begin, 0);
401 pl = make_patch_list((id << 1) | 1);
403 return make_frag(id, append(*inst, pl, a.end));
406dot_star :: (using comp: *Compiler) -> Frag {
407 return star(comp, byte_range(comp, 0x00, 0xff, false), true);
410// Returns a fragment for the byte range lo-hi.
411byte_range :: (using comp: *Compiler, lo: u8, hi: u8, foldcase: bool) -> Frag {
412 id := alloc_inst(comp, 1);
413 if id < 0 return no_match();
414 init_byte_range(*inst[id], lo, hi, foldcase, 0);
415 return make_frag(id, make_patch_list(id << 1));
418// Returns a no-op fragment. Sometimes unavoidable.
419nop :: (using comp: *Compiler) -> Frag {
420 id := alloc_inst(comp, 1);
421 if id < 0 return no_match();
423 return make_frag(id, make_patch_list(id << 1));
426// Returns a fragment that signals a match.
427match :: (using comp: *Compiler, match_id: int) -> Frag {
428 id := alloc_inst(comp, 1);
429 if id < 0 return no_match();
430 init_match(*inst[id], cast(s32)match_id);
431 return make_frag(id, NULL_PATCH_LIST);
434// Returns a fragment matching a particular empty-width op (like ^ or $)
435empty_width :: (using comp: *Compiler, empty: EmptyOp) -> Frag {
436 id := alloc_inst(comp, 1);
437 if id < 0 return no_match();
438 init_empty_width(*inst[id], empty, 0);
439 return make_frag(id, make_patch_list(id << 1));
442// Given a fragment a, returns a fragment with capturing parens around a.
443capture :: (using comp: *Compiler, a: Frag, n: s32) -> Frag {
444 if is_no_match(a) return no_match();
445 id := alloc_inst(comp, 2);
446 if id < 0 return no_match();
447 init_capture(*inst[id], 2 * n, a.begin);
448 init_capture(*inst[id + 1], 2 * n + 1, 0);
449 patch(*inst, a.end, id + 1);
450 return make_frag(id, make_patch_list((id + 1) << 1));
453// A Rune is a name for a Unicode code point.
454// Returns maximum rune encoded by UTF-8 sequence of length len.
455// ToDo: This should be an array of constants!
456max_rune :: (len: int) -> Rune {
457 b: int; // number of Rune bits in len-byte UTF-8 sequence (len < UTFmax)
461 b = 8-(len+1) + 6*(len-1);
463 return cast(Rune)(1<<b) - 1; // maximum Rune for b bits.
466// The rune range compiler caches common suffix fragments,
467// which are very common in UTF-8 (e.g., [80-bf]).
468// The fragment suffixes are identified by their start
469// instructions. null denotes the eventual end match.
470// The Frag accumulates in rune_range_. Caching common
471// suffixes reduces the UTF-8 "." from 32 to 24 instructions,
472// and it reduces the corresponding one-pass NFA from 16 nodes to 8.
473begin_range :: (using comp: *Compiler) {
474 table_reset(*rune_cache);
475 rune_range.begin = 0;
476 rune_range.end = NULL_PATCH_LIST;
479uncached_rune_byte_suffix :: (using comp: *Compiler, lo: u8, hi: u8, foldcase: bool, next: int) -> int {
480 f := byte_range(comp, lo, hi, foldcase);
482 patch(*inst, f.end, next);
484 rune_range.end = append(*inst, rune_range.end, f.end);
489make_rune_cache_key :: (lo: u8, hi: u8, foldcase: bool, next: int) -> u64 {
490 return (cast(u64)next << 17) |
496cached_rune_byte_suffix :: (using comp: *Compiler, lo: u8, hi: u8, foldcase: bool, next: int) -> int {
497 key := make_rune_cache_key(lo, hi, foldcase, next);
498 ptr := table_find_pointer(*rune_cache, key);
500 id := uncached_rune_byte_suffix(comp, lo, hi, foldcase, next);
501 table_add(*rune_cache, key, id);
505is_cached_rune_byte_suffix :: (using comp: *Compiler, id: int) -> bool {
508 foldcase := get_foldcase(inst[id]);
509 next := get_out(inst[id]);
510 key := make_rune_cache_key(lo, hi, foldcase, next);
511 return table_find_pointer(*rune_cache, key) != null;
514add_suffix :: (using comp: *Compiler, id: int) {
517 if rune_range.begin == 0 {
518 rune_range.begin = cast(u32)id;
522 rune_range.begin = add_suffix_recursive(comp, rune_range.begin, id);
525add_suffix_recursive :: (using comp: *Compiler, root: u32, id: int) -> u32 {
526 assert(get_opcode(inst[root]) == .kInstAlt || get_opcode(inst[root]) == .kInstByteRange);
528 f := find_byte_range(comp, root, id);
530 alt := alloc_inst(comp, 1);
532 init_alt(*inst[alt], cast(u32)root, cast(u32)id);
539 } else if (f.end.head&1) {
540 br = inst[f.begin].out1;
542 br = get_out(inst[f.begin]);
545 if is_cached_rune_byte_suffix(comp, br) {
546 // We can't fiddle with cached suffixes, so make a clone of the head.
547 byterange := alloc_inst(comp, 1);
548 if byterange < 0 return 0;
549 init_byte_range(*inst[byterange], inst[br].lo, inst[br].hi, get_foldcase(inst[br]), get_out(inst[br]));
551 // Ensure that the parent points to the clone, not to the original.
552 // Note that this could leave the head unreachable except via the cache.
553 br = cast(u32)byterange;
554 if (f.end.head == 0) {
556 } else if (f.end.head&1) {
557 inst[f.begin].out1 = cast(u32)br;
559 set_out(*inst[f.begin], br);
563 out := get_out(inst[id]);
564 if !is_cached_rune_byte_suffix(comp, id) {
565 // The head should be the instruction most recently allocated, so free it
566 // instead of leaving it unreachable.
567 assert(id == inst.count - 1);
568 inst[id].out_opcode = 0;
573 out = add_suffix_recursive(comp, get_out(inst[br]), out);
574 if out == 0 return 0;
576 set_out(*inst[br], out);
580byte_range_equal :: (using comp: *Compiler, id1: int, id2: int) -> bool {
581 return inst[id1].lo == inst[id2].lo &&
582 inst[id1].hi == inst[id2].hi &&
583 get_foldcase(inst[id1]) == get_foldcase(inst[id2]);
586find_byte_range :: (using comp: *Compiler, root: int, id: int) -> Frag {
587 if get_opcode(inst[root]) == .kInstByteRange {
588 if byte_range_equal(comp, root, id) {
589 return make_frag(root, NULL_PATCH_LIST);
595 while get_opcode(inst[root]) == .kInstAlt {
596 out1 := inst[root].out1;
597 if byte_range_equal(comp, out1, id) {
598 return make_frag(root, make_patch_list((root << 1) | 1));
601 // CharClass is a sorted list of ranges, so if out1 of the root Alt wasn't
602 // what we're looking for, then we can stop immediately. Unfortunately, we
603 // can't short-circuit the search in reverse mode.
604 if (!reversed) return no_match();
606 out := get_out(inst[root]);
607 if get_opcode(inst[out]) == .kInstAlt {
609 } else if byte_range_equal(comp, out, id) {
610 return make_frag(root, make_patch_list(root << 1));
616 assert(false, "Should never be reached");
620end_range :: (using comp: *Compiler) -> Frag {
624// Converts rune range lo-hi into a fragment that recognizes
625// the bytes that would make up those runes in the current
626// encoding (always utf-8 for now).
627// This lets the machine work byte-by-byte even when
628// using multibyte encodings.
629add_rune_range :: (using comp: *Compiler, lo: Rune, hi: Rune, foldcase: bool) {
632 // Pick off 80-10FFFF as a common special case.
633 if (lo == 0x80 && hi == 0x10ffff) {
638 // Split range into same-length sized ranges.
639 for i: 1..UTFmax - 1 {
641 if (lo <= max && max < hi) {
642 add_rune_range(comp, lo, max, foldcase);
643 add_rune_range(comp, max+1, hi, foldcase);
648 // ASCII range is always a special case.
650 add_suffix(comp, uncached_rune_byte_suffix(comp, cast(u8)lo, cast(u8)hi, foldcase, 0));
654 // Split range into sections that agree on leading bytes.
655 for i: 1..UTFmax - 1 {
656 m := cast(s32)(1<<(6*i)) - 1; // last i bytes of a UTF-8 sequence
657 if ((lo & ~m) != (hi & ~m)) {
659 add_rune_range(comp, lo, lo|m, foldcase);
660 add_rune_range(comp, (lo|m)+1, hi, foldcase);
664 add_rune_range(comp, lo, (hi&~m)-1, foldcase);
665 add_rune_range(comp, hi&~m, hi, foldcase);
671 // Finally. Generate byte matching equivalent for lo-hi.
672 ulo, n := bytes_from_rune(lo);
673 uhi, m := bytes_from_rune(hi);
676 // The logic below encodes this thinking:
678 // 1. When we have built the whole suffix, we know that it cannot
679 // possibly be a suffix of anything longer: in forward mode, nothing
680 // else can occur before the leading byte; in reverse mode, nothing
681 // else can occur after the last continuation byte or else the leading
682 // byte would have to change. Thus, there is no benefit to caching
683 // the first byte of the suffix whereas there is a cost involved in
684 // cloning it if it begins a common prefix, which is fairly likely.
686 // 2. Conversely, the last byte of the suffix cannot possibly be a
687 // prefix of anything because next == 0, so we will never want to
688 // clone it, but it is fairly likely to be a common suffix. Perhaps
689 // more so in reverse mode than in forward mode because the former is
690 // "converging" towards lower entropy, but caching is still worthwhile
691 // for the latter in cases such as 80-BF.
693 // 3. Handling the bytes between the first and the last is less
694 // straightforward and, again, the approach depends on whether we are
695 // "converging" towards lower entropy: in forward mode, a single byte
696 // is unlikely to be part of a common suffix whereas a byte range
697 // is more likely so; in reverse mode, a byte range is unlikely to
698 // be part of a common suffix whereas a single byte is more likely
699 // so. The same benefit versus cost argument applies here.
703 // In reverse UTF-8 mode: cache the leading byte; don't cache the last
704 // continuation byte; cache anything else iff it's a single byte (XX-XX).
705 if (i == 0 || (ulo[i] == uhi[i] && i != n-1)) {
706 id = cached_rune_byte_suffix(comp, ulo[i], uhi[i], false, id);
708 id = uncached_rune_byte_suffix(comp, ulo[i], uhi[i], false, id);
713 // In forward UTF-8 mode: don't cache the leading byte; cache the last
714 // continuation byte; cache anything else iff it's a byte range (XX-YY).
715 if (i == n-1 || (ulo[i] < uhi[i] && i != 0)) {
716 id = cached_rune_byte_suffix(comp, ulo[i], uhi[i], false, id);
718 id = uncached_rune_byte_suffix(comp, ulo[i], uhi[i], false, id);
722 add_suffix(comp, id);
725add_80_10ffff :: (using comp: *Compiler) {
726 // The 80-10FFFF (Runeself-Runemax) rune range occurs frequently enough
727 // (for example, for /./ and /[^a-z]/) that it is worth simplifying: by
728 // permitting overlong encodings in E0 and F0 sequences and code points
729 // over 10FFFF in F4 sequences, the size of the bytecode and the number
730 // of equivalence classes are reduced significantly.
733 // Prefix factoring matters, but we don't have to handle it here
734 // because the rune range trie logic takes care of that already.
735 id = uncached_rune_byte_suffix(comp, 0xC2, 0xDF, false, 0);
736 id = uncached_rune_byte_suffix(comp, 0x80, 0xBF, false, id);
737 add_suffix(comp, id);
739 id = uncached_rune_byte_suffix(comp, 0xE0, 0xEF, false, 0);
740 id = uncached_rune_byte_suffix(comp, 0x80, 0xBF, false, id);
741 id = uncached_rune_byte_suffix(comp, 0x80, 0xBF, false, id);
742 add_suffix(comp, id);
744 id = uncached_rune_byte_suffix(comp, 0xF0, 0xF4, false, 0);
745 id = uncached_rune_byte_suffix(comp, 0x80, 0xBF, false, id);
746 id = uncached_rune_byte_suffix(comp, 0x80, 0xBF, false, id);
747 id = uncached_rune_byte_suffix(comp, 0x80, 0xBF, false, id);
748 add_suffix(comp, id);
750 // Suffix factoring matters - and we do have to handle it here.
751 cont1 := uncached_rune_byte_suffix(comp, 0x80, 0xBF, false, 0);
752 id = uncached_rune_byte_suffix(comp, 0xC2, 0xDF, false, cont1);
753 add_suffix(comp, id);
755 cont2 := uncached_rune_byte_suffix(comp, 0x80, 0xBF, false, cont1);
756 id = uncached_rune_byte_suffix(comp, 0xE0, 0xEF, false, cont2);
757 add_suffix(comp, id);
759 cont3 := uncached_rune_byte_suffix(comp, 0x80, 0xBF, false, cont2);
760 id = uncached_rune_byte_suffix(comp, 0xF0, 0xF4, false, cont3);
761 add_suffix(comp, id);
766compiler_short_visit :: (using comp: *Compiler, re: *Regexp_Node, f: Frag) -> Frag {
771compiler_pre_visit :: (using comp: *Compiler, re: *Regexp_Node, f: Frag) -> Frag, stop: bool {
775literal :: (using comp: *Compiler, r: Rune, foldcase: bool) -> Frag {
776 if (r < Runeself) { // Make common case fast.
777 return byte_range(comp, cast(u8)r, cast(u8)r, foldcase);
780 buf, n := bytes_from_rune(r);
781 f := byte_range(comp, buf[0], buf[0], false);
783 f = cat(comp, f, byte_range(comp, buf[i], buf[i], false));
788// Called after traversing the node's children during the walk.
789// Given their frags, build and return the frag for this re.
790// @ToDo @Cleanup: Compiler never uses any input args. Use something more specific than "walk"
791compiler_post_visit :: (using comp: *Compiler, re: *Regexp_Node, parent_arg: Frag, pre_arg: Frag, child_frags: [..] Frag) -> Frag {
792 // If a child failed, don't bother going forward, especially
793 // since the child_frags might contain Frags with NULLs in them.
794 if failed return no_match();
796 // Given the child fragments, return the fragment for this node.
799 assert(false, "Unexpected op: %", re.op);
809 f := match(comp, re.match_id);
810 if anchor == .ANCHOR_BOTH {
811 // Append \z or else the subexpression will effectively be unanchored.
812 // Complemented by the Unanchored case in CompileSet().
813 f = cat(comp, empty_width(comp, .kEmptyEndText), f);
819 for i: 1..child_frags.count - 1 {
820 f = cat(comp, f, child_frags[i]);
826 for i: 1..child_frags.count - 1 {
827 f = alt(comp, f, child_frags[i]);
832 return star(comp, child_frags[0], (re.parse_flags & .NonGreedy) != 0);
835 return plus(comp, child_frags[0], (re.parse_flags & .NonGreedy) != 0);
838 return quest(comp, child_frags[0], (re.parse_flags & .NonGreedy) != 0);
841 return literal(comp, re.rune, (re.parse_flags & .FoldCase) != 0);
844 // Concatenation of literals.
845 if re.runes.count == 0 return nop(comp);
849 f1 := literal(comp, it, (re.parse_flags & .FoldCase) != 0);
853 f = cat(comp, f, f1);
860 add_rune_range(comp, 0, Runemax, false);
861 return end_range(comp);
864 return byte_range(comp, 0x00, 0xFF, false);
867 assert(re.char_class.ranges.count > 0, "No ranges in char class");
869 // ASCII case-folding optimization: if the char class
870 // behaves the same on A-Z as it does on a-z,
871 // discard any ranges wholly contained in A-Z
872 // and mark the other ranges as foldascii.
873 // This reduces the size of a program for
874 // (?i)abc from 3 insts per letter to 1 per letter.
875 foldascii := re.char_class.folds_ascii;
877 // Character class is just a big OR of the different
878 // character ranges in the class.
880 for re.char_class.ranges {
881 // ASCII case-folding optimization (see above).
882 if foldascii && #char "A" <= it.lo && it.hi <= #char "Z" continue;
884 // If this range contains all of A-Za-z or none of it,
885 // the fold flag is unnecessary; don't bother.
887 if it.lo <= #char "A" && #char "z" <= it.hi || it.hi < #char "A" || #char "z" < it.lo || #char "Z" < it.lo && it.hi < #char "a" {
891 add_rune_range(comp, it.lo, it.hi, fold);
893 return end_range(comp);
896 // If this is a non-capturing parenthesis -- (?:foo) --
897 // just use the inner expression.
899 return child_frags[0];
901 return capture(comp, child_frags[0], re.cap);
905 return empty_width(comp, ifx reversed then EmptyOp.kEmptyEndLine else EmptyOp.kEmptyBeginLine);
908 return empty_width(comp, ifx reversed then EmptyOp.kEmptyBeginLine else EmptyOp.kEmptyEndLine);
911 return empty_width(comp, ifx reversed then EmptyOp.kEmptyEndText else EmptyOp.kEmptyBeginText);
914 return empty_width(comp, ifx reversed then EmptyOp.kEmptyBeginText else EmptyOp.kEmptyEndText);
917 return empty_width(comp, .kEmptyWordBoundary);
919 case .NoWordBoundary;
920 return empty_width(comp, .kEmptyNonWordBoundary);
924// Is this regexp required to start at the beginning of the text?
925// Only approximate; can return false for complicated regexps like (\Aa|\Ab),
926// but handles (\A(a|b)). Could use the Walker to write a more exact one.
927is_anchor_start :: (re: *Regexp_Node, pool: *Regexp_Pool, depth: int) -> *Regexp_Node, bool {
928 // The depth limit makes sure that we don't overflow
929 // the stack on a deeply nested regexp. As the comment
930 // above says, IsAnchorStart is conservative, so returning
931 // a false negative is okay. The exact limit is somewhat arbitrary.
932 if re == null || depth >= 4 return re, false;
936 if re.subs.count > 0 {
937 sub, sub_start := is_anchor_start(re.subs[0], pool, depth + 1);
939 return new_concat(pool, re.subs, re.parse_flags), true;
943 sub, sub_start := is_anchor_start(re.subs[0], pool, depth + 1);
945 return new_capture(pool, sub, re.parse_flags, re.cap), true;
948 return new_literal_string(pool, .[], re.parse_flags), true;
953// Is this regexp required to start at the end of the text?
954// Only approximate; can return false for complicated regexps like (a\z|b\z),
955// but handles ((a|b)\z). Could use the Walker to write a more exact one.
956is_anchor_end :: (re: *Regexp_Node, pool: *Regexp_Pool, depth: int) -> *Regexp_Node, bool {
957 // The depth limit makes sure that we don't overflow
958 // the stack on a deeply nested regexp. As the comment
959 // above says, IsAnchorEnd is conservative, so returning
960 // a false negative is okay. The exact limit is somewhat arbitrary.
961 if (re == null || depth >= 4) return re, false;
965 if re.subs.count > 0 {
966 sub, sub_end := is_anchor_end(re.subs[re.subs.count - 1], pool, depth + 1);
968 return new_concat(pool, re.subs, re.parse_flags), true;
972 sub, sub_end := is_anchor_end(re.subs[0], pool, depth + 1);
974 return new_capture(pool, sub, re.parse_flags, re.cap), true;
978 return new_literal_string(pool, .[], re.parse_flags), true;