#scope_module // Opcodes for Inst InstOp :: enum u32 { kInstAlt :: 0; // choose between out_ and out1_ kInstAltMatch; // Alt: out_ is [00-FF] and back, out1_ is match; or vice versa. kInstByteRange; // next (possible case-folded) byte must be in [lo_, hi_] kInstCapture; // capturing parenthesis number cap_ kInstEmptyWidth; // empty-width special (^ $ ...); bit(s) set in empty_ kInstMatch; // found a match! kInstNop; // no-op; occasionally unavoidable kInstFail; // never match; occasionally unavoidable } // Bit flags for empty-width specials EmptyOp :: enum_flags { kEmptyBeginLine :: 1<<0; // ^ - beginning of line kEmptyEndLine :: 1<<1; // $ - end of line kEmptyBeginText :: 1<<2; // \A - beginning of text kEmptyEndText :: 1<<3; // \z - end of text kEmptyWordBoundary :: 1<<4; // \b - word boundary kEmptyNonWordBoundary :: 1<<5; // \B - not \b kEmptyAllFlags :: (1<<6)-1; } // Single instruction in regexp program. Inst :: struct { // // Getters // int id(Prog* p) { return static_cast(this - p->inst_.data()); } // int out1() { DCHECK(opcode() == kInstAlt || opcode() == kInstAltMatch); return out1_; } // int cap() { DCHECK_EQ(opcode(), kInstCapture); return cap_; } // int lo() { DCHECK_EQ(opcode(), kInstByteRange); return lo_; } // int hi() { DCHECK_EQ(opcode(), kInstByteRange); return hi_; } // int foldcase() { DCHECK_EQ(opcode(), kInstByteRange); return hint_foldcase_&1; } // int hint() { DCHECK_EQ(opcode(), kInstByteRange); return hint_foldcase_>>1; } // int match_id() { DCHECK_EQ(opcode(), kInstMatch); return match_id_; } // EmptyOp empty() { DCHECK_EQ(opcode(), kInstEmptyWidth); return empty_; } // Maximum instruction id. // (Must fit in out_opcode_. PatchList/last steal another bit.) MAX_INST :: (1<<28) - 1; out_opcode: u32; // 28 bits: out, 1 bit: last, 3 (low) bits: opcode union { // additional instruction arguments: out1: u32; // opcode == kInstAlt // alternate next instruction cap: s32; // opcode == kInstCapture // Index of capture register (holds text // position recorded by capturing parentheses). // For \n (the submatch for the nth parentheses), // the left parenthesis captures into register 2*n // and the right one captures into register 2*n+1. match_id: s32; // opcode == kInstMatch // Match ID to identify this match (for re2::Set). struct { // opcode == kInstByteRange lo: u8; // byte range is lo_-hi_ inclusive hi: u8; // hint_foldcase: u16; // 15 bits: hint, 1 (low) bit: foldcase // hint to execution engines: the delta to the // next instruction (in the current list) worth // exploring iff this instruction matched; 0 // means there are no remaining possibilities, // which is most likely for character classes. // foldcase: A-Z -> a-z before checking range. } empty: EmptyOp; // opcode == kInstEmptyWidth // empty_ is bitwise OR of kEmpty* flags above. }; } get_opcode :: (using inst: Inst) -> InstOp { return cast(InstOp) (out_opcode & 7); } get_last :: (using inst: Inst) -> bool { return cast(bool)((out_opcode >> 3) & 1); } get_out :: (using inst: Inst) -> u32 { return out_opcode >> 4; } set_opcode :: (using inst: *Inst, opcode: InstOp) { out_opcode = out_opcode & 0xFFFF_FFF8 | cast(u32)opcode; } set_last :: (using inst: *Inst) { out_opcode = out_opcode & 0xFFFF_FFF7 | cast(u32)(1 << 3); } set_out :: (using inst: *Inst, out: int) { out_opcode = out_opcode & 0x0F | cast(u32)(out << 4); } set_out_opcode :: (using inst: *Inst, out: int, opcode: InstOp) { out_opcode = out_opcode & 0x08 | cast(u32)(out << 4) | cast(u32)opcode; } get_foldcase :: (using inst: Inst) -> bool { assert(get_opcode(inst) == .kInstByteRange); return cast(bool) (hint_foldcase & 1); } get_hint :: (using inst: Inst) -> int { assert(get_opcode(inst) == .kInstByteRange); return hint_foldcase >> 1; } greedy :: (using inst: Inst, p: *Prog) -> bool { assert(get_opcode(inst) == .kInstAltMatch); out_inst := p.inst[get_out(inst)]; return get_opcode(out_inst) == .kInstByteRange || (get_opcode(out_inst) == .kInstNop && get_opcode(p.inst[get_out(out_inst)]) == .kInstByteRange); } // Does this inst (an kInstByteRange) match c? // @ToDo: should this int be Rune? matches :: (using inst: Inst, c: int) -> bool { assert(get_opcode(inst) == .kInstByteRange); if (get_foldcase(inst) && #char "A" <= c && c <= #char "Z") { c += #char "a" - #char "A"; } return lo <= c && c <= hi; } // Returns whether byte c is a word character: ASCII only. // Used by the implementation of \b and \B. // This is not right for Unicode, but: // - it's hard to get right in a byte-at-a-time matching world // (the DFA has only one-byte lookahead). // - even if the lookahead were possible, the Progs would be huge. // This crude approximation is the same one PCRE uses. is_word_char :: (c: u8) -> bool { return (#char "A" <= c && c <= #char "Z") || (#char "a" <= c && c <= #char "z") || (#char "0" <= c && c <= #char "9") || c == #char "_"; } // Compiled form of regexp program. Prog :: struct { // Whether to anchor the search. Anchor :: enum { UNANCHORED; // match anywhere ANCHORED; // match only starting at beginning of text } // Kind of match to look for (for anchor != kFullMatch) // // kLongestMatch mode finds the overall longest // match but still makes its submatch choices the way // Perl would, not in the way prescribed by POSIX. // The POSIX rules are much more expensive to implement, // and no one has needed them. // // kFullMatch is not strictly necessary -- we could use // kLongestMatch and then check the length of the match -- but // the matching code can run faster if it knows to consider only // full matches. MatchKind :: enum { kFirstMatch; // like Perl, PCRE kLongestMatch; // like egrep or POSIX kFullMatch; // match only entire text; implies anchor==ANCHORED kManyMatch; // for SearchDFA, records set of matches }; anchor_start: bool; // regexp has explicit start anchor anchor_end: bool; // regexp has explicit end anchor reversed: bool; // whether program runs backward over input did_flatten: bool; // has Flatten been called? did_onepass: bool; // has IsOnePass been called? start: int; // entry point for program start_unanchored: int; // unanchored entry point for program bytemap_range: int; // bytemap_[x] < bytemap_range_ prefix_size: int; // size of prefix (0 if no prefix) prefix_front: s16 = -1; // first byte of prefix (-1 if no prefix) prefix_back: s16 = -1; // last byte of prefix (-1 if no prefix) list_count: int; // count of lists (see above) inst_count: [#run enum_highest_value(InstOp) + 1] int; // count of instructions by opcode list_heads: [..] u16; // sparse array enumerating list heads // not populated if size_ is overly large inst: [..] Inst; // pointer to instruction array onepass_nodes: [..] u8; // data for OnePass nodes dfa_mem: s64; // Maximum memory for DFAs. // dfa_first: *DFA; // DFA cached for kFirstMatch/kManyMatch // dfa_longest: *DFA; // DFA cached for kLongestMatch/kFullMatch bytemap: [256] u8; // map from input bytes to byte classes // std::once_flag dfa_first_once_; // std::once_flag dfa_longest_once_; } // Inst *inst(int id) { return &inst_[id]; } // int start() { return start_; } // void set_start(int start) { start_ = start; } // int start_unanchored() { return start_unanchored_; } // void set_start_unanchored(int start) { start_unanchored_ = start; } // int size() { return size_; } // bool reversed() { return reversed_; } // void set_reversed(bool reversed) { reversed_ = reversed; } // int list_count() { return list_count_; } // int inst_count(InstOp op) { return inst_count_[op]; } // uint16_t* list_heads() { return list_heads_.data(); } // int64_t dfa_mem() { return dfa_mem_; } // void set_dfa_mem(int64_t dfa_mem) { dfa_mem_ = dfa_mem; } // bool anchor_start() { return anchor_start_; } // void set_anchor_start(bool b) { anchor_start_ = b; } // bool anchor_end() { return anchor_end_; } // void set_anchor_end(bool b) { anchor_end_ = b; } // int bytemap_range() { return bytemap_range_; } // const uint8_t* bytemap() { return bytemap_; } // bool can_prefix_accel() { return prefix_size_ != 0; } // // Accelerates to the first likely occurrence of the prefix. // // Returns a pointer to the first byte or NULL if not found. // const void* PrefixAccel(const void* data, size_t size) { // DCHECK_GE(prefix_size_, 1); // return prefix_size_ == 1 ? memchr(data, prefix_front_, size) // : PrefixAccel_FrontAndBack(data, size); // } can_bit_state :: (using prog: *Prog) -> bool { return list_heads.count > 0; } init_alt :: (inst: *Inst, out: u32, out1: u32) { assert(inst.out_opcode == 0); set_out_opcode(inst, out, .kInstAlt); inst.out1 = out1; } init_byte_range :: (inst: *Inst, lo: u8, hi: u8, foldcase: bool, out: int) { assert(inst.out_opcode == 0); set_out_opcode(inst, out, .kInstByteRange); inst.lo = lo; inst.hi = hi; inst.hint_foldcase = (cast(u16)foldcase)&1; } init_capture :: (inst: *Inst, cap: s32, out: u32) { assert(inst.out_opcode == 0); set_out_opcode(inst, out, .kInstCapture); inst.cap = cap; } init_empty_width :: (inst: *Inst, empty: EmptyOp, out: u32) { assert(inst.out_opcode == 0); set_out_opcode(inst, out, .kInstEmptyWidth); inst.empty = empty; } init_match :: (inst: *Inst, id: s32) { assert(inst.out_opcode == 0); set_opcode(inst, .kInstMatch); inst.match_id = id; } init_nop :: (inst: *Inst) { assert(inst.out_opcode == 0); set_opcode(inst, .kInstNop); } init_fail :: (inst: *Inst) { assert(inst.out_opcode == 0); set_opcode(inst, .kInstFail); } // Peep-hole optimizer. optimize :: (using prog: *Prog) { q: Sparse_Set; init(*q, inst.count); // Eliminate nops. Most are taken out during compilation // but a few are hard to avoid. q.count = 0; add_if_nonzero(*q, start); for id: q { ip := *inst[id]; j := get_out(ip.*); // ToDo: Why is this a pointer???? jp: *Inst; while true { if j == 0 break; jp = *inst[j]; if get_opcode(jp.*) != .kInstNop break; j = get_out(jp.*); } set_out(ip, j); add_if_nonzero(*q, j); if get_opcode(ip.*) == .kInstAlt { j = ip.out1; while true { if j == 0 break; jp = *inst[j]; if get_opcode(jp.*) != .kInstNop break; j = get_out(jp.*); } ip.out1 = j; add_if_nonzero(*q, j); } } // Insert kInstAltMatch instructions // Look for // ip: Alt -> j | k // j: ByteRange [00-FF] -> ip // k: Match // or the reverse (the above is the greedy one). // Rewrite Alt to AltMatch. q.count = 0; add_if_nonzero(*q, start); for id: q { ip := *inst[id]; add_if_nonzero(*q, get_out(ip.*)); if get_opcode(ip.*) == .kInstAlt { add_if_nonzero(*q, ip.out1); // ToDo: Why are these pointers???? j := inst[get_out(ip.*)]; k := inst[ip.out1]; if get_opcode(j) == .kInstByteRange && get_out(j) == id && j.lo == 0x00 && j.hi == 0xFF && is_match(prog, k) { set_opcode(ip, .kInstAltMatch); continue; } if is_match(prog, j) && get_opcode(k) == .kInstByteRange && get_out(k) == id && k.lo == 0x00 && k.hi == 0xFF { set_opcode(ip, .kInstAltMatch); } } } } add_if_nonzero :: (set: *Sparse_Set, i: int) { if i != 0 { add(set, i); } } // Is ip a guaranteed match at end of text, perhaps after some capturing? is_match :: (prog: *Prog, i_in: Inst) -> bool { i := i_in; while true { if #complete get_opcode(i) == { case .kInstAlt; #through; case .kInstAltMatch; #through; case .kInstByteRange; #through; case .kInstFail; #through; case .kInstEmptyWidth; return false; case .kInstCapture; #through; case .kInstNop; i = prog.inst[get_out(i)]; case .kInstMatch; return true; } } // Should not be needed assert(false); return false; } empty_flags :: (text: string, pos: int) -> EmptyOp { flags: EmptyOp; // ^ and \A if pos == 0 { flags |= EmptyOp.kEmptyBeginText | .kEmptyBeginLine; } else if text[pos - 1] == #char "\n" { flags |= .kEmptyBeginLine; } // $ and \z if pos == text.count { flags |= EmptyOp.kEmptyEndText | .kEmptyEndLine; } else if text[pos] == #char "\n" { // @ToDo: I removed a pos < count check here. Can pos ever be greater than count?? flags |= .kEmptyEndLine; } // \b and \B if pos == 0 && text.count == 0 { // no word boundary here } else if pos == 0 { if is_word_char(text[pos]) { flags |= .kEmptyWordBoundary; } } else if pos == text.count { if is_word_char(text[pos-1]) { flags |= .kEmptyWordBoundary; } } else { if (is_word_char(text[pos-1]) != is_word_char(text[pos])) { flags |= .kEmptyWordBoundary; } } if !(flags & .kEmptyWordBoundary) { flags |= .kEmptyNonWordBoundary; } return flags; } compute_byte_map :: (using prog: *Prog) { // Fill in bytemap with byte classes for the program. // Ranges of bytes that are treated indistinguishably // will be mapped to a single byte class. builder: ByteMapBuilder; init(*builder); // Don't repeat the work for ^ and $. marked_line_boundaries := false; // Don't repeat the work for \b and \B. marked_word_boundaries := false; for id: 0..inst.count - 1 { ip := *inst[id]; if get_opcode(ip.*) == .kInstByteRange { lo: s32 = ip.lo; hi: s32 = ip.hi; mark(*builder, lo, hi); if get_foldcase(ip.*) && lo <= #char "z" && hi >= #char "a" { foldlo := lo; foldhi := hi; if foldlo < #char "a" { foldlo = #char "a"; } if foldhi > #char "z" { foldhi = #char "z"; } if (foldlo <= foldhi) { foldlo += #char "A" - #char "a"; foldhi += #char "A" - #char "a"; mark(*builder, foldlo, foldhi); } } // If this Inst is not the last Inst in its list AND the next Inst is // also a ByteRange AND the Insts have the same out, defer the merge. if !get_last(ip.*) && get_opcode(inst[id + 1]) == .kInstByteRange && get_out(ip.*) == get_out(inst[id+1]) { continue; } merge(*builder); } else if get_opcode(ip.*) == .kInstEmptyWidth { if ip.empty & (EmptyOp.kEmptyBeginLine|.kEmptyEndLine) && !marked_line_boundaries { mark(*builder, #char "\n", #char "\n"); merge(*builder); marked_line_boundaries = true; } if ip.empty & (EmptyOp.kEmptyWordBoundary|.kEmptyNonWordBoundary) && !marked_word_boundaries { // We require two batches here: the first for ranges that are word // characters, the second for ranges that are not word characters. for isword: bool.[true, false] { i := 0; j: int; while i < 256 { j = i + 1; while j < 256 && is_word_char(cast(u8)i) == is_word_char(cast(u8)j) { j += 1; } if is_word_char(cast(u8)i) == isword { mark(*builder, cast(u8)i, cast(u8)(j - 1)); } i = j; } merge(*builder); } marked_word_boundaries = true; } } } bytemap, bytemap_range = build(*builder); } // ByteMapBuilder implements a coloring algorithm. // // The first phase is a series of "mark and merge" batches: we mark one or more // [lo-hi] ranges, then merge them into our internal state. Batching is not for // performance; rather, it means that the ranges are treated indistinguishably. // // Internally, the ranges are represented using a bitmap that stores the splits // and a vector that stores the colors; both of them are indexed by the ranges' // last bytes. Thus, in order to merge a [lo-hi] range, we split at lo-1 and at // hi (if not already split), then recolor each range in between. The color map // (i.e. from the old color to the new color) is maintained for the lifetime of // the batch and so underpins this somewhat obscure approach to set operations. // // The second phase builds the bytemap from our internal state: we recolor each // range, then store the new color (which is now the byte class) in each of the // corresponding array elements. Finally, we output the number of byte classes. ByteMapBuilder :: struct { splits: Bit_Array; colors: [256] int; nextcolor: int; colormap: [..] Color_Pair; ranges: [..] Range; Range :: struct { lo: u8; hi: u8; } Color_Pair :: struct { a: int; b: int; } } // Initial state: the [0-255] range has color 256. // This will avoid problems during the second phase, // in which we assign byte classes numbered from 0. init :: (using b: *ByteMapBuilder) { init_bit_array(*splits, 256); set_bit(*splits, 255); colors[255] = 256; nextcolor = 257; } mark :: (using builder: *ByteMapBuilder, lo: s32, hi: s32) { assert(lo <= hi); // Ignore any [0-255] ranges. They cause us to recolor every range, which // has no effect on the eventual result and is therefore a waste of time. if lo == 0 && hi == 255 return; r: Range; r.lo = cast(u8) lo; r.hi = cast(u8) hi; array_add(*ranges, r); } find_next_set_bit :: (using set: Bit_Array, start: int) -> s16 { i := start >> 6; // Mask out everything below start word := slots[i] & (0xFFFF_FFFF_FFFF_FFFF << (start & 63)); if word { return cast(u8)((i << 6) + bit_scan_forward(word) - 1); } i += 1; while i < slots.count { if slots[i] { return cast(u8)((i << 6) + bit_scan_forward(slots[i]) - 1); } i += 1; } return -1; } merge :: (using builder: *ByteMapBuilder) { for r: ranges { lo := (cast(s16)r.lo) - 1; hi: s16 = r.hi; if lo >= 0 && !splits[lo] { set_bit(*splits, lo); next := find_next_set_bit(splits, lo + 1); colors[lo] = colors[next]; } if !splits[hi] { set_bit(*splits, hi); next := find_next_set_bit(splits, hi + 1); colors[hi] = colors[next]; } c := lo + 1; while c < 256 { next := find_next_set_bit(splits, c); colors[next] = recolor(builder, colors[next]); if next == hi break; c = next + 1; } } colormap.count = 0; ranges.count = 0; } build :: (using builder: *ByteMapBuilder) -> [256] u8, bytemap_range: int { // Assign byte classes numbered from 0. nextcolor = 0; bytemap: [256] u8; c := 0; while c < 256 { next := find_next_set_bit(splits, c); b := cast(u8) recolor(builder, colors[next]); while c <= next { bytemap[c] = b; c += 1; } } return bytemap, nextcolor; } recolor :: (using builder: *ByteMapBuilder, oldcolor: int) -> int { // Yes, this is a linear search. There can be at most 256 // colors and there will typically be far fewer than that. // Also, we need to consider keys *and* values in order to // avoid recoloring a given range more than once per batch. for colormap { if it.a == oldcolor || it.b == oldcolor return it.b; } newcolor := nextcolor; nextcolor += 1; p: Color_Pair; p.a = oldcolor; p.b = newcolor; array_add(*colormap, p); return newcolor; } // Prog::Flatten() implements a graph rewriting algorithm. // // The overall process is similar to epsilon removal, but retains some epsilon // transitions: those from Capture and EmptyWidth instructions; and those from // nullable subexpressions. (The latter avoids quadratic blowup in transitions // in the worst case.) It might be best thought of as Alt instruction elision. // // In conceptual terms, it divides the Prog into "trees" of instructions, then // traverses the "trees" in order to produce "lists" of instructions. A "tree" // is one or more instructions that grow from one "root" instruction to one or // more "leaf" instructions; if a "tree" has exactly one instruction, then the // "root" is also the "leaf". In most cases, a "root" is the successor of some // "leaf" (i.e. the "leaf" instruction's out() returns the "root" instruction) // and is considered a "successor root". A "leaf" can be a ByteRange, Capture, // EmptyWidth or Match instruction. However, this is insufficient for handling // nested nullable subexpressions correctly, so in some cases, a "root" is the // dominator of the instructions reachable from some "successor root" (i.e. it // has an unreachable predecessor) and is considered a "dominator root". Since // only Alt instructions can be "dominator roots" (other instructions would be // "leaves"), only Alt instructions are required to be marked as predecessors. // // Dividing the Prog into "trees" comprises two passes: marking the "successor // roots" and the predecessors; and marking the "dominator roots". Sorting the // "successor roots" by their bytecode offsets enables iteration in order from // greatest to least during the second pass; by working backwards in this case // and flooding the graph no further than "leaves" and already marked "roots", // it becomes possible to mark "dominator roots" without doing excessive work. // // Traversing the "trees" is just iterating over the "roots" in order of their // marking and flooding the graph no further than "leaves" and "roots". When a // "leaf" is reached, the instruction is copied with its successor remapped to // its "root" number. When a "root" is reached, a Nop instruction is generated // with its successor remapped similarly. As each "list" is produced, its last // instruction is marked as such. After all of the "lists" have been produced, // a pass over their instructions remaps their successors to bytecode offsets. flatten :: (using prog: *Prog) { if did_flatten { return; } did_flatten = true; // Scratch structures. It's important that these are reused by functions // that we call in loops because they would thrash the heap otherwise. reachable: Sparse_Set; stk: [..] int; init(*reachable, inst.count); array_reserve(*stk, inst.count); // First pass: Marks "successor roots" and predecessors. // Builds the mapping from inst-ids to root-ids. rootmap, predmap, preds_list := mark_successors(prog, *reachable, *stk); // Second pass: Marks "dominator roots". sorted := array_copy(rootmap.dense); sorted.count = rootmap.count; intro_sort(sorted, (a: $T, b: T) -> int { if a.index < b.index { return -1; } else { return 1; } }); for < sorted { if it.index != start_unanchored && it.index != start { mark_dominator(prog, it.index, *rootmap, predmap, preds_list, *reachable, *stk); } } // Third pass: Emits "lists". Remaps outs to root-ids. // Builds the mapping from root-ids to flat-ids. flatmap: [..] int; flat: [..] Inst; array_resize(*flatmap, rootmap.count, false); array_reserve(*flat, inst.count); for rootmap { flatmap[it.value] = flat.count; emit_list(prog, it.index, *rootmap, *flat, *reachable, *stk); set_last(*flat[flat.count - 1]); // We have the bounds of the "list", so this is the // most convenient point at which to compute hints. compute_hints(prog, *flat, flatmap[it.value], flat.count); } list_count = flatmap.count; memset(inst_count.data, 0, inst_count.count * size_of(type_of(inst_count[0]))); // Fourth pass: Remaps outs to flat-ids. // Counts instructions by opcode. for * flat { if (get_opcode(it.*) != .kInstAltMatch) { // handled in EmitList() set_out(it, flatmap[get_out(it.*)]); } inst_count[get_opcode(it.*)] += 1; } total := 0; for inst_count { total += it; } assert(total == flat.count); // Remap start_unanchored and start. if start_unanchored == 0 { assert(start == 0); } else if (start_unanchored == start) { start_unanchored = flatmap[1]; start = flatmap[1]; } else { start_unanchored = flatmap[1]; start = flatmap[2]; } // Finally, replace the old instructions with the new instructions. inst = flat; // Populate the list heads for BitState. // 512 instructions limits the memory footprint to 1KiB. if inst.count <= 512 { array_resize(*list_heads, inst.count, false); // 0xFF makes it more obvious if we try to look up a non-head. memset(list_heads.data, 0xFF, inst.count * size_of(type_of(list_heads[0]))); for 0..list_count - 1 { list_heads[flatmap[it]] = cast(u16)it; } } } mark_successors :: (using prog: *Prog, reachable: *Sparse_Set, stk: *[..] int) -> rootmap: Sparse_Array(int), predmap: Sparse_Array(int), preds_list: [..] [..] int { get_or_add_preds :: (preds_list: *[..] [..] int, predmap: *Sparse_Array(int), id: int) -> *[..] int { preds: * [..] int; if !contains(predmap.*, id) { add_unchecked(predmap, id, preds_list.count); array_reserve(preds_list, preds_list.count + 1); memset(preds_list.data + preds_list.count, 0, size_of(type_of(preds_list[0]))); preds_list.count += 1; preds = *(preds_list.*)[preds_list.count - 1]; preds.data = null; preds.count = 0; } else { preds = *(preds_list.*)[get(predmap.*, id)]; } return preds; } rootmap: Sparse_Array(int); predmap: Sparse_Array(int); init(*rootmap, inst.count); init(*predmap, inst.count); preds_list: [..] [..] int; // Mark the kInstFail instruction. add_unchecked(*rootmap, 0, rootmap.count); // Mark the start_unanchored and start instructions. if !contains(rootmap, start_unanchored) { add_unchecked(*rootmap, start_unanchored, rootmap.count); } if !contains(rootmap, start) { add_unchecked(*rootmap, start, rootmap.count); } reachable.count = 0; stk.count = 0; array_add(stk, start_unanchored); while stk.count { id := pop(stk); if contains(reachable.*, id) continue; add_unchecked(reachable, id); i := inst[id]; if get_opcode(i) == { case .kInstAltMatch; #through; case .kInstAlt; // Mark this instruction as a predecessor of each out. preds := get_or_add_preds(*preds_list, *predmap, get_out(i)); array_add(preds, id); preds = get_or_add_preds(*preds_list, *predmap, i.out1); array_add(preds, id); array_add(stk, i.out1); array_add(stk, get_out(i)); case .kInstByteRange; #through; case .kInstCapture; #through; case .kInstEmptyWidth; // Mark the out of this instruction as a "root". if !contains(rootmap, get_out(i)) { add_unchecked(*rootmap, get_out(i), rootmap.count); } array_add(stk, get_out(i)); case .kInstNop; array_add(stk, get_out(i)); case .kInstMatch; #through; case .kInstFail; case; assert(false, "Unhandled opcode: %", get_opcode(i)); } } return rootmap, predmap, preds_list; } mark_dominator :: (using prog: *Prog, root: int, rootmap: *Sparse_Array(int), predmap: Sparse_Array(int), preds_list: [..] [..] int, reachable: *Sparse_Set, stk: *[..] int) { reachable.count = 0; stk.count = 0; array_add(stk, root); while stk.count { id := pop(stk); if contains(reachable.*, id) continue; add_unchecked(reachable, id); if id != root && contains(rootmap.*, id) { // We reached another "tree" via epsilon transition. continue; } i := inst[id]; if #complete get_opcode(i) == { case .kInstAltMatch; #through; case .kInstAlt; array_add(stk, i.out1); array_add(stk, get_out(i)); case .kInstByteRange; #through; case .kInstCapture; #through; case .kInstEmptyWidth; case .kInstNop; array_add(stk, get_out(i)); case .kInstMatch; #through; case .kInstFail; } } for id: reachable { if contains(predmap, id) { for pred: preds_list[get(predmap, id)] { if !contains(reachable.*, pred) { // id has a predecessor that cannot be reached from root! // Therefore, id must be a "root" too - mark it as such. if !contains(rootmap.*, id) { add_unchecked(rootmap, id, rootmap.count); } } } } } } emit_list :: (using prog: *Prog, root: int, rootmap: *Sparse_Array(int), flat: *[..] Inst, reachable: *Sparse_Set, stk: *[..] int) { reachable.count = 0; stk.count = 0; array_add(stk, root); while stk.count { id := pop(stk); if contains(reachable.*, id) continue; add_unchecked(reachable, id); if id != root && contains(rootmap.*, id) { // We reached another "tree" via epsilon transition. Emit a kInstNop // instruction so that the Prog does not become quadratically larger. i: Inst; set_opcode(*i, .kInstNop); set_out(*i, get(rootmap.*, id)); array_add(flat, i); continue; } i := inst[id]; if get_opcode(i) == { case .kInstAltMatch; newi: Inst; set_opcode(*newi, .kInstAltMatch); set_out(*newi, flat.count + 1); newi.out1 = cast(u32)flat.count + 2; array_add(flat, newi); array_add(stk, i.out1); array_add(stk, get_out(i)); case .kInstAlt; array_add(stk, i.out1); array_add(stk, get_out(i)); case .kInstByteRange; #through; case .kInstCapture; #through; case .kInstEmptyWidth; newi := i; set_out(*newi, get(rootmap.*, get_out(i))); array_add(flat, newi); case .kInstNop; array_add(stk, get_out(i)); case .kInstMatch; #through; case .kInstFail; array_add(flat, i); case; assert(false, "Unhandled opcode: %", get_opcode(i)); } } } // For each ByteRange instruction in [begin, end), computes a hint to execution // engines: the delta to the next instruction (in flat) worth exploring iff the // current instruction matched. // // Implements a coloring algorithm related to ByteMapBuilder, but in this case, // colors are instructions and recoloring ranges precisely identifies conflicts // between instructions. Iterating backwards over [begin, end) is guaranteed to // identify the nearest conflict (if any) with only linear complexity. compute_hints :: (using prog: *Prog, flat: *[..] Inst, begin: int, end: int) { recolor :: (splits: *Bit_Array, colors: *[256] int, lo_in: int, hi: int, first: *int, id: int) { // Like ByteMapBuilder, we split at lo-1 and at hi. lo := lo_in - 1; if lo >= 0 && !(splits.*)[lo] { set_bit(splits, lo); next := find_next_set_bit(splits.*, lo + 1); (colors.*)[lo] = (colors.*)[next]; } if !(splits.*)[hi] { set_bit(splits, hi); next := find_next_set_bit(splits.*, hi + 1); (colors.*)[hi] = (colors.*)[next]; } c := lo + 1; while c < 256 { next := find_next_set_bit(splits.*, c); // Ratchet backwards... (first.*) = min(first.*, (colors.*)[next]); // Recolor with id - because it's the new nearest conflict! (colors.*)[next] = id; if next == hi break; c = next+1; } } splits: Bit_Array; init_bit_array(*splits, 256); colors: [256] int; dirty := false; for < id: begin..end { if (id == end || get_opcode((flat.*)[id]) != .kInstByteRange) { if (dirty) { dirty = false; clear_all_bits(*splits); } set_bit(*splits, 255); colors[255] = id; // At this point, the [0-255] range is colored with id. // Thus, hints cannot point beyond id; and if id == end, // hints that would have pointed to id will be 0 instead. continue; } dirty = true; // We recolor the [lo-hi] range with id. Note that first ratchets backwards // from end to the nearest conflict (if any) during recoloring. first := end; ip := *(flat.*)[id]; lo: s32 = ip.lo; hi: s32 = ip.hi; recolor(*splits, *colors, lo, hi, *first, id); if get_foldcase(ip.*) && lo <= #char "z" && hi >= #char "a" { foldlo := lo; foldhi := hi; if (foldlo < #char "a") { foldlo = #char "a"; } if (foldhi > #char "z") { foldhi = #char "z"; } if (foldlo <= foldhi) { foldlo += #char "A" - #char "a"; foldhi += #char "A" - #char "a"; recolor(*splits, *colors, foldlo, foldhi, *first, id); } } if first != end { hint := cast(u16) min(first - id, 32767); ip.hint_foldcase |= hint << 1; } } } // Accelerates to the first likely occurrence of the prefix. // Returns position of the first byte or -1 if not found. prefix_accel :: (using prog: *Prog, text: string, pos: int) -> int { assert(prefix_size >= 1); if prefix_size == 1 { result, found := index_of_char_2(text, cast(u8)prefix_front, pos); if !found return -1; return result; } else { return prefix_accel_front_and_back(prog, text, pos); } } prefix_accel_front_and_back :: (using prog: *Prog, text: string, pos: int) -> int { assert(prefix_size >= 2); if text.count - pos < prefix_size { return -1; } // Don't bother searching the last prefix_size_-1 bytes for prefix_front_. // This also means that probing for prefix_back_ doesn't go out of bounds. size := text.count - prefix_size + 1; // @ToDo Port AVX2 fast path once jai has intrinsics/assembler for i: pos..size-1 { if text[i] == prefix_front && text[i + prefix_size - 1] == prefix_back { return i; } } return -1; } append :: (builder: *String_Builder, inst: Inst) { if get_opcode(inst) == { case; print_to_builder(builder, "opcode %", get_opcode(inst)); case .kInstAlt; print_to_builder(builder, "alt -> % | %", get_out(inst), inst.out1); case .kInstAltMatch; print_to_builder(builder, "altmatch -> % | %", get_out(inst), inst.out1); case .kInstByteRange; append(builder, "byte"); if get_foldcase(inst) { append(builder, "/i"); } print_to_builder(builder, " [%-%] % -> %", format_byte(inst.lo), format_byte(inst.hi), get_hint(inst), get_out(inst)); case .kInstCapture; print_to_builder(builder, "capture % -> %", inst.cap, get_out(inst)); case .kInstEmptyWidth; print_to_builder(builder, "emptywidth % -> %", inst.empty, get_out(inst)); case .kInstMatch; print_to_builder(builder, "match! %", inst.match_id); case .kInstNop; print_to_builder(builder, "nop -> %", get_out(inst)); case .kInstFail; append(builder, "fail"); } } format_byte :: #bake_arguments formatInt(base = 16, minimum_digits = 2); to_string :: (using prog: Prog) -> string { if did_flatten { return to_string_flattened(prog); } else { return to_string_unflattened(prog); } } to_string_flattened :: (using prog: Prog) -> string { builder: String_Builder; defer free_buffers(*builder); for inst { if get_last(it) { print_to_builder(*builder, "%. ", it_index); } else { print_to_builder(*builder, "%1+ ", it_index); } append(*builder, it); append(*builder, #char "\n"); } return builder_to_string(*builder); } to_string_unflattened :: (using prog: Prog) -> string { builder: String_Builder; defer free_buffers(*builder); q: Sparse_Set; init(*q, prog.inst.count); add_if_nonzero(*q, start); for q { i := inst[it]; print_to_builder(*builder, "%. ", it); append(*builder, i); append(*builder, #char "\n"); add_if_nonzero(*q, get_out(i)); if get_opcode(i) == .kInstAlt || get_opcode(i) == .kInstAltMatch { add_if_nonzero(*q, i.out1); } } return builder_to_string(*builder); } #scope_file #import "Bit_Array"; #import "IntroSort"; #import "Bit_Operations"; // @ToDo: Use assembler replacement for memchr instead, once jai supports it index_of_char_2 :: (str: string, char: u8, start := 0) -> int, found:bool { for start..str.count-1 { if str[it] == char return it, true; } return -1, false; }