<<
path:
root/public/blog.git/html/modules/uniform/simplify.jai
blob: 9d22f9bd6e084a5d6a0b1fadc75cc27be888a48c
[raw]
[clear marker]
2compute_simple :: (using re: *Regexp_Node) -> bool {
4 case .NoMatch; #through;
5 case .EmptyMatch; #through;
6 case .Literal; #through;
7 case .LiteralString; #through;
8 case .BeginLine; #through;
9 case .EndLine; #through;
10 case .BeginText; #through;
11 case .WordBoundary; #through;
12 case .NoWordBoundary; #through;
13 case .EndText; #through;
14 case .AnyChar; #through;
15 case .AnyByte; #through;
18 case .Concat; #through;
20 // These are simple as long as the subpieces are simple.
22 if !it.simple return false;
26 // Simple as long as the char class is not empty, not full.
27 return char_class.nrunes != 0 && char_class.nrunes != Runemax + 1;
29 return subs[0].simple;
33 if !subs[0].simple return false;
37 case .Quest; #through;
38 case .EmptyMatch; #through;
46 case .kVerticalBar; #through;
48 assert(false, "Found parser tokens in compute_simple: %", op);
53// Simplifies a regular expression, returning a new regexp.
54// The new regexp uses traditional Unix egrep features only,
55// plus the Perl (?:) non-capturing parentheses.
56// Otherwise, no POSIX or Perl additions. The new regexp
57// captures exactly the same subexpressions (with the same indices)
59// Does not edit given Regexp_Node.
60simplify :: (re: *Regexp_Node, pool: *Regexp_Pool) -> *Regexp_Node {
61 cre := coalesce(re, pool);
64 sre := simplify_coalesced(cre, pool);
70// Coalesces runs of star/plus/quest/repeat of the same literal along with any
71// occurrences of that literal into repeats of that literal. It also works for
72// char classes, any char and any byte.
73coalesce :: (using re: *Regexp_Node, pool: *Regexp_Pool) -> *Regexp_Node {
74 coalesce_post :: (pool: *Regexp_Pool, re: *Regexp_Node, parent_arg: *Regexp_Node, pre_arg: *Regexp_Node, child_args: [..] *Regexp_Node) -> *Regexp_Node {
75 if re.subs.count == 0 return re;
77 if !child_args_changed(re, child_args) return re;
79 // Something changed. Build a new op.
80 nre := new_regexp(pool, re.op, re.parse_flags);
81 // @ToDo: Should we copy here?
82 nre.subs = child_args;
83 // Repeats and Captures have additional data that must be copied.
84 if (re.op == .Repeat) {
87 } else if re.op == .Capture {
93 can_coalesce_any := false;
94 for i: 0..child_args.count - 2 {
95 if can_coalesce(child_args[i], child_args[i+1]) {
96 can_coalesce_any = true;
101 if !can_coalesce_any {
102 if !child_args_changed(re, child_args) return re;
104 // Something changed. Build a new op.
105 nre := new_regexp(pool, re.op, re.parse_flags);
106 // @ToDo: Should we copy here?
107 nre.subs = child_args;
111 // @Speed: Do this in one pass with the loop above?
112 for i: 0..child_args.count - 2 {
113 if can_coalesce(child_args[i], child_args[i+1]) {
114 do_coalesce(pool, *child_args[i], *child_args[i+1]);
118 // Determine how many empty matches were left by DoCoalesce.
121 if (it.op == .EmptyMatch) {
127 nre := new_regexp(pool, re.op, re.parse_flags);
128 nre.subs = NewArray(re.subs.count - n, *Regexp_Node, false);
131 if (it.op != .EmptyMatch) {
139 w := Walk(*Regexp_Pool, *Regexp_Node).{post_visit = coalesce_post};
140 result, stopped := walk(pool, re, null, w);
141 if stopped return null;
145child_args_changed :: (re: *Regexp_Node, child_args: [] *Regexp_Node) -> bool {
146 for sub, i: re.subs {
147 if sub != child_args[i] return true;
152can_coalesce :: (r1: *Regexp_Node, r2: *Regexp_Node) -> bool {
153 // r1 must be a star/plus/quest/repeat of a literal, char class, any char or
155 if ((r1.op == .Star ||
159 (r1.subs[0].op == .Literal ||
160 r1.subs[0].op == .CharClass ||
161 r1.subs[0].op == .AnyChar ||
162 r1.subs[0].op == .AnyByte)) {
163 // r2 must be a star/plus/quest/repeat of the same literal, char class,
164 // any char or any byte.
165 if ((r2.op == .Star ||
169 regexp_equal(r1.subs[0], r2.subs[0]) &&
170 // The parse flags must be consistent.
171 ((r1.parse_flags & .NonGreedy) == (r2.parse_flags & .NonGreedy))) {
174 // ... OR an occurrence of that literal, char class, any char or any byte
175 if (regexp_equal(r1.subs[0], r2)) {
178 // ... OR a literal string that begins with that literal.
179 if (r1.subs[0].op == .Literal && r2.op == .LiteralString && r2.runes[0] == r1.subs[0].rune &&
180 // The parse flags must be consistent.
181 ((r1.subs[0].parse_flags & .FoldCase) == (r2.parse_flags & .FoldCase))) {
188do_coalesce :: (pool: *Regexp_Pool, r1ptr: **Regexp_Node, r2ptr: **Regexp_Node) {
189 leave_empty :: (r1ptr: **Regexp_Node, r2ptr: **Regexp_Node, pool: *Regexp_Pool, nre: *Regexp_Node) {
190 // @ToDo, @Speed: Why don’t we set this to null instead of allocing a temporary new regexp?
191 r1ptr.* = new_regexp(pool, .EmptyMatch, .NoParseFlags);
198 nre := new_repeat(pool, r1.subs[0], r1.parse_flags, 0, 0);
214 assert(false, "Unexpected r1.op: %", r1.op);
221 leave_empty(r1ptr, r2ptr, pool, nre);
226 leave_empty(r1ptr, r2ptr, pool, nre);
232 leave_empty(r1ptr, r2ptr, pool, nre);
238 } else if (nre.max != -1) {
241 leave_empty(r1ptr, r2ptr, pool, nre);
243 case .Literal; #through;
244 case .CharClass; #through;
245 case .AnyChar; #through;
251 leave_empty(r1ptr, r2ptr, pool, nre);
254 r := r1.subs[0].rune;
255 // Determine how much of the literal string is removed.
256 // We know that we have at least one rune. :)
258 while (n < r2.runes.count && r2.runes[n] == r) {
265 if n == r2.runes.count {
266 leave_empty(r1ptr, r2ptr, pool, nre);
269 r2ptr.* = new_literal_string(pool, slice(r2.runes, n, r2.runes.count - n), r2.parse_flags);
273 assert(false, "Unexpected r2.op: %", r2.op);
277simplify_coalesced :: (using re: *Regexp_Node, pool: *Regexp_Pool) -> *Regexp_Node {
278 simplify_pre :: (pool: *Regexp_Pool, re: *Regexp_Node, parent_arg: *Regexp_Node) -> *Regexp_Node, stop: bool {
285 simplify_post :: (pool: *Regexp_Pool, re: *Regexp_Node, parent_arg: *Regexp_Node, pre_arg: *Regexp_Node, child_args: [..] *Regexp_Node) -> *Regexp_Node {
287 case .NoMatch; #through;
288 case .EmptyMatch; #through;
289 case .Literal; #through;
290 case .LiteralString; #through;
291 case .BeginLine; #through;
292 case .EndLine; #through;
293 case .BeginText; #through;
294 case .WordBoundary; #through;
295 case .NoWordBoundary; #through;
296 case .EndText; #through;
297 case .AnyChar; #through;
298 case .AnyByte; #through;
300 // All these are always simple.
304 case .Concat; #through;
306 // These are simple as long as the subpieces are simple.
307 if !child_args_changed(re, child_args) {
311 nre := new_regexp(pool, re.op, re.parse_flags);
312 // @ToDo: Should we copy here?
313 nre.subs = child_args;
319 newsub := child_args[0];
320 if newsub == re.subs[0] {
324 nre := new_regexp(pool, .Capture, re.parse_flags);
325 nre.subs = NewArray(1, *Regexp_Node, false);
326 nre.subs[0] = newsub;
331 case .Star; #through;
332 case .Plus; #through;
334 newsub := child_args[0];
335 // Special case: repeat the empty string as much as
336 // you want, but it's still the empty string.
337 if newsub.op == .EmptyMatch return newsub;
339 // These are simple as long as the subpiece is simple.
340 if newsub == re.subs[0] {
345 // These are also idempotent if flags are constant.
346 if re.op == newsub.op && re.parse_flags == newsub.parse_flags return newsub;
348 nre := new_regexp(pool, re.op, re.parse_flags);
349 nre.subs = NewArray(1, *Regexp_Node, false);
350 nre.subs[0] = newsub;
355 newsub := child_args[0];
356 // Special case: repeat the empty string as much as
357 // you want, but it's still the empty string.
358 if newsub.op == .EmptyMatch return newsub;
360 nre := simplify_repeat(pool, newsub, re.min, re.max, re.parse_flags);
365 nre := simplify_char_class(pool, re);
369 assert(false, "Simpilfy case not handled: %", re.op);
374 w := Walk(*Regexp_Pool, *Regexp_Node).{pre_visit = simplify_pre, post_visit = simplify_post};
375 result, stopped := walk(pool, re, null, w);
376 if stopped return null;
380// Creates a concatenation of two Regexp_Node, consuming refs to re1 and re2.
381// Returns a new Regexp_Node, handing the ref to the caller.
382concat2 :: (pool: *Regexp_Pool, re1: *Regexp_Node, re2: *Regexp_Node, parse_flags: ParseFlags) -> *Regexp_Node {
383 re := new_regexp(pool, .Concat, parse_flags);
384 re.subs = NewArray(2, *Regexp_Node, false);
390// Simplifies the expression re{min,max} in terms of *, +, and ?.
391// Returns a new regexp. Does not edit re. Does not consume reference to re.
392// Caller must Decref return value when done with it.
393// The result will *not* necessarily have the right capturing parens
394// if you call ToString() and re-parse it: (x){2} becomes (x)(x),
395// but in the Regexp_Node* representation, both (x) are marked as $1.
396simplify_repeat :: (pool: *Regexp_Pool, re: *Regexp_Node, min: int, max: int, f: ParseFlags) -> *Regexp_Node {
397 // x{n,} means at least n matches of x.
399 // Special case: x{0,} is x*
400 if (min == 0) return new_star(pool, re, f);
402 // Special case: x{1,} is x+
403 if (min == 1) return new_plus(pool, re, f);
405 // General case: x{4,} is xxxx+
406 nre_subs := NewArray(min, *Regexp_Node, false);
410 nre_subs[min-1] = new_plus(pool, re, f);
411 return new_concat(pool, nre_subs, f);
414 // Special case: (x){0} matches only empty string.
415 if (min == 0 && max == 0) return new_regexp(pool, .EmptyMatch, f);
417 // Special case: x{1} is just x.
418 if (min == 1 && max == 1) return re;
420 // General case: x{n,m} means n copies of x and m copies of x?.
421 // The machine will do less work if we nest the final m copies,
422 // so that x{2,5} = xx(x(x(x)?)?)?
424 // Build leading prefix: xx. Capturing only on the last one.
427 nre_subs := NewArray(min, *Regexp_Node, false);
431 nre = new_concat(pool, nre_subs, f);
434 // Build and attach suffix: (x(x(x)?)?)?
436 suf := new_quest(pool, re, f);
437 for i: min+1..max-1 {
438 suf = new_quest(pool, concat2(pool, re, suf, f), f);
443 nre = concat2(pool, nre, suf, f);
447 // Some degenerate case, like min > max, or min < max < 0.
448 // This shouldn't happen, because the parser rejects such regexps.
449 assert(nre != null, "Malformed repeat: % %", min, max);
454// Simplifies a character class.
455simplify_char_class :: (pool: *Regexp_Pool, re: *Regexp_Node) -> *Regexp_Node {
457 if (re.char_class.nrunes == 0) return new_regexp(pool, .NoMatch, re.parse_flags);
458 if (re.char_class.nrunes == Runemax + 1) return new_regexp(pool, .AnyChar, re.parse_flags);
465// @ToDo: Move to a common utility module
466slice :: inline (array: [] $T, index: int, count: int) -> [] T {
467 assert(index >= 0, "index = %", index); // @@ Should we also clamp in these cases?
468 assert(count >= 0, "count = %", count);
470 if index >= array.count {
476 if index + count > array.count {
477 count = array.count - index;
480 c.data = array.data + index;