git ptrace.dev


        
            
            
        
        0#scope_module
1
2compute_simple :: (using re: *Regexp_Node) -> bool {
3	if #complete op == {
4		case .NoMatch;	#through;
5		case .EmptyMatch;	#through;
6		case .Literal;	#through;
7		case .LiteralString;	#through;
8		case .BeginLine;	#through;
9		case .EndLine;	#through;
10		case .BeginText;	#through;
11		case .WordBoundary;	#through;
12		case .NoWordBoundary;	#through;
13		case .EndText;	#through;
14		case .AnyChar;	#through;
15		case .AnyByte;	#through;
16		case .HaveMatch;
17			return true;
18		case .Concat;		#through;
19		case .Alternate;
20			// These are simple as long as the subpieces are simple.
21			for subs {
22				if !it.simple	return false;
23			}
24			return true;
25		case .CharClass;
26			// Simple as long as the char class is not empty, not full.
27			return char_class.nrunes != 0 && char_class.nrunes != Runemax + 1;
28		case .Capture;
29			return subs[0].simple;
30		case .Star;	#through;
31		case .Plus;	#through;
32		case .Quest;
33			if !subs[0].simple		return false;
34			if subs[0].op == {
35				case .Star;	#through;
36				case .Plus;	#through;
37				case .Quest;	#through;
38				case .EmptyMatch;	#through;
39				case .NoMatch;
40					return false;
41				case;
42					return true;
43			}
44		case .Repeat;
45			return false;
46		case .kVerticalBar; #through;
47		case .kLeftParen;
48			assert(false, "Found parser tokens in compute_simple: %", op);
49			return false;
50	}
51}
52
53// Simplifies a regular expression, returning a new regexp.
54// The new regexp uses traditional Unix egrep features only,
55// plus the Perl (?:) non-capturing parentheses.
56// Otherwise, no POSIX or Perl additions.  The new regexp
57// captures exactly the same subexpressions (with the same indices)
58// as the original.
59// Does not edit given Regexp_Node.
60simplify :: (re: *Regexp_Node, pool: *Regexp_Pool) -> *Regexp_Node {
61	cre := coalesce(re, pool);
62	if !cre		return null;
63
64	sre := simplify_coalesced(cre, pool);
65	if !sre		return null;
66
67	return sre;
68}
69
70// Coalesces runs of star/plus/quest/repeat of the same literal along with any
71// occurrences of that literal into repeats of that literal. It also works for
72// char classes, any char and any byte.
73coalesce :: (using re: *Regexp_Node, pool: *Regexp_Pool) -> *Regexp_Node {
74	coalesce_post :: (pool: *Regexp_Pool, re: *Regexp_Node, parent_arg: *Regexp_Node, pre_arg: *Regexp_Node, child_args: [..] *Regexp_Node) -> *Regexp_Node {
75		if re.subs.count == 0	return re;
76		if re.op != .Concat {
77			if !child_args_changed(re, child_args) return re;
78
79			// Something changed. Build a new op.
80			nre := new_regexp(pool, re.op, re.parse_flags);
81			// @ToDo: Should we copy here?
82			nre.subs = child_args;
83			// Repeats and Captures have additional data that must be copied.
84			if (re.op == .Repeat) {
85				nre.min = re.min;
86				nre.max = re.max;
87			} else if re.op == .Capture {
88				nre.cap = re.cap;
89			}
90			return nre;
91		}
92
93		can_coalesce_any := false;
94		for i: 0..child_args.count - 2 {
95			if can_coalesce(child_args[i], child_args[i+1]) {
96				can_coalesce_any = true;
97				break;
98			}
99		}
100
101		if !can_coalesce_any {
102			if !child_args_changed(re, child_args)	return re;
103
104			// Something changed. Build a new op.
105			nre := new_regexp(pool, re.op, re.parse_flags);
106			// @ToDo: Should we copy here?
107			nre.subs = child_args;
108			return nre;
109		}
110
111		// @Speed: Do this in one pass with the loop above?
112		for i: 0..child_args.count - 2 {
113			if can_coalesce(child_args[i], child_args[i+1]) {
114				do_coalesce(pool, *child_args[i], *child_args[i+1]);
115			}
116		}
117
118		// Determine how many empty matches were left by DoCoalesce.
119		n := 0;
120		for child_args {
121			if (it.op == .EmptyMatch) {
122				n += 1;
123			}
124		}
125
126		// Build a new op.
127		nre := new_regexp(pool, re.op, re.parse_flags);
128		nre.subs = NewArray(re.subs.count - n, *Regexp_Node, false);
129		j := 0;
130		for child_args {
131			if (it.op != .EmptyMatch) {
132				nre.subs[j] = it;
133				j += 1;
134			}
135		}
136		return nre;
137	}
138
139	w := Walk(*Regexp_Pool, *Regexp_Node).{post_visit = coalesce_post};
140	result, stopped := walk(pool, re, null, w);
141	if stopped		return null;
142	return result;
143}
144
145child_args_changed :: (re: *Regexp_Node, child_args: [] *Regexp_Node) -> bool {
146	for sub, i: re.subs {
147		if sub != child_args[i]		return true;
148	}
149	return false;
150}
151
152can_coalesce :: (r1: *Regexp_Node, r2: *Regexp_Node) -> bool {
153	// r1 must be a star/plus/quest/repeat of a literal, char class, any char or
154	// any byte.
155	if ((r1.op == .Star ||
156			r1.op == .Plus ||
157			r1.op == .Quest ||
158			r1.op == .Repeat) &&
159			(r1.subs[0].op == .Literal ||
160			r1.subs[0].op == .CharClass ||
161			r1.subs[0].op == .AnyChar ||
162			r1.subs[0].op == .AnyByte)) {
163		// r2 must be a star/plus/quest/repeat of the same literal, char class,
164		// any char or any byte.
165		if ((r2.op == .Star ||
166				r2.op == .Plus ||
167				r2.op == .Quest ||
168				r2.op == .Repeat) &&
169				regexp_equal(r1.subs[0], r2.subs[0]) &&
170				// The parse flags must be consistent.
171				((r1.parse_flags & .NonGreedy) == (r2.parse_flags & .NonGreedy))) {
172			return true;
173		}
174		// ... OR an occurrence of that literal, char class, any char or any byte
175		if (regexp_equal(r1.subs[0], r2)) {
176			return true;
177		}
178		// ... OR a literal string that begins with that literal.
179		if (r1.subs[0].op == .Literal && r2.op == .LiteralString && r2.runes[0] == r1.subs[0].rune &&
180				// The parse flags must be consistent.
181				((r1.subs[0].parse_flags & .FoldCase) == (r2.parse_flags & .FoldCase))) {
182			return true;
183		}
184	}
185	return false;
186}
187
188do_coalesce :: (pool: *Regexp_Pool, r1ptr: **Regexp_Node, r2ptr: **Regexp_Node) {
189	leave_empty :: (r1ptr: **Regexp_Node, r2ptr: **Regexp_Node, pool: *Regexp_Pool, nre: *Regexp_Node) {
190		// @ToDo, @Speed: Why don’t we set this to null instead of allocing a temporary new regexp?
191		r1ptr.* = new_regexp(pool, .EmptyMatch, .NoParseFlags);
192		r2ptr.* = nre;
193	}
194
195	r1 := r1ptr.*;
196	r2 := r2ptr.*;
197
198	nre := new_repeat(pool, r1.subs[0], r1.parse_flags, 0, 0);
199
200	if r1.op == {
201		case .Star;
202			nre.min = 0;
203			nre.max = -1;
204		case .Plus;
205			nre.min = 1;
206			nre.max = -1;
207		case .Quest;
208			nre.min = 0;
209			nre.max = 1;
210		case .Repeat;
211			nre.min = r1.min;
212			nre.max = r1.max;
213		case;
214			assert(false, "Unexpected r1.op: %", r1.op);
215	}
216
217
218	if r2.op == {
219		case .Star;
220			nre.max = -1;
221			leave_empty(r1ptr, r2ptr, pool, nre);
222
223		case .Plus;
224			nre.min += 1;
225			nre.max = -1;
226			leave_empty(r1ptr, r2ptr, pool, nre);
227
228		case .Quest;
229			if nre.max != -1 {
230				nre.max += 1;
231			}
232			leave_empty(r1ptr, r2ptr, pool, nre);
233
234		case .Repeat;
235			nre.min += r2.min;
236			if (r2.max == -1) {
237				nre.max = -1;
238			} else if (nre.max != -1) {
239				nre.max += r2.max;
240			}
241			leave_empty(r1ptr, r2ptr, pool, nre);
242
243		case .Literal; #through;
244		case .CharClass; #through;
245		case .AnyChar; #through;
246		case .AnyByte;
247			nre.min += 1;
248			if (nre.max != -1) {
249				nre.max += 1;
250			}
251			leave_empty(r1ptr, r2ptr, pool, nre);
252
253		case .LiteralString;
254			r := r1.subs[0].rune;
255			// Determine how much of the literal string is removed.
256			// We know that we have at least one rune. :)
257			n: s16 = 1;
258			while (n < r2.runes.count && r2.runes[n] == r) {
259				n += 1;
260			}
261			nre.min += n;
262			if (nre.max != -1) {
263				nre.max += n;
264			}
265			if n == r2.runes.count {
266				leave_empty(r1ptr, r2ptr, pool, nre);
267			} else {
268				r1ptr.* = nre;
269				r2ptr.* = new_literal_string(pool, slice(r2.runes, n, r2.runes.count - n), r2.parse_flags);
270			}
271
272		case;
273			assert(false, "Unexpected r2.op: %", r2.op);
274	}
275}
276
277simplify_coalesced :: (using re: *Regexp_Node, pool: *Regexp_Pool) -> *Regexp_Node {
278	simplify_pre :: (pool: *Regexp_Pool, re: *Regexp_Node, parent_arg: *Regexp_Node) -> *Regexp_Node, stop: bool {
279		if (re.simple) {
280			return re, true;
281		}
282
283		return null, false;
284	}
285	simplify_post :: (pool: *Regexp_Pool, re: *Regexp_Node, parent_arg: *Regexp_Node, pre_arg: *Regexp_Node, child_args: [..] *Regexp_Node) -> *Regexp_Node {
286		if re.op == {
287			case .NoMatch; #through;
288			case .EmptyMatch; #through;
289			case .Literal; #through;
290			case .LiteralString; #through;
291			case .BeginLine; #through;
292			case .EndLine; #through;
293			case .BeginText; #through;
294			case .WordBoundary; #through;
295			case .NoWordBoundary; #through;
296			case .EndText; #through;
297			case .AnyChar; #through;
298			case .AnyByte; #through;
299			case .HaveMatch;
300				// All these are always simple.
301				re.simple = true;
302				return re;
303
304			case .Concat; #through;
305			case .Alternate;
306				// These are simple as long as the subpieces are simple.
307				if !child_args_changed(re, child_args) {
308					re.simple = true;
309					return re;
310				}
311				nre := new_regexp(pool, re.op, re.parse_flags);
312				// @ToDo: Should we copy here?
313				nre.subs = child_args;
314				nre.simple = true;
315				return nre;
316
317
318			case .Capture;
319				newsub := child_args[0];
320				if newsub == re.subs[0] {
321					re.simple = true;
322					return re;
323				}
324				nre := new_regexp(pool, .Capture, re.parse_flags);
325				nre.subs = NewArray(1, *Regexp_Node, false);
326				nre.subs[0] = newsub;
327				nre.cap = re.cap;
328				nre.simple = true;
329				return nre;
330
331			case .Star; #through;
332			case .Plus; #through;
333			case .Quest;
334				newsub := child_args[0];
335				// Special case: repeat the empty string as much as
336				// you want, but it's still the empty string.
337				if newsub.op == .EmptyMatch		return newsub;
338
339				// These are simple as long as the subpiece is simple.
340				if newsub == re.subs[0] {
341					re.simple = true;
342					return re;
343				}
344
345				// These are also idempotent if flags are constant.
346				if re.op == newsub.op && re.parse_flags == newsub.parse_flags	return newsub;
347
348				nre := new_regexp(pool, re.op, re.parse_flags);
349				nre.subs = NewArray(1, *Regexp_Node, false);
350				nre.subs[0] = newsub;
351				nre.simple = true;
352				return nre;
353
354			case .Repeat;
355				newsub := child_args[0];
356				// Special case: repeat the empty string as much as
357				// you want, but it's still the empty string.
358				if newsub.op == .EmptyMatch		return newsub;
359
360				nre := simplify_repeat(pool, newsub, re.min, re.max, re.parse_flags);
361				nre.simple = true;
362				return nre;
363
364			case .CharClass;
365				nre := simplify_char_class(pool, re);
366				nre.simple = true;
367				return nre;
368			case;
369				assert(false, "Simpilfy case not handled: %", re.op);
370				return null;
371		}
372	}
373
374	w := Walk(*Regexp_Pool, *Regexp_Node).{pre_visit = simplify_pre, post_visit = simplify_post};
375	result, stopped := walk(pool, re, null, w);
376	if stopped		return null;
377	return result;
378}
379
380// Creates a concatenation of two Regexp_Node, consuming refs to re1 and re2.
381// Returns a new Regexp_Node, handing the ref to the caller.
382concat2 :: (pool: *Regexp_Pool, re1: *Regexp_Node, re2: *Regexp_Node, parse_flags: ParseFlags) -> *Regexp_Node {
383	re := new_regexp(pool, .Concat, parse_flags);
384	re.subs = NewArray(2, *Regexp_Node, false);
385	re.subs[0] = re1;
386	re.subs[1] = re2;
387	return re;
388}
389
390// Simplifies the expression re{min,max} in terms of *, +, and ?.
391// Returns a new regexp.  Does not edit re.  Does not consume reference to re.
392// Caller must Decref return value when done with it.
393// The result will *not* necessarily have the right capturing parens
394// if you call ToString() and re-parse it: (x){2} becomes (x)(x),
395// but in the Regexp_Node* representation, both (x) are marked as $1.
396simplify_repeat :: (pool: *Regexp_Pool, re: *Regexp_Node, min: int, max: int, f: ParseFlags) -> *Regexp_Node {
397	// x{n,} means at least n matches of x.
398	if (max == -1) {
399		// Special case: x{0,} is x*
400		if (min == 0) return new_star(pool, re, f);
401
402		// Special case: x{1,} is x+
403		if (min == 1) return new_plus(pool, re, f);
404
405		// General case: x{4,} is xxxx+
406		nre_subs := NewArray(min, *Regexp_Node, false);
407		for i: 0..min-2 {
408			nre_subs[i] = re;
409		}
410		nre_subs[min-1] = new_plus(pool, re, f);
411		return new_concat(pool, nre_subs, f);
412	}
413
414	// Special case: (x){0} matches only empty string.
415	if (min == 0 && max == 0) return new_regexp(pool, .EmptyMatch, f);
416
417	// Special case: x{1} is just x.
418	if (min == 1 && max == 1) return re;
419
420	// General case: x{n,m} means n copies of x and m copies of x?.
421	// The machine will do less work if we nest the final m copies,
422	// so that x{2,5} = xx(x(x(x)?)?)?
423
424	// Build leading prefix: xx.  Capturing only on the last one.
425	nre: *Regexp_Node;
426	if (min > 0) {
427		nre_subs := NewArray(min, *Regexp_Node, false);
428		for i: 0..min-1 {
429			nre_subs[i] = re;
430		}
431		nre = new_concat(pool, nre_subs, f);
432	}
433
434	// Build and attach suffix: (x(x(x)?)?)?
435	if (max > min) {
436		suf := new_quest(pool, re, f);
437		for i: min+1..max-1 {
438			suf = new_quest(pool, concat2(pool, re, suf, f), f);
439		}
440		if (nre == null) {
441			nre = suf;
442		} else {
443			nre = concat2(pool, nre, suf, f);
444		}
445	}
446
447	// Some degenerate case, like min > max, or min < max < 0.
448	// This shouldn't happen, because the parser rejects such regexps.
449	assert(nre != null, "Malformed repeat: % %", min, max);
450
451	return nre;
452}
453
454// Simplifies a character class.
455simplify_char_class :: (pool: *Regexp_Pool, re: *Regexp_Node) -> *Regexp_Node {
456	// Special cases
457	if (re.char_class.nrunes == 0)				return new_regexp(pool, .NoMatch, re.parse_flags);
458	if (re.char_class.nrunes == Runemax + 1)	return new_regexp(pool, .AnyChar, re.parse_flags);
459
460	return re;
461}
462
463#scope_file
464
465// @ToDo: Move to a common utility module
466slice :: inline (array: [] $T, index: int, count: int) -> [] T {
467    assert(index >= 0, "index = %", index);             // @@ Should we also clamp in these cases?
468    assert(count >= 0, "count = %", count);
469	c: [] T = ---;
470    if index >= array.count {
471		c.data = null;
472		c.count = 0;
473		return c;
474	}
475
476    if index + count > array.count {
477        count = array.count - index;
478    }
479
480    c.data = array.data + index;
481    c.count = count;
482    return c;
483}
484
485
index : blog