Logo

index : blog

---

  • summary
  • about
  • tree
  • log
  • branches
<< path: root/public/blog.git/html/modules/uniform/simplify.jai blob: 9d22f9bd6e084a5d6a0b1fadc75cc27be888a48c [raw] [clear marker]

        
0#scope_module
1
2compute_simple :: (using re: *Regexp_Node) -> bool {
3 if #complete op == {
4 case .NoMatch; #through;
5 case .EmptyMatch; #through;
6 case .Literal; #through;
7 case .LiteralString; #through;
8 case .BeginLine; #through;
9 case .EndLine; #through;
10 case .BeginText; #through;
11 case .WordBoundary; #through;
12 case .NoWordBoundary; #through;
13 case .EndText; #through;
14 case .AnyChar; #through;
15 case .AnyByte; #through;
16 case .HaveMatch;
17 return true;
18 case .Concat; #through;
19 case .Alternate;
20 // These are simple as long as the subpieces are simple.
21 for subs {
22 if !it.simple return false;
23 }
24 return true;
25 case .CharClass;
26 // Simple as long as the char class is not empty, not full.
27 return char_class.nrunes != 0 && char_class.nrunes != Runemax + 1;
28 case .Capture;
29 return subs[0].simple;
30 case .Star; #through;
31 case .Plus; #through;
32 case .Quest;
33 if !subs[0].simple return false;
34 if subs[0].op == {
35 case .Star; #through;
36 case .Plus; #through;
37 case .Quest; #through;
38 case .EmptyMatch; #through;
39 case .NoMatch;
40 return false;
41 case;
42 return true;
43 }
44 case .Repeat;
45 return false;
46 case .kVerticalBar; #through;
47 case .kLeftParen;
48 assert(false, "Found parser tokens in compute_simple: %", op);
49 return false;
50 }
51}
52
53// Simplifies a regular expression, returning a new regexp.
54// The new regexp uses traditional Unix egrep features only,
55// plus the Perl (?:) non-capturing parentheses.
56// Otherwise, no POSIX or Perl additions. The new regexp
57// captures exactly the same subexpressions (with the same indices)
58// as the original.
59// Does not edit given Regexp_Node.
60simplify :: (re: *Regexp_Node, pool: *Regexp_Pool) -> *Regexp_Node {
61 cre := coalesce(re, pool);
62 if !cre return null;
63
64 sre := simplify_coalesced(cre, pool);
65 if !sre return null;
66
67 return sre;
68}
69
70// Coalesces runs of star/plus/quest/repeat of the same literal along with any
71// occurrences of that literal into repeats of that literal. It also works for
72// char classes, any char and any byte.
73coalesce :: (using re: *Regexp_Node, pool: *Regexp_Pool) -> *Regexp_Node {
74 coalesce_post :: (pool: *Regexp_Pool, re: *Regexp_Node, parent_arg: *Regexp_Node, pre_arg: *Regexp_Node, child_args: [..] *Regexp_Node) -> *Regexp_Node {
75 if re.subs.count == 0 return re;
76 if re.op != .Concat {
77 if !child_args_changed(re, child_args) return re;
78
79 // Something changed. Build a new op.
80 nre := new_regexp(pool, re.op, re.parse_flags);
81 // @ToDo: Should we copy here?
82 nre.subs = child_args;
83 // Repeats and Captures have additional data that must be copied.
84 if (re.op == .Repeat) {
85 nre.min = re.min;
86 nre.max = re.max;
87 } else if re.op == .Capture {
88 nre.cap = re.cap;
89 }
90 return nre;
91 }
92
93 can_coalesce_any := false;
94 for i: 0..child_args.count - 2 {
95 if can_coalesce(child_args[i], child_args[i+1]) {
96 can_coalesce_any = true;
97 break;
98 }
99 }
100
101 if !can_coalesce_any {
102 if !child_args_changed(re, child_args) return re;
103
104 // Something changed. Build a new op.
105 nre := new_regexp(pool, re.op, re.parse_flags);
106 // @ToDo: Should we copy here?
107 nre.subs = child_args;
108 return nre;
109 }
110
111 // @Speed: Do this in one pass with the loop above?
112 for i: 0..child_args.count - 2 {
113 if can_coalesce(child_args[i], child_args[i+1]) {
114 do_coalesce(pool, *child_args[i], *child_args[i+1]);
115 }
116 }
117
118 // Determine how many empty matches were left by DoCoalesce.
119 n := 0;
120 for child_args {
121 if (it.op == .EmptyMatch) {
122 n += 1;
123 }
124 }
125
126 // Build a new op.
127 nre := new_regexp(pool, re.op, re.parse_flags);
128 nre.subs = NewArray(re.subs.count - n, *Regexp_Node, false);
129 j := 0;
130 for child_args {
131 if (it.op != .EmptyMatch) {
132 nre.subs[j] = it;
133 j += 1;
134 }
135 }
136 return nre;
137 }
138
139 w := Walk(*Regexp_Pool, *Regexp_Node).{post_visit = coalesce_post};
140 result, stopped := walk(pool, re, null, w);
141 if stopped return null;
142 return result;
143}
144
145child_args_changed :: (re: *Regexp_Node, child_args: [] *Regexp_Node) -> bool {
146 for sub, i: re.subs {
147 if sub != child_args[i] return true;
148 }
149 return false;
150}
151
152can_coalesce :: (r1: *Regexp_Node, r2: *Regexp_Node) -> bool {
153 // r1 must be a star/plus/quest/repeat of a literal, char class, any char or
154 // any byte.
155 if ((r1.op == .Star ||
156 r1.op == .Plus ||
157 r1.op == .Quest ||
158 r1.op == .Repeat) &&
159 (r1.subs[0].op == .Literal ||
160 r1.subs[0].op == .CharClass ||
161 r1.subs[0].op == .AnyChar ||
162 r1.subs[0].op == .AnyByte)) {
163 // r2 must be a star/plus/quest/repeat of the same literal, char class,
164 // any char or any byte.
165 if ((r2.op == .Star ||
166 r2.op == .Plus ||
167 r2.op == .Quest ||
168 r2.op == .Repeat) &&
169 regexp_equal(r1.subs[0], r2.subs[0]) &&
170 // The parse flags must be consistent.
171 ((r1.parse_flags & .NonGreedy) == (r2.parse_flags & .NonGreedy))) {
172 return true;
173 }
174 // ... OR an occurrence of that literal, char class, any char or any byte
175 if (regexp_equal(r1.subs[0], r2)) {
176 return true;
177 }
178 // ... OR a literal string that begins with that literal.
179 if (r1.subs[0].op == .Literal && r2.op == .LiteralString && r2.runes[0] == r1.subs[0].rune &&
180 // The parse flags must be consistent.
181 ((r1.subs[0].parse_flags & .FoldCase) == (r2.parse_flags & .FoldCase))) {
182 return true;
183 }
184 }
185 return false;
186}
187
188do_coalesce :: (pool: *Regexp_Pool, r1ptr: **Regexp_Node, r2ptr: **Regexp_Node) {
189 leave_empty :: (r1ptr: **Regexp_Node, r2ptr: **Regexp_Node, pool: *Regexp_Pool, nre: *Regexp_Node) {
190 // @ToDo, @Speed: Why don’t we set this to null instead of allocing a temporary new regexp?
191 r1ptr.* = new_regexp(pool, .EmptyMatch, .NoParseFlags);
192 r2ptr.* = nre;
193 }
194
195 r1 := r1ptr.*;
196 r2 := r2ptr.*;
197
198 nre := new_repeat(pool, r1.subs[0], r1.parse_flags, 0, 0);
199
200 if r1.op == {
201 case .Star;
202 nre.min = 0;
203 nre.max = -1;
204 case .Plus;
205 nre.min = 1;
206 nre.max = -1;
207 case .Quest;
208 nre.min = 0;
209 nre.max = 1;
210 case .Repeat;
211 nre.min = r1.min;
212 nre.max = r1.max;
213 case;
214 assert(false, "Unexpected r1.op: %", r1.op);
215 }
216
217
218 if r2.op == {
219 case .Star;
220 nre.max = -1;
221 leave_empty(r1ptr, r2ptr, pool, nre);
222
223 case .Plus;
224 nre.min += 1;
225 nre.max = -1;
226 leave_empty(r1ptr, r2ptr, pool, nre);
227
228 case .Quest;
229 if nre.max != -1 {
230 nre.max += 1;
231 }
232 leave_empty(r1ptr, r2ptr, pool, nre);
233
234 case .Repeat;
235 nre.min += r2.min;
236 if (r2.max == -1) {
237 nre.max = -1;
238 } else if (nre.max != -1) {
239 nre.max += r2.max;
240 }
241 leave_empty(r1ptr, r2ptr, pool, nre);
242
243 case .Literal; #through;
244 case .CharClass; #through;
245 case .AnyChar; #through;
246 case .AnyByte;
247 nre.min += 1;
248 if (nre.max != -1) {
249 nre.max += 1;
250 }
251 leave_empty(r1ptr, r2ptr, pool, nre);
252
253 case .LiteralString;
254 r := r1.subs[0].rune;
255 // Determine how much of the literal string is removed.
256 // We know that we have at least one rune. :)
257 n: s16 = 1;
258 while (n < r2.runes.count && r2.runes[n] == r) {
259 n += 1;
260 }
261 nre.min += n;
262 if (nre.max != -1) {
263 nre.max += n;
264 }
265 if n == r2.runes.count {
266 leave_empty(r1ptr, r2ptr, pool, nre);
267 } else {
268 r1ptr.* = nre;
269 r2ptr.* = new_literal_string(pool, slice(r2.runes, n, r2.runes.count - n), r2.parse_flags);
270 }
271
272 case;
273 assert(false, "Unexpected r2.op: %", r2.op);
274 }
275}
276
277simplify_coalesced :: (using re: *Regexp_Node, pool: *Regexp_Pool) -> *Regexp_Node {
278 simplify_pre :: (pool: *Regexp_Pool, re: *Regexp_Node, parent_arg: *Regexp_Node) -> *Regexp_Node, stop: bool {
279 if (re.simple) {
280 return re, true;
281 }
282
283 return null, false;
284 }
285 simplify_post :: (pool: *Regexp_Pool, re: *Regexp_Node, parent_arg: *Regexp_Node, pre_arg: *Regexp_Node, child_args: [..] *Regexp_Node) -> *Regexp_Node {
286 if re.op == {
287 case .NoMatch; #through;
288 case .EmptyMatch; #through;
289 case .Literal; #through;
290 case .LiteralString; #through;
291 case .BeginLine; #through;
292 case .EndLine; #through;
293 case .BeginText; #through;
294 case .WordBoundary; #through;
295 case .NoWordBoundary; #through;
296 case .EndText; #through;
297 case .AnyChar; #through;
298 case .AnyByte; #through;
299 case .HaveMatch;
300 // All these are always simple.
301 re.simple = true;
302 return re;
303
304 case .Concat; #through;
305 case .Alternate;
306 // These are simple as long as the subpieces are simple.
307 if !child_args_changed(re, child_args) {
308 re.simple = true;
309 return re;
310 }
311 nre := new_regexp(pool, re.op, re.parse_flags);
312 // @ToDo: Should we copy here?
313 nre.subs = child_args;
314 nre.simple = true;
315 return nre;
316
317
318 case .Capture;
319 newsub := child_args[0];
320 if newsub == re.subs[0] {
321 re.simple = true;
322 return re;
323 }
324 nre := new_regexp(pool, .Capture, re.parse_flags);
325 nre.subs = NewArray(1, *Regexp_Node, false);
326 nre.subs[0] = newsub;
327 nre.cap = re.cap;
328 nre.simple = true;
329 return nre;
330
331 case .Star; #through;
332 case .Plus; #through;
333 case .Quest;
334 newsub := child_args[0];
335 // Special case: repeat the empty string as much as
336 // you want, but it's still the empty string.
337 if newsub.op == .EmptyMatch return newsub;
338
339 // These are simple as long as the subpiece is simple.
340 if newsub == re.subs[0] {
341 re.simple = true;
342 return re;
343 }
344
345 // These are also idempotent if flags are constant.
346 if re.op == newsub.op && re.parse_flags == newsub.parse_flags return newsub;
347
348 nre := new_regexp(pool, re.op, re.parse_flags);
349 nre.subs = NewArray(1, *Regexp_Node, false);
350 nre.subs[0] = newsub;
351 nre.simple = true;
352 return nre;
353
354 case .Repeat;
355 newsub := child_args[0];
356 // Special case: repeat the empty string as much as
357 // you want, but it's still the empty string.
358 if newsub.op == .EmptyMatch return newsub;
359
360 nre := simplify_repeat(pool, newsub, re.min, re.max, re.parse_flags);
361 nre.simple = true;
362 return nre;
363
364 case .CharClass;
365 nre := simplify_char_class(pool, re);
366 nre.simple = true;
367 return nre;
368 case;
369 assert(false, "Simpilfy case not handled: %", re.op);
370 return null;
371 }
372 }
373
374 w := Walk(*Regexp_Pool, *Regexp_Node).{pre_visit = simplify_pre, post_visit = simplify_post};
375 result, stopped := walk(pool, re, null, w);
376 if stopped return null;
377 return result;
378}
379
380// Creates a concatenation of two Regexp_Node, consuming refs to re1 and re2.
381// Returns a new Regexp_Node, handing the ref to the caller.
382concat2 :: (pool: *Regexp_Pool, re1: *Regexp_Node, re2: *Regexp_Node, parse_flags: ParseFlags) -> *Regexp_Node {
383 re := new_regexp(pool, .Concat, parse_flags);
384 re.subs = NewArray(2, *Regexp_Node, false);
385 re.subs[0] = re1;
386 re.subs[1] = re2;
387 return re;
388}
389
390// Simplifies the expression re{min,max} in terms of *, +, and ?.
391// Returns a new regexp. Does not edit re. Does not consume reference to re.
392// Caller must Decref return value when done with it.
393// The result will *not* necessarily have the right capturing parens
394// if you call ToString() and re-parse it: (x){2} becomes (x)(x),
395// but in the Regexp_Node* representation, both (x) are marked as $1.
396simplify_repeat :: (pool: *Regexp_Pool, re: *Regexp_Node, min: int, max: int, f: ParseFlags) -> *Regexp_Node {
397 // x{n,} means at least n matches of x.
398 if (max == -1) {
399 // Special case: x{0,} is x*
400 if (min == 0) return new_star(pool, re, f);
401
402 // Special case: x{1,} is x+
403 if (min == 1) return new_plus(pool, re, f);
404
405 // General case: x{4,} is xxxx+
406 nre_subs := NewArray(min, *Regexp_Node, false);
407 for i: 0..min-2 {
408 nre_subs[i] = re;
409 }
410 nre_subs[min-1] = new_plus(pool, re, f);
411 return new_concat(pool, nre_subs, f);
412 }
413
414 // Special case: (x){0} matches only empty string.
415 if (min == 0 && max == 0) return new_regexp(pool, .EmptyMatch, f);
416
417 // Special case: x{1} is just x.
418 if (min == 1 && max == 1) return re;
419
420 // General case: x{n,m} means n copies of x and m copies of x?.
421 // The machine will do less work if we nest the final m copies,
422 // so that x{2,5} = xx(x(x(x)?)?)?
423
424 // Build leading prefix: xx. Capturing only on the last one.
425 nre: *Regexp_Node;
426 if (min > 0) {
427 nre_subs := NewArray(min, *Regexp_Node, false);
428 for i: 0..min-1 {
429 nre_subs[i] = re;
430 }
431 nre = new_concat(pool, nre_subs, f);
432 }
433
434 // Build and attach suffix: (x(x(x)?)?)?
435 if (max > min) {
436 suf := new_quest(pool, re, f);
437 for i: min+1..max-1 {
438 suf = new_quest(pool, concat2(pool, re, suf, f), f);
439 }
440 if (nre == null) {
441 nre = suf;
442 } else {
443 nre = concat2(pool, nre, suf, f);
444 }
445 }
446
447 // Some degenerate case, like min > max, or min < max < 0.
448 // This shouldn't happen, because the parser rejects such regexps.
449 assert(nre != null, "Malformed repeat: % %", min, max);
450
451 return nre;
452}
453
454// Simplifies a character class.
455simplify_char_class :: (pool: *Regexp_Pool, re: *Regexp_Node) -> *Regexp_Node {
456 // Special cases
457 if (re.char_class.nrunes == 0) return new_regexp(pool, .NoMatch, re.parse_flags);
458 if (re.char_class.nrunes == Runemax + 1) return new_regexp(pool, .AnyChar, re.parse_flags);
459
460 return re;
461}
462
463#scope_file
464
465// @ToDo: Move to a common utility module
466slice :: inline (array: [] $T, index: int, count: int) -> [] T {
467 assert(index >= 0, "index = %", index); // @@ Should we also clamp in these cases?
468 assert(count >= 0, "count = %", count);
469 c: [] T = ---;
470 if index >= array.count {
471 c.data = null;
472 c.count = 0;
473 return c;
474 }
475
476 if index + count > array.count {
477 count = array.count - index;
478 }
479
480 c.data = array.data + index;
481 c.count = count;
482 return c;
483}
484
485
Copyright 2026  E766CB298A6D1E64 | Git-Thing heavily inspired by cgit