Logo

index : blog

---

  • summary
  • about
  • tree
  • log
  • branches
<< path: root/public/blog.git/html/modules/uniform/char_class.jai blob: 01647b76c645116de30fbfd5949da31eca424399 [raw] [clear marker]

        
0#scope_module
1
2ALPHA_MASK :: (1<<26) - 1;
3
4CharClass :: struct {
5 upper: u32; // bitmap of A-Z
6 lower: u32; // bitmap of a-z
7 nrunes: int;
8 // @ToDo. @Speed: Do we need a faster data structure?
9 // TODO: Clean up the builders properly!!
10 ranges: [..] RuneRange;
11 folds_ascii: bool;
12}
13
14uninit :: (using cc: *CharClass) {
15 array_free(ranges);
16}
17
18copy :: (using cc: *CharClass) -> *CharClass {
19 new_cc := New(CharClass);
20 memcpy(new_cc, cc, size_of(CharClass));
21 new_ranges: [..] RuneRange;
22 array_copy(*new_ranges, cc.ranges);
23 new_cc.ranges = new_ranges;
24 return new_cc;
25}
26
27RuneRange :: struct {
28 lo: Rune;
29 hi: Rune;
30}
31
32less :: (a: RuneRange, b: RuneRange) -> bool {
33 return a.hi < b.lo;
34}
35
36find_range :: (ranges: [] RuneRange, r: RuneRange) -> RuneRange, index: int, found: bool {
37 start := 0;
38 end := ranges.count - 1;
39
40 while start <= end {
41 middle := (end + start) / 2;
42 range := ranges[middle];
43 if less(range, r) {
44 start = middle + 1;
45 } else if less(r, range) {
46 end = middle - 1;
47 } else {
48 return range, middle, true;
49 }
50 }
51
52 return .{}, start, false;
53}
54
55make_range :: (lo: Rune, hi: Rune) -> RuneRange {
56 r: RuneRange;
57 r.lo = lo;
58 r.hi = hi;
59 return r;
60}
61
62add_range :: (using ccb: *CharClass, lo: Rune, hi: Rune) -> bool {
63 if hi < lo return false;
64
65 if lo <= #char "z" && hi >= #char "A" {
66 // Overlaps some alpha, maybe not all.
67 // Update bitmaps telling which ASCII letters are in the set.
68 lo1 := max(lo, #char "A");
69 hi1 := min(hi, #char "Z");
70 if lo1 <= hi1 {
71 upper |= cast(u32)(((1 << (hi1 - lo1 + 1)) - 1) << (lo1 - #char "A"));
72 }
73
74 lo1 = max(lo, #char "a");
75 hi1 = min(hi, #char "z");
76 if lo1 <= hi1 {
77 lower |= cast(u32)(((1 << (hi1 - lo1 + 1)) - 1) << (lo1 - #char "a"));
78 }
79 }
80
81 { // Check whether lo, hi is already in the class.
82 range, index, found := find_range(ranges, make_range(lo, lo));
83 if found && range.lo <= lo && hi <= range.hi return false;
84 }
85
86 // Look for a range abutting lo on the left.
87 // If it exists, take it out and increase our range.
88 if lo > 0 {
89 range, index, found := find_range(ranges, make_range(lo - 1, lo - 1));
90 if found {
91 lo = range.lo;
92 if (range.hi > hi) {
93 hi = range.hi;
94 }
95 nrunes -= range.hi - range.lo + 1;
96 array_ordered_remove_by_index(*ranges, index);
97 }
98 }
99
100 // Look for a range abutting hi on the right.
101 // If it exists, take it out and increase our range.
102 if (hi < Runemax) {
103 range, index, found := find_range(ranges, make_range(hi + 1, hi + 1));
104 if found {
105 hi = range.hi;
106 nrunes -= range.hi - range.lo + 1;
107 array_ordered_remove_by_index(*ranges, index);
108 }
109 }
110
111 // Look for ranges between lo and hi. Take them out.
112 // This is only safe because the set has no overlapping ranges.
113 // We've already removed any ranges abutting lo and hi, so
114 // any that overlap [lo, hi] must be contained within it.
115 new_range := make_range(lo, hi);
116 index: int;
117 while true {
118 range: RuneRange;
119 found: bool;
120 range, index, found = find_range(ranges, new_range);
121 if !found {
122 break;
123 }
124 nrunes -= range.hi - range.lo + 1;
125 array_ordered_remove_by_index(*ranges, index);
126 }
127
128 // Finally, add [lo, hi].
129 nrunes += hi - lo + 1;
130 array_insert_at(*ranges, new_range, index);
131 return true;
132}
133
134add_char_class :: (using ccb: *CharClass, other: CharClass) {
135 for other.ranges {
136 add_range(ccb, it.lo, it.hi);
137 }
138}
139
140contains :: (using ccb: *CharClass, r: Rune) -> bool {
141 range, index, found := find_range(ccb.ranges, make_range(r, r));
142 return found;
143}
144
145// Does the character class behave the same on A-Z as on a-z?
146folds_ascii :: (using ccb: *CharClass) -> bool {
147 return ((upper ^ lower) & ALPHA_MASK) == 0;
148}
149
150// copy :: (using ccb: *CharClass) -> *CharClass {
151// CharClass* cc = new CharClass;
152// for (iterator it = begin(); it != end(); ++it)
153// cc->ranges_.insert(RuneRange(it->lo, it->hi));
154// cc->upper_ = upper_;
155// cc->lower_ = lower_;
156// cc->nrunes_ = nrunes_;
157// return cc;
158// }
159
160
161remove_above :: (using ccb: *CharClass, r: Rune) {
162 if r >= Runemax return;
163
164 if r < #char "z" {
165 if r < #char "a" {
166 lower = 0;
167 } else {
168 lower &= cast(u32)(ALPHA_MASK >> (#char "z" - r));
169 }
170 }
171
172 if r < #char "Z" {
173 if r < #char "A" {
174 upper = 0;
175 } else {
176 upper &= cast(u32)(ALPHA_MASK >> (#char "Z" - r));
177 }
178 }
179
180 while true {
181 range, index, found := find_range(ranges, make_range(r + 1, Runemax));
182 if !found break;
183 nrunes -= range.hi - range.lo + 1;
184 if (range.lo <= r) {
185 range.hi = r;
186 ranges[index] = range;
187 nrunes += range.hi - range.lo + 1;
188 } else {
189 array_ordered_remove_by_index(*ranges, index);
190 }
191 }
192}
193
194negate :: (using ccb: *CharClass) {
195 // In negation, first range begins at 0, unless
196 // the current class begins at 0.
197 dest := 0;
198 src := 0;
199 nextlo: Rune;
200 if ranges.count && ranges[0].lo == 0 {
201 nextlo = ranges[0].hi + 1;
202 src = 1;
203 }
204 while src < ranges.count {
205 r := ranges[src];
206 ranges[dest].lo = nextlo;
207 ranges[dest].hi = r.lo - 1;
208 nextlo = r.hi + 1;
209 src += 1;
210 dest += 1;
211 }
212 if nextlo <= Runemax {
213 if dest == ranges.count {
214 array_reserve(*ranges, ranges.count + 1);
215 ranges.count = dest + 1;
216 }
217 ranges[dest].lo = nextlo;
218 ranges[dest].hi = Runemax;
219 } else {
220 ranges.count = dest;
221 }
222
223 upper = ALPHA_MASK & ~upper;
224 lower = ALPHA_MASK & ~lower;
225 nrunes = Runemax+1 - nrunes;
226}
227
228finish_char_class :: (using ccb: *CharClass) {
229 // Does the character class behave the same on A-Z as on a-z?
230 folds_ascii = (((upper ^ lower) & ALPHA_MASK) == 0);
231}
232
233#import "Basic";
234
Copyright 2026  E766CB298A6D1E64 | Git-Thing heavily inspired by cgit