Logo

index : blog

---

  • summary
  • about
  • tree
  • log
  • branches
<< path: root/public/blog.git/html/modules/uniform/rune.jai blob: ddf426567f0123ba34b14cc8124a9ac5bdfacf48 [raw] [clear marker]

        
0/*
1 * This source is adapted from the utf-8 library by Rob Pike and Ken Thompson.
2 * Copyright (c) 2002 by Lucent Technologies.
3 * Permission to use, copy, modify, and distribute this software for any
4 * purpose without fee is hereby granted, provided that this entire notice
5 * is included in all copies of any software which is or includes a copy
6 * or modification of this software and in all copies of the supporting
7 * documentation for such software.
8 * THIS SOFTWARE IS BEING PROVIDED "AS IS", WITHOUT ANY EXPRESS OR IMPLIED
9 * WARRANTY. IN PARTICULAR, NEITHER THE AUTHORS NOR LUCENT TECHNOLOGIES MAKE ANY
10 * REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE MERCHANTABILITY
11 * OF THIS SOFTWARE OR ITS FITNESS FOR ANY PARTICULAR PURPOSE.
12 */
13
14#scope_module
15
// @ToDo: Why is this signed!?
// A rune holds one decoded code point.
Rune :: s32;

// Maximum number of bytes a single rune occupies once encoded.
UTFmax :: 4;

// Byte values below this cannot be part of a multi-byte UTF sequence.
Runesync :: 0x80;

// Below this value a rune and its UTF encoding are the same single byte.
Runeself :: 0x80;

// Replacement value reported when decoding fails.
Runeerror :: 0xFFFD;

// Largest rune value that is accepted.
Runemax : s32 : 0x10FFFF;
24
// @ToDo: Move to unicode module?
//
// Decodes the UTF-8 sequence found at the start of `str`.
//
// Returns the decoded rune and the number of bytes it occupied (1 to 4).
// Every failure returns (Runeerror, 0). The failure cases are:
//   - `str` is empty or ends in the middle of a sequence,
//   - a continuation byte does not have the form 10xxxxxx,
//   - the lead byte is a stray continuation byte or is 0xF8 and above,
//   - the sequence is overlong (the value would fit in a shorter form),
//   - the value is a UTF-16 surrogate half (U+D800..U+DFFF),
//   - the value is greater than Runemax.
//
// Because the length is 0 on failure, a caller that advances through a
// string by `len` has to handle the error case itself to make progress.
rune_from_string :: (str: string) -> Rune, len: int {
    // Surrogate halves are reserved for UTF-16 and must not appear in UTF-8.
    SurrogateMin :: 0xD800;
    SurrogateMax :: 0xDFFF;

    // Nothing to decode; also keeps str[0] below in bounds.
    if str.count < 1 return Runeerror, 0;

    /*
     * one character sequence
     *  00000-0007F => T1
     */
    c: s32 = str[0];
    if c < Tx {
        return c, 1;
    }

    /*
     * two character sequence
     *  0080-07FF => T2 Tx
     */
    if str.count < 2 return Runeerror, 0;

    // XOR with Tx clears the 10 tag of a well-formed continuation byte, so
    // any bit left under Testx means the byte was not a continuation byte.
    c1 := cast(s32)(str[1]) ^ Tx;
    if c1 & Testx return Runeerror, 0;

    if c < T3 {
        // A lead byte below T2 is itself a continuation byte.
        if c < T2 return Runeerror, 0;

        result := ((c << Bitx) | c1) & Rune2;
        // Overlong: the value would have fit in one byte.
        if result <= Rune1 return Runeerror, 0;

        return result, 2;
    }

    /*
     * three character sequence
     *  0800-FFFF => T3 Tx Tx
     */
    if str.count < 3 return Runeerror, 0;

    c2 := cast(s32)(str[2]) ^ Tx;
    if c2 & Testx return Runeerror, 0;

    if c < T4 {
        result := ((((c << Bitx) | c1) << Bitx) | c2) & Rune3;
        // Overlong: the value would have fit in two bytes.
        if result <= Rune2 return Runeerror, 0;
        // Surrogate halves are not valid code points in UTF-8.
        if result >= SurrogateMin && result <= SurrogateMax return Runeerror, 0;

        return result, 3;
    }

    /*
     * four character sequence (21-bit value)
     *  10000-10FFFF => T4 Tx Tx Tx
     */
    if str.count < 4 return Runeerror, 0;

    c3 := cast(s32)(str[3]) ^ Tx;
    if c3 & Testx return Runeerror, 0;

    if c < T5 {
        result := ((((((c << Bitx) | c1) << Bitx) | c2) << Bitx) | c3) & Rune4;
        // Overlong: the value would have fit in three bytes.
        if result <= Rune3 return Runeerror, 0;
        // The 21-bit payload can hold values past the last valid rune.
        if result > Runemax return Runeerror, 0;

        return result, 4;
    }

    // Lead bytes at T5 and above do not start any supported sequence.
    return Runeerror, 0;
}
89
// Encodes `rune` as UTF-8.
//
// Returns a fixed UTFmax-byte array together with the number of leading
// bytes that are in use (1 to 4); the remaining bytes are left at zero.
// A rune that is negative, greater than Runemax, or a UTF-16 surrogate half
// (U+D800..U+DFFF) is replaced by Runeerror, which encodes to three bytes.
bytes_from_rune :: (rune: Rune) -> [UTFmax] u8, len: int {
    // Surrogate halves are reserved for UTF-16 and must not be encoded.
    SurrogateMin :: 0xD800;
    SurrogateMax :: 0xDFFF;

    /*
     * Runes are signed, so convert to unsigned for range check.
     * The cast must be unchecked: a negative rune has to wrap to a large
     * unsigned value (and then fail the Runemax test below) instead of
     * tripping the runtime cast check.
     */
    c := cast,no_check(u32) rune;
    result: [UTFmax] u8;

    /*
     * one character sequence
     *  00000-0007F => 00-7F
     */
    if c <= Rune1 {
        result[0] = cast(u8) c;
        return result, 1;
    }

    /*
     * two character sequence
     *  0080-07FF => T2 Tx
     */
    if c <= Rune2 {
        result[0] = T2 | cast(u8)(c >> (1*Bitx));
        result[1] = Tx | cast(u8)(c & Maskx);
        return result, 2;
    }

    /*
     * If the Rune is out of range or a surrogate half, convert it to the
     * error rune. Do this test here because the error rune encodes to three
     * bytes. Doing it earlier would duplicate work, since neither an out of
     * range Rune nor a surrogate would have fit in one or two bytes.
     */
    if c > cast(u32)Runemax {
        c = Runeerror;
    }
    if c >= SurrogateMin && c <= SurrogateMax {
        c = Runeerror;
    }

    /*
     * three character sequence
     *  0800-FFFF => T3 Tx Tx
     */
    if c <= Rune3 {
        result[0] = T3 | cast(u8)(c >> (2*Bitx));
        result[1] = Tx | cast(u8)((c >> (1*Bitx)) & Maskx);
        result[2] = Tx | cast(u8)(c & Maskx);
        return result, 3;
    }

    /*
     * four character sequence (21-bit value)
     *  10000-10FFFF => T4 Tx Tx Tx
     */
    result[0] = T4 | cast(u8)(c >> (3*Bitx));
    result[1] = Tx | cast(u8)((c >> (2*Bitx)) & Maskx);
    result[2] = Tx | cast(u8)((c >> (1*Bitx)) & Maskx);
    result[3] = Tx | cast(u8)(c & Maskx);
    return result, 4;
}
145
// Encodes every rune in `runes` with bytes_from_rune, in order, and returns
// the concatenated bytes as the string produced by builder_to_string.
string_from_runes :: (runes: [] Rune) -> string {
    sb: String_Builder;
    defer free_buffers(*sb);

    for rune: runes {
        // Only the first `used` bytes of the fixed-size array are meaningful.
        encoded, used := bytes_from_rune(rune);
        append(*sb, encoded.data, used);
    }

    text := builder_to_string(*sb);
    return text;
}
155
156#scope_file
157
// Number of payload bits carried by each kind of byte:
// Bit1..Bit5 for the lead byte of a 1..5 byte form, Bitx for a continuation byte.
Bit1 :: 7;
Bitx :: 6;
Bit2 :: 5;
Bit3 :: 4;
Bit4 :: 3;
Bit5 :: 2;

// Tag bits that mark each kind of byte.
T1 : u8 : 0b0000_0000; // 0xxxxxxx  single byte
Tx : u8 : 0b1000_0000; // 10xxxxxx  continuation byte
T2 : u8 : 0b1100_0000; // 110xxxxx  lead byte, two byte form
T3 : u8 : 0b1110_0000; // 1110xxxx  lead byte, three byte form
T4 : u8 : 0b1111_0000; // 11110xxx  lead byte, four byte form
T5 : u8 : 0b1111_1000; // 111110xx  first lead byte value that is rejected

// Payload masks; each is also the largest value its form can hold.
Rune1 :: 0x7F;     //  7 bits, one byte
Rune2 :: 0x7FF;    // 11 bits, two bytes
Rune3 :: 0xFFFF;   // 16 bits, three bytes
Rune4 :: 0x1FFFFF; // 21 bits, four bytes

Maskx : u8 : 0b0011_1111; // payload bits of a continuation byte
Testx : u8 : 0b1100_0000; // tag bits of a continuation byte
179
180
Copyright 2026  E766CB298A6D1E64 | Git-Thing heavily inspired by cgit