Logo

index : blog

---

  • summary
  • about
  • tree
  • log
  • branches
<< path: root/public/blog.git/html/src/search/search.jai blob: 7c0448c188ceeb28381b93940a216eab59e04f93 [raw] [clear marker]

        
0
1
2Search_Response :: enum {
3 ERROR :: 0;
4 OK;
5 NO_RESULT;
6 VALIDATION_EMPTY;
7 VALIDATION_TOO_LONG;
8}
9
10
11search_run :: (
12 ht: *Table(string, []int), query: string
13)
14 -> (ok: Search_Response, idx: []int = .[])
15{
16 ok, validated := validate_query(query);
17 if ok != .OK return ok;
18
19 found := find_in_documents(ht, validated);
20
21 if found return .OK, found;
22 else return .NO_RESULT;
23}
24
25search_dump_search_index :: () {
26 qtrace_init();
27 init_my_pool();
28 search_index := New(Table(string, []int)); // @OS GC
29 table_ensure_space(search_index, HT_ENSURE_SPACE);
30 defer this_allocation_is_not_a_leak(search_index.entries.data);
31
32 documents_md := load_data_or_exit(FP_DUMP_ENTRIES_MD, []Entry);
33
34 build_index(documents_md, search_index);
35 assert(search_index.count > 0, "HT is empty");
36
37 ok := marshal_and_save_to_disk(FP_DUMP_SEARCH_INDEX, search_index);
38 assert(ok);
39}
40
41
42#scope_file
43
44
45#import "Hash_Table";
46#import "Sort";
47#import "Flat_Pool";
48
49
50is_init: bool;
51pool: Flat_Pool;
52pool_alloc: Allocator;
53
54
55init_my_pool :: () {
56 if is_init return;
57 is_init = true;
58
59 init(*pool);
60 pool_alloc.proc = flat_pool_allocator_proc;
61 pool_alloc.data = *pool;
62}
63
64build_index :: ($$entries: *[]Entry, ht: *Table(string, []int)) {
65 data := entries.*;
66
67 buf: [..]int;
68 array_reserve(*buf, 512);
69 defer this_allocation_is_not_a_leak(buf.data);
70
71 for doc, i: data {
72 post_with_title := join(doc.title, doc.post, " ",, pool_alloc);
73 sanitized := replace_special_chars_with_space(post_with_title,, pool_alloc);
74 lower_case := to_lower_copy(sanitized,, pool_alloc);
75 words := split_by_whitespace(lower_case,, pool_alloc);
76 defer reset(*pool);
77
78 for word: words {
79 trimmed := trim(word,, pool_alloc);
80 if !table_contains(ht, trimmed) {
81 new := NewArray(1, int);
82 new[0] = i;
83 new_word := copy_string(trimmed);
84 table_add(ht, new_word, new);
85 this_allocation_is_not_a_leak(new.data);
86 } else {
87 found, value := table_find(ht, trimmed);
88 assert(found);
89
90 if array_find(value, i) then continue;
91
92 for value array_add(*buf, it);
93 array_add(*buf, i);
94
95 new_value := array_copy(buf);
96 this_allocation_is_not_a_leak(new_value.data);
97 array_reset_keeping_memory(*buf);
98
99 table_set(ht, trimmed, new_value);
100 }
101 }
102 }
103}
104
105validate_query :: (query: string) -> (status: Search_Response, []string) {
106
107 is_only_whitespace :: (s: string) -> bool {
108 for s if !is_space(it) return false;
109 return true;
110 }
111
112 terms: []string;
113
114 if query.count > MAX_QUERY_LENGTH return .VALIDATION_TOO_LONG, terms;
115 if !query || is_only_whitespace(query) return .VALIDATION_EMPTY, terms;
116
117 terms = split(query, " ");
118 return .OK, terms;
119}
120
121// @Memory: Free by caller
122find_in_documents :: (ht: *Table(string, []int), query: []string) -> []int {
123
124 Item :: struct {
125 value: int;
126 count: int;
127 }
128
129 compare_count :: (a: Item, b: Item) -> s64 {
130 return inline compare_floats(xx b.count, xx a.count);
131 }
132
133
134 buf: [..]int;
135
136 for term: query {
137 found, indexes := table_find(ht, to_lower_copy(term));
138 if found then { for indexes array_add(*buf, it); }
139 }
140
141 if !buf return .[];
142
143 max := -1;
144 for buf { if it > max then max = it; }
145 assert(max > -1, "Something went shit");
146
147 freq := NewArray(max+1, int);
148 for i: 0..buf.count-1 { freq[buf[i]] += 1; }
149
150
151 entries: [..]Item;
152
153 for i: 0..max {
154 if freq[i] > 0 {
155 array_add(*entries, .{value = i, count = freq[i]});
156 }
157 }
158
159 quick_sort(entries, compare_count);
160
161 new := NewArray(entries.count, int, initialized=false);
162 for i: 0..entries.count-1 new[i] = entries[i].value;
163
164 return new;
165}
166
167/** Dear reader, if you're a professional: look away.
168 If you're someone that currently discovers programming, this is
169 **NOT** how you replace_special_chars_with_space data!
170
171 I recommend reading the code from 'https://github.com/mozilla/bleach'
172*/
173replace_special_chars_with_space :: (s: string) -> string {
174
175 has_special_char :: (char: u8) -> bool {
176 SPECIAL_CHARACTERS :: "#,;.:_!\"\\()[]{}`*?=/&^°|<>’";
177
178 for SPECIAL_CHARACTERS {
179 if it == char return true;
180 }
181 return false;
182 }
183
184
185 buf: String_Builder;
186 init_string_builder(*buf);
187
188 for char: s {
189 if has_special_char(char)
190 then append(*buf, " ");
191 else append(*buf, char);
192 }
193
194 return builder_to_string(*buf);
195}
196
197split_by_whitespace :: (s: string) -> []string {
198 results: [..]string;
199 buf: [..]u8;
200 array_reset_keeping_memory(*buf);
201
202 trimmed := trim(s);
203
204 for trimmed {
205 if it == {
206 case " "; #through;
207 case "\r"; #through;
208 case "\n"; #through;
209 case "\t";
210 word := join(xx buf);
211 if !word continue;
212 array_reset_keeping_memory(*buf);
213 array_add(*results, word);
214 continue;
215 case; array_add(*buf, it);
216 }
217 }
218
219 word := join(xx buf);
220 if word then array_add(*results, word);
221
222 return results;
223}
224
225
Copyright 2026  E766CB298A6D1E64 | Git-Thing heavily inspired by cgit