Search_Response :: enum { ERROR :: 0; OK; NO_RESULT; VALIDATION_EMPTY; VALIDATION_TOO_LONG; } search_run :: ( ht: *Table(string, []int), query: string ) -> (ok: Search_Response, idx: []int = .[]) { ok, validated := validate_query(query); if ok != .OK return ok; found := find_in_documents(ht, validated); if found return .OK, found; else return .NO_RESULT; } search_dump_search_index :: () { qtrace_init(); init_my_pool(); search_index := New(Table(string, []int)); // @OS GC table_ensure_space(search_index, HT_ENSURE_SPACE); defer this_allocation_is_not_a_leak(search_index.entries.data); documents_md := load_data_or_exit(FP_DUMP_ENTRIES_MD, []Entry); build_index(documents_md, search_index); assert(search_index.count > 0, "HT is empty"); ok := marshal_and_save_to_disk(FP_DUMP_SEARCH_INDEX, search_index); assert(ok); } #scope_file #import "Hash_Table"; #import "Sort"; #import "Flat_Pool"; is_init: bool; pool: Flat_Pool; pool_alloc: Allocator; init_my_pool :: () { if is_init return; is_init = true; init(*pool); pool_alloc.proc = flat_pool_allocator_proc; pool_alloc.data = *pool; } build_index :: ($$entries: *[]Entry, ht: *Table(string, []int)) { data := entries.*; buf: [..]int; array_reserve(*buf, 512); defer this_allocation_is_not_a_leak(buf.data); for doc, i: data { post_with_title := join(doc.title, doc.post, " ",, pool_alloc); sanitized := replace_special_chars_with_space(post_with_title,, pool_alloc); lower_case := to_lower_copy(sanitized,, pool_alloc); words := split_by_whitespace(lower_case,, pool_alloc); defer reset(*pool); for word: words { trimmed := trim(word,, pool_alloc); if !table_contains(ht, trimmed) { new := NewArray(1, int); new[0] = i; new_word := copy_string(trimmed); table_add(ht, new_word, new); this_allocation_is_not_a_leak(new.data); } else { found, value := table_find(ht, trimmed); assert(found); if array_find(value, i) then continue; for value array_add(*buf, it); array_add(*buf, i); new_value := array_copy(buf); this_allocation_is_not_a_leak(new_value.data); array_reset_keeping_memory(*buf); table_set(ht, trimmed, new_value); } } } } validate_query :: (query: string) -> (status: Search_Response, []string) { is_only_whitespace :: (s: string) -> bool { for s if !is_space(it) return false; return true; } terms: []string; if query.count > MAX_QUERY_LENGTH return .VALIDATION_TOO_LONG, terms; if !query || is_only_whitespace(query) return .VALIDATION_EMPTY, terms; terms = split(query, " "); return .OK, terms; } // @Memory: Free by caller find_in_documents :: (ht: *Table(string, []int), query: []string) -> []int { Item :: struct { value: int; count: int; } compare_count :: (a: Item, b: Item) -> s64 { return inline compare_floats(xx b.count, xx a.count); } buf: [..]int; for term: query { found, indexes := table_find(ht, to_lower_copy(term)); if found then { for indexes array_add(*buf, it); } } if !buf return .[]; max := -1; for buf { if it > max then max = it; } assert(max > -1, "Something went shit"); freq := NewArray(max+1, int); for i: 0..buf.count-1 { freq[buf[i]] += 1; } entries: [..]Item; for i: 0..max { if freq[i] > 0 { array_add(*entries, .{value = i, count = freq[i]}); } } quick_sort(entries, compare_count); new := NewArray(entries.count, int, initialized=false); for i: 0..entries.count-1 new[i] = entries[i].value; return new; } /** Dear reader, if you're a professional: look away. If you're someone that currently discovers programming, this is **NOT** how you replace_special_chars_with_space data! I recommend reading the code from 'https://github.com/mozilla/bleach' */ replace_special_chars_with_space :: (s: string) -> string { has_special_char :: (char: u8) -> bool { SPECIAL_CHARACTERS :: "#,;.:_!\"\\()[]{}`*?=/&^°|<>’"; for SPECIAL_CHARACTERS { if it == char return true; } return false; } buf: String_Builder; init_string_builder(*buf); for char: s { if has_special_char(char) then append(*buf, " "); else append(*buf, char); } return builder_to_string(*buf); } split_by_whitespace :: (s: string) -> []string { results: [..]string; buf: [..]u8; array_reset_keeping_memory(*buf); trimmed := trim(s); for trimmed { if it == { case " "; #through; case "\r"; #through; case "\n"; #through; case "\t"; word := join(xx buf); if !word continue; array_reset_keeping_memory(*buf); array_add(*results, word); continue; case; array_add(*buf, it); } } word := join(xx buf); if word then array_add(*results, word); return results; }