#!/bin/python
"""Generate numbered corpus files from one or more corpus config files.

Config file layout, as enforced by ``config_read``:

    line 1    ``CFG <output directory>``
    line 2    empty
    line 3+   one corpus entry per line

Every entry is written to ``<output directory>/NN``, where ``NN`` is the
1-based entry index zero-padded to at least two digits.
"""
from pathlib import Path
from sys import exit, argv


def write_to_disk(fn, data):
    """Write *data* to the file *fn* as UTF-8, replacing any existing content.

    If the file cannot be written, the error is printed and the process
    exits with status 1.
    """
    try:
        # Plain "w" is enough: the file is only written, never read back.
        with open(fn, "w", encoding="utf8") as f:
            f.write(data)
    except (OSError, UnicodeError) as e:
        print(e)
        exit(1)


def corpus_generate(corpus_data, path):
    """Store each item of *corpus_data* in its own numbered file under *path*.

    Files are named ``01``, ``02``, ... in iteration order; the name of each
    file is printed before it is written.
    """
    for number, string in enumerate(corpus_data, start=1):
        fp = path / f"{number:02d}"
        print(f"Stored: {fp}")
        write_to_disk(fp, string)


def config_read(path):
    """Validate the config file at *path* and generate the corpus it describes.

    Any validation or read failure prints a message and exits the process
    with status 1. On success the output directory named in the header is
    created (including parents) and populated via ``corpus_generate``.
    """
    # Attempt the read directly instead of checking existence first, so that
    # directories, permission problems and undecodable files are reported the
    # same way as every other error rather than as a traceback.
    try:
        with open(path, "r", encoding="utf8") as f:
            config = f.readlines()
    except FileNotFoundError:
        print("Cannot find config file in:", path)
        exit(1)
    except (OSError, UnicodeError) as e:
        print(e)
        exit(1)

    if not config:
        print("Could not read config")
        exit(1)

    # Header line + separator line + at least one corpus entry.
    if len(config) <= 2:
        print("Config is missing corpus data")
        exit(1)

    if not config[0].startswith("CFG"):
        print("Invalid protocol byte. Missing 'CFG'")
        exit(1)

    if config[1] != "\n":
        print("Invalid protocol. Missing newline on second line")
        exit(1)

    # The header must be exactly "CFG <dir>" separated by a single space.
    header = config[0].split(" ")
    if len(header) != 2:
        print("Invalid header")
        exit(1)

    out_dir = Path(header[1].strip())
    out_dir.mkdir(parents=True, exist_ok=True)

    # Entries keep their line endings, exactly as read from the config.
    corpus_generate(config[2:], out_dir)


def is_arg(args, *matches):
    """Return True if any of *matches* is contained in *args*."""
    return any(match in args for match in matches)


def print_usage():
    """Print the command-line usage string."""
    print("Usage: generate_corpus.py [PATH TO CORPUS CONFIG]")


def main(args=None):
    """Process every config path given on the command line.

    *args* is the full argument vector including the program name; it
    defaults to ``sys.argv``. Usage is printed when no path is given or when
    a help flag is present.
    """
    if args is None:
        args = argv

    params = args[1:]
    # Only the real arguments are inspected; the program name in args[0]
    # is skipped so it can never be mistaken for a help request.
    if not params or is_arg(params, "help", "-h", "-help", "--help"):
        print_usage()
        return

    for path in params:
        config_read(Path(path.strip()))


if __name__ == "__main__":
    main()