diff --git a/sorts/external-sort.py b/sorts/external-sort.py new file mode 100644 index 000000000..eca26012d --- /dev/null +++ b/sorts/external-sort.py @@ -0,0 +1,161 @@ +#!/usr/bin/env python + +# +# Sort large text files in a minimum amount of memory +# +import os +import sys +import argparse + +class FileSplitter(object): + BLOCK_FILENAME_FORMAT = 'block_{0}.dat' + + def __init__(self, filename): + self.filename = filename + self.block_filenames = [] + + def write_block(self, data, block_number): + filename = self.BLOCK_FILENAME_FORMAT.format(block_number) + file = open(filename, 'w') + file.write(data) + file.close() + self.block_filenames.append(filename) + + def get_block_filenames(self): + return self.block_filenames + + def split(self, block_size, sort_key=None): + file = open(self.filename, 'r') + i = 0 + + while True: + lines = file.readlines(block_size) + + if lines == []: + break + + if sort_key is None: + lines.sort() + else: + lines.sort(key=sort_key) + + self.write_block(''.join(lines), i) + i += 1 + + def cleanup(self): + map(lambda f: os.remove(f), self.block_filenames) + + +class NWayMerge(object): + def select(self, choices): + min_index = -1 + min_str = None + + for i in range(len(choices)): + if min_str is None or choices[i] < min_str: + min_index = i + + return min_index + + +class FilesArray(object): + def __init__(self, files): + self.files = files + self.empty = set() + self.num_buffers = len(files) + self.buffers = {i: None for i in range(self.num_buffers)} + + def get_dict(self): + return {i: self.buffers[i] for i in range(self.num_buffers) if i not in self.empty} + + def refresh(self): + for i in range(self.num_buffers): + if self.buffers[i] is None and i not in self.empty: + self.buffers[i] = self.files[i].readline() + + if self.buffers[i] == '': + self.empty.add(i) + + if len(self.empty) == self.num_buffers: + return False + + return True + + def unshift(self, index): + value = self.buffers[index] + self.buffers[index] = None + + return value + + +class FileMerger(object): + def __init__(self, merge_strategy): + self.merge_strategy = merge_strategy + + def merge(self, filenames, outfilename, buffer_size): + outfile = open(outfilename, 'w', buffer_size) + buffers = FilesArray(self.get_file_handles(filenames, buffer_size)) + + while buffers.refresh(): + min_index = self.merge_strategy.select(buffers.get_dict()) + outfile.write(buffers.unshift(min_index)) + + def get_file_handles(self, filenames, buffer_size): + files = {} + + for i in range(len(filenames)): + files[i] = open(filenames[i], 'r', buffer_size) + + return files + + + +class ExternalSort(object): + def __init__(self, block_size): + self.block_size = block_size + + def sort(self, filename, sort_key=None): + num_blocks = self.get_number_blocks(filename, self.block_size) + splitter = FileSplitter(filename) + splitter.split(self.block_size, sort_key) + + merger = FileMerger(NWayMerge()) + buffer_size = self.block_size / (num_blocks + 1) + merger.merge(splitter.get_block_filenames(), filename + '.out', buffer_size) + + splitter.cleanup() + + def get_number_blocks(self, filename, block_size): + return (os.stat(filename).st_size / block_size) + 1 + + +def parse_memory(string): + if string[-1].lower() == 'k': + return int(string[:-1]) * 1024 + elif string[-1].lower() == 'm': + return int(string[:-1]) * 1024 * 1024 + elif string[-1].lower() == 'g': + return int(string[:-1]) * 1024 * 1024 * 1024 + else: + return int(string) + + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument('-m', + '--mem', + help='amount of memory to use for sorting', + default='100M') + parser.add_argument('filename', + metavar='', + nargs=1, + help='name of file to sort') + args = parser.parse_args() + + sorter = ExternalSort(parse_memory(args.mem)) + sorter.sort(args.filename[0]) + + +if __name__ == '__main__': +main() \ No newline at end of file