diff --git a/sorts/countingsort.py b/sorts/countingsort.py new file mode 100644 index 000000000..c7b502d8f --- /dev/null +++ b/sorts/countingsort.py @@ -0,0 +1,41 @@ +# Python program for counting sort + +# This is the main function that sort the given string arr[] in +# in the alphabetical order +def countSort(arr): + + # The output character array that will have sorted arr + output = [0 for i in range(256)] + + # Create a count array to store count of inidividul + # characters and initialize count array as 0 + count = [0 for i in range(256)] + + # For storing the resulting answer since the + # string is immutable + ans = ["" for _ in arr] + + # Store count of each character + for i in arr: + count[ord(i)] += 1 + + # Change count[i] so that count[i] now contains actual + # position of this character in output array + for i in range(256): + count[i] += count[i-1] + + # Build the output character array + for i in range(len(arr)): + output[count[ord(arr[i])]-1] = arr[i] + count[ord(arr[i])] -= 1 + + # Copy the output array to arr, so that arr now + # contains sorted characters + for i in range(len(arr)): + ans[i] = output[i] + return ans + +# Driver program to test above function +arr = "thisisthestring" +ans = countSort(arr) +print ("Sorted string array is %s" %("".join(ans))) diff --git a/sorts/external-sort.py b/sorts/external-sort.py new file mode 100644 index 000000000..eca26012d --- /dev/null +++ b/sorts/external-sort.py @@ -0,0 +1,161 @@ +#!/usr/bin/env python + +# +# Sort large text files in a minimum amount of memory +# +import os +import sys +import argparse + +class FileSplitter(object): + BLOCK_FILENAME_FORMAT = 'block_{0}.dat' + + def __init__(self, filename): + self.filename = filename + self.block_filenames = [] + + def write_block(self, data, block_number): + filename = self.BLOCK_FILENAME_FORMAT.format(block_number) + file = open(filename, 'w') + file.write(data) + file.close() + self.block_filenames.append(filename) + + def get_block_filenames(self): + return self.block_filenames + + def split(self, block_size, sort_key=None): + file = open(self.filename, 'r') + i = 0 + + while True: + lines = file.readlines(block_size) + + if lines == []: + break + + if sort_key is None: + lines.sort() + else: + lines.sort(key=sort_key) + + self.write_block(''.join(lines), i) + i += 1 + + def cleanup(self): + map(lambda f: os.remove(f), self.block_filenames) + + +class NWayMerge(object): + def select(self, choices): + min_index = -1 + min_str = None + + for i in range(len(choices)): + if min_str is None or choices[i] < min_str: + min_index = i + + return min_index + + +class FilesArray(object): + def __init__(self, files): + self.files = files + self.empty = set() + self.num_buffers = len(files) + self.buffers = {i: None for i in range(self.num_buffers)} + + def get_dict(self): + return {i: self.buffers[i] for i in range(self.num_buffers) if i not in self.empty} + + def refresh(self): + for i in range(self.num_buffers): + if self.buffers[i] is None and i not in self.empty: + self.buffers[i] = self.files[i].readline() + + if self.buffers[i] == '': + self.empty.add(i) + + if len(self.empty) == self.num_buffers: + return False + + return True + + def unshift(self, index): + value = self.buffers[index] + self.buffers[index] = None + + return value + + +class FileMerger(object): + def __init__(self, merge_strategy): + self.merge_strategy = merge_strategy + + def merge(self, filenames, outfilename, buffer_size): + outfile = open(outfilename, 'w', buffer_size) + buffers = FilesArray(self.get_file_handles(filenames, buffer_size)) + + while buffers.refresh(): + min_index = self.merge_strategy.select(buffers.get_dict()) + outfile.write(buffers.unshift(min_index)) + + def get_file_handles(self, filenames, buffer_size): + files = {} + + for i in range(len(filenames)): + files[i] = open(filenames[i], 'r', buffer_size) + + return files + + + +class ExternalSort(object): + def __init__(self, block_size): + self.block_size = block_size + + def sort(self, filename, sort_key=None): + num_blocks = self.get_number_blocks(filename, self.block_size) + splitter = FileSplitter(filename) + splitter.split(self.block_size, sort_key) + + merger = FileMerger(NWayMerge()) + buffer_size = self.block_size / (num_blocks + 1) + merger.merge(splitter.get_block_filenames(), filename + '.out', buffer_size) + + splitter.cleanup() + + def get_number_blocks(self, filename, block_size): + return (os.stat(filename).st_size / block_size) + 1 + + +def parse_memory(string): + if string[-1].lower() == 'k': + return int(string[:-1]) * 1024 + elif string[-1].lower() == 'm': + return int(string[:-1]) * 1024 * 1024 + elif string[-1].lower() == 'g': + return int(string[:-1]) * 1024 * 1024 * 1024 + else: + return int(string) + + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument('-m', + '--mem', + help='amount of memory to use for sorting', + default='100M') + parser.add_argument('filename', + metavar='', + nargs=1, + help='name of file to sort') + args = parser.parse_args() + + sorter = ExternalSort(parse_memory(args.mem)) + sorter.sort(args.filename[0]) + + +if __name__ == '__main__': +main() \ No newline at end of file