diff --git a/compression/huffman.py b/compression/huffman.py new file mode 100644 index 000000000000..b6238b66e9fd --- /dev/null +++ b/compression/huffman.py @@ -0,0 +1,87 @@ +import sys + +class Letter: + def __init__(self, letter, freq): + self.letter = letter + self.freq = freq + self.bitstring = "" + + def __repr__(self): + return f'{self.letter}:{self.freq}' + + +class TreeNode: + def __init__(self, freq, left, right): + self.freq = freq + self.left = left + self.right = right + + +def parse_file(file_path): + """ + Read the file and build a dict of all letters and their + frequences, then convert the dict into a list of Letters. + """ + chars = {} + with open(file_path) as f: + while True: + c = f.read(1) + if not c: + break + chars[c] = chars[c] + 1 if c in chars.keys() else 1 + letters = [] + for char, freq in chars.items(): + letter = Letter(char, freq) + letters.append(letter) + letters.sort(key=lambda l: l.freq) + return letters + +def build_tree(letters): + """ + Run through the list of Letters and build the min heap + for the Huffman Tree. + """ + while len(letters) > 1: + left = letters.pop(0) + right = letters.pop(0) + total_freq = left.freq + right.freq + node = TreeNode(total_freq, left, right) + letters.append(node) + letters.sort(key=lambda l: l.freq) + return letters[0] + +def traverse_tree(root, bitstring): + """ + Recursively traverse the Huffman Tree to set each + Letter's bitstring, and return the list of Letters + """ + if type(root) is Letter: + root.bitstring = bitstring + return [root] + letters = [] + letters += traverse_tree(root.left, bitstring + "0") + letters += traverse_tree(root.right, bitstring + "1") + return letters + +def huffman(file_path): + """ + Parse the file, build the tree, then run through the file + again, using the list of Letters to find and print out the + bitstring for each letter. + """ + letters_list = parse_file(file_path) + root = build_tree(letters_list) + letters = traverse_tree(root, "") + print(f'Huffman Coding of {file_path}: ') + with open(file_path) as f: + while True: + c = f.read(1) + if not c: + break + le = list(filter(lambda l: l.letter == c, letters))[0] + print(le.bitstring, end=" ") + print() + +if __name__ == "__main__": + # pass the file path to the huffman function + huffman(sys.argv[1])