# -*- coding: utf-8 -*-
"""Huffman coding algorithm classes, contains node and tree separate classes."""
from __future__ import absolute_import
from typing import Dict
[docs]class HuffmanNode:
"""A class to represent heap nodes of a huffman coding tree.
Attributes
----------
char: str
The character to be saved as a node value.
freq: int
The frequency of the char.
right_child: HuffmanNode
The right child of the heap node.
left_child: HuffmanNode
The left child of the heap node.
dir: str
The path of the current node in the Huffman tree, usually a string of
0s and 1s.
"""
def __init__(self: object, char: str, freq: int, left: object=None, right: object=None) -> None:
"""Class constructor
Parameters
----------
char : str
A character (Huffman coding characters).
freq : int
The frequency of the character.
left : object, optional
The left child of the heap node. The default is None.
right : object, optional
The right child of the heap node. The default is None.
Returns
-------
None
A class instance.
"""
self.char = char
self.freq = freq
self.__left_child = left
self.__right_child = right
self.dir = ''
@property
def right_child(self: object) -> object:
"""The getter of the right child.
Returns
-------
object
"""
return self.__right_child
@right_child.setter
def right_child(self: object, right_child: object) -> None:
"""The setter of the right child.
Parameters
----------
right_child : object
The new child.
Returns
-------
None
"""
self.__right_child = right_child
@property
def left_child(self: object) -> object:
"""The getter of the left child.
Returns
-------
object
"""
return self.__left_child
@left_child.setter
def left_child(self: object, left_child: object) -> None:
"""The setter of the left child.
Parameters
----------
right_child : object
The new child.
Returns
-------
None
"""
self.__left_child = left_child
def __str__(self: object) -> str:
"""Returns a string representation using print().
Returns
-------
str
A string to be displayed with print().
"""
return "Hello! I'm a Huffman coding HeapNode class instance."
def __repr__(self: object) -> str:
"""A coder friendly representation of the HuffmanNode object.
Returns
-------
str
A string.
"""
return "Node(%s, freq=%s, right=%s, left=%s)"%(self.char,
self.freq,
self.right_child,
self.left_child)
[docs]class HuffmanTree:
"""A class to represent a huffman tree.
Attributes
----------
sequence: str
The sequence to be encoded, i.e; the Burros-Wheeler transform of the
sequence.
frequency: Dict[str, int]
A dictionary to store frequency values for each character of the
sequence, the character as a key and its frequency as a value.
root: HuffmanNode
The root node of the Huffman tree.
codes: Dict[str, str]
The Huffman codes for a given sequence, The character as a key and
its tree path as a value(a string).
pad: int
The padding of the sequence, i.e; the number of zeroes that was added
to the end of sequence until it was divisible by 8 (coding in 8 bits).
"""
def __init__(self: object, sequence: str) -> None:
"""The class constructor.
Parameters
----------
sequence : str
The sequence to be coded.
Returns
-------
None
A class instance.
"""
self.sequence = sequence
self.frequency = HuffmanTree.freq_dict(self.sequence)
self.root = self.create_tree()
self.codes = {}
[docs] @staticmethod
def freq_dict(sequence: str) -> Dict[str, int]:
"""This method returns the frequence dictionary of a given sequence.
Parameters
----------
sequence : str
The sequence of interest.
Returns
-------
Dict[str, int]
A dictionary of character frequencies.
"""
f_dict = {}
for char in sequence:
if char in f_dict.keys():
f_dict[char] += 1
else:
f_dict[char] = 1
return f_dict
[docs] def create_tree(self: object) -> HuffmanNode:
"""The main algorithm for creating The Huffman tree.
Returns
-------
HuffmanNode
The root node of the tree.
"""
leafs = [] # creating a list for leafs of the tree
for char, freq in self.frequency.items():
leafs.append(HuffmanNode(char, freq))
while len(leafs) > 1: # while there is more than one element, do the algoritm
leafs = sorted(leafs, key=lambda x: x.freq) # sort the leafs by frequency
left = leafs.pop(0) # taking the first least frequency node as the left node
right = leafs.pop(0) # taking the second least frequency node as the right node
new_char = left.char + right.char # the new character
new_freq = left.freq + right.freq # the new frequency is the sum
left.dir = "0" # assign 0 as a direction for the left node
right.dir = "1" # assign 1 as a direction for the right node
new_node = HuffmanNode(new_char, new_freq, left, right) # create a new node and then append it
leafs.append(new_node)
return leafs[0]
[docs] def get_codings(self: object, node: HuffmanNode, val: str='') -> None:
"""A depth-first search recursive algorithm to find the paths of each
character in the tree.
Parameters
----------
node : HuffmanNode
The starting node, the root of the tree.
val : str, optional
The path of the current recursion in a tree. The default is ''.
Returns
-------
None
Fills the codes dictionary property of the Huffman Tree.
"""
curr_path = val + node.dir # the current path in the recursion
if node.left_child:
self.get_codings(node.left_child, curr_path)
if node.right_child:
self.get_codings(node.right_child, curr_path)
if not node.left_child and not node.right_child:
self.codes[node.char] = curr_path
[docs] def seq_to_binstr(self: object) -> str:
"""This method transforms the current sequence of the Huffman tree
to a binary sequence using the codes (paths of every character), it
also adds a padding to the end of the binary sequence (a variable
number of zeroes, when sequence is divisible by 8 then add 8 zeroes
by default). The binary sequence will be coded in 8-bits later.
It saves the padding in the codes property.
Returns
-------
str
The binary sequence.
"""
bin_str = ""
for char in self.sequence:
bin_str += self.codes[char]
pad = 8 - len(bin_str) % 8
if pad != 0:
for _ in range(0, pad, 1):
bin_str += '0'
self.codes['pad'] = str(pad) #Save the padding value to codes
return bin_str
[docs] @staticmethod
def binstr_to_unicode(bin_str: str) -> str:
"""This method codes a binary sequence in 8-bits in UTF-8.
Parameters
----------
bin_str : str
The binary sequence to be coded.
Returns
-------
str
The unicode string that corrsponds to the binary sequence.
"""
unicode = ""
for bit in range(0, len(bin_str), 8): # cut the sequence to 8-bits
eight_bits = bin_str[bit:bit+8]
code = int(eight_bits, 2)
unicode += chr(code)
return unicode
[docs] @staticmethod
def unicode_to_binstr(unicode: str) -> str:
"""Transforms a unicode sequence to a binary sequence.
Parameters
----------
unicode : str
The unicode sequence to be transformed.
Returns
-------
str
The corrsponding binary string.
"""
bin_str = ""
for uni in unicode:
code = ord(uni)
bin_str += '{:08b}'.format(code)
return bin_str
[docs] @staticmethod
def remove_padding(bin_str: str, pad: int) -> str:
"""This function removes the padding from a given binary sequence,
the padding is normally a variable number of zeroes added when coding
in 8-bits.
Parameters
----------
bin_str : str
The binary string to be stripped from a sequence of zeroes.
pad : int
The padding, the number of zeroes at the end of the binary
sequence.
Returns
-------
str
The no-padded binary string.
"""
return bin_str[:-pad] # take all the sequence without the padding
[docs] @staticmethod
def binstr_to_seq(bin_str: str, codes: Dict[str, str]) -> str:
"""Transforms a binary string to a sequence given a codes dictionary
of paths.
Parameters
----------
bin_str : str
The binary string to be transformed.
codes : Dict[str, str]
A dictionary of characters alongside their paths.
Returns
-------
str
The original sequence.
"""
original_seq = ""
reading_stream = ""
for num in bin_str:
reading_stream += num
for char, path in codes.items():
if path == reading_stream:
original_seq += char
reading_stream = ""
break
return original_seq