implementation of entropy algorithm. #2110

Merged · 6 commits · Jun 16, 2020
3 changes: 2 additions & 1 deletion graphs/bidirectional_breadth_first_search.py
@@ -100,7 +100,8 @@ def retrace_path(self, node: Node) -> List[Tuple[int]]:

 class BidirectionalBreadthFirstSearch:
     """
-    >>> bd_bfs = BidirectionalBreadthFirstSearch((0, 0), (len(grid) - 1, len(grid[0]) - 1))
+    >>> bd_bfs = BidirectionalBreadthFirstSearch((0, 0), (len(grid) - 1,
+    ... len(grid[0]) - 1))
     >>> bd_bfs.fwd_bfs.start.pos == bd_bfs.bwd_bfs.target.pos
     True
     >>> bd_bfs.retrace_bidirectional_path(bd_bfs.fwd_bfs.start,
131 changes: 131 additions & 0 deletions maths/entropy.py
@@ -0,0 +1,131 @@
#!/usr/bin/env python3

"""
Implementation of entropy of information
https://en.wikipedia.org/wiki/Entropy_(information_theory)
"""

import math
from collections import Counter
from string import ascii_lowercase
from typing import Tuple
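
# Shannon entropy of a discrete symbol distribution, in bits:
#     H(X) = -sum(p(x) * log2(p(x))) over all symbols x
# The functions below estimate H(X) from character counts in a given text.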


def calculate_prob(text: str) -> None:
    """
    Takes a string of text and prints three entropy estimates:
    1) the entropy of the single-character distribution, H(X)
    2) the entropy of the two-character (pair) distribution, H(X, Y)
    3) their difference, an estimate of the conditional entropy
       H(X_n | X_(n-1)) = H(X_(n-1), X_n) - H(X_(n-1))
    :param text: the text to analyze
    :return: None; the values are printed, rounded to whole bits

    The sample texts below are taken from books and random quotes.
    >>> text = ("Behind Winston’s back the voice "
    ... "from the telescreen was still "
    ... "babbling and the overfulfilment")
    >>> calculate_prob(text)
    4.0
    6.0
    2.0

    >>> text = ("The Ministry of Truth—Minitrue, in Newspeak [Newspeak was the official"
    ... "face in elegant lettering, the three")
    >>> calculate_prob(text)
    4.0
    5.0
    1.0
    >>> text = ("Had repulsive dashwoods suspicion sincerity but advantage now him. "
    ... "Remark easily garret nor nay. Civil those mrs enjoy shy fat merry. "
    ... "You greatest jointure saw horrible. He private he on be imagine "
    ... "suppose. Fertile beloved evident through no service elderly is. Blind "
    ... "there if every no so at. Own neglected you preferred way sincerity "
    ... "delivered his attempted. To of message cottage windows do besides "
    ... "against uncivil. Delightful unreserved impossible few estimating "
    ... "men favourable see entreaties. She propriety immediate was improving. "
    ... "He or entrance humoured likewise moderate. Much nor game son say "
    ... "feel. Fat make met can must form into gate. Me we offending prevailed "
    ... "discovery.")
    >>> calculate_prob(text)
    4.0
    7.0
    3.0
    """
    single_char_strings, two_char_strings = analyze_text(text)
    my_alphas = list(" " + ascii_lowercase)
    # total count of observed characters, used to turn counts into probabilities
    all_sum = sum(single_char_strings.values())

    # entropy over single-character strings
    my_fir_sum = 0
    # for each alphabet character found in the text, accumulate p(x) * log2(p(x))
    for ch in my_alphas:
        if ch in single_char_strings:
            my_str = single_char_strings[ch]
            prob = my_str / all_sum
            my_fir_sum += prob * math.log2(prob)  # entropy formula

    # print the first entropy
    print("{0:.1f}".format(round(-1 * my_fir_sum)))

    # entropy over two-character strings
    all_sum = sum(two_char_strings.values())
    my_sec_sum = 0
    # for each two-character sequence over the alphabet, accumulate
    # p(x, y) * log2(p(x, y))
    for ch0 in my_alphas:
        for ch1 in my_alphas:
            sequence = ch0 + ch1
            if sequence in two_char_strings:
                my_str = two_char_strings[sequence]
                prob = my_str / all_sum
                my_sec_sum += prob * math.log2(prob)

    # print the second entropy
    print("{0:.1f}".format(round(-1 * my_sec_sum)))

    # print the difference: H(X_n | X_(n-1)) = H(X_(n-1), X_n) - H(X_(n-1))
    print("{0:.1f}".format(round((-1 * my_sec_sum) - (-1 * my_fir_sum))))


def analyze_text(text: str) -> Tuple[dict, dict]:
    """
    Convert text input into two dicts of counts.
    The first dictionary stores the frequency of single character strings.
    The second dictionary stores the frequency of two character strings.
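
    >>> analyze_text("abc")
    (Counter({'c': 1, 'a': 1, 'b': 1}), Counter({' a': 1, 'ab': 1, 'bc': 1}))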
"""
    single_char_strings = Counter()  # type: ignore
    two_char_strings = Counter()  # type: ignore
    # the last character never starts a two-character window, so count it
    # here to make sure every character is counted exactly once
    single_char_strings[text[-1]] += 1

    # prepend a space so that the first character also appears in a pair
    two_char_strings[" " + text[0]] += 1
    for i in range(0, len(text) - 1):
        single_char_strings[text[i]] += 1
        two_char_strings[text[i : i + 2]] += 1
    return single_char_strings, two_char_strings


def main():
    import doctest

    doctest.testmod()
    # text = (
    #     "Had repulsive dashwoods suspicion sincerity but advantage now him. Remark "
    #     "easily garret nor nay. Civil those mrs enjoy shy fat merry. You greatest "
    #     "jointure saw horrible. He private he on be imagine suppose. Fertile "
    #     "beloved evident through no service elderly is. Blind there if every no so "
    #     "at. Own neglected you preferred way sincerity delivered his attempted. To "
    #     "of message cottage windows do besides against uncivil. Delightful "
    #     "unreserved impossible few estimating men favourable see entreaties. She "
    #     "propriety immediate was improving. He or entrance humoured likewise "
    #     "moderate. Much nor game son say feel. Fat make met can must form into "
    #     "gate. Me we offending prevailed discovery. "
    # )

    # calculate_prob(text)


if __name__ == "__main__":
    main()
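
As a quick sanity check on the doctest values, the first number printed by
calculate_prob can be reproduced with only Counter and math.log2. The following
is a minimal sketch, not the module's API: text is the first doctest string
above, and the filter assumes the same alphabet (space plus ascii_lowercase)
that calculate_prob uses.

    import math
    from collections import Counter
    from string import ascii_lowercase

    text = (
        "Behind Winston’s back the voice from the telescreen was still "
        "babbling and the overfulfilment"
    )
    counts = Counter(text)  # frequency of every character in the text
    total = sum(counts.values())
    entropy = -sum(
        (n / total) * math.log2(n / total)  # p(x) * log2(p(x))
        for ch, n in counts.items()
        if ch == " " or ch in ascii_lowercase  # same filter as calculate_prob
    )
    print(round(entropy))  # expected: 4, i.e. the doctest's first "4.0"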