Python/maths/entropy.py
pre-commit-ci[bot] 421ace81ed
[pre-commit.ci] pre-commit autoupdate (#9013)
* [pre-commit.ci] pre-commit autoupdate

updates:
- [github.com/astral-sh/ruff-pre-commit: v0.0.285 → v0.0.286](https://github.com/astral-sh/ruff-pre-commit/compare/v0.0.285...v0.0.286)
- [github.com/tox-dev/pyproject-fmt: 0.13.1 → 1.1.0](https://github.com/tox-dev/pyproject-fmt/compare/0.13.1...1.1.0)

* updating DIRECTORY.md

* Fis ruff rules PIE808,PLR1714

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: github-actions <${GITHUB_ACTOR}@users.noreply.github.com>
Co-authored-by: Christian Clauss <cclauss@me.com>
2023-08-29 15:18:10 +02:00

132 lines
4.7 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python3
"""
Implementation of entropy of information
https://en.wikipedia.org/wiki/Entropy_(information_theory)
"""
from __future__ import annotations
import math
from collections import Counter
from string import ascii_lowercase
def calculate_prob(text: str) -> None:
"""
This method takes path and two dict as argument
and than calculates entropy of them.
:param dict:
:param dict:
:return: Prints
1) Entropy of information based on 1 alphabet
2) Entropy of information based on couples of 2 alphabet
3) print Entropy of H(X nXn1)
Text from random books. Also, random quotes.
>>> text = ("Behind Winstons back the voice "
... "from the telescreen was still "
... "babbling and the overfulfilment")
>>> calculate_prob(text)
4.0
6.0
2.0
>>> text = ("The Ministry of Truth—Minitrue, in Newspeak [Newspeak was the official"
... "face in elegant lettering, the three")
>>> calculate_prob(text)
4.0
5.0
1.0
>>> text = ("Had repulsive dashwoods suspicion sincerity but advantage now him. "
... "Remark easily garret nor nay. Civil those mrs enjoy shy fat merry. "
... "You greatest jointure saw horrible. He private he on be imagine "
... "suppose. Fertile beloved evident through no service elderly is. Blind "
... "there if every no so at. Own neglected you preferred way sincerity "
... "delivered his attempted. To of message cottage windows do besides "
... "against uncivil. Delightful unreserved impossible few estimating "
... "men favourable see entreaties. She propriety immediate was improving. "
... "He or entrance humoured likewise moderate. Much nor game son say "
... "feel. Fat make met can must form into gate. Me we offending prevailed "
... "discovery.")
>>> calculate_prob(text)
4.0
7.0
3.0
"""
single_char_strings, two_char_strings = analyze_text(text)
my_alphas = list(" " + ascii_lowercase)
# what is our total sum of probabilities.
all_sum = sum(single_char_strings.values())
# one length string
my_fir_sum = 0
# for each alpha we go in our dict and if it is in it we calculate entropy
for ch in my_alphas:
if ch in single_char_strings:
my_str = single_char_strings[ch]
prob = my_str / all_sum
my_fir_sum += prob * math.log2(prob) # entropy formula.
# print entropy
print(f"{round(-1 * my_fir_sum):.1f}")
# two len string
all_sum = sum(two_char_strings.values())
my_sec_sum = 0
# for each alpha (two in size) calculate entropy.
for ch0 in my_alphas:
for ch1 in my_alphas:
sequence = ch0 + ch1
if sequence in two_char_strings:
my_str = two_char_strings[sequence]
prob = int(my_str) / all_sum
my_sec_sum += prob * math.log2(prob)
# print second entropy
print(f"{round(-1 * my_sec_sum):.1f}")
# print the difference between them
print(f"{round((-1 * my_sec_sum) - (-1 * my_fir_sum)):.1f}")
def analyze_text(text: str) -> tuple[dict, dict]:
"""
Convert text input into two dicts of counts.
The first dictionary stores the frequency of single character strings.
The second dictionary stores the frequency of two character strings.
"""
single_char_strings = Counter() # type: ignore
two_char_strings = Counter() # type: ignore
single_char_strings[text[-1]] += 1
# first case when we have space at start.
two_char_strings[" " + text[0]] += 1
for i in range(len(text) - 1):
single_char_strings[text[i]] += 1
two_char_strings[text[i : i + 2]] += 1
return single_char_strings, two_char_strings
def main():
import doctest
doctest.testmod()
# text = (
# "Had repulsive dashwoods suspicion sincerity but advantage now him. Remark "
# "easily garret nor nay. Civil those mrs enjoy shy fat merry. You greatest "
# "jointure saw horrible. He private he on be imagine suppose. Fertile "
# "beloved evident through no service elderly is. Blind there if every no so "
# "at. Own neglected you preferred way sincerity delivered his attempted. To "
# "of message cottage windows do besides against uncivil. Delightful "
# "unreserved impossible few estimating men favourable see entreaties. She "
# "propriety immediate was improving. He or entrance humoured likewise "
# "moderate. Much nor game son say feel. Fat make met can must form into "
# "gate. Me we offending prevailed discovery. "
# )
# calculate_prob(text)
if __name__ == "__main__":
main()