mirror of
https://github.com/hastagAB/Awesome-Python-Scripts.git
synced 2024-11-23 20:11:07 +00:00
Add the word frequency counter (#225)
This commit is contained in:
parent
cdaa837fe4
commit
e316697c80
|
@ -136,6 +136,7 @@ So far, the following projects have been integrated to this repo:
|
|||
|[Vigenère Cipher](vigenere_cipher)|[victoni](https://github.com/victoni)|
|
||||
|[Web proxy](Proxy-Request)|[Nikhil Kumar Singh](https://github.com/nikhilkumarsingh)|
|
||||
|[Website blocker](Website-Blocker)|[Ayush Bhardwaj](https://github.com/hastagAB)|
|
||||
|[Word Frequency Counter](Word_Frequency_Counter)|[sonniki](https://github.com/sonniki)|
|
||||
|[Word generator](Word-generator)|[TGLIDE](https://github.com/TGlide)|
|
||||
|[Work log generator](Work_Log_Generator)|[Maël Pedretti](https://github.com/73VW)|
|
||||
|[Youtube video downloader](Youtube_Video_Downloader)|[Christopher He](https://github.com/hecris)|
|
||||
|
|
78
Word_Frequency_Counter/README.md
Normal file
78
Word_Frequency_Counter/README.md
Normal file
|
@ -0,0 +1,78 @@
|
|||
# Word Frequency Counter
|
||||
|
||||
## Description
|
||||
A Python script that counts word frequencies in a text.
|
||||
|
||||
The text is pre-processed beforehand to keep only the most informative words: it is lowercased, and numbers, punctuation, and English stopwords are removed.
|
||||
The 10 most frequent words are printed to the console. The full list of word frequencies is saved to a file in the same directory as the input text file, named after the input file with a `_freq_dist` suffix.
|
||||
|
||||
## Usage
|
||||
|
||||
```sh
|
||||
$ python count_word_freq.py --filepath [filepath]
|
||||
```
|
||||
|
||||
### Example
|
||||
|
||||
```sh
|
||||
$ python count_word_freq.py --filepath test_file.txt
|
||||
Top 10 most frequent words:
|
||||
[('queen', 3), ('said', 3), ('fair', 3), ('mirror', 3), ('snow', 2), ('castle', 2), ('father', 2), ('stepmother', 2), ('upon', 1), ('time', 1)]
|
||||
|
||||
Saved the word frequencies to 'test_file_freq_dist.txt'
|
||||
```
|
||||
|
||||
```
|
||||
test_file.txt
|
||||
|
||||
Once upon a time, a princess named Snow White lived in a castle with her father, the King, and her stepmother, the Queen. Her father had always said to his daughter that she must be fair to everyone at court. Said he, "People come here to the castle when they have a problem. They need the ruler to make a fair decision. Nothing is more important than to be fair."
|
||||
|
||||
The Queen, Snow White's stepmother, knew how much this meant to her husband. At the first chance, she went to her magic mirror. "Mirror, mirror, on the wall," said the Queen. "Who is the fairest of them all?"
|
||||
|
||||
```
|
||||
|
||||
```
|
||||
test_file_freq_dist.txt
|
||||
|
||||
('queen', 3)
|
||||
('said', 3)
|
||||
('fair', 3)
|
||||
('mirror', 3)
|
||||
('snow', 2)
|
||||
('castle', 2)
|
||||
('father', 2)
|
||||
('stepmother', 2)
|
||||
('upon', 1)
|
||||
('time', 1)
|
||||
('princess', 1)
|
||||
('named', 1)
|
||||
('white', 1)
|
||||
('lived', 1)
|
||||
('king', 1)
|
||||
('always', 1)
|
||||
('daughter', 1)
|
||||
('must', 1)
|
||||
('everyone', 1)
|
||||
('court', 1)
|
||||
('people', 1)
|
||||
('come', 1)
|
||||
('problem', 1)
|
||||
('need', 1)
|
||||
('ruler', 1)
|
||||
('make', 1)
|
||||
('decision', 1)
|
||||
('nothing', 1)
|
||||
('important', 1)
|
||||
('whites', 1)
|
||||
('knew', 1)
|
||||
('much', 1)
|
||||
('meant', 1)
|
||||
('husband', 1)
|
||||
('first', 1)
|
||||
('chance', 1)
|
||||
('went', 1)
|
||||
('magic', 1)
|
||||
('wall', 1)
|
||||
('fairest', 1)
|
||||
|
||||
```
|
77
Word_Frequency_Counter/count_word_freq.py
Normal file
77
Word_Frequency_Counter/count_word_freq.py
Normal file
|
@ -0,0 +1,77 @@
|
|||
import argparse
|
||||
from nltk.corpus import stopwords
|
||||
from nltk.probability import FreqDist
|
||||
from nltk.tokenize import word_tokenize
|
||||
import re
|
||||
import string
|
||||
|
||||
|
||||
def preprocess(text: str) -> str:
    """
    Normalise raw text before it is tokenized.

    The text is lowercased, every run of ASCII digits is deleted, and
    every character listed in ``string.punctuation`` is deleted.
    Characters are removed, not replaced by spaces, so "White's"
    becomes "whites" and whitespace around removed numbers is kept.

    :param text: text to pre-process
    :return: the pre-processed text
    """
    # Translation table that maps each ASCII punctuation character to "deleted".
    punctuation_table = str.maketrans("", "", string.punctuation)
    lowered = text.lower()
    without_digits = re.sub(r"[0-9]+", "", lowered)
    return without_digits.translate(punctuation_table)
|
||||
|
||||
|
||||
def run(text: str) -> FreqDist:
    """
    Build the word frequency distribution of a text.

    The text is first normalised with ``preprocess`` (lowercased, digits
    and punctuation removed), then tokenized, and English stopwords are
    dropped before counting. The 10 most frequent words are printed to
    standard output as a side effect.

    NOTE(review): punctuation is stripped before the stop-word filter, so
    apostrophe forms such as "don't" reach the filter as "dont" and are
    presumably not matched by the NLTK stop-word list -- confirm whether
    that is intended.

    :param text: text to count the word frequencies in
    :return: the word frequencies in the text
    """
    # Normalise and tokenize in one step.
    words = word_tokenize(preprocess(text))
    # A set gives constant-time membership tests while filtering.
    english_stopwords = frozenset(stopwords.words("english"))
    frequencies = FreqDist(w for w in words if w not in english_stopwords)
    # Report the head of the distribution to the user.
    print("Top 10 most frequent words:")
    print(frequencies.most_common(10))
    return frequencies
|
||||
|
||||
|
||||
def build_output_path(filepath: str) -> str:
    """
    Build the path of the frequency report for an input text file.

    The report lives in the same directory as the input file and carries
    the same name with "_freq_dist" inserted before the extension, e.g.
    "docs/test_file.txt" -> "docs/test_file_freq_dist.txt".

    :param filepath: path to the input text file
    :return: path of the file the word frequencies are saved to
    """
    # Imported locally so this block needs no new module-level import.
    import os

    # Only the final path component is rewritten. A plain str.replace on
    # the whole path would also rename any directory that happens to
    # contain the file's stem (e.g. "test/test.txt").
    directory, file_name = os.path.split(filepath)
    stem, extension = os.path.splitext(file_name)
    return os.path.join(directory, stem + "_freq_dist" + extension)


def main() -> None:
    """
    Run the word frequency counter from the command line.

    Reads the text file given by --filepath/-f, counts its word
    frequencies with ``run`` and writes one "(word, count)" entry per line
    to the path returned by ``build_output_path``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--filepath",
        "-f",
        required=True,
        help="path to the text file"
    )
    args = parser.parse_args()
    # Open the text file. The encoding is explicit so the result does not
    # depend on the platform's locale default.
    with open(args.filepath, "r", encoding="utf-8") as f:
        text = f.read()
    # Count the frequencies.
    freq_dist = run(text)
    # Serialise every entry, most frequent first.
    freq_dist_str = "\n".join(
        str(x) for x in freq_dist.most_common(freq_dist.B())
    )
    # Save the result.
    new_filepath = build_output_path(args.filepath)
    with open(new_filepath, "w", encoding="utf-8") as f:
        f.write(freq_dist_str)
    print(f"\nSaved the word frequencies to '{new_filepath}'")


if __name__ == "__main__":
    main()
|
||||
|
||||
|
4
Word_Frequency_Counter/requirements.txt
Normal file
4
Word_Frequency_Counter/requirements.txt
Normal file
|
@ -0,0 +1,4 @@
|
|||
# argparse is part of the Python standard library and does not need to be installed
|
||||
nltk==3.4.5
|
||||
# re is part of the Python standard library and does not need to be installed
|
||||
# string is part of the Python standard library and does not need to be installed
|
3
Word_Frequency_Counter/test_file.txt
Normal file
3
Word_Frequency_Counter/test_file.txt
Normal file
|
@ -0,0 +1,3 @@
|
|||
Once upon a time, a princess named Snow White lived in a castle with her father, the King, and her stepmother, the Queen. Her father had always said to his daughter that she must be fair to everyone at court. Said he, "People come here to the castle when they have a problem. They need the ruler to make a fair decision. Nothing is more important than to be fair."
|
||||
|
||||
The Queen, Snow White's stepmother, knew how much this meant to her husband. At the first chance, she went to her magic mirror. "Mirror, mirror, on the wall," said the Queen. "Who is the fairest of them all?"
|
40
Word_Frequency_Counter/test_file_freq_dist.txt
Normal file
40
Word_Frequency_Counter/test_file_freq_dist.txt
Normal file
|
@ -0,0 +1,40 @@
|
|||
('queen', 3)
|
||||
('said', 3)
|
||||
('fair', 3)
|
||||
('mirror', 3)
|
||||
('snow', 2)
|
||||
('castle', 2)
|
||||
('father', 2)
|
||||
('stepmother', 2)
|
||||
('upon', 1)
|
||||
('time', 1)
|
||||
('princess', 1)
|
||||
('named', 1)
|
||||
('white', 1)
|
||||
('lived', 1)
|
||||
('king', 1)
|
||||
('always', 1)
|
||||
('daughter', 1)
|
||||
('must', 1)
|
||||
('everyone', 1)
|
||||
('court', 1)
|
||||
('people', 1)
|
||||
('come', 1)
|
||||
('problem', 1)
|
||||
('need', 1)
|
||||
('ruler', 1)
|
||||
('make', 1)
|
||||
('decision', 1)
|
||||
('nothing', 1)
|
||||
('important', 1)
|
||||
('whites', 1)
|
||||
('knew', 1)
|
||||
('much', 1)
|
||||
('meant', 1)
|
||||
('husband', 1)
|
||||
('first', 1)
|
||||
('chance', 1)
|
||||
('went', 1)
|
||||
('magic', 1)
|
||||
('wall', 1)
|
||||
('fairest', 1)
|
Loading…
Reference in New Issue
Block a user