mirror of
https://github.com/TheAlgorithms/Python.git
synced 2025-03-27 17:06:44 +00:00
NLP Word Frequency Algorithms (#2142)
* NLP Word Frequency Algorithms * Added type hints and Wikipedia link to tf-idf * Update machine_learning/word_frequency_functions.py Co-authored-by: Christian Clauss <cclauss@me.com> * Update machine_learning/word_frequency_functions.py Co-authored-by: Christian Clauss <cclauss@me.com> * Update machine_learning/word_frequency_functions.py Co-authored-by: Christian Clauss <cclauss@me.com> * Update machine_learning/word_frequency_functions.py Co-authored-by: Christian Clauss <cclauss@me.com> * Fix line length for flake8 * Fix line length for flake8 V2 * Add line escapes and change int to float * Corrected doctests * Fix for TravisCI * Fix for TravisCI V2 * Tests passing locally * Tests passing locally * Update machine_learning/word_frequency_functions.py Co-authored-by: Christian Clauss <cclauss@me.com> * Update machine_learning/word_frequency_functions.py Co-authored-by: Christian Clauss <cclauss@me.com> * Update machine_learning/word_frequency_functions.py Co-authored-by: Christian Clauss <cclauss@me.com> * Update machine_learning/word_frequency_functions.py Co-authored-by: Christian Clauss <cclauss@me.com> * Add doctest examples and clean up docstrings Co-authored-by: Christian Clauss <cclauss@me.com>
This commit is contained in:
parent
c7ca9cf0df
commit
b368b1ecfd
133
machine_learning/word_frequency_functions.py
Normal file
133
machine_learning/word_frequency_functions.py
Normal file
@ -0,0 +1,133 @@
|
||||
import string
|
||||
from math import log10
|
||||
|
||||
"""
|
||||
tf-idf Wikipedia: https://en.wikipedia.org/wiki/Tf%E2%80%93idf
|
||||
tf-idf and other word frequency algorithms are often used
|
||||
as a weighting factor in information retrieval and text
|
||||
mining. 83% of text-based recommender systems use
|
||||
tf-idf for term weighting. In layman's terms, tf-idf
|
||||
is a statistic intended to reflect how important a word
|
||||
is to a document in a corpus (a collection of documents)
|
||||
|
||||
|
||||
Here I've implemented several word frequency algorithms
|
||||
that are commonly used in information retrieval: Term Frequency,
|
||||
Document Frequency, and TF-IDF (Term-Frequency*Inverse-Document-Frequency)
|
||||
are included.
|
||||
|
||||
Term Frequency is a statistical function that
|
||||
returns a number representing how frequently
|
||||
an expression occurs in a document. This
|
||||
indicates how significant a particular term is in
|
||||
a given document.
|
||||
|
||||
Document Frequency is a statistical function that returns
|
||||
an integer representing the number of documents in a
|
||||
corpus that a term occurs in (where the max number returned
|
||||
would be the number of documents in the corpus).
|
||||
|
||||
Inverse Document Frequency is mathematically written as
|
||||
log10(N/df), where N is the number of documents in your
|
||||
corpus and df is the Document Frequency. If df is 0, a
|
||||
ZeroDivisionError will be raised.
|
||||
|
||||
Term-Frequency*Inverse-Document-Frequency is a measure
|
||||
of the originality of a term. It is mathematically written
|
||||
as tf*log10(N/df). It compares the number of times
|
||||
a term appears in a document with the number of documents
|
||||
the term appears in. If df is 0, a ZeroDivisionError will be raised.
|
||||
"""
|
||||
|
||||
|
||||
def term_frequency(term: str, document: str) -> int:
    """
    Return the number of times a term occurs within a given document.

    Matching is case-insensitive and works on whole words: ASCII punctuation
    is removed from the document and the remainder is split on any run of
    whitespace (spaces, tabs, newlines), so words on adjacent lines stay
    separate and repeated spaces never produce empty tokens.

    @params: term, the single word to search a document for, and document,
            the document to search within
    @returns: an integer representing the number of times a term is
            found within the document

    @examples:
    >>> term_frequency("to", "To be, or not to be")
    2
    >>> term_frequency("be", "To be,\\nor not to be")
    2
    >>> term_frequency("", "To  be")
    0
    """
    # Strip punctuation so that "be," and "be" compare equal.
    document_without_punctuation = document.translate(
        str.maketrans("", "", string.punctuation)
    )
    # split() without a separator breaks on every whitespace run; deleting
    # newlines instead would fuse the words on either side of a line break.
    words = document_without_punctuation.split()
    wanted = term.lower()  # lowercase the term once, not once per word
    return sum(word.lower() == wanted for word in words)
|
||||
|
||||
|
||||
def document_frequency(term: str, corpus: str) -> tuple[int, int]:
    """
    Calculate the number of documents in a corpus that contain a
    given term.

    Matching is case-insensitive and works on whole words, so searching for
    "is" does not count a document that only contains "this". ASCII
    punctuation in the corpus is ignored.

    @params : term, the single word to search each document for, and corpus,
            a collection of documents. Each document should be separated by
            a newline.
    @returns : a tuple (matches, total) holding the number of documents in
            the corpus that contain the term you are searching for and the
            number of documents in the corpus
    @examples :
    >>> document_frequency(
    ...     "first",
    ...     "This is the first document in the corpus.\\nThIs is the second "
    ...     "document in the corpus.\\nTHIS is the third document in the corpus.",
    ... )
    (1, 3)
    >>> document_frequency("is", "This document\\nThat is one")
    (1, 2)
    """
    # Normalise case and drop punctuation once for the whole corpus.
    normalised_corpus = corpus.lower().translate(
        str.maketrans("", "", string.punctuation)
    )
    documents = normalised_corpus.split("\n")
    wanted = term.lower()  # hoisted: the term does not change per document
    # Compare against whole tokens; a substring test would count "is" in "this".
    matches = sum(wanted in document.split() for document in documents)
    return matches, len(documents)
|
||||
|
||||
|
||||
def inverse_document_frequency(df: int, N: int) -> float:
    """
    Return a float denoting the importance of a word, computed as
    log10(N/df) and rounded to three decimal places, where N is the
    number of documents and df is the Document Frequency.

    @params : df, the Document Frequency, and N,
            the number of documents in the corpus.
    @returns : log10(N/df), rounded to three decimal places
    @raises : ZeroDivisionError if df is 0; ValueError if N is 0
    @examples :
    >>> inverse_document_frequency(3, 0)
    Traceback (most recent call last):
     ...
    ValueError: log10(0) is undefined.
    >>> inverse_document_frequency(1, 3)
    0.477
    >>> inverse_document_frequency(0, 3)
    Traceback (most recent call last):
     ...
    ZeroDivisionError: df must be > 0
    """
    # Guard clauses: df is checked first, so df == 0 wins when both are zero.
    if df == 0:
        raise ZeroDivisionError("df must be > 0")
    if N == 0:
        raise ValueError("log10(0) is undefined.")
    ratio = N / df
    return round(log10(ratio), 3)
|
||||
|
||||
|
||||
def tf_idf(tf: int, idf: float) -> float:
    """
    Combine the term frequency and inverse document frequency to
    calculate the originality of a term. This 'originality' is
    calculated by multiplying the term frequency and the inverse
    document frequency : tf-idf = TF * IDF

    @params : tf, the term frequency, and idf, the inverse document
            frequency (a float, e.g. a log10 ratio such as 0.477)
    @returns : tf * idf, rounded to three decimal places
    @examples :
    >>> tf_idf(2, 0.477)
    0.954
    """
    return round(tf * idf, 3)
|
Loading…
x
Reference in New Issue
Block a user