mirror of
https://github.com/TheAlgorithms/Python.git
synced 2025-02-12 04:18:08 +00:00
Implemented a multinomial naive bayes classifier for text classification
This commit is contained in:
parent
35dd529c85
commit
2759947a48
135
machine_learning/multinomial_naive_bayes_classifier.py
Normal file
135
machine_learning/multinomial_naive_bayes_classifier.py
Normal file
|
@ -0,0 +1,135 @@
|
|||
"""
|
||||
Implementation from scratch of a basic Multinomial Naive Bayes classifier for text classification.
|
||||
"""
|
||||
|
||||
|
||||
import numpy as np
|
||||
import doctest
|
||||
from scipy import sparse
|
||||
from sklearn.feature_extraction.text import TfidfVectorizer
|
||||
from sklearn.datasets import fetch_20newsgroups
|
||||
from sklearn.metrics import accuracy_score
|
||||
|
||||
|
||||
def group_data_by_target(targets):
    """
    Build an index of sample positions keyed by their target label.

    Parameters
    ----------
    targets : array-like of shape (n_samples,)
        Target labels; every label must be hashable.

    Returns
    ----------
    grouped_data : dict of (label : list)
        For each distinct label, the positions (in increasing order) of the
        samples carrying that label. Keys appear in order of first occurrence.

    Example
    ----------
    >>> group_data_by_target([1, 2, 3, 1, 2, 5])
    {1: [0, 3], 2: [1, 4], 3: [2], 5: [5]}
    """
    indices_by_label = {}
    for position, label in enumerate(targets):
        # setdefault creates the bucket the first time a label is seen.
        indices_by_label.setdefault(label, []).append(position)
    return indices_by_label
|
||||
|
||||
|
||||
class MultinomialNBClassifier:
    """
    Multinomial Naive Bayes classifier with additive smoothing.

    Parameters
    ----------
    alpha : float, default=1
        Additive smoothing term added to every per-class feature count.

    Attributes
    ----------
    classes : list or None
        Distinct target labels, in the order returned by
        ``group_data_by_target``. ``None`` until ``fit`` is called.

    priors : ndarray of shape (n_classes,) or None
        Fraction of training examples belonging to each class.

    features_probs : ndarray of shape (n_classes, n_features) or None
        Smoothed probability of each feature given each class.
    """

    def __init__(self, alpha=1):
        self.classes = None
        self.features_probs = None
        self.priors = None
        self.alpha = alpha

    def fit(self, X, y):
        """
        Estimate class priors and smoothed per-class feature probabilities.

        Parameters
        ----------
        X : scipy.sparse matrix of shape (n_samples, n_features)
            Multinomial training examples. Any scipy sparse format is
            accepted; it is converted to CSR internally.

        y : array-like of shape (n_samples,)
            Target labels

        Raises
        ----------
        ValueError
            If X is not sparse, or if the number of labels differs from the
            number of rows of X.
        """
        if not sparse.issparse(X):
            raise ValueError("Matrix X must be an instance of scipy.sparse.csr_matrix")
        # Row selection by a list of indices needs a subscriptable format.
        X = X.tocsr()
        n_examples, n_features = X.shape
        grouped_data = group_data_by_target(y)

        # Counting through the groups works for any iterable y, not only sized ones.
        n_labels = sum(len(rows) for rows in grouped_data.values())
        if n_labels != n_examples:
            raise ValueError(
                f"X has {n_examples} rows but y contains {n_labels} labels"
            )

        self.classes = list(grouped_data.keys())
        self.priors = np.zeros(shape=len(self.classes))
        self.features_probs = np.zeros(shape=(len(self.classes), n_features))

        for i, class_i in enumerate(self.classes):
            data_class_i = X[grouped_data[class_i]]
            self.priors[i] = data_class_i.shape[0] / n_examples
            # count of all features in class_i
            tot_features_count = data_class_i.sum()
            # count of each feature x_j in class_i, flattened to 1-D whether
            # the column sum comes back as a 2-D matrix or a 1-D array
            features_count = np.asarray(data_class_i.sum(axis=0)).ravel()
            # Smoothed estimate for the whole row at once:
            # (alpha + n_j) / (total count + alpha * n_features)
            self.features_probs[i] = (self.alpha + features_count) / (
                tot_features_count + self.alpha * n_features
            )

    def predict(self, X):
        """
        Predict the most likely class of every row of X.

        Parameters
        ----------
        X : scipy.sparse matrix of shape (n_samples, n_features)
            Multinomial test examples

        Returns
        ----------
        y_pred : ndarray of shape (n_samples,)
            Predicted target labels of test examples

        Raises
        ----------
        ValueError
            If X is not sparse, if the model has not been fitted, or if the
            number of columns of X differs from the number of fitted features.

        Example
        ----------
        Let's test the function following an example taken from the
        documentation of the MultinomialNB model from sklearn
        >>> rng = np.random.RandomState(1)
        >>> X = rng.randint(5, size=(6, 100))
        >>> X = sparse.csr_matrix(X)
        >>> y = np.array([1, 2, 3, 4, 5, 6])
        >>> model = MultinomialNBClassifier()
        >>> model.fit(X, y)
        >>> model.predict(X[2:3])
        array([3])
        """
        if not sparse.issparse(X):
            raise ValueError("Matrix X must be an instance of scipy.sparse.csr_matrix")
        if self.classes is None or self.features_probs is None or self.priors is None:
            raise ValueError("The model must be fitted before calling predict")

        log_features_probs = np.log(self.features_probs)
        log_priors = np.log(self.priors)
        if X.shape[1] != log_features_probs.shape[1]:
            raise ValueError(
                f"X has {X.shape[1]} features but the model was fitted with "
                f"{log_features_probs.shape[1]}"
            )

        # One sparse-dense product scores every (sample, class) pair:
        # score[s, c] = log prior_c + sum_j X[s, j] * log P(x_j | c)
        scores = np.asarray(X.dot(log_features_probs.T)) + log_priors
        best_class_idx = np.argmax(scores, axis=1)
        return np.array([self.classes[k] for k in best_class_idx])
|
||||
|
||||
|
||||
def main():
    """
    Train the classifier on the 20 newsgroups training split and print its
    accuracy on the test split, using TF-IDF features with English stop words
    removed.
    """
    train_split = fetch_20newsgroups(subset='train')
    test_split = fetch_20newsgroups(subset='test')
    train_labels = train_split['target']
    test_labels = test_split['target']

    # Fit the vocabulary on the training documents only, then reuse it for
    # the test documents so both matrices share the same feature columns.
    vectorizer = TfidfVectorizer(stop_words='english')
    train_features = vectorizer.fit_transform(train_split['data'])
    test_features = vectorizer.transform(test_split['data'])

    classifier = MultinomialNBClassifier()
    print("Start training")
    classifier.fit(train_features, train_labels)

    predicted_labels = classifier.predict(test_features)
    accuracy = accuracy_score(test_labels, predicted_labels)
    print("Accuracy of Naive Bayes text classifier: " + str(accuracy))
|
||||
|
||||
|
||||
# Script entry point: nothing below runs when the module is imported.
if __name__ == "__main__":
    # Run the 20 newsgroups demo first (fetches the dataset and trains).
    main()
    # Then execute the doctest examples embedded in this module's docstrings.
    doctest.testmod()
|
||||
|
Loading…
Reference in New Issue
Block a user