From 2759947a481661c4ae776614aad053b0f0f4aab7 Mon Sep 17 00:00:00 2001 From: ricca Date: Mon, 2 Oct 2023 20:48:40 +0200 Subject: [PATCH] Implemented a multinomial naive bayes classifier for text classification --- .../multinomial_naive_bayes_classifier.py | 135 ++++++++++++++++++ 1 file changed, 135 insertions(+) create mode 100644 machine_learning/multinomial_naive_bayes_classifier.py diff --git a/machine_learning/multinomial_naive_bayes_classifier.py b/machine_learning/multinomial_naive_bayes_classifier.py new file mode 100644 index 000000000..c290f2c63 --- /dev/null +++ b/machine_learning/multinomial_naive_bayes_classifier.py @@ -0,0 +1,135 @@ +""" +Implementation from scratch of a basic Multinomial Naive Bayes classifier for text classification. +""" + + +import numpy as np +import doctest +from scipy import sparse +from sklearn.feature_extraction.text import TfidfVectorizer +from sklearn.datasets import fetch_20newsgroups +from sklearn.metrics import accuracy_score + + +def group_data_by_target(targets): + """ + Associates to each target label the indices of the examples with that label + + Parameters + ---------- + targets : array-like of shape (n_samples,) + Target labels + + Returns + ---------- + grouped_data : dict of (label : list) + Maps each target label to the list of indices of the examples with that label + + Example + ---------- + >>> y = np.array([1, 2, 3, 1, 2, 5]) + >>> group_data_by_target(y) + {1: [0, 3], 2: [1, 4], 3: [2], 5: [5]} + """ + grouped_data = {} + for i, y in enumerate(targets): + if y not in grouped_data: + grouped_data[y] = [] + grouped_data[y].append(i) + return grouped_data + + +class MultinomialNBClassifier: + def __init__(self, alpha=1): + self.classes = None + self.features_probs = None + self.priors = None + self.alpha = alpha + + def fit(self, X, y): + """ + Parameters + ---------- + X : scipy.sparse.csr_matrix of shape (n_samples, n_features) + Multinomial training examples + + y : array-like of shape (n_samples,) + Target labels + """ + if not sparse.issparse(X): + raise ValueError("Matrix X must be an instance of scipy.sparse.csr_matrix") + n_examples, n_features = X.shape + grouped_data = group_data_by_target(y) + self.classes = list(grouped_data.keys()) + self.priors = np.zeros(shape=len(self.classes)) + self.features_probs = np.zeros(shape=(len(self.classes), n_features)) + + for i, class_i in enumerate(self.classes): + data_class_i = X[grouped_data[class_i]] + prior_class_i = data_class_i.shape[0] / n_examples + self.priors[i] = prior_class_i + tot_features_count = data_class_i.sum() # count of all features in class_i + features_count = np.array(data_class_i.sum(axis=0))[0] # count of each feature x_j in class_i + for j, n_j in enumerate(features_count): + self.features_probs[i][j] = (self.alpha + n_j) / (tot_features_count + self.alpha * n_features) + + def predict(self, X): + """ + Parameters + ---------- + X : scipy.sparse.csr_matrix of shape (n_samples, n_features) + Multinomial test examples + + Returns + ---------- + y_pred : ndarray of shape (n_samples,) + Predicted target labels of test examples + + Example + ---------- + Let's test the function following an example taken from the documentation of the MultinomialNB model + from sklearn + >>> rng = np.random.RandomState(1) + >>> X = rng.randint(5, size=(6, 100)) + >>> X = sparse.csr_matrix(X) + >>> y = np.array([1, 2, 3, 4, 5, 6]) + >>> model = MultinomialNBClassifier() + >>> model.fit(X, y) + >>> model.predict(X[2:3]) + array([3]) + """ + if not sparse.issparse(X): + raise ValueError("Matrix X must be an instance of scipy.sparse.csr_matrix") + y_pred = [] + log_features_probs = np.log(self.features_probs) + log_priors = np.log(self.priors) + for instance in X: + theta = instance.multiply(log_features_probs).sum(axis=1) + likelihood = [log_prior_class_i + theta[i] for i, log_prior_class_i in enumerate(log_priors)] + y_pred.append(self.classes[np.argmax(likelihood)]) + return np.array(y_pred) + + +def main(): + newsgroups_train = fetch_20newsgroups(subset='train') + newsgroups_test = fetch_20newsgroups(subset='test') + X_train = newsgroups_train['data'] + y_train = newsgroups_train['target'] + X_test = newsgroups_test['data'] + y_test = newsgroups_test['target'] + vectorizer = TfidfVectorizer(stop_words='english') + X_train = vectorizer.fit_transform(X_train) + X_test = vectorizer.transform(X_test) + + model = MultinomialNBClassifier() + print("Start training") + model.fit(X_train, y_train) + + y_pred = model.predict(X_test) + print("Accuracy of Naive Bayes text classifier: " + str(accuracy_score(y_test, y_pred))) + + +if __name__ == "__main__": + main() + doctest.testmod() +