Implemented a multinomial naive bayes classifier for text classification

2025-04-15 10:17:35 +00:00 · 2023-10-02 20:48:40 +02:00 · 2023-10-02 20:48:40 +02:00 · 2759947a48
commit 2759947a48
parent 35dd529c85
1 changed files with 135 additions and 0 deletions
--- a/machine_learning/multinomial_naive_bayes_classifier.py
+++ b/machine_learning/multinomial_naive_bayes_classifier.py
@ -0,0 +1,135 @@
+"""
+Implementation from scratch of a basic Multinomial Naive Bayes classifier for text classification.
+"""
+
+
+import numpy as np
+import doctest
+from scipy import sparse
+from sklearn.feature_extraction.text import TfidfVectorizer
+from sklearn.datasets import fetch_20newsgroups
+from sklearn.metrics import accuracy_score
+
+
+def group_data_by_target(targets):
+    """
+    Associates to each target label the indices of the examples with that label
+
+    Parameters
+    ----------
+    targets : array-like of shape (n_samples,)
+              Target labels
+
+    Returns
+    ----------
+    grouped_data : dict of (label : list)
+                   Maps each target label to the list of indices of the examples with that label
+
+    Example
+    ----------
+    >>> y = np.array([1, 2, 3, 1, 2, 5])
+    >>> group_data_by_target(y)
+    {1: [0, 3], 2: [1, 4], 3: [2], 5: [5]}
+    """
+    grouped_data = {}
+    for i, y in enumerate(targets):
+        if y not in grouped_data:
+            grouped_data[y] = []
+        grouped_data[y].append(i)
+    return grouped_data
+
+
+class MultinomialNBClassifier:
+    def __init__(self, alpha=1):
+        self.classes = None
+        self.features_probs = None
+        self.priors = None
+        self.alpha = alpha
+
+    def fit(self, X, y):
+        """
+        Parameters
+        ----------
+        X : scipy.sparse.csr_matrix of shape (n_samples, n_features)
+            Multinomial training examples
+
+        y : array-like of shape (n_samples,)
+            Target labels
+        """
+        if not sparse.issparse(X):
+            raise ValueError("Matrix X must be an instance of scipy.sparse.csr_matrix")
+        n_examples, n_features = X.shape
+        grouped_data = group_data_by_target(y)
+        self.classes = list(grouped_data.keys())
+        self.priors = np.zeros(shape=len(self.classes))
+        self.features_probs = np.zeros(shape=(len(self.classes), n_features))
+
+        for i, class_i in enumerate(self.classes):
+            data_class_i = X[grouped_data[class_i]]
+            prior_class_i = data_class_i.shape[0] / n_examples
+            self.priors[i] = prior_class_i
+            tot_features_count = data_class_i.sum()   # count of all features in class_i
+            features_count = np.array(data_class_i.sum(axis=0))[0]   # count of each feature x_j in class_i
+            for j, n_j in enumerate(features_count):
+                self.features_probs[i][j] = (self.alpha + n_j) / (tot_features_count + self.alpha * n_features)
+
+    def predict(self, X):
+        """
+        Parameters
+        ----------
+        X : scipy.sparse.csr_matrix of shape (n_samples, n_features)
+            Multinomial test examples
+
+        Returns
+        ----------
+        y_pred : ndarray of shape (n_samples,)
+                 Predicted target labels of test examples
+
+        Example
+        ----------
+        Let's test the function following an example taken from the documentation of the MultinomialNB model
+        from sklearn
+        >>> rng = np.random.RandomState(1)
+        >>> X = rng.randint(5, size=(6, 100))
+        >>> X = sparse.csr_matrix(X)
+        >>> y = np.array([1, 2, 3, 4, 5, 6])
+        >>> model = MultinomialNBClassifier()
+        >>> model.fit(X, y)
+        >>> model.predict(X[2:3])
+        array([3])
+        """
+        if not sparse.issparse(X):
+            raise ValueError("Matrix X must be an instance of scipy.sparse.csr_matrix")
+        y_pred = []
+        log_features_probs = np.log(self.features_probs)
+        log_priors = np.log(self.priors)
+        for instance in X:
+            theta = instance.multiply(log_features_probs).sum(axis=1)
+            likelihood = [log_prior_class_i + theta[i] for i, log_prior_class_i in enumerate(log_priors)]
+            y_pred.append(self.classes[np.argmax(likelihood)])
+        return np.array(y_pred)
+
+
+def main():
+    newsgroups_train = fetch_20newsgroups(subset='train')
+    newsgroups_test = fetch_20newsgroups(subset='test')
+    X_train = newsgroups_train['data']
+    y_train = newsgroups_train['target']
+    X_test = newsgroups_test['data']
+    y_test = newsgroups_test['target']
+    vectorizer = TfidfVectorizer(stop_words='english')
+    X_train = vectorizer.fit_transform(X_train)
+    X_test = vectorizer.transform(X_test)
+
+    model = MultinomialNBClassifier()
+    print("Start training")
+    model.fit(X_train, y_train)
+
+    y_pred = model.predict(X_test)
+    print("Accuracy of Naive Bayes text classifier: " + str(accuracy_score(y_test, y_pred)))
+
+
+if __name__ == "__main__":
+    main()
+    doctest.testmod()
+