# Python/machine_learning/multinomial_naive_bayes_classifier.py

"""
Implementation from scratch of a Multinomial Naive Bayes Classifier.
The algorithm is trained and tested on the twenty_newsgroup dataset
from sklearn to perform text classification

Here the Wikipedia page to understand the theory behind this kind
of probabilistic models:
https://en.wikipedia.org/wiki/Naive_Bayes_classifier
"""

import doctest

import numpy as np
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score


def group_indices_by_target(targets):
    """
    Builds a lookup from every distinct target label to the positions at
    which that label occurs

    Parameters
    ----------
    targets : array-like of shape (n_samples,)
              Target labels

    Returns
    ----------
    grouped_indices : dict of (label : list)
                      Keys are the distinct labels, in order of first
                      appearance; each value is the ascending list of
                      positions holding that label

    Example
    ----------
    >>> group_indices_by_target([1, 2, 3, 1, 2, 5])
    {1: [0, 3], 2: [1, 4], 3: [2], 5: [5]}
    """
    index_lists = {}
    for position, label in enumerate(targets):
        # setdefault creates the bucket the first time a label is seen
        index_lists.setdefault(label, []).append(position)
    return index_lists


class MultinomialNBClassifier:
    """
    Multinomial Naive Bayes classifier with additive smoothing

    Parameters
    ----------
    alpha : float, default=1
        Smoothing constant added to every per-class feature count

    Attributes
    ----------
    classes : list or None
        Distinct target labels in the order returned by
        group_indices_by_target; None until fit is called
    priors : ndarray of shape (n_classes,) or None
        Fraction of the training examples that belong to each class
    features_probs : ndarray of shape (n_classes, n_features) or None
        Smoothed probability of each feature given each class
    """

    def __init__(self, alpha=1):
        self.classes = None
        self.features_probs = None
        self.priors = None
        self.alpha = alpha

    def fit(self, data, targets):
        """
        Estimates the class priors and the smoothed per-class feature
        probabilities from the training examples

        Parameters
        ----------
        data : scipy.sparse.csr_matrix or ndarray of shape (n_samples, n_features)
            Multinomial training examples

        targets : array-like of shape (n_samples,)
            Target labels

        Raises
        ----------
        ValueError
            If the number of targets differs from the number of rows of data

        Example
        ----------
        >>> model = MultinomialNBClassifier()
        >>> model.fit(np.array([[2, 0], [0, 2]]), np.array([0, 1]))
        >>> model.priors
        array([0.5, 0.5])
        >>> model.features_probs
        array([[0.75, 0.25],
               [0.25, 0.75]])
        """
        n_examples, n_features = data.shape
        if len(targets) != n_examples:
            raise ValueError(
                f"data has {n_examples} rows but {len(targets)} targets were given"
            )

        grouped_indices = group_indices_by_target(targets)
        self.classes = list(grouped_indices)
        self.priors = np.zeros(shape=len(self.classes))
        self.features_probs = np.zeros(shape=(len(self.classes), n_features))

        for i, class_i in enumerate(self.classes):
            data_class_i = data[grouped_indices[class_i]]
            self.priors[i] = data_class_i.shape[0] / n_examples
            # Sparse matrices return a 1 x n_features np.matrix here and dense
            # arrays a 1-D array; asarray + ravel flattens both to 1-D.
            features_count = np.asarray(data_class_i.sum(axis=0)).ravel()
            # Smoothed estimate for the whole row at once:
            # (alpha + n_j) / (total count in class + alpha * n_features)
            self.features_probs[i] = (features_count + self.alpha) / (
                features_count.sum() + self.alpha * n_features
            )

    def predict(self, data):
        """
        Assigns to each example the class with the highest log-posterior score

        Parameters
        ----------
        data : scipy.sparse.csr_matrix or ndarray of shape (n_samples, n_features)
            Multinomial test examples

        Returns
        ----------
        y_pred : ndarray of shape (n_samples,)
                 Predicted target labels of test examples

        Raises
        ----------
        ValueError
            If the model has not been fitted yet

        Example
        ----------
        Let's test the function following an example taken from the documentation
        of the MultinomialNB model from sklearn
        >>> from scipy import sparse
        >>> rng = np.random.RandomState(1)
        >>> data = rng.randint(5, size=(6, 100))
        >>> data = sparse.csr_matrix(data)
        >>> y = np.array([1, 2, 3, 4, 5, 6])
        >>> model = MultinomialNBClassifier()
        >>> model.fit(data, y)
        >>> model.predict(data[2:3])
        array([3])
        """
        if self.classes is None or self.features_probs is None or self.priors is None:
            raise ValueError(
                "This MultinomialNBClassifier instance is not fitted yet; "
                "call fit before predict"
            )

        log_features_probs = np.log(self.features_probs)
        log_priors = np.log(self.priors)
        # One matrix product scores every (example, class) pair:
        # scores[s, c] = log prior_c + sum_j data[s, j] * log P(feature_j | c)
        scores = np.asarray(data @ log_features_probs.T) + log_priors
        # argmax keeps the first maximum, so ties go to the earliest class
        best_class_idx = np.argmax(scores, axis=1)
        return np.asarray(self.classes)[best_class_idx]


def main():
    """
    Trains the classifier on the training split of the 20 newsgroups dataset
    and prints its accuracy on the test split
    """
    train_bunch = fetch_20newsgroups(subset="train")
    test_bunch = fetch_20newsgroups(subset="test")
    train_labels, test_labels = train_bunch["target"], test_bunch["target"]

    # The vectorizer is fitted on the training texts only and then reused,
    # unchanged, to transform the test texts.
    tfidf = TfidfVectorizer(stop_words="english")
    train_features = tfidf.fit_transform(train_bunch["data"])
    test_features = tfidf.transform(test_bunch["data"])

    classifier = MultinomialNBClassifier()
    print("Start training")
    classifier.fit(train_features, train_labels)

    accuracy = accuracy_score(test_labels, classifier.predict(test_features))
    print("Accuracy of naive bayes text classifier: " + str(accuracy))


if __name__ == "__main__":
    # When executed as a script: run the newsgroups training/evaluation demo
    # first, then the doctests embedded in this module's docstrings.
    main()
    doctest.testmod()
Implemented a multinomial naive bayes classifier for text classification 2023-10-02 18:48:40 +00:00			`"""`
Comments added 2023-10-03 16:46:20 +00:00			`Implementation from scratch of a Multinomial Naive Bayes Classifier.`
Fixed comments 2023-10-03 18:18:09 +00:00			`The algorithm is trained and tested on the twenty_newsgroup dataset`
			`from sklearn to perform text classification`
Add comments 2023-10-03 17:07:57 +00:00
Fixed comments 2023-10-03 18:18:09 +00:00			`Here the Wikipedia page to understand the theory behind this kind`
			`of probabilistic models:`
Add comments 2023-10-03 17:07:57 +00:00			`https://en.wikipedia.org/wiki/Naive_Bayes_classifier`
Implemented a multinomial naive bayes classifier for text classification 2023-10-02 18:48:40 +00:00			`"""`

			`import doctest`
Fixed imports 2023-10-06 15:13:04 +00:00
Fixed imports 2023-10-03 18:35:49 +00:00			`import numpy as np`
Implemented a multinomial naive bayes classifier for text classification 2023-10-02 18:48:40 +00:00			`from sklearn.datasets import fetch_20newsgroups`
Fixed imports 2023-10-06 15:13:04 +00:00			`from sklearn.feature_extraction.text import TfidfVectorizer`
Implemented a multinomial naive bayes classifier for text classification 2023-10-02 18:48:40 +00:00			`from sklearn.metrics import accuracy_score`
Fixed imports 2023-10-03 18:35:49 +00:00

Handle mypy errors 2023-10-06 15:32:14 +00:00			`def group_indices_by_target(targets):`
Implemented a multinomial naive bayes classifier for text classification 2023-10-02 18:48:40 +00:00			`"""`
			`Associates to each target label the indices of the examples with that label`

			`Parameters`
			`----------`
			`targets : array-like of shape (n_samples,)`
			`Target labels`

			`Returns`
			`----------`
Implemented input check 2023-10-03 16:28:37 +00:00			`grouped_indices : dict of (label : list)`
Fixed comments 2023-10-03 18:18:09 +00:00			`Maps each target label to the list of indices of the`
			`examples with that label`
Implemented a multinomial naive bayes classifier for text classification 2023-10-02 18:48:40 +00:00
			`Example`
			`----------`
			`>>> y = np.array([1, 2, 3, 1, 2, 5])`
Implemented input check 2023-10-03 16:28:37 +00:00			`>>> group_indices_by_target(y)`
Implemented a multinomial naive bayes classifier for text classification 2023-10-02 18:48:40 +00:00			`{1: [0, 3], 2: [1, 4], 3: [2], 5: [5]}`
			`"""`
Implemented input check 2023-10-03 16:28:37 +00:00			`grouped_indices = {}`
Implemented a multinomial naive bayes classifier for text classification 2023-10-02 18:48:40 +00:00			`for i, y in enumerate(targets):`
Implemented input check 2023-10-03 16:28:37 +00:00			`if y not in grouped_indices:`
			`grouped_indices[y] = []`
			`grouped_indices[y].append(i)`
			`return grouped_indices`
Implemented a multinomial naive bayes classifier for text classification 2023-10-02 18:48:40 +00:00

			`class MultinomialNBClassifier:`
Handle mypy errors 2023-10-06 15:32:14 +00:00			`def __init__(self, alpha=1):`
Implemented a multinomial naive bayes classifier for text classification 2023-10-02 18:48:40 +00:00			`self.classes = None`
			`self.features_probs = None`
			`self.priors = None`
			`self.alpha = alpha`

Handle mypy errors 2023-10-06 15:32:14 +00:00			`def fit(self, data, targets):`
Implemented a multinomial naive bayes classifier for text classification 2023-10-02 18:48:40 +00:00			`"""`
			`Parameters`
			`----------`
Add typing hints and naming conventions 2023-10-03 17:52:45 +00:00			`data : scipy.sparse.csr_matrix of shape (n_samples, n_features)`
Implemented a multinomial naive bayes classifier for text classification 2023-10-02 18:48:40 +00:00			`Multinomial training examples`

Fixed imports 2023-10-06 15:13:04 +00:00			`targets : array-like of shape (n_samples,)`
Implemented a multinomial naive bayes classifier for text classification 2023-10-02 18:48:40 +00:00			`Target labels`
			`"""`
Add typing hints and naming conventions 2023-10-03 17:52:45 +00:00			`n_examples, n_features = data.shape`
Fixed imports 2023-10-06 15:13:04 +00:00			`grouped_indices = group_indices_by_target(targets)`
Implemented input check 2023-10-03 16:28:37 +00:00			`self.classes = list(grouped_indices.keys())`
Implemented a multinomial naive bayes classifier for text classification 2023-10-02 18:48:40 +00:00			`self.priors = np.zeros(shape=len(self.classes))`
			`self.features_probs = np.zeros(shape=(len(self.classes), n_features))`

			`for i, class_i in enumerate(self.classes):`
Add typing hints and naming conventions 2023-10-03 17:52:45 +00:00			`data_class_i = data[grouped_indices[class_i]]`
Implemented a multinomial naive bayes classifier for text classification 2023-10-02 18:48:40 +00:00			`prior_class_i = data_class_i.shape[0] / n_examples`
			`self.priors[i] = prior_class_i`
[pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci 2023-10-03 18:04:15 +00:00			`tot_features_count = data_class_i.sum() # count of all features in class_i`
Fixed imports 2023-10-06 15:13:04 +00:00			`features_count = np.array(data_class_i.sum(axis=0))[0]`
Implemented a multinomial naive bayes classifier for text classification 2023-10-02 18:48:40 +00:00			`for j, n_j in enumerate(features_count):`
[pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci 2023-10-03 18:04:15 +00:00			`self.features_probs[i][j] = (self.alpha + n_j) / (`
			`tot_features_count + self.alpha * n_features`
			`)`
Implemented a multinomial naive bayes classifier for text classification 2023-10-02 18:48:40 +00:00
Handle mypy errors 2023-10-06 15:32:14 +00:00			`def predict(self, data):`
Implemented a multinomial naive bayes classifier for text classification 2023-10-02 18:48:40 +00:00			`"""`
			`Parameters`
			`----------`
Add typing hints and naming conventions 2023-10-03 17:52:45 +00:00			`data : scipy.sparse.csr_matrix of shape (n_samples, n_features)`
Implemented a multinomial naive bayes classifier for text classification 2023-10-02 18:48:40 +00:00			`Multinomial test examples`

			`Returns`
			`----------`
			`y_pred : ndarray of shape (n_samples,)`
			`Predicted target labels of test examples`

			`Example`
			`----------`
Fixed comments 2023-10-03 18:18:09 +00:00			`Let's test the function following an example taken from the documentation`
			`of the MultinomialNB model from sklearn`
Handle mypy errors 2023-10-06 15:32:14 +00:00			`>>> from scipy import sparse`
Implemented a multinomial naive bayes classifier for text classification 2023-10-02 18:48:40 +00:00			`>>> rng = np.random.RandomState(1)`
Add typing hints and naming conventions 2023-10-03 17:52:45 +00:00			`>>> data = rng.randint(5, size=(6, 100))`
			`>>> data = sparse.csr_matrix(data)`
Implemented a multinomial naive bayes classifier for text classification 2023-10-02 18:48:40 +00:00			`>>> y = np.array([1, 2, 3, 4, 5, 6])`
			`>>> model = MultinomialNBClassifier()`
Add typing hints and naming conventions 2023-10-03 17:52:45 +00:00			`>>> model.fit(data, y)`
			`>>> model.predict(data[2:3])`
Implemented a multinomial naive bayes classifier for text classification 2023-10-02 18:48:40 +00:00			`array([3])`
			`"""`
			`y_pred = []`
			`log_features_probs = np.log(self.features_probs)`
			`log_priors = np.log(self.priors)`
Add typing hints and naming conventions 2023-10-03 17:52:45 +00:00			`for instance in data:`
Implemented a multinomial naive bayes classifier for text classification 2023-10-02 18:48:40 +00:00			`theta = instance.multiply(log_features_probs).sum(axis=1)`
[pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci 2023-10-03 18:04:15 +00:00			`likelihood = [`
			`log_prior_class_i + theta[i]`
			`for i, log_prior_class_i in enumerate(log_priors)`
			`]`
Implemented a multinomial naive bayes classifier for text classification 2023-10-02 18:48:40 +00:00			`y_pred.append(self.classes[np.argmax(likelihood)])`
			`return np.array(y_pred)`


Handle mypy errors 2023-10-06 15:32:14 +00:00			`def main():`
[pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci 2023-10-03 18:04:15 +00:00			`newsgroups_train = fetch_20newsgroups(subset="train")`
			`newsgroups_test = fetch_20newsgroups(subset="test")`
			`x_train = newsgroups_train["data"]`
			`y_train = newsgroups_train["target"]`
			`x_test = newsgroups_test["data"]`
			`y_test = newsgroups_test["target"]`
			`vectorizer = TfidfVectorizer(stop_words="english")`
Add typing hints and naming conventions 2023-10-03 17:52:45 +00:00			`x_train = vectorizer.fit_transform(x_train)`
			`x_test = vectorizer.transform(x_test)`
Implemented a multinomial naive bayes classifier for text classification 2023-10-02 18:48:40 +00:00
			`model = MultinomialNBClassifier()`
			`print("Start training")`
Add typing hints and naming conventions 2023-10-03 17:52:45 +00:00			`model.fit(x_train, y_train)`
Implemented a multinomial naive bayes classifier for text classification 2023-10-02 18:48:40 +00:00
Add typing hints and naming conventions 2023-10-03 17:52:45 +00:00			`y_pred = model.predict(x_test)`
[pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci 2023-10-03 18:04:15 +00:00			`print(`
			`"Accuracy of naive bayes text classifier: "`
			`+ str(accuracy_score(y_test, y_pred))`
			`)`
Implemented a multinomial naive bayes classifier for text classification 2023-10-02 18:48:40 +00:00

			`if __name__ == "__main__":`
			`main()`
			`doctest.testmod()`