Fixed imports

This commit is contained in:
ricca 2023-10-06 17:13:04 +02:00
parent 40c39a81f6
commit f6404ccb10

View File

@ -9,17 +9,16 @@ https://en.wikipedia.org/wiki/Naive_Bayes_classifier
""" """
import doctest import doctest
import numpy as np import numpy as np
from numpy.typing import ArrayLike import numpy.typing as npt
from scipy import sparse from scipy import sparse
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.datasets import fetch_20newsgroups from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score from sklearn.metrics import accuracy_score
def group_indices_by_target(targets: npt.ArrayLike) -> dict:
def group_indices_by_target(targets: ArrayLike) -> dict:
""" """
Associates to each target label the indices of the examples with that label Associates to each target label the indices of the examples with that label
@ -49,24 +48,24 @@ def group_indices_by_target(targets: ArrayLike) -> dict:
class MultinomialNBClassifier: class MultinomialNBClassifier:
def __init__(self, alpha=1): def __init__(self, alpha: int = 1):
self.classes = None self.classes = None
self.features_probs = None self.features_probs = None
self.priors = None self.priors = None
self.alpha = alpha self.alpha = alpha
def fit(self, data: sparse.csr_matrix, y: ArrayLike) -> None: def fit(self, data: sparse.csr_matrix, targets: npt.ArrayLike) -> None:
""" """
Parameters Parameters
---------- ----------
data : scipy.sparse.csr_matrix of shape (n_samples, n_features) data : scipy.sparse.csr_matrix of shape (n_samples, n_features)
Multinomial training examples Multinomial training examples
y : array-like of shape (n_samples,) targets : array-like of shape (n_samples,)
Target labels Target labels
""" """
n_examples, n_features = data.shape n_examples, n_features = data.shape
grouped_indices = group_indices_by_target(y) grouped_indices = group_indices_by_target(targets)
self.classes = list(grouped_indices.keys()) self.classes = list(grouped_indices.keys())
self.priors = np.zeros(shape=len(self.classes)) self.priors = np.zeros(shape=len(self.classes))
self.features_probs = np.zeros(shape=(len(self.classes), n_features)) self.features_probs = np.zeros(shape=(len(self.classes), n_features))
@ -76,15 +75,13 @@ class MultinomialNBClassifier:
prior_class_i = data_class_i.shape[0] / n_examples prior_class_i = data_class_i.shape[0] / n_examples
self.priors[i] = prior_class_i self.priors[i] = prior_class_i
tot_features_count = data_class_i.sum() # count of all features in class_i tot_features_count = data_class_i.sum() # count of all features in class_i
features_count = np.array(data_class_i.sum(axis=0))[ features_count = np.array(data_class_i.sum(axis=0))[0]
0
] # count of each feature x_j in class_i
for j, n_j in enumerate(features_count): for j, n_j in enumerate(features_count):
self.features_probs[i][j] = (self.alpha + n_j) / ( self.features_probs[i][j] = (self.alpha + n_j) / (
tot_features_count + self.alpha * n_features tot_features_count + self.alpha * n_features
) )
def predict(self, data: sparse.csr_matrix) -> np.array: def predict(self, data: sparse.csr_matrix) -> np.ndarray:
""" """
Parameters Parameters
---------- ----------
@ -123,9 +120,6 @@ class MultinomialNBClassifier:
def main() -> None: def main() -> None:
"""
Performs the text classification on the twenty_newsgroup dataset from sklearn
"""
newsgroups_train = fetch_20newsgroups(subset="train") newsgroups_train = fetch_20newsgroups(subset="train")
newsgroups_test = fetch_20newsgroups(subset="test") newsgroups_test = fetch_20newsgroups(subset="test")
x_train = newsgroups_train["data"] x_train = newsgroups_train["data"]