mirror of
https://github.com/TheAlgorithms/Python.git
synced 2025-02-07 10:00:55 +00:00
Add typing hints and naming conventions
This commit is contained in:
parent
e9f3d61643
commit
2de3ac6ec9
|
@ -13,9 +13,10 @@ from scipy import sparse
|
||||||
from sklearn.feature_extraction.text import TfidfVectorizer
|
from sklearn.feature_extraction.text import TfidfVectorizer
|
||||||
from sklearn.datasets import fetch_20newsgroups
|
from sklearn.datasets import fetch_20newsgroups
|
||||||
from sklearn.metrics import accuracy_score
|
from sklearn.metrics import accuracy_score
|
||||||
|
from numpy.typing import ArrayLike
|
||||||
|
|
||||||
|
|
||||||
def group_indices_by_target(targets):
|
def group_indices_by_target(targets: ArrayLike) -> dict:
|
||||||
"""
|
"""
|
||||||
Associates to each target label the indices of the examples with that label
|
Associates to each target label the indices of the examples with that label
|
||||||
|
|
||||||
|
@ -50,35 +51,24 @@ class MultinomialNBClassifier:
|
||||||
self.priors = None
|
self.priors = None
|
||||||
self.alpha = alpha
|
self.alpha = alpha
|
||||||
|
|
||||||
def _check_X(self, X):
|
def fit(self, data: sparse.csr_matrix, y: ArrayLike) -> None:
|
||||||
if not sparse.issparse(X):
|
|
||||||
raise ValueError("Matrix X must be an instance of scipy.sparse.csr_matrix")
|
|
||||||
|
|
||||||
def _check_X_y(self, X, y):
|
|
||||||
self._check_X(X)
|
|
||||||
if X.shape[0] != len(y):
|
|
||||||
raise ValueError(
|
|
||||||
"The expected shape for array y is (" + str(X.shape[0]) + ",), but got (" + str(len(y)) + ",)")
|
|
||||||
|
|
||||||
def fit(self, X, y):
|
|
||||||
"""
|
"""
|
||||||
Parameters
|
Parameters
|
||||||
----------
|
----------
|
||||||
X : scipy.sparse.csr_matrix of shape (n_samples, n_features)
|
data : scipy.sparse.csr_matrix of shape (n_samples, n_features)
|
||||||
Multinomial training examples
|
Multinomial training examples
|
||||||
|
|
||||||
y : array-like of shape (n_samples,)
|
y : array-like of shape (n_samples,)
|
||||||
Target labels
|
Target labels
|
||||||
"""
|
"""
|
||||||
self._check_X_y(X, y)
|
n_examples, n_features = data.shape
|
||||||
n_examples, n_features = X.shape
|
|
||||||
grouped_indices = group_indices_by_target(y)
|
grouped_indices = group_indices_by_target(y)
|
||||||
self.classes = list(grouped_indices.keys())
|
self.classes = list(grouped_indices.keys())
|
||||||
self.priors = np.zeros(shape=len(self.classes))
|
self.priors = np.zeros(shape=len(self.classes))
|
||||||
self.features_probs = np.zeros(shape=(len(self.classes), n_features))
|
self.features_probs = np.zeros(shape=(len(self.classes), n_features))
|
||||||
|
|
||||||
for i, class_i in enumerate(self.classes):
|
for i, class_i in enumerate(self.classes):
|
||||||
data_class_i = X[grouped_indices[class_i]]
|
data_class_i = data[grouped_indices[class_i]]
|
||||||
prior_class_i = data_class_i.shape[0] / n_examples
|
prior_class_i = data_class_i.shape[0] / n_examples
|
||||||
self.priors[i] = prior_class_i
|
self.priors[i] = prior_class_i
|
||||||
tot_features_count = data_class_i.sum() # count of all features in class_i
|
tot_features_count = data_class_i.sum() # count of all features in class_i
|
||||||
|
@ -86,11 +76,11 @@ class MultinomialNBClassifier:
|
||||||
for j, n_j in enumerate(features_count):
|
for j, n_j in enumerate(features_count):
|
||||||
self.features_probs[i][j] = (self.alpha + n_j) / (tot_features_count + self.alpha * n_features)
|
self.features_probs[i][j] = (self.alpha + n_j) / (tot_features_count + self.alpha * n_features)
|
||||||
|
|
||||||
def predict(self, X):
|
def predict(self, data: sparse.csr_matrix) -> np.array:
|
||||||
"""
|
"""
|
||||||
Parameters
|
Parameters
|
||||||
----------
|
----------
|
||||||
X : scipy.sparse.csr_matrix of shape (n_samples, n_features)
|
data : scipy.sparse.csr_matrix of shape (n_samples, n_features)
|
||||||
Multinomial test examples
|
Multinomial test examples
|
||||||
|
|
||||||
Returns
|
Returns
|
||||||
|
@ -103,41 +93,43 @@ class MultinomialNBClassifier:
|
||||||
Let's test the function following an example taken from the documentation of the MultinomialNB model
|
Let's test the function following an example taken from the documentation of the MultinomialNB model
|
||||||
from sklearn
|
from sklearn
|
||||||
>>> rng = np.random.RandomState(1)
|
>>> rng = np.random.RandomState(1)
|
||||||
>>> X = rng.randint(5, size=(6, 100))
|
>>> data = rng.randint(5, size=(6, 100))
|
||||||
>>> X = sparse.csr_matrix(X)
|
>>> data = sparse.csr_matrix(data)
|
||||||
>>> y = np.array([1, 2, 3, 4, 5, 6])
|
>>> y = np.array([1, 2, 3, 4, 5, 6])
|
||||||
>>> model = MultinomialNBClassifier()
|
>>> model = MultinomialNBClassifier()
|
||||||
>>> model.fit(X, y)
|
>>> model.fit(data, y)
|
||||||
>>> model.predict(X[2:3])
|
>>> model.predict(data[2:3])
|
||||||
array([3])
|
array([3])
|
||||||
"""
|
"""
|
||||||
self._check_X(X)
|
|
||||||
y_pred = []
|
y_pred = []
|
||||||
log_features_probs = np.log(self.features_probs)
|
log_features_probs = np.log(self.features_probs)
|
||||||
log_priors = np.log(self.priors)
|
log_priors = np.log(self.priors)
|
||||||
for instance in X:
|
for instance in data:
|
||||||
theta = instance.multiply(log_features_probs).sum(axis=1)
|
theta = instance.multiply(log_features_probs).sum(axis=1)
|
||||||
likelihood = [log_prior_class_i + theta[i] for i, log_prior_class_i in enumerate(log_priors)]
|
likelihood = [log_prior_class_i + theta[i] for i, log_prior_class_i in enumerate(log_priors)]
|
||||||
y_pred.append(self.classes[np.argmax(likelihood)])
|
y_pred.append(self.classes[np.argmax(likelihood)])
|
||||||
return np.array(y_pred)
|
return np.array(y_pred)
|
||||||
|
|
||||||
|
|
||||||
def main():
|
def main() -> None:
|
||||||
|
"""
|
||||||
|
Performs the text classification on the twenty_newsgroup dataset from sklearn
|
||||||
|
"""
|
||||||
newsgroups_train = fetch_20newsgroups(subset='train')
|
newsgroups_train = fetch_20newsgroups(subset='train')
|
||||||
newsgroups_test = fetch_20newsgroups(subset='test')
|
newsgroups_test = fetch_20newsgroups(subset='test')
|
||||||
X_train = newsgroups_train['data']
|
x_train = newsgroups_train['data']
|
||||||
y_train = newsgroups_train['target']
|
y_train = newsgroups_train['target']
|
||||||
X_test = newsgroups_test['data']
|
x_test = newsgroups_test['data']
|
||||||
y_test = newsgroups_test['target']
|
y_test = newsgroups_test['target']
|
||||||
vectorizer = TfidfVectorizer(stop_words='english')
|
vectorizer = TfidfVectorizer(stop_words='english')
|
||||||
X_train = vectorizer.fit_transform(X_train)
|
x_train = vectorizer.fit_transform(x_train)
|
||||||
X_test = vectorizer.transform(X_test)
|
x_test = vectorizer.transform(x_test)
|
||||||
|
|
||||||
model = MultinomialNBClassifier()
|
model = MultinomialNBClassifier()
|
||||||
print("Start training")
|
print("Start training")
|
||||||
model.fit(X_train, y_train)
|
model.fit(x_train, y_train)
|
||||||
|
|
||||||
y_pred = model.predict(X_test)
|
y_pred = model.predict(x_test)
|
||||||
print("Accuracy of naive bayes text classifier: " + str(accuracy_score(y_test, y_pred)))
|
print("Accuracy of naive bayes text classifier: " + str(accuracy_score(y_test, y_pred)))
|
||||||
|
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue
Block a user