Dimensionality reduction (#8590)

This commit is contained in:
Diego Gasco 2023-04-17 00:34:22 +02:00 committed by GitHub
parent d182f95646
commit 54dedf844a
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

View File

@ -0,0 +1,198 @@
# Copyright (c) 2023 Diego Gasco (diego.gasco99@gmail.com), Diegomangasco on GitHub
"""
Requirements:
- numpy version 1.21
- scipy version 1.3.3
Notes:
- Each column of the features matrix corresponds to a class item
"""
import logging
import numpy as np
import pytest
from scipy.linalg import eigh
logging.basicConfig(level=logging.INFO, format="%(message)s")
def column_reshape(input_array: np.ndarray) -> np.ndarray:
"""Function to reshape a row Numpy array into a column Numpy array
>>> input_array = np.array([1, 2, 3])
>>> column_reshape(input_array)
array([[1],
[2],
[3]])
"""
return input_array.reshape((input_array.size, 1))
def covariance_within_classes(
features: np.ndarray, labels: np.ndarray, classes: int
) -> np.ndarray:
"""Function to compute the covariance matrix inside each class.
>>> features = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
>>> labels = np.array([0, 1, 0])
>>> covariance_within_classes(features, labels, 2)
array([[0.66666667, 0.66666667, 0.66666667],
[0.66666667, 0.66666667, 0.66666667],
[0.66666667, 0.66666667, 0.66666667]])
"""
covariance_sum = np.nan
for i in range(classes):
data = features[:, labels == i]
data_mean = data.mean(1)
# Centralize the data of class i
centered_data = data - column_reshape(data_mean)
if i > 0:
# If covariance_sum is not None
covariance_sum += np.dot(centered_data, centered_data.T)
else:
# If covariance_sum is np.nan (i.e. first loop)
covariance_sum = np.dot(centered_data, centered_data.T)
return covariance_sum / features.shape[1]
def covariance_between_classes(
features: np.ndarray, labels: np.ndarray, classes: int
) -> np.ndarray:
"""Function to compute the covariance matrix between multiple classes
>>> features = np.array([[9, 2, 3], [4, 3, 6], [1, 8, 9]])
>>> labels = np.array([0, 1, 0])
>>> covariance_between_classes(features, labels, 2)
array([[ 3.55555556, 1.77777778, -2.66666667],
[ 1.77777778, 0.88888889, -1.33333333],
[-2.66666667, -1.33333333, 2. ]])
"""
general_data_mean = features.mean(1)
covariance_sum = np.nan
for i in range(classes):
data = features[:, labels == i]
device_data = data.shape[1]
data_mean = data.mean(1)
if i > 0:
# If covariance_sum is not None
covariance_sum += device_data * np.dot(
column_reshape(data_mean) - column_reshape(general_data_mean),
(column_reshape(data_mean) - column_reshape(general_data_mean)).T,
)
else:
# If covariance_sum is np.nan (i.e. first loop)
covariance_sum = device_data * np.dot(
column_reshape(data_mean) - column_reshape(general_data_mean),
(column_reshape(data_mean) - column_reshape(general_data_mean)).T,
)
return covariance_sum / features.shape[1]
def principal_component_analysis(features: np.ndarray, dimensions: int) -> np.ndarray:
"""
Principal Component Analysis.
For more details, see: https://en.wikipedia.org/wiki/Principal_component_analysis.
Parameters:
* features: the features extracted from the dataset
* dimensions: to filter the projected data for the desired dimension
>>> test_principal_component_analysis()
"""
# Check if the features have been loaded
if features.any():
data_mean = features.mean(1)
# Center the dataset
centered_data = features - np.reshape(data_mean, (data_mean.size, 1))
covariance_matrix = np.dot(centered_data, centered_data.T) / features.shape[1]
_, eigenvectors = np.linalg.eigh(covariance_matrix)
# Take all the columns in the reverse order (-1), and then takes only the first
filtered_eigenvectors = eigenvectors[:, ::-1][:, 0:dimensions]
# Project the database on the new space
projected_data = np.dot(filtered_eigenvectors.T, features)
logging.info("Principal Component Analysis computed")
return projected_data
else:
logging.basicConfig(level=logging.ERROR, format="%(message)s", force=True)
logging.error("Dataset empty")
raise AssertionError
def linear_discriminant_analysis(
features: np.ndarray, labels: np.ndarray, classes: int, dimensions: int
) -> np.ndarray:
"""
Linear Discriminant Analysis.
For more details, see: https://en.wikipedia.org/wiki/Linear_discriminant_analysis.
Parameters:
* features: the features extracted from the dataset
* labels: the class labels of the features
* classes: the number of classes present in the dataset
* dimensions: to filter the projected data for the desired dimension
>>> test_linear_discriminant_analysis()
"""
# Check if the dimension desired is less than the number of classes
assert classes > dimensions
# Check if features have been already loaded
if features.any:
_, eigenvectors = eigh(
covariance_between_classes(features, labels, classes),
covariance_within_classes(features, labels, classes),
)
filtered_eigenvectors = eigenvectors[:, ::-1][:, :dimensions]
svd_matrix, _, _ = np.linalg.svd(filtered_eigenvectors)
filtered_svd_matrix = svd_matrix[:, 0:dimensions]
projected_data = np.dot(filtered_svd_matrix.T, features)
logging.info("Linear Discriminant Analysis computed")
return projected_data
else:
logging.basicConfig(level=logging.ERROR, format="%(message)s", force=True)
logging.error("Dataset empty")
raise AssertionError
def test_linear_discriminant_analysis() -> None:
# Create dummy dataset with 2 classes and 3 features
features = np.array([[1, 2, 3, 4, 5], [2, 3, 4, 5, 6], [3, 4, 5, 6, 7]])
labels = np.array([0, 0, 0, 1, 1])
classes = 2
dimensions = 2
# Assert that the function raises an AssertionError if dimensions > classes
with pytest.raises(AssertionError) as error_info:
projected_data = linear_discriminant_analysis(
features, labels, classes, dimensions
)
if isinstance(projected_data, np.ndarray):
raise AssertionError(
"Did not raise AssertionError for dimensions > classes"
)
assert error_info.type is AssertionError
def test_principal_component_analysis() -> None:
features = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
dimensions = 2
expected_output = np.array([[6.92820323, 8.66025404, 10.39230485], [3.0, 3.0, 3.0]])
with pytest.raises(AssertionError) as error_info:
output = principal_component_analysis(features, dimensions)
if not np.allclose(expected_output, output):
raise AssertionError
assert error_info.type is AssertionError
if __name__ == "__main__":
import doctest
doctest.testmod()