diff --git a/machine_learning/dimensionality_reduction.py b/machine_learning/dimensionality_reduction.py new file mode 100644 index 000000000..d2046f81a --- /dev/null +++ b/machine_learning/dimensionality_reduction.py @@ -0,0 +1,198 @@ +# Copyright (c) 2023 Diego Gasco (diego.gasco99@gmail.com), Diegomangasco on GitHub + +""" +Requirements: + - numpy version 1.21 + - scipy version 1.3.3 +Notes: + - Each column of the features matrix corresponds to a class item +""" + +import logging + +import numpy as np +import pytest +from scipy.linalg import eigh + +logging.basicConfig(level=logging.INFO, format="%(message)s") + + +def column_reshape(input_array: np.ndarray) -> np.ndarray: + """Function to reshape a row Numpy array into a column Numpy array + >>> input_array = np.array([1, 2, 3]) + >>> column_reshape(input_array) + array([[1], + [2], + [3]]) + """ + + return input_array.reshape((input_array.size, 1)) + + +def covariance_within_classes( + features: np.ndarray, labels: np.ndarray, classes: int +) -> np.ndarray: + """Function to compute the covariance matrix inside each class. + >>> features = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]]) + >>> labels = np.array([0, 1, 0]) + >>> covariance_within_classes(features, labels, 2) + array([[0.66666667, 0.66666667, 0.66666667], + [0.66666667, 0.66666667, 0.66666667], + [0.66666667, 0.66666667, 0.66666667]]) + """ + + covariance_sum = np.nan + for i in range(classes): + data = features[:, labels == i] + data_mean = data.mean(1) + # Centralize the data of class i + centered_data = data - column_reshape(data_mean) + if i > 0: + # If covariance_sum is not None + covariance_sum += np.dot(centered_data, centered_data.T) + else: + # If covariance_sum is np.nan (i.e. first loop) + covariance_sum = np.dot(centered_data, centered_data.T) + + return covariance_sum / features.shape[1] + + +def covariance_between_classes( + features: np.ndarray, labels: np.ndarray, classes: int +) -> np.ndarray: + """Function to compute the covariance matrix between multiple classes + >>> features = np.array([[9, 2, 3], [4, 3, 6], [1, 8, 9]]) + >>> labels = np.array([0, 1, 0]) + >>> covariance_between_classes(features, labels, 2) + array([[ 3.55555556, 1.77777778, -2.66666667], + [ 1.77777778, 0.88888889, -1.33333333], + [-2.66666667, -1.33333333, 2. ]]) + """ + + general_data_mean = features.mean(1) + covariance_sum = np.nan + for i in range(classes): + data = features[:, labels == i] + device_data = data.shape[1] + data_mean = data.mean(1) + if i > 0: + # If covariance_sum is not None + covariance_sum += device_data * np.dot( + column_reshape(data_mean) - column_reshape(general_data_mean), + (column_reshape(data_mean) - column_reshape(general_data_mean)).T, + ) + else: + # If covariance_sum is np.nan (i.e. first loop) + covariance_sum = device_data * np.dot( + column_reshape(data_mean) - column_reshape(general_data_mean), + (column_reshape(data_mean) - column_reshape(general_data_mean)).T, + ) + + return covariance_sum / features.shape[1] + + +def principal_component_analysis(features: np.ndarray, dimensions: int) -> np.ndarray: + """ + Principal Component Analysis. + + For more details, see: https://en.wikipedia.org/wiki/Principal_component_analysis. + Parameters: + * features: the features extracted from the dataset + * dimensions: to filter the projected data for the desired dimension + + >>> test_principal_component_analysis() + """ + + # Check if the features have been loaded + if features.any(): + data_mean = features.mean(1) + # Center the dataset + centered_data = features - np.reshape(data_mean, (data_mean.size, 1)) + covariance_matrix = np.dot(centered_data, centered_data.T) / features.shape[1] + _, eigenvectors = np.linalg.eigh(covariance_matrix) + # Take all the columns in the reverse order (-1), and then takes only the first + filtered_eigenvectors = eigenvectors[:, ::-1][:, 0:dimensions] + # Project the database on the new space + projected_data = np.dot(filtered_eigenvectors.T, features) + logging.info("Principal Component Analysis computed") + + return projected_data + else: + logging.basicConfig(level=logging.ERROR, format="%(message)s", force=True) + logging.error("Dataset empty") + raise AssertionError + + +def linear_discriminant_analysis( + features: np.ndarray, labels: np.ndarray, classes: int, dimensions: int +) -> np.ndarray: + """ + Linear Discriminant Analysis. + + For more details, see: https://en.wikipedia.org/wiki/Linear_discriminant_analysis. + Parameters: + * features: the features extracted from the dataset + * labels: the class labels of the features + * classes: the number of classes present in the dataset + * dimensions: to filter the projected data for the desired dimension + + >>> test_linear_discriminant_analysis() + """ + + # Check if the dimension desired is less than the number of classes + assert classes > dimensions + + # Check if features have been already loaded + if features.any: + _, eigenvectors = eigh( + covariance_between_classes(features, labels, classes), + covariance_within_classes(features, labels, classes), + ) + filtered_eigenvectors = eigenvectors[:, ::-1][:, :dimensions] + svd_matrix, _, _ = np.linalg.svd(filtered_eigenvectors) + filtered_svd_matrix = svd_matrix[:, 0:dimensions] + projected_data = np.dot(filtered_svd_matrix.T, features) + logging.info("Linear Discriminant Analysis computed") + + return projected_data + else: + logging.basicConfig(level=logging.ERROR, format="%(message)s", force=True) + logging.error("Dataset empty") + raise AssertionError + + +def test_linear_discriminant_analysis() -> None: + # Create dummy dataset with 2 classes and 3 features + features = np.array([[1, 2, 3, 4, 5], [2, 3, 4, 5, 6], [3, 4, 5, 6, 7]]) + labels = np.array([0, 0, 0, 1, 1]) + classes = 2 + dimensions = 2 + + # Assert that the function raises an AssertionError if dimensions > classes + with pytest.raises(AssertionError) as error_info: + projected_data = linear_discriminant_analysis( + features, labels, classes, dimensions + ) + if isinstance(projected_data, np.ndarray): + raise AssertionError( + "Did not raise AssertionError for dimensions > classes" + ) + assert error_info.type is AssertionError + + +def test_principal_component_analysis() -> None: + features = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]]) + dimensions = 2 + expected_output = np.array([[6.92820323, 8.66025404, 10.39230485], [3.0, 3.0, 3.0]]) + + with pytest.raises(AssertionError) as error_info: + output = principal_component_analysis(features, dimensions) + if not np.allclose(expected_output, output): + raise AssertionError + assert error_info.type is AssertionError + + +if __name__ == "__main__": + import doctest + + doctest.testmod()