mirror of
https://github.com/TheAlgorithms/Python.git
synced 2024-11-23 21:11:08 +00:00
Dimensionality reduction (#8590)
This commit is contained in:
parent
d182f95646
commit
54dedf844a
198
machine_learning/dimensionality_reduction.py
Normal file
198
machine_learning/dimensionality_reduction.py
Normal file
|
@ -0,0 +1,198 @@
|
|||
# Copyright (c) 2023 Diego Gasco (diego.gasco99@gmail.com), Diegomangasco on GitHub
|
||||
|
||||
"""
|
||||
Requirements:
|
||||
- numpy version 1.21
|
||||
- scipy version 1.3.3
|
||||
Notes:
|
||||
- Each column of the features matrix corresponds to a class item
|
||||
"""
|
||||
|
||||
import logging
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
from scipy.linalg import eigh
|
||||
|
||||
logging.basicConfig(level=logging.INFO, format="%(message)s")
|
||||
|
||||
|
||||
def column_reshape(input_array: np.ndarray) -> np.ndarray:
|
||||
"""Function to reshape a row Numpy array into a column Numpy array
|
||||
>>> input_array = np.array([1, 2, 3])
|
||||
>>> column_reshape(input_array)
|
||||
array([[1],
|
||||
[2],
|
||||
[3]])
|
||||
"""
|
||||
|
||||
return input_array.reshape((input_array.size, 1))
|
||||
|
||||
|
||||
def covariance_within_classes(
|
||||
features: np.ndarray, labels: np.ndarray, classes: int
|
||||
) -> np.ndarray:
|
||||
"""Function to compute the covariance matrix inside each class.
|
||||
>>> features = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
|
||||
>>> labels = np.array([0, 1, 0])
|
||||
>>> covariance_within_classes(features, labels, 2)
|
||||
array([[0.66666667, 0.66666667, 0.66666667],
|
||||
[0.66666667, 0.66666667, 0.66666667],
|
||||
[0.66666667, 0.66666667, 0.66666667]])
|
||||
"""
|
||||
|
||||
covariance_sum = np.nan
|
||||
for i in range(classes):
|
||||
data = features[:, labels == i]
|
||||
data_mean = data.mean(1)
|
||||
# Centralize the data of class i
|
||||
centered_data = data - column_reshape(data_mean)
|
||||
if i > 0:
|
||||
# If covariance_sum is not None
|
||||
covariance_sum += np.dot(centered_data, centered_data.T)
|
||||
else:
|
||||
# If covariance_sum is np.nan (i.e. first loop)
|
||||
covariance_sum = np.dot(centered_data, centered_data.T)
|
||||
|
||||
return covariance_sum / features.shape[1]
|
||||
|
||||
|
||||
def covariance_between_classes(
|
||||
features: np.ndarray, labels: np.ndarray, classes: int
|
||||
) -> np.ndarray:
|
||||
"""Function to compute the covariance matrix between multiple classes
|
||||
>>> features = np.array([[9, 2, 3], [4, 3, 6], [1, 8, 9]])
|
||||
>>> labels = np.array([0, 1, 0])
|
||||
>>> covariance_between_classes(features, labels, 2)
|
||||
array([[ 3.55555556, 1.77777778, -2.66666667],
|
||||
[ 1.77777778, 0.88888889, -1.33333333],
|
||||
[-2.66666667, -1.33333333, 2. ]])
|
||||
"""
|
||||
|
||||
general_data_mean = features.mean(1)
|
||||
covariance_sum = np.nan
|
||||
for i in range(classes):
|
||||
data = features[:, labels == i]
|
||||
device_data = data.shape[1]
|
||||
data_mean = data.mean(1)
|
||||
if i > 0:
|
||||
# If covariance_sum is not None
|
||||
covariance_sum += device_data * np.dot(
|
||||
column_reshape(data_mean) - column_reshape(general_data_mean),
|
||||
(column_reshape(data_mean) - column_reshape(general_data_mean)).T,
|
||||
)
|
||||
else:
|
||||
# If covariance_sum is np.nan (i.e. first loop)
|
||||
covariance_sum = device_data * np.dot(
|
||||
column_reshape(data_mean) - column_reshape(general_data_mean),
|
||||
(column_reshape(data_mean) - column_reshape(general_data_mean)).T,
|
||||
)
|
||||
|
||||
return covariance_sum / features.shape[1]
|
||||
|
||||
|
||||
def principal_component_analysis(features: np.ndarray, dimensions: int) -> np.ndarray:
|
||||
"""
|
||||
Principal Component Analysis.
|
||||
|
||||
For more details, see: https://en.wikipedia.org/wiki/Principal_component_analysis.
|
||||
Parameters:
|
||||
* features: the features extracted from the dataset
|
||||
* dimensions: to filter the projected data for the desired dimension
|
||||
|
||||
>>> test_principal_component_analysis()
|
||||
"""
|
||||
|
||||
# Check if the features have been loaded
|
||||
if features.any():
|
||||
data_mean = features.mean(1)
|
||||
# Center the dataset
|
||||
centered_data = features - np.reshape(data_mean, (data_mean.size, 1))
|
||||
covariance_matrix = np.dot(centered_data, centered_data.T) / features.shape[1]
|
||||
_, eigenvectors = np.linalg.eigh(covariance_matrix)
|
||||
# Take all the columns in the reverse order (-1), and then takes only the first
|
||||
filtered_eigenvectors = eigenvectors[:, ::-1][:, 0:dimensions]
|
||||
# Project the database on the new space
|
||||
projected_data = np.dot(filtered_eigenvectors.T, features)
|
||||
logging.info("Principal Component Analysis computed")
|
||||
|
||||
return projected_data
|
||||
else:
|
||||
logging.basicConfig(level=logging.ERROR, format="%(message)s", force=True)
|
||||
logging.error("Dataset empty")
|
||||
raise AssertionError
|
||||
|
||||
|
||||
def linear_discriminant_analysis(
|
||||
features: np.ndarray, labels: np.ndarray, classes: int, dimensions: int
|
||||
) -> np.ndarray:
|
||||
"""
|
||||
Linear Discriminant Analysis.
|
||||
|
||||
For more details, see: https://en.wikipedia.org/wiki/Linear_discriminant_analysis.
|
||||
Parameters:
|
||||
* features: the features extracted from the dataset
|
||||
* labels: the class labels of the features
|
||||
* classes: the number of classes present in the dataset
|
||||
* dimensions: to filter the projected data for the desired dimension
|
||||
|
||||
>>> test_linear_discriminant_analysis()
|
||||
"""
|
||||
|
||||
# Check if the dimension desired is less than the number of classes
|
||||
assert classes > dimensions
|
||||
|
||||
# Check if features have been already loaded
|
||||
if features.any:
|
||||
_, eigenvectors = eigh(
|
||||
covariance_between_classes(features, labels, classes),
|
||||
covariance_within_classes(features, labels, classes),
|
||||
)
|
||||
filtered_eigenvectors = eigenvectors[:, ::-1][:, :dimensions]
|
||||
svd_matrix, _, _ = np.linalg.svd(filtered_eigenvectors)
|
||||
filtered_svd_matrix = svd_matrix[:, 0:dimensions]
|
||||
projected_data = np.dot(filtered_svd_matrix.T, features)
|
||||
logging.info("Linear Discriminant Analysis computed")
|
||||
|
||||
return projected_data
|
||||
else:
|
||||
logging.basicConfig(level=logging.ERROR, format="%(message)s", force=True)
|
||||
logging.error("Dataset empty")
|
||||
raise AssertionError
|
||||
|
||||
|
||||
def test_linear_discriminant_analysis() -> None:
|
||||
# Create dummy dataset with 2 classes and 3 features
|
||||
features = np.array([[1, 2, 3, 4, 5], [2, 3, 4, 5, 6], [3, 4, 5, 6, 7]])
|
||||
labels = np.array([0, 0, 0, 1, 1])
|
||||
classes = 2
|
||||
dimensions = 2
|
||||
|
||||
# Assert that the function raises an AssertionError if dimensions > classes
|
||||
with pytest.raises(AssertionError) as error_info:
|
||||
projected_data = linear_discriminant_analysis(
|
||||
features, labels, classes, dimensions
|
||||
)
|
||||
if isinstance(projected_data, np.ndarray):
|
||||
raise AssertionError(
|
||||
"Did not raise AssertionError for dimensions > classes"
|
||||
)
|
||||
assert error_info.type is AssertionError
|
||||
|
||||
|
||||
def test_principal_component_analysis() -> None:
|
||||
features = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
|
||||
dimensions = 2
|
||||
expected_output = np.array([[6.92820323, 8.66025404, 10.39230485], [3.0, 3.0, 3.0]])
|
||||
|
||||
with pytest.raises(AssertionError) as error_info:
|
||||
output = principal_component_analysis(features, dimensions)
|
||||
if not np.allclose(expected_output, output):
|
||||
raise AssertionError
|
||||
assert error_info.type is AssertionError
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
import doctest
|
||||
|
||||
doctest.testmod()
|
Loading…
Reference in New Issue
Block a user