diff --git a/DIRECTORY.md b/DIRECTORY.md
index a535f12cb..ab3259b9a 100644
--- a/DIRECTORY.md
+++ b/DIRECTORY.md
@@ -395,6 +395,7 @@
   * [Minimum Tickets Cost](dynamic_programming/minimum_tickets_cost.py)
   * [Optimal Binary Search Tree](dynamic_programming/optimal_binary_search_tree.py)
   * [Palindrome Partitioning](dynamic_programming/palindrome_partitioning.py)
+  * [Range Sum Query](dynamic_programming/range_sum_query.py)
   * [Regex Match](dynamic_programming/regex_match.py)
   * [Rod Cutting](dynamic_programming/rod_cutting.py)
   * [Smith Waterman](dynamic_programming/smith_waterman.py)
@@ -608,6 +609,7 @@
   * [Mfcc](machine_learning/mfcc.py)
   * [Multilayer Perceptron Classifier](machine_learning/multilayer_perceptron_classifier.py)
   * [Polynomial Regression](machine_learning/polynomial_regression.py)
+  * [Principle Component Analysis](machine_learning/principle_component_analysis.py)
   * [Scoring Functions](machine_learning/scoring_functions.py)
   * [Self Organizing Map](machine_learning/self_organizing_map.py)
   * [Sequential Minimum Optimization](machine_learning/sequential_minimum_optimization.py)
diff --git a/machine_learning/principle_component_analysis.py b/machine_learning/principle_component_analysis.py
new file mode 100644
index 000000000..46ccdb968
--- /dev/null
+++ b/machine_learning/principle_component_analysis.py
@@ -0,0 +1,85 @@
+"""
+Principal Component Analysis (PCA) is a dimensionality reduction technique
+used in machine learning. It transforms high-dimensional data into a lower-dimensional
+representation while retaining as much variance as possible.
+
+This implementation follows best practices, including:
+- Standardizing the dataset.
+- Computing principal components using Singular Value Decomposition (SVD).
+- Returning transformed data and explained variance ratio.
+"""
+
+import doctest
+
+import numpy as np
+from sklearn.datasets import load_iris
+from sklearn.decomposition import PCA
+from sklearn.preprocessing import StandardScaler
+
+
+def collect_dataset() -> tuple[np.ndarray, np.ndarray]:
+    """
+    Collects the Iris dataset and returns the feature matrix and target values.
+
+    :return: Tuple containing feature matrix (X) and target labels (y)
+
+    Example:
+    >>> X, y = collect_dataset()
+    >>> X.shape
+    (150, 4)
+    >>> y.shape
+    (150,)
+    """
+    data = load_iris()
+    return np.array(data.data), np.array(data.target)
+
+
+def apply_pca(data_x: np.ndarray, n_components: int) -> tuple[np.ndarray, np.ndarray]:
+    """
+    Applies Principal Component Analysis (PCA) to reduce dimensionality.
+
+    :param data_x: Original dataset (features)
+    :param n_components: Number of principal components to retain
+    :return: Tuple containing transformed dataset and explained variance ratio
+
+    Example:
+    >>> X, _ = collect_dataset()
+    >>> transformed_X, variance = apply_pca(X, 2)
+    >>> transformed_X.shape
+    (150, 2)
+    >>> len(variance) == 2
+    True
+    """
+    # Standardizing the dataset
+    scaler = StandardScaler()
+    data_x_scaled = scaler.fit_transform(data_x)
+
+    # Applying PCA
+    pca = PCA(n_components=n_components)
+    principal_components = pca.fit_transform(data_x_scaled)
+
+    return principal_components, pca.explained_variance_ratio_
+
+
+def main() -> None:
+    """
+    Driver function to execute PCA and display results.
+    """
+    data_x, data_y = collect_dataset()
+
+    # Number of principal components to retain
+    n_components = 2
+
+    # Apply PCA
+    transformed_data, variance_ratio = apply_pca(data_x, n_components)
+
+    print("Transformed Dataset (First 5 rows):")
+    print(transformed_data[:5])
+
+    print("\nExplained Variance Ratio:")
+    print(variance_ratio)
+
+
+if __name__ == "__main__":
+    doctest.testmod()
+    main()