From 725731c8d289f742bfde3f159a538a47d19c27dc Mon Sep 17 00:00:00 2001
From: Tianyi Zheng
Date: Mon, 2 Jan 2023 05:07:39 -0800
Subject: [PATCH] Refactor `local_weighted_learning.py` to use `np.array` (#8069)

* updating DIRECTORY.md

* Format local_weighted_learning.py doctests for clarity

* Refactor local_weighted_learning.py to use np.array instead of np.mat

The np.matrix class is planned to be eventually deprecated in favor of
np.array, and current use of the class raises warnings in pytest

* Update local_weighted_learning.py documentation

Co-authored-by: github-actions <${GITHUB_ACTOR}@users.noreply.github.com>
---
 DIRECTORY.md                                  |   3 +-
 .../local_weighted_learning.py                | 116 ++++++++++--------
 2 files changed, 68 insertions(+), 51 deletions(-)

diff --git a/DIRECTORY.md b/DIRECTORY.md
index 3437df12c..5ce9dca74 100644
--- a/DIRECTORY.md
+++ b/DIRECTORY.md
@@ -123,6 +123,7 @@
   * [Huffman](compression/huffman.py)
   * [Lempel Ziv](compression/lempel_ziv.py)
   * [Lempel Ziv Decompress](compression/lempel_ziv_decompress.py)
+  * [Lz77](compression/lz77.py)
   * [Peak Signal To Noise Ratio](compression/peak_signal_to_noise_ratio.py)
   * [Run Length Encoding](compression/run_length_encoding.py)

@@ -1162,7 +1163,7 @@
   * [Get Amazon Product Data](web_programming/get_amazon_product_data.py)
   * [Get Imdb Top 250 Movies Csv](web_programming/get_imdb_top_250_movies_csv.py)
   * [Get Imdbtop](web_programming/get_imdbtop.py)
-  * [Get Top Billioners](web_programming/get_top_billioners.py)
+  * [Get Top Billionaires](web_programming/get_top_billionaires.py)
   * [Get Top Hn Posts](web_programming/get_top_hn_posts.py)
   * [Get User Tweets](web_programming/get_user_tweets.py)
   * [Giphy](web_programming/giphy.py)
diff --git a/machine_learning/local_weighted_learning/local_weighted_learning.py b/machine_learning/local_weighted_learning/local_weighted_learning.py
index df03fe0a1..6260e9ac6 100644
--- a/machine_learning/local_weighted_learning/local_weighted_learning.py
+++ b/machine_learning/local_weighted_learning/local_weighted_learning.py
@@ -1,76 +1,86 @@
-# Required imports to run this file
 import matplotlib.pyplot as plt
 import numpy as np
 
 
-# weighted matrix
-def weighted_matrix(point: np.mat, training_data_x: np.mat, bandwidth: float) -> np.mat:
+def weighted_matrix(
+    point: np.array, training_data_x: np.array, bandwidth: float
+) -> np.array:
     """
-    Calculate the weight for every point in the
-    data set. It takes training_point , query_point, and tau
-    Here Tau is not a fixed value it can be varied depends on output.
-    tau --> bandwidth
-    xmat -->Training data
-    point --> the x where we want to make predictions
-    >>> weighted_matrix(np.array([1., 1.]),np.mat([[16.99, 10.34], [21.01,23.68],
-    ... [24.59,25.69]]), 0.6)
-    matrix([[1.43807972e-207, 0.00000000e+000, 0.00000000e+000],
-            [0.00000000e+000, 0.00000000e+000, 0.00000000e+000],
-            [0.00000000e+000, 0.00000000e+000, 0.00000000e+000]])
+    Calculate the weight for every point in the data set.
+    point --> the x value at which we want to make predictions
+    >>> weighted_matrix(
+    ...     np.array([1., 1.]),
+    ...     np.array([[16.99, 10.34], [21.01,23.68], [24.59,25.69]]),
+    ...     0.6
+    ... )
+    array([[1.43807972e-207, 0.00000000e+000, 0.00000000e+000],
+           [0.00000000e+000, 0.00000000e+000, 0.00000000e+000],
+           [0.00000000e+000, 0.00000000e+000, 0.00000000e+000]])
     """
-    # m is the number of training samples
-    m, n = np.shape(training_data_x)
-    # Initializing weights as identity matrix
-    weights = np.mat(np.eye(m))
+    m, _ = np.shape(training_data_x)  # m is the number of training samples
+    weights = np.eye(m)  # Initializing weights as identity matrix
+
     # calculating weights for all training examples [x(i)'s]
     for j in range(m):
         diff = point - training_data_x[j]
-        weights[j, j] = np.exp(diff * diff.T / (-2.0 * bandwidth**2))
+        weights[j, j] = np.exp(diff @ diff.T / (-2.0 * bandwidth**2))
     return weights
 
 
 def local_weight(
-    point: np.mat, training_data_x: np.mat, training_data_y: np.mat, bandwidth: float
-) -> np.mat:
+    point: np.array,
+    training_data_x: np.array,
+    training_data_y: np.array,
+    bandwidth: float,
+) -> np.array:
     """
     Calculate the local weights using the weight_matrix function on training data.
     Return the weighted matrix.
-    >>> local_weight(np.array([1., 1.]),np.mat([[16.99, 10.34], [21.01,23.68],
-    ... [24.59,25.69]]),np.mat([[1.01, 1.66, 3.5]]), 0.6)
-    matrix([[0.00873174],
-            [0.08272556]])
+    >>> local_weight(
+    ...     np.array([1., 1.]),
+    ...     np.array([[16.99, 10.34], [21.01,23.68], [24.59,25.69]]),
+    ...     np.array([[1.01, 1.66, 3.5]]),
+    ...     0.6
+    ... )
+    array([[0.00873174],
+           [0.08272556]])
     """
     weight = weighted_matrix(point, training_data_x, bandwidth)
-    w = (training_data_x.T * (weight * training_data_x)).I * (
-        training_data_x.T * weight * training_data_y.T
+    w = np.linalg.inv(training_data_x.T @ (weight @ training_data_x)) @ (
+        training_data_x.T @ weight @ training_data_y.T
     )
     return w
 
 
 def local_weight_regression(
-    training_data_x: np.mat, training_data_y: np.mat, bandwidth: float
-) -> np.mat:
+    training_data_x: np.array, training_data_y: np.array, bandwidth: float
+) -> np.array:
     """
-    Calculate predictions for each data point on axis.
-    >>> local_weight_regression(np.mat([[16.99, 10.34], [21.01,23.68],
-    ... [24.59,25.69]]),np.mat([[1.01, 1.66, 3.5]]), 0.6)
+    Calculate predictions for each data point on axis
+    >>> local_weight_regression(
+    ...     np.array([[16.99, 10.34], [21.01, 23.68], [24.59, 25.69]]),
+    ...     np.array([[1.01, 1.66, 3.5]]),
+    ...     0.6
+    ... )
     array([1.07173261, 1.65970737, 3.50160179])
     """
-    m, n = np.shape(training_data_x)
+    m, _ = np.shape(training_data_x)
     ypred = np.zeros(m)
 
     for i, item in enumerate(training_data_x):
-        ypred[i] = item * local_weight(
+        ypred[i] = item @ local_weight(
             item, training_data_x, training_data_y, bandwidth
         )
 
     return ypred
 
 
-def load_data(dataset_name: str, cola_name: str, colb_name: str) -> np.mat:
+def load_data(
+    dataset_name: str, cola_name: str, colb_name: str
+) -> tuple[np.array, np.array, np.array, np.array]:
     """
-    Function used for loading data from the seaborn splitting into x and y points
+    Load data from seaborn and split it into x and y points
     """
     import seaborn as sns
 
@@ -78,23 +88,25 @@ def load_data(dataset_name: str, cola_name: str, colb_name: str) -> np.mat:
     col_a = np.array(data[cola_name])  # total_bill
     col_b = np.array(data[colb_name])  # tip
 
-    mcol_a = np.mat(col_a)
-    mcol_b = np.mat(col_b)
+    mcol_a = col_a.copy()
+    mcol_b = col_b.copy()
 
-    m = np.shape(mcol_b)[1]
-    one = np.ones((1, m), dtype=int)
+    one = np.ones(np.shape(mcol_b)[0], dtype=int)
 
-    # horizontal stacking
-    training_data_x = np.hstack((one.T, mcol_a.T))
+    # pairing elements of one and mcol_a
+    training_data_x = np.column_stack((one, mcol_a))
 
     return training_data_x, mcol_b, col_a, col_b
 
 
-def get_preds(training_data_x: np.mat, mcol_b: np.mat, tau: float) -> np.ndarray:
+def get_preds(training_data_x: np.array, mcol_b: np.array, tau: float) -> np.array:
     """
     Get predictions with minimum error for each training data
-    >>> get_preds(np.mat([[16.99, 10.34], [21.01,23.68],
-    ... [24.59,25.69]]),np.mat([[1.01, 1.66, 3.5]]), 0.6)
+    >>> get_preds(
+    ...     np.array([[16.99, 10.34], [21.01, 23.68], [24.59, 25.69]]),
+    ...     np.array([[1.01, 1.66, 3.5]]),
+    ...     0.6
+    ... )
     array([1.07173261, 1.65970737, 3.50160179])
     """
     ypred = local_weight_regression(training_data_x, mcol_b, tau)
@@ -102,15 +114,15 @@ def get_preds(training_data_x: np.mat, mcol_b: np.mat, tau: float) -> np.ndarray
 
 
 def plot_preds(
-    training_data_x: np.mat,
-    predictions: np.ndarray,
-    col_x: np.ndarray,
-    col_y: np.ndarray,
+    training_data_x: np.array,
+    predictions: np.array,
+    col_x: np.array,
+    col_y: np.array,
     cola_name: str,
     colb_name: str,
 ) -> plt.plot:
     """
-    This function used to plot predictions and display the graph
+    Plot predictions and display the graph
     """
     xsort = training_data_x.copy()
     xsort.sort(axis=0)
@@ -128,6 +140,10 @@ def plot_preds(
 
 
 if __name__ == "__main__":
+    import doctest
+
+    doctest.testmod()
+
     training_data_x, mcol_b, col_a, col_b = load_data("tips", "total_bill", "tip")
     predictions = get_preds(training_data_x, mcol_b, 0.5)
     plot_preds(training_data_x, predictions, col_a, col_b, "total_bill", "tip")
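
Note on the operator changes: np.matrix overloads * as matrix multiplication and
exposes an .I property for the inverse, while plain ndarrays use the @ operator
and np.linalg.inv. That correspondence is why local_weight's
(x.T * (w * x)).I * (...) becomes np.linalg.inv(x.T @ (w @ x)) @ (...) in this
patch. A minimal sketch of the equivalence (not part of the patch; the example
matrices are made up for illustration):

    import numpy as np

    a = np.array([[4.0, 7.0], [2.0, 6.0]])  # any invertible matrix
    b = np.array([[1.0], [3.0]])

    m = np.matrix(a)                 # legacy class pending deprecation
    legacy = m.I * np.matrix(b)      # matrix semantics: .I inverse, * multiply
    modern = np.linalg.inv(a) @ b    # equivalent ndarray semantics

    assert np.allclose(legacy, modern)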
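
For reference, the weight that weighted_matrix assigns to each training point
x_j is the Gaussian kernel exp(-||x - x_j||^2 / (2 * tau^2)), and local_weight
then solves the weighted normal equations w = (X^T W X)^-1 X^T W y. A quick
sketch reproducing the first entry of the weighted_matrix doctest above
(inputs taken from the patch):

    import numpy as np

    point = np.array([1.0, 1.0])
    x_j = np.array([16.99, 10.34])  # first training sample in the doctest
    tau = 0.6                       # bandwidth

    diff = point - x_j
    weight = np.exp(diff @ diff.T / (-2.0 * tau**2))
    print(weight)                   # ~1.43807972e-207, as in the doctest

The vanishingly small value explains why the expected weight matrix is
effectively zero everywhere: with tau = 0.6, all three training points lie far
from the query point (1., 1.).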