diff --git a/machine_learning/local_weighted_learning/local_weighted_learning.py b/machine_learning/local_weighted_learning/local_weighted_learning.py index 6260e9ac6..8dd0e55d4 100644 --- a/machine_learning/local_weighted_learning/local_weighted_learning.py +++ b/machine_learning/local_weighted_learning/local_weighted_learning.py @@ -1,14 +1,55 @@ +""" +Locally weighted linear regression, also called local regression, is a type of +non-parametric linear regression that prioritizes data closest to a given +prediction point. The algorithm estimates the vector of model coefficients β +using weighted least squares regression: + +β = (XᵀWX)⁻¹(XᵀWy), + +where X is the design matrix, y is the response vector, and W is the diagonal +weight matrix. + +This implementation calculates wᵢ, the weight of the ith training sample, using +the Gaussian weight: + +wᵢ = exp(-‖xᵢ - x‖²/(2τ²)), + +where xᵢ is the ith training sample, x is the prediction point, τ is the +"bandwidth", and ‖x‖ is the Euclidean norm (also called the 2-norm or the L² +norm). The bandwidth τ controls how quickly the weight of a training sample +decreases as its distance from the prediction point increases. One can think of +the Gaussian weight as a bell curve centered around the prediction point: a +training sample is weighted lower if it's farther from the center, and τ +controls the spread of the bell curve. + +Other types of locally weighted regression such as locally estimated scatterplot +smoothing (LOESS) typically use different weight functions. + +References: + - https://en.wikipedia.org/wiki/Local_regression + - https://en.wikipedia.org/wiki/Weighted_least_squares + - https://cs229.stanford.edu/notes2022fall/main_notes.pdf +""" + import matplotlib.pyplot as plt import numpy as np -def weighted_matrix( - point: np.array, training_data_x: np.array, bandwidth: float -) -> np.array: +def weight_matrix(point: np.ndarray, x_train: np.ndarray, tau: float) -> np.ndarray: """ - Calculate the weight for every point in the data set. - point --> the x value at which we want to make predictions - >>> weighted_matrix( + Calculate the weight of every point in the training data around a given + prediction point + + Args: + point: x-value at which the prediction is being made + x_train: ndarray of x-values for training + tau: bandwidth value, controls how quickly the weight of training values + decreases as the distance from the prediction point increases + + Returns: + m x m weight matrix around the prediction point, where m is the size of + the training set + >>> weight_matrix( ... np.array([1., 1.]), ... np.array([[16.99, 10.34], [21.01,23.68], [24.59,25.69]]), ... 0.6 @@ -17,25 +58,30 @@ def weighted_matrix( [0.00000000e+000, 0.00000000e+000, 0.00000000e+000], [0.00000000e+000, 0.00000000e+000, 0.00000000e+000]]) """ - m, _ = np.shape(training_data_x) # m is the number of training samples - weights = np.eye(m) # Initializing weights as identity matrix - - # calculating weights for all training examples [x(i)'s] + m = len(x_train) # Number of training samples + weights = np.eye(m) # Initialize weights as identity matrix for j in range(m): - diff = point - training_data_x[j] - weights[j, j] = np.exp(diff @ diff.T / (-2.0 * bandwidth**2)) + diff = point - x_train[j] + weights[j, j] = np.exp(diff @ diff.T / (-2.0 * tau**2)) + return weights def local_weight( - point: np.array, - training_data_x: np.array, - training_data_y: np.array, - bandwidth: float, -) -> np.array: + point: np.ndarray, x_train: np.ndarray, y_train: np.ndarray, tau: float +) -> np.ndarray: """ - Calculate the local weights using the weight_matrix function on training data. - Return the weighted matrix. + Calculate the local weights at a given prediction point using the weight + matrix for that point + + Args: + point: x-value at which the prediction is being made + x_train: ndarray of x-values for training + y_train: ndarray of y-values for training + tau: bandwidth value, controls how quickly the weight of training values + decreases as the distance from the prediction point increases + Returns: + ndarray of local weights >>> local_weight( ... np.array([1., 1.]), ... np.array([[16.99, 10.34], [21.01,23.68], [24.59,25.69]]), @@ -45,19 +91,28 @@ def local_weight( array([[0.00873174], [0.08272556]]) """ - weight = weighted_matrix(point, training_data_x, bandwidth) - w = np.linalg.inv(training_data_x.T @ (weight @ training_data_x)) @ ( - training_data_x.T @ weight @ training_data_y.T + weight_mat = weight_matrix(point, x_train, tau) + weight = np.linalg.inv(x_train.T @ weight_mat @ x_train) @ ( + x_train.T @ weight_mat @ y_train.T ) - return w + return weight def local_weight_regression( - training_data_x: np.array, training_data_y: np.array, bandwidth: float -) -> np.array: + x_train: np.ndarray, y_train: np.ndarray, tau: float +) -> np.ndarray: """ - Calculate predictions for each data point on axis + Calculate predictions for each point in the training data + + Args: + x_train: ndarray of x-values for training + y_train: ndarray of y-values for training + tau: bandwidth value, controls how quickly the weight of training values + decreases as the distance from the prediction point increases + + Returns: + ndarray of predictions >>> local_weight_regression( ... np.array([[16.99, 10.34], [21.01, 23.68], [24.59, 25.69]]), ... np.array([[1.01, 1.66, 3.5]]), @@ -65,77 +120,57 @@ def local_weight_regression( ... ) array([1.07173261, 1.65970737, 3.50160179]) """ - m, _ = np.shape(training_data_x) - ypred = np.zeros(m) + y_pred = np.zeros(len(x_train)) # Initialize array of predictions + for i, item in enumerate(x_train): + y_pred[i] = item @ local_weight(item, x_train, y_train, tau) - for i, item in enumerate(training_data_x): - ypred[i] = item @ local_weight( - item, training_data_x, training_data_y, bandwidth - ) - - return ypred + return y_pred def load_data( - dataset_name: str, cola_name: str, colb_name: str -) -> tuple[np.array, np.array, np.array, np.array]: + dataset_name: str, x_name: str, y_name: str +) -> tuple[np.ndarray, np.ndarray, np.ndarray]: """ Load data from seaborn and split it into x and y points + >>> pass # No doctests, function is for demo purposes only """ import seaborn as sns data = sns.load_dataset(dataset_name) - col_a = np.array(data[cola_name]) # total_bill - col_b = np.array(data[colb_name]) # tip + x_data = np.array(data[x_name]) + y_data = np.array(data[y_name]) - mcol_a = col_a.copy() - mcol_b = col_b.copy() + one = np.ones(len(y_data)) - one = np.ones(np.shape(mcol_b)[0], dtype=int) + # pairing elements of one and x_data + x_train = np.column_stack((one, x_data)) - # pairing elements of one and mcol_a - training_data_x = np.column_stack((one, mcol_a)) - - return training_data_x, mcol_b, col_a, col_b - - -def get_preds(training_data_x: np.array, mcol_b: np.array, tau: float) -> np.array: - """ - Get predictions with minimum error for each training data - >>> get_preds( - ... np.array([[16.99, 10.34], [21.01, 23.68], [24.59, 25.69]]), - ... np.array([[1.01, 1.66, 3.5]]), - ... 0.6 - ... ) - array([1.07173261, 1.65970737, 3.50160179]) - """ - ypred = local_weight_regression(training_data_x, mcol_b, tau) - return ypred + return x_train, x_data, y_data def plot_preds( - training_data_x: np.array, - predictions: np.array, - col_x: np.array, - col_y: np.array, - cola_name: str, - colb_name: str, -) -> plt.plot: + x_train: np.ndarray, + preds: np.ndarray, + x_data: np.ndarray, + y_data: np.ndarray, + x_name: str, + y_name: str, +) -> None: """ Plot predictions and display the graph + >>> pass # No doctests, function is for demo purposes only """ - xsort = training_data_x.copy() - xsort.sort(axis=0) - plt.scatter(col_x, col_y, color="blue") + x_train_sorted = np.sort(x_train, axis=0) + plt.scatter(x_data, y_data, color="blue") plt.plot( - xsort[:, 1], - predictions[training_data_x[:, 1].argsort(0)], + x_train_sorted[:, 1], + preds[x_train[:, 1].argsort(0)], color="yellow", linewidth=5, ) plt.title("Local Weighted Regression") - plt.xlabel(cola_name) - plt.ylabel(colb_name) + plt.xlabel(x_name) + plt.ylabel(y_name) plt.show() @@ -144,6 +179,7 @@ if __name__ == "__main__": doctest.testmod() - training_data_x, mcol_b, col_a, col_b = load_data("tips", "total_bill", "tip") - predictions = get_preds(training_data_x, mcol_b, 0.5) - plot_preds(training_data_x, predictions, col_a, col_b, "total_bill", "tip") + # Demo with a dataset from the seaborn module + training_data_x, total_bill, tip = load_data("tips", "total_bill", "tip") + predictions = local_weight_regression(training_data_x, tip, 5) + plot_preds(training_data_x, predictions, total_bill, tip, "total_bill", "tip")