From e406801f9e3967ff0533dfe8cb98a3249db48d33 Mon Sep 17 00:00:00 2001 From: Tianyi Zheng Date: Fri, 28 Jul 2023 11:17:46 -0700 Subject: [PATCH] Reimplement polynomial_regression.py (#8889) * Reimplement polynomial_regression.py Rename machine_learning/polymonial_regression.py to machine_learning/polynomial_regression.py Reimplement machine_learning/polynomial_regression.py using numpy because the old original implementation was just a how-to on doing polynomial regression using sklearn Add detailed function documentation, doctests, and algorithm explanation * updating DIRECTORY.md * Fix matrix formatting in docstrings * Try to fix failing doctest * Debugging failing doctest * Fix failing doctest attempt 2 * Remove unnecessary return value descriptions in docstrings * Readd placeholder doctest for main function * Fix typo in algorithm description --------- Co-authored-by: github-actions <${GITHUB_ACTOR}@users.noreply.github.com> --- DIRECTORY.md | 2 +- machine_learning/polymonial_regression.py | 44 ----- machine_learning/polynomial_regression.py | 213 ++++++++++++++++++++++ 3 files changed, 214 insertions(+), 45 deletions(-) delete mode 100644 machine_learning/polymonial_regression.py create mode 100644 machine_learning/polynomial_regression.py diff --git a/DIRECTORY.md b/DIRECTORY.md index 77938f450..133a1ab01 100644 --- a/DIRECTORY.md +++ b/DIRECTORY.md @@ -511,7 +511,7 @@ * Lstm * [Lstm Prediction](machine_learning/lstm/lstm_prediction.py) * [Multilayer Perceptron Classifier](machine_learning/multilayer_perceptron_classifier.py) - * [Polymonial Regression](machine_learning/polymonial_regression.py) + * [Polynomial Regression](machine_learning/polynomial_regression.py) * [Scoring Functions](machine_learning/scoring_functions.py) * [Self Organizing Map](machine_learning/self_organizing_map.py) * [Sequential Minimum Optimization](machine_learning/sequential_minimum_optimization.py) diff --git a/machine_learning/polymonial_regression.py b/machine_learning/polymonial_regression.py deleted file mode 100644 index 487fb8145..000000000 --- a/machine_learning/polymonial_regression.py +++ /dev/null @@ -1,44 +0,0 @@ -import pandas as pd -from matplotlib import pyplot as plt -from sklearn.linear_model import LinearRegression - -# Splitting the dataset into the Training set and Test set -from sklearn.model_selection import train_test_split - -# Fitting Polynomial Regression to the dataset -from sklearn.preprocessing import PolynomialFeatures - -# Importing the dataset -dataset = pd.read_csv( - "https://s3.us-west-2.amazonaws.com/public.gamelab.fun/dataset/" - "position_salaries.csv" -) -X = dataset.iloc[:, 1:2].values -y = dataset.iloc[:, 2].values - - -X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0) - - -poly_reg = PolynomialFeatures(degree=4) -X_poly = poly_reg.fit_transform(X) -pol_reg = LinearRegression() -pol_reg.fit(X_poly, y) - - -# Visualizing the Polymonial Regression results -def viz_polymonial(): - plt.scatter(X, y, color="red") - plt.plot(X, pol_reg.predict(poly_reg.fit_transform(X)), color="blue") - plt.title("Truth or Bluff (Linear Regression)") - plt.xlabel("Position level") - plt.ylabel("Salary") - plt.show() - - -if __name__ == "__main__": - viz_polymonial() - - # Predicting a new result with Polymonial Regression - pol_reg.predict(poly_reg.fit_transform([[5.5]])) - # output should be 132148.43750003 diff --git a/machine_learning/polynomial_regression.py b/machine_learning/polynomial_regression.py new file mode 100644 index 000000000..5bafea96f --- /dev/null +++ b/machine_learning/polynomial_regression.py @@ -0,0 +1,213 @@ +""" +Polynomial regression is a type of regression analysis that models the relationship +between a predictor x and the response y as an mth-degree polynomial: + +y = β₀ + β₁x + β₂x² + ... + βₘxᵐ + ε + +By treating x, x², ..., xᵐ as distinct variables, we see that polynomial regression is a +special case of multiple linear regression. Therefore, we can use ordinary least squares +(OLS) estimation to estimate the vector of model parameters β = (β₀, β₁, β₂, ..., βₘ) +for polynomial regression: + +β = (XᵀX)⁻¹Xᵀy = X⁺y + +where X is the design matrix, y is the response vector, and X⁺ denotes the Moore–Penrose +pseudoinverse of X. In the case of polynomial regression, the design matrix is + + |1 x₁ x₁² ⋯ x₁ᵐ| +X = |1 x₂ x₂² ⋯ x₂ᵐ| + |⋮ ⋮ ⋮ ⋱ ⋮ | + |1 xₙ xₙ² ⋯ xₙᵐ| + +In OLS estimation, inverting XᵀX to compute X⁺ can be very numerically unstable. This +implementation sidesteps this need to invert XᵀX by computing X⁺ using singular value +decomposition (SVD): + +β = VΣ⁺Uᵀy + +where UΣVᵀ is an SVD of X. + +References: + - https://en.wikipedia.org/wiki/Polynomial_regression + - https://en.wikipedia.org/wiki/Moore%E2%80%93Penrose_inverse + - https://en.wikipedia.org/wiki/Numerical_methods_for_linear_least_squares + - https://en.wikipedia.org/wiki/Singular_value_decomposition +""" + +import matplotlib.pyplot as plt +import numpy as np + + +class PolynomialRegression: + __slots__ = "degree", "params" + + def __init__(self, degree: int) -> None: + """ + @raises ValueError: if the polynomial degree is negative + """ + if degree < 0: + raise ValueError("Polynomial degree must be non-negative") + + self.degree = degree + self.params = None + + @staticmethod + def _design_matrix(data: np.ndarray, degree: int) -> np.ndarray: + """ + Constructs a polynomial regression design matrix for the given input data. For + input data x = (x₁, x₂, ..., xₙ) and polynomial degree m, the design matrix is + the Vandermonde matrix + + |1 x₁ x₁² ⋯ x₁ᵐ| + X = |1 x₂ x₂² ⋯ x₂ᵐ| + |⋮ ⋮ ⋮ ⋱ ⋮ | + |1 xₙ xₙ² ⋯ xₙᵐ| + + Reference: https://en.wikipedia.org/wiki/Vandermonde_matrix + + @param data: the input predictor values x, either for model fitting or for + prediction + @param degree: the polynomial degree m + @returns: the Vandermonde matrix X (see above) + @raises ValueError: if input data is not N x 1 + + >>> x = np.array([0, 1, 2]) + >>> PolynomialRegression._design_matrix(x, degree=0) + array([[1], + [1], + [1]]) + >>> PolynomialRegression._design_matrix(x, degree=1) + array([[1, 0], + [1, 1], + [1, 2]]) + >>> PolynomialRegression._design_matrix(x, degree=2) + array([[1, 0, 0], + [1, 1, 1], + [1, 2, 4]]) + >>> PolynomialRegression._design_matrix(x, degree=3) + array([[1, 0, 0, 0], + [1, 1, 1, 1], + [1, 2, 4, 8]]) + >>> PolynomialRegression._design_matrix(np.array([[0, 0], [0 , 0]]), degree=3) + Traceback (most recent call last): + ... + ValueError: Data must have dimensions N x 1 + """ + rows, *remaining = data.shape + if remaining: + raise ValueError("Data must have dimensions N x 1") + + return np.vander(data, N=degree + 1, increasing=True) + + def fit(self, x_train: np.ndarray, y_train: np.ndarray) -> None: + """ + Computes the polynomial regression model parameters using ordinary least squares + (OLS) estimation: + + β = (XᵀX)⁻¹Xᵀy = X⁺y + + where X⁺ denotes the Moore–Penrose pseudoinverse of the design matrix X. This + function computes X⁺ using singular value decomposition (SVD). + + References: + - https://en.wikipedia.org/wiki/Moore%E2%80%93Penrose_inverse + - https://en.wikipedia.org/wiki/Singular_value_decomposition + - https://en.wikipedia.org/wiki/Multicollinearity + + @param x_train: the predictor values x for model fitting + @param y_train: the response values y for model fitting + @raises ArithmeticError: if X isn't full rank, then XᵀX is singular and β + doesn't exist + + >>> x = np.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]) + >>> y = x**3 - 2 * x**2 + 3 * x - 5 + >>> poly_reg = PolynomialRegression(degree=3) + >>> poly_reg.fit(x, y) + >>> poly_reg.params + array([-5., 3., -2., 1.]) + >>> poly_reg = PolynomialRegression(degree=20) + >>> poly_reg.fit(x, y) + Traceback (most recent call last): + ... + ArithmeticError: Design matrix is not full rank, can't compute coefficients + + Make sure errors don't grow too large: + >>> coefs = np.array([-250, 50, -2, 36, 20, -12, 10, 2, -1, -15, 1]) + >>> y = PolynomialRegression._design_matrix(x, len(coefs) - 1) @ coefs + >>> poly_reg = PolynomialRegression(degree=len(coefs) - 1) + >>> poly_reg.fit(x, y) + >>> np.allclose(poly_reg.params, coefs, atol=10e-3) + True + """ + X = PolynomialRegression._design_matrix(x_train, self.degree) # noqa: N806 + _, cols = X.shape + if np.linalg.matrix_rank(X) < cols: + raise ArithmeticError( + "Design matrix is not full rank, can't compute coefficients" + ) + + # np.linalg.pinv() computes the Moore–Penrose pseudoinverse using SVD + self.params = np.linalg.pinv(X) @ y_train + + def predict(self, data: np.ndarray) -> np.ndarray: + """ + Computes the predicted response values y for the given input data by + constructing the design matrix X and evaluating y = Xβ. + + @param data: the predictor values x for prediction + @returns: the predicted response values y = Xβ + @raises ArithmeticError: if this function is called before the model + parameters are fit + + >>> x = np.array([0, 1, 2, 3, 4]) + >>> y = x**3 - 2 * x**2 + 3 * x - 5 + >>> poly_reg = PolynomialRegression(degree=3) + >>> poly_reg.fit(x, y) + >>> poly_reg.predict(np.array([-1])) + array([-11.]) + >>> poly_reg.predict(np.array([-2])) + array([-27.]) + >>> poly_reg.predict(np.array([6])) + array([157.]) + >>> PolynomialRegression(degree=3).predict(x) + Traceback (most recent call last): + ... + ArithmeticError: Predictor hasn't been fit yet + """ + if self.params is None: + raise ArithmeticError("Predictor hasn't been fit yet") + + return PolynomialRegression._design_matrix(data, self.degree) @ self.params + + +def main() -> None: + """ + Fit a polynomial regression model to predict fuel efficiency using seaborn's mpg + dataset + + >>> pass # Placeholder, function is only for demo purposes + """ + import seaborn as sns + + mpg_data = sns.load_dataset("mpg") + + poly_reg = PolynomialRegression(degree=2) + poly_reg.fit(mpg_data.weight, mpg_data.mpg) + + weight_sorted = np.sort(mpg_data.weight) + predictions = poly_reg.predict(weight_sorted) + + plt.scatter(mpg_data.weight, mpg_data.mpg, color="gray", alpha=0.5) + plt.plot(weight_sorted, predictions, color="red", linewidth=3) + plt.title("Predicting Fuel Efficiency Using Polynomial Regression") + plt.xlabel("Weight (lbs)") + plt.ylabel("Fuel Efficiency (mpg)") + plt.show() + + +if __name__ == "__main__": + import doctest + + doctest.testmod() + + main()