From e406801f9e3967ff0533dfe8cb98a3249db48d33 Mon Sep 17 00:00:00 2001
From: Tianyi Zheng <tianyizheng02@gmail.com>
Date: Fri, 28 Jul 2023 11:17:46 -0700
Subject: [PATCH] Reimplement polynomial_regression.py (#8889)

* Reimplement polynomial_regression.py

Rename machine_learning/polymonial_regression.py to
machine_learning/polynomial_regression.py

Reimplement machine_learning/polynomial_regression.py using numpy
because the old original implementation was just a how-to on doing
polynomial regression using sklearn

Add detailed function documentation, doctests, and algorithm
explanation

* updating DIRECTORY.md

* Fix matrix formatting in docstrings

* Try to fix failing doctest

* Debugging failing doctest

* Fix failing doctest attempt 2

* Remove unnecessary return value descriptions in docstrings

* Readd placeholder doctest for main function

* Fix typo in algorithm description

---------

Co-authored-by: github-actions <${GITHUB_ACTOR}@users.noreply.github.com>
---
 DIRECTORY.md                              |   2 +-
 machine_learning/polymonial_regression.py |  44 -----
 machine_learning/polynomial_regression.py | 213 ++++++++++++++++++++++
 3 files changed, 214 insertions(+), 45 deletions(-)
 delete mode 100644 machine_learning/polymonial_regression.py
 create mode 100644 machine_learning/polynomial_regression.py

diff --git a/DIRECTORY.md b/DIRECTORY.md
index 77938f450..133a1ab01 100644
--- a/DIRECTORY.md
+++ b/DIRECTORY.md
@@ -511,7 +511,7 @@
   * Lstm
     * [Lstm Prediction](machine_learning/lstm/lstm_prediction.py)
   * [Multilayer Perceptron Classifier](machine_learning/multilayer_perceptron_classifier.py)
-  * [Polymonial Regression](machine_learning/polymonial_regression.py)
+  * [Polynomial Regression](machine_learning/polynomial_regression.py)
   * [Scoring Functions](machine_learning/scoring_functions.py)
   * [Self Organizing Map](machine_learning/self_organizing_map.py)
   * [Sequential Minimum Optimization](machine_learning/sequential_minimum_optimization.py)
diff --git a/machine_learning/polymonial_regression.py b/machine_learning/polymonial_regression.py
deleted file mode 100644
index 487fb8145..000000000
--- a/machine_learning/polymonial_regression.py
+++ /dev/null
@@ -1,44 +0,0 @@
-import pandas as pd
-from matplotlib import pyplot as plt
-from sklearn.linear_model import LinearRegression
-
-# Splitting the dataset into the Training set and Test set
-from sklearn.model_selection import train_test_split
-
-# Fitting Polynomial Regression to the dataset
-from sklearn.preprocessing import PolynomialFeatures
-
-# Importing the dataset
-dataset = pd.read_csv(
-    "https://s3.us-west-2.amazonaws.com/public.gamelab.fun/dataset/"
-    "position_salaries.csv"
-)
-X = dataset.iloc[:, 1:2].values
-y = dataset.iloc[:, 2].values
-
-
-X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
-
-
-poly_reg = PolynomialFeatures(degree=4)
-X_poly = poly_reg.fit_transform(X)
-pol_reg = LinearRegression()
-pol_reg.fit(X_poly, y)
-
-
-# Visualizing the Polymonial Regression results
-def viz_polymonial():
-    plt.scatter(X, y, color="red")
-    plt.plot(X, pol_reg.predict(poly_reg.fit_transform(X)), color="blue")
-    plt.title("Truth or Bluff (Linear Regression)")
-    plt.xlabel("Position level")
-    plt.ylabel("Salary")
-    plt.show()
-
-
-if __name__ == "__main__":
-    viz_polymonial()
-
-    # Predicting a new result with Polymonial Regression
-    pol_reg.predict(poly_reg.fit_transform([[5.5]]))
-    # output should be 132148.43750003
diff --git a/machine_learning/polynomial_regression.py b/machine_learning/polynomial_regression.py
new file mode 100644
index 000000000..5bafea96f
--- /dev/null
+++ b/machine_learning/polynomial_regression.py
@@ -0,0 +1,213 @@
+"""
+Polynomial regression is a type of regression analysis that models the relationship
+between a predictor x and the response y as an mth-degree polynomial:
+
+y = β₀ + β₁x + β₂x² + ... + βₘxᵐ + ε
+
+By treating x, x², ..., xᵐ as distinct variables, we see that polynomial regression is a
+special case of multiple linear regression. Therefore, we can use ordinary least squares
+(OLS) estimation to estimate the vector of model parameters β = (β₀, β₁, β₂, ..., βₘ)
+for polynomial regression:
+
+β = (XᵀX)⁻¹Xᵀy = X⁺y
+
+where X is the design matrix, y is the response vector, and X⁺ denotes the Moore–Penrose
+pseudoinverse of X. In the case of polynomial regression, the design matrix is
+
+    |1  x₁  x₁² ⋯ x₁ᵐ|
+X = |1  x₂  x₂² ⋯ x₂ᵐ|
+    |⋮  ⋮   ⋮   ⋱ ⋮  |
+    |1  xₙ  xₙ² ⋯  xₙᵐ|
+
+In OLS estimation, inverting XᵀX to compute X⁺ can be very numerically unstable. This
+implementation sidesteps this need to invert XᵀX by computing X⁺ using singular value
+decomposition (SVD):
+
+β = VΣ⁺Uᵀy
+
+where UΣVᵀ is an SVD of X.
+
+References:
+    - https://en.wikipedia.org/wiki/Polynomial_regression
+    - https://en.wikipedia.org/wiki/Moore%E2%80%93Penrose_inverse
+    - https://en.wikipedia.org/wiki/Numerical_methods_for_linear_least_squares
+    - https://en.wikipedia.org/wiki/Singular_value_decomposition
+"""
+
+import matplotlib.pyplot as plt
+import numpy as np
+
+
+class PolynomialRegression:
+    __slots__ = "degree", "params"
+
+    def __init__(self, degree: int) -> None:
+        """
+        @raises ValueError: if the polynomial degree is negative
+        """
+        if degree < 0:
+            raise ValueError("Polynomial degree must be non-negative")
+
+        self.degree = degree
+        self.params = None
+
+    @staticmethod
+    def _design_matrix(data: np.ndarray, degree: int) -> np.ndarray:
+        """
+        Constructs a polynomial regression design matrix for the given input data. For
+        input data x = (x₁, x₂, ..., xₙ) and polynomial degree m, the design matrix is
+        the Vandermonde matrix
+
+            |1  x₁  x₁² ⋯ x₁ᵐ|
+        X = |1  x₂  x₂² ⋯ x₂ᵐ|
+            |⋮  ⋮   ⋮   ⋱ ⋮  |
+            |1  xₙ  xₙ² ⋯  xₙᵐ|
+
+        Reference: https://en.wikipedia.org/wiki/Vandermonde_matrix
+
+        @param data:    the input predictor values x, either for model fitting or for
+                        prediction
+        @param degree:  the polynomial degree m
+        @returns:       the Vandermonde matrix X (see above)
+        @raises ValueError: if input data is not N x 1
+
+        >>> x = np.array([0, 1, 2])
+        >>> PolynomialRegression._design_matrix(x, degree=0)
+        array([[1],
+               [1],
+               [1]])
+        >>> PolynomialRegression._design_matrix(x, degree=1)
+        array([[1, 0],
+               [1, 1],
+               [1, 2]])
+        >>> PolynomialRegression._design_matrix(x, degree=2)
+        array([[1, 0, 0],
+               [1, 1, 1],
+               [1, 2, 4]])
+        >>> PolynomialRegression._design_matrix(x, degree=3)
+        array([[1, 0, 0, 0],
+               [1, 1, 1, 1],
+               [1, 2, 4, 8]])
+        >>> PolynomialRegression._design_matrix(np.array([[0, 0], [0 , 0]]), degree=3)
+        Traceback (most recent call last):
+        ...
+        ValueError: Data must have dimensions N x 1
+        """
+        rows, *remaining = data.shape
+        if remaining:
+            raise ValueError("Data must have dimensions N x 1")
+
+        return np.vander(data, N=degree + 1, increasing=True)
+
+    def fit(self, x_train: np.ndarray, y_train: np.ndarray) -> None:
+        """
+        Computes the polynomial regression model parameters using ordinary least squares
+        (OLS) estimation:
+
+        β = (XᵀX)⁻¹Xᵀy = X⁺y
+
+        where X⁺ denotes the Moore–Penrose pseudoinverse of the design matrix X. This
+        function computes X⁺ using singular value decomposition (SVD).
+
+        References:
+            - https://en.wikipedia.org/wiki/Moore%E2%80%93Penrose_inverse
+            - https://en.wikipedia.org/wiki/Singular_value_decomposition
+            - https://en.wikipedia.org/wiki/Multicollinearity
+
+        @param x_train: the predictor values x for model fitting
+        @param y_train: the response values y for model fitting
+        @raises ArithmeticError:    if X isn't full rank, then XᵀX is singular and β
+                                    doesn't exist
+
+        >>> x = np.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
+        >>> y = x**3 - 2 * x**2 + 3 * x - 5
+        >>> poly_reg = PolynomialRegression(degree=3)
+        >>> poly_reg.fit(x, y)
+        >>> poly_reg.params
+        array([-5.,  3., -2.,  1.])
+        >>> poly_reg = PolynomialRegression(degree=20)
+        >>> poly_reg.fit(x, y)
+        Traceback (most recent call last):
+        ...
+        ArithmeticError: Design matrix is not full rank, can't compute coefficients
+
+        Make sure errors don't grow too large:
+        >>> coefs = np.array([-250, 50, -2, 36, 20, -12, 10, 2, -1, -15, 1])
+        >>> y = PolynomialRegression._design_matrix(x, len(coefs) - 1) @ coefs
+        >>> poly_reg = PolynomialRegression(degree=len(coefs) - 1)
+        >>> poly_reg.fit(x, y)
+        >>> np.allclose(poly_reg.params, coefs, atol=10e-3)
+        True
+        """
+        X = PolynomialRegression._design_matrix(x_train, self.degree)  # noqa: N806
+        _, cols = X.shape
+        if np.linalg.matrix_rank(X) < cols:
+            raise ArithmeticError(
+                "Design matrix is not full rank, can't compute coefficients"
+            )
+
+        # np.linalg.pinv() computes the Moore–Penrose pseudoinverse using SVD
+        self.params = np.linalg.pinv(X) @ y_train
+
+    def predict(self, data: np.ndarray) -> np.ndarray:
+        """
+        Computes the predicted response values y for the given input data by
+        constructing the design matrix X and evaluating y = Xβ.
+
+        @param data:    the predictor values x for prediction
+        @returns:       the predicted response values y = Xβ
+        @raises ArithmeticError:    if this function is called before the model
+                                    parameters are fit
+
+        >>> x = np.array([0, 1, 2, 3, 4])
+        >>> y = x**3 - 2 * x**2 + 3 * x - 5
+        >>> poly_reg = PolynomialRegression(degree=3)
+        >>> poly_reg.fit(x, y)
+        >>> poly_reg.predict(np.array([-1]))
+        array([-11.])
+        >>> poly_reg.predict(np.array([-2]))
+        array([-27.])
+        >>> poly_reg.predict(np.array([6]))
+        array([157.])
+        >>> PolynomialRegression(degree=3).predict(x)
+        Traceback (most recent call last):
+        ...
+        ArithmeticError: Predictor hasn't been fit yet
+        """
+        if self.params is None:
+            raise ArithmeticError("Predictor hasn't been fit yet")
+
+        return PolynomialRegression._design_matrix(data, self.degree) @ self.params
+
+
+def main() -> None:
+    """
+    Fit a polynomial regression model to predict fuel efficiency using seaborn's mpg
+    dataset
+
+    >>> pass    # Placeholder, function is only for demo purposes
+    """
+    import seaborn as sns
+
+    mpg_data = sns.load_dataset("mpg")
+
+    poly_reg = PolynomialRegression(degree=2)
+    poly_reg.fit(mpg_data.weight, mpg_data.mpg)
+
+    weight_sorted = np.sort(mpg_data.weight)
+    predictions = poly_reg.predict(weight_sorted)
+
+    plt.scatter(mpg_data.weight, mpg_data.mpg, color="gray", alpha=0.5)
+    plt.plot(weight_sorted, predictions, color="red", linewidth=3)
+    plt.title("Predicting Fuel Efficiency Using Polynomial Regression")
+    plt.xlabel("Weight (lbs)")
+    plt.ylabel("Fuel Efficiency (mpg)")
+    plt.show()
+
+
+if __name__ == "__main__":
+    import doctest
+
+    doctest.testmod()
+
+    main()