Merge d7a0b9d245 into e3bd7721c8

2024-11-23 21:11:08 +00:00 · 2024-11-19 00:03:39 +05:30 · 2024-11-19 00:03:39 +05:30 · 632f4845aa
commit 632f4845aa
parent e3bd7721c8 d7a0b9d245
8 changed files with 1278 additions and 5 deletions
--- a/data_structures/arrays/kadanes_algorithm.py
+++ b/data_structures/arrays/kadanes_algorithm.py
@ -0,0 +1,42 @@
+# Kadane's algorithm
+
+
+def kadanes_algorithm(arr: list[int]) -> int:
+    """
+    Return the largest sum of any non-empty contiguous subarray of ``arr``,
+    computed with Kadane's algorithm in a single pass.
+
+    :param arr: non-empty list of integers
+    :return: the maximum sum over all non-empty contiguous subarrays
+    :raises ValueError: if ``arr`` is empty
+
+    >>> kadanes_algorithm([-2, 1, -3, 4, -1, 2, 1, -5, 4])
+    6
+
+    >>> kadanes_algorithm([-1, -2, -3, -4])
+    -1
+
+    >>> kadanes_algorithm([5, 4, -1, 7, 8])
+    23
+
+    >>> kadanes_algorithm([1])
+    1
+
+    >>> kadanes_algorithm([-1, 2, 3, -5, 4])
+    5
+
+    >>> kadanes_algorithm([])
+    Traceback (most recent call last):
+        ...
+    ValueError: arr must contain at least one element
+    """
+    if not arr:
+        # a subarray must be non-empty, so there is no meaningful answer here
+        raise ValueError("arr must contain at least one element")
+
+    values = iter(arr)
+    # best_ending_here: best sum of a subarray that ends at the current element
+    # best_overall: best sum found at any position so far
+    best_ending_here = best_overall = next(values)
+
+    for value in values:
+        # either extend the running subarray or restart it at ``value``
+        best_ending_here = max(value, best_ending_here + value)
+        best_overall = max(best_overall, best_ending_here)
+
+    return best_overall
+
+
+if __name__ == "__main__":
+    # run the examples embedded in this module's docstrings
+    from doctest import testmod
+
+    testmod()
--- a/data_structures/stacks/largest_rectangle_histogram.py
+++ b/data_structures/stacks/largest_rectangle_histogram.py
@ -0,0 +1,39 @@
+def largest_rectangle_area(heights: list[int]) -> int:
+    """
+    Return the area of the largest rectangle that fits under a histogram whose
+    bars have unit width and the given heights.
+
+    >>> largest_rectangle_area([2, 1, 5, 6, 2, 3])
+    10
+
+    >>> largest_rectangle_area([2, 4])
+    4
+
+    >>> largest_rectangle_area([6, 2, 5, 4, 5, 1, 6])
+    12
+
+    >>> largest_rectangle_area([1])
+    1
+    """
+    best = 0
+    # indices of bars still awaiting a lower bar on their right; the heights
+    # at these indices never decrease from bottom to top of the stack
+    open_bars: list[int] = []
+    # a trailing zero-height bar forces every remaining bar to be resolved
+    padded = [*heights, 0]
+
+    for right, current in enumerate(padded):
+        while open_bars and padded[open_bars[-1]] > current:
+            top = open_bars.pop()
+            # the rectangle of height padded[top] spans the open interval
+            # (left, right); left is -1 when nothing lower lies to the left
+            left = open_bars[-1] if open_bars else -1
+            best = max(best, padded[top] * (right - left - 1))
+
+        open_bars.append(right)
+
+    return best
+
+
+if __name__ == "__main__":
+    # run the examples embedded in this module's docstrings
+    from doctest import testmod
+
+    testmod()
--- a/machine_learning/frequent_pattern_growth.py
+++ b/machine_learning/frequent_pattern_growth.py
@ -240,7 +240,7 @@ def ascend_tree(leaf_node: TreeNode, prefix_path: list[str]) -> None:
        ascend_tree(leaf_node.parent, prefix_path)


-def find_prefix_path(base_pat: frozenset, tree_node: TreeNode | None) -> dict:  # noqa: ARG001
+def find_prefix_path(_: frozenset, tree_node: TreeNode | None) -> dict:
    """
    Find the conditional pattern base for a given base pattern.

--- a/machine_learning/loss_functions.py
+++ b/machine_learning/loss_functions.py
@ -629,13 +629,15 @@ def smooth_l1_loss(y_true: np.ndarray, y_pred: np.ndarray, beta: float = 1.0) ->
    return np.mean(loss)


-def kullback_leibler_divergence(y_true: np.ndarray, y_pred: np.ndarray) -> float:
+def kullback_leibler_divergence(
+    y_true: np.ndarray, y_pred: np.ndarray, epsilon: float = 1e-10
+) -> float:
    """
    Calculate the Kullback-Leibler divergence (KL divergence) loss between true labels
    and predicted probabilities.

-    KL divergence loss quantifies dissimilarity between true labels and predicted
-    probabilities. It's often used in training generative models.
+    KL divergence loss quantifies the dissimilarity between true labels and predicted
+    probabilities. It is often used in training generative models.

    KL = Σ(y_true * ln(y_true / y_pred))

@ -649,6 +651,7 @@ def kullback_leibler_divergence(y_true: np.ndarray, y_pred: np.ndarray) -> float
    >>> predicted_probs = np.array([0.3, 0.3, 0.4])
    >>> float(kullback_leibler_divergence(true_labels, predicted_probs))
    0.030478754035472025
+
    >>> true_labels = np.array([0.2, 0.3, 0.5])
    >>> predicted_probs = np.array([0.3, 0.3, 0.4, 0.5])
    >>> kullback_leibler_divergence(true_labels, predicted_probs)
@ -659,7 +662,13 @@ def kullback_leibler_divergence(y_true: np.ndarray, y_pred: np.ndarray) -> float
    if len(y_true) != len(y_pred):
        raise ValueError("Input arrays must have the same length.")

-    kl_loss = y_true * np.log(y_true / y_pred)
+    # negligible epsilon to avoid issues with log(0) or division by zero
+    epsilon = 1e-10
+    y_pred = np.clip(y_pred, epsilon, None)
+
+    # calculate KL divergence only where y_true is not zero
+    kl_loss = np.where(y_true != 0, y_true * np.log(y_true / y_pred), 0.0)
+
    return np.sum(kl_loss)


--- a/machine_learning/ridge_regression/ADRvsRating.csv
+++ b/machine_learning/ridge_regression/ADRvsRating.csv
--- a/machine_learning/ridge_regression/init.py
+++ b/machine_learning/ridge_regression/init.py
--- a/machine_learning/ridge_regression/ridge_regression.py
+++ b/machine_learning/ridge_regression/ridge_regression.py
@ -0,0 +1,82 @@
+import numpy as np
+import pandas as pd
+
+
+class RidgeRegression:
+    """
+    Linear model with an L2 penalty, trained by batch gradient descent on
+    standardized features.
+
+    No intercept is learned: every column is standardized before fitting, so a
+    constant (bias) column becomes all zeros and its weight stays at 0.
+    """
+
+    def __init__(
+        self,
+        alpha: float = 0.001,
+        regularization_param: float = 0.1,
+        num_iterations: int = 1000,
+    ) -> None:
+        """
+        :param alpha: learning rate used for each gradient step
+        :param regularization_param: weight of the L2 penalty
+        :param num_iterations: number of gradient steps performed by ``fit``
+        """
+        self.alpha: float = alpha
+        self.regularization_param: float = regularization_param
+        self.num_iterations: int = num_iterations
+        self.theta: np.ndarray | None = None  # weights, set by ``fit``
+        # per-column statistics of the training features, set by ``fit``
+        self._feature_mean: np.ndarray | None = None
+        self._feature_std: np.ndarray | None = None
+
+    def feature_scaling(
+        self, features: np.ndarray
+    ) -> tuple[np.ndarray, np.ndarray, np.ndarray]:
+        """
+        Standardize each column of ``features`` using its own mean and std.
+
+        :param features: 2-D array with one row per sample
+        :return: ``(scaled_features, column_means, column_stds)``; a std of 0
+            is reported as 1 so constant columns scale to zeros, not NaN
+        """
+        mean = np.mean(features, axis=0)
+        std = np.std(features, axis=0)
+
+        # avoid division by zero for constant features (std = 0)
+        std[std == 0] = 1
+
+        features_scaled = (features - mean) / std
+        return features_scaled, mean, std
+
+    def _scale(self, features: np.ndarray) -> np.ndarray:
+        """
+        Standardize ``features`` with the statistics captured by ``fit``.
+
+        Falls back to the statistics of ``features`` itself when the model has
+        not been fitted (e.g. ``theta`` was assigned by hand).
+        """
+        if self._feature_mean is None or self._feature_std is None:
+            return self.feature_scaling(features)[0]
+        return (features - self._feature_mean) / self._feature_std
+
+    def fit(self, features: np.ndarray, target: np.ndarray) -> None:
+        """
+        Learn ``theta`` by gradient descent on the ridge cost.
+
+        :param features: 2-D array of shape ``(m, n)``
+        :param target: array of ``m`` target values
+        """
+        # keep the training statistics so later inputs are scaled identically
+        features_scaled, self._feature_mean, self._feature_std = (
+            self.feature_scaling(features)
+        )
+        m, n = features_scaled.shape
+        self.theta = np.zeros(n)  # initializing weights to zeros
+
+        for _ in range(self.num_iterations):
+            error = features_scaled.dot(self.theta) - target
+
+            # gradient of the squared error plus the L2 penalty term
+            gradient = (
+                features_scaled.T.dot(error) + self.regularization_param * self.theta
+            ) / m
+            self.theta -= self.alpha * gradient
+
+    def predict(self, features: np.ndarray) -> np.ndarray:
+        """
+        Return predictions for ``features`` after scaling them like the
+        training data.
+        """
+        return self._scale(features).dot(self.theta)
+
+    def compute_cost(self, features: np.ndarray, target: np.ndarray) -> float:
+        """
+        Return the ridge cost: halved mean squared error plus the L2 penalty,
+        both divided by the number of samples.
+        """
+        m = len(target)
+
+        predictions = self._scale(features).dot(self.theta)
+        cost = (1 / (2 * m)) * np.sum((predictions - target) ** 2) + (
+            self.regularization_param / (2 * m)
+        ) * np.sum(self.theta**2)
+        return cost
+
+    def mean_absolute_error(self, target: np.ndarray, predictions: np.ndarray) -> float:
+        """Return the mean of ``|target - predictions|``."""
+        return np.mean(np.abs(target - predictions))
+
+
+# Example usage
+if __name__ == "__main__":
+    from pathlib import Path
+
+    # read the CSV that sits next to this module, whatever the working directory
+    data = pd.read_csv(Path(__file__).with_name("ADRvsRating.csv"))
+    features_matrix = data[["Rating"]].to_numpy()
+    target = data["ADR"].to_numpy()
+    # standardize the target to zero mean and unit variance
+    target = (target - np.mean(target)) / np.std(target)
+
+    # no bias column is added: feature_scaling would standardize a constant
+    # column to all zeros, so it could not contribute to the fit
+
+    # initialize and train the ridge regression model
+    model = RidgeRegression(alpha=0.01, regularization_param=0.1, num_iterations=1000)
+    model.fit(features_matrix, target)
+
+    # predictions
+    predictions = model.predict(features_matrix)
+
+    # results
+    print("Optimized Weights:", model.theta)
+    print("Cost:", model.compute_cost(features_matrix, target))
+    print("Mean Absolute Error:", model.mean_absolute_error(target, predictions))
--- a/machine_learning/ridge_regression/test_ridge_regression.py
+++ b/machine_learning/ridge_regression/test_ridge_regression.py
@ -0,0 +1,100 @@
+"""
+Doctest for RidgeRegression class
+
+Tests include:
+- feature_scaling
+- fit
+- predict
+- mean_absolute_error
+
+To run these tests, use the following command:
+    python -m doctest test_ridge_regression.py -v
+"""
+
+import numpy as np  # noqa: F401
+
+from machine_learning.ridge_regression.ridge_regression import (
+    RidgeRegression,  # noqa: F401
+)
+
+
+def test_feature_scaling() -> None:
+    """
+    Doctest for RidgeRegression.feature_scaling: each column is standardized
+    with its own mean and standard deviation, which are returned as well.
+
+    >>> model = RidgeRegression()
+    >>> features = np.array([[1, 2], [2, 3], [3, 4]])
+    >>> features_scaled, mean, std = model.feature_scaling(features)
+    >>> np.round(features_scaled, 2)
+    array([[-1.22, -1.22],
+           [ 0.  ,  0.  ],
+           [ 1.22,  1.22]])
+    >>> np.round(mean, 2)
+    array([2., 3.])
+    >>> np.round(std, 2)
+    array([0.82, 0.82])
+    """
+
+
+def test_fit() -> None:
+    """
+    Doctest for RidgeRegression.fit.
+
+    The leading column of ones is constant, so the expected weight for it is 0
+    and only the second weight is learned.
+
+    >>> model = RidgeRegression(alpha=0.01,
+    ...                          regularization_param=0.1,
+    ...                          num_iterations=1000)
+    >>> features = np.array([[1], [2], [3]])
+    >>> target = np.array([2, 3, 4])
+
+    # Adding a bias term
+    >>> features = np.c_[np.ones(features.shape[0]), features]
+
+    # Fit the model
+    >>> model.fit(features, target)
+
+    # Check if the weights have been updated
+    >>> np.round(model.theta, decimals=2)
+    array([0.  , 0.79])
+    """
+
+
+def test_predict() -> None:
+    """
+    Doctest for RidgeRegression.predict: fits a model on three samples and
+    checks the predictions returned for those same samples.
+
+    >>> model = RidgeRegression(alpha=0.01,
+    ...                          regularization_param=0.1,
+    ...                          num_iterations=1000)
+    >>> features = np.array([[1], [2], [3]])
+    >>> target = np.array([2, 3, 4])
+
+    # Adding a bias term
+    >>> features = np.c_[np.ones(features.shape[0]), features]
+
+    # Fit the model
+    >>> model.fit(features, target)
+
+    # Predict with the model
+    >>> predictions = model.predict(features)
+    >>> np.round(predictions, decimals=2)
+    array([-0.97,  0.  ,  0.97])
+    """
+
+
+def test_mean_absolute_error() -> None:
+    """
+    Doctest for RidgeRegression.mean_absolute_error: the mean of the absolute
+    differences between targets and predictions.
+
+    >>> model = RidgeRegression()
+    >>> target = np.array([2, 3, 4])
+    >>> predictions = np.array([2.1, 3.0, 3.9])
+    >>> mae = model.mean_absolute_error(target, predictions)
+    >>> float(np.round(mae, 2))
+    0.07
+    """
+
+
+if __name__ == "__main__":
+    # run the examples embedded in this module's docstrings
+    from doctest import testmod
+
+    testmod()