2025-02-24 18:08:39 +00:00
5 changed files with 76 additions and 391 deletions
--- a/DIRECTORY.md
+++ b/DIRECTORY.md
@ -712,7 +712,6 @@
  * [Gauss Easter](other/gauss_easter.py)
  * [Graham Scan](other/graham_scan.py)
  * [Greedy](other/greedy.py)
  * [H Index](other/h_index.py)
  * [Least Recently Used](other/least_recently_used.py)
  * [Lfu Cache](other/lfu_cache.py)
  * [Linear Congruential Generator](other/linear_congruential_generator.py)
--- a/machine_learning/local_weighted_learning/local_weighted_learning.py
+++ b/machine_learning/local_weighted_learning/local_weighted_learning.py
@ -1,55 +1,14 @@
 """
 Locally weighted linear regression, also called local regression, is a type of
 non-parametric linear regression that prioritizes data closest to a given
 prediction point. The algorithm estimates the vector of model coefficients β
 using weighted least squares regression:
 β = (XᵀWX)⁻¹(XᵀWy),
 where X is the design matrix, y is the response vector, and W is the diagonal
 weight matrix.
 This implementation calculates wᵢ, the weight of the ith training sample, using
 the Gaussian weight:
 wᵢ = exp(-‖xᵢ - x‖²/(2τ²)),
 where xᵢ is the ith training sample, x is the prediction point, τ is the
 "bandwidth", and ‖x‖ is the Euclidean norm (also called the 2-norm or the L²
 norm). The bandwidth τ controls how quickly the weight of a training sample
 decreases as its distance from the prediction point increases. One can think of
 the Gaussian weight as a bell curve centered around the prediction point: a
 training sample is weighted lower if it's farther from the center, and τ
 controls the spread of the bell curve.
 Other types of locally weighted regression such as locally estimated scatterplot
 smoothing (LOESS) typically use different weight functions.
 References:
    - https://en.wikipedia.org/wiki/Local_regression
    - https://en.wikipedia.org/wiki/Weighted_least_squares
    - https://cs229.stanford.edu/notes2022fall/main_notes.pdf
 """
 import matplotlib.pyplot as plt
 import numpy as np
-def weight_matrix(point: np.ndarray, x_train: np.ndarray, tau: float) -> np.ndarray:
+def weighted_matrix(
    point: np.array, training_data_x: np.array, bandwidth: float
 ) -> np.array:
    """
-    Calculate the weight of every point in the training data around a given
+    Calculate the weight for every point in the data set.
-    prediction point
+    point --> the x value at which we want to make predictions
-
+    >>> weighted_matrix(
    Args:
        point: x-value at which the prediction is being made
        x_train: ndarray of x-values for training
        tau: bandwidth value, controls how quickly the weight of training values
            decreases as the distance from the prediction point increases
    Returns:
        m x m weight matrix around the prediction point, where m is the size of
        the training set
    >>> weight_matrix(
    ...     np.array([1., 1.]),
    ...     np.array([[16.99, 10.34], [21.01,23.68], [24.59,25.69]]),
    ...     0.6
@ -58,30 +17,25 @@ def weight_matrix(point: np.ndarray, x_train: np.ndarray, tau: float) -> np.ndar
           [0.00000000e+000, 0.00000000e+000, 0.00000000e+000],
           [0.00000000e+000, 0.00000000e+000, 0.00000000e+000]])
    """
-    m = len(x_train)  # Number of training samples
+    m, _ = np.shape(training_data_x)  # m is the number of training samples
-    weights = np.eye(m)  # Initialize weights as identity matrix
+    weights = np.eye(m)  # Initializing weights as identity matrix
    for j in range(m):
        diff = point - x_train[j]
        weights[j, j] = np.exp(diff @ diff.T / (-2.0 * tau**2))
    # calculating weights for all training examples [x(i)'s]
    for j in range(m):
        diff = point - training_data_x[j]
        weights[j, j] = np.exp(diff @ diff.T / (-2.0 * bandwidth**2))
    return weights
 def local_weight(
-    point: np.ndarray, x_train: np.ndarray, y_train: np.ndarray, tau: float
+    point: np.array,
-) -> np.ndarray:
+    training_data_x: np.array,
    training_data_y: np.array,
    bandwidth: float,
 ) -> np.array:
    """
-    Calculate the local weights at a given prediction point using the weight
+    Calculate the local weights using the weight_matrix function on training data.
-    matrix for that point
+    Return the weighted matrix.
    Args:
        point: x-value at which the prediction is being made
        x_train: ndarray of x-values for training
        y_train: ndarray of y-values for training
        tau: bandwidth value, controls how quickly the weight of training values
            decreases as the distance from the prediction point increases
    Returns:
        ndarray of local weights
    >>> local_weight(
    ...     np.array([1., 1.]),
    ...     np.array([[16.99, 10.34], [21.01,23.68], [24.59,25.69]]),
@ -91,28 +45,19 @@ def local_weight(
    array([[0.00873174],
           [0.08272556]])
    """
-    weight_mat = weight_matrix(point, x_train, tau)
+    weight = weighted_matrix(point, training_data_x, bandwidth)
-    weight = np.linalg.inv(x_train.T @ weight_mat @ x_train) @ (
+    w = np.linalg.inv(training_data_x.T @ (weight @ training_data_x)) @ (
-        x_train.T @ weight_mat @ y_train.T
+        training_data_x.T @ weight @ training_data_y.T
    )
-    return weight
+    return w
 def local_weight_regression(
-    x_train: np.ndarray, y_train: np.ndarray, tau: float
+    training_data_x: np.array, training_data_y: np.array, bandwidth: float
-) -> np.ndarray:
+) -> np.array:
    """
-    Calculate predictions for each point in the training data
+    Calculate predictions for each data point on axis
    Args:
        x_train: ndarray of x-values for training
        y_train: ndarray of y-values for training
        tau: bandwidth value, controls how quickly the weight of training values
            decreases as the distance from the prediction point increases
    Returns:
        ndarray of predictions
    >>> local_weight_regression(
    ...     np.array([[16.99, 10.34], [21.01, 23.68], [24.59, 25.69]]),
    ...     np.array([[1.01, 1.66, 3.5]]),
@ -120,57 +65,77 @@ def local_weight_regression(
    ... )
    array([1.07173261, 1.65970737, 3.50160179])
    """
-    y_pred = np.zeros(len(x_train))  # Initialize array of predictions
+    m, _ = np.shape(training_data_x)
-    for i, item in enumerate(x_train):
+    ypred = np.zeros(m)
        y_pred[i] = item @ local_weight(item, x_train, y_train, tau)
-    return y_pred
+    for i, item in enumerate(training_data_x):
        ypred[i] = item @ local_weight(
            item, training_data_x, training_data_y, bandwidth
        )
    return ypred
 def load_data(
-    dataset_name: str, x_name: str, y_name: str
+    dataset_name: str, cola_name: str, colb_name: str
-) -> tuple[np.ndarray, np.ndarray, np.ndarray]:
+) -> tuple[np.array, np.array, np.array, np.array]:
    """
    Load data from seaborn and split it into x and y points
    >>> pass    # No doctests, function is for demo purposes only
    """
    import seaborn as sns
    data = sns.load_dataset(dataset_name)
-    x_data = np.array(data[x_name])
+    col_a = np.array(data[cola_name])  # total_bill
-    y_data = np.array(data[y_name])
+    col_b = np.array(data[colb_name])  # tip
-    one = np.ones(len(y_data))
+    mcol_a = col_a.copy()
    mcol_b = col_b.copy()
-    # pairing elements of one and x_data
+    one = np.ones(np.shape(mcol_b)[0], dtype=int)
    x_train = np.column_stack((one, x_data))
-    return x_train, x_data, y_data
+    # pairing elements of one and mcol_a
    training_data_x = np.column_stack((one, mcol_a))
    return training_data_x, mcol_b, col_a, col_b
 def get_preds(
    training_data_x: np.ndarray, mcol_b: np.ndarray, tau: float
 ) -> np.ndarray:
    """
    Return the locally weighted regression prediction for every training row.

    This is a thin convenience wrapper: it forwards its arguments unchanged to
    ``local_weight_regression`` and returns that function's result.

    Args:
        training_data_x: training inputs, one row per sample
        mcol_b: training targets matching the rows of ``training_data_x``
        tau: bandwidth value handed to ``local_weight_regression``

    Returns:
        ndarray with one prediction per row of ``training_data_x``

    >>> get_preds(
    ...     np.array([[16.99, 10.34], [21.01, 23.68], [24.59, 25.69]]),
    ...     np.array([[1.01, 1.66, 3.5]]),
    ...     0.6
    ... )
    array([1.07173261, 1.65970737, 3.50160179])
    """
    return local_weight_regression(training_data_x, mcol_b, tau)
 def plot_preds(
-    x_train: np.ndarray,
+    training_data_x: np.array,
-    preds: np.ndarray,
+    predictions: np.array,
-    x_data: np.ndarray,
+    col_x: np.array,
-    y_data: np.ndarray,
+    col_y: np.array,
-    x_name: str,
+    cola_name: str,
-    y_name: str,
+    colb_name: str,
-) -> None:
+) -> plt.plot:
    """
    Plot predictions and display the graph
    >>> pass    # No doctests, function is for demo purposes only
    """
-    x_train_sorted = np.sort(x_train, axis=0)
+    xsort = training_data_x.copy()
-    plt.scatter(x_data, y_data, color="blue")
+    xsort.sort(axis=0)
    plt.scatter(col_x, col_y, color="blue")
    plt.plot(
-        x_train_sorted[:, 1],
+        xsort[:, 1],
-        preds[x_train[:, 1].argsort(0)],
+        predictions[training_data_x[:, 1].argsort(0)],
        color="yellow",
        linewidth=5,
    )
    plt.title("Local Weighted Regression")
-    plt.xlabel(x_name)
+    plt.xlabel(cola_name)
-    plt.ylabel(y_name)
+    plt.ylabel(colb_name)
    plt.show()
@ -179,7 +144,6 @@ if __name__ == "__main__":
    doctest.testmod()
-    # Demo with a dataset from the seaborn module
+    training_data_x, mcol_b, col_a, col_b = load_data("tips", "total_bill", "tip")
-    training_data_x, total_bill, tip = load_data("tips", "total_bill", "tip")
+    predictions = get_preds(training_data_x, mcol_b, 0.5)
-    predictions = local_weight_regression(training_data_x, tip, 5)
+    plot_preds(training_data_x, predictions, col_a, col_b, "total_bill", "tip")
    plot_preds(training_data_x, predictions, total_bill, tip, "total_bill", "tip")
--- a/maths/odd_sieve.py
+++ b/maths/odd_sieve.py
@ -1,42 +0,0 @@
 from itertools import compress, repeat
 from math import ceil, sqrt
 def odd_sieve(num: int) -> list[int]:
    """
    List every prime strictly below `num`, in ascending order.

    Only odd candidates are stored: the flag at index ``(k >> 1) - 1`` tracks
    the odd number ``k`` for ``3 <= k < num``. The marking itself follows the
    Sieve of Eratosthenes
    (see for reference https://en.wikipedia.org/wiki/Sieve_of_Eratosthenes).

    >>> odd_sieve(2)
    []
    >>> odd_sieve(3)
    [2]
    >>> odd_sieve(10)
    [2, 3, 5, 7]
    >>> odd_sieve(20)
    [2, 3, 5, 7, 11, 13, 17, 19]
    """
    if num <= 2:
        return []
    if num == 3:
        return [2]

    # One flag per odd number in [3, num - 1]; 1 means "still assumed prime".
    is_prime = bytearray(b"\x01") * ((num >> 1) - 1)
    root = int(sqrt(num))
    for candidate in range(3, root + 1, 2):
        if not is_prime[(candidate >> 1) - 1]:
            continue
        square = candidate**2
        # Odd multiples of `candidate` are 2 * candidate apart, which is
        # `candidate` slots apart in the odd-only flag array.
        multiples = ceil((num - square) / (candidate << 1))
        is_prime[(square >> 1) - 1 :: candidate] = repeat(0, multiples)

    odd_primes = compress(range(3, num, 2), is_prime)
    return [2, *odd_primes]
 # When executed as a script, run the doctests embedded in this module.
 if __name__ == "__main__":
    import doctest
    doctest.testmod()
--- a/other/guess_the_number_search.py
+++ b/other/guess_the_number_search.py
@ -1,165 +0,0 @@
 """
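 Guess a number using a lower bound, a higher bound and the value to find.
 The solution works by repeatedly taking the midpoint of the current lower and
 higher bounds and narrowing the range until the midpoint equals the value.
 For example, with lower = 10, higher = 1000 and 17 as the number to guess: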
 >>> guess_the_number(10, 1000, 17)
 started...
 guess the number : 17
 details : [505, 257, 133, 71, 40, 25, 17]
 """
 def temp_input_value(
    min_val: int = 10, max_val: int = 1000, option: bool = True
 ) -> int:
    """
    Return one of two bounds; used as a stand-in for input values in tests.

    `min_val` is returned when `option` is True, otherwise `max_val`.
    Arguments of the wrong type fail an assertion, and `min_val` greater than
    `max_val` raises ValueError.

    >>> temp_input_value(option=True)
    10
    >>> temp_input_value(option=False)
    1000
    >>> temp_input_value(min_val=100, option=True)
    100
    >>> temp_input_value(min_val=100, max_val=50)
    Traceback (most recent call last):
        ...
    ValueError: Invalid value for min_val or max_val (min_value < max_value)
    >>> temp_input_value("ten","fifty",1)
    Traceback (most recent call last):
        ...
    AssertionError: Invalid type of value(s) specified to function!
    >>> temp_input_value(min_val=-100, max_val=500)
    -100
    >>> temp_input_value(min_val=-5100, max_val=-100)
    -5100
    """
    types_ok = (
        isinstance(min_val, int)
        and isinstance(max_val, int)
        and isinstance(option, bool)
    )
    assert types_ok, "Invalid type of value(s) specified to function!"

    if min_val > max_val:
        raise ValueError("Invalid value for min_val or max_val (min_value < max_value)")

    if option:
        return min_val
    return max_val
 def get_avg(number_1: int, number_2: int) -> int:
    """
    Return the whole-number midpoint of two numbers, truncated toward zero.

    Integer inputs are averaged with exact integer arithmetic, so the result
    stays correct for values too large to be represented exactly as a float.
    Any other numeric input (e.g. floats) is averaged with true division and
    then truncated with ``int``.

    Args:
        number_1: first number
        number_2: second number

    Returns:
        the midpoint of the two numbers as an int

    Raises:
        TypeError: if the two arguments cannot be added or halved

    >>> get_avg(10, 15)
    12
    >>> get_avg(20, 300)
    160
    >>> get_avg(-5, 0)
    -2
    >>> get_avg(2**60, 2**60 + 2) == 2**60 + 1
    True
    >>> get_avg("abcd", 300)
    Traceback (most recent call last):
        ...
    TypeError: can only concatenate str (not "int") to str
    >>> get_avg(10.5,50.25)
    30
    """
    total = number_1 + number_2
    if isinstance(total, int):
        # Halve the magnitude and restore the sign: this truncates toward zero
        # exactly like int(total / 2) but never goes through a float.
        half = abs(total) // 2
        return half if total >= 0 else -half
    return int(total / 2)
 def guess_the_number(lower: int, higher: int, to_guess: int) -> None:
    """
    Find `to_guess` by repeatedly halving the range between `lower` and `higher`.

    Each round takes the midpoint of the current range (via `get_avg`), records
    it, and moves the lower or higher bound to that midpoint depending on
    whether the midpoint is below or above `to_guess`. The search stops when
    the midpoint equals `to_guess`; the final guess and the list of all
    midpoints tried are then printed.

    Args:
        lower: lower bound of the search range (exclusive)
        higher: higher bound of the search range (exclusive)
        to_guess: the number to find, strictly between `lower` and `higher`

    Raises:
        AssertionError: if any argument is not an int
        ValueError: if `lower` is greater than `higher`, or `to_guess` is not
            strictly between the two bounds

    >>> guess_the_number(10, 1000, 17)
    started...
    guess the number : 17
    details : [505, 257, 133, 71, 40, 25, 17]
    >>> guess_the_number(-10000, 10000, 7)
    started...
    guess the number : 7
    details : [0, 5000, 2500, 1250, 625, 312, 156, 78, 39, 19, 9, 4, 6, 7]
    >>> guess_the_number(10, 1000, "a")
    Traceback (most recent call last):
        ...
    AssertionError: argument values must be type of "int"
    >>> guess_the_number(10, 1000, 5)
    Traceback (most recent call last):
        ...
    ValueError: guess value must be within the range of lower and higher value
    >>> guess_the_number(10000, 100, 5)
    Traceback (most recent call last):
        ...
    ValueError: argument value for lower and higher must be(lower < higher)
    """
    assert (
        isinstance(lower, int) and isinstance(higher, int) and isinstance(to_guess, int)
    ), 'argument values must be type of "int"'

    if lower > higher:
        # The message states the requirement the caller violated.
        raise ValueError("argument value for lower and higher must be(lower < higher)")

    if not lower < to_guess < higher:
        raise ValueError(
            "guess value must be within the range of lower and higher value"
        )

    print("started...")

    last_lowest = lower
    last_highest = higher
    last_numbers: list[int] = []

    while True:
        number = get_avg(last_lowest, last_highest)
        last_numbers.append(number)

        if number < to_guess:
            last_lowest = number
        elif number > to_guess:
            last_highest = number
        else:
            break

    print(f"guess the number : {last_numbers[-1]}")
    print(f"details : {last_numbers}")
 def main() -> None:
    """
    Read the lower bound, higher bound and target from standard input, in that
    order, and run `guess_the_number` on them.
    """
    prompts = (
        "Enter lower value : ",
        "Enter high value : ",
        "Enter value to guess : ",
    )
    # The generator is consumed lazily, so each value is read and converted
    # before the next prompt is shown.
    lower, higher, guess = (int(input(prompt).strip()) for prompt in prompts)
    guess_the_number(lower, higher, guess)
 # Script entry point: read the bounds and the target, then run the search.
 if __name__ == "__main__":
    main()
--- a/other/h_index.py
+++ b/other/h_index.py
@ -1,71 +0,0 @@
 """
 Task:
 Given an array of integers citations where citations[i] is the number of
 citations a researcher received for their ith paper, return the
 researcher's h-index.
 According to the definition of h-index on Wikipedia: A scientist has an
 index h if h of their n papers have at least h citations each, and the other
 n - h papers have no more than h citations each.
 If there are several possible values for h, the maximum one is taken as the
 h-index.
 H-Index link: https://en.wikipedia.org/wiki/H-index
 Implementation notes:
 Use sorting of array
 Leetcode link: https://leetcode.com/problems/h-index/description/
 n = len(citations)
 Runtime Complexity: O(n * log(n))
 Space Complexity: O(n) (sorting may need auxiliary storage)
 """
 def h_index(citations: list[int]) -> int:
    """
    Return the H-index of `citations`.

    The counts are ranked from most to least cited; the H-index is the first
    zero-based rank whose citation count does not exceed that rank, or the
    number of papers if every count exceeds its rank. The ranking is done on a
    sorted copy, so the caller's list is left untouched (at the cost of O(n)
    extra space).

    Args:
        citations: list of non-negative citation counts, one per paper

    Returns:
        the H-index as an int (0 for an empty list)

    Raises:
        ValueError: if `citations` is not a list of non-negative integers

    >>> h_index([3, 0, 6, 1, 5])
    3
    >>> h_index([1, 3, 1])
    1
    >>> h_index([1, 2, 3])
    2
    >>> h_index([])
    0
    >>> papers = [3, 0, 6, 1, 5]
    >>> h_index(papers)
    3
    >>> papers
    [3, 0, 6, 1, 5]
    >>> h_index('test')
    Traceback (most recent call last):
        ...
    ValueError: The citations should be a list of non negative integers.
    >>> h_index([1,2,'3'])
    Traceback (most recent call last):
        ...
    ValueError: The citations should be a list of non negative integers.
    >>> h_index([1,2,-3])
    Traceback (most recent call last):
        ...
    ValueError: The citations should be a list of non negative integers.
    """
    # validate:
    if not isinstance(citations, list) or not all(
        isinstance(item, int) and item >= 0 for item in citations
    ):
        raise ValueError("The citations should be a list of non negative integers.")

    # Rank a copy so the input list is not reordered as a side effect.
    ranked = sorted(citations, reverse=True)
    for rank, count in enumerate(ranked):
        if count <= rank:
            return rank
    return len(ranked)
 # When executed as a script, run the doctests embedded in this module.
 if __name__ == "__main__":
    import doctest
    doctest.testmod()