Compare commits


No commits in common. "a2783c6597a154a87f60bb5878770d2f152a1d09" and "c0892a06515b8ea5030db2e8344dee2292bb10ad" have entirely different histories.

5 changed files with 76 additions and 391 deletions

View File

@@ -712,7 +712,6 @@
* [Gauss Easter](other/gauss_easter.py)
* [Graham Scan](other/graham_scan.py)
* [Greedy](other/greedy.py)
* [H Index](other/h_index.py)
* [Least Recently Used](other/least_recently_used.py)
* [Lfu Cache](other/lfu_cache.py)
* [Linear Congruential Generator](other/linear_congruential_generator.py)

View File

@@ -1,55 +1,14 @@
"""
Locally weighted linear regression, also called local regression, is a type of
non-parametric linear regression that prioritizes data closest to a given
prediction point. The algorithm estimates the vector of model coefficients β
using weighted least squares regression:
β = (XᵀWX)⁻¹(XᵀWy),
where X is the design matrix, y is the response vector, and W is the diagonal
weight matrix.
This implementation calculates wᵢ, the weight of the ith training sample, using
the Gaussian weight:
wᵢ = exp(-‖xᵢ - x‖²/(2τ²)),
where xᵢ is the ith training sample, x is the prediction point, τ is the
"bandwidth", and x is the Euclidean norm (also called the 2-norm or the
norm). The bandwidth τ controls how quickly the weight of a training sample
decreases as its distance from the prediction point increases. One can think of
the Gaussian weight as a bell curve centered around the prediction point: a
training sample is weighted lower if it's farther from the center, and τ
controls the spread of the bell curve.
Other types of locally weighted regression such as locally estimated scatterplot
smoothing (LOESS) typically use different weight functions.
References:
- https://en.wikipedia.org/wiki/Local_regression
- https://en.wikipedia.org/wiki/Weighted_least_squares
- https://cs229.stanford.edu/notes2022fall/main_notes.pdf
"""
import matplotlib.pyplot as plt
import numpy as np
def weight_matrix(point: np.ndarray, x_train: np.ndarray, tau: float) -> np.ndarray:
def weighted_matrix(
point: np.array, training_data_x: np.array, bandwidth: float
) -> np.array:
"""
Calculate the weight of every point in the training data around a given
prediction point
Args:
point: x-value at which the prediction is being made
x_train: ndarray of x-values for training
tau: bandwidth value, controls how quickly the weight of training values
decreases as the distance from the prediction point increases
Returns:
m x m weight matrix around the prediction point, where m is the size of
the training set
>>> weight_matrix(
Calculate the weight for every point in the data set.
point --> the x value at which we want to make predictions
>>> weighted_matrix(
... np.array([1., 1.]),
... np.array([[16.99, 10.34], [21.01,23.68], [24.59,25.69]]),
... 0.6
@@ -58,30 +17,25 @@ def weight_matrix(point: np.ndarray, x_train: np.ndarray, tau: float) -> np.ndar
[0.00000000e+000, 0.00000000e+000, 0.00000000e+000],
[0.00000000e+000, 0.00000000e+000, 0.00000000e+000]])
"""
m = len(x_train) # Number of training samples
weights = np.eye(m) # Initialize weights as identity matrix
for j in range(m):
diff = point - x_train[j]
weights[j, j] = np.exp(diff @ diff.T / (-2.0 * tau**2))
m, _ = np.shape(training_data_x) # m is the number of training samples
weights = np.eye(m) # Initializing weights as identity matrix
# calculating weights for all training examples [x(i)'s]
for j in range(m):
diff = point - training_data_x[j]
weights[j, j] = np.exp(diff @ diff.T / (-2.0 * bandwidth**2))
return weights
def local_weight(
point: np.ndarray, x_train: np.ndarray, y_train: np.ndarray, tau: float
) -> np.ndarray:
point: np.array,
training_data_x: np.array,
training_data_y: np.array,
bandwidth: float,
) -> np.array:
"""
Calculate the local weights at a given prediction point using the weight
matrix for that point
Args:
point: x-value at which the prediction is being made
x_train: ndarray of x-values for training
y_train: ndarray of y-values for training
tau: bandwidth value, controls how quickly the weight of training values
decreases as the distance from the prediction point increases
Returns:
ndarray of local weights
Calculate the local weights using the weighted_matrix function on training data.
Return the local weight (coefficient) vector.
>>> local_weight(
... np.array([1., 1.]),
... np.array([[16.99, 10.34], [21.01,23.68], [24.59,25.69]]),
@@ -91,28 +45,19 @@ def local_weight(
array([[0.00873174],
[0.08272556]])
"""
weight_mat = weight_matrix(point, x_train, tau)
weight = np.linalg.inv(x_train.T @ weight_mat @ x_train) @ (
x_train.T @ weight_mat @ y_train.T
weight = weighted_matrix(point, training_data_x, bandwidth)
w = np.linalg.inv(training_data_x.T @ (weight @ training_data_x)) @ (
training_data_x.T @ weight @ training_data_y.T
)
return weight
return w
def local_weight_regression(
x_train: np.ndarray, y_train: np.ndarray, tau: float
) -> np.ndarray:
training_data_x: np.array, training_data_y: np.array, bandwidth: float
) -> np.array:
"""
Calculate predictions for each point in the training data
Args:
x_train: ndarray of x-values for training
y_train: ndarray of y-values for training
tau: bandwidth value, controls how quickly the weight of training values
decreases as the distance from the prediction point increases
Returns:
ndarray of predictions
Calculate predictions for each data point in the training data
>>> local_weight_regression(
... np.array([[16.99, 10.34], [21.01, 23.68], [24.59, 25.69]]),
... np.array([[1.01, 1.66, 3.5]]),
@@ -120,57 +65,77 @@ def local_weight_regression(
... )
array([1.07173261, 1.65970737, 3.50160179])
"""
y_pred = np.zeros(len(x_train)) # Initialize array of predictions
for i, item in enumerate(x_train):
y_pred[i] = item @ local_weight(item, x_train, y_train, tau)
m, _ = np.shape(training_data_x)
ypred = np.zeros(m)
return y_pred
for i, item in enumerate(training_data_x):
ypred[i] = item @ local_weight(
item, training_data_x, training_data_y, bandwidth
)
return ypred
def load_data(
dataset_name: str, x_name: str, y_name: str
) -> tuple[np.ndarray, np.ndarray, np.ndarray]:
dataset_name: str, cola_name: str, colb_name: str
) -> tuple[np.array, np.array, np.array, np.array]:
"""
Load data from seaborn and split it into x and y points
>>> pass # No doctests, function is for demo purposes only
"""
import seaborn as sns
data = sns.load_dataset(dataset_name)
x_data = np.array(data[x_name])
y_data = np.array(data[y_name])
col_a = np.array(data[cola_name]) # total_bill
col_b = np.array(data[colb_name]) # tip
one = np.ones(len(y_data))
mcol_a = col_a.copy()
mcol_b = col_b.copy()
# pairing elements of one and x_data
x_train = np.column_stack((one, x_data))
one = np.ones(np.shape(mcol_b)[0], dtype=int)
return x_train, x_data, y_data
# pairing elements of one and mcol_a
training_data_x = np.column_stack((one, mcol_a))
return training_data_x, mcol_b, col_a, col_b
def get_preds(training_data_x: np.array, mcol_b: np.array, tau: float) -> np.array:
"""
Get predictions with minimum error for each training data point
>>> get_preds(
... np.array([[16.99, 10.34], [21.01, 23.68], [24.59, 25.69]]),
... np.array([[1.01, 1.66, 3.5]]),
... 0.6
... )
array([1.07173261, 1.65970737, 3.50160179])
"""
ypred = local_weight_regression(training_data_x, mcol_b, tau)
return ypred
def plot_preds(
x_train: np.ndarray,
preds: np.ndarray,
x_data: np.ndarray,
y_data: np.ndarray,
x_name: str,
y_name: str,
) -> None:
training_data_x: np.array,
predictions: np.array,
col_x: np.array,
col_y: np.array,
cola_name: str,
colb_name: str,
) -> plt.plot:
"""
Plot predictions and display the graph
>>> pass # No doctests, function is for demo purposes only
"""
x_train_sorted = np.sort(x_train, axis=0)
plt.scatter(x_data, y_data, color="blue")
xsort = training_data_x.copy()
xsort.sort(axis=0)
plt.scatter(col_x, col_y, color="blue")
plt.plot(
x_train_sorted[:, 1],
preds[x_train[:, 1].argsort(0)],
xsort[:, 1],
predictions[training_data_x[:, 1].argsort(0)],
color="yellow",
linewidth=5,
)
plt.title("Local Weighted Regression")
plt.xlabel(x_name)
plt.ylabel(y_name)
plt.xlabel(cola_name)
plt.ylabel(colb_name)
plt.show()
@@ -179,7 +144,6 @@ if __name__ == "__main__":
doctest.testmod()
# Demo with a dataset from the seaborn module
training_data_x, total_bill, tip = load_data("tips", "total_bill", "tip")
predictions = local_weight_regression(training_data_x, tip, 5)
plot_preds(training_data_x, predictions, total_bill, tip, "total_bill", "tip")
training_data_x, mcol_b, col_a, col_b = load_data("tips", "total_bill", "tip")
predictions = get_preds(training_data_x, mcol_b, 0.5)
plot_preds(training_data_x, predictions, col_a, col_b, "total_bill", "tip")
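
Both sides of this diff implement the same closed form from the module docstring: β = (XᵀWX)⁻¹(XᵀWy) with Gaussian weights. As a minimal standalone sketch of that computation (lwr_predict is a hypothetical name, not in either version; it treats x as the full design matrix, e.g. with the bias column that load_data prepends):

import numpy as np

def lwr_predict(point: np.ndarray, x: np.ndarray, y: np.ndarray, tau: float) -> float:
    # Gaussian weights: w_i = exp(-||x_i - point||^2 / (2 * tau^2))
    diffs = x - point
    w = np.diag(np.exp(-np.sum(diffs**2, axis=1) / (2.0 * tau**2)))
    # Weighted normal equations: beta = (X^T W X)^(-1) (X^T W y)
    beta = np.linalg.inv(x.T @ w @ x) @ (x.T @ w @ y)
    return float(point @ beta)

Called once per training row on the three-sample doctest data with bandwidth 0.6, this should reproduce the predictions shown above, array([1.07173261, 1.65970737, 3.50160179]).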

View File

@@ -1,42 +0,0 @@
from itertools import compress, repeat
from math import ceil, sqrt
def odd_sieve(num: int) -> list[int]:
"""
Returns the prime numbers < `num`. The prime numbers are calculated using an
odd sieve implementation of the Sieve of Eratosthenes algorithm
(see for reference https://en.wikipedia.org/wiki/Sieve_of_Eratosthenes).
>>> odd_sieve(2)
[]
>>> odd_sieve(3)
[2]
>>> odd_sieve(10)
[2, 3, 5, 7]
>>> odd_sieve(20)
[2, 3, 5, 7, 11, 13, 17, 19]
"""
if num <= 2:
return []
if num == 3:
return [2]
# Odd sieve for numbers in range [3, num - 1]
sieve = bytearray(b"\x01") * ((num >> 1) - 1)
for i in range(3, int(sqrt(num)) + 1, 2):
if sieve[(i >> 1) - 1]:
i_squared = i**2
sieve[(i_squared >> 1) - 1 :: i] = repeat(
0, ceil((num - i_squared) / (i << 1))
)
return [2] + list(compress(range(3, num, 2), sieve))
if __name__ == "__main__":
import doctest
doctest.testmod()
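
The slice arithmetic above is the crux of the odd sieve: only odd numbers ≥ 3 are stored, so odd n lives at sieve index (n >> 1) - 1, and a slice step of i over the odds-only array strikes out the odd multiples of i, which are 2i apart as integers. A quick cross-check sketch against a naive full sieve (naive_sieve is a hypothetical helper, and the assert assumes odd_sieve above is in scope):

def naive_sieve(num: int) -> list[int]:
    # Plain Sieve of Eratosthenes over every integer < num, for comparison
    is_prime = [False, False] + [True] * max(num - 2, 0)
    for i in range(2, int(num**0.5) + 1):
        if is_prime[i]:
            # Strike out multiples of i starting at i * i
            is_prime[i * i :: i] = [False] * len(is_prime[i * i :: i])
    return [i for i in range(2, num) if is_prime[i]]

assert all(odd_sieve(n) == naive_sieve(n) for n in range(2, 500))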

View File

@@ -1,165 +0,0 @@
"""
Guess the number using lower and higher bounds and the value to find.
The solution works by repeatedly bisecting the interval between lower and higher.
For example, suppose lower is 10, higher is 1000, and the number to guess is 17:
>>> guess_the_number(10, 1000, 17)
started...
guess the number : 17
details : [505, 257, 133, 71, 40, 25, 17]
"""
def temp_input_value(
min_val: int = 10, max_val: int = 1000, option: bool = True
) -> int:
"""
Temporary input values for tests
>>> temp_input_value(option=True)
10
>>> temp_input_value(option=False)
1000
>>> temp_input_value(min_val=100, option=True)
100
>>> temp_input_value(min_val=100, max_val=50)
Traceback (most recent call last):
...
ValueError: Invalid value for min_val or max_val (min_val < max_val)
>>> temp_input_value("ten","fifty",1)
Traceback (most recent call last):
...
AssertionError: Invalid type of value(s) specified to function!
>>> temp_input_value(min_val=-100, max_val=500)
-100
>>> temp_input_value(min_val=-5100, max_val=-100)
-5100
"""
assert (
isinstance(min_val, int)
and isinstance(max_val, int)
and isinstance(option, bool)
), "Invalid type of value(s) specified to function!"
if min_val > max_val:
raise ValueError("Invalid value for min_val or max_val (min_value < max_value)")
return min_val if option else max_val
def get_avg(number_1: int, number_2: int) -> int:
"""
Return the whole-number midpoint of number_1 and number_2
>>> get_avg(10, 15)
12
>>> get_avg(20, 300)
160
>>> get_avg("abcd", 300)
Traceback (most recent call last):
...
TypeError: can only concatenate str (not "int") to str
>>> get_avg(10.5,50.25)
30
"""
return int((number_1 + number_2) / 2)
def guess_the_number(lower: int, higher: int, to_guess: int) -> None:
"""
Guess the number by repeated bisection, comparing each midpoint against
`to_guess` with the inner `answer` helper
>>> guess_the_number(10, 1000, 17)
started...
guess the number : 17
details : [505, 257, 133, 71, 40, 25, 17]
>>> guess_the_number(-10000, 10000, 7)
started...
guess the number : 7
details : [0, 5000, 2500, 1250, 625, 312, 156, 78, 39, 19, 9, 4, 6, 7]
>>> guess_the_number(10, 1000, "a")
Traceback (most recent call last):
...
AssertionError: argument values must be type of "int"
>>> guess_the_number(10, 1000, 5)
Traceback (most recent call last):
...
ValueError: guess value must be within the range of lower and higher value
>>> guess_the_number(10000, 100, 5)
Traceback (most recent call last):
...
ValueError: argument value for lower and higher must be (lower < higher)
"""
assert (
isinstance(lower, int) and isinstance(higher, int) and isinstance(to_guess, int)
), 'argument values must be type of "int"'
if lower > higher:
raise ValueError("argument value for lower and higher must be(lower > higher)")
if not lower < to_guess < higher:
raise ValueError(
"guess value must be within the range of lower and higher value"
)
def answer(number: int) -> str:
"""
Return "high", "low", or "same" by comparing `number` with `to_guess`
"""
if number > to_guess:
return "high"
elif number < to_guess:
return "low"
else:
return "same"
print("started...")
last_lowest = lower
last_highest = higher
last_numbers = []
while True:
number = get_avg(last_lowest, last_highest)
last_numbers.append(number)
if answer(number) == "low":
last_lowest = number
elif answer(number) == "high":
last_highest = number
else:
break
print(f"guess the number : {last_numbers[-1]}")
print(f"details : {str(last_numbers)}")
def main() -> None:
"""
Starting point of the script: read inputs and run the search
"""
lower = int(input("Enter lower value : ").strip())
higher = int(input("Enter high value : ").strip())
guess = int(input("Enter value to guess : ").strip())
guess_the_number(lower, higher, guess)
if __name__ == "__main__":
main()
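
The while loop in guess_the_number is ordinary bisection. A condensed, hedged equivalent (bisect_guess is a hypothetical name, not part of this diff; it uses floor division, which matches get_avg's int((number_1 + number_2) / 2) for the non-negative midpoints exercised in the doctests but rounds differently for negative midpoints):

def bisect_guess(lower: int, higher: int, to_guess: int) -> list[int]:
    # Collect every midpoint tried until one equals the target
    guesses = []
    while True:
        mid = (lower + higher) // 2
        guesses.append(mid)
        if mid < to_guess:
            lower = mid  # target lies in the upper half
        elif mid > to_guess:
            higher = mid  # target lies in the lower half
        else:
            return guesses

bisect_guess(10, 1000, 17) returns [505, 257, 133, 71, 40, 25, 17], matching the doctest above.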

View File

@@ -1,71 +0,0 @@
"""
Task:
Given an array of integers citations where citations[i] is the number of
citations a researcher received for their ith paper, compute and return the
researcher's h-index.
According to the definition of h-index on Wikipedia: A scientist has an
index h if h of their n papers have at least h citations each, and the other
n - h papers have no more than h citations each.
If there are several possible values for h, the maximum one is taken as the
h-index.
H-Index link: https://en.wikipedia.org/wiki/H-index
Implementation notes:
Sort the citations array, then scan it from the most-cited paper
Leetcode link: https://leetcode.com/problems/h-index/description/
n = len(citations)
Runtime Complexity: O(n * log(n))
Space Complexity: O(1)
"""
def h_index(citations: list[int]) -> int:
"""
Return H-index of citations
>>> h_index([3, 0, 6, 1, 5])
3
>>> h_index([1, 3, 1])
1
>>> h_index([1, 2, 3])
2
>>> h_index('test')
Traceback (most recent call last):
...
ValueError: The citations should be a list of non-negative integers.
>>> h_index([1,2,'3'])
Traceback (most recent call last):
...
ValueError: The citations should be a list of non-negative integers.
>>> h_index([1,2,-3])
Traceback (most recent call last):
...
ValueError: The citations should be a list of non-negative integers.
"""
# validate:
if not isinstance(citations, list) or not all(
isinstance(item, int) and item >= 0 for item in citations
):
raise ValueError("The citations should be a list of non negative integers.")
citations.sort()
len_citations = len(citations)
for i in range(len_citations):
if citations[len_citations - 1 - i] <= i:
return i
return len_citations
if __name__ == "__main__":
import doctest
doctest.testmod()
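
As a worked trace of the sorted scan (illustrative comments, not part of the original file): after citations.sort(), the loop checks whether the (i + 1)th most-cited paper has at most i citations, and the first i where that holds is the h-index.

# h_index([3, 0, 6, 1, 5]); after sort: citations = [0, 1, 3, 5, 6]
# i = 0: citations[4] = 6 > 0   -> top 1 paper has more than 0 citations
# i = 1: citations[3] = 5 > 1   -> top 2 papers each have more than 1
# i = 2: citations[2] = 3 > 2   -> top 3 papers each have more than 2
# i = 3: citations[1] = 1 <= 3  -> the 4th-ranked paper falls short, so h = 3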