Compare commits

..

No commits in common. "a2783c6597a154a87f60bb5878770d2f152a1d09" and "c0892a06515b8ea5030db2e8344dee2292bb10ad" have entirely different histories.

5 changed files with 76 additions and 391 deletions

View File

@ -712,7 +712,6 @@
* [Gauss Easter](other/gauss_easter.py) * [Gauss Easter](other/gauss_easter.py)
* [Graham Scan](other/graham_scan.py) * [Graham Scan](other/graham_scan.py)
* [Greedy](other/greedy.py) * [Greedy](other/greedy.py)
* [H Index](other/h_index.py)
* [Least Recently Used](other/least_recently_used.py) * [Least Recently Used](other/least_recently_used.py)
* [Lfu Cache](other/lfu_cache.py) * [Lfu Cache](other/lfu_cache.py)
* [Linear Congruential Generator](other/linear_congruential_generator.py) * [Linear Congruential Generator](other/linear_congruential_generator.py)

View File

@ -1,55 +1,14 @@
"""
Locally weighted linear regression, also called local regression, is a type of
non-parametric linear regression that prioritizes data closest to a given
prediction point. The algorithm estimates the vector of model coefficients β
using weighted least squares regression:
β = (XᵀWX)⁻¹(XᵀWy),
where X is the design matrix, y is the response vector, and W is the diagonal
weight matrix.
This implementation calculates wᵢ, the weight of the ith training sample, using
the Gaussian weight:
wᵢ = exp(-‖xᵢ - x‖²/(2τ²)),
where xᵢ is the ith training sample, x is the prediction point, τ is the
"bandwidth", and ‖x‖ is the Euclidean norm (also called the 2-norm or the L²
norm). The bandwidth τ controls how quickly the weight of a training sample
decreases as its distance from the prediction point increases. One can think of
the Gaussian weight as a bell curve centered around the prediction point: a
training sample is weighted lower if it's farther from the center, and τ
controls the spread of the bell curve.
Other types of locally weighted regression such as locally estimated scatterplot
smoothing (LOESS) typically use different weight functions.
References:
- https://en.wikipedia.org/wiki/Local_regression
- https://en.wikipedia.org/wiki/Weighted_least_squares
- https://cs229.stanford.edu/notes2022fall/main_notes.pdf
"""
import matplotlib.pyplot as plt import matplotlib.pyplot as plt
import numpy as np import numpy as np
def weight_matrix(point: np.ndarray, x_train: np.ndarray, tau: float) -> np.ndarray: def weighted_matrix(
point: np.array, training_data_x: np.array, bandwidth: float
) -> np.array:
""" """
Calculate the weight of every point in the training data around a given Calculate the weight for every point in the data set.
prediction point point --> the x value at which we want to make predictions
>>> weighted_matrix(
Args:
point: x-value at which the prediction is being made
x_train: ndarray of x-values for training
tau: bandwidth value, controls how quickly the weight of training values
decreases as the distance from the prediction point increases
Returns:
m x m weight matrix around the prediction point, where m is the size of
the training set
>>> weight_matrix(
... np.array([1., 1.]), ... np.array([1., 1.]),
... np.array([[16.99, 10.34], [21.01,23.68], [24.59,25.69]]), ... np.array([[16.99, 10.34], [21.01,23.68], [24.59,25.69]]),
... 0.6 ... 0.6
@ -58,30 +17,25 @@ def weight_matrix(point: np.ndarray, x_train: np.ndarray, tau: float) -> np.ndar
[0.00000000e+000, 0.00000000e+000, 0.00000000e+000], [0.00000000e+000, 0.00000000e+000, 0.00000000e+000],
[0.00000000e+000, 0.00000000e+000, 0.00000000e+000]]) [0.00000000e+000, 0.00000000e+000, 0.00000000e+000]])
""" """
m = len(x_train) # Number of training samples m, _ = np.shape(training_data_x) # m is the number of training samples
weights = np.eye(m) # Initialize weights as identity matrix weights = np.eye(m) # Initializing weights as identity matrix
for j in range(m):
diff = point - x_train[j]
weights[j, j] = np.exp(diff @ diff.T / (-2.0 * tau**2))
# calculating weights for all training examples [x(i)'s]
for j in range(m):
diff = point - training_data_x[j]
weights[j, j] = np.exp(diff @ diff.T / (-2.0 * bandwidth**2))
return weights return weights
def local_weight( def local_weight(
point: np.ndarray, x_train: np.ndarray, y_train: np.ndarray, tau: float point: np.array,
) -> np.ndarray: training_data_x: np.array,
training_data_y: np.array,
bandwidth: float,
) -> np.array:
""" """
Calculate the local weights at a given prediction point using the weight Calculate the local weights using the weight_matrix function on training data.
matrix for that point Return the weighted matrix.
Args:
point: x-value at which the prediction is being made
x_train: ndarray of x-values for training
y_train: ndarray of y-values for training
tau: bandwidth value, controls how quickly the weight of training values
decreases as the distance from the prediction point increases
Returns:
ndarray of local weights
>>> local_weight( >>> local_weight(
... np.array([1., 1.]), ... np.array([1., 1.]),
... np.array([[16.99, 10.34], [21.01,23.68], [24.59,25.69]]), ... np.array([[16.99, 10.34], [21.01,23.68], [24.59,25.69]]),
@ -91,28 +45,19 @@ def local_weight(
array([[0.00873174], array([[0.00873174],
[0.08272556]]) [0.08272556]])
""" """
weight_mat = weight_matrix(point, x_train, tau) weight = weighted_matrix(point, training_data_x, bandwidth)
weight = np.linalg.inv(x_train.T @ weight_mat @ x_train) @ ( w = np.linalg.inv(training_data_x.T @ (weight @ training_data_x)) @ (
x_train.T @ weight_mat @ y_train.T training_data_x.T @ weight @ training_data_y.T
) )
return weight return w
def local_weight_regression( def local_weight_regression(
x_train: np.ndarray, y_train: np.ndarray, tau: float training_data_x: np.array, training_data_y: np.array, bandwidth: float
) -> np.ndarray: ) -> np.array:
""" """
Calculate predictions for each point in the training data Calculate predictions for each data point on axis
Args:
x_train: ndarray of x-values for training
y_train: ndarray of y-values for training
tau: bandwidth value, controls how quickly the weight of training values
decreases as the distance from the prediction point increases
Returns:
ndarray of predictions
>>> local_weight_regression( >>> local_weight_regression(
... np.array([[16.99, 10.34], [21.01, 23.68], [24.59, 25.69]]), ... np.array([[16.99, 10.34], [21.01, 23.68], [24.59, 25.69]]),
... np.array([[1.01, 1.66, 3.5]]), ... np.array([[1.01, 1.66, 3.5]]),
@ -120,57 +65,77 @@ def local_weight_regression(
... ) ... )
array([1.07173261, 1.65970737, 3.50160179]) array([1.07173261, 1.65970737, 3.50160179])
""" """
y_pred = np.zeros(len(x_train)) # Initialize array of predictions m, _ = np.shape(training_data_x)
for i, item in enumerate(x_train): ypred = np.zeros(m)
y_pred[i] = item @ local_weight(item, x_train, y_train, tau)
return y_pred for i, item in enumerate(training_data_x):
ypred[i] = item @ local_weight(
item, training_data_x, training_data_y, bandwidth
)
return ypred
def load_data( def load_data(
dataset_name: str, x_name: str, y_name: str dataset_name: str, cola_name: str, colb_name: str
) -> tuple[np.ndarray, np.ndarray, np.ndarray]: ) -> tuple[np.array, np.array, np.array, np.array]:
""" """
Load data from seaborn and split it into x and y points Load data from seaborn and split it into x and y points
>>> pass # No doctests, function is for demo purposes only
""" """
import seaborn as sns import seaborn as sns
data = sns.load_dataset(dataset_name) data = sns.load_dataset(dataset_name)
x_data = np.array(data[x_name]) col_a = np.array(data[cola_name]) # total_bill
y_data = np.array(data[y_name]) col_b = np.array(data[colb_name]) # tip
one = np.ones(len(y_data)) mcol_a = col_a.copy()
mcol_b = col_b.copy()
# pairing elements of one and x_data one = np.ones(np.shape(mcol_b)[0], dtype=int)
x_train = np.column_stack((one, x_data))
return x_train, x_data, y_data # pairing elements of one and mcol_a
training_data_x = np.column_stack((one, mcol_a))
return training_data_x, mcol_b, col_a, col_b
def get_preds(training_data_x: np.array, mcol_b: np.array, tau: float) -> np.array:
"""
Get predictions with minimum error for each training data
>>> get_preds(
... np.array([[16.99, 10.34], [21.01, 23.68], [24.59, 25.69]]),
... np.array([[1.01, 1.66, 3.5]]),
... 0.6
... )
array([1.07173261, 1.65970737, 3.50160179])
"""
ypred = local_weight_regression(training_data_x, mcol_b, tau)
return ypred
def plot_preds( def plot_preds(
x_train: np.ndarray, training_data_x: np.array,
preds: np.ndarray, predictions: np.array,
x_data: np.ndarray, col_x: np.array,
y_data: np.ndarray, col_y: np.array,
x_name: str, cola_name: str,
y_name: str, colb_name: str,
) -> None: ) -> plt.plot:
""" """
Plot predictions and display the graph Plot predictions and display the graph
>>> pass # No doctests, function is for demo purposes only
""" """
x_train_sorted = np.sort(x_train, axis=0) xsort = training_data_x.copy()
plt.scatter(x_data, y_data, color="blue") xsort.sort(axis=0)
plt.scatter(col_x, col_y, color="blue")
plt.plot( plt.plot(
x_train_sorted[:, 1], xsort[:, 1],
preds[x_train[:, 1].argsort(0)], predictions[training_data_x[:, 1].argsort(0)],
color="yellow", color="yellow",
linewidth=5, linewidth=5,
) )
plt.title("Local Weighted Regression") plt.title("Local Weighted Regression")
plt.xlabel(x_name) plt.xlabel(cola_name)
plt.ylabel(y_name) plt.ylabel(colb_name)
plt.show() plt.show()
@ -179,7 +144,6 @@ if __name__ == "__main__":
doctest.testmod() doctest.testmod()
# Demo with a dataset from the seaborn module training_data_x, mcol_b, col_a, col_b = load_data("tips", "total_bill", "tip")
training_data_x, total_bill, tip = load_data("tips", "total_bill", "tip") predictions = get_preds(training_data_x, mcol_b, 0.5)
predictions = local_weight_regression(training_data_x, tip, 5) plot_preds(training_data_x, predictions, col_a, col_b, "total_bill", "tip")
plot_preds(training_data_x, predictions, total_bill, tip, "total_bill", "tip")

View File

@ -1,42 +0,0 @@
from itertools import compress, repeat
from math import ceil, sqrt
def odd_sieve(num: int) -> list[int]:
    """
    Return every prime strictly smaller than `num`.

    Only odd numbers are stored in the sieve (2 is added separately), following
    the Sieve of Eratosthenes
    (see for reference https://en.wikipedia.org/wiki/Sieve_of_Eratosthenes).

    >>> odd_sieve(2)
    []
    >>> odd_sieve(3)
    [2]
    >>> odd_sieve(10)
    [2, 3, 5, 7]
    >>> odd_sieve(20)
    [2, 3, 5, 7, 11, 13, 17, 19]
    """
    # Nothing is prime below 2, and 2 is the only prime below 3.
    if num <= 2:
        return []
    if num == 3:
        return [2]

    # flags[k] stands for the odd number 2 * k + 3, i.e. the odd values
    # in the range [3, num - 1].
    flags = bytearray(b"\x01") * ((num >> 1) - 1)

    limit = int(sqrt(num))
    for candidate in range(3, limit + 1, 2):
        if not flags[(candidate >> 1) - 1]:
            continue  # already crossed out as a multiple of a smaller prime
        square = candidate**2
        # Moving `candidate` slots forward advances the represented value by
        # 2 * candidate, so this slice hits exactly the odd multiples
        # starting at candidate squared.
        first_slot = (square >> 1) - 1
        multiples = ceil((num - square) / (candidate << 1))
        flags[first_slot::candidate] = repeat(0, multiples)

    primes = [2]
    primes.extend(compress(range(3, num, 2), flags))
    return primes
# Run the doctests embedded in this module when it is executed as a script.
if __name__ == "__main__":
    import doctest

    doctest.testmod()

View File

@ -1,165 +0,0 @@
"""
Guess a number given a lower bound, a higher bound and the value to find.
The solution works by repeatedly taking the midpoint of the current lower and higher bounds.
For example, lower could be 0, higher 1000 and the number to guess 355.
>>> guess_the_number(10, 1000, 17)
started...
guess the number : 17
details : [505, 257, 133, 71, 40, 25, 17]
"""
def temp_input_value(
    min_val: int = 10, max_val: int = 1000, option: bool = True
) -> int:
    """
    Temporary input values for tests: return one of two bounds.

    Args:
        min_val: lower bound, returned when ``option`` is True
        max_val: upper bound, returned when ``option`` is False
        option: selects which of the two bounds is returned

    Returns:
        ``min_val`` if ``option`` is True, otherwise ``max_val``

    Raises:
        AssertionError: if ``min_val``/``max_val`` are not ints or ``option``
            is not a bool
        ValueError: if ``min_val`` is greater than ``max_val``

    >>> temp_input_value(option=True)
    10

    >>> temp_input_value(option=False)
    1000

    >>> temp_input_value(min_val=100, option=True)
    100

    >>> temp_input_value(min_val=100, max_val=50)
    Traceback (most recent call last):
        ...
    ValueError: Invalid value for min_val or max_val (min_value < max_value)

    >>> temp_input_value("ten","fifty",1)
    Traceback (most recent call last):
        ...
    AssertionError: Invalid type of value(s) specified to function!

    >>> temp_input_value(min_val=-100, max_val=500)
    -100

    >>> temp_input_value(min_val=-5100, max_val=-100)
    -5100
    """
    types_ok = (
        isinstance(min_val, int)
        and isinstance(max_val, int)
        and isinstance(option, bool)
    )
    if not types_ok:
        # Raised explicitly instead of via ``assert`` so the validation is not
        # stripped under ``python -O``; the exception type stays the same.
        raise AssertionError("Invalid type of value(s) specified to function!")

    if min_val > max_val:
        raise ValueError("Invalid value for min_val or max_val (min_value < max_value)")
    return min_val if option else max_val
def get_avg(number_1: int, number_2: int) -> int:
    """
    Return the whole-number midpoint of two numbers.

    The two values are added, halved with true division, and the result is
    truncated by ``int()``.

    >>> get_avg(10, 15)
    12

    >>> get_avg(20, 300)
    160

    >>> get_avg("abcd", 300)
    Traceback (most recent call last):
        ...
    TypeError: can only concatenate str (not "int") to str

    >>> get_avg(10.5,50.25)
    30
    """
    total = number_1 + number_2
    half = total / 2
    return int(half)
def guess_the_number(lower: int, higher: int, to_guess: int) -> None:
    """
    Find ``to_guess`` by repeatedly taking the midpoint of the current bounds
    (via ``get_avg``) and print the result together with every guess made.

    Args:
        lower: lower bound of the search interval (exclusive)
        higher: upper bound of the search interval (exclusive)
        to_guess: the number to find; must lie strictly between the bounds

    Raises:
        AssertionError: if any argument is not an int
        ValueError: if ``lower`` is greater than ``higher``, or ``to_guess``
            is not strictly between ``lower`` and ``higher``

    >>> guess_the_number(10, 1000, 17)
    started...
    guess the number : 17
    details : [505, 257, 133, 71, 40, 25, 17]

    >>> guess_the_number(-10000, 10000, 7)
    started...
    guess the number : 7
    details : [0, 5000, 2500, 1250, 625, 312, 156, 78, 39, 19, 9, 4, 6, 7]

    >>> guess_the_number(10, 1000, "a")
    Traceback (most recent call last):
        ...
    AssertionError: argument values must be type of "int"

    >>> guess_the_number(10, 1000, 5)
    Traceback (most recent call last):
        ...
    ValueError: guess value must be within the range of lower and higher value

    >>> guess_the_number(10000, 100, 5)
    Traceback (most recent call last):
        ...
    ValueError: argument value for lower and higher must be(lower < higher)
    """
    if not (
        isinstance(lower, int) and isinstance(higher, int) and isinstance(to_guess, int)
    ):
        # Raised explicitly instead of via ``assert`` so the validation is not
        # stripped under ``python -O``; the exception type stays the same.
        raise AssertionError('argument values must be type of "int"')

    if lower > higher:
        # The message states the requirement that was violated (lower < higher).
        raise ValueError("argument value for lower and higher must be(lower < higher)")

    if not lower < to_guess < higher:
        raise ValueError(
            "guess value must be within the range of lower and higher value"
        )

    print("started...")

    last_lowest = lower
    last_highest = higher
    last_numbers = []

    # to_guess always stays strictly between the two bounds, so each midpoint
    # narrows the interval until the midpoint equals to_guess.
    while True:
        number = get_avg(last_lowest, last_highest)
        last_numbers.append(number)

        if number < to_guess:
            last_lowest = number
        elif number > to_guess:
            last_highest = number
        else:
            break

    print(f"guess the number : {last_numbers[-1]}")
    print(f"details : {last_numbers}")
def main() -> None:
    """
    Entry point of the script: read the lower bound, the upper bound and the
    value to guess from standard input, then run `guess_the_number`.
    """
    prompts = (
        "Enter lower value : ",
        "Enter high value : ",
        "Enter value to guess : ",
    )
    # The generator is consumed in order, so each prompt is shown and parsed
    # before the next one is asked.
    lower, higher, guess = (int(input(prompt).strip()) for prompt in prompts)
    guess_the_number(lower, higher, guess)
# Start the interactive prompt only when the module is executed as a script.
if __name__ == "__main__":
    main()

View File

@ -1,71 +0,0 @@
"""
Task:
Given an array of integers citations where citations[i] is the number of
citations a researcher received for their ith paper, compute and return the
researcher's h-index.
According to the definition of h-index on Wikipedia: A scientist has an
index h if h of their n papers have at least h citations each, and the other
n - h papers have no more than h citations each.
If there are several possible values for h, the maximum one is taken as the
h-index.
H-Index link: https://en.wikipedia.org/wiki/H-index
Implementation notes:
Use sorting of array
Leetcode link: https://leetcode.com/problems/h-index/description/
n = len(citations)
Runtime Complexity: O(n * log(n))
Space Complexity: O(1)
"""
def h_index(citations: list[int]) -> int:
    """
    Return H-index of citations

    The input list is left unmodified; a sorted copy is used internally.

    Args:
        citations: number of citations per paper, as non-negative integers

    Returns:
        the largest h such that h papers have at least h citations each

    Raises:
        ValueError: if ``citations`` is not a list of non-negative integers

    >>> h_index([3, 0, 6, 1, 5])
    3
    >>> h_index([1, 3, 1])
    1
    >>> h_index([1, 2, 3])
    2
    >>> h_index('test')
    Traceback (most recent call last):
        ...
    ValueError: The citations should be a list of non negative integers.
    >>> h_index([1,2,'3'])
    Traceback (most recent call last):
        ...
    ValueError: The citations should be a list of non negative integers.
    >>> h_index([1,2,-3])
    Traceback (most recent call last):
        ...
    ValueError: The citations should be a list of non negative integers.
    """
    # validate:
    if not isinstance(citations, list) or not all(
        isinstance(item, int) and item >= 0 for item in citations
    ):
        raise ValueError("The citations should be a list of non negative integers.")

    # Sort a copy in descending order so the caller's list is not reordered.
    ranked = sorted(citations, reverse=True)

    # `rank` papers precede this one, each with at least as many citations;
    # the first paper whose count does not exceed its rank caps the h-index.
    for rank, count in enumerate(ranked):
        if count <= rank:
            return rank

    return len(ranked)
# Run the doctests embedded in this module when it is executed as a script.
if __name__ == "__main__":
    import doctest

    doctest.testmod()