Compare commits


No commits in common. "a2783c6597a154a87f60bb5878770d2f152a1d09" and "c0892a06515b8ea5030db2e8344dee2292bb10ad" have entirely different histories.

5 changed files with 76 additions and 391 deletions

View File

@@ -712,7 +712,6 @@
* [Gauss Easter](other/gauss_easter.py)
* [Graham Scan](other/graham_scan.py)
* [Greedy](other/greedy.py)
* [H Index](other/h_index.py)
* [Least Recently Used](other/least_recently_used.py)
* [Lfu Cache](other/lfu_cache.py)
* [Linear Congruential Generator](other/linear_congruential_generator.py)

View File

@@ -1,55 +1,14 @@
"""
Locally weighted linear regression, also called local regression, is a type of
non-parametric linear regression that prioritizes data closest to a given
prediction point. The algorithm estimates the vector of model coefficients β
using weighted least squares regression:
β = (XᵀWX)⁻¹(XᵀWy),
where X is the design matrix, y is the response vector, and W is the diagonal
weight matrix.
This implementation calculates wᵢ, the weight of the ith training sample, using
the Gaussian weight:
wᵢ = exp(-‖xᵢ - x‖²/(2τ²)),
where xᵢ is the ith training sample, x is the prediction point, τ is the
"bandwidth", and x is the Euclidean norm (also called the 2-norm or the
norm). The bandwidth τ controls how quickly the weight of a training sample
decreases as its distance from the prediction point increases. One can think of
the Gaussian weight as a bell curve centered around the prediction point: a
training sample is weighted lower if it's farther from the center, and τ
controls the spread of the bell curve.
Other types of locally weighted regression such as locally estimated scatterplot
smoothing (LOESS) typically use different weight functions.
References:
- https://en.wikipedia.org/wiki/Local_regression
- https://en.wikipedia.org/wiki/Weighted_least_squares
- https://cs229.stanford.edu/notes2022fall/main_notes.pdf
"""
import matplotlib.pyplot as plt
import numpy as np
def weight_matrix(point: np.ndarray, x_train: np.ndarray, tau: float) -> np.ndarray:
def weighted_matrix(
point: np.array, training_data_x: np.array, bandwidth: float
) -> np.array:
"""
Calculate the weight of every point in the training data around a given
prediction point
Args:
point: x-value at which the prediction is being made
x_train: ndarray of x-values for training
tau: bandwidth value, controls how quickly the weight of training values
decreases as the distance from the prediction point increases
Returns:
m x m weight matrix around the prediction point, where m is the size of
the training set
>>> weight_matrix(
Calculate the weight for every point in the data set.
point --> the x value at which we want to make predictions
>>> weighted_matrix(
... np.array([1., 1.]),
... np.array([[16.99, 10.34], [21.01,23.68], [24.59,25.69]]),
... 0.6
@@ -58,30 +17,25 @@ def weight_matrix(point: np.ndarray, x_train: np.ndarray, tau: float) -> np.ndar
[0.00000000e+000, 0.00000000e+000, 0.00000000e+000],
[0.00000000e+000, 0.00000000e+000, 0.00000000e+000]])
"""
m = len(x_train) # Number of training samples
weights = np.eye(m) # Initialize weights as identity matrix
for j in range(m):
diff = point - x_train[j]
weights[j, j] = np.exp(diff @ diff.T / (-2.0 * tau**2))
m, _ = np.shape(training_data_x) # m is the number of training samples
weights = np.eye(m) # Initializing weights as identity matrix
# calculating weights for all training examples [x(i)'s]
for j in range(m):
diff = point - training_data_x[j]
weights[j, j] = np.exp(diff @ diff.T / (-2.0 * bandwidth**2))
return weights
def local_weight(
point: np.ndarray, x_train: np.ndarray, y_train: np.ndarray, tau: float
) -> np.ndarray:
point: np.array,
training_data_x: np.array,
training_data_y: np.array,
bandwidth: float,
) -> np.array:
"""
Calculate the local weights at a given prediction point using the weight
matrix for that point
Args:
point: x-value at which the prediction is being made
x_train: ndarray of x-values for training
y_train: ndarray of y-values for training
tau: bandwidth value, controls how quickly the weight of training values
decreases as the distance from the prediction point increases
Returns:
ndarray of local weights
Calculate the local weights using the weighted_matrix function on training data.
Return the local weight (coefficient) vector.
>>> local_weight(
... np.array([1., 1.]),
... np.array([[16.99, 10.34], [21.01,23.68], [24.59,25.69]]),
@@ -91,28 +45,19 @@ def local_weight(
array([[0.00873174],
[0.08272556]])
"""
weight_mat = weight_matrix(point, x_train, tau)
weight = np.linalg.inv(x_train.T @ weight_mat @ x_train) @ (
x_train.T @ weight_mat @ y_train.T
weight = weighted_matrix(point, training_data_x, bandwidth)
w = np.linalg.inv(training_data_x.T @ (weight @ training_data_x)) @ (
training_data_x.T @ weight @ training_data_y.T
)
return weight
return w
def local_weight_regression(
x_train: np.ndarray, y_train: np.ndarray, tau: float
) -> np.ndarray:
training_data_x: np.array, training_data_y: np.array, bandwidth: float
) -> np.array:
"""
Calculate predictions for each point in the training data
Args:
x_train: ndarray of x-values for training
y_train: ndarray of y-values for training
tau: bandwidth value, controls how quickly the weight of training values
decreases as the distance from the prediction point increases
Returns:
ndarray of predictions
Calculate predictions for each data point in the training data
>>> local_weight_regression(
... np.array([[16.99, 10.34], [21.01, 23.68], [24.59, 25.69]]),
... np.array([[1.01, 1.66, 3.5]]),
@@ -120,57 +65,77 @@ def local_weight_regression(
... )
array([1.07173261, 1.65970737, 3.50160179])
"""
y_pred = np.zeros(len(x_train)) # Initialize array of predictions
for i, item in enumerate(x_train):
y_pred[i] = item @ local_weight(item, x_train, y_train, tau)
m, _ = np.shape(training_data_x)
ypred = np.zeros(m)
return y_pred
for i, item in enumerate(training_data_x):
ypred[i] = item @ local_weight(
item, training_data_x, training_data_y, bandwidth
)
return ypred
def load_data(
dataset_name: str, x_name: str, y_name: str
) -> tuple[np.ndarray, np.ndarray, np.ndarray]:
dataset_name: str, cola_name: str, colb_name: str
) -> tuple[np.array, np.array, np.array, np.array]:
"""
Load data from seaborn and split it into x and y points
>>> pass # No doctests, function is for demo purposes only
"""
import seaborn as sns
data = sns.load_dataset(dataset_name)
x_data = np.array(data[x_name])
y_data = np.array(data[y_name])
col_a = np.array(data[cola_name]) # total_bill
col_b = np.array(data[colb_name]) # tip
one = np.ones(len(y_data))
mcol_a = col_a.copy()
mcol_b = col_b.copy()
# pairing elements of one and x_data
x_train = np.column_stack((one, x_data))
one = np.ones(np.shape(mcol_b)[0], dtype=int)
return x_train, x_data, y_data
# pairing elements of one and mcol_a
training_data_x = np.column_stack((one, mcol_a))
return training_data_x, mcol_b, col_a, col_b
def get_preds(training_data_x: np.array, mcol_b: np.array, tau: float) -> np.array:
"""
Get predictions with minimum error for each training data point
>>> get_preds(
... np.array([[16.99, 10.34], [21.01, 23.68], [24.59, 25.69]]),
... np.array([[1.01, 1.66, 3.5]]),
... 0.6
... )
array([1.07173261, 1.65970737, 3.50160179])
"""
ypred = local_weight_regression(training_data_x, mcol_b, tau)
return ypred
def plot_preds(
x_train: np.ndarray,
preds: np.ndarray,
x_data: np.ndarray,
y_data: np.ndarray,
x_name: str,
y_name: str,
) -> None:
training_data_x: np.array,
predictions: np.array,
col_x: np.array,
col_y: np.array,
cola_name: str,
colb_name: str,
) -> plt.plot:
"""
Plot predictions and display the graph
>>> pass # No doctests, function is for demo purposes only
"""
x_train_sorted = np.sort(x_train, axis=0)
plt.scatter(x_data, y_data, color="blue")
xsort = training_data_x.copy()
xsort.sort(axis=0)
plt.scatter(col_x, col_y, color="blue")
plt.plot(
x_train_sorted[:, 1],
preds[x_train[:, 1].argsort(0)],
xsort[:, 1],
predictions[training_data_x[:, 1].argsort(0)],
color="yellow",
linewidth=5,
)
plt.title("Local Weighted Regression")
plt.xlabel(x_name)
plt.ylabel(y_name)
plt.xlabel(cola_name)
plt.ylabel(colb_name)
plt.show()
@@ -179,7 +144,6 @@ if __name__ == "__main__":
doctest.testmod()
# Demo with a dataset from the seaborn module
training_data_x, total_bill, tip = load_data("tips", "total_bill", "tip")
predictions = local_weight_regression(training_data_x, tip, 5)
plot_preds(training_data_x, predictions, total_bill, tip, "total_bill", "tip")
training_data_x, mcol_b, col_a, col_b = load_data("tips", "total_bill", "tip")
predictions = get_preds(training_data_x, mcol_b, 0.5)
plot_preds(training_data_x, predictions, col_a, col_b, "total_bill", "tip")
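
Both sides of this diff implement the same closed form from the module docstring: β = (XᵀWX)⁻¹(XᵀWy) with Gaussian weights. As a minimal standalone sketch of that computation (lwr_predict is a hypothetical name, not in either version; it treats x as the full design matrix, e.g. with the bias column that load_data prepends):

import numpy as np

def lwr_predict(point: np.ndarray, x: np.ndarray, y: np.ndarray, tau: float) -> float:
    # Gaussian weights: w_i = exp(-||x_i - point||^2 / (2 * tau^2))
    diffs = x - point
    w = np.diag(np.exp(-np.sum(diffs**2, axis=1) / (2.0 * tau**2)))
    # Weighted normal equations: beta = (X^T W X)^(-1) (X^T W y)
    beta = np.linalg.inv(x.T @ w @ x) @ (x.T @ w @ y)
    return float(point @ beta)

Called once per training row on the three-sample doctest data with bandwidth 0.6, this should reproduce the predictions shown above, array([1.07173261, 1.65970737, 3.50160179]).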

View File

@@ -1,42 +0,0 @@
from itertools import compress, repeat
from math import ceil, sqrt
def odd_sieve(num: int) -> list[int]:
"""
Returns the prime numbers < `num`. The prime numbers are calculated using an
odd sieve implementation of the Sieve of Eratosthenes algorithm
(see for reference https://en.wikipedia.org/wiki/Sieve_of_Eratosthenes).
>>> odd_sieve(2)
[]
>>> odd_sieve(3)
[2]
>>> odd_sieve(10)
[2, 3, 5, 7]
>>> odd_sieve(20)
[2, 3, 5, 7, 11, 13, 17, 19]
"""
if num <= 2:
return []
if num == 3:
return [2]
# Odd sieve for numbers in range [3, num - 1]
sieve = bytearray(b"\x01") * ((num >> 1) - 1)
for i in range(3, int(sqrt(num)) + 1, 2):
if sieve[(i >> 1) - 1]:
i_squared = i**2
sieve[(i_squared >> 1) - 1 :: i] = repeat(
0, ceil((num - i_squared) / (i << 1))
)
return [2] + list(compress(range(3, num, 2), sieve))
if __name__ == "__main__":
import doctest
doctest.testmod()
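
The slice arithmetic above is the crux of the odd sieve: only odd numbers ≥ 3 are stored, so odd n lives at sieve index (n >> 1) - 1, and a slice step of i over the odds-only array strikes out the odd multiples of i, which are 2i apart as integers. A quick cross-check sketch against a naive full sieve (naive_sieve is a hypothetical helper, and the assert assumes odd_sieve above is in scope):

def naive_sieve(num: int) -> list[int]:
    # Plain Sieve of Eratosthenes over every integer < num, for comparison
    is_prime = [False, False] + [True] * max(num - 2, 0)
    for i in range(2, int(num**0.5) + 1):
        if is_prime[i]:
            # Strike out multiples of i starting at i * i
            is_prime[i * i :: i] = [False] * len(is_prime[i * i :: i])
    return [i for i in range(2, num) if is_prime[i]]

assert all(odd_sieve(n) == naive_sieve(n) for n in range(2, 500))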

View File

@@ -1,165 +0,0 @@
"""
Guess the number using lower and higher bounds and the value to find.
The solution works by repeatedly bisecting the interval between lower and higher.
For example, suppose lower is 10, higher is 1000, and the number to guess is 17:
>>> guess_the_number(10, 1000, 17)
started...
guess the number : 17
details : [505, 257, 133, 71, 40, 25, 17]
"""
def temp_input_value(
min_val: int = 10, max_val: int = 1000, option: bool = True
) -> int:
"""
Temporary input values for tests
>>> temp_input_value(option=True)
10
>>> temp_input_value(option=False)
1000
>>> temp_input_value(min_val=100, option=True)
100
>>> temp_input_value(min_val=100, max_val=50)
Traceback (most recent call last):
...
ValueError: Invalid value for min_val or max_val (min_val < max_val)
>>> temp_input_value("ten","fifty",1)
Traceback (most recent call last):
...
AssertionError: Invalid type of value(s) specified to function!
>>> temp_input_value(min_val=-100, max_val=500)
-100
>>> temp_input_value(min_val=-5100, max_val=-100)
-5100
"""
assert (
isinstance(min_val, int)
and isinstance(max_val, int)
and isinstance(option, bool)
), "Invalid type of value(s) specified to function!"
if min_val > max_val:
raise ValueError("Invalid value for min_val or max_val (min_value < max_value)")
return min_val if option else max_val
def get_avg(number_1: int, number_2: int) -> int:
"""
Return the whole-number midpoint of number_1 and number_2
>>> get_avg(10, 15)
12
>>> get_avg(20, 300)
160
>>> get_avg("abcd", 300)
Traceback (most recent call last):
...
TypeError: can only concatenate str (not "int") to str
>>> get_avg(10.5,50.25)
30
"""
return int((number_1 + number_2) / 2)
def guess_the_number(lower: int, higher: int, to_guess: int) -> None:
"""
Guess the number by repeated bisection, comparing each midpoint against
`to_guess` with the inner `answer` helper
>>> guess_the_number(10, 1000, 17)
started...
guess the number : 17
details : [505, 257, 133, 71, 40, 25, 17]
>>> guess_the_number(-10000, 10000, 7)
started...
guess the number : 7
details : [0, 5000, 2500, 1250, 625, 312, 156, 78, 39, 19, 9, 4, 6, 7]
>>> guess_the_number(10, 1000, "a")
Traceback (most recent call last):
...
AssertionError: argument values must be type of "int"
>>> guess_the_number(10, 1000, 5)
Traceback (most recent call last):
...
ValueError: guess value must be within the range of lower and higher value
>>> guess_the_number(10000, 100, 5)
Traceback (most recent call last):
...
ValueError: argument value for lower and higher must be (lower < higher)
"""
assert (
isinstance(lower, int) and isinstance(higher, int) and isinstance(to_guess, int)
), 'argument values must be type of "int"'
if lower > higher:
raise ValueError("argument value for lower and higher must be(lower > higher)")
if not lower < to_guess < higher:
raise ValueError(
"guess value must be within the range of lower and higher value"
)
def answer(number: int) -> str:
"""
Return "high", "low", or "same" by comparing `number` with `to_guess`
"""
if number > to_guess:
return "high"
elif number < to_guess:
return "low"
else:
return "same"
print("started...")
last_lowest = lower
last_highest = higher
last_numbers = []
while True:
number = get_avg(last_lowest, last_highest)
last_numbers.append(number)
if answer(number) == "low":
last_lowest = number
elif answer(number) == "high":
last_highest = number
else:
break
print(f"guess the number : {last_numbers[-1]}")
print(f"details : {str(last_numbers)}")
def main() -> None:
"""
Starting point of the script: read inputs and run the search
"""
lower = int(input("Enter lower value : ").strip())
higher = int(input("Enter high value : ").strip())
guess = int(input("Enter value to guess : ").strip())
guess_the_number(lower, higher, guess)
if __name__ == "__main__":
main()
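
The while loop in guess_the_number is ordinary bisection. A condensed, hedged equivalent (bisect_guess is a hypothetical name, not part of this diff; it uses floor division, which matches get_avg's int((number_1 + number_2) / 2) for the non-negative midpoints exercised in the doctests but rounds differently for negative midpoints):

def bisect_guess(lower: int, higher: int, to_guess: int) -> list[int]:
    # Collect every midpoint tried until one equals the target
    guesses = []
    while True:
        mid = (lower + higher) // 2
        guesses.append(mid)
        if mid < to_guess:
            lower = mid  # target lies in the upper half
        elif mid > to_guess:
            higher = mid  # target lies in the lower half
        else:
            return guesses

bisect_guess(10, 1000, 17) returns [505, 257, 133, 71, 40, 25, 17], matching the doctest above.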

View File

@@ -1,71 +0,0 @@
"""
Task:
Given an array of integers citations where citations[i] is the number of
citations a researcher received for their ith paper, compute and return the
researcher's h-index.
According to the definition of h-index on Wikipedia: A scientist has an
index h if h of their n papers have at least h citations each, and the other
n - h papers have no more than h citations each.
If there are several possible values for h, the maximum one is taken as the
h-index.
H-Index link: https://en.wikipedia.org/wiki/H-index
Implementation notes:
Sort the citations array, then scan it from the most-cited paper
Leetcode link: https://leetcode.com/problems/h-index/description/
n = len(citations)
Runtime Complexity: O(n * log(n))
Space Complexity: O(1)
"""
def h_index(citations: list[int]) -> int:
"""
Return H-index of citations
>>> h_index([3, 0, 6, 1, 5])
3
>>> h_index([1, 3, 1])
1
>>> h_index([1, 2, 3])
2
>>> h_index('test')
Traceback (most recent call last):
...
ValueError: The citations should be a list of non-negative integers.
>>> h_index([1,2,'3'])
Traceback (most recent call last):
...
ValueError: The citations should be a list of non-negative integers.
>>> h_index([1,2,-3])
Traceback (most recent call last):
...
ValueError: The citations should be a list of non-negative integers.
"""
# validate:
if not isinstance(citations, list) or not all(
isinstance(item, int) and item >= 0 for item in citations
):
raise ValueError("The citations should be a list of non negative integers.")
citations.sort()
len_citations = len(citations)
for i in range(len_citations):
if citations[len_citations - 1 - i] <= i:
return i
return len_citations
if __name__ == "__main__":
import doctest
doctest.testmod()
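
As a worked trace of the sorted scan (illustrative comments, not part of the original file): after citations.sort(), the loop checks whether the (i + 1)th most-cited paper has at most i citations, and the first i where that holds is the h-index.

# h_index([3, 0, 6, 1, 5]); after sort: citations = [0, 1, 3, 5, 6]
# i = 0: citations[4] = 6 > 0   -> top 1 paper has more than 0 citations
# i = 1: citations[3] = 5 > 1   -> top 2 papers each have more than 1
# i = 2: citations[2] = 3 > 2   -> top 3 papers each have more than 2
# i = 3: citations[1] = 1 <= 3  -> the 4th-ranked paper falls short, so h = 3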