Compare commits

...

4 Commits

Author SHA1 Message Date
Harkishan Khuva
a2783c6597
Create guess_the_number_search.py (#7937) 2023-05-17 12:22:24 +12:00
Alexander Pantyukhin
61cfb43d2b
Add h index (#8036) 2023-05-17 12:21:16 +12:00
Rohan Saraogi
3dc143f721
Added odd_sieve.py (#8740) 2023-05-17 12:08:56 +12:00
Tianyi Zheng
8102424950
local_weighted_learning.py: fix mypy errors and more (#8073) 2023-05-17 12:05:55 +12:00
5 changed files with 391 additions and 76 deletions

View File

@ -712,6 +712,7 @@
* [Gauss Easter](other/gauss_easter.py) * [Gauss Easter](other/gauss_easter.py)
* [Graham Scan](other/graham_scan.py) * [Graham Scan](other/graham_scan.py)
* [Greedy](other/greedy.py) * [Greedy](other/greedy.py)
* [H Index](other/h_index.py)
* [Least Recently Used](other/least_recently_used.py) * [Least Recently Used](other/least_recently_used.py)
* [Lfu Cache](other/lfu_cache.py) * [Lfu Cache](other/lfu_cache.py)
* [Linear Congruential Generator](other/linear_congruential_generator.py) * [Linear Congruential Generator](other/linear_congruential_generator.py)

View File

@ -1,14 +1,55 @@
"""
Locally weighted linear regression, also called local regression, is a type of
non-parametric linear regression that prioritizes data closest to a given
prediction point. The algorithm estimates the vector of model coefficients β
using weighted least squares regression:
β = (XᵀWX)⁻¹(XᵀWy),
where X is the design matrix, y is the response vector, and W is the diagonal
weight matrix.
This implementation calculates wᵢ, the weight of the ith training sample, using
the Gaussian weight:
wᵢ = exp(-‖xᵢ - x‖²/(2τ²)),
where xᵢ is the ith training sample, x is the prediction point, τ is the
"bandwidth", and ‖x‖ is the Euclidean norm (also called the 2-norm or the L²
norm). The bandwidth τ controls how quickly the weight of a training sample
decreases as its distance from the prediction point increases. One can think of
the Gaussian weight as a bell curve centered around the prediction point: a
training sample is weighted lower if it's farther from the center, and τ
controls the spread of the bell curve.
Other types of locally weighted regression such as locally estimated scatterplot
smoothing (LOESS) typically use different weight functions.
References:
- https://en.wikipedia.org/wiki/Local_regression
- https://en.wikipedia.org/wiki/Weighted_least_squares
- https://cs229.stanford.edu/notes2022fall/main_notes.pdf
"""
import matplotlib.pyplot as plt import matplotlib.pyplot as plt
import numpy as np import numpy as np
def weighted_matrix( def weight_matrix(point: np.ndarray, x_train: np.ndarray, tau: float) -> np.ndarray:
point: np.array, training_data_x: np.array, bandwidth: float
) -> np.array:
""" """
Calculate the weight for every point in the data set. Calculate the weight of every point in the training data around a given
point --> the x value at which we want to make predictions prediction point
>>> weighted_matrix(
Args:
point: x-value at which the prediction is being made
x_train: ndarray of x-values for training
tau: bandwidth value, controls how quickly the weight of training values
decreases as the distance from the prediction point increases
Returns:
m x m weight matrix around the prediction point, where m is the size of
the training set
>>> weight_matrix(
... np.array([1., 1.]), ... np.array([1., 1.]),
... np.array([[16.99, 10.34], [21.01,23.68], [24.59,25.69]]), ... np.array([[16.99, 10.34], [21.01,23.68], [24.59,25.69]]),
... 0.6 ... 0.6
@ -17,25 +58,30 @@ def weighted_matrix(
[0.00000000e+000, 0.00000000e+000, 0.00000000e+000], [0.00000000e+000, 0.00000000e+000, 0.00000000e+000],
[0.00000000e+000, 0.00000000e+000, 0.00000000e+000]]) [0.00000000e+000, 0.00000000e+000, 0.00000000e+000]])
""" """
m, _ = np.shape(training_data_x) # m is the number of training samples m = len(x_train) # Number of training samples
weights = np.eye(m) # Initializing weights as identity matrix weights = np.eye(m) # Initialize weights as identity matrix
# calculating weights for all training examples [x(i)'s]
for j in range(m): for j in range(m):
diff = point - training_data_x[j] diff = point - x_train[j]
weights[j, j] = np.exp(diff @ diff.T / (-2.0 * bandwidth**2)) weights[j, j] = np.exp(diff @ diff.T / (-2.0 * tau**2))
return weights return weights
def local_weight( def local_weight(
point: np.array, point: np.ndarray, x_train: np.ndarray, y_train: np.ndarray, tau: float
training_data_x: np.array, ) -> np.ndarray:
training_data_y: np.array,
bandwidth: float,
) -> np.array:
""" """
Calculate the local weights using the weight_matrix function on training data. Calculate the local weights at a given prediction point using the weight
Return the weighted matrix. matrix for that point
Args:
point: x-value at which the prediction is being made
x_train: ndarray of x-values for training
y_train: ndarray of y-values for training
tau: bandwidth value, controls how quickly the weight of training values
decreases as the distance from the prediction point increases
Returns:
ndarray of local weights
>>> local_weight( >>> local_weight(
... np.array([1., 1.]), ... np.array([1., 1.]),
... np.array([[16.99, 10.34], [21.01,23.68], [24.59,25.69]]), ... np.array([[16.99, 10.34], [21.01,23.68], [24.59,25.69]]),
@ -45,19 +91,28 @@ def local_weight(
array([[0.00873174], array([[0.00873174],
[0.08272556]]) [0.08272556]])
""" """
weight = weighted_matrix(point, training_data_x, bandwidth) weight_mat = weight_matrix(point, x_train, tau)
w = np.linalg.inv(training_data_x.T @ (weight @ training_data_x)) @ ( weight = np.linalg.inv(x_train.T @ weight_mat @ x_train) @ (
training_data_x.T @ weight @ training_data_y.T x_train.T @ weight_mat @ y_train.T
) )
return w return weight
def local_weight_regression( def local_weight_regression(
training_data_x: np.array, training_data_y: np.array, bandwidth: float x_train: np.ndarray, y_train: np.ndarray, tau: float
) -> np.array: ) -> np.ndarray:
""" """
Calculate predictions for each data point on axis Calculate predictions for each point in the training data
Args:
x_train: ndarray of x-values for training
y_train: ndarray of y-values for training
tau: bandwidth value, controls how quickly the weight of training values
decreases as the distance from the prediction point increases
Returns:
ndarray of predictions
>>> local_weight_regression( >>> local_weight_regression(
... np.array([[16.99, 10.34], [21.01, 23.68], [24.59, 25.69]]), ... np.array([[16.99, 10.34], [21.01, 23.68], [24.59, 25.69]]),
... np.array([[1.01, 1.66, 3.5]]), ... np.array([[1.01, 1.66, 3.5]]),
@ -65,77 +120,57 @@ def local_weight_regression(
... ) ... )
array([1.07173261, 1.65970737, 3.50160179]) array([1.07173261, 1.65970737, 3.50160179])
""" """
m, _ = np.shape(training_data_x) y_pred = np.zeros(len(x_train)) # Initialize array of predictions
ypred = np.zeros(m) for i, item in enumerate(x_train):
y_pred[i] = item @ local_weight(item, x_train, y_train, tau)
for i, item in enumerate(training_data_x): return y_pred
ypred[i] = item @ local_weight(
item, training_data_x, training_data_y, bandwidth
)
return ypred
def load_data( def load_data(
dataset_name: str, cola_name: str, colb_name: str dataset_name: str, x_name: str, y_name: str
) -> tuple[np.array, np.array, np.array, np.array]: ) -> tuple[np.ndarray, np.ndarray, np.ndarray]:
""" """
Load data from seaborn and split it into x and y points Load data from seaborn and split it into x and y points
>>> pass # No doctests, function is for demo purposes only
""" """
import seaborn as sns import seaborn as sns
data = sns.load_dataset(dataset_name) data = sns.load_dataset(dataset_name)
col_a = np.array(data[cola_name]) # total_bill x_data = np.array(data[x_name])
col_b = np.array(data[colb_name]) # tip y_data = np.array(data[y_name])
mcol_a = col_a.copy() one = np.ones(len(y_data))
mcol_b = col_b.copy()
one = np.ones(np.shape(mcol_b)[0], dtype=int) # pairing elements of one and x_data
x_train = np.column_stack((one, x_data))
# pairing elements of one and mcol_a return x_train, x_data, y_data
training_data_x = np.column_stack((one, mcol_a))
return training_data_x, mcol_b, col_a, col_b
def get_preds(training_data_x: np.array, mcol_b: np.array, tau: float) -> np.array:
"""
Get predictions with minimum error for each training data
>>> get_preds(
... np.array([[16.99, 10.34], [21.01, 23.68], [24.59, 25.69]]),
... np.array([[1.01, 1.66, 3.5]]),
... 0.6
... )
array([1.07173261, 1.65970737, 3.50160179])
"""
ypred = local_weight_regression(training_data_x, mcol_b, tau)
return ypred
def plot_preds( def plot_preds(
training_data_x: np.array, x_train: np.ndarray,
predictions: np.array, preds: np.ndarray,
col_x: np.array, x_data: np.ndarray,
col_y: np.array, y_data: np.ndarray,
cola_name: str, x_name: str,
colb_name: str, y_name: str,
) -> plt.plot: ) -> None:
""" """
Plot predictions and display the graph Plot predictions and display the graph
>>> pass # No doctests, function is for demo purposes only
""" """
xsort = training_data_x.copy() x_train_sorted = np.sort(x_train, axis=0)
xsort.sort(axis=0) plt.scatter(x_data, y_data, color="blue")
plt.scatter(col_x, col_y, color="blue")
plt.plot( plt.plot(
xsort[:, 1], x_train_sorted[:, 1],
predictions[training_data_x[:, 1].argsort(0)], preds[x_train[:, 1].argsort(0)],
color="yellow", color="yellow",
linewidth=5, linewidth=5,
) )
plt.title("Local Weighted Regression") plt.title("Local Weighted Regression")
plt.xlabel(cola_name) plt.xlabel(x_name)
plt.ylabel(colb_name) plt.ylabel(y_name)
plt.show() plt.show()
@ -144,6 +179,7 @@ if __name__ == "__main__":
doctest.testmod() doctest.testmod()
training_data_x, mcol_b, col_a, col_b = load_data("tips", "total_bill", "tip") # Demo with a dataset from the seaborn module
predictions = get_preds(training_data_x, mcol_b, 0.5) training_data_x, total_bill, tip = load_data("tips", "total_bill", "tip")
plot_preds(training_data_x, predictions, col_a, col_b, "total_bill", "tip") predictions = local_weight_regression(training_data_x, tip, 5)
plot_preds(training_data_x, predictions, total_bill, tip, "total_bill", "tip")

42
maths/odd_sieve.py Normal file
View File

@ -0,0 +1,42 @@
from itertools import compress, repeat
from math import ceil, sqrt
def odd_sieve(num: int) -> list[int]:
    """
    List every prime number strictly below `num`.

    The primes come from a Sieve of Eratosthenes that stores flags for odd
    numbers only; 2 is prepended separately
    (reference: https://en.wikipedia.org/wiki/Sieve_of_Eratosthenes).

    Args:
        num: exclusive upper bound of the search range

    Returns:
        the primes below `num` in ascending order

    >>> odd_sieve(2)
    []
    >>> odd_sieve(3)
    [2]
    >>> odd_sieve(10)
    [2, 3, 5, 7]
    >>> odd_sieve(20)
    [2, 3, 5, 7, 11, 13, 17, 19]
    """
    if num <= 2:
        return []
    if num == 3:
        return [2]

    # flags[k] stands for the odd number 2 * k + 3, so the flags span [3, num - 1]
    flags = bytearray(b"\x01") * ((num >> 1) - 1)

    for candidate in range(3, int(sqrt(num)) + 1, 2):
        if not flags[(candidate >> 1) - 1]:
            continue  # already crossed out, so not a prime
        square = candidate * candidate
        # Odd multiples of `candidate` are 2 * candidate apart, which is
        # `candidate` slots in the odd-only flag array. The extended-slice
        # assignment needs exactly as many zeros as there are slots hit.
        strikes = ceil((num - square) / (candidate << 1))
        flags[(square >> 1) - 1 :: candidate] = repeat(0, strikes)

    primes = [2]
    primes.extend(compress(range(3, num, 2), flags))
    return primes
if __name__ == "__main__":
    from doctest import testmod

    # Execute the examples embedded in this module's docstrings.
    testmod()

View File

@ -0,0 +1,165 @@
"""
Guess a number using a lower bound, a higher bound and the value to find.
The solution works by repeatedly taking the midpoint of the current lower and
higher bounds and narrowing the range towards the number to guess.
For instance, lower could be 0, higher 1000 and the number to guess 355.
>>> guess_the_number(10, 1000, 17)
started...
guess the number : 17
details : [505, 257, 133, 71, 40, 25, 17]
"""
def temp_input_value(
    min_val: int = 10, max_val: int = 1000, option: bool = True
) -> int:
    """
    Temporary input values for tests

    Returns `min_val` when `option` is true, otherwise `max_val`.

    Args:
        min_val: lower value, must be an int
        max_val: higher value, must be an int and not smaller than min_val
        option: selects which of the two values is returned, must be a bool

    Raises:
        AssertionError: if an argument has the wrong type
        ValueError: if min_val is greater than max_val

    >>> temp_input_value(option=True)
    10
    >>> temp_input_value(option=False)
    1000
    >>> temp_input_value(min_val=100, option=True)
    100
    >>> temp_input_value(min_val=100, max_val=50)
    Traceback (most recent call last):
    ...
    ValueError: Invalid value for min_val or max_val (min_value < max_value)
    >>> temp_input_value("ten","fifty",1)
    Traceback (most recent call last):
    ...
    AssertionError: Invalid type of value(s) specified to function!
    >>> temp_input_value(min_val=-100, max_val=500)
    -100
    >>> temp_input_value(min_val=-5100, max_val=-100)
    -5100
    """
    # Raised explicitly rather than through an `assert` statement: asserts are
    # removed when Python runs with -O, which would silently skip this check.
    if not (
        isinstance(min_val, int)
        and isinstance(max_val, int)
        and isinstance(option, bool)
    ):
        raise AssertionError("Invalid type of value(s) specified to function!")

    if min_val > max_val:
        raise ValueError("Invalid value for min_val or max_val (min_value < max_value)")
    return min_val if option else max_val
def get_avg(number_1: int, number_2: int) -> int:
    """
    Return the mid-number(whole) of two numbers, truncated toward zero

    Integer inputs are averaged with exact integer arithmetic, so the result
    stays correct for values too large to be represented exactly as a float.
    Any other numeric inputs are averaged with true division and truncated.

    Args:
        number_1: first number
        number_2: second number

    Returns:
        the whole part of (number_1 + number_2) / 2

    >>> get_avg(10, 15)
    12
    >>> get_avg(20, 300)
    160
    >>> get_avg(-3, 0)
    -1
    >>> get_avg(0, 2**64 + 2) == 2**63 + 1
    True
    >>> get_avg("abcd", 300)
    Traceback (most recent call last):
    ...
    TypeError: can only concatenate str (not "int") to str
    >>> get_avg(10.5,50.25)
    30
    """
    total = number_1 + number_2
    if isinstance(total, int):
        # Halve the magnitude and restore the sign: this truncates toward zero
        # exactly like int(total / 2) but never goes through a float.
        half = abs(total) // 2
        return half if total >= 0 else -half
    return int(total / 2)
def guess_the_number(lower: int, higher: int, to_guess: int) -> None:
    """
    The `guess_the_number` function that guess the number by some operations
    and using inner functions

    Each guess is the midpoint (from `get_avg`) of the current bounds. A guess
    that is too low becomes the new lower bound and a guess that is too high
    becomes the new higher bound, until the guess equals `to_guess`. The final
    guess and the list of all guesses are printed.

    Args:
        lower: lower bound, must be smaller than `to_guess`
        higher: higher bound, must be greater than `to_guess`
        to_guess: the number to find

    Raises:
        AssertionError: if an argument is not an int
        ValueError: if lower is greater than higher, or if to_guess is not
            strictly between lower and higher

    >>> guess_the_number(10, 1000, 17)
    started...
    guess the number : 17
    details : [505, 257, 133, 71, 40, 25, 17]
    >>> guess_the_number(-10000, 10000, 7)
    started...
    guess the number : 7
    details : [0, 5000, 2500, 1250, 625, 312, 156, 78, 39, 19, 9, 4, 6, 7]
    >>> guess_the_number(10, 1000, "a")
    Traceback (most recent call last):
    ...
    AssertionError: argument values must be type of "int"
    >>> guess_the_number(10, 1000, 5)
    Traceback (most recent call last):
    ...
    ValueError: guess value must be within the range of lower and higher value
    >>> guess_the_number(10000, 100, 5)
    Traceback (most recent call last):
    ...
    ValueError: argument value for lower and higher must be(lower < higher)
    """
    # Raised explicitly rather than through an `assert` statement: asserts are
    # removed when Python runs with -O, which would silently skip this check.
    if not (
        isinstance(lower, int) and isinstance(higher, int) and isinstance(to_guess, int)
    ):
        raise AssertionError('argument values must be type of "int"')

    if lower > higher:
        raise ValueError("argument value for lower and higher must be(lower < higher)")

    if not lower < to_guess < higher:
        raise ValueError(
            "guess value must be within the range of lower and higher value"
        )

    def answer(number: int) -> str:
        """
        Returns value by comparing with entered `to_guess` number
        """
        if number > to_guess:
            return "high"
        elif number < to_guess:
            return "low"
        else:
            return "same"

    print("started...")

    last_lowest = lower
    last_highest = higher

    last_numbers = []

    while True:
        number = get_avg(last_lowest, last_highest)
        last_numbers.append(number)

        # Compare once per guess and reuse the verdict for both branches.
        verdict = answer(number)
        if verdict == "low":
            last_lowest = number
        elif verdict == "high":
            last_highest = number
        else:
            break

    print(f"guess the number : {last_numbers[-1]}")
    print(f"details : {last_numbers}")
def main() -> None:
    """
    Entry point of the script: read the lower value, the high value and the
    value to guess from standard input, then run `guess_the_number` on them
    """
    prompts = (
        "Enter lower value : ",
        "Enter high value : ",
        "Enter value to guess : ",
    )
    # The generator is consumed in order, so the prompts appear one at a time
    lower, higher, guess = (int(input(prompt).strip()) for prompt in prompts)
    guess_the_number(lower, higher, guess)


if __name__ == "__main__":
    main()

71
other/h_index.py Normal file
View File

@ -0,0 +1,71 @@
"""
Task:
Given an array of integers citations where citations[i] is the number of
citations a researcher received for their ith paper, compute the
researcher's h-index.
According to the definition of h-index on Wikipedia: A scientist has an
index h if h of their n papers have at least h citations each, and the other
n - h papers have no more than h citations each.
If there are several possible values for h, the maximum one is taken as the
h-index.
H-Index link: https://en.wikipedia.org/wiki/H-index
Implementation notes:
Use sorting of array
Leetcode link: https://leetcode.com/problems/h-index/description/
n = len(citations)
Runtime Complexity: O(n * log(n))
Space Complexity: O(1)
"""
def h_index(citations: list[int]) -> int:
    """
    Return H-index of citations

    The input list is left unmodified; the ranking is done on a sorted copy.

    Args:
        citations: list of non negative citation counts, one per paper

    Returns:
        the largest h such that h papers have at least h citations each

    Raises:
        ValueError: if citations is not a list of non negative integers

    >>> h_index([3, 0, 6, 1, 5])
    3
    >>> h_index([1, 3, 1])
    1
    >>> h_index([1, 2, 3])
    2
    >>> h_index([])
    0
    >>> papers = [3, 0, 6, 1, 5]
    >>> h_index(papers)
    3
    >>> papers
    [3, 0, 6, 1, 5]
    >>> h_index('test')
    Traceback (most recent call last):
        ...
    ValueError: The citations should be a list of non negative integers.
    >>> h_index([1,2,'3'])
    Traceback (most recent call last):
        ...
    ValueError: The citations should be a list of non negative integers.
    >>> h_index([1,2,-3])
    Traceback (most recent call last):
        ...
    ValueError: The citations should be a list of non negative integers.
    """
    # validate:
    if not isinstance(citations, list) or not all(
        isinstance(item, int) and item >= 0 for item in citations
    ):
        raise ValueError("The citations should be a list of non negative integers.")

    # Rank the papers from most to least cited on a copy, so the caller's list
    # keeps its original order.
    ranked = sorted(citations, reverse=True)
    for rank, count in enumerate(ranked):
        # `rank` papers are ranked above this one; once a paper's count does
        # not exceed that number, the h-index cannot grow any further.
        if count <= rank:
            return rank

    return len(ranked)
if __name__ == "__main__":
    from doctest import testmod

    # Execute the examples embedded in this module's docstrings.
    testmod()