"""
|
|
Implementation of a basic regression decision tree.
|
|
Input data set: The input data set must be 1-dimensional with continuous labels.
|
|
Output: The decision tree maps a real number input to a real number output.
|
|
"""
|
|
|
|
import numpy as np
|
|
|
|
|
|
class DecisionTree:
|
|
    def __init__(self, depth=5, min_leaf_size=5):
        self.depth = depth  # maximum depth this node may grow to
        self.decision_boundary = 0  # split threshold chosen during training
        self.left = None  # subtree for inputs below the boundary
        self.right = None  # subtree for inputs at or above the boundary
        self.min_leaf_size = min_leaf_size  # minimum number of samples per leaf
        self.prediction = None  # mean label; set only when this node is a leaf

    def mean_squared_error(self, labels, prediction):
        """
        mean_squared_error:
        @param labels: a one-dimensional numpy array
        @param prediction: a floating point value
        return value: mean_squared_error calculates the error if prediction is used to
            estimate the labels
        >>> tester = DecisionTree()
        >>> test_labels = np.array([1,2,3,4,5,6,7,8,9,10])
        >>> test_prediction = float(6)
        >>> tester.mean_squared_error(test_labels, test_prediction) == (
        ...     TestDecisionTree.helper_mean_squared_error_test(test_labels,
        ...         test_prediction))
        True
        >>> test_labels = np.array([1,2,3])
        >>> test_prediction = float(2)
        >>> tester.mean_squared_error(test_labels, test_prediction) == (
        ...     TestDecisionTree.helper_mean_squared_error_test(test_labels,
        ...         test_prediction))
        True
        """
        if labels.ndim != 1:
            raise ValueError("Input labels must be one-dimensional")

        return np.mean((labels - prediction) ** 2)

    def train(self, x, y):
        """
        train:
        @param x: a one-dimensional numpy array
        @param y: a one-dimensional numpy array.
            The contents of y are the labels for the corresponding x values

        train() does not have a return value

        Examples:
        1. Try to train when x & y are of the same length and one-dimensional
        (no errors)
        >>> dt = DecisionTree()
        >>> dt.train(np.array([10,20,30,40,50]),np.array([0,0,0,1,1]))

        2. Try to train when x is two-dimensional
        >>> dt = DecisionTree()
        >>> dt.train(np.array([[1,2,3,4,5],[1,2,3,4,5]]),np.array([0,0,0,1,1]))
        Traceback (most recent call last):
        ...
        ValueError: Input data set must be one-dimensional

        3. Try to train when x and y are not of the same length
        >>> dt = DecisionTree()
        >>> dt.train(np.array([1,2,3,4,5]),np.array([[0,0,0,1,1],[0,0,0,1,1]]))
        Traceback (most recent call last):
        ...
        ValueError: x and y have different lengths

        4. Try to train when x & y are of the same length but different dimensions
        >>> dt = DecisionTree()
        >>> dt.train(np.array([1,2,3,4,5]),np.array([[1],[2],[3],[4],[5]]))
        Traceback (most recent call last):
        ...
        ValueError: Data set labels must be one-dimensional

        This section is to check that the inputs conform to our dimensionality
        constraints
        """
        if x.ndim != 1:
            raise ValueError("Input data set must be one-dimensional")
        if len(x) != len(y):
            raise ValueError("x and y have different lengths")
        if y.ndim != 1:
            raise ValueError("Data set labels must be one-dimensional")

        # stop splitting when the node is too small to split or max depth is reached
        if len(x) < 2 * self.min_leaf_size or self.depth == 1:
            self.prediction = np.mean(y)
            return

        best_split = 0
        # error of predicting the mean label for the whole node, measured on the
        # labels y; a split is accepted only if it beats twice this value
        min_error = self.mean_squared_error(y, np.mean(y)) * 2

        # Loop over all possible splits and keep the best one. If no split produces
        # an error smaller than 2 * the error of the entire array, the data set is
        # not split and the mean of the entire array is used as the predictor.
        for i in range(len(x)):
            # skip splits that would leave either side smaller than min_leaf_size
            if i < self.min_leaf_size or len(x) - i < self.min_leaf_size:
                continue
            error_left = self.mean_squared_error(y[:i], np.mean(y[:i]))
            error_right = self.mean_squared_error(y[i:], np.mean(y[i:]))
            error = error_left + error_right
            if error < min_error:
                best_split = i
                min_error = error

        if best_split != 0:
            left_x = x[:best_split]
            left_y = y[:best_split]
            right_x = x[best_split:]
            right_y = y[best_split:]

            self.decision_boundary = x[best_split]
            self.left = DecisionTree(
                depth=self.depth - 1, min_leaf_size=self.min_leaf_size
            )
            self.right = DecisionTree(
                depth=self.depth - 1, min_leaf_size=self.min_leaf_size
            )
            self.left.train(left_x, left_y)
            self.right.train(right_x, right_y)
        else:
            self.prediction = np.mean(y)

    def predict(self, x):
        """
        predict:
        @param x: a floating point value to predict the label of
        the prediction function works by recursively calling the predict function
        of the appropriate subtree based on the tree's decision boundary
        """
        if self.prediction is not None:
            return self.prediction
        elif self.left is not None and self.right is not None:
            if x >= self.decision_boundary:
                return self.right.predict(x)
            else:
                return self.left.predict(x)
        else:
            raise ValueError("Decision tree not yet trained")
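

# A minimal usage sketch (an addition, not part of the original module; the
# helper name _example_step_function is hypothetical). It fits a depth-2 tree
# to a step function: inputs below the learned boundary map to the left-leaf
# mean and inputs at or above it to the right-leaf mean.
def _example_step_function():
    tree = DecisionTree(depth=2, min_leaf_size=1)
    tree.train(np.array([1.0, 2.0, 3.0, 4.0]), np.array([1.0, 1.0, 5.0, 5.0]))
    # the best split lands at x = 3.0, so predict(1.5) -> 1.0 and
    # predict(3.5) -> 5.0
    return tree.predict(1.5), tree.predict(3.5)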


class TestDecisionTree:
    """Decision Tree test class"""

    @staticmethod
    def helper_mean_squared_error_test(labels, prediction):
        """
        helper_mean_squared_error_test:
        @param labels: a one-dimensional numpy array
        @param prediction: a floating point value
        return value: helper_mean_squared_error_test calculates the mean squared error
        """
        squared_error_sum = float(0)
        for label in labels:
            squared_error_sum += (label - prediction) ** 2

        return float(squared_error_sum / labels.size)
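

# A small side sketch (an addition, not from the original file): when the
# prediction is the mean of the labels, mean_squared_error reduces to the
# population variance of the labels, so the split search in train() is
# effectively minimizing within-node label variance.
def _variance_equivalence_check(labels):
    tree = DecisionTree()
    mse_against_mean = tree.mean_squared_error(labels, np.mean(labels))
    return np.isclose(mse_against_mean, np.var(labels))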


def main():
    """
    In this demonstration we're generating a sample data set from the sin function in
    numpy. We then train a decision tree on the data set and use the decision tree to
    predict the label of 10 different test values. Then the mean squared error over
    this test is displayed.
    """
    x = np.arange(-1.0, 1.0, 0.005)
    y = np.sin(x)

    tree = DecisionTree(depth=10, min_leaf_size=10)
    tree.train(x, y)

    rng = np.random.default_rng()
    test_cases = (rng.random(10) * 2) - 1
    predictions = np.array([tree.predict(value) for value in test_cases])
    # measure error against the true labels sin(test_cases)
    avg_error = np.mean((predictions - np.sin(test_cases)) ** 2)

    print("Test values: " + str(test_cases))
    print("Predictions: " + str(predictions))
    print("Average error: " + str(avg_error))
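

# An optional variation on main() (the name main_seeded is an assumption, not
# part of the original file): seeding the random generator makes the
# demonstration reproducible across runs.
def main_seeded(seed=0):
    x = np.arange(-1.0, 1.0, 0.005)
    tree = DecisionTree(depth=10, min_leaf_size=10)
    tree.train(x, np.sin(x))
    test_cases = (np.random.default_rng(seed).random(10) * 2) - 1
    predictions = np.array([tree.predict(value) for value in test_cases])
    print("Average error: " + str(np.mean((predictions - np.sin(test_cases)) ** 2)))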
if __name__ == "__main__":
|
|
main()
|
|
import doctest
|
|
|
|
doctest.testmod(name="mean_squarred_error", verbose=True)
|