Python/machine_learning/decision_tree.py
Maxim Smolskiy c599f6c910
Fix some SIM114 per file ignores ()
* updating DIRECTORY.md

* Fix some SIM114 per file ignores

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* Fix review issue

---------

Co-authored-by: MaximSmolskiy <MaximSmolskiy@users.noreply.github.com>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
2024-05-10 21:59:53 +02:00

205 lines
7.0 KiB
Python

"""
Implementation of a basic regression decision tree.
Input data set: The input data set must be 1-dimensional with continuous labels.
Output: The decision tree maps a real number input to a real number output.
"""
import numpy as np
class DecisionTree:
def __init__(self, depth=5, min_leaf_size=5):
self.depth = depth
self.decision_boundary = 0
self.left = None
self.right = None
self.min_leaf_size = min_leaf_size
self.prediction = None
def mean_squared_error(self, labels, prediction):
"""
mean_squared_error:
@param labels: a one-dimensional numpy array
@param prediction: a floating point value
return value: mean_squared_error calculates the error if prediction is used to
estimate the labels
>>> tester = DecisionTree()
>>> test_labels = np.array([1,2,3,4,5,6,7,8,9,10])
>>> test_prediction = float(6)
>>> tester.mean_squared_error(test_labels, test_prediction) == (
... TestDecisionTree.helper_mean_squared_error_test(test_labels,
... test_prediction))
True
>>> test_labels = np.array([1,2,3])
>>> test_prediction = float(2)
>>> tester.mean_squared_error(test_labels, test_prediction) == (
... TestDecisionTree.helper_mean_squared_error_test(test_labels,
... test_prediction))
True
"""
if labels.ndim != 1:
print("Error: Input labels must be one dimensional")
return np.mean((labels - prediction) ** 2)
def train(self, x, y):
"""
train:
@param x: a one-dimensional numpy array
@param y: a one-dimensional numpy array.
The contents of y are the labels for the corresponding X values
train() does not have a return value
Examples:
1. Try to train when x & y are of same length & 1 dimensions (No errors)
>>> dt = DecisionTree()
>>> dt.train(np.array([10,20,30,40,50]),np.array([0,0,0,1,1]))
2. Try to train when x is 2 dimensions
>>> dt = DecisionTree()
>>> dt.train(np.array([[1,2,3,4,5],[1,2,3,4,5]]),np.array([0,0,0,1,1]))
Traceback (most recent call last):
...
ValueError: Input data set must be one-dimensional
3. Try to train when x and y are not of the same length
>>> dt = DecisionTree()
>>> dt.train(np.array([1,2,3,4,5]),np.array([[0,0,0,1,1],[0,0,0,1,1]]))
Traceback (most recent call last):
...
ValueError: x and y have different lengths
4. Try to train when x & y are of the same length but different dimensions
>>> dt = DecisionTree()
>>> dt.train(np.array([1,2,3,4,5]),np.array([[1],[2],[3],[4],[5]]))
Traceback (most recent call last):
...
ValueError: Data set labels must be one-dimensional
This section is to check that the inputs conform to our dimensionality
constraints
"""
if x.ndim != 1:
raise ValueError("Input data set must be one-dimensional")
if len(x) != len(y):
raise ValueError("x and y have different lengths")
if y.ndim != 1:
raise ValueError("Data set labels must be one-dimensional")
if len(x) < 2 * self.min_leaf_size:
self.prediction = np.mean(y)
return
if self.depth == 1:
self.prediction = np.mean(y)
return
best_split = 0
min_error = self.mean_squared_error(x, np.mean(y)) * 2
"""
loop over all possible splits for the decision tree. find the best split.
if no split exists that is less than 2 * error for the entire array
then the data set is not split and the average for the entire array is used as
the predictor
"""
for i in range(len(x)):
if len(x[:i]) < self.min_leaf_size: # noqa: SIM114
continue
elif len(x[i:]) < self.min_leaf_size:
continue
else:
error_left = self.mean_squared_error(x[:i], np.mean(y[:i]))
error_right = self.mean_squared_error(x[i:], np.mean(y[i:]))
error = error_left + error_right
if error < min_error:
best_split = i
min_error = error
if best_split != 0:
left_x = x[:best_split]
left_y = y[:best_split]
right_x = x[best_split:]
right_y = y[best_split:]
self.decision_boundary = x[best_split]
self.left = DecisionTree(
depth=self.depth - 1, min_leaf_size=self.min_leaf_size
)
self.right = DecisionTree(
depth=self.depth - 1, min_leaf_size=self.min_leaf_size
)
self.left.train(left_x, left_y)
self.right.train(right_x, right_y)
else:
self.prediction = np.mean(y)
return
def predict(self, x):
"""
predict:
@param x: a floating point value to predict the label of
the prediction function works by recursively calling the predict function
of the appropriate subtrees based on the tree's decision boundary
"""
if self.prediction is not None:
return self.prediction
elif self.left or self.right is not None:
if x >= self.decision_boundary:
return self.right.predict(x)
else:
return self.left.predict(x)
else:
print("Error: Decision tree not yet trained")
return None
class TestDecisionTree:
"""Decision Tres test class"""
@staticmethod
def helper_mean_squared_error_test(labels, prediction):
"""
helper_mean_squared_error_test:
@param labels: a one dimensional numpy array
@param prediction: a floating point value
return value: helper_mean_squared_error_test calculates the mean squared error
"""
squared_error_sum = float(0)
for label in labels:
squared_error_sum += (label - prediction) ** 2
return float(squared_error_sum / labels.size)
def main():
"""
In this demonstration we're generating a sample data set from the sin function in
numpy. We then train a decision tree on the data set and use the decision tree to
predict the label of 10 different test values. Then the mean squared error over
this test is displayed.
"""
x = np.arange(-1.0, 1.0, 0.005)
y = np.sin(x)
tree = DecisionTree(depth=10, min_leaf_size=10)
tree.train(x, y)
rng = np.random.default_rng()
test_cases = (rng.random(10) * 2) - 1
predictions = np.array([tree.predict(x) for x in test_cases])
avg_error = np.mean((predictions - test_cases) ** 2)
print("Test values: " + str(test_cases))
print("Predictions: " + str(predictions))
print("Average error: " + str(avg_error))
if __name__ == "__main__":
main()
import doctest
doctest.testmod(name="mean_squarred_error", verbose=True)