"""
Implementation of a basic regression decision tree.

Input data set: The input data set must be 1-dimensional with continuous labels.
Output: The decision tree maps a real number input to a real number output.
"""

import numpy as np


class DecisionTree:
    """
    A node of a one-dimensional regression tree.

    A node is either a leaf (``prediction`` holds the mean label of the samples
    that reached it) or an internal node (``left``/``right`` hold child trees
    and ``decision_boundary`` holds the threshold that routes an input).
    """

    def __init__(self, depth=5, min_leaf_size=5):
        """
        @param depth: remaining depth budget; a node with depth 1 is a leaf
        @param min_leaf_size: minimum number of samples on each side of a split
        """
        self.depth = depth
        self.decision_boundary = 0
        self.left = None
        self.right = None
        self.min_leaf_size = min_leaf_size
        self.prediction = None

    def mean_squared_error(self, labels, prediction):
        """
        mean_squared_error:
        @param labels: a one dimensional numpy array
        @param prediction: a floating point value
        return value: mean_squared_error calculates the error if prediction is
            used to estimate the labels

        If labels is not one dimensional an error message is printed and the
        mean is still taken over every element.

        >>> tester = DecisionTree()
        >>> test_labels = np.array([1,2,3,4,5,6,7,8,9,10])
        >>> test_prediction = float(6)
        >>> bool(tester.mean_squared_error(test_labels, test_prediction) == (
        ...     TestDecisionTree.helper_mean_squared_error_test(test_labels,
        ...         test_prediction)))
        True
        >>> test_labels = np.array([1,2,3])
        >>> test_prediction = float(2)
        >>> bool(tester.mean_squared_error(test_labels, test_prediction) == (
        ...     TestDecisionTree.helper_mean_squared_error_test(test_labels,
        ...         test_prediction)))
        True
        """
        labels = np.asarray(labels)
        if labels.ndim != 1:
            print("Error: Input labels must be one dimensional")

        return np.mean((labels - prediction) ** 2)

    def train(self, x, y):
        """
        train:
        @param x: a one dimensional numpy array (or array-like)
        @param y: a one dimensional numpy array (or array-like).
            The contents of y are the labels for the corresponding X values

        The samples do not have to be ordered; they are sorted by x before the
        tree is grown. Invalid input prints an error message and leaves the
        tree untouched.

        train does not have a return value
        """
        x = np.asarray(x)
        y = np.asarray(y)

        # Check that the inputs conform to our dimensionality constraints.
        if x.ndim != 1:
            print("Error: Input data set must be one dimensional")
            return
        if len(x) != len(y):
            print("Error: X and y have different lengths")
            return
        if y.ndim != 1:
            print("Error: Data set labels must be one dimensional")
            return

        # Drop any state from a previous call: predict() checks `prediction`
        # first, so a stale leaf value would otherwise hide a new split.
        self.prediction = None
        self.left = None
        self.right = None
        self.decision_boundary = 0

        # Candidate splits are prefixes/suffixes of the arrays, which only
        # correspond to "x < boundary" / "x >= boundary" when x is ordered.
        # A stable sort leaves already-sorted input unchanged.
        order = np.argsort(x, kind="stable")
        self._fit(x[order], y[order])

    def _fit(self, x, y):
        """Recursively grow the tree on samples that are already sorted by x."""
        if len(x) < 2 * self.min_leaf_size or self.depth == 1:
            self.prediction = np.mean(y)
            return

        # Loop over all possible splits for the decision tree and find the
        # best one. If no split has an error below 2 * the error of the whole
        # label array, the data set is not split and the average of the whole
        # array is used as the predictor.
        best_split = 0
        min_error = self.mean_squared_error(y, np.mean(y)) * 2

        # Both sides need at least min_leaf_size samples (and at least one).
        leaf = max(self.min_leaf_size, 1)
        for i in range(leaf, len(x) - leaf + 1):
            # A threshold cannot separate two samples with the same x value.
            if x[i - 1] == x[i]:
                continue
            error_left = self.mean_squared_error(y[:i], np.mean(y[:i]))
            error_right = self.mean_squared_error(y[i:], np.mean(y[i:]))
            error = error_left + error_right
            if error < min_error:
                best_split = i
                min_error = error

        if best_split == 0:
            self.prediction = np.mean(y)
            return

        self.decision_boundary = x[best_split]
        self.left = DecisionTree(
            depth=self.depth - 1, min_leaf_size=self.min_leaf_size
        )
        self.right = DecisionTree(
            depth=self.depth - 1, min_leaf_size=self.min_leaf_size
        )
        # The slices are still sorted, so the children can skip train()'s
        # validation and sorting.
        self.left._fit(x[:best_split], y[:best_split])
        self.right._fit(x[best_split:], y[best_split:])

    def predict(self, x):
        """
        predict:
        @param x: a floating point value to predict the label of
        return value: the predicted label, or None (after printing an error)
            if the tree has not been trained

        The prediction function works by recursively calling the predict
        function of the appropriate subtrees based on the tree's decision
        boundary.
        """
        if self.prediction is not None:
            return self.prediction
        if self.left is not None and self.right is not None:
            if x >= self.decision_boundary:
                return self.right.predict(x)
            return self.left.predict(x)
        print("Error: Decision tree not yet trained")
        return None


class TestDecisionTree:
    """Decision Tree test class"""

    @staticmethod
    def helper_mean_squared_error_test(labels, prediction):
        """
        helper_mean_squared_error_test:
        @param labels: a one dimensional numpy array
        @param prediction: a floating point value
        return value: helper_mean_squared_error_test calculates the mean
            squared error with an explicit loop, as a reference for
            DecisionTree.mean_squared_error
        """
        squared_error_sum = float(0)
        for label in labels:
            squared_error_sum += (label - prediction) ** 2

        return float(squared_error_sum / labels.size)


def main():
    """
    Demonstration: x is a numpy array of values from -1 (inclusive) to 1
    (exclusive) with a step of 0.005, i.e. [-1, -0.995, ..., 0.995], and y is
    the sine of each value of x. A decision tree is trained on this data set
    and then used to predict the label of 10 random test values in [-1, 1).

    The reported error is the Root Mean Squared Error between the predictions
    and the true labels sin(test value). RMSE is in the same units as the
    labels, which makes it easier to interpret than the Mean Squared Error.
    """
    x = np.arange(-1.0, 1.0, 0.005)
    y = np.sin(x)

    tree = DecisionTree(depth=10, min_leaf_size=10)
    tree.train(x, y)

    test_cases = (np.random.rand(10) * 2) - 1
    predictions = np.array([tree.predict(value) for value in test_cases])

    # Compare against the true labels, not against the inputs themselves.
    mse_error = np.mean((predictions - np.sin(test_cases)) ** 2)
    # RMSE error
    avg_error = np.sqrt(mse_error)

    print("Test values: " + str(test_cases))
    print("Predictions: " + str(predictions))
    print("Average error: " + str(avg_error))


if __name__ == "__main__":
    main()
    import doctest

    doctest.testmod(name="mean_squarred_error", verbose=True)