mirror of https://github.com/TheAlgorithms/Python.git
synced 2024-11-27 23:11:09 +00:00

commit 7c9a07c0a0
Merge remote-tracking branch 'upstream/master'
.travis.yml | 14 (deleted)
@@ -1,14 +0,0 @@
-language: python
-python:
-    - "3.2"
-    - "3.3"
-    - "3.4"
-    - "3.5"
-    - "3.6"
-    - "3.6-dev"
-
-install:
-    - if [ "$TRAVIS_PYTHON_VERSION" == "3.2" ]; then travis_retry pip install coverage==3.7.1; fi
-    - if [ "$TRAVIS_PYTHON_VERSION" != "3.2" ]; then travis_retry pip install coverage; fi
-    - "pip install pytest pytest-cov"
-script: py.test --doctest-modules --cov ./
@@ -8,7 +8,7 @@ class Node:
     def __init__(self, label):
         self.label = label
         self.left = None
-        self.rigt = None
+        self.right = None

     def getLabel(self):
         return self.label
@@ -23,10 +23,10 @@ class Node:
         self.left = left

     def getRight(self):
-        return self.rigt
+        return self.right

     def setRight(self, right):
-        self.rigt = right
+        self.right = right


 class BinarySearchTree:
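For context, a minimal usage sketch of the patched Node API (a hypothetical driver, not part of the diff; setLeft and getLabel are assumed from the surrounding file):

    # Build a tiny tree using the corrected 'right' attribute.
    root = Node(8)
    root.setLeft(Node(3))
    root.setRight(Node(10))
    print(root.getRight().getLabel())   # 10 -- reads self.right, not the old misspelled self.rigt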
@@ -1,9 +1,9 @@
 class GRAPH:
     """docstring for GRAPH"""
     def __init__(self, nodes):
-        self.nodes=nodes
-        self.graph=[[0]*nodes for i in range (nodes)]
-        self.visited=[0]*nodes
+        self.nodes = nodes
+        self.graph = [[0]*nodes for i in range(nodes)]
+        self.visited = [0]*nodes


     def show(self):
@@ -23,7 +23,7 @@ class GRAPH:
             v = queue[0]
             for u in range(self.vertex):
                 if self.graph[v][u] == 1:
-                    if visited[u]== False:
+                    if visited[u] is False:
                         visited[u] = True
                         queue.append(u)
                         print('%d visited' % (u +1))
@@ -41,30 +41,32 @@ g.add_edge(4,8)
-g.add_edge(5,9)
-g.add_edge(6,10)
-g.bfs(4)
-=======
-        print self.graph
+        print(self.graph)

     def add_edge(self, i, j):
         self.graph[i][j]=1
         self.graph[j][i]=1

-    def bfs(self,s):
-        queue=[s]
-        self.visited[s]=1
-        while len(queue)!=0:
-            x=queue.pop(0)
+    def bfs(self, s):
+        queue = [s]
+        self.visited[s] = 1
+        while len(queue) != 0:
+            x = queue.pop(0)
             print(x)
-            for i in range(0,self.nodes):
-                if self.graph[x][i]==1 and self.visited[i]==0:
+            for i in range(0, self.nodes):
+                if self.graph[x][i] == 1 and self.visited[i] == 0:
                     queue.append(i)
-                    self.visited[i]=1
+                    self.visited[i] = 1

-n=int(input("Enter the number of Nodes : "))
-g=GRAPH(n)
-e=int(input("Enter the no of edges : "))
+n = int(input("Enter the number of Nodes : "))
+g = GRAPH(n)
+e = int(input("Enter the no of edges : "))
 print("Enter the edges (u v)")
-for i in range(0,e):
-    u,v=map(int, raw_input().split())
-    g.add_edge(u,v)
-s=int(input("Enter the source node :"))
+for i in range(0, e):
+    u, v = map(int, input().split())
+    g.add_edge(u, v)
+
+s = int(input("Enter the source node :"))
 g.bfs(s)
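As a sanity check on the resolved file, a small hypothetical driver for the adjacency-matrix GRAPH class above (the edge list is invented for illustration):

    # Assumes GRAPH, add_edge and bfs exactly as in the patched file.
    g = GRAPH(4)          # vertices 0..3
    g.add_edge(0, 1)
    g.add_edge(0, 2)
    g.add_edge(2, 3)
    g.bfs(0)              # prints 0, 1, 2, 3 (breadth-first from vertex 0)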
@@ -1,61 +0,0 @@
-# Author: OMKAR PATHAK
-
-class Graph():
-    def __init__(self):
-        self.vertex = {}
-
-    # for printing the Graph vertexes
-    def printGraph(self):
-        for i in self.vertex.keys():
-            print(i,' -> ', ' -> '.join([str(j) for j in self.vertex[i]]))
-
-    # for adding the edge beween two vertexes
-    def addEdge(self, fromVertex, toVertex):
-        # check if vertex is already present,
-        if fromVertex in self.vertex.keys():
-            self.vertex[fromVertex].append(toVertex)
-        else:
-            # else make a new vertex
-            self.vertex[fromVertex] = [toVertex]
-
-    def BFS(self, startVertex):
-        # Take a list for stoting already visited vertexes
-        visited = [False] * len(self.vertex)
-
-        # create a list to store all the vertexes for BFS
-        queue = []
-
-        # mark the source node as visited and enqueue it
-        visited[startVertex] = True
-        queue.append(startVertex)
-
-        while queue:
-            startVertex = queue.pop(0)
-            print(startVertex, end = ' ')
-
-            # mark all adjacent nodes as visited and print them
-            for i in self.vertex[startVertex]:
-                if visited[i] == False:
-                    queue.append(i)
-                    visited[i] = True
-
-if __name__ == '__main__':
-    g = Graph()
-    g.addEdge(0, 1)
-    g.addEdge(0, 2)
-    g.addEdge(1, 2)
-    g.addEdge(2, 0)
-    g.addEdge(2, 3)
-    g.addEdge(3, 3)
-
-    g.printGraph()
-    print('BFS:')
-    g.BFS(2)
-
-# OUTPUT:
-# 0 -> 1 -> 2
-# 1 -> 2
-# 2 -> 0 -> 3
-# 3 -> 3
-# BFS:
-# 2 0 3 1
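A side note on the removed implementation: it dequeued with queue.pop(0), which is O(n) per operation on a Python list. If an adjacency-list BFS is reintroduced, collections.deque is the usual fix — a sketch under that assumption:

    from collections import deque

    def bfs(adjacency, start):
        # adjacency: dict mapping a vertex to the list of its neighbours
        visited = {start}
        queue = deque([start])
        order = []
        while queue:
            vertex = queue.popleft()   # O(1), unlike list.pop(0)
            order.append(vertex)
            for neighbour in adjacency.get(vertex, []):
                if neighbour not in visited:
                    visited.add(neighbour)
                    queue.append(neighbour)
        return order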
@@ -1,61 +0,0 @@
-# Author: OMKAR PATHAK
-
-class Graph():
-    def __init__(self):
-        self.vertex = {}
-
-    # for printing the Graph vertexes
-    def printGraph(self):
-        print(self.vertex)
-        for i in self.vertex.keys():
-            print(i,' -> ', ' -> '.join([str(j) for j in self.vertex[i]]))
-
-    # for adding the edge beween two vertexes
-    def addEdge(self, fromVertex, toVertex):
-        # check if vertex is already present,
-        if fromVertex in self.vertex.keys():
-            self.vertex[fromVertex].append(toVertex)
-        else:
-            # else make a new vertex
-            self.vertex[fromVertex] = [toVertex]
-
-    def DFS(self):
-        # visited array for storing already visited nodes
-        visited = [False] * len(self.vertex)
-
-        # call the recursive helper function
-        for i in range(len(self.vertex)):
-            if visited[i] == False:
-                self.DFSRec(i, visited)
-
-    def DFSRec(self, startVertex, visited):
-        # mark start vertex as visited
-        visited[startVertex] = True
-
-        print(startVertex, end = ' ')
-
-        # Recur for all the vertexes that are adjacent to this node
-        for i in self.vertex.keys():
-            if visited[i] == False:
-                self.DFSRec(i, visited)
-
-if __name__ == '__main__':
-    g = Graph()
-    g.addEdge(0, 1)
-    g.addEdge(0, 2)
-    g.addEdge(1, 2)
-    g.addEdge(2, 0)
-    g.addEdge(2, 3)
-    g.addEdge(3, 3)
-
-    g.printGraph()
-    print('DFS:')
-    g.DFS()
-
-# OUTPUT:
-# 0 -> 1 -> 2
-# 1 -> 2
-# 2 -> 0 -> 3
-# 3 -> 3
-# DFS:
-# 0 1 2 3
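A side note on this removed file: DFSRec recursed over every key in self.vertex rather than over the neighbours of startVertex, so the printed order ignored the edge structure (it matched 0 1 2 3 here only by coincidence). A corrected recursive sketch over the same dict-of-lists shape:

    def dfs(adjacency, vertex, visited=None):
        # adjacency: dict mapping a vertex to the list of its neighbours
        if visited is None:
            visited = set()
        visited.add(vertex)
        print(vertex, end=' ')
        for neighbour in adjacency.get(vertex, []):   # only this vertex's neighbours
            if neighbour not in visited:
                dfs(adjacency, neighbour, visited)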
machine_learning/decision_tree.py | 139 (new file)
@@ -0,0 +1,139 @@
+"""
+Implementation of a basic regression decision tree.
+Input data set: The input data set must be 1-dimensional with continuous labels.
+Output: The decision tree maps a real number input to a real number output.
+"""
+
+import numpy as np
+
+class Decision_Tree:
+    def __init__(self, depth=5, min_leaf_size=5):
+        self.depth = depth
+        self.decision_boundary = 0
+        self.left = None
+        self.right = None
+        self.min_leaf_size = min_leaf_size
+        self.prediction = None
+
+    def mean_squared_error(self, labels, prediction):
+        """
+        mean_squared_error:
+        @param labels: a one-dimensional numpy array
+        @param prediction: a floating point value
+        return value: the error incurred if prediction is used to estimate the labels
+        """
+        if labels.ndim != 1:
+            print("Error: Input labels must be one dimensional")
+
+        return np.mean((labels - prediction) ** 2)
+
+    def train(self, X, y):
+        """
+        train:
+        @param X: a one-dimensional numpy array
+        @param y: a one-dimensional numpy array; the contents of y are the labels
+            for the corresponding X values
+
+        train does not have a return value
+        """
+        # Check that the inputs conform to our dimensionality constraints.
+        if X.ndim != 1:
+            print("Error: Input data set must be one dimensional")
+            return
+        if len(X) != len(y):
+            print("Error: X and y have different lengths")
+            return
+        if y.ndim != 1:
+            print("Error: Data set labels must be one dimensional")
+            return
+
+        if len(X) < 2 * self.min_leaf_size:
+            self.prediction = np.mean(y)
+            return
+
+        if self.depth == 1:
+            self.prediction = np.mean(y)
+            return
+
+        best_split = 0
+        min_error = self.mean_squared_error(X, np.mean(y)) * 2
+
+        # Loop over all possible splits and keep the best one. If no split gives
+        # an error below 2 * the error of the entire array, the data set is not
+        # split and the mean of the entire array is used as the predictor.
+        for i in range(len(X)):
+            if len(X[:i]) < self.min_leaf_size:
+                continue
+            elif len(X[i:]) < self.min_leaf_size:
+                continue
+            else:
+                error_left = self.mean_squared_error(X[:i], np.mean(y[:i]))
+                error_right = self.mean_squared_error(X[i:], np.mean(y[i:]))
+                error = error_left + error_right
+                if error < min_error:
+                    best_split = i
+                    min_error = error
+
+        if best_split != 0:
+            left_X = X[:best_split]
+            left_y = y[:best_split]
+            right_X = X[best_split:]
+            right_y = y[best_split:]
+
+            self.decision_boundary = X[best_split]
+            self.left = Decision_Tree(depth=self.depth - 1, min_leaf_size=self.min_leaf_size)
+            self.right = Decision_Tree(depth=self.depth - 1, min_leaf_size=self.min_leaf_size)
+            self.left.train(left_X, left_y)
+            self.right.train(right_X, right_y)
+        else:
+            self.prediction = np.mean(y)
+
+        return
+
+    def predict(self, x):
+        """
+        predict:
+        @param x: a floating point value to predict the label of
+        the prediction function works by recursively calling the predict function
+        of the appropriate subtree based on the tree's decision boundary
+        """
+        if self.prediction is not None:
+            return self.prediction
+        elif self.left is not None or self.right is not None:
+            if x >= self.decision_boundary:
+                return self.right.predict(x)
+            else:
+                return self.left.predict(x)
+        else:
+            print("Error: Decision tree not yet trained")
+            return None
+
+def main():
+    """
+    In this demonstration we generate a sample data set from numpy's sin function,
+    train a decision tree on it, and use the tree to predict the labels of 10 test
+    values. The mean squared error over the test set is then displayed.
+    """
+    X = np.arange(-1., 1., 0.005)
+    y = np.sin(X)
+
+    tree = Decision_Tree(depth=10, min_leaf_size=10)
+    tree.train(X, y)
+
+    test_cases = (np.random.rand(10) * 2) - 1
+    predictions = np.array([tree.predict(x) for x in test_cases])
+    # Compare predictions against the true labels sin(test_cases), not the inputs.
+    avg_error = np.mean((predictions - np.sin(test_cases)) ** 2)
+
+    print("Test values: " + str(test_cases))
+    print("Predictions: " + str(predictions))
+    print("Average error: " + str(avg_error))
+
+
+if __name__ == '__main__':
+    main()
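Beyond main(), a quick smoke test of the new class (a sketch assuming Decision_Tree as defined above; the data set is invented for illustration):

    import numpy as np

    # Fit on a simple linear relationship and query a point.
    X = np.arange(0.0, 1.0, 0.01)
    y = 3 * X                                   # labels: a linear function of X
    tree = Decision_Tree(depth=4, min_leaf_size=5)
    tree.train(X, y)
    print(tree.predict(0.25))                   # expected to be close to 0.75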
@ -110,9 +110,9 @@ def binary_search_by_recursion(sorted_collection, item, left, right):
|
|||
if sorted_collection[midpoint] == item:
|
||||
return midpoint
|
||||
elif sorted_collection[midpoint] > item:
|
||||
return binary_search_by_recursion(sorted_collection, item, left, right-1)
|
||||
return binary_search_by_recursion(sorted_collection, item, left, midpoint-1)
|
||||
else:
|
||||
return binary_search_by_recursion(sorted_collection, item, left+1, right)
|
||||
return binary_search_by_recursion(sorted_collection, item, midpoint+1, right)
|
||||
|
||||
def __assert_sorted(collection):
|
||||
"""Check if collection is sorted, if not - raises :py:class:`ValueError`
|
||||
|
|
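This hunk fixes a real performance bug: shrinking the window by only one element per call (right-1, left+1) degrades the search to linear time in the worst case, while discarding the half on the wrong side of midpoint restores the O(log n) invariant. A quick check, assuming the patched function:

    data = [0, 5, 7, 10, 15]
    # left/right are inclusive bounds into the sorted collection.
    assert binary_search_by_recursion(data, 5, 0, len(data) - 1) == 1
    assert binary_search_by_recursion(data, 15, 0, len(data) - 1) == 4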
searches/quick_select.py | 47 (new file)
@@ -0,0 +1,47 @@
+import random
+
+"""
+A Python implementation of the quick select algorithm, which efficiently finds
+the value that would appear at a given index of a list if the list were sorted,
+even if the list is not already sorted.
+https://en.wikipedia.org/wiki/Quickselect
+"""
+
+def _partition(data, pivot):
+    """
+    Three-way partition the data into smaller, equal and greater lists,
+    in relationship to the pivot
+    :param data: The data to be sorted (a list)
+    :param pivot: The value to partition the data on
+    :return: Three lists: smaller, equal and greater
+    """
+    less, equal, greater = [], [], []
+    for element in data:
+        if element < pivot:
+            less.append(element)
+        elif element > pivot:
+            greater.append(element)
+        else:
+            equal.append(element)
+    return less, equal, greater
+
+def quickSelect(data, k):
+    # k = len(data) // 2 when trying to find the median
+    # (the index the value would have if the list were sorted)
+    pivot = data[random.randint(0, len(data) - 1)]
+    smaller, equal, larger = _partition(data, pivot)
+    count = len(equal)
+    m = len(smaller)
+
+    # k falls within the run of elements equal to the pivot
+    if m <= k < m + count:
+        return pivot
+    # must be in smaller
+    elif m > k:
+        return quickSelect(smaller, k)
+    # must be in larger
+    else:
+        return quickSelect(larger, k - (m + count))
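A usage sketch for the new module (assuming quickSelect as above; quickselect runs in expected linear time):

    values = [2, 8, 7, 1, 3, 5, 6, 4]
    median = quickSelect(values, len(values) // 2)
    print(median)   # 5 -- the element that index 4 would hold if values were sorted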