def abc(k, *val):
    """Binarize ``k`` against a threshold.

    Written to be used with ``Series.apply(abc, args=(threshold,))``: the
    threshold arrives as the first extra positional argument and any further
    extras are ignored.

    Returns 0 when ``k`` is strictly below ``val[0]`` and 1 otherwise.
    """
    threshold = val[0]
    # "not (k < threshold)" rather than "k >= threshold" so that values for
    # which the comparison is False (e.g. NaN) still map to 1.
    return int(not (k < threshold))
def label(val, *boundaries):
    """Map a numeric value to a bucket letter.

    ``boundaries`` are exclusive upper bounds checked in the order given.
    The value gets the letter of the first boundary it is strictly below
    ('a' for the first, 'b' for the second, ...); a value that is not below
    any boundary gets the letter following the last bucket.

    With three boundaries this yields 'a', 'b', 'c' or 'd'. Any other number
    of boundaries is accepted as well and produces ``len(boundaries) + 1``
    consecutive letters starting at 'a'.
    """
    for position, upper in enumerate(boundaries):
        if val < upper:
            return chr(ord('a') + position)
    # Not below any boundary (this is also where NaN ends up, since every
    # comparison with NaN is False).
    return chr(ord('a') + len(boundaries))


def toLabel(df, old_feature_name):
    """Discretize one numeric column of ``df`` into the letters 'a'..'d'.

    The three cut points are derived from the column itself:
      * second - the column mean
      * first  - midpoint between the column minimum and the mean
      * third  - midpoint between the column maximum and the mean

    Returns a new Series (same index as ``df``) holding the bucket letter of
    each row; ``df`` itself is not modified.
    """
    column = df[old_feature_name]
    second = column.mean()
    first = (column.min() + second) / 2
    third = (column.max() + second) / 2
    return column.apply(label, args=(first, second, third))
\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
slswplpwsl_labeledsw_labeledpl_labeledpw_labeled
05.13.51.40.2bcaa
14.93.01.40.2abaa
24.73.21.30.2acaa
34.63.11.50.2acaa
45.03.61.40.2acaa
55.43.91.70.4bdaa
64.63.41.40.3acaa
75.03.41.50.2acaa
84.42.91.40.2abaa
94.93.11.50.1acaa
105.43.71.50.2bcaa
114.83.41.60.2acaa
124.83.01.40.1abaa
134.33.01.10.1abaa
145.84.01.20.2bdaa
155.74.41.50.4bdaa
165.43.91.30.4bdaa
175.13.51.40.3bcaa
185.73.81.70.3bdaa
195.13.81.50.3bdaa
205.43.41.70.2bcaa
215.13.71.50.4bcaa
224.63.61.00.2acaa
235.13.31.70.5bcaa
244.83.41.90.2acaa
255.03.01.60.2abaa
265.03.41.60.4acaa
275.23.51.50.2bcaa
285.23.41.40.2bcaa
294.73.21.60.2acaa
...........................
1206.93.25.72.3dcdd
1215.62.84.92.0bbcd
1227.72.86.72.0dbdd
1236.32.74.91.8cbcc
1246.73.35.72.1ccdd
1257.23.26.01.8dcdc
1266.22.84.81.8cbcc
1276.13.04.91.8cbcc
1286.42.85.62.1cbdd
1297.23.05.81.6dbdc
1307.42.86.11.9dbdd
1317.93.86.42.0dddd
1326.42.85.62.2cbdd
1336.32.85.11.5cbcc
1346.12.65.61.4cbdc
1357.73.06.12.3dbdd
1366.33.45.62.4ccdd
1376.43.15.51.8ccdc
1386.03.04.81.8cbcc
1396.93.15.42.1dcdd
1406.73.15.62.4ccdd
1416.93.15.12.3dccd
1425.82.75.11.9bbcd
1436.83.25.92.3ccdd
1446.73.35.72.5ccdd
1456.73.05.22.3cbcd
1466.32.55.01.9cacd
1476.53.05.22.0cbcd
1486.23.45.42.3ccdd
1495.93.05.11.8cbcc
\n", + "

150 rows × 8 columns

\n", + "
" + ], + "text/plain": [ + " sl sw pl pw sl_labeled sw_labeled pl_labeled pw_labeled\n", + "0 5.1 3.5 1.4 0.2 b c a a\n", + "1 4.9 3.0 1.4 0.2 a b a a\n", + "2 4.7 3.2 1.3 0.2 a c a a\n", + "3 4.6 3.1 1.5 0.2 a c a a\n", + "4 5.0 3.6 1.4 0.2 a c a a\n", + "5 5.4 3.9 1.7 0.4 b d a a\n", + "6 4.6 3.4 1.4 0.3 a c a a\n", + "7 5.0 3.4 1.5 0.2 a c a a\n", + "8 4.4 2.9 1.4 0.2 a b a a\n", + "9 4.9 3.1 1.5 0.1 a c a a\n", + "10 5.4 3.7 1.5 0.2 b c a a\n", + "11 4.8 3.4 1.6 0.2 a c a a\n", + "12 4.8 3.0 1.4 0.1 a b a a\n", + "13 4.3 3.0 1.1 0.1 a b a a\n", + "14 5.8 4.0 1.2 0.2 b d a a\n", + "15 5.7 4.4 1.5 0.4 b d a a\n", + "16 5.4 3.9 1.3 0.4 b d a a\n", + "17 5.1 3.5 1.4 0.3 b c a a\n", + "18 5.7 3.8 1.7 0.3 b d a a\n", + "19 5.1 3.8 1.5 0.3 b d a a\n", + "20 5.4 3.4 1.7 0.2 b c a a\n", + "21 5.1 3.7 1.5 0.4 b c a a\n", + "22 4.6 3.6 1.0 0.2 a c a a\n", + "23 5.1 3.3 1.7 0.5 b c a a\n", + "24 4.8 3.4 1.9 0.2 a c a a\n", + "25 5.0 3.0 1.6 0.2 a b a a\n", + "26 5.0 3.4 1.6 0.4 a c a a\n", + "27 5.2 3.5 1.5 0.2 b c a a\n", + "28 5.2 3.4 1.4 0.2 b c a a\n", + "29 4.7 3.2 1.6 0.2 a c a a\n", + ".. ... ... ... ... ... ... ... 
...\n", + "120 6.9 3.2 5.7 2.3 d c d d\n", + "121 5.6 2.8 4.9 2.0 b b c d\n", + "122 7.7 2.8 6.7 2.0 d b d d\n", + "123 6.3 2.7 4.9 1.8 c b c c\n", + "124 6.7 3.3 5.7 2.1 c c d d\n", + "125 7.2 3.2 6.0 1.8 d c d c\n", + "126 6.2 2.8 4.8 1.8 c b c c\n", + "127 6.1 3.0 4.9 1.8 c b c c\n", + "128 6.4 2.8 5.6 2.1 c b d d\n", + "129 7.2 3.0 5.8 1.6 d b d c\n", + "130 7.4 2.8 6.1 1.9 d b d d\n", + "131 7.9 3.8 6.4 2.0 d d d d\n", + "132 6.4 2.8 5.6 2.2 c b d d\n", + "133 6.3 2.8 5.1 1.5 c b c c\n", + "134 6.1 2.6 5.6 1.4 c b d c\n", + "135 7.7 3.0 6.1 2.3 d b d d\n", + "136 6.3 3.4 5.6 2.4 c c d d\n", + "137 6.4 3.1 5.5 1.8 c c d c\n", + "138 6.0 3.0 4.8 1.8 c b c c\n", + "139 6.9 3.1 5.4 2.1 d c d d\n", + "140 6.7 3.1 5.6 2.4 c c d d\n", + "141 6.9 3.1 5.1 2.3 d c c d\n", + "142 5.8 2.7 5.1 1.9 b b c d\n", + "143 6.8 3.2 5.9 2.3 c c d d\n", + "144 6.7 3.3 5.7 2.5 c c d d\n", + "145 6.7 3.0 5.2 2.3 c b c d\n", + "146 6.3 2.5 5.0 1.9 c a c d\n", + "147 6.5 3.0 5.2 2.0 c b c d\n", + "148 6.2 3.4 5.4 2.3 c c d d\n", + "149 5.9 3.0 5.1 1.8 c b c c\n", + "\n", + "[150 rows x 8 columns]" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df['sl_labeled'] = toLabel(df, 'sl')\n", + "df['sw_labeled'] = toLabel(df, 'sw')\n", + "df['pl_labeled'] = toLabel(df, 'pl')\n", + "df['pw_labeled'] = toLabel(df, 'pw')\n", + "df" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "df.drop(['sl', 'sw', 'pl', 'pw'], axis = 1, inplace = True)" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'a', 'b', 'c', 'd'}" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "set(df['sl_labeled'])" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [], + "source": [ + "df[\"output\"] = iris.target" + ] + }, + { 
+ "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
sl_labeledsw_labeledpl_labeledpw_labeledoutput
0bcaa0
1abaa0
2acaa0
3acaa0
4acaa0
5bdaa0
6acaa0
7acaa0
8abaa0
9acaa0
10bcaa0
11acaa0
12abaa0
13abaa0
14bdaa0
15bdaa0
16bdaa0
17bcaa0
18bdaa0
19bdaa0
20bcaa0
21bcaa0
22acaa0
23bcaa0
24acaa0
25abaa0
26acaa0
27bcaa0
28bcaa0
29acaa0
..................
120dcdd2
121bbcd2
122dbdd2
123cbcc2
124ccdd2
125dcdc2
126cbcc2
127cbcc2
128cbdd2
129dbdc2
130dbdd2
131dddd2
132cbdd2
133cbcc2
134cbdc2
135dbdd2
136ccdd2
137ccdc2
138cbcc2
139dcdd2
140ccdd2
141dccd2
142bbcd2
143ccdd2
144ccdd2
145cbcd2
146cacd2
147cbcd2
148ccdd2
149cbcc2
\n", + "

150 rows × 5 columns

\n", + "
def fit(data):
    """Build the count table for a categorical Naive Bayes classifier.

    ``data`` is a DataFrame whose last column is the class label and whose
    remaining columns are categorical features.

    Returns a nested dict::

        {class_value: {"total_count": <rows in that class>,
                       feature_name: {feature_value: <count>, ...},
                       ...},
         ...}

    Every value a feature takes anywhere in ``data`` appears under every
    class, with an explicit 0 when that value never occurs in the class.
    This keeps later lookups of the form ``counts[cls][feature][value]``
    from raising KeyError and exposes each feature's full domain size.
    """
    output_name = data.columns[-1]
    features = data.columns[:-1]

    # Domain of each feature over the whole data set, computed once so every
    # class shares the same set of keys.
    domains = {f: set(data[f]) for f in features}

    counts = {}
    for output in set(data[output_name]):
        class_rows = data[data[output_name] == output]
        class_counts = {"total_count": len(class_rows)}
        for f in features:
            # One pass over the class rows instead of one filter per value.
            observed = class_rows[f].value_counts().to_dict()
            class_counts[f] = {
                value: int(observed.get(value, 0)) for value in domains[f]
            }
        counts[output] = class_counts
    return counts