{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": { "collapsed": true }, "outputs": [], "source": [ "from sklearn import datasets\n", "import pandas as pd" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "collapsed": true }, "outputs": [], "source": [ "iris = datasets.load_iris()" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "collapsed": true }, "outputs": [], "source": [ "df = pd.DataFrame(iris.data)\n", "df.columns = [\"sl\", \"sw\", 'pl', 'pw']" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "collapsed": true }, "outputs": [], "source": [ "def abc(k, *val):\n", " if k < val[0]:\n", " return 0\n", " else:\n", " return 1" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0 1\n", "1 0\n", "2 0\n", "3 0\n", "4 1\n", "5 1\n", "6 0\n", "7 1\n", "8 0\n", "9 0\n", "10 1\n", "11 0\n", "12 0\n", "13 0\n", "14 1\n", "15 1\n", "16 1\n", "17 1\n", "18 1\n", "19 1\n", "20 1\n", "21 1\n", "22 0\n", "23 1\n", "24 0\n", "25 1\n", "26 1\n", "27 1\n", "28 1\n", "29 0\n", " ..\n", "120 1\n", "121 1\n", "122 1\n", "123 1\n", "124 1\n", "125 1\n", "126 1\n", "127 1\n", "128 1\n", "129 1\n", "130 1\n", "131 1\n", "132 1\n", "133 1\n", "134 1\n", "135 1\n", "136 1\n", "137 1\n", "138 1\n", "139 1\n", "140 1\n", "141 1\n", "142 1\n", "143 1\n", "144 1\n", "145 1\n", "146 1\n", "147 1\n", "148 1\n", "149 1\n", "Name: sl, dtype: int64" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.sl.apply(abc, args=(5,))" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [], "source": [ "def label(val, *boundaries):\n", " if (val < boundaries[0]):\n", " return 'a'\n", " elif (val < boundaries[1]):\n", " return 'b'\n", " elif (val < boundaries[2]):\n", " return 'c'\n", " else:\n", " return 'd'\n", "\n", "def toLabel(df, old_feature_name):\n", " second = df[old_feature_name].mean()\n", " minimum = df[old_feature_name].min()\n", " first = (minimum + second)/2\n", " maximum = df[old_feature_name].max()\n", " third = (maximum + second)/2\n", " return df[old_feature_name].apply(label, args= (first, second, third))" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "text/html": [ "<div>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>sl</th>\n", " <th>sw</th>\n", " <th>pl</th>\n", " <th>pw</th>\n", " <th>sl_labeled</th>\n", " <th>sw_labeled</th>\n", " <th>pl_labeled</th>\n", " <th>pw_labeled</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>0</th>\n", " <td>5.1</td>\n", " <td>3.5</td>\n", " <td>1.4</td>\n", " <td>0.2</td>\n", " <td>b</td>\n", " <td>c</td>\n", " <td>a</td>\n", " <td>a</td>\n", " </tr>\n", " <tr>\n", " <th>1</th>\n", " <td>4.9</td>\n", " <td>3.0</td>\n", " <td>1.4</td>\n", " <td>0.2</td>\n", " <td>a</td>\n", " <td>b</td>\n", " <td>a</td>\n", " <td>a</td>\n", " </tr>\n", " <tr>\n", " <th>2</th>\n", " <td>4.7</td>\n", " <td>3.2</td>\n", " <td>1.3</td>\n", " <td>0.2</td>\n", " <td>a</td>\n", " <td>c</td>\n", " <td>a</td>\n", " <td>a</td>\n", " </tr>\n", " <tr>\n", " <th>3</th>\n", " <td>4.6</td>\n", " <td>3.1</td>\n", " <td>1.5</td>\n", " <td>0.2</td>\n", " <td>a</td>\n", " <td>c</td>\n", " <td>a</td>\n", " <td>a</td>\n", " </tr>\n", " <tr>\n", " <th>4</th>\n", " <td>5.0</td>\n", " <td>3.6</td>\n", " <td>1.4</td>\n", " <td>0.2</td>\n", " <td>a</td>\n", " <td>c</td>\n", " <td>a</td>\n", " <td>a</td>\n", " </tr>\n", " <tr>\n", " <th>5</th>\n", " <td>5.4</td>\n", " <td>3.9</td>\n", " <td>1.7</td>\n", " <td>0.4</td>\n", " <td>b</td>\n", " <td>d</td>\n", " <td>a</td>\n", " <td>a</td>\n", " </tr>\n", " <tr>\n", " <th>6</th>\n", " <td>4.6</td>\n", " <td>3.4</td>\n", " <td>1.4</td>\n", " <td>0.3</td>\n", " <td>a</td>\n", " <td>c</td>\n", " <td>a</td>\n", " <td>a</td>\n", " </tr>\n", " <tr>\n", " <th>7</th>\n", " <td>5.0</td>\n", " <td>3.4</td>\n", " <td>1.5</td>\n", " <td>0.2</td>\n", " <td>a</td>\n", " <td>c</td>\n", " <td>a</td>\n", " <td>a</td>\n", " </tr>\n", " <tr>\n", " <th>8</th>\n", " <td>4.4</td>\n", " <td>2.9</td>\n", " <td>1.4</td>\n", " <td>0.2</td>\n", " <td>a</td>\n", " <td>b</td>\n", " <td>a</td>\n", " <td>a</td>\n", " </tr>\n", " <tr>\n", " <th>9</th>\n", " <td>4.9</td>\n", " <td>3.1</td>\n", " <td>1.5</td>\n", " <td>0.1</td>\n", " <td>a</td>\n", " <td>c</td>\n", " <td>a</td>\n", " <td>a</td>\n", " </tr>\n", " <tr>\n", " <th>10</th>\n", " <td>5.4</td>\n", " <td>3.7</td>\n", " <td>1.5</td>\n", " <td>0.2</td>\n", " <td>b</td>\n", " <td>c</td>\n", " <td>a</td>\n", " <td>a</td>\n", " </tr>\n", " <tr>\n", " <th>11</th>\n", " <td>4.8</td>\n", " <td>3.4</td>\n", " <td>1.6</td>\n", " <td>0.2</td>\n", " <td>a</td>\n", " <td>c</td>\n", " <td>a</td>\n", " <td>a</td>\n", " </tr>\n", " <tr>\n", " <th>12</th>\n", " <td>4.8</td>\n", " <td>3.0</td>\n", " <td>1.4</td>\n", " <td>0.1</td>\n", " <td>a</td>\n", " <td>b</td>\n", " <td>a</td>\n", " <td>a</td>\n", " </tr>\n", " <tr>\n", " <th>13</th>\n", " <td>4.3</td>\n", " <td>3.0</td>\n", " <td>1.1</td>\n", " <td>0.1</td>\n", " <td>a</td>\n", " <td>b</td>\n", " <td>a</td>\n", " <td>a</td>\n", " </tr>\n", " <tr>\n", " <th>14</th>\n", " <td>5.8</td>\n", " <td>4.0</td>\n", " <td>1.2</td>\n", " <td>0.2</td>\n", " <td>b</td>\n", " <td>d</td>\n", " <td>a</td>\n", " <td>a</td>\n", " </tr>\n", " <tr>\n", " <th>15</th>\n", " <td>5.7</td>\n", " <td>4.4</td>\n", " <td>1.5</td>\n", " <td>0.4</td>\n", " <td>b</td>\n", " <td>d</td>\n", " <td>a</td>\n", " <td>a</td>\n", " </tr>\n", " <tr>\n", " <th>16</th>\n", " <td>5.4</td>\n", " <td>3.9</td>\n", " <td>1.3</td>\n", " <td>0.4</td>\n", " <td>b</td>\n", " <td>d</td>\n", " <td>a</td>\n", " <td>a</td>\n", " </tr>\n", " <tr>\n", " <th>17</th>\n", " <td>5.1</td>\n", " <td>3.5</td>\n", " <td>1.4</td>\n", " <td>0.3</td>\n", " <td>b</td>\n", " <td>c</td>\n", " <td>a</td>\n", " <td>a</td>\n", " </tr>\n", " <tr>\n", " <th>18</th>\n", " <td>5.7</td>\n", " <td>3.8</td>\n", " <td>1.7</td>\n", " <td>0.3</td>\n", " <td>b</td>\n", " <td>d</td>\n", " <td>a</td>\n", " <td>a</td>\n", " </tr>\n", " <tr>\n", " <th>19</th>\n", " <td>5.1</td>\n", " <td>3.8</td>\n", " <td>1.5</td>\n", " <td>0.3</td>\n", " <td>b</td>\n", " <td>d</td>\n", " <td>a</td>\n", " <td>a</td>\n", " </tr>\n", " <tr>\n", " <th>20</th>\n", " <td>5.4</td>\n", " <td>3.4</td>\n", " <td>1.7</td>\n", " <td>0.2</td>\n", " <td>b</td>\n", " <td>c</td>\n", " <td>a</td>\n", " <td>a</td>\n", " </tr>\n", " <tr>\n", " <th>21</th>\n", " <td>5.1</td>\n", " <td>3.7</td>\n", " <td>1.5</td>\n", " <td>0.4</td>\n", " <td>b</td>\n", " <td>c</td>\n", " <td>a</td>\n", " <td>a</td>\n", " </tr>\n", " <tr>\n", " <th>22</th>\n", " <td>4.6</td>\n", " <td>3.6</td>\n", " <td>1.0</td>\n", " <td>0.2</td>\n", " <td>a</td>\n", " <td>c</td>\n", " <td>a</td>\n", " <td>a</td>\n", " </tr>\n", " <tr>\n", " <th>23</th>\n", " <td>5.1</td>\n", " <td>3.3</td>\n", " <td>1.7</td>\n", " <td>0.5</td>\n", " <td>b</td>\n", " <td>c</td>\n", " <td>a</td>\n", " <td>a</td>\n", " </tr>\n", " <tr>\n", " <th>24</th>\n", " <td>4.8</td>\n", " <td>3.4</td>\n", " <td>1.9</td>\n", " <td>0.2</td>\n", " <td>a</td>\n", " <td>c</td>\n", " <td>a</td>\n", " <td>a</td>\n", " </tr>\n", " <tr>\n", " <th>25</th>\n", " <td>5.0</td>\n", " <td>3.0</td>\n", " <td>1.6</td>\n", " <td>0.2</td>\n", " <td>a</td>\n", " <td>b</td>\n", " <td>a</td>\n", " <td>a</td>\n", " </tr>\n", " <tr>\n", " <th>26</th>\n", " <td>5.0</td>\n", " <td>3.4</td>\n", " <td>1.6</td>\n", " <td>0.4</td>\n", " <td>a</td>\n", " <td>c</td>\n", " <td>a</td>\n", " <td>a</td>\n", " </tr>\n", " <tr>\n", " <th>27</th>\n", " <td>5.2</td>\n", " <td>3.5</td>\n", " <td>1.5</td>\n", " <td>0.2</td>\n", " <td>b</td>\n", " <td>c</td>\n", " <td>a</td>\n", " <td>a</td>\n", " </tr>\n", " <tr>\n", " <th>28</th>\n", " <td>5.2</td>\n", " <td>3.4</td>\n", " <td>1.4</td>\n", " <td>0.2</td>\n", " <td>b</td>\n", " <td>c</td>\n", " <td>a</td>\n", " <td>a</td>\n", " </tr>\n", " <tr>\n", " <th>29</th>\n", " <td>4.7</td>\n", " <td>3.2</td>\n", " <td>1.6</td>\n", " <td>0.2</td>\n", " <td>a</td>\n", " <td>c</td>\n", " <td>a</td>\n", " <td>a</td>\n", " </tr>\n", " <tr>\n", " <th>...</th>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " </tr>\n", " <tr>\n", " <th>120</th>\n", " <td>6.9</td>\n", " <td>3.2</td>\n", " <td>5.7</td>\n", " <td>2.3</td>\n", " <td>d</td>\n", " <td>c</td>\n", " <td>d</td>\n", " <td>d</td>\n", " </tr>\n", " <tr>\n", " <th>121</th>\n", " <td>5.6</td>\n", " <td>2.8</td>\n", " <td>4.9</td>\n", " <td>2.0</td>\n", " <td>b</td>\n", " <td>b</td>\n", " <td>c</td>\n", " <td>d</td>\n", " </tr>\n", " <tr>\n", " <th>122</th>\n", " <td>7.7</td>\n", " <td>2.8</td>\n", " <td>6.7</td>\n", " <td>2.0</td>\n", " <td>d</td>\n", " <td>b</td>\n", " <td>d</td>\n", " <td>d</td>\n", " </tr>\n", " <tr>\n", " <th>123</th>\n", " <td>6.3</td>\n", " <td>2.7</td>\n", " <td>4.9</td>\n", " <td>1.8</td>\n", " <td>c</td>\n", " <td>b</td>\n", " <td>c</td>\n", " <td>c</td>\n", " </tr>\n", " <tr>\n", " <th>124</th>\n", " <td>6.7</td>\n", " <td>3.3</td>\n", " <td>5.7</td>\n", " <td>2.1</td>\n", " <td>c</td>\n", " <td>c</td>\n", " <td>d</td>\n", " <td>d</td>\n", " </tr>\n", " <tr>\n", " <th>125</th>\n", " <td>7.2</td>\n", " <td>3.2</td>\n", " <td>6.0</td>\n", " <td>1.8</td>\n", " <td>d</td>\n", " <td>c</td>\n", " <td>d</td>\n", " <td>c</td>\n", " </tr>\n", " <tr>\n", " <th>126</th>\n", " <td>6.2</td>\n", " <td>2.8</td>\n", " <td>4.8</td>\n", " <td>1.8</td>\n", " <td>c</td>\n", " <td>b</td>\n", " <td>c</td>\n", " <td>c</td>\n", " </tr>\n", " <tr>\n", " <th>127</th>\n", " <td>6.1</td>\n", " <td>3.0</td>\n", " <td>4.9</td>\n", " <td>1.8</td>\n", " <td>c</td>\n", " <td>b</td>\n", " <td>c</td>\n", " <td>c</td>\n", " </tr>\n", " <tr>\n", " <th>128</th>\n", " <td>6.4</td>\n", " <td>2.8</td>\n", " <td>5.6</td>\n", " <td>2.1</td>\n", " <td>c</td>\n", " <td>b</td>\n", " <td>d</td>\n", " <td>d</td>\n", " </tr>\n", " <tr>\n", " <th>129</th>\n", " <td>7.2</td>\n", " <td>3.0</td>\n", " <td>5.8</td>\n", " <td>1.6</td>\n", " <td>d</td>\n", " <td>b</td>\n", " <td>d</td>\n", " <td>c</td>\n", " </tr>\n", " <tr>\n", " <th>130</th>\n", " <td>7.4</td>\n", " <td>2.8</td>\n", " <td>6.1</td>\n", " <td>1.9</td>\n", " <td>d</td>\n", " <td>b</td>\n", " <td>d</td>\n", " <td>d</td>\n", " </tr>\n", " <tr>\n", " <th>131</th>\n", " <td>7.9</td>\n", " <td>3.8</td>\n", " <td>6.4</td>\n", " <td>2.0</td>\n", " <td>d</td>\n", " <td>d</td>\n", " <td>d</td>\n", " <td>d</td>\n", " </tr>\n", " <tr>\n", " <th>132</th>\n", " <td>6.4</td>\n", " <td>2.8</td>\n", " <td>5.6</td>\n", " <td>2.2</td>\n", " <td>c</td>\n", " <td>b</td>\n", " <td>d</td>\n", " <td>d</td>\n", " </tr>\n", " <tr>\n", " <th>133</th>\n", " <td>6.3</td>\n", " <td>2.8</td>\n", " <td>5.1</td>\n", " <td>1.5</td>\n", " <td>c</td>\n", " <td>b</td>\n", " <td>c</td>\n", " <td>c</td>\n", " </tr>\n", " <tr>\n", " <th>134</th>\n", " <td>6.1</td>\n", " <td>2.6</td>\n", " <td>5.6</td>\n", " <td>1.4</td>\n", " <td>c</td>\n", " <td>b</td>\n", " <td>d</td>\n", " <td>c</td>\n", " </tr>\n", " <tr>\n", " <th>135</th>\n", " <td>7.7</td>\n", " <td>3.0</td>\n", " <td>6.1</td>\n", " <td>2.3</td>\n", " <td>d</td>\n", " <td>b</td>\n", " <td>d</td>\n", " <td>d</td>\n", " </tr>\n", " <tr>\n", " <th>136</th>\n", " <td>6.3</td>\n", " <td>3.4</td>\n", " <td>5.6</td>\n", " <td>2.4</td>\n", " <td>c</td>\n", " <td>c</td>\n", " <td>d</td>\n", " <td>d</td>\n", " </tr>\n", " <tr>\n", " <th>137</th>\n", " <td>6.4</td>\n", " <td>3.1</td>\n", " <td>5.5</td>\n", " <td>1.8</td>\n", " <td>c</td>\n", " <td>c</td>\n", " <td>d</td>\n", " <td>c</td>\n", " </tr>\n", " <tr>\n", " <th>138</th>\n", " <td>6.0</td>\n", " <td>3.0</td>\n", " <td>4.8</td>\n", " <td>1.8</td>\n", " <td>c</td>\n", " <td>b</td>\n", " <td>c</td>\n", " <td>c</td>\n", " </tr>\n", " <tr>\n", " <th>139</th>\n", " <td>6.9</td>\n", " <td>3.1</td>\n", " <td>5.4</td>\n", " <td>2.1</td>\n", " <td>d</td>\n", " <td>c</td>\n", " <td>d</td>\n", " <td>d</td>\n", " </tr>\n", " <tr>\n", " <th>140</th>\n", " <td>6.7</td>\n", " <td>3.1</td>\n", " <td>5.6</td>\n", " <td>2.4</td>\n", " <td>c</td>\n", " <td>c</td>\n", " <td>d</td>\n", " <td>d</td>\n", " </tr>\n", " <tr>\n", " <th>141</th>\n", " <td>6.9</td>\n", " <td>3.1</td>\n", " <td>5.1</td>\n", " <td>2.3</td>\n", " <td>d</td>\n", " <td>c</td>\n", " <td>c</td>\n", " <td>d</td>\n", " </tr>\n", " <tr>\n", " <th>142</th>\n", " <td>5.8</td>\n", " <td>2.7</td>\n", " <td>5.1</td>\n", " <td>1.9</td>\n", " <td>b</td>\n", " <td>b</td>\n", " <td>c</td>\n", " <td>d</td>\n", " </tr>\n", " <tr>\n", " <th>143</th>\n", " <td>6.8</td>\n", " <td>3.2</td>\n", " <td>5.9</td>\n", " <td>2.3</td>\n", " <td>c</td>\n", " <td>c</td>\n", " <td>d</td>\n", " <td>d</td>\n", " </tr>\n", " <tr>\n", " <th>144</th>\n", " <td>6.7</td>\n", " <td>3.3</td>\n", " <td>5.7</td>\n", " <td>2.5</td>\n", " <td>c</td>\n", " <td>c</td>\n", " <td>d</td>\n", " <td>d</td>\n", " </tr>\n", " <tr>\n", " <th>145</th>\n", " <td>6.7</td>\n", " <td>3.0</td>\n", " <td>5.2</td>\n", " <td>2.3</td>\n", " <td>c</td>\n", " <td>b</td>\n", " <td>c</td>\n", " <td>d</td>\n", " </tr>\n", " <tr>\n", " <th>146</th>\n", " <td>6.3</td>\n", " <td>2.5</td>\n", " <td>5.0</td>\n", " <td>1.9</td>\n", " <td>c</td>\n", " <td>a</td>\n", " <td>c</td>\n", " <td>d</td>\n", " </tr>\n", " <tr>\n", " <th>147</th>\n", " <td>6.5</td>\n", " <td>3.0</td>\n", " <td>5.2</td>\n", " <td>2.0</td>\n", " <td>c</td>\n", " <td>b</td>\n", " <td>c</td>\n", " <td>d</td>\n", " </tr>\n", " <tr>\n", " <th>148</th>\n", " <td>6.2</td>\n", " <td>3.4</td>\n", " <td>5.4</td>\n", " <td>2.3</td>\n", " <td>c</td>\n", " <td>c</td>\n", " <td>d</td>\n", " <td>d</td>\n", " </tr>\n", " <tr>\n", " <th>149</th>\n", " <td>5.9</td>\n", " <td>3.0</td>\n", " <td>5.1</td>\n", " <td>1.8</td>\n", " <td>c</td>\n", " <td>b</td>\n", " <td>c</td>\n", " <td>c</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "<p>150 rows × 8 columns</p>\n", "</div>" ], "text/plain": [ " sl sw pl pw sl_labeled sw_labeled pl_labeled pw_labeled\n", "0 5.1 3.5 1.4 0.2 b c a a\n", "1 4.9 3.0 1.4 0.2 a b a a\n", "2 4.7 3.2 1.3 0.2 a c a a\n", "3 4.6 3.1 1.5 0.2 a c a a\n", "4 5.0 3.6 1.4 0.2 a c a a\n", "5 5.4 3.9 1.7 0.4 b d a a\n", "6 4.6 3.4 1.4 0.3 a c a a\n", "7 5.0 3.4 1.5 0.2 a c a a\n", "8 4.4 2.9 1.4 0.2 a b a a\n", "9 4.9 3.1 1.5 0.1 a c a a\n", "10 5.4 3.7 1.5 0.2 b c a a\n", "11 4.8 3.4 1.6 0.2 a c a a\n", "12 4.8 3.0 1.4 0.1 a b a a\n", "13 4.3 3.0 1.1 0.1 a b a a\n", "14 5.8 4.0 1.2 0.2 b d a a\n", "15 5.7 4.4 1.5 0.4 b d a a\n", "16 5.4 3.9 1.3 0.4 b d a a\n", "17 5.1 3.5 1.4 0.3 b c a a\n", "18 5.7 3.8 1.7 0.3 b d a a\n", "19 5.1 3.8 1.5 0.3 b d a a\n", "20 5.4 3.4 1.7 0.2 b c a a\n", "21 5.1 3.7 1.5 0.4 b c a a\n", "22 4.6 3.6 1.0 0.2 a c a a\n", "23 5.1 3.3 1.7 0.5 b c a a\n", "24 4.8 3.4 1.9 0.2 a c a a\n", "25 5.0 3.0 1.6 0.2 a b a a\n", "26 5.0 3.4 1.6 0.4 a c a a\n", "27 5.2 3.5 1.5 0.2 b c a a\n", "28 5.2 3.4 1.4 0.2 b c a a\n", "29 4.7 3.2 1.6 0.2 a c a a\n", ".. ... ... ... ... ... ... ... ...\n", "120 6.9 3.2 5.7 2.3 d c d d\n", "121 5.6 2.8 4.9 2.0 b b c d\n", "122 7.7 2.8 6.7 2.0 d b d d\n", "123 6.3 2.7 4.9 1.8 c b c c\n", "124 6.7 3.3 5.7 2.1 c c d d\n", "125 7.2 3.2 6.0 1.8 d c d c\n", "126 6.2 2.8 4.8 1.8 c b c c\n", "127 6.1 3.0 4.9 1.8 c b c c\n", "128 6.4 2.8 5.6 2.1 c b d d\n", "129 7.2 3.0 5.8 1.6 d b d c\n", "130 7.4 2.8 6.1 1.9 d b d d\n", "131 7.9 3.8 6.4 2.0 d d d d\n", "132 6.4 2.8 5.6 2.2 c b d d\n", "133 6.3 2.8 5.1 1.5 c b c c\n", "134 6.1 2.6 5.6 1.4 c b d c\n", "135 7.7 3.0 6.1 2.3 d b d d\n", "136 6.3 3.4 5.6 2.4 c c d d\n", "137 6.4 3.1 5.5 1.8 c c d c\n", "138 6.0 3.0 4.8 1.8 c b c c\n", "139 6.9 3.1 5.4 2.1 d c d d\n", "140 6.7 3.1 5.6 2.4 c c d d\n", "141 6.9 3.1 5.1 2.3 d c c d\n", "142 5.8 2.7 5.1 1.9 b b c d\n", "143 6.8 3.2 5.9 2.3 c c d d\n", "144 6.7 3.3 5.7 2.5 c c d d\n", "145 6.7 3.0 5.2 2.3 c b c d\n", "146 6.3 2.5 5.0 1.9 c a c d\n", "147 6.5 3.0 5.2 2.0 c b c d\n", "148 6.2 3.4 5.4 2.3 c c d d\n", "149 5.9 3.0 5.1 1.8 c b c c\n", "\n", "[150 rows x 8 columns]" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df['sl_labeled'] = toLabel(df, 'sl')\n", "df['sw_labeled'] = toLabel(df, 'sw')\n", "df['pl_labeled'] = toLabel(df, 'pl')\n", "df['pw_labeled'] = toLabel(df, 'pw')\n", "df" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [], "source": [ "df.drop(['sl', 'sw', 'pl', 'pw'], axis = 1, inplace = True)" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{'a', 'b', 'c', 'd'}" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "set(df['sl_labeled'])" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [], "source": [ "df[\"output\"] = iris.target" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "data": { "text/html": [ "<div>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>sl_labeled</th>\n", " <th>sw_labeled</th>\n", " <th>pl_labeled</th>\n", " <th>pw_labeled</th>\n", " <th>output</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>0</th>\n", " <td>b</td>\n", " <td>c</td>\n", " <td>a</td>\n", " <td>a</td>\n", " <td>0</td>\n", " </tr>\n", " <tr>\n", " <th>1</th>\n", " <td>a</td>\n", " <td>b</td>\n", " <td>a</td>\n", " <td>a</td>\n", " <td>0</td>\n", " </tr>\n", " <tr>\n", " <th>2</th>\n", " <td>a</td>\n", " <td>c</td>\n", " <td>a</td>\n", " <td>a</td>\n", " <td>0</td>\n", " </tr>\n", " <tr>\n", " <th>3</th>\n", " <td>a</td>\n", " <td>c</td>\n", " <td>a</td>\n", " <td>a</td>\n", " <td>0</td>\n", " </tr>\n", " <tr>\n", " <th>4</th>\n", " <td>a</td>\n", " <td>c</td>\n", " <td>a</td>\n", " <td>a</td>\n", " <td>0</td>\n", " </tr>\n", " <tr>\n", " <th>5</th>\n", " <td>b</td>\n", " <td>d</td>\n", " <td>a</td>\n", " <td>a</td>\n", " <td>0</td>\n", " </tr>\n", " <tr>\n", " <th>6</th>\n", " <td>a</td>\n", " <td>c</td>\n", " <td>a</td>\n", " <td>a</td>\n", " <td>0</td>\n", " </tr>\n", " <tr>\n", " <th>7</th>\n", " <td>a</td>\n", " <td>c</td>\n", " <td>a</td>\n", " <td>a</td>\n", " <td>0</td>\n", " </tr>\n", " <tr>\n", " <th>8</th>\n", " <td>a</td>\n", " <td>b</td>\n", " <td>a</td>\n", " <td>a</td>\n", " <td>0</td>\n", " </tr>\n", " <tr>\n", " <th>9</th>\n", " <td>a</td>\n", " <td>c</td>\n", " <td>a</td>\n", " <td>a</td>\n", " <td>0</td>\n", " </tr>\n", " <tr>\n", " <th>10</th>\n", " <td>b</td>\n", " <td>c</td>\n", " <td>a</td>\n", " <td>a</td>\n", " <td>0</td>\n", " </tr>\n", " <tr>\n", " <th>11</th>\n", " <td>a</td>\n", " <td>c</td>\n", " <td>a</td>\n", " <td>a</td>\n", " <td>0</td>\n", " </tr>\n", " <tr>\n", " <th>12</th>\n", " <td>a</td>\n", " <td>b</td>\n", " <td>a</td>\n", " <td>a</td>\n", " <td>0</td>\n", " </tr>\n", " <tr>\n", " <th>13</th>\n", " <td>a</td>\n", " <td>b</td>\n", " <td>a</td>\n", " <td>a</td>\n", " <td>0</td>\n", " </tr>\n", " <tr>\n", " <th>14</th>\n", " <td>b</td>\n", " <td>d</td>\n", " <td>a</td>\n", " <td>a</td>\n", " <td>0</td>\n", " </tr>\n", " <tr>\n", " <th>15</th>\n", " <td>b</td>\n", " <td>d</td>\n", " <td>a</td>\n", " <td>a</td>\n", " <td>0</td>\n", " </tr>\n", " <tr>\n", " <th>16</th>\n", " <td>b</td>\n", " <td>d</td>\n", " <td>a</td>\n", " <td>a</td>\n", " <td>0</td>\n", " </tr>\n", " <tr>\n", " <th>17</th>\n", " <td>b</td>\n", " <td>c</td>\n", " <td>a</td>\n", " <td>a</td>\n", " <td>0</td>\n", " </tr>\n", " <tr>\n", " <th>18</th>\n", " <td>b</td>\n", " <td>d</td>\n", " <td>a</td>\n", " <td>a</td>\n", " <td>0</td>\n", " </tr>\n", " <tr>\n", " <th>19</th>\n", " <td>b</td>\n", " <td>d</td>\n", " <td>a</td>\n", " <td>a</td>\n", " <td>0</td>\n", " </tr>\n", " <tr>\n", " <th>20</th>\n", " <td>b</td>\n", " <td>c</td>\n", " <td>a</td>\n", " <td>a</td>\n", " <td>0</td>\n", " </tr>\n", " <tr>\n", " <th>21</th>\n", " <td>b</td>\n", " <td>c</td>\n", " <td>a</td>\n", " <td>a</td>\n", " <td>0</td>\n", " </tr>\n", " <tr>\n", " <th>22</th>\n", " <td>a</td>\n", " <td>c</td>\n", " <td>a</td>\n", " <td>a</td>\n", " <td>0</td>\n", " </tr>\n", " <tr>\n", " <th>23</th>\n", " <td>b</td>\n", " <td>c</td>\n", " <td>a</td>\n", " <td>a</td>\n", " <td>0</td>\n", " </tr>\n", " <tr>\n", " <th>24</th>\n", " <td>a</td>\n", " <td>c</td>\n", " <td>a</td>\n", " <td>a</td>\n", " <td>0</td>\n", " </tr>\n", " <tr>\n", " <th>25</th>\n", " <td>a</td>\n", " <td>b</td>\n", " <td>a</td>\n", " <td>a</td>\n", " <td>0</td>\n", " </tr>\n", " <tr>\n", " <th>26</th>\n", " <td>a</td>\n", " <td>c</td>\n", " <td>a</td>\n", " <td>a</td>\n", " <td>0</td>\n", " </tr>\n", " <tr>\n", " <th>27</th>\n", " <td>b</td>\n", " <td>c</td>\n", " <td>a</td>\n", " <td>a</td>\n", " <td>0</td>\n", " </tr>\n", " <tr>\n", " <th>28</th>\n", " <td>b</td>\n", " <td>c</td>\n", " <td>a</td>\n", " <td>a</td>\n", " <td>0</td>\n", " </tr>\n", " <tr>\n", " <th>29</th>\n", " <td>a</td>\n", " <td>c</td>\n", " <td>a</td>\n", " <td>a</td>\n", " <td>0</td>\n", " </tr>\n", " <tr>\n", " <th>...</th>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " </tr>\n", " <tr>\n", " <th>120</th>\n", " <td>d</td>\n", " <td>c</td>\n", " <td>d</td>\n", " <td>d</td>\n", " <td>2</td>\n", " </tr>\n", " <tr>\n", " <th>121</th>\n", " <td>b</td>\n", " <td>b</td>\n", " <td>c</td>\n", " <td>d</td>\n", " <td>2</td>\n", " </tr>\n", " <tr>\n", " <th>122</th>\n", " <td>d</td>\n", " <td>b</td>\n", " <td>d</td>\n", " <td>d</td>\n", " <td>2</td>\n", " </tr>\n", " <tr>\n", " <th>123</th>\n", " <td>c</td>\n", " <td>b</td>\n", " <td>c</td>\n", " <td>c</td>\n", " <td>2</td>\n", " </tr>\n", " <tr>\n", " <th>124</th>\n", " <td>c</td>\n", " <td>c</td>\n", " <td>d</td>\n", " <td>d</td>\n", " <td>2</td>\n", " </tr>\n", " <tr>\n", " <th>125</th>\n", " <td>d</td>\n", " <td>c</td>\n", " <td>d</td>\n", " <td>c</td>\n", " <td>2</td>\n", " </tr>\n", " <tr>\n", " <th>126</th>\n", " <td>c</td>\n", " <td>b</td>\n", " <td>c</td>\n", " <td>c</td>\n", " <td>2</td>\n", " </tr>\n", " <tr>\n", " <th>127</th>\n", " <td>c</td>\n", " <td>b</td>\n", " <td>c</td>\n", " <td>c</td>\n", " <td>2</td>\n", " </tr>\n", " <tr>\n", " <th>128</th>\n", " <td>c</td>\n", " <td>b</td>\n", " <td>d</td>\n", " <td>d</td>\n", " <td>2</td>\n", " </tr>\n", " <tr>\n", " <th>129</th>\n", " <td>d</td>\n", " <td>b</td>\n", " <td>d</td>\n", " <td>c</td>\n", " <td>2</td>\n", " </tr>\n", " <tr>\n", " <th>130</th>\n", " <td>d</td>\n", " <td>b</td>\n", " <td>d</td>\n", " <td>d</td>\n", " <td>2</td>\n", " </tr>\n", " <tr>\n", " <th>131</th>\n", " <td>d</td>\n", " <td>d</td>\n", " <td>d</td>\n", " <td>d</td>\n", " <td>2</td>\n", " </tr>\n", " <tr>\n", " <th>132</th>\n", " <td>c</td>\n", " <td>b</td>\n", " <td>d</td>\n", " <td>d</td>\n", " <td>2</td>\n", " </tr>\n", " <tr>\n", " <th>133</th>\n", " <td>c</td>\n", " <td>b</td>\n", " <td>c</td>\n", " <td>c</td>\n", " <td>2</td>\n", " </tr>\n", " <tr>\n", " <th>134</th>\n", " <td>c</td>\n", " <td>b</td>\n", " <td>d</td>\n", " <td>c</td>\n", " <td>2</td>\n", " </tr>\n", " <tr>\n", " <th>135</th>\n", " <td>d</td>\n", " <td>b</td>\n", " <td>d</td>\n", " <td>d</td>\n", " <td>2</td>\n", " </tr>\n", " <tr>\n", " <th>136</th>\n", " <td>c</td>\n", " <td>c</td>\n", " <td>d</td>\n", " <td>d</td>\n", " <td>2</td>\n", " </tr>\n", " <tr>\n", " <th>137</th>\n", " <td>c</td>\n", " <td>c</td>\n", " <td>d</td>\n", " <td>c</td>\n", " <td>2</td>\n", " </tr>\n", " <tr>\n", " <th>138</th>\n", " <td>c</td>\n", " <td>b</td>\n", " <td>c</td>\n", " <td>c</td>\n", " <td>2</td>\n", " </tr>\n", " <tr>\n", " <th>139</th>\n", " <td>d</td>\n", " <td>c</td>\n", " <td>d</td>\n", " <td>d</td>\n", " <td>2</td>\n", " </tr>\n", " <tr>\n", " <th>140</th>\n", " <td>c</td>\n", " <td>c</td>\n", " <td>d</td>\n", " <td>d</td>\n", " <td>2</td>\n", " </tr>\n", " <tr>\n", " <th>141</th>\n", " <td>d</td>\n", " <td>c</td>\n", " <td>c</td>\n", " <td>d</td>\n", " <td>2</td>\n", " </tr>\n", " <tr>\n", " <th>142</th>\n", " <td>b</td>\n", " <td>b</td>\n", " <td>c</td>\n", " <td>d</td>\n", " <td>2</td>\n", " </tr>\n", " <tr>\n", " <th>143</th>\n", " <td>c</td>\n", " <td>c</td>\n", " <td>d</td>\n", " <td>d</td>\n", " <td>2</td>\n", " </tr>\n", " <tr>\n", " <th>144</th>\n", " <td>c</td>\n", " <td>c</td>\n", " <td>d</td>\n", " <td>d</td>\n", " <td>2</td>\n", " </tr>\n", " <tr>\n", " <th>145</th>\n", " <td>c</td>\n", " <td>b</td>\n", " <td>c</td>\n", " <td>d</td>\n", " <td>2</td>\n", " </tr>\n", " <tr>\n", " <th>146</th>\n", " <td>c</td>\n", " <td>a</td>\n", " <td>c</td>\n", " <td>d</td>\n", " <td>2</td>\n", " </tr>\n", " <tr>\n", " <th>147</th>\n", " <td>c</td>\n", " <td>b</td>\n", " <td>c</td>\n", " <td>d</td>\n", " <td>2</td>\n", " </tr>\n", " <tr>\n", " <th>148</th>\n", " <td>c</td>\n", " <td>c</td>\n", " <td>d</td>\n", " <td>d</td>\n", " <td>2</td>\n", " </tr>\n", " <tr>\n", " <th>149</th>\n", " <td>c</td>\n", " <td>b</td>\n", " <td>c</td>\n", " <td>c</td>\n", " <td>2</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "<p>150 rows × 5 columns</p>\n", "</div>" ], "text/plain": [ " sl_labeled sw_labeled pl_labeled pw_labeled output\n", "0 b c a a 0\n", "1 a b a a 0\n", "2 a c a a 0\n", "3 a c a a 0\n", "4 a c a a 0\n", "5 b d a a 0\n", "6 a c a a 0\n", "7 a c a a 0\n", "8 a b a a 0\n", "9 a c a a 0\n", "10 b c a a 0\n", "11 a c a a 0\n", "12 a b a a 0\n", "13 a b a a 0\n", "14 b d a a 0\n", "15 b d a a 0\n", "16 b d a a 0\n", "17 b c a a 0\n", "18 b d a a 0\n", "19 b d a a 0\n", "20 b c a a 0\n", "21 b c a a 0\n", "22 a c a a 0\n", "23 b c a a 0\n", "24 a c a a 0\n", "25 a b a a 0\n", "26 a c a a 0\n", "27 b c a a 0\n", "28 b c a a 0\n", "29 a c a a 0\n", ".. ... ... ... ... ...\n", "120 d c d d 2\n", "121 b b c d 2\n", "122 d b d d 2\n", "123 c b c c 2\n", "124 c c d d 2\n", "125 d c d c 2\n", "126 c b c c 2\n", "127 c b c c 2\n", "128 c b d d 2\n", "129 d b d c 2\n", "130 d b d d 2\n", "131 d d d d 2\n", "132 c b d d 2\n", "133 c b c c 2\n", "134 c b d c 2\n", "135 d b d d 2\n", "136 c c d d 2\n", "137 c c d c 2\n", "138 c b c c 2\n", "139 d c d d 2\n", "140 c c d d 2\n", "141 d c c d 2\n", "142 b b c d 2\n", "143 c c d d 2\n", "144 c c d d 2\n", "145 c b c d 2\n", "146 c a c d 2\n", "147 c b c d 2\n", "148 c c d d 2\n", "149 c b c c 2\n", "\n", "[150 rows x 5 columns]" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df" ] }, { "cell_type": "code", "execution_count": 14, "metadata": { "collapsed": true }, "outputs": [], "source": [ "def fit(data):\n", " output_name = data.columns[-1]\n", " features = data.columns[0:-1]\n", " counts = {}\n", " possible_outputs = set(data[output_name])\n", " for output in possible_outputs:\n", " counts[output] = {}\n", " smallData = data[data[output_name] == output]\n", " counts[output][\"total_count\"] = len(smallData)\n", " for f in features:\n", " counts[output][f] = {}\n", " possible_values = set(smallData[f])\n", " for value in possible_values:\n", " val_count = len(smallData[smallData[f] == value])\n", " counts[output][f][value] = val_count\n", " return counts" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{0: {'pl_labeled': {'a': 50},\n", " 'pw_labeled': {'a': 50},\n", " 'sl_labeled': {'a': 28, 'b': 22},\n", " 'sw_labeled': {'a': 1, 'b': 7, 'c': 32, 'd': 10},\n", " 'total_count': 50},\n", " 1: {'pl_labeled': {'b': 7, 'c': 43},\n", " 'pw_labeled': {'b': 10, 'c': 40},\n", " 'sl_labeled': {'a': 3, 'b': 21, 'c': 24, 'd': 2},\n", " 'sw_labeled': {'a': 13, 'b': 29, 'c': 8},\n", " 'total_count': 50},\n", " 2: {'pl_labeled': {'c': 20, 'd': 30},\n", " 'pw_labeled': {'c': 16, 'd': 34},\n", " 'sl_labeled': {'a': 1, 'b': 5, 'c': 29, 'd': 15},\n", " 'sw_labeled': {'a': 5, 'b': 28, 'c': 15, 'd': 2},\n", " 'total_count': 50}}" ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "fit(df)" ] } ], "metadata": { "anaconda-cloud": {}, "kernelspec": { "display_name": "Python [default]", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.5" } }, "nbformat": 4, "nbformat_minor": 1 }