added comments

This commit is contained in:
rasbt 2015-01-24 11:02:44 -05:00
parent bd089d2b1e
commit f681bd6e5d
2 changed files with 363 additions and 138 deletions

View File

@ -220,4 +220,3 @@
- [scikit-learn](http://scikit-learn.org/stable/) - A powerful machine learning library for Python and tools for efficient data mining and analysis.

View File

@ -1,7 +1,7 @@
{
"metadata": {
"name": "",
"signature": "sha256:01adffebfb99d8e7a86af443b9d14ca7695efc917465ea85868cc42681d6e96b"
"signature": "sha256:1ba931b3466a0506e031f8b9bdffcb2ba39138b42f3676b74376988bf095be97"
},
"nbformat": 3,
"nbformat_minor": 0,
@ -88,6 +88,7 @@
"- [Missing Values aka NaNs](#Missing-Values-aka-NaNs)\n",
" - [Selecting NaN Rows](#Selecting-NaN-Rows)\n",
" - [Dropping NaN Rows](#Dropping-NaN-Rows)\n",
" - [Filling NaN Rows](#Filling-NaN-Rows)\n",
"- [Appending Rows to a DataFrame](#Appending-Rows-to-a-DataFrame)\n",
"- [Sorting and Reindexing DataFrames](#Sorting-and-Reindexing-DataFrames)"
]
@ -328,6 +329,10 @@
"# Converting column names to lowercase\n",
"\n",
"df.columns = [c.lower() for c in df.columns]\n",
"\n",
"# or\n",
"# df = df.rename(columns=lambda x: x.lower())\n",
"\n",
"df.tail()"
],
"language": "python",
@ -572,6 +577,117 @@
"[[back to section overview](#Sections)]"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# Processing `salary` column\n",
"\n",
"df['salary'] = df['salary'].apply(lambda x: x.strip('$m'))\n",
"df.tail()"
],
"language": "python",
"metadata": {},
"outputs": [
{
"html": [
"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>player</th>\n",
" <th>salary</th>\n",
" <th>games</th>\n",
" <th>goals</th>\n",
" <th>assists</th>\n",
" <th>shots_on_target</th>\n",
" <th>points_per_game</th>\n",
" <th>points</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>5</th>\n",
" <td> Santiago Cazorla\\n Midfield \u2014 Arsenal</td>\n",
" <td> 14.8</td>\n",
" <td> 20</td>\n",
" <td> 4</td>\n",
" <td>NaN</td>\n",
" <td> 20</td>\n",
" <td> 9.97</td>\n",
" <td> NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td> David Silva\\n Midfield \u2014 Manchester City</td>\n",
" <td> 14.3</td>\n",
" <td> 15</td>\n",
" <td> 6</td>\n",
" <td> 2</td>\n",
" <td> 11</td>\n",
" <td> 10.35</td>\n",
" <td> 155.26</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td> Cesc F\u00e0bregas\\n Midfield \u2014 Chelsea</td>\n",
" <td> 14.0</td>\n",
" <td> 20</td>\n",
" <td> 2</td>\n",
" <td> 14</td>\n",
" <td> 10</td>\n",
" <td> 10.47</td>\n",
" <td> 209.49</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td> Saido Berahino\\n Forward \u2014 West Brom</td>\n",
" <td> 13.8</td>\n",
" <td> 21</td>\n",
" <td> 9</td>\n",
" <td> 0</td>\n",
" <td> 20</td>\n",
" <td> 7.02</td>\n",
" <td> 147.43</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td> Steven Gerrard\\n Midfield \u2014 Liverpool</td>\n",
" <td> 13.8</td>\n",
" <td> 20</td>\n",
" <td> 5</td>\n",
" <td> 1</td>\n",
" <td> 11</td>\n",
" <td> 7.50</td>\n",
" <td> 150.01</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"metadata": {},
"output_type": "pyout",
"prompt_number": 5,
"text": [
" player salary games goals assists \\\n",
"5 Santiago Cazorla\\n Midfield \u2014 Arsenal 14.8 20 4 NaN \n",
"6 David Silva\\n Midfield \u2014 Manchester City 14.3 15 6 2 \n",
"7 Cesc F\u00e0bregas\\n Midfield \u2014 Chelsea 14.0 20 2 14 \n",
"8 Saido Berahino\\n Forward \u2014 West Brom 13.8 21 9 0 \n",
"9 Steven Gerrard\\n Midfield \u2014 Liverpool 13.8 20 5 1 \n",
"\n",
" shots_on_target points_per_game points \n",
"5 20 9.97 NaN \n",
"6 11 10.35 155.26 \n",
"7 10 10.47 209.49 \n",
"8 20 7.02 147.43 \n",
"9 11 7.50 150.01 "
]
}
],
"prompt_number": 5
},
{
"cell_type": "code",
"collapsed": false,
@ -579,6 +695,10 @@
"# Creating a new column\n",
"\n",
"df['team'] = pd.Series('', index=df.index)\n",
"\n",
"# or\n",
"# df.insert(loc=len(df.columns), column='team', value='')\n",
"\n",
"df.tail(3)"
],
"language": "python",
@ -606,119 +726,6 @@
" <tr>\n",
" <th>7</th>\n",
" <td> Cesc F\u00e0bregas\\n Midfield \u2014 Chelsea</td>\n",
" <td> $14.0m</td>\n",
" <td> 20</td>\n",
" <td> 2</td>\n",
" <td> 14</td>\n",
" <td> 10</td>\n",
" <td> 10.47</td>\n",
" <td> 209.49</td>\n",
" <td> </td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td> Saido Berahino\\n Forward \u2014 West Brom</td>\n",
" <td> $13.8m</td>\n",
" <td> 21</td>\n",
" <td> 9</td>\n",
" <td> 0</td>\n",
" <td> 20</td>\n",
" <td> 7.02</td>\n",
" <td> 147.43</td>\n",
" <td> </td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td> Steven Gerrard\\n Midfield \u2014 Liverpool</td>\n",
" <td> $13.8m</td>\n",
" <td> 20</td>\n",
" <td> 5</td>\n",
" <td> 1</td>\n",
" <td> 11</td>\n",
" <td> 7.50</td>\n",
" <td> 150.01</td>\n",
" <td> </td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"metadata": {},
"output_type": "pyout",
"prompt_number": 5,
"text": [
" player salary games goals assists \\\n",
"7 Cesc F\u00e0bregas\\n Midfield \u2014 Chelsea $14.0m 20 2 14 \n",
"8 Saido Berahino\\n Forward \u2014 West Brom $13.8m 21 9 0 \n",
"9 Steven Gerrard\\n Midfield \u2014 Liverpool $13.8m 20 5 1 \n",
"\n",
" shots_on_target points_per_game points team \n",
"7 10 10.47 209.49 \n",
"8 20 7.02 147.43 \n",
"9 11 7.50 150.01 "
]
}
],
"prompt_number": 5
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# Processing `salary` column\n",
"\n",
"df['salary'] = df['salary'].apply(lambda x: x.strip('$m'))\n",
"df.tail()"
],
"language": "python",
"metadata": {},
"outputs": [
{
"html": [
"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>player</th>\n",
" <th>salary</th>\n",
" <th>games</th>\n",
" <th>goals</th>\n",
" <th>assists</th>\n",
" <th>shots_on_target</th>\n",
" <th>points_per_game</th>\n",
" <th>points</th>\n",
" <th>team</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>5</th>\n",
" <td> Santiago Cazorla\\n Midfield \u2014 Arsenal</td>\n",
" <td> 14.8</td>\n",
" <td> 20</td>\n",
" <td> 4</td>\n",
" <td>NaN</td>\n",
" <td> 20</td>\n",
" <td> 9.97</td>\n",
" <td> NaN</td>\n",
" <td> </td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td> David Silva\\n Midfield \u2014 Manchester City</td>\n",
" <td> 14.3</td>\n",
" <td> 15</td>\n",
" <td> 6</td>\n",
" <td> 2</td>\n",
" <td> 11</td>\n",
" <td> 10.35</td>\n",
" <td> 155.26</td>\n",
" <td> </td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td> Cesc F\u00e0bregas\\n Midfield \u2014 Chelsea</td>\n",
" <td> 14.0</td>\n",
" <td> 20</td>\n",
" <td> 2</td>\n",
@ -730,7 +737,7 @@
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td> Saido Berahino\\n Forward \u2014 West Brom</td>\n",
" <td> Saido Berahino\\n Forward \u2014 West Brom</td>\n",
" <td> 13.8</td>\n",
" <td> 21</td>\n",
" <td> 9</td>\n",
@ -742,7 +749,7 @@
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td> Steven Gerrard\\n Midfield \u2014 Liverpool</td>\n",
" <td> Steven Gerrard\\n Midfield \u2014 Liverpool</td>\n",
" <td> 13.8</td>\n",
" <td> 20</td>\n",
" <td> 5</td>\n",
@ -760,16 +767,12 @@
"output_type": "pyout",
"prompt_number": 6,
"text": [
" player salary games goals assists \\\n",
"5 Santiago Cazorla\\n Midfield \u2014 Arsenal 14.8 20 4 NaN \n",
"6 David Silva\\n Midfield \u2014 Manchester City 14.3 15 6 2 \n",
"7 Cesc F\u00e0bregas\\n Midfield \u2014 Chelsea 14.0 20 2 14 \n",
"8 Saido Berahino\\n Forward \u2014 West Brom 13.8 21 9 0 \n",
"9 Steven Gerrard\\n Midfield \u2014 Liverpool 13.8 20 5 1 \n",
" player salary games goals assists \\\n",
"7 Cesc F\u00e0bregas\\n Midfield \u2014 Chelsea 14.0 20 2 14 \n",
"8 Saido Berahino\\n Forward \u2014 West Brom 13.8 21 9 0 \n",
"9 Steven Gerrard\\n Midfield \u2014 Liverpool 13.8 20 5 1 \n",
"\n",
" shots_on_target points_per_game points team \n",
"5 20 9.97 NaN \n",
"6 11 10.35 155.26 \n",
"7 10 10.47 209.49 \n",
"8 20 7.02 147.43 \n",
"9 11 7.50 150.01 "
@ -1229,6 +1232,227 @@
"<br>"
]
},
{
"cell_type": "heading",
"level": 2,
"metadata": {},
"source": [
"Filling NaN Rows"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"[[back to section overview](#Sections)]"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# Filling NaN cells with default value 0\n",
"\n",
"df = df.fillna(value=0)\n",
"df"
],
"language": "python",
"metadata": {},
"outputs": [
{
"html": [
"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>player</th>\n",
" <th>salary</th>\n",
" <th>games</th>\n",
" <th>goals</th>\n",
" <th>assists</th>\n",
" <th>shots_on_target</th>\n",
" <th>points_per_game</th>\n",
" <th>points</th>\n",
" <th>team</th>\n",
" <th>position</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td> Sergio Ag\u00fcero</td>\n",
" <td> 19.2</td>\n",
" <td> 16</td>\n",
" <td> 14</td>\n",
" <td> 3</td>\n",
" <td> 34</td>\n",
" <td> 13.12</td>\n",
" <td> 209.98</td>\n",
" <td> Manchester City</td>\n",
" <td> Forward</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td> Eden Hazard</td>\n",
" <td> 18.9</td>\n",
" <td> 21</td>\n",
" <td> 8</td>\n",
" <td> 4</td>\n",
" <td> 17</td>\n",
" <td> 13.05</td>\n",
" <td> 274.04</td>\n",
" <td> Chelsea</td>\n",
" <td> Midfield</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td> Alexis S\u00e1nchez</td>\n",
" <td> 17.6</td>\n",
" <td> 0</td>\n",
" <td> 12</td>\n",
" <td> 7</td>\n",
" <td> 29</td>\n",
" <td> 11.19</td>\n",
" <td> 223.86</td>\n",
" <td> Arsenal</td>\n",
" <td> Forward</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td> Yaya Tour\u00e9</td>\n",
" <td> 16.6</td>\n",
" <td> 18</td>\n",
" <td> 7</td>\n",
" <td> 1</td>\n",
" <td> 19</td>\n",
" <td> 10.99</td>\n",
" <td> 197.91</td>\n",
" <td> Manchester City</td>\n",
" <td> Midfield</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td> \u00c1ngel Di Mar\u00eda</td>\n",
" <td> 15.0</td>\n",
" <td> 13</td>\n",
" <td> 3</td>\n",
" <td> 0</td>\n",
" <td> 13</td>\n",
" <td> 10.17</td>\n",
" <td> 132.23</td>\n",
" <td> Manchester United</td>\n",
" <td> Midfield</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td> Santiago Cazorla</td>\n",
" <td> 14.8</td>\n",
" <td> 20</td>\n",
" <td> 4</td>\n",
" <td> 0</td>\n",
" <td> 20</td>\n",
" <td> 9.97</td>\n",
" <td> 0.00</td>\n",
" <td> Arsenal</td>\n",
" <td> Midfield</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td> David Silva</td>\n",
" <td> 14.3</td>\n",
" <td> 15</td>\n",
" <td> 6</td>\n",
" <td> 2</td>\n",
" <td> 11</td>\n",
" <td> 10.35</td>\n",
" <td> 155.26</td>\n",
" <td> Manchester City</td>\n",
" <td> Midfield</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td> Cesc F\u00e0bregas</td>\n",
" <td> 14.0</td>\n",
" <td> 20</td>\n",
" <td> 2</td>\n",
" <td> 14</td>\n",
" <td> 10</td>\n",
" <td> 10.47</td>\n",
" <td> 209.49</td>\n",
" <td> Chelsea</td>\n",
" <td> Midfield</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td> Saido Berahino</td>\n",
" <td> 13.8</td>\n",
" <td> 21</td>\n",
" <td> 9</td>\n",
" <td> 0</td>\n",
" <td> 20</td>\n",
" <td> 7.02</td>\n",
" <td> 147.43</td>\n",
" <td> West Brom</td>\n",
" <td> Forward</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td> Steven Gerrard</td>\n",
" <td> 13.8</td>\n",
" <td> 20</td>\n",
" <td> 5</td>\n",
" <td> 1</td>\n",
" <td> 11</td>\n",
" <td> 7.50</td>\n",
" <td> 150.01</td>\n",
" <td> Liverpool</td>\n",
" <td> Midfield</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"metadata": {},
"output_type": "pyout",
"prompt_number": 10,
"text": [
" player salary games goals assists shots_on_target \\\n",
"0 Sergio Ag\u00fcero 19.2 16 14 3 34 \n",
"1 Eden Hazard 18.9 21 8 4 17 \n",
"2 Alexis S\u00e1nchez 17.6 0 12 7 29 \n",
"3 Yaya Tour\u00e9 16.6 18 7 1 19 \n",
"4 \u00c1ngel Di Mar\u00eda 15.0 13 3 0 13 \n",
"5 Santiago Cazorla 14.8 20 4 0 20 \n",
"6 David Silva 14.3 15 6 2 11 \n",
"7 Cesc F\u00e0bregas 14.0 20 2 14 10 \n",
"8 Saido Berahino 13.8 21 9 0 20 \n",
"9 Steven Gerrard 13.8 20 5 1 11 \n",
"\n",
" points_per_game points team position \n",
"0 13.12 209.98 Manchester City Forward \n",
"1 13.05 274.04 Chelsea Midfield \n",
"2 11.19 223.86 Arsenal Forward \n",
"3 10.99 197.91 Manchester City Midfield \n",
"4 10.17 132.23 Manchester United Midfield \n",
"5 9.97 0.00 Arsenal Midfield \n",
"6 10.35 155.26 Manchester City Midfield \n",
"7 10.47 209.49 Chelsea Midfield \n",
"8 7.02 147.43 West Brom Forward \n",
"9 7.50 150.01 Liverpool Midfield "
]
}
],
"prompt_number": 10
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"<br>\n",
"<br>"
]
},
{
"cell_type": "heading",
"level": 1,
@ -1250,8 +1474,10 @@
"input": [
"# Adding an \"empty\" row to the DataFrame\n",
"\n",
"import numpy as np\n",
"\n",
"df = df.append(pd.Series(\n",
" [None]*len(df.columns), # Fill cells with NaNs\n",
" [np.nan]*len(df.columns), # Fill cells with NaNs\n",
" index=df.columns), \n",
" ignore_index=True)\n",
"\n",
@ -1351,7 +1577,7 @@
],
"metadata": {},
"output_type": "pyout",
"prompt_number": 10,
"prompt_number": 11,
"text": [
" player salary games goals assists shots_on_target \\\n",
"6 David Silva 14.3 15 6 2 11 \n",
@ -1369,7 +1595,7 @@
]
}
],
"prompt_number": 10
"prompt_number": 11
},
{
"cell_type": "code",
@ -1475,7 +1701,7 @@
],
"metadata": {},
"output_type": "pyout",
"prompt_number": 11,
"prompt_number": 12,
"text": [
" player salary games goals assists shots_on_target \\\n",
"6 David Silva 14.3 15 6 2 11 \n",
@ -1493,7 +1719,7 @@
]
}
],
"prompt_number": 11
"prompt_number": 12
},
{
"cell_type": "markdown",
@ -1567,7 +1793,7 @@
" <th>2</th>\n",
" <td> Alexis S\u00e1nchez</td>\n",
" <td> 17.6</td>\n",
" <td>NaN</td>\n",
" <td> 0</td>\n",
" <td> 12</td>\n",
" <td> 7</td>\n",
" <td> 29</td>\n",
@ -1621,11 +1847,11 @@
],
"metadata": {},
"output_type": "pyout",
"prompt_number": 12,
"prompt_number": 13,
"text": [
" player salary games goals assists shots_on_target \\\n",
"0 Sergio Ag\u00fcero 19.2 16 14 3 34 \n",
"2 Alexis S\u00e1nchez 17.6 NaN 12 7 29 \n",
"2 Alexis S\u00e1nchez 17.6 0 12 7 29 \n",
"8 Saido Berahino 13.8 21 9 0 20 \n",
"1 Eden Hazard 18.9 21 8 4 17 \n",
"3 Yaya Tour\u00e9 16.6 18 7 1 19 \n",
@ -1639,7 +1865,7 @@
]
}
],
"prompt_number": 12
"prompt_number": 13
},
{
"cell_type": "code",
@ -1690,7 +1916,7 @@
" <th>2</th>\n",
" <td> Alexis S\u00e1nchez</td>\n",
" <td> 17.6</td>\n",
" <td>NaN</td>\n",
" <td> 0</td>\n",
" <td> 12</td>\n",
" <td> 7</td>\n",
" <td> 29</td>\n",
@ -1744,11 +1970,11 @@
],
"metadata": {},
"output_type": "pyout",
"prompt_number": 13,
"prompt_number": 14,
"text": [
" player salary games goals assists shots_on_target \\\n",
"1 Sergio Ag\u00fcero 19.2 16 14 3 34 \n",
"2 Alexis S\u00e1nchez 17.6 NaN 12 7 29 \n",
"2 Alexis S\u00e1nchez 17.6 0 12 7 29 \n",
"3 Saido Berahino 13.8 21 9 0 20 \n",
"4 Eden Hazard 18.9 21 8 4 17 \n",
"5 Yaya Tour\u00e9 16.6 18 7 1 19 \n",
@ -1762,7 +1988,7 @@
]
}
],
"prompt_number": 13
"prompt_number": 14
}
],
"metadata": {}