{
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "view-in-github",
"colab_type": "text"
},
"source": [
""
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "hdd4dapuroBk"
},
"source": [
"# Digit Recognition using Random Forest Classifier"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "k_cWcYTUsWdE"
},
"source": [
"**Import Basic Libraries**"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"id": "t6uu8CVZrllI"
},
"outputs": [],
"source": [
"import io\n",
"\n",
"import numpy as np\n",
"import pandas as pd\n"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "S_X9qpm0s4uq"
},
"source": [
"**Choosing Dataset**"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 73
},
"id": "ERRZ3tkOOYFA",
"outputId": "5f8f4aae-398b-4e33-e2c2-53de23174401"
},
"outputs": [
{
"output_type": "display_data",
"data": {
"text/plain": [
""
],
"text/html": [
"\n",
" \n",
" \n",
" "
]
},
"metadata": {}
},
{
"output_type": "stream",
"name": "stdout",
"text": [
"Saving train[1].csv to train[1].csv\n"
]
}
],
"source": [
"# Colab-only step: files.upload() prompts for a local file in the browser.\n",
"# Its return value is kept in `uploaded` for the following cells.\n",
"from google.colab import files\n",
"\n",
"uploaded = files.upload()"
]
},
{
"cell_type": "markdown",
"source": [
"**Load Dataset**"
],
"metadata": {
"id": "TJRApm0w0Dct"
}
},
{
"cell_type": "code",
"source": [
"# Colab keeps the uploaded file's own name (the recorded run saved 'train[1].csv'),\n",
"# so parse the bytes returned by files.upload() instead of assuming 'train.csv'.\n",
"if uploaded:\n",
"    dataset = pd.read_csv(io.BytesIO(next(iter(uploaded.values()))))\n",
"else:\n",
"    # Nothing was uploaded in this session: fall back to a file already on disk.\n",
"    dataset = pd.read_csv('train.csv')"
],
"metadata": {
"id": "GyOvJOoR0Lhq"
},
"execution_count": 4,
"outputs": []
},
{
"cell_type": "markdown",
"source": [
"**Summarize dataset**"
],
"metadata": {
"id": "0txmydWY0ZEH"
}
},
{
"cell_type": "code",
"source": [
"# Show the frame's (rows, columns) shape, then preview its first five rows.\n",
"for summary in (dataset.shape, dataset.head(5)):\n",
"    print(summary)"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "AW-9ITV10cIY",
"outputId": "dce2cb6d-2bdb-41e5-de9e-baf122900140"
},
"execution_count": 5,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"(42000, 785)\n",
" label pixel0 pixel1 pixel2 pixel3 pixel4 pixel5 pixel6 pixel7 \\\n",
"0 1 0 0 0 0 0 0 0 0 \n",
"1 0 0 0 0 0 0 0 0 0 \n",
"2 1 0 0 0 0 0 0 0 0 \n",
"3 4 0 0 0 0 0 0 0 0 \n",
"4 0 0 0 0 0 0 0 0 0 \n",
"\n",
" pixel8 ... pixel774 pixel775 pixel776 pixel777 pixel778 pixel779 \\\n",
"0 0 ... 0 0 0 0 0 0 \n",
"1 0 ... 0 0 0 0 0 0 \n",
"2 0 ... 0 0 0 0 0 0 \n",
"3 0 ... 0 0 0 0 0 0 \n",
"4 0 ... 0 0 0 0 0 0 \n",
"\n",
" pixel780 pixel781 pixel782 pixel783 \n",
"0 0 0 0 0 \n",
"1 0 0 0 0 \n",
"2 0 0 0 0 \n",
"3 0 0 0 0 \n",
"4 0 0 0 0 \n",
"\n",
"[5 rows x 785 columns]\n"
]
}
]
},
{
"cell_type": "markdown",
"source": [
"**Segregate Dataset into X (Input/Independent Variable) & Y (Output/Dependent Variable)**"
],
"metadata": {
"id": "QUh5BKq20viv"
}
},
{
"cell_type": "code",
"source": [
"# Features: every column after the first one (the first column is `label`).\n",
"X = dataset.iloc[:, 1:]\n",
"print(X, X.shape, sep='\\n')"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "OP2TX3iX09ND",
"outputId": "9c8f44e2-a503-4acf-8978-f6576706e402"
},
"execution_count": 6,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
" pixel0 pixel1 pixel2 pixel3 pixel4 pixel5 pixel6 pixel7 pixel8 \\\n",
"0 0 0 0 0 0 0 0 0 0 \n",
"1 0 0 0 0 0 0 0 0 0 \n",
"2 0 0 0 0 0 0 0 0 0 \n",
"3 0 0 0 0 0 0 0 0 0 \n",
"4 0 0 0 0 0 0 0 0 0 \n",
"... ... ... ... ... ... ... ... ... ... \n",
"41995 0 0 0 0 0 0 0 0 0 \n",
"41996 0 0 0 0 0 0 0 0 0 \n",
"41997 0 0 0 0 0 0 0 0 0 \n",
"41998 0 0 0 0 0 0 0 0 0 \n",
"41999 0 0 0 0 0 0 0 0 0 \n",
"\n",
" pixel9 ... pixel774 pixel775 pixel776 pixel777 pixel778 \\\n",
"0 0 ... 0 0 0 0 0 \n",
"1 0 ... 0 0 0 0 0 \n",
"2 0 ... 0 0 0 0 0 \n",
"3 0 ... 0 0 0 0 0 \n",
"4 0 ... 0 0 0 0 0 \n",
"... ... ... ... ... ... ... ... \n",
"41995 0 ... 0 0 0 0 0 \n",
"41996 0 ... 0 0 0 0 0 \n",
"41997 0 ... 0 0 0 0 0 \n",
"41998 0 ... 0 0 0 0 0 \n",
"41999 0 ... 0 0 0 0 0 \n",
"\n",
" pixel779 pixel780 pixel781 pixel782 pixel783 \n",
"0 0 0 0 0 0 \n",
"1 0 0 0 0 0 \n",
"2 0 0 0 0 0 \n",
"3 0 0 0 0 0 \n",
"4 0 0 0 0 0 \n",
"... ... ... ... ... ... \n",
"41995 0 0 0 0 0 \n",
"41996 0 0 0 0 0 \n",
"41997 0 0 0 0 0 \n",
"41998 0 0 0 0 0 \n",
"41999 0 0 0 0 0 \n",
"\n",
"[42000 rows x 784 columns]\n",
"(42000, 784)\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"# Target: the first column, which holds the digit label of each row.\n",
"Y = dataset.iloc[:, 0]\n",
"print(Y, Y.shape, sep='\\n')"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "2RuBl7671GH4",
"outputId": "96d6afef-f2ed-420f-d95c-826a287fa8dd"
},
"execution_count": 7,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"0 1\n",
"1 0\n",
"2 1\n",
"3 4\n",
"4 0\n",
" ..\n",
"41995 0\n",
"41996 1\n",
"41997 7\n",
"41998 6\n",
"41999 9\n",
"Name: label, Length: 42000, dtype: int64\n",
"(42000,)\n"
]
}
]
},
{
"cell_type": "markdown",
"source": [
"**Splitting Dataset into Test and Train**"
],
"metadata": {
"id": "o1j-AGZd1OQV"
}
},
{
"cell_type": "code",
"source": [
"from sklearn.model_selection import train_test_split\n",
"\n",
"# Hold out 25% of the rows for evaluation; the fixed seed keeps the split repeatable.\n",
"X_train, X_test, y_train, y_test = train_test_split(\n",
"    X,\n",
"    Y,\n",
"    test_size=0.25,\n",
"    random_state=0,\n",
")"
],
"metadata": {
"id": "U_c_R4HA1SeZ"
},
"execution_count": 8,
"outputs": []
},
{
"cell_type": "markdown",
"source": [
"**Training**"
],
"metadata": {
"id": "Gf6EgvAc1vjh"
}
},
{
"cell_type": "code",
"source": [
"from sklearn.ensemble import RandomForestClassifier\n",
"\n",
"# Seed the forest (same seed as the train/test split) so that its random sampling\n",
"# is repeatable and a re-run of the notebook reproduces the same accuracy.\n",
"model = RandomForestClassifier(random_state=0)\n",
"model.fit(X_train, y_train)"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "RS4TAnDh1yUU",
"outputId": "4803259d-f3a1-461f-d3d0-939bc4495a64"
},
"execution_count": 9,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"RandomForestClassifier()"
]
},
"metadata": {},
"execution_count": 9
}
]
},
{
"cell_type": "code",
"source": [
"# Predict a digit label for every row of the held-out test split.\n",
"y_pred = model.predict(X_test)"
],
"metadata": {
"id": "SljeEEbs2JFT"
},
"execution_count": 10,
"outputs": []
},
{
"cell_type": "markdown",
"source": [
"**Model Accuracy**"
],
"metadata": {
"id": "4XEvHILm2OF-"
}
},
{
"cell_type": "code",
"source": [
"from sklearn.metrics import accuracy_score\n",
"\n",
"# accuracy_score gives the fraction of matching labels; scale it to a percentage.\n",
"accuracy_pct = accuracy_score(y_test, y_pred) * 100\n",
"print(\"Accuracy of the Model: {0}%\".format(accuracy_pct))"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "sHEVc1Qq2Rqy",
"outputId": "06be6e32-1ba4-4035-eafb-3b3c2023abd6"
},
"execution_count": 11,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Accuracy of the Model: 96.31428571428572%\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"import matplotlib.pyplot as plt\n",
"\n",
"# Position (not index label) of the test digit to inspect.\n",
"index=10\n",
"# Keep the row as a one-row DataFrame so the model receives the same feature names it\n",
"# was fitted with, and score only this digit instead of re-predicting the whole test set.\n",
"sample = X_test.iloc[[index]]\n",
"print(\"Predicted \" +str(model.predict(sample)[0]))\n",
"plt.axis('off')\n",
"# Each row holds 784 pixel values, i.e. a flattened 28x28 image.\n",
"plt.imshow(sample.values.reshape((28,28)),cmap='gray')"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 283
},
"id": "iymJ1Zpj20gk",
"outputId": "ae21ce24-b957-4a30-8f04-ec5c77dd5a53"
},
"execution_count": 13,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Predicted 7\n"
]
},
{
"output_type": "execute_result",
"data": {
"text/plain": [
""
]
},
"metadata": {},
"execution_count": 13
},
{
"output_type": "display_data",
"data": {
"text/plain": [
"