{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Machine Learning Intro - Python\n",
"\n",
"First, we import the packages we need:"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [],
"source": [
"from scipy.io import arff # to read arff files\n",
"import pandas as pd # for data manipulation\n",
"import numpy as np\n",
"from sklearn.neural_network import MLPClassifier\n",
"from sklearn import preprocessing, svm\n",
"from sklearn.feature_selection import SelectKBest\n",
"from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier\n",
"from sklearn.naive_bayes import GaussianNB\n",
"from sklearn.pipeline import Pipeline\n",
"from sklearn.model_selection import StratifiedKFold, RandomizedSearchCV\n",
"from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, classification_report, confusion_matrix, roc_curve, auc, roc_auc_score"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Now, we load the data set into a dataframe and display the first few rows:"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" preg | \n",
" plas | \n",
" pres | \n",
" skin | \n",
" insu | \n",
" mass | \n",
" pedi | \n",
" age | \n",
" class | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 6.0 | \n",
" 148.0 | \n",
" 72.0 | \n",
" 35.0 | \n",
" 0.0 | \n",
" 33.6 | \n",
" 0.627 | \n",
" 50.0 | \n",
" tested_positive | \n",
"
\n",
" \n",
" 1 | \n",
" 1.0 | \n",
" 85.0 | \n",
" 66.0 | \n",
" 29.0 | \n",
" 0.0 | \n",
" 26.6 | \n",
" 0.351 | \n",
" 31.0 | \n",
" tested_negative | \n",
"
\n",
" \n",
" 2 | \n",
" 8.0 | \n",
" 183.0 | \n",
" 64.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 23.3 | \n",
" 0.672 | \n",
" 32.0 | \n",
" tested_positive | \n",
"
\n",
" \n",
" 3 | \n",
" 1.0 | \n",
" 89.0 | \n",
" 66.0 | \n",
" 23.0 | \n",
" 94.0 | \n",
" 28.1 | \n",
" 0.167 | \n",
" 21.0 | \n",
" tested_negative | \n",
"
\n",
" \n",
" 4 | \n",
" 0.0 | \n",
" 137.0 | \n",
" 40.0 | \n",
" 35.0 | \n",
" 168.0 | \n",
" 43.1 | \n",
" 2.288 | \n",
" 33.0 | \n",
" tested_positive | \n",
"
\n",
" \n",
" 5 | \n",
" 5.0 | \n",
" 116.0 | \n",
" 74.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 25.6 | \n",
" 0.201 | \n",
" 30.0 | \n",
" tested_negative | \n",
"
\n",
" \n",
" 6 | \n",
" 3.0 | \n",
" 78.0 | \n",
" 50.0 | \n",
" 32.0 | \n",
" 88.0 | \n",
" 31.0 | \n",
" 0.248 | \n",
" 26.0 | \n",
" tested_positive | \n",
"
\n",
" \n",
" 7 | \n",
" 10.0 | \n",
" 115.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 35.3 | \n",
" 0.134 | \n",
" 29.0 | \n",
" tested_negative | \n",
"
\n",
" \n",
" 8 | \n",
" 2.0 | \n",
" 197.0 | \n",
" 70.0 | \n",
" 45.0 | \n",
" 543.0 | \n",
" 30.5 | \n",
" 0.158 | \n",
" 53.0 | \n",
" tested_positive | \n",
"
\n",
" \n",
" 9 | \n",
" 8.0 | \n",
" 125.0 | \n",
" 96.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.232 | \n",
" 54.0 | \n",
" tested_positive | \n",
"
\n",
" \n",
" 10 | \n",
" 4.0 | \n",
" 110.0 | \n",
" 92.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 37.6 | \n",
" 0.191 | \n",
" 30.0 | \n",
" tested_negative | \n",
"
\n",
" \n",
" 11 | \n",
" 10.0 | \n",
" 168.0 | \n",
" 74.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 38.0 | \n",
" 0.537 | \n",
" 34.0 | \n",
" tested_positive | \n",
"
\n",
" \n",
" 12 | \n",
" 10.0 | \n",
" 139.0 | \n",
" 80.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 27.1 | \n",
" 1.441 | \n",
" 57.0 | \n",
" tested_negative | \n",
"
\n",
" \n",
" 13 | \n",
" 1.0 | \n",
" 189.0 | \n",
" 60.0 | \n",
" 23.0 | \n",
" 846.0 | \n",
" 30.1 | \n",
" 0.398 | \n",
" 59.0 | \n",
" tested_positive | \n",
"
\n",
" \n",
" 14 | \n",
" 5.0 | \n",
" 166.0 | \n",
" 72.0 | \n",
" 19.0 | \n",
" 175.0 | \n",
" 25.8 | \n",
" 0.587 | \n",
" 51.0 | \n",
" tested_positive | \n",
"
\n",
" \n",
" 15 | \n",
" 7.0 | \n",
" 100.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 30.0 | \n",
" 0.484 | \n",
" 32.0 | \n",
" tested_positive | \n",
"
\n",
" \n",
" 16 | \n",
" 0.0 | \n",
" 118.0 | \n",
" 84.0 | \n",
" 47.0 | \n",
" 230.0 | \n",
" 45.8 | \n",
" 0.551 | \n",
" 31.0 | \n",
" tested_positive | \n",
"
\n",
" \n",
" 17 | \n",
" 7.0 | \n",
" 107.0 | \n",
" 74.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 29.6 | \n",
" 0.254 | \n",
" 31.0 | \n",
" tested_positive | \n",
"
\n",
" \n",
" 18 | \n",
" 1.0 | \n",
" 103.0 | \n",
" 30.0 | \n",
" 38.0 | \n",
" 83.0 | \n",
" 43.3 | \n",
" 0.183 | \n",
" 33.0 | \n",
" tested_negative | \n",
"
\n",
" \n",
" 19 | \n",
" 1.0 | \n",
" 115.0 | \n",
" 70.0 | \n",
" 30.0 | \n",
" 96.0 | \n",
" 34.6 | \n",
" 0.529 | \n",
" 32.0 | \n",
" tested_positive | \n",
"
\n",
" \n",
" 20 | \n",
" 3.0 | \n",
" 126.0 | \n",
" 88.0 | \n",
" 41.0 | \n",
" 235.0 | \n",
" 39.3 | \n",
" 0.704 | \n",
" 27.0 | \n",
" tested_negative | \n",
"
\n",
" \n",
" 21 | \n",
" 8.0 | \n",
" 99.0 | \n",
" 84.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 35.4 | \n",
" 0.388 | \n",
" 50.0 | \n",
" tested_negative | \n",
"
\n",
" \n",
" 22 | \n",
" 7.0 | \n",
" 196.0 | \n",
" 90.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 39.8 | \n",
" 0.451 | \n",
" 41.0 | \n",
" tested_positive | \n",
"
\n",
" \n",
" 23 | \n",
" 9.0 | \n",
" 119.0 | \n",
" 80.0 | \n",
" 35.0 | \n",
" 0.0 | \n",
" 29.0 | \n",
" 0.263 | \n",
" 29.0 | \n",
" tested_positive | \n",
"
\n",
" \n",
" 24 | \n",
" 11.0 | \n",
" 143.0 | \n",
" 94.0 | \n",
" 33.0 | \n",
" 146.0 | \n",
" 36.6 | \n",
" 0.254 | \n",
" 51.0 | \n",
" tested_positive | \n",
"
\n",
" \n",
" 25 | \n",
" 10.0 | \n",
" 125.0 | \n",
" 70.0 | \n",
" 26.0 | \n",
" 115.0 | \n",
" 31.1 | \n",
" 0.205 | \n",
" 41.0 | \n",
" tested_positive | \n",
"
\n",
" \n",
" 26 | \n",
" 7.0 | \n",
" 147.0 | \n",
" 76.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 39.4 | \n",
" 0.257 | \n",
" 43.0 | \n",
" tested_positive | \n",
"
\n",
" \n",
" 27 | \n",
" 1.0 | \n",
" 97.0 | \n",
" 66.0 | \n",
" 15.0 | \n",
" 140.0 | \n",
" 23.2 | \n",
" 0.487 | \n",
" 22.0 | \n",
" tested_negative | \n",
"
\n",
" \n",
" 28 | \n",
" 13.0 | \n",
" 145.0 | \n",
" 82.0 | \n",
" 19.0 | \n",
" 110.0 | \n",
" 22.2 | \n",
" 0.245 | \n",
" 57.0 | \n",
" tested_negative | \n",
"
\n",
" \n",
" 29 | \n",
" 5.0 | \n",
" 117.0 | \n",
" 92.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 34.1 | \n",
" 0.337 | \n",
" 38.0 | \n",
" tested_negative | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" preg plas pres skin insu mass pedi age class\n",
"0 6.0 148.0 72.0 35.0 0.0 33.6 0.627 50.0 tested_positive\n",
"1 1.0 85.0 66.0 29.0 0.0 26.6 0.351 31.0 tested_negative\n",
"2 8.0 183.0 64.0 0.0 0.0 23.3 0.672 32.0 tested_positive\n",
"3 1.0 89.0 66.0 23.0 94.0 28.1 0.167 21.0 tested_negative\n",
"4 0.0 137.0 40.0 35.0 168.0 43.1 2.288 33.0 tested_positive\n",
"5 5.0 116.0 74.0 0.0 0.0 25.6 0.201 30.0 tested_negative\n",
"6 3.0 78.0 50.0 32.0 88.0 31.0 0.248 26.0 tested_positive\n",
"7 10.0 115.0 0.0 0.0 0.0 35.3 0.134 29.0 tested_negative\n",
"8 2.0 197.0 70.0 45.0 543.0 30.5 0.158 53.0 tested_positive\n",
"9 8.0 125.0 96.0 0.0 0.0 0.0 0.232 54.0 tested_positive\n",
"10 4.0 110.0 92.0 0.0 0.0 37.6 0.191 30.0 tested_negative\n",
"11 10.0 168.0 74.0 0.0 0.0 38.0 0.537 34.0 tested_positive\n",
"12 10.0 139.0 80.0 0.0 0.0 27.1 1.441 57.0 tested_negative\n",
"13 1.0 189.0 60.0 23.0 846.0 30.1 0.398 59.0 tested_positive\n",
"14 5.0 166.0 72.0 19.0 175.0 25.8 0.587 51.0 tested_positive\n",
"15 7.0 100.0 0.0 0.0 0.0 30.0 0.484 32.0 tested_positive\n",
"16 0.0 118.0 84.0 47.0 230.0 45.8 0.551 31.0 tested_positive\n",
"17 7.0 107.0 74.0 0.0 0.0 29.6 0.254 31.0 tested_positive\n",
"18 1.0 103.0 30.0 38.0 83.0 43.3 0.183 33.0 tested_negative\n",
"19 1.0 115.0 70.0 30.0 96.0 34.6 0.529 32.0 tested_positive\n",
"20 3.0 126.0 88.0 41.0 235.0 39.3 0.704 27.0 tested_negative\n",
"21 8.0 99.0 84.0 0.0 0.0 35.4 0.388 50.0 tested_negative\n",
"22 7.0 196.0 90.0 0.0 0.0 39.8 0.451 41.0 tested_positive\n",
"23 9.0 119.0 80.0 35.0 0.0 29.0 0.263 29.0 tested_positive\n",
"24 11.0 143.0 94.0 33.0 146.0 36.6 0.254 51.0 tested_positive\n",
"25 10.0 125.0 70.0 26.0 115.0 31.1 0.205 41.0 tested_positive\n",
"26 7.0 147.0 76.0 0.0 0.0 39.4 0.257 43.0 tested_positive\n",
"27 1.0 97.0 66.0 15.0 140.0 23.2 0.487 22.0 tested_negative\n",
"28 13.0 145.0 82.0 19.0 110.0 22.2 0.245 57.0 tested_negative\n",
"29 5.0 117.0 92.0 0.0 0.0 34.1 0.337 38.0 tested_negative"
]
},
"execution_count": 18,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# load dataset (arff file)\n",
"data = arff.loadarff('diabetes.arff')\n",
"\n",
"# transform into a dataframe\n",
"df = pd.DataFrame(data[0])\n",
"\n",
"# decode the class label from a byte object into a string with utf-8 encoding\n",
"df['class'] = df['class'].str.decode('utf-8')\n",
"\n",
"#display the first 5 rows\n",
"df.head(30)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"As a first step, we could explore the dataset:"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" preg | \n",
" plas | \n",
" pres | \n",
" skin | \n",
" insu | \n",
" mass | \n",
" pedi | \n",
" age | \n",
"
\n",
" \n",
" \n",
" \n",
" count | \n",
" 768.000000 | \n",
" 768.000000 | \n",
" 768.000000 | \n",
" 768.000000 | \n",
" 768.000000 | \n",
" 768.000000 | \n",
" 768.000000 | \n",
" 768.000000 | \n",
"
\n",
" \n",
" mean | \n",
" 3.845052 | \n",
" 120.894531 | \n",
" 69.105469 | \n",
" 20.536458 | \n",
" 79.799479 | \n",
" 31.992578 | \n",
" 0.471876 | \n",
" 33.240885 | \n",
"
\n",
" \n",
" std | \n",
" 3.369578 | \n",
" 31.972618 | \n",
" 19.355807 | \n",
" 15.952218 | \n",
" 115.244002 | \n",
" 7.884160 | \n",
" 0.331329 | \n",
" 11.760232 | \n",
"
\n",
" \n",
" min | \n",
" 0.000000 | \n",
" 0.000000 | \n",
" 0.000000 | \n",
" 0.000000 | \n",
" 0.000000 | \n",
" 0.000000 | \n",
" 0.078000 | \n",
" 21.000000 | \n",
"
\n",
" \n",
" 25% | \n",
" 1.000000 | \n",
" 99.000000 | \n",
" 62.000000 | \n",
" 0.000000 | \n",
" 0.000000 | \n",
" 27.300000 | \n",
" 0.243750 | \n",
" 24.000000 | \n",
"
\n",
" \n",
" 50% | \n",
" 3.000000 | \n",
" 117.000000 | \n",
" 72.000000 | \n",
" 23.000000 | \n",
" 30.500000 | \n",
" 32.000000 | \n",
" 0.372500 | \n",
" 29.000000 | \n",
"
\n",
" \n",
" 75% | \n",
" 6.000000 | \n",
" 140.250000 | \n",
" 80.000000 | \n",
" 32.000000 | \n",
" 127.250000 | \n",
" 36.600000 | \n",
" 0.626250 | \n",
" 41.000000 | \n",
"
\n",
" \n",
" max | \n",
" 17.000000 | \n",
" 199.000000 | \n",
" 122.000000 | \n",
" 99.000000 | \n",
" 846.000000 | \n",
" 67.100000 | \n",
" 2.420000 | \n",
" 81.000000 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" preg plas pres skin insu mass \\\n",
"count 768.000000 768.000000 768.000000 768.000000 768.000000 768.000000 \n",
"mean 3.845052 120.894531 69.105469 20.536458 79.799479 31.992578 \n",
"std 3.369578 31.972618 19.355807 15.952218 115.244002 7.884160 \n",
"min 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 \n",
"25% 1.000000 99.000000 62.000000 0.000000 0.000000 27.300000 \n",
"50% 3.000000 117.000000 72.000000 23.000000 30.500000 32.000000 \n",
"75% 6.000000 140.250000 80.000000 32.000000 127.250000 36.600000 \n",
"max 17.000000 199.000000 122.000000 99.000000 846.000000 67.100000 \n",
"\n",
" pedi age \n",
"count 768.000000 768.000000 \n",
"mean 0.471876 33.240885 \n",
"std 0.331329 11.760232 \n",
"min 0.078000 21.000000 \n",
"25% 0.243750 24.000000 \n",
"50% 0.372500 29.000000 \n",
"75% 0.626250 41.000000 \n",
"max 2.420000 81.000000 "
]
},
"execution_count": 19,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.describe()"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'Number of positive samples:'"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"268"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"'Number of negative samples:'"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"500"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"'Percentage of positive samples:'"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"34.89583333333333"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# how many samples do we have per class?\n",
"\n",
"display('Number of positive samples:')\n",
"nr_pos_samples = df[df['class'] == 'tested_positive']['class'].count()\n",
"display(nr_pos_samples)\n",
"\n",
"display('Number of negative samples:')\n",
"nr_neg_samples = df[df['class'] == 'tested_negative']['class'].count()\n",
"display(nr_neg_samples)\n",
"\n",
"display('Percentage of positive samples:')\n",
"display((nr_pos_samples / df['class'].count()) * 100)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Preprocessing: Are there missing values? Do you need to transform data (e.g. dates, ...)?\n",
"\n",
"In this case, we mark zero values as missing values, since we assume that a value of exactly 0 is not a valid measurement."
]
},
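{
"cell_type": "markdown",
"metadata": {},
"source": [
"As a quick sketch, we can first count how many zero entries each feature column contains before replacing them:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# count zero entries per feature column (class label excluded)\n",
"(df.drop(columns=[\"class\"]) == 0).sum()"
]
},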
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" preg | \n",
" plas | \n",
" pres | \n",
" skin | \n",
" insu | \n",
" mass | \n",
" pedi | \n",
" age | \n",
" class | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 6.0 | \n",
" 148.0 | \n",
" 72.0 | \n",
" 35.0 | \n",
" NaN | \n",
" 33.6 | \n",
" 0.627 | \n",
" 50.0 | \n",
" tested_positive | \n",
"
\n",
" \n",
" 1 | \n",
" 1.0 | \n",
" 85.0 | \n",
" 66.0 | \n",
" 29.0 | \n",
" NaN | \n",
" 26.6 | \n",
" 0.351 | \n",
" 31.0 | \n",
" tested_negative | \n",
"
\n",
" \n",
" 2 | \n",
" 8.0 | \n",
" 183.0 | \n",
" 64.0 | \n",
" NaN | \n",
" NaN | \n",
" 23.3 | \n",
" 0.672 | \n",
" 32.0 | \n",
" tested_positive | \n",
"
\n",
" \n",
" 3 | \n",
" 1.0 | \n",
" 89.0 | \n",
" 66.0 | \n",
" 23.0 | \n",
" 94.0 | \n",
" 28.1 | \n",
" 0.167 | \n",
" 21.0 | \n",
" tested_negative | \n",
"
\n",
" \n",
" 4 | \n",
" NaN | \n",
" 137.0 | \n",
" 40.0 | \n",
" 35.0 | \n",
" 168.0 | \n",
" 43.1 | \n",
" 2.288 | \n",
" 33.0 | \n",
" tested_positive | \n",
"
\n",
" \n",
" 5 | \n",
" 5.0 | \n",
" 116.0 | \n",
" 74.0 | \n",
" NaN | \n",
" NaN | \n",
" 25.6 | \n",
" 0.201 | \n",
" 30.0 | \n",
" tested_negative | \n",
"
\n",
" \n",
" 6 | \n",
" 3.0 | \n",
" 78.0 | \n",
" 50.0 | \n",
" 32.0 | \n",
" 88.0 | \n",
" 31.0 | \n",
" 0.248 | \n",
" 26.0 | \n",
" tested_positive | \n",
"
\n",
" \n",
" 7 | \n",
" 10.0 | \n",
" 115.0 | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" 35.3 | \n",
" 0.134 | \n",
" 29.0 | \n",
" tested_negative | \n",
"
\n",
" \n",
" 8 | \n",
" 2.0 | \n",
" 197.0 | \n",
" 70.0 | \n",
" 45.0 | \n",
" 543.0 | \n",
" 30.5 | \n",
" 0.158 | \n",
" 53.0 | \n",
" tested_positive | \n",
"
\n",
" \n",
" 9 | \n",
" 8.0 | \n",
" 125.0 | \n",
" 96.0 | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" 0.232 | \n",
" 54.0 | \n",
" tested_positive | \n",
"
\n",
" \n",
" 10 | \n",
" 4.0 | \n",
" 110.0 | \n",
" 92.0 | \n",
" NaN | \n",
" NaN | \n",
" 37.6 | \n",
" 0.191 | \n",
" 30.0 | \n",
" tested_negative | \n",
"
\n",
" \n",
" 11 | \n",
" 10.0 | \n",
" 168.0 | \n",
" 74.0 | \n",
" NaN | \n",
" NaN | \n",
" 38.0 | \n",
" 0.537 | \n",
" 34.0 | \n",
" tested_positive | \n",
"
\n",
" \n",
" 12 | \n",
" 10.0 | \n",
" 139.0 | \n",
" 80.0 | \n",
" NaN | \n",
" NaN | \n",
" 27.1 | \n",
" 1.441 | \n",
" 57.0 | \n",
" tested_negative | \n",
"
\n",
" \n",
" 13 | \n",
" 1.0 | \n",
" 189.0 | \n",
" 60.0 | \n",
" 23.0 | \n",
" 846.0 | \n",
" 30.1 | \n",
" 0.398 | \n",
" 59.0 | \n",
" tested_positive | \n",
"
\n",
" \n",
" 14 | \n",
" 5.0 | \n",
" 166.0 | \n",
" 72.0 | \n",
" 19.0 | \n",
" 175.0 | \n",
" 25.8 | \n",
" 0.587 | \n",
" 51.0 | \n",
" tested_positive | \n",
"
\n",
" \n",
" 15 | \n",
" 7.0 | \n",
" 100.0 | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" 30.0 | \n",
" 0.484 | \n",
" 32.0 | \n",
" tested_positive | \n",
"
\n",
" \n",
" 16 | \n",
" NaN | \n",
" 118.0 | \n",
" 84.0 | \n",
" 47.0 | \n",
" 230.0 | \n",
" 45.8 | \n",
" 0.551 | \n",
" 31.0 | \n",
" tested_positive | \n",
"
\n",
" \n",
" 17 | \n",
" 7.0 | \n",
" 107.0 | \n",
" 74.0 | \n",
" NaN | \n",
" NaN | \n",
" 29.6 | \n",
" 0.254 | \n",
" 31.0 | \n",
" tested_positive | \n",
"
\n",
" \n",
" 18 | \n",
" 1.0 | \n",
" 103.0 | \n",
" 30.0 | \n",
" 38.0 | \n",
" 83.0 | \n",
" 43.3 | \n",
" 0.183 | \n",
" 33.0 | \n",
" tested_negative | \n",
"
\n",
" \n",
" 19 | \n",
" 1.0 | \n",
" 115.0 | \n",
" 70.0 | \n",
" 30.0 | \n",
" 96.0 | \n",
" 34.6 | \n",
" 0.529 | \n",
" 32.0 | \n",
" tested_positive | \n",
"
\n",
" \n",
" 20 | \n",
" 3.0 | \n",
" 126.0 | \n",
" 88.0 | \n",
" 41.0 | \n",
" 235.0 | \n",
" 39.3 | \n",
" 0.704 | \n",
" 27.0 | \n",
" tested_negative | \n",
"
\n",
" \n",
" 21 | \n",
" 8.0 | \n",
" 99.0 | \n",
" 84.0 | \n",
" NaN | \n",
" NaN | \n",
" 35.4 | \n",
" 0.388 | \n",
" 50.0 | \n",
" tested_negative | \n",
"
\n",
" \n",
" 22 | \n",
" 7.0 | \n",
" 196.0 | \n",
" 90.0 | \n",
" NaN | \n",
" NaN | \n",
" 39.8 | \n",
" 0.451 | \n",
" 41.0 | \n",
" tested_positive | \n",
"
\n",
" \n",
" 23 | \n",
" 9.0 | \n",
" 119.0 | \n",
" 80.0 | \n",
" 35.0 | \n",
" NaN | \n",
" 29.0 | \n",
" 0.263 | \n",
" 29.0 | \n",
" tested_positive | \n",
"
\n",
" \n",
" 24 | \n",
" 11.0 | \n",
" 143.0 | \n",
" 94.0 | \n",
" 33.0 | \n",
" 146.0 | \n",
" 36.6 | \n",
" 0.254 | \n",
" 51.0 | \n",
" tested_positive | \n",
"
\n",
" \n",
" 25 | \n",
" 10.0 | \n",
" 125.0 | \n",
" 70.0 | \n",
" 26.0 | \n",
" 115.0 | \n",
" 31.1 | \n",
" 0.205 | \n",
" 41.0 | \n",
" tested_positive | \n",
"
\n",
" \n",
" 26 | \n",
" 7.0 | \n",
" 147.0 | \n",
" 76.0 | \n",
" NaN | \n",
" NaN | \n",
" 39.4 | \n",
" 0.257 | \n",
" 43.0 | \n",
" tested_positive | \n",
"
\n",
" \n",
" 27 | \n",
" 1.0 | \n",
" 97.0 | \n",
" 66.0 | \n",
" 15.0 | \n",
" 140.0 | \n",
" 23.2 | \n",
" 0.487 | \n",
" 22.0 | \n",
" tested_negative | \n",
"
\n",
" \n",
" 28 | \n",
" 13.0 | \n",
" 145.0 | \n",
" 82.0 | \n",
" 19.0 | \n",
" 110.0 | \n",
" 22.2 | \n",
" 0.245 | \n",
" 57.0 | \n",
" tested_negative | \n",
"
\n",
" \n",
" 29 | \n",
" 5.0 | \n",
" 117.0 | \n",
" 92.0 | \n",
" NaN | \n",
" NaN | \n",
" 34.1 | \n",
" 0.337 | \n",
" 38.0 | \n",
" tested_negative | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" preg plas pres skin insu mass pedi age class\n",
"0 6.0 148.0 72.0 35.0 NaN 33.6 0.627 50.0 tested_positive\n",
"1 1.0 85.0 66.0 29.0 NaN 26.6 0.351 31.0 tested_negative\n",
"2 8.0 183.0 64.0 NaN NaN 23.3 0.672 32.0 tested_positive\n",
"3 1.0 89.0 66.0 23.0 94.0 28.1 0.167 21.0 tested_negative\n",
"4 NaN 137.0 40.0 35.0 168.0 43.1 2.288 33.0 tested_positive\n",
"5 5.0 116.0 74.0 NaN NaN 25.6 0.201 30.0 tested_negative\n",
"6 3.0 78.0 50.0 32.0 88.0 31.0 0.248 26.0 tested_positive\n",
"7 10.0 115.0 NaN NaN NaN 35.3 0.134 29.0 tested_negative\n",
"8 2.0 197.0 70.0 45.0 543.0 30.5 0.158 53.0 tested_positive\n",
"9 8.0 125.0 96.0 NaN NaN NaN 0.232 54.0 tested_positive\n",
"10 4.0 110.0 92.0 NaN NaN 37.6 0.191 30.0 tested_negative\n",
"11 10.0 168.0 74.0 NaN NaN 38.0 0.537 34.0 tested_positive\n",
"12 10.0 139.0 80.0 NaN NaN 27.1 1.441 57.0 tested_negative\n",
"13 1.0 189.0 60.0 23.0 846.0 30.1 0.398 59.0 tested_positive\n",
"14 5.0 166.0 72.0 19.0 175.0 25.8 0.587 51.0 tested_positive\n",
"15 7.0 100.0 NaN NaN NaN 30.0 0.484 32.0 tested_positive\n",
"16 NaN 118.0 84.0 47.0 230.0 45.8 0.551 31.0 tested_positive\n",
"17 7.0 107.0 74.0 NaN NaN 29.6 0.254 31.0 tested_positive\n",
"18 1.0 103.0 30.0 38.0 83.0 43.3 0.183 33.0 tested_negative\n",
"19 1.0 115.0 70.0 30.0 96.0 34.6 0.529 32.0 tested_positive\n",
"20 3.0 126.0 88.0 41.0 235.0 39.3 0.704 27.0 tested_negative\n",
"21 8.0 99.0 84.0 NaN NaN 35.4 0.388 50.0 tested_negative\n",
"22 7.0 196.0 90.0 NaN NaN 39.8 0.451 41.0 tested_positive\n",
"23 9.0 119.0 80.0 35.0 NaN 29.0 0.263 29.0 tested_positive\n",
"24 11.0 143.0 94.0 33.0 146.0 36.6 0.254 51.0 tested_positive\n",
"25 10.0 125.0 70.0 26.0 115.0 31.1 0.205 41.0 tested_positive\n",
"26 7.0 147.0 76.0 NaN NaN 39.4 0.257 43.0 tested_positive\n",
"27 1.0 97.0 66.0 15.0 140.0 23.2 0.487 22.0 tested_negative\n",
"28 13.0 145.0 82.0 19.0 110.0 22.2 0.245 57.0 tested_negative\n",
"29 5.0 117.0 92.0 NaN NaN 34.1 0.337 38.0 tested_negative"
]
},
"execution_count": 21,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# mark missing values as NaN\n",
"df = df.replace(0, np.NaN)\n",
"df.head(30)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Now we want to run machine learning, and we define several classifier pipelines that we want to try out.\n",
"\n",
"The classifier pipeline have various possibilities for configuring. Here we try out:\n",
"* Imputation: This preprocessing step replaces missing values with a value, in this case with the mean. This usually improves the performance of the classifier. There are several possibilities to deal with missing values. https://machinelearningmastery.com/handle-missing-data-python/\n",
"* Scaling: The standard scales normalizes the features, such that each feature's values are distributed between -1 and +1. This helps to prevent that a certain features is weighted more only because it has larger values than another one.\n",
"* Feature Selection: only use a subset of the features compared to all\n",
"* Classifiers: try out different classifiers"
]
},
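{
"cell_type": "markdown",
"metadata": {},
"source": [
"Before building the full pipelines, here is a minimal sketch of what the three preprocessing steps do on their own. It uses the same (since-deprecated) `preprocessing.Imputer` API as the pipelines below; newer scikit-learn versions replace it with `sklearn.impute.SimpleImputer`, and the choice of `k=4` here is arbitrary, for illustration only:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# sketch: run imputation, scaling and feature selection once, outside a pipeline\n",
"X_demo = df.drop(columns=[\"class\"]).values\n",
"y_demo = df[\"class\"].values\n",
"\n",
"# mean imputation: every NaN is replaced by its column's mean\n",
"X_imputed = preprocessing.Imputer(missing_values='NaN', strategy='mean', axis=0).fit_transform(X_demo)\n",
"\n",
"# standardization: zero mean and unit variance per feature\n",
"X_scaled = preprocessing.StandardScaler().fit_transform(X_imputed)\n",
"\n",
"# univariate feature selection: keep the k features that score best against the labels\n",
"X_selected = SelectKBest(k=4).fit_transform(X_scaled, y_demo)\n",
"\n",
"print(X_demo.shape, '->', X_selected.shape)"
]
},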
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [],
"source": [
"# define classifiers and parameters to evaluate\n",
"steps_decision_forest = [\n",
" (\"imputation\", preprocessing.Imputer(missing_values='NaN', strategy='mean', axis=0)),\n",
" (\"scaling\", preprocessing.StandardScaler()),\n",
" (\"feature_selection\", SelectKBest()),\n",
" (\"classifier\", RandomForestClassifier(n_estimators = 500))\n",
"]\n",
"\n",
"parameters_decision_forest = dict(\n",
" feature_selection__k = [2, 4, 6, 'all'],\n",
" classifier__n_estimators = [100, 250, 500, 1000],\n",
" classifier__max_features = [\"sqrt\", \"log2\", 0.25, 0.5],\n",
" classifier__min_samples_leaf = [5, 10, 30, 50]\n",
")\n",
"\n",
"steps_gradient_boost = [\n",
" (\"imputation\", preprocessing.Imputer(missing_values='NaN', strategy='mean', axis=0)),\n",
" (\"scaling\", preprocessing.StandardScaler()),\n",
" (\"feature_selection\", SelectKBest()),\n",
" (\"classifier\", GradientBoostingClassifier(n_estimators = 500))\n",
"]\n",
"\n",
"parameters_gradient_boost = dict(\n",
" feature_selection__k = [2, 4, 6, 'all'],\n",
" classifier__n_estimators = [100, 250, 500, 1000],\n",
" classifier__loss = [\"deviance\", \"exponential\"],\n",
" classifier__learning_rate = [0.05, 0.1, 0.15],\n",
" classifier__max_features = [\"sqrt\", \"log2\", 0.25, 0.5],\n",
" classifier__min_samples_leaf = [5, 10, 30, 50]\n",
")\n",
"\n",
"steps_svm = [\n",
" (\"imputation\", preprocessing.Imputer(missing_values='NaN', strategy='mean', axis=0)),\n",
" (\"scaling\", preprocessing.StandardScaler()),\n",
" (\"feature_selection\", SelectKBest(k=30)),\n",
" (\"classifier\", svm.SVC(kernel=\"rbf\"))\n",
"]\n",
"\n",
"parameters_svm = dict(\n",
" feature_selection__k = [2, 4, 6, 'all'],\n",
" classifier__kernel = [\"rbf\", \"linear\", \"poly\", \"sigmoid\"],\n",
" classifier__C = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],\n",
" classifier__gamma = [0.01, 0.02, 0.03, 0.04, 0.05, 0.10, 0.2, 0.3, 0.4, 0.5]\n",
")\n",
"\n",
"steps_neural_network = [\n",
" (\"imputation\", preprocessing.Imputer(missing_values='NaN', strategy='mean', axis=0)),\n",
" (\"scaling\", preprocessing.StandardScaler()),\n",
" (\"feature_selection\", SelectKBest()),\n",
" (\"classifier\", MLPClassifier(solver=\"lbfgs\"))\n",
"]\n",
"\n",
"parameters_neural_network = dict(\n",
" feature_selection__k = [2, 4, 6, 8, 'all'],\n",
" classifier__solver = [\"lbfgs\", \"sgd\"],\n",
" classifier__alpha = [0.00001, 0.0001, 0.001, 0.01, 1],\n",
" classifier__activation = [\"identity\", \"logistic\", \"tanh\", \"relu\"],\n",
" classifier__learning_rate = [\"constant\", \"invscaling\", \"adaptive\"]\n",
")\n",
"\n",
"steps_naive_bayes = [\n",
" (\"imputation\", preprocessing.Imputer(missing_values='NaN', strategy='mean', axis=0)),\n",
" (\"scaling\", preprocessing.StandardScaler()),\n",
" (\"feature_selection\", SelectKBest(k=30)),\n",
" (\"classifier\", GaussianNB())\n",
"]\n",
"\n",
"parameters_naive_bayes = dict(\n",
" feature_selection__k = [2, 4, 6, 8, 'all']\n",
")\n",
"\n",
"# all our classifiers with parameters\n",
"pipelines = [(steps_decision_forest, parameters_decision_forest),\n",
" (steps_gradient_boost, parameters_gradient_boost),\n",
" (steps_svm, parameters_svm),\n",
" (steps_neural_network, parameters_neural_network),\n",
" (steps_naive_bayes, parameters_naive_bayes)]"
]
},
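{
"cell_type": "markdown",
"metadata": {},
"source": [
"The randomized search below uses the default accuracy scoring (`scoring=None`). As a sketch, a custom ROC-AUC scorer could be plugged in instead. Because our labels are strings, `roc_curve` needs the positive label spelled out; we assume `tested_positive` is the positive class. (For `svm.SVC`, this would additionally require `probability=True`.)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# sketch of a custom AUC scorer; it could be passed as scoring=roc_auc_scorer\n",
"# to RandomizedSearchCV below\n",
"def custom_auc(ground_truth, predictions):\n",
"    # needs_proba=True hands the predict_proba output (a 2D array) to this function\n",
"    fpr, tpr, _ = roc_curve(ground_truth, predictions[:, 1], pos_label='tested_positive')\n",
"    return auc(fpr, tpr)\n",
"\n",
"roc_auc_scorer = make_scorer(custom_auc, greater_is_better=True, needs_proba=True)"
]
},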
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Predict the class value for each pipeline and print out the results:"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'Classifier: '"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',\n",
" max_depth=None, max_features='auto', max_leaf_nodes=None,\n",
" min_impurity_decrease=0.0, min_impurity_split=None,\n",
" min_samples_leaf=1, min_samples_split=2,\n",
" min_weight_fraction_leaf=0.0, n_estimators=500, n_jobs=1,\n",
" oob_score=False, random_state=None, verbose=0,\n",
" warm_start=False)"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"'Starting Trial 0'"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Best parameters: {'feature_selection__k': 4, 'classifier__n_estimators': 1000, 'classifier__min_samples_leaf': 50, 'classifier__max_features': 'log2'}\n",
"Best parameters: {'feature_selection__k': 2, 'classifier__n_estimators': 500, 'classifier__min_samples_leaf': 30, 'classifier__max_features': 'log2'}\n",
"Best parameters: {'feature_selection__k': 4, 'classifier__n_estimators': 1000, 'classifier__min_samples_leaf': 50, 'classifier__max_features': 'log2'}\n"
]
},
{
"data": {
"text/plain": [
"'Starting Trial 1'"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Best parameters: {'feature_selection__k': 'all', 'classifier__n_estimators': 100, 'classifier__min_samples_leaf': 30, 'classifier__max_features': 0.5}\n",
"Best parameters: {'feature_selection__k': 2, 'classifier__n_estimators': 250, 'classifier__min_samples_leaf': 10, 'classifier__max_features': 0.25}\n",
"Best parameters: {'feature_selection__k': 'all', 'classifier__n_estimators': 250, 'classifier__min_samples_leaf': 10, 'classifier__max_features': 'sqrt'}\n"
]
},
{
"data": {
"text/plain": [
"'Starting Trial 2'"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Best parameters: {'feature_selection__k': 6, 'classifier__n_estimators': 500, 'classifier__min_samples_leaf': 5, 'classifier__max_features': 0.25}\n",
"Best parameters: {'feature_selection__k': 6, 'classifier__n_estimators': 250, 'classifier__min_samples_leaf': 10, 'classifier__max_features': 0.25}\n",
"Best parameters: {'feature_selection__k': 'all', 'classifier__n_estimators': 1000, 'classifier__min_samples_leaf': 5, 'classifier__max_features': 'sqrt'}\n"
]
},
{
"data": {
"text/plain": [
"'Scores of classification:'"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"\"Classifier: RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',\\n max_depth=None, max_features='auto', max_leaf_nodes=None,\\n min_impurity_decrease=0.0, min_impurity_split=None,\\n min_samples_leaf=1, min_samples_split=2,\\n min_weight_fraction_leaf=0.0, n_estimators=500, n_jobs=1,\\n oob_score=False, random_state=None, verbose=0,\\n warm_start=False)\""
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"'Accuracy: 0.7530381944444443 +/- 0.006917264518450201'"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"'Weighted Precision: 0.7461195899106623 +/- 0.007625903647639041'"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"'Weighted Recall: 0.7530381944444443 +/- 0.006917264518450201'"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"'Precision Tested Positive: 0.6922865353037767 +/- 0.012454712762053214'"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"'Precision Tested Negative: 0.7749741071799529 +/- 0.005430011754841385'"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"'Recall Tested Positive: 0.5261194029850746 +/- 0.013279948074665806'"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"'Recall Tested Negative: 0.8746666666666667 +/- 0.005249338582674546'"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"'Confusion Matrix: '"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"array([[437.33333333, 62.66666667],\n",
" [127. , 141. ]])"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"'Classifier: '"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"GradientBoostingClassifier(criterion='friedman_mse', init=None,\n",
" learning_rate=0.1, loss='deviance', max_depth=3,\n",
" max_features=None, max_leaf_nodes=None,\n",
" min_impurity_decrease=0.0, min_impurity_split=None,\n",
" min_samples_leaf=1, min_samples_split=2,\n",
" min_weight_fraction_leaf=0.0, n_estimators=500,\n",
" presort='auto', random_state=None, subsample=1.0, verbose=0,\n",
" warm_start=False)"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"'Starting Trial 0'"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Best parameters: {'feature_selection__k': 'all', 'classifier__n_estimators': 100, 'classifier__min_samples_leaf': 10, 'classifier__max_features': 0.25, 'classifier__loss': 'exponential', 'classifier__learning_rate': 0.1}\n",
"Best parameters: {'feature_selection__k': 'all', 'classifier__n_estimators': 100, 'classifier__min_samples_leaf': 50, 'classifier__max_features': 'log2', 'classifier__loss': 'exponential', 'classifier__learning_rate': 0.15}\n",
"Best parameters: {'feature_selection__k': 'all', 'classifier__n_estimators': 100, 'classifier__min_samples_leaf': 30, 'classifier__max_features': 0.25, 'classifier__loss': 'deviance', 'classifier__learning_rate': 0.15}\n"
]
},
{
"data": {
"text/plain": [
"'Starting Trial 1'"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Best parameters: {'feature_selection__k': 6, 'classifier__n_estimators': 100, 'classifier__min_samples_leaf': 30, 'classifier__max_features': 0.5, 'classifier__loss': 'exponential', 'classifier__learning_rate': 0.15}\n",
"Best parameters: {'feature_selection__k': 'all', 'classifier__n_estimators': 250, 'classifier__min_samples_leaf': 5, 'classifier__max_features': 0.25, 'classifier__loss': 'deviance', 'classifier__learning_rate': 0.05}\n",
"Best parameters: {'feature_selection__k': 'all', 'classifier__n_estimators': 100, 'classifier__min_samples_leaf': 30, 'classifier__max_features': 0.5, 'classifier__loss': 'deviance', 'classifier__learning_rate': 0.1}\n"
]
},
{
"data": {
"text/plain": [
"'Starting Trial 2'"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Best parameters: {'feature_selection__k': 'all', 'classifier__n_estimators': 100, 'classifier__min_samples_leaf': 50, 'classifier__max_features': 0.5, 'classifier__loss': 'deviance', 'classifier__learning_rate': 0.1}\n",
"Best parameters: {'feature_selection__k': 6, 'classifier__n_estimators': 100, 'classifier__min_samples_leaf': 10, 'classifier__max_features': 'log2', 'classifier__loss': 'exponential', 'classifier__learning_rate': 0.05}\n",
"Best parameters: {'feature_selection__k': 4, 'classifier__n_estimators': 100, 'classifier__min_samples_leaf': 30, 'classifier__max_features': 'log2', 'classifier__loss': 'deviance', 'classifier__learning_rate': 0.15}\n"
]
},
{
"data": {
"text/plain": [
"'Scores of classification:'"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"\"Classifier: GradientBoostingClassifier(criterion='friedman_mse', init=None,\\n learning_rate=0.1, loss='deviance', max_depth=3,\\n max_features=None, max_leaf_nodes=None,\\n min_impurity_decrease=0.0, min_impurity_split=None,\\n min_samples_leaf=1, min_samples_split=2,\\n min_weight_fraction_leaf=0.0, n_estimators=500,\\n presort='auto', random_state=None, subsample=1.0, verbose=0,\\n warm_start=False)\""
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"'Accuracy: 0.7565104166666666 +/- 0.00637887953849786'"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"'Weighted Precision: 0.7518041356776776 +/- 0.005900446495549101'"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"'Weighted Recall: 0.7565104166666666 +/- 0.00637887953849786'"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"'Precision Tested Positive: 0.6703557829662151 +/- 0.015150367018807323'"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"'Precision Tested Negative: 0.7954604527310214 +/- 0.001451126541547686'"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"'Recall Tested Positive: 0.595771144278607 +/- 0.00465380271986807'"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"'Recall Tested Negative: 0.8426666666666667 +/- 0.011585431464655188'"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"'Confusion Matrix: '"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"array([[421.33333333, 78.66666667],\n",
" [108.33333333, 159.66666667]])"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"'Classifier: '"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,\n",
" decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',\n",
" max_iter=-1, probability=False, random_state=None, shrinking=True,\n",
" tol=0.001, verbose=False)"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"'Starting Trial 0'"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Best parameters: {'feature_selection__k': 'all', 'classifier__kernel': 'sigmoid', 'classifier__gamma': 0.01, 'classifier__C': 1}\n",
"Best parameters: {'feature_selection__k': 6, 'classifier__kernel': 'linear', 'classifier__gamma': 0.5, 'classifier__C': 5}\n",
"Best parameters: {'feature_selection__k': 2, 'classifier__kernel': 'linear', 'classifier__gamma': 0.02, 'classifier__C': 3}\n"
]
},
{
"data": {
"text/plain": [
"'Starting Trial 1'"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Best parameters: {'feature_selection__k': 2, 'classifier__kernel': 'rbf', 'classifier__gamma': 0.04, 'classifier__C': 9}\n",
"Best parameters: {'feature_selection__k': 6, 'classifier__kernel': 'sigmoid', 'classifier__gamma': 0.01, 'classifier__C': 1}\n",
"Best parameters: {'feature_selection__k': 6, 'classifier__kernel': 'rbf', 'classifier__gamma': 0.05, 'classifier__C': 10}\n"
]
},
{
"data": {
"text/plain": [
"'Starting Trial 2'"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Best parameters: {'feature_selection__k': 4, 'classifier__kernel': 'sigmoid', 'classifier__gamma': 0.05, 'classifier__C': 1}\n",
"Best parameters: {'feature_selection__k': 'all', 'classifier__kernel': 'rbf', 'classifier__gamma': 0.01, 'classifier__C': 7}\n",
"Best parameters: {'feature_selection__k': 2, 'classifier__kernel': 'linear', 'classifier__gamma': 0.01, 'classifier__C': 8}\n"
]
},
{
"data": {
"text/plain": [
"'Scores of classification:'"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"\"Classifier: SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,\\n decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',\\n max_iter=-1, probability=False, random_state=None, shrinking=True,\\n tol=0.001, verbose=False)\""
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"'Accuracy: 0.7625868055555555 +/- 0.006495932963149201'"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"'Weighted Precision: 0.7571512667777728 +/- 0.006957592093905651'"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"'Weighted Recall: 0.7625868055555555 +/- 0.006495932963149201'"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"'Precision Tested Positive: 0.7202504897449863 +/- 0.00796907915109877'"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"'Precision Tested Negative: 0.7769300832673465 +/- 0.006648553094210303'"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"'Recall Tested Positive: 0.5223880597014925 +/- 0.018279774199874473'"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"'Recall Tested Negative: 0.8913333333333333 +/- 0.0018856180831641283'"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"'Confusion Matrix: '"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"array([[445.66666667, 54.33333333],\n",
" [128. , 140. ]])"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"'Classifier: '"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,\n",
" beta_2=0.999, early_stopping=False, epsilon=1e-08,\n",
" hidden_layer_sizes=(100,), learning_rate='constant',\n",
" learning_rate_init=0.001, max_iter=200, momentum=0.9,\n",
" nesterovs_momentum=True, power_t=0.5, random_state=None,\n",
" shuffle=True, solver='lbfgs', tol=0.0001, validation_fraction=0.1,\n",
" verbose=False, warm_start=False)"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"'Starting Trial 0'"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Best parameters: {'feature_selection__k': 'all', 'classifier__solver': 'sgd', 'classifier__learning_rate': 'constant', 'classifier__alpha': 0.01, 'classifier__activation': 'tanh'}\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"C:\\Users\\manuz\\Miniconda3\\lib\\site-packages\\sklearn\\neural_network\\multilayer_perceptron.py:564: ConvergenceWarning: Stochastic Optimizer: Maximum iterations (200) reached and the optimization hasn't converged yet.\n",
" % self.max_iter, ConvergenceWarning)\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Best parameters: {'feature_selection__k': 6, 'classifier__solver': 'lbfgs', 'classifier__learning_rate': 'constant', 'classifier__alpha': 0.0001, 'classifier__activation': 'identity'}\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"C:\\Users\\manuz\\Miniconda3\\lib\\site-packages\\sklearn\\neural_network\\multilayer_perceptron.py:564: ConvergenceWarning: Stochastic Optimizer: Maximum iterations (200) reached and the optimization hasn't converged yet.\n",
" % self.max_iter, ConvergenceWarning)\n",
"C:\\Users\\manuz\\Miniconda3\\lib\\site-packages\\sklearn\\neural_network\\multilayer_perceptron.py:564: ConvergenceWarning: Stochastic Optimizer: Maximum iterations (200) reached and the optimization hasn't converged yet.\n",
" % self.max_iter, ConvergenceWarning)\n",
"C:\\Users\\manuz\\Miniconda3\\lib\\site-packages\\sklearn\\neural_network\\multilayer_perceptron.py:564: ConvergenceWarning: Stochastic Optimizer: Maximum iterations (200) reached and the optimization hasn't converged yet.\n",
" % self.max_iter, ConvergenceWarning)\n",
"C:\\Users\\manuz\\Miniconda3\\lib\\site-packages\\sklearn\\neural_network\\multilayer_perceptron.py:564: ConvergenceWarning: Stochastic Optimizer: Maximum iterations (200) reached and the optimization hasn't converged yet.\n",
" % self.max_iter, ConvergenceWarning)\n",
"C:\\Users\\manuz\\Miniconda3\\lib\\site-packages\\sklearn\\neural_network\\multilayer_perceptron.py:564: ConvergenceWarning: Stochastic Optimizer: Maximum iterations (200) reached and the optimization hasn't converged yet.\n",
" % self.max_iter, ConvergenceWarning)\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Best parameters: {'feature_selection__k': 'all', 'classifier__solver': 'lbfgs', 'classifier__learning_rate': 'invscaling', 'classifier__alpha': 1, 'classifier__activation': 'logistic'}\n"
]
},
{
"data": {
"text/plain": [
"'Starting Trial 1'"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"C:\\Users\\manuz\\Miniconda3\\lib\\site-packages\\sklearn\\neural_network\\multilayer_perceptron.py:564: ConvergenceWarning: Stochastic Optimizer: Maximum iterations (200) reached and the optimization hasn't converged yet.\n",
" % self.max_iter, ConvergenceWarning)\n",
"C:\\Users\\manuz\\Miniconda3\\lib\\site-packages\\sklearn\\neural_network\\multilayer_perceptron.py:564: ConvergenceWarning: Stochastic Optimizer: Maximum iterations (200) reached and the optimization hasn't converged yet.\n",
" % self.max_iter, ConvergenceWarning)\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Best parameters: {'feature_selection__k': 'all', 'classifier__solver': 'sgd', 'classifier__learning_rate': 'adaptive', 'classifier__alpha': 1e-05, 'classifier__activation': 'identity'}\n",
"Best parameters: {'feature_selection__k': 'all', 'classifier__solver': 'lbfgs', 'classifier__learning_rate': 'adaptive', 'classifier__alpha': 1, 'classifier__activation': 'identity'}\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"C:\\Users\\manuz\\Miniconda3\\lib\\site-packages\\sklearn\\neural_network\\multilayer_perceptron.py:564: ConvergenceWarning: Stochastic Optimizer: Maximum iterations (200) reached and the optimization hasn't converged yet.\n",
" % self.max_iter, ConvergenceWarning)\n",
"C:\\Users\\manuz\\Miniconda3\\lib\\site-packages\\sklearn\\neural_network\\multilayer_perceptron.py:564: ConvergenceWarning: Stochastic Optimizer: Maximum iterations (200) reached and the optimization hasn't converged yet.\n",
" % self.max_iter, ConvergenceWarning)\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Best parameters: {'feature_selection__k': 4, 'classifier__solver': 'sgd', 'classifier__learning_rate': 'constant', 'classifier__alpha': 0.001, 'classifier__activation': 'relu'}\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"C:\\Users\\manuz\\Miniconda3\\lib\\site-packages\\sklearn\\neural_network\\multilayer_perceptron.py:564: ConvergenceWarning: Stochastic Optimizer: Maximum iterations (200) reached and the optimization hasn't converged yet.\n",
" % self.max_iter, ConvergenceWarning)\n"
]
},
{
"data": {
"text/plain": [
"'Starting Trial 2'"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Best parameters: {'feature_selection__k': 6, 'classifier__solver': 'lbfgs', 'classifier__learning_rate': 'constant', 'classifier__alpha': 1, 'classifier__activation': 'identity'}\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"C:\\Users\\manuz\\Miniconda3\\lib\\site-packages\\sklearn\\neural_network\\multilayer_perceptron.py:564: ConvergenceWarning: Stochastic Optimizer: Maximum iterations (200) reached and the optimization hasn't converged yet.\n",
" % self.max_iter, ConvergenceWarning)\n",
"C:\\Users\\manuz\\Miniconda3\\lib\\site-packages\\sklearn\\neural_network\\multilayer_perceptron.py:564: ConvergenceWarning: Stochastic Optimizer: Maximum iterations (200) reached and the optimization hasn't converged yet.\n",
" % self.max_iter, ConvergenceWarning)\n",
"C:\\Users\\manuz\\Miniconda3\\lib\\site-packages\\sklearn\\neural_network\\multilayer_perceptron.py:564: ConvergenceWarning: Stochastic Optimizer: Maximum iterations (200) reached and the optimization hasn't converged yet.\n",
" % self.max_iter, ConvergenceWarning)\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Best parameters: {'feature_selection__k': 'all', 'classifier__solver': 'lbfgs', 'classifier__learning_rate': 'adaptive', 'classifier__alpha': 1, 'classifier__activation': 'identity'}\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"C:\\Users\\manuz\\Miniconda3\\lib\\site-packages\\sklearn\\neural_network\\multilayer_perceptron.py:564: ConvergenceWarning: Stochastic Optimizer: Maximum iterations (200) reached and the optimization hasn't converged yet.\n",
" % self.max_iter, ConvergenceWarning)\n",
"C:\\Users\\manuz\\Miniconda3\\lib\\site-packages\\sklearn\\neural_network\\multilayer_perceptron.py:564: ConvergenceWarning: Stochastic Optimizer: Maximum iterations (200) reached and the optimization hasn't converged yet.\n",
" % self.max_iter, ConvergenceWarning)\n",
"C:\\Users\\manuz\\Miniconda3\\lib\\site-packages\\sklearn\\neural_network\\multilayer_perceptron.py:564: ConvergenceWarning: Stochastic Optimizer: Maximum iterations (200) reached and the optimization hasn't converged yet.\n",
" % self.max_iter, ConvergenceWarning)\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Best parameters: {'feature_selection__k': 2, 'classifier__solver': 'sgd', 'classifier__learning_rate': 'adaptive', 'classifier__alpha': 1e-05, 'classifier__activation': 'relu'}\n"
]
},
{
"data": {
"text/plain": [
"'Scores of classification:'"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"\"Classifier: MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,\\n beta_2=0.999, early_stopping=False, epsilon=1e-08,\\n hidden_layer_sizes=(100,), learning_rate='constant',\\n learning_rate_init=0.001, max_iter=200, momentum=0.9,\\n nesterovs_momentum=True, power_t=0.5, random_state=None,\\n shuffle=True, solver='lbfgs', tol=0.0001, validation_fraction=0.1,\\n verbose=False, warm_start=False)\""
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"'Accuracy: 0.7673611111111112 +/- 0.0070787788325955385'"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"'Weighted Precision: 0.7618461993180677 +/- 0.0075941082106042125'"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"'Weighted Recall: 0.7673611111111112 +/- 0.0070787788325955385'"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"'Precision Tested Positive: 0.6999569408494454 +/- 0.010800002546392365'"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"'Precision Tested Negative: 0.7950188418572494 +/- 0.006475560416198352'"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"'Recall Tested Positive: 0.5833333333333334 +/- 0.015634085932806636'"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"'Recall Tested Negative: 0.866 +/- 0.00489897948556636'"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"'Confusion Matrix: '"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"array([[433. , 67. ],\n",
" [111.66666667, 156.33333333]])"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"'Classifier: '"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"GaussianNB(priors=None)"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"'Starting Trial 0'"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Best parameters: {'feature_selection__k': 4}\n",
"Best parameters: {'feature_selection__k': 2}\n",
"Best parameters: {'feature_selection__k': 4}\n"
]
},
{
"data": {
"text/plain": [
"'Starting Trial 1'"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Best parameters: {'feature_selection__k': 8}\n",
"Best parameters: {'feature_selection__k': 2}\n",
"Best parameters: {'feature_selection__k': 2}\n"
]
},
{
"data": {
"text/plain": [
"'Starting Trial 2'"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Best parameters: {'feature_selection__k': 2}\n",
"Best parameters: {'feature_selection__k': 4}\n",
"Best parameters: {'feature_selection__k': 2}\n"
]
},
{
"data": {
"text/plain": [
"'Scores of classification:'"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"'Classifier: GaussianNB(priors=None)'"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"'Accuracy: 0.75 +/- 0.006639348324990605'"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"'Weighted Precision: 0.7437151410614428 +/- 0.006537020314428177'"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"'Weighted Recall: 0.75 +/- 0.006639348324990605'"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"'Precision Tested Positive: 0.6689852488588031 +/- 0.014896082027820307'"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"'Precision Tested Negative: 0.7837703632820575 +/- 0.0024749828971138833'"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"'Recall Tested Positive: 0.5621890547263683 +/- 0.0046538027198680795'"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"'Recall Tested Negative: 0.8506666666666667 +/- 0.010370899457402705'"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"'Confusion Matrix: '"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"array([[425.33333333, 74.66666667],\n",
" [117.33333333, 150.66666667]])"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# split into training/test set\n",
"stratifiedFolds = StratifiedKFold(n_splits=3, shuffle=True)\n",
"\n",
"X = df.drop(columns=[\"class\"]).values\n",
"y = df[\"class\"].values\n",
"\n",
"for steps in pipelines:\n",
" pipeline = Pipeline(steps[0])\n",
" parameters = steps[1]\n",
" display(\"Classifier: \", pipeline.named_steps.classifier)\n",
"\n",
" numTrials = 3\n",
" accuracies = []\n",
" weighted_precisions = []\n",
" weighted_recalls = []\n",
" precisions_positive = []\n",
" precisions_negative = []\n",
" recalls_positive = []\n",
" recalls_negative = []\n",
"\n",
" totals_originalclass = []\n",
" totals_predictedclass = [] \n",
"\n",
" for i in range(numTrials):\n",
" display(\"Starting Trial \" + str(i))\n",
" originalclass = []\n",
" predictedclass = []\n",
"\n",
" for train_index, test_index in stratifiedFolds.split(X, y):\n",
" # Need to specify the positive label here as the default metric auc_roc only works for binary label.\n",
" #def custom_auc(ground_truth, predictions):\n",
" # fpr, tpr, _ = roc_curve(ground_truth, predictions[:, 1], pos_label='Switch')\n",
" # return auc(fpr, tpr)\n",
"\n",
" #roc_auc_scorer = make_scorer(custom_auc, greater_is_better=True, needs_proba=True)\n",
"\n",
" randomSearch = RandomizedSearchCV(pipeline, parameters, cv=5, scoring=None, n_iter=5)\n",
" randomSearch.fit(X[train_index], y=y[train_index])\n",
" print(\"Best parameters: \", randomSearch.best_params_)\n",
"\n",
" y_pred = randomSearch.predict(X[test_index])\n",
" originalclass.extend(y[test_index])\n",
" predictedclass.extend(y_pred)\n",
" totals_originalclass.extend(y[test_index])\n",
" totals_predictedclass.extend(y_pred)\n",
"\n",
" # after each complete trial, store results\n",
" trial_accuracy = accuracy_score(originalclass, predictedclass)\n",
" accuracies.extend([trial_accuracy])\n",
"\n",
" trial_weighted_precision = precision_score(originalclass, predictedclass, average=\"weighted\")\n",
" weighted_precisions.extend([trial_weighted_precision])\n",
" trial_precision_positive = precision_score(originalclass, predictedclass, average=\"binary\", pos_label=\"tested_positive\")\n",
" precisions_positive.extend([trial_precision_positive])\n",
" trial_precision_negative = precision_score(originalclass, predictedclass, average=\"binary\", pos_label=\"tested_negative\")\n",
" precisions_negative.extend([trial_precision_negative])\n",
"\n",
" trial_weighted_recall = recall_score(originalclass, predictedclass, average=\"weighted\")\n",
" weighted_recalls.extend([trial_weighted_recall])\n",
" trial_recall_positive = recall_score(originalclass, predictedclass, average=\"binary\", pos_label=\"tested_positive\")\n",
" recalls_positive.extend([trial_recall_positive])\n",
" trial_recall_negative = recall_score(originalclass, predictedclass, average=\"binary\", pos_label=\"tested_negative\")\n",
" recalls_negative.extend([trial_recall_negative])\n",
"\n",
" display('Scores of classification:')\n",
" display(\"Classifier: \" + str(pipeline.named_steps.classifier))\n",
" display(\"Accuracy: \" + str(np.mean(accuracies)) + \" +/- \" + str(np.std(accuracies)))\n",
" display(\"Weighted Precision: \" + str(np.mean(weighted_precisions)) + \" +/- \" + str(np.std(weighted_precisions)))\n",
" display(\"Weighted Recall: \" + str(np.mean(weighted_recalls)) + \" +/- \" + str(np.std(weighted_recalls)))\n",
" display(\"Precision Tested Positive: \" + str(np.mean(precisions_positive)) + \" +/- \" + str(np.std(precisions_positive)))\n",
" display(\"Precision Tested Negative: \" + str(np.mean(precisions_negative)) + \" +/- \" + str(np.std(precisions_negative)))\n",
" display(\"Recall Tested Positive: \" + str(np.mean(recalls_positive)) + \" +/- \" + str(np.std(recalls_positive)))\n",
" display(\"Recall Tested Negative: \" + str(np.mean(recalls_negative)) + \" +/- \" + str(np.std(recalls_negative)))\n",
" conf_matrix = confusion_matrix(totals_originalclass, totals_predictedclass) / numTrials\n",
" display(\"Confusion Matrix: \")\n",
" display(conf_matrix)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.5"
}
},
"nbformat": 4,
"nbformat_minor": 2
}