Preliminaries¶
%matplotlib inline
# IMPORT MODULES
import sys
import os
import time
import warnings
import platform
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import sklearn
from scipy.stats import norm
import seaborn as sns
import hyperopt
import scipy
import xgboost
# Print Python version information (sys.version includes build details; it is not the Anaconda version)
print(f"Python build is {sys.version}")
print(f"Python version is {platform.python_version()}")
# Print versions of imported libraries
print(f"Pandas version is: {pd.__version__}")
print(f"Numpy version is: {np.__version__}")
print(f"Matplotlib version is: {matplotlib.__version__}")
print(f"Sklearn version is: {sklearn.__version__}")
print(f"Seaborn version is: {sns.__version__}")
print(f"Hyperopt version is: {hyperopt.__version__}")
print(f"Scipy version is: {scipy.__version__}")
print(f"Xgboost version is: {xgboost.__version__}")
# Set seaborn settings
sns.set(context='paper', palette='winter_r', style='darkgrid', rc={'figure.facecolor': 'gray'}, font_scale=1.5)
# Filter out warnings (deprecations, etc.)
if not sys.warnoptions:
    warnings.simplefilter("ignore")
    os.environ["PYTHONWARNINGS"] = "ignore"  # Also affect subprocesses
Python build is 3.10.13 (main, Mar 12 2024, 12:16:25) [GCC 12.2.0]
Python version is 3.10.13
Pandas version is: 2.2.1
Numpy version is: 1.26.4
Matplotlib version is: 3.8.3
Sklearn version is: 1.4.1.post1
Seaborn version is: 0.13.2
Hyperopt version is: 0.2.7
Scipy version is: 1.12.0
Xgboost version is: 2.0.3
# Alter the ast_node_interactivity kernel option so Jupyter displays multiple statements at once
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
Import all remaining modules/libraries that will be used within this project:
import itertools
from time import time
from pprint import pprint
from pandas.plotting import scatter_matrix
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split, cross_val_score, cross_validate, GridSearchCV, RandomizedSearchCV, KFold, StratifiedKFold
from sklearn.tree import DecisionTreeClassifier
from sklearn import naive_bayes
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from xgboost import XGBClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn import tree
from sklearn.metrics import accuracy_score, recall_score, precision_score, roc_curve, auc, classification_report, confusion_matrix
from scipy.stats import randint as sp_randint
from scipy.stats import uniform as sp_uniform
##Importing hyperopt libraries
from functools import partial
from hyperopt import fmin, hp, tpe, Trials, space_eval
from hyperopt.pyll import scope
2.1 Data Exploration¶
# Get the current working directory
current_directory = os.getcwd()
# Construct the file path
file_path = os.path.join(current_directory, 'data', 'oasis_longitudinal_demographics.csv')
# Load in the data
mri = pd.read_csv(file_path)
# Get a feel for the data - observe first 5 rows
print("First 5 rows of the data:")
print(mri.head(5))
# Print out the number of columns in the dataframe (features)
num_columns = len(mri.columns)
print(f"\nThere are {num_columns} attributes within the dataset.")
# Check the columns of the dataset in prep for Data Exploration
print("\nColumns of the dataset:")
print(mri.columns)
First 5 rows of the data:
  Subject ID         MRI ID        Group  Visit  MR Delay M/F Hand  Age  EDUC  \
0  OAS2_0001  OAS2_0001_MR1  Nondemented      1         0   M    R   87    14
1  OAS2_0001  OAS2_0001_MR2  Nondemented      2       457   M    R   88    14
2  OAS2_0002  OAS2_0002_MR1     Demented      1         0   M    R   75    12
3  OAS2_0002  OAS2_0002_MR2     Demented      2       560   M    R   76    12
4  OAS2_0002  OAS2_0002_MR3     Demented      3      1895   M    R   80    12

   SES  MMSE  CDR         eTIV   nWBV    ASF
0  2.0  27.0  0.0  1986.550000  0.696  0.883
1  2.0  30.0  0.0  2004.479526  0.681  0.876
2  NaN  23.0  0.5  1678.290000  0.736  1.046
3  NaN  28.0  0.5  1737.620000  0.713  1.010
4  NaN  22.0  0.5  1697.911134  0.701  1.034

There are 15 attributes within the dataset.

Columns of the dataset:
Index(['Subject ID', 'MRI ID', 'Group', 'Visit', 'MR Delay', 'M/F', 'Hand',
       'Age', 'EDUC', 'SES', 'MMSE', 'CDR', 'eTIV', 'nWBV', 'ASF'],
      dtype='object')
# Check the columns and their type - info prints this info about a dataframe
print("Information about the dataset:")
mri.info()
# Dimensions of the dataset
num_columns = len(mri.columns)
num_rows = len(mri.index)
print(f"\nThe dataset is made up with {num_columns} columns and {num_rows} rows.")
Information about the dataset:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 373 entries, 0 to 372
Data columns (total 15 columns):
 #   Column      Non-Null Count  Dtype
---  ------      --------------  -----
 0   Subject ID  373 non-null    object
 1   MRI ID      373 non-null    object
 2   Group       373 non-null    object
 3   Visit       373 non-null    int64
 4   MR Delay    373 non-null    int64
 5   M/F         373 non-null    object
 6   Hand        373 non-null    object
 7   Age         373 non-null    int64
 8   EDUC        373 non-null    int64
 9   SES         354 non-null    float64
 10  MMSE        371 non-null    float64
 11  CDR         373 non-null    float64
 12  eTIV        373 non-null    float64
 13  nWBV        373 non-null    float64
 14  ASF         373 non-null    float64
dtypes: float64(6), int64(4), object(5)
memory usage: 43.8+ KB

The dataset is made up of 15 columns and 373 rows.
# Let's check the distinct values in certain columns
# Common sense would suggest that Subject ID, MRI ID, MR Delay, eTIV, nWBV, and ASF features would have too many distinct values for the purpose of this test.
# Therefore let's ignore them
num_columns = ['Visit', 'Age', 'EDUC', 'SES', 'MMSE', 'CDR']
for col in num_columns:
    unique_values = mri[col].unique()
    print(f"{len(unique_values)} unique values for {col}:")
    print(unique_values)
    print("\nValue counts:")
    print(mri[col].value_counts(), "\n")
cat_columns = ['Group', 'M/F', 'Hand']
for col in cat_columns:
    unique_values = mri[col].unique()
    print(f"{len(unique_values)} unique values for {col}:")
    print(unique_values)
    print("\nValue counts:")
    print(mri[col].value_counts(), "\n")
5 unique values for Visit: [1 2 3 4 5]
Value counts: {1: 150, 2: 144, 3: 58, 4: 15, 5: 6}

39 unique values for Age: [87 88 75 76 80 90 83 85 71 73 93 95 68 69 66 78 81 82 77 86 92 84 72 61 64 74 60 62 91 79 89 70 94 97 65 67 63 96 98]
Value counts: {73: 26, 75: 22, 78: 21, 80: 20, 81: 18, 71: 18, 82: 17, 76: 16, 77: 16, 68: 14, 84: 13, 69: 13, 83: 12, 70: 12, 74: 12, 72: 11, 79: 11, 66: 10, 88: 10, 86: 10, 85: 9, 89: 7, 65: 6, 67: 6, 87: 6, 90: 5, 62: 4, 91: 4, 61: 4, 92: 4, 64: 3, 93: 3, 63: 3, 60: 2, 95: 1, 94: 1, 97: 1, 96: 1, 98: 1}

12 unique values for EDUC: [14 12 18 16 8 20 13 6 17 15 23 11]
Value counts: {12: 103, 16: 81, 18: 64, 14: 33, 13: 27, 15: 17, 20: 13, 11: 11, 8: 9, 17: 9, 6: 3, 23: 3}

6 unique values for SES: [ 2. nan  3.  4.  1.  5.]
Value counts: {2.0: 103, 1.0: 88, 3.0: 82, 4.0: 74, 5.0: 7}

19 unique values for MMSE: [27. 30. 23. 28. 22. 29. 24. 21. 16. 25. 26. 15. 20. 19.  7.  4. 17. 18. nan]
Value counts: {30.0: 114, 29.0: 91, 28.0: 45, 27.0: 32, 26.0: 20, 25.0: 12, 21.0: 11, 23.0: 11, 22.0: 7, 20.0: 7, 17.0: 5, 24.0: 4, 16.0: 3, 19.0: 3, 15.0: 2, 18.0: 2, 7.0: 1, 4.0: 1}

4 unique values for CDR: [0.  0.5 1.  2. ]
Value counts: {0.0: 206, 0.5: 123, 1.0: 41, 2.0: 3}

3 unique values for Group: ['Nondemented' 'Demented' 'Converted']
Value counts: {Nondemented: 190, Demented: 146, Converted: 37}

2 unique values for M/F: ['M' 'F']
Value counts: {F: 213, M: 160}

1 unique values for Hand: ['R']
Value counts: {R: 373}
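To back up the claim that the skipped columns really are too high-cardinality to treat as categories, here is a minimal sketch using pandas' nunique() (illustrative only):

# Sketch: count distinct values in the columns we skipped above
high_card_cols = ['Subject ID', 'MRI ID', 'MR Delay', 'eTIV', 'nWBV', 'ASF']
print(mri[high_card_cols].nunique())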
# Get a summary of the numerical attributes within our dataset - lets us see whether we need to normalize/scale at a later point
for col in num_columns:
    print(mri[col].describe(), "\n")
count    373.000000
mean       1.882038
std        0.922843
min        1.000000
25%        1.000000
50%        2.000000
75%        2.000000
max        5.000000
Name: Visit, dtype: float64

count    373.000000
mean      77.013405
std        7.640957
min       60.000000
25%       71.000000
50%       77.000000
75%       82.000000
max       98.000000
Name: Age, dtype: float64

count    373.000000
mean      14.597855
std        2.876339
min        6.000000
25%       12.000000
50%       15.000000
75%       16.000000
max       23.000000
Name: EDUC, dtype: float64

count    354.000000
mean       2.460452
std        1.134005
min        1.000000
25%        2.000000
50%        2.000000
75%        3.000000
max        5.000000
Name: SES, dtype: float64

count    371.000000
mean      27.342318
std        3.683244
min        4.000000
25%       27.000000
50%       29.000000
75%       30.000000
max       30.000000
Name: MMSE, dtype: float64

count    373.000000
mean       0.290885
std        0.374557
min        0.000000
25%        0.000000
50%        0.000000
75%        0.500000
max        2.000000
Name: CDR, dtype: float64
2.2 Exploratory Data Analysis¶
# Plotting histograms side by side for pairs of variables
# Define pairs of variables
pairs = [('Age', 'CDR'), ('SES', 'EDUC'), ('MMSE', 'M/F')]
# Plot histograms for each pair of variables
for pair in pairs:
    fig, axes = plt.subplots(1, 2, figsize=(14, 6))
    for i, col in enumerate(pair):
        sns.histplot(mri[col], ax=axes[i], kde=False)
        axes[i].set_title(f'Histogram of {col}')
        axes[i].set_xlabel(col)
        axes[i].set_ylabel('Frequency')
    plt.tight_layout()
    # Adding ; to last statement to get rid of <matplotlib...> <seaborn...> text
    plt.show();
A few things can be concluded from these graphs about the dataset:
- It seems that the age of the majority of Subject IDs falls between 70 and 85, with the most frequent age being 73 (more than 25 participants in this study are of this age). Note this could be skewed, as we have not yet discounted Subject IDs that have had more than one session.
- More than half of the dataset's population received a CDR rating of 0 (>200 Subject IDs), while fewer than 10 subjects were diagnosed with a CDR score of 2.0.
- The most frequent socio-economic status score in this study was 2.0, with over 100 Subject IDs (103), followed by 88 Subject IDs graded as 1.0 on the SES scale.
- Most of the individuals in this study spent a significant number of years in education, with over two-thirds of the study population having studied for at least 12 years.
- The majority of Subject IDs scored highly in their MMSE examinations (>50% scoring 29 or above).
- The percentage ratio between females and males was 57:43.
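The last figure can be double-checked with a one-liner (a quick sketch; normalize=True turns counts into proportions):

# Sketch: verify the quoted 57:43 female/male split
print(mri['M/F'].value_counts(normalize=True).round(2))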
NOTE:
Although I will be replacing the 'Converted' values in the Group attribute, for the purpose of visual analysis I am dropping the rows for 'Converted' patients so I can directly compare 'Demented' and 'Nondemented' subjects. To accommodate this, I am taking a copy of the 'mri' dataframe to ensure we don't lose values.
Let's create a function called plot_facetgrid,
which will generate a FacetGrid plot for a specified variable, using predefined colors, titles, and x-axis limits. It utilizes Seaborn's FacetGrid and kdeplot functions for visualization:
# Create a copy of the dataframe so we can remove the 'converted' value from the 'Group' attribute
mri_copy = mri.copy()
mri_copy = mri_copy[mri_copy.Group != "Converted"]
def plot_facetgrid(data, var):
    """
    Function to create a FacetGrid chart based on the variable var.

    Parameters:
    data (DataFrame): The DataFrame containing the data.
    var (str): The variable to plot.
    """
    csfont_options = {'fontname': 'monospace',
                      'weight': 'bold',
                      'color': 'b',
                      'size': 'large'}
    plot_colors_lookup = {
        'MMSE': {'color': ['r', 'k'], 'title': 'Dementia Variation against MMSE scores', 'xlim': (10, data['MMSE'].max())},
        'SES': {'color': ['r', 'k'], 'title': 'Variation of Dementia against Socio-Economic-Status', 'xlim': (0, data['SES'].max())},
        'nWBV': {'color': ['r', 'k'], 'title': 'Variation of Dementia against Normalized Whole Brain Volume', 'xlim': (0, data['nWBV'].max())},
        'eTIV': {'color': ['r', 'k'], 'title': 'Variation of Dementia against Estimated Total Intracranial Volume', 'xlim': (0, data['eTIV'].max())},
        'EDUC': {'color': ['r', 'k'], 'title': 'Variation of Dementia against Years of Education', 'xlim': (3, data['EDUC'].max())},
        'Age': {'color': ['r', 'k'], 'title': 'Dementia Variation against Age', 'xlim': (10, data['Age'].max())}
    }
    # Look up plot colors, title, and xlim based on the variable var
    plot_colors = plot_colors_lookup[var]['color']
    title = plot_colors_lookup[var]['title']
    xlim = plot_colors_lookup[var]['xlim']
    # Create FacetGrid
    g = sns.FacetGrid(data, hue='Group', hue_kws={'color': plot_colors}, height=8)
    # Plot KDE plot (fill=True replaces the deprecated shade=True)
    g.map(sns.kdeplot, var, fill=True)
    # Set x-axis limit
    g.set(xlim=xlim)
    # Add legend
    plt.legend(loc='best')
    # Set title
    plt.title(title, **csfont_options)
    # Show plot
    plt.show()
plot_facetgrid(mri_copy, 'MMSE')
From the above graph, one can see the Dementia distribution as a function of MMSE score. For high MMSE scores, in approximately 70% of cases the Subject IDs are considered non-demented. Conversely, as expected, for low MMSE scores (<22) all subjects receive a 'Demented' diagnosis.
plot_facetgrid(mri_copy, 'SES')
Interestingly, the distribution of Dementia across the study is greatest for subjects who received a socio-economic status score of 3 or more. This possibly suggests that the more affluent an individual is, the greater the chance of Dementia. Subjects who score 3 or less are generally considered 'Non-Demented' (in >50% of cases for a score of 2) or 'Converted' (in ~60% of cases where subjects score 1 on the SES scale).
plot_facetgrid(mri_copy, 'nWBV')
As one would expect, subjects who exhibited a greater percentage of normalized whole brain volume had a greater chance of receiving a 'Nondemented' diagnosis. The lower the nWBV value, the greater the chance a patient was classified as 'Demented' or 'Converted'. In recent years, brain shrinkage has become a potentially important marker for early changes in brain tissue, and such changes have been associated with early markers of Alzheimer's disease [https://www.webmd.com/alzheimers/news/20110413/brain-shrinkage-may-help-predict-alzheimers].
plot_facetgrid(mri_copy, 'eTIV')
Interestingly, there didn't seem to be much difference in the variation of Dementia with eTIV, i.e. the estimated total intracranial volume was not significantly different between any of the classification groups. As referenced in the paper "Intracranial Volume and Alzheimer Disease: Evidence against the Cerebral Reserve Hypothesis" by Jenkins et al., the mean TIV did not differ significantly between subjects, and the only significant predictor of TIV was sex. It was therefore concluded that their findings do not suggest that a larger brain provides better 'protection' against AD [https://www.ncbi.nlm.nih.gov/pubmed/10681081]. This explains the visual cues seen in the plot above, i.e. 'Demented' and 'Nondemented' subjects have roughly the same eTIV values.
plot_facetgrid(mri_copy, 'EDUC')
From quickly glancing at the above, it seems that Dementia is more prevalent in subjects who had fewer years in education. For example, any subject who had fewer than 13 years of education had a slightly increased possibility of developing Dementia, whereas the chances of having Dementia-like symptoms (on first visit) decrease quite sharply for subjects who have had 17+ years in education.
Finally, Dementia is more prevalent in subjects who fall within the 65-85 year group (see the plot below). Before 65, one could hypothesise that the subject is too young for the full-blown effects of Dementia to manifest. Similarly, after 85, given that a patient could have been living with the disease for many years, certain symptoms could have had a profound effect on an individual's life. This possibly explains why the Dementia (not Converted) diagnosis is small after the age of 85. Obviously, Dementia forms in people at different stages, as shown by the Converted plot, which grows in magnitude as age increases (up to ~85 years old). This would suggest that a subject's condition was updated from a 'Nondemented' to a 'Demented' status in that timeframe. For the purpose of visual analysis and for reasons of brevity, I'm only discussing the direct comparisons between the 'Demented' and 'Nondemented' plots.
plot_facetgrid(mri_copy, 'Age')
If you want to see the 'Converted' group plotted against both 'demented' & 'non-demented' groups:
# Define the target variable and features
targ = 'Group'
cols = ['MMSE', 'SES', 'nWBV', 'eTIV', 'EDUC', 'Age']
# Define font options for the title
csfont_options = {'fontname': 'monospace', 'weight': 'bold', 'color': 'b', 'size': 'large'}
# Set min x-axis values for each feature
xlimits = [12, 0, 0.55, 840, 3, 50]
# Define plot colors for different groups
plot_colors = {'Converted': 'r', 'Demented': 'k', 'Non-Demented': 'b'}
# Iterate over the target variable and features
for feature, xlimit in zip(cols, xlimits):
    g = sns.FacetGrid(mri, hue=targ, height=8)
    g.map(sns.kdeplot, feature, fill=True)
    g.set(xlim=(10, mri[feature].max()))
    plt.legend(loc='best')
    plt.title('Dementia Variation against {}'.format(feature), **csfont_options)
    plt.xlim(xlimit)
    plt.show();
Out of curiosity, plot the gender frequency for Demented/Nondemented subjects:
g = sns.catplot(x='M/F',
                data=mri_copy, hue='Group',
                kind='count', palette="muted",
                legend=False, height=6)
g.despine(left=True)
g.set_ylabels("Participation count")
# Use the FacetGrid's own legend (the bars themselves carry no labels, so plt.legend() would find nothing)
g.add_legend()
plt.show();
2.2.2 Correlation¶
def plot_corr_heatmap(data, method, heading):
    """
    Plot correlation heatmap for the given data.

    Parameters:
    data (DataFrame): The DataFrame containing the data.
    method (str): The method used for computing correlations.
    heading (str): The title for the heatmap.
    """
    # Copy mri dataset and preprocess
    mri_copy = data.drop(['Subject ID', 'M/F', 'MRI ID', 'Hand'], axis=1)
    mri_copy = pd.get_dummies(mri_copy, drop_first=True)
    # Compute correlation matrix
    corr = mri_copy.corr(method=method)
    # Set up the matplotlib figure
    f, ax = plt.subplots(figsize=(12, 8))
    # Draw the heatmap
    sns.heatmap(corr, annot=True, linewidths=.5, fmt='.2f', ax=ax)
    # Set title
    ax.set_title(heading)
    # Show plot
    plt.show()
    return corr
plot_corr_heatmap(mri, 'pearson', 'Correlation Heat map')
|  | Visit | MR Delay | Age | EDUC | SES | MMSE | CDR | eTIV | nWBV | ASF | Group_Demented | Group_Nondemented |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Visit | 1.000000 | 0.920009 | 0.183213 | 0.024615 | -0.051622 | -0.029078 | 0.002325 | 0.117423 | -0.126682 | -0.120399 | -0.129800 | 0.095507 |
| MR Delay | 0.920009 | 1.000000 | 0.205357 | 0.051630 | -0.030813 | 0.065844 | -0.062915 | 0.119641 | -0.105586 | -0.123545 | -0.180156 | 0.120638 |
| Age | 0.183213 | 0.205357 | 1.000000 | -0.027886 | -0.046857 | 0.055612 | -0.026257 | 0.042401 | -0.518359 | -0.035067 | -0.079153 | 0.005941 |
| EDUC | 0.024615 | 0.051630 | -0.027886 | 1.000000 | -0.722647 | 0.194884 | -0.153121 | 0.257042 | -0.012200 | -0.241752 | -0.258708 | 0.193060 |
| SES | -0.051622 | -0.030813 | -0.046857 | -0.722647 | 1.000000 | -0.149219 | 0.076160 | -0.261582 | 0.090095 | 0.255576 | 0.205556 | -0.062463 |
| MMSE | -0.029078 | 0.065844 | 0.055612 | 0.194884 | -0.149219 | 1.000000 | -0.686519 | -0.032088 | 0.341912 | 0.040052 | -0.612448 | 0.524775 |
| CDR | 0.002325 | -0.062915 | -0.026257 | -0.153121 | 0.076160 | -0.686519 | 1.000000 | 0.022863 | -0.344819 | -0.029340 | 0.815473 | -0.778049 |
| eTIV | 0.117423 | 0.119641 | 0.042401 | 0.257042 | -0.261582 | -0.032088 | 0.022863 | 1.000000 | -0.210180 | -0.988905 | -0.010365 | 0.042579 |
| nWBV | -0.126682 | -0.105586 | -0.518359 | -0.012200 | 0.090095 | 0.341912 | -0.344819 | -0.210180 | 1.000000 | 0.213476 | -0.286903 | 0.311346 |
| ASF | -0.120399 | -0.123545 | -0.035067 | -0.241752 | 0.255576 | 0.040052 | -0.029340 | -0.988905 | 0.213476 | 1.000000 | 0.008312 | -0.032495 |
| Group_Demented | -0.129800 | -0.180156 | -0.079153 | -0.258708 | 0.205556 | -0.612448 | 0.815473 | -0.010365 | -0.286903 | 0.008312 | 1.000000 | -0.817174 |
| Group_Nondemented | 0.095507 | 0.120638 | 0.005941 | 0.193060 | -0.062463 | 0.524775 | -0.778049 | 0.042579 | 0.311346 | -0.032495 | -0.817174 | 1.000000 |
Observing the correlation coefficient (Pearson's r) between each pair of attributes, I can make several observations:
- CDR has a strong positive correlation with Group_Demented (0.82) and a strong negative correlation with Group_Nondemented (-0.78). This is expected, given that the CDR scale is used to gauge whether a patient has symptoms of Dementia. Since this feature is so strongly correlated with the target attribute, it would be advisable to drop this column during feature selection.
- Age and nWBV have quite a strong negative correlation.
- EDUC has moderate positive correlations with MMSE and eTIV, and a very strong negative correlation with SES.
- MMSE and CDR share a strong negative correlation; MMSE also has a moderate positive correlation with nWBV.
- Most of the attributes share a correlation that is close to 0, so we assume there is little to no linear relationship between them.
Note, these **correlations are only linear, so we aren't measuring non-linear relationships that could exist within the dataset** (e.g. y rising as attribute x approaches 0). A Spearman check is sketched below.
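Since plot_corr_heatmap already accepts a method argument, a minimal sketch of such a check is to rerun it with Spearman's rank correlation, which captures monotonic (not just linear) relationships:

# Sketch: Spearman rank correlation as a complement to Pearson's r
spearman_corr = plot_corr_heatmap(mri, 'spearman', 'Spearman Correlation Heat Map')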
2.2.3 Scatter Matrix¶
scatter_matrix(mri, figsize=(12, 8));
# Display unique values before and after replacement
for col in ['Group', 'M/F']:
    print(f"Before replacement {col} has the following values: {mri[col].unique()}")

# Create a mapping dictionary for replacement ('Converted' is folded into the 'Demented' class)
mapping = {'F': 0, 'M': 1, 'Nondemented': 0, 'Demented': 1, 'Converted': 1}

# Replace categorical values with numerical values
mri.replace({'Group': mapping, 'M/F': mapping}, inplace=True)

# Display unique values after replacement
for col in ['Group', 'M/F']:
    print(f"After replacement {col} has the following numerical values: {mri[col].unique()}")
Before replacement Group has the following values: ['Nondemented' 'Demented' 'Converted']
Before replacement M/F has the following values: ['M' 'F']
After replacement Group has the following numerical values: [0 1]
After replacement M/F has the following numerical values: [1 0]
## Using LabelEncoder from scikit-learn to encode categorical features to numerical
obj_mri = mri.select_dtypes(include=['object'])

# Converting categorical data
for col in obj_mri.columns:
    le = LabelEncoder()
    le.fit(mri[col])
    list(le.classes_)
    mri[col] = le.transform(mri[col])
LabelEncoder classes for 'Subject ID':
['OAS2_0001', 'OAS2_0002', 'OAS2_0004', 'OAS2_0005', 'OAS2_0007', 'OAS2_0008', 'OAS2_0009', 'OAS2_0010', 'OAS2_0012', 'OAS2_0013', 'OAS2_0014', 'OAS2_0016', 'OAS2_0017', 'OAS2_0018', 'OAS2_0020', 'OAS2_0021', 'OAS2_0022', 'OAS2_0023', 'OAS2_0026', 'OAS2_0027', 'OAS2_0028', 'OAS2_0029', 'OAS2_0030', 'OAS2_0031', 'OAS2_0032', 'OAS2_0034', 'OAS2_0035', 'OAS2_0036', 'OAS2_0037', 'OAS2_0039', 'OAS2_0040', 'OAS2_0041', 'OAS2_0042', 'OAS2_0043', 'OAS2_0044', 'OAS2_0045', 'OAS2_0046', 'OAS2_0047', 'OAS2_0048', 'OAS2_0049', 'OAS2_0050', 'OAS2_0051', 'OAS2_0052', 'OAS2_0053', 'OAS2_0054', 'OAS2_0055', 'OAS2_0056', 'OAS2_0057', 'OAS2_0058', 'OAS2_0060', 'OAS2_0061', 'OAS2_0062', 'OAS2_0063', 'OAS2_0064', 'OAS2_0066', 'OAS2_0067', 'OAS2_0068', 'OAS2_0069', 'OAS2_0070', 'OAS2_0071', 'OAS2_0073', 'OAS2_0075', 'OAS2_0076', 'OAS2_0077', 'OAS2_0078', 'OAS2_0079', 'OAS2_0080', 'OAS2_0081', 'OAS2_0085', 'OAS2_0086', 'OAS2_0087', 'OAS2_0088', 'OAS2_0089', 'OAS2_0090', 'OAS2_0091', 'OAS2_0092', 'OAS2_0094', 'OAS2_0095', 'OAS2_0096', 'OAS2_0097', 'OAS2_0098', 'OAS2_0099', 'OAS2_0100', 'OAS2_0101', 'OAS2_0102', 'OAS2_0103', 'OAS2_0104', 'OAS2_0105', 'OAS2_0106', 'OAS2_0108', 'OAS2_0109', 'OAS2_0111', 'OAS2_0112', 'OAS2_0113', 'OAS2_0114', 'OAS2_0116', 'OAS2_0117', 'OAS2_0118', 'OAS2_0119', 'OAS2_0120', 'OAS2_0121', 'OAS2_0122', 'OAS2_0124', 'OAS2_0126', 'OAS2_0127', 'OAS2_0128', 'OAS2_0129', 'OAS2_0131', 'OAS2_0133', 'OAS2_0134', 'OAS2_0135', 'OAS2_0137', 'OAS2_0138', 'OAS2_0139', 'OAS2_0140', 'OAS2_0141', 'OAS2_0142', 'OAS2_0143', 'OAS2_0144', 'OAS2_0145', 'OAS2_0146', 'OAS2_0147', 'OAS2_0149', 'OAS2_0150', 'OAS2_0152', 'OAS2_0154', 'OAS2_0156', 'OAS2_0157', 'OAS2_0158', 'OAS2_0159', 'OAS2_0160', 'OAS2_0161', 'OAS2_0162', 'OAS2_0164', 'OAS2_0165', 'OAS2_0169', 'OAS2_0171', 'OAS2_0172', 'OAS2_0174', 'OAS2_0175', 'OAS2_0176', 'OAS2_0177', 'OAS2_0178', 'OAS2_0179', 'OAS2_0181', 'OAS2_0182', 'OAS2_0183', 'OAS2_0184', 'OAS2_0185', 'OAS2_0186']
LabelEncoder classes for 'MRI ID':
['OAS2_0001_MR1', 'OAS2_0001_MR2', 'OAS2_0002_MR1', 'OAS2_0002_MR2', 'OAS2_0002_MR3', 'OAS2_0004_MR1', 'OAS2_0004_MR2', 'OAS2_0005_MR1', 'OAS2_0005_MR2', 'OAS2_0005_MR3', 'OAS2_0007_MR1', 'OAS2_0007_MR3', 'OAS2_0007_MR4', 'OAS2_0008_MR1', 'OAS2_0008_MR2', 'OAS2_0009_MR1', 'OAS2_0009_MR2', 'OAS2_0010_MR1', 'OAS2_0010_MR2', 'OAS2_0012_MR1', 'OAS2_0012_MR2', 'OAS2_0012_MR3', 'OAS2_0013_MR1', 'OAS2_0013_MR2', 'OAS2_0013_MR3', 'OAS2_0014_MR1', 'OAS2_0014_MR2', 'OAS2_0016_MR1', 'OAS2_0016_MR2', 'OAS2_0017_MR1', 'OAS2_0017_MR3', 'OAS2_0017_MR4', 'OAS2_0017_MR5', 'OAS2_0018_MR1', 'OAS2_0018_MR3', 'OAS2_0018_MR4', 'OAS2_0020_MR1', 'OAS2_0020_MR2', 'OAS2_0020_MR3', 'OAS2_0021_MR1', 'OAS2_0021_MR2', 'OAS2_0022_MR1', 'OAS2_0022_MR2', 'OAS2_0023_MR1', 'OAS2_0023_MR2', 'OAS2_0026_MR1', 'OAS2_0026_MR2', 'OAS2_0027_MR1', 'OAS2_0027_MR2', 'OAS2_0027_MR3', 'OAS2_0027_MR4', 'OAS2_0028_MR1', 'OAS2_0028_MR2', 'OAS2_0029_MR1', 'OAS2_0029_MR2', 'OAS2_0030_MR1', 'OAS2_0030_MR2', 'OAS2_0031_MR1', 'OAS2_0031_MR2', 'OAS2_0031_MR3', 'OAS2_0032_MR1', 'OAS2_0032_MR2', 'OAS2_0034_MR1', 'OAS2_0034_MR2', 'OAS2_0034_MR3', 'OAS2_0034_MR4', 'OAS2_0035_MR1', 'OAS2_0035_MR2', 'OAS2_0036_MR1', 'OAS2_0036_MR3', 'OAS2_0036_MR4', 'OAS2_0036_MR5', 'OAS2_0037_MR1', 'OAS2_0037_MR2', 'OAS2_0037_MR3', 'OAS2_0037_MR4', 'OAS2_0039_MR1', 'OAS2_0039_MR2', 'OAS2_0040_MR1', 'OAS2_0040_MR2', 'OAS2_0040_MR3', 'OAS2_0041_MR1', 'OAS2_0041_MR2', 'OAS2_0041_MR3', 'OAS2_0042_MR1', 'OAS2_0042_MR2', 'OAS2_0043_MR1', 'OAS2_0043_MR2', 'OAS2_0044_MR1', 'OAS2_0044_MR2', 'OAS2_0044_MR3', 'OAS2_0045_MR1', 'OAS2_0045_MR2', 'OAS2_0046_MR1', 'OAS2_0046_MR2', 'OAS2_0047_MR1', 'OAS2_0047_MR2', 'OAS2_0048_MR1', 'OAS2_0048_MR2', 'OAS2_0048_MR3', 'OAS2_0048_MR4', 'OAS2_0048_MR5', 'OAS2_0049_MR1', 'OAS2_0049_MR2', 'OAS2_0049_MR3', 'OAS2_0050_MR1', 'OAS2_0050_MR2', 'OAS2_0051_MR1', 'OAS2_0051_MR2', 'OAS2_0051_MR3', 'OAS2_0052_MR1', 'OAS2_0052_MR2', 'OAS2_0053_MR1', 'OAS2_0053_MR2', 'OAS2_0054_MR1', 'OAS2_0054_MR2', 'OAS2_0055_MR1', 'OAS2_0055_MR2', 'OAS2_0056_MR1', 'OAS2_0056_MR2', 'OAS2_0057_MR1', 'OAS2_0057_MR2', 'OAS2_0057_MR3', 'OAS2_0058_MR1', 'OAS2_0058_MR2', 'OAS2_0058_MR3', 'OAS2_0060_MR1', 'OAS2_0060_MR2', 'OAS2_0061_MR1', 'OAS2_0061_MR2', 'OAS2_0061_MR3', 'OAS2_0062_MR1', 'OAS2_0062_MR2', 'OAS2_0062_MR3', 'OAS2_0063_MR1', 'OAS2_0063_MR2', 'OAS2_0064_MR1', 'OAS2_0064_MR2', 'OAS2_0064_MR3', 'OAS2_0066_MR1', 'OAS2_0066_MR2', 'OAS2_0067_MR1', 'OAS2_0067_MR2', 'OAS2_0067_MR3', 'OAS2_0067_MR4', 'OAS2_0068_MR1', 'OAS2_0068_MR2', 'OAS2_0069_MR1', 'OAS2_0069_MR2', 'OAS2_0070_MR1', 'OAS2_0070_MR2', 'OAS2_0070_MR3', 'OAS2_0070_MR4', 'OAS2_0070_MR5', 'OAS2_0071_MR1', 'OAS2_0071_MR2', 'OAS2_0073_MR1', 'OAS2_0073_MR2', 'OAS2_0073_MR3', 'OAS2_0073_MR4', 'OAS2_0073_MR5', 'OAS2_0075_MR1', 'OAS2_0075_MR2', 'OAS2_0076_MR1', 'OAS2_0076_MR2', 'OAS2_0076_MR3', 'OAS2_0077_MR1', 'OAS2_0077_MR2', 'OAS2_0078_MR1', 'OAS2_0078_MR2', 'OAS2_0078_MR3', 'OAS2_0079_MR1', 'OAS2_0079_MR2', 'OAS2_0079_MR3', 'OAS2_0080_MR1', 'OAS2_0080_MR2', 'OAS2_0080_MR3', 'OAS2_0081_MR1', 'OAS2_0081_MR2', 'OAS2_0085_MR1', 'OAS2_0085_MR2', 'OAS2_0086_MR1', 'OAS2_0086_MR2', 'OAS2_0087_MR1', 'OAS2_0087_MR2', 'OAS2_0088_MR1', 'OAS2_0088_MR2', 'OAS2_0089_MR1', 'OAS2_0089_MR3', 'OAS2_0090_MR1', 'OAS2_0090_MR2', 'OAS2_0090_MR3', 'OAS2_0091_MR1', 'OAS2_0091_MR2', 'OAS2_0092_MR1', 'OAS2_0092_MR2', 'OAS2_0094_MR1', 'OAS2_0094_MR2', 'OAS2_0095_MR1', 'OAS2_0095_MR2', 'OAS2_0095_MR3', 'OAS2_0096_MR1', 'OAS2_0096_MR2', 'OAS2_0097_MR1', 'OAS2_0097_MR2', 'OAS2_0098_MR1', 'OAS2_0098_MR2', 'OAS2_0099_MR1', 'OAS2_0099_MR2', 
'OAS2_0100_MR1', 'OAS2_0100_MR2', 'OAS2_0100_MR3', 'OAS2_0101_MR1', 'OAS2_0101_MR2', 'OAS2_0101_MR3', 'OAS2_0102_MR1', 'OAS2_0102_MR2', 'OAS2_0102_MR3', 'OAS2_0103_MR1', 'OAS2_0103_MR2', 'OAS2_0103_MR3', 'OAS2_0104_MR1', 'OAS2_0104_MR2', 'OAS2_0105_MR1', 'OAS2_0105_MR2', 'OAS2_0106_MR1', 'OAS2_0106_MR2', 'OAS2_0108_MR1', 'OAS2_0108_MR2', 'OAS2_0109_MR1', 'OAS2_0109_MR2', 'OAS2_0111_MR1', 'OAS2_0111_MR2', 'OAS2_0112_MR1', 'OAS2_0112_MR2', 'OAS2_0113_MR1', 'OAS2_0113_MR2', 'OAS2_0114_MR1', 'OAS2_0114_MR2', 'OAS2_0116_MR1', 'OAS2_0116_MR2', 'OAS2_0117_MR1', 'OAS2_0117_MR2', 'OAS2_0117_MR3', 'OAS2_0117_MR4', 'OAS2_0118_MR1', 'OAS2_0118_MR2', 'OAS2_0119_MR1', 'OAS2_0119_MR2', 'OAS2_0119_MR3', 'OAS2_0120_MR1', 'OAS2_0120_MR2', 'OAS2_0121_MR1', 'OAS2_0121_MR2', 'OAS2_0122_MR1', 'OAS2_0122_MR2', 'OAS2_0124_MR1', 'OAS2_0124_MR2', 'OAS2_0126_MR1', 'OAS2_0126_MR2', 'OAS2_0126_MR3', 'OAS2_0127_MR1', 'OAS2_0127_MR2', 'OAS2_0127_MR3', 'OAS2_0127_MR4', 'OAS2_0127_MR5', 'OAS2_0128_MR1', 'OAS2_0128_MR2', 'OAS2_0129_MR1', 'OAS2_0129_MR2', 'OAS2_0129_MR3', 'OAS2_0131_MR1', 'OAS2_0131_MR2', 'OAS2_0133_MR1', 'OAS2_0133_MR3', 'OAS2_0134_MR1', 'OAS2_0134_MR2', 'OAS2_0135_MR1', 'OAS2_0135_MR2', 'OAS2_0137_MR1', 'OAS2_0137_MR2', 'OAS2_0138_MR1', 'OAS2_0138_MR2', 'OAS2_0139_MR1', 'OAS2_0139_MR2', 'OAS2_0140_MR1', 'OAS2_0140_MR2', 'OAS2_0140_MR3', 'OAS2_0141_MR1', 'OAS2_0141_MR2', 'OAS2_0142_MR1', 'OAS2_0142_MR2', 'OAS2_0143_MR1', 'OAS2_0143_MR2', 'OAS2_0143_MR3', 'OAS2_0144_MR1', 'OAS2_0144_MR2', 'OAS2_0145_MR1', 'OAS2_0145_MR2', 'OAS2_0146_MR1', 'OAS2_0146_MR2', 'OAS2_0147_MR1', 'OAS2_0147_MR2', 'OAS2_0147_MR3', 'OAS2_0147_MR4', 'OAS2_0149_MR1', 'OAS2_0149_MR2', 'OAS2_0150_MR1', 'OAS2_0150_MR2', 'OAS2_0152_MR1', 'OAS2_0152_MR2', 'OAS2_0152_MR3', 'OAS2_0154_MR1', 'OAS2_0154_MR2', 'OAS2_0156_MR1', 'OAS2_0156_MR2', 'OAS2_0157_MR1', 'OAS2_0157_MR2', 'OAS2_0158_MR1', 'OAS2_0158_MR2', 'OAS2_0159_MR1', 'OAS2_0159_MR2', 'OAS2_0160_MR1', 'OAS2_0160_MR2', 'OAS2_0161_MR1', 'OAS2_0161_MR2', 'OAS2_0161_MR3', 'OAS2_0162_MR1', 'OAS2_0162_MR2', 'OAS2_0164_MR1', 'OAS2_0164_MR2', 'OAS2_0165_MR1', 'OAS2_0165_MR2', 'OAS2_0169_MR1', 'OAS2_0169_MR2', 'OAS2_0171_MR1', 'OAS2_0171_MR2', 'OAS2_0171_MR3', 'OAS2_0172_MR1', 'OAS2_0172_MR2', 'OAS2_0174_MR1', 'OAS2_0174_MR2', 'OAS2_0174_MR3', 'OAS2_0175_MR1', 'OAS2_0175_MR2', 'OAS2_0175_MR3', 'OAS2_0176_MR1', 'OAS2_0176_MR2', 'OAS2_0176_MR3', 'OAS2_0177_MR1', 'OAS2_0177_MR2', 'OAS2_0178_MR1', 'OAS2_0178_MR2', 'OAS2_0178_MR3', 'OAS2_0179_MR1', 'OAS2_0179_MR2', 'OAS2_0181_MR1', 'OAS2_0181_MR2', 'OAS2_0181_MR3', 'OAS2_0182_MR1', 'OAS2_0182_MR2', 'OAS2_0183_MR1', 'OAS2_0183_MR2', 'OAS2_0183_MR3', 'OAS2_0183_MR4', 'OAS2_0184_MR1', 'OAS2_0184_MR2', 'OAS2_0185_MR1', 'OAS2_0185_MR2', 'OAS2_0185_MR3', 'OAS2_0186_MR1', 'OAS2_0186_MR2', 'OAS2_0186_MR3']
LabelEncoder classes for 'Hand':
['R']
# Check that all columns are now numerical
mri.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 373 entries, 0 to 372
Data columns (total 15 columns):
 #   Column      Non-Null Count  Dtype
---  ------      --------------  -----
 0   Subject ID  373 non-null    int64
 1   MRI ID      373 non-null    int64
 2   Group       373 non-null    int64
 3   Visit       373 non-null    int64
 4   MR Delay    373 non-null    int64
 5   M/F         373 non-null    int64
 6   Hand        373 non-null    int64
 7   Age         373 non-null    int64
 8   EDUC        373 non-null    int64
 9   SES         354 non-null    float64
 10  MMSE        371 non-null    float64
 11  CDR         373 non-null    float64
 12  eTIV        373 non-null    float64
 13  nWBV        373 non-null    float64
 14  ASF         373 non-null    float64
dtypes: float64(6), int64(9)
memory usage: 43.8 KB
3.1.2 Outlier detection¶
def outlier_detection(cols):
    """
    Detect outliers in a list of values using the z-score method.

    Parameters:
    cols (list): A list of numerical values.

    Returns:
    outliers (tuple): A tuple containing the indices of outliers.
    """
    mean_col = np.mean(cols)
    std_col = np.std(cols)
    threshold = 3
    z_scores = [(col - mean_col) / std_col for col in cols]
    outliers = np.where(np.abs(z_scores) > threshold)
    return outliers
cols = ["EDUC", "SES", "MMSE", "CDR", "eTIV", "Visit", "eTIV", "nWBV", "ASF"]
for col in cols:
outliers_indices = outlier_detection(mri[col])
if len(outliers_indices[0]) > 0:
print(f"Outliers for {col} are given below:")
for i in outliers_indices[0]:
outlier_value = mri[col].iloc[i]
print(f"Index: {i}, Value: {outlier_value}")
print()
Outliers for MMSE are given below:
Index: 26, Value: 16.0
Index: 89, Value: 15.0
Index: 100, Value: 7.0
Index: 101, Value: 4.0
Index: 172, Value: 16.0
Index: 173, Value: 16.0
Index: 251, Value: 15.0

Outliers for CDR are given below:
Index: 184, Value: 2.0
Index: 251, Value: 2.0
Index: 330, Value: 2.0

Outliers for Visit are given below:
Index: 32, Value: 5
Index: 71, Value: 5
Index: 101, Value: 5
Index: 153, Value: 5
Index: 160, Value: 5
Index: 265, Value: 5
The outliers for MMSE, CDR and Visit have actual 'meaning', i.e. low MMSE scores such as 16, 15, 7 and 4 indicate the severity of Dementia in a subject (the same applies to CDR). Thus, I believe it wouldn't be suitable to drop these outliers from the dataset. For comparison, an alternative detection rule is sketched below.
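A minimal sketch of the interquartile-range (IQR) rule, assuming the conventional 1.5 x IQR fences (the fences and the helper name iqr_outliers are illustrative, not part of the analysis above):

# Sketch: IQR-based outlier detection; we still keep these rows, as argued above
def iqr_outliers(series):
    q1, q3 = series.quantile([0.25, 0.75])
    iqr = q3 - q1
    mask = (series < q1 - 1.5 * iqr) | (series > q3 + 1.5 * iqr)
    return series[mask]

print(iqr_outliers(mri['MMSE']))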
3.1.3 Imputation - dealing with missing values¶
#Check which columns contain null values
nulls = mri.columns[mri.isnull().any()]
#Check how many nulls in each column
mri[nulls].isnull().sum()
SES     19
MMSE     2
dtype: int64
# Imputation - only works on numerical attributes, hence why we encoded the categorical text features in 3.1
# Impute missing values with the median value of the attribute
imputer = SimpleImputer(strategy="median")

# Final check to make sure all columns are numerical before proceeding with imputation
for column in mri.columns:
    if mri[column].dtype == 'object':
        print(f"{column} of type object found!")
    else:
        print(f"{column} is a numerical attribute - proceed with imputation")
# Fitting imputer instance
imputer.fit(mri)
# Check if the median computed by the imputer matches the median of the original data
print("Medians match:", (imputer.statistics_ == mri.median().values).all())
# Transforming mri data to replace null values via imputation
X = imputer.transform(mri)
# Constructing dataframe from transformed values
mri = pd.DataFrame(X, columns=mri.columns)
Subject ID is a numerical attribute - proceed with imputation
MRI ID is a numerical attribute - proceed with imputation
Group is a numerical attribute - proceed with imputation
Visit is a numerical attribute - proceed with imputation
MR Delay is a numerical attribute - proceed with imputation
M/F is a numerical attribute - proceed with imputation
Hand is a numerical attribute - proceed with imputation
Age is a numerical attribute - proceed with imputation
EDUC is a numerical attribute - proceed with imputation
SES is a numerical attribute - proceed with imputation
MMSE is a numerical attribute - proceed with imputation
CDR is a numerical attribute - proceed with imputation
eTIV is a numerical attribute - proceed with imputation
nWBV is a numerical attribute - proceed with imputation
ASF is a numerical attribute - proceed with imputation
SimpleImputer(strategy='median')
Medians match: True
#Check that nulls have been replaced
mri[nulls].isnull().sum()
SES     0
MMSE    0
dtype: int64
3.1.4 Standardizing dataframe attributes¶
cols = ['Age', 'M/F', 'EDUC', 'SES', 'MMSE', 'eTIV', 'nWBV', 'ASF']
max_values = mri[cols].max()
min_values = mri[cols].min()
print(f"max value in mri dataset for {cols} before scaling:\n{max_values}")
print(f"min value in mri dataset for {cols} before scaling:\n{min_values}")
max value in mri dataset for ['Age', 'M/F', 'EDUC', 'SES', 'MMSE', 'eTIV', 'nWBV', 'ASF'] before scaling:
Age       98.000000
M/F        1.000000
EDUC      23.000000
SES        5.000000
MMSE      30.000000
eTIV    2004.479526
nWBV       0.837000
ASF        1.587000
dtype: float64
min value in mri dataset for ['Age', 'M/F', 'EDUC', 'SES', 'MMSE', 'eTIV', 'nWBV', 'ASF'] before scaling:
Age       60.000000
M/F        0.000000
EDUC       6.000000
SES        1.000000
MMSE       4.000000
eTIV    1105.652499
nWBV       0.644000
ASF        0.876000
dtype: float64
# Fit and transform on the selected columns only (fitting on the full frame first would be redundant)
scaler = StandardScaler()
mri[cols] = scaler.fit_transform(mri[cols])
max_values = mri[cols].max()
min_values = mri[cols].min()
print(f"max value in mri dataset for {cols} after scaling:\n{max_values}")
print(f"min value in mri dataset for {cols} after scaling:\n{min_values}")
max value in mri dataset for ['Age', 'M/F', 'EDUC', 'SES', 'MMSE', 'eTIV', 'nWBV', 'ASF'] after scaling:
Age     2.750282
M/F     1.153798
EDUC    2.925048
SES     2.313556
MMSE    0.721664
eTIV    2.935525
nWBV    2.896887
ASF     2.839157
dtype: float64
min value in mri dataset for ['Age', 'M/F', 'EDUC', 'SES', 'MMSE', 'eTIV', 'nWBV', 'ASF'] after scaling:
Age    -2.229597
M/F    -0.866703
EDUC   -2.993181
SES    -1.297140
MMSE   -6.362035
eTIV   -2.174359
nWBV   -2.307345
ASF    -2.316501
dtype: float64
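As a quick sanity check (a sketch), the standardized columns should now have a mean of roughly 0 and a standard deviation of roughly 1:

# Sketch: StandardScaler output should be approximately mean 0, std 1 per column
print(mri[cols].mean().round(3))
print(mri[cols].std().round(3))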
Note: we only standardized certain attributes, as some of the other columns will be dropped during feature selection in our next preprocessing step.
3.1.5 Dropping features¶
# Drop unnecessary features, and also the highly correlated feature CDR, from the dataframe
mri = mri.drop(['Subject ID', 'Visit', 'MRI ID', 'Hand', 'MR Delay', 'CDR'], axis=1)
mri.head(10)
|  | Group | M/F | Age | EDUC | SES | MMSE | eTIV | nWBV | ASF |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 0.0 | 1.153798 | 1.308738 | -0.208132 | -0.394466 | -0.095686 | 2.833595 | -0.905169 | -2.265742 |
| 1 | 0.0 | 1.153798 | 1.439787 | -0.208132 | -0.394466 | 0.721664 | 2.935525 | -1.309643 | -2.316501 |
| 2 | 1.0 | 1.153798 | -0.263856 | -0.904394 | -0.394466 | -1.185486 | 1.081119 | 0.173429 | -1.083784 |
| 3 | 1.0 | 1.153798 | -0.132806 | -0.904394 | -0.394466 | 0.176764 | 1.418413 | -0.446765 | -1.344830 |
| 4 | 1.0 | 1.153798 | 0.391392 | -0.904394 | -0.394466 | -1.457936 | 1.192666 | -0.770344 | -1.170800 |
| 5 | 0.0 | -0.866703 | 1.439787 | 1.184392 | 0.508208 | 0.176764 | -1.550836 | -0.527660 | 1.802224 |
| 6 | 0.0 | -0.866703 | 1.701886 | 1.184392 | 0.508208 | -0.095686 | -1.637420 | -0.311940 | 1.932747 |
| 7 | 0.0 | 1.153798 | 0.391392 | -0.904394 | 1.410882 | 0.176764 | 1.139618 | -0.473730 | -1.134543 |
| 8 | 0.0 | 1.153798 | 0.784540 | -0.904394 | 1.410882 | 0.449214 | 1.208652 | -0.500695 | -1.185302 |
| 9 | 0.0 | 1.153798 | 1.046639 | -0.904394 | 1.410882 | 0.721664 | 1.200386 | -0.662484 | -1.178051 |
3.1.6 Splitting data into training / test sets¶
np.random.seed(42)  # Seed numpy's global RNG so the split below is reproducible
y = mri['Group'] #Target
X = mri.drop(['Group'], axis=1) # Features
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
#Create copies of our train/test sets for benchmark testing
BM_X_train = X_train.copy()
BM_X_test = X_test.copy()
BM_y_train = y_train.copy()
BM_y_test = y_test.copy()
# Create further copies in case of further optimization techniques
Copy_X_train = X_train.copy()
Copy_X_test = X_test.copy()
Copy_y_train = y_train.copy()
Copy_y_test = y_test.copy()
print("X_train set is made up of {} rows and {} columns\n".format(len(X_train.index), len(X_train.columns)))
print("X_test set is made up of {} rows and {} columns\n".format(len(X_test.index), len(X_test.columns)))
print("y_train is made up of {} values\n".format(len(y_train.index)))
print("y_test is made up of {} values\n".format(len(y_test.index)))
X_train set is made up of 261 rows and 8 columns

X_test set is made up of 112 rows and 8 columns

y_train is made up of 261 values

y_test is made up of 112 values
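One optional refinement, sketched below but not applied in this run (the Xs_train/ys_train names are illustrative): passing stratify=y to train_test_split preserves the class ratio in both splits, which matters for small datasets like this one.

# Sketch: a stratified variant of the split above
Xs_train, Xs_test, ys_train, ys_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y)
print(ys_train.value_counts(normalize=True).round(2))
print(ys_test.value_counts(normalize=True).round(2))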
## As per the evaluator's suggestion, look into using xgboost -> popular boosting model -> appropriate here
# Setting random_state as 42 for reproducibility
# Define the models
models = [
LogisticRegression(random_state=42),
SVC(random_state=42),
DecisionTreeClassifier(random_state=42),
AdaBoostClassifier(random_state=42),
RandomForestClassifier(random_state=42),
XGBClassifier(random_state=42),
GradientBoostingClassifier(random_state=42),
GaussianNB()
]
# Define the corresponding classifier names
classifiers = [
'Logistic Regression',
'SVC',
'Decision Tree Classifier',
'AdaBoost Classifier',
'Random Forest Classifier',
'XGBClassifier',
'GradientBoostingClassifier',
'Naive Bayes'
]
# Define the columns for the benchmark dataframe
ml_columns = ['ML Algo name', 'Model Parameters', 'training AUC score', 'test recall score', 'test AUC score']
# Create an empty dataframe to store benchmark results
benchmark = pd.DataFrame(columns=ml_columns)
# Perform cross-validation and evaluation for each model
for model, clf_name in zip(models, classifiers):
    benchmark.loc[len(benchmark)] = [clf_name, str(model.get_params()), np.nan, np.nan, np.nan]
    cv_score = cross_val_score(model, BM_X_train, BM_y_train, cv=5, scoring='roc_auc')
    benchmark.loc[benchmark['ML Algo name'] == clf_name, 'training AUC score'] = np.mean(cv_score)
    mdl = model.fit(BM_X_train, BM_y_train)
    # Note: the ROC curve / AUC here are computed from hard class predictions,
    # not predicted probabilities, so each curve has a single operating point
    fpr, tpr, thresholds = roc_curve(BM_y_test, model.predict(BM_X_test))
    test_recall = recall_score(BM_y_test, model.predict(BM_X_test))
    test_auc = auc(fpr, tpr)
    benchmark.loc[benchmark['ML Algo name'] == clf_name, 'test recall score'] = test_recall
    benchmark.loc[benchmark['ML Algo name'] == clf_name, 'test AUC score'] = test_auc
# Sort benchmark table by test AUC score
benchmark.sort_values(by='test AUC score', ascending=False, inplace=True)
# Plot the benchmark results
sns.barplot(x='test AUC score', y='ML Algo name', data=benchmark)
# Set the width of the model parameters column
benchmark.style.set_properties(subset=['Model Parameters'], **{'width': '300px'})
|  | ML Algo name | Model Parameters | training AUC score | test recall score | test AUC score |
|---|---|---|---|---|---|
| 4 | Random Forest Classifier | {'bootstrap': True, 'ccp_alpha': 0.0, 'class_weight': None, 'criterion': 'gini', 'max_depth': None, 'max_features': 'sqrt', 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'monotonic_cst': None, 'n_estimators': 100, 'n_jobs': None, 'oob_score': False, 'random_state': 42, 'verbose': 0, 'warm_start': False} | 0.917676 | 0.833333 | 0.868590 |
| 6 | GradientBoostingClassifier | {'ccp_alpha': 0.0, 'criterion': 'friedman_mse', 'init': None, 'learning_rate': 0.1, 'loss': 'log_loss', 'max_depth': 3, 'max_features': None, 'max_leaf_nodes': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 100, 'n_iter_no_change': None, 'random_state': 42, 'subsample': 1.0, 'tol': 0.0001, 'validation_fraction': 0.1, 'verbose': 0, 'warm_start': False} | 0.895164 | 0.816667 | 0.821795 |
| 5 | XGBClassifier | {'objective': 'binary:logistic', 'base_score': None, 'booster': None, 'callbacks': None, 'colsample_bylevel': None, 'colsample_bynode': None, 'colsample_bytree': None, 'device': None, 'early_stopping_rounds': None, 'enable_categorical': False, 'eval_metric': None, 'feature_types': None, 'gamma': None, 'grow_policy': None, 'importance_type': None, 'interaction_constraints': None, 'learning_rate': None, 'max_bin': None, 'max_cat_threshold': None, 'max_cat_to_onehot': None, 'max_delta_step': None, 'max_depth': None, 'max_leaves': None, 'min_child_weight': None, 'missing': nan, 'monotone_constraints': None, 'multi_strategy': None, 'n_estimators': None, 'n_jobs': None, 'num_parallel_tree': None, 'random_state': 42, 'reg_alpha': None, 'reg_lambda': None, 'sampling_method': None, 'scale_pos_weight': None, 'subsample': None, 'tree_method': None, 'validate_parameters': None, 'verbosity': None} | 0.898903 | 0.766667 | 0.806410 |
| 1 | SVC | {'C': 1.0, 'break_ties': False, 'cache_size': 200, 'class_weight': None, 'coef0': 0.0, 'decision_function_shape': 'ovr', 'degree': 3, 'gamma': 'scale', 'kernel': 'rbf', 'max_iter': -1, 'probability': False, 'random_state': 42, 'shrinking': True, 'tol': 0.001, 'verbose': False} | 0.887858 | 0.650000 | 0.776923 |
| 3 | AdaBoost Classifier | {'algorithm': 'SAMME.R', 'estimator': None, 'learning_rate': 1.0, 'n_estimators': 50, 'random_state': 42} | 0.870394 | 0.783333 | 0.766667 |
| 2 | Decision Tree Classifier | {'ccp_alpha': 0.0, 'class_weight': None, 'criterion': 'gini', 'max_depth': None, 'max_features': None, 'max_leaf_nodes': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'monotonic_cst': None, 'random_state': 42, 'splitter': 'best'} | 0.771987 | 0.766667 | 0.748718 |
| 7 | Naive Bayes | {'priors': None, 'var_smoothing': 1e-09} | 0.879456 | 0.650000 | 0.748077 |
| 0 | Logistic Regression | {'C': 1.0, 'class_weight': None, 'dual': False, 'fit_intercept': True, 'intercept_scaling': 1, 'l1_ratio': None, 'max_iter': 100, 'multi_class': 'auto', 'n_jobs': None, 'penalty': 'l2', 'random_state': 42, 'solver': 'lbfgs', 'tol': 0.0001, 'verbose': 0, 'warm_start': False} | 0.907505 | 0.700000 | 0.744231 |
Plotting the ROC curves and confusion matrices for evaluation purposes ...
total_fpr = {}
total_tpr = {}
def roc_curves(model, classifier):
    ML_name = model.__class__.__name__
    fpr, tpr, thresholds = roc_curve(BM_y_test, model.predict(BM_X_test))
    test_auc = auc(fpr, tpr)
    total_fpr[ML_name] = fpr
    total_tpr[ML_name] = tpr
    plt.plot(fpr, tpr, color='darkorange', linewidth=2, label='(Area = %0.2f)' % test_auc)
    plt.plot([0, 1], [0, 1], 'k--')
    plt.axis([0, 1, 0, 1])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.legend(loc="lower right")
    plt.title('ROC Curve for {}'.format(classifier))
    plt.show()
def confusion_matrix_plot(model, classifier):
    # Use the benchmark test labels throughout for consistency (BM_y_test is a copy of y_test)
    cm = confusion_matrix(BM_y_test, model.predict(BM_X_test))
    plt.imshow(cm, interpolation='nearest', cmap=plt.get_cmap('Blues'))
    labels = ['Nondemented', 'Demented']
    plt.title('Confusion Matrix for {}'.format(classifier))
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    tick_marks = np.arange(len(labels))
    plt.xticks(tick_marks, labels)
    plt.yticks(tick_marks, labels)
    s = [['TN', 'FP'], ['FN', 'TP']]
    for i, j in itertools.product(range(2), range(2)):
        plt.text(j, i, str(s[i][j]) + " = " + str(cm[i][j]))
    plt.show()
for model, classifier in zip(models, classifiers):
    roc_curves(model, classifier)
    confusion_matrix_plot(model, classifier)
Plotting all of the ROC curves on one graph ...
colors = {
    'LogisticRegression': 'red',
    'SVC': 'green',
    'DecisionTreeClassifier': 'blue',
    'AdaBoostClassifier': 'yellow',
    'RandomForestClassifier': 'cyan',
    'XGBClassifier': 'magenta',
    'GradientBoostingClassifier': 'black',
    'GaussianNB': 'gray'  # 'white' would be invisible on the white plot background
}
plt.figure(figsize=(20, 12))
for model_name in total_fpr.keys():
    plt.plot(total_fpr[model_name], total_tpr[model_name], color=colors[model_name], lw=1, label=model_name)
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.title('Comparison of ROC curves for all classifiers')
plt.plot([0, 1], [0, 1], color='darkorange', lw=2, linestyle='--')
plt.legend()
plt.show();
I made the mistake of trying to perform GridSearchCV for all classifiers. As you can see in the markdown below, the hyperparameter tuning of the Random Forest classifier and the Decision Tree classifier took approximately 135,746 and 37,069 seconds respectively, i.e. upwards of 38 and 10 hours.
Thus, I will only be performing GridSearchCV for LogisticRegression and SVM.
The remaining classifiers will be tuned using RandomizedSearchCV and hyperopt.
GridSearch/RandomizedSearchCV¶
The function 'grid' below uses GridSearchCV to find optimal hyperparameters for the Logistic Regression and SVM models. Due to the vast parameter grids for the remaining models, it was advisable to tune their hyperparameters via RandomizedSearchCV (see the quick cost estimate sketched below).
Also, the 3 benchmark models (SVM, RandomForestClassifier, XGBoost) which I wanted to pay particular attention to are also fitted for the evaluation metric 'accuracy'. The reasoning behind this: I wanted to collect the classification error on the training/test sets (the classification error being equal to 1 - accuracy_score) to test the robustness of each model.
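To make the cost argument concrete, here is a minimal sketch estimating the number of fits an exhaustive grid search would need (the grid sizes mirror the rf_parameters grid defined below, and cv=10 matches the grid function):

# Sketch: fits required by exhaustive GridSearchCV over the random forest grid
rf_grid_sizes = [16, 28, 3, 5]  # n_estimators, max_leaf_nodes, min_samples_leaf, min_samples_split
n_combinations = int(np.prod(rf_grid_sizes))
print(f"{n_combinations} combinations -> {n_combinations * 10} fits with 10-fold CV")
# RandomizedSearchCV with n_iter=100 caps this at 100 * 10 = 1000 fits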
#Create empty final results dataframe for optimization metrics
refinement_cols = ['ML Model','Method','fitting time', 'optimal hyperparameters', 'best training AUC score', 'best estimator','test recall score', 'test AUC score']
finale = pd.DataFrame(columns = refinement_cols)
#Create empty robustness table to compare training and testing errors
robust_cols = ['ML Model', 'Training Classification Error', 'Test Classification_Error']
robustment_test = pd.DataFrame(columns = robust_cols)
#Storing training times
training_time = []
#Create two empty dictionaries so that the FPR and TPR can be captured for each model. This will allow us to plot all the ROC curves on one graph
total_fpr_opt = {}
total_tpr_opt = {}
#Define classifiers for tuning - set random_state for reproducible results
classifiers = [ LogisticRegression(random_state=42),
SVC(random_state=42),
AdaBoostClassifier(random_state=42),
DecisionTreeClassifier(random_state=42),
RandomForestClassifier(random_state=42),
GradientBoostingClassifier(random_state=42),
XGBClassifier(random_state=42) ]
lr_parameters = {
'C':[0.001, 0.01, 0.1, 1, 10,100,1000, 10000],
'max_iter':list(range(2,100))}
svm_parameters = {
'kernel': ['rbf', 'poly', 'sigmoid'],
'C':[0.00001,0.0001, 0.001, 0.01, 0.1, 1, 10, 20, 30, 40, 50 ,60 , 80, 100],
'gamma': [0.00001,0.0001, 0.001, 0.01, 0.1, 1, 10] }
dt_parameters = { #'max_leaf_nodes': list(range(2,100)),
'max_leaf_nodes': list(range(2,20)),
'splitter':['random', 'best'],
'criterion': ['gini', 'entropy'],
'max_depth': list(range(2,20)),
'min_samples_split': list(range(2,20))}
rf_parameters = {
'n_estimators': [10,30,50,100,150,200,250,300,350,400,500,600,700,800,900,1000],
'max_leaf_nodes': list(range(2,30)),
'min_samples_leaf':[1,2,3], 'min_samples_split':[3,4,5,6,7]}
gb_parameters = {
'learning_rate': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
'min_samples_leaf': list(range(1,20)),
'max_depth': list(range(2,20)),
'n_estimators': range(1,200)}
xgboost_parameters = {
    'silent': [True],  # 'silent' is deprecated in modern XGBoost and ignored
'max_depth': [6, 10, 15, 20],
'learning_rate': [0.001, 0.01, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7],
'subsample': [0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
'colsample_bytree': [0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
'colsample_bylevel': [0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
'min_child_weight': [0.5, 1.0, 3.0, 5.0, 7.0, 10.0],
'gamma': [0, 0.25, 0.5, 1.0],
'reg_lambda': [0.1, 1.0, 5.0, 10.0, 50.0, 100.0],
'n_estimators': [50,100,120,150]}
ada_parameters = {
'n_estimators': [50,100,200,250,300,400,500,600,700,800,900,1000],
'learning_rate': [ 0.1,0.15, 0.2,0.25, 0.3,0.4,0.5,0.6,0.7,0.8,0.9,1] }
parameters = [lr_parameters,
svm_parameters,
ada_parameters,
dt_parameters,
rf_parameters,
gb_parameters,
xgboost_parameters]
os.environ["PARALLEL_JOBS"] = "16"
def get_n_jobs():
parallel_jobs = os.getenv('PARALLEL_JOBS')
if parallel_jobs is not None:
try:
n_jobs = int(parallel_jobs)
return n_jobs
except ValueError:
print("Error: PARALLEL_JOBS is not an integer. Using default value (None).")
return None
else:
return None
parallel_jobs = get_n_jobs()
def print_summary(ML_name, mdl, method, fitting_time, best_params, confusion_matrix, complete_time, parallel_jobs, feature_importances=False):
    print("--" * 50)
    print("Finding the best hyperparameters for metric [roc_auc] for {}\n".format(ML_name))
    print("Performing {} for {}\n".format(method, ML_name))
    print("Beginning to fit {}\n".format(ML_name))
    print("Optimal hyperparameters:", best_params)
    print("Confusion matrix:")
    confusion_matrix_plot(confusion_matrix, ML_name)
    if feature_importances:
        print_feature_importances(ML_name, mdl)
    if parallel_jobs is None:
        parallel_jobs = 1
    print(f"Hyperparameter tuning complete. Used {parallel_jobs} parallel jobs... Took {complete_time:.2f} seconds.\n")
def print_feature_importances(ML_name, mdl):
    print("==" * 16, "feature_importances", "==" * 16)
    for name, importance in zip(X_train.columns, mdl.feature_importances_):
        print(name, "==", importance)
    importances = mdl.feature_importances_
    indices = np.argsort(importances)
    features = X_train.columns
    plt.title('Feature Importances')
    plt.barh(range(len(indices)), importances[indices], color='b', align='center')
    plt.yticks(range(len(indices)), [features[i] for i in indices])
    plt.xlabel('Relative Importances')
    plt.show();
def confusion_matrix_plot(confusion_matrix, classifier):
    plt.imshow(confusion_matrix, interpolation='nearest', cmap=plt.get_cmap('Blues'))
    labels = ['Nondemented', 'Demented']
    plt.title('Confusion Matrix for {}'.format(classifier))
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    tick_marks = np.arange(len(labels))
    plt.xticks(tick_marks, labels)
    plt.yticks(tick_marks, labels)
    s = [['TN', 'FP'], ['FN', 'TP']]
    for i, j in itertools.product(range(2), range(2)):
        plt.text(j, i, str(s[i][j]) + " = " + str(confusion_matrix[i][j]))
    plt.show()
def grid(classifier, parameters, jobs):
    ML_name = classifier.__class__.__name__
    start_time = time()
    models = ['DecisionTreeClassifier', 'RandomForestClassifier', 'XGBClassifier', 'GradientBoostingClassifier', 'AdaBoostClassifier']
    if ML_name in models:
        method = "Randomized Grid Search"
        grid = RandomizedSearchCV(estimator=classifier, param_distributions=parameters, cv=10, n_iter=100, scoring="roc_auc", random_state=42, n_jobs=jobs, verbose=0)
    else:
        method = "Grid Search"
        grid = GridSearchCV(estimator=classifier, param_grid=parameters, cv=10, scoring="roc_auc", n_jobs=jobs, verbose=0)
    grid.fit(X_train, y_train)
    fitting_time = time() - start_time
    best_params = grid.best_params_
    mdl = grid.best_estimator_.fit(X_train, y_train)
    pred = mdl.predict(X_test)
    test_acc = accuracy_score(y_test, pred)
    test_recall = recall_score(y_test, pred, pos_label=1)
    fpr, tpr, thresholds = roc_curve(y_test, pred, pos_label=1)
    test_auc = auc(fpr, tpr)
    total_fpr_opt[ML_name] = fpr  # Dict key
    total_tpr_opt[ML_name] = tpr  # Dict key
    conf_matrix = confusion_matrix(y_test, pred)
    complete_time = time() - start_time
    # Feature importances if applicable
    feature_importances = None
    if ML_name in models:
        feature_importances = True
    # Inserting data into finale DataFrame
    finale.loc[index, 'ML Model'] = ML_name
    finale.loc[index, 'Method'] = method
    finale.loc[index, 'fitting time'] = fitting_time
    finale.loc[index, 'optimal hyperparameters'] = str(best_params)
    finale.loc[index, 'best training AUC score'] = grid.best_score_
    finale.loc[index, 'best estimator'] = str(mdl)
    # Inserting test recall score and test AUC score into finale DataFrame
    finale.loc[index, 'test recall score'] = test_recall
    finale.loc[index, 'test AUC score'] = test_auc
    print_summary(ML_name, mdl, method, fitting_time, best_params, conf_matrix, complete_time, jobs, feature_importances=feature_importances)
    # Robustness check for certain models
    robust_models = ['SVC', 'RandomForestClassifier', 'XGBClassifier']
    if ML_name in robust_models:
        print("------------- Calculating {} classification error for Robustness checks -------------\n".format(ML_name))
        if ML_name in models:
            grid = RandomizedSearchCV(estimator=classifier, param_distributions=parameters, cv=10, n_iter=100, scoring="accuracy")
            robustment_test.loc[index, 'ML Model'] = ML_name
            grid.fit(X_train, y_train)
        else:
            grid = GridSearchCV(estimator=classifier, param_grid=parameters, cv=10, scoring="accuracy")
            robustment_test.loc[index, 'ML Model'] = ML_name
            grid.fit(X_train, y_train)
        training_error = 1 - grid.best_score_
        robustment_test.loc[index, 'Training Classification Error'] = training_error
        mdl = grid.best_estimator_.fit(X_train, y_train)
        pred = mdl.predict(X_test)
        test_error = 1 - accuracy_score(y_test, pred)
        robustment_test.loc[index, 'Test Classification_Error'] = test_error
        print("Training classification error: ", training_error)
        print("Testing classification error: ", test_error)
        print("\nRobustness checks complete!")
index = 0
for classifier, parameter in zip(classifiers, parameters):
    grid(classifier, parameter, parallel_jobs)
    index += 1
----------------------------------------------------------------------------------------------------
Finding the best hyperparameters for metric [roc_auc] for LogisticRegression

Performing Grid Search for LogisticRegression

Beginning to fit LogisticRegression

Optimal hyperparameters: {'C': 100, 'max_iter': 9}
Confusion matrix:

Hyperparameter tuning complete. Used 16 parallel jobs... Took 6.87 seconds.

----------------------------------------------------------------------------------------------------
Finding the best hyperparameters for metric [roc_auc] for SVC

Performing Grid Search for SVC

Beginning to fit SVC

Optimal hyperparameters: {'C': 10, 'gamma': 1, 'kernel': 'rbf'}
Confusion matrix:

Hyperparameter tuning complete. Used 16 parallel jobs... Took 2.66 seconds.

------------- Calculating SVC classification error for Robustness checks -------------

Training classification error:  0.09943019943019937
Testing classification error:  0.1160714285714286

Robustness checks complete!

----------------------------------------------------------------------------------------------------
Finding the best hyperparameters for metric [roc_auc] for AdaBoostClassifier

Performing Randomized Grid Search for AdaBoostClassifier

Beginning to fit AdaBoostClassifier

Optimal hyperparameters: {'n_estimators': 400, 'learning_rate': 0.8}
Confusion matrix:

================================ feature_importances ================================
M/F == 0.045
Age == 0.1325
EDUC == 0.0625
SES == 0.0625
MMSE == 0.0375
eTIV == 0.22
nWBV == 0.2975
ASF == 0.1425

Hyperparameter tuning complete. Used 16 parallel jobs... Took 66.95 seconds.

----------------------------------------------------------------------------------------------------
Finding the best hyperparameters for metric [roc_auc] for DecisionTreeClassifier

Performing Randomized Grid Search for DecisionTreeClassifier

Beginning to fit DecisionTreeClassifier

Optimal hyperparameters: {'splitter': 'best', 'min_samples_split': 6, 'max_leaf_nodes': 7, 'max_depth': 2, 'criterion': 'gini'}
Confusion matrix:

================================ feature_importances ================================
M/F == 0.0998291718520719
Age == 0.0
EDUC == 0.0
SES == 0.0
MMSE == 0.9001708281479281
eTIV == 0.0
nWBV == 0.0
ASF == 0.0

Hyperparameter tuning complete. Used 16 parallel jobs... Took 0.68 seconds.

----------------------------------------------------------------------------------------------------
Finding the best hyperparameters for metric [roc_auc] for RandomForestClassifier

Performing Randomized Grid Search for RandomForestClassifier

Beginning to fit RandomForestClassifier

Optimal hyperparameters: {'n_estimators': 300, 'min_samples_split': 5, 'min_samples_leaf': 2, 'max_leaf_nodes': 28}
Confusion matrix:

================================ feature_importances ================================
M/F == 0.06072693782658392
Age == 0.08072949191275151
EDUC == 0.08453418287763775
SES == 0.04067643562190814
MMSE == 0.3412779675591842
eTIV == 0.11370031119223188
nWBV == 0.16491149087601517
ASF == 0.11344318213368734

Hyperparameter tuning complete. Used 16 parallel jobs... Took 45.26 seconds.

------------- Calculating RandomForestClassifier classification error for Robustness checks -------------

Training classification error:  0.1495726495726496
Testing classification error:  0.1964285714285714

Robustness checks complete!

----------------------------------------------------------------------------------------------------
Finding the best hyperparameters for metric [roc_auc] for GradientBoostingClassifier

Performing Randomized Grid Search for GradientBoostingClassifier

Beginning to fit GradientBoostingClassifier

Optimal hyperparameters: {'n_estimators': 84, 'min_samples_leaf': 8, 'max_depth': 15, 'learning_rate': 0.8}
Confusion matrix:

================================ feature_importances ================================
M/F == 0.06288633186170128
Age == 0.06195875992764563
EDUC == 0.06696296486554618
SES == 0.04390265085288797
MMSE == 0.4892290215022818
eTIV == 0.09386109718420312
nWBV == 0.13803520750280088
ASF == 0.04316396630293312

Hyperparameter tuning complete. Used 16 parallel jobs... Took 12.89 seconds.

----------------------------------------------------------------------------------------------------
Finding the best hyperparameters for metric [roc_auc] for XGBClassifier

Performing Randomized Grid Search for XGBClassifier

Beginning to fit XGBClassifier

Optimal hyperparameters: {'subsample': 0.7, 'silent': True, 'reg_lambda': 0.1, 'n_estimators': 100, 'min_child_weight': 0.5, 'max_depth': 20, 'learning_rate': 0.2, 'gamma': 0.5, 'colsample_bytree': 0.8, 'colsample_bylevel': 0.8}
Confusion matrix:
================================ feature_importances ================================ M/F == 0.16292475 Age == 0.0908916 EDUC == 0.09762108 SES == 0.09007445 MMSE == 0.27741197 eTIV == 0.09610281 nWBV == 0.10608934 ASF == 0.07888401
Hyperparameter tuning complete. Used 16 parallel jobs... Took 2.41 seconds. ------------- Calculating XGBClassifier classification error for Robustness checks ------------- Training classification error: 0.1760683760683761 Testing classification error: 0.2053571428571429 Robustness checks complete!
finale.head(10)
finale[['ML Model','test AUC score']]
| | ML Model | Method | fitting time | optimal hyperparameters | best training AUC score | best estimator | test recall score | test AUC score |
|---|---|---|---|---|---|---|---|---|
| 0 | LogisticRegression | Grid Search | 6.858933 | {'C': 100, 'max_iter': 9} | 0.904043 | LogisticRegression(C=100, max_iter=9, random_s... | 0.7 | 0.744231 |
| 1 | SVC | Grid Search | 2.657477 | {'C': 10, 'gamma': 1, 'kernel': 'rbf'} | 0.970661 | SVC(C=10, gamma=1, random_state=42) | 0.866667 | 0.885256 |
| 2 | AdaBoostClassifier | Randomized Grid Search | 66.5541 | {'n_estimators': 400, 'learning_rate': 0.8} | 0.888874 | AdaBoostClassifier(learning_rate=0.8, n_estima... | 0.8 | 0.794231 |
| 3 | DecisionTreeClassifier | Randomized Grid Search | 0.671705 | {'splitter': 'best', 'min_samples_split': 6, '... | 0.85049 | DecisionTreeClassifier(max_depth=2, max_leaf_n... | 0.6 | 0.751923 |
| 4 | RandomForestClassifier | Randomized Grid Search | 45.050044 | {'n_estimators': 300, 'min_samples_split': 5, ... | 0.906121 | RandomForestClassifier(max_leaf_nodes=28, min_... | 0.766667 | 0.787179 |
| 5 | GradientBoostingClassifier | Randomized Grid Search | 12.803396 | {'n_estimators': 84, 'min_samples_leaf': 8, 'm... | 0.931262 | GradientBoostingClassifier(learning_rate=0.8, ... | 0.783333 | 0.805128 |
| 6 | XGBClassifier | Randomized Grid Search | 2.382946 | {'subsample': 0.7, 'silent': True, 'reg_lambda... | 0.913014 | XGBClassifier(base_score=None, booster=None, c... | 0.816667 | 0.841026 |
| | ML Model | test AUC score |
|---|---|---|
| 0 | LogisticRegression | 0.744231 |
| 1 | SVC | 0.885256 |
| 2 | AdaBoostClassifier | 0.794231 |
| 3 | DecisionTreeClassifier | 0.751923 |
| 4 | RandomForestClassifier | 0.787179 |
| 5 | GradientBoostingClassifier | 0.805128 |
| 6 | XGBClassifier | 0.841026 |
#Check the classification errors for the 3 models selected for the robustness checks
robustment_test
| | ML Model | Training Classification Error | Test Classification Error |
|---|---|---|---|
| 1 | SVC | 0.09943 | 0.116071 |
| 4 | RandomForestClassifier | 0.149573 | 0.196429 |
| 6 | XGBClassifier | 0.176068 | 0.205357 |
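The gap between the two error columns gives a quick read on how much each model overfits. A minimal sketch computing it from the robustment_test DataFrame above (illustrative only):
# Generalisation gap = test error minus training error for each benchmark model
gap = robustment_test['Test Classification Error'] - robustment_test['Training Classification Error']
robustment_test.assign(**{'Generalisation Gap': gap})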
Plotting the ROC curve for each classifier ...
##Setting the colour of the ROC curves for each model
colors = { 'LogisticRegression':'red',
'SVC':'green',
'DecisionTreeClassifier':'blue',
'AdaBoostClassifier':'yellow',
'RandomForestClassifier':'cyan',
'XGBClassifier':'magenta',
'GradientBoostingClassifier': 'black'
}
# The keys of total_fpr_opt are the model names
plt.figure(figsize=(20, 12))
for name in total_fpr_opt.keys():
plt.plot(total_fpr_opt[name], total_tpr_opt[name], color=colors[name], lw=1, label=name)
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Comparison of ROC curves for all classifiers')
plt.plot([0, 1], [0, 1], color='darkorange', lw=2, linestyle='--') # chance diagonal
plt.legend();
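Note that the curves above are built from hard 0/1 predictions, so each consists of only a few line segments. A fuller threshold sweep requires continuous scores. A minimal sketch, reusing the SVC parameters found by Grid Search above (illustrative, not part of the pipeline):
# ROC from continuous decision scores rather than hard class predictions
svc = SVC(C=10, gamma=1, kernel='rbf', random_state=42).fit(X_train, y_train)
scores = svc.decision_function(X_test)
fpr_s, tpr_s, _ = roc_curve(y_test, scores, pos_label=1)
print("AUC from decision scores: {:.4f}".format(auc(fpr_s, tpr_s)))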
Further Hyperparameter refinement using hyperopt¶
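A note on hyperopt's sign convention before the per-model searches: fmin minimizes whatever the objective returns, so a metric we want to maximize (ROC AUC here) must be negated inside the objective. A toy sketch (the quadratic objective and its bounds are invented purely for illustration):
from hyperopt import fmin, hp, tpe
def toy_objective(x):
    # we want to MAXIMISE f(x) = -(x - 3)**2, so return its negation for fmin
    return (x - 3) ** 2
best = fmin(fn=toy_objective, space=hp.uniform('x', 0.0, 6.0), algo=tpe.suggest, max_evals=25)
print(best)  # expected to land near {'x': 3.0}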
def Performance(model, y, X, fitting_time, best_params):
predictions = model.predict(X)
recall = recall_score(y, predictions, pos_label=1)
fpr, tpr, _ = roc_curve(y, predictions)
auc_score = auc(fpr, tpr)
print('Recall score: {:.4f}'.format(recall))
print('AUC score: {:.4f}'.format(auc_score))
# Update finale table
global index
finale.loc[index, 'ML Model'] = model.__class__.__name__
finale.loc[index, 'Method'] = 'Hyperopt tuning'
finale.loc[index, 'fitting time'] = str(fitting_time)
finale.loc[index, 'optimal hyperparameters'] = str(best_params)
# NOTE: this helper only sees the held-out set, so the test AUC is also what
# gets stored under 'best training AUC score'
finale.loc[index, 'best training AUC score'] = auc_score
finale.loc[index, 'best estimator'] = str(model)
finale.loc[index, 'test recall score'] = recall
finale.loc[index, 'test AUC score'] = auc_score
# Increment index for the next entry
index += 1
XGBoost hyperopt
def objective(params):
params = {
'n_estimators': params['n_estimators'],
'learning_rate': round(params['learning_rate'], 3),
'max_depth': int(params['max_depth']),
'gamma': round(params['gamma'], 3),
'min_child_weight': round(params['min_child_weight'], 3),
'colsample_bytree': round(params['colsample_bytree'], 3),
'random_state': params['random_state']
}
clf = XGBClassifier(**params)
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
score = cross_val_score(clf, X_train, y_train, scoring='roc_auc', cv=skf).mean()
print("roc_auc {:.3f} params {}".format(score, params))
training_scores.append(score)
# fmin minimizes the objective, so return the negated AUC in order to maximize it
return -score
space = {
'n_estimators': scope.int(hp.uniform('n_estimators', 100, 2000)),
'learning_rate': hp.uniform('learning_rate', 0.01, 0.4),
'max_depth': scope.int(hp.quniform('max_depth', 2, 14, 1)),
'colsample_bytree': hp.uniform('colsample_bytree', 0.6, 1.0),
'min_child_weight': hp.uniform('min_child_weight', 0.1, 0.6),
'gamma': hp.uniform('gamma', 0.0, 0.5),
'random_state': 42
}
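To sanity-check what this search space actually produces (for example, that the scope.int wrappers yield integers), one can draw a random configuration from it. A small illustrative sketch:
# Draw one random configuration from the space defined above
from hyperopt.pyll.stochastic import sample
print(sample(space))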
# Capture training scores and the fitting time as well
training_scores = []
start_time = time()
best = fmin(fn=objective, space=space, algo=tpe.suggest, max_evals=10)
best_params = space_eval(space, best)
fitting_time = time() - start_time
print("\nHyperopt completed in {} seconds\n".format(fitting_time))
print("The average training score was {}".format(sum(training_scores) / len(training_scores)))
roc_auc 0.887 params {'n_estimators': 1546, 'learning_rate': '0.129', 'max_depth': 4, 'gamma': '0.469', 'min_child_weight': '0.462', 'colsample_bytree': '0.954', 'random_state': 42} roc_auc 0.886 params {'n_estimators': 910, 'learning_rate': '0.064', 'max_depth': 6, 'gamma': '0.374', 'min_child_weight': '0.452', 'colsample_bytree': '0.952', 'random_state': 42} roc_auc 0.908 params {'n_estimators': 1777, 'learning_rate': '0.110', 'max_depth': 6, 'gamma': '0.267', 'min_child_weight': '0.388', 'colsample_bytree': '0.651', 'random_state': 42} roc_auc 0.900 params {'n_estimators': 896, 'learning_rate': '0.072', 'max_depth': 12, 'gamma': '0.195', 'min_child_weight': '0.135', 'colsample_bytree': '0.822', 'random_state': 42} roc_auc 0.889 params {'n_estimators': 563, 'learning_rate': '0.189', 'max_depth': 8, 'gamma': '0.300', 'min_child_weight': '0.149', 'colsample_bytree': '0.910', 'random_state': 42} roc_auc 0.891 params {'n_estimators': 469, 'learning_rate': '0.189', 'max_depth': 12, 'gamma': '0.146', 'min_child_weight': '0.587', 'colsample_bytree': '0.920', 'random_state': 42} roc_auc 0.890 params {'n_estimators': 1219, 'learning_rate': '0.299', 'max_depth': 12, 'gamma': '0.192', 'min_child_weight': '0.482', 'colsample_bytree': '0.971', 'random_state': 42} roc_auc 0.907 params {'n_estimators': 825, 'learning_rate': '0.336', 'max_depth': 5, 'gamma': '0.179', 'min_child_weight': '0.123', 'colsample_bytree': '0.721', 'random_state': 42} roc_auc 0.887 params {'n_estimators': 1065, 'learning_rate': '0.219', 'max_depth': 9, 'gamma': '0.346', 'min_child_weight': '0.462', 'colsample_bytree': '0.961', 'random_state': 42} roc_auc 0.900 params {'n_estimators': 699, 'learning_rate': '0.220', 'max_depth': 3, 'gamma': '0.159', 'min_child_weight': '0.392', 'colsample_bytree': '0.864', 'random_state': 42} 100%|██████████| 10/10 [00:04<00:00, 2.40trial/s, best loss: 0.8856679894179894] Hyperopt completed in 4.178892374038696 seconds The average training score was 0.894657671957672
Using the best parameters found by Hyperopt to calculate a Recall and AUC score:
# Call Performance function after hyperparameter tuning
mdl = XGBClassifier(**best_params)
mdl.fit(X_train, y_train)
Performance(mdl, y_test, X_test, fitting_time, best_params)
XGBClassifier(base_score=None, booster=None, callbacks=None, colsample_bylevel=None, colsample_bynode=None, colsample_bytree=0.9516347225548255, device=None, early_stopping_rounds=None, enable_categorical=False, eval_metric=None, feature_types=None, gamma=0.37420938791338665, grow_policy=None, importance_type=None, interaction_constraints=None, learning_rate=0.06443939093069395, max_bin=None, max_cat_threshold=None, max_cat_to_onehot=None, max_delta_step=None, max_depth=6, max_leaves=None, min_child_weight=0.4520573760002473, missing=nan, monotone_constraints=None, multi_strategy=None, n_estimators=910, n_jobs=None, num_parallel_tree=None, random_state=42, ...)
Recall score: 0.7833 AUC score: 0.8051
RandomForest hyperopt
def objective(params):
params = {
'max_depth': int(params['max_depth']),
'n_estimators': int(params['n_estimators']),
'random_state': params['random_state']
}
clf = RandomForestClassifier(**params)
score = cross_val_score(clf, X_train, y_train, scoring='roc_auc', cv=StratifiedKFold()).mean()
print("roc_auc {:.3f} params {}".format(score, params))
training_scores.append(score)
# fmin minimizes, so negate the AUC to maximize it
return -score
space = {
'max_depth': scope.int(hp.quniform('max_depth', 10, 30, 1)),
'n_estimators': scope.int(hp.quniform('n_estimators', 100, 1500, 25)),
'random_state': 42
}
# Capture training scores and the fitting time as well
training_scores = []
start_time = time()
best = fmin(fn=objective, space=space, algo=tpe.suggest, max_evals=10)
best_params = space_eval(space, best)
fitting_time = time() - start_time
print("\nHyperopt completed in {} seconds\n".format(fitting_time))
print("The average training score was {}".format(sum(training_scores) / len(training_scores)))
roc_auc 0.923 params {'max_depth': 26, 'n_estimators': 625, 'random_state': 42} roc_auc 0.923 params {'max_depth': 18, 'n_estimators': 450, 'random_state': 42} roc_auc 0.924 params {'max_depth': 13, 'n_estimators': 725, 'random_state': 42} roc_auc 0.923 params {'max_depth': 13, 'n_estimators': 225, 'random_state': 42} roc_auc 0.924 params {'max_depth': 27, 'n_estimators': 425, 'random_state': 42} roc_auc 0.918 params {'max_depth': 13, 'n_estimators': 150, 'random_state': 42} roc_auc 0.917 params {'max_depth': 25, 'n_estimators': 125, 'random_state': 42} roc_auc 0.922 params {'max_depth': 18, 'n_estimators': 575, 'random_state': 42} roc_auc 0.922 params {'max_depth': 12, 'n_estimators': 1450, 'random_state': 42} roc_auc 0.922 params {'max_depth': 28, 'n_estimators': 1200, 'random_state': 42} 100%|██████████| 10/10 [00:20<00:00, 2.09s/trial, best loss: 0.9165542328042328] Hyperopt completed in 20.87399196624756 seconds The average training score was 0.9216392857142857
# Call Performance function after hyperparameter tuning
mdl = RandomForestClassifier(**best_params)
mdl.fit(X_train, y_train)
Performance(mdl, y_test, X_test, fitting_time, best_params)
RandomForestClassifier(max_depth=25, n_estimators=125, random_state=42)
Recall score: 0.8333 AUC score: 0.8590
SVC hyperopt
def objective(params):
params = {
'C': params['C'],
'gamma': params['gamma'],
'kernel': params['kernel'],
'random_state': params['random_state']
}
clf = SVC(**params)
score = cross_val_score(clf, X_train, y_train, scoring='roc_auc', cv=StratifiedKFold()).mean()
print("roc_auc {:.3f} params {}".format(score, params))
training_scores.append(score)
# fmin minimizes, so negate the AUC to maximize it
return -score
space = {
'C': hp.uniform('C', 10, 11),
'gamma': hp.quniform('gamma', 0.35, 0.42, 0.01),
'kernel': 'rbf',
'random_state': 42
}
# Capture training scores and the fitting time as well
training_scores = []
start_time = time()
best = fmin(fn=objective, space=space, algo=tpe.suggest, max_evals=10)
best_params = space_eval(space, best)
fitting_time = time() - start_time
print("\nHyperopt completed in {} seconds\n".format(fitting_time))
print("The average training score was {}".format(sum(training_scores) / len(training_scores)))
roc_auc 0.931 params {'C': 10.864562445314965, 'gamma': 0.39, 'kernel': 'rbf', 'random_state': 42} roc_auc 0.933 params {'C': 10.050081000530929, 'gamma': 0.4, 'kernel': 'rbf', 'random_state': 42} roc_auc 0.933 params {'C': 10.59883981122294, 'gamma': 0.4, 'kernel': 'rbf', 'random_state': 42} roc_auc 0.929 params {'C': 10.8091990797788, 'gamma': 0.37, 'kernel': 'rbf', 'random_state': 42} roc_auc 0.928 params {'C': 10.01700859922015, 'gamma': 0.37, 'kernel': 'rbf', 'random_state': 42} roc_auc 0.937 params {'C': 10.970767038409553, 'gamma': 0.41000000000000003, 'kernel': 'rbf', 'random_state': 42} roc_auc 0.936 params {'C': 10.741369260572458, 'gamma': 0.41000000000000003, 'kernel': 'rbf', 'random_state': 42} roc_auc 0.935 params {'C': 10.08465967125343, 'gamma': 0.41000000000000003, 'kernel': 'rbf', 'random_state': 42} roc_auc 0.927 params {'C': 10.616443464899518, 'gamma': 0.36, 'kernel': 'rbf', 'random_state': 42} roc_auc 0.937 params {'C': 10.849924492903604, 'gamma': 0.42, 'kernel': 'rbf', 'random_state': 42} 100%|██████████| 10/10 [00:00<00:00, 37.84trial/s, best loss: 0.9266600529100529] Hyperopt completed in 0.2672393321990967 seconds The average training score was 0.9324859788359788
# Call Performance function after hyperparameter tuning
mdl = SVC(**best_params)
mdl.fit(X_train, y_train)
Performance(mdl, y_test, X_test, fitting_time, best_params)
SVC(C=10.616443464899518, gamma=0.36, random_state=42)
Recall score: 0.8667 AUC score: 0.9045
Comparing the two DataFrames -- finale and benchmark -- to show the differences due to hyperparameter tuning ...
def process_results(finale, benchmark):
# Rename column in benchmark DataFrame
benchmark.rename(columns={'ML Algo name': 'ML Model'}, inplace=True)
# Extract relevant columns from finale DataFrame and sort by ML Model
output = finale[['ML Model', 'Method', 'test AUC score']].sort_values('ML Model')
# Set Method column to 'untuned' in benchmark DataFrame
benchmark['Method'] = 'untuned'
# Extract relevant columns from benchmark DataFrame
benchmark = benchmark[['ML Model', 'Method', 'test AUC score']]
# Concatenate output and benchmark DataFrames, sort by ML Model, and set index
df = pd.concat([output, benchmark]).sort_values('ML Model')
df.set_index('ML Model', inplace=True)
return df
output = process_results(finale,benchmark)
output
| ML Model | Method | test AUC score |
|---|---|---|
| AdaBoost Classifier | untuned | 0.766667 |
| AdaBoostClassifier | Randomized Grid Search | 0.794231 |
| Decision Tree Classifier | untuned | 0.748718 |
| DecisionTreeClassifier | Randomized Grid Search | 0.751923 |
| GradientBoostingClassifier | Randomized Grid Search | 0.805128 |
| GradientBoostingClassifier | untuned | 0.821795 |
| Logistic Regression | untuned | 0.744231 |
| LogisticRegression | Grid Search | 0.744231 |
| Naive Bayes | untuned | 0.748077 |
| Random Forest Classifier | untuned | 0.86859 |
| RandomForestClassifier | Hyperopt tuning | 0.858974 |
| RandomForestClassifier | Randomized Grid Search | 0.787179 |
| SVC | Hyperopt tuning | 0.904487 |
| SVC | untuned | 0.776923 |
| SVC | Grid Search | 0.885256 |
| XGBClassifier | Hyperopt tuning | 0.805128 |
| XGBClassifier | untuned | 0.80641 |
| XGBClassifier | Randomized Grid Search | 0.841026 |
# Extract the 'untuned' test AUC score for each ML Model and store it in a dictionary
untuned_dict = {}
for model, group in output[output['Method'] == 'untuned'].groupby('ML Model'):
untuned_dict[model] = group.iloc[0]['test AUC score']
# List of tuning methods to compare against the untuned benchmark
methods_to_check = ['Randomized Grid Search', 'Hyperopt tuning', 'Grid Search']
# Print the test AUC score for every (model, method) pair that outperformed its untuned benchmark.
# NOTE: several untuned names (e.g. 'AdaBoost Classifier') contain spaces and so never match their
# tuned counterparts; only models named identically in both DataFrames are actually compared here.
for model, group in output.groupby('ML Model'):
if any(method in group['Method'].values for method in methods_to_check):
for method in methods_to_check:
if method in group['Method'].values:
method_score = group[group['Method'] == method]['test AUC score'].values[0]
if model in untuned_dict and method_score > untuned_dict[model]:
print(f"The following model and optimisation method outperformed its benchmark value of {untuned_dict[model]}")
print(f"ML Model: {model}")
print(f"Method: {method}")
print(f"Test AUC score: {method_score}")
print()
The following model and optimisation method outperformed its benchmark value of 0.7769230769230769 ML Model: SVC Method: Hyperopt tuning Test AUC score: 0.9044871794871795 The following model and optimisation method outperformed its benchmark value of 0.7769230769230769 ML Model: SVC Method: Grid Search Test AUC score: 0.8852564102564102 The following model and optimisation method outperformed its benchmark value of 0.8064102564102564 ML Model: XGBClassifier Method: Randomized Grid Search Test AUC score: 0.841025641025641
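For reference, the same benchmark comparison can be expressed as a single merge instead of nested loops. A sketch over the output DataFrame above; it inherits the same caveat that only identically-named models line up:
# Align tuned rows with their untuned benchmark via a merge on the model name
tuned = output[output['Method'] != 'untuned'].reset_index()
untuned = output[output['Method'] == 'untuned'].reset_index()[['ML Model', 'test AUC score']]
merged = tuned.merge(untuned, on='ML Model', suffixes=('', ' (untuned)'))
merged[merged['test AUC score'] > merged['test AUC score (untuned)']]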
#Define a meshgrid function that will plot the decision surface
def make_meshgrid(x, y, h=.02):
x_min, x_max = x.min() - 1, x.max() + 1
y_min, y_max = y.min() - 1, y.max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
return xx, yy
def plot_contours(ax, clf, xx, yy, **params):
Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])
Z = Z.reshape(xx.shape)
out = ax.contourf(xx, yy, Z, **params)
return out
#Select 2 features to compare -- EDUC and SES (columns 2 and 3) in this case
#Convert to array
X = np.array(X_test)
X = X[:, [2, 3]]
#ax1 --> benchmark model
#ax2 --> optimised model
fig, (ax1, ax2) = plt.subplots(1,2, sharey=True, figsize=(20,10))
title = ('Decision surface for Benchmark model')
X0, X1 = X[:, 0], X[:, 1]
xx, yy = make_meshgrid(X0, X1)
# NOTE: the models here are refit on the two selected test-set features purely to visualise the decision surface
mdl = SVC(C=1, gamma='auto', kernel='rbf', random_state=42).fit(X, y_test)
plot_contours(ax1, mdl, xx, yy, cmap=plt.cm.coolwarm, alpha=0.7)
ax1.scatter(X0, X1, c=y_test, cmap=plt.cm.coolwarm, s=20, marker='x', edgecolors='k')
ax1.set_xlabel(X_test.columns[2])
ax1.set_ylabel(X_test.columns[3])
ax1.set_xticks(())
ax1.set_yticks(())
ax1.set_title(title);
title = ('Decision surface for Optimized model')
mdl = SVC(C=10.94, gamma=0.36, kernel='rbf', random_state=42).fit(X, y_test)
plot_contours(ax2, mdl, xx, yy, cmap=plt.cm.coolwarm, alpha=0.7)
ax2.scatter(X0, X1, c=y_test, cmap=plt.cm.coolwarm, s=25, marker='x', edgecolors='k')
ax2.set_xlabel(X_test.columns[2])
ax2.set_ylabel(X_test.columns[3])
ax2.set_xticks(())
ax2.set_yticks(())
ax2.set_title(title);
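As an aside, recent scikit-learn releases (1.1+) ship DecisionBoundaryDisplay, which draws the same kind of surface without a hand-rolled meshgrid. A sketch assuming the two-feature array X and the fitted optimized mdl from the cell above:
# Equivalent decision-surface plot using sklearn's built-in helper
from sklearn.inspection import DecisionBoundaryDisplay
disp = DecisionBoundaryDisplay.from_estimator(mdl, X, response_method='predict', cmap=plt.cm.coolwarm, alpha=0.7)
disp.ax_.scatter(X0, X1, c=y_test, cmap=plt.cm.coolwarm, s=25, marker='x')
disp.ax_.set_title('Decision surface via DecisionBoundaryDisplay');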