%matplotlib inline

# IMPORT MODULES
import sys
import os
import time
import warnings
import platform
import pandas as pd
import numpy as np
import matplotlib 
import matplotlib.pyplot as plt 
import sklearn
from scipy.stats import norm
import seaborn as sns 
import hyperopt
import scipy
import xgboost

# Print Anaconda and Python versions
print(f"Anaconda Version is {sys.version}")
print(f"Python version is {platform.python_version()}")

# Print versions of imported libraries
print(f"Pandas version is: {pd.__version__}")
print(f"Numpy version is: {np.__version__}")
print(f"Matplotlib version is: {matplotlib.__version__}")
print(f"Sklearn version is: {sklearn.__version__}")
print(f"Seaborn version is: {sns.__version__}")
print(f"Hyperopt version is: {hyperopt.__version__}")
print(f"Scipy version is: {scipy.__version__}")
print(f"Xgboost version is: {xgboost.__version__}")

# Set seaborn settings
sns.set(context='paper', palette='winter_r', style='darkgrid', rc={'figure.facecolor': 'gray'}, font_scale=1.5)

# Filter out warnings (deprecations, etc.)
if not sys.warnoptions:
    warnings.simplefilter("ignore")
    os.environ["PYTHONWARNINGS"] = "ignore" # Also affect subprocesses

Anaconda Version is 3.10.13 (main, Mar 12 2024, 12:16:25) [GCC 12.2.0]
Python version is 3.10.13
Pandas version is: 2.2.1
Numpy version is: 1.26.4
Matplotlib version is: 3.8.3
Sklearn version is: 1.4.1.post1
Seaborn version is: 0.13.2
Hyperopt version is: 0.2.7
Scipy version is: 1.12.0
Xgboost version is: 2.0.3

#Alter ast_note_interactivity kernel option so jupyter displays multiple statements at once
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

import itertools
from time import time
from pprint import pprint
from pandas.plotting import scatter_matrix
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split, cross_val_score, cross_validate, GridSearchCV,  RandomizedSearchCV, KFold, StratifiedKFold
from sklearn.tree import DecisionTreeClassifier
from sklearn import naive_bayes
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from xgboost import XGBClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn import tree
from sklearn.metrics import accuracy_score, recall_score, precision_score, roc_curve, auc, classification_report, confusion_matrix
from scipy.stats import randint as sp_randint
from scipy.stats import uniform as sp_uniform
from sklearn.model_selection import cross_val_score, StratifiedKFold
##Importing hyperopt libraries
from functools import partial
from hyperopt import fmin, hp, tpe, Trials, space_eval
from hyperopt.pyll import scope

# 2.1 Data Exploration

# Get the current working directory
current_directory = os.getcwd()

# Construct the file path
file_path = os.path.join(current_directory, 'data', 'oasis_longitudinal_demographics.csv')

# Load in the data
mri = pd.read_csv(file_path)

# Get a feel for the data - observe first 5 rows
print("First 5 rows of the data:")
print(mri.head(5))

# Print out the number of columns in the dataframe (features)
num_columns = len(mri.columns)
print(f"\nThere are {num_columns} attributes within the dataset.")

# Check the columns of the dataset in prep for Data Exploration
print("\nColumns of the dataset:")
print(mri.columns)

First 5 rows of the data:
  Subject ID         MRI ID        Group  Visit  MR Delay M/F Hand  Age  EDUC  \
0  OAS2_0001  OAS2_0001_MR1  Nondemented      1         0   M    R   87    14   
1  OAS2_0001  OAS2_0001_MR2  Nondemented      2       457   M    R   88    14   
2  OAS2_0002  OAS2_0002_MR1     Demented      1         0   M    R   75    12   
3  OAS2_0002  OAS2_0002_MR2     Demented      2       560   M    R   76    12   
4  OAS2_0002  OAS2_0002_MR3     Demented      3      1895   M    R   80    12   

   SES  MMSE  CDR         eTIV   nWBV    ASF  
0  2.0  27.0  0.0  1986.550000  0.696  0.883  
1  2.0  30.0  0.0  2004.479526  0.681  0.876  
2  NaN  23.0  0.5  1678.290000  0.736  1.046  
3  NaN  28.0  0.5  1737.620000  0.713  1.010  
4  NaN  22.0  0.5  1697.911134  0.701  1.034  

There are 15 attributes within the dataset.

Columns of the dataset:
Index(['Subject ID', 'MRI ID', 'Group', 'Visit', 'MR Delay', 'M/F', 'Hand',
       'Age', 'EDUC', 'SES', 'MMSE', 'CDR', 'eTIV', 'nWBV', 'ASF'],
      dtype='object')

# Check the columns and their type - info prints this info about a dataframe
print("Information about the dataset:")
mri.info()

# Dimensions of the dataset
num_columns = len(mri.columns)
num_rows = len(mri.index)
print(f"\nThe dataset is made up with {num_columns} columns and {num_rows} rows.")

Information about the dataset:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 373 entries, 0 to 372
Data columns (total 15 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Subject ID  373 non-null    object 
 1   MRI ID      373 non-null    object 
 2   Group       373 non-null    object 
 3   Visit       373 non-null    int64  
 4   MR Delay    373 non-null    int64  
 5   M/F         373 non-null    object 
 6   Hand        373 non-null    object 
 7   Age         373 non-null    int64  
 8   EDUC        373 non-null    int64  
 9   SES         354 non-null    float64
 10  MMSE        371 non-null    float64
 11  CDR         373 non-null    float64
 12  eTIV        373 non-null    float64
 13  nWBV        373 non-null    float64
 14  ASF         373 non-null    float64
dtypes: float64(6), int64(4), object(5)
memory usage: 43.8+ KB

The dataset is made up with 15 columns and 373 rows.

# Let's check the distinct values in certain columns
# Common sense would suggest that Subject ID, MRI ID, MR Delay, eTIV, nWBV, and ASF features would have too many distinct values for the purpose of this test.
# Therefore let's ignore them

num_columns = ['Visit', 'Age', 'EDUC', 'SES', 'MMSE', 'CDR']

for col in num_columns:
    unique_values = mri[col].unique()
    print(f"{len(unique_values)} unique values for {col}:")
    print(unique_values)
    print("\nValue counts:")
    print(mri[col].value_counts(), "\n")

cat_columns = ['Group', 'M/F', 'Hand']
for col in cat_columns:
    unique_values = mri[col].unique()
    print(f"{len(unique_values)} unique values for {col}:")
    print(unique_values)
    print("\nValue counts:")
    print(mri[col].value_counts(), "\n")

5 unique values for Visit:
[1 2 3 4 5]

Value counts:
Visit
1    150
2    144
3     58
4     15
5      6
Name: count, dtype: int64 

39 unique values for Age:
[87 88 75 76 80 90 83 85 71 73 93 95 68 69 66 78 81 82 77 86 92 84 72 61
 64 74 60 62 91 79 89 70 94 97 65 67 63 96 98]

Value counts:
Age
73    26
75    22
78    21
80    20
81    18
71    18
82    17
76    16
77    16
68    14
84    13
69    13
83    12
70    12
74    12
72    11
79    11
66    10
88    10
86    10
85     9
89     7
65     6
67     6
87     6
90     5
62     4
91     4
61     4
92     4
64     3
93     3
63     3
60     2
95     1
94     1
97     1
96     1
98     1
Name: count, dtype: int64 

12 unique values for EDUC:
[14 12 18 16  8 20 13  6 17 15 23 11]

Value counts:
EDUC
12    103
16     81
18     64
14     33
13     27
15     17
20     13
11     11
8       9
17      9
6       3
23      3
Name: count, dtype: int64 

6 unique values for SES:
[ 2. nan  3.  4.  1.  5.]

Value counts:
SES
2.0    103
1.0     88
3.0     82
4.0     74
5.0      7
Name: count, dtype: int64 

19 unique values for MMSE:
[27. 30. 23. 28. 22. 29. 24. 21. 16. 25. 26. 15. 20. 19.  7.  4. 17. 18.
 nan]

Value counts:
MMSE
30.0    114
29.0     91
28.0     45
27.0     32
26.0     20
25.0     12
21.0     11
23.0     11
22.0      7
20.0      7
17.0      5
24.0      4
16.0      3
19.0      3
15.0      2
18.0      2
7.0       1
4.0       1
Name: count, dtype: int64 

4 unique values for CDR:
[0.  0.5 1.  2. ]

Value counts:
CDR
0.0    206
0.5    123
1.0     41
2.0      3
Name: count, dtype: int64 

3 unique values for Group:
['Nondemented' 'Demented' 'Converted']

Value counts:
Group
Nondemented    190
Demented       146
Converted       37
Name: count, dtype: int64 

2 unique values for M/F:
['M' 'F']

Value counts:
M/F
F    213
M    160
Name: count, dtype: int64 

1 unique values for Hand:
['R']

Value counts:
Hand
R    373
Name: count, dtype: int64

#Get a summary of the numerical attributes within our dataset - let's us see if we need to normalize/scale at a later point
for col in num_columns:
    mri[col].describe()

count    373.000000
mean       1.882038
std        0.922843
min        1.000000
25%        1.000000
50%        2.000000
75%        2.000000
max        5.000000
Name: Visit, dtype: float64

count    373.000000
mean      77.013405
std        7.640957
min       60.000000
25%       71.000000
50%       77.000000
75%       82.000000
max       98.000000
Name: Age, dtype: float64

count    373.000000
mean      14.597855
std        2.876339
min        6.000000
25%       12.000000
50%       15.000000
75%       16.000000
max       23.000000
Name: EDUC, dtype: float64

count    354.000000
mean       2.460452
std        1.134005
min        1.000000
25%        2.000000
50%        2.000000
75%        3.000000
max        5.000000
Name: SES, dtype: float64

count    371.000000
mean      27.342318
std        3.683244
min        4.000000
25%       27.000000
50%       29.000000
75%       30.000000
max       30.000000
Name: MMSE, dtype: float64

count    373.000000
mean       0.290885
std        0.374557
min        0.000000
25%        0.000000
50%        0.000000
75%        0.500000
max        2.000000
Name: CDR, dtype: float64

# Plotting histograms side by side for pairs of variables

# Define pairs of variables
pairs = [('Age', 'CDR'), ('SES', 'EDUC'), ('MMSE', 'M/F')]

# Plot histograms for each pair of variables
for pair in pairs:
    fig, axes = plt.subplots(1, 2, figsize=(14, 6))
    for i, col in enumerate(pair):
        sns.histplot(mri[col], ax=axes[i], kde=False)
        axes[i].set_title(f'Histogram of {col}')
        axes[i].set_xlabel(col)
        axes[i].set_ylabel('Frequency')
    plt.tight_layout()
# Adding ; to last statement to get rid of <matplotlib...> <seaborn...> text
plt.show();

# Create a copy of the dataframe so we can remove the 'converted' value from the 'Group' attribute
mri_copy = mri.copy()

mri_copy =mri_copy[mri_copy.Group != "Converted"]

def plot_facetgrid(data, var):
    """
    Function to create a FacetGrid chart based on the variable var.
    
    Parameters:
        data (DataFrame): The DataFrame containing the data.
        var (str): The variable to plot.
    """
    csfont_options = { 'fontname': 'monospace',
                   'weight': 'bold',
                   'color':'b',
                   'size': 'large'
                  }
    
    plot_colors_lookup = {
        'MMSE': {'color': ['r', 'k'], 'title': 'Dementia Variation against MMSE scores', 'xlim': (10, data['MMSE'].max())},
        'SES': {'color': ['r', 'k'], 'title': 'Variation of Dementia against Socio-Economic-Status', 'xlim': (0, data['SES'].max())},
        'nWBV': {'color': ['r', 'k'], 'title': 'Variation of Dementia against Normalized Whole Brain Volume', 'xlim': (0, data['nWBV'].max())},
        'eTIV': {'color': ['r', 'k'], 'title': 'Variation of Dementia against Estimated Total Intracranial volume', 'xlim': (0, data['eTIV'].max())},
        'EDUC': {'color': ['r', 'k'], 'title': 'Variation of Dementia against Years of Education', 'xlim': (3, data['EDUC'].max())},
        'Age': {'color': ['r', 'k'], 'title': 'Dementia Variation against Age', 'xlim': (10, data['Age'].max())}
    }
    
    # Lookup plot colors, title, and xlim based on the variable var
    plot_colors = plot_colors_lookup[var]['color']
    title = plot_colors_lookup[var]['title']
    xlim = plot_colors_lookup[var]['xlim']
    
    # Create FacetGrid
    g = sns.FacetGrid(data, hue='Group', hue_kws={'color': plot_colors}, height=8)
    
    # Plot KDE plot
    g.map(sns.kdeplot, var, shade=True)
    
    # Set x-axis limit
    g.set(xlim=xlim)
    
    # Add legend
    plt.legend(loc='best')
    
    # Set title
    plt.title(title, **csfont_options)
    
    # Show plot
    plt.show()

plot_facetgrid(mri_copy, 'MMSE')

plot_facetgrid(mri_copy, 'SES')

plot_facetgrid(mri_copy, 'nWBV')

plot_facetgrid(mri_copy, 'eTIV')

plot_facetgrid(mri_copy, 'EDUC')

plot_facetgrid(mri_copy, 'Age')

# Define the target variable and features
targ = 'Group'
cols = ['MMSE', 'SES', 'nWBV', 'eTIV', 'EDUC', 'Age']

# Define font options for the title
csfont_options = {'fontname': 'monospace', 'weight': 'bold', 'color': 'b', 'size': 'large'}

# Set min x-axis values for each feature
xlimits = [12, 0, 0.55, 840, 3, 50]

# Define plot colors for different groups
plot_colors = {'Converted': 'r', 'Demented': 'k', 'Non-Demented': 'b'}

# Iterate over the target variable and features
for feature, xlimit in zip(cols, xlimits):
    g = sns.FacetGrid(mri, hue=targ, height=8)
    g.map(sns.kdeplot, feature, shade=True)
    g.set(xlim=(10, mri[feature].max()))
    plt.legend(loc='best')
    plt.title('Dementia Variation against {}'.format(feature), **csfont_options)
    plt.xlim(xlimit)

plt.show();

g = sns.catplot(x='M/F', 
                data=mri_copy, hue='Group', 
                kind='count', palette="muted", 
                legend=False, height=6)

g.despine(left=True)
g.set_ylabels("Participation count")
plt.legend(loc='best')
plt.show();

No artists with labels found to put in legend.  Note that artists whose label start with an underscore are ignored when legend() is called with no argument.

def plot_corr_heatmap(data, method, heading):
    """
    Plot correlation heatmap for the given data.

    Parameters:
        data (DataFrame): The DataFrame containing the data.
        method (str): The method used for computing correlations.
        heading (str): The title for the heatmap.
    """
    # Copy mri dataset and preprocess
    mri_copy = data.drop(['Subject ID', 'M/F', 'MRI ID', 'Hand'], axis=1)
    mri_copy = pd.get_dummies(mri_copy, drop_first=True)

    # Compute correlation matrix
    corr = mri_copy.corr(method=method)

    # Set up the matplotlib figure
    f, ax = plt.subplots(figsize=(12, 8))

    # Draw the heatmap
    sns.heatmap(corr, annot=True, linewidths=.5, fmt='.2f', ax=ax)

    # Set title
    ax.set_title(heading)

    # Show plot
    plt.show()

    return corr

plot_corr_heatmap(mri, 'pearson', 'Correlation Heat map')

scatter_matrix(mri[mri.columns], figsize=(12,8));

# Display unique values before and after replacement
for col in ['Group', 'M/F']:
    print(f"Before replacement {col} has the following values: {mri[col].unique()}")

# Create a mapping dictionary for replacement
mapping = {'F': 0, 'M': 1, 'Nondemented': 0, 'Demented': 1, 'Converted': 1}

# Replace categorical values with numerical values
mri.replace({'Group': mapping, 'M/F': mapping}, inplace=True)

# Display unique values after replacement
for col in ['Group', 'M/F']:
    print(f"After replacement {col} has the following numerical values: {mri[col].unique()}")

Before replacement Group has the following values: ['Nondemented' 'Demented' 'Converted']
Before replacement M/F has the following values: ['M' 'F']
After replacement Group has the following numerical values: [0 1]
After replacement M/F has the following numerical values: [1 0]

## Using LabelEncoder from scikit learn to encode categorical features to numerical.
obj_mri = mri.select_dtypes(include=['object'])

# Converting categorical Data 
for col in obj_mri.columns:
    le = LabelEncoder()
    le.fit(mri[col])
    list(le.classes_)
    mri[col] = le.transform(mri[col])

LabelEncoder()

LabelEncoder()

['OAS2_0001',
 'OAS2_0002',
 'OAS2_0004',
 'OAS2_0005',
 'OAS2_0007',
 'OAS2_0008',
 'OAS2_0009',
 'OAS2_0010',
 'OAS2_0012',
 'OAS2_0013',
 'OAS2_0014',
 'OAS2_0016',
 'OAS2_0017',
 'OAS2_0018',
 'OAS2_0020',
 'OAS2_0021',
 'OAS2_0022',
 'OAS2_0023',
 'OAS2_0026',
 'OAS2_0027',
 'OAS2_0028',
 'OAS2_0029',
 'OAS2_0030',
 'OAS2_0031',
 'OAS2_0032',
 'OAS2_0034',
 'OAS2_0035',
 'OAS2_0036',
 'OAS2_0037',
 'OAS2_0039',
 'OAS2_0040',
 'OAS2_0041',
 'OAS2_0042',
 'OAS2_0043',
 'OAS2_0044',
 'OAS2_0045',
 'OAS2_0046',
 'OAS2_0047',
 'OAS2_0048',
 'OAS2_0049',
 'OAS2_0050',
 'OAS2_0051',
 'OAS2_0052',
 'OAS2_0053',
 'OAS2_0054',
 'OAS2_0055',
 'OAS2_0056',
 'OAS2_0057',
 'OAS2_0058',
 'OAS2_0060',
 'OAS2_0061',
 'OAS2_0062',
 'OAS2_0063',
 'OAS2_0064',
 'OAS2_0066',
 'OAS2_0067',
 'OAS2_0068',
 'OAS2_0069',
 'OAS2_0070',
 'OAS2_0071',
 'OAS2_0073',
 'OAS2_0075',
 'OAS2_0076',
 'OAS2_0077',
 'OAS2_0078',
 'OAS2_0079',
 'OAS2_0080',
 'OAS2_0081',
 'OAS2_0085',
 'OAS2_0086',
 'OAS2_0087',
 'OAS2_0088',
 'OAS2_0089',
 'OAS2_0090',
 'OAS2_0091',
 'OAS2_0092',
 'OAS2_0094',
 'OAS2_0095',
 'OAS2_0096',
 'OAS2_0097',
 'OAS2_0098',
 'OAS2_0099',
 'OAS2_0100',
 'OAS2_0101',
 'OAS2_0102',
 'OAS2_0103',
 'OAS2_0104',
 'OAS2_0105',
 'OAS2_0106',
 'OAS2_0108',
 'OAS2_0109',
 'OAS2_0111',
 'OAS2_0112',
 'OAS2_0113',
 'OAS2_0114',
 'OAS2_0116',
 'OAS2_0117',
 'OAS2_0118',
 'OAS2_0119',
 'OAS2_0120',
 'OAS2_0121',
 'OAS2_0122',
 'OAS2_0124',
 'OAS2_0126',
 'OAS2_0127',
 'OAS2_0128',
 'OAS2_0129',
 'OAS2_0131',
 'OAS2_0133',
 'OAS2_0134',
 'OAS2_0135',
 'OAS2_0137',
 'OAS2_0138',
 'OAS2_0139',
 'OAS2_0140',
 'OAS2_0141',
 'OAS2_0142',
 'OAS2_0143',
 'OAS2_0144',
 'OAS2_0145',
 'OAS2_0146',
 'OAS2_0147',
 'OAS2_0149',
 'OAS2_0150',
 'OAS2_0152',
 'OAS2_0154',
 'OAS2_0156',
 'OAS2_0157',
 'OAS2_0158',
 'OAS2_0159',
 'OAS2_0160',
 'OAS2_0161',
 'OAS2_0162',
 'OAS2_0164',
 'OAS2_0165',
 'OAS2_0169',
 'OAS2_0171',
 'OAS2_0172',
 'OAS2_0174',
 'OAS2_0175',
 'OAS2_0176',
 'OAS2_0177',
 'OAS2_0178',
 'OAS2_0179',
 'OAS2_0181',
 'OAS2_0182',
 'OAS2_0183',
 'OAS2_0184',
 'OAS2_0185',
 'OAS2_0186']

LabelEncoder()

LabelEncoder()

['OAS2_0001_MR1',
 'OAS2_0001_MR2',
 'OAS2_0002_MR1',
 'OAS2_0002_MR2',
 'OAS2_0002_MR3',
 'OAS2_0004_MR1',
 'OAS2_0004_MR2',
 'OAS2_0005_MR1',
 'OAS2_0005_MR2',
 'OAS2_0005_MR3',
 'OAS2_0007_MR1',
 'OAS2_0007_MR3',
 'OAS2_0007_MR4',
 'OAS2_0008_MR1',
 'OAS2_0008_MR2',
 'OAS2_0009_MR1',
 'OAS2_0009_MR2',
 'OAS2_0010_MR1',
 'OAS2_0010_MR2',
 'OAS2_0012_MR1',
 'OAS2_0012_MR2',
 'OAS2_0012_MR3',
 'OAS2_0013_MR1',
 'OAS2_0013_MR2',
 'OAS2_0013_MR3',
 'OAS2_0014_MR1',
 'OAS2_0014_MR2',
 'OAS2_0016_MR1',
 'OAS2_0016_MR2',
 'OAS2_0017_MR1',
 'OAS2_0017_MR3',
 'OAS2_0017_MR4',
 'OAS2_0017_MR5',
 'OAS2_0018_MR1',
 'OAS2_0018_MR3',
 'OAS2_0018_MR4',
 'OAS2_0020_MR1',
 'OAS2_0020_MR2',
 'OAS2_0020_MR3',
 'OAS2_0021_MR1',
 'OAS2_0021_MR2',
 'OAS2_0022_MR1',
 'OAS2_0022_MR2',
 'OAS2_0023_MR1',
 'OAS2_0023_MR2',
 'OAS2_0026_MR1',
 'OAS2_0026_MR2',
 'OAS2_0027_MR1',
 'OAS2_0027_MR2',
 'OAS2_0027_MR3',
 'OAS2_0027_MR4',
 'OAS2_0028_MR1',
 'OAS2_0028_MR2',
 'OAS2_0029_MR1',
 'OAS2_0029_MR2',
 'OAS2_0030_MR1',
 'OAS2_0030_MR2',
 'OAS2_0031_MR1',
 'OAS2_0031_MR2',
 'OAS2_0031_MR3',
 'OAS2_0032_MR1',
 'OAS2_0032_MR2',
 'OAS2_0034_MR1',
 'OAS2_0034_MR2',
 'OAS2_0034_MR3',
 'OAS2_0034_MR4',
 'OAS2_0035_MR1',
 'OAS2_0035_MR2',
 'OAS2_0036_MR1',
 'OAS2_0036_MR3',
 'OAS2_0036_MR4',
 'OAS2_0036_MR5',
 'OAS2_0037_MR1',
 'OAS2_0037_MR2',
 'OAS2_0037_MR3',
 'OAS2_0037_MR4',
 'OAS2_0039_MR1',
 'OAS2_0039_MR2',
 'OAS2_0040_MR1',
 'OAS2_0040_MR2',
 'OAS2_0040_MR3',
 'OAS2_0041_MR1',
 'OAS2_0041_MR2',
 'OAS2_0041_MR3',
 'OAS2_0042_MR1',
 'OAS2_0042_MR2',
 'OAS2_0043_MR1',
 'OAS2_0043_MR2',
 'OAS2_0044_MR1',
 'OAS2_0044_MR2',
 'OAS2_0044_MR3',
 'OAS2_0045_MR1',
 'OAS2_0045_MR2',
 'OAS2_0046_MR1',
 'OAS2_0046_MR2',
 'OAS2_0047_MR1',
 'OAS2_0047_MR2',
 'OAS2_0048_MR1',
 'OAS2_0048_MR2',
 'OAS2_0048_MR3',
 'OAS2_0048_MR4',
 'OAS2_0048_MR5',
 'OAS2_0049_MR1',
 'OAS2_0049_MR2',
 'OAS2_0049_MR3',
 'OAS2_0050_MR1',
 'OAS2_0050_MR2',
 'OAS2_0051_MR1',
 'OAS2_0051_MR2',
 'OAS2_0051_MR3',
 'OAS2_0052_MR1',
 'OAS2_0052_MR2',
 'OAS2_0053_MR1',
 'OAS2_0053_MR2',
 'OAS2_0054_MR1',
 'OAS2_0054_MR2',
 'OAS2_0055_MR1',
 'OAS2_0055_MR2',
 'OAS2_0056_MR1',
 'OAS2_0056_MR2',
 'OAS2_0057_MR1',
 'OAS2_0057_MR2',
 'OAS2_0057_MR3',
 'OAS2_0058_MR1',
 'OAS2_0058_MR2',
 'OAS2_0058_MR3',
 'OAS2_0060_MR1',
 'OAS2_0060_MR2',
 'OAS2_0061_MR1',
 'OAS2_0061_MR2',
 'OAS2_0061_MR3',
 'OAS2_0062_MR1',
 'OAS2_0062_MR2',
 'OAS2_0062_MR3',
 'OAS2_0063_MR1',
 'OAS2_0063_MR2',
 'OAS2_0064_MR1',
 'OAS2_0064_MR2',
 'OAS2_0064_MR3',
 'OAS2_0066_MR1',
 'OAS2_0066_MR2',
 'OAS2_0067_MR1',
 'OAS2_0067_MR2',
 'OAS2_0067_MR3',
 'OAS2_0067_MR4',
 'OAS2_0068_MR1',
 'OAS2_0068_MR2',
 'OAS2_0069_MR1',
 'OAS2_0069_MR2',
 'OAS2_0070_MR1',
 'OAS2_0070_MR2',
 'OAS2_0070_MR3',
 'OAS2_0070_MR4',
 'OAS2_0070_MR5',
 'OAS2_0071_MR1',
 'OAS2_0071_MR2',
 'OAS2_0073_MR1',
 'OAS2_0073_MR2',
 'OAS2_0073_MR3',
 'OAS2_0073_MR4',
 'OAS2_0073_MR5',
 'OAS2_0075_MR1',
 'OAS2_0075_MR2',
 'OAS2_0076_MR1',
 'OAS2_0076_MR2',
 'OAS2_0076_MR3',
 'OAS2_0077_MR1',
 'OAS2_0077_MR2',
 'OAS2_0078_MR1',
 'OAS2_0078_MR2',
 'OAS2_0078_MR3',
 'OAS2_0079_MR1',
 'OAS2_0079_MR2',
 'OAS2_0079_MR3',
 'OAS2_0080_MR1',
 'OAS2_0080_MR2',
 'OAS2_0080_MR3',
 'OAS2_0081_MR1',
 'OAS2_0081_MR2',
 'OAS2_0085_MR1',
 'OAS2_0085_MR2',
 'OAS2_0086_MR1',
 'OAS2_0086_MR2',
 'OAS2_0087_MR1',
 'OAS2_0087_MR2',
 'OAS2_0088_MR1',
 'OAS2_0088_MR2',
 'OAS2_0089_MR1',
 'OAS2_0089_MR3',
 'OAS2_0090_MR1',
 'OAS2_0090_MR2',
 'OAS2_0090_MR3',
 'OAS2_0091_MR1',
 'OAS2_0091_MR2',
 'OAS2_0092_MR1',
 'OAS2_0092_MR2',
 'OAS2_0094_MR1',
 'OAS2_0094_MR2',
 'OAS2_0095_MR1',
 'OAS2_0095_MR2',
 'OAS2_0095_MR3',
 'OAS2_0096_MR1',
 'OAS2_0096_MR2',
 'OAS2_0097_MR1',
 'OAS2_0097_MR2',
 'OAS2_0098_MR1',
 'OAS2_0098_MR2',
 'OAS2_0099_MR1',
 'OAS2_0099_MR2',
 'OAS2_0100_MR1',
 'OAS2_0100_MR2',
 'OAS2_0100_MR3',
 'OAS2_0101_MR1',
 'OAS2_0101_MR2',
 'OAS2_0101_MR3',
 'OAS2_0102_MR1',
 'OAS2_0102_MR2',
 'OAS2_0102_MR3',
 'OAS2_0103_MR1',
 'OAS2_0103_MR2',
 'OAS2_0103_MR3',
 'OAS2_0104_MR1',
 'OAS2_0104_MR2',
 'OAS2_0105_MR1',
 'OAS2_0105_MR2',
 'OAS2_0106_MR1',
 'OAS2_0106_MR2',
 'OAS2_0108_MR1',
 'OAS2_0108_MR2',
 'OAS2_0109_MR1',
 'OAS2_0109_MR2',
 'OAS2_0111_MR1',
 'OAS2_0111_MR2',
 'OAS2_0112_MR1',
 'OAS2_0112_MR2',
 'OAS2_0113_MR1',
 'OAS2_0113_MR2',
 'OAS2_0114_MR1',
 'OAS2_0114_MR2',
 'OAS2_0116_MR1',
 'OAS2_0116_MR2',
 'OAS2_0117_MR1',
 'OAS2_0117_MR2',
 'OAS2_0117_MR3',
 'OAS2_0117_MR4',
 'OAS2_0118_MR1',
 'OAS2_0118_MR2',
 'OAS2_0119_MR1',
 'OAS2_0119_MR2',
 'OAS2_0119_MR3',
 'OAS2_0120_MR1',
 'OAS2_0120_MR2',
 'OAS2_0121_MR1',
 'OAS2_0121_MR2',
 'OAS2_0122_MR1',
 'OAS2_0122_MR2',
 'OAS2_0124_MR1',
 'OAS2_0124_MR2',
 'OAS2_0126_MR1',
 'OAS2_0126_MR2',
 'OAS2_0126_MR3',
 'OAS2_0127_MR1',
 'OAS2_0127_MR2',
 'OAS2_0127_MR3',
 'OAS2_0127_MR4',
 'OAS2_0127_MR5',
 'OAS2_0128_MR1',
 'OAS2_0128_MR2',
 'OAS2_0129_MR1',
 'OAS2_0129_MR2',
 'OAS2_0129_MR3',
 'OAS2_0131_MR1',
 'OAS2_0131_MR2',
 'OAS2_0133_MR1',
 'OAS2_0133_MR3',
 'OAS2_0134_MR1',
 'OAS2_0134_MR2',
 'OAS2_0135_MR1',
 'OAS2_0135_MR2',
 'OAS2_0137_MR1',
 'OAS2_0137_MR2',
 'OAS2_0138_MR1',
 'OAS2_0138_MR2',
 'OAS2_0139_MR1',
 'OAS2_0139_MR2',
 'OAS2_0140_MR1',
 'OAS2_0140_MR2',
 'OAS2_0140_MR3',
 'OAS2_0141_MR1',
 'OAS2_0141_MR2',
 'OAS2_0142_MR1',
 'OAS2_0142_MR2',
 'OAS2_0143_MR1',
 'OAS2_0143_MR2',
 'OAS2_0143_MR3',
 'OAS2_0144_MR1',
 'OAS2_0144_MR2',
 'OAS2_0145_MR1',
 'OAS2_0145_MR2',
 'OAS2_0146_MR1',
 'OAS2_0146_MR2',
 'OAS2_0147_MR1',
 'OAS2_0147_MR2',
 'OAS2_0147_MR3',
 'OAS2_0147_MR4',
 'OAS2_0149_MR1',
 'OAS2_0149_MR2',
 'OAS2_0150_MR1',
 'OAS2_0150_MR2',
 'OAS2_0152_MR1',
 'OAS2_0152_MR2',
 'OAS2_0152_MR3',
 'OAS2_0154_MR1',
 'OAS2_0154_MR2',
 'OAS2_0156_MR1',
 'OAS2_0156_MR2',
 'OAS2_0157_MR1',
 'OAS2_0157_MR2',
 'OAS2_0158_MR1',
 'OAS2_0158_MR2',
 'OAS2_0159_MR1',
 'OAS2_0159_MR2',
 'OAS2_0160_MR1',
 'OAS2_0160_MR2',
 'OAS2_0161_MR1',
 'OAS2_0161_MR2',
 'OAS2_0161_MR3',
 'OAS2_0162_MR1',
 'OAS2_0162_MR2',
 'OAS2_0164_MR1',
 'OAS2_0164_MR2',
 'OAS2_0165_MR1',
 'OAS2_0165_MR2',
 'OAS2_0169_MR1',
 'OAS2_0169_MR2',
 'OAS2_0171_MR1',
 'OAS2_0171_MR2',
 'OAS2_0171_MR3',
 'OAS2_0172_MR1',
 'OAS2_0172_MR2',
 'OAS2_0174_MR1',
 'OAS2_0174_MR2',
 'OAS2_0174_MR3',
 'OAS2_0175_MR1',
 'OAS2_0175_MR2',
 'OAS2_0175_MR3',
 'OAS2_0176_MR1',
 'OAS2_0176_MR2',
 'OAS2_0176_MR3',
 'OAS2_0177_MR1',
 'OAS2_0177_MR2',
 'OAS2_0178_MR1',
 'OAS2_0178_MR2',
 'OAS2_0178_MR3',
 'OAS2_0179_MR1',
 'OAS2_0179_MR2',
 'OAS2_0181_MR1',
 'OAS2_0181_MR2',
 'OAS2_0181_MR3',
 'OAS2_0182_MR1',
 'OAS2_0182_MR2',
 'OAS2_0183_MR1',
 'OAS2_0183_MR2',
 'OAS2_0183_MR3',
 'OAS2_0183_MR4',
 'OAS2_0184_MR1',
 'OAS2_0184_MR2',
 'OAS2_0185_MR1',
 'OAS2_0185_MR2',
 'OAS2_0185_MR3',
 'OAS2_0186_MR1',
 'OAS2_0186_MR2',
 'OAS2_0186_MR3']

LabelEncoder()

LabelEncoder()

['R']

# Check that all columns are now numerical
mri.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 373 entries, 0 to 372
Data columns (total 15 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Subject ID  373 non-null    int64  
 1   MRI ID      373 non-null    int64  
 2   Group       373 non-null    int64  
 3   Visit       373 non-null    int64  
 4   MR Delay    373 non-null    int64  
 5   M/F         373 non-null    int64  
 6   Hand        373 non-null    int64  
 7   Age         373 non-null    int64  
 8   EDUC        373 non-null    int64  
 9   SES         354 non-null    float64
 10  MMSE        371 non-null    float64
 11  CDR         373 non-null    float64
 12  eTIV        373 non-null    float64
 13  nWBV        373 non-null    float64
 14  ASF         373 non-null    float64
dtypes: float64(6), int64(9)
memory usage: 43.8 KB

def outlier_detection(cols):
    """
    Detect outliers in a list of values using z-score method.
    
    Parameters:
        cols (list): A list of numerical values.
    
    Returns:
        outliers (tuple): A tuple containing the indices of outliers.
    """
    mean_col = np.mean(cols)
    std_col = np.std(cols)
    threshold = 3
    z_scores = [(col - mean_col) / std_col for col in cols]
    outliers = np.where(np.abs(z_scores) > threshold)
    return outliers

cols = ["EDUC", "SES", "MMSE", "CDR", "eTIV", "Visit", "eTIV", "nWBV", "ASF"]

for col in cols:
    outliers_indices = outlier_detection(mri[col])
    if len(outliers_indices[0]) > 0:
        print(f"Outliers for {col} are given below:")
        for i in outliers_indices[0]:
            outlier_value = mri[col].iloc[i]
            print(f"Index: {i}, Value: {outlier_value}")
        print()

Outliers for MMSE are given below:
Index: 26, Value: 16.0
Index: 89, Value: 15.0
Index: 100, Value: 7.0
Index: 101, Value: 4.0
Index: 172, Value: 16.0
Index: 173, Value: 16.0
Index: 251, Value: 15.0

Outliers for CDR are given below:
Index: 184, Value: 2.0
Index: 251, Value: 2.0
Index: 330, Value: 2.0

Outliers for Visit are given below:
Index: 32, Value: 5
Index: 71, Value: 5
Index: 101, Value: 5
Index: 153, Value: 5
Index: 160, Value: 5
Index: 265, Value: 5

#Check which columns contain null values
nulls = mri.columns[mri.isnull().any()]

#Check how many nulls in each column
mri[nulls].isnull().sum()

SES     19
MMSE     2
dtype: int64

# Imputation - only works on numerical attributes, hence why we encoded the categorical text features in 3.1
# Impute missing values with the median value of the attribute
imputer = SimpleImputer(strategy="median")

# Final check to make sure all columns are numerical before proceeding with imputation
for column in mri.columns:
    if mri[column].dtype == 'object':
        print(f"{column} of type object found!")
    else:
        print(f"{column} is a numerical attribute - proceed with imputation")

# Fitting imputer instance
imputer.fit(mri)

# Check if the median computed by the imputer matches the median of the original data
print("Medians match:", (imputer.statistics_ == mri.median().values).all())

# Transforming mri data to replace null values via imputation
X = imputer.transform(mri)

# Constructing dataframe from transformed values
mri = pd.DataFrame(X, columns=mri.columns)

Subject ID is a numerical attribute - proceed with imputation
MRI ID is a numerical attribute - proceed with imputation
Group is a numerical attribute - proceed with imputation
Visit is a numerical attribute - proceed with imputation
MR Delay is a numerical attribute - proceed with imputation
M/F is a numerical attribute - proceed with imputation
Hand is a numerical attribute - proceed with imputation
Age is a numerical attribute - proceed with imputation
EDUC is a numerical attribute - proceed with imputation
SES is a numerical attribute - proceed with imputation
MMSE is a numerical attribute - proceed with imputation
CDR is a numerical attribute - proceed with imputation
eTIV is a numerical attribute - proceed with imputation
nWBV is a numerical attribute - proceed with imputation
ASF is a numerical attribute - proceed with imputation

SimpleImputer(strategy='median')

SimpleImputer(strategy='median')

Medians match: True

#Check that nulls have been replaced
mri[nulls].isnull().sum()

SES     0
MMSE    0
dtype: int64

cols = ['Age', 'M/F', 'EDUC', 'SES', 'MMSE', 'eTIV', 'nWBV', 'ASF']

max_values = mri[cols].max()
min_values = mri[cols].min()

print(f"max value in mri dataset for {cols} before scaling:\n{max_values}")
print(f"min value in mri dataset for {cols} before scaling:\n{min_values}")

max value in mri dataset for ['Age', 'M/F', 'EDUC', 'SES', 'MMSE', 'eTIV', 'nWBV', 'ASF'] before scaling:
Age       98.000000
M/F        1.000000
EDUC      23.000000
SES        5.000000
MMSE      30.000000
eTIV    2004.479526
nWBV       0.837000
ASF        1.587000
dtype: float64
min value in mri dataset for ['Age', 'M/F', 'EDUC', 'SES', 'MMSE', 'eTIV', 'nWBV', 'ASF'] before scaling:
Age       60.000000
M/F        0.000000
EDUC       6.000000
SES        1.000000
MMSE       4.000000
eTIV    1105.652499
nWBV       0.644000
ASF        0.876000
dtype: float64

scaler = StandardScaler().fit(mri)
mri[cols]=scaler.fit_transform(mri[cols])

cols = ['Age', 'M/F', 'EDUC', 'SES', 'MMSE', 'eTIV', 'nWBV', 'ASF']

max_values = mri[cols].max()
min_values = mri[cols].min()

print(f"max value in mri dataset for {cols} before scaling:\n{max_values}")
print(f"min value in mri dataset for {cols} before scaling:\n{min_values}")

max value in mri dataset for ['Age', 'M/F', 'EDUC', 'SES', 'MMSE', 'eTIV', 'nWBV', 'ASF'] before scaling:
Age     2.750282
M/F     1.153798
EDUC    2.925048
SES     2.313556
MMSE    0.721664
eTIV    2.935525
nWBV    2.896887
ASF     2.839157
dtype: float64
min value in mri dataset for ['Age', 'M/F', 'EDUC', 'SES', 'MMSE', 'eTIV', 'nWBV', 'ASF'] before scaling:
Age    -2.229597
M/F    -0.866703
EDUC   -2.993181
SES    -1.297140
MMSE   -6.362035
eTIV   -2.174359
nWBV   -2.307345
ASF    -2.316501
dtype: float64

# Drop unneccessary features and also the highly correlated feature CDR from the dataframe
mri = mri.drop(['Subject ID', 'Visit', 'MRI ID', 'Hand', 'MR Delay', 'CDR'], axis=1)

mri.head(10)

seed = np.random.seed(42)
y =  mri['Group']   #Target 
X = mri.drop(['Group'], axis=1) # Features

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

#Create copies of our train/test sets for benchmark testing
BM_X_train = X_train.copy()
BM_X_test = X_test.copy()
BM_y_train = y_train.copy()
BM_y_test = y_test.copy()

#Create further copies incase of further optimization techniques
Copy_X_train = X_train.copy()
Copy_X_test = X_test.copy()
Copy_y_train = y_train.copy()
Copy_y_test = y_test.copy()

print("X_train set is made up of {} rows and {} columns\n".format(len(X_train.index), len(X_train.columns)))
print("X_test set is made up of {} rows and {} columns\n".format(len(X_test.index), len(X_test.columns)))
print("y_train is made up of {} values\n".format(len(y_train.index)))
print("y_test is made up of {} values\n".format(len(y_test.index)))

X_train set is made up of 261 rows and 8 columns

X_test set is made up of 112 rows and 8 columns

y_train is made up of 261 values

y_test is made up of 112 values

## As per the evaluators suggestion, look into using xgboost  -> popular boosting model -> appropiate here
#setting random state as 42 for reproducability

# Define the models
models = [ 
    LogisticRegression(random_state=42),
    SVC(random_state=42),
    DecisionTreeClassifier(random_state=42),
    AdaBoostClassifier(random_state=42),
    RandomForestClassifier(random_state=42),
    XGBClassifier(random_state=42),
    GradientBoostingClassifier(random_state=42),
    GaussianNB()
]

# Define the corresponding classifier names
classifiers = [
    'Logistic Regression',
    'SVC',
    'Decision Tree Classifier',
    'AdaBoost Classifier',
    'Random Forest Classifier',
    'XGBClassifier',
    'GradientBoostingClassifier',
    'Naive Bayes'
]

# Define the columns for the benchmark dataframe
ml_columns = ['ML Algo name', 'Model Parameters', 'training AUC accuracy', 'test recall score', 'test AUC score']

# Create an empty dataframe to store benchmark results
benchmark = pd.DataFrame(columns=ml_columns)

# Perform cross-validation and evaluation for each model
for model, clf_name in zip(models, classifiers):
    benchmark.loc[len(benchmark)] = [clf_name, str(model.get_params()), np.nan, np.nan, np.nan]
    cv_score = cross_val_score(model, BM_X_train, BM_y_train, cv=5, scoring='roc_auc') 
    benchmark.loc[benchmark['ML Algo name'] == clf_name, 'training AUC accuracy'] = np.mean(cv_score)
    mdl = model.fit(BM_X_train, BM_y_train)
    test_acc = mdl.score(BM_X_test, BM_y_test)
    fpr, tpr, thresholds = roc_curve(BM_y_test, model.predict(BM_X_test))
    test_recall = recall_score(BM_y_test, model.predict(BM_X_test))
    test_auc = auc(fpr, tpr)
    benchmark.loc[benchmark['ML Algo name'] == clf_name, 'test recall score'] = test_recall
    benchmark.loc[benchmark['ML Algo name'] == clf_name, 'test AUC score'] = test_auc 

# Sort benchmark table by test AUC score
benchmark.sort_values(by='test AUC score', ascending=False, inplace=True)

# Plot the benchmark results
sns.barplot(x='test AUC score', y='ML Algo name', data=benchmark)

# Set the width of the model parameters column
benchmark.style.set_properties(subset=['Model Parameters'], **{'width': '300px'})

<Axes: xlabel='test AUC score', ylabel='ML Algo name'>

total_fpr = {}
total_tpr = {}

def roc_curves(model, classifier):
    ML_name = model.__class__.__name__
    fpr, tpr, thresholds = roc_curve(BM_y_test, model.predict(BM_X_test))
    test_auc = auc(fpr, tpr)
    total_fpr[ML_name] = fpr
    total_tpr[ML_name] = tpr
    plt.plot(fpr, tpr, color='darkorange', linewidth=2, label='(Area = %0.2f)' % test_auc)
    plt.plot([0, 1], [0, 1], 'k--')
    plt.axis([0, 1, 0, 1])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.legend(loc="lower right")
    plt.title('ROC Curve for {}'.format(classifier))
    plt.show()

def confusion_matrix_plot(model, classifier):
    cm = confusion_matrix(y_test, model.predict(BM_X_test))
    plt.imshow(cm, interpolation='nearest', cmap=plt.get_cmap('Blues'))
    labels = ['Nondemented', 'Demented']
    plt.title('Confusion Matrix for {}'.format(classifier))
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    tick_marks = np.arange(len(labels))
    plt.xticks(tick_marks, labels)
    plt.yticks(tick_marks, labels)
    s = [['TN', 'FP'], ['FN', 'TP']]
    for i, j in itertools.product(range(2), range(2)):
        plt.text(j, i, str(s[i][j]) + " = " + str(cm[i][j]))
    plt.show()

for model, classifier in zip(models, classifiers):
    roc_curves(model, classifier)
    confusion_matrix_plot(model, classifier)

colors = {
    'LogisticRegression': 'red',
    'SVC': 'green',
    'DecisionTreeClassifier': 'blue',
    'AdaBoostClassifier': 'yellow',
    'RandomForestClassifier': 'cyan',
    'XGBClassifier': 'magenta',
    'GradientBoostingClassifier': 'black',
    'GaussianNB': 'white'
}

plt.figure(figsize=(20, 12))
for model_name in total_fpr.keys():
    plt.plot(total_fpr[model_name], total_tpr[model_name], color=colors[model_name], lw=1, label=model_name)
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.title('Comparison of ROC curves for all classifiers')
plt.plot([0, 1], [0, 1], color='darkorange', lw=2, linestyle='--')
plt.legend()
plt.show();

#Create empty final results dataframe for optimization metrics
refinement_cols = ['ML Model','Method','fitting time', 'optimal hyperparameters', 'best training AUC score', 'best estimator','test recall score', 'test AUC score']
finale = pd.DataFrame(columns = refinement_cols)

#Create empty robustness table to compare training and testing errors
robust_cols = ['ML Model', 'Training Classification Error', 'Test Classification_Error']
robustment_test = pd.DataFrame(columns = robust_cols)

#Storing training times
training_time = []
#Create two empty dictionaries so that the FPR and TPR can be captured for each model. This will allow us to plot all the ROC curves on one graph
total_fpr_opt = {} 
total_tpr_opt = {}



#Define classifiers for tuning - set random_state for reproducible results
classifiers = [ LogisticRegression(random_state=42),
                SVC(random_state=42),
                AdaBoostClassifier(random_state=42),
                DecisionTreeClassifier(random_state=42),
                RandomForestClassifier(random_state=42),
                GradientBoostingClassifier(random_state=42),
                XGBClassifier(random_state=42) ]



lr_parameters = { 
                  'C':[0.001, 0.01, 0.1, 1, 10,100,1000, 10000], 
                  'max_iter':list(range(2,100))}


svm_parameters = {
                  'kernel': ['rbf', 'poly', 'sigmoid'],
                  'C':[0.00001,0.0001, 0.001, 0.01, 0.1, 1, 10, 20, 30, 40, 50 ,60 , 80, 100],
                  'gamma': [0.00001,0.0001, 0.001, 0.01, 0.1, 1, 10] }

dt_parameters = { #'max_leaf_nodes': list(range(2,100)),
                  'max_leaf_nodes': list(range(2,20)),
                  'splitter':['random', 'best'],
                  'criterion': ['gini', 'entropy'],
                  'max_depth': list(range(2,20)),
                  'min_samples_split': list(range(2,20))}

rf_parameters = { 
                  'n_estimators': [10,30,50,100,150,200,250,300,350,400,500,600,700,800,900,1000],
                  'max_leaf_nodes': list(range(2,30)),
                  'min_samples_leaf':[1,2,3], 'min_samples_split':[3,4,5,6,7]}

gb_parameters = {
                  'learning_rate': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
                  'min_samples_leaf': list(range(1,20)),
                  'max_depth': list(range(2,20)),
                  'n_estimators': range(1,200)}

xgboost_parameters = { 
                  'silent': [True],
                  'max_depth': [6, 10, 15, 20],
                  'learning_rate': [0.001, 0.01, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7],
                  'subsample': [0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
                  'colsample_bytree': [0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
                  'colsample_bylevel': [0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
                  'min_child_weight': [0.5, 1.0, 3.0, 5.0, 7.0, 10.0],
                  'gamma': [0, 0.25, 0.5, 1.0],
                  'reg_lambda': [0.1, 1.0, 5.0, 10.0, 50.0, 100.0],
                  'n_estimators': [50,100,120,150]}


ada_parameters = { 
                   'n_estimators': [50,100,200,250,300,400,500,600,700,800,900,1000],
                   'learning_rate': [ 0.1,0.15, 0.2,0.25, 0.3,0.4,0.5,0.6,0.7,0.8,0.9,1] }

parameters = [lr_parameters, 
              svm_parameters,
              ada_parameters, 
              dt_parameters,
              rf_parameters, 
              gb_parameters, 
              xgboost_parameters]

os.environ["PARALLEL_JOBS"] = "16"

def get_n_jobs():
    parallel_jobs = os.getenv('PARALLEL_JOBS')
    if parallel_jobs is not None:
        try:
            n_jobs = int(parallel_jobs)
            return n_jobs
        except ValueError:
            print("Error: PARALLEL_JOBS is not an integer. Using default value (None).")
            return None
    else:
        return None

parallel_jobs = get_n_jobs()

def print_summary(ML_name, mdl, method, fitting_time, best_params, confusion_matrix, complete_time,parallel_jobs,feature_importances=False):
    print("--" * 50)
    print("Finding the best hyperparameters for metric [roc_auc] for {}\n".format(ML_name))
    print("Performing {} for {}\n".format(method, ML_name))
    print("Beginning to fit {}\n".format(ML_name))
    print("Optimal hyperparameters:", best_params)
    print("Confusion matrix:")
    confusion_matrix_plot(confusion_matrix, ML_name)
    if feature_importances:
        print_feature_importances(ML_name, mdl)
    if parallel_jobs == None:
        parallel_jobs = 1
    print(f"Hyperparameter tuning complete. Used {parallel_jobs} parallel jobs... Took {complete_time:.2f} seconds.\n")

def print_feature_importances(ML_name, mdl):
    print("=="*16,"feature_importances","=="*16)
    for name, importance in zip(X_train.columns, mdl.feature_importances_):
        print(name, "==", importance)
    importances = mdl.feature_importances_
    indices = np.argsort(importances)
    features = X_train.columns
    plt.title('Feature Importances')
    plt.barh(range(len(indices)), importances[indices], color='b', align='center')
    plt.yticks(range(len(indices)), [features[i] for i in indices])
    plt.xlabel('Relative Importances')
    plt.show();

def confusion_matrix_plot(confusion_matrix, classifier):
    plt.imshow(confusion_matrix, interpolation='nearest', cmap=plt.get_cmap('Blues'))
    labels = ['Nondemented', 'Demented']
    plt.title('Confusion Matrix for {}'.format(classifier))
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    tick_marks = np.arange(len(labels))
    plt.xticks(tick_marks, labels)
    plt.yticks(tick_marks, labels)
    s = [['TN', 'FP'], ['FN', 'TP']]
    for i, j in itertools.product(range(2), range(2)):
        plt.text(j, i, str(s[i][j]) + " = " + str(confusion_matrix[i][j]))
    plt.show()

def grid(classifier, parameters,jobs):
    ML_name = classifier.__class__.__name__
    start_time = time()
    models = ['DecisionTreeClassifier', 'RandomForestClassifier', 'XGBClassifier', 'GradientBoostingClassifier', 'AdaBoostClassifier']
    if ML_name in models:
        method = "Randomized Grid Search"
        grid = RandomizedSearchCV(estimator=classifier, param_distributions=parameters, cv=10, n_iter=100, scoring="roc_auc", random_state=42,n_jobs=jobs,verbose=0)
    else:
        method = "Grid Search"
        grid = GridSearchCV(estimator=classifier, param_grid=parameters, cv=10, scoring="roc_auc",n_jobs=jobs,verbose=0)
    grid.fit(X_train, y_train)
    fitting_time = time() - start_time
    best_params = grid.best_params_
    mdl = grid.best_estimator_.fit(X_train, y_train)
    pred = mdl.predict(X_test)
    test_acc = accuracy_score(y_test, pred)
    test_recall = recall_score(y_test, pred, pos_label=1)
    fpr,tpr,thresholds = roc_curve(y_test, pred, pos_label=1)
    test_auc = auc(fpr, tpr)
    total_fpr_opt[ML_name] = fpr  #Dit key
    total_tpr_opt[ML_name] = tpr #Dict key
    conf_matrix = confusion_matrix(y_test, pred)
    complete_time = time() - start_time
    # Feature importances if applicable
    feature_importances = None
    if ML_name in models:
        feature_importances = True

    # Inserting data into finale DataFrame
    finale.loc[index, 'ML Model'] = ML_name
    finale.loc[index, 'Method'] = method
    finale.loc[index, 'fitting time'] = fitting_time
    finale.loc[index, 'optimal hyperparameters'] = str(best_params)
    finale.loc[index, 'best training AUC score'] = grid.best_score_
    finale.loc[index, 'best estimator'] = str(mdl)
    # Inserting test recall score and test AUC score into finale DataFrame
    finale.loc[index, 'test recall score'] = test_recall
    finale.loc[index, 'test AUC score'] = test_auc
    print_summary(ML_name, mdl, method, fitting_time, best_params, conf_matrix, complete_time,jobs,feature_importances=feature_importances)
    # Robustness check for certain models
    robust_models = ['SVC', 'RandomForestClassifier', 'XGBClassifier']
    if ML_name in robust_models:
        print("------------- Calculating {} classification error for Robustness checks  -------------\n".format(ML_name))
        if ML_name in models:
            grid = RandomizedSearchCV(estimator=classifier, param_distributions=parameters, cv=10, n_iter=100, scoring="accuracy")
            robustment_test.loc[index, 'ML Model'] = ML_name
            grid.fit(X_train, y_train)
        else:
            grid = GridSearchCV(estimator=classifier, param_grid=parameters, cv=10, scoring="accuracy")
            robustment_test.loc[index, 'ML Model'] = ML_name
            grid.fit(X_train, y_train)
        training_error = 1 - (grid.best_score_)
        robustment_test.loc[index, 'Training Classification Error'] = training_error
        mdl = grid.best_estimator_.fit(X_train, y_train)
        pred = mdl.predict(X_test)
        test_error = 1 - (accuracy_score(y_test, pred))
        robustment_test.loc[index, 'Test Classification_Error'] = test_error
        print("Training classification error: ", training_error)
        print("Testing classification error: ", test_error)
        print("\n Robustness checks complete!")


index = 0
for classifier, parameter in zip(classifiers, parameters):
    grid(classifier, parameter,parallel_jobs)
    index += 1

----------------------------------------------------------------------------------------------------
Finding the best hyperparameters for metric [roc_auc] for LogisticRegression

Performing Grid Search for LogisticRegression

Beginning to fit LogisticRegression

Optimal hyperparameters: {'C': 100, 'max_iter': 9}
Confusion matrix:

Hyperparameter tuning complete. Used 16 parallel jobs... Took 6.87 seconds.

----------------------------------------------------------------------------------------------------
Finding the best hyperparameters for metric [roc_auc] for SVC

Performing Grid Search for SVC

Beginning to fit SVC

Optimal hyperparameters: {'C': 10, 'gamma': 1, 'kernel': 'rbf'}
Confusion matrix:

Hyperparameter tuning complete. Used 16 parallel jobs... Took 2.66 seconds.

------------- Calculating SVC classification error for Robustness checks  -------------

Training classification error:  0.09943019943019937
Testing classification error:  0.1160714285714286

 Robustness checks complete!
----------------------------------------------------------------------------------------------------
Finding the best hyperparameters for metric [roc_auc] for AdaBoostClassifier

Performing Randomized Grid Search for AdaBoostClassifier

Beginning to fit AdaBoostClassifier

Optimal hyperparameters: {'n_estimators': 400, 'learning_rate': 0.8}
Confusion matrix:

================================ feature_importances ================================
M/F == 0.045
Age == 0.1325
EDUC == 0.0625
SES == 0.0625
MMSE == 0.0375
eTIV == 0.22
nWBV == 0.2975
ASF == 0.1425

Hyperparameter tuning complete. Used 16 parallel jobs... Took 66.95 seconds.

----------------------------------------------------------------------------------------------------
Finding the best hyperparameters for metric [roc_auc] for DecisionTreeClassifier

Performing Randomized Grid Search for DecisionTreeClassifier

Beginning to fit DecisionTreeClassifier

Optimal hyperparameters: {'splitter': 'best', 'min_samples_split': 6, 'max_leaf_nodes': 7, 'max_depth': 2, 'criterion': 'gini'}
Confusion matrix:

================================ feature_importances ================================
M/F == 0.0998291718520719
Age == 0.0
EDUC == 0.0
SES == 0.0
MMSE == 0.9001708281479281
eTIV == 0.0
nWBV == 0.0
ASF == 0.0

Hyperparameter tuning complete. Used 16 parallel jobs... Took 0.68 seconds.

----------------------------------------------------------------------------------------------------
Finding the best hyperparameters for metric [roc_auc] for RandomForestClassifier

Performing Randomized Grid Search for RandomForestClassifier

Beginning to fit RandomForestClassifier

Optimal hyperparameters: {'n_estimators': 300, 'min_samples_split': 5, 'min_samples_leaf': 2, 'max_leaf_nodes': 28}
Confusion matrix:

================================ feature_importances ================================
M/F == 0.06072693782658392
Age == 0.08072949191275151
EDUC == 0.08453418287763775
SES == 0.04067643562190814
MMSE == 0.3412779675591842
eTIV == 0.11370031119223188
nWBV == 0.16491149087601517
ASF == 0.11344318213368734

Hyperparameter tuning complete. Used 16 parallel jobs... Took 45.26 seconds.

------------- Calculating RandomForestClassifier classification error for Robustness checks  -------------

Training classification error:  0.1495726495726496
Testing classification error:  0.1964285714285714

 Robustness checks complete!
----------------------------------------------------------------------------------------------------
Finding the best hyperparameters for metric [roc_auc] for GradientBoostingClassifier

Performing Randomized Grid Search for GradientBoostingClassifier

Beginning to fit GradientBoostingClassifier

Optimal hyperparameters: {'n_estimators': 84, 'min_samples_leaf': 8, 'max_depth': 15, 'learning_rate': 0.8}
Confusion matrix:

================================ feature_importances ================================
M/F == 0.06288633186170128
Age == 0.06195875992764563
EDUC == 0.06696296486554618
SES == 0.04390265085288797
MMSE == 0.4892290215022818
eTIV == 0.09386109718420312
nWBV == 0.13803520750280088
ASF == 0.04316396630293312

finale.head(10)
finale[['ML Model','test AUC score']]

#Check classification scores for the 3 main benchmark models
robustment_test

##Setting the colour of the ROC curves for each model
colors = { 'LogisticRegression':'red',
            'SVC':'green',
            'DecisionTreeClassifier':'blue',
            'AdaBoostClassifier':'yellow',
            'RandomForestClassifier':'cyan',
            'XGBClassifier':'magenta', 
            'GradientBoostingClassifier': 'black'
        }    

#total_fpr.keys() returns the models name as the key
plt.figure(figsize=(20,12))
for i in total_fpr_opt.keys():
    colors = {'LogisticRegression':'red', 'SVC':'green', 'DecisionTreeClassifier':'blue', 'AdaBoostClassifier':'yellow','RandomForestClassifier':'cyan', 'XGBClassifier':'magenta', 'GradientBoostingClassifier': 'black', 'GaussianNB': 'white'}    
    plt.plot(total_fpr_opt[i],total_tpr_opt[i],colors[i], lw=1, label=i)
plt.xlim([0,1])
plt.ylim([0,1])
plt.title('Comparison of ROC curves for all classifiers')
plt.plot([0, 1], [0, 1], color='darkorange', lw=2, linestyle='--')
plt.legend();

def Performance(model, y, X, fitting_time, best_params):
    predictions = model.predict(X)
    recall = recall_score(y, predictions, pos_label=1)
    fpr, tpr, _ = roc_curve(y, predictions)
    auc_score = auc(fpr, tpr)
    print('Recall score: {:.4f}'.format(recall))
    print('AUC score: {:.4f}'.format(auc_score))
    
    # Update finale table
    global index
    finale.loc[index, 'ML Model'] = model.__class__.__name__
    finale.loc[index, 'Method'] = 'Hyperopt tuning'
    finale.loc[index, 'fitting time'] = str(fitting_time)
    finale.loc[index, 'optimal hyperparameters'] = str(best_params)
    finale.loc[index, 'best training AUC score'] = auc_score
    finale.loc[index, 'best estimator'] = str(model)
    finale.loc[index, 'test recall score'] = recall
    finale.loc[index, 'test AUC score'] = auc_score
    
    # Increment index for the next entry
    index += 1

def objective(params):
    params = {
        'n_estimators': params['n_estimators'],
        'learning_rate': "{:.3f}".format(params['learning_rate']),
        'max_depth': int(params['max_depth']),
        'gamma': "{:.3f}".format(params['gamma']),
        'min_child_weight': "{:.3f}".format(params['min_child_weight']),
        'colsample_bytree': '{:.3f}'.format(params['colsample_bytree']),
        'random_state': params['random_state']
    }
    
    #clf = XGBClassifier(n_jobs=4,**params)
    clf = XGBClassifier(**params)
    skf = StratifiedKFold(n_splits=10, shuffle=True,random_state=42)
    score = cross_val_score(clf, X_train, y_train, scoring='roc_auc', cv=StratifiedKFold()).mean()
    print("roc_auc {:.3f} params {}".format(score, params))
    training_scores.append(score)
    return score

space = {
    'n_estimators': scope.int(hp.uniform('n_estimators',100, 2000)),
    'learning_rate': hp.uniform('learning_rate', 0.01, 0.4),
    'max_depth': scope.int(hp.quniform('max_depth', 2, 14, 1)),
    'colsample_bytree': hp.uniform('colsample_bytree', 0.6, 1.0),
    'min_child_weight': hp.uniform('min_child_weight',0.1, 0.6),
    'gamma': hp.uniform('gamma', 0.0, 0.5),
    'random_state': 42 
}

#Capture training scores & fitting times as well
training_scores = []
start_time = time()
best = fmin(fn=objective,space=space,algo=tpe.suggest,max_evals=10)
best_params = space_eval(space, best)
fitting_time = time() - start_time
print("\nHyperopt completed in {} seconds\n".format(fitting_time))
print("The average training score was {} ". format(sum(training_scores)/len(training_scores)))

roc_auc 0.887 params {'n_estimators': 1546, 'learning_rate': '0.129', 'max_depth': 4, 'gamma': '0.469', 'min_child_weight': '0.462', 'colsample_bytree': '0.954', 'random_state': 42}
roc_auc 0.886 params {'n_estimators': 910, 'learning_rate': '0.064', 'max_depth': 6, 'gamma': '0.374', 'min_child_weight': '0.452', 'colsample_bytree': '0.952', 'random_state': 42}
roc_auc 0.908 params {'n_estimators': 1777, 'learning_rate': '0.110', 'max_depth': 6, 'gamma': '0.267', 'min_child_weight': '0.388', 'colsample_bytree': '0.651', 'random_state': 42}
roc_auc 0.900 params {'n_estimators': 896, 'learning_rate': '0.072', 'max_depth': 12, 'gamma': '0.195', 'min_child_weight': '0.135', 'colsample_bytree': '0.822', 'random_state': 42}
roc_auc 0.889 params {'n_estimators': 563, 'learning_rate': '0.189', 'max_depth': 8, 'gamma': '0.300', 'min_child_weight': '0.149', 'colsample_bytree': '0.910', 'random_state': 42}
roc_auc 0.891 params {'n_estimators': 469, 'learning_rate': '0.189', 'max_depth': 12, 'gamma': '0.146', 'min_child_weight': '0.587', 'colsample_bytree': '0.920', 'random_state': 42}
roc_auc 0.890 params {'n_estimators': 1219, 'learning_rate': '0.299', 'max_depth': 12, 'gamma': '0.192', 'min_child_weight': '0.482', 'colsample_bytree': '0.971', 'random_state': 42}
roc_auc 0.907 params {'n_estimators': 825, 'learning_rate': '0.336', 'max_depth': 5, 'gamma': '0.179', 'min_child_weight': '0.123', 'colsample_bytree': '0.721', 'random_state': 42}
roc_auc 0.887 params {'n_estimators': 1065, 'learning_rate': '0.219', 'max_depth': 9, 'gamma': '0.346', 'min_child_weight': '0.462', 'colsample_bytree': '0.961', 'random_state': 42}
roc_auc 0.900 params {'n_estimators': 699, 'learning_rate': '0.220', 'max_depth': 3, 'gamma': '0.159', 'min_child_weight': '0.392', 'colsample_bytree': '0.864', 'random_state': 42}
100%|██████████| 10/10 [00:04<00:00,  2.40trial/s, best loss: 0.8856679894179894]

Hyperopt completed in 4.178892374038696 seconds

The average training score was 0.894657671957672

# Call Performance function after hyperparameter tuning
mdl = XGBClassifier(**best_params)
mdl.fit(X_train, y_train)
Performance(mdl, y_test, X_test, fitting_time, best_params)

XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=0.9516347225548255, device=None,
              early_stopping_rounds=None, enable_categorical=False,
              eval_metric=None, feature_types=None, gamma=0.37420938791338665,
              grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=0.06443939093069395,
              max_bin=None, max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=6, max_leaves=None,
              min_child_weight=0.4520573760002473, missing=nan,
              monotone_constraints=None, multi_strategy=None, n_estimators=910,
              n_jobs=None, num_parallel_tree=None, random_state=42, ...)

XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=0.9516347225548255, device=None,
              early_stopping_rounds=None, enable_categorical=False,
              eval_metric=None, feature_types=None, gamma=0.37420938791338665,
              grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=0.06443939093069395,
              max_bin=None, max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=6, max_leaves=None,
              min_child_weight=0.4520573760002473, missing=nan,
              monotone_constraints=None, multi_strategy=None, n_estimators=910,
              n_jobs=None, num_parallel_tree=None, random_state=42, ...)

Recall score: 0.7833
AUC score: 0.8051

def objective(params):
    params = {
        'max_depth': int(params['max_depth']),
        'n_estimators':int(params['n_estimators']),
        'random_state': params['random_state']
    }
    
    #clf = RandomForestClassifier(n_jobs=4,**params)
    clf = RandomForestClassifier(**params)
    score = cross_val_score(clf, X_train, y_train, scoring='roc_auc', cv=StratifiedKFold()).mean()
    print("roc_auc {:.3f} params {}".format(score, params))
    training_scores.append(score)
    return score

space = {
    'max_depth': scope.int(hp.quniform('max_depth', 10,30, 1)),
    'n_estimators': scope.int(hp.quniform('n_estimators', 100,1500,25)),
    'random_state': 42 }

#Capture training scores & fitting times as well
training_scores = []
start_time = time()
best = fmin(fn=objective,space=space,algo=tpe.suggest,max_evals=10)
best_params = space_eval(space, best)
fitting_time = time() - start_time
print("\nHyperopt completed in {} seconds\n".format(fitting_time))
print("The average training score was {} ". format(sum(training_scores)/len(training_scores)))

roc_auc 0.923 params {'max_depth': 26, 'n_estimators': 625, 'random_state': 42}
roc_auc 0.923 params {'max_depth': 18, 'n_estimators': 450, 'random_state': 42} 
roc_auc 0.924 params {'max_depth': 13, 'n_estimators': 725, 'random_state': 42} 
roc_auc 0.923 params {'max_depth': 13, 'n_estimators': 225, 'random_state': 42} 
roc_auc 0.924 params {'max_depth': 27, 'n_estimators': 425, 'random_state': 42} 
roc_auc 0.918 params {'max_depth': 13, 'n_estimators': 150, 'random_state': 42} 
roc_auc 0.917 params {'max_depth': 25, 'n_estimators': 125, 'random_state': 42} 
roc_auc 0.922 params {'max_depth': 18, 'n_estimators': 575, 'random_state': 42} 
roc_auc 0.922 params {'max_depth': 12, 'n_estimators': 1450, 'random_state': 42}
roc_auc 0.922 params {'max_depth': 28, 'n_estimators': 1200, 'random_state': 42}
100%|██████████| 10/10 [00:20<00:00,  2.09s/trial, best loss: 0.9165542328042328]

Hyperopt completed in 20.87399196624756 seconds

The average training score was 0.9216392857142857

# Call Performance function after hyperparameter tuning
mdl = RandomForestClassifier(**best_params)
mdl.fit(X_train, y_train)
Performance(mdl, y_test, X_test, fitting_time, best_params)

RandomForestClassifier(max_depth=25, n_estimators=125, random_state=42)

RandomForestClassifier(max_depth=25, n_estimators=125, random_state=42)

Recall score: 0.8333
AUC score: 0.8590

def objective(params):
    params = {
        'C': params['C'],
        'gamma':params['gamma'],
        'kernel': params['kernel'],
        'random_state': params['random_state']
    }
    
    clf = SVC(**params)
    score = cross_val_score(clf, X_train, y_train, scoring='roc_auc', cv=StratifiedKFold()).mean()
    print("roc_auc {:.3f} params {}".format(score, params))
    training_scores.append(score)
    return score

space = {
    'C': hp.uniform('C', 10,11),
    'gamma': hp.quniform('gamma', 0.35,0.42,0.01),    
    'kernel': 'rbf', 
    'random_state': 42 }

#Capture training scores & fitting times as well
training_scores = []
start_time = time()
best = fmin(fn=objective,space=space,algo=tpe.suggest,max_evals=10)
best_params = space_eval(space, best)
fitting_time = time() - start_time
print("\nHyperopt completed in {} seconds\n".format(fitting_time))
print("The average training score was {} ". format(sum(training_scores)/len(training_scores)))

roc_auc 0.931 params {'C': 10.864562445314965, 'gamma': 0.39, 'kernel': 'rbf', 'random_state': 42}
roc_auc 0.933 params {'C': 10.050081000530929, 'gamma': 0.4, 'kernel': 'rbf', 'random_state': 42}
roc_auc 0.933 params {'C': 10.59883981122294, 'gamma': 0.4, 'kernel': 'rbf', 'random_state': 42}
roc_auc 0.929 params {'C': 10.8091990797788, 'gamma': 0.37, 'kernel': 'rbf', 'random_state': 42}
roc_auc 0.928 params {'C': 10.01700859922015, 'gamma': 0.37, 'kernel': 'rbf', 'random_state': 42}
roc_auc 0.937 params {'C': 10.970767038409553, 'gamma': 0.41000000000000003, 'kernel': 'rbf', 'random_state': 42}
roc_auc 0.936 params {'C': 10.741369260572458, 'gamma': 0.41000000000000003, 'kernel': 'rbf', 'random_state': 42}
roc_auc 0.935 params {'C': 10.08465967125343, 'gamma': 0.41000000000000003, 'kernel': 'rbf', 'random_state': 42}
roc_auc 0.927 params {'C': 10.616443464899518, 'gamma': 0.36, 'kernel': 'rbf', 'random_state': 42}
roc_auc 0.937 params {'C': 10.849924492903604, 'gamma': 0.42, 'kernel': 'rbf', 'random_state': 42}
100%|██████████| 10/10 [00:00<00:00, 37.84trial/s, best loss: 0.9266600529100529]

Hyperopt completed in 0.2672393321990967 seconds

The average training score was 0.9324859788359788

# Call Performance function after hyperparameter tuning
mdl = SVC(**best_params)
mdl.fit(X_train, y_train)
Performance(mdl, y_test, X_test, fitting_time, best_params)

SVC(C=10.616443464899518, gamma=0.36, random_state=42)

SVC(C=10.616443464899518, gamma=0.36, random_state=42)

Recall score: 0.8667
AUC score: 0.9045

def process_results(finale, benchmark):
    # Rename column in benchmark DataFrame
    benchmark.rename(columns={'ML Algo name': 'ML Model'}, inplace=True)
    
    # Extract relevant columns from finale DataFrame and sort by ML Model
    output = finale[['ML Model', 'Method', 'test AUC score']].sort_values('ML Model')
    
    # Set Method column to 'untuned' in benchmark DataFrame
    benchmark['Method'] = 'untuned'
    
    # Extract relevant columns from benchmark DataFrame
    benchmark = benchmark[['ML Model', 'Method', 'test AUC score']]
    
    # Concatenate output and benchmark DataFrames, sort by ML Model, and set index
    df = pd.concat([output, benchmark]).sort_values('ML Model')
    df.set_index('ML Model', inplace=True)
    
    return df

output = process_results(finale,benchmark)
output

# Extract the 'untuned' test AUC score for each ML Model and store it in a dictionary
untuned_dict = {}
for model, group in output[output['Method'] == 'untuned'].groupby('ML Model'):
    untuned_dict[model] = group.iloc[0]['test AUC score']

# List of methods to check against
methods_to_check = ['Randomized Grid Search', 'Hyperopt tuning', 'Grid Search']

# Print out the test AUC scores for each ML model with Randomized Grid Search method that outperformed the untuned method
for model, group in output.groupby('ML Model'):
    if any(method in group['Method'].values for method in methods_to_check):
        for method in methods_to_check:
            if method in group['Method'].values:
                method_score = group[group['Method'] == method]['test AUC score'].values[0]
                if model in untuned_dict and method_score > untuned_dict[model]:
                    print(f"The following model and optimisation method outperformed its benchmark value of {untuned_dict[model]}")
                    print(f"ML Model: {model}")
                    print(f"Method: {method}")
                    print(f"Test AUC score: {method_score}")
                    print()

The following model and optimisation method outperformed its benchmark value of 0.7769230769230769
ML Model: SVC
Method: Hyperopt tuning
Test AUC score: 0.9044871794871795

The following model and optimisation method outperformed its benchmark value of 0.7769230769230769
ML Model: SVC
Method: Grid Search
Test AUC score: 0.8852564102564102

The following model and optimisation method outperformed its benchmark value of 0.8064102564102564
ML Model: XGBClassifier
Method: Randomized Grid Search
Test AUC score: 0.841025641025641

#Define a meshgrid function that will plot the decision surface
def make_meshgrid(x, y, h=.02):
    x_min, x_max = x.min() - 1, x.max() + 1
    y_min, y_max = y.min() - 1, y.max() + 1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
    return xx, yy

def plot_contours(ax, clf, xx, yy, **params):
    Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)
    out = ax.contourf(xx, yy, Z, **params)
    return out

#Select 2 features randomly to compare - SES / EDUC in this case
#Convert to array
X = np.array(X_test)
X = X[:, [2,3]]

#ax1 --> benchmark model
#ax2 --> optimised model
fig, (ax1, ax2) = plt.subplots(1,2, sharey=True, figsize=(20,10))
title = ('Decision surface for Benchmark model')
X0, X1 = X[:, 0], X[:, 1]
xx, yy = make_meshgrid(X0, X1)
mdl = SVC(C=1, gamma='auto', kernel='rbf', random_state=42).fit(X, y_test)
plot_contours(ax1, mdl, xx, yy, cmap=plt.cm.coolwarm, alpha=0.7)
ax1.scatter(X0, X1, c=y_test, cmap=plt.cm.coolwarm, s=20, marker='x', edgecolors='k')
ax1.set_xlabel(X_test.columns[2])
ax1.set_ylabel(X_test.columns[3])
ax1.set_xticks(())
ax1.set_yticks(())
ax1.legend()
ax1.set_title(title);


title = ('Decision surface for Optimized model')
mdl = SVC(C=10.94, gamma=0.36, kernel='rbf', random_state=42).fit(X, y_test)
plot_contours(ax2, mdl, xx, yy, cmap=plt.cm.coolwarm, alpha=0.7)
ax2.scatter(X0, X1, c=y_test, cmap=plt.cm.coolwarm, s=25, marker='x', edgecolors='k')
ax2.set_xlabel(X_test.columns[2])
ax2.set_ylabel(X_test.columns[3])
ax2.set_xticks(())
ax2.set_yticks(())
ax2.set_title(title);

No artists with labels found to put in legend.  Note that artists whose label start with an underscore are ignored when legend() is called with no argument.

	Visit	MR Delay	Age	EDUC	SES	MMSE	CDR	eTIV	nWBV	ASF	Group_Demented	Group_Nondemented
Visit	1.000000	0.920009	0.183213	0.024615	-0.051622	-0.029078	0.002325	0.117423	-0.126682	-0.120399	-0.129800	0.095507
MR Delay	0.920009	1.000000	0.205357	0.051630	-0.030813	0.065844	-0.062915	0.119641	-0.105586	-0.123545	-0.180156	0.120638
Age	0.183213	0.205357	1.000000	-0.027886	-0.046857	0.055612	-0.026257	0.042401	-0.518359	-0.035067	-0.079153	0.005941
EDUC	0.024615	0.051630	-0.027886	1.000000	-0.722647	0.194884	-0.153121	0.257042	-0.012200	-0.241752	-0.258708	0.193060
SES	-0.051622	-0.030813	-0.046857	-0.722647	1.000000	-0.149219	0.076160	-0.261582	0.090095	0.255576	0.205556	-0.062463
MMSE	-0.029078	0.065844	0.055612	0.194884	-0.149219	1.000000	-0.686519	-0.032088	0.341912	0.040052	-0.612448	0.524775
CDR	0.002325	-0.062915	-0.026257	-0.153121	0.076160	-0.686519	1.000000	0.022863	-0.344819	-0.029340	0.815473	-0.778049
eTIV	0.117423	0.119641	0.042401	0.257042	-0.261582	-0.032088	0.022863	1.000000	-0.210180	-0.988905	-0.010365	0.042579
nWBV	-0.126682	-0.105586	-0.518359	-0.012200	0.090095	0.341912	-0.344819	-0.210180	1.000000	0.213476	-0.286903	0.311346
ASF	-0.120399	-0.123545	-0.035067	-0.241752	0.255576	0.040052	-0.029340	-0.988905	0.213476	1.000000	0.008312	-0.032495
Group_Demented	-0.129800	-0.180156	-0.079153	-0.258708	0.205556	-0.612448	0.815473	-0.010365	-0.286903	0.008312	1.000000	-0.817174
Group_Nondemented	0.095507	0.120638	0.005941	0.193060	-0.062463	0.524775	-0.778049	0.042579	0.311346	-0.032495	-0.817174	1.000000

	Group	M/F	Age	EDUC	SES	MMSE	eTIV	nWBV	ASF
0	0.0	1.153798	1.308738	-0.208132	-0.394466	-0.095686	2.833595	-0.905169	-2.265742
1	0.0	1.153798	1.439787	-0.208132	-0.394466	0.721664	2.935525	-1.309643	-2.316501
2	1.0	1.153798	-0.263856	-0.904394	-0.394466	-1.185486	1.081119	0.173429	-1.083784
3	1.0	1.153798	-0.132806	-0.904394	-0.394466	0.176764	1.418413	-0.446765	-1.344830
4	1.0	1.153798	0.391392	-0.904394	-0.394466	-1.457936	1.192666	-0.770344	-1.170800
5	0.0	-0.866703	1.439787	1.184392	0.508208	0.176764	-1.550836	-0.527660	1.802224
6	0.0	-0.866703	1.701886	1.184392	0.508208	-0.095686	-1.637420	-0.311940	1.932747
7	0.0	1.153798	0.391392	-0.904394	1.410882	0.176764	1.139618	-0.473730	-1.134543
8	0.0	1.153798	0.784540	-0.904394	1.410882	0.449214	1.208652	-0.500695	-1.185302
9	0.0	1.153798	1.046639	-0.904394	1.410882	0.721664	1.200386	-0.662484	-1.178051

	ML Algo name	Model Parameters	training AUC accuracy	test recall score	test AUC score
4	Random Forest Classifier	{'bootstrap': True, 'ccp_alpha': 0.0, 'class_weight': None, 'criterion': 'gini', 'max_depth': None, 'max_features': 'sqrt', 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'monotonic_cst': None, 'n_estimators': 100, 'n_jobs': None, 'oob_score': False, 'random_state': 42, 'verbose': 0, 'warm_start': False}	0.917676	0.833333	0.868590
6	GradientBoostingClassifier	{'ccp_alpha': 0.0, 'criterion': 'friedman_mse', 'init': None, 'learning_rate': 0.1, 'loss': 'log_loss', 'max_depth': 3, 'max_features': None, 'max_leaf_nodes': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 100, 'n_iter_no_change': None, 'random_state': 42, 'subsample': 1.0, 'tol': 0.0001, 'validation_fraction': 0.1, 'verbose': 0, 'warm_start': False}	0.895164	0.816667	0.821795
5	XGBClassifier	{'objective': 'binary:logistic', 'base_score': None, 'booster': None, 'callbacks': None, 'colsample_bylevel': None, 'colsample_bynode': None, 'colsample_bytree': None, 'device': None, 'early_stopping_rounds': None, 'enable_categorical': False, 'eval_metric': None, 'feature_types': None, 'gamma': None, 'grow_policy': None, 'importance_type': None, 'interaction_constraints': None, 'learning_rate': None, 'max_bin': None, 'max_cat_threshold': None, 'max_cat_to_onehot': None, 'max_delta_step': None, 'max_depth': None, 'max_leaves': None, 'min_child_weight': None, 'missing': nan, 'monotone_constraints': None, 'multi_strategy': None, 'n_estimators': None, 'n_jobs': None, 'num_parallel_tree': None, 'random_state': 42, 'reg_alpha': None, 'reg_lambda': None, 'sampling_method': None, 'scale_pos_weight': None, 'subsample': None, 'tree_method': None, 'validate_parameters': None, 'verbosity': None}	0.898903	0.766667	0.806410
1	SVC	{'C': 1.0, 'break_ties': False, 'cache_size': 200, 'class_weight': None, 'coef0': 0.0, 'decision_function_shape': 'ovr', 'degree': 3, 'gamma': 'scale', 'kernel': 'rbf', 'max_iter': -1, 'probability': False, 'random_state': 42, 'shrinking': True, 'tol': 0.001, 'verbose': False}	0.887858	0.650000	0.776923
3	AdaBoost Classifier	{'algorithm': 'SAMME.R', 'estimator': None, 'learning_rate': 1.0, 'n_estimators': 50, 'random_state': 42}	0.870394	0.783333	0.766667
2	Decision Tree Classifier	{'ccp_alpha': 0.0, 'class_weight': None, 'criterion': 'gini', 'max_depth': None, 'max_features': None, 'max_leaf_nodes': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'monotonic_cst': None, 'random_state': 42, 'splitter': 'best'}	0.771987	0.766667	0.748718
7	Naive Bayes	{'priors': None, 'var_smoothing': 1e-09}	0.879456	0.650000	0.748077
0	Logistic Regression	{'C': 1.0, 'class_weight': None, 'dual': False, 'fit_intercept': True, 'intercept_scaling': 1, 'l1_ratio': None, 'max_iter': 100, 'multi_class': 'auto', 'n_jobs': None, 'penalty': 'l2', 'random_state': 42, 'solver': 'lbfgs', 'tol': 0.0001, 'verbose': 0, 'warm_start': False}	0.907505	0.700000	0.744231

	ML Model	Method	fitting time	optimal hyperparameters	best training AUC score	best estimator	test recall score	test AUC score
0	LogisticRegression	Grid Search	6.858933	{'C': 100, 'max_iter': 9}	0.904043	LogisticRegression(C=100, max_iter=9, random_s...	0.7	0.744231
1	SVC	Grid Search	2.657477	{'C': 10, 'gamma': 1, 'kernel': 'rbf'}	0.970661	SVC(C=10, gamma=1, random_state=42)	0.866667	0.885256
2	AdaBoostClassifier	Randomized Grid Search	66.5541	{'n_estimators': 400, 'learning_rate': 0.8}	0.888874	AdaBoostClassifier(learning_rate=0.8, n_estima...	0.8	0.794231
3	DecisionTreeClassifier	Randomized Grid Search	0.671705	{'splitter': 'best', 'min_samples_split': 6, '...	0.85049	DecisionTreeClassifier(max_depth=2, max_leaf_n...	0.6	0.751923
4	RandomForestClassifier	Randomized Grid Search	45.050044	{'n_estimators': 300, 'min_samples_split': 5, ...	0.906121	RandomForestClassifier(max_leaf_nodes=28, min_...	0.766667	0.787179
5	GradientBoostingClassifier	Randomized Grid Search	12.803396	{'n_estimators': 84, 'min_samples_leaf': 8, 'm...	0.931262	GradientBoostingClassifier(learning_rate=0.8, ...	0.783333	0.805128
6	XGBClassifier	Randomized Grid Search	2.382946	{'subsample': 0.7, 'silent': True, 'reg_lambda...	0.913014	XGBClassifier(base_score=None, booster=None, c...	0.816667	0.841026

Preliminaries¶

Section 2: Data Analysis¶

2.1 Data Exploration¶

2.2 Exploratory Data Analysis¶

2.2.2 Correlation¶

2.2.3 Scatter Matrix¶

Section 3: Data preprocessing¶

3.1 Categorical Attribute replacement¶

3.1.2 Outlier detection¶

3.1.3 Imputation - dealing with missing values¶

3.1.4 Standardizing dataframe attributes¶

3.1.5 Dropping features¶

3.1.6 Splitting data into training / test sets¶

3.2 Data Implementation¶

3.2.2 Implementation¶

GridSearch/RandomizedSearchCV¶

Further Hyperparameter refinement using hyperopt¶

Section 4: Results¶

4.1 Model Evaluation and Validation¶

Section 5: Conclusion¶

5.1 Free-form visualization of SVM Decision Boundaries¶

	ML Model	Training Classification Error	Test Classification_Error
1	SVC	0.09943	0.116071
4	RandomForestClassifier	0.149573	0.196429
6	XGBClassifier	0.176068	0.205357

	Method	test AUC score
ML Model
AdaBoost Classifier	untuned	0.766667
AdaBoostClassifier	Randomized Grid Search	0.794231
Decision Tree Classifier	untuned	0.748718
DecisionTreeClassifier	Randomized Grid Search	0.751923
GradientBoostingClassifier	Randomized Grid Search	0.805128
GradientBoostingClassifier	untuned	0.821795
Logistic Regression	untuned	0.744231
LogisticRegression	Grid Search	0.744231
Naive Bayes	untuned	0.748077
Random Forest Classifier	untuned	0.86859
RandomForestClassifier	Hyperopt tuning	0.858974
RandomForestClassifier	Randomized Grid Search	0.787179
SVC	Hyperopt tuning	0.904487
SVC	untuned	0.776923
SVC	Grid Search	0.885256
XGBClassifier	Hyperopt tuning	0.805128
XGBClassifier	untuned	0.80641
XGBClassifier	Randomized Grid Search	0.841026