In the United States, the number of people with diabetes (too much glucose, or sugar, in the blood) has been increasing at an alarming rate, with an estimated 34.2 million Americans living with the disease. There are two types of diabetes (type 1 and type 2), both of which ultimately leave the body unable to make enough insulin to lower blood glucose. Type 1 is largely a genetic condition, while type 2 (the most common form) develops over time as a result of lifestyle habits. Diabetes affects many physiological systems and is linked to major complications including, but not limited to, heart disease, stroke, blindness, and renal failure. In the U.S., diabetes is the seventh leading cause of death, highlighting how dangerous the disease is. To learn more, click on the hyperlinks above.
In this tutorial, we will look at a dataset from Kaggle (which can be found here), originally collected by the National Institute of Diabetes and Digestive and Kidney Diseases, where the objective "is to diagnostically predict whether a patient has diabetes, based on certain diagnostic measurements included in the dataset." These diagnostic measurements are:
Pregnancies = number of pregnancies
Glucose = glucose level in the blood (plasma glucose concentration)
BloodPressure = diastolic blood pressure (mm Hg)
SkinThickness = triceps skin fold thickness (mm)
Insulin = insulin level in the blood / 2-Hour serum insulin (mu U/ml)
BMI = body mass index (weight in kg/(height in m)^2)
DiabetesPedigreeFunction = function which scores likelihood of diabetes based on family history
Age = age in years
Outcome = 0 if the patient does not have diabetes, 1 if the patient does
All the patients in the dataset are females of Pima Indian heritage who are at least 21 years of age.
We will load the data, perform exploratory analysis to gain insights about the data, do hypothesis testing to draw conclusions through statistical testing, and then use classification algorithms to build a model that accurately predicts whether a patient in the dataset has diabetes.
We will need to import the following packages for this tutorial:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.svm import SVC,SVR
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier,StackingClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer, fbeta_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler
from sklearn import tree
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from scipy.stats import normaltest, ttest_ind, mannwhitneyu, spearmanr
Since the diabetes data is contained in a CSV file, which can be exported from Kaggle, we use the pandas read_csv function to create a dataframe containing all of the data from the file. After loading the data, we examine the dataframe with the info and describe functions.
# Extracting data from the file
data = pd.read_csv('diabetes.csv')
# Getting the details of the dataframe
data.info()
data.describe()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype
---  ------                    --------------  -----
 0   Pregnancies               768 non-null    int64
 1   Glucose                   768 non-null    int64
 2   BloodPressure             768 non-null    int64
 3   SkinThickness             768 non-null    int64
 4   Insulin                   768 non-null    int64
 5   BMI                       768 non-null    float64
 6   DiabetesPedigreeFunction  768 non-null    float64
 7   Age                       768 non-null    int64
 8   Outcome                   768 non-null    int64
dtypes: float64(2), int64(7)
memory usage: 54.1 KB
|  | Pregnancies | Glucose | BloodPressure | SkinThickness | Insulin | BMI | DiabetesPedigreeFunction | Age | Outcome |
|---|---|---|---|---|---|---|---|---|---|
| count | 768.000000 | 768.000000 | 768.000000 | 768.000000 | 768.000000 | 768.000000 | 768.000000 | 768.000000 | 768.000000 |
| mean | 3.845052 | 120.894531 | 69.105469 | 20.536458 | 79.799479 | 31.992578 | 0.471876 | 33.240885 | 0.348958 |
| std | 3.369578 | 31.972618 | 19.355807 | 15.952218 | 115.244002 | 7.884160 | 0.331329 | 11.760232 | 0.476951 |
| min | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.078000 | 21.000000 | 0.000000 |
| 25% | 1.000000 | 99.000000 | 62.000000 | 0.000000 | 0.000000 | 27.300000 | 0.243750 | 24.000000 | 0.000000 |
| 50% | 3.000000 | 117.000000 | 72.000000 | 23.000000 | 30.500000 | 32.000000 | 0.372500 | 29.000000 | 0.000000 |
| 75% | 6.000000 | 140.250000 | 80.000000 | 32.000000 | 127.250000 | 36.600000 | 0.626250 | 41.000000 | 1.000000 |
| max | 17.000000 | 199.000000 | 122.000000 | 99.000000 | 846.000000 | 67.100000 | 2.420000 | 81.000000 | 1.000000 |
We can see that there are 768 non-null, numeric entries in every column, so there are seemingly no missing values. However, the describe function reveals a minimum value of zero in several columns, which is extremely questionable: zeroes in the Glucose, BloodPressure, SkinThickness, Insulin, and BMI columns would be biologically impossible. These zeroes may be meant to represent missing values, but the creators of the dataset make no note of that, so we can't be sure.
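To get a sense of how widespread these questionable zeroes are before deciding how to handle them, we can run a quick check on the dataframe loaded above (a small sketch of our own, not part of the original analysis):

# Count the zero entries in each column where a value of zero is biologically implausible
questionable_cols = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']
zero_counts = (data[questionable_cols] == 0).sum()
print(zero_counts)
print("Share of rows with at least one zero:",
      round((data[questionable_cols] == 0).any(axis=1).mean(), 3))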
Let's do some data visualization on this dataset to get a clearer idea of how the data actually looks and how the features relate to the outcomes.
We will now use a count plot to see how many patients in the dataset have each outcome.
# Creating the count plot
sns.countplot(data = data, x = 'Outcome', palette = 'Greens')
plt.title("Number of Patients in the Dataset with Outcome 0 or Outcome 1")
plt.show()
We can see that most of the patients in the dataset do not have diabetes: roughly 65% (500/768) of the patients do not have diabetes, while about 35% do.
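If we want the exact proportions rather than reading them off the plot, value_counts gives them directly (a one-line check):

# Counts and proportions of patients with each outcome
print(data['Outcome'].value_counts())
print(data['Outcome'].value_counts(normalize=True).round(3))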
We will now create box plots to view the statistical relationship between each feature and the diabetes outcome.
# Iterating through the columns and creating a boxplot for each feature in regard to outcome
for col in data.columns:
    if col != 'Outcome':
        sns.boxplot(data = data, x = 'Outcome', y = col, palette = "Paired")
        titlestring = "Box plot of " + col + " and Diabetes Outcomes \n"
        plt.title(titlestring)
        plt.show()
The box plots show that, for every feature, both the median and the upper quartile are higher for outcome 1 (patients with diabetes) than for outcome 0. The lower quartile is the same for both outcomes for Insulin and SkinThickness, and higher for outcome 1 in the rest of the features. This makes sense since people with diabetes tend to have higher glucose levels and often need to be administered insulin to lower them, which may contribute to higher insulin levels. Diabetes usually occurs in middle-aged and older adults, which is also reflected in the box plot for Age, where the median for outcome 1 lies between 30 and 40 years. Overall, the dataset seems to reflect that the diabetes outcome is associated with higher levels of all the features.
The box plots also show that many of the features have outliers. One approach to cleaning the data would be to remove these outliers or replace them with the mean/median value, but since there are so many of them, it may be better to leave them in rather than modify too much of the data. Furthermore, they may represent natural variation rather than measurement error, so they can be treated as true outliers and left in the dataset, which is how we choose to treat them.
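To roughly quantify how many points the box plots flag, we can count values outside the usual 1.5 × IQR whiskers (the same rule Seaborn uses to draw its whiskers); this is a quick sketch rather than part of our cleaning pipeline:

# Count values outside the 1.5 * IQR whiskers for each feature
for col in data.columns.drop('Outcome'):
    q1, q3 = data[col].quantile([0.25, 0.75])
    iqr = q3 - q1
    lower, upper = q1 - 1.5 * iqr, q3 + 1.5 * iqr
    n_outliers = ((data[col] < lower) | (data[col] > upper)).sum()
    print(f"{col}: {n_outliers} potential outliers")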
Now we will create a heatmap of the correlation matrix to see the relationship between each pair of features/variables (how correlated they are).
# Computes pairwise correlation (corr()) of columns and then uses it to make a heatmap
corr = data.corr()
plt.figure(figsize = (8,6))
sns.heatmap(corr,annot = True,cmap = 'Purples')
plt.title("Correlation Matrix")
plt.show()
Lighter colors indicate little correlation between a pair of variables, while darker colors indicate strong correlation. According to the heatmap above, the feature most correlated with the target variable 'Outcome' is 'Glucose' at 0.47, and the second most correlated is 'BMI' at 0.29. Overall, however, we don't see much correlation among the variables, except between 'Pregnancies' and 'Age', which is intuitive since age affects fertility, and between 'Insulin' and 'SkinThickness'.
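To make that ranking easier to read, the correlations with Outcome can be pulled straight out of the matrix and sorted (a small convenience step, not part of the original analysis):

# Correlation of each feature with the Outcome column, strongest first
print(corr['Outcome'].drop('Outcome').sort_values(ascending=False))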
We can look at the correlations more closely with scatterplots.
# Creating scatterplots between specific columns
sns.scatterplot(x = 'Pregnancies', y = 'Age', data = data)
plt.title("Correlation Between Pregnancies and Age")
plt.show()
sns.scatterplot(x = 'Insulin', y = 'SkinThickness', data = data)
plt.title("Correlation Between Insulin and SkinThickness")
plt.show()
sns.scatterplot(x = 'Outcome', y = 'Glucose', data = data)
plt.title("Correlation Between Outcome and Glucose")
plt.show()
sns.scatterplot(x = 'Outcome', y = 'BMI', data = data)
plt.title("Correlation Between Outcome and BMI")
plt.show()
The scatterplots don't really show a linear trend. In the Pregnancies vs Age plot we can see that lower numbers of pregnancies mostly occur in younger patients and that the density of data points drops as the number of pregnancies increases. In the Insulin vs SkinThickness plot, a large cluster of points in the lower-left portion shows that it is common in this dataset for a patient to have an insulin level between 0 and 200 together with a skin thickness between 0 and 40. The last two scatterplots involving Outcome do not tell us much, since the data can only lie in two places, 0 and 1. However, a distribution plot will let us look at the distribution of the data for each outcome, which will give us more information.
Now we will look at the data distribution of each feature with respect to the diabetes outcome using Seaborn's displot. We use kernel density estimation as a non-parametric way (a statistical method where the data doesn't have to fit a normal distribution) to estimate the probability distribution of a variable. It's important to look at the distribution of the data to see whether variables are normally distributed or skewed, so that we can make more informed choices about which statistical tests to use, since some tests require normally distributed data. To learn more, visit this article.
# Iterating through the columns and plotting a distribution plot using Seaborn, differentiating
# the distribution in each plot by having two colors (hue) correspond to the two outcomes and
# specifying we want to use kernel density estimation 'kde'
for col in data.drop(columns = 'Outcome').columns:
    sns.displot(data = data, x = col, hue = 'Outcome', kind = 'kde', palette = 'Reds')
    titlestring = "Distribution of " + col + " and Diabetes Outcomes \n"
    plt.title(titlestring)
    plt.show()
Many of the feature distributions, such as Pregnancies, DiabetesPedigreeFunction, and Age, appear positively (right) skewed (skewness being a statistic describing the shape, or asymmetry, of a distribution), due to the presence of a tail on the right side of the distribution, which means the outliers lie mostly toward the right.
We can also see the influence of the zero values in some of the graphs. In the Glucose, BloodPressure, SkinThickness, Insulin, and BMI graphs there is a bimodal distribution (two curves) for both outcomes, where one of the curves sits over the density of the zero values. If these zero values were ignored, making most of the distributions unimodal, then the two features most correlated with Outcome according to the heatmap would look close to normal: Glucose for outcome 0 and BMI for both outcomes almost follow a normal distribution (though they may not actually be normally distributed). BloodPressure for both outcomes also seems close to normal, while SkinThickness may be slightly positively skewed and Insulin is clearly positively skewed.
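A quick way to check this "ignoring the zeroes" reading is to redraw one of the distributions with the zero entries filtered out; here is a sketch for Glucose (any of the affected columns could be substituted):

# Re-plot a distribution with the implausible zero values filtered out
nonzero = data[data['Glucose'] != 0]
sns.displot(data = nonzero, x = 'Glucose', hue = 'Outcome', kind = 'kde', palette = 'Reds')
plt.title("Distribution of Glucose (zero values removed) and Diabetes Outcomes \n")
plt.show()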
We will do some data preprocessing to look at alternative ways we can represent and modify the data as a result of these zero values that have an effect on the distribution of the data.
We will make another dataframe with the same data, except that we drop rows with an entry of zero in the columns where a zero is a questionable finding, especially since these zero values are not acknowledged by the creators of the dataset. For instance, nearly half of the Insulin column has entries of zero, which is concerning: an insulin level of zero might occur in a person with untreated type 1 diabetes, but in this dataset the patients with insulin levels of zero have mixed diabetes outcomes. As a result, we will have a total of three dataframes to train machine learning models on: the original data, the data without the rows that contain zero entries in the questionable columns, and a dataframe in which those zero entries are replaced with the median value of their column.
The reason we want to use all three representations of the data, rather than choosing one, is that a large number of zero values are present in some of the columns. If we only used the data without the zero values, we would have a significantly smaller dataset to work with; if we only used the data that replaces these zeroes with median values, we would have modified a large fraction of the data, which reduces its authenticity. We therefore think it's better to build models on all three dataframes and compare model accuracy across them, which allows for more fruitful results.
Since we already have our original dataset, we will now create the remaining two.
# Removing the rows that have an entry of zero in the columns 'Glucose', 'BloodPressure',
# 'SkinThickness', 'Insulin', and 'BMI' column
remove_zero_data = data
remove_zero_data = remove_zero_data[(remove_zero_data[['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']] != 0).all(axis=1)]
remove_zero_data.describe()
|  | Pregnancies | Glucose | BloodPressure | SkinThickness | Insulin | BMI | DiabetesPedigreeFunction | Age | Outcome |
|---|---|---|---|---|---|---|---|---|---|
| count | 392.000000 | 392.000000 | 392.000000 | 392.000000 | 392.000000 | 392.000000 | 392.000000 | 392.000000 | 392.000000 |
| mean | 3.301020 | 122.627551 | 70.663265 | 29.145408 | 156.056122 | 33.086224 | 0.523046 | 30.864796 | 0.331633 |
| std | 3.211424 | 30.860781 | 12.496092 | 10.516424 | 118.841690 | 7.027659 | 0.345488 | 10.200777 | 0.471401 |
| min | 0.000000 | 56.000000 | 24.000000 | 7.000000 | 14.000000 | 18.200000 | 0.085000 | 21.000000 | 0.000000 |
| 25% | 1.000000 | 99.000000 | 62.000000 | 21.000000 | 76.750000 | 28.400000 | 0.269750 | 23.000000 | 0.000000 |
| 50% | 2.000000 | 119.000000 | 70.000000 | 29.000000 | 125.500000 | 33.200000 | 0.449500 | 27.000000 | 0.000000 |
| 75% | 5.000000 | 143.000000 | 78.000000 | 37.000000 | 190.000000 | 37.100000 | 0.687000 | 36.000000 | 1.000000 |
| max | 17.000000 | 198.000000 | 110.000000 | 63.000000 | 846.000000 | 67.100000 | 2.420000 | 81.000000 | 1.000000 |
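As the count row shows, this filtering roughly halves the dataset; a quick comparison (just a sanity check) makes the loss explicit:

# How many rows were lost by removing the zero entries
print("Original rows:", len(data))
print("Rows after removing zero entries:", len(remove_zero_data))
print("Rows dropped:", len(data) - len(remove_zero_data))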
# Replacing the zero values in 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', and 'BMI'
# columns with the median of its column
replace_zero_data = data.copy()  # use a copy so the original dataframe is not modified in place
replace_zero_data['Glucose']=replace_zero_data['Glucose'].replace(0,replace_zero_data['Glucose'].median())
replace_zero_data['BloodPressure']=replace_zero_data['BloodPressure'].replace(0,replace_zero_data['BloodPressure'].median())
replace_zero_data['SkinThickness']=replace_zero_data['SkinThickness'].replace(0,replace_zero_data['SkinThickness'].median())
replace_zero_data['Insulin']=replace_zero_data['Insulin'].replace(0,replace_zero_data['Insulin'].median())
replace_zero_data['BMI']=replace_zero_data['BMI'].replace(0,replace_zero_data['BMI'].median())
replace_zero_data.describe()
|  | Pregnancies | Glucose | BloodPressure | SkinThickness | Insulin | BMI | DiabetesPedigreeFunction | Age | Outcome |
|---|---|---|---|---|---|---|---|---|---|
| count | 768.000000 | 768.000000 | 768.000000 | 768.000000 | 768.000000 | 768.000000 | 768.000000 | 768.000000 | 768.000000 |
| mean | 3.845052 | 121.656250 | 72.386719 | 27.334635 | 94.652344 | 32.450911 | 0.471876 | 33.240885 | 0.348958 |
| std | 3.369578 | 30.438286 | 12.096642 | 9.229014 | 105.547598 | 6.875366 | 0.331329 | 11.760232 | 0.476951 |
| min | 0.000000 | 44.000000 | 24.000000 | 7.000000 | 14.000000 | 18.200000 | 0.078000 | 21.000000 | 0.000000 |
| 25% | 1.000000 | 99.750000 | 64.000000 | 23.000000 | 30.500000 | 27.500000 | 0.243750 | 24.000000 | 0.000000 |
| 50% | 3.000000 | 117.000000 | 72.000000 | 23.000000 | 31.250000 | 32.000000 | 0.372500 | 29.000000 | 0.000000 |
| 75% | 6.000000 | 140.250000 | 80.000000 | 32.000000 | 127.250000 | 36.600000 | 0.626250 | 41.000000 | 1.000000 |
| max | 17.000000 | 199.000000 | 122.000000 | 99.000000 | 846.000000 | 67.100000 | 2.420000 | 81.000000 | 1.000000 |
Now that we have our three dataframes, we will do some hypothesis testing.
Hypothesis testing is used to support "the certainty of findings": a null and an alternative hypothesis, essentially two mutually exclusive statements, are evaluated to see which one the sample data best supports. There are parametric and non-parametric hypothesis tests, and the difference lies in the distribution of the data: parametric tests are used on normally distributed data, while non-parametric tests are used on data that is not.
Although we made some inferences about the distributions using the distribution plots earlier, we will use Scipy's normaltest function to check whether each feature is normally distributed for each outcome. We do this to determine whether we should use a parametric or a non-parametric test on our data when deciding to reject or fail to reject our null hypothesis. Whether the null hypothesis is rejected is decided by the p-value returned by the test. The p-value is "the probability of obtaining results at least as extreme as the observed results of a statistical hypothesis test, assuming that the null hypothesis is correct". The lower the p-value, the stronger the evidence against the null hypothesis. Statistical significance is commonly defined as a p-value of 0.05 or less, so we will use this as our significance level for rejecting the null hypothesis.
# Splitting the data into two dataframes based on outcome
outcome_0_data = data[data['Outcome'] == 0]
outcome_1_data = data[data['Outcome'] == 1]
# Arrays that will store the column names + outcome depending on their distribution
normaldist = []
notnormaldist = []
# Iterating through all of the columns and using the normaltest function to test the
# distribution of data for each column/feature and outcome by getting the pvalue and
# placing the name of the column and outcome in the appropriate array
for col in data.columns:
    if col != 'Outcome':
        pval0 = normaltest(outcome_0_data[col])[1]
        pval1 = normaltest(outcome_1_data[col])[1]
        if pval0 <= 0.05:
            notnormaldist.append(col + ' Outcome 0')
        else:
            normaldist.append(col + ' Outcome 0')
        if pval1 <= 0.05:
            notnormaldist.append(col + ' Outcome 1')
        else:
            normaldist.append(col + ' Outcome 1')
print("Normal Distribution:\n", normaldist)
print("Not Normal Distribution:\n", notnormaldist)
Normal Distribution:
 []
Not Normal Distribution:
 ['Pregnancies Outcome 0', 'Pregnancies Outcome 1', 'Glucose Outcome 0', 'Glucose Outcome 1', 'BloodPressure Outcome 0', 'BloodPressure Outcome 1', 'SkinThickness Outcome 0', 'SkinThickness Outcome 1', 'Insulin Outcome 0', 'Insulin Outcome 1', 'BMI Outcome 0', 'BMI Outcome 1', 'DiabetesPedigreeFunction Outcome 0', 'DiabetesPedigreeFunction Outcome 1', 'Age Outcome 0', 'Age Outcome 1']
We can see that all of the features for both outcomes in the original data have a p-value less than or equal to 0.05, meaning none of them follows a normal distribution.
Now let's repeat the process for the other two dataframes.
# Repeating process for the dataframe with zeroes removed
outcome_0_remove_zero_data = remove_zero_data[remove_zero_data['Outcome'] == 0]
outcome_1_remove_zero_data = remove_zero_data[remove_zero_data['Outcome'] == 1]
normaldist = []
notnormaldist = []
for col in remove_zero_data.columns:
    if col != 'Outcome':
        pval0 = normaltest(outcome_0_remove_zero_data[col])[1]
        pval1 = normaltest(outcome_1_remove_zero_data[col])[1]
        if pval0 <= 0.05:
            notnormaldist.append(col + ' Outcome 0')
        else:
            normaldist.append(col + ' Outcome 0')
        if pval1 <= 0.05:
            notnormaldist.append(col + ' Outcome 1')
        else:
            normaldist.append(col + ' Outcome 1')
print("Dataframe with Zeroes Removed\n")
print("Normal Distribution:\n", normaldist)
print("Not Normal Distribution:\n", notnormaldist, "\n")
# Repeating process for the dataframe with zeroes replaced by median values
outcome_0_replace_zero_data = replace_zero_data[replace_zero_data['Outcome'] == 0]
outcome_1_replace_zero_data = replace_zero_data[replace_zero_data['Outcome'] == 1]
normaldist = []
notnormaldist = []
for col in replace_zero_data.columns:
    if col != 'Outcome':
        pval0 = normaltest(outcome_0_replace_zero_data[col])[1]
        pval1 = normaltest(outcome_1_replace_zero_data[col])[1]
        if pval0 <= 0.05:
            notnormaldist.append(col + ' Outcome 0')
        else:
            normaldist.append(col + ' Outcome 0')
        if pval1 <= 0.05:
            notnormaldist.append(col + ' Outcome 1')
        else:
            normaldist.append(col + ' Outcome 1')
print("Dataframe with Zeroes Replaced by Median\n")
print("Normal Distribution:\n", normaldist)
print("Not Normal Distribution:\n", notnormaldist)
Dataframe with Zeroes Removed

Normal Distribution:
 ['BloodPressure Outcome 0', 'BloodPressure Outcome 1', 'SkinThickness Outcome 1']
Not Normal Distribution:
 ['Pregnancies Outcome 0', 'Pregnancies Outcome 1', 'Glucose Outcome 0', 'Glucose Outcome 1', 'SkinThickness Outcome 0', 'Insulin Outcome 0', 'Insulin Outcome 1', 'BMI Outcome 0', 'BMI Outcome 1', 'DiabetesPedigreeFunction Outcome 0', 'DiabetesPedigreeFunction Outcome 1', 'Age Outcome 0', 'Age Outcome 1']

Dataframe with Zeroes Replaced by Median

Normal Distribution:
 []
Not Normal Distribution:
 ['Pregnancies Outcome 0', 'Pregnancies Outcome 1', 'Glucose Outcome 0', 'Glucose Outcome 1', 'BloodPressure Outcome 0', 'BloodPressure Outcome 1', 'SkinThickness Outcome 0', 'SkinThickness Outcome 1', 'Insulin Outcome 0', 'Insulin Outcome 1', 'BMI Outcome 0', 'BMI Outcome 1', 'DiabetesPedigreeFunction Outcome 0', 'DiabetesPedigreeFunction Outcome 1', 'Age Outcome 0', 'Age Outcome 1']
We can see that in the dataframe where the zero values in certain columns were replaced by the median, none of the features is normally distributed with respect to outcome. However, the dataframe with the zeroes removed instead of replaced has three normal distributions: BloodPressure for both outcomes and SkinThickness for outcome 1. Earlier, when looking at the distribution plots, we correctly suspected that BloodPressure for both outcomes would be normally distributed without the zero values, and incorrectly suspected SkinThickness to be skewed. This highlights the importance of revisiting claims made in earlier parts of the data science process and of using statistical tests to support judgments about data, since they are more reliable than educated guesses without proof.
We will now use the Mann-Whitney U test, a non-parametric test, to compare the two outcomes for the features that aren't normally distributed, and the independent t-test, the parametric equivalent of the Mann-Whitney test, to compare the BloodPressure data in the removed-zeroes dataframe. For each feature, the hypotheses we test are below.
H0 denotes the null hypothesis (the feature's values for patients with diabetes are not greater than those for patients without diabetes), while H1 denotes the alternative hypothesis (the feature's values for patients with diabetes are stochastically greater than those for patients without diabetes). After conducting the statistical tests, we will compare each p-value with 0.05 to decide whether to reject the null hypothesis (reject if p-value <= 0.05, fail to reject otherwise).
print("Original Data \n")
# Arrays that will store the names of features that either reject or fail to reject H0
reject = []
fail_to_reject = []
# Iterating through the columns to do the Mann-Whitney test and get the p-value to see if
# it's <= 0.05
for col in data.columns:
    if col != 'Outcome':
        # alternative = 'greater' means that the alternative hypothesis is that the
        # distribution underlying x is stochastically greater than the distribution underlying y;
        # x being outcome_1_data and y being outcome_0_data
        manwhit_pvalue = mannwhitneyu(outcome_1_data[col], outcome_0_data[col],
                                      alternative = 'greater')[1]
        if manwhit_pvalue <= 0.05:
            reject.append(col)
        else:
            fail_to_reject.append(col)
print("Reject:", reject)
print("Fail to Reject:", fail_to_reject, "\n")
# Repeat above process for median data
print("Median Data \n")
reject = []
fail_to_reject = []
for col in replace_zero_data.columns:
    if col != 'Outcome':
        manwhit_pvalue = mannwhitneyu(outcome_1_replace_zero_data[col],
                                      outcome_0_replace_zero_data[col], alternative = 'greater')[1]
        if manwhit_pvalue <= 0.05:
            reject.append(col)
        else:
            fail_to_reject.append(col)
print("Reject:", reject)
print("Fail to Reject:", fail_to_reject, "\n")
# Repeat above process for removed zero data with the addition of conducting Independent T test
print("Removed Zero Data \n")
reject = []
fail_to_reject = []
for col in remove_zero_data.columns:
    if col != 'Outcome' and col != 'BloodPressure':
        manwhit_pvalue = mannwhitneyu(outcome_1_remove_zero_data[col],
                                      outcome_0_remove_zero_data[col], alternative = 'greater')[1]
        if manwhit_pvalue <= 0.05:
            reject.append(col)
        else:
            fail_to_reject.append(col)
    # Independent T test for BloodPressure
    elif col == 'BloodPressure':
        ttest_pvalue = ttest_ind(outcome_1_remove_zero_data[col],
                                 outcome_0_remove_zero_data[col], alternative = 'greater')[1]
        if ttest_pvalue <= 0.05:
            reject.append(col)
        else:
            fail_to_reject.append(col)
print("Reject:", reject)
print("Fail to Reject:", fail_to_reject)
Original Data

Reject: ['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age']
Fail to Reject: []

Median Data

Reject: ['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age']
Fail to Reject: []

Removed Zero Data

Reject: ['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age']
Fail to Reject: []
After performing the Mann-Whitney U and independent t-tests, we can see that all of the p-values were statistically significant (<= 0.05), meaning that we reject the null hypothesis for every feature and can conclude that Pregnancies, Glucose, BloodPressure, SkinThickness, Insulin, BMI, DiabetesPedigreeFunction, and Age all tend to be higher for patients with diabetes than for patients without diabetes, across all three dataframes. This is consistent with what we saw in the box plots of the original data, where the median and upper quartile were higher for outcome 1 than for outcome 0 for all of the features. Consequently, there seems to be a trend between the diabetes outcome and higher levels of all the features.
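A simple way to see the same pattern numerically is to compare the per-outcome medians, which is roughly what the rank-based Mann-Whitney test is comparing (a quick sketch on the original dataframe):

# Median of each feature for patients without (0) and with (1) diabetes
print(data.groupby('Outcome').median())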
Let's revisit the idea of correlation between the features and the outcome across the three dataframes, to see whether this apparent trend hints at correlation, using the Spearman rank correlation coefficient, a non-parametric measure of the strength and direction of the association between two variables.
print("Original Data \n")
reject = []
fail_to_reject = []
# Iterating through the columns to do the Spearman test and get the p-value to see if
# it's <= 0.05
for col in data.columns:
    if col != 'Outcome':
        # alternative = 'greater' means that the alternative hypothesis is that
        # the correlation is positive (greater than zero)
        spearman_results = spearmanr(data[col], data['Outcome'], alternative = 'greater')
        pval = spearman_results[1]
        print(col, ":", spearman_results)
        if pval <= 0.05:
            reject.append(col)
        else:
            fail_to_reject.append(col)
print("\nReject:", reject)
print("Fail to Reject:", fail_to_reject, "\n")
# Repeat above process for median data
print("Median Data \n")
reject = []
fail_to_reject = []
for col in replace_zero_data.columns:
    if col != 'Outcome':
        spearman_results = spearmanr(replace_zero_data[col], replace_zero_data['Outcome'],
                                     alternative = 'greater')
        pval = spearman_results[1]
        print(col, ":", spearman_results)
        if pval <= 0.05:
            reject.append(col)
        else:
            fail_to_reject.append(col)
print("\nReject:", reject)
print("Fail to Reject:", fail_to_reject, "\n")
# Repeat above process for removed zeroes data
print("Removed Zero Data \n")
reject = []
fail_to_reject = []
for col in remove_zero_data.columns:
    if col != 'Outcome':
        spearman_results = spearmanr(remove_zero_data[col], remove_zero_data['Outcome'],
                                     alternative = 'greater')
        pval = spearman_results[1]
        print(col, ":", spearman_results)
        if pval <= 0.05:
            reject.append(col)
        else:
            fail_to_reject.append(col)
print("\nReject:", reject)
print("Fail to Reject:", fail_to_reject)
Original Data

Pregnancies : SpearmanrResult(correlation=0.19868874913189663, pvalue=1.4063922395649354e-08)
Glucose : SpearmanrResult(correlation=0.4813968963350598, pvalue=4.20050321489302e-46)
BloodPressure : SpearmanrResult(correlation=0.1708324018917003, pvalue=9.60417975088519e-07)
SkinThickness : SpearmanrResult(correlation=0.1966084477604776, pvalue=1.971318139766496e-08)
Insulin : SpearmanrResult(correlation=0.0747519433225385, pvalue=0.019174226987829963)
BMI : SpearmanrResult(correlation=0.3073382077773797, pvalue=1.4520901823689255e-18)
DiabetesPedigreeFunction : SpearmanrResult(correlation=0.17535346981239172, pvalue=5.053170547826717e-07)
Age : SpearmanrResult(correlation=0.30904026356718634, pvalue=9.264872122688206e-19)

Reject: ['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age']
Fail to Reject: []

Median Data

Pregnancies : SpearmanrResult(correlation=0.19868874913189663, pvalue=1.4063922395649354e-08)
Glucose : SpearmanrResult(correlation=0.4813968963350598, pvalue=4.20050321489302e-46)
BloodPressure : SpearmanrResult(correlation=0.1708324018917003, pvalue=9.60417975088519e-07)
SkinThickness : SpearmanrResult(correlation=0.1966084477604776, pvalue=1.971318139766496e-08)
Insulin : SpearmanrResult(correlation=0.0747519433225385, pvalue=0.019174226987829963)
BMI : SpearmanrResult(correlation=0.3073382077773797, pvalue=1.4520901823689255e-18)
DiabetesPedigreeFunction : SpearmanrResult(correlation=0.17535346981239172, pvalue=5.053170547826717e-07)
Age : SpearmanrResult(correlation=0.30904026356718634, pvalue=9.264872122688206e-19)

Reject: ['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age']
Fail to Reject: []

Removed Zero Data

Pregnancies : SpearmanrResult(correlation=0.20023445259481654, pvalue=3.272010288408068e-05)
Glucose : SpearmanrResult(correlation=0.498728899671441, pvalue=2.421705565130313e-26)
BloodPressure : SpearmanrResult(correlation=0.1981265676706596, pvalue=3.917397274194511e-05)
SkinThickness : SpearmanrResult(correlation=0.2601534347999191, pvalue=8.728669666748573e-08)
Insulin : SpearmanrResult(correlation=0.37501949347790997, pvalue=7.739426816506896e-15)
BMI : SpearmanrResult(correlation=0.26707717071700626, pvalue=3.963032815867018e-08)
DiabetesPedigreeFunction : SpearmanrResult(correlation=0.19821353543782874, pvalue=3.888552617588149e-05)
Age : SpearmanrResult(correlation=0.39735473573446217, pvalue=1.4029475132681835e-16)

Reject: ['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age']
Fail to Reject: []
The results of the Spearman rank correlation coefficient show not only that the features are correlated with the outcome but that they are positively correlated, across all three dataframes. None of the features was added to the Fail to Reject array because all of the p-values were less than or equal to 0.05, meaning the positive correlation between each feature and the outcome is statistically significant, so we reject the null hypothesis.
This concludes our hypothesis testing, which has allowed us to make supported claims about our findings; we now move on to building models for our data.
We will now develop machine learning models to predict the diabetes outcome of patients in the dataset. Using preexisting algorithms, we will produce a model that will make an outcome prediction when given information. There are many algorithms to choose from; however, we decided to use the classification models Random Forest, K-Nearest Neighbor (kNN), and Support Vector Machine (SVM).
Random Forest is a more versatile version of the decision tree, reducing overfitting and increasing accuracy. Both decision trees and random forests can handle missing data better than many other algorithms and are generally fast at prediction due to the logarithmic depth of the trees. However, the model takes more resources to build because it constructs many trees, and it is far less interpretable than a single decision tree.
We are expecting Random Forest to produce the best results for our dataset due to its versatility.
kNN is an intuitive and simple algorithm to visualize, with many variations arising from the distance metric used. However, it is sensitive to poorly matched features, has difficulty handling large amounts of data at once, and can end up favoring one class over the other. It also handles high-dimensional data poorly, with a risk of overfitting.
kNN is a common algorithm used on many datasets, so we want to see how accurate it can be in a situation where it may struggle.
SVMs are "the go-to method for a high performing algorithm with little tuning." They are more effective in datasets with high dimensions and clear margins between classes, and are generally more computationally friendly. It struggles with overlapping classes and becomes resource demanding in large datasets. We want to see how SVM would perform in a medium dimension and large dataset.
In producing a model with an algorithm, there are several parameters, called hyperparameters, that can be adjusted to produce a different model; adjusting them to produce the best model for our purpose is called hyperparameter tuning. Since manually testing each combination of hyperparameters is time consuming, we use a grid search to test each combination and return the best result.
Hyperparameter tuning will be scored with an emphasis on recall (specifically an F-beta score with beta = 2, which weights recall more heavily than precision). Recall is the fraction of actual positives that the model marks as positive. In this situation, we are trying to decrease false negatives: if a diabetic individual is not detected as diabetic (a false negative), they could be harmed by not being given care. Thus we care most about the recall for class 1.
We will not prioritize the recall of class 1 exclusively, since a model that answers 1 for every input would technically achieve 100% recall for class 1. We will also look at overall accuracy.
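The scorer we will pass to the grid searches below is that F-beta score with beta = 2, which sits between plain recall and the F1 score but leans toward recall. A tiny worked example with made-up labels (not taken from the dataset) shows the effect:

# Toy example: with beta=2, the F-beta score is pulled toward recall
y_true_toy = [1, 1, 1, 1, 0, 0, 0, 0]
y_pred_toy = [1, 1, 0, 0, 0, 0, 0, 1]  # 2 false negatives, 1 false positive
print("Precision:", precision_score(y_true_toy, y_pred_toy))
print("Recall:   ", recall_score(y_true_toy, y_pred_toy))
print("F1:       ", f1_score(y_true_toy, y_pred_toy))
print("F2:       ", fbeta_score(y_true_toy, y_pred_toy, beta=2))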
We will run each algorithm on the three differently preprocessed datasets: the replaced-zeroes dataset, the original dataset, and the removed-zeroes dataset. Each algorithm may have a preprocessing method that suits it best, so we will compare the results of our models on each dataset.
First we will split our data into training and testing data for the replaced zeroes with median dataframe. 25% of the data will be reserved for verifying the quality of our model.
# Initialize X and Y test and train groups
X = replace_zero_data.loc[:, replace_zero_data.columns != 'Outcome']
y = replace_zero_data.loc[:,'Outcome'].values
f2_score = make_scorer(fbeta_score, beta=2, pos_label=1)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)
# Random Forest Classifier for replaced zero dataset
replace_zero_forest = RandomForestClassifier(random_state=0)
# Fit model to training data
replace_zero_forest.fit(X_train, y_train)
# See how model does on testing data
rzf_pred = replace_zero_forest.predict(X_test)
print ('ROC AUC:', roc_auc_score(y_test, rzf_pred, average=None))
print ('Accuracy:', accuracy_score(y_test, rzf_pred))
print ('Confusion Matrix:\n', confusion_matrix(y_test, rzf_pred))
print ('Classification Report:\n', classification_report(y_test, rzf_pred))
ROC AUC: 0.7317073170731707 Accuracy: 0.75 Confusion Matrix: [[98 25] [23 46]] Classification Report: precision recall f1-score support 0 0.81 0.80 0.80 123 1 0.65 0.67 0.66 69 accuracy 0.75 192 macro avg 0.73 0.73 0.73 192 weighted avg 0.75 0.75 0.75 192
Random Forest has the best results of any of the algorithms we try on this dataset.
# Specifying the hyperparameters
rzf_param_grid = {'max_samples':[0.1, 0.2, 0.3, 0.4],
'max_features': [1, 2],
'n_estimators':[5, 10, 50, 100],
'max_depth':[8, 9, 10]
}
# Splitting data into 5 folds, or 5 sections to do 5-fold cross validation
rzf_cv = KFold(n_splits=5)
# Using grid search
rzf_grid = GridSearchCV(RandomForestClassifier(), rzf_param_grid, cv=rzf_cv, scoring = f2_score)
rzf_grid.fit(X_train, y_train)
# This should take half a minute
print('Random Forest best Params:', rzf_grid.best_params_)
print('Random Forest best Score:', rzf_grid.best_score_)
hrzf_pred = rzf_grid.predict(X_test)
print ('ROC AUC:', roc_auc_score(y_test, hrzf_pred, average=None))
print ('Accuracy:', accuracy_score(y_test, hrzf_pred))
print ('Confusion Matrix:\n', confusion_matrix(y_test, hrzf_pred))
print ('Classification Report:\n', classification_report(y_test, hrzf_pred))
Random Forest best Params: {'max_depth': 9, 'max_features': 2, 'max_samples': 0.3, 'n_estimators': 100} Random Forest best Score: 0.6155776749158334 ROC AUC: 0.6891127606928242 Accuracy: 0.7239583333333334 Confusion Matrix: [[100 23] [ 30 39]] Classification Report: precision recall f1-score support 0 0.77 0.81 0.79 123 1 0.63 0.57 0.60 69 accuracy 0.72 192 macro avg 0.70 0.69 0.69 192 weighted avg 0.72 0.72 0.72 192
Notably, the best hyperparameters found for our dataset change from run to run, although the best score rarely changes much.
Curiously, while the scorer prioritizes recall for class 1 and succeeds in increasing class 1 recall for the later algorithms, in this case both the class 1 recall and the overall accuracy actually decrease after tuning.
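The run-to-run variation comes from the randomness inside RandomForestClassifier itself; fixing its random_state makes the search reproducible. A variant of the cell above with a fixed seed (our own tweak, not part of the original tuning) would look like this:

# Same grid search, but with a fixed random_state so the "best" hyperparameters
# are reproducible between runs
rzf_grid_seeded = GridSearchCV(RandomForestClassifier(random_state=0),
                               rzf_param_grid, cv=rzf_cv, scoring=f2_score)
rzf_grid_seeded.fit(X_train, y_train)
print('Seeded Random Forest best Params:', rzf_grid_seeded.best_params_)
print('Seeded Random Forest best Score:', rzf_grid_seeded.best_score_)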
rep_zero_kNN = KNeighborsClassifier(n_neighbors = 10)
# Fitting model to training data
rep_zero_kNN.fit(X_train, y_train)
# See how model does on testing data
rzk_pred = rep_zero_kNN.predict(X_test)
print ('ROC AUC:', roc_auc_score(y_test, rzk_pred, average=None))
print ('Accuracy:', accuracy_score(y_test, rzk_pred))
print ('Confusion Matrix:\n', confusion_matrix(y_test, rzk_pred))
print ('Classification Report:\n', classification_report(y_test, rzk_pred))
ROC AUC: 0.6633085896076353 Accuracy: 0.703125 Confusion Matrix: [[99 24] [33 36]] Classification Report: precision recall f1-score support 0 0.75 0.80 0.78 123 1 0.60 0.52 0.56 69 accuracy 0.70 192 macro avg 0.68 0.66 0.67 192 weighted avg 0.70 0.70 0.70 192
# Specifying the hyperparameters
kNN_param_grid = {'n_neighbors':[2,5,10,15,20,30,50],
'weights':['uniform', 'distance'],
'algorithm': ['auto', 'ball_tree', 'kd_tree'],
'metric':['minkowski','euclidean','manhattan']}
# Splitting data into 5 folds, or 5 sections to do 5-fold cross validation
kNN_cv = KFold(n_splits=5)
# Using grid search
kNN_grid = GridSearchCV(KNeighborsClassifier(), kNN_param_grid, cv=kNN_cv, scoring = f2_score)
kNN_grid.fit(X_train, y_train)
# This runs the smoothest
print('k-NN best Params:', kNN_grid.best_params_)
print('k-NN best Score:', kNN_grid.best_score_)
kNN_pred = kNN_grid.predict(X_test)
print ('ROC AUC:', roc_auc_score(y_test, kNN_pred, average=None))
print ('Accuracy:', accuracy_score(y_test, kNN_pred))
print ('Confusion Matrix:\n', confusion_matrix(y_test, kNN_pred))
print ('Classification Report:\n', classification_report(y_test, kNN_pred))
k-NN best Params: {'algorithm': 'auto', 'metric': 'minkowski', 'n_neighbors': 5, 'weights': 'distance'} k-NN best Score: 0.5820232318924576 ROC AUC: 0.6240721102863201 Accuracy: 0.640625 Confusion Matrix: [[84 39] [30 39]] Classification Report: precision recall f1-score support 0 0.74 0.68 0.71 123 1 0.50 0.57 0.53 69 accuracy 0.64 192 macro avg 0.62 0.62 0.62 192 weighted avg 0.65 0.64 0.64 192
The kNN classifier performs the worst of our algorithms, and the increase in class 1 recall after tuning comes with the biggest drop in accuracy we've seen.
data_SVC = SVC()
# Fitting model to training data
data_SVC.fit(X_train, y_train)
# See how model does on testing data
svc_pred = data_SVC.predict(X_test)
print ('ROC AUC:', roc_auc_score(y_test, svc_pred, average=None))
print ('Accuracy:', accuracy_score(y_test, svc_pred))
print ('Confusion Matrix:\n', confusion_matrix(y_test, svc_pred))
print ('Classification Report:\n', classification_report(y_test, svc_pred))
ROC AUC: 0.6772711205372923 Accuracy: 0.7291666666666666 Confusion Matrix: [[106 17] [ 35 34]] Classification Report: precision recall f1-score support 0 0.75 0.86 0.80 123 1 0.67 0.49 0.57 69 accuracy 0.73 192 macro avg 0.71 0.68 0.68 192 weighted avg 0.72 0.73 0.72 192
The recall for class 1 is the weakest for this algorithm, although it otherwise performs reasonably well; its recall for class 0 is actually the highest of all the models so far.
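One thing we have not done here is scale the features, even though SVMs are sensitive to feature scale and the columns span very different ranges (Insulin in the hundreds versus DiabetesPedigreeFunction below 2.5). Below is a sketch of an alternative that puts the StandardScaler we imported earlier in front of the SVC via a pipeline; we did not pursue it further in this tutorial:

from sklearn.pipeline import make_pipeline

# Standardize the features before fitting the SVM; scaling often helps SVC
scaled_svc = make_pipeline(StandardScaler(), SVC())
scaled_svc.fit(X_train, y_train)
scaled_pred = scaled_svc.predict(X_test)
print('Accuracy (scaled SVC):', accuracy_score(y_test, scaled_pred))
print(classification_report(y_test, scaled_pred))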
# Specifying the hyperparameters
svm_param_grid = {'C':[0.01, 0.1, 1, 10, 100], \
'kernel':['rbf'], 'gamma':[0.00001, 0.0001, 0.001, 0.01, 0.1, 1, 10]}
# do not use linear or poly for kernel, it will softlock your kernel
# Splitting data into 5 folds, or 5 sections to do 5-fold cross validation
svm_cv = KFold(n_splits=5)
# Using grid search
svm_grid = GridSearchCV(SVC(), svm_param_grid, cv=svm_cv, scoring = f2_score)
svm_grid.fit(X_train, y_train)
print('SVM best Params:', svm_grid.best_params_)
print('SVM best Score:', svm_grid.best_score_)
svm_pred = svm_grid.predict(X_test)
print ('ROC AUC:', roc_auc_score(y_test, svm_pred, average=None))
print ('Accuracy:', accuracy_score(y_test, svm_pred))
print ('Confusion Matrix:\n', confusion_matrix(y_test, svm_pred))
print ('Classification Report:\n', classification_report(y_test, svm_pred))
SVM best Params: {'C': 10, 'gamma': 0.0001, 'kernel': 'rbf'} SVM best Score: 0.5796992942249825 ROC AUC: 0.7108518911276069 Accuracy: 0.7395833333333334 Confusion Matrix: [[100 23] [ 27 42]] Classification Report: precision recall f1-score support 0 0.79 0.81 0.80 123 1 0.65 0.61 0.63 69 accuracy 0.74 192 macro avg 0.72 0.71 0.71 192 weighted avg 0.74 0.74 0.74 192
Note that while SVM has more kernel options we could have included, such as 'linear' and 'poly', SVMs are better suited to smaller datasets and those kernels will not run to completion in a reasonable time here, so they have been omitted from hyperparameter tuning.
Our random forest remains the most accurate of our algorithms on this dataset.
The accuracy and recall rates for all of our algorithms are relatively low, which is consistent with significant overlap between the measured health indicators of non-diabetic and diabetic patients. Notably, the precision, recall, and f1-score for diabetic patients are lower than those for non-diabetic patients in all of our models, indicating a generally lower chance of recognizing a diabetic patient.
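Since only about a third of the patients are diabetic, another option we did not explore further is to weight the minority class more heavily; both RandomForestClassifier and SVC accept class_weight='balanced'. A sketch on the baseline random forest:

# Re-fit the baseline random forest with class weights inversely
# proportional to class frequencies
balanced_forest = RandomForestClassifier(random_state=0, class_weight='balanced')
balanced_forest.fit(X_train, y_train)
balanced_pred = balanced_forest.predict(X_test)
print('Recall (class 1):', recall_score(y_test, balanced_pred))
print('Accuracy:', accuracy_score(y_test, balanced_pred))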
Now we will repeat the same process with the original data, leaving the zero values as they are.
# Initialize X and Y test and train groups
X = data.loc[:, data.columns != 'Outcome']
y = data.loc[:,'Outcome'].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Random Forest Classifier for original dataset
original_forest = RandomForestClassifier(random_state=0)
original_forest.fit(X_train, y_train)
rzf_pred = original_forest.predict(X_test)
print ('ROC AUC:', roc_auc_score(y_test, rzf_pred, average=None))
print ('Accuracy:', accuracy_score(y_test, rzf_pred))
print ('Confusion Matrix:\n', confusion_matrix(y_test, rzf_pred))
print ('Classification Report:\n', classification_report(y_test, rzf_pred))
ROC AUC: 0.7353535353535354 Accuracy: 0.7532467532467533 Confusion Matrix: [[79 20] [18 37]] Classification Report: precision recall f1-score support 0 0.81 0.80 0.81 99 1 0.65 0.67 0.66 55 accuracy 0.75 154 macro avg 0.73 0.74 0.73 154 weighted avg 0.76 0.75 0.75 154
The overall accuracy and the recall for class 1 have actually held up or improved slightly.
rzf_param_grid = {'max_samples':[0.1, 0.2, 0.3, 0.4],
'max_features': [1, 2],
'n_estimators':[5, 10, 50, 100],
'max_depth':[8, 9, 10]
}
rzf_cv = KFold(n_splits=5)
rzf_grid = GridSearchCV(RandomForestClassifier(), rzf_param_grid, cv=rzf_cv, scoring = f2_score)
rzf_grid.fit(X_train, y_train)
# This should take half a minute
print('Random Forest best Params:', rzf_grid.best_params_)
print('Random Forest best Score:', rzf_grid.best_score_)
rzf_pred = rzf_grid.predict(X_test)
print ('ROC AUC:', roc_auc_score(y_test, rzf_pred, average=None))
print ('Accuracy:', accuracy_score(y_test, rzf_pred))
print ('Confusion Matrix:\n', confusion_matrix(y_test, rzf_pred))
print ('Classification Report:\n', classification_report(y_test, rzf_pred))
Random Forest best Params: {'max_depth': 9, 'max_features': 2, 'max_samples': 0.4, 'n_estimators': 5} Random Forest best Score: 0.6235982694211986 ROC AUC: 0.7161616161616162 Accuracy: 0.7337662337662337 Confusion Matrix: [[77 22] [19 36]] Classification Report: precision recall f1-score support 0 0.80 0.78 0.79 99 1 0.62 0.65 0.64 55 accuracy 0.73 154 macro avg 0.71 0.72 0.71 154 weighted avg 0.74 0.73 0.74 154
This is our best performing model so far.
# k-NN Classifier for original dataset
original_kNN = KNeighborsClassifier()
original_kNN.fit(X_train, y_train)
rzk_pred = original_kNN.predict(X_test)
print ('ROC AUC:', roc_auc_score(y_test, rzk_pred, average=None))
print ('Accuracy:', accuracy_score(y_test, rzk_pred))
print ('Confusion Matrix:\n', confusion_matrix(y_test, rzk_pred))
print ('Classification Report:\n', classification_report(y_test, rzk_pred))
ROC AUC: 0.6474747474747474 Accuracy: 0.6558441558441559 Confusion Matrix: [[67 32] [21 34]] Classification Report: precision recall f1-score support 0 0.76 0.68 0.72 99 1 0.52 0.62 0.56 55 accuracy 0.66 154 macro avg 0.64 0.65 0.64 154 weighted avg 0.67 0.66 0.66 154
kNN_param_grid = {'n_neighbors':[2,5,10,15,20,30,50],
'weights':['uniform', 'distance'],
'metric':['minkowski','euclidean','manhattan']}
kNN_cv = KFold(n_splits=5)
kNN_grid = GridSearchCV(KNeighborsClassifier(), kNN_param_grid, cv=kNN_cv, scoring = f2_score)
kNN_grid.fit(X_train, y_train)
# This runs the smoothest
print('k-NN best Params:', kNN_grid.best_params_)
print('k-NN best Score:', kNN_grid.best_score_)
kNN_pred = kNN_grid.predict(X_test)
print ('ROC AUC:', roc_auc_score(y_test, kNN_pred, average=None))
print ('Accuracy:', accuracy_score(y_test, kNN_pred))
print ('Confusion Matrix:\n', confusion_matrix(y_test, kNN_pred))
print ('Classification Report:\n', classification_report(y_test, kNN_pred))
k-NN best Params: {'metric': 'minkowski', 'n_neighbors': 15, 'weights': 'distance'} k-NN best Score: 0.5816168275146515 ROC AUC: 0.6828282828282828 Accuracy: 0.7012987012987013 Confusion Matrix: [[74 25] [21 34]] Classification Report: precision recall f1-score support 0 0.78 0.75 0.76 99 1 0.58 0.62 0.60 55 accuracy 0.70 154 macro avg 0.68 0.68 0.68 154 weighted avg 0.71 0.70 0.70 154
# SVM for original dataset
data_SVC = SVC()
data_SVC.fit(X_train, y_train)
svc_pred = data_SVC.predict(X_test)
print ('ROC AUC:', roc_auc_score(y_test, svc_pred, average=None))
print ('Accuracy:', accuracy_score(y_test, svc_pred))
print ('Confusion Matrix:\n', confusion_matrix(y_test, svc_pred))
print ('Classification Report:\n', classification_report(y_test, svc_pred))
ROC AUC: 0.7212121212121211 Accuracy: 0.7662337662337663 Confusion Matrix: [[87 12] [24 31]] Classification Report: precision recall f1-score support 0 0.78 0.88 0.83 99 1 0.72 0.56 0.63 55 accuracy 0.77 154 macro avg 0.75 0.72 0.73 154 weighted avg 0.76 0.77 0.76 154
svm_param_grid = {'C':[0.01, 0.1, 1, 10, 100], \
'kernel':['rbf'], 'gamma':[0.00001, 0.0001, 0.001, 0.01, 0.1, 1, 10]}
# do not use linear or poly for kernel, it will softlock your kernel
svm_cv = KFold(n_splits=5)
svm_grid = GridSearchCV(SVC(), svm_param_grid, cv=svm_cv, scoring = f2_score)
svm_grid.fit(X_train, y_train)
print('SVM best Params:', svm_grid.best_params_)
print('SVM best Score:', svm_grid.best_score_)
svm_pred = svm_grid.predict(X_test)
print ('ROC AUC:', roc_auc_score(y_test, svm_pred, average=None))
print ('Accuracy:', accuracy_score(y_test, svm_pred))
print ('Confusion Matrix:\n', confusion_matrix(y_test, svm_pred))
print ('Classification Report:\n', classification_report(y_test, svm_pred))
SVM best Params: {'C': 100, 'gamma': 0.0001, 'kernel': 'rbf'} SVM best Score: 0.5481925824111564 ROC AUC: 0.6818181818181819 Accuracy: 0.6948051948051948 Confusion Matrix: [[72 27] [20 35]] Classification Report: precision recall f1-score support 0 0.78 0.73 0.75 99 1 0.56 0.64 0.60 55 accuracy 0.69 154 macro avg 0.67 0.68 0.68 154 weighted avg 0.70 0.69 0.70 154
Overall, the original dataset performed worse for most algorithms. The random forest, however, produced the highest performing model so far, with roughly 65-67% recall for class 1 and 73-75% accuracy. While we expected random forest to handle missing data better than most algorithms, we did not expect an improvement in results.
Now we will repeat the same process with the dataframe that has the zeroes removed. Because this dataset is much smaller than before, we will reserve a smaller percentage of it as the test group. The results should also be taken with more caution, since with such a small test set the reported percentages can swing more from run to run.
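Because the classes are imbalanced and the dataframe is small, a stratified split (which preserves the outcome proportions in both the training and test sets) may be a safer alternative to the plain random split used below; this is a sketch of that variant, not what we use for the results that follow:

# Stratified variant of the split used below: preserves the 0/1 ratio
# in both the training and test sets
X_strat = remove_zero_data.loc[:, remove_zero_data.columns != 'Outcome']
y_strat = remove_zero_data.loc[:, 'Outcome'].values
X_tr, X_te, y_tr, y_te = train_test_split(X_strat, y_strat, test_size=0.15,
                                          random_state=42, stratify=y_strat)
print("Test set class balance:",
      pd.Series(y_te).value_counts(normalize=True).round(2).to_dict())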
# Initialize X and Y test and train groups
X = remove_zero_data.loc[:, remove_zero_data.columns != 'Outcome']
y = remove_zero_data.loc[:,'Outcome'].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=42)
# Random Forest Classifier for removed zeroes dataset
remove_zero_data_forest = RandomForestClassifier(random_state=0)
remove_zero_data_forest.fit(X_train, y_train)
rzf_pred = remove_zero_data_forest.predict(X_test)
print ('ROC AUC:', roc_auc_score(y_test, rzf_pred, average=None))
print ('Accuracy:', accuracy_score(y_test, rzf_pred))
print ('Confusion Matrix:\n', confusion_matrix(y_test, rzf_pred))
print ('Classification Report:\n', classification_report(y_test, rzf_pred))
ROC AUC: 0.7330827067669173 Accuracy: 0.7796610169491526 Confusion Matrix: [[34 4] [ 9 12]] Classification Report: precision recall f1-score support 0 0.79 0.89 0.84 38 1 0.75 0.57 0.65 21 accuracy 0.78 59 macro avg 0.77 0.73 0.74 59 weighted avg 0.78 0.78 0.77 59
The accuracy of this model is the highest across all datasets and models so far, although it is important to keep the much smaller size of the dataset in mind.
rzf_param_grid = {'max_samples':[0.1, 0.2, 0.3, 0.4],
'max_features': [1, 2],
'n_estimators':[5, 10, 50, 100],
'max_depth':[8, 9, 10]
}
rzf_cv = KFold(n_splits=5)
rzf_grid = GridSearchCV(RandomForestClassifier(), rzf_param_grid, cv=rzf_cv, scoring = f2_score)
rzf_grid.fit(X_train, y_train)
# This should take half a minute
print('Random Forest best Params:', rzf_grid.best_params_)
print('Random Forest best Score:', rzf_grid.best_score_)
rzf_pred = rzf_grid.predict(X_test)
print ('ROC AUC:', roc_auc_score(y_test, rzf_pred, average=None))
print ('Accuracy:', accuracy_score(y_test, rzf_pred))
print ('Confusion Matrix:\n', confusion_matrix(y_test, rzf_pred))
print ('Classification Report:\n', classification_report(y_test, rzf_pred))
Random Forest best Params: {'max_depth': 9, 'max_features': 2, 'max_samples': 0.4, 'n_estimators': 5} Random Forest best Score: 0.6024608091655231 ROC AUC: 0.7199248120300752 Accuracy: 0.7627118644067796 Confusion Matrix: [[33 5] [ 9 12]] Classification Report: precision recall f1-score support 0 0.79 0.87 0.82 38 1 0.71 0.57 0.63 21 accuracy 0.76 59 macro avg 0.75 0.72 0.73 59 weighted avg 0.76 0.76 0.76 59
Just as with the replaced-zeroes dataset, hyperparameter tuning has actually decreased the quality of our model. Random forest also does not appear to benefit from the preprocessed datasets.
# k-NN Classifier for removed zero dataset
remove_zero_kNN = KNeighborsClassifier()
remove_zero_kNN.fit(X_train, y_train)
rzk_pred = remove_zero_kNN.predict(X_test)
print ('ROC AUC:', roc_auc_score(y_test, rzk_pred, average=None))
print ('Accuracy:', accuracy_score(y_test, rzk_pred))
print ('Confusion Matrix:\n', confusion_matrix(y_test, rzk_pred))
print ('Classification Report:\n', classification_report(y_test, rzk_pred))
ROC AUC: 0.6672932330827068 Accuracy: 0.6949152542372882 Confusion Matrix: [[29 9] [ 9 12]] Classification Report: precision recall f1-score support 0 0.76 0.76 0.76 38 1 0.57 0.57 0.57 21 accuracy 0.69 59 macro avg 0.67 0.67 0.67 59 weighted avg 0.69 0.69 0.69 59
kNN_param_grid = {'n_neighbors':[2,5,10,15,20,30,50],
'weights':['uniform', 'distance'],
'metric':['minkowski','euclidean','manhattan']}
kNN_cv = KFold(n_splits=5)
kNN_grid = GridSearchCV(KNeighborsClassifier(), kNN_param_grid, cv=kNN_cv, scoring=f2_score)
kNN_grid.fit(X_train, y_train)
# This runs the smoothest
print('k-NN best Params:', kNN_grid.best_params_)
print('k-NN best Score:', kNN_grid.best_score_)
kNN_pred = kNN_grid.predict(X_test)
print ('ROC AUC:', roc_auc_score(y_test, kNN_pred, average=None))
print ('Accuracy:', accuracy_score(y_test, kNN_pred))
print ('Confusion Matrix:\n', confusion_matrix(y_test, kNN_pred))
print ('Classification Report:\n', classification_report(y_test, kNN_pred))
k-NN best Params: {'metric': 'minkowski', 'n_neighbors': 5, 'weights': 'uniform'} k-NN best Score: 0.5848404497630284 ROC AUC: 0.6672932330827068 Accuracy: 0.6949152542372882 Confusion Matrix: [[29 9] [ 9 12]] Classification Report: precision recall f1-score support 0 0.76 0.76 0.76 38 1 0.57 0.57 0.57 21 accuracy 0.69 59 macro avg 0.67 0.67 0.67 59 weighted avg 0.69 0.69 0.69 59
There was no change between the hyperparameter tuned k-NN and the default k-NN, possibly indicating that any further increase in class 1 recall would disproportionately hurt accuracy.
# SVM for removed zeroes dataset
data_SVC = SVC()
data_SVC.fit(X_train, y_train)
svc_pred = data_SVC.predict(X_test)
print ('ROC AUC:', roc_auc_score(y_test, svc_pred, average=None))
print ('Accuracy:', accuracy_score(y_test, svc_pred))
print ('Confusion Matrix:\n', confusion_matrix(y_test, svc_pred))
print ('Classification Report:\n', classification_report(y_test, svc_pred))
ROC AUC: 0.6879699248120301 Accuracy: 0.7627118644067796 Confusion Matrix: [[36 2] [12 9]] Classification Report: precision recall f1-score support 0 0.75 0.95 0.84 38 1 0.82 0.43 0.56 21 accuracy 0.76 59 macro avg 0.78 0.69 0.70 59 weighted avg 0.77 0.76 0.74 59
The recall for class 0 is the highest out of all datasets and models, but the recall for class 1 is the lowest out of all datasets and models.
svm_param_grid = {'C':[0.01, 0.1, 1, 10, 100], \
'kernel':['rbf'], 'gamma':[0.00001, 0.0001, 0.001, 0.01, 0.1, 1, 10]}
# do not use linear or poly for kernel, it will softlock your kernel
svm_cv = KFold(n_splits=5)
svm_grid = GridSearchCV(SVC(), svm_param_grid, cv=svm_cv, scoring=f2_score)
svm_grid.fit(X_train, y_train)
print('SVM best Params:', svm_grid.best_params_)
print('SVM best Score:', svm_grid.best_score_)
svm_pred = svm_grid.predict(X_test)
print ('ROC AUC:', roc_auc_score(y_test, svm_pred, average=None))
print ('Accuracy:', accuracy_score(y_test, svm_pred))
print ('Confusion Matrix:\n', confusion_matrix(y_test, svm_pred))
print ('Classification Report:\n', classification_report(y_test, svm_pred))
SVM best Params: {'C': 10, 'gamma': 1e-05, 'kernel': 'rbf'} SVM best Score: 0.5797285359397277 ROC AUC: 0.6854636591478697 Accuracy: 0.7457627118644068 Confusion Matrix: [[34 4] [11 10]] Classification Report: precision recall f1-score support 0 0.76 0.89 0.82 38 1 0.71 0.48 0.57 21 accuracy 0.75 59 macro avg 0.73 0.69 0.70 59 weighted avg 0.74 0.75 0.73 59
Removing the zero rows can also shift the balance of the class labels, which can lead to overfitting toward the majority class; that is, predictions become biased toward the class with more samples, which reduces the generalization ability of the model. While a few performance metrics are higher than for models trained on the other preprocessed datasets, the overall performance was worse, and the recall for class 1 is lower than for any other model.
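With only 392 rows, a single 15% test split (59 patients) also gives fairly noisy estimates; cross-validating over the whole removed-zeroes dataframe is one way to get a steadier read. A sketch using scikit-learn's cross_val_score, which we did not use elsewhere in this tutorial:

from sklearn.model_selection import cross_val_score, StratifiedKFold

# 5-fold stratified cross-validation of the baseline random forest on the
# removed-zeroes data (X and y as defined above), scored by class 1 recall
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
scores = cross_val_score(RandomForestClassifier(random_state=0), X, y,
                         cv=cv, scoring='recall')
print("Recall per fold:", scores.round(2))
print("Mean recall:", scores.mean().round(3))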
Our best performing model and dataset was the hyperparameter tuned random forest in the original dataset. While preprocessing is an important element in machine learning, removing or adjusting missing information could be detrimental to the quality of prediction models in certain situations and datasets.
In this tutorial, we developed Support Vector Machine, Random Forest, and kNN classification models for the diabetes dataset, and tuned the hyperparameters of each model to optimize its classification performance.
There isn’t a known cure for diabetes at this time, which highlights the importance of researching potential risk factors and diagnostics that relate to the disease. Being able to predict the diabetes outcome of patients in a dataset helps researchers better understand the disease by showing which features affect the outcome and to what extent. A model like this could also be adapted to flag warning signs in someone who is on the path toward developing diabetes by looking at their health measurements. We utilized three dataframes to represent the common ways data scientists modify (or don't modify) datasets that have many zero entries, and we wanted to explore how each could lead to similar or different results in hypothesis testing and model building. We felt this was important for learning the influence that modifications of the data can have on findings.
We created this tutorial to emulate the data science pipeline, which lets us take data, test and analyze it, and communicate the results in a way others can understand. We were able not only to take a dataset and build visualizations from it but also to make inferences and educated guesses about relationships between variables, supported by testing. Since this process is not linear, we revisited claims and earlier parts of the process to build on new findings, such as the correlation between the features and diabetes outcomes. Being able to use the data science pipeline is crucial to producing and observing replicable results that advance research. Although we do not have a completely accurate model for predicting diabetes outcome, we have gained many new insights and skills that will serve as a foundation for future projects.