Classifier Comparison

The code below compares six (6) classifiers on the Pima Indians Diabetes dataset. The accuracy of each classifier is estimated with k-fold cross-validation (https://machinelearningmastery.com/k-fold-cross-validation/), and the whole procedure is repeated n times with the data reshuffled on each repeat, so the plot shows one mean accuracy per classifier per repeat.

# Run the algorithms n times and record the average accuracy of each
# repeat, using k-fold cross-validation
from pandas import read_csv
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.utils import shuffle

# load dataset
url = "https://bit.ly/2GX9wC5"
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
dataframe = read_csv(url, names=names)
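
# Quick sanity check: the Pima Indians Diabetes data should load as
# 768 rows x 9 columns (8 features plus the class label)
print(dataframe.shape)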

# For plotting
fig, ax = plt.subplots()

# prepare models
model1 = LogisticRegression(solver='lbfgs', max_iter=1000)  # LR
model2 = LinearDiscriminantAnalysis()                       # LDA
model3 = KNeighborsClassifier(n_neighbors=3)                # KNN
model4 = GaussianNB()                                       # GNB
model5 = DecisionTreeClassifier(max_depth=5)                # DT
model6 = QuadraticDiscriminantAnalysis()                    # QDA

# evaluate models (one result list per classifier)
resultslr = []
resultslda = []
resultsknn = []
resultsgnb = []
resultsdt = []
resultsqda = []

scoring = 'accuracy'
times = 10             # How many times to repeat

for x in range(times):
    # shuffle the data for each repeat
    dataframe = shuffle(dataframe)
    array = dataframe.values
    X = array[:, 0:8]
    Y = array[:, 8]
    kfold = KFold(n_splits=10)  # no shuffle needed; the data is reshuffled above

    # Logistic Regression
    cv_results = cross_val_score(model1, X, Y, cv=kfold, scoring=scoring)
    resultslr.append(cv_results.mean()*100)

    # Linear Discriminant Analysis
    cv_results = cross_val_score(model2, X, Y, cv=kfold, scoring=scoring)
    resultslda.append(cv_results.mean()*100)

    # K-Nearest Neighbor
    cv_results = cross_val_score(model3, X, Y, cv=kfold, scoring=scoring)
    resultsknn.append(cv_results.mean()*100)

    # Gaussian Naive Bayes
    cv_results = cross_val_score(model4, X, Y, cv=kfold, scoring=scoring)
    resultsgnb.append(cv_results.mean()*100)

    # Decision Tree
    cv_results = cross_val_score(model5, X, Y, cv=kfold, scoring=scoring)
    resultsdt.append(cv_results.mean()*100)

    # Quadratic Discriminant Analysis
    cv_results = cross_val_score(model6, X, Y, cv=kfold, scoring=scoring)
    resultsqda.append(cv_results.mean()*100)
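
# Optional: report each classifier's overall mean accuracy across repeats
for name, res in [('LR', resultslr), ('LDA', resultslda), ('KNN', resultsknn),
                  ('GNB', resultsgnb), ('DT', resultsdt), ('QDA', resultsqda)]:
    print('%s: %.2f%% (+/- %.2f%%)' % (name, np.mean(res), np.std(res)))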

# Plot the result
t = np.arange(1, times + 1)    # x-axis: repeat index 1..times
ax.plot(t, resultslr, t, resultslda, t, resultsknn,
        t, resultsgnb, t, resultsdt, t, resultsqda)

ax.set(xlabel='times', ylabel='Accuracy (%)',
       title='Pima Indians Diabetes Database')
ax.grid()
ax.set_xlim(1, times)
ax.set_ylim(60, 80)
ax.legend(['Logistic Regression', 'Linear Discriminant Analysis',
           'K-Nearest Neighbor', 'Gaussian Naive Bayes',
           'Decision Tree', 'Quadratic Discriminant Analysis'])
plt.show()
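
If only the overall average matters, rather than the per-repeat curve plotted above, scikit-learn's RepeatedKFold can replace the manual shuffle loop, since it reshuffles the data before each repeat internally. A minimal sketch, reusing X, Y, model1, scoring and times from the script above (the random_state value is arbitrary):

from sklearn.model_selection import RepeatedKFold

# 10-fold cross-validation repeated `times` times, one shuffle per repeat
rkf = RepeatedKFold(n_splits=10, n_repeats=times, random_state=7)
scores = cross_val_score(model1, X, Y, cv=rkf, scoring=scoring)
print('LR: %.2f%% (+/- %.2f%%)' % (scores.mean()*100, scores.std()*100))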