Classifier Comparison: Rescale Data

Data transformation is part of the machine learning process. Different algorithms/classifiers make different assumptions about the raw data and may require different views of it. The code below uses the MinMaxScaler class from scikit-learn. The comparison without any data transformation can be found here: https://norasmadi.unimap.edu.my/2019/03/10/classifier-comparison/
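As a quick illustration of what the rescaling does, here is a minimal sketch on a toy array (the values are invented for this example); each column is mapped independently onto [0, 1] via x' = (x - min) / (max - min):

# Minimal sketch: MinMaxScaler on a toy array (values invented for illustration)
import numpy as np
from sklearn.preprocessing import MinMaxScaler

toy = np.array([[1.0, 200.0],
                [2.0, 400.0],
                [3.0, 600.0]])
scaler = MinMaxScaler(feature_range=(0, 1))
print(scaler.fit_transform(toy))
# Each column is rescaled independently:
# [[0.  0. ]
#  [0.5 0.5]
#  [1.  1. ]]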

# Run Algorithms for n-Times and Determine the Average Value based on Kfold Method
# Data Transformation Method: Rescale
from pandas import read_csv
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.utils import shuffle
from sklearn.preprocessing import MinMaxScaler

# load dataset
url = "https://bit.ly/2GX9wC5"
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
dataframe = read_csv(url, names=names)

# For plotting
fig, ax = plt.subplots()

# prepare models
model1 = LogisticRegression(solver='lbfgs', max_iter=1000)  # LR
model2 = LinearDiscriminantAnalysis()  # LDA
model3 = KNeighborsClassifier(n_neighbors=3)  # KNN
model4 = GaussianNB()  # GNB
model5 = DecisionTreeClassifier(max_depth=5)  # DT
model6 = QuadraticDiscriminantAnalysis()  # QDA

# evaluate model
resultslr = []
resultslda = []
resultsknn = []
resultsgnb = []
resultsdt = []
resultsqda = []

scoring = 'accuracy'
times = 10             # How many times to repeat

for x in range(times):
    # shuffle the data on each run
    dataframe = shuffle(dataframe)
    array = dataframe.values
    X = array[:,0:8]
    Y = array[:,8]
    # Rescale Data
    scaler = MinMaxScaler(feature_range=(0,1))
    rescaledX = scaler.fit_transform(X)
    kfold = KFold(n_splits=10)
  
    # Logistic Regression
    cv_results = cross_val_score(model1, rescaledX, Y, cv=kfold, scoring=scoring)
    resultslr.append(cv_results.mean()*100)
    # Linear Discriminant Analysis
    cv_results = cross_val_score(model2, rescaledX, Y, cv=kfold, scoring=scoring)
    resultslda.append(cv_results.mean()*100)

    # K-Nearest Neighbor
    cv_results = cross_val_score(model3, rescaledX, Y, cv=kfold, scoring=scoring)
    resultsknn.append(cv_results.mean()*100)

    # Gaussian Naive Bayes
    cv_results = cross_val_score(model4, rescaledX, Y, cv=kfold, scoring=scoring)
    resultsgnb.append(cv_results.mean()*100)

    # Decision Tree
    cv_results = cross_val_score(model5, rescaledX, Y, cv=kfold, scoring=scoring)
    resultsdt.append(cv_results.mean()*100)

    # Quadratic Discriminant Analysis
    cv_results = cross_val_score(model6, rescaledX, Y, cv=kfold, scoring=scoring)
    resultsqda.append(cv_results.mean()*100)

# Plot the result
t = np.arange(1, times+1, 1)        # to plot from 1 to n-Times
ax.plot(t,resultslr,t,resultslda,t,resultsknn,t,resultsgnb,t,resultsdt,t,resultsqda)

ax.set(xlabel='times', ylabel='Accuracy (%)',
       title='Pima Indian Database')
ax.grid()
ax.set_xlim(1, times)
ax.set_ylim(60, 80)
ax.legend(['Logistic Regression','Linear Discriminant Analysis','K-Nearest Neighbor','Gaussian Naive Bayes','Decision Tree','Quadratic Discriminant Analysis'])
plt.show()

Result: a line plot of each classifier's mean k-fold accuracy across the 10 runs.

Classifier Comparison

The code below compares six (6) classifiers on the Pima Indians Diabetes dataset. The accuracy of each classifier is estimated using k-fold cross-validation (https://machinelearningmastery.com/k-fold-cross-validation/), and the whole procedure is repeated n times.
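As a minimal sketch of how KFold partitions the rows (a toy array with 8 samples and 4 splits, chosen only for brevity):

# Minimal sketch of k-fold splitting on toy data (8 rows, 4 folds for brevity)
import numpy as np
from sklearn.model_selection import KFold

X = np.arange(8).reshape(8, 1)
kfold = KFold(n_splits=4)
for train_idx, test_idx in kfold.split(X):
    print("train:", train_idx, "test:", test_idx)
# Every row lands in exactly one test fold; cross_val_score fits the model
# on each train fold and reports the accuracy on the matching test fold.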

# Run Algorithms for n-Times and Determine the Average Value based on
# Kfold Method
from pandas import read_csv
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.utils import shuffle

# load dataset
url = "https://bit.ly/2GX9wC5"
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
dataframe = read_csv(url, names=names)

# For plotting
fig, ax = plt.subplots()

# prepare models
model1 = LogisticRegression(solver='lbfgs', max_iter=1000)  # LR
model2 = LinearDiscriminantAnalysis()  # LDA
model3 = KNeighborsClassifier(n_neighbors=3)  # KNN
model4 = GaussianNB()  # GNB
model5 = DecisionTreeClassifier(max_depth=5)  # DT
model6 = QuadraticDiscriminantAnalysis()  # QDA

# evaluate model
resultslr = []
resultslda = []
resultsknn = []
resultsgnb = []
resultsdt = []
resultsqda = []

scoring = 'accuracy'
times = 10             # How many times to repeat

for x in range(times):
    # shuffle the data on each run
    dataframe = shuffle(dataframe)
    array = dataframe.values
    X = array[:,0:8]
    Y = array[:,8]
    kfold = KFold(n_splits=10)
  
    # Logistic Regression
    cv_results = cross_val_score(model1, X, Y, cv=kfold, scoring=scoring)
    resultslr.append(cv_results.mean()*100)
    # Linear Discriminant Analysis
    cv_results = cross_val_score(model2, X, Y, cv=kfold, scoring=scoring)
    resultslda.append(cv_results.mean()*100)

    # K-Nearest Neighbor
    cv_results = cross_val_score(model3, X, Y, cv=kfold, scoring=scoring)
    resultsknn.append(cv_results.mean()*100)

    # Gaussian Naive Bayes
    cv_results = cross_val_score(model4, X, Y, cv=kfold, scoring=scoring)
    resultsgnb.append(cv_results.mean()*100)

    # Decision Tree
    cv_results = cross_val_score(model5, X, Y, cv=kfold, scoring=scoring)
    resultsdt.append(cv_results.mean()*100)

    # Quadratic Discriminant Analysis
    cv_results = cross_val_score(model6, X, Y, cv=kfold, scoring=scoring)
    resultsqda.append(cv_results.mean()*100)

# Plot the result
t = np.arange(1, times+1, 1)        # to plot from 1 to n-Times
ax.plot(t,resultslr,t,resultslda,t,resultsknn,t,resultsgnb,t,resultsdt,t,resultsqda)

ax.set(xlabel='times', ylabel='Accuracy (%)',
       title='Pima Indian Database')
ax.grid()
ax.set_xlim(1, times)
ax.set_ylim(60, 80)
ax.legend(['Logistic Regression','Linear Discriminant Analysis','K-Nearest Neighbor','Gaussian Naive Bayes','Decision Tree','Quadratic Discriminant Analysis'])
plt.show()

Load Raw_Data using Python Standard Library

Load the raw data using the Python standard library's csv module, then build the data array with NumPy.

# Load CSV Using Python Standard Library
import csv
import numpy
filename = 'pima-indians-diabetes.data.csv'
raw_data = open(filename, 'r')
reader = csv.reader(raw_data, delimiter=',', quoting=csv.QUOTE_NONE)
x = list(reader)
data = numpy.array(x).astype('float')
print(data.shape)
(Figure: the data array loaded with NumPy.)

Load Raw_Data using NumPy

This source code loads the data from a .csv file. It requires NumPy, the fundamental package for array computing with Python. The expected output of the print statement is (768, 9).

# Load CSV using NumPy
from numpy import loadtxt
filename = 'pima-indians-diabetes.data.csv'
Raw_Data = open(filename, 'r')
dataset = loadtxt(Raw_Data, delimiter=",")
print(dataset.shape)

The details about the Pima dataset can be found here: https://www.kaggle.com/uciml/pima-indians-diabetes-database
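For a quick first look at the data, a minimal sketch of printing summary statistics with pandas, reusing the column names defined earlier in this post (assuming the same local file):

# Quick summary statistics with pandas (column names as used earlier)
from pandas import read_csv
filename = 'pima-indians-diabetes.data.csv'
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
dataframe = read_csv(filename, names=names)
print(dataframe.describe())  # count, mean, std, min, quartiles, max per column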

Alternatively, we can load the data directly from a URL:

# Load CSV from URL using NumPy
from numpy import loadtxt
import urllib.request
raw_data = urllib.request.urlopen('https://bit.ly/2GX9wC5')
dataset = loadtxt(raw_data, delimiter=",")
print(dataset.shape)
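For completeness, the same URL can also be read with pandas, as the classifier-comparison code above already does; a minimal sketch:

# Load CSV from URL using pandas (same URL and column names as above)
from pandas import read_csv
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
dataframe = read_csv('https://bit.ly/2GX9wC5', names=names)
print(dataframe.shape)  # expected output: (768, 9)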