Classifier Comparison: Rescale Data

Data transformation is part of the machine learning process. Different algorithms/classifiers make different assumptions about the raw data and may require different views of it. The code below uses the MinMaxScaler method from Scikit-learn. The comparison without a data transformation step can be found here: https://norasmadi.unimap.edu.my/2019/03/10/classifier-comparison/

# Run Algorithms for n-Times and Determine the Average Value based on Kfold Method
# Data Transformation Method: Rescale (MinMaxScaler)
from pandas import read_csv
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.utils import shuffle
from sklearn.preprocessing import MinMaxScaler

# Load the Pima Indians Diabetes dataset (8 numeric features + binary class).
url = "https://bit.ly/2GX9wC5"
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
dataframe = read_csv(url, names=names)

# For plotting
fig, ax = plt.subplots()

# Prepare models as (legend label, estimator) pairs so the model list,
# the collected results and the plot legend stay in sync automatically.
models = [
    ('Logistic Regression', LogisticRegression(solver='lbfgs', max_iter=1000)),
    ('Linear Discriminant Analysis', LinearDiscriminantAnalysis()),
    ('K-Nearest Neighbor', KNeighborsClassifier(n_neighbors=3)),
    ('Gaussian Naive Bayes', GaussianNB()),
    ('Decision Tree', DecisionTreeClassifier(max_depth=5)),
    ('Quadratic Discriminant Analysis', QuadraticDiscriminantAnalysis()),
]

# One list of mean CV accuracies (in %) per model, keyed by its label.
results = {label: [] for label, _ in models}

scoring = 'accuracy'
times = 10             # How many times to repeat the whole CV experiment

for _ in range(times):
    # Reshuffle the rows each repetition so every run sees different folds.
    dataframe = shuffle(dataframe)
    array = dataframe.values
    X = array[:, 0:8]
    Y = array[:, 8]
    # Rescale every feature into [0, 1]; scale-sensitive models such as
    # KNN and logistic regression benefit from a common feature range.
    scaler = MinMaxScaler(feature_range=(0, 1))
    rescaledX = scaler.fit_transform(X)
    kfold = KFold(n_splits=10)

    # Score every classifier on the same shuffled, rescaled data.
    for label, model in models:
        cv_results = cross_val_score(model, rescaledX, Y, cv=kfold, scoring=scoring)
        results[label].append(cv_results.mean() * 100)

# Plot the mean accuracy of each classifier across the repetitions.
t = np.arange(1, times + 1, 1)        # x-axis: repetition index 1..times
for label, _ in models:
    ax.plot(t, results[label], label=label)

ax.set(xlabel='times', ylabel='Accuracy (%)',
       title='Pima Indian Database')
ax.grid()
ax.set_xlim(1, times)
ax.set_ylim(60, 80)
ax.legend()
plt.show()

Result:

Classifier Comparison

The code below performs a comparison of six (6) classifiers using the Pima Indian Database. The accuracy of each classifier is determined using k-fold cross-validation (https://machinelearningmastery.com/k-fold-cross-validation/). The cross-validation is repeated n times.

# Run Algorithms for n-Times and Determine the Average Value based on
# Kfold Method
from pandas import read_csv
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.utils import shuffle

# Load the Pima Indians Diabetes dataset (8 numeric features + binary class).
url = "https://bit.ly/2GX9wC5"
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
dataframe = read_csv(url, names=names)

# For plotting
fig, ax = plt.subplots()

# Prepare models as (legend label, estimator) pairs so the model list,
# the collected results and the plot legend stay in sync automatically.
models = [
    ('Logistic Regression', LogisticRegression(solver='lbfgs', max_iter=1000)),
    ('Linear Discriminant Analysis', LinearDiscriminantAnalysis()),
    ('K-Nearest Neighbor', KNeighborsClassifier(n_neighbors=3)),
    ('Gaussian Naive Bayes', GaussianNB()),
    ('Decision Tree', DecisionTreeClassifier(max_depth=5)),
    ('Quadratic Discriminant Analysis', QuadraticDiscriminantAnalysis()),
]

# One list of mean CV accuracies (in %) per model, keyed by its label.
results = {label: [] for label, _ in models}

scoring = 'accuracy'
times = 10             # How many times to repeat the whole CV experiment

for _ in range(times):
    # Reshuffle the rows each repetition so every run sees different folds.
    dataframe = shuffle(dataframe)
    array = dataframe.values
    X = array[:, 0:8]
    Y = array[:, 8]
    kfold = KFold(n_splits=10)

    # Score every classifier on the same shuffled data.
    for label, model in models:
        cv_results = cross_val_score(model, X, Y, cv=kfold, scoring=scoring)
        results[label].append(cv_results.mean() * 100)

# Plot the mean accuracy of each classifier across the repetitions.
t = np.arange(1, times + 1, 1)        # x-axis: repetition index 1..times
for label, _ in models:
    ax.plot(t, results[label], label=label)

ax.set(xlabel='times', ylabel='Accuracy (%)',
       title='Pima Indian Database')
ax.grid()
ax.set_xlim(1, times)
ax.set_ylim(60, 80)
ax.legend()
plt.show()

Matplotlib

Matplotlib is a Python 2D plotting library which produces publication quality figures in a variety of hardcopy formats and interactive environments across platforms. Matplotlib can be used in Python scripts, the Python and IPython shells, the Jupyter notebook, web application servers, and four graphical user interface toolkits.

Matplotlib tries to make easy things easy and hard things possible. You can generate plots, histograms, power spectra, bar charts, errorcharts, scatterplots, etc., with just a few lines of code.
For examples, see the sample plots and thumbnail gallery.

Website: https://matplotlib.org/

import matplotlib
import matplotlib.pyplot as plt
import numpy as np

# Data for plotting: two periods of a 1 Hz sine, offset by +1,
# sampled on [0, 2) seconds at 0.01 s steps.
xs = np.arange(0.0, 2.0, 0.01)
ys = 1 + np.sin(2 * np.pi * xs)

figure, axes = plt.subplots()
axes.plot(xs, ys)

axes.set(xlabel='time (s)', ylabel='voltage (mV)',
         title='About as simple as it gets, folks')
axes.grid()

# Save the figure to disk, then display it interactively.
figure.savefig("test.png")
plt.show()

# The code from here >> https://matplotlib.org/gallery/lines_bars_and_markers/simple_plot.html#sphx-glr-gallery-lines-bars-and-markers-simple-plot-py

Result:

Load Raw_Data using Python Standard Library

Load the raw data using the Python standard library. Then the data is converted into an array using NumPy.

# Load CSV Using Python Standard Library
import csv
import numpy
filename = 'pima-indians-diabetes.data.csv'
# Use a context manager so the file handle is always closed,
# even if parsing fails part-way through.
with open(filename, 'r') as raw_data:
    reader = csv.reader(raw_data, delimiter=',', quoting=csv.QUOTE_NONE)
    x = list(reader)
# Convert the list of string rows into a numeric NumPy array.
data = numpy.array(x).astype('float')
print(data.shape)   # expected (768, 9) for the Pima dataset
Data array using NumPy

Load Raw_Data using Pandas

This source code loads the data from a .csv file. However, you need Pandas. Pandas is an open-source, BSD-licensed library providing high-performance, easy-to-use data structures and data analysis tools for the Python programming language. The expected output of the source code is (768, 9).

# Load CSV using Pandas
from pandas import read_csv
# Column labels for the Pima Indians Diabetes dataset.
column_names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
dataset = read_csv('pima-indians-diabetes.data.csv', names=column_names)
print(dataset.shape)

The details about the Pima Dataset can be found here >> https://www.kaggle.com/uciml/pima-indians-diabetes-database

or we can load the data from a URL

# Load CSV using Pandas from URL
from pandas import read_csv
# Column labels for the Pima Indians Diabetes dataset.
column_names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
data = read_csv('https://bit.ly/2GX9wC5', names=column_names)
print(data.shape)

Visual Studio Code

Visual Studio Code is a lightweight but powerful source code editor which runs on your desktop and is available for Windows, macOS and Linux. It comes with built-in support for JavaScript, TypeScript and Node.js and has a rich ecosystem of extensions for other languages (such as C++, C#, Java, Python, PHP, Go) and runtimes (such as .NET and Unity). Begin your journey with VS Code with these introductory videos.

Website: https://code.visualstudio.com/

Getting Started with Python in VS Code >> https://code.visualstudio.com/docs/python/python-tutorial

Load Raw_Data using NumPy

This source code loads the data from a .csv file. However, you need NumPy. NumPy is the fundamental package for array computing with Python. The expected output of the source code is (768, 9).

import csv
import numpy
filename = 'pima-indians-diabetes.data.csv'
# Use a context manager so the file handle is always closed,
# even if parsing fails part-way through.
with open(filename, 'r') as raw_data:
    reader = csv.reader(raw_data, delimiter=',', quoting=csv.QUOTE_NONE)
    x = list(reader)
# Convert the list of string rows into a numeric NumPy array.
dataset = numpy.array(x).astype('float')
print(dataset.shape)   # expected (768, 9) for the Pima dataset

The details about the Pima Dataset can be found here >> https://www.kaggle.com/uciml/pima-indians-diabetes-database

or we can load the data from a URL

# Load CSV from URL using NumPy
from numpy import loadtxt
import urllib.request
# Use a context manager so the HTTP response is closed after reading;
# loadtxt can consume the response object directly as a file-like stream.
with urllib.request.urlopen('https://bit.ly/2GX9wC5') as raw_data:
    dataset = loadtxt(raw_data, delimiter=",")
print(dataset.shape)   # expected (768, 9) for the Pima dataset