Data
# Run Algorithms for n-Times and Determine the Average Value based on Kfold Method
# Data Transformation Method: Rescale
#
# For each of `times` repetitions the data is shuffled, six classifiers are
# scored with 10-fold cross-validation, and the mean fold accuracy (in percent)
# is recorded. The per-repetition means are then plotted, one line per model.
from pandas import read_csv
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.utils import shuffle
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import make_pipeline

# load dataset
url = "https://bit.ly/2GX9wC5"
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
dataframe = read_csv(url, names=names)

# For plotting
fig, ax = plt.subplots()


def _rescaled(classifier):
    """Return *classifier* preceded by a [0, 1] min-max rescaling step.

    Keeping the scaler inside the pipeline means cross_val_score fits it on
    the training folds only, so the held-out fold never influences the
    scaling parameters (no train/test leakage).
    """
    return make_pipeline(MinMaxScaler(feature_range=(0, 1)), classifier)


# prepare models
# Insertion order is the evaluation order and the plot/legend order.
models = {
    'Logistic Regression': _rescaled(
        LogisticRegression(solver='lbfgs', max_iter=1000)),
    'Linear Discriminant Analysis': _rescaled(LinearDiscriminantAnalysis()),
    'K-Nearest Neighbor': _rescaled(KNeighborsClassifier(n_neighbors=3)),
    'Gaussian Naive Bayes': _rescaled(GaussianNB()),
    'Decision Tree': _rescaled(DecisionTreeClassifier(max_depth=5)),
    'Quadratic Discriminant Analysis': _rescaled(
        QuadraticDiscriminantAnalysis()),
}

# evaluate model
# One list per model; each entry is the mean CV accuracy (%) of one repetition.
results = {name: [] for name in models}
scoring = 'accuracy'
times = 10  # How many times to repeat

for _ in range(times):
    # shuffle the data for each repetition so the (unshuffled) KFold splits
    # differ from one repetition to the next
    dataframe = shuffle(dataframe)
    array = dataframe.values
    X = array[:, 0:8]  # first eight columns: the features
    Y = array[:, 8]    # ninth column: 'class'

    kfold = KFold(n_splits=10)
    for name, model in models.items():
        # cross_val_score clones the estimator per fold, so the pipelines can
        # be reused across repetitions without carrying fitted state over.
        cv_results = cross_val_score(model, X, Y, cv=kfold, scoring=scoring)
        results[name].append(cv_results.mean() * 100)

# Plot the result
t = np.arange(1, times + 1, 1)  # to plot from 1 to n-Times
for name, scores in results.items():
    ax.plot(t, scores, label=name)
ax.set(xlabel='times', ylabel='Accuracy (%)', title='Pima Indian Database')
ax.grid()
ax.set_xlim(1, times)
ax.set_ylim(60, 80)
ax.legend()
plt.show()
Result:
