# kNN Classifier implementation – Load the data
# Load the necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
plt.style.use('ggplot')
The diabetes data set originates from the UCI Machine Learning Repository and can be downloaded from https://github.com/susanli2016/Machine-Learning-with-Python/blob/master/diabetes.csv.
# load the dataset (adjust the path to wherever you saved diabetes.csv)
data = pd.read_csv("C:/Users/skmun/Desktop/BITS/datasets-master/datasets-master/diabetes.csv")
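Alternatively, pandas can read the file straight over HTTPS; the raw-file form of the GitHub link above should work (assuming the file is still hosted there):
# alternative: load the data directly from the raw GitHub URL
url = "https://raw.githubusercontent.com/susanli2016/Machine-Learning-with-Python/master/diabetes.csv"
data = pd.read_csv(url)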
# print the first 6 rows of the dataframe
data.head(6)
# check the column data types
data.dtypes
The diabetes data set consists of 768 data points with 9 columns each (8 predictor features plus the Outcome label):
# shape of the dataframe
data.shape
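To see which nine columns these are:
# list the column names (8 predictors plus the Outcome label)
print(list(data.columns))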
# split into the feature matrix X and the target vector y
X = data.drop('Outcome', axis=1).values
y = data['Outcome'].values
X.shape
y.shape
“Outcome” is the feature we are going to predict: 0 means no diabetes, 1 means diabetes. Of these 768 data points, 500 are labeled 0 and 268 are labeled 1:
print(data.groupby('Outcome').size())
import seaborn as sns
# bar plot of the class counts
sns.countplot(x='Outcome', data=data)
from sklearn.model_selection import train_test_split
# hold out 40% of the data for testing, stratified to preserve the class ratio
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=42, stratify=y)
X_train.shape
from sklearn.neighbors import KNeighborsClassifier
training_accuracy = []
test_accuracy = []
# try n_neighbors from 1 to 16
neighbors_settings = range(1, 17)
for n_neighbors in neighbors_settings:
    # build the model
    knn = KNeighborsClassifier(n_neighbors=n_neighbors)
    knn.fit(X_train, y_train)
    # record training set accuracy
    training_accuracy.append(knn.score(X_train, y_train))
    # record test set accuracy
    test_accuracy.append(knn.score(X_test, y_test))
# kNN Classifier implementation – Plot the accuracy
plt.plot(neighbors_settings, training_accuracy, label="training accuracy")
plt.plot(neighbors_settings, test_accuracy, label="test accuracy")
plt.ylabel("Accuracy")
plt.xlabel("n_neighbors")
plt.legend()
plt.savefig('knn_compare_model')
The above plot shows the training and test set accuracy on the y-axis against the setting of n_neighbors on the x-axis. If we choose a single nearest neighbor, the prediction on the training set is perfect, but as more neighbors are considered the training accuracy drops, indicating that the single-nearest-neighbor model is too complex. The best test performance occurs at around 9 neighbors, so the plot suggests choosing n_neighbors=9. Here we are:
knn = KNeighborsClassifier(n_neighbors=9)
knn.fit(X_train, y_train)
# test and training accuracy of the chosen model
knn.score(X_test, y_test)
knn.score(X_train, y_train)
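One caveat not covered above: kNN works on raw Euclidean distances, so features with large numeric ranges can dominate the distance computation. Below is a minimal sketch of standardizing the features first, using a scikit-learn Pipeline with StandardScaler (an addition for illustration, not part of the original workflow):
# optional: put every feature on a comparable scale before computing distances
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier

scaled_knn = Pipeline([
    ('scaler', StandardScaler()),
    ('knn', KNeighborsClassifier(n_neighbors=9)),
])
scaled_knn.fit(X_train, y_train)
scaled_knn.score(X_test, y_test)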
y_pred = knn.predict(X_test)
# confusion matrix as a cross-tabulation of true vs. predicted labels
pd.crosstab(y_test, y_pred, rownames=['True'], colnames=['Predicted'], margins=True)
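Equivalently, scikit-learn's confusion_matrix returns the same counts as an array, with rows as true labels and columns as predicted labels:
from sklearn.metrics import confusion_matrix
print(confusion_matrix(y_test, y_pred))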
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))
y_pred_prob = knn.predict_proba(X_test)[:, 1]
from sklearn.metrics import roc_curve
# false positive rate, true positive rate, and thresholds along the curve
fpr, tpr, thresholds = roc_curve(y_test, y_pred_prob)
plt.plot([0, 1], [0, 1], 'k--')
plt.plot(fpr, tpr, label='KNN')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('KNN ROC curve')
plt.legend()
plt.show()
from sklearn.metrics import roc_auc_score
# area under the ROC curve
roc_auc_score(y_test, y_pred_prob)
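Because a single train/test split can be noisy, a more stable AUC estimate can be obtained with cross-validation (a sketch using cross_val_score; not in the original code):
from sklearn.model_selection import cross_val_score
# 5-fold cross-validated AUC for the k=9 model
cv_auc = cross_val_score(knn, X, y, cv=5, scoring='roc_auc')
print(cv_auc.mean())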
from sklearn.model_selection import GridSearchCV
# search n_neighbors from 1 to 49 with 5-fold cross-validation
param_grid = {'n_neighbors': np.arange(1, 50)}
knn = KNeighborsClassifier()
knn_cv = GridSearchCV(knn, param_grid, cv=5)
knn_cv.fit(X, y)
knn_cv.best_score_
knn_cv.best_params_
# retrain with the best n_neighbors found by the grid search
knn = KNeighborsClassifier(n_neighbors=14)
knn.fit(X_train, y_train)
print('Accuracy of K-NN classifier on training set: {:.2f}'.format(knn.score(X_train, y_train)))
print('Accuracy of K-NN classifier on test set: {:.2f}'.format(knn.score(X_test, y_test)))