Cyber&Data: Classification Metrics and ML. This provides a foundation in classification metrics. Objectives
ContentCode samples
CodingExample Code 1: [Here]: # https://asecuritysite.com/bigdata/roc from sklearn import metrics import matplotlib.pyplot as plt def show_roc(FPR, TPR, AUC): plt.plot(FPR, TPR, color='blue', label='ROC') plt.plot([0, 1], [0, 1], color='darkblue', linestyle='--') plt.xlabel('FPR') plt.ylabel('TPR') plt.title('Receiver Operating Characteristic (ROC) Curve') plt.legend(["AUC=%.3f" % AUC]) plt.show() y = ['Eve', 'Eve', 'Eve', 'Eve','Eve','Bob','Bob', 'Bob','Bob','Bob'] scores = [20,25,16,42,22,50,41,60,54,39] positive_label = 'Bob' fpr, tpr, thresholds = metrics.roc_curve(y, scores, pos_label=positive_label) auc=metrics.auc(fpr, tpr) print ("FPR:",fpr) print ("TPR:",tpr) print ("Thresholds:",thresholds) show_roc(fpr, tpr,auc) Example Code 2 [Here] # https://asecuritysite.com/bigdata/sk01 import sys import matplotlib.pyplot as plt ga=0.011 if (len(sys.argv)>1): file=str(sys.argv[1]) if (len(sys.argv)>2): ga=float(sys.argv[2]) from sklearn import datasets, svm, metrics digits = datasets.load_digits() images_and_labels = list(zip(digits.images, digits.target)) for index, (image, label) in enumerate(images_and_labels[:10]): plt.subplot(2, 10, (index + 1)) plt.axis('off') plt.imshow(image, cmap=plt.cm.gray_r, interpolation='nearest') plt.title('Tr: %i' % label) # To apply a classifier on this data, we need to flatten the image, to # turn the data in a (samples, feature) matrix: n_samples = len(digits.images) data = digits.images.reshape((n_samples, -1)) # Create a classifier: a support vector classifier classifier = svm.SVC(gamma=ga) # We learn the digits on the first half of the digits classifier.fit(data[:n_samples // 2], digits.target[:n_samples // 2]) # Now predict the value of the digit on the second half: expected = digits.target[n_samples // 2:] predicted = classifier.predict(data[n_samples // 2:]) print("Classification report for classifier %s:\n%s\n" % (classifier, metrics.classification_report(expected, predicted))) print("Confusion matrix:\n%s" % 
metrics.confusion_matrix(expected, predicted)) images_and_predictions = list(zip(digits.images[n_samples // 2:], predicted)) for index, (image, prediction) in enumerate(images_and_predictions[:4]): plt.subplot(2, 4, index + 5) plt.axis('off') plt.imshow(image, cmap=plt.cm.gray_r, interpolation='nearest') plt.title('Prediction: %i' % prediction) plt.show() Example Code 3: [Here] # This code creates a data set with two clusters (defined by the two features. The output is then data_vals[:, 0] and data_vals[:, 1] and these are marked by class_lab from sklearn.datasets import make_classification import matplotlib.pyplot as plt data_vals, class_label =make_classification(n_samples=100,n_features=2, n_redundant=0, n_informative=1,n_clusters_per_class=1) plt.scatter(data_vals[:, 0], data_vals[:, 1], marker='o', c=class_label,s=25, edgecolor='k') plt.savefig('test.png') plt.show() Example Code 4: [Here] # Create ROC from sklearn.datasets import make_classification from sklearn.model_selection import train_test_split from sklearn.ensemble import RandomForestClassifier from sklearn.metrics import roc_auc_score from sklearn.metrics import roc_curve import matplotlib.pyplot as plt def show_roc(FPR, TPR, AUC): plt.plot(FPR, TPR, color='blue', label='ROC') plt.plot([0, 1], [0, 1], color='darkblue', linestyle='--') plt.xlabel('FPR') plt.ylabel('TPR') plt.title('Receiver Operating Characteristic (ROC) Curve') plt.legend(["AUC=%.3f" % AUC]) plt.show() data_vals, class_label =make_classification(n_samples=100,n_features=2, n_redundant=0, n_informative=1,n_clusters_per_class=1) X_train, X_test, y_train, y_test = train_test_split(data_vals, class_label, test_size=0.3, random_state=1) # RandomForestClassifier, BaggingClassifier, ExtraTreesClassifier, AdaBoostClassifier, and GradientBoostingRegressor. 
model = RandomForestClassifier() model.fit(X_train, y_train) y_predict = model.predict_proba(X_test) print ("Model score: ",model.score(X_test, y_test)) # probablities of getting a 1 y_predict = y_predict[:, 1] auc = roc_auc_score(y_test, y_predict) FPR, TPR, thresholds = roc_curve(y_test, y_predict) print ("Thresholds: ",thresholds) print ("FPR: ",FPR) print ("TPR: ",TPR) show_roc(FPR, TPR,auc) plt.scatter(data_vals[:, 0], data_vals[:, 1], marker='o', c=class_label,s=25, edgecolor='k') plt.savefig('test.png') Code sample 5: [Here] from sklearn.metrics import r2_score, mean_squared_error,max_error,mean_squared_log_error bob_login = [48, 12, 7, 11,43,44] bob_predicted= [41, 14, 9, 15,40,41] print ("R^2 score: ",r2_score(bob_login, bob_predicted)) print ("RMSE score: ",mean_squared_error(bob_login, bob_predicted)) print ("Mean squared error: ",mean_squared_log_error(bob_login, bob_predicted)) print ("Max error: ",max_error(bob_login, bob_predicted)) Code 6 (Numeric Prediction): [Here] import pandas as pd from sklearn.model_selection import train_test_split from sklearn.ensemble import RandomForestRegressor from sklearn.metrics import r2_score,mean_squared_error # Features x1= "blood_pressure" x2= "age" # Prediction x3 = "BMI" fdata="diabetes.csv" print ("Training data:\t\t",x1,",",x2) print ("Training against:\t",x3) print ("Data set:\t\t",fdata) ver=pd.read_csv(fdata) dataset=ver[[x1,x2]] train=ver[x3] print (dataset) x_train, x_test, y_train, y_test= train_test_split(dataset,train,test_size=0.3, random_state=1) model= RandomForestRegressor() model.fit(x_train,y_train) y_predictions =model.predict(x_test) accuracy = r2_score(y_test, y_predictions) mse = mean_squared_error(y_test, y_predictions) print ("R^2=",accuracy) print ("MSE=",mse) Code 7 (Cluster prediction and metrics): [Here] # Cluster Prediction import pandas as pd from sklearn.model_selection import train_test_split from sklearn.cluster import KMeans from sklearn.metrics import confusion_matrix, roc_curve, 
auc # Features x1= "blood_pressure" x2= "age" # Prediction x3 = "response" fdata="diabetes.csv" print ("Training data:\t\t",x1,",",x2) print ("Training against:\t",x3) print ("Data set:\t\t",fdata) ver=pd.read_csv(fdata) dataset=ver[[x1,x2]] train=ver[x3] print (dataset) x_train, x_test, y_train, y_test= train_test_split(dataset,train,test_size=0.3, random_state=1) model= KMeans(n_clusters=2, random_state=0) model.fit(x_train,y_train) y_predictions =model.predict(x_test) conf=confusion_matrix(y_test,y_predictions) print (conf) fpr, tpr, thresholds = roc_curve(y_test,y_predictions) auc=auc(fpr, tpr) print ("FPR:",fpr) print ("TPR:",tpr) print ("Thresholds:",thresholds) print ("AUC: ",auc) Tutorial 1. We want to differentiate Eve from Bob. In monitoring Eve's accesses to email on a daily basis we find daily accesses of 20, 25, 16, 42 and 22, and then monitor Bob's accesses as: 50, 41, 60, 54 and 39. With the following we aim to detect Bob from Eve, and plot the ROC Curve. Use the following code to determine the ROC curve and the AUC value: https://repl.it/@billbuchanan/class01 Listing: # https://asecuritysite.com/bigdata/roc from sklearn import metrics import matplotlib.pyplot as plt def show_roc(FPR, TPR, AUC): plt.plot(FPR, TPR, color='blue', label='ROC') plt.plot([0, 1], [0, 1], color='darkblue', linestyle='--') plt.xlabel('FPR') plt.ylabel('TPR') plt.title('Receiver Operating Characteristic (ROC) Curve') plt.legend(["AUC=%.3f" % AUC]) plt.show() y = ['Eve', 'Eve', 'Eve', 'Eve','Eve','Bob','Bob', 'Bob','Bob','Bob'] scores = [20,25,16,42,22,50,41,60,54,39] positive_label = 'Bob' fpr, tpr, thresholds = metrics.roc_curve(y, scores, pos_label=positive_label) auc=metrics.auc(fpr, tpr) print ("FPR:",fpr) print ("TPR:",tpr) print ("Thresholds:",thresholds) show_roc(fpr, tpr,auc)
What is the AUC: Bob's daily accesses for email are now monitored for 50, 55, 43, 90, 110 and 66, and Eve has accesses of 14, 32, 19, 46, 21, 48 and 50. Use the program to determine the new ROC curve:
What is the AUC: 2. Now, we can add Alice, who has accesses of 13, 23, 32, 40, 11, and 14, and determine the following:
What is the AUC: 3. Now change the positive label to Alice, and determine the following:
What is the AUC: In the following example we will load a dataset for a machine learning model to differentiate handwritten digits. Run the following code and determine the confusion matrix: https://repl.it/@billbuchanan/class02 # https://asecuritysite.com/bigdata/sk01 import sys import matplotlib.pyplot as plt ga=0.011 from sklearn import datasets, svm, metrics digits = datasets.load_digits() images_and_labels = list(zip(digits.images, digits.target)) for index, (image, label) in enumerate(images_and_labels[:10]): plt.subplot(2, 10, (index + 1)) plt.axis('off') plt.imshow(image, cmap=plt.cm.gray_r, interpolation='nearest') plt.title('Tr: %i' % label) # To apply a classifier on this data, we need to flatten the image, to # turn the data in a (samples, feature) matrix: n_samples = len(digits.images) data = digits.images.reshape((n_samples, -1)) # Create a classifier: a support vector classifier classifier = svm.SVC(gamma=ga) # We learn the digits on the first half of the digits classifier.fit(data[:n_samples // 2], digits.target[:n_samples // 2]) # Now predict the value of the digit on the second half: expected = digits.target[n_samples // 2:] predicted = classifier.predict(data[n_samples // 2:]) print("Classification report for classifier %s:\n%s\n" % (classifier, metrics.classification_report(expected, predicted))) print("Confusion matrix:\n%s" % metrics.confusion_matrix(expected, predicted)) images_and_predictions = list(zip(digits.images[n_samples // 2:], predicted)) for index, (image, prediction) in enumerate(images_and_predictions[:4]): plt.subplot(2, 4, index + 5) plt.axis('off') plt.imshow(image, cmap=plt.cm.gray_r, interpolation='nearest') plt.title('Prediction: %i' % prediction) plt.show()
What are the TP for the character '0'? 5. The following code generates a data set which has two clusters, and then marks each of the dataset elements for their cluster source. Run the program several times and observe the creation of the clusters: https://repl.it/@billbuchanan/class03 # This code creates a data set with two clusters (defined by the two features. The output is then data_vals[:, 0] and data_vals[:, 1] and these are marked by class_lab from sklearn.datasets import make_classification import matplotlib.pyplot as plt data_vals, class_label =make_classification(n_samples=100,n_features=2, n_redundant=0, n_informative=1,n_clusters_per_class=1) plt.scatter(data_vals[:, 0], data_vals[:, 1], marker='o', c=class_label,s=25, edgecolor='k') plt.savefig('test.png') plt.show() Modify the code so that it now generates 250 points. 6. We will now use this method of cluster generation, and then split the data into 70% training data and 30% test data, in order to train a RandomForestClassifier model to predict the results: https://repl.it/@billbuchanan/class04 # Create ROC from sklearn.datasets import make_classification from sklearn.model_selection import train_test_split from sklearn.ensemble import RandomForestClassifier from sklearn.metrics import roc_auc_score from sklearn.metrics import roc_curve import matplotlib.pyplot as plt def show_roc(FPR, TPR, AUC): plt.plot(FPR, TPR, color='blue', label='ROC') plt.plot([0, 1], [0, 1], color='darkblue', linestyle='--') plt.xlabel('FPR') plt.ylabel('TPR') plt.title('Receiver Operating Characteristic (ROC) Curve') plt.legend(["AUC=%.3f" % AUC]) plt.show() data_vals, class_label =make_classification(n_samples=10,n_features=2, n_redundant=0, n_informative=1,n_clusters_per_class=1) X_train, X_test, y_train, y_test = train_test_split(data_vals, class_label, test_size=0.3, random_state=1) model = RandomForestClassifier() model.fit(X_train, y_train) print ("Model score: ",model.score(X_test, y_test)) probs = model.predict_proba(X_test) 
# probabilities of getting a 1 probs = probs[:, 1] auc = roc_auc_score(y_test, probs) FPR, TPR, thresholds = roc_curve(y_test, probs) print ("Thresholds: ",thresholds) print ("FPR: ",FPR) print ("TPR: ",TPR) show_roc(FPR, TPR,auc) plt.scatter(data_vals[:, 0], data_vals[:, 1], marker='o', c=class_label,s=25, edgecolor='k') plt.savefig('test.png')
For 10 points, what is the AUC?
7. There are a few ensemble methods for machine learning in scikit-learn, including BaggingClassifier, ExtraTreesClassifier, AdaBoostClassifier, and GradientBoostingClassifier. Modify the code given in Q.6 to support each of the different models: from sklearn.ensemble import AdaBoostClassifier ... model = AdaBoostClassifier() For each of the methods, what is the AUC, and which method is the best performing? ExtraTreesClassifier AUC: AdaBoostClassifier AUC: GradientBoostingClassifier AUC: BaggingClassifier AUC: 8. You have been asked to identify if there is a linkage between gun ownership and population density in US states, and whether there is a link to the number of murders per 100K of the population. An outline of the code is given here: https://repl.it/@billbuchanan/class08 # Numeric Prediction import pandas as pd from sklearn.model_selection import train_test_split from sklearn.ensemble import RandomForestRegressor from sklearn.metrics import r2_score,mean_squared_error # Features x1= "Gun ownership" x2= "Population density" # Prediction x3 = "Murders per 100K" fdata="guns.csv" print ("Training data:\t\t",x1,",",x2) print ("Training against:\t",x3) print ("Data set:\t\t",fdata) ver=pd.read_csv(fdata) dataset=ver[[x1,x2]] train=ver[x3] print (dataset) x_train, x_test, y_train, y_test= train_test_split(dataset,train,test_size=0.3, random_state=1) model= RandomForestRegressor() model.fit(x_train,y_train) y_predictions =model.predict(x_test) accuracy = r2_score(y_test, y_predictions) mse = mean_squared_error(y_test, y_predictions) print ("R^2=",accuracy) print ("MSE=",mse) # Correlation cor=ver.corr() print (cor[x3])
By examining the \(R^2\) value, is the machine learning implementation a good model?