123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111 |
# Silence warnings about the use of modules deprecated in sklearn 0.18.
# Update this once sklearn 0.20 is released.
- import warnings
- warnings.filterwarnings("ignore")
- """ a basic script for importing student's POI identifier,
- and checking the results that they get from it
- requires that the algorithm, dataset, and features list
- be written to my_classifier.pkl, my_dataset.pkl, and
- my_feature_list.pkl, respectively
- that process should happen at the end of poi_id.py
- """
- import pickle
- import sys
- from sklearn.cross_validation import StratifiedShuffleSplit
- #from sklearn.model_selection import StratifiedShuffleSplit
- sys.path.append("../tools/")
- from feature_format import featureFormat, targetFeatureSplit
# Template for the metrics line printed by test_classifier. It takes five
# positional floats (accuracy, precision, recall, F1, F2) plus a
# ``display_precision`` keyword giving the number of decimals to show.
# Built from adjacent string literals rather than backslash-newline
# continuations inside one literal, so that leading characters on a
# continuation line can never leak into the runtime string.
PERF_FORMAT_STRING = (
    "\tAccuracy: {:>0.{display_precision}f}"
    "\tPrecision: {:>0.{display_precision}f}"
    "\tRecall: {:>0.{display_precision}f}"
    "\tF1: {:>0.{display_precision}f}"
    "\tF2: {:>0.{display_precision}f}"
)

# Template for the raw-count line printed by test_classifier. It takes five
# positional integers: total predictions, true positives, false positives,
# false negatives and true negatives, each right-aligned in a 4-wide field.
RESULTS_FORMAT_STRING = (
    "\tTotal predictions: {:4d}"
    "\tTrue positives: {:4d}"
    "\tFalse positives: {:4d}"
    "\tFalse negatives: {:4d}"
    "\tTrue negatives: {:4d}"
)
def test_classifier(clf, dataset, feature_list, folds = 1000):
    """Cross-validate ``clf`` and print its aggregate binary-classification metrics.

    The dataset is converted with ``featureFormat`` / ``targetFeatureSplit``
    into labels and features, then split ``folds`` times by a
    ``StratifiedShuffleSplit`` seeded with ``random_state=42``. For every
    split the classifier is re-fitted on the training part and its
    predictions on the test part are added to a single confusion matrix
    accumulated over all splits.

    Args:
        clf: Estimator exposing ``fit(features, labels)`` and
            ``predict(features)``. It is fitted in place, once per split.
        dataset: Raw dataset, passed unchanged to ``featureFormat``.
        feature_list: Feature names, passed unchanged to ``featureFormat``.
            NOTE(review): presumably the label is the first entry; confirm
            against ``feature_format``.
        folds: Number of shuffle-split iterations.

    Returns:
        None. The classifier, accuracy, precision, recall, F1, F2 and the raw
        counts are printed. If a metric's denominator is zero, a
        divide-by-zero notice is printed instead.
    """
    data = featureFormat(dataset, feature_list, sort_keys = True)
    labels, features = targetFeatureSplit(data)
    # Legacy sklearn.cross_validation API, matching this file's import: the
    # labels go to the constructor and the resulting object is iterated
    # directly to obtain (train indices, test indices) pairs.
    cv = StratifiedShuffleSplit(labels, folds, random_state = 42)

    true_negatives = 0
    false_negatives = 0
    true_positives = 0
    false_positives = 0
    for train_idx, test_idx in cv:
        features_train = [features[ii] for ii in train_idx]
        labels_train = [labels[ii] for ii in train_idx]
        features_test = [features[jj] for jj in test_idx]
        labels_test = [labels[jj] for jj in test_idx]

        ### fit the classifier using training set, and test on test set
        clf.fit(features_train, labels_train)
        predictions = clf.predict(features_test)
        for prediction, truth in zip(predictions, labels_test):
            if prediction == 0 and truth == 0:
                true_negatives += 1
            elif prediction == 0 and truth == 1:
                false_negatives += 1
            elif prediction == 1 and truth == 0:
                false_positives += 1
            elif prediction == 1 and truth == 1:
                true_positives += 1
            else:
                print ("Warning: Found a predicted label not == 0 or 1.")
                print ("All predictions should take value 0 or 1.")
                print ("Evaluating performance for processed predictions:")
                # Abandon the rest of this split only; later splits are
                # still evaluated and counted.
                break

    # Only the arithmetic can legitimately fail here, so only
    # ZeroDivisionError is caught; any other error propagates instead of
    # being misreported as a divide by zero.
    try:
        total_predictions = true_negatives + false_negatives + false_positives + true_positives
        accuracy = 1.0*(true_positives + true_negatives)/total_predictions
        precision = 1.0*true_positives/(true_positives+false_positives)
        recall = 1.0*true_positives/(true_positives+false_negatives)
        f1 = 2.0 * true_positives/(2*true_positives + false_positives+false_negatives)
        # F-beta with beta = 2: (1 + beta^2) * P * R / (beta^2 * P + R).
        f2 = (1+2.0*2.0) * precision*recall/(4*precision + recall)
    except ZeroDivisionError:
        print ("Got a divide by zero when trying out:", clf)
        print ("Precision or recall may be undefined due to a lack of true positive predicitons.")
        return

    print (clf)
    print (PERF_FORMAT_STRING.format(accuracy, precision, recall, f1, f2, display_precision = 5))
    print (RESULTS_FORMAT_STRING.format(total_predictions, true_positives, false_positives, false_negatives, true_negatives))
    print ("")
# Relative paths of the three pickle files shared by
# dump_classifier_and_data (writer) and load_classifier_and_data (reader).
CLF_PICKLE_FILENAME = "my_classifier.pkl"        # the classifier object
DATASET_PICKLE_FILENAME = "my_dataset.pkl"       # the dataset
FEATURE_LIST_FILENAME = "my_feature_list.pkl"    # the feature list
def dump_classifier_and_data(clf, dataset, feature_list):
    """Pickle the classifier, dataset and feature list to their fixed files.

    Each object is written in binary mode to the file named by the matching
    module-level constant (``CLF_PICKLE_FILENAME``,
    ``DATASET_PICKLE_FILENAME``, ``FEATURE_LIST_FILENAME``), in that order.
    Opening with ``"wb"`` replaces any existing file of the same name.
    """
    outputs = (
        (CLF_PICKLE_FILENAME, clf),
        (DATASET_PICKLE_FILENAME, dataset),
        (FEATURE_LIST_FILENAME, feature_list),
    )
    for filename, payload in outputs:
        with open(filename, "wb") as outfile:
            pickle.dump(payload, outfile)
def load_classifier_and_data():
    """Read back the three pickles written by ``dump_classifier_and_data``.

    Returns:
        A ``(clf, dataset, feature_list)`` tuple unpickled from
        ``CLF_PICKLE_FILENAME``, ``DATASET_PICKLE_FILENAME`` and
        ``FEATURE_LIST_FILENAME`` respectively.
    """
    # NOTE(review): pickle.load can execute arbitrary code embedded in the
    # file, so these files must come from a trusted source.
    loaded = []
    for filename in (CLF_PICKLE_FILENAME, DATASET_PICKLE_FILENAME, FEATURE_LIST_FILENAME):
        with open(filename, "rb") as infile:
            loaded.append(pickle.load(infile))
    clf, dataset, feature_list = loaded
    return clf, dataset, feature_list
def main():
    """Load the pickled classifier, dataset and feature list, then evaluate them."""
    # load_classifier_and_data returns (clf, dataset, feature_list), which
    # are exactly test_classifier's leading positional arguments.
    test_classifier(*load_classifier_and_data())


# Run the evaluation only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|