# tester.py
# Ignored usage of deprecated modules for sklearn 0.18.
# This would be updated in future when sklearn 0.20 releases.
import warnings
# Blanket-suppress the DeprecationWarning raised by sklearn.cross_validation below.
warnings.filterwarnings("ignore")
""" a basic script for importing student's POI identifier,
and checking the results that they get from it
requires that the algorithm, dataset, and features list
be written to my_classifier.pkl, my_dataset.pkl, and
my_feature_list.pkl, respectively
that process should happen at the end of poi_id.py
"""
import pickle
import sys
# Deprecated in sklearn 0.18 (removed in 0.20); the model_selection
# replacement kept below as a comment has a different constructor/split API,
# so switching it also requires changing test_classifier.
from sklearn.cross_validation import StratifiedShuffleSplit
#from sklearn.model_selection import StratifiedShuffleSplit
sys.path.append("../tools/")  # make feature_format importable from the sibling tools dir
from feature_format import featureFormat, targetFeatureSplit

# Tab-separated template for the five scalar metrics printed per classifier.
PERF_FORMAT_STRING = "\
\tAccuracy: {:>0.{display_precision}f}\tPrecision: {:>0.{display_precision}f}\t\
Recall: {:>0.{display_precision}f}\tF1: {:>0.{display_precision}f}\tF2: {:>0.{display_precision}f}"
# Tab-separated template for the raw confusion-matrix counts.
RESULTS_FORMAT_STRING = "\tTotal predictions: {:4d}\tTrue positives: {:4d}\tFalse positives: {:4d}\
\tFalse negatives: {:4d}\tTrue negatives: {:4d}"
  23. def test_classifier(clf, dataset, feature_list, folds = 1000):
  24. data = featureFormat(dataset, feature_list, sort_keys = True)
  25. labels, features = targetFeatureSplit(data)
  26. cv = StratifiedShuffleSplit(labels, folds, random_state = 42)
  27. #cv = StratifiedShuffleSplit(n_splits=folds, test_size=0.3,random_state = 42)
  28. true_negatives = 0
  29. false_negatives = 0
  30. true_positives = 0
  31. false_positives = 0
  32. for train_idx, test_idx in cv:
  33. features_train = []
  34. features_test = []
  35. labels_train = []
  36. labels_test = []
  37. for ii in train_idx:
  38. features_train.append( features[ii] )
  39. labels_train.append( labels[ii] )
  40. for jj in test_idx:
  41. features_test.append( features[jj] )
  42. labels_test.append( labels[jj] )
  43. ### fit the classifier using training set, and test on test set
  44. clf.fit(features_train, labels_train)
  45. predictions = clf.predict(features_test)
  46. for prediction, truth in zip(predictions, labels_test):
  47. if prediction == 0 and truth == 0:
  48. true_negatives += 1
  49. elif prediction == 0 and truth == 1:
  50. false_negatives += 1
  51. elif prediction == 1 and truth == 0:
  52. false_positives += 1
  53. elif prediction == 1 and truth == 1:
  54. true_positives += 1
  55. else:
  56. print ("Warning: Found a predicted label not == 0 or 1.")
  57. print ("All predictions should take value 0 or 1.")
  58. print ("Evaluating performance for processed predictions:")
  59. break
  60. try:
  61. total_predictions = true_negatives + false_negatives + false_positives + true_positives
  62. accuracy = 1.0*(true_positives + true_negatives)/total_predictions
  63. precision = 1.0*true_positives/(true_positives+false_positives)
  64. recall = 1.0*true_positives/(true_positives+false_negatives)
  65. f1 = 2.0 * true_positives/(2*true_positives + false_positives+false_negatives)
  66. f2 = (1+2.0*2.0) * precision*recall/(4*precision + recall)
  67. print (clf)
  68. print (PERF_FORMAT_STRING.format(accuracy, precision, recall, f1, f2, display_precision = 5))
  69. print (RESULTS_FORMAT_STRING.format(total_predictions, true_positives, false_positives, false_negatives, true_negatives))
  70. print ("")
  71. except:
  72. print ("Got a divide by zero when trying out:", clf)
  73. print ("Precision or recall may be undefined due to a lack of true positive predicitons.")
# File names (relative to the working directory) that dump_classifier_and_data
# writes and load_classifier_and_data reads back; poi_id.py produces them.
CLF_PICKLE_FILENAME = "my_classifier.pkl"
DATASET_PICKLE_FILENAME = "my_dataset.pkl"
FEATURE_LIST_FILENAME = "my_feature_list.pkl"
  77. def dump_classifier_and_data(clf, dataset, feature_list):
  78. with open(CLF_PICKLE_FILENAME, "wb") as clf_outfile:
  79. pickle.dump(clf, clf_outfile)
  80. with open(DATASET_PICKLE_FILENAME, "wb") as dataset_outfile:
  81. pickle.dump(dataset, dataset_outfile)
  82. with open(FEATURE_LIST_FILENAME, "wb") as featurelist_outfile:
  83. pickle.dump(feature_list, featurelist_outfile)
  84. def load_classifier_and_data():
  85. with open(CLF_PICKLE_FILENAME, "rb") as clf_infile:
  86. clf = pickle.load(clf_infile)
  87. with open(DATASET_PICKLE_FILENAME, "rb") as dataset_infile:
  88. dataset = pickle.load(dataset_infile)
  89. with open(FEATURE_LIST_FILENAME, "rb") as featurelist_infile:
  90. feature_list = pickle.load(featurelist_infile)
  91. return clf, dataset, feature_list
  92. def main():
  93. ### Load up student's classifier, dataset, and feature_list.
  94. clf, dataset, feature_list = load_classifier_and_data()
  95. ### Run testing script
  96. test_classifier(clf, dataset, feature_list)
  97. if __name__ == '__main__':
  98. main()