"""Basic script for importing a student's POI identifier and checking its results.

Requires that the algorithm, dataset, and features list be written to
my_classifier.pkl, my_dataset.pkl, and my_feature_list.pkl, respectively.
That process should happen at the end of poi_id.py.
"""

# All warnings are silenced on purpose: with sklearn 0.18/0.19 the legacy
# ``sklearn.cross_validation`` import below emits a deprecation warning.
# The filter must be installed before sklearn is imported.
import warnings
warnings.filterwarnings("ignore")

import pickle
import sys

try:
    # Legacy location; kept first so behaviour is unchanged on old sklearn.
    from sklearn.cross_validation import StratifiedShuffleSplit
    _LEGACY_CV_API = True
except ImportError:
    # sklearn >= 0.20 removed ``sklearn.cross_validation``.
    from sklearn.model_selection import StratifiedShuffleSplit
    _LEGACY_CV_API = False

sys.path.append("../tools/")
from feature_format import featureFormat, targetFeatureSplit

PERF_FORMAT_STRING = (
    "\tAccuracy: {:>0.{display_precision}f}"
    "\tPrecision: {:>0.{display_precision}f}"
    "\tRecall: {:>0.{display_precision}f}"
    "\tF1: {:>0.{display_precision}f}"
    "\tF2: {:>0.{display_precision}f}"
)
RESULTS_FORMAT_STRING = (
    "\tTotal predictions: {:4d}"
    "\tTrue positives: {:4d}"
    "\tFalse positives: {:4d}"
    "\tFalse negatives: {:4d}"
    "\tTrue negatives: {:4d}"
)


def _iter_splits(features, labels, folds):
    """Yield ``(train_idx, test_idx)`` pairs from a stratified shuffle split.

    Hides the difference between the legacy ``sklearn.cross_validation`` API
    (labels passed to the constructor, object iterated directly) and the
    ``sklearn.model_selection`` API (``n_splits`` + ``split(X, y)``).
    The random seed is fixed at 42 in both cases.
    """
    if _LEGACY_CV_API:
        return iter(StratifiedShuffleSplit(labels, folds, random_state=42))
    splitter = StratifiedShuffleSplit(n_splits=folds, random_state=42)
    return splitter.split(features, labels)


def test_classifier(clf, dataset, feature_list, folds=1000):
    """Cross-validate ``clf`` and print its aggregate performance.

    The classifier is re-fit on every stratified shuffle split and the
    confusion-matrix counts are accumulated over all splits. Accuracy,
    precision, recall, F1 and F2 are then printed together with the raw
    counts.

    Args:
        clf: Object exposing ``fit(features, labels)`` and
            ``predict(features)``; predictions are expected to be 0 or 1.
        dataset: Passed straight to ``featureFormat``.
            NOTE(review): presumably the project's person -> features dict;
            confirm against ``feature_format``.
        feature_list: Passed straight to ``featureFormat``.
        folds: Number of shuffle-split iterations.

    Returns:
        None. Results are written to stdout.
    """
    data = featureFormat(dataset, feature_list, sort_keys=True)
    labels, features = targetFeatureSplit(data)

    true_negatives = 0
    false_negatives = 0
    true_positives = 0
    false_positives = 0

    for train_idx, test_idx in _iter_splits(features, labels, folds):
        features_train = [features[ii] for ii in train_idx]
        labels_train = [labels[ii] for ii in train_idx]
        features_test = [features[jj] for jj in test_idx]
        labels_test = [labels[jj] for jj in test_idx]

        # Fit the classifier using the training set, and test on the test set.
        clf.fit(features_train, labels_train)
        predictions = clf.predict(features_test)

        for prediction, truth in zip(predictions, labels_test):
            if prediction == 0 and truth == 0:
                true_negatives += 1
            elif prediction == 0 and truth == 1:
                false_negatives += 1
            elif prediction == 1 and truth == 0:
                false_positives += 1
            elif prediction == 1 and truth == 1:
                true_positives += 1
            else:
                print("Warning: Found a predicted label not == 0 or 1.")
                print("All predictions should take value 0 or 1.")
                print("Evaluating performance for processed predictions:")
                # Stops scoring the remainder of this split only; the outer
                # loop carries on with the next split.
                break

    try:
        total_predictions = (
            true_negatives + false_negatives + false_positives + true_positives
        )
        accuracy = 1.0 * (true_positives + true_negatives) / total_predictions
        precision = 1.0 * true_positives / (true_positives + false_positives)
        recall = 1.0 * true_positives / (true_positives + false_negatives)
        f1 = 2.0 * true_positives / (
            2 * true_positives + false_positives + false_negatives
        )
        f2 = (1 + 2.0 * 2.0) * precision * recall / (4 * precision + recall)
    except ZeroDivisionError:
        # Only the undefined-metric case is reported here; any other error
        # is a real bug and is allowed to propagate.
        print("Got a divide by zero when trying out:", clf)
        print("Precision or recall may be undefined due to a lack of true positive predicitons.")
    else:
        print(clf)
        print(PERF_FORMAT_STRING.format(
            accuracy, precision, recall, f1, f2, display_precision=5))
        print(RESULTS_FORMAT_STRING.format(
            total_predictions, true_positives, false_positives,
            false_negatives, true_negatives))
        print("")


CLF_PICKLE_FILENAME = "my_classifier.pkl"
DATASET_PICKLE_FILENAME = "my_dataset.pkl"
FEATURE_LIST_FILENAME = "my_feature_list.pkl"


def dump_classifier_and_data(clf, dataset, feature_list):
    """Pickle the classifier, dataset and feature list to their fixed files.

    Each object is written to its own file in the current working directory
    (``CLF_PICKLE_FILENAME``, ``DATASET_PICKLE_FILENAME``,
    ``FEATURE_LIST_FILENAME``); existing files are overwritten.
    """
    with open(CLF_PICKLE_FILENAME, "wb") as clf_outfile:
        pickle.dump(clf, clf_outfile)
    with open(DATASET_PICKLE_FILENAME, "wb") as dataset_outfile:
        pickle.dump(dataset, dataset_outfile)
    with open(FEATURE_LIST_FILENAME, "wb") as featurelist_outfile:
        pickle.dump(feature_list, featurelist_outfile)


def load_classifier_and_data():
    """Load the pickled classifier, dataset and feature list.

    SECURITY: ``pickle.load`` can execute arbitrary code while
    deserialising. Only run this against pickle files you produced yourself
    or otherwise trust.

    Returns:
        Tuple ``(clf, dataset, feature_list)``.
    """
    with open(CLF_PICKLE_FILENAME, "rb") as clf_infile:
        clf = pickle.load(clf_infile)
    with open(DATASET_PICKLE_FILENAME, "rb") as dataset_infile:
        dataset = pickle.load(dataset_infile)
    with open(FEATURE_LIST_FILENAME, "rb") as featurelist_infile:
        feature_list = pickle.load(featurelist_infile)
    return clf, dataset, feature_list


def main():
    """Load the student's pickled artefacts and run the testing script."""
    # Load up student's classifier, dataset, and feature_list.
    clf, dataset, feature_list = load_classifier_and_data()
    # Run testing script.
    test_classifier(clf, dataset, feature_list)


if __name__ == '__main__':
    main()