poi_id.py

# Ignore deprecation warnings for the sklearn 0.18 modules used below.
# These imports should be updated once sklearn 0.20 is released.
import warnings
warnings.filterwarnings("ignore")
import pickle
from feature_format import featureFormat, targetFeatureSplit
from tester import dump_classifier_and_data
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.grid_search import GridSearchCV
from time import time
### Task 1: Select what features you'll use.
### features_list is a list of strings, each of which is a feature name.
### The first feature must be "poi".
features_list = ['poi', 'salary', 'bonus', 'long_term_incentive',
                 'bonus-to-salary_ratio', 'deferral_payments', 'expenses',
                 'restricted_stock_deferred', 'restricted_stock',
                 'deferred_income', 'fraction_mail_from_poi',
                 'total_payments', 'other', 'fraction_mail_to_poi',
                 'from_poi_to_this_person', 'from_this_person_to_poi',
                 'to_messages', 'from_messages', 'shared_receipt_with_poi',
                 'loan_advances', 'director_fees',
                 'exercised_stock_options', 'total_stock_value']
### Load the dictionary containing the dataset
with open("final_project_dataset.pkl", "rb") as data_file:
    data_dict = pickle.load(data_file)
# Converting the given pickled Enron data to a pandas dataframe
enron_df = pd.DataFrame.from_records(list(data_dict.values()))
# Set the index of df to be the employees series:
employees = pd.Series(list(data_dict.keys()))
enron_df.set_index(employees, inplace=True)
# Coerce numeric values into floats or ints; also change NaN to zero:
enron_df_new = enron_df.apply(lambda x: pd.to_numeric(x, errors='coerce')).copy().fillna(0)
# Dropping column 'email_address' as not required in analysis
enron_df_new.drop('email_address', axis=1, inplace=True)
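# --- Optional sanity check (added illustration, not part of the original workflow) ---
# A quick look at the cleaned dataframe: its shape, how many POIs it contains, and
# the largest salaries. Inspecting the top salaries is one way the spreadsheet
# artifacts removed in Task 2 below (e.g. the 'TOTAL' row) could be spotted.
print("DataFrame shape (rows, columns): ", enron_df_new.shape)
print("Number of POIs: ", int(enron_df_new['poi'].sum()))
print(enron_df_new['salary'].sort_values(ascending=False).head())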
### Task 2: Remove outliers
enron_df_new.drop(['TOTAL', 'THE TRAVEL AGENCY IN THE PARK', 'FREVERT MARK A',
                   'MARTIN AMANDA K', 'BHATNAGAR SANJAY'], axis=0, inplace=True)
### Task 3: Create new feature(s)
enron_df_new['bonus-to-salary_ratio'] = enron_df_new['bonus'] / enron_df_new['salary']
enron_df_new['fraction_mail_from_poi'] = enron_df_new['from_poi_to_this_person'] / enron_df_new['from_messages']
enron_df_new['fraction_mail_to_poi'] = enron_df_new['from_this_person_to_poi'] / enron_df_new['to_messages']
# Clean the inf values produced when the denominator (from_messages or to_messages) is 0.
# Note: the division yields numeric np.inf, not the string 'inf', so replace the float values.
enron_df_new = enron_df_new.replace([np.inf, -np.inf], 0)
enron_df_new = enron_df_new.fillna(0)
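# --- Optional sanity check (added illustration) ---
# Confirm that no infinite or missing values remain after the replacements above.
print("Remaining inf entries: ", int((enron_df_new == np.inf).sum().sum()))
print("Remaining NaN entries: ", int(enron_df_new.isnull().sum().sum()))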
# Converting the above modified dataframe to a dictionary
enron_dict = enron_df_new.to_dict('index')
### Store to my_dataset for easy export below.
my_dataset = enron_dict
### Extract features and labels from dataset for local testing
data = featureFormat(my_dataset, features_list, sort_keys=True)
labels, features = targetFeatureSplit(data)
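# --- Optional sanity check (added illustration) ---
# With its default settings, featureFormat may drop persons whose selected feature
# values are all zero, so the usable number of data points (and POIs) can shrink.
print("Data points after featureFormat: ", len(features))
print("POIs among them: ", int(sum(labels)))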
### Task 4: Try a variety of classifiers
### Please name your classifier clf for easy export below.
### Note that if you want to do PCA or other multi-stage operations,
### you'll need to use Pipelines. For more info:
### http://scikit-learn.org/stable/modules/pipeline.html
### Split data into training and testing datasets
from sklearn import cross_validation
features_train, features_test, \
    labels_train, labels_test = cross_validation.train_test_split(features, labels,
                                                                   test_size=0.3, random_state=42)
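# --- Optional check (added illustration) ---
# The classes are heavily imbalanced (only a handful of POIs), which is why
# stratified shuffle-split cross-validation and F1 scoring are used below.
print("POIs in training set: ", int(sum(labels_train)), "out of", len(labels_train))
print("POIs in test set: ", int(sum(labels_test)), "out of", len(labels_test))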
# Stratified ShuffleSplit cross-validator
from sklearn.model_selection import StratifiedShuffleSplit
sss = StratifiedShuffleSplit(n_splits=1000, test_size=0.3, random_state=42)
# Importing modules for feature scaling and selection
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
# Define the transformers to be used in the pipeline
scaler = MinMaxScaler()
skb = SelectKBest(f_classif)
# pca = PCA()
### Task 5: Tune your classifier to achieve better than .3 precision and recall
### using our testing script. Check the tester.py script in the final project
### folder for details on the evaluation method, especially the test_classifier
### function. Because of the small size of the dataset, the script uses
### stratified shuffle split cross validation.
from sklearn.naive_bayes import GaussianNB
clf_gnb = GaussianNB()
pipeline = Pipeline(steps=[("SKB", skb), ("NaiveBayes", clf_gnb)])
param_grid = {"SKB__k": list(range(3, 20))}
grid = GridSearchCV(pipeline, param_grid, verbose=0, cv=sss, scoring='f1')
t0 = time()
grid.fit(features, labels)
print("Training time: ", round(time() - t0, 3), "s")
# Best algorithm
clf = grid.best_estimator_
t0 = time()
# Refit the best algorithm:
clf.fit(features_train, labels_train)
prediction = clf.predict(features_test)
print("Testing time: ", round(time() - t0, 3), "s")
# Note: the sklearn metric functions expect (y_true, y_pred) in that order.
print("Accuracy of GaussianNB classifier is : ", accuracy_score(labels_test, prediction))
print("Precision of GaussianNB classifier is : ", precision_score(labels_test, prediction))
print("Recall of GaussianNB classifier is : ", recall_score(labels_test, prediction))
print("f1-score of GaussianNB classifier is : ", f1_score(labels_test, prediction))
### Task 6: Dump your classifier, dataset, and features_list so anyone can
### check your results. You do not need to change anything below, but make sure
### that the version of poi_id.py that you submit can be run on its own and
### generates the necessary .pkl files for validating your results.
dump_classifier_and_data(clf, my_dataset, features_list)