# Ignored usage of deprecated modules for sklearn 0.18.
# This should be updated once sklearn 0.20 is released.
import warnings
warnings.filterwarnings("ignore")

import pickle
from feature_format import featureFormat, targetFeatureSplit
from tester import dump_classifier_and_data
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from time import time

### Task 1: Select what features you'll use.
### features_list is a list of strings, each of which is a feature name.
### The first feature must be "poi".
features_list = ['poi', 'salary', 'bonus', 'long_term_incentive',
                 'bonus-to-salary_ratio', 'deferral_payments', 'expenses',
                 'restricted_stock_deferred', 'restricted_stock',
                 'deferred_income', 'fraction_mail_from_poi', 'total_payments',
                 'other', 'fraction_mail_to_poi', 'from_poi_to_this_person',
                 'from_this_person_to_poi', 'to_messages', 'from_messages',
                 'shared_receipt_with_poi', 'loan_advances', 'director_fees',
                 'exercised_stock_options', 'total_stock_value']

### Load the dictionary containing the dataset
with open("final_project_dataset.pkl", "rb") as data_file:
    data_dict = pickle.load(data_file)

# Convert the pickled Enron data to a pandas dataframe
enron_df = pd.DataFrame.from_records(list(data_dict.values()))

# Set the index of the dataframe to the employee names:
employees = pd.Series(list(data_dict.keys()))
enron_df.set_index(employees, inplace=True)

# Coerce numeric values into floats or ints; also change NaN to zero:
enron_df_new = enron_df.apply(lambda x: pd.to_numeric(x, errors='coerce')).copy().fillna(0)

# Drop the 'email_address' column, as it is not required in the analysis
enron_df_new.drop('email_address', axis=1, inplace=True)

### Task 2: Remove outliers
enron_df_new.drop(['TOTAL', 'THE TRAVEL AGENCY IN THE PARK', 'FREVERT MARK A',
                   'MARTIN AMANDA K', 'BHATNAGAR SANJAY'],
                  axis=0, inplace=True)

### Task 3: Create new feature(s)
enron_df_new['bonus-to-salary_ratio'] = enron_df_new['bonus'] / enron_df_new['salary']
enron_df_new['fraction_mail_from_poi'] = enron_df_new['from_poi_to_this_person'] / enron_df_new['from_messages']
enron_df_new['fraction_mail_to_poi'] = enron_df_new['from_this_person_to_poi'] / enron_df_new['to_messages']

# Clean up the inf values produced when a person's from_messages or to_messages is 0
enron_df_new = enron_df_new.replace([np.inf, -np.inf], 0)
enron_df_new = enron_df_new.fillna(0)

# Convert the modified dataframe back to a dictionary
enron_dict = enron_df_new.to_dict('index')

### Store to my_dataset for easy export below.
my_dataset = enron_dict

### Extract features and labels from dataset for local testing
data = featureFormat(my_dataset, features_list, sort_keys=True)
labels, features = targetFeatureSplit(data)

### Task 4: Try a variety of classifiers
### Please name your classifier clf for easy export below.
### Note that if you want to do PCA or other multi-stage operations,
### you'll need to use Pipelines. For more info:
### http://scikit-learn.org/stable/modules/pipeline.html

### Split data into training and testing datasets
from sklearn import cross_validation
features_train, features_test, \
    labels_train, labels_test = cross_validation.train_test_split(features, labels,
                                                                  test_size=0.3,
                                                                  random_state=42)

# Stratified ShuffleSplit cross-validator
from sklearn.model_selection import StratifiedShuffleSplit
sss = StratifiedShuffleSplit(n_splits=1000, test_size=0.3, random_state=42)

# Import modules for feature scaling and selection
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

# Define the transformers to be used in the pipeline
scaler = MinMaxScaler()
skb = SelectKBest(f_classif)
# pca = PCA()
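# Illustrative sketch only (not used in the model tuned below): the Task 4 note above
# says that PCA or other multi-stage operations need a Pipeline. The helper and step
# names here (make_pca_pipeline, "scale", "pca", "clf") are hypothetical and not part
# of this project; the tuned model below uses SelectKBest + GaussianNB instead.
def make_pca_pipeline(n_components=5):
    """Return a hypothetical MinMaxScaler -> PCA -> GaussianNB pipeline."""
    from sklearn.preprocessing import MinMaxScaler
    from sklearn.decomposition import PCA
    from sklearn.pipeline import Pipeline
    from sklearn.naive_bayes import GaussianNB
    return Pipeline(steps=[("scale", MinMaxScaler()),
                           ("pca", PCA(n_components=n_components)),
                           ("clf", GaussianNB())])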
### Task 5: Tune your classifier to achieve better than .3 precision and recall
### using our testing script. Check the tester.py script in the final project
### folder for details on the evaluation method, especially the test_classifier
### function. Because of the small size of the dataset, the script uses
### stratified shuffle split cross validation.
from sklearn.naive_bayes import GaussianNB
clf_gnb = GaussianNB()

pipeline = Pipeline(steps=[("SKB", skb), ("NaiveBayes", clf_gnb)])
param_grid = {"SKB__k": list(range(3, 20))}  # try k = 3 through 19 selected features
grid = GridSearchCV(pipeline, param_grid, verbose=0, cv=sss, scoring='f1')

t0 = time()
grid.fit(features, labels)
print("Training time: ", round(time() - t0, 3), "s")

# Best estimator found by the grid search
clf = grid.best_estimator_

t0 = time()
# Refit the best estimator on the training split and evaluate on the test split:
clf.fit(features_train, labels_train)
prediction = clf.predict(features_test)
print("Testing time: ", round(time() - t0, 3), "s")

print("Accuracy of GaussianNB classifier is : ", accuracy_score(labels_test, prediction))
print("Precision of GaussianNB classifier is : ", precision_score(labels_test, prediction))
print("Recall of GaussianNB classifier is : ", recall_score(labels_test, prediction))
print("f1-score of GaussianNB classifier is : ", f1_score(labels_test, prediction))

### Task 6: Dump your classifier, dataset, and features_list so anyone can
### check your results. You do not need to change anything below, but make sure
### that the version of poi_id.py that you submit can be run on its own and
### generates the necessary .pkl files for validating your results.
dump_classifier_and_data(clf, my_dataset, features_list)
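# Optional follow-up check (a sketch, assuming tester.py exposes the
# test_classifier(clf, dataset, feature_list) helper that the Task 5 comments point
# to): re-run the project's stratified-shuffle-split evaluation on the exported
# classifier before submitting. Left commented out because it can be slow.
# from tester import test_classifier
# test_classifier(clf, my_dataset, features_list)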