# Identify Enron persons of interest (POI): feature engineering, a
# SelectKBest + GaussianNB pipeline, and GridSearchCV tuning.
import warnings
warnings.filterwarnings("ignore")

import pickle
from time import time

import pandas as pd
import numpy as np

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import GridSearchCV  # sklearn.grid_search was removed in scikit-learn 0.20

from feature_format import featureFormat, targetFeatureSplit
from tester import dump_classifier_and_data
# The first entry must be 'poi' -- featureFormat/targetFeatureSplit treat it as the label.
features_list = ['poi', 'salary', 'bonus', 'long_term_incentive',
                 'bonus-to-salary_ratio', 'deferral_payments', 'expenses',
                 'restricted_stock_deferred', 'restricted_stock',
                 'deferred_income', 'fraction_mail_from_poi',
                 'total_payments', 'other', 'fraction_mail_to_poi',
                 'from_poi_to_this_person', 'from_this_person_to_poi',
                 'to_messages', 'from_messages', 'shared_receipt_with_poi',
                 'loan_advances', 'director_fees',
                 'exercised_stock_options', 'total_stock_value']
# Load the dataset and move it into a DataFrame indexed by employee name.
with open("final_project_dataset.pkl", "rb") as data_file:
    data_dict = pickle.load(data_file)

enron_df = pd.DataFrame.from_records(list(data_dict.values()))
employees = pd.Series(list(data_dict.keys()))
enron_df.set_index(employees, inplace=True)

# Coerce the dataset's 'NaN' strings to numeric NaN, zero-fill, and drop the
# non-numeric email column.
enron_df_new = enron_df.apply(lambda x: pd.to_numeric(x, errors='coerce')).copy().fillna(0)
enron_df_new.drop('email_address', axis=1, inplace=True)

# Remove the spreadsheet 'TOTAL' row, a non-person entry, and other outliers.
enron_df_new.drop(['TOTAL', 'THE TRAVEL AGENCY IN THE PARK', 'FREVERT MARK A',
                   'MARTIN AMANDA K', 'BHATNAGAR SANJAY'], axis=0, inplace=True)
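# Optional sanity check (not in the original script): after the numeric coercion
# above, the boolean 'poi' column is 0/1, so its sum counts the labelled POIs.
print("Employees remaining:", len(enron_df_new))
print("POIs remaining:", int(enron_df_new['poi'].sum()))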
# Engineered features: pay ratio and the share of email traffic involving POIs.
enron_df_new['bonus-to-salary_ratio'] = enron_df_new['bonus'] / enron_df_new['salary']
enron_df_new['fraction_mail_from_poi'] = enron_df_new['from_poi_to_this_person'] / enron_df_new['from_messages']
enron_df_new['fraction_mail_to_poi'] = enron_df_new['from_this_person_to_poi'] / enron_df_new['to_messages']

# Division by zero produces float inf/NaN, not the string 'inf', so replace the
# infinities explicitly and zero-fill the remaining NaNs.
enron_df_new = enron_df_new.replace([np.inf, -np.inf], 0)
enron_df_new = enron_df_new.fillna(0)
# Back to the dict-of-dicts format expected by featureFormat and tester.py.
enron_dict = enron_df_new.to_dict('index')
my_dataset = enron_dict

# featureFormat returns a NumPy array with the 'poi' label in column 0;
# targetFeatureSplit peels that column off into `labels`.
data = featureFormat(my_dataset, features_list, sort_keys=True)
labels, features = targetFeatureSplit(data)
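# Quick shape check (optional): every row should carry len(features_list) - 1
# feature values, since the 'poi' label has been split off.
print("rows:", len(features), "features per row:", len(features[0]))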
# sklearn.cross_validation was removed in scikit-learn 0.20; use model_selection.
from sklearn.model_selection import train_test_split

features_train, features_test, labels_train, labels_test = train_test_split(
    features, labels, test_size=0.3, random_state=42)
# StratifiedShuffleSplit keeps the POI/non-POI ratio in every fold, which matters
# because POIs are a small minority of the dataset; 1000 splits is slow but gives
# a stable estimate for the grid search below.
from sklearn.model_selection import StratifiedShuffleSplit
sss = StratifiedShuffleSplit(n_splits=1000, test_size=0.3, random_state=42)
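# Illustration (not part of the original flow): each call to sss.split() yields a
# (train_indices, test_indices) pair; the grid search below consumes these folds.
example_train_idx, example_test_idx = next(sss.split(features, labels))
print("fold sizes:", len(example_train_idx), len(example_test_idx))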
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.pipeline import Pipeline

# MinMaxScaler is instantiated but not used in the final pipeline: GaussianNB's
# predictions are unchanged by per-feature affine scaling, so scaling is
# unnecessary here.
scaler = MinMaxScaler()
skb = SelectKBest(f_classif)
# Pipeline: univariate feature selection (ANOVA F-test) feeding Gaussian Naive Bayes.
from sklearn.naive_bayes import GaussianNB
clf_gnb = GaussianNB()
pipeline = Pipeline(steps=[("SKB", skb), ("NaiveBayes", clf_gnb)])

# Search over how many features to keep; f1 balances precision and recall, which
# matters more than raw accuracy on this imbalanced dataset.
param_grid = {"SKB__k": list(range(3, 20))}
grid = GridSearchCV(pipeline, param_grid, verbose=0, cv=sss, scoring='f1')
# Fit the grid search on the full feature matrix; cv=sss supplies the folds.
t0 = time()
grid.fit(features, labels)
print("Training time: ", round(time() - t0, 3), "s")

clf = grid.best_estimator_
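# Optional report (assumes the "SKB" step name from the pipeline above): show
# the winning k and which features the best model kept. The +1 skips the 'poi'
# label at position 0 of features_list.
skb_step = clf.named_steps["SKB"]
selected = [features_list[i + 1] for i in skb_step.get_support(indices=True)]
print("Best k:", grid.best_params_["SKB__k"])
print("Selected features:", selected)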
# Evaluate the tuned pipeline on the held-out 30% split.
t0 = time()
clf.fit(features_train, labels_train)
prediction = clf.predict(features_test)
print("Testing time: ", round(time() - t0, 3), "s")

# scikit-learn metric signatures are (y_true, y_pred), in that order.
print("Accuracy of GaussianNB classifier is : ", accuracy_score(labels_test, prediction))
print("Precision of GaussianNB classifier is : ", precision_score(labels_test, prediction))
print("Recall of GaussianNB classifier is : ", recall_score(labels_test, prediction))
print("f1-score of GaussianNB classifier is : ", f1_score(labels_test, prediction))
# Export the classifier, dataset, and feature list for validation by tester.py.
dump_classifier_and_data(clf, my_dataset, features_list)