feature_format.py 4.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125
  1. """
  2. A general tool for converting data from the
  3. dictionary format to an (n x k) numpy array
  4. that's ready for training an sklearn algorithm.
  5. n: no. of key-value pairs in dictionary
  6. k: no. of features being extracted
  7. dictionary keys are names of persons in dataset
  8. dictionary values are dictionaries, where each
  9. key-value pair in the dict is the name
  10. of a feature, and its value for that person.
  11. In addition to converting a dictionary to a numpy
  12. array, you may want to separate the labels from the
  13. features - this is what targetFeatureSplit is for.
  14. So, if you want to have the poi label as the target,
  15. and the features you want to use are the person's
  16. salary and bonus, here's what you would do:
  17. feature_list = ["poi", "salary", "bonus"]
  18. data_array = featureFormat(data_dictionary, feature_list)
  19. label, features = targetFeatureSplit(data_array)
  20. The line above (targetFeatureSplit) assumes that the
  21. label is the first item in feature_list.
  22. """
  23. import numpy as np
  24. def featureFormat( dictionary, features, remove_NaN=True,
  25. remove_all_zeroes=True, remove_any_zeroes=False,
  26. sort_keys = False):
  27. """ Convert dictionary to numpy array of features.
  28. remove_NaN = True will convert "NaN" string to 0.0
  29. remove_all_zeroes = True will omit any data points for which
  30. all the features you seek are 0.0
  31. remove_any_zeroes = True will omit any data points for which
  32. any of the features you seek are 0.0
  33. sort_keys = True sorts keys by alphabetical order. Setting the value as
  34. a string opens the corresponding pickle file with a preset key
  35. order (this is used for Python 3 compatibility, and sort_keys
  36. should be left as False for the course mini-projects).
  37. NOTE: first feature is assumed to be 'poi' and is not checked for
  38. removal for zero or missing values.
  39. """
  40. return_list = []
  41. # Key order - first branch is for Python 3 compatibility on mini-projects,
  42. # second branch is for compatibility on final project.
  43. if isinstance(sort_keys, str):
  44. import pickle
  45. keys = pickle.load(open(sort_keys, "rb"))
  46. elif sort_keys:
  47. keys = sorted(dictionary.keys())
  48. else:
  49. keys = dictionary.keys()
  50. for key in keys:
  51. tmp_list = []
  52. for feature in features:
  53. try:
  54. dictionary[key][feature]
  55. except KeyError:
  56. print("error: key ", feature, " not present")
  57. return
  58. value = dictionary[key][feature]
  59. if value=="NaN" and remove_NaN:
  60. value = 0
  61. tmp_list.append( float(value) )
  62. # Logic for deciding whether or not to add the data point.
  63. append = True
  64. # Exclude 'poi' class as criteria.
  65. if features[0] == 'poi':
  66. test_list = tmp_list[1:]
  67. else:
  68. test_list = tmp_list
  69. ### If all features are zero and you want to remove
  70. ### data points that are all zero, do that here
  71. if remove_all_zeroes:
  72. append = False
  73. for item in test_list:
  74. if item != 0 and item != "NaN":
  75. append = True
  76. break
  77. ### If any features for a given data point are zero
  78. ### and you want to remove data points with any zeroes,
  79. ### handle that here.
  80. if remove_any_zeroes:
  81. if 0 in test_list or "NaN" in test_list:
  82. append = False
  83. ### Append the data point if flagged for addition.
  84. if append:
  85. return_list.append( np.array(tmp_list) )
  86. return np.array(return_list)
  87. def targetFeatureSplit( data ):
  88. """
  89. Given a numpy array like the one returned from
  90. featureFormat, separate out the first feature
  91. and put it into its own list (this should be the
  92. quantity you want to predict)
  93. return targets and features as separate lists
  94. (sklearn can generally handle both lists and numpy arrays as
  95. input formats when training/predicting)
  96. """
  97. target = []
  98. features = []
  99. for item in data:
  100. target.append( item[0] )
  101. features.append( item[1:] )
  102. return target, features