babs_datacheck.py 2.7 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556
  1. import numpy as np
  2. import pandas as pd
  3. from babs_visualizations import usage_stats
  4. def question_3(data):
  5. """
  6. This function will check that the sample data has been wrangled properly.
  7. """
  8. n_correct = 0
  9. # Check that there are a correct number of lines in the dataset.
  10. if data.shape[0] != 27345:
  11. print("Expected 27,345 data points, found only {:d}.".format(data.shape[0]))
  12. else:
  13. n_correct += 1
  14. # Check that the durations have been converted into terms of minutes.
  15. data_duration_stats = usage_stats(data, verbose = False)
  16. expected_duration_stats = np.array([6.816667, 10.716667, 17.28333])
  17. if not np.allclose(data_duration_stats, expected_duration_stats):
  18. print("Duration statistics do not match expected units (minutes).")
  19. if np.allclose(data_duration_stats, np.array([409, 643, 1037])):
  20. print(" It looks like the units are still in terms of seconds.")
  21. elif np.allclose(data_duration_stats, np.array([24520, 38580, 62220])):
  22. print(" It looks like you might have used the wrong operator in your conversion.")
  23. print(" Remember that there are 60 seconds in each minute.")
  24. else:
  25. n_correct += 1
  26. # Check that the timestamps have been wrangled properly.
  27. expected_time_vals = {'start_month': [25243, 2102],
  28. 'start_hour': [2851, 2291, 2219, 2171, 2131, 1976,
  29. 1833, 1799, 1791, 1644, 1359, 1269,
  30. 1071, 797, 644, 440, 394, 276,
  31. 153, 65, 55, 45, 42, 29],
  32. 'weekday': [4712, 4493, 4370, 3860, 3637, 3138, 3135]}
  33. for column in expected_time_vals.keys():
  34. col_data = data[column].value_counts().values
  35. n_values = len(col_data)
  36. n_values_expected = len(expected_time_vals[column])
  37. if not n_values == n_values_expected:
  38. print("Wrong number of unique values found for column: {}".format(column))
  39. print(" {:d} unique values expected; {:d} values found.".format(n_values_expected, n_values))
  40. elif not np.array_equal(col_data, expected_time_vals[column]):
  41. expected_max = expected_time_vals[column][0]
  42. expected_min = expected_time_vals[column][-1]
  43. print("Unexpected count of values for column: {}".format(column))
  44. print(" Most common value expected {:d} data points; {:d} trips found.".format(expected_max, col_data[0]))
  45. print(" Least common value expected {:d} data points; {:d} trips found.".format(expected_min, col_data[-1]))
  46. else:
  47. n_correct += 1
  48. if n_correct == len(expected_time_vals.keys()) + 2:
  49. print("All counts are as expected!")