# helper.py

import os
import pickle


def load_data(path):
    """
    Load dataset from file
    """
    input_file = os.path.join(path)
    with open(input_file, "r") as f:
        data = f.read()

    return data


def preprocess_and_save_data(dataset_path, token_lookup, create_lookup_tables):
    """
    Preprocess text data and pickle the results to preprocess.p
    """
    text = load_data(dataset_path)

    # Skip the notice at the top of the file, since we don't use it
    # for analysing the data
    text = text[81:]

    # Replace punctuation with tokens so it survives the whitespace split below
    token_dict = token_lookup()
    for key, token in token_dict.items():
        text = text.replace(key, ' {} '.format(token))

    text = text.lower()
    text = text.split()

    # Encode every word as an integer id and pickle everything needed
    # to decode it again later
    vocab_to_int, int_to_vocab = create_lookup_tables(text)
    int_text = [vocab_to_int[word] for word in text]
    with open('preprocess.p', 'wb') as f:
        pickle.dump((int_text, vocab_to_int, int_to_vocab, token_dict), f)
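
# --- Illustrative sketches (not part of the original module) ---------------
# preprocess_and_save_data expects two caller-supplied callables. The
# functions below are a minimal sketch of the assumed interfaces; the names
# and the token choices are hypothetical, not a fixed API.

def example_token_lookup():
    """
    Sketch of a token_lookup callable: map punctuation to whitespace-safe
    tokens so each punctuation mark becomes its own "word" after split().
    """
    return {
        '.': '||period||',
        ',': '||comma||',
        '"': '||quotation_mark||',
        '\n': '||return||',
    }


def example_create_lookup_tables(words):
    """
    Sketch of a create_lookup_tables callable: build word <-> id mappings
    over the unique words in the already-tokenized text.
    """
    vocab = sorted(set(words))
    vocab_to_int = {word: i for i, word in enumerate(vocab)}
    int_to_vocab = {i: word for word, i in vocab_to_int.items()}
    return vocab_to_int, int_to_vocab
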
def load_preprocess():
    """
    Load the preprocessed training data saved by preprocess_and_save_data
    """
    with open('preprocess.p', mode='rb') as f:
        return pickle.load(f)


def save_params(params):
    """
    Save parameters to file
    """
    with open('params.p', 'wb') as f:
        pickle.dump(params, f)


def load_params():
    """
    Load parameters from file
    """
    with open('params.p', mode='rb') as f:
        return pickle.load(f)
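
# --- Example usage (a sketch; the dataset path is hypothetical) -------------

if __name__ == "__main__":
    # Preprocess a raw text file and pickle the result to preprocess.p,
    # using the illustrative callables sketched above
    preprocess_and_save_data('data/script.txt',
                             example_token_lookup,
                             example_create_lookup_tables)

    # Round-trip the preprocessed data and some training parameters
    int_text, vocab_to_int, int_to_vocab, token_dict = load_preprocess()
    save_params({'seq_length': 16, 'batch_size': 128})
    params = load_params()
    print(len(int_text), len(vocab_to_int), params)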