shaping_csv.py 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302
  1. """This code is part of the final project.
  2. After auditing is complete the next step is to prepare the data to be inserted into a SQL database.
  3. To do so, I will parse the elements in the OSM XML file, transforming them from document format to
  4. tabular format, thus making it possible to write to .csv files. These csv files can then easily be
  5. imported to a SQL database as tables.
  6. """
  7. # Initial Imports
  8. import csv
  9. import codecs
  10. import pprint
  11. import re
  12. import xml.etree.cElementTree as ET
  13. from audit import * #Imports all the functions from audit.py file
  14. import cerberus
  15. import schema
  16. # The directory where the OSM file is located.
  17. OSM_PATH = 'san-francisco_california.osm'
  18. # The directory where the created CSV files will be located
  19. NODES_PATH = "nodes.csv"
  20. NODE_TAGS_PATH = "nodes_tags.csv"
  21. WAYS_PATH = "ways.csv"
  22. WAY_NODES_PATH = "ways_nodes.csv"
  23. WAY_TAGS_PATH = "ways_tags.csv"
  24. # The SQL schema that is defined in schema.py file.
  25. # Both files need to be in the same directory.
  26. SCHEMA = schema.schema
  27. # Regular expression pattern to find problematic characters in value attributes.
  28. problem_chars = re.compile(r'[=\+/&<>;\'"\?%#$@\,\. \t\r\n]')
  29. #Regular expression pattern to find different types of streets in street names.
  30. street_type_re = re.compile(r'\b\S+\.?$', re.IGNORECASE)
  31. # The list of street types that we want to have.
  32. expected = ["Street", "Avenue", "Boulevard", "Drive", "Court", "Place", "Square",
  33. "Lane", "Road", "Trail", "Parkway", "Commons"]
  34. # The list of dictionaries, containing street types
  35. # that need to be changed to match the 'expected' list.
  36. mapping = { "St": "Street", "St.": "Street", "street": "Street",
  37. "Ave": "Avenue", "Ave.": "Avenue", "AVE": "Avenue,",
  38. "avenue": "Avenue", "Rd.": "Road", "Rd": "Road", "road": "Road",
  39. "Blvd": "Boulevard", "Blvd.": "Boulevard", "Blvd,": "Boulevard",
  40. "boulevard": "Boulevard", "broadway": "Broadway",
  41. "square": "Square", "square": "Square", "Sq": "Square",
  42. "way": "Way",
  43. "Dr.": "Drive", "Dr": "Drive",
  44. "ct": "Court", "Ct": "Court", "court": "Court",
  45. "cres": "Crescent", "Cres": "Crescent", "Ctr": "Center",
  46. "Hwy": "Highway", "hwy": "Highway",
  47. "Ln": "Lane", "Ln.": "Lane",
  48. "parkway": "Parkway" }
  49. # The columns in the CSV files. The same columns need to be created for the database.
  50. NODE_FIELDS = ['id', 'lat', 'lon', 'user', 'uid', 'version', 'changeset', 'timestamp']
  51. NODE_TAGS_FIELDS = ['id', 'key', 'value', 'type']
  52. WAY_FIELDS = ['id', 'user', 'uid', 'version', 'changeset', 'timestamp']
  53. WAY_TAGS_FIELDS = ['id', 'key', 'value', 'type']
  54. WAY_NODES_FIELDS = ['id', 'node_id', 'position']
  55. def shape_element(element):
  56. """ A function to Shape each element into several data structures.
  57. Args:
  58. - param element: 'Node' and'way' tags that are passed to this function from
  59. the get_element function; mainly called by process_map function.
  60. The function goes through node and way elements, defining the values for
  61. nodes, nodes_ways, ways, ways_tags, and ways_nodes dictionaties.
  62. The "node" field holds a dictionary of the following top level node
  63. attributes: id, user, uid, version, lat, lon, timestamp, changeset.
  64. The "ways" fields hold a dictionary of the following top level node
  65. attributes: id, users, uid, versiob, timestamp, changeset.
  66. The "node_tags" and "way_tags"field holds a list of dictionaries, one per
  67. secondary tag. Secondary tags are child tags of node which have the tag
  68. name/type: "tag". Each dictionary has the following fields from the secondary
  69. tag attributes:
  70. - id: the top level node id attribute value
  71. - key: the full tag "k" attribute value if no colon is present or the
  72. characters after the colon if one is.
  73. - value: the tag "v" attribute value
  74. - type: either the characters before the colon in the tag "k" value
  75. or "regular" if a colon is not present.
  76. For the value field, I call updates_name and update_postcode functions to
  77. clean problematic street names or postcodes. I call these functions on both
  78. node and way elements.
  79. Return:
  80. The following dictionaries will be returned:
  81. - node
  82. - node_tags
  83. - way
  84. - way_nodes
  85. - way_tags
  86. """
  87. node_attribs = {} # Handle the attributes in node element
  88. way_attribs = {} # Handle the attributes in way element
  89. way_nodes = [] # Handle the 'nd' tag in the way element
  90. tags = [] # Handle secondary tags the same way for both node and way elements
  91. # Handling node elements.
  92. if element.tag == 'node':
  93. for item in NODE_FIELDS:
  94. # If the 'uid' field was empty "9999999" is set as 'uid'.
  95. try:
  96. node_attribs[item] = element.attrib[item]
  97. except:
  98. node_attribs[item] = "9999999"
  99. # Iterating through the 'tag' tags in the node element.
  100. for tg in element.iter('tag'):
  101. #Ignoring values that contain problematic characters.
  102. if not problem_chars.search(tg.attrib['k']):
  103. tag_dict_node = {}
  104. tag_dict_node['id'] = element.attrib['id']
  105. # Calling the update_name function to clean up problematic
  106. # street names based on audit.py file.
  107. if is_street_name(tg):
  108. better_name = update_name(tg.attrib['v'], mapping)
  109. tag_dict_node['value'] = better_name
  110. # Calling the update_postcode function to clean up problematic
  111. # postcodes based on audit.py file.
  112. elif get_postcode(tg):
  113. better_postcode = update_postcode(tg.attrib['v'])
  114. tag_dict_node['value'] = better_postcode
  115. # For other values that are not street names or postcodes.
  116. else:
  117. tag_dict_node['value'] = tg.attrib['v']
  118. if ':' not in tg.attrib['k']:
  119. tag_dict_node['key'] = tg.attrib['k']
  120. tag_dict_node['type'] = 'regular'
  121. # Dividing words before and after a colon ':'
  122. else:
  123. character_before_colon = re.findall('^[a-zA-Z]*:', tg.attrib['k'])
  124. character_after_colon = re.findall(':[a-zA-Z_]+' , tg.attrib['k'])
  125. if len(character_after_colon) != 0: #If the key was an empty field
  126. tag_dict_node['key'] = character_after_colon[0][1:]
  127. else:
  128. tag_dict_node['key'] = 'regular'
  129. if len(character_before_colon) != 0: #If the type was an empty field
  130. tag_dict_node['type'] = character_before_colon[0][: -1]
  131. else:
  132. tag_dict_node['type'] = 'regular'
  133. tags.append(tag_dict_node)
  134. return {'node': node_attribs, 'node_tags': tags}
  135. # Handling way elements.
  136. elif element.tag == 'way':
  137. for item in WAY_FIELDS:
  138. # If the 'uid' field was empty "9999999" is set as 'uid'.
  139. try:
  140. way_attribs[item] = element.attrib[item]
  141. except:
  142. way_attribs[item] = "9999999"
  143. # Iterating through 'tag' tags in way element.
  144. for tg in element.iter('tag'):
  145. if not problem_chars.search(tg.attrib['k']):
  146. tag_dict_way = {}
  147. tag_dict_way['id'] = element.attrib['id']
  148. # Calling the update_name function to clean up problematic
  149. # street names based on audit.py file.
  150. if is_street_name(tg):
  151. better_name_way = update_name(tg.attrib['v'], mapping)
  152. tag_dict_way['value'] = better_name_way
  153. # Calling the update_postcode function to clean up problematic
  154. # postcodes based on audit.py file.
  155. if get_postcode(tg):
  156. better_postcode_way = update_postcode(tg.attrib['v'])
  157. tag_dict_way['value'] = better_postcode_way
  158. # For other values that are not street names or postcodes.
  159. else:
  160. tag_dict_way['value'] = tg.attrib['v']
  161. if ':' not in tg.attrib['k']:
  162. tag_dict_way['key'] = tg.attrib['k']
  163. tag_dict_way['type'] = 'regular'
  164. #Dividing words before and after a colon ':'
  165. else:
  166. character_before_colon = re.findall('^[a-zA-Z]*:', tg.attrib['k'])
  167. character_after_colon = re.findall(':[a-zA-Z_]+', tg.attrib['k'])
  168. if len(character_after_colon) == 1:
  169. tag_dict_way['key'] = character_after_colon[0][1:]
  170. if len(character_after_colon) > 1:
  171. tag_dict_way['key'] = character_after_colon[0][1: ] + character_after_colon[1]
  172. if len(character_before_colon) != 0: #If the type was an empty field
  173. tag_dict_way['type'] = character_before_colon[0][: -1]
  174. else:
  175. tag_dict_way['type'] = 'regular'
  176. tags.append(tag_dict_way)
  177. # Iterating through 'nd' tags in way element.
  178. count = 0
  179. for tg in element.iter('nd'):
  180. tag_dict_nd = {}
  181. tag_dict_nd['id'] = element.attrib['id']
  182. tag_dict_nd['node_id'] = tg.attrib['ref']
  183. tag_dict_nd['position'] = count
  184. count += 1
  185. way_nodes.append(tag_dict_nd)
  186. return {'way': way_attribs, 'way_nodes': way_nodes, 'way_tags': tags}
  187. # Helper functions.
  188. def get_element(osm_file, tags=('node', 'way', 'relation')):
  189. """Yield element if it is the right type of tag"""
  190. context = ET.iterparse(osm_file, events=('start', 'end'))
  191. _, root = next(context)
  192. for event, elem in context:
  193. if event == 'end' and elem.tag in tags:
  194. yield elem
  195. root.clear()
  196. # Validating that during creation of CSV files the fields are all in accordance with
  197. # the columns that should be in the CSV files.
  198. def validate_element(element, validator, schema=SCHEMA):
  199. """Raise ValidationError if element does not match schema"""
  200. if validator.validate(element, schema) is not True:
  201. field, errors = next(validator.errors.items())
  202. message_string = "\nElement of type '{0}' has the following errors:\n{1}"
  203. error_string = pprint.pformat(errors)
  204. raise Exception(message_string.format(field, error_string))
  205. class UnicodeDictWriter(csv.DictWriter, object):
  206. """Extend csv.DictWriter to handle Unicode input"""
  207. def writerow(self, row):
  208. super(UnicodeDictWriter, self).writerow({
  209. k: (v.encode('utf-8') if isinstance(v, str) else v) for k, v in row.items()
  210. })
  211. def writerows(self, rows):
  212. for row in rows:
  213. self.writerow(row)
  214. # Creating CSV Files.
  215. def process_map(file_in, validate):
  216. """Iteratively process each XML element and write to csv(s)"""
  217. with codecs.open(NODES_PATH, 'w') as nodes_file, \
  218. codecs.open(NODE_TAGS_PATH, 'w') as nodes_tags_file, \
  219. codecs.open(WAYS_PATH, 'w') as ways_file, \
  220. codecs.open(WAY_NODES_PATH, 'w') as way_nodes_file, \
  221. codecs.open(WAY_TAGS_PATH, 'w') as way_tags_file:
  222. nodes_writer = UnicodeDictWriter(nodes_file, NODE_FIELDS)
  223. node_tags_writer = UnicodeDictWriter(nodes_tags_file, NODE_TAGS_FIELDS)
  224. ways_writer = UnicodeDictWriter(ways_file, WAY_FIELDS)
  225. way_nodes_writer = UnicodeDictWriter(way_nodes_file, WAY_NODES_FIELDS)
  226. way_tags_writer = UnicodeDictWriter(way_tags_file, WAY_TAGS_FIELDS)
  227. nodes_writer.writeheader()
  228. node_tags_writer.writeheader()
  229. ways_writer.writeheader()
  230. way_nodes_writer.writeheader()
  231. way_tags_writer.writeheader()
  232. validator = cerberus.Validator()
  233. count = 1
  234. for element in get_element(file_in, tags=('node', 'way')):
  235. # Setting a counter to show how many rows the code has processed.
  236. if count % 10000 == 0:
  237. print (count)
  238. count += 1
  239. el = shape_element(element)
  240. if el:
  241. if validate is True:
  242. validate_element(el, validator)
  243. if element.tag == 'node':
  244. nodes_writer.writerow(el['node'])
  245. node_tags_writer.writerows(el['node_tags'])
  246. elif element.tag == 'way':
  247. ways_writer.writerow(el['way'])
  248. way_nodes_writer.writerows(el['way_nodes'])
  249. way_tags_writer.writerows(el['way_tags'])
  250. if __name__ == '__main__':
  251. # Note: If the validation is set to True,
  252. # the process takes much longer than when it is set to False.
  253. process_map(OSM_PATH, validate=False)