audit.py 7.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223
  1. import xml.etree.cElementTree as ET
  2. from collections import defaultdict
  3. import re
  4. import pprint
  5. OSMFILE = "san-francisco_california.osm"
  6. """This regular expression matches letters without any white space with zero to one '.'
  7. Extract a string which might or might not have the '.' character in it."""
  8. street_type_re = re.compile(r'\b\S+\.?$', re.IGNORECASE)
  9. # List of expected street types.
  10. expected = ["Street", "Avenue", "Boulevard", "Drive", "Court", "Place", "Square",
  11. "Lane", "Road", "Trail", "Parkway", "Commons"]
  12. def audit_street_type(street_types, street_name):
  13. """This function will get the list of street types and using the regular expression,
  14. compare them to the expected list. If they do not match the names in the expected list,
  15. it adds it to the street_types dictionary.
  16. Args:
  17. - street_types: list of dictionaries containing different street types.
  18. The key in the dictionary is the type of street (e.g. avenue, street),
  19. and the values are names of streets (e.g. Park avenue, 5th street).
  20. - street_name: name of the street (i.e. tag.attrib['v']). This name is
  21. passed to this function from the audit_name function.
  22. """
  23. m = street_type_re.search(street_name)
  24. if m:
  25. street_type = m.group()
  26. if street_type not in expected:
  27. street_types[street_type].add(street_name)
  28. def is_street_name(elem):
  29. """This unction will get the elements in the file (i.e. the tag element) and
  30. return the attributes in that element for which their key is equal to 'addr:street'.
  31. """
  32. return (elem.attrib['k'] == "addr:street")
  33. def audit_street(osmfile):
  34. """This function uses iterative parsing to go through the XML file,
  35. parse node and way elements, and iterate through their tag element.
  36. It will then call the 'audit_street_type' function to add the value attribute
  37. of the tag (i.e. the street name) to it.
  38. Arg:
  39. - osmfile: reads the OpenStreetMap data
  40. Return:
  41. - returns the list of dictionaries containing list of street types with their
  42. corresponding street name.
  43. """
  44. osm_file = open(osmfile, "r")
  45. street_types = defaultdict(set)
  46. # Parses the XML file.
  47. for event, elem in ET.iterparse(osm_file, events=("start",)):
  48. # Iterate through the 'tag' element of node and way elements.
  49. if elem.tag == "node" or elem.tag == "way":
  50. for tag in elem.iter("tag"):
  51. if is_street_name(tag):
  52. audit_street_type(street_types, tag.attrib['v'])
  53. osm_file.close()
  54. return street_types
  55. street_types = audit_street(OSMFILE)
  56. pprint.pprint(dict(street_types))
  57. # The list of dictionaries, containing street types that need to be changed
  58. # to match the expected list.
  59. mapping = { "St": "Street",
  60. "St.": "Street",
  61. "street": "Street",
  62. "Ave": "Avenue",
  63. "Ave.": "Avenue",
  64. "AVE": "Avenue,",
  65. "avenue": "Avenue",
  66. "Rd.": "Road",
  67. "Rd": "Road",
  68. "road": "Road",
  69. "Blvd": "Boulevard",
  70. "Blvd.": "Boulevard",
  71. "Blvd,": "Boulevard",
  72. "boulevard": "Boulevard",
  73. "broadway": "Broadway",
  74. "square": "Square",
  75. "way": "Way",
  76. "Dr.": "Drive",
  77. "Dr": "Drive",
  78. "ct": "Court",
  79. "Ct": "Court",
  80. "court": "Court",
  81. "Sq": "Square",
  82. "square": "Square",
  83. "cres": "Crescent",
  84. "Cres": "Crescent",
  85. "Ctr": "Center",
  86. "Hwy": "Highway",
  87. "hwy": "Highway",
  88. "Ln": "Lane",
  89. "Ln.": "Lane",
  90. "parkway": "Parkway"
  91. }
  92. def update_name(name, mapping):
  93. """This function takes the street name and split it at the space character.
  94. In case, it finds a string that matches any key in the mapping, it replaces it with
  95. the format that has been specified for it.
  96. e.g. When the function finds 'Blvd' in "Newark Blvd", it goes through mapping and maps
  97. it to 'Boulevard', and the final street name will come out as 'Newark Boulevard'.
  98. Args:
  99. -name: The street name coming from tag.attrib['v'] attribute. This
  100. parameter is defined in shape_element function from shaping_csv.py file.
  101. -mapping: Is the list of mapping created while auditing the street names
  102. in audit_street_type function
  103. Return:
  104. - output: The list of corrected street names.
  105. Example 5th street is separated
  106. to '5th' and 'street', and each is compared to mapping. For 'street' the
  107. mapping expects it to change to 'Street'. Function changes it to 'Street'
  108. and adds '5th Street' to the output list.
  109. """
  110. output = list()
  111. parts = name.split(" ")
  112. for part in parts:
  113. if part in mapping:
  114. output.append(mapping[part])
  115. else:
  116. output.append(part)
  117. return " ".join(output)
  118. # Printing the changes made in street names.
  119. for st_type, ways in street_types.items():
  120. for name in ways:
  121. better_name = update_name(name, mapping)
  122. print(name, "→", better_name)
  123. OSMFILE = 'sample.osm'
  124. def dicti(data, item):
  125. """This function creates a dictionary postcodes can be held.
  126. The dictionary key will be the postcode itself and the dictionary value
  127. will be the number of times that postcode was repeated throughout the map."""
  128. data[item] += 1
  129. def get_postcode(elem):
  130. """This function takes the 'tag' element as an input and
  131. return the elements for which the keys are equal to 'addr:postcode'"""
  132. return (elem.attrib['k'] == "addr:postcode")
  133. def audit(osmfile):
  134. """This function parses the XML file and iterates through node and
  135. way elements. It extracts the value attribute (i.e. the postcode) and
  136. add it to the 'dicti' dictionary.
  137. Arg:
  138. - osmfile: reads the OpenStreetMap data
  139. Return:
  140. - data: a dictionary containing postcodes and the number of times they have been
  141. repeated throughout the data. (Example: {'94122', '94122', '94122', '94611'} will
  142. give dicti{['94122']=3, ['94611']=1}.
  143. """
  144. osm_file = open(osmfile, "r")
  145. data = defaultdict(int)
  146. # Parsing the XML file
  147. for event, elem in ET.iterparse(osm_file, events=("start",)):
  148. # Iterating through node and way elements.
  149. if elem.tag == "node" or elem.tag == "way":
  150. for tag in elem.iter("tag"):
  151. if get_postcode(tag):
  152. dicti(data, tag.attrib['v'])
  153. return data
  154. postcodes = audit(OSMFILE)
  155. pprint.pprint(dict(postcodes))
  156. def update_postcode(digit):
  157. """Makes use of different conditions in the function to match the
  158. postcodes in the 3 categories that can be found for postal codes.
  159. Arg:
  160. - digit: The postcode coming from tag.attrib['v'] attribute. This
  161. parameter is defined in shape_element function from shaping_csv.py file.
  162. Return:
  163. - Output: Return a list of corrected postcodes
  164. """
  165. output = list()
  166. first_category = re.compile(r'^\D*(\d{5}$)', re.IGNORECASE)
  167. second_category = re.compile('^(\d{5})-\d{4}$')
  168. third_category = re.compile('^\d{6}$')
  169. if re.search(first_category, digit):
  170. new_digit = re.search(first_category, digit).group(1)
  171. output.append(new_digit)
  172. elif re.search(second_category, digit):
  173. new_digit = re.search(second_category, digit).group(1)
  174. output.append(new_digit)
  175. elif re.search(third_category, digit):
  176. third_output = third_category.search(digit)
  177. new_digit = '00000'
  178. output.append('00000')
  179. # This condition matches the third category for any other types.
  180. elif digit == 'CA' or len(digit) < 5:
  181. new_digit = '00000'
  182. output.append(new_digit)
  183. return ', '.join(str(x) for x in output)
  184. for postcode, nums in postcodes.items():
  185. better_code = update_postcode(postcode)
  186. print(postcode, "→", better_code)