#!/usr/bin/env python
# coding: utf-8
import json
from pprint import pprint
from time import sleep
import pandas as pd
import requests
from bs4 import BeautifulSoup
from pymongo import MongoClient
from pymongo.collection import Collection
from pymongo.mongo_client import database
from tqdm import tnrange, tqdm
def get_suffix_mapping():
    """Scrape the USPS street-suffix table and return Common/Standard records.

    Returns:
        list[dict]: one dict per table row with keys 'Common' (a variant
        suffix spelling, upper-case) and 'Standard' (the USPS standard
        abbreviation it should be normalized to).
    """
    # USPS rejects default library user agents, so present a browser UA.
    headers = {'User-agent': 'Mozilla/5.0'}
    r = requests.get(r"https://pe.usps.com/text/pub28/28apc_002.htm", headers=headers)
    r.raise_for_status()  # fail loudly rather than parsing an error page
    soup = BeautifulSoup(r.content, 'lxml')
    postal_table = soup.find('table', {'id': 'ep533076'})
    df = pd.read_html(str(postal_table), header=0)[0]
    df.columns = ['Primary', 'Common', 'Standard']
    # keep only the Common/Standard columns; Primary is redundant here
    keys = df.iloc[:, 1:3]
    return keys.to_dict(orient='records')
def tqdm_ipython_test():
    """Smoke-test that tqdm progress bars render inside Jupyter Lab.

    Runs two nested progress bars; purely visual, returns nothing.
    """
    # NOTE: tnrange is deprecated in newer tqdm releases; kept here because
    # the module-level import still provides it.
    for _ in tnrange(3, desc='1st loop'):
        for _ in tqdm(range(100), desc='2nd loop'):
            sleep(0.01)
def read_osm_file(filename: str) -> str:
    """Read an OSM XML file and return its full contents as a string.

    Args:
        filename: path to the .osm file to load.

    Returns:
        The entire file decoded as UTF-8.
    """
    with open(filename, "r", encoding='UTF-8') as f:
        return f.read()
def get_soup(file, tags):
    """Collect every XML element for each requested tag name.

    Args:
        file: the raw OSM XML document as a string.
        tags: iterable of element names to extract (e.g. ['node', 'way']).

    Returns:
        list[dict]: one {tag_name: ResultSet} mapping per requested tag.
    """
    soup = BeautifulSoup(file, 'xml')
    return [{tag: soup.find_all(tag)} for tag in tqdm(tags)]
def get_dict_data(result_set_item):
    """Flatten a {tag_name: ResultSet} item view into a list of plain dicts.

    Each matched element yields one dict carrying:
      - 'type': the primary tag name ('node' or 'way'),
      - one entry per XML attribute of the element,
      - one entry per child ``<tag k=... v=...>`` element.

    Args:
        result_set_item: an ``items()``-style iterable of
            (tag_name, result_set) pairs.

    Returns:
        list[dict]: one dict per element across all result sets.
    """
    list_of_dicts = []
    # distinct names here: the original reused ``k``/``v`` for both the
    # outer pairs and the inner attrs loop, shadowing the tag name
    for primary_tag, result_set in result_set_item:
        for entry in result_set:
            entry_data_dict = {'type': primary_tag}
            # copy the element's XML attributes (id, lat, lon, user, ...)
            for attr_name, attr_value in entry.attrs.items():
                entry_data_dict[attr_name] = attr_value
            # child <tag> elements hold the free-form OSM key/value pairs
            for tag in entry.find_all('tag'):
                entry_data_dict[tag['k']] = tag['v']
            list_of_dicts.append(entry_data_dict)
    return list_of_dicts
def json_to_mongo(col: "Collection", json_file: str = "rochester_osm.json"):
    """Bulk-load a JSON export into a MongoDB collection.

    The JSON file is expected to hold a list of lists of documents (one
    inner list per OSM tag type, as produced by ``main()``); each inner
    list is inserted with a single ``insert_many`` call.

    Args:
        col: target pymongo collection.
        json_file: path of the JSON file to load.
    """
    # NOTE: the original annotation ``database.Collection`` raised at def
    # time (pymongo.database defines Database, not Collection); the correct
    # Collection type is already imported at module level.
    with open(json_file) as f:
        data = json.load(f)
    for node_dicts in data:
        col.insert_many(node_dicts)
def get_col(db_name="udacity", collection="rochester_osm") -> "Collection":
    """Create and return a MongoDB database/collection connection object.

    Args:
        db_name: database name to use (created lazily if absent).
        collection: collection name to use (created lazily if absent).

    Returns:
        pymongo Collection bound to localhost:27017.
    """
    # MongoClient is already imported at module level; the connection is
    # lazy, so no server needs to be running until the first query.
    client = MongoClient('localhost:27017')
    db = client[db_name]
    return db[collection]
def main():
    """Run the full pipeline on the sample extract: parse the OSM XML, load
    it into MongoDB, clean street suffixes and postcodes, then run the
    exploratory queries.

    Currently set to use the sample extract; for the full implementation see
    the Jupyter notebook version of this file.
    """
    pprint("Processing dataset.. ")
    file_name = r"rochester_sample.osm"  # change name if using full extract
    base = file_name.split('.')[0]

    # --- Parse the XML and dump it to JSON --------------------------------
    osm_file = read_osm_file(filename=file_name)
    tag_list = ['node', 'way']
    result_set_list = get_soup(file=osm_file, tags=tag_list)
    osm_dicts = [get_dict_data(res.items()) for res in result_set_list]
    json_osm = json.dumps(osm_dicts)
    with open(f"{base}.json", 'w') as f:
        f.write(json_osm)

    # Load the JSON file into MongoDB via the companion module.
    from importlib import import_module
    j2m = import_module('json_to_mongo')
    j2m.main(json_file=f"{base}.json")

    # setup connection for data exploration and cleaning
    osm_col = get_col(collection=base)  # type: Collection

    # --- Query total document count ---------------------------------------
    total_docs = osm_col.count_documents({})
    pprint(total_docs)

    # --- Get count of each key in collection ------------------------------
    key_counts_dict = dict()
    for entry in tqdm(osm_col.find(), total=total_docs):
        for k in entry.keys():
            key_counts_dict.setdefault(k, 0)
            key_counts_dict[k] += 1
    # show the city-related keys
    for k, v in key_counts_dict.items():
        if ':city' in k:
            print(f"{k}, {v} ")
    # itemgetter used with sorted to allow sorting by key values
    from operator import itemgetter
    pprint(sorted(key_counts_dict.items(), key=itemgetter(1), reverse=True))

    # --- Get a list of fields that begin with address ---------------------
    address_fields = {k: v for (k, v) in key_counts_dict.items() if 'addr' in k}
    pprint(sorted(address_fields.items(), key=itemgetter(1), reverse=True))

    # --- Street-suffix cleaning -------------------------------------------
    distinct_streets = osm_col.distinct('addr:street')
    mapping_dict = get_suffix_mapping()
    # the street suffix is the last whitespace-separated token
    distinct_suffix = set(x.split()[-1] for x in distinct_streets)
    pprint(mapping_dict[0:5])

    # Preview which suffixes would be standardized.
    for suffix in distinct_suffix:
        for mapping in mapping_dict:
            if mapping['Common'] == suffix.upper():
                print(f"Changing {suffix} to {mapping['Standard']}")
                break  # first matching mapping wins

    modified_count = 0
    for entry in tqdm(distinct_streets):
        # BUG FIX: the original took split()[0] (the *first* word); the
        # suffix being standardized is the last word, as everywhere above.
        suffix = entry.split()[-1]
        for mapping in mapping_dict:
            if mapping['Common'] == suffix.upper():
                # NOTE(review): str.replace rewrites every occurrence of the
                # suffix text in the street name, not just the trailing one.
                result = osm_col.update_many(
                    {'addr:street': entry},
                    {"$set": {'addr:street': entry.replace(suffix, mapping['Standard'])}})
                modified_count += result.modified_count
                break  # first matching mapping wins
    print(f"{modified_count} address suffixes updated")

    # Get a list of distinct street types (alphabetic only, then all)
    pprint(set(x.split()[-1] for x in distinct_streets if x.split()[-1].isalpha()))
    pprint(set(x.split()[-1] for x in distinct_streets))

    # --- find all address codes in collection -----------------------------
    unique_zip_codes = osm_col.distinct('addr:postcode')
    pprint(unique_zip_codes)

    update_dict = {'modified': 0,
                   'deleted': 0,
                   'good': 0}
    # renamed from ``zip`` to avoid shadowing the builtin
    for zip_code in tqdm(unique_zip_codes):
        if zip_code[0:5].isdigit() and len(zip_code) > 5:
            # ZIP+4 style: keep only the leading five digits
            result = osm_col.update_many({'addr:postcode': zip_code},
                                         {"$set": {'addr:postcode': zip_code[0:5]}})
            update_dict['modified'] += result.modified_count
        elif not zip_code.isdigit() and len(zip_code) != 5:
            # malformed postcode: drop the documents entirely
            result = osm_col.delete_many({'addr:postcode': zip_code})
            update_dict['deleted'] += result.deleted_count
        elif zip_code.isdigit() and len(zip_code) == 5:
            update_dict['good'] += 1
        # NOTE(review): a 5-char non-digit code, or an all-digit code shorter
        # than 5, falls through uncounted — confirm that is intended.
    pprint(update_dict)

    # Show the postcodes remaining after the clean-up.
    updated_address_code_list = list(
        osm_col.find({'addr:postcode': {'$exists': True}},
                     {'addr:postcode': 1, '_id': 0}))
    pprint(set(x['addr:postcode'] for x in updated_address_code_list))
    # [Rochester Zip codes](https://www.zip-codes.com/city/ny-rochester.asp)
    # > After running our function we can see that all the unique zip codes
    # > in the database are valid Rochester Zip codes
    pprint(list(osm_col.aggregate([
        {'$group': {'_id': '$addr:postcode', 'count': {'$sum': 1}}},
        {'$sort': {'count': -1}},
    ]))[:10])

    # --- User Counts ------------------------------------------------------
    def get_single_users(col: Collection):
        """Report how many users contributed exactly one document."""
        user_counts_dict = list(col.aggregate([
            {'$sortByCount': '$user'},
            {'$sort': {'count': 1}},
        ]))
        single_doc_user = list()
        for entry in user_counts_dict:
            if entry['count'] == 1:
                single_doc_user.append(entry['_id'])
            else:
                break  # counts are sorted ascending, so we can stop early
        pprint(single_doc_user[0:5])
        pprint(f"{len(single_doc_user)} users with only one post out of {len(user_counts_dict)}")
        # return single_doc_user
    get_single_users(osm_col)

    user_df = pd.DataFrame.from_dict(list(osm_col.aggregate([
        {'$sortByCount': '$user'},
    ])))
    user_df['percent'] = user_df['count'] / user_df['count'].sum()
    # Percent of entries that came from top two users
    print(user_df[0:2]['percent'].sum() * 100)
    # Combined top 10 users contribution
    print(user_df[0:10]['percent'].sum() * 100)
    # Combined percent of users who individually contribute less than 1%
    # of the entries in the database
    print(user_df[user_df.percent <= .01].percent.sum() * 100)
    print(user_df._id)

    def top_ten_amenities(col: Collection):
        """Return amenity counts for 'way' documents, most frequent first."""
        top_amenities = list(col.aggregate([
            {'$match': {'type': 'way'}},
            {'$sortByCount': '$amenity'},
        ]))
        return top_amenities
    pprint(top_ten_amenities(col=osm_col)[0:10])

    df = pd.DataFrame.from_dict(top_ten_amenities(osm_col))
    pprint(df[0:10])
    df['percent'] = df['count'] / df['count'].sum()
    # row 0 is the null-amenity bucket, so describe the rest
    print(df[1:].describe())
    print(df.shape[0])

    # Biggest Religion
    religion = list(osm_col.aggregate([
        {'$match': {'amenity': {'$eq': 'place_of_worship'}}},
        {'$group': {'_id': '$religion', 'count': {'$sum': 1}}},
        {'$sort': {'count': -1}},
    ]))
    pprint(religion)

    # Most popular cuisine in restaurants
    cuisine = list(osm_col.aggregate([
        {'$match': {'amenity': {'$eq': 'restaurant'}}},
        {'$group': {'_id': '$cuisine', 'count': {'$sum': 1}}},
        {'$sort': {'count': -1}},
    ]))
    pprint(cuisine[0:10])

    # City counts
    city_counts = list(osm_col.aggregate([
        {'$group': {'_id': '$addr:city', 'count': {'$sum': 1}}},
        {'$sort': {'count': -1}},
    ]))
    print('\n'.join('{_id!s:<20}{count}'.format(**x) for x in city_counts))
# Run the full pipeline only when executed as a script, not on import.
if __name__ == '__main__':
    main()
# ### Initial MongoDB collection creation
# - Insert all records from json file
# - Create compound unique index on 'id' and 'type' fields
# read the json file we just wrote to verify it loads correctly