Source code for create_and_clean

#!/usr/bin/env python
# coding: utf-8

import json
from pprint import pprint
from time import sleep

import pandas as pd
import requests
from bs4 import BeautifulSoup
from pymongo import MongoClient
from pymongo.collection import Collection
from tqdm import tnrange, tqdm


def get_suffix_mapping():
    """Get a mapping for street suffixes by scraping the suffix table from USPS."""
    headers = {'User-agent': 'Mozilla/5.0'}
    r = requests.get("https://pe.usps.com/text/pub28/28apc_002.htm", headers=headers)
    soup = BeautifulSoup(r.content, 'lxml')
    postal_table = soup.find('table', {'id': 'ep533076'})
    df = pd.read_html(str(postal_table), header=0)[0]
    df.columns = ['Primary', 'Common', 'Standard']
    # keep only the Common/Standard columns as a list of records
    keys = df.iloc[:, 1:3]
    return keys.to_dict(orient='records')
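# Illustrative usage sketch (not part of the original script): each record
# pairs a "commonly used" suffix with the USPS standard abbreviation. The
# exact rows depend on the live USPS page, so the value shown is an example:
#
#     mapping = get_suffix_mapping()
#     mapping[0]  # e.g. {'Common': 'ALLEE', 'Standard': 'ALY'}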
def tqdm_ipython_test():
    """Test that tqdm works in Jupyter Lab."""
    for i in tnrange(3, desc='1st loop'):
        for j in tqdm(range(100), desc='2nd loop'):
            sleep(0.01)
def read_osm_file(filename: str):
    """Read in the OSM file to be processed."""
    with open(filename, "r", encoding='UTF-8') as f:
        msg = f.read()
    return msg
def get_soup(file, tags):
    """Generate a ResultSet for each of the given tags from the XML."""
    soup = BeautifulSoup(file, 'xml')
    return [{tag: soup.find_all(tag)} for tag in tqdm(tags)]
def get_dict_data(result_set_item):
    """Generate flat dictionaries from the ResultSets."""
    list_of_dicts = []
    for primary_tag, result_set in result_set_item:
        for entry in result_set:
            entry_data_dict = dict()
            entry_data_dict['type'] = primary_tag
            # copy the element's XML attributes (id, lat, lon, user, ...)
            for k, v in entry.attrs.items():
                entry_data_dict[k] = v
            # flatten child <tag k="..." v="..."/> elements into key/value pairs
            for tag in entry.find_all('tag'):
                entry_data_dict[tag['k']] = tag['v']
            list_of_dicts.append(entry_data_dict)
    return list_of_dicts
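# A minimal sketch (invented XML, not from the dataset) showing what get_soup
# and get_dict_data produce together for a single OSM node element.
def _demo_parse_minimal_osm():
    sample = ('<osm><node id="1" lat="43.16" lon="-77.61">'
              '<tag k="addr:street" v="Main Street"/></node></osm>')
    result_sets = get_soup(file=sample, tags=['node'])
    # expected result: [[{'type': 'node', 'id': '1', 'lat': '43.16',
    #                     'lon': '-77.61', 'addr:street': 'Main Street'}]]
    return [get_dict_data(rs.items()) for rs in result_sets]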
def json_to_mongo(col: Collection, json_file: str = "rochester_osm.json"):
    """Create a new MongoDB collection from a JSON file."""
    with open(json_file) as f:
        data = json.load(f)
    # the file holds one list of documents per primary tag ('node', 'way')
    for doc_list in data:
        col.insert_many(doc_list)
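# Hedged sketch: the notes at the bottom of this file mention a compound
# unique index on the 'id' and 'type' fields; one way to create it with
# pymongo (the helper name is ours, not from the original project):
def ensure_unique_index(col: Collection):
    """Create a compound unique index so re-running the import cannot
    insert duplicate (id, type) documents."""
    col.create_index([('id', 1), ('type', 1)], unique=True)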
def get_col(db_name="udacity", collection="rochester_osm"):
    """Create and return a MongoDB db/collection connection object."""
    client = MongoClient('localhost:27017')
    db = client[db_name]
    col = db[collection]
    return col
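# Optional connectivity check (illustrative; assumes a local mongod on the
# default port). The 'ping' command raises if the server is unreachable.
def check_connection():
    client = MongoClient('localhost:27017', serverSelectionTimeoutMS=2000)
    client.admin.command('ping')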
def main():
    """Currently set to use the sample extract; for the full implementation
    see the Jupyter notebook version of this file."""
    pprint("Processing dataset.. ")
    file_name = r"rochester_sample.osm"  # change name if using the full extract
    base = file_name.split('.')[0]
    osm_file = read_osm_file(filename=file_name)
    tag_list = ['node', 'way']
    result_set_list = get_soup(file=osm_file, tags=tag_list)
    osm_dicts = [get_dict_data(res.items()) for res in result_set_list]
    json_osm = json.dumps(osm_dicts)
    with open(f"{base}.json", 'w') as f:
        f.write(json_osm)

    from importlib import import_module
    j2m = import_module('json_to_mongo')
    j2m.main(json_file=f"{base}.json")

    # set up connection for data exploration and cleaning
    osm_col = get_col(collection=base)  # type: Collection

    # ### Query total document count
    total_docs = osm_col.count_documents({})
    pprint(total_docs)

    # ### Get count of each key in collection
    key_counts_dict = dict()
    for entry in tqdm(osm_col.find(), total=total_docs):
        for k in entry.keys():
            key_counts_dict.setdefault(k, 0)
            key_counts_dict[k] += 1

    for k, v in key_counts_dict.items():
        if ':city' in k:
            print(f"{k}, {v}")

    # itemgetter used with sorted to allow sorting by key values
    from operator import itemgetter
    pprint(sorted(key_counts_dict.items(), key=itemgetter(1), reverse=True))

    # ### Get a list of fields that begin with address
    address_fields = {k: v for (k, v) in key_counts_dict.items() if 'addr' in k}
    pprint(sorted(address_fields.items(), key=itemgetter(1), reverse=True))

    # Get a list of distinct streets
    distinct_streets = osm_col.distinct('addr:street')

    mapping_dict = get_suffix_mapping()
    distinct_suffix = set(x.split()[-1] for x in distinct_streets)
    pprint(mapping_dict[0:5])

    # preview which suffixes will be normalized
    for suffix in distinct_suffix:
        for mapping in mapping_dict:
            if mapping['Common'] == suffix.upper():
                print(f"Changing {suffix} to {mapping['Standard']}")
                break

    modified_count = 0
    for entry in tqdm(distinct_streets):
        # the street suffix is the last word of the street name
        suffix = entry.split()[-1]
        for mapping in mapping_dict:
            if mapping['Common'] == suffix.upper():
                # print(f"Changing {suffix} to {mapping['Standard']}")
                # print(entry.replace(suffix, mapping['Standard']))
                result = osm_col.update_many(
                    {'addr:street': entry},
                    {"$set": {'addr:street': entry.replace(suffix, mapping['Standard'])}})
                modified_count += result.modified_count
                break
    print(f"{modified_count} address suffixes updated")

    # Get a list of distinct street types (alphabetic only)
    pprint(set(x.split()[-1] for x in distinct_streets if x.split()[-1].isalpha()))

    # Get a list of distinct street types
    pprint(set(x.split()[-1] for x in distinct_streets))

    # ### find all address codes in collection
    unique_zip_codes = osm_col.distinct('addr:postcode')
    pprint(unique_zip_codes)

    update_dict = {'modified': 0, 'deleted': 0, 'good': 0}
    for zip_code in tqdm(unique_zip_codes):
        if zip_code[0:5].isdigit() and len(zip_code) > 5:
            # trim ZIP+4 style codes down to five digits
            result = osm_col.update_many({'addr:postcode': zip_code},
                                         {"$set": {'addr:postcode': zip_code[0:5]}})
            update_dict['modified'] += result.modified_count
        elif not zip_code.isdigit() and len(zip_code) != 5:
            # drop documents whose postcode is not recoverable
            result = osm_col.delete_many({'addr:postcode': zip_code})
            update_dict['deleted'] += result.deleted_count
        elif zip_code.isdigit() and len(zip_code) == 5:
            update_dict['good'] += 1
    pprint(update_dict)

    # verify the postcode cleanup
    updated_address_code_list = list(osm_col.find({'addr:postcode': {'$exists': True}},
                                                  {'addr:postcode': 1, '_id': 0}))
    pprint(set(x['addr:postcode'] for x in updated_address_code_list))
    # [Rochester Zip codes](https://www.zip-codes.com/city/ny-rochester.asp)
    # > After running our function we can see that all the unique zip codes in
    # > the database are valid Rochester zip codes.
    pprint(list(osm_col.aggregate([
        {'$group': {'_id': '$addr:postcode', 'count': {'$sum': 1}}},
        {'$sort': {'count': -1}}
    ]))[:10])

    # # User Counts
    def get_single_users(col: Collection):
        user_counts_dict = list(col.aggregate([
            {'$sortByCount': '$user'},
            {'$sort': {'count': 1}}
        ]))
        single_doc_user = list()
        for entry in user_counts_dict:
            if entry['count'] == 1:
                single_doc_user.append(entry['_id'])
            else:
                break
        pprint(single_doc_user[0:5])
        pprint(f"{len(single_doc_user)} users with only one post out of {len(user_counts_dict)}")
        # return single_doc_user

    get_single_users(osm_col)

    user_df = pd.DataFrame.from_dict(list(osm_col.aggregate([
        {'$sortByCount': '$user'}
    ])))
    user_df['percent'] = user_df['count'] / user_df['count'].sum()

    # Percent of entries that came from the top two users
    print(user_df[0:2]['percent'].sum() * 100)

    # Combined top 10 users' contribution
    print(user_df[0:10]['percent'].sum() * 100)

    # Combined percent of users who individually contribute less than 1% of the
    # entries in the database
    print(user_df[user_df.percent <= .01].percent.sum() * 100)

    print(user_df._id)

    def top_ten_amenities(col: Collection):
        top_amenities = list(col.aggregate([
            {'$match': {'type': 'way'}},
            {'$sortByCount': '$amenity'}
        ]))
        return top_amenities

    pprint(top_ten_amenities(col=osm_col)[0:10])

    df = pd.DataFrame.from_dict(top_ten_amenities(osm_col))
    pprint(df[0:10])

    df['percent'] = df['count'] / df['count'].sum()
    # Sample output (blank _id = ways with no amenity tag):
    # _id        | count | percent
    #            | 49953 | 0.951069055461417
    # parking    |  1828 | 0.034803800239894905
    # restaurant |   127 | 0.0024179883098832894
    # school     |    85 | 0.001618338632599052
    # fuel       |    56 | 0.0010661995697123165

    print(df[1:].describe())
    print(df.shape[0])

    # Biggest religion
    religion = list(osm_col.aggregate([
        {'$match': {'amenity': {'$eq': 'place_of_worship'}}},
        {'$group': {'_id': '$religion', 'count': {'$sum': 1}}},
        {'$sort': {'count': -1}}
    ]))
    pprint(religion)

    # Most popular cuisine in restaurants
    cuisine = list(osm_col.aggregate([
        {'$match': {'amenity': {'$eq': 'restaurant'}}},
        {'$group': {'_id': '$cuisine', 'count': {'$sum': 1}}},
        {'$sort': {'count': -1}}
    ]))
    pprint(cuisine[0:10])

    # City counts
    city_counts = list(osm_col.aggregate([
        {'$group': {'_id': '$addr:city', 'count': {'$sum': 1}}},
        {'$sort': {'count': -1}}
    ]))
    print('\n'.join('{_id!s:<20}{count}'.format(**x) for x in city_counts))
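# Note on the $sortByCount stages used above (a MongoDB equivalence, not
# project-specific code): {'$sortByCount': '$user'} is shorthand for
#     {'$group': {'_id': '$user', 'count': {'$sum': 1}}},
#     {'$sort': {'count': -1}}
# which is why both the user and amenity pipelines yield '_id'/'count' documents.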
if __name__ == '__main__':
    main()

# ### Initial MongoDB collection creation
# - Insert all records from the JSON file
# - Create a compound unique index on the 'id' and 'type' fields
# - Read back the JSON file we just wrote to verify it's working (see the sketch below)
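# Hedged verification sketch (the default filename assumes the sample extract
# used in main()): re-read the exported JSON, as the note above suggests.
def verify_json(json_file: str = "rochester_sample.json"):
    """Load the exported JSON and report how many documents it holds."""
    with open(json_file) as f:
        data = json.load(f)
    print(f"{sum(len(group) for group in data)} documents "
          f"across {len(data)} tag groups")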