prep hospitals accessibility.py

import numpy as np
import pandas as pd
import geopandas as gpd
import os
from scipy.spatial import cKDTree

############
#
# Globals & Functions
#
############
data_dir = ".\\data"

GBcrs = {"init":"epsg:27700"}

# OS Open Roads accessed from: https://www.ordnancesurvey.co.uk/opendatadownload/products.html
open_roads_dir = os.path.join(data_dir, "OS Open Roads")

# Hospital Names and Locations: https://data.gov.uk/dataset/f4420d1c-043a-42bc-afbc-4c0f7d3f1620/hospitals
hosp_data_dir = os.path.join(data_dir, "hospital_data")
hospitals_file = "Hospital.csv"

# LSOA 2011 Pop Weighted Centroids: https://geoportal.statistics.gov.uk/datasets/lower-layer-super-output-areas-december-2011-population-weighted-centroids
lsoa_pop_weighted_centroids_file = "Lower_Layer_Super_Output_Areas_December_2011_Population_Weighted_Centroids.csv"

##############
#
# Read and clean data
#
##############
dfHospitals = pd.read_csv(os.path.join(hosp_data_dir, hospitals_file), delimiter="¬")
dfLSOACent = pd.read_csv(os.path.join(data_dir, lsoa_pop_weighted_centroids_file))

# Create geometries from coordinates in the Hospitals and LSOA Centroids data
gdfHospitals = gpd.GeoDataFrame(dfHospitals, geometry=gpd.points_from_xy(dfHospitals.Longitude, dfHospitals.Latitude), crs = "EPSG:4326")
gdfHospitals = gdfHospitals.to_crs(GBcrs)
gdfHospitals.dropna(subset = ['Latitude','Longitude'], inplace = True)

gdfLSOACent = gpd.GeoDataFrame(dfLSOACent, geometry = gpd.points_from_xy(dfLSOACent.X, dfLSOACent.Y), crs = "EPSG:27700")

# Open Roads Data

# commented out because once data files have been combined and saved as a single file this section
# of code doesn't need to be executed

'''
# Combine mutiple OS Road Nodes files into a single geodataframe
road_node_files = [i for i in os.listdir(open_roads_dir) if "RoadNode.shp" in i]
gdfRoadNodes = gpd.GeoDataFrame()
for f in road_node_files:
    gdf = gpd.read_file(os.path.join(open_roads_dir, f))
    gdf['file'] = f
    gdfRoadNodes = pd.concat([gdfRoadNodes, gdf])
gdfRoadNodes = gdfRoadNodes.to_crs(GBcrs)
gdfRoadNodes.to_file(os.path.join(data_dir, "OS Open Road Nodes.shp"))
'''

'''
road_link_files = [i for i in os.listdir(open_roads_dir) if "RoadLink.shp" in i]

gdfRoadLinks = gpd.GeoDataFrame()
for f in road_link_files:
    gdf = gpd.read_file(os.path.join(open_roads_dir, f))
    gdfRoadLinks = pd.concat([gdfRoadLinks, gdf])
gdfRoadLinks = gdfRoadLinks.to_crs(GBcrs)
gdfRoadLinks.to_file(os.path.join(data_dir,"OS Open Road Links.shp"))
'''

########################
#
# Use road network to find distances to nearest hospitals
#
########################

import networkx as nx
import pandana as pdna

# Extract the road network topology from the gis data
print("Loading network topology")
'''
# commented out because once network topology has been extracted from open road gis data this section doesn't need tp be executed

gdfRoadNodes = gpd.read_file(os.path.join(data_dir, "OS Open Road Nodes.shp"))
gdfRoadLinks = gpd.read_file(os.path.join(data_dir,"OS Open Road Links.shp"))

dfNodes = pd.DataFrame({"identifier":gdfRoadNodes.identifier, "x": gdfRoadNodes.geometry.x, "y":gdfRoadNodes.geometry.y})
dfNodes.set_index("identifier", inplace=True)
dfLinks = gdfRoadLinks.loc[:, ['startNode', 'endNode', 'length']]
dfNodes.to_csv(os.path.join(data_dir, "OS Open Road Nodes.csv"))
dfLinks.to_csv(os.path.join(data_dir, "OS Open Road Links.csv"))
gdfRoadNodes = None
gdfRoadLinks = None
'''
dfNodes = pd.read_csv(os.path.join(data_dir, "OS Open Road Nodes.csv"))
dfLinks = pd.read_csv(os.path.join(data_dir, "OS Open Road Links.csv"))

dfNodes.drop_duplicates(inplace=True)
dfNodes.set_index('identifier', inplace=True)

gdfHospitals.set_index('OrganisationCode', inplace = True)

# Get largest connected component
print("Finding largest connected component")
edges = dfLinks.loc[:,['startNode','endNode','length']].values
G = nx.Graph()
G.add_weighted_edges_from(edges)
largest_connected_component = sorted(nx.connected_components(G), key = len, reverse=True)[0]

# Clean up to save memory
G = None
edges = None

# Create pandana network. It's much faster for nearest point-of-interest analysis
print("Creating Network")

# Filter nodes and edges to just include those in the largest connected componet
dfLinksLCC = dfLinks.loc[(dfLinks['startNode'].isin(largest_connected_component)) & (dfLinks['endNode'].isin(largest_connected_component))]
dfNodesLCC = dfNodes.loc[largest_connected_component]

net=pdna.Network(dfNodesLCC["x"], dfNodesLCC["y"], dfLinksLCC["startNode"], dfLinksLCC["endNode"], dfLinksLCC[["length"]])

# Can then get the nearest 3 hospitals to each LSOA centroid
print("Finding nearest hospitals")
search_distance = 200000
net.set_pois("hospitals", search_distance, 3, gdfHospitals.geometry.x, gdfHospitals.geometry.y)
dfNear = net.nearest_pois(search_distance,"hospitals", num_pois=3, include_poi_ids=True)


# Get just the LSOA centroids and their nearest hospitals
lsoa_nodes = net.get_node_ids(gdfLSOACent.X, gdfLSOACent.Y)
dfNearLSOACentroids = dfNear.loc[lsoa_nodes]

# Include LSOA Codes
gdfLSOACent['lsoa_nodes'] = lsoa_nodes
dfNearLSOACentroids.reset_index(inplace=True)
dfNearLSOACentroids = pd.merge(dfNearLSOACentroids, gdfLSOACent, left_on = 'identifier', right_on = 'lsoa_nodes', how = 'outer', indicator = True)

# Check each LSOA Centroid has nearest 3 hospitals
assert dfNearLSOACentroids['_merge'].value_counts()['left_only'] == 0
assert dfNearLSOACentroids['_merge'].value_counts()['right_only'] == 0
assert dfNearLSOACentroids['poi1'].isnull().any() == False
assert dfNearLSOACentroids['poi2'].isnull().any() == False
assert dfNearLSOACentroids['poi3'].isnull().any() == False


#########################
#
# Clearing up and making output more presentable
#
#########################

# Compute distance to nearest hospital and average distance to nearest 3 hospitals
dfNearLSOACentroids['distance_nearest_hospital'] = dfNearLSOACentroids[1]
dfNearLSOACentroids['mean_distance_nearest_three_hospitals'] = dfNearLSOACentroids.loc[:,[1,2,3]].mean(axis = 1)

dfNearLSOACentroids = dfNearLSOACentroids.reindex(columns = [   'lsoa11cd','lsoa11nm','lsoa_nodes',
                                                                1,2,3,'poi1','poi2','poi3',
                                                                'mean_distance_nearest_three_hospitals'])

dfNearLSOACentroids = dfNearLSOACentroids.rename(columns = {1:'distance1',2:'distance2',3:'distance3',
                                                            'poi1':'hospital1','poi2':'hospital2','poi3':'hospital3',
                                                            'lsoa_nodes':'lsoa_centroid_road_node'})

dfNearLSOACentroids.to_csv(os.path.join(data_dir, "lsoa_nearest_hospitals.csv"), index=False)


####################
#
# Make a map
#
###################
f, ax = plt.subplots(1,1)
legend_info ={'label': "Mean distance to nearest 3 hospitals (m)"}
gdflsoas.plot(column = 'mean_distance_nearest_three_hospitals', ax = ax, legend = True, legend_kwds = legend_info)
plt.axis('off')
f.savefig(os.path.join(data_dir, 'lsoa_hospital_distance_map.png'))