diff --git a/hydromt_fiat/data/social_vulnerability/test_blockgroup_shp/tl_2022_45_bg.cpg b/hydromt_fiat/data/social_vulnerability/test_blockgroup_shp/tl_2022_45_bg.cpg deleted file mode 100644 index 3ad133c0..00000000 --- a/hydromt_fiat/data/social_vulnerability/test_blockgroup_shp/tl_2022_45_bg.cpg +++ /dev/null @@ -1 +0,0 @@ -UTF-8 \ No newline at end of file diff --git a/hydromt_fiat/data/social_vulnerability/test_blockgroup_shp/tl_2022_45_bg.dbf b/hydromt_fiat/data/social_vulnerability/test_blockgroup_shp/tl_2022_45_bg.dbf deleted file mode 100644 index a03d23d7..00000000 Binary files a/hydromt_fiat/data/social_vulnerability/test_blockgroup_shp/tl_2022_45_bg.dbf and /dev/null differ diff --git a/hydromt_fiat/data/social_vulnerability/test_blockgroup_shp/tl_2022_45_bg.prj b/hydromt_fiat/data/social_vulnerability/test_blockgroup_shp/tl_2022_45_bg.prj deleted file mode 100644 index 747df588..00000000 --- a/hydromt_fiat/data/social_vulnerability/test_blockgroup_shp/tl_2022_45_bg.prj +++ /dev/null @@ -1 +0,0 @@ -GEOGCS["GCS_North_American_1983",DATUM["D_North_American_1983",SPHEROID["GRS_1980",6378137,298.257222101]],PRIMEM["Greenwich",0],UNIT["Degree",0.017453292519943295]] \ No newline at end of file diff --git a/hydromt_fiat/data/social_vulnerability/test_blockgroup_shp/tl_2022_45_bg.shp b/hydromt_fiat/data/social_vulnerability/test_blockgroup_shp/tl_2022_45_bg.shp deleted file mode 100644 index 32820bf3..00000000 Binary files a/hydromt_fiat/data/social_vulnerability/test_blockgroup_shp/tl_2022_45_bg.shp and /dev/null differ diff --git a/hydromt_fiat/data/social_vulnerability/test_blockgroup_shp/tl_2022_45_bg.shp.ea.iso.xml b/hydromt_fiat/data/social_vulnerability/test_blockgroup_shp/tl_2022_45_bg.shp.ea.iso.xml deleted file mode 100644 index c066d309..00000000 --- a/hydromt_fiat/data/social_vulnerability/test_blockgroup_shp/tl_2022_45_bg.shp.ea.iso.xml +++ /dev/null @@ -1,406 +0,0 @@ - - - - Feature Catalog for the Current Block Groups TIGER/Line Shapefiles - - - Block Groups - - - 2022 - - - 2022-09-29 - - - eng; USA - - - utf8 - - - - - U.S. Department of Commerce, U.S. Census Bureau, Geography Division, Spatial Data Collection and Products Branch - - - - - - - 301-763-1128 - - - 301-763-4710 - - - - - - - 4600 Silver Hill Road, Stop 7400 - - - Washington - - - DC - - - 20233-7400 - - - United States - - - geo.geography@census.gov - - - - - - - pointOfContact - - - - - - - BG.shp - - - Current Block Group State-based - - - false - - - - - - STATEFP - - - Current state Federal Information Processing Series (FIPS) code - - - - - - - National Standard Codes (ANSI INCITS 38-2009), Federal Information Processing Series (FIPS) - States/State Equivalents - - - - - - - - - - COUNTYFP - - - Current county Federal Information Processing Series (FIPS) code - - - - - - - National Standard Codes (ANSI INCITS 31-2009), Federal Information Processing Series (FIPS) - Counties/County Equivalents - - - - - - - - - - TRACTCE - - - Current census tract code - - - - - - - 000100 to 998999 - - - Census tract number - - - - - - - - 990000 to 990099 - - - Water tract (2010 and continuing) - - - - - - - - 990100 to 998999 - - - Census tract number - - - - - - - - - - BLKGRPCE - - - Current block group number. A block group number of 0 represents a water block group. - - - - - - - 0 - - - 9 - - - - - - - - - - - - GEOID - - - Census block group identifier; a concatenation of the current state Federal Information Processing Series (FIPS) code, county FIPS code, census tract code, and block group number - - - - - - - - The GEOID attribute is a concatenation of the state FIPS code, followed by the county FIPS code, followed by the census tract code, followed by the block group number. No spaces are allowed between the two codes. The State FIPS code is taken from "National Standard Codes (ANSI INCITS 38-2009), Federal Information Processing Series (FIPS) - States". The county FIPS code is taken from "National Standard Codes (ANSI INCITS 31-2009), Federal Information Processing Series (FIPS) - Counties/County Equivalents". The census tract code is taken from the "TRACTCE" attribute. The block group number is taken from the "BLKGRPCE" attribute. - - - - - - - - - NAMELSAD - - - Current translated legal/statistical area description and the block group number - - - - - - - - Refer to the block group number that appears in BLKGRPCE and translated legal/statistical area description code BG, Block Group - - - - - - - - - MTFCC - - - MAF/TIGER feature class code - - - - - - - G5030 - - - Block Group - - - - - - - - - - FUNCSTAT - - - Current functional status - - - - - - - S - - - Statistical entity - - - - - - - - - - ALAND - - - Current land area (square meters) - - - - - - - 0 - - - 9,999,999,999,999 - - - - - - - - - - - - AWATER - - - Current water area (square meters) - - - - - - - 0 - - - 9,999,999,999,999 - - - - - - - - - - - - INTPTLAT - - - Current latitude of the internal point - - - - - - - -90 - - - 90 - - - - - - - - - - - - INTPTLON - - - Current longitude of the internal point - - - - - - - -180 - - - 180 - - - - - - - - - - - \ No newline at end of file diff --git a/hydromt_fiat/data/social_vulnerability/test_blockgroup_shp/tl_2022_45_bg.shp.iso.xml b/hydromt_fiat/data/social_vulnerability/test_blockgroup_shp/tl_2022_45_bg.shp.iso.xml deleted file mode 100644 index f30c2eb7..00000000 --- a/hydromt_fiat/data/social_vulnerability/test_blockgroup_shp/tl_2022_45_bg.shp.iso.xml +++ /dev/null @@ -1,852 +0,0 @@ - - - - tl_2022_45_bg.shp.iso.xml - - - eng; USA - - - utf8 - - - series_tl_2022_bg.shp.iso.xml - - - dataset - - - - - U.S. Department of Commerce, U.S. Census Bureau, Geography Division, Spatial Data Collection and Products Branch - - - - - - - 301-763-1128 - - - 301-763-4710 - - - - - - - 4600 Silver Hill Road, Stop 7400 - - - Washington - - - DC - - - 20233-7400 - - - United States - - - geo.geography@census.gov - - - - - - - pointOfContact - - - - - 2022-09-29 - - - ISO 19115-2 Geographic Information - Metadata - Part 2: Extensions for Imagery and Gridded Data - - - ISO 19115-2:2009(E) - - - https://meta.geo.census.gov/data/existing/decennial/GEO/GPMB/TIGERline/Current_19115/tl_2022_45_bg.shp.iso.xml - - - - - - - complex - - - 3408 - - - - - - - - - - - - - North American Datum of 1983 - - - NAD83 - - - - - 2007-01-19 - - - revision - - - - - - - - - - - - https://spatialreference.org/ref/epsg/4269/gml/ - - - NAD83 - - - Link to Geographic Markup Language (GML) description of reference system. - - - information - - - - - - - resourceProvider - - - - - - - urn:ogc:def:crs:EPSG::4269 - - - - - - - - - - - TIGER/Line Shapefile, Current, State, South Carolina, Block Group - - - TIGER/Line Shapefile, 2022, State, South Carolina, SC, Block Group - - - - - 2022 - - - publication - - - - - - - 2022-08 - - - creation - - - - - - - 2022-08 - - - lastUpdate - - - - - - - tl_2022_45_bg - - - - - - - U.S. Department of Commerce, U.S. Census Bureau, Geography Division, Spatial Data Collection and Products Branch - - - - - - - 301-763-1128 - - - 301-763-4710 - - - - - - - 4600 Silver Hill Road, Stop 7400 - - - Washington - - - DC - - - 20233-7400 - - - United States - - - geo.geography@census.gov - - - - - - - pointOfContact - - - - - mapDigital - - - - - This resource is a member of a series. The TIGER/Line shapefiles and related database files (.dbf) are an extract of selected geographic and cartographic information from the U.S. Census Bureau's Master Address File / Topologically Integrated Geographic Encoding and Referencing (MAF/TIGER) Database (MTDB). The MTDB represents a seamless national file with no overlaps or gaps between parts, however, each TIGER/Line shapefile is designed to stand alone as an independent data set, or they can be combined to cover the entire nation. -Block Groups (BGs) are clusters of blocks within the same census tract. Each census tract contains at least one BG, and BGs are uniquely numbered within census tracts. BGs have a valid code range of 0 through 9. BGs have the same first digit of their 4-digit census block number from the same decennial census. For example, tabulation blocks numbered 3001, 3002, 3003,.., 3999 within census tract 1210.02 are also within BG 3 within that census tract. BGs coded 0 are intended to only include water area, no land area, and they are generally in territorial seas, coastal water, and Great Lakes water areas. Block groups generally contain between 600 and 3,000 people. A BG usually covers a contiguous area but never crosses county or census tract boundaries. They may, however, cross the boundaries of other geographic entities like county subdivisions, places, urban areas, voting districts, congressional districts, and American Indian / Alaska Native / Native Hawaiian areas. -The BG boundaries in this release are those that were delineated as part of the Census Bureau's Participant Statistical Areas Program (PSAP) for the 2020 Census. - - - The Census Bureau releases extracts of the MAF/TIGER database in the form of TIGER/Line Shapefiles for others to use in geographic information systems (GIS) or for other geographic applications. - - - completed - - - - - U.S. Department of Commerce, U.S. Census Bureau, Geography Division, Spatial Data Collection and Products Branch - - - - - - - 301-763-1128 - - - 301-763-4710 - - - - - - - 4600 Silver Hill Road, Stop 7400 - - - Washington - - - DC - - - 20233-7400 - - - United States - - - geo.geography@census.gov - - - - - - - pointOfContact - - - - - - - notPlanned - - - - - - - https://tigerweb.geo.census.gov/arcgis/services/TIGERweb/tigerWMS_Current/MapServer/WmsServer?REQUEST=GetMap&SERVICE=WMS&VERSION=1.3.0&LAYERS=Census_Block_Groups59805,Census_Block_Groups_Labels32376&STYLES=default,default&FORMAT=image/png+xml&BGCOLOR=0xFFFFFF&TRANSPARENT=TRUE&CRS=EPSG:4326&BBOX=+38.8810,-77.0401,+38.9008,-77.0035&WIDTH=256&HEIGHT=256 - - - URL for the TIGERWeb Mapping Service - - - URL - - - - - - - State or equivalent entity - - - Polygon - - - Block Group - - - BG - - - - - - - - None - - - - - - - - - - State or Equivalent Entity - - - South Carolina - - - SC - - - 45 - - - - - - - - INCITS 38-2009[R2019]: Information Technology - Codes for the Identification of the States and Equivalent Areas within the United States, Puerto Rico, and the Insular Areas. - - - - - - - - - - otherRestrictions - - - otherRestrictions - - - Access constraints: None - - - Use Constraints: The TIGER/Line Shapefile products are not copyrighted however TIGER/Line and Census TIGER are registered trademarks of the U.S. Census Bureau. These products are free to use in a product or publication, however acknowledgement must be given to the U.S. Census Bureau as the source. -The boundary information in the TIGER/Line Shapefiles are for statistical data collection and tabulation purposes only; their depiction and designation for statistical purposes does not constitute a determination of jurisdictional authority or rights of ownership or entitlement and they are not legal land descriptions. Coordinates in the TIGER/Line shapefiles have six implied decimal places, but the positional accuracy of these coordinates is not as great as the six decimal places suggest. - - - - - - - - - Series Information for Block Group State-based TIGER/Line Shapefiles, Current - - - - - 2022 - - - publication - - - - - - - series_tl_2022_bg - - - - - - - U.S. Department of Commerce, U.S. Census Bureau, Geography Division, Spatial Data Collection and Products Branch - - - - - - - 301-763-1128 - - - 301-763-4710 - - - - - - - 4600 Silver Hill Road, Stop 7400 - - - Washington - - - DC - - - 20233-7400 - - - United States - - - geo.geography@census.gov - - - - - - - pointOfContact - - - - - mapDigital - - - - - largerWorkCitation - - - - - vector - - - eng; USA - - - utf8 - - - boundaries - - - The TIGER/Line shapefiles contain geographic data only and do not include display mapping software or statistical data. For information on how to use the TIGER/Line shapefile data with specific software package users shall contact the company that produced the software. - - - - - - - -83.353928 - - - -78.499301 - - - 31.995954 - - - 35.215485 - - - - - - - - publication date - 2021-06 - 2022-08 - - - - - - - - - - - - 1 - - - Block Groups - - - - - Feature Catalog for the Current Block Groups TIGER/Line Shapefiles - - - - - 2022 - - - publication - - - - - - - U.S. Department of Commerce, U.S. Census Bureau, Geography Division, Spatial Data Collection and Products Branch - - - - - - - 301-763-1128 - - - 301-763-4710 - - - - - - - 4600 Silver Hill Road, Stop 7400 - - - Washington - - - DC - - - 20233-7400 - - - United States - - - geo.geography@census.gov - - - - - - - pointOfContact - - - - - https://meta.geo.census.gov/data/existing/decennial/GEO/GPMB/TIGERline/Current_19110/tl_2022_bg.shp.ea.iso.xml - - - - - - - - - - - ZIP - - - - - - - - - - U.S. Department of Commerce, U.S. Census Bureau, - Geography Division, Geographic Customer Services Branch - - - - - - - - 301-763-1128 - - - 301-763-4710 - - - - - - - 4600 Silver Hill Road, Stop 7400 - - - Washington - - - DC - - - 20233-7400 - - - United States - - - geo.geography@census.gov - - - - - - - distributor - - - - - - - The online copy of the TIGER/Line Shapefiles may be accessed without charge. - - - - - - - - - - - https://www2.census.gov/geo/tiger/TIGER2022/BG/tl_2022_45_bg.zip - - - Shapefile Zip File - - - tl_2022_45_bg.zip - - - The TIGER/Line Shapefiles are the fully supported, core geographic product from the U.S. Census Bureau. They are extracts of selected geographic and cartographic information from the U.S. Census Bureau's Master Address File/Topologically Integrated Geographic Encoding and Referencing (MAF/TIGER) database. - - - download - - - - - - - - - - - https://meta.geo.census.gov/data/existing/decennial/GEO/GPMB/TIGERline/Current_19110/tl_2022_bg.shp.ea.iso.xml - - - XML - - - tl_2022_bg.shp.ea.iso.xml - - - Entity and attribute file - - - download - - - - - - - - - - - https://tigerweb.geo.census.gov/arcgis/rest/services/TIGERweb/tigerWMS_Current/MapServer - - - https://www.ogc.org/standards/wms - - - TIGERweb/tigerWMS_Current (MapServer) - - - The Open Geospatial Consortium, Inc. (OGC) Web Map Service interface standard (WMS) provides a simple HTTP interface for requesting geo-registered map images from our geospatial database. The response to the request is one or more geo-registered map images that can be displayed in a browser or WMS client application. By gaining access to our data through our WMS, users can produce maps containing TIGERweb layers combined with layers from other servers. - - - search - - - - - - - - - - - https://tigerweb.geo.census.gov/arcgis/rest/services/TIGERweb/Tracts_Blocks/MapServer - - - https://geoservices.github.io - - - TIGERweb/Tracts_Blocks (MapServer) - - - The GeoServices REST Specification provides a way for Web clients to communicate with geographic information system (GIS) servers through Representational State Transfer (REST) technology. - - - search - - - - - - - - - - - - - dataset - - - - - - - - The Census Bureau performed automated tests to ensure logical consistency and limits of shapefiles. Segments making up the outer and inner boundaries of a polygon tie end-to-end to completely enclose the area. All polygons are tested for closure. - The Census Bureau uses its internally developed geographic update system to enhance and modify spatial and attribute data in the Census MAF/TIGER database. Standard geographic codes, such as FIPS codes for states, counties, municipalities, county subdivisions, places, American Indian/Alaska Native/Native Hawaiian areas, and congressional districts are used when encoding spatial entities. The Census Bureau performed spatial data tests for logical consistency of the codes during the compilation of the original Census MAF/TIGER database files. Most of the codes for geographic entities except states, counties, urban areas, Core Based Statistical Areas (CBSAs), American Indian Areas (AIAs), and congressional districts were provided to the Census Bureau by the USGS, the agency responsible for maintaining the Geographic Names Information System (GNIS). Feature attribute information has been examined but has not been fully tested for consistency. - For the TIGER/Line Shapefiles, the Point and Vector Object Count for the G-polygon SDTS Point and Vector Object Type reflects the number of records in the shapefile attribute table. For multi-polygon features, only one attribute record exists for each multi-polygon rather than one attribute record per individual G-polygon component of the multi-polygon feature. TIGER/Line Shapefile multi-polygons are an exception to the G-polygon object type classification. Therefore, when multi-polygons exist in a shapefile, the object count will be less than the actual number of G-polygons. - - - - - - - - - Data completeness of the TIGER/Line Shapefiles reflects the contents of the Census MAF/TIGER database at the time the TIGER/Line Shapefiles were created. - - - - - - - - Data completeness of the TIGER/Line Shapefiles reflects the contents of the Census MAF/TIGER database at the time the TIGER/Line Shapefiles were created. - - - - - - - - - - annually - - - This was transformed from the Census Bureau Geospatial Product Metadata Content Standard. - - - - \ No newline at end of file diff --git a/hydromt_fiat/data/social_vulnerability/test_blockgroup_shp/tl_2022_45_bg.shx b/hydromt_fiat/data/social_vulnerability/test_blockgroup_shp/tl_2022_45_bg.shx deleted file mode 100644 index d07ebcd9..00000000 Binary files a/hydromt_fiat/data/social_vulnerability/test_blockgroup_shp/tl_2022_45_bg.shx and /dev/null differ diff --git a/hydromt_fiat/fiat.py b/hydromt_fiat/fiat.py index daf4d9f2..f811418b 100644 --- a/hydromt_fiat/fiat.py +++ b/hydromt_fiat/fiat.py @@ -25,6 +25,7 @@ check_map_uniqueness, create_risk_dataset, ) +from hydromt_fiat.workflows.equity_data import EquityData from hydromt_fiat.workflows.social_vulnerability_index import SocialVulnerabilityIndex from hydromt_fiat.workflows.vulnerability import Vulnerability from hydromt_fiat.workflows.aggregation_areas import join_exposure_aggregation_areas @@ -563,6 +564,8 @@ def setup_social_vulnerability_index( state_abbreviation: str, user_dataset_fn: str = None, blockgroup_fn: str = None, + year_data: int = None, + county: str = None ): """Setup the social vulnerability index for the vector exposure data for Delft-FIAT. This method has so far only been tested with US Census data @@ -591,7 +594,7 @@ def setup_social_vulnerability_index( svi.variable_code_csv_to_pd_df(codebook_fn) svi.set_up_download_codes() svi.set_up_state_code(state_abbreviation) - svi.download_census_data() + svi.download_census_data(year_data) svi.rename_census_data("Census_code_withE", "Census_variable_name") svi.identify_no_data() svi.check_nan_variable_columns("Census_variable_name", "Indicator_code") @@ -604,12 +607,12 @@ def setup_social_vulnerability_index( svi.domain_scores() svi.composite_scores() svi.match_geo_ID() - svi.load_shp_geom(blockgroup_fn) + svi.download_shp_geom(year_data, county) svi.merge_svi_data_shp() # store the relevant tables coming out of the social vulnerability module self.set_tables(df=svi.svi_data_shp, name="social_vulnerability_scores") - # self.set_tables(df=svi.excluded_regions, name="social_vulnerability_nodataregions") + # TODO: Think about adding an indicator for missing data to the svi.svi_data_shp # Check if the exposure data exists if self.exposure: @@ -624,6 +627,46 @@ def setup_social_vulnerability_index( svi_exp_joined.drop(columns=["geometry"], inplace=True) svi_exp_joined = pd.DataFrame(svi_exp_joined) self.exposure.exposure_db = svi_exp_joined + + def setup_equity_data( + self, + census_key: str, + state_abbreviation: str, + blockgroup_fn: str = None, + year_data: int = None, + county: str = None + ): + """Setup the download procedure for equity data similarly to the SVI setup + + Parameters + ---------- + path_dataset : str + The path to a predefined dataset + census_key : str + The user's unique Census key that they got from the census.gov website + (https://api.census.gov/data/key_signup.html) to be able to download the + Census data + path : Union[str, Path] + The path to the codebook excel + state_abbreviation : str + The abbreviation of the US state one would like to use in the analysis + """ + + # Create equity object + equity = EquityData(self.data_catalog, self.logger) + + # Call functionalities of equity + equity.set_up_census_key(census_key) + equity.variables_to_download() + equity.set_up_state_code(state_abbreviation) + equity.download_census_data(year_data) + equity.rename_census_data() + equity.match_geo_ID() + equity.download_shp_geom(year_data, county) + equity.merge_svi_data_shp() + + self.set_tables(df=equity.equity_data_shp, name="equity_data") + def setup_aggregation_areas( self, diff --git a/hydromt_fiat/workflows/equity_data.py b/hydromt_fiat/workflows/equity_data.py new file mode 100644 index 00000000..2a4dc14c --- /dev/null +++ b/hydromt_fiat/workflows/equity_data.py @@ -0,0 +1,194 @@ +from census import Census # install in your environment using pip install Census +from us import states # install in your environment using pip install us +from hydromt.data_catalog import DataCatalog +from hydromt.log import setuplog +import logging +import pandas as pd +import numpy as np +import geopandas as gpd +from urllib.request import urlopen +from io import BytesIO +from zipfile import ZipFile +from pathlib import Path + + + +class EquityData: + def __init__(self, data_catalog: DataCatalog = None, logger: logging.Logger = None): + self.data_catalog = data_catalog + self.census_key = Census + self.download_codes = {} + self.state_fips = 0 + self.pd_census_data = pd.DataFrame() + self.codebook = pd.DataFrame() + self.indicator_groups = {} + self.processed_census_data = pd.DataFrame() + + self.pd_domain_scores_geo = pd.DataFrame() + self.logger = setuplog("SVI", log_level=10) + self.svi_data_shp = gpd.GeoDataFrame() + self.block_groups = gpd.GeoDataFrame() + + + def set_up_census_key(self, census_key: str): + """The Census key can be inputted in the ini file. + This is a unique key that every user needs to specify to download census data + + Parameters + ---------- + census_key : str + The unique key a user gets from the census download website (an API token basically) + """ + + self.census_key = Census(census_key) + self.logger.info( + f"your census key {census_key} is used to download data from the Census website " + ) + + + def set_up_state_code(self, state_abbreviation: str): + """download census data for a state + + Parameters + ---------- + state_abbreviation : str + Abbreviation of the state for which you want to set up the census data download + """ + state = [ + state_abbreviation + ] # read in the state abbreviation as specified in the ini file + state_obj = getattr(states, state[0]) + self.state_fips = state_obj.fips + self.logger.info(f"The state abbreviation specified is: {state_abbreviation}") + + def variables_to_download(self): + self.download_variables = ['B01003_001E', 'B19301_001E', 'NAME', 'GEO_ID'] # TODO: later make this a user input? + + def download_census_data(self, year_data): + """download the census data + it is possible to also make the county, tract and blockgroup flexible so that a user could specify exactly what to download + But: bear in mind, with social vulneraiblity we compare areas against each other, so Ideally you would have a large enough dataset (for statistical and validity purposes) + """ + download_census_codes = self.census_key.acs.state_county_blockgroup( + fields=self.download_variables, + state_fips=self.state_fips, + county_fips="*", + tract="*", + blockgroup="*", + year=year_data + ) + self.pd_census_data = pd.DataFrame(download_census_codes) + self.logger.info( + "The equity data was succesfully downloaded from the Census website" + ) + return self.pd_census_data + + def rename_census_data(self): + """renaming the columns so that they have variable names instead of variable codes as their headers + + Parameters + ---------- + from_name : str + The name that you want to replace + to_name : str + The name to replace with + """ + name_dict = {'B01003_001E': "TotalPopulationBG", 'B19301_001E': "PerCapitaIncomeBG"} # TODO: Later make this a user input? + self.pd_census_data = self.pd_census_data.rename(columns=name_dict) + + def match_geo_ID(self): + """Matches GEO_IDs for the block groups""" + self.pd_domain_scores_geo = self.pd_census_data.copy() + self.pd_domain_scores_geo[ + "GEO_ID" + ] = None # Create a new column 'GEO_ID' with initial values set to None + + for index, value in enumerate(self.pd_domain_scores_geo["NAME"]): + if value in self.pd_census_data["NAME"].values: + matching_row = self.pd_census_data.loc[ + self.pd_census_data["NAME"] == value + ] + geo_id = matching_row["GEO_ID"].values[ + 0 + ] # Assuming there's only one matching row, extract the GEO_ID value + self.pd_domain_scores_geo.at[ + index, "GEO_ID" + ] = geo_id # Assign the GEO_ID value to the corresponding row in self.pd_domain_scores_geo + self.pd_domain_scores_geo["GEOID_short"] = ( + self.pd_domain_scores_geo["GEO_ID"].str.split("US").str[1] + ) + + def download_and_unzip(self, url, extract_to='.'): + """function to download the shapefile data from census tiger website + + Parameters + ---------- + url : webpage + URL to census website (TIGER) to download shapefiles for visualisation + extract_to : str, optional + _description_, by default '.' + """ + + try: + http_response = urlopen(url) + zipfile = ZipFile(BytesIO(http_response.read())) + zipfile.extractall(path=extract_to) + except Exception as e: + print(f"Error during download and unzip: {e}") + + def download_shp_geom(self, year_data, county): + """Downloading the shapefiles from the government Tiger website + + Parameters + ---------- + year_data : int + The year for which you want to download the census data and the corresponding shapefiles (for geometry) + county : int + the county code in which your area of interest lies + """ + # Download shapefile of blocks + if year_data == 2022: + url = f"https://www2.census.gov/geo/tiger/TIGER_RD18/LAYER/FACES/tl_rd22_{self.state_fips}{county}_faces.zip" + code = "20" + self.logger.info("Downloading the county shapefile for 2022") + elif year_data == 2021: + url = f"https://www2.census.gov/geo/tiger/TIGER2021/FACES/tl_2021_{self.state_fips}{county}_faces.zip" + code = "20" + self.logger.info("Downloading the county shapefile for 2021") + elif year_data == 2020: + url = f"https://www2.census.gov/geo/tiger/TIGER2020PL/LAYER/FACES/tl_2020_{self.state_fips}{county}_faces.zip" + code = "20" + self.logger.info("Downloading the county shapefile for 2020") + else: + print("year not supported") + return + # Save shapefiles + fold_name = f'Shapefiles/{self.state_fips}{county}/{year_data}' + self.download_and_unzip(url, fold_name) + shapefiles = list(Path(fold_name).glob("*.shp")) + if shapefiles: + self.shp = gpd.read_file(shapefiles[0]) + self.logger.info("The shapefile was downloaded") + else: + print("No shapefile found in the directory.") + + # Dissolve shapefile based on block groups + attrs = ["STATEFP", "COUNTYFP", "TRACTCE", "BLKGRPCE"] + attrs = [attr + code for attr in attrs] + + self.block_groups = self.shp.dissolve(by=attrs, as_index=False) + self.block_groups = self.block_groups[attrs + ["geometry"]] + # block_groups["Census_Bg"] = block_groups['TRACTCE' + code].astype(str) + "-block" + block_groups['BLKGRPCE' + code].astype(str) + self.block_groups["GEO_ID"] = "1500000US" + self.block_groups['STATEFP' + code].astype(str) + self.block_groups['COUNTYFP' + code].astype(str) + self.block_groups['TRACTCE' + code].astype(str) + self.block_groups['BLKGRPCE' + code].astype(str) + + + def merge_equity_data_shp(self): + """Merges the geometry data with the equity_data downloaded""" + self.equity_data_shp = self.pd_domain_scores_geo.merge(self.block_groups[["GEO_ID", "geometry"]], on="GEO_ID", how="left") + self.equity_data_shp = gpd.GeoDataFrame(self.equity_data_shp) + + #self.svi_data_shp.drop(columns=columns_to_drop, inplace=True) + self.equity_data_shp = self.equity_data_shp.to_crs(epsg=4326) + self.logger.info( + "The geometry information was successfully added to the equity information" + ) \ No newline at end of file diff --git a/hydromt_fiat/workflows/social_vulnerability_index.py b/hydromt_fiat/workflows/social_vulnerability_index.py index b0ca9e7d..de68b9ca 100644 --- a/hydromt_fiat/workflows/social_vulnerability_index.py +++ b/hydromt_fiat/workflows/social_vulnerability_index.py @@ -6,7 +6,69 @@ import pandas as pd import numpy as np import geopandas as gpd - +from urllib.request import urlopen +from io import BytesIO +from zipfile import ZipFile +from pathlib import Path + +def list_of_states(): + states = { + 'AK': 'Alaska', + 'AL': 'Alabama', + 'AR': 'Arkansas', + 'AZ': 'Arizona', + 'CA': 'California', + 'CO': 'Colorado', + 'CT': 'Connecticut', + 'DC': 'District of Columbia', + 'DE': 'Delaware', + 'FL': 'Florida', + 'GA': 'Georgia', + 'HI': 'Hawaii', + 'IA': 'Iowa', + 'ID': 'Idaho', + 'IL': 'Illinois', + 'IN': 'Indiana', + 'KS': 'Kansas', + 'KY': 'Kentucky', + 'LA': 'Louisiana', + 'MA': 'Massachusetts', + 'MD': 'Maryland', + 'ME': 'Maine', + 'MI': 'Michigan', + 'MN': 'Minnesota', + 'MO': 'Missouri', + 'MS': 'Mississippi', + 'MT': 'Montana', + 'NC': 'North Carolina', + 'ND': 'North Dakota', + 'NE': 'Nebraska', + 'NH': 'New Hampshire', + 'NJ': 'New Jersey', + 'NM': 'New Mexico', + 'NV': 'Nevada', + 'NY': 'New York', + 'OH': 'Ohio', + 'OK': 'Oklahoma', + 'OR': 'Oregon', + 'PA': 'Pennsylvania', + 'RI': 'Rhode Island', + 'SC': 'South Carolina', + 'SD': 'South Dakota', + 'TN': 'Tennessee', + 'TX': 'Texas', + 'UT': 'Utah', + 'VA': 'Virginia', + 'VT': 'Vermont', + 'WA': 'Washington', + 'WI': 'Wisconsin', + 'WV': 'West Virginia', + 'WY': 'Wyoming' + } + + states_inverted = {value: key for key, value in states.items()} + + return states_inverted class SocialVulnerabilityIndex: def __init__(self, data_catalog: DataCatalog = None, logger: logging.Logger = None): @@ -27,7 +89,8 @@ def __init__(self, data_catalog: DataCatalog = None, logger: logging.Logger = No self.pd_domain_scores_geo = pd.DataFrame() self.logger = setuplog("SVI", log_level=10) self.svi_data_shp = gpd.GeoDataFrame() - + self.block_groups = gpd.GeoDataFrame() + def read_dataset(self, path: str): """If you have your own dataset (e.g. already downloaded census data), you can use this to load it from a csv Make sure to only use the relevant functions in fiat then""" @@ -84,7 +147,7 @@ def set_up_state_code(self, state_abbreviation: str): self.state_fips = state_obj.fips self.logger.info(f"The state abbreviation specified is: {state_abbreviation}") - def download_census_data(self): + def download_census_data(self, year_data): """download the census data it is possible to also make the county, tract and blockgroup flexible so that a user could specify exactly what to download But: bear in mind, with social vulneraiblity we compare areas against each other, so Ideally you would have a large enough dataset (for statistical and validity purposes) @@ -95,6 +158,7 @@ def download_census_data(self): county_fips="*", tract="*", blockgroup="*", + year=year_data ) self.pd_census_data = pd.DataFrame(download_census_codes) self.logger.info( @@ -419,17 +483,17 @@ def domain_scores(self): for key, values in domaingroups.items(): # Add _norm to all values to match with the normalization column names - values_norm = [v + "_norm" for v in values] - sum_variables = self.svi_fiat.filter( - items=values_norm - ) # use the normalized dataset! - sum_values = sum_variables.sum(axis=1) / len( - values_norm - ) # use the normalized dataset! + values_norm = [v + '_norm' for v in values] + sum_variables = self.svi_fiat.filter(items=values_norm) #use the normalized dataset! + z_variables = self.zscore(sum_variables) + sum_values = z_variables.sum(axis=1) / len(values_norm) #use the normalized dataset! self.pd_domain_scores[key] = sum_values + self.pd_domain_scores_z[key] = sum_values - self.pd_domain_scores_z = self.zscore(self.pd_domain_scores) - # Now we add the names so we can later match these regions with their geometry + #finding and storing the highest scoring domain for each row and adding it to the dataset + self.pd_domain_scores_z['highest_domain'] = self.pd_domain_scores_z.apply(lambda row: row.idxmax(), axis=1) + + #Now we add the names so we can later match these regions with their geometry self.pd_domain_scores_z["NAME"] = self.processed_census_data["NAME"] self.pd_domain_scores["NAME"] = self.processed_census_data["NAME"] @@ -439,13 +503,10 @@ def domain_scores(self): def composite_scores(self): """create composite scores from the domain scores""" - columns_to_sum = self.pd_domain_scores_z.columns.drop("NAME") - self.pd_domain_scores_z["composite_SVI"] = self.pd_domain_scores_z[ - columns_to_sum - ].sum(axis=1) - self.pd_domain_scores_z["composite_svi_z"] = self.zscore( - self.pd_domain_scores_z["composite_SVI"] - ) + columns_to_sum = self.pd_domain_scores_z.columns.drop(["NAME", "highest_domain"]) + self.pd_domain_scores_z["composite_SVI"] = self.pd_domain_scores_z[columns_to_sum].sum(axis=1) + self.pd_domain_scores_z["composite_svi_z"] = self.zscore(self.pd_domain_scores_z["composite_SVI"]) + self.logger.info( "Composite SVI scores are created based on the domain scores using equal weighting and are standardized again using zscores" ) @@ -472,92 +533,77 @@ def match_geo_ID(self): self.pd_domain_scores_geo["GEO_ID"].str.split("US").str[1] ) - def load_shp_geom(self, path): - """loading geometry data from the user's specified shapefile. - The user can download the right block groups for their preferred state from: https://www.census.gov/cgi-bin/geo/shapefiles/index.php + def download_and_unzip(self, url, extract_to='.'): + """function to download the shapefile data from census tiger website Parameters ---------- - path : path to shapefile - A shapefile containing block groups geometry: https://www.census.gov/cgi-bin/geo/shapefiles/index.php + url : webpage + URL to census website (TIGER) to download shapefiles for visualisation + extract_to : str, optional + _description_, by default '.' """ - self.shape_data = self.data_catalog.get_geodataframe(path) - self.logger.info(f"Succesfully read in the shapefile from {path}") + + try: + http_response = urlopen(url) + zipfile = ZipFile(BytesIO(http_response.read())) + zipfile.extractall(path=extract_to) + except Exception as e: + print(f"Error during download and unzip: {e}") + + def download_shp_geom(self, year_data, county): + """Downloading the shapefiles from the government Tiger website + + Parameters + ---------- + year_data : int + The year for which you want to download the census data and the corresponding shapefiles (for geometry) + county : int + the county code in which your area of interest lies + """ + # Download shapefile of blocks + if year_data == 2022: + url = f"https://www2.census.gov/geo/tiger/TIGER_RD18/LAYER/FACES/tl_rd22_{self.state_fips}{county}_faces.zip" + code = "20" + self.logger.info("Downloading the county shapefile for 2022") + elif year_data == 2021: + url = f"https://www2.census.gov/geo/tiger/TIGER2021/FACES/tl_2021_{self.state_fips}{county}_faces.zip" + code = "20" + self.logger.info("Downloading the county shapefile for 2021") + elif year_data == 2020: + url = f"https://www2.census.gov/geo/tiger/TIGER2020PL/LAYER/FACES/tl_2020_{self.state_fips}{county}_faces.zip" + code = "20" + self.logger.info("Downloading the county shapefile for 2020") + else: + print("year not supported") + return + # Save shapefiles + fold_name = f'Shapefiles/{self.state_fips}{county}/{year_data}' + self.download_and_unzip(url, fold_name) + shapefiles = list(Path(fold_name).glob("*.shp")) + if shapefiles: + self.shp = gpd.read_file(shapefiles[0]) + self.logger.info("The shapefile was downloaded") + else: + print("No shapefile found in the directory.") + + # Dissolve shapefile based on block groups + attrs = ["STATEFP", "COUNTYFP", "TRACTCE", "BLKGRPCE"] + attrs = [attr + code for attr in attrs] + + self.block_groups = self.shp.dissolve(by=attrs, as_index=False) + self.block_groups = self.block_groups[attrs + ["geometry"]] + # block_groups["Census_Bg"] = block_groups['TRACTCE' + code].astype(str) + "-block" + block_groups['BLKGRPCE' + code].astype(str) + self.block_groups["GEO_ID"] = "1500000US" + self.block_groups['STATEFP' + code].astype(str) + self.block_groups['COUNTYFP' + code].astype(str) + self.block_groups['TRACTCE' + code].astype(str) + self.block_groups['BLKGRPCE' + code].astype(str) + def merge_svi_data_shp(self): """Merges the geometry data with the social vulnerability index computed in the module""" - self.svi_data_shp = self.pd_domain_scores_geo.merge( - self.shape_data, left_on="GEOID_short", right_on="GEOID" - ) + self.svi_data_shp = self.pd_domain_scores_geo.merge(self.block_groups[["GEO_ID", "geometry"]], on="GEO_ID", how="left") self.svi_data_shp = gpd.GeoDataFrame(self.svi_data_shp) - columns_to_drop = [ - "STATEFP", - "COUNTYFP", - "TRACTCE", - "BLKGRPCE", - "GEOID", - "NAMELSAD", - "MTFCC", - "FUNCSTAT", - "ALAND", - "AWATER", - "INTPTLAT", - "INTPTLON", - ] - self.svi_data_shp.drop(columns=columns_to_drop, inplace=True) + + #self.svi_data_shp.drop(columns=columns_to_drop, inplace=True) self.svi_data_shp = self.svi_data_shp.to_crs(epsg=4326) self.logger.info( "The geometry information was successfully added to the social vulnerability information" - ) - - -###OTHER - -# import requests -# import json - -# This function does not work yet. Somehow the json data does not contain the acutal geometry data. -# svi.load_geometry_data("https://tigerweb.geo.census.gov/arcgis/rest/services/TIGERweb/tigerWMS_ACS2022/MapServer/8?f=pjson") -# def load_geometry_data(self, url:str): - -# # Specify the parameters for the request -# params = { -# "where": "1=1", -# "geometryType": "esriGeometryPolygon", -# "spatialRel": "esriSpatialRelIntersects", -# "outFields": "*", -# "returnGeometry": True, -# "f": "geojson" # Retrieve data as GeoJSON -# } - -# # Send the GET request -# response = requests.get(url, params=params) -# response2 = response.text -# # Load the JSON data -# json_data = json.loads(response2) - -# geometry_data = json_data['geometryField'] - -# # Extract the GEOID data -# geoid_data = [feature['attributes']['GEOID'] for feature in json_data['features']] - -# # Create a GeoPandas DataFrame -# df = gpd.GeoDataFrame(pd.DataFrame(geoid_data, columns=['GEOID'])) -# df['geometry'] = gpd.GeoSeries.from_wkt(geometry_data) - -# # Print the resulting GeoPandas DataFrame -# print(df) - - -# Testing the functions -# Removing some variables from the codebook -# I think in all cases you would need the total column, otherwise you would mess up the making of your indicators and thus final scores -# but you would not see that in the results as they will be standardized anyways -# the code works, so that does not really matter - -# lowering the nan-threshold to 3% -# it runs - -# changing the input data now also works. The code can still run if there are varaibles missing or if there is a large amount of Nans -# we now assume that we will still make the indciator if data is missing, as a default. In future versions we might want to chagne this for accuracy purposes -# We now printed the information about missing variables / indicators to lists that can be inspected so that informed decision can be made. + ) \ No newline at end of file diff --git a/tests/test_SVI_exposure.py b/tests/test_SVI_exposure.py index c499f56a..70c4f87a 100644 --- a/tests/test_SVI_exposure.py +++ b/tests/test_SVI_exposure.py @@ -63,6 +63,8 @@ "codebook_fn": "social_vulnerability", "state_abbreviation": "SC", "blockgroup_fn": "blockgroup_shp_data", + "year_data": 2021, + "county": "019" }, }, } @@ -80,25 +82,7 @@ def test_SVI_exposure(case): data_libs = EXAMPLEDIR.joinpath(_cases[case]["data_catalogue"]) fm = FiatModel(root=root, mode="w", data_libs=data_libs, logger=logger) - # Now we will add data from the user to the data catalog. - to_add = { - "blockgroup_shp_data": { - "path": str( - EXAMPLEDIR - / "social_vulnerability" - / "test_blockgroup_shp" - / "tl_2022_45_bg.shp" - ), - "data_type": "GeoDataFrame", - "driver": "vector", - "crs": 4326, - "category": "social_vulnerability", - } - } - region = gpd.GeoDataFrame.from_features(_cases[case]["region"], crs=4326) - - fm.data_catalog.from_dict(to_add) fm.build(region={"geom": region}, opt=_cases[case]["configuration"]) fm.write() diff --git a/tests/test_equity_data.py b/tests/test_equity_data.py new file mode 100644 index 00000000..25395c9e --- /dev/null +++ b/tests/test_equity_data.py @@ -0,0 +1,48 @@ +from hydromt_fiat.fiat import FiatModel +from hydromt.log import setuplog +from pathlib import Path +import pytest +import shutil +import geopandas as gpd + +EXAMPLEDIR = Path( + "P:/11207949-dhs-phaseii-floodadapt/Model-builder/Delft-FIAT/local_test_database" +) +DATADIR = Path().absolute() / "hydromt_fiat" / "data" + +_cases = { + "Test_equity_data": { + "data_catalogue": DATADIR / "hydromt_fiat_catalog_USA.yml", + "folder": "Test_equity_data", + "configuration": { + "setup_global_settings": {"crs": "epsg:4326"}, + "setup_output": { + "output_dir": "output", + "output_csv_name": "output.csv", + "output_vector_name": "spatial.gpkg", + }, + + "setup_equity_data": { + "census_key": "495a349ce22bdb1294b378fb199e4f27e57471a9", + "state_abbreviation": "SC", + "year_data": 2021, + "county": "019" + }, + }, + } +} + +@pytest.mark.parametrize("case", list(_cases.keys())) +# @pytest.mark.skip(reason="Needs to be updated") +def test_equity_data(case): + # Read model in examples folder. + root = EXAMPLEDIR.joinpath(_cases[case]["folder"]) + if root.exists(): + shutil.rmtree(root) + logger = setuplog("hydromt_fiat", log_level=10) + data_libs = EXAMPLEDIR.joinpath(_cases[case]["data_catalogue"]) + fm = FiatModel(root=root, mode="w", data_libs=data_libs, logger=logger) + + + fm.build(opt=_cases[case]["configuration"]) + fm.write()