From 12e35fee7c92b002fb0ffd94460b3263d1965cd5 Mon Sep 17 00:00:00 2001
From: nlebovits
Date: Sat, 19 Oct 2024 14:39:45 -0400
Subject: [PATCH 1/6] remove parquet step

---
 data/src/classes/featurelayer.py | 30 +-----------------------------
 1 file changed, 1 insertion(+), 29 deletions(-)

diff --git a/data/src/classes/featurelayer.py b/data/src/classes/featurelayer.py
index 35251706..e704ce97 100644
--- a/data/src/classes/featurelayer.py
+++ b/data/src/classes/featurelayer.py
@@ -314,35 +314,7 @@ def build_and_publish(self, tiles_file_id_prefix: str) -> None:
         self.centroid_gdf["geometry"] = self.centroid_gdf["geometry"].centroid
         self.centroid_gdf = self.centroid_gdf.to_crs(epsg=4326)
         self.centroid_gdf.to_file(temp_geojson_points, driver="GeoJSON")
-
-        # Load the GeoJSON from the polygons, drop geometry, and save as Parquet
-        gdf_polygons = gpd.read_file(temp_geojson_polygons)
-        df_no_geom = gdf_polygons.drop(columns=["geometry"])
-
-        # Check if the DataFrame has fewer than 25,000 rows
-        num_rows, num_cols = df_no_geom.shape
-        if num_rows < 25000:
-            print(
-                f"Parquet file has {num_rows} rows, which is fewer than 25,000. Skipping upload."
-            )
-            return
-
-        # Save the DataFrame as Parquet
-        df_no_geom.to_parquet(temp_parquet)
-
-        # Upload Parquet to Google Cloud Storage
-        blob_parquet = bucket.blob(f"{tiles_file_id_prefix}.parquet")
-        try:
-            blob_parquet.upload_from_filename(temp_parquet)
-            parquet_size = os.stat(temp_parquet).st_size
-            parquet_size_mb = parquet_size / (1024 * 1024)
-            print(
-                f"Parquet upload successful! Size: {parquet_size} bytes ({parquet_size_mb:.2f} MB), Dimensions: {num_rows} rows, {num_cols} columns."
-            )
-        except Exception as e:
-            print(f"Parquet upload failed: {e}")
-            return
-
+
         # Command for generating PMTiles for points up to zoom level zoom_threshold
         points_command: list[str] = [
             "tippecanoe",

From de7acc820604ce3c8fcf450bf5af20f15c4d5a99 Mon Sep 17 00:00:00 2001
From: nlebovits
Date: Sat, 19 Oct 2024 14:40:12 -0400
Subject: [PATCH 2/6] correctly assign in vs out of landcare

---
 data/src/data_utils/phs_properties.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/data/src/data_utils/phs_properties.py b/data/src/data_utils/phs_properties.py
index e5627850..c906c2d1 100644
--- a/data/src/data_utils/phs_properties.py
+++ b/data/src/data_utils/phs_properties.py
@@ -22,10 +22,10 @@ def phs_properties(primary_featurelayer: FeatureLayer) -> FeatureLayer:
     primary_featurelayer.spatial_join(phs_properties)
 
     # Initialize 'phs_care_program' column with default "no" for all rows
-    primary_featurelayer.gdf["phs_care_program"] = "no"
+    primary_featurelayer.gdf["phs_care_program"] = "No"
 
     # Set 'phs_care_program' to "yes" for matched rows
-    primary_featurelayer.gdf.loc[primary_featurelayer.gdf["phs_care_program"] != "no", "phs_care_program"] = "yes"
+    primary_featurelayer.gdf.loc[primary_featurelayer.gdf["program"].notna(), "phs_care_program"] = "Yes"
 
     # Rebuild the GeoDataFrame after updates
     primary_featurelayer.rebuild_gdf()
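
The fix in PATCH 2 works because the spatial join carries the PHS layer's columns onto matched rows and leaves them NaN elsewhere; the old condition compared "phs_care_program" against the "no" it had just been set to, so it could never match. A minimal sketch of the corrected pattern, using invented sample data and assuming "program" is a column contributed by the join:

    import geopandas as gpd
    import numpy as np
    from shapely.geometry import Point

    # Hypothetical post-join frame: unmatched rows carry NaN in "program"
    gdf = gpd.GeoDataFrame(
        {"program": ["LandCare", np.nan], "geometry": [Point(0, 0), Point(1, 1)]}
    )

    gdf["phs_care_program"] = "No"  # default every row to "No"
    gdf.loc[gdf["program"].notna(), "phs_care_program"] = "Yes"  # flip joined rows

    print(gdf["phs_care_program"].tolist())  # ['Yes', 'No']
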
From c427022fa136fd8614a85967df333e9af3229e2e Mon Sep 17 00:00:00 2001
From: nlebovits
Date: Sat, 19 Oct 2024 14:42:44 -0400
Subject: [PATCH 3/6] move logging of access process distribution

---
 data/src/data_utils/access_process.py | 7 +------
 1 file changed, 1 insertion(+), 6 deletions(-)

diff --git a/data/src/data_utils/access_process.py b/data/src/data_utils/access_process.py
index 039843f1..7c8e79de 100644
--- a/data/src/data_utils/access_process.py
+++ b/data/src/data_utils/access_process.py
@@ -39,10 +39,5 @@ def access_process(dataset: Any) -> Any:
         access_processes.append(access_process)
 
     dataset.gdf["access_process"] = access_processes
-
-    # Print the distribution of "access_process"
-    distribution = dataset.gdf["access_process"].value_counts()
-    print("Distribution of access process:")
-    print(distribution)
-
+
     return dataset

From 07f668e059c5610a39261e78ccda9a3b2a49dc0f Mon Sep 17 00:00:00 2001
From: nlebovits
Date: Sat, 19 Oct 2024 14:43:00 -0400
Subject: [PATCH 4/6] move logging of priority level distribution

---
 data/src/data_utils/priority_level.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/data/src/data_utils/priority_level.py b/data/src/data_utils/priority_level.py
index 613313f2..33097de3 100644
--- a/data/src/data_utils/priority_level.py
+++ b/data/src/data_utils/priority_level.py
@@ -47,4 +47,5 @@ def priority_level(dataset):
         priority_levels.append(priority_level)
 
     dataset.gdf["priority_level"] = priority_levels
+
     return dataset
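
PATCHES 3 and 4 turn the per-step utilities into plain transformations that return the dataset, so the pipeline in script.py (PATCH 6) owns all of the reporting. A minimal sketch of that call pattern, with a hypothetical Dataset class standing in for the project's FeatureLayer and a placeholder constant in place of the real rules:

    import pandas as pd

    class Dataset:
        # Hypothetical stand-in for the project's FeatureLayer wrapper
        def __init__(self, gdf: pd.DataFrame):
            self.gdf = gdf

    def access_process(dataset: Dataset) -> Dataset:
        # The real rules are more involved; a constant keeps the sketch small
        dataset.gdf["access_process"] = "Buy Property"
        return dataset  # returning the dataset lets the caller chain and log

    dataset = access_process(Dataset(pd.DataFrame({"opa_id": ["1", "2"]})))
    print(dataset.gdf["access_process"].value_counts())  # caller-side logging
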
From 2de67f2490bd770c2b6af588556888630ea0a9e8 Mon Sep 17 00:00:00 2001
From: nlebovits
Date: Sat, 19 Oct 2024 14:43:17 -0400
Subject: [PATCH 5/6] drop duplicates early on

---
 data/src/data_utils/vacant_properties.py | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/data/src/data_utils/vacant_properties.py b/data/src/data_utils/vacant_properties.py
index 87a8b6f7..d6573218 100644
--- a/data/src/data_utils/vacant_properties.py
+++ b/data/src/data_utils/vacant_properties.py
@@ -145,13 +145,7 @@ def vacant_properties() -> FeatureLayer:
         vacant_properties.gdf, geometry="geometry"
     )
 
-    print(
-        f"Vacant properties data size before dropping NAs: {len(vacant_properties.gdf)} rows."
-    )
     vacant_properties.gdf.dropna(subset=["opa_id"], inplace=True)
-    print(
-        f"Vacant properties data size after dropping NAs: {len(vacant_properties.gdf)} rows."
-    )
 
     # Final null value check before returning
     check_null_percentage(vacant_properties.gdf)
@@ -184,4 +178,9 @@ def vacant_properties() -> FeatureLayer:
     # Ensure concatenated data is still a GeoDataFrame
     vacant_properties.gdf = gpd.GeoDataFrame(vacant_properties.gdf, geometry="geometry")
 
+    before_drop = vacant_properties.gdf.shape[0]
+    vacant_properties.gdf = vacant_properties.gdf.drop_duplicates(subset="opa_id")
+    after_drop = vacant_properties.gdf.shape[0]
+    print(f"Duplicate vacant properties dropped: {before_drop - after_drop}")
+
     return vacant_properties

From 3d89cc74aeffdb5a233c36a3e2d24c4653b2ebd1 Mon Sep 17 00:00:00 2001
From: nlebovits
Date: Sat, 19 Oct 2024 14:43:42 -0400
Subject: [PATCH 6/6] drop parquet step and add logging for distribution of
 access process and priority level

---
 data/src/script.py | 20 ++++++++++++++++++++
 1 file changed, 20 insertions(+)

diff --git a/data/src/script.py b/data/src/script.py
index 46e1db3b..78c5f90d 100644
--- a/data/src/script.py
+++ b/data/src/script.py
@@ -80,12 +80,32 @@
 for service in services:
     dataset = service(dataset)
 
+before_drop = dataset.gdf.shape[0]
+dataset.gdf = dataset.gdf.drop_duplicates(subset="opa_id")
+after_drop = dataset.gdf.shape[0]
+print(f"Duplicate dataset rows dropped after initial services: {before_drop - after_drop}")
+
 # Add Priority Level
 dataset = priority_level(dataset)
 
+# Print the distribution of "priority_level"
+distribution = dataset.gdf["priority_level"].value_counts()
+print("Distribution of priority level:")
+print(distribution)
+
 # Add Access Process
 dataset = access_process(dataset)
 
+# Print the distribution of "access_process"
+distribution = dataset.gdf["access_process"].value_counts()
+print("Distribution of access process:")
+print(distribution)
+
+before_drop = dataset.gdf.shape[0]
+dataset.gdf = dataset.gdf.drop_duplicates(subset="opa_id")
+after_drop = dataset.gdf.shape[0]
+print(f"Duplicate final dataset rows dropped: {before_drop - after_drop}")
+
 # back up old tiles file whether we are reloading data or not
 if backup is None:
     backup = BackupArchiveDatabase()
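
The count-drop-report sequence introduced in PATCHES 5 and 6 appears three times across the series; a small helper (hypothetical, not part of this PR) shows the pattern in isolation:

    import pandas as pd

    def drop_duplicates_logged(df: pd.DataFrame, subset: str, label: str) -> pd.DataFrame:
        # Dedupe on a key column and report how many rows were removed
        before = df.shape[0]
        deduped = df.drop_duplicates(subset=subset)
        print(f"Duplicate {label} dropped: {before - deduped.shape[0]}")
        return deduped

    # Usage mirroring PATCH 6: dedupe the pipeline output on the OPA key
    df = pd.DataFrame({"opa_id": ["1", "1", "2"]})
    df = drop_duplicates_logged(df, "opa_id", "dataset rows")  # prints a count of 1
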