Lebovits/fix phs and duplicates #966

Merged
merged 6 commits on Oct 19, 2024
30 changes: 1 addition & 29 deletions data/src/classes/featurelayer.py
@@ -314,35 +314,7 @@ def build_and_publish(self, tiles_file_id_prefix: str) -> None:
self.centroid_gdf["geometry"] = self.centroid_gdf["geometry"].centroid
self.centroid_gdf = self.centroid_gdf.to_crs(epsg=4326)
self.centroid_gdf.to_file(temp_geojson_points, driver="GeoJSON")

# Load the GeoJSON from the polygons, drop geometry, and save as Parquet
gdf_polygons = gpd.read_file(temp_geojson_polygons)
df_no_geom = gdf_polygons.drop(columns=["geometry"])

# Check if the DataFrame has fewer than 25,000 rows
num_rows, num_cols = df_no_geom.shape
if num_rows < 25000:
print(
f"Parquet file has {num_rows} rows, which is fewer than 25,000. Skipping upload."
)
return

# Save the DataFrame as Parquet
df_no_geom.to_parquet(temp_parquet)

# Upload Parquet to Google Cloud Storage
blob_parquet = bucket.blob(f"{tiles_file_id_prefix}.parquet")
try:
blob_parquet.upload_from_filename(temp_parquet)
parquet_size = os.stat(temp_parquet).st_size
parquet_size_mb = parquet_size / (1024 * 1024)
print(
f"Parquet upload successful! Size: {parquet_size} bytes ({parquet_size_mb:.2f} MB), Dimensions: {num_rows} rows, {num_cols} columns."
)
except Exception as e:
print(f"Parquet upload failed: {e}")
return


# Command for generating PMTiles for points up to zoom level zoom_threshold
points_command: list[str] = [
"tippecanoe",
7 changes: 1 addition & 6 deletions data/src/data_utils/access_process.py
@@ -39,10 +39,5 @@ def access_process(dataset: Any) -> Any:
access_processes.append(access_process)

dataset.gdf["access_process"] = access_processes

# Print the distribution of "access_process"
distribution = dataset.gdf["access_process"].value_counts()
print("Distribution of access process:")
print(distribution)


return dataset
4 changes: 2 additions & 2 deletions data/src/data_utils/phs_properties.py
@@ -22,10 +22,10 @@ def phs_properties(primary_featurelayer: FeatureLayer) -> FeatureLayer:
primary_featurelayer.spatial_join(phs_properties)

# Initialize 'phs_care_program' column with default "no" for all rows
primary_featurelayer.gdf["phs_care_program"] = "no"
primary_featurelayer.gdf["phs_care_program"] = "No"

# Set 'phs_care_program' to "yes" for matched rows
primary_featurelayer.gdf.loc[primary_featurelayer.gdf["phs_care_program"] != "no", "phs_care_program"] = "yes"
primary_featurelayer.gdf.loc[primary_featurelayer.gdf["program"].notna(), "phs_care_program"] = "Yes"

# Rebuild the GeoDataFrame after updates
primary_featurelayer.rebuild_gdf()
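The fix here flags a parcel only when the spatial join actually found a PHS property: every row has just been initialized to "No", so the old check (phs_care_program != "no") could never match anything, whereas the joined "program" column is non-null exactly for matched rows. A minimal sketch of the corrected behavior, using made-up opa_id and program values in a plain pandas DataFrame:

# Illustrative sketch only (fabricated rows, not the project's pipeline): after a
# left spatial join, unmatched rows carry NaN in "program", so notna() is what
# separates PHS-maintained parcels from the rest.
import numpy as np
import pandas as pd

gdf = pd.DataFrame(
    {
        "opa_id": ["351001", "351002", "351003"],
        "program": ["LandCare", np.nan, "LandCare"],  # value copied in by the join
    }
)

gdf["phs_care_program"] = "No"
gdf.loc[gdf["program"].notna(), "phs_care_program"] = "Yes"
print(gdf[["opa_id", "phs_care_program"]])  # only the unmatched middle row stays "No"
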
1 change: 1 addition & 0 deletions data/src/data_utils/priority_level.py
@@ -47,4 +47,5 @@ def priority_level(dataset):
priority_levels.append(priority_level)

dataset.gdf["priority_level"] = priority_levels

return dataset
11 changes: 5 additions & 6 deletions data/src/data_utils/vacant_properties.py
@@ -145,13 +145,7 @@ def vacant_properties() -> FeatureLayer:
vacant_properties.gdf, geometry="geometry"
)

print(
f"Vacant properties data size before dropping NAs: {len(vacant_properties.gdf)} rows."
)
vacant_properties.gdf.dropna(subset=["opa_id"], inplace=True)
print(
f"Vacant properties data size after dropping NAs: {len(vacant_properties.gdf)} rows."
)

# Final null value check before returning
check_null_percentage(vacant_properties.gdf)
@@ -184,4 +178,9 @@ def vacant_properties() -> FeatureLayer:
# Ensure concatenated data is still a GeoDataFrame
vacant_properties.gdf = gpd.GeoDataFrame(vacant_properties.gdf, geometry="geometry")

before_drop = vacant_properties.gdf.shape[0]
vacant_properties.gdf = vacant_properties.gdf.drop_duplicates(subset="opa_id")
after_drop = vacant_properties.gdf.shape[0]
print(f"Duplicate vacant properties dropped: {before_drop - after_drop}")

return vacant_properties
20 changes: 20 additions & 0 deletions data/src/script.py
@@ -80,12 +80,32 @@
for service in services:
dataset = service(dataset)

before_drop = dataset.gdf.shape[0]
dataset.gdf = dataset.gdf.drop_duplicates(subset="opa_id")
after_drop = dataset.gdf.shape[0]
print(f"Duplicate dataset rows dropped after initial services: {before_drop - after_drop}")

# Add Priority Level
dataset = priority_level(dataset)

# Print the distribution of "priority_level"
distribution = dataset.gdf["priority_level"].value_counts()
print("Distribution of priority level:")
print(distribution)

# Add Access Process
dataset = access_process(dataset)

# Print the distribution of "access_process"
distribution = dataset.gdf["access_process"].value_counts()
print("Distribution of access process:")
print(distribution)

before_drop = dataset.gdf.shape[0]
dataset.gdf = dataset.gdf.drop_duplicates(subset="opa_id")
after_drop = dataset.gdf.shape[0]
print(f"Duplicate final dataset rows droppeds: {before_drop - after_drop}")

# back up old tiles file whether we are reloading data or not
if backup is None:
backup = BackupArchiveDatabase()
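The drop_duplicates-and-report step now appears twice in script.py and once in vacant_properties.py. A small helper could express it once; the sketch below is hypothetical (drop_opa_duplicates is not part of this PR) and assumes every frame carries an opa_id column:

# Hypothetical helper, not in this PR: factor out the repeated dedup-and-report step.
import pandas as pd

def drop_opa_duplicates(df: pd.DataFrame, label: str) -> pd.DataFrame:
    """Drop rows that share an opa_id and report how many were removed."""
    before = df.shape[0]
    deduped = df.drop_duplicates(subset="opa_id")
    print(f"Duplicate {label} rows dropped: {before - deduped.shape[0]}")
    return deduped

# Usage would look like: dataset.gdf = drop_opa_duplicates(dataset.gdf, "dataset")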