From 12e35fee7c92b002fb0ffd94460b3263d1965cd5 Mon Sep 17 00:00:00 2001
From: nlebovits
Date: Sat, 19 Oct 2024 14:39:45 -0400
Subject: [PATCH 1/6] remove parquet step

---
 data/src/classes/featurelayer.py | 30 +-----------------------------
 1 file changed, 1 insertion(+), 29 deletions(-)

diff --git a/data/src/classes/featurelayer.py b/data/src/classes/featurelayer.py
index 35251706..e704ce97 100644
--- a/data/src/classes/featurelayer.py
+++ b/data/src/classes/featurelayer.py
@@ -314,35 +314,7 @@ def build_and_publish(self, tiles_file_id_prefix: str) -> None:
         self.centroid_gdf["geometry"] = self.centroid_gdf["geometry"].centroid
         self.centroid_gdf = self.centroid_gdf.to_crs(epsg=4326)
         self.centroid_gdf.to_file(temp_geojson_points, driver="GeoJSON")
-
-        # Load the GeoJSON from the polygons, drop geometry, and save as Parquet
-        gdf_polygons = gpd.read_file(temp_geojson_polygons)
-        df_no_geom = gdf_polygons.drop(columns=["geometry"])
-
-        # Check if the DataFrame has fewer than 25,000 rows
-        num_rows, num_cols = df_no_geom.shape
-        if num_rows < 25000:
-            print(
-                f"Parquet file has {num_rows} rows, which is fewer than 25,000. Skipping upload."
-            )
-            return
-
-        # Save the DataFrame as Parquet
-        df_no_geom.to_parquet(temp_parquet)
-
-        # Upload Parquet to Google Cloud Storage
-        blob_parquet = bucket.blob(f"{tiles_file_id_prefix}.parquet")
-        try:
-            blob_parquet.upload_from_filename(temp_parquet)
-            parquet_size = os.stat(temp_parquet).st_size
-            parquet_size_mb = parquet_size / (1024 * 1024)
-            print(
-                f"Parquet upload successful! Size: {parquet_size} bytes ({parquet_size_mb:.2f} MB), Dimensions: {num_rows} rows, {num_cols} columns."
-            )
-        except Exception as e:
-            print(f"Parquet upload failed: {e}")
-            return
-
+
         # Command for generating PMTiles for points up to zoom level zoom_threshold
         points_command: list[str] = [
             "tippecanoe",

From de7acc820604ce3c8fcf450bf5af20f15c4d5a99 Mon Sep 17 00:00:00 2001
From: nlebovits
Date: Sat, 19 Oct 2024 14:40:12 -0400
Subject: [PATCH 2/6] correctly assign in vs out of landcare

---
 data/src/data_utils/phs_properties.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/data/src/data_utils/phs_properties.py b/data/src/data_utils/phs_properties.py
index e5627850..c906c2d1 100644
--- a/data/src/data_utils/phs_properties.py
+++ b/data/src/data_utils/phs_properties.py
@@ -22,10 +22,10 @@ def phs_properties(primary_featurelayer: FeatureLayer) -> FeatureLayer:
     primary_featurelayer.spatial_join(phs_properties)
 
     # Initialize 'phs_care_program' column with default "no" for all rows
-    primary_featurelayer.gdf["phs_care_program"] = "no"
+    primary_featurelayer.gdf["phs_care_program"] = "No"
 
     # Set 'phs_care_program' to "yes" for matched rows
-    primary_featurelayer.gdf.loc[primary_featurelayer.gdf["phs_care_program"] != "no", "phs_care_program"] = "yes"
+    primary_featurelayer.gdf.loc[primary_featurelayer.gdf["program"].notna(), "phs_care_program"] = "Yes"
 
     # Rebuild the GeoDataFrame after updates
     primary_featurelayer.rebuild_gdf()
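
The fix in PATCH 2 works because the spatial join carries the PHS layer's columns onto matched rows and leaves them NaN elsewhere; the old condition compared "phs_care_program" against the "no" it had just been set to, so it could never match. A minimal sketch of the corrected pattern, using invented sample data and assuming "program" is a column contributed by the join:

    import geopandas as gpd
    import numpy as np
    from shapely.geometry import Point

    # Hypothetical post-join frame: unmatched rows carry NaN in "program"
    gdf = gpd.GeoDataFrame(
        {"program": ["LandCare", np.nan], "geometry": [Point(0, 0), Point(1, 1)]}
    )

    gdf["phs_care_program"] = "No"  # default every row to "No"
    gdf.loc[gdf["program"].notna(), "phs_care_program"] = "Yes"  # flip joined rows

    print(gdf["phs_care_program"].tolist())  # ['Yes', 'No']
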
From c427022fa136fd8614a85967df333e9af3229e2e Mon Sep 17 00:00:00 2001
From: nlebovits
Date: Sat, 19 Oct 2024 14:42:44 -0400
Subject: [PATCH 3/6] move logging of access process distribution

---
 data/src/data_utils/access_process.py | 7 +------
 1 file changed, 1 insertion(+), 6 deletions(-)

diff --git a/data/src/data_utils/access_process.py b/data/src/data_utils/access_process.py
index 039843f1..7c8e79de 100644
--- a/data/src/data_utils/access_process.py
+++ b/data/src/data_utils/access_process.py
@@ -39,10 +39,5 @@ def access_process(dataset: Any) -> Any:
         access_processes.append(access_process)
 
     dataset.gdf["access_process"] = access_processes
-
-    # Print the distribution of "access_process"
-    distribution = dataset.gdf["access_process"].value_counts()
-    print("Distribution of access process:")
-    print(distribution)
-
+
     return dataset

From 07f668e059c5610a39261e78ccda9a3b2a49dc0f Mon Sep 17 00:00:00 2001
From: nlebovits
Date: Sat, 19 Oct 2024 14:43:00 -0400
Subject: [PATCH 4/6] move logging of priority level distribution

---
 data/src/data_utils/priority_level.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/data/src/data_utils/priority_level.py b/data/src/data_utils/priority_level.py
index 613313f2..33097de3 100644
--- a/data/src/data_utils/priority_level.py
+++ b/data/src/data_utils/priority_level.py
@@ -47,4 +47,5 @@ def priority_level(dataset):
         priority_levels.append(priority_level)
 
     dataset.gdf["priority_level"] = priority_levels
+
     return dataset
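
PATCHES 3 and 4 turn the per-step utilities into plain transformations that return the dataset, so the pipeline in script.py (PATCH 6) owns all of the reporting. A minimal sketch of that call pattern, with a hypothetical Dataset class standing in for the project's FeatureLayer and a placeholder constant in place of the real rules:

    import pandas as pd

    class Dataset:
        # Hypothetical stand-in for the project's FeatureLayer wrapper
        def __init__(self, gdf: pd.DataFrame):
            self.gdf = gdf

    def access_process(dataset: Dataset) -> Dataset:
        # The real rules are more involved; a constant keeps the sketch small
        dataset.gdf["access_process"] = "Buy Property"
        return dataset  # returning the dataset lets the caller chain and log

    dataset = access_process(Dataset(pd.DataFrame({"opa_id": ["1", "2"]})))
    print(dataset.gdf["access_process"].value_counts())  # caller-side logging
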
From 2de67f2490bd770c2b6af588556888630ea0a9e8 Mon Sep 17 00:00:00 2001
From: nlebovits
Date: Sat, 19 Oct 2024 14:43:17 -0400
Subject: [PATCH 5/6] drop duplicates early on

---
 data/src/data_utils/vacant_properties.py | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/data/src/data_utils/vacant_properties.py b/data/src/data_utils/vacant_properties.py
index 87a8b6f7..d6573218 100644
--- a/data/src/data_utils/vacant_properties.py
+++ b/data/src/data_utils/vacant_properties.py
@@ -145,13 +145,7 @@ def vacant_properties() -> FeatureLayer:
         vacant_properties.gdf, geometry="geometry"
     )
 
-    print(
-        f"Vacant properties data size before dropping NAs: {len(vacant_properties.gdf)} rows."
-    )
     vacant_properties.gdf.dropna(subset=["opa_id"], inplace=True)
-    print(
-        f"Vacant properties data size after dropping NAs: {len(vacant_properties.gdf)} rows."
-    )
 
     # Final null value check before returning
     check_null_percentage(vacant_properties.gdf)
@@ -184,4 +178,9 @@ def vacant_properties() -> FeatureLayer:
     # Ensure concatenated data is still a GeoDataFrame
     vacant_properties.gdf = gpd.GeoDataFrame(vacant_properties.gdf, geometry="geometry")
 
+    before_drop = vacant_properties.gdf.shape[0]
+    vacant_properties.gdf = vacant_properties.gdf.drop_duplicates(subset="opa_id")
+    after_drop = vacant_properties.gdf.shape[0]
+    print(f"Duplicate vacant properties dropped: {before_drop - after_drop}")
+
     return vacant_properties

From 3d89cc74aeffdb5a233c36a3e2d24c4653b2ebd1 Mon Sep 17 00:00:00 2001
From: nlebovits
Date: Sat, 19 Oct 2024 14:43:42 -0400
Subject: [PATCH 6/6] drop parquet step and add logging for distribution of
 access process and priority level

---
 data/src/script.py | 20 ++++++++++++++++++++
 1 file changed, 20 insertions(+)

diff --git a/data/src/script.py b/data/src/script.py
index 46e1db3b..78c5f90d 100644
--- a/data/src/script.py
+++ b/data/src/script.py
@@ -80,12 +80,32 @@
 for service in services:
     dataset = service(dataset)
 
+before_drop = dataset.gdf.shape[0]
+dataset.gdf = dataset.gdf.drop_duplicates(subset="opa_id")
+after_drop = dataset.gdf.shape[0]
+print(f"Duplicate dataset rows dropped after initial services: {before_drop - after_drop}")
+
 # Add Priority Level
 dataset = priority_level(dataset)
 
+# Print the distribution of "priority_level"
+distribution = dataset.gdf["priority_level"].value_counts()
+print("Distribution of priority level:")
+print(distribution)
+
 # Add Access Process
 dataset = access_process(dataset)
 
+# Print the distribution of "access_process"
+distribution = dataset.gdf["access_process"].value_counts()
+print("Distribution of access process:")
+print(distribution)
+
+before_drop = dataset.gdf.shape[0]
+dataset.gdf = dataset.gdf.drop_duplicates(subset="opa_id")
+after_drop = dataset.gdf.shape[0]
+print(f"Duplicate final dataset rows dropped: {before_drop - after_drop}")
+
 # back up old tiles file whether we are reloading data or not
 if backup is None:
     backup = BackupArchiveDatabase()
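
The count-drop-report sequence introduced in PATCHES 5 and 6 appears three times across the series; a small helper (hypothetical, not part of this PR) shows the pattern in isolation:

    import pandas as pd

    def drop_duplicates_logged(df: pd.DataFrame, subset: str, label: str) -> pd.DataFrame:
        # Dedupe on a key column and report how many rows were removed
        before = df.shape[0]
        deduped = df.drop_duplicates(subset=subset)
        print(f"Duplicate {label} dropped: {before - deduped.shape[0]}")
        return deduped

    # Usage mirroring PATCH 6: dedupe the pipeline output on the OPA key
    df = pd.DataFrame({"opa_id": ["1", "1", "2"]})
    df = drop_duplicates_logged(df, "opa_id", "dataset rows")  # prints a count of 1
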