Commit
Merge pull request #49 from datakind/feat/ingestion-free-standing
Feat/ingestion free standing
dividor authored Jun 15, 2024
2 parents 8d0150e + ac2a625 commit 5b049c9
Showing 3 changed files with 106 additions and 7 deletions.
20 changes: 20 additions & 0 deletions README.md
@@ -64,6 +64,26 @@ To run the ingestion module for ingested datasets, so assistants and plugins can

It may take a while!

Note: By default, rerunning the ingestion will not download data if the file already exists locally. To override this, you can run with ...

`python3 ingest.py --force_download`
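Under the hood, this amounts to a guard before each download, along the lines of the sketch below (illustrative only — the function name and exact check are assumptions, not the repo's code):

```python
import os
import urllib.request


def download_if_missing(url: str, filename: str, force_download: bool = False) -> None:
    """Fetch url into filename, skipping files that already exist locally
    unless force_download is set (mirroring the --force_download flag)."""
    if os.path.exists(filename) and not force_download:
        print(f"Skipping {filename}, already downloaded")
        return
    urllib.request.urlretrieve(url, filename)
```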

#### Running ingestion without running the full environment

If you want to *just* download data and not run the full environment, this is possible as follows:

First, set up a conda environment ...

1. Install [miniconda](https://docs.conda.io/en/latest/miniconda.html) by selecting the installer that fits your OS version. Once it is installed, you may have to restart your terminal (close it and open it again)
2. Open a terminal and `cd ingestion`
3. `conda env create -f environment.yml`
4. `conda activate data_recipes`

Then run ingestion in download-only mode ...

5. `python ingest.py --skip_processing --skip_uploading`


### Recipes in OpenAI (or Azure OpenAI) assistants

The above will provide basic data recipes via the plugins architecture. If you want to also explore using Azure or OpenAI assistants, the repo includes an approach where data files are uploaded to the assistant, along with a prompt for it to analyse them.
26 changes: 26 additions & 0 deletions ingestion/environment.yml
@@ -0,0 +1,26 @@
# A Docker environment is provided with Data Recipes, but below you can also find
# a simple conda environment. To use this ...
#
# 1. Install [miniconda](https://docs.conda.io/en/latest/miniconda.html) by selecting the installer that fits your OS version. Once it is installed, you may have to restart your terminal (close it and open it again)
# 2. Open a terminal in this directory
# 3. `conda env create -f environment.yml`
# 4. `conda activate data_recipes`
#
name: data_recipes
dependencies:
- pip
- python=3.11.4
- pip:
- psycopg2_binary==2.9.9
- python-dotenv==1.0.0
- sqlalchemy==2.0.30
- pandas==2.2.2
- datetime==4.3
- matplotlib==3.9.0
- geopandas==0.10.2
- seaborn==0.13.2
- hdx_python_api==6.2.4
- geoalchemy2==0.14.6
- hdx-python-country==3.6.9
- hdx-python-utilities==3.6.6
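As a quick sanity check that the environment resolved correctly, the pinned libraries can be imported directly (an illustrative snippet, not part of this commit):

```python
# Run inside the activated data_recipes environment; any ImportError
# means the environment above did not build as pinned.
import geoalchemy2
import geopandas
import pandas
import psycopg2
import sqlalchemy

print("pandas", pandas.__version__)
print("geopandas", geopandas.__version__)
print("sqlalchemy", sqlalchemy.__version__)
```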

67 changes: 60 additions & 7 deletions ingestion/ingest.py
@@ -1,3 +1,4 @@
import argparse
import json
import os
import re
@@ -107,10 +108,12 @@ def download_openapi_data(
    limit = 1000
    offset = 0

    skip_delete_files = ["openapi.json", "processed_data", ".gitkeep"]

    files = os.listdir(save_path)
    if skip_downloaded is False:
        for f in files:
            if "openapi.json" not in f:
            if f not in skip_delete_files:
                filename = f"{save_path}/{f}"
                os.remove(filename)
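Read on its own, the cleanup logic above amounts to something like this standalone sketch (a minimal reconstruction for illustration; the real function does more):

```python
import os

# Files that must survive a cleanup: the API spec, processed output and
# the .gitkeep placeholder.
SKIP_DELETE_FILES = ["openapi.json", "processed_data", ".gitkeep"]


def clear_downloads(save_path: str, skip_downloaded: bool = False) -> None:
    """Delete previously downloaded files so they are fetched fresh,
    unless skip_downloaded asks to keep local copies."""
    if skip_downloaded:
        return
    for f in os.listdir(save_path):
        if f not in SKIP_DELETE_FILES:
            os.remove(f"{save_path}/{f}")
```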

@@ -403,12 +406,14 @@ def map_field_names(df, field_map):
    return df


def main(skip_downloaded=False):
def main(skip_downloaded=True, process_data=True, upload_data=True):
    """
    Main function for data ingestion.

    Args:
        skip_downloaded (bool, optional): Flag to skip downloaded data. Defaults to False.
        skip_downloaded (bool): If True, skip downloading data that already exists locally.
        process_data (bool): If True, process and normalize downloaded data according to ingest.config.
        upload_data (bool): If True, upload the processed data to the database.
    """
    apis, field_map, standard_names = read_integration_config(INTEGRATION_CONFIG)
    conn = connect_to_db()
@@ -451,10 +456,12 @@ def main(skip_downloaded=False):
        )

        # Standardize column names
        process_openapi_data(api_name, save_path, field_map, standard_names)
        if process_data is True:
            process_openapi_data(api_name, save_path, field_map, standard_names)

        # Upload CSV files to the database, with supporting metadata
        save_openapi_data(f"{save_path}/processed", conn, api_name)
        if upload_data is True:
            save_openapi_data(f"{save_path}/processed", conn, api_name)

    # Download shapefiles from HDX. Note, this also standardizes column names
    download_hdx_boundaries(
@@ -466,8 +473,54 @@
    )

    # Upload shapefiles to the database
    upload_hdx_shape_files("./api/hdx/", conn)
    if upload_data is True:
        upload_hdx_shape_files("./api/hdx/", conn)


if __name__ == "__main__":
    main(skip_downloaded=True)

    # Get command line parameters: --force_download, --skip_processing, --skip_uploading

    parser = argparse.ArgumentParser(
        description="Run data recipes ingestion process. See ingest.config for the configured interfaces"
    )

    parser.add_argument(
        "--force_download",
        action="store_true",
        help="Force the re-download of data that already exists locally",
    )

    parser.add_argument(
        "--skip_processing",
        action="store_true",
        help="Skip any processing of downloaded data according to ingest.config",
    )

    parser.add_argument(
        "--skip_uploading",
        action="store_true",
        help="Skip uploading the processed data to the database",
    )

    args = parser.parse_args()

    skip_downloaded = True
    process_data = True
    upload_data = True

    if args.force_download:
        print("Forcing re-download of data, even if it exists locally")
        skip_downloaded = False
    if args.skip_processing:
        print("Won't post-process data")
        process_data = False
    if args.skip_uploading:
        print("Won't upload data")
        upload_data = False

    main(
        skip_downloaded=skip_downloaded,
        process_data=process_data,
        upload_data=upload_data,
    )
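Because `main()` now exposes these switches directly, the download-only mode from the README is also available programmatically, e.g. (a hedged usage sketch, assuming `ingest.py` is importable from the `ingestion` directory):

```python
# Equivalent of `python ingest.py --skip_processing --skip_uploading`.
import ingest

ingest.main(
    skip_downloaded=True,   # keep files that already exist locally
    process_data=False,     # --skip_processing
    upload_data=False,      # --skip_uploading
)
```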
