From b6d82a05592dbdee3306364a68680df73cee0f8c Mon Sep 17 00:00:00 2001 From: jangevaare Date: Mon, 22 Apr 2024 15:52:11 +0000 Subject: [PATCH] remove arcgis python library dependency; just use requests --- README.md | 10 ++--- retrieval/dockerfile | 4 +- retrieval/getdata.py | 99 +++++++++++++++++++++++++++++++------------- 3 files changed, 78 insertions(+), 35 deletions(-) diff --git a/README.md b/README.md index b0e2ee9..4313b06 100644 --- a/README.md +++ b/README.md @@ -6,16 +6,16 @@ This containers is part of a data pipeline to automatically retrieve data from t Container images are built by Github actions, and pushed to Github's container registry. You can find up-to-date built images [here](https://github.com/orgs/WDGPH/packages?repo_name=workflow-WSI). ## Retrieval Container -This container utilizes the [ArcGIS API Python Package](https://developers.arcgis.com/python/guide/install-and-set-up/) to authenticate to ArcGIS online, which is then used to download resources by item ID. +This container downloads ArcGIS Online items from a specified URL. To use, `ARCGIS_USER` and `ARCGIS_PASSWORD` environment variables must be set for the container (credentials for WSI Data and Visualization Hub). It is strongly suggested that a secure key vault is utilized for this process and that credentials are rotated frequently. Additionally, the following arguments are required: -**1. `item_id`** -ArcGIS Online item id. Changes with addition/removal of features to dataset requiring occasional updates. -**Example**: `1a111aa1a1aa1a1aaaa1a111aa1a1aa1` +**1. `url`** +ArcGIS Online item URL. Changes with addition/removal of features to dataset requiring occasional updates. +**Example**: `https://services6.arcgis.com/ghjer345tert/arcgis/rest/services/PROD_PHU_Base_Aggregated/FeatureServer/0/query` **2. `output`** -The filename where the output will be written. +The filename where the output will be written. 
**Example**: `wsi.csv` ## Pipeline Orchestration diff --git a/retrieval/dockerfile b/retrieval/dockerfile index 966e2ea..53315b4 100644 --- a/retrieval/dockerfile +++ b/retrieval/dockerfile @@ -1,8 +1,8 @@ FROM python:3.9-bullseye -RUN pip install --upgrade pip && pip install arcgis==2.0.* +RUN python3 -m pip install --upgrade pip && pip install requests==2.* pandas==2.* --no-cache-dir COPY getdata.py ./ RUN chmod a+x /getdata.py -ENTRYPOINT ["python", "./getdata.py"] \ No newline at end of file +ENTRYPOINT ["python3", "./getdata.py"] \ No newline at end of file diff --git a/retrieval/getdata.py b/retrieval/getdata.py index 7397a9a..be5a209 100644 --- a/retrieval/getdata.py +++ b/retrieval/getdata.py @@ -1,54 +1,98 @@ import argparse -import arcgis +import requests +import pandas as pd import os import logging # Argument parser def parse_args(): parser = argparse.ArgumentParser( - description='Extract a feature set from an ArcGIS Online item') + description='Extract a feature set from an ArcGIS Online URL') parser.add_argument( - 'item_id', - help = "ArcGIS Online item id", + '--url', + help = "ArcGIS Online url for item", + required = True, type = str) parser.add_argument( - 'output', + '--output', help = "Filename to write output to", + required = True, type = str) return parser.parse_args() - -# Main function to extract and output data from ArcGIS Online -def main(item_id, output): - if os.getenv('ARCGIS_USER') is not None: +# Main function to extract and output data from PHO WTISEN +def main(features_url, output): + + # Load credentials and remove environment variables + username = os.getenv('ARCGIS_USER') + if username is not None: logging.info("ARCGIS_USER environment variable found") + os.environ.pop('ARCGIS_USER', None) else: raise ValueError("ARCGIS_USER environment variable not found.") - - if os.getenv('ARCGIS_PASSWORD') is not None: + + password = os.getenv('ARCGIS_PASSWORD') + if password is not None: logging.info("ARCGIS_PASSWORD environment variable 
found") + os.environ.pop('ARCGIS_PASSWORD', None) else: raise ValueError("ARCGIS_PASSWORD environment variable not found.") - logging.info("Connecting to ArcGIS Online") - gis = arcgis.gis.GIS( - username = os.getenv('ARCGIS_USER'), - password = os.getenv('ARCGIS_PASSWORD'), - verify_cert = False) - - logging.info("Logged in to ArcGIS Online as " + str(gis.properties.user.username)) - - logging.info(f"Retrieving {item_id}") - item = gis.content.get(item_id) - - logging.info("Extracting feature set") - feature_set = item.layers[0].query() - - logging.info(f"Outputting feature set to {output}") - feature_set.sdf.to_csv(output) + logging.info("Generating ArcGIS API token") + token = requests.post( + url = 'https://www.arcgis.com/sharing/rest/generateToken', + data = { + 'f': 'json', + 'username': username, + 'password': password, + 'referer': 'https://www.arcgis.com', + 'expiration': 60, # minutes + }).json()['token'] + + # Set up pagination + batch_size = 1000 + offset = 0 + all_records = [] + continue_pagination = True + + logging.info(f"Retrieving data in batch sizes of {batch_size} from {features_url} in JSON format") + + while continue_pagination: + logging.info(f"Retrieving data batch {(offset//batch_size) + 1}") + + # Fetch batch of records + response = requests.get( + url = features_url, + params= { + 'f': 'json', + 'where': '1=1', + 'outFields': '*', + 'resultOffset': offset, + 'resultRecordCount': batch_size, + 'token': token + }).json() + + # Add records to all_records list + all_records.extend(response.get('features', [])) + + # Check if exceededTransferLimit is true to determine if pagination continues + continue_pagination = response.get('exceededTransferLimit', False) + + # Increment offset + offset += batch_size + + logging.info("All data retrieved") + logging.info("Converting JSON to tabular format") + features = pd.DataFrame([record['attributes'] for record in all_records]) + + rows, columns = features.shape + logging.info(f"Data contains {rows} rows 
and {columns} columns") + + logging.info(f"Exporting data as {output}") + features.to_csv(output, index = False) if __name__ == '__main__': @@ -56,5 +100,4 @@ def main(item_id, output): # Parse and unpack keyword arguments main(**vars(parse_args())) - logging.info("Done") \ No newline at end of file