Skip to content

Commit

Permalink
remove arcgis python library dependency; just use requests
Browse files Browse the repository at this point in the history
  • Loading branch information
jangevaare committed Apr 22, 2024
1 parent 79c5039 commit b6d82a0
Show file tree
Hide file tree
Showing 3 changed files with 78 additions and 35 deletions.
10 changes: 5 additions & 5 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,16 +6,16 @@ This container is part of a data pipeline to automatically retrieve data from t
Container images are built by Github actions, and pushed to Github's container registry. You can find up-to-date built images [here](https://github.com/orgs/WDGPH/packages?repo_name=workflow-WSI).

## Retrieval Container
This container utilizes the [ArcGIS API Python Package](https://developers.arcgis.com/python/guide/install-and-set-up/) to authenticate to ArcGIS online, which is then used to download resources by item ID.
This container downloads ArcGIS Online items from a specified URL.

To use, `ARCGIS_USER` and `ARCGIS_PASSWORD` environment variables must be set for the container (credentials for WSI Data and Visualization Hub). It is strongly suggested that a secure key vault is utilized for this process and that credentials are rotated frequently. Additionally, the following arguments are required:

**1. `item_id`**
ArcGIS Online item id. Changes with addition/removal of features to dataset requiring occasional updates.
**Example**: `1a111aa1a1aa1a1aaaa1a111aa1a1aa1`
**1. `url`**
ArcGIS Online item URL. It changes when features are added to or removed from the dataset, so it requires occasional updates.
**Example**: `https://services6.arcgis.com/ghjer345tert/arcgis/rest/services/PROD_PHU_Base_Aggregated/FeatureServer/0/query`

**2. `output`**
The filename where the output will be written.
The filename where the output will be written.
**Example**: `wsi.csv`

## Pipeline Orchestration
Expand Down
4 changes: 2 additions & 2 deletions retrieval/dockerfile
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
FROM python:3.9-bullseye

RUN pip install --upgrade pip && pip install arcgis==2.0.*
RUN python3 -m pip install --upgrade pip && pip install requests==2.* pandas==2.* --no-cache-dir

COPY getdata.py ./
RUN chmod a+x /getdata.py

ENTRYPOINT ["python", "./getdata.py"]
ENTRYPOINT ["python3", "./getdata.py"]
99 changes: 71 additions & 28 deletions retrieval/getdata.py
Original file line number Diff line number Diff line change
@@ -1,60 +1,103 @@
import argparse
import arcgis
import requests
import pandas as pd
import os
import logging

# Argument parser
def parse_args():
    """Parse the command-line options of the retrieval script.

    Returns:
        argparse.Namespace with two string attributes:
        ``features_url`` (supplied via ``--url``) and ``output``
        (supplied via ``--output``). Both options are required.
    """
    parser = argparse.ArgumentParser(
        description='Extract a feature set from an ArcGIS Online URL')

    parser.add_argument(
        '--url',
        # Store under the name main() uses for this parameter: the entry
        # point calls main(**vars(parse_args())), so the namespace keys must
        # match main's keyword names or the call fails with a TypeError.
        dest='features_url',
        help="ArcGIS Online url for item",
        required=True,
        type=str)

    parser.add_argument(
        '--output',
        help="Filename to write output to",
        required=True,
        type=str)

    return parser.parse_args()


# Seconds to wait on any single ArcGIS HTTP request before giving up
_REQUEST_TIMEOUT = 60


def _pop_required_env(name):
    """Return the value of environment variable *name* and remove it from the environment.

    Raises:
        ValueError: if the variable is not set.
    """
    value = os.getenv(name)
    if value is None:
        raise ValueError(f"{name} environment variable not found.")
    logging.info(f"{name} environment variable found")
    os.environ.pop(name, None)
    return value


def _request_json(method, url, **kwargs):
    """Perform an HTTP request and return its decoded JSON body.

    Raises:
        RuntimeError: on a non-success HTTP status, or when the body contains
            an ``error`` member. ArcGIS REST endpoints report many failures
            (bad credentials, invalid token, bad query) inside an otherwise
            successful HTTP response, so the body has to be inspected too.
    """
    response = requests.request(method, url, timeout=_REQUEST_TIMEOUT, **kwargs)
    if not response.ok:
        # The URL is deliberately left out of the message: the query string
        # may carry the access token.
        raise RuntimeError(
            f"ArcGIS request failed with HTTP status {response.status_code}")

    payload = response.json()
    if 'error' in payload:
        raise RuntimeError(f"ArcGIS request returned an error: {payload['error']}")
    return payload


# Main function to extract and output data from an ArcGIS Online feature layer
def main(features_url: str, output: str) -> None:
    """Download every record behind an ArcGIS query URL and write it as CSV.

    Credentials are read from the ``ARCGIS_USER`` and ``ARCGIS_PASSWORD``
    environment variables, which are removed from the environment once read.

    Args:
        features_url: Query endpoint of the ArcGIS Online item to download.
        output: Filename the CSV is written to.

    Raises:
        ValueError: if either credential environment variable is missing.
        RuntimeError: if token generation or a data request fails.
    """
    # Load credentials and remove environment variables
    username = _pop_required_env('ARCGIS_USER')
    password = _pop_required_env('ARCGIS_PASSWORD')

    logging.info("Generating ArcGIS API token")
    token = _request_json(
        'POST',
        'https://www.arcgis.com/sharing/rest/generateToken',
        data={
            'f': 'json',
            'username': username,
            'password': password,
            'referer': 'https://www.arcgis.com',
            'expiration': 60,  # minutes
        }).get('token')
    if not token:
        raise RuntimeError("ArcGIS token response did not contain a token")

    # Set up pagination
    batch_size = 1000
    offset = 0
    batch_number = 0
    all_records = []

    logging.info(f"Retrieving data in batch sizes of {batch_size} from {features_url} in JSON format")

    while True:
        batch_number += 1
        logging.info(f"Retrieving data batch {batch_number}")

        # Fetch batch of records
        page = _request_json(
            'GET',
            features_url,
            params={
                'f': 'json',
                'where': '1=1',
                'outFields': '*',
                'resultOffset': offset,
                'resultRecordCount': batch_size,
                'token': token,
            })

        features = page.get('features', [])
        all_records.extend(features)

        # Advance by the number of records actually returned: the server may
        # cap a page below batch_size, and stepping by batch_size would then
        # skip the records in between.
        offset += len(features)

        # Stop when the server reports nothing further, or when a page comes
        # back empty (which would otherwise repeat the same request forever).
        if not features or not page.get('exceededTransferLimit', False):
            break

    logging.info("All data retrieved")
    logging.info("Converting JSON to tabular format")
    features = pd.DataFrame([record['attributes'] for record in all_records])

    rows, columns = features.shape
    logging.info(f"Data contains {rows} rows and {columns} columns")

    logging.info(f"Exporting data as {output}")
    features.to_csv(output, index=False)


if __name__ == '__main__':
    # Timestamped, INFO-level logging for the whole run
    logging.basicConfig(
        level=logging.INFO,
        datefmt='%Y-%m-%d %H:%M:%S',
        format='%(asctime)s %(message)s',
    )

    # Forward the parsed command-line options to main() as keyword arguments
    cli_options = vars(parse_args())
    main(**cli_options)

    logging.info("Done")

0 comments on commit b6d82a0

Please sign in to comment.