Skip to content
This repository has been archived by the owner on Aug 4, 2023. It is now read-only.

Commit

Permalink
Remove mutable parameters in provider api scripts (#100)
Browse files: browse the repository at this point in the history
  • Loading branch information
obulat authored Jun 23, 2021
1 parent cd159cc commit 5351b5e
Show file tree
Hide file tree
Showing 14 changed files with 101 additions and 51 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
delay_request = DelayedRequester(delay=DELAY)
image_store = ImageStore(provider=PROVIDER)

DEFAULT_QUERY_PARAM = {
DEFAULT_QUERY_PARAMS = {
'cc': '1',
'has_image': '1',
'limit': LIMIT,
Expand Down Expand Up @@ -48,8 +48,10 @@ def main():


def _build_query_param(offset=0,
default_query_param=DEFAULT_QUERY_PARAM
default_query_param=None
):
if default_query_param is None:
default_query_param = DEFAULT_QUERY_PARAMS
query_param = default_query_param.copy()
query_param.update(
skip=offset
Expand Down
5 changes: 3 additions & 2 deletions src/cc_catalog_airflow/dags/provider_api_scripts/europeana.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,6 @@
format='%(asctime)s - %(name)s - %(levelname)s: %(message)s',
level=logging.INFO
)

logger = logging.getLogger(__name__)

DELAY = 30.0
Expand Down Expand Up @@ -256,8 +255,10 @@ def _build_query_param_dict(
end_timestamp,
cursor,
api_key=API_KEY,
default_query_param=DEFAULT_QUERY_PARAMS,
default_query_param=None,
):
if default_query_param is None:
default_query_param = DEFAULT_QUERY_PARAMS
query_param_dict = default_query_param.copy()
query_param_dict.update(
wskey=api_key,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,9 @@ def _get_object_list(building, endpoint=ENDPOINT, retries=RETRIES):
return


def _build_params(building, default_params=DEFAULT_QUERY_PARAMS, page=1):
def _build_params(building, default_params=None, page=1):
if default_params is None:
default_params = DEFAULT_QUERY_PARAMS
query_params = default_params.copy()
query_params.update(
{
Expand Down
13 changes: 9 additions & 4 deletions src/cc_catalog_airflow/dags/provider_api_scripts/flickr.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,6 @@
format='%(asctime)s - %(name)s - %(levelname)s: %(message)s',
level=logging.INFO
)

logger = logging.getLogger(__name__)

DELAY = 1.0
Expand Down Expand Up @@ -206,10 +205,14 @@ def _build_query_param_dict(
cur_page,
date_type,
api_key=API_KEY,
license_info=LICENSE_INFO,
license_info=None,
limit=LIMIT,
default_query_param=DEFAULT_QUERY_PARAMS,
default_query_param=None,
):
if license_info is None:
license_info = LICENSE_INFO.copy()
if default_query_param is None:
default_query_param = DEFAULT_QUERY_PARAMS
query_param_dict = default_query_param.copy()
query_param_dict.update(
{
Expand Down Expand Up @@ -329,7 +332,9 @@ def _get_image_url(image_data):
return None, None, None


def _get_license(license_id, license_info=LICENSE_INFO):
def _get_license(license_id, license_info=None):
if license_info is None:
license_info = LICENSE_INFO.copy()
license_id = str(license_id)

if license_id not in license_info:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ def main(date=None):
_extract_the_data(fetch_the_object_id[1])

total_images = image_store.commit()
logger.info(f'Total CC0 images recieved {total_images}')
logger.info(f'Total CC0 images received {total_images}')


def _get_object_ids(date, endpoint=ENDPOINT):
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,6 @@
format='%(asctime)s - %(name)s - %(levelname)s: %(message)s',
level=logging.INFO
)

logger = logging.getLogger(__name__)

LIMIT = 100
Expand All @@ -24,7 +23,7 @@
"Accept": "application/json"
}

DEFAULT_QUERY_PARAM = {
DEFAULT_QUERY_PARAMS = {
"has_image": "yes",
"perpage": LIMIT,
"imagelicence": "cc by",
Expand Down Expand Up @@ -66,17 +65,22 @@ def main():


def _get_query_params(
query_params=DEFAULT_QUERY_PARAM, license_type="cc by", page=0
default_query_params=None, license_type="cc by", page=0
):
if default_query_params is None:
default_query_params = DEFAULT_QUERY_PARAMS
query_params = default_query_params.copy()
query_params["imagelicence"] = license_type
query_params["page"] = page
return query_params


def _get_batch_objects(
endpoint=ENDPOINT, params=None,
headers=HEADERS, retries=RETRIES
headers=None, retries=RETRIES
):
if headers is None:
headers = HEADERS.copy()
data = None
for retry in range(retries):
response = delay_request.get(
Expand Down
24 changes: 17 additions & 7 deletions src/cc_catalog_airflow/dags/provider_api_scripts/nypl.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@
delay_request = DelayedRequester(delay=DELAY)
image_store = ImageStore(provider=PROVIDER)

DEFAULT_QUERY_PARAM = {
DEFAULT_QUERY_PARAMS = {
"q": "CC_0",
"field": "use_rtxt_s",
"page": 1,
Expand Down Expand Up @@ -65,20 +65,24 @@ def main():


def _get_query_param(
default_query_param=DEFAULT_QUERY_PARAM,
default_query_params=None,
page=1,
):
query_param = default_query_param
if default_query_params is None:
default_query_params = DEFAULT_QUERY_PARAMS
query_param = default_query_params.copy()
query_param["page"] = page
return query_param


def _request_handler(
endpoint=BASE_ENDPOINT,
params=None,
headers=HEADERS,
headers=None,
retries=RETRIES
):
if headers is None:
headers = HEADERS.copy()
results = None
for retry in range(retries):
response = delay_request.get(
Expand Down Expand Up @@ -133,11 +137,13 @@ def _handle_results(results):


def _get_capture_details(
captures=[],
captures=None,
metadata=None,
creator=None,
title=None
):
if captures is None:
captures = []
for img in captures:
image_id = img.get("imageID", {}).get("$")
if image_id is None:
Expand Down Expand Up @@ -190,9 +196,13 @@ def _get_creators(creatorinfo):

def _get_images(
images,
image_url_dimensions=IMAGE_URL_DIMENSIONS,
thumbnail_dimensions=THUMBNAIL_DIMENSIONS
image_url_dimensions=None,
thumbnail_dimensions=None
):
if thumbnail_dimensions is None:
thumbnail_dimensions = THUMBNAIL_DIMENSIONS
if image_url_dimensions is None:
image_url_dimensions = IMAGE_URL_DIMENSIONS
image_type = {
parse_qs(urlparse(img.get("$")).query)['t'][0]: img.get("$")
for img in images
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,6 @@
format="%(asctime)s - %(name)s - %(levelname)s: %(message)s",
level=logging.INFO,
)

logger = logging.getLogger(__name__)

delayed_requester = DelayedRequester(DELAY)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@
"Accept": "application/json"
}

DEFAULT_QUERY_PARAM = {
DEFAULT_QUERY_PARAMS = {
"has_image": 1,
"image_license": "CC",
"page[size]": LIMIT,
Expand Down Expand Up @@ -94,8 +94,10 @@ def _get_query_param(
page_number=0,
from_year=0,
to_year=1500,
default_query_param=DEFAULT_QUERY_PARAM
default_query_param=None
):
if default_query_param is None:
default_query_param = DEFAULT_QUERY_PARAMS
query_param = default_query_param.copy()
query_param["page[number]"] = page_number
query_param["date[from]"] = from_year
Expand All @@ -105,10 +107,12 @@ def _get_query_param(

def _get_batch_objects(
endpoint=ENDPOINT,
headers=HEADERS,
headers=None,
retries=RETRIES,
query_param=None
):
if headers is None:
headers = HEADERS.copy()
data = None
for retry in range(retries):
response = delay_request.get(
Expand Down
33 changes: 22 additions & 11 deletions src/cc_catalog_airflow/dags/provider_api_scripts/smithsonian.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,10 @@
from util.loader import provider_details as prov

logger = logging.getLogger(__name__)
logging.basicConfig(
format='%(asctime)s - %(name)s - %(levelname)s: %(message)s',
level=logging.INFO
)

API_KEY = os.getenv('DATA_GOV_API_KEY')
DELAY = 5.0
Expand Down Expand Up @@ -104,7 +108,7 @@
'description (brief spanish)', 'gallery label',
'exhibition label', 'luce center label',
'publication label', 'new acquisition label'}
TAG_TYPES = ['date', 'object_type', 'topic', 'place']
TAG_TYPES = ('date', 'object_type', 'topic', 'place')

image_store = ImageStore(provider=PROVIDER)
delayed_requester = DelayedRequester(delay=DELAY)
Expand All @@ -127,7 +131,7 @@ def main():

def gather_samples(
units_endpoint=UNITS_ENDPOINT,
default_params=DEFAULT_PARAMS,
default_query_params=None,
target_dir='/tmp'
):
"""
Expand All @@ -138,13 +142,16 @@ def gather_samples(
This function is for gathering test data only, and is untested.
"""
if default_query_params is None:
default_query_params = DEFAULT_PARAMS
query_params = default_query_params.copy()
now_str = datetime.strftime(datetime.now(), '%Y%m%d%H%M%S')
sample_dir = os.path.join(target_dir, f'si_samples_{now_str}')
logger.info(f'Creating sample_dir {sample_dir}')
os.mkdir(sample_dir)
unit_code_json = delayed_requester.get_response_json(
units_endpoint,
query_params=default_params
query_params=query_params
)
unit_code_list = unit_code_json.get('response', {}).get('terms', [])
logger.info(f'found unit codes: {unit_code_list}')
Expand Down Expand Up @@ -233,9 +240,11 @@ def _process_hash_prefix(
def _build_query_params(
row_offset,
hash_prefix=None,
default_params=DEFAULT_PARAMS,
default_params=None,
unit_code=None
):
if default_params is None:
default_params = DEFAULT_PARAMS
query_params = default_params.copy()
query_string = 'online_media_type:Images AND media_usage:CC0'
if hash_prefix is not None:
Expand Down Expand Up @@ -291,7 +300,9 @@ def _get_title(row):
return row.get('title')


def _get_creator(row, creator_types=CREATOR_TYPES):
def _get_creator(row, creator_types=None):
if creator_types is None:
creator_types = CREATOR_TYPES.copy()
freetext = _get_freetext_dict(row)
indexed_structured = _get_indexed_structured_dict(row)
ordered_freetext_creator_objects = sorted(
Expand Down Expand Up @@ -335,7 +346,9 @@ def _get_creator(row, creator_types=CREATOR_TYPES):
return creator


def _extract_meta_data(row, description_types=DESCRIPTION_TYPES):
def _extract_meta_data(row, description_types=None):
if description_types is None:
description_types = DESCRIPTION_TYPES.copy()
freetext = _get_freetext_dict(row)
descriptive_non_repeating = _get_descriptive_non_repeating_dict(row)
description = ''
Expand Down Expand Up @@ -372,7 +385,9 @@ def _extract_source(meta_data, sub_providers=SUB_PROVIDERS):
return source


def _extract_tags(row, tag_types=TAG_TYPES):
def _extract_tags(row, tag_types=None):
if tag_types is None:
tag_types = TAG_TYPES
indexed_structured = _get_indexed_structured_dict(row)
tag_lists_generator = (
_check_type(indexed_structured.get(key), list) for key in tag_types
Expand Down Expand Up @@ -471,8 +486,4 @@ def _process_image_list(


if __name__ == '__main__':
logging.basicConfig(
format='%(asctime)s - %(name)s - %(levelname)s: %(message)s',
level=logging.INFO
)
main()
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@
delay_request = DelayedRequester(delay=DELAY)
image_store = ImageStore(provider=PROVIDER)

DEFAULT_QUERY_PARAM = {
DEFAULT_QUERY_PARAMS = {
"keys": "*",
"filters": "[has_image:true],[public_domain:true]",
"offset": 0,
Expand Down Expand Up @@ -57,8 +57,10 @@ def main():

def _get_query_param(
offset=0,
default_query_param=DEFAULT_QUERY_PARAM
default_query_param=None
):
if default_query_param is None:
default_query_param = DEFAULT_QUERY_PARAMS
query_params = default_query_param.copy()
query_params.update(
offset=offset
Expand All @@ -69,9 +71,11 @@ def _get_query_param(
def _get_batch_items(
endpoint=ENDPOINT,
query_params=None,
headers=HEADERS,
headers=None,
retries=RETRIES
):
if headers is None:
headers = HEADERS.copy()
items = None
for retry in range(retries):
response = delay_request.get(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -10,8 +10,7 @@
logger = logging.getLogger(__name__)

logging.basicConfig(
format='{asctime} - {name} - {levelname}: {message}',
style='{',
format='%(asctime)s - %(name)s - %(levelname)s: %(message)s',
level=logging.DEBUG
)

Expand Down
Loading

0 comments on commit 5351b5e

Please sign in to comment.