Skip to content
This repository has been archived by the owner on Aug 4, 2023. It is now read-only.

Remove mutable parameters in provider api scripts #100

Merged
merged 8 commits into from
Jun 23, 2021
Merged
Show file tree
Hide file tree
Changes from 7 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@
"api_key": API_KEY
}

- DEFAULT_QUERY_PARAM = {
+ DEFAULT_QUERY_PARAMS = {
"has_images": 1,
"rights_type_permissive": 1,
"limit": LIMIT,
Expand Down Expand Up @@ -55,19 +55,23 @@ def main():

def _get_query_param(
offset=0,
- default_query_param=DEFAULT_QUERY_PARAM
+ default_query_param=None
):
+ if default_query_param is None:
+ default_query_param = DEFAULT_QUERY_PARAMS
query_param = default_query_param.copy()
query_param.update(offset=offset)
return query_param


def _get_object_json(
- headers=HEADERS,
+ headers=None,
endpoint=ENDPOINT,
retries=RETRIES,
query_param=None
):
+ if headers is None:
+ headers = HEADERS.copy()
for tries in range(retries):
response = delay_request.get(
endpoint,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
delay_request = DelayedRequester(delay=DELAY)
image_store = ImageStore(provider=PROVIDER)

- DEFAULT_QUERY_PARAM = {
+ DEFAULT_QUERY_PARAMS = {
'cc': '1',
'has_image': '1',
'limit': LIMIT,
Expand Down Expand Up @@ -48,8 +48,10 @@ def main():


def _build_query_param(offset=0,
- default_query_param=DEFAULT_QUERY_PARAM
+ default_query_param=None
):
+ if default_query_param is None:
+ default_query_param = DEFAULT_QUERY_PARAMS
query_param = default_query_param.copy()
query_param.update(
skip=offset
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,6 @@
format='%(asctime)s - %(name)s - %(levelname)s: %(message)s',
level=logging.INFO
)

logger = logging.getLogger(__name__)

DELAY = 30.0
Expand Down Expand Up @@ -255,8 +254,10 @@ def _build_query_param_dict(
end_timestamp,
cursor,
api_key=API_KEY,
- default_query_param=DEFAULT_QUERY_PARAMS,
+ default_query_param=None,
):
+ if default_query_param is None:
+ default_query_param = DEFAULT_QUERY_PARAMS
query_param_dict = default_query_param.copy()
query_param_dict.update(
wskey=api_key,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,9 @@ def _get_object_list(building, endpoint=ENDPOINT, retries=RETRIES):
return


- def _build_params(building, default_params=DEFAULT_QUERY_PARAMS, page=1):
+ def _build_params(building, default_params=None, page=1):
+ if default_params is None:
+ default_params = DEFAULT_QUERY_PARAMS
query_params = default_params.copy()
query_params.update(
{
Expand Down
13 changes: 9 additions & 4 deletions src/cc_catalog_airflow/dags/provider_api_scripts/flickr.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,6 @@
format='%(asctime)s - %(name)s - %(levelname)s: %(message)s',
level=logging.INFO
)

logger = logging.getLogger(__name__)

DELAY = 1.0
Expand Down Expand Up @@ -205,10 +204,14 @@ def _build_query_param_dict(
cur_page,
date_type,
api_key=API_KEY,
- license_info=LICENSE_INFO,
+ license_info=None,
limit=LIMIT,
- default_query_param=DEFAULT_QUERY_PARAMS,
+ default_query_param=None,
):
+ if license_info is None:
+ license_info = LICENSE_INFO.copy()
+ if default_query_param is None:
+ default_query_param = DEFAULT_QUERY_PARAMS
query_param_dict = default_query_param.copy()
query_param_dict.update(
{
Expand Down Expand Up @@ -328,7 +331,9 @@ def _get_image_url(image_data):
return None, None, None


- def _get_license(license_id, license_info=LICENSE_INFO):
+ def _get_license(license_id, license_info=None):
+ if license_info is None:
+ license_info = LICENSE_INFO.copy()
license_id = str(license_id)

if license_id not in license_info:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ def main(date=None):
_extract_the_data(fetch_the_object_id[1])

total_images = image_store.commit()
- logger.info(f'Total CC0 images recieved {total_images}')
+ logger.info(f'Total CC0 images received {total_images}')


def _get_object_ids(date, endpoint=ENDPOINT):
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,6 @@
format='%(asctime)s - %(name)s - %(levelname)s: %(message)s',
level=logging.INFO
)

logger = logging.getLogger(__name__)

LIMIT = 100
Expand All @@ -24,7 +23,7 @@
"Accept": "application/json"
}

- DEFAULT_QUERY_PARAM = {
+ DEFAULT_QUERY_PARAMS = {
"has_image": "yes",
"perpage": LIMIT,
"imagelicence": "cc by",
Expand Down Expand Up @@ -66,17 +65,22 @@ def main():


def _get_query_params(
- query_params=DEFAULT_QUERY_PARAM, license_type="cc by", page=0
+ default_query_params=None, license_type="cc by", page=0
):
+ if default_query_params is None:
+ default_query_params = DEFAULT_QUERY_PARAMS
+ query_params = default_query_params.copy()
query_params["imagelicence"] = license_type
query_params["page"] = page
return query_params


def _get_batch_objects(
endpoint=ENDPOINT, params=None,
- headers=HEADERS, retries=RETRIES
+ headers=None, retries=RETRIES
):
+ if headers is None:
+ headers = HEADERS.copy()
for retry in range(retries):
response = delay_request.get(
endpoint,
Expand Down
24 changes: 17 additions & 7 deletions src/cc_catalog_airflow/dags/provider_api_scripts/nypl.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@
delay_request = DelayedRequester(delay=DELAY)
image_store = ImageStore(provider=PROVIDER)

- DEFAULT_QUERY_PARAM = {
+ DEFAULT_QUERY_PARAMS = {
"q": "CC_0",
"field": "use_rtxt_s",
"page": 1,
Expand Down Expand Up @@ -65,20 +65,24 @@ def main():


def _get_query_param(
- default_query_param=DEFAULT_QUERY_PARAM,
+ default_query_params=None,
page=1,
):
- query_param = default_query_param
+ if default_query_params is None:
+ default_query_params = DEFAULT_QUERY_PARAMS
+ query_param = default_query_params.copy()
query_param["page"] = page
return query_param


def _request_handler(
endpoint=BASE_ENDPOINT,
params=None,
- headers=HEADERS,
+ headers=None,
retries=RETRIES
):
+ if headers is None:
+ headers = HEADERS.copy()
results = None
for retry in range(retries):
response = delay_request.get(
Expand Down Expand Up @@ -133,11 +137,13 @@ def _handle_results(results):


def _get_capture_details(
- captures=[],
+ captures=None,
metadata=None,
creator=None,
title=None
):
+ if captures is None:
+ captures = []
for img in captures:
image_id = img.get("imageID", {}).get("$")
if image_id is None:
Expand Down Expand Up @@ -190,9 +196,13 @@ def _get_creators(creatorinfo):

def _get_images(
images,
- image_url_dimensions=IMAGE_URL_DIMENSIONS,
- thumbnail_dimensions=THUMBNAIL_DIMENSIONS
+ image_url_dimensions=None,
+ thumbnail_dimensions=None
):
+ if thumbnail_dimensions is None:
+ thumbnail_dimensions = THUMBNAIL_DIMENSIONS
+ if image_url_dimensions is None:
+ image_url_dimensions = IMAGE_URL_DIMENSIONS
image_url, thumbnail_url = None, None
image_type = {
parse_qs(urlparse(img.get("$")).query)['t'][0]: img.get("$")
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,6 @@
format="%(asctime)s - %(name)s - %(levelname)s: %(message)s",
level=logging.INFO,
)

logger = logging.getLogger(__name__)

delayed_requester = DelayedRequester(DELAY)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@
"Accept": "application/json"
}

- DEFAULT_QUERY_PARAM = {
+ DEFAULT_QUERY_PARAMS = {
"has_image": 1,
"image_license": "CC",
"page[size]": LIMIT,
Expand Down Expand Up @@ -94,8 +94,10 @@ def _get_query_param(
page_number=0,
from_year=0,
to_year=1500,
- default_query_param=DEFAULT_QUERY_PARAM
+ default_query_param=None
):
+ if default_query_param is None:
+ default_query_param = DEFAULT_QUERY_PARAMS
query_param = default_query_param.copy()
query_param["page[number]"] = page_number
query_param["date[from]"] = from_year
Expand All @@ -105,10 +107,12 @@ def _get_query_param(

def _get_batch_objects(
endpoint=ENDPOINT,
- headers=HEADERS,
+ headers=None,
retries=RETRIES,
query_param=None
):
+ if headers is None:
+ headers = HEADERS.copy()
for retry in range(retries):
response = delay_request.get(
endpoint,
Expand Down
34 changes: 22 additions & 12 deletions src/cc_catalog_airflow/dags/provider_api_scripts/smithsonian.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,10 @@
from util.loader import provider_details as prov

logger = logging.getLogger(__name__)
+ logging.basicConfig(
+ format='%(asctime)s - %(name)s - %(levelname)s: %(message)s',
+ level=logging.INFO
+ )

API_KEY = os.getenv('DATA_GOV_API_KEY')
DELAY = 5.0
Expand Down Expand Up @@ -104,7 +108,7 @@
'description (brief spanish)', 'gallery label',
'exhibition label', 'luce center label',
'publication label', 'new acquisition label'}
- TAG_TYPES = ['date', 'object_type', 'topic', 'place']
+ TAG_TYPES = ('date', 'object_type', 'topic', 'place')

image_store = ImageStore(provider=PROVIDER)
delayed_requester = DelayedRequester(delay=DELAY)
Expand All @@ -127,7 +131,7 @@ def main():

def gather_samples(
units_endpoint=UNITS_ENDPOINT,
- default_params=DEFAULT_PARAMS,
+ default_query_params=None,
target_dir='/tmp'
):
"""
Expand All @@ -138,13 +142,16 @@ def gather_samples(

This function is for gathering test data only, and is untested.
"""
+ if default_query_params is None:
+ default_query_params = DEFAULT_PARAMS
+ query_params = default_query_params.copy()
now_str = datetime.strftime(datetime.now(), '%Y%m%d%H%M%S')
sample_dir = os.path.join(target_dir, f'si_samples_{now_str}')
logger.info(f'Creating sample_dir {sample_dir}')
os.mkdir(sample_dir)
unit_code_json = delayed_requester.get_response_json(
units_endpoint,
- query_params=default_params
+ query_params=query_params
)
unit_code_list = unit_code_json.get('response', {}).get('terms', [])
logger.info(f'found unit codes: {unit_code_list}')
Expand Down Expand Up @@ -233,9 +240,11 @@ def _process_hash_prefix(
def _build_query_params(
row_offset,
hash_prefix=None,
- default_params=DEFAULT_PARAMS,
+ default_params=None,
unit_code=None
):
+ if default_params is None:
+ default_params = DEFAULT_PARAMS
query_params = default_params.copy()
query_string = 'online_media_type:Images AND media_usage:CC0'
if hash_prefix is not None:
Expand Down Expand Up @@ -291,7 +300,9 @@ def _get_title(row):
return row.get('title')


- def _get_creator(row, creator_types=CREATOR_TYPES):
+ def _get_creator(row, creator_types=None):
+ if creator_types is None:
+ creator_types = CREATOR_TYPES.copy()
freetext = _get_freetext_dict(row)
indexed_structured = _get_indexed_structured_dict(row)
ordered_freetext_creator_objects = sorted(
Expand Down Expand Up @@ -335,7 +346,9 @@ def _get_creator(row, creator_types=CREATOR_TYPES):
return creator


- def _extract_meta_data(row, description_types=DESCRIPTION_TYPES):
+ def _extract_meta_data(row, description_types=None):
+ if description_types is None:
+ description_types = DESCRIPTION_TYPES.copy()
freetext = _get_freetext_dict(row)
descriptive_non_repeating = _get_descriptive_non_repeating_dict(row)
description = ''
Expand Down Expand Up @@ -372,7 +385,9 @@ def _extract_source(meta_data, sub_providers=SUB_PROVIDERS):
return source


- def _extract_tags(row, tag_types=TAG_TYPES):
+ def _extract_tags(row, tag_types=None):
+ if tag_types is None:
+ tag_types = TAG_TYPES
indexed_structured = _get_indexed_structured_dict(row)
tag_lists_generator = (
_check_type(indexed_structured.get(key), list) for key in tag_types
Expand Down Expand Up @@ -471,9 +486,4 @@ def _process_image_list(


if __name__ == '__main__':
- logging.basicConfig(
- format='{asctime} - {name} - {levelname}: {message}',
- style='{',
- level=logging.INFO
- )
main()
Loading