Skip to content
This repository has been archived by the owner on Aug 4, 2023. It is now read-only.

Commit

Permalink
Create a Provider API script template (#93)
Browse files Browse the repository at this point in the history
* Create a Provider API script template

Signed-off-by: Olga Bulat <obulat@gmail.com>

* Update src/cc_catalog_airflow/templates/template_provider.py_template

Co-authored-by: Zack Krida <zackkrida@pm.me>

* Better wording for script date parameter

Co-authored-by: Zack Krida <zackkrida@pm.me>

* Replace relative path with absolute to fix file not found errors

Signed-off-by: Olga Bulat <obulat@gmail.com>

* Make image the default media type

Co-authored-by: Zack Krida <zackkrida@pm.me>

* Fix typo in provider template script

Co-authored-by: Krystle Salazar <krystle.salazar@ciens.ucv.ve>

* Improve DAG creation template
Signed-off-by: Olga Bulat <obulat@gmail.com>

* Update src/cc_catalog_airflow/templates/template_provider.py_template

Co-authored-by: Krystle Salazar <krystle.salazar@ciens.ucv.ve>

* Add tests for template script

Signed-off-by: Olga Bulat <obulat@gmail.com>

* Fix item column names

* Pass `template_name` of the template for tests

* Add `{media_type}` placeholders to template_provider.py_template

* Minor fixes of typos and comments

* Template falls back to `image` if no image type specified

Signed-off-by: Olga Bulat <obulat@gmail.com>

* Small bug and documentation fixes

Signed-off-by: Olga Bulat <obulat@gmail.com>

* Make get_license_info_from_license_pair public

Signed-off-by: Olga Bulat <obulat@gmail.com>

* Add README

Signed-off-by: Olga Bulat <obulat@gmail.com>

* Add sample output to the README

Signed-off-by: Olga Bulat <obulat@gmail.com>

* Commit media_store items so that the last batch is not lost

Signed-off-by: Olga Bulat <obulat@gmail.com>

Co-authored-by: Zack Krida <zackkrida@pm.me>
Co-authored-by: Krystle Salazar <krystle.salazar@ciens.ucv.ve>
  • Loading branch information
3 people authored Jul 9, 2021
1 parent 447b5c6 commit f5e8a9d
Show file tree
Hide file tree
Showing 10 changed files with 731 additions and 10 deletions.
Empty file.
13 changes: 10 additions & 3 deletions src/cc_catalog_airflow/dags/common/__init__.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,20 @@
# flake8: noqa
from .licenses import constants
from .licenses.licenses import (
get_license_info, LicenseInfo, is_valid_license_info
get_license_info,
get_license_info_from_license_pair,
is_valid_license_info,
LicenseInfo,
)
from .storage.image import (
Image, ImageStore, MockImageStore
Image,
ImageStore,
MockImageStore,
)
from .storage.audio import (
Audio, AudioStore, MockAudioStore
Audio,
AudioStore,
MockAudioStore
)
from .storage import columns
from .requester import DelayedRequester
4 changes: 2 additions & 2 deletions src/cc_catalog_airflow/dags/common/licenses/licenses.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,7 @@ def get_license_info(
f'Falling back to given license_ {license_}'
f' and license_version {license_version}'
)
license_info = _get_license_info_from_license_pair(
license_info = get_license_info_from_license_pair(
license_, license_version
)
license_info = (*license_info, license_url)
Expand Down Expand Up @@ -190,7 +190,7 @@ def _get_valid_cc_url(license_url) -> Optional[str]:
return validated_license_url


def _get_license_info_from_license_pair(
def get_license_info_from_license_pair(
license_, license_version, pair_map=REVERSE_LICENSE_PATH_MAP
) -> Tuple[Optional[str], Optional[str], Optional[str]]:
"""
Expand Down
10 changes: 5 additions & 5 deletions src/cc_catalog_airflow/dags/common/licenses/test_licenses.py
Original file line number Diff line number Diff line change
Expand Up @@ -177,7 +177,7 @@ def test_get_license_info_from_license_pair_nones_when_missing_license(
mock_rewriter
):
pair_map = {('by', '1.0'): 'licenses/by/1.0'}
license_info = licenses._get_license_info_from_license_pair(
license_info = licenses.get_license_info_from_license_pair(
None,
'1.0',
pair_map=pair_map
Expand All @@ -189,7 +189,7 @@ def test_get_license_info_from_license_pair_nones_missing_version(
mock_rewriter
):
pair_map = {('by', '1.0'): 'licenses/by/1.0'}
license_info = licenses._get_license_info_from_license_pair(
license_info = licenses.get_license_info_from_license_pair(
'by',
None,
pair_map=pair_map
Expand All @@ -199,7 +199,7 @@ def test_get_license_info_from_license_pair_nones_missing_version(

def test_validate_license_pair_handles_float_version(mock_rewriter):
pair_map = {('by', '1.0'): 'licenses/by/1.0'}
actual_license_info = licenses._get_license_info_from_license_pair(
actual_license_info = licenses.get_license_info_from_license_pair(
'by',
1.0,
pair_map=pair_map
Expand All @@ -212,7 +212,7 @@ def test_validate_license_pair_handles_float_version(mock_rewriter):

def test_validate_license_pair_handles_int_version(mock_rewriter):
pair_map = {('by', '1.0'): 'licenses/by/1.0'}
actual_license_info = licenses._get_license_info_from_license_pair(
actual_license_info = licenses.get_license_info_from_license_pair(
'by',
1,
pair_map=pair_map
Expand All @@ -225,7 +225,7 @@ def test_validate_license_pair_handles_int_version(mock_rewriter):

def test_validate_license_pair_handles_na_version(mock_rewriter):
pair_map = {('publicdomain', 'N/A'): 'licenses/publicdomain'}
actual_license_info = licenses._get_license_info_from_license_pair(
actual_license_info = licenses.get_license_info_from_license_pair(
'publicdomain',
'N/A',
pair_map=pair_map
Expand Down
22 changes: 22 additions & 0 deletions src/cc_catalog_airflow/templates/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
## Adding a new Provider API script

The Openverse Catalog uses the APIs of sites that share openly-licensed media to collect data about the media and save it to the database. We call the scripts that pull the data from these APIs "Provider API scripts". You can find examples in the [`provider_api_scripts` folder](../dags/provider_api_scripts).

To add a Provider API script using this template, you will need to have Python 3 installed on your machine (preferably version 3.9). You will also need to know the name of the provider and the type of media you are going to collect (`image` or `audio`; `image` is used when no type is given).

To add a script for collecting audio data from a provider named "MyProvider", open your terminal in the root of the repository and run:
```bash
python3 src/cc_catalog_airflow/templates/create_api_script.py MyProvider -m audio
```
You should see output similar to this:
```bash
Creating files in path/to/openverse-catalog
API script: src/cc_catalog_airflow/dags/provider_api_scripts/myprovider.py
API script test: src/cc_catalog_airflow/dags/provider_api_scripts/test_myprovider.py
Airflow workflow file: src/cc_catalog_airflow/dags/myprovider_workflow.py

```
The following files have been created:
1. Airflow workflow file. You will probably NOT need to edit it.
2. `myprovider.py` script. This is a template that simplifies creating a Provider API script by providing the basic structure. The scripts use small and easily-testable functions. Follow the instructions within the script comments, and complete all the TODOs. Make sure to look at the sample `.json` files that will be saved for testing.
3. `test_myprovider.py`. This is a skeleton for your tests. Write tests for the functions in your Provider API script, using the `json` files with sample API responses.
Empty file.
99 changes: 99 additions & 0 deletions src/cc_catalog_airflow/templates/create_api_script.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,99 @@
import argparse
from pathlib import Path


# Store-initialisation statements substituted for the `media_store_init`
# placeholder of a template, one per supported media type.
IMAGE_STORE_INIT = 'image_store = ImageStore(provider=PROVIDER)'
AUDIO_STORE_INIT = 'audio_store = AudioStore(provider=PROVIDER)'


def _get_filled_template(template_path, provider, media_type='image'):
    """
    Read the template at `template_path` and fill in its placeholders.

    The following substitutions are applied, in this order:

    - `{provider_title_case}` -> `provider.title()`
    - `{provider_upper_case}` -> `provider.upper()`
    - `{provider}`            -> `provider.lower()`
    - `media_store_init`      -> the store initialisation statement
    - `{media_store}`         -> `audio_store` or `image_store`
    - `{media_type}`          -> `audio` or `image`

    Required arguments:

    template_path:  path of the template file (read as UTF-8).
    provider:       name of the provider, as given by the user.

    Optional arguments:

    media_type:     'audio' selects the audio store; any other value
                    (including None) falls back to 'image'.

    Returns the filled-in template as a string.
    """
    with open(template_path, 'r', encoding='utf8') as template:
        template_string = template.read()

    if media_type == 'audio':
        media_store_init = AUDIO_STORE_INIT
        media_store = 'audio_store'
    else:
        # Normalise the fallback so that `{media_type}` always agrees with
        # the chosen store. Without this, a `None` media type would make
        # `str.replace` raise a TypeError, and an unknown one would end up
        # in a script that otherwise uses the image store.
        media_type = 'image'
        media_store_init = IMAGE_STORE_INIT
        media_store = 'image_store'

    # Order matters: the more specific provider placeholders must be
    # replaced before anything else gets a chance to touch them.
    replacements = (
        ('{provider_title_case}', provider.title()),
        ('{provider_upper_case}', provider.upper()),
        ('{provider}', provider.lower()),
        ('media_store_init', media_store_init),
        ('{media_store}', media_store),
        ('{media_type}', media_type),
    )
    script_string = template_string
    for placeholder, value in replacements:
        script_string = script_string.replace(placeholder, value)

    return script_string


def fill_template(provider, media_type, templates_path):
    """
    Create the API script, its test and the workflow file for a provider.

    Three files are generated from the templates in `templates_path`:

    - `<dags>/provider_api_scripts/<filename>.py`
      from `template_provider.py_template`
    - `<dags>/provider_api_scripts/test_<filename>.py`
      from `template_test.py_template`
    - `<dags>/<filename>_workflow.py`
      from `workflow.py_template`

    where `<dags>` is the `dags` directory next to `templates_path` and
    `<filename>` is the lower-cased provider name with spaces replaced by
    underscores. Existing files with the same names are overwritten.

    Required arguments:

    provider:        name of the provider, as given by the user.
    media_type:      media type passed on to the script and test
                     templates; the workflow template is filled with the
                     default media type.
    templates_path:  `pathlib.Path` of the directory with the templates.

    The path of every created file, relative to the project root (three
    levels above `templates_path`), is printed.
    """
    project_path = templates_path.parent.parent.parent
    dags_path = templates_path.parent / 'dags'
    api_path = dags_path / 'provider_api_scripts'
    filename = provider.replace(" ", '_').lower()
    print(f"Creating files in {project_path}")

    # Each template is rendered BEFORE its output file is opened, so that
    # a missing or unreadable template cannot leave an empty (or
    # truncated) file behind.
    api_script_path = api_path / f"{filename}.py"
    api_script_string = _get_filled_template(
        templates_path / 'template_provider.py_template',
        provider,
        media_type,
    )
    api_script_path.write_text(api_script_string, encoding='utf8')
    print(f"API script: {api_script_path.relative_to(project_path)}")

    test_script_path = api_path / f"test_{filename}.py"
    test_string = _get_filled_template(
        templates_path / 'template_test.py_template',
        provider,
        media_type,
    )
    test_script_path.write_text(test_string, encoding='utf8')
    print(f"API script test: {test_script_path.relative_to(project_path)}")

    workflow_path = dags_path / f"{filename}_workflow.py"
    workflow_string = _get_filled_template(
        templates_path / 'workflow.py_template',
        provider,
    )
    workflow_path.write_text(workflow_string, encoding='utf8')
    print("Airflow workflow file: "
          f"{workflow_path.relative_to(project_path)}")


def main():
    """
    Command-line entry point.

    Reads the provider name and the optional `-m/--media` type from the
    command line, falls back to `image` when no media type is given, and
    generates the provider files from the templates located next to this
    script.
    """
    cli = argparse.ArgumentParser(
        description='Create a new provider API script',
        add_help=True,
    )
    cli.add_argument(
        "provider",
        help='Create the script for this provider (eg. "Wikimedia").',
    )
    cli.add_argument(
        '-m',
        '--media',
        type=str,
        choices=['image', 'audio'],
        help=(
            "Script will collect media of this type"
            " ('audio'/'image'). Default value is 'image'"
        ),
    )
    options = cli.parse_args()

    media_type = options.media
    if media_type not in ('audio', 'image'):
        # `--media` was omitted, so argparse left it as None.
        print("No media type given, assuming it's `image`")
        media_type = 'image'

    fill_template(options.provider, media_type, Path(__file__).parent)


# Run the generator only when this file is executed as a script, not when
# it is imported.
if __name__ == "__main__":
    main()
Loading

0 comments on commit f5e8a9d

Please sign in to comment.