Skip to content

Commit

Permalink
[c2f,#19][m]: misc improvements to dataset mapping.
Browse files Browse the repository at this point in the history
* correct basic mappings of keys
* author and maintainer mapping - fixes #2
* licenses uses licenses if already there (rather than overwriting) - or merges ...
  • Loading branch information
rufuspollock committed Jun 18, 2020
1 parent 6a788cb commit 45b440e
Show file tree
Hide file tree
Showing 2 changed files with 180 additions and 84 deletions.
95 changes: 51 additions & 44 deletions frictionless_ckan_mapper/ckan_to_frictionless.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,15 +78,9 @@ def resource(self, ckandict):
# Keys that the `dataset` conversion drops from its output dict.
dataset_keys_to_remove = [
'state'
]
package_mapping = {
dataset_mapping = {
'notes': 'description',
'tags': 'keywords', # this is flattened and simplified
}

package_sources_mapping = {
'author': 'title',
'author_email': 'email',
'url': 'path',
'url': 'homepage'
}

def dataset(self, ckandict):
Expand All @@ -99,8 +93,7 @@ def dataset(self, ckandict):
4. Remove unneeded keys
5. Apply special formatting for key fields
'''
outdict = dict(ckandict)

outdict = dict(ckandict)
# Convert the structure of extras
# structure of extra item is {key: xxx, value: xxx}
if 'extras' in ckandict:
Expand All @@ -114,52 +107,66 @@ def dataset(self, ckandict):
outdict[key] = value
del outdict['extras']


# Remap necessary package keys
for k, v in self.package_mapping.items():
if k in ckandict and k == 'url':
outdict[v] = ckandict[k]
del outdict[k]
elif k in ckandict and k == 'tags':
outdict[v] = []
for tag in ckandict[k]:
outdict[v].append(tag['name'])
del outdict[k]
elif k in ckandict:
# Map dataset keys
for k, v in self.dataset_mapping.items():
if k in ckandict:
outdict[v] = ckandict[k]
del outdict[k]


# Remap properties in sources
if 'author' in outdict:
outdict['sources'] = []
source = {}
for k, v in self.package_sources_mapping.items():
if k in outdict:
source[v] = outdict[k]
del outdict[k]
outdict['sources'].append(source)

# Reformat expected output for some keys in package
if 'name' in outdict:
outdict['name'] = outdict['name'].replace('-', '_')

# Reformat resources inside package

# tags
if 'tags' in ckandict:
outdict['keywords'] = [ tag['name'] for tag in ckandict['tags'] ]
del outdict['tags']

# author, maintainer => contributors
# what to do if contributors already there? Options:
# 1. Just use that and ignore author/maintainer
# 2. replace with author/maintainer
# 3. merge i.e. use contributors and merge in (this is sort of complex)
# e.g. how do I avoid duplicating the same person?
# ANS: for now, we go with option 1 ...
if (not ('contributors' in outdict and outdict['contributors']) and
('author' in outdict or 'maintainer' in outdict)):
outdict['contributors'] = []
if 'author' in outdict and outdict['author']:
contrib = {
'title': outdict['author'],
'role': 'author'
}
if 'author_email' in outdict:
contrib['email'] = outdict['author_email']
outdict['contributors'].append(contrib)
if 'maintainer' in outdict and outdict['maintainer']:
contrib = {
'title': outdict['maintainer'],
'role': 'maintainer'
}
if 'maintainer_email' in outdict:
contrib['email'] = outdict['maintainer_email']
outdict['contributors'].append(contrib)

for k in ['author', 'author_email', 'maintainer', 'maintainer_email']:
outdict.pop(k, None)

# Reformat resources inside dataset
if 'resources' in outdict:
outdict['resources'] = [self.resource(res) for res in
outdict['resources']]

# TODO: do we always license_id - can we have license_title w/o
# package_show can have license_id and license_title
# TODO: do we always have license_id, i.e. can we have license_title w/o
# license_id?
if 'license_id' in outdict:
if ('licenses' not in outdict and 'license_id' in outdict):
outdict['licenses'] = [{
'type': outdict['license_id'],
}]
del outdict['license_id']
if 'license_title' in outdict:
outdict['licenses'][0]['title'] = outdict['license_title']
outdict.pop('license_id', None)
outdict.pop('license_title', None)

for k in self.dataset_keys_to_remove:
if k in outdict:
del outdict[k]
outdict.pop(k, None)

for k in list(outdict.keys()):
if outdict[k] is None:
Expand Down
169 changes: 129 additions & 40 deletions tests/test_ckan_to_frictionless.py
Original file line number Diff line number Diff line change
Expand Up @@ -225,10 +225,47 @@ def test_unjsonify_all_extra_values_in_nested_lists(self):
exp = {'numbers': [[[1, 2, 3], [2, 4, 5]], [[7, 6, 0]]]}
assert out == exp

# Placeholder test: the body is a bare `pass`, so nothing is asserted here.
def test_author_and_maintainer(self):
pass

def test_dataset_license(self):
    '''license_id / license_title become a one-item `licenses` list.

    When the input already carries `licenses`, that value is expected to
    be kept untouched and `license_id` is expected to be dropped.
    '''
    odbl_title = 'Open Data Commons Open Database License'
    cases = [
        # license_id on its own
        (
            {'license_id': 'odc-odbl'},
            {'licenses': [{'type': 'odc-odbl'}]},
        ),
        # license_id together with license_title
        (
            {'license_id': 'odc-odbl', 'license_title': odbl_title},
            {'licenses': [{'type': 'odc-odbl', 'title': odbl_title}]},
        ),
        # finally, what if `licenses` is already there ...
        (
            {'licenses': [{'type': 'odc-pddl'}], 'license_id': 'odc-odbl'},
            {'licenses': [{'type': 'odc-pddl'}]},
        ),
    ]
    for ckan_dataset, expected in cases:
        assert converter.dataset(ckan_dataset) == expected

def test_dataset_license_with_licenses_in_extras(self):
indict = {
'license_id': 'odc-odbl',
# TODO: check package_show in CKAN
Expand All @@ -243,45 +280,94 @@ def test_dataset_license(self):
out = converter.dataset(indict)
assert out == exp

def test_dataset_name_title_and_version(self):
self.dataset_dict.update({
def test_keys_are_passed_through(self):
indict = {
'name': 'gdp',
'title': 'Countries GDP',
'version': '1.0',
})
result = converter.dataset(self.dataset_dict)
assert result['title'] == self.dataset_dict['title']
assert result['name'] == self.dataset_dict['name']
assert result['version'] == self.dataset_dict['version']

def test_dataset_notes(self):
self.dataset_dict.update({
'notes': 'Country, regional and world GDP in current US Dollars.'
})
result = converter.dataset(self.dataset_dict)
assert result.get('description') == self.dataset_dict['notes']

def test_dataset_author_and_source(self):
sources = [
{
'title': 'World Bank and OECD',
'email': 'someone@worldbank.org',
'path': 'http://data.worldbank.org/indicator/NY.GDP.MKTP.CD',
}
]
self.dataset_dict.update({
'author': sources[0]['title'],
'author_email': sources[0]['email'],
'url': sources[0]['path']
})
result = converter.dataset(self.dataset_dict)
assert result.get('sources') == sources
# random
'xxx': 'aldka'
}
out = converter.dataset(indict)
exp = {
'name': 'gdp',
'title': 'Countries GDP',
'version': '1.0',
'xxx': 'aldka'
}
assert out == exp

def test_key_mappings(self):
    '''`notes` maps to `description` and `url` maps to `homepage`.'''
    description = 'Country, regional and world GDP'
    homepage = 'https://datopian.com'

    converted = converter.dataset({'notes': description, 'url': homepage})

    assert converted == {'description': description, 'homepage': homepage}

def test_dataset_author_and_maintainer(self):
    '''author / maintainer (plus their emails) map to `contributors`.

    Contributors already present on the input are expected to be kept
    as is, with the author key dropped.
    '''
    author = {
        'title': 'World Bank and OECD',
        'email': 'someone@worldbank.org',
        'role': 'author'
    }
    maintainer = {
        'title': 'Datopian',
        'email': 'helloxxx@datopian.com',
        'role': 'maintainer'
    }

    # author only
    converted = converter.dataset({
        'author': author['title'],
        'author_email': author['email']
    })
    assert converted == {'contributors': [author]}

    # author followed by maintainer
    converted = converter.dataset({
        'author': author['title'],
        'author_email': author['email'],
        'maintainer': maintainer['title'],
        'maintainer_email': maintainer['email']
    })
    assert converted == {'contributors': [author, maintainer]}

    # if we already have contributors use that ...
    converted = converter.dataset({
        'contributors': [{'title': 'Datopians'}],
        'author': 'World Bank and OECD',
    })
    assert converted == {'contributors': [{'title': 'Datopians'}]}

def test_dataset_tags(self):
keywords = [
'economy', 'worldbank'
]
self.dataset_dict.update({
indict = {
'tags': [
{
'display_name': 'economy',
Expand All @@ -296,9 +382,12 @@ def test_dataset_tags(self):
'state': 'active'
}
]
})
result = converter.dataset(self.dataset_dict)
assert result.get('keywords') == keywords
}
exp = {
'keywords': [ 'economy', 'worldbank' ]
}
out = converter.dataset(indict)
assert out == exp

def test_resources_are_converted(self):
# Package has multiple resources
Expand Down

0 comments on commit 45b440e

Please sign in to comment.