[c2f,#19][m]: misc improvements to dataset mapping.

* correct basic mappings of keys * author and maintainer mapping - fixes #2 * licenses: use the existing licenses if already there (rather than overwriting) - or merges ...
frictionlessdata · Jun 18, 2020 · 45b440e · 45b440e
1 parent 6a788cb
commit 45b440e
Show file tree

Hide file tree

Showing 2 changed files with 180 additions and 84 deletions.
diff --git a/frictionless_ckan_mapper/ckan_to_frictionless.py b/frictionless_ckan_mapper/ckan_to_frictionless.py
@@ -78,15 +78,9 @@ def resource(self, ckandict):
     dataset_keys_to_remove = [
         'state'
     ]
-    package_mapping = {
+    dataset_mapping = {
         'notes': 'description',
-        'tags': 'keywords',  # this is flattened and simplified
-    }
-
-    package_sources_mapping = {
-        'author': 'title',
-        'author_email': 'email',
-        'url': 'path',
+        'url': 'homepage'
     }
 
     def dataset(self, ckandict):
@@ -99,8 +93,7 @@ def dataset(self, ckandict):
         4. Remove unneeded keys
         5. Apply special formatting for key fields
         '''
-        outdict = dict(ckandict)
-
+        outdict = dict(ckandict) 
         # Convert the structure of extras
         # structure of extra item is {key: xxx, value: xxx} 
         if 'extras' in ckandict:
@@ -114,52 +107,66 @@ def dataset(self, ckandict):
                 outdict[key] = value
             del outdict['extras']
 
-
-        # Remap necessary package keys
-        for k, v in self.package_mapping.items():
-            if k in ckandict and k == 'url':
-                outdict[v] = ckandict[k]
-                del outdict[k]
-            elif k in ckandict and k == 'tags':
-                outdict[v] = []
-                for tag in ckandict[k]:
-                    outdict[v].append(tag['name'])
-                del outdict[k]
-            elif k in ckandict:
+        # Map dataset keys
+        for k, v in self.dataset_mapping.items():
+            if k in ckandict:
                 outdict[v] = ckandict[k]
                 del outdict[k]
-
-
-        # Remap properties in sources
-        if 'author' in outdict:
-            outdict['sources'] = []
-            source = {}
-            for k, v in self.package_sources_mapping.items():
-                if k in outdict:
-                    source[v] = outdict[k]
-                    del outdict[k]
-            outdict['sources'].append(source)
-
-        # Reformat expected output for some keys in package
-        if 'name' in outdict:
-            outdict['name'] = outdict['name'].replace('-', '_')
-
-        # Reformat resources inside package
+
+        # tags
+        if 'tags' in ckandict:
+            outdict['keywords'] = [ tag['name'] for tag in ckandict['tags'] ]
+            del outdict['tags']
+
+        # author, maintainer => contributors
+        # what to do if contributors already there? Options:
+        # 1. Just use that and ignore author/maintainer
+        # 2. replace with author/maintainer
+        # 3. merge i.e. use contributors and merge in (this is sort of complex)
+        # e.g. how do I avoid duplicating the same person?
+        # ANS: for now, we go with option 1 ...
+        if (not ('contributors' in outdict and outdict['contributors']) and
+                ('author' in outdict or 'maintainer' in outdict)):
+            outdict['contributors'] = []
+            if 'author' in outdict and outdict['author']:
+                contrib = {
+                    'title': outdict['author'],
+                    'role': 'author'
+                    }
+                if 'author_email' in outdict:
+                    contrib['email'] = outdict['author_email']
+                outdict['contributors'].append(contrib)
+            if 'maintainer' in outdict and outdict['maintainer']:
+                contrib = {
+                    'title': outdict['maintainer'],
+                    'role': 'maintainer'
+                    }
+                if 'maintainer_email' in outdict:
+                    contrib['email'] = outdict['maintainer_email']
+                outdict['contributors'].append(contrib)
+
+        for k in ['author', 'author_email', 'maintainer', 'maintainer_email']:
+            outdict.pop(k, None)
+
+        # Reformat resources inside dataset
         if 'resources' in outdict:
             outdict['resources'] = [self.resource(res) for res in
                     outdict['resources']]
 
-        # TODO: do we always license_id - can we have license_title w/o
+        # package_show can have license_id and license_title
+        # TODO: do we always have license_id, i.e. can we have license_title w/o
         # license_id?
-        if 'license_id' in outdict:
+        if ('licenses' not in outdict and 'license_id' in outdict):
             outdict['licenses'] = [{
                 'type': outdict['license_id'],
                 }]
-            del outdict['license_id']
+            if 'license_title' in outdict:
+                outdict['licenses'][0]['title'] = outdict['license_title']
+        outdict.pop('license_id', None)
+        outdict.pop('license_title', None)
 
         for k in self.dataset_keys_to_remove:
-            if k in outdict:
-                del outdict[k]
+            outdict.pop(k, None)
 
         for k in list(outdict.keys()):
             if outdict[k] is None:

diff --git a/tests/test_ckan_to_frictionless.py b/tests/test_ckan_to_frictionless.py
@@ -225,10 +225,47 @@ def test_unjsonify_all_extra_values_in_nested_lists(self):
         exp = {'numbers': [[[1, 2, 3], [2, 4, 5]], [[7, 6, 0]]]}
         assert out == exp
 
-    def test_author_and_maintainer(self):
-        pass
-
     def test_dataset_license(self):
+        indict = {
+            'license_id': 'odc-odbl'
+        }
+        exp = {
+            'licenses': [{
+                'type': 'odc-odbl'
+            }]
+        }
+        out = converter.dataset(indict)
+        assert out == exp
+
+        indict = {
+            'license_id': 'odc-odbl',
+            'license_title': 'Open Data Commons Open Database License'
+        }
+        exp = {
+            'licenses': [{
+                'type': 'odc-odbl',
+                'title': 'Open Data Commons Open Database License'
+            }]
+        }
+        out = converter.dataset(indict)
+        assert out == exp
+
+        # finally what if license*s* already there ...
+        indict = {
+            'licenses': [{
+                'type': 'odc-pddl'
+            }],
+            'license_id': 'odc-odbl'
+        }
+        exp = {
+            'licenses': [{
+                'type': 'odc-pddl'
+            }]
+        }
+        out = converter.dataset(indict)
+        assert out == exp
+
+    def test_dataset_license_with_licenses_in_extras(self):
         indict = {
             'license_id': 'odc-odbl',
             # TODO: check package_show in CKAN
@@ -243,45 +280,94 @@ def test_dataset_license(self):
         out = converter.dataset(indict)
         assert out == exp
 
-    def test_dataset_name_title_and_version(self):
-        self.dataset_dict.update({
+    def test_keys_are_passed_through(self):
+        indict = {
             'name': 'gdp',
             'title': 'Countries GDP',
             'version': '1.0',
-        })
-        result = converter.dataset(self.dataset_dict)
-        assert result['title'] == self.dataset_dict['title']
-        assert result['name'] == self.dataset_dict['name']
-        assert result['version'] == self.dataset_dict['version']
-
-    def test_dataset_notes(self):
-        self.dataset_dict.update({
-            'notes': 'Country, regional and world GDP in current US Dollars.'
-        })
-        result = converter.dataset(self.dataset_dict)
-        assert result.get('description') == self.dataset_dict['notes']
-
-    def test_dataset_author_and_source(self):
-        sources = [
-            {
-                'title': 'World Bank and OECD',
-                'email': 'someone@worldbank.org',
-                'path': 'http://data.worldbank.org/indicator/NY.GDP.MKTP.CD',
-            }
-        ]
-        self.dataset_dict.update({
-            'author': sources[0]['title'],
-            'author_email': sources[0]['email'],
-            'url': sources[0]['path']
-        })
-        result = converter.dataset(self.dataset_dict)
-        assert result.get('sources') == sources
+            # random
+            'xxx': 'aldka'
+        }
+        out = converter.dataset(indict)
+        exp = {
+            'name': 'gdp',
+            'title': 'Countries GDP',
+            'version': '1.0',
+            'xxx': 'aldka'
+        }
+        assert out == exp
+
+    def test_key_mappings(self):
+        # notes
+        indict = {
+            'notes': 'Country, regional and world GDP',
+            'url': 'https://datopian.com'
+        }
+        exp = {
+            'description': 'Country, regional and world GDP',
+            'homepage': 'https://datopian.com'
+        }
+        out = converter.dataset(indict)
+        assert out == exp
+
+    def test_dataset_author_and_maintainer(self):
+        indict = {
+            'author': 'World Bank and OECD',
+            'author_email': 'someone@worldbank.org'
+        }
+        exp = {
+            'contributors': [
+                {
+                    'title': 'World Bank and OECD',
+                    'email': 'someone@worldbank.org',
+                    'role': 'author'
+                }
+            ]
+        }
+        out = converter.dataset(indict)
+        assert out == exp
+
+        indict = {
+            'author': 'World Bank and OECD',
+            'author_email': 'someone@worldbank.org',
+            'maintainer': 'Datopian',
+            'maintainer_email': 'helloxxx@datopian.com'
+        }
+        exp = {
+            'contributors': [
+                {
+                    'title': 'World Bank and OECD',
+                    'email': 'someone@worldbank.org',
+                    'role': 'author'
+                },
+                {
+                    'title': 'Datopian',
+                    'email': 'helloxxx@datopian.com',
+                    'role': 'maintainer'
+                },
+
+            ]
+        }
+        out = converter.dataset(indict)
+        assert out == exp
+
+        # if we already have contributors use that ...
+        indict = {
+            'contributors': [{
+                'title': 'Datopians'
+            }],
+            'author': 'World Bank and OECD',
+        }
+        exp = {
+            'contributors': [{
+                    'title': 'Datopians'
+                }]
+        }
+        out = converter.dataset(indict)
+        assert out == exp
 
     def test_dataset_tags(self):
-        keywords = [
-            'economy', 'worldbank'
-        ]
-        self.dataset_dict.update({
+        indict = {
             'tags': [
                 {
                     'display_name': 'economy',
@@ -296,9 +382,12 @@ def test_dataset_tags(self):
                     'state': 'active'
                 }
             ]
-        })
-        result = converter.dataset(self.dataset_dict)
-        assert result.get('keywords') == keywords
+        }
+        exp = {
+            'keywords': [ 'economy', 'worldbank' ]
+        }
+        out = converter.dataset(indict)
+        assert out == exp
 
     def test_resources_are_converted(self):
         # Package has multiple resources