Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Do not duplicate harvest_extras if exist in root schema #521

Merged
merged 3 commits into from
Mar 14, 2023
Merged
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
114 changes: 61 additions & 53 deletions ckanext/harvest/plugin/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -116,7 +116,37 @@ def before_dataset_search(self, search_params):

return search_params

def _add_or_update_harvest_metadata(self, key, value, data_dict):
    """Set ``key`` to ``value`` in the ``extras`` list of ``data_dict``.

    ``data_dict`` is modified in place. A missing or empty ``extras``
    entry is replaced with a new list first. The first extra whose
    ``key`` matches gets its ``value`` overwritten; when no extra
    matches, a new ``{"key": key, "value": value}`` item is appended.
    """
    if not data_dict.get("extras"):
        data_dict["extras"] = []
    extras = data_dict["extras"]

    # Only the first matching entry is touched; later duplicates stay as-is.
    matching = (item for item in extras if item.get("key") == key)
    existing = next(matching, None)

    if existing is None:
        extras.append({"key": key, "value": value})
    else:
        existing["value"] = value

def before_dataset_index(self, pkg_dict):
"""Adds harvest metadata to the extra field of the dataset.

This method will add or update harvest related metadata in `pkg_dict`,
`data_dict` and `validated_data_dict` so it can be obtained when
calling package_show API (that depends on Solr data). This metadata will
be stored in the `extras` field of the dictionaries.

By default all harvest metadata will go in the `extras` field. If
another extension adds any of them to the `package_show` schema
then it will not be added again in the `extras` field to avoid
validation errors when updating a package.
"""
# Fix to support Solr8
if isinstance(pkg_dict.get('status'), dict):
try:
pkg_dict['status'] = json.dumps(pkg_dict['status'])
except ValueError:
pkg_dict.pop('status', None)

harvest_object = model.Session.query(HarvestObject) \
.filter(HarvestObject.package_id == pkg_dict["id"]) \
Expand All @@ -125,59 +155,37 @@ def before_dataset_index(self, pkg_dict):
).order_by(HarvestObject.import_finished.desc()) \
.first()

if harvest_object:

data_dict = json.loads(pkg_dict["data_dict"])

validated_data_dict = json.loads(pkg_dict["validated_data_dict"])

harvest_extras = [
("harvest_object_id", harvest_object.id),
("harvest_source_id", harvest_object.source.id),
("harvest_source_title", harvest_object.source.title),
]

for key, value in harvest_extras:

# If the harvest extras are there, update them. This can
# happen eg when calling package_update or resource_update,
# which call package_show
harvest_not_found = True
harvest_not_found_validated = True
if not data_dict.get("extras"):
data_dict["extras"] = []

for e in data_dict.get("extras"):
if e.get("key") == key:
e.update({"value": value})
harvest_not_found = False
if harvest_not_found:
data_dict["extras"].append({"key": key, "value": value})

if not validated_data_dict.get("extras"):
validated_data_dict["extras"] = []

for e in validated_data_dict.get("extras"):
if e.get("key") == key:
e.update({"value": value})
harvest_not_found_validated = False
if harvest_not_found_validated:
validated_data_dict["extras"].append({"key": key, "value": value})

# The commented line isn't cataloged correctly, if we pass the
# basic key the extras are prepended and the system works as
# expected.
# pkg_dict['extras_{0}'.format(key)] = value
pkg_dict[key] = value

pkg_dict["data_dict"] = json.dumps(data_dict)
pkg_dict["validated_data_dict"] = json.dumps(validated_data_dict)

if isinstance(pkg_dict.get('status'), dict):
try:
pkg_dict['status'] = json.dumps(pkg_dict['status'])
except ValueError:
pkg_dict.pop('status', None)
if not harvest_object:
return pkg_dict

harvest_extras = [
("harvest_object_id", harvest_object.id),
("harvest_source_id", harvest_object.source.id),
("harvest_source_title", harvest_object.source.title),
]

# Add harvest extras to data_dict
data_dict = json.loads(pkg_dict["data_dict"])
for key, value in harvest_extras:
if key in data_dict.keys():
data_dict[key] = value
continue
self._add_or_update_harvest_metadata(key, value, data_dict)

# Add harvest extras to validated_data_dict
validated_data_dict = json.loads(pkg_dict["validated_data_dict"])
for key, value in harvest_extras:
if key in validated_data_dict.keys():
validated_data_dict[key] = value
continue
self._add_or_update_harvest_metadata(key, value, validated_data_dict)

# Add harvest extras to main indexed pkg_dict
for key, value in harvest_extras:
pkg_dict[key] = value
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This code was already here, but I don't see it working.

Indexed packages do not have these attributes when querying Solr directly.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This was done to add the harvest field as the "catch-all" string field in Solr. These values are indexed but not stored, so that's why you don't see them when calling Solr directly.
With your changes, we should only add them if they are not already present in pkg_dict.


pkg_dict["data_dict"] = json.dumps(data_dict)
pkg_dict["validated_data_dict"] = json.dumps(validated_data_dict)

return pkg_dict

Expand Down