Skip to content

Commit

Permalink
Merge pull request #42 from nih-cfde/validation
Browse files Browse the repository at this point in the history
Validation
  • Loading branch information
DavidKelly-Praedictus authored Aug 18, 2021
2 parents fd64186 + c76b8ed commit c0bb431
Show file tree
Hide file tree
Showing 5 changed files with 44 additions and 77 deletions.
6 changes: 4 additions & 2 deletions cfde_submit/client.py
Original file line number Diff line number Diff line change
Expand Up @@ -264,7 +264,8 @@ def check(self, raise_exception=True):

def start_deriva_flow(self, data_path, dcc_id, catalog_id=None, schema=None, server=None,
output_dir=None, delete_dir=False, handle_git_repos=True,
dry_run=False, test_sub=False, globus=False, **kwargs):
dry_run=False, test_sub=False, globus=False, disable_validation=False,
**kwargs):
"""Start the Globus Automate Flow to ingest CFDE data into DERIVA.
Arguments:
Expand Down Expand Up @@ -332,7 +333,8 @@ def start_deriva_flow(self, data_path, dcc_id, catalog_id=None, schema=None, ser
handle_git_repos=handle_git_repos, bdbag_kwargs=kwargs
)
# Raises exc.ValidationException if something doesn't match up with the schema
validation.validate_user_submission(data_path, schema)
if not disable_validation:
validation.validate_user_submission(data_path, schema)

flow_info = self.remote_config["FLOWS"][self.service_instance]
dest_path = "{}{}".format(flow_info["cfde_ep_path"], os.path.basename(data_path))
Expand Down
9 changes: 6 additions & 3 deletions cfde_submit/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ def cli():
@click.argument("data-path", nargs=1, type=click.Path(exists=True))
@click.option("--dcc-id", "--dcc", default=None, show_default=True)
@click.option("--catalog", default=None, show_default=True)
@click.option("--disable-validation", is_flag=True, default=False, show_default=True)
@click.option("--schema", default=None, show_default=True)
@click.option("--output-dir", default=None, show_default=True, type=click.Path(exists=False))
@click.option("--delete-dir/--keep-dir", is_flag=True, default=False, show_default=True)
Expand All @@ -34,7 +35,8 @@ def cli():
@click.option("--bag-kwargs-file", type=click.Path(exists=True), default=None)
@click.option("--client-state-file", type=click.Path(exists=True), default=None)
def run(data_path, dcc_id, catalog, schema, output_dir, delete_dir, ignore_git, dry_run,
test_submission, verbose, server, globus, bag_kwargs_file, client_state_file):
test_submission, verbose, server, globus, disable_validation, bag_kwargs_file,
client_state_file):
"""Start the Globus Automate Flow to ingest CFDE data into DERIVA."""

# Set log levels
Expand Down Expand Up @@ -108,7 +110,8 @@ def run(data_path, dcc_id, catalog, schema, output_dir, delete_dir, ignore_git,
delete_dir=delete_dir,
handle_git_repos=(not ignore_git), server=server,
dry_run=dry_run, test_sub=test_submission,
globus=globus, **bag_kwargs)
globus=globus, disable_validation=disable_validation,
**bag_kwargs)
else:
exit_on_exception("Aborted. No data submitted.")
except (exc.SubmissionsUnavailable, exc.InvalidInput, exc.ValidationException,
Expand Down Expand Up @@ -255,4 +258,4 @@ def yes_or_no(question):

def exit_on_exception(e):
""" Print an exception and exit with an error """
sys.exit(click.wrap_text(click.style(str(e), fg='red')))
sys.exit(click.style(str(e), fg='red'))
103 changes: 32 additions & 71 deletions cfde_submit/validation.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,8 @@
import logging

from bdbag import bdbag_api
from datapackage import Package
from tableschema.exceptions import CastError

from cfde_submit import exc
from frictionless import FrictionlessException, Package, validate
from cfde_submit.exc import ValidationException, InvalidInput

logger = logging.getLogger(__name__)

Expand All @@ -26,79 +24,46 @@ def ts_validate(data_path, schema=None):
raw_errors (list): The raw Exceptions generated from any validation errors.
error (str): A formatted error message about any validation errors.
"""
# If data_path is BDBag archive, unarchive to temp dir
try:
data_path = bdbag_api.extract_bag(data_path, temp=True)
# data_path is not archive
except RuntimeError:
pass
# If data_path is dir (incl. if was unarchived), find JSON desc
if os.path.isfile(data_path):
archive_file = data_path
try:
data_path = bdbag_api.extract_bag(data_path, temp=True)
except Exception as e:
raise InvalidInput("Error extracting %s: %s" % (archive_file, e))
if not bdbag_api.is_bag(data_path):
raise InvalidInput("Input %s does not appear to be a valid BDBag. This tool requires a"
" prepared BDBag archive when invoked on an existing archive file."
% archive_file)

# If data_path is a directory, find JSON
if os.path.isdir(data_path):
# If 'data' dir present, search there instead
if "data" in os.listdir(data_path):
data_path = os.path.join(data_path, "data")
# Find .json file (cannot be hidden)
desc_file_list = [filename for filename in os.listdir(data_path)
if filename.endswith(".json") and not filename.startswith(".")]
if len(desc_file_list) < 1:
return {
"is_valid": False,
"raw_errors": [FileNotFoundError("No TableSchema JSON file found.")],
"error": "No TableSchema JSON file found."
}
raise ValidationException("No TableSchema JSON file found")
elif len(desc_file_list) > 1:
return {
"is_valid": False,
"raw_errors": [RuntimeError("Multiple JSON files found in directory.")],
"error": "Multiple JSON files found in directory."
}
raise ValidationException("Multiple JSON files found in directory")
else:
data_path = os.path.join(data_path, desc_file_list[0])
# data_path should/must be file now (JSON desc)
if not os.path.isfile(data_path):
return {
"is_valid": False,
"raw_errors": [ValueError("Path '{}' does not refer to a file".format(data_path))],
"error": "Path '{}' does not refer to a file".format(data_path)
}

# Read into Package (identical to DataPackage), return error on failure
# Read into Package
try:
pkg = Package(descriptor=data_path, strict=True)
except Exception as e:
return {
"is_valid": False,
"raw_errors": e.errors,
"error": "\n".join([str(err) for err in pkg.errors])
}
# Check and return package validity based on non-Exception-throwing Package validation
if not pkg.valid:
return {
"is_valid": pkg.valid,
"raw_errors": pkg.errors,
"error": "\n".join([str(err) for err in pkg.errors])
}
# Perform manual validation as well
for resource in pkg.resources:
try:
resource.read()
except CastError as e:
return {
"is_valid": False,
"raw_errors": e.errors,
"error": "\n".join([str(err) for err in e.errors])
}
except Exception as e:
return {
"is_valid": False,
"raw_errors": repr(e),
"error": str(e)
}
return {
"is_valid": True,
"raw_errors": [],
"error": None
}
pkg = Package(data_path)
report = validate(pkg, schema=schema)
except FrictionlessException as e:
raise ValidationException("Validation error\n%s" % e.error.message)

if not report.valid:
if report.errors:
msg = report.errors[0]['message']
else:
for task in report['tasks']:
if not task.valid:
msg = task['resource']['path'] + "\n"
msg += task['errors'][0]['message']
raise ValidationException("Validation error in %s" % msg)


def validate_user_submission(data_path, schema, output_dir=None, delete_dir=False,
Expand Down Expand Up @@ -132,10 +97,6 @@ def validate_user_submission(data_path, schema, output_dir=None, delete_dir=Fals

# Validate TableSchema in BDBag
logger.debug("Validating TableSchema in BDBag '{}'".format(data_path))
validation_res = ts_validate(data_path, schema=schema)
if not validation_res["is_valid"]:
raise exc.ValidationException("TableSchema invalid due to the following errors: "
"\n{}\n".format(validation_res["error"]))

ts_validate(data_path, schema=schema)
logger.debug("Validation successful")
return data_path
2 changes: 1 addition & 1 deletion cfde_submit/version.py
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
# Single source of truth for package version
__version__ = "0.1.11"
__version__ = "0.2"
1 change: 1 addition & 0 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@
"Click>=7.0",
"datapackage>=1.10.0",
"fair-research-login>=0.2.0",
"frictionless>=4.16.2",
"GitPython>=3.0.4",
"globus-automate-client>=0.10.5",
"globus-sdk>=1.8.0,<2.0",
Expand Down

0 comments on commit c0bb431

Please sign in to comment.