# NOTE: click.argument() does not accept a ``help`` keyword, so the positional
# arguments are described in the command docstring below, which click shows
# as the command's help text.
@root.command("write-extended-metadata",
              context_settings={'show_default': True})
@click.argument('metadata_file_path', type=click.Path(exists=True))
@click.argument('config_fp', type=click.Path(exists=True))
@click.argument('name_base', type=str)
@click.option('--out_dir', default=".",
              help='output directory for the extended metadata file')
@click.option('--sep', default="\t",
              help='separator of input file (default is tab); '
                   'not applicable to excel files')
@click.option('--suppress_fails_files', is_flag=True,
              # trailing space after "no" is required: adjacent string
              # literals are concatenated with nothing in between
              help='suppress output of QC and validation error files if no '
                   'errors found. Default is to output empty files.')
def write_extended_metadata(metadata_file_path, config_fp,
                            out_dir, name_base, sep, suppress_fails_files):
    """Write an extended version of a metadata file.

    METADATA_FILE_PATH is the path to the metadata file to be extended.
    CONFIG_FP is the path to the study-specific config yaml file.
    NAME_BASE is the base name for the output extended metadata file.
    """
    # Thin CLI wrapper: all work is delegated to the library function.
    _write_extended_metadata(
        metadata_file_path, config_fp, out_dir, name_base,
        sep, suppress_fails_files)
def output_validation_msgs(validation_msgs, out_dir, out_base, sep="\t",
                           suppress_empty_fails=False):
    """Write validation messages to a timestamped file in ``out_dir``.

    The output file is named
    ``<timestamp>_<out_base>_validation_errors.<extension>``, where the
    extension is whatever ``get_extension(sep)`` returns.

    Parameters
    ----------
    validation_msgs
        Data passed directly to the ``pandas.DataFrame`` constructor.
    out_dir : str
        Directory in which the output file is created.
    out_base : str
        Base name embedded in the output file name.
    sep : str
        Field separator used when writing the messages.
    suppress_empty_fails : bool
        If True and there are no messages, no file is created at all.
        If False (the default), an empty file is created in that case.

    Returns
    -------
    None
    """
    timestamp_str = datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
    extension = get_extension(sep)
    out_fp = os.path.join(
        out_dir, f"{timestamp_str}_{out_base}_validation_errors.{extension}")
    msgs_df = pandas.DataFrame(validation_msgs)

    if not msgs_df.empty:
        msgs_df.to_csv(out_fp, sep=sep, index=False)
    elif not suppress_empty_fails:
        # Touch the output *path* (not the DataFrame) so that an empty
        # file marks a run that produced no validation errors.
        Path(out_fp).touch()