Skip to content

Commit

Permalink
click interface, ability to suppress fail files if no errors, central…
Browse files Browse the repository at this point in the history
…izing 'missing' value definitions
  • Loading branch information
AmandaBirmingham committed Jun 25, 2024
1 parent b72daa1 commit a530abb
Show file tree
Hide file tree
Showing 5 changed files with 63 additions and 27 deletions.
28 changes: 17 additions & 11 deletions qiimp/src/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,21 +7,27 @@ def root():
pass


@root.command("write-extended-metadata",
              context_settings={'show_default': True})
@click.argument('metadata_file_path', type=click.Path(exists=True))
@click.argument('config_fp', type=click.Path(exists=True))
@click.argument('name_base', type=str)
@click.option('--out_dir', default=".",
              help='output directory for the extended metadata file')
@click.option('--sep', default="\t",
              help='separator of input file (default is tab); '
                   'not applicable to excel files')
@click.option('--suppress_fails_files', is_flag=True,
              help='suppress output of QC and validation error files if no '
                   'errors found. Default is to output empty files.')
def write_extended_metadata(metadata_file_path, config_fp,
                            out_dir, name_base, sep, suppress_fails_files):
    """Write an extended metadata file for a raw metadata file.

    click.argument() does not accept a help= keyword, so the positional
    arguments are described here (click shows this docstring as the
    command's help text):

    \b
    METADATA_FILE_PATH  path to the metadata file to be extended
    CONFIG_FP           path to the study-specific config yaml file
    NAME_BASE           base name for the output extended metadata file
    """
    # Thin CLI wrapper: all work is delegated to the library function, with
    # the flag forwarded positionally as its final argument.
    _write_extended_metadata(
        metadata_file_path, config_fp, out_dir, name_base,
        sep, suppress_fails_files)


if __name__ == '__main__':
Expand Down
29 changes: 22 additions & 7 deletions qiimp/src/metadata_extender.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@

def write_extended_metadata(
raw_metadata_fp, study_specific_config_fp,
out_dir, out_name_base, sep="\t"):
out_dir, out_name_base, sep="\t", suppress_empty_fails=False):

# extract the extension from the raw_metadata_fp file path
extension = os.path.splitext(raw_metadata_fp)[1]
Expand All @@ -69,12 +69,14 @@ def write_extended_metadata(

return write_extended_metadata_from_df(
raw_metadata_df, study_specific_config_dict,
out_dir, out_name_base, sep=sep)
out_dir, out_name_base, sep=sep,
suppress_empty_fails=suppress_empty_fails)


def write_extended_metadata_from_df(
raw_metadata_df, study_specific_config_dict, out_dir, out_name_base,
study_specific_transformers_dict=None, sep="\t"):
study_specific_transformers_dict=None, sep="\t",
suppress_empty_fails=False):

validate_required_columns_exist(
raw_metadata_df, REQUIRED_RAW_METADATA_FIELDS,
Expand All @@ -100,8 +102,10 @@ def write_extended_metadata_from_df(
study_specific_config_dict)

_output_to_df(metadata_df, out_dir, out_name_base,
INTERNAL_COL_KEYS, remove_internals=True, sep=sep)
output_validation_msgs(validation_msgs, out_dir, out_name_base, sep=",")
INTERNAL_COL_KEYS, remove_internals=True, sep=sep,
suppress_empty_fails=suppress_empty_fails)
output_validation_msgs(validation_msgs, out_dir, out_name_base, sep=",",
suppress_empty_fails=suppress_empty_fails)
return metadata_df


Expand Down Expand Up @@ -411,7 +415,8 @@ def _fill_na_if_default(metadata_df, specific_dict, settings_dict):


def _output_to_df(a_df, out_dir, out_base, internal_col_names,
sep="\t", remove_internals=False):
sep="\t", remove_internals=False,
suppress_empty_fails=False):

timestamp_str = datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
extension = get_extension(sep)
Expand All @@ -428,7 +433,9 @@ def _output_to_df(a_df, out_dir, out_base, internal_col_names,
qc_fails_fp = os.path.join(
out_dir, f"{timestamp_str}_{out_base}_fails.csv")
if qc_fails_df.empty:
Path(qc_fails_fp).touch()
if not suppress_empty_fails:
Path(qc_fails_fp).touch()
# else, just do nothing
else:
qc_fails_df.to_csv(qc_fails_fp, sep=",", index=False)

Expand All @@ -447,3 +454,11 @@ def _output_to_df(a_df, out_dir, out_base, internal_col_names,

out_fp = os.path.join(out_dir, f"{timestamp_str}_{out_base}.{extension}")
output_df.to_csv(out_fp, sep=sep, index=False)


# NOTE(review): ad-hoc developer entry point. The arguments below are, in
# order, the raw metadata file path, the study-specific config path, the
# output directory, and the output base name. They are absolute paths on a
# single developer's machine, so running this module directly will fail
# elsewhere -- consider removing this block or using the click CLI instead.
if __name__ == '__main__':
    write_extended_metadata(
        "/Users/abirmingham/Desktop/extended_abtx_metadata_w_faked_host_height_not_applicable.csv",
        "/Users/abirmingham/Work/Repositories/custom_abtx_metadata_generator/config.yml",
        "/Users/abirmingham/Desktop",
        "test_qiimp2_cli2")
11 changes: 9 additions & 2 deletions qiimp/src/metadata_validator.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
from dateutil import parser
import os
import pandas
from pathlib import Path
from qiimp.src.util import SAMPLE_NAME_KEY, get_extension


Expand Down Expand Up @@ -67,13 +68,19 @@ def validate_metadata_df(metadata_df, sample_type_full_metadata_fields_dict):
return validation_msgs


def output_validation_msgs(validation_msgs, out_dir, out_base, sep="\t",
                           suppress_empty_fails=False):
    """Write validation messages to a timestamped file in ``out_dir``.

    The output file is named
    ``<timestamp>_<out_base>_validation_errors.<extension>``, where the
    extension is whatever ``get_extension(sep)`` returns.

    Parameters
    ----------
    validation_msgs
        Data passed directly to ``pandas.DataFrame`` to build the table of
        messages (presumably a list of dicts -- verify against
        ``validate_metadata_df``).
    out_dir : str
        Directory in which the output file is created.
    out_base : str
        Base name embedded in the output file name.
    sep : str
        Field separator used when writing the messages.
    suppress_empty_fails : bool
        If True and there are no messages, no file is written at all.
        If False (the default) an empty file is created in that case.
    """
    timestamp_str = datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
    extension = get_extension(sep)
    out_fp = os.path.join(
        out_dir, f"{timestamp_str}_{out_base}_validation_errors.{extension}")
    msgs_df = pandas.DataFrame(validation_msgs)

    if msgs_df.empty:
        if not suppress_empty_fails:
            # Touch the output *path*: Path() cannot be built from a
            # DataFrame, so Path(msgs_df) would raise TypeError here.
            Path(out_fp).touch()
        # else: deliberately write nothing
    else:
        msgs_df.to_csv(out_fp, sep=sep, index=False)


def _make_cerberus_schema(sample_type_metadata_dict):
Expand Down
4 changes: 2 additions & 2 deletions qiimp/src/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,15 +76,15 @@ def deepcopy_dict(input_dict):
return output_dict


def load_df_with_best_fit_encoding(an_fp, a_file_separator):
def load_df_with_best_fit_encoding(an_fp, a_file_separator, dtype=None):
result = None

# from https://stackoverflow.com/a/76366653
encodings = ["utf-8", "utf-8-sig", "iso-8859-1", "latin1", "cp1252"]
for encoding in encodings:
try:
result = pandas.read_csv(
an_fp, sep=a_file_separator, encoding=encoding)
an_fp, sep=a_file_separator, encoding=encoding, dtype=dtype)
break
except Exception:
pass
Expand Down
18 changes: 13 additions & 5 deletions standards.yml
Original file line number Diff line number Diff line change
@@ -1,3 +1,10 @@
ebi_null_vals_all: &id001
allowed:
- "not collected"
- "not provided"
- "restricted access"
- "not applicable"
type: string
host_type_specific_metadata:
base:
metadata_fields:
Expand Down Expand Up @@ -36,11 +43,12 @@ host_type_specific_metadata:
- regex: '^([0-9]{4})(?:-([0-1][0-9])(?:-([0-3][0-9])(?: ([0-2][0-9])(?::([0-5][0-9])(?::([0-5][0-9]))?)?)?)?)?$'
type: string
check_with: date_not_in_future
- allowed:
- not collected
- not provided
- restricted access
type: string
- *id001
# - allowed:
# - not collected
# - not provided
# - restricted access
# type: string
empty: false
field_desc: The day and time of sampling as a single point in time expressed in
24-hour time format, e.g. 2016-11-22.
Expand Down

0 comments on commit a530abb

Please sign in to comment.