Skip to content

Commit

Permalink
✨ add multiple phenotypes as a list
Browse files Browse the repository at this point in the history
  • Loading branch information
bunop committed Sep 11, 2023
1 parent 839b57d commit 526e49e
Show file tree
Hide file tree
Showing 2 changed files with 62 additions and 7 deletions.
58 changes: 55 additions & 3 deletions src/data/import_multiple_phenotypes.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,11 +15,34 @@
from click_option_group import optgroup, RequiredMutuallyExclusiveOptionGroup
from pathlib import Path

from src.features.smarterdb import global_connection
from src.features.smarterdb import global_connection, Phenotype
from src.data.common import pandas_open, deal_with_datasets, get_sample_species
from src.features.utils import sanitize

logger = logging.getLogger(__name__)


def create_or_update_phenotype(sample, phenotype: dict):
    """Copy the entries of ``phenotype`` onto ``sample.phenotype`` and save.

    Args:
        sample: object exposing a ``phenotype`` attribute and a ``save()``
            method (presumably a SMARTER-database sample document; confirm
            against the caller).
        phenotype: mapping of attribute name -> value to set on the sample
            phenotype. When it is empty nothing is modified and the sample
            is not saved.

    Returns:
        None
    """
    if phenotype:
        # a sample without a (truthy) phenotype gets a fresh one first
        if not sample.phenotype:
            logger.debug(f"Create a new phenotype for {sample}")
            sample.phenotype = Phenotype()

        # existing attributes with the same name are overwritten
        for attribute in phenotype:
            setattr(sample.phenotype, attribute, phenotype[attribute])

        message = f"Updating '{sample}' phenotype with '{sample.phenotype}'"
        logger.info(message)

        # persist the modified sample
        sample.save()
        return

    logger.debug(f"Skipping {sample}: nothing to update")


@click.command()
@click.option(
'--src_dataset', type=str, required=True,
Expand All @@ -43,14 +66,15 @@
@optgroup.option('--alias_column', type=str, help="An alias for original_id")
@click.option(
'--column',
'columns',
required=True,
multiple=True,
help=(
"Column to track. Could be specified multiple times")
)
@click.option('--na_values', type=str, help="pandas NA values")
def main(src_dataset, dst_dataset, datafile, sheet_name, breed_column,
id_column, alias_column, column, na_values):
id_column, alias_column, columns, na_values):
"""Read multiple data for the same sample from phenotype file and add it
to SMARTER-database samples"""

Expand All @@ -61,7 +85,35 @@ def main(src_dataset, dst_dataset, datafile, sheet_name, breed_column,
"Loading multiple phenotypes by breed or alias is not yet "
"implemented")

logger.debug(f"Reading {column} columns")
logger.debug(f"Reading {columns} columns")

src_dataset, dst_dataset, datapath = deal_with_datasets(
src_dataset, dst_dataset, datafile)

SampleSpecie = get_sample_species(dst_dataset.species)

if sheet_name and sheet_name.isnumeric():
sheet_name = int(sheet_name)

# open data with pandas
data = pandas_open(datapath, na_values=na_values, sheet_name=sheet_name)

# process unique ids
for id_ in data[id_column].unique():
subset = data[data[id_column] == id_]

phenotype = {}

for column in columns:
phenotype[sanitize(column)] = subset[column].to_list()

original_id = str(id_)

# ok iterate over all samples of this dataset
for sample in SampleSpecie.objects.filter(
dataset=dst_dataset, original_id=original_id):

create_or_update_phenotype(sample, phenotype)

logger.info(f"{Path(__file__).name} ended")

Expand Down
11 changes: 7 additions & 4 deletions tests/data/test_import_multiple_phenotypes.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,9 +53,12 @@ def setUpClass(cls):
cls.sheet.cell(row=2, column=1, value="test-1")
cls.sheet.cell(row=2, column=2, value=123)
cls.sheet.cell(row=2, column=3, value=11.5)
cls.sheet.cell(row=2, column=1, value="test-1")
cls.sheet.cell(row=2, column=2, value=456)
cls.sheet.cell(row=2, column=3, value=23.0)
cls.sheet.cell(row=3, column=1, value="test-1")
cls.sheet.cell(row=3, column=2, value=456)
cls.sheet.cell(row=3, column=3, value=23.0)
cls.sheet.cell(row=4, column=1, value="test-2")
cls.sheet.cell(row=4, column=2, value=123)
cls.sheet.cell(row=4, column=3, value=11.5)

@classmethod
def tearDownClass(cls):
Expand Down Expand Up @@ -97,7 +100,7 @@ def test_help(self):
class TestImportPhenotypeBySamples(PhenotypeMixin, unittest.TestCase):
@patch('src.features.smarterdb.Dataset.working_dir',
new_callable=PropertyMock)
def test_import_phenotype(self, my_working_dir):
def test_import_multiple_phenotype(self, my_working_dir):
# create a temporary directory using the context manager
with tempfile.TemporaryDirectory() as tmpdirname:
working_dir = pathlib.Path(tmpdirname)
Expand Down

0 comments on commit 526e49e

Please sign in to comment.