diff --git a/HISTORY.rst b/HISTORY.rst
index 930a89b..db8e3ec 100644
--- a/HISTORY.rst
+++ b/HISTORY.rst
@@ -15,6 +15,11 @@ TODO
 * Manage python packages with `poetry `__
 * Rename ``manifacturer`` into ``manufacturer``
 
+0.4.8.post1
+-----------
+
+* Update *datasets* metadata
+
 0.4.8 (2023-06-28)
 ------------------
 
diff --git a/docs/commands.rst b/docs/commands.rst
index c76c5f0..cbec468 100644
--- a/docs/commands.rst
+++ b/docs/commands.rst
@@ -80,6 +80,12 @@ documentation sections.
    :prog: src/data/import_metadata.py
    :nested: full
 
+.. _import_multiple_phenotypes:
+
+.. click:: src.data.import_multiple_phenotypes:main
+   :prog: src/data/import_multiple_phenotypes.py
+   :nested: full
+
 .. _import_phenotypes:
 
 .. click:: src.data.import_phenotypes:main
diff --git a/src/data/import_multiple_phenotypes.py b/src/data/import_multiple_phenotypes.py
new file mode 100644
index 0000000..49f5ce7
--- /dev/null
+++ b/src/data/import_multiple_phenotypes.py
@@ -0,0 +1,76 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Mon Sep 11 12:13:47 2023
+
+@author: Paolo Cozzi
+
+This program acts like import_phenotypes but adding more data for the same
+individual
+"""
+
+import click
+import logging
+
+from click_option_group import optgroup, RequiredMutuallyExclusiveOptionGroup
+from pathlib import Path
+
+from src.features.smarterdb import global_connection
+
+logger = logging.getLogger(__name__)
+
+
+@click.command()
+@click.option(
+    '--src_dataset', type=str, required=True,
+    help="The raw dataset file name (zip archive) in which search datafile"
+)
+@click.option(
+    '--dst_dataset', type=str, required=False,
+    help=("The raw dataset file name (zip archive) in which add metadata "
+          "(def. the 'src_dataset')")
+)
+@click.option('--datafile', type=str, required=True)
+@click.option('--sheet_name',
+              default="0",
+              help="pandas 'sheet_name' option")
+@optgroup.group(
+    'Add metadata relying on breeds or samples columns',
+    cls=RequiredMutuallyExclusiveOptionGroup
+)
+@optgroup.option('--breed_column', type=str, help="The breed column")
+@optgroup.option('--id_column', type=str, help="The original_id column")
+@optgroup.option('--alias_column', type=str, help="An alias for original_id")
+@click.option(
+    '--column',
+    required=True,
+    multiple=True,
+    help=(
+        "Column to track. Could be specified multiple times")
+)
+@click.option('--na_values', type=str, help="pandas NA values")
+def main(src_dataset, dst_dataset, datafile, sheet_name, breed_column,
+         id_column, alias_column, column, na_values):
+    """Read multiple data for the same sample from phenotype file and add it
+    to SMARTER-database samples"""
+
+    logger.info(f"{Path(__file__).name} started")
+
+    if breed_column or alias_column:
+        raise NotImplementedError(
+            "Loading multiple phenotypes by breed or alias is not yet "
+            "implemented")
+
+    logger.debug(f"Reading {column} columns")
+
+    logger.info(f"{Path(__file__).name} ended")
+
+
+if __name__ == '__main__':
+    log_fmt = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
+    logging.basicConfig(level=logging.INFO, format=log_fmt)
+
+    # connect to database
+    global_connection()
+
+    main()
diff --git a/tests/data/test_import_multiple_phenotypes.py b/tests/data/test_import_multiple_phenotypes.py
new file mode 100644
index 0000000..9d55705
--- /dev/null
+++ b/tests/data/test_import_multiple_phenotypes.py
@@ -0,0 +1,139 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Mon Sep 11 12:20:08 2023
+
+@author: Paolo Cozzi
+"""
+
+import unittest
+import pathlib
+import tempfile
+
+from openpyxl import Workbook
+from click.testing import CliRunner
+from unittest.mock import patch, PropertyMock
+
+from src.data.import_multiple_phenotypes import (
+    main as import_multiple_phenotypes)
+from src.features.smarterdb import Dataset, SampleSheep, Phenotype
+
+from ..common import MongoMockMixin, SmarterIDMixin, SupportedChipMixin
+
+
+class PhenotypeMixin(SmarterIDMixin, SupportedChipMixin, MongoMockMixin):
+    @classmethod
+    def setUpClass(cls):
+        super().setUpClass()
+
+        # getting destination dataset
+        cls.dst_dataset = Dataset.objects.get(file="test.zip")
+
+        # create a src dataset for phenotypes
+        cls.src_dataset = Dataset(
+            file="test2.zip",
+            country="Italy",
+            species="Sheep",
+            contents=[
+                "phenotypes.xlsx"
+            ]
+        )
+        cls.src_dataset.save()
+
+        # create a workbook
+        cls.workbook = Workbook()
+        cls.sheet = cls.workbook.active
+
+        # adding header
+        cls.sheet.cell(row=1, column=1, value="Id")
+        cls.sheet.cell(row=1, column=2, value="daily_activity_min")
+        cls.sheet.cell(row=1, column=3, value="daily_distance_km")
+
+        # adding values
+        cls.sheet.cell(row=2, column=1, value="test-1")
+        cls.sheet.cell(row=2, column=2, value=123)
+        cls.sheet.cell(row=2, column=3, value=11.5)
+        cls.sheet.cell(row=3, column=1, value="test-1")
+        cls.sheet.cell(row=3, column=2, value=456)
+        cls.sheet.cell(row=3, column=3, value=23.0)
+
+    @classmethod
+    def tearDownClass(cls):
+        Dataset.objects.delete()
+        SampleSheep.objects.delete()
+
+        super().tearDownClass()
+
+    def setUp(self):
+        self.runner = CliRunner()
+
+        # need also a sample
+        self.sample = SampleSheep(
+            original_id="test-1",
+            smarter_id="ITOA-TEX-000000001",
+            country="Italy",
+            breed="Texel",
+            breed_code="TEX",
+            dataset=self.dst_dataset,
+            type_="background",
+            chip_name=self.chip_name,
+            alias="alias-1",
+        )
+        self.sample.save()
+
+    def tearDown(self):
+        SampleSheep.objects.delete()
+
+        super().tearDown()
+
+
+class TestImportPhenotypeCLI(PhenotypeMixin, unittest.TestCase):
+    def test_help(self):
+        result = self.runner.invoke(import_multiple_phenotypes, ["--help"])
+        self.assertEqual(0, result.exit_code)
+        self.assertIn('Usage: main', result.output)
+
+
+class TestImportPhenotypeBySamples(PhenotypeMixin, unittest.TestCase):
+    @patch('src.features.smarterdb.Dataset.working_dir',
+           new_callable=PropertyMock)
+    def test_import_phenotype(self, my_working_dir):
+        # create a temporary directory using the context manager
+        with tempfile.TemporaryDirectory() as tmpdirname:
+            working_dir = pathlib.Path(tmpdirname)
+            my_working_dir.return_value = working_dir
+
+            # save worksheet in temporary folder
+            self.workbook.save(f"{working_dir}/phenotypes.xlsx")
+
+            # got first sample from database
+            self.assertEqual(SampleSheep.objects.count(), 1)
+
+            result = self.runner.invoke(
+                import_multiple_phenotypes,
+                [
+                    "--src_dataset",
+                    "test2.zip",
+                    "--dst_dataset",
+                    "test.zip",
+                    "--datafile",
+                    "phenotypes.xlsx",
+                    "--id_column",
+                    "Id",
+                    "--column",
+                    "daily_activity_min",
+                    "--column",
+                    "daily_distance_km",
+                ]
+            )
+
+            self.assertEqual(0, result.exit_code, msg=result.exception)
+            self.sample.reload()
+            self.assertIsInstance(self.sample.phenotype, Phenotype)
+
+            reference = Phenotype(
+                daily_activity_min=[123, 456],
+                daily_distance_km=[11.5, 23.0]
+            )
+
+            self.assertEqual(reference, self.sample.phenotype)