Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

59 Validate metadata #64

Merged
merged 4 commits into from
Jan 6, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions MSMetaEnhancer/libs/Annotator.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
from MSMetaEnhancer.libs.Curator import Curator
from MSMetaEnhancer.libs.utils import logger
from MSMetaEnhancer.libs.utils.Errors import ConversionNotSupported, TargetAttributeNotRetrieved, \
SourceAttributeNotAvailable, ServiceNotAvailable, UnknownResponse
Expand All @@ -10,6 +11,7 @@ class Annotator:
"""
def __init__(self, services):
    """
    Store the available services and create the metadata curator.

    :param services: services made available to this annotator
    """
    self.services = services
    # Curator instance used to filter invalid metadata out of conversion results.
    self.curator = Curator()

async def annotate(self, spectra, jobs, repeat=False):
"""
Expand Down Expand Up @@ -75,6 +77,7 @@ async def execute_job_with_cache(self, job, metadata, cache):
else:
if service.is_available:
result = await service.convert(job.source, job.target, data)
result = self.curator.filter_invalid_metadata(result)
cache[job.service].update(result)
if job.target in cache[job.service]:
metadata[job.target] = cache[job.service][job.target]
Expand Down
26 changes: 26 additions & 0 deletions MSMetaEnhancer/libs/Curator.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
from matchms import utils


class Curator:
"""
Curator makes sure that all data is curated before the actual annotation can proceed.
Expand Down Expand Up @@ -36,3 +39,26 @@ def fix_cas_number(cas_number):
if "-" not in cas_number:
return f'{cas_number[:-3]}-{cas_number[-3:-1]}-{cas_number[-1]}'
return cas_number

@staticmethod
def filter_invalid_metadata(metadata):
    """
    Drop metadata entries whose value fails validation.

    Only the 'smiles', 'inchi' and 'inchikey' attributes are checked, each with
    the corresponding matchms validity function. Every other attribute is
    passed through untouched.

    :param metadata: metadata content
    :return: only valid metadata
    """
    # Attribute name -> predicate deciding whether its value is acceptable.
    validators = {
        'smiles': utils.is_valid_smiles,
        'inchi': utils.is_valid_inchi,
        'inchikey': utils.is_valid_inchikey
    }

    # Keep an entry when it has no validator, or when its validator accepts it.
    return {
        name: content
        for name, content in metadata.items()
        if name not in validators or validators[name](content)
    }
1 change: 1 addition & 0 deletions conda/environment-dev.yml
Original file line number Diff line number Diff line change
Expand Up @@ -18,3 +18,4 @@ dependencies:
- pytest-aiohttp
- pytest-cov
- pytest-dependency
- rdkit
1 change: 1 addition & 0 deletions conda/meta.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ requirements:
- asyncstdlib
- frozendict
- tabulate
- rdkit

test:
imports:
Expand Down
3 changes: 2 additions & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
matchms~=0.9.2
matchms~=0.11.0
pandas~=1.2.4
requests~=2.25.1
mock~=4.0.3
Expand All @@ -12,3 +12,4 @@ tabulate~=0.8.9
sphinx==4.2.0
sphinx_rtd_theme==1.0.0
myst-parser==0.15.2
rdkit-pypi~=2021.9.3
6 changes: 6 additions & 0 deletions tests/test_annotator.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,13 +29,17 @@ def test_annotate(data, expected, repeat, mocked):


def test_execute_job_with_cache():
curator = mock.Mock()
curator.filter_invalid_metadata = mock.MagicMock(side_effect=lambda a: a)

pubchem = mock.Mock()
pubchem.convert = mock.AsyncMock(return_value={'smiles': '$SMILES'})

job = Job(('inchi', 'smiles', 'PubChem'))
job.validate = mock.Mock(return_value=(pubchem, None))

annotator = Annotator({'PubChem': pubchem})
annotator.curator = curator
metadata, cache = asyncio.run(annotator.execute_job_with_cache(job, {'inchi': '$InChi'}, dict()))
assert metadata == {'inchi': '$InChi', 'smiles': '$SMILES'}

Expand All @@ -50,6 +54,7 @@ def test_execute_job_with_cache():
cache = {job.service: {'formula': '$FORMULA'}}

annotator = Annotator({'CTS': cts})
annotator.curator = curator
metadata, cache = asyncio.run(annotator.execute_job_with_cache(job, {'smiles': '$SMILES'}, cache))
assert metadata == {'smiles': '$SMILES', 'formula': '$FORMULA'}

Expand All @@ -59,6 +64,7 @@ def test_execute_job_with_cache():
cir.convert = mock.AsyncMock(return_value=dict())

annotator = Annotator({'CIR': cir})
annotator.curator = curator

with pytest.raises(TargetAttributeNotRetrieved):
metadata, cache = asyncio.run(annotator.execute_job_with_cache(job, {'smiles': '$SMILES'}, dict()))
13 changes: 13 additions & 0 deletions tests/test_curator.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,20 @@
import pytest

from MSMetaEnhancer.libs.Curator import Curator


def test_fix_cas_number():
    """Both a dash-less and an already dashed CAS number yield the dashed form."""
    curator = Curator()
    dashed = '7783-89-3'
    for raw_value in ('7783893', dashed):
        assert curator.fix_cas_number(raw_value) == dashed


@pytest.mark.parametrize('metadata, validated_metadata', [
    # The 'smiles' entry is expected to be dropped; other attributes are kept.
    (
        {'formula': 'CH4', 'smiles': 'C', 'iupac_name': 'methane', 'inchi': 'InChI=1S/CH4/h1H4'},
        {'formula': 'CH4', 'iupac_name': 'methane', 'inchi': 'InChI=1S/CH4/h1H4'},
    ),
    # Non-chemical content under 'inchikey' is expected to be removed entirely.
    ({'inchikey': '<html>random content</html>'}, {}),
    # This 'smiles' value is expected to be kept unchanged.
    ({'smiles': 'CC(NC(C)=O)C#N'}, {'smiles': 'CC(NC(C)=O)C#N'}),
])
def test_filter_invalid_metadata(metadata, validated_metadata):
    """Check that filter_invalid_metadata returns exactly the expected entries."""
    filtered = Curator().filter_invalid_metadata(metadata)
    assert filtered == validated_metadata