Plannotate (#224)

* plannotate working version * closes #223 * fix test
manulera · Nov 26, 2024 · f8e3c82 · f8e3c82
1 parent 80ab2c1
commit f8e3c82
Show file tree

Hide file tree

Showing 8 changed files with 197 additions and 8 deletions.
diff --git a/dna_functions.py b/dna_functions.py
@@ -5,6 +5,7 @@
 from pydna.dseqrecord import Dseqrecord
 from pydna.dseq import Dseq
 from pydantic_models import TextFileSequence, AddGeneIdSource, SequenceFileFormat
+from shareyourcloning_linkml.datamodel import PlannotateAnnotationReport
 from pydna.parsers import parse as pydna_parse
 import requests
 from bs4 import BeautifulSoup
@@ -315,3 +316,28 @@ async def get_sequence_from_euroscarf_url(plasmid_id: str) -> Dseqrecord:
         raise HTTPError(url, 503, msg, msg, None)
     genbank_url = f'http://www.euroscarf.de/{subpath.get("href")}'
     return (await get_sequences_from_gb_file_url(genbank_url))[0]
+
+
+async def annotate_with_plannotate(
+    file_content: str, file_name: str, url: str
+) -> tuple[Dseqrecord, PlannotateAnnotationReport, str]:
+    async with httpx.AsyncClient() as client:
+        try:
+            response = await client.post(
+                url,
+                files={'file': (file_name, file_content, 'text/plain')},
+                timeout=20,
+            )
+            if response.status_code != 200:
+                detail = response.json().get('detail', 'plannotate server error')
+                raise HTTPError(url, response.status_code, detail, detail, None)
+            data = response.json()
+            dseqr = custom_file_parser(io.StringIO(data['gb_file']), 'genbank')[0]
+            report = [PlannotateAnnotationReport.model_validate(r) for r in data['report']]
+            return dseqr, report, data['version']
+        except httpx.TimeoutException as e:
+            raise HTTPError(url, 504, 'plannotate server timeout', 'plannotate server timeout', None) from e
+        except httpx.ConnectError as e:
+            raise HTTPError(
+                url, 500, 'cannot connect to plannotate server', 'cannot connect to plannotate server', None
+            ) from e
diff --git a/main.py b/main.py
@@ -19,6 +19,7 @@
     get_sequence_from_snagene_url,
     custom_file_parser,
     get_sequence_from_euroscarf_url,
+    annotate_with_plannotate as _annotate_with_plannotate,
 )
 from pydantic_models import (
     PCRSource,
@@ -47,6 +48,7 @@
     EuroscarfSource,
     OverlapExtensionPCRLigationSource,
     GatewaySource,
+    AnnotationSource,
 )
 from fastapi.middleware.cors import CORSMiddleware
 from Bio.Restriction.Restriction import RestrictionBatch
@@ -82,6 +84,11 @@
 # ENV variables ========================================
 RECORD_STUBS = os.environ['RECORD_STUBS'] == '1' if 'RECORD_STUBS' in os.environ else False
 SERVE_FRONTEND = os.environ['SERVE_FRONTEND'] == '1' if 'SERVE_FRONTEND' in os.environ else False
+PLANNOTATE_URL = os.environ['PLANNOTATE_URL'] if 'PLANNOTATE_URL' in os.environ else None
+
+# Handle trailing slash:
+if PLANNOTATE_URL is not None and not PLANNOTATE_URL.endswith('/'):
+    PLANNOTATE_URL += '/'
 
 origins = []
 if os.environ.get('ALLOWED_ORIGINS') is not None:
@@ -1334,6 +1341,38 @@ async def primer_design_simple_pair(
     return {'primers': [fwd, rvs]}
 
 
+if PLANNOTATE_URL is not None:
+
+    @router.post(
+        '/annotate/plannotate',
+        summary='Annotate a sequence with Plannotate',
+        response_model=create_model(
+            'PlannotateResponse',
+            sources=(list[AnnotationSource], ...),
+            sequences=(list[TextFileSequence], ...),
+        ),
+    )
+    async def annotate_with_plannotate(
+        sequence: TextFileSequence,
+        source: AnnotationSource,
+    ):
+        input_seqr = read_dsrecord_from_json(sequence)
+        # Make a request submitting sequence as a file:
+        try:
+            seqr, annotations, version = await _annotate_with_plannotate(
+                sequence.file_content, f'{sequence.id}.gb', PLANNOTATE_URL + 'annotate'
+            )
+        except HTTPError as e:
+            raise HTTPException(e.code, e.msg) from e
+
+        source.annotation_report = annotations
+        source.annotation_tool = 'plannotate'
+        source.annotation_tool_version = version
+        seqr.name = input_seqr.name + '_annotated'
+
+        return {'sources': [source], 'sequences': [format_sequence_genbank(seqr, source.output_name)]}
+
+
 @router.post(
     '/validate',
     summary='Validate a cloning strategy',

diff --git a/poetry.lock b/poetry.lock
diff --git a/pydantic_models.py b/pydantic_models.py
@@ -39,6 +39,7 @@
     EuroscarfSource as _EuroscarfSource,
     GatewaySource as _GatewaySource,
     InFusionSource as _InFusionSource,
+    AnnotationSource as _AnnotationSource,
 )
 from pydna.utils import shift_location as _shift_location
 from assembly2 import edge_representation2subfragment_representation, subfragment_representation2edge_representation
@@ -131,6 +132,10 @@ class GenomeCoordinatesSource(SourceCommonClass, _GenomeCoordinatesSource):
     pass
 
 
+# Source model used by the annotation endpoint. It declares nothing of its own:
+# everything is inherited from SourceCommonClass and the imported _AnnotationSource.
+class AnnotationSource(SourceCommonClass, _AnnotationSource):
+    pass
+
+
 class RestrictionSequenceCut(_RestrictionSequenceCut):
 
     @classmethod

diff --git a/pyproject.toml b/pyproject.toml
@@ -17,7 +17,7 @@ pydna = {git = "https://github.com/BjornFJohansson/pydna", rev = "9d112d71534194
 requests = "^2.31.0"
 regex = "^2023.10.3"
 pydantic = "^2.7.1"
-shareyourcloning-linkml = "0.1.9a0"
+shareyourcloning-linkml = "0.1.10a0"
 pandas = "^2.2.3"
 openpyxl = "^3.1.5"
 
@@ -33,6 +33,7 @@ pytest = "^7.4.3"
 pre-commit = "^3.6.2"
 pytest-cov = "^4.1.0"
 pytest-rerunfailures = "^14.0"
+respx = "^0.21.1"
 
 
 [tool.poetry.group.ipython.dependencies]

diff --git a/test_endpoints.py b/test_endpoints.py
@@ -1,5 +1,5 @@
-from dna_functions import format_sequence_genbank, read_dsrecord_from_json
-from main import app
+from dna_functions import format_sequence_genbank, read_dsrecord_from_json, annotate_with_plannotate
+import main as _main
 from fastapi.testclient import TestClient
 from pydna.parsers import parse as pydna_parse
 from Bio.Restriction.Restriction import CommOnly
@@ -28,6 +28,7 @@
     EuroscarfSource,
     SnapGenePlasmidSource,
     GatewaySource,
+    AnnotationSource,
 )
 from pydna.dseqrecord import Dseqrecord
 import unittest
@@ -39,6 +40,10 @@
 import pytest
 from Bio.Seq import reverse_complement
 import os
+from importlib import reload
+import respx
+import httpx
+from urllib.error import HTTPError
 
 
 def get_all_feature_labels(seq: Dseqrecord):
@@ -60,7 +65,7 @@ def wrapper(*args, **kwargs):
     return decorator
 
 
-client = TestClient(app)
+client = TestClient(_main.app)
 
 
 class VersionTest(unittest.TestCase):
@@ -2629,5 +2634,64 @@ def test_single_input(self):
         self.assertEqual(str(seqs[0].seq), product)
 
 
+class PlannotateTest(unittest.TestCase):
+    def setUp(self):
+        # Has to be imported here to get the right environment variable
+        pytest.MonkeyPatch().setenv('PLANNOTATE_URL', 'http://dummy/url')
+
+        reload(_main)
+        self.client = TestClient(_main.app)
+
+    def tearDown(self):
+        pytest.MonkeyPatch().setenv('PLANNOTATE_URL', '')
+        reload(_main)
+
+    @respx.mock
+    def test_plannotate(self):
+        seq = Dseqrecord(
+            'AAAAttgagatcctttttttctgcgcgtaatctgctgcttgcaaacaaaaaaaccaccgctaccagcggtggtttgtttgccggatcaagagctaccaactctttttccgaaggtaactggcttcagcagagcgcagataccaaatactgttcttctagtgtagccgtagttaggccaccacttcaagaactctgtagcaccgcctacatacctcgctctgctaatcctgttaccagtggctgctgccagtggcgataagtcgtgtcttaccgggttggactcaagacgatagttaccggataaggcgcagcggtcgggctgaacggggggttcgtgcacacagcccagcttggagcgaacgacctacaccgaactgagatacctacagcgtgagctatgagaaagcgccacgcttcccgaagggagaaaggcggacaggtatccggtaagcggcagggtcggaacaggagagcgcacgagggagcttccagggggaaacgcctggtatctttatagtcctgtcgggtttcgccacctctgacttgagcgtcgatttttgtgatgctcgtcaggggggcggagcctatggaaaAAAA'
+        )
+        seq = format_sequence_genbank(seq)
+        mock_response_success = json.load(open('test_files/planottate/mock_response_success.json'))
+        # Mock the HTTPX GET request
+        respx.post('http://dummy/url/annotate').respond(200, json=mock_response_success)
+
+        source = AnnotationSource(id=0, annotation_tool='plannotate')
+        response = self.client.post(
+            '/annotate/plannotate', json={'sequence': seq.model_dump(), 'source': source.model_dump()}
+        )
+        self.assertEqual(response.status_code, 200)
+        payload = response.json()
+        seq = read_dsrecord_from_json(TextFileSequence.model_validate(payload['sequences'][0]))
+        source = payload['sources'][0]
+        self.assertEqual(source['annotation_tool'], 'plannotate')
+        self.assertEqual(source['annotation_tool_version'], '1.2.2')
+        self.assertEqual(len(source['annotation_report']), 2)
+        feature_names = [f.qualifiers['label'][0] for f in seq.features]
+        self.assertIn('ori', feature_names)
+        self.assertIn('RNAI', feature_names)
+
+    @respx.mock
+    def test_plannotate_down(self):
+        respx.post('http://dummy/url/annotate').mock(side_effect=httpx.ConnectError('Connection error'))
+        seq = Dseqrecord('aaa')
+        seq = format_sequence_genbank(seq)
+        source = AnnotationSource(id=0, annotation_tool='plannotate')
+        response = self.client.post(
+            '/annotate/plannotate', json={'sequence': seq.model_dump(), 'source': source.model_dump()}
+        )
+        self.assertEqual(response.status_code, 500)
+
+    @respx.mock
+    async def test_plannotate_other_error(self):
+        # This is tested here because it's impossible to send a malformed request from the backend
+        respx.post('http://dummy/url/annotate').respond(400, json={'error': 'bad request'})
+
+        with pytest.raises(HTTPError) as e:
+            await annotate_with_plannotate('hello', 'hello.blah', 'http://dummy/url/annotate')
+
+        self.assertEqual(e.code, 400)
+
+
 if __name__ == '__main__':
     unittest.main()
diff --git a/test_files/planottate/input.fasta b/test_files/planottate/input.fasta
@@ -0,0 +1,2 @@
+> blah
+AAAAttgagatcctttttttctgcgcgtaatctgctgcttgcaaacaaaaaaaccaccgctaccagcggtggtttgtttgccggatcaagagctaccaactctttttccgaaggtaactggcttcagcagagcgcagataccaaatactgttcttctagtgtagccgtagttaggccaccacttcaagaactctgtagcaccgcctacatacctcgctctgctaatcctgttaccagtggctgctgccagtggcgataagtcgtgtcttaccgggttggactcaagacgatagttaccggataaggcgcagcggtcgggctgaacggggggttcgtgcacacagcccagcttggagcgaacgacctacaccgaactgagatacctacagcgtgagctatgagaaagcgccacgcttcccgaagggagaaaggcggacaggtatccggtaagcggcagggtcggaacaggagagcgcacgagggagcttccagggggaaacgcctggtatctttatagtcctgtcgggtttcgccacctctgacttgagcgtcgatttttgtgatgctcgtcaggggggcggagcctatggaaaAAAA
diff --git a/test_files/planottate/mock_response_success.json b/test_files/planottate/mock_response_success.json
@@ -0,0 +1,38 @@
+{
+    "version": "1.2.2",
+    "gb_file": "LOCUS       plasmid                  597 bp    DNA     linear   SYN 26-NOV-2024\nDEFINITION  .\nACCESSION   .\nVERSION     .\nKEYWORDS    .\nSOURCE      .\n  ORGANISM  .\n            .\nCOMMENT     Annotated with pLannotate v1.2.2\nFEATURES             Location/Qualifiers\n     rep_origin      5..593\n                     /note=\"pLannotate\"\n                     /label=\"ori\"\n                     /database=\"snapgene\"\n                     /identity=\"99.8\"\n                     /match_length=\"100.0\"\n                     /fragment=\"False\"\n                     /other=\"rep_origin\"\n     ncRNA           complement(44..148)\n                     /note=\"pLannotate\"\n                     /label=\"RNAI\"\n                     /database=\"Rfam\"\n                     /identity=\"100.0\"\n                     /match_length=\"102.9\"\n                     /fragment=\"False\"\n                     /other=\"ncRNA\"\nORIGIN\n        1 aaaattgaga tccttttttt ctgcgcgtaa tctgctgctt gcaaacaaaa aaaccaccgc\n       61 taccagcggt ggtttgtttg ccggatcaag agctaccaac tctttttccg aaggtaactg\n      121 gcttcagcag agcgcagata ccaaatactg ttcttctagt gtagccgtag ttaggccacc\n      181 acttcaagaa ctctgtagca ccgcctacat acctcgctct gctaatcctg ttaccagtgg\n      241 ctgctgccag tggcgataag tcgtgtctta ccgggttgga ctcaagacga tagttaccgg\n      301 ataaggcgca gcggtcgggc tgaacggggg gttcgtgcac acagcccagc ttggagcgaa\n      361 cgacctacac cgaactgaga tacctacagc gtgagctatg agaaagcgcc acgcttcccg\n      421 aagggagaaa ggcggacagg tatccggtaa gcggcagggt cggaacagga gagcgcacga\n      481 gggagcttcc agggggaaac gcctggtatc tttatagtcc tgtcgggttt cgccacctct\n      541 gacttgagcg tcgatttttg tgatgctcgt caggggggcg gagcctatgg aaaaaaa\n//\n",
+    "report": [
+        {
+            "sseqid": "ori",
+            "start_location": 4,
+            "end_location": 593,
+            "strand": 1,
+            "percent_identity": 99.83,
+            "full_length_of_feature_in_db": 589,
+            "length_of_found_feature": 589,
+            "percent_match_length": 100,
+            "fragment": false,
+            "database": "snapgene",
+            "Feature": "ori",
+            "Type": "rep_origin",
+            "Description": "high-copy-number ColE1/pMB1/pBR322/pUC origin of replication ",
+            "sequence": "TTGAGATCCTTTTTTTCTGCGCGTAATCTGCTGCTTGCAAACAAAAAAACCACCGCTACCAGCGGTGGTTTGTTTGCCGGATCAAGAGCTACCAACTCTTTTTCCGAAGGTAACTGGCTTCAGCAGAGCGCAGATACCAAATACTGTTCTTCTAGTGTAGCCGTAGTTAGGCCACCACTTCAAGAACTCTGTAGCACCGCCTACATACCTCGCTCTGCTAATCCTGTTACCAGTGGCTGCTGCCAGTGGCGATAAGTCGTGTCTTACCGGGTTGGACTCAAGACGATAGTTACCGGATAAGGCGCAGCGGTCGGGCTGAACGGGGGGTTCGTGCACACAGCCCAGCTTGGAGCGAACGACCTACACCGAACTGAGATACCTACAGCGTGAGCTATGAGAAAGCGCCACGCTTCCCGAAGGGAGAAAGGCGGACAGGTATCCGGTAAGCGGCAGGGTCGGAACAGGAGAGCGCACGAGGGAGCTTCCAGGGGGAAACGCCTGGTATCTTTATAGTCCTGTCGGGTTTCGCCACCTCTGACTTGAGCGTCGATTTTTGTGATGCTCGTCAGGGGGGCGGAGCCTATGGAAA"
+        },
+        {
+            "sseqid": "1",
+            "start_location": 43,
+            "end_location": 148,
+            "strand": -1,
+            "percent_identity": 100,
+            "full_length_of_feature_in_db": 102,
+            "length_of_found_feature": 105,
+            "percent_match_length": 97.05882352941177,
+            "fragment": false,
+            "database": "Rfam",
+            "Feature": "RNAI",
+            "Type": "ncRNA",
+            "Description": "Accession: RF00106 - RNAI",
+            "sequence": "AGTATTTGGTATCTGCGCTCTGCTGAAGCCAGTTACCTTCGGAAAAAGAGTTGGTAGCTCTTGATCCGGCAAACAAACCACCGCTGGTAGCGGTGGTTTTTTTGT"
+        }
+    ]
+}
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,2 @@
		> blah
		AAAAttgagatcctttttttctgcgcgtaatctgctgcttgcaaacaaaaaaaccaccgctaccagcggtggtttgtttgccggatcaagagctaccaactctttttccgaaggtaactggcttcagcagagcgcagataccaaatactgttcttctagtgtagccgtagttaggccaccacttcaagaactctgtagcaccgcctacatacctcgctctgctaatcctgttaccagtggctgctgccagtggcgataagtcgtgtcttaccgggttggactcaagacgatagttaccggataaggcgcagcggtcgggctgaacggggggttcgtgcacacagcccagcttggagcgaacgacctacaccgaactgagatacctacagcgtgagctatgagaaagcgccacgcttcccgaagggagaaaggcggacaggtatccggtaagcggcagggtcggaacaggagagcgcacgagggagcttccagggggaaacgcctggtatctttatagtcctgtcgggtttcgccacctctgacttgagcgtcgatttttgtgatgctcgtcaggggggcggagcctatggaaaAAAA